pax_global_header00006660000000000000000000000064151746625720014531gustar00rootroot0000000000000052 comment=0430370c7f84de6b81839785c5e5411a9d39dcec dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/000077500000000000000000000000001517466257200200445ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/.gitignore000066400000000000000000000001631517466257200220340ustar00rootroot00000000000000/build* /Session.vim [._]*.swp *~ tags .DS_Store /tests/argon /tests/dav2d-test-data *.snap /tools/output/xxhash.h dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/.gitlab-ci.yml000066400000000000000000000132001517466257200224740ustar00rootroot00000000000000stages: - style - build .debian-amd64-common: image: registry.videolan.org/vlc-debian-unstable:20250528090433 stage: build tags: - docker - amd64 .debian-aarch64-common: image: registry.videolan.org/dav1d-debian-bookworm-aarch64:20250215002814 stage: build tags: - docker - aarch64 .debian-armv7-common: image: registry.videolan.org/dav1d-debian-bookworm-armv7:20250215014239 stage: build tags: - docker - armv7 style-check: extends: .debian-amd64-common stage: style script: - git grep -I -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' ':(exclude)patches/*' && echo "Trailing whitespace" && exit 1 - git grep -I -l -z "" -- . 
':(exclude)*/compat/*' ':(exclude)patches/*' | while IFS= read -r -d '' i; do if grep -n "[$'\u061c'$'\u2000'-$'\u200f'$'\u2028'-$'\u202f'$'\u205f'-$'\u206f']" "$i"; then echo "Invisible Unicode characters"; exit 1; fi; if [ -n "$(tail -c 1 "$i")" ]; then echo "No newline at end of $i"; exit 1; fi; done build-debian: extends: .debian-amd64-common tags: - docker - avx2 - amd64 script: - set -eo pipefail - git clone -b research-v14.1.0 --depth 1 --shallow-submodules https://gitlab.com/AOMediaCodec/avm.git avm - | cd avm git config user.name "CI Bot" git config user.email "ci-bot@example.com" git am ../patches/*.patch - | sed -i 's|#define DEBUG_OBU_HDR 0|#define DEBUG_OBU_HDR 1|' avm/debug.h sed -i 's|#define DEBUG_SEQ_HDR 0|#define DEBUG_SEQ_HDR 1|' avm/debug.h sed -i 's|#define DEBUG_FRAME_HDR 0|#define DEBUG_FRAME_HDR 1|' avm/debug.h sed -i 's|#define DEBUG_BLOCK_INFO 0|#define DEBUG_BLOCK_INFO 1|' avm/debug.h sed -i 's|#define DEBUG_B_PIXELS 0|#define DEBUG_B_PIXELS 1|' avm/debug.h sed -i 's|#define DEBUG_REFMV 0|#define DEBUG_REFMV 1|' av2/common/mvref_common.c sed -i 's|#define DEBUG_CI_HDR 0|#define DEBUG_CI_HDR 1|' av2/decoder/obu_ci.c sed -i 's|#define DEBUG_FGM_HDR 0|#define DEBUG_FGM_HDR 1|' av2/decoder/obu_fgm.c sed -i 's|cm->current_frame.frame_number == 0|cm->current_frame.frame_number <= 4|' avm/debug.h git diff - | mkdir x86-64 cd x86-64 cmake .. -DENABLE_TESTS=0 -DCONFIG_AV2_ENCODER=0 make -j$(nproc) avmdec cd ../.. 
- | sed -i 's|#define DEBUG_OBU_HDR 0|#define DEBUG_OBU_HDR 1|' src/obu.c sed -i 's|#define DEBUG_SEQ_HDR 0|#define DEBUG_SEQ_HDR 1|' src/obu.c sed -i 's|#define DEBUG_FRAME_HDR 0|#define DEBUG_FRAME_HDR 1|' src/obu.c sed -i 's|#define DEBUG_BLOCK_INFO 0|#define DEBUG_BLOCK_INFO 1|' src/debug.h sed -i 's|#define DEBUG_B_PIXELS 0|#define DEBUG_B_PIXELS 1|' src/debug.h sed -i 's|#define DEBUG_REFMV 0|#define DEBUG_REFMV 1|' src/refmvs.c sed -i 's|#define DEBUG_CI_HDR 0|#define DEBUG_CI_HDR 1|' src/obu.c sed -i 's|#define DEBUG_FGM_HDR 0|#define DEBUG_FGM_HDR 1|' src/obu.c sed -i 's|frame_offset == 0|frame_offset <= 4|' src/debug.h git diff - meson setup build --buildtype debugoptimized -Dtrim_dsp=false && ninja -C build - meson test -v -C build - ninja -C build - | files=( "./media/avm-v14.1.0-bus.64x64.l1.sdp0.obu" "./media/avm-v14.1.0-bus.64x64.l1.sdp1.obu" "./media/avm-v14.1.0-bus.64x64.l5.obu" "./media/avm-v14.1.0-bus.64x64.l5.opfl0-refinemv0.obu" "./media/avm-v14.1.0-bus.64x64.l5.lossless.obu" "./media/avm-v14.1.0-hm.64x64.l5.filmgrain.obu" ) for file in "${files[@]}"; do avm/x86-64/avmdec $file -o /dev/null > avm.log build/tools/dav2d -i $file --muxer=null --threads=1 --quiet > dav2d.log diff -uw {avm,dav2d}.log wc -l avm.log done - | git reset --hard - ninja -C build - | echo "[CI] Running AVM vs DAV2D MD5 regression test" time tests/test-md5.sh build/tools/dav2d build-debian-avx512: extends: .debian-amd64-common tags: - docker - amd64-avx512 variables: CFLAGS: '-mavx' script: - meson setup build --buildtype debugoptimized - ninja -C build - cd build - time meson test -v --suite checkasm - time ../tests/test-md5.sh tools/dav2d test-win64: extends: .debian-amd64-common image: registry.videolan.org/dav1d-debian-unstable:20260103195850 script: - wineserver -p && wine wineboot - meson setup build --buildtype release -Dlogging=false -Dtrim_dsp=false --cross-file package/crossfiles/x86_64-w64-mingw32.meson - ninja -C build - cd build && time meson test -v 
build-debian-aarch64: extends: .debian-aarch64-common script: - meson setup build --buildtype debugoptimized -Dtrim_dsp=false - ninja -C build - meson test -v -C build - time tests/test-md5.sh build/tools/dav2d build-debian-armv7: extends: .debian-armv7-common script: - linux32 meson setup build --buildtype debugoptimized -Dtrim_dsp=false - ninja -C build - meson test -v -C build - time tests/test-md5.sh build/tools/dav2d dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/CONTRIBUTING.md000066400000000000000000000040611517466257200222760ustar00rootroot00000000000000# dav2d contribution guide ## CoC The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies fully to this project. ## ToDo The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav2d/wikis/task-list). ## Codebase language The codebase is developed with the following assumptions: For the library: - C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code. - x86 asm in .asm files, using the NASM syntax, - arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports, - no C++ is allowed, whatever the version. For the tools and utils: - C *(see above for restrictions)* - Rust - C++ is only allowed for the MFT. If you want to use *Threads* or *Atomic* features, please conform to the **C11**/**POSIX** semantic and use a wrapper for older compilers/platforms *(like done in VLC)*. Please use modern standard POSIX functions *(strscpy, asprintf, tdestroy)*, and provide a compatibility fallback *(like done in VLC)*. We will make reasonable efforts for compilers that are a bit older, but we won't support gcc 3 or MSVC 2012. ## Authorship Please provide a correct authorship for your commit logs, with a name and a valid email. We will reject anonymous contributions for now. 
As an exception, known pseudonyms from the multimedia community are accepted. This project is respecting **Copyright** and **Droit d'auteur**. There is no copyright attribution or CLA. ## Commit logs Please read [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/). ## Submit requests (WIP) - Code, - [Compile](https://xkcd.com/303/), - Check your [code style](https://code.videolan.org/videolan/dav2d/wikis/Coding-style), - Test, - Try, - Submit patches through merge requests, - Check that this passes the CI. ## Patent license You need to read, understand, and agree to the [AOMedia patent license](doc/PATENTS), before committing. dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/COPYING000066400000000000000000000024451517466257200211040ustar00rootroot00000000000000Copyright © 2018-2026, VideoLAN and dav2d authors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/NEWS000066400000000000000000000000731517466257200205430ustar00rootroot00000000000000Changes for 0.0.1 'Walking': ---------------------------- dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/README.md000066400000000000000000000131151517466257200213240ustar00rootroot00000000000000# dav2d **dav2d** is an **AV2** cross-platform **d**ecoder, open-source, and focused on speed and correctness. It is based on our popular **dav1d** decoder. It is now battle-tested and production-ready and can be used everywhere. The canonical repository URL for this repo is https://code.videolan.org/videolan/dav2d This project is edited by VideoLAN as part of its membership by the *Alliance for Open Media*/**AOM**. ## Goal and Features The goal of this project is to provide a decoder for **most platforms**, and achieve the **highest speed** possible to overcome the temporary lack of AV2 hardware decoder. It will support all features from AV2, including all subsampling and bit-depth parameters. In the future, this project will host simple tools or simple wrappings. ## License **dav2d** is released under a very liberal license, a contrario from the other VideoLAN projects, so that it can be embedded anywhere, including non-open-source software; or even drivers, to allow the creation of hybrid decoders. 
The reasoning behind this decision is the same as for libvorbis, see [RMS on vorbis](https://lwn.net/2001/0301/a/rms-ov-license.php3). Please note that the license does not grant you any patent rights from AOM. # Roadmap The plan is the following: ### On-going 1. Complete C implementation of the decoder, 2. Provide a usable API, 3. Port to most platforms, ### After 4. Make it fast on desktop, by writing asm for AVX2 chips. 5. Make it fast on mobile, by writing asm for ARMv8 chips, 6. Make it fast on older desktop, by writing asm for SSSE3+ chips, 7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips. 8. Make it fast on older mobile, by writing asm for ARMv7 chips, 9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips, 10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips, 11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips, 12. Improve threading. 13. Improve C code base with various tweaks. 14. Accelerate for less common architectures, like PPC, SSE2, RISC-V or AVX-512. 15. Use more GPU decoding, when possible. # Contribute Currently, we are looking for help from: - C developers, - asm developers, - platform-specific developers, - testers. Our contribution guidelines are quite strict. We want to build a coherent codebase to simplify maintenance and achieve the highest possible speed. Notably, the codebase is in pure C and asm. We are on IRC, on the **#dav2d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [IRC Web Interface](https://web.libera.chat/#dav2d). See the [contributions document](CONTRIBUTING.md). ## CLA There is no CLA. People will keep their copyright and their authorship rights, while adhering to the BSD 2-clause license. VideoLAN will only have the collective work rights. ## CoC The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this project. # Compile ## General compilation steps 1. 
Install [Meson](https://mesonbuild.com/) (0.49 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher) 2. Run `mkdir build && cd build` to create a build directory and enter it 3. Run `meson setup ..` to configure meson, add `--default-library=static` if static linking is desired 4. Run `ninja` to compile The following sections describe modifications of steps 3 and 4 for specific purposes. ## Cross-Compilation for 32- or 64-bit Windows, 32-bit Linux If you're on a Linux build machine trying to compile .exe for a Windows target/host machine, configure meson like this ``` meson setup .. --cross-file=../package/crossfiles/x86_64-w64-mingw32.meson ``` or, for 32-bit: ``` meson setup .. --cross-file=../package/crossfiles/i686-w64-mingw32.meson ``` `mingw-w64` is a prerequisite and should be installed on your Linux machine via your preferred method or package manager. Note the binary name formats may differ between distributions. Verify the names, and use `alias` if certain binaries cannot be found. For 32-bit Linux, run ``` meson setup .. --cross-file=../package/crossfiles/i686-linux32.meson ``` ## Build documentation 1. Make sure [doxygen](https://www.doxygen.nl/) and [graphviz](https://www.graphviz.org/) are installed. 2. Run `meson setup .. -Denable_docs=true` to configure meson to generate docs from the build directory. 3. Run `ninja doc/html` to build the docs The result can be found in `build/doc/html/`. An online version built from master can be found [here](https://videolan.videolan.me/dav2d/). # Run tests 1. In the root directory, run `git clone https://code.videolan.org/videolan/dav2d-test-data.git tests/dav2d-test-data` to fetch the test data repository 2. During meson configuration, specify `-Dtestdata_tests=true` 3. Run `meson test -v` after compiling # Support This project is partially funded by the *Alliance for Open Media*/**AOM** and is supported by TwoOrioles and VideoLabs. 
These companies can provide support and integration help, should you need it. # FAQ ## Is dav2d a recursive acronym? - Yes. ## Can I help? - Yes. See the [contributions document](CONTRIBUTING.md). ## I am not a developer. Can I help? - Yes. We need testers, bug reporters and documentation writers. ## What about the AV2 patent license? - This project is an implementation of a decoder. It gives you no special rights on the AV2 patents. Please read the [AOMedia patent license](doc/PATENTS) that applies to the AV2 specification and codec. ## Will you care about *my architecture*? *my OS*? - We do, but we don't have either the time or the knowledge. Therefore, patches and contributions are welcome. dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/THANKS.md000066400000000000000000000030671517466257200213640ustar00rootroot00000000000000# The dav2d project and VideoLAN association would like to thank ## AOM The Alliance for Open Media (AOM) for partially funding this project. ## Companies * Two Orioles LLC, for important coding effort * VideoLabs SAS ## Projects * VideoLAN * FFmpeg * libplacebo ## Individual And all the dav2d Authors (git shortlog -sn), including: Henrik Gramner, Martin Storsjö, Ronald S. Bultje, Janne Grunau, James Almer, Victorien Le Couviour--Tuffet, Matthias Dressel, Nathan E. 
Egge, Jean-Baptiste Kempf, Marvin Scholz, Luc Trudeau, Niklas Haas, Hugo Beauzée-Luyssen, Konstantin Pavlov, David Michael Barr, Steve Lhomme, yuanhecai, Luca Barbato, Wan-Teh Chang, Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer, Xuefeng Jiang, Arpad Panyik, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu, Thomas Daede, jinbo, André Kempe, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent, Tristan Matthews, Vittorio Giovara, Yannis Guyon, Andrey Semashev, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Hao Chen, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND, Sylvestre Ledru, Timo Gurr, Vibhoothi, Vignesh Venkatasubramanian, Xavier Claessens, Xu Guangxin, kossh1 and skal. 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/doc/000077500000000000000000000000001517466257200206115ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/doc/Doxyfile.in.in000066400000000000000000000012561517466257200233350ustar00rootroot00000000000000PROJECT_NAME = dav2d PROJECT_NUMBER = \@VCS_TAG\@ PROJECT_BRIEF = dav2d is an AV2 decoder OUTPUT_DIRECTORY = @DOXYGEN_OUTPUT@ STRIP_FROM_PATH = @DOXYGEN_STRIP@ OUTPUT_LANGUAGE = English TAB_SIZE = 4 EXTRACT_ALL = YES OPTIMIZE_OUTPUT_FOR_C = YES DOXYFILE_ENCODING = UTF-8 TYPEDEF_HIDES_STRUCT = YES HAVE_DOT = YES QUIET = YES WARNINGS = YES WARN_IF_UNDOCUMENTED = YES WARN_AS_ERROR = FAIL_ON_WARNINGS INPUT = @DOXYGEN_INPUT@ FILE_PATTERNS = *.h GENERATE_HTML = YES GENERATE_LATEX = NO dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/doc/PATENTS000066400000000000000000000131421517466257200216530ustar00rootroot00000000000000Alliance for Open Media Patent License 1.0 1. License Terms. 1.1. Patent License. Subject to the terms and conditions of this License, each Licensor, on behalf of itself and successors in interest and assigns, grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as expressly stated in this License) patent license to its Necessary Claims to make, use, sell, offer for sale, import or distribute any Implementation. 1.2. Conditions. 1.2.1. Availability. As a condition to the grant of rights to Licensee to make, sell, offer for sale, import or distribute an Implementation under Section 1.1, Licensee must make its Necessary Claims available under this License, and must reproduce this License with any Implementation as follows: a. For distribution in source code, by including this License in the root directory of the source code with its Implementation. b. 
For distribution in any other form (including binary, object form, and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, GDSII, etc.)), by including this License in the documentation, legal notices, and/or other written materials provided with the Implementation. 1.2.2. Additional Conditions. This license is directly from Licensor to Licensee. Licensee acknowledges as a condition of benefiting from it that no rights from Licensor are received from suppliers, distributors, or otherwise in connection with this License. 1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents initiates patent litigation or files, maintains, or voluntarily participates in a lawsuit against another entity or any person asserting that any Implementation infringes Necessary Claims, any patent licenses granted under this License directly to the Licensee are immediately terminated as of the date of the initiation of action unless 1) that suit was in response to a corresponding suit regarding an Implementation first brought against an initiating entity, or 2) that suit was brought to enforce the terms of this License (including intervention in a third-party action by a Licensee). 1.4. Disclaimers. The Reference Implementation and Specification are provided "AS IS" and without warranty. The entire risk as to implementing or otherwise using the Reference Implementation or Specification is assumed by the implementer and user. Licensor expressly disclaims any warranties (express, implied, or otherwise), including implied warranties of merchantability, non-infringement, fitness for a particular purpose, or title, related to the material. 
IN NO EVENT WILL LICENSOR BE LIABLE TO ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR NOT THE OTHER PARTRY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2. Definitions. 2.1. Affiliate. “Affiliate” means an entity that directly or indirectly Controls, is Controlled by, or is under common Control of that party. 2.2. Control. “Control” means direct or indirect control of more than 50% of the voting power to elect directors of that corporation, or for any other entity, the power to direct management of such entity. 2.3. Decoder. "Decoder" means any decoder that conforms fully with all non-optional portions of the Specification. 2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can be decoded by a Decoder only to the extent it produces such a bitstream. 2.5. Final Deliverable. “Final Deliverable” means the final version of a deliverable approved by the Alliance for Open Media as a Final Deliverable. 2.6. Implementation. "Implementation" means any implementation, including the Reference Implementation, that is an Encoder and/or a Decoder. An Implementation also includes components of an Implementation only to the extent they are used as part of an Implementation. 2.7. License. “License” means this license. 2.8. Licensee. “Licensee” means any person or entity who exercises patent rights granted under this License. 2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers for sale, imports or distributes any Implementation, or (ii) a person or entity that has a licensing obligation to the Implementation as a result of its membership and/or participation in the Alliance for Open Media working group that developed the Specification. 2.10. Necessary Claims. 
"Necessary Claims" means all claims of patents or patent applications, (a) that currently or at any time in the future, are owned or controlled by the Licensor, and (b) (i) would be an Essential Claim as defined by the W3C Policy as of February 5, 2004 (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) as if the Specification was a W3C Recommendation; or (ii) are infringed by the Reference Implementation. 2.11. Reference Implementation. “Reference Implementation” means an Encoder and/or Decoder released by the Alliance for Open Media as a Final Deliverable. 2.12. Specification. “Specification” means the specification designated by the Alliance for Open Media as a Final Deliverable for which this License was issued. dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/doc/meson.build000066400000000000000000000041671517466257200227630ustar00rootroot00000000000000# Copyright © 2018-2022, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. if not get_option('enable_docs') subdir_done() endif doxygen = find_program('doxygen') dot = find_program('dot') conf_data = configuration_data() conf_data.set('DOXYGEN_INPUT', dav2d_src_root / 'include/dav2d') conf_data.set('DOXYGEN_STRIP', dav2d_src_root / 'include') conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir()) doxyfile = configure_file(input: 'Doxyfile.in.in', output: 'Doxyfile.in', configuration: conf_data) doxyfile_rev_target = vcs_tag(command: [ 'git', '--git-dir', dav2d_git_dir, 'describe', '--long', '--always' ], input: doxyfile, output: 'Doxyfile' ) custom_target('doc', build_by_default: false, command: [doxygen, doxyfile_rev_target], output: ['html'] ) dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/000077500000000000000000000000001517466257200216625ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dav2dplay.c000066400000000000000000000617641517466257200237320ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "vcs_version.h" #include #include #include #include "dav2d/dav2d.h" #include "common/attributes.h" #include "tools/input/input.h" #include "dp_fifo.h" #include "dp_renderer.h" #define FRAME_OFFSET_TO_PTS(foff) \ (uint64_t)(((foff) * rd_ctx->spf) * 1000000000.0 + .5) #define TS_TO_PTS(ts) \ (uint64_t)(((ts) * rd_ctx->timebase) * 1000000000.0 + .5) // Selected renderer callbacks and cookie static const Dav2dPlayRenderInfo *renderer_info = { NULL }; /** * Render context structure * This structure contains informations necessary * to be shared between the decoder and the renderer * threads. 
*/ typedef struct render_context { Dav2dPlaySettings settings; Dav2dSettings lib_settings; // Renderer private data (passed to callbacks) void *rd_priv; // Lock to protect access to the context structure SDL_mutex *lock; // Timestamp of last displayed frame (in timebase unit) int64_t last_ts; // Timestamp of last decoded frame (in timebase unit) int64_t current_ts; // Ticks when last frame was received uint32_t last_ticks; // PTS time base double timebase; // Seconds per frame double spf; // Number of frames uint32_t total; // Fifo Dav2dPlayPtrFifo *fifo; // Custom SDL2 event types uint32_t event_types; // User pause state uint8_t user_paused; // Internal pause state uint8_t paused; // Start of internal pause state uint32_t pause_start; // Duration of internal pause state uint32_t pause_time; // Seek accumulator int seek; // Indicates if termination of the decoder thread was requested uint8_t dec_should_terminate; } Dav2dPlayRenderContext; static void dp_settings_print_usage(const char *const app, const char *const reason, ...) 
{ if (reason) { va_list args; va_start(args, reason); vfprintf(stderr, reason, args); va_end(args); fprintf(stderr, "\n\n"); } fprintf(stderr, "Usage: %s [options]\n\n", app); fprintf(stderr, "Supported options:\n" " --input/-i $file: input file\n" " --untimed/-u: ignore PTS, render as fast as possible\n" " --threads $num: number of threads (default: 0)\n" " --framedelay $num: maximum frame delay, capped at $threads (default: 0);\n" " set to 1 for low-latency decoding\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" " --gpugrain/-g: enable GPU grain synthesis\n" " --fullscreen/-f: enable full screen mode\n" " --version/-v: print version and exit\n" " --renderer/-r: select renderer backend (default: auto)\n"); exit(1); } static unsigned parse_unsigned(const char *const optarg, const int option, const char *const app) { char *end; const unsigned res = (unsigned) strtoul(optarg, &end, 0); if (*end || end == optarg) dp_settings_print_usage(app, "Invalid argument \"%s\" for option %s; should be an integer", optarg, option); return res; } static void dp_rd_ctx_parse_args(Dav2dPlayRenderContext *rd_ctx, const int argc, char *const *const argv) { int o; Dav2dPlaySettings *settings = &rd_ctx->settings; Dav2dSettings *lib_settings = &rd_ctx->lib_settings; // Short options static const char short_opts[] = "i:vuzgfr:"; enum { ARG_THREADS = 256, ARG_FRAME_DELAY, ARG_HIGH_QUALITY, }; // Long options static const struct option long_opts[] = { { "input", 1, NULL, 'i' }, { "version", 0, NULL, 'v' }, { "untimed", 0, NULL, 'u' }, { "threads", 1, NULL, ARG_THREADS }, { "framedelay", 1, NULL, ARG_FRAME_DELAY }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, { "gpugrain", 0, NULL, 'g' }, { "fullscreen", 0, NULL, 'f'}, { "renderer", 0, NULL, 'r'}, { NULL, 0, NULL, 0 }, }; while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) { switch (o) { case 'i': settings->inputfile = optarg; break; 
case 'v': fprintf(stderr, "%s\n", dav2d_version()); exit(0); case 'u': settings->untimed = true; break; case ARG_HIGH_QUALITY: settings->highquality = true; break; case 'z': settings->zerocopy = true; break; case 'g': settings->gpugrain = true; break; case 'f': settings->fullscreen = true; break; case 'r': settings->renderer_name = optarg; break; case ARG_THREADS: lib_settings->n_threads = parse_unsigned(optarg, ARG_THREADS, argv[0]); break; case ARG_FRAME_DELAY: lib_settings->max_frame_delay = parse_unsigned(optarg, ARG_FRAME_DELAY, argv[0]); break; default: dp_settings_print_usage(argv[0], NULL); } } if (optind < argc) dp_settings_print_usage(argv[0], "Extra/unused arguments found, e.g. '%s'\n", argv[optind]); if (!settings->inputfile) dp_settings_print_usage(argv[0], "Input file (-i/--input) is required"); if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0) settings->renderer_name = NULL; } /** * Destroy a Dav2dPlayRenderContext */ static void dp_rd_ctx_destroy(Dav2dPlayRenderContext *rd_ctx) { assert(rd_ctx != NULL); renderer_info->destroy_renderer(rd_ctx->rd_priv); dp_fifo_destroy(rd_ctx->fifo); SDL_DestroyMutex(rd_ctx->lock); free(rd_ctx); } /** * Create a Dav2dPlayRenderContext * * \note The Dav2dPlayRenderContext must be destroyed * again by using dp_rd_ctx_destroy. 
*/ static Dav2dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv) { Dav2dPlayRenderContext *rd_ctx; // Alloc rd_ctx = calloc(1, sizeof(Dav2dPlayRenderContext)); if (rd_ctx == NULL) { return NULL; } // Parse and validate arguments dav2d_default_settings(&rd_ctx->lib_settings); memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); dp_rd_ctx_parse_args(rd_ctx, argc, argv); // Init SDL2 library if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) { fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError()); goto fail; } // Register a custom event to notify our SDL main thread // about new frames rd_ctx->event_types = SDL_RegisterEvents(3); if (rd_ctx->event_types == UINT32_MAX) { fprintf(stderr, "Failure to create custom SDL event types!\n"); goto fail; } rd_ctx->fifo = dp_fifo_create(5); if (rd_ctx->fifo == NULL) { fprintf(stderr, "Failed to create FIFO for output pictures!\n"); goto fail; } rd_ctx->lock = SDL_CreateMutex(); if (rd_ctx->lock == NULL) { fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); goto fail; } // Select renderer renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name); if (renderer_info == NULL) { printf("No suitable renderer matching %s found.\n", (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto"); } else { printf("Using %s renderer\n", renderer_info->name); } rd_ctx->rd_priv = (renderer_info) ? 
renderer_info->create_renderer(&rd_ctx->settings) : NULL; if (rd_ctx->rd_priv == NULL) { goto fail; } return rd_ctx; fail: if (rd_ctx->lock) SDL_DestroyMutex(rd_ctx->lock); if (rd_ctx->fifo) dp_fifo_destroy(rd_ctx->fifo); free(rd_ctx); SDL_Quit(); return NULL; } /** * Notify about new event */ static void dp_rd_ctx_post_event(Dav2dPlayRenderContext *rd_ctx, uint32_t type) { SDL_Event event; SDL_zero(event); event.type = type; SDL_PushEvent(&event); } /** * Update the decoder context with a new dav2d picture * * Once the decoder decoded a new picture, this call can be used * to update the internal texture of the render context with the * new picture. */ static void dp_rd_ctx_update_with_dav2d_picture(Dav2dPlayRenderContext *rd_ctx, Dav2dPicture *dav2d_pic) { rd_ctx->current_ts = dav2d_pic->m.timestamp; renderer_info->update_frame(rd_ctx->rd_priv, dav2d_pic, &rd_ctx->settings); } /** * Toggle pause state */ static void dp_rd_ctx_toggle_pause(Dav2dPlayRenderContext *rd_ctx) { SDL_LockMutex(rd_ctx->lock); rd_ctx->user_paused = !rd_ctx->user_paused; if (rd_ctx->seek) goto out; rd_ctx->paused = rd_ctx->user_paused; uint32_t now = SDL_GetTicks(); if (rd_ctx->paused) rd_ctx->pause_start = now; else { rd_ctx->pause_time += now - rd_ctx->pause_start; rd_ctx->pause_start = 0; rd_ctx->last_ticks = now; } out: SDL_UnlockMutex(rd_ctx->lock); } /** * Query pause state */ static int dp_rd_ctx_is_paused(Dav2dPlayRenderContext *rd_ctx) { int ret; SDL_LockMutex(rd_ctx->lock); ret = rd_ctx->paused; SDL_UnlockMutex(rd_ctx->lock); return ret; } /** * Request seeking, in seconds */ static void dp_rd_ctx_seek(Dav2dPlayRenderContext *rd_ctx, int sec) { SDL_LockMutex(rd_ctx->lock); rd_ctx->seek += sec; if (!rd_ctx->paused) rd_ctx->pause_start = SDL_GetTicks(); rd_ctx->paused = 1; SDL_UnlockMutex(rd_ctx->lock); } static int decode_frame(Dav2dPicture **p, Dav2dContext *c, Dav2dData *data, DemuxerContext *in_ctx); static inline void destroy_pic(void *a); /** * Seek the stream, if requested */ 
/*
 * Performs a pending seek (rd_ctx->seek != 0) while holding rd_ctx->lock for
 * the whole operation. Returns 0 when no seek was pending or the seek
 * succeeded, otherwise the non-zero result of the failing call.
 * On exit the pending seek amount is always reset to 0 and the effective
 * pause state is restored from the user-requested one.
 */
static int dp_rd_ctx_handle_seek(Dav2dPlayRenderContext *rd_ctx,
                                 DemuxerContext *in_ctx,
                                 Dav2dContext *c,
                                 Dav2dData *data)
{
    int res = 0;
    SDL_LockMutex(rd_ctx->lock);
    if (!rd_ctx->seek)
        goto out;
    // Seek amount is kept in seconds; presumably PTS values are in
    // nanoseconds given this scale factor -- TODO confirm against TS_TO_PTS
    int64_t seek = rd_ctx->seek * 1000000000ULL;
    uint64_t pts = TS_TO_PTS(rd_ctx->current_ts);
    // Clamp to 0 when seeking backwards past the start
    pts = ((int64_t)pts > -seek) ? pts + seek : 0;
    // Clamp to the last frame when seeking past the end
    int end = pts >= FRAME_OFFSET_TO_PTS(rd_ctx->total);
    if (end)
        pts = FRAME_OFFSET_TO_PTS(rd_ctx->total - 1);
    uint64_t target_pts = pts;
    dav2d_flush(c);
    // Step backwards in chunks of 5 frames until a packet that parses as a
    // sequence header is found at or before the requested position
    uint64_t shift = FRAME_OFFSET_TO_PTS(5);
    while (1) {
        if (shift > pts)
            shift = pts;
        if ((res = input_seek(in_ctx, pts - shift)))
            goto out;
        Dav2dSequenceHeader seq;
        uint64_t cur_pts;
        // Read packets until one parses as a sequence header, or until the
        // current search position is reached
        do {
            if ((res = input_read(in_ctx, data)))
                break;
            cur_pts = TS_TO_PTS(data->m.timestamp);
            res = dav2d_parse_sequence_header(&seq, data->data, data->sz);
        } while (res && cur_pts < pts);
        if (!res && cur_pts <= pts)
            break;
        // NOTE(review): once pts has reached 0 without a match, shift becomes
        // 0 and this loop has no other exit -- confirm this cannot happen
        if (shift > pts)
            shift = pts;
        pts -= shift;
    }
    if (!res) {
        // Decode forward from the found position, dropping every picture
        // before the target and queueing the first one at or after it
        pts = TS_TO_PTS(data->m.timestamp);
        while (pts < target_pts) {
            Dav2dPicture *p;
            if ((res = decode_frame(&p, c, data, in_ctx)))
                break;
            if (p) {
                pts = TS_TO_PTS(p->m.timestamp);
                if (pts < target_pts)
                    destroy_pic(p);
                else {
                    dp_fifo_push(rd_ctx->fifo, p);
                    uint32_t type = rd_ctx->event_types + DAV2D_EVENT_SEEK_FRAME;
                    dp_rd_ctx_post_event(rd_ctx, type);
                }
            }
        }
        if (!res) {
            // Re-base the render timing on the packet timestamp after the seek
            rd_ctx->last_ts = data->m.timestamp - rd_ctx->spf / rd_ctx->timebase;
            rd_ctx->current_ts = data->m.timestamp;
        }
    }
out:
    rd_ctx->paused = rd_ctx->user_paused;
    // Only when a seek was actually pending: account for the time spent
    // paused by the seek before playback resumes
    if (!rd_ctx->paused && rd_ctx->seek) {
        uint32_t now = SDL_GetTicks();
        rd_ctx->pause_time += now - rd_ctx->pause_start;
        rd_ctx->pause_start = 0;
        rd_ctx->last_ticks = now;
    }
    rd_ctx->seek = 0;
    SDL_UnlockMutex(rd_ctx->lock);
    if (res)
        fprintf(stderr, "Error seeking, aborting\n");
    return res;
}

/**
 * Terminate decoder thread (async)
 *
 * Only sets the termination flag under the context lock; the decoder thread
 * is expected to poll it.
 */
static void dp_rd_ctx_request_shutdown(Dav2dPlayRenderContext *rd_ctx)
{
    SDL_LockMutex(rd_ctx->lock);
    rd_ctx->dec_should_terminate = 1;
    SDL_UnlockMutex(rd_ctx->lock);
}

/**
 * Query state of decoder
shutdown request */ static int dp_rd_ctx_should_terminate(Dav2dPlayRenderContext *rd_ctx) { int ret = 0; SDL_LockMutex(rd_ctx->lock); ret = rd_ctx->dec_should_terminate; SDL_UnlockMutex(rd_ctx->lock); return ret; } /** * Render the currently available texture * * Renders the currently available texture, if any. */ static void dp_rd_ctx_render(Dav2dPlayRenderContext *rd_ctx) { SDL_LockMutex(rd_ctx->lock); // Calculate time since last frame was received uint32_t ticks_now = SDL_GetTicks(); uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0; // Calculate when to display the frame int64_t ts_diff = rd_ctx->current_ts - rd_ctx->last_ts; int32_t pts_diff = (ts_diff * rd_ctx->timebase) * 1000.0 + .5; int32_t wait_time = pts_diff - ticks_diff; // In untimed mode, simply don't wait if (rd_ctx->settings.untimed) wait_time = 0; // This way of timing the playback is not accurate, as there is no guarantee // that SDL_Delay will wait for exactly the requested amount of time so in a // accurate player this would need to be done in a better way. if (wait_time > 0) { SDL_Delay(wait_time); } else if (wait_time < -10 && !rd_ctx->paused) { // Do not warn for minor time drifts fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time / 1000.0); } renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings); rd_ctx->last_ts = rd_ctx->current_ts; rd_ctx->last_ticks = SDL_GetTicks(); SDL_UnlockMutex(rd_ctx->lock); } static int decode_frame(Dav2dPicture **p, Dav2dContext *c, Dav2dData *data, DemuxerContext *in_ctx) { int res; // Send data packets we got from the demuxer to dav2d if ((res = dav2d_send_data(c, data)) < 0) { // On EAGAIN, dav2d can not consume more data and // dav2d_get_picture needs to be called first, which // will happen below, so just keep going in that case // and do not error out. 
if (res != DAV2D_ERR(EAGAIN)) { dav2d_data_unref(data); goto err; } } *p = calloc(1, sizeof(**p)); // Try to get a decoded frame if ((res = dav2d_get_picture(c, *p)) < 0) { // In all error cases, even EAGAIN, p needs to be freed as // it is never added to the queue and would leak. free(*p); *p = NULL; // On EAGAIN, it means dav2d has not enough data to decode // therefore this is not a decoding error but just means // we need to feed it more data, which happens in the next // run of the decoder loop. if (res != DAV2D_ERR(EAGAIN)) goto err; } return data->sz == 0 ? input_read(in_ctx, data) : 0; err: fprintf(stderr, "Error decoding frame: %s\n", strerror(-res)); return res; } static inline void destroy_pic(void *a) { Dav2dPicture *p = (Dav2dPicture *)a; dav2d_picture_unref(p); free(p); } /* Decoder thread "main" function */ static int decoder_thread_main(void *cookie) { Dav2dPlayRenderContext *rd_ctx = cookie; Dav2dPicture *p; Dav2dContext *c = NULL; Dav2dData data; DemuxerContext *in_ctx = NULL; int res = 0; unsigned total, timebase[2], fps[2]; Dav2dPlaySettings settings = rd_ctx->settings; if ((res = input_open(&in_ctx, "ivf", settings.inputfile, fps, &total, timebase)) < 0) { fprintf(stderr, "Failed to open demuxer\n"); res = 1; goto cleanup; } rd_ctx->timebase = (double)timebase[1] / timebase[0]; rd_ctx->spf = (double)fps[1] / fps[0]; rd_ctx->total = total; if ((res = dav2d_open(&c, &rd_ctx->lib_settings))) { fprintf(stderr, "Failed opening dav2d decoder\n"); res = 1; goto cleanup; } if ((res = input_read(in_ctx, &data)) < 0) { fprintf(stderr, "Failed demuxing input\n"); res = 1; goto cleanup; } // Decoder loop while (1) { if (dp_rd_ctx_should_terminate(rd_ctx) || (res = dp_rd_ctx_handle_seek(rd_ctx, in_ctx, c, &data)) || (res = decode_frame(&p, c, &data, in_ctx))) { break; } else if (p) { // Queue frame SDL_LockMutex(rd_ctx->lock); int seek = rd_ctx->seek; SDL_UnlockMutex(rd_ctx->lock); if (!seek) { dp_fifo_push(rd_ctx->fifo, p); uint32_t type = 
rd_ctx->event_types + DAV2D_EVENT_NEW_FRAME; dp_rd_ctx_post_event(rd_ctx, type); } } } // Release remaining data if (data.sz > 0) dav2d_data_unref(&data); // Do not drain in case an error occured and caused us to leave the // decoding loop early. if (res < 0) goto cleanup; // Drain decoder // When there is no more data to feed to the decoder, for example // because the file ended, we still need to request pictures, as // even though we do not have more data, there can be frames decoded // from data we sent before. So we need to call dav2d_get_picture until // we get an EAGAIN error. do { if (dp_rd_ctx_should_terminate(rd_ctx)) break; p = calloc(1, sizeof(*p)); res = dav2d_get_picture(c, p); if (res < 0) { free(p); if (res != DAV2D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(-res)); break; } } else { // Queue frame dp_fifo_push(rd_ctx->fifo, p); uint32_t type = rd_ctx->event_types + DAV2D_EVENT_NEW_FRAME; dp_rd_ctx_post_event(rd_ctx, type); } } while (res != DAV2D_ERR(EAGAIN)); cleanup: dp_rd_ctx_post_event(rd_ctx, rd_ctx->event_types + DAV2D_EVENT_DEC_QUIT); if (in_ctx) input_close(in_ctx); if (c) dav2d_close(&c); return (res != DAV2D_ERR(EAGAIN) && res < 0); } int main(int argc, char **argv) { SDL_Thread *decoder_thread; // Check for version mismatch between library and tool const char *version = dav2d_version(); if (strcmp(version, DAV2D_VERSION)) { fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n", version, DAV2D_VERSION); return 1; } // Create render context Dav2dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv); if (rd_ctx == NULL) { fprintf(stderr, "Failed creating render context\n"); return 5; } if (rd_ctx->settings.zerocopy) { if (renderer_info->alloc_pic) { rd_ctx->lib_settings.allocator = (Dav2dPicAllocator) { .cookie = rd_ctx->rd_priv, .alloc_picture_callback = renderer_info->alloc_pic, .release_picture_callback = renderer_info->release_pic, }; } else { fprintf(stderr, "--zerocopy unsupported by selected 
renderer\n"); } } if (rd_ctx->settings.gpugrain) { if (renderer_info->supports_gpu_grain) { rd_ctx->lib_settings.apply_grain = 0; } else { fprintf(stderr, "--gpugrain unsupported by selected renderer\n"); } } // Start decoder thread decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx); // Main loop #define NUM_MAX_EVENTS 8 SDL_Event events[NUM_MAX_EVENTS]; int num_frame_events = 0; uint32_t start_time = 0, n_out = 0; while (1) { int num_events = 0; SDL_WaitEvent(NULL); while (num_events < NUM_MAX_EVENTS && SDL_PollEvent(&events[num_events++])) break; for (int i = 0; i < num_events; ++i) { SDL_Event *e = &events[i]; if (e->type == SDL_QUIT) { dp_rd_ctx_request_shutdown(rd_ctx); dp_fifo_flush(rd_ctx->fifo, destroy_pic); goto out; } else if (e->type == SDL_WINDOWEVENT) { if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { // TODO: Handle window resizes } else if(e->window.event == SDL_WINDOWEVENT_EXPOSED) { dp_rd_ctx_render(rd_ctx); } } else if (e->type == SDL_KEYDOWN) { SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e; if (kbde->keysym.sym == SDLK_SPACE) { dp_rd_ctx_toggle_pause(rd_ctx); } else if (kbde->keysym.sym == SDLK_ESCAPE) { dp_rd_ctx_request_shutdown(rd_ctx); dp_fifo_flush(rd_ctx->fifo, destroy_pic); goto out; } else if (kbde->keysym.sym == SDLK_LEFT || kbde->keysym.sym == SDLK_RIGHT) { if (kbde->keysym.sym == SDLK_LEFT) dp_rd_ctx_seek(rd_ctx, -5); else if (kbde->keysym.sym == SDLK_RIGHT) dp_rd_ctx_seek(rd_ctx, +5); dp_fifo_flush(rd_ctx->fifo, destroy_pic); SDL_FlushEvent(rd_ctx->event_types + DAV2D_EVENT_NEW_FRAME); num_frame_events = 0; } } else if (e->type == rd_ctx->event_types + DAV2D_EVENT_NEW_FRAME) { num_frame_events++; // Store current ticks for stats calculation if (start_time == 0) start_time = SDL_GetTicks(); } else if (e->type == rd_ctx->event_types + DAV2D_EVENT_SEEK_FRAME) { // Dequeue frame and update the render context with it Dav2dPicture *p = dp_fifo_shift(rd_ctx->fifo); // Do not update textures during 
termination if (!dp_rd_ctx_should_terminate(rd_ctx)) { dp_rd_ctx_update_with_dav2d_picture(rd_ctx, p); n_out++; } destroy_pic(p); } else if (e->type == rd_ctx->event_types + DAV2D_EVENT_DEC_QUIT) { goto out; } } if (num_frame_events && !dp_rd_ctx_is_paused(rd_ctx)) { // Dequeue frame and update the render context with it Dav2dPicture *p = dp_fifo_shift(rd_ctx->fifo); // Do not update textures during termination if (!dp_rd_ctx_should_terminate(rd_ctx)) { dp_rd_ctx_update_with_dav2d_picture(rd_ctx, p); dp_rd_ctx_render(rd_ctx); n_out++; } destroy_pic(p); num_frame_events--; } } out:; // Print stats uint32_t time_ms = SDL_GetTicks() - start_time - rd_ctx->pause_time; printf("Decoded %u frames in %d seconds, avg %.02f fps\n", n_out, time_ms / 1000, n_out/ (time_ms / 1000.0)); int decoder_ret = 0; SDL_WaitThread(decoder_thread, &decoder_ret); dp_rd_ctx_destroy(rd_ctx); SDL_Quit(); return decoder_ret; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dp_fifo.c000066400000000000000000000101131517466257200234300ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
#include
#include
#include "dp_fifo.h"

// FIFO structure: a fixed-capacity array of pointers guarded by one mutex
// and one condition variable
struct dp_fifo {
    SDL_mutex *lock;        // protects every field below
    SDL_cond *cond_change;  // signalled on empty->non-empty, full->non-full
                            // and during the flush handshake
    size_t capacity;        // number of slots in entries
    size_t count;           // number of slots currently in use
    void **entries;         // entries[0] is the oldest element
    int push_wait;          // non-zero while a pusher is blocked on a full FIFO
    int flush;              // non-zero while dp_fifo_flush is in progress
};

// Create a FIFO with room for `capacity` pointers.
// Returns NULL if capacity is 0 or if any allocation fails.
Dav2dPlayPtrFifo *dp_fifo_create(size_t capacity)
{
    Dav2dPlayPtrFifo *fifo;
    assert(capacity > 0);
    // Same check again for builds where assert() is compiled out
    if (capacity <= 0)
        return NULL;
    fifo = malloc(sizeof(*fifo));
    if (fifo == NULL)
        return NULL;
    fifo->capacity = capacity;
    fifo->count = 0;
    fifo->push_wait = 0;
    fifo->flush = 0;
    fifo->lock = SDL_CreateMutex();
    if (fifo->lock == NULL) {
        free(fifo);
        return NULL;
    }
    fifo->cond_change = SDL_CreateCond();
    if (fifo->cond_change == NULL) {
        SDL_DestroyMutex(fifo->lock);
        free(fifo);
        return NULL;
    }
    fifo->entries = calloc(capacity, sizeof(void*));
    if (fifo->entries == NULL) {
        // All other members are set up, so the regular destructor can be used
        dp_fifo_destroy(fifo);
        return NULL;
    }
    return fifo;
}

// Destroy FIFO. The FIFO must be empty; remaining elements are not released.
void dp_fifo_destroy(Dav2dPlayPtrFifo *fifo)
{
    assert(fifo->count == 0);
    SDL_DestroyMutex(fifo->lock);
    SDL_DestroyCond(fifo->cond_change);
    free(fifo->entries);
    free(fifo);
}

// Push to FIFO. Blocks while the FIFO is full.
// If a flush happens while blocked, the call returns WITHOUT storing
// `element`.
// NOTE(review): in that case the element is neither queued nor released
// here -- confirm who owns it.
void dp_fifo_push(Dav2dPlayPtrFifo *fifo, void *element)
{
    SDL_LockMutex(fifo->lock);
    while (fifo->count == fifo->capacity) {
        fifo->push_wait = 1;
        SDL_CondWait(fifo->cond_change, fifo->lock);
        fifo->push_wait = 0;
        if (fifo->flush) {
            // Acknowledge the flush: wake the flusher waiting in
            // dp_fifo_flush and give up on this push
            SDL_CondSignal(fifo->cond_change);
            SDL_UnlockMutex(fifo->lock);
            return;
        }
    }
    fifo->entries[fifo->count++] = element;
    // Wake a consumer that may be waiting on an empty FIFO
    if (fifo->count == 1)
        SDL_CondSignal(fifo->cond_change);
    SDL_UnlockMutex(fifo->lock);
}

// Helper that shifts the FIFO array: returns arr[0] and moves the remaining
// len - 1 elements one slot towards the front
static void *dp_fifo_array_shift(void **arr, size_t len)
{
    void *shifted_element = arr[0];
    for (size_t i = 1; i < len; ++i)
        arr[i-1] = arr[i];
    return shifted_element;
}

// Get item from FIFO. Blocks while the FIFO is empty.
void *dp_fifo_shift(Dav2dPlayPtrFifo *fifo)
{
    SDL_LockMutex(fifo->lock);
    while (fifo->count == 0)
        SDL_CondWait(fifo->cond_change, fifo->lock);
    // Post-decrement: the helper still sees the old element count
    void *res = dp_fifo_array_shift(fifo->entries, fifo->count--);
    // The FIFO was full before this call: wake a blocked pusher
    if (fifo->count == fifo->capacity - 1)
        SDL_CondSignal(fifo->cond_change);
    SDL_UnlockMutex(fifo->lock);
    return res;
}

// Remove all elements from the FIFO, passing each one to destroy_elem.
// If a pusher is currently blocked on a full FIFO it is woken up first and
// this call waits until that pusher has acknowledged the flush.
void dp_fifo_flush(Dav2dPlayPtrFifo *fifo, void (*destroy_elem)(void *))
{
    SDL_LockMutex(fifo->lock);
    fifo->flush = 1;
    if (fifo->push_wait) {
        // Handshake with the blocked pusher (see dp_fifo_push)
        SDL_CondSignal(fifo->cond_change);
        SDL_CondWait(fifo->cond_change, fifo->lock);
    }
    // Elements are destroyed newest first
    while (fifo->count)
        destroy_elem(fifo->entries[--fifo->count]);
    fifo->flush = 0;
    SDL_UnlockMutex(fifo->lock);
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dp_fifo.h000066400000000000000000000045301517466257200234430ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Dav2dPlay FIFO helper */ typedef struct dp_fifo Dav2dPlayPtrFifo; /* Create a FIFO * * Creates a FIFO with the given capacity. * If the capacity is reached, new inserts into the FIFO * will block until enough space is available again. */ Dav2dPlayPtrFifo *dp_fifo_create(size_t capacity); /* Destroy a FIFO * * The FIFO must be empty before it is destroyed! */ void dp_fifo_destroy(Dav2dPlayPtrFifo *fifo); /* Shift FIFO * * Return the first item from the FIFO, thereby removing it from * the FIFO and making room for new entries. */ void *dp_fifo_shift(Dav2dPlayPtrFifo *fifo); /* Push to FIFO * * Add an item to the end of the FIFO. * If the FIFO is full, this call will block until there is again enough * space in the FIFO, so calling this from the "consumer" thread if no * other thread will call dp_fifo_shift will lead to a deadlock. */ void dp_fifo_push(Dav2dPlayPtrFifo *fifo, void *element); void dp_fifo_flush(Dav2dPlayPtrFifo *fifo, void (*destroy_elem)(void *)); dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dp_renderer.h000066400000000000000000000110311517466257200243200ustar00rootroot00000000000000/* * Copyright © 2020, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "dav2d/dav2d.h" #include #if HAVE_PLACEBO # include #endif // Check libplacebo Vulkan rendering #if HAVE_VULKAN && defined(SDL_VIDEO_VULKAN) # if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN # define HAVE_RENDERER_PLACEBO 1 # define HAVE_PLACEBO_VULKAN 1 # endif #endif // Check libplacebo OpenGL rendering #if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL # define HAVE_RENDERER_PLACEBO 1 # define HAVE_PLACEBO_OPENGL 1 #endif #ifndef HAVE_RENDERER_PLACEBO #define HAVE_RENDERER_PLACEBO 0 #endif #ifndef HAVE_PLACEBO_VULKAN #define HAVE_PLACEBO_VULKAN 0 #endif #ifndef HAVE_PLACEBO_OPENGL #define HAVE_PLACEBO_OPENGL 0 #endif /** * Settings structure * Hold all settings available for the player, * this is usually filled by parsing arguments * from the console. 
*/ typedef struct { const char *inputfile; const char *renderer_name; int highquality; int untimed; int zerocopy; int gpugrain; int fullscreen; } Dav2dPlaySettings; #define WINDOW_WIDTH 910 #define WINDOW_HEIGHT 512 enum { DAV2D_EVENT_NEW_FRAME, DAV2D_EVENT_SEEK_FRAME, DAV2D_EVENT_DEC_QUIT }; /** * Renderer info */ typedef struct rdr_info { // Renderer name const char *name; // Cookie passed to the renderer implementation callbacks void *cookie; // Callback to create the renderer void* (*create_renderer)(const Dav2dPlaySettings *settings); // Callback to destroy the renderer void (*destroy_renderer)(void *cookie); // Callback to the render function that renders a prevously sent frame void (*render)(void *cookie, const Dav2dPlaySettings *settings); // Callback to the send frame function, _may_ also unref dav2d_pic! int (*update_frame)(void *cookie, Dav2dPicture *dav2d_pic, const Dav2dPlaySettings *settings); // Callback for alloc/release pictures (optional) int (*alloc_pic)(Dav2dPicture *pic, void *cookie); void (*release_pic)(Dav2dPicture *pic, void *cookie); // Whether or not this renderer can apply on-GPU film grain synthesis int supports_gpu_grain; } Dav2dPlayRenderInfo; extern const Dav2dPlayRenderInfo rdr_placebo_vk; extern const Dav2dPlayRenderInfo rdr_placebo_gl; extern const Dav2dPlayRenderInfo rdr_sdl; // Available renderes ordered by priority static const Dav2dPlayRenderInfo* const dp_renderers[] = { &rdr_placebo_vk, &rdr_placebo_gl, &rdr_sdl, }; static inline const Dav2dPlayRenderInfo *dp_get_renderer(const char *name) { for (size_t i = 0; i < (sizeof(dp_renderers)/sizeof(*dp_renderers)); ++i) { if (dp_renderers[i]->name == NULL) continue; if (name == NULL || strcmp(name, dp_renderers[i]->name) == 0) { return dp_renderers[i]; } } return NULL; } static inline SDL_Window *dp_create_sdl_window(int window_flags) { SDL_Window *win; window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI; win = SDL_CreateWindow("Dav2dPlay", SDL_WINDOWPOS_CENTERED, 
SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, window_flags); if (!win) return NULL; SDL_SetWindowResizable(win, SDL_TRUE); return win; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dp_renderer_placebo.c000066400000000000000000000316541517466257200260150ustar00rootroot00000000000000/* * Copyright © 2020, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "dp_renderer.h" #if HAVE_RENDERER_PLACEBO #include #include #include #if HAVE_PLACEBO_VULKAN # include # include #endif #if HAVE_PLACEBO_OPENGL # include # include #endif /** * Renderer context for libplacebo */ typedef struct renderer_priv_ctx { // SDL window SDL_Window *win; // Placebo log pl_log log; // Placebo renderer pl_renderer renderer; #if HAVE_PLACEBO_VULKAN // Placebo Vulkan handle pl_vulkan vk; // Placebo Vulkan instance pl_vk_inst vk_inst; // Vulkan surface VkSurfaceKHR surf; #endif #if HAVE_PLACEBO_OPENGL // Placebo OpenGL handle pl_opengl gl; // SDL OpenGL context SDL_GLContext gl_context; #endif // Placebo GPU pl_gpu gpu; // Placebo swapchain pl_swapchain swapchain; // Lock protecting access to the texture SDL_mutex *lock; // Image to render, and planes backing them struct pl_frame image; pl_tex plane_tex[3]; } Dav2dPlayRendererPrivateContext; static Dav2dPlayRendererPrivateContext* placebo_renderer_create_common(const Dav2dPlaySettings *settings, int window_flags) { if (settings->fullscreen) window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP; // Create Window SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE); if (sdlwin == NULL) { fprintf(stderr, "Creating SDL window failed: %s\n", SDL_GetError()); return NULL; } SDL_ShowCursor(0); // Alloc Dav2dPlayRendererPrivateContext *const rd_priv_ctx = calloc(1, sizeof(Dav2dPlayRendererPrivateContext)); if (rd_priv_ctx == NULL) { fprintf(stderr, "Out of memory!\n"); return NULL; } rd_priv_ctx->win = sdlwin; // Init libplacebo rd_priv_ctx->log = pl_log_create(PL_API_VER, pl_log_params( .log_cb = pl_log_color, #ifndef NDEBUG .log_level = PL_LOG_DEBUG, #else .log_level = PL_LOG_WARN, #endif )); if (rd_priv_ctx->log == NULL) { fprintf(stderr, "pl_log_create failed!\n"); free(rd_priv_ctx); return NULL; } // Create Mutex rd_priv_ctx->lock = SDL_CreateMutex(); if (rd_priv_ctx->lock == NULL) { fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); 
pl_log_destroy(&rd_priv_ctx->log); free(rd_priv_ctx); return NULL; } return rd_priv_ctx; } #if HAVE_PLACEBO_OPENGL static void *placebo_renderer_create_gl(const Dav2dPlaySettings *settings) { SDL_Window *sdlwin = NULL; SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 0); SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); // Common init Dav2dPlayRendererPrivateContext *rd_priv_ctx = placebo_renderer_create_common(settings, SDL_WINDOW_OPENGL); if (rd_priv_ctx == NULL) return NULL; sdlwin = rd_priv_ctx->win; rd_priv_ctx->gl_context = SDL_GL_CreateContext(sdlwin); if (!rd_priv_ctx->gl_context) { fprintf(stderr, "Failed creating opengl context: %s\n", SDL_GetError()); exit(2); } SDL_GL_MakeCurrent(sdlwin, rd_priv_ctx->gl_context); rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->log, pl_opengl_params( .allow_software = true, #ifndef NDEBUG .debug = true, #endif )); if (!rd_priv_ctx->gl) { fprintf(stderr, "Failed creating opengl device!\n"); exit(2); } rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl, pl_opengl_swapchain_params( .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow, .priv = sdlwin, )); if (!rd_priv_ctx->swapchain) { fprintf(stderr, "Failed creating opengl swapchain!\n"); exit(2); } int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; SDL_GL_GetDrawableSize(sdlwin, &w, &h); if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { fprintf(stderr, "Failed resizing vulkan swapchain!\n"); exit(2); } rd_priv_ctx->gpu = rd_priv_ctx->gl->gpu; if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) printf("Note: window dimensions differ (got %dx%d)\n", w, h); return rd_priv_ctx; } #endif #if HAVE_PLACEBO_VULKAN static void *placebo_renderer_create_vk(const Dav2dPlaySettings *settings) { SDL_Window *sdlwin = NULL; // Common init Dav2dPlayRendererPrivateContext *rd_priv_ctx = placebo_renderer_create_common(settings, 
SDL_WINDOW_VULKAN); if (rd_priv_ctx == NULL) return NULL; sdlwin = rd_priv_ctx->win; // Init Vulkan unsigned num = 0; if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) { fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError()); exit(1); } const char **extensions = malloc(num * sizeof(const char *)); assert(extensions); SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions); if (!ok) { fprintf(stderr, "Failed getting Vk instance extensions\n"); exit(1); } if (num > 0) { printf("Requesting %d additional Vulkan extensions:\n", num); for (unsigned i = 0; i < num; i++) printf(" %s\n", extensions[i]); } rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->log, pl_vk_inst_params( .extensions = extensions, .num_extensions = num, )); if (!rd_priv_ctx->vk_inst) { fprintf(stderr, "Failed creating Vulkan instance!\n"); exit(1); } free(extensions); if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) { fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError()); exit(1); } rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->log, pl_vulkan_params( .instance = rd_priv_ctx->vk_inst->instance, .surface = rd_priv_ctx->surf, .allow_software = true, )); if (!rd_priv_ctx->vk) { fprintf(stderr, "Failed creating vulkan device!\n"); exit(2); } // Create swapchain rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk, pl_vulkan_swapchain_params( .surface = rd_priv_ctx->surf, .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR, )); if (!rd_priv_ctx->swapchain) { fprintf(stderr, "Failed creating vulkan swapchain!\n"); exit(2); } int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { fprintf(stderr, "Failed resizing vulkan swapchain!\n"); exit(2); } rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu; if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) printf("Note: window dimensions differ (got %dx%d)\n", w, h); return rd_priv_ctx; } #endif static void 
placebo_renderer_destroy(void *cookie) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); pl_renderer_destroy(&(rd_priv_ctx->renderer)); pl_swapchain_destroy(&(rd_priv_ctx->swapchain)); for (int i = 0; i < 3; i++) pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i])); #if HAVE_PLACEBO_VULKAN if (rd_priv_ctx->vk) { pl_vulkan_destroy(&(rd_priv_ctx->vk)); vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); } #endif #if HAVE_PLACEBO_OPENGL if (rd_priv_ctx->gl) pl_opengl_destroy(&(rd_priv_ctx->gl)); if (rd_priv_ctx->gl_context) SDL_GL_DeleteContext(rd_priv_ctx->gl_context); #endif SDL_DestroyWindow(rd_priv_ctx->win); pl_log_destroy(&rd_priv_ctx->log); } static void placebo_render(void *cookie, const Dav2dPlaySettings *settings) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); if (!rd_priv_ctx->image.num_planes) { SDL_UnlockMutex(rd_priv_ctx->lock); return; } // Prepare rendering if (rd_priv_ctx->renderer == NULL) { rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->log, rd_priv_ctx->gpu); } struct pl_swapchain_frame frame; bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame); if (!ok) { SDL_UnlockMutex(rd_priv_ctx->lock); return; } struct pl_frame target; pl_frame_from_swapchain(&target, &frame); pl_rect2df_aspect_copy(&target.crop, &rd_priv_ctx->image.crop, 0.0); if (pl_frame_is_cropped(&target)) pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 0.0 }); if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, settings->highquality ? 
&pl_render_default_params : &pl_render_fast_params)) { fprintf(stderr, "Failed rendering frame!\n"); pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 1.0 }); } ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); if (!ok) { fprintf(stderr, "Failed submitting frame!\n"); SDL_UnlockMutex(rd_priv_ctx->lock); return; } pl_swapchain_swap_buffers(rd_priv_ctx->swapchain); SDL_UnlockMutex(rd_priv_ctx->lock); } static int placebo_upload_image(void *cookie, Dav2dPicture *dav2d_pic, const Dav2dPlaySettings *settings) { Dav2dPlayRendererPrivateContext *p = cookie; assert(p != NULL); int ret = 0; if (!dav2d_pic) return ret; SDL_LockMutex(p->lock); if (!pl_upload_dav2dpicture(p->gpu, &p->image, p->plane_tex, pl_dav2d_upload_params( .picture = dav2d_pic, .film_grain = settings->gpugrain, .gpu_allocated = settings->zerocopy, .asynchronous = true, ))) { fprintf(stderr, "Failed uploading planes!\n"); p->image = (struct pl_frame) {0}; ret = -1; } SDL_UnlockMutex(p->lock); return ret; } static int placebo_alloc_pic(Dav2dPicture *const pic, void *cookie) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); int ret = pl_allocate_dav2dpicture(pic, (void *) rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); return ret; } static void placebo_release_pic(Dav2dPicture *pic, void *cookie) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); pl_release_dav2dpicture(pic, (void *) rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); } #if HAVE_PLACEBO_VULKAN const Dav2dPlayRenderInfo rdr_placebo_vk = { .name = "placebo-vk", .create_renderer = placebo_renderer_create_vk, .destroy_renderer = placebo_renderer_destroy, .render = placebo_render, .update_frame = placebo_upload_image, .alloc_pic = placebo_alloc_pic, .release_pic = placebo_release_pic, .supports_gpu_grain = 1, }; #else const Dav2dPlayRenderInfo rdr_placebo_vk = { NULL }; #endif #if 
HAVE_PLACEBO_OPENGL const Dav2dPlayRenderInfo rdr_placebo_gl = { .name = "placebo-gl", .create_renderer = placebo_renderer_create_gl, .destroy_renderer = placebo_renderer_destroy, .render = placebo_render, .update_frame = placebo_upload_image, .supports_gpu_grain = 1, }; #else const Dav2dPlayRenderInfo rdr_placebo_gl = { NULL }; #endif #else const Dav2dPlayRenderInfo rdr_placebo_vk = { NULL }; const Dav2dPlayRenderInfo rdr_placebo_gl = { NULL }; #endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/dp_renderer_sdl.c000066400000000000000000000126511517466257200251660ustar00rootroot00000000000000/* * Copyright © 2020, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "dp_renderer.h" #include /** * Renderer context for SDL */ typedef struct renderer_priv_ctx { // SDL window SDL_Window *win; // SDL renderer SDL_Renderer *renderer; // Lock protecting access to the texture SDL_mutex *lock; // Texture to render SDL_Texture *tex; } Dav2dPlayRendererPrivateContext; static void *sdl_renderer_create(const Dav2dPlaySettings *settings) { int window_flags = 0; if (settings->fullscreen) window_flags |= SDL_WINDOW_FULLSCREEN_DESKTOP; SDL_Window *win = dp_create_sdl_window(window_flags); if (win == NULL) { fprintf(stderr, "Creating SDL window failed: %s\n", SDL_GetError()); return NULL; } SDL_ShowCursor(0); // Alloc Dav2dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav2dPlayRendererPrivateContext)); if (rd_priv_ctx == NULL) { fprintf(stderr, "Out of memory!\n"); return NULL; } rd_priv_ctx->win = win; // Create renderer rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED); // Set scale quality SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear"); // Create Mutex rd_priv_ctx->lock = SDL_CreateMutex(); if (rd_priv_ctx->lock == NULL) { fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); free(rd_priv_ctx); return NULL; } rd_priv_ctx->tex = NULL; return rd_priv_ctx; } static void sdl_renderer_destroy(void *cookie) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_DestroyTexture(rd_priv_ctx->tex); SDL_DestroyRenderer(rd_priv_ctx->renderer); SDL_DestroyWindow(rd_priv_ctx->win); SDL_DestroyMutex(rd_priv_ctx->lock); free(rd_priv_ctx); } static void sdl_render(void *cookie, const Dav2dPlaySettings *settings) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); if (rd_priv_ctx->tex == NULL) { SDL_UnlockMutex(rd_priv_ctx->lock); return; } // Display the frame SDL_RenderClear(rd_priv_ctx->renderer); SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL); 
SDL_RenderPresent(rd_priv_ctx->renderer); SDL_UnlockMutex(rd_priv_ctx->lock); } static int sdl_update_texture(void *cookie, Dav2dPicture *dav2d_pic, const Dav2dPlaySettings *settings) { Dav2dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); if (dav2d_pic == NULL) { rd_priv_ctx->tex = NULL; SDL_UnlockMutex(rd_priv_ctx->lock); return 0; } int width = dav2d_pic->p.w; int height = dav2d_pic->p.h; int tex_w = width; int tex_h = height; enum Dav2dPixelLayout dav2d_layout = dav2d_pic->p.layout; if (DAV2D_PIXEL_LAYOUT_I420 != dav2d_layout || dav2d_pic->p.bpc != 8) { fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); exit(50); } SDL_Texture *texture = rd_priv_ctx->tex; if (texture != NULL) { SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h); if (tex_w != width || tex_h != height) { SDL_DestroyTexture(texture); texture = NULL; } } if (texture == NULL) { texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, width, height); SDL_RenderSetLogicalSize(rd_priv_ctx->renderer, width, height); } SDL_UpdateYUVTexture(texture, NULL, dav2d_pic->data[0], (int)dav2d_pic->stride[0], // Y dav2d_pic->data[1], (int)dav2d_pic->stride[1], // U dav2d_pic->data[2], (int)dav2d_pic->stride[1] // V ); rd_priv_ctx->tex = texture; SDL_UnlockMutex(rd_priv_ctx->lock); return 0; } const Dav2dPlayRenderInfo rdr_sdl = { .name = "sdl", .create_renderer = sdl_renderer_create, .destroy_renderer = sdl_renderer_destroy, .render = sdl_render, .update_frame = sdl_update_texture }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/examples/meson.build000066400000000000000000000054471517466257200240360ustar00rootroot00000000000000# Copyright © 2018, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. 
Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# Build definition for the dav2d examples
#

# Leave subdir if examples are disabled
if not get_option('enable_examples')
    subdir_done()
endif


# dav2d player sources
dav2dplay_sources = files(
    'dav2dplay.c',
    'dp_fifo.c',
    'dp_renderer_placebo.c',
    'dp_renderer_sdl.c',
)

# SDL2 is mandatory for the player; libplacebo and Vulkan below are optional
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)

if sdl2_dependency.found()
    dav2dplay_deps = [sdl2_dependency, libm_dependency]
    dav2dplay_cflags = []

    placebo_dependency = dependency('libplacebo', version: '>= 4.160.0', required: false)

    have_vulkan = false
    have_placebo = placebo_dependency.found()
    if have_placebo
        dav2dplay_deps += placebo_dependency

        # If libplacebo is found, we might be able to use Vulkan
        # with it, in which case we need the Vulkan library too.
        vulkan_dependency = dependency('vulkan', required: false)
        if vulkan_dependency.found()
            dav2dplay_deps += vulkan_dependency
            have_vulkan = true
        endif
    endif

    # Both macros are always defined, to 1 or 0
    dav2dplay_cflags += '-DHAVE_PLACEBO=' + (have_placebo ? '1' : '0')
    dav2dplay_cflags += '-DHAVE_VULKAN=' + (have_vulkan ? '1' : '0')

    dav2dplay = executable('dav2dplay',
        dav2dplay_sources,
        rev_target,

        link_with : [libdav2d, dav2d_input_objs],
        include_directories : [dav2d_inc_dirs],
        dependencies : [getopt_dependency, dav2dplay_deps],
        install : true,
        c_args : dav2dplay_cflags,
    )
endif
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/gcovr.cfg000066400000000000000000000001751517466257200216500ustar00rootroot00000000000000
exclude = .*/tests/.*
exclude = .*/tools/.*
exclude = .*/include/common/dump.h
gcov-ignore-parse-errors = negative_hits.warn
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/000077500000000000000000000000001517466257200214675ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/000077500000000000000000000000001517466257200227575ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/attributes.h000066400000000000000000000143411517466257200253210ustar00rootroot00000000000000
/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_COMMON_ATTRIBUTES_H #define DAV2D_COMMON_ATTRIBUTES_H #include "config.h" #include #include #ifndef __has_attribute #define __has_attribute(x) 0 #endif #ifndef __has_feature #define __has_feature(x) 0 #endif #ifdef __GNUC__ #define ATTR_ALIAS __attribute__((may_alias)) #if defined(__MINGW32__) && !defined(__clang__) #define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__gnu_printf__, fmt, attr))) #else #define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))) #endif #define COLD __attribute__((cold)) #else #define ATTR_ALIAS #define ATTR_FORMAT_PRINTF(fmt, attr) #define COLD #endif #if ARCH_X86_64 /* x86-64 needs 32- and 64-byte alignment for AVX2 and AVX-512. */ #define ALIGN_64_VAL 64 #define ALIGN_32_VAL 32 #define ALIGN_16_VAL 16 #elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32 /* ARM doesn't benefit from anything more than 16-byte alignment. */ #define ALIGN_64_VAL 16 #define ALIGN_32_VAL 16 #define ALIGN_16_VAL 16 #else /* No need for extra alignment on platforms without assembly. 
*/ #define ALIGN_64_VAL 8 #define ALIGN_32_VAL 8 #define ALIGN_16_VAL 8 #endif /* * API for variables, struct members (ALIGN()) like: * uint8_t var[1][2][3][4] * becomes: * ALIGN(uint8_t var[1][2][3][4], alignment). */ #ifdef _MSC_VER #define ALIGN(ll, a) \ __declspec(align(a)) ll #else #define ALIGN(line, align) \ line __attribute__((aligned(align))) #endif /* * API for stack alignment (ALIGN_STK_$align()) of variables like: * uint8_t var[1][2][3][4] * becomes: * ALIGN_STK_$align(uint8_t, var, 1, [2][3][4]) */ #define ALIGN_STK_64(type, var, sz1d, sznd) \ ALIGN(type var[sz1d]sznd, ALIGN_64_VAL) #define ALIGN_STK_32(type, var, sz1d, sznd) \ ALIGN(type var[sz1d]sznd, ALIGN_32_VAL) #define ALIGN_STK_16(type, var, sz1d, sznd) \ ALIGN(type var[sz1d]sznd, ALIGN_16_VAL) #define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n))) /* * Forbid inlining of a function: * static NOINLINE void func() {} */ #ifdef _MSC_VER #define NOINLINE __declspec(noinline) #elif __has_attribute(noclone) #define NOINLINE __attribute__((noinline, noclone)) #else #define NOINLINE __attribute__((noinline)) #endif #ifdef _MSC_VER #define ALWAYS_INLINE __forceinline #else #define ALWAYS_INLINE __attribute__((always_inline)) inline #endif #if (defined(__ELF__) || defined(__MACH__) || (defined(_WIN32) && defined(__clang__))) && __has_attribute(visibility) #define EXTERN extern __attribute__((visibility("hidden"))) #else #define EXTERN extern #endif #if ARCH_X86_64 && __has_attribute(model) #define ATTR_MCMODEL_SMALL __attribute__((model("small"))) #else #define ATTR_MCMODEL_SMALL #endif #ifdef __clang__ #define NO_SANITIZE(x) __attribute__((no_sanitize(x))) #else #define NO_SANITIZE(x) #endif #if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__)) #undef assert #define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0) #elif defined(NDEBUG) && defined(_MSC_VER) #undef assert #define assert __assume #endif #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) # define 
dav2d_uninit(x) x=x #else # define dav2d_uninit(x) x #endif #if defined(_MSC_VER) && !defined(__clang__) #include static inline int ctz(const unsigned int mask) { unsigned long idx; _BitScanForward(&idx, mask); return idx; } static inline int clz(const unsigned int mask) { unsigned long leading_zero = 0; _BitScanReverse(&leading_zero, mask); return (31 - leading_zero); } #ifdef _WIN64 static inline int clzll(const unsigned long long mask) { unsigned long leading_zero = 0; _BitScanReverse64(&leading_zero, mask); return (63 - leading_zero); } #else /* _WIN64 */ static inline int clzll(const unsigned long long mask) { if (mask >> 32) return clz((unsigned)(mask >> 32)); else return clz((unsigned)mask) + 32; } #endif /* _WIN64 */ #else /* !_MSC_VER */ static inline int ctz(const unsigned int mask) { return __builtin_ctz(mask); } static inline int clz(const unsigned int mask) { return __builtin_clz(mask); } static inline int clzll(const unsigned long long mask) { return __builtin_clzll(mask); } #endif /* !_MSC_VER */ #ifndef static_assert #define CHECK_OFFSET(type, field, name) \ struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; } #define CHECK_SIZE(type, size) \ struct check_##type##_size { int x[(size == sizeof(type)) ? 1 : -1]; } #else #define CHECK_OFFSET(type, field, name) \ static_assert(name == offsetof(type, field), #field) #define CHECK_SIZE(type, size) \ static_assert(size == sizeof(type), #type) #endif #ifdef _MSC_VER #define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop)) #else #define PACKED(...) __VA_ARGS__ __attribute__((__packed__)) #endif #endif /* DAV2D_COMMON_ATTRIBUTES_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/bitdepth.h000066400000000000000000000064151517466257200247410ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_COMMON_BITDEPTH_H #define DAV2D_COMMON_BITDEPTH_H #include #include #include "common/attributes.h" #if !defined(BITDEPTH) typedef uint8_t pixel; /* can't be void due to pointer-to-array usage */ typedef void coef; #define HIGHBD_DECL_SUFFIX /* nothing */ #define HIGHBD_CALL_SUFFIX /* nothing */ #define HIGHBD_TAIL_SUFFIX /* nothing */ #elif BITDEPTH == 8 typedef uint8_t pixel; typedef int16_t coef; #define PIXEL_TYPE uint8_t #define COEF_TYPE int16_t #define pixel_copy memcpy #define pixel_set memset #define iclip_pixel iclip_u8 #define PIX_HEX_FMT "%02x" #define bitfn(x) x##_8bpc #define BF(x, suffix) x##_8bpc_##suffix #define PXSTRIDE(x) (x) #define highbd_only(x) #define HIGHBD_DECL_SUFFIX /* nothing */ #define HIGHBD_CALL_SUFFIX /* nothing */ #define HIGHBD_TAIL_SUFFIX /* nothing */ #define bitdepth_from_max(x) 8 #define BITDEPTH_MAX 0xff #elif BITDEPTH == 16 typedef uint16_t pixel; typedef int32_t coef; #define PIXEL_TYPE uint16_t #define COEF_TYPE int32_t #define pixel_copy(a, b, c) memcpy(a, b, (c) << 1) static inline void pixel_set(pixel *const dst, const int val, const int num) { for (int n = 0; n < num; n++) dst[n] = val; } #define PIX_HEX_FMT "%03x" #define iclip_pixel(x) iclip(x, 0, bitdepth_max) #define HIGHBD_DECL_SUFFIX , const int bitdepth_max #define HIGHBD_CALL_SUFFIX , f->bitdepth_max #define HIGHBD_TAIL_SUFFIX , bitdepth_max #define bitdepth_from_max(bitdepth_max) (32 - clz(bitdepth_max)) #define BITDEPTH_MAX bitdepth_max #define bitfn(x) x##_16bpc #define BF(x, suffix) x##_16bpc_##suffix static inline ptrdiff_t PXSTRIDE(const ptrdiff_t x) { assert(!(x & 1)); return x >> 1; } #define highbd_only(x) x #else #error invalid value for bitdepth #endif #define bytefn(x) bitfn(x) #define bitfn_decls(name, ...) 
\ name##_8bpc(__VA_ARGS__); \ name##_16bpc(__VA_ARGS__) #endif /* DAV2D_COMMON_BITDEPTH_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/dump.h000066400000000000000000000057531517466257200241070ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_COMMON_DUMP_H #define DAV2D_COMMON_DUMP_H #include #include #include #include "common/bitdepth.h" static inline void append_plane_to_file(const pixel *buf, ptrdiff_t stride, int w, int h, const char *const file) { FILE *const f = fopen(file, "ab"); while (h--) { fwrite(buf, w * sizeof(pixel), 1, f); buf += PXSTRIDE(stride); } fclose(f); } static inline void hex_fdump(FILE *out, const pixel *buf, ptrdiff_t stride, int w, int h, const char *what) { fprintf(out, "%s\n", what); while (h--) { int x; for (x = 0; x < w; x++) fprintf(out, " " PIX_HEX_FMT, buf[x]); buf += PXSTRIDE(stride); fprintf(out, "\n"); } } static inline void hex_dump(const pixel *buf, ptrdiff_t stride, int w, int h, const char *what) { hex_fdump(stdout, buf, stride, w, h, what); } static inline void coef_dump(const coef *buf, const int w, const int h, const int len, const char *what) { int y; printf("%s\n", what); for (y = 0; y < h; y++) { int x; for (x = 0; x < w; x++) printf(" %*d", len, buf[x]); buf += w; printf("\n"); } } static inline void ac_dump(const int16_t *buf, int w, int h, const char *what) { printf("%s\n", what); while (h--) { for (int x = 0; x < w; x++) printf(" %03d", buf[x]); buf += w; printf("\n"); } } #endif /* DAV2D_COMMON_DUMP_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/frame.h000066400000000000000000000035501517466257200242250ustar00rootroot00000000000000/* * Copyright © 2021-2026, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_COMMON_FRAME_H #define DAV2D_COMMON_FRAME_H /* * Checks whether Dav2dFrameType == INTER || == SWITCH * Both are defined as odd numbers {1, 3} and therefore have the LSB set. * See also: AV1 spec 6.8.2 */ #define IS_INTER_OR_SWITCH(frame_header) \ ((frame_header)->frame_type & 1) /* * Checks whether Dav2dFrameType == KEY || == INTRA * See also: AV1 spec 6.8.2 */ #define IS_KEY_OR_INTRA(frame_header) \ (!IS_INTER_OR_SWITCH(frame_header)) #endif /* DAV2D_COMMON_FRAME_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/intops.h000066400000000000000000000105671517466257200244550ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_COMMON_INTOPS_H #define DAV2D_COMMON_INTOPS_H #include #include "common/attributes.h" static inline int imax(const int a, const int b) { return a > b ? a : b; } static inline int imin(const int a, const int b) { return a < b ? a : b; } static inline unsigned umax(const unsigned a, const unsigned b) { return a > b ? a : b; } static inline unsigned umin(const unsigned a, const unsigned b) { return a < b ? a : b; } static inline int iclip64to32(int64_t v, const int min, const int max) { return v < min ? min : v > max ? max : (int) v; } static inline int iclip(const int v, const int min, const int max) { return v < min ? min : v > max ? max : v; } static inline int iclip_u8(const int v) { return iclip(v, 0, 255); } static inline int apply_sign(const int v, const int s) { return s < 0 ? -v : v; } static inline int apply_sign64(const int64_t v, const int64_t s) { return s < 0 ? 
-(int)v : (int)v; } static inline int ulog2(const unsigned v) { return 31 ^ clz(v); } static inline int u64log2(const uint64_t v) { return 63 ^ clzll(v); } static inline unsigned inv_recenter(const unsigned r, const unsigned v) { if (v > (r << 1)) return v; else if ((v & 1) == 0) return (v >> 1) + r; else return r - ((v + 1) >> 1); } static inline unsigned popcnt(const unsigned x) { #ifdef _MSC_VER #if defined(_M_ARM64) || defined(__AVX__) // On x86/x64, this intrinsic produces a raw "popcnt" instruction, which // only works if running on SSE4.2 or newer, and would require us to // do runtime checking for the feature. // // Unconditionally use it, if we allow the compiler to use AVX anywhere // (if built with /arch:AVX). Also use it on ARM64, where the corresponding // instructions always are available. return __popcnt(x); #else static const uint8_t table[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, }; return table[x & 0xff] + table[(x >> 8) & 0xff] + table[(x >> 16) & 0xff] + table[(x >> 24) & 0xff]; #endif #else return __builtin_popcount(x); #endif } #endif /* DAV2D_COMMON_INTOPS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/common/validate.h000066400000000000000000000044011517466257200247200ustar00rootroot00000000000000/* * 
Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_COMMON_VALIDATE_H #define DAV2D_COMMON_VALIDATE_H #include #include #if defined(NDEBUG) #define debug_print(...) do {} while (0) #define debug_abort() do {} while (0) #else #define debug_print(...) fprintf(stderr, __VA_ARGS__) #define debug_abort abort #endif #define validate_input_or_ret_with_msg(x, r, ...) 
\ if (!(x)) { \ debug_print("Input validation check \'%s\' failed in %s!\n", \ #x, __func__); \ debug_print(__VA_ARGS__); \ debug_abort(); \ return r; \ } #define validate_input_or_ret(x, r) \ if (!(x)) { \ debug_print("Input validation check \'%s\' failed in %s!\n", \ #x, __func__); \ debug_abort(); \ return r; \ } #define validate_input(x) validate_input_or_ret(x, ) #endif /* DAV2D_COMMON_VALIDATE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/000077500000000000000000000000001517466257200227525ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/gcc/000077500000000000000000000000001517466257200235065ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/gcc/stdatomic.h000066400000000000000000000047371517466257200256610ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef GCCVER_STDATOMIC_H_ #define GCCVER_STDATOMIC_H_ #if !defined(__cplusplus) typedef int atomic_int; typedef unsigned int atomic_uint; #define memory_order_relaxed __ATOMIC_RELAXED #define memory_order_acquire __ATOMIC_ACQUIRE #define atomic_init(p_a, v) do { *(p_a) = (v); } while(0) #define atomic_store(p_a, v) __atomic_store_n(p_a, v, __ATOMIC_SEQ_CST) #define atomic_load(p_a) __atomic_load_n(p_a, __ATOMIC_SEQ_CST) #define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo) #define atomic_fetch_add(p_a, inc) __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST) #define atomic_fetch_add_explicit(p_a, inc, mo) __atomic_fetch_add(p_a, inc, mo) #define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST) #define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST) #define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST) #define atomic_compare_exchange_strong(p_a, expected, desired) __atomic_compare_exchange_n(p_a, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) #endif /* !defined(__cplusplus) */ #endif /* GCCVER_STDATOMIC_H_ */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/getopt.h000066400000000000000000000060311517466257200244250ustar00rootroot00000000000000#ifndef __GETOPT_H__ /** * DISCLAIMER * This file has no copyright assigned and is placed in the Public Domain. * This file is part of the mingw-w64 runtime package. 
* * The mingw-w64 runtime package and its code is distributed in the hope that it * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #define __GETOPT_H__ /* All the headers include this file. */ #ifdef _WIN32 #include #endif #ifdef __cplusplus extern "C" { #endif extern int optind; /* index of first non-option in argv */ extern int optopt; /* single option character, as parsed */ extern int opterr; /* flag to enable built-in diagnostics... */ /* (user may set to zero, to suppress) */ extern char *optarg; /* pointer to argument of current option */ extern int getopt(int nargc, char * const *nargv, const char *options); #ifdef _BSD_SOURCE /* * BSD adds the non-standard `optreset' feature, for reinitialisation * of `getopt' parsing. We support this feature, for applications which * proclaim their BSD heritage, before including this header; however, * to maintain portability, developers are advised to avoid it. */ # define optreset __mingw_optreset extern int optreset; #endif #ifdef __cplusplus } #endif /* * POSIX requires the `getopt' API to be specified in `unistd.h'; * thus, `unistd.h' includes this header. However, we do not want * to expose the `getopt_long' or `getopt_long_only' APIs, when * included in this manner. Thus, close the standard __GETOPT_H__ * declarations block, and open an additional __GETOPT_LONG_H__ * specific block, only when *not* __UNISTD_H_SOURCED__, in which * to declare the extended API. */ #if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) #define __GETOPT_LONG_H__ #ifdef __cplusplus extern "C" { #endif struct option /* specification for a long form option... */ { const char *name; /* option name, without leading hyphens */ int has_arg; /* does it take an argument? 
*/ int *flag; /* where to save its status, or NULL */ int val; /* its associated status value */ }; enum /* permitted values for its `has_arg' field... */ { no_argument = 0, /* option never takes an argument */ required_argument, /* option always requires an argument */ optional_argument /* option may take an argument */ }; extern int getopt_long(int nargc, char * const *nargv, const char *options, const struct option *long_options, int *idx); extern int getopt_long_only(int nargc, char * const *nargv, const char *options, const struct option *long_options, int *idx); /* * Previous MinGW implementation had... */ #ifndef HAVE_DECL_GETOPT /* * ...for the long form API only; keep this for compatibility. */ # define HAVE_DECL_GETOPT 1 #endif #ifdef __cplusplus } #endif #endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ #endif /* !defined(__GETOPT_H__) */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/msvc/000077500000000000000000000000001517466257200237225ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/compat/msvc/stdatomic.h000066400000000000000000000062451517466257200260710ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MSCVER_STDATOMIC_H_ #define MSCVER_STDATOMIC_H_ #if !defined(__cplusplus) && defined(_MSC_VER) #pragma warning(push) #pragma warning(disable:4067) /* newline for __has_include_next */ #if defined(__clang__) && __has_include_next() /* use the clang stdatomic.h with clang-cl*/ # include_next #else /* ! 
stdatomic.h */ #include #include "common/attributes.h" typedef volatile LONG atomic_int; typedef volatile ULONG atomic_uint; typedef enum { memory_order_relaxed, memory_order_acquire } msvc_atomic_memory_order; #define atomic_init(p_a, v) do { *(p_a) = (v); } while(0) #define atomic_store(p_a, v) InterlockedExchange((LONG*)p_a, v) #define atomic_load(p_a) InterlockedCompareExchange((LONG*)p_a, 0, 0) #define atomic_exchange(p_a, v) InterlockedExchange(p_a, v) #define atomic_load_explicit(p_a, mo) atomic_load(p_a) static inline int atomic_compare_exchange_strong_int(LONG *obj, LONG *expected, LONG desired) { LONG orig = *expected; *expected = InterlockedCompareExchange(obj, desired, orig); return *expected == orig; } #define atomic_compare_exchange_strong(p_a, expected, desired) atomic_compare_exchange_strong_int((LONG *)p_a, (LONG *)expected, (LONG)desired) /* * TODO use a special call to increment/decrement * using InterlockedIncrement/InterlockedDecrement */ #define atomic_fetch_add(p_a, inc) InterlockedExchangeAdd(p_a, inc) #define atomic_fetch_sub(p_a, dec) InterlockedExchangeAdd(p_a, -(dec)) #define atomic_fetch_or(p_a, v) InterlockedOr(p_a, v) #define atomic_fetch_add_explicit(p_a, inc, mo) atomic_fetch_add(p_a, inc) #endif /* ! stdatomic.h */ #pragma warning(pop) #endif /* !defined(__cplusplus) && defined(_MSC_VER) */ #endif /* MSCVER_STDATOMIC_H_ */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/000077500000000000000000000000001517466257200224675ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/common.h000066400000000000000000000065031517466257200241340ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_COMMON_H #define DAV2D_COMMON_H #include #include #include #ifdef __cplusplus extern "C" { #endif #ifndef DAV2D_API #if defined _WIN32 #if defined DAV2D_BUILDING_DLL #define DAV2D_API __declspec(dllexport) #else #define DAV2D_API #endif #elif defined __OS2__ #define DAV2D_API __declspec(dllexport) #else #if __GNUC__ >= 4 #define DAV2D_API __attribute__ ((visibility ("default"))) #else #define DAV2D_API #endif #endif #endif #if EPERM > 0 #define DAV2D_ERR(e) (-(e)) ///< Negate POSIX error code. #else #define DAV2D_ERR(e) (e) #endif #define DAV2D_EOF -('E' | ('O' << 8) | ('F' << 16)) /** * A reference-counted object wrapper for a user-configurable pointer. 
*/ typedef struct Dav2dUserData { const uint8_t *data; ///< data pointer struct Dav2dRef *ref; ///< allocation origin } Dav2dUserData; /** * Input packet metadata which are copied from the input data used to * decode each image into the matching structure of the output image * returned back to the user. Since these are metadata fields, they * can be used for other purposes than the documented ones, they will * still be passed from input data to output picture without being * used internally. */ typedef struct Dav2dDataProps { int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default) int64_t duration; ///< container duration of input data, 0 if unknown (default) int64_t offset; ///< stream offset of input data, -1 if unknown (default) size_t size; ///< packet size, default Dav2dData.sz struct Dav2dUserData user_data; ///< user-configurable data, default NULL members } Dav2dDataProps; /** * Release reference to a Dav2dDataProps. */ DAV2D_API void dav2d_data_props_unref(Dav2dDataProps *props); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_COMMON_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/data.h000066400000000000000000000106221517466257200235520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_DATA_H #define DAV2D_DATA_H #include #include #include "common.h" #ifdef __cplusplus extern "C" { #endif typedef struct Dav2dData { const uint8_t *data; ///< data pointer size_t sz; ///< data size struct Dav2dRef *ref; ///< allocation origin Dav2dDataProps m; ///< user provided metadata passed to the output picture } Dav2dData; /** * Allocate data. * * @param data Input context. * @param sz Size of the data that should be allocated. * * @return Pointer to the allocated buffer on success. NULL on error. */ DAV2D_API uint8_t * dav2d_data_create(Dav2dData *data, size_t sz); /** * Wrap an existing data array. * * @param data Input context. * @param buf The data to be wrapped. * @param sz Size of the data. * @param free_callback Function to be called when we release our last * reference to this data. In this callback, $buf will be * the $buf argument to this function, and $cookie will * be the $cookie input argument to this function. * @param cookie Opaque parameter passed to free_callback(). * * @return 0 on success. A negative DAV2D_ERR value on error. 
*/ DAV2D_API int dav2d_data_wrap(Dav2dData *data, const uint8_t *buf, size_t sz, void (*free_callback)(const uint8_t *buf, void *cookie), void *cookie); /** * Wrap a user-provided data pointer into a reference counted object. * * data->m.user_data field will initialized to wrap the provided $user_data * pointer. * * $free_callback will be called on the same thread that released the last * reference. If frame threading is used, make sure $free_callback is * thread-safe. * * @param data Input context. * @param user_data The user data to be wrapped. * @param free_callback Function to be called when we release our last * reference to this data. In this callback, $user_data * will be the $user_data argument to this function, and * $cookie will be the $cookie input argument to this * function. * @param cookie Opaque parameter passed to $free_callback. * * @return 0 on success. A negative DAV2D_ERR value on error. */ DAV2D_API int dav2d_data_wrap_user_data(Dav2dData *data, const uint8_t *user_data, void (*free_callback)(const uint8_t *user_data, void *cookie), void *cookie); /** * Free the data reference. * * The reference count for data->m.user_data will be decremented (if it has been * initialized with dav2d_data_wrap_user_data). The $data object will be memset * to 0. * * @param data Input context. */ DAV2D_API void dav2d_data_unref(Dav2dData *data); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_DATA_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/dav2d.h000066400000000000000000000316011517466257200236410ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. 
* * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_H #define DAV2D_H #include #include #include "common.h" #include "picture.h" #include "data.h" #include "version.h" #ifdef __cplusplus extern "C" { #endif typedef struct Dav2dContext Dav2dContext; typedef struct Dav2dRef Dav2dRef; #define DAV2D_MAX_THREADS 256 #define DAV2D_MAX_FRAME_DELAY 256 typedef struct Dav2dLogger { void *cookie; ///< Custom data to pass to the callback. /** * Logger callback. May be NULL to disable logging. * * @param cookie Custom pointer passed to all calls. * @param format The vprintf compatible format string. * @param ap List of arguments referenced by the format string. 
*/ void (*callback)(void *cookie, const char *format, va_list ap); } Dav2dLogger; enum Dav2dInloopFilterType { DAV2D_INLOOPFILTER_DEBLOCK = 1 << 0, DAV2D_INLOOPFILTER_CDEF = 1 << 1, DAV2D_INLOOPFILTER_CCSO = 1 << 2, DAV2D_INLOOPFILTER_WIENER = 1 << 3, DAV2D_INLOOPFILTER_GDF = 1 << 4, DAV2D_INLOOPFILTER_ALL = DAV2D_INLOOPFILTER_DEBLOCK | DAV2D_INLOOPFILTER_CDEF | DAV2D_INLOOPFILTER_CCSO | DAV2D_INLOOPFILTER_WIENER | DAV2D_INLOOPFILTER_GDF, }; enum Dav2dDecodeFrameType { DAV2D_DECODEFRAMETYPE_ALL = 0, ///< decode and return all frames DAV2D_DECODEFRAMETYPE_REFERENCE = 1,///< decode and return frames referenced by other frames only DAV2D_DECODEFRAMETYPE_INTRA = 2, ///< decode and return intra frames only (includes keyframes) DAV2D_DECODEFRAMETYPE_KEY = 3, ///< decode and return keyframes only }; typedef struct Dav2dSettings { int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0) int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0) int apply_grain; ///< whether to apply film grain on output frames (default 1) int operating_point; ///< select an operating point for scalable AV2 bitstreams (0 - 31, default 0) int all_layers; ///< output all spatial layers of a scalable AV2 bitstream (default 1) unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited, default 0) Dav2dPicAllocator allocator; ///< Picture allocator callback. Dav2dLogger logger; ///< Logger callback. int strict_std_compliance; ///< whether to abort decoding on standard compliance violations ///< that don't affect actual bitstream decoding (e.g. inconsistent ///< or invalid metadata, default 0) int output_invisible_frames; ///< output invisibly coded frames (in coding order) in addition ///< to all visible frames.
Because of show-existing-frame, this ///< means some frames may appear twice (once when coded, ///< once when shown, default 0) enum Dav2dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default ///< DAV2D_INLOOPFILTER_ALL) enum Dav2dDecodeFrameType decode_frame_type; ///< frame types to decode (default ///< DAV2D_DECODEFRAMETYPE_ALL) uint8_t reserved[16]; ///< reserved for future use } Dav2dSettings; /** * Get library version. */ DAV2D_API const char *dav2d_version(void); /** * Get library API version. * * @return A value in the format 0x00XXYYZZ, where XX is the major version, * YY the minor version, and ZZ the patch version. * @see DAV2D_API_MAJOR, DAV2D_API_MINOR, DAV2D_API_PATCH */ DAV2D_API unsigned dav2d_version_api(void); /** * Initialize settings to default values. * * @param s Input settings context. */ DAV2D_API void dav2d_default_settings(Dav2dSettings *s); /** * Allocate and open a decoder instance. * * @param c_out The decoder instance to open. *c_out will be set to the * allocated context. * @param s Input settings context. * * @note The context must be freed using dav2d_close() when decoding is * finished. * * @return 0 on success, or < 0 (a negative DAV2D_ERR code) on error. */ DAV2D_API int dav2d_open(Dav2dContext **c_out, const Dav2dSettings *s); /** * Parse a Sequence Header OBU from bitstream data. * * @param out Output Sequence Header. * @param buf The data to be parsed. * @param sz Size of the data. * * @return * 0: Success, and out is filled with the parsed Sequence Header * OBU parameters. * DAV2D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer. * Other negative DAV2D_ERR codes: Invalid data in the buffer, invalid passed-in * arguments, and other errors during parsing. * * @note It is safe to feed this function data containing other OBUs than a * Sequence Header, as they will simply be ignored. If there is more than * one Sequence Header OBU present, only the last will be returned.
*/ DAV2D_API int dav2d_parse_sequence_header(Dav2dSequenceHeader *out, const uint8_t *buf, const size_t sz); /** * Feed bitstream data to the decoder, in the form of one or multiple AV2 * Open Bitstream Units (OBUs). * * @param c Input decoder instance. * @param in Input bitstream data. On success, ownership of the reference is * passed to the library. May be NULL, in which case the decoder will * enter draining mode. * * @return * 0: Success, and the data was consumed. * DAV2D_ERR(EAGAIN): The data can't be consumed. dav2d_get_picture() should * be called to get one or more frames before the function * can consume new data. * DAV2D_EOF: The decoder is in draining mode. No further data can be * consumed. Use dav2d_flush() to force out of draining * mode, or call dav2d_get_picture() until it returns DAV2D_EOF. * Other negative DAV2D_ERR codes: Error during decoding or because of invalid * passed-in arguments. The reference remains * owned by the caller. */ DAV2D_API int dav2d_send_data(Dav2dContext *c, Dav2dData *in); /** * Return a decoded picture. * * @param c Input decoder instance. * @param out Output frame. The caller assumes ownership of the returned * reference. * * @return * 0: Success, and a frame is returned. * DAV2D_ERR(EAGAIN): Not enough data to output a frame. dav2d_send_data() * should be called with new input. * DAV2D_EOF: The decoder has been drained. No more frames will be * output. * * Other negative DAV2D_ERR codes: Error during decoding or because of invalid * passed-in arguments. * * @code{.c} * Dav2dData data = { 0 }; * Dav2dPicture p = { 0 }; * int res; * * read_data(&data); * do { * res = dav2d_send_data(c, &data); * // Given we fetch all available frames in the loop below, * // DAV2D_ERR(EAGAIN) should not happen.
* assert(res != DAV2D_ERR(EAGAIN)); * if (res < 0) * free_and_abort(); * for (;;) { * res = dav2d_get_picture(c, &p); * if (res < 0) { * if (res != DAV2D_ERR(EAGAIN)) * free_and_abort(); * break; * } else * output_and_unref_picture(&p); * // Stay in the loop until no more frames are available. * } * // Stay in the loop as long as there's data to consume. * } while (data.sz || read_data(&data) == SUCCESS); * * // Handle EOS by draining all buffered frames. * dav2d_send_data(c, NULL); * do { * res = dav2d_get_picture(c, &p); * if (res < 0) { * if (res != DAV2D_EOF) * free_and_abort(); * } else * output_and_unref_picture(&p); * } while (res == 0); * @endcode */ DAV2D_API int dav2d_get_picture(Dav2dContext *c, Dav2dPicture *out); /** * Apply film grain to a previously decoded picture. If the picture contains no * film grain metadata, then this function merely returns a new reference. * * @param c Input decoder instance. * @param out Output frame. The caller assumes ownership of the returned * reference. * @param in Input frame. No ownership is transferred. * * @return * 0: Success, and a frame is returned. * Other negative DAV2D_ERR codes: Error due to lack of memory or because of * invalid passed-in arguments. * * @note If `Dav2dSettings.apply_grain` is true, film grain was already applied * by `dav2d_get_picture`, and so calling this function leads to double * application of film grain. Users should only call this when needed. */ DAV2D_API int dav2d_apply_grain(Dav2dContext *c, Dav2dPicture *out, const Dav2dPicture *in); /** * Close a decoder instance and free all associated memory. * * @param c_out The decoder instance to close. *c_out will be set to NULL. */ DAV2D_API void dav2d_close(Dav2dContext **c_out); /** * Flush all delayed frames in decoder and clear internal decoder state, * to be used when seeking. * * @param c Input decoder instance. * * @note Decoding will start only after a valid sequence header OBU is * delivered to dav2d_send_data().
* */ DAV2D_API void dav2d_flush(Dav2dContext *c); enum Dav2dEventFlags { /** * The last returned picture contains a reference to a new Sequence Header, * either because it's the start of a new coded sequence, or the decoder was * flushed before it was generated. */ DAV2D_EVENT_FLAG_NEW_SEQUENCE = 1 << 0, /** * The last returned picture contains a reference to a Sequence Header with * new operating parameters information for the current coded sequence. */ DAV2D_EVENT_FLAG_NEW_OP_PARAMS_INFO = 1 << 1, }; /** * Fetch a combination of DAV2D_EVENT_FLAG_* event flags generated by the decoding * process. * * @param c Input decoder instance. * @param flags Where to write the flags. * * @return 0 on success, or < 0 (a negative DAV2D_ERR code) on error. * * @note Calling this function will clear all the event flags currently stored in * the decoder. * */ DAV2D_API int dav2d_get_event_flags(Dav2dContext *c, enum Dav2dEventFlags *flags); /** * Retrieve the user-provided metadata associated with the input data packet * for the last decoding error reported to the user, i.e. a negative return * value (not EAGAIN) from dav2d_send_data() or dav2d_get_picture(). * * @param c Input decoder instance. * @param out Output Dav2dDataProps. On success, the caller assumes ownership of * the returned reference. * * @return 0 on success, or < 0 (a negative DAV2D_ERR code) on error. */ DAV2D_API int dav2d_get_decode_error_data_props(Dav2dContext *c, Dav2dDataProps *out); /** * Get the decoder delay, which is the number of internally buffered frames, not * including reference frames. * This value is guaranteed to be >= 1 and <= max_frame_delay. * * @param s Input settings context. * * @return Decoder frame delay on success, or < 0 (a negative DAV2D_ERR code) on * error. * * @note The returned delay is valid only for a Dav2dContext initialized with the * provided Dav2dSettings. 
*/ DAV2D_API int dav2d_get_frame_delay(const Dav2dSettings *s); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/headers.h000066400000000000000000000435231517466257200242620ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_HEADERS_H #define DAV2D_HEADERS_H #include #include #ifdef __cplusplus extern "C" { #endif // Constants from Section 3. 
"Symbols and abbreviated terms" #define DAV2D_MAX_CDEF_STRENGTHS 8 #define DAV2D_MAX_OPERATING_POINTS 64 #define DAV2D_MAX_TILE_COLS 64 #define DAV2D_MAX_TILE_ROWS 64 #define DAV2D_MAX_SEGMENTS 16 #define DAV2D_NUM_REF_FRAMES 8 #define DAV2D_PRIMARY_REF_NONE 7 #define DAV2D_REFS_PER_FRAME 7 #define DAV2D_TOTAL_REFS_PER_FRAME (DAV2D_REFS_PER_FRAME + 1) enum Dav2dObuType { DAV2D_OBU_SEQ_HDR = 1, DAV2D_OBU_TD = 2, DAV2D_OBU_MULTI_FRAME_HDR = 3, DAV2D_OBU_CLOSED_LOOP_KF = 4, DAV2D_OBU_OPEN_LOOP_KF = 5, DAV2D_OBU_LEADING_TILE_GRP = 6, DAV2D_OBU_TILE_GRP = 7, DAV2D_OBU_METADATA = 8, DAV2D_OBU_METADATA_GRP = 9, DAV2D_OBU_SWITCH = 10, DAV2D_OBU_LEADING_SEF = 11, DAV2D_OBU_SEF = 12, DAV2D_OBU_LEADING_TIP = 13, DAV2D_OBU_TIP = 14, DAV2D_OBU_BUF_RM_TIMING = 15, DAV2D_OBU_LAYER_CFG_REC = 16, DAV2D_OBU_ATLAS_SEG = 17, DAV2D_OBU_OP_PT_SET = 18, DAV2D_OBU_BRIDGE = 19, DAV2D_OBU_MSDO = 20, DAV2D_OBU_RAS = 21, DAV2D_OBU_QM = 22, DAV2D_OBU_FGM = 23, DAV2D_OBU_CONTENT_INTERP = 24, DAV2D_OBU_PADDING = 25, }; enum Dav2dTxfmMode { DAV2D_TX_4X4_ONLY, DAV2D_TX_LARGEST, DAV2D_TX_SWITCHABLE, DAV2D_N_TX_MODES, }; enum Dav2dFilterMode { DAV2D_FILTER_8TAP_REGULAR, DAV2D_FILTER_8TAP_SMOOTH, DAV2D_FILTER_8TAP_SHARP, DAV2D_N_SWITCHABLE_FILTERS, DAV2D_FILTER_BILINEAR = DAV2D_N_SWITCHABLE_FILTERS, DAV2D_N_FILTERS, DAV2D_FILTER_SWITCHABLE = DAV2D_N_FILTERS, }; enum Dav2dAdaptiveBoolean { DAV2D_OFF = 0, DAV2D_ON = 1, DAV2D_ADAPTIVE = 2, }; enum Dav2dRestorationType { DAV2D_RESTORATION_NONE, DAV2D_RESTORATION_PC_WIENER, DAV2D_RESTORATION_NS_WIENER, DAV2D_RESTORATION_SWITCHABLE, }; enum Dav2dWarpedMotionType { DAV2D_WM_TYPE_INVALID = -1, DAV2D_WM_TYPE_IDENTITY, DAV2D_WM_TYPE_TRANSLATION, DAV2D_WM_TYPE_ROT_ZOOM, DAV2D_WM_TYPE_AFFINE, }; typedef struct Dav2dWarpedMotionParams { enum Dav2dWarpedMotionType type; int32_t matrix[6]; union { struct { int16_t alpha, beta, gamma, delta; } p; int16_t abcd[4]; } u; int affine; } Dav2dWarpedMotionParams; enum Dav2dPixelLayout { DAV2D_PIXEL_LAYOUT_I400, ///< 
monochrome DAV2D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar DAV2D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar DAV2D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar }; enum Dav2dFrameType { DAV2D_FRAME_TYPE_KEY = 0, ///< Key Intra frame DAV2D_FRAME_TYPE_INTER = 1, ///< Inter frame DAV2D_FRAME_TYPE_INTRA = 2, ///< Non key Intra frame DAV2D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame }; enum Dav2dColorDescription { DAV2D_COLOR_DESC_EXPLICIT = 0, // Explicitly signaled DAV2D_COLOR_DESC_BT709SDR = 1, // CP=1, TC=1, MC=5 DAV2D_COLOR_DESC_BT2100PQ = 2, // CP=9, TC=16, MC=9 DAV2D_COLOR_DESC_BT2100HLG = 3, // CP=9, TC=14, MC=9 DAV2D_COLOR_DESC_SRGB = 4, // CP=1, TC=13, MC=0 DAV2D_COLOR_DESC_SRGBSYCC = 5, // CP=1, TC=13, MC=5 }; enum Dav2dColorPrimaries { DAV2D_COLOR_PRI_BT709 = 1, DAV2D_COLOR_PRI_UNKNOWN = 2, DAV2D_COLOR_PRI_BT470M = 4, DAV2D_COLOR_PRI_BT470BG = 5, DAV2D_COLOR_PRI_BT601 = 6, DAV2D_COLOR_PRI_SMPTE240 = 7, DAV2D_COLOR_PRI_FILM = 8, DAV2D_COLOR_PRI_BT2020 = 9, DAV2D_COLOR_PRI_XYZ = 10, DAV2D_COLOR_PRI_SMPTE431 = 11, DAV2D_COLOR_PRI_SMPTE432 = 12, DAV2D_COLOR_PRI_EBU3213 = 22, DAV2D_COLOR_PRI_RESERVED = 255, }; enum Dav2dTransferCharacteristics { DAV2D_TRC_BT709 = 1, DAV2D_TRC_UNKNOWN = 2, DAV2D_TRC_BT470M = 4, DAV2D_TRC_BT470BG = 5, DAV2D_TRC_BT601 = 6, DAV2D_TRC_SMPTE240 = 7, DAV2D_TRC_LINEAR = 8, DAV2D_TRC_LOG100 = 9, ///< logarithmic (100:1 range) DAV2D_TRC_LOG100_SQRT10 = 10, ///< logarithmic (100*sqrt(10):1 range) DAV2D_TRC_IEC61966 = 11, DAV2D_TRC_BT1361 = 12, DAV2D_TRC_SRGB = 13, DAV2D_TRC_BT2020_10BIT = 14, DAV2D_TRC_BT2020_12BIT = 15, DAV2D_TRC_SMPTE2084 = 16, ///< PQ DAV2D_TRC_SMPTE428 = 17, DAV2D_TRC_HLG = 18, ///< hybrid log/gamma (BT.2100 / ARIB STD-B67) DAV2D_TRC_RESERVED = 255, }; enum Dav2dMatrixCoefficients { DAV2D_MC_IDENTITY = 0, DAV2D_MC_BT709 = 1, DAV2D_MC_UNKNOWN = 2, DAV2D_MC_FCC = 4, DAV2D_MC_BT470BG = 5, DAV2D_MC_BT601 = 6, DAV2D_MC_SMPTE240 = 7, DAV2D_MC_SMPTE_YCGCO = 8, DAV2D_MC_BT2020_NCL = 9, DAV2D_MC_BT2020_CL = 10, DAV2D_MC_SMPTE2085 = 11,
DAV2D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived DAV2D_MC_CHROMAT_CL = 13, DAV2D_MC_ICTCP = 14, DAV2D_MC_IPT_C2 = 15, DAV2D_MC_YCGCO_RE = 16, DAV2D_MC_YCGCO_RO = 17, DAV2D_MC_RESERVED = 255, }; enum Dav2dChromaSamplePosition { DAV2D_CHR_LEFT = 0, DAV2D_CHR_CENTER = 1, DAV2D_CHR_TOPLEFT = 2, DAV2D_CHR_TOP = 3, DAV2D_CHR_BOTTOMLEFT = 4, DAV2D_CHR_BOTTOM = 5, DAV2D_CHR_UNKNOWN = 6, }; enum Dav2dAspectRatio { DAV2D_SAR_UNKNOWN = 0, DAV2D_SAR_1_1 = 1, DAV2D_SAR_12_11 = 2, DAV2D_SAR_10_11 = 3, DAV2D_SAR_16_11 = 4, DAV2D_SAR_40_33 = 5, DAV2D_SAR_24_11 = 6, DAV2D_SAR_20_11 = 7, DAV2D_SAR_32_11 = 8, DAV2D_SAR_80_33 = 9, DAV2D_SAR_18_11 = 10, DAV2D_SAR_15_11 = 11, DAV2D_SAR_64_33 = 12, DAV2D_SAR_160_99 = 13, DAV2D_SAR_4_3 = 14, DAV2D_SAR_3_2 = 15, DAV2D_SAR_2_1 = 16, DAV2D_SAR_EXPLICIT = 255, }; enum Dav2dScanType { DAV2D_SCAN_TYPE_UNKNOWN = 0, DAV2D_SCAN_TYPE_PROGRESSIVE = 1, DAV2D_SCAN_TYPE_INTERLACE = 2, DAV2D_SCAN_TYPE_INTERLACE_COMPLEMENTARY = 3, }; // Specifies the params related to the content in the sequence typedef struct Dav2dContentInterpretation { uint8_t /*enum Dav2dScanType*/ scan_type; uint8_t color_description_present; uint8_t chroma_sample_position_present; uint8_t aspect_ratio_info_present; uint8_t timing_info_present; uint8_t extension_present; uint8_t /*enum Dav2dChromaSamplePosition*/ chr[2]; struct { uint8_t /*enum Dav2dColorDescription*/ type; uint8_t /*enum Dav2dColorPrimaries*/ pri; uint8_t /*enum Dav2dTransferCharacteristics*/ trc; uint8_t /*enum Dav2dMatrixCoefficients*/ mtrx; uint8_t range; } color; struct { uint8_t /*enum Dav2dAspectRatio*/ type; uint32_t w, h; } sar; struct { uint32_t num_units_in_display_tick; uint32_t time_scale; uint8_t equal_elemental_interval; uint32_t num_ticks_per_elemental_duration; } timing; } Dav2dContentInterpretation; typedef struct Dav2dContentLightLevel { uint16_t max_content_light_level; uint16_t max_frame_average_light_level; } Dav2dContentLightLevel; typedef struct Dav2dMasteringDisplay { uint16_t 
primaries[3][2]; ///< 0.16 fixed point uint16_t white_point[2]; ///< 0.16 fixed point uint32_t max_luminance; ///< 24.8 fixed point uint32_t min_luminance; ///< 18.14 fixed point } Dav2dMasteringDisplay; typedef struct Dav2dITUTT35 { uint8_t country_code; uint8_t country_code_extension_byte; size_t payload_size; uint8_t *payload; } Dav2dITUTT35; typedef struct Dav2dSegmentationDataSet { int16_t delta_q[DAV2D_MAX_SEGMENTS]; uint16_t delta_q_mask, skip_mask, globalmv_mask; } Dav2dSegmentationDataSet; typedef struct Dav2dSequenceHeader { uint8_t id; /** * Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome; * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component, * or 12 bits/component at any chroma subsampling. */ uint8_t profile; uint8_t reduced_still_picture_header; uint8_t level; uint8_t tier; uint8_t /*enum Dav2dPixelLayout*/ layout; ///< format of the picture uint8_t ss_hor, ss_ver; /** * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes * between 8 (0) and 10-12 (1) bits/component, and another element * (twelve_bit) to distinguish between 10 and 12 bits/component. To get * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2. */ uint8_t hbd; uint8_t lcr_id; uint8_t still_picture; uint8_t max_tlayer_id, max_mlayer_id, monotonic; /** * Maximum dimensions for this stream. In non-scalable streams, these * are often the actual dimensions of the stream, although that is not * a normative requirement. 
*/ int max_width, max_height; uint8_t width_n_bits, height_n_bits; struct { uint8_t enabled; unsigned left, right, top, bottom; } crop; uint8_t max_display_model_info_present; uint8_t max_initial_display_delay; uint8_t decoder_model_info_present; uint8_t max_decoder_model_present; uint32_t num_units_in_decoding_tick; uint32_t max_decoder_buffer_delay; uint32_t max_encoder_buffer_delay; uint8_t max_low_delay_mode; uint8_t tlayer_dependency_present, mlayer_dependency_present; uint8_t tlayer_dependencies[8], mlayer_dependencies[8]; uint8_t sb128; // 2: 256x256, 1: 128x128, 0: 64x64 // partition flags uint8_t sdp, ext_sdp; uint8_t ext_partitions, uneven_4way_partitions; uint8_t max_pb_aspect_ratio_log2; // segmentation struct { uint8_t ext, info_present, adaptive; Dav2dSegmentationDataSet d; } segmentation; // intra tools uint8_t intra_dip, intra_edge_filter; uint8_t mrls, cfl, cfl_ds_filter_index, mhccp, ibp; // inter tools uint8_t motion_modes; // translation, inter-intra, warp [3x] uint8_t frame_motion_modes_present; uint8_t six_param_warp_delta; uint8_t masked_compound; uint8_t ref_frame_mvs; uint8_t reduced_ref_frame_mvs_mode; uint8_t order_hint_n_bits; uint8_t refmv_bank, drl_reorder; uint8_t explicit_ref_frame_map; uint8_t ref_frames, ref_frames_log2, number_of_bits_for_lt_frame_id; uint8_t def_max_drl_bits, allow_frame_max_drl_bits; uint8_t def_max_bvp_drl_bits, allow_max_bvp_drl_bits; uint8_t num_same_ref_comp; uint8_t tip, tip_hole_fill; uint8_t mv_traj, bawp, cwp, imp_msk_bld; uint8_t db_sub_pu, tip_explicit_qp; uint8_t opfl_refine, refine_mv, tip_refine_mv; uint8_t bru, adaptive_mvd, mvd_sign_derive, flex_mvres; uint8_t global_motion, short_refresh_frame_flags; // screen content flags uint8_t /*enum Dav2dAdaptiveBoolean*/ screen_content_tools; uint8_t /*enum Dav2dAdaptiveBoolean*/ force_integer_mv; // tx group tools uint8_t fsc, idtx_intra; uint8_t ist[2 /* intra, inter */]; uint8_t chroma_dctonly, inter_ddt, reduced_tx_part_set; uint8_t cctx; // coef flags 
uint8_t /*enum Dav2dAdaptiveBoolean*/ tcq; uint8_t parity_hiding; uint8_t avg_cdf, avg_cdf_type; // filtering flags uint8_t disable_loopfilters_across_tiles; uint8_t cdef; uint8_t gdf, gdf_unit_matches_sbsz; uint8_t restoration; uint8_t rst_disable_mask[2]; uint8_t ccso, ccso_unit_matches_sbsz; uint8_t /*enum Dav2dAdaptiveBoolean*/ cdef_on_skiptx; uint8_t df_par_bits; // quant tools uint8_t separate_uv_delta_q; uint8_t equal_ac_dc_q; int8_t base_ydc_dq, ydc_dq_enabled; uint8_t base_uvdc_dq, uvdc_dq_enabled; uint8_t base_uvac_dq, uvac_dq_enabled; struct { uint8_t /*enum Dav2dAdaptiveBoolean*/ present; struct Dav2dTileInfo { uint8_t uniform; uint8_t min_log2_cols, max_log2_cols, log2_cols, cols; uint8_t min_log2_rows, max_log2_rows, log2_rows, rows; uint16_t col_start_sb[DAV2D_MAX_TILE_COLS + 1]; uint16_t row_start_sb[DAV2D_MAX_TILE_ROWS + 1]; } t; } tiling; uint8_t film_grain_present; } Dav2dSequenceHeader; typedef struct Dav2dFilmGrainData { int chroma_scaling_from_luma; int num_points[3]; uint8_t points[3][14][2 /* value, scaling */]; int scaling_shift; int ar_coeff_lag; int8_t ar_coeffs[3][25 + 3 /* padding for alignment purposes */]; uint64_t ar_coeff_shift; int grain_scale_shift; int uv_mult[2]; int uv_luma_mult[2]; int uv_offset[2]; int overlap_flag; int clip_to_restricted_range; int mc_identity; int block_size; } Dav2dFilmGrainData; typedef struct Dav2dFrameHeader { uint8_t id; enum Dav2dFrameType frame_type; ///< type of the picture int width, height; uint8_t frame_offset; ///< frame number uint8_t tlayer_id, mlayer_id, xlayer_id; uint8_t show_existing_frame; int8_t existing_frame_idx; int8_t ltr_id; uint32_t frame_presentation_delay; uint8_t show_immediate; uint8_t show_implicit; uint8_t cross_frame_context; uint8_t disable_cdf_update; uint8_t allow_screen_content_tools; uint8_t force_integer_mv; uint8_t frame_size_override; uint8_t primary_ref_signaled, primary_ref_frame, secondary_ref_frame; uint8_t n_ref_frames; uint8_t refresh_frame_flags; uint8_t 
allow_intrabc, allow_global_intrabc, allow_local_intrabc; uint8_t max_bvp_drl_bits, max_drl_bits; int8_t refidx[DAV2D_REFS_PER_FRAME]; uint8_t has_future_refs, has_past_refs, has_bothside_refs; uint8_t mv_precision; // 0-3 for {f,h,q,e}pel enum Dav2dFilterMode subpel_filter_mode; uint8_t motion_modes; uint8_t use_ref_frame_mvs; uint8_t tmvp_sample_step; uint8_t opfl_refine_type; struct { uint8_t frame_mode; uint8_t hole_fill; uint8_t global_wtd_idx; uint8_t apply_filter; struct { int8_t y, x; } gmv; uint8_t subpel_filter; int8_t ref[2]; } tip; uint8_t sb128; // not literally coded, but derived from seqhdr/frame_type struct { struct Dav2dTileInfo t; uint8_t n_bytes; uint16_t update; } tiling; struct { uint16_t yac; int8_t ydc_delta; int8_t udc_delta, uac_delta, vdc_delta, vac_delta; struct { uint8_t enabled, num, y[4], u[4], v[4]; } qm; } quant; struct { uint8_t enabled, update_map, temporal; Dav2dSegmentationDataSet d; uint8_t preskip; int8_t last_active_segid; uint8_t lossless[DAV2D_MAX_SEGMENTS], qidx[DAV2D_MAX_SEGMENTS]; } segmentation; struct { struct { uint8_t present; uint8_t res_log2; } q; } delta; uint8_t all_lossless, any_lossless; uint8_t tcq, parity_hiding; struct { uint8_t sub_pu; uint8_t level_y[2 /* dir */]; uint8_t level_u, level_v; int8_t delta_q_y[2], delta_q_u, delta_q_v; } deblock; struct { enum Dav2dAdaptiveBoolean enabled; uint8_t qp_idx, scale; } gdf; struct { uint8_t enabled; uint8_t damping; uint8_t n_strengths; uint8_t on_skiptx; uint8_t y_strength[DAV2D_MAX_CDEF_STRENGTHS]; uint8_t uv_strength[DAV2D_MAX_CDEF_STRENGTHS]; } cdef; struct { struct { uint8_t /*enum Dav2dRestorationType*/ type; struct Dav2dNSWienerPlane { uint8_t frame_filters_on; uint8_t num_classes_idx, num_classes, temporal, refidx; int8_t filter[16][18]; } ns; } p[3 /* plane */]; uint8_t unit_size[2 /* y, uv */]; } restoration; struct { uint8_t enabled; struct { uint8_t enabled; uint8_t reuse, sb_reuse, refidx; uint8_t bo_only, scale_idx, quant_idx; uint8_t 
ext_filter_support, edge_clf, max_band_log2; uint8_t filter_off[64 /* nibbles. if bo_only { [band:128] } else { [d0:4][d1:4][band:8] } */]; } p[3]; } ccso; enum Dav2dTxfmMode txfm_mode; uint8_t switchable_comp_refs; uint8_t skip_mode_enabled; uint8_t bawp; uint8_t warp_motion; uint8_t reduced_txtp_set; struct { uint8_t ref; // index in our reference array uint8_t refref; // index in that reference's refrence array Dav2dWarpedMotionParams m[DAV2D_REFS_PER_FRAME]; } gmv; struct { uint8_t present; uint8_t id; unsigned seed; } film_grain; ///< film grain parameters } Dav2dFrameHeader; #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_HEADERS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/meson.build000066400000000000000000000030331517466257200246300ustar00rootroot00000000000000# Copyright © 2019, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dav2d_api_headers = [ 'common.h', 'data.h', 'dav2d.h', 'headers.h', 'picture.h', 'version.h', ] # install headers install_headers(dav2d_api_headers, subdir : 'dav2d') dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/picture.h000066400000000000000000000137761517466257200243310ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_PICTURE_H #define DAV2D_PICTURE_H #include #include #include "common.h" #include "headers.h" #ifdef __cplusplus extern "C" { #endif /* Number of bytes to align AND pad picture memory buffers by, so that SIMD * implementations can over-read by a few bytes, and use aligned read/write * instructions. */ #define DAV2D_PICTURE_ALIGNMENT 64 typedef struct Dav2dPictureParameters { int w; ///< width (in pixels) int h; ///< height (in pixels) enum Dav2dPixelLayout layout; ///< format of the picture int bpc; ///< bits per pixel component (8 or 10) } Dav2dPictureParameters; typedef struct Dav2dPicture { Dav2dSequenceHeader *seq_hdr; Dav2dFrameHeader *frame_hdr; /** * Pointers to planar image data (Y is [0], U is [1], V is [2]). The data * should be bytes (for 8 bpc) or words (for 10 bpc). In case of words * containing 10 bpc image data, the pixels should be located in the LSB * bits, so that values range between [0, 1023]; the upper bits should be * zero'ed out. */ void *data[3]; /** * Number of bytes between 2 lines in data[] for luma [0] or chroma [1]. 
*/ ptrdiff_t stride[2]; Dav2dPictureParameters p; Dav2dDataProps m; /** * High Dynamic Range Content Light Level metadata applying to this picture, * as defined in section 5.8.3 and 6.7.3 */ Dav2dContentLightLevel *content_light; /** * High Dynamic Range Mastering Display Color Volume metadata applying to * this picture, as defined in section 5.8.4 and 6.7.4 */ Dav2dMasteringDisplay *mastering_display; /** * Array of ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2 */ Dav2dITUTT35 *itut_t35; Dav2dFilmGrainData *fgm; Dav2dContentInterpretation *ci; /** * Number of ITU-T T35 metadata entries in the array */ size_t n_itut_t35; uintptr_t reserved[4]; ///< reserved for future use struct Dav2dRef *frame_hdr_ref; ///< Dav2dFrameHeader allocation origin struct Dav2dRef *seq_hdr_ref; ///< Dav2dSequenceHeader allocation origin struct Dav2dRef *content_light_ref; ///< Dav2dContentLightLevel allocation origin struct Dav2dRef *mastering_display_ref; ///< Dav2dMasteringDisplay allocation origin struct Dav2dRef *itut_t35_ref; ///< Dav2dITUTT35 allocation origin struct Dav2dRef *fgm_ref; struct Dav2dRef *ci_ref; uintptr_t reserved_ref[4]; ///< reserved for future use struct Dav2dRef *ref; ///< Frame data allocation origin void *allocator_data; ///< pointer managed by the allocator } Dav2dPicture; typedef struct Dav2dPicAllocator { void *cookie; ///< custom data to pass to the allocator callbacks. /** * Allocate the picture buffer based on the Dav2dPictureParameters. * * The data[0], data[1] and data[2] must be DAV2D_PICTURE_ALIGNMENT byte * aligned and with a pixel width/height multiple of 128 pixels. Any * allocated memory area should also be padded by DAV2D_PICTURE_ALIGNMENT * bytes. * data[1] and data[2] must share the same stride[1]. * * This function will be called on the main thread (the thread which calls * dav2d_get_picture()). * * @param pic The picture to allocate the buffer for. 
The callback needs to * fill the picture data[0], data[1], data[2], stride[0] and * stride[1]. * The allocator can fill the pic allocator_data pointer with * a custom pointer that will be passed to * release_picture_callback(). * @param cookie Custom pointer passed to all calls. * * @note No fields other than data, stride and allocator_data must be filled * by this callback. * @return 0 on success. A negative DAV2D_ERR value on error. */ int (*alloc_picture_callback)(Dav2dPicture *pic, void *cookie); /** * Release the picture buffer. * * If frame threading is used, this function may be called by the main * thread (the thread which calls dav2d_get_picture()) or any of the frame * threads and thus must be thread-safe. If frame threading is not used, * this function will only be called on the main thread. * * @param pic The picture that was filled by alloc_picture_callback(). * @param cookie Custom pointer passed to all calls. */ void (*release_picture_callback)(Dav2dPicture *pic, void *cookie); } Dav2dPicAllocator; /** * Release reference to a picture. */ DAV2D_API void dav2d_picture_unref(Dav2dPicture *p); #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_PICTURE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/dav2d/version.h000066400000000000000000000035421517466257200243310ustar00rootroot00000000000000/* * Copyright © 2019-2026, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_VERSION_H #define DAV2D_VERSION_H #ifdef __cplusplus extern "C" { #endif #define DAV2D_API_VERSION_MAJOR 1 #define DAV2D_API_VERSION_MINOR 0 #define DAV2D_API_VERSION_PATCH 0 /** * Extract version components from the value returned by * dav2d_version_int() */ #define DAV2D_API_MAJOR(v) (((v) >> 16) & 0xFF) #define DAV2D_API_MINOR(v) (((v) >> 8) & 0xFF) #define DAV2D_API_PATCH(v) (((v) >> 0) & 0xFF) #ifdef __cplusplus } /* extern "C" */ #endif #endif /* DAV2D_VERSION_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/meson.build000066400000000000000000000031521517466257200236320ustar00rootroot00000000000000# Copyright © 2018, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # Revision file (vcs_version.h) generation dav2d_git_dir = join_paths(dav2d_src_root, '.git') rev_target = vcs_tag(command: [ 'git', '--git-dir', dav2d_git_dir, 'describe', '--long', '--always' ], input: 'vcs_version.h.in', output: 'vcs_version.h' ) subdir('dav2d') dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/include/vcs_version.h.in000066400000000000000000000001041517466257200246000ustar00rootroot00000000000000/* auto-generated, do not edit */ #define DAV2D_VERSION "@VCS_TAG@" dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/000077500000000000000000000000001517466257200211235ustar00rootroot00000000000000avm-v14.1.0-bus.352x288.l1.partial_lossless.obu000066400000000000000000000122021517466257200306660ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media y$@(t3 gC:GJգAS.p)+5w?Nwy;z8=\Fx9S$~6t~y^0Cr=o7ɝGkg[ag&gNY1vy 8Zl56}5Yc2EAͯHЗ~uic ؅y4G@]=Fދv yq@B]@C.#,Cd8Eț'|lg".}d 7puf9%sʘv7JUE4Z#F'eޗiP*%4Cu5ne~1˔D F h\# ^>GzpvfBkDqt4Q2<Ƴ69W2)貍ߨc??Q4 ?3H^*ӗ+X!k)>Fr!M19k0]@t"ӗMݡw!HVİ#e#f,[ olC4u7˚(ŦZsuCQ`-$,`~n5Np͑l}u<+ 1_A!4& ѝJ ;u _nX%i_ڛ o7/I2?0+VR!EkF⸎G@ö9/"ȂYQuӘ8"CU,c(9pA/BFO1M0^I,XMzy4E3 =Cx=Fx?b†Vi.vliUᕚƲ ,!9#spq=mPt("yτX@r$pA|]a,]ȠBT8_Ƅۃ jw 
W|dLs4tb[==SŏT|la-ɚWQwH-WO!9 Woe>y,O^j]1Qd'>IwF;l?^v-7V3EpOX,ݵ wz䧋]D,7f1A=|,dbƐB34кXg z`K̃m?{임` 0djB{j'9ڞŵGp䊟N7>LWhXU1[@Җ3W_rܐftV=69FIe+ xu5c@7@a:ݴD#ڹ#7gIw=lw5 3ݰ2Y8|dI$`/sjnS_Z_!J纲!U( \ "W.٤ f[;9Of?~߲/NOU#B|f A 0/%sXXb?܆I/`-rȵrL}Ab+F>RmS=0N¦3<`s߾{1a#&nVPLz: AbrCٲ#yGIn";ܙڧzM rmhQˍ+DBV/cWi*ӞGv琩,!ʟէŌzx ]GbsB~:ZXw}ƪxxUoH_1ɨ1B]Q7:v*Ϝ'ۦ^ѹ}RG紒?\CnH[J/K&eDW#6k߳ tҜ/2{"XisdtD-Fkҝfx4:.4<OJaEq7ȈJ zoY_^PDc~io#PE#z83[[+QY#S&Vv|=Rv Ɖ=ͥVC~-D‚S9l zlAgz(R*}ZjBv-b.IT޶'wu @4n_PEki〲Bl]oR';淎ܕӧ!gvT @9Mۛp"p1>2>ywLnnNqTU@,ի1嚐"%}Ji"c>p"^OeҗSę.oE鈛TAӈbr̍ թtwvUsίS23X!Ԇo=;ʃS<"FDI'D&U,|Fu+I(#r@`kp6sl)Jq mW{/r4=豷C@ /`-~^ )ͮ9}jVhɝU/5TVT^,I[@ejQK(ȏEvx W<YAqSO(!W?%?//.q5m@:L@ݪNQw4Q'#n)9 w`}仺,z=MÚoX*8o$Pޏv!Č/zSRBqV*1<6K=0vv?d uhoGMO_X0ܼt򰔤bFtFR`o<ŰO*ۍwNk xR:ܔ൶>BMJoˬ6YׇͤOaqY_s5}6dUfwŨA:Q8X!ǔڝ0"5bψǼ9=tnht2IrZÈSU"h}>ݝY1>8Ohm 40WMAnY>) 6 \T˅31]@iB>P}mPG*X7,*oM,WEu9J|G{:` ]B}x ("8;?THmxT?9zY1,oO [y0hԍ<˲P뙇nucu`Á#̎(-2"i?)Ƕtux?ȿiX24^$`:_B:U_Kאu|_{osy6u;`bjH:/'#7HFC^9[d46$ 2^LDk=sJͳ古Ze*{)ot[g;!kyΐU®F|к.gz Q-%@WտB?Slq +5n 1]W'TGY8Hp iT2_>L[$8(w(0r%X=P{)U/Is9+X o֮BwMD;"$pv-\ArVTR)1o͗V6h[sYި>&ٌ|ĭg68*[*~MqafQq/'&'51Cg)ZZEItpPGZ`ujX9\Mhv꾻_O<,*Hcs*aVw=aP[>f K5/]UdQXMAUqSZlv c9J5!Z6IENy'֧4^8'`sѩ (.+wdLe"$x0a{7$UyV97_n4$JX02Sȡ]G"EʐKzOYyMPRg#Zy"g,B 75 (ԠM.Qk9T<>(Iϓ yV''#f~8.ߋNBo}`2[zQmеr߃hE!A-!TWQjݤ6%}_bAF+N"q@[@{} BTdiJt+E+m4% (H2VOǂ9| Y >/O#9(ꔪQ-@ ( bμ}e ^yVAE/7J2Vm{cߦ&E:`6٭\Ҷh$Cf^>8G~ke}7,Gzs/"Ɖ"[B(_:gCqaI$#@'&AVGYssLNP\rޡ)K}⼀9[ժ)[-V 3B%A$ש+&6L qa@\r]&"M{xm CʪI8,vߝ`?A4Կ"̍*5\r ԉ_4ݸ^oKUnDΪ"0bBp yPո)6xvHؽvx92Fw<|:xMPh|Dnt k0avm-v14.1.0-bus.352x288.l10.deltaq1.obu.md5000066400000000000000000000000411517466257200274000ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/mediaa51faffac365722f94697051af0d9e00 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-bus.352x288.l5.seg1.obu000066400000000000000000000005041517466257200262270ustar00rootroot00000000000000# "+' H{jih@Ǚq(8 Cne*![_+b+$ Z}+~@탠3d,>)r`:9t,dFYo V,9o( Ep.%91 y~hg*O?۠:xY`n8CE e t_XE? 
@Д3O}TY;p~hwS ] 1j68]/P8%P8+8ldav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-bus.352x288.l5.seg1.obu.md5000066400000000000000000000000411517466257200267070ustar00rootroot00000000000000841f3395b0836a92d5454e9bd30a3bb6 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-bus.64x64.l1.sdp0.obu000066400000000000000000000031531517466257200260650ustar00rootroot00000000000000  Uǜm 3BզkzZ(?DI㯝@m8D;Mm&ڗ S;cRQ|հC.zar }b2],Ʈ/nTdȓ7]-[ƣ)쎹7(ςYbS|ܔn˃݄ݜ5K7ѷb: &^bT}+Ji4qVhDєbձ_"0YgD4GD4Gt"LC/S3$$Ǟqo+#tA9!`T,Cx(/}.$R㌀%; "bS'9̞͘mQoeKή<}S!{\fjcPFS&3TmeTIiWp7l%p[ZKz&r7Y#k@ i󛈓O :zjL~_^.,pDfȺG垮D1Ԓ/& ~.|l&" 68} ^I% 3eA(Pk|ySRdO.ADo.~m3@gY}`I{\0I%_{͏M+䩥x[}fa #F!Ȑ7xß! ޣ6FA4Mq/-5JX4M{?,^nPAߧ e@Ү2%VP !-U$~ e+ 铢Mfatb-69\LK4VaXJGC[j5ˋ-\!A19dA鵮C6 8+GG=P@v6SQ#LұlT_ETIFXM\ԑž1%x䜯ܲoi4ՙ[0O$˶ aDsw y{vgzvh/+pT^dL ng86嚘sEi`XakK}`^8_2͠=w VH|9nuPAv*\e4S߷pަӷBU'R'YC9)U'f2 ' ]^rG5>: ݬtXϰģ%- ]\n"νƍ0H4%Ϣr^vD{S@C1eRV>\Gl:F#XC&] `40ΤH]Scd>Cnc'7sbj@Hn{DF2h˦{~!:=DTQ4ɷHdav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-bus.64x64.l1.sdp0.obu.md5000066400000000000000000000000411517466257200265420ustar00rootroot0000000000000015c9e34e1df4630094833207c5fc6a9c dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-bus.64x64.l1.sdp1.obu000066400000000000000000000031341517466257200260650ustar00rootroot00000000000000  Uǜm ?2k;RM0PxrC b7)ؿma_/)wy1NjFX!m5+ӔqT?\e(iFsg*d" cr0'mOӯ4y |A}̪.CAba+4luַa+ҳ63.}l8Z_wk*`su>P/Z?M+GlX :mޔ1,fLd 5#Lg唴[܁Y߰۰U .a~F%m7 6? _*a{*B?u[ U0m!Lj-D-x,ӲVqLk|uurodyP1Eb\lFИ8KX$Ƹ_igP]{"B(:5.y$Ka pԞ"R0|DpWo\rghQ[3ۍehً\a ] sBB % ^:D .?A_Ŗ[t- i СW>v; yL!-.@Drf(/:[3zP[w:b7)/%dD{3?[Twڈ(2;=prD*͋m 5HPπ#KԠ4׼[pP @K}y$`A);疥?ஏA$Ϣ)1X!$3L6=ރޡgƣJy#E㳋ڥ{ ~Sv꺧<|Rp-3 Ydy].]//yQ`~怚;nqZ@x杖j]Q9(Jk .@=$m7 (/oj>hGB$*s yO!Or!?"z`/l2e$`Yk;.+I/yjierjNMVR~>`(Ԛ1ML8xƆ@(٫C#2evMmY{Z!{IYpr_ Ÿ숞L;,H-)jҕ>,v"cϓN۝O ˤqGv̕J\H1}\rvT f*[&&˝n|%O^R.Y5+w3J$unoϘ B$p`̀A(3-fi0U%]. 
yijؚb5Wu҉ohaoΪ.2"^$Zueȝ!tӻn jt;qm<Q<#?C =E itO,5Ki139\FrRW E¥H&LrYc#>ou!S(W t8KLu1@`EAK?\lQ݄.IH|b׌ |rPN*cP_4<o%=E6$0W_(1F)1mQdd$e7 W՗; L|#4V/Z: 9Arp1!M@1qDWE^|Mb^[ qU Ix8;ˤZhvapf g:lž cilmMjɷ5zkIYmYxnhkg};^69=憰*Na@KfԬo3pC0r~y#q73 UekRG: I/'_/$˖tkO`.J;WY{<tk@íL;m^N[ @)ԋ}YyL`ލZ#{Zz)f 1l^QɿMsY۫xR,I3 j|A33JY0)mS q´dy=+hZo6v?q\|sVt= 6bDTlk1Fv$ "Va:|#)rR. J# zUt~8z>> N 7oH(64/]/#^EMBϼFĺ}"1z/GV6U-{@ޖaF[z =᜵W7Oqĺ/}4mƘTWPl ?>4'8CA7f~*nfCIIasUnn1ړ=Kwzfgd(bhك qAnektYOST[[$OId#}~{7ⅵG;F1łAe=DVVU8¬`L~5PPtc\vk/4Ӳwy~ i|ҩvMͤC)a;w|x$jqd3H0\,,Ξz^Me;ڮ}7LdkQĘ=. WA?8 PEUꊨ2R'1j=@/e}=$S5 D^58TMMb0s~jU,vB2#M}`Dh u;Zs : X yôArcW{_$7u8nqi9:\ch(/TjֱS)yf= $`=-ObTFoBVЙ|L=EI~fTGndm ÓϓZ+2_=v!~bKgTI< VcT<L$xF7+iZ @2)Ȅ櫚aк.>+6g@% ώPÒD 1MȌ&0,_Y5gaVN`PϜAͦ:5`9I"I=|XS :v/_u,cӡBC%Ql_fN)l1`BMW 6 xaW|ݫ= xCQC;bF!dPP~u@*i`F֤v*==dkHϛ> O ^B1LwS">ztMS(7MށLO؇Pb:wF.k { Ez]1%'zX6wfbi`+jˋaIk(v^*vQոʦg@r8P.̉(ɝI.)JRji>eCīvPF^<6xv$R)Xח =KENkHxy,'FdQ)}[B4/R탳j\Y *}vXpd˔AJCI+a@e y=9u?E ՘P:`RbZG=k+s+l\?`s9$=yh8Ch(F1!WOaC2q8}5mTEXJJ\UT%Ye/t3+m׭b5\o6MYw J5'Ae6G| ɚfQM`1SRlgj۽A*FA.M\J]=(&R |Tų(:Ihʙ\j3گs xIwJ pwV'0雜xu^eaMH$UЎ]KtŞ)|"/F׈yW\0skx&j^i" Zăl!p ⛸g"8O6pP'Q9 @oAn=quwb]%-(̄b:2pA}=`n2MrFJ)wy/δBaBv<+a.}FH1 >1ыwN@?mǎ/Ք̘zP+=ȲuPlrRtEs17-HlMFiy0q) XfԪ[$TdD[[f"֎"=[P#g<Ē?ئ7{p!Pْ`1#$--4t-w7frjn"L [I'?([coՀ/>24ѝWF.7s%T1?3LƁ?Doqڔ Z1Cu*jsB#_H悢6>O[FUVDvBĴEtWTm ٹp"fgISNNwkMd"""/]Z8^ik!^t%7g<LӦ򝖻Qʰu+IHjhmlg@c i~0Y. ;9͏맙F 'V}_@}x[@ggh >բ,ZDcAxppCEmH&wöUH|ۄsf`{9˟7bu X޿S(^WB3c)rXu짃9/V[<@s߮ 8ǜn='CUJtkHQ><8Uq;[~䰝cipn.y =wc"ac<_AOez5R[`.JX=[ն5CSCą?,i؝Gy"CLAV濆e`zn a\Bt?JL<3?Tz%ŎI$vq𬣵;Cpt{̿:2#nw|DHw|HCC;c#9\ 7j:IQ]0]`9=\ISXZ7V = RٚTQ|Jƽi376Lpr"h˕IO{ 䇻?C@v{O(Ɗ:SݛO%DNov YGcPDHoޟr=Q15۰A@%DC ֳq70Q&n̾&Uὲ RlY%^7cN$/vvrmƎ$I@So{09bFhB sA_dz3*2%NDXE'Gý?ܐ.xQl2BոF+}y6 ;lS#-*M_i72v;cH}ԛ |Ǖ[)cH!Fb Kp̜@,+#Led/_d_9.3=8ias*}_`r[%Q@6 |rWpdA)4\&e$"kz˻k?!ZFE RY H>nvʼn P=M6a[߅c6'nv?eTZ\nntfíp1-J \i 9E?}:Ce:s!VvaLMB^5XCN| ҭ5Wjb=w o5,%۴oilm.ngeX)WWStTŽldxymf+D!qk"n %NeOr ]HZĪm IsKsMV3=r^ºk!,RR{#;9l5+I#8|W~0|; n!)QU+Bͷ+Cu !/>è'-w4/ܚ]-rRGCWylsU;%7(8[6M )q1Z17u+. 
Y 0R%$@9 U7t ěC6T3V]bXWeH!mz[#9k08"F./ .6{.Qq{cjK \"$gmB]*/1(mOꟃu8ˣ\)WiZ!Tߤ뮝y2.fs|xJX5ҙcA,SgPhc&7J|/\lOL+ŠMBDd9~L( 1hD(_h`þsG(J_@S0sՓ~@އ TUSdBj+SF#t ;SόSNhS,|޶Xj64lm۾0z$TL,q+X# d ..kpِ΁|b(m`)b̆F {24MB5BymI{&k=CM.pH&N̈9[K -\G*+U)NP} vEUMO-IO3$2l=.`bA.` Yh;I~Y!Pr$J!+Pd*Yļ A۸BнFt`t֘D%?kO:(I0xj:-3^9j._ љ^2|y3mi {-@5,H\} Wn:xiu9ÍlD8Jx4=@N W[:SviqN Ǵr!(4i$2vAFs8mxi-C`8dETTKn>GzqX?g7*E$w2՛r6.㗻]n4Z&W)aHJq$1p="[N8l}t~aM",; 9rlM$bk.YP Qv{=,zB9jKLZ?q4 3y@N(,2ع*Y-ϔQvM+]/9n3+[e69aŏZv 77#A٬:S(IplV5g ![ŋ8GK}>J2|HboUpv)bSaH2fqJwfS\hIwJphu6z̫x=c2cY ^`Fx?x/<*h^-^YkX:U.;MbeyI+ԟ]}MLڜ'bzƄ9HG&{kvXI @)оfwXbFA},>5E/"2aATS"K*`Ɔ1*ݱll1xS)zwUgpZIHvd, 7`6#4WXNiNFʜ*OG Bς s;{+\yYɜEHK"ہL [ނX/ۓYw]3ԂkٿtpL,Nd9=P>U)Ò8Uol 0x>Pc[kX)Wd\t?Ru.w8]uڤMeC`Qۃ'\tuV<#Ga%[oD{o=J& D !?,?{Ysonw͉#8)߯cɉ7IM@DA0v*BAP>j|WO? /vj 7ؽijig{t 7uveYi[q]S"D%r[HM͠+Z_VҀ [{/Ŷ¡'?qc%WQZ+6GMHOֿm;4>VJk$?tSɁ0a&gک[X82[X%.=1u} So dh@;_Mmt"_&y՟J!O'^.4ې4d{[z{}FHe|j#]ԗY.tzJfNuU@d4]?SITnk㈱4SNb6XM= Y "\PCQ@^ט`知]NM5=Y!X$@E!tbe?:mo,UUlN'c‡uE$26Opy>*ŬBI n/ e/Ÿk @݀F>#* ykU,ISI[߿vE?mzDz9øl7ZMRNPC?vs,4S}kXFp-^@SǾ D9!4, D:B:K )K[sXK>Ԧ~ٖ_O fV4nQBuM }`z{K0g*, ;=j sfDjX`5g G=D" Vk,)&f-rL2hfbx'^|pAo%D񽊳j~PPzAk1`uB> $Lɪ}Tk?eU{2"8,j [/T$,Lj~xX!#bg*?He"?I3^_cW.|Ͼ VGARVa, v0ǧи)T}h p'o%`vc鸍,찃?xŏΆ)XbG5_siKH|b?p:׷Б;Y4J+8rxetZQG6$st$edzjcbYT|% d!*(SYkL XzyN^v ږޥe{pUYB $Cr1\&F P`p F4?㓟ەR]k)X1`EA;l,X0`YX0<#݂/d^R{}6?wB[m'w=LDyo 5/Х R`|ѭH#:iud@]Q2$gI%t#{ԭsU&3 .(Ax.S\ *#1R؂cTQMjF-0APZb[{zv} All~ԣUp=01]9V!΢dz3۫:$mo8?QVÃdIkHزk1_sP!?E"*tJTBg(]\J&:xEw^QtWy@4f9V2pSx.,otG4mty 7Zx'64X)C04NnElt8BcݚwhVb<$#0{2V%,,'=yXWi:O譛k8bKm?_M̪nTu*|ljTJKz*d}l2}+,,K+۱?dA'ҙsX \iӽ/UܷBrqpEʣ7\P%TėuanK ՚elh"~ȶ0$@'Tb':ϦKGgX;T03+/bjpo%;C}=^+Jk!Vt}9"Mb8:E)N5%??cҟ{$Mƨ"&G1H'|&dTyPܤ#) m@tC1|B pXjʓ%&Al`H!J+?[b~ lNoXN9&唛9&559HRK%~5S>bG‚X־|]ƹ#'>$['ǹ 6}iyP5w 'Bm mk-Ӄ㵍}[_Cu=USY1iDG^?yH#e~>1Rb@@AЖ{j`9mb|`wpl>Kq24k+ڦ\o٩<0 (>< %_/u'iLWK5$4rƾ]7z<IK*䞣oI`%. 
G>J&mn1 X@_U j?RU-\o2lU@/R| 8:K9hX;g] hg `p#pkP֛/˗m G{J\h0$+q$wq-ɢSW1/oߛ@F}TvRBIxS>pvYh[#Y[VqvL`AMkvx<&!Žw{yj溎֗tp=-Ի9}/)yfϚ1&e?wyHH b¦&UfG"30Au'~9|5%$0fw*p+أ5&Պ9;Tg0̂;M#ctߘwGzK!bCX L'ϯ^Л# >u Mz,T~eG+s}93߯iŽq}rn$5ic)^A3v )=f+r,"UEnV\HOjް`>yL`&T;&ɉzz_el IP`CW4M~\U3 0np>U-ss6.75 #2%>%g0bs:Z3:<QbI5m0қrYш6<ڣKU>MP2h$V;r=H = 3Nm%h{?/f6Uyh=Z$$2L_Lo L)"saKG0_hKw)Z' ڴ>8'&+ A Ԃ6*$DZ719 En+-K+B@H4=%DҺ%9V(Sf%%ft7yu[CĥG.(Z ܋ UtXPs{COQJu$;A:rW A5CgU>=X{nMPFO`7 WV 9׺J,CXM: =^,qN^Wd®&e2oJmKB2 A=('.?Bi9Z# >޶/c7S8/"pP<ˡmP;dd+rܧh2jC!UHu#'w&YsΞ.]\W2}g6Yb|h45kiCY%l::_RC :r- M!% 8OީzDmy`"BB_,dy3n;P3t-7 G[>^SNYrGi[RX+8K`~怚;ZnqZ@ B]Ɇ7{cakҁc a<];t>t,?P0+ᑵ /:F#HK[]) +GgM0= *R"V܊cP4<.C@k43tE1AI agi}o#ʌڠOI0~hnIT&EkHBhq-i !@^-$ٓgp _#388ڈhMJUxi[QWC/3EL\eUn ڏYL]C=bX\N^g/=Por2'9B-A/;ݥaA`ĨOX~!%UL ѯ Uk6/%'KFj' Rvam)uOht: 睛0mW??+wB)o!]8h\jNK#%GJuk#<4?_9;Pb)2EO}SC/yƇ#QQ^}F80}!Gۗ}CL(@M2t0Q8FcCAǪ fхk +A^H%!G%#U1m.sCSU ":: Dq|o!\,ul%i@Ć:?a[\uȖ1՚-O=)/lNy|,#gԶX=B|pԷHlj"xq̑ # n~8ڏ?f1-.yl1A']jVgf."SԘ'ؖL88K<"@tsAdWm˾JOs,1J-s}I ŞRu0;P]-Z.HFРsd2^v।sp]WɄ12i1ahblEB,F A'k슶а4JKP%\w ITդE2GH؄^kdǽ0q wkwҁyO@Ah%J4";ks"m-T%p휷½+9^tf 0* Ӏ۸M\nT()\j̡-@J ҵZpēnp5j,,NMx):Jp|OYoc4Dz̎9"yVj}7`Iw8؊kMUނGz8J|HFH7}zA{d?)(ĉ:apoC_%;ƒ\_@ [ v3gP52dGkv0jx~4lm܈pt6〜 ?}mΗx]Nwjsu3g.Q[8<;*ܹ[q'&cSkhq0q=> L?h4BUu߉ eAG{T `~怚~ⴀ B]Ɇ7{cakҁc a<];t>t,?P0+ᑵ /:F#HK[]) +GgM0= *R"V܊cP4<.C@k43tE1AI agi}o#ʌڠOI0~hnIT&EkHBhq-i !@^-$ٓgp _#388ڈhMJUxi[QWC/3EL\eUn ڏYL]C=bX\N^g/=Por2'9B-A/;ݥaA`ĨOX~!%UL ѯ Uk6/%'KFj' Rvam)uOht: 睛0mW??+wB)o!]8h\jNK#%GJuk#<4?_9;Pb)2EO}SC/yƇ#QQ^}F80}!Gۗ}CL(@M2t0Q8FcCAǪ fхk +A^H%!G%#U1m.sCSU ":: Dq|o!\,ul%i@Ć:?a[\uȖ1՚-O=)/lNy|,#gԶX=B|pԷHlj"xq̑ # n~8ڏ?f1-.yl1A']jVgf."SԘ'ؖL88K<"@tsAdWm˾JOs,1J-s}I ŞRu0;P]-Z.HFРsd2^v।sp]WɄ12i1ahblEB,F A'k슶а4JKP%\w ITդE2GH؄^kdǽ0q wkwҁyO@Ah%J4";ks"m-T%p휷½+9^tf 0* Ӏ۸M\nT()\j̡-@J ҵZpēnp5j,,NMx):Jp|OYoc4Dz̎9"yVj}7`Iw8؊kMUނGz8J|HFH7}zA{d?)(ѥLx2vz Tp:(\hQm 1F`1uvOvW. 
B&zR$9q@'$T-|[Ɲ j*î)Dž*[JDƭu^&8 - f6lT1-0|kkq^' ƫ)=;|`D(s~}.Q>;W vu@|H>x C<ƻYJ#; '1p̤ӎ"$nzYZQͽ̟yCIkcu ߘ0LRkxmx*J+śH=Xq练D1}w[/  ß^Ķ`7Uo#U"avm-v14.1.0-bus.64x64.l5.opfl0-refinemv0.obu.md5000066400000000000000000000000411517466257200305320ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/mediaae41842a250d29146e5316a4f77271ed dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-hm.64x64.l5.filmgrain.obu000066400000000000000000000031451517466257200270070ustar00rootroot00000000000000 >`怚;Znq[@}\o_ooGOU0OOϮnpc4Zt+c54   qK  &E9B'*q3856-I>yP;뭭QSA7NOFn-iqT_a?<޷hAJff!ԨVkM2]w6BI8|_,NOcDߜJD#MHK=dN&4sI*uJ=+1'x$EMg0d˺HrӟXL,[%\\N™Ko ñmX{7v,> Q\2l}kʕUrvz\y l2^rXxHh,.+F nj*T$Λ)/|+k+WEA;8 ,RhLDzwÕr7 @2R=timSO `~sK@smP9YK _G򪧟ػ1;aBT%K%ǘCٍh`Oiˬ( vɄcG \gιqx$8ʹgcRa)Ҫӆ5ڽk)gHggg]9PǏL X  12 e\﯏P.pON/Ӭ +*;S&  _9 8J{e\o/0.P/OoNOpԓkӌ,JCc[,)kA 8ke\ϏT00.p/o._ΏѫĬ b*.}5cD 8dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/media/avm-v14.1.0-hm.64x64.l5.filmgrain.obu.md5000066400000000000000000000000411517466257200274630ustar00rootroot0000000000000054c81720202e37bf70ab2cbd21dcc697 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/meson.build000066400000000000000000000573131517466257200222170ustar00rootroot00000000000000# Copyright © 2018-2026, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav2d', ['c'], version: '0.1.0', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.49.0') dav2d_src_root = meson.current_source_dir() cc = meson.get_compiler('c') # Configuratin data for config.h cdata = configuration_data() # Configuration data for config.asm cdata_asm = configuration_data() # Include directories dav2d_inc_dirs = include_directories(['.', 'include/dav2d', 'include']) dav2d_api_version_major = cc.get_define('DAV2D_API_VERSION_MAJOR', prefix: '#include "dav2d/version.h"', include_directories: dav2d_inc_dirs).strip() dav2d_api_version_minor = cc.get_define('DAV2D_API_VERSION_MINOR', prefix: '#include "dav2d/version.h"', include_directories: dav2d_inc_dirs).strip() dav2d_api_version_revision = cc.get_define('DAV2D_API_VERSION_PATCH', prefix: '#include "dav2d/version.h"', include_directories: dav2d_inc_dirs).strip() dav2d_soname_version = '@0@.@1@.@2@'.format(dav2d_api_version_major, dav2d_api_version_minor, dav2d_api_version_revision) # # Option handling # # Bitdepth option dav2d_bitdepths = get_option('bitdepths') foreach bitdepth : ['8', '16'] cdata.set10('CONFIG_@0@BPC'.format(bitdepth), 
dav2d_bitdepths.contains(bitdepth)) endforeach # ASM option is_asm_enabled = (get_option('enable_asm') == true and (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu() == 'ppc64le' or host_machine.cpu_family().startswith('riscv') or host_machine.cpu_family().startswith('loongarch') or (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == ''))) cdata.set10('HAVE_ASM', is_asm_enabled) if is_asm_enabled and get_option('b_sanitize') == 'memory' error('asm causes false positive with memory sanitizer. Use \'-Denable_asm=false\'.') endif cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or (get_option('trim_dsp') == 'if-release' and get_option('buildtype') == 'release')) # Logging option cdata.set10('CONFIG_LOG', get_option('logging')) # # OS/Compiler checks and defines # # Arguments in test_args will be used even on feature tests test_args = [] optional_arguments = [] optional_link_arguments = [] if host_machine.system() in ['linux', 'gnu', 'emscripten'] test_args += '-D_GNU_SOURCE' add_project_arguments('-D_GNU_SOURCE', language: 'c') endif have_clock_gettime = false have_sigaction = false have_posix_memalign = false have_memalign = false have_aligned_alloc = false if host_machine.system() == 'windows' cdata.set('_WIN32_WINNT', '0x0601') cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs cdata.set('_UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf cdata.set('_CRT_DECLARE_NONSTDC_NAMES', 1) # Define to get off_t from sys/types.h on MSVC if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args) cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows else cdata.set('fseeko', '_fseeki64') cdata.set('ftello', '_ftelli64') endif if host_machine.cpu_family() == 'x86_64' if cc.get_argument_syntax() != 'msvc' optional_link_arguments +=
'-Wl,--dynamicbase,--nxcompat,--tsaware,--high-entropy-va' endif elif host_machine.cpu_family() == 'x86' or host_machine.cpu_family() == 'arm' if cc.get_argument_syntax() == 'msvc' optional_link_arguments += '/largeaddressaware' else optional_link_arguments += '-Wl,--dynamicbase,--nxcompat,--tsaware,--large-address-aware' endif endif # On Windows, we use a compatibility layer to emulate pthread thread_dependency = [] thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c')) rt_dependency = [] rc_version_array = meson.project_version().split('.') winmod = import('windows') rc_data = configuration_data() rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) rc_data.set('API_VERSION_MAJOR', dav2d_api_version_major) rc_data.set('API_VERSION_MINOR', dav2d_api_version_minor) rc_data.set('API_VERSION_REVISION', dav2d_api_version_revision) rc_data.set('COPYRIGHT_YEARS', '2018-2026') else thread_dependency = dependency('threads') thread_compat_dep = [] rt_dependency = [] if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args) have_clock_gettime = true elif host_machine.system() not in ['darwin', 'ios', 'tvos'] rt_dependency = cc.find_library('rt', required: false) if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency) error('clock_gettime not found') endif have_clock_gettime = true endif have_sigaction = cc.has_function('sigaction', prefix : '#include <signal.h>', args : test_args) have_posix_memalign = cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args) have_memalign = cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args) have_aligned_alloc = cc.has_function('aligned_alloc', prefix : '#include <stdlib.h>', args : test_args) endif cdata.set10('HAVE_CLOCK_GETTIME', have_clock_gettime) cdata.set10('HAVE_SIGACTION', have_sigaction)
cdata.set10('HAVE_POSIX_MEMALIGN', have_posix_memalign) cdata.set10('HAVE_MEMALIGN', have_memalign) cdata.set10('HAVE_ALIGNED_ALLOC', have_aligned_alloc) # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64 have_fseeko = true if host_machine.system() == 'android' if not cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args) if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args + ['-U_FILE_OFFSET_BITS']) warning('Files larger than 2 gigabytes might not be supported in the dav2d CLI tool.') add_project_arguments('-U_FILE_OFFSET_BITS', language: 'c') elif get_option('enable_tools') error('dav2d CLI tool needs fseeko()') else have_fseeko = false endif endif endif libdl_dependency = [] have_dlsym = false if host_machine.system() == 'linux' libdl_dependency = cc.find_library('dl', required : false) have_dlsym = cc.has_function('dlsym', prefix : '#include <dlfcn.h>', args : test_args, dependencies : libdl_dependency) endif cdata.set10('HAVE_DLSYM', have_dlsym) libm_dependency = cc.find_library('m', required: false) # Header checks stdatomic_dependencies = [] if not cc.check_header('stdatomic.h') if cc.get_id() == 'msvc' # we have a custom replacement for MSVC stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/msvc'), ) elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''', name : 'GCC-style atomics', args : test_args) stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/gcc'), ) else error('Atomics not supported') endif endif if host_machine.cpu_family().startswith('wasm') # enable atomics + bulk-memory features stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true) endif cdata.set10('HAVE_SYS_TYPES_H', cc.check_header('sys/types.h')) cdata.set10('HAVE_UNISTD_H', cc.check_header('unistd.h')) cdata.set10('HAVE_IO_H',
cc.check_header('io.h')) have_pthread_np = cc.check_header('pthread_np.h') cdata.set10('HAVE_PTHREAD_NP_H', have_pthread_np) test_args += '-DHAVE_PTHREAD_NP_H=' + (have_pthread_np ? '1' : '0') # Function checks if not cc.has_function('getopt_long', prefix : '#include <getopt.h>', args : test_args) getopt_dependency = declare_dependency( sources: files('tools/compat/getopt.c'), include_directories : include_directories('include/compat'), ) else getopt_dependency = [] endif have_getauxval = false have_elf_aux_info = false if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu_family().startswith('loongarch') or host_machine.cpu() == 'ppc64le' or host_machine.cpu_family().startswith('riscv')) have_getauxval = cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args) have_elf_aux_info = cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args) endif cdata.set10('HAVE_GETAUXVAL', have_getauxval) cdata.set10('HAVE_ELF_AUX_INFO', have_elf_aux_info) pthread_np_prefix = ''' #include <pthread.h> #if HAVE_PTHREAD_NP_H #include <pthread_np.h> #endif ''' cdata.set10('HAVE_PTHREAD_GETAFFINITY_NP', cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) cdata.set10('HAVE_PTHREAD_SETAFFINITY_NP', cc.has_function('pthread_setaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) cdata.set10('HAVE_PTHREAD_SETNAME_NP', cc.has_function('pthread_setname_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) cdata.set10('HAVE_PTHREAD_SET_NAME_NP', cc.has_function('pthread_set_name_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)) cdata.set10('HAVE_C11_GENERIC', cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)) # Compiler flag tests if cc.has_argument('-fvisibility=hidden') add_project_arguments('-fvisibility=hidden', language: 'c') else
warning('Compiler does not support -fvisibility=hidden, all symbols will be public!') endif # Compiler flags that should be set # But when the compiler does not support them # it is not an error and is silently tolerated if cc.get_argument_syntax() != 'msvc' optional_arguments += [ '-Wundef', '-Werror=vla', '-Wno-maybe-uninitialized', '-Wno-missing-field-initializers', '-Wno-unused-parameter', '-Wstrict-prototypes', '-Werror=missing-prototypes', '-Wshorten-64-to-32', ] if host_machine.cpu_family() == 'x86' optional_arguments += [ '-msse2', '-mfpmath=sse', ] endif else optional_arguments += [ '-wd4028', # parameter different from declaration '-wd4090', # broken with arrays of pointers '-wd4996', # use of POSIX functions '-wd5287', # operands are different enum types ] endif if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain') optional_arguments += '-fomit-frame-pointer' optional_arguments += '-ffast-math' endif if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang' and cc.version().startswith('11')) # Workaround for Xcode 11 -fstack-check bug, see #301 optional_arguments += '-fno-stack-check' endif if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm')) optional_arguments += '-fno-align-functions' endif add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c') add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c') # libFuzzer related things fuzzing_engine = get_option('fuzzing_engine') if fuzzing_engine == 'libfuzzer' if not cc.has_argument('-fsanitize=fuzzer') error('fuzzing_engine libfuzzer requires "-fsanitize=fuzzer"') endif fuzzer_args = ['-fsanitize=fuzzer-no-link', '-fsanitize=fuzzer'] add_project_arguments(cc.first_supported_argument(fuzzer_args), language : 'c') endif cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big') if host_machine.cpu_family().startswith('x86') if
get_option('stack_alignment') > 0 stack_alignment = get_option('stack_alignment') elif host_machine.cpu_family() == 'x86_64' or host_machine.system() in ['linux', 'darwin', 'ios', 'tvos'] stack_alignment = 16 else stack_alignment = 4 endif cdata_asm.set('STACK_ALIGNMENT', stack_alignment) endif # # ASM specific stuff # use_gaspp = false if (is_asm_enabled and (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm')) and cc.get_argument_syntax() == 'msvc' and (cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0'))) gaspp = find_program('gas-preprocessor.pl') use_gaspp = true gaspp_args = [ '-as-type', 'armasm', '-arch', host_machine.cpu_family(), '--', host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm', '-nologo', '-I@0@'.format(dav2d_src_root), '-I@0@/'.format(meson.current_build_dir()), ] gaspp_gen = generator(gaspp, output: '@BASENAME@.obj', arguments: gaspp_args + [ '@INPUT@', '-c', '-o', '@OUTPUT@' ]) endif cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') cdata.set10('ARCH_ARM', host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64') have_as_func = false have_as_arch = false aarch64_extensions = { 'dotprod': 'udot v0.4s, v0.16b, v0.16b', 'i8mm': 'usdot v0.4s, v0.16b, v0.16b', 'sve': 'whilelt p0.s, x0, x1', 'sve2': 'sqrdmulh z0.s, z0.s, z0.s', } supported_aarch64_archexts = [] supported_aarch64_instructions = [] if (is_asm_enabled and (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))) as_func_code = '''__asm__ ( ".func meson_test" ".endfunc" ); ''' have_as_func = cc.compiles(as_func_code) # fedora package build infrastructure uses a gcc specs file to enable # '-fPIE' by default. The chosen way only adds '-fPIE' to the C compiler # with integrated preprocessor. It is not added to the standalone # preprocessor or the preprocessing stage of '.S' files. 
So we have to # compile code to check if we have to define PIC for the arm asm to # avoid absolute relocations when building for example checkasm. check_pic_code = ''' #if defined(PIC) #error "PIC already defined" #elif !(defined(__PIC__) || defined(__pic__)) #error "no pic" #endif ''' if cc.compiles(check_pic_code) cdata.set('PIC', '3') endif if host_machine.cpu_family() == 'aarch64' have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''') as_arch_str = '' if have_as_arch as_arch_level = 'armv8-a' # Check what .arch levels are supported. In principle, we only # want to detect up to armv8.2-a here (binutils requires that # in order to enable i8mm). However, older Clang versions # (before Clang 17, and Xcode versions up to and including 15.0) # didn't support controlling dotprod/i8mm extensions via # .arch_extension, therefore try to enable a high enough .arch # level as well, to implicitly make them available via that. foreach arch : ['armv8.2-a', 'armv8.4-a', 'armv8.6-a'] if cc.compiles('__asm__ (".arch ' + arch + '\\n");') as_arch_level = arch endif endforeach # Clang versions before 17 also had a bug # (https://github.com/llvm/llvm-project/issues/32220) # causing a plain ".arch <level>" to not have any effect unless it # had an extra "+<feature>" included - but it was activated on the # next ".arch_extension" directive instead. Check if we can include # "+crc" as dummy feature to make the .arch directive behave as # expected and take effect right away.
if cc.compiles('__asm__ (".arch ' + as_arch_level + '+crc\\n");') as_arch_level = as_arch_level + '+crc' endif cdata.set('AS_ARCH_LEVEL', as_arch_level) as_arch_str = '".arch ' + as_arch_level + '\\n"' endif if use_gaspp python3 = import('python').find_installation() endif foreach name, instr : aarch64_extensions if use_gaspp f = configure_file( command: [python3, '-c', 'import sys; print(sys.argv[1])', '@0@'.format(instr)], output: 'test-@0@.S'.format(name), capture: true) r = run_command(gaspp, gaspp_args, f, '-c', '-o', meson.current_build_dir() / 'test-' + name + '.obj', check: false) message('Checking for gaspp/armasm64 ' + name.to_upper() + ': ' + (r.returncode() == 0 ? 'YES' : 'NO')) if r.returncode() == 0 supported_aarch64_instructions += name endif else # Test for support for the various extensions. First test if # the assembler supports the .arch_extension directive for # enabling/disabling the extension, then separately check whether # the instructions themselves are supported. Even if .arch_extension # isn't supported, we may be able to assemble the instructions # if the .arch level includes support for them. 
code = '__asm__ (' + as_arch_str code += '".arch_extension ' + name + '\\n"' code += ');' supports_archext = cc.compiles(code) code = '__asm__ (' + as_arch_str if supports_archext supported_aarch64_archexts += name code += '".arch_extension ' + name + '\\n"' endif code += '"' + instr + '\\n"' code += ');' if cc.compiles(code, name: name.to_upper()) supported_aarch64_instructions += name endif endif endforeach endif endif cdata.set10('HAVE_AS_FUNC', have_as_func) cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch) foreach name, _ : aarch64_extensions cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', name in supported_aarch64_archexts) cdata.set10('HAVE_' + name.to_upper(), name in supported_aarch64_instructions) endforeach cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86')) cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64') cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86') if host_machine.cpu_family().startswith('x86') cdata_asm.set('private_prefix', 'dav2d') cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64') cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86') cdata_asm.set10('PIC', true) # Convert SSE asm into (128-bit) AVX when compiler flags are set to use AVX instructions cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__').strip() != '') endif cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le') cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv')) cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32') cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64') cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch')) cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32') cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64') # meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable # when additional flags like '-fprofile-instr-generate'
are passed via CFLAGS # see following meson issue https://github.com/mesonbuild/meson/issues/5482 if (host_machine.system() in ['darwin', 'ios', 'tvos'] or (host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86')) cdata.set10('PREFIX', true) cdata_asm.set10('PREFIX', true) endif if is_asm_enabled and host_machine.cpu_family().startswith('x86') # NASM compiler support nasm = find_program('nasm') # check NASM version if nasm.found() nasm_r = run_command(nasm, '-v', check: true) out = nasm_r.stdout().strip().split() if out[1].to_lower() == 'version' if out[2].version_compare('<2.14') error('nasm 2.14 or later is required, found nasm @0@'.format(out[2])) endif else error('unexpected nasm version string: @0@'.format(nasm_r.stdout())) endif endif # Generate config.asm config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm) if host_machine.system() == 'windows' nasm_format = 'win' elif host_machine.system() in ['darwin', 'ios', 'tvos'] nasm_format = 'macho' else nasm_format = 'elf' endif if host_machine.cpu_family() == 'x86_64' nasm_format += '64' else nasm_format += '32' endif nasm_gen = generator(nasm, output: '@BASENAME@.obj', depfile: '@BASENAME@.obj.ndep', arguments: [ '-f', nasm_format, '-I', '@0@/src/'.format(dav2d_src_root), '-I', '@0@/'.format(meson.current_build_dir()), '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@', '@EXTRA_ARGS@', '@INPUT@', '-o', '@OUTPUT@' ]) endif if is_asm_enabled and host_machine.cpu_family().startswith('riscv') as_option_code = '''__asm__ ( ".option arch, +v\n" "vsetivli zero, 0, e8, m1, ta, ma" ); ''' if not cc.compiles(as_option_code, name : 'RISC-V Vector') error('Compiler doesn\'t support \'.option arch\' asm directive. Update to binutils>=2.38 or clang>=17 or use \'-Denable_asm=false\'.') endif endif # Generate config.h config_h_target = configure_file(output: 'config.h', configuration: cdata) # # Include subdir meson.build files # The order is important! 
subdir('include') subdir('doc') subdir('src') subdir('tools') subdir('examples') subdir('tests') dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/meson_options.txt000066400000000000000000000031451517466257200235040ustar00rootroot00000000000000# General options option('bitdepths', type: 'array', choices: ['8', '16'], description: 'Enable only specified bitdepths') option('enable_asm', type: 'boolean', value: true, description: 'Build asm files, if available') option('enable_tools', type: 'boolean', value: true, description: 'Build dav2d cli tools') option('enable_examples', type: 'boolean', value: false, description: 'Build dav2d examples') option('enable_tests', type: 'boolean', value: true, description: 'Build dav2d tests') option('enable_seek_stress', type: 'boolean', value: false, description: 'Build seek_stress test tool') option('enable_docs', type: 'boolean', value: false, description: 'Build dav2d documentation') option('logging', type: 'boolean', value: true, description: 'Print error log messages using the provided callback function') option('testdata_tests', type: 'boolean', value: false, description: 'Run tests requiring the test data repository') option('fuzzing_engine', type: 'combo', choices : ['none', 'libfuzzer', 'oss-fuzz'], value: 'none', description: 'Select the fuzzing engine') option('fuzzer_ldflags', type: 'string', description: 'Extra LDFLAGS used during linking of fuzzing binaries') option('stack_alignment', type: 'integer', value: 0) option('xxhash_muxer', type : 'feature', value : 'auto') option('trim_dsp', type: 'combo', choices: ['true', 'false', 'if-release'], value: 'if-release', description: 'Eliminate redundant DSP functions where possible') 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/000077500000000000000000000000001517466257200214375ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/000077500000000000000000000000001517466257200236135ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/aarch64-android.meson000066400000000000000000000004271517466257200275270ustar00rootroot00000000000000[binaries] c = 'aarch64-linux-android21-clang' cpp = 'aarch64-linux-android21-clang++' ar = 'llvm-ar' strip = 'llvm-strip' pkgconfig = 'pkg-config' [properties] needs_exe_wrapper = true [host_machine] system = 'android' cpu_family = 'aarch64' cpu = 'aarch64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/aarch64-linux-clang.meson000066400000000000000000000004711517466257200303270ustar00rootroot00000000000000[binaries] c = 'clang' cpp = 'clang++' ar = 'aarch64-linux-gnu-ar' strip = 'aarch64-linux-gnu-strip' exe_wrapper = 'qemu-aarch64' [properties] c_args = '-target aarch64-linux-gnu' c_link_args = '-target aarch64-linux-gnu' [host_machine] system = 'linux' cpu_family = 'aarch64' cpu = 'aarch64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/aarch64-linux.meson000066400000000000000000000003721517466257200272450ustar00rootroot00000000000000[binaries] c = 'aarch64-linux-gnu-gcc' cpp = 'aarch64-linux-gnu-g++' ar = 'aarch64-linux-gnu-ar' strip = 'aarch64-linux-gnu-strip' exe_wrapper = 'qemu-aarch64' [host_machine] system = 'linux' cpu_family = 'aarch64' cpu = 'aarch64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/aarch64-w64-mingw32.meson000066400000000000000000000005351517466257200300130ustar00rootroot00000000000000[binaries] c = 'aarch64-w64-mingw32-clang' cpp = 'aarch64-w64-mingw32-clang++' ar = 'aarch64-w64-mingw32-ar' strip = 'aarch64-w64-mingw32-strip' pkgconfig = 'pkg-config' 
windres = 'aarch64-w64-mingw32-windres' [properties] c_link_args = ['-static-libgcc'] [host_machine] system = 'windows' cpu_family = 'aarch64' cpu = 'aarch64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/arm-android.meson000066400000000000000000000004251517466257200270540ustar00rootroot00000000000000[binaries] c = 'armv7a-linux-androideabi21-clang' cpp = 'armv7a-linux-androideabi21-clang++' ar = 'llvm-ar' strip = 'llvm-strip' pkgconfig = 'pkg-config' [properties] needs_exe_wrapper = true [host_machine] system = 'android' cpu_family = 'arm' cpu = 'arm' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/arm64-iPhoneOS.meson000066400000000000000000000022341517466257200272720ustar00rootroot00000000000000[binaries] c = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] cpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] objc = ['clang', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] objcpp = ['clang++', '-arch', 'arm64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk'] ar = 'ar' strip = 'strip' [built-in options] c_args = ['-miphoneos-version-min=11.0'] cpp_args = ['-miphoneos-version-min=11.0'] c_link_args = ['-miphoneos-version-min=11.0'] cpp_link_args = ['-miphoneos-version-min=11.0'] objc_args = ['-miphoneos-version-min=11.0'] objcpp_args = ['-miphoneos-version-min=11.0'] [properties] root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer' needs_exe_wrapper = true [host_machine] system = 'darwin' subsystem = 'ios' kernel = 'xnu' cpu_family = 'aarch64' cpu = 'aarch64' endian = 'little' 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/armv7-w64-mingw32.meson000066400000000000000000000005151517466257200276150ustar00rootroot00000000000000[binaries] c = 'armv7-w64-mingw32-clang' cpp = 'armv7-w64-mingw32-clang++' ar = 'armv7-w64-mingw32-ar' strip = 'armv7-w64-mingw32-strip' pkgconfig = 'pkg-config' windres = 'armv7-w64-mingw32-windres' [properties] c_link_args = ['-static-libgcc'] [host_machine] system = 'windows' cpu_family = 'arm' cpu = 'armv7' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/i686-linux32.meson000066400000000000000000000003241517466257200266530ustar00rootroot00000000000000[binaries] c = 'gcc' cpp = 'g++' ar = 'ar' strip = 'strip' [properties] c_link_args = ['-m32', '-Wl,-z,text'] c_args = ['-m32'] [host_machine] system = 'linux' cpu_family = 'x86' cpu = 'i686' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/i686-w64-mingw32.meson000066400000000000000000000004751517466257200272620ustar00rootroot00000000000000[binaries] c = 'i686-w64-mingw32-gcc' cpp = 'i686-w64-mingw32-g++' ar = 'i686-w64-mingw32-ar' strip = 'i686-w64-mingw32-strip' windres = 'i686-w64-mingw32-windres' exe_wrapper = 'wine' [properties] c_link_args = ['-static-libgcc'] [host_machine] system = 'windows' cpu_family = 'x86' cpu = 'i686' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/loongarch64-linux.meson000066400000000000000000000005171517466257200301440ustar00rootroot00000000000000[binaries] c = 'loongarch64-unknown-linux-gnu-gcc' cpp = 'loongarch64-unknown-linux-gnu-c++' ar = 'loongarch64-unknown-linux-gnu-ar' strip = 'loongarch64-unknown-linux-gnu-strip' pkgconfig = 'pkg-config' exe_wrapper = 'qemu-loongarch64' [host_machine] system = 'linux' cpu_family = 'loongarch64' cpu = 'loongarch64' endian = 'little' 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/riscv64-linux-clang.meson000066400000000000000000000004711517466257200303770ustar00rootroot00000000000000[binaries] c = 'clang' cpp = 'clang++' ar = 'riscv64-linux-gnu-ar' strip = 'riscv64-linux-gnu-strip' exe_wrapper = 'qemu-riscv64' [properties] c_args = '-target riscv64-linux-gnu' c_link_args = '-target riscv64-linux-gnu' [host_machine] system = 'linux' cpu_family = 'riscv64' cpu = 'riscv64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/riscv64-linux.meson000066400000000000000000000003721517466257200273150ustar00rootroot00000000000000[binaries] c = 'riscv64-linux-gnu-gcc' cpp = 'riscv64-linux-gnu-g++' ar = 'riscv64-linux-gnu-ar' strip = 'riscv64-linux-gnu-strip' exe_wrapper = 'qemu-riscv64' [host_machine] system = 'linux' cpu_family = 'riscv64' cpu = 'riscv64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/wasm32.meson000066400000000000000000000003421517466257200257710ustar00rootroot00000000000000[binaries] c = 'emcc' cpp = 'em++' ar = 'emar' strip = 'emstrip' exe_wrapper = 'node' [properties] c_link_args = ['-sEXPORT_ALL=1'] [host_machine] system = 'emscripten' cpu_family = 'wasm32' cpu = 'wasm32' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/wasm64.meson000066400000000000000000000003421517466257200257760ustar00rootroot00000000000000[binaries] c = 'emcc' cpp = 'em++' ar = 'emar' strip = 'emstrip' exe_wrapper = 'node' [properties] c_link_args = ['-sEXPORT_ALL=1'] [host_machine] system = 'emscripten' cpu_family = 'wasm64' cpu = 'wasm64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/x86-android.meson000066400000000000000000000004121517466257200267160ustar00rootroot00000000000000[binaries] c = 'i686-linux-android19-clang' cpp = 'i686-linux-android19-clang++' ar = 'llvm-ar' strip = 'llvm-strip' pkgconfig = 'pkg-config' 
[properties] needs_exe_wrapper = true [host_machine] system = 'android' cpu_family = 'x86' cpu = 'i686' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/x86_64-android.meson000066400000000000000000000004231517466257200272310ustar00rootroot00000000000000[binaries] c = 'x86_64-linux-android21-clang' cpp = 'x86_64-linux-android21-clang++' ar = 'llvm-ar' strip = 'llvm-strip' pkgconfig = 'pkg-config' [properties] needs_exe_wrapper = true [host_machine] system = 'android' cpu_family = 'x86_64' cpu = 'x86_64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/x86_64-iPhoneSimulator.meson000066400000000000000000000023471517466257200307420ustar00rootroot00000000000000[binaries] c = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk'] cpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk'] objc = ['clang', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk'] objcpp = ['clang++', '-arch', 'x86_64', '-isysroot', '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk'] ar = 'ar' strip = 'strip' [built-in options] c_args = ['-miphoneos-version-min=11.0'] cpp_args = ['-miphoneos-version-min=11.0'] c_link_args = ['-miphoneos-version-min=11.0'] cpp_link_args = ['-miphoneos-version-min=11.0'] objc_args = ['-miphoneos-version-min=11.0'] objcpp_args = ['-miphoneos-version-min=11.0'] [properties] root = '/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer' needs_exe_wrapper = true [host_machine] system = 'darwin' subsystem = 'ios-simulator' kernel = 'xnu' cpu_family = 'x86_64' cpu = 'x86_64' endian = 'little' 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/crossfiles/x86_64-w64-mingw32.meson000066400000000000000000000005141517466257200275160ustar00rootroot00000000000000[binaries] c = 'x86_64-w64-mingw32-gcc' cpp = 'x86_64-w64-mingw32-g++' ar = 'x86_64-w64-mingw32-ar' strip = 'x86_64-w64-mingw32-strip' windres = 'x86_64-w64-mingw32-windres' exe_wrapper = 'wine' [properties] c_link_args = ['-static-libgcc'] [host_machine] system = 'windows' cpu_family = 'x86_64' cpu = 'x86_64' endian = 'little' dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/snap/000077500000000000000000000000001517466257200224005ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/package/snap/snapcraft.yaml000066400000000000000000000007771517466257200252600ustar00rootroot00000000000000name: dav2d base: core18 version: git version-script: git describe HEAD --always summary: AV2 decoder from VideoLAN description: | A small and fast AV2 decoder from the people who brought you VLC. grade: stable confinement: strict # use 'strict' once you have the right plugs and slots apps: dav2d: command: usr/bin/dav2d plugs: [ 'home' ] parts: dav2d: plugin: meson source: ../../ build-packages: [ 'nasm' ] meson-parameters: - --prefix=/usr - --buildtype=release dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patches/000077500000000000000000000000001517466257200214735ustar00rootroot000000000000000001-OBU-header-sequence-header-debugging.patch000066400000000000000000000464421517466257200316650ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patchesFrom b7f448237df262a6822ca32332b115cf645001ac Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 23 Jan 2026 07:46:22 -0500 Subject: [PATCH 1/5] OBU header & sequence header debugging. 
--- av2/decoder/decodeframe.c | 172 ++++++++++++++++++++++++++++++++++++++ av2/decoder/decoder.h | 2 + av2/decoder/obu.c | 48 ++++++++++- avm/debug.h | 34 ++++++++ 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 avm/debug.h diff --git a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index 13db610b38..4910c427d5 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -5801,6 +5801,10 @@ static AVM_INLINE void read_bitdepth( avm_internal_error(error_info, AVM_CODEC_UNSUP_BITSTREAM, "Unsupported profile/bit-depth combination"); } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-bitdepth[%d]: off=%u\n", + seq_params->bit_depth, rb->bit_offset); +#endif } static void setup_film_grain(AV2Decoder *pbi, struct avm_read_bit_buffer *rb) { AV2_COMMON *const cm = &pbi->common; @@ -5881,6 +5885,10 @@ void av2_read_chroma_format_bitdepth( struct avm_internal_error_info *error_info) { const uint32_t seq_chroma_format_idc = avm_rb_read_uvlc(rb); set_seq_chroma_format(seq_chroma_format_idc, seq_params, error_info); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-layout[%d]: off=%u\n", + seq_chroma_format_idc, rb->bit_offset); +#endif read_bitdepth(rb, seq_params, error_info); @@ -5932,6 +5940,15 @@ void av2_read_conformance_window(struct avm_read_bit_buffer *rb, conf->conf_win_top_offset = 0; conf->conf_win_bottom_offset = 0; } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-cropwindow[%d,l:%d,r:%d,t:%d,b:%d]: off=%u\n", + conf->conf_win_enabled_flag, + conf->conf_win_left_offset, + conf->conf_win_right_offset, + conf->conf_win_top_offset, + conf->conf_win_bottom_offset, + rb->bit_offset); +#endif } void read_tile_syntax_info(TileInfoSyntax *tile_params, @@ -6052,6 +6069,12 @@ static void read_seg_syntax_info(struct SegmentationInfoSyntax *seg_params, void read_sequence_partition_group_tool_flags(struct SequenceHeader *seq_params, struct avm_read_bit_buffer *rb) { setup_seq_sb_size(seq_params, rb); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-sbsz[%dx%d]: 
off=%u\n", + seq_params->mib_size * 4, + seq_params->mib_size * 4, + rb->bit_offset); +#endif seq_params->enable_sdp = seq_params->monochrome ? 0 : avm_rb_read_bit(rb); seq_params->enable_extended_sdp = (seq_params->enable_sdp && !seq_params->single_picture_header_flag) @@ -6066,6 +6089,16 @@ void read_sequence_partition_group_tool_flags(struct SequenceHeader *seq_params, if (avm_rb_read_bit(rb)) { seq_params->max_pb_aspect_ratio_log2_m1 = avm_rb_read_bit(rb); } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-partition[sdp:%d,extsdp:%d,extpart:%d,uneven4way:%d," + "maxpbaspectratio:%d]: off=%u\n", + seq_params->enable_sdp, + seq_params->enable_extended_sdp, + seq_params->enable_ext_partitions, + seq_params->enable_uneven_4way_partitions, + seq_params->max_pb_aspect_ratio_log2_m1 + 1, + rb->bit_offset); +#endif } void read_sequence_intra_group_tool_flags(struct SequenceHeader *seq_params, @@ -6078,6 +6111,18 @@ void read_sequence_intra_group_tool_flags(struct SequenceHeader *seq_params, seq_params->monochrome ? 
0 : avm_rb_read_literal(rb, 2); seq_params->enable_mhccp = avm_rb_read_bit(rb); seq_params->enable_ibp = avm_rb_read_bit(rb); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-intratools[dip:%d,edgefilter:%d,mrl:%d,cfl:%d," + "cfldsfilter:%d,mhccp:%d,ibp:%d]: off=%u\n", + seq_params->enable_intra_dip, + seq_params->enable_intra_edge_filter, + seq_params->enable_mrls, + seq_params->enable_cfl_intra, + seq_params->cfl_ds_filter_index, + seq_params->enable_mhccp, + seq_params->enable_ibp, + rb->bit_offset); +#endif } void read_sequence_inter_group_tool_flags(struct SequenceHeader *seq_params, struct avm_read_bit_buffer *rb) { @@ -6116,6 +6161,18 @@ void read_sequence_inter_group_tool_flags(struct SequenceHeader *seq_params, seq_params->order_hint_info.order_hint_bits_minus_1 = avm_rb_read_literal(rb, 4); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-interframetools[mm:%x,fmm:%d,6pwarp:%d," + "maskcomp:%d,refmvs:%d,redrefmvs:%d,pocbits:%d]: off=%u\n", + seq_params->seq_enabled_motion_modes, + seq_params->seq_frame_motion_modes_present_flag, + seq_params->enable_six_param_warp_delta, + seq_params->enable_masked_compound, + seq_params->order_hint_info.enable_ref_frame_mvs, + seq_params->order_hint_info.reduced_ref_frame_mvs_mode, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1, + rb->bit_offset); +#endif } seq_params->enable_refmvbank = avm_rb_read_bit(rb); if (avm_rb_read_bit(rb)) { @@ -6155,6 +6212,22 @@ void read_sequence_inter_group_tool_flags(struct SequenceHeader *seq_params, seq_params->num_same_ref_compound = seq_params->single_picture_header_flag ? 
0 : avm_rb_read_literal(rb, 2); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-refs[bank:%d,drlreorder:%d,explrefmap:%d,nrefs:%d," + "nbitsltfid:%d,drlbits:%d,fdrlbits:%d,bvpdrlbits:%d,fbvpdrlbits:%d," + "numsamerefcomp:%d]: off=%u\n", + seq_params->enable_refmvbank, + seq_params->enable_drl_reorder, + seq_params->enable_explicit_ref_frame_map, + seq_params->ref_frames, + seq_params->number_of_bits_for_lt_frame_id, + seq_params->def_max_drl_bits, + seq_params->allow_frame_max_drl_bits, + seq_params->def_max_bvp_drl_bits, + seq_params->allow_frame_max_bvp_drl_bits, + seq_params->num_same_ref_compound, + rb->bit_offset); +#endif uint8_t enable_tip = seq_params->single_picture_header_flag ? 0 : avm_rb_read_bit(rb); @@ -6182,6 +6255,19 @@ void read_sequence_inter_group_tool_flags(struct SequenceHeader *seq_params, } else { seq_params->enable_tip_explicit_qp = 0; } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-intertools1[tip:%d,tipholefill:%d,mvtraj:%d,bawp:%d," + "cwp:%d,impmskbld:%d,lfsubpu:%d,tipqp:%d]: off=%u\n", + seq_params->enable_tip, + seq_params->enable_tip_hole_fill, + seq_params->enable_mv_traj, + seq_params->enable_bawp, + seq_params->enable_cwp, + seq_params->enable_imp_msk_bld, + seq_params->enable_lf_sub_pu, + seq_params->enable_tip_explicit_qp, + rb->bit_offset); +#endif if (seq_params->single_picture_header_flag) { seq_params->enable_opfl_refine = AVM_OPFL_REFINE_NONE; seq_params->enable_refinemv = 0; @@ -6218,6 +6304,21 @@ void read_sequence_inter_group_tool_flags(struct SequenceHeader *seq_params, } else { seq_params->enable_short_refresh_frame_flags = avm_rb_read_bit(rb); } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-intertools2[opflrefine:%d,refinemv:%d,tiprefinemv:%d," + "bru:%d,adaptivemvd:%d,mvdsignderive:%d,flexmvres:%d,gmv:%d," + "shortrefeshmsk:%d]: off=%u\n", + seq_params->enable_opfl_refine, + seq_params->enable_refinemv, + seq_params->enable_tip_refinemv, + seq_params->enable_bru, + seq_params->enable_adaptive_mvd, + 
seq_params->enable_mvd_sign_derive, + seq_params->enable_flex_mvres, + seq_params->enable_global_motion, + seq_params->enable_short_refresh_frame_flags, + rb->bit_offset); +#endif } void read_sequence_scc_group_tool_flags(struct SequenceHeader *seq_params, @@ -6242,6 +6343,12 @@ void read_sequence_scc_group_tool_flags(struct SequenceHeader *seq_params, } else { seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-screentools[scc:%d,forceintmv:%d]: off=%u\n", + seq_params->force_screen_content_tools, + seq_params->force_integer_mv, + rb->bit_offset); +#endif } } @@ -6292,6 +6399,20 @@ void read_sequence_filter_group_tool_flags(struct SequenceHeader *seq_params, } } seq_params->df_par_bits_minus2 = avm_rb_read_literal(rb, 2); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-inloopfilters[disablelfacrosstiles:%d,cdef:%d,gdf:%d," + "rst:%d,lrdisablemsk:%x,%x,cccso:%d,cdefonskiptxfm:%d,dfparbits:%d]: off=%u\n", + seq_params->disable_loopfilters_across_tiles, + seq_params->enable_cdef, + seq_params->enable_gdf, + seq_params->enable_restoration, + seq_params->lr_tools_disable_mask[0], + seq_params->lr_tools_disable_mask[1], + seq_params->enable_ccso, + seq_params->enable_cdef_on_skip_txfm, + seq_params->df_par_bits_minus2 + 2, + rb->bit_offset); +#endif } void read_sequence_transform_quant_entropy_group_tool_flags( @@ -6310,6 +6431,19 @@ void read_sequence_transform_quant_entropy_group_tool_flags( seq_params->single_picture_header_flag ? 0 : avm_rb_read_bit(rb); seq_params->reduced_tx_part_set = avm_rb_read_bit(rb); seq_params->enable_cctx = seq_params->monochrome ? 
0 : avm_rb_read_bit(rb); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-txgrptools[fsc:%d,idtxintra:%d,ist:%d,interist:%d," + "chromadctonly:%d,interddt:%d,reducedtxtpset:%d,cctx:%d]: off=%u\n", + seq_params->enable_fsc, + seq_params->enable_idtx_intra, + seq_params->enable_ist, + seq_params->enable_inter_ist, + seq_params->enable_chroma_dctonly, + seq_params->enable_inter_ddt, + seq_params->reduced_tx_part_set, + seq_params->enable_cctx, + rb->bit_offset); +#endif seq_params->enable_tcq = 0; int enable_tcq = avm_rb_read_bit(rb); if (enable_tcq) { @@ -6324,6 +6458,12 @@ void read_sequence_transform_quant_entropy_group_tool_flags( } else { seq_params->enable_parity_hiding = 0; } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-coef[tcq:%d,parityhiding:%d]: off=%u\n", + seq_params->enable_tcq, + seq_params->enable_parity_hiding, + rb->bit_offset); +#endif if (seq_params->single_picture_header_flag) { // Setting both enable_avg_cdf and avg_cdf_type to 1 allows us to omit the @@ -6337,6 +6477,12 @@ void read_sequence_transform_quant_entropy_group_tool_flags( seq_params->avg_cdf_type = avm_rb_read_bit(rb); } } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-cdfbits[avgcdf:%d,cdftype:%d]: off=%u\n", + seq_params->enable_avg_cdf, + seq_params->avg_cdf_type, + rb->bit_offset); +#endif const int is_monochrome = seq_params->monochrome; if (is_monochrome) { seq_params->separate_uv_delta_q = 0; @@ -6372,6 +6518,19 @@ void read_sequence_transform_quant_entropy_group_tool_flags( seq_params->uv_dc_delta_q_enabled = 0; seq_params->uv_ac_delta_q_enabled = 0; } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-quantflags[sepuvdq:%d,aceqdc:%d,ydcdq:%d," + "fydcdq:%d,uvdcdq:%d,fuvdcdq:%d,uvacdq:%d,fuvacdq:%d]: off=%u\n", + seq_params->separate_uv_delta_q, + seq_params->equal_ac_dc_q, + seq_params->base_y_dc_delta_q, + seq_params->y_dc_delta_q_enabled, + seq_params->base_uv_dc_delta_q, + seq_params->uv_dc_delta_q_enabled, + seq_params->base_uv_ac_delta_q, + seq_params->uv_ac_delta_q_enabled, + rb->bit_offset); 
+#endif } void read_sequence_segment_tool_flags(struct SequenceHeader *seq_params, @@ -6382,6 +6541,12 @@ void read_sequence_segment_tool_flags(struct SequenceHeader *seq_params, seq_params->seg_params.enable_ext_seg = seq_params->enable_ext_seg; read_seg_syntax_info(&seq_params->seg_params, rb); } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-segmentation[extseg:%d,seginfo:%d]: off=%u\n", + seq_params->enable_ext_seg, + seq_params->seq_seg_info_present_flag, + rb->bit_offset); +#endif } void av2_read_sequence_header(struct avm_read_bit_buffer *rb, @@ -6396,6 +6561,13 @@ void av2_read_sequence_header(struct avm_read_bit_buffer *rb, read_sequence_transform_quant_entropy_group_tool_flags(seq_params, rb); read_sequence_filter_group_tool_flags(seq_params, rb); read_sequence_tile_config(seq_params, rb); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-tileinfo[%d,%dx%d]: off=%u\n", + seq_params->seq_tile_info_present_flag, + seq_params->tile_params.tile_info.cols, + seq_params->tile_params.tile_info.rows, + rb->bit_offset); +#endif } static AVM_INLINE void read_multi_frame_header_seg_info( diff --git a/av2/decoder/decoder.h b/av2/decoder/decoder.h index f102a0ce36..bdcdbbe880 100644 --- a/av2/decoder/decoder.h +++ b/av2/decoder/decoder.h @@ -33,6 +33,8 @@ #include "av2/decoder/annexF.h" #include "av2/common/banding_metadata.h" +#include "avm/debug.h" + #ifdef __cplusplus extern "C" { #endif diff --git a/av2/decoder/obu.c b/av2/decoder/obu.c index 2e56040f2b..0e9f76bcda 100644 --- a/av2/decoder/obu.c +++ b/av2/decoder/obu.c @@ -561,6 +561,12 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, seq_params->seq_tier = avm_rb_read_bit(rb); else seq_params->seq_tier = 0; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-profile_stillpic_level_tier[profile:%d,reducedhdr:%d," + "level:%d,tier:%d]: off=%u\n", + seq_params->seq_profile_idc, seq_params->single_picture_header_flag, + seq_params->seq_max_level_idx, seq_params->seq_tier, rb->bit_offset); +#endif 
av2_read_chroma_format_bitdepth(rb, seq_params, &cm->error); if (seq_params->single_picture_header_flag) { seq_params->seq_lcr_id = LCR_ID_UNSPECIFIED; @@ -594,12 +600,26 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, } seq_params->monotonic_output_order_flag = avm_rb_read_bit(rb); } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-seqlcrid_stillpic_maxtmlayerid[lcrid:%d,stillpic:%d," + "maxtlayerid:%d,maxmlayerid:%d,monotonic:%d]: off=%u\n", + seq_params->seq_lcr_id, seq_params->still_picture, + seq_params->max_tlayer_id, seq_params->max_mlayer_id, + seq_params->monotonic_output_order_flag, rb->bit_offset); +#endif const int num_bits_width = avm_rb_read_literal(rb, 4) + 1; const int num_bits_height = avm_rb_read_literal(rb, 4) + 1; const int max_frame_width = avm_rb_read_literal(rb, num_bits_width) + 1; const int max_frame_height = avm_rb_read_literal(rb, num_bits_height) + 1; +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-size[bits:%dx%d,max:%dx%d]: off=%u\n", + num_bits_width, num_bits_height, + max_frame_width, max_frame_height, + rb->bit_offset); +#endif + seq_params->num_bits_width = num_bits_width; seq_params->num_bits_height = num_bits_height; seq_params->max_frame_width = max_frame_width; @@ -638,6 +658,12 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, seq_params->seq_max_encoder_buffer_delay = 20000; seq_params->seq_max_low_delay_mode_flag = 0; } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-decodermodel[maxdisplaymodel:%d,decodermodel:%d]: off=%u\n", + seq_params->seq_max_display_model_info_present_flag, + seq_params->decoder_model_info_present_flag, + rb->bit_offset); +#endif // Configurable profile does not define bitrate and buffer size constraints if (seq_params->seq_profile_idc != CONFIGURABLE) { int64_t seq_bitrate = av2_max_level_bitrate( @@ -670,6 +696,11 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, av2_read_mlayer_dependency_info(seq_params, rb); } } +#if DEBUG_SEQ_HDR + 
printf("SEQHDR: post-layerdesc[%d]: off=%u\n", + seq_params->mlayer_dependency_present_flag, + rb->bit_offset); +#endif // tlayer dependency description seq_params->tlayer_dependency_present_flag = 0; @@ -694,13 +725,19 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, av2_read_sequence_header(rb, seq_params); seq_params->film_grain_params_present = avm_rb_read_bit(rb); +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-filmgrain[%d]: off=%u\n", + seq_params->film_grain_params_present, + rb->bit_offset); +#endif size_t bits_before_ext = rb->bit_offset - saved_bit_offset; seq_params->seq_extension_present_flag = avm_rb_read_bit(rb); + int extension_bits = 0; if (seq_params->seq_extension_present_flag) { // Extension data bits = total - bits_read_before_extension -1 (ext flag) - // trailing bits - int extension_bits = read_obu_extension_bits( + extension_bits = read_obu_extension_bits( rb->bit_buffer, rb->bit_buffer_end - rb->bit_buffer, bits_before_ext, &cm->error); if (extension_bits > 0) { @@ -710,6 +747,11 @@ static uint32_t read_sequence_header_obu(AV2Decoder *pbi, int xlayer_id, // No extension data is present } } +#if DEBUG_SEQ_HDR + printf("SEQHDR: post-extension[%d,nbits=%d]: off=%u\n", + seq_params->seq_extension_present_flag, extension_bits, + rb->bit_offset); +#endif if (av2_check_trailing_bits(pbi, rb) != 0) { // cm->error.error_code is already set. @@ -2629,6 +2671,10 @@ int avm_decode_frame_from_obus(struct AV2Decoder *pbi, const uint8_t *data, } else { cm->bridge_frame_info.is_bridge_frame = 0; } +#if DEBUG_OBU_HDR + printf("OBU type=%d size=%ld\n", + obu_header.type, payload_size); +#endif if (is_single_tile_vcl_obu(obu_header.type) || is_multi_tile_vcl_obu(obu_header.type)) { diff --git a/avm/debug.h b/avm/debug.h new file mode 100644 index 0000000000..36030e147d --- /dev/null +++ b/avm/debug.h @@ -0,0 +1,34 @@ +/* + * Copyright © 2018, VideoLAN and dav2d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef AVM_DEBUG_H +#define AVM_DEBUG_H + +#define DEBUG_OBU_HDR 0 +#define DEBUG_SEQ_HDR 0 + +#endif /* AVM_DEBUG_H */ -- 2.48.1 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patches/0002-frame-header-debugging.patch000066400000000000000000000645071517466257200273600ustar00rootroot00000000000000From b0ea5b44a5e369559cdf83b789642ce00905ca13 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 23 Jan 2026 09:07:44 -0500 Subject: [PATCH 2/5] Frame header debugging. 
--- av2/decoder/decodeframe.c | 212 +++++++++++++++++++++++++++++++++++++- av2/decoder/obu_ci.c | 43 +++++++- av2/decoder/obu_fgm.c | 44 +++++++- avm/debug.h | 1 + 4 files changed, 294 insertions(+), 6 deletions(-) diff --git a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index 4910c427d5..7cd0394751 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -3609,6 +3609,11 @@ static AVM_INLINE void setup_qm_params(AV2Decoder *pbi, quant_params->qm_v[i] = 0; } } +#if DEBUG_FRAME_HDR + printf("HDR: post-qm[%d]: off=%u\n", + quant_params->using_qmatrix, + rb->bit_offset); +#endif } // Build y/uv dequant values based on segmentation. static AVM_INLINE void setup_segmentation_dequant(AV2Decoder *const pbi, @@ -4021,6 +4026,10 @@ static AVM_INLINE void setup_frame_size(AV2_COMMON *cm, realloc_bru_info(cm); av2_validate_frame_level_conformance(&cm->seq_params, width, height, &cm->error); +#if DEBUG_FRAME_HDR + printf("HDR: post-framesize[%dx%d]: off=%u\n", + cm->width, cm->height, rb->bit_offset); +#endif } static AVM_INLINE void setup_seq_sb_size(SequenceHeader *seq_params, @@ -4122,6 +4131,10 @@ static AVM_INLINE void setup_frame_size_with_refs( realloc_bru_info(cm); av2_validate_frame_level_conformance(&cm->seq_params, width, height, &cm->error); +#if DEBUG_FRAME_HDR + printf("HDR: post-framesize[%dx%d]: off=%u\n", + cm->width, cm->height, rb->bit_offset); +#endif } // Reconstructs the tile information @@ -4264,6 +4277,7 @@ static AVM_INLINE void read_tile_info(AV2Decoder *const pbi, } } pbi->context_update_tile_id = 0; + pbi->tile_size_bytes = 0; if (cm->tiles.rows * cm->tiles.cols > 1 && cm->features.tip_frame_mode != TIP_FRAME_AS_OUTPUT) { if (!cm->seq_params.enable_avg_cdf || !cm->seq_params.avg_cdf_type) { @@ -4278,6 +4292,11 @@ static AVM_INLINE void read_tile_info(AV2Decoder *const pbi, // tile size magnitude pbi->tile_size_bytes = avm_rb_read_literal(rb, 2) + 1; } +#if DEBUG_FRAME_HDR + printf("HDR: post-tiling[%dx%dtiles,%dbytes]: 
off=%u\n", + pbi->common.tiles.cols, pbi->common.tiles.rows, pbi->tile_size_bytes, + rb->bit_offset); +#endif } static size_t mem_get_varsize(const uint8_t *src, int sz) { @@ -5859,6 +5878,11 @@ static void setup_film_grain(AV2Decoder *pbi, struct avm_read_bit_buffer *rb) { } copy_fgm_from_list(cm, pars, &pbi->fgm_list[cm->fgm_id]); } +#if DEBUG_FRAME_HDR + printf("HDR: post-filmgrain[%d]: off=%u\n", + cm->film_grain_params.apply_grain, + rb->bit_offset); +#endif } cm->cur_frame->film_grain_params = cm->film_grain_params; } @@ -6633,7 +6657,11 @@ static void read_global_motion_params(WarpedMotionParams *params, const struct scale_factors *sf, - MvSubpelPrecision precision) { + MvSubpelPrecision precision +#if DEBUG_FRAME_HDR + , const int idx +#endif + ) { const int precision_loss = get_gm_precision_loss(precision); (void)precision_loss; TransformationType type = avm_rb_read_bit(rb); @@ -6692,6 +6720,14 @@ static void read_global_motion_params(WarpedMotionParams *params, trans_dec_factor; } + if (params->wmtype > IDENTITY) { +#if DEBUG_FRAME_HDR + printf("HDR: post-gmv[%d]matrix[%d,%d|%d,%d,%d,%d,t=%d]: off=%u\n", idx, + params->wmmat[0], params->wmmat[1], params->wmmat[2], + params->wmmat[3], params->wmmat[4], params->wmmat[5], + type, rb->bit_offset); +#endif + } if (params->wmtype <= AFFINE) { av2_reduce_warp_model(params); av2_get_shear_params(params @@ -6781,7 +6817,11 @@ static AVM_INLINE void read_global_motion(AV2_COMMON *cm, get_ref_scale_factors_const(cm, frame), - cm->features.fr_mv_precision); + cm->features.fr_mv_precision +#if DEBUG_FRAME_HDR + , frame +#endif + ); // TODO(sarahparker, debargha): The logic in the commented out code below // does not work currently and causes mismatches when resize is on. 
Fix it @@ -7043,6 +7083,14 @@ static INLINE void read_intrabc_params(AV2_COMMON *const cm, } read_frame_max_bvp_drl_bits(cm, rb); } +#if DEBUG_FRAME_HDR + printf("HDR: post-ibc[intrabc:%d,global:%d,local:%d,drlbits:%d]: off=%u\n", + features->allow_intrabc, + features->allow_intrabc ? features->allow_global_intrabc : 0, + features->allow_intrabc ? features->allow_local_intrabc : 0, + features->allow_intrabc ? features->max_bvp_drl_bits : 0, + rb->bit_offset); +#endif } static INLINE void read_screen_content_params(AV2_COMMON *const cm, struct avm_read_bit_buffer *rb) { @@ -7065,6 +7113,12 @@ static INLINE void read_screen_content_params(AV2_COMMON *const cm, } else { features->cur_frame_force_integer_mv = 0; } +#if DEBUG_FRAME_HDR + printf("HDR: post-screencontent[sctools:%d,forceintmv:%d]: off=%u\n", + features->allow_screen_content_tools, + features->cur_frame_force_integer_mv, + rb->bit_offset); +#endif } static void set_primary_ref_frame_and_ctx(AV2Decoder *pbi) { @@ -7424,6 +7478,10 @@ static int read_show_existing_frame(AV2Decoder *pbi, bool is_regular_obu, // assign_frame_buffer_p()! 
assert(IMPLIES(cm->derive_sef_order_hint, !cm->cur_frame->raw_frame_buffer.data)); +#if DEBUG_FRAME_HDR + printf("HDR: post-existing_frame_idx[%d,poc=%d]: off=%u\n", + existing_frame_idx, current_frame->order_hint, rb->bit_offset); +#endif FrameHash raw_frame_hash = cm->cur_frame->raw_frame_hash; FrameHash grain_frame_hash = cm->cur_frame->grain_frame_hash; @@ -8154,6 +8212,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, int seq_header_id_for_frame_header = setup_sequence_header_id(cm, rb); assert(seq_header_id_for_frame_header >= 0); +#if DEBUG_FRAME_HDR + printf("HDR: post-ids[f:%d,s:%d]: off=%u\n", + cm->cur_mfh_id, cm->mfh_params[cm->cur_mfh_id].mfh_seq_header_id, + rb->bit_offset); +#endif handle_sequence_header(pbi, obu_type, obu_xlayer_id, seq_header_id_for_frame_header); @@ -8338,6 +8401,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, cm->cur_frame->immediate_output_picture = cm->immediate_output_picture; cm->cur_frame->implicit_output_picture = cm->implicit_output_picture; cm->cur_frame->frame_output_done = 0; +#if DEBUG_FRAME_HDR + printf("HDR: post-frametype_bits[type:%d,ltrid:%d,show:%d|%d]: off=%u\n", + current_frame->frame_type, current_frame->long_term_id, + cm->immediate_output_picture, cm->implicit_output_picture, rb->bit_offset); +#endif } av2_set_frame_sb_size(cm, cm->seq_params.sb_size); @@ -8457,6 +8525,12 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, } } } +#if DEBUG_FRAME_HDR + printf("HDR: post-frame_size_override_flag[%d,poc:%d,p_ref:%d|%d]: off=%u\n", + frame_size_override_flag, current_frame->order_hint, + pbi->signal_primary_ref_frame, features->primary_ref_frame, + rb->bit_offset); +#endif } if (obu_type == OBU_CLOSED_LOOP_KEY || obu_type == OBU_OPEN_LOOP_KEY) { @@ -8581,6 +8655,10 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, } } } +#if DEBUG_FRAME_HDR + printf("HDR: post-refresh_frame_flags[%x]: off=%u\n", + 
current_frame->refresh_frame_flags, rb->bit_offset); +#endif if (cm->immediate_output_picture == 0 && current_frame->refresh_frame_flags == 0) { @@ -8760,6 +8838,14 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, cm->remapped_ref_idx[i] = ref; } } +#if DEBUG_FRAME_HDR + printf("HDR: post-refs[explicit:%d,refs:%d,%d,%d,%d,%d,%d,%d]: off=%u\n", + explicit_ref_frame_map, + cm->remapped_ref_idx[0], cm->remapped_ref_idx[1], + cm->remapped_ref_idx[2], cm->remapped_ref_idx[3], + cm->remapped_ref_idx[4], cm->remapped_ref_idx[5], + cm->remapped_ref_idx[6], rb->bit_offset); +#endif if (!frame_is_sframe(cm) && frame_size_override_flag && !cm->bridge_frame_info.is_bridge_frame) { setup_frame_size_with_refs(cm, explicit_ref_frame_map, rb); @@ -8771,6 +8857,13 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, if (!explicit_ref_frame_map) { av2_get_ref_frames(cm, current_frame->display_order_hint, 1, 0, cm->ref_frame_map_pairs); +#if DEBUG_FRAME_HDR + printf("HDR: post-refs2[refs:%d,%d,%d,%d,%d,%d,%d]: off=%u\n", + cm->remapped_ref_idx[0], cm->remapped_ref_idx[1], + cm->remapped_ref_idx[2], cm->remapped_ref_idx[3], + cm->remapped_ref_idx[4], cm->remapped_ref_idx[5], + cm->remapped_ref_idx[6], rb->bit_offset); +#endif // Note: The following if block implements bitstream constraint checks // for consistent reference frame mapping when (embedded or temporal) @@ -9023,6 +9116,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, cm->tmvp_sample_step = 1; cm->tmvp_sample_stepl2 = 0; } +#if DEBUG_FRAME_HDR + printf("HDR: post-refmvbits[%d,step:%d]: off=%u\n", + features->allow_ref_frame_mvs, cm->tmvp_sample_step, + rb->bit_offset); +#endif cm->tip_global_motion.as_int = 0; cm->tip_interp_filter = MULTITAP_SHARP; @@ -9081,6 +9179,16 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, !cm->bridge_frame_info.is_bridge_frame) read_frame_opfl_refine_type(cm, rb); } +#if DEBUG_FRAME_HDR + 
printf("HDR: post-refinemv-tip[opfl/refine:%d,tip:%d,holefill:%d,glbwt:%d," + "gmv:y=%d,x=%d,interpfilt:%d]: off=%u\n", + cm->features.opfl_refine_type, + features->tip_frame_mode, + features->tip_frame_mode ? features->allow_tip_hole_fill : 0, + cm->tip_global_wtd_index, + cm->tip_global_motion.as_mv.row, cm->tip_global_motion.as_mv.col, + cm->tip_interp_filter, rb->bit_offset); +#endif if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT && !cm->bridge_frame_info.is_bridge_frame && @@ -9126,6 +9234,12 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, features->enabled_motion_modes = cm->seq_params.seq_enabled_motion_modes; } +#if DEBUG_FRAME_HDR + printf("HDR: post-frametype-specific-bits[drlbits:%d,mvprec:%d,flt:%d,mm:%x]: off=%u\n", + features->max_drl_bits, features->fr_mv_precision - 3, + features->interp_filter, features->enabled_motion_modes, + rb->bit_offset); +#endif } } @@ -9303,6 +9417,10 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, } else { features->disable_cdf_update = 1; } +#if DEBUG_FRAME_HDR + printf("HDR: post-disable_cdf_update[%d]: off=%u\n", + features->disable_cdf_update, rb->bit_offset); +#endif read_tile_info(pbi, rb); if (!av2_is_min_tile_width_satisfied(cm)) { @@ -9315,6 +9433,16 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, cm->cur_frame->base_qindex = quant_params->base_qindex; cm->cur_frame->u_ac_delta_q = quant_params->u_ac_delta_q; cm->cur_frame->v_ac_delta_q = quant_params->v_ac_delta_q; +#if DEBUG_FRAME_HDR + printf("HDR: post-quant[yac:%d,deltas=ydc:%d,uac:%d/dc:%d,vac:%d/dc:%d]: off=%u\n", + quant_params->base_qindex, + quant_params->y_dc_delta_q, + quant_params->u_ac_delta_q, + quant_params->u_dc_delta_q, + quant_params->v_ac_delta_q, + quant_params->v_dc_delta_q, + rb->bit_offset); +#endif set_primary_ref_frame_and_ctx(pbi); @@ -9332,6 +9460,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, } setup_segmentation(cm, 
rb); +#if DEBUG_FRAME_HDR + printf("HDR: post-segmentation[%d]: off=%u\n", + cm->seg.enabled, + rb->bit_offset); +#endif setup_qm_params(pbi, quant_params, cm->seg.enabled, av2_num_planes(cm), rb); cm->delta_q_info.delta_q_res = 1; @@ -9341,6 +9474,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, xd->current_base_qindex = quant_params->base_qindex; cm->delta_q_info.delta_q_res = 1 << avm_rb_read_literal(rb, 2); } +#if DEBUG_FRAME_HDR + printf("HDR: post-delta_q[%d]: off=%u\n", + cm->delta_q_info.delta_q_present_flag, + rb->bit_offset); +#endif xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv; features->has_lossless_segment = 0; @@ -9401,6 +9539,11 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, features->allow_parity_hiding = false; else features->allow_parity_hiding = avm_rb_read_bit(rb); +#if DEBUG_FRAME_HDR + printf("HDR: post-tcq_parity[tcq:%d,par:%d]: off=%u\n", + features->tcq_mode, features->allow_parity_hiding, + rb->bit_offset); +#endif setup_segmentation_dequant(pbi, xd); @@ -9441,29 +9584,76 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, cm->cur_frame->base_qindex = cm->quant_params.base_qindex; cm->cur_frame->u_ac_delta_q = cm->quant_params.u_ac_delta_q; cm->cur_frame->v_ac_delta_q = cm->quant_params.v_ac_delta_q; +#if DEBUG_FRAME_HDR + printf("HDR: post-tip_quant[yac:%d,deltas=uac:%d,vac:%d]: off=%u\n", + cm->quant_params.base_qindex, + cm->quant_params.u_ac_delta_q, + cm->quant_params.v_ac_delta_q, + rb->bit_offset); +#endif } features->disable_cdf_update = 1; features->coded_lossless = 0; features->all_lossless = 0; } setup_loopfilter(cm, rb); +#if DEBUG_FRAME_HDR + if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT) + printf("HDR: post-deblock[lfsubpu:%d,y:%d|%d,u:%d,v:%d," + "dqy:%d|%d,dqu:%d,dqv:%d]: off=%u\n", + cm->features.allow_lf_sub_pu, + cm->lf.apply_deblocking_filter[0], cm->lf.apply_deblocking_filter[1], + 
cm->lf.apply_deblocking_filter_u, cm->lf.apply_deblocking_filter_v, + cm->lf.delta_q_luma[0], cm->lf.delta_q_luma[1], + cm->lf.delta_q_u, cm->lf.delta_q_v, rb->bit_offset); + else + printf("HDR: post-tip_deblock[lfsubpu:%d,apply:%d]: off=%u\n", + cm->features.allow_lf_sub_pu, cm->lf.apply_deblocking_filter_tip, + rb->bit_offset); +#endif if (!features->coded_lossless && seq_params->enable_gdf) { setup_gdf(cm, rb); +#if DEBUG_FRAME_HDR + if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT) + printf("HDR: post-gdf[%d]: off=%u\n", + cm->gdf_info.gdf_mode, + rb->bit_offset); +#endif } else { cm->gdf_info.gdf_mode = 0; } if (!features->coded_lossless && seq_params->enable_cdef) { setup_cdef(cm, rb); +#if DEBUG_FRAME_HDR + if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT) + printf("HDR: post-cdef[%d]: off=%u\n", + cm->cdef_info.cdef_frame_enable, + rb->bit_offset); +#endif } if (!features->all_lossless && seq_params->enable_restoration) { decode_restoration_mode(cm, rb); +#if DEBUG_FRAME_HDR + if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT) + printf("HDR: post-restoration[y:%d,u:%d,v:%d]: off=%u\n", + cm->rst_info[0].frame_restoration_type, + cm->rst_info[1].frame_restoration_type, + cm->rst_info[2].frame_restoration_type, + rb->bit_offset); +#endif } for (int plane = 0; plane < CCSO_NUM_COMPONENTS; plane++) { cm->ccso_info.ccso_enable[plane] = false; } if (!features->coded_lossless && seq_params->enable_ccso) { setup_ccso(cm, rb); +#if DEBUG_FRAME_HDR + if (features->tip_frame_mode != TIP_FRAME_AS_OUTPUT) + printf("HDR: post-ccso[%d]: off=%u\n", + cm->ccso_info.ccso_frame_flag, + rb->bit_offset); +#endif } if (features->tip_frame_mode == TIP_FRAME_AS_OUTPUT) { if (cm->seq_params.enable_lf_sub_pu && cm->features.allow_lf_sub_pu && @@ -9500,6 +9690,16 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, features->enable_imp_msk_bld = seq_params->enable_imp_msk_bld; features->reduced_tx_set_used = avm_rb_read_literal(rb, 2); +#if 
DEBUG_FRAME_HDR + printf("HDR: post-modebits[tx:%d,refmode:%d,skipmode:%d,bawp:%d,warp:%d,redtxset:%d]: off=%u\n", + features->tx_mode, + current_frame->reference_mode >> 1, + current_frame->skip_mode_info.skip_mode_flag, + features->enable_bawp, + features->allow_warpmv_mode, + features->reduced_tx_set_used, + rb->bit_offset); +#endif if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { avm_internal_error(&cm->error, AVM_CODEC_CORRUPT_FRAME, @@ -9511,7 +9711,13 @@ static int read_uncompressed_header(AV2Decoder *pbi, OBU_TYPE obu_type, "Frame wrongly requests TIP mode"); } - if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); + if (!frame_is_intra_only(cm)) { + read_global_motion(cm, rb); +#if DEBUG_FRAME_HDR + printf("HDR: post-gmv: off=%u\n", + rb->bit_offset); +#endif + } setup_film_grain(pbi, rb); features->enable_ext_seg = seq_params->enable_ext_seg; return 0; diff --git a/av2/decoder/obu_ci.c b/av2/decoder/obu_ci.c index ba2d076e06..5f9768cd53 100644 --- a/av2/decoder/obu_ci.c +++ b/av2/decoder/obu_ci.c @@ -25,6 +25,8 @@ #include "av2/decoder/obu.h" #include "av2/common/av2_common_int.h" +#define DEBUG_CI_HDR 0 + static int av2_set_sar_info(ContentInterpretation *ci_params) { int supported_sample_aspect_ratio = 1; switch (ci_params->sar_info.sar_aspect_ratio_idc) { @@ -159,8 +161,27 @@ uint32_t av2_read_content_interpretation_obu(struct AV2Decoder *pbi, ci_temp.ci_timing_info_present_flag = avm_rb_read_bit(rb); (void)avm_rb_read_literal(rb, 2); // ci_reserved_2bit +#if DEBUG_CI_HDR + printf("CI: post-flags[scan=%d,colordesc=%d,chrsamplepos=%d," + "aspectratio=%d,timinginfo=%d,extension=%d]: off=%u\n", + ci_temp.ci_scan_type_idc, ci_temp.ci_color_description_present_flag, + ci_temp.ci_chroma_sample_position_present_flag, + ci_temp.ci_aspect_ratio_info_present_flag, + ci_temp.ci_timing_info_present_flag, ci_temp.ci_extension_present_flag, + rb->bit_offset); +#endif + if (ci_temp.ci_color_description_present_flag) { 
read_ci_color_info(&ci_temp, rb); + +#if DEBUG_CI_HDR + printf("CI: post-colordesc[id=%d,pri=%d,trc=%d,mtrx=%d,rng=%d]: off=%u\n", + ci_temp.color_info.color_description_idc, + ci_temp.color_info.color_primaries, + ci_temp.color_info.matrix_coefficients, + ci_temp.color_info.transfer_characteristics, + ci_temp.color_info.full_range_flag, rb->bit_offset); +#endif } else { ci_temp.color_info.color_description_idc = AVM_COLOR_DESC_IDC_EXPLICIT; ci_temp.color_info.color_primaries = AVM_CICP_CP_UNSPECIFIED; @@ -176,6 +197,11 @@ uint32_t av2_read_content_interpretation_obu(struct AV2Decoder *pbi, else ci_temp.ci_chroma_sample_position[1] = ci_temp.ci_chroma_sample_position[0]; +#if DEBUG_CI_HDR + printf("CI: post-chromasampleposition[chr=%d/%d]: off=%u\n", + ci_temp.ci_chroma_sample_position[0], + ci_temp.ci_chroma_sample_position[1], rb->bit_offset); +#endif } else { ci_temp.ci_chroma_sample_position[0] = AVM_CSP_UNSPECIFIED; ci_temp.ci_chroma_sample_position[1] = AVM_CSP_UNSPECIFIED; @@ -187,10 +213,25 @@ uint32_t av2_read_content_interpretation_obu(struct AV2Decoder *pbi, avm_internal_error(&cm->error, AVM_CODEC_UNSUP_BITSTREAM, "Incorrect SAR values"); } +#if DEBUG_CI_HDR + printf("CI: post-sampleaspectratio[id=%d,sar=%d:%d]: off=%u\n", + ci_temp.sar_info.sar_aspect_ratio_idc, + ci_temp.sar_info.sar_width, + ci_temp.sar_info.sar_height, rb->bit_offset); +#endif } - if (ci_temp.ci_timing_info_present_flag) + if (ci_temp.ci_timing_info_present_flag) { av2_read_timing_info_header(&ci_temp.timing_info, &cm->error, rb); +#if DEBUG_CI_HDR + printf("CI: post-timinginfo[nuidt:%d,ts:%d,eei:%d,ntped:%d]: off=%u\n", + ci_temp.timing_info.num_units_in_display_tick, + ci_temp.timing_info.time_scale, + ci_temp.timing_info.equal_elemental_interval, + ci_temp.timing_info.num_ticks_per_elemental_duration, + rb->bit_offset); +#endif + } size_t bits_before_ext = rb->bit_offset - saved_bit_offset; ci_temp.ci_extension_present_flag = avm_rb_read_bit(rb); diff --git 
a/av2/decoder/obu_fgm.c b/av2/decoder/obu_fgm.c index 309d09dfeb..8ebf851f71 100644 --- a/av2/decoder/obu_fgm.c +++ b/av2/decoder/obu_fgm.c @@ -26,6 +26,8 @@ #include "av2/decoder/decodeframe.h" #include "av2/decoder/obu.h" +#define DEBUG_FGM_HDR 0 + void copy_fgm_from_list(AV2_COMMON *cm, avm_film_grain_t *pars, const struct film_grain_model *fgm) { const SequenceHeader *const seq_params = &cm->seq_params; @@ -68,7 +70,11 @@ void copy_fgm_from_list(AV2_COMMON *cm, avm_film_grain_t *pars, static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, struct avm_read_bit_buffer *rb, - struct avm_internal_error_info *error_info) { + struct avm_internal_error_info *error_info +#if DEBUG_FGM_HDR + , const int idx +#endif + ) { int monochrome = chroma_idc == CHROMA_FORMAT_400; int subsampling_x = chroma_idc == CHROMA_FORMAT_444 ? 0 : 1; int subsampling_y = @@ -120,6 +126,11 @@ static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, } fgm->fgm_scaling_points[c][i][1] = avm_rb_read_literal(rb, bitsScal); } +#if DEBUG_FGM_HDR + printf("FGM: post-scaling_points[id=%d,pl=%d,cnt=%d,bits=%d|%d]: off=%u\n", + idx, c, fgm->fgm_points[c], bitsIncr, bitsScal, + rb->bit_offset); +#endif } } @@ -151,6 +162,10 @@ static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, int midPointY = 1 << (BitsArY - 1); for (int i = 0; i < num_pos_luma; i++) fgm->ar_coeffs_y[i] = avm_rb_read_literal(rb, BitsArY) - midPointY; +#if DEBUG_FGM_HDR + printf("FGM: post-ar_coefs[id=%d,pl=%d,cnt=%d->%d,bits=%d]: off=%u\n", + idx, 0, fgm->ar_coeff_lag, num_pos_luma, BitsArY, rb->bit_offset); +#endif } if (fgm->fgm_points[1] || fgm->fgm_scale_from_channel0_flag) { @@ -159,6 +174,10 @@ static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, int midPointCb = 1 << (BitsArCb - 1); for (int i = 0; i < num_pos_chroma; i++) fgm->ar_coeffs_cb[i] = avm_rb_read_literal(rb, BitsArCb) - midPointCb; +#if DEBUG_FGM_HDR + printf("FGM: 
post-ar_coefs[id=%d,pl=%d,cnt=%d->%d,bits=%d]: off=%u\n", + idx, 1, fgm->ar_coeff_lag, num_pos_chroma, BitsArCb, rb->bit_offset); +#endif } if (fgm->fgm_points[2] || fgm->fgm_scale_from_channel0_flag) { @@ -167,6 +186,10 @@ static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, int midPointCr = 1 << (BitsArCr - 1); for (int i = 0; i < num_pos_chroma; i++) fgm->ar_coeffs_cr[i] = avm_rb_read_literal(rb, BitsArCr) - midPointCr; +#if DEBUG_FGM_HDR + printf("FGM: post-ar_coefs[id=%d,pl=%d,cnt=%d->%d,bits=%d]: off=%u\n", + idx, 2, fgm->ar_coeff_lag, num_pos_chroma, BitsArCr, rb->bit_offset); +#endif } fgm->ar_coeff_shift = avm_rb_read_literal(rb, 2) + 6; // 6 + value @@ -193,6 +216,15 @@ static void read_film_grain_model(struct film_grain_model *fgm, int chroma_idc, fgm->mc_identity = 0; fgm->block_size = avm_rb_read_bit(rb); +#if DEBUG_FGM_HDR + printf("FGM: post-data[id=%d,sh=%d|%d|%d,uvm=%d|%d|%d|%d|%d|%d,overlap=%d," + "clip=%d,mcid=%d,bs=%d]: off=%u\n", idx, + fgm->scaling_shift, fgm->ar_coeff_shift, fgm->grain_scale_shift, + fgm->cb_mult - 128, fgm->cb_luma_mult - 128, fgm->cb_offset - 256, + fgm->cr_mult - 128, fgm->cr_luma_mult - 128, fgm->cr_offset - 256, + fgm->overlap_flag, fgm->clip_to_restricted_range, + fgm->mc_identity, fgm->block_size, rb->bit_offset); +#endif } // acc_fgm_id_bitmap is an in/out parameter. The caller should set @@ -217,6 +249,10 @@ uint32_t read_fgm_obu(AV2Decoder *pbi, const int obu_tlayer_id, avm_internal_error(&pbi->common.error, AVM_CODEC_UNSUP_BITSTREAM, "Invalid fgm_chroma_idc [%d].", fgm_chroma_idc); } +#if DEBUG_FGM_HDR + printf("FGM: post-init[mask=0x%x,layout=%d]: off=%u\n", + fgm_bit_map, fgm_chroma_idc, rb->bit_offset); +#endif for (int j = 0; j < MAX_FGM_NUM; j++) { // This process overwrites the position(pbi->fg_list[fgm_id]) if the fgm_id // is the same. 
@@ -228,7 +264,11 @@ uint32_t read_fgm_obu(AV2Decoder *pbi, const int obu_tlayer_id, pbi->fgm_list[fgm_id].fgm_tlayer_id = obu_tlayer_id; pbi->fgm_list[fgm_id].fgm_chroma_idc = fgm_chroma_idc; read_film_grain_model(&pbi->fgm_list[fgm_id], fgm_chroma_idc, rb, - &pbi->common.error); + &pbi->common.error +#if DEBUG_FGM_HDR + , j +#endif + ); } } if (av2_check_trailing_bits(pbi, rb) != 0) { diff --git a/avm/debug.h b/avm/debug.h index 36030e147d..b605a9125c 100644 --- a/avm/debug.h +++ b/avm/debug.h @@ -30,5 +30,6 @@ #define DEBUG_OBU_HDR 0 #define DEBUG_SEQ_HDR 0 +#define DEBUG_FRAME_HDR 0 #endif /* AVM_DEBUG_H */ -- 2.48.1 0003-Partitioning-block-and-coefficient-decoding-debuggin.patch000066400000000000000000007273511517466257200351430ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patchesFrom c3b4aeebed403b88c0caded2a96af558e9ff208b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 1 Sep 2025 09:39:53 -0400 Subject: [PATCH 3/5] Partitioning, block and coefficient decoding debugging helpers. 
--- av2/common/av2_common_int.h | 2 + av2/common/mvref_common.c | 717 ++++++++++++++++++++++++++++-- av2/common/mvref_common.h | 4 +- av2/common/tip.c | 2 + av2/decoder/decodeframe.c | 238 +++++++--- av2/decoder/decodemv.c | 852 ++++++++++++++++++++++++++++++------ av2/decoder/decodemv.h | 21 +- av2/decoder/decoder.c | 5 +- av2/decoder/decoder.h | 12 +- av2/decoder/decodetxb.c | 230 ++++++++-- av2/decoder/decodetxb.h | 14 +- av2/decoder/detokenize.c | 8 +- av2/decoder/detokenize.h | 3 +- av2/encoder/rdopt.c | 10 +- avm/debug.h | 16 + 15 files changed, 1838 insertions(+), 296 deletions(-) diff --git a/av2/common/av2_common_int.h b/av2/common/av2_common_int.h index 4eb58dea87..01b9fc75fa 100644 --- a/av2/common/av2_common_int.h +++ b/av2/common/av2_common_int.h @@ -4292,7 +4292,9 @@ static INLINE void av2_reset_refmv_bank(const AV2_COMMON *const cm, if (sb_mi_row > tile_info->mi_row_start) { int row_hits = 0; int mi_col = 0; + xd->mi_row = sb_mi_row; while (mi_col < block_mi_wide && row_hits < BANK_SB_ABOVE_ROW_MAX_HITS) { + xd->mi_col = mi_col; // Previous row position of SB boundary const int col_aligned_to_8x8 = ((mi_col >> 1) << 1); const int mi_grid_idx = get_mi_grid_idx(mi_params, sb_mi_row - 1, diff --git a/av2/common/mvref_common.c b/av2/common/mvref_common.c index b1a26c4404..6e73acbc94 100644 --- a/av2/common/mvref_common.c +++ b/av2/common/mvref_common.c @@ -12,6 +12,8 @@ #include +#include "avm/debug.h" + #include "av2/common/mv.h" #include "av2/common/mvref_common.h" #include "av2/common/reconintra.h" @@ -53,6 +55,21 @@ enum { #define TIP_MFMV_STACK_SIZE 3 // The limit for original TMVP w/ TIP. #define MFMV_STACK_SIZE 4 // The total limit of motion field candidates. +#define DEBUG_REFMV 0 + +#if DEBUG_BLOCK_INFO && DEBUG_REFMV +#define RDB_ONLY(x...) x +#define DB_ARGS(x...) x, +#define DEBUG_REFMV_printf(fmt...) \ + if (is_dbg && BLOCK_TO_DEBUG) { \ + printf(fmt); \ + } +#else +#define RDB_ONLY(x...) +#define DB_ARGS(x...) 
+#define DEBUG_REFMV_printf(fmt...) do {} while (0) +#endif + // Check and make sure that the MVs are stored to the correct slots. static INLINE void check_frame_mv_slot(const AV2_COMMON *const cm, MV_REF *mv) { if (mv->ref_frame[0] != NONE_FRAME && mv->ref_frame[1] == NONE_FRAME) { @@ -649,6 +666,7 @@ void av2_copy_frame_mvs(const AV2_COMMON *const cm, const MACROBLOCKD *const xd, // Fetch MVP candidates from derived SMVP into MVP candidate list // when there is no enough MVP candidates. static AVM_INLINE void fill_mvp_from_derived_smvp( + DB_ARGS(const AV2_COMMON *cm, int mi_row, int mi_col, int is_dbg) const MV_REFERENCE_FRAME rf[2], CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *refmv_count, CANDIDATE_MV *derived_mv_stack, uint8_t derived_mv_count, @@ -663,6 +681,11 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( ++(*drl_pr_count); if (ref_mv_stack[index].this_mv.as_int == derived_mv_stack[derived_idx].this_mv.as_int) { + DEBUG_REFMV_printf("derived[%d:%d]: increasing[%d] y=%d,x=%d,w+=0 " + "at offset y=0,x=0\n", + derived_idx, *drl_pr_count, index, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col); break; } } @@ -674,6 +697,12 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( ref_mv_stack[index].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[index].cwp_idx = CWP_EQUAL; ref_mv_weight[index] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("derived[%d:%d]: adding[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=0,x=0\n", + derived_idx, *drl_pr_count, index, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col, + REF_CAT_LEVEL); ++(*refmv_count); } } else { @@ -684,6 +713,12 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("derived[%d:%d]: tailing[%d] y=%d,x=%d,w=%d," + 
"y_off=0,x_off=0 at offset y=0,x=0\n", + derived_idx, *drl_pr_count, *refmv_count, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col, + REF_CAT_LEVEL); ++(*refmv_count); } } @@ -697,6 +732,13 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( derived_mv_stack[derived_idx].this_mv.as_int) && (ref_mv_stack[index].comp_mv.as_int == derived_mv_stack[derived_idx].comp_mv.as_int)) { + DEBUG_REFMV_printf("derived-c[%d]: increasing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w+=0 at offset y=0,x=0\n", + *drl_pr_count, index, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col, + derived_mv_stack[derived_idx].comp_mv.as_mv.row, + derived_mv_stack[derived_idx].comp_mv.as_mv.col); break; } } @@ -709,6 +751,14 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( ref_mv_stack[index].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[index].cwp_idx = CWP_EQUAL; ref_mv_weight[index] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("derived-c[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=0,x=0\n", + *drl_pr_count, index, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col, + derived_mv_stack[derived_idx].comp_mv.as_mv.row, + derived_mv_stack[derived_idx].comp_mv.as_mv.col, + REF_CAT_LEVEL); ++(*refmv_count); } } else { @@ -721,6 +771,14 @@ static AVM_INLINE void fill_mvp_from_derived_smvp( ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("derived-c[%d]: tailing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w=%d at offset y=0,x=0\n", + *drl_pr_count, *refmv_count, + derived_mv_stack[derived_idx].this_mv.as_mv.row, + derived_mv_stack[derived_idx].this_mv.as_mv.col, + derived_mv_stack[derived_idx].comp_mv.as_mv.row, + derived_mv_stack[derived_idx].comp_mv.as_mv.col, + REF_CAT_LEVEL); ++(*refmv_count); } } @@ -729,6 +787,7 @@ static AVM_INLINE 
void fill_mvp_from_derived_smvp( } static AVM_INLINE void derive_ref_mv_candidate_from_tip_mode( + DB_ARGS(int mi_row, int mi_col, int is_dbg) const AV2_COMMON *cm, int mi_row_cand, int mi_col_cand, const MB_MODE_INFO *const candidate, uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, CANDIDATE_MV *ref_mv_stack, @@ -746,6 +805,12 @@ static AVM_INLINE void derive_ref_mv_candidate_from_tip_mode( if ((ref_mv_stack[index].this_mv.as_int == ref_mv[0].as_int) && (ref_mv_stack[index].comp_mv.as_int == ref_mv[1].as_int)) { ref_mv_weight[index] += weight; + DEBUG_REFMV_printf("tip-spc-c[%d]: increasing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w+=%d at offset y=%d,x=%d\n", + *drl_pr_count, index, + ref_mv[0].as_mv.row, ref_mv[0].as_mv.col, + ref_mv[1].as_mv.row, ref_mv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -759,6 +824,12 @@ static AVM_INLINE void derive_ref_mv_candidate_from_tip_mode( ref_mv_stack[index].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[index].cwp_idx = CWP_EQUAL; ++(*refmv_count); + DEBUG_REFMV_printf("tip-spc-c[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, index, + ref_mv[0].as_mv.row, ref_mv[0].as_mv.col, + ref_mv[1].as_mv.row, ref_mv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if (*refmv_count < MAX_REF_MV_STACK_SIZE) { @@ -768,6 +839,12 @@ static AVM_INLINE void derive_ref_mv_candidate_from_tip_mode( ref_mv_stack[*refmv_count].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("tip-spc-c[%d]: tailing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, *refmv_count, + ref_mv[0].as_mv.row, ref_mv[0].as_mv.col, + ref_mv[1].as_mv.row, ref_mv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*refmv_count); } } @@ -817,7 +894,7 @@ static AVM_INLINE void add_ref_mv_candidate_ctx( 
} } -static AVM_INLINE void add_ref_mv_candidate( +static AVM_INLINE void add_ref_mv_candidate(DB_ARGS(const int is_dbg) int mi_row, int mi_col, int mi_row_cand, int mi_col_cand, const MB_MODE_INFO *const candidate, const SUBMB_INFO *const submi, const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, @@ -871,6 +948,12 @@ static AVM_INLINE void add_ref_mv_candidate( ++(*drl_pr_count); if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { ref_mv_weight[index] += weight; + DEBUG_REFMV_printf("spc[%d:%d]: increasing[%d] y=%d,x=%d,w+=%d " + "at offset y=%d,x=%d\n", + ref, *drl_pr_count, index, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -883,6 +966,12 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[index].cwp_idx = CWP_EQUAL; ref_mv_weight[index] = weight; ++(*refmv_count); + DEBUG_REFMV_printf("spc[%d:%d]: adding[%d] y=%d,x=%d,w=%d," + "y_off=%d,x_off=%d at offset y=%d,x=%d\n", + ref, *drl_pr_count, index, + this_refmv.as_mv.row, this_refmv.as_mv.col, weight, + row_offset, col_offset, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if (*refmv_count < MAX_REF_MV_STACK_SIZE) { @@ -891,6 +980,12 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[*refmv_count].col_offset = col_offset; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = weight; + DEBUG_REFMV_printf("spc[%d:%d]: tailing[%d] y=%d,x=%d,w=%d," + "y_off=%d,x_off=%d at offset y=%d,x=%d\n", + ref, *drl_pr_count, *refmv_count, + this_refmv.as_mv.row, this_refmv.as_mv.col, weight, + row_offset, col_offset, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*refmv_count); } } @@ -915,6 +1010,11 @@ static AVM_INLINE void add_ref_mv_candidate( ++(*drl_pr_count); if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { ref_mv_weight[index] += weight; + DEBUG_REFMV_printf("tip-spc[%d:%d]: increasing[%d] y=%d,x=%d," + "w+=%d at offset y=%d,x=%d\n", + ref, *drl_pr_count, index, + 
this_refmv.as_mv.row, this_refmv.as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -927,6 +1027,12 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[index].cwp_idx = CWP_EQUAL; ref_mv_weight[index] = weight; ++(*refmv_count); + DEBUG_REFMV_printf("tip-spc[%d:%d]: adding[%d] y=%d,x=%d," + "w=%d,y_off=%d,x_off=%d at offset y=%d,x=%d\n", + ref, *drl_pr_count, index, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, row_offset, col_offset, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if (*refmv_count < MAX_REF_MV_STACK_SIZE) { @@ -935,6 +1041,12 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[*refmv_count].col_offset = col_offset; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = weight; + DEBUG_REFMV_printf("tip-spc[%d:%d]: tailing[%d] y=%d,x=%d," + "w=%d,y_off=%d,x_off=%d at offset y=%d,x=%d\n", + ref, *drl_pr_count, *refmv_count, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, row_offset, col_offset, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*refmv_count); } } @@ -977,6 +1089,12 @@ static AVM_INLINE void add_ref_mv_candidate( ++(*drl_dr_pr_count); if (derived_mv_stack[index].this_mv.as_int == derived_mv.as_int) { derived_mv_weight[index] += weight; + DEBUG_REFMV_printf("tip2-spc[%d:%d]: increasing[%d] y=%d,x=%d," + "w+=%d at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + derived_mv.as_mv.row, derived_mv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -992,6 +1110,11 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_weight[index] = weight; derived_mv_stack[index].cwp_idx = CWP_EQUAL; ++(*derived_mv_count); + DEBUG_REFMV_printf("tip2-spc[%d:%d]: adding[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + derived_mv.as_mv.row, derived_mv.as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if ( @@ -1004,6 +1127,11 @@ static AVM_INLINE 
void add_ref_mv_candidate( derived_mv_stack[*derived_mv_count].this_mv = derived_mv; derived_mv_weight[*derived_mv_count] = weight; derived_mv_stack[*derived_mv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("tip2-spc[%d:%d]: tailing[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, *derived_mv_count, + derived_mv.as_mv.row, derived_mv.as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } @@ -1051,6 +1179,11 @@ static AVM_INLINE void add_ref_mv_candidate( ++(*drl_dr_pr_count); if (derived_mv_stack[index].this_mv.as_int == derived_mv.as_int) { derived_mv_weight[index] += weight; + DEBUG_REFMV_printf("mvtj-spc[%d:%d]: increasing[%d] y=%d,x=%d," + "w+=%d at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + derived_mv.as_mv.row, derived_mv.as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1065,6 +1198,12 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_stack[index].this_mv = derived_mv; derived_mv_weight[index] = weight; derived_mv_stack[index].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("mvtj-spc[%d:%d]: adding[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + derived_mv.as_mv.row, derived_mv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } else { @@ -1078,6 +1217,12 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_stack[*derived_mv_count].this_mv = derived_mv; derived_mv_weight[*derived_mv_count] = weight; derived_mv_stack[*derived_mv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("mvtj-spc[%d:%d]: tailing[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, *derived_mv_count, + derived_mv.as_mv.row, derived_mv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } @@ -1104,6 +1249,12 @@ static AVM_INLINE void add_ref_mv_candidate( if 
(derived_mv_stack[index].this_mv.as_int == this_refmv.as_int) { derived_mv_weight[index] += weight; + DEBUG_REFMV_printf("lnr-spc[%d:%d]: increasing[%d] y=%d,x=%d," + "w+=%d at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1120,6 +1271,12 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_weight[index] = weight; derived_mv_stack[index].cwp_idx = CWP_EQUAL; ++(*derived_mv_count); + DEBUG_REFMV_printf("lnr-spc[%d:%d]: adding[%d] y=%d,x=%d," + "w=%d,y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, index, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if ( @@ -1132,6 +1289,12 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_stack[*derived_mv_count].this_mv = this_refmv; derived_mv_weight[*derived_mv_count] = weight; derived_mv_stack[*derived_mv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("lnr-spc[%d:%d]: tailing[%d] y=%d,x=%d," + "w=%d,y_off=0,x_off=0 at offset y=%d,x=%d\n", + ref, *drl_dr_pr_count, *derived_mv_count, + this_refmv.as_mv.row, this_refmv.as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } @@ -1144,7 +1307,7 @@ static AVM_INLINE void add_ref_mv_candidate( candidate->ref_frame[1] == NONE_FRAME && rf[0] == tip_ref->ref_frame[0] && rf[1] == tip_ref->ref_frame[1] && cm->features.tip_frame_mode) { - derive_ref_mv_candidate_from_tip_mode( + derive_ref_mv_candidate_from_tip_mode(DB_ARGS(mi_row, mi_col, is_dbg) cm, mi_row_cand, mi_col_cand, candidate, refmv_count, ref_match_count, newmv_count, ref_mv_stack, ref_mv_weight, weight, drl_pr_count); } else { @@ -1166,6 +1329,13 @@ static AVM_INLINE void add_ref_mv_candidate( if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { ref_mv_weight[index] += weight; + 
DEBUG_REFMV_printf("spc-c[%d]: increasing[%d] y=%d,x=%d,y2=%d," + "x2=%d,w+=%d at offset y=%d,x=%d\n", + *drl_pr_count, index, + this_refmv[0].as_mv.row, this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, this_refmv[1].as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1178,6 +1348,13 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[index].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[index].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[index].cwp_idx = candidate->cwp_idx; + DEBUG_REFMV_printf("spc-c[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, index, + this_refmv[0].as_mv.row, this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, this_refmv[1].as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*refmv_count); } } else { @@ -1188,6 +1365,13 @@ static AVM_INLINE void add_ref_mv_candidate( ref_mv_stack[*refmv_count].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = candidate->cwp_idx; + DEBUG_REFMV_printf("spc-c[%d]: tailing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, *refmv_count, + this_refmv[0].as_mv.row, this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, this_refmv[1].as_mv.col, + weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*refmv_count); } } @@ -1246,6 +1430,14 @@ static AVM_INLINE void add_ref_mv_candidate( (derived_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { derived_mv_weight[index] += weight; + DEBUG_REFMV_printf("mvtj-spc-c[%d]: increasing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w+=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, index, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1262,6 +1454,14 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_stack[index].comp_mv = this_refmv[1]; 
derived_mv_weight[index] = weight; derived_mv_stack[index].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("mvtj-spc-c[%d]: adding[%d] y=%d,x=%d," + "y2=%d,x2=%d,w=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, *derived_mv_count, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } else { @@ -1276,6 +1476,14 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_stack[*derived_mv_count].comp_mv = this_refmv[1]; derived_mv_weight[*derived_mv_count] = weight; derived_mv_stack[*derived_mv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("mvtj-spc-c[%d]: tailing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, *derived_mv_count, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } @@ -1326,6 +1534,14 @@ static AVM_INLINE void add_ref_mv_candidate( (derived_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { derived_mv_weight[index] += weight; + DEBUG_REFMV_printf("mvxp-spc-c[%d]: increasing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w+=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, index, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1343,6 +1559,14 @@ static AVM_INLINE void add_ref_mv_candidate( derived_mv_weight[index] = weight; derived_mv_stack[index].cwp_idx = CWP_EQUAL; ++(*derived_mv_count); + DEBUG_REFMV_printf("mvxp-spc-c[%d]: adding[%d] y=%d,x=%d," + "y2=%d,x2=%d,w=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, index, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); } } else { if ( @@ -1356,6 +1580,14 @@ static AVM_INLINE void 
add_ref_mv_candidate( derived_mv_stack[*derived_mv_count].comp_mv = this_refmv[1]; derived_mv_weight[*derived_mv_count] = weight; derived_mv_stack[*derived_mv_count].cwp_idx = CWP_EQUAL; + DEBUG_REFMV_printf("mvxp-spc-c[%d]: tailing[%d] y=%d,x=%d," + "y2=%d,x2=%d,w=%d at offset y=%d,x=%d\n", + *drl_dr_pr_count, *derived_mv_count, + this_refmv[0].as_mv.row, + this_refmv[0].as_mv.col, + this_refmv[1].as_mv.row, + this_refmv[1].as_mv.col, weight, + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*derived_mv_count); } } @@ -1368,6 +1600,14 @@ static AVM_INLINE void add_ref_mv_candidate( if (single_mv[cand_idx].ref_frame == rf[candidate_ref_idx0] && (single_mv[cand_idx].mv.as_int == this_refmv[candidate_ref_idx0].as_int)) { + DEBUG_REFMV_printf("sngl-c[%d:%d]: skipping[%d] y=%d,x=%d," + "r=%d at offset y=%d,x=%d\n", + candidate_ref_idx0, *drl_dr_single_pr_count, + cand_idx, + this_refmv[candidate_ref_idx0].as_mv.row, + this_refmv[candidate_ref_idx0].as_mv.col, + rf[candidate_ref_idx0], + mi_row_cand - mi_row, mi_col_cand - mi_col); break; } } @@ -1382,6 +1622,14 @@ static AVM_INLINE void add_ref_mv_candidate( single_mv[cand_idx].mv.as_int = this_refmv[candidate_ref_idx0].as_int; single_mv[cand_idx].ref_frame = rf[candidate_ref_idx0]; + DEBUG_REFMV_printf("sngl-c[%d:%d]: adding[%d] y=%d,x=%d,r=%d " + "at offset y=%d,x=%d\n", + candidate_ref_idx0, *drl_dr_single_pr_count, + *single_mv_count, + this_refmv[candidate_ref_idx0].as_mv.row, + this_refmv[candidate_ref_idx0].as_mv.col, + rf[candidate_ref_idx0], + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*single_mv_count); } } else { @@ -1395,6 +1643,14 @@ static AVM_INLINE void add_ref_mv_candidate( single_mv[*single_mv_count].mv.as_int = this_refmv[candidate_ref_idx0].as_int; single_mv[*single_mv_count].ref_frame = rf[candidate_ref_idx0]; + DEBUG_REFMV_printf("sngl-c[%d:%d]: tailing[%d] y=%d,x=%d,r=%d " + "at offset y=%d,x=%d\n", + candidate_ref_idx0, *drl_dr_single_pr_count, + *single_mv_count, + 
this_refmv[candidate_ref_idx0].as_mv.row, + this_refmv[candidate_ref_idx0].as_mv.col, + rf[candidate_ref_idx0], + mi_row_cand - mi_row, mi_col_cand - mi_col); ++(*single_mv_count); } } @@ -1445,7 +1701,9 @@ void insert_neighbor_warp_candidate( // Check if the candidate warp parameters are already in the list or not. -void check_this_warp_candidate( +void check_this_warp_candidate(DB_ARGS(const int have_warp, const char *const tag, + const int mi_row, const int mi_col, + const int is_dbg) const AV2_COMMON *cm, const MB_MODE_INFO *const neighbor_mbmi, WARP_CANDIDATE warp_candidates[MAX_WARP_REF_CANDIDATES], const int ref_frame, const int max_num_of_candidates, @@ -1456,6 +1714,15 @@ void check_this_warp_candidate( WarpedMotionParams neigh_params; if (*curr_num_of_candidates < max_num_of_candidates && is_valid_warp_parameters(cm, neighbor_mbmi, ref_frame, &neigh_params)) { +#if DEBUG_REFMV + if (have_warp) + DEBUG_REFMV_printf("Spatial[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d from %s\n", + *curr_num_of_candidates, + neigh_params.wmmat[0], neigh_params.wmmat[1], + neigh_params.wmmat[2], neigh_params.wmmat[3], + neigh_params.wmmat[4], neigh_params.wmmat[5], + neigh_params.wmtype, tag); +#endif insert_neighbor_warp_candidate(warp_candidates, &neigh_params, *curr_num_of_candidates, proj_type); ++(*curr_num_of_candidates); @@ -1502,7 +1769,8 @@ static AVM_INLINE void scan_blk_mbmi_ctx( } } -static AVM_INLINE void scan_blk_mbmi( +static AVM_INLINE void scan_blk_mbmi(DB_ARGS(const int have_warp, const char *const tag, + const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, @@ -1541,14 +1809,15 @@ static AVM_INLINE void scan_blk_mbmi( if (warp_param_stack && valid_num_warp_candidates && max_num_of_warp_candidates) { - check_this_warp_candidate(cm, candidate, warp_param_stack, ref_frame, + 
check_this_warp_candidate(DB_ARGS(have_warp, tag, mi_row, mi_col, is_dbg) + cm, candidate, warp_param_stack, ref_frame, max_num_of_warp_candidates, valid_num_warp_candidates, PROJ_SPATIAL); } if (*refmv_count >= MAX_REF_MV_STACK_SIZE) return; - add_ref_mv_candidate( + add_ref_mv_candidate(DB_ARGS(is_dbg) mi_row, mi_col, cand_mi_row, cand_mi_col, candidate, submi, rf, refmv_count, ref_match_count, newmv_count, ref_mv_stack, ref_mv_weight, gm_mv_candidates, cm->global_motion, cm, add_more_mvs, single_mv, @@ -1632,7 +1901,8 @@ static AVM_INLINE int compute_cur_to_ref_dist(const AV2_COMMON *cm, return cur_ref_offset; } -static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, +static int add_tpl_ref_mv(DB_ARGS(const int is_dbg) + const AV2_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, int blk_row, int blk_col, uint8_t *const refmv_count, int *added_tmvp_cnt, @@ -1711,7 +1981,13 @@ static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; } - if (idx < *refmv_count) ref_mv_weight[idx] += weight; + if (idx < *refmv_count) { + ref_mv_weight[idx] += weight; + DEBUG_REFMV_printf("tpl[0:%d]: increasing[%d] y=%d,x=%d,w+=%d " + "at offset y=%d,x=%d\n", + *drl_pr_count, idx, this_refmv.as_mv.row, + this_refmv.as_mv.col, weight, tpl_row, tpl_col); + } if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; @@ -1721,6 +1997,11 @@ static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, ref_mv_weight[idx] = weight; ++(*refmv_count); ++(*added_tmvp_cnt); + DEBUG_REFMV_printf("tpl[0:%d]: adding[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + *drl_pr_count, idx, + this_refmv.as_mv.row, this_refmv.as_mv.col, weight, + tpl_row, tpl_col); } } else { if (*refmv_count < MAX_REF_MV_STACK_SIZE) { @@ -1729,6 +2010,11 @@ static int add_tpl_ref_mv(const AV2_COMMON 
*cm, const MACROBLOCKD *xd, ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = weight; + DEBUG_REFMV_printf("tpl[0:%d]: tailing[%d] y=%d,x=%d,w=%d," + "y_off=0,x_off=0 at offset y=%d,x=%d\n", + *drl_pr_count, *refmv_count, + this_refmv.as_mv.row, this_refmv.as_mv.col, weight, + tpl_row, tpl_col); ++(*refmv_count); ++(*added_tmvp_cnt); } @@ -1756,7 +2042,15 @@ static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, break; } - if (idx < *refmv_count) ref_mv_weight[idx] += weight; + if (idx < *refmv_count) { + ref_mv_weight[idx] += weight; + DEBUG_REFMV_printf("tpl-c[%d]: increasing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w+=%d at offset y=%d,x=%d\n", + *drl_pr_count, idx, + this_refmv.as_mv.row, this_refmv.as_mv.col, + comp_refmv.as_mv.row, comp_refmv.as_mv.col, weight, + tpl_row, tpl_col); + } if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; @@ -1767,6 +2061,12 @@ static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, ref_mv_weight[idx] = weight; ++(*refmv_count); ++(*added_tmvp_cnt); + DEBUG_REFMV_printf("tpl-c[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, idx, + this_refmv.as_mv.row, this_refmv.as_mv.col, + comp_refmv.as_mv.row, comp_refmv.as_mv.col, weight, + tpl_row, tpl_col); } } else { if (*refmv_count < MAX_REF_MV_STACK_SIZE) { @@ -1776,6 +2076,12 @@ static int add_tpl_ref_mv(const AV2_COMMON *cm, const MACROBLOCKD *xd, ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = weight; + DEBUG_REFMV_printf("tpl-c[%d]: tailing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, *refmv_count, + this_refmv.as_mv.row, this_refmv.as_mv.col, + comp_refmv.as_mv.row, comp_refmv.as_mv.col, weight, + tpl_row, tpl_col); ++(*refmv_count); 
++(*added_tmvp_cnt); } @@ -1818,16 +2124,25 @@ static AVM_INLINE int get_rmb_list_index(const MV_REFERENCE_FRAME ref_frame) { } static AVM_INLINE bool check_rmb_cand( + DB_ARGS(const AV2_COMMON *cm, int bank_idx, int bank_count, const int is_dbg) CANDIDATE_MV cand_mv, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *refmv_count, int is_comp, int mi_row, int mi_col, int block_width, int block_height, int frame_width, int frame_height, int *drl_pr_count) { // Check if the MV candidate is already existing in the ref MV stack. + RDB_ONLY(int did_check = 0); if (*drl_pr_count < MAX_PR_NUM) { + RDB_ONLY(did_check = 1); for (int i = 0; i < *refmv_count; ++i) { ++(*drl_pr_count); if (ref_mv_stack[i].this_mv.as_int == cand_mv.this_mv.as_int && (!is_comp || ref_mv_stack[i].comp_mv.as_int == cand_mv.comp_mv.as_int)) { + DEBUG_REFMV_printf("insert_bank[%d/%d:%d]: skipping[%d] " + "y=%d,x=%d,y2=%d,x2=%d\n", + bank_idx, bank_count, *drl_pr_count, i, + cand_mv.this_mv.as_mv.row, cand_mv.this_mv.as_mv.col, + is_comp ? cand_mv.comp_mv.as_mv.row : 0, + is_comp ? cand_mv.comp_mv.as_mv.col : 0); return false; } } @@ -1852,6 +2167,12 @@ static AVM_INLINE bool check_rmb_cand( ref_mv_stack[*refmv_count].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = cand_mv.cwp_idx; + DEBUG_REFMV_printf("insert_bank[%d/%d:%d]: %s[%d] y=%d,x=%d,y2=%d,x2=%d,w=%d\n", + bank_idx, bank_count, *drl_pr_count, + did_check ? "adding" : "tailing", + *refmv_count, cand_mv.this_mv.as_mv.row, + cand_mv.this_mv.as_mv.col, is_comp ? cand_mv.comp_mv.as_mv.row : 0, + is_comp ? 
cand_mv.comp_mv.as_mv.col : 0, REF_CAT_LEVEL); ++*refmv_count; return true; @@ -1870,7 +2191,7 @@ static AVM_INLINE bool add_to_ref_bv_list(CANDIDATE_MV cand_mv, return true; } -static AVM_INLINE void add_tmvp_candidate( +static AVM_INLINE void add_tmvp_candidate(DB_ARGS(const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], @@ -1903,7 +2224,8 @@ static AVM_INLINE void add_tmvp_candidate( for (int iter = 0; iter < TMVP_SEARCH_COUNT; ++iter) { if (added_tmvp_cnt) break; if (tmvp_units_status[iter].is_available) { - add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, + add_tpl_ref_mv(DB_ARGS(is_dbg) + cm, xd, mi_row, mi_col, ref_frame, tmvp_units_status[iter].row_offset, tmvp_units_status[iter].col_offset, refmv_count, &added_tmvp_cnt, ref_mv_stack, ref_mv_weight, @@ -1930,7 +2252,7 @@ static AVM_INLINE int assign_tmvp_high_priority(const AV2_COMMON *cm, return 0; } -static AVM_INLINE void add_derived_smvp_candidates( +static AVM_INLINE void add_derived_smvp_candidates(DB_ARGS(const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME *rf, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], @@ -1946,13 +2268,14 @@ static AVM_INLINE void add_derived_smvp_candidates( ? 
AVMMIN(cm->features.max_bvp_drl_bits + 1, MAX_REF_BV_STACK_SIZE) : AVMMIN(cm->features.max_drl_bits + 1, MAX_REF_MV_STACK_SIZE); if (*refmv_count < max_ref_mv_count && derived_mv_count > 0) { - fill_mvp_from_derived_smvp(rf, ref_mv_stack, ref_mv_weight, refmv_count, + fill_mvp_from_derived_smvp(DB_ARGS(cm, xd->mi_row, xd->mi_col, is_dbg) + rf, ref_mv_stack, ref_mv_weight, refmv_count, derived_mv_stack, derived_mv_count, max_ref_mv_count, drl_pr_count); } } -static AVM_INLINE void add_ref_mv_bank_candidates( +static AVM_INLINE void add_ref_mv_bank_candidates(DB_ARGS(const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME *rf, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], @@ -1980,7 +2303,8 @@ static AVM_INLINE void add_ref_mv_bank_candidates( rmb_ref_frame[idx] != ref_frame) continue; - check_rmb_cand(cand_mv, ref_mv_stack, ref_mv_weight, refmv_count, is_comp, + check_rmb_cand(DB_ARGS(cm, idx_bank, count, is_dbg) + cand_mv, ref_mv_stack, ref_mv_weight, refmv_count, is_comp, xd->mi_row, xd->mi_col, block_width, block_height, xd->plane[0].dst.width, xd->plane[0].dst.height, drl_pr_count); @@ -2193,6 +2517,8 @@ static AVM_INLINE int generate_points_from_corners( } static int insert_mvp_candidate( + DB_ARGS(const AV2_COMMON *cm, int mi_row, int mi_col, int cidx, + const int yidx, const int xidx, const int is_dbg) MV_REFERENCE_FRAME rf[2], CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], MV_COMP_DATA_TYPE this_mv_col, MV_COMP_DATA_TYPE this_mv_row, @@ -2212,6 +2538,20 @@ static int insert_mvp_candidate( (rf[1] == NONE_FRAME || (rf[1] > NONE_FRAME && ref_mv_stack[idx].comp_mv.as_int == ext_mv.comp_mv.as_int))) { + if (rf[1] > NONE_FRAME) { + DEBUG_REFMV_printf("insert_cand-c[%d]: increasing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w+=%d at offset y=%d,x=%d\n", + *drl_pr_count, idx, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + 
ext_mv.comp_mv.as_mv.row, ext_mv.comp_mv.as_mv.col, + 0, yidx, xidx); + } else { + DEBUG_REFMV_printf("insert_cand[%d:%d]: increasing[%d] y=%d,x=%d," + "w+=%d at offset y=%d,x=%d\n", + cidx, *drl_pr_count, idx, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + 0, yidx, xidx); + } break; } } @@ -2222,8 +2562,22 @@ static int insert_mvp_candidate( ref_mv_stack[idx].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[idx].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[idx].cwp_idx = CWP_EQUAL; - ref_mv_weight[idx] = weight; + ref_mv_weight[idx] = DEBUG_BLOCK_INFO ? 0 : weight; ++(*refmv_count); + if (rf[1] > NONE_FRAME) { + DEBUG_REFMV_printf("insert_cand-c[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, idx, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + ext_mv.comp_mv.as_mv.row, ext_mv.comp_mv.as_mv.col, + 0, yidx, xidx); + } else { + DEBUG_REFMV_printf("insert_cand[%d:%d]: adding[%d] y=%d,x=%d," + "w=%d,y_off=0,x_off=0 at offset y=%d,x=%d\n", + cidx, *drl_pr_count, idx, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + 0, yidx, xidx); + } return 1; } return 0; @@ -2234,7 +2588,21 @@ static int insert_mvp_candidate( ref_mv_stack[*refmv_count].row_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; - ref_mv_weight[*refmv_count] = weight; + ref_mv_weight[*refmv_count] = DEBUG_BLOCK_INFO ? 
0 : weight; + if (rf[1] > NONE_FRAME) { + DEBUG_REFMV_printf("insert_cand-c[%d]: tailing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d at offset y=%d,x=%d\n", + *drl_pr_count, *refmv_count, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + ext_mv.comp_mv.as_mv.row, ext_mv.comp_mv.as_mv.col, + 0, yidx, xidx); + } else { + DEBUG_REFMV_printf("insert_cand[%d:%d]: tailing[%d] y=%d,x=%d," + "w=%d,y_off=0,x_off=0 at offset y=%d,x=%d\n", + cidx, *drl_pr_count, *refmv_count, + ext_mv.this_mv.as_mv.row, ext_mv.this_mv.as_mv.col, + 0, yidx, xidx); + } ++(*refmv_count); return 1; } @@ -2242,9 +2610,9 @@ static int insert_mvp_candidate( } } -static AVM_INLINE void setup_ref_mv_list( - const AV2_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, - uint8_t *const refmv_count, +static AVM_INLINE void setup_ref_mv_list(DB_ARGS(const int is_dbg) + const AV2_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, @@ -2261,6 +2629,21 @@ static AVM_INLINE void setup_ref_mv_list( av2_set_ref_frame(rf, ref_frame); *refmv_count = 0; + DEBUG_REFMV_printf("setup_ref_mv_list(%d,%d) for y=%d,x=%d\n", + rf[0] == TIP_FRAME ? 7 : + rf[0] == INTRA_FRAME ? 
-1 : rf[0], + rf[1], mi_row, mi_col); + if (rf[1] != NONE_FRAME && rf[1] != INTRA_FRAME) { + DEBUG_REFMV_printf("Gmv2d: y=%d,x=%d, y2=%d,x2=%d\n", + gm_mv_candidates[0].as_mv.row, + gm_mv_candidates[0].as_mv.col, + gm_mv_candidates[1].as_mv.row, + gm_mv_candidates[1].as_mv.col); + } else if (rf[0] != INTRA_FRAME && rf[0] != NONE_FRAME) { + DEBUG_REFMV_printf("Gmv2d: y=%d,x=%d\n", + gm_mv_candidates[0].as_mv.row, + gm_mv_candidates[0].as_mv.col); + } int drl_pr_count = 0; int drl_dr_pr_count = 0; int drl_dr_single_pr_count = 0; @@ -2288,9 +2671,14 @@ static AVM_INLINE void setup_ref_mv_list( MVP_UNIT_STATUS row_smvp_state[4] = { 0 }; get_row_smvp_states(cm, xd, row_smvp_state); + int have_warp = 0; // derive a warp model from the 3 corner MVs if (warp_param_stack && valid_num_warp_candidates && *valid_num_warp_candidates < max_num_of_warp_candidates) { + have_warp = mbmi->mode > NEWMV; + if (have_warp) + DEBUG_REFMV_printf("Warp corners [%d|%d]\n", *refmv_count, + *valid_num_warp_candidates); // 0: top_left, top_right, bottom_left // 1: top, top_right_next, bottom_left #define WRL_CORNER_MVS_NUM 2 @@ -2326,6 +2714,14 @@ static AVM_INLINE void setup_ref_mv_list( xd->mi[0]->sb_type[PLANE_TYPE_Y], get_ref_scale_factors_const(cm, ref_frame)); if (valid_model && !cand_warp_param.invalid) { + if (have_warp) + DEBUG_REFMV_printf("MFC[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d " + "from tl=y:%d,x:%d,bl=y:%d,x:%d,tr=y:%d,x:%d\n", + iter, cand_warp_param.wmmat[0], cand_warp_param.wmmat[1], + cand_warp_param.wmmat[2], cand_warp_param.wmmat[3], + cand_warp_param.wmmat[4], cand_warp_param.wmmat[5], + cand_warp_param.wmtype, mvs_32[1], mvs_32[0], mvs_32[5], + mvs_32[4], mvs_32[3], mvs_32[2]); insert_neighbor_warp_candidate(warp_param_stack, &cand_warp_param, *valid_num_warp_candidates, PROJ_SPATIAL); @@ -2367,12 +2763,18 @@ static AVM_INLINE void setup_ref_mv_list( const int is_tmvp_high_priority = assign_tmvp_high_priority(cm, rf); if (is_tmvp_high_priority) { - 
add_tmvp_candidate(cm, xd, ref_frame, refmv_count, ref_mv_stack, + DEBUG_REFMV_printf("High-priority TMVP [%d|%d]\n", *refmv_count, + have_warp ? *valid_num_warp_candidates : 0); + add_tmvp_candidate(DB_ARGS(is_dbg) + cm, xd, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight, mi_row, mi_col, &drl_pr_count); } + DEBUG_REFMV_printf("Spatial MVP [%d|%d]\n", *refmv_count, + have_warp ? *valid_num_warp_candidates : 0); if (xd->left_available) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, (xd->height - 1), -1, + scan_blk_mbmi(DB_ARGS(have_warp, "bml", is_dbg) + cm, xd, mi_row, mi_col, rf, (xd->height - 1), -1, ref_mv_stack, ref_mv_weight, &col_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, derived_mv_weight, &derived_mv_count, @@ -2382,7 +2784,8 @@ static AVM_INLINE void setup_ref_mv_list( } if (row_smvp_state[0].is_available) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, row_smvp_state[0].row_offset, + scan_blk_mbmi(DB_ARGS(have_warp, "rmt", is_dbg) + cm, xd, mi_row, mi_col, rf, row_smvp_state[0].row_offset, row_smvp_state[0].col_offset, ref_mv_stack, ref_mv_weight, &row_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, @@ -2393,7 +2796,8 @@ static AVM_INLINE void setup_ref_mv_list( } if (height_at_least_two) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, 0, -1, ref_mv_stack, + scan_blk_mbmi(DB_ARGS(have_warp, "tml", is_dbg) + cm, xd, mi_row, mi_col, rf, 0, -1, ref_mv_stack, ref_mv_weight, &col_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, derived_mv_weight, &derived_mv_count, @@ -2403,7 +2807,8 @@ static AVM_INLINE void setup_ref_mv_list( } if (row_smvp_state[1].is_available) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, row_smvp_state[1].row_offset, + scan_blk_mbmi(DB_ARGS(have_warp, "lmt", is_dbg) + cm, xd, mi_row, mi_col, rf, row_smvp_state[1].row_offset, row_smvp_state[1].col_offset, ref_mv_stack, ref_mv_weight, 
&row_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, @@ -2413,7 +2818,8 @@ static AVM_INLINE void setup_ref_mv_list( &drl_dr_single_pr_count); } if (has_bl) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, xd->height, -1, ref_mv_stack, + scan_blk_mbmi(DB_ARGS(have_warp, "bl", is_dbg) + cm, xd, mi_row, mi_col, rf, xd->height, -1, ref_mv_stack, ref_mv_weight, &col_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, derived_mv_weight, &derived_mv_count, @@ -2423,7 +2829,8 @@ static AVM_INLINE void setup_ref_mv_list( } if (row_smvp_state[2].is_available) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, row_smvp_state[2].row_offset, + scan_blk_mbmi(DB_ARGS(have_warp, "tr", is_dbg) + cm, xd, mi_row, mi_col, rf, row_smvp_state[2].row_offset, row_smvp_state[2].col_offset, ref_mv_stack, ref_mv_weight, &row_match_count, &newmv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, @@ -2434,14 +2841,20 @@ static AVM_INLINE void setup_ref_mv_list( } if (!is_tmvp_high_priority) { - add_tmvp_candidate(cm, xd, ref_frame, refmv_count, ref_mv_stack, + DEBUG_REFMV_printf("Low-priority TMVP [%d|%d]\n", *refmv_count, + have_warp ? *valid_num_warp_candidates : 0); + add_tmvp_candidate(DB_ARGS(is_dbg) + cm, xd, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight, mi_row, mi_col, &drl_pr_count); } + DEBUG_REFMV_printf("Extra Spatial MVP [%d|%d]\n", *refmv_count, + have_warp ? 
*valid_num_warp_candidates : 0); if (row_smvp_state[3].is_available) { uint8_t dummy_ref_match_count = 0; uint8_t dummy_new_mv_count = 0; - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, row_smvp_state[3].row_offset, + scan_blk_mbmi(DB_ARGS(have_warp, "tl", is_dbg) + cm, xd, mi_row, mi_col, rf, row_smvp_state[3].row_offset, row_smvp_state[3].col_offset, ref_mv_stack, ref_mv_weight, &dummy_ref_match_count, &dummy_new_mv_count, gm_mv_candidates, 1, single_mv, &single_mv_count, derived_mv_stack, @@ -2454,6 +2867,8 @@ static AVM_INLINE void setup_ref_mv_list( const uint8_t nearest_refmv_count = *refmv_count; if (xd->left_available) { + DEBUG_REFMV_printf("Spatial Ext [left] MVP [%d|%d]\n", *refmv_count, + have_warp ? *valid_num_warp_candidates : 0); for (int idx = 2; idx <= MVREF_COLS; ++idx) { const int col_offset = -(idx << 1) + 1 + col_adj; const MVP_UNIT_STATUS col_units_status[SMVP_COL_SEARCH_COUNT] = { @@ -2466,7 +2881,8 @@ static AVM_INLINE void setup_ref_mv_list( is_valid_candidate(xd, mi_row, mi_col, col_units_status[unit_idx].row_offset, col_units_status[unit_idx].col_offset, -1)) { - scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, + scan_blk_mbmi(DB_ARGS(have_warp, unit_idx ? "ext_tml" : "ext_bml", is_dbg) + cm, xd, mi_row, mi_col, rf, col_units_status[unit_idx].row_offset, col_units_status[unit_idx].col_offset, ref_mv_stack, ref_mv_weight, &col_match_count, &newmv_count, @@ -2507,18 +2923,24 @@ static AVM_INLINE void setup_ref_mv_list( } const int is_compound = is_inter_ref_frame(rf[1]); + DEBUG_REFMV_printf("Derived Spatial MVP & refbank [%d|%d]\n", *refmv_count, + have_warp ? 
*valid_num_warp_candidates : 0); if (is_compound) { - add_derived_smvp_candidates(cm, xd, rf, refmv_count, ref_mv_stack, + add_derived_smvp_candidates(DB_ARGS(is_dbg) + cm, xd, rf, refmv_count, ref_mv_stack, ref_mv_weight, derived_mv_stack, derived_mv_count, &drl_pr_count); if (cm->seq_params.enable_refmvbank) - add_ref_mv_bank_candidates(cm, xd, rf, ref_frame, refmv_count, + add_ref_mv_bank_candidates(DB_ARGS(is_dbg) + cm, xd, rf, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight, &drl_pr_count); } else { if (cm->seq_params.enable_refmvbank) - add_ref_mv_bank_candidates(cm, xd, rf, ref_frame, refmv_count, + add_ref_mv_bank_candidates(DB_ARGS(is_dbg) + cm, xd, rf, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight, &drl_pr_count); - add_derived_smvp_candidates(cm, xd, rf, refmv_count, ref_mv_stack, + add_derived_smvp_candidates(DB_ARGS(is_dbg) + cm, xd, rf, refmv_count, ref_mv_stack, ref_mv_weight, derived_mv_stack, derived_mv_count, &drl_pr_count); } @@ -2532,6 +2954,8 @@ static AVM_INLINE void setup_ref_mv_list( } } + DEBUG_REFMV_printf("GMVs [%d|%d]\n", *refmv_count, + have_warp ? 
*valid_num_warp_candidates : 0); if (rf[1] == NONE_FRAME && mv_ref_list != NULL) { for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) { mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; @@ -2555,6 +2979,13 @@ static AVM_INLINE void setup_ref_mv_list( (rf[1] == NONE_FRAME || (rf[1] > NONE_FRAME && ref_mv_stack[idx].comp_mv.as_int == gm_mv_candidates[1].as_int))) { + DEBUG_REFMV_printf("gmv_add[%d]: skipping[%d] y=%d,x=%d," + "y2=%d,x2=%d from GMV\n", + drl_pr_count, idx, + gm_mv_candidates[0].as_mv.row, + gm_mv_candidates[0].as_mv.col, + gm_mv_candidates[1].as_mv.row, + gm_mv_candidates[1].as_mv.col); break; } } @@ -2567,6 +2998,13 @@ static AVM_INLINE void setup_ref_mv_list( ref_mv_stack[idx].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[idx].cwp_idx = CWP_EQUAL; ref_mv_weight[idx] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("gmv_add[%d]: adding[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d from GMV\n", + drl_pr_count, idx, + gm_mv_candidates[0].as_mv.row, + gm_mv_candidates[0].as_mv.col, + gm_mv_candidates[1].as_mv.row, + gm_mv_candidates[1].as_mv.col, REF_CAT_LEVEL); ++(*refmv_count); } } else { @@ -2577,6 +3015,13 @@ static AVM_INLINE void setup_ref_mv_list( ref_mv_stack[*refmv_count].col_offset = OFFSET_NONSPATIAL; ref_mv_stack[*refmv_count].cwp_idx = CWP_EQUAL; ref_mv_weight[*refmv_count] = REF_CAT_LEVEL; + DEBUG_REFMV_printf("gmv_add[%d]: tailing[%d] y=%d,x=%d,y2=%d,x2=%d," + "w=%d from GMV\n", + drl_pr_count, *refmv_count, + gm_mv_candidates[0].as_mv.row, + gm_mv_candidates[0].as_mv.col, + gm_mv_candidates[1].as_mv.row, + gm_mv_candidates[1].as_mv.col, REF_CAT_LEVEL); ++(*refmv_count); } } @@ -2589,6 +3034,8 @@ static AVM_INLINE void setup_ref_mv_list( max_ext_stack_size = 6; if (max_ext_stack_size) { + DEBUG_REFMV_printf("Ext MVP candidates [%d|%d]\n", *refmv_count, + have_warp ? 
*valid_num_warp_candidates : 0); MV_COMP_DATA_TYPE row = 0; MV_COMP_DATA_TYPE col = 0; MV_COMP_DATA_TYPE comp_mv_row = 0; @@ -2603,6 +3050,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[1].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[0] + ref_mv_weight[1] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 0, 0, 1, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); if (*refmv_count < MAX_REF_MV_STACK_SIZE && @@ -2614,6 +3062,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[0].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[0] + ref_mv_weight[1] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 1, 1, 0, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); } @@ -2627,6 +3076,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[2].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[0] + ref_mv_weight[2] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 2, 0, 2, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); @@ -2639,6 +3089,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[0].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[0] + ref_mv_weight[2] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 3, 2, 0, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); } @@ -2652,6 +3103,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[2].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[1] + ref_mv_weight[2] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 4, 1, 2, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, 
comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); } @@ -2664,6 +3116,7 @@ static AVM_INLINE void setup_ref_mv_list( comp_mv_col = ref_mv_stack[1].comp_mv.as_mv.col; curr_mv_weight = (ref_mv_weight[1] + ref_mv_weight[2] + 1) / 2; added_ext_cnt += insert_mvp_candidate( + DB_ARGS(cm, mi_row, mi_col, 5, 2, 1, is_dbg) rf, ref_mv_stack, ref_mv_weight, col, row, comp_mv_col, comp_mv_row, curr_mv_weight, refmv_count, &drl_pr_count); (void)added_ext_cnt; @@ -2674,6 +3127,9 @@ static AVM_INLINE void setup_ref_mv_list( if (warp_param_stack && valid_num_warp_candidates && *valid_num_warp_candidates < max_num_of_warp_candidates) { + if (have_warp) + DEBUG_REFMV_printf("Warp bank [%d|%d]\n", *refmv_count, + *valid_num_warp_candidates); // Insert warp parameters from the bank #if WARP_CU_BANK const WARP_PARAM_BANK *warp_param_bank = &xd->warp_param_bank; @@ -2694,20 +3150,43 @@ static AVM_INLINE void setup_ref_mv_list( insert_neighbor_warp_candidate(warp_param_stack, &cand_warp_param, *valid_num_warp_candidates, PROJ_PARAM_BANK); + if (have_warp) + DEBUG_REFMV_printf("Bank[%d/%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", + idx_bank, *valid_num_warp_candidates, + cand_warp_param.wmmat[0], cand_warp_param.wmmat[1], + cand_warp_param.wmmat[2], cand_warp_param.wmmat[3], + cand_warp_param.wmmat[4], cand_warp_param.wmmat[5], + cand_warp_param.wmtype); (*valid_num_warp_candidates)++; } } + if (have_warp) + DEBUG_REFMV_printf("Warp gmv [%d|%d]\n", *refmv_count, + *valid_num_warp_candidates); // Insert Global motion of the current if (*valid_num_warp_candidates < max_num_of_warp_candidates) { if (!xd->global_motion[ref_frame].invalid) { insert_neighbor_warp_candidate( warp_param_stack, &xd->global_motion[ref_frame], *valid_num_warp_candidates, PROJ_GLOBAL_MOTION); + if (have_warp) + DEBUG_REFMV_printf("GMV[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", + *valid_num_warp_candidates, + xd->global_motion[ref_frame].wmmat[0], + xd->global_motion[ref_frame].wmmat[1], + 
xd->global_motion[ref_frame].wmmat[2], + xd->global_motion[ref_frame].wmmat[3], + xd->global_motion[ref_frame].wmmat[4], + xd->global_motion[ref_frame].wmmat[5], + xd->global_motion[ref_frame].wmtype); (*valid_num_warp_candidates)++; } } + if (have_warp) + DEBUG_REFMV_printf("Warp defaults [%d|%d]\n", *refmv_count, + *valid_num_warp_candidates); // Filled with default values( currently all params are zeros) int max_num_of_default_allowed = AVMMIN(2, max_num_of_warp_candidates); int current_number_of_defaults = 0; @@ -2718,6 +3197,13 @@ static AVM_INLINE void setup_ref_mv_list( cand_num++) { warp_param_stack[cand_num].wm_params = default_warp_params; warp_param_stack[cand_num].proj_type = PROJ_DEFAULT; + if (have_warp) + DEBUG_REFMV_printf("Defaults[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", + *valid_num_warp_candidates, + default_warp_params.wmmat[0], default_warp_params.wmmat[1], + default_warp_params.wmmat[2], default_warp_params.wmmat[3], + default_warp_params.wmmat[4], default_warp_params.wmmat[5], + default_warp_params.wmtype); (*valid_num_warp_candidates)++; current_number_of_defaults++; } @@ -2726,6 +3212,8 @@ static AVM_INLINE void setup_ref_mv_list( // If there are open slots in reference BV candidate list // fetch reference BVs from the default BVPs if (xd->mi[0]->use_intrabc[xd->tree_type == CHROMA_PART]) { + DEBUG_REFMV_printf("Intrabc defaults [%d|%d]\n", *refmv_count, + have_warp ? *valid_num_warp_candidates : 0); const int w = xd->width * MI_SIZE; const int h = xd->height * MI_SIZE; const int sb_width = block_size_wide[cm->sb_size]; @@ -2748,6 +3236,8 @@ static AVM_INLINE void setup_ref_mv_list( add_to_ref_bv_list(tmp_mv, ref_mv_stack, ref_mv_weight, refmv_count); } } + DEBUG_REFMV_printf("Final [%d|%d]\n", *refmv_count, + have_warp ? 
*valid_num_warp_candidates : 0); } void get_skip_mode_ref_offsets(const AV2_COMMON *cm, int ref_order_hint[2]) { @@ -2825,7 +3315,7 @@ void av2_initialize_ref_mv_stack( } } -void av2_find_mv_refs( +void av2_find_mv_refs(DB_ONLY(const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], @@ -2838,6 +3328,9 @@ void av2_find_mv_refs( const int mi_col = xd->mi_col; int_mv gm_mv[2]; +#if DEBUG_BLOCK_INFO + if (!BLOCK_TO_DEBUG && is_dbg) return; +#endif if (ref_frame == INTRA_FRAME || is_tip_ref_frame(ref_frame)) { gm_mv[0].as_int = gm_mv[1].as_int = 0; } else { @@ -2875,7 +3368,8 @@ void av2_find_mv_refs( (SKIP_MODE_MVP_LIST *)&(xd->skip_mvp_candidate_list); av2_initialize_ref_mv_stack(skip_list->ref_mv_stack, USABLE_REF_MV_STACK_SIZE); - setup_ref_mv_list(cm, xd, ref_frame, &(skip_list->ref_mv_count), + setup_ref_mv_list(DB_ARGS(is_dbg) + cm, xd, mi, ref_frame, &(skip_list->ref_mv_count), skip_list->ref_mv_stack, skip_list->weight, mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, mi_col, NULL, 0, NULL); @@ -2897,14 +3391,15 @@ void av2_find_mv_refs( } else { av2_initialize_ref_mv_stack(ref_mv_stack[rf[0]], MAX_REF_MV_STACK_SIZE); } - setup_ref_mv_list(cm, xd, rf[0], &ref_mv_count[rf[0]], ref_mv_stack[rf[0]], + setup_ref_mv_list(DB_ARGS(is_dbg) + cm, xd, mi, rf[0], &ref_mv_count[rf[0]], ref_mv_stack[rf[0]], ref_mv_weight[rf[0]], mv_ref_list ? mv_ref_list[rf[0]] : NULL, gm_mv, mi_row, mi_col, derive_wrl ? warp_param_stack[rf[0]] : NULL, derive_wrl ? max_num_of_warp_candidates : 0, derive_wrl ? 
&valid_num_warp_candidates[rf[0]] : NULL); - if (has_second_drl(mi)) { + if (has_second_drl(mi) && mi->ref_frame[0] != mi->ref_frame[1]) { assert(rf[0] == mi->ref_frame[0]); assert(rf[1] == mi->ref_frame[1]); const BLOCK_SIZE bsize = mi->sb_type[PLANE_TYPE_Y]; @@ -2914,7 +3409,8 @@ void av2_find_mv_refs( gm_mv[1].as_int = 0; av2_initialize_ref_mv_stack(ref_mv_stack[rf[1]], MAX_REF_MV_STACK_SIZE); - setup_ref_mv_list(cm, xd, rf[1], &ref_mv_count[rf[1]], + setup_ref_mv_list(DB_ARGS(is_dbg) + cm, xd, mi, rf[1], &ref_mv_count[rf[1]], ref_mv_stack[rf[1]], ref_mv_weight[rf[1]], mv_ref_list ? mv_ref_list[rf[1]] : NULL, gm_mv, mi_row, mi_col, derive_wrl ? warp_param_stack[rf[1]] : NULL, @@ -4610,6 +5106,9 @@ void decide_rmb_unit_update_count(const AV2_COMMON *const cm, if (xd->tree_type == CHROMA_PART) return; const int mi_sb_size = cm->mib_size; const int mi_sb_size_log2 = cm->mib_size_log2; +#if DEBUG_BLOCK_INFO && DEBUG_REFMV + const int mi_row = xd->mi_row, mi_col = xd->mi_col, is_dbg = 1; +#endif const int mi_row_in_sb = xd->mi_row % mi_sb_size; const int mi_col_in_sb = xd->mi_col % mi_sb_size; const int rmb_unit_mi_size_log2 = mi_sb_size_log2 - SB_TO_RMB_UNITS_LOG2; @@ -4624,23 +5123,80 @@ void decide_rmb_unit_update_count(const AV2_COMMON *const cm, xd->ref_mv_bank.remain_hits = AVMMAX(rmb_units_count, BANK_1ST_UNIT_UPDATE_COUNT); xd->ref_mv_bank.rmb_unit_hits = 0; + if (cm->current_frame.frame_type & 1 || + cm->features.allow_intrabc) + { + DEBUG_REFMV_printf("Resetting refbank: remain=%d|hits=%d|%d\n", + xd->ref_mv_bank.remain_hits, + xd->ref_mv_bank.rmb_unit_hits, + xd->ref_mv_bank.rmb_sb_hits); + } } else if (((mi_row_in_sb % rmb_unit_mi_size) == 0) && ((mi_col_in_sb % rmb_unit_mi_size) == 0)) { xd->ref_mv_bank.remain_hits += rmb_units_count; xd->ref_mv_bank.rmb_unit_hits = 0; + if (cm->current_frame.frame_type & 1 || + cm->features.allow_intrabc) + { + DEBUG_REFMV_printf("Updating refbank availability: remain=%d|hits=%d|%d\n", + xd->ref_mv_bank.remain_hits, + 
xd->ref_mv_bank.rmb_unit_hits, + xd->ref_mv_bank.rmb_sb_hits); + } } } +#if DEBUG_BLOCK_INFO && DEBUG_REFMV +static void debug_refbank(const AV2_COMMON *cm, const int mi_row, const int mi_col, + REF_MV_BANK *ref_mv_bank, const int rmb_list_index) +{ + const int is_dbg = 1; + CANDIDATE_MV *queue = ref_mv_bank->rmb_buffer[rmb_list_index]; + MV_REFERENCE_FRAME *rmb_ref_frame = ref_mv_bank->rmb_ref_frame; + const int start_idx = ref_mv_bank->rmb_start_idx[rmb_list_index]; + const int count = ref_mv_bank->rmb_count[rmb_list_index]; + for (int i = 0; i < count; ++i) { + const int idx = (start_idx + count - i - 1) % REF_MV_BANK_SIZE; + MV_REFERENCE_FRAME rf[2]; + av2_set_ref_frame(rf, rmb_ref_frame[idx]); + const int comp = rmb_list_index - 6U < 2U || + (rmb_list_index == REF_MV_BANK_LIST_FOR_ALL_OTHERS && + rf[1] != NONE_FRAME); + DEBUG_REFMV_printf("refbank[%d/%d,c=%d]: mv=y:%d,x:%d,y2=%d,x2=%d,r=%d,%d\n", + i, count, rmb_list_index, + queue[idx].this_mv.as_mv.row, + queue[idx].this_mv.as_mv.col, + comp ? queue[idx].comp_mv.as_mv.row : 0, + comp ? queue[idx].comp_mv.as_mv.col : 0, + rmb_list_index < 6 ? rmb_list_index : + rmb_list_index < 8 ? 0 : + rf[0] == TIP_FRAME ? 7 : + rf[0] == INTRA_FRAME ? -1 : rf[0], + rmb_list_index < 6 ? -1 : + rmb_list_index < 8 ? rmb_list_index - 6 : rf[1]); + } +} +#else +#define debug_refbank(...) 
+#endif + static INLINE void update_ref_mv_bank(const AV2_COMMON *const cm, MACROBLOCKD *const xd, int from_within_sb, const MB_MODE_INFO *const mbmi, REF_MV_BANK *ref_mv_bank) { +#if DEBUG_BLOCK_INFO && DEBUG_REFMV + const int mi_row = xd->mi_row, mi_col = xd->mi_col, is_dbg = 1; +#endif if (from_within_sb) { decide_rmb_unit_update_count(cm, xd, mbmi); if (ref_mv_bank->remain_hits == 0 || ref_mv_bank->rmb_unit_hits >= BANK_UNIT_MAX_ALLOWED_LEFTOVER_UPDATES || ref_mv_bank->rmb_sb_hits >= MAX_RMB_SB_HITS) { + DEBUG_REFMV_printf("Refbank is full: remain=%d|hits=%d|%d\n", + xd->ref_mv_bank.remain_hits, + xd->ref_mv_bank.rmb_unit_hits, + ref_mv_bank->rmb_sb_hits); return; } @@ -4648,7 +5204,13 @@ static INLINE void update_ref_mv_bank(const AV2_COMMON *const cm, ref_mv_bank->rmb_unit_hits++; } else { // If max hits have been reached return. - if (ref_mv_bank->rmb_sb_hits >= MAX_RMB_SB_HITS) return; + if (ref_mv_bank->rmb_sb_hits >= MAX_RMB_SB_HITS) { + DEBUG_REFMV_printf("Refbank is full [external]: remain=%d|hits=%d|%d\n", + xd->ref_mv_bank.remain_hits, + xd->ref_mv_bank.rmb_unit_hits, + ref_mv_bank->rmb_sb_hits); + return; + } } // else increment count and proceed with updating. 
@@ -4688,11 +5250,18 @@ static INLINE void update_ref_mv_bank(const AV2_COMMON *const cm, rmb_ref_frame[idx0] = rmb_ref_frame[idx1]; } } + DEBUG_REFMV_printf("Moving refbank entry %d to tail %d | " + "remain=%d|hits=%d|%d\n", + idx, (start_idx + count - 1) % REF_MV_BANK_SIZE, + xd->ref_mv_bank.remain_hits, + xd->ref_mv_bank.rmb_unit_hits, + ref_mv_bank->rmb_sb_hits); const int tail = (start_idx + count - 1) % REF_MV_BANK_SIZE; queue[tail] = cand; if (rmb_list_index == REF_MV_BANK_LIST_FOR_ALL_OTHERS) { rmb_ref_frame[tail] = ref_frame; } + debug_refbank(cm, mi_row, mi_col, ref_mv_bank, rmb_list_index); return; } @@ -4705,11 +5274,16 @@ static INLINE void update_ref_mv_bank(const AV2_COMMON *const cm, rmb_ref_frame[idx] = ref_frame; } queue[idx].cwp_idx = mbmi->cwp_idx; + DEBUG_REFMV_printf("Adding new refbank entry in %d | remain=%d|hits=%d|%d\n", + idx, xd->ref_mv_bank.remain_hits, + xd->ref_mv_bank.rmb_unit_hits, + ref_mv_bank->rmb_sb_hits); if (count < REF_MV_BANK_SIZE) { ++ref_mv_bank->rmb_count[rmb_list_index]; } else { ++ref_mv_bank->rmb_start_idx[rmb_list_index]; } + debug_refbank(cm, mi_row, mi_col, ref_mv_bank, rmb_list_index); } void av2_update_ref_mv_bank(const AV2_COMMON *const cm, MACROBLOCKD *const xd, @@ -4772,12 +5346,41 @@ void span_submv(const AV2_COMMON *cm, SUBMB_INFO **submi, int mi_row, } } +#if DEBUG_BLOCK_INFO && DEBUG_REFMV +static void debug_warpbank(const AV2_COMMON *cm, + const int mi_row, const int mi_col, + WARP_PARAM_BANK *warp_param_bank, + MV_REFERENCE_FRAME ref_frame) { + const int is_dbg = 1; + WarpedMotionParams *queue = warp_param_bank->wpb_buffer[ref_frame]; + const int start_idx = warp_param_bank->wpb_start_idx[ref_frame]; + const int count = warp_param_bank->wpb_count[ref_frame]; + for (int idx_bank = 0; idx_bank < count; ++idx_bank) { + const int idx = (start_idx + count - 1 - idx_bank) % WARP_PARAM_BANK_SIZE; + DEBUG_REFMV_printf("refbank[%d/%d,r=%d]: %d,%d,%d,%d,%d,%d,t=%d\n", + idx_bank, count, ref_frame, + 
queue[idx].wmmat[0], + queue[idx].wmmat[1], + queue[idx].wmmat[2], + queue[idx].wmmat[3], + queue[idx].wmmat[4], + queue[idx].wmmat[5], + queue[idx].wmtype); + } +} +#else +#define debug_warpbank(...) +#endif + #define MAX_WARP_SB_HITS 64 // Update the warp parameter bank // If the warp parameters are already exist in the bank, then bank is // rearranged If the warp parameters are not in the bank, insert it to the // bank. -static INLINE void update_warp_param_bank(const MB_MODE_INFO *const mbmi, +static INLINE void update_warp_param_bank(DB_ARGS(const AV2_COMMON *cm, + const int mi_row, + const int mi_col) + const MB_MODE_INFO *const mbmi, #if COMPOUND_WARP_LINE_BUFFER_REDUCTION int cand_from_sb_above, #endif // COMPOUND_WARP_LINE_BUFFER_REDUCTION @@ -4788,6 +5391,9 @@ static INLINE void update_warp_param_bank(const MB_MODE_INFO *const mbmi, #else const int can_use_second_model = is_inter_compound_mode(mbmi->mode); #endif // COMPOUND_WARP_LINE_BUFFER_REDUCTION +#if DEBUG_BLOCK_INFO && DEBUG_REFMV + const int is_dbg = 1; +#endif for (int ref_idx = 0; ref_idx < 1 + can_use_second_model; ref_idx++) { const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[ref_idx]; WarpedMotionParams *queue = warp_param_bank->wpb_buffer[ref_frame]; @@ -4796,7 +5402,11 @@ static INLINE void update_warp_param_bank(const MB_MODE_INFO *const mbmi, int found = -1; // If max hits have been reached return. - if (warp_param_bank->wpb_sb_hits >= MAX_WARP_SB_HITS) return; + if (warp_param_bank->wpb_sb_hits >= MAX_WARP_SB_HITS) { + DEBUG_REFMV_printf("warprefbank: ignoring further action, hits=%d\n", + warp_param_bank->wpb_sb_hits); + return; + } // else increment count and proceed with updating. 
++warp_param_bank->wpb_sb_hits; @@ -4827,6 +5437,16 @@ static INLINE void update_warp_param_bank(const MB_MODE_INFO *const mbmi, } const int tail = (start_idx + count - 1) % WARP_PARAM_BANK_SIZE; queue[tail] = cand; + DEBUG_REFMV_printf("warprefbank: reordering %d to %d" + " [%d,%d,%d,%d,%d,%d]\n", + (start_idx + found) % WARP_PARAM_BANK_SIZE, tail, + mbmi->wm_params[ref_idx].wmmat[0], + mbmi->wm_params[ref_idx].wmmat[1], + mbmi->wm_params[ref_idx].wmmat[2], + mbmi->wm_params[ref_idx].wmmat[3], + mbmi->wm_params[ref_idx].wmmat[4], + mbmi->wm_params[ref_idx].wmmat[5]); + debug_warpbank(cm, mi_row, mi_col, warp_param_bank, ref_frame); continue; } @@ -4846,6 +5466,16 @@ static INLINE void update_warp_param_bank(const MB_MODE_INFO *const mbmi, } else { ++warp_param_bank->wpb_start_idx[ref_frame]; } + DEBUG_REFMV_printf("warprefbank: adding at %d|%d [%d,%d,%d,%d,%d,%d]\n", + warp_param_bank->wpb_count[ref_frame], + warp_param_bank->wpb_start_idx[ref_frame], + mbmi->wm_params[ref_idx].wmmat[0], + mbmi->wm_params[ref_idx].wmmat[1], + mbmi->wm_params[ref_idx].wmmat[2], + mbmi->wm_params[ref_idx].wmmat[3], + mbmi->wm_params[ref_idx].wmmat[4], + mbmi->wm_params[ref_idx].wmmat[5]); + debug_warpbank(cm, mi_row, mi_col, warp_param_bank, ref_frame); } } void av2_update_warp_param_bank(const AV2_COMMON *const cm, @@ -4856,7 +5486,8 @@ void av2_update_warp_param_bank(const AV2_COMMON *const cm, const MB_MODE_INFO *const mbmi) { (void)cm; if (is_warp_mode(mbmi->motion_mode)) { - update_warp_param_bank(mbmi, + update_warp_param_bank(DB_ARGS(cm, xd->mi_row, xd->mi_col) + mbmi, #if COMPOUND_WARP_LINE_BUFFER_REDUCTION cand_from_sb_above, #endif // COMPOUND_WARP_LINE_BUFFER_REDUCTION diff --git a/av2/common/mvref_common.h b/av2/common/mvref_common.h index d2cd349ba1..40f0e9fc9c 100644 --- a/av2/common/mvref_common.h +++ b/av2/common/mvref_common.h @@ -16,6 +16,8 @@ #include "av2/common/blockd.h" #include "av2/common/mv.h" +#include "avm/debug.h" + #ifdef __cplusplus extern "C" { #endif 
@@ -583,7 +585,7 @@ void av2_find_mode_ctx(const AV2_COMMON *cm, const MACROBLOCKD *xd, // The global_mvs output parameter points to an array of REF_FRAMES elements. // The caller may pass a null global_mvs if it does not need the global_mvs // output. -void av2_find_mv_refs( +void av2_find_mv_refs(DB_ONLY(const int is_dbg) const AV2_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], diff --git a/av2/common/tip.c b/av2/common/tip.c index a46176bf84..caaaf46d9f 100644 --- a/av2/common/tip.c +++ b/av2/common/tip.c @@ -759,6 +759,8 @@ static AVM_INLINE void tip_component_build_inter_predictors( const AV2_COMMON *cm, MACROBLOCKD *xd, int plane, TIP_PLANE *tip_plane, const MV mv[2], int bw, int bh, int mi_x, int mi_y, uint16_t **mc_buf, CONV_BUF_TYPE *tmp_conv_dst, CalcSubpelParamsFunc calc_subpel_params_func) { + xd->mi_row = mi_y >> MI_SIZE_LOG2; + xd->mi_col = mi_x >> MI_SIZE_LOG2; tip_build_inter_predictors_8x8_and_bigger( cm, xd, plane, tip_plane, mv, bw, bh, mi_x, mi_y, mc_buf, tmp_conv_dst, calc_subpel_params_func); diff --git a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index 7cd0394751..6f586538d3 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -222,7 +222,7 @@ static AVM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb, memset(dqcoeff, 0, AVMMAX(nz0, nz1) * sizeof(dqcoeff[0])); } -static AVM_INLINE void read_coeffs_tx_intra_block( +static AVM_INLINE void read_coeffs_tx_intra_block(DB_ONLY(const int depth) const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { MB_MODE_INFO *mbmi = dcb->xd.mi[0]; @@ -231,7 +231,8 @@ static AVM_INLINE void read_coeffs_tx_intra_block( struct avm_usec_timer timer; avm_usec_timer_start(&timer); #endif - av2_read_coeffs_txb_facade(cm, dcb, r, plane, row, col, tx_size); + 
av2_read_coeffs_txb_facade(DB_ONLY(depth) + cm, dcb, r, plane, row, col, tx_size); #if TXCOEFF_TIMER avm_usec_timer_mark(&timer); const int64_t elapsed_time = avm_usec_timer_elapsed(&timer); @@ -246,11 +247,15 @@ static AVM_INLINE void read_coeffs_tx_intra_block( } } -static AVM_INLINE void decode_block_void(const AV2_COMMON *const cm, +static AVM_INLINE void decode_block_void(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { +#if DEBUG_BLOCK_INFO + (void)depth; +#endif (void)cm; (void)dcb; (void)r; @@ -268,9 +273,12 @@ static AVM_INLINE void predict_inter_block_void(AV2_COMMON *const cm, (void)bsize; } -static AVM_INLINE void predict_and_reconstruct_intra_block( +static AVM_INLINE void predict_and_reconstruct_intra_block(DB_ONLY(const int depth) const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { +#if DEBUG_BLOCK_INFO + (void)depth; +#endif (void)r; MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *mbmi = xd->mi[0]; @@ -382,10 +390,13 @@ static AVM_INLINE void predict_and_reconstruct_intra_block( } // Facade function for inverse cross chroma component transform -static AVM_INLINE void inverse_cross_chroma_transform_block( +static AVM_INLINE void inverse_cross_chroma_transform_block(DB_ONLY(const int depth) const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int blk_row, const int blk_col, const TX_SIZE tx_size) { +#if DEBUG_BLOCK_INFO + (void)depth; +#endif (void)cm; (void)r; (void)plane; @@ -399,10 +410,13 @@ static AVM_INLINE void inverse_cross_chroma_transform_block( xd->bd); } -static AVM_INLINE void inverse_transform_inter_block( +static AVM_INLINE void inverse_transform_inter_block(DB_ONLY(const int depth) const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const 
int blk_row, const int blk_col, const TX_SIZE tx_size) { +#if DEBUG_BLOCK_INFO + (void)depth; +#endif (void)r; MACROBLOCKD *const xd = &dcb->xd; PLANE_TYPE plane_type = get_plane_type(plane); @@ -447,7 +461,7 @@ static AVM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb, dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); } -static AVM_INLINE void decode_reconstruct_tx( +static AVM_INLINE void decode_reconstruct_tx(DB_ONLY(const int depth) AV2_COMMON *cm, ThreadData *const td, avm_reader *r, MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, TX_SIZE tx_size, int *eob_total) { @@ -463,14 +477,19 @@ static AVM_INLINE void decode_reconstruct_tx( if (mbmi->tx_partition_type[txp_index] == TX_PARTITION_NONE || plane) { if (plane == AVM_PLANE_V && is_cctx_allowed(cm, xd)) { - td->read_coeffs_tx_inter_block_visit(cm, dcb, r, AVM_PLANE_U, blk_row, + td->read_coeffs_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, AVM_PLANE_U, blk_row, blk_col, tx_size); - td->read_coeffs_tx_inter_block_visit(cm, dcb, r, AVM_PLANE_V, blk_row, + td->read_coeffs_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, AVM_PLANE_V, blk_row, blk_col, tx_size); - td->inverse_cctx_block_visit(cm, dcb, r, -1, blk_row, blk_col, tx_size); - td->inverse_tx_inter_block_visit(cm, dcb, r, AVM_PLANE_U, blk_row, + td->inverse_cctx_block_visit(DB_ONLY(depth) + cm, dcb, r, -1, blk_row, blk_col, tx_size); + td->inverse_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, AVM_PLANE_U, blk_row, blk_col, tx_size); - td->inverse_tx_inter_block_visit(cm, dcb, r, AVM_PLANE_V, blk_row, + td->inverse_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, AVM_PLANE_V, blk_row, blk_col, tx_size); eob_info *eob_data_c1 = dcb->eob_data[AVM_PLANE_U] + dcb->txb_offset[AVM_PLANE_U]; @@ -481,10 +500,12 @@ static AVM_INLINE void decode_reconstruct_tx( set_cb_buffer_offsets(dcb, tx_size, AVM_PLANE_V); } else { assert(plane == AVM_PLANE_Y || !is_cctx_allowed(cm, xd)); - 
td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, + td->read_coeffs_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, plane, blk_row, blk_col, tx_size); - td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, + td->inverse_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, plane, blk_row, blk_col, tx_size); eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; *eob_total += eob_data->eob; @@ -501,9 +522,11 @@ static AVM_INLINE void decode_reconstruct_tx( const int offsetc = blk_col + mbmi->txb_pos.col_offset[txb_idx]; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, offsetr, offsetc, + td->read_coeffs_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, plane, offsetr, offsetc, sub_tx); - td->inverse_tx_inter_block_visit(cm, dcb, r, plane, offsetr, offsetc, + td->inverse_tx_inter_block_visit(DB_ONLY(depth) + cm, dcb, r, plane, offsetr, offsetc, sub_tx); eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; *eob_total += eob_data->eob; @@ -681,7 +704,8 @@ static void av2_dec_setup_tip_frame(AV2Decoder *pbi, AV2_COMMON *cm, } } -static AVM_INLINE void decode_mbmi_block(AV2Decoder *const pbi, +static AVM_INLINE void decode_mbmi_block(DB_ONLY(const int depth) + AV2Decoder *const pbi, DecoderCodingBlock *dcb, int mi_row, int mi_col, avm_reader *r, PARTITION_TYPE partition, @@ -699,6 +723,17 @@ static AVM_INLINE void decode_mbmi_block(AV2Decoder *const pbi, #endif set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, parent, index); +#if DEBUG_BLOCK_INFO + const char *const names[] = { + [SHARED_PART] = "yuv", + [LUMA_PART] = "y", + [CHROMA_PART] = "uv", + }; + DEBUG_BLOCK_printf("%*sdecode_b[y=%d,x=%d,bs=%dx%d,plane=%s]: r=%d\n", + depth - 1, "", mi_row, mi_col, MI_SIZE * bw, MI_SIZE * bh, + names[xd->is_chroma_ref ? 
xd->tree_type : LUMA_PART], + r->ec.rng); +#endif xd->mi[0]->partition = partition; // set region_type for each mbmi xd->mi[0]->region_type = parent->region_type; @@ -730,14 +765,14 @@ static AVM_INLINE void decode_mbmi_block(AV2Decoder *const pbi, if (!cm->bru.frame_inactive_flag && !cm->bridge_frame_info.is_bridge_frame) { if (xd->tree_type != CHROMA_PART) { - read_gdf(cm, r, xd); + read_gdf(DB_ONLY(depth) cm, r, xd); } if (cm->seq_params.enable_ccso && xd->tree_type != CHROMA_PART) { - read_ccso(cm, r, xd); + read_ccso(DB_ONLY(depth) cm, r, xd); } } } else - av2_read_mode_info(pbi, dcb, r, x_mis, y_mis); + av2_read_mode_info(DB_ONLY(depth) pbi, dcb, r, x_mis, y_mis); if (xd->tree_type != LUMA_PART) { const struct macroblockd_plane *const pd_u = &xd->plane[1]; @@ -928,9 +963,15 @@ static AVM_INLINE void copy_frame_mvs_inter_block(AV2_COMMON *const cm, } } -static AVM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd, +static AVM_INLINE void set_color_index_map_offset(DB_ONLY(AV2_COMMON *cm, + const int depth) + MACROBLOCKD *const xd, int plane, avm_reader *r) { (void)r; +#if DEBUG_BLOCK_INFO + (void)cm; + (void)depth; +#endif Av2ColorMapParam params; const MB_MODE_INFO *const mbmi = xd->mi[0]; av2_get_block_dimensions(mbmi->sb_type[plane > 0], plane, xd, @@ -1032,7 +1073,8 @@ static AVM_INLINE int bridge_frame_is_valid_inter(const AV2_COMMON *const cm, return 1; } -static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, +static AVM_INLINE void decode_token_recon_block(DB_ONLY(const int depth) + AV2Decoder *const pbi, ThreadData *const td, avm_reader *r, PARTITION_TYPE partition, @@ -1128,9 +1170,9 @@ static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, blk_col >= plane_unit_width) continue; - td->read_coeffs_tx_intra_block_visit( + td->read_coeffs_tx_intra_block_visit(DB_ONLY(depth) cm, dcb, r, plane, blk_row, blk_col, tx_size); - td->predict_and_recon_intra_block_visit( + 
td->predict_and_recon_intra_block_visit(DB_ONLY(depth) cm, dcb, r, plane, blk_row, blk_col, tx_size); set_cb_buffer_offsets(dcb, tx_size, plane); } @@ -1166,19 +1208,20 @@ static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, blk_col += stepc) { if (plane == AVM_PLANE_V && is_cctx_allowed(cm, xd)) { if (need_parsing) { - td->read_coeffs_tx_intra_block_visit( + td->read_coeffs_tx_intra_block_visit(DB_ONLY(depth) cm, dcb, r, AVM_PLANE_U, blk_row, blk_col, tx_size); - td->read_coeffs_tx_intra_block_visit( + td->read_coeffs_tx_intra_block_visit(DB_ONLY(depth) cm, dcb, r, AVM_PLANE_V, blk_row, blk_col, tx_size); - td->inverse_cctx_block_visit(cm, dcb, r, -1, blk_row, + td->inverse_cctx_block_visit(DB_ONLY(depth) + cm, dcb, r, -1, blk_row, blk_col, tx_size); } if (need_reconstrution) { - td->predict_and_recon_intra_block_visit( + td->predict_and_recon_intra_block_visit(DB_ONLY(depth) cm, dcb, r, AVM_PLANE_U, blk_row & 0xf0, blk_col & 0xf0, tx_size); - td->predict_and_recon_intra_block_visit( + td->predict_and_recon_intra_block_visit(DB_ONLY(depth) cm, dcb, r, AVM_PLANE_V, blk_row & 0xf0, blk_col & 0xf0, tx_size); } @@ -1189,10 +1232,10 @@ static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, } else { assert(plane == AVM_PLANE_Y || !is_cctx_allowed(cm, xd)); if (need_parsing) - td->read_coeffs_tx_intra_block_visit( + td->read_coeffs_tx_intra_block_visit(DB_ONLY(depth) cm, dcb, r, plane, blk_row, blk_col, tx_size); if (need_reconstrution) { - td->predict_and_recon_intra_block_visit( + td->predict_and_recon_intra_block_visit(DB_ONLY(depth) cm, dcb, r, plane, blk_row & (lossless ? 0xff : 0xf0), blk_col & (lossless ? 
0xff : 0xf0), tx_size); @@ -1295,7 +1338,8 @@ static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, blk_row += bh_var_tx) { for (int blk_col = col >> ss_x; blk_col < plane_unit_width; blk_col += bw_var_tx) { - decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, + decode_reconstruct_tx(DB_ONLY(depth) + cm, td, r, mbmi, plane, plane_bsize, blk_row, blk_col, max_tx_size, &eobtotal); } @@ -1351,11 +1395,13 @@ static AVM_INLINE void decode_token_recon_block(AV2Decoder *const pbi, td->copy_frame_mvs_block_visit(cm, dcb, bsize); - av2_visit_palette(pbi, xd, r, set_color_index_map_offset); + av2_visit_palette(DB_ONLY(depth) pbi, xd, r, set_color_index_map_offset); av2_mark_block_as_coded(xd, bsize, cm->sb_size); } -static TX_SIZE read_tx_partition(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, +static TX_SIZE read_tx_partition(DB_ONLY(const int depth, + const AV2_COMMON *cm) + MACROBLOCKD *xd, MB_MODE_INFO *mbmi, TX_SIZE max_tx_size, int blk_row, int blk_col, avm_reader *r) { int plane_type = (xd->tree_type == CHROMA_PART); @@ -1425,6 +1471,14 @@ static TX_SIZE read_tx_partition(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, partition = TX_PARTITION_NONE; } +#if DEBUG_BLOCK_INFO + if (allow_horz || allow_vert) { + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-txpart[ctx=%d|%d|%d,%d]: r=%d\n", + depth, "", is_fsc, is_inter, bsize_group, + partition, r->ec.rng); + } +#endif TX_SIZE sub_txs[MAX_TX_PARTITIONS] = { 0 }; int num_txfm_blocks = get_tx_partition_sizes( partition, max_tx_size, &mbmi->txb_pos, sub_txs, xd->error_info); @@ -1434,7 +1488,8 @@ static TX_SIZE read_tx_partition(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, return sub_txs[num_txfm_blocks - 1]; } -static TX_SIZE read_tx_size(MACROBLOCKD *xd, TX_MODE tx_mode, int is_inter, +static TX_SIZE read_tx_size(DB_ONLY(const int depth, const AV2_COMMON *cm) + MACROBLOCKD *xd, TX_MODE tx_mode, int is_inter, int allow_select_inter, avm_reader *r) { const BLOCK_SIZE bsize = 
xd->mi[0]->sb_type[xd->tree_type == CHROMA_PART]; if (xd->lossless[xd->mi[0]->segment_id]) { @@ -1446,6 +1501,12 @@ static TX_SIZE read_tx_size(MACROBLOCKD *xd, TX_MODE tx_mode, int is_inter, const int is_tx_size_large = avm_read_symbol( r, xd->tile_ctx->lossless_tx_size_cdf[bsize_group][is_inter], 2, ACCT_INFO("lossless_tx_size")); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-lltxsz[ctx=%d|%d,%d]: r=%d\n", + depth, "", bsize_group, is_inter, + is_tx_size_large, r->ec.rng); +#endif if (is_tx_size_large) { return lossless_max_txsize_lookup[bsize]; } @@ -1457,7 +1518,8 @@ static TX_SIZE read_tx_size(MACROBLOCKD *xd, TX_MODE tx_mode, int is_inter, if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { MB_MODE_INFO *mbmi = xd->mi[0]; const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; - return read_tx_partition(xd, mbmi, max_tx_size, 0, 0, r); + return read_tx_partition(DB_ONLY(depth, cm) + xd, mbmi, max_tx_size, 0, 0, r); } else { return tx_size_from_tx_mode(bsize, tx_mode); } @@ -1486,7 +1548,8 @@ static BruActiveMode read_bru_mode(AV2_COMMON *cm, const MACROBLOCKD *xd, } return sb_active_mode; } -static AVM_INLINE void parse_decode_block(AV2Decoder *const pbi, +static AVM_INLINE void parse_decode_block(DB_ONLY(const int depth) + AV2Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, avm_reader *r, PARTITION_TYPE partition, @@ -1504,12 +1567,13 @@ static AVM_INLINE void parse_decode_block(AV2Decoder *const pbi, "Block size %dx%d violates aspect ratio constraint of %d", blk_w, blk_h, max_aspect_ratio); } - decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize, parent, + AV2_COMMON *cm = &pbi->common; + decode_mbmi_block(DB_ONLY(depth) + pbi, dcb, mi_row, mi_col, r, partition, bsize, parent, index); - av2_visit_palette(pbi, xd, r, av2_decode_palette_tokens); + av2_visit_palette(DB_ONLY(depth) pbi, xd, r, av2_decode_palette_tokens); - AV2_COMMON *cm = &pbi->common; 
const int num_planes = av2_num_planes(cm); MB_MODE_INFO *mbmi = xd->mi[0]; int inter_block_tx = is_inter_block(mbmi, xd->tree_type) || @@ -1529,14 +1593,20 @@ static AVM_INLINE void parse_decode_block(AV2Decoder *const pbi, for (int idy = 0; idy < height; idy += bh) for (int idx = 0; idx < width; idx += bw) - read_tx_partition(xd, mbmi, max_tx_size, idy, idx, r); + read_tx_partition(DB_ONLY(depth, cm) + xd, mbmi, max_tx_size, idy, idx, r); } else { mbmi->tx_size = - read_tx_size(xd, cm->features.tx_mode, inter_block_tx, + read_tx_size(DB_ONLY(depth, cm) + xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip_txfm[xd->tree_type == CHROMA_PART], r); } } +#if DEBUG_BLOCK_INFO + resolve_refmvs(depth, pbi, dcb); +#endif + if (cm->delta_q_info.delta_q_present_flag) { for (int i = 0; i < MAX_SEGMENTS; i++) { const int current_qindex = av2_get_qindex( @@ -1569,7 +1639,7 @@ static AVM_INLINE void parse_decode_block(AV2Decoder *const pbi, // For optimized decoder, only do reocn when support SB if (!pbi->bru_opt_mode || (pbi->bru_opt_mode && bru_is_sb_active(cm, mi_col, mi_row))) - decode_token_recon_block(pbi, td, r, partition, bsize); + decode_token_recon_block(DB_ONLY(depth + 1) pbi, td, r, partition, bsize); // Note: the copying here must match corresponding encoder-side copying in // av2_update_state(). 
@@ -1665,7 +1735,8 @@ static AVM_INLINE void set_offsets_for_pred_and_recon(AV2Decoder *const pbi, num_planes, &chroma_ref_info); } -static AVM_INLINE void decode_block(AV2Decoder *const pbi, ThreadData *const td, +static AVM_INLINE void decode_block(DB_ONLY(const int depth) + AV2Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, avm_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize, PARTITION_TREE *parent, int index) { @@ -1683,7 +1754,7 @@ static AVM_INLINE void decode_block(AV2Decoder *const pbi, ThreadData *const td, max_aspect_ratio); } set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); - decode_token_recon_block(pbi, td, r, partition, bsize); + decode_token_recon_block(DB_ONLY(depth + 1) pbi, td, r, partition, bsize); } /*!\brief Maps (ext_part, 4way, 4way_type, rect_type) to partition_type. */ @@ -1848,7 +1919,7 @@ static void set_sb_mv_precision(SB_INFO *sbi, AV2Decoder *const pbi) { } // TODO(slavarnway): eliminate bsize and subsize in future commits -static AVM_INLINE void decode_partition( +static AVM_INLINE void decode_partition(DB_ONLY(const int depth) AV2Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, avm_reader *reader, BLOCK_SIZE bsize, SB_INFO *sbi, PARTITION_TREE *ptree, PARTITION_TREE *ptree_luma, int parse_decode_flag) { @@ -1878,11 +1949,13 @@ static AVM_INLINE void decode_partition( is_intra_sdp_enabled && bsize == BLOCK_64X64 ? 
2 : 1; if (total_loop_num == 2 && xd->tree_type == SHARED_PART) { xd->tree_type = LUMA_PART; - decode_partition(pbi, td, mi_row, mi_col, reader, bsize, sbi, ptree, + decode_partition(DB_ONLY(depth) + pbi, td, mi_row, mi_col, reader, bsize, sbi, ptree, ptree_luma, parse_decode_flag); xd->tree_type = CHROMA_PART; - decode_partition(pbi, td, mi_row, mi_col, reader, bsize, sbi, ptree_luma, + decode_partition(DB_ONLY(depth) + pbi, td, mi_row, mi_col, reader, bsize, sbi, ptree_luma, ptree, parse_decode_flag); xd->tree_type = SHARED_PART; return; @@ -1924,6 +1997,13 @@ static AVM_INLINE void decode_partition( } if (parse_decode_flag & 1) { + PARTITION_TREE *parent = ptree->parent; + set_chroma_ref_info( + xd->tree_type, mi_row, mi_col, ptree->index, bsize, + &ptree->chroma_ref_info, parent ? &parent->chroma_ref_info : NULL, + parent ? parent->bsize : BLOCK_INVALID, + parent ? parent->partition : PARTITION_NONE, ss_x, ss_y); + if (is_sb_root) { if (cm->bru.enabled || cm->bridge_frame_info.is_bridge_frame) { const int mi_grid_idx = get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); @@ -1956,6 +2036,11 @@ static AVM_INLINE void decode_partition( for (int rcol = rcol0; rcol < rcol1; ++rcol) { const int runit_idx = rcol + rrow * rstride; loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx); + DEBUG_BLOCK_printf("Post-restoration[p=%d,type=%d]: r=%d\n", + plane, + cm->rst_info[plane].unit_info[ + runit_idx].restoration_type, + reader->ec.rng); } } } @@ -1975,17 +2060,41 @@ static AVM_INLINE void decode_partition( CFL_DISALLOWED_FOR_CHROMA; } - PARTITION_TREE *parent = ptree->parent; - set_chroma_ref_info( - xd->tree_type, mi_row, mi_col, ptree->index, bsize, - &ptree->chroma_ref_info, parent ? &parent->chroma_ref_info : NULL, - parent ? parent->bsize : BLOCK_INVALID, - parent ? 
parent->partition : PARTITION_NONE, ss_x, ss_y); +#if DEBUG_BLOCK_INFO + const char *const names[] = { + [SHARED_PART] = "yuv", + [LUMA_PART] = "y", + [CHROMA_PART] = "uv", + }; + DEBUG_BLOCK_printf("%*sdecode_sb[y=%d,x=%d,bs=%dx%d,plane=%s]: r=%d\n", + depth - 1, "", mi_row, mi_col, 4 * mi_size_wide[bsize], + 4 * mi_size_high[bsize], + names[ptree->chroma_ref_info.is_chroma_ref ? + xd->tree_type : LUMA_PART], reader->ec.rng); +#endif partition = !is_partition_point(bsize) ? PARTITION_NONE : read_partition(cm, xd, mi_row, mi_col, reader, has_rows, has_cols, ptree, ptree_luma, bsize); +#if DEBUG_BLOCK_INFO + static const char *const pnames[] = { + [PARTITION_NONE]="none", + [PARTITION_HORZ_4A]="h4a", + [PARTITION_HORZ_4B]="h4b", + [PARTITION_VERT_4A]="v4a", + [PARTITION_VERT_4B]="v4b", + [PARTITION_SPLIT]="s", + [PARTITION_HORZ]="h", + [PARTITION_VERT]="v", + [PARTITION_HORZ_3]="h3", + [PARTITION_VERT_3]="v3", + }; + DEBUG_BLOCK_printf("%*sread_partition[y=%d,x=%d,bs=%dx%d,bp=%d|%s]: r=%d\n", + depth, "", mi_row, mi_col, 4 * mi_size_wide[bsize], + 4 * mi_size_high[bsize], partition, + pnames[partition], reader->ec.rng); +#endif ptree->is_cfl_allowed_for_this_chroma_partition |= is_cfl_allowed_for_sdp(cm, xd, ptree_luma, partition, bsize); CFL_ALLOWED_FOR_SDP_TYPE is_cfl_allowed_in_sdp = @@ -2009,6 +2118,8 @@ static AVM_INLINE void decode_partition( ptree->region_type = avm_read_symbol(reader, xd->tile_ctx->region_type_cdf[ctx], REGION_TYPES, ACCT_INFO("region_type")); + DEBUG_BLOCK_printf("%*sis_mixed_region[ctx=%d,%d]: r=%d\n", + depth, "", ctx, ptree->region_type, reader->ec.rng); if (ptree->region_type == INTRA_REGION) xd->tree_type = LUMA_PART; } else if (!frame_is_intra_only(cm)) { ptree->region_type = parent->region_type; @@ -2149,11 +2260,13 @@ static AVM_INLINE void decode_partition( #define DEC_BLOCK_STX_ARG #define DEC_BLOCK_EPT_ARG partition, #define DEC_BLOCK(db_r, db_c, db_subsize, index) \ - block_visit[parse_decode_flag](pbi, td, 
DEC_BLOCK_STX_ARG(db_r), (db_c), \ + block_visit[parse_decode_flag](DB_ONLY(depth + 1) \ + pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \ reader, DEC_BLOCK_EPT_ARG(db_subsize), ptree, \ index) #define DEC_PARTITION(db_r, db_c, db_subsize, index) \ - decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ + decode_partition(DB_ONLY(depth + 1) \ + pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ (db_subsize), sbi, ptree->sub_tree[(index)], \ get_partition_subtree_const(ptree_luma, index), \ parse_decode_flag) @@ -2313,7 +2426,7 @@ static AVM_INLINE void decode_partition_sb(AV2Decoder *const pbi, av2_reset_ptree_in_sbi(xd->sbi, xd->tree_type); if (is_intra_sdp_enabled) av2_reset_ptree_in_sbi(xd->sbi, CHROMA_PART); } - decode_partition( + decode_partition(DB_ONLY(1) pbi, td, mi_row, mi_col, reader, bsize, xd->sbi, td->dcb.xd.sbi->ptree_root[av2_get_sdp_idx(xd->tree_type)], (is_intra_sdp_enabled ? td->dcb.xd.sbi->ptree_root[1] : NULL), @@ -4842,6 +4955,17 @@ static const uint8_t *decode_tiles(AV2Decoder *pbi, const uint8_t *data, td->dcb.xd.tile_ctx = &tile_data->tctx; // decode tile +#if DEBUG_BLOCK_INFO + TileInfo tile_info; + av2_tile_set_row(&tile_info, cm, tile_row); + av2_tile_set_col(&tile_info, cm, tile_col); + const int mi_row = tile_info.mi_row_start, mi_col = tile_info.mi_col_start; + DEBUG_BLOCK_printf("decode_tile[tilerow=%d/col=%d,y=%d-%d,x=%d-%d,size=%td]: r=%d\n", + tile_row, tile_col, + tile_info.mi_row_start, tile_info.mi_row_end, + tile_info.mi_col_start, tile_info.mi_col_end, + tile_bs_buf->size, td->bit_reader->ec.rng); +#endif decode_tile(pbi, td, row, col); avm_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted); if (pbi->dcb.corrupted) diff --git a/av2/decoder/decodemv.c b/av2/decoder/decodemv.c index 183ff93a8e..66accc3268 100644 --- a/av2/decoder/decodemv.c +++ b/av2/decoder/decodemv.c @@ -42,7 +42,8 @@ #define DEC_MISMATCH_DEBUG 0 -void read_gdf(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { +void 
read_gdf(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { if (!is_allow_gdf(cm)) return; if ((cm->gdf_info.gdf_mode < 2) || (cm->gdf_info.gdf_block_num <= 1)) return; if ((xd->mi_row % cm->mib_size != 0) || (xd->mi_col % cm->mib_size != 0)) @@ -66,12 +67,17 @@ void read_gdf(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { if (blk_idx >= 0) { cm->gdf_info.gdf_block_flags[blk_idx] = avm_read_symbol( r, xd->tile_ctx->gdf_cdf, 2, ACCT_INFO("gdf_onoff")); + DEBUG_BLOCK_printf("%*sPost-gdf[y=%d,x=%d,gdf=%d]: r=%d\n", + depth, "", mi_row, mi_col, + cm->gdf_info.gdf_block_flags[blk_idx], + r->ec.rng); } } } } -void read_cdef(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { +void read_cdef(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { assert(xd->tree_type != CHROMA_PART); const int skip_txfm = xd->mi[0]->skip_txfm[0]; if (cm->features.coded_lossless) return; @@ -120,6 +126,9 @@ void read_cdef(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { 1; } } + DEBUG_BLOCK_printf("%*sPost-cdef_idx[ctx=%d,%d]: r=%d\n", + depth, "", cdef_strength_index0_ctx, + mbmi->cdef_strength, r->ec.rng); } xd->cdef_transmitted[index] = true; } else { @@ -158,7 +167,8 @@ static void span_ccso(AV2_COMMON *cm, MACROBLOCKD *const xd, int pli, } } -void read_ccso(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { +void read_ccso(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { if (cm->features.coded_lossless) return; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_row = xd->mi_row; @@ -181,6 +191,8 @@ void read_ccso(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { const int ccso_ctx = av2_get_ccso_context(cm, xd, 0); blk_idc = avm_read_symbol(r, xd->tile_ctx->ccso_cdf[0][ccso_ctx], 2, ACCT_INFO("blk_idc")); + DEBUG_BLOCK_printf("%*sPost-ccso[pl=y,ctx=%d,%d]: r=%d\n", + depth, "", ccso_ctx, blk_idc, r->ec.rng); } else { CcsoInfo 
*ref_frame_ccso_info = &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[0])->ccso_info; @@ -216,6 +228,8 @@ void read_ccso(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { const int ccso_ctx = av2_get_ccso_context(cm, xd, 1); blk_idc = avm_read_symbol(r, xd->tile_ctx->ccso_cdf[1][ccso_ctx], 2, ACCT_INFO("blk_idc")); + DEBUG_BLOCK_printf("%*sPost-ccso[pl=u,ctx=%d,%d]: r=%d\n", + depth, "", ccso_ctx, blk_idc, r->ec.rng); } else { CcsoInfo *ref_frame_ccso_info = &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[1])->ccso_info; @@ -251,6 +265,8 @@ void read_ccso(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd) { const int ccso_ctx = av2_get_ccso_context(cm, xd, 2); blk_idc = avm_read_symbol(r, xd->tile_ctx->ccso_cdf[2][ccso_ctx], 2, ACCT_INFO("blk_idc")); + DEBUG_BLOCK_printf("%*sPost-ccso[pl=v,ctx=%d,%d]: r=%d\n", + depth, "", ccso_ctx, blk_idc, r->ec.rng); } else { CcsoInfo *ref_frame_ccso_info = &get_ref_frame_buf(cm, cm->ccso_info.ccso_ref_idx[2])->ccso_info; @@ -305,17 +321,28 @@ static int read_delta_qindex(AV2_COMMON *cm, const MACROBLOCKD *xd, return reduced_delta_qindex; } -static uint8_t read_mrl_index(FRAME_CONTEXT *ec_ctx, avm_reader *r, +static uint8_t read_mrl_index(DB_ONLY(const int depth, + const AV2_COMMON *const cm, + const MACROBLOCKD *xd) + FRAME_CONTEXT *ec_ctx, avm_reader *r, const MB_MODE_INFO *neighbor0, const MB_MODE_INFO *neighbor1) { int ctx = get_mrl_index_ctx(neighbor0, neighbor1); avm_cdf_prob *mrl_cdf = ec_ctx->mrl_index_cdf[ctx]; const uint8_t mrl_index = avm_read_symbol(r, mrl_cdf, MRL_LINE_NUMBER, ACCT_INFO()); +#if DEBUG_BLOCK_INFO + int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-mrl_index[ctx=%d,%d]: r=%d\n", + depth, "", ctx, mrl_index, r->ec.rng); +#endif return mrl_index; } -static bool read_multi_line_mrl(FRAME_CONTEXT *ec_ctx, avm_reader *r, +static bool read_multi_line_mrl(DB_ONLY(const int depth, + const AV2_COMMON *const cm, + const MACROBLOCKD *xd) + FRAME_CONTEXT *ec_ctx, avm_reader 
*r, const MB_MODE_INFO *neighbor0, const MB_MODE_INFO *neighbor1) { int multi_line_mrl_ctx = get_multi_line_mrl_index_ctx(neighbor0, neighbor1); @@ -323,6 +350,11 @@ static bool read_multi_line_mrl(FRAME_CONTEXT *ec_ctx, avm_reader *r, ec_ctx->multi_line_mrl_cdf[multi_line_mrl_ctx]; const bool multi_line_mrl = avm_read_symbol(r, multi_line_mrl_cdf, 2, ACCT_INFO()); +#if DEBUG_BLOCK_INFO + int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-multi_line_mrl[ctx=%d,%d]: r=%d\n", + depth, "", multi_line_mrl_ctx, multi_line_mrl, r->ec.rng); +#endif return multi_line_mrl; } @@ -404,7 +436,9 @@ static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, avm_reader *r, return ii_mode; } -static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, avm_reader *r, +static PREDICTION_MODE read_inter_mode(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + FRAME_CONTEXT *ec_ctx, avm_reader *r, int16_t ctx, const AV2_COMMON *const cm, const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, @@ -412,6 +446,9 @@ static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, avm_reader *r, if (is_tip_ref_frame(mbmi->ref_frame[0])) { const int tip_pred_index = avm_read_symbol( r, ec_ctx->tip_pred_mode_cdf, TIP_PRED_MODES, ACCT_INFO("tip_mode")); + DEBUG_BLOCK_printf("%*sPost-tip_mode[%d]: r=%d\n", + depth, "", tip_pred_index_to_mode[tip_pred_index], + r->ec.rng); return tip_pred_index_to_mode[tip_pred_index]; } @@ -424,17 +461,25 @@ static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, avm_reader *r, if (is_warp_newmv_allowed(cm, xd, mbmi, bsize)) { const int is_warpmv = avm_read_symbol( r, ec_ctx->is_warpmv_or_warp_newmv_cdf, 2, ACCT_INFO("is_warpmv")); + DEBUG_BLOCK_printf("%*sPost-single_inter_mode[ctx=%d,%d]: r=%d\n", + depth, "", ctx, + is_warpmv ? WARPMV : WARP_NEWMV, r->ec.rng); return is_warpmv ? 
WARPMV : WARP_NEWMV; } else { + DEBUG_BLOCK_printf("%*sPost-single_inter_mode[ctx=%d,%d]: r=%d\n", + depth, "", ctx, WARPMV, r->ec.rng); return WARPMV; } } } const int16_t ismode_ctx = inter_single_mode_ctx(ctx); - return SINGLE_INTER_MODE_START + + const int m = SINGLE_INTER_MODE_START + avm_read_symbol(r, ec_ctx->inter_single_mode_cdf[ismode_ctx], INTER_SINGLE_MODES, ACCT_INFO("inter_single_mode")); + DEBUG_BLOCK_printf("%*sPost-single_inter_mode[ctx=%d,%d]: r=%d\n", + depth, "", ismode_ctx, m, r->ec.rng); + return m; } static void read_drl_idx(int max_drl_bits, const int16_t mode_ctx, @@ -488,7 +533,9 @@ static int8_t read_wedge_mode(avm_reader *r, FRAME_CONTEXT *ec_ctx, } // read the reference index warp_ref_idx of WRL -static void read_warp_ref_idx(FRAME_CONTEXT *ec_ctx, MB_MODE_INFO *mbmi, +static void read_warp_ref_idx(DB_ONLY(const int depth, AV2_COMMON *cm, + const int mi_row, const int mi_col) + FRAME_CONTEXT *ec_ctx, MB_MODE_INFO *mbmi, avm_reader *r) { if (mbmi->max_num_warp_candidates <= 1) { mbmi->warp_ref_idx = 0; @@ -502,6 +549,9 @@ static void read_warp_ref_idx(FRAME_CONTEXT *ec_ctx, MB_MODE_INFO *mbmi, mbmi->warp_ref_idx = bit_idx + warp_idx; if (!warp_idx) break; } + DEBUG_BLOCK_printf("%*sPost-warp_ref_idx[%d/%d]: r=%d\n", + depth, "", mbmi->warp_ref_idx, mbmi->max_num_warp_candidates, + r->ec.rng); } static void read_warpmv_with_mvd_flag(FRAME_CONTEXT *ec_ctx, MB_MODE_INFO *mbmi, @@ -540,7 +590,8 @@ static int read_warp_delta_param(const MACROBLOCKD *xd, int index, return coded_value; } -static void read_warp_delta(const AV2_COMMON *cm, const MACROBLOCKD *xd, +static void read_warp_delta(DB_ONLY(const int depth) + const AV2_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mbmi, avm_reader *r, WARP_CANDIDATE *warp_param_stack) { WarpedMotionParams *params = &mbmi->wm_params[0]; @@ -596,6 +647,11 @@ static void read_warp_delta(const AV2_COMMON *cm, const MACROBLOCKD *xd, params->wmmat[4] = -params->wmmat[3]; params->wmmat[5] = params->wmmat[2]; } 
+ DEBUG_BLOCK_printf("%*sPost-warp_param_signal[%d,%d,%d,%d]: r=%d\n", + depth, "", decoded_delta_param[2], decoded_delta_param[3], + mbmi->six_param_warp_model_flag ? decoded_delta_param[4] : 0, + mbmi->six_param_warp_model_flag ? decoded_delta_param[5] : 0, + r->ec.rng); } else { *params = base_params; assert(mbmi->six_param_warp_model_flag == 0); @@ -611,7 +667,9 @@ static void read_warp_delta(const AV2_COMMON *cm, const MACROBLOCKD *xd, assign_warpmv(cm, xd->submi, bsize, params, mi_row, mi_col, 0); } -static MOTION_MODE read_motion_mode(AV2_COMMON *cm, MACROBLOCKD *xd, +static MOTION_MODE read_motion_mode(DB_ONLY(const int depth, const int mi_row, + const int mi_col) + AV2_COMMON *cm, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, avm_reader *r) { const BLOCK_SIZE bsize = mbmi->sb_type[PLANE_TYPE_Y]; mbmi->max_num_warp_candidates = 0; @@ -631,33 +689,48 @@ static MOTION_MODE read_motion_mode(AV2_COMMON *cm, MACROBLOCKD *xd, (allowed_motion_modes & (1 << WARP_DELTA)))) return WARP_EXTEND; + MOTION_MODE mm; + int signal_coded = 0; if (allowed_motion_modes & (1 << WARP_EXTEND)) { const int ctx = av2_get_warp_extend_ctx(xd); const int use_warp_extend = avm_read_symbol(r, xd->tile_ctx->warp_extend_cdf[ctx], 2, ACCT_INFO("use_warp_extend")); + signal_coded = 1; if (use_warp_extend) { - return WARP_EXTEND; + mm = WARP_EXTEND; + goto end; } } - if (!(allowed_motion_modes & (1 << WARP_DELTA))) return WARP_CAUSAL; + if (!(allowed_motion_modes & (1 << WARP_DELTA))) { + mm = WARP_CAUSAL; + goto end; + } if (allowed_motion_modes & (1 << WARP_CAUSAL)) { const int ctx = av2_get_warp_causal_ctx(xd); const int use_warp_causal = avm_read_symbol(r, xd->tile_ctx->warp_causal_cdf[ctx], 2, ACCT_INFO("use_warp_causal")); + signal_coded = 1; if (use_warp_causal) { - return WARP_CAUSAL; + mm = WARP_CAUSAL; + goto end; } } - return WARP_DELTA; + mm = WARP_DELTA; + end: + if (signal_coded) + DEBUG_BLOCK_printf("%*sPost-sngl_newmv_warp[%d]: r=%d\n", + depth, "", mm, r->ec.rng); + return mm; } 
mbmi->use_wedge_interintra = 0; if (allowed_motion_modes & (1 << INTERINTRA)) { + MOTION_MODE mm = SIMPLE_TRANSLATION; const int bsize_group = size_group_lookup[bsize]; const int use_interintra = avm_read_symbol(r, xd->tile_ctx->interintra_cdf[bsize_group], 2, @@ -686,24 +759,38 @@ static MOTION_MODE read_motion_mode(AV2_COMMON *cm, MACROBLOCKD *xd, mbmi->wedge_boundary_index = get_wedge_boundary_type(bsize); } } - return INTERINTRA; + mm = INTERINTRA; } + DEBUG_BLOCK_printf("%*sPost-interintra[%d,%d,%d]: r=%d\n", + depth, "", mm, + mm != INTERINTRA ? -1 : mbmi->interintra_mode, + mm == INTERINTRA && mbmi->use_wedge_interintra ? + mbmi->interintra_wedge_index : -1, r->ec.rng); + return mm; } if (allowed_motion_modes & (1 << WARP_CAUSAL)) { const int ctx = av2_get_warp_causal_ctx(xd); const int use_warp_causal = avm_read_symbol( r, xd->tile_ctx->warp_causal_cdf[ctx], 2, ACCT_INFO("use_warp_causal")); + MOTION_MODE mm = SIMPLE_TRANSLATION; if (use_warp_causal) { - return WARP_CAUSAL; + mm = WARP_CAUSAL; } + DEBUG_BLOCK_printf("%*sPost-comp_newmv_warp[%d]: r=%d\n", + depth, "", mm, r->ec.rng); + return mm; } return SIMPLE_TRANSLATION; } // Read scale mode flag for joint mvd coding mode -static PREDICTION_MODE read_jmvd_scale_mode(MACROBLOCKD *xd, avm_reader *r, +static PREDICTION_MODE read_jmvd_scale_mode(DB_ONLY(const int depth, + AV2_COMMON *cm, + const int mi_row, + const int mi_col) + MACROBLOCKD *xd, avm_reader *r, MB_MODE_INFO *const mbmi) { if (!is_joint_mvd_coding_mode(mbmi->mode)) return 0; const int is_joint_amvd_mode = @@ -715,6 +802,8 @@ static PREDICTION_MODE read_jmvd_scale_mode(MACROBLOCKD *xd, avm_reader *r, : JOINT_NEWMV_SCALE_FACTOR_CNT; const int jmvd_scale_mode = avm_read_symbol( r, jmvd_scale_mode_cdf, jmvd_scale_cnt, ACCT_INFO("jmvd_scale_mode")); + DEBUG_BLOCK_printf("%*sPost-jmvd_scale_mode[%d]: r=%d\n", + depth, "", jmvd_scale_mode, r->ec.rng); return jmvd_scale_mode; } @@ -737,7 +826,10 @@ static int read_cwp_idx(MACROBLOCKD *xd, avm_reader 
*r, const AV2_COMMON *cm, return get_cwp_coding_idx(cwp_idx, 0, cm, mbmi); } -static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, avm_reader *r, +static PREDICTION_MODE read_inter_compound_mode(DB_ONLY(const int depth, + const int mi_row, + const int mi_col) + MACROBLOCKD *xd, avm_reader *r, const AV2_COMMON *cm, MB_MODE_INFO *const mbmi, int16_t ctx) { @@ -780,9 +872,14 @@ static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, avm_reader *r, } if (use_optical_flow) { assert(is_inter_compound_mode(comp_idx_to_opfl_mode[mode])); + DEBUG_BLOCK_printf("%*sPost-comp_inter_mode[ctx=%d,%d]: r=%d\n", + depth, "", ctx, + comp_idx_to_opfl_mode[mode], r->ec.rng); return comp_idx_to_opfl_mode[mode]; } } + DEBUG_BLOCK_printf("%*sPost-comp_inter_mode[ctx=%d,%d]: r=%d\n", + depth, "", ctx, NEAR_NEARMV + mode, r->ec.rng); assert(is_inter_compound_mode(NEAR_NEARMV + mode)); return NEAR_NEARMV + mode; @@ -810,7 +907,8 @@ int av2_neg_deinterleave(int diff, int ref, int max) { } } -static int read_segment_id(AV2_COMMON *const cm, const MACROBLOCKD *const xd, +static int read_segment_id(DB_ONLY(const int depth) + AV2_COMMON *const cm, const MACROBLOCKD *const xd, avm_reader *r, int skip) { int cdf_num; const int pred = av2_get_spatial_seg_pred(cm, xd, &cdf_num, 0); @@ -844,6 +942,11 @@ static int read_segment_id(AV2_COMMON *const cm, const MACROBLOCKD *const xd, const int segment_id = av2_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-segid[%d]: r=%d\n", + depth, "", segment_id, r->ec.rng); +#endif if (segment_id < 0 || segment_id > seg->last_active_segid) { avm_internal_error(xd->error_info, AVM_CODEC_CORRUPT_FRAME, "Corrupted segment_ids"); @@ -875,7 +978,8 @@ static void set_segment_id(AV2_COMMON *cm, int mi_offset, int x_inside_boundary, segment_id; } -static int read_intra_segment_id(AV2_COMMON *const cm, +static int 
read_intra_segment_id(DB_ONLY(const int depth) + AV2_COMMON *const cm, const MACROBLOCKD *const xd, BLOCK_SIZE bsize, avm_reader *r, int skip) { struct segmentation *const seg = &cm->seg; @@ -890,7 +994,7 @@ static int read_intra_segment_id(AV2_COMMON *const cm, const int bh = mi_size_high[bsize]; const int x_inside_boundary = AVMMIN(mi_params->mi_cols - mi_col, bw); const int y_inside_boundary = AVMMIN(mi_params->mi_rows - mi_row, bh); - const int segment_id = read_segment_id(cm, xd, r, skip); + const int segment_id = read_segment_id(DB_ONLY(depth) cm, xd, r, skip); set_segment_id(cm, mi_offset, x_inside_boundary, y_inside_boundary, segment_id); return segment_id; @@ -917,7 +1021,8 @@ static int get_predicted_segment_id(AV2_COMMON *const cm, int mi_offset, : 0; } -static int read_inter_segment_id(AV2_COMMON *const cm, MACROBLOCKD *const xd, +static int read_inter_segment_id(DB_ONLY(const int depth) + AV2_COMMON *const cm, MACROBLOCKD *const xd, int preskip, avm_reader *r) { struct segmentation *const seg = &cm->seg; const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -949,9 +1054,11 @@ static int read_inter_segment_id(AV2_COMMON *const cm, MACROBLOCKD *const xd, if (seg->temporal_update) { mbmi->seg_id_predicted = 0; } - segment_id = read_segment_id(cm, xd, r, 1); + segment_id = read_segment_id(DB_ONLY(depth) cm, xd, r, 1); set_segment_id(cm, mi_offset, x_inside_boundary, y_inside_boundary, segment_id); + DEBUG_BLOCK_printf("%*sPost-segid[%d]: r=%d\n", + depth, "", segment_id, r->ec.rng); return segment_id; } } @@ -966,18 +1073,22 @@ static int read_inter_segment_id(AV2_COMMON *const cm, MACROBLOCKD *const xd, if (mbmi->seg_id_predicted) { segment_id = get_predicted_segment_id(cm, mi_offset, x_inside_boundary, y_inside_boundary); + DEBUG_BLOCK_printf("%*sPost-segid[%d]: r=%d\n", + depth, "", segment_id, r->ec.rng); } else { - segment_id = read_segment_id(cm, xd, r, 0); + segment_id = read_segment_id(DB_ONLY(depth) cm, xd, r, 0); } } else { - 
segment_id = read_segment_id(cm, xd, r, 0); + segment_id = read_segment_id(DB_ONLY(depth) cm, xd, r, 0); } set_segment_id(cm, mi_offset, x_inside_boundary, y_inside_boundary, segment_id); return segment_id; } -static int read_skip_mode(AV2_COMMON *cm, const MACROBLOCKD *xd, +static int read_skip_mode(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + AV2_COMMON *cm, const MACROBLOCKD *xd, avm_reader *r) { if (!is_skip_mode_allowed(cm, xd)) return 0; @@ -985,10 +1096,14 @@ static int read_skip_mode(AV2_COMMON *cm, const MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int skip_mode = avm_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_INFO("skip_mode")); + DEBUG_BLOCK_printf("%*sPost-skip_mode[ctx=%d,%d]: r=%d\n", + depth, "", ctx, skip_mode, r->ec.rng); return skip_mode; } -static int read_skip_txfm(AV2_COMMON *cm, const MACROBLOCKD *xd, int segment_id, +static int read_skip_txfm(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + AV2_COMMON *cm, const MACROBLOCKD *xd, int segment_id, avm_reader *r) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; @@ -997,20 +1112,36 @@ static int read_skip_txfm(AV2_COMMON *cm, const MACROBLOCKD *xd, int segment_id, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int skip_txfm = avm_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_INFO("skip_txfm")); + DEBUG_BLOCK_printf("%*sPost-skip_txfm[ctx=%d,%d]: r=%d\n", + depth, "", ctx, skip_txfm, r->ec.rng); return skip_txfm; } } -static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, +static void read_palette_colors_y(DB_ONLY(AV2_COMMON *cm, const int depth) + MACROBLOCKD *const xd, int bit_depth, PALETTE_MODE_INFO *const pmi, avm_reader *r) { uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av2_get_palette_cache(xd, 0, color_cache); const int n = pmi->palette_size[0]; int idx = 0; - for (int i = 0; i < n_cache && idx < n; ++i) { - if (avm_read_bit(r, ACCT_INFO("color_cache"))) +#if 
DEBUG_BLOCK_INFO + unsigned cache_use_mask = 0, n_cache_bits = 0; +#endif + for (int i = 0; i < n_cache && idx < n; DB_ONLY(n_cache_bits++) ++i) { +#if DEBUG_BLOCK_INFO + cache_use_mask <<= 1; +#endif + if (avm_read_bit(r, ACCT_INFO("color_cache"))) { pmi->palette_colors[idx++] = color_cache[i]; +#if DEBUG_BLOCK_INFO + cache_use_mask |= 1; +#endif + } } +#if DEBUG_BLOCK_INFO + const int n_used_cache = idx; +#endif if (idx < n) { pmi->palette_colors[idx++] = avm_read_literal(r, bit_depth, ACCT_INFO("palette_colors")); @@ -1038,14 +1169,51 @@ static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, } } } +#if DEBUG_BLOCK_INFO +#define bitmask(x) \ + ((uint64_t) ((x) & 0x8000) << 45) | \ + ((uint64_t) ((x) & 0x4000) << 42) | \ + ((uint64_t) ((x) & 0x2000) << 39) | \ + ((uint64_t) ((x) & 0x1000) << 36) | \ + ((uint64_t) ((x) & 0x800) << 33) | \ + ((uint64_t) ((x) & 0x400) << 30) | \ + ((uint64_t) ((x) & 0x200) << 27) | \ + ((uint64_t) ((x) & 0x100) << 24) | \ + ((uint64_t) ((x) & 0x80) << 21) | \ + ((uint64_t) ((x) & 0x40) << 18) | \ + ((uint64_t) ((x) & 0x20) << 15) | \ + ((uint64_t) ((x) & 0x10) << 12) | \ + ((uint64_t) ((x) & 0x8) << 9) | \ + ((uint64_t) ((x) & 0x4) << 6) | \ + ((uint64_t) ((x) & 0x2) << 3) | \ + ((uint64_t) ((x) & 0x1) << 0) + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + if (BLOCK_TO_DEBUG) { + printf("%*sPost-ypal[sz=%d,cache_sz=%d,mask=%0*"PRIx64"|%d]: r=%d, cache=", + depth, "", n, n_cache, n_cache_bits, bitmask(cache_use_mask), + n_used_cache, r->ec.rng); + for (int i = 0; i < n_cache; i++) + printf("%c%0*x", i ? ',' : '[', 2 + (cm->seq_params.bit_depth > 8), + color_cache[i]); + printf("%s, pal=", n_cache ? "]" : "[]"); + for (int i = 0; i < n; i++) + printf("%c%0*x", i ? 
',' : '[', 2 + (cm->seq_params.bit_depth > 8), + pmi->palette_colors[i]); + printf("]\n"); + } +#endif } -static void read_palette_mode_info(AV2_COMMON *const cm, MACROBLOCKD *const xd, +static void read_palette_mode_info(DB_ONLY(const int depth) + AV2_COMMON *const cm, MACROBLOCKD *const xd, avm_reader *r) { MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->sb_type[xd->tree_type == CHROMA_PART]; assert(av2_allow_palette(PLANE_TYPE_Y, cm->features.allow_screen_content_tools, bsize)); +#if DEBUG_BLOCK_INFO + int mi_row = xd->mi_row, mi_col = xd->mi_col; +#endif (void)bsize; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; if (mbmi->mode == DC_PRED && xd->tree_type != CHROMA_PART) { @@ -1056,12 +1224,16 @@ static void read_palette_mode_info(AV2_COMMON *const cm, MACROBLOCKD *const xd, avm_read_symbol(r, xd->tile_ctx->palette_y_size_cdf, PALETTE_SIZES, ACCT_INFO("palette_size", "luma")) + 2; - read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); + read_palette_colors_y(DB_ONLY(cm, depth) xd, cm->seq_params.bit_depth, pmi, r); + } else { + DEBUG_BLOCK_printf("%*sPost-ypal[%d]: r=%d\n", + depth, "", modev, r->ec.rng); } } } -static void read_intra_dip_mode_info(const AV2_COMMON *const cm, +static void read_intra_dip_mode_info(DB_ONLY(const int depth) + const AV2_COMMON *const cm, MACROBLOCKD *const xd, avm_reader *r) { MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->use_intra_dip = 0; @@ -1084,6 +1256,12 @@ static void read_intra_dip_mode_info(const AV2_COMMON *const cm, mbmi->intra_dip_mode += avm_read_symbol(r, mode_cdf, n_modes, ACCT_INFO("intra_dip_mode_n6")); } +#if DEBUG_BLOCK_INFO + int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-dip[ctx=%d,%d,tp=%d,mode=%d]: r=%d\n", + depth, "", ctx, mbmi->use_intra_dip, mbmi->intra_dip_mode >> 4, + mbmi->intra_dip_mode & 7, r->ec.rng); +#endif } } @@ -1217,7 +1395,8 @@ void av2_read_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, } } -void 
av2_read_cctx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, +void av2_read_cctx_type(DB_ONLY(const int depth) + const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, avm_reader *r) { MB_MODE_INFO *mbmi = xd->mi[0]; @@ -1241,12 +1420,17 @@ void av2_read_cctx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, (void)tx_size; cctx_type = avm_read_symbol(r, ec_ctx->cctx_type_cdf, CCTX_TYPES, ACCT_INFO("cctx_type")); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row + blk_row, mi_col = xd->mi_col + blk_col, plane = 1; +#endif + DEBUG_CF_printf("%*sPost-cctx[%d]: r=%d\n", + depth, "", cctx_type, r->ec.rng); update_cctx_array(xd, blk_row, blk_col, row_offset, col_offset, tx_size, cctx_type); } // This function reads a 'secondary tx set' from the bitstream -static void read_secondary_tx_set(MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx, +static TX_TYPE read_secondary_tx_set(MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx, avm_reader *r, MB_MODE_INFO *mbmi, TX_SIZE tx_size, TX_TYPE *tx_type) { const int inter_block = is_inter_block(mbmi, xd->tree_type); @@ -1275,9 +1459,11 @@ static void read_secondary_tx_set(MACROBLOCKD *xd, FRAME_CONTEXT *ec_ctx, } if (get_primary_tx_type(*tx_type) == ADST_ADST) stx_set_flag += IST_SET_SIZE; set_secondary_tx_set(tx_type, stx_set_flag); + return stx_set_flag; } -void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, +void av2_read_sec_tx_type(DB_ONLY(const int depth) + const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, uint16_t *eob, avm_reader *r) { MB_MODE_INFO *mbmi = xd->mi[0]; @@ -1293,6 +1479,11 @@ void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, if (xd->lossless[mbmi->segment_id]) return; const int inter_block = is_inter_block(mbmi, xd->tree_type); +#if DEBUG_BLOCK_INFO + TX_TYPE stx_set = DC_PRED; + const int mi_row = xd->mi_row + blk_row, mi_col = xd->mi_col + blk_col; + const int plane = 0 /* stx implies Y */; +#endif if 
(get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) > 1) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -1303,6 +1494,9 @@ void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, STX_TYPES, ACCT_INFO("stx_flag")); *tx_type |= (stx_flag << PRIMARY_TX_BITS); if (stx_flag > 0) +#if DEBUG_BLOCK_INFO + stx_set = +#endif read_secondary_tx_set(xd, ec_ctx, r, mbmi, tx_size, tx_type); #if STX_SYNTAX_DEBUG const int sb_size = @@ -1314,6 +1508,8 @@ void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, tx_size_high[tx_size], *eob, get_primary_tx_type(*tx_type), stx_flag, get_secondary_tx_set(*tx_type)); #endif // STX_SYNTAX_DEBUG + DEBUG_CF_printf("%*sPost-stx[type=%d,set=%d]: r=%d\n", + depth, "", stx_flag, stx_set, r->ec.rng); } } else { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -1324,6 +1520,9 @@ void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, STX_TYPES, ACCT_INFO("stx_flag")); *tx_type |= (stx_flag << PRIMARY_TX_BITS); if (stx_flag > 0) +#if DEBUG_BLOCK_INFO + stx_set = +#endif read_secondary_tx_set(xd, ec_ctx, r, mbmi, tx_size, tx_type); #if STX_SYNTAX_DEBUG const int sb_size = @@ -1335,6 +1534,8 @@ void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, tx_size_high[tx_size], *eob, get_primary_tx_type(*tx_type), stx_flag, get_secondary_tx_set(*tx_type)); #endif // STX_SYNTAX_DEBUG + DEBUG_CF_printf("%*sPost-stx[type=%d,set=%d]: r=%d\n", + depth, "", stx_flag, stx_set, r->ec.rng); } } } @@ -1354,7 +1555,8 @@ static INLINE void mv_clamp_to_integer(MV *mv) { } } -static INLINE int assign_dv(AV2_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, +static INLINE int assign_dv(DB_ONLY(const int depth) + AV2_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, const int_mv *ref_mv, int mi_row, int mi_col, BLOCK_SIZE bsize, avm_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -1376,6 +1578,8 @@ static INLINE int assign_dv(AV2_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, if (sign) mv_diff.col = -mv_diff.col; } + 
DEBUG_BLOCK_printf("%*sPost-mvdiff[y:%d,x:%d]: r=%d\n", + depth, "", mv_diff.row, mv_diff.col, r->ec.rng); MV low_prec_refmv = ref_mv->as_mv; if (mbmi->pb_mv_precision < MV_PRECISION_HALF_PEL) lower_mv_precision(&low_prec_refmv, mbmi->pb_mv_precision); @@ -1405,7 +1609,8 @@ static void read_intrabc_drl_idx(int max_ref_bv_cnt, MB_MODE_INFO *mbmi, assert(mbmi->intrabc_drl_idx < max_ref_bv_cnt); } -static void read_intrabc_info(AV2_COMMON *const cm, DecoderCodingBlock *dcb, +static void read_intrabc_info(DB_ONLY(const int depth) + AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *r) { MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -1437,7 +1642,8 @@ static void read_intrabc_info(AV2_COMMON *const cm, DecoderCodingBlock *dcb, // ref_mvs int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; - av2_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count, + av2_find_mv_refs(DB_ONLY(0) + cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count, xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, NULL, 0, NULL); @@ -1458,7 +1664,8 @@ static void read_intrabc_info(AV2_COMMON *const cm, DecoderCodingBlock *dcb, mbmi->pb_mv_precision = av2_intraBc_precision_sets.precision[index]; } - valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, + valid_dv = valid_dv && assign_dv(DB_ONLY(depth) + cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, xd->mi_col, bsize, r); if (!valid_dv) { // Intra bc motion vectors are not valid - signal corrupt frame @@ -1469,8 +1676,9 @@ static void read_intrabc_info(AV2_COMMON *const cm, DecoderCodingBlock *dcb, assert(is_this_mv_precision_compliant(mbmi->mv[0].as_mv, mbmi->pb_mv_precision)); + int morph_pred_ctx = -1; if (av2_allow_intrabc_morph_pred(cm)) { - const int morph_pred_ctx = get_morph_pred_ctx(xd); + morph_pred_ctx = get_morph_pred_ctx(xd); mbmi->morph_pred = avm_read_symbol( r, ec_ctx->morph_pred_cdf[morph_pred_ctx], 2, ACCT_INFO()); if (mbmi->morph_pred != 0) { @@ -1484,35 +1692,61 @@ 
static void read_intrabc_info(AV2_COMMON *const cm, DecoderCodingBlock *dcb, "Invalid intrabc BAWP dv"); } } +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; +#endif + DEBUG_BLOCK_printf("%*sPost-intrabc_info[mode=%d,drl=%d," + "prec=%d,morphctx=%d,morph=%d]: r=%d\n", + depth, "", mbmi->intrabc_mode, + mbmi->intrabc_drl_idx, + mbmi->pb_mv_precision == MV_PRECISION_QTR_PEL, + morph_pred_ctx, morph_pred_ctx == -1 ? 0 : mbmi->morph_pred, + r->ec.rng); } } // If delta q is present, reads delta_q index. // Also reads delta_q loop filter levels, if present. -static void read_delta_q_params(AV2_COMMON *const cm, MACROBLOCKD *const xd, +static void read_delta_q_params(DB_ONLY(const int depth) + AV2_COMMON *const cm, MACROBLOCKD *const xd, avm_reader *r) { DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { MB_MODE_INFO *const mbmi = xd->mi[0]; - xd->current_base_qindex += - read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res; + const int delta_q_val = read_delta_qindex(cm, xd, r, mbmi); + xd->current_base_qindex += delta_q_val * delta_q_info->delta_q_res; /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ xd->current_base_qindex = clamp(xd->current_base_qindex, 1, cm->seq_params.bit_depth == AVM_BITS_8 ? MAXQ_8_BITS : cm->seq_params.bit_depth == AVM_BITS_10 ? 
MAXQ_10_BITS : MAXQ); +#if DEBUG_BLOCK_INFO + BLOCK_SIZE bsize = mbmi->sb_type[xd->tree_type == CHROMA_PART]; + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + const int b_col = mi_col & (cm->mib_size - 1); + const int b_row = mi_row & (cm->mib_size - 1); + const int read_delta_q_flag = (b_col == 0 && b_row == 0); + if ((bsize != cm->sb_size || + mbmi->skip_txfm[xd->tree_type == CHROMA_PART] == 0) && + read_delta_q_flag) { + DEBUG_BLOCK_printf("%*sPost-delta_q[%d->%d]: r=%d\n", + depth, "", delta_q_val, xd->current_base_qindex, r->ec.rng); + } +#endif } } // read mode set index and mode index in set for y component, // and map it to y mode and delta angle -static void read_intra_luma_mode(MACROBLOCKD *const xd, avm_reader *r) { +static void read_intra_luma_mode(DB_ONLY(const int depth, + AV2_COMMON *const cm) + MACROBLOCKD *const xd, avm_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; MB_MODE_INFO *const mbmi = xd->mi[0]; uint8_t mode_idx = 0; - const int context = get_y_mode_idx_ctx(xd); + int context = get_y_mode_idx_ctx(xd); int mode_set_index = avm_read_symbol(r, ec_ctx->y_mode_set_cdf, INTRA_MODE_SETS, ACCT_INFO("mode_set_index", "y_mode_set_cdf")); @@ -1528,6 +1762,9 @@ static void read_intra_luma_mode(MACROBLOCKD *const xd, avm_reader *r) { } else { mode_idx = FIRST_MODE_COUNT + (mode_set_index - 1) * SECOND_MODE_COUNT + avm_read_literal(r, 4, ACCT_INFO("mode_idx")); +#if DEBUG_BLOCK_INFO + context = -1; +#endif } assert(mode_idx < LUMA_MODE_COUNT); get_y_intra_mode_set(mbmi, xd); @@ -1536,30 +1773,40 @@ static void read_intra_luma_mode(MACROBLOCKD *const xd, avm_reader *r) { mbmi->y_mode_idx = mode_idx; if (mbmi->joint_y_mode_delta_angle < NON_DIRECTIONAL_MODES_COUNT) assert(mbmi->joint_y_mode_delta_angle == mbmi->y_mode_idx); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-intra_y_mode[set=%d,idx=%d,ctx=%d,mode=%d,angle=%d]: r=%d\n", + depth, "", mode_set_index, mode_idx, 
context, + mbmi->mode, mbmi->angle_delta[PLANE_TYPE_Y], r->ec.rng); +#endif } // Read mode index for uv component and map it to uv mode and delta angle. // First we read if the uv mode is UV_CFL_PRED. // If yes, uv mode is set to UV_CFL_PRED, delta angle is set to 0. Done. // If not, we read mode index and map it to uv mode and delta angle. -static void read_intra_uv_mode(MACROBLOCKD *const xd, +static void read_intra_uv_mode(DB_ONLY(const int depth, + AV2_COMMON *const cm) + MACROBLOCKD *const xd, CFL_ALLOWED_TYPE cfl_allowed, avm_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; MB_MODE_INFO *const mbmi = xd->mi[0]; int is_cfl_mode = 0; + int cfl_ctx = -1, uv_mode_idx = -1, context = -1; if (cfl_allowed) { - const int cfl_ctx = get_cfl_ctx(xd); + cfl_ctx = get_cfl_ctx(xd); is_cfl_mode = avm_read_symbol(r, ec_ctx->cfl_cdf[cfl_ctx], 2, ACCT_INFO("is_cfl_idx")); } if (is_cfl_mode) { mbmi->uv_mode = UV_CFL_PRED; mbmi->angle_delta[PLANE_TYPE_UV] = 0; - return; + goto end; } - const int context = av2_is_directional_mode(mbmi->mode) ? 1 : 0; - int uv_mode_idx = + context = av2_is_directional_mode(mbmi->mode) ? 1 : 0; + uv_mode_idx = avm_read_symbol(r, ec_ctx->uv_mode_cdf[context], CHROMA_INTRA_MODE_INDEX_COUNT, ACCT_INFO("uv_mode_idx")); if (uv_mode_idx == (CHROMA_INTRA_MODE_INDEX_COUNT - 1)) @@ -1575,9 +1822,24 @@ static void read_intra_uv_mode(MACROBLOCKD *const xd, mbmi->angle_delta[PLANE_TYPE_UV] = mbmi->angle_delta[PLANE_TYPE_Y]; else mbmi->angle_delta[PLANE_TYPE_UV] = 0; +end: {} +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-intra_uv_mode[cfl=%d,idx=%d,ctx=%d|%d,mode=%d,angle=%d]: r=%d\n", + depth, "", is_cfl_mode, uv_mode_idx, + cfl_ctx, context, mbmi->uv_mode, + (1 << mbmi->uv_mode) & ((1 << UV_DC_PRED) | + (1 << UV_SMOOTH_PRED) | + (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | + (1 << UV_PAETH_PRED)) ? 
+ 0 : mbmi->angle_delta[PLANE_TYPE_UV], r->ec.rng); +#endif } -static void read_intra_frame_mode_info(AV2_COMMON *const cm, +static void read_intra_frame_mode_info(DB_ONLY(const int depth) + AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *r) { MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -1586,8 +1848,13 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; +#endif + if (seg->segid_preskip && xd->tree_type != CHROMA_PART) - mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0); + mbmi->segment_id = read_intra_segment_id(DB_ONLY(depth) cm, xd, bsize, r, 0); mbmi->skip_mode = 0; if (xd->tree_type != CHROMA_PART) mbmi->morph_pred = 0; @@ -1601,10 +1868,15 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, mbmi->use_intrabc[xd->tree_type == CHROMA_PART] = avm_read_symbol(r, ec_ctx->intrabc_cdf[intrabc_ctx], 2, ACCT_INFO("use_intrabc", "chroma")); + DEBUG_BLOCK_printf("%*sPost-intrabc[ctx=%d,%d]: r=%d\n", + depth, "", intrabc_ctx, + mbmi->use_intrabc[xd->tree_type == CHROMA_PART], + r->ec.rng); } if (is_intrabc_block(mbmi, xd->tree_type)) { mbmi->skip_txfm[xd->tree_type == CHROMA_PART] = - read_skip_txfm(cm, xd, mbmi->segment_id, r); + read_skip_txfm(DB_ONLY(depth, mi_row, mi_col) + cm, xd, mbmi->segment_id, r); } else { // Segment SEG_LVL_SKIP should be disabled for intra prediction if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { @@ -1615,19 +1887,19 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, } if (!seg->segid_preskip && xd->tree_type != CHROMA_PART) - mbmi->segment_id = read_intra_segment_id( + mbmi->segment_id = read_intra_segment_id(DB_ONLY(depth) cm, xd, bsize, r, mbmi->skip_txfm[xd->tree_type == CHROMA_PART]); mbmi->seg_id_predicted = 0; - if (xd->tree_type != CHROMA_PART) read_gdf(cm, r, xd); + if (xd->tree_type != CHROMA_PART) 
read_gdf(DB_ONLY(depth) cm, r, xd); - if (xd->tree_type != CHROMA_PART) read_cdef(cm, r, xd); + if (xd->tree_type != CHROMA_PART) read_cdef(DB_ONLY(depth) cm, r, xd); if (cm->seq_params.enable_ccso && xd->tree_type != CHROMA_PART) - read_ccso(cm, r, xd); + read_ccso(DB_ONLY(depth) cm, r, xd); - if (xd->tree_type != CHROMA_PART) read_delta_q_params(cm, xd, r); + if (xd->tree_type != CHROMA_PART) read_delta_q_params(DB_ONLY(depth) cm, xd, r); mbmi->current_qindex = xd->current_base_qindex; @@ -1646,7 +1918,7 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, } if (av2_allow_intrabc(cm, xd, bsize) && xd->tree_type != CHROMA_PART) { - read_intrabc_info(cm, dcb, r); + read_intrabc_info(DB_ONLY(depth) cm, dcb, r); if (is_intrabc_block(mbmi, xd->tree_type)) { mbmi->use_dpcm_y = 0; mbmi->dpcm_mode_y = 0; @@ -1658,9 +1930,11 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, if (xd->lossless[mbmi->segment_id]) { mbmi->use_dpcm_y = read_dpcm_mode(ec_ctx, r); if (mbmi->use_dpcm_y == 0) { - read_intra_luma_mode(xd, r); + read_intra_luma_mode(DB_ONLY(depth, cm) xd, r); } else { mbmi->dpcm_mode_y = read_dpcm_vert_horz_mode(ec_ctx, r); + DEBUG_BLOCK_printf("%*sPost-ydpcm[dir=%d]: r=%d\n", + depth, "", mbmi->dpcm_mode_y, r->ec.rng); if (mbmi->dpcm_mode_y == 0) { mbmi->joint_y_mode_delta_angle = 22; mbmi->mode = V_PRED; @@ -1674,11 +1948,19 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, } else { mbmi->use_dpcm_y = 0; mbmi->dpcm_mode_y = 0; - read_intra_luma_mode(xd, r); + read_intra_luma_mode(DB_ONLY(depth, cm) xd, r); } if (allow_fsc_intra(cm, bsize, mbmi)) { avm_cdf_prob *fsc_cdf = get_fsc_mode_cdf(xd, bsize, 1); +#if DEBUG_BLOCK_INFO + const int ctx = get_fsc_mode_ctx(xd, 1); + const uint8_t fsc_size_group = fsc_bsize_groups[bsize]; +#endif mbmi->fsc_mode[xd->tree_type == CHROMA_PART] = read_fsc_mode(r, fsc_cdf); + DEBUG_BLOCK_printf("%*sPost-fsc[ctx=%d|%d,%d]: r=%d\n", + depth, "", ctx, fsc_size_group, + mbmi->fsc_mode[xd->tree_type == 
CHROMA_PART], + r->ec.rng); } else { mbmi->fsc_mode[xd->tree_type == CHROMA_PART] = 0; } @@ -1687,10 +1969,11 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, if (mbmi->use_dpcm_y == 0) { mbmi->mrl_index = (cm->seq_params.enable_mrls && av2_is_directional_mode(mbmi->mode)) - ? read_mrl_index(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) + ? read_mrl_index(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) : 0; if (mbmi->mrl_index) { - mbmi->multi_line_mrl = read_multi_line_mrl( + mbmi->multi_line_mrl = read_multi_line_mrl(DB_ONLY(depth, cm, xd) ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); } else { mbmi->multi_line_mrl = 0; @@ -1702,11 +1985,13 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, } else { mbmi->mrl_index = (cm->seq_params.enable_mrls && av2_is_directional_mode(mbmi->mode)) - ? read_mrl_index(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) + ? read_mrl_index(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) : 0; if (mbmi->mrl_index) { mbmi->multi_line_mrl = - read_multi_line_mrl(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); + read_multi_line_mrl(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); } else { mbmi->multi_line_mrl = 0; } @@ -1718,7 +2003,7 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, if (xd->lossless[mbmi->segment_id]) { mbmi->use_dpcm_uv = read_dpcm_uv_mode(ec_ctx, r); if (mbmi->use_dpcm_uv == 0) { - read_intra_uv_mode( + read_intra_uv_mode(DB_ONLY(depth, cm) xd, is_cfl_allowed(cm->seq_params.enable_cfl_intra, xd) || is_mhccp_allowed(cm, xd), @@ -1727,6 +2012,8 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, } else { get_uv_intra_mode_set(mbmi); mbmi->dpcm_mode_uv = read_dpcm_uv_vert_horz_mode(ec_ctx, r); + DEBUG_BLOCK_printf("%*sPost-uvdpcm[dir=%d]: r=%d\n", + depth, "", mbmi->dpcm_mode_uv, r->ec.rng); mbmi->uv_mode = mbmi->dpcm_mode_uv + 1; if (mbmi->uv_mode == mbmi->mode) mbmi->angle_delta[PLANE_TYPE_UV] = 
mbmi->angle_delta[PLANE_TYPE_Y]; @@ -1734,7 +2021,7 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, mbmi->angle_delta[PLANE_TYPE_UV] = 0; } } else { - read_intra_uv_mode( + read_intra_uv_mode(DB_ONLY(depth, cm) xd, is_cfl_allowed(cm->seq_params.enable_cfl_intra, xd) || is_mhccp_allowed(cm, xd), @@ -1767,6 +2054,20 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, if (mbmi->cfl_idx == 0) mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); +#if DEBUG_BLOCK_INFO + const int sign_u = ((CFL_SIGN_U(mbmi->cfl_alpha_signs) ^ 1) - 1) >> 1; + const int sign_v = ((CFL_SIGN_V(mbmi->cfl_alpha_signs) ^ 1) - 1) >> 1; + DEBUG_BLOCK_printf("%*sPost-cfl[type=%d,%s=%d|%d]: r=%d\n", + depth, "", mbmi->cfl_idx, + mbmi->cfl_idx == CFL_MULTI_PARAM ? + "mhdir" : "alpha", + mbmi->cfl_idx == CFL_MULTI_PARAM ? mbmi->mh_dir : + mbmi->cfl_idx == CFL_DERIVED_ALPHA ? 0 : + sign_u * ((mbmi->cfl_alpha_idx >> 3) + 1), + mbmi->cfl_idx != CFL_EXPLICIT ? 0 : + sign_v * ((mbmi->cfl_alpha_idx & 7) + 1), + r->ec.rng); +#endif } } else { // Avoid decoding angle_info if there is is no chroma prediction @@ -1782,10 +2083,10 @@ static void read_intra_frame_mode_info(AV2_COMMON *const cm, mbmi->palette_mode_info.palette_size[1] = 0; if (av2_allow_palette(PLANE_TYPE_Y, cm->features.allow_screen_content_tools, bsize)) - read_palette_mode_info(cm, xd, r); + read_palette_mode_info(DB_ONLY(depth) cm, xd, r); if (xd->tree_type != CHROMA_PART) { mbmi->use_intra_dip = 0; - read_intra_dip_mode_info(cm, xd, r); + read_intra_dip_mode_info(DB_ONLY(depth) cm, xd, r); } } @@ -2070,7 +2371,9 @@ static void set_ref_frames_for_skip_mode(AV2_COMMON *const cm, } // Read the reference frame -static void read_ref_frames(AV2_COMMON *const cm, MACROBLOCKD *const xd, +static void read_ref_frames(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + AV2_COMMON *const cm, MACROBLOCKD *const xd, avm_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { if 
(xd->mi[0]->skip_mode) { @@ -2086,9 +2389,12 @@ static void read_ref_frames(AV2_COMMON *const cm, MACROBLOCKD *const xd, ACCT_INFO("tip_cdf"))) { ref_frame[0] = TIP_FRAME; } + DEBUG_BLOCK_printf("%*sPost-tip[ctx=%d,%d]: r=%d\n", + depth, "", tip_ctx, is_tip_ref_frame(ref_frame[0]), + r->ec.rng); } - if (is_tip_ref_frame(ref_frame[0])) return; + if (is_tip_ref_frame(ref_frame[0])) goto end; if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { @@ -2106,9 +2412,16 @@ static void read_ref_frames(AV2_COMMON *const cm, MACROBLOCKD *const xd, assert(0 && "Invalid prediction mode."); } } +end: + DEBUG_BLOCK_printf("%*sPost-ref[%d,%d]: r=%d\n", + depth, "", is_tip_ref_frame(ref_frame[0]) ? 7 : + ref_frame[0], ref_frame[1], r->ec.rng); } -static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, +static INLINE void read_mb_interp_filter(DB_ONLY(const int depth, + const int mi_row, + const int mi_col) + const MACROBLOCKD *const xd, InterpFilter interp_filter, const AV2_COMMON *cm, MB_MODE_INFO *const mbmi, @@ -2128,10 +2441,13 @@ static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_INFO("switchable_interp_cdf")); mbmi->interp_fltr = filter; + DEBUG_BLOCK_printf("%*sPost-subpelfilter[ctx=%d,%d]: r=%d\n", + depth, "", ctx, filter, r->ec.rng); } } -static void read_intra_block_mode_info(AV2_COMMON *const cm, +static void read_intra_block_mode_info(DB_ONLY(const int depth) + AV2_COMMON *const cm, MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi, avm_reader *r) { @@ -2153,14 +2469,19 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, mbmi->motion_mode = SIMPLE_TRANSLATION; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; +#endif if (xd->tree_type != CHROMA_PART) { if (xd->lossless[mbmi->segment_id]) { mbmi->use_dpcm_y = read_dpcm_mode(ec_ctx, r); 
if (mbmi->use_dpcm_y == 0) { - read_intra_luma_mode(xd, r); + read_intra_luma_mode(DB_ONLY(depth, cm) xd, r); } else { mbmi->dpcm_mode_y = read_dpcm_vert_horz_mode(ec_ctx, r); + DEBUG_BLOCK_printf("%*sPost-ydpcm[dir=%d]: r=%d\n", + depth, "", mbmi->dpcm_mode_y, r->ec.rng); if (mbmi->dpcm_mode_y == 0) { mbmi->joint_y_mode_delta_angle = 22; mbmi->mode = V_PRED; @@ -2174,14 +2495,22 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, } else { mbmi->use_dpcm_y = 0; mbmi->dpcm_mode_y = 0; - read_intra_luma_mode(xd, r); + read_intra_luma_mode(DB_ONLY(depth, cm) xd, r); } } if (allow_fsc_intra(cm, bsize, mbmi) && xd->tree_type != CHROMA_PART) { avm_cdf_prob *fsc_cdf = get_fsc_mode_cdf(xd, bsize, mbmi->region_type == INTRA_REGION); +#if DEBUG_BLOCK_INFO + const int ctx = get_fsc_mode_ctx(xd, mbmi->region_type == INTRA_REGION); + const uint8_t fsc_size_group = fsc_bsize_groups[bsize]; +#endif mbmi->fsc_mode[xd->tree_type == CHROMA_PART] = read_fsc_mode(r, fsc_cdf); + DEBUG_BLOCK_printf("%*sPost-fsc[ctx=%d|%d,%d]: r=%d\n", + depth, "", ctx, fsc_size_group, + mbmi->fsc_mode[xd->tree_type == CHROMA_PART], + r->ec.rng); } else { mbmi->fsc_mode[xd->tree_type == CHROMA_PART] = 0; } @@ -2193,10 +2522,11 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, if (mbmi->use_dpcm_y == 0) { mbmi->mrl_index = (cm->seq_params.enable_mrls && av2_is_directional_mode(mbmi->mode)) - ? read_mrl_index(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) + ? read_mrl_index(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) : 0; if (mbmi->mrl_index) { - mbmi->multi_line_mrl = read_multi_line_mrl( + mbmi->multi_line_mrl = read_multi_line_mrl(DB_ONLY(depth, cm, xd) ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); } else { mbmi->multi_line_mrl = 0; @@ -2208,11 +2538,13 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, } else { mbmi->mrl_index = (cm->seq_params.enable_mrls && av2_is_directional_mode(mbmi->mode)) - ? 
read_mrl_index(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) + ? read_mrl_index(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]) : 0; if (mbmi->mrl_index) { mbmi->multi_line_mrl = - read_multi_line_mrl(ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); + read_multi_line_mrl(DB_ONLY(depth, cm, xd) + ec_ctx, r, xd->neighbors[0], xd->neighbors[1]); } else { mbmi->multi_line_mrl = 0; } @@ -2224,7 +2556,7 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, if (xd->lossless[mbmi->segment_id]) { mbmi->use_dpcm_uv = read_dpcm_uv_mode(ec_ctx, r); if (mbmi->use_dpcm_uv == 0) { - read_intra_uv_mode( + read_intra_uv_mode(DB_ONLY(depth, cm) xd, is_cfl_allowed(cm->seq_params.enable_cfl_intra, xd) || is_mhccp_allowed(cm, xd), @@ -2233,6 +2565,8 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, } else { get_uv_intra_mode_set(mbmi); mbmi->dpcm_mode_uv = read_dpcm_uv_vert_horz_mode(ec_ctx, r); + DEBUG_BLOCK_printf("%*sPost-uvdpcm[dir=%d]: r=%d\n", + depth, "", mbmi->dpcm_mode_uv, r->ec.rng); mbmi->uv_mode = mbmi->dpcm_mode_uv + 1; if (mbmi->uv_mode == mbmi->mode) mbmi->angle_delta[PLANE_TYPE_UV] = mbmi->angle_delta[PLANE_TYPE_Y]; @@ -2242,7 +2576,8 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, } else { mbmi->use_dpcm_uv = 0; mbmi->dpcm_mode_uv = 0; - read_intra_uv_mode(xd, + read_intra_uv_mode(DB_ONLY(depth, cm) + xd, is_cfl_allowed(cm->seq_params.enable_cfl_intra, xd) || is_mhccp_allowed(cm, xd), r); @@ -2272,6 +2607,20 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, mbmi->cfl_alpha_idx = read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); } +#if DEBUG_BLOCK_INFO + const int sign_u = ((CFL_SIGN_U(mbmi->cfl_alpha_signs) ^ 1) - 1) >> 1; + const int sign_v = ((CFL_SIGN_V(mbmi->cfl_alpha_signs) ^ 1) - 1) >> 1; + DEBUG_BLOCK_printf("%*sPost-cfl[type=%d,%s=%d|%d]: r=%d\n", + depth, "", mbmi->cfl_idx, + mbmi->cfl_idx == CFL_MULTI_PARAM ? + "mhdir" : "alpha", + mbmi->cfl_idx == CFL_MULTI_PARAM ? 
mbmi->mh_dir : + mbmi->cfl_idx == CFL_DERIVED_ALPHA ? 0 : + sign_u * ((mbmi->cfl_alpha_idx >> 3) + 1), + mbmi->cfl_idx != CFL_EXPLICIT ? 0 : + sign_v * ((mbmi->cfl_alpha_idx & 7) + 1), + r->ec.rng); +#endif } } else { // Avoid decoding angle_info if there is is no chroma prediction @@ -2282,14 +2631,16 @@ static void read_intra_block_mode_info(AV2_COMMON *const cm, mbmi->palette_mode_info.palette_size[1] = 0; if (av2_allow_palette(PLANE_TYPE_Y, cm->features.allow_screen_content_tools, bsize)) - read_palette_mode_info(cm, xd, r); + read_palette_mode_info(DB_ONLY(depth) cm, xd, r); if (xd->tree_type != CHROMA_PART) { mbmi->use_intra_dip = 0; - read_intra_dip_mode_info(cm, xd, r); + read_intra_dip_mode_info(DB_ONLY(depth) cm, xd, r); } } -static INLINE int assign_mv(AV2_COMMON *cm, MACROBLOCKD *xd, +static INLINE int assign_mv(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + AV2_COMMON *cm, MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], int_mv ref_mv[2], int is_compound, @@ -2482,6 +2833,9 @@ static INLINE int assign_mv(AV2_COMMON *cm, MACROBLOCKD *xd, } } } + DEBUG_BLOCK_printf("%*sPost-mvdiff[%d,y:%d,x:%d]: r=%d\n", + depth, "", ref_idx, mv_diff[ref_idx].row, + mv_diff[ref_idx].col, r->ec.rng); } for (int ref_idx = start_signaled_mvd_idx; @@ -2520,7 +2874,9 @@ static INLINE int assign_mv(AV2_COMMON *cm, MACROBLOCKD *xd, return 1; } -static int read_is_inter_block(AV2_COMMON *const cm, MACROBLOCKD *const xd, +static int read_is_inter_block(DB_ONLY(const int depth, + const int mi_row, const int mi_col) + AV2_COMMON *const cm, MACROBLOCKD *const xd, int segment_id, avm_reader *r) { if (xd->mi[0]->sb_type[PLANE_TYPE_Y] == BLOCK_4X4) { return 0; @@ -2539,6 +2895,8 @@ static int read_is_inter_block(AV2_COMMON *const cm, MACROBLOCKD *const xd, FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int is_inter = avm_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_INFO()); + DEBUG_BLOCK_printf("%*sPost-is_inter[ctx=%d,%d]: 
r=%d\n", + depth, "", ctx, is_inter, r->ec.rng); return is_inter; } @@ -2577,7 +2935,9 @@ static void dec_dump_logs(AV2_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, #endif // DEC_MISMATCH_DEBUG // This function read the refinemv_flag ( if require) from the bitstream -static void read_refinemv_flag(AV2_COMMON *const cm, MACROBLOCKD *xd, +static void read_refinemv_flag(DB_ONLY(const int depth, const int mi_row, + const int mi_col) + AV2_COMMON *const cm, MACROBLOCKD *xd, avm_reader *r, BLOCK_SIZE bsize) { MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->refinemv_flag = get_default_refinemv_flag(cm, mbmi); @@ -2587,12 +2947,14 @@ static void read_refinemv_flag(AV2_COMMON *const cm, MACROBLOCKD *xd, mbmi->refinemv_flag = avm_read_symbol(r, xd->tile_ctx->refinemv_flag_cdf[refinemv_ctx], REFINEMV_NUM_MODES, ACCT_INFO("refinemv_flag")); + DEBUG_BLOCK_printf("%*sPost-refinemv[ctx=%d,%d]: r=%d\n", + depth, "", refinemv_ctx, mbmi->refinemv_flag, r->ec.rng); } } MvSubpelPrecision av2_read_pb_mv_precision(AV2_COMMON *const cm, MACROBLOCKD *const xd, - avm_reader *r) { + avm_reader *r, int ctx[3]) { MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->max_mv_precision == av2_get_mbmi_max_mv_precision(cm, xd->sbi, mbmi)); @@ -2604,11 +2966,13 @@ MvSubpelPrecision av2_read_pb_mv_precision(AV2_COMMON *const cm, assert(mbmi->most_probable_pb_mv_precision == cm->features.most_probable_fr_mv_precision); - const int mpp_flag_context = av2_get_mpp_flag_context(cm, xd); + const int mpp_flag_context = ctx[0] = av2_get_mpp_flag_context(cm, xd); const int mpp_flag = avm_read_symbol(r, xd->tile_ctx->pb_mv_mpp_flag_cdf[mpp_flag_context], 2, ACCT_INFO("mpp_flag")); if (mpp_flag) return mbmi->most_probable_pb_mv_precision; + ctx[1] = down_ctx; + ctx[2] = max_precision - MV_PRECISION_HALF_PEL; const PRECISION_SET *precision_def = &av2_mv_precision_sets[mbmi->mb_precision_set]; int nsymbs = precision_def->num_precisions - 1; @@ -2620,7 +2984,30 @@ MvSubpelPrecision 
av2_read_pb_mv_precision(AV2_COMMON *const cm, return av2_get_precision_from_index(mbmi, down); } -static void read_inter_block_mode_info(AV2Decoder *const pbi, +#if DEBUG_BLOCK_INFO +static void debug_warp_matrix(const AV2_COMMON *cm, const int mi_row, + const int mi_col, const MB_MODE_INFO *mbmi, + const int idx, const int depth) +{ +#define signabs(v) v < 0 ? '-' : ' ', abs(v) + DEBUG_BLOCK_printf("%*s[ %c%x, %c%x | %c%x, %c%x, %c%x, %c%x ],t=%d " + "mv=y:%d,x:%d\n", depth, "", + signabs(mbmi->wm_params[idx].wmmat[0]), + signabs(mbmi->wm_params[idx].wmmat[1]), + signabs(mbmi->wm_params[idx].wmmat[2]), + signabs(mbmi->wm_params[idx].wmmat[3]), + signabs(mbmi->wm_params[idx].wmmat[4]), + signabs(mbmi->wm_params[idx].wmmat[5]), + mbmi->wm_params[idx].invalid ? -1 : mbmi->wm_params[idx].wmtype, + mbmi->mv[idx].as_mv.row, mbmi->mv[idx].as_mv.col); +#undef signabs +} +#else +#define debug_warp_matrix(...) +#endif + +static void read_inter_block_mode_info(DB_ONLY(const int depth) + AV2Decoder *const pbi, DecoderCodingBlock *dcb, MB_MODE_INFO *const mbmi, avm_reader *r) { @@ -2662,7 +3049,11 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, av2_collect_neighbors_ref_counts(xd); - read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; +#endif + read_ref_frames(DB_ONLY(depth, mi_row, mi_col) + cm, xd, r, mbmi->segment_id, mbmi->ref_frame); int is_compound = has_second_ref(mbmi); const MV_REFERENCE_FRAME ref_frame = av2_ref_frame_type(mbmi->ref_frame); @@ -2696,8 +3087,12 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, read_drl_idx(cm->features.max_drl_bits, av2_mode_context_pristine(inter_mode_ctx, mbmi->ref_frame), ec_ctx, mbmi, r); + DEBUG_BLOCK_printf("%*sPost-drl[%d,%d]: r=%d\n", + depth, "", mbmi->ref_mv_idx[0], + mbmi->ref_mv_idx[0], r->ec.rng); - av2_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, + av2_find_mv_refs(DB_ONLY(0) + cm, xd, mbmi, 
ref_frame, dcb->ref_mv_count, xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, NULL, 0, NULL); @@ -2716,9 +3111,11 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, const int16_t mode_ctx = av2_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); if (is_compound) - mbmi->mode = read_inter_compound_mode(xd, r, cm, mbmi, mode_ctx); + mbmi->mode = read_inter_compound_mode(DB_ONLY(depth, mi_row, mi_col) + xd, r, cm, mbmi, mode_ctx); else - mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx, cm, xd, mbmi, bsize); + mbmi->mode = read_inter_mode(DB_ONLY(depth, mi_row, mi_col) + ec_ctx, r, mode_ctx, cm, xd, mbmi, bsize); mbmi->use_amvd = 0; if (cm->seq_params.enable_adaptive_mvd && allow_amvd_mode(mbmi->mode)) { @@ -2728,8 +3125,11 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, mbmi->use_amvd = avm_read_symbol(r, ec_ctx->amvd_mode_cdf[amvd_index][amvd_ctx], 2, ACCT_INFO("use_amvd")); + DEBUG_BLOCK_printf("%*sPost-amvd[ctx=%d|%d,%d]: r=%d\n", + depth, "", amvd_index, amvd_ctx, mbmi->use_amvd, + r->ec.rng); } - av2_find_mv_refs( + av2_find_mv_refs(DB_ONLY(0) cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, xd->warp_param_stack, ref_frame < SINGLE_REF_FRAMES ? 
MAX_WARP_REF_CANDIDATES : 0, @@ -2753,14 +3153,17 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, r, xd->tile_ctx->explicit_bawp_scale_cdf, EXPLICIT_BAWP_SCALE_CNT, ACCT_INFO("explicit_bawp_scales")); } - } - if (!cm->seq_params.monochrome && xd->is_chroma_ref && - mbmi->bawp_flag[0]) { - mbmi->bawp_flag[1] = avm_read_symbol(r, xd->tile_ctx->bawp_cdf[1], 2, - ACCT_INFO("bawp_flag_chroma")); - } else { - mbmi->bawp_flag[1] = 0; + if (!cm->seq_params.monochrome && xd->is_chroma_ref && + mbmi->bawp_flag[0]) { + mbmi->bawp_flag[1] = avm_read_symbol(r, xd->tile_ctx->bawp_cdf[1], 2, + ACCT_INFO("bawp_flag_chroma")); + } else { + mbmi->bawp_flag[1] = 0; + } + DEBUG_BLOCK_printf("%*sPost-bawp[%d,%d]: r=%d\n", + depth, "", mbmi->bawp_flag[0], mbmi->bawp_flag[1], + r->ec.rng); } for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { @@ -2777,7 +3180,8 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, if (has_second_ref(mbmi)) mbmi->num_proj_ref[1] = av2_findSamples(cm, xd, pts1, pts1_inref, 1); } - mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); + mbmi->motion_mode = read_motion_mode(DB_ONLY(depth, mi_row, mi_col) + cm, xd, mbmi, r); int is_warpmv_warp_causal = ((mbmi->motion_mode == WARP_CAUSAL) && mbmi->mode == WARPMV); if (mbmi->motion_mode == WARP_DELTA || is_warpmv_warp_causal) { @@ -2788,27 +3192,39 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, xd->valid_num_warp_candidates[av2_ref_frame_type(mbmi->ref_frame)], NULL); - read_warp_ref_idx(xd->tile_ctx, mbmi, r); + read_warp_ref_idx(DB_ONLY(depth, cm, mi_row, mi_col) xd->tile_ctx, mbmi, r); ref_warp_model = warp_param_stack[mbmi->warp_ref_idx].wm_params; } if (allow_warpmv_with_mvd_coding(cm, mbmi)) { read_warpmv_with_mvd_flag(xd->tile_ctx, mbmi, r); + DEBUG_BLOCK_printf("%*sPost-warpmv_with_mvd[%d]: r=%d\n", + depth, "", mbmi->warpmv_with_mvd_flag, r->ec.rng); } else { mbmi->warpmv_with_mvd_flag = 0; } - mbmi->jmvd_scale_mode = read_jmvd_scale_mode(xd, r, 
mbmi); + mbmi->jmvd_scale_mode = read_jmvd_scale_mode(DB_ONLY(depth, cm, mi_row, mi_col) + xd, r, mbmi); int max_drl_bits = cm->features.max_drl_bits; - if (have_drl_index(mbmi->mode)) + if (have_drl_index(mbmi->mode)) { read_drl_idx(max_drl_bits, av2_mode_context_pristine(inter_mode_ctx, mbmi->ref_frame), ec_ctx, mbmi, r); + DEBUG_BLOCK_printf("%*sPost-drl[%d,%d]: r=%d\n", + depth, "", mbmi->ref_mv_idx[0], + !has_second_ref(mbmi) ? -1 : + mbmi->ref_mv_idx[has_second_drl(mbmi)], r->ec.rng); + } set_mv_precision(mbmi, mbmi->max_mv_precision); if (is_pb_mv_precision_active(cm, mbmi, bsize)) { set_precision_set(cm, xd, mbmi, bsize, mbmi->ref_mv_idx); set_most_probable_mv_precision(cm, mbmi, bsize); - mbmi->pb_mv_precision = av2_read_pb_mv_precision(cm, xd, r); + int ctx[3] = { -1, -1, -1 }; + mbmi->pb_mv_precision = av2_read_pb_mv_precision(cm, xd, r, ctx); + DEBUG_BLOCK_printf("%*sPost-mv_precision[ctx=%d|%d|%d,%d]: r=%d\n", + depth, "", ctx[0], ctx[1], ctx[2], + mbmi->pb_mv_precision, r->ec.rng); } if (enable_adaptive_mvd_resolution(cm, mbmi)) set_amvd_mv_precision(mbmi, mbmi->max_mv_precision); @@ -2858,13 +3274,14 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, } const int mv_corrupted_flag = - !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, + !assign_mv(DB_ONLY(depth, mi_row, mi_col) + cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, is_compound, mbmi->pb_mv_precision, r); avm_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag); if (mbmi->motion_mode == WARP_DELTA) { - read_warp_delta(cm, xd, mbmi, r, warp_param_stack); + read_warp_delta(DB_ONLY(depth) cm, xd, mbmi, r, warp_param_stack); } mbmi->warp_inter_intra = 0; @@ -2897,10 +3314,15 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, } } } + DEBUG_BLOCK_printf("%*sPost-warp_ii[%d,%d,%d]: r=%d\n", + depth, "", mbmi->warp_inter_intra, + mbmi->warp_inter_intra ? mbmi->interintra_mode : -1, + mbmi->warp_inter_intra && mbmi->use_wedge_interintra ? 
+ mbmi->interintra_wedge_index : -1, r->ec.rng); } if (!mbmi->skip_mode) { - read_refinemv_flag(cm, xd, r, bsize); + read_refinemv_flag(DB_ONLY(depth, mi_row, mi_col) cm, xd, r, bsize); } // init @@ -2915,8 +3337,9 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params.enable_masked_compound; + int ctx_comp_group_idx; if (masked_compound_used) { - const int ctx_comp_group_idx = get_comp_group_idx_context(cm, xd); + ctx_comp_group_idx = get_comp_group_idx_context(cm, xd); mbmi->comp_group_idx = (uint8_t)avm_read_symbol( r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_INFO("comp_group_idx")); @@ -2955,11 +3378,28 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, avm_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_INFO("mask_type")); } } + if (masked_compound_used) { + DEBUG_BLOCK_printf("%*sPost-comp_inter_type[ctx=%d,%d,%c=%d|%d]: r=%d\n", + depth, "", ctx_comp_group_idx, + mbmi->interinter_comp.type, + mbmi->interinter_comp.type == COMPOUND_AVERAGE ? + '?' : mbmi->interinter_comp.type == COMPOUND_WEDGE ? + 'w' : 'm', + mbmi->interinter_comp.type == COMPOUND_AVERAGE ? + -1 : mbmi->interinter_comp.type == COMPOUND_WEDGE ? + mbmi->interinter_comp.wedge_index : + mbmi->interinter_comp.mask_type, + mbmi->interinter_comp.type == COMPOUND_WEDGE ? 
+ mbmi->interinter_comp.wedge_sign : -1, r->ec.rng); + } } mbmi->cwp_idx = CWP_EQUAL; if (cm->features.enable_cwp) { - if (is_cwp_allowed(mbmi) && !mbmi->skip_mode) + if (is_cwp_allowed(mbmi) && !mbmi->skip_mode) { mbmi->cwp_idx = read_cwp_idx(xd, r, cm, mbmi); + DEBUG_BLOCK_printf("%*sPost-compweightpred_idx[%d]: r=%d\n", + depth, "", mbmi->cwp_idx, r->ec.rng); + } if (is_cwp_allowed(mbmi) && mbmi->skip_mode) mbmi->cwp_idx = xd->skip_mvp_candidate_list.ref_mv_stack[mbmi->ref_mv_idx[0]].cwp_idx; @@ -2968,10 +3408,13 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, mbmi->refinemv_flag = 0; } - read_mb_interp_filter(xd, features->interp_filter, cm, mbmi, r); + read_mb_interp_filter(DB_ONLY(depth, mi_row, mi_col) + xd, features->interp_filter, cm, mbmi, r); +#if !DEBUG_BLOCK_INFO const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; +#endif if (mbmi->motion_mode == WARP_CAUSAL) { mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE; @@ -3065,7 +3508,8 @@ static void read_inter_block_mode_info(AV2Decoder *const pbi, #endif // DEC_MISMATCH_DEBUG } -static void read_inter_frame_mode_info(AV2Decoder *const pbi, +static void read_inter_frame_mode_info(DB_ONLY(const int depth) + AV2Decoder *const pbi, DecoderCodingBlock *dcb, avm_reader *r) { AV2_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &dcb->xd; @@ -3089,9 +3533,13 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, mbmi->refinemv_flag = 0; if (xd->tree_type != CHROMA_PART) - mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r); + mbmi->segment_id = read_inter_segment_id(DB_ONLY(depth) cm, xd, 1, r); - mbmi->skip_mode = read_skip_mode(cm, xd, r); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; +#endif + mbmi->skip_mode = read_skip_mode(DB_ONLY(depth, mi_row, mi_col) + cm, xd, r); mbmi->fsc_mode[xd->tree_type == CHROMA_PART] = 0; @@ -3108,7 +3556,8 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, } if (!mbmi->skip_mode) { - inter_block 
= read_is_inter_block(cm, xd, mbmi->segment_id, r); + inter_block = read_is_inter_block(DB_ONLY(depth, mi_row, mi_col) + cm, xd, mbmi->segment_id, r); } if (!inter_block && @@ -3121,11 +3570,16 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, mbmi->use_intrabc[xd->tree_type == CHROMA_PART] = avm_read_symbol(r, xd->tile_ctx->intrabc_cdf[intrabc_ctx], 2, ACCT_INFO("use_intrabc", "chroma")); + DEBUG_BLOCK_printf("%*sPost-intrabc[ctx=%d,%d]: r=%d\n", + depth, "", intrabc_ctx, + mbmi->use_intrabc[xd->tree_type == CHROMA_PART], + r->ec.rng); } if (inter_block || (!inter_block && is_intrabc_block(mbmi, xd->tree_type))) { mbmi->skip_txfm[xd->tree_type == CHROMA_PART] = - read_skip_txfm(cm, xd, mbmi->segment_id, r); + read_skip_txfm(DB_ONLY(depth, mi_row, mi_col) + cm, xd, mbmi->segment_id, r); } else { // Segment SEG_LVL_SKIP should be disabled for intra prediction if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { @@ -3141,16 +3595,16 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, mbmi->warp_inter_intra = 0; if (!cm->seg.segid_preskip && xd->tree_type != CHROMA_PART) - mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r); + mbmi->segment_id = read_inter_segment_id(DB_ONLY(depth) cm, xd, 0, r); - if (xd->tree_type != CHROMA_PART) read_gdf(cm, r, xd); + if (xd->tree_type != CHROMA_PART) read_gdf(DB_ONLY(depth) cm, r, xd); - if (xd->tree_type != CHROMA_PART) read_cdef(cm, r, xd); + if (xd->tree_type != CHROMA_PART) read_cdef(DB_ONLY(depth) cm, r, xd); if (cm->seq_params.enable_ccso && xd->tree_type != CHROMA_PART) - read_ccso(cm, r, xd); + read_ccso(DB_ONLY(depth) cm, r, xd); - if (xd->tree_type != CHROMA_PART) read_delta_q_params(cm, xd, r); + if (xd->tree_type != CHROMA_PART) read_delta_q_params(DB_ONLY(depth) cm, xd, r); mbmi->current_qindex = xd->current_base_qindex; @@ -3167,7 +3621,7 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, mbmi->ref_frame[1] = NONE_FRAME; 
mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; - read_intrabc_info(cm, dcb, r); + read_intrabc_info(DB_ONLY(depth) cm, dcb, r); if (is_intrabc_block(mbmi, xd->tree_type)) { mbmi->use_dpcm_y = 0; mbmi->dpcm_mode_y = 0; @@ -3176,9 +3630,9 @@ static void read_inter_frame_mode_info(AV2Decoder *const pbi, } } if (inter_block) - read_inter_block_mode_info(pbi, dcb, mbmi, r); + read_inter_block_mode_info(DB_ONLY(depth) pbi, dcb, mbmi, r); else - read_intra_block_mode_info(cm, xd, mbmi, r); + read_intra_block_mode_info(DB_ONLY(depth) cm, xd, mbmi, r); } static void intra_copy_frame_mvs(AV2_COMMON *const cm, int mi_row, int mi_col, @@ -3204,7 +3658,143 @@ static void intra_copy_frame_mvs(AV2_COMMON *const cm, int mi_row, int mi_col, } } -void av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, +#if DEBUG_BLOCK_INFO +void resolve_refmvs(DB_ONLY(const int depth) + AV2Decoder *const pbi, + DecoderCodingBlock *dcb) +{ + AV2_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + + if (DEBUG_BLOCK_INFO) { + if (mbmi->use_intrabc[xd->tree_type == CHROMA_PART]) { + int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; + av2_find_mv_refs(1, cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count, + xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, + NULL, 0, NULL); + printf("%*sfind_mv_refs(intra)\n", depth, ""); + for (int n = 0; n < dcb->ref_mv_count[INTRA_FRAME]; n++) + printf("%*smv[%d/%d]: y=%d,x=%d,w=%d\n", + depth + 1, "", n, dcb->ref_mv_count[INTRA_FRAME], + xd->ref_mv_stack[INTRA_FRAME][n].this_mv.as_mv.row, + xd->ref_mv_stack[INTRA_FRAME][n].this_mv.as_mv.col, + xd->weight[INTRA_FRAME][n]); + } else if (mbmi->skip_mode) { + const MV_REFERENCE_FRAME ref_frame = av2_ref_frame_type(mbmi->ref_frame); + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; + av2_find_mv_refs(1, cm, xd, 
mbmi, ref_frame, dcb->ref_mv_count, + xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, + NULL, 0, NULL); + printf("%*sfind_mv_refs(%d,%d)\n", depth, "", mbmi->ref_frame[0], + mbmi->ref_frame[1]); + for (int n = 0; n < xd->skip_mvp_candidate_list.ref_mv_count; n++) + printf("%*smv[%d/%d]: y=%d,x=%d,y2=%d,x2=%d,w=%d,cwp=%d\n", + depth + 1, "", n, xd->skip_mvp_candidate_list.ref_mv_count, + xd->skip_mvp_candidate_list.ref_mv_stack[n].this_mv.as_mv.row, + xd->skip_mvp_candidate_list.ref_mv_stack[n].this_mv.as_mv.col, + xd->skip_mvp_candidate_list.ref_mv_stack[n].comp_mv.as_mv.row, + xd->skip_mvp_candidate_list.ref_mv_stack[n].comp_mv.as_mv.col, + xd->skip_mvp_candidate_list.weight[n], + xd->skip_mvp_candidate_list.ref_mv_stack[n].cwp_idx); + } else if (is_inter_block(mbmi, xd->tree_type) && + ((has_second_ref(mbmi) && mbmi->mode != GLOBAL_GLOBALMV) || + (!has_second_ref(mbmi) && mbmi->mode != GLOBALMV))) + { + const MV_REFERENCE_FRAME ref_frame = av2_ref_frame_type(mbmi->ref_frame); + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; + av2_find_mv_refs(1, + cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack, + xd->weight, ref_mvs, /*global_mvs=*/NULL, xd->warp_param_stack, + ref_frame < SINGLE_REF_FRAMES ? MAX_WARP_REF_CANDIDATES : 0, + xd->valid_num_warp_candidates); + printf("%*sfind_mv_refs(%d,%d)\n", depth, "", + mbmi->ref_frame[0] == TIP_FRAME ? 
7 : mbmi->ref_frame[0], + mbmi->ref_frame[1]); + if (!has_second_ref(mbmi)) { + for (int n = 0; n < dcb->ref_mv_count[ref_frame]; n++) + printf("%*smv[%d/%d]: y=%d,x=%d,w=%d,y_off=%d,x_off=%d\n", + depth + 1, "", n, dcb->ref_mv_count[ref_frame], + xd->ref_mv_stack[ref_frame][n].this_mv.as_mv.row, + xd->ref_mv_stack[ref_frame][n].this_mv.as_mv.col, + xd->weight[ref_frame][n], + xd->ref_mv_stack[ref_frame][n].row_offset, + xd->ref_mv_stack[ref_frame][n].col_offset); + if (mbmi->ref_frame[0] != TIP_FRAME && mbmi->mode > NEWMV) { + const int n_warp = + xd->valid_num_warp_candidates[mbmi->ref_frame[0]]; + for (int n = 0; n < n_warp; n++) { + const int32_t *const mat = + xd->warp_param_stack[mbmi->ref_frame[0]] + [n].wm_params.wmmat; + printf("%*swarp[%d/%d]: %d, %d, %d, %d, %d, %d, t=%d\n", + depth + 1, "", n, n_warp, + mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], + xd->warp_param_stack[mbmi->ref_frame[0]] + [n].wm_params.wmtype); + } + } + } else if (has_second_drl(mbmi)) { + for (int drl = 0; drl < 2; drl++) { + const int rf = mbmi->ref_frame[drl]; + for (int n = 0; n < dcb->ref_mv_count[rf]; n++) + printf("%*smv[%d:%d/%d]: y=%d,x=%d,w=%d\n", + depth + 1, "", drl, n, dcb->ref_mv_count[rf], + xd->ref_mv_stack[rf][n].this_mv.as_mv.row, + xd->ref_mv_stack[rf][n].this_mv.as_mv.col, + xd->weight[rf][n]); + } + } else { + for (int n = 0; n < dcb->ref_mv_count[ref_frame]; n++) + printf("%*smv[%d/%d]: y=%d,x=%d,y2=%d,x2=%d,w=%d\n", + depth + 1, "", n, dcb->ref_mv_count[ref_frame], + xd->ref_mv_stack[ref_frame][n].this_mv.as_mv.row, + xd->ref_mv_stack[ref_frame][n].this_mv.as_mv.col, + xd->ref_mv_stack[ref_frame][n].comp_mv.as_mv.row, + xd->ref_mv_stack[ref_frame][n].comp_mv.as_mv.col, + xd->weight[ref_frame][n]); + } + } + } + + if (is_inter_ref_frame(mbmi->ref_frame[1])) { + if (mbmi->motion_mode >= WARP_CAUSAL) { + for (int i = 0; i < 2; i++) + debug_warp_matrix(cm, mi_row, mi_col, mbmi, i, depth); + } else { + DEBUG_BLOCK_printf("%*sfinal 2dmv: y=%d,x=%d | y=%d,x=%d\n", 
depth, "", + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + mbmi->mv[1].as_mv.row, mbmi->mv[1].as_mv.col); + } + } else if (is_inter_block(mbmi, xd->tree_type)) { + if (mbmi->motion_mode >= WARP_CAUSAL) { + debug_warp_matrix(cm, mi_row, mi_col, mbmi, 0, depth); + } else { + DEBUG_BLOCK_printf("%*sfinal 2dmv: y=%d,x=%d\n", depth, "", + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col); + } + } + + if (cm->seq_params.enable_refmvbank) { + if (is_inter_block(mbmi, xd->tree_type)) { + av2_update_ref_mv_bank(cm, xd, 1, mbmi); + } else { + decide_rmb_unit_update_count(cm, xd, mbmi); + } + } + + if (!frame_is_intra_only(cm)) + av2_update_warp_param_bank(cm, xd, +#if COMPOUND_WARP_LINE_BUFFER_REDUCTION + 0, +#endif // COMPOUND_WARP_LINE_BUFFER_REDUCTION + mbmi); +} +#endif + +void av2_read_mode_info(DB_ONLY(const int depth) + AV2Decoder *const pbi, DecoderCodingBlock *dcb, avm_reader *r, int x_inside_boundary, int y_inside_boundary) { AV2_COMMON *const cm = &pbi->common; @@ -3216,7 +3806,8 @@ void av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, mi->sb_type[PLANE_TYPE_UV] = mi->sb_type[PLANE_TYPE_Y]; if (frame_is_intra_only(cm)) { - read_intra_frame_mode_info(cm, dcb, r); + read_intra_frame_mode_info(DB_ONLY(depth) cm, dcb, r); +#if !DEBUG_BLOCK_INFO if (cm->seq_params.enable_refmvbank) { MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_intrabc_block(mbmi, xd->tree_type)) { @@ -3225,11 +3816,13 @@ void av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, decide_rmb_unit_update_count(cm, xd, mbmi); } } +#endif if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_inside_boundary, y_inside_boundary); } else { - read_inter_frame_mode_info(pbi, dcb, r); + read_inter_frame_mode_info(DB_ONLY(depth) pbi, dcb, r); +#if !DEBUG_BLOCK_INFO if (cm->seq_params.enable_refmvbank) { MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_inter_block(mbmi, xd->tree_type)) { @@ -3246,5 +3839,6 @@ void 
av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, 0, #endif // COMPOUND_WARP_LINE_BUFFER_REDUCTION mbmi_tmp); +#endif } } diff --git a/av2/decoder/decodemv.h b/av2/decoder/decodemv.h index ca6a4a65fb..9869e5d737 100644 --- a/av2/decoder/decodemv.h +++ b/av2/decoder/decodemv.h @@ -21,7 +21,11 @@ extern "C" { #endif -void av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, +void resolve_refmvs(DB_ONLY(const int depth) + AV2Decoder *pbi, + DecoderCodingBlock *dcb); +void av2_read_mode_info(DB_ONLY(const int depth) + AV2Decoder *const pbi, DecoderCodingBlock *dcb, avm_reader *r, int x_inside_boundary, int y_inside_boundary); @@ -29,7 +33,8 @@ void av2_read_mode_info(AV2Decoder *const pbi, DecoderCodingBlock *dcb, } // extern "C" #endif -void av2_read_sec_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, +void av2_read_sec_tx_type(DB_ONLY(const int depth) + const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, uint16_t *eob, avm_reader *r); @@ -37,11 +42,15 @@ void av2_read_tx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, avm_reader *r, const int plane, const int eob, const int dc_skip); -void av2_read_cctx_type(const AV2_COMMON *const cm, MACROBLOCKD *xd, +void av2_read_cctx_type(DB_ONLY(const int depth) + const AV2_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, avm_reader *r); -void read_ccso(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); -void read_cdef(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); -void read_gdf(AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); +void read_ccso(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); +void read_cdef(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); +void read_gdf(DB_ONLY(const int depth) + AV2_COMMON *cm, avm_reader *r, MACROBLOCKD *const xd); #endif // AVM_AV2_DECODER_DECODEMV_H_ diff --git 
a/av2/decoder/decoder.c b/av2/decoder/decoder.c index 634230f3f9..383910149e 100644 --- a/av2/decoder/decoder.c +++ b/av2/decoder/decoder.c @@ -471,7 +471,8 @@ void av2_decoder_remove(AV2Decoder *pbi) { avm_free(pbi); } -void av2_visit_palette(AV2Decoder *const pbi, MACROBLOCKD *const xd, +void av2_visit_palette(DB_ONLY(const int depth) + AV2Decoder *const pbi, MACROBLOCKD *const xd, avm_reader *r, palette_visitor_fn_t visit) { if (!is_inter_block(xd->mi[0], xd->tree_type)) { const int plane_start = get_partition_plane_start(xd->tree_type); @@ -480,7 +481,7 @@ void av2_visit_palette(AV2Decoder *const pbi, MACROBLOCKD *const xd, for (int plane = plane_start; plane < plane_end; ++plane) { if (plane == 0 || xd->is_chroma_ref) { if (xd->mi[0]->palette_mode_info.palette_size[plane]) - visit(xd, plane, r); + visit(DB_ONLY(&pbi->common, depth) xd, plane, r); } else { assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); } diff --git a/av2/decoder/decoder.h b/av2/decoder/decoder.h index bdcdbbe880..1100c3b161 100644 --- a/av2/decoder/decoder.h +++ b/av2/decoder/decoder.h @@ -103,7 +103,8 @@ typedef struct DecoderCodingBlock { /*!\cond */ -typedef void (*decode_block_visitor_fn_t)(const AV2_COMMON *const cm, +typedef void (*decode_block_visitor_fn_t)(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int row, const int col, @@ -802,13 +803,16 @@ static INLINE int av2_read_uniform(avm_reader *r, int n) { return (v << 1) - m + avm_read_literal(r, 1, ACCT_INFO()); } -typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, +typedef void (*palette_visitor_fn_t)(DB_ONLY(AV2_COMMON *cm, const int depth) + MACROBLOCKD *const xd, int plane, avm_reader *r); -void av2_visit_palette(AV2Decoder *const pbi, MACROBLOCKD *const xd, +void av2_visit_palette(DB_ONLY(const int depth) + AV2Decoder *const pbi, MACROBLOCKD *const xd, avm_reader *r, palette_visitor_fn_t visit); -typedef void 
(*block_visitor_fn_t)(AV2Decoder *const pbi, ThreadData *const td, +typedef void (*block_visitor_fn_t)(DB_ONLY(const int depth) + AV2Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, avm_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize, PARTITION_TREE *parent, int index); diff --git a/av2/decoder/decodetxb.c b/av2/decoder/decodetxb.c index 8db11062fc..022211689d 100644 --- a/av2/decoder/decodetxb.c +++ b/av2/decoder/decodetxb.c @@ -115,7 +115,11 @@ static int read_adaptive_hr(MACROBLOCKD *xd, avm_reader *r, int ctx) { } // Read high range part of coeff -static INLINE int read_high_range(MACROBLOCKD *xd, avm_reader *r, int tcq_mode, +static INLINE int read_high_range(DB_ONLY(const int depth, + const AV2_COMMON *const cm, + const int mi_row, + const int mi_col, const int c) + MACROBLOCKD *xd, avm_reader *r, int tcq_mode, int level, int lf, int *hr_avg, int plane) { int max_br = lf ? (plane == 0 ? LF_MAX_BASE_BR_RANGE // 8 : LF_NUM_BASE_LEVELS + 1) @@ -126,6 +130,8 @@ static INLINE int read_high_range(MACROBLOCKD *xd, avm_reader *r, int tcq_mode, if (use_hr) { int hr = read_adaptive_hr(xd, r, hr_level_avg); level += hr << (tcq_mode ? 
1 : 0); + DEBUG_CF_printf("%*sPost-residual[pos=%d,%d->%d]: r=%d\n", + depth, "", c, hr, level, r->ec.rng); *hr_avg = (hr_level_avg + hr) >> 1; } return level; @@ -159,7 +165,9 @@ static int read_low_range(avm_reader *r, avm_cdf_prob *cdf) { return br_level; } -static INLINE void read_coeffs_reverse_2d( +static INLINE void read_coeffs_reverse_2d(DB_ONLY(const int depth, + const AV2_COMMON *const cm, const TX_SIZE txs_ctx, + const int mi_row, const int mi_col) avm_reader *r, int start_si, int end_si, const int16_t *scan, int bwl, uint8_t *levels, base_lf_cdf_arr base_lf_cdf, br_cdf_arr br_lf_cdf, int plane, base_cdf_arr base_cdf, br_cdf_arr br_cdf, @@ -173,50 +181,60 @@ static INLINE void read_coeffs_reverse_2d( int limits = get_lf_limits(row, col, 0, plane); int q_i = tcq_quant(*state); if (plane > 0) { + int br_ctx = -1, coeff_ctx; if (limits) { - const int coeff_ctx = + coeff_ctx = get_lower_levels_ctx_lf_2d_chroma(levels, pos, bwl, plane); level += avm_read_symbol(r, base_lf_uv_cdf[coeff_ctx], LF_BASE_SYMBOLS, ACCT_INFO("level", "base_lf_uv_cdf")); } else { - const int coeff_ctx = + coeff_ctx = get_lower_levels_ctx_2d_chroma(levels, pos, bwl, plane); level += avm_read_symbol(r, base_uv_cdf[coeff_ctx], 4, ACCT_INFO("level", "base_uv_cdf")); if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_2d_chroma(levels, pos, bwl); + br_ctx = get_br_ctx_2d_chroma(levels, pos, bwl); avm_cdf_prob *cdf = br_uv_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-tok[pos=%d,ctx=%d|%d|%d|%d,freq=%s,plane=uv,%d]: r=%d\n", + depth, "", c, txs_ctx, coeff_ctx, q_i, br_ctx, + limits ? 
"lo" : "hi", level, r->ec.rng); } else { + int br_ctx = -1, coeff_ctx; if (limits) { - const int coeff_ctx = get_lower_levels_ctx_lf_2d(levels, pos, bwl); + coeff_ctx = get_lower_levels_ctx_lf_2d(levels, pos, bwl); level += avm_read_symbol(r, base_lf_cdf[coeff_ctx][q_i], LF_BASE_SYMBOLS, ACCT_INFO("level", "base_lf_cdf")); if (level > LF_NUM_BASE_LEVELS) { - const int br_ctx = get_br_lf_ctx_2d(levels, pos, bwl); + br_ctx = get_br_lf_ctx_2d(levels, pos, bwl); avm_cdf_prob *cdf = br_lf_cdf[br_ctx]; level += read_low_range(r, cdf); } } else { - const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, plane); + coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, plane); level += avm_read_symbol(r, base_cdf[coeff_ctx][q_i], 4, ACCT_INFO("level", "base_cdf")); if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_2d(levels, pos, bwl); + br_ctx = get_br_ctx_2d(levels, pos, bwl); avm_cdf_prob *cdf = br_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-tok[pos=%d,ctx=%d|%d|%d|%d,freq=%s,plane=y,%d]: r=%d\n", + depth, "", c, txs_ctx, coeff_ctx, q_i, br_ctx, limits ? 
"lo" : "hi", + level, r->ec.rng); } levels[get_padded_idx(pos, bwl)] = level; *state = tcq_next_state(*state, level); } } -static INLINE void read_coeffs_reverse( +static INLINE void read_coeffs_reverse(DB_ONLY(const int depth, + const AV2_COMMON *const cm, const TX_SIZE txs_ctx, + const int mi_row, const int mi_col) avm_reader *r, TX_CLASS tx_class, int start_si, int end_si, const int16_t *scan, int bwl, uint8_t *levels, base_lf_cdf_arr base_lf_cdf, br_cdf_arr br_lf_cdf, int plane, base_cdf_arr base_cdf, br_cdf_arr br_cdf, @@ -230,53 +248,69 @@ static INLINE void read_coeffs_reverse( const int col = pos - (row << bwl); int limits = get_lf_limits(row, col, tx_class, plane); if (plane > 0) { + int br_ctx = -1,coeff_ctx; if (limits) { - const int coeff_ctx = + coeff_ctx = get_lower_levels_lf_ctx_chroma(levels, pos, bwl, tx_class, plane); level += avm_read_symbol(r, base_lf_uv_cdf[coeff_ctx], LF_BASE_SYMBOLS, ACCT_INFO("level", "base_lf_uv_cdf")); } else { - const int coeff_ctx = + coeff_ctx = get_lower_levels_ctx_chroma(levels, pos, bwl, tx_class, plane); level += avm_read_symbol(r, base_uv_cdf[coeff_ctx], 4, ACCT_INFO("level", "base_uv_cdf")); if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_chroma(levels, pos, bwl, tx_class); + br_ctx = get_br_ctx_chroma(levels, pos, bwl, tx_class); avm_cdf_prob *cdf = br_uv_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-%stok[pos=%d,ctx=%d|%d|%d|%d,freq=%s,plane=uv,%d]: r=%d\n", + depth, "", c == 0 ? "dc_" : "", + c, txs_ctx, coeff_ctx, q_i, br_ctx, limits ? 
"lo" : "hi", + level, r->ec.rng); } else { + int br_ctx = -1, coeff_ctx; if (limits) { - const int coeff_ctx = + coeff_ctx = get_lower_levels_lf_ctx(levels, pos, bwl, tx_class); level += avm_read_symbol(r, base_lf_cdf[coeff_ctx][q_i], LF_BASE_SYMBOLS, ACCT_INFO("level", "base_lf_cdf")); if (level > LF_NUM_BASE_LEVELS) { - const int br_ctx = get_br_lf_ctx(levels, pos, bwl, tx_class); + br_ctx = get_br_lf_ctx(levels, pos, bwl, tx_class); avm_cdf_prob *cdf = br_lf_cdf[br_ctx]; level += read_low_range(r, cdf); } } else { - const int coeff_ctx = + coeff_ctx = get_lower_levels_ctx(levels, pos, bwl, tx_class, plane); level += avm_read_symbol(r, base_cdf[coeff_ctx][q_i], 4, ACCT_INFO("level", "base_cdf")); if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + br_ctx = get_br_ctx(levels, pos, bwl, tx_class); avm_cdf_prob *cdf = br_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-%stok[pos=%d,ctx=%d|%d|%d|%d,freq=%s,plane=y,%d]: r=%d\n", + depth, "", c == 0 ? "dc_" : "", + c, txs_ctx, coeff_ctx, q_i, br_ctx, limits ? 
"lo" : "hi", + level, r->ec.rng); } levels[get_padded_idx(pos, bwl)] = level; *state = tcq_next_state(*state, level); } } -static INLINE void read_coeffs_forward_2d(avm_reader *r, int start_si, +static INLINE void read_coeffs_forward_2d(DB_ONLY(const int depth, + const AV2_COMMON *const cm, + const int mi_row, + const int mi_col, + const int plane, + const int size_ctx) + avm_reader *r, int start_si, int end_si, const int16_t *scan, int bwl, uint8_t *levels, base_fsc_cdf_arr base_cdf, @@ -287,17 +321,23 @@ static INLINE void read_coeffs_forward_2d(avm_reader *r, int start_si, const int nsymbs = 4; int level = avm_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_INFO("level", "base_cdf")); + int br_ctx = -1; if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_skip(levels, pos, bwl); + br_ctx = get_br_ctx_skip(levels, pos, bwl); avm_cdf_prob *cdf = br_cdf[br_ctx]; level += read_low_range(r, cdf); } + DEBUG_CF_printf("%*sPost-tok[pos=%d,ctx=%d|%d|%d,plane=%s,%d]: r=%d\n", + depth, "", c, size_ctx, coeff_ctx, br_ctx, + plane ? "uv" : "y", level, r->ec.rng); levels[get_padded_idx_left(pos, bwl)] = level; } } // Decode the end-of-block syntax. 
-static INLINE void decode_eob(DecoderCodingBlock *dcb, avm_reader *const r, +static INLINE int decode_eob(DB_ONLY(const int depth, const AV2_COMMON *const cm, + int mi_row, int mi_col) + DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const TX_SIZE tx_size #if CONFIG_PARAKIT_COLLECT_DATA , @@ -402,6 +442,8 @@ static INLINE void decode_eob(DecoderCodingBlock *dcb, avm_reader *const r, eob_pt += 1; break; } + DEBUG_CF_printf("%*sPost-eob_bin_%d[ctx=%d,%d]: r=%d\n", + depth, "", 16 << eob_multi_size, pl_ctx, eob_pt - 1, r->ec.rng); const int eob_offset_bits = av2_eob_offset_bits[eob_pt]; if (eob_offset_bits > 0) { @@ -414,10 +456,23 @@ static INLINE void decode_eob(DecoderCodingBlock *dcb, avm_reader *const r, avm_read_literal(r, eob_offset_bits - 1, ACCT_INFO("eob_extra")); } *eob = rec_eob_pos(eob_pt, eob_extra); + if (eob_offset_bits > 0) + DEBUG_CF_printf("%*sPost-eob[%d]: r=%d\n", depth, "", *eob - 1, r->ec.rng); *bob = *eob; // escape character + return *eob; } -uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, +#if DEBUG_BLOCK_INFO +static const char *const dav2d_tx1d_names[] = { + [DCT_1D] = "dct", + [ADST_1D] = "adst", + [FLIPADST_1D] = "flipadst", + [IDTX_1D] = "identity", +}; +#endif + +int av2_read_sig_txtype(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int blk_row, const int blk_col, const int plane, const TXB_CTX *const txb_ctx, @@ -436,9 +491,10 @@ uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, int txb_skip_ctx = txb_ctx->txb_skip_ctx; int all_zero; + int pred_mode_ctx = -1; if (plane == AVM_PLANE_Y || plane == AVM_PLANE_U) { MB_MODE_INFO *const mbmi = xd->mi[0]; - const int pred_mode_ctx = + pred_mode_ctx = (is_inter || mbmi->fsc_mode[xd->tree_type == CHROMA_PART]) ? 
1 : 0; all_zero = avm_read_symbol( r, ec_ctx->txb_skip_cdf[pred_mode_ctx][txs_ctx][txb_skip_ctx], 2, @@ -448,6 +504,12 @@ uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, all_zero = avm_read_symbol(r, ec_ctx->v_txb_skip_cdf[txb_skip_ctx], 2, ACCT_INFO("all_zero", "plane_v")); } +#if DEBUG_BLOCK_INFO + const int mi_row = blk_row + xd->mi_row, mi_col = blk_col + xd->mi_col; + DEBUG_CF_printf("%*sPost-all_zero[ctx=%d|%d|%d,%d]: r=%d\n", + depth, "", pred_mode_ctx, txs_ctx, txb_skip_ctx, all_zero, + r->ec.rng); +#endif #if CONFIG_INSPECTION MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -485,7 +547,11 @@ uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, return 0; } - decode_eob(dcb, r, plane, tx_size +#if DEBUG_BLOCK_INFO + const int eobval = +#endif + decode_eob(DB_ONLY(depth, cm, mi_row, mi_col) + dcb, r, plane, tx_size #if CONFIG_PARAKIT_COLLECT_DATA , cm @@ -493,11 +559,22 @@ uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, ); av2_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r, plane, *eob, is_inter ? 0 : *eob); +#if DEBUG_BLOCK_INFO + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE txtp = av2_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + is_reduced_tx_set_used(cm, plane_type)); + const int ll = xd->lossless[mbmi->segment_id]; + DEBUG_CF_printf("%*sPost-txtp[%s/%s]: r=%d\n", + depth, "", + ll && txtp == DCT_DCT ? "wht" : dav2d_tx1d_names[htx_tab[txtp]], + ll && txtp == DCT_DCT ? "wht" : dav2d_tx1d_names[vtx_tab[txtp]], + r->ec.rng); +#endif if (plane == AVM_PLANE_U && is_cctx_allowed(cm, xd)) { const int skip_cctx = is_inter ? 
0 : (*eob == 1); if (!all_zero && !skip_cctx) { - av2_read_cctx_type(cm, xd, blk_row, blk_col, tx_size, r); + av2_read_cctx_type(DB_ONLY(depth) cm, xd, blk_row, blk_col, tx_size, r); } else { int row_offset, col_offset; get_chroma_mi_offsets(xd, &row_offset, &col_offset); @@ -505,10 +582,15 @@ uint8_t av2_read_sig_txtype(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, CCTX_NONE); } } +#if DEBUG_BLOCK_INFO + return eobval; +#else return 1; +#endif } -uint8_t av2_read_coeffs_txb_skip(const AV2_COMMON *const cm, +uint8_t av2_read_coeffs_txb_skip(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int blk_row, const int blk_col, const int plane, const TX_SIZE tx_size) { @@ -553,6 +635,9 @@ uint8_t av2_read_coeffs_txb_skip(const AV2_COMMON *const cm, eob_info *bob_data = dcb->bob_data[plane] + dcb->txb_offset[plane]; bob_data->max_scan_line = 0; +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row + blk_row, mi_col = xd->mi_col + blk_col; +#endif const TX_TYPE tx_type = av2_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, is_reduced_tx_set_used(cm, plane_type)); @@ -582,14 +667,19 @@ uint8_t av2_read_coeffs_txb_skip(const AV2_COMMON *const cm, int level = avm_read_symbol(r, cdf_bob, nsymbs_bob, ACCT_INFO("level", "cdf_bob")) + 1; + int br_ctx = -1; if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_skip(levels, pos, bwl); + br_ctx = get_br_ctx_skip(levels, pos, bwl); avm_cdf_prob *cdf = br_cdf[br_ctx]; level += read_low_range(r, cdf); } + DEBUG_CF_printf("%*sPost-bob_tok[pos=%d,ctx=%d|%d|%d,plane=%s,%d]: r=%d\n", + depth, "", bob, size_ctx, coeff_ctx_bob, br_ctx, + plane ? 
"uv" : "y", level, r->ec.rng); levels[get_padded_idx_left(pos, bwl)] = level; } - read_coeffs_forward_2d(r, bob + 1, eob_data->eob - 1, scan, bwl, levels, + read_coeffs_forward_2d(DB_ONLY(depth, cm, mi_row, mi_col, plane, size_ctx) + r, bob + 1, eob_data->eob - 1, scan, bwl, levels, base_cdf, br_cdf); } @@ -605,8 +695,12 @@ uint8_t av2_read_coeffs_txb_skip(const AV2_COMMON *const cm, int idtx_sign_ctx = get_sign_ctx_skip(signs, levels, pos, bwl); sign = avm_read_symbol(r, ec_ctx->idtx_sign_cdf[size_ctx][idtx_sign_ctx], 2, ACCT_INFO("sign")); + DEBUG_CF_printf("%*sPost-sign[pos=%d,ctx=%d|%d,plane=%s,%d]: r=%d\n", + depth, "", c, size_ctx, idtx_sign_ctx, plane ? "uv" : "y", sign, + r->ec.rng); signs[sign_idx] = sign > 0 ? -1 : 1; - level = read_high_range(xd, r, 0, level, 0, &hr_level_avg, plane); + level = read_high_range(DB_ONLY(depth, cm, mi_row, mi_col, c) + xd, r, 0, level, 0, &hr_level_avg, plane); if (c == 0) dc_val = sign ? -level : level; // Bitmasking to clamp level to valid range: // The valid range for 8/10/12 bit video is at most 14/16/18 bit @@ -655,7 +749,8 @@ static INLINE tran_low_t read_coeff_hidden(avm_reader *r, TX_CLASS tx_class, return level; } -uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, +uint8_t av2_read_coeffs_txb(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int blk_row, const int blk_col, const int plane, const TXB_CTX *const txb_ctx, @@ -696,6 +791,10 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, uint16_t *const eob = &(eob_data->eob); uint16_t *const max_scan_line = &(eob_data->max_scan_line); +#if DEBUG_BLOCK_INFO + const int mi_row = blk_row + xd->mi_row, mi_col = blk_col + xd->mi_col; +#endif + const TX_TYPE tx_type = av2_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, is_reduced_tx_set_used(cm, plane_type)); @@ -717,7 +816,7 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, 
DecoderCodingBlock *dcb, ? (*eob > 3 && cm->seq_params.enable_inter_ist) : (*eob != 1 && cm->seq_params.enable_ist && !xd->mi[0]->fsc_mode[xd->tree_type == CHROMA_PART]))) { - av2_read_sec_tx_type(cm, xd, blk_row, blk_col, tx_size, eob, r); + av2_read_sec_tx_type(DB_ONLY(depth) cm, xd, blk_row, blk_col, tx_size, eob, r); } if (*eob > 1) { @@ -740,6 +839,7 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, const int col = pos - (row << bwl); int limits = get_lf_limits(row, col, tx_class, plane); if (plane > 0) { + int br_ctx = -1; if (limits) { avm_cdf_prob *cdf = ec_ctx->coeff_base_lf_eob_uv_cdf[coeff_ctx]; level += @@ -753,12 +853,16 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, 1; if (level > NUM_BASE_LEVELS) { - const int br_ctx = 0; /* get_lf_ctx_eob */ + br_ctx = 0; /* get_lf_ctx_eob */ cdf = ec_ctx->coeff_br_uv_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-eob_tok[pos=%d,ctx=%d|%d|%d,freq=%s,plane=uv,%d]: r=%d\n", + depth, "", c, txs_ctx, coeff_ctx, br_ctx, limits ? 
"lo" : "hi", + level, r->ec.rng); } else { + int br_ctx = -1; if (limits) { avm_cdf_prob *cdf = ec_ctx->coeff_base_lf_eob_cdf[txs_ctx][coeff_ctx]; level += avm_read_symbol(r, cdf, LF_BASE_SYMBOLS - 1, @@ -766,7 +870,7 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, 1; if (level > LF_NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx_lf_eob(pos, tx_class); + br_ctx = get_br_ctx_lf_eob(pos, tx_class); cdf = ec_ctx->coeff_br_lf_cdf[br_ctx]; level += read_low_range(r, cdf); } @@ -777,11 +881,14 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, 1; if (level > NUM_BASE_LEVELS) { - const int br_ctx = 0; /* get_lf_ctx_eob */ + br_ctx = 0; /* get_lf_ctx_eob */ cdf = ec_ctx->coeff_br_cdf[br_ctx]; level += read_low_range(r, cdf); } } + DEBUG_CF_printf("%*sPost-eob_tok[pos=%d,ctx=%d|%d|%d,freq=%s,plane=y,%d]: r=%d\n", + depth, "", c, txs_ctx, coeff_ctx, br_ctx, limits ? "lo" : "hi", + level, r->ec.rng); } levels[get_padded_idx(pos, bwl)] = level; state = tcq_next_state(state, level); @@ -803,7 +910,8 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, br_cdf_arr br_uv_cdf = ec_ctx->coeff_br_uv_cdf; if (tx_class == TX_CLASS_2D) { - read_coeffs_reverse_2d(r, 1, *eob - 2, scan, bwl, levels, base_lf_cdf, + read_coeffs_reverse_2d(DB_ONLY(depth, cm, txs_ctx, mi_row, mi_col) + r, 1, *eob - 2, scan, bwl, levels, base_lf_cdf, br_lf_cdf, plane, base_cdf, br_cdf, base_lf_uv_cdf, base_uv_cdf, br_uv_cdf, &state); if (enable_parity_hiding) { @@ -822,12 +930,14 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, read_coeff_hidden(r, tx_class, scan, bwl, levels, (sum_abs1 & 1), ec_ctx->coeff_base_ph_cdf); } else { - read_coeffs_reverse(r, tx_class, 0, 0, scan, bwl, levels, base_lf_cdf, + read_coeffs_reverse(DB_ONLY(depth, cm, txs_ctx, mi_row, mi_col) + r, tx_class, 0, 0, scan, bwl, levels, base_lf_cdf, br_lf_cdf, plane, base_cdf, br_cdf, base_lf_uv_cdf, 
base_uv_cdf, br_uv_cdf, &state); } } else { - read_coeffs_reverse(r, tx_class, 1, *eob - 2, scan, bwl, levels, + read_coeffs_reverse(DB_ONLY(depth, cm, txs_ctx, mi_row, mi_col) + r, tx_class, 1, *eob - 2, scan, bwl, levels, base_lf_cdf, br_lf_cdf, plane, base_cdf, br_cdf, base_lf_uv_cdf, base_uv_cdf, br_uv_cdf, &state); if (enable_parity_hiding) { @@ -846,7 +956,8 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, read_coeff_hidden(r, tx_class, scan, bwl, levels, (sum_abs1 & 1), ec_ctx->coeff_base_ph_cdf); } else { - read_coeffs_reverse(r, tx_class, 0, 0, scan, bwl, levels, base_lf_cdf, + read_coeffs_reverse(DB_ONLY(depth, cm, txs_ctx, mi_row, mi_col) + r, tx_class, 0, 0, scan, bwl, levels, base_lf_cdf, br_lf_cdf, plane, base_cdf, br_cdf, base_lf_uv_cdf, base_uv_cdf, br_uv_cdf, &state); } @@ -877,27 +988,38 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, r, ec_ctx->dc_sign_cdf[plane_type][is_hidden ? 1 : 0][dc_sign_ctx], 2, ACCT_INFO("sign", "dc_sign_cdf", "plane_y_or_u")); + DEBUG_CF_printf("%*sPost-dc_sign[pos=%d,ctx=%d,%d]: r=%d\n", + depth, "", c, dc_sign_ctx, sign, r->ec.rng); } else { sign = avm_read_literal( r, 1, ACCT_INFO("sign", "dc_sign_cdf", "plane_y_or_u")); + DEBUG_CF_printf("%*sPost-dc_sign[pos=%d,%d]: r=%d\n", + depth, "", c, sign, r->ec.rng); } } else { sign = avm_read_literal( r, 1, ACCT_INFO("sign", "v_dc_sign_cdf", "plane_v")); + DEBUG_CF_printf("%*sPost-dc_sign[pos=%d,%d]: r=%d\n", + depth, "", c, sign, r->ec.rng); } if (plane == AVM_PLANE_U) xd->tmp_sign[tmp_sign_idx] = (sign ? 
2 : 1); } else { sign = avm_read_bit(r, ACCT_INFO("sign")); + DEBUG_CF_printf("%*sPost-sign[pos=%d,%d]: r=%d\n", + depth, "", c, sign, r->ec.rng); } if (is_hidden && c == 0) { if (level >= ((NUM_BASE_LEVELS + 1) << 1)) { int hr_level = (read_adaptive_hr(xd, r, hr_level_avg >> 1) << 1); + DEBUG_CF_printf("%*sPost-residual[pos=%d,%d->%d]: r=%d\n", + depth, "", c, pos, hr_level, r->ec.rng); level += hr_level; hr_level_avg = (hr_level_avg + hr_level) >> 1; } } else { int limits = get_lf_limits(row, col, tx_class, plane); - level = read_high_range(xd, r, tcq_mode, level, limits, &hr_level_avg, + level = read_high_range(DB_ONLY(depth, cm, mi_row, mi_col, c) + xd, r, tcq_mode, level, limits, &hr_level_avg, plane); } if (c == 0) dc_val = sign ? -level : level; @@ -976,7 +1098,8 @@ uint8_t av2_read_coeffs_txb(const AV2_COMMON *const cm, DecoderCodingBlock *dcb, return cul_level; } -void av2_read_coeffs_txb_facade(const AV2_COMMON *const cm, +void av2_read_coeffs_txb_facade(DB_ONLY(const int depth) + const AV2_COMMON *const cm, DecoderCodingBlock *dcb, avm_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { @@ -995,9 +1118,17 @@ void av2_read_coeffs_txb_facade(const AV2_COMMON *const cm, pd->left_entropy_context + row, &txb_ctx, mbmi->fsc_mode[xd->tree_type == CHROMA_PART] && cm->seq_params.enable_fsc); +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row + row, mi_col = xd->mi_col + col; + DEBUG_CF_printf("%*sdecode_cf[y=%d,x=%d,pl=%d,tx=%dx%d]: r=%d\n", + depth - 1, "", xd->mi_row + (row << pd->subsampling_y), + xd->mi_col + (col << pd->subsampling_x), plane, + tx_size_wide[tx_size], tx_size_high[tx_size], r->ec.rng); +#endif - const uint8_t decode_rest = - av2_read_sig_txtype(cm, dcb, r, row, col, plane, &txb_ctx, tx_size); + const int decode_rest = + av2_read_sig_txtype(DB_ONLY(depth) + cm, dcb, r, row, col, plane, &txb_ctx, tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); const TX_TYPE tx_type = @@ -1011,10 +1142,12 @@ 
void av2_read_coeffs_txb_facade(const AV2_COMMON *const cm, get_primary_tx_type(tx_type) == IDTX && plane == PLANE_TYPE_Y) || use_inter_fsc(cm, plane, tx_type, is_inter))) { cul_level = - av2_read_coeffs_txb_skip(cm, dcb, r, row, col, plane, tx_size); + av2_read_coeffs_txb_skip(DB_ONLY(depth) + cm, dcb, r, row, col, plane, tx_size); } else { cul_level = - av2_read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size); + av2_read_coeffs_txb(DB_ONLY(depth) + cm, dcb, r, row, col, plane, &txb_ctx, tx_size); } } else { av2_update_txk_skip_array(cm, xd->mi_row, xd->mi_col, xd->tree_type, @@ -1028,7 +1161,16 @@ void av2_read_coeffs_txb_facade(const AV2_COMMON *const cm, is_reduced_tx_set_used(cm, plane_type)); update_txk_array(xd, row, col, tx_size, tx_type_inter); } - +#if DEBUG_BLOCK_INFO + const TX_TYPE txtp = tx_type * (plane == AVM_PLANE_Y || decode_rest > 0); + const int ll = xd->lossless[mbmi->segment_id]; + DEBUG_BLOCK_printf("%*sPost-%c_cf_blk[tx=%dx%d,txtp=%s/%s,eob=%d]: r=%d\n", + depth, "", "yuv"[plane], + tx_size_wide[tx_size], tx_size_high[tx_size], + ll && txtp == DCT_DCT ? "wht" : dav2d_tx1d_names[htx_tab[txtp]], + ll && txtp == DCT_DCT ? 
"wht" : dav2d_tx1d_names[vtx_tab[txtp]], + decode_rest - 1, r->ec.rng); +#endif #if TXCOEFF_TIMER avm_usec_timer_mark(&timer); const int64_t elapsed_time = avm_usec_timer_elapsed(&timer); diff --git a/av2/decoder/decodetxb.h b/av2/decoder/decodetxb.h index 8c76e5734a..460cba70c4 100644 --- a/av2/decoder/decodetxb.h +++ b/av2/decoder/decodetxb.h @@ -15,12 +15,15 @@ #include "av2/common/enums.h" +#include "avm/debug.h" + struct avm_reader; struct AV2Common; struct DecoderCodingBlock; struct txb_ctx; -uint8_t av2_read_coeffs_txb(const struct AV2Common *const cm, +uint8_t av2_read_coeffs_txb(DB_ONLY(const int depth) + const struct AV2Common *const cm, struct DecoderCodingBlock *dcb, struct avm_reader *const r, const int blk_row, const int blk_col, const int plane, @@ -29,20 +32,23 @@ uint8_t av2_read_coeffs_txb(const struct AV2Common *const cm, ); -void av2_read_coeffs_txb_facade(const struct AV2Common *const cm, +void av2_read_coeffs_txb_facade(DB_ONLY(const int depth) + const struct AV2Common *const cm, struct DecoderCodingBlock *dcb, struct avm_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size); -uint8_t av2_read_sig_txtype(const struct AV2Common *const cm, +int av2_read_sig_txtype(DB_ONLY(const int depth) + const struct AV2Common *const cm, struct DecoderCodingBlock *dcb, struct avm_reader *const r, const int blk_row, const int blk_col, const int plane, const struct txb_ctx *const txb_ctx, const TX_SIZE tx_size); -uint8_t av2_read_coeffs_txb_skip(const struct AV2Common *const cm, +uint8_t av2_read_coeffs_txb_skip(DB_ONLY(const int depth) + const struct AV2Common *const cm, struct DecoderCodingBlock *dcb, struct avm_reader *const r, const int blk_row, const int blk_col, const int plane, diff --git a/av2/decoder/detokenize.c b/av2/decoder/detokenize.c index 89ab0a121d..99ba3f988b 100644 --- a/av2/decoder/detokenize.c +++ b/av2/decoder/detokenize.c @@ -109,7 +109,8 @@ static int decode_color_map_tokens(Av2ColorMapParam *param, 
avm_reader *r) { return 1; } -void av2_decode_palette_tokens(MACROBLOCKD *const xd, int plane, +void av2_decode_palette_tokens(DB_ONLY(AV2_COMMON *cm, const int depth) + MACROBLOCKD *const xd, int plane, avm_reader *r) { assert(plane == 0 || plane == 1); Av2ColorMapParam params; @@ -127,4 +128,9 @@ void av2_decode_palette_tokens(MACROBLOCKD *const xd, int plane, avm_internal_error(xd->error_info, AVM_CODEC_ERROR, "Error decoding palette tokens"); } +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row, mi_col = xd->mi_col; + DEBUG_BLOCK_printf("%*sPost-y-pal-indices: r=%d\n", + depth, "", r->ec.rng); +#endif } diff --git a/av2/decoder/detokenize.h b/av2/decoder/detokenize.h index 56da03552b..b89610dbdc 100644 --- a/av2/decoder/detokenize.h +++ b/av2/decoder/detokenize.h @@ -22,7 +22,8 @@ extern "C" { #endif -void av2_decode_palette_tokens(MACROBLOCKD *const xd, int plane, avm_reader *r); +void av2_decode_palette_tokens(DB_ONLY(AV2_COMMON *cm, const int depth) + MACROBLOCKD *const xd, int plane, avm_reader *r); #ifdef __cplusplus } // extern "C" diff --git a/av2/encoder/rdopt.c b/av2/encoder/rdopt.c index 1de2128892..bdd4cddac2 100644 --- a/av2/encoder/rdopt.c +++ b/av2/encoder/rdopt.c @@ -814,7 +814,7 @@ static AVM_INLINE void setup_buffer_ref_mvs_inter( } // Gets an initial list of candidate vectors from neighbours and orders them - av2_find_mv_refs( + av2_find_mv_refs(DB_ONLY(0) cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, xd->warp_param_stack, ref_frame < INTER_REFS_PER_FRAME ? 
MAX_WARP_REF_CANDIDATES : 0, @@ -5621,7 +5621,8 @@ static int64_t rd_pick_intrabc_mode_sb(const AV2_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; mbmi->use_intrabc[xd->tree_type == CHROMA_PART] = 1; - av2_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + av2_find_mv_refs(DB_ONLY(0) + cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, NULL, 0, NULL); mbmi->use_intrabc[xd->tree_type == CHROMA_PART] = 0; @@ -6376,7 +6377,8 @@ static AVM_INLINE void rd_pick_skip_mode( const uint8_t ref_frame_type = av2_ref_frame_type(mbmi->ref_frame); - av2_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + av2_find_mv_refs(DB_ONLY(0) + cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, NULL, 0, NULL); @@ -6981,7 +6983,7 @@ static AVM_INLINE void set_params_rd_pick_inter_mode( // Ref mv list population is not required, when compound references are // pruned. if (prune_ref_frame(cpi, x, ref_frame)) continue; - av2_find_mv_refs( + av2_find_mv_refs(DB_ONLY(0) cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, xd->warp_param_stack, ref_frame < INTER_REFS_PER_FRAME ? MAX_WARP_REF_CANDIDATES : 0, diff --git a/avm/debug.h b/avm/debug.h index b605a9125c..ae277ab7a6 100644 --- a/avm/debug.h +++ b/avm/debug.h @@ -31,5 +31,21 @@ #define DEBUG_OBU_HDR 0 #define DEBUG_SEQ_HDR 0 #define DEBUG_FRAME_HDR 0 +#define DEBUG_BLOCK_INFO 0 +#if DEBUG_BLOCK_INFO +#define DB_ONLY(x...) x, +#define BLOCK_TO_DEBUG \ + (cm->current_frame.frame_number == 0 && \ + mi_row >= 0 && mi_row < 16 && mi_col >= 0 && mi_col < 16) +#define DEBUG_BLOCK_printf(fmt...) \ + if (BLOCK_TO_DEBUG) printf(fmt) +#define DEBUG_CF_printf(fmt...) \ + if (BLOCK_TO_DEBUG && plane > -1 && 1) printf(fmt) +#else +#define DB_ONLY(x...) 
+#define BLOCK_TO_DEBUG 0 +#define DEBUG_BLOCK_printf(fmt...) do { } while (0) +#define DEBUG_CF_printf(fmt...) do { } while (0) +#endif #endif /* AVM_DEBUG_H */ -- 2.48.1 0004-Print-Intra-Prediction-DQ-coeffs-and-inverse-transfo.patch000066400000000000000000000621731517466257200347240ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patchesFrom c1dc48906ac7fadc2fd1f2af9b6ffe446af4d565 Mon Sep 17 00:00:00 2001 From: Luc Trudeau Date: Wed, 17 Sep 2025 16:53:29 -0400 Subject: [PATCH 4/5] Print Intra Prediction, DQ coeffs and inverse transform --- av2/common/idct.c | 19 +++- av2/common/idct.h | 7 +- av2/common/reconintra.c | 189 ++++++++++++++++++++++++++++++++++++-- av2/common/reconintra.h | 3 +- av2/common/tip.c | 15 +++ av2/decoder/decodeframe.c | 41 ++++++++- av2/encoder/encodemb.c | 10 +- av2/encoder/tpl_model.c | 2 +- av2/encoder/tx_search.c | 8 +- avm/debug.h | 49 ++++++++++ 10 files changed, 319 insertions(+), 24 deletions(-) diff --git a/av2/common/idct.c b/av2/common/idct.c index 9af6944168..699c7c3b2f 100644 --- a/av2/common/idct.c +++ b/av2/common/idct.c @@ -995,7 +995,8 @@ static void av2_highbd_inv_txfm_add_master(const tran_low_t *input, av2_highbd_inv_txfm_add(input, dest, stride, txfm_param); } -void av2_inverse_transform_block(const MACROBLOCKD *xd, +void av2_inverse_transform_block(DB_ONLY(const int print) + const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint16_t *dst, int stride, int eob, @@ -1016,7 +1017,7 @@ void av2_inverse_transform_block(const MACROBLOCKD *xd, DECLARE_ALIGNED(32, tran_low_t, temp_dqcoeff[MAX_TX_SQUARE]); memcpy(temp_dqcoeff, dqcoeff, sizeof(tran_low_t) * tx_size_2d[tx_size]); - av2_inv_stxfm(temp_dqcoeff, &txfm_param); + av2_inv_stxfm(DB_ONLY(print) temp_dqcoeff, &txfm_param); MB_MODE_INFO *const mbmi = xd->mi[0]; if (xd->lossless[mbmi->segment_id]) { @@ -1070,7 +1071,8 @@ void inv_stxfm_c(tran_low_t *src, tran_low_t *dst, const PREDICTION_MODE 
mode, } } -void av2_inv_stxfm(tran_low_t *coeff, TxfmParam *txfm_param) { +void av2_inv_stxfm(DB_ONLY(const int print) + tran_low_t *coeff, TxfmParam *txfm_param) { const TX_TYPE stx_type = txfm_param->sec_tx_type; const int width = tx_size_wide[txfm_param->tx_size] <= 32 @@ -1102,6 +1104,12 @@ void av2_inv_stxfm(tran_low_t *coeff, TxfmParam *txfm_param) { *tmp = src[scan[r]]; tmp++; } +#if DEBUG_BLOCK_INFO + if (print) { + coef_dump(buf0, 8, 8, 3, "dq"); + } +#endif + int8_t transpose = 0; mode = AVMMIN(intra_mode, SMOOTH_H_PRED); if ((mode == H_PRED) || (mode == D157_PRED) || (mode == D67_PRED) || @@ -1161,5 +1169,10 @@ void av2_inv_stxfm(tran_low_t *coeff, TxfmParam *txfm_param) { fprintf(stderr, "\n"); } #endif // STX_COEFF_DEBUG +#if DEBUG_B_PIXELS + if (print) { + coef_dump_transpose(coeff, width, height, 3, "stx"); + } +#endif } } diff --git a/av2/common/idct.h b/av2/common/idct.h index 7e4c02b15c..1cf9317670 100644 --- a/av2/common/idct.h +++ b/av2/common/idct.h @@ -21,6 +21,8 @@ #include "av2/common/enums.h" #include "avm_dsp/txfm_common.h" +#include "avm/debug.h" + #ifdef __cplusplus extern "C" { #endif @@ -35,7 +37,8 @@ void av2_inv_cross_chroma_tx_block(tran_low_t *dqcoeff_c1, tran_low_t *dqcoeff_c2, TX_SIZE tx_size, CctxType cctx_type, const int bd); -void av2_inverse_transform_block(const MACROBLOCKD *xd, +void av2_inverse_transform_block(DB_ONLY(int print) + const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint16_t *dst, int stride, int eob, @@ -49,7 +52,7 @@ void av2_highbd_iwht4x4_horz_add(const tran_low_t *input, uint16_t *dest, void av2_highbd_iwht4x4_vert_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void av2_inv_stxfm(tran_low_t *coeff, TxfmParam *txfm_param); +void av2_inv_stxfm(DB_ONLY(int print) tran_low_t *coeff, TxfmParam *txfm_param); #ifdef __cplusplus } // extern "C" diff --git a/av2/common/reconintra.c b/av2/common/reconintra.c index be6528d678..b40226d390 
100644 --- a/av2/common/reconintra.c +++ b/av2/common/reconintra.c @@ -27,6 +27,8 @@ #include "av2/common/reconintra.h" #include +#include "avm/debug.h" + enum { NEED_LEFT = 1 << 1, NEED_ABOVE = 1 << 2, @@ -1071,7 +1073,7 @@ void av2_highbd_ibp_dr_prediction_z3_c( } } -void av2_build_intra_predictors_high( +void av2_build_intra_predictors_high(DB_ONLY(const AV2_COMMON *cm) const MACROBLOCKD *xd, const uint16_t *ref, int ref_stride, uint16_t *dst, int dst_stride, PREDICTION_MODE mode, int p_angle, int angle_delta, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1092,6 +1094,15 @@ void av2_build_intra_predictors_high( uint16_t *const above_row_2nd = above_data_2nd + 32; uint16_t *const left_col_2nd = left_data_2nd + 32; +#if DEBUG_BLOCK_INFO + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int hbd = !!(xd->bd - 8); +#endif +#if DEBUG_B_PIXELS + const char *pred_names[] = { "y-intra-pred", "u-intra-pred", "v-intra-pred" }; +#endif + uint16_t *const second_pred = second_pred_data + 16; uint16_t *dst_mrl_line_0 = mrl_line_0_data + 16; const int txwpx = tx_size_wide[tx_size]; @@ -1156,20 +1167,38 @@ void av2_build_intra_predictors_high( val = (n_left_px > 0) ? left_ref_1st[0] : base - 1; } for (i = 0; i < txhpx; ++i) { - avm_memset16(dst, val, txwpx); - dst += dst_stride; + avm_memset16(&dst[i * dst_stride], val, txwpx); + } +#if DEBUG_B_PIXELS + if (BLOCK_TO_DEBUG) { + if (0 || plane == AVM_PLANE_Y) { + if (need_left) { + printf("l\n"); + for (int x = 0; x < txhpx; x++) + printf(" %0*x", 2 + hbd, val); + printf("\n"); + } else { + printf("t\n"); + for (int x = 0; x < txwpx; x++) + printf(" %0*x", 2 + hbd, val); + printf("\n"); + } + hex_dump(dst, dst_stride, txwpx, txhpx, pred_names[plane], hbd); + } } +#endif return; } // NEED_LEFT + int num_left_pixels_needed = 0; if (need_left) { int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_intra_dip) need_bottom = 1; if (is_dr_mode) need_bottom = apply_ibp ? 
(p_angle < 90) || (p_angle > 180) : p_angle > 180; - int num_left_pixels_needed = + num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3) + (mrl_index << 1); if (use_intra_dip) { // DIP mode requires left edge + 1/4 tx height for overhang feature. @@ -1201,12 +1230,13 @@ void av2_build_intra_predictors_high( } // NEED_ABOVE + int num_top_pixels_needed = 0; if (need_above) { int need_right = extend_modes[mode] & NEED_ABOVERIGHT; if (use_intra_dip) need_right = 1; if (is_dr_mode) need_right = apply_ibp ? (p_angle < 90) || (p_angle > 180) : p_angle < 90; - int num_top_pixels_needed = + num_top_pixels_needed = txwpx + (need_right ? txhpx : 0) + (mrl_index << 1); if (use_intra_dip) { // DIP mode requires above line + 1/4 tx width for overhang feature. @@ -1261,6 +1291,123 @@ void av2_build_intra_predictors_high( } } +#if DEBUG_BLOCK_INFO + if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS && plane == AVM_PLANE_Y) { + int print_t = 0, print_l = 0, print_tl = 0; + int print_t2 = 0, print_l2 = 0, print_tl2 = 0; + switch (mode) { + case DC_PRED: + if (use_intra_dip) { + print_t = txwpx + (txwpx >> 2); + print_l = txhpx + (txhpx >> 2); + print_tl = 1; + } else { + print_t = n_top_px ? txwpx : 0; + print_l = n_left_px ? txhpx : 0; + } + break; + case PAETH_PRED: + print_t = n_top_px ? txwpx : 0; + print_l = n_left_px ? 
txhpx : 0; + print_tl = n_top_px && n_left_px; + break; + case SMOOTH_PRED: + print_t = txwpx + 1; + print_l = txhpx + 1; + break; + case SMOOTH_H_PRED: + print_t = -1; // top-right only + print_l = txhpx; + break; + case SMOOTH_V_PRED: + print_t = txwpx; + print_l = -1; // bottom-left only + break; + default: + assert(is_dr_mode); + const int tl_filter = !disable_edge_filter && !mrl_index && + p_angle != 90 && p_angle != 180 && n_left_px > 0 && n_top_px > 0 && + txwpx + txhpx >= 24; + const int ibp = apply_ibp && !mrl_index && !(angle_delta & 1); + if (p_angle == 90 || (p_angle < 90 && !ibp && !n_top_px)) { // v + print_t = txwpx; + } else if (p_angle < 90) { // z1 + print_tl = 1; + print_t = txwpx + txhpx; + if (ibp) + print_l = txwpx + txhpx; + else if (tl_filter) + print_l = 1; + } else if (p_angle < 180) { // z2 + print_tl = 1; + print_t = txwpx; + print_l = txhpx; + } else if (p_angle == 180 || (p_angle > 180 && !ibp && !n_left_px)) { // h + print_l = txhpx; + } else if (p_angle > 180) { // z3 + print_tl = 1; + print_l = txwpx + txhpx; + if (ibp) + print_t = txwpx + txhpx; + else if (tl_filter) + print_t = 1; + } + if (xd->mi[0]->multi_line_mrl && tx_size != TX_4X4) { + print_t2 = print_t; + print_tl2 = print_tl; + print_l2 = print_l; + } + if (mrl_index && print_tl && (p_angle < 90 || p_angle > 180) /* mode is z1/3 */) { + if (print_t) print_t += mrl_index * 2; + if (print_l) print_l += mrl_index * 2; + } + break; + } + const char *pix_fmt = hbd ? 
" %03x" : " %02x"; + if (0 || plane == AVM_PLANE_Y) { + if (print_l > 0) { + printf("l\n"); + for (int x = print_l - 1; x >= 0; x--) + printf(pix_fmt, left_col_1st[x]); + printf("\n"); + if (print_l2) { + printf("l2\n"); + for (int x = print_l2 - 1; x >= 0; x--) + printf(pix_fmt, left_col_2nd[x]); + printf("\n"); + } + } else if (print_l < 0) { + printf("bl\n"); + printf(pix_fmt, left_col_1st[txhpx]); + printf("\n"); + } + if (print_t > 0) { + hex_dump(above_row_1st, 0, print_t, 1, "t", hbd); + if (print_t2) + hex_dump(above_row_2nd, 0, print_t2, 1, "t2", hbd); + } else if (print_t < 0) { + hex_dump(&above_row_1st[txwpx], 0, 1, 1, "tr", hbd); + } + if (print_tl) { + if (mrl_index) { + printf("tl\n"); + int ii; + for (ii = -mrl_index; ii < 0; ii++) + printf(pix_fmt, left_col_1st[-(mrl_index + 1) - ii]); + for (; ii <= mrl_index; ii++) + printf(pix_fmt, above_row_1st[ii - (mrl_index + 1)]); + printf("\n"); + } else { + hex_dump(&above_row_1st[-1], 0, mrl_index + 1, 1, "tl", hbd); + } + if (print_tl2) { + hex_dump(&above_row_2nd[-1], 0, 1, 1, "tl2", hbd); + } + } + } + } +#endif // DEBUG_B_PIXELS + if (use_intra_dip) { av2_highbd_intra_dip_predictor(mbmi->intra_dip_mode, dst, dst_stride, above_row_1st, left_col_1st, tx_size, xd->bd @@ -1269,6 +1416,13 @@ void av2_build_intra_predictors_high( mbmi->intra_dip_features #endif // CONFIG_DIP_EXT_PRUNING ); +#if DEBUG_B_PIXELS + if (BLOCK_TO_DEBUG) { + if (0 || plane == AVM_PLANE_Y) { + hex_dump(dst, dst_stride, txwpx, txhpx, pred_names[plane], hbd); + } + } +#endif return; } @@ -1388,6 +1542,14 @@ void av2_build_intra_predictors_high( } } +#if DEBUG_B_PIXELS + if (BLOCK_TO_DEBUG) { + if (0 || plane == AVM_PLANE_Y) { + hex_dump(dst, dst_stride, txwpx, txhpx, pred_names[plane], hbd); + } + } +#endif + return; } // predict @@ -1409,6 +1571,13 @@ void av2_build_intra_predictors_high( pred_high[mode][tx_size](dst, dst_stride, above_row_1st, left_col_1st, xd->bd); } +#if DEBUG_B_PIXELS + if (BLOCK_TO_DEBUG) { + if (0 || plane == 
AVM_PLANE_Y) { + hex_dump(dst, dst_stride, txwpx, txhpx, pred_names[plane], hbd); + } + } +#endif } // This function avoided the below operations in the original @@ -1707,6 +1876,12 @@ void av2_predict_intra_block(const AV2_COMMON *cm, const MACROBLOCKD *xd, dst[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; } } +#if DEBUG_B_PIXELS + const int mi_row = xd->mi_row, mi_col = xd->mi_col, hbd = !!(xd->bd - 8); + if (BLOCK_TO_DEBUG) { + hex_dump(dst, dst_stride, txwpx, txhpx, "y-pal-pred", hbd); + } +#endif return; } @@ -1803,10 +1978,10 @@ void av2_predict_intra_block(const AV2_COMMON *cm, const MACROBLOCKD *xd, const int is_sb_boundary = (mi_row % cm->mib_size == 0 && row_off == 0) ? 1 : 0; - if (mrl_index) { + if (mrl_index || DEBUG_BLOCK_INFO) { const int n_topright_px = have_top_right ? px_top_right : 0; const int n_bottomleft_px = have_bottom_left ? px_bottom_left : 0; - av2_build_intra_predictors_high( + av2_build_intra_predictors_high(DB_ONLY(cm) xd, ref, ref_stride, dst, dst_stride, mode, p_angle, angle_delta, tx_size, disable_edge_filter, have_top ? AVMMIN(txwpx, xr + txwpx) : 0, n_topright_px, have_left ? 
AVMMIN(txhpx, yd + txhpx) : 0, diff --git a/av2/common/reconintra.h b/av2/common/reconintra.h index 4c03af11df..8593e253e0 100644 --- a/av2/common/reconintra.h +++ b/av2/common/reconintra.h @@ -16,6 +16,7 @@ #include #include +#include "avm/debug.h" #include "avm/avm_integer.h" #include "av2/common/av2_common_int.h" #include "av2/common/blockd.h" @@ -289,7 +290,7 @@ DECLARE_ALIGNED(16, static const int16_t, av2_dr_interp_filter[32][4]) = { }; // Build Intra Predictors for mrl_index > 0 or = 0 -void av2_build_intra_predictors_high( +void av2_build_intra_predictors_high(DB_ONLY(const AV2_COMMON *cm) const MACROBLOCKD *xd, const uint16_t *ref, int ref_stride, uint16_t *dst, int dst_stride, PREDICTION_MODE mode, int p_angle, int angle_delta, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, diff --git a/av2/common/tip.c b/av2/common/tip.c index caaaf46d9f..8a062d7909 100644 --- a/av2/common/tip.c +++ b/av2/common/tip.c @@ -15,6 +15,8 @@ #include "config/avm_dsp_rtcd.h" #include "av2/common/reconinter.h" +#include "avm/debug.h" + // Percentage threshold of number of blocks with available motion // projection in a frame to allow TIP mode #define TIP_ENABLE_COUNT_THRESHOLD 60 @@ -959,4 +961,17 @@ void av2_setup_tip_frame(AV2_COMMON *cm, MACROBLOCKD *xd, uint16_t **mc_buf, tip_setup_tip_frame_planes(cm, xd, 0, 0, mvs_rows, mvs_cols, mvs_cols, mc_buf, tmp_conv_dst, calc_subpel_params_func, copy_refined_mvs); +#if DEBUG_B_PIXELS + const int sbsz = cm->seq_params.mib_size * 4; + const YV12_BUFFER_CONFIG *src = &cm->tip_ref.tip_frame->buf; + const ptrdiff_t stride = src->strides[0]; + const int h = mvs_rows * 8, w = mvs_cols * 8; + for (int y = 0; y < h; y += sbsz) + for (int x = 0; x < w; x += sbsz) + for (int yy = y; yy < AVMMIN(h, y + sbsz); yy += 64) + for (int xx = x; xx < AVMMIN(w, x + sbsz); xx += 64) + hex_dump(&src->buffers[0][yy * stride + xx], stride, + AVMMIN(w - xx, 64), AVMMIN(h - yy, 64), + "y-pred", !!(xd->bd - 8)); +#endif } diff --git 
a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index 6f586538d3..a143d28d75 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -209,11 +209,25 @@ static AVM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb, scan_line = AVMMAX(eob_data_c1->max_scan_line, eob_data_c2->max_scan_line); eob = AVMMAX(eob_data_c1->eob, eob_data_c2->eob); } - av2_inverse_transform_block( +#if DEBUG_BLOCK_INFO + const int mi_row = dcb->xd.mi_row; + const int mi_col = dcb->xd.mi_col; + const int print = BLOCK_TO_DEBUG && DEBUG_B_PIXELS && eob > 0; + if (print && (0 || plane == AVM_PLANE_Y) && !(tx_type >> 4)) { + coef_dump_transpose(dqcoeff, AVMMIN(tx_size_wide[tx_size], 32), + AVMMIN(tx_size_high[tx_size], 32), 3, "dq"); + } +#endif + av2_inverse_transform_block(DB_ONLY(print && (0 || plane == AVM_PLANE_Y)) &dcb->xd, dqcoeff, plane, tx_type, tx_size, dst, stride, eob, replace_adst_by_ddt(cm->seq_params.enable_inter_ddt, cm->features.allow_screen_content_tools, &dcb->xd), reduced_tx_set); +#if DEBUG_BLOCK_INFO + if (print && plane == AVM_PLANE_Y) { + hex_dump(dst, stride, tx_size_wide[tx_size], tx_size_high[tx_size], "recon", !!(dcb->xd.bd - 8)); + } +#endif const int width = tx_size_wide[tx_size] <= 32 ? tx_size_wide[tx_size] : 32; const int height = tx_size_high[tx_size] <= 32 ? tx_size_high[tx_size] : 32; const int sbSize = (width >= 8 && height >= 8) ? 
8 : 4; @@ -366,6 +380,15 @@ static AVM_INLINE void predict_and_reconstruct_intra_block(DB_ONLY(const int dep pd->dst.stride, reduced_tx_set_used); } } +#if DEBUG_BLOCK_INFO && DEBUG_B_PIXELS + const int mi_row = dcb->xd.mi_row; + const int mi_col = dcb->xd.mi_col; + if (0 && BLOCK_TO_DEBUG && plane != AVM_PLANE_Y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint16_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + hex_dump(dst, pd->dst.stride, tx_size_wide[tx_size], tx_size_high[tx_size], "recon", !!(dcb->xd.bd - 8)); + } +#endif #if CONFIG_MISMATCH_DEBUG { @@ -938,6 +961,22 @@ static AVM_INLINE void predict_inter_block(AV2_COMMON *const cm, } } #endif // CONFIG_INSPECTION + +#if DEBUG_B_PIXELS + if (BLOCK_TO_DEBUG) { + struct macroblockd_plane *const pd = &xd->plane[0]; + const int plane_block_size = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int w = AVMMIN(cm->mi_params.mi_cols - mi_col, + mi_size_wide[plane_block_size]) * MI_SIZE; + const int h = AVMMIN(cm->mi_params.mi_rows - mi_row, + mi_size_high[plane_block_size]) * MI_SIZE; + for (int y = 0; y < h; y += 64) + for (int x = 0; x < w; x += 64) + hex_dump(&pd->dst.buf[y * pd->dst.stride + x], pd->dst.stride, + AVMMIN(w - x, 64), AVMMIN(h - y, 64), "y-pred", !!(xd->bd - 8)); + } +#endif } static AVM_INLINE void copy_frame_mvs_inter_block(AV2_COMMON *const cm, diff --git a/av2/encoder/encodemb.c b/av2/encoder/encodemb.c index 4e81ebf1d4..7535d9d7fa 100644 --- a/av2/encoder/encodemb.c +++ b/av2/encoder/encodemb.c @@ -1029,7 +1029,7 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, if (recon_with_cctx) { av2_inv_cross_chroma_tx_block(dqcoeff_c1, dqcoeff, tx_size, cctx_type, xd->bd); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff_c1, AVM_PLANE_U, tx_type, tx_size, dst_c1, pd_c1->dst.stride, max_chroma_eob, replace_adst_by_ddt(cm->seq_params.enable_inter_ddt, @@ -1040,7 +1040,7 @@ 
static void encode_block(int plane, int block, int blk_row, int blk_col, if (p->eobs[block] || recon_with_cctx) { *(args->skip) = 0; - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff, plane, tx_type, tx_size, dst, pd->dst.stride, (plane == 0 || !is_cctx_allowed(cm, xd) || !recon_with_cctx) ? p->eobs[block] @@ -1530,7 +1530,7 @@ void av2_encode_block_intra(int plane, int block, int blk_row, int blk_col, } if (*eob) { - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff, plane, tx_type, tx_size, dst, dst_stride, *eob, replace_adst_by_ddt(cm->seq_params.enable_inter_ddt, cm->features.allow_screen_content_tools, xd), @@ -1847,13 +1847,13 @@ void av2_encode_block_intra_joint_uv(int block, int blk_row, int blk_col, if (*eob_c1 || *eob_c2) { av2_inv_cross_chroma_tx_block(dqcoeff_c1, dqcoeff_c2, tx_size, cctx_type, xd->bd); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff_c1, AVM_PLANE_U, tx_type, tx_size, dst_c1, dst_stride, AVMMAX(*eob_c1, *eob_c2), replace_adst_by_ddt(cm->seq_params.enable_inter_ddt, cm->features.allow_screen_content_tools, xd), cm->features.reduced_tx_set_used); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff_c2, AVM_PLANE_V, tx_type, tx_size, dst_c2, dst_stride, AVMMAX(*eob_c1, *eob_c2), replace_adst_by_ddt(cm->seq_params.enable_inter_ddt, diff --git a/av2/encoder/tpl_model.c b/av2/encoder/tpl_model.c index ab118c5e01..e3c60f9950 100644 --- a/av2/encoder/tpl_model.c +++ b/av2/encoder/tpl_model.c @@ -187,7 +187,7 @@ static AVM_INLINE void txfm_quant_rdcost( *rate_cost = rate_estimator(qcoeff, eob, tx_size); - av2_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, dst_stride, + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff, 0, DCT_DCT, tx_size, dst, dst_stride, eob, 0, 0); } diff --git a/av2/encoder/tx_search.c b/av2/encoder/tx_search.c index b6f5ed3ebb..9ff8b3b67b 100644 --- a/av2/encoder/tx_search.c 
+++ b/av2/encoder/tx_search.c @@ -884,7 +884,7 @@ static AVM_INLINE void inverse_transform_block_facade( const int dst_stride = pd->dst.stride; uint16_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; - av2_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff, plane, tx_type, tx_size, dst, dst_stride, eob, use_ddt, reduced_tx_set); } @@ -1057,7 +1057,7 @@ static INLINE int64_t dist_block_px_domain(const AV2_COMP *cpi, MACROBLOCK *x, TX_TYPE tx_type = av2_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, is_reduced_tx_set_used(&cpi->common, plane_type)); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, dqcoeff, plane, tx_type, tx_size, recon, MAX_TX_SIZE, eob, replace_adst_by_ddt(cpi->common.seq_params.enable_inter_ddt, cpi->common.features.allow_screen_content_tools, xd), @@ -1122,13 +1122,13 @@ static INLINE int64_t joint_uv_dist_block_px_domain(const AV2_COMP *cpi, is_reduced_tx_set_used(&cpi->common, PLANE_TYPE_UV)); av2_inv_cross_chroma_tx_block(tmp_dqcoeff_c1, tmp_dqcoeff_c2, tx_size, cctx_type, xd->bd); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, tmp_dqcoeff_c1, AVM_PLANE_U, tx_type, tx_size, recon_c1, MAX_TX_SIZE, max_chroma_eob, replace_adst_by_ddt(cpi->common.seq_params.enable_inter_ddt, cpi->common.features.allow_screen_content_tools, xd), cpi->common.features.reduced_tx_set_used); - av2_inverse_transform_block( + av2_inverse_transform_block(DB_ONLY(0) xd, tmp_dqcoeff_c2, AVM_PLANE_V, tx_type, tx_size, recon_c2, MAX_TX_SIZE, max_chroma_eob, replace_adst_by_ddt(cpi->common.seq_params.enable_inter_ddt, diff --git a/avm/debug.h b/avm/debug.h index ae277ab7a6..99ef4c1f95 100644 --- a/avm/debug.h +++ b/avm/debug.h @@ -28,10 +28,15 @@ #ifndef AVM_DEBUG_H #define AVM_DEBUG_H +#include +#include +#include + #define DEBUG_OBU_HDR 0 #define DEBUG_SEQ_HDR 0 #define DEBUG_FRAME_HDR 0 #define DEBUG_BLOCK_INFO 0 
+#define DEBUG_B_PIXELS 0 #if DEBUG_BLOCK_INFO #define DB_ONLY(x...) x, #define BLOCK_TO_DEBUG \ @@ -41,6 +46,50 @@ if (BLOCK_TO_DEBUG) printf(fmt) #define DEBUG_CF_printf(fmt...) \ if (BLOCK_TO_DEBUG && plane > -1 && 1) printf(fmt) +#define PIX_HEX_FMT "%02x" +#define PXSTRIDE(x) (x) + +static inline void hex_fdump(FILE *out, const uint16_t *buf, ptrdiff_t stride, + int w, int h, const char *what, const int hbd) +{ + fprintf(out, "%s\n", what); + const char *pix_fmt = hbd ? " %03x" : " %02x"; + while (h--) { + int x; + for (x = 0; x < w; x++) + fprintf(out, pix_fmt, buf[x]); + buf += PXSTRIDE(stride); + fprintf(out, "\n"); + } +} + +static inline void hex_dump(const uint16_t *buf, ptrdiff_t stride, + int w, int h, const char *what, const int hbd) +{ + hex_fdump(stdout, buf, stride, w, h, what, hbd); +} + +static inline void coef_dump_transpose(const int32_t *buf, const int w, const int h, + const int len, const char *what) +{ + printf("%s\n", what); + for (int y = 0; y < w; y++) { + for (int x = 0; x < h; x++) + printf(" %*d", len, buf[x * w + y]); + printf("\n"); + } +} + +static inline void coef_dump(const int32_t *buf, const int w, const int h, + const int len, const char *what) +{ + printf("%s\n", what); + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) + printf(" %*d", len, buf[y * w + x]); + printf("\n"); + } +} #else #define DB_ONLY(x...) 
#define BLOCK_TO_DEBUG 0 -- 2.48.1 0005-Add-loopfilters-to-enable-disable-loopfilters.patch000066400000000000000000000310511517466257200336540ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/patchesFrom 503ba7d986412fc962bb1f76ec450d2299dbb828 Mon Sep 17 00:00:00 2001 From: Luc Trudeau Date: Fri, 28 Nov 2025 09:52:45 -0500 Subject: [PATCH 5/5] Add --loopfilters to enable/disable loopfilters --- apps/avmdec.c | 35 +++++++++++++++++++++++++++++++++++ av2/av2_dx_iface.c | 16 ++++++++++++++++ av2/common/reconinter.c | 10 ++++++++++ av2/decoder/decodeframe.c | 27 ++++++++++++++++++--------- av2/decoder/decoder.h | 2 ++ avm/avm_codec.h | 10 ++++++++++ avm/avmdx.h | 11 +++++++++++ 7 files changed, 102 insertions(+), 9 deletions(-) diff --git a/apps/avmdec.c b/apps/avmdec.c index 5c867d04ad..184ce63121 100644 --- a/apps/avmdec.c +++ b/apps/avmdec.c @@ -62,6 +62,28 @@ struct AvxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const struct arg_enum_list inloop_filters_tbl[] = { + { "none", INLOOPFILTER_NONE }, + { "deblock", INLOOPFILTER_DEBLOCK }, + { "nodeblock", INLOOPFILTER_ALL - INLOOPFILTER_DEBLOCK }, + { "cdef", INLOOPFILTER_CDEF }, + { "deblock+cdef", INLOOPFILTER_DEBLOCK | INLOOPFILTER_CDEF }, + { "nocdef", INLOOPFILTER_ALL - INLOOPFILTER_CDEF }, + { "ccso", INLOOPFILTER_CCSO }, + { "cdef+ccso", INLOOPFILTER_CDEF | INLOOPFILTER_CCSO }, + { "deblock+cdef+ccso", INLOOPFILTER_DEBLOCK | INLOOPFILTER_CDEF | + INLOOPFILTER_CCSO }, + { "noccso", INLOOPFILTER_ALL - INLOOPFILTER_CCSO }, + { "wiener", INLOOPFILTER_WIENER }, + { "nowiener", INLOOPFILTER_ALL - INLOOPFILTER_WIENER }, + { "deblock+cdef+ccso+wiener", INLOOPFILTER_DEBLOCK | INLOOPFILTER_CDEF | + INLOOPFILTER_CCSO | INLOOPFILTER_WIENER }, + { "gdf", INLOOPFILTER_GDF }, + { "nogdf", INLOOPFILTER_ALL - INLOOPFILTER_GDF }, + { "all", INLOOPFILTER_ALL }, + {NULL, 0} +}; + static const arg_def_t help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const 
arg_def_t looparg = @@ -129,6 +151,8 @@ static const arg_def_t outallarg = ARG_DEF( NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream"); static const arg_def_t skipfilmgrain = ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application"); +static const arg_def_t inloopfiltersarg = + ARG_DEF_ENUM(NULL, "inloopfilters", 1, "Which in-loop filters to enable", inloop_filters_tbl); static const arg_def_t randomaccess = ARG_DEF(NULL, "random-access-point-index", 1, "Start decoding at the N-th random access point"); @@ -166,6 +190,7 @@ static const arg_def_t *all_args[] = { &help, &selectlocalopsarg, &outallarg, &skipfilmgrain, + &inloopfiltersarg, &randomaccess, &bruoptmodearg, &icc_file, @@ -679,6 +704,7 @@ static int main_loop(int argc, const char **argv_) { int num_local_ops_selections = 0; int output_all_layers = 0; int skip_film_grain = 0; + enum InloopFilterType inloop_filters = INLOOPFILTER_ALL; int random_access_point_index = -1; int bru_opt_mode = 0; avm_image_t *scaled_img = NULL; @@ -870,6 +896,8 @@ static int main_loop(int argc, const char **argv_) { output_all_layers = 1; } else if (arg_match(&arg, &skipfilmgrain, argi)) { skip_film_grain = 1; + } else if (arg_match(&arg, &inloopfiltersarg, argi)) { + inloop_filters = arg_parse_enum(&arg); } else if (arg_match(&arg, &randomaccess, argi)) { random_access_point_index = arg_parse_uint(&arg); } else if (arg_match(&arg, &bruoptmodearg, argi)) { @@ -1041,6 +1069,13 @@ static int main_loop(int argc, const char **argv_) { goto fail; } + if (AVM_CODEC_CONTROL_TYPECHECKED(&decoder, AV2D_SET_INLOOP_FILTERS, + inloop_filters)) { + fprintf(stderr, "Failed to set inloop_filters: %s\n", + avm_codec_error(&decoder)); + goto fail; + } + if (AVM_CODEC_CONTROL_TYPECHECKED(&decoder, AV2D_SET_RANDOM_ACCESS, random_access_point_index)) { fprintf(stderr, "Failed to set random_access_point_index: %s\n", diff --git a/av2/av2_dx_iface.c b/av2/av2_dx_iface.c index a7aec49a49..95b2b02959 100644 --- 
a/av2/av2_dx_iface.c +++ b/av2/av2_dx_iface.c @@ -51,6 +51,7 @@ struct avm_codec_alg_priv { int byte_alignment; int skip_loop_filter; int skip_film_grain; + int inloop_filters; int64_t random_access_point_index; int bru_opt_mode; unsigned int row_mt; @@ -370,6 +371,7 @@ static void init_buffer_callbacks(avm_codec_alg_priv_t *ctx) { cm->features.byte_alignment = ctx->byte_alignment; pbi->skip_loop_filter = ctx->skip_loop_filter; pbi->skip_film_grain = ctx->skip_film_grain; + pbi->inloop_filters = ctx->inloop_filters; pbi->random_access_point_index = ctx->random_access_point_index; pbi->bru_opt_mode = ctx->bru_opt_mode; @@ -1712,6 +1714,19 @@ static avm_codec_err_t ctrl_set_skip_film_grain(avm_codec_alg_priv_t *ctx, return AVM_CODEC_OK; } +static avm_codec_err_t ctrl_set_inloop_filters(avm_codec_alg_priv_t *ctx, + va_list args) { + ctx->inloop_filters = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->inloop_filters = ctx->inloop_filters; + } + + return AVM_CODEC_OK; +} + static avm_codec_err_t ctrl_set_random_access(avm_codec_alg_priv_t *ctx, va_list args) { ctx->random_access_point_index = va_arg(args, int); @@ -1838,6 +1853,7 @@ static avm_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV2_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, { AV2D_SET_ROW_MT, ctrl_set_row_mt }, { AV2D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, + { AV2D_SET_INLOOP_FILTERS, ctrl_set_inloop_filters }, { AV2D_SET_RANDOM_ACCESS, ctrl_set_random_access }, { AV2D_SET_BRU_OPT_MODE, ctrl_set_bru_opt_mode }, { AV2D_ENABLE_SUBGOP_STATS, ctrl_enable_subgop_stats }, diff --git a/av2/common/reconinter.c b/av2/common/reconinter.c index 12207fb329..66cb54b4d3 100644 --- a/av2/common/reconinter.c +++ b/av2/common/reconinter.c @@ -1969,9 +1969,19 @@ void av2_build_one_bawp_inter_predictor( (mi_y_p + y_off_p - BAWP_REF_LINES) < 0 || ref_w <= 
0 || ref_h <= 0 || (mi_x_p + ref_w + x_off_p) > width_p || (mi_y_p + ref_h + y_off_p) > height_p) { +#if 0 avm_internal_error( (struct avm_internal_error_info *)&cm->error, AVM_CODEC_ERROR, "Inter BAWP template cannot outside the valid reference range"); +#else + // When disabling postfilters, the reconstruction changes and motion vectors + // (b/c refinemc/opfl) will change, causing some sequences to cause the + // above error to be thrown. To increase the number of CTC samples we can + // test, we skip BAWP (instead of aborting decoding) when this happens. + // This is not normative but is allowed as error resilience behaviour. + mbmi->bawp_alpha[plane][ref] = 256; + mbmi->bawp_beta[plane][ref] = 0; +#endif return; } else { uint16_t *recon_buf = xd->plane[plane].dst.buf; diff --git a/av2/decoder/decodeframe.c b/av2/decoder/decodeframe.c index a143d28d75..3596b55719 100644 --- a/av2/decoder/decodeframe.c +++ b/av2/decoder/decodeframe.c @@ -721,7 +721,8 @@ static void av2_dec_setup_tip_frame(AV2Decoder *pbi, AV2_COMMON *cm, cm->cur_frame->v_ac_delta_q = cm->quant_params.v_ac_delta_q = avg_v_ac_delta_q; } - if (cm->seq_params.enable_lf_sub_pu && cm->features.allow_lf_sub_pu) { + if (cm->seq_params.enable_lf_sub_pu && cm->features.allow_lf_sub_pu && + pbi->inloop_filters & INLOOPFILTER_DEBLOCK) { init_tip_lf_parameter(cm, 0, av2_num_planes(cm)); loop_filter_tip_frame(cm, xd, 0, av2_num_planes(cm)); } @@ -10421,9 +10422,16 @@ void av2_decode_tg_tiles_and_wrapup(AV2Decoder *pbi, const uint8_t *data, } } + const int filters = pbi->inloop_filters; + const int enable_deblock = filters & INLOOPFILTER_DEBLOCK; + const int enable_cdef = filters & INLOOPFILTER_CDEF; + const int enable_restoration = filters & INLOOPFILTER_WIENER; + const int enable_ccso = filters & INLOOPFILTER_CCSO; + const int enable_gdf = filters & INLOOPFILTER_GDF; + if (!cm->bru.frame_inactive_flag) { - if (cm->lf.apply_deblocking_filter[0] || - cm->lf.apply_deblocking_filter[1]) { + if (enable_deblock && 
+ (cm->lf.apply_deblocking_filter[0]|| cm->lf.apply_deblocking_filter[1])) { if (pbi->num_workers > 1) { av2_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0, num_planes, 0, pbi->tile_workers, @@ -10434,7 +10442,7 @@ void av2_decode_tg_tiles_and_wrapup(AV2Decoder *pbi, const uint8_t *data, } } - const int use_ccso = + const int use_ccso = enable_ccso && !pbi->skip_loop_filter && !cm->features.coded_lossless && (cm->ccso_info.ccso_enable[0] || cm->ccso_info.ccso_enable[1] || cm->ccso_info.ccso_enable[2]); @@ -10457,16 +10465,17 @@ void av2_decode_tg_tiles_and_wrapup(AV2Decoder *pbi, const uint8_t *data, extend_ccso_border(&cm->cur_frame->buf, ext_rec_y, CCSO_PADDING_SIZE); } - const int do_loop_restoration = - cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + const int do_loop_restoration = enable_restoration && + (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || (cm->rst_info[1].frame_restoration_type != RESTORE_NONE && !cm->seq_params.monochrome) || (cm->rst_info[2].frame_restoration_type != RESTORE_NONE && - !cm->seq_params.monochrome); - const int do_cdef = !pbi->skip_loop_filter && + !cm->seq_params.monochrome)); + const int do_cdef = enable_cdef && + !pbi->skip_loop_filter && !cm->features.coded_lossless && cm->cdef_info.cdef_frame_enable; - const int do_gdf = is_gdf_enabled(cm); + const int do_gdf = enable_gdf && is_gdf_enabled(cm); const int optimized_loop_restoration = !cm->seq_params.disable_loopfilters_across_tiles && !do_gdf && !use_ccso && !do_cdef; diff --git a/av2/decoder/decoder.h b/av2/decoder/decoder.h index 1100c3b161..e784ed5248 100644 --- a/av2/decoder/decoder.h +++ b/av2/decoder/decoder.h @@ -385,6 +385,8 @@ typedef struct AV2Decoder { // So we track whether this is the first frame or not. int decoding_first_frame; + int inloop_filters; + int max_threads; int inv_tile_order; int need_resync; // wait for key/intra-only frame. 
diff --git a/avm/avm_codec.h b/avm/avm_codec.h index dfae4e9e1b..93a884e02b 100644 --- a/avm/avm_codec.h +++ b/avm/avm_codec.h @@ -602,6 +602,16 @@ typedef enum { NUM_OBU_METADATA_TYPES, } OBU_METADATA_TYPE; +enum InloopFilterType { + INLOOPFILTER_NONE = 0, + INLOOPFILTER_DEBLOCK = 1 << 0, + INLOOPFILTER_CDEF = 1 << 1, + INLOOPFILTER_WIENER = 1 << 2, + INLOOPFILTER_CCSO = 1 << 3, + INLOOPFILTER_GDF = 1 << 4, + INLOOPFILTER_ALL = -1, +}; + /*!\brief Returns string representation of OBU_TYPE. * * \param[in] type The OBU_TYPE to convert to string. diff --git a/avm/avmdx.h b/avm/avmdx.h index e83a34305f..7050565134 100644 --- a/avm/avmdx.h +++ b/avm/avmdx.h @@ -338,6 +338,14 @@ enum avm_dec_control_id { */ AV2D_SET_SKIP_FILM_GRAIN, + /*!\brief Codec control function to set the inloop filters, int + * parameter + * + * Valid values are integers. The decoder will skip inloop filters when its + * value is set to nonzero. The default value is INLOOPFILTER_ALL. + */ + AV2D_SET_INLOOP_FILTERS, + AV2D_SET_RANDOM_ACCESS, AV2D_SET_BRU_OPT_MODE, @@ -492,6 +500,9 @@ AVM_CTRL_USE_TYPE(AV2D_SET_ROW_MT, unsigned int) AVM_CTRL_USE_TYPE(AV2D_SET_SKIP_FILM_GRAIN, int) #define AVM_CTRL_AV2D_SET_SKIP_FILM_GRAIN +AVM_CTRL_USE_TYPE(AV2D_SET_INLOOP_FILTERS, int) +#define AVM_CTRL_AV2D_SET_DUMP_INTRA_FILE + AVM_CTRL_USE_TYPE(AV2D_SET_RANDOM_ACCESS, int) #define AVM_CTRL_AV2D_SET_RANDOM_ACCESS -- 2.48.1 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/000077500000000000000000000000001517466257200206335ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/000077500000000000000000000000001517466257200214125ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/000077500000000000000000000000001517466257200216365ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/cdef.S000066400000000000000000000453041517466257200226710ustar00rootroot00000000000000/* * Copyright © 2018, 
VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" // n1 = s0/d0 // w1 = d0/q0 // n2 = s4/d2 // w2 = d2/q1 .macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldrh r12, [\s1, #-2] vldr \n1, [\s1] vdup.16 d4, r12 ldrh r12, [\s1, #\w] vmov.16 d4[1], r12 ldrh r12, [\s2, #-2] vldr \n2, [\s2] vmov.16 d4[2], r12 ldrh r12, [\s2, #\w] vmovl.u8 q0, d0 vmov.16 d4[3], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s8, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s9, [r0, #2*\w] add r0, r0, #2*\stride vstr s10, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldrh r12, [\s1, #-2] vldr \n1, [\s1] vdup.16 d4, r12 ldrh r12, [\s2, #-2] vldr \n2, [\s2] vmovl.u8 q0, d0 vmov.16 d4[1], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s8, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s9, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr \n1, [\s1] ldrh r12, [\s1, #\w] vldr \n2, [\s2] vdup.16 d4, r12 ldrh r12, [\s2, #\w] vmovl.u8 q0, d0 vmov.16 d4[1], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s8, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vldr \n1, [\s1] vldr \n2, [\s2] vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, 
#2*\stride .endif 3: .endm .macro load_n_incr dst, src, incr, w .if \w == 4 vld1.32 {\dst\()[0]}, [\src, :32], \incr .else vld1.8 {\dst\()}, [\src, :64], \incr .endif .endm // void dav2d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // n1 = s0/d0 // w1 = d0/q0 // n2 = s4/d2 // w2 = d2/q1 .macro padding_func w, stride, n1, w1, n2, w2, align function cdef_padding\w\()_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] cmp r7, #0xf // fully edged beq cdef_padding\w\()_edged_8bpc_neon vmov.i16 q3, #0x8000 tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif b 3f 1: // CDEF_HAVE_TOP add r8, r4, r2 sub r0, r0, #2*(2*\stride) pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0 // Middle section 3: tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.16 {d2[]}, [r3, :16]! ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmov.16 d2[1], r12 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s4, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s5, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {d2[]}, [r3, :16]! 
load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s4, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b b 3f 2: tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w vdup.16 d2, r12 subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s4, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmovl.u8 q0, d0 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b 3: tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM add r8, r5, r2 pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 endfunc .endm padding_func 8, 16, d0, q0, d2, q1, 128 padding_func 4, 8, s0, d0, s4, d2, 64 // void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg, align function cdef_padding\w\()_edged_8bpc_neon sub r0, r0, #(2*\stride) ldrh r12, [r4, #-2] vldr \reg, [r4] add r8, r4, r2 strh r12, [r0, #-2] ldrh r12, [r4, #\w] vstr \reg, [r0] strh r12, [r0, #\w] ldrh r12, [r8, #-2] vldr \reg, [r8] strh r12, [r0, #\stride-2] ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] add r0, r0, #2*\stride 0: ldrh r12, [r3], #2 vldr \reg, [r1] str r12, [r0, #-2] ldrh r12, [r1, #\w] add r1, r1, r2 subs r6, r6, #1 vstr \reg, [r0] str r12, [r0, #\w] add r0, r0, #\stride bgt 0b ldrh r12, [r5, #-2] vldr \reg, [r5] add r8, r5, r2 strh r12, [r0, #-2] ldrh r12, [r5, #\w] vstr \reg, [r0] strh r12, [r0, #\w] ldrh r12, [r8, #-2] vldr \reg, 
[r8] strh r12, [r0, #\stride-2] ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] pop {r4-r8,pc} endfunc .endm padding_func_edged 8, 16, d0, 64 padding_func_edged 4, 8, s0, 32 tables filter 8, 8 filter 4, 8 find_dir 8 .macro load_px_8 d11, d12, d21, d22, w .if \w == 8 add r6, r2, r9 // x + off sub r9, r2, r9 // x - off vld1.8 {\d11}, [r6] // p0 add r6, r6, #16 // += stride vld1.8 {\d21}, [r9] // p1 add r9, r9, #16 // += stride vld1.8 {\d12}, [r6] // p0 vld1.8 {\d22}, [r9] // p1 .else add r6, r2, r9 // x + off sub r9, r2, r9 // x - off vld1.32 {\d11[0]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d21[0]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d11[1]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d21[1]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d12[0]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d22[0]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d12[1]}, [r6] // p0 vld1.32 {\d22[1]}, [r9] // p1 .endif .endm .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min .if \min vmin.u8 q3, q3, \s1 vmax.u8 q4, q4, \s1 vmin.u8 q3, q3, \s2 vmax.u8 q4, q4, \s2 .endif vabd.u8 q8, q0, \s1 // abs(diff) vabd.u8 q11, q0, \s2 // abs(diff) vshl.u8 q9, q8, \shift // abs(diff) >> shift vshl.u8 q12, q11, \shift // abs(diff) >> shift vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) vcgt.u8 q10, q0, \s1 // px > p0 vcgt.u8 q13, q0, \s2 // px > p1 vmin.u8 q9, q9, q8 // imin(abs(diff), clip) vmin.u8 q12, q12, q11 // imin(abs(diff), clip) vneg.s8 q8, q9 // -imin() vneg.s8 q11, q12 // -imin() vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip) vdup.8 d18, \tap // taps[k] vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip) vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain() vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain() vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain() vmlal.s8 
q2, d27, d18 // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func_8 w, pri, sec, min, suffix function cdef_filter\w\suffix\()_edged_neon .if \pri movrel_local r8, pri_taps and r9, r3, #1 add r8, r8, r9, lsl #1 .endif movrel_local r9, directions\w add r5, r9, r5, lsl #1 vmov.u8 d17, #7 vdup.8 d16, r6 // damping vmov.8 d8[0], r3 vmov.8 d8[1], r4 vclz.i8 d8, d8 // clz(threshold) vsub.i8 d8, d17, d8 // ulog2(threshold) vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) vneg.s8 d8, d8 // -shift .if \sec vdup.8 q6, d8[1] .endif .if \pri vdup.8 q5, d8[0] .endif 1: .if \w == 8 add r12, r2, #16 vld1.8 {d0}, [r2, :64] // px vld1.8 {d1}, [r12, :64] // px .else add r12, r2, #8 vld1.32 {d0[0]}, [r2, :32] // px add r9, r2, #2*8 vld1.32 {d0[1]}, [r12, :32] // px add r12, r12, #2*8 vld1.32 {d1[0]}, [r9, :32] // px vld1.32 {d1[1]}, [r12, :32] // px .endif vmov.u8 q1, #0 // sum vmov.u8 q2, #0 // sum .if \min vmov.u16 q3, q0 // min vmov.u16 q4, q0 // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. 
mov lr, #2 // sec_taps[0] 2: .if \pri ldrsb r9, [r5] // off1 load_px_8 d28, d29, d30, d31, \w .endif .if \sec add r5, r5, #4 // +2*2 ldrsb r9, [r5] // off2 .endif .if \pri ldrb r12, [r8] // *pri_taps vdup.8 q7, r3 // threshold handle_pixel_8 q14, q15, q7, q5, r12, \min .endif .if \sec load_px_8 d28, d29, d30, d31, \w add r5, r5, #8 // +2*4 ldrsb r9, [r5] // off3 vdup.8 q7, r4 // threshold handle_pixel_8 q14, q15, q7, q6, lr, \min load_px_8 d28, d29, d30, d31, \w handle_pixel_8 q14, q15, q7, q6, lr, \min sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; .else add r5, r5, #1 // r5 += 1 .endif subs lr, lr, #1 // sec_tap-- (value) .if \pri add r8, r8, #1 // pri_taps++ (pointer) .endif bne 2b vshr.s16 q14, q1, #15 // -(sum < 0) vshr.s16 q15, q2, #15 // -(sum < 0) vadd.i16 q1, q1, q14 // sum - (sum < 0) vadd.i16 q2, q2, q15 // sum - (sum < 0) vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4 vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4 vaddw.u8 q2, q2, d1 // px + (8 + sum ...) 
>> 4 vqmovun.s16 d0, q1 vqmovun.s16 d1, q2 .if \min vmin.u8 q0, q0, q4 vmax.u8 q0, q0, q3 // iclip(px + .., min, max) .endif .if \w == 8 vst1.8 {d0}, [r0, :64], r1 add r2, r2, #2*16 // tmp += 2*tmp_stride subs r7, r7, #2 // h -= 2 vst1.8 {d1}, [r0, :64], r1 .else vst1.32 {d0[0]}, [r0, :32], r1 add r2, r2, #4*8 // tmp += 4*tmp_stride vst1.32 {d0[1]}, [r0, :32], r1 subs r7, r7, #4 // h -= 4 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 .endif // Reset pri_taps and directions back to the original point sub r5, r5, #2 .if \pri sub r8, r8, #2 .endif bgt 1b vpop {q4-q7} pop {r4-r9,pc} endfunc .endm .macro filter_8 w filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec .endm filter_8 8 filter_8 4 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/cdef16.S000066400000000000000000000175071517466257200230440ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" // r1 = d0/q0 // r2 = d2/q1 .macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr s8, [\s1, #-4] vld1.16 {\r1}, [\s1, :\align] vldr s9, [\s1, #2*\w] vldr s10, [\s2, #-4] vld1.16 {\r2}, [\s2, :\align] vldr s11, [\s2, #2*\w] vstr s8, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s9, [r0, #2*\w] add r0, r0, #2*\stride vstr s10, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vldr s8, [\s1, #-4] vld1.16 {\r1}, [\s1, :\align] vldr s9, [\s2, #-4] vld1.16 {\r2}, [\s2, :\align] vstr s8, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s9, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vld1.16 {\r1}, [\s1, :\align] vldr s8, [\s1, #2*\w] vld1.16 {\r2}, [\s2, :\align] vldr s9, [\s2, #2*\w] vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s8, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // 
!CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {\r1}, [\s1, :\align] vld1.16 {\r2}, [\s2, :\align] vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride .endif 3: .endm // void dav2d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // r1 = d0/q0 // r2 = d2/q1 .macro padding_func_16 w, stride, r1, r2, align function cdef_padding\w\()_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] vmov.i16 q3, #0x8000 tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif b 3f 1: // CDEF_HAVE_TOP add r8, r4, r2 sub r0, r0, #2*(2*\stride) pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0 // Middle section 3: tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.32 {d2[]}, [r3, :32]! vldr s5, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s5, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.32 {d2[]}, [r3, :32]! 
vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b b 3f 2: tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vldr s4, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s4, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b 3: tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM add r8, r5, r2 pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1 endfunc .endm padding_func_16 8, 16, q0, q1, 128 padding_func_16 4, 8, d0, d2, 64 tables filter 8, 16 filter 4, 16 find_dir 16 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/cdef_tmpl.S000066400000000000000000000455671517466257200237400ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" .macro dir_table w, stride const directions\w .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 .byte 1 * \stride + 0, 2 * \stride + 0 .byte 1 * \stride + 0, 2 * \stride - 1 // Repeated, to avoid & 7 .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 endconst .endm .macro tables dir_table 8, 16 dir_table 4, 8 const pri_taps .byte 4, 2, 3, 3 endconst .endm .macro load_px d11, d12, d21, d22, w .if \w == 8 add r6, r2, r9, lsl #1 // x + off sub r9, r2, r9, lsl #1 // x - off vld1.16 {\d11,\d12}, [r6] // p0 vld1.16 {\d21,\d22}, [r9] // p1 .else add r6, r2, r9, lsl #1 // x + off sub r9, r2, r9, lsl #1 // x - off vld1.16 {\d11}, [r6] // p0 add r6, r6, #2*8 // += stride vld1.16 {\d21}, [r9] // p1 add r9, r9, #2*8 // += stride vld1.16 {\d12}, [r6] // p0 vld1.16 {\d22}, [r9] // p1 .endif .endm .macro 
handle_pixel s1, s2, thresh_vec, shift, tap, min .if \min vmin.u16 q2, q2, \s1 vmax.s16 q3, q3, \s1 vmin.u16 q2, q2, \s2 vmax.s16 q3, q3, \s2 .endif vabd.u16 q8, q0, \s1 // abs(diff) vabd.u16 q11, q0, \s2 // abs(diff) vshl.u16 q9, q8, \shift // abs(diff) >> shift vshl.u16 q12, q11, \shift // abs(diff) >> shift vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) vsub.i16 q10, \s1, q0 // diff = p0 - px vsub.i16 q13, \s2, q0 // diff = p1 - px vneg.s16 q8, q9 // -clip vneg.s16 q11, q12 // -clip vmin.s16 q10, q10, q9 // imin(diff, clip) vmin.s16 q13, q13, q12 // imin(diff, clip) vdup.16 q9, \tap // taps[k] vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() .endm // void dav2d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func w, bpc, pri, sec, min, suffix function cdef_filter\w\suffix\()_\bpc\()bpc_neon .if \bpc == 8 cmp r8, #0xf beq cdef_filter\w\suffix\()_edged_neon .endif .if \pri .if \bpc == 16 clz r9, r9 sub r9, r9, #24 // -bitdepth_min_8 neg r9, r9 // bitdepth_min_8 .endif movrel_local r8, pri_taps .if \bpc == 16 lsr r9, r3, r9 // pri_strength >> bitdepth_min_8 and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1 .else and r9, r3, #1 .endif add r8, r8, r9, lsl #1 .endif movrel_local r9, directions\w add r5, r9, r5, lsl #1 vmov.u16 d17, #15 vdup.16 d16, r6 // damping .if \pri vdup.16 q5, r3 // threshold .endif .if \sec vdup.16 q7, r4 // threshold .endif vmov.16 d8[0], r3 vmov.16 d8[1], r4 vclz.i16 d8, d8 // clz(threshold) vsub.i16 d8, d17, d8 // ulog2(threshold) vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) 
vneg.s16 d8, d8 // -shift .if \sec vdup.16 q6, d8[1] .endif .if \pri vdup.16 q4, d8[0] .endif 1: .if \w == 8 vld1.16 {q0}, [r2, :128] // px .else add r12, r2, #2*8 vld1.16 {d0}, [r2, :64] // px vld1.16 {d1}, [r12, :64] // px .endif vmov.u16 q1, #0 // sum .if \min vmov.u16 q2, q0 // min vmov.u16 q3, q0 // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov lr, #2 // sec_taps[0] 2: .if \pri ldrsb r9, [r5] // off1 load_px d28, d29, d30, d31, \w .endif .if \sec add r5, r5, #4 // +2*2 ldrsb r9, [r5] // off2 .endif .if \pri ldrb r12, [r8] // *pri_taps handle_pixel q14, q15, q5, q4, r12, \min .endif .if \sec load_px d28, d29, d30, d31, \w add r5, r5, #8 // +2*4 ldrsb r9, [r5] // off3 handle_pixel q14, q15, q7, q6, lr, \min load_px d28, d29, d30, d31, \w handle_pixel q14, q15, q7, q6, lr, \min sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; .else add r5, r5, #1 // r5 += 1 .endif subs lr, lr, #1 // sec_tap-- (value) .if \pri add r8, r8, #1 // pri_taps++ (pointer) .endif bne 2b vshr.s16 q14, q1, #15 // -(sum < 0) vadd.i16 q1, q1, q14 // sum - (sum < 0) vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 vadd.i16 q0, q0, q1 // px + (8 + sum ...) 
>> 4 .if \min vmin.s16 q0, q0, q3 vmax.s16 q0, q0, q2 // iclip(px + .., min, max) .endif .if \bpc == 8 vmovn.u16 d0, q0 .endif .if \w == 8 add r2, r2, #2*16 // tmp += tmp_stride subs r7, r7, #1 // h-- .if \bpc == 8 vst1.8 {d0}, [r0, :64], r1 .else vst1.16 {q0}, [r0, :128], r1 .endif .else .if \bpc == 8 vst1.32 {d0[0]}, [r0, :32], r1 .else vst1.16 {d0}, [r0, :64], r1 .endif add r2, r2, #2*16 // tmp += 2*tmp_stride subs r7, r7, #2 // h -= 2 .if \bpc == 8 vst1.32 {d0[1]}, [r0, :32], r1 .else vst1.16 {d1}, [r0, :64], r1 .endif .endif // Reset pri_taps and directions back to the original point sub r5, r5, #2 .if \pri sub r8, r8, #2 .endif bgt 1b vpop {q4-q7} pop {r4-r9,pc} endfunc .endm .macro filter w, bpc filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec function cdef_filter\w\()_\bpc\()bpc_neon, export=1 push {r4-r9,lr} vpush {q4-q7} ldrd r4, r5, [sp, #92] ldrd r6, r7, [sp, #100] .if \bpc == 16 ldrd r8, r9, [sp, #108] .else ldr r8, [sp, #108] .endif cmp r3, #0 // pri_strength bne 1f b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec 1: cmp r4, #0 // sec_strength bne 1f b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri 1: b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec endfunc .endm const div_table, align=4 .short 840, 420, 280, 210, 168, 140, 120, 105 endconst const alt_fact, align=4 .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 endconst .macro cost_alt dest, s1, s2, s3, s4, s5, s6 vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] vmull.s16 q2, \s2, \s2 vmull.s16 q3, \s3, \s3 vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] vmull.s16 q12, \s5, \s5 vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact vmla.i32 q1, q2, q14 vmla.i32 q1, q3, q15 vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact vmla.i32 q5, q12, q14 vmla.i32 q5, q6, q15 vadd.i32 d2, d2, d3 vadd.i32 d3, d10, d11 
vpadd.i32 \dest, d2, d3 // *cost_ptr .endm .macro find_best s1, s2, s3 .ifnb \s2 vmov.32 lr, \s2 .endif cmp r12, r1 // cost[n] > best_cost itt gt movgt r0, r3 // best_dir = n movgt r1, r12 // best_cost = cost[n] .ifnb \s2 add r3, r3, #1 // n++ cmp lr, r1 // cost[n] > best_cost vmov.32 r12, \s3 itt gt movgt r0, r3 // best_dir = n movgt r1, lr // best_cost = cost[n] add r3, r3, #1 // n++ .endif .endm // int dav2d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc function cdef_find_dir_\bpc\()bpc_neon, export=1 push {lr} vpush {q4-q7} .if \bpc == 16 clz r3, r3 // clz(bitdepth_max) sub lr, r3, #24 // -bitdepth_min_8 .endif sub sp, sp, #32 // cost mov r3, #8 vmov.u16 q1, #0 // q0-q1 sum_diag[0] vmov.u16 q3, #0 // q2-q3 sum_diag[1] vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] vmov.u16 q8, #0 // q6,d16 sum_alt[0] // q7,d17 sum_alt[1] vmov.u16 q9, #0 // q9,d22 sum_alt[2] vmov.u16 q11, #0 vmov.u16 q10, #0 // q10,d23 sum_alt[3] .irpc i, 01234567 .if \bpc == 8 vld1.8 {d30}, [r0, :64], r1 vmov.u8 d31, #128 vsubl.u8 q15, d30, d31 // img[x] - 128 .else vld1.16 {q15}, [r0, :128], r1 vdup.16 q14, lr // -bitdepth_min_8 vshl.u16 q15, q15, q14 vmov.u16 q14, #128 vsub.i16 q15, q15, q14 // img[x] - 128 .endif vmov.u16 q14, #0 .if \i == 0 vmov q0, q15 // sum_diag[0] .else vext.8 q12, q14, q15, #(16-2*\i) vext.8 q13, q15, q14, #(16-2*\i) vadd.i16 q0, q0, q12 // sum_diag[0] vadd.i16 q1, q1, q13 // sum_diag[0] .endif vrev64.16 q13, q15 vswp d26, d27 // [-x] .if \i == 0 vmov q2, q13 // sum_diag[1] .else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q13, q13, q14, #(16-2*\i) vadd.i16 q2, q2, q12 // sum_diag[1] vadd.i16 q3, q3, q13 // sum_diag[1] .endif vpadd.u16 d26, d30, d31 // [(x >> 1)] vmov.u16 d27, #0 vpadd.u16 d24, d26, d28 vpadd.u16 d24, d24, d28 // [y] vmov.u16 r12, d24[0] vadd.i16 q5, q5, q15 // sum_hv[1] .if \i < 4 vmov.16 d8[\i], r12 // sum_hv[0] .else vmov.16 d9[\i-4], r12 // sum_hv[0] .endif .if \i == 0 vmov.u16 q6, q13 // sum_alt[0] 
.else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q14, q13, q14, #(16-2*\i) vadd.i16 q6, q6, q12 // sum_alt[0] vadd.i16 d16, d16, d28 // sum_alt[0] .endif vrev64.16 d26, d26 // [-(x >> 1)] vmov.u16 q14, #0 .if \i == 0 vmov q7, q13 // sum_alt[1] .else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q13, q13, q14, #(16-2*\i) vadd.i16 q7, q7, q12 // sum_alt[1] vadd.i16 d17, d17, d26 // sum_alt[1] .endif .if \i < 6 vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) vadd.i16 q9, q9, q12 // sum_alt[2] vadd.i16 d22, d22, d26 // sum_alt[2] .else vadd.i16 q9, q9, q15 // sum_alt[2] .endif .if \i == 0 vmov q10, q15 // sum_alt[3] .elseif \i == 1 vadd.i16 q10, q10, q15 // sum_alt[3] .else vext.8 q12, q14, q15, #(16-2*(\i/2)) vext.8 q13, q15, q14, #(16-2*(\i/2)) vadd.i16 q10, q10, q12 // sum_alt[3] vadd.i16 d23, d23, d26 // sum_alt[3] .endif .endr vmov.u32 q15, #105 vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] vmlal.s16 q12, d9, d9 vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] vmlal.s16 q13, d11, d11 vadd.s32 d8, d24, d25 vadd.s32 d9, d26, d27 vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) vmul.i32 d8, d8, d30 // cost[2,6] *= 105 vrev64.16 q1, q1 vrev64.16 q3, q3 vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] vstr s16, [sp, #2*4] // cost[2] vstr s17, [sp, #6*4] // cost[6] movrel_local r12, div_table vld1.16 {q14}, [r12, :128] vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] vmull.s16 q12, d1, d1 vmlal.s16 q5, d2, d2 vmlal.s16 q12, d3, d3 vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] vmull.s16 q1, d5, d5 vmlal.s16 q0, d6, d6 vmlal.s16 q1, d7, d7 vmovl.u16 q13, d28 // div_table vmovl.u16 q14, d29 vmul.i32 q5, q5, q13 // cost[0] vmla.i32 q5, q12, q14 vmul.i32 q0, q0, q13 // cost[4] vmla.i32 q0, q1, q14 vadd.i32 d10, d10, d11 vadd.i32 d0, d0, d1 vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 movrel_local r12, alt_fact vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 vstr s0, [sp, #0*4] // cost[0] vstr s1, [sp, 
#4*4] // cost[4] vmovl.u16 q13, d29 // div_table[2*m+1] + 105 vmovl.u16 q14, d30 vmovl.u16 q15, d31 cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7] vstr s28, [sp, #1*4] // cost[1] vstr s29, [sp, #3*4] // cost[3] mov r0, #0 // best_dir vmov.32 r1, d0[0] // best_cost mov r3, #1 // n vstr s30, [sp, #5*4] // cost[5] vstr s31, [sp, #7*4] // cost[7] vmov.32 r12, d14[0] find_best d14[0], d8[0], d14[1] find_best d14[1], d0[1], d15[0] find_best d15[0], d8[1], d15[1] find_best d15[1] eor r3, r0, #4 // best_dir ^4 ldr r12, [sp, r3, lsl #2] sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] lsr r1, r1, #10 str r1, [r2] // *var add sp, sp, #32 vpop {q4-q7} pop {pc} endfunc .endm dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/filmgrain.S000066400000000000000000002037571517466257200237500ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 #define SUB_GRAIN_WIDTH 44 #define SUB_GRAIN_HEIGHT 38 .macro increment_seed steps, shift=1 lsr r11, r2, #3 lsr r12, r2, #12 lsr lr, r2, #1 eor r11, r2, r11 // (r >> 0) ^ (r >> 3) eor r12, r12, lr // (r >> 12) ^ (r >> 1) eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) .if \shift lsr r2, r2, #\steps .endif and r11, r11, #((1 << \steps) - 1) // bit .if \shift orr r2, r2, r11, lsl #(16 - \steps) // *state .else orr r2, r2, r11, lsl #16 // *state .endif .endm .macro read_rand dest, bits, age ubfx \dest, r2, #16 - \bits - \age, #\bits .endm .macro read_shift_rand dest, bits ubfx \dest, r2, #17 - \bits, #\bits lsr r2, r2, #1 .endm // special calling convention: // r2 holds seed // r3 holds dav2d_gaussian_sequence // clobbers r11-r12 // returns in d0-d1 function get_gaussian_neon push {r5-r6,lr} increment_seed 4 read_rand r5, 11, 3 read_rand r6, 11, 2 add r5, r3, r5, lsl #1 add r6, r3, r6, lsl #1 vld1.16 {d0[0]}, [r5] read_rand r5, 11, 1 vld1.16 {d0[1]}, [r6] add r5, r3, r5, lsl #1 read_rand r6, 11, 0 increment_seed 4 add r6, r3, r6, lsl #1 vld1.16 {d0[2]}, [r5] read_rand r5, 11, 3 vld1.16 {d0[3]}, [r6] add r5, r3, r5, lsl #1 read_rand r6, 11, 2 vld1.16 {d1[0]}, [r5] add r6, r3, r6, lsl #1 read_rand r5, 11, 1 vld1.16 {d1[1]}, [r6] read_rand r6, 11, 0 add r5, r3, r5, lsl #1 add r6, r3, r6, lsl #1 vld1.16 {d1[2]}, [r5] 
vld1.16 {d1[3]}, [r6] pop {r5-r6,pc} endfunc .macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r0, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r1, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r2, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r3, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r4, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r5, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r6, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r7, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r8, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r9, q0 increment_seed 2 read_rand r11, 11, 1 read_rand r12, 11, 0 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d0[0]}, [r11] vld1.16 {d0[1]}, [r12] vrshl.s16 d0, d0, d30 vmovn.i16 \r10, q0 .endm .macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 vst1.16 {\r0, \r1, \r2, \r3}, [r0]! vst1.16 {\r4, \r5, \r6, \r7}, [r0]! vst1.16 {\r8, \r9}, [r0]! vst1.16 {\r10[0]}, [r0]! .endm .macro get_grain_row_44 r0, r1, r2, r3, r4, r5 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r0, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r1, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r2, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r3, q0 bl get_gaussian_neon vrshl.s16 q0, q0, q15 vmovn.i16 \r4, q0 increment_seed 4 read_rand r11, 11, 3 read_rand r12, 11, 2 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d0[]}, [r11] read_rand r11, 11, 1 vld1.16 {d0[1]}, [r12] add r11, r3, r11, lsl #1 read_rand r12, 11, 0 vld1.16 {d0[2]}, [r11] add r12, r3, r12, lsl #1 vld1.16 {d0[3]}, [r12] vrshl.s16 d0, d0, d30 vmovn.i16 \r5, q0 .endm .macro store_grain_row_44 r0, r1, r2, r3, r4, r5 vst1.16 {\r0, \r1, \r2, \r3}, [r0]! 
vst1.16 {\r4, \r5}, [r0] add r0, r0, #GRAIN_WIDTH-32 .endm function get_grain_2_neon push {r11,lr} increment_seed 2 read_rand r11, 11, 1 read_rand r12, 11, 0 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d0[0]}, [r11] vld1.16 {d0[1]}, [r12] vrshl.s16 d0, d0, d30 vmovn.i16 d0, q0 pop {r11,pc} endfunc .macro get_grain_2 dst bl get_grain_2_neon .ifnc \dst, d0 vmov \dst, d0 .endif .endm // r1 holds the number of entries to produce // r6, r8 and r10 hold the previous output entries // q0 holds the vector of produced entries // q1 holds the input vector of sums from above .macro output_lag n function output_lag\n\()_neon push {r0, lr} .if \n == 1 mov lr, #-128 .else mov r0, #1 mov lr, #1 sub r7, r7, #1 sub r9, r9, #1 lsl r0, r0, r7 lsl lr, lr, r9 add r7, r7, #1 add r9, r9, #1 .endif 1: read_shift_rand r12, 11 vmov.32 r11, d2[0] lsl r12, r12, #1 vext.8 q0, q0, q0, #1 ldrsh r12, [r3, r12] .if \n == 1 mla r11, r6, r4, r11 // sum (above) + *coeff * prev output add r6, r11, r8 // 1 << (ar_coeff_shift - 1) add r12, r12, r10 asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 + grain_scale_shift) add r6, r6, r12 cmp r6, r5 .elseif \n == 2 mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 mla r11, r6, r10, r11 // += *coeff * prev output 2 mov r8, r6 add r6, r11, r0 // 1 << (ar_coeff_shift - 1) add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 + grain_scale_shift) add r6, r6, r12 push {lr} cmp r6, r5 mov lr, #-128 .else push {r1-r3} sbfx r1, r4, #0, #8 sbfx r2, r4, #8, #8 sbfx r3, r4, #16, #8 mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 mla r11, r6, r3, r11 // += *coeff * prev output 3 pop {r1-r3} mov r10, r8 mov r8, r6 add r6, r11, r0 // 1 << (ar_coeff_shift - 1) add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 + grain_scale_shift) add r6, 
r6, r12 push {lr} cmp r6, r5 mov lr, #-128 .endif it gt movgt r6, r5 cmp r6, lr it lt movlt r6, lr .if \n >= 2 pop {lr} .endif subs r1, r1, #1 vext.8 q1, q1, q1, #4 vmov.8 d1[7], r6 bgt 1b pop {r0, pc} endfunc .endm output_lag 1 output_lag 2 output_lag 3 function sum_lag1_above_neon vmull.s8 q2, d6, d28 vmull.s8 q3, d7, d28 vmull.s8 q4, d0, d27 vmull.s8 q5, d1, d27 vaddl.s16 q0, d4, d8 vaddl.s16 q2, d5, d9 vaddl.s16 q4, d6, d10 vaddl.s16 q5, d7, d11 vmull.s8 q3, d3, d29 vmull.s8 q1, d2, d29 vaddw.s16 q4, q4, d6 vaddw.s16 q5, q5, d7 vaddw.s16 q3, q2, d3 vaddw.s16 q2, q0, d2 bx lr endfunc .macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff .ifc \lag\()_\edge, lag3_left bl sum_lag3_left_above_neon .else bl sum_\lag\()_above_neon .endif .ifc \type, uv_420 vpush {q6-q7} add r12, r11, #GRAIN_WIDTH vld1.16 {q0, q1}, [r11]! vld1.16 {q6, q7}, [r12]! vpaddl.s8 q0, q0 vpaddl.s8 q1, q1 vpaddl.s8 q6, q6 vpaddl.s8 q7, q7 vadd.i16 q0, q0, q6 vadd.i16 q1, q1, q7 vpop {q6-q7} vrshrn.s16 d0, q0, #2 vrshrn.s16 d1, q1, #2 .endif .ifc \type, uv_422 vld1.8 {q0, q1}, [r11]! vpaddl.s8 q0, q0 vpaddl.s8 q1, q1 vrshrn.s16 d0, q0, #1 vrshrn.s16 d1, q1, #1 .endif .ifc \type, uv_444 vld1.8 {q0}, [r11]! 
.endif .if \uv_layout .ifnb \uv_coeff vdup.8 d13, \uv_coeff .endif vmull.s8 q1, d0, d13 vmull.s8 q0, d1, d13 vaddw.s16 q2, q2, d2 vaddw.s16 q3, q3, d3 vaddw.s16 q4, q4, d0 vaddw.s16 q5, q5, d1 .endif .if \uv_layout && \elems == 16 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 444 && \elems == 15 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 422 && \elems == 9 b sum_\lag\()_uv_420_\edge\()_start .else sum_\lag\()_\type\()_\edge\()_start: push {r11} .ifc \edge, left increment_seed 4 read_rand r11, 11, 3 read_rand r12, 11, 2 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d1[1]}, [r11] read_rand r11, 11, 1 vld1.16 {d1[2]}, [r12] add r11, r3, r11, lsl #1 vld1.16 {d1[3]}, [r11] lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 vrshl.s16 d1, d1, d30 vmovn.i16 d1, q0 vext.8 q2, q2, q2, #12 .ifc \lag, lag3 vmov.s8 r10, d1[5] .endif .ifnc \lag, lag1 vmov.s8 r8, d1[6] .endif vmov.s8 r6, d1[7] vmov q1, q2 mov r1, #1 bl output_\lag\()_neon .else increment_seed 4, shift=0 vmov q1, q2 mov r1, #4 bl output_\lag\()_neon .endif increment_seed 4, shift=0 vmov q1, q3 mov r1, #4 bl output_\lag\()_neon increment_seed 4, shift=0 vmov q1, q4 .if \elems == 9 mov r1, #1 bl output_\lag\()_neon lsr r2, r2, #3 read_rand r11, 11, 2 read_rand r12, 11, 1 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d2[0]}, [r11] read_rand r11, 11, 0 vld1.16 {d2[1]}, [r12] add r11, r3, r11, lsl #1 vld1.16 {d2[2]}, [r11] vrshl.s16 d2, d2, d30 vmovn.i16 d2, q1 vext.8 q0, q0, q1, #7 .else mov r1, #4 bl output_\lag\()_neon increment_seed 4, shift=0 vmov q1, q5 .ifc \edge, right mov r1, #3 bl output_\lag\()_neon read_shift_rand r11, 11 add r11, r3, r11, lsl #1 vld1.16 {d2[0]}, [r11] vrshl.s16 d2, d2, d30 vext.8 q0, q0, q1, #1 .else mov r1, #4 bl output_\lag\()_neon .endif .endif .if \store vst1.8 {q0}, [r0]! 
.endif pop {r11} pop {r1, pc} .endif .endm .macro sum_lag1_func type, uv_layout, edge, elems=16 function sum_\type\()_lag1_\edge\()_neon push {r1, lr} sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 endfunc .endm sum_lag1_func y, 0, left sum_lag1_func y, 0, mid sum_lag1_func y, 0, right, 15 sum_lag1_func uv_444, 444, left sum_lag1_func uv_444, 444, mid sum_lag1_func uv_444, 444, right, 15 sum_lag1_func uv_422, 422, left sum_lag1_func uv_422, 422, mid sum_lag1_func uv_422, 422, right, 9 sum_lag1_func uv_420, 420, left sum_lag1_func uv_420, 420, mid sum_lag1_func uv_420, 420, right, 9 .macro sum_lag1 type, dst, left, mid, right, edge=mid vmov q3, \mid vext.8 q0, \left, \mid, #15 vext.8 q1, \mid, \right, #1 bl sum_\type\()_lag1_\edge\()_neon vmov \dst, q0 .endm .macro sum_y_lag1 dst, left, mid, right, edge=mid sum_lag1 y, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_444_lag1 dst, left, mid, right, edge=mid sum_lag1 uv_444, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_422_lag1 dst, left, mid, right, edge=mid sum_lag1 uv_422, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_420_lag1 dst, left, mid, right, edge=mid sum_lag1 uv_420, \dst, \left, \mid, \right, \edge .endm function sum_lag2_above_neon push {lr} sub r12, r0, #2*GRAIN_WIDTH - 16 sub lr, r0, #1*GRAIN_WIDTH - 16 vld1.8 {q10}, [r12] // load top right vld1.8 {q13}, [lr] vext.8 q6, q8, q9, #14 // top left, top mid vdup.8 d14, d28[0] vext.8 q8, q8, q9, #15 vdup.8 d15, d28[1] vmull.s8 q0, d12, d14 vmull.s8 q1, d13, d14 vmull.s8 q6, d16, d15 vmull.s8 q8, d17, d15 vaddl.s16 q2, d0, d12 vaddl.s16 q3, d1, d13 vaddl.s16 q4, d2, d16 vaddl.s16 q5, d3, d17 vext.8 q6, q9, q10, #1 // top mid, top right vdup.8 d14, d28[3] vext.8 q8, q9, q10, #2 vdup.8 d15, d28[4] vmull.s8 q0, d12, d14 vmull.s8 q1, d13, d14 vmull.s8 q6, d16, d15 vmull.s8 q8, d17, d15 vaddl.s16 q7, d0, d12 vaddl.s16 q0, d1, d13 vaddl.s16 q6, d2, d16 vaddl.s16 q1, d3, d17 vadd.i32 q2, q2, q7 vadd.i32 q3, q3, q0 vadd.i32 q4, q4, 
q6 vadd.i32 q5, q5, q1 vext.8 q6, q11, q12, #14 // top left, top mid vdup.8 d14, d28[5] vext.8 q8, q11, q12, #15 vdup.8 d15, d28[6] vmull.s8 q0, d12, d14 vmull.s8 q1, d13, d14 vmull.s8 q6, d16, d15 vmull.s8 q8, d17, d15 vaddl.s16 q7, d0, d12 vaddl.s16 q0, d1, d13 vaddl.s16 q6, d2, d16 vaddl.s16 q1, d3, d17 vadd.i32 q2, q2, q7 vadd.i32 q3, q3, q0 vadd.i32 q4, q4, q6 vadd.i32 q5, q5, q1 vext.8 q6, q12, q13, #1 // top mid, top right vdup.8 d14, d29[0] vext.8 q8, q12, q13, #2 vdup.8 d15, d29[1] vmull.s8 q0, d12, d14 vmull.s8 q1, d13, d14 vmull.s8 q6, d16, d15 vmull.s8 q8, d17, d15 vaddl.s16 q7, d0, d12 vaddl.s16 q0, d1, d13 vaddl.s16 q6, d2, d16 vaddl.s16 q1, d3, d17 vadd.i32 q2, q2, q7 vadd.i32 q3, q3, q0 vadd.i32 q4, q4, q6 vadd.i32 q5, q5, q1 vdup.8 d14, d28[2] vdup.8 d15, d28[7] vmull.s8 q0, d18, d14 vmull.s8 q1, d19, d14 vmull.s8 q6, d24, d15 vmull.s8 q8, d25, d15 vaddl.s16 q7, d0, d12 vaddl.s16 q0, d1, d13 vaddl.s16 q6, d2, d16 vaddl.s16 q1, d3, d17 vmov q8, q9 vmov q9, q10 vadd.i32 q2, q2, q7 vadd.i32 q3, q3, q0 vadd.i32 q4, q4, q6 vadd.i32 q5, q5, q1 vmov q11, q12 vmov q12, q13 pop {pc} endfunc .macro sum_lag2_func type, uv_layout, edge, elems=16 function sum_\type\()_lag2_\edge\()_neon push {r1, lr} .ifc \edge, left sub r12, r0, #2*GRAIN_WIDTH sub lr, r0, #1*GRAIN_WIDTH vld1.8 {q9}, [r12] // load the previous block right above vld1.8 {q12}, [lr] .endif sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4] endfunc .endm sum_lag2_func y, 0, left sum_lag2_func y, 0, mid sum_lag2_func y, 0, right, 15 sum_lag2_func uv_444, 444, left sum_lag2_func uv_444, 444, mid sum_lag2_func uv_444, 444, right, 15 sum_lag2_func uv_422, 422, left sum_lag2_func uv_422, 422, mid sum_lag2_func uv_422, 422, right, 9 sum_lag2_func uv_420, 420, left sum_lag2_func uv_420, 420, mid sum_lag2_func uv_420, 420, right, 9 function sum_lag3_left_above_neon // A separate codepath for the left edge, to avoid reading outside // of the edge of the buffer. 
sub r12, r0, #3*GRAIN_WIDTH vld1.8 {q11, q12}, [r12] vext.8 q12, q11, q12, #13 vext.8 q11, q11, q11, #13 b sum_lag3_above_start endfunc function sum_lag3_above_neon sub r12, r0, #3*GRAIN_WIDTH + 3 vld1.8 {q11, q12}, [r12] sum_lag3_above_start: vdup.8 d20, d26[0] vext.8 q9, q11, q12, #1 vdup.8 d21, d26[1] vmull.s8 q0, d22, d20 vmull.s8 q1, d23, d20 vmull.s8 q6, d18, d21 vmull.s8 q7, d19, d21 vext.8 q8, q11, q12, #2 vdup.8 d20, d26[2] vext.8 q9, q11, q12, #3 vdup.8 d21, d26[3] vaddl.s16 q2, d0, d12 vaddl.s16 q3, d1, d13 vaddl.s16 q4, d2, d14 vaddl.s16 q5, d3, d15 vmull.s8 q0, d16, d20 vmull.s8 q1, d17, d20 vmull.s8 q6, d18, d21 vmull.s8 q7, d19, d21 vaddl.s16 q8, d0, d12 vaddl.s16 q9, d1, d13 vaddl.s16 q0, d2, d14 vaddl.s16 q1, d3, d15 vext.8 q6, q11, q12, #4 vdup.8 d20, d26[4] vext.8 q7, q11, q12, #5 vdup.8 d21, d26[5] vadd.i32 q2, q2, q8 vadd.i32 q3, q3, q9 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d12, d20 vmull.s8 q1, d13, d20 vmull.s8 q8, d14, d21 vmull.s8 q9, d15, d21 sub r12, r0, #2*GRAIN_WIDTH + 3 vaddl.s16 q6, d0, d16 vaddl.s16 q7, d1, d17 vaddl.s16 q0, d2, d18 vaddl.s16 q1, d3, d19 vext.8 q8, q11, q12, #6 vld1.8 {q11, q12}, [r12] vdup.8 d20, d26[6] vdup.8 d21, d26[7] vadd.i32 q2, q2, q6 vadd.i32 q3, q3, q7 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d16, d20 vmull.s8 q1, d17, d20 vmull.s8 q6, d22, d21 vmull.s8 q7, d23, d21 vaddl.s16 q8, d0, d12 vaddl.s16 q9, d1, d13 vaddl.s16 q0, d2, d14 vaddl.s16 q1, d3, d15 vext.8 q6, q11, q12, #1 vdup.8 d20, d27[0] vext.8 q7, q11, q12, #2 vdup.8 d21, d27[1] vadd.i32 q2, q2, q8 vadd.i32 q3, q3, q9 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d12, d20 vmull.s8 q1, d13, d20 vmull.s8 q8, d14, d21 vmull.s8 q9, d15, d21 vaddl.s16 q6, d0, d16 vaddl.s16 q7, d1, d17 vaddl.s16 q0, d2, d18 vaddl.s16 q1, d3, d19 vext.8 q8, q11, q12, #3 vdup.8 d20, d27[2] vext.8 q9, q11, q12, #4 vdup.8 d21, d27[3] vadd.i32 q2, q2, q6 vadd.i32 q3, q3, q7 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d16, d20 vmull.s8 
q1, d17, d20 vmull.s8 q6, d18, d21 vmull.s8 q7, d19, d21 sub r12, r0, #1*GRAIN_WIDTH + 3 vaddl.s16 q8, d0, d12 vaddl.s16 q9, d1, d13 vaddl.s16 q0, d2, d14 vaddl.s16 q1, d3, d15 vext.8 q6, q11, q12, #5 vdup.8 d20, d27[4] vext.8 q7, q11, q12, #6 vdup.8 d21, d27[5] vld1.8 {q11, q12}, [r12] vadd.i32 q2, q2, q8 vadd.i32 q3, q3, q9 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d12, d20 vmull.s8 q1, d13, d20 vmull.s8 q8, d14, d21 vmull.s8 q9, d15, d21 vaddl.s16 q6, d0, d16 vaddl.s16 q7, d1, d17 vaddl.s16 q0, d2, d18 vaddl.s16 q1, d3, d19 vdup.8 d20, d27[6] vext.8 q9, q11, q12, #1 vdup.8 d21, d27[7] vadd.i32 q2, q2, q6 vadd.i32 q3, q3, q7 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d22, d20 vmull.s8 q1, d23, d20 vmull.s8 q6, d18, d21 vmull.s8 q7, d19, d21 vaddl.s16 q8, d0, d12 vaddl.s16 q9, d1, d13 vaddl.s16 q0, d2, d14 vaddl.s16 q1, d3, d15 vext.8 q6, q11, q12, #2 vdup.8 d20, d28[0] vext.8 q7, q11, q12, #3 vdup.8 d21, d28[1] vadd.i32 q2, q2, q8 vadd.i32 q3, q3, q9 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d12, d20 vmull.s8 q1, d13, d20 vmull.s8 q8, d14, d21 vmull.s8 q9, d15, d21 vaddl.s16 q6, d0, d16 vaddl.s16 q7, d1, d17 vaddl.s16 q0, d2, d18 vaddl.s16 q1, d3, d19 vext.8 q8, q11, q12, #4 vdup.8 d20, d28[2] vext.8 q9, q11, q12, #5 vdup.8 d21, d28[3] vadd.i32 q2, q2, q6 vadd.i32 q3, q3, q7 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d16, d20 vmull.s8 q1, d17, d20 vmull.s8 q6, d18, d21 vmull.s8 q7, d19, d21 vaddl.s16 q8, d0, d12 vaddl.s16 q9, d1, d13 vaddl.s16 q0, d2, d14 vaddl.s16 q1, d3, d15 vext.8 q6, q11, q12, #6 vdup.8 d20, d28[4] vadd.i32 q2, q2, q8 vadd.i32 q3, q3, q9 vadd.i32 q4, q4, q0 vadd.i32 q5, q5, q1 vmull.s8 q0, d12, d20 vmull.s8 q1, d13, d20 vaddw.s16 q2, q2, d0 vaddw.s16 q3, q3, d1 vaddw.s16 q4, q4, d2 vaddw.s16 q5, q5, d3 bx lr endfunc .macro sum_lag3_func type, uv_layout, edge, elems=16 function sum_\type\()_lag3_\edge\()_neon push {r1, lr} sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0] 
endfunc .endm sum_lag3_func y, 0, left sum_lag3_func y, 0, mid sum_lag3_func y, 0, right, 15 sum_lag3_func uv_444, 444, left sum_lag3_func uv_444, 444, mid sum_lag3_func uv_444, 444, right, 15 sum_lag3_func uv_422, 422, left sum_lag3_func uv_422, 422, mid sum_lag3_func uv_422, 422, right, 9 sum_lag3_func uv_420, 420, left sum_lag3_func uv_420, 420, mid sum_lag3_func uv_420, 420, right, 9 function generate_grain_rows_neon push {r11,lr} 1: get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 subs r1, r1, #1 store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 bgt 1b pop {r11,pc} endfunc function generate_grain_rows_44_neon push {r11,lr} 1: get_grain_row_44 d16, d17, d18, d19, d20, d21 subs r1, r1, #1 store_grain_row_44 d16, d17, d18, d19, d20, d21 bgt 1b pop {r11,pc} endfunc function gen_grain_uv_444_lag0_neon vld1.8 {q3}, [r11]! push {r11,lr} bl get_gaussian_neon vrshl.s16 q8, q0, q15 bl get_gaussian_neon vrshl.s16 q9, q0, q15 vqmovn.s16 d0, q8 vqmovn.s16 d1, q9 vand q3, q3, q1 vmull.s8 q2, d6, d22 vmull.s8 q3, d7, d22 vrshl.s16 q2, q2, q12 vrshl.s16 q3, q3, q12 vaddw.s8 q2, q2, d0 vaddw.s8 q3, q3, d1 vqmovn.s16 d4, q2 vqmovn.s16 d5, q3 vst1.8 {q2}, [r0]! pop {r11,pc} endfunc function get_grain_row_44_neon push {r11,lr} get_grain_row_44 d16, d17, d18, d19, d20, d21 pop {r11,pc} endfunc function add_uv_420_coeff_lag0_neon vld1.16 {q2, q3}, [r11]! vld1.16 {q4, q5}, [r12]! vpaddl.s8 q2, q2 vpaddl.s8 q3, q3 vpaddl.s8 q4, q4 vpaddl.s8 q5, q5 vadd.i16 q2, q2, q4 vadd.i16 q3, q3, q5 vrshrn.s16 d4, q2, #2 vrshrn.s16 d5, q3, #2 b add_coeff_lag0_start endfunc function add_uv_422_coeff_lag0_neon vld1.16 {q2, q3}, [r11]! 
vpaddl.s8 q2, q2 vpaddl.s8 q3, q3 vrshrn.s16 d4, q2, #1 vrshrn.s16 d5, q3, #1 add_coeff_lag0_start: vand q3, q2, q1 vmull.s8 q2, d6, d22 vmull.s8 q3, d7, d22 vrshl.s16 q2, q2, q12 vrshl.s16 q3, q3, q12 vaddw.s8 q2, q2, d0 vaddw.s8 q3, q3, d1 vqmovn.s16 d4, q2 vqmovn.s16 d5, q3 bx lr endfunc .macro gen_grain_82 type function generate_grain_\type\()_8bpc_neon, export=1 push {r4-r11,lr} .ifc \type, uv_444 mov r12, r3 mov lr, #28 add r11, r1, #3*GRAIN_WIDTH mov r1, r2 mul r12, r12, lr .endif movrel r3, X(gaussian_sequence) ldr r2, [r1, #FGD_SEED] ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] .ifc \type, y add r4, r1, #FGD_AR_COEFFS_Y .else add r4, r1, #FGD_AR_COEFFS_UV .endif adr r5, L(gen_grain_\type\()_tbl) ldr r6, [r1, #FGD_AR_COEFF_LAG] add r9, r9, #4 ldr r6, [r5, r6, lsl #2] vdup.16 q15, r9 // 4 + data->grain_scale_shift add r5, r5, r6 vneg.s16 q15, q15 .ifc \type, uv_444 cmp r12, #0 movw r10, #0x49d8 movw lr, #0xb524 // Intentionally using a separate register instead of moveq with an // immediate constant, to avoid armv8 deprecated it instruction forms. 
it eq moveq r10, lr add r4, r4, r12 // Add offset to ar_coeffs_uv[1] eor r2, r2, r10 .endif ldr r7, [r1, #FGD_AR_COEFF_SHIFT] mov r8, #1 mov r10, #1 lsl r8, r8, r7 // 1 << ar_coeff_shift lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) bx r5 .align 2 L(gen_grain_\type\()_tbl): .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB L(generate_grain_\type\()_lag0): .ifc \type, y mov r1, #GRAIN_HEIGHT bl generate_grain_rows_neon .else mov r1, #3 bl generate_grain_rows_neon mov r1, #GRAIN_HEIGHT-3 vdup.16 q12, r7 vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] vmov.i8 q0, #0 vmov.i8 q1, #255 vext.8 q13, q0, q1, #13 vext.8 q14, q1, q0, #1 vneg.s16 q12, q12 1: vmov q1, q13 bl gen_grain_uv_444_lag0_neon // 16 vmov.i8 q1, #255 bl gen_grain_uv_444_lag0_neon // 32 bl gen_grain_uv_444_lag0_neon // 48 bl gen_grain_uv_444_lag0_neon // 64 vmov q1, q14 bl gen_grain_uv_444_lag0_neon // 80 get_grain_2 d16 subs r1, r1, #1 add r11, r11, #2 vst1.16 {d16[0]}, [r0]! bgt 1b .endif pop {r4-r11,pc} L(generate_grain_\type\()_lag1): vpush {q4-q7} mov r5, #127 vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] .ifc \type, y ldrsb r4, [r4, #1] // ar_coeffs_y[3] .else add r4, r4, #2 .endif mov r1, #3 .ifc \type, uv_444 vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] .endif bl generate_grain_rows_neon mov r1, #GRAIN_HEIGHT - 3 1: sum_\type\()_lag1 q7, q8, q8, q9, left sum_\type\()_lag1 q8, q8, q9, q10 sum_\type\()_lag1 q9, q9, q10, q11 sum_\type\()_lag1 q10, q10, q11, q12 sum_\type\()_lag1 q12, q11, q12, q13, right get_grain_2 d26 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #2 .endif store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26 vmov q11, q10 vmov q10, q9 vmov q9, q8 vmov q8, q7 bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag2): vpush {q4-q7} mov r5, #127 vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] vmov.s8 r4, d29[2] vmov.s8 r10, d29[3] mov r1, #3 bl generate_grain_rows_neon mov r1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag2_left_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_right_neon get_grain_2 d16 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #2 .endif vst1.16 {d16[0]}, [r0]! bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag3): vpush {q4-q7} mov r5, #127 vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] vmov.u8 r4, d28[5] vmov.u8 r10, d28[6] vmov.u8 r12, d28[7] orr r4, r4, r10, lsl #8 orr r4, r4, r12, lsl #16 mov r1, #3 vpush {d26} bl generate_grain_rows_neon vpop {d26} mov r1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag3_left_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_right_neon get_grain_2 d16 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #2 .endif vst1.16 {d16[0]}, [r0]! 
bgt 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm gen_grain_82 y gen_grain_82 uv_444 .macro set_height dst, type .ifc \type, uv_420 mov \dst, #SUB_GRAIN_HEIGHT-3 .else mov \dst, #GRAIN_HEIGHT-3 .endif .endm .macro increment_y_ptr reg, type .ifc \type, uv_420 add \reg, \reg, #2*GRAIN_WIDTH-(3*32) .else sub \reg, \reg, #3*32-GRAIN_WIDTH .endif .endm .macro gen_grain_44 type function generate_grain_\type\()_8bpc_neon, export=1 push {r4-r11,lr} mov r12, r3 mov lr, #28 add r11, r1, #3*GRAIN_WIDTH-3 mov r1, r2 mul r12, r12, lr movrel r3, X(gaussian_sequence) ldr r2, [r1, #FGD_SEED] ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] add r4, r1, #FGD_AR_COEFFS_UV adr r5, L(gen_grain_\type\()_tbl) ldr r6, [r1, #FGD_AR_COEFF_LAG] add r9, r9, #4 ldr r6, [r5, r6, lsl #2] vdup.16 q15, r9 // 4 + data->grain_scale_shift add r5, r5, r6 vneg.s16 q15, q15 cmp r12, #0 movw r10, #0x49d8 movw lr, #0xb524 // Intentionally using a separate register instead of moveq with an // immediate constant, to avoid armv8 deprecated it instruction forms. 
it eq moveq r10, lr add r4, r4, r12 // Add offset to ar_coeffs_uv[1] eor r2, r2, r10 ldr r7, [r1, #FGD_AR_COEFF_SHIFT] mov r8, #1 mov r10, #1 lsl r8, r8, r7 // 1 << ar_coeff_shift lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) bx r5 .align 2 L(gen_grain_\type\()_tbl): .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB L(generate_grain_\type\()_lag0): .ifc \type, uv_420 vpush {q4-q5} .endif mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type vdup.16 q12, r7 vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] vmov.i8 q0, #0 vmov.i8 q1, #255 vext.8 q13, q0, q1, #13 vext.8 q14, q1, q0, #7 vneg.s16 q12, q12 1: bl get_grain_row_44_neon .ifc \type, uv_420 add r12, r11, #GRAIN_WIDTH .endif vmov q1, q13 vmov q0, q8 bl add_\type\()_coeff_lag0_neon vmov.i8 q1, #255 vmov q0, q9 vmov q8, q2 bl add_\type\()_coeff_lag0_neon vmov.i8 q1, q14 vmov q0, q10 vmov q9, q2 bl add_\type\()_coeff_lag0_neon vmov q10, q2 subs r1, r1, #1 increment_y_ptr r11, \type store_grain_row_44 d16, d17, d18, d19, d20, d21 bgt 1b .ifc \type, uv_420 vpop {q4-q5} .endif pop {r4-r11,pc} L(generate_grain_\type\()_lag1): vpush {q4-q7} mov r5, #127 vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] add r4, r4, #2 mov r1, #3 vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] bl generate_grain_rows_44_neon set_height r1, \type 1: sum_\type\()_lag1 q7, q8, q8, q9, left sum_\type\()_lag1 q8, q8, q9, q10 sum_\type\()_lag1 q10, q9, q10, q11, right subs r1, r1, #1 increment_y_ptr r11, \type store_grain_row_44 d14, d15, d16, d17, d20, d21 vmov q9, q8 vmov q8, q7 bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag2): vpush {q4-q7} mov r5, #127 vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] vmov.s8 r4, d29[2] vmov.s8 r10, d29[3] mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type 1: bl sum_\type\()_lag2_left_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_right_neon subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH-48 bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag3): vpush {q4-q7} mov r5, #127 vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] vmov.u8 r4, d28[5] vmov.u8 r10, d28[6] vmov.u8 r12, d28[7] orr r4, r4, r10, lsl #8 orr r4, r4, r12, lsl #16 mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type 1: bl sum_\type\()_lag3_left_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_right_neon subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH-48 bgt 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm gen_grain_44 uv_420 gen_grain_44 uv_422 .macro gather_interleaved dst1, dst2, src1, src2, off vmov.u8 r11, \src1[0+\off] vmov.u8 r12, \src2[0+\off] add r11, r11, r3 vmov.u8 lr, \src1[2+\off] add r12, r12, r3 vld1.8 {\dst1[0+\off]}, [r11] vmov.u8 r11, \src2[2+\off] add lr, lr, r3 vld1.8 {\dst2[0+\off]}, [r12] vmov.u8 r12, \src1[4+\off] add r11, r11, r3 vld1.8 {\dst1[2+\off]}, [lr] vmov.u8 lr, \src2[4+\off] add r12, r12, r3 vld1.8 {\dst2[2+\off]}, [r11] vmov.u8 r11, \src1[6+\off] add lr, lr, r3 vld1.8 {\dst1[4+\off]}, [r12] vmov.u8 r12, \src2[6+\off] add r11, r11, r3 vld1.8 
{\dst2[4+\off]}, [lr] add r12, r12, r3 vld1.8 {\dst1[6+\off]}, [r11] vld1.8 {\dst2[6+\off]}, [r12] .endm .macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 gather_interleaved \dst1, \dst3, \src1, \src3, 0 gather_interleaved \dst1, \dst3, \src1, \src3, 1 gather_interleaved \dst2, \dst4, \src2, \src4, 0 gather_interleaved \dst2, \dst4, \src2, \src4, 1 .endm function gather32_neon push {r11-r12,lr} gather d8, d9, d10, d11, d0, d1, d2, d3 pop {r11-r12,pc} endfunc function gather16_neon push {r11-r12,lr} gather_interleaved d8, d9, d0, d1, 0 gather_interleaved d8, d9, d0, d1, 1 pop {r11-r12,pc} endfunc const overlap_coeffs_0, align=4 .byte 27, 17, 0, 0, 0, 0, 0, 0 .byte 17, 27, 32, 32, 32, 32, 32, 32 endconst const overlap_coeffs_1, align=4 .byte 23, 0, 0, 0, 0, 0, 0, 0 .byte 22, 32, 32, 32, 32, 32, 32, 32 endconst .macro calc_offset offx, offy, src, sx, sy and \offy, \src, #0xF // randval & 0xF lsr \offx, \src, #4 // randval >> 4 .if \sy == 0 add \offy, \offy, \offy // 2 * (randval & 0xF) .endif .if \sx == 0 add \offx, \offx, \offx // 2 * (randval >> 4) .endif .endm .macro add_offset dst, offx, offy, src, stride mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy add \dst, \dst, \offx // grain_lut += offx .endm // void dav2d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, // const ptrdiff_t stride, // const uint8_t scaling[SCALING_SIZE], // const int scaling_shift, // const entry grain_lut[][GRAIN_WIDTH], // const int offsets[][2], // const int h, const ptrdiff_t clip, // const ptrdiff_t type); function fgy_32x32_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut ldrd r6, r7, [sp, #108] // offsets, h ldr r8, [sp, #116] // clip mov r9, #GRAIN_WIDTH // grain_lut stride neg r4, r4 vdup.16 q13, r4 // -scaling_shift cmp r8, #0 movrel_local r12, overlap_coeffs_0 beq 1f // clip vmov.i8 q14, #16 vmov.i8 q15, #235 b 2f 1: // no clip vmov.i8 q14, #0 vmov.i8 q15, #255 2: vld1.8 {d24, d25}, 
[r12, :128] // overlap_coeffs add r5, r5, #9 // grain_lut += 9 add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride add r5, r5, r9 // grain_lut += grain_stride ldr r10, [r6, #8] // offsets[1][0] calc_offset r10, r4, r10, 0, 0 add_offset r4, r10, r4, r5, r9 ldr r10, [r6, #4] // offsets[0][1] calc_offset r10, r11, r10, 0, 0 add_offset r11, r10, r11, r5, r9 ldr r10, [r6, #12] // offsets[1][1] calc_offset r10, r8, r10, 0, 0 add_offset r8, r10, r8, r5, r9 ldr r6, [r6] // offsets[0][0] calc_offset r6, lr, r6, 0, 0 add_offset r5, r6, lr, r5, r9 add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by ldr r10, [sp, #120] // type adr r11, L(fgy_loop_tbl) tst r10, #1 ldr r10, [r11, r10, lsl #2] add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx add r11, r11, r10 beq 1f // y overlap vdup.8 d14, d24[0] vdup.8 d15, d24[1] mov r10, r7 // backup actual h mov r7, #2 1: bx r11 endfunc function fgy_loop_neon L(fgy_loop_tbl): .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB .macro fgy ox, oy L(loop_\ox\oy): 1: .if \ox vld1.8 {d8}, [r4], r9 // grain_lut old .endif .if \oy vld1.8 {q2, q3}, [r6], r9 // grain_lut top .endif .if \ox && \oy vld1.8 {d10}, [r8], r9 // grain_lut top old .endif vld1.8 {q0, q1}, [r1, :128], r2 // src vld1.8 {q10, q11}, [r5], r9 // grain_lut .if \ox vmull.s8 q4, d8, d24 vmlal.s8 q4, d20, d25 .endif .if \oy .if \ox vmull.s8 q5, d10, d24 vmlal.s8 q5, d4, d25 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d4, q5, #5 .endif vmull.s8 q4, d20, d15 vmull.s8 q5, d21, d15 vmull.s8 q8, d22, d15 vmull.s8 q9, d23, d15 vmlal.s8 q4, d4, d14 vmlal.s8 q5, d5, d14 vmlal.s8 q8, d6, d14 vmlal.s8 q9, d7, d14 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d21, q5, #5 vqrshrn.s16 d22, q8, #5 vqrshrn.s16 d23, q9, #5 
.elseif \ox vqrshrn.s16 d20, q4, #5 .endif bl gather32_neon vmovl.s8 q8, d20 // grain vmovl.s8 q9, d21 vmovl.s8 q10, d22 vmovl.s8 q11, d23 vmovl.u8 q2, d8 // scaling vmovl.u8 q3, d9 vmovl.u8 q4, d10 vmovl.u8 q5, d11 vmul.i16 q8, q8, q2 // scaling * grain vmul.i16 q9, q9, q3 vmul.i16 q10, q10, q4 vmul.i16 q11, q11, q5 vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) vrshl.s16 q9, q9, q13 vrshl.s16 q10, q10, q13 vrshl.s16 q11, q11, q13 vaddw.u8 q8, q8, d0 // *src + noise vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 vaddw.u8 q11, q11, d3 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vqmovun.s16 d2, q10 vqmovun.s16 d3, q11 vmax.u8 q0, q0, q14 vmax.u8 q1, q1, q14 vmin.u8 q0, q0, q15 vmin.u8 q1, q1, q15 subs r7, r7, #1 .if \oy vdup.8 d14, d25[0] vdup.8 d15, d25[1] .endif vst1.8 {q0, q1}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r10, #2 sub r7, r10, #2 // restore actual remaining h bgt L(loop_\ox\()0) .endif vpop {q4-q7} pop {r4-r11,pc} .endm fgy 0, 0 fgy 0, 1 fgy 1, 0 fgy 1, 1 endfunc // void dav2d_fguv_32x32_420_8bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, // const uint8_t scaling[SCALING_SIZE], // const Dav2dFilmGrainData *const data, // const entry grain_lut[][GRAIN_WIDTH], // const pixel *const luma_row, // const ptrdiff_t luma_stride, // const int offsets[][2], // const ptrdiff_t h, const ptrdiff_t uv, // const ptrdiff_t is_id, // const ptrdiff_t type); .macro fguv layout, sx, sy function fguv_32x32_\layout\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] // data, grain_lut ldrd r6, r7, [sp, #108] // luma_row, luma_stride ldrd r8, r9, [sp, #116] // offsets, h ldrd r10, r11, [sp, #124] // uv, is_id // !csfl add r10, r4, r10, lsl #2 // + 4*uv add r12, r10, #FGD_UV_LUMA_MULT add lr, r10, #FGD_UV_MULT add r10, r10, #FGD_UV_OFFSET vld1.16 {d4[]}, [r12] // uv_luma_mult vld1.16 {d4[2]}, [r10] // uv_offset vld1.16 {d4[1]}, [lr] // uv_mult ldr lr, [r4, #FGD_SCALING_SHIFT] ldr r12, [r4, 
#FGD_CLIP_TO_RESTRICTED_RANGE] neg lr, lr // -scaling_shift cmp r12, #0 vdup.16 q13, lr // -scaling_shift beq 1f // clip cmp r11, #0 vmov.i8 q14, #16 vmov.i8 q15, #240 beq 2f // is_id vmov.i8 q15, #235 b 2f 1: // no clip vmov.i8 q14, #0 vmov.i8 q15, #255 2: mov r10, #GRAIN_WIDTH // grain_lut stride add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 .if \sy add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride .else add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride add r5, r5, r10 // grain_lut += grain_stride .endif ldr r12, [r8, #8] // offsets[1][0] calc_offset r12, r4, r12, \sx, \sy add_offset r4, r12, r4, r5, r10 ldr r12, [r8, #4] // offsets[0][1] calc_offset r12, lr, r12, \sx, \sy add_offset lr, r12, lr, r5, r10 ldr r12, [r8, #12] // offsets[1][1] calc_offset r12, r11, r12, \sx, \sy add_offset r11, r12, r11, r5, r10 ldr r8, [r8] // offsets[0][0] calc_offset r8, r12, r8, \sx, \sy add_offset r5, r8, r12, r5, r10 add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx movrel_local r12, overlap_coeffs_\sx ldr lr, [sp, #132] // type vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs movrel_local r12, L(fguv_loop_sx\sx\()_tbl) #if CONFIG_THUMB // This uses movrel_local instead of adr above, because the target // can be out of range for adr. But movrel_local leaves the thumb bit // set on COFF (but probably wouldn't if building for thumb on ELF), // thus try to clear the bit for robustness. 
bic r12, r12, #1 #endif tst lr, #1 ldr lr, [r12, lr, lsl #2] add r12, r12, lr beq 1f // y overlap sub lr, r9, #(2 >> \sy) // backup remaining h mov r9, #(2 >> \sy) 1: .if \sy vmov.i8 d6, #23 vmov.i8 d7, #22 .else vmov.i8 d6, #27 vmov.i8 d7, #17 .endif .if \sy add r7, r7, r7 // luma_stride *= 2 .endif bx r12 endfunc .endm fguv 420, 1, 1 fguv 422, 1, 0 fguv 444, 0, 0 function fguv_loop_sx0_neon L(fguv_loop_sx0_tbl): .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .macro fguv_loop_sx0 csfl, ox, oy L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): .if \oy mov r12, lr .endif 1: .if \ox vld1.8 {d8}, [r4], r10 // grain_lut old .endif .if \oy vld1.8 {q8, q9}, [r8], r10 // grain_lut top .endif .if \ox && \oy vld1.8 {d10}, [r11], r10 // grain_lut top old .endif vld1.8 {q0, q1}, [r6, :128], r7 // luma vld1.8 {q10, q11}, [r5], r10 // grain_lut .if \ox vmull.s8 q4, d8, d24 vmlal.s8 q4, d20, d25 .endif .if \oy .if \ox vmull.s8 q5, d10, d24 vmlal.s8 q5, d16, d25 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d16, q5, #5 .endif vmull.s8 q4, d20, d7 vmull.s8 q5, d21, d7 vmull.s8 q6, d22, d7 vmull.s8 q7, d23, d7 vmlal.s8 q4, d16, d6 vmlal.s8 q5, d17, d6 vmlal.s8 q6, d18, d6 vmlal.s8 q7, d19, d6 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d21, q5, #5 vqrshrn.s16 d22, q6, #5 vqrshrn.s16 d23, q7, #5 .elseif \ox vqrshrn.s16 d20, q4, #5 .endif .if !\csfl vld1.8 {q8, q9}, [r1, :128] // src vmovl.u8 q4, d0 vmovl.u8 q5, d1 vmovl.u8 q6, d2 vmovl.u8 q7, d3 vmovl.u8 q0, d16 vmovl.u8 q1, d17 vmovl.u8 q8, d18 vmovl.u8 q9, 
d19 vmul.i16 q4, q4, d4[0] vmul.i16 q5, q5, d4[0] vmul.i16 q6, q6, d4[0] vmul.i16 q7, q7, d4[0] vmul.i16 q0, q0, d4[1] vmul.i16 q1, q1, d4[1] vmul.i16 q8, q8, d4[1] vmul.i16 q9, q9, d4[1] vqadd.s16 q4, q4, q0 vqadd.s16 q5, q5, q1 vqadd.s16 q6, q6, q8 vqadd.s16 q7, q7, q9 vdup.16 q0, d4[2] vshr.s16 q4, q4, #6 vshr.s16 q5, q5, #6 vshr.s16 q6, q6, #6 vshr.s16 q7, q7, #6 vadd.i16 q4, q4, q0 vadd.i16 q5, q5, q0 vadd.i16 q6, q6, q0 vadd.i16 q7, q7, q0 vqmovun.s16 d0, q4 vqmovun.s16 d1, q5 vqmovun.s16 d2, q6 vqmovun.s16 d3, q7 .endif bl gather32_neon vld1.8 {q0, q1}, [r1, :128], r2 // src vmovl.s8 q8, d20 // grain vmovl.s8 q9, d21 vmovl.s8 q10, d22 vmovl.s8 q11, d23 vmovl.u8 q6, d8 // scaling vmovl.u8 q7, d9 vmovl.u8 q4, d10 vmovl.u8 q5, d11 vmul.i16 q8, q8, q6 // scaling * grain vmul.i16 q9, q9, q7 vmul.i16 q10, q10, q4 vmul.i16 q11, q11, q5 vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) vrshl.s16 q9, q9, q13 vrshl.s16 q10, q10, q13 vrshl.s16 q11, q11, q13 vaddw.u8 q8, q8, d0 // *src + noise vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 vaddw.u8 q11, q11, d3 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vqmovun.s16 d2, q10 vqmovun.s16 d3, q11 vmax.u8 q0, q0, q14 vmax.u8 q1, q1, q14 vmin.u8 q0, q0, q15 vmin.u8 q1, q1, q15 subs r9, r9, #1 .if \oy vdup.8 d6, d25[0] vdup.8 d7, d25[1] .endif vst1.8 {q0, q1}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r12, #0 mov r9, r12 // restore actual remaining h bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx0 0, 0, 0 fguv_loop_sx0 0, 0, 1 fguv_loop_sx0 0, 1, 0 fguv_loop_sx0 0, 1, 1 fguv_loop_sx0 1, 0, 0 fguv_loop_sx0 1, 0, 1 fguv_loop_sx0 1, 1, 0 fguv_loop_sx0 1, 1, 1 9: vpop {q4-q7} pop {r4-r11,pc} endfunc function fguv_loop_sx1_neon L(fguv_loop_sx1_tbl): .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_11) - 
L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): .if \oy mov r12, lr .endif 1: .if \ox vld1.8 {d8}, [r4], r10 // grain_lut old .endif .if \oy vld1.8 {q8}, [r8], r10 // grain_lut top .endif .if \ox && \oy vld1.8 {d10}, [r11], r10 // grain_lut top old .endif vld1.8 {q0, q1}, [r6, :128], r7 // luma vld1.8 {q10}, [r5], r10 // grain_lut vld1.8 {q11}, [r1, :128], r2 // src .if \ox vmull.s8 q4, d8, d24 vmlal.s8 q4, d20, d25 .endif vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 .if \oy .if \ox vmull.s8 q5, d10, d24 vmlal.s8 q5, d16, d25 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d16, q5, #5 .endif vmull.s8 q4, d20, d7 vmull.s8 q5, d21, d7 vmlal.s8 q4, d16, d6 vmlal.s8 q5, d17, d6 vqrshrn.s16 d20, q4, #5 vqrshrn.s16 d21, q5, #5 .elseif \ox vqrshrn.s16 d20, q4, #5 .endif .if \csfl vrshrn.u16 d0, q0, #1 vrshrn.u16 d1, q1, #1 .else vrshr.u16 q4, q0, #1 vrshr.u16 q5, q1, #1 vmovl.u8 q0, d22 vmovl.u8 q1, d23 vmul.i16 q4, q4, d4[0] vmul.i16 q5, q5, d4[0] vmul.i16 q0, q0, d4[1] vmul.i16 q1, q1, d4[1] vqadd.s16 q4, q4, q0 vqadd.s16 q5, q5, q1 vdup.16 q0, d4[2] vshr.s16 q4, q4, #6 vshr.s16 q5, q5, #6 vadd.i16 q4, q4, q0 vadd.i16 q5, q5, q0 vqmovun.s16 d0, q4 vqmovun.s16 d1, q5 .endif bl gather16_neon vmovl.s8 q8, d20 // grain vmovl.s8 q9, d21 vmovl.u8 q6, d8 // scaling vmovl.u8 q7, d9 vmul.i16 q8, q8, q6 // scaling * grain vmul.i16 q9, q9, q7 vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) vrshl.s16 q9, q9, q13 vaddw.u8 q8, q8, d22 // *src + noise vaddw.u8 q9, q9, d23 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vmax.u8 q0, q0, q14 vmin.u8 q0, q0, q15 subs r9, r9, #1 .if \oy vswp d6, d7 .endif vst1.8 {q0}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r12, #0 mov 
r9, r12 // restore actual remaining h bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx1 0, 0, 0 fguv_loop_sx1 0, 0, 1 fguv_loop_sx1 0, 1, 0 fguv_loop_sx1 0, 1, 1 fguv_loop_sx1 1, 0, 0 fguv_loop_sx1 1, 0, 1 fguv_loop_sx1 1, 1, 0 fguv_loop_sx1 1, 1, 1 9: vpop {q4-q7} pop {r4-r11,pc} endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/filmgrain16.S000066400000000000000000002233251517466257200241100ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 #define SUB_GRAIN_WIDTH 44 #define SUB_GRAIN_HEIGHT 38 .macro increment_seed steps, shift=1 lsr r11, r2, #3 lsr r12, r2, #12 lsr lr, r2, #1 eor r11, r2, r11 // (r >> 0) ^ (r >> 3) eor r12, r12, lr // (r >> 12) ^ (r >> 1) eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) .if \shift lsr r2, r2, #\steps .endif and r11, r11, #((1 << \steps) - 1) // bit .if \shift orr r2, r2, r11, lsl #(16 - \steps) // *state .else orr r2, r2, r11, lsl #16 // *state .endif .endm .macro read_rand dest, bits, age ubfx \dest, r2, #16 - \bits - \age, #\bits .endm .macro read_shift_rand dest, bits ubfx \dest, r2, #17 - \bits, #\bits lsr r2, r2, #1 .endm // special calling convention: // r2 holds seed // r3 holds dav2d_gaussian_sequence // clobbers r11-r12 // returns in d0-d1 function get_gaussian_neon push {r5-r6,lr} increment_seed 4 read_rand r5, 11, 3 read_rand r6, 11, 2 add r5, r3, r5, lsl #1 add r6, r3, r6, lsl #1 vld1.16 {d0[0]}, [r5] read_rand r5, 11, 1 vld1.16 {d0[1]}, [r6] add r5, r3, r5, lsl #1 read_rand r6, 11, 0 increment_seed 4 add r6, r3, r6, lsl #1 vld1.16 {d0[2]}, [r5] read_rand r5, 11, 3 vld1.16 {d0[3]}, [r6] add r5, r3, r5, lsl #1 read_rand r6, 11, 2 vld1.16 {d1[0]}, [r5] add r6, r3, r6, lsl #1 read_rand r5, 11, 1 vld1.16 {d1[1]}, [r6] read_rand r6, 11, 0 add r5, r3, r5, lsl #1 add r6, r3, r6, lsl #1 vld1.16 {d1[2]}, [r5] vld1.16 {d1[3]}, [r6] pop {r5-r6,pc} endfunc function get_grain_2_neon push {r11,lr} increment_seed 2 read_rand r11, 11, 1 read_rand r12, 11, 0 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d0[0]}, [r11] vld1.16 {d0[1]}, [r12] vrshl.s16 d0, d0, d30 pop {r11,pc} endfunc .macro get_grain_2 dst bl get_grain_2_neon .ifnc \dst, d0 vmov \dst, d0 .endif .endm function get_grain_4_neon push {r11,lr} increment_seed 4 read_rand r11, 11, 3 read_rand r12, 11, 2 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 
{d0[0]}, [r11] read_rand r11, 11, 1 vld1.16 {d0[1]}, [r12] read_rand r12, 11, 0 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d0[2]}, [r11] vld1.16 {d0[3]}, [r12] vrshl.s16 d0, d0, d30 pop {r11,pc} endfunc .macro get_grain_4 dst bl get_grain_4_neon .ifnc \dst, d0 vmov \dst, d0 .endif .endm // r1 holds the number of entries to produce // r6, r8 and r10 hold the previous output entries // q0 holds the vector of produced entries // q1 holds the input vector of sums from above .macro output_lag n function output_lag\n\()_neon push {r0, lr} .if \n == 1 mvn lr, r5 // grain_min = ~grain_max .else mov r0, #1 mov lr, #1 sub r7, r7, #1 sub r9, r9, #1 lsl r0, r0, r7 lsl lr, lr, r9 add r7, r7, #1 add r9, r9, #1 .endif 1: read_shift_rand r12, 11 vmov.32 r11, d2[0] lsl r12, r12, #1 vext.8 q0, q0, q0, #2 ldrsh r12, [r3, r12] .if \n == 1 mla r11, r6, r4, r11 // sum (above) + *coeff * prev output add r6, r11, r8 // 1 << (ar_coeff_shift - 1) add r12, r12, r10 asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) add r6, r6, r12 cmp r6, r5 .elseif \n == 2 mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 mla r11, r6, r10, r11 // += *coeff * prev output 2 mov r8, r6 add r6, r11, r0 // 1 << (ar_coeff_shift - 1) add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) add r6, r6, r12 push {lr} cmp r6, r5 mvn lr, r5 // grain_min = ~grain_max .else push {r1-r3} sbfx r1, r4, #0, #8 sbfx r2, r4, #8, #8 sbfx r3, r4, #16, #8 mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 mla r11, r6, r3, r11 // += *coeff * prev output 3 pop {r1-r3} mov r10, r8 mov r8, r6 add r6, r11, r0 // 1 << (ar_coeff_shift - 1) add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) asr r6, r6, r7 // >> ar_coeff_shift asr r12, r12, r9 // >> (4 - 
bitdepth_min_8 + grain_scale_shift) add r6, r6, r12 push {lr} cmp r6, r5 mvn lr, r5 // grain_min = ~grain_max .endif it gt movgt r6, r5 cmp r6, lr it lt movlt r6, lr .if \n >= 2 pop {lr} .endif subs r1, r1, #1 vext.8 q1, q1, q1, #4 vmov.16 d1[3], r6 bgt 1b pop {r0, pc} endfunc .endm output_lag 1 output_lag 2 output_lag 3 function sum_lag1_above_neon sub r12, r0, #1*GRAIN_WIDTH*2 - 16 vld1.16 {q10}, [r12] // load top right vext.8 q0, q8, q9, #14 // top left, top mid vext.8 q1, q9, q10, #2 // top left, top mid vmull.s16 q2, d18, d28 vmlal.s16 q2, d0, d27 vmlal.s16 q2, d2, d29 vmull.s16 q3, d19, d28 vmlal.s16 q3, d1, d27 vmlal.s16 q3, d3, d29 vmov q8, q9 vmov q9, q10 bx lr endfunc .macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff .ifc \lag\()_\edge, lag3_left bl sum_lag3_left_above_neon .else bl sum_\lag\()_above_neon .endif .ifc \type, uv_420 vpush {q6-q7} add r12, r11, #GRAIN_WIDTH*2 vld1.16 {q0, q1}, [r11]! vld1.16 {q6, q7}, [r12]! vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d12, d12, d13 vpadd.i16 d13, d14, d15 vadd.i16 q0, q0, q6 vpop {q6-q7} vrshr.s16 q0, q0, #2 .endif .ifc \type, uv_422 vld1.16 {q0, q1}, [r11]! vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vrshr.s16 q0, q0, #1 .endif .ifc \type, uv_444 vld1.16 {q0}, [r11]! 
.endif .if \uv_layout .ifnb \uv_coeff vdup.8 d13, \uv_coeff vmovl.s8 q6, d13 .endif vmlal.s16 q2, d0, d13 vmlal.s16 q3, d1, d13 .endif .if \uv_layout && \elems == 8 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 444 && \elems == 7 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 422 && \elems == 1 b sum_\lag\()_uv_420_\edge\()_start .else sum_\lag\()_\type\()_\edge\()_start: push {r11} .if \elems > 4 .ifc \edge, left increment_seed 4 read_rand r11, 11, 3 read_rand r12, 11, 2 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d1[1]}, [r11] read_rand r11, 11, 1 vld1.16 {d1[2]}, [r12] add r11, r3, r11, lsl #1 vld1.16 {d1[3]}, [r11] lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 vrshl.s16 d1, d1, d30 vext.8 q2, q2, q2, #12 .ifc \lag, lag3 vmov.s16 r10, d1[1] .endif .ifnc \lag, lag1 vmov.s16 r8, d1[2] .endif vmov.s16 r6, d1[3] vmov q1, q2 mov r1, #1 bl output_\lag\()_neon .else increment_seed 4, shift=0 vmov q1, q2 mov r1, #4 bl output_\lag\()_neon .endif increment_seed 4, shift=0 vmov q1, q3 .ifc \edge, right mov r1, #3 bl output_\lag\()_neon read_shift_rand r12, 11 add r12, r3, r12, lsl #1 vld1.16 {d2[0]}, [r12] vrshl.s16 d2, d2, d30 vext.8 q0, q0, q1, #2 .else mov r1, #4 bl output_\lag\()_neon .endif .else // elems == 1 increment_seed 4, shift=0 vmov q1, q2 mov r1, #1 bl output_\lag\()_neon lsr r2, r2, #3 read_rand r11, 11, 2 read_rand r12, 11, 1 add r11, r3, r11, lsl #1 add r12, r3, r12, lsl #1 vld1.16 {d2[0]}, [r11] read_rand r11, 11, 0 vld1.16 {d2[1]}, [r12] add r11, r3, r11, lsl #1 vld1.16 {d2[2]}, [r11] vrshl.s16 d2, d2, d30 vext.8 q0, q0, q1, #14 .endif vst1.16 {q0}, [r0]! 
pop {r11} pop {r1, pc} .endif .endm .macro sum_lag1_func type, uv_layout, edge, elems=8 function sum_\type\()_lag1_\edge\()_neon push {r1, lr} .ifc \edge, left sub r12, r0, #1*GRAIN_WIDTH*2 vld1.8 {q9}, [r12] // load the previous block right above .endif sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems endfunc .endm sum_lag1_func y, 0, left sum_lag1_func y, 0, mid sum_lag1_func y, 0, right, 7 sum_lag1_func uv_444, 444, left sum_lag1_func uv_444, 444, mid sum_lag1_func uv_444, 444, right, 7 sum_lag1_func uv_422, 422, left sum_lag1_func uv_422, 422, mid sum_lag1_func uv_422, 422, right, 1 sum_lag1_func uv_420, 420, left sum_lag1_func uv_420, 420, mid sum_lag1_func uv_420, 420, right, 1 function sum_lag2_above_neon push {lr} sub r12, r0, #2*GRAIN_WIDTH*2 - 16 sub lr, r0, #1*GRAIN_WIDTH*2 - 16 vld1.16 {q10}, [r12] // load top right vld1.16 {q13}, [lr] vdup.8 d10, d28[0] vext.8 q0, q8, q9, #12 // top left, top mid vdup.8 d12, d28[1] vext.8 q1, q8, q9, #14 vdup.8 d14, d28[3] vext.8 q4, q9, q10, #2 // top mid, top right vmovl.s8 q5, d10 vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmull.s16 q2, d0, d10 vmlal.s16 q2, d2, d12 vmlal.s16 q2, d8, d14 vmull.s16 q3, d1, d10 vmlal.s16 q3, d3, d12 vmlal.s16 q3, d9, d14 vdup.8 d10, d28[4] vext.8 q0, q9, q10, #4 // top mid, top right vdup.8 d12, d28[5] vext.8 q1, q11, q12, #12 // top left, top mid vdup.8 d14, d28[6] vext.8 q4, q11, q12, #14 vmovl.s8 q5, d10 vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmlal.s16 q2, d0, d10 vmlal.s16 q2, d2, d12 vmlal.s16 q2, d8, d14 vmlal.s16 q3, d1, d10 vmlal.s16 q3, d3, d12 vmlal.s16 q3, d9, d14 vdup.8 d10, d29[0] vext.8 q0, q12, q13, #2 // top mid, top right vdup.8 d12, d29[1] vext.8 q1, q12, q13, #4 vdup.8 d14, d28[2] vdup.8 d8, d28[7] vmovl.s8 q5, d10 vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmovl.s8 q4, d8 vmlal.s16 q2, d0, d10 vmlal.s16 q2, d2, d12 vmlal.s16 q2, d18, d14 vmlal.s16 q2, d24, d8 vmlal.s16 q3, d1, d10 vmlal.s16 q3, d3, d12 vmlal.s16 q3, d19, d14 vmlal.s16 q3, d25, d8 vmov q8, q9 vmov q9, q10 vmov q11, 
q12 vmov q12, q13 pop {pc} endfunc .macro sum_lag2_func type, uv_layout, edge, elems=8 function sum_\type\()_lag2_\edge\()_neon push {r1, lr} .ifc \edge, left sub r12, r0, #2*GRAIN_WIDTH*2 sub lr, r0, #1*GRAIN_WIDTH*2 vld1.16 {q9}, [r12] // load the previous block right above vld1.16 {q12}, [lr] .endif sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] endfunc .endm sum_lag2_func y, 0, left sum_lag2_func y, 0, mid sum_lag2_func y, 0, right, 7 sum_lag2_func uv_444, 444, left sum_lag2_func uv_444, 444, mid sum_lag2_func uv_444, 444, right, 7 sum_lag2_func uv_422, 422, left sum_lag2_func uv_422, 422, mid sum_lag2_func uv_422, 422, right, 1 sum_lag2_func uv_420, 420, left sum_lag2_func uv_420, 420, mid sum_lag2_func uv_420, 420, right, 1 function sum_lag3_left_above_neon // A separate codepath for the left edge, to avoid reading outside // of the edge of the buffer. sub r12, r0, #3*GRAIN_WIDTH*2 vld1.8 {q11, q12}, [r12] vext.8 q12, q11, q12, #10 vext.8 q11, q11, q11, #10 b sum_lag3_above_start endfunc function sum_lag3_above_neon movw r12, #(3*GRAIN_WIDTH + 3)*2 sub r12, r0, r12 vld1.8 {q11, q12}, [r12] sum_lag3_above_start: vdup.8 d12, d26[0] vext.8 q1, q11, q12, #2 vdup.8 d14, d26[1] vext.8 q4, q11, q12, #4 vdup.8 d16, d26[2] vext.8 q5, q11, q12, #6 vdup.8 d18, d26[3] vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmovl.s8 q8, d16 vmovl.s8 q9, d18 movw r12, #(2*GRAIN_WIDTH + 3)*2 sub r12, r0, r12 vmull.s16 q2, d22, d12 vmlal.s16 q2, d2, d14 vmlal.s16 q2, d8, d16 vmlal.s16 q2, d10, d18 vmull.s16 q3, d23, d12 vmlal.s16 q3, d3, d14 vmlal.s16 q3, d9, d16 vmlal.s16 q3, d11, d18 vdup.8 d12, d26[4] vext.8 q0, q11, q12, #8 vdup.8 d14, d26[5] vext.8 q1, q11, q12, #10 vdup.8 d16, d26[6] vext.8 q4, q11, q12, #12 vld1.8 {q11, q12}, [r12] vdup.8 d18, d26[7] vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmovl.s8 q8, d16 vmovl.s8 q9, d18 vmlal.s16 q2, d0, d12 vmlal.s16 q2, d2, d14 vmlal.s16 q2, d8, d16 vmlal.s16 q2, d22, d18 vmlal.s16 q3, d1, d12 vmlal.s16 q3, d3, d14 vmlal.s16 q3, d9, 
d16 vmlal.s16 q3, d23, d18 vdup.8 d12, d27[0] vext.8 q0, q11, q12, #2 vdup.8 d14, d27[1] vext.8 q1, q11, q12, #4 vdup.8 d16, d27[2] vext.8 q4, q11, q12, #6 vdup.8 d18, d27[3] vext.8 q5, q11, q12, #8 vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmovl.s8 q8, d16 vmovl.s8 q9, d18 sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 vmlal.s16 q2, d0, d12 vmlal.s16 q2, d2, d14 vmlal.s16 q2, d8, d16 vmlal.s16 q2, d10, d18 vmlal.s16 q3, d1, d12 vmlal.s16 q3, d3, d14 vmlal.s16 q3, d9, d16 vmlal.s16 q3, d11, d18 vdup.8 d12, d27[4] vext.8 q0, q11, q12, #10 vdup.8 d14, d27[5] vext.8 q1, q11, q12, #12 vld1.8 {q11, q12}, [r12] vdup.8 d16, d27[6] vdup.8 d18, d27[7] vmovl.s8 q6, d12 vmovl.s8 q7, d14 vext.8 q5, q11, q12, #2 vmovl.s8 q8, d16 vmovl.s8 q9, d18 vmlal.s16 q2, d0, d12 vmlal.s16 q2, d2, d14 vmlal.s16 q2, d22, d16 vmlal.s16 q2, d10, d18 vmlal.s16 q3, d1, d12 vmlal.s16 q3, d3, d14 vmlal.s16 q3, d23, d16 vmlal.s16 q3, d11, d18 vdup.8 d12, d28[0] vext.8 q0, q11, q12, #4 vdup.8 d14, d28[1] vext.8 q1, q11, q12, #6 vdup.8 d16, d28[2] vext.8 q4, q11, q12, #8 vdup.8 d18, d28[3] vext.8 q5, q11, q12, #10 vmovl.s8 q6, d12 vmovl.s8 q7, d14 vmovl.s8 q8, d16 vmovl.s8 q9, d18 vmlal.s16 q2, d0, d12 vmlal.s16 q2, d2, d14 vmlal.s16 q2, d8, d16 vmlal.s16 q2, d10, d18 vmlal.s16 q3, d1, d12 vmlal.s16 q3, d3, d14 vmlal.s16 q3, d9, d16 vmlal.s16 q3, d11, d18 vdup.8 d12, d28[4] vext.8 q0, q11, q12, #12 vmovl.s8 q6, d12 vmlal.s16 q2, d0, d12 vmlal.s16 q3, d1, d12 bx lr endfunc .macro sum_lag3_func type, uv_layout, edge, elems=8 function sum_\type\()_lag3_\edge\()_neon push {r1, lr} sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] endfunc .endm sum_lag3_func y, 0, left sum_lag3_func y, 0, mid sum_lag3_func y, 0, right, 7 sum_lag3_func uv_444, 444, left sum_lag3_func uv_444, 444, mid sum_lag3_func uv_444, 444, right, 7 sum_lag3_func uv_422, 422, left sum_lag3_func uv_422, 422, mid sum_lag3_func uv_422, 422, right, 1 sum_lag3_func uv_420, 420, left sum_lag3_func uv_420, 420, mid sum_lag3_func uv_420, 420, 
right, 1 function generate_grain_rows_neon push {r10-r11,lr} 1: mov r10, #80 2: bl get_gaussian_neon vrshl.s16 q0, q0, q15 subs r10, r10, #8 vst1.16 {q0}, [r0]! bgt 2b get_grain_2 d0 subs r1, r1, #1 vst1.32 {d0[0]}, [r0]! bgt 1b pop {r10-r11,pc} endfunc function generate_grain_rows_44_neon push {r10-r11,lr} 1: mov r10, #40 2: bl get_gaussian_neon vrshl.s16 q0, q0, q15 subs r10, r10, #8 vst1.16 {q0}, [r0]! bgt 2b get_grain_4 d0 subs r1, r1, #1 vst1.16 {d0}, [r0] add r0, r0, #GRAIN_WIDTH*2-80 bgt 1b pop {r10-r11,pc} endfunc function gen_grain_uv_444_lag0_neon vld1.16 {q3}, [r11]! gen_grain_uv_lag0_8_start: push {r11,lr} bl get_gaussian_neon vrshl.s16 q0, q0, q15 gen_grain_uv_lag0_8_add: vand q3, q3, q1 vmull.s16 q2, d6, d22 vmull.s16 q3, d7, d22 vrshl.s32 q2, q2, q12 vrshl.s32 q3, q3, q12 vqmovn.s32 d4, q2 vqmovn.s32 d5, q3 vqadd.s16 q2, q2, q0 vmin.s16 q2, q2, q9 vmax.s16 q2, q2, q10 vst1.16 {q2}, [r0]! pop {r11,pc} endfunc function gen_grain_uv_420_lag0_8_neon add r12, r11, #GRAIN_WIDTH*2 vld1.16 {q2,q3}, [r11]! vld1.16 {q4,q5}, [r12] vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vpadd.i16 d8, d8, d9 vpadd.i16 d9, d10, d11 vadd.i16 q2, q2, q4 vrshr.s16 q3, q2, #2 b gen_grain_uv_lag0_8_start endfunc function gen_grain_uv_422_lag0_8_neon vld1.16 {q2,q3}, [r11]! 
vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vrshr.s16 q3, q2, #1 b gen_grain_uv_lag0_8_start endfunc function gen_grain_uv_420_lag0_4_neon add r12, r11, #GRAIN_WIDTH*2 vld1.16 {q2}, [r11] vld1.16 {q0}, [r12] add r11, r11, #32 vpadd.i16 d4, d4, d5 vpadd.i16 d0, d0, d1 vadd.i16 d4, d4, d0 vrshr.s16 d6, d4, #2 push {r11,lr} get_grain_4 d0 b gen_grain_uv_lag0_8_add endfunc function gen_grain_uv_422_lag0_4_neon vld1.16 {q2}, [r11] add r11, r11, #32 vpadd.i16 d4, d4, d5 vrshr.s16 d6, d4, #1 push {r11,lr} get_grain_4 d0 b gen_grain_uv_lag0_8_add endfunc .macro gen_grain_82 type function generate_grain_\type\()_16bpc_neon, export=1 push {r4-r11,lr} .ifc \type, uv_444 ldr r4, [sp, #36] mov r12, r3 mov lr, #28 add r11, r1, #3*GRAIN_WIDTH*2 mov r1, r2 mul r12, r12, lr clz lr, r4 .else clz lr, r2 .endif movrel r3, X(gaussian_sequence) sub lr, lr, #24 // -bitdepth_min_8 ldr r2, [r1, #FGD_SEED] ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] .ifc \type, y add r4, r1, #FGD_AR_COEFFS_Y .else add r4, r1, #FGD_AR_COEFFS_UV .endif add r9, r9, lr // grain_scale_shift - bitdepth_min_8 adr r5, L(gen_grain_\type\()_tbl) ldr r6, [r1, #FGD_AR_COEFF_LAG] add r9, r9, #4 ldr r6, [r5, r6, lsl #2] vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift add r5, r5, r6 vneg.s16 q15, q15 .ifc \type, uv_444 push {lr} cmp r12, #0 movw r10, #0x49d8 movw lr, #0xb524 // Intentionally using a separate register instead of moveq with an // immediate constant, to avoid armv8 deprecated it instruction forms. 
it eq moveq r10, lr add r4, r4, r12 // Add offset to ar_coeffs_uv[1] eor r2, r2, r10 pop {lr} .endif ldr r7, [r1, #FGD_AR_COEFF_SHIFT] neg lr, lr // bitdepth_min_8 mov r8, #1 mov r10, #1 lsl r8, r8, r7 // 1 << ar_coeff_shift lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) bx r5 .align 2 L(gen_grain_\type\()_tbl): .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB L(generate_grain_\type\()_lag0): .ifc \type, y mov r1, #GRAIN_HEIGHT bl generate_grain_rows_neon .else mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 mvn r6, r5 // grain_min = ~grain_max mov r1, #3 bl generate_grain_rows_neon mov r1, #GRAIN_HEIGHT-3 vdup.32 q12, r7 vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] vmov.i8 q0, #0 vmov.i8 q1, #255 vdup.16 q9, r5 vdup.16 q10, r6 vext.8 q13, q0, q1, #10 vext.8 q14, q1, q0, #2 vneg.s32 q12, q12 vmovl.s8 q11, d22 1: vmov q1, q13 bl gen_grain_uv_444_lag0_neon // 8 vmov.i8 q1, #255 bl gen_grain_uv_444_lag0_neon // 16 bl gen_grain_uv_444_lag0_neon // 24 bl gen_grain_uv_444_lag0_neon // 32 bl gen_grain_uv_444_lag0_neon // 40 bl gen_grain_uv_444_lag0_neon // 48 bl gen_grain_uv_444_lag0_neon // 56 bl gen_grain_uv_444_lag0_neon // 64 bl gen_grain_uv_444_lag0_neon // 72 vmov q1, q14 bl gen_grain_uv_444_lag0_neon // 80 get_grain_2 d16 subs r1, r1, #1 add r11, r11, #4 vst1.32 {d16[0]}, [r0]! bgt 1b .endif pop {r4-r11,pc} L(generate_grain_\type\()_lag1): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] .ifc \type, y ldrsb r4, [r4, #1] // ar_coeffs_y[3] .else add r4, r4, #2 .endif mov r1, #3 .ifc \type, uv_444 vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] .endif bl generate_grain_rows_neon vmovl.s8 q13, d27 vmovl.s8 q12, d29 vmovl.s8 q14, d28 vmov d29, d24 .ifc \type, uv_444 vmovl.s8 q6, d13 .endif mov r1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag1_left_neon // 8 bl sum_\type\()_lag1_mid_neon // 16 bl sum_\type\()_lag1_mid_neon // 24 bl sum_\type\()_lag1_mid_neon // 32 bl sum_\type\()_lag1_mid_neon // 40 bl sum_\type\()_lag1_mid_neon // 48 bl sum_\type\()_lag1_mid_neon // 56 bl sum_\type\()_lag1_mid_neon // 64 bl sum_\type\()_lag1_mid_neon // 72 bl sum_\type\()_lag1_right_neon // 80 get_grain_2 d16 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #4 .endif vst1.32 {d16[0]}, [r0]! bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag2): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] vmov.s8 r4, d29[2] vmov.s8 r10, d29[3] mov r1, #3 bl generate_grain_rows_neon mov r1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag2_left_neon // 8 bl sum_\type\()_lag2_mid_neon // 16 bl sum_\type\()_lag2_mid_neon // 24 bl sum_\type\()_lag2_mid_neon // 32 bl sum_\type\()_lag2_mid_neon // 40 bl sum_\type\()_lag2_mid_neon // 48 bl sum_\type\()_lag2_mid_neon // 56 bl sum_\type\()_lag2_mid_neon // 64 bl sum_\type\()_lag2_mid_neon // 72 bl sum_\type\()_lag2_right_neon // 80 get_grain_2 d16 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #4 .endif vst1.32 {d16[0]}, [r0]! 
bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag3): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] vmov.u8 r4, d28[5] vmov.u8 r10, d28[6] vmov.u8 r12, d28[7] orr r4, r4, r10, lsl #8 orr r4, r4, r12, lsl #16 mov r1, #3 vpush {d26} bl generate_grain_rows_neon vpop {d26} mov r1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag3_left_neon // 8 bl sum_\type\()_lag3_mid_neon // 16 bl sum_\type\()_lag3_mid_neon // 24 bl sum_\type\()_lag3_mid_neon // 32 bl sum_\type\()_lag3_mid_neon // 40 bl sum_\type\()_lag3_mid_neon // 48 bl sum_\type\()_lag3_mid_neon // 56 bl sum_\type\()_lag3_mid_neon // 64 bl sum_\type\()_lag3_mid_neon // 72 bl sum_\type\()_lag3_right_neon // 80 get_grain_2 d16 subs r1, r1, #1 .ifc \type, uv_444 add r11, r11, #4 .endif vst1.32 {d16[0]}, [r0]! bgt 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm gen_grain_82 y gen_grain_82 uv_444 .macro set_height dst, type .ifc \type, uv_420 mov \dst, #SUB_GRAIN_HEIGHT-3 .else mov \dst, #GRAIN_HEIGHT-3 .endif .endm .macro increment_y_ptr reg, type .ifc \type, uv_420 add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) .else sub \reg, \reg, #6*32-GRAIN_WIDTH*2 .endif .endm .macro gen_grain_44 type function generate_grain_\type\()_16bpc_neon, export=1 push {r4-r11,lr} ldr r4, [sp, #36] mov r12, r3 movw r11, #(3*GRAIN_WIDTH-3)*2 mov lr, #28 add r11, r1, r11 mov r1, r2 mul r12, r12, lr clz lr, r4 movrel r3, X(gaussian_sequence) sub lr, lr, #24 // -bitdepth_min_8 ldr r2, [r1, #FGD_SEED] ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] add r4, r1, #FGD_AR_COEFFS_UV add r9, r9, lr // grain_scale_shift - bitdepth_min_8 adr r5, L(gen_grain_\type\()_tbl) ldr r6, [r1, #FGD_AR_COEFF_LAG] add r9, r9, #4 ldr r6, [r5, r6, lsl #2] vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift add r5, r5, r6 vneg.s16 q15, q15 push {lr} cmp r12, #0 movw r10, #0x49d8 movw lr, #0xb524 // Intentionally using a separate register 
instead of moveq with an // immediate constant, to avoid armv8 deprecated it instruction forms. it eq moveq r10, lr add r4, r4, r12 // Add offset to ar_coeffs_uv[1] eor r2, r2, r10 pop {lr} ldr r7, [r1, #FGD_AR_COEFF_SHIFT] neg lr, lr mov r8, #1 mov r10, #1 lsl r8, r8, r7 // 1 << ar_coeff_shift lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) bx r5 .align 2 L(gen_grain_\type\()_tbl): .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB L(generate_grain_\type\()_lag0): .ifc \type, uv_420 vpush {q4-q5} .endif mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 mvn r6, r5 // grain_min = ~grain_max mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type vdup.32 q12, r7 vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] vmov.i8 q0, #0 vmov.i8 q1, #255 vdup.16 q9, r5 vdup.16 q10, r6 vext.8 q13, q0, q1, #10 vext.8 q14, q1, q0, #14 vneg.s32 q12, q12 vmovl.s8 q11, d22 1: vmov q1, q13 bl gen_grain_\type\()_lag0_8_neon // 8 vmov.i8 q1, #255 bl gen_grain_\type\()_lag0_8_neon // 16 bl gen_grain_\type\()_lag0_8_neon // 24 bl gen_grain_\type\()_lag0_8_neon // 32 bl gen_grain_\type\()_lag0_8_neon // 40 vmov q1, q14 bl gen_grain_\type\()_lag0_4_neon // 44 subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH*2-6*16 bgt 1b .ifc \type, uv_420 vpop {q4-q5} .endif pop {r4-r11,pc} L(generate_grain_\type\()_lag1): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] add r4, r4, #2 mov r1, #3 vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] bl generate_grain_rows_44_neon vmovl.s8 q13, d27 vmovl.s8 q12, d29 vmovl.s8 q14, d28 vmov d29, d24 vmovl.s8 q6, d13 set_height r1, \type 1: bl sum_\type\()_lag1_left_neon // 8 bl sum_\type\()_lag1_mid_neon // 16 bl sum_\type\()_lag1_mid_neon // 24 bl sum_\type\()_lag1_mid_neon // 32 bl sum_\type\()_lag1_mid_neon // 40 bl sum_\type\()_lag1_right_neon // 44 subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH*2-6*16 bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag2): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] vmov.s8 r4, d29[2] vmov.s8 r10, d29[3] mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type 1: bl sum_\type\()_lag2_left_neon // 8 bl sum_\type\()_lag2_mid_neon // 16 bl sum_\type\()_lag2_mid_neon // 24 bl sum_\type\()_lag2_mid_neon // 32 bl sum_\type\()_lag2_mid_neon // 40 bl sum_\type\()_lag2_right_neon // 44 subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH*2-6*16 bgt 1b vpop {q4-q7} pop {r4-r11,pc} L(generate_grain_\type\()_lag3): vpush {q4-q7} mov r5, #128 lsl r5, r5, lr // 128 << bitdepth_min_8 sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] vmov.u8 r4, d28[5] vmov.u8 r10, d28[6] vmov.u8 r12, d28[7] orr r4, r4, r10, lsl #8 orr r4, r4, r12, lsl #16 mov r1, #3 bl generate_grain_rows_44_neon set_height r1, \type 1: bl sum_\type\()_lag3_left_neon // 8 bl sum_\type\()_lag3_mid_neon // 16 bl sum_\type\()_lag3_mid_neon // 24 bl sum_\type\()_lag3_mid_neon // 32 bl sum_\type\()_lag3_mid_neon // 40 bl sum_\type\()_lag3_right_neon // 44 subs r1, r1, #1 increment_y_ptr r11, \type add r0, r0, #GRAIN_WIDTH*2-6*16 bgt 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm gen_grain_44 
uv_420 gen_grain_44 uv_422 .macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off vmov.u16 r11, \src1[0+\off] vmov.u16 r12, \src3[0+\off] add r11, r11, r3 vmov.u16 lr, \src1[2+\off] add r12, r12, r3 vld1.8 {\dst1[0+\off]}, [r11] vmov.u16 r11, \src3[2+\off] add lr, lr, r3 vld1.8 {\dst2[0+\off]}, [r12] vmov.u16 r12, \src2[0+\off] add r11, r11, r3 vld1.8 {\dst1[2+\off]}, [lr] vmov.u16 lr, \src4[0+\off] add r12, r12, r3 vld1.8 {\dst2[2+\off]}, [r11] vmov.u16 r11, \src2[2+\off] add lr, lr, r3 vld1.8 {\dst1[4+\off]}, [r12] vmov.u16 r12, \src4[2+\off] add r11, r11, r3 vld1.8 {\dst2[4+\off]}, [lr] add r12, r12, r3 vld1.8 {\dst1[6+\off]}, [r11] vld1.8 {\dst2[6+\off]}, [r12] .endm .macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 .endm function gather32_neon push {r11-r12,lr} gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 pop {r11-r12,pc} endfunc function gather16_neon push {r11-r12,lr} gather_interleaved d8, d9, d0, d1, d2, d3, 0 gather_interleaved d8, d9, d0, d1, d2, d3, 1 pop {r11-r12,pc} endfunc const overlap_coeffs_0, align=4 .short 27, 17, 0, 0 .short 17, 27, 32, 32 endconst const overlap_coeffs_1, align=4 .short 23, 0, 0, 0 .short 22, 32, 32, 32 endconst .macro calc_offset offx, offy, src, sx, sy and \offy, \src, #0xF // randval & 0xF lsr \offx, \src, #4 // randval >> 4 .if \sy == 0 add \offy, \offy, \offy // 2 * (randval & 0xF) .endif .if \sx == 0 add \offx, \offx, \offx // 2 * (randval >> 4) .endif .endm .macro add_offset dst, offx, offy, src, stride mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy add \dst, \dst, \offx, lsl #1 // grain_lut += offx .endm // void dav2d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, // const 
ptrdiff_t stride, // const uint8_t scaling[SCALING_SIZE], // const int scaling_shift, // const entry grain_lut[][GRAIN_WIDTH], // const int offsets[][2], // const int h, const ptrdiff_t clip, // const ptrdiff_t type, // const int bitdepth_max); function fgy_32x32_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut ldrd r6, r7, [sp, #108] // offsets, h ldr r8, [sp, #116] // clip mov r9, #GRAIN_WIDTH*2 // grain_lut stride ldr r10, [sp, #124] // bitdepth_max eor r4, r4, #15 // 15 - scaling_shift vdup.16 q6, r10 // bitdepth_max clz r10, r10 vdup.16 q13, r4 // 15 - scaling_shift rsb r10, r10, #24 // bitdepth_min_8 cmp r8, #0 vdup.16 q12, r10 // bitdepth_min_8 movrel_local r12, overlap_coeffs_0 beq 1f // clip vmov.i16 q14, #16 vmov.i16 q15, #235 vshl.s16 q14, q14, q12 vshl.s16 q15, q15, q12 b 2f 1: // no clip vmov.i16 q14, #0 vmov q15, q6 2: vshr.u16 q6, q6, #1 // grain_max vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs add r5, r5, #18 // grain_lut += 9 add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride add r5, r5, r9 // grain_lut += grain_stride ldr r10, [r6, #8] // offsets[1][0] calc_offset r10, r4, r10, 0, 0 add_offset r4, r10, r4, r5, r9 ldr r10, [r6, #4] // offsets[0][1] calc_offset r10, r11, r10, 0, 0 add_offset r11, r10, r11, r5, r9 ldr r10, [r6, #12] // offsets[1][1] calc_offset r10, r8, r10, 0, 0 add_offset r8, r10, r8, r5, r9 ldr r6, [r6] // offsets[0][0] calc_offset r6, lr, r6, 0, 0 add_offset r5, r6, lr, r5, r9 add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by ldr r10, [sp, #120] // type adr r11, L(fgy_loop_tbl) tst r10, #1 ldr r10, [r11, r10, lsl #2] add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx add r11, r11, r10 beq 1f // y overlap vdup.16 d14, d24[0] vdup.16 d15, d24[1] mov r10, r7 // backup actual h mov r7, #2 1: sub r2, r2, #32 // src_stride 
-= 32 sub r9, r9, #32 // grain_stride -= 32 bx r11 endfunc function fgy_loop_neon L(fgy_loop_tbl): .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB .macro fgy ox, oy L(loop_\ox\oy): 1: .if \ox vld1.16 {d0}, [r4], r9 // grain_lut old .endif .if \oy vld1.16 {q2, q3}, [r6]! // grain_lut top .endif .if \ox && \oy vld1.16 {d2}, [r8], r9 // grain_lut top old .endif .if \oy vld1.16 {q4, q5}, [r6], r9 // grain_lut top .endif .if !\ox && !\oy vld1.16 {q0, q1}, [r1, :128]! // src .endif vld1.16 {q8, q9}, [r5]! // grain_lut .if !\ox && !\oy vld1.16 {q2, q3}, [r1, :128], r2 // src .endif .if !\oy vmvn.i16 q5, #0xf000 // 0x0fff .endif vld1.16 {q10, q11}, [r5], r9 // grain_lut .if \ox add r4, r4, #32 vmull.s16 q0, d0, d24 vmlal.s16 q0, d16, d25 .endif .if \oy .if \ox add r8, r8, #32 vmull.s16 q1, d2, d24 vmlal.s16 q1, d4, d25 vqrshrn.s32 d16, q0, #5 vmvn d0, d12 // grain_min vqrshrn.s32 d4, q1, #5 vmin.s16 d16, d16, d12 vmin.s16 d4, d4, d12 vmax.s16 d16, d16, d0 vmax.s16 d4, d4, d0 .endif vmull.s16 q0, d4, d14 vmull.s16 q1, d5, d14 vmull.s16 q2, d6, d14 vmull.s16 q3, d7, d14 vmlal.s16 q0, d16, d15 vmlal.s16 q1, d17, d15 vmlal.s16 q2, d18, d15 vmlal.s16 q3, d19, d15 vmull.s16 q8, d20, d15 vmull.s16 q9, d21, d15 vmull.s16 q10, d22, d15 vmull.s16 q11, d23, d15 vmlal.s16 q8, d8, d14 vmlal.s16 q9, d9, d14 vmlal.s16 q10, d10, d14 vmlal.s16 q11, d11, d14 vmvn q4, q6 // grain_min vqrshrn.s32 d0, q0, #5 vqrshrn.s32 d1, q1, #5 vqrshrn.s32 d2, q2, #5 vqrshrn.s32 d3, q3, #5 vqrshrn.s32 d4, q8, #5 vqrshrn.s32 d5, q9, #5 vqrshrn.s32 d6, q10, #5 vqrshrn.s32 d7, q11, #5 vmin.s16 q8, q0, q6 vmin.s16 q9, q1, q6 vld1.16 {q0, q1}, [r1, :128]! 
// src vmin.s16 q10, q2, q6 vmin.s16 q11, q3, q6 vmax.s16 q8, q8, q4 vmax.s16 q9, q9, q4 vld1.16 {q2, q3}, [r1, :128], r2 // src vmvn.i16 q5, #0xf000 // 0x0fff vmax.s16 q10, q10, q4 vmax.s16 q11, q11, q4 .elseif \ox vmvn d4, d12 // grain_min vqrshrn.s32 d16, q0, #5 vld1.16 {q0, q1}, [r1, :128]! // src vmin.s16 d16, d16, d12 vmax.s16 d16, d16, d4 vld1.16 {q2, q3}, [r1, :128], r2 // src .endif // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. vand q0, q0, q5 vand q1, q1, q5 vand q2, q2, q5 vand q3, q3, q5 bl gather32_neon .if \ox || \oy vpush {q6-q7} .endif vmovl.u8 q6, d8 // scaling vmovl.u8 q7, d9 vmovl.u8 q4, d10 vmovl.u8 q5, d11 vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) vshl.u16 q7, q7, q13 vshl.u16 q4, q4, q13 vshl.u16 q5, q5, q13 vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) vqrdmulh.s16 q9, q9, q7 vqrdmulh.s16 q10, q10, q4 vqrdmulh.s16 q11, q11, q5 .if \ox || \oy vpop {q6-q7} .endif vqadd.s16 q0, q0, q8 // *src + noise vqadd.s16 q1, q1, q9 vqadd.s16 q2, q2, q10 vqadd.s16 q3, q3, q11 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vst1.16 {q0, q1}, [r0, :128]! 
// dst subs r7, r7, #1 .if \oy vdup.16 d14, d25[0] vdup.16 d15, d25[1] .endif vst1.16 {q2, q3}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r10, #2 sub r7, r10, #2 // restore actual remaining h bgt L(loop_\ox\()0) .endif vpop {q4-q7} pop {r4-r11,pc} .endm fgy 0, 0 fgy 0, 1 fgy 1, 0 fgy 1, 1 endfunc // void dav2d_fguv_32x32_420_16bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, // const uint8_t scaling[SCALING_SIZE], // const Dav2dFilmGrainData *const data, // const entry grain_lut[][GRAIN_WIDTH], // const pixel *const luma_row, // const ptrdiff_t luma_stride, // const int offsets[][2], // const ptrdiff_t h, const ptrdiff_t uv, // const ptrdiff_t is_id, // const ptrdiff_t type, // const int bitdepth_max); .macro fguv layout, sx, sy function fguv_32x32_\layout\()_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] // data, grain_lut ldrd r10, r11, [sp, #124] // uv, is_id ldr r6, [sp, #136] // bitdepth_max clz r7, r6 rsb r7, r7, #24 // bitdepth_min_8 // !csfl add r10, r4, r10, lsl #2 // + 4*uv add r12, r10, #FGD_UV_LUMA_MULT add lr, r10, #FGD_UV_MULT ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset vld1.16 {d30[]}, [r12] // uv_luma_mult lsl r10, r10, r7 // uv_offset << bitdepth_min_8 vld1.16 {d30[1]}, [lr] // uv_mult ldr lr, [r4, #FGD_SCALING_SHIFT] ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] eor lr, lr, #15 // 15 - scaling_shift vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 cmp r12, #0 vdup.16 q13, lr // 15 - scaling_shift beq 1f // clip cmp r11, #0 mov r8, #16 mov r9, #240 lsl r8, r8, r7 lsl r9, r9, r7 beq 2f // is_id mov r9, #235 lsl r9, r9, r7 b 2f 1: // no clip mov r8, #0 mov r9, r6 // bitdepth_max 2: vmov.16 d30[3], r6 // bitdepth_max vdup.16 d31, r8 // clip_min mov r10, #GRAIN_WIDTH*2 // grain_lut stride .if \sy mov r6, #23 mov r7, #22 .else mov r6, #27 mov r7, #17 .endif vmov.16 d31[1], r9 // clip_max ldrd r8, r9, [sp, #116] // offsets, h add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 .if \sy 
add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride .else add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride add r5, r5, r10 // grain_lut += grain_stride .endif vmov.16 d31[2], r6 // overlap y [0] ldr r12, [r8, #8] // offsets[1][0] calc_offset r12, r4, r12, \sx, \sy add_offset r4, r12, r4, r5, r10 ldr r12, [r8, #4] // offsets[0][1] calc_offset r12, lr, r12, \sx, \sy add_offset lr, r12, lr, r5, r10 ldr r12, [r8, #12] // offsets[1][1] calc_offset r12, r11, r12, \sx, \sy add_offset r11, r12, r11, r5, r10 ldr r8, [r8] // offsets[0][0] calc_offset r8, r12, r8, \sx, \sy add_offset r5, r8, r12, r5, r10 vmov.16 d31[3], r7 // overlap y [1] add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx movrel_local r12, overlap_coeffs_\sx ldr lr, [sp, #132] // type ldrd r6, r7, [sp, #108] // luma_row, luma_stride vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs movrel_local r12, L(fguv_loop_sx\sx\()_tbl) #if CONFIG_THUMB // This uses movrel_local instead of adr above, because the target // can be out of range for adr. But movrel_local leaves the thumb bit // set on COFF (but probably wouldn't if building for thumb on ELF), // thus try to clear the bit for robustness. 
bic r12, r12, #1 #endif tst lr, #1 ldr lr, [r12, lr, lsl #2] add r12, r12, lr beq 1f // y overlap sub lr, r9, #(2 >> \sy) // backup remaining h mov r9, #(2 >> \sy) 1: .if \sy add r7, r7, r7 // luma_stride *= 2 .endif sub r7, r7, #32 // luma_stride -= 32 bx r12 endfunc .endm fguv 420, 1, 1 fguv 422, 1, 0 fguv 444, 0, 0 function fguv_loop_sx0_neon L(fguv_loop_sx0_tbl): .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB .macro fguv_loop_sx0 csfl, ox, oy L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): sub r2, r2, #32 // src_stride -= 32 sub r10, r10, #32 // grain_stride -= 32 .if \oy mov r12, lr .endif L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): 1: .if \ox vld1.16 {d0}, [r4], r10 // grain_lut old .endif .if \oy vld1.16 {q2, q3}, [r8]! // grain_lut top .endif .if \ox && \oy vld1.16 {d2}, [r11], r10 // grain_lut top old .endif .if !\ox && !\oy vld1.16 {q0, q1}, [r6, :128]! // luma .endif vld1.16 {q8, q9}, [r5]! 
// grain_lut .if \oy vld1.16 {q4, q5}, [r8], r10 // grain_lut top .endif .if !\ox && !\oy vld1.16 {q2, q3}, [r6, :128], r7 // luma .endif .if \oy vdup.16 d28, d31[2] // overlap y coeff vdup.16 d29, d31[3] // overlap y coeff .endif vld1.16 {q10, q11}, [r5], r10 // grain_lut .if \ox vdup.16 q7, d30[3] // bitdepth_max add r4, r4, #32 vmull.s16 q0, d0, d24 vshr.u16 q7, q7, #1 // grain_max vmlal.s16 q0, d16, d25 vmvn q6, q7 // grain_min .endif .if \oy .if \ox add r11, r11, #32 vmull.s16 q1, d2, d24 vmlal.s16 q1, d4, d25 vqrshrn.s32 d16, q0, #5 vqrshrn.s32 d4, q1, #5 vmin.s16 d4, d4, d14 vmin.s16 d16, d16, d14 vmax.s16 d4, d4, d12 vmax.s16 d16, d16, d12 .endif vmull.s16 q0, d4, d28 vmull.s16 q1, d5, d28 vmull.s16 q2, d6, d28 vmull.s16 q3, d7, d28 .if !\ox vdup.16 q7, d30[3] // bitdepth_max .endif vmlal.s16 q0, d16, d29 vmlal.s16 q1, d17, d29 vmlal.s16 q2, d18, d29 vmlal.s16 q3, d19, d29 .if !\ox vshr.u16 q7, q7, #1 // grain_max .endif vmull.s16 q8, d20, d29 vmull.s16 q9, d21, d29 vmull.s16 q10, d22, d29 vmull.s16 q11, d23, d29 .if !\ox vmvn q6, q7 // grain_min .endif vmlal.s16 q8, d8, d28 vmlal.s16 q9, d9, d28 vmlal.s16 q10, d10, d28 vmlal.s16 q11, d11, d28 vqrshrn.s32 d0, q0, #5 vqrshrn.s32 d1, q1, #5 vqrshrn.s32 d2, q2, #5 vqrshrn.s32 d3, q3, #5 vqrshrn.s32 d4, q8, #5 vqrshrn.s32 d5, q9, #5 vqrshrn.s32 d6, q10, #5 vqrshrn.s32 d7, q11, #5 vmin.s16 q8, q0, q7 vmin.s16 q9, q1, q7 vld1.16 {q0, q1}, [r6, :128]! // luma vmin.s16 q10, q2, q7 vmin.s16 q11, q3, q7 vmax.s16 q8, q8, q6 vmax.s16 q9, q9, q6 vld1.16 {q2, q3}, [r6, :128], r7 // luma vmax.s16 q10, q10, q6 vmax.s16 q11, q11, q6 .elseif \ox vqrshrn.s32 d16, q0, #5 vld1.16 {q0, q1}, [r6, :128]! // luma vmin.s16 d16, d16, d14 vld1.16 {q2, q3}, [r6, :128], r7 // luma vmax.s16 d16, d16, d12 .endif .if !\csfl vdup.16 d28, d30[0] // uv_luma_mult vld1.16 {q4, q5}, [r1, :128]! 
// src vdup.16 d29, d30[1] // uv_mult vmull.s16 q6, d0, d28 vmull.s16 q7, d1, d28 vmull.s16 q0, d2, d28 vmull.s16 q1, d3, d28 vmlal.s16 q6, d8, d29 vmlal.s16 q7, d9, d29 vmlal.s16 q0, d10, d29 vmlal.s16 q1, d11, d29 vld1.16 {q4, q5}, [r1, :128] // src sub r1, r1, #32 vshrn.s32 d12, q6, #6 vshrn.s32 d13, q7, #6 vshrn.s32 d14, q0, #6 vshrn.s32 d15, q1, #6 vmull.s16 q0, d4, d28 vmull.s16 q1, d5, d28 vmull.s16 q2, d6, d28 vmull.s16 q3, d7, d28 vmlal.s16 q0, d8, d29 vmlal.s16 q1, d9, d29 vmlal.s16 q2, d10, d29 vmlal.s16 q3, d11, d29 vdup.16 q14, d30[2] // uv_offset vshrn.s32 d0, q0, #6 vshrn.s32 d1, q1, #6 vshrn.s32 d2, q2, #6 vshrn.s32 d3, q3, #6 vdup.16 q4, d30[3] // bitdepth_max vmov.i16 q5, #0 vadd.i16 q6, q6, q14 vadd.i16 q7, q7, q14 vadd.i16 q2, q0, q14 vadd.i16 q3, q1, q14 vmin.s16 q0, q6, q4 vmin.s16 q1, q7, q4 vmin.s16 q2, q2, q4 vmin.s16 q3, q3, q4 vmax.s16 q0, q0, q5 vmax.s16 q1, q1, q5 vmax.s16 q2, q2, q5 vmax.s16 q3, q3, q5 .else vdup.16 q14, d30[3] // bitdepth_max // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. vand q0, q0, q14 vand q1, q1, q14 vand q2, q2, q14 vand q3, q3, q14 .endif bl gather32_neon vld1.16 {q0, q1}, [r1, :128]! 
// src vmovl.u8 q6, d8 // scaling vmovl.u8 q7, d9 vmovl.u8 q4, d10 vmovl.u8 q5, d11 vld1.16 {q2, q3}, [r1, :128], r2 // src vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) vshl.u16 q7, q7, q13 vshl.u16 q4, q4, q13 vshl.u16 q5, q5, q13 vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) vqrdmulh.s16 q9, q9, q7 vqrdmulh.s16 q10, q10, q4 vqrdmulh.s16 q11, q11, q5 vdup.16 q4, d31[0] // clip_min vdup.16 q5, d31[1] // clip_max vqadd.s16 q0, q0, q8 // *src + noise vqadd.s16 q1, q1, q9 vqadd.s16 q2, q2, q10 vqadd.s16 q3, q3, q11 .if \oy vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x .endif vmax.s16 q0, q0, q4 vmax.s16 q1, q1, q4 vmax.s16 q2, q2, q4 vmax.s16 q3, q3, q4 vmin.s16 q0, q0, q5 vmin.s16 q1, q1, q5 vmin.s16 q2, q2, q5 vmin.s16 q3, q3, q5 vst1.16 {q0, q1}, [r0, :128]! // dst subs r9, r9, #1 .if \oy vmov.32 d31[1], lr // new coeffs for overlap y .endif vst1.16 {q2, q3}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r12, #0 mov r9, r12 // restore actual remaining h bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) .endif b 9f .endm fguv_loop_sx0 0, 0, 0 fguv_loop_sx0 0, 0, 1 fguv_loop_sx0 0, 1, 0 fguv_loop_sx0 0, 1, 1 fguv_loop_sx0 1, 0, 0 fguv_loop_sx0 1, 0, 1 fguv_loop_sx0 1, 1, 0 fguv_loop_sx0 1, 1, 1 9: vpop {q4-q7} pop {r4-r11,pc} endfunc function fguv_loop_sx1_neon L(fguv_loop_sx1_tbl): .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): .if \oy 
mov r12, lr .endif 1: .if \ox vld1.16 {d0}, [r4], r10 // grain_lut old .endif .if \ox && \oy vld1.16 {d2}, [r11], r10 // grain_lut top old .endif .if \oy vld1.16 {q2, q3}, [r8], r10 // grain_lut top .endif .if !\ox && !\oy vld1.16 {q0, q1}, [r6, :128]! // luma .endif vld1.16 {q8, q9}, [r5], r10 // grain_lut .if \oy vdup.16 d28, d31[2] // overlap y coeff vdup.16 d29, d31[3] // overlap y coeff .endif .if !\ox && !\oy vld1.16 {q2, q3}, [r6, :128], r7 // luma .endif .if \ox vdup.16 q7, d30[3] // bitdepth_max vmull.s16 q0, d0, d24 vshr.u16 q7, q7, #1 // grain_max vmlal.s16 q0, d16, d25 vmvn q6, q7 // grain_min .endif .if \oy .if \ox vmull.s16 q1, d2, d24 vmlal.s16 q1, d4, d25 vqrshrn.s32 d16, q0, #5 vqrshrn.s32 d4, q1, #5 vmin.s16 d4, d4, d14 vmin.s16 d16, d16, d14 vmax.s16 d4, d4, d12 vmax.s16 d16, d16, d12 .endif vmull.s16 q0, d4, d28 vmull.s16 q1, d5, d28 vmull.s16 q2, d6, d28 vmull.s16 q3, d7, d28 .if !\ox vdup.16 q7, d30[3] // bitdepth_max .endif vmlal.s16 q0, d16, d29 vmlal.s16 q1, d17, d29 vmlal.s16 q2, d18, d29 vmlal.s16 q3, d19, d29 .if !\ox vshr.u16 q7, q7, #1 // grain_max .endif vqrshrn.s32 d16, q0, #5 vqrshrn.s32 d17, q1, #5 vqrshrn.s32 d18, q2, #5 vqrshrn.s32 d19, q3, #5 .if !\ox vmvn q6, q7 // grain_min .endif vld1.16 {q0, q1}, [r6, :128]! // luma vmin.s16 q8, q8, q7 vmin.s16 q9, q9, q7 vmax.s16 q8, q8, q6 vmax.s16 q9, q9, q6 vld1.16 {q2, q3}, [r6, :128], r7 // luma .elseif \ox vqrshrn.s32 d16, q0, #5 vld1.16 {q0, q1}, [r6, :128]! 
// luma vmin.s16 d16, d16, d14 vld1.16 {q2, q3}, [r6, :128], r7 // luma vmax.s16 d16, d16, d12 .endif vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vrshr.u16 q0, q0, #1 vrshr.u16 q1, q1, #1 .if !\csfl vdup.16 d28, d30[0] // uv_luma_mult vld1.16 {q2, q3}, [r1, :128], r2 // src vdup.16 d29, d30[1] // uv_mult vmull.s16 q6, d0, d28 vmull.s16 q7, d1, d28 vmull.s16 q0, d2, d28 vmull.s16 q1, d3, d28 vmlal.s16 q6, d4, d29 vmlal.s16 q7, d5, d29 vmlal.s16 q0, d6, d29 vmlal.s16 q1, d7, d29 vshrn.s32 d12, q6, #6 vshrn.s32 d13, q7, #6 vshrn.s32 d14, q0, #6 vshrn.s32 d15, q1, #6 vdup.16 q14, d30[2] // uv_offset vdup.16 q4, d30[3] // bitdepth_max vmov.i16 q5, #0 vadd.i16 q6, q6, q14 vadd.i16 q7, q7, q14 vmin.s16 q0, q6, q4 vmin.s16 q1, q7, q4 vmax.s16 q0, q0, q5 vmax.s16 q1, q1, q5 .else vdup.16 q14, d30[3] // bitdepth_max vld1.16 {q2, q3}, [r1, :128], r2 // src // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. 
vand q0, q0, q14 vand q1, q1, q14 .endif bl gather16_neon vmovl.u8 q6, d8 // scaling vmovl.u8 q7, d9 vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) vshl.u16 q7, q7, q13 vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) vqrdmulh.s16 q9, q9, q7 vdup.16 q4, d31[0] // clip_min vdup.16 q5, d31[1] // clip_max vqadd.s16 q0, q2, q8 // *src + noise vqadd.s16 q1, q3, q9 .if \oy // Swap the two last coefficients of d31, place them first in d28 vrev64.16 d28, d31 .endif vmax.s16 q0, q0, q4 vmax.s16 q1, q1, q4 vmin.s16 q0, q0, q5 vmin.s16 q1, q1, q5 subs r9, r9, #1 .if \oy // Take the first two 16 bit coefficients of d28 and place them at the // end of d31 vtrn.32 d31, d28 .endif vst1.16 {q0, q1}, [r0, :128], r2 // dst bgt 1b .if \oy cmp r12, #0 mov r9, r12 // restore actual remaining h bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx1 0, 0, 0 fguv_loop_sx1 0, 0, 1 fguv_loop_sx1 0, 1, 0 fguv_loop_sx1 0, 1, 1 fguv_loop_sx1 1, 0, 0 fguv_loop_sx1 1, 0, 1 fguv_loop_sx1 1, 1, 0 fguv_loop_sx1 1, 1, 1 9: vpop {q4-q7} pop {r4-r11,pc} endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/ipred.S000066400000000000000000003411221517466257200230700ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * Copyright © 2019, B Krishnan Iyer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 push {r4, lr} ldr r4, [sp, #8] clz r3, r3 adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 ldr r3, [r2, r3, lsl #2] vmov.i8 q0, #128 add r2, r2, r3 add r12, r0, r1 lsl r1, r1, #1 bx r2 .align 2 L(ipred_dc_128_tbl): .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4, pc} 8: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4, pc} 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: 
vmov.i8 q1, #128 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vmov.i8 q1, #128 sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 push {r4, lr} ldr lr, [sp, #8] clz r3, r3 adr r4, L(ipred_v_tbl) sub r3, r3, #25 ldr r3, [r4, r3, lsl #2] add r2, r2, #1 add r4, r4, r3 add r12, r0, r1 lsl r1, r1, #1 bx r4 .align 2 L(ipred_v_tbl): .word 640f - L(ipred_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_v_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r2] 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs lr, lr, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4, pc} 80: vld1.8 {d0}, [r2] 8: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs lr, lr, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4, pc} 160: vld1.8 {q0}, [r2] 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vld1.8 {q0, q1}, [r2] 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, 
d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vld1.8 {q0, q1}, [r2]! sub r1, r1, #32 vld1.8 {q2, q3}, [r2] 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_h_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] sub r2, r2, #4 mov lr, #-4 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_h_tbl): .word 640f - L(ipred_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_h_tbl) + CONFIG_THUMB .word 8f - L(ipred_h_tbl) + CONFIG_THUMB .word 4f - L(ipred_h_tbl) + CONFIG_THUMB 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.32 {d3[0]}, [r0, :32], r1 vst1.32 {d2[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4-r5, pc} 8: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.8 {d3}, [r0, :64], r1 vst1.8 {d2}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d1}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4-r5, pc} 160: add r2, r2, #3 mov lr, #-1 16: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, :128], r1 vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128], r1 vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: add r2, r2, #3 mov lr, #-1 sub r1, r1, #16 32: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, 
:128]! vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128]! vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q0}, [r0, :128], r1 vst1.8 {q1}, [r12, :128], r1 vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: add r2, r2, #3 mov lr, #-1 sub r1, r1, #48 64: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, :128]! vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128]! vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q0}, [r0, :128]! vst1.8 {q1}, [r12, :128]! vst1.8 {q0}, [r0, :128]! vst1.8 {q1}, [r12, :128]! vst1.8 {q0}, [r0, :128], r1 vst1.8 {q1}, [r12, :128], r1 vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_dc_top_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] add r2, r2, #1 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_top_tbl): .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 d0, d0[0] 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4-r5, pc} 80: vld1.8 {d0}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #3 vdup.8 d0, d0[0] 8: vst1.8 
{d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4-r5, pc} 160: vld1.8 {d0, d1}, [r2] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: vld1.8 {d0, d1, d2, d3}, [r2] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d4, q0, #5 vdup.8 q0, d4[0] vdup.8 q1, d4[0] 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: vld1.8 {d0, d1, d2, d3}, [r2]! vaddl.u8 q0, d0, d1 vld1.8 {d4, d5, d6, d7}, [r2] vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d18, q0, #6 vdup.8 q0, d18[0] vdup.8 q1, d18[0] sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] sub r2, r2, r4 clz r3, r3 clz lr, r4 sub lr, lr, #25 adr r5, L(ipred_dc_left_tbl) sub r3, r3, #20 ldr r3, [r5, r3, lsl #2] ldr lr, [r5, lr, lsl #2] add r3, r5, r3 add r5, r5, lr add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_left_tbl): .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB L(ipred_dc_left_h4): vld1.32 {d0[]}, [r2, :32] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w4): vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt L(ipred_dc_left_w4) pop {r4-r5, pc} L(ipred_dc_left_h8): vld1.8 {d0}, [r2, :64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #3 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w8): vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt L(ipred_dc_left_w8) pop {r4-r5, pc} L(ipred_dc_left_h16): vld1.8 {d0, d1}, [r2, :128] vaddl.u8 q0, 
d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w16): vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt L(ipred_dc_left_w16) pop {r4-r5, pc} L(ipred_dc_left_h32): vld1.8 {d0, d1, d2, d3}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #5 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w32): vmov.8 q1, q0 1: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} L(ipred_dc_left_h64): vld1.8 {d0, d1, d2, d3}, [r2, :128]! vld1.8 {d4, d5, d6, d7}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #6 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w64): vmov.8 q1, q0 sub r1, r1, #32 1: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} endfunc // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_8bpc_neon, export=1 push {r4-r6, lr} ldr r4, [sp, #16] sub r2, r2, r4 add lr, r3, r4 // width + height clz r3, r3 clz r12, r4 vdup.16 q15, lr // width + height adr r5, L(ipred_dc_tbl) rbit lr, lr // rbit(width + height) sub r3, r3, #20 // 25 leading bits, minus table offset 5 sub r12, r12, #25 clz lr, lr // ctz(width + height) ldr r3, [r5, r3, lsl #2] ldr r12, [r5, r12, lsl #2] neg lr, lr // -ctz(width + height) add r3, r5, r3 add r5, r5, r12 vshr.u16 q15, q15, #1 // (width + height) >> 1 vdup.16 q14, lr // -ctz(width + height) add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_tbl): .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB L(ipred_dc_h4): vld1.32 {d0[]}, [r2, :32]! 
vpaddl.u8 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w4): vld1.32 {d1[]}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d1, d1 vpadd.u16 d1, d1 cmp r4, #4 vadd.s16 d0, d0, d1 vshl.u16 d0, d0, d28 beq 1f // h = 8/16 movw lr, #(0x3334/2) movw r5, #(0x5556/2) cmp r4, #16 it ne movne lr, r5 vdup.16 d30, lr vqdmulh.s16 d0, d0, d30 1: vdup.8 d0, d0[0] 2: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w8): vld1.8 {d2}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d2, d2 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #8 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 d0, d0[0] 2: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h16): vld1.8 {d0, d1}, [r2, :128]! vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w16): vld1.8 {d2, d3}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 vadd.u16 d2, d2, d3 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #16 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 q0, d0[0] 2: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h32): vld1.8 {d0, d1, d2, d3}, [r2, :128]! 
vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w32): vld1.8 {d2, d3, d4, d5}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vadd.u16 q1, q1, q2 vadd.u16 d2, d2, d3 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #32 vadd.s16 d0, d0, d2 vshl.u16 d4, d0, d28 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d4, d4, d24 1: vdup.8 q0, d4[0] vdup.8 q1, d4[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h64): vld1.8 {d0, d1, d2, d3}, [r2, :128]! vaddl.u8 q0, d0, d1 vld1.8 {d4, d5, d6, d7}, [r2, :128]! vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w64): vld1.8 {d2, d3, d4, d5}, [r2]! vadd.s16 d0, d0, d30 vaddl.u8 q2, d4, d5 vaddl.u8 q1, d2, d3 vadd.u16 d4, d4, d5 vadd.u16 d2, d2, d3 vld1.8 {d16, d17, d18, d19}, [r2] vpadd.u16 d4, d4 vpadd.u16 d2, d2 vpadd.u16 d4, d4 vpadd.u16 d2, d2 vaddl.u8 q8, d16, d17 vaddl.u8 q9, d18, d19 vadd.u16 d16, d16, d17 vadd.u16 d18, d18, d19 vpadd.u16 d16, d16 vpadd.u16 d18, d18 vpadd.u16 d16, d16 vpadd.u16 d18, d18 vadd.u16 d2, d2, d4 vadd.u16 d3, d16, d18 cmp r4, #64 vadd.s16 d0, d0, d2 vadd.s16 d0, d0, d3 vshl.u16 d18, d0, d28 beq 1f // h = 16/32 movw lr, #(0x5556/2) movt lr, #(0x3334/2) and r5, r4, #31 lsr lr, lr, r5 vdup.16 d30, lr vqdmulh.s16 d18, d18, d30 1: sub r1, r1, #32 vdup.8 q0, d18[0] vdup.8 q1, d18[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! 
vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} endfunc // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] clz lr, r3 adr r5, L(ipred_paeth_tbl) sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[], d5[]}, [r2] add r8, r2, #1 sub r2, r2, #4 add r5, r5, lr mov r7, #-4 add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_paeth_tbl): .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB 40: vld1.32 {d6[], d7[]}, [r8] vsubl.u8 q8, d6, d4 // top - topleft 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 vzip.32 d0, d1 vzip.32 d2, d3 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d2 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vmov d1, d2 vabd.u8 q10, q3, q9 // tdiff vabd.u8 q11, q2, q9 // tldiff vabd.u8 q9, q0, q9 // ldiff vmin.u8 q12, q10, q11 // min(tdiff, tldiff) vcge.u8 q10, q11, q10 // tldiff >= tdiff vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft vbit q10, q0, q9 // ldiff <= min ? left : ... 
vst1.32 {d21[1]}, [r0, :32], r1 vst1.32 {d21[0]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d20[1]}, [r0, :32], r1 vst1.32 {d20[0]}, [r6, :32], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8] vsubl.u8 q8, d6, d4 // top - topleft vmov d7, d6 8: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d1 vaddw.u8 q11, q8, d2 vaddw.u8 q12, q8, d3 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vqmovun.s16 d20, q11 vqmovun.s16 d21, q12 vabd.u8 q11, q3, q9 // tdiff vabd.u8 q12, q3, q10 vabd.u8 q13, q2, q9 // tldiff vabd.u8 q14, q2, q10 vabd.u8 q10, q1, q10 // ldiff vabd.u8 q9, q0, q9 vmin.u8 q15, q12, q14 // min(tdiff, tldiff) vcge.u8 q12, q14, q12 // tldiff >= tdiff vmin.u8 q14, q11, q13 // min(tdiff, tldiff) vcge.u8 q11, q13, q11 // tldiff >= tdiff vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff vcge.u8 q9, q14, q9 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 vst1.8 {d25}, [r0, :64], r1 vst1.8 {d24}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d23}, [r0, :64], r1 vst1.8 {d22}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vld1.8 {d6}, [r8]! 
mov r12, r3 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 1: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 2: vsubl.u8 q8, d6, d4 // top - topleft vmov d7, d6 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d1 vaddw.u8 q11, q8, d2 vaddw.u8 q12, q8, d3 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vqmovun.s16 d20, q11 vqmovun.s16 d21, q12 vabd.u8 q11, q3, q9 // tdiff vabd.u8 q12, q3, q10 vabd.u8 q13, q2, q9 // tldiff vabd.u8 q14, q2, q10 vabd.u8 q10, q1, q10 // ldiff vabd.u8 q9, q0, q9 vmin.u8 q15, q12, q14 // min(tdiff, tldiff) vcge.u8 q12, q14, q12 // tldiff >= tdiff vmin.u8 q14, q11, q13 // min(tdiff, tldiff) vcge.u8 q11, q13, q11 // tldiff >= tdiff vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff vcge.u8 q9, q14, q9 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 subs r3, r3, #8 vst1.8 {d25}, [r0, :64]! vst1.8 {d24}, [r6, :64]! vst1.8 {d23}, [r5, :64]! vst1.8 {d22}, [lr, :64]! ble 8f vld1.8 {d6}, [r8]! b 2b 8: subs r4, r4, #4 ble 9f // End of horizontal loop, move pointers to next four rows sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 vld1.8 {d6}, [r8]! 
add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: pop {r4-r8, pc} endfunc #if 0 // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_8bpc_neon, export=1 push {r4-r10, lr} ldr r4, [sp, #32] movrel r10, X(sm_weights) add r12, r10, r4 add r10, r10, r3 clz r9, r3 adr r5, L(ipred_smooth_tbl) sub lr, r2, r4 sub r9, r9, #25 ldr r9, [r5, r9, lsl #2] vld1.8 {d4[]}, [lr] // bottom add r8, r2, #1 add r5, r5, r9 add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_tbl): .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB 40: vld1.32 {d16[]}, [r8] // top vld1.32 {d18[]}, [r10, :32] // weights_hor sub r2, r2, #4 mov r7, #-4 vdup.8 q3, d16[3] // right vsubl.u8 q8, d16, d4 // top-bottom vmovl.u8 q9, d18 // weights_hor 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! 
// weights_ver vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vzip.32 d1, d0 // left, flipped vzip.32 d3, d2 vzip.32 d20, d21 // weights_ver vzip.32 d22, d23 vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q0, d1, d6 // left-right vsubl.u8 q1, d3, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q0, q9 // (left flipped) vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vst1.32 {d24[0]}, [r0, :32], r1 vst1.32 {d24[1]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d25[0]}, [r0, :32], r1 vst1.32 {d25[1]}, [r6, :32], r1 bgt 4b pop {r4-r10, pc} 80: vld1.8 {d16}, [r8] // top vld1.8 {d18}, [r10, :64] // weights_hor sub r2, r2, #2 mov r7, #-2 vdup.8 q3, d16[7] // right vsubl.u8 q8, d16, d4 // top-bottom vmovl.u8 q9, d18 // weights_hor 8: vld2.8 {d0[], d1[]}, [r2, :16], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q1, d0, d6 // left-right (left flipped) vsubl.u8 q0, d1, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q1, q9 vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 subs r4, r4, #2 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 bgt 8b pop {r4-r10, pc} 160: 320: 640: add lr, r2, r3 sub r2, r2, #2 mov r7, #-2 vld1.8 {d6[], d7[]}, [lr] // right sub r1, r1, r3 mov r9, r3 1: vld2.8 {d0[], d1[]}, [r2, :16], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! 
// weights_ver vsubl.u8 q1, d0, d6 // left-right (left flipped) vsubl.u8 q0, d1, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 2: vld1.8 {d16}, [r8]! // top vld1.8 {d18}, [r10, :64]! // weights_hor vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vmovl.u8 q9, d18 // weights_hor vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q8, d16, d4 // top-bottom vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q1, q9 vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 subs r3, r3, #8 vst1.8 {d24}, [r0, :64]! vst1.8 {d25}, [r6, :64]! bgt 2b subs r4, r4, #2 ble 9f sub r8, r8, r9 sub r10, r10, r9 add r0, r0, r1 add r6, r6, r1 mov r3, r9 b 1b 9: pop {r4-r10, pc} endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_8bpc_neon, export=1 push {r4-r7, lr} ldr r4, [sp, #20] movrel r7, X(sm_weights) add r7, r7, r4 clz lr, r3 adr r5, L(ipred_smooth_v_tbl) sub r12, r2, r4 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[]}, [r12] // bottom add r2, r2, #1 add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_v_tbl): .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r2] // top vsubl.u8 q3, d6, d4 // top-bottom 4: vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! 
// weights_ver vshll.i8 q10, d4, #8 // bottom*256 vshll.i8 q11, d4, #8 vzip.32 d16, d17 // weights_ver vzip.32 d18, d19 vmovl.u8 q8, d16 // weights_ver vmovl.u8 q9, d18 subs r4, r4, #4 vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q11, q3, q9 vrshrn.i16 d20, q10, #8 vrshrn.i16 d21, q11, #8 vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r6, :32], r1 vst1.32 {d21[0]}, [r0, :32], r1 vst1.32 {d21[1]}, [r6, :32], r1 bgt 4b pop {r4-r7, pc} 80: vld1.8 {d6}, [r2] // top vsubl.u8 q3, d6, d4 // top-bottom 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver vshll.i8 q12, d4, #8 // bottom*256 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmovl.u8 q8, d16 // weights_ver vmovl.u8 q9, d18 vmovl.u8 q10, d20 vmovl.u8 q11, d22 vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q13, q3, q9 vmla.i16 q14, q3, q10 vmla.i16 q15, q3, q11 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vrshrn.i16 d26, q14, #8 vrshrn.i16 d27, q15, #8 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d26}, [r0, :64], r1 vst1.8 {d27}, [r6, :64], r1 bgt 8b pop {r4-r7, pc} 160: 320: 640: vpush {q4-q7} // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver vmovl.u8 q4, d8 // weights_ver vmovl.u8 q5, d10 vmovl.u8 q6, d12 vmovl.u8 q7, d14 2: vld1.8 {q3}, [r2]! 
// top vshll.i8 q8, d4, #8 // bottom*256 vshll.i8 q9, d4, #8 vshll.i8 q10, d4, #8 vshll.i8 q11, d4, #8 vsubl.u8 q0, d6, d4 // top-bottom vsubl.u8 q1, d7, d4 vshll.i8 q12, d4, #8 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q9, q1, q4 vmla.i16 q10, q0, q5 vmla.i16 q11, q1, q5 vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q13, q1, q6 vmla.i16 q14, q0, q7 vmla.i16 q15, q1, q7 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vrshrn.i16 d18, q10, #8 vrshrn.i16 d19, q11, #8 vrshrn.i16 d20, q12, #8 vrshrn.i16 d21, q13, #8 vrshrn.i16 d22, q14, #8 vrshrn.i16 d23, q15, #8 subs r3, r3, #16 vst1.8 {q8}, [r0, :128]! vst1.8 {q9}, [r6, :128]! vst1.8 {q10}, [r5, :128]! vst1.8 {q11}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r2, r2, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r7, pc} endfunc // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_8bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] movrel r8, X(sm_weights) add r8, r8, r3 clz lr, r3 adr r5, L(ipred_smooth_h_tbl) add r12, r2, r3 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[]}, [r12] // right add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_h_tbl): .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r8, :32] // weights_hor sub r2, r2, #4 mov r7, #-4 vmovl.u8 q3, d6 // weights_hor 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left vshll.i8 q8, d4, #8 // right*256 vshll.i8 q9, d4, #8 vzip.32 d3, d2 // left, flipped vzip.32 d1, d0 
vsubl.u8 q1, d3, d4 // left-right vsubl.u8 q0, d1, d4 subs r4, r4, #4 vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor vmla.i16 q9, q0, q3 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8, :64] // weights_hor sub r2, r2, #4 mov r7, #-4 vmovl.u8 q3, d6 // weights_hor 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left vshll.i8 q12, d4, #8 // right*256 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vsubl.u8 q11, d22, d4 // left-right vsubl.u8 q10, d20, d4 vsubl.u8 q9, d18, d4 vsubl.u8 q8, d16, d4 vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor vmla.i16 q13, q10, q3 // (left flipped) vmla.i16 q14, q9, q3 vmla.i16 q15, q8, q3 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vrshrn.i16 d26, q14, #8 vrshrn.i16 d27, q15, #8 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d26}, [r0, :64], r1 vst1.8 {d27}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vpush {q4-q7} sub r2, r2, #4 mov r7, #-4 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left vsubl.u8 q4, d8, d4 // left-right vsubl.u8 q5, d10, d4 vsubl.u8 q6, d12, d4 vsubl.u8 q7, d14, d4 2: vld1.8 {q1}, [r8, :128]! 
// weights_hor vshll.i8 q8, d4, #8 // right*256 vshll.i8 q9, d4, #8 vshll.i8 q10, d4, #8 vshll.i8 q11, d4, #8 vmovl.u8 q0, d2 // weights_hor vmovl.u8 q1, d3 vshll.i8 q12, d4, #8 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor vmla.i16 q9, q7, q1 // (left flipped) vmla.i16 q10, q6, q0 vmla.i16 q11, q6, q1 vmla.i16 q12, q5, q0 vmla.i16 q13, q5, q1 vmla.i16 q14, q4, q0 vmla.i16 q15, q4, q1 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vrshrn.i16 d18, q10, #8 vrshrn.i16 d19, q11, #8 vrshrn.i16 d20, q12, #8 vrshrn.i16 d21, q13, #8 vrshrn.i16 d22, q14, #8 vrshrn.i16 d23, q15, #8 subs r3, r3, #16 vst1.8 {q8}, [r0, :128]! vst1.8 {q9}, [r6, :128]! vst1.8 {q10}, [r5, :128]! vst1.8 {q11}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc #endif // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height); function ipred_filter_8bpc_neon, export=1 push {r4-r8, lr} movw r12, #511 ldrd r4, r5, [sp, #24] and r5, r5, r12 // 511 movrel r6, X(filter_intra_taps) lsl r5, r5, #6 add r6, r6, r5 vld1.8 {d20, d21, d22, d23}, [r6, :128]! 
clz lr, r3 adr r5, L(ipred_filter_tbl) vld1.8 {d27, d28, d29}, [r6, :64] sub lr, lr, #26 ldr lr, [r5, lr, lsl #2] vmovl.s8 q8, d20 vmovl.s8 q9, d21 add r5, r5, lr vmovl.s8 q10, d22 vmovl.s8 q11, d23 add r6, r0, r1 lsl r1, r1, #1 vmovl.s8 q12, d27 vmovl.s8 q13, d28 vmovl.s8 q14, d29 add r8, r2, #1 sub r2, r2, #2 mov r7, #-2 bx r5 .align 2 L(ipred_filter_tbl): .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r8] // top (0-3) vmovl.u8 q0, d0 // top (0-3) 4: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmovl.u8 q1, d2 // left (0-1) + topleft (2) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vqrshrun.s16 d4, q2, #4 subs r4, r4, #2 vst1.32 {d4[0]}, [r0, :32], r1 vmovl.u8 q0, d4 vst1.32 {d4[1]}, [r6, :32], r1 vmov d0, d1 // move top from [4-7] to [0-3] bgt 4b pop {r4-r8, pc} 80: vld1.8 {d0}, [r8] // top (0-7) vmovl.u8 q0, d0 // top (0-7) 8: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmovl.u8 q1, d2 // left (0-1) + topleft (2) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) vqrshrun.s16 d4, q2, #4 vmovl.u8 q1, d4 // first block, in 16 bit vmla.i16 q3, 
q12, d1[3] // p4(top[3]) * filter(4) vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6) vqrshrun.s16 d5, q3, #4 vzip.32 d4, d5 subs r4, r4, #2 vst1.8 {d4}, [r0, :64], r1 vmovl.u8 q0, d5 vst1.8 {d5}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: vpush {q4-q5} sub r1, r1, r3 mov lr, r3 1: vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2) vmovl.u8 q0, d0 // left (0-1) + topleft (2) 2: vld1.8 {q2}, [r8]! // top(0-15) vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) vmovl.u8 q1, d4 // top(0-7) vmovl.u8 q2, d5 // top(8-15) vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) vqrshrun.s16 d6, q3, #4 vmovl.u8 q0, d6 // first block, in 16 bit vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) vqrshrun.s16 d7, q4, #4 vmovl.u8 q0, d7 // second block, in 16 bit vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1) vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2) vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3) vqrshrun.s16 d8, q5, #4 vmovl.u8 q0, d8 // third block, in 16 bit vmov.u8 r12, 
d5[6] vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4) vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0) vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6) vmov.8 d0[4], r12 subs r3, r3, #16 vqrshrun.s16 d9, q15, #4 vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]! vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]! ble 8f vmov.u8 r12, d9[7] vmov.8 d0[0], r12 vmov.u8 r12, d9[3] vmov.8 d0[2], r12 b 2b 8: subs r4, r4, #2 ble 9f sub r8, r6, lr add r0, r0, r1 add r6, r6, r1 mov r3, lr b 1b 9: vpop {q4-q5} pop {r4-r8, pc} endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 push {r4-r5, lr} ldrd r4, r5, [sp, #12] vld1.8 {d0}, [r2, :64] clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 vmov.i8 q15, #7 ldr lr, [r12, lr, lsl #2] add r12, r12, lr add r2, r0, r1 bx r12 .align 2 L(pal_pred_tbl): .word 640f - L(pal_pred_tbl) + CONFIG_THUMB .word 320f - L(pal_pred_tbl) + CONFIG_THUMB .word 160f - L(pal_pred_tbl) + CONFIG_THUMB .word 80f - L(pal_pred_tbl) + CONFIG_THUMB .word 40f - L(pal_pred_tbl) + CONFIG_THUMB 40: lsl r1, r1, #1 4: vld1.8 {d2}, [r3, :64]! subs r5, r5, #4 vshr.u8 d3, d2, #4 vand.u8 d2, d2, d30 vzip.8 d2, d3 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.32 {d2[0]}, [r0, :32], r1 vst1.32 {d2[1]}, [r2, :32], r1 vst1.32 {d3[0]}, [r0, :32], r1 vst1.32 {d3[1]}, [r2, :32], r1 bgt 4b pop {r4-r5, pc} 80: lsl r1, r1, #1 8: vld1.8 {q1}, [r3, :64]! subs r5, r5, #4 vshr.u8 q2, q1, #4 vand.u8 q1, q1, q15 vzip.8 q1, q2 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.8 {d2}, [r0, :64], r1 vtbl.8 d4, {d0}, d4 vst1.8 {d3}, [r2, :64], r1 vtbl.8 d5, {d0}, d5 vst1.8 {d4}, [r0, :64], r1 vst1.8 {d5}, [r2, :64], r1 bgt 8b pop {r4-r5, pc} 160: lsl r1, r1, #1 16: vld1.8 {q10, q11}, [r3, :64]! 
subs r5, r5, #4 vand.u8 q8, q10, q15 vshr.u8 q9, q10, #4 vand.u8 q10, q11, q15 vshr.u8 q11, q11, #4 vzip.8 q8, q9 vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8}, [r0, :128], r1 vtbl.8 d22, {d0}, d22 vst1.8 {q9}, [r2, :128], r1 vtbl.8 d23, {d0}, d23 vst1.8 {q10}, [r0, :128], r1 vst1.8 {q11}, [r2, :128], r1 bgt 16b pop {r4-r5, pc} 320: lsl r1, r1, #1 32: vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #2 vand.u8 q8, q10, q15 vshr.u8 q9, q10, #4 vand.u8 q10, q11, q15 vshr.u8 q11, q11, #4 vzip.8 q8, q9 vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8, q9}, [r0, :128], r1 vtbl.8 d22, {d0}, d22 vtbl.8 d23, {d0}, d23 vst1.8 {q10, q11}, [r2, :128], r1 bgt 32b pop {r4-r5, pc} 640: sub r1, r1, #32 64: vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #1 vand.u8 q8, q10, q15 vshr.u8 q9, q10, #4 vand.u8 q10, q11, q15 vshr.u8 q11, q11, #4 vzip.8 q8, q9 vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8, q9}, [r0, :128]! 
vtbl.8 d22, {d0}, d22 vtbl.8 d23, {d0}, d23 vst1.8 {q10, q11}, [r0, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_128_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vmov.i16 q0, #128 // dc vdup.i16 q1, r6 // alpha add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 bx r12 .align 2 L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB L(ipred_cfl_splat_w4): vld1.16 {q2, q3}, [r5, :128]! vmul.i16 q2, q2, q1 // diff = ac * alpha vmul.i16 q3, q3, q1 vshr.s16 q8, q2, #15 // sign = diff >> 15 vshr.s16 q9, q3, #15 vadd.i16 q2, q2, q8 // diff + sign vadd.i16 q3, q3, q9 vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q3, q3, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d5, q3 vst1.32 {d4[0]}, [r0, :32], r1 vst1.32 {d4[1]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d5[0]}, [r0, :32], r1 vst1.32 {d5[1]}, [r6, :32], r1 bgt L(ipred_cfl_splat_w4) pop {r4-r8, pc} L(ipred_cfl_splat_w8): vld1.16 {q8, q9}, [r5, :128]! vld1.16 {q10, q11}, [r5, :128]! 
vmul.i16 q8, q8, q1 // diff = ac * alpha vmul.i16 q9, q9, q1 vmul.i16 q10, q10, q1 vmul.i16 q11, q11, q1 vshr.s16 q12, q8, #15 // sign = diff >> 15 vshr.s16 q13, q9, #15 vshr.s16 q14, q10, #15 vshr.s16 q15, q11, #15 vadd.i16 q8, q8, q12 // diff + sign vadd.i16 q9, q9, q13 vadd.i16 q10, q10, q14 vadd.i16 q11, q11, q15 vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q9, q9, #6 vrshr.s16 q10, q10, #6 vrshr.s16 q11, q11, #6 vadd.i16 q8, q8, q0 // dc + apply_sign() vadd.i16 q9, q9, q0 vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q0 vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 vqmovun.s16 d19, q11 vst1.8 {d16}, [r0, :64], r1 vst1.8 {d17}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d18}, [r0, :64], r1 vst1.8 {d19}, [r6, :64], r1 bgt L(ipred_cfl_splat_w8) pop {r4-r8, pc} L(ipred_cfl_splat_w16): add r12, r5, r3, lsl #1 sub r1, r1, r3 mov lr, r3 1: vld1.16 {q8, q9}, [r5, :128]! vmul.i16 q8, q8, q1 // diff = ac * alpha vld1.16 {q10, q11}, [r12, :128]! vmul.i16 q9, q9, q1 vmul.i16 q10, q10, q1 vmul.i16 q11, q11, q1 vshr.s16 q12, q8, #15 // sign = diff >> 15 vshr.s16 q13, q9, #15 vshr.s16 q14, q10, #15 vshr.s16 q15, q11, #15 vadd.i16 q8, q8, q12 // diff + sign vadd.i16 q9, q9, q13 vadd.i16 q10, q10, q14 vadd.i16 q11, q11, q15 vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q9, q9, #6 vrshr.s16 q10, q10, #6 vrshr.s16 q11, q11, #6 vadd.i16 q8, q8, q0 // dc + apply_sign() vadd.i16 q9, q9, q0 vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q0 vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 vqmovun.s16 d19, q11 subs r3, r3, #16 vst1.16 {q8}, [r0, :128]! vst1.16 {q9}, [r6, :128]! 
bgt 1b subs r4, r4, #2 add r5, r5, lr, lsl #1 add r12, r12, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr bgt 1b pop {r4-r8, pc} endfunc // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_top_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vdup.16 q1, r6 // alpha add r2, r2, #1 add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 bx r12 .align 2 L(ipred_cfl_top_tbl): .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB 4: vld1.32 {d0[]}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) 8: vld1.8 {d0}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) 16: vld1.8 {q0}, [r2] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) 32: vld1.8 {q2, q3}, [r2] vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q2, q3 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #5 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_left_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 clz lr, r3 clz r8, r4 adr r12, L(ipred_cfl_splat_tbl) adr r7, L(ipred_cfl_left_tbl) sub lr, lr, #26 sub r8, r8, #26 ldr lr, [r12, lr, lsl #2] ldr r8, [r7, r8, lsl #2] vdup.16 q1, r6 // alpha add r12, r12, lr add r7, r7, r8 add r6, r0, r1 lsl r1, r1, #1 bx r7 
.align 2 L(ipred_cfl_left_tbl): .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB L(ipred_cfl_left_h4): vld1.32 {d0[]}, [r2, :32] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h8): vld1.8 {d0}, [r2, :64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h16): vld1.8 {q0}, [r2, :128] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h32): vld1.8 {q2, q3}, [r2, :128] vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q2, q3 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #5 vdup.16 q0, d0[0] bx r12 endfunc // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 add r8, r3, r4 // width + height vdup.16 q1, r6 // alpha clz lr, r3 clz r6, r4 vdup.16 d16, r8 // width + height adr r7, L(ipred_cfl_tbl) rbit r8, r8 // rbit(width + height) sub lr, lr, #22 // 26 leading bits, minus table offset 4 sub r6, r6, #26 clz r8, r8 // ctz(width + height) ldr lr, [r7, lr, lsl #2] ldr r6, [r7, r6, lsl #2] neg r8, r8 // -ctz(width + height) add r12, r7, lr add r7, r7, r6 vshr.u16 d16, d16, #1 // (width + height) >> 1 vdup.16 d17, r8 // -ctz(width + height) add r6, r0, r1 lsl r1, r1, #1 bx r7 .align 2 L(ipred_cfl_tbl): .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + 
CONFIG_THUMB .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB L(ipred_cfl_h4): vld1.32 {d0[]}, [r2, :32]! vpaddl.u8 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w4): vld1.32 {d1[]}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 vpadd.u16 d1, d1 cmp r4, #4 vadd.i16 d0, d0, d1 vshl.u16 d0, d0, d17 beq 1f // h = 8/16 movw lr, #(0x3334/2) movw r8, #(0x5556/2) cmp r4, #16 it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w8): vld1.8 {d1}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 vpadd.i16 d1, d1 vpadd.i16 d1, d1 cmp r4, #8 vadd.i16 d0, d0, d1 vshl.u16 d0, d0, d17 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): vld1.8 {q0}, [r2, :128]! vaddl.u8 q0, d0, d1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w16): vld1.8 {q2}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 vadd.i16 d4, d4, d5 vpadd.i16 d4, d4 vpadd.i16 d4, d4 cmp r4, #16 vadd.i16 d0, d0, d4 vshl.u16 d0, d0, d17 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): vld1.8 {q2, q3}, [r2, :128]! 
vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w32): vld1.8 {q2, q3}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.i16 q2, q2, q3 vadd.i16 d4, d4, d5 vpadd.i16 d4, d4 vpadd.i16 d4, d4 cmp r4, #32 vadd.i16 d0, d0, d4 vshl.u16 d0, d0, d17 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_420_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_420_tbl): .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w4): 1: // Copy and subsample input vld1.8 {d0}, [r1, :64], r2 vld1.8 {d2}, [r12, :64], r2 vld1.8 {d1}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vadd.i16 q0, q0, q1 vshl.i16 q0, q0, #1 subs r8, r8, #2 vst1.16 {q0}, [r0, :128]! 
vadd.i16 q8, q8, q0 bgt 1b cmp r4, #0 vmov d0, d1 vmov d2, d1 vmov d3, d1 L(ipred_cfl_ac_420_w4_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q8, q8, q1 bgt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums vadd.i16 q0, q8, q9 vadd.i16 q1, q10, q11 vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 vadd.i32 q0, q1 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] L(ipred_cfl_ac_420_w4_subtract_dc): 6: // Subtract dc from ac vld1.16 {q0, q1}, [r0, :128] subs r6, r6, #4 vsub.i16 q0, q0, q8 vsub.i16 q1, q1, q8 vst1.16 {q0, q1}, [r0, :128]! bgt 6b pop {r4-r8, pc} L(ipred_cfl_ac_420_w8): cmp r3, #0 bne L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vshl.i16 q0, q0, #1 vshl.i16 q1, q2, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov q0, q1 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {d0}, [r1, :64], r2 vld1.16 {d2}, [r12, :64], r2 vld1.16 {d1}, [r1, :64], r2 vld1.16 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vadd.i16 q0, q0, q1 vshl.i16 q0, q0, #1 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov q0, q1 L(ipred_cfl_ac_420_w8_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl r6, r6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): adr r7, L(ipred_cfl_ac_420_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_420_w16_tbl): .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w16_wpad0): 1: // Copy and subsample input, without padding vld1.8 {q0, q1}, [r1, :128], r2 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q12, q13}, [r1, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q12, q12 vpaddl.u8 q13, q13 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q3 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 vshl.i16 q2, q12, #1 vshl.i16 q3, q13, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): 1: // Copy and subsample input, padding 4 vldr d2, [r1, #16] vld1.8 {q0}, [r1, :128], r2 vldr d6, [r12, #16] vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 d2, d2 vldr d26, [r1, #16] vpaddl.u8 q0, q0 vld1.8 {q12}, [r1, :128], r2 vpaddl.u8 d6, d6 vldr d30, [r12, #16] vpaddl.u8 q2, q2 vld1.8 {q14}, [r12, :128], r2 vpaddl.u8 d26, d26 vpaddl.u8 q12, q12 vpaddl.u8 d30, d30 vpaddl.u8 q14, q14 vadd.i16 d2, d2, d6 vadd.i16 q0, q0, q2 vadd.i16 d26, d26, d30 vadd.i16 q12, q12, q14 vshl.i16 d2, d2, #1 vshl.i16 q0, q0, #1 vshl.i16 d6, d26, #1 vshl.i16 q2, q12, #1 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vshl.i16 q0, q0, #1 vshl.i16 q2, q2, #1 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d4}, [r1, :64], r2 vpaddl.u8 q0, q0 vld1.8 {d5}, [r12, :64], r2 vpaddl.u8 q2, q2 vadd.i16 d0, d0, d1 vadd.i16 d4, d4, d5 vshl.i16 d0, d0, #1 vshl.i16 d4, d4, #1 vdup.16 q1, d0[3] vdup.16 q3, d4[3] vdup.16 d1, d0[3] vdup.16 d5, d4[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl r6, r6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_422_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_422_tbl): .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w4): 1: // Copy and subsample input vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d2}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): cmp r3, #0 bne L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d2}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vdup.16 d7, d3[3] vmov d6, d3 vdup.16 d5, d2[3] vmov d4, d2 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): adr r7, L(ipred_cfl_ac_422_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_422_w16_tbl): .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w16_wpad0): 1: // Copy and subsample input, without padding vld1.8 {q0, q1}, [r1, :128], r2 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): 1: // Copy and subsample input, padding 4 vldr d2, [r1, #16] vld1.8 {q0}, [r1, :128], r2 vldr d6, [r12, #16] vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 d2, d2 vpaddl.u8 q0, q0 vpaddl.u8 d6, d6 vpaddl.u8 q2, q2 vshl.i16 d2, d2, #2 vshl.i16 q0, q0, #2 vshl.i16 d6, d6, #2 vshl.i16 q2, q2, #2 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.8 {q0}, [r1, :128], r2 vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 q0, q0 vpaddl.u8 q2, q2 vshl.i16 q0, q0, #2 vshl.i16 q2, q2, #2 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vpaddl.u8 q0, q0 vshl.i16 q0, q0, #2 vdup.16 q3, d1[3] vdup.16 q1, d0[3] vdup.16 d5, d1[3] vmov d4, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) endfunc // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_444_tbl) sub r8, r8, #26 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_444_tbl): .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w4): 1: // Copy and expand input vld1.32 {d0[]}, [r1, :32], r2 vld1.32 {d0[1]}, [r12, :32], r2 vld1.32 {d2[]}, [r1, :32], r2 vld1.32 {d2[1]}, [r12, :32], r2 vshll.u8 q0, d0, #3 vshll.u8 q1, d2, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): 1: // Copy and expand input vld1.16 {d0}, [r1, :64], r2 vld1.16 {d2}, [r12, :64], r2 vld1.16 {d4}, [r1, :64], r2 vshll.u8 q0, d0, #3 vld1.16 {d6}, [r12, :64], r2 vshll.u8 q1, d2, #3 vshll.u8 q2, d4, #3 vshll.u8 q3, d6, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): cmp r3, #0 bne L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding vld1.8 {q1}, [r1, :128], r2 vld1.8 {q3}, [r12, :128], r2 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q2, d6, #3 vshll.u8 q3, d7, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d4}, [r12, :64], r2 vshll.u8 q0, d0, #3 vshll.u8 q2, d4, #3 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): adr r7, L(ipred_cfl_ac_444_w32_tbl) ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_444_w32_tbl): .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w32_wpad0): 1: // Copy and expand input, without padding vld1.8 {q2, q3}, [r1, :128], r2 vld1.8 {q13, q14}, [r12, :128], r2 vshll.u8 q0, d4, #3 vshll.u8 q1, d5, #3 vshll.u8 q2, d6, #3 vshll.u8 q3, d7, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vshll.u8 q0, d28, #3 vshll.u8 q1, d29, #3 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): 1: // Copy and expand input, padding 8 vldr d4, [r1, #16] vld1.8 {q1}, [r1, :128], r2 vldr d28, [r12, #16] vld1.8 {q13}, [r12, :128], r2 vshll.u8 q2, d4, #3 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vshll.u8 q0, d28, #3 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vdup.16 q1, d1[3] vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): 1: // Copy and expand input, padding 16 vld1.8 {q1}, [r1, :128], r2 vld1.8 {q13}, [r12, :128], r2 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 vdup.16 q2, d3[3] vdup.16 q3, d3[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vdup.16 q0, d27[3] vdup.16 q1, d27[3] vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): 1: // Copy and expand input, padding 24 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d24}, [r12, :64], r2 vshll.u8 q0, d0, #3 vshll.u8 q12, d24, #3 subs r8, r8, #2 vdup.16 q1, d1[3] vdup.16 q2, d1[3] vdup.16 q3, d1[3] vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vdup.16 q13, d25[3] vdup.16 q0, d25[3] vdup.16 q1, d25[3] vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 L(ipred_cfl_ac_444_w32_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #1 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl r6, r6, #3 // Aggregate the sums, with wider intermediates earlier than in // ipred_cfl_ac_420_w8_calc_subtract_dc. 
vpaddl.u16 q0, q8 vpaddl.u16 q1, q9 vpaddl.u16 q2, q10 vpaddl.u16 q3, q11 vadd.i32 q0, q0, q1 vadd.i32 q2, q2, q3 vadd.i32 q0, q0, q2 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] b L(ipred_cfl_ac_420_w4_subtract_dc) endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/ipred16.S000066400000000000000000003767241517466257200232570ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, B Krishnan Iyer * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height, // const int bitdepth_max); function ipred_dc_128_16bpc_neon, export=1 push {r4, lr} ldr r4, [sp, #8] ldr r12, [sp, #24] clz r3, r3 adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 vdup.16 q0, r12 ldr r3, [r2, r3, lsl #2] add r12, r0, r1 vrshr.u16 q0, q0, #1 add r2, r2, r3 lsl r1, r1, #1 bx r2 .align 2 L(ipred_dc_128_tbl): .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB 4: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4, pc} 8: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 8b pop {r4, pc} 160: vmov q1, q0 16: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vmov q1, q0 sub r1, r1, #32 32: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vmov q1, q0 sub r1, r1, #96 64: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! subs r4, r4, #2 vst1.16 {d0, d1, d2, d3}, [r0, :128]! 
vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 push {r4, lr} ldr lr, [sp, #8] clz r3, r3 adr r4, L(ipred_v_tbl) sub r3, r3, #25 ldr r3, [r4, r3, lsl #2] add r2, r2, #2 add r4, r4, r3 add r12, r0, r1 lsl r1, r1, #1 bx r4 .align 2 L(ipred_v_tbl): .word 640f - L(ipred_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_v_tbl) + CONFIG_THUMB 40: vld1.16 {d0}, [r2] 4: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs lr, lr, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4, pc} 80: vld1.16 {q0}, [r2] 8: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 8b pop {r4, pc} 160: vld1.16 {q0, q1}, [r2] 16: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vld1.16 {q0, q1}, [r2]! sub r1, r1, #32 vld1.16 {q2, q3}, [r2] 32: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vld1.16 {q0, q1}, [r2]! sub r1, r1, #96 vld1.16 {q2, q3}, [r2]! vld1.16 {q8, q9}, [r2]! vld1.16 {q10, q11}, [r2]! 64: vst1.16 {d0, d1, d2, d3}, [r0, :128]! 
vst1.16         {d0, d1, d2, d3}, [r12, :128]!  // continues the width 64 store loop of ipred_v_16bpc_neon
        vst1.16         {d4, d5, d6, d7}, [r0, :128]!
        vst1.16         {d4, d5, d6, d7}, [r12, :128]!
        subs            lr, lr, #2
        vst1.16         {d16, d17, d18, d19}, [r0, :128]!
        vst1.16         {d16, d17, d18, d19}, [r12, :128]!
        vst1.16         {d20, d21, d22, d23}, [r0, :128], r1
        vst1.16         {d20, d21, d22, d23}, [r12, :128], r1
        bgt             64b
        pop             {r4, pc}
endfunc

// Horizontal intra prediction, 16 bpc: every output row is filled with one
// left-edge pixel. The left pixels are read from the addresses just below
// topleft, walking backwards in memory: row y uses the pixel at
// topleft[-(1 + y)].
// r0 = dst, r1 = stride in bytes, r2 = topleft, r3 = width; height is loaded
// from the stack into r4 and used as the row counter.
//
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_h_16bpc_neon, export=1
        push            {r4-r5, lr}
        ldr             r4, [sp, #12]  // height (first stack argument, after the 12 byte push)
        clz             r3, r3
        adr             r5, L(ipred_h_tbl)
        sub             r3, r3, #25  // table index: 0 for width 64 ... 4 for width 4
        ldr             r3, [r5, r3, lsl #2]
        sub             r2, r2, #2  // r2 = pixel just before topleft (left pixel of row 0)
        mov             lr, #-2  // step back one pixel per load
        add             r5, r5, r3
        add             r12, r0, r1  // second row pointer
        lsl             r1, r1, #1
        bx              r5

        .align 2
L(ipred_h_tbl):
        .word 640f - L(ipred_h_tbl) + CONFIG_THUMB
        .word 320f - L(ipred_h_tbl) + CONFIG_THUMB
        .word 160f - L(ipred_h_tbl) + CONFIG_THUMB
        .word 8f - L(ipred_h_tbl) + CONFIG_THUMB
        .word 40f - L(ipred_h_tbl) + CONFIG_THUMB
40:     // width 4: fetch four left pixels at once, so move back three more pixels
        sub             r2, r2, #6
        mov             lr, #-8
4:
        // d0-d3 each hold one of four consecutive pixels, replicated; d3 is
        // the highest address, i.e. the first of the four rows.
        vld4.16         {d0[], d1[], d2[], d3[]}, [r2], lr
        vst1.16         {d3}, [r0, :64], r1
        vst1.16         {d2}, [r12, :64], r1
        subs            r4, r4, #4
        vst1.16         {d1}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        bgt             4b
        pop             {r4-r5, pc}
8:      // width 8: one replicated q register per row, loads interleaved with stores
        vld1.16         {d0[], d1[]}, [r2], lr
        subs            r4, r4, #4
        vld1.16         {d2[], d3[]}, [r2], lr
        vst1.16         {q0}, [r0, :128], r1
        vld1.16         {d4[], d5[]}, [r2], lr
        vst1.16         {q1}, [r12, :128], r1
        vld1.16         {d6[], d7[]}, [r2], lr
        vst1.16         {q2}, [r0, :128], r1
        vst1.16         {q3}, [r12, :128], r1
        bgt             8b
        pop             {r4-r5, pc}
160:    // width 16: two 16 byte stores per row
        sub             r1, r1, #16  // compensate for the first, post-incremented store
16:
        vld1.16         {d0[], d1[]}, [r2], lr
        subs            r4, r4, #4
        vld1.16         {d2[], d3[]}, [r2], lr
        vst1.16         {q0}, [r0, :128]!
        vld1.16         {d4[], d5[]}, [r2], lr
        vst1.16         {q1}, [r12, :128]!
        vld1.16         {d6[], d7[]}, [r2], lr
        vst1.16         {q0}, [r0, :128], r1
        vst1.16         {q1}, [r12, :128], r1
        vst1.16         {q2}, [r0, :128]!
        vst1.16         {q3}, [r12, :128]!
        vst1.16         {q2}, [r0, :128], r1
        vst1.16         {q3}, [r12, :128], r1
        bgt             16b
        pop             {r4-r5, pc}
320:    // width 32: four 16 byte stores per row
        sub             r1, r1, #48  // compensate for the three post-incremented stores
32:
        vld1.16         {d0[], d1[]}, [r2], lr
        subs            r4, r4, #4
        vld1.16         {d2[], d3[]}, [r2], lr
        vst1.16         {q0}, [r0, :128]!
        vld1.16         {d4[], d5[]}, [r2], lr
        vst1.16         {q1}, [r12, :128]!
        vld1.16         {d6[], d7[]}, [r2], lr
        vst1.16         {q0}, [r0, :128]!
        vst1.16         {q1}, [r12, :128]!
        vst1.16         {q0}, [r0, :128]!
        vst1.16         {q1}, [r12, :128]!
        vst1.16         {q0}, [r0, :128], r1
        vst1.16         {q1}, [r12, :128], r1
        vst1.16         {q2}, [r0, :128]!
        vst1.16         {q3}, [r12, :128]!
        vst1.16         {q2}, [r0, :128]!
        vst1.16         {q3}, [r12, :128]!
        vst1.16         {q2}, [r0, :128]!
        vst1.16         {q3}, [r12, :128]!
        vst1.16         {q2}, [r0, :128], r1
        vst1.16         {q3}, [r12, :128], r1
        bgt             32b
        pop             {r4-r5, pc}
640:    // width 64: four 32 byte stores per row, two rows per iteration
        sub             r1, r1, #96  // compensate for the three post-incremented stores
64:
        vld1.16         {d0[], d1[]}, [r2], lr
        subs            r4, r4, #2
        vld1.16         {d4[], d5[]}, [r2], lr
        vmov            q1, q0  // q0/q1 = first row value, q2/q3 = second row value
        vmov            q3, q2
        vst1.16         {q0, q1}, [r0, :128]!
        vst1.16         {q2, q3}, [r12, :128]!
        vst1.16         {q0, q1}, [r0, :128]!
        vst1.16         {q2, q3}, [r12, :128]!
        vst1.16         {q0, q1}, [r0, :128]!
        vst1.16         {q2, q3}, [r12, :128]!
vst1.16         {q0, q1}, [r0, :128], r1  // continues the width 64 store loop of ipred_h_16bpc_neon
        vst1.16         {q2, q3}, [r12, :128], r1
        bgt             64b
        pop             {r4-r5, pc}
endfunc

// DC_TOP intra prediction, 16 bpc: fills the block with the rounded average
// of the `width` pixels that start one pixel after topleft (the row above).
// The average is a rounding right shift of the sum by log2(width).
// r0 = dst, r1 = stride in bytes, r2 = topleft, r3 = width; height is loaded
// from the stack into r4 and used as the row counter.
//
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_dc_top_16bpc_neon, export=1
        push            {r4-r5, lr}
        ldr             r4, [sp, #12]  // height
        clz             r3, r3
        adr             r5, L(ipred_dc_top_tbl)
        sub             r3, r3, #25  // table index: 0 for width 64 ... 4 for width 4
        ldr             r3, [r5, r3, lsl #2]
        add             r2, r2, #2  // skip topleft: r2 = top row
        add             r5, r5, r3
        add             r12, r0, r1  // second row pointer
        lsl             r1, r1, #1
        bx              r5

        .align 2
L(ipred_dc_top_tbl):
        .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
        .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
        .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
        .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
        .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
40:     // width 4: sum 4 pixels, (sum + 2) >> 2
        vld1.16         {d0}, [r2]
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d0, d0, #2
        vdup.16         d0, d0[0]
4:
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        subs            r4, r4, #4
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        bgt             4b
        pop             {r4-r5, pc}
80:     // width 8: sum 8 pixels, (sum + 4) >> 3
        vld1.16         {d0, d1}, [r2]
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d0, d0, #3
        vdup.16         q0, d0[0]
8:
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        bgt             8b
        pop             {r4-r5, pc}
160:    // width 16: sum 16 pixels in 16 bit lanes, (sum + 8) >> 4
        vld1.16         {d0, d1, d2, d3}, [r2]
        vadd.i16        q0, q0, q1
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d4, d0, #4
        vdup.16         q0, d4[0]
        vdup.16         q1, d4[0]
16:
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             16b
        pop             {r4-r5, pc}
320:    // width 32: the last pairwise add widens to 32 bit before the narrowing shift
        vld1.16         {d0, d1, d2, d3}, [r2]!
        vld1.16         {d4, d5, d6, d7}, [r2]
        vadd.i16        q0, q0, q1
        vadd.i16        q2, q2, q3
        vadd.i16        q0, q0, q2
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpaddl.u16      d0, d0
        vrshrn.i32      d18, q0, #5
        vdup.16         q0, d18[0]
        vdup.16         q1, d18[0]
        sub             r1, r1, #32  // compensate for the first, post-incremented store
32:
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             32b
        pop             {r4-r5, pc}
640:    // width 64: loads interleaved with the additions; widen to 32 bit for the final sum
        vld1.16         {d0, d1, d2, d3}, [r2]!
        vld1.16         {d4, d5, d6, d7}, [r2]!
        vadd.i16        q0, q0, q1
        vld1.16         {d16, d17, d18, d19}, [r2]!
        vadd.i16        q2, q2, q3
        vld1.16         {d20, d21, d22, d23}, [r2]
        vadd.i16        q8, q8, q9
        vadd.i16        q10, q10, q11
        vadd.i16        q0, q0, q2
        vadd.i16        q8, q8, q10
        vadd.i16        q0, q0, q8
        vadd.i16        d0, d0, d1
        vpaddl.u16      d0, d0
        vpadd.i32       d0, d0, d0
        vrshrn.i32      d18, q0, #6
        vdup.16         q0, d18[0]
        vdup.16         q1, d18[0]
        sub             r1, r1, #96  // compensate for the three post-incremented stores
64:     // two rows per iteration
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        subs            r4, r4, #2
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
vst1.16         {d0, d1, d2, d3}, [r0, :128], r1  // continues the width 64 store loop of ipred_dc_top_16bpc_neon
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             64b
        pop             {r4-r5, pc}
endfunc

// DC_LEFT intra prediction, 16 bpc: fills the block with the rounded average
// of the `height` pixels stored immediately before topleft (the left edge).
// The work is split in two stages that are both dispatched through one table:
// an h<N> routine selected by height computes the average and then jumps
// (bx r3) to a w<N> routine selected by width, which stores the rows.
// r0 = dst, r1 = stride in bytes, r2 = topleft, r3 = width; height is loaded
// from the stack into r4 and used as the row counter.
//
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_dc_left_16bpc_neon, export=1
        push            {r4-r5, lr}
        ldr             r4, [sp, #12]  // height
        sub             r2, r2, r4, lsl #1  // r2 = topleft - height pixels: lowest left pixel
        clz             r3, r3
        clz             lr, r4
        sub             lr, lr, #25  // height index: entries 0-4 (h64 ... h4)
        adr             r5, L(ipred_dc_left_tbl)
        sub             r3, r3, #20  // width index: entries 5-9 (w64 ... w4)
        ldr             r3, [r5, r3, lsl #2]
        ldr             lr, [r5, lr, lsl #2]
        add             r3, r5, r3  // r3 = address of the store routine
        add             r5, r5, lr  // r5 = address of the summing routine
        add             r12, r0, r1  // second row pointer
        lsl             r1, r1, #1
        bx              r5

        .align 2
L(ipred_dc_left_tbl):
        .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
        .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB

L(ipred_dc_left_h4):  // sum 4 pixels, (sum + 2) >> 2, replicate into q0
        vld1.16         {d0}, [r2, :64]
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d0, d0, #2
        vdup.16         q0, d0[0]
        bx              r3
L(ipred_dc_left_w4):
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        subs            r4, r4, #4
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        bgt             L(ipred_dc_left_w4)
        pop             {r4-r5, pc}
L(ipred_dc_left_h8):  // sum 8 pixels, (sum + 4) >> 3
        vld1.16         {d0, d1}, [r2, :128]
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d0, d0, #3
        vdup.16         q0, d0[0]
        bx              r3
L(ipred_dc_left_w8):
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        bgt             L(ipred_dc_left_w8)
        pop             {r4-r5, pc}
L(ipred_dc_left_h16):  // sum 16 pixels, (sum + 8) >> 4
        vld1.16         {d0, d1, d2, d3}, [r2, :128]
        vadd.i16        q0, q0, q1
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpadd.i16       d0, d0, d0
        vrshr.u16       d0, d0, #4
        vdup.16         q0, d0[0]
        bx              r3
L(ipred_dc_left_w16):
        vmov            q1, q0  // the h routines only fill q0
1:
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             1b
        pop             {r4-r5, pc}
L(ipred_dc_left_h32):  // sum 32 pixels; widen to 32 bit before the narrowing shift by 5
        vld1.16         {d0, d1, d2, d3}, [r2, :128]!
        vld1.16         {d4, d5, d6, d7}, [r2, :128]
        vadd.i16        q0, q0, q1
        vadd.i16        q2, q2, q3
        vadd.i16        q0, q0, q2
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        vpaddl.u16      d0, d0
        vrshrn.i32      d0, q0, #5
        vdup.16         q0, d0[0]
        bx              r3
L(ipred_dc_left_w32):
        sub             r1, r1, #32  // compensate for the first, post-incremented store
        vmov            q1, q0
1:
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             1b
        pop             {r4-r5, pc}
L(ipred_dc_left_h64):  // sum 64 pixels; widen to 32 bit before the narrowing shift by 6
        vld1.16         {d0, d1, d2, d3}, [r2, :128]!
        vld1.16         {d4, d5, d6, d7}, [r2, :128]!
        vadd.i16        q0, q0, q1
        vld1.16         {d16, d17, d18, d19}, [r2, :128]!
        vadd.i16        q2, q2, q3
        vld1.16         {d20, d21, d22, d23}, [r2, :128]
        vadd.i16        q8, q8, q9
        vadd.i16        q10, q10, q11
        vadd.i16        q0, q0, q2
        vadd.i16        q8, q8, q10
        vadd.i16        q0, q0, q8
        vadd.i16        d0, d0, d1
        vpaddl.u16      d0, d0
        vpadd.i32       d0, d0, d0
        vrshrn.i32      d0, q0, #6
        vdup.16         q0, d0[0]
        bx              r3
L(ipred_dc_left_w64):
        sub             r1, r1, #96  // compensate for the three post-incremented stores
        vmov            q1, q0
1:      // two rows per iteration
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        subs            r4, r4, #2
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
vst1.16         {d0, d1, d2, d3}, [r0, :128], r1  // continues the width 64 store loop of ipred_dc_left_16bpc_neon
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             1b
        pop             {r4-r5, pc}
endfunc

// DC intra prediction, 16 bpc: fills the block with the rounded average of
// the `height` left pixels (stored before topleft) and the `width` top
// pixels (stored after topleft).
// Two stages dispatched through one table: an h<N> routine selected by height
// sums the left edge into d0 (32 bit lanes), advances r2 past topleft and
// jumps (bx r3) to the w<N> routine selected by width. That routine adds the
// top edge sum and the rounding term (width + height) >> 1, then shifts right
// by ctz(width + height). For non-square blocks the remaining factor of 3 or
// 5 in width + height is divided out by multiplying with 0xAAAB (~2^17 / 3)
// or 0x6667 (~2^17 / 5) and shifting right by 17.
// r0 = dst, r1 = stride in bytes, r2 = topleft, r3 = width; height is loaded
// from the stack into r4 and used as the row counter.
//
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
function ipred_dc_16bpc_neon, export=1
        push            {r4-r6, lr}
        ldr             r4, [sp, #16]  // height (first stack argument, after the 16 byte push)
        sub             r2, r2, r4, lsl #1  // r2 = topleft - height pixels: lowest left pixel
        add             lr, r3, r4  // width + height
        clz             r3, r3
        clz             r12, r4
        vdup.32         q15, lr  // width + height
        adr             r5, L(ipred_dc_tbl)
        rbit            lr, lr  // rbit(width + height)
        sub             r3, r3, #20  // 25 leading bits, minus table offset 5
        sub             r12, r12, #25
        clz             lr, lr  // ctz(width + height)
        ldr             r3, [r5, r3, lsl #2]
        ldr             r12, [r5, r12, lsl #2]
        neg             lr, lr  // -ctz(width + height)
        add             r3, r5, r3  // r3 = address of the w<N> routine
        add             r5, r5, r12  // r5 = address of the h<N> routine
        vshr.u32        q15, q15, #1  // (width + height) >> 1
        vdup.32         q14, lr  // -ctz(width + height)
        add             r12, r0, r1  // second row pointer
        lsl             r1, r1, #1
        bx              r5

        .align 2
L(ipred_dc_tbl):
        .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
        .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB

L(ipred_dc_h4):  // left sum of 4 pixels; the post-increment leaves r2 at topleft
        vld1.16         {d0}, [r2, :64]!
        vpadd.i16       d0, d0, d0
        add             r2, r2, #2  // skip topleft: r2 = top row
        vpaddl.u16      d0, d0
        bx              r3
L(ipred_dc_w4):
        vld1.16         {d2}, [r2]
        vadd.i32        d0, d0, d30  // left sum + rounding term
        vpadd.i16       d2, d2, d2
        vpaddl.u16      d2, d2
        cmp             r4, #4
        vadd.i32        d0, d0, d2
        vshl.u32        d0, d0, d28  // negative shift count: >> ctz(width + height)
        beq             1f
        // h = 8/16
        cmp             r4, #16
        movw            lr, #0x6667  // h = 16: width + height = 5 * 4
        movw            r5, #0xAAAB  // h = 8: width + height = 3 * 4
        it              ne
        movne           lr, r5
        vdup.32         d24, lr
        vmul.i32        d0, d0, d24
        vshr.u32        d0, d0, #17
1:
        vdup.16         d0, d0[0]
2:
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        subs            r4, r4, #4
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d0}, [r12, :64], r1
        bgt             2b
        pop             {r4-r6, pc}

L(ipred_dc_h8):
        vld1.16         {d0, d1}, [r2, :128]!
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        add             r2, r2, #2  // skip topleft: r2 = top row
        vpaddl.u16      d0, d0
        bx              r3
L(ipred_dc_w8):
        vld1.16         {d2, d3}, [r2]
        vadd.i32        d0, d0, d30
        vadd.i16        d2, d2, d3
        vpadd.i16       d2, d2, d2
        vpaddl.u16      d2, d2
        cmp             r4, #8
        vadd.i32        d0, d0, d2
        vshl.u32        d0, d0, d28
        beq             1f
        // h = 4/16/32
        cmp             r4, #32
        movw            lr, #0x6667  // h = 32: width + height = 5 * 8
        movw            r5, #0xAAAB  // h = 4/16: width + height = 3 * 4 or 3 * 8
        it              ne
        movne           lr, r5
        vdup.32         d24, lr
        vmul.i32        d0, d0, d24
        vshr.u32        d0, d0, #17
1:
        vdup.16         q0, d0[0]
2:
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1}, [r0, :128], r1
        vst1.16         {d0, d1}, [r12, :128], r1
        bgt             2b
        pop             {r4-r6, pc}

L(ipred_dc_h16):
        vld1.16         {d0, d1, d2, d3}, [r2, :128]!
        vadd.i16        q0, q0, q1
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        add             r2, r2, #2  // skip topleft: r2 = top row
        vpaddl.u16      d0, d0
        bx              r3
L(ipred_dc_w16):
        vld1.16         {d2, d3, d4, d5}, [r2]
        vadd.i32        d0, d0, d30
        vadd.i16        q1, q1, q2
        vadd.i16        d2, d2, d3
        vpadd.i16       d2, d2, d1  // only the low half of the result is used below
        vpaddl.u16      d2, d2
        cmp             r4, #16
        vadd.i32        d0, d0, d2
        vshl.u32        d4, d0, d28
        beq             1f
        // h = 4/8/32/64
        tst             r4, #(32+16+8) // 16 added to make a consecutive bitmask
        movw            lr, #0x6667  // h = 4/64 (no mask bit set): factor 5
        movw            r5, #0xAAAB  // h = 8/32: factor 3
        it              ne
        movne           lr, r5
        vdup.32         d24, lr
        vmul.i32        d4, d4, d24
        vshr.u32        d4, d4, #17
1:
        vdup.16         q0, d4[0]
        vdup.16         q1, d4[0]
2:
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             2b
        pop             {r4-r6, pc}

L(ipred_dc_h32):
        vld1.16         {d0, d1, d2, d3}, [r2, :128]!
        vld1.16         {d4, d5, d6, d7}, [r2, :128]!
        vadd.i16        q0, q0, q1
        vadd.i16        q2, q2, q3
        vadd.i16        q0, q0, q2
        vadd.i16        d0, d0, d1
        vpadd.i16       d0, d0, d0
        add             r2, r2, #2  // skip topleft: r2 = top row
        vpaddl.u16      d0, d0
        bx              r3
L(ipred_dc_w32):
        vld1.16         {d2, d3, d4, d5}, [r2]!
        vadd.i32        d0, d0, d30
        vld1.16         {d16, d17, d18, d19}, [r2]
        vadd.i16        q1, q1, q2
        vadd.i16        q8, q8, q9
        vadd.i16        q1, q1, q8
        vadd.i16        d2, d2, d3
        vpadd.i16       d2, d2, d2
        vpaddl.u16      d2, d2
        cmp             r4, #32
        vadd.i32        d0, d0, d2
        vshl.u32        d4, d0, d28
        beq             1f
        // h = 8/16/64
        cmp             r4, #8
        movw            lr, #0x6667  // h = 8: width + height = 5 * 8
        movw            r5, #0xAAAB  // h = 16/64: factor 3
        it              ne
        movne           lr, r5
        vdup.32         d24, lr
        vmul.i32        d4, d4, d24
        vshr.u32        d4, d4, #17
1:
        sub             r1, r1, #32  // compensate for the first, post-incremented store
        vdup.16         q0, d4[0]
        vdup.16         q1, d4[0]
2:
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        subs            r4, r4, #4
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128], r1
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             2b
        pop             {r4-r6, pc}

L(ipred_dc_h64):
        vld1.16         {d0, d1, d2, d3}, [r2, :128]!
        vld1.16         {d4, d5, d6, d7}, [r2, :128]!
        vadd.i16        q0, q0, q1
        vld1.16         {d16, d17, d18, d19}, [r2, :128]!
        vadd.i16        q2, q2, q3
        vld1.16         {d20, d21, d22, d23}, [r2, :128]!
        vadd.i16        q8, q8, q9
        vadd.i16        q10, q10, q11
        vadd.i16        q0, q0, q2
        vadd.i16        q8, q8, q10
        vadd.i16        q0, q0, q8
        vadd.i16        d0, d0, d1
        vpaddl.u16      d0, d0
        add             r2, r2, #2  // skip topleft: r2 = top row
        vpadd.i32       d0, d0, d0
        bx              r3
L(ipred_dc_w64):
        vld1.16         {d2, d3, d4, d5}, [r2]!
        vadd.i32        d0, d0, d30
        vld1.16         {d16, d17, d18, d19}, [r2]!
        vadd.i16        q1, q1, q2
        vld1.16         {d20, d21, d22, d23}, [r2]!
        vadd.i16        q8, q8, q9
        vld1.16         {d24, d25, d26, d27}, [r2]!
        vadd.i16        q10, q10, q11
        vadd.i16        q12, q12, q13
        vadd.i16        q1, q1, q8
        vadd.i16        q10, q10, q12
        vadd.i16        q1, q1, q10
        vadd.i16        d2, d2, d3
        vpaddl.u16      d2, d2
        vpadd.i32       d2, d2, d2
        cmp             r4, #64
        vadd.i32        d0, d0, d2
        vshl.u32        d4, d0, d28
        beq             1f
        // h = 16/32
        cmp             r4, #16
        movw            lr, #0x6667  // h = 16: width + height = 5 * 16
        movw            r5, #0xAAAB  // h = 32: width + height = 3 * 32
        it              ne
        movne           lr, r5
        vdup.32         d24, lr
        vmul.i32        d4, d4, d24
        vshr.u32        d4, d4, #17
1:
        sub             r1, r1, #96  // compensate for the three post-incremented stores
        vdup.16         q0, d4[0]
        vdup.16         q1, d4[0]
2:      // two rows per iteration
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
        subs            r4, r4, #2
        vst1.16         {d0, d1, d2, d3}, [r0, :128]!
        vst1.16         {d0, d1, d2, d3}, [r12, :128]!
vst1.16         {d0, d1, d2, d3}, [r0, :128], r1  // continues the width 64 store loop of ipred_dc_16bpc_neon
        vst1.16         {d0, d1, d2, d3}, [r12, :128], r1
        bgt             2b
        pop             {r4-r6, pc}
endfunc

// Paeth intra prediction, 16 bpc. For every pixel, base = left + top - topleft
// is formed and the output is whichever of left, top and topleft is closest
// to base: left if ldiff <= min(tdiff, tldiff), otherwise top if
// tdiff <= tldiff, otherwise topleft.
// q2 holds topleft replicated, r6 walks the top row (starting one pixel after
// topleft) and r2 walks the left pixels backwards from just before topleft.
// q4 is used as scratch, hence the vpush/vpop around the function body.
// r0 = dst, r1 = stride in bytes, r2 = topleft, r3 = width; height is loaded
// from the stack into r4 and used as the row counter.
//
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_paeth_16bpc_neon, export=1
        push            {r4-r6, lr}
        vpush           {q4}
        ldr             r4, [sp, #32]  // height (16 bytes of core + 16 bytes of NEON registers pushed)
        clz             lr, r3
        adr             r12, L(ipred_paeth_tbl)
        sub             lr, lr, #25  // table index: 0 for width 64 ... 4 for width 4
        ldr             lr, [r12, lr, lsl #2]
        vld1.16         {d4[], d5[]}, [r2]  // q2 = topleft in every lane
        add             r6, r2, #2  // r6 = top row
        sub             r2, r2, #4  // r2 = two pixels before topleft (left pixels of rows 1 and 0)
        add             r12, r12, lr
        mov             r5, #-4  // step back two left pixels per load
        add             lr, r0, r1  // second row pointer
        lsl             r1, r1, #1
        bx              r12

        .align 2
L(ipred_paeth_tbl):
        .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB
        .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB
        .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB
        .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB
        .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB
40:     // width 4: four rows per iteration, two rows per q register
        sub             r2, r2, #4  // four left pixels are fetched at once
        mov             r5, #-8
        vld1.16         {d6}, [r6]
        vsub.i16        d16, d6, d4  // top - topleft
        vmov            d7, d6  // q3 = top row twice
        vmov            d17, d16  // q8 = (top - topleft) twice
4:
        // d3 = left pixel of the first row ... d0 = left pixel of the fourth
        vld4.16         {d0[], d1[], d2[], d3[]}, [r2, :64], r5
        vadd.i16        q9, q8, q0  // base
        vadd.i16        q10, q8, q1
        vabd.s16        q11, q3, q9  // tdiff
        vabd.s16        q12, q3, q10
        vabd.s16        q13, q2, q9  // tldiff
        vabd.s16        q14, q2, q10
        vabd.s16        q9, q0, q9  // ldiff
        vabd.s16        q10, q1, q10
        vmin.u16        q15, q11, q13  // min(tdiff, tldiff)
        vmin.u16        q4, q12, q14
        vcge.u16        q11, q13, q11  // tldiff >= tdiff
        vcge.u16        q12, q14, q12
        vcge.u16        q9, q15, q9  // min(tdiff, tldiff) >= ldiff
        vcge.u16        q10, q4, q10
        vbsl            q12, q3, q2  // tdiff <= tldiff ? top : topleft
        vbsl            q11, q3, q2
        vbit            q12, q1, q10  // ldiff <= min ? left : ...
        vbit            q11, q0, q9
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d24}, [lr, :64], r1
        subs            r4, r4, #4
        vst1.16         {d23}, [r0, :64], r1
        vst1.16         {d22}, [lr, :64], r1
        bgt             4b
        vpop            {q4}
        pop             {r4-r6, pc}
80:
160:
320:
640:    // width >= 8: two rows at a time, 8 pixels per inner iteration
        vld1.16         {q3}, [r6]!
        mov             r12, r3  // keep the width for restarting each row pair
        sub             r1, r1, r3, lsl #1  // the inner loop already advances by width pixels
1:
        vld2.16         {d0[], d2[]}, [r2, :32], r5
        vmov            d1, d0  // q0 = left pixel of the second row (stored via lr)
        vmov            d3, d2  // q1 = left pixel of the first row (stored via r0)
2:
        vsub.i16        q8, q3, q2  // top - topleft
        vadd.i16        q9, q8, q0  // base
        vadd.i16        q10, q8, q1
        vabd.s16        q11, q3, q9  // tdiff
        vabd.s16        q12, q3, q10
        vabd.s16        q13, q2, q9  // tldiff
        vabd.s16        q14, q2, q10
        vabd.s16        q9, q0, q9  // ldiff
        vabd.s16        q10, q1, q10
        vmin.u16        q15, q11, q13  // min(tdiff, tldiff)
        vmin.u16        q4, q12, q14
        vcge.u16        q11, q13, q11  // tldiff >= tdiff
        vcge.u16        q12, q14, q12
        vcge.u16        q9, q15, q9  // min(tdiff, tldiff) >= ldiff
        vcge.u16        q10, q4, q10
        vbsl            q12, q3, q2  // tdiff <= tldiff ? top : topleft
        vbsl            q11, q3, q2
        vbit            q12, q1, q10  // ldiff <= min ? left : ...
        vbit            q11, q0, q9
        subs            r3, r3, #8
        vst1.16         {q12}, [r0, :128]!
        vst1.16         {q11}, [lr, :128]!
        ble             8f
        vld1.16         {q3}, [r6]!  // next 8 top pixels
        b               2b
8:
        subs            r4, r4, #2
        ble             9f
        // End of horizontal loop, move pointers to next two rows
        sub             r6, r6, r12, lsl #1  // rewind the top row pointer
        add             r0, r0, r1
        add             lr, lr, r1
        vld1.16         {q3}, [r6]!
        mov             r3, r12
        b               1b
9:
        vpop            {q4}
        pop             {r4-r6, pc}
endfunc

// Everything from here up to the matching #endif is excluded from the build.
#if 0
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_smooth_16bpc_neon, export=1
        push            {r4-r10, lr}
        ldr             r4, [sp, #32]
        movrel          r10, X(sm_weights)
        add             r12, r10, r4
        add             r10, r10, r3
        clz             r9, r3
        adr             r5, L(ipred_smooth_tbl)
        sub             lr, r2, r4, lsl #1
        sub             r9, r9, #25
        ldr             r9, [r5, r9, lsl #2]
        vld1.16         {d4[], d5[]}, [lr] // bottom
        add             r8, r2, #2
        add             r5, r5, r9
        add             r6, r0, r1
        lsl             r1, r1, #1
        bx              r5

        .align 2
L(ipred_smooth_tbl):
        .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB
        .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB
        .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB
        .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB
        .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB
40:
        vld1.16         {d16}, [r8] // top
        vld1.32         {d18[]}, [r10, :32] // weights_hor
        sub             r2, r2, #8
        mov             r7, #-8
        vdup.16         q3, d16[3] // right
        vsub.i16        q8, q8, q2 // top-bottom
        vmovl.u8        q9, d18 //
weights_hor vadd.i16 d19, d4, d6 // bottom+right 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver vshll.u16 q12, d19, #8 // (bottom+right)*256 vshll.u16 q13, d19, #8 vshll.u16 q14, d19, #8 vshll.u16 q15, d19, #8 vzip.32 d20, d21 // weights_ver vzip.32 d22, d23 vsub.i16 q1, q1, q3 // left-right vsub.i16 q0, q0, q3 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor vmlal.s16 q13, d2, d18 // (left flipped) vmlal.s16 q14, d1, d18 vmlal.s16 q15, d0, d18 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d16, d21 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d16, d23 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d27}, [r6, :64], r1 bgt 4b pop {r4-r10, pc} 80: vld1.16 {q8}, [r8] // top vld1.8 {d18}, [r10, :64] // weights_hor sub r2, r2, #4 mov r7, #-4 vdup.16 q3, d17[3] // right vsub.i16 q8, q8, q2 // top-bottom vmovl.u8 q9, d18 // weights_hor vadd.i16 d3, d4, d6 // bottom+right 8: vld2.16 {d0[], d1[]}, [r2, :32], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! 
// weights_ver vshll.u16 q12, d3, #8 // (bottom+right)*256 vshll.u16 q13, d3, #8 vshll.u16 q14, d3, #8 vshll.u16 q15, d3, #8 vsub.i16 q0, q0, q3 // left-right vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor vmlal.s16 q13, d1, d19 // (left flipped) vmlal.s16 q14, d0, d18 vmlal.s16 q15, d0, d19 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d17, d20 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d17, d22 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 subs r4, r4, #2 vst1.16 {q12}, [r0, :128], r1 vst1.16 {q13}, [r6, :128], r1 bgt 8b pop {r4-r10, pc} 160: 320: 640: add lr, r2, r3, lsl #1 sub r2, r2, #4 mov r7, #-4 vld1.16 {d6[], d7[]}, [lr] // right sub r1, r1, r3, lsl #1 mov r9, r3 vadd.i16 d3, d4, d6 // bottom+right 1: vld2.16 {d0[], d1[]}, [r2, :32], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vsub.i16 q0, q0, q3 // left-right vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 2: vld1.8 {d18}, [r10, :64]! // weights_hor vld1.16 {q8}, [r8]! // top vshll.u16 q12, d3, #8 // (bottom+right)*256 vshll.u16 q13, d3, #8 vmovl.u8 q9, d18 // weights_hor vshll.u16 q14, d3, #8 vshll.u16 q15, d3, #8 vsub.i16 q8, q8, q2 // top-bottom vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor vmlal.s16 q13, d1, d19 // (left flipped) vmlal.s16 q14, d0, d18 vmlal.s16 q15, d0, d19 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d17, d20 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d17, d22 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 subs r3, r3, #8 vst1.16 {q12}, [r0, :128]! vst1.16 {q13}, [r6, :128]! 
bgt 2b subs r4, r4, #2 ble 9f sub r8, r8, r9, lsl #1 sub r10, r10, r9 add r0, r0, r1 add r6, r6, r1 mov r3, r9 b 1b 9: pop {r4-r10, pc} endfunc // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_16bpc_neon, export=1 push {r4-r7, lr} ldr r4, [sp, #20] movrel r7, X(sm_weights) add r7, r7, r4 clz lr, r3 adr r5, L(ipred_smooth_v_tbl) sub r12, r2, r4, lsl #1 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.16 {d4[], d5[]}, [r12] // bottom add r2, r2, #2 add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_v_tbl): .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB 40: vld1.16 {d6}, [r2] // top vsub.i16 d6, d6, d4 // top-bottom vmov d7, d6 4: vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver vzip.32 d16, d17 // weights_ver vzip.32 d18, d19 vshll.u8 q8, d16, #7 // weights_ver << 7 vshll.u8 q9, d18, #7 vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q11, q3, q9 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vst1.16 {d20}, [r0, :64], r1 vst1.16 {d21}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d22}, [r0, :64], r1 vst1.16 {d23}, [r6, :64], r1 bgt 4b pop {r4-r7, pc} 80: vld1.16 {q3}, [r2] // top vsub.i16 q3, q3, q2 // top-bottom 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! 
// weights_ver vshll.u8 q8, d16, #7 // weights_ver << 7 vshll.u8 q9, d18, #7 vshll.u8 q10, d20, #7 vshll.u8 q11, d22, #7 vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q9, q3, q9 vqrdmulh.s16 q10, q3, q10 vqrdmulh.s16 q11, q3, q11 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vst1.16 {q8}, [r0, :128], r1 vst1.16 {q9}, [r6, :128], r1 subs r4, r4, #4 vst1.16 {q10}, [r0, :128], r1 vst1.16 {q11}, [r6, :128], r1 bgt 8b pop {r4-r7, pc} 160: 320: 640: vpush {q4-q7} // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3, lsl #1 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver vshll.u8 q4, d8, #7 // weights_ver << 7 vshll.u8 q5, d10, #7 vshll.u8 q6, d12, #7 vshll.u8 q7, d14, #7 2: vld1.16 {q0, q1}, [r2]! // top vsub.i16 q0, q0, q2 // top-bottom vsub.i16 q1, q1, q2 vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q9, q1, q4 vqrdmulh.s16 q10, q0, q5 vqrdmulh.s16 q11, q1, q5 vqrdmulh.s16 q12, q0, q6 vqrdmulh.s16 q13, q1, q6 vqrdmulh.s16 q14, q0, q7 vqrdmulh.s16 q15, q1, q7 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q2 vadd.i16 q14, q14, q2 vadd.i16 q15, q15, q2 subs r3, r3, #16 vst1.16 {q8, q9}, [r0, :128]! vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r5, :128]! vst1.16 {q14, q15}, [lr, :128]! 
bgt 2b subs r4, r4, #4 ble 9f sub r2, r2, r12, lsl #1 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r7, pc} endfunc // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_16bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] movrel r8, X(sm_weights) add r8, r8, r3 clz lr, r3 adr r5, L(ipred_smooth_h_tbl) add r12, r2, r3, lsl #1 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.16 {d4[], d5[]}, [r12] // right add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_h_tbl): .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r8, :32] // weights_hor sub r2, r2, #8 mov r7, #-8 vshll.u8 q3, d6, #7 // weights_hor << 7 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left vsub.i16 q0, q0, q2 // left-right vsub.i16 q1, q1, q2 subs r4, r4, #4 vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q0, q3 // (left flipped) vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vst1.16 {d17}, [r0, :64], r1 vst1.16 {d16}, [r6, :64], r1 vst1.16 {d19}, [r0, :64], r1 vst1.16 {d18}, [r6, :64], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8, :64] // weights_hor sub r2, r2, #8 mov r7, #-8 vshll.u8 q3, d6, #7 // weights_hor << 7 8: vld1.16 {d23}, [r2, :64], r7 // left subs r4, r4, #4 vsub.i16 d23, d23, d4 // left-right vdup.16 q8, d23[3] // flip left vdup.16 q9, d23[2] vdup.16 q10, d23[1] vdup.16 q11, d23[0] vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q9, q3 vqrdmulh.s16 q10, q10, q3 vqrdmulh.s16 q11, q11, q3 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, 
q11, q2 vst1.16 {q8}, [r0, :128], r1 vst1.16 {q9}, [r6, :128], r1 vst1.16 {q10}, [r0, :128], r1 vst1.16 {q11}, [r6, :128], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vpush {q4-q7} sub r2, r2, #8 mov r7, #-8 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3, lsl #1 mov r12, r3 1: vld1.16 {d15}, [r2, :64], r7 // left vsub.i16 d15, d15, d4 // left-right vdup.16 q4, d15[3] // flip left vdup.16 q5, d15[2] vdup.16 q6, d15[1] vdup.16 q7, d15[0] 2: vld1.8 {q1}, [r8, :128]! // weights_hor subs r3, r3, #16 vshll.u8 q0, d2, #7 // weights_hor << 7 vshll.u8 q1, d3, #7 vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q1, q4 vqrdmulh.s16 q10, q0, q5 vqrdmulh.s16 q11, q1, q5 vqrdmulh.s16 q12, q0, q6 vqrdmulh.s16 q13, q1, q6 vqrdmulh.s16 q14, q0, q7 vqrdmulh.s16 q15, q1, q7 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q2 vadd.i16 q14, q14, q2 vadd.i16 q15, q15, q2 vst1.16 {q8, q9}, [r0, :128]! vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r5, :128]! vst1.16 {q14, q15}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc #endif // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height, // const int bitdepth_max); .macro filter_fn bpc function ipred_filter_\bpc\()bpc_neon, export=1 movw r12, #511 ldrd r4, r5, [sp, #88] and r5, r5, r12 // 511 movrel r6, X(filter_intra_taps) lsl r5, r5, #6 add r6, r6, r5 vld1.8 {d20, d21, d22, d23}, [r6, :128]! 
clz lr, r3 adr r5, L(ipred_filter\bpc\()_tbl) vld1.8 {d27, d28, d29}, [r6, :64] sub lr, lr, #26 ldr lr, [r5, lr, lsl #2] vmovl.s8 q8, d20 vmovl.s8 q9, d21 add r5, r5, lr vmovl.s8 q10, d22 vmovl.s8 q11, d23 add r6, r0, r1 lsl r1, r1, #1 vmovl.s8 q12, d27 vmovl.s8 q13, d28 vmovl.s8 q14, d29 mov r7, #-4 vdup.16 q15, r8 add r8, r2, #2 sub r2, r2, #4 .if \bpc == 10 vmov.i16 q7, #0 .endif bx r5 .align 2 L(ipred_filter\bpc\()_tbl): .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB 40: vld1.16 {d0}, [r8] // top (0-3) 4: vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) .if \bpc == 10 vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vrshr.s16 q2, q2, #4 vmax.s16 q2, q2, q7 .else vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) vqrshrun.s32 d4, q2, #4 vqrshrun.s32 d5, q3, #4 .endif vmin.s16 q2, q2, q15 subs r4, r4, #2 vst1.16 {d4}, [r0, :64], r1 vst1.16 
{d5}, [r6, :64], r1 vmov d0, d5 // move top from [4-7] to [0-3] bgt 4b vpop {q4-q7} pop {r4-r8, pc} 80: vld1.16 {q0}, [r8] // top (0-7) 8: vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) .if \bpc == 10 vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) vrshr.s16 q2, q2, #4 vmax.s16 q2, q2, q7 vmin.s16 q2, q2, q15 vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6) vrshr.s16 q3, q3, #4 vmax.s16 q3, q3, q7 .else vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) vqrshrun.s32 d4, q2, #4 vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1) vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2) vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3) vqrshrun.s32 d5, q3, #4 vmin.s16 q2, q2, q15 vmlal.s16 q4, d24, d1[3] // p4(top[3]) * 
filter(4) vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0) vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5) vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6) vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1) vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2) vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3) vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4) vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0) vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5) vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6) vqrshrun.s32 d6, q4, #4 vqrshrun.s32 d7, q5, #4 .endif vmin.s16 q3, q3, q15 vswp d5, d6 subs r4, r4, #2 vst1.16 {q2}, [r0, :128], r1 vmov q0, q3 vst1.16 {q3}, [r6, :128], r1 bgt 8b vpop {q4-q7} pop {r4-r8, pc} 160: 320: sub r1, r1, r3, lsl #1 mov lr, r3 1: vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2) 2: vld1.16 {q1, q2}, [r8]! // top(0-15) .if \bpc == 10 vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) vrshr.s16 q3, q3, #4 vmax.s16 q3, q3, q7 vmin.s16 q3, q3, q15 vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5) vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6) vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) vrshr.s16 q4, q4, #4 vmax.s16 q4, q4, q7 vmin.s16 q4, q4, q15 vmov q0, q4 vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) vmla.i16 q5, q13, d0[3] 
// p5(left[0]) * filter(5) vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1) vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2) vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3) vrshr.s16 q5, q5, #4 vmax.s16 q5, q5, q7 vmin.s16 q5, q5, q15 vmov q0, q5 vmov.u16 r12, d5[3] vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4) vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0) vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6) vmov.16 d0[2], r12 subs r3, r3, #16 vrshr.s16 q6, q6, #4 .else vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0) vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6) vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4) vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0) vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5) vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6) vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1) vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2) vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3) vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4) vqrshrun.s32 d6, q3, #4 vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1) vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2) vqrshrun.s32 d7, q4, #4 vmin.s16 q3, q3, q15 vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3) vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4) vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0) vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5) vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6) vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1) vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2) vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3) vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4) vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0) 
vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5) vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6) vqrshrun.s32 d8, q5, #4 vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1) vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2) vmlal.s16 q7, d22, d4[2] // p3(top[2]) * filter(3) vqrshrun.s32 d9, q6, #4 vmin.s16 q0, q4, q15 vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4) vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0) vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6) vmin.s16 q4, q4, q15 vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1) vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2) vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3) vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4) vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0) vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6) vqrshrun.s32 d10, q7, #4 vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1) vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2) vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3) vqrshrun.s32 d11, q6, #4 vmin.s16 q0, q5, q15 vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4) vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0) vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6) vmin.s16 q5, q5, q15 vmov.u16 r12, d5[3] vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1) vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2) vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3) vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4) vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0) vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6) vmov.16 d0[2], r12 vqrshrun.s32 d12, q1, #4 subs r3, r3, #16 vqrshrun.s32 d13, q7, #4 .endif vswp q4, q5 .if \bpc == 10 vmax.s16 q6, q6, q7 .endif vswp d7, d10 vmin.s16 q6, q6, q15 vswp d9, d12 vst1.16 {q3, q4}, [r0, :128]! vst1.16 {q5, q6}, [r6, :128]! 
ble 8f vmov.u16 r12, d13[3] vmov.16 d0[0], r12 vmov.u16 r12, d9[3] vmov.16 d0[1], r12 b 2b 8: subs r4, r4, #2 ble 9f sub r8, r6, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc .endm filter_fn 10 filter_fn 12 function ipred_filter_16bpc_neon, export=1 push {r4-r8, lr} vpush {q4-q7} movw r12, 0x3ff ldr r8, [sp, #104] cmp r8, r12 ble ipred_filter_10bpc_neon b ipred_filter_12bpc_neon endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] ldr r5, [sp, #16] vld1.16 {q14}, [r2, :128] clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 vmov.i8 q13, #7 ldr lr, [r12, lr, lsl #2] vmov.i16 q15, #0x100 add r12, r12, lr add r2, r0, r1 bx r12 .align 2 L(pal_pred_tbl): .word 640f - L(pal_pred_tbl) + CONFIG_THUMB .word 320f - L(pal_pred_tbl) + CONFIG_THUMB .word 160f - L(pal_pred_tbl) + CONFIG_THUMB .word 80f - L(pal_pred_tbl) + CONFIG_THUMB .word 40f - L(pal_pred_tbl) + CONFIG_THUMB 40: lsl r1, r1, #1 4: vld1.8 {d2}, [r3, :64]! subs r5, r5, #4 vshr.u8 d3, d2, #4 vand.u8 d2, d2, d26 vzip.8 d2, d3 // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... vadd.i8 q0, q1, q1 vadd.i8 q1, q1, q1 vzip.8 q0, q1 vadd.i16 q0, q0, q15 vadd.i16 q1, q1, q15 vtbl.8 d0, {q14}, d0 vtbl.8 d1, {q14}, d1 vst1.16 {d0}, [r0, :64], r1 vtbl.8 d2, {q14}, d2 vst1.16 {d1}, [r2, :64], r1 vtbl.8 d3, {q14}, d3 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d3}, [r2, :64], r1 bgt 4b pop {r4-r5, pc} 80: lsl r1, r1, #1 8: vld1.8 {q1}, [r3, :64]! subs r5, r5, #4 vshr.u8 q2, q1, #4 vand.u8 q1, q1, q13 vzip.8 q1, q2 // Prefer doing the adds twice, instead of chaining a vmov after // the add. 
vadd.i8 q0, q1, q1 vadd.i8 q1, q1, q1 vadd.i8 q3, q2, q2 vadd.i8 q2, q2, q2 vzip.8 q0, q1 vzip.8 q2, q3 vadd.i16 q0, q0, q15 vadd.i16 q1, q1, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q2, q2, q15 vtbl.8 d1, {q14}, d1 vadd.i16 q3, q3, q15 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vst1.16 {q0}, [r0, :128], r1 vtbl.8 d6, {q14}, d6 vst1.16 {q1}, [r2, :128], r1 vtbl.8 d7, {q14}, d7 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r2, :128], r1 bgt 8b pop {r4-r5, pc} 160: lsl r1, r1, #1 16: vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #4 vand.u8 q2, q10, q13 vshr.u8 q3, q10, #4 vand.u8 q10, q11, q13 vshr.u8 q11, q11, #4 vzip.8 q2, q3 vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128], r1 vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r2, :128], r1 vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r0, :128], r1 vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r2, :128], r1 bgt 16b pop {r4-r5, pc} 320: lsl r1, r1, #1 sub r1, r1, #32 32: vld1.8 {q10, q11}, [r3, :64]! 
subs r5, r5, #2 vand.u8 q2, q10, q13 vshr.u8 q3, q10, #4 vand.u8 q10, q11, q13 vshr.u8 q11, q11, #4 vzip.8 q2, q3 vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128]! vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r0, :128], r1 vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r2, :128]! vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r2, :128], r1 bgt 32b pop {r4-r5, pc} 640: sub r1, r1, #96 64: vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #1 vand.u8 q2, q10, q13 vshr.u8 q3, q10, #4 vand.u8 q10, q11, q13 vshr.u8 q11, q11, #4 vzip.8 q2, q3 vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128]! vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r0, :128]! 
vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r0, :128]! vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r0, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_128_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] clz lr, r3 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_128_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vrshr.u16 q0, q15, #1 vdup.16 q1, r6 // alpha add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r12 .align 2 L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB L(ipred_cfl_splat_w4): vld1.16 {q8, q9}, [r5, :128]! vmull.s16 q2, d16, d2 // diff = ac * alpha vmull.s16 q3, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q8, #31 vshr.s32 q13, q9, #31 vadd.i32 q2, q2, q10 // diff + sign vadd.i32 q3, q3, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q8, #6 vrshrn.i32 d7, q9, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vst1.16 {d4}, [r0, :64], r1 vst1.16 {d5}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d6}, [r0, :64], r1 vst1.16 {d7}, [r6, :64], r1 bgt L(ipred_cfl_splat_w4) pop {r4-r8, pc} L(ipred_cfl_splat_w8): vld1.16 {q8, q9}, [r5, :128]! 
subs r4, r4, #2 vmull.s16 q2, d16, d2 // diff = ac * alpha vmull.s16 q3, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q8, #31 vshr.s32 q13, q9, #31 vadd.i32 q2, q2, q10 // diff + sign vadd.i32 q3, q3, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q8, #6 vrshrn.i32 d7, q9, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r6, :128], r1 bgt L(ipred_cfl_splat_w8) pop {r4-r8, pc} L(ipred_cfl_splat_w16): vpush {q4-q7} add r12, r5, r3, lsl #1 sub r1, r1, r3, lsl #1 mov lr, r3 1: vld1.16 {q6, q7}, [r5, :128]! vmull.s16 q2, d12, d2 // diff = ac * alpha vld1.16 {q8, q9}, [r12, :128]! vmull.s16 q3, d13, d3 vmull.s16 q4, d14, d2 vmull.s16 q5, d15, d3 vmull.s16 q6, d16, d2 vmull.s16 q7, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q4, #31 vshr.s32 q13, q5, #31 vadd.i32 q2, q2, q10 // diff + sign vshr.s32 q10, q6, #31 vadd.i32 q3, q3, q11 vshr.s32 q11, q7, #31 vadd.i32 q4, q4, q12 vshr.s32 q12, q8, #31 vadd.i32 q5, q5, q13 vshr.s32 q13, q9, #31 vadd.i32 q6, q6, q10 vadd.i32 q7, q7, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q4, #6 vrshrn.i32 d7, q5, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vrshrn.i32 d8, q6, #6 vrshrn.i32 d9, q7, #6 vadd.i16 q3, q3, q0 vrshrn.i32 d10, q8, #6 vrshrn.i32 d11, q9, #6 vadd.i16 q4, q4, q0 vadd.i16 q5, q5, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmax.s16 q4, q4, q14 vmax.s16 q5, q5, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vmin.s16 q4, q4, q15 vmin.s16 q5, q5, q15 subs r3, r3, #16 vst1.16 {q2, q3}, [r0, :128]! 
vst1.16 {q4, q5}, [r6, :128]! bgt 1b subs r4, r4, #2 add r5, r5, lr, lsl #1 add r12, r12, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr bgt 1b vpop {q4-q7} pop {r4-r8, pc} endfunc // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_top_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] clz lr, r3 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_top_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vdup.16 q1, r6 // alpha add r2, r2, #2 add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r12 .align 2 L(ipred_cfl_top_tbl): .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB 4: vld1.16 {d0}, [r2] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) 8: vld1.16 {q0}, [r2] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) 16: vld1.16 {q2, q3}, [r2] vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) 32: vld1.16 {q8, q9}, [r2]! 
vld1.16 {q10, q11}, [r2] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q8, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d0, q0, #5 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_left_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] sub r2, r2, r4, lsl #1 clz lr, r3 clz r8, r4 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_splat_tbl) adr r7, L(ipred_cfl_left_tbl) sub lr, lr, #26 sub r8, r8, #26 ldr lr, [r12, lr, lsl #2] ldr r8, [r7, r8, lsl #2] vdup.16 q1, r6 // alpha add r12, r12, lr add r7, r7, r8 add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r7 .align 2 L(ipred_cfl_left_tbl): .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB L(ipred_cfl_left_h4): vld1.16 {d0}, [r2, :64] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h8): vld1.16 {q0}, [r2, :128] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h16): vld1.16 {q2, q3}, [r2, :128] vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h32): vld1.16 {q8, q9}, [r2, :128]! 
vld1.16 {q10, q11}, [r2, :128] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q8, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d0, q0, #5 vdup.16 q0, d0[0] bx r12 endfunc // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] sub r2, r2, r4, lsl #1 add r8, r3, r4 // width + height vdup.16 q1, r6 // alpha clz lr, r3 clz r6, r4 vdup.32 d16, r8 // width + height vdup.16 q15, r7 // bitdepth_max adr r7, L(ipred_cfl_tbl) rbit r8, r8 // rbit(width + height) sub lr, lr, #22 // 26 leading bits, minus table offset 4 sub r6, r6, #26 clz r8, r8 // ctz(width + height) ldr lr, [r7, lr, lsl #2] ldr r6, [r7, r6, lsl #2] neg r8, r8 // -ctz(width + height) add r12, r7, lr add r7, r7, r6 vshr.u32 d16, d16, #1 // (width + height) >> 1 vdup.32 d17, r8 // -ctz(width + height) add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r7 .align 2 L(ipred_cfl_tbl): .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB L(ipred_cfl_h4): vld1.16 {d0}, [r2, :64]! 
vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w4): vld1.16 {d1}, [r2] vadd.i32 d0, d0, d16 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #4 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 8/16 cmp r4, #16 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): vld1.16 {q0}, [r2, :128]! vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w8): vld1.16 {q2}, [r2] vadd.i32 d0, d0, d16 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #8 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): vld1.16 {q2, q3}, [r2, :128]! vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w16): vld1.16 {q2, q3}, [r2] vadd.i32 d0, d0, d16 vadd.i16 q2, q2, q3 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #16 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): vld1.16 {q2, q3}, [r2, :128]! vld1.16 {q10, q11}, [r2, :128]! vadd.i16 q2, q2, q3 vadd.i16 q10, q10, q11 vadd.i16 q0, q2, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w32): vld1.16 {q2, q3}, [r2]! vadd.i32 d0, d0, d16 vld1.16 {q10, q11}, [r2]! 
vadd.i16 q2, q2, q3 vadd.i16 q10, q10, q11 vadd.i16 q2, q2, q10 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #32 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_420_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i32 q8, #0 vmov.i32 q9, #0 vmov.i32 q10, #0 vmov.i32 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_420_tbl): .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w4): 1: // Copy and subsample input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #1 subs r8, r8, #2 vst1.16 {q0}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 bgt 1b cmp r4, #0 vmov d0, d1 vmov d2, d1 vmov d3, d1 L(ipred_cfl_ac_420_w4_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums vadd.i32 q8, q8, q9 vadd.i32 q10, q10, q11 vadd.i32 q0, q8, q10 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] 6: // Subtract dc from ac vld1.16 {q0, q1}, [r0, :128] subs r6, r6, #4 vsub.i16 q0, q0, q8 vsub.i16 q1, q1, q8 vst1.16 {q0, q1}, [r0, :128]! bgt 6b pop {r4-r8, pc} L(ipred_cfl_ac_420_w8): cmp r3, #0 bne L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vld1.16 {q12, q13}, [r1, :128], r2 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q3 vpadd.i16 d2, d24, d25 vpadd.i16 d3, d26, d27 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov q0, q1 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #1 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov q0, q1 L(ipred_cfl_ac_420_w8_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl r6, r6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): adr r7, L(ipred_cfl_ac_420_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_420_w16_tbl): .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w16_wpad0): sub r2, r2, #32 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q12, q13}, [r12, :128]! vld1.16 {q2, q3}, [r1, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vld1.16 {q12, q13}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vadd.i16 q2, q2, q12 vadd.i16 q3, q3, q13 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): sub r2, r2, #32 1: // Copy and subsample input, padding 4 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q12, q13}, [r12, :128]! vld1.16 {q2}, [r1, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vld1.16 {q12}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vadd.i16 q2, q2, q12 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vshl.i16 q0, q0, #1 vshl.i16 d2, d2, #1 subs r8, r8, #1 vdup.16 d3, d2[3] vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q12, q13}, [r12, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vshl.i16 q0, q0, #1 subs r8, r8, #1 vdup.16 q1, d1[3] vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q12}, [r12, :128], r2 vadd.i16 q0, q0, q12 vpadd.i16 d0, d0, d1 vshl.i16 d0, d0, #1 subs r8, r8, #1 vdup.16 q1, d0[3] vdup.16 d1, d0[3] vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl r6, r6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_422_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_422_tbl): .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w4): 1: // Copy and subsample input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): cmp r3, #0 bne L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vld1.16 {q12, q13}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d24, d24, d25 vpadd.i16 d25, d26, d27 vpadd.i16 d26, d4, d5 vpadd.i16 d27, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q12, #2 vshl.i16 q3, q13, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vld1.16 {q12}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d24, d24, d25 vpadd.i16 d25, d4, d5 vshl.i16 q0, q0, #2 vshl.i16 q12, q12, #2 vdup.16 d7, d25[3] vmov d6, d25 vdup.16 d5, d24[3] vmov d4, d24 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! 
vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): adr r7, L(ipred_cfl_ac_422_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_422_w16_tbl): .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w16_wpad0): sub r2, r2, #32 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r12, :128]! vld1.16 {q12, q13}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d24, d25 vpadd.i16 d3, d26, d27 vld1.16 {q12, q13}, [r12, :128], r2 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vpadd.i16 d6, d24, d25 vpadd.i16 d7, d26, d27 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): sub r2, r2, #32 1: // Copy and subsample input, padding 4 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r12, :128]! vld1.16 {q12}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d24, d25 vld1.16 {q12}, [r12, :128], r2 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vpadd.i16 d6, d24, d25 vshl.i16 q0, q0, #2 vshl.i16 d2, d2, #2 vshl.i16 q2, q2, #2 vshl.i16 d6, d6, #2 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q2, q2, #2 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #2 vdup.16 q3, d1[3] vdup.16 q1, d0[3] vdup.16 d5, d1[3] vmov d4, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! 
vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) endfunc // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_444_tbl) sub r8, r8, #26 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_444_tbl): .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w4): 1: // Copy and expand input vld1.16 {d0}, [r1, :64], r2 vld1.16 {d1}, [r12, :64], r2 vld1.16 {d2}, [r1, :64], r2 vld1.16 {d3}, [r12, :64], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): 1: // Copy and expand input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): cmp r3, #0 bne L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q2, q2, #3 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! 
vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): adr r7, L(ipred_cfl_ac_444_w32_tbl) ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 asr r2, r2, #1 add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_444_w32_tbl): .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w32_wpad0): sub r2, r2, #32 1: // Copy and expand input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): sub r2, r2, #32 1: // Copy and expand input, padding 8 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vdup.16 q3, d5[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): 1: // Copy and expand input, padding 16 vld1.16 {q0, q1}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! 
vdup.16 q2, d3[3] vdup.16 q3, d3[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): 1: // Copy and expand input, padding 24 vld1.16 {q0}, [r1, :128], r2 vshl.i16 q0, q0, #3 subs r8, r8, #1 vdup.16 q1, d1[3] vst1.16 {q0, q1}, [r0, :128]! vdup.16 q2, d1[3] vdup.16 q3, d1[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 L(ipred_cfl_ac_444_w32_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #1 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl r6, r6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/itx.S000066400000000000000000003403231517466257200225730ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); // Most of the functions use the following register layout: // r0-r3 external parameters // r4 function pointer to first transform // r5 function pointer to second transform // r6 output parameter for helper function // r7 input parameter for helper function // r8 input stride for helper function // r9 scratch variable for helper functions // r10-r11 pointer to list of eob thresholds, eob threshold value, // scratch variables within helper functions (backed up) // The SIMD registers most often use the following layout: // d0-d3 multiplication coefficients // d4-d7 scratch registers // d8-d15 unused in some transforms, used for scratch registers in others // d16-d31 inputs/outputs of transforms // Potential further optimizations, that
are left unimplemented for now:
// - Trying to keep multiplication coefficients in registers across multiple
//   transform functions. (The register layout is designed to potentially
//   allow this.)
// - Use a simplified version of the transforms themselves for cases where
//   we know a significant number of inputs are zero. E.g. if the eob value
//   indicates only a quarter of input values are set, for idct16 and up,
//   a significant amount of calculation can be skipped, at the cost of more
//   code duplication and special casing.

// Multiplication factors for the inverse DCTs, stored as signed 16-bit
// values and grouped by the transform size that introduces them.
// The transform macros narrow their 32-bit products with a rounding shift
// by 12 (vqrshrn #12), so the plain entries act as 12-bit fixed-point
// factors (e.g. 2896/4096 is roughly 0.7071).
// NOTE(review): the "2896*8" entry is presumably pre-scaled for vqrdmulh,
// the way the idct_dc macro uses its own 2896*8 constant; its users are
// outside this part of the file, so confirm there.
const idct_coeffs, align=4
        // idct4
        .short          2896, 2896*8, 1567, 3784
        // idct8
        .short          799,  4017,   3406, 2276
        // idct16
        .short          401,  4076,   3166, 2598
        .short          1931, 3612,   3920, 1189
        // idct32
        .short          201,  4091,   3035, 2751
        .short          1751, 3703,   3857, 1380
        .short          995,  3973,   3513, 2106
        .short          2440, 3290,   4052, 601
endconst

// Factors for the 64-point inverse DCT. Rows of "x*8" entries alternate
// with rows of plain 12-bit factors.
// NOTE(review): the x*8 rows look pre-scaled for vqrdmulh and the plain rows
// look meant for the widening multiply + vqrshrn #12 path; the code that
// reads this table is not visible here, so verify against it.
const idct64_coeffs, align=4
        .short          101*8,  4095*8, 2967*8, -2824*8
        .short          1660*8, 3745*8, 3822*8, -1474*8
        .short          4076,   401,    4017,   799

        .short          4036*8, -700*8, 2359*8, 3349*8
        .short          3461*8, -2191*8, 897*8, 3996*8
        .short          -3166,  -2598,  -799,   -4017

        .short          501*8,  4065*8, 3229*8, -2520*8
        .short          2019*8, 3564*8, 3948*8, -1092*8
        .short          3612,   1931,   2276,   3406

        .short          4085*8, -301*8, 2675*8, 3102*8
        .short          3659*8, -1842*8, 1285*8, 3889*8
        .short          -3920,  -1189,  -3406,  -2276
endconst

// Factors for the 4-point inverse ADST. The iadst_4x4 and iadst_8x4 macros
// load the table into d0-d1 and, besides the 16-bit lanes of d0, multiply
// 32-bit lanes by d1[0] (vmul.s32). That lane covers the trailing "3344, 0"
// pair, presumably so that it reads as the 32-bit value 3344.
const iadst4_coeffs, align=4
        // .h[4-5] can be interpreted as .s[2]
        .short          1321, 3803, 2482, 3344, 3344, 0
endconst

// Factors for the 8-point inverse ADST. The iadst8 macros load the first
// three 4-entry groups into d0, d1 and d2; the last row mirrors the idct4
// factors (2896, 1567, 3784), with zeros in the remaining slots.
const iadst8_coeffs, align=4
        .short          4076, 401,  3612, 1931
        .short          2598, 3166, 1189, 3920
        // idct_coeffs
        .short          2896, 0,    1567, 3784, 0, 0, 0, 0
endconst

// Factors for the 16-point inverse ADST (the code using them is outside
// this part of the file).
const iadst16_coeffs, align=4
        .short          4091, 201,  3973, 995
        .short          3703, 1751, 3290, 2440
        .short          2751, 3035, 2106, 3513
        .short          1380, 3857, 601,  4052
endconst

// vmull_vmlal: d0 = s0 * c0 + s1 * c1, widening signed 16-bit lanes to
// 32 bits. d0 is a q register, s0/s1 are d registers, c0/c1 are scalars.
.macro vmull_vmlal d0, s0, s1, c0, c1
        vmull.s16       \d0, \s0, \c0
        vmlal.s16       \d0, \s1, \c1
.endm

// vmull_vmlal_8h: the same multiply-accumulate for two d-register pairs:
//   d0 = s0 * c0 + s2 * c1
//   d1 = s1 * c0 + s3 * c1
// Callers in this file pass the two halves of a q register as (s0, s1) and
// (s2, s3), giving eight 32-bit results.
.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1
        vmull.s16       \d0, \s0, \c0
        vmlal.s16       \d0, \s2, \c1
        vmull.s16       \d1, \s1, \c0
        vmlal.s16       \d1, \s3, \c1
.endm

.macro
vmull_vmlsl d0, s0, s1, c0, c1 vmull.s16 \d0, \s0, \c0 vmlsl.s16 \d0, \s1, \c1 .endm .macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 vmull.s16 \d0, \s0, \c0 vmlsl.s16 \d0, \s2, \c1 vmull.s16 \d1, \s1, \c0 vmlsl.s16 \d1, \s3, \c1 .endm .macro vqrshrn_8h d0, d1, s0, s1, shift vqrshrn.s32 \d0, \s0, \shift vqrshrn.s32 \d1, \s1, \shift .endm .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 vqrdmulh.s16 \r0, \r0, \c vqrdmulh.s16 \r1, \r1, \c .ifnb \r2 vqrdmulh.s16 \r2, \r2, \c vqrdmulh.s16 \r3, \r3, \c .endif .ifnb \r4 vqrdmulh.s16 \r4, \r4, \c vqrdmulh.s16 \r5, \r5, \c vqrdmulh.s16 \r6, \r6, \c vqrdmulh.s16 \r7, \r7, \c .endif .endm .macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 .ifnb \load vld1.8 {\load}, [\src, :64], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \addsrc vaddw.u8 \adddst, \adddst, \addsrc .endif .ifnb \narrowsrc vqmovun.s16 \narrowdst, \narrowsrc .endif .ifnb \store vst1.8 {\store}, [\dst, :64], r1 .endif .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst load_add_store d2, q8, , , , , , \dst, \src, \shiftbits load_add_store d3, q9, , , , , , \dst, \src, \shiftbits load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits load_add_store , , , , , , d3, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src mov \src, \dst load_add_store d2, q8, , , , , , \dst, \src load_add_store d3, q9, , , , , , \dst, \src 
load_add_store d4, q10, d2, q8, , , , \dst, \src load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src load_add_store , , d4, q10, q9, d3, d2, \dst, \src load_add_store , , d5, q11, q10, d4, d3, \dst, \src load_add_store , , , , q11, d5, d4, \dst, \src load_add_store , , , , , , d5, \dst, \src .endm .macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src .ifnb \load vld1.32 {\load[0]}, [\src, :32], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #4 .endif .ifnb \load vld1.32 {\load[1]}, [\src, :32], r1 .endif .ifnb \addsrc vaddw.u8 \adddst, \adddst, \addsrc .endif .ifnb \store vst1.32 {\store[0]}, [\dst, :32], r1 .endif .ifnb \narrowsrc vqmovun.s16 \narrowdst, \narrowsrc .endif .ifnb \store vst1.32 {\store[1]}, [\dst, :32], r1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst load_add_store4 d0, , , , , , , \dst, \src load_add_store4 d1, q8, , , , , , \dst, \src load_add_store4 d2, q9, d0, q8, , , , \dst, \src load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src load_add_store4 , , , , q15, d7, d6, \dst, \src load_add_store4 , , , , , , d7, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst load_add_store4 d0, , , , , , , \dst, \src load_add_store4 d1, q8, , , , , , \dst, \src load_add_store4 d2, q9, d0, q8, , , , \dst, \src load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src load_add_store4 , , , , q11, d3, d2, \dst, \src load_add_store4 , , , , , , d3, \dst, \src .endm .macro idct_dc w, h, shift cmp r3, #0 bne 1f vmov.i16 d30, #0 movw r12, #2896*8 vld1.16 
{d16[]}, [r2, :16] vdup.16 d0, r12 vqrdmulh.s16 d16, d16, d0[0] vst1.16 {d30[0]}, [r2, :16] .if (\w == 2*\h) || (2*\w == \h) vqrdmulh.s16 d16, d16, d0[0] .endif .if \shift > 0 vrshr.s16 d16, d16, #\shift .endif vqrdmulh.s16 d20, d16, d0[0] mov r3, #\h vrshr.s16 d16, d20, #4 vrshr.s16 d17, d20, #4 b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon 1: vld1.32 {d0[0]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[0]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 subs r3, r3, #4 sub r0, r0, r1, lsl #2 vaddw.u8 q10, q8, d0 vqmovun.s16 d0, q10 vaddw.u8 q11, q8, d1 vst1.32 {d0[0]}, [r0, :32], r1 vqmovun.s16 d1, q11 vst1.32 {d0[1]}, [r0, :32], r1 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 bgt 1b bx lr endfunc function idct_dc_w8_neon 1: vld1.8 {d0}, [r0, :64], r1 vld1.8 {d1}, [r0, :64], r1 vld1.8 {d2}, [r0, :64], r1 vaddw.u8 q10, q8, d0 vld1.8 {d3}, [r0, :64], r1 sub r0, r0, r1, lsl #2 subs r3, r3, #4 vaddw.u8 q11, q8, d1 vqmovun.s16 d0, q10 vaddw.u8 q12, q8, d2 vqmovun.s16 d1, q11 vaddw.u8 q13, q8, d3 vst1.8 {d0}, [r0, :64], r1 vqmovun.s16 d2, q12 vst1.8 {d1}, [r0, :64], r1 vqmovun.s16 d3, q13 vst1.8 {d2}, [r0, :64], r1 vst1.8 {d3}, [r0, :64], r1 bgt 1b bx lr endfunc function idct_dc_w16_neon 1: vld1.8 {q0}, [r0, :128], r1 vld1.8 {q1}, [r0, :128], r1 vld1.8 {q2}, [r0, :128], r1 subs r3, r3, #4 vaddw.u8 q10, q8, d0 vaddw.u8 q11, q8, d1 vld1.8 {q3}, [r0, :128], r1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, r1, lsl #2 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0}, [r0, :128], r1 vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q1}, [r0, :128], r1 vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w32_neon 1: vld1.8 {q0, q1}, [r0, :128], r1 subs r3, r3, #2 vld1.8 {q2, q3}, [r0, :128], r1 vaddw.u8 q10, q8, d0 vaddw.u8 q11, 
q8, d1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, r1, lsl #1 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0, q1}, [r0, :128], r1 vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w64_neon sub r1, r1, #32 1: vld1.8 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.8 {q2, q3}, [r0, :128] vaddw.u8 q10, q8, d0 vaddw.u8 q11, q8, d1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, #32 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0, q1}, [r0, :128]! vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc .macro iwht4 vadd.i16 d16, d16, d17 vsub.i16 d21, d18, d19 vsub.i16 d20, d16, d21 vshr.s16 d20, d20, #1 vsub.i16 d18, d20, d17 vsub.i16 d17, d20, d19 vadd.i16 d19, d21, d18 vsub.i16 d16, d16, d17 .endm .macro idct_4h_x4 r0, r1, r2, r3 vmull_vmlal q3, \r1, \r3, d0[3], d0[2] vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] vmull_vmlal q1, \r0, \r2, d0[0], d0[0] vqrshrn.s32 d6, q3, #12 vqrshrn.s32 d7, q2, #12 vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] vqrshrn.s32 d2, q1, #12 vqrshrn.s32 d3, q2, #12 vqadd.s16 \r0, d2, d6 vqsub.s16 \r3, d2, d6 vqadd.s16 \r1, d3, d7 vqsub.s16 \r2, d3, d7 .endm .macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] vqrshrn_8h d12, d13, q6, q7, #12 vqrshrn_8h d14, d15, q4, q5, #12 vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] vqrshrn_8h d4, d5, q2, q3, #12 vqrshrn_8h d6, d7, q4, q5, #12 vqadd.s16 \q0, q2, q6 vqsub.s16 \q3, q2, q6 vqadd.s16 
\q1, q3, q7 vqsub.s16 \q2, q3, q7 .endm function inv_dct_4h_x4_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] idct_4h_x4 d16, d17, d18, d19 bx lr endfunc function inv_dct_8h_x4_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel_local r12, iadst4_coeffs vld1.16 {d0, d1}, [r12, :128] vsubl.s16 q1, d16, d18 vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d19, d0[2] vmull.s16 q10, d17, d0[3] vaddw.s16 q1, q1, d19 vmull.s16 q3, d16, d0[2] vmlsl.s16 q3, d18, d0[0] vmlsl.s16 q3, d19, d0[1] vadd.s32 q11, q2, q3 vmul.s32 q1, q1, d1[0] vadd.s32 q2, q2, q10 vadd.s32 q3, q3, q10 vsub.s32 q11, q11, q10 vqrshrn.s32 \o0, q2, #12 vqrshrn.s32 \o2, q1, #12 vqrshrn.s32 \o1, q3, #12 vqrshrn.s32 \o3, q11, #12 .endm function inv_adst_4h_x4_neon, export=1 iadst_4x4 d16, d17, d18, d19 bx lr endfunc function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 d19, d18, d17, d16 bx lr endfunc .macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 movrel_local r12, iadst4_coeffs vld1.16 {d0, d1}, [r12, :128] vsubl.s16 q2, d16, d20 vsubl.s16 q3, d17, d21 vmull.s16 q4, d16, d0[0] vmlal.s16 q4, d20, d0[1] vmlal.s16 q4, d22, d0[2] vmull.s16 q5, d17, d0[0] vmlal.s16 q5, d21, d0[1] vmlal.s16 q5, d23, d0[2] vaddw.s16 q2, q2, d22 vaddw.s16 q3, q3, d23 vmull.s16 q6, d16, d0[2] vmlsl.s16 q6, d20, d0[0] vmlsl.s16 q6, d22, d0[1] vmull.s16 q7, d17, d0[2] vmlsl.s16 q7, d21, d0[0] vmlsl.s16 q7, d23, d0[1] vmul.s32 q10, q2, d1[0] vmul.s32 q11, q3, d1[0] vmull.s16 q2, d18, d0[3] vmull.s16 q3, d19, d0[3] vadd.s32 q8, q4, q2 // out0 vadd.s32 q9, q5, q3 vadd.s32 q4, q4, q6 // out3 vadd.s32 q5, q5, q7 vadd.s32 q6, q6, q2 // out1 vadd.s32 q7, q7, q3 vsub.s32 q4, q4, q2 // out3 vsub.s32 q5, q5, q3 vqrshrn.s32 d20, q10, #12 vqrshrn.s32 d21, q11, #12 vqrshrn.s32 \o0, q8, #12 vqrshrn.s32 \o1, q9, #12 .ifc \o4, d18 vmov q9, q10 .endif vqrshrn.s32 \o2, q6, 
#12 vqrshrn.s32 \o3, q7, #12 vqrshrn.s32 \o6, q4, #12 vqrshrn.s32 \o7, q5, #12 .endm function inv_adst_8h_x4_neon, export=1 iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 bx lr endfunc function inv_identity_4h_x4_neon, export=1 movw r12, #(5793-4096)*8 vdup.16 d0, r12 vqrdmulh.s16 q2, q8, d0[0] vqrdmulh.s16 q3, q9, d0[0] vqadd.s16 q8, q8, q2 vqadd.s16 q9, q9, q3 bx lr endfunc function inv_identity_8h_x4_neon, export=1 movw r12, #(5793-4096)*8 vdup.16 d0, r12 vqrdmulh.s16 q1, q8, d0[0] vqrdmulh.s16 q2, q9, d0[0] vqrdmulh.s16 q3, q10, d0[0] vqadd.s16 q8, q8, q1 vqrdmulh.s16 q1, q11, d0[0] vqadd.s16 q9, q9, q2 vqadd.s16 q10, q10, q3 vqadd.s16 q11, q11, q1 bx lr endfunc .macro identity_8x4_shift1 r0, r1, r2, r3, c .irp i, \r0, \r1, \r2, \r3 vqrdmulh.s16 q1, \i, \c vrhadd.s16 \i, \i, q1 .endr .endm function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 push {r4-r5,lr} vmov.i16 q15, #0 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q15}, [r2, :128]! vshr.s16 q8, q8, #2 vshr.s16 q9, q9, #2 iwht4 vst1.16 {q15}, [r2, :128]! transpose_4x4h q8, q9, d16, d17, d18, d19 iwht4 vld1.32 {d0[]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon vmov.i16 q15, #0 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q15}, [r2, :128]! blx r4 vst1.16 {q15}, [r2, :128]! 
transpose_4x4h q8, q9, d16, d17, d18, d19 blx r5 vld1.32 {d0[]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 vrshr.s16 q8, q8, #4 vrshr.s16 q9, q9, #4 L(itx_4x4_end): sub r0, r0, r1, lsl #2 vaddw.u8 q8, q8, d0 vqmovun.s16 d0, q8 vaddw.u8 q9, q9, d1 vst1.32 {d0[0]}, [r0, :32], r1 vqmovun.s16 d1, q9 vst1.32 {d0[1]}, [r0, :32], r1 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 pop {r4-r5,pc} endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 push {r4-r5,lr} .ifc \txfm1\()_\txfm2, dct_dct cmp r3, #0 bne 1f vmov.i16 d30, #0 movw r12, #2896*8 vld1.16 {d16[]}, [r2, :16] vdup.16 d4, r12 vst1.16 {d30[0]}, [r2, :16] vqrdmulh.s16 d16, d16, d4[0] vld1.32 {d0[0]}, [r0, :32], r1 vqrdmulh.s16 d20, d16, d4[0] vld1.32 {d0[1]}, [r0, :32], r1 vrshr.s16 d16, d20, #4 vrshr.s16 d17, d20, #4 vld1.32 {d1[0]}, [r0, :32], r1 vmov q9, q8 vld1.32 {d1[1]}, [r0, :32], r1 b L(itx_4x4_end) 1: .endif movrel_local r4, inv_\txfm1\()_4h_x4_neon movrel_local r5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a vqrshrn_8h \r14, \r15, 
q4, q5, #12 // t7a
        // The fragment above completes an instruction of the idct_8h_x8
        // macro, which starts earlier in the file; the rest of the macro
        // (odd half of the 8-point inverse DCT on 8 lanes) follows.
        vmull_vmlal_8h  q2,   q3,   \r10, \r11, \r6,  \r7,  d1[3], d1[2] // -> t6a
        vqrshrn_8h      \r6,  \r7,  q6,   q7,   #12         // t5a
        vqrshrn_8h      \r10, \r11, q2,   q3,   #12         // t6a

        vqadd.s16       q2,   \q1,  \q3 // t4
        vqsub.s16       \q1,  \q1,  \q3 // t5a
        vqadd.s16       q3,   \q7,  \q5 // t7
        vqsub.s16       \q3,  \q7,  \q5 // t6a

        // t5 and t6 are (t6a - t5a) and (t6a + t5a), each scaled by d0[0]
        // and narrowed with a rounding shift by 12.
        vmull_vmlsl_8h  q4,   q5,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t5
        vmull_vmlal_8h  q6,   q7,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t6
        vqrshrn_8h      d8,   d9,   q4,   q5,   #12 // t5
        vqrshrn_8h      d10,  d11,  q6,   q7,   #12 // t6

        // Final butterflies between the even half (left in q0/q2/q4/q6
        // macro arguments by idct_8h_x4) and t4..t7.
        vqsub.s16       \q7,  \q0,  q3 // out7
        vqadd.s16       \q0,  \q0,  q3 // out0
        vqadd.s16       \q1,  \q2,  q5 // out1
        // out6 is parked in scratch q6: the q6 macro argument still holds
        // an even-half value that out3 and out4 read below.
        vqsub.s16       q6,   \q2,  q5 // out6
        vqadd.s16       \q2,  \q4,  q4 // out2
        vqsub.s16       \q5,  \q4,  q4 // out5
        vqadd.s16       \q3,  \q6,  q2 // out3
        vqsub.s16       \q4,  \q6,  q2 // out4
        vmov            \q6,  q6       // out6
.endm

// idct_4h_x8: 8-point inverse DCT on 4 lanes of signed 16-bit data, in place
// on the eight d registers passed as r0-r7 (inputs in natural order, outputs
// out0-out7 in the same registers).
// The even inputs (r0, r2, r4, r6) go through idct_4h_x4; the odd inputs
// (r1, r7, r5, r3) are rotated with the d1[0]-d1[3] factors.
// Expects the idct4/idct8 factors in d0/d1 (the callers load q0 from
// idct_coeffs) and uses q1-q3 (d2-d7) as scratch.
.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7
        idct_4h_x4      \r0, \r2, \r4, \r6

        vmull_vmlsl     q1,  \r1, \r7, d1[0], d1[1] // -> t4a
        vmull_vmlal     q2,  \r1, \r7, d1[1], d1[0] // -> t7a
        vmull_vmlsl     q3,  \r5, \r3, d1[2], d1[3] // -> t5a
        vqrshrn.s32     \r1, q1,  #12               // t4a
        vmull_vmlal     q1,  \r5, \r3, d1[3], d1[2] // -> t6a
        vqrshrn.s32     \r7, q2,  #12               // t7a
        vqrshrn.s32     \r3, q3,  #12               // t5a
        vqrshrn.s32     \r5, q1,  #12               // t6a

        vqadd.s16       d2,  \r1, \r3 // t4
        vqsub.s16       \r1, \r1, \r3 // t5a
        vqadd.s16       d3,  \r7, \r5 // t7
        vqsub.s16       \r3, \r7, \r5 // t6a

        // t5 and t6 are (t6a - t5a) and (t6a + t5a), each scaled by d0[0]
        // and narrowed with a rounding shift by 12.
        vmull_vmlsl     q2,  \r3, \r1, d0[0], d0[0] // -> t5
        vmull_vmlal     q3,  \r3, \r1, d0[0], d0[0] // -> t6
        vqrshrn.s32     d4,  q2,  #12               // t5
        vqrshrn.s32     d5,  q3,  #12               // t6

        vqsub.s16       \r7, \r0, d3 // out7
        vqadd.s16       \r0, \r0, d3 // out0
        vqadd.s16       \r1, \r2, d5 // out1
        // out6 is parked in scratch d6: the r6 macro argument is still
        // needed as an input for out3 and out4.
        vqsub.s16       d6,  \r2, d5 // out6
        vqadd.s16       \r2, \r4, d4 // out2
        vqsub.s16       \r5, \r4, d4 // out5
        vqadd.s16       \r3, \r6, d2 // out3
        vqsub.s16       \r4, \r6, d2 // out4
        vmov            \r6, d6      // out6
.endm

// inv_dct_8h_x8_neon: 8-point inverse DCT on 8 lanes, in place on q8-q15.
// Loads the first eight idct_coeffs entries (the idct4 and idct8 factors)
// into q0 and expands idct_8h_x8. Clobbers r12 and the scratch registers
// q2-q7 written by the macro; returns with bx lr.
function inv_dct_8h_x8_neon, export=1
        movrel_local    r12, idct_coeffs
        vld1.16         {q0}, [r12, :128]
        idct_8h_x8      q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        bx              lr
endfunc

function inv_dct_4h_x8_neon, export=1
movrel_local r12, idct_coeffs vld1.16 {q0}, [r12, :128] idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc .macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 movrel_local r12, iadst8_coeffs vld1.16 {d0, d1, d2}, [r12, :64] vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] vqrshrn_8h d16, d17, q2, q3, #12 // t0a vqrshrn_8h d30, d31, q4, q5, #12 // t1a vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] vqrshrn_8h d20, d21, q6, q7, #12 // t2a vqrshrn_8h d26, d27, q2, q3, #12 // t3a vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] vqrshrn_8h d24, d25, q4, q5, #12 // t4a vqrshrn_8h d22, d23, q6, q7, #12 // t5a vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] vqrshrn_8h d28, d29, q2, q3, #12 // t6a vqrshrn_8h d18, d19, q4, q5, #12 // t7a vqadd.s16 q2, q8, q12 // t0 vqsub.s16 q3, q8, q12 // t4 vqadd.s16 q4, q15, q11 // t1 vqsub.s16 q5, q15, q11 // t5 vqadd.s16 q6, q10, q14 // t2 vqsub.s16 q7, q10, q14 // t6 vqadd.s16 q10, q13, q9 // t3 vqsub.s16 q11, q13, q9 // t7 vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] vqrshrn_8h d6, d7, q8, q9, #12 // t4a vqrshrn_8h d10, d11, q12, q13, #12 // t5a vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] vqrshrn_8h d14, d15, q14, q15, #12 // t6a vqrshrn_8h d22, d23, q8, q9, #12 // t7a vqadd.s16 \q0, q2, q6 // out0 vqsub.s16 q2, q2, q6 // t2 vqadd.s16 \q7, q4, q10 // out7 vqsub.s16 q4, q4, q10 // t3 vqneg.s16 \q7, \q7 // out7 vqadd.s16 \q1, q3, q7 // out1 vqsub.s16 q3, q3, q7 // t6 vqadd.s16 \q6, q5, q11 // out6 vqsub.s16 q5, q5, q11 // t7 vqneg.s16 \q1, \q1 // out1 vmull_vmlal_8h q10, q11, 
d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) vqrshrn_8h d4, d5, q10, q11, #12 // out3 vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) vqrshrn_8h d6, d7, q12, q13, #12 // out5 vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) vqneg.s16 \q3, q2 // out3 vqneg.s16 \q5, q3 // out5 .endm .macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 movrel_local r12, iadst8_coeffs vld1.16 {d0, d1, d2}, [r12, :64] vmull_vmlal q2, d23, d16, d0[0], d0[1] vmull_vmlsl q3, d23, d16, d0[1], d0[0] vmull_vmlal q4, d21, d18, d0[2], d0[3] vqrshrn.s32 d16, q2, #12 // t0a vqrshrn.s32 d23, q3, #12 // t1a vmull_vmlsl q5, d21, d18, d0[3], d0[2] vmull_vmlal q6, d19, d20, d1[0], d1[1] vqrshrn.s32 d18, q4, #12 // t2a vqrshrn.s32 d21, q5, #12 // t3a vmull_vmlsl q7, d19, d20, d1[1], d1[0] vmull_vmlal q2, d17, d22, d1[2], d1[3] vqrshrn.s32 d20, q6, #12 // t4a vqrshrn.s32 d19, q7, #12 // t5a vmull_vmlsl q3, d17, d22, d1[3], d1[2] vqrshrn.s32 d22, q2, #12 // t6a vqrshrn.s32 d17, q3, #12 // t7a vqadd.s16 d4, d16, d20 // t0 vqsub.s16 d5, d16, d20 // t4 vqadd.s16 d6, d23, d19 // t1 vqsub.s16 d7, d23, d19 // t5 vqadd.s16 d8, d18, d22 // t2 vqsub.s16 d9, d18, d22 // t6 vqadd.s16 d18, d21, d17 // t3 vqsub.s16 d19, d21, d17 // t7 vmull_vmlal q8, d5, d7, d2[3], d2[2] vmull_vmlsl q10, d5, d7, d2[2], d2[3] vmull_vmlsl q11, d19, d9, d2[3], d2[2] vqrshrn.s32 d5, q8, #12 // t4a vqrshrn.s32 d7, q10, #12 // t5a vmull_vmlal q8, d19, d9, d2[2], d2[3] vqrshrn.s32 d9, q11, #12 // t6a vqrshrn.s32 d19, q8, #12 // t7a vqadd.s16 \r0, d4, d8 // out0 vqsub.s16 d4, d4, d8 // t2 vqadd.s16 \r7, d6, d18 // out7 vqsub.s16 d6, d6, d18 // t3 vqneg.s16 \r7, \r7 // out7 vqadd.s16 \r1, d5, d9 // out1 vqsub.s16 d5, d5, d9 // t6 vqadd.s16 \r6, d7, d19 // out6 vqsub.s16 d7, d7, d19 // t7 
vqneg.s16 \r1, \r1 // out1 vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) vqrshrn.s32 d4, q9, #12 // out3 vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) vqrshrn.s32 d5, q10, #12 // out5 vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21) vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19) vqneg.s16 \r3, d4 // out3 vqneg.s16 \r5, d5 // out5 .endm function inv_adst_8h_x8_neon, export=1 iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_flipadst_8h_x8_neon, export=1 iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 bx lr endfunc function inv_adst_4h_x8_neon, export=1 iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc function inv_flipadst_4h_x8_neon, export=1 iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 bx lr endfunc function inv_identity_8h_x8_neon, export=1 vqshl.s16 q8, q8, #1 vqshl.s16 q9, q9, #1 vqshl.s16 q10, q10, #1 vqshl.s16 q11, q11, #1 vqshl.s16 q12, q12, #1 vqshl.s16 q13, q13, #1 vqshl.s16 q14, q14, #1 vqshl.s16 q15, q15, #1 bx lr endfunc function inv_identity_4h_x8_neon, export=1 vqshl.s16 q8, q8, #1 vqshl.s16 q9, q9, #1 vqshl.s16 q10, q10, #1 vqshl.s16 q11, q11, #1 bx lr endfunc .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_neon vmov.i16 q0, #0 vmov.i16 q1, #0 vld1.16 {q8, q9}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q10, q11}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q12, q13}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! 
// Continuation of inv_txfm_\variant\()add_8x8_neon inside def_fn_8x8_base:
// last of the four 32-byte loads from [r2]; the final zeroing store has no
// writeback.
        vld1.16 {q14, q15}, [r2, :128]
        vst1.16 {q0, q1}, [r2, :128]

.ifc \variant, identity_
        // The identity vqshl #1 and the downshift vrshr #1 cancel out, so this
        // variant skips both and jumps to the epilogue emitted by the
        // non-identity expansion of this macro.
        b L(itx_8x8_epilog)
.else
        // First pass: call the routine whose address the entry point put in r4.
        blx r4

        // Rounding right shift by 1 of every lane between the two passes.
        vrshr.s16 q8, q8, #1
        vrshr.s16 q9, q9, #1
        vrshr.s16 q10, q10, #1
        vrshr.s16 q11, q11, #1
        vrshr.s16 q12, q12, #1
        vrshr.s16 q13, q13, #1
        vrshr.s16 q14, q14, #1
        vrshr.s16 q15, q15, #1

L(itx_8x8_epilog):
        transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30

        // Second pass: routine whose address is in r5.
        blx r5

        load_add_store_8x8 r0, r7
        // Undo the vpush/push done by the def_fn_8x8 entry point and return.
        vpop {q4-q7}
        pop {r4-r5,r7,pc}
.endif
endfunc
.endm

// identity_ is expanded first; its forward branch is resolved by the label
// that the second (plain) expansion defines.
def_fn_8x8_base identity_
def_fn_8x8_base

// def_fn_8x8 txfm1, txfm2: emits the exported entry point
// inv_txfm_add_<txfm1>_<txfm2>_8x8_8bpc_neon.
//   - dct_dct only: expands idct_dc 8, 8, 1 first (macro defined elsewhere).
//   - saves r4-r5, r7, lr and q4-q7,
//   - r5 = address of inv_<txfm2>_8h_x8_neon,
//   - r4 = address of inv_<txfm1>_8h_x8_neon unless txfm1 is identity,
//   - tail-branches to the shared body, which restores the registers.
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc 8, 8, 1
.endif
        push {r4-r5,r7,lr}
        vpush {q4-q7}
        movrel_local r5, inv_\txfm2\()_8h_x8_neon
.ifc \txfm1, identity
        // r4 is not needed: the identity body never executes blx r4.
        b inv_txfm_identity_add_8x8_neon
.else
        movrel_local r4, inv_\txfm1\()_8h_x8_neon
        b inv_txfm_add_8x8_neon
.endif
endfunc
.endm

// All 16 txfm1/txfm2 combinations are instantiated for 8x8.
def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

// Shared body for the 8x4 entry points generated by def_fn_48.
// Loads d16-d23 (64 bytes) from [r2], zeroing the buffer behind the loads, and
// broadcasts the constant 2896*8 into d0 for scale_input.
// 2896/4096 is approximately 1/sqrt(2).
// NOTE(review): scale_input is defined elsewhere; presumably it applies that
// factor to q8-q11 - confirm.
function inv_txfm_add_8x4_neon
        vmov.i16 q14, #0
        vmov.i16 q15, #0
        movw r12, #2896*8
        vdup.16 d0, r12
        vld1.16 {d16, d17, d18, d19}, [r2, :128]
        vst1.16 {q14, q15}, [r2, :128]!
// Continuation of inv_txfm_add_8x4_neon: second 32-byte load from [r2], again
// followed by a zeroing store (no writeback on the last one).
        vld1.16 {d20, d21, d22, d23}, [r2, :128]
        vst1.16 {q14, q15}, [r2, :128]

        scale_input d0[0], q8, q9, q10, q11

        // First pass through the routine whose address is in r4.
        blx r4

        // Two 4x4 transposes plus d-register swaps rearrange the data into the
        // register order expected by the second pass.
        transpose_4x4h q8, q9, d16, d17, d18, d19
        transpose_4x4h q10, q11, d20, d21, d22, d23
        vswp d17, d20
        vswp d19, d21
        vswp d18, d20
        vswp d21, d22

        // Second pass through the routine whose address is in r5.
        blx r5

        load_add_store_8x4 r0, r7
        // Matches the push/vpush in the def_fn_48 entry points.
        vpop {q4-q7}
        pop {r4-r5,r7,pc}
endfunc

// Shared body for the 4x8 entry points generated by def_fn_48.
// Same structure as inv_txfm_add_8x4_neon: load q8-q11 from [r2] while zeroing
// the buffer, scale_input with d0[0] = 2896*8, first pass via r4, rearrange,
// second pass via r5, then load_add_store_4x8 and return.
function inv_txfm_add_4x8_neon
        vmov.i16 q14, #0
        vmov.i16 q15, #0
        movw r12, #2896*8
        vdup.16 d0, r12
        vld1.16 {q8, q9}, [r2, :128]
        vst1.16 {q14, q15}, [r2, :128]!
        vld1.16 {q10, q11}, [r2, :128]
        vst1.16 {q14, q15}, [r2, :128]

        scale_input d0[0], q8, q9, q10, q11

        blx r4

        transpose_4x8h q8, q9, q10, q11
        vswp d17, d20
        vswp d19, d21
        vswp d17, d18
        vswp d19, d22

        blx r5

        load_add_store_4x8 r0, r7
        vpop {q4-q7}
        pop {r4-r5,r7,pc}
endfunc

// def_fn_48 w, h, txfm1, txfm2: emits the exported entry point
// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_8bpc_neon for the 4x8 / 8x4 sizes.
//   - dct_dct only: expands idct_dc \w, \h, 0 first.
//   - r4 = inv_<txfm1>_<h>h_x<w>_neon (first pass),
//   - r5 = inv_<txfm2>_<w>h_x<h>_neon (second pass),
//   - tail-branches to inv_txfm_add_<w>x<h>_neon, which pops and returns.
.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 0
.endif
        push {r4-r5,r7,lr}
        vpush {q4-q7}
        movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon
        movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon
        b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// def_fns_48 w, h: instantiates def_fn_48 for all 16 txfm1/txfm2 combinations.
.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

// Start of inv_dct_4h_x16_neon; the body continues past this span.
// Loads q0/q1 from idct_coeffs. The odd-numbered input registers (d17, d19,
// ..., d31) are consumed here; the even ones are handed to idct_4h_x8 later in
// the function.
function inv_dct_4h_x16_neon, export=1
        movrel_local r12, idct_coeffs
        vld1.16 {q0, q1}, [r12, :128]

        vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a
        vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a
        vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a
        vqrshrn.s32 d17, q2, #12 // t8a
        vqrshrn.s32 d31, q3, #12 // t15a
vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a vqrshrn.s32 d23, q4, #12 // t9a vqrshrn.s32 d25, q2, #12 // t14a vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a vqrshrn.s32 d21, q3, #12 // t10a vqrshrn.s32 d27, q4, #12 // t13a vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a vqrshrn.s32 d19, q2, #12 // t11a vqrshrn.s32 d29, q3, #12 // t12a idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 vqsub.s16 d4, d17, d23 // t9 vqadd.s16 d17, d17, d23 // t8 vqsub.s16 d5, d31, d25 // t14 vqadd.s16 d31, d31, d25 // t15 vqsub.s16 d23, d19, d21 // t10 vqadd.s16 d19, d19, d21 // t11 vqadd.s16 d25, d29, d27 // t12 vqsub.s16 d29, d29, d27 // t13 vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a vqrshrn.s32 d21, q3, #12 // t9a vqrshrn.s32 d27, q4, #12 // t14a vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a vqrshrn.s32 d29, q3, #12 // t13a vneg.s32 q4, q4 vqrshrn.s32 d23, q4, #12 // t10a vqsub.s16 d4, d17, d19 // t11a vqadd.s16 d17, d17, d19 // t8a vqsub.s16 d5, d31, d25 // t12a vqadd.s16 d31, d31, d25 // t15a vqadd.s16 d19, d21, d23 // t9 vqsub.s16 d21, d21, d23 // t10 vqsub.s16 d25, d27, d29 // t13 vqadd.s16 d27, d27, d29 // t14 vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a vqrshrn.s32 d6, q3, #12 // t11 vqrshrn.s32 d7, q4, #12 // t12 vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a vqrshrn.s32 d4, q2, #12 // t10a vqrshrn.s32 d5, q4, #12 // t13a vqadd.s16 d8, d16, d31 // out0 vqsub.s16 d31, d16, d31 // out15 vmov d16, d8 vqadd.s16 d23, d30, d17 // out7 vqsub.s16 d9, d30, d17 // out8 vqadd.s16 d17, d18, d27 // out1 vqsub.s16 d30, d18, d27 // out14 vqadd.s16 d18, d20, d5 // out2 vqsub.s16 d29, d20, d5 // out13 vqadd.s16 d5, d28, d19 // out6 vqsub.s16 d25, d28, d19 // out9 vqadd.s16 d19, 
d22, d7 // out3 vqsub.s16 d28, d22, d7 // out12 vqadd.s16 d20, d24, d6 // out4 vqsub.s16 d27, d24, d6 // out11 vqadd.s16 d21, d26, d4 // out5 vqsub.s16 d26, d26, d4 // out10 vmov d24, d9 vmov d22, d5 bx lr endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel_local r12, iadst16_coeffs vld1.16 {q0, q1}, [r12, :128] movrel_local r12, idct_coeffs vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 vqrshrn.s32 d16, q2, #12 // t0 vqrshrn.s32 d31, q3, #12 // t1 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 vqrshrn.s32 d18, q4, #12 // t2 vqrshrn.s32 d29, q2, #12 // t3 vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 vqrshrn.s32 d20, q3, #12 // t4 vqrshrn.s32 d27, q4, #12 // t5 vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 vqrshrn.s32 d22, q2, #12 // t6 vqrshrn.s32 d25, q3, #12 // t7 vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 vqrshrn.s32 d23, q4, #12 // t8 vqrshrn.s32 d24, q2, #12 // t9 vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 vqrshrn.s32 d21, q3, #12 // t10 vqrshrn.s32 d26, q4, #12 // t11 vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 vqrshrn.s32 d19, q2, #12 // t12 vqrshrn.s32 d28, q3, #12 // t13 vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 vqrshrn.s32 d17, q4, #12 // t14 vqrshrn.s32 d30, q2, #12 // t15 vld1.16 {q0}, [r12, :128] vqsub.s16 d2, d16, d23 // t8a vqadd.s16 d16, d16, d23 // t0a vqsub.s16 d3, d31, d24 // t9a vqadd.s16 d31, d31, d24 // t1a vqadd.s16 d23, d18, d21 // t2a vqsub.s16 d18, d18, d21 // t10a vqadd.s16 d24, d29, d26 // t3a vqsub.s16 d29, d29, d26 // t11a vqadd.s16 d21, d20, d19 // t4a vqsub.s16 d20, d20, d19 // 
t12a vqadd.s16 d26, d27, d28 // t5a vqsub.s16 d27, d27, d28 // t13a vqadd.s16 d19, d22, d17 // t6a vqsub.s16 d22, d22, d17 // t14a vqadd.s16 d28, d25, d30 // t7a vqsub.s16 d25, d25, d30 // t15a vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 vqrshrn.s32 d17, q2, #12 // t8 vqrshrn.s32 d30, q3, #12 // t9 vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 vqrshrn.s32 d18, q4, #12 // t10 vqrshrn.s32 d29, q2, #12 // t11 vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 vqrshrn.s32 d27, q3, #12 // t12 vqrshrn.s32 d20, q4, #12 // t13 vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 vqrshrn.s32 d25, q2, #12 // t14 vqrshrn.s32 d22, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t4 vqadd.s16 d16, d16, d21 // t0 vqsub.s16 d3, d31, d26 // t5 vqadd.s16 d31, d31, d26 // t1 vqadd.s16 d21, d23, d19 // t2 vqsub.s16 d23, d23, d19 // t6 vqadd.s16 d26, d24, d28 // t3 vqsub.s16 d24, d24, d28 // t7 vqadd.s16 d19, d17, d27 // t8a vqsub.s16 d17, d17, d27 // t12a vqadd.s16 d28, d30, d20 // t9a vqsub.s16 d30, d30, d20 // t13a vqadd.s16 d27, d18, d25 // t10a vqsub.s16 d18, d18, d25 // t14a vqadd.s16 d20, d29, d22 // t11a vqsub.s16 d29, d29, d22 // t15a vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a vqrshrn.s32 d22, q2, #12 // t4a vqrshrn.s32 d25, q3, #12 // t5a vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 vqrshrn.s32 d24, q4, #12 // t6a vqrshrn.s32 d23, q2, #12 // t7a vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 vqrshrn.s32 d17, q3, #12 // t12 vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 vqrshrn.s32 d29, q4, #12 // t13 vqrshrn.s32 d30, q2, #12 // t14 vqrshrn.s32 d18, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t2a 
// Continuation of the iadst_16 macro body (final output stage).
// In the flipped instantiation (inv_flipadst_4h_x16_neon) \o0 is d31, which
// the next two instructions still read, so out0 is staged in d4 and moved to
// \o0 only afterwards. When \o0 is d16 it can be written directly.
.ifc \o0, d16
        vqadd.s16 \o0, d16, d21 // out0
        vqsub.s16 d21, d31, d26 // t3a
        vqadd.s16 \o15,d31, d26 // out15
.else
        vqadd.s16 d4, d16, d21 // out0
        vqsub.s16 d21, d31, d26 // t3a
        vqadd.s16 \o15,d31, d26 // out15
        vmov \o0, d4
.endif
        vqneg.s16 \o15, \o15 // out15

        vqsub.s16 d3, d29, d18 // t15a
        vqadd.s16 \o13,d29, d18 // out13
        vqadd.s16 \o2, d17, d30 // out2
        vqsub.s16 d26, d17, d30 // t14a
        vqneg.s16 \o13,\o13 // out13

        vqadd.s16 \o1, d19, d27 // out1
        vqsub.s16 d27, d19, d27 // t10
        vqadd.s16 \o14,d28, d20 // out14
        vqsub.s16 d20, d28, d20 // t11
        vqneg.s16 \o1, \o1 // out1

        vqadd.s16 \o3, d22, d24 // out3
        vqsub.s16 d22, d22, d24 // t6
        vqadd.s16 \o12,d25, d23 // out12
        vqsub.s16 d23, d25, d23 // t7
        vqneg.s16 \o3, \o3 // out3

        vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
        vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24)
        vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26)

        vqrshrn.s32 d24, q12, #12 // out8
        vqrshrn.s32 d4, q2, #12 // out7
        vqrshrn.s32 d5, q3, #12 // out5
        vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21)
        vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
        vqrshrn.s32 d26, q4, #12 // out10

        vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
        vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
        vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)

        vqrshrn.s32 \o4, q1, #12 // out4
        vqrshrn.s32 d7, q3, #12 // out9
        vqrshrn.s32 d6, q4, #12 // out11
        vqrshrn.s32 \o6, q11, #12 // out6

        // out8 and out10 were produced in d24 and d26. That is already their
        // final place in the ascending order; in the flipped order (\o8 is
        // d23) they still have to be moved.
.ifc \o8, d23
        vmov \o8, d24
        vmov \o10,d26
.endif

        vqneg.s16 \o7, d4 // out7
        vqneg.s16 \o5, d5 // out5
        vqneg.s16 \o11,d6 // out11
        vqneg.s16 \o9, d7 // out9
.endm

// Expands iadst_16 with outputs d16..d31 in ascending order.
function inv_adst_4h_x16_neon, export=1
        iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        bx lr
endfunc

// Expands iadst_16 with the output list reversed (d31 down to d16).
function inv_flipadst_4h_x16_neon, export=1
        iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
        bx lr
endfunc

// In-place scaling of q8-q15. For every register x:
//   q1 = vqrdmulh(x, c) with c = 2*(5793-4096)*8, i.e. about x * 2*1697/4096
//   x  = sat(sat(x + x) + q1)
// which totals about x * 2*5793/4096; 5793/4096 is approximately sqrt(2).
function inv_identity_4h_x16_neon, export=1
        movw r12, #2*(5793-4096)*8
        vdup.16 d0, r12
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vqrdmulh.s16 q1, \i, d0[0]
        vqadd.s16 \i, \i, \i
        vqadd.s16 \i, \i, q1
.endr
        bx lr
endfunc

// identity_4x16_shift2 c: for each of q8-q15, q2 = vqrdmulh(x, \c) shifted
// right by 1 (truncating), then x = vrhadd(x, q2), the rounding halving add.
// Clobbers q2.
.macro identity_4x16_shift2 c
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vqrdmulh.s16 q2, \i, \c
        vshr.s16 q2, q2, #1
        vrhadd.s16 \i, \i, q2
.endr
.endm

// identity_4x16_shift1 c: for each of q8-q15, q2 = vqrdmulh(x, \c) with a
// rounding right shift by 1, then x = sat(x + q2). Clobbers q2.
.macro identity_4x16_shift1 c
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vqrdmulh.s16 q2, \i, \c
        vrshr.s16 q2, q2, #1
        vqadd.s16 \i, \i, q2
.endr
.endm

// Alias: the 8x8 shift1 variant is the same operation on the same registers.
.macro identity_8x8_shift1 c
        identity_4x16_shift1 \c
.endm

// identity_8x8 c: same arithmetic as inv_identity_4h_x16_neon, but with the
// multiplier supplied by the caller and q2 as the scratch register.
.macro identity_8x8 c
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vqrdmulh.s16 q2, \i, \c
        vqadd.s16 \i, \i, \i
        vqadd.s16 \i, \i, q2
.endr
.endm

// def_horz_16: emits inv_txfm_horz\suffix\()_16x4_neon.
//   scale     non-zero: run scale_input with d1[0] = 2896*8 after loading
//   identity  non-zero: use the identity_4x16_shift* macros instead of blx r4
//   shift     rounding shift applied after blx r4; for identity variants it
//             only selects shift2 (when -2) versus shift1 (otherwise)
//   suffix    inserted into the function name
// The function loads d16-d31 from [r7] with stride r8, zeroing each 8-byte
// chunk behind the load, transforms, transposes in 4x4 tiles and stores the
// result contiguously at [r6].
// Only the expansion with identity=0 and shift other than 1 contains the
// L(horz_16x4_epilog) code; the other expansions branch to it.
.macro def_horz_16 scale=0, identity=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x4_neon
        push {lr}
        vmov.i16 d7, #0
.if \identity
        movw r12, #2*(5793-4096)*8
        vdup.16 d0, r12
.endif
.if \scale
        movw r12, #2896*8
        vdup.16 d1, r12
.endif
.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vld1.16 {\i}, [r7, :64]
        vst1.16 {d7}, [r7, :64], r8
.endr
.if \scale
        scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15
.endif
.if \identity
.if \shift == -2
        identity_4x16_shift2 d0[0]
.else
        identity_4x16_shift1 d0[0]
.endif
        b L(horz_16x4_epilog)
.else
        blx r4
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vrshr.s16 \i, \i, #\shift
.endr
.if \shift == 1
        b L(horz_16x4_epilog)
.else
L(horz_16x4_epilog):
        transpose_4x4h q8, q9, d16, d17, d18, d19
        transpose_4x4h q10, q11, d20, d21, d22, d23
        transpose_4x4h q12, q13, d24, d25, d26, d27
        transpose_4x4h q14, q15, d28, d29, d30, d31

        // Store order interleaves the four transposed tiles.
.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
        vst1.16 {\i}, [r6, :64]!
// Tail of the def_horz_16 macro: closes the store loop of the shared epilogue,
// returns, and closes the nested .if blocks.
.endr

        pop {pc}
.endif
.endif
endfunc
.endm

// The last expansion (identity=0, shift=2) is the one that emits the shared
// L(horz_16x4_epilog) code; the three before it branch forward to it.
def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=0, shift=2

// Loads d16-d31 from [r7] with stride r8, runs the routine whose address is in
// r5, then expands load_add_store_4x16 r6, r7.
function inv_txfm_add_vert_4x16_neon
        push {lr}
.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vld1.16 {\i}, [r7, :64], r8
.endr
        blx r5
        load_add_store_4x16 r6, r7
        pop {pc}
endfunc

// Shared body for the 16x16 entry points generated by def_fn_16x16.
// Inputs set up by the entry point: r9 = first-pass routine, r5 = second-pass
// routine, r10 = table of 16-bit thresholds (eob_16x16*).
// First pass: four slices (i = 0, 4, 8, 12) are written to a 512-byte stack
// buffer. Before each slice after the first, r3 is compared with the next
// threshold; once r3 is below it the remaining slices are skipped and their
// part of the buffer is zero-filled.
// NOTE(review): r3 is presumably the eob value - confirm with callers.
function inv_txfm_add_16x16_neon
        sub_sp_align 512
        ldrh r11, [r10], #2
.irp i, 0, 4, 8, 12
        add r6, sp, #(\i*16*2)
.if \i > 0
        // r8 = number of rows still missing, used by the zero-fill loop.
        mov r8, #(16 - \i)
        cmp r3, r11
        blt 1f
.if \i < 12
        ldrh r11, [r10], #2
.endif
.endif
        add r7, r2, #(\i*2)
        mov r8, #16*2
        blx r9
.endr
        b 3f
1:
        vmov.i16 q2, #0
        vmov.i16 q3, #0
2:
        // Each iteration clears 4 * 32 bytes, i.e. four 16-coefficient rows.
        subs r8, r8, #4
.rept 4
        vst1.16 {q2, q3}, [r6, :128]!
.endr
        bgt 2b
3:
        // Second pass: four 4-column strips, read from the stack buffer with a
        // 32-byte stride and added at r0 + i.
.irp i, 0, 4, 8, 12
        add r6, r0, #(\i)
        add r7, sp, #(\i*2)
        mov r8, #32
        bl inv_txfm_add_vert_4x16_neon
.endr

        add_sp_align 512
        // Matches the push/vpush in the def_fn_16x16 entry points.
        vpop {q4}
        pop {r4-r11,pc}
endfunc

// Threshold tables read through r10 by inv_txfm_add_16x16_neon. The last entry
// (256) is never loaded by that function: only three thresholds are compared.
const eob_16x16
        .short 10, 36, 78, 256
endconst

const eob_16x16_identity
        .short 4, 8, 12, 256
endconst

// def_fn_16x16 txfm1, txfm2: emits the exported entry point
// inv_txfm_add_<txfm1>_<txfm2>_16x16_8bpc_neon.
//   - dct_dct only: expands idct_dc 16, 16, 2 first.
//   - r9 = first-pass driver (identity variant when txfm1 is identity; in that
//     case r4 is left unset because that driver never executes blx r4),
//   - r4 = inv_<txfm1>_4h_x16_neon, r5 = inv_<txfm2>_4h_x16_neon,
//   - r10 = eob_16x16_identity when exactly one of the two is identity,
//     eob_16x16 otherwise.
.macro def_fn_16x16 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc 16, 16, 2
.endif
        push {r4-r11,lr}
        vpush {q4}
.ifc \txfm1, identity
        movrel_local r9, inv_txfm_horz_identity_16x4_neon
.else
        movrel_local r9, inv_txfm_horz_16x4_neon
        movrel_local r4, inv_\txfm1\()_4h_x16_neon
.endif
        movrel_local r5, inv_\txfm2\()_4h_x16_neon
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel_local r10, eob_16x16
.else
        movrel_local r10, eob_16x16_identity
.endif
.else
.ifc \txfm2, identity
        movrel_local r10, eob_16x16_identity
.else
        movrel_local r10, eob_16x16
.endif
.endif
        b inv_txfm_add_16x16_neon
endfunc
.endm

// First part of the 16x16 instantiation list (continues below).
def_fn_16x16 dct, dct
def_fn_16x16 identity, identity
def_fn_16x16 dct, adst
def_fn_16x16 dct, flipadst
def_fn_16x16 dct, identity
def_fn_16x16 adst, dct
def_fn_16x16 adst, adst
def_fn_16x16 adst, flipadst
// Remaining def_fn_16x16 instantiations (12 combinations in total; the
// adst/identity, flipadst/identity, identity/adst and identity/flipadst pairs
// are not instantiated for 16x16).
def_fn_16x16 flipadst, dct
def_fn_16x16 flipadst, adst
def_fn_16x16 flipadst, flipadst
def_fn_16x16 identity, dct

// def_fn_416_base variant: emits the shared bodies
// inv_txfm_\variant\()add_16x4_neon and inv_txfm_\variant\()add_4x16_neon
// that the entry points generated by def_fn_416 branch to.
//   variant  empty, or identity_ (first pass done inline, r4 unused).
// As in the other shared bodies, the identity_ expansion branches to epilogue
// labels that only the plain expansion defines.
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon

.ifc \variant, identity_
        // The 8-byte chunks are loaded straight into the d-register order that
        // the vswp sequence of the non-identity path produces, so no swaps are
        // needed here. Every chunk is zeroed behind the load.
        vmov.i16 d4, #0
.irp i, d16, d18, d20, d22
        vld1.16 {\i}, [r2, :64]
        vst1.16 {d4}, [r2, :64]!
.endr
.irp i, d17, d19, d21, d23
        vld1.16 {\i}, [r2, :64]
        vst1.16 {d4}, [r2, :64]!
.endr
        movw r12, #2*(5793-4096)*8
        vdup.16 d0, r12
.irp i, d24, d26, d28, d30
        vld1.16 {\i}, [r2, :64]
        vst1.16 {d4}, [r2, :64]!
.endr
.irp i, d25, d27, d29, d31
        vld1.16 {\i}, [r2, :64]
        vst1.16 {d4}, [r2, :64]!
.endr

        identity_4x16_shift1 d0[0]

        b L(itx_16x4_epilog)
.else
        // Load all of q8-q15 (128 bytes) from [r2], zeroing behind the loads.
        vmov.i16 q2, #0
        vmov.i16 q3, #0
        vld1.16 {d16, d17, d18, d19}, [r2, :128]
        vst1.16 {q2, q3}, [r2, :128]!
        vld1.16 {d20, d21, d22, d23}, [r2, :128]
        vst1.16 {q2, q3}, [r2, :128]!
        vld1.16 {d24, d25, d26, d27}, [r2, :128]
        vst1.16 {q2, q3}, [r2, :128]!
        vld1.16 {d28, d29, d30, d31}, [r2, :128]
        vst1.16 {q2, q3}, [r2, :128]!

        // First pass through the routine whose address is in r4.
        blx r4

        vswp d17, d20
        vswp d19, d22
        vswp d18, d20
        vswp d19, d21
        vswp d25, d28
        vswp d27, d30
        vswp d26, d28
        vswp d27, d29
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vrshr.s16 \i, \i, #1
.endr
L(itx_16x4_epilog):
        // Left 8x4 half: transpose, second pass via r5, add at r0.
        transpose_4x8h q8, q9, q10, q11
        blx r5
        mov r6, r0
        load_add_store_8x4 r6, r7

        // Right 8x4 half: move q12-q15 down into q8-q11 and repeat at r0 + 8.
        vmov q8, q12
        vmov q9, q13
        vmov q10, q14
        vmov q11, q15
        transpose_4x8h q8, q9, q10, q11
        blx r5
        add r6, r0, #8
        load_add_store_8x4 r6, r7

        // Matches the push/vpush in the def_fn_416 entry points.
        vpop {q4-q7}
        pop {r4-r11,pc}
.endif
endfunc

// 4x16 body. The half of the input that starts at r2 + 16 is processed only
// when r3 is not below the threshold in r10 (set by def_fn_416); otherwise
// q12-q15 are simply cleared at label 1.
// NOTE(review): r3 is presumably the eob value - confirm with callers.
function inv_txfm_\variant\()add_4x16_neon
        vmov.i16 q2, #0

        mov r11, #32
        cmp r3, r10
        blt 1f

        add r6, r2, #16
.ifc \variant, identity_
.irp i, q12, q13, q14, q15
        vld1.16 {\i}, [r6, :128]
        vst1.16 {q2}, [r6, :128], r11
.endr
        movw r12, #(5793-4096)*8
        vdup.16 d0, r12
        identity_8x4_shift1 q12, q13, q14, q15, d0[0]
.else
.irp i, q8, q9, q10, q11
        vld1.16 {\i}, [r6, :128]
        vst1.16 {q2}, [r6, :128], r11
.endr
        blx r4
        // The shift also moves the results from q8-q11 into q12-q15, freeing
        // q8-q11 for the other half.
        vrshr.s16 q12, q8, #1
        vrshr.s16 q13, q9, #1
        vrshr.s16 q14, q10, #1
        vrshr.s16 q15, q11, #1
.endif
        transpose_4x8h q12, q13, q14, q15
        vswp d27, d29
        vswp d26, d28
// Continuation of inv_txfm_\variant\()add_4x16_neon inside def_fn_416_base:
// end of the d-register shuffle for the half held in q12-q15.
        vswp d27, d30
        vswp d25, d28

        b 2f
1:
        // Reached when r3 < r10: that half is treated as all zero.
.irp i, q12, q13, q14, q15
        vmov.i16 \i, #0
.endr
2:
        // Half starting at r2: load with the 32-byte stride in r11, zeroing
        // each 16-byte chunk behind the load.
        vmov.i16 q2, #0
.irp i, q8, q9, q10, q11
        vld1.16 {\i}, [r2, :128]
        vst1.16 {q2}, [r2, :128], r11
.endr
.ifc \variant, identity_
        movw r12, #(5793-4096)*8
        vdup.16 d0, r12
        identity_8x4_shift1 q8, q9, q10, q11, d0[0]

        b L(itx_4x16_epilog)
.else
        blx r4
.irp i, q8, q9, q10, q11
        vrshr.s16 \i, \i, #1
.endr
L(itx_4x16_epilog):
        transpose_4x8h q8, q9, q10, q11
        vswp d19, d21
        vswp d18, d20
        vswp d19, d22
        vswp d17, d20

        // Second pass through the routine whose address is in r5.
        blx r5

        load_add_store_4x16 r0, r6

        // Matches the push/vpush in the def_fn_416 entry points.
        vpop {q4-q7}
        pop {r4-r11,pc}
.endif
endfunc
.endm

// identity_ first: its forward branches are resolved by the labels emitted by
// the plain expansion.
def_fn_416_base identity_
def_fn_416_base

// def_fn_416 w, h, txfm1, txfm2, eob_half: emits the exported entry point
// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_8bpc_neon for 4x16 / 16x4.
//   - dct_dct only: expands idct_dc \w, \h, 1 first.
//   - w == 4:  r4 = inv_<txfm1>_8h_x4_neon (skipped for identity),
//              r5 = inv_<txfm2>_4h_x16_neon, r10 = eob_half.
//   - w == 16: r4 = inv_<txfm1>_4h_x16_neon (skipped for identity),
//              r5 = inv_<txfm2>_8h_x4_neon; r10 is not set because the 16x4
//              body never reads it.
//   - tail-branches to the identity or plain shared body.
.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 1
.endif
        push {r4-r11,lr}
        vpush {q4-q7}
.if \w == 4
.ifnc \txfm1, identity
        movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon
.endif
        movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon
        mov r10, #\eob_half
.else
.ifnc \txfm1, identity
        movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon
.endif
        movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
        b inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm

// def_fns_416 w, h: all 16 combinations. The last argument is the threshold
// placed in r10 for the 4x16 case: 8 when txfm2 is identity and txfm1 is not,
// 32 when txfm1 is identity and txfm2 is not, 29 otherwise.
.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4

// Shared body for the 16x8 entry points generated by def_fn_816.
// First pass: two slices (i = 0, 4) go through the driver in r9 into a
// 256-byte stack buffer; the second slice is skipped and zero-filled when
// r3 < r10.
function inv_txfm_add_16x8_neon
        sub_sp_align 256

.irp i, 0, 4
        add r6, sp, #(\i*16*2)
.if \i > 0
// Continuation of inv_txfm_add_16x8_neon, inside the first-pass .irp loop:
// for the second slice only, skip to the zero fill when r3 is below the
// threshold in r10.
        cmp r3, r10
        blt 1f
.endif
        add r7, r2, #(\i*2)
        mov r8, #8*2
        blx r9
.endr
        b 2f
1:
        // Clear the 128 bytes the skipped slice would have written.
        vmov.i16 q2, #0
        vmov.i16 q3, #0
.rept 4
        vst1.16 {q2, q3}, [r6, :128]!
.endr
2:

        // Second pass: two 8-column halves, read from the stack buffer with a
        // 32-byte stride, transformed via r5 and added at r0 + i.
.irp i, 0, 8
        add r7, sp, #(\i*2)
        mov r8, #32
.irp j, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16 {\j}, [r7, :128], r8
.endr
        blx r5

        add r6, r0, #(\i)
        load_add_store_8x8 r6, r7
.endr

        add_sp_align 256
        // Matches the push/vpush in the def_fn_816 entry points.
        vpop {q4-q7}
        pop {r4-r11,pc}
endfunc

// def_fn_816_base variant: emits inv_txfm_\variant\()add_8x16_neon, the shared
// body for the 8x16 entry points generated by def_fn_816.
//   variant  empty, or identity_ (no blx r4 and no shift in the first pass).
// First pass: two 8x8 blocks (i = 0, 8) are loaded from r2 with a 32-byte
// stride (zeroing behind the loads), passed through scale_input with
// d0[0] = 2896*8, transformed, transposed and stored contiguously in a
// 256-byte stack buffer. The second block is skipped and zero-filled when
// r3 < r10.
.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_neon
        sub_sp_align 256

.irp i, 0, 8
        add r6, sp, #(\i*8*2)
.if \i > 0
        cmp r3, r10
        blt 1f
.endif
        add r7, r2, #(\i*2)
        mov r8, #16*2

        vmov.i16 q2, #0
        movw r12, #2896*8
        vdup.16 d0, r12

.irp j, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16 {\j}, [r7, :128]
        vst1.16 {q2}, [r7, :128], r8
.endr
        scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
.ifc \variant, identity_
        // The identity shl #1 and downshift vrshr #1 cancel out
.else
        blx r4
.irp j, q8, q9, q10, q11, q12, q13, q14, q15
        vrshr.s16 \j, \j, #1
.endr
.endif
        transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
        vst1.16 {q8, q9}, [r6, :128]!
        vst1.16 {q10, q11}, [r6, :128]!
        vst1.16 {q12, q13}, [r6, :128]!
        vst1.16 {q14, q15}, [r6, :128]!
.endr
        b 2f
1:
        // Clear the 128 bytes the skipped second block would have written.
        vmov.i16 q2, #0
        vmov.i16 q3, #0
.rept 4
        vst1.16 {q2, q3}, [r6, :128]!
// Tail of inv_txfm_\variant\()add_8x16_neon inside def_fn_816_base: end of the
// zero-fill, then the second pass.
.endr
2:

.ifc \variant, identity_
        // The second pass is emitted once, by the plain expansion.
        b L(itx_8x16_epilog)
.else
L(itx_8x16_epilog):
        // Two 4-column strips, read from the stack buffer with a 16-byte
        // stride and added at r0 + i.
.irp i, 0, 4
        add r6, r0, #(\i)
        add r7, sp, #(\i*2)
        mov r8, #16
        bl inv_txfm_add_vert_4x16_neon
.endr

        add_sp_align 256
        // Matches the push/vpush in the def_fn_816 entry points.
        vpop {q4-q7}
        pop {r4-r11,pc}
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

/* Define symbols used in .if statement */
// def_fn_816 compares \txfm1 with identity inside a numeric .if expression, so
// the transform names must exist as assembler symbols.
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

// def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4: emits the exported entry
// point inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_8bpc_neon for 8x16 / 16x8.
//   - dct_dct only: expands idct_dc \w, \h, 1 first.
//   - w == 8:  r4 = inv_<txfm1>_8h_x8_neon (skipped for identity),
//              r5 = inv_<txfm2>_4h_x16_neon, r10 = eob_8x8.
//   - w == 16: r9 = scaling first-pass driver (identity variant or plain),
//              r4 = inv_<txfm1>_4h_x16_neon (non-identity only),
//              r5 = inv_<txfm2>_8h_x8_neon, r10 = eob_4x4.
//   - only 8x16 has a separate identity body; 16x8 always branches to
//     inv_txfm_add_16x8_neon.
.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc \w, \h, 1
.endif
        push {r4-r11,lr}
        vpush {q4-q7}
.if \w == 8
.ifnc \txfm1, identity
        movrel_local r4, inv_\txfm1\()_8h_x8_neon
.endif
        movrel_local r5, inv_\txfm2\()_4h_x16_neon
.else
.ifc \txfm1, identity
        movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon
.else
        movrel_local r4, inv_\txfm1\()_4h_x16_neon
        movrel_local r9, inv_txfm_horz_scale_16x4_neon
.endif
        movrel_local r5, inv_\txfm2\()_8h_x8_neon
.endif
.if \w == 8
        mov r10, #\eob_8x8
.else
        mov r10, #\eob_4x4
.endif
.if \w == 8 && \txfm1 == identity
        b inv_txfm_identity_add_\w\()x\h\()_neon
.else
        b inv_txfm_add_\w\()x\h\()_neon
.endif
endfunc
.endm

// def_fns_816 w, h: all 16 combinations with their two thresholds
// (eob_8x8 is used when w == 8, eob_4x4 when w == 16).
.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43, 10
def_fn_816 \w, \h, identity, identity, 43, 10
def_fn_816 \w, \h, dct, adst, 43, 10
def_fn_816 \w, \h, dct, flipadst, 43, 10
def_fn_816 \w, \h, dct, identity, 8, 4
def_fn_816 \w, \h, adst, dct, 43, 10
def_fn_816 \w, \h, adst, adst, 43, 10
def_fn_816 \w, \h, adst, flipadst, 43, 10
def_fn_816 \w, \h, flipadst, dct, 43, 10
def_fn_816 \w, \h, flipadst, adst, 43, 10
def_fn_816 \w, \h, flipadst, flipadst, 43, 10
def_fn_816 \w, \h, identity, dct, 64, 4
def_fn_816 \w, \h, adst, identity, 8, 4
def_fn_816 \w, \h, flipadst, identity, 8, 4
def_fn_816 \w, \h, identity, adst, 64, 4
def_fn_816 \w, \h, identity, flipadst, 64, 4
.endm

def_fns_816 8, 16
def_fns_816 16, 8

// Start of inv_dct32_odd_4h_x16_neon; the body continues past this span.
// Loads q0/q1 from 32 bytes past the start of idct_coeffs.
function inv_dct32_odd_4h_x16_neon, export=1
        movrel_local r12, idct_coeffs, 2*16
        vld1.16 {q0, q1}, [r12, :128]
sub r12, r12, #2*16 vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a vqrshrn.s32 d16, q2, #12 // t16a vqrshrn.s32 d31, q3, #12 // t31a vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a vqrshrn.s32 d24, q4, #12 // t17a vqrshrn.s32 d23, q2, #12 // t30a vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a vqrshrn.s32 d20, q3, #12 // t18a vqrshrn.s32 d27, q4, #12 // t29a vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a vqrshrn.s32 d28, q2, #12 // t19a vqrshrn.s32 d19, q3, #12 // t28a vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a vqrshrn.s32 d18, q4, #12 // t20a vqrshrn.s32 d29, q2, #12 // t27a vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a vqrshrn.s32 d26, q3, #12 // t21a vqrshrn.s32 d21, q4, #12 // t26a vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a vqrshrn.s32 d22, q2, #12 // t22a vqrshrn.s32 d25, q3, #12 // t25a vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a vqrshrn.s32 d30, q4, #12 // t23a vqrshrn.s32 d17, q2, #12 // t24a vld1.16 {q0}, [r12, :128] vqsub.s16 d2, d16, d24 // t17 vqadd.s16 d16, d16, d24 // t16 vqsub.s16 d3, d31, d23 // t30 vqadd.s16 d31, d31, d23 // t31 vqsub.s16 d24, d28, d20 // t18 vqadd.s16 d28, d28, d20 // t19 vqadd.s16 d23, d18, d26 // t20 vqsub.s16 d18, d18, d26 // t21 vqsub.s16 d20, d30, d22 // t22 vqadd.s16 d30, d30, d22 // t23 vqadd.s16 d26, d17, d25 // t24 vqsub.s16 d17, d17, d25 // t25 vqsub.s16 d22, d29, d21 // t26 vqadd.s16 d29, d29, d21 // t27 vqadd.s16 d25, d19, d27 // t28 vqsub.s16 d19, d19, d27 // t29 vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a vmull_vmlal q4, d19, d24, d1[1], 
d1[0] // -> t18a vqrshrn.s32 d21, q2, #12 // t17a vqrshrn.s32 d27, q3, #12 // t30a vneg.s32 q4, q4 // -> t18a vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a vqrshrn.s32 d19, q4, #12 // t18a vqrshrn.s32 d24, q1, #12 // t29a vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a vqrshrn.s32 d22, q2, #12 // t21a vqrshrn.s32 d18, q3, #12 // t26a vneg.s32 q4, q4 // -> t22a vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a vqrshrn.s32 d17, q4, #12 // t22a vqrshrn.s32 d20, q1, #12 // t25a vqsub.s16 d2, d27, d24 // t29 vqadd.s16 d27, d27, d24 // t30 vqsub.s16 d3, d21, d19 // t18 vqadd.s16 d21, d21, d19 // t17 vqsub.s16 d24, d16, d28 // t19a vqadd.s16 d16, d16, d28 // t16a vqsub.s16 d19, d30, d23 // t20a vqadd.s16 d30, d30, d23 // t23a vqsub.s16 d28, d17, d22 // t21 vqadd.s16 d17, d17, d22 // t22 vqadd.s16 d23, d26, d29 // t24a vqsub.s16 d26, d26, d29 // t27a vqadd.s16 d22, d20, d18 // t25 vqsub.s16 d20, d20, d18 // t26 vqsub.s16 d29, d31, d25 // t28a vqadd.s16 d31, d31, d25 // t31a vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 vqrshrn.s32 d18, q2, #12 // t18a vqrshrn.s32 d25, q3, #12 // t29a vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 vqrshrn.s32 d29, q4, #12 // t19 vqrshrn.s32 d24, q1, #12 // t28 vneg.s32 q2, q2 // -> t20 vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a vqrshrn.s32 d26, q2, #12 // t20 vqrshrn.s32 d19, q3, #12 // t27 vneg.s32 q4, q4 // -> t21a vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a vqrshrn.s32 d20, q4, #12 // t21a vqrshrn.s32 d28, q1, #12 // t26a vqsub.s16 d2, d16, d30 // t23 vqadd.s16 d16, d16, d30 // t16 = out16 vqsub.s16 d3, d31, d23 // t24 vqadd.s16 d31, d31, d23 // t31 = out31 vqsub.s16 d23, d21, d17 // t22a vqadd.s16 d17, d21, d17 // t17a = 
out17 vqadd.s16 d30, d27, d22 // t30a = out30 vqsub.s16 d21, d27, d22 // t25a vqsub.s16 d27, d18, d20 // t21 vqadd.s16 d18, d18, d20 // t18 = out18 vqadd.s16 d4, d29, d26 // t19a = out19 vqsub.s16 d26, d29, d26 // t20a vqadd.s16 d29, d25, d28 // t29 = out29 vqsub.s16 d25, d25, d28 // t26 vqadd.s16 d28, d24, d19 // t28a = out28 vqsub.s16 d24, d24, d19 // t27a vmov d19, d4 // out19 vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 vqrshrn.s32 d20, q2, #12 // t20 vqrshrn.s32 d22, q3, #12 // t27 vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 vqrshrn.s32 d26, q2, #12 // t26a vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 vqrshrn.s32 d21, q3, #12 // t21a vqrshrn.s32 d22, q12, #12 // t22 vqrshrn.s32 d25, q2, #12 // t25 vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a vqrshrn.s32 d23, q2, #12 // t23a vqrshrn.s32 d24, q3, #12 // t24a bx lr endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x4_neon push {lr} vmov.i16 d7, #0 lsl r8, r8, #1 .if \scale movw r12, #2896*8 vdup.16 d0, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64] vst1.16 {d7}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .if \scale scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_4h_x16_neon transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 transpose_4x4h q14, q15, d28, d29, d30, d31 .macro store1 r0, r1, r2, r3 vst1.16 {\r0}, [r6, :64]! vst1.16 {\r1}, [r6, :64]! vst1.16 {\r2}, [r6, :64]! vst1.16 {\r3}, [r6, :64]! 
// Continuation of the def_horz_32 macro, inside the nested store1 helper.
// After its four 8-byte stores, store1 skips another 32 bytes, so each
// expansion advances r6 by 64 bytes and leaves the second half of every
// 64-byte row untouched for now.
        add r6, r6, #32
.endm
        store1 d16, d20, d24, d28
        store1 d17, d21, d25, d29
        store1 d18, d22, d26, d30
        store1 d19, d23, d27, d31
.purgem store1
        // Rewind r6 to the first of the four rows just written.
        sub r6, r6, #64*4

        // Second load pass: d16-d31 from [r7] with stride r8, zeroing behind
        // the loads. These values feed inv_dct32_odd_4h_x16_neon.
        vmov.i16 d7, #0
.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vld1.16 {\i}, [r7, :64]
        vst1.16 {d7}, [r7, :64], r8
.endr
.if \scale
        // This relies on the fact that the idct also leaves the right coeff in d0[1]
        scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15
.endif
        bl inv_dct32_odd_4h_x16_neon
        // The transposes are applied to the registers in descending order.
        transpose_4x4h q15, q14, d31, d30, d29, d28
        transpose_4x4h q13, q12, d27, d26, d25, d24
        transpose_4x4h q11, q10, d23, d22, d21, d20
        transpose_4x4h q9, q8, d19, d18, d17, d16
// store2 r0, r1, r2, r3, shift: combines one stored row with the odd-half
// results passed in \r0-\r3.
//   - q0/q1 = the 32 bytes at [r6] (written earlier by store1)
//   - d0-d3 = sums with \r0-\r3; d7, d6, d5, d4 = the matching differences,
//     placed in reverse register order
//   - vrev64.16 reverses the lanes of each difference register, so q2/q3 hold
//     the differences fully mirrored
//   - all four q registers get a rounding right shift by \shift and are
//     written back as one 64-byte row, advancing r6.
.macro store2 r0, r1, r2, r3, shift
        vld1.16 {q0, q1}, [r6, :128]
        vqsub.s16 d7, d0, \r0
        vqadd.s16 d0, d0, \r0
        vqsub.s16 d6, d1, \r1
        vqadd.s16 d1, d1, \r1
        vqsub.s16 d5, d2, \r2
        vqadd.s16 d2, d2, \r2
        vqsub.s16 d4, d3, \r3
        vqadd.s16 d3, d3, \r3
        vrev64.16 q2, q2
        vrev64.16 q3, q3
        vrshr.s16 q0, q0, #\shift
        vrshr.s16 q1, q1, #\shift
        vrshr.s16 q2, q2, #\shift
        vrshr.s16 q3, q3, #\shift
        vst1.16 {q0, q1}, [r6, :128]!
        vst1.16 {q2, q3}, [r6, :128]!
// Tail of the def_horz_32 macro: closes the nested store2 helper, expands it
// once per output row, then returns.
.endm

        store2 d31, d27, d23, d19, \shift
        store2 d30, d26, d22, d18, \shift
        store2 d29, d25, d21, d17, \shift
        store2 d28, d24, d20, d16, \shift
.purgem store2
        pop {pc}
endfunc
.endm

// Emits inv_txfm_horz_dct_32x4_neon (no scaling, shift 2) and
// inv_txfm_horz_scale_dct_32x4_neon (scaling, shift 1).
def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale

// Second-pass helper for one 4-column strip of a 32-row block.
//   r7  intermediate buffer, r8 = its row stride in bytes (doubled on entry so
//       that one step skips every other row)
//   r6  destination, stepped with register r1
// Runs inv_dct_4h_x16_neon on every second row starting at row 0 and stores
// the results back in place, runs inv_dct32_odd_4h_x16_neon on the rows in
// between, then combines both halves, shifts, adds to the destination and
// stores.
// NOTE(review): r1 is presumably the destination stride - confirm with callers.
function inv_txfm_add_vert_dct_4x32_neon
        push {r10-r11,lr}
        lsl r8, r8, #1

.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vld1.16 {\i}, [r7, :64], r8
.endr
        // Rewind the 16 strided loads.
        sub r7, r7, r8, lsl #4

        bl inv_dct_4h_x16_neon

.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vst1.16 {\i}, [r7, :64], r8
.endr
        // Rewind, then step half a (doubled) stride to reach the rows between.
        sub r7, r7, r8, lsl #4
        add r7, r7, r8, lsr #1

.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        vld1.16 {\i}, [r7, :64], r8
.endr
        // Back to the first row, where the stored 16-point results start.
        sub r7, r7, r8, lsl #4
        sub r7, r7, r8, lsr #1
        bl inv_dct32_odd_4h_x16_neon

        // r9 = -r8 lets the second group of combines walk the stored rows
        // backwards; r10 is the destination read pointer, r6 the write pointer.
        neg r9, r8
        mov r10, r6
// combine r0, r1, r2, r3, op, stride: handles four output rows.
// Loads four stored rows from [r7] (stepping by \stride), applies \op with the
// odd-half registers \r0-\r3, rounds with a right shift by 4, widens and adds
// four 4-byte destination rows read through r10, saturates to unsigned 8 bits
// and writes them through r6.
// Note: the bare r1 used as post-increment below is the CPU register r1, not
// the macro argument \r1.
.macro combine r0, r1, r2, r3, op, stride
        vld1.16 {d4}, [r7, :64], \stride
        vld1.32 {d2[0]}, [r10, :32], r1
        vld1.16 {d5}, [r7, :64], \stride
        vld1.32 {d2[1]}, [r10, :32], r1
        \op\().s16 d4, d4, \r0
        vld1.16 {d6}, [r7, :64], \stride
        vld1.32 {d3[0]}, [r10, :32], r1
        \op\().s16 d5, d5, \r1
        vld1.32 {d3[1]}, [r10, :32], r1
        vrshr.s16 q2, q2, #4
        \op\().s16 d6, d6, \r2
        vld1.16 {d7}, [r7, :64], \stride
        vaddw.u8 q2, q2, d2
        \op\().s16 d7, d7, \r3
        vqmovun.s16 d2, q2
        vrshr.s16 q3, q3, #4
        vst1.32 {d2[0]}, [r6, :32], r1
        vaddw.u8 q3, q3, d3
        vst1.32 {d2[1]}, [r6, :32], r1
        vqmovun.s16 d3, q3
        vst1.32 {d3[0]}, [r6, :32], r1
        vst1.32 {d3[1]}, [r6, :32], r1
.endm
        // First 16 output rows: stored rows walked forwards, sums with the
        // odd-half registers taken from d31 downwards.
        combine d31, d30, d29, d28, vqadd, r8
        combine d27, d26, d25, d24, vqadd, r8
        combine d23, d22, d21, d20, vqadd, r8
        combine d19, d18, d17, d16, vqadd, r8
        // Step back onto the last stored row, then walk backwards for the
        // remaining 16 rows, subtracting d16 upwards.
        sub r7, r7, r8
        combine d16, d17, d18, d19, vqsub, r9
        combine d20, d21, d22, d23, vqsub, r9
        combine d24, d25, d26, d27, vqsub, r9
        combine d28, d29, d30, d31, vqsub, r9
.purgem combine

        pop {r10-r11,pc}
endfunc

// Threshold tables compared against r3 by the 32-wide/32-high functions below.
const eob_32x32
        .short 10, 36, 78, 136, 210, 300, 406, 1024
endconst

const eob_16x32
        .short 10, 36, 78, 151, 215, 279, 343, 512
endconst

const eob_16x32_shortside
        .short 10, 36, 78, 512
endconst

const eob_8x32
        // Contrary to the others, this one is only ever used in increments of 8x8
        .short 43, 107, 171, 256
endconst

// 32x32 identity/identity: no arithmetic transform, processed in 8x8 tiles.
// Each tile is loaded from r2 (row stride r6 = 64 bytes, zeroed behind the
// loads), transposed and added to the destination by load_add_store_8x8 with
// shiftbits=2.
// Both table pointers start at byte offset 2 and advance by 4 bytes, so only
// every second eob_32x32 entry (36, 136, 300, 1024) is compared with r3:
// r4 drives the inner tile loop (label 2), r5 the outer one (label 1).
function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
        push {r4-r7,lr}
        vmov.i16 q0, #0
        movrel_local r5, eob_32x32, 2

        mov r6, #2*32
1:
        // r12 counts the distance covered by the inner loop, 8 per tile.
        mov r12, #0
        movrel_local r4, eob_32x32, 2
2:
        add r12, r12, #8
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16 {\i}, [r2, :128]
        vst1.16 {q0}, [r2, :128], r6
.endr
        transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30

        load_add_store_8x8 r0, r7, shiftbits=2
        ldrh lr, [r4], #4
        // Next tile to the right: back up 8 rows, forward 8 bytes.
        sub r0, r0, r1, lsl #3
        cmp r3, lr
        add r0, r0, #8
        bge 2b

        ldrh lr, [r5], #4
        cmp r3, lr
        blt 9f

        // Next band: destination back to the left edge and 8 rows down;
        // r2 rewound by r6*r12 bytes and advanced by 8 coefficients.
        sub r0, r0, r12
        add r0, r0, r1, lsl #3
        mls r2, r6, r12, r2
        add r2, r2, #2*8
        b 1b
9:
        pop {r4-r7,pc}
endfunc

// shift_8_regs op, shift: applies "\op reg, reg, #\shift" to each of q8-q15.
.macro shift_8_regs op, shift
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        \op \i, \i, #\shift
.endr
.endm

// def_identity_1632 w, h, wshort, hshort: emits the identity/identity entry
// point for 16x32 or 32x16. Same tile loop as the 32x32 version, plus:
//   - d0[0] = 2896*8 for scale_input, d0[1] = 2*(5793-4096)*8 for the
//     identity multiply,
//   - 16x32: identity_8x8_shift1, then shiftbits=2,
//   - 32x16: saturating shift left by 1, identity_8x8, then shiftbits=4,
//   - wshort / hshort select eob_16x32 or eob_16x32_shortside for the inner
//     (r4) and outer (r5) loop respectively.
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        push {r4-r7,lr}
        movw r6, #2896*8
        movw r7, #2*(5793-4096)*8
        vdup.i16 d0, r6
        movrel_local r5, eob_16x32\hshort, 2
        vmov.16 d0[1], r7

        mov r6, #2*\h
1:
        mov r12, #0
        movrel_local r4, eob_16x32\wshort, 2
2:
        vmov.i16 q1, #0
        add r12, r12, #8
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16 {\i}, [r2, :128]
        vst1.16 {q1}, [r2, :128], r6
.endr
        scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15

.if \w == 16
        // 16x32
        identity_8x8_shift1 d0[1]
.else
        // 32x16
        shift_8_regs vqshl.s16, 1
        identity_8x8 d0[1]
.endif

        transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30

.if \w == 16
        load_add_store_8x8 r0, r7, shiftbits=2
.else
        load_add_store_8x8 r0, r7, shiftbits=4
.endif
        ldrh lr, [r4], #4
        sub r0, r0, r1, lsl #3
        cmp r3, lr
        add r0, r0, #8
        bge 2b

        ldrh lr, [r5], #4
        cmp r3, lr
        blt 9f

        sub r0, r0, r12
        add r0, r0, r1, lsl #3
        mls r2, r6, r12, r2
        add r2, r2, #2*8
        b 1b
9:
        pop {r4-r7,pc}
endfunc
.endm

// The empty argument selects the full-length eob_16x32 table.
def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside

// def_identity_832 w, h: emits the identity/identity entry point for 8x32 or
// 32x8. A single loop walks 8x8 tiles, comparing r3 with successive eob_8x32
// entries; the tile that fails the comparison is still added before exit.
//   - 8x32: rounding shift right by 1, shiftbits=2, next tile by moving r2,
//   - 32x8: no pre-shift, shiftbits=3, next tile by moving r0 8 bytes right.
.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        push {r4-r5,lr}
        vmov.i16 q0, #0
        movrel_local r4, eob_8x32

        mov r12, #2*\h
1:
        ldrh lr, [r4], #2
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16 {\i}, [r2, :128]
        vst1.16 {q0}, [r2, :128], r12
.endr

.if \w == 8
        // 8x32
        shift_8_regs vrshr.s16, 1
.endif

        transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30

        // The blt after load_add_store_8x8 uses the flags set by this compare.
        // NOTE(review): relies on load_add_store_8x8 not modifying the flags -
        // the macro is defined elsewhere.
        cmp r3, lr
.if \w == 8
        load_add_store_8x8 r0, r5, shiftbits=2
.else
        load_add_store_8x8 r0, r5, shiftbits=3
.endif

        blt 9f
.if \w == 8
        sub r2, r2, r12, lsl #3
        add r2, r2, #2*8
.else
        sub r0, r0, r1, lsl #3
        add r0, r0, #8
.endif
        b 1b

9:
        pop {r4-r5,pc}
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

// 32x32 dct/dct entry point.
// First pass: up to eight 4-row slices go through inv_txfm_horz_dct_32x4_neon
// into a 2048-byte stack buffer. Before every slice after the first, r3 is
// compared with the next eob_32x32 entry; once r3 is below it, the remaining
// rows (count kept in r8) are zero-filled instead.
function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
        idct_dc 32, 32, 2

        push {r4-r11,lr}
        vpush {q4}
        sub_sp_align 2048
        movrel_local r10, eob_32x32
        ldrh r11, [r10], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add r6, sp, #(\i*32*2)
.if \i > 0
        mov r8, #(32 - \i)
        cmp r3, r11
        blt 1f
.if \i < 28
        ldrh r11, [r10], #2
.endif
.endif
        add r7, r2, #(\i*2)
        mov r8, #32*2
        bl inv_txfm_horz_dct_32x4_neon
.endr
        b 3f

1:
        vmov.i16 q2, #0
        vmov.i16 q3, #0
2:
        // Each iteration clears 4 * 32 bytes, i.e. two 32-coefficient rows.
        subs r8, r8, #2
.rept 4
        vst1.16 {q2, q3}, [r6, :128]!
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 2048 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 idct_dc 16, 32, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, sp, #(\i*16*2) add r7, r2, #(\i*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif mov r8, #2*32 bl inv_txfm_horz_scale_16x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #16*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 1024 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 idct_dc 32, 16, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r5, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12 add r6, sp, #(\i*32*2) add r7, r2, #(\i*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 12 ldrh r11, [r10], #2 .endif .endif mov r8, #2*16 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 1024 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 idct_dc 8, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 movrel_local r10, eob_8x32 mov r8, #2*32 mov r9, #32 mov r6, sp 1: vmov.i16 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r2, :128] vst1.16 {q0}, [r2, :128], r8 .endr ldrh r11, [r10], #2 sub r2, r2, r8, lsl #3 sub r9, r9, #8 add r2, r2, #2*8 bl inv_dct_8h_x8_neon .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #2 .endr transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 vst1.16 {q8, q9}, [r6, :128]! cmp r3, r11 vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r6, :128]! vst1.16 {q14, q15}, [r6, :128]! bge 1b cmp r9, #0 beq 3f vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r9, r9, #8 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #8*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 idct_dc 32, 8, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 .irp i, 0, 4 add r6, sp, #(\i*32*2) add r7, r2, #(\i*2) .if \i > 0 cmp r3, #10 blt 1f .endif mov r8, #8*2 bl inv_txfm_horz_dct_32x4_neon .endr b 2f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 .rept 8 vst1.16 {q2, q3}, [r6, :128]! 
.endr 2: mov r8, #2*32 mov r9, #0 1: add r6, r0, r9 add r7, sp, r9, lsl #1 // #(\i*2) .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r7, :128], r8 .endr add r9, r9, #8 bl inv_dct_8h_x8_neon cmp r9, #32 load_add_store_8x8 r6, r7 blt 1b add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld1.16 {d0, d1, d2}, [r12, :64]! vqrdmulh.s16 d23, d16, d0[1] // t63a vqrdmulh.s16 d16, d16, d0[0] // t32a vqrdmulh.s16 d22, d17, d0[2] // t62a vqrdmulh.s16 d17, d17, d0[3] // t33a vqrdmulh.s16 d21, d18, d1[1] // t61a vqrdmulh.s16 d18, d18, d1[0] // t34a vqrdmulh.s16 d20, d19, d1[2] // t60a vqrdmulh.s16 d19, d19, d1[3] // t35a vqadd.s16 d24, d16, d17 // t32 vqsub.s16 d25, d16, d17 // t33 vqsub.s16 d26, d19, d18 // t34 vqadd.s16 d27, d19, d18 // t35 vqadd.s16 d28, d20, d21 // t60 vqsub.s16 d29, d20, d21 // t61 vqsub.s16 d30, d23, d22 // t62 vqadd.s16 d31, d23, d22 // t63 vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a vneg.s32 q2, q2 // t34a vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a vqrshrn.s32 d26, q2, #12 // t34a vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a vqrshrn.s32 d29, q3, #12 // t61a vqrshrn.s32 d25, q4, #12 // t33a vqrshrn.s32 d30, q2, #12 // t62a vqadd.s16 d16, d24, d27 // t32a vqsub.s16 d19, d24, d27 // t35a vqadd.s16 d17, d25, d26 // t33 vqsub.s16 d18, d25, d26 // t34 vqsub.s16 d20, d31, d28 // t60a vqadd.s16 d23, d31, d28 // t63a vqsub.s16 d21, d30, d29 // t61 vqadd.s16 d22, d30, d29 // t62 vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 vqrshrn.s32 d21, q2, #12 // t61a vqrshrn.s32 d18, q3, #12 // t34a vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 vqrshrn.s32 d20, q4, #12 
// t60 vqrshrn.s32 d19, q2, #12 // t35 vst1.16 {d16, d17, d18, d19}, [r6, :128]! vst1.16 {d20, d21, d22, d23}, [r6, :128]! bx lr endfunc function inv_dct64_step2_neon movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a vldr d16, [r6, #2*4*0] // t32a vldr d17, [r9, #2*4*8] // t39a vldr d18, [r9, #2*4*0] // t63a vldr d19, [r6, #2*4*8] // t56a vldr d20, [r6, #2*4*16] // t40a vldr d21, [r9, #2*4*24] // t47a vldr d22, [r9, #2*4*16] // t55a vldr d23, [r6, #2*4*24] // t48a vqadd.s16 d24, d16, d17 // t32 vqsub.s16 d25, d16, d17 // t39 vqadd.s16 d26, d18, d19 // t63 vqsub.s16 d27, d18, d19 // t56 vqsub.s16 d28, d21, d20 // t40 vqadd.s16 d29, d21, d20 // t47 vqadd.s16 d30, d23, d22 // t48 vqsub.s16 d31, d23, d22 // t55 vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a vqrshrn.s32 d25, q2, #12 // t56a vqrshrn.s32 d27, q3, #12 // t39a vneg.s32 q4, q4 // t40a vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a vqrshrn.s32 d31, q4, #12 // t40a vqrshrn.s32 d28, q2, #12 // t55a vqadd.s16 d16, d24, d29 // t32a vqsub.s16 d19, d24, d29 // t47a vqadd.s16 d17, d27, d31 // t39 vqsub.s16 d18, d27, d31 // t40 vqsub.s16 d20, d26, d30 // t48a vqadd.s16 d23, d26, d30 // t63a vqsub.s16 d21, d25, d28 // t55 vqadd.s16 d22, d25, d28 // t56 vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 vqrshrn.s32 d18, q2, #12 // t40a vqrshrn.s32 d21, q3, #12 // t55a vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 vqrshrn.s32 d19, q4, #12 // t47 vqrshrn.s32 d20, q2, #12 // t48 vstr d16, [r6, #2*4*0] // t32a vstr d17, [r9, #2*4*0] // t39 vstr d18, [r6, #2*4*8] // t40a vstr d19, [r9, #2*4*8] // t47 vstr d20, [r6, #2*4*16] // t48 vstr d21, [r9, #2*4*16] // t55a vstr d22, [r6, #2*4*24] 
// t56 vstr d23, [r9, #2*4*24] // t63a add r6, r6, #2*4 sub r9, r9, #2*4 cmp r6, r9 blt 1b bx lr endfunc .macro load8 src, strd, zero, clear .irp i, d16, d17, d18, d19, d20, d21, d22, d23 .if \clear vld1.16 {\i}, [\src, :64] vst1.16 {\zero}, [\src, :64], \strd .else vld1.16 {\i}, [\src, :64], \strd .endif .endr .endm .macro store16 dst vst1.16 {q8, q9}, [\dst, :128]! vst1.16 {q10, q11}, [\dst, :128]! vst1.16 {q12, q13}, [\dst, :128]! vst1.16 {q14, q15}, [\dst, :128]! .endm .macro clear_upper8 .irp i, q12, q13, q14, q15 vmov.i16 \i, #0 .endr .endm .macro vmov_if reg, val, cond .if \cond vmov.i16 \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond movw \gpr, \val vdup.16 \reg, \gpr .endif .endm .macro vst1_if regs, dst, dstalign, cond .if \cond vst1.16 \regs, \dst, \dstalign .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 mov r6, sp push {r10-r11,lr} lsl r8, r8, #2 movdup_if d0, r12, #2896*8, \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 add r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct_4h_x16_neon store16 r6 movdup_if d0, r12, #2896*8, \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 lsr r8, r8, #1 sub r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct32_odd_4h_x16_neon add r10, r6, #8*15 sub r6, r6, #8*16 mov r9, #-8 .macro store_addsub r0, r1, r2, r3 vld1.16 {d2}, [r6, :64]! vld1.16 {d3}, [r6, :64]! vqadd.s16 d6, d2, \r0 vqsub.s16 \r0, d2, \r0 vld1.16 {d4}, [r6, :64]! vqadd.s16 d7, d3, \r1 vqsub.s16 \r1, d3, \r1 vld1.16 {d5}, [r6, :64]! vqadd.s16 d2, d4, \r2 sub r6, r6, #8*4 vqsub.s16 \r2, d4, \r2 vst1.16 {d6}, [r6, :64]! vst1.16 {\r0}, [r10, :64], r9 vqadd.s16 d3, d5, \r3 vqsub.s16 \r3, d5, \r3 vst1.16 {d7}, [r6, :64]! 
vst1.16 {\r1}, [r10, :64], r9 vst1.16 {d2}, [r6, :64]! vst1.16 {\r2}, [r10, :64], r9 vst1.16 {d3}, [r6, :64]! vst1.16 {\r3}, [r10, :64], r9 .endm store_addsub d31, d30, d29, d28 store_addsub d27, d26, d25, d24 store_addsub d23, d22, d21, d20 store_addsub d19, d18, d17, d16 .purgem store_addsub add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 add r10, r7, r8, lsl #3 // offset 8 sub r9, r9, r8 // offset 15 sub r11, r10, r8 // offset 7 vld1.16 {d16}, [r7, :64] // in1 (offset 0) vld1.16 {d17}, [r9, :64] // in31 (offset 15) vld1.16 {d18}, [r10, :64] // in17 (offset 8) vld1.16 {d19}, [r11, :64] // in15 (offset 7) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear add r7, r7, r8, lsl #2 // offset 4 sub r9, r9, r8, lsl #2 // offset 11 sub r10, r7, r8 // offset 3 add r11, r9, r8 // offset 12 vld1.16 {d16}, [r10, :64] // in7 (offset 3) vld1.16 {d17}, [r11, :64] // in25 (offset 12) vld1.16 {d18}, [r9, :64] // in23 (offset 11) vld1.16 {d19}, [r7, :64] // in9 (offset 4) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear sub r10, r10, r8, lsl #1 // offset 1 sub r9, r9, r8, lsl #1 // offset 9 add r10, r10, r8 // offset 2 add r9, r9, r8 // offset 10 add r7, r7, r8 // offset 5 add r11, r11, r8 // offset 13 vld1.16 d16, [r10, :64] // in5 (offset 2) vld1.16 d17, [r11, :64] // in27 (offset 13) vld1.16 d18, [r9, :64] // in21 (offset 10) vld1.16 d19, [r7, :64] // in11 (offset 5) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, 
d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear sub r10, r10, r8 // offset 1 sub r9, r9, r8 // offset 9 add r11, r11, r8 // offset 14 add r7, r7, r8 // offset 6 vld1.16 d16, [r10, :64] // in3 (offset 1) vld1.16 d17, [r11, :64] // in29 (offset 14) vld1.16 d18, [r9, :64] // in19 (offset 9) vld1.16 d19, [r7, :64] // in13 (offset 6) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon sub r6, r6, #2*4*32 add r9, r6, #2*4*7 bl inv_dct64_step2_neon pop {r10-r11,pc} endfunc .endm def_dct64_func def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x4_neon vdup.16 q3, r9 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, #2*56 push {r10-r11,lr} mov r10, #2*64 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q15, q14, d31, d30, d29, d28 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q13, q12, d27, d26, d25, d24 .macro store_addsub src0, src1, src2, src3 vqsub.s16 d3, \src0, \src1 vqsub.s16 d2, \src2, \src3 vqadd.s16 d0, \src0, \src1 vqadd.s16 d1, \src2, \src3 vrshl.s16 q1, q1, q3 vrshl.s16 q0, q0, q3 vrev64.16 q1, q1 vst1.16 {q0}, [r6, :128], r10 vst1.16 {q1}, [r9, :128], r10 .endm store_addsub d16, d31, d20, d27 store_addsub d17, d30, d21, d26 store_addsub d18, d29, d22, d25 store_addsub d19, d28, d23, d24 .purgem store_addsub sub r6, r6, r10, lsl #2 sub r9, r9, r10, lsl #2 add r6, r6, #16 sub r9, r9, #16 cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_vert_dct_4x64_neon lsl r8, r8, #1 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, r1, lsl #6 sub r9, r9, r1 push {r10-r11,lr} neg r10, r1 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 .macro add_dest_addsub src0, src1, src2, src3 vld1.32 {d0[0]}, [r6, :32], r1 vld1.32 {d1[0]}, [r9, :32], r10 vqadd.s16 d4, \src0, \src1 vld1.32 {d0[1]}, [r6, :32] vqadd.s16 d5, \src2, \src3 vld1.32 {d1[1]}, [r9, :32] vqsub.s16 d6, \src0, \src1 vqsub.s16 d7, \src2, \src3 sub r6, r6, r1 sub r9, r9, r10 vrshr.s16 q2, q2, #4 vrshr.s16 q3, q3, #4 vaddw.u8 q2, q2, d0 vaddw.u8 q3, q3, d1 vqmovun.s16 d0, q2 vqmovun.s16 d1, q3 vst1.32 {d0[0]}, [r6, :32], r1 vst1.32 {d1[0]}, [r9, :32], r10 vst1.32 {d0[1]}, [r6, :32], r1 vst1.32 {d1[1]}, [r9, :32], r10 .endm add_dest_addsub d16, d31, d17, d30 add_dest_addsub d18, d29, d19, d28 add_dest_addsub d20, d27, d21, d26 add_dest_addsub d22, d25, d23, d24 .purgem add_dest_addsub cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_clear_4h_x64_neon add r6, r5, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_clear_scale_4h_x64_neon add r6, r5, #(\i*64*2) mov r9, #-1 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i) add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 32*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r7, r5, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 32*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 64*16*2+64*4*2 add r4, sp, #64*4*2 movrel_local r10, eob_16x32 .irp i, 0, 4, 8, 12 add r6, r4, #(\i*64*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #16*2 bl inv_txfm_dct_clear_4h_x64_neon add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 12 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: movrel_local r5, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i) add r7, r4, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 64*16*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 16*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*16*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_16x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r7, r5, #(\i*2) mov r8, #16*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 16*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/itx16.S000066400000000000000000003616211517466257200227460ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); // Most of the functions use the following register layout: // r0-r3 external parameters // r4 function pointer to first transform // r5 function pointer to second transform // r6 output parameter for helper function // r7 input parameter for helper function // r8 input stride for helper function // r9 scratch variable for helper functions // r10-r11 pointer to list of eob thresholds, eob threshold value, // scratch variables within helper functions (backed up) // The SIMD registers most often use the following layout: // d0-d3 multiplication coefficients // d4-d7 scratch registers // d8-d15 unused in some transforms, used for scratch registers in others // d16-d31 inputs/outputs of transforms // Potential further optimizations that are left unimplemented for now: // - Trying to keep multiplication coefficients in registers across multiple // transform functions. (The register layout is designed to potentially // allow this.) // - Use a simplified version of the transforms themselves for cases where // we know a significant number of inputs are zero. E.g. if the eob value // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. // A macro for cases where a thumb mov can express the constant in one // instruction, while arm mode requires a separate movw+movt pair.
.macro mov_const reg, val #if CONFIG_THUMB mov.w \reg, #\val #else movw \reg, #((\val) & 0xffff) movt \reg, #(((\val) >> 16) & 0xffff) #endif .endm const idct_coeffs, align=4 // idct4 .int 2896, 2896*8*(1<<16), 1567, 3784 // idct8 .int 799, 4017, 3406, 2276 // idct16 .int 401, 4076, 3166, 2598 .int 1931, 3612, 3920, 1189 // idct32 .int 201, 4091, 3035, 2751 .int 1751, 3703, 3857, 1380 .int 995, 3973, 3513, 2106 .int 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) .int 4076, 401, 4017, 799 .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) .int -3166, -2598, -799, -4017 .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) .int 3612, 1931, 2276, 3406 .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) .int -3920, -1189, -3406, -2276 endconst const iadst4_coeffs, align=4 .int 1321, 3803, 2482, 3344 endconst const iadst8_coeffs, align=4 .int 4076, 401, 3612, 1931 .int 2598, 3166, 1189, 3920 // idct_coeffs .int 2896, 0, 1567, 3784 endconst const iadst16_coeffs, align=4 .int 4091, 201, 3973, 995 .int 3703, 1751, 3290, 2440 .int 2751, 3035, 2106, 3513 .int 1380, 3857, 601, 4052 endconst .macro vmul_vmla d0, s0, s1, c0, c1 vmul.i32 \d0, \s0, \c0 vmla.i32 \d0, \s1, \c1 .endm .macro vmul_vmls d0, s0, s1, c0, c1 vmul.i32 \d0, \s0, \c0 vmls.i32 \d0, \s1, \c1 .endm .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 vqrdmulh.s32 \r0, \r0, \c vqrdmulh.s32 \r1, \r1, \c .ifnb \r2 vqrdmulh.s32 \r2, \r2, \c vqrdmulh.s32 \r3, \r3, \c .endif .ifnb \r4 vqrdmulh.s32 \r4, \r4, \c vqrdmulh.s32 \r5, \r5, \c vqrdmulh.s32 \r6, \r6, \c vqrdmulh.s32 \r7, \r7, \c .endif .endm .macro load_add_store 
load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 .ifnb \load vld1.16 {\load}, [\src, :128], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \addsrc vqadd.s16 \adddst, \adddst, \addsrc .endif .ifnb \max vmax.s16 \max, \max, q6 .endif .ifnb \min vmin.s16 \min, \min, q7 .endif .ifnb \store vst1.16 {\store}, [\dst, :128], r1 .endif .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff load_add_store q0, q8, , , , , , \dst, \src, \shiftbits load_add_store q1, q9, , , , , , \dst, \src, \shiftbits load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits load_add_store , , , , , q15, q14, \dst, \src, \shiftbits load_add_store , , , , , , q15, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff load_add_store q0, q8, , , , , , \dst, \src, \shiftbits load_add_store q1, q9, , , , , , \dst, \src, \shiftbits load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits load_add_store , , , , , q11, q10, \dst, \src, \shiftbits load_add_store , , , , , , q11, \dst, \src, \shiftbits .endm .macro load_add_store4 load1, load2, 
shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4 .ifnb \load1 vld1.16 {\load1}, [\src, :64], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \load2 vld1.16 {\load2}, [\src, :64], r1 .endif .ifnb \addsrc vqadd.s16 \adddst, \adddst, \addsrc .endif .ifnb \max vmax.s16 \max, \max, q6 .endif .ifnb \store1 vst1.16 {\store1}, [\dst, :64], r1 .endif .ifnb \min vmin.s16 \min, \min, q7 .endif .ifnb \store2 vst1.16 {\store2}, [\dst, :64], r1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src load_add_store4 d2, d3, q9, , , , , , , \dst, \src load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src load_add_store4 , , , , , , q15, d28, d29, \dst, \src load_add_store4 , , , , , , , d30, d31, \dst, \src .endm .macro load_add_store_4x8 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits load_add_store4 , 
, , , , , q11, d20, d21, \dst, \src, \shiftbits load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits .endm .macro load_add_store_4x4 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits .endm .macro idct_dc w, h, shift cmp r3, #0 bne 1f vmov.i16 q14, #0 mov_const r12, 2896*8*(1<<16) vld1.32 {d24[], d25[]}, [r2, :32] vdup.32 d0, r12 vqrdmulh.s32 q13, q12, d0[0] vst1.32 {d28[0]}, [r2, :32] .if (\w == 2*\h) || (2*\w == \h) vqrdmulh.s32 q13, q13, d0[0] .endif .if \shift > 0 vqrshrn.s32 d24, q13, #\shift vqrshrn.s32 d25, q13, #\shift .else vqmovn.s32 d24, q13 vqmovn.s32 d25, q13 .endif vqrdmulh.s16 q12, q12, d0[1] mov r3, #\h vrshr.s16 q12, q12, #4 b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {d0}, [r0, :64], r1 vld1.16 {d1}, [r0, :64], r1 vld1.16 {d2}, [r0, :64], r1 vld1.16 {d3}, [r0, :64], r1 subs r3, r3, #4 vqadd.s16 q0, q0, q12 sub r0, r0, r1, lsl #2 vqadd.s16 q1, q1, q12 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmin.s16 q0, q0, q15 vst1.16 {d0}, [r0, :64], r1 vmin.s16 q1, q1, q15 vst1.16 {d1}, [r0, :64], r1 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d3}, [r0, :64], r1 bgt 1b bx lr endfunc function idct_dc_w8_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0}, [r0, :128], r1 subs r3, r3, #4 vld1.16 {q1}, [r0, :128], r1 vqadd.s16 q0, q0, q12 vld1.16 {q2}, [r0, :128], r1 vqadd.s16 q1, q1, q12 vld1.16 {q3}, [r0, :128], r1 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, r1, lsl #2 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 
vmin.s16 q1, q1, q15 vst1.16 {q0}, [r0, :128], r1 vmin.s16 q2, q2, q15 vst1.16 {q1}, [r0, :128], r1 vmin.s16 q3, q3, q15 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w16_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128], r1 subs r3, r3, #2 vld1.16 {q2, q3}, [r0, :128], r1 vqadd.s16 q0, q0, q12 vqadd.s16 q1, q1, q12 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, r1, lsl #1 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vst1.16 {q0, q1}, [r0, :128], r1 vmin.s16 q3, q3, q15 vst1.16 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w32_neon sub r1, r1, #32 vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.16 {q2, q3}, [r0, :128] vqadd.s16 q0, q0, q12 vqadd.s16 q1, q1, q12 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, #32 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vst1.16 {q0, q1}, [r0, :128]! vmin.s16 q3, q3, q15 vst1.16 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w64_neon sub r1, r1, #96 vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.16 {q2, q3}, [r0, :128]! vqadd.s16 q0, q0, q12 vld1.16 {q8, q9}, [r0, :128]! vqadd.s16 q1, q1, q12 vld1.16 {q10, q11}, [r0, :128] vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 vqadd.s16 q8, q8, q12 vqadd.s16 q9, q9, q12 vqadd.s16 q10, q10, q12 vqadd.s16 q11, q11, q12 sub r0, r0, #96 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmax.s16 q8, q8, q14 vmax.s16 q9, q9, q14 vmax.s16 q10, q10, q14 vmax.s16 q11, q11, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vmin.s16 q8, q8, q15 vst1.16 {q0, q1}, [r0, :128]! vmin.s16 q9, q9, q15 vst1.16 {q2, q3}, [r0, :128]! 
// Remainder of the idct_dc_w64_neon loop body: finish clamping and store the
// second half of the row.
        vmin.s16        q10, q10, q15
        vst1.16         {q8, q9}, [r0, :128]!
        vmin.s16        q11, q11, q15
        vst1.16         {q10, q11}, [r0, :128], r1
        bgt             1b
        bx              lr
endfunc

// In-place 4-point inverse WHT butterfly on the 32-bit vectors q8-q11.
// q12 and q13 are used as scratch.
.macro iwht4
        vadd.i32        q8,  q8,  q9
        vsub.i32        q13, q10, q11
        vsub.i32        q12, q8,  q13
        vshr.s32        q12, q12, #1
        vsub.i32        q10, q12, q9
        vsub.i32        q9,  q12, q11
        vadd.i32        q11, q13, q10
        vsub.i32        q8,  q8,  q9
.endm

// 4-point inverse DCT on four q registers of 32-bit lanes, in place.
// Coefficients are read from d0/d1; q2-q5 are scratch. Products are
// rounded down by 12 bits before the final saturating butterflies.
.macro idct_4s_x4 r0, r1, r2, r3
        vmul_vmla       q4,  \r1, \r3, d1[1], d1[0]
        vmul_vmla       q2,  \r0, \r2, d0[0], d0[0]
        vmul_vmls       q3,  \r1, \r3, d1[0], d1[1]
        vmul_vmls       q5,  \r0, \r2, d0[0], d0[0]
.irp r, q4, q2, q3, q5
        vrshr.s32       \r,  \r,  #12
.endr
        vqadd.s32       \r0, q2,  q4
        vqsub.s32       \r3, q2,  q4
        vqadd.s32       \r1, q5,  q3
        vqsub.s32       \r2, q5,  q3
.endm

// Same transform as idct_4s_x4 on d registers (two 32-bit lanes each);
// d4-d7 are scratch.
.macro idct_2s_x4 r0, r1, r2, r3
        vmul_vmla       d6,  \r1, \r3, d1[1], d1[0]
        vmul_vmla       d4,  \r0, \r2, d0[0], d0[0]
        vmul_vmls       d5,  \r1, \r3, d1[0], d1[1]
        vmul_vmls       d7,  \r0, \r2, d0[0], d0[0]
.irp r, d6, d4, d5, d7
        vrshr.s32       \r,  \r,  #12
.endr
        vqadd.s32       \r0, d4,  d6
        vqsub.s32       \r3, d4,  d6
        vqadd.s32       \r1, d7,  d5
        vqsub.s32       \r2, d7,  d5
.endm

// 4-point inverse DCT of q8-q11 using the first four idct_coeffs entries.
// Clobbers r12, d0-d1 and q2-q5.
function inv_dct_4s_x4_neon
        movrel_local    r12, idct_coeffs
        vld1.32         {d0, d1}, [r12, :128]
        idct_4s_x4      q8,  q9,  q10, q11
        bx              lr
endfunc

// 4-point inverse ADST of q8-q11, written to o0-o3. Passing the outputs in
// reverse order yields the flipped variant. Loads iadst4_coeffs into d0-d1;
// q1-q4 are scratch. Every output is rounded down by 12 bits.
.macro iadst_4x4 o0, o1, o2, o3
        movrel_local    r12, iadst4_coeffs
        vld1.32         {d0, d1}, [r12, :128]

        vsub.i32        q1,  q8,  q10
        vmul.i32        q2,  q8,  d0[0]
        vmla.i32        q2,  q10, d0[1]
        vmla.i32        q2,  q11, d1[0]
        vmul.i32        q4,  q9,  d1[1]
        vadd.i32        q1,  q1,  q11
        vmul.i32        q3,  q8,  d1[0]
        vmls.i32        q3,  q10, d0[0]
        vmls.i32        q3,  q11, d0[1]

        vadd.i32        \o3, q2,  q3
        vmul.i32        \o2, q1,  d1[1]
        vadd.i32        \o0, q2,  q4
        vadd.i32        \o1, q3,  q4
        vsub.i32        \o3, \o3, q4

.irp r, \o0, \o2, \o1, \o3
        vrshr.s32       \r,  \r,  #12
.endr
.endm

function inv_adst_4s_x4_neon
        iadst_4x4       q8,  q9,  q10, q11
        bx              lr
endfunc

function inv_flipadst_4s_x4_neon
        iadst_4x4       q11, q10, q9,  q8
        bx              lr
endfunc

// 4-point identity transform of q8-q11: x + x*(5793-4096)/4096, i.e. a
// scale by 5793/4096. The constant sits in the upper half of r12 so that the
// 32-bit vqrdmulh performs the divide by 4096. Clobbers r12, d0 and q1-q4.
function inv_identity_4s_x4_neon
        mov             r12, #0
        movt            r12, #(5793-4096)*8
        vdup.32         d0,  r12
        vqrdmulh.s32    q1,  q8,  d0[0]
        vqrdmulh.s32    q2,  q9,  d0[0]
        vqrdmulh.s32    q3,  q10, d0[0]
        vqrdmulh.s32    q4,  q11, d0[0]
        vqadd.s32       q8,  q8,  q1
        vqadd.s32       q9,  q9,  q2
        vqadd.s32       q10, q10, q3
        vqadd.s32       q11, q11, q4
        bx              lr
endfunc

// Exported 4x4 WHT/WHT entry point. Loads the 16 coefficients from [r2]
// (zeroing them in memory), shifts them right by 2, runs iwht4 on both sides
// of a transpose, narrows to 16 bit and joins the shared add/clamp/store
// tail at L(itx_4x4_end) with the four destination rows already in d0-d3.
function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
        push            {r4-r5,lr}
        vpush           {q4-q5}
.irp r, q14, q15
        vmov.i16        \r,  #0
.endr
        vld1.32         {q8, q9}, [r2, :128]
        vst1.32         {q14, q15}, [r2, :128]!
        vshr.s32        q8,  q8,  #2
        vld1.32         {q10, q11}, [r2, :128]
.irp r, q9, q10, q11
        vshr.s32        \r,  \r,  #2
.endr

        iwht4

        vst1.32         {q14, q15}, [r2, :128]
        transpose_4x4s  q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23

        iwht4

        vld1.16         {d0}, [r0, :64], r1
        vqmovn.s32      d16, q8
        vld1.16         {d1}, [r0, :64], r1
        vqmovn.s32      d17, q9
        vld1.16         {d2}, [r0, :64], r1
        vqmovn.s32      d18, q10
        vld1.16         {d3}, [r0, :64], r1
        vqmovn.s32      d19, q11

        b               L(itx_4x4_end)
endfunc

// Shared 4x4 body. In: r0 = dst, r1 = stride, r2 = coefficients,
// r4 = first-pass transform (32-bit lanes in q8-q11), r5 = second-pass
// transform (called on the narrowed, transposed 16-bit data in q8-q9).
// The callers have already pushed {r4-r5,lr} and {q4-q5}; this routine pops
// them on exit.
function inv_txfm_add_4x4_neon
.irp r, q14, q15
        vmov.i16        \r,  #0
.endr
        vld1.32         {q8, q9}, [r2, :128]
        vst1.16         {q14, q15}, [r2, :128]!
        vld1.32         {q10, q11}, [r2, :128]
        vst1.16         {q14, q15}, [r2, :128]

        blx             r4

        vqmovn.s32      d16, q8
        vqmovn.s32      d17, q9
        vqmovn.s32      d18, q10
        vqmovn.s32      d19, q11
        transpose_4x4h  q8, q9, d16, d17, d18, d19

        blx             r5

        vld1.16         {d0}, [r0, :64], r1
        vld1.16         {d1}, [r0, :64], r1
        vrshr.s16       q8,  q8,  #4
        vld1.16         {d2}, [r0, :64], r1
        vrshr.s16       q9,  q9,  #4
        vld1.16         {d3}, [r0, :64], r1

L(itx_4x4_end):
        // read bitdepth_max from the callers stack; the offset 44 skips the
        // 12 bytes of {r4-r5,lr} and the 32 bytes of {q4-q5} pushed on entry
        ldr             r4,  [sp, #44]
        vdup.i16        q15, r4
        sub             r0,  r0,  r1, lsl #2
        vqadd.s16       q8,  q8,  q0
        vqadd.s16       q9,  q9,  q1
.irp r, q8, q9
        vmax.s16        \r,  \r,  q14
.endr
.irp r, q8, q9
        vmin.s16        \r,  \r,  q15
.endr
.irp r, d16, d17, d18, d19
        vst1.16         {\r}, [r0, :64], r1
.endr

        vpop            {q4-q5}
        pop             {r4-r5,pc}
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
        push            {r4-r5,lr}
        vpush           {q4-q5}

.ifc \txfm1\()_\txfm2, dct_dct
        cmp             r3,  #0
        bne             1f
        vmov.i16        q14, #0
        mov_const       r12, 2896*8*(1<<16)
        vld1.32         {d16[], d17[]}, [r2, :32]
        vdup.32         d4,  r12
        vst1.32         {d28[0]}, [r2, :32]
        vqrdmulh.s32    q8,  q8,  d4[0]
        vld1.16         {d0}, [r0, :64], r1
vqmovn.s32 d20, q8 vqmovn.s32 d21, q8 vld1.16 {d1}, [r0, :64], r1 vqrdmulh.s16 q10, q10, d4[1] vld1.16 {d2}, [r0, :64], r1 vrshr.s16 q8, q10, #4 vld1.16 {d3}, [r0, :64], r1 vrshr.s16 q9, q10, #4 b L(itx_4x4_end) 1: .endif movrel_local r4, inv_\txfm1\()_4s_x4_neon movrel r5, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4s_x4 \r0, \r2, \r4, \r6 vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 vmin.s32 \r, \r, q5 .endr .irp r, \r0, \r2, \r4, \r6 vmax.s32 \r, \r, q4 .endr vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a vrshr.s32 \r1, q2, #12 // t4a vrshr.s32 \r7, q3, #12 // t7a vrshr.s32 \r3, q6, #12 // t5a vrshr.s32 \r5, q7, #12 // t6a vqadd.s32 q2, \r1, \r3 // t4 vqsub.s32 \r1, \r1, \r3 // t5a vqadd.s32 q3, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a .irp r, q2, \r1, q3, \r3 vmin.s32 \r, \r, q5 .endr .irp r, q2, \r1, q3, \r3 vmax.s32 \r, \r, q4 .endr vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 q7, q7, #12 // t5 vrshr.s32 q5, q6, #12 // t6 vqsub.s32 \r7, \r0, q3 // out7 vqadd.s32 \r0, \r0, q3 // out0 vqadd.s32 \r1, \r2, q5 // out1 vqsub.s32 q6, \r2, q5 // out6 vqadd.s32 \r2, \r4, q7 // out2 vqsub.s32 \r5, \r4, q7 // out5 vqadd.s32 \r3, \r6, q2 // out3 vqsub.s32 \r4, \r6, q2 // out4 vmov \r6, 
q6 // out6 .endm .macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_2s_x4 \r0, \r2, \r4, \r6 vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 vmin.s32 \r, \r, d9 .endr .irp r, \r0, \r2, \r4, \r6 vmax.s32 \r, \r, d8 .endr vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a vrshr.s32 \r1, d4, #12 // t4a vrshr.s32 \r7, d5, #12 // t7a vrshr.s32 \r3, d6, #12 // t5a vrshr.s32 \r5, d7, #12 // t6a vqadd.s32 d4, \r1, \r3 // t4 vqsub.s32 \r1, \r1, \r3 // t5a vqadd.s32 d5, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a .irp r, d4, \r1, d5, \r3 vmin.s32 \r, \r, d9 .endr .irp r, d4, \r1, d5, \r3 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 d6, d6, #12 // t5 vrshr.s32 d7, d7, #12 // t6 vqsub.s32 \r7, \r0, d5 // out7 vqadd.s32 \r0, \r0, d5 // out0 vqadd.s32 \r1, \r2, d7 // out1 vqsub.s32 d7, \r2, d7 // out6 vqadd.s32 \r2, \r4, d6 // out2 vqsub.s32 \r5, \r4, d6 // out5 vqadd.s32 \r3, \r6, d4 // out3 vqsub.s32 \r4, \r6, d4 // out4 vmov \r6, d7 // out6 .endm function inv_dct_4s_x8_neon movrel_local r12, idct_coeffs vld1.32 {q0, q1}, [r12, :128] idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 bx lr endfunc .macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 movrel_local r12, iadst8_coeffs vld1.32 {q0, q1}, [r12, :128]! 
vmul_vmla q2, q15, q8, d0[0], d0[1] vmul_vmls q3, q15, q8, d0[1], d0[0] vmul_vmla q4, q13, q10, d1[0], d1[1] vrshr.s32 q8, q2, #12 // t0a vrshr.s32 q15, q3, #12 // t1a vmul_vmls q5, q13, q10, d1[1], d1[0] vmul_vmla q6, q11, q12, d2[0], d2[1] vrshr.s32 q10, q4, #12 // t2a vrshr.s32 q13, q5, #12 // t3a vmul_vmls q7, q11, q12, d2[1], d2[0] vmul_vmla q2, q9, q14, d3[0], d3[1] vrshr.s32 q12, q6, #12 // t4a vrshr.s32 q11, q7, #12 // t5a vmul_vmls q3, q9, q14, d3[1], d3[0] vrshr.s32 q14, q2, #12 // t6a vrshr.s32 q9, q3, #12 // t7a vld1.32 {q0}, [r12] vqadd.s32 q2, q8, q12 // t0 vqsub.s32 q3, q8, q12 // t4 vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vqadd.s32 q4, q15, q11 // t1 vqsub.s32 q5, q15, q11 // t5 vqadd.s32 q6, q10, q14 // t2 vqsub.s32 q7, q10, q14 // t6 vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 q10, q13, q9 // t3 vqsub.s32 q11, q13, q9 // t7 .irp r, q2, q3, q4, q5, q6, q7, q10, q11 vmin.s32 \r, \r, q12 .endr .irp r, q2, q3, q4, q5, q6, q7, q10, q11 vmax.s32 \r, \r, q14 .endr vmul_vmla q8, q3, q5, d1[1], d1[0] vmul_vmls q13, q3, q5, d1[0], d1[1] vmul_vmls q14, q11, q7, d1[1], d1[0] vrshr.s32 q3, q8, #12 // t4a vrshr.s32 q5, q13, #12 // t5a vmul_vmla q8, q11, q7, d1[0], d1[1] vrshr.s32 q7, q14, #12 // t6a vrshr.s32 q11, q8, #12 // t7a vqadd.s32 \r0, q2, q6 // out0 vqsub.s32 q2, q2, q6 // t2 vqadd.s32 \r7, q4, q10 // out7 vqsub.s32 q4, q4, q10 // t3 vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 \r1, q3, q7 // out1 vqsub.s32 q3, q3, q7 // t6 vqadd.s32 \r6, q5, q11 // out6 vqsub.s32 q5, q5, q11 // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. 
.irp r, q2, q4, q3, q5
        vmin.s32        \r,  \r,  q12
.endr
.irp r, q2, q4, q3, q5
        vmax.s32        \r,  \r,  q10
.endr

        vqneg.s32       \r7, \r7     // out7
        vqneg.s32       \r1, \r1     // out1

        vmul_vmla       q10, q2,  q4,  d0[0], d0[0] // -> out3 (q11 or q12)
        vmul_vmls       q6,  q2,  q4,  d0[0], d0[0] // -> out4 (q12 or q11)
        vmul_vmls       q12, q3,  q5,  d0[0], d0[0] // -> out5 (q13 or q10)
        vrshr.s32       q2,  q10, #12    // out3
        vmul_vmla       q10, q3,  q5,  d0[0], d0[0] // -> out2 (q10 or q13)
        vrshr.s32       q3,  q12, #12    // out5
        vrshr.s32       \r2, q10, #12    // out2 (q10 or q13)
        vrshr.s32       \r4, q6,  #12    // out4 (q12 or q11)

        vqneg.s32       \r3, q2      // out3
        vqneg.s32       \r5, q3      // out5
.endm

// 8-point inverse ADST of q8-q15, outputs in natural order.
function inv_adst_4s_x8_neon
        iadst_4s_x8     q8,  q9,  q10, q11, q12, q13, q14, q15
        bx              lr
endfunc

// Same computation with the output registers listed in reverse, which
// produces the flipped variant.
function inv_flipadst_4s_x8_neon
        iadst_4s_x8     q15, q14, q13, q12, q11, q10, q9,  q8
        bx              lr
endfunc

// 8-point identity transform: saturating doubling of q8-q15.
function inv_identity_4s_x8_neon
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
        vqshl.s32       \r,  \r,  #1
.endr
        bx              lr
endfunc

// Shared 8x8 body. In: r0 = dst, r2 = coefficients, r3 = value compared
// against the threshold in r10, r4 = first-pass transform (32-bit lanes),
// r5 = second-pass transform (16-bit lanes). The coefficients are consumed
// as two halves of four 32-bit columns each; every load is followed by a
// store of zeros to the same address. When r3 < r10 the second half is not
// transformed and q12-q15 are zeroed instead. The callers have pushed
// {r4-r5,r7,r10,lr} and {q4-q7}; both are popped here.
function inv_txfm_add_8x8_neon
        vmov.i32        q0,  #0
        mov             r7,  #8*4
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.32         {\i}, [r2, :128]
        vst1.32         {q0}, [r2, :128], r7
.endr

        blx             r4

        vqrshrn.s32     d16, q8,  #1
        vqrshrn.s32     d17, q12, #1
        vqrshrn.s32     d18, q9,  #1
        vqrshrn.s32     d19, q13, #1
        vqrshrn.s32     d20, q10, #1
        vqrshrn.s32     d21, q14, #1
        vqrshrn.s32     d22, q11, #1
        vqrshrn.s32     d23, q15, #1

        cmp             r3,  r10
        transpose_4x8h  q8,  q9,  q10, q11

        blt             1f

        sub             r2,  r2,  r7, lsl #3    // undo the eight post-increments
        vpush           {q8-q11}                // keep the first half across the call

        add             r2,  r2,  #16           // step to the second 16-byte half
        vmov.i32        q0,  #0
.irp i, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.32         {\i}, [r2, :128]
        vst1.32         {q0}, [r2, :128], r7
.endr

        blx             r4

        // Narrow from the highest register downwards so that no source is
        // overwritten before it has been read.
        vqrshrn.s32     d31, q15, #1
        vqrshrn.s32     d30, q11, #1
        vqrshrn.s32     d29, q14, #1
        vqrshrn.s32     d28, q10, #1
        vqrshrn.s32     d27, q13, #1
        vqrshrn.s32     d26, q9,  #1
        vqrshrn.s32     d25, q12, #1
        vqrshrn.s32     d24, q8,  #1
        vpop            {q8-q11}

        transpose_4x8h  q12, q13, q14, q15

        b               2f

1:
.irp r, q12, q13, q14, q15
        vmov.i16        \r,  #0
.endr

2:
        blx             r5

        load_add_store_8x8 r0, r7
        vpop            {q4-q7}
        pop             {r4-r5,r7,r10,pc}
endfunc

// Emits the exported 8x8 entry point for one transform pair.
//   txfm1, txfm2 - names of the first and second pass transforms
//   eob_half     - threshold placed in r10 for the shared body
// The dct_dct variant starts with the idct_dc shortcut.
.macro def_fn_8x8 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         8,   8,   1
.endif
        push            {r4-r5,r7,r10,lr}
        vpush           {q4-q7}
        mov             r10, #\eob_half
        movrel_local    r4,  inv_\txfm1\()_4s_x8_neon
        movrel          r5,  X(inv_\txfm2\()_8h_x8_neon)
        b               inv_txfm_add_8x8_neon
endfunc
.endm

def_fn_8x8 dct, dct, 10
def_fn_8x8 identity, identity, 10
def_fn_8x8 dct, adst, 10
def_fn_8x8 dct, flipadst, 10
def_fn_8x8 dct, identity, 4
def_fn_8x8 adst, dct, 10
def_fn_8x8 adst, adst, 10
def_fn_8x8 adst, flipadst, 10
def_fn_8x8 flipadst, dct, 10
def_fn_8x8 flipadst, adst, 10
def_fn_8x8 flipadst, flipadst, 10
def_fn_8x8 identity, dct, 4
def_fn_8x8 adst, identity, 4
def_fn_8x8 flipadst, identity, 4
def_fn_8x8 identity, adst, 4
def_fn_8x8 identity, flipadst, 4

function inv_txfm_add_8x4_neon
        mov_const       r12, 2896*8*(1<<16)
        vmov.i32        q0,  #0
        vmov.i32        q1,  #0
        vld1.16         {q8, q9}, [r2, :128]
        vst1.16         {q0, q1}, [r2, :128]!
        vdup.32         d4,  r12
        vld1.16         {q10, q11}, [r2, :128]
        vst1.16         {q0, q1}, [r2, :128]!
        vld1.16         {q12, q13}, [r2, :128]
        vst1.16         {q0, q1}, [r2, :128]!
        vld1.16         {q14, q15}, [r2, :128]
        vst1.16         {q0, q1}, [r2, :128]!
scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15 blx r4 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 vqmovn.s32 d18, q10 vqmovn.s32 d19, q11 vqmovn.s32 d20, q12 vqmovn.s32 d21, q13 vqmovn.s32 d22, q14 vqmovn.s32 d23, q15 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 vswp d17, d20 vswp d19, d21 vswp d18, d20 vswp d21, d22 blx r5 load_add_store_8x4 r0, r7 vpop {q4-q7} pop {r4-r5,r7,r10,pc} endfunc function inv_txfm_add_4x8_neon mov_const r12, 2896*8*(1<<16) vmov.i32 q0, #0 cmp r3, r10 mov r7, #32 blt 1f add r2, r2, #16 vdup.32 d2, r12 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr scale_input d2[0], q8, q9, q10, q11 sub r2, r2, r7, lsl #2 blx r4 sub r2, r2, #16 vqmovn.s32 d24, q8 vqmovn.s32 d25, q9 vqmovn.s32 d26, q10 vqmovn.s32 d27, q11 transpose_4x4h q12, q13, d24, d25, d26, d27 b 2f 1: vmov.i16 q12, #0 vmov.i16 q13, #0 2: mov_const r12, 2896*8*(1<<16) vmov.i32 q0, #0 vdup.32 d2, r12 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr scale_input d2[0], q8, q9, q10, q11 blx r4 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 vqmovn.s32 d18, q10 vqmovn.s32 d19, q11 transpose_4x4h q8, q9, d16, d17, d18, d19 vmov q10, q12 vmov q11, q13 blx r5 load_add_store_4x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,r10,pc} endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif push {r4-r5,r7,r10,lr} vpush {q4-q7} movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon .if \w == 4 mov r10, #\eob_half .endif movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct, 13 def_fn_48 \w, \h, identity, identity, 13 def_fn_48 \w, \h, dct, adst, 13 def_fn_48 \w, \h, dct, flipadst, 13 def_fn_48 \w, \h, dct, identity, 4 def_fn_48 \w, \h, adst, dct, 13 def_fn_48 \w, \h, adst, adst, 13 def_fn_48 \w, \h, 
adst, flipadst, 13 def_fn_48 \w, \h, flipadst, dct, 13 def_fn_48 \w, \h, flipadst, adst, 13 def_fn_48 \w, \h, flipadst, flipadst, 13 def_fn_48 \w, \h, identity, dct, 16 def_fn_48 \w, \h, adst, identity, 4 def_fn_48 \w, \h, flipadst, identity, 4 def_fn_48 \w, \h, identity, adst, 16 def_fn_48 \w, \h, identity, flipadst, 16 .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_dct_2s_x16_neon movrel_local r12, idct_coeffs vld1.32 {q0, q1}, [r12, :128]! idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 // idct_8 leaves the row_clip_max/min constants in d9 and d8 .irp r, d16, d18, d20, d22, d24, d26, d28, d30 vmin.s32 \r, \r, d9 .endr .irp r, d16, d18, d20, d22, d24, d26, d28, d30 vmax.s32 \r, \r, d8 .endr vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #32 vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a vrshr.s32 d17, d4, #12 // t8a vrshr.s32 d31, d5, #12 // t15a vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a vrshr.s32 d23, d6, #12 // t9a vrshr.s32 d25, d4, #12 // t14a vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a vrshr.s32 d21, d5, #12 // t10a vrshr.s32 d27, d6, #12 // t13a vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a vrshr.s32 d19, d4, #12 // t11a vrshr.s32 d29, d5, #12 // t12a vld1.32 {q0}, [r12, :128] vqsub.s32 d4, d17, d23 // t9 vqadd.s32 d17, d17, d23 // t8 vqsub.s32 d5, d31, d25 // t14 vqadd.s32 d31, d31, d25 // t15 vqsub.s32 d23, d19, d21 // t10 vqadd.s32 d19, d19, d21 // t11 vqadd.s32 d25, d29, d27 // t12 vqsub.s32 d29, d29, d27 // t13 .irp r, d4, d17, d5, d31, d23, d19, d25, d29 vmin.s32 \r, \r, d9 .endr .irp r, d4, d17, d5, d31, d23, d19, d25, d29 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a vrshr.s32 d21, d6, #12 // t9a vrshr.s32 d27, d7, #12 // t14a vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a 
vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a vrshr.s32 d29, d6, #12 // t13a vneg.s32 d7, d7 vrshr.s32 d23, d7, #12 // t10a vqsub.s32 d4, d17, d19 // t11a vqadd.s32 d17, d17, d19 // t8a vqsub.s32 d5, d31, d25 // t12a vqadd.s32 d31, d31, d25 // t15a vqadd.s32 d19, d21, d23 // t9 vqsub.s32 d21, d21, d23 // t10 vqsub.s32 d25, d27, d29 // t13 vqadd.s32 d27, d27, d29 // t14 .irp r, d4, d17, d5, d31, d19, d21, d25, d27 vmin.s32 \r, \r, d9 .endr .irp r, d4, d17, d5, d31, d19, d21, d25, d27 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a vrshr.s32 d6, d6, #12 // t11 vrshr.s32 d7, d7, #12 // t12 vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a vrshr.s32 d4, d4, #12 // t10a vrshr.s32 d5, d5, #12 // t13a vqadd.s32 d8, d16, d31 // out0 vqsub.s32 d31, d16, d31 // out15 vmov d16, d8 vqadd.s32 d23, d30, d17 // out7 vqsub.s32 d9, d30, d17 // out8 vqadd.s32 d17, d18, d27 // out1 vqsub.s32 d30, d18, d27 // out14 vqadd.s32 d18, d20, d5 // out2 vqsub.s32 d29, d20, d5 // out13 vqadd.s32 d5, d28, d19 // out6 vqsub.s32 d25, d28, d19 // out9 vqadd.s32 d19, d22, d7 // out3 vqsub.s32 d28, d22, d7 // out12 vqadd.s32 d20, d24, d6 // out4 vqsub.s32 d27, d24, d6 // out11 vqadd.s32 d21, d26, d4 // out5 vqsub.s32 d26, d26, d4 // out10 vmov d24, d9 vmov d22, d5 bx lr endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel_local r12, iadst16_coeffs vld1.32 {q0, q1}, [r12, :128]! 
vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0 vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1 vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2 vrshr.s32 d16, d4, #12 // t0 vrshr.s32 d31, d6, #12 // t1 vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3 vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4 vrshr.s32 d18, d8, #12 // t2 vrshr.s32 d29, d4, #12 // t3 vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5 vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6 vrshr.s32 d20, d6, #12 // t4 vrshr.s32 d27, d8, #12 // t5 vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7 vld1.32 {q0, q1}, [r12, :128] movrel_local r12, idct_coeffs vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8 vrshr.s32 d22, d4, #12 // t6 vrshr.s32 d25, d6, #12 // t7 vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9 vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10 vrshr.s32 d23, d8, #12 // t8 vrshr.s32 d24, d4, #12 // t9 vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11 vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12 vrshr.s32 d21, d6, #12 // t10 vrshr.s32 d26, d8, #12 // t11 vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13 vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14 vrshr.s32 d19, d4, #12 // t12 vrshr.s32 d28, d6, #12 // t13 vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15 vrshr.s32 d17, d8, #12 // t14 vrshr.s32 d30, d4, #12 // t15 vld1.32 {q0, q1}, [r12, :128] vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqsub.s32 d5, d16, d23 // t8a vqadd.s32 d16, d16, d23 // t0a vqsub.s32 d7, d31, d24 // t9a vqadd.s32 d31, d31, d24 // t1a vqadd.s32 d23, d18, d21 // t2a vqsub.s32 d18, d18, d21 // t10a vqadd.s32 d24, d29, d26 // t3a vqsub.s32 d29, d29, d26 // t11a vqadd.s32 d21, d20, d19 // t4a vqsub.s32 d20, d20, d19 // t12a vqadd.s32 d26, d27, d28 // t5a vqsub.s32 d27, d27, d28 // t13a vqadd.s32 d19, d22, d17 // t6a vqsub.s32 d22, d22, d17 // t14a vqadd.s32 d28, d25, d30 // t7a vqsub.s32 d25, d25, d30 // t15a .irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, 
d19, d22, d28, d25 vmin.s32 \r, \r, d11 .endr .irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 vmax.s32 \r, \r, d10 .endr vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 vrshr.s32 d17, d4, #12 // t8 vrshr.s32 d30, d6, #12 // t9 vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11 vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12 vrshr.s32 d18, d8, #12 // t10 vrshr.s32 d29, d4, #12 // t11 vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13 vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14 vrshr.s32 d27, d6, #12 // t12 vrshr.s32 d20, d8, #12 // t13 vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15 vrshr.s32 d25, d4, #12 // t14 vrshr.s32 d22, d6, #12 // t15 vqsub.s32 d2, d16, d21 // t4 vqadd.s32 d16, d16, d21 // t0 vqsub.s32 d3, d31, d26 // t5 vqadd.s32 d31, d31, d26 // t1 vqadd.s32 d21, d23, d19 // t2 vqsub.s32 d23, d23, d19 // t6 vqadd.s32 d26, d24, d28 // t3 vqsub.s32 d24, d24, d28 // t7 vqadd.s32 d19, d17, d27 // t8a vqsub.s32 d17, d17, d27 // t12a vqadd.s32 d28, d30, d20 // t9a vqsub.s32 d30, d30, d20 // t13a vqadd.s32 d27, d18, d25 // t10a vqsub.s32 d18, d18, d25 // t14a vqadd.s32 d20, d29, d22 // t11a vqsub.s32 d29, d29, d22 // t15a .irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 vmin.s32 \r, \r, d11 .endr .irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 vmax.s32 \r, \r, d10 .endr vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a vrshr.s32 d22, d4, #12 // t4a vrshr.s32 d25, d6, #12 // t5a vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12 vrshr.s32 d24, d8, #12 // t6a vrshr.s32 d23, d4, #12 // t7a vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13 vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14 vrshr.s32 d17, d6, #12 // t12 vmul_vmla d6, d29, d18, d1[0], d1[1] // -> 
t15 vrshr.s32 d29, d8, #12 // t13 vrshr.s32 d30, d4, #12 // t14 vrshr.s32 d18, d6, #12 // t15 vqsub.s32 d2, d16, d21 // t2a .ifc \o0, d16 vqadd.s32 \o0, d16, d21 // out0 vqsub.s32 d21, d31, d26 // t3a vqadd.s32 \o15,d31, d26 // out15 .else vqadd.s32 d4, d16, d21 // out0 vqsub.s32 d21, d31, d26 // t3a vqadd.s32 \o15,d31, d26 // out15 vmov \o0, d4 .endif vqsub.s32 d3, d29, d18 // t15a vqadd.s32 \o13,d29, d18 // out13 vqadd.s32 \o2, d17, d30 // out2 vqsub.s32 d26, d17, d30 // t14a vqadd.s32 \o1, d19, d27 // out1 vqsub.s32 d27, d19, d27 // t10 vqadd.s32 \o14,d28, d20 // out14 vqsub.s32 d20, d28, d20 // t11 vqadd.s32 \o3, d22, d24 // out3 vqsub.s32 d22, d22, d24 // t6 vqadd.s32 \o12,d25, d23 // out12 vqsub.s32 d23, d25, d23 // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. .irp r, d2, d21, d3, d26, d27, d20, d22, d23 vmin.s32 \r, \r, d11 .endr .irp r, d2, d21, d3, d26, d27, d20, d22, d23 vmax.s32 \r, \r, d10 .endr vqneg.s32 \o15, \o15 // out15 vqneg.s32 \o13,\o13 // out13 vqneg.s32 \o1, \o1 // out1 vqneg.s32 \o3, \o3 // out3 vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) vrshr.s32 d24, d24, #12 // out8 vrshr.s32 d4, d4, #12 // out7 vrshr.s32 d5, d6, #12 // out5 vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) vrshr.s32 d26, d8, #12 // out10 vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) vrshr.s32 \o4, d2, #12 // out4 vrshr.s32 d7, d6, #12 // out9 vrshr.s32 d6, d8, #12 // out11 vrshr.s32 \o6, d22, #12 // out6 .ifc \o8, d23 vmov \o8, d24 vmov \o10,d26 .endif vqneg.s32 \o7, d4 // out7 vqneg.s32 \o5, d5 // out5 vqneg.s32 \o11,d6 // out11 vqneg.s32 \o9, d7 // out9 .endm function 
inv_adst_2s_x16_neon iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_flipadst_2s_x16_neon iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 bx lr endfunc function inv_identity_2s_x16_neon mov r12, #0 movt r12, #2*(5793-4096)*8 vdup.32 d0, r12 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q1, \i, d0[0] vqadd.s32 \i, \i, \i vqadd.s32 \i, \i, q1 .endr bx lr endfunc .macro identity_8x4_shift1 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q2, \i, \c vrshr.s32 q2, q2, #1 vqadd.s32 \i, \i, q2 .endr .endm .macro identity_8x4 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q2, \i, \c vqadd.s32 \i, \i, \i vqadd.s32 \i, \i, q2 .endr .endm .macro def_horz_16 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x2_neon push {lr} vmov.i32 d7, #0 .if \scale mov_const r12, 2896*8*(1<<16) vdup.32 d1, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr .if \scale scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif blx r4 vqrshrn.s32 d16, q8, #\shift vqrshrn.s32 d17, q9, #\shift vqrshrn.s32 d18, q10, #\shift vqrshrn.s32 d19, q11, #\shift vqrshrn.s32 d20, q12, #\shift vqrshrn.s32 d21, q13, #\shift vqrshrn.s32 d22, q14, #\shift vqrshrn.s32 d23, q15, #\shift .if \scale b L(horz_16x2_epilog) .else L(horz_16x2_epilog): vuzp.16 q8, q9 vuzp.16 q10, q11 .irp i, q8, q10, q9, q11 vst1.16 {\i}, [r6, :128]! 
.endr pop {pc} .endif endfunc .endm def_horz_16 scale=1, shift=1, suffix=_scale def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_4x16_neon push {lr} .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr blx r5 load_add_store_4x16 r6, r7 pop {pc} endfunc function inv_txfm_add_16x16_neon sub_sp_align 512 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, sp, #(\i*16*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 14 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #16*4 bl inv_txfm_horz_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc const eob_16x16 .short 3, 10, 21, 36, 55, 78, 105, 256 endconst const eob_16x16_identity .short 2, 4, 6, 8, 10, 12, 14, 256 endconst .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif push {r4-r11,lr} vpush {q4-q7} movrel_local r4, inv_\txfm1\()_2s_x16_neon movrel r5, X(inv_\txfm2\()_4h_x16_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel_local r10, eob_16x16 .else movrel_local r10, eob_16x16_identity .endif .else .ifc \txfm2, identity movrel_local r10, eob_16x16_identity .else movrel_local r10, eob_16x16 .endif .endif b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct def_fn_16x16 identity, identity def_fn_16x16 dct, adst def_fn_16x16 dct, flipadst def_fn_16x16 dct, identity def_fn_16x16 adst, dct def_fn_16x16 adst, adst def_fn_16x16 adst, flipadst def_fn_16x16 flipadst, dct def_fn_16x16 flipadst, adst def_fn_16x16 flipadst, flipadst def_fn_16x16 identity, dct function inv_txfm_add_16x4_neon cmp r3, r10 mov r11, #16 blt 1f add r6, r2, #8 vmov.i32 d4, #0 .irp i, 
d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r6, :64] vst1.32 {d4}, [r6, :64], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 vqrshrn.s32 d20, q12, #1 vqrshrn.s32 d21, q13, #1 vqrshrn.s32 d22, q14, #1 vqrshrn.s32 d23, q15, #1 vuzp.16 q8, q9 mov r6, sp vuzp.16 q10, q11 vpush {q8-q11} b 2f 1: vmov.i16 q8, #0 vmov.i16 q9, #0 mov r6, sp vpush {q8-q9} vpush {q8-q9} 2: vmov.i32 d4, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r2, :64] vst1.32 {d4}, [r2, :64], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 vqrshrn.s32 d20, q12, #1 vqrshrn.s32 d21, q13, #1 vqrshrn.s32 d22, q14, #1 vqrshrn.s32 d23, q15, #1 vuzp.16 q8, q9 mov r6, sp vuzp.16 q10, q11 vmov q12, q10 vmov q13, q11 vpop {q10-q11} blx r5 mov r6, r0 load_add_store_8x4 r6, r7 vpop {q10-q11} vmov q8, q12 vmov q9, q13 blx r5 add r6, r0, #16 load_add_store_8x4 r6, r7 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_4x16_neon ldrh r9, [r10, #4] mov r11, #64 cmp r3, r9 ldrh r9, [r10, #2] blt 1f add r6, r2, #48 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 .endr blx r4 vqrshrn.s32 d28, q8, #1 vqrshrn.s32 d29, q9, #1 vqrshrn.s32 d30, q10, #1 vqrshrn.s32 d31, q11, #1 transpose_4x4h q14, q15, d28, d29, d30, d31 b 2f 1: vmov.i16 q14, #0 vmov.i16 q15, #0 2: cmp r3, r9 ldrh r9, [r10] blt 1f add r6, r2, #32 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 .endr blx r4 vqrshrn.s32 d24, q8, #1 vqrshrn.s32 d25, q9, #1 vqrshrn.s32 d26, q10, #1 vqrshrn.s32 d27, q11, #1 transpose_4x4h q12, q13, d24, d25, d26, d27 b 2f 1: vmov.i16 q12, #0 vmov.i16 q13, #0 2: cmp r3, r9 blt 1f add r6, r2, #16 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 
.endr
        blx             r4
        vqrshrn.s32     d16, q8, #1
        vqrshrn.s32     d17, q9, #1
        vqrshrn.s32     d18, q10, #1
        vqrshrn.s32     d19, q11, #1
        transpose_4x4h  q8, q9, d16, d17, d18, d19
        b               2f
1:
        vmov.i16        q8, #0
        vmov.i16        q9, #0
2:
        vmov.i16        q2, #0
        // Park the previous result; the final pass below reuses q8-q11.
        vpush           {q8-q9}
.irp i, q8, q9, q10, q11
        vld1.16         {\i}, [r2, :128]
        vst1.16         {q2}, [r2, :128], r11
.endr
        blx             r4
        vqrshrn.s32     d16, q8, #1
        vqrshrn.s32     d17, q9, #1
        vqrshrn.s32     d18, q10, #1
        vqrshrn.s32     d19, q11, #1
        transpose_4x4h  q8, q9, d16, d17, d18, d19
        // The parked result comes back in q10-q11.
        vpop            {q10-q11}

        blx             r5

        load_add_store_4x16 r0, r6

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// Threshold tables selected into r10 by def_fn_416 for the 4-wide case.
// inv_txfm_add_4x16_neon reads the first three 16-bit entries of the chosen
// table and compares each against r3.
const eob_4x16
        .short 13, 29, 45, 64
endconst

const eob_4x16_identity1
        .short 16, 32, 48, 64
endconst

const eob_4x16_identity2
        .short 4, 8, 12, 64
endconst

// Emits one exported 4x16 or 16x4 entry point.
//   w, h         - block width and height
//   txfm1, txfm2 - first and second pass transform names
//   eob_16x4     - immediate threshold placed in r10 when w is not 4
// For w == 4, r10 instead receives the address of one of the eob_4x16 tables:
// identity/identity and the pairs without identity use eob_4x16, an identity
// first pass alone uses eob_4x16_identity1, and an identity second pass alone
// uses eob_4x16_identity2.
// r4 and r5 receive the first and second pass transform routines.
.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w, \h, 1
.endif
        push            {r4-r11,lr}
        vpush           {q4-q7}
.if \w == 4
        movrel_local    r4, inv_\txfm1\()_4s_x\w\()_neon
        movrel          r5, X(inv_\txfm2\()_4h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel_local    r10, eob_4x16
.else
        movrel_local    r10, eob_4x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel_local    r10, eob_4x16_identity2
.else
        movrel_local    r10, eob_4x16
.endif
.endif
.else
        mov             r10, #\eob_16x4
        movrel_local    r4, inv_\txfm1\()_2s_x\w\()_neon
        movrel          r5, X(inv_\txfm2\()_8h_x\h\()_neon)
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// Instantiates all sixteen transform pairs for one block size.
.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 3
def_fn_416 \w, \h, identity, identity, 3
def_fn_416 \w, \h, dct, adst, 3
def_fn_416 \w, \h, dct, flipadst, 3
def_fn_416 \w, \h, dct, identity, 2
def_fn_416 \w, \h, adst, dct, 3
def_fn_416 \w, \h, adst, adst, 3
def_fn_416 \w, \h, adst, flipadst, 3
def_fn_416 \w, \h, flipadst, dct, 3
def_fn_416 \w, \h, flipadst, adst, 3
def_fn_416 \w, \h, flipadst, flipadst, 3
def_fn_416 \w, \h, identity, dct, 2
def_fn_416 \w, \h, adst, identity, 2
def_fn_416 \w, \h, flipadst, identity, 2
def_fn_416 \w, \h, identity, adst, 2
def_fn_416 \w, \h, identity, flipadst, 2
.endm

def_fns_416 4, 16
def_fns_416 16, 4

// Shared 16x8 body. In: r0 = dst, r2 = coefficients, r3 = value compared
// against the 16-bit thresholds read from the table at r10, r5 = second pass
// transform. r4 is not referenced here; presumably the horizontal helper
// calls through it (TODO confirm in inv_txfm_horz_scale_16x2_neon).
// First pass: up to four calls of inv_txfm_horz_scale_16x2_neon fill a
// 256-byte stack buffer, 64 bytes per call. Before each call after the first,
// r3 is compared against the next threshold; if it is lower, the rest of the
// buffer is zero-filled instead.
// Second pass: two 8x8 halves are loaded from the buffer, transformed through
// r5 and added to the destination.
function inv_txfm_add_16x8_neon
        sub_sp_align    256
        ldrh            r11, [r10], #2

.irp i, 0, 2, 4, 6
        add             r6, sp, #(\i*16*2)
.if \i > 0
        // r8 = number of buffer rows still unwritten, used by the fill loop
        mov             r8, #(8 - \i)
        cmp             r3, r11
        blt             1f
.if \i < 6
        ldrh            r11, [r10], #2
.endif
.endif
        add             r7, r2, #(\i*4)
        mov             r8, #8*4
        bl              inv_txfm_horz_scale_16x2_neon
.endr
        b               3f
1:
        vmov.i16        q2, #0
        vmov.i16        q3, #0
2:
        // Each iteration clears 64 bytes and counts r8 down by two.
        subs            r8, r8, #2
.rept 2
        vst1.16         {q2, q3}, [r6, :128]!
.endr
        bgt             2b
3:

.irp i, 0, 8
        add             r7, sp, #(\i*2)
        mov             r8, #32
.irp j, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.16         {\j}, [r7, :128], r8
.endr
        blx             r5

        add             r6, r0, #(\i*2)
        load_add_store_8x8 r6, r7
.endr

        add_sp_align    256
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// Shared 8x16 body. In: r0 = dst, r2 = coefficients, r3 = value compared
// against thresholds from the table at r10, r4 = first pass transform;
// r5 is not referenced here and is presumably consumed by
// inv_txfm_add_vert_4x16_neon (TODO confirm).
// Only every second table entry is used: r10 is advanced by 2 first and then
// stepped by 4 bytes per read.
// First pass: up to four groups of four 32-bit columns are loaded (zeroing
// the source), scaled with 2896*8 << 16 by scale_input, transformed through
// r4, narrowed with a rounding shift by 1, transposed and stored to a
// 256-byte stack buffer. If r3 is below the next threshold, the rest of the
// buffer is zero-filled instead.
// Second pass: two calls of inv_txfm_add_vert_4x16_neon, one per 4-wide
// column group.
function inv_txfm_add_8x16_neon
        add             r10, r10, #2
        sub_sp_align    256
        ldrh            r11, [r10], #4

.irp i, 0, 4, 8, 12
        add             r6, sp, #(\i*8*2)
.if \i > 0
        // r8 = number of buffer rows still unwritten, used by the fill loop
        mov             r8, #(16 - \i)
        cmp             r3, r11
        blt             1f
.if \i < 12
        ldrh            r11, [r10], #4
.endif
.endif
        add             r7, r2, #(\i*4)
        mov             r8, #16*4

        mov_const       r12, 2896*8*(1<<16)
        vmov.i32        q2, #0
        vdup.32         d0, r12

.irp j, q8, q9, q10, q11, q12, q13, q14, q15
        vld1.32         {\j}, [r7, :128]
        vst1.32         {q2}, [r7, :128], r8
.endr
        scale_input     d0[0], q8, q9, q10, q11, q12, q13, q14, q15
        blx             r4
        vqrshrn.s32     d16, q8, #1
        vqrshrn.s32     d17, q9, #1
        vqrshrn.s32     d18, q10, #1
        vqrshrn.s32     d19, q11, #1
        vqrshrn.s32     d20, q12, #1
        vqrshrn.s32     d21, q13, #1
        vqrshrn.s32     d22, q14, #1
        vqrshrn.s32     d23, q15, #1
        transpose_4x4h  q8, q9, d16, d17, d18, d19
        transpose_4x4h  q10, q11, d20, d21, d22, d23
        // Interleave the two transposed 4x4 tiles into 8-wide buffer rows.
.irp j, d16, d20, d17, d21, d18, d22, d19, d23
        vst1.16         {\j}, [r6, :64]!
.endr
.endr
        b               3f
1:
        vmov.i16        q2, #0
        vmov.i16        q3, #0
2:
        // Each iteration clears 64 bytes and counts r8 down by four.
        subs            r8, r8, #4
.rept 2
        vst1.16         {q2, q3}, [r6, :128]!
.endr
        bgt             2b
3:

.irp i, 0, 4
        add             r6, r0, #(\i*2)
        add             r7, sp, #(\i*2)
        mov             r8, #16
        bl              inv_txfm_add_vert_4x16_neon
.endr

        add_sp_align    256
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc

// Threshold tables selected into r10 by def_fn_816, shared by the 8x16 and
// 16x8 bodies. The 16x8 body reads entries 0-2, the 8x16 body entries 1, 3
// and 5.
const eob_8x16
        .short 3, 10, 21, 43, 59, 75, 91, 128
endconst

const eob_8x16_identity1
        .short 2, 4, 6, 64, 80, 96, 112, 128
endconst

const eob_8x16_identity2
        .short 2, 4, 6, 8, 10, 12, 14, 128
endconst

// Emits one exported 8x16 or 16x8 entry point.
//   w, h         - block width and height
//   txfm1, txfm2 - first and second pass transform names
// r4/r5 receive the transform routines matching the block orientation and
// r10 the address of the eob_8x16 table variant chosen by which pass, if
// any, is the identity transform.
.macro def_fn_816 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w, \h, 1
.endif
        push            {r4-r11,lr}
        vpush           {q4-q7}
.if \w == 8
        movrel_local    r4, inv_\txfm1\()_4s_x8_neon
        movrel          r5, X(inv_\txfm2\()_4h_x16_neon)
.else
        movrel_local    r4, inv_\txfm1\()_2s_x16_neon
        movrel          r5, X(inv_\txfm2\()_8h_x8_neon)
.endif
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel_local    r10, eob_8x16
.else
        movrel_local    r10, eob_8x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel_local    r10, eob_8x16_identity2
.else
        movrel_local    r10, eob_8x16
.endif
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// Instantiates all sixteen transform pairs for one block size.
.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct
def_fn_816 \w, \h, identity, identity
def_fn_816 \w, \h, dct, adst
def_fn_816 \w, \h, dct, flipadst
def_fn_816 \w, \h, dct, identity
def_fn_816 \w, \h, adst, dct
def_fn_816 \w, \h, adst, adst
def_fn_816 \w, \h, adst, flipadst
def_fn_816 \w, \h, flipadst, dct
def_fn_816 \w, \h, flipadst, adst
def_fn_816 \w, \h, flipadst, flipadst
def_fn_816 \w, \h, identity, dct
def_fn_816 \w, \h, adst, identity
def_fn_816 \w, \h, flipadst, identity
def_fn_816 \w, \h, identity, adst
def_fn_816 \w, \h, identity, flipadst
.endm

def_fns_816 8, 16
def_fns_816 16, 8

function inv_dct32_odd_2s_x16_neon
        movrel_local    r12, idct_coeffs, 4*16
        vld1.32         {q0, q1}, [r12, :128]!
vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a vrshr.s32 d16, d4, #12 // t16a vrshr.s32 d31, d6, #12 // t31a vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a vrshr.s32 d24, d8, #12 // t17a vrshr.s32 d23, d4, #12 // t30a vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a vrshr.s32 d20, d6, #12 // t18a vrshr.s32 d27, d8, #12 // t29a vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #4*24 vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a vrshr.s32 d28, d4, #12 // t19a vrshr.s32 d19, d6, #12 // t28a vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a vrshr.s32 d18, d8, #12 // t20a vrshr.s32 d29, d4, #12 // t27a vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a vrshr.s32 d26, d6, #12 // t21a vrshr.s32 d21, d8, #12 // t26a vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a vrshr.s32 d22, d4, #12 // t22a vrshr.s32 d25, d6, #12 // t25a vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a vrshr.s32 d30, d8, #12 // t23a vrshr.s32 d17, d4, #12 // t24a vld1.32 {q0, q1}, [r12, :128] vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqsub.s32 d5, d16, d24 // t17 vqadd.s32 d16, d16, d24 // t16 vqsub.s32 d7, d31, d23 // t30 vqadd.s32 d31, d31, d23 // t31 vqsub.s32 d24, d28, d20 // t18 vqadd.s32 d28, d28, d20 // t19 vqadd.s32 d23, d18, d26 // t20 vqsub.s32 d18, d18, d26 // t21 vqsub.s32 d20, d30, d22 // t22 vqadd.s32 d30, d30, d22 // t23 vqadd.s32 d26, d17, d25 // t24 vqsub.s32 d17, d17, d25 // t25 vqsub.s32 d22, d29, d21 // t26 vqadd.s32 d29, d29, d21 // t27 vqadd.s32 d25, d19, d27 // t28 vqsub.s32 d19, d19, d27 // t29 .irp r, d5, d16, d7, d31, 
d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 vmin.s32 \r, \r, d11 .endr .irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a vrshr.s32 d21, d4, #12 // t17a vrshr.s32 d27, d6, #12 // t30a vneg.s32 d8, d8 // -> t18a vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a vrshr.s32 d19, d8, #12 // t18a vrshr.s32 d24, d5, #12 // t29a vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a vrshr.s32 d22, d4, #12 // t21a vrshr.s32 d18, d6, #12 // t26a vneg.s32 d8, d8 // -> t22a vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a vrshr.s32 d17, d8, #12 // t22a vrshr.s32 d20, d5, #12 // t25a vqsub.s32 d2, d27, d24 // t29 vqadd.s32 d27, d27, d24 // t30 vqsub.s32 d3, d21, d19 // t18 vqadd.s32 d21, d21, d19 // t17 vqsub.s32 d24, d16, d28 // t19a vqadd.s32 d16, d16, d28 // t16a vqsub.s32 d19, d30, d23 // t20a vqadd.s32 d30, d30, d23 // t23a vqsub.s32 d28, d17, d22 // t21 vqadd.s32 d17, d17, d22 // t22 vqadd.s32 d23, d26, d29 // t24a vqsub.s32 d26, d26, d29 // t27a vqadd.s32 d22, d20, d18 // t25 vqsub.s32 d20, d20, d18 // t26 vqsub.s32 d29, d31, d25 // t28a vqadd.s32 d31, d31, d25 // t31a .irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 vmin.s32 \r, \r, d11 .endr .irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 vrshr.s32 d18, d4, #12 // t18a vrshr.s32 d25, d6, #12 // t29a vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28 vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20 vrshr.s32 d29, d8, #12 // t19 vrshr.s32 d24, d5, #12 // t28 vneg.s32 d4, d4 // -> t20 vmul_vmls 
d6, d26, d19, d1[0], d1[1] // -> t27 vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a vrshr.s32 d26, d4, #12 // t20 vrshr.s32 d19, d6, #12 // t27 vneg.s32 d8, d8 // -> t21a vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a vrshr.s32 d20, d8, #12 // t21a vrshr.s32 d28, d5, #12 // t26a vqsub.s32 d2, d16, d30 // t23 vqadd.s32 d16, d16, d30 // t16 = out16 vqsub.s32 d3, d31, d23 // t24 vqadd.s32 d31, d31, d23 // t31 = out31 vqsub.s32 d23, d21, d17 // t22a vqadd.s32 d17, d21, d17 // t17a = out17 vqadd.s32 d30, d27, d22 // t30a = out30 vqsub.s32 d21, d27, d22 // t25a vqsub.s32 d27, d18, d20 // t21 vqadd.s32 d18, d18, d20 // t18 = out18 vqadd.s32 d4, d29, d26 // t19a = out19 vqsub.s32 d26, d29, d26 // t20a vqadd.s32 d29, d25, d28 // t29 = out29 vqsub.s32 d25, d25, d28 // t26 vqadd.s32 d28, d24, d19 // t28a = out28 vqsub.s32 d24, d24, d19 // t27a vmov d19, d4 // out19 .irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 vmin.s32 \r, \r, d11 .endr .irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 vrshr.s32 d20, d4, #12 // t20 vrshr.s32 d22, d6, #12 // t27 vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 vrshr.s32 d26, d4, #12 // t26a vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22 vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25 vrshr.s32 d21, d6, #12 // t21a vrshr.s32 d22, d24, #12 // t22 vrshr.s32 d25, d4, #12 // t25 vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a vrshr.s32 d23, d4, #12 // t23a vrshr.s32 d24, d6, #12 // t24a bx lr endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x2_neon push {lr} vmov.i32 d7, #0 lsl r8, r8, #1 .if \scale mov_const r12, 2896*8*(1<<16) vdup.32 d0, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, 
d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .if \scale scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_2s_x16_neon // idct_16 leaves the row_clip_max/min constants in d9 and d8, // but here we want to use full q registers for clipping. vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmin.s32 \r, \r, q3 .endr .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmax.s32 \r, \r, q2 .endr vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 vtrn.32 d22, d23 vtrn.32 d24, d25 vtrn.32 d26, d27 vtrn.32 d28, d29 vtrn.32 d30, d31 .macro store1 r0, r1, r2, r3 vst1.16 {\r0}, [r6, :64]! vst1.16 {\r1}, [r6, :64]! vst1.16 {\r2}, [r6, :64]! vst1.16 {\r3}, [r6, :64]! .endm store1 d16, d18, d20, d22 store1 d24, d26, d28, d30 store1 d17, d19, d21, d23 store1 d25, d27, d29, d31 .purgem store1 sub r6, r6, #64*2 vmov.i32 d7, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in d0[1] scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct32_odd_2s_x16_neon vtrn.32 d31, d30 vtrn.32 d29, d28 vtrn.32 d27, d26 vtrn.32 d25, d24 vtrn.32 d23, d22 vtrn.32 d21, d20 vtrn.32 d19, d18 vtrn.32 d17, d16 .macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift vld1.32 {q0, q1}, [r6, :128]! 
vld1.32 {q2, q3}, [r6, :128] sub r6, r6, #32 vqsub.s32 d15, d0, \r0 vqadd.s32 d0, d0, \r0 vqsub.s32 d14, d1, \r1 vqadd.s32 d1, d1, \r1 vqsub.s32 d13, d2, \r2 vqadd.s32 d2, d2, \r2 vqsub.s32 d12, d3, \r3 vqadd.s32 d3, d3, \r3 vqsub.s32 d11, d4, \r4 vqadd.s32 d4, d4, \r4 vqsub.s32 d10, d5, \r5 vqadd.s32 d5, d5, \r5 vqsub.s32 d9, d6, \r6 vqadd.s32 d6, d6, \r6 vqsub.s32 d8, d7, \r7 vqadd.s32 d7, d7, \r7 vqrshrn.s32 d0, q0, #\shift vqrshrn.s32 d1, q1, #\shift vqrshrn.s32 d2, q2, #\shift vqrshrn.s32 d3, q3, #\shift vqrshrn.s32 d4, q4, #\shift vqrshrn.s32 d5, q5, #\shift vqrshrn.s32 d6, q6, #\shift vqrshrn.s32 d7, q7, #\shift vrev32.16 q2, q2 vrev32.16 q3, q3 vst1.16 {q0, q1}, [r6, :128]! vst1.16 {q2, q3}, [r6, :128]! .endm store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift .purgem store2 pop {pc} endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_4x32_neon push {r10-r11,lr} lsl r8, r8, #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 bl X(inv_dct_4h_x16_neon) .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vst1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 sub r7, r7, r8, lsr #1 bl X(inv_dct32_odd_4h_x16_neon) neg r9, r8 mov r10, r6 vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff .macro combine r0, r1, r2, r3, op, stride vld1.16 {d4}, [r7, :64], \stride vld1.16 {d0}, [r10, :64], r1 vld1.16 {d5}, [r7, :64], \stride vld1.16 {d1}, [r10, :64], r1 \op\().s16 d4, d4, \r0 vld1.16 {d6}, [r7, :64], \stride vld1.16 {d2}, [r10, :64], r1 \op\().s16 d5, d5, \r1 vld1.16 {d3}, [r10, :64], r1 vrshr.s16 q2, q2, #4 \op\().s16 d6, d6, \r2 vld1.16 {d7}, [r7, :64], 
\stride vqadd.s16 q0, q0, q2 \op\().s16 d7, d7, \r3 vmax.s16 q0, q0, q6 vrshr.s16 q3, q3, #4 vmin.s16 q0, q0, q7 vqadd.s16 q1, q1, q3 vst1.16 {d0}, [r6, :64], r1 vmax.s16 q1, q1, q6 vst1.16 {d1}, [r6, :64], r1 vmin.s16 q1, q1, q7 vst1.16 {d2}, [r6, :64], r1 vst1.16 {d3}, [r6, :64], r1 .endm combine d31, d30, d29, d28, vqadd, r8 combine d27, d26, d25, d24, vqadd, r8 combine d23, d22, d21, d20, vqadd, r8 combine d19, d18, d17, d16, vqadd, r8 sub r7, r7, r8 combine d16, d17, d18, d19, vqsub, r9 combine d20, d21, d22, d23, vqsub, r9 combine d24, d25, d26, d27, vqsub, r9 combine d28, d29, d30, d31, vqsub, r9 .purgem combine pop {r10-r11,pc} endfunc const eob_32x32 .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024 endconst const eob_16x32 .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512 endconst const eob_16x32_shortside .short 3, 10, 21, 36, 55, 78, 105, 512 endconst const eob_8x32 .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256 endconst function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 push {r4-r7,lr} vpush {q6-q7} movrel_local r5, eob_32x32, 2 mov r6, #4*32 1: mov r12, #0 movrel_local r4, eob_32x32, 6 2: vmov.i32 q0, #0 add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r6 .endr vqmovn.s32 d16, q8 vqmovn.s32 d17, q12 vqmovn.s32 d18, q9 vqmovn.s32 d19, q13 vqmovn.s32 d20, q10 vqmovn.s32 d21, q14 vqmovn.s32 d22, q11 vqmovn.s32 d23, q15 transpose_4x8h q8, q9, q10, q11 load_add_store_8x4 r0, r7, shiftbits=2 ldrh lr, [r4], #8 sub r0, r0, r1, lsl #2 cmp r3, lr add r0, r0, #2*8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12, lsl #1 add r0, r0, r1, lsl #2 mls r2, r6, r12, r2 add r2, r2, #4*4 b 1b 9: vpop {q6-q7} pop {r4-r7,pc} endfunc .macro shift_8_regs op, shift .irp i, q8, q9, q10, q11, q12, q13, q14, q15 \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function 
inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 push {r4-r9,lr} vpush {q6-q7} mov r9, #0 mov_const r8, 2896*8*(1<<16) movt r9, #2*(5793-4096)*8 movrel_local r5, eob_16x32\hshort, 2 mov r6, #4*\h 1: mov r12, #0 movrel_local r4, eob_16x32\wshort, 6 2: vdup.i32 d0, r8 vmov.i32 q1, #0 vmov.32 d0[1], r9 add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q1}, [r2, :128], r6 .endr scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .if \w == 16 // 16x32 identity_8x4_shift1 d0[1] .else // 32x16 shift_8_regs vqshl.s32, 1 identity_8x4 d0[1] .endif vqmovn.s32 d16, q8 vqmovn.s32 d17, q12 vqmovn.s32 d18, q9 vqmovn.s32 d19, q13 vqmovn.s32 d20, q10 vqmovn.s32 d21, q14 vqmovn.s32 d22, q11 vqmovn.s32 d23, q15 transpose_4x8h q8, q9, q10, q11 .if \w == 16 load_add_store_8x4 r0, r7, shiftbits=2 .else load_add_store_8x4 r0, r7, shiftbits=4 .endif ldrh lr, [r4], #8 sub r0, r0, r1, lsl #2 cmp r3, lr add r0, r0, #2*8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12, lsl #1 add r0, r0, r1, lsl #2 mls r2, r6, r12, r2 add r2, r2, #4*4 b 1b 9: vpop {q6-q7} pop {r4-r9,pc} endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 push {r4-r5,lr} vpush {q6-q7} movrel_local r4, eob_8x32, 2 mov r12, #4*\h 1: ldrh lr, [r4], #4 .if \w == 8 vmov.i32 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r12 .endr vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q12, #1 vqrshrn.s32 d18, q9, #1 vqrshrn.s32 d19, q13, #1 vqrshrn.s32 d20, q10, #1 vqrshrn.s32 d21, q14, #1 vqrshrn.s32 d22, q11, #1 vqrshrn.s32 d23, q15, #1 transpose_4x8h q8, q9, q10, q11 cmp r3, lr load_add_store_8x4 r0, r5, shiftbits=2 blt 9f sub r2, r2, r12, lsl #3 add r2, r2, #4*4 .else vmov.i32 q0, #0 vmov.i32 q1, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q10, 
q11}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q12, q13}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q14, q15}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vqmovn.s32 d16, q8 vqmovn.s32 d17, q10 vqmovn.s32 d20, q9 vqmovn.s32 d21, q11 vqmovn.s32 d18, q12 vqmovn.s32 d19, q14 vqmovn.s32 d22, q13 vqmovn.s32 d23, q15 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 cmp r3, lr load_add_store_4x8 r0, r5, shiftbits=3 blt 9f sub r0, r0, r1, lsl #3 add r0, r0, #2*4 .endif b 1b 9: vpop {q6-q7} pop {r4-r5,pc} endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 idct_dc 32, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 2048 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, sp, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 2048 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 idct_dc 16, 32, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_2s_x16_neon .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, sp, #(\i*16*2) add r7, r2, #(\i*4) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif mov r8, #4*32 bl inv_txfm_horz_scale_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #16*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 1024 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 idct_dc 32, 16, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel r5, X(inv_dct_4h_x16_neon) .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, sp, #(\i*32*2) add r7, r2, #(\i*4) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 14 ldrh r11, [r10], #2 .endif .endif mov r8, #4*16 bl inv_txfm_horz_scale_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 1024 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 idct_dc 8, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 movrel_local r10, eob_8x32, 2 mov r8, #4*32 mov r9, #32 mov r6, sp 1: vmov.i32 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r8 .endr ldrh r11, [r10], #4 sub r2, r2, r8, lsl #3 sub r9, r9, #4 add r2, r2, #4*4 bl inv_dct_4s_x8_neon vqrshrn.s32 d16, q8, #2 vqrshrn.s32 d18, q9, #2 vqrshrn.s32 d20, q10, #2 vqrshrn.s32 d22, q11, #2 vqrshrn.s32 d17, q12, #2 vqrshrn.s32 d19, q13, #2 vqrshrn.s32 d21, q14, #2 vqrshrn.s32 d23, q15, #2 transpose_4x8h q8, q9, q10, q11 vst1.16 {q8, q9}, [r6, :128]! cmp r3, r11 vst1.16 {q10, q11}, [r6, :128]! bge 1b cmp r9, #0 beq 3f vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r9, r9, #4 .rept 2 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #8*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 idct_dc 32, 8, 2 push {r4-r11,lr} vpush {q4-q7} movrel_local r10, eob_8x32 sub_sp_align 512 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6 add r6, sp, #(\i*32*2) add r7, r2, #(\i*4) .if \i > 0 cmp r3, r11 mov r8, #(8 - \i) blt 1f .if \i < 6 ldrh r11, [r10], #2 .endif .endif mov r8, #8*4 bl inv_txfm_horz_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: mov r8, #2*32 mov r9, #0 1: add r6, r0, r9, lsl #1 add r7, sp, r9, lsl #1 // #(\i*2) .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r7, :128], r8 .endr add r9, r9, #8 bl X(inv_dct_8h_x8_neon) cmp r9, #32 load_add_store_8x8 r6, r7 blt 1b add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld1.32 {q0, q1}, [r12, :128]! vqrdmulh.s32 d23, d16, d0[1] // t63a vqrdmulh.s32 d16, d16, d0[0] // t32a vqrdmulh.s32 d22, d17, d1[0] // t62a vqrdmulh.s32 d17, d17, d1[1] // t33a vqrdmulh.s32 d21, d18, d2[1] // t61a vqrdmulh.s32 d18, d18, d2[0] // t34a vqrdmulh.s32 d20, d19, d3[0] // t60a vqrdmulh.s32 d19, d19, d3[1] // t35a vld1.32 {q0}, [r12, :128]! 
vqadd.s32 d24, d16, d17 // t32 vqsub.s32 d25, d16, d17 // t33 vqsub.s32 d26, d19, d18 // t34 vqadd.s32 d27, d19, d18 // t35 vqadd.s32 d28, d20, d21 // t60 vqsub.s32 d29, d20, d21 // t61 vqsub.s32 d30, d23, d22 // t62 vqadd.s32 d31, d23, d22 // t63 .irp r, q12, q13, q14, q15 vmin.s32 \r, \r, q5 .endr .irp r, q12, q13, q14, q15 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a vneg.s32 d4, d4 // t34a vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a vrshr.s32 d26, d4, #12 // t34a vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a vrshr.s32 d29, d6, #12 // t61a vrshr.s32 d25, d7, #12 // t33a vrshr.s32 d30, d4, #12 // t62a vqadd.s32 d16, d24, d27 // t32a vqsub.s32 d19, d24, d27 // t35a vqadd.s32 d17, d25, d26 // t33 vqsub.s32 d18, d25, d26 // t34 vqsub.s32 d20, d31, d28 // t60a vqadd.s32 d23, d31, d28 // t63a vqsub.s32 d21, d30, d29 // t61 vqadd.s32 d22, d30, d29 // t62 .irp r, q8, q9, q10, q11 vmin.s32 \r, \r, q5 .endr .irp r, q8, q9, q10, q11 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60 vrshr.s32 d21, d4, #12 // t61a vrshr.s32 d18, d6, #12 // t34a vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 vrshr.s32 d20, d7, #12 // t60 vrshr.s32 d19, d4, #12 // t35 vst1.32 {d16, d17, d18, d19}, [r6, :128]! vst1.32 {d20, d21, d22, d23}, [r6, :128]! 
bx lr endfunc function inv_dct64_step2_neon movrel_local r12, idct_coeffs vld1.32 {q0}, [r12, :128] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a vldr d16, [r6, #4*2*0] // t32a vldr d17, [r9, #4*2*8] // t39a vldr d18, [r9, #4*2*0] // t63a vldr d19, [r6, #4*2*8] // t56a vldr d20, [r6, #4*2*16] // t40a vldr d21, [r9, #4*2*24] // t47a vldr d22, [r9, #4*2*16] // t55a vldr d23, [r6, #4*2*24] // t48a vqadd.s32 d24, d16, d17 // t32 vqsub.s32 d25, d16, d17 // t39 vqadd.s32 d26, d18, d19 // t63 vqsub.s32 d27, d18, d19 // t56 vqsub.s32 d28, d21, d20 // t40 vqadd.s32 d29, d21, d20 // t47 vqadd.s32 d30, d23, d22 // t48 vqsub.s32 d31, d23, d22 // t55 .irp r, q12, q13, q14, q15 vmin.s32 \r, \r, q5 .endr .irp r, q12, q13, q14, q15 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a vrshr.s32 d25, d4, #12 // t56a vrshr.s32 d27, d6, #12 // t39a vneg.s32 d7, d7 // t40a vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a vrshr.s32 d31, d7, #12 // t40a vrshr.s32 d28, d4, #12 // t55a vqadd.s32 d16, d24, d29 // t32a vqsub.s32 d19, d24, d29 // t47a vqadd.s32 d17, d27, d31 // t39 vqsub.s32 d18, d27, d31 // t40 vqsub.s32 d20, d26, d30 // t48a vqadd.s32 d23, d26, d30 // t63a vqsub.s32 d21, d25, d28 // t55 vqadd.s32 d22, d25, d28 // t56 .irp r, q8, q9, q10, q11 vmin.s32 \r, \r, q5 .endr .irp r, q8, q9, q10, q11 vmax.s32 \r, \r, q4 .endr vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47 vrshr.s32 d18, d4, #12 // t40a vrshr.s32 d21, d6, #12 // t55a vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 vrshr.s32 d19, d7, #12 // t47 vrshr.s32 d20, d4, #12 // t48 vstr d16, [r6, #4*2*0] // t32a vstr d17, [r9, #4*2*0] // t39 vstr d18, [r6, #4*2*8] // t40a vstr d19, [r9, #4*2*8] // t47 vstr d20, [r6, #4*2*16] // t48 
vstr d21, [r9, #4*2*16] // t55a vstr d22, [r6, #4*2*24] // t56 vstr d23, [r9, #4*2*24] // t63a add r6, r6, #4*2 sub r9, r9, #4*2 cmp r6, r9 blt 1b bx lr endfunc .macro load8 src, strd, zero, clear .irp i, d16, d17, d18, d19, d20, d21, d22, d23 .if \clear vld1.32 {\i}, [\src, :64] vst1.32 {\zero}, [\src, :64], \strd .else vld1.32 {\i}, [\src, :64], \strd .endif .endr .endm .macro store16 dst vst1.32 {q8, q9}, [\dst, :128]! vst1.32 {q10, q11}, [\dst, :128]! vst1.32 {q12, q13}, [\dst, :128]! vst1.32 {q14, q15}, [\dst, :128]! .endm .macro clear_upper8 .irp i, q12, q13, q14, q15 vmov.i32 \i, #0 .endr .endm .macro vmov_if reg, val, cond .if \cond vmov.i32 \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond mov_const \gpr, \val vdup.32 \reg, \gpr .endif .endm .macro vst1_if regs, dst, dstalign, cond .if \cond vst1.32 \regs, \dst, \dstalign .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_2s_x64_neon mov r6, sp push {r10-r11,lr} lsl r8, r8, #2 movdup_if d0, r12, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 add r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct_2s_x16_neon // idct_16 leaves the row_clip_max/min constants in d9 and d8, // but here we want to use full q registers for clipping. 
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmin.s32 \r, \r, q3 .endr .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmax.s32 \r, \r, q2 .endr store16 r6 movdup_if d0, r12, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 lsr r8, r8, #1 sub r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct32_odd_2s_x16_neon add r10, r6, #8*15 sub r6, r6, #8*16 mov r9, #-8 vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 vld1.32 {d2}, [r6, :64]! vld1.32 {d3}, [r6, :64]! vqadd.s32 d6, d2, \r0 vqsub.s32 \r0, d2, \r0 vld1.32 {d4}, [r6, :64]! vqadd.s32 d7, d3, \r1 vqsub.s32 \r1, d3, \r1 vmin.s32 d6, d6, d1 vmin.s32 \r0, \r0, d1 vld1.32 {d5}, [r6, :64]! vqadd.s32 d2, d4, \r2 sub r6, r6, #8*4 vmax.s32 d6, d6, d0 vmax.s32 \r0, \r0, d0 vqsub.s32 \r2, d4, \r2 vmin.s32 d7, d7, d1 vmin.s32 \r1, \r1, d1 vst1.32 {d6}, [r6, :64]! vst1.32 {\r0}, [r10, :64], r9 vmin.s32 d2, d2, d1 vmin.s32 \r2, \r2, d1 vmax.s32 d7, d7, d0 vmax.s32 \r1, \r1, d0 vqadd.s32 d3, d5, \r3 vqsub.s32 \r3, d5, \r3 vmax.s32 d2, d2, d0 vmax.s32 \r2, \r2, d0 vmin.s32 d3, d3, d1 vmin.s32 \r3, \r3, d1 vst1.32 {d7}, [r6, :64]! vst1.32 {\r1}, [r10, :64], r9 vmax.s32 d3, d3, d0 vmax.s32 \r3, \r3, d0 vst1.32 {d2}, [r6, :64]! vst1.32 {\r2}, [r10, :64], r9 vst1.32 {d3}, [r6, :64]! 
vst1.32 {\r3}, [r10, :64], r9 .endm store_addsub d31, d30, d29, d28 store_addsub d27, d26, d25, d24 store_addsub d23, d22, d21, d20 store_addsub d19, d18, d17, d16 .purgem store_addsub add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 add r10, r7, r8, lsl #3 // offset 8 sub r9, r9, r8 // offset 15 sub r11, r10, r8 // offset 7 vld1.32 {d16}, [r7, :64] // in1 (offset 0) vld1.32 {d17}, [r9, :64] // in31 (offset 15) vld1.32 {d18}, [r10, :64] // in17 (offset 8) vld1.32 {d19}, [r11, :64] // in15 (offset 7) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r7, r7, r8, lsl #2 // offset 4 sub r9, r9, r8, lsl #2 // offset 11 sub r10, r7, r8 // offset 3 add r11, r9, r8 // offset 12 vld1.32 {d16}, [r10, :64] // in7 (offset 3) vld1.32 {d17}, [r11, :64] // in25 (offset 12) vld1.32 {d18}, [r9, :64] // in23 (offset 11) vld1.32 {d19}, [r7, :64] // in9 (offset 4) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear sub r10, r10, r8, lsl #1 // offset 1 sub r9, r9, r8, lsl #1 // offset 9 add r10, r10, r8 // offset 2 add r9, r9, r8 // offset 10 add r7, r7, r8 // offset 5 add r11, r11, r8 // offset 13 vld1.32 d16, [r10, :64] // in5 (offset 2) vld1.32 d17, [r11, :64] // in27 (offset 13) vld1.32 d18, [r9, :64] // in21 (offset 10) vld1.32 d19, [r7, :64] // in11 (offset 5) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if 
d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear sub r10, r10, r8 // offset 1 sub r9, r9, r8 // offset 9 add r11, r11, r8 // offset 14 add r7, r7, r8 // offset 6 vld1.32 d16, [r10, :64] // in3 (offset 1) vld1.32 d17, [r11, :64] // in29 (offset 14) vld1.32 d18, [r9, :64] // in19 (offset 9) vld1.32 d19, [r7, :64] // in13 (offset 6) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon sub r6, r6, #2*4*32 add r9, r6, #2*4*7 bl inv_dct64_step2_neon pop {r10-r11,pc} endfunc .endm def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x2_neon vdup.32 q4, r9 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, #2*56 push {r10-r11,lr} mov r10, #2*64 mov r11, #-2*4*4 1: vld1.32 {d16, d17, d18, d19}, [r7, :128]! vld1.32 {d28, d29, d30, d31}, [r8, :128], r11 vld1.32 {d20, d21, d22, d23}, [r7, :128]! 
vld1.32 {d24, d25, d26, d27}, [r8, :128], r11 vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 vtrn.32 d22, d23 vtrn.32 d31, d30 vtrn.32 d29, d28 vtrn.32 d27, d26 vtrn.32 d25, d24 .macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 vqsub.s32 d7, \src0, \src1 vqsub.s32 d6, \src2, \src3 vqsub.s32 d5, \src4, \src5 vqsub.s32 d4, \src6, \src7 vqadd.s32 d0, \src0, \src1 vqadd.s32 d1, \src2, \src3 vqadd.s32 d2, \src4, \src5 vqadd.s32 d3, \src6, \src7 vrshl.s32 q3, q3, q4 vrshl.s32 q2, q2, q4 vrshl.s32 q0, q0, q4 vrshl.s32 q1, q1, q4 vqmovn.s32 d7, q3 vqmovn.s32 d6, q2 vqmovn.s32 d0, q0 vqmovn.s32 d1, q1 vrev32.16 q3, q3 vst1.16 {q0}, [r6, :128], r10 vst1.16 {q3}, [r9, :128], r10 .endm store_addsub d16, d31, d18, d29, d20, d27, d22, d25 store_addsub d17, d30, d19, d28, d21, d26, d23, d24 .purgem store_addsub sub r6, r6, r10, lsl #1 sub r9, r9, r10, lsl #1 add r6, r6, #16 sub r9, r9, #16 cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_vert_dct_4x64_neon lsl r8, r8, #1 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, r1, lsl #6 sub r9, r9, r1 push {r10-r11,lr} neg r10, r1 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 vld1.16 {d0}, [r6, :64], r1 vld1.16 {d1}, [r9, :64], r10 vqadd.s16 d4, \src0, \src1 vld1.16 {d2}, [r6, :64] vqsub.s16 d5, \src0, \src1 vld1.16 {d3}, [r9, :64] vqadd.s16 d6, \src2, \src3 vqsub.s16 d7, \src2, \src3 sub r6, r6, r1 sub r9, r9, r10 vrshr.s16 q2, q2, #4 vrshr.s16 q3, q3, #4 vqadd.s16 q2, q2, q0 vqadd.s16 q3, q3, q1 vmax.s16 q2, q2, q6 vmax.s16 q3, q3, q6 vmin.s16 q2, q2, q7 vmin.s16 q3, q3, q7 vst1.16 {d4}, [r6, :64], r1 vst1.16 {d5}, [r9, :64], r10 vst1.16 {d6}, [r6, :64], r1 vst1.16 {d7}, [r9, :64], r10 .endm add_dest_addsub d16, d31, d17, d30 add_dest_addsub d18, d29, d19, d28 add_dest_addsub d20, d27, d21, d26 add_dest_addsub d22, d25, d23, d24 .purgem add_dest_addsub cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_dct_clear_2s_x64_neon add r6, r5, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 30 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r7, r5, #(\i*2) mov r8, #64*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 idct_dc 64, 32, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_dct_clear_scale_2s_x64_neon add r6, r5, #(\i*64*2) mov r9, #-1 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 30 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i*2) add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 idct_dc 32, 64, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 32*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_scale_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r7, r5, #(\i*2) mov r8, #32*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 32*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 idct_dc 64, 16, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*16*2+64*4*2 add r4, sp, #64*4*2 movrel_local r10, eob_16x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, r4, #(\i*64*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #16*4 bl inv_txfm_dct_clear_2s_x64_neon add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 8 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: movrel r5, X(inv_dct_4h_x16_neon) .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i*2) add r7, r4, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 64*16*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 idct_dc 16, 64, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 16*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_2s_x16_neon .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*16*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r7, r5, #(\i*2) mov r8, #16*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 16*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/loopfilter.S000066400000000000000000000771631517466257200241570ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro loop_filter wd function lpf_8_wd\wd\()_neon vabd.u8 d0, d22, d23 // abs(p1 - p0) vabd.u8 d1, d25, d24 // abs(q1 - q0) vabd.u8 d2, d23, d24 // abs(p0 - q0) vabd.u8 d3, d22, d25 // abs(p1 - q1) .if \wd >= 6 vabd.u8 d4, d21, d22 // abs(p2 - p1) vabd.u8 d5, d26, d25 // abs(q2 - q1) .endif .if \wd >= 8 vabd.u8 d6, d20, d21 // abs(p3 - p2) vabd.u8 d7, d27, d26 // abs(q3 - q3) .endif .if \wd >= 6 vmax.u8 d4, d4, d5 .endif vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2 .if \wd >= 8 vmax.u8 d6, d6, d7 .endif vshr.u8 d3, d3, #1 .if \wd >= 8 vmax.u8 d4, d4, d6 .endif .if \wd >= 6 vand d4, d4, d14 .endif vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 vmax.u8 d4, d0, d4 vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E vand d1, d1, d2 // fm vand d1, d1, d13 // fm && wd >= 4 .if \wd >= 6 vand d14, d14, d1 // fm && wd > 4 .endif .if \wd >= 16 vand d15, d15, d1 // fm && wd == 16 .endif vmov r10, r11, d1 orrs r10, r10, r11 beq 9f // if (!fm || wd < 4) return; .if \wd >= 6 vmov.i8 d10, #1 vabd.u8 d2, d21, d23 // abs(p2 - p0) vabd.u8 d3, d22, d23 // abs(p1 - p0) vabd.u8 d4, d25, d24 // abs(q1 - q0) vabd.u8 d5, d26, d24 // abs(q2 - q0) .if \wd >= 8 vabd.u8 d6, d20, d23 // abs(p3 - p0) vabd.u8 d7, d27, d24 // abs(q3 - q0) .endif vmax.u8 d2, d2, d3 vmax.u8 d4, d4, d5 .if \wd >= 8 vmax.u8 d6, d6, d7 .endif vmax.u8 d2, d2, d4 .if \wd >= 8 vmax.u8 d2, d2, d6 .endif .if \wd == 16 vabd.u8 d3, d17, d23 // abs(p6 - p0) vabd.u8 d4, d18, d23 // abs(p5 - p0) vabd.u8 d5, d19, d23 // abs(p4 - p0) .endif vcge.u8 d2, d10, d2 // flat8in .if \wd == 16 vabd.u8 d6, d28, d24 // abs(q4 - q0) vabd.u8 d7, d29, d24 // abs(q5 - q0) vabd.u8 d8, d30, d24 // abs(q6 - q0) .endif vand d14, d2, d14 // flat8in && fm && wd > 4 
vbic d1, d1, d14 // fm && wd >= 4 && !flat8in .if \wd == 16 vmax.u8 d3, d3, d4 vmax.u8 d5, d5, d6 .endif vmov r10, r11, d1 .if \wd == 16 vmax.u8 d7, d7, d8 vmax.u8 d3, d3, d5 vmax.u8 d3, d3, d7 vcge.u8 d3, d10, d3 // flat8out .endif orrs r10, r10, r11 .if \wd == 16 vand d15, d15, d3 // flat8out && fm && wd == 16 vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out .endif beq 1f // skip wd == 4 case .endif vsubl.u8 q1, d22, d25 // p1 - q1 vcgt.u8 d0, d0, d12 // hev vqmovn.s16 d2, q1 vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) vbic d0, d1, d0 // (fm && wd >= 4 && !hev) vsubl.u8 q1, d24, d23 vmov.i16 q3, #3 vmul.i16 q1, q1, q3 vmov.i8 d6, #4 vaddw.s8 q1, q1, d4 vmov.i8 d7, #3 vqmovn.s16 d2, q1 // f vqadd.s8 d4, d6, d2 // imin(f + 4, 127) vqadd.s8 d5, d7, d2 // imin(f + 3, 127) vshr.s8 d4, d4, #3 // f1 vshr.s8 d5, d5, #3 // f2 vmovl.u8 q1, d23 // p0 vmovl.u8 q3, d24 // q0 vaddw.s8 q1, q1, d5 vsubw.s8 q3, q3, d4 vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1 vqmovun.s16 d2, q1 // out p0 vqmovun.s16 d6, q3 // out q0 vbit d23, d2, d1 // if (fm && wd >= 4) vmovl.u8 q1, d22 // p1 vbit d24, d6, d1 // if (fm && wd >= 4) vmovl.u8 q3, d25 // q1 vaddw.s8 q1, q1, d4 vsubw.s8 q3, q3, d4 vqmovun.s16 d2, q1 // out p1 vqmovun.s16 d6, q3 // out q1 vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 vmov r10, r11, d14 orrs r10, r10, r11 beq 2f // skip if there's no flat8in vaddl.u8 q0, d21, d21 // p2 * 2 vaddl.u8 q1, d21, d22 // p2 + p1 vaddl.u8 q2, d22, d23 // p1 + p0 vaddl.u8 q3, d23, d24 // p0 + q0 vadd.i16 q4, q0, q1 vadd.i16 q5, q2, q3 vaddl.u8 q6, d24, d25 // q0 + q1 vadd.i16 q4, q4, q5 vsub.i16 q6, q6, q0 vaddl.u8 q5, d25, d26 // q1 + q2 vrshrn.i16 d0, q4, #3 // out p1 vadd.i16 q4, q4, q6 vsub.i16 q5, q5, q1 vaddl.u8 q6, d26, d26 // q2 + q2 vrshrn.i16 d1, q4, #3 // out p0 vadd.i16 q4, q4, q5 vsub.i16 q6, q6, q2 vrshrn.i16 d2, q4, #3 // out q0 vbit d22, d0, d14 
// p1 if (flat8in) vadd.i16 q4, q4, q6 vbit d23, d1, d14 // p0 if (flat8in) vrshrn.i16 d3, q4, #3 // out q1 vbit d24, d2, d14 // q0 if (flat8in) vbit d25, d3, d14 // q1 if (flat8in) .elseif \wd >= 8 vmov r10, r11, d14 orrs r10, r10, r11 .if \wd == 8 beq 8f // skip if there's no flat8in .else beq 2f // skip if there's no flat8in .endif vaddl.u8 q0, d20, d21 // p3 + p2 vaddl.u8 q1, d22, d25 // p1 + q1 vaddl.u8 q2, d20, d22 // p3 + p1 vaddl.u8 q3, d23, d26 // p0 + q2 vadd.i16 q4, q0, q0 // 2 * (p3 + p2) vaddw.u8 q4, q4, d23 // + p0 vaddw.u8 q4, q4, d24 // + q0 vadd.i16 q4, q4, q2 // + p3 + p1 vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2 vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1 vrshrn.i16 d10, q4, #3 // out p2 vadd.i16 q4, q4, q1 vaddl.u8 q0, d20, d23 // p3 + p0 vaddl.u8 q1, d24, d27 // q0 + q3 vrshrn.i16 d11, q4, #3 // out p1 vadd.i16 q4, q4, q3 vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0 vaddl.u8 q2, d21, d24 // p2 + q0 vaddl.u8 q3, d25, d27 // q1 + q3 vrshrn.i16 d12, q4, #3 // out p0 vadd.i16 q4, q4, q1 vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0 vaddl.u8 q0, d22, d25 // p1 + q1 vaddl.u8 q1, d26, d27 // q2 + q3 vrshrn.i16 d13, q4, #3 // out q0 vadd.i16 q4, q4, q3 vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1 vrshrn.i16 d0, q4, #3 // out q1 vadd.i16 q4, q4, q1 vbit d21, d10, d14 vbit d22, d11, d14 vbit d23, d12, d14 vrshrn.i16 d1, q4, #3 // out q2 vbit d24, d13, d14 vbit d25, d0, d14 vbit d26, d1, d14 .endif 2: .if \wd == 16 vmov r10, r11, d15 orrs r10, r10, r11 bne 1f // check if flat8out is needed vmov r10, r11, d14 orrs r10, r10, r11 beq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: vaddl.u8 q1, d17, d17 // p6 + p6 vaddl.u8 q2, d17, d18 // p6 + p5 vaddl.u8 q3, d17, d19 // p6 + p4 vaddl.u8 q4, d17, d20 // p6 + p3 vadd.i16 q6, q1, q2 vadd.i16 q5, q3, q4 vaddl.u8 q3, d17, d21 // p6 + p2 vadd.i16 q6, q6, q5 vaddl.u8 q4, d17, d22 // p6 + p1 vaddl.u8 q5, d18, d23 // p5 + p0 vadd.i16 q3, q3, q4 vaddl.u8 q4, d19, 
d24 // p4 + q0 vadd.i16 q6, q6, q3 vadd.i16 q5, q5, q4 vaddl.u8 q3, d20, d25 // p3 + q1 vadd.i16 q6, q6, q5 vsub.i16 q3, q3, q1 vaddl.u8 q1, d21, d26 // p2 + q2 vrshrn.i16 d0, q6, #4 // out p5 vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1) vsub.i16 q1, q1, q2 vaddl.u8 q2, d22, d27 // p1 + q3 vaddl.u8 q3, d17, d19 // p6 + p4 vrshrn.i16 d1, q6, #4 // out p4 vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2) vsub.i16 q2, q2, q3 vaddl.u8 q3, d23, d28 // p0 + q4 vaddl.u8 q4, d17, d20 // p6 + p3 vrshrn.i16 d2, q6, #4 // out p3 vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3) vsub.i16 q3, q3, q4 vaddl.u8 q4, d24, d29 // q0 + q5 vaddl.u8 q2, d17, d21 // p6 + p2 vrshrn.i16 d3, q6, #4 // out p2 vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4) vsub.i16 q4, q4, q2 vaddl.u8 q3, d25, d30 // q1 + q6 vaddl.u8 q5, d17, d22 // p6 + p1 vrshrn.i16 d4, q6, #4 // out p1 vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5) vsub.i16 q3, q3, q5 vaddl.u8 q4, d26, d30 // q2 + q6 vbif d0, d18, d15 // out p5 vaddl.u8 q5, d18, d23 // p5 + p0 vrshrn.i16 d5, q6, #4 // out p0 vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6) vsub.i16 q4, q4, q5 vaddl.u8 q5, d27, d30 // q3 + q6 vbif d1, d19, d15 // out p4 vaddl.u8 q9, d19, d24 // p4 + q0 vrshrn.i16 d6, q6, #4 // out q0 vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6) vsub.i16 q5, q5, q9 vaddl.u8 q4, d28, d30 // q4 + q6 vbif d2, d20, d15 // out p3 vaddl.u8 q9, d20, d25 // p3 + q1 vrshrn.i16 d7, q6, #4 // out q1 vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6) vsub.i16 q9, q4, q9 vaddl.u8 q5, d29, d30 // q5 + q6 vbif d3, d21, d15 // out p2 vaddl.u8 q10, d21, d26 // p2 + q2 vrshrn.i16 d8, q6, #4 // out q2 vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6) vsub.i16 q5, q5, q10 vaddl.u8 q9, d30, d30 // q6 + q6 vbif d4, d22, d15 // out p1 vaddl.u8 q10, d22, d27 // p1 + q3 vrshrn.i16 d9, q6, #4 // out q3 vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6) vsub.i16 q9, q9, q10 vbif d5, d23, d15 // out p0 vrshrn.i16 d10, q6, #4 // out q4 vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6) 
vrshrn.i16 d11, q6, #4 // out q5 vbif d6, d24, d15 // out q0 vbif d7, d25, d15 // out q1 vbif d8, d26, d15 // out q2 vbif d9, d27, d15 // out q3 vbif d10, d28, d15 // out q4 vbif d11, d29, d15 // out q5 .endif bx lr .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels bx r8 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels bx r9 .endif 9: // Return directly without writing back any pixels bx r12 endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_8_wd16 adr r8, 7f + CONFIG_THUMB adr r9, 8f + CONFIG_THUMB bl lpf_8_wd16_neon .endm .macro lpf_8_wd8 adr r9, 8f + CONFIG_THUMB bl lpf_8_wd8_neon .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon .endm function lpf_v_4_8_neon mov r12, lr sub r10, r0, r1, lsl #1 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 lpf_8_wd4 sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_4_8_neon mov r12, lr sub r10, r0, #2 add r0, r10, r1, lsl #2 vld1.32 {d22[0]}, [r10], r1 vld1.32 {d22[1]}, [r0], r1 vld1.32 {d23[0]}, [r10], r1 vld1.32 {d23[1]}, [r0], r1 vld1.32 {d24[0]}, [r10], r1 vld1.32 {d24[1]}, [r0], r1 vld1.32 {d25[0]}, [r10], r1 vld1.32 {d25[1]}, [r0], r1 add r0, r0, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 lpf_8_wd4 sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_6_8_neon mov r12, lr sub r10, r0, r1, lsl #1 sub 
r10, r10, r1 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 lpf_8_wd6 sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_6_8_neon mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #2 vld1.8 {d20}, [r10], r1 vld1.8 {d24}, [r0], r1 vld1.8 {d21}, [r10], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d22}, [r10], r1 vld1.8 {d26}, [r0], r1 vld1.8 {d23}, [r10], r1 vld1.8 {d27}, [r0], r1 add r0, r0, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 lpf_8_wd6 sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_8_8_neon mov r12, lr sub r10, r0, r1, lsl #2 vld1.8 {d20}, [r10, :64], r1 // p3 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d26}, [r0, :64], r1 // q2 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d27}, [r0, :64], r1 // q3 sub r0, r0, r1, lsl #2 lpf_8_wd8 sub r10, r0, r1, lsl #1 sub r10, r10, r1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d25}, [r0, :64], r1 // q1 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, 
[r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_8_8_neon mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #2 vld1.8 {d20}, [r10], r1 vld1.8 {d24}, [r0], r1 vld1.8 {d21}, [r10], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d22}, [r10], r1 vld1.8 {d26}, [r0], r1 vld1.8 {d23}, [r10], r1 vld1.8 {d27}, [r0], r1 add r0, r0, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 lpf_8_wd8 sub r10, r0, r1, lsl #3 sub r10, r10, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 add r0, r10, r1, lsl #2 vst1.8 {d20}, [r10], r1 vst1.8 {d24}, [r0], r1 vst1.8 {d21}, [r10], r1 vst1.8 {d25}, [r0], r1 vst1.8 {d22}, [r10], r1 vst1.8 {d26}, [r0], r1 vst1.8 {d23}, [r10], r1 vst1.8 {d27}, [r0], r1 add r0, r0, #4 bx r12 8: sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_16_8_neon mov r12, lr sub r10, r0, r1, lsl #3 add r10, r10, r1 vld1.8 {d17}, [r10, :64], r1 // p6 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d18}, [r10, :64], r1 // p5 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d19}, [r10, :64], r1 // p4 vld1.8 {d26}, [r0, :64], r1 // q2 vld1.8 {d20}, [r10, :64], r1 // p3 vld1.8 {d27}, [r0, :64], r1 // q3 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d28}, [r0, :64], r1 // q4 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d29}, [r0, :64], r1 // q5 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d30}, [r0, :64], r1 // q6 sub r0, r0, r1, lsl #3 add r0, r0, r1 lpf_8_wd16 sub r10, r0, r1, lsl #2 sub r10, r10, r1, lsl #1 vst1.8 {d0}, [r10, :64], r1 // p5 vst1.8 {d6}, [r0, :64], r1 // q0 vst1.8 {d1}, [r10, :64], r1 // p4 vst1.8 {d7}, [r0, :64], r1 // q1 vst1.8 {d2}, [r10, :64], r1 // p3 vst1.8 {d8}, [r0, :64], r1 // q2 vst1.8 
{d3}, [r10, :64], r1 // p2 vst1.8 {d9}, [r0, :64], r1 // q3 vst1.8 {d4}, [r10, :64], r1 // p1 vst1.8 {d10}, [r0, :64], r1 // q4 vst1.8 {d5}, [r10, :64], r1 // p0 vst1.8 {d11}, [r0, :64], r1 // q5 sub r0, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 bx r12 7: sub r10, r0, r1 sub r10, r10, r1, lsl #1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d25}, [r0, :64], r1 // q1 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_16_8_neon mov r12, lr sub r10, r0, #8 vld1.8 {d16}, [r10, :64], r1 vld1.8 {d24}, [r0, :64], r1 vld1.8 {d17}, [r10, :64], r1 vld1.8 {d25}, [r0, :64], r1 vld1.8 {d18}, [r10, :64], r1 vld1.8 {d26}, [r0, :64], r1 vld1.8 {d19}, [r10, :64], r1 vld1.8 {d27}, [r0, :64], r1 vld1.8 {d20}, [r10, :64], r1 vld1.8 {d28}, [r0, :64], r1 vld1.8 {d21}, [r10, :64], r1 vld1.8 {d29}, [r0, :64], r1 vld1.8 {d22}, [r10, :64], r1 vld1.8 {d30}, [r0, :64], r1 vld1.8 {d23}, [r10, :64], r1 vld1.8 {d31}, [r0, :64], r1 transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31 lpf_8_wd16 sub r0, r0, r1, lsl #3 sub r10, r0, #8 transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5 transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31 vst1.8 {d16}, [r10, :64], r1 vst1.8 {d6}, [r0, :64], r1 vst1.8 {d17}, [r10, :64], r1 vst1.8 {d7}, [r0, :64], r1 vst1.8 {d0}, [r10, :64], r1 vst1.8 {d8}, [r0, :64], r1 vst1.8 {d1}, [r10, :64], r1 vst1.8 {d9}, [r0, :64], r1 vst1.8 {d2}, [r10, :64], r1 vst1.8 {d10}, [r0, :64], r1 vst1.8 {d3}, [r10, :64], r1 vst1.8 {d11}, [r0, :64], r1 vst1.8 {d4}, [r10, :64], r1 vst1.8 {d30}, [r0, :64], r1 vst1.8 {d5}, [r10, :64], 
r1 vst1.8 {d31}, [r0, :64], r1 bx r12 7: sub r10, r0, r1, lsl #3 sub r10, r10, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 add r0, r10, r1, lsl #2 vst1.8 {d20}, [r10], r1 vst1.8 {d24}, [r0], r1 vst1.8 {d21}, [r10], r1 vst1.8 {d25}, [r0], r1 vst1.8 {d22}, [r10], r1 vst1.8 {d26}, [r0], r1 vst1.8 {d23}, [r10], r1 vst1.8 {d27}, [r0], r1 add r0, r0, #4 bx r12 8: sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc // void dav2d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av2FilterLUT *lut, const int w) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [r2] // vmask[0], vmask[1] .ifc \type, y ldr r2, [r2, #8] // vmask[2] .endif add r5, r5, #128 // Move to sharp part of lut .ifc \type, y orr r7, r7, r2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub r4, r3, r4, lsl #2 .else sub r3, r3, #4 lsl r4, r4, #2 .endif orr r6, r6, r7 // vmask[0] |= vmask[1] 1: tst r6, #0x03 .ifc \dir, v vld1.8 {d0}, [r4]! vld1.8 {d1}, [r3]! 
.else vld2.32 {d0[0], d1[0]}, [r3], r4 vld2.32 {d0[1], d1[1]}, [r3], r4 .endif beq 7f // if (!(vm & bits)) continue; vld1.8 {d5[]}, [r5] // sharp[0] add r5, r5, #8 vmov.i32 d2, #0xff vdup.32 d13, r6 // vmask[0] vand d0, d0, d2 // Keep only lowest byte in each 32 bit word vand d1, d1, d2 vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0] vmov.i8 d4, #1 vld1.8 {d6[]}, [r5] // sharp[1] sub r5, r5, #8 vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] vtst.32 d2, d1, d2 // L != 0 vmul.i32 d1, d1, d4 // L .ifc \type, y vdup.32 d15, r2 // vmask[2] .endif vdup.32 d14, r7 // vmask[1] vmov r10, r11, d2 orrs r10, r10, r11 beq 7f // if (!L) continue; vneg.s8 d5, d5 // -sharp[0] movrel_local r10, word_12 vshr.u8 d12, d1, #4 // H vld1.32 {d16}, [r10, :64] vshl.s8 d3, d1, d5 // L >> sharp[0] .ifc \type, y vtst.32 d15, d15, d16 // if (vmask[2] & bits) .endif vmov.i8 d7, #2 vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1]) vadd.i8 d0, d1, d7 // L + 2 vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I vadd.u8 d0, d0, d0 // 2*(L + 2) vtst.32 d14, d14, d16 // if (vmask[1] & bits) vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E vtst.32 d13, d13, d16 // if (vmask[0] & bits) vand d13, d13, d2 // vmask[0] &= L != 0 .ifc \type, y tst r2, #0x03 beq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif tst r7, #0x03 beq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_8_neon .else // wd6 bl lpf_\dir\()_6_8_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_8_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment r0. // If the whole function is skipped, increment it here instead. 
add r0, r0, r1, lsl #3 .else 7: .endif 8: lsrs r6, r6, #2 // vmask[0] >>= 2 lsr r7, r7, #2 // vmask[1] >>= 2 .ifc \type, y lsr r2, r2, #2 // vmask[2] >>= 2 .endif .ifc \dir, v add r0, r0, #8 .else // For dir h, r0 is returned incremented .endif bne 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_12, align=4 .word 1, 2 endconst dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/loopfilter16.S000066400000000000000000000777251517466257200243320ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro loop_filter wd function lpf_4_wd\wd\()_neon vabd.u16 d0, d22, d23 // abs(p1 - p0) vabd.u16 d1, d25, d24 // abs(q1 - q0) vabd.u16 d2, d23, d24 // abs(p0 - q0) vabd.u16 d3, d22, d25 // abs(p1 - q1) .if \wd >= 6 vabd.u16 d4, d21, d22 // abs(p2 - p1) vabd.u16 d5, d26, d25 // abs(q2 - q1) .endif .if \wd >= 8 vabd.u16 d6, d20, d21 // abs(p3 - p2) vabd.u16 d7, d27, d26 // abs(q3 - q3) .endif .if \wd >= 6 vmax.u16 d4, d4, d5 .endif vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2 .if \wd >= 8 vmax.u16 d6, d6, d7 .endif vshr.u16 d3, d3, #1 .if \wd >= 8 vmax.u16 d4, d4, d6 .endif vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 vmax.u16 d4, d0, d4 vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E vand d1, d1, d2 // fm && wd >= 4 (implicit) .if \wd >= 6 vmov d14, d1 // fm && wd > 4 (implicit) .endif .if \wd >= 16 vmov d15, d1 // fm && wd == 16 (implicit) .endif vmov r10, r11, d1 orrs r10, r10, r11 beq 9f // if (!fm || wd < 4) return; .if \wd >= 6 vmov.i16 d10, #1 vabd.u16 d2, d21, d23 // abs(p2 - p0) vabd.u16 d3, d22, d23 // abs(p1 - p0) vabd.u16 d4, d25, d24 // abs(q1 - q0) vabd.u16 d5, d26, d24 // abs(q2 - q0) vdup.16 d9, r9 // bitdepth_min_8 .if \wd >= 8 vabd.u16 d6, d20, d23 // abs(p3 - p0) vabd.u16 d7, d27, d24 // abs(q3 - q0) .endif vmax.u16 d2, d2, d3 vmax.u16 d4, d4, d5 .if \wd >= 8 vmax.u16 d6, d6, d7 .endif vmax.u16 d2, d2, d4 vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8 .if \wd >= 8 vmax.u16 d2, d2, d6 .endif .if \wd == 16 vabd.u16 d3, d17, d23 // abs(p6 - p0) vabd.u16 d4, d18, d23 // abs(p5 - p0) vabd.u16 d5, d19, d23 // abs(p4 - p0) .endif vcge.u16 d2, d10, d2 // flat8in .if \wd == 16 vabd.u16 d6, d28, d24 // abs(q4 - q0) vabd.u16 d7, d29, d24 // abs(q5 - q0) vabd.u16 
d8, d30, d24 // abs(q6 - q0) .endif vand d14, d2, d14 // flat8in && fm && wd > 4 vbic d1, d1, d14 // fm && wd >= 4 && !flat8in .if \wd == 16 vmax.u16 d3, d3, d4 vmax.u16 d5, d5, d6 .endif vmov r10, r11, d1 .if \wd == 16 vmax.u16 d7, d7, d8 vmax.u16 d3, d3, d5 vmax.u16 d3, d3, d7 vcge.u16 d3, d10, d3 // flat8out .endif orrs r10, r10, r11 .if \wd == 16 vand d15, d15, d3 // flat8out && fm && wd == 16 vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out .endif beq 1f // skip wd == 4 case .endif vdup.16 d3, r8 // bitdepth_max vsub.u16 d2, d22, d25 // p1 - q1 vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1 vcgt.u16 d0, d0, d12 // hev vmvn d9, d3 // - 128 * (1 << bitdepth_min_8) vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1) vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1) vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) vsub.u16 d2, d24, d23 vmov.i16 d6, #3 vbic d0, d1, d0 // (fm && wd >= 4 && !hev) vmul.i16 d2, d2, d6 vmov.i16 d7, #4 vadd.i16 d2, d2, d4 vmin.s16 d2, d2, d3 // f = iclip_diff() vmax.s16 d2, d2, d9 // f = iclip_diff() vqadd.s16 d4, d7, d2 // f + 4 vqadd.s16 d5, d6, d2 // f + 3 vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) vshr.s16 d4, d4, #3 // f1 vshr.s16 d5, d5, #3 // f2 vmov.i16 d9, #0 vdup.16 d3, r8 // bitdepth_max vqadd.s16 d2, d23, d5 // p0 + f2 vqsub.s16 d6, d24, d4 // q0 - f1 vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1 vmin.s16 d2, d2, d3 // out p0 = iclip_pixel() vmin.s16 d6, d6, d3 // out q0 = iclip_pixel() vmax.s16 d2, d2, d9 // out p0 = iclip_pixel() vmax.s16 d6, d6, d9 // out q0 = iclip_pixel() vbit d23, d2, d1 // if (fm && wd >= 4) vbit d24, d6, d1 // if (fm && wd >= 4) vqadd.s16 d2, d22, d4 // p1 + f vqsub.s16 d6, d25, d4 // q1 - f vmin.s16 d2, d2, d3 // out p1 = iclip_pixel() vmin.s16 d6, d6, d3 // out q1 = iclip_pixel() vmax.s16 d2, d2, d9 // out p1 = iclip_pixel() vmax.s16 d6, d6, d9 // out q1 = iclip_pixel() 
vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 vmov r10, r11, d14 orrs r10, r10, r11 beq 2f // skip if there's no flat8in vadd.i16 d0, d21, d21 // p2 * 2 vadd.i16 d2, d21, d22 // p2 + p1 vadd.i16 d4, d22, d23 // p1 + p0 vadd.i16 d6, d23, d24 // p0 + q0 vadd.i16 d8, d0, d2 vadd.i16 d10, d4, d6 vadd.i16 d12, d24, d25 // q0 + q1 vadd.i16 d8, d8, d10 vsub.i16 d12, d12, d0 vadd.i16 d10, d25, d26 // q1 + q2 vrshr.u16 d0, d8, #3 // out p1 vadd.i16 d8, d8, d12 vsub.i16 d10, d10, d2 vadd.i16 d12, d26, d26 // q2 + q2 vrshr.u16 d1, d8, #3 // out p0 vadd.i16 d8, d8, d10 vsub.i16 d12, d12, d4 vrshr.u16 d2, d8, #3 // out q0 vbit d22, d0, d14 // p1 if (flat8in) vadd.i16 d8, d8, d12 vbit d23, d1, d14 // p0 if (flat8in) vrshr.u16 d3, d8, #3 // out q1 vbit d24, d2, d14 // q0 if (flat8in) vbit d25, d3, d14 // q1 if (flat8in) .elseif \wd >= 8 vmov r10, r11, d14 orrs r10, r10, r11 .if \wd == 8 beq 8f // skip if there's no flat8in .else beq 2f // skip if there's no flat8in .endif vadd.i16 d0, d20, d21 // p3 + p2 vadd.i16 d2, d22, d25 // p1 + q1 vadd.i16 d4, d20, d22 // p3 + p1 vadd.i16 d6, d23, d26 // p0 + q2 vadd.i16 d8, d0, d0 // 2 * (p3 + p2) vadd.i16 d9, d23, d24 // p0 + q0 vadd.i16 d8, d8, d4 // + p3 + p1 vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2 vadd.i16 d8, d8, d9 // + p0 + q0 vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1 vrshr.u16 d10, d8, #3 // out p2 vadd.i16 d8, d8, d2 vadd.i16 d0, d20, d23 // p3 + p0 vadd.i16 d2, d24, d27 // q0 + q3 vrshr.u16 d11, d8, #3 // out p1 vadd.i16 d8, d8, d6 vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0 vadd.i16 d4, d21, d24 // p2 + q0 vadd.i16 d6, d25, d27 // q1 + q3 vrshr.u16 d12, d8, #3 // out p0 vadd.i16 d8, d8, d2 vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0 vadd.i16 d0, d22, d25 // p1 + q1 vadd.i16 d2, d26, d27 // q2 + q3 vrshr.u16 d13, d8, #3 // out q0 vadd.i16 d8, d8, d6 vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1 vrshr.u16 d0, d8, #3 // out q1 vadd.i16 d8, d8, d2 vbit d21, d10, d14 vbit d22, d11, 
d14 vbit d23, d12, d14 vrshr.u16 d1, d8, #3 // out q2 vbit d24, d13, d14 vbit d25, d0, d14 vbit d26, d1, d14 .endif 2: .if \wd == 16 vmov r10, r11, d15 orrs r10, r10, r11 bne 1f // check if flat8out is needed vmov r10, r11, d14 orrs r10, r10, r11 beq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: vadd.i16 d2, d17, d17 // p6 + p6 vadd.i16 d4, d17, d18 // p6 + p5 vadd.i16 d6, d17, d19 // p6 + p4 vadd.i16 d8, d17, d20 // p6 + p3 vadd.i16 d12, d2, d4 vadd.i16 d10, d6, d8 vadd.i16 d6, d17, d21 // p6 + p2 vadd.i16 d12, d12, d10 vadd.i16 d8, d17, d22 // p6 + p1 vadd.i16 d10, d18, d23 // p5 + p0 vadd.i16 d6, d6, d8 vadd.i16 d8, d19, d24 // p4 + q0 vadd.i16 d12, d12, d6 vadd.i16 d10, d10, d8 vadd.i16 d6, d20, d25 // p3 + q1 vadd.i16 d12, d12, d10 vsub.i16 d6, d6, d2 vadd.i16 d2, d21, d26 // p2 + q2 vrshr.u16 d0, d12, #4 // out p5 vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1) vsub.i16 d2, d2, d4 vadd.i16 d4, d22, d27 // p1 + q3 vadd.i16 d6, d17, d19 // p6 + p4 vrshr.u16 d1, d12, #4 // out p4 vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2) vsub.i16 d4, d4, d6 vadd.i16 d6, d23, d28 // p0 + q4 vadd.i16 d8, d17, d20 // p6 + p3 vrshr.u16 d2, d12, #4 // out p3 vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3) vsub.i16 d6, d6, d8 vadd.i16 d8, d24, d29 // q0 + q5 vadd.i16 d4, d17, d21 // p6 + p2 vrshr.u16 d3, d12, #4 // out p2 vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4) vsub.i16 d8, d8, d4 vadd.i16 d6, d25, d30 // q1 + q6 vadd.i16 d10, d17, d22 // p6 + p1 vrshr.u16 d4, d12, #4 // out p1 vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5) vsub.i16 d6, d6, d10 vadd.i16 d8, d26, d30 // q2 + q6 vbif d0, d18, d15 // out p5 vadd.i16 d10, d18, d23 // p5 + p0 vrshr.u16 d5, d12, #4 // out p0 vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6) vsub.i16 d8, d8, d10 vadd.i16 d10, d27, d30 // q3 + q6 vbif d1, d19, d15 // out p4 vadd.i16 d18, d19, d24 // p4 + q0 vrshr.u16 d6, d12, #4 // out q0 vadd.i16 d12, d12, d8 // - (p5 
+ p0) + (q2 + q6) vsub.i16 d10, d10, d18 vadd.i16 d8, d28, d30 // q4 + q6 vbif d2, d20, d15 // out p3 vadd.i16 d18, d20, d25 // p3 + q1 vrshr.u16 d7, d12, #4 // out q1 vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6) vsub.i16 d18, d8, d18 vadd.i16 d10, d29, d30 // q5 + q6 vbif d3, d21, d15 // out p2 vadd.i16 d20, d21, d26 // p2 + q2 vrshr.u16 d8, d12, #4 // out q2 vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6) vsub.i16 d10, d10, d20 vadd.i16 d18, d30, d30 // q6 + q6 vbif d4, d22, d15 // out p1 vadd.i16 d20, d22, d27 // p1 + q3 vrshr.u16 d9, d12, #4 // out q3 vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6) vsub.i16 d18, d18, d20 vbif d5, d23, d15 // out p0 vrshr.u16 d10, d12, #4 // out q4 vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6) vrshr.u16 d11, d12, #4 // out q5 vbif d6, d24, d15 // out q0 vbif d7, d25, d15 // out q1 vbif d8, d26, d15 // out q2 vbif d9, d27, d15 // out q3 vbif d10, d28, d15 // out q4 vbif d11, d29, d15 // out q5 .endif bx lr .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels bx r6 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels bx r7 .endif 9: // Return directly without writing back any pixels bx r12 endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_4_wd16 adr r6, 7f + CONFIG_THUMB adr r7, 8f + CONFIG_THUMB bl lpf_4_wd16_neon .endm .macro lpf_4_wd8 adr r7, 8f + CONFIG_THUMB bl lpf_4_wd8_neon .endm .macro lpf_4_wd6 bl lpf_4_wd6_neon .endm .macro lpf_4_wd4 bl lpf_4_wd4_neon .endm function lpf_v_4_4_neon mov r12, lr sub r10, r0, r1, lsl #1 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 lpf_4_wd4 sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_4_4_neon 
mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #1 vld1.16 {d22}, [r10], r1 vld1.16 {d24}, [r0], r1 vld1.16 {d23}, [r10], r1 vld1.16 {d25}, [r0], r1 add r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 lpf_4_wd4 sub r10, r0, r1, lsl #2 sub r10, r10, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_6_4_neon mov r12, lr sub r10, r0, r1, lsl #1 sub r10, r10, r1 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d25}, [r0, :64], r1 // q1 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 lpf_4_wd6 sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_6_4_neon mov r12, lr sub r10, r0, #8 vld1.16 {d20}, [r10, :64], r1 vld1.16 {d24}, [r0, :64], r1 vld1.16 {d21}, [r10, :64], r1 vld1.16 {d25}, [r0, :64], r1 vld1.16 {d22}, [r10, :64], r1 vld1.16 {d26}, [r0, :64], r1 vld1.16 {d23}, [r10, :64], r1 vld1.16 {d27}, [r0, :64], r1 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 lpf_4_wd6 sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_8_4_neon mov r12, lr sub r10, r0, r1, lsl #2 vld1.16 {d20}, [r10, :64], r1 // p3 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d25}, [r0, :64], r1 // q1 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d26}, [r0, :64], r1 // q2 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d27}, [r0, :64], r1 // q3 sub r0, r0, r1, lsl #2 lpf_4_wd8 sub 
r10, r0, r1, lsl #1 sub r10, r10, r1 vst1.16 {d21}, [r10, :64], r1 // p2 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d25}, [r0, :64], r1 // q1 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_8_4_neon mov r12, lr sub r10, r0, #8 vld1.16 {d20}, [r10, :64], r1 vld1.16 {d24}, [r0, :64], r1 vld1.16 {d21}, [r10, :64], r1 vld1.16 {d25}, [r0, :64], r1 vld1.16 {d22}, [r10, :64], r1 vld1.16 {d26}, [r0, :64], r1 vld1.16 {d23}, [r10, :64], r1 vld1.16 {d27}, [r0, :64], r1 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 lpf_4_wd8 sub r0, r0, r1, lsl #2 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 sub r10, r0, #8 vst1.16 {d20}, [r10, :64], r1 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d21}, [r10, :64], r1 vst1.16 {d25}, [r0, :64], r1 vst1.16 {d22}, [r10, :64], r1 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d23}, [r10, :64], r1 vst1.16 {d27}, [r0, :64], r1 bx r12 8: sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_16_4_neon mov r12, lr sub r10, r0, r1, lsl #3 add r10, r10, r1 vld1.16 {d17}, [r10, :64], r1 // p6 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d18}, [r10, :64], r1 // p5 vld1.16 {d25}, [r0, :64], r1 // q1 vld1.16 {d19}, [r10, :64], r1 // p4 vld1.16 {d26}, [r0, :64], r1 // q2 vld1.16 {d20}, [r10, :64], r1 // p3 vld1.16 {d27}, [r0, :64], r1 // q3 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d28}, [r0, :64], r1 // q4 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d29}, [r0, :64], r1 // q5 
vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d30}, [r0, :64], r1 // q6 sub r0, r0, r1, lsl #3 add r0, r0, r1 lpf_4_wd16 sub r10, r0, r1, lsl #2 sub r10, r10, r1, lsl #1 vst1.16 {d0}, [r10, :64], r1 // p5 vst1.16 {d6}, [r0, :64], r1 // q0 vst1.16 {d1}, [r10, :64], r1 // p4 vst1.16 {d7}, [r0, :64], r1 // q1 vst1.16 {d2}, [r10, :64], r1 // p3 vst1.16 {d8}, [r0, :64], r1 // q2 vst1.16 {d3}, [r10, :64], r1 // p2 vst1.16 {d9}, [r0, :64], r1 // q3 vst1.16 {d4}, [r10, :64], r1 // p1 vst1.16 {d10}, [r0, :64], r1 // q4 vst1.16 {d5}, [r10, :64], r1 // p0 vst1.16 {d11}, [r0, :64], r1 // q5 sub r0, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 bx r12 7: sub r10, r0, r1 sub r10, r10, r1, lsl #1 vst1.16 {d21}, [r10, :64], r1 // p2 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d25}, [r0, :64], r1 // q1 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_16_4_neon mov r12, lr sub r10, r0, #16 sub r0, r0, #8 vld1.16 {d16}, [r10, :64], r1 vld1.16 {d20}, [r0, :64], r1 vld1.16 {d17}, [r10, :64], r1 vld1.16 {d21}, [r0, :64], r1 vld1.16 {d18}, [r10, :64], r1 vld1.16 {d22}, [r0, :64], r1 vld1.16 {d19}, [r10, :64], r1 vld1.16 {d23}, [r0, :64], r1 sub r10, r10, r1, lsl #2 sub r0, r0, r1, lsl #2 add r10, r10, #16 add r0, r0, #16 vld1.16 {d24}, [r10, :64], r1 vld1.16 {d28}, [r0, :64], r1 vld1.16 {d25}, [r10, :64], r1 vld1.16 {d29}, [r0, :64], r1 vld1.16 {d26}, [r10, :64], r1 vld1.16 {d30}, [r0, :64], r1 vld1.16 {d27}, [r10, :64], r1 vld1.16 {d31}, [r0, :64], r1 sub r0, r0, #8 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 transpose_4x4h q14, q15, d28, d29, d30, d31 lpf_4_wd16 sub r0, r0, r1, lsl #2 
transpose_4x4h q8, q0, d16, d17, d0, d1 transpose_4x4h q1, q2, d2, d3, d4, d5 transpose_4x4h q3, q4, d6, d7, d8, d9 transpose_4x4h q5, q15, d10, d11, d30, d31 sub r10, r0, #16 sub r0, r0, #8 vst1.16 {d16}, [r10, :64], r1 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d17}, [r10, :64], r1 vst1.16 {d3}, [r0, :64], r1 vst1.16 {d0}, [r10, :64], r1 vst1.16 {d4}, [r0, :64], r1 vst1.16 {d1}, [r10, :64], r1 vst1.16 {d5}, [r0, :64], r1 sub r10, r10, r1, lsl #2 sub r0, r0, r1, lsl #2 add r10, r10, #16 add r0, r0, #16 vst1.16 {d6}, [r10, :64], r1 vst1.16 {d10}, [r0, :64], r1 vst1.16 {d7}, [r10, :64], r1 vst1.16 {d11}, [r0, :64], r1 vst1.16 {d8}, [r10, :64], r1 vst1.16 {d30}, [r0, :64], r1 vst1.16 {d9}, [r10, :64], r1 vst1.16 {d31}, [r0, :64], r1 sub r0, r0, #8 bx r12 7: sub r0, r0, r1, lsl #2 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 sub r10, r0, #8 vst1.16 {d20}, [r10, :64], r1 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d21}, [r10, :64], r1 vst1.16 {d25}, [r0, :64], r1 vst1.16 {d22}, [r10, :64], r1 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d23}, [r10, :64], r1 vst1.16 {d27}, [r0, :64], r1 bx r12 8: sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc // void dav2d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av2FilterLUT *lut, const int w, // const int bitdepth_max) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded sub sp, sp, #8 clz r9, r8 rsb r9, r9, #24 // bitdepth_min_8 ldrd r6, r7, [r2] // vmask[0], vmask[1] .ifc \type, y ldr r2, [r2, #8] // vmask[2] .endif add r5, r5, #128 // Move to sharp part of lut .ifc \type, y 
orr r7, r7, r2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub r4, r3, r4, lsl #2 .else sub r3, r3, #4 lsl r4, r4, #2 .endif orr r6, r6, r7 // vmask[0] |= vmask[1] 1: tst r6, #0x01 strd r6, r7, [sp] .ifc \dir, v ldrb r10, [r4], #4 ldrb r11, [r3], #4 .else ldrb r10, [r3] ldrb r11, [r3, #4] add r3, r3, r4 .endif beq 7f // if (!(vm & bits)) continue; orrs r12, r10, r11 vdup.16 d31, r9 // bitdepth_min_8 beq 7f // if (!(l[0][0] | l[offset][0])) continue; cmp r11, #0 // Check for nonzero values in l[0][0] ldrb r6, [r5], #8 // sharp[0] it eq moveq r11, r10 // if (!l[0][0]) L = l[offset][0] ldrb r12, [r5] // sharp[1] lsr r6, r11, r6 // L >> sharp[0] sub r5, r5, #8 cmp r12, r6 lsr r10, r11, #4 // H add r11, r11, #2 // L + 2 it lt movlt r6, r12 // imin(L >> sharp[0], sharp[1]) add r11, r11, r11 // 2*(L + 2) cmp r6, #1 lsl r10, r10, r9 // H << bitdepth_min_8 it lt movlt r6, #1 // imax(imin(), 1) = limit = I vdup.16 d12, r10 // H << bitdepth_min_8 add r11, r11, r6 // 2*(L + 2) + limit = E lsl r6, r6, r9 // I << bitdepth_min_8 lsl r11, r11, r9 // E << bitdepth_min_8 vdup.16 d11, r6 // I << bitdepth_min_8 vdup.16 d10, r11 // E << bitdepth_min_8 .ifc \type, y tst r2, #0x01 beq 2f // wd16 bl lpf_\dir\()_16_4_neon b 8f 2: .endif tst r7, #0x01 beq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_4_neon .else // wd6 bl lpf_\dir\()_6_4_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_4_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment r0. // If the whole function is skipped, increment it here instead. 
add r0, r0, r1, lsl #2 .else 7: .endif 8: ldrd r6, r7, [sp] .ifc \type, y lsr r2, r2, #1 // vmask[2] >>= 1 .endif .ifc \dir, v add r0, r0, #8 .else // For dir h, r0 is returned incremented .endif lsrs r6, r6, #1 // vmask[0] >>= 1 lsr r7, r7, #1 // vmask[1] >>= 1 bne 1b add sp, sp, #8 vpop {q4-q7} pop {r4-r11,pc} endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/mc.S000066400000000000000000003265231517466257200223740ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro avg dst0, dst1, t0, t1, t2, t3 vld1.16 {\t0,\t1}, [r2, :128]! vld1.16 {\t2,\t3}, [r3, :128]! vadd.i16 \t0, \t0, \t2 vadd.i16 \t1, \t1, \t3 vqrshrun.s16 \dst0, \t0, #5 vqrshrun.s16 \dst1, \t1, #5 .endm .macro w_avg dst0, dst1, t0, t1, t2, t3 vld1.16 {\t0,\t1}, [r2, :128]! vld1.16 {\t2,\t3}, [r3, :128]! vsub.i16 \t0, \t2, \t0 vsub.i16 \t1, \t3, \t1 vqdmulh.s16 \t0, \t0, q15 vqdmulh.s16 \t1, \t1, q15 vadd.i16 \t0, \t2, \t0 vadd.i16 \t1, \t3, \t1 vqrshrun.s16 \dst0, \t0, #4 vqrshrun.s16 \dst1, \t1, #4 .endm .macro mask dst0, dst1, t0, t1, t2, t3 vld1.8 {q14}, [lr, :128]! vld1.16 {\t0,\t1}, [r2, :128]! vmul.i8 q14, q14, q15 vld1.16 {\t2,\t3}, [r3, :128]! vshll.i8 q13, d28, #8 vshll.i8 q14, d29, #8 vsub.i16 \t0, \t2, \t0 vsub.i16 \t1, \t3, \t1 vqdmulh.s16 \t0, \t0, q13 vqdmulh.s16 \t1, \t1, q14 vadd.i16 \t0, \t2, \t0 vadd.i16 \t1, \t3, \t1 vqrshrun.s16 \dst0, \t0, #4 vqrshrun.s16 \dst1, \t1, #4 .endm .macro bidir_fn type function \type\()_8bpc_neon, export=1 push {r4-r6,lr} ldrd r4, r5, [sp, #16] clz r4, r4 .ifnc \type, avg ldr lr, [sp, #24] .endif .ifc \type, w_avg vdup.s16 q15, lr vneg.s16 q15, q15 vshl.i16 q15, q15, #11 .endif .ifc \type, mask vmov.i8 q15, #256-2 .endif adr r12, L(\type\()_tbl) sub r4, r4, #24 ldr r4, [r12, r4, lsl #2] \type d16, d17, q0, q1, q2, q3 add r12, r12, r4 bx r12 .align 2 L(\type\()_tbl): .word 1280f - L(\type\()_tbl) + CONFIG_THUMB .word 640f - L(\type\()_tbl) + CONFIG_THUMB .word 320f - L(\type\()_tbl) + CONFIG_THUMB .word 160f - L(\type\()_tbl) + CONFIG_THUMB .word 80f - L(\type\()_tbl) + CONFIG_THUMB .word 4f - L(\type\()_tbl) + CONFIG_THUMB 4: add r6, r0, r1 lsl r1, r1, #1 cmp r5, #4 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 beq 0f \type d18, d19, q0, q1, q2, q3 cmp r5, #8 vst1.32 {d18[0]}, [r0, :32], r1 vst1.32 {d18[1]}, [r6, :32], r1 vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d19[1]}, [r6, :32], r1 
beq 0f \type d16, d17, q0, q1, q2, q3 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 \type d18, d19, q0, q1, q2, q3 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 vst1.32 {d18[0]}, [r0, :32], r1 vst1.32 {d18[1]}, [r6, :32], r1 vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d19[1]}, [r6, :32], r1 pop {r4-r6,pc} 80: add r6, r0, r1 lsl r1, r1, #1 8: vst1.8 {d16}, [r0, :64], r1 \type d18, d19, q0, q1, q2, q3 vst1.8 {d17}, [r6, :64], r1 vst1.8 {d18}, [r0, :64], r1 subs r5, r5, #4 vst1.8 {d19}, [r6, :64], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 8b 160: add r6, r0, r1 lsl r1, r1, #1 16: \type d18, d19, q0, q1, q2, q3 vst1.8 {q8}, [r0, :128], r1 \type d20, d21, q0, q1, q2, q3 vst1.8 {q9}, [r6, :128], r1 \type d22, d23, q0, q1, q2, q3 vst1.8 {q10}, [r0, :128], r1 subs r5, r5, #4 vst1.8 {q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 16b 320: add r6, r0, r1 lsl r1, r1, #1 32: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d22, d23, q0, q1, q2, q3 subs r5, r5, #2 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 32b 640: add r6, r0, #32 64: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 \type d22, d23, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d16, d17, q0, q1, q2, q3 vst1.8 {q10, q11}, [r6, :128], r1 \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d22, d23, q0, q1, q2, q3 subs r5, r5, #2 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 64b 1280: sub r1, r1, #32 add r6, r0, #64 128: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 \type d22, d23, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128]! \type d16, d17, q0, q1, q2, q3 vst1.8 {q10, q11}, [r0, :128], r1 \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r6, :128]! 
\type d22, d23, q0, q1, q2, q3 subs r5, r5, #1 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 128b 0: pop {r4-r6,pc} endfunc .endm bidir_fn avg bidir_fn w_avg bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 push {r4-r9,lr} ldrd r4, r5, [sp, #28] ldrd r6, r7, [sp, #36] clz r8, r4 adr r9, L(w_mask_\type\()_tbl) sub r8, r8, #24 ldr r8, [r9, r8, lsl #2] add r9, r9, r8 movw r12, #6903 vdup.16 q14, r12 .if \type == 444 vmov.i8 q15, #64 .elseif \type == 422 vdup.8 d0, r7 // d0[] <- sign vmov.i8 d30, #129 vsub.i8 d30, d30, d0 // 129 - sign .elseif \type == 420 vdup.16 q0, r7 // d0[] <- sign vmov.i16 q15, #256 vsub.i16 q15, q15, q0 // 256 - sign .endif add r12, r0, r1 lsl r1, r1, #1 bx r9 .align 2 L(w_mask_\type\()_tbl): .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB 4: vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once) vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once) subs r5, r5, #4 vsub.i16 q8, q2, q0 // tmp2-tmp1 vsub.i16 q9, q3, q1 vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x])) vabd.s16 q11, q1, q3 vqsub.u16 q10, q14, q10 // 6903 - abs () vqsub.u16 q11, q14, q11 vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8 vshr.s16 q11, q11, #8 vshl.s16 q12, q10, #9 // (64-m)<<9 vshl.s16 q13, q11, #9 vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15 vqdmulh.s16 q13, q13, q9 vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1 vadd.i16 q13, q13, q1 vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4 vqrshrun.s16 d25, q13, #4 .if \type == 444 vmovn.u16 d20, q10 // 64 - m vmovn.u16 d21, q11 vsub.i8 q10, q15, q10 // m vst1.8 {d20, d21}, [r6, :128]! 
.elseif \type == 422 vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition) vpadd.s16 d21, d22, d23 vmovn.s16 d6, q10 vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1 vst1.8 {d6}, [r6, :64]! .elseif \type == 420 vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition) vadd.s16 d21, d22, d23 vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d20[0]}, [r6, :32]! .endif vst1.32 {d24[0]}, [r0, :32], r1 vst1.32 {d24[1]}, [r12, :32], r1 vst1.32 {d25[0]}, [r0, :32], r1 vst1.32 {d25[1]}, [r12, :32], r1 bgt 4b pop {r4-r9,pc} 8: vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2 vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2 subs r5, r5, #2 vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1 vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2 vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1) vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2) vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2) vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8 vshl.s16 q12, q10, #9 // (64 - my1) << 9 vshl.s16 q13, q11, #9 // (64 - my2) << 9 vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 .if \type == 444 vmovn.u16 d20, q10 // 64 - m vmovn.u16 d21, q11 vsub.i8 q10, q15, q10 // m vst1.8 {d20, d21}, [r6, :128]! 
.elseif \type == 422
        // Tail of the loop at local label 8 (two rows per iteration).
        // 4:2:2 mask output: add horizontally adjacent (64 - m) values, then
        // a halving subtract from (129 - sign).
        vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
        vpadd.s16       d21, d22, d23   // (64 - my2) + (64 - ny2)
        vmovn.s16       d20, q10
        vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
        vst1.8          {d20}, [r6, :64]!
.elseif \type == 420
        // 4:2:0 mask output: add the two rows first, then horizontally
        // adjacent pairs, and round-shift by 2.
        vadd.s16        q10, q10, q11   // (64 - my1) + (64 - my2) (row wise addition)
        vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
        vsub.s16        d20, d30, d20   // (256 - sign) - ((128 - m) + (128 - n))
        vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        vst1.32         {d20[0]}, [r6, :32]!
.endif
        vst1.16         {d24}, [r0,  :64], r1
        vst1.16         {d25}, [r12, :64], r1
        bgt             8b
        pop             {r4-r9,pc}
        // All remaining jump table targets share one implementation that
        // handles 16 pixels of two rows per inner iteration.
1280:
640:
320:
160:
        // The stores to r0/r12 below post-increment across the row, so take
        // r4 (presumably w, the value the jump table index was derived from)
        // off the per-row step kept in r1.
        sub             r1,  r1,  r4
.if \type == 444
        add             lr,  r6,  r4            // mask pointer for the second row
.elseif \type == 422
        add             lr,  r6,  r4,  lsr #1   // mask pointer for the second row (half width)
.endif
        add             r9,  r3,  r4,  lsl #1   // tmp2, second row (r4 16-bit elements further)
        add             r7,  r2,  r4,  lsl #1   // tmp1, second row
161:
        mov             r8,  r4                 // inner loop counter, reloaded for every row pair
16:
        vld1.16         {d0,  d1,  d2,  d3},  [r2, :128]! // tmp1y1
        vld1.16         {d4,  d5,  d6,  d7},  [r3, :128]! // tmp2y1
        vld1.16         {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
        subs            r8,  r8,  #16
        vsub.i16        q2,  q2,  q0    // tmp2y1 - tmp1y1
        vsub.i16        q3,  q3,  q1
        vabs.s16        q10, q2         // abs(tmp2y1 - tmp1y1)
        vabs.s16        q11, q3
        vqsub.u16       q10, q14, q10   // 6903 - abs(tmp1y1 - tmp2y1)
        vqsub.u16       q11, q14, q11
        vshr.s16        q10, q10, #8    // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8
        vshr.s16        q11, q11, #8
        vshl.s16        q12, q10, #9    // (64 - my1) << 9
        vshl.s16        q13, q11, #9
        vqdmulh.s16     q12, q12, q2    // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15
        vqdmulh.s16     q13, q13, q3
        vadd.i16        q12, q12, q0    // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
        vadd.i16        q13, q13, q1
        // q0/q1 (tmp1y1) are dead from here on and get reused for the
        // second row of tmp2.
        vld1.16         {d0,  d1,  d2,  d3},  [r9, :128]! // tmp2y2
.if \type == 444
        vmovn.u16       d20, q10        // 64 - my1
        vmovn.u16       d21, q11
        vsub.i8         q10, q15, q10   // my1
        vst1.8          {d20, d21}, [r6, :128]!
.elseif \type == 422
        vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
        vpadd.s16       d21, d22, d23
        vmovn.s16       d20, q10
        vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
        vst1.8          {d20}, [r6, :64]!
.endif
        // For 420 nothing is stored yet: q10/q11 keep (64 - my1) until the
        // second row's values have been computed below.
        vqrshrun.s16    d24, q12, #4    // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
        vqrshrun.s16    d25, q13, #4
        vsub.i16        q0,  q0,  q8    // tmp2y2 - tmp1y2
        vsub.i16        q1,  q1,  q9
        vst1.16         {d24, d25}, [r0, :128]! // store dsty1
        vabs.s16        q2,  q0         // abs(tmp2y2 - tmp1y2)
        vabs.s16        q3,  q1
        vqsub.u16       q2,  q14, q2    // 6903 - abs(tmp2y2 - tmp1y2)
        vqsub.u16       q3,  q14, q3
        vshr.s16        q2,  q2,  #8    // (6903 - abs(tmp2y2 - tmp1y2)) >> 8
        vshr.s16        q3,  q3,  #8
        vshl.s16        q12, q2,  #9    // (64 - my2) << 9
        vshl.s16        q13, q3,  #9
.if \type == 444
        vmovn.u16       d4,  q2         // 64 - my2
        vmovn.u16       d5,  q3
        vsub.i8         q2,  q15, q2    // my2
        vst1.8          {d4,  d5},  [lr, :128]!
.elseif \type == 422
        vpadd.s16       d4,  d4,  d5    // (64 - my2) + (64 - ny2) (column wise addition)
        vpadd.s16       d5,  d6,  d7
        vmovn.s16       d4,  q2
        vhsub.u8        d4,  d30, d4    // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
        vst1.8          {d4},  [lr, :64]!
.elseif \type == 420
        vadd.s16        q10, q10, q2    // (64 - my1) + (64 - my2) (row wise addition)
        vadd.s16        q11, q11, q3
        vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
        vpadd.s16       d21, d22, d23
        vsub.s16        q10, q15, q10   // (256 - sign) - ((128 - m) + (128 - n))
        vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        vst1.8          {d20}, [r6, :64]!
.endif
        vqdmulh.s16     q12, q12, q0    // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
        vqdmulh.s16     q13, q13, q1
        vadd.i16        q12, q12, q8    // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2
        vadd.i16        q13, q13, q9
        vqrshrun.s16    d24, q12, #4    // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
        vqrshrun.s16    d25, q13, #4
        vst1.16         {d24, d25}, [r12, :128]!
// store dsty2 (this comment belongs to the store instruction directly above)
        bgt             16b                     // flags from "subs r8, r8, #16" in the loop body
        // One pair of rows is done. Each source pointer has walked across its
        // own row, so skip the row consumed through the other pointer.
        subs            r5,  r5,  #2
        add             r2,  r2,  r4,  lsl #1
        add             r3,  r3,  r4,  lsl #1
        add             r7,  r7,  r4,  lsl #1
        add             r9,  r9,  r4,  lsl #1
.if \type == 444
        add             r6,  r6,  r4
        add             lr,  lr,  r4
.elseif \type == 422
        add             r6,  r6,  r4,  lsr #1
        add             lr,  lr,  r4,  lsr #1
.endif
        add             r0,  r0,  r1
        add             r12, r12, r1
        bgt             161b                    // still the flags from "subs r5, r5, #2"; add does not set flags
        pop             {r4-r9,pc}
endfunc
.endm

// Instantiate the macro once per mask layout.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


// Blend a temporary block into dst using a per-pixel 8 bit mask:
//   dst = (tmp * m + dst * (64 - m) + 32) >> 6
// Registers as used below: r0 = dst, r1 = dst stride, r2 = tmp, r3 = w,
// first stack argument (r4) = remaining rows, second stack argument
// (r5) = mask. tmp and mask are read linearly with post-increment.
function blend_8bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]     // the 12 bytes just pushed sit below the stack arguments
        clz             lr,  r3
        adr             r3,  L(blend_tbl)
        sub             lr,  lr,  #26           // clz(32) == 26 -> index 0 ... clz(4) == 29 -> index 3
        ldr             lr,  [r3, lr, lsl #2]
        add             r3,  r3,  lr
        bx              r3

        .align 2
L(blend_tbl):
        .word 320f - L(blend_tbl) + CONFIG_THUMB
        .word 160f - L(blend_tbl) + CONFIG_THUMB
        .word 80f  - L(blend_tbl) + CONFIG_THUMB
        .word 40f  - L(blend_tbl) + CONFIG_THUMB

        // 4 pixels wide, two rows per iteration packed into one d register.
40:
        vmov.i8         d22, #64
        add             r12, r0,  r1            // r12 = second dst row
        lsl             r1,  r1,  #1            // both row pointers step two rows at a time
4:
        vld1.u8         {d2},     [r5,  :64]!   // 8 mask bytes (2 rows x 4)
        vld1.u8         {d1},     [r2,  :64]!   // 8 tmp bytes
        vld1.32         {d0[]},   [r0,  :32]    // dst row 0 into both halves ...
        subs            r4,  r4,  #2
        vld1.32         {d0[1]},  [r12, :32]    // ... then dst row 1 into the upper half
        vsub.i8         d3,  d22, d2            // 64 - m
        vmull.u8        q8,  d1,  d2            // tmp * m
        vmlal.u8        q8,  d0,  d3            // + dst * (64 - m)
        vrshrn.i16      d20, q8,  #6            // rounding shift right by 6, narrow to 8 bit
        vst1.32         {d20[0]}, [r0,  :32], r1
        vst1.32         {d20[1]}, [r12, :32], r1
        bgt             4b
        pop             {r4-r5,pc}
        // 8 pixels wide, two rows per iteration.
80:
        vmov.i8         d16, #64
        add             r12, r0,  r1
        lsl             r1,  r1,  #1
8:
        vld1.u8         {q1},  [r5,  :128]!     // mask, rows 0 (d2) and 1 (d3)
        vld1.u8         {q2},  [r2,  :128]!     // tmp, rows 0 (d4) and 1 (d5)
        vld1.u8         {d0},  [r0,  :64]
        vsub.i8         d17, d16, d2            // 64 - m, row 0
        vld1.u8         {d1},  [r12, :64]
        subs            r4,  r4,  #2
        vsub.i8         d18, d16, d3            // 64 - m, row 1
        vmull.u8        q3,  d2,  d4
        vmlal.u8        q3,  d0,  d17
        vmull.u8        q10, d3,  d5
        vmlal.u8        q10, d1,  d18
        vrshrn.i16      d22, q3,  #6
        vrshrn.i16      d23, q10, #6
        vst1.u8         {d22}, [r0,  :64], r1
        vst1.u8         {d23}, [r12, :64], r1
        bgt             8b
        pop             {r4-r5,pc}
        // 16 pixels wide, two rows per iteration.
160:
        vmov.i8         q12, #64
        add             r12, r0,  r1
        lsl             r1,  r1,  #1
16:
        vld1.u8         {q1,  q2},  [r5,  :128]!        // mask, rows 0 and 1
        vld1.u8         {q8,  q9},  [r2,  :128]!        // tmp, rows 0 and 1
        vld1.u8         {q0},  [r0,  :128]
        subs            r4,  r4,  #2
        vsub.i8         q15, q12, q1            // 64 - m, row 0
        vld1.u8         {q13}, [r12, :128]
        vmull.u8        q3,  d16, d2
        vmlal.u8        q3,  d0,  d30
        vmull.u8        q14, d17, d3
        vmlal.u8        q14, d1,  d31
        vsub.i8         q15, q12, q2            // 64 - m, row 1 (q15 is reused)
        vrshrn.i16      d20, q3,  #6
        vrshrn.i16      d21, q14, #6
        vmull.u8        q3,  d18, d4
        vmlal.u8        q3,  d26, d30
        vmull.u8        q14, d19, d5
        vmlal.u8        q14, d27, d31
        vrshrn.i16      d22, q3,  #6
        vrshrn.i16      d23, q14, #6
        vst1.u8         {q10}, [r0,  :128], r1
        vst1.u8         {q11}, [r12, :128], r1
        bgt             16b
        pop             {r4-r5,pc}
        // 32 pixels wide, one row per iteration (r1 is left unchanged).
320:
        vmov.i8         q10, #64
32:
        vld1.u8         {q2,  q3},  [r5,  :128]!        // mask
        vld1.u8         {q8,  q9},  [r2,  :128]!        // tmp
        vld1.u8         {q0,  q1},  [r0,  :128]         // dst
        subs            r4,  r4,  #1
        vsub.i8         q11, q10, q2            // 64 - m, first 16 pixels
        vmull.u8        q15, d16, d4
        vmlal.u8        q15, d0,  d22
        vmull.u8        q14, d17, d5
        vmlal.u8        q14, d1,  d23
        vsub.i8         q11, q10, q3            // 64 - m, last 16 pixels (q11 is reused)
        vrshrn.i16      d24, q15, #6
        vrshrn.i16      d25, q14, #6
        vmull.u8        q15, d18, d6
        vmlal.u8        q15, d2,  d22
        vmull.u8        q14, d19, d7
        vmlal.u8        q14, d3,  d23
        vrshrn.i16      d26, q15, #6
        vrshrn.i16      d27, q14, #6
        vst1.u8         {q12, q13}, [r0,  :128], r1
        bgt             32b
        pop             {r4-r5,pc}
endfunc

// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (clz(w)-24).
function put_neon
        // Unfiltered block copy, dispatched on the block width.
        // Registers as used below: r0 = dst, r1 = dst stride, r2 = src,
        // r3 = src stride, r5 = remaining rows, r8 = jump table index on entry.
        // Every path returns with pop {r4-r11,pc} although nothing is pushed
        // here, so the caller presumably pushed {r4-r11,lr} before branching
        // to this function.
        adr             r9,  L(put_tbl)
        ldr             r8,  [r9, r8, lsl #2]   // entries are offsets relative to L(put_tbl)
        add             r9,  r9,  r8
        bx              r9

        .align 2
L(put_tbl):
        .word 1280f - L(put_tbl) + CONFIG_THUMB
        .word 640f  - L(put_tbl) + CONFIG_THUMB
        .word 32f   - L(put_tbl) + CONFIG_THUMB
        .word 160f  - L(put_tbl) + CONFIG_THUMB
        .word 8f    - L(put_tbl) + CONFIG_THUMB
        .word 4f    - L(put_tbl) + CONFIG_THUMB
        .word 2f    - L(put_tbl) + CONFIG_THUMB

        // 2 bytes per row, two rows per iteration.
2:
        vld1.16         {d0[]},   [r2], r3
        vld1.16         {d1[]},   [r2], r3
        subs            r5,  r5,  #2
        vst1.16         {d0[0]},  [r0, :16], r1
        vst1.16         {d1[0]},  [r0, :16], r1
        bgt             2b
        pop             {r4-r11,pc}
        // 4 bytes per row, two rows per iteration.
4:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        subs            r5,  r5,  #2
        vst1.32         {d0[0]},  [r0, :32], r1
        vst1.32         {d1[0]},  [r0, :32], r1
        bgt             4b
        pop             {r4-r11,pc}
        // 8 bytes per row, two rows per iteration.
8:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        subs            r5,  r5,  #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bgt             8b
        pop             {r4-r11,pc}
        // 16 bytes per row. Even and odd rows get their own pointers
        // (r2/r0 and r9/r8) and both strides are doubled.
160:
        add             r8,  r0,  r1
        lsl             r1,  r1,  #1
        add             r9,  r2,  r3
        lsl             r3,  r3,  #1
16:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [r9], r3
        subs            r5,  r5,  #2
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r8, :128], r1
        bgt             16b
        pop             {r4-r11,pc}
        // 32 bytes per row, one row per iteration.
32:
        vld1.8          {q0,  q1},  [r2], r3
        subs            r5,  r5,  #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bgt             32b
        pop             {r4-r11,pc}
        // 64 bytes per row. The first half is accessed with a 32 byte
        // post-increment, so that amount is taken off both strides up front.
640:
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
64:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        subs            r5,  r5,  #1
        vst1.8          {q2,  q3},  [r0, :128], r1
        bgt             64b
        pop             {r4-r11,pc}
        // 128 bytes per row. Three 32 byte post-increments precede the
        // strided access, hence the 96 taken off both strides.
1280:
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
128:
        vld1.8          {q8,  q9},  [r2]!
        vst1.8          {q8,  q9},  [r0, :128]!
        vld1.8          {q10, q11}, [r2]!
        vst1.8          {q10, q11}, [r0, :128]!
        vld1.8          {q12, q13}, [r2]!
        vst1.8          {q12, q13}, [r0, :128]!
        vld1.8          {q14, q15}, [r2], r3
        subs            r5,  r5,  #1
        vst1.8          {q14, q15}, [r0, :128], r1
        bgt             128b
        pop             {r4-r11,pc}
endfunc

// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r4,
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
// NOTE(review): prep_neon below reads its source through r1 with stride r2
// and writes through r0, whereas put_neon above uses r2/r3 and r0/r1, so
// "put_8tap" in the sentence above is presumably meant to be "prep_8tap" -
// confirm against the callers.
// prep_neon: unfiltered widening copy into a 16-bit intermediate buffer.
// Reads rows of 8-bit pixels from [r1] (advancing by r2 per load), widens
// each pixel to 16 bits shifted left by 4 (vshll.u8 #4), and stores the
// result to [r0]. r4 is the remaining row count and r8 selects the
// width-specific loop through the jump table (index 0 = 128 wide ...
// index 5 = 4 wide; there is no 2-wide entry).
// Every path returns with pop {r4-r11,pc}, so the routine must be entered
// by a branch from code that has already pushed {r4-r11,lr}.
function prep_neon
        // Load the table entry for this width and jump to it.
        adr             r9, L(prep_tbl)
        ldr             r8, [r9, r8, lsl #2]
        add             r9, r9, r8
        bx              r9

        .align 2
        // Entries are offsets relative to the table itself, plus CONFIG_THUMB.
        // NOTE(review): CONFIG_THUMB is defined outside this chunk; presumably
        // it sets the Thumb bit of the bx target on Thumb builds - confirm.
L(prep_tbl):
        .word 1280f - L(prep_tbl) + CONFIG_THUMB
        .word 640f - L(prep_tbl) + CONFIG_THUMB
        .word 320f - L(prep_tbl) + CONFIG_THUMB
        .word 160f - L(prep_tbl) + CONFIG_THUMB
        .word 8f - L(prep_tbl) + CONFIG_THUMB
        .word 4f - L(prep_tbl) + CONFIG_THUMB

        // 4 pixels wide, two rows per iteration. Each 32-bit row is loaded
        // into both halves of a d register, so after widening d1 holds row 0
        // and d2 holds row 1; they are stored back to back.
4:
        vld1.32         {d0[]}, [r1], r2
        vld1.32         {d2[]}, [r1], r2
        subs            r4, r4, #2
        vshll.u8        q0, d0, #4
        vshll.u8        q1, d2, #4
        vst1.16         {d1, d2}, [r0, :64]!
        bgt             4b
        pop             {r4-r11,pc}
        // 8 pixels wide, two rows per iteration, output stored contiguously.
8:
        vld1.8          {d0}, [r1], r2
        vld1.8          {d2}, [r1], r2
        subs            r4, r4, #2
        vshll.u8        q0, d0, #4
        vshll.u8        q1, d2, #4
        vst1.16         {q0, q1}, [r0, :128]!
        bgt             8b
        pop             {r4-r11,pc}
        // 16 pixels wide: r9/r8 point at the second source/output row and the
        // source stride (r2) and output stride (r7) are doubled, so each
        // iteration handles a row pair.
160:
        add             r9, r1, r2
        lsl             r2, r2, #1
        add             r8, r0, r7
        lsl             r7, r7, #1
16:
        vld1.8          {q2}, [r1], r2
        vld1.8          {q3}, [r9], r2
        subs            r4, r4, #2
        vshll.u8        q0, d4, #4
        vshll.u8        q1, d5, #4
        vshll.u8        q2, d6, #4
        vshll.u8        q3, d7, #4
        vst1.16         {q0, q1}, [r0, :128], r7
        vst1.16         {q2, q3}, [r8, :128], r7
        bgt             16b
        pop             {r4-r11,pc}
        // 32 pixels wide, two rows per iteration. r0 and r8 each write one
        // 32-byte half of an output row and both advance by r7 per store.
        // NOTE(review): r8 = r0 + r3 assumes r3 still holds w (32) here, which
        // makes r8 the second half of the 64-byte output row - confirm caller.
320:
        add             r8, r0, r3
32:
        vld1.8          {q0, q1}, [r1], r2
        subs            r4, r4, #2
        vshll.u8        q8, d0, #4
        vshll.u8        q9, d1, #4
        vld1.8          {q2, q3}, [r1], r2
        vshll.u8        q10, d2, #4
        vshll.u8        q11, d3, #4
        vshll.u8        q12, d4, #4
        vst1.16         {q8, q9}, [r0, :128], r7
        vshll.u8        q13, d5, #4
        vst1.16         {q10, q11}, [r8, :128], r7
        vshll.u8        q14, d6, #4
        vst1.16         {q12, q13}, [r0, :128], r7
        vshll.u8        q15, d7, #4
        vst1.16         {q14, q15}, [r8, :128], r7
        bgt             32b
        pop             {r4-r11,pc}
        // 64 pixels wide, one row per iteration. The first 32 source bytes
        // are read with writeback, hence the stride adjustment. Output is
        // written in 32-byte pieces alternating between r0 and r8 = r0 + 32,
        // each advancing by 64 (r6), which covers the row contiguously.
640:
        sub             r2, r2, #32
        add             r8, r0, #32
        mov             r6, #64
64:
        vld1.8          {q0, q1}, [r1]!
        subs            r4, r4, #1
        vshll.u8        q8, d0, #4
        vshll.u8        q9, d1, #4
        vld1.8          {q2, q3}, [r1], r2
        vshll.u8        q10, d2, #4
        vshll.u8        q11, d3, #4
        vshll.u8        q12, d4, #4
        vst1.16         {q8, q9}, [r0, :128], r6
        vshll.u8        q13, d5, #4
        vshll.u8        q14, d6, #4
        vst1.16         {q10, q11}, [r8, :128], r6
        vshll.u8        q15, d7, #4
        vst1.16         {q12, q13}, [r0, :128], r6
        vst1.16         {q14, q15}, [r8, :128], r6
        bgt             64b
        pop             {r4-r11,pc}
        // 128 pixels wide, one row per iteration. Three of the four 32-byte
        // source loads use writeback, hence the stride adjustment by 96.
        // Output uses the same alternating r0/r8 scheme as the 64-wide loop.
1280:
        sub             r2, r2, #96
        add             r8, r0, #32
        mov             r6, #64
128:
        vld1.8          {q0, q1}, [r1]!
        vld1.8          {q2, q3}, [r1]!
        vshll.u8        q10, d0, #4
        vshll.u8        q11, d1, #4
        vshll.u8        q12, d2, #4
        vshll.u8        q13, d3, #4
        vshll.u8        q14, d4, #4
        vshll.u8        q15, d5, #4
        vld1.8          {q8, q9}, [r1]!
        vst1.16         {q10, q11}, [r0, :128], r6
        vst1.16         {q12, q13}, [r8, :128], r6
        vshll.u8        q0, d6, #4
        vshll.u8        q1, d7, #4
        vshll.u8        q2, d16, #4
        vshll.u8        q3, d17, #4
        vshll.u8        q8, d18, #4
        vshll.u8        q9, d19, #4
        vld1.8          {q10, q11}, [r1], r2
        vst1.16         {q14, q15}, [r0, :128], r6
        vst1.16         {q0, q1}, [r8, :128], r6
        vshll.u8        q12, d20, #4
        vshll.u8        q13, d21, #4
        vshll.u8        q14, d22, #4
        vshll.u8        q15, d23, #4
        subs            r4, r4, #1
        vst1.16         {q2, q3}, [r0, :128], r6
        vst1.16         {q8, q9}, [r8, :128], r6
        vst1.16         {q12, q13}, [r0, :128], r6
        vst1.16         {q14, q15}, [r8, :128], r6
        bgt             128b
        pop             {r4-r11,pc}
endfunc


// Loads one element of width wd (in bits) into all lanes of each destination
// register, alternating between the pointers s0 and s1 and post-incrementing
// the pointer used by strd. d0 and d1 are mandatory; d2/d3 are loaded as a
// pair when d2 is given; d4, d5 and d6 are each optional.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        vld1.\wd        {\d0[]}, [\s0], \strd
        vld1.\wd        {\d1[]}, [\s1], \strd
.ifnb \d2
        vld1.\wd        {\d2[]}, [\s0], \strd
        vld1.\wd        {\d3[]}, [\s1], \strd
.endif
.ifnb \d4
        vld1.\wd        {\d4[]}, [\s0], \strd
.endif
.ifnb \d5
        vld1.\wd        {\d5[]}, [\s1], \strd
.endif
.ifnb \d6
        vld1.\wd        {\d6[]}, [\s0], \strd
.endif
.endm
// Same alternating s0/s1 load pattern as load_slice, but each load fills a
// whole register with consecutive bytes.
.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        vld1.8          {\d0}, [\s0], \strd
        vld1.8          {\d1}, [\s1], \strd
.ifnb \d2
        vld1.8          {\d2}, [\s0], \strd
        vld1.8          {\d3}, [\s1], \strd
.endif
.ifnb \d4
        vld1.8          {\d4}, [\s0], \strd
.endif
.ifnb \d5
        vld1.8          {\d5}, [\s1], \strd
.endif
.ifnb \d6
        vld1.8          {\d6}, [\s0], \strd
.endif
.endm
// load_slice with 16-bit elements.
.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// load_slice with 32-bit elements.
.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Replaces each register (except the last one passed) with its own top
// 2 bytes followed by the low 6 bytes of the next register (vext by 6).
// r3/r4 are optional and extend the chain by two more registers.
.macro interleave_1_16 r0, r1, r2, r3, r4
        vext.8          \r0, \r0, \r1, #6
        vext.8          \r1, \r1, \r2, #6
.ifnb \r3
        vext.8          \r2, \r2, \r3, #6
        vext.8          \r3, \r3, \r4, #6
.endif
.endm
// As interleave_1_16, but on 4-byte boundaries: each register becomes its
// own top 4 bytes followed by the low 4 bytes of the next register.
.macro interleave_1_32 r0, r1, r2, r3, r4
        vext.8          \r0, \r0, \r1, #4
        vext.8          \r1, \r1, \r2, #4
.ifnb \r3
        vext.8          \r2, \r2, \r3, #4
        vext.8          \r3, \r3, \r4, #4
.endif
.endm
// Zero-extends up to seven d registers of 8-bit values into q registers of
// 16-bit values. Arguments come in (q, d) pairs; the first two pairs are
// mandatory, pairs 2/3 are handled together, pairs 4, 5 and 6 are optional.
.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
        vmovl.u8        \q0, \d0
        vmovl.u8        \q1, \d1
.ifnb \q2
        vmovl.u8        \q2, \d2
        vmovl.u8        \q3, \d3
.endif
.ifnb \q4
        vmovl.u8        \q4, \d4
.endif
.ifnb \q5
        vmovl.u8        \q5, \d5
.endif
.ifnb \q6
        vmovl.u8        \q6, \d6
.endif
.endm
// 4-tap multiply-accumulate: d = s0*c0 + s1*c1 + s2*c2 + s3*c3, with the
// 16-bit coefficients taken from lanes 0..3 of the register d0.
.macro mul_mla_4 d, s0, s1, s2, s3
        vmul.s16        \d, \s0, d0[0]
        vmla.s16        \d, \s1, d0[1]
        vmla.s16        \d, \s2, d0[2]
        vmla.s16        \d, \s3, d0[3]
.endm
// 8-tap multiply-accumulate of s0..s7 into one destination, with the
// coefficients taken from lanes 0..3 of the registers d0 and d1.
// The macro parameter named d0 is the destination; the bare d0/d1 operands
// are the coefficient registers.
.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        vmul.s16        \d0, \s0, d0[0]
        vmla.s16        \d0, \s1, d0[1]
        vmla.s16        \d0, \s2, d0[2]
        vmla.s16        \d0, \s3, d0[3]
        vmla.s16        \d0, \s4, d1[0]
        vmla.s16        \d0, \s5, d1[1]
        vmla.s16        \d0, \s6, d1[2]
        vmla.s16        \d0, \s7, d1[3]
.endm
// Two 8-tap results: the first destination filters s0..s7, the second
// filters s1..s8 (input window advanced by one source register).
// The first destination is fully computed before the second starts, so it
// may alias s0, which the second result does not read.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        vmul.s16        \d0, \s0, d0[0]
        vmla.s16        \d0, \s1, d0[1]
        vmla.s16        \d0, \s2, d0[2]
        vmla.s16        \d0, \s3, d0[3]
        vmla.s16        \d0, \s4, d1[0]
        vmla.s16        \d0, \s5, d1[1]
        vmla.s16        \d0, \s6, d1[2]
        vmla.s16        \d0, \s7, d1[3]
        vmul.s16        \d1, \s1, d0[0]
        vmla.s16        \d1, \s2, d0[1]
        vmla.s16        \d1, \s3, d0[2]
        vmla.s16        \d1, \s4, d0[3]
        vmla.s16        \d1, \s5, d1[0]
        vmla.s16        \d1, \s6, d1[1]
        vmla.s16        \d1, \s7, d1[2]
        vmla.s16        \d1, \s8, d1[3]
.endm
// Two 8-tap results: the first destination filters s0..s7, the second
// filters s2..s9 (input window advanced by two source registers).
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        vmul.s16        \d0, \s0, d0[0]
        vmla.s16        \d0, \s1, d0[1]
        vmla.s16        \d0, \s2, d0[2]
        vmla.s16        \d0, \s3, d0[3]
        vmla.s16        \d0, \s4, d1[0]
        vmla.s16        \d0, \s5, d1[1]
        vmla.s16        \d0, \s6, d1[2]
        vmla.s16        \d0, \s7, d1[3]
        vmul.s16        \d1, \s2, d0[0]
        vmla.s16        \d1, \s3, d0[1]
        vmla.s16        \d1, \s4, d0[2]
        vmla.s16        \d1, \s5, d0[3]
        vmla.s16        \d1, \s6, d1[0]
        vmla.s16        \d1, \s7, d1[1]
        vmla.s16        \d1, \s8, d1[2]
        vmla.s16        \d1, \s9, d1[3]
.endm
// Saturating rounding shift right by shift, narrowing signed 16-bit lanes to
// unsigned 8-bit, for one, two or four (q source, d destination) pairs.
.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
        vqrshrun.s16    \d0, \q0, #\shift
.ifnb \q1
        vqrshrun.s16    \d1, \q1, #\shift
.endif
.ifnb \q2
        vqrshrun.s16    \d2, \q2, #\shift
        vqrshrun.s16    \d3, \q3, #\shift
.endif
.endm
// In-place rounding shift right by shift on one, two or four registers of
// signed 16-bit lanes.
.macro vrshr_s16 shift, r0, r1, r2, r3
        vrshr.s16       \r0, \r0, #\shift
.ifnb \r1
        vrshr.s16       \r1, \r1, #\shift
.endif
.ifnb \r2
        vrshr.s16       \r2, \r2, #\shift
        vrshr.s16       \r3, \r3, #\shift
.endif
.endm
// Stores 16-bit lanes of reg alternately through the pointers r0 and r8,
// post-incrementing each by strd: lanes 0 and 1 always, lanes 2 and 3 as
// well when lanes is greater than 2.
.macro st_16 strd, reg, lanes
        vst1.16         {\reg[0]}, [r0, :16], \strd
        vst1.16         {\reg[1]}, [r8, :16], \strd
.if \lanes > 2
        vst1.16         {\reg[2]}, [r0, :16], \strd
        vst1.16         {\reg[3]}, [r8, :16], \strd
.endif
.endm
// Stores the two 32-bit lanes of each given d register alternately through
// the pointers r0 and r8, post-incrementing each by strd.
// The parameters named r0/r1 are data registers; the bare r0/r8 inside the
// brackets are the destination pointers.
.macro st_32 strd, r0, r1
        vst1.32         {\r0[0]}, [r0, :32], \strd
        vst1.32         {\r0[1]}, [r8, :32], \strd
.ifnb \r1
        vst1.32         {\r1[0]}, [r0, :32], \strd
        vst1.32         {\r1[1]}, [r8, :32], \strd
.endif
.endm
// Stores two, four or eight whole registers alternately through the
// pointers r0 and r8 with the given alignment hint, post-incrementing each
// pointer by strd.
.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
        vst1.8          {\r0}, [r0, \align], \strd
        vst1.8          {\r1}, [r8, \align], \strd
.ifnb \r2
        vst1.8          {\r2}, [r0, \align], \strd
        vst1.8          {\r3}, [r8, \align], \strd
.endif
.ifnb \r4
        vst1.8          {\r4}, [r0, \align], \strd
        vst1.8          {\r5}, [r8, \align], \strd
        vst1.8          {\r6}, [r0, \align], \strd
        vst1.8          {\r7}, [r8, \align], \strd
.endif
.endm
// Final rounding and store for 4-wide results.
// type == put: narrow to 8 bits with a rounding shift of 6 and store 32-bit
// lanes; otherwise: round-shift the 16-bit values by 2 and store d registers.
.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
.ifc \type, put
        vqrshrun_s16    6, \q0, \d0, \q1, \d2
        st_32           \strd, \d0, \d2
.else
        vrshr_s16       2, \q0, \q1
        st_reg          \strd, :64, \d0, \d1, \d2, \d3
.endif
.endm
// Final rounding and store for 8-wide results.
// type == put: narrow to 8 bits with a rounding shift of 6 and store d
// registers; otherwise: round-shift by 2 and store the q registers.
.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
.ifc \type, put
        vqrshrun_s16    6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
        st_reg          \strd, :64, \d0, \d1, \d2, \d3
.else
        vrshr_s16       2, \q0, \q1, \q2, \q3
        st_reg          \strd, :128,\q0, \q1, \q2, \q3
.endif
.endm
// Final rounding and store for 16-wide results (two rows).
// type == put: q0/q1 are narrowed into the halves d0/d1 and q2/q3 into
// d4/d5, then the registers passed as q0 and q2 are stored; otherwise the
// four q registers are round-shifted by 2 and stored as two 32-byte rows
// through r0 and r8.
.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
.ifc \type, put
        vqrshrun.s16    \d0, \q0, #6
        vqrshrun.s16    \d1, \q1, #6
        vqrshrun.s16    \d4, \q2, #6
        vqrshrun.s16    \d5, \q3, #6
        st_reg          \strd, :128, \q0, \q2
.else
        vrshr_s16       2, \q0, \q1, \q2, \q3
        vst1.16         {\q0, \q1}, [r0, :128], \strd
        vst1.16         {\q2, \q3}, [r8, :128], \strd
.endif
.endm

// Emits one exported entry point named <op>_8tap_<type>_8bpc_neon. It saves
// r4-r11 and lr, places the horizontal and vertical filter-type constants
// in r8 and r9, and branches to the shared <op>_8tap_neon implementation,
// which performs the final return.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
        push            {r4-r11,lr}
        movw            r8, \type_h
        movw            r9, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, r10 mul \my, \my, r10 add \mx, \mx, r8 // mx, 8tap_h, 4tap_h add \my, \my, r9 // my, 8tap_v, 4tap_v .ifc \type, prep lsl \d_strd, \w, #1 .endif clz r8, \w tst \mx, #(0x7f << 14) sub r8, r8, #24 movrel r10, X(mc_subpel_filters), -8 bne L(\type\()_8tap_h) tst \my, #(0x7f << 14) bne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx r9, \mx, #7, #7 and \mx, \mx, #0x7f it gt movgt \mx, r9 tst \my, #(0x7f << 14) add \mx, r10, \mx, lsl #3 bne L(\type\()_8tap_hv) adr r9, L(\type\()_8tap_h_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_h_tbl): .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 2: vld1.8 {d4}, [\src], \s_strd vld1.8 {d6}, [\sr2], \s_strd vmovl.u8 q2, d4 vmovl.u8 q3, d6 vext.8 d5, 
d4, d5, #2 vext.8 d7, d6, d7, #2 subs \h, \h, #2 vtrn.32 d4, d6 vtrn.32 d5, d7 vmul.s16 d2, d4, d0[0] vmla.s16 d2, d5, d0[1] vmla.s16 d2, d6, d0[2] vmla.s16 d2, d7, d0[3] vrshr.s16 d2, d2, #2 vqrshrun.s16 d2, q1, #4 vst1.16 {d2[0]}, [\dst, :16], \d_strd vst1.16 {d2[1]}, [\ds2, :16], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 4: vld1.8 {d16}, [\src], \s_strd vld1.8 {d24}, [\sr2], \s_strd vmovl.u8 q8, d16 vmovl.u8 q12, d24 vext.8 d18, d16, d17, #2 vext.8 d20, d16, d17, #4 vext.8 d22, d16, d17, #6 vext.8 d26, d24, d25, #2 vext.8 d28, d24, d25, #4 vext.8 d30, d24, d25, #6 subs \h, \h, #2 vmul.s16 d4, d16, d0[0] vmla.s16 d4, d18, d0[1] vmla.s16 d4, d20, d0[2] vmla.s16 d4, d22, d0[3] vmul.s16 d5, d24, d0[0] vmla.s16 d5, d26, d0[1] vmla.s16 d5, d28, d0[2] vmla.s16 d5, d30, d0[3] vrshr.s16 q2, q2, #2 .ifc \type, put vqrshrun.s16 d4, q2, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd .endif bgt 4b pop {r4-r11,pc} 80: // 8xN h vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 8: vld1.8 {q8}, [\src], \s_strd vld1.8 {q12}, [\sr2], \s_strd vmovl.u8 q9, d17 vmovl.u8 q8, d16 vmovl.u8 q13, d25 vmovl.u8 q12, d24 vmul.s16 q10, q8, d0[0] vmul.s16 q14, q12, d0[0] .irpc i, 1234567 vext.8 q11, q8, q9, #(2*\i) vext.8 q15, q12, q13, #(2*\i) .if \i < 4 vmla.s16 q10, q11, d0[\i] vmla.s16 q14, q15, d0[\i] .else vmla.s16 q10, q11, d1[\i-4] vmla.s16 q14, q15, d1[\i-4] .endif .endr subs \h, \h, #2 vrshr.s16 q10, q10, #2 vrshr.s16 q14, q14, #2 .ifc \type, put vqrshrun.s16 d20, q10, #4 vqrshrun.s16 d28, q14, #4 vst1.8 {d20}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q10}, [\dst, 
:128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h // This could be done without touching q4-q6, by using only // one temporary for vext in the loop. That's slower on A7 and A53, // (but surprisingly, marginally faster on A8 and A73). vpush {q4-q6} vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 sub \s_strd, \s_strd, \w sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w .endif 161: vld1.8 {d16, d17, d18}, [\src]! vld1.8 {d24, d25, d26}, [\sr2]! mov \mx, \w vmovl.u8 q10, d18 vmovl.u8 q9, d17 vmovl.u8 q8, d16 vmovl.u8 q14, d26 vmovl.u8 q13, d25 vmovl.u8 q12, d24 16: vmul.s16 q1, q8, d0[0] vmul.s16 q2, q9, d0[0] vmul.s16 q3, q12, d0[0] vmul.s16 q4, q13, d0[0] .irpc i, 1234567 vext.8 q5, q8, q9, #(2*\i) vext.8 q6, q9, q10, #(2*\i) vext.8 q11, q12, q13, #(2*\i) vext.8 q15, q13, q14, #(2*\i) .if \i < 4 vmla.s16 q1, q5, d0[\i] vmla.s16 q2, q6, d0[\i] vmla.s16 q3, q11, d0[\i] vmla.s16 q4, q15, d0[\i] .else vmla.s16 q1, q5, d1[\i-4] vmla.s16 q2, q6, d1[\i-4] vmla.s16 q3, q11, d1[\i-4] vmla.s16 q4, q15, d1[\i-4] .endif .endr vrshr.s16 q1, q1, #2 vrshr.s16 q2, q2, #2 vrshr.s16 q3, q3, #2 vrshr.s16 q4, q4, #2 subs \mx, \mx, #16 .ifc \type, put vqrshrun.s16 d2, q1, #4 vqrshrun.s16 d3, q2, #4 vqrshrun.s16 d4, q3, #4 vqrshrun.s16 d5, q4, #4 vst1.8 {q1}, [\dst, :128]! vst1.8 {q2}, [\ds2, :128]! .else vst1.16 {q1, q2}, [\dst, :128]! vst1.16 {q3, q4}, [\ds2, :128]! .endif ble 9f vmov q8, q10 vmov q12, q14 vld1.8 {d18, d19}, [\src]! vld1.8 {d26, d27}, [\sr2]! 
vmovl.u8 q10, d19 vmovl.u8 q9, d18 vmovl.u8 q14, d27 vmovl.u8 q13, d26 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b vpop {q4-q6} pop {r4-r11,pc} L(\type\()_8tap_v): cmp \h, #4 ubfx r9, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r9 add \my, r10, \my, lsl #3 adr r9, L(\type\()_8tap_v_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_v_tbl): .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put bgt 28f cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 // 2x2 v load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_16 d1, d2, d3, d4, d5 bgt 24f vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 mul_mla_4 d6, d16, d18, d20, d22 vqrshrun_s16 6, q3, d6 st_16 \d_strd, d6, 2 pop {r4-r11,pc} 24: // 2x4 v load_16 \sr2, \src, \s_strd, d6, d7 interleave_1_16 d5, d6, d7 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6 vmov d17, d20 vmov d19, d22 vmov d21, d24 vmov d23, d26 mul_mla_4 q3, q8, q9, q10, q11 vqrshrun_s16 6, q3, d6 st_16 \d_strd, d6, 4 pop {r4-r11,pc} 28: // 2x6, 2x8, 2x12, 2x16 v vpush {q4-q7} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14 interleave_1_16 d2, d4, d6, d8, d10 interleave_1_16 d10, d12, d14 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12 vmov d3, d6 vmov d5, d8 vmov d7, d10 vmov d9, 
d12 216: subs \h, \h, #4 load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 interleave_1_16 d14, d16, d18, d20, d22 vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 vmov d11, d14 vmov d13, d16 vmov d15, d18 vmov d17, d20 mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8 vqrshrun_s16 6, q1, d2 st_16 \d_strd, d2, 4 ble 0f cmp \h, #2 vmov q1, q5 vmov q2, q6 vmov q3, q7 vmov q4, q8 vmov q5, q9 vmov q6, q10 vmov d14, d22 beq 26f b 216b 26: load_16 \sr2, \src, \s_strd, d16, d18 interleave_1_16 d14, d16, d18 vmovl_u8 q7, d14, q8, d16 vmov d11, d14 vmov d13, d16 mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16 vqrshrun_s16 6, q1, d2 st_16 \d_strd, d2, 2 0: vpop {q4-q7} pop {r4-r11,pc} .endif 40: bgt 480f // 4x2, 4x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_32 d1, d2, d3, d4, d5 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 mul_mla_4 q3, q8, q9, q10, q11 shift_store_4 \type, \d_strd, q3, d6, d7 ble 0f load_32 \sr2, \src, \s_strd, d6, d7 interleave_1_32 d5, d6, d7 vmovl_u8 q12, d5, q13, d6 mul_mla_4 q3, q10, q11, q12, q13 shift_store_4 \type, \d_strd, q3, d6, d7 0: pop {r4-r11,pc} 480: // 4x6, 4x8, 4x12, 4x16 v vpush {q4} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 interleave_1_32 d2, d4, d6 interleave_1_32 d6, d8, d16, d18, d20 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18 48: subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d22, d24, d26, d28 interleave_1_32 d20, d22, d24, d26, d28 vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26 mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 ble 0f load_32 \sr2, \src, \s_strd, d30, d2 
subs \h, \h, #2 interleave_1_32 d28, d30, d2 vmovl_u8 q14, d28, q15, d30 mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15 shift_store_4 \type, \d_strd, q8, d16, d17 ble 0f load_32 \sr2, \src, \s_strd, d4, d6 subs \h, \h, #2 interleave_1_32 d2, d4, d6 vmovl_u8 q1, d2, q2, d4 mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2 shift_store_4 \type, \d_strd, q9, d18, d19 ble 0f subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 interleave_1_32 d6, d8, d16, d18, d20 vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 bgt 48b 0: vpop {q4} pop {r4-r11,pc} 80: bgt 880f // 8x2, 8x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5 mul_mla_4 q1, q8, q9, q10, q11 mul_mla_4 q2, q9, q10, q11, q12 shift_store_8 \type, \d_strd, q1, d2, q2, d4 ble 0f load_reg \sr2, \src, \s_strd, d6, d7 vmovl_u8 q13, d6, q14, d7 mul_mla_4 q1, q10, q11, q12, q13 mul_mla_4 q2, q11, q12, q13, q14 shift_store_8 \type, \d_strd, q1, d2, q2, d4 0: pop {r4-r11,pc} 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: 1280: vpush {q4} vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20 88: subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d22, d24 vmovl_u8 q11, d22, q12, d24 mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12 shift_store_8 \type, \d_strd, q1, d2, q2, d4 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d26, d28 vmovl_u8 q13, d26, q14, d28 mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 shift_store_8 \type, \d_strd, q3, d6, q4, d8 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d30, d2 vmovl_u8 q15, d30, q1, d2 mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 shift_store_8 \type, \d_strd, q8, d16, q9, d18 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d4, d6 vmovl_u8 q2, d4, q3, d6 mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 shift_store_8 \type, \d_strd, q10, d20, q11, d22 ble 9f subs \h, \h, #4 load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20 mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8 mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10 shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30 bgt 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: vpop {q4} pop {r4-r11,pc} 160: bgt 1680b // 16x2, 16x4 v add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 cmp \h, #2 load_reg \src, \sr2, \s_strd, 
q11, q12, q13, q14, q15 vmovl.u8 q1, d22 vmovl.u8 q2, d24 vmovl.u8 q3, d26 vmovl.u8 q8, d28 vmovl.u8 q9, d30 vmovl.u8 q11, d23 vmovl.u8 q12, d25 vmovl.u8 q13, d27 vmovl.u8 q14, d29 vmovl.u8 q15, d31 mul_mla_4 q1, q1, q2, q3, q8 mul_mla_4 q10, q2, q3, q8, q9 mul_mla_4 q2, q11, q12, q13, q14 mul_mla_4 q11, q12, q13, q14, q15 shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11 ble 0f load_reg \sr2, \src, \s_strd, q10, q11 vmovl.u8 q1, d20 vmovl.u8 q10, d21 vmovl.u8 q12, d22 vmovl.u8 q11, d23 mul_mla_4 q2, q3, q8, q9, q1 mul_mla_4 q3, q13, q14, q15, q10 mul_mla_4 q13, q8, q9, q1, q12 mul_mla_4 q14, q14, q15, q10, q11 shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14 0: pop {r4-r11,pc} L(\type\()_8tap_hv): cmp \h, #4 ubfx r9, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r9 add \my, r10, \my, lsl #3 adr r9, L(\type\()_8tap_hv_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_hv_tbl): .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB 20: .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 280f add \my, \my, #2 vld1.32 {d2[]}, [\my] // 2x2, 2x4 hv sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d26}, [\src], \s_strd vmovl.u8 q13, d26 vext.8 q14, q13, q13, #2 vmul.s16 d26, d26, d0 vmul.s16 d28, d28, d0 vpadd.s16 d26, d26, d28 vpadd.s16 d26, d26, d26 vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vmov d17, d26 vext.8 d16, d16, d26, #4 2: bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] 
vmlal.s16 q2, d26, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 subs \h, \h, #2 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 vmov d17, d26 b 2b 280: // 2x8, 2x16, 2x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d26}, [\src], \s_strd vmovl.u8 q13, d26 vext.8 q14, q13, q13, #2 vmul.s16 d26, d26, d0 vmul.s16 d28, d28, d0 vpadd.s16 d26, d26, d28 vpadd.s16 d26, d26, d26 vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vmov d17, d26 vext.8 d16, d16, d26, #4 bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 vmov d19, d26 bl L(\type\()_8tap_filter_2) vext.8 d20, d19, d26, #4 vmov d21, d26 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d26, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d19, d2[3] vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] vmlal.s16 q2, d26, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 subs \h, \h, #2 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 vmov d17, d19 vmov d18, d20 vmov d19, d21 vmov d20, d22 vmov d21, d26 b 28b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_2): vld1.8 {d28}, [\sr2], \s_strd vld1.8 {d30}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vmovl.u8 q13, d28 vmovl.u8 q14, d29 vmov d27, d28 vmovl.u8 q14, d30 vmovl.u8 q15, d31 vtrn.32 d26, d28 vtrn.32 d27, d30 vmul.s16 d26, d26, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d28, d0[2] vmla.s16 d26, d30, d0[3] vrshr.s16 d26, d26, #2 vext.8 d27, d26, d26, #4 bx lr .endif 40: add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 480f add \my, \my, #2 vld1.32 {d2[]}, [\my] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 
vmovl.s8 q0, d0 vmovl.s8 q1, d2 // 4x2, 4x4 hv vld1.8 {d30}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d31, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d31, d0[3] vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_4) vmov d17, d26 vmov d18, d27 4: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d26, d2[3] vmull.s16 q3, d17, d2[0] vmlal.s16 q3, d18, d2[1] vmlal.s16 q3, d26, d2[2] vmlal.s16 q3, d27, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqrshrn.s32 d6, q3, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d4, q2 vqmovun.s16 d6, q3 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d6[0]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d6}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 vmov d17, d26 vmov d18, d27 b 4b 480: // 4x8, 4x16, 4x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d30}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d31, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d31, d0[3] vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_4) vmov d17, d26 vmov d18, d27 bl L(\type\()_8tap_filter_4) vmov d19, d26 vmov d20, d27 bl L(\type\()_8tap_filter_4) vmov d21, d26 vmov d22, d27 48: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d19, d2[3] vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] vmlal.s16 q2, d26, d3[3] vmull.s16 q3, d17, d2[0] vmlal.s16 q3, d18, d2[1] vmlal.s16 q3, d19, d2[2] vmlal.s16 q3, d20, d2[3] vmlal.s16 q3, d21, d3[0] vmlal.s16 q3, d22, d3[1] vmlal.s16 q3, d26, d3[2] vmlal.s16 q3, 
d27, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqrshrn.s32 d6, q3, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d4, q2 vqmovun.s16 d6, q3 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d6[0]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d6}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 vmov d17, d19 vmov d18, d20 vmov d19, d21 vmov d20, d22 vmov d21, d26 vmov d22, d27 b 48b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_4): vld1.8 {d30}, [\sr2], \s_strd vld1.8 {d31}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d1, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d1, d0[3] vmovl.u8 q14, d31 vext.8 d30, d28, d29, #2 vext.8 d31, d28, d29, #4 vext.8 d1, d28, d29, #6 vmul.s16 d27, d28, d0[0] vmla.s16 d27, d30, d0[1] vmla.s16 d27, d31, d0[2] vmla.s16 d27, d1, d0[3] vrshr.s16 d26, d26, #2 vrshr.s16 d27, d27, #2 bx lr 80: 160: 320: bgt 880f vpush {q4-q7} add \my, \my, #2 vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #3 sub \src, \src, \s_strd vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.8 {q14}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vrshr.s16 q3, q10, #2 bl L(\type\()_8tap_filter_8) vmov q4, q10 vmov q5, q11 8: bl L(\type\()_8tap_filter_8) vmull.s16 q12, d6, d2[0] vmull.s16 q13, d7, d2[0] vmull.s16 q14, d8, d2[0] vmull.s16 q15, d9, d2[0] vmlal.s16 q12, d8, d2[1] vmlal.s16 q13, d9, d2[1] vmlal.s16 q14, d10, d2[1] vmlal.s16 q15, d11, d2[1] vmlal.s16 q12, d10, d2[2] vmlal.s16 q13, d11, d2[2] vmlal.s16 q14, d20, d2[2] vmlal.s16 q15, d21, d2[2] vmlal.s16 q12, d20, d2[3] vmlal.s16 q13, d21, d2[3] 
vmlal.s16 q14, d22, d2[3] vmlal.s16 q15, d23, d2[3] vqrshrn.s32 d24, q12, #\shift_hv vqrshrn.s32 d25, q13, #\shift_hv vqrshrn.s32 d28, q14, #\shift_hv vqrshrn.s32 d29, q15, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d24, q12 vqmovun.s16 d28, q14 vst1.8 {d24}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q12}, [\dst, :128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif ble 9f vmov q3, q5 vmov q4, q10 vmov q5, q11 b 8b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\mx, :64] vld1.8 {d2}, [\my, :64] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.8 {q14}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vrshr.s16 q3, q10, #2 bl L(\type\()_8tap_filter_8) vmov q4, q10 vmov q5, q11 bl L(\type\()_8tap_filter_8) vmov q6, q10 vmov q7, q11 bl L(\type\()_8tap_filter_8) vmov q8, q10 vmov q9, q11 88: bl L(\type\()_8tap_filter_8) vmull.s16 q12, d6, d2[0] vmull.s16 q13, d7, d2[0] vmull.s16 q14, d8, d2[0] vmull.s16 q15, d9, d2[0] vmlal.s16 q12, d8, d2[1] vmlal.s16 q13, d9, d2[1] vmlal.s16 q14, d10, d2[1] vmlal.s16 q15, d11, d2[1] vmlal.s16 q12, d10, d2[2] vmlal.s16 q13, d11, d2[2] vmlal.s16 q14, d12, d2[2] vmlal.s16 q15, d13, d2[2] vmlal.s16 q12, d12, d2[3] vmlal.s16 q13, d13, d2[3] vmlal.s16 q14, d14, d2[3] vmlal.s16 q15, d15, d2[3] vmlal.s16 q12, d14, d3[0] vmlal.s16 q13, d15, 
d3[0] vmlal.s16 q14, d16, d3[0] vmlal.s16 q15, d17, d3[0] vmlal.s16 q12, d16, d3[1] vmlal.s16 q13, d17, d3[1] vmlal.s16 q14, d18, d3[1] vmlal.s16 q15, d19, d3[1] vmlal.s16 q12, d18, d3[2] vmlal.s16 q13, d19, d3[2] vmlal.s16 q14, d20, d3[2] vmlal.s16 q15, d21, d3[2] vmlal.s16 q12, d20, d3[3] vmlal.s16 q13, d21, d3[3] vmlal.s16 q14, d22, d3[3] vmlal.s16 q15, d23, d3[3] vqrshrn.s32 d24, q12, #\shift_hv vqrshrn.s32 d25, q13, #\shift_hv vqrshrn.s32 d28, q14, #\shift_hv vqrshrn.s32 d29, q15, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d24, q12 vqmovun.s16 d28, q14 vst1.8 {d24}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q12}, [\dst, :128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif ble 9f vmov q3, q5 vmov q4, q6 vmov q5, q7 vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: vpop {q4-q7} pop {r4-r11,pc} L(\type\()_8tap_filter_8): vld1.8 {q14}, [\sr2], \s_strd vld1.8 {q15}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vmovl.u8 q12, d30 vmovl.u8 q13, d31 vmul.s16 q11, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q11, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q11, q14, d1[\i-4] .endr vrshr.s16 q10, q10, #2 vrshr.s16 q11, q11, #2 bx lr endfunc function \type\()_bilin_8bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] vdup.8 d1, \mx vdup.8 d3, \my rsb r8, \mx, #16 rsb r9, \my, #16 vdup.8 d0, r8 vdup.8 d2, r9 .ifc \type, prep lsl \d_strd, \w, #1 .endif clz r8, \w cmp \mx, #0 sub r8, r8, #24 bne 
L(\type\()_bilin_h) cmp \my, #0 bne L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cmp \my, #0 bne L(\type\()_bilin_hv) adr r9, L(\type\()_bilin_h_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_bilin_h_tbl): .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: vld1.32 {d4[]}, [\src], \s_strd vld1.32 {d6[]}, [\sr2], \s_strd vext.8 d5, d4, d4, #1 vext.8 d7, d6, d6, #1 vtrn.16 q2, q3 subs \h, \h, #2 vmull.u8 q3, d4, d0 vmlal.u8 q3, d5, d1 vqrshrn.u16 d4, q3, #4 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: vld1.8 {d4}, [\src], \s_strd vld1.8 {d6}, [\sr2], \s_strd vext.8 d5, d4, d4, #1 vext.8 d7, d6, d6, #1 vtrn.32 q2, q3 subs \h, \h, #2 vmull.u8 q3, d4, d0 vmlal.u8 q3, d5, d1 .ifc \type, put vqrshrn.u16 d4, q3, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d6}, [\dst, :64], \d_strd vst1.16 {d7}, [\ds2, :64], \d_strd .endif bgt 4b pop {r4-r11,pc} 80: // 8xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: vld1.8 {q8}, [\src], \s_strd vld1.8 {q10}, [\sr2], \s_strd vext.8 q9, q8, q8, #1 vext.8 q11, q10, q10, #1 subs \h, \h, #2 vmull.u8 q8, d16, d0 vmull.u8 q10, d20, d0 vmlal.u8 q8, d18, d1 vmlal.u8 q10, d22, d1 .ifc \type, put vqrshrn.u16 d16, q8, #4 vqrshrn.u16 d18, q10, #4 vst1.8 {d16}, [\dst, :64], \d_strd vst1.8 {d18}, [\ds2, :64], 
\d_strd .else vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q10}, [\ds2, :128], \d_strd .endif bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w .endif 161: vld1.8 {d16}, [\src]! vld1.8 {d22}, [\sr2]! mov \mx, \w 16: vld1.8 {d17,d18}, [\src]! vld1.8 {d23,d24}, [\sr2]! vext.8 q10, q8, q9, #1 vext.8 q13, q11, q12, #1 vmull.u8 q2, d16, d0 vmull.u8 q3, d17, d0 vmull.u8 q14, d22, d0 vmull.u8 q15, d23, d0 vmlal.u8 q2, d20, d1 vmlal.u8 q3, d21, d1 vmlal.u8 q14, d26, d1 vmlal.u8 q15, d27, d1 subs \mx, \mx, #16 .ifc \type, put vqrshrn.u16 d4, q2, #4 vqrshrn.u16 d5, q3, #4 vqrshrn.u16 d28, q14, #4 vqrshrn.u16 d29, q15, #4 vst1.8 {q2}, [\dst, :128]! vst1.8 {q14}, [\ds2, :128]! .else vst1.16 {q2, q3}, [\dst, :128]! vst1.16 {q14, q15}, [\ds2, :128]! .endif ble 9f vmov d16, d18 vmov d22, d24 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b pop {r4-r11,pc} L(\type\()_bilin_v): cmp \h, #4 adr r9, L(\type\()_bilin_v_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_bilin_v_tbl): .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v vld1.16 {d16[]}, [\src], \s_strd bgt 24f 22: vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #6 vext.8 d17, d17, d18, #6 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 
vqrshrn.u16 d4, q2, #4 vst1.16 {d4[0]}, [\dst, :16] vst1.16 {d4[1]}, [\ds2, :16] pop {r4-r11,pc} 24: // 2x4, 2x6, 2x8, ... v vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vld1.16 {d19[]}, [\sr2], \s_strd vld1.16 {d20[]}, [\src], \s_strd sub \h, \h, #4 vext.8 d16, d16, d17, #6 vext.8 d17, d17, d18, #6 vext.8 d18, d18, d19, #6 vext.8 d19, d19, d20, #6 vtrn.32 d16, d18 vtrn.32 d17, d19 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 cmp \h, #2 vqrshrn.u16 d4, q2, #4 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd vst1.16 {d4[2]}, [\dst, :16], \d_strd vst1.16 {d4[3]}, [\ds2, :16], \d_strd blt 0f vmov d16, d20 beq 22b b 24b 0: pop {r4-r11,pc} .endif 40: // 4xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.32 {d16[]}, [\src], \s_strd 4: vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d4, q2, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 b 4b 0: pop {r4-r11,pc} 80: // 8xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {d16}, [\src], \s_strd 8: vld1.8 {d17}, [\sr2], \s_strd vld1.8 {d18}, [\src], \s_strd vmull.u8 q2, d16, d2 vmull.u8 q3, d17, d2 vmlal.u8 q2, d17, d3 vmlal.u8 q3, d18, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d4, q2, #4 vqrshrn.u16 d6, q3, #4 vst1.8 {d4}, [\dst, :64], \d_strd vst1.8 {d6}, [\ds2, :64], \d_strd .else vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd .endif ble 0f vmov d16, d18 b 8b 0: pop {r4-r11,pc} 160: // 16xN, 32xN, ... 
320: 640: 1280: mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {q8}, [\src], \s_strd 2: vld1.8 {q9}, [\sr2], \s_strd vld1.8 {q10}, [\src], \s_strd vmull.u8 q12, d16, d2 vmull.u8 q13, d17, d2 vmull.u8 q14, d18, d2 vmull.u8 q15, d19, d2 vmlal.u8 q12, d18, d3 vmlal.u8 q13, d19, d3 vmlal.u8 q14, d20, d3 vmlal.u8 q15, d21, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d24, q12, #4 vqrshrn.u16 d25, q13, #4 vqrshrn.u16 d28, q14, #4 vqrshrn.u16 d29, q15, #4 vst1.8 {q12}, [\dst, :128], \d_strd vst1.8 {q14}, [\ds2, :128], \d_strd .else vst1.16 {q12, q13}, [\dst, :128], \d_strd vst1.16 {q14, q15}, [\ds2, :128], \d_strd .endif ble 9f vmov q8, q10 b 2b 9: subs \w, \w, #16 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 .ifc \type, put add \dst, \dst, #16 .else add \dst, \dst, #32 .endif b 1b 0: pop {r4-r11,pc} L(\type\()_bilin_hv): vmovl.u8 q2, d2 vmovl.u8 q3, d3 adr r9, L(\type\()_bilin_hv_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_bilin_hv_tbl): .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB 20: // 2xN hv .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.32 {d28[]}, [\src], \s_strd vext.8 d29, d28, d28, #1 vmull.u8 q8, d28, d0 vmlal.u8 q8, d29, d1 2: vld1.32 {d28[]}, [\sr2], \s_strd vld1.32 {d30[]}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vtrn.16 d28, d30 vtrn.16 d29, d31 vmull.u8 q9, d28, d0 vmlal.u8 q9, d29, d1 vtrn.32 d16, d18 vmul.u16 
d20, d16, d4 vmla.u16 d20, d19, d6 vqrshrn.u16 d20, q10, #8 subs \h, \h, #2 vst1.16 {d20[0]}, [\dst, :16], \d_strd vst1.16 {d20[1]}, [\ds2, :16], \d_strd ble 0f vtrn.32 d19, d16 b 2b 0: pop {r4-r11,pc} .endif 40: // 4xN hv add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {d28}, [\src], \s_strd vext.8 d29, d28, d28, #1 vmull.u8 q8, d28, d0 vmlal.u8 q8, d29, d1 4: vld1.8 {d28}, [\sr2], \s_strd vld1.8 {d30}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vtrn.32 d28, d30 vtrn.32 d29, d31 vmull.u8 q9, d28, d0 vmlal.u8 q9, d29, d1 vmov d17, d18 vmul.u16 q10, q8, q2 vmla.u16 q10, q9, q3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d20, q10, #8 vst1.32 {d20[0]}, [\dst, :32], \d_strd vst1.32 {d20[1]}, [\ds2, :32], \d_strd .else vrshr.u16 q10, q10, #4 vst1.16 {d20}, [\dst, :64], \d_strd vst1.16 {d21}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {q12}, [\src], \s_strd vext.8 q13, q12, q12, #1 vmull.u8 q8, d24, d0 vmlal.u8 q8, d26, d1 2: vld1.8 {q12}, [\sr2], \s_strd vld1.8 {q14}, [\src], \s_strd vext.8 q13, q12, q12, #1 vext.8 q15, q14, q14, #1 vmull.u8 q9, d24, d0 vmlal.u8 q9, d26, d1 vmull.u8 q10, d28, d0 vmlal.u8 q10, d30, d1 vmul.u16 q8, q8, q2 vmla.u16 q8, q9, q3 vmul.u16 q9, q9, q2 vmla.u16 q9, q10, q3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d16, q8, #8 vqrshrn.u16 d18, q9, #8 vst1.8 {d16}, [\dst, :64], \d_strd vst1.8 {d18}, [\ds2, :64], \d_strd .else vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q9}, [\ds2, :128], \d_strd .endif ble 9f vmov q8, q10 b 2b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #8 .ifc \type, 
put
        add             \dst, \dst, #8
.else
        add             \dst, \dst, #16
.endif
        b               1b
0:
        pop             {r4-r11,pc}
endfunc
.endm

filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6

// Point r12 at one 8-byte row of the table based at r11:
//   r12 = r11 + (\src >> 10) * 8
// \src itself is not modified.
.macro load_filter_ptr src
        asr             r12, \src, #10
        add             r12, r11, r12, lsl #3
.endm

// Advance \src by \inc, then load the 8 bytes at [r12] (64-bit aligned)
// into \dst. r12 must already have been set up by load_filter_ptr; the
// add does not affect the address used by the load.
.macro load_filter_coef dst, src, inc
        add             \src, \src, \inc
        vld1.8          {\dst}, [r12, :64]
.endm

// load_filter_ptr followed by load_filter_coef: fetch the row selected by
// the current value of \src into \dst and step \src by \inc.
.macro load_filter_row dst, src, inc
        load_filter_ptr \src
        load_filter_coef \dst, \src, \inc
.endm

// Horizontal pass for one row of the 8x8 warp: produces eight 16-bit sums,
// each the dot product of 8 consecutive source bytes with its own 8-tap
// filter row.
//
// In:   r2  = source pointer; 16 bytes are read, then r2 += r3
//       r5  = filter position for output pixel 0
//       r7  = position step between neighbouring output pixels
//       r8  = position step added to r5 before returning
//       r11 = base of the filter table (rows of 8 signed bytes)
// Out:  q2 (d4 = pixels 0-3, d5 = pixels 4-7), unrounded
//       r5 = r5 + r8 (the eight r7 steps are undone first)
// Clobbers: d0-d3, q3-q7, r12.
//
// The filter loads are interleaved with the multiplies and pairwise adds;
// the statement order is part of the hand scheduling.
function warp_filter_horz_neon
        load_filter_ptr r5 // filter 0
        vld1.16         {q7}, [r2], r3
        vmov.i8         q6, #128
        load_filter_coef d0, r5, r7 // filter 0
        load_filter_row d1, r5, r7 // filter 1
        load_filter_row d2, r5, r7 // filter 2
        load_filter_ptr r5 // filter 3
        veor            q7, q7, q6 // subtract by 128 to allow using vmull
        load_filter_coef d3, r5, r7 // filter 3
        vext.8          d12, d14, d15, #1 // filter 1 pixels
        vext.8          d13, d14, d15, #2 // filter 2 pixels
        load_filter_ptr r5 // filter 4
        vmull.s8        q2, d14, d0 // filter 0 output
        vmull.s8        q3, d12, d1 // filter 1 output
        load_filter_coef d0, r5, r7 // filter 4
        load_filter_ptr r5 // filter 5
        vext.8          d12, d14, d15, #3 // filter 3 pixels
        vmull.s8        q4, d13, d2 // filter 2 output
        vext.8          d13, d14, d15, #4 // filter 4 pixels
        vpadd.i16       d4, d4, d5 // pixel 0 (4x16)
        vpadd.i16       d5, d6, d7 // pixel 1 (4x16)
        load_filter_coef d1, r5, r7 // filter 5
        load_filter_ptr r5 // filter 6
        vmull.s8        q5, d12, d3 // filter 3 output
        vext.8          d12, d14, d15, #5 // filter 5 pixels
        vmull.s8        q3, d13, d0 // filter 4 output
        load_filter_coef d0, r5, r7 // filter 6
        vext.8          d13, d14, d15, #6 // filter 6 pixels
        load_filter_ptr r5 // filter 7
        vpadd.i16       d8, d8, d9 // pixel 2 (4x16)
        vpadd.i16       d9, d10, d11 // pixel 3 (4x16)
        vmull.s8        q5, d12, d1 // filter 5 output
        load_filter_coef d1, r5, r7 // filter 7
        vext.8          d14, d14, d15, #7 // filter 7 pixels
        vpadd.i16       d6, d6, d7 // pixel 4 (4x16)
        vpadd.i16       d10, d10, d11 // pixel 5 (4x16)
        vmull.s8        q6, d13, d0 // filter 6 output
        vmull.s8        q7, d14, d1 // filter 7 output

        // Rewind r5 by the eight per-pixel steps taken above.
        sub             r5, r5, r7, lsl #3
        vpadd.i16       d4, d4, d5 // pixel 0,1 (2x16)
        vpadd.i16       d5, d8, d9 // pixel 2,3 (2x16)
        vpadd.i16       d12, d12, d13 // pixel 6 (4x16)
        vpadd.i16       d14, d14, d15 // pixel 7 (4x16)
        vpadd.i16       d6, d6, d10 // pixel 4,5 (2x16)
        vpadd.i16       d10, d12, d14 // pixel 6,7 (2x16)
        vpadd.i16       d4, d4, d5 // pixel 0-3
        vpadd.i16       d5, d6, d10 // pixel 4-7

        // Step the filter position to the next row.
        add             r5, r5, r8

        bx              lr
endfunc

// void dav2d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
//
// Macro parameters:
//   \t      blank for the variant storing 8-bit pixels; non-blank for the
//           variant storing 16-bit values (dst stride is doubled below)
//   \shift  rounding right shift applied after the vertical pass
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        // 36 bytes of core registers + 64 bytes of q4-q7 were pushed, so the
        // stack arguments start at sp + 100: r4 = abcd, r5 = mx, r6 = my.
        ldrd            r4, r5, [sp, #100]
        ldr             r6, [sp, #108]
        // Load the four int16 values of abcd as two words and split them
        // (assuming little-endian): r7 = abcd[0], r8 = abcd[1],
        // r9 = abcd[2], r4 = abcd[3].
        ldrd            r8, r9, [r4]
        sxth            r7, r8
        asr             r8, r8, #16
        asr             r4, r9, #16
        sxth            r9, r9
        mov             r10, #8 // output row counter
        // src -= 3 * src_stride + 3
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        sub             r2, r2, #3
        movrel          r11, X(mc_warp_filter), 64*8
.ifnb \t
        lsl             r1, r1, #1
.endif
        add             r5, r5, #512
        add             r6, r6, #512

        // Prime q8-q14 with the first seven horizontally filtered rows.
        bl              warp_filter_horz_neon
        vrshr.s16       q8, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q9, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q10, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q11, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q12, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q13, q2, #3
        bl              warp_filter_horz_neon
        vrshr.s16       q14, q2, #3

1:
        // Per output row: filter one more input row into q15.
        bl              warp_filter_horz_neon
        vrshr.s16       q15, q2, #3

        // Fetch eight vertical filter rows (position r6, step r9).
        load_filter_row d8, r6, r9
        load_filter_row d9, r6, r9
        load_filter_row d10, r6, r9
        load_filter_row d11, r6, r9
        load_filter_row d12, r6, r9
        load_filter_row d13, r6, r9
        load_filter_row d14, r6, r9
        load_filter_row d15, r6, r9
        // transpose_8x8b is defined outside this chunk; presumably it turns
        // the per-column filters into per-tap vectors - confirm in util.S.
        transpose_8x8b  q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
        // Widen the first six transposed rows to 16 bit (d14/d15 are widened
        // later, once q2/q3 are free again).
        vmovl.s8        q1, d8
        vmovl.s8        q2, d9
        vmovl.s8        q3, d10
        vmovl.s8        q4, d11
        vmovl.s8        q5, d12
        vmovl.s8        q6, d13

        // Rewind r6 by the eight r9 steps taken by load_filter_row.
        sub             r6, r6, r9, lsl #3

        // This ordering of vmull/vmlal is highly beneficial for
        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
// Vertical pass of the warp macro (continued from the preceding lines):
// q8-q15 hold eight horizontally filtered rows, q1-q6 the first six widened
// coefficient vectors. q0 accumulates the low four columns (even d
// registers), q1 the high four columns (odd d registers).
        vmull.s16       q0, d16, d2
        vmlal.s16       q0, d18, d4
        vmlal.s16       q0, d20, d6
        vmlal.s16       q0, d22, d8
        vmlal.s16       q0, d24, d10
        vmlal.s16       q0, d26, d12
        // d2 has been consumed above, so q1 (d2/d3) can now become the
        // second accumulator; d3 is read by this same instruction.
        vmull.s16       q1, d17, d3
        vmlal.s16       q1, d19, d5
        vmlal.s16       q1, d21, d7
        vmlal.s16       q1, d23, d9
        vmlal.s16       q1, d25, d11
        vmlal.s16       q1, d27, d13
        // q2/q3 are free now: reuse them for the last two coefficient rows.
        vmovl.s8        q2, d14
        vmovl.s8        q3, d15
        vmlal.s16       q0, d28, d4
        vmlal.s16       q0, d30, d6
        vmlal.s16       q1, d29, d5
        vmlal.s16       q1, d31, d7

        // Bias added after narrowing; presumably this undoes the -128 pixel
        // offset applied in warp_filter_horz_neon - confirm against the
        // filter table (requires the taps to sum to 128).
.ifb \t
        vmov.i16        q7, #128
.else
        vmov.i16        q7, #0x800
.endif

        // Slide the row window down by one (q8 <- q9 ... q14 <- q15),
        // interleaved with narrowing, biasing and storing the result.
        vmov            q8, q9
        vmov            q9, q10
        vqrshrn.s32     d0, q0, #\shift
        vmov            q10, q11
        vqrshrn.s32     d1, q1, #\shift
        vmov            q11, q12
        vadd.i16        q0, q0, q7
        vmov            q12, q13
.ifb \t
        vqmovun.s16     d0, q0
.endif
        vmov            q13, q14
        vmov            q14, q15
        subs            r10, r10, #1
        // The stores and the add below do not alter the flags set by subs.
.ifnb \t
        vst1.16         {q0}, [r0, :128], r1
.else
        vst1.8          {d0}, [r0, :64], r1
.endif
        add             r6, r6, r4
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

// Blank \t with shift 11: 8-bit pixel output. \t = t with shift 7: 16-bit output.
warp , 11
warp t, 7

// void dav2d_emu_edge_8bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
//
// Builds a bw x bh block in dst from ref, replicating edge pixels for the
// parts of the block that fall outside the iw x ih image.
//
// Register use after the prologue:
//   r0 = bw (later reused as a backup of dst), r1 = bh, r2 = iw, r3 = ih,
//   r4 = x, r5 = y, r6 = dst, r7 = dst_stride, r8 = ref, r9 = ref_stride
//
// NOTE(review): the fill/copy loops store whole 16- or 32-byte vectors, so
// they can write past the exact extents; presumably dst is padded and
// suitably aligned by the caller - confirm.
function emu_edge_8bpc_neon, export=1
        push            {r4-r11,lr}
        // Nine registers (36 bytes) were pushed; stack arguments follow.
        ldrd            r4, r5, [sp, #36]
        ldrd            r6, r7, [sp, #44]
        ldrd            r8, r9, [sp, #52]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             r12, r3, #1 // ih - 1
        cmp             r5, r3
        sub             lr, r2, #1 // iw - 1
        it              lt
        movlt           r12, r5 // min(y, ih - 1)
        cmp             r4, r2
        bic             r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
        it              lt
        movlt           lr, r4 // min(x, iw - 1)
        bic             lr, lr, lr, asr #31 // max(min(x, iw - 1), 0)
        mla             r8, r12, r9, r8 // ref += iclip() * stride
        add             r8, r8, lr // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             r10, r5, r1 // y + bh
        neg             r5, r5 // -y
        sub             r10, r10, r3 // y + bh - ih
        sub             r12, r1, #1 // bh - 1
        cmp             r10, r1
        bic             r5, r5, r5, asr #31 // max(-y, 0)
        it              ge
        movge           r10, r12 // min(y + bh - ih, bh-1)
        cmp             r5, r1
        bic             r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
        it              ge
        movge           r5, r12 // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             r11, r4, r0 // x + bw
        neg             r4, r4 // -x
        sub             r11, r11, r2 // x + bw - iw
        sub             lr, r0, #1 // bw - 1
        cmp             r11, r0
        bic             r4, r4, r4, asr #31 // max(-x, 0)
        it              ge
        movge           r11, lr // min(x + bw - iw, bw-1)
        cmp             r4, r0
        bic             r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
        it              ge
        movge           r4, lr // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             r1, r1, r5 // bh - top_ext
        mla             r6, r5, r7, r6
        sub             r2, r0, r4 // bw - left_ext
        sub             r1, r1, r10 // center_h = bh - top_ext - bottom_ext
        sub             r2, r2, r11 // center_w = bw - left_ext - right_ext

        mov             r0, r6 // backup of dst

// Emits the loop over the center_h rows (counter r1). Per row:
//   - if \need_left: splat the first ref byte of the row over left_ext (r4)
//     bytes at dst, 16 bytes per store
//   - copy center_w (r2) bytes from ref to dst + left_ext, 32 bytes per
//     iteration
//   - if \need_right: splat the last center byte over right_ext (r11) bytes
//     after the center, 16 bytes per store
// then dst (r6) and ref (r8) advance by their strides.
.macro v_loop need_left, need_right
0:
.if \need_left
        vld1.8          {d0[], d1[]}, [r8]
        mov             r12, r6 // out = dst
        mov             r3, r4
1:
        subs            r3, r3, #16
        vst1.8          {q0}, [r12, :128]!
        bgt             1b
.endif
        mov             lr, r8
        add             r12, r6, r4 // out = dst + left_ext
        mov             r3, r2
1:
        vld1.8          {q0, q1}, [lr]!
        subs            r3, r3, #32
        // Only the variant without a left extension stores with an
        // alignment hint; dst + left_ext is stored unaligned otherwise.
.if \need_left
        vst1.8          {q0, q1}, [r12]!
.else
        vst1.8          {q0, q1}, [r12, :128]!
.endif
        bgt             1b
.if \need_right
        add             r3, r8, r2 // in + center_w
        sub             r3, r3, #1 // in + center_w - 1
        add             r12, r6, r4 // dst + left_ext
        vld1.8          {d0[], d1[]}, [r3]
        add             r12, r12, r2 // out = dst + left_ext + center_w
        mov             r3, r11
1:
        subs            r3, r3, #16
        vst1.8          {q0}, [r12]!
        bgt             1b
.endif

        subs            r1, r1, #1 // center_h--
        add             r6, r6, r7
        add             r8, r8, r9
        bgt             0b
.endm

        // Select one of four specialised copies of the row loop, depending
        // on whether left_ext (r4) and right_ext (r11) are non-zero.
        cmp             r4, #0
        beq             2f
        // need_left
        cmp             r11, #0
        beq             3f
        // need_left + need_right
        v_loop          1, 1
        b               5f

2:
        // !need_left
        cmp             r11, #0
        beq             4f
        // !need_left + need_right
        v_loop          0, 1
        b               5f

3:
        // need_left + !need_right
        v_loop          1, 0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0, 0

5:
        cmp             r10, #0
        // Storing the original dst in r0 overwrote bw, recalculate it here
        add             r2, r2, r4 // center_w + left_ext
        add             r2, r2, r11 // bw = center_w + left_ext + right_ext

        // The two adds above do not set flags, so this still tests
        // bottom_ext (r10) == 0.
        beq             3f
        // need_bottom
        // r6 points just below the last center row; replicate that row
        // bottom_ext times, working in 32-byte wide column strips.
        sub             r8, r6, r7 // ref = dst - stride
        mov             r4, r2
1:
        vld1.8          {q0, q1}, [r8, :128]!
        mov             r3, r10
2:
        subs            r3, r3, #1
        vst1.8          {q0, q1}, [r6, :128], r7
        bgt             2b
        mls             r6, r7, r10, r6 // dst -= bottom_ext * stride
        subs            r4, r4, #32 // bw -= 32
        add             r6, r6, #32 // dst += 32
        bgt             1b

3:
        cmp             r5, #0
        beq             3f
        // need_top
        // r0 is the first center row saved earlier; replicate it upwards
        // top_ext (r5) times, again in 32-byte wide column strips.
        mls             r6, r7, r5, r0 // dst = stored_dst - top_ext * stride
1:
        vld1.8          {q0, q1}, [r0, :128]!
        mov             r3, r5
2:
        subs            r3, r3, #1
        vst1.8          {q0, q1}, [r6, :128], r7
        bgt             2b
        mls             r6, r7, r5, r6 // dst -= top_ext * stride
        subs            r2, r2, #32 // bw -= 32
        add             r6, r6, #32 // dst += 32
        bgt             1b

3:
        pop             {r4-r11,pc}
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/mc16.S000066400000000000000000003530121517466257200225340ustar00rootroot00000000000000/*
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

// The three macros below each consume 16 int16 values from the buffer at r2
// and 16 from the buffer at r3 (both post-incremented by 32 bytes) and
// leave 16 results in \d0 and \d1. They share one parameter list:
//   \d0, \d1    output q registers
//   \d00, \d01  the two d halves of \d0
//   \d10, \d11  the two d halves of \d1
// q0-q3 are clobbered. The constants in q12-q15 (and q4 for w_avg) are set
// up by bidir_fn before the first expansion.

// avg: saturating sum of the two inputs, clamped from below at q12, with
// q12 then subtracted and the result shifted by the (negative) count in
// q13. Only \d0 and \d1 are used; the d-half parameters are ignored.
.macro avg d0, d00, d01, d1, d10, d11
        vld1.16         {q0, q1}, [r2, :128]!
        vld1.16         {q2, q3}, [r3, :128]!
        vqadd.s16       q0, q0, q2
        vqadd.s16       q1, q1, q3
        vmax.s16        q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
        vmax.s16        q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
        vqsub.s16       q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits
        vqsub.s16       q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits
        vshl.s16        \d0, q0, q13 // -(intermediate_bits+1)
        vshl.s16        \d1, q1, q13 // -(intermediate_bits+1)
.endm

// w_avg: with a = input from r2, b = input from r3 and q4 holding the
// negated weight in every 32-bit lane, computes
//   b + (((b - a) * q4) >> 4)
// in 32 bit, narrows to 16 bit, applies the rounding shift in q13, adds the
// bias in q12 and clamps to [q14, q15].
.macro w_avg d0, d00, d01, d1, d10, d11
        vld1.16         {q0, q1}, [r2, :128]!
        vld1.16         {q2, q3}, [r3, :128]!
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
        // (d0/d1 and d2/d3 are read before q0 and q1 are overwritten by the
        // second and fourth subtraction.)
        vsubl.s16       \d0, d4, d0
        vsubl.s16       q0, d5, d1
        vsubl.s16       \d1, d6, d2
        vsubl.s16       q1, d7, d3
        vmul.s32        \d0, \d0, q4
        vmul.s32        q0, q0, q4
        vmul.s32        \d1, \d1, q4
        vmul.s32        q1, q1, q4
        vshr.s32        \d0, \d0, #4
        vshr.s32        q0, q0, #4
        vshr.s32        \d1, \d1, #4
        vshr.s32        q1, q1, #4
        vaddw.s16       \d0, \d0, d4
        vaddw.s16       q0, q0, d5
        vaddw.s16       \d1, \d1, d6
        vaddw.s16       q1, q1, d7
        // Narrow the four 32-bit vectors back into the two output registers.
        vmovn.i32       \d00, \d0
        vmovn.i32       \d01, q0
        vmovn.i32       \d10, \d1
        vmovn.i32       \d11, q1
        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
        vmin.s16        \d0, \d0, q15 // bitdepth_max
        vmin.s16        \d1, \d1, q15 // bitdepth_max
        vmax.s16        \d0, \d0, q14 // 0
        vmax.s16        \d1, \d1, q14 // 0
.endm

// mask: like w_avg, but with a per-pixel weight. 16 mask bytes are loaded
// from r6 (post-incremented), negated and sign-extended to 32 bit in q4-q7,
// and the product is shifted right by 6 instead of 4:
//   b + (((b - a) * -m) >> 6)
// Clobbers q4-q7 in addition to q0-q3.
.macro mask d0, d00, d01, d1, d10, d11
        vld1.8          {q7}, [r6, :128]!
        vld1.16         {q0, q1}, [r2, :128]!
        vneg.s8         q7, q7
        vld1.16         {q2, q3}, [r3, :128]!
        // Widen -m: 8 -> 16 bit into q6/q7, then 16 -> 32 bit into q4-q7
        // (q4/q5 from q6 first, so q6/q7 can be rewritten last).
        vmovl.s8        q6, d14
        vmovl.s8        q7, d15
        vmovl.s16       q4, d12
        vmovl.s16       q5, d13
        vmovl.s16       q6, d14
        vmovl.s16       q7, d15
        vsubl.s16       \d0, d4, d0
        vsubl.s16       q0, d5, d1
        vsubl.s16       \d1, d6, d2
        vsubl.s16       q1, d7, d3
        vmul.s32        \d0, \d0, q4
        vmul.s32        q0, q0, q5
        vmul.s32        \d1, \d1, q6
        vmul.s32        q1, q1, q7
        vshr.s32        \d0, \d0, #6
        vshr.s32        q0, q0, #6
        vshr.s32        \d1, \d1, #6
        vshr.s32        q1, q1, #6
        vaddw.s16       \d0, \d0, d4
        vaddw.s16       q0, q0, d5
        vaddw.s16       \d1, \d1, d6
        vaddw.s16       q1, q1, d7
        vmovn.i32       \d00, \d0
        vmovn.i32       \d01, q0
        vmovn.i32       \d10, \d1
        vmovn.i32       \d11, q1
        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
        vmin.s16        \d0, \d0, q15 // bitdepth_max
        vmin.s16        \d1, \d1, q15 // bitdepth_max
        vmax.s16        \d0, \d0, q14 // 0
        vmax.s16        \d1, \d1, q14 // 0
.endm

// bidir_fn: emits \type\()_16bpc_neon around one of the macros above.
//   \type   avg, w_avg or mask (also the name of the macro expanded)
//   \bdmax  register holding bitdepth_max on entry to the setup code
// (the macro body continues on the following lines)
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        push            {r4-r7,lr}
        // Five registers (20 bytes) were pushed; stack arguments follow.
        ldrd            r4, r5, [sp, #20]
        ldr             r6, [sp, #28]
        clz             r4, r4
.ifnc \type, avg
        ldr             r7, [sp, #32]
        vmov.i16        q14, #0
        vdup.16         q15, r7 // bitdepth_max
.endif .ifc \type, w_avg vpush {q4} .endif .ifc \type, mask vpush {q4-q7} .endif clz r7, \bdmax sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18 .ifc \type, avg mov lr, #1 movw r12, #2*PREP_BIAS lsl lr, lr, r7 // 1 << intermediate_bits neg r12, r12 // -2*PREP_BIAS add r7, r7, #1 sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits neg r7, r7 // -(intermediate_bits+1) vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits vdup.16 q13, r7 // -(intermediate_bits+1) .else mov r12, #PREP_BIAS lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits neg r7, r7 // -intermediate_bits vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits vdup.16 q13, r7 // -intermediate_bits .endif .ifc \type, w_avg vdup.32 q4, r6 vneg.s32 q4, q4 .endif adr r7, L(\type\()_tbl) sub r4, r4, #24 \type q8, d16, d17, q9, d18, d19 ldr r4, [r7, r4, lsl #2] add r7, r7, r4 bx r7 .align 2 L(\type\()_tbl): .word 1280f - L(\type\()_tbl) + CONFIG_THUMB .word 640f - L(\type\()_tbl) + CONFIG_THUMB .word 320f - L(\type\()_tbl) + CONFIG_THUMB .word 160f - L(\type\()_tbl) + CONFIG_THUMB .word 80f - L(\type\()_tbl) + CONFIG_THUMB .word 40f - L(\type\()_tbl) + CONFIG_THUMB 40: add r7, r0, r1 lsl r1, r1, #1 4: subs r5, r5, #4 vst1.16 {d16}, [r0, :64], r1 vst1.16 {d17}, [r7, :64], r1 vst1.16 {d18}, [r0, :64], r1 vst1.16 {d19}, [r7, :64], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 4b 80: add r7, r0, r1 lsl r1, r1, #1 8: vst1.16 {q8}, [r0, :128], r1 subs r5, r5, #2 vst1.16 {q9}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 8b 160: 16: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #2 vst1.16 {q10, q11}, [r0, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 16b 320: add r7, r0, #32 32: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 32b 640: add r7, r0, #32 mov r12, #64 sub r1, r1, #64 64: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, 
[r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 64b 1280: add r7, r0, #32 mov r12, #64 sub r1, r1, #192 128: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 128b 0: .ifc \type, mask vpop {q4-q7} .endif .ifc \type, w_avg vpop {q4} .endif pop {r4-r7,pc} endfunc .endm bidir_fn avg, r6 bidir_fn w_avg, r7 bidir_fn mask, r7 .macro w_mask_fn type function w_mask_\type\()_16bpc_neon, export=1 push {r4-r10,lr} vpush {q4-q7} ldrd r4, r5, [sp, #96] ldrd r6, r7, [sp, #104] ldr r8, [sp, #112] clz r9, r4 adr lr, L(w_mask_\type\()_tbl) vdup.16 q15, r8 // bitdepth_max sub r9, r9, #24 clz r8, r8 // clz(bitdepth_max) ldr r9, [lr, r9, lsl #2] add r9, lr, r9 sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov r10, #PREP_BIAS*64 neg r8, r8 // -sh movw r12, #27615 // (64 + 1 - 38)<> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 
vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {q6}, [r6, :128]! .elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.8 {d12}, [r6, :64]! .elseif \type == 420 vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition) vadd.i16 d13, d14, d15 vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d12[0]}, [r6, :32]! .endif vst1.16 {d4}, [r0, :64], r1 vst1.16 {d5}, [r12, :64], r1 vst1.16 {d6}, [r0, :64], r1 vst1.16 {d7}, [r12, :64], r1 bgt 4b vpop {q4-q7} pop {r4-r10,pc} 8: vld1.16 {q2, q3}, [r2, :128]! // tmp1 vld1.16 {q4, q5}, [r3, :128]! 
// tmp2 subs r5, r5, #2 vdup.32 q13, r10 // PREP_BIAS*64 vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) vabd.s16 q7, q3, q5 vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) vsubl.s16 q9, d9, d5 vsubl.s16 q10, d10, d6 vsubl.s16 q11, d11, d7 vqsub.u16 q6, q0, q6 // 27615 - abs() vqsub.u16 q7, q0, q7 vshll.s16 q5, d7, #6 // tmp1 << 6 vshll.s16 q4, d6, #6 vshll.s16 q3, d5, #6 vshll.s16 q2, d4, #6 vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {q6}, [r6, :128]! .elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.8 {d12}, [r6, :64]! .elseif \type == 420 vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d12[0]}, [r6, :32]! 
.endif vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r12, :128], r1 bgt 8b vpop {q4-q7} pop {r4-r10,pc} 1280: 640: 320: 160: sub r1, r1, r4, lsl #1 .if \type == 444 add lr, r6, r4 .elseif \type == 422 add lr, r6, r4, lsr #1 .endif add r7, r2, r4, lsl #1 add r9, r3, r4, lsl #1 161: mov r8, r4 16: vld1.16 {q2}, [r2, :128]! // tmp1 vld1.16 {q4}, [r3, :128]! // tmp2 vld1.16 {q3}, [r7, :128]! vld1.16 {q5}, [r9, :128]! subs r8, r8, #8 vdup.32 q13, r10 // PREP_BIAS*64 vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) vabd.s16 q7, q3, q5 vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) vsubl.s16 q9, d9, d5 vsubl.s16 q10, d10, d6 vsubl.s16 q11, d11, d7 vqsub.u16 q6, q0, q6 // 27615 - abs() vqsub.u16 q7, q0, q7 vshll.s16 q5, d7, #6 // tmp1 << 6 vshll.s16 q4, d6, #6 vshll.s16 q3, d5, #6 vshll.s16 q2, d4, #6 vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {d12}, [r6, :64]! vst1.8 {d13}, [lr, :64]! .elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.32 {d12[0]}, [r6, :32]! vst1.32 {d12[1]}, [lr, :32]! 
.elseif \type == 420
        vadd.i16        q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition)
        vpadd.i16       d12, d12, d13 // (128 - m) + (128 - n) (column wise addition)
        vsub.i16        d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n))
        vrshrn.i16      d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        vst1.32         {d12[0]}, [r6, :32]!
.endif
        vst1.16         {q2}, [r0, :128]!
        vst1.16         {q3}, [r12, :128]!
        bgt             16b
        subs            r5, r5, #2
        add             r2, r2, r4, lsl #1
        add             r3, r3, r4, lsl #1
        add             r7, r7, r4, lsl #1
        add             r9, r9, r4, lsl #1
.if \type == 444
        add             r6, r6, r4
        add             lr, lr, r4
.elseif \type == 422
        add             r6, r6, r4, lsr #1
        add             lr, lr, r4, lsr #1
.endif
        add             r0, r0, r1
        add             r12, r12, r1
        bgt             161b
        vpop            {q4-q7}
        pop             {r4-r10,pc}
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420

// blend_16bpc_neon: blends a second source into dst in place, using an
// 8-bit per-pixel mask, on 16-bit pixels.
//
// Register usage, as read by the code below:
//   r0      dst pointer (loaded from and stored to)
//   r1      dst stride in bytes
//   r2      second source ("b" in the comments), read contiguously
//   r3      only consumed through clz(r3)-26 as the jump table index, so
//           presumably the block width (32, 16, 8 or 4); reused afterwards
//           as the jump target
//   r4, r5  loaded from the two words just above the three pushed
//           registers (the first two stack arguments under the AAPCS):
//           r4 is the row counter, r5 the mask pointer (one byte per
//           pixel, read contiguously)
//
// Per pixel the result is a + (((a - b) * -m + 32) >> 6). The multiply,
// rounding and shift are done by a single vqrdmulh against (-m << 9).
function blend_16bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4, r5, [sp, #12]
        clz             lr, r3
        adr             r3, L(blend_tbl)
        sub             lr, lr, #26 // table index: 0 for 32, ... 3 for 4
        ldr             lr, [r3, lr, lsl #2]
        add             r3, r3, lr
        bx              r3

        .align 2
L(blend_tbl):
        .word 320f - L(blend_tbl) + CONFIG_THUMB
        .word 160f - L(blend_tbl) + CONFIG_THUMB
        .word 80f - L(blend_tbl) + CONFIG_THUMB
        .word 40f - L(blend_tbl) + CONFIG_THUMB

40:
        // Two rows per iteration: r0 walks one row, r12 the row below it,
        // both advancing by the doubled stride.
        add             r12, r0, r1
        lsl             r1, r1, #1
4:
        vld1.8          {d4}, [r5, :64]!
        vld1.16         {q1}, [r2, :128]!
        vld1.16         {d0}, [r0, :64]
        vneg.s8         d4, d4 // -m
        subs            r4, r4, #2
        vld1.16         {d1}, [r12, :64]
        vmovl.s8        q2, d4
        vshl.i16        q2, q2, #9 // -m << 9
        vsub.i16        q1, q0, q1 // a - b
        vqrdmulh.s16    q1, q1, q2 // ((a-b)*-m + 32) >> 6
        vadd.i16        q0, q0, q1
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d1}, [r12, :64], r1
        bgt             4b
        pop             {r4-r5,pc}
80:
        // Two rows per iteration, same pointer setup as the 4-wide case.
        add             r12, r0, r1
        lsl             r1, r1, #1
8:
        vld1.8          {q8}, [r5, :128]!
        vld1.16         {q2, q3}, [r2, :128]!
        vneg.s8         q9, q8 // -m
        vld1.16         {q0}, [r0, :128]
        vld1.16         {q1}, [r12, :128]
        vmovl.s8        q8, d18
        vmovl.s8        q9, d19
        vshl.i16        q8, q8, #9 // -m << 9
        vshl.i16        q9, q9, #9
        vsub.i16        q2, q0, q2 // a - b
        vsub.i16        q3, q1, q3
        subs            r4, r4, #2
        vqrdmulh.s16    q2, q2, q8 // ((a-b)*-m + 32) >> 6
        vqrdmulh.s16    q3, q3, q9
        vadd.i16        q0, q0, q2
        vadd.i16        q1, q1, q3
        vst1.16         {q0}, [r0, :128], r1
        vst1.16         {q1}, [r12, :128], r1
        bgt             8b
        pop             {r4-r5,pc}
160:
        // Two rows per iteration, same pointer setup as the 4-wide case.
        add             r12, r0, r1
        lsl             r1, r1, #1
16:
        vld1.8          {q12, q13}, [r5, :128]!
        vld1.16         {q8, q9}, [r2, :128]!
        subs            r4, r4, #2
        vneg.s8         q14, q12 // -m
        vld1.16         {q0, q1}, [r0, :128]
        vneg.s8         q15, q13
        vld1.16         {q10, q11}, [r2, :128]!
        vmovl.s8        q12, d28
        vmovl.s8        q13, d29
        vmovl.s8        q14, d30
        vmovl.s8        q15, d31
        vld1.16         {q2, q3}, [r12, :128]
        vshl.i16        q12, q12, #9 // -m << 9
        vshl.i16        q13, q13, #9
        vshl.i16        q14, q14, #9
        vshl.i16        q15, q15, #9
        vsub.i16        q8, q0, q8 // a - b
        vsub.i16        q9, q1, q9
        vsub.i16        q10, q2, q10
        vsub.i16        q11, q3, q11
        vqrdmulh.s16    q8, q8, q12 // ((a-b)*-m + 32) >> 6
        vqrdmulh.s16    q9, q9, q13
        vqrdmulh.s16    q10, q10, q14
        vqrdmulh.s16    q11, q11, q15
        vadd.i16        q0, q0, q8
        vadd.i16        q1, q1, q9
        vadd.i16        q2, q2, q10
        vst1.16         {q0, q1}, [r0, :128], r1
        vadd.i16        q3, q3, q11
        vst1.16         {q2, q3}, [r12, :128], r1
        bgt             16b
        pop             {r4-r5,pc}
320:
        // One row per iteration: r0 covers the first 32 bytes of the row,
        // r12 the following 32 bytes; the stride is left unchanged.
        add             r12, r0, #32
32:
        vld1.8          {q12, q13}, [r5, :128]!
        vld1.16         {q8, q9}, [r2, :128]!
        subs            r4, r4, #1
        vneg.s8         q14, q12 // -m
        vld1.16         {q0, q1}, [r0, :128]
        vneg.s8         q15, q13
        vld1.16         {q10, q11}, [r2, :128]!
        vmovl.s8        q12, d28
        vmovl.s8        q13, d29
        vmovl.s8        q14, d30
        vmovl.s8        q15, d31
        vld1.16         {q2, q3}, [r12, :128]
        vshl.i16        q12, q12, #9 // -m << 9
        vshl.i16        q13, q13, #9
        vshl.i16        q14, q14, #9
        vshl.i16        q15, q15, #9
        vsub.i16        q8, q0, q8 // a - b
        vsub.i16        q9, q1, q9
        vsub.i16        q10, q2, q10
        vsub.i16        q11, q3, q11
        vqrdmulh.s16    q8, q8, q12 // ((a-b)*-m + 32) >> 6
        vqrdmulh.s16    q9, q9, q13
        vqrdmulh.s16    q10, q10, q14
        vqrdmulh.s16    q11, q11, q15
        vadd.i16        q0, q0, q8
        vadd.i16        q1, q1, q9
        vadd.i16        q2, q2, q10
        vst1.16         {q0, q1}, [r0, :128], r1
        vadd.i16        q3, q3, q11
        vst1.16         {q2, q3}, [r12, :128], r1
        bgt             32b
        pop             {r4-r5,pc}
endfunc

// This has got the same signature as the put_8tap functions,
// and assumes that r9 is set to (clz(w)-24).
//
// Straight copy of 16-bit pixels with no filtering. As used below:
//   r0, r1  destination pointer and stride in bytes
//   r2, r3  source pointer and stride in bytes
//   r5      row counter
//   r9      index into L(put_tbl); r8-r10 are used as scratch
// Every path ends with "pop {r4-r11,pc}", i.e. it unwinds a frame in which
// r4-r11 and lr were pushed before control was transferred here.
function put_neon
        adr             r10, L(put_tbl)
        ldr             r9, [r10, r9, lsl #2]
        add             r10, r10, r9
        bx              r10

        .align 2
L(put_tbl):
        .word 1280f - L(put_tbl) + CONFIG_THUMB
        .word 640f - L(put_tbl) + CONFIG_THUMB
        .word 320f - L(put_tbl) + CONFIG_THUMB
        .word 16f - L(put_tbl) + CONFIG_THUMB
        .word 80f - L(put_tbl) + CONFIG_THUMB
        .word 4f - L(put_tbl) + CONFIG_THUMB
        .word 2f - L(put_tbl) + CONFIG_THUMB

2:
        // 4 bytes per row. The loads replicate the word into every lane,
        // so storing lane 1 of d1 writes the same value as lane 0 would.
        vld1.32         {d0[]}, [r2], r3
        vld1.32         {d1[]}, [r2], r3
        subs            r5, r5, #2
        vst1.32         {d0[0]}, [r0, :32], r1
        vst1.32         {d1[1]}, [r0, :32], r1
        bgt             2b
        pop             {r4-r11,pc}
4:
        // 8 bytes per row, two rows per iteration.
        vld1.16         {d0}, [r2], r3
        vld1.16         {d1}, [r2], r3
        subs            r5, r5, #2
        vst1.16         {d0}, [r0, :64], r1
        vst1.16         {d1}, [r0, :64], r1
        bgt             4b
        pop             {r4-r11,pc}
80:
        // 16 bytes per row, two rows per iteration through separate
        // pointers (r8/r9 one row below r0/r2), with both strides doubled.
        add             r8, r0, r1
        lsl             r1, r1, #1
        add             r9, r2, r3
        lsl             r3, r3, #1
8:
        vld1.16         {q0}, [r2], r3
        vld1.16         {q1}, [r9], r3
        subs            r5, r5, #2
        vst1.16         {q0}, [r0, :128], r1
        vst1.16         {q1}, [r8, :128], r1
        bgt             8b
        pop             {r4-r11,pc}
16:
        // 32 bytes per row, one row per iteration.
        vld1.16         {q0, q1}, [r2], r3
        subs            r5, r5, #1
        vst1.16         {q0, q1}, [r0, :128], r1
        bgt             16b
        pop             {r4-r11,pc}
320:
        // 64 bytes per row; the strides are reduced by the 32 bytes that
        // the first, post-incrementing load/store already advanced.
        sub             r1, r1, #32
        sub             r3, r3, #32
32:
        vld1.16         {q0, q1}, [r2]!
        vst1.16         {q0, q1}, [r0, :128]!
        vld1.16         {q2, q3}, [r2], r3
        subs            r5, r5, #1
        vst1.16         {q2, q3}, [r0, :128], r1
        bgt             32b
        pop             {r4-r11,pc}
640:
        // 128 bytes per row; 96 of them are covered by post-increments.
        sub             r1, r1, #96
        sub             r3, r3, #96
64:
        vld1.16         {q8, q9}, [r2]!
        vst1.16         {q8, q9}, [r0, :128]!
// Remainder of the copy loop that branches back to 64b: r2 is the source
// and r0 the destination; the last load/store of each row steps by the
// adjusted strides r3/r1.
        vld1.16         {q10, q11}, [r2]!
        vst1.16         {q10, q11}, [r0, :128]!
        vld1.16         {q12, q13}, [r2]!
        vst1.16         {q12, q13}, [r0, :128]!
        vld1.16         {q14, q15}, [r2], r3
        subs            r5, r5, #1
        vst1.16         {q14, q15}, [r0, :128], r1
        bgt             64b
        pop             {r4-r11,pc}
1280:
        // 256 bytes per row; 224 of them are covered by post-increments.
        sub             r1, r1, #224
        sub             r3, r3, #224
128:
        vld1.16         {q8, q9}, [r2]!
        vst1.16         {q8, q9}, [r0, :128]!
        vld1.16         {q10, q11}, [r2]!
        vst1.16         {q10, q11}, [r0, :128]!
        vld1.16         {q12, q13}, [r2]!
        vst1.16         {q12, q13}, [r0, :128]!
        vld1.16         {q14, q15}, [r2]!
        vst1.16         {q14, q15}, [r0, :128]!
        vld1.16         {q8, q9}, [r2]!
        vst1.16         {q8, q9}, [r0, :128]!
        vld1.16         {q10, q11}, [r2]!
        vst1.16         {q10, q11}, [r0, :128]!
        vld1.16         {q12, q13}, [r2]!
        vst1.16         {q12, q13}, [r0, :128]!
        vld1.16         {q14, q15}, [r2], r3
        subs            r5, r5, #1
        vst1.16         {q14, q15}, [r0, :128], r1
        bgt             128b
        pop             {r4-r11,pc}
endfunc

// This has got the same signature as the prep_8tap functions,
// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
// r8 to w*2.
//
// Converts 16-bit source pixels to the intermediate format without
// filtering: out = (pixel << intermediate_bits) - PREP_BIAS.
// As used below:
//   r0      output pointer; only ever post-incremented, so the output is
//           written contiguously
//   r1, r2  source pointer and stride in bytes
//   r4      row counter
//   r7      broadcast into q15 and used as the per-lane shift amount
//   r9      index into L(prep_tbl); r9/r10 are used as scratch
// r8 is not referenced anywhere in this function body. The table has no
// entry after the 40f one, unlike L(put_tbl) which ends with a 2f entry.
// Every path ends with "pop {r4-r11,pc}", unwinding a frame pushed before
// control was transferred here.
function prep_neon
        adr             r10, L(prep_tbl)
        ldr             r9, [r10, r9, lsl #2]
        vdup.16         q15, r7 // intermediate_bits
        vmov.i16        q14, #PREP_BIAS
        add             r10, r10, r9
        bx              r10

        .align 2
L(prep_tbl):
        .word 1280f - L(prep_tbl) + CONFIG_THUMB
        .word 640f - L(prep_tbl) + CONFIG_THUMB
        .word 320f - L(prep_tbl) + CONFIG_THUMB
        .word 16f - L(prep_tbl) + CONFIG_THUMB
        .word 80f - L(prep_tbl) + CONFIG_THUMB
        .word 40f - L(prep_tbl) + CONFIG_THUMB

40:
        // Two source rows per iteration (r1 and r9, doubled stride),
        // packed into one 16-byte store.
        add             r9, r1, r2
        lsl             r2, r2, #1
4:
        vld1.16         {d0}, [r1], r2
        vld1.16         {d1}, [r9], r2
        subs            r4, r4, #2
        vshl.s16        q0, q0, q15
        vsub.i16        q0, q0, q14
        vst1.16         {q0}, [r0, :128]!
        bgt             4b
        pop             {r4-r11,pc}
80:
        // Two source rows per iteration, same pointer setup as above.
        add             r9, r1, r2
        lsl             r2, r2, #1
8:
        vld1.16         {q0}, [r1], r2
        vld1.16         {q1}, [r9], r2
        subs            r4, r4, #2
        vshl.s16        q0, q0, q15
        vshl.s16        q1, q1, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vst1.16         {q0, q1}, [r0, :128]!
        bgt             8b
        pop             {r4-r11,pc}
16:
        // 32 bytes per row, two rows per iteration from a single pointer.
        vld1.16         {q0, q1}, [r1], r2
        vshl.s16        q0, q0, q15
        vld1.16         {q2, q3}, [r1], r2
        subs            r4, r4, #2
        vshl.s16        q1, q1, q15
        vshl.s16        q2, q2, q15
        vshl.s16        q3, q3, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vsub.i16        q2, q2, q14
        vst1.16         {q0, q1}, [r0, :128]!
        vsub.i16        q3, q3, q14
        vst1.16         {q2, q3}, [r0, :128]!
        bgt             16b
        pop             {r4-r11,pc}
320:
        // 64 bytes per row; the stride is reduced by the 32 bytes the
        // first, post-incrementing load already advanced.
        sub             r2, r2, #32
32:
        vld1.16         {q0, q1}, [r1]!
        subs            r4, r4, #1
        vshl.s16        q0, q0, q15
        vld1.16         {q2, q3}, [r1], r2
        vshl.s16        q1, q1, q15
        vshl.s16        q2, q2, q15
        vshl.s16        q3, q3, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vsub.i16        q2, q2, q14
        vst1.16         {q0, q1}, [r0, :128]!
        vsub.i16        q3, q3, q14
        vst1.16         {q2, q3}, [r0, :128]!
        bgt             32b
        pop             {r4-r11,pc}
640:
        // 128 bytes per row; 96 of them are covered by post-increments.
        sub             r2, r2, #96
64:
        vld1.16         {q0, q1}, [r1]!
        subs            r4, r4, #1
        vshl.s16        q0, q0, q15
        vld1.16         {q2, q3}, [r1]!
        vshl.s16        q1, q1, q15
        vld1.16         {q8, q9}, [r1]!
        vshl.s16        q2, q2, q15
        vld1.16         {q10, q11}, [r1], r2
        vshl.s16        q3, q3, q15
        vshl.s16        q8, q8, q15
        vshl.s16        q9, q9, q15
        vshl.s16        q10, q10, q15
        vshl.s16        q11, q11, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vsub.i16        q2, q2, q14
        vsub.i16        q3, q3, q14
        vsub.i16        q8, q8, q14
        vst1.16         {q0, q1}, [r0, :128]!
        vsub.i16        q9, q9, q14
        vst1.16         {q2, q3}, [r0, :128]!
        vsub.i16        q10, q10, q14
        vst1.16         {q8, q9}, [r0, :128]!
        vsub.i16        q11, q11, q14
        vst1.16         {q10, q11}, [r0, :128]!
        bgt             64b
        pop             {r4-r11,pc}
1280:
        // 256 bytes per row; 224 of them are covered by post-increments.
        // The row is handled as two 128-byte halves, with the loads of
        // the second half interleaved into the stores of the first.
        sub             r2, r2, #224
128:
        vld1.16         {q0, q1}, [r1]!
        subs            r4, r4, #1
        vshl.s16        q0, q0, q15
        vld1.16         {q2, q3}, [r1]!
        vshl.s16        q1, q1, q15
        vld1.16         {q8, q9}, [r1]!
        vshl.s16        q2, q2, q15
        vld1.16         {q10, q11}, [r1]!
        vshl.s16        q3, q3, q15
        vshl.s16        q8, q8, q15
        vshl.s16        q9, q9, q15
        vshl.s16        q10, q10, q15
        vshl.s16        q11, q11, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vsub.i16        q2, q2, q14
        vsub.i16        q3, q3, q14
        vsub.i16        q8, q8, q14
        vst1.16         {q0, q1}, [r0, :128]!
        vld1.16         {q0, q1}, [r1]!
        vsub.i16        q9, q9, q14
        vsub.i16        q10, q10, q14
        vst1.16         {q2, q3}, [r0, :128]!
        vld1.16         {q2, q3}, [r1]!
        vsub.i16        q11, q11, q14
        vshl.s16        q0, q0, q15
        vst1.16         {q8, q9}, [r0, :128]!
        vld1.16         {q8, q9}, [r1]!
        vshl.s16        q1, q1, q15
        vshl.s16        q2, q2, q15
        vst1.16         {q10, q11}, [r0, :128]!
        vld1.16         {q10, q11}, [r1], r2
        vshl.s16        q3, q3, q15
        vshl.s16        q8, q8, q15
        vshl.s16        q9, q9, q15
        vshl.s16        q10, q10, q15
        vshl.s16        q11, q11, q15
        vsub.i16        q0, q0, q14
        vsub.i16        q1, q1, q14
        vsub.i16        q2, q2, q14
        vsub.i16        q3, q3, q14
        vsub.i16        q8, q8, q14
        vst1.16         {q0, q1}, [r0, :128]!
        vsub.i16        q9, q9, q14
        vst1.16         {q2, q3}, [r0, :128]!
        vsub.i16        q10, q10, q14
        vst1.16         {q8, q9}, [r0, :128]!
        vsub.i16        q11, q11, q14
        vst1.16         {q10, q11}, [r0, :128]!
        bgt             128b
        pop             {r4-r11,pc}
endfunc

// Loads one \wd-bit element per destination register, replicated into
// every lane, alternating between the two pointers \s0 and \s1 and
// post-incrementing each by \strd. \d0/\d1 are always loaded; \d2 and \d3
// are loaded together when \d2 is given; \d4, \d5 and \d6 are each
// optional on their own.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        vld1.\wd        {\d0[]}, [\s0], \strd
        vld1.\wd        {\d1[]}, [\s1], \strd
.ifnb \d2
        vld1.\wd        {\d2[]}, [\s0], \strd
        vld1.\wd        {\d3[]}, [\s1], \strd
.endif
.ifnb \d4
        vld1.\wd        {\d4[]}, [\s0], \strd
.endif
.ifnb \d5
        vld1.\wd        {\d5[]}, [\s1], \strd
.endif
.ifnb \d6
        vld1.\wd        {\d6[]}, [\s0], \strd
.endif
.endm
// Same alternating \s0/\s1 pattern and optional arguments as load_slice,
// but each load fills a whole register.
.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        vld1.16         {\d0}, [\s0], \strd
        vld1.16         {\d1}, [\s1], \strd
.ifnb \d2
        vld1.16         {\d2}, [\s0], \strd
        vld1.16         {\d3}, [\s1], \strd
.endif
.ifnb \d4
        vld1.16         {\d4}, [\s0], \strd
.endif
.ifnb \d5
        vld1.16         {\d5}, [\s1], \strd
.endif
.ifnb \d6
        vld1.16         {\d6}, [\s0], \strd
.endif
.endm
// Loads register pairs: {\d0,\d1} from \s0, then optionally {\d2,\d3}
// from \s1 and {\d4,\d5} from \s0, post-incrementing by \strd.
.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
        vld1.16         {\d0, \d1}, [\s0], \strd
.ifnb \d2
        vld1.16         {\d2, \d3}, [\s1], \strd
.endif
.ifnb \d4
        vld1.16         {\d4, \d5}, [\s0], \strd
.endif
.endm
// load_slice with a fixed 32-bit element size.
.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Plain alias for load_regpair.
.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair    \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
.endm
// For each register in the chain, drops its lowest 4 bytes and appends the
// lowest 4 bytes of the next register (vext by 4 bytes). \r0 and \r1 are
// always updated; \r2 and \r3 only when \r3 is given, in which case \r4
// is read as the final "next" register.
.macro interleave_1_32 r0, r1, r2, r3, r4
        vext.8          \r0, \r0, \r1, #4
        vext.8          \r1, \r1, \r2, #4
.ifnb \r3
        vext.8          \r2, \r2, \r3, #4
        vext.8          \r3, \r3, \r4, #4
.endif
.endm
// Clamps up to four registers to the unsigned 16-bit maximum held in \c.
// \r1 is optional; \r2 and \r3 are processed together when \r2 is given.
.macro vmin_u16 c, r0, r1, r2, r3
        vmin.u16        \r0, \r0, \c
.ifnb \r1
        vmin.u16        \r1, \r1, \c
.endif
.ifnb \r2
        vmin.u16        \r2, \r2, \c
        vmin.u16        \r3, \r3, \c
.endif
.endm
// Subtracts \c from up to four registers, with the same optional-argument
// rules as vmin_u16.
.macro vsub_i16 c, r0, r1, r2, r3
        vsub.i16        \r0, \r0, \c
.ifnb \r1
        vsub.i16        \r1, \r1, \c
.endif
.ifnb \r2
        vsub.i16        \r2, \r2, \c
        vsub.i16        \r3, \r3, \c
.endif
.endm
// \d = \s0*d0[0] + \s1*d0[1] + \s2*d0[2] + \s3*d0[3], widening 16-bit
// inputs to 32-bit sums. The coefficients are always taken from d0.
.macro vmull_vmlal_4 d, s0, s1, s2, s3
        vmull.s16       \d, \s0, d0[0]
        vmlal.s16       \d, \s1, d0[1]
        vmlal.s16       \d, \s2, d0[2]
        vmlal.s16       \d, \s3, d0[3]
.endm
// Eight-term version of vmull_vmlal_4; the coefficients are the four
// lanes of d0 followed by the four lanes of d1.
.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
        vmull.s16       \d, \s0, d0[0]
        vmlal.s16       \d, \s1, d0[1]
        vmlal.s16       \d, \s2, d0[2]
        vmlal.s16       \d, \s3, d0[3]
        vmlal.s16       \d, \s4, d1[0]
        vmlal.s16       \d, \s5, d1[1]
        vmlal.s16       \d, \s6, d1[2]
        vmlal.s16       \d, \s7, d1[3]
.endm
// Rounding right shift by the immediate \shift with unsigned saturating
// narrowing, from each \qN into the matching \dN. The second pair is
// optional; the third and fourth are processed together when \q2 is given.
.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
        vqrshrun.s32    \d0, \q0, #\shift
.ifnb \q1
        vqrshrun.s32    \d1, \q1, #\shift
.endif
.ifnb \q2
        vqrshrun.s32    \d2, \q2, #\shift
        vqrshrun.s32    \d3, \q3, #\shift
.endif
.endm
// Non-saturating 32->16 bit narrowing from each \qN into the matching
// \dN, with the same optional-argument rules as vqrshrun_s32.
.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
        vmovn.i32       \d0, \q0
.ifnb \q1
        vmovn.i32       \d1, \q1
.endif
.ifnb \q2
        vmovn.i32       \d2, \q2
        vmovn.i32       \d3, \q3
.endif
.endm
// Rounding shift of 32-bit lanes by the per-lane amounts in register
// \shift. \r0 and \r1 are both mandatory here; \r2 and \r3 are processed
// together when \r2 is given.
.macro vrshl_s32 shift, r0, r1, r2, r3
        vrshl.s32       \r0, \r0, \shift
        vrshl.s32       \r1, \r1, \shift
.ifnb \r2
        vrshl.s32       \r2, \r2, \shift
        vrshl.s32       \r3, \r3, \shift
.endif
.endm
// Stores the two 32-bit lanes of \r0 (and optionally of \r1) to two
// interleaved destinations. Note that "\r0"/"\r1" are the macro arguments
// while the bare "r0" and "r9" inside the brackets are the hard-coded
// core registers holding the two destination pointers.
.macro vst1_32 strd, r0, r1
        vst1.32         {\r0[0]}, [r0, :32], \strd
        vst1.32         {\r0[1]}, [r9, :32], \strd
.ifnb \r1
        vst1.32         {\r1[0]}, [r0, :32], \strd
        vst1.32         {\r1[1]}, [r9, :32], \strd
.endif
.endm
// Stores 2, 4 or 8 whole registers, alternating between the hard-coded
// destination pointers r0 and r9, with alignment hint \align. As in
// vst1_32, "\rN" are macro arguments and the bare r0/r9 are core registers.
.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
        vst1.16         {\r0}, [r0, \align], \strd
        vst1.16         {\r1}, [r9, \align], \strd
.ifnb \r2
        vst1.16         {\r2}, [r0, \align], \strd
        vst1.16         {\r3}, [r9, \align], \strd
.endif
.ifnb \r4
        vst1.16         {\r4}, [r0, \align], \strd
        vst1.16         {\r5}, [r9, \align], \strd
        vst1.16         {\r6}, [r0, \align], \strd
        vst1.16         {\r7}, [r9, \align], \strd
.endif
.endm
// Turns 32-bit filter sums in \q0-\q3 into 16-bit results in \d0-\d3.
//   put:       rounding shift right by 6 with unsigned saturation, then
//              clamp to q15
//   otherwise: rounding shift by the amounts in q14, plain narrowing, then
//              subtract q15
// Only \q0/\q1 are passed to the final clamp/subtract, so this relies on
// the caller choosing \d0-\d3 as the halves of \q0 and \q1.
.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
.ifc \type, put
        vqrshrun_s32    6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
        vmin_u16        q15, \q0, \q1
.else
        vrshl_s32       q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
        vmovn_i32       \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
        vsub_i16        q15, \q0, \q1 // PREP_BIAS
.endif
.endm
// shift_store_4/8/16: run finalize on the 32-bit sums, then store the
// 16-bit results.
//   _4   stores \d0-\d3 as four 8-byte rows via vst1_reg
//   _8   stores \q0 and \q1 as two 16-byte rows via vst1_reg
//   _16  stores \q0 and \q1 as one 32-byte row to [r0] only
.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
        vst1_reg        \strd, :64, \d0, \d1, \d2, \d3
.endm
.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
        vst1_reg        \strd, :128, \q0, \q1
.endm
.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
        vst1.16         {\q0, \q1}, [r0, :128], \strd
.endm

// Emits the exported entry point \op\()_8tap_\type\()_16bpc_neon. It saves
// r4-r11 and lr (nine words, which is why the shared body reads its stack
// arguments from [sp, #36] onwards), places the horizontal filter constant
// in r9 and the vertical one in r10, and branches to the shared
// \op\()_8tap_neon body.
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        movw            r9, \type_h
        movw            r10, \type_v
        b               \op\()_8tap_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
// Each constant packs two values, both multiples of 15: one in bits 7 and
// up, one in the low 7 bits. The shared body adds them to the replicated
// mx/my value; its own comments label the fields as the 8tap and 4tap
// variants respectively.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)

.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
make_8tap_fn \type, regular, REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp, REGULAR, SHARP
make_8tap_fn \type, smooth, SMOOTH, SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH

function \type\()_8tap_neon
        ldrd            r4, r5, [sp, #36]
        ldrd            r6, r7, [sp, #44]
.ifc \bdmax, r8
        ldr             r8, [sp, #52]
.endif
        movw            r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
        mul             \mx, \mx, r11
        mul             \my, \my, r11
        add             \mx, \mx, r9 // mx, 8tap_h, 4tap_h
        add             \my, \my, r10 // my, 8tap_v, 4tap_v
.ifc \type, prep
        lsl             \d_strd, \w, #1
.endif

        vdup.16         q15, \bdmax // bitdepth_max
        clz             \bdmax, \bdmax
        clz             r9, \w
        sub             \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        tst             \mx, #(0x7f << 14)
        sub             r9, r9, #24
        add             lr, \bdmax, #6 // 6 + intermediate_bits
        rsb             r12, \bdmax, #6 // 6 - intermediate_bits
        movrel          r11, X(mc_subpel_filters), -8
        bne             L(\type\()_8tap_h)
        tst             \my, #(0x7f <<
14) bne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx r10, \mx, #7, #7 and \mx, \mx, #0x7f it gt movgt \mx, r10 tst \my, #(0x7f << 14) add \mx, r11, \mx, lsl #3 bne L(\type\()_8tap_hv) adr r10, L(\type\()_8tap_h_tbl) vdup.32 q14, r12 // 6 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s32 q14, q14 // -(6-intermediate_bits) .ifc \type, put vdup.16 q13, \bdmax // intermediate_bits .else vmov.i16 q13, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s16 q13, q13 // -intermediate_bits .endif bx r10 .align 2 L(\type\()_8tap_h_tbl): .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 2: vld1.16 {q2}, [\src], \s_strd vld1.16 {q3}, [\sr2], \s_strd vext.8 d5, d4, d5, #2 vext.8 d7, d6, d7, #2 subs \h, \h, #2 vtrn.32 d4, d6 vtrn.32 d5, d7 vmull.s16 q1, d4, d0[0] vmlal.s16 q1, d5, d0[1] vmlal.s16 q1, d6, d0[2] vmlal.s16 q1, d7, d0[3] vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) vqmovun.s32 d2, q1 vrshl.s16 d2, d2, d26 // -intermediate_bits vmin.u16 d2, d2, d30 vst1.32 {d2[0]}, [\dst, :32], \d_strd vst1.32 {d2[1]}, [\ds2, :32], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 4: vld1.16 {q8}, [\src], \s_strd vld1.16 {q11}, [\sr2], \s_strd vext.8 d18, d16, d17, #2 vext.8 d19, d16, d17, #4 vext.8 d20, d16, d17, #6 vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d21, 
d22, d23, #6 subs \h, \h, #2 vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d19, d0[2] vmlal.s16 q2, d20, d0[3] vmull.s16 q3, d22, d0[0] vmlal.s16 q3, d24, d0[1] vmlal.s16 q3, d25, d0[2] vmlal.s16 q3, d21, d0[3] vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) .ifc \type, put vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vrshl.s16 q2, q2, q13 // -intermediate_bits vmin.u16 q2, q2, q15 .else vmovn.s32 d4, q2 vmovn.s32 d5, q3 vsub.i16 q2, q2, q13 // PREP_BIAS .endif vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd bgt 4b pop {r4-r11,pc} 80: 160: 320: 640: 1280: // 8xN, 16xN, 32xN, ... h vpush {q4-q5} vld1.8 {d0}, [\mx, :64] sub \src, \src, #6 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 sub \s_strd, \s_strd, \w, lsl #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, lsl #1 .endif 81: vld1.16 {q8, q9}, [\src]! vld1.16 {q10, q11}, [\sr2]! 
mov \mx, \w 8: vmull.s16 q1, d16, d0[0] vmull.s16 q2, d17, d0[0] vmull.s16 q3, d20, d0[0] vmull.s16 q4, d21, d0[0] .irpc i, 1234567 vext.8 q12, q8, q9, #(2*\i) vext.8 q5, q10, q11, #(2*\i) .if \i < 4 vmlal.s16 q1, d24, d0[\i] vmlal.s16 q2, d25, d0[\i] vmlal.s16 q3, d10, d0[\i] vmlal.s16 q4, d11, d0[\i] .else vmlal.s16 q1, d24, d1[\i-4] vmlal.s16 q2, d25, d1[\i-4] vmlal.s16 q3, d10, d1[\i-4] vmlal.s16 q4, d11, d1[\i-4] .endif .endr subs \mx, \mx, #8 vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) .ifc \type, put vqmovun.s32 d2, q1 vqmovun.s32 d3, q2 vqmovun.s32 d4, q3 vqmovun.s32 d5, q4 vrshl.s16 q1, q1, q13 // -intermediate_bits vrshl.s16 q2, q2, q13 // -intermediate_bits vmin.u16 q1, q1, q15 vmin.u16 q2, q2, q15 .else vmovn.s32 d2, q1 vmovn.s32 d3, q2 vmovn.s32 d4, q3 vmovn.s32 d5, q4 vsub.i16 q1, q1, q13 // PREP_BIAS vsub.i16 q2, q2, q13 // PREP_BIAS .endif vst1.16 {q1}, [\dst, :128]! vst1.16 {q2}, [\ds2, :128]! ble 9f vmov q8, q9 vmov q10, q11 vld1.16 {q9}, [\src]! vld1.16 {q11}, [\sr2]! 
b 8b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 81b vpop {q4-q5} pop {r4-r11,pc} L(\type\()_8tap_v): cmp \h, #4 ubfx r10, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r10 add \my, r11, \my, lsl #3 .ifc \type, prep vdup.32 q14, r12 // 6 - intermediate_bits vmov.i16 q15, #PREP_BIAS .endif adr r10, L(\type\()_8tap_v_tbl) ldr r9, [r10, r9, lsl #2] .ifc \type, prep vneg.s32 q14, q14 // -(6-intermediate_bits) .endif add r10, r10, r9 bx r10 .align 2 L(\type\()_8tap_v_tbl): .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put bgt 28f cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 // 2x2 v load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_32 d1, d2, d3, d4, d5 bgt 24f vmull_vmlal_4 q8, d1, d2, d3, d4 vqrshrun_s32 6, q8, d16 vmin_u16 d30, d16 vst1_32 \d_strd, d16 pop {r4-r11,pc} 24: // 2x4 v load_32 \sr2, \src, \s_strd, d6, d7 interleave_1_32 d5, d6, d7 vmull_vmlal_4 q8, d1, d2, d3, d4 vmull_vmlal_4 q9, d3, d4, d5, d6 vqrshrun_s32 6, q8, d16, q9, d17 vmin_u16 q15, q8 vst1_32 \d_strd, d16, d17 pop {r4-r11,pc} 28: // 2x6, 2x8, 2x12, 2x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 interleave_1_32 d2, d3, d4, d5, d6 interleave_1_32 d6, d7, d16 216: subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 interleave_1_32 d16, d17, d18, 
d19, d20 vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 vqrshrun_s32 6, q13, d26, q1, d27 vmin_u16 q15, q13 vst1_32 \d_strd, d26, d27 ble 0f cmp \h, #2 vmov q1, q3 vmov q2, q8 vmov q3, q9 vmov d16, d20 beq 26f b 216b 26: load_32 \sr2, \src, \s_strd, d17, d18 interleave_1_32 d16, d17, d18 vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 vqrshrun_s32 6, q13, d26 vmin_u16 d30, d26 vst1_32 \d_strd, d26 0: pop {r4-r11,pc} .endif 40: bgt 480f // 4x2, 4x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 vmull_vmlal_4 q8, d1, d2, d3, d4 vmull_vmlal_4 q9, d2, d3, d4, d5 shift_store_4 \type, \d_strd, q8, q9, d16, d17 ble 0f load_reg \sr2, \src, \s_strd, d6, d7 vmull_vmlal_4 q8, d3, d4, d5, d6 vmull_vmlal_4 q9, d4, d5, d6, d7 shift_store_4 \type, \d_strd, q8, q9, d16, d17 0: pop {r4-r11,pc} 480: // 4x6, 4x8, 4x12, 4x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 48: subs \h, \h, #4 load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 ble 0f cmp \h, #2 vmov q8, q10 vmov q9, q11 vmov q10, q12 vmov d22, d26 beq 46f b 48b 46: load_reg \sr2, \src, \s_strd, d23, d24 vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 shift_store_4 \type, \d_strd, q1, q2, d2, d3 0: pop {r4-r11,pc} 80: bgt 880f // 8x2, 8x4 v 
cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 vmull_vmlal_4 q10, d2, d4, d6, d16 vmull_vmlal_4 q11, d3, d5, d7, d17 vmull_vmlal_4 q12, d4, d6, d16, d18 vmull_vmlal_4 q13, d5, d7, d17, d19 shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 ble 0f load_reg \sr2, \src, \s_strd, q10, q11 vmull_vmlal_4 q1, d6, d16, d18, d20 vmull_vmlal_4 q2, d7, d17, d19, d21 vmull_vmlal_4 q12, d16, d18, d20, d22 vmull_vmlal_4 q13, d17, d19, d21, d23 shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 0: pop {r4-r11,pc} 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 88: subs \h, \h, #2 load_reg \sr2, \src, \s_strd, q12, q13 vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, q1, q2 vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9 ble 9f vmov q5, q9 vmov q6, q10 vmov q7, q11 vmov q8, q12 vmov q9, q13 vmov q10, q1 vmov q11, q2 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src 
mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: vpop {q4-q7} pop {r4-r11,pc} 160: bgt 1680b // 16x2, 16x4 v vpush {q6-q7} add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd vmovl.s8 q0, d0 load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 16: load_16s16 \src, \src, \s_strd, q12, q13 subs \h, \h, #1 vmull_vmlal_4 q1, d12, d16, d20, d24 vmull_vmlal_4 q2, d13, d17, d21, d25 vmull_vmlal_4 q3, d14, d18, d22, d26 vmull_vmlal_4 q6, d15, d19, d23, d27 shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5 ble 0f vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 vmov q10, q12 vmov q11, q13 b 16b 0: vpop {q6-q7} pop {r4-r11,pc} L(\type\()_8tap_hv): cmp \h, #4 ubfx r10, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r10 4: add \my, r11, \my, lsl #3 adr r10, L(\type\()_8tap_hv_tbl) neg r12, r12 // -(6-intermediate_bits) ldr r9, [r10, r9, lsl #2] vdup.32 q14, r12 // -(6-intermediate_bits) .ifc \type, put neg r8, lr // -(6+intermeidate_bits) .else vmov.i16 q13, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vdup.32 q13, r8 // -(6+intermediate_bits) .endif bx r10 .align 2 L(\type\()_8tap_hv_tbl): .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB 20: .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 280f add \my, \my, #2 vld1.32 {d2[]}, [\my] // 2x2, 2x4 hv sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vmull.s16 q11, d22, d0 vmull.s16 q12, d24, d0 vpadd.s32 d22, d22, d23 vpadd.s32 d23, d24, d25 vpadd.s32 
d22, d22, d23 vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) vmovn.i32 d16, q8 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vext.8 d16, d16, d24, #4 vmov d17, d24 2: bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d24, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d24, d2[3] vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vmin.u16 d4, d4, d30 subs \h, \h, #2 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd ble 0f vmov d16, d18 vmov d17, d24 b 2b 280: // 2x8, 2x16, 2x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vmull.s16 q11, d22, d0 vmull.s16 q12, d24, d0 vpadd.s32 d22, d22, d23 vpadd.s32 d23, d24, d25 vpadd.s32 d22, d22, d23 vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) vmovn.i32 d16, q8 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vext.8 d16, d16, d24, #4 vmov d17, d24 bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d24, #4 vmov d19, d24 bl L(\type\()_8tap_filter_2) vext.8 d20, d19, d24, #4 vmov d21, d24 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d24, #4 vmull.s16 q3, d16, d2[0] vmlal.s16 q3, d17, d2[1] vmlal.s16 q3, d18, d2[2] vmlal.s16 q3, d19, d2[3] vmlal.s16 q3, d20, d3[0] vmlal.s16 q3, d21, d3[1] vmlal.s16 q3, d22, d3[2] vmlal.s16 q3, d24, d3[3] vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d6, q3 vmin.u16 d6, d6, d30 subs \h, \h, #2 vst1.32 {d6[0]}, [\dst, :32], \d_strd vst1.32 {d6[1]}, [\ds2, :32], \d_strd ble 0f vmov q8, q9 vmov q9, q10 vmov d20, d22 vmov d21, d24 b 28b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_2): vld1.16 {q11}, [\sr2], \s_strd vld1.16 {q12}, [\src], \s_strd vext.8 d23, d22, d23, #2 vext.8 d25, d24, d25, #2 vtrn.32 q11, q12 vmull.s16 q3, d22, d0[0] vmlal.s16 q3, d23, d0[1] vmlal.s16 q3, 
d24, d0[2] vmlal.s16 q3, d25, d0[3] vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d24, q3 bx lr .endif 40: add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 480f add \my, \my, #2 vld1.32 {d2[]}, [\my] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 // 4x2, 4x4 hv vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d17, q10 bl L(\type\()_8tap_filter_4) vmov q9, q12 4: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d17, d2[0] vmlal.s16 q2, d18, d2[1] vmlal.s16 q2, d19, d2[2] vmlal.s16 q2, d24, d2[3] vmull.s16 q3, d18, d2[0] vmlal.s16 q3, d19, d2[1] vmlal.s16 q3, d24, d2[2] vmlal.s16 q3, d25, d2[3] .ifc \type, put vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q15 .else vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vsub.i16 q2, q2, q13 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd ble 0f vmov d17, d19 vmov q9, q12 b 4b 0: pop {r4-r11,pc} 480: // 4x8, 4x16, 4x32 hv vpush {d13-d15} vld1.8 {d2}, [\my, :64] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d13, q10 bl L(\type\()_8tap_filter_4) vmov q7, q12 bl L(\type\()_8tap_filter_4) vmov q8, q12 bl L(\type\()_8tap_filter_4) vmov q9, q12 48: bl 
L(\type\()_8tap_filter_4) vmull.s16 q2, d13, d2[0] vmlal.s16 q2, d14, d2[1] vmlal.s16 q2, d15, d2[2] vmlal.s16 q2, d16, d2[3] vmlal.s16 q2, d17, d3[0] vmlal.s16 q2, d18, d3[1] vmlal.s16 q2, d19, d3[2] vmlal.s16 q2, d24, d3[3] vmull.s16 q3, d14, d2[0] vmlal.s16 q3, d15, d2[1] vmlal.s16 q3, d16, d2[2] vmlal.s16 q3, d17, d2[3] vmlal.s16 q3, d18, d3[0] vmlal.s16 q3, d19, d3[1] vmlal.s16 q3, d24, d3[2] vmlal.s16 q3, d25, d3[3] .ifc \type, put vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q15 .else vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vsub.i16 q2, q2, q13 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd ble 0f vmov d13, d15 vmov q7, q8 vmov q8, q9 vmov q9, q12 b 48b 0: vpop {d13-d15} pop {r4-r11,pc} L(\type\()_8tap_filter_4): vld1.16 {q10}, [\sr2], \s_strd vld1.16 {q11}, [\src], \s_strd vext.8 d24, d20, d21, #2 vext.8 d25, d20, d21, #4 vext.8 d21, d20, d21, #6 vmull.s16 q3, d20, d0[0] vmlal.s16 q3, d24, d0[1] vmlal.s16 q3, d25, d0[2] vmlal.s16 q3, d21, d0[3] vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d24, q3 vmovn.i32 d25, q10 bx lr 80: 160: 320: bgt 880f add \my, \my, #2 vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #6 sub \src, \src, \s_strd vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.16 {q11, q12}, [\src], \s_strd vmull.s16 q2, d22, d0[0] vmull.s16 q3, d23, d0[0] vdup.32 q14, r12 // -(6-intermediate_bits) .irpc i, 1234567 vext.8 q10, q11, q12, #(2*\i) .if \i < 4 vmlal.s16 q2, d20, d0[\i] vmlal.s16 q3, 
d21, d0[\i] .else vmlal.s16 q2, d20, d1[\i - 4] vmlal.s16 q3, d21, d1[\i - 4] .endif .endr vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d16, q2 vmovn.i32 d17, q3 bl L(\type\()_8tap_filter_8) vmov q9, q11 vmov q10, q12 8: bl L(\type\()_8tap_filter_8) vmull.s16 q2, d16, d2[0] vmull.s16 q3, d17, d2[0] vmull.s16 q13, d18, d2[0] vmull.s16 q14, d19, d2[0] .ifc \type, put vdup.32 q8, r8 // -(6+intermediate_bits) .endif vmlal.s16 q2, d18, d2[1] vmlal.s16 q3, d19, d2[1] vmlal.s16 q13, d20, d2[1] vmlal.s16 q14, d21, d2[1] vmlal.s16 q2, d20, d2[2] vmlal.s16 q3, d21, d2[2] vmlal.s16 q13, d22, d2[2] vmlal.s16 q14, d23, d2[2] vmlal.s16 q2, d22, d2[3] vmlal.s16 q3, d23, d2[3] vmlal.s16 q13, d24, d2[3] vmlal.s16 q14, d25, d2[3] .ifc \type, put vdup.16 q9, \bdmax // bitdepth_max vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) vrshl.s32 q3, q3, q8 // -(6+intermediate_bits) vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vqmovun.s32 d6, q13 vqmovun.s32 d7, q14 vmin.u16 q2, q2, q15 vmin.u16 q3, q3, q15 .else vmov.i16 q9, #PREP_BIAS vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q13, #6 vrshrn.i32 d7, q14, #6 vsub.i16 q2, q2, q9 // PREP_BIAS vsub.i16 q3, q3, q9 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd ble 9f vmov q8, q10 vmov q9, q11 vmov q10, q12 b 8b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 164b 0: pop {r4-r11,pc} 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\mx, :64] vld1.8 {d2}, [\my, :64] sub \src, \src, #6 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.16 {q11, q12}, [\src], \s_strd vmull.s16 q2, d22, d0[0] vmull.s16 q3, d23, d0[0] vdup.32 q14, r12 // -(6-intermediate_bits) .irpc i, 1234567 vext.8 q10, q11, q12, #(2*\i) .if \i < 4 vmlal.s16 q2, d20, d0[\i] vmlal.s16 q3, d21, d0[\i] .else vmlal.s16 q2, d20, d1[\i - 4] vmlal.s16 q3, d21, d1[\i - 4] .endif .endr vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d8, q2 vmovn.i32 d9, q3 bl L(\type\()_8tap_filter_8) vmov q5, q11 vmov q6, q12 bl L(\type\()_8tap_filter_8) vmov q7, q11 vmov q8, q12 bl L(\type\()_8tap_filter_8) vmov q9, q11 vmov q10, q12 88: bl L(\type\()_8tap_filter_8) vmull.s16 q2, d8, d2[0] vmull.s16 q3, d9, d2[0] vmull.s16 q13, d10, d2[0] vmull.s16 q14, d11, d2[0] .ifc \type, put vdup.32 q4, r8 // -(6+intermediate_bits) .endif vmlal.s16 q2, d10, d2[1] vmlal.s16 q3, d11, d2[1] vmlal.s16 q13, d12, d2[1] vmlal.s16 q14, d13, d2[1] vmlal.s16 q2, d12, d2[2] vmlal.s16 q3, d13, d2[2] vmlal.s16 q13, d14, d2[2] vmlal.s16 q14, d15, d2[2] vmlal.s16 q2, d14, d2[3] vmlal.s16 q3, d15, d2[3] vmlal.s16 q13, d16, d2[3] vmlal.s16 q14, d17, d2[3] vmlal.s16 q2, d16, d3[0] vmlal.s16 q3, d17, d3[0] vmlal.s16 q13, d18, d3[0] vmlal.s16 q14, d19, d3[0] vmlal.s16 q2, d18, d3[1] vmlal.s16 q3, d19, d3[1] vmlal.s16 q13, d20, d3[1] vmlal.s16 q14, d21, d3[1] vmlal.s16 q2, d20, d3[2] vmlal.s16 q3, d21, d3[2] vmlal.s16 q13, d22, d3[2] vmlal.s16 q14, d23, d3[2] vmlal.s16 q2, d22, d3[3] vmlal.s16 q3, d23, d3[3] vmlal.s16 q13, d24, d3[3] vmlal.s16 q14, d25, d3[3] .ifc \type, put vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) vrshl.s32 q14, 
q14, q4 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vqmovun.s32 d6, q13 vqmovun.s32 d7, q14 vmin.u16 q2, q2, q15 vmin.u16 q3, q3, q15 .else vmov.i16 q5, #PREP_BIAS vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q13, #6 vrshrn.i32 d7, q14, #6 vsub.i16 q2, q2, q5 // PREP_BIAS vsub.i16 q3, q3, q5 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd ble 9f vmov q4, q6 vmov q5, q7 vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 vmov q10, q12 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: vpop {q4-q7} pop {r4-r11,pc} L(\type\()_8tap_filter_8): vld1.16 {q13, q14}, [\sr2], \s_strd vmull.s16 q2, d26, d0[0] vmull.s16 q3, d27, d0[0] .irpc i, 1234567 vext.8 q12, q13, q14, #(2*\i) .if \i < 4 vmlal.s16 q2, d24, d0[\i] vmlal.s16 q3, d25, d0[\i] .else vmlal.s16 q2, d24, d1[\i - 4] vmlal.s16 q3, d25, d1[\i - 4] .endif .endr vdup.32 q12, r12 // -(6-intermediate_bits) vld1.16 {q13, q14}, [\src], \s_strd vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) vmovn.i32 d4, q2 vmovn.i32 d5, q3 vmull.s16 q3, d26, d0[0] vmull.s16 q11, d27, d0[0] .irpc i, 1234567 vext.8 q12, q13, q14, #(2*\i) .if \i < 4 vmlal.s16 q3, d24, d0[\i] vmlal.s16 q11, d25, d0[\i] .else vmlal.s16 q3, d24, d1[\i - 4] vmlal.s16 q11, d25, d1[\i - 4] .endif .endr vdup.32 q13, r12 // -(6-intermediate_bits) vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) vmovn.i32 d24, q3 vmovn.i32 d25, q11 vmov q11, q2 bx lr endfunc function \type\()_bilin_16bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] .ifc \bdmax, r8 ldr r8, [sp, #52] .endif vdup.16 q1, \mx vdup.16 q3, \my rsb r9, \mx, #16 rsb r10, \my, #16 vdup.16 q0, r9 vdup.16 q2, r10 
.ifc \type, prep lsl \d_strd, \w, #1 .endif clz \bdmax, \bdmax // bitdepth_max clz r9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 cmp \mx, #0 sub r9, r9, #24 rsb r11, \bdmax, #4 // 4 - intermediate_bits add r12, \bdmax, #4 // 4 + intermediate_bits bne L(\type\()_bilin_h) cmp \my, #0 bne L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cmp \my, #0 bne L(\type\()_bilin_hv) adr r10, L(\type\()_bilin_h_tbl) vdup.16 q15, r11 // 4 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s16 q15, q15 // -(4-intermediate_bits) .ifc \type, put vdup.16 q14, \bdmax // intermediate_bits .else vmov.i16 q14, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s16 q14, q14 // -intermediate_bits .endif bx r10 .align 2 L(\type\()_bilin_h_tbl): .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: vld1.16 {d16}, [\src], \s_strd vld1.16 {d18}, [\sr2], \s_strd vext.8 d17, d16, d16, #2 vext.8 d19, d18, d18, #2 vtrn.32 d16, d18 vtrn.32 d17, d19 subs \h, \h, #2 vmul.i16 d16, d16, d0 vmla.i16 d16, d17, d2 vrshl.u16 d16, d16, d30 vrshl.u16 d16, d16, d28 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: vld1.16 {q8}, [\src], \s_strd vld1.16 {q10}, [\sr2], \s_strd vext.8 q9, q8, q8, #2 vext.8 q11, q10, q10, #2 vmov d17, d20 vmov d19, d22 subs \h, \h, #2 vmul.i16 q8, q8, q0 vmla.i16 q8, q9, q1 vrshl.u16 q8, q8, q15 .ifc \type, put vrshl.u16 q8, q8, q14 .else 
vsub.i16 q8, q8, q14 .endif vst1.16 {d16}, [\dst, :64], \d_strd vst1.16 {d17}, [\ds2, :64], \d_strd bgt 4b pop {r4-r11,pc} 80: // 8xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: vld1.16 {d16, d17, d18}, [\src], \s_strd vld1.16 {d20, d21, d22}, [\sr2], \s_strd vext.8 q9, q8, q9, #2 vext.8 q11, q10, q11, #2 subs \h, \h, #2 vmul.i16 q8, q8, q0 vmla.i16 q8, q9, q1 vmul.i16 q10, q10, q0 vmla.i16 q10, q11, q1 vrshl.u16 q8, q8, q15 vrshl.u16 q10, q10, q15 .ifc \type, put vrshl.u16 q8, q8, q14 vrshl.u16 q10, q10, q14 .else vsub.i16 q8, q8, q14 vsub.i16 q10, q10, q14 .endif vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q10}, [\ds2, :128], \d_strd bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h vpush {q4-q7} add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, lsl #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, lsl #1 .endif 161: vld1.16 {q4}, [\src]! vld1.16 {q9}, [\sr2]! mov \mx, \w 16: vld1.16 {q5, q6}, [\src]! vld1.16 {q10, q11}, [\sr2]! vext.8 q7, q4, q5, #2 vext.8 q8, q5, q6, #2 vext.8 q12, q9, q10, #2 vext.8 q13, q10, q11, #2 vmul.i16 q4, q4, q0 vmla.i16 q4, q7, q1 vmul.i16 q5, q5, q0 vmla.i16 q5, q8, q1 vmul.i16 q9, q9, q0 vmla.i16 q9, q12, q1 vmul.i16 q10, q10, q0 vmla.i16 q10, q13, q1 vrshl.u16 q4, q4, q15 vrshl.u16 q5, q5, q15 vrshl.u16 q9, q9, q15 vrshl.u16 q10, q10, q15 subs \mx, \mx, #16 .ifc \type, put vrshl.u16 q4, q4, q14 vrshl.u16 q5, q5, q14 vrshl.u16 q9, q9, q14 vrshl.u16 q10, q10, q14 .else vsub.i16 q4, q4, q14 vsub.i16 q5, q5, q14 vsub.i16 q9, q9, q14 vsub.i16 q10, q10, q14 .endif vst1.16 {q4, q5}, [\dst, :128]! vst1.16 {q9, q10}, [\ds2, :128]! 
ble 9f vmov q4, q6 vmov q9, q11 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b vpop {q4-q7} pop {r4-r11,pc} L(\type\()_bilin_v): cmp \h, #4 adr r10, L(\type\()_bilin_v_tbl) .ifc \type, prep vdup.16 q15, r11 // 4 - intermediate_bits .endif ldr r9, [r10, r9, lsl #2] .ifc \type, prep vmov.i16 q14, #PREP_BIAS vneg.s16 q15, q15 // -(4-intermediate_bits) .endif add r10, r10, r9 bx r10 .align 2 L(\type\()_bilin_v_tbl): .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v vld1.32 {d16[]}, [\src], \s_strd bgt 24f 22: vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vmul.i16 d16, d16, d4 vmla.i16 d16, d17, d6 vrshr.u16 d16, d16, #4 vst1.32 {d16[0]}, [\dst, :32] vst1.32 {d16[1]}, [\ds2, :32] pop {r4-r11,pc} 24: // 2x4, 2x6, 2x8, ... 
v vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vld1.32 {d19[]}, [\sr2], \s_strd vld1.32 {d20[]}, [\src], \s_strd subs \h, \h, #4 vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vext.8 d18, d18, d19, #4 vext.8 d19, d19, d20, #4 vswp d17, d18 vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 cmp \h, #2 vrshr.u16 q8, q8, #4 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd vst1.32 {d17[0]}, [\dst, :32], \d_strd vst1.32 {d17[1]}, [\ds2, :32], \d_strd blt 0f vmov d16, d20 beq 22b b 24b 0: pop {r4-r11,pc} .endif 40: // 4xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {d16}, [\src], \s_strd 4: vld1.16 {d17}, [\sr2], \s_strd vld1.16 {d19}, [\src], \s_strd vmov d18, d17 vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 .else vrshl.u16 q8, q8, q15 vsub.i16 q8, q8, q14 .endif vst1.16 {d16}, [\dst, :64], \d_strd vst1.16 {d17}, [\ds2, :64], \d_strd ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q8}, [\src], \s_strd 8: vld1.16 {q9}, [\sr2], \s_strd vld1.16 {q10}, [\src], \s_strd vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 vmul.i16 q9, q9, q2 vmla.i16 q9, q10, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 .else vrshl.u16 q8, q8, q15 vrshl.u16 q9, q9, q15 vsub.i16 q8, q8, q14 vsub.i16 q9, q9, q14 .endif vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q9}, [\ds2, :128], \d_strd ble 0f vmov q8, q10 b 8b 0: pop {r4-r11,pc} 160: // 16xN, 32xN, ... 
320: 640: 1280: mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q8, q9}, [\src], \s_strd 2: vld1.16 {q10, q11}, [\sr2], \s_strd vld1.16 {q12, q13}, [\src], \s_strd vmul.i16 q8, q8, q2 vmla.i16 q8, q10, q3 vmul.i16 q9, q9, q2 vmla.i16 q9, q11, q3 vmul.i16 q10, q10, q2 vmla.i16 q10, q12, q3 vmul.i16 q11, q11, q2 vmla.i16 q11, q13, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 vrshr.u16 q10, q10, #4 vrshr.u16 q11, q11, #4 .else vrshl.u16 q8, q8, q15 vrshl.u16 q9, q9, q15 vrshl.u16 q10, q10, q15 vrshl.u16 q11, q11, q15 vsub.i16 q8, q8, q14 vsub.i16 q9, q9, q14 vsub.i16 q10, q10, q14 vsub.i16 q11, q11, q14 .endif vst1.16 {q8, q9}, [\dst, :128], \d_strd vst1.16 {q10, q11}, [\ds2, :128], \d_strd ble 9f vmov q8, q12 vmov q9, q13 b 2b 9: subs \w, \w, #16 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #32 add \dst, \dst, #32 b 1b 0: pop {r4-r11,pc} L(\type\()_bilin_hv): adr r10, L(\type\()_bilin_hv_tbl) vdup.16 q15, r11 // 4 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s16 q15, q15 // -(4-intermediate_bits) .ifc \type, put vdup.32 q14, r12 // 4 + intermediate_bits .else vmov.i16 q14, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s32 q14, q14 // -(4+intermediate_bits) .endif bx r10 .align 2 L(\type\()_bilin_hv_tbl): .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB 20: // 2xN hv .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {d20}, [\src], 
\s_strd vext.8 d21, d20, d20, #2 vmul.i16 d16, d20, d0 vmla.i16 d16, d21, d2 vrshl.u16 d16, d16, d30 vext.8 d16, d16, d16, #4 2: vld1.16 {d20}, [\sr2], \s_strd vld1.16 {d22}, [\src], \s_strd vext.8 d21, d20, d20, #2 vext.8 d23, d22, d22, #2 vtrn.32 d20, d22 vtrn.32 d21, d23 vmul.i16 d18, d20, d0 vmla.i16 d18, d21, d2 vrshl.u16 d18, d18, d30 vext.8 d16, d16, d18, #4 vmull.u16 q8, d16, d4 vmlal.u16 q8, d18, d6 vrshl.u32 q8, q8, q14 vmovn.i32 d16, q8 subs \h, \h, #2 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd ble 0f vmov d16, d18 b 2b 0: pop {r4-r11,pc} .endif 40: // 4xN hv add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q10}, [\src], \s_strd vext.8 d21, d20, d21, #2 vmul.i16 d16, d20, d0 vmla.i16 d16, d21, d2 vrshl.u16 d16, d16, d30 4: vld1.16 {q10}, [\sr2], \s_strd vld1.16 {q11}, [\src], \s_strd vext.8 d21, d20, d21, #2 vext.8 d23, d22, d23, #2 vswp d21, d22 vmul.i16 q9, q10, q0 vmla.i16 q9, q11, q1 vrshl.u16 q9, q9, q15 vmull.u16 q10, d16, d4 vmlal.u16 q10, d18, d6 vmull.u16 q11, d18, d4 vmlal.u16 q11, d19, d6 .ifc \type, put vrshl.u32 q10, q10, q14 vrshl.u32 q11, q11, q14 vmovn.i32 d20, q10 vmovn.i32 d21, q11 .else vrshrn.i32 d20, q10, #4 vrshrn.i32 d21, q11, #4 vsub.i16 q10, q10, q14 .endif subs \h, \h, #2 vst1.16 {d20}, [\dst, :64], \d_strd vst1.16 {d21}, [\ds2, :64], \d_strd ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN, 16xN, ... 
hv
// NOTE(review): the lines down to ".endm" are the tail of the filter_fn macro
// body (the 8xN-and-wider "hv" path of the bilin function). The macro and the
// function open earlier in the file; this tail is reproduced unchanged.
160:
320:
640:
1280:
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        vld1.16         {d20, d21, d22}, [\src], \s_strd
        vext.8          q11, q10, q11, #2
        vmul.i16        q8, q10, q0
        vmla.i16        q8, q11, q1
        vrshl.u16       q8, q8, q15

2:
        vld1.16         {d20, d21, d22}, [\sr2], \s_strd
        vld1.16         {d24, d25, d26}, [\src], \s_strd
        vext.8          q11, q10, q11, #2
        vext.8          q13, q12, q13, #2
        vmul.i16        q9, q10, q0
        vmla.i16        q9, q11, q1
        vmul.i16        q10, q12, q0
        vmla.i16        q10, q13, q1
        vrshl.u16       q9, q9, q15
        vrshl.u16       q10, q10, q15

        vmull.u16       q11, d16, d4
        vmlal.u16       q11, d18, d6
        vmull.u16       q12, d17, d4
        vmlal.u16       q12, d19, d6
        vmull.u16       q8, d18, d4
        vmlal.u16       q8, d20, d6
        vmull.u16       q9, d19, d4
        vmlal.u16       q9, d21, d6
.ifc \type, put
        vrshl.u32       q11, q11, q14
        vrshl.u32       q12, q12, q14
        vrshl.u32       q8, q8, q14
        vrshl.u32       q9, q9, q14
        vmovn.i32       d22, q11
        vmovn.i32       d23, q12
        vmovn.i32       d16, q8
        vmovn.i32       d17, q9
.else
        vrshrn.i32      d22, q11, #4
        vrshrn.i32      d23, q12, #4
        vrshrn.i32      d16, q8, #4
        vrshrn.i32      d17, q9, #4
        vsub.i16        q11, q11, q14
        vsub.i16        q8, q8, q14
.endif
        subs            \h, \h, #2
        vst1.16         {q11}, [\dst, :128], \d_strd
        vst1.16         {q8}, [\ds2, :128], \d_strd
        ble             9f
        vmov            q8, q10
        b               2b
9:
        subs            \w, \w, #8
        ble             0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        mls             \src, \s_strd, \my, \src
        mls             \dst, \d_strd, \my, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               1b
0:
        pop             {r4-r11,pc}
endfunc
.endm

// Instantiate the filter_fn macro body once per output type. The arguments
// after the type name are the registers substituted for the macro's
// parameters; the two instantiations differ only in that register assignment.
// NOTE(review): the parameter names are declared where filter_fn opens
// (earlier in the file) - confirm the name-to-register mapping there.
filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10

// r12 = r11 + 8 * (\src >> 10)
// Points r12 at the 8-byte filter row selected by the position in \src.
// r11 must already hold the filter table base (the warp macro sets it up with
// movrel before any of these helpers run).
.macro load_filter_ptr src
        asr             r12, \src, #10
        add             r12, r11, r12, lsl #3
.endm

// Step the position \src by \inc and load the 8 filter bytes at r12 into
// \dst. r12 is not recomputed here, so the row loaded is the one selected by
// the most recent load_filter_ptr.
.macro load_filter_coef dst, src, inc
        add             \src, \src, \inc
        vld1.8          {\dst}, [r12, :64]
.endm

// load_filter_ptr followed by load_filter_coef: load the filter row for the
// current position \src into \dst, then advance \src by \inc.
.macro load_filter_row dst, src, inc
        load_filter_ptr \src
        load_filter_coef \dst, \src, \inc
.endm

// Horizontal pass of the warp filter for one source row; file-local helper
// (no export) called with "bl" from the warp macro.
//
// In:
//   r2      source row pointer; post-incremented by r3 by the row load
//   r5      horizontal filter position; one filter row is fetched per output
//           pixel, stepping r5 by r7 each time
//   r7, r8  per-pixel and per-row steps of the horizontal position
//   r11     filter table base (see load_filter_ptr)
//   [sp]    32-bit shift count -(7 - intermediate_bits), spilled by the caller
// Out:
//   q4, q5  output pixels 0-3 and 4-7 as 32-bit sums after the rounding shift
//   r5      entry value + r8 (the eight r7 steps are undone before r8 is added)
// Clobbers q0-q3, q6, q7 and r12.
//
// 16 source pixels are loaded into q6/q7. Output pixel i multiplies the eight
// pixels starting at offset i (vext by 2*i bytes) with its own widened 8-tap
// filter, and the products are reduced with pairwise adds. The filter loads,
// widenings, multiplies and reductions of neighbouring pixels are interleaved;
// the existing per-line comments say which pixel each instruction belongs to.
function warp_filter_horz_neon
        load_filter_ptr r5  // filter 0
        vld1.16         {q6,q7}, [r2], r3

        load_filter_coef d0, r5, r7  // filter 0
        load_filter_row d2, r5, r7  // filter 1
        vmovl.s8        q0, d0  // filter 0
        vext.8          q3, q6, q7, #2*1  // filter 1 pixels
        vmovl.s8        q1, d2  // filter 1

        vmull.s16       q4, d12, d0  // filter 0 output (0-3)
        vmull.s16       q5, d13, d1  // filter 0 output (4-7)

        load_filter_ptr r5  // filter 2

        vmull.s16       q2, d6, d2  // filter 1 output (0-3)
        vmull.s16       q3, d7, d3  // filter 1 output (4-7)

        load_filter_coef d0, r5, r7  // filter 2

        vpadd.i32       d8, d8, d9  // half pixel 0 (2x32)
        vpadd.i32       d9, d10, d11  // half pixel 0 (2x32)

        load_filter_ptr r5  // filter 3

        vpadd.i32       d4, d4, d5  // half pixel 1 (2x32)
        vpadd.i32       d5, d6, d7  // half pixel 1 (2x32)

        vmovl.s8        q0, d0  // filter 2
        vext.8          q3, q6, q7, #2*2  // filter 2 pixels

        vpadd.i32       d8, d8, d9  // pixel 0 (2x32)
        vpadd.i32       d9, d4, d5  // pixel 1 (2x32)

        load_filter_coef d2, r5, r7  // filter 3

        vmull.s16       q2, d6, d0  // filter 2 output (0-3)
        vmull.s16       q3, d7, d1  // filter 2 output (4-7)

        load_filter_ptr r5  // filter 4

        vpadd.i32       d8, d8, d9  // pixel 0,1

        vpadd.i32       d9, d4, d5  // half pixel 2 (2x32)
        vpadd.i32       d10, d6, d7  // half pixel 2 (2x32)

        vmovl.s8        q1, d2  // filter 3
        vext.8          q3, q6, q7, #2*3  // filter 3 pixels

        load_filter_coef d0, r5, r7  // filter 4

        vpadd.i32       d9, d9, d10  // pixel 2 (2x32)

        vmull.s16       q2, d6, d2  // filter 3 output (0-3)
        vmull.s16       q3, d7, d3  // filter 3 output (4-7)

        vmovl.s8        q0, d0  // filter 4
        load_filter_ptr r5  // filter 5

        vpadd.i32       d10, d4, d5  // half pixel 3 (2x32)
        vpadd.i32       d11, d6, d7  // half pixel 3 (2x32)

        vext.8          q3, q6, q7, #2*4  // filter 4 pixels
        load_filter_coef d2, r5, r7  // filter 5

        vpadd.i32       d10, d10, d11  // pixel 3 (2x32)

        vpadd.i32       d9, d9, d10  // pixel 2,3

        vmull.s16       q2, d6, d0  // filter 4 output (0-3)
        vmull.s16       q3, d7, d1  // filter 4 output (4-7)

        vmovl.s8        q1, d2  // filter 5
        load_filter_ptr r5  // filter 6

        vpadd.i32       d10, d4, d5  // half pixel 4 (2x32)
        vpadd.i32       d11, d6, d7  // half pixel 4 (2x32)

        vext.8          q3, q6, q7, #2*5  // filter 5 pixels
        load_filter_coef d0, r5, r7  // filter 6

        vpadd.i32       d10, d10, d11  // pixel 4 (2x32)

        vmull.s16       q2, d6, d2  // filter 5 output (0-3)
        vmull.s16       q3, d7, d3  // filter 5 output (4-7)

        vmovl.s8        q0, d0  // filter 6
        load_filter_ptr r5  // filter 7

        vpadd.i32       d4, d4, d5  // half pixel 5 (2x32)
        vpadd.i32       d5, d6, d7  // half pixel 5 (2x32)

        vext.8          q3, q6, q7, #2*6  // filter 6 pixels
        load_filter_coef d2, r5, r7  // filter 7

        vpadd.i32       d11, d4, d5  // pixel 5 (2x32)

        vmull.s16       q2, d6, d0  // filter 6 output (0-3)
        vmull.s16       q3, d7, d1  // filter 6 output (4-7)

        vmovl.s8        q1, d2  // filter 7

        vpadd.i32       d10, d10, d11  // pixel 4,5

        vpadd.i32       d4, d4, d5  // half pixel 6 (2x32)
        vpadd.i32       d5, d6, d7  // half pixel 6 (2x32)

        vext.8          q3, q6, q7, #2*7  // filter 7 pixels

        vpadd.i32       d11, d4, d5  // pixel 6 (2x32)

        vmull.s16       q2, d6, d2  // filter 7 output (0-3)
        vmull.s16       q3, d7, d3  // filter 7 output (4-7)

        // q6/q7 (the source pixels) are no longer needed; q7 is reused for
        // the shift count.
        vld1.32         {d14[],d15[]}, [sp]  // -(7 - intermediate_bits)

        vpadd.i32       d4, d4, d5  // half pixel 7 (2x32)
        vpadd.i32       d5, d6, d7  // half pixel 7 (2x32)

        // Rewind the eight per-pixel steps, then apply the per-row step.
        sub             r5, r5, r7, lsl #3

        vpadd.i32       d4, d4, d5  // pixel 7 (2x32)

        add             r5, r5, r8

        vpadd.i32       d11, d11, d4  // pixel 6,7

        vrshl.s32       q4, q4, q7  // -(7 - intermediate_bits)
        vrshl.s32       q5, q5, q7  // -(7 - intermediate_bits)

        bx              lr
endfunc

// void dav2d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
//
// The macro emits warp_affine_8x8\t\()_16bpc_neon and is instantiated twice
// below, once with \t blank and once with \t = t.
//   \t blank:     the vertical sums are shifted by -(7 + intermediate_bits),
//                 saturated to unsigned 16 bit and clamped to bitdepth_max.
//   \t non-blank: the stride in r1 is doubled on entry, and the vertical sums
//                 are rounding-shifted right by 7 with PREP_BIAS subtracted.
//
// Register use: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride. The
// remaining arguments are read from the stack. r5/r6 hold the horizontal and
// vertical filter positions and r10 counts the 8 output rows.
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        // Stack arguments sit above the 36 + 64 bytes pushed above.
        // r4 = abcd, r5 = mx
        ldrd            r4, r5, [sp, #100]
        // r6 = my, r7 = bitdepth_max
        ldrd            r6, r7, [sp, #108]
        // Two scratch words for the shift counts spilled below.
        sub             sp, sp, #8

        clz             r7, r7  // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             r8, r7, #11  // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             r7, r7, #25  // -(7 - intermediate_bits)
.ifb \t
        neg             r8, r8  // -(7 + intermediate_bits)
.endif
        str             r7, [sp]  // spill -(7 - intermediate_bits) on stack
.ifb \t
        str             r8, [sp, #4]  // spill -(7 + intermediate_bits) on stack
.endif

        // Unpack the four 16-bit values at abcd, sign-extended:
        //   r7 = low half of word 0:  per-pixel step of the horizontal position
        //   r8 = high half of word 0: per-row step of the horizontal position
        //   r9 = low half of word 1:  per-column step of the vertical position
        //   r4 = high half of word 1: per-row step of the vertical position
        ldrd            r8, r9, [r4]
        sxth            r7, r8
        asr             r8, r8, #16
        asr             r4, r9, #16
        sxth            r9, r9
        mov             r10, #8
        // src -= 3 * src_stride + 6 bytes (3 pixels)
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        sub             r2, r2, #6
        // Filter table base, offset by 64*8 bytes; used by load_filter_ptr.
        movrel          r11, X(mc_warp_filter), 64*8
.ifnb \t
        lsl             r1, r1, #1
.endif
        add             r5, r5, #512
        add             r6, r6, #512

        // Prime q8-q14 with the first seven horizontally filtered rows,
        // narrowed to 16 bit.
        bl              warp_filter_horz_neon
        vmovn.i32       d16, q4
        vmovn.i32       d17, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d18, q4
        vmovn.i32       d19, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d20, q4
        vmovn.i32       d21, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d22, q4
        vmovn.i32       d23, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d24, q4
        vmovn.i32       d25, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d26, q4
        vmovn.i32       d27, q5
        bl              warp_filter_horz_neon
        vmovn.i32       d28, q4
        vmovn.i32       d29, q5

1:
        // Each iteration filters one more row into q15, producing an
        // eight-row window in q8-q15.
        bl              warp_filter_horz_neon
        vmovn.i32       d30, q4
        vmovn.i32       d31, q5

        // One vertical filter per output column, stepping r6 by r9.
        load_filter_row d8, r6, r9
        load_filter_row d9, r6, r9
        load_filter_row d10, r6, r9
        load_filter_row d11, r6, r9
        load_filter_row d12, r6, r9
        load_filter_row d13, r6, r9
        load_filter_row d14, r6, r9
        load_filter_row d15, r6, r9
        // transpose_8x8b is defined elsewhere; presumably it leaves tap k of
        // all eight columns in d(8+k), which is how d8-d15 are consumed below.
        transpose_8x8b  q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
        // Widen taps 0-5 to 16 bit. The destinations overlap the d8-d13
        // sources, so each source is read before it is overwritten.
        vmovl.s8        q1, d8
        vmovl.s8        q2, d9
        vmovl.s8        q3, d10
        vmovl.s8        q4, d11
        vmovl.s8        q5, d12
        vmovl.s8        q6, d13

        // Undo the eight per-column steps applied to r6 above.
        sub             r6, r6, r9, lsl #3

        // This ordering of vmull/vmlal is highly beneficial for
        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
        vmull.s16       q0, d16, d2
        vmlal.s16       q0, d18, d4
        vmlal.s16       q0, d20, d6
        vmlal.s16       q0, d22, d8
        vmlal.s16       q0, d24, d10
        vmlal.s16       q0, d26, d12
        vmull.s16       q1, d17, d3
        vmlal.s16       q1, d19, d5
        vmlal.s16       q1, d21, d7
        vmlal.s16       q1, d23, d9
        vmlal.s16       q1, d25, d11
        vmlal.s16       q1, d27, d13

        // Taps 6 and 7 are widened only now, into q2/q3, whose earlier
        // contents (taps 1 and 2) have already been consumed.
        vmovl.s8        q2, d14
        vmovl.s8        q3, d15

        vmlal.s16       q0, d28, d4
        vmlal.s16       q0, d30, d6
        vmlal.s16       q1, d29, d5
        vmlal.s16       q1, d31, d7

.ifb \t
        ldr             lr, [sp, #4]  // -(7 + intermediate_bits)
        ldr             r12, [sp, #120]  // bitdepth_max
        vdup.32         q2, lr  // -(7 + intermediate_bits)
        vdup.16         q3, r12  // bitdepth_max
.endif

        // The vmov instructions from here to the store slide the row window
        // up by one (q8 <- q9, ..., q14 <- q15), interleaved with the output
        // scaling.
        vmov            q8, q9
        vmov            q9, q10
.ifb \t
        vrshl.s32       q0, q0, q2  // -(7 + intermediate_bits)
        vrshl.s32       q1, q1, q2  // -(7 + intermediate_bits)
.else
        vrshrn.s32      d0, q0, #7
        vrshrn.s32      d1, q1, #7
        vmov.i16        q3, #PREP_BIAS
.endif
        vmov            q10, q11
.ifb \t
        vqmovun.s32     d0, q0
        vqmovun.s32     d1, q1
.else
        vsub.i16        q0, q0, q3  // PREP_BIAS
.endif
        vmov            q11, q12
        vmov            q12, q13
.ifb \t
        vmin.u16        q0, q0, q3  // bitdepth_max
.endif
        vmov            q13, q14
        vmov            q14, q15
        subs            r10, r10, #1
        vst1.16         {q0}, [r0, :128], r1

        // Advance the vertical position to the next output row.
        add             r6, r6, r4
        bgt             1b

        add             sp, sp, #8
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm

warp
warp t

// void dav2d_emu_edge_16bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
//
// Register use after the prologue: r0 = bw, r1 = bh, r2 = iw, r3 = ih,
// r4 = x, r5 = y, r6 = dst, r7 = dst_stride, r8 = ref, r9 = ref_stride.
// After the extent computation: r4 = left_ext, r11 = right_ext, r5 = top_ext,
// r10 = bottom_ext, r1 = center_h, r2 = center_w, r0 = dst of the first
// center row.
//
// The center rows are written first (v_loop), then the bottom rows are
// replicated from the last center row and the top rows from the first one.
// NOTE(review): stores are issued in whole groups of 16 or 32 pixels, so they
// can extend past the exact widths; presumably the destination buffer is
// padded for this - confirm against the caller.
function emu_edge_16bpc_neon, export=1
        push            {r4-r11,lr}
        // Stack arguments sit above the 36 bytes pushed above.
        ldrd            r4, r5, [sp, #36]
        ldrd            r6, r7, [sp, #44]
        ldrd            r8, r9, [sp, #52]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        // ("bic rN, rN, rN, asr #31" clears rN when it is negative, i.e.
        // max(rN, 0).)
        sub             r12, r3, #1  // ih - 1
        cmp             r5, r3
        sub             lr, r2, #1  // iw - 1
        it              lt
        movlt           r12, r5  // min(y, ih - 1)
        cmp             r4, r2
        bic             r12, r12, r12, asr #31  // max(min(y, ih - 1), 0)
        it              lt
        movlt           lr, r4  // min(x, iw - 1)
        bic             lr, lr, lr, asr #31  // max(min(x, iw - 1), 0)
        mla             r8, r12, r9, r8  // ref += iclip() * stride
        add             r8, r8, lr, lsl #1  // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             r10, r5, r1  // y + bh
        neg             r5, r5  // -y
        sub             r10, r10, r3  // y + bh - ih
        sub             r12, r1, #1  // bh - 1
        cmp             r10, r1
        bic             r5, r5, r5, asr #31  // max(-y, 0)
        it              ge
        movge           r10, r12  // min(y + bh - ih, bh-1)
        cmp             r5, r1
        bic             r10, r10, r10, asr #31  // max(min(y + bh - ih, bh-1), 0)
        it              ge
        movge           r5, r12  // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             r11, r4, r0  // x + bw
        neg             r4, r4  // -x
        sub             r11, r11, r2  // x + bw - iw
        sub             lr, r0, #1  // bw - 1
        cmp             r11, r0
        bic             r4, r4, r4, asr #31  // max(-x, 0)
        it              ge
        movge           r11, lr  // min(x + bw - iw, bw-1)
        cmp             r4, r0
        bic             r11, r11, r11, asr #31  // max(min(x + bw - iw, bw-1), 0)
        it              ge
        movge           r4, lr  // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             r1, r1, r5  // bh - top_ext
        mla             r6, r5, r7, r6
        sub             r2, r0, r4  // bw - left_ext
        sub             r1, r1, r10  // center_h = bh - top_ext - bottom_ext
        sub             r2, r2, r11  // center_w = bw - left_ext - right_ext

        mov             r0, r6  // backup of dst

// Writes the center_h center rows, one row per pass of the "0:" loop:
//   need_left:  fill left_ext pixels at dst with the pixel at ref[0]
//   always:     copy center_w pixels from ref to dst + left_ext
//   need_right: fill right_ext pixels after the center with ref[center_w - 1]
// dst (r6) and ref (r8) advance by their strides after every row.
.macro v_loop need_left, need_right
0:
.if \need_left
        vld1.16         {d0[], d1[]}, [r8]
        mov             r12, r6  // out = dst
        mov             r3, r4
        vmov            q1, q0
1:
        subs            r3, r3, #16
        vst1.16         {q0, q1}, [r12, :128]!
        bgt             1b
.endif
        mov             lr, r8
        add             r12, r6, r4, lsl #1  // out = dst + left_ext
        mov             r3, r2
1:
        vld1.16         {q0, q1}, [lr]!
        subs            r3, r3, #32
        vld1.16         {q2, q3}, [lr]!
        // The 128-bit alignment hint is used only when no left extension
        // offsets the store address.
.if \need_left
        vst1.16         {q0, q1}, [r12]!
        vst1.16         {q2, q3}, [r12]!
.else
        vst1.16         {q0, q1}, [r12, :128]!
        vst1.16         {q2, q3}, [r12, :128]!
.endif
        bgt             1b
.if \need_right
        add             r3, r8, r2, lsl #1  // in + center_w
        sub             r3, r3, #2  // in + center_w - 1
        add             r12, r6, r4, lsl #1  // dst + left_ext
        vld1.16         {d0[], d1[]}, [r3]
        add             r12, r12, r2, lsl #1  // out = dst + left_ext + center_w
        mov             r3, r11
        vmov            q1, q0
1:
        subs            r3, r3, #16
        vst1.16         {q0, q1}, [r12]!
        bgt             1b
.endif

        subs            r1, r1, #1  // center_h--
        add             r6, r6, r7
        add             r8, r8, r9
        bgt             0b
.endm

        // Select one of the four v_loop expansions from left_ext (r4) and
        // right_ext (r11).
        cmp             r4, #0
        beq             2f
        // need_left
        cmp             r11, #0
        beq             3f
        // need_left + need_right
        v_loop          1, 1
        b               5f

2:
        // !need_left
        cmp             r11, #0
        beq             4f
        // !need_left + need_right
        v_loop          0, 1
        b               5f

3:
        // need_left + !need_right
        v_loop          1, 0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0, 0

5:
        // The adds below do not set flags, so the beq still tests
        // bottom_ext == 0.
        cmp             r10, #0
        // Storing the original dst in r0 overwrote bw, recalculate it here
        add             r2, r2, r4  // center_w + left_ext
        add             r2, r2, r11  // bw = center_w + left_ext + right_ext
        beq             3f
        // need_bottom
        // r6 points just below the last center row here. Each pass of the
        // "1:" loop copies a 32-pixel-wide strip of that row into the
        // bottom_ext rows beneath it.
        sub             r8, r6, r7  // ref = dst - stride
        mov             r4, r2
        // The first store advances dst by 32 bytes, so stepping by
        // stride - 32 afterwards lands on the next row.
        sub             r12, r7, #32
1:
        vld1.16         {q0, q1}, [r8, :128]!
        mov             r3, r10
        vld1.16         {q2, q3}, [r8, :128]!
2:
        vst1.16         {q0, q1}, [r6, :128]!
        subs            r3, r3, #1
        vst1.16         {q2, q3}, [r6, :128], r12
        bgt             2b
        mls             r6, r7, r10, r6  // dst -= bottom_ext * stride
        subs            r4, r4, #32  // bw -= 32
        add             r6, r6, #64  // dst += 32
        bgt             1b

3:
        cmp             r5, #0
        beq             3f
        // need_top
        // Same strip-wise replication as above, reading the first center row
        // (r0) and writing the top_ext rows above it.
        mls             r6, r7, r5, r0  // dst = stored_dst - top_ext * stride
        sub             r12, r7, #32
1:
        vld1.16         {q0, q1}, [r0, :128]!
        mov             r3, r5
        vld1.16         {q2, q3}, [r0, :128]!
2:
        vst1.16         {q0, q1}, [r6, :128]!
        subs            r3, r3, #1
        vst1.16         {q2, q3}, [r6, :128], r12
        bgt             2b
        mls             r6, r7, r5, r6  // dst -= top_ext * stride
        subs            r2, r2, #32  // bw -= 32
        add             r6, r6, #64  // dst += 32
        bgt             1b

3:
        pop             {r4-r11,pc}
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/msac.S000066400000000000000000000521661517466257200227170ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #define BUF_POS 0 #define BUF_END 4 #define DIF 8 #define RNG 12 #define CNT 16 #define ALLOW_UPDATE_CDF 20 const coeffs .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 endconst const bits, align=4 .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 endconst .macro vld1_align_n d0, q0, q1, src, n .if \n == 4 vld1.16 {\d0}, [\src, :64] .elseif \n == 8 vld1.16 {\q0}, [\src, :128] .else vld1.16 {\q0, \q1}, [\src, :128] .endif .endm .macro vld1_n d0, q0, q1, src, n .if \n == 4 vld1.16 {\d0}, [\src] .elseif \n == 8 vld1.16 {\q0}, [\src] .else vld1.16 {\q0, \q1}, [\src] .endif .endm .macro vst1_align_n d0, q0, q1, src, n .if \n == 4 vst1.16 {\d0}, [\src, :64] .elseif \n == 8 vst1.16 {\q0}, [\src, :128] .else vst1.16 {\q0, \q1}, [\src, :128] .endif .endm .macro vst1_n d0, q0, q1, src, n .if \n == 4 vst1.16 {\d0}, [\src] .elseif 
\n == 8 vst1.16 {\q0}, [\src] .else vst1.16 {\q0, \q1}, [\src] .endif .endm .macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vshr.u16 \d0, \s0, \s3 .else vshr.u16 \d1, \s1, \s4 .if \n == 16 vshr.u16 \d2, \s2, \s5 .endif .endif .endm .macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vadd.i16 \d0, \s0, \s3 .else vadd.i16 \d1, \s1, \s4 .if \n == 16 vadd.i16 \d2, \s2, \s5 .endif .endif .endm .macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vsub.i16 \d0, \s0, \s3 .else vsub.i16 \d1, \s1, \s4 .if \n == 16 vsub.i16 \d2, \s2, \s5 .endif .endif .endm .macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vand \d0, \s0, \s3 .else vand \d1, \s1, \s4 .if \n == 16 vand \d2, \s2, \s5 .endif .endif .endm .macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vcge.u16 \d0, \s0, \s3 .else vcge.u16 \d1, \s1, \s4 .if \n == 16 vcge.u16 \d2, \s2, \s5 .endif .endif .endm .macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vrhadd.u16 \d0, \s0, \s3 .else vrhadd.u16 \d1, \s1, \s4 .if \n == 16 vrhadd.u16 \d2, \s2, \s5 .endif .endif .endm .macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vshl.s16 \d0, \s0, \s3 .else vshl.s16 \d1, \s1, \s4 .if \n == 16 vshl.s16 \d2, \s2, \s5 .endif .endif .endm .macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vqdmulh.s16 \d0, \s0, \s3 .else vqdmulh.s16 \d1, \s1, \s4 .if \n == 16 vqdmulh.s16 \d2, \s2, \s5 .endif .endif .endm // unsigned dav2d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, // size_t n_symbols); function msac_decode_symbol_adapt4_neon, export=1 .macro decode_update n push {r4-r10,lr} sub sp, sp, #48 add r8, r0, #RNG vld1_align_n d0, q0, q1, r1, \n // cdf vld1.16 {d16[]}, [r8, :16] // rng movrel_local r9, coeffs, 30 vmov.i16 d30, #0x7f00 // 0x7f00 sub r9, r9, r2, lsl #1 vmvn.i16 q14, #0x3f // 0xffc0 add r8, sp, #14 vand d22, d16, d30 // rng & 0x7f00 vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng vand_n d4, q2, q3, 
d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 .if \n > 4 vmov d23, d22 .endif vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add r8, r0, #DIF + 2 vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) .if \n == 4 vmov.i16 d17, #0 .endif vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) add r9, sp, #16 vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) movrel_local r8, bits vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access vmov d21, d20 vld1_align_n q12, q12, q13, r8, \n vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask vadd.i16 d20, d20, d21 // Aggregate mask bits ldr r4, [r0, #ALLOW_UPDATE_CDF] vpadd.i16 d20, d20, d20 lsl r10, r2, #1 vpadd.i16 d20, d20, d20 vmov.u16 r3, d20[0] cmp r4, #0 rbit r3, r3 clz lr, r3 // ret beq L(renorm) // update_cdf ldrh r3, [r1, r10] // count = cdf[n_symbols] vmov.i8 q10, #0xff mvn r12, r2 mov r4, #-4 cmn r12, #3 // set C if n_symbols <= 2 vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 lsr r12, r3, #4 // count >> 4 sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) .if \n == 4 vdup.16 d20, r4 // -rate .else vdup.16 q10, r4 // -rate .endif sub r3, r3, r3, lsr #5 // count - (count == 32) vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 
1 : 0) vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate add r3, r3, #1 // count + (count < 32) vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate vst1_align_n d0, q0, q1, r1, \n strh r3, [r1, r10] .endm decode_update 4 L(renorm): add r8, sp, #16 add r8, r8, lr, lsl #1 ldrh r3, [r8] // v ldrh r4, [r8, #-2] // u ldr r6, [r0, #CNT] ldr r7, [r0, #DIF] sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 sub r7, r7, r3, lsl #16 // dif - (v << 16) L(renorm2): lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] bhs 4f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 subs r5, r5, r4 bhi 6f ldr r8, [r3] // next_bits rsb r5, r6, #16 add r4, r6, #16 // shift_bits = cnt + 16 mvn r8, r8 lsr r5, r5, #3 // num_bytes_read rev r8, r8 // next_bits = bswap(next_bits) lsr r8, r8, r4 // next_bits >>= shift_bits 2: // refill_end add r3, r3, r5 add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] 3: // refill_end2 orr r7, r7, r8 // dif |= next_bits 4: // end str r6, [r0, #CNT] str r7, [r0, #DIF] mov r0, lr add sp, sp, #48 pop {r4-r10,pc} 5: // pad_with_ones add r8, r6, #-240 lsr r8, r8, r8 b 3b 6: // refill_eob cmp r3, r4 bhs 5b ldr r8, [r4, #-4] lsl r5, r5, #3 lsr r8, r8, r5 add r5, r6, #16 mvn r8, r8 sub r4, r4, r3 // num_bytes_left rev r8, r8 lsr r8, r8, r5 rsb r5, r6, #16 lsr r5, r5, #3 cmp r5, r4 it hs movhs r5, r4 b 2b endfunc function msac_decode_symbol_adapt8_neon, export=1 decode_update 8 b L(renorm) endfunc function msac_decode_hi_tok_neon, export=1 push {r4-r10,lr} vld1.16 {d0}, [r1, :64] // cdf add r4, r0, #RNG vmov.i16 d31, #0x7f00 // 0x7f00 movrel_local r5, coeffs, 30-2*3 vmvn.i16 d30, #0x3f // 0xffc0 ldrh r9, [r1, #6] // count = cdf[n_symbols] vld1.16 {d1[]}, [r4, :16] // rng movrel_local r4, bits vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) 
add r5, r0, #DIF + 2 vld1.16 {q8}, [r4, :128] mov r2, #-24 vand d20, d0, d30 // cdf & 0xffc0 ldr r10, [r0, #ALLOW_UPDATE_CDF] vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) sub sp, sp, #48 ldr r6, [r0, #CNT] ldr r7, [r0, #DIF] vmov d3, d2 1: vand d23, d1, d31 // rng & 0x7f00 vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add r12, sp, #14 vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) vmov.i16 d7, #0 vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng add r12, sp, #16 vcge.u16 q2, q1, q3 // c >= v vst1.16 {q3}, [r12] // store v values to allow indexed access vand q9, q2, q8 // One bit per halfword set in the mask vadd.i16 d18, d18, d19 // Aggregate mask bits vpadd.i16 d18, d18, d18 vpadd.i16 d18, d18, d18 vmov.u16 r3, d18[0] cmp r10, #0 add r2, r2, #5 rbit r3, r3 add r8, sp, #16 clz lr, r3 // ret beq 2f // update_cdf vmov.i8 d22, #0xff mov r4, #-5 vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768 sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) vdup.16 d18, r4 // -rate sub r9, r9, r9, lsr #5 // count - (count == 32) vsub.i16 d0, d0, d4 // cdf + (i >= val ? 
1 : 0) vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate add r9, r9, #1 // count + (count < 32) vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate vst1.16 {d0}, [r1, :64] vand d20, d0, d30 // cdf & 0xffc0 strh r9, [r1, #6] 2: add r8, r8, lr, lsl #1 ldrh r3, [r8] // v ldrh r4, [r8, #-2] // u sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 sub r7, r7, r3, lsl #16 // dif - (v << 16) lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d lsl r7, r7, r5 // (dif - (v << 16)) << d str r4, [r0, #RNG] vdup.16 d1, r4 bhs 5f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 subs r5, r5, r4 bhi 7f ldr r8, [r3] // next_bits rsb r5, r6, #16 add r4, r6, #16 // shift_bits = cnt + 16 mvn r8, r8 lsr r5, r5, #3 // num_bytes_read rev r8, r8 // next_bits = bswap(next_bits) lsr r8, r8, r4 // next_bits >>= shift_bits 3: // refill_end add r3, r3, r5 add r6, r6, r5, lsl #3 // cnt += num_bits_read str r3, [r0, #BUF_POS] 4: // refill_end2 orr r7, r7, r8 // dif |= next_bits 5: // end lsl lr, lr, #1 sub lr, lr, #5 lsr r12, r7, #16 adds r2, r2, lr // carry = tok_br < 3 || tok == 15 vdup.16 q1, r12 bcc 1b // loop if !carry add r2, r2, #30 str r6, [r0, #CNT] add sp, sp, #48 str r7, [r0, #DIF] lsr r0, r2, #1 pop {r4-r10,pc} 6: // pad_with_ones add r8, r6, #-240 lsr r8, r8, r8 b 4b 7: // refill_eob cmp r3, r4 bhs 6b ldr r8, [r4, #-4] lsl r5, r5, #3 lsr r8, r8, r5 add r5, r6, #16 mvn r8, r8 sub r4, r4, r3 // num_bytes_left rev r8, r8 lsr r8, r8, r5 rsb r5, r6, #16 lsr r5, r5, #3 cmp r5, r4 it hs movhs r5, r4 b 3b endfunc function msac_decode_bool_equi_neon, export=1 push {r4-r10,lr} ldr r5, [r0, #RNG] ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] bic r4, r5, #0xff // r &= 0xff00 add r4, r4, #8 mov r2, #0 subs r8, r7, r4, lsl #15 // dif - vw lsr r4, r4, #1 // v sub r5, r5, r4 // r - v itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // 
clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) endfunc function msac_decode_bool_neon, export=1 push {r4-r10,lr} ldr r5, [r0, #RNG] ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] lsr r4, r5, #8 // r >> 8 bic r1, r1, #0x3f // f &= ~63 mul r4, r4, r1 mov r2, #0 lsr r4, r4, #7 add r4, r4, #4 // v subs r8, r7, r4, lsl #16 // dif - vw sub r5, r5, r4 // r - v itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) endfunc function msac_decode_bool_adapt_neon, export=1 push {r4-r10,lr} ldr r9, [r1] // cdf[0-1] ldr r5, [r0, #RNG] movw lr, #0xffc0 ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] lsr r4, r5, #8 // r >> 8 and r2, r9, lr // f &= ~63 mul r4, r4, r2 mov r2, #0 lsr r4, r4, #7 add r4, r4, #4 // v subs r8, r7, r4, lsl #16 // dif - vw sub r5, r5, r4 // r - v ldr r10, [r0, #ALLOW_UPDATE_CDF] itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; cmp r10, #0 clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 beq L(renorm2) lsr r2, r9, #16 // count = cdf[1] uxth r9, r9 // cdf[0] sub r3, r2, r2, lsr #5 // count - (count >= 32) lsr r2, r2, #4 // count >> 4 add r10, r3, #1 // count + (count < 32) add r2, r2, #4 // rate = (count >> 4) | 4 sub r9, r9, lr // cdf[0] -= bit sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate sub r9, r9, r3 // cdf[0] strh r9, [r1] strh r10, [r1, #2] b L(renorm2) endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/refmvs.S000066400000000000000000000273251517466257200232750ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Martin Storsjo * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" // void dav2d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv, // int bx4, int bw4, int bh4) function splat_mv_neon, export=1 push {r4, lr} vld1.8 {q3}, [r1] ldr r4, [sp, #8] clz r3, r3 adr lr, L(splat_tbl) sub r3, r3, #26 vext.8 q2, q3, q3, #12 ldr r3, [lr, r3, lsl #2] add r2, r2, r2, lsl #1 vext.8 q0, q2, q3, #4 add r3, lr, r3 vext.8 q1, q2, q3, #8 lsl r2, r2, #2 vext.8 q2, q2, q3, #12 vmov q3, q0 1: ldr r1, [r0], #4 subs r4, r4, #1 add r1, r1, r2 bx r3 .align 2 L(splat_tbl): .word 320f - L(splat_tbl) + CONFIG_THUMB .word 160f - L(splat_tbl) + CONFIG_THUMB .word 80f - L(splat_tbl) + CONFIG_THUMB .word 40f - L(splat_tbl) + CONFIG_THUMB .word 20f - L(splat_tbl) + CONFIG_THUMB .word 10f - L(splat_tbl) + CONFIG_THUMB 10: vst1.8 {d0}, [r1] vstr s2, [r1, #8] bgt 1b pop {r4, pc} 20: vst1.8 {q0}, [r1] vstr d2, [r1, #16] bgt 1b pop {r4, pc} 40: vst1.8 {q0, q1}, [r1]! vst1.8 {q2}, [r1] bgt 1b pop {r4, pc} 320: vst1.8 {q0, q1}, [r1]! vst1.8 {q2, q3}, [r1]! vst1.8 {q1, q2}, [r1]! vst1.8 {q0, q1}, [r1]! vst1.8 {q2, q3}, [r1]! vst1.8 {q1, q2}, [r1]! 160: vst1.8 {q0, q1}, [r1]! vst1.8 {q2, q3}, [r1]! vst1.8 {q1, q2}, [r1]! 80: vst1.8 {q0, q1}, [r1]! vst1.8 {q2, q3}, [r1]! 
vst1.8 {q1, q2}, [r1] bgt 1b pop {r4, pc} endfunc const mv_tbls, align=4 .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 endconst const mask_mult, align=4 .byte 1, 2, 1, 2, 0, 0, 0, 0 endconst // void dav2d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride, // refmvs_block **rr, const uint8_t *ref_sign, // int col_end8, int row_end8, // int col_start8, int row_start8) function save_tmvs_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] vmov.i8 d30, #0 vld1.8 {d31}, [r3] adr r8, L(save_tmvs_tbl) movrel_local lr, mask_mult movrel_local r12, mv_tbls vld1.8 {d29}, [lr] vext.8 d31, d30, d31, #7 // [0, ref_sign] mov r3, #5 mul r1, r1, r3 // stride *= 5 sub r5, r5, r7 // h = row_end8 - row_start8 lsl r7, r7, #1 // row_start8 <<= 1 1: mov r3, #5 mov r11, #12*2 and r9, r7, #30 // (y & 15) * 2 ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2] add r9, r9, #12 // &b[... 
+ 1] mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1] mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1] mla r3, r6, r3, r0 // &rp[x] push {r2,r4,r6} 2: ldrb r11, [r9, #10] // cand_b->bs add lr, r9, #8 vld1.8 {d0, d1}, [r9] // cand_b->mv add r11, r8, r11, lsl #3 vld1.16 {d2[]}, [lr] // cand_b->ref ldrh lr, [r11] // bw8 mov r2, r8 add r9, r9, lr, lsl #1 // cand_b += bw8*2 cmp r9, r10 vmov d4, d0 bge 3f ldrb r2, [r9, #10] // cand_b->bs add lr, r9, #8 vld1.8 {d6, d7}, [r9] // cand_b->mv add r2, r8, r2, lsl #3 vld1.16 {d2[1]}, [lr] // cand_b->ref ldrh lr, [r2] // bw8 add r9, r9, lr, lsl #1 // cand_b += bw8*2 vmov d5, d6 3: vabs.s16 q2, q2 // abs(mv[].xy) vtbl.8 d2, {d31}, d2 // ref_sign[ref] vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12 vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2} vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096 vmovn.i32 d4, q2 // abs() condition to 16 bit vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1] vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0] vmov.u16 r4, d2[0] // Extract case for first block vmov.u16 r6, d2[1] ldr r11, [r11, #4] // Fetch jump table entry ldr r2, [r2, #4] add r4, r12, r4, lsl #4 add r6, r12, r6, lsl #4 vld1.8 {d2, d3}, [r4] // Load permutation table base on case vld1.8 {d4, d5}, [r6] add r11, r8, r11 // Find jump table target add r2, r8, r2 vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block vtbl.8 d17, {d0, d1}, d3 vtbl.8 d18, {d6, d7}, d4 vtbl.8 d19, {d6, d7}, d5 vmov q0, q8 // q1 follows on q0 (q8), with another 3 full repetitions of the pattern. vext.8 q1, q8, q8, #1 vext.8 q10, q9, q9, #1 // q2 ends with 3 complete repetitions of the pattern. 
vext.8 q2, q8, q1, #4 vext.8 q11, q9, q10, #4 blx r11 bge 4f // if (cand_b >= end) vmov q0, q9 vmov q1, q10 vmov q2, q11 cmp r9, r10 blx r2 blt 2b // if (cand_b < end) 4: pop {r2,r4,r6} subs r5, r5, #1 // h-- add r7, r7, #2 // y += 2 add r0, r0, r1 // rp += stride bgt 1b pop {r4-r11,pc} .align 2 L(save_tmvs_tbl): .word 16 * 12 .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB .word 16 * 12 .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB .word 8 * 12 .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB .word 8 * 12 .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB .word 8 * 12 .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB .word 8 * 12 .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB .word 4 * 12 .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB .word 4 * 12 .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB .word 4 * 12 .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB .word 4 * 12 .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB .word 2 * 12 .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB .word 2 * 12 .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB .word 2 * 12 .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB .word 2 * 12 .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB .word 2 * 12 .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB .word 1 * 12 .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB 10: add r4, r3, #4 vst1.32 {d0[0]}, [r3] vst1.8 {d0[4]}, [r4] add r3, r3, #5 bx lr 20: add r4, r3, #8 vst1.8 {d0}, [r3] vst1.16 {d1[0]}, [r4] add r3, r3, #2*5 bx lr 40: add r4, r3, #16 vst1.8 {q0}, [r3] vst1.32 {d2[0]}, [r4] add r3, r3, #4*5 bx lr 80: add r4, r3, #(8*5-16) // This writes 6 full entries plus 2 extra bytes vst1.8 {q0, q1}, [r3] // Write the last few, overlapping with the first write. 
vst1.8 {q2}, [r4] add r3, r3, #8*5 bx lr 160: add r4, r3, #6*5 add r6, r3, #12*5 // This writes 6 full entries plus 2 extra bytes vst1.8 {q0, q1}, [r3] // Write another 6 full entries, slightly overlapping with the first set vst1.8 {q0, q1}, [r4] add r4, r3, #(16*5-16) // Write 8 bytes (one full entry) after the first 12 vst1.8 {d0}, [r6] // Write the last 3 entries vst1.8 {q2}, [r4] add r3, r3, #16*5 bx lr endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/32/util.S000066400000000000000000000141131517466257200227370ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2015 Martin Storsjo * Copyright © 2015 Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #ifndef DAV2D_SRC_ARM_32_UTIL_S #define DAV2D_SRC_ARM_32_UTIL_S #include "config.h" #include "src/arm/asm.S" #include "src/arm/arm-arch.h" .macro v4bx rd #if __ARM_ARCH >= 5 || defined(__ARM_ARCH_4T__) bx \rd #else mov pc, \rd #endif .endm .macro v4blx rd #if __ARM_ARCH >= 5 blx \rd #else mov lr, pc v4bx \rd #endif .endm .macro movrel_local rd, val, offset=0 #if (__ARM_ARCH >= 7 || defined(__ARM_ARCH_6T2__)) && !defined(PIC) movw \rd, #:lower16:\val+\offset movt \rd, #:upper16:\val+\offset #else ldr \rd, 90001f b 90002f 90001: .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB) 90002: add \rd, \rd, pc #endif .endm .macro movrel rd, val, offset=0 #if defined(PIC) && defined(__APPLE__) ldr \rd, 1f b 2f 1: .word 3f - (2f + 8 - 4 * CONFIG_THUMB) 2: ldr \rd, [pc, \rd] .if \offset < 0 sub \rd, \rd, #-(\offset) .elseif \offset > 0 add \rd, \rd, #\offset .endif .non_lazy_symbol_pointer 3: .indirect_symbol \val .word 0 .text #else movrel_local \rd, \val, \offset #endif .endm // This macro clobbers r7 (and r12 on windows) and stores data at the // bottom of the stack; sp is the start of the space allocated that // the caller can use. 
.macro sub_sp_align space #if CONFIG_THUMB mov r7, sp and r7, r7, #15 #else and r7, sp, #15 #endif sub sp, sp, r7 // Now the stack is aligned, store the amount of adjustment back // on the stack, as we don't want to waste a register as frame // pointer. str r7, [sp, #-16]! #ifdef _WIN32 .if \space > 8192 // Here, we'd need to touch two (or more) pages while decrementing // the stack pointer. .error "sub_sp_align doesn't support values over 8K at the moment" .elseif \space > 4096 sub r7, sp, #4096 ldr r12, [r7] sub r7, r7, #(\space - 4096) mov sp, r7 .else sub sp, sp, #\space .endif #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif #endif .endm .macro add_sp_align space .if \space >= 4096 add sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 add sp, sp, #(\space)%4096 .endif ldr r7, [sp], #16 // Add back the original stack adjustment add sp, sp, r7 .endm .macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \q0, \q2 vtrn.32 \q1, \q3 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.16 \r4, \r6 vtrn.16 \r5, \r7 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 vtrn.8 \r4, \r5 vtrn.8 \r6, \r7 .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 vswp \d0, \d4 vswp \d1, \d5 vswp \d2, \d6 vswp \d3, \d7 vtrn.32 \r0, \r2 vtrn.32 \r1, \r3 vtrn.32 \r4, \r6 vtrn.32 \r5, \r7 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 vtrn.16 \r4, \r5 vtrn.16 \r6, \r7 .endm .macro transpose_4x8b q0, q1, r0, r1, r2, r3 vtrn.16 \q0, \q1 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 .endm .macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vswp \r1, \r4 // vtrn.64 \q0, \q2 vswp \r3, \r6 // vtrn.64 \q1, \q3 vtrn.32 \q0, \q1 vtrn.32 \q2, \q3 .endm .macro transpose_4x4h q0, q1, r0, r1, r2, r3 vtrn.32 \q0, \q1 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 .endm .macro transpose_4x8h r0, r1, r2, r3 vtrn.32 \r0, \r2 vtrn.32 \r1, \r3 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 .endm #endif /* DAV2D_SRC_ARM_32_UTIL_S */ 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/000077500000000000000000000000001517466257200216435ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/cdef.S000066400000000000000000000452351517466257200227010ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #2 sub \s2, \s2, #2 tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr s1, [\s1, #\w] ldr \rn\()2, [\s2] ldr s3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str \rw\()0, [x0] str d1, [x0, #2*\w] add x0, x0, #2*\stride str \rw\()2, [x0] str d3, [x0, #2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr h1, [\s1, #\w] ldr \rn\()2, [\s2] ldr h3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str \rw\()0, [x0] str s1, [x0, #2*\w] str s31, [x0, #2*\w+4] add x0, x0, #2*\stride str \rw\()2, [x0] str s3, [x0, #2*\w] str s31, [x0, #2*\w+4] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr h1, [\s1, #\w] ldr \rn\()2, [\s2] ldr h3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str s31, [x0] stur \rw\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \rw\()2, [x0, #4] str s3, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr \rn\()1, [\s2] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s31, [x0] stur \rw\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \rw\()1, [x0, #4] str s31, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride .endif 3: .endm .macro load_n_incr dst, src, incr, w .if \w == 4 ld1 {\dst\().s}[0], [\src], \incr .else ld1 {\dst\().8b}, [\src], \incr .endif .endm // void dav2d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const 
top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func w, stride, rn, rw function cdef_padding\w\()_8bpc_neon, export=1 cmp w7, #0xf // fully edged b.eq cdef_padding\w\()_edged_8bpc_neon movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif b 3f 1: // CDEF_HAVE_TOP add x9, x4, x2 pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 // Middle section 3: tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b str s0, [x0] stur \rw\()1, [x0, #4] str s2, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 {v0.h}[0], [x3], #2 load_n_incr v1, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s0, [x0] stur \rw\()1, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b b 3f 2: tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr h1, [x1, #\w] load_n_incr v0, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s31, [x0] stur \rw\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr v0, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b str s31, [x0] stur \rw\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b 3: tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif ret 1: // CDEF_HAVE_BOTTOM add x9, x5, x2 pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 endfunc .endm padding_func 8, 16, d, q padding_func 4, 8, s, d // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, // ptrdiff_t 
src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg function cdef_padding\w\()_edged_8bpc_neon, export=1 sub x4, x4, #2 sub x5, x5, #2 sub x0, x0, #(2*\stride+2) .if \w == 4 ldr d0, [x4] ldr d1, [x4, x2] st1 {v0.8b, v1.8b}, [x0], #16 .else add x9, x4, x2 ldr d0, [x4] ldr s1, [x4, #8] ldr d2, [x9] ldr s3, [x9, #8] str d0, [x0] str s1, [x0, #8] str d2, [x0, #\stride] str s3, [x0, #\stride+8] add x0, x0, #2*\stride .endif 0: ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w subs w6, w6, #1 str h0, [x0] stur \reg\()1, [x0, #2] str h2, [x0, #2+\w] add x0, x0, #\stride b.gt 0b .if \w == 4 ldr d0, [x5] ldr d1, [x5, x2] st1 {v0.8b, v1.8b}, [x0], #16 .else add x9, x5, x2 ldr d0, [x5] ldr s1, [x5, #8] ldr d2, [x9] ldr s3, [x9, #8] str d0, [x0] str s1, [x0, #8] str d2, [x0, #\stride] str s3, [x0, #\stride+8] .endif ret endfunc .endm padding_func_edged 8, 16, d padding_func_edged 4, 8, s tables filter 8, 8 filter 4, 8 find_dir 8 .macro load_px_8 d1, d2, w .if \w == 8 add x6, x2, w9, sxtb // x + off sub x9, x2, w9, sxtb // x - off ld1 {\d1\().d}[0], [x6] // p0 add x6, x6, #16 // += stride ld1 {\d2\().d}[0], [x9] // p1 add x9, x9, #16 // += stride ld1 {\d1\().d}[1], [x6] // p0 ld1 {\d2\().d}[1], [x9] // p0 .else add x6, x2, w9, sxtb // x + off sub x9, x2, w9, sxtb // x - off ld1 {\d1\().s}[0], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[0], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[1], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[1], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[2], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[2], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[3], [x6] // p0 ld1 {\d2\().s}[3], [x9] // p1 .endif .endm .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min .if \min umin v3.16b, v3.16b, \s1\().16b umax v4.16b, v4.16b, \s1\().16b umin v3.16b, v3.16b, 
\s2\().16b umax v4.16b, v4.16b, \s2\().16b .endif uabd v16.16b, v0.16b, \s1\().16b // abs(diff) uabd v20.16b, v0.16b, \s2\().16b // abs(diff) ushl v17.16b, v16.16b, \shift // abs(diff) >> shift ushl v21.16b, v20.16b, \shift // abs(diff) >> shift uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) cmhi v18.16b, v0.16b, \s1\().16b // px > p0 cmhi v22.16b, v0.16b, \s2\().16b // px > p1 umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip) umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) dup v19.16b, \tap // taps[k] neg v16.16b, v17.16b // -imin() neg v20.16b, v21.16b // -imin() bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint8_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h); .macro filter_func_8 w, pri, sec, min, suffix function cdef_filter\w\suffix\()_edged_8bpc_neon .if \pri movrel x8, pri_taps and w9, w3, #1 add x8, x8, w9, uxtw #1 .endif movrel x9, directions\w add x5, x9, w5, uxtw #1 movi v30.8b, #7 dup v28.8b, w6 // damping .if \pri dup v25.16b, w3 // threshold .endif .if \sec dup v27.16b, w4 // threshold .endif trn1 v24.8b, v25.8b, v27.8b clz v24.8b, v24.8b // clz(threshold) sub v24.8b, v30.8b, v24.8b // ulog2(threshold) uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) neg v24.8b, v24.8b // -shift .if \sec dup v26.16b, v24.b[1] .endif .if \pri dup v24.16b, v24.b[0] .endif 1: .if \w == 8 add x12, x2, #16 ld1 {v0.d}[0], [x2] // px ld1 {v0.d}[1], [x12] // px .else add x12, x2, #1*8 add x13, x2, #2*8 add x14, x2, #3*8 ld1 {v0.s}[0], [x2] // px ld1 {v0.s}[1], [x12] // px ld1 {v0.s}[2], [x13] // px 
ld1 {v0.s}[3], [x14] // px .endif // We need 9-bits or two 8-bit accululators to fit the sum. // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. // Start sum at -1 instead of 0 to help handle rounding later. movi v1.16b, #255 // sum movi v2.16b, #0 // sum .if \min mov v3.16b, v0.16b // min mov v4.16b, v0.16b // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov w11, #2 // sec_taps[0] 2: .if \pri ldrb w9, [x5] // off1 load_px_8 v5, v6, \w .endif .if \sec add x5, x5, #4 // +2*2 ldrb w9, [x5] // off2 load_px_8 v28, v29, \w .endif .if \pri ldrb w10, [x8] // *pri_taps handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min .endif .if \sec add x5, x5, #8 // +2*4 ldrb w9, [x5] // off3 load_px_8 v5, v6, \w handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; .else add x5, x5, #1 // x5 += 1 .endif subs w11, w11, #1 // sec_tap-- (value) .if \pri add x8, x8, #1 // pri_taps++ (pointer) .endif b.ne 2b // Perform halving adds since the value won't fit otherwise. // To handle the offset for negative values, use both halving w/ and w/o rounding. srhadd v5.16b, v1.16b, v2.16b // sum >> 1 shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 cmlt v1.16b, v5.16b, #0 // sum < 0 bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 usqadd v0.16b, v1.16b // px + (8 + sum ...) 
>> 4 .if \min umin v0.16b, v0.16b, v4.16b umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) .endif .if \w == 8 st1 {v0.d}[0], [x0], x1 add x2, x2, #2*16 // tmp += 2*tmp_stride subs w7, w7, #2 // h -= 2 st1 {v0.d}[1], [x0], x1 .else st1 {v0.s}[0], [x0], x1 add x2, x2, #4*8 // tmp += 4*tmp_stride st1 {v0.s}[1], [x0], x1 subs w7, w7, #4 // h -= 4 st1 {v0.s}[2], [x0], x1 st1 {v0.s}[3], [x0], x1 .endif // Reset pri_taps and directions back to the original point sub x5, x5, #2 .if \pri sub x8, x8, #2 .endif b.gt 1b ret endfunc .endm .macro filter_8 w filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec .endm filter_8 8 filter_8 4 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/cdef16.S000066400000000000000000000170211517466257200230400ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" .macro pad_top_bot_16 s1, s2, w, stride, reg, ret tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #4 sub \s2, \s2, #4 tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr d1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr d3, [\s2, #2*\w] str \reg\()0, [x0] str d1, [x0, #2*\w] add x0, x0, #2*\stride str \reg\()2, [x0] str d3, [x0, #2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr s1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr s3, [\s2, #2*\w] str \reg\()0, [x0] str s1, [x0, #2*\w] str s31, [x0, #2*\w+4] add x0, x0, #2*\stride str \reg\()2, [x0] str s3, [x0, #2*\w] str s31, [x0, #2*\w+4] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr s1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr s3, [\s2, #2*\w] str s31, [x0] stur \reg\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \reg\()2, [x0, #4] str s3, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr \reg\()1, [\s2] str s31, [x0] stur \reg\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \reg\()1, [x0, #4] str s31, [x0, #4+2*\w] .if \ret ret .else add x0, x0, 
#2*\stride .endif 3: .endm .macro load_n_incr_16 dst, src, incr, w .if \w == 4 ld1 {\dst\().4h}, [\src], \incr .else ld1 {\dst\().8h}, [\src], \incr .endif .endm // void dav2d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_16 w, stride, reg function cdef_padding\w\()_16bpc_neon, export=1 movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif b 3f 1: // CDEF_HAVE_TOP add x9, x4, x2 pad_top_bot_16 x4, x9, \w, \stride, \reg, 0 // Middle section 3: tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.s}[0], [x3], #4 ldr s2, [x1, #2*\w] load_n_incr_16 v1, x1, x2, \w subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s2, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 {v0.s}[0], [x3], #4 load_n_incr_16 v1, x1, x2, \w subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b b 3f 2: tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr s1, [x1, #2*\w] load_n_incr_16 v0, x1, x2, \w subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr_16 v0, x1, x2, \w subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b 3: tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif ret 1: // CDEF_HAVE_BOTTOM add x9, x5, x2 pad_top_bot_16 x5, x9, \w, \stride, \reg, 1 endfunc .endm padding_func_16 8, 16, q 
padding_func_16 4, 8, d tables filter 8, 16 filter 4, 16 find_dir 16 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/cdef_tmpl.S000066400000000000000000000502601517466257200237270ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro dir_table w, stride const directions\w .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 .byte 1 * \stride + 0, 2 * \stride + 0 .byte 1 * \stride + 0, 2 * \stride - 1 // Repeated, to avoid & 7 .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 endconst .endm .macro tables dir_table 8, 16 dir_table 4, 8 const pri_taps .byte 4, 2, 3, 3 endconst .endm .macro load_px d1, d2, w .if \w == 8 add x6, x2, w9, sxtb #1 // x + off sub x9, x2, w9, sxtb #1 // x - off ld1 {\d1\().8h}, [x6] // p0 ld1 {\d2\().8h}, [x9] // p1 .else add x6, x2, w9, sxtb #1 // x + off sub x9, x2, w9, sxtb #1 // x - off ld1 {\d1\().4h}, [x6] // p0 add x6, x6, #2*8 // += stride ld1 {\d2\().4h}, [x9] // p1 add x9, x9, #2*8 // += stride ld1 {\d1\().d}[1], [x6] // p0 ld1 {\d2\().d}[1], [x9] // p1 .endif .endm .macro handle_pixel s1, s2, thresh_vec, shift, tap, min .if \min umin v2.8h, v2.8h, \s1\().8h smax v3.8h, v3.8h, \s1\().8h umin v2.8h, v2.8h, \s2\().8h smax v3.8h, v3.8h, \s2\().8h .endif uabd v16.8h, v0.8h, \s1\().8h // abs(diff) uabd v20.8h, v0.8h, \s2\().8h // abs(diff) ushl v17.8h, v16.8h, \shift // abs(diff) >> shift ushl v21.8h, v20.8h, \shift // abs(diff) >> shift uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift)) uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift)) sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px neg v16.8h, v17.8h // -clip neg v20.8h, v21.8h // -clip smin v18.8h, v18.8h, v17.8h // imin(diff, clip) smin v22.8h, v22.8h, v21.8h // imin(diff, clip) 
dup v19.8h, \tap // taps[k] smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip) smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip) mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain() mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain() .endm // void dav2d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func w, bpc, pri, sec, min, suffix function cdef_filter\w\suffix\()_\bpc\()bpc_neon .if \bpc == 8 ldr w8, [sp] // edges cmp w8, #0xf b.eq cdef_filter\w\suffix\()_edged_8bpc_neon .endif .if \pri .if \bpc == 16 ldr w9, [sp, #8] // bitdepth_max clz w9, w9 sub w9, w9, #24 // -bitdepth_min_8 neg w9, w9 // bitdepth_min_8 .endif movrel x8, pri_taps .if \bpc == 16 lsr w9, w3, w9 // pri_strength >> bitdepth_min_8 and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1 .else and w9, w3, #1 .endif add x8, x8, w9, uxtw #1 .endif movrel x9, directions\w add x5, x9, w5, uxtw #1 movi v30.4h, #15 dup v28.4h, w6 // damping .if \pri dup v25.8h, w3 // threshold .endif .if \sec dup v27.8h, w4 // threshold .endif trn1 v24.4h, v25.4h, v27.4h clz v24.4h, v24.4h // clz(threshold) sub v24.4h, v30.4h, v24.4h // ulog2(threshold) uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold)) neg v24.4h, v24.4h // -shift .if \sec dup v26.8h, v24.h[1] .endif .if \pri dup v24.8h, v24.h[0] .endif 1: .if \w == 8 ld1 {v0.8h}, [x2] // px .else add x12, x2, #2*8 ld1 {v0.4h}, [x2] // px ld1 {v0.d}[1], [x12] // px .endif movi v1.8h, #0 // sum .if \min mov v2.16b, v0.16b // min mov v3.16b, v0.16b // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. 
mov w11, #2 // sec_taps[0] 2: .if \pri ldrb w9, [x5] // off1 load_px v4, v5, \w .endif .if \sec add x5, x5, #4 // +2*2 ldrb w9, [x5] // off2 load_px v6, v7, \w .endif .if \pri ldrb w10, [x8] // *pri_taps handle_pixel v4, v5, v25.8h, v24.8h, w10, \min .endif .if \sec add x5, x5, #8 // +2*4 ldrb w9, [x5] // off3 load_px v4, v5, \w handle_pixel v6, v7, v27.8h, v26.8h, w11, \min handle_pixel v4, v5, v27.8h, v26.8h, w11, \min sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; .else add x5, x5, #1 // x5 += 1 .endif subs w11, w11, #1 // sec_tap-- (value) .if \pri add x8, x8, #1 // pri_taps++ (pointer) .endif b.ne 2b cmlt v4.8h, v1.8h, #0 // -(sum < 0) add v1.8h, v1.8h, v4.8h // sum - (sum < 0) srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4 .if \min smin v0.8h, v0.8h, v3.8h smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max) .endif .if \bpc == 8 xtn v0.8b, v0.8h .endif .if \w == 8 add x2, x2, #2*16 // tmp += tmp_stride subs w7, w7, #1 // h-- .if \bpc == 8 st1 {v0.8b}, [x0], x1 .else st1 {v0.8h}, [x0], x1 .endif .else .if \bpc == 8 st1 {v0.s}[0], [x0], x1 .else st1 {v0.d}[0], [x0], x1 .endif add x2, x2, #2*16 // tmp += 2*tmp_stride subs w7, w7, #2 // h -= 2 .if \bpc == 8 st1 {v0.s}[1], [x0], x1 .else st1 {v0.d}[1], [x0], x1 .endif .endif // Reset pri_taps and directions back to the original point sub x5, x5, #2 .if \pri sub x8, x8, #2 .endif b.gt 1b ret endfunc .endm .macro filter w, bpc filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec function cdef_filter\w\()_\bpc\()bpc_neon, export=1 cbnz w3, 1f // pri_strength b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec 1: cbnz w4, 1f // sec_strength b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri 1: b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec endfunc .endm const div_table .short 840, 420, 280, 210, 168, 140, 120, 105 endconst const alt_fact 
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 endconst .macro cost_alt d1, d2, s1, s2, s3, s4 smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n] smull2 v23.4s, \s1\().8h, \s1\().8h smull v24.4s, \s2\().4h, \s2\().4h smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n] smull2 v26.4s, \s3\().8h, \s3\().8h smull v27.4s, \s4\().4h, \s4\().4h mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact mla v22.4s, v23.4s, v30.4s mla v22.4s, v24.4s, v31.4s mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact mla v25.4s, v26.4s, v30.4s mla v25.4s, v27.4s, v31.4s addv \d1, v22.4s // *cost_ptr addv \d2, v25.4s // *cost_ptr .endm .macro find_best s1, s2, s3 .ifnb \s2 mov w5, \s2\().s[0] .endif cmp w4, w1 // cost[n] > best_cost csel w0, w3, w0, gt // best_dir = n csel w1, w4, w1, gt // best_cost = cost[n] .ifnb \s2 add w3, w3, #1 // n++ cmp w5, w1 // cost[n] > best_cost mov w4, \s3\().s[0] csel w0, w3, w0, gt // best_dir = n csel w1, w5, w1, gt // best_cost = cost[n] add w3, w3, #1 // n++ .endif .endm // Steps for loading and preparing each row .macro dir_load_step1 s1, bpc .if \bpc == 8 ld1 {\s1\().8b}, [x0], x1 .else ld1 {\s1\().8h}, [x0], x1 .endif .endm .macro dir_load_step2 s1, bpc .if \bpc == 8 usubl \s1\().8h, \s1\().8b, v31.8b .else ushl \s1\().8h, \s1\().8h, v8.8h .endif .endm .macro dir_load_step3 s1, bpc // Nothing for \bpc == 8 .if \bpc != 8 sub \s1\().8h, \s1\().8h, v31.8h .endif .endm // int dav2d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc function cdef_find_dir_\bpc\()bpc_neon, export=1 .if \bpc == 16 str d8, [sp, #-0x10]! 
clz w3, w3 // clz(bitdepth_max) sub w3, w3, #24 // -bitdepth_min_8 dup v8.8h, w3 .endif sub sp, sp, #32 // cost mov w3, #8 .if \bpc == 8 movi v31.16b, #128 .else movi v31.8h, #128 .endif movi v30.16b, #0 movi v1.8h, #0 // v0-v1 sum_diag[0] movi v3.8h, #0 // v2-v3 sum_diag[1] movi v5.8h, #0 // v4-v5 sum_hv[0-1] movi v7.8h, #0 // v6-v7 sum_alt[0] dir_load_step1 v26, \bpc // Setup first row early movi v17.8h, #0 // v16-v17 sum_alt[1] movi v18.8h, #0 // v18-v19 sum_alt[2] dir_load_step2 v26, \bpc movi v19.8h, #0 dir_load_step3 v26, \bpc movi v21.8h, #0 // v20-v21 sum_alt[3] .irpc i, 01234567 addv h25, v26.8h // [y] rev64 v27.8h, v26.8h addp v28.8h, v26.8h, v30.8h // [(x >> 1)] add v5.8h, v5.8h, v26.8h // sum_hv[1] ext v27.16b, v27.16b, v27.16b, #8 // [-x] rev64 v29.4h, v28.4h // [-(x >> 1)] ins v4.h[\i], v25.h[0] // sum_hv[0] .if \i < 6 ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) add v18.8h, v18.8h, v22.8h // sum_alt[2] add v19.4h, v19.4h, v23.4h // sum_alt[2] .else add v18.8h, v18.8h, v26.8h // sum_alt[2] .endif .if \i == 0 mov v20.16b, v26.16b // sum_alt[3] .elseif \i == 1 add v20.8h, v20.8h, v26.8h // sum_alt[3] .else ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) add v20.8h, v20.8h, v24.8h // sum_alt[3] add v21.4h, v21.4h, v25.4h // sum_alt[3] .endif .if \i == 0 mov v0.16b, v26.16b // sum_diag[0] dir_load_step1 v26, \bpc mov v2.16b, v27.16b // sum_diag[1] dir_load_step2 v26, \bpc mov v6.16b, v28.16b // sum_alt[0] dir_load_step3 v26, \bpc mov v16.16b, v29.16b // sum_alt[1] .else ext v22.16b, v30.16b, v26.16b, #(16-2*\i) ext v23.16b, v26.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v27.16b, #(16-2*\i) ext v25.16b, v27.16b, v30.16b, #(16-2*\i) .if \i != 7 // Nothing to load for the final row dir_load_step1 v26, \bpc // Start setting up the next row early. 
.endif add v0.8h, v0.8h, v22.8h // sum_diag[0] add v1.8h, v1.8h, v23.8h // sum_diag[0] add v2.8h, v2.8h, v24.8h // sum_diag[1] add v3.8h, v3.8h, v25.8h // sum_diag[1] .if \i != 7 dir_load_step2 v26, \bpc .endif ext v22.16b, v30.16b, v28.16b, #(16-2*\i) ext v23.16b, v28.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v29.16b, #(16-2*\i) ext v25.16b, v29.16b, v30.16b, #(16-2*\i) .if \i != 7 dir_load_step3 v26, \bpc .endif add v6.8h, v6.8h, v22.8h // sum_alt[0] add v7.4h, v7.4h, v23.4h // sum_alt[0] add v16.8h, v16.8h, v24.8h // sum_alt[1] add v17.4h, v17.4h, v25.4h // sum_alt[1] .endif .endr movi v31.4s, #105 smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0] smlal2 v26.4s, v4.8h, v4.8h smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1] smlal2 v27.4s, v5.8h, v5.8h mul v26.4s, v26.4s, v31.4s // cost[2] *= 105 mul v27.4s, v27.4s, v31.4s // cost[6] *= 105 addv s4, v26.4s // cost[2] addv s5, v27.4s // cost[6] rev64 v1.8h, v1.8h rev64 v3.8h, v3.8h ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n] ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n] str s4, [sp, #2*4] // cost[2] str s5, [sp, #6*4] // cost[6] movrel x4, div_table ld1 {v31.8h}, [x4] smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0] smull2 v23.4s, v0.8h, v0.8h smlal v22.4s, v1.4h, v1.4h smlal2 v23.4s, v1.8h, v1.8h smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1] smull2 v25.4s, v2.8h, v2.8h smlal v24.4s, v3.4h, v3.4h smlal2 v25.4s, v3.8h, v3.8h uxtl v30.4s, v31.4h // div_table uxtl2 v31.4s, v31.8h mul v22.4s, v22.4s, v30.4s // cost[0] mla v22.4s, v23.4s, v31.4s // cost[0] mul v24.4s, v24.4s, v30.4s // cost[4] mla v24.4s, v25.4s, v31.4s // cost[4] addv s0, v22.4s // cost[0] addv s2, v24.4s // cost[4] movrel x5, alt_fact ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105 str s0, [sp, #0*4] // cost[0] str s2, [sp, #4*4] // cost[4] uxtl v29.4s, v29.4h // div_table[2*m+1] + 105 uxtl v30.4s, v30.4h uxtl v31.4s, v31.4h cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3] cost_alt s18, s20, v18, 
v19, v20, v21 // cost[5], cost[7] str s6, [sp, #1*4] // cost[1] str s16, [sp, #3*4] // cost[3] mov w0, #0 // best_dir mov w1, v0.s[0] // best_cost mov w3, #1 // n str s18, [sp, #5*4] // cost[5] str s20, [sp, #7*4] // cost[7] mov w4, v6.s[0] find_best v6, v4, v16 find_best v16, v2, v18 find_best v18, v5, v20 find_best v20 eor w3, w0, #4 // best_dir ^4 ldr w4, [sp, w3, uxtw #2] sub w1, w1, w4 // best_cost - cost[best_dir ^ 4] lsr w1, w1, #10 str w1, [x2] // *var add sp, sp, #32 .if \bpc == 16 ldr d8, [sp], 0x10 .endif ret endfunc .endm dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/filmgrain.S000066400000000000000000002173721517466257200237530ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 #define SUB_GRAIN_WIDTH 44 #define SUB_GRAIN_HEIGHT 38 .macro increment_seed steps, shift=1 lsr w11, w2, #3 lsr w12, w2, #12 lsr w13, w2, #1 eor w11, w2, w11 // (r >> 0) ^ (r >> 3) eor w12, w12, w13 // (r >> 12) ^ (r >> 1) eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) .if \shift lsr w2, w2, #\steps .endif and w11, w11, #((1 << \steps) - 1) // bit .if \shift orr w2, w2, w11, lsl #(16 - \steps) // *state .else orr w2, w2, w11, lsl #16 // *state .endif .endm .macro read_rand dest, bits, age ubfx \dest, x2, #16 - \bits - \age, #\bits .endm .macro read_shift_rand dest, bits ubfx \dest, x2, #17 - \bits, #\bits lsr w2, w2, #1 .endm // special calling convention: // w2 holds seed // x3 holds dav2d_gaussian_sequence // clobbers x11-x15 // returns in v0.8h function get_gaussian_neon increment_seed 4 read_rand x14, 11, 3 read_rand x15, 11, 2 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] read_rand x14, 11, 1 ld1 {v0.h}[1], [x15] add x14, x3, x14, lsl #1 read_rand x15, 11, 0 increment_seed 4 add x15, x3, x15, lsl #1 ld1 {v0.h}[2], [x14] read_rand x14, 11, 3 ld1 {v0.h}[3], [x15] add x14, x3, x14, lsl #1 read_rand x15, 11, 2 ld1 {v0.h}[4], [x14] add x15, x3, x15, lsl #1 read_rand x14, 11, 1 ld1 {v0.h}[5], [x15] read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[6], 
[x14] ld1 {v0.h}[7], [x15] ret endfunc .macro get_grain_row r0, r1, r2, r3, r4, r5 bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn \r0\().8b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn2 \r0\().16b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn \r1\().8b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn2 \r1\().16b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn \r2\().8b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn2 \r2\().16b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn \r3\().8b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn2 \r3\().16b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn \r4\().8b, \r5\().8h bl get_gaussian_neon srshl \r5\().8h, v0.8h, v31.8h xtn2 \r4\().16b, \r5\().8h increment_seed 2 read_rand x14, 11, 1 read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {\r5\().h}[0], [x14] ld1 {\r5\().h}[1], [x15] srshl v0.4h, \r5\().4h, v31.4h xtn \r5\().8b, v0.8h .endm .macro store_grain_row r0, r1, r2, r3, r4, r5 st1 {\r0\().16b,\r1\().16b}, [x0], #32 st1 {\r2\().16b,\r3\().16b}, [x0], #32 st1 {\r4\().16b}, [x0], #16 st1 {\r5\().h}[0], [x0], #2 .endm .macro get_grain_row_44 r0, r1, r2 bl get_gaussian_neon srshl \r2\().8h, v0.8h, v31.8h xtn \r0\().8b, \r2\().8h bl get_gaussian_neon srshl \r2\().8h, v0.8h, v31.8h xtn2 \r0\().16b, \r2\().8h bl get_gaussian_neon srshl \r2\().8h, v0.8h, v31.8h xtn \r1\().8b, \r2\().8h bl get_gaussian_neon srshl \r2\().8h, v0.8h, v31.8h xtn2 \r1\().16b, \r2\().8h bl get_gaussian_neon srshl \r2\().8h, v0.8h, v31.8h xtn \r2\().8b, \r2\().8h increment_seed 4 read_rand x14, 11, 3 read_rand x15, 11, 2 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] read_rand x14, 11, 1 ld1 {v0.h}[1], [x15] read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[2], [x14] ld1 {v0.h}[3], [x15] srshl v0.4h, v0.4h, 
v31.4h xtn2 \r2\().16b, v0.8h .endm .macro store_grain_row_44 r0, r1, r2 st1 {\r0\().16b,\r1\().16b}, [x0], #32 st1 {\r2\().16b}, [x0] add x0, x0, #GRAIN_WIDTH-32 .endm function get_grain_2_neon increment_seed 2 read_rand x14, 11, 1 read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] ld1 {v0.h}[1], [x15] srshl v0.4h, v0.4h, v31.4h xtn v0.8b, v0.8h ret endfunc .macro get_grain_2 dst bl get_grain_2_neon .ifnc \dst, v0 mov \dst\().8b, v0.8b .endif .endm // w15 holds the number of entries to produce // w14, w16 and w17 hold the previous output entries // v0 holds the vector of produced entries // v1 holds the input vector of sums from above .macro output_lag n function output_lag\n\()_neon 1: read_shift_rand x13, 11 mov w11, v1.s[0] ldrsh w12, [x3, x13, lsl #1] ext v0.16b, v0.16b, v0.16b, #1 .if \n == 1 madd w11, w14, w4, w11 // sum (above) + *coeff * prev output .elseif \n == 2 madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 madd w11, w14, w17, w11 // += *coeff * prev output 2 mov w16, w14 .else madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 madd w11, w14, w21, w11 // += *coeff * prev output 3 mov w17, w16 mov w16, w14 .endif add w14, w11, w8 // 1 << (ar_coeff_shift - 1) add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) asr w14, w14, w7 // >> ar_coeff_shift asr w12, w12, w9 // >> (4 + grain_scale_shift) add w14, w14, w12 cmp w14, w5 csel w14, w14, w5, le cmp w14, w6 csel w14, w14, w6, ge subs w15, w15, #1 ext v1.16b, v1.16b, v1.16b, #4 ins v0.b[15], w14 b.gt 1b ret endfunc .endm output_lag 1 output_lag 2 output_lag 3 function sum_lag1_above_neon smull v2.8h, v3.8b, v28.8b smull2 v3.8h, v3.16b, v28.16b smull v4.8h, v0.8b, v27.8b smull2 v5.8h, v0.16b, v27.16b smull v6.8h, v1.8b, v29.8b smull2 v7.8h, v1.16b, v29.16b saddl v0.4s, v2.4h, v4.4h saddl2 v1.4s, v2.8h, v4.8h saddl v2.4s, v3.4h, v5.4h saddl2 v3.4s, v3.8h, v5.8h saddw v4.4s, 
v0.4s, v6.4h saddw2 v5.4s, v1.4s, v6.8h saddw v6.4s, v2.4s, v7.4h saddw2 v7.4s, v3.4s, v7.8h ret endfunc .macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff bl sum_\lag\()_above_neon .ifc \type, uv_420 add x12, x19, #GRAIN_WIDTH ld1 {v22.16b, v23.16b}, [x19], #32 ld1 {v24.16b, v25.16b}, [x12] saddlp v22.8h, v22.16b saddlp v23.8h, v23.16b saddlp v24.8h, v24.16b saddlp v25.8h, v25.16b add v22.8h, v22.8h, v24.8h add v23.8h, v23.8h, v25.8h rshrn v0.8b, v22.8h, #2 rshrn2 v0.16b, v23.8h, #2 .endif .ifc \type, uv_422 ld1 {v22.16b, v23.16b}, [x19], #32 saddlp v22.8h, v22.16b saddlp v23.8h, v23.16b rshrn v0.8b, v22.8h, #1 rshrn2 v0.16b, v23.8h, #1 .endif .ifc \type, uv_444 ld1 {v0.16b}, [x19], #16 .endif .if \uv_layout .ifnb \uv_coeff dup v1.16b, \uv_coeff smull v2.8h, v0.8b, v1.8b smull2 v3.8h, v0.16b, v1.16b .else smull v2.8h, v0.8b, v30.8b smull2 v3.8h, v0.16b, v30.16b .endif saddw v4.4s, v4.4s, v2.4h saddw2 v5.4s, v5.4s, v2.8h saddw v6.4s, v6.4s, v3.4h saddw2 v7.4s, v7.4s, v3.8h .endif .if \uv_layout && \elems == 16 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 444 && \elems == 15 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 422 && \elems == 9 b sum_\lag\()_uv_420_\edge\()_start .else sum_\lag\()_\type\()_\edge\()_start: .ifc \edge, left increment_seed 4 read_rand x12, 11, 3 read_rand x13, 11, 2 read_rand x14, 11, 1 add x12, x3, x12, lsl #1 add x13, x3, x13, lsl #1 add x14, x3, x14, lsl #1 ld1 {v0.h}[5], [x12] ld1 {v0.h}[6], [x13] ld1 {v0.h}[7], [x14] lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 srshl v0.8h, v0.8h, v31.8h xtn2 v0.16b, v0.8h ext v4.16b, v4.16b, v4.16b, #12 .ifc \lag, lag3 smov w17, v0.b[13] .endif .ifnc \lag, lag1 smov w16, v0.b[14] .endif smov w14, v0.b[15] mov v1.16b, v4.16b mov w15, #1 bl output_\lag\()_neon .else increment_seed 4, shift=0 mov v1.16b, v4.16b mov w15, #4 bl output_\lag\()_neon .endif increment_seed 4, shift=0 mov v1.16b, v5.16b mov w15, #4 bl output_\lag\()_neon 
increment_seed 4, shift=0 mov v1.16b, v6.16b .if \elems == 9 mov w15, #1 bl output_\lag\()_neon lsr w2, w2, #3 read_rand x12, 11, 2 read_rand x13, 11, 1 read_rand x14, 11, 0 add x12, x3, x12, lsl #1 add x13, x3, x13, lsl #1 add x14, x3, x14, lsl #1 ld1 {v1.h}[0], [x12] ld1 {v1.h}[1], [x13] ld1 {v1.h}[2], [x14] srshl v1.4h, v1.4h, v31.4h xtn v1.8b, v1.8h ext v0.16b, v0.16b, v1.16b, #7 .else mov w15, #4 bl output_\lag\()_neon increment_seed 4, shift=0 mov v1.16b, v7.16b .ifc \edge, right mov w15, #3 bl output_\lag\()_neon read_shift_rand x15, 11 add x15, x3, x15, lsl #1 ld1 {v1.h}[0], [x15] srshl v1.4h, v1.4h, v31.4h ext v0.16b, v0.16b, v1.16b, #1 .else mov w15, #4 bl output_\lag\()_neon .endif .endif .if \store st1 {v0.16b}, [x0], #16 .endif ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret .endif .endm .macro sum_lag1_func type, uv_layout, edge, elems=16 function sum_\type\()_lag1_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 endfunc .endm sum_lag1_func y, 0, left sum_lag1_func y, 0, mid sum_lag1_func y, 0, right, 15 sum_lag1_func uv_444, 444, left sum_lag1_func uv_444, 444, mid sum_lag1_func uv_444, 444, right, 15 sum_lag1_func uv_422, 422, left sum_lag1_func uv_422, 422, mid sum_lag1_func uv_422, 422, right, 9 sum_lag1_func uv_420, 420, left sum_lag1_func uv_420, 420, mid sum_lag1_func uv_420, 420, right, 9 .macro sum_lag1 type, dst, left, mid, right, edge=mid mov v3.16b, \mid\().16b ext v0.16b, \left\().16b, \mid\().16b, #15 ext v1.16b, \mid\().16b, \right\().16b, #1 bl sum_\type\()_lag1_\edge\()_neon mov \dst\().16b, v0.16b .endm .macro sum_y_lag1 dst, left, mid, right, edge=mid sum_lag1 y, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_444_lag1 dst, left, mid, right, edge=mid sum_lag1 uv_444, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_422_lag1 dst, left, mid, right, edge=mid sum_lag1 uv_422, \dst, \left, \mid, \right, \edge .endm .macro sum_uv_420_lag1 dst, left, 
mid, right, edge=mid sum_lag1 uv_420, \dst, \left, \mid, \right, \edge .endm function sum_lag2_above_neon sub x12, x0, #2*GRAIN_WIDTH - 16 sub x13, x0, #1*GRAIN_WIDTH - 16 ld1 {v18.16b}, [x12] // load top right ld1 {v21.16b}, [x13] ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid dup v26.16b, v30.b[0] ext v23.16b, v16.16b, v17.16b, #15 dup v27.16b, v30.b[1] ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right dup v28.16b, v30.b[3] ext v1.16b, v17.16b, v18.16b, #2 dup v29.16b, v30.b[4] smull v2.8h, v22.8b, v26.8b smull2 v3.8h, v22.16b, v26.16b smull v4.8h, v23.8b, v27.8b smull2 v5.8h, v23.16b, v27.16b smull v6.8h, v0.8b, v28.8b smull2 v7.8h, v0.16b, v28.16b smull v0.8h, v1.8b, v29.8b smull2 v1.8h, v1.16b, v29.16b saddl v22.4s, v2.4h, v4.4h saddl2 v23.4s, v2.8h, v4.8h saddl v26.4s, v3.4h, v5.4h saddl2 v27.4s, v3.8h, v5.8h saddl v2.4s, v0.4h, v6.4h saddl2 v3.4s, v0.8h, v6.8h saddl v6.4s, v1.4h, v7.4h saddl2 v7.4s, v1.8h, v7.8h add v4.4s, v22.4s, v2.4s add v5.4s, v23.4s, v3.4s add v6.4s, v26.4s, v6.4s add v7.4s, v27.4s, v7.4s ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid dup v26.16b, v30.b[5] ext v23.16b, v19.16b, v20.16b, #15 dup v27.16b, v30.b[6] ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right dup v28.16b, v30.b[8] ext v1.16b, v20.16b, v21.16b, #2 dup v29.16b, v30.b[9] smull v2.8h, v22.8b, v26.8b smull2 v3.8h, v22.16b, v26.16b smull v22.8h, v23.8b, v27.8b smull2 v23.8h, v23.16b, v27.16b smull v26.8h, v0.8b, v28.8b smull2 v27.8h, v0.16b, v28.16b smull v28.8h, v1.8b, v29.8b smull2 v29.8h, v1.16b, v29.16b saddl v0.4s, v2.4h, v22.4h saddl2 v1.4s, v2.8h, v22.8h saddl v2.4s, v3.4h, v23.4h saddl2 v3.4s, v3.8h, v23.8h saddl v22.4s, v26.4h, v28.4h saddl2 v23.4s, v26.8h, v28.8h saddl v26.4s, v27.4h, v29.4h saddl2 v27.4s, v27.8h, v29.8h add v0.4s, v0.4s, v22.4s add v1.4s, v1.4s, v23.4s add v2.4s, v2.4s, v26.4s add v3.4s, v3.4s, v27.4s dup v26.16b, v30.b[2] dup v27.16b, v30.b[7] smull v22.8h, v17.8b, v26.8b smull2 v23.8h, v17.16b, v26.16b smull v24.8h, 
v20.8b, v27.8b smull2 v25.8h, v20.16b, v27.16b add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s mov v16.16b, v17.16b mov v17.16b, v18.16b saddl v0.4s, v22.4h, v24.4h saddl2 v1.4s, v22.8h, v24.8h saddl v2.4s, v23.4h, v25.4h saddl2 v3.4s, v23.8h, v25.8h mov v19.16b, v20.16b mov v20.16b, v21.16b add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s ret endfunc .macro sum_lag2_func type, uv_layout, edge, elems=16 function sum_\type\()_lag2_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! .ifc \edge, left sub x12, x0, #2*GRAIN_WIDTH sub x13, x0, #1*GRAIN_WIDTH ld1 {v17.16b}, [x12] // load the previous block right above ld1 {v20.16b}, [x13] .endif sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] endfunc .endm sum_lag2_func y, 0, left sum_lag2_func y, 0, mid sum_lag2_func y, 0, right, 15 sum_lag2_func uv_444, 444, left sum_lag2_func uv_444, 444, mid sum_lag2_func uv_444, 444, right, 15 sum_lag2_func uv_422, 422, left sum_lag2_func uv_422, 422, mid sum_lag2_func uv_422, 422, right, 9 sum_lag2_func uv_420, 420, left sum_lag2_func uv_420, 420, mid sum_lag2_func uv_420, 420, right, 9 function sum_lag3_above_neon sub x11, x0, #3*GRAIN_WIDTH - 16 sub x12, x0, #2*GRAIN_WIDTH - 16 sub x13, x0, #1*GRAIN_WIDTH - 16 ld1 {v15.16b}, [x11] // load top right ld1 {v18.16b}, [x12] ld1 {v21.16b}, [x13] ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid dup v22.16b, v29.b[0] ext v9.16b, v13.16b, v14.16b, #14 dup v23.16b, v29.b[1] ext v10.16b, v13.16b, v14.16b, #15 dup v24.16b, v29.b[2] dup v25.16b, v29.b[3] ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right dup v26.16b, v29.b[4] ext v12.16b, v14.16b, v15.16b, #2 dup v27.16b, v29.b[5] ext v13.16b, v14.16b, v15.16b, #3 dup v28.16b, v29.b[6] smull v0.8h, v8.8b, v22.8b smull2 v1.8h, v8.16b, v22.16b smull v2.8h, v9.8b, v23.8b smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, 
v10.16b, v24.16b smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b saddl v22.4s, v0.4h, v2.4h saddl2 v23.4s, v0.8h, v2.8h saddl v24.4s, v1.4h, v3.4h saddl2 v26.4s, v1.8h, v3.8h saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h smull v8.8h, v12.8b, v27.8b smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b smull v12.8h, v14.8b, v25.8b smull2 v13.8h, v14.16b, v25.16b add v4.4s, v22.4s, v0.4s add v5.4s, v23.4s, v1.4s add v6.4s, v24.4s, v2.4s add v7.4s, v26.4s, v3.4s saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s saddw v4.4s, v4.4s, v12.4h saddw2 v5.4s, v5.4s, v12.8h saddw v6.4s, v6.4s, v13.4h saddw2 v7.4s, v7.4s, v13.8h ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid dup v22.16b, v29.b[7] ext v9.16b, v16.16b, v17.16b, #14 dup v23.16b, v29.b[8] ext v10.16b, v16.16b, v17.16b, #15 dup v24.16b, v29.b[9] dup v25.16b, v29.b[10] ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right dup v26.16b, v29.b[11] ext v12.16b, v17.16b, v18.16b, #2 dup v27.16b, v29.b[12] ext v13.16b, v17.16b, v18.16b, #3 dup v28.16b, v29.b[13] smull v0.8h, v8.8b, v22.8b smull2 v1.8h, v8.16b, v22.16b smull v2.8h, v9.8b, v23.8b smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, v10.16b, v24.16b smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b saddl v22.4s, v0.4h, v2.4h saddl2 v23.4s, v0.8h, v2.8h saddl v24.4s, v1.4h, v3.4h saddl2 v26.4s, v1.8h, v3.8h saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h smull v8.8h, v12.8b, v27.8b smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b smull v12.8h, v17.8b, v25.8b smull2 v13.8h, v17.16b, v25.16b add v22.4s, v22.4s, v0.4s add v23.4s, v23.4s, v1.4s add v24.4s, 
v24.4s, v2.4s add v26.4s, v26.4s, v3.4s saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h add v4.4s, v4.4s, v22.4s add v5.4s, v5.4s, v23.4s add v6.4s, v6.4s, v24.4s add v7.4s, v7.4s, v26.4s add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s saddw v4.4s, v4.4s, v12.4h saddw2 v5.4s, v5.4s, v12.8h saddw v6.4s, v6.4s, v13.4h saddw2 v7.4s, v7.4s, v13.8h ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid dup v22.16b, v29.b[14] ext v9.16b, v19.16b, v20.16b, #14 dup v23.16b, v29.b[15] ext v10.16b, v19.16b, v20.16b, #15 dup v24.16b, v30.b[0] dup v25.16b, v30.b[1] ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right dup v26.16b, v30.b[2] ext v12.16b, v20.16b, v21.16b, #2 dup v27.16b, v30.b[3] ext v13.16b, v20.16b, v21.16b, #3 dup v28.16b, v30.b[4] smull v0.8h, v8.8b, v22.8b smull2 v1.8h, v8.16b, v22.16b smull v2.8h, v9.8b, v23.8b smull2 v3.8h, v9.16b, v23.16b smull v8.8h, v10.8b, v24.8b smull2 v9.8h, v10.16b, v24.16b smull v10.8h, v11.8b, v26.8b smull2 v11.8h, v11.16b, v26.16b saddl v22.4s, v0.4h, v2.4h saddl2 v23.4s, v0.8h, v2.8h saddl v24.4s, v1.4h, v3.4h saddl2 v26.4s, v1.8h, v3.8h saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h smull v8.8h, v12.8b, v27.8b smull2 v9.8h, v12.16b, v27.16b smull v10.8h, v13.8b, v28.8b smull2 v11.8h, v13.16b, v28.16b smull v12.8h, v20.8b, v25.8b smull2 v19.8h, v20.16b, v25.16b add v22.4s, v22.4s, v0.4s add v23.4s, v23.4s, v1.4s add v24.4s, v24.4s, v2.4s add v26.4s, v26.4s, v3.4s saddl v0.4s, v8.4h, v10.4h saddl2 v1.4s, v8.8h, v10.8h saddl v2.4s, v9.4h, v11.4h saddl2 v3.4s, v9.8h, v11.8h add v4.4s, v4.4s, v22.4s add v5.4s, v5.4s, v23.4s add v6.4s, v6.4s, v24.4s add v7.4s, v7.4s, v26.4s mov v13.16b, v14.16b mov v14.16b, v15.16b add v4.4s, v4.4s, v0.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v2.4s add v7.4s, v7.4s, v3.4s mov v16.16b, v17.16b mov v17.16b, v18.16b saddw 
v4.4s, v4.4s, v12.4h saddw2 v5.4s, v5.4s, v12.8h saddw v6.4s, v6.4s, v19.4h saddw2 v7.4s, v7.4s, v19.8h mov v19.16b, v20.16b mov v20.16b, v21.16b ret endfunc .macro sum_lag3_func type, uv_layout, edge, elems=16 function sum_\type\()_lag3_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! .ifc \edge, left sub x11, x0, #3*GRAIN_WIDTH sub x12, x0, #2*GRAIN_WIDTH sub x13, x0, #1*GRAIN_WIDTH ld1 {v14.16b}, [x11] // load the previous block right above ld1 {v17.16b}, [x12] ld1 {v20.16b}, [x13] .endif sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] endfunc .endm sum_lag3_func y, 0, left sum_lag3_func y, 0, mid sum_lag3_func y, 0, right, 15 sum_lag3_func uv_444, 444, left sum_lag3_func uv_444, 444, mid sum_lag3_func uv_444, 444, right, 15 sum_lag3_func uv_422, 422, left sum_lag3_func uv_422, 422, mid sum_lag3_func uv_422, 422, right, 9 sum_lag3_func uv_420, 420, left sum_lag3_func uv_420, 420, mid sum_lag3_func uv_420, 420, right, 9 function generate_grain_rows_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 1: get_grain_row v16, v17, v18, v19, v20, v21 subs w1, w1, #1 store_grain_row v16, v17, v18, v19, v20, v21 b.gt 1b ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function generate_grain_rows_44_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 1: get_grain_row_44 v16, v17, v18 subs w1, w1, #1 store_grain_row_44 v16, v17, v18 b.gt 1b ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function get_grain_row_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! get_grain_row v16, v17, v18, v19, v20, v21 ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function get_grain_row_44_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 
get_grain_row_44 v16, v17, v18 ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function add_uv_444_coeff_lag0_neon add_coeff_lag0_start: smull v2.8h, v0.8b, v27.8b smull2 v3.8h, v0.16b, v27.16b srshl v2.8h, v2.8h, v28.8h srshl v3.8h, v3.8h, v28.8h saddw v2.8h, v2.8h, v1.8b saddw2 v3.8h, v3.8h, v1.16b sqxtn v2.8b, v2.8h sqxtn2 v2.16b, v3.8h ret endfunc function add_uv_420_coeff_lag0_neon ld1 {v4.16b, v5.16b}, [x19], #32 ld1 {v6.16b, v7.16b}, [x12], #32 saddlp v4.8h, v4.16b saddlp v5.8h, v5.16b saddlp v6.8h, v6.16b saddlp v7.8h, v7.16b add v4.8h, v4.8h, v6.8h add v5.8h, v5.8h, v7.8h rshrn v4.8b, v4.8h, #2 rshrn2 v4.16b, v5.8h, #2 and v0.16b, v4.16b, v0.16b b add_coeff_lag0_start endfunc function add_uv_422_coeff_lag0_neon ld1 {v4.16b, v5.16b}, [x19], #32 saddlp v4.8h, v4.16b saddlp v5.8h, v5.16b rshrn v4.8b, v4.8h, #1 rshrn2 v4.16b, v5.8h, #1 and v0.16b, v4.16b, v0.16b b add_coeff_lag0_start endfunc .macro gen_grain_82 type function generate_grain_\type\()_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x30, x19, [sp, #-96]! 
.ifc \type, uv_444 mov w13, w3 mov w14, #28 add x19, x1, #3*GRAIN_WIDTH mov x1, x2 mul w13, w13, w14 .endif movrel x3, X(gaussian_sequence) ldr w2, [x1, #FGD_SEED] ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] .ifc \type, y add x4, x1, #FGD_AR_COEFFS_Y .else add x4, x1, #FGD_AR_COEFFS_UV .endif movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 + data->grain_scale_shift add x16, x16, x17 neg v31.8h, v31.8h .ifc \type, uv_444 cmp w13, #0 mov w11, #0x49d8 mov w14, #0xb524 add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] csel w11, w11, w14, ne .endif ldr w7, [x1, #FGD_AR_COEFF_SHIFT] mov w8, #1 mov w10, #1 lsl w8, w8, w7 // 1 << ar_coeff_shift lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) mov w5, #127 mov w6, #-128 .ifc \type, uv_444 eor w2, w2, w11 .endif br x16 L(generate_grain_\type\()_lag0): AARCH64_VALID_JUMP_TARGET .ifc \type, y mov w1, #GRAIN_HEIGHT bl generate_grain_rows_neon .else dup v28.8h, w7 ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] movi v0.16b, #0 movi v1.16b, #255 ext v29.16b, v0.16b, v1.16b, #13 ext v30.16b, v1.16b, v0.16b, #1 neg v28.8h, v28.8h mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT-3 1: ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 bl get_grain_row_neon and v0.16b, v22.16b, v29.16b mov v1.16b, v16.16b bl add_uv_444_coeff_lag0_neon mov v0.16b, v23.16b mov v1.16b, v17.16b mov v16.16b, v2.16b bl add_uv_444_coeff_lag0_neon ld1 {v26.16b}, [x19], #16 mov v0.16b, v24.16b mov v1.16b, v18.16b mov v17.16b, v2.16b bl add_uv_444_coeff_lag0_neon add x19, x19, #2 mov v0.16b, v25.16b mov v1.16b, v19.16b mov v18.16b, v2.16b bl add_uv_444_coeff_lag0_neon and v0.16b, v26.16b, v30.16b mov v1.16b, v20.16b mov v19.16b, v2.16b bl add_uv_444_coeff_lag0_neon mov v20.16b, v2.16b subs w1, w1, #1 store_grain_row v16, v17, v18, v19, v20, v21 b.gt 1b .endif ldp 
x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag1): AARCH64_VALID_JUMP_TARGET ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] ld1r {v29.16b}, [x4] // ar_coeffs_y[2] .ifc \type, y ldrsb w4, [x4, #1] // ar_coeffs_y[3] .else add x4, x4, #2 .endif mov w1, #3 .ifc \type, uv_444 ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] ldursb w4, [x4, #-1] // ar_coeffs_uv[3] .endif bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT - 3 1: sum_\type\()_lag1 v22, v16, v16, v17, left sum_\type\()_lag1 v23, v16, v17, v18 sum_\type\()_lag1 v24, v17, v18, v19 sum_\type\()_lag1 v25, v18, v19, v20 sum_\type\()_lag1 v20, v19, v20, v21, right get_grain_2 v21 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #2 .endif store_grain_row v22, v23, v24, v25, v20, v21 mov v16.16b, v22.16b mov v17.16b, v23.16b mov v18.16b, v24.16b mov v19.16b, v25.16b b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag2): AARCH64_VALID_JUMP_TARGET ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] smov w4, v30.b[10] smov w17, v30.b[11] mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag2_left_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_right_neon get_grain_2 v16 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #2 .endif st1 {v16.h}[0], [x0], #2 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag3): AARCH64_VALID_JUMP_TARGET ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] stp x20, x21, [sp, #80] smov w4, v30.b[5] smov w20, v30.b[6] smov w21, v30.b[7] mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag3_left_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_mid_neon 
bl sum_\type\()_lag3_right_neon get_grain_2 v16 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #2 .endif st1 {v16.h}[0], [x0], #2 b.gt 1b ldp x20, x21, [sp, #80] ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl endjumptable .endm gen_grain_82 y gen_grain_82 uv_444 .macro set_height dst, type .ifc \type, uv_420 mov \dst, #SUB_GRAIN_HEIGHT-3 .else mov \dst, #GRAIN_HEIGHT-3 .endif .endm .macro increment_y_ptr reg, type .ifc \type, uv_420 add \reg, \reg, #2*GRAIN_WIDTH-(3*32) .else sub \reg, \reg, #3*32-GRAIN_WIDTH .endif .endm .macro gen_grain_44 type function generate_grain_\type\()_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x30, x19, [sp, #-96]! 
mov w13, w3 mov w14, #28 add x19, x1, #3*GRAIN_WIDTH-3 mov x1, x2 mul w13, w13, w14 movrel x3, X(gaussian_sequence) ldr w2, [x1, #FGD_SEED] ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] add x4, x1, #FGD_AR_COEFFS_UV movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 + data->grain_scale_shift add x16, x16, x17 neg v31.8h, v31.8h cmp w13, #0 mov w11, #0x49d8 mov w14, #0xb524 add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] csel w11, w11, w14, ne ldr w7, [x1, #FGD_AR_COEFF_SHIFT] mov w8, #1 mov w10, #1 lsl w8, w8, w7 // 1 << ar_coeff_shift lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) mov w5, #127 mov w6, #-128 eor w2, w2, w11 br x16 L(generate_grain_\type\()_lag0): AARCH64_VALID_JUMP_TARGET dup v28.8h, w7 ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] movi v0.16b, #0 movi v1.16b, #255 ext v29.16b, v0.16b, v1.16b, #13 ext v30.16b, v1.16b, v0.16b, #7 neg v28.8h, v28.8h mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: bl get_grain_row_44_neon .ifc \type, uv_420 add x12, x19, #GRAIN_WIDTH .endif mov v0.16b, v29.16b mov v1.16b, v16.16b bl add_\type\()_coeff_lag0_neon movi v0.16b, #255 mov v1.16b, v17.16b mov v16.16b, v2.16b bl add_\type\()_coeff_lag0_neon mov v0.16b, v30.16b mov v1.16b, v18.16b mov v17.16b, v2.16b bl add_\type\()_coeff_lag0_neon mov v18.16b, v2.16b subs w1, w1, #1 increment_y_ptr x19, \type store_grain_row_44 v16, v17, v18 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag1): AARCH64_VALID_JUMP_TARGET ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] add x4, x4, #2 mov w1, #3 ld1r {v30.16b}, [x4] // ar_coeffs_u4[4] ldursb w4, [x4, #-1] // ar_coeffs_uv[3] bl generate_grain_rows_44_neon set_height w1, \type 1: sum_\type\()_lag1 v20, v16, 
v16, v17, left sum_\type\()_lag1 v21, v16, v17, v18 sum_\type\()_lag1 v18, v17, v18, v18, right subs w1, w1, #1 increment_y_ptr x19, \type store_grain_row_44 v20, v21, v18 mov v16.16b, v20.16b mov v17.16b, v21.16b b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag2): AARCH64_VALID_JUMP_TARGET ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] smov w4, v30.b[10] smov w17, v30.b[11] mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: bl sum_\type\()_lag2_left_neon bl sum_\type\()_lag2_mid_neon bl sum_\type\()_lag2_right_neon subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, #GRAIN_WIDTH-48 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag3): AARCH64_VALID_JUMP_TARGET ldr q29, [x4] // ar_coeffs_uv[0-15] ldr q30, [x4, #16] // ar_coeffs_uv[16-24] stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] stp x20, x21, [sp, #80] smov w4, v30.b[5] smov w20, v30.b[6] smov w21, v30.b[7] mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: bl sum_\type\()_lag3_left_neon bl sum_\type\()_lag3_mid_neon bl sum_\type\()_lag3_right_neon subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, #GRAIN_WIDTH-48 b.gt 1b ldp x20, x21, [sp, #80] ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl endjumptable .endm gen_grain_44 uv_420 gen_grain_44 uv_422 .macro gather_interleaved dst1, dst2, src1, src2, off umov w14, \src1[0+\off] umov w15, \src2[8+\off] umov w16, \src1[2+\off] add x14, x14, x3 umov w17, \src2[10+\off] add x15, x15, x3 ld1 
{\dst1}[0+\off], [x14] umov w14, \src1[4+\off] add x16, x16, x3 ld1 {\dst2}[8+\off], [x15] umov w15, \src2[12+\off] add x17, x17, x3 ld1 {\dst1}[2+\off], [x16] umov w16, \src1[6+\off] add x14, x14, x3 ld1 {\dst2}[10+\off], [x17] umov w17, \src2[14+\off] add x15, x15, x3 ld1 {\dst1}[4+\off], [x14] add x16, x16, x3 ld1 {\dst2}[12+\off], [x15] add x17, x17, x3 ld1 {\dst1}[6+\off], [x16] ld1 {\dst2}[14+\off], [x17] .endm .macro gather dst1, dst2, src1, src2 gather_interleaved \dst1, \dst2, \src1, \src2, 0 gather_interleaved \dst2, \dst1, \src2, \src1, 0 gather_interleaved \dst1, \dst2, \src1, \src2, 1 gather_interleaved \dst2, \dst1, \src2, \src1, 1 .endm function gather32_neon gather v4.b, v5.b, v0.b, v1.b ret endfunc function gather16_neon gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 ins v4.d[1], v5.d[1] ret endfunc const overlap_coeffs_0, align=4 .byte 27, 17, 0, 0, 0, 0, 0, 0 .byte 17, 27, 32, 32, 32, 32, 32, 32 endconst const overlap_coeffs_1, align=4 .byte 23, 0, 0, 0, 0, 0, 0, 0 .byte 22, 32, 32, 32, 32, 32, 32, 32 endconst .macro calc_offset offx, offy, src, sx, sy and \offy, \src, #0xF // randval & 0xF lsr \offx, \src, #4 // randval >> 4 .if \sy == 0 add \offy, \offy, \offy // 2 * (randval & 0xF) .endif .if \sx == 0 add \offx, \offx, \offx // 2 * (randval >> 4) .endif .endm .macro add_offset dst, offx, offy, src, stride madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy add \dst, \dst, \offx, uxtw // grain_lut += offx .endm // void dav2d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, // const ptrdiff_t stride, // const uint8_t scaling[SCALING_SIZE], // const int scaling_shift, // const entry grain_lut[][GRAIN_WIDTH], // const int offsets[][2], // const int h, const ptrdiff_t clip, // const ptrdiff_t type); function fgy_32x32_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 
ldr w11, [x6, #8] // offsets[1][0] ldr w13, [x6, #4] // offsets[0][1] ldr w15, [x6, #12] // offsets[1][1] ldr w6, [x6] // offsets[0][0] ldr w8, [sp, #16] // clip mov x9, #GRAIN_WIDTH // grain_lut stride neg w4, w4 dup v29.8h, w4 // -scaling_shift movrel x16, overlap_coeffs_0 cbz w8, 1f // clip movi v30.16b, #16 movi v31.16b, #235 b 2f 1: // no clip movi v30.16b, #0 movi v31.16b, #255 2: ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs add x5, x5, #9 // grain_lut += 9 add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride add x5, x5, x9 // grain_lut += grain_stride calc_offset w11, w12, w11, 0, 0 calc_offset w13, w14, w13, 0, 0 calc_offset w15, w16, w15, 0, 0 calc_offset w6, w10, w6, 0, 0 add_offset x12, w11, x12, x5, x9 add_offset x14, w13, x14, x5, x9 add_offset x16, w15, x16, x5, x9 add_offset x5, w6, x10, x5, x9 ldr w11, [sp, #24] // type movrel x13, fgy_loop_tbl add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 ldrsw x11, [x13, w11, uxtw #2] add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx add x11, x13, x11 b.eq 1f // y overlap dup v6.16b, v27.b[0] dup v7.16b, v27.b[1] mov w10, w7 // backup actual h mov w7, #2 1: br x11 endfunc function fgy_loop_neon .macro fgy ox, oy L(loop_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: ld1 {v0.16b, v1.16b}, [x1], x2 // src .if \ox ld1 {v20.8b}, [x4], x9 // grain_lut old .endif .if \oy ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top .endif .if \ox && \oy ld1 {v21.8b}, [x8], x9 // grain_lut top old .endif ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut bl gather32_neon .if \ox smull v20.8h, v20.8b, v27.8b smlal v20.8h, v18.8b, v28.8b .endif .if \oy .if \ox smull v21.8h, v21.8b, v27.8b smlal v21.8h, v22.8b, v28.8b sqrshrn v20.8b, v20.8h, #5 sqrshrn v21.8b, v21.8h, #5 .endif .if \ox smull v16.8h, v20.8b, v7.8b .else smull v16.8h, v18.8b, v7.8b .endif smull2 v17.8h, v18.16b, 
v7.16b smull v18.8h, v19.8b, v7.8b smull2 v19.8h, v19.16b, v7.16b .if \ox smlal v16.8h, v21.8b, v6.8b .else smlal v16.8h, v22.8b, v6.8b .endif smlal2 v17.8h, v22.16b, v6.16b smlal v18.8h, v23.8b, v6.8b smlal2 v19.8h, v23.16b, v6.16b sqrshrn v22.8b, v16.8h, #5 sqrshrn2 v22.16b, v17.8h, #5 sqrshrn v23.8b, v18.8h, #5 sqrshrn2 v23.16b, v19.8h, #5 .endif // sxtl of grain .if \oy sxtl v16.8h, v22.8b sxtl2 v17.8h, v22.16b sxtl v18.8h, v23.8b sxtl2 v19.8h, v23.16b .elseif \ox sqrshrn v20.8b, v20.8h, #5 sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b sxtl v16.8h, v20.8b .else sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b .endif uxtl v2.8h, v4.8b // scaling uxtl2 v3.8h, v4.16b uxtl v4.8h, v5.8b uxtl2 v5.8h, v5.16b mul v16.8h, v16.8h, v2.8h // scaling * grain mul v17.8h, v17.8h, v3.8h mul v18.8h, v18.8h, v4.8h mul v19.8h, v19.8h, v5.8h srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) srshl v17.8h, v17.8h, v29.8h srshl v18.8h, v18.8h, v29.8h srshl v19.8h, v19.8h, v29.8h uaddw v16.8h, v16.8h, v0.8b // *src + noise uaddw2 v17.8h, v17.8h, v0.16b uaddw v18.8h, v18.8h, v1.8b uaddw2 v19.8h, v19.8h, v1.16b sqxtun v0.8b, v16.8h sqxtun2 v0.16b, v17.8h sqxtun v1.8b, v18.8h sqxtun2 v1.16b, v19.8h umax v0.16b, v0.16b, v30.16b umax v1.16b, v1.16b, v30.16b umin v0.16b, v0.16b, v31.16b umin v1.16b, v1.16b, v31.16b subs w7, w7, #1 .if \oy dup v6.16b, v28.b[0] dup v7.16b, v28.b[1] .endif st1 {v0.16b, v1.16b}, [x0], x2 // dst b.gt 1b .if \oy cmp w10, #2 sub w7, w10, #2 // restore actual remaining h b.gt L(loop_\ox\()0) .endif ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret .endm fgy 0, 0 fgy 0, 1 fgy 1, 0 fgy 1, 1 endfunc jumptable fgy_loop_tbl .word L(loop_00) - fgy_loop_tbl .word L(loop_01) - fgy_loop_tbl .word L(loop_10) - fgy_loop_tbl .word L(loop_11) - fgy_loop_tbl endjumptable // void dav2d_fguv_32x32_420_8bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, // const uint8_t 
scaling[SCALING_SIZE], // const Dav2dFilmGrainData *const data, // const entry grain_lut[][GRAIN_WIDTH], // const pixel *const luma_row, // const ptrdiff_t luma_stride, // const int offsets[][2], // const ptrdiff_t h, const ptrdiff_t uv, // const ptrdiff_t is_id, // const ptrdiff_t type); .macro fguv layout, sx, sy function fguv_32x32_\layout\()_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-32]! str d8, [sp, #16] ldp x8, x9, [sp, #32] // offsets, h ldp x10, x11, [sp, #48] // uv, is_id ldr w13, [x4, #FGD_SCALING_SHIFT] ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] neg w13, w13 // -scaling_shift // !csfl add x10, x4, x10, lsl #2 // + 4*uv add x14, x10, #FGD_UV_LUMA_MULT add x15, x10, #FGD_UV_MULT add x10, x10, #FGD_UV_OFFSET ld1 {v8.h}[0], [x14] // uv_luma_mult ld1r {v24.8h}, [x10] // uv_offset ld1 {v8.h}[1], [x15] // uv_mult dup v29.8h, w13 // -scaling_shift cbz w12, 1f // clip movi v30.16b, #16 movi v31.16b, #240 cbz w11, 2f // is_id movi v31.16b, #235 b 2f 1: // no clip movi v30.16b, #0 movi v31.16b, #255 2: ldr w12, [x8, #8] // offsets[1][0] ldr w14, [x8, #4] // offsets[0][1] ldr w16, [x8, #12] // offsets[1][1] ldr w8, [x8] // offsets[0][0] mov x10, #GRAIN_WIDTH // grain_lut stride add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 .if \sy add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride .else add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride add x5, x5, x10 // grain_lut += grain_stride .endif calc_offset w12, w13, w12, \sx, \sy calc_offset w14, w15, w14, \sx, \sy calc_offset w16, w17, w16, \sx, \sy calc_offset w8, w11, w8, \sx, \sy add_offset x13, w12, x13, x5, x10 add_offset x15, w14, x15, x5, x10 add_offset x17, w16, x17, x5, x10 add_offset x5, w8, x11, x5, x10 add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * 
FG_BLOCK_SIZE * by add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx ldr w13, [sp, #64] // type movrel x16, overlap_coeffs_\sx movrel x14, fguv_loop_sx\sx\()_tbl ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs tst w13, #1 ldrsw x13, [x14, w13, uxtw #2] b.eq 1f // y overlap sub w12, w9, #(2 >> \sy) // backup remaining h mov w9, #(2 >> \sy) 1: add x13, x14, x13 .if \sy movi v25.16b, #23 movi v26.16b, #22 .else movi v25.16b, #27 movi v26.16b, #17 .endif .if \sy add x7, x7, x7 // luma_stride *= 2 .endif br x13 endfunc .endm fguv 420, 1, 1 fguv 422, 1, 0 fguv 444, 0, 0 function fguv_loop_sx0_neon .macro fguv_loop_sx0 csfl, ox, oy L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: ld1 {v0.16b, v1.16b}, [x6], x7 // luma ld1 {v6.16b, v7.16b}, [x1], x2 // src .if \ox ld1 {v20.8b}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v21.8b}, [x11], x10 // grain_lut top old .endif ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut .if !\csfl uxtl v2.8h, v0.8b uxtl2 v3.8h, v0.16b uxtl v4.8h, v1.8b uxtl2 v5.8h, v1.16b uxtl v0.8h, v6.8b uxtl2 v1.8h, v6.16b uxtl v16.8h, v7.8b uxtl2 v17.8h, v7.16b mul v2.8h, v2.8h, v8.h[0] mul v3.8h, v3.8h, v8.h[0] mul v4.8h, v4.8h, v8.h[0] mul v5.8h, v5.8h, v8.h[0] mul v0.8h, v0.8h, v8.h[1] mul v1.8h, v1.8h, v8.h[1] mul v16.8h, v16.8h, v8.h[1] mul v17.8h, v17.8h, v8.h[1] sqadd v2.8h, v2.8h, v0.8h sqadd v3.8h, v3.8h, v1.8h sqadd v4.8h, v4.8h, v16.8h sqadd v5.8h, v5.8h, v17.8h sshr v2.8h, v2.8h, #6 sshr v3.8h, v3.8h, #6 sshr v4.8h, v4.8h, #6 sshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v24.8h add v3.8h, v3.8h, v24.8h add v4.8h, v4.8h, v24.8h add v5.8h, v5.8h, v24.8h sqxtun v0.8b, v2.8h sqxtun2 v0.16b, v3.8h sqxtun v1.8b, v4.8h sqxtun2 v1.16b, v5.8h .endif bl gather32_neon .if \ox smull v20.8h, v20.8b, v27.8b smlal v20.8h, v18.8b, v28.8b .endif .if \oy .if \ox smull v21.8h, v21.8b, v27.8b smlal v21.8h, v22.8b, v28.8b sqrshrn v20.8b, v20.8h, #5 sqrshrn v21.8b, 
v21.8h, #5 .endif .if \ox smull v16.8h, v20.8b, v26.8b .else smull v16.8h, v18.8b, v26.8b .endif smull2 v17.8h, v18.16b, v26.16b smull v18.8h, v19.8b, v26.8b smull2 v19.8h, v19.16b, v26.16b .if \ox smlal v16.8h, v21.8b, v25.8b .else smlal v16.8h, v22.8b, v25.8b .endif smlal2 v17.8h, v22.16b, v25.16b smlal v18.8h, v23.8b, v25.8b smlal2 v19.8h, v23.16b, v25.16b sqrshrn v22.8b, v16.8h, #5 sqrshrn2 v22.16b, v17.8h, #5 sqrshrn v23.8b, v18.8h, #5 sqrshrn2 v23.16b, v19.8h, #5 .endif // sxtl of grain .if \oy sxtl v16.8h, v22.8b sxtl2 v17.8h, v22.16b sxtl v18.8h, v23.8b sxtl2 v19.8h, v23.16b .elseif \ox sqrshrn v20.8b, v20.8h, #5 sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b sxtl v16.8h, v20.8b .else sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b .endif uxtl v2.8h, v4.8b // scaling uxtl2 v3.8h, v4.16b uxtl v4.8h, v5.8b uxtl2 v5.8h, v5.16b mul v16.8h, v16.8h, v2.8h // scaling * grain mul v17.8h, v17.8h, v3.8h mul v18.8h, v18.8h, v4.8h mul v19.8h, v19.8h, v5.8h srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) srshl v17.8h, v17.8h, v29.8h srshl v18.8h, v18.8h, v29.8h srshl v19.8h, v19.8h, v29.8h uaddw v16.8h, v16.8h, v6.8b // *src + noise uaddw2 v17.8h, v17.8h, v6.16b uaddw v18.8h, v18.8h, v7.8b uaddw2 v19.8h, v19.8h, v7.16b sqxtun v0.8b, v16.8h sqxtun2 v0.16b, v17.8h sqxtun v1.8b, v18.8h sqxtun2 v1.16b, v19.8h umax v0.16b, v0.16b, v30.16b umax v1.16b, v1.16b, v30.16b umin v0.16b, v0.16b, v31.16b umin v1.16b, v1.16b, v31.16b subs w9, w9, #1 .if \oy dup v25.16b, v28.b[0] dup v26.16b, v28.b[1] .endif st1 {v0.16b, v1.16b}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx0 0, 0, 0 fguv_loop_sx0 0, 0, 1 fguv_loop_sx0 0, 1, 0 fguv_loop_sx0 0, 1, 1 fguv_loop_sx0 1, 0, 0 fguv_loop_sx0 1, 0, 1 fguv_loop_sx0 1, 1, 0 fguv_loop_sx0 1, 1, 1 9: ldr d8, [sp, #16] ldr x30, [sp], #32 
AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl endjumptable function fguv_loop_sx1_neon .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: ld1 {v0.16b, v1.16b}, [x6], x7 // luma ld1 {v6.16b}, [x1], x2 // src .if \ox ld1 {v20.8b}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v22.16b}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v21.8b}, [x11], x10 // grain_lut top old .endif ld1 {v18.16b}, [x5], x10 // grain_lut uaddlp v2.8h, v0.16b uaddlp v3.8h, v1.16b .if \csfl rshrn v0.8b, v2.8h, #1 rshrn2 v0.16b, v3.8h, #1 .else urshr v2.8h, v2.8h, #1 urshr v3.8h, v3.8h, #1 uxtl v0.8h, v6.8b uxtl2 v1.8h, v6.16b mul v2.8h, v2.8h, v8.h[0] mul v3.8h, v3.8h, v8.h[0] mul v0.8h, v0.8h, v8.h[1] mul v1.8h, v1.8h, v8.h[1] sqadd v2.8h, v2.8h, v0.8h sqadd v3.8h, v3.8h, v1.8h sshr v2.8h, v2.8h, #6 sshr v3.8h, v3.8h, #6 add v2.8h, v2.8h, v24.8h add v3.8h, v3.8h, v24.8h sqxtun v0.8b, v2.8h sqxtun2 v0.16b, v3.8h .endif bl gather16_neon .if \ox smull v20.8h, v20.8b, v27.8b smlal v20.8h, v18.8b, v28.8b .endif .if \oy .if \ox smull v21.8h, v21.8b, v27.8b smlal v21.8h, v22.8b, v28.8b sqrshrn v20.8b, v20.8h, #5 sqrshrn v21.8b, v21.8h, #5 .endif .if \ox smull v16.8h, v20.8b, v26.8b .else smull v16.8h, v18.8b, v26.8b .endif smull2 v17.8h, v18.16b, v26.16b .if \ox smlal v16.8h, v21.8b, v25.8b .else smlal v16.8h, v22.8b, v25.8b .endif smlal2 v17.8h, v22.16b, v25.16b sqrshrn v22.8b, v16.8h, #5 sqrshrn2 v22.16b, v17.8h, #5 .endif // sxtl of grain .if \oy sxtl v16.8h, v22.8b sxtl2 v17.8h, v22.16b .elseif \ox 
sqrshrn v20.8b, v20.8h, #5 sxtl2 v17.8h, v18.16b sxtl v16.8h, v20.8b .else sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b .endif uxtl v2.8h, v4.8b // scaling uxtl2 v3.8h, v4.16b mul v16.8h, v16.8h, v2.8h // scaling * grain mul v17.8h, v17.8h, v3.8h srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) srshl v17.8h, v17.8h, v29.8h uaddw v16.8h, v16.8h, v6.8b // *src + noise uaddw2 v17.8h, v17.8h, v6.16b sqxtun v0.8b, v16.8h sqxtun2 v0.16b, v17.8h umax v0.16b, v0.16b, v30.16b umin v0.16b, v0.16b, v31.16b .if \oy mov v16.16b, v25.16b .endif subs w9, w9, #1 .if \oy mov v25.16b, v26.16b mov v26.16b, v16.16b .endif st1 {v0.16b}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx1 0, 0, 0 fguv_loop_sx1 0, 0, 1 fguv_loop_sx1 0, 1, 0 fguv_loop_sx1 0, 1, 1 fguv_loop_sx1 1, 0, 0 fguv_loop_sx1 1, 0, 1 fguv_loop_sx1 1, 1, 0 fguv_loop_sx1 1, 1, 1 9: ldr d8, [sp, #16] ldr x30, [sp], #32 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl endjumptable dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/filmgrain16.S000066400000000000000000002217171517466257200241200ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 #define SUB_GRAIN_WIDTH 44 #define SUB_GRAIN_HEIGHT 38 .macro increment_seed steps, shift=1 lsr w11, w2, #3 lsr w12, w2, #12 lsr w13, w2, #1 eor w11, w2, w11 // (r >> 0) ^ (r >> 3) eor w12, w12, w13 // (r >> 12) ^ (r >> 1) eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) .if \shift lsr w2, w2, #\steps .endif and w11, w11, #((1 << \steps) - 1) // bit .if \shift orr w2, w2, w11, lsl #(16 - \steps) // *state .else orr w2, w2, w11, lsl #16 // *state .endif .endm .macro read_rand dest, bits, age ubfx \dest, x2, #16 - \bits - \age, #\bits .endm .macro read_shift_rand dest, bits ubfx \dest, x2, #17 - \bits, #\bits lsr w2, w2, #1 .endm // special calling convention: // w2 holds seed // x3 holds dav2d_gaussian_sequence // clobbers x11-x15 // returns in v0.8h function get_gaussian_neon increment_seed 4 read_rand x14, 11, 3 read_rand x15, 11, 2 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] read_rand x14, 11, 1 ld1 {v0.h}[1], [x15] add x14, x3, x14, lsl #1 read_rand x15, 11, 0 increment_seed 4 add x15, x3, x15, lsl #1 ld1 {v0.h}[2], [x14] read_rand x14, 11, 3 ld1 {v0.h}[3], [x15] add x14, x3, x14, lsl #1 read_rand x15, 11, 2 ld1 {v0.h}[4], [x14] add x15, x3, x15, lsl #1 read_rand x14, 11, 1 ld1 {v0.h}[5], [x15] read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[6], [x14] ld1 {v0.h}[7], [x15] ret endfunc .macro store_grain_row r0, r1, r2, r3, r4, r5 st1 {\r0\().16b,\r1\().16b}, [x0], #32 st1 {\r2\().16b,\r3\().16b}, [x0], #32 st1 {\r4\().16b}, [x0], #16 st1 {\r5\().h}[0], [x0], #2 .endm function get_grain_2_neon increment_seed 2 read_rand x14, 11, 1 read_rand x15, 11, 0 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] ld1 {v0.h}[1], [x15] srshl v0.4h, v0.4h, v31.4h ret endfunc .macro get_grain_2 dst bl get_grain_2_neon .ifnc \dst, v0 mov \dst\().8b, v0.8b .endif 
.endm function get_grain_4_neon increment_seed 4 read_rand x14, 11, 3 read_rand x15, 11, 2 add x14, x3, x14, lsl #1 add x15, x3, x15, lsl #1 ld1 {v0.h}[0], [x14] read_rand x14, 11, 1 ld1 {v0.h}[1], [x15] add x14, x3, x14, lsl #1 read_rand x15, 11, 0 add x15, x3, x15, lsl #1 ld1 {v0.h}[2], [x14] ld1 {v0.h}[3], [x15] srshl v0.4h, v0.4h, v31.4h ret endfunc .macro get_grain_4 dst bl get_grain_4_neon .ifnc \dst, v0 mov \dst\().8b, v0.8b .endif .endm // w15 holds the number of entries to produce // w14, w16 and w17 hold the previous output entries // v0 holds the vector of produced entries // v1 holds the input vector of sums from above .macro output_lag n function output_lag\n\()_neon 1: read_shift_rand x13, 11 mov w11, v1.s[0] ldrsh w12, [x3, x13, lsl #1] ext v0.16b, v0.16b, v0.16b, #2 .if \n == 1 madd w11, w14, w4, w11 // sum (above) + *coeff * prev output .elseif \n == 2 madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 madd w11, w14, w17, w11 // += *coeff * prev output 2 mov w16, w14 .else madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 madd w11, w14, w21, w11 // += *coeff * prev output 3 mov w17, w16 mov w16, w14 .endif add w14, w11, w8 // 1 << (ar_coeff_shift - 1) add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) asr w14, w14, w7 // >> ar_coeff_shift asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) add w14, w14, w12 cmp w14, w5 csel w14, w14, w5, le cmp w14, w6 csel w14, w14, w6, ge subs w15, w15, #1 ext v1.16b, v1.16b, v1.16b, #4 ins v0.h[7], w14 b.gt 1b ret endfunc .endm output_lag 1 output_lag 2 output_lag 3 function sum_lag1_above_neon sub x12, x0, #1*GRAIN_WIDTH*2 - 16 ld1 {v18.8h}, [x12] // load top right ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right smull v4.4s, v17.4h, v28.4h smlal v4.4s, v0.4h, v27.4h smlal v4.4s, v1.4h, v29.4h smull2 v5.4s, v17.8h, v28.8h smlal2 
v5.4s, v0.8h, v27.8h smlal2 v5.4s, v1.8h, v29.8h mov v16.16b, v17.16b mov v17.16b, v18.16b ret endfunc .macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff bl sum_\lag\()_above_neon .ifc \type, uv_420 add x12, x19, #GRAIN_WIDTH*2 ld1 {v22.8h, v23.8h}, [x19], #32 ld1 {v24.8h, v25.8h}, [x12] addp v22.8h, v22.8h, v23.8h addp v23.8h, v24.8h, v25.8h add v22.8h, v22.8h, v23.8h srshr v0.8h, v22.8h, #2 .endif .ifc \type, uv_422 ld1 {v22.8h, v23.8h}, [x19], #32 addp v22.8h, v22.8h, v23.8h srshr v0.8h, v22.8h, #1 .endif .ifc \type, uv_444 ld1 {v0.8h}, [x19], #16 .endif .if \uv_layout .ifnb \uv_coeff dup v1.8b, \uv_coeff sxtl v1.8h, v1.8b smlal v4.4s, v0.4h, v1.4h smlal2 v5.4s, v0.8h, v1.8h .else smlal v4.4s, v0.4h, v30.4h smlal2 v5.4s, v0.8h, v30.8h .endif .endif .if \uv_layout && \elems == 8 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 444 && \elems == 7 b sum_\lag\()_y_\edge\()_start .elseif \uv_layout == 422 && \elems == 1 b sum_\lag\()_uv_420_\edge\()_start .else sum_\lag\()_\type\()_\edge\()_start: .if \elems > 4 .ifc \edge, left increment_seed 4 read_rand x12, 11, 3 read_rand x13, 11, 2 read_rand x14, 11, 1 add x12, x3, x12, lsl #1 add x13, x3, x13, lsl #1 add x14, x3, x14, lsl #1 ld1 {v0.h}[5], [x12] ld1 {v0.h}[6], [x13] ld1 {v0.h}[7], [x14] lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 srshl v0.8h, v0.8h, v31.8h ext v4.16b, v4.16b, v4.16b, #12 .ifc \lag, lag3 smov w17, v0.h[5] .endif .ifnc \lag, lag1 smov w16, v0.h[6] .endif smov w14, v0.h[7] mov v1.16b, v4.16b mov w15, #1 bl output_\lag\()_neon .else increment_seed 4, shift=0 mov v1.16b, v4.16b mov w15, #4 bl output_\lag\()_neon .endif increment_seed 4, shift=0 mov v1.16b, v5.16b .ifc \edge, right mov w15, #3 bl output_\lag\()_neon read_shift_rand x15, 11 add x15, x3, x15, lsl #1 ld1 {v1.h}[0], [x15] srshl v1.4h, v1.4h, v31.4h ext v0.16b, v0.16b, v1.16b, #2 .else mov w15, #4 bl output_\lag\()_neon .endif .else // elems == 1 increment_seed 4, shift=0 mov 
v1.16b, v4.16b mov w15, #1 bl output_\lag\()_neon lsr w2, w2, #3 read_rand x12, 11, 2 read_rand x13, 11, 1 read_rand x14, 11, 0 add x12, x3, x12, lsl #1 add x13, x3, x13, lsl #1 add x14, x3, x14, lsl #1 ld1 {v1.h}[0], [x12] ld1 {v1.h}[1], [x13] ld1 {v1.h}[2], [x14] srshl v1.4h, v1.4h, v31.4h ext v0.16b, v0.16b, v1.16b, #14 .endif st1 {v0.8h}, [x0], #16 ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret .endif .endm .macro sum_lag1_func type, uv_layout, edge, elems=8 function sum_\type\()_lag1_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! .ifc \edge, left sub x12, x0, #1*GRAIN_WIDTH*2 ld1 {v17.8h}, [x12] // load the previous block right above .endif sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems endfunc .endm sum_lag1_func y, 0, left sum_lag1_func y, 0, mid sum_lag1_func y, 0, right, 7 sum_lag1_func uv_444, 444, left sum_lag1_func uv_444, 444, mid sum_lag1_func uv_444, 444, right, 7 sum_lag1_func uv_422, 422, left sum_lag1_func uv_422, 422, mid sum_lag1_func uv_422, 422, right, 1 sum_lag1_func uv_420, 420, left sum_lag1_func uv_420, 420, mid sum_lag1_func uv_420, 420, right, 1 function sum_lag2_above_neon sub x12, x0, #2*GRAIN_WIDTH*2 - 16 sub x13, x0, #1*GRAIN_WIDTH*2 - 16 ld1 {v18.8h}, [x12] // load top right ld1 {v21.8h}, [x13] dup v26.8b, v30.b[0] ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid dup v27.8b, v30.b[1] ext v23.16b, v16.16b, v17.16b, #14 sxtl v26.8h, v26.8b dup v28.8b, v30.b[3] ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right sxtl v27.8h, v27.8b dup v29.8b, v30.b[4] ext v1.16b, v17.16b, v18.16b, #4 sxtl v28.8h, v28.8b sxtl v29.8h, v29.8b smull v4.4s, v22.4h, v26.4h smlal v4.4s, v23.4h, v27.4h smlal v4.4s, v0.4h, v28.4h smlal v4.4s, v1.4h, v29.4h smull2 v5.4s, v22.8h, v26.8h smlal2 v5.4s, v23.8h, v27.8h smlal2 v5.4s, v0.8h, v28.8h smlal2 v5.4s, v1.8h, v29.8h dup v26.16b, v30.b[5] ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid dup v27.16b, v30.b[6] ext v23.16b, v19.16b, v20.16b, #14 sxtl v26.8h, v26.8b 
dup v28.16b, v30.b[8] ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right sxtl v27.8h, v27.8b dup v29.16b, v30.b[9] ext v1.16b, v20.16b, v21.16b, #4 sxtl v28.8h, v28.8b sxtl v29.8h, v29.8b smlal v4.4s, v22.4h, v26.4h smlal v4.4s, v23.4h, v27.4h smlal v4.4s, v0.4h, v28.4h smlal v4.4s, v1.4h, v29.4h smlal2 v5.4s, v22.8h, v26.8h smlal2 v5.4s, v23.8h, v27.8h smlal2 v5.4s, v0.8h, v28.8h smlal2 v5.4s, v1.8h, v29.8h dup v26.16b, v30.b[2] dup v27.16b, v30.b[7] sxtl v26.8h, v26.8b sxtl v27.8h, v27.8b smlal v4.4s, v17.4h, v26.4h smlal v4.4s, v20.4h, v27.4h smlal2 v5.4s, v17.8h, v26.8h smlal2 v5.4s, v20.8h, v27.8h mov v16.16b, v17.16b mov v17.16b, v18.16b mov v19.16b, v20.16b mov v20.16b, v21.16b ret endfunc .macro sum_lag2_func type, uv_layout, edge, elems=8 function sum_\type\()_lag2_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! .ifc \edge, left sub x12, x0, #2*GRAIN_WIDTH*2 sub x13, x0, #1*GRAIN_WIDTH*2 ld1 {v17.8h}, [x12] // load the previous block right above ld1 {v20.8h}, [x13] .endif sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] endfunc .endm sum_lag2_func y, 0, left sum_lag2_func y, 0, mid sum_lag2_func y, 0, right, 7 sum_lag2_func uv_444, 444, left sum_lag2_func uv_444, 444, mid sum_lag2_func uv_444, 444, right, 7 sum_lag2_func uv_422, 422, left sum_lag2_func uv_422, 422, mid sum_lag2_func uv_422, 422, right, 1 sum_lag2_func uv_420, 420, left sum_lag2_func uv_420, 420, mid sum_lag2_func uv_420, 420, right, 1 function sum_lag3_above_neon sub x11, x0, #3*GRAIN_WIDTH*2 - 16 sub x12, x0, #2*GRAIN_WIDTH*2 - 16 sub x13, x0, #1*GRAIN_WIDTH*2 - 16 ld1 {v15.8h}, [x11] // load top right ld1 {v18.8h}, [x12] ld1 {v21.8h}, [x13] dup v22.8b, v29.b[0] ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid dup v23.8b, v29.b[1] ext v9.16b, v13.16b, v14.16b, #12 sxtl v22.8h, v22.8b dup v24.8b, v29.b[2] sxtl v23.8h, v23.8b dup v25.8b, v29.b[3] ext v10.16b, v13.16b, v14.16b, #14 sxtl v24.8h, v24.8b dup v26.8b, v29.b[4] ext v11.16b, v14.16b, v15.16b, 
#2 // top mid, top right sxtl v25.8h, v25.8b dup v27.8b, v29.b[5] ext v12.16b, v14.16b, v15.16b, #4 sxtl v26.8h, v26.8b dup v28.8b, v29.b[6] ext v13.16b, v14.16b, v15.16b, #6 sxtl v27.8h, v27.8b sxtl v28.8h, v28.8b smull v4.4s, v8.4h, v22.4h smlal v4.4s, v9.4h, v23.4h smlal v4.4s, v10.4h, v24.4h smlal v4.4s, v11.4h, v26.4h smlal v4.4s, v12.4h, v27.4h smlal v4.4s, v13.4h, v28.4h smlal v4.4s, v14.4h, v25.4h smull2 v5.4s, v8.8h, v22.8h smlal2 v5.4s, v9.8h, v23.8h smlal2 v5.4s, v10.8h, v24.8h smlal2 v5.4s, v11.8h, v26.8h smlal2 v5.4s, v12.8h, v27.8h smlal2 v5.4s, v13.8h, v28.8h smlal2 v5.4s, v14.8h, v25.8h dup v22.8b, v29.b[7] ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid dup v23.8b, v29.b[8] ext v9.16b, v16.16b, v17.16b, #12 sxtl v22.8h, v22.8b dup v24.8b, v29.b[9] sxtl v23.8h, v23.8b dup v25.8b, v29.b[10] ext v10.16b, v16.16b, v17.16b, #14 sxtl v24.8h, v24.8b dup v26.8b, v29.b[11] ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right sxtl v25.8h, v25.8b dup v27.8b, v29.b[12] ext v12.16b, v17.16b, v18.16b, #4 sxtl v26.8h, v26.8b dup v28.8b, v29.b[13] ext v13.16b, v17.16b, v18.16b, #6 sxtl v27.8h, v27.8b sxtl v28.8h, v28.8b smlal v4.4s, v8.4h, v22.4h smlal v4.4s, v9.4h, v23.4h smlal v4.4s, v10.4h, v24.4h smlal v4.4s, v11.4h, v26.4h smlal v4.4s, v12.4h, v27.4h smlal v4.4s, v13.4h, v28.4h smlal v4.4s, v17.4h, v25.4h smlal2 v5.4s, v8.8h, v22.8h smlal2 v5.4s, v9.8h, v23.8h smlal2 v5.4s, v10.8h, v24.8h smlal2 v5.4s, v11.8h, v26.8h smlal2 v5.4s, v12.8h, v27.8h smlal2 v5.4s, v13.8h, v28.8h smlal2 v5.4s, v17.8h, v25.8h dup v22.8b, v29.b[14] ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid dup v23.8b, v29.b[15] ext v9.16b, v19.16b, v20.16b, #12 sxtl v22.8h, v22.8b dup v24.8b, v30.b[0] sxtl v23.8h, v23.8b dup v25.8b, v30.b[1] ext v10.16b, v19.16b, v20.16b, #14 sxtl v24.8h, v24.8b dup v26.8b, v30.b[2] ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right sxtl v25.8h, v25.8b dup v27.8b, v30.b[3] ext v12.16b, v20.16b, v21.16b, #4 sxtl v26.8h, v26.8b dup v28.8b, 
v30.b[4] ext v13.16b, v20.16b, v21.16b, #6 sxtl v27.8h, v27.8b sxtl v28.8h, v28.8b smlal v4.4s, v8.4h, v22.4h smlal v4.4s, v9.4h, v23.4h smlal v4.4s, v10.4h, v24.4h smlal v4.4s, v11.4h, v26.4h smlal v4.4s, v12.4h, v27.4h smlal v4.4s, v13.4h, v28.4h smlal v4.4s, v20.4h, v25.4h mov v16.16b, v17.16b mov v17.16b, v18.16b smlal2 v5.4s, v8.8h, v22.8h smlal2 v5.4s, v9.8h, v23.8h smlal2 v5.4s, v10.8h, v24.8h smlal2 v5.4s, v11.8h, v26.8h smlal2 v5.4s, v12.8h, v27.8h smlal2 v5.4s, v13.8h, v28.8h smlal2 v5.4s, v20.8h, v25.8h mov v13.16b, v14.16b mov v14.16b, v15.16b mov v19.16b, v20.16b mov v20.16b, v21.16b ret endfunc .macro sum_lag3_func type, uv_layout, edge, elems=8 function sum_\type\()_lag3_\edge\()_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! .ifc \edge, left sub x11, x0, #3*GRAIN_WIDTH*2 sub x12, x0, #2*GRAIN_WIDTH*2 sub x13, x0, #1*GRAIN_WIDTH*2 ld1 {v14.8h}, [x11] // load the previous block right above ld1 {v17.8h}, [x12] ld1 {v20.8h}, [x13] .endif sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] endfunc .endm sum_lag3_func y, 0, left sum_lag3_func y, 0, mid sum_lag3_func y, 0, right, 7 sum_lag3_func uv_444, 444, left sum_lag3_func uv_444, 444, mid sum_lag3_func uv_444, 444, right, 7 sum_lag3_func uv_422, 422, left sum_lag3_func uv_422, 422, mid sum_lag3_func uv_422, 422, right, 1 sum_lag3_func uv_420, 420, left sum_lag3_func uv_420, 420, mid sum_lag3_func uv_420, 420, right, 1 function generate_grain_rows_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 1: mov w16, #80 2: bl get_gaussian_neon srshl v0.8h, v0.8h, v31.8h subs w16, w16, #8 st1 {v0.8h}, [x0], #16 b.gt 2b get_grain_2 v0 subs w1, w1, #1 st1 {v0.s}[0], [x0], #4 b.gt 1b ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function generate_grain_rows_44_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 
1: mov w16, #40 2: bl get_gaussian_neon srshl v0.8h, v0.8h, v31.8h subs w16, w16, #8 st1 {v0.8h}, [x0], #16 b.gt 2b get_grain_4 v0 subs w1, w1, #1 st1 {v0.4h}, [x0] add x0, x0, #GRAIN_WIDTH*2-80 b.gt 1b ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function gen_grain_uv_444_lag0_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! ld1 {v4.8h}, [x19], #16 gen_grain_uv_lag0_8_start: bl get_gaussian_neon srshl v0.8h, v0.8h, v31.8h gen_grain_uv_lag0_8_add: and v4.16b, v4.16b, v1.16b smull v2.4s, v4.4h, v27.4h smull2 v3.4s, v4.8h, v27.8h srshl v2.4s, v2.4s, v28.4s srshl v3.4s, v3.4s, v28.4s sqxtn v2.4h, v2.4s sqxtn2 v2.8h, v3.4s sqadd v2.8h, v2.8h, v0.8h smin v2.8h, v2.8h, v25.8h smax v2.8h, v2.8h, v26.8h st1 {v2.8h}, [x0], #16 ldr x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret endfunc function gen_grain_uv_420_lag0_8_neon AARCH64_SIGN_LINK_REGISTER add x12, x19, #GRAIN_WIDTH*2 str x30, [sp, #-16]! ld1 {v16.8h, v17.8h}, [x19], #32 ld1 {v18.8h, v19.8h}, [x12] addp v16.8h, v16.8h, v17.8h addp v17.8h, v18.8h, v19.8h add v16.8h, v16.8h, v17.8h srshr v4.8h, v16.8h, #2 b gen_grain_uv_lag0_8_start endfunc function gen_grain_uv_422_lag0_8_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! ld1 {v16.8h, v17.8h}, [x19], #32 addp v16.8h, v16.8h, v17.8h srshr v4.8h, v16.8h, #1 b gen_grain_uv_lag0_8_start endfunc function gen_grain_uv_420_lag0_4_neon add x12, x19, #GRAIN_WIDTH*2 AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! ld1 {v16.4h, v17.4h}, [x19] ld1 {v18.4h, v19.4h}, [x12] add x19, x19, #32 addp v16.4h, v16.4h, v17.4h addp v17.4h, v18.4h, v19.4h add v16.4h, v16.4h, v17.4h srshr v4.4h, v16.4h, #2 get_grain_4 v0 b gen_grain_uv_lag0_8_add endfunc function gen_grain_uv_422_lag0_4_neon AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-16]! 
ld1 {v16.4h, v17.4h}, [x19] add x19, x19, #32 addp v16.4h, v16.4h, v17.4h srshr v4.4h, v16.4h, #1 get_grain_4 v0 b gen_grain_uv_lag0_8_add endfunc .macro gen_grain_82 type function generate_grain_\type\()_16bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x30, x19, [sp, #-96]! .ifc \type, uv_444 mov w13, w3 mov w14, #28 add x19, x1, #3*GRAIN_WIDTH*2 mov x1, x2 mul w13, w13, w14 clz w15, w4 .else clz w15, w2 .endif movrel x3, X(gaussian_sequence) sub w15, w15, #24 // -bitdepth_min_8 ldr w2, [x1, #FGD_SEED] ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] .ifc \type, y add x4, x1, #FGD_AR_COEFFS_Y .else add x4, x1, #FGD_AR_COEFFS_UV .endif add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift add x16, x16, x17 neg v31.8h, v31.8h .ifc \type, uv_444 cmp w13, #0 mov w11, #0x49d8 mov w14, #0xb524 add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] csel w11, w11, w14, ne .endif ldr w7, [x1, #FGD_AR_COEFF_SHIFT] neg w15, w15 // bitdepth_min_8 mov w8, #1 mov w10, #1 lsl w8, w8, w7 // 1 << ar_coeff_shift lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) mov w5, #128 lsl w5, w5, w15 // 128 << bitdepth_min_8 neg w6, w5 // -(128 << bitpdeth_min_8) sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 .ifc \type, uv_444 eor w2, w2, w11 .endif br x16 L(generate_grain_\type\()_lag0): AARCH64_VALID_JUMP_TARGET .ifc \type, y mov w1, #GRAIN_HEIGHT bl generate_grain_rows_neon .else dup v28.4s, w7 ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] movi v0.16b, #0 movi v1.16b, #255 dup v25.8h, w5 dup v26.8h, w6 ext v29.16b, v0.16b, v1.16b, #10 ext v30.16b, v1.16b, v0.16b, #2 neg v28.4s, v28.4s sxtl v27.8h, v27.8b mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT-3 1: mov v1.16b, v29.16b bl gen_grain_uv_444_lag0_neon // 8 
movi v1.16b, #255 bl gen_grain_uv_444_lag0_neon // 16 bl gen_grain_uv_444_lag0_neon // 24 bl gen_grain_uv_444_lag0_neon // 32 bl gen_grain_uv_444_lag0_neon // 40 bl gen_grain_uv_444_lag0_neon // 48 bl gen_grain_uv_444_lag0_neon // 56 bl gen_grain_uv_444_lag0_neon // 64 bl gen_grain_uv_444_lag0_neon // 72 mov v1.16b, v30.16b bl gen_grain_uv_444_lag0_neon // 80 get_grain_2 v16 subs w1, w1, #1 add x19, x19, #4 st1 {v16.s}[0], [x0], #4 b.gt 1b .endif ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag1): AARCH64_VALID_JUMP_TARGET ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] ld1r {v29.8b}, [x4] // ar_coeffs_y[2] .ifc \type, y ldrsb w4, [x4, #1] // ar_coeffs_y[3] .else add x4, x4, #2 .endif mov w1, #3 .ifc \type, uv_444 ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] ldursb w4, [x4, #-1] // ar_coeffs_uv[3] .endif bl generate_grain_rows_neon sxtl v27.8h, v27.8b sxtl v28.8h, v28.8b sxtl v29.8h, v29.8b .ifc \type, uv_444 sxtl v30.8h, v30.8b .endif mov w1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag1_left_neon // 8 bl sum_\type\()_lag1_mid_neon // 16 bl sum_\type\()_lag1_mid_neon // 24 bl sum_\type\()_lag1_mid_neon // 32 bl sum_\type\()_lag1_mid_neon // 40 bl sum_\type\()_lag1_mid_neon // 48 bl sum_\type\()_lag1_mid_neon // 56 bl sum_\type\()_lag1_mid_neon // 64 bl sum_\type\()_lag1_mid_neon // 72 bl sum_\type\()_lag1_right_neon // 80 get_grain_2 v16 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #4 .endif st1 {v16.s}[0], [x0], #4 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag2): AARCH64_VALID_JUMP_TARGET ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] smov w4, v30.b[10] smov w17, v30.b[11] mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag2_left_neon // 8 bl sum_\type\()_lag2_mid_neon // 16 bl sum_\type\()_lag2_mid_neon // 24 bl sum_\type\()_lag2_mid_neon // 32 bl sum_\type\()_lag2_mid_neon // 40 bl 
sum_\type\()_lag2_mid_neon // 48 bl sum_\type\()_lag2_mid_neon // 56 bl sum_\type\()_lag2_mid_neon // 64 bl sum_\type\()_lag2_mid_neon // 72 bl sum_\type\()_lag2_right_neon // 80 get_grain_2 v16 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #4 .endif st1 {v16.s}[0], [x0], #4 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag3): AARCH64_VALID_JUMP_TARGET ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] stp x20, x21, [sp, #80] smov w4, v30.b[5] smov w20, v30.b[6] smov w21, v30.b[7] mov w1, #3 bl generate_grain_rows_neon mov w1, #GRAIN_HEIGHT - 3 1: bl sum_\type\()_lag3_left_neon // 8 bl sum_\type\()_lag3_mid_neon // 16 bl sum_\type\()_lag3_mid_neon // 24 bl sum_\type\()_lag3_mid_neon // 32 bl sum_\type\()_lag3_mid_neon // 40 bl sum_\type\()_lag3_mid_neon // 48 bl sum_\type\()_lag3_mid_neon // 56 bl sum_\type\()_lag3_mid_neon // 64 bl sum_\type\()_lag3_mid_neon // 72 bl sum_\type\()_lag3_right_neon // 80 get_grain_2 v16 subs w1, w1, #1 .ifc \type, uv_444 add x19, x19, #4 .endif st1 {v16.s}[0], [x0], #4 b.gt 1b ldp x20, x21, [sp, #80] ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl endjumptable .endm gen_grain_82 y gen_grain_82 uv_444 .macro set_height dst, type .ifc \type, uv_420 mov \dst, #SUB_GRAIN_HEIGHT-3 .else mov \dst, #GRAIN_HEIGHT-3 .endif .endm .macro increment_y_ptr reg, type .ifc \type, uv_420 add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) .else sub \reg, \reg, #6*32-GRAIN_WIDTH*2 .endif .endm .macro gen_grain_44 
type function generate_grain_\type\()_16bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x30, x19, [sp, #-96]! mov w13, w3 mov w14, #28 add x19, x1, #(3*GRAIN_WIDTH-3)*2 mov x1, x2 mul w13, w13, w14 clz w15, w4 movrel x3, X(gaussian_sequence) sub w15, w15, #24 // -bitdepth_min_8 ldr w2, [x1, #FGD_SEED] ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] add x4, x1, #FGD_AR_COEFFS_UV add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 movrel x16, gen_grain_\type\()_tbl ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 ldrsw x17, [x16, w17, uxtw #2] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift add x16, x16, x17 neg v31.8h, v31.8h cmp w13, #0 mov w11, #0x49d8 mov w14, #0xb524 add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] csel w11, w11, w14, ne ldr w7, [x1, #FGD_AR_COEFF_SHIFT] neg w15, w15 // bitdepth_min_8 mov w8, #1 mov w10, #1 lsl w8, w8, w7 // 1 << ar_coeff_shift lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) mov w5, #128 lsl w5, w5, w15 // 128 << bitdepth_min_8 neg w6, w5 // -(128 << bitpdeth_min_8) sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 eor w2, w2, w11 br x16 L(generate_grain_\type\()_lag0): AARCH64_VALID_JUMP_TARGET dup v28.4s, w7 ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] movi v0.16b, #0 movi v1.16b, #255 dup v25.8h, w5 dup v26.8h, w6 ext v29.16b, v0.16b, v1.16b, #10 ext v30.16b, v1.16b, v0.16b, #14 neg v28.4s, v28.4s sxtl v27.8h, v27.8b mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: mov v1.16b, v29.16b bl gen_grain_\type\()_lag0_8_neon // 8 movi v1.16b, #255 bl gen_grain_\type\()_lag0_8_neon // 16 bl gen_grain_\type\()_lag0_8_neon // 24 bl gen_grain_\type\()_lag0_8_neon // 32 bl gen_grain_\type\()_lag0_8_neon // 40 mov v1.16b, v30.16b bl gen_grain_\type\()_lag0_4_neon // 44 subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, #GRAIN_WIDTH*2-6*16 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret 
L(generate_grain_\type\()_lag1): AARCH64_VALID_JUMP_TARGET ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] add x4, x4, #2 mov w1, #3 ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] ldursb w4, [x4, #-1] // ar_coeffs_uv[3] bl generate_grain_rows_44_neon sxtl v27.8h, v27.8b sxtl v28.8h, v28.8b sxtl v29.8h, v29.8b sxtl v30.8h, v30.8b set_height w1, \type 1: bl sum_\type\()_lag1_left_neon // 8 bl sum_\type\()_lag1_mid_neon // 16 bl sum_\type\()_lag1_mid_neon // 24 bl sum_\type\()_lag1_mid_neon // 32 bl sum_\type\()_lag1_mid_neon // 40 bl sum_\type\()_lag1_right_neon // 44 subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, #GRAIN_WIDTH*2-6*16 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag2): AARCH64_VALID_JUMP_TARGET ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] smov w4, v30.b[10] smov w17, v30.b[11] mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: bl sum_\type\()_lag2_left_neon // 8 bl sum_\type\()_lag2_mid_neon // 16 bl sum_\type\()_lag2_mid_neon // 24 bl sum_\type\()_lag2_mid_neon // 32 bl sum_\type\()_lag2_mid_neon // 40 bl sum_\type\()_lag2_right_neon // 44 subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, #GRAIN_WIDTH*2-6*16 b.gt 1b ldp x30, x19, [sp], #96 AARCH64_VALIDATE_LINK_REGISTER ret L(generate_grain_\type\()_lag3): AARCH64_VALID_JUMP_TARGET ldr q29, [x4] // ar_coeffs_uv[0-15] ldr q30, [x4, #16] // ar_coeffs_uv[16-24] stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] stp x20, x21, [sp, #80] smov w4, v30.b[5] smov w20, v30.b[6] smov w21, v30.b[7] mov w1, #3 bl generate_grain_rows_44_neon set_height w1, \type 1: bl sum_\type\()_lag3_left_neon // 8 bl sum_\type\()_lag3_mid_neon // 16 bl sum_\type\()_lag3_mid_neon // 24 bl sum_\type\()_lag3_mid_neon // 32 bl sum_\type\()_lag3_mid_neon // 40 bl sum_\type\()_lag3_right_neon // 44 subs w1, w1, #1 increment_y_ptr x19, \type add x0, x0, 
#GRAIN_WIDTH*2-6*16
        b.gt 1b

        ldp x20, x21, [sp, #80]
        ldp d14, d15, [sp, #64]
        ldp d12, d13, [sp, #48]
        ldp d10, d11, [sp, #32]
        ldp d8, d9, [sp, #16]
        ldp x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm

gen_grain_44 uv_420
gen_grain_44 uv_422

// gather_interleaved: byte table lookup for eight 16-bit indices.
//
// For each index lane i (0..7) one byte is loaded from [x3 + index] into
// destination byte lane i + \off. Even lanes take their index from \src1 and
// are written to \dst1; odd lanes take their index from \src2 and are written
// to \dst2. x3 is the table base (presumably the scaling[] argument that the
// film grain entry points receive in x3 - confirm against the callers).
// The umov/add/ld1 steps of neighbouring lanes are interleaved over the four
// scratch registers x14-x17, all of which are clobbered.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov w14, \src1[0]
        umov w15, \src2[1]
        umov w16, \src1[2]
        add x14, x14, x3
        umov w17, \src2[3]
        add x15, x15, x3
        ld1 {\dst1}[0+\off], [x14]
        umov w14, \src1[4]
        add x16, x16, x3
        ld1 {\dst2}[1+\off], [x15]
        umov w15, \src2[5]
        add x17, x17, x3
        ld1 {\dst1}[2+\off], [x16]
        umov w16, \src1[6]
        add x14, x14, x3
        ld1 {\dst2}[3+\off], [x17]
        umov w17, \src2[7]
        add x15, x15, x3
        ld1 {\dst1}[4+\off], [x14]
        add x16, x16, x3
        ld1 {\dst2}[5+\off], [x15]
        add x17, x17, x3
        ld1 {\dst1}[6+\off], [x16]
        ld1 {\dst2}[7+\off], [x17]
.endm

// gather: full lookup for 32 indices held in four .8h registers.
//
// Each pair of gather_interleaved calls (second one with the roles swapped)
// fills all eight lanes of both destinations, so the result is:
//   \dst1 bytes 0-7  <- indices in \src1,  \dst1 bytes 8-15 <- indices in \src2
//   \dst2 bytes 0-7  <- indices in \src3,  \dst2 bytes 8-15 <- indices in \src4
.macro gather dst1, dst2, src1, src2, src3, src4
        gather_interleaved \dst1, \dst2, \src1, \src3, 0
        gather_interleaved \dst2, \dst1, \src3, \src1, 0
        gather_interleaved \dst1, \dst2, \src2, \src4, 8
        gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm

// gather32_neon: v6 = table bytes for the 16 indices in v0/v1,
//                v7 = table bytes for the 16 indices in v2/v3.
// Table base in x3; clobbers x14-x17. v0-v3 are only read.
function gather32_neon
        gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
        ret
endfunc

// gather16_neon: v6 = table bytes for the 16 indices in v0/v1.
// The lookups for v1 are first collected in the low half of v7 and then
// moved into the high half of v6, so v7's low half is clobbered as well as
// x14-x17. Table base in x3.
function gather16_neon
        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
        ins v6.d[1], v7.d[0]
        ret
endfunc

// Overlap blending weights, two rows of four 16-bit values each.
// fgy_32x32_16bpc_neon loads the first row into v27 and the second row into
// v28 with a single ld1 from this address.
const overlap_coeffs_0, align=4
        .short 27, 17, 0, 0
        .short 17, 27, 32, 32
endconst

// Same layout as overlap_coeffs_0 with a different set of weights.
const overlap_coeffs_1, align=4
        .short 23, 0, 0, 0
        .short 22, 32, 32, 32
endconst

// calc_offset: split a random value into grain offsets.
//   \offy = \src & 0xF, \offx = \src >> 4
// Each result is doubled when the matching flag (\sy for offy, \sx for offx)
// is 0. \offx/\offy may alias \src only as used below: \offy is written
// first from \src, then \offx from \src.
.macro calc_offset offx, offy, src, sx, sy
        and \offy, \src, #0xF // randval & 0xF
        lsr \offx, \src, #4 // randval >> 4
.if \sy == 0
        add \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm

// add_offset: \dst = \src + \stride * \offy + 2 * \offx.
// \offx is a 32-bit register that is zero-extended and shifted left by one
// (uxtw #1), i.e. it counts 2-byte entries; \stride and \offy are 64-bit.
.macro add_offset dst, offx, offy, src, stride
        madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add \dst, \dst, \offx, uxtw #1 // grain_lut += offx
.endm

// void dav2d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
//                                 const ptrdiff_t stride,
//                                 const uint8_t scaling[SCALING_SIZE],
//                                 const int scaling_shift,
//                                 const entry grain_lut[][GRAIN_WIDTH],
//                                 const int offsets[][2],
//                                 const int h, const ptrdiff_t clip,
//                                 const ptrdiff_t type,
//                                 const int bitdepth_max);
//
// Register arguments as used below: w4 = scaling_shift, x5 = grain_lut,
// x6 = offsets. The prologue pushes an 80-byte frame (x30, d8-d14), so the
// stack arguments are read at [sp, #80] (clip), [sp, #88] (type) and
// [sp, #96] (bitdepth_max).
function fgy_32x32_16bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str x30, [sp, #-80]!
        stp d8, d9, [sp, #16]
        stp d10, d11, [sp, #32]
        stp d12, d13, [sp, #48]
        str d14, [sp, #64]
        eor w4, w4, #15 // 15 - scaling_shift
        ldr w11, [x6, #8] // offsets[1][0]
        ldr w13, [x6, #4] // offsets[0][1]
        ldr w15, [x6, #12] // offsets[1][1]
        ldr w10, [sp, #96] // bitdepth_max
        ldr w6, [x6] // offsets[0][0]
        dup v26.8h, w10 // bitdepth_max
        clz w10, w10
        ldr w8, [sp, #80] // clip
        sub w10, w10, #24 // -bitdepth_min_8
        mov x9, #GRAIN_WIDTH*2 // grain_lut stride
        neg w10, w10 // bitdepth_min_8

        dup v29.8h, w4 // 15 - scaling_shift
        dup v27.8h, w10 // bitdepth_min_8

        movrel x16, overlap_coeffs_0

        // v30/v31 become the lower/upper output limits.
        cbz w8, 1f
        // clip
        movi v30.8h, #16
        movi v31.8h, #235
        sshl v30.8h, v30.8h, v27.8h
        sshl v31.8h, v31.8h, v27.8h
        b 2f
1:
        // no clip
        movi v30.8h, #0
        mov v31.16b, v26.16b // bitdepth_max
2:

        ushr v26.8h, v26.8h, #1 // grain_max
        not v25.16b, v26.16b // grain_min

        // v27 (bitdepth_min_8) is no longer needed and is reused here.
        ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs

        add x5, x5, #18 // grain_lut += 9
        add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride
        add x5, x5, x9 // grain_lut += grain_stride

        // One (offx, offy) pair per offsets[][] entry loaded above.
        calc_offset w11, w12, w11, 0, 0
        calc_offset w13, w14, w13, 0, 0
        calc_offset w15, w16, w15, 0, 0
        calc_offset w6, w10, w6, 0, 0

        // x12/x14/x16 get the three neighbour grain pointers; x5 is updated
        // last because it is the common base of all four.
        add_offset x12, w11, x12, x5, x9
        add_offset x14, w13, x14, x5, x9
        add_offset x16, w15, x16, x5, x9
        add_offset x5, w6, x10, x5, x9

        ldr w11, [sp, #88] // type
        movrel x13, fgy_loop_tbl

        add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
        add x6,
x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 ldrsw x11, [x13, w11, uxtw #2] add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx add x11, x13, x11 b.eq 1f // y overlap dup v8.8h, v27.h[0] dup v9.8h, v27.h[1] mov w10, w7 // backup actual h mov w7, #2 1: br x11 endfunc function fgy_loop_neon .macro fgy ox, oy L(loop_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src .if \ox ld1 {v20.4h}, [x4], x9 // grain_lut old .endif .if \oy ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top .endif .if \ox && \oy ld1 {v14.4h}, [x8], x9 // grain_lut top old .endif mvni v4.8h, #0xf0, lsl #8 // 0x0fff ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. and v0.16b, v0.16b, v4.16b and v1.16b, v1.16b, v4.16b and v2.16b, v2.16b, v4.16b and v3.16b, v3.16b, v4.16b bl gather32_neon .if \ox smull v20.4s, v20.4h, v27.4h smlal v20.4s, v16.4h, v28.4h .endif .if \oy .if \ox smull v14.4s, v14.4h, v27.4h smlal v14.4s, v21.4h, v28.4h sqrshrn v20.4h, v20.4s, #5 sqrshrn v14.4h, v14.4s, #5 smin v20.4h, v20.4h, v26.4h smin v14.4h, v14.4h, v26.4h smax v20.4h, v20.4h, v25.4h smax v14.4h, v14.4h, v25.4h .endif .if \ox smull v10.4s, v20.4h, v9.4h .else smull v10.4s, v16.4h, v9.4h .endif smull2 v11.4s, v16.8h, v9.8h smull v12.4s, v17.4h, v9.4h smull2 v13.4s, v17.8h, v9.8h smull v16.4s, v18.4h, v9.4h smull2 v17.4s, v18.8h, v9.8h smull v18.4s, v19.4h, v9.4h smull2 v19.4s, v19.8h, v9.8h .if \ox smlal v10.4s, v14.4h, v8.4h .else smlal v10.4s, v21.4h, v8.4h .endif smlal2 v11.4s, v21.8h, v8.8h smlal v12.4s, v22.4h, v8.4h smlal2 v13.4s, v22.8h, v8.8h smlal v16.4s, v23.4h, v8.4h smlal2 v17.4s, v23.8h, v8.8h smlal v18.4s, v24.4h, v8.4h smlal2 v19.4s, v24.8h, v8.8h sqrshrn v10.4h, v10.4s, #5 sqrshrn2 v10.8h, v11.4s, #5 sqrshrn 
v11.4h, v12.4s, #5 sqrshrn2 v11.8h, v13.4s, #5 sqrshrn v12.4h, v16.4s, #5 sqrshrn2 v12.8h, v17.4s, #5 sqrshrn v13.4h, v18.4s, #5 sqrshrn2 v13.8h, v19.4s, #5 smin v16.8h, v10.8h, v26.8h smin v17.8h, v11.8h, v26.8h smin v18.8h, v12.8h, v26.8h smin v19.8h, v13.8h, v26.8h smax v16.8h, v16.8h, v25.8h smax v17.8h, v17.8h, v25.8h smax v18.8h, v18.8h, v25.8h smax v19.8h, v19.8h, v25.8h .endif uxtl v4.8h, v6.8b // scaling .if \ox && !\oy sqrshrn v20.4h, v20.4s, #5 .endif uxtl2 v5.8h, v6.16b .if \ox && !\oy smin v20.4h, v20.4h, v26.4h .endif uxtl v6.8h, v7.8b .if \ox && !\oy smax v20.4h, v20.4h, v25.4h .endif uxtl2 v7.8h, v7.16b .if \ox && !\oy ins v16.d[0], v20.d[0] .endif ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) ushl v5.8h, v5.8h, v29.8h ushl v6.8h, v6.8h, v29.8h ushl v7.8h, v7.8h, v29.8h sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) sqrdmulh v21.8h, v17.8h, v5.8h sqrdmulh v22.8h, v18.8h, v6.8h sqrdmulh v23.8h, v19.8h, v7.8h usqadd v0.8h, v20.8h // *src + noise usqadd v1.8h, v21.8h usqadd v2.8h, v22.8h usqadd v3.8h, v23.8h umax v0.8h, v0.8h, v30.8h umax v1.8h, v1.8h, v30.8h umax v2.8h, v2.8h, v30.8h umax v3.8h, v3.8h, v30.8h umin v0.8h, v0.8h, v31.8h umin v1.8h, v1.8h, v31.8h umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h subs w7, w7, #1 .if \oy dup v8.8h, v28.h[0] dup v9.8h, v28.h[1] .endif st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst b.gt 1b .if \oy cmp w10, #2 sub w7, w10, #2 // restore actual remaining h b.gt L(loop_\ox\()0) .endif ldr d14, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret .endm fgy 0, 0 fgy 0, 1 fgy 1, 0 fgy 1, 1 endfunc jumptable fgy_loop_tbl .word L(loop_00) - fgy_loop_tbl .word L(loop_01) - fgy_loop_tbl .word L(loop_10) - fgy_loop_tbl .word L(loop_11) - fgy_loop_tbl endjumptable // void dav2d_fguv_32x32_420_16bpc_neon(pixel *const dst, // const pixel *const src, // const ptrdiff_t stride, // 
const uint8_t scaling[SCALING_SIZE], // const Dav2dFilmGrainData *const data, // const entry grain_lut[][GRAIN_WIDTH], // const pixel *const luma_row, // const ptrdiff_t luma_stride, // const int offsets[][2], // const ptrdiff_t h, const ptrdiff_t uv, // const ptrdiff_t is_id, // const ptrdiff_t type, // const int bitdepth_max); .macro fguv layout, sx, sy function fguv_32x32_\layout\()_16bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER str x30, [sp, #-80]! stp d8, d9, [sp, #16] stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] stp d14, d15, [sp, #64] ldp x8, x9, [sp, #80] // offsets, h ldp x10, x11, [sp, #96] // uv, is_id ldr w16, [sp, #120] // bitdepth_max ldr w13, [x4, #FGD_SCALING_SHIFT] ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] dup v23.8h, w16 // bitdepth_max clz w16, w16 eor w13, w13, #15 // 15 - scaling_shift sub w16, w16, #24 // -bitdepth_min_8 // !csfl add x10, x4, x10, lsl #2 // + 4*uv add x14, x10, #FGD_UV_LUMA_MULT add x15, x10, #FGD_UV_MULT add x10, x10, #FGD_UV_OFFSET neg w16, w16 // bitdepth_min_8 ld1r {v8.8h}, [x14] // uv_luma_mult ld1r {v24.8h}, [x10] // uv_offset ld1r {v9.8h}, [x15] // uv_mult dup v29.8h, w13 // 15 - scaling_shift dup v27.8h, w16 // bitdepth_min_8 cbz w12, 1f // clip movi v30.8h, #16 movi v31.8h, #240 sshl v30.8h, v30.8h, v27.8h sshl v31.8h, v31.8h, v27.8h cbz w11, 2f // is_id movi v31.8h, #235 sshl v31.8h, v31.8h, v27.8h b 2f 1: // no clip movi v30.8h, #0 mov v31.16b, v23.16b // bitdepth_max 2: ushr v15.8h, v23.8h, #1 // grain_max sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 not v14.16b, v15.16b // grain_min ldr w12, [x8, #8] // offsets[1][0] ldr w14, [x8, #4] // offsets[0][1] ldr w16, [x8, #12] // offsets[1][1] ldr w8, [x8] // offsets[0][0] mov x10, #GRAIN_WIDTH*2 // grain_lut stride add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 .if \sy add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride .else add x5, x5, x10, lsl #3 // grain_lut += 8 * 
grain_stride add x5, x5, x10 // grain_lut += grain_stride .endif calc_offset w12, w13, w12, \sx, \sy calc_offset w14, w15, w14, \sx, \sy calc_offset w16, w17, w16, \sx, \sy calc_offset w8, w11, w8, \sx, \sy add_offset x13, w12, x13, x5, x10 add_offset x15, w14, x15, x5, x10 add_offset x17, w16, x17, x5, x10 add_offset x5, w8, x11, x5, x10 add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx ldr w13, [sp, #112] // type movrel x16, overlap_coeffs_\sx movrel x14, fguv_loop_sx\sx\()_tbl ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs tst w13, #1 ldrsw x13, [x14, w13, uxtw #2] b.eq 1f // y overlap sub w12, w9, #(2 >> \sy) // backup remaining h mov w9, #(2 >> \sy) 1: add x13, x14, x13 .if \sy movi v25.8h, #23 movi v26.8h, #22 .else movi v25.8h, #27 movi v26.8h, #17 .endif .if \sy add x7, x7, x7 // luma_stride *= 2 .endif br x13 endfunc .endm fguv 420, 1, 1 fguv 422, 1, 0 fguv 444, 0, 0 function fguv_loop_sx0_neon .macro fguv_loop_sx0 csfl, ox, oy L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: .if \ox ld1 {v4.4h}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v5.4h}, [x11], x10 // grain_lut top old .endif ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut .if \ox smull v4.4s, v4.4h, v27.4h smlal v4.4s, v16.4h, v28.4h .endif .if \oy .if \ox smull v5.4s, v5.4h, v27.4h smlal v5.4s, v0.4h, v28.4h sqrshrn v4.4h, v4.4s, #5 sqrshrn v5.4h, v5.4s, #5 smin v4.4h, v4.4h, v15.4h smin v5.4h, v5.4h, v15.4h smax v4.4h, v4.4h, v14.4h smax v5.4h, v5.4h, v14.4h ins v16.d[0], v4.d[0] ins v0.d[0], v5.d[0] .endif smull v6.4s, v16.4h, v26.4h smull2 v7.4s, v16.8h, v26.8h smull v10.4s, v17.4h, v26.4h smull2 v11.4s, v17.8h, v26.8h smull v16.4s, 
v18.4h, v26.4h smull2 v17.4s, v18.8h, v26.8h smull v18.4s, v19.4h, v26.4h smull2 v19.4s, v19.8h, v26.8h smlal v6.4s, v0.4h, v25.4h smlal2 v7.4s, v0.8h, v25.8h smlal v10.4s, v1.4h, v25.4h smlal2 v11.4s, v1.8h, v25.8h smlal v16.4s, v2.4h, v25.4h smlal2 v17.4s, v2.8h, v25.8h smlal v18.4s, v3.4h, v25.4h smlal2 v19.4s, v3.8h, v25.8h sqrshrn v6.4h, v6.4s, #5 sqrshrn2 v6.8h, v7.4s, #5 sqrshrn v7.4h, v10.4s, #5 sqrshrn2 v7.8h, v11.4s, #5 sqrshrn v10.4h, v16.4s, #5 sqrshrn2 v10.8h, v17.4s, #5 sqrshrn v11.4h, v18.4s, #5 sqrshrn2 v11.8h, v19.4s, #5 .endif .if \ox && !\oy sqrshrn v4.4h, v4.4s, #5 smin v4.4h, v4.4h, v15.4h .endif ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma .if \oy smin v16.8h, v6.8h, v15.8h smin v17.8h, v7.8h, v15.8h smin v18.8h, v10.8h, v15.8h smin v19.8h, v11.8h, v15.8h smax v16.8h, v16.8h, v14.8h smax v17.8h, v17.8h, v14.8h smax v18.8h, v18.8h, v14.8h smax v19.8h, v19.8h, v14.8h .endif .if \ox && !\oy smax v4.4h, v4.4h, v14.4h .endif ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src .if \ox && !\oy ins v16.d[0], v4.d[0] .endif .if !\csfl smull v4.4s, v0.4h, v8.4h smull2 v5.4s, v0.8h, v8.8h smull v6.4s, v1.4h, v8.4h smull2 v7.4s, v1.8h, v8.8h smull v0.4s, v2.4h, v8.4h smull2 v1.4s, v2.8h, v8.8h smull v2.4s, v3.4h, v8.4h smull2 v3.4s, v3.8h, v8.8h smlal v4.4s, v10.4h, v9.4h smlal2 v5.4s, v10.8h, v9.8h smlal v6.4s, v11.4h, v9.4h smlal2 v7.4s, v11.8h, v9.8h smlal v0.4s, v12.4h, v9.4h smlal2 v1.4s, v12.8h, v9.8h smlal v2.4s, v13.4h, v9.4h smlal2 v3.4s, v13.8h, v9.8h shrn v4.4h, v4.4s, #6 shrn2 v4.8h, v5.4s, #6 shrn v5.4h, v6.4s, #6 shrn2 v5.8h, v7.4s, #6 shrn v6.4h, v0.4s, #6 shrn2 v6.8h, v1.4s, #6 shrn v7.4h, v2.4s, #6 shrn2 v7.8h, v3.4s, #6 add v0.8h, v4.8h, v24.8h add v1.8h, v5.8h, v24.8h add v2.8h, v6.8h, v24.8h add v3.8h, v7.8h, v24.8h movi v20.8h, #0 smin v0.8h, v0.8h, v23.8h smin v1.8h, v1.8h, v23.8h smin v2.8h, v2.8h, v23.8h smin v3.8h, v3.8h, v23.8h smax v0.8h, v0.8h, v20.8h smax v1.8h, v1.8h, v20.8h smax v2.8h, v2.8h, v20.8h smax v3.8h, 
v3.8h, v20.8h .else // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. and v0.16b, v0.16b, v23.16b and v1.16b, v1.16b, v23.16b and v2.16b, v2.16b, v23.16b and v3.16b, v3.16b, v23.16b .endif bl gather32_neon uxtl v4.8h, v6.8b // scaling uxtl2 v5.8h, v6.16b uxtl v6.8h, v7.8b uxtl2 v7.8h, v7.16b ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) ushl v5.8h, v5.8h, v29.8h ushl v6.8h, v6.8h, v29.8h ushl v7.8h, v7.8h, v29.8h sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) sqrdmulh v17.8h, v17.8h, v5.8h sqrdmulh v18.8h, v18.8h, v6.8h sqrdmulh v19.8h, v19.8h, v7.8h usqadd v10.8h, v16.8h // *src + noise usqadd v11.8h, v17.8h usqadd v12.8h, v18.8h usqadd v13.8h, v19.8h umax v0.8h, v10.8h, v30.8h umax v1.8h, v11.8h, v30.8h umax v2.8h, v12.8h, v30.8h umax v3.8h, v13.8h, v30.8h umin v0.8h, v0.8h, v31.8h umin v1.8h, v1.8h, v31.8h umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h subs w9, w9, #1 .if \oy dup v25.8h, v28.h[0] dup v26.8h, v28.h[1] .endif st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx0 0, 0, 0 fguv_loop_sx0 0, 0, 1 fguv_loop_sx0 0, 1, 0 fguv_loop_sx0 0, 1, 1 fguv_loop_sx0 1, 0, 0 fguv_loop_sx0 1, 0, 1 fguv_loop_sx0 1, 1, 0 fguv_loop_sx0 1, 1, 1 9: ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl .word 
L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl endjumptable function fguv_loop_sx1_neon .macro fguv_loop_sx1 csfl, ox, oy L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): AARCH64_VALID_JUMP_TARGET 1: .if \ox ld1 {v18.4h}, [x4], x10 // grain_lut old .endif .if \oy ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top .endif .if \ox && \oy ld1 {v19.4h}, [x11], x10 // grain_lut top old .endif ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut .if \ox smull v18.4s, v18.4h, v27.4h smlal v18.4s, v16.4h, v28.4h .endif .if \oy .if \ox smull v19.4s, v19.4h, v27.4h smlal v19.4s, v20.4h, v28.4h sqrshrn v18.4h, v18.4s, #5 sqrshrn v19.4h, v19.4s, #5 smin v18.4h, v18.4h, v15.4h smin v19.4h, v19.4h, v15.4h smax v18.4h, v18.4h, v14.4h smax v19.4h, v19.4h, v14.4h ins v16.d[0], v18.d[0] ins v20.d[0], v19.d[0] .endif smull v0.4s, v16.4h, v26.4h smull2 v1.4s, v16.8h, v26.8h smull v2.4s, v17.4h, v26.4h smull2 v3.4s, v17.8h, v26.8h smlal v0.4s, v20.4h, v25.4h smlal2 v1.4s, v20.8h, v25.8h smlal v2.4s, v21.4h, v25.4h smlal2 v3.4s, v21.8h, v25.8h sqrshrn v16.4h, v0.4s, #5 sqrshrn2 v16.8h, v1.4s, #5 sqrshrn v17.4h, v2.4s, #5 sqrshrn2 v17.8h, v3.4s, #5 .endif .if \ox && !\oy sqrshrn v18.4h, v18.4s, #5 smin v18.4h, v18.4h, v15.4h .endif ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma .if \oy smin v16.8h, v16.8h, v15.8h smin v17.8h, v17.8h, v15.8h smax v16.8h, v16.8h, v14.8h smax v17.8h, v17.8h, v14.8h .endif .if \ox && !\oy smax v18.4h, v18.4h, v14.4h .endif ld1 {v10.8h, v11.8h}, [x1], x2 // src .if \ox && !\oy ins v16.d[0], v18.d[0] .endif addp v0.8h, v0.8h, v1.8h addp v1.8h, v2.8h, v3.8h urshr v0.8h, v0.8h, #1 urshr v1.8h, v1.8h, #1 .if !\csfl smull v2.4s, v0.4h, v8.4h smull2 v3.4s, v0.8h, v8.8h smull v0.4s, v1.4h, v8.4h smull2 v1.4s, v1.8h, v8.8h smlal v2.4s, v10.4h, v9.4h smlal2 v3.4s, v10.8h, v9.8h smlal v0.4s, v11.4h, v9.4h smlal2 v1.4s, v11.8h, v9.8h shrn v2.4h, v2.4s, #6 shrn2 v2.8h, v3.4s, #6 shrn v3.4h, v0.4s, #6 shrn2 v3.8h, v1.4s, #6 add v0.8h, v2.8h, v24.8h add v1.8h, v3.8h, v24.8h movi v2.8h, #0 
smin v0.8h, v0.8h, v23.8h smin v1.8h, v1.8h, v23.8h smax v0.8h, v0.8h, v2.8h smax v1.8h, v1.8h, v2.8h .else // Make sure that uninitialized pixels out of range past the right // edge are in range; their actual values shouldn't matter. and v0.16b, v0.16b, v23.16b and v1.16b, v1.16b, v23.16b .endif bl gather16_neon uxtl v4.8h, v6.8b // scaling uxtl2 v5.8h, v6.16b ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) ushl v5.8h, v5.8h, v29.8h sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) sqrdmulh v17.8h, v17.8h, v5.8h usqadd v10.8h, v16.8h // *src + noise usqadd v11.8h, v17.8h umax v0.8h, v10.8h, v30.8h umax v1.8h, v11.8h, v30.8h umin v0.8h, v0.8h, v31.8h umin v1.8h, v1.8h, v31.8h .if \oy mov v16.16b, v25.16b .endif subs w9, w9, #1 .if \oy mov v25.16b, v26.16b mov v26.16b, v16.16b .endif st1 {v0.8h, v1.8h}, [x0], x2 // dst b.gt 1b .if \oy cmp w12, #0 mov w9, w12 // restore actual remaining h b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) .endif b 9f .endm fguv_loop_sx1 0, 0, 0 fguv_loop_sx1 0, 0, 1 fguv_loop_sx1 0, 1, 0 fguv_loop_sx1 0, 1, 1 fguv_loop_sx1 1, 0, 0 fguv_loop_sx1 1, 0, 1 fguv_loop_sx1 1, 1, 0 fguv_loop_sx1 1, 1, 1 9: ldp d14, d15, [sp, #64] ldp d12, d13, [sp, #48] ldp d10, d11, [sp, #32] ldp d8, d9, [sp, #16] ldr x30, [sp], #80 AARCH64_VALIDATE_LINK_REGISTER ret endfunc jumptable fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl endjumptable dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/ipred.S000066400000000000000000007047101517466257200231030ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d 
authors * Copyright © 2019, Martin Storsjo * Copyright © 2026, Arm Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Fills a width x height block at dst with the constant byte 128. topleft
// (x2) is not read.
//
// Dispatch: clz(width) - 25 is 0 for width 64 up to 4 for width 4 and indexes
// ipred_dc_128_tbl, whose 32-bit entries are offsets relative to the table.
// Rows are written through two pointers, x0 (even rows) and x6 = dst + stride
// (odd rows), both advancing by the doubled stride in x1. Every loop pass
// stores four rows and height (w4) is only tested after that, so height is
// presumably always a multiple of 4 - confirm against the callers.
function ipred_dc_128_8bpc_neon, export=1
        clz w3, w3
        movrel x5, ipred_dc_128_tbl
        sub w3, w3, #25
        ldrsw x3, [x5, w3, uxtw #2]
        movi v0.16b, #128
        add x5, x5, x3
        add x6, x0, x1
        lsl x1, x1, #1
        br x5
40:
        // width 4: one 32-bit lane per row
        AARCH64_VALID_JUMP_TARGET
4:
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 4b
        ret
80:
        // width 8
        AARCH64_VALID_JUMP_TARGET
8:
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 8b
        ret
160:
        // width 16
        AARCH64_VALID_JUMP_TARGET
16:
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 16b
        ret
320:
        // width 32: needs a second register holding the same constant
        AARCH64_VALID_JUMP_TARGET
        movi v1.16b, #128
32:
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        b.gt 32b
        ret
640:
        // width 64: four registers per row
        AARCH64_VALID_JUMP_TARGET
        movi v1.16b, #128
        movi v2.16b, #128
        movi v3.16b, #128
64:
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt 64b
        ret
endfunc

// Width handlers of ipred_dc_128_8bpc_neon, ordered 64, 32, 16, 8, 4 to match
// the clz(width) - 25 index.
jumptable ipred_dc_128_tbl
        .word 640b - ipred_dc_128_tbl
        .word 320b - ipred_dc_128_tbl
        .word 160b - ipred_dc_128_tbl
        .word 80b - ipred_dc_128_tbl
        .word 40b - ipred_dc_128_tbl
endjumptable

// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Vertical prediction: loads `width` bytes starting at topleft + 1 once and
// stores that same row into every row of the block. Dispatch on width and the
// two-pointer / four-rows-per-pass store pattern are the same as in
// ipred_dc_128_8bpc_neon above.
function ipred_v_8bpc_neon, export=1
        clz w3, w3
        movrel x5, ipred_v_tbl
        sub w3, w3, #25
        ldrsw x3, [x5, w3, uxtw #2]
        add x2, x2, #1 // x2 = topleft + 1
        add x5, x5, x3
        add x6, x0, x1
        lsl x1, x1, #1
        br x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.s}[0], [x2]
4:
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [x2]
8:
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b}, [x2]
16:
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b}, [x2]
32:
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
64:
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt 64b
        ret
endfunc

// Width handlers of ipred_v_8bpc_neon, ordered 64, 32, 16, 8, 4.
jumptable ipred_v_tbl
        .word 640b - ipred_v_tbl
        .word 320b - ipred_v_tbl
        .word 160b - ipred_v_tbl
        .word 80b - ipred_v_tbl
        .word 40b - ipred_v_tbl
endjumptable

// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Horizontal prediction: every output row is filled with one edge byte read
// from below topleft in memory; row y uses topleft[-(1 + y)].
//
// x2 starts at topleft - 4 and steps by x7 = -4. Each ld4r reads the four
// bytes at x2 and broadcasts each of them into its own register, so v3 holds
// the byte at the highest address. The four rows of a pass are therefore
// stored in the order v3, v2, v1, v0. Dispatch on width uses clz(width) - 25
// as index into ipred_h_tbl.
function ipred_h_8bpc_neon, export=1
        clz w3, w3
        movrel x5, ipred_h_tbl
        sub w3, w3, #25
        ldrsw x3, [x5, w3, uxtw #2]
        sub x2, x2, #4
        add x5, x5, x3
        mov x7, #-4
        add x6, x0, x1
        lsl x1, x1, #1
        br x5
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
        st1 {v3.s}[0], [x0], x1
        st1 {v2.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v1.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
        st1 {v3.8b}, [x0], x1
        st1 {v2.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v1.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        // 16 wide rows: each ld4r broadcasts four edge bytes, one per row;
        // v3 is the byte at the highest address and goes to the first row.
        ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        st1 {v3.16b}, [x0], x1
        st1 {v2.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v1.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        // 32 wide rows: bytes 16-31 are written with str before the
        // post-incrementing st1 writes bytes 0-15 and advances the pointer.
        ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        str q3, [x0, #16]
        str q2, [x6, #16]
        st1 {v3.16b}, [x0], x1
        st1 {v2.16b}, [x6], x1
        subs w4, w4, #4
        str q1, [x0, #16]
        str q0, [x6, #16]
        st1 {v1.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        // 64 wide rows: str covers bytes 16-31, stp bytes 32-63, and the
        // final st1 bytes 0-15 plus the pointer update.
        ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7
        str q3, [x0, #16]
        str q2, [x6, #16]
        stp q3, q3, [x0, #32]
        stp q2, q2, [x6, #32]
        st1 {v3.16b}, [x0], x1
        st1 {v2.16b}, [x6], x1
        subs w4, w4, #4
        str q1, [x0, #16]
        str q0, [x6, #16]
        stp q1, q1, [x0, #32]
        stp q0, q0, [x6, #32]
        st1 {v1.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 64b
        ret
endfunc

// Width handlers of ipred_h_8bpc_neon, ordered 64, 32, 16, 8, 4.
jumptable ipred_h_tbl
        .word 640b - ipred_h_tbl
        .word 320b - ipred_h_tbl
        .word 160b - ipred_h_tbl
        .word 80b - ipred_h_tbl
        .word 40b - ipred_h_tbl
endjumptable

// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `width` bytes that start at
// topleft + 1.
//
// Dispatch: clz(width) - 25 (0 for width 64 ... 4 for width 4) indexes
// ipred_dc_top_tbl. Each handler sums the row with uaddlv, divides with a
// rounding narrowing shift and broadcasts the result before entering the
// store loop. Rows are written through x0 and x6 = dst + stride with the
// doubled stride in x1, four rows per loop pass.
function ipred_dc_top_8bpc_neon, export=1
        clz w3, w3
        movrel x5, ipred_dc_top_tbl
        sub w3, w3, #25
        ldrsw x3, [x5, w3, uxtw #2]
        add x2, x2, #1 // x2 = topleft + 1
        add x5, x5, x3
        add x6, x0, x1
        lsl x1, x1, #1
        br x5
40:
        AARCH64_VALID_JUMP_TARGET
        // The four pixels are loaded twice (ld1r of one 32-bit element), so
        // the 8-byte sum is twice the pixel sum and the shift is 3, not 2.
        ld1r {v0.2s}, [x2]
        uaddlv h0, v0.8b
        rshrn v0.8b, v0.8h, #3
        dup v0.8b, v0.b[0]
4:
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [x2]
        uaddlv h0, v0.8b
        rshrn v0.8b, v0.8h, #3 // (sum + 4) >> 3
        dup v0.8b, v0.b[0]
8:
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b}, [x2]
        uaddlv h0, v0.16b
        rshrn v0.8b, v0.8h, #4 // (sum + 8) >> 4
        dup v0.16b, v0.b[0]
16:
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b}, [x2]
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        add v2.4h, v0.4h, v1.4h
        rshrn v2.8b, v2.8h, #5 // (sum + 16) >> 5
        dup v0.16b, v2.b[0]
        dup v1.16b, v2.b[0]
32:
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        b.gt 32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        uaddlv h2, v2.16b
        uaddlv h3, v3.16b
        add v4.4h, v0.4h, v1.4h
        add v5.4h, v2.4h, v3.4h
        add v4.4h, v4.4h, v5.4h
        rshrn v4.8b, v4.8h, #6 // (sum + 32) >> 6
        dup v0.16b, v4.b[0]
        dup v1.16b, v4.b[0]
        dup v2.16b, v4.b[0]
        dup v3.16b, v4.b[0]
64:
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt 64b
        ret
endfunc

// Width handlers of ipred_dc_top_8bpc_neon, ordered 64, 32, 16, 8, 4.
jumptable ipred_dc_top_tbl
        .word 640b - ipred_dc_top_tbl
        .word 320b - ipred_dc_top_tbl
        .word 160b - ipred_dc_top_tbl
        .word 80b - ipred_dc_top_tbl
        .word 40b - ipred_dc_top_tbl
endjumptable

// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` bytes that start
// at topleft - height.
//
// Two-stage dispatch through one table: x5 is the averaging handler picked by
// height (index clz(height) - 25, entries 0-4) and x3 is the store handler
// picked by width (index clz(width) - 20, entries 5-9). The height handler
// leaves the average broadcast in v0 and continues with "br x3".
function ipred_dc_left_8bpc_neon, export=1
        sub x2, x2, w4, uxtw // x2 = topleft - height
        clz w3, w3
        clz w7, w4
        movrel x5, ipred_dc_left_tbl
        sub w3, w3, #20 // 25 leading bits, minus table offset 5
        sub w7, w7, #25
        ldrsw x3, [x5, w3, uxtw #2]
        ldrsw x7, [x5, w7, uxtw #2]
        add x3, x5, x3
        add x5, x5, x7
        add x6, x0, x1
        lsl x1, x1, #1
        br x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        // Four bytes loaded twice, hence the shift by 3 for a divide by 4.
        ld1r {v0.2s}, [x2]
        uaddlv h0, v0.8b
        rshrn v0.8b, v0.8h, #3
        dup v0.16b, v0.b[0]
        br x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 1b
        ret

L(ipred_dc_left_h8):
AARCH64_VALID_JUMP_TARGET
        // Height-8 averaging handler: sum the 8 bytes at x2, round-divide by
        // 8, broadcast, then continue in the store handler held in x3.
        ld1 {v0.8b}, [x2]
        uaddlv h0, v0.8b
        rshrn v0.8b, v0.8h, #3
        dup v0.16b, v0.b[0]
        br x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 1b
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b}, [x2]
        uaddlv h0, v0.16b
        rshrn v0.8b, v0.8h, #4
        dup v0.16b, v0.b[0]
        br x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
1:
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b}, [x2]
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        add v0.4h, v0.4h, v1.4h
        rshrn v0.8b, v0.8h, #5
        dup v0.16b, v0.b[0]
        br x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov v1.16b, v0.16b
1:
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        b.gt 1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        uaddlv h2, v2.16b
        uaddlv h3, v3.16b
        add v0.4h, v0.4h, v1.4h
        add v2.4h, v2.4h, v3.4h
        add v0.4h, v0.4h, v2.4h
        rshrn v0.8b, v0.8h, #6
        dup v0.16b, v0.b[0]
        br x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov v1.16b, v0.16b
        mov v2.16b, v0.16b
        mov v3.16b, v0.16b
1:
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt 1b
        ret
endfunc

// Entries 0-4: averaging handlers by height (64, 32, 16, 8, 4).
// Entries 5-9: store handlers by width (64, 32, 16, 8, 4).
jumptable ipred_dc_left_tbl
        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h8) - ipred_dc_left_tbl
        .word L(ipred_dc_left_h4) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w8) - ipred_dc_left_tbl
        .word L(ipred_dc_left_w4) - ipred_dc_left_tbl
endjumptable

// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// Fills the block with the rounded average of the `height` bytes starting at
// topleft - height and the `width` bytes starting at topleft + 1.
//
// Two-stage dispatch through ipred_dc_tbl: the height handler (x5, index
// clz(height) - 25) sums the first run into v0, moves x2 to topleft + 1 and
// jumps to the width handler (x3, index clz(width) - 20). That handler adds
// the second run and the rounding bias in v16, then divides in two steps:
//   1. ushl by v17 = -ctz(width + height), i.e. a right shift that removes
//      the power-of-two factor of width + height;
//   2. for width != height, sqdmulh by 0x5556/2 or 0x3334/2. sqdmulh returns
//      the high half of the doubled product, so these multiply by
//      0x5556/65536 (about 1/3) or 0x3334/65536 (about 1/5).
// Rows are stored through x0 and x6 = dst + stride with the doubled stride in
// x1, four rows per loop pass.
function ipred_dc_8bpc_neon, export=1
        sub x2, x2, w4, uxtw // x2 = topleft - height
        add w7, w3, w4 // width + height
        clz w3, w3
        clz w6, w4
        dup v16.8h, w7 // width + height
        movrel x5, ipred_dc_tbl
        rbit w7, w7 // rbit(width + height)
        sub w3, w3, #20 // 25 leading bits, minus table offset 5
        sub w6, w6, #25
        clz w7, w7 // ctz(width + height)
        ldrsw x3, [x5, w3, uxtw #2]
        ldrsw x6, [x5, w6, uxtw #2]
        neg w7, w7 // -ctz(width + height)
        add x3, x5, x3
        add x5, x5, x6
        ushr v16.8h, v16.8h, #1 // (width + height) >> 1
        dup v17.8h, w7 // -ctz(width + height)
        add x6, x0, x1
        lsl x1, x1, #1
        br x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        // Upper lane is zeroed so the 8-byte uaddlv sums only four bytes.
        ld1 {v0.s}[0], [x2], #4
        ins v0.s[1], wzr
        uaddlv h0, v0.8b
        add x2, x2, #1 // skip topleft; x2 = topleft + 1
        br x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v1.s}[0], [x2]
        ins v1.s[1], wzr
        add v0.4h, v0.4h, v16.4h
        uaddlv h1, v1.8b
        cmp w4, #4
        add v0.4h, v0.4h, v1.4h
        ushl v0.4h, v0.4h, v17.4h
        b.eq 1f
        // h = 8/16
        // w16 packs both multipliers; the register shift amount is taken
        // modulo 32, so h = 8 (shift 16) leaves 0x5556/2 in the low half and
        // h = 16 (shift 32 -> 0) leaves 0x3334/2 there. dup uses the low half.
        mov w16, #(0x3334/2)
        movk w16, #(0x5556/2), lsl #16
        add w17, w4, w4 // w17 = 2*h = 16 or 32
        lsr w16, w16, w17
        dup v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup v0.8b, v0.b[0]
2:
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        subs w4, w4, #4
        st1 {v0.s}[0], [x0], x1
        st1 {v0.s}[0], [x6], x1
        b.gt 2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.8b}, [x2], #8
        uaddlv h0, v0.8b
        add x2, x2, #1
        br x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v1.8b}, [x2]
        add v0.4h, v0.4h, v16.4h
        uaddlv h1, v1.8b
        cmp w4, #8
        add v0.4h, v0.4h, v1.4h
        ushl v0.4h, v0.4h, v17.4h
        b.eq 1f
        // h = 4/16/32
        // h = 32 selects 0x3334/2 (8 + 32 = 8 * 5), otherwise 0x5556/2.
        cmp w4, #32
        mov w16, #(0x3334/2)
        mov w17, #(0x5556/2)
        csel w16, w16, w17, eq
        dup v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup v0.8b, v0.b[0]
2:
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x6], x1
        b.gt 2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b}, [x2], #16
        uaddlv h0, v0.16b
        add x2, x2, #1
        br x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v1.16b}, [x2]
        add v0.4h, v0.4h, v16.4h
        uaddlv h1, v1.16b
        cmp w4, #16
        add v0.4h, v0.4h, v1.4h
        ushl v0.4h, v0.4h, v17.4h
        b.eq 1f
        // h = 4/8/32/64
        // The test is zero (eq) for h = 4 and h = 64, which select 0x3334/2
        // (16 + 4 = 4 * 5, 16 + 64 = 16 * 5); h = 8 and h = 32 use 0x5556/2.
        tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
        mov w16, #(0x3334/2)
        mov w17, #(0x5556/2)
        csel w16, w16, w17, eq
        dup v16.4h, w16
        sqdmulh v0.4h, v0.4h, v16.4h
1:
        dup v0.16b, v0.b[0]
2:
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x6], x1
        b.gt 2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b}, [x2], #32
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        add x2, x2, #1
        add v0.4h, v0.4h, v1.4h
        br x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v1.16b, v2.16b}, [x2]
        add v0.4h, v0.4h, v16.4h
        uaddlv h1, v1.16b
        uaddlv h2, v2.16b
        cmp w4, #32
        add v0.4h, v0.4h, v1.4h
        add v0.4h, v0.4h, v2.4h
        ushl v4.4h, v0.4h, v17.4h
        b.eq 1f
        // h = 8/16/64
        // h = 8 selects 0x3334/2 (32 + 8 = 8 * 5), otherwise 0x5556/2.
        cmp w4, #8
        mov w16, #(0x3334/2)
        mov w17, #(0x5556/2)
        csel w16, w16, w17, eq
        dup v16.4h, w16
        sqdmulh v4.4h, v4.4h, v16.4h
1:
        dup v0.16b, v4.b[0]
        dup v1.16b, v4.b[0]
2:
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b}, [x0], x1
        st1 {v0.16b, v1.16b}, [x6], x1
        b.gt 2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        uaddlv h2, v2.16b
        uaddlv h3, v3.16b
        add v0.4h, v0.4h, v1.4h
        add v2.4h, v2.4h, v3.4h
        add x2, x2, #1
        add v0.4h, v0.4h, v2.4h
        br x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
        add v0.4h, v0.4h, v16.4h
        uaddlv h1, v1.16b
        uaddlv h2, v2.16b
        uaddlv h3, v3.16b
        uaddlv h4, v4.16b
        add v1.4h, v1.4h, v2.4h
        add v3.4h, v3.4h, v4.4h
        cmp w4, #64
        add v0.4h, v0.4h, v1.4h
        add v0.4h, v0.4h, v3.4h
        ushl v4.4h, v0.4h, v17.4h
        b.eq 1f
        // h = 16/32
        // Packed multipliers again: shifting by h = 16 brings 0x3334/2 into
        // the low half; h = 32 shifts by 0 (amount modulo 32) and keeps
        // 0x5556/2 there.
        mov w16, #(0x5556/2)
        movk w16, #(0x3334/2), lsl #16
        lsr w16, w16, w4
        dup v16.4h, w16
        sqdmulh v4.4h, v4.4h, v16.4h
1:
        dup v0.16b, v4.b[0]
        dup v1.16b, v4.b[0]
        dup v2.16b, v4.b[0]
        dup v3.16b, v4.b[0]
2:
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs w4, w4, #4
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt 2b
        ret
endfunc

// Entries 0-4: first-stage handlers by height (64, 32, 16, 8, 4).
// Entries 5-9: second-stage handlers by width (64, 32, 16, 8, 4).
jumptable ipred_dc_tbl
        .word L(ipred_dc_h64) - ipred_dc_tbl
        .word L(ipred_dc_h32) - ipred_dc_tbl
        .word L(ipred_dc_h16) - ipred_dc_tbl
        .word L(ipred_dc_h8) - ipred_dc_tbl
        .word L(ipred_dc_h4) - ipred_dc_tbl
        .word L(ipred_dc_w64) - ipred_dc_tbl
        .word L(ipred_dc_w32) - ipred_dc_tbl
        .word L(ipred_dc_w16) - ipred_dc_tbl
        .word L(ipred_dc_w8) - ipred_dc_tbl
        .word L(ipred_dc_w4) - ipred_dc_tbl
endjumptable

// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                            const pixel *const topleft,
//                            const int width, const int height, const int a,
//                            const int max_width, const int max_height);
function ipred_paeth_8bpc_neon, export=1
        clz w9, w3
        movrel x5, ipred_paeth_tbl
        sub w9, w9, #25
        ldrsw x9, [x5, w9, uxtw #2]
        ld1r {v4.16b}, [x2]
        add x8, x2, #1
        sub x2, x2, #4
        add x5, x5, x9
        mov x7, #-4
        add x6, x0, x1
        lsl x1, x1, #1
        br x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r {v5.4s}, [x8]
        usubl v6.8h, v5.8b, v4.8b // top - topleft
4:
        ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
        zip1 v0.2s, v0.2s, v1.2s
        zip1 v2.2s, v2.2s, v3.2s
        uaddw v16.8h, v6.8h, v0.8b
        uaddw v17.8h, v6.8h, v2.8b
        sqxtun v16.8b, v16.8h // base
        sqxtun2 v16.16b, v17.8h
        zip1 v0.2d, v0.2d, v2.2d
        uabd v20.16b, v5.16b, v16.16b // tdiff
        uabd v22.16b, v4.16b, v16.16b // tldiff
        uabd v16.16b, v0.16b, v16.16b // ldiff
        umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
        cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff
        cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
        bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
        bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
st1 {v20.s}[3], [x0], x1 st1 {v20.s}[2], [x6], x1 subs w4, w4, #4 st1 {v20.s}[1], [x0], x1 st1 {v20.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v1.8b uaddw v18.8h, v6.8h, v2.8b uaddw v19.8h, v6.8h, v3.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h zip1 v2.2d, v2.2d, v3.2d zip1 v0.2d, v0.2d, v1.2d uabd v21.16b, v5.16b, v18.16b // tdiff uabd v20.16b, v5.16b, v16.16b uabd v23.16b, v4.16b, v18.16b // tldiff uabd v22.16b, v4.16b, v16.16b uabd v17.16b, v2.16b, v18.16b // ldiff uabd v16.16b, v0.16b, v16.16b umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) umin v18.16b, v20.16b, v22.16b cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff cmhs v20.16b, v22.16b, v20.16b cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff cmhs v16.16b, v18.16b, v16.16b bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.16b}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 2: usubl v6.8h, v5.8b, v4.8b // top - topleft usubl2 v7.8h, v5.16b, v4.16b uaddw v24.8h, v6.8h, v0.8b uaddw v25.8h, v7.8h, v0.8b uaddw v26.8h, v6.8h, v1.8b uaddw v27.8h, v7.8h, v1.8b uaddw v28.8h, v6.8h, v2.8b uaddw v29.8h, v7.8h, v2.8b uaddw v30.8h, v6.8h, v3.8b uaddw v31.8h, v7.8h, v3.8b sqxtun v17.8b, v26.8h // base sqxtun2 v17.16b, v27.8h sqxtun v16.8b, v24.8h sqxtun2 v16.16b, v25.8h sqxtun v19.8b, v30.8h sqxtun2 v19.16b, v31.8h sqxtun v18.8b, v28.8h sqxtun2 v18.16b, v29.8h uabd v23.16b, v5.16b, v19.16b // tdiff uabd v22.16b, v5.16b, v18.16b uabd v21.16b, v5.16b, v17.16b uabd v20.16b, v5.16b, v16.16b uabd v27.16b, v4.16b, v19.16b // tldiff uabd v26.16b, v4.16b, v18.16b uabd v25.16b, v4.16b, v17.16b uabd v24.16b, v4.16b, v16.16b uabd v19.16b, v3.16b, v19.16b // ldiff uabd v18.16b, v2.16b, v18.16b uabd v17.16b, v1.16b, v17.16b uabd v16.16b, v0.16b, v16.16b umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) umin v30.16b, v22.16b, v26.16b umin v29.16b, v21.16b, v25.16b umin v28.16b, v20.16b, v24.16b cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff cmhs v22.16b, v26.16b, v22.16b cmhs v21.16b, v25.16b, v21.16b cmhs v20.16b, v24.16b, v20.16b cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff cmhs v18.16b, v30.16b, v18.16b cmhs v17.16b, v29.16b, v17.16b cmhs v16.16b, v28.16b, v16.16b bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b subs w3, w3, #16 st1 {v23.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v21.16b}, [x5], #16 st1 {v20.16b}, [x10], #16 b.le 8f ld1 {v5.16b}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.16b}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_paeth_tbl .word 640b - ipred_paeth_tbl .word 320b - ipred_paeth_tbl .word 160b - ipred_paeth_tbl .word 80b - ipred_paeth_tbl .word 40b - ipred_paeth_tbl endjumptable // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_8bpc_neon, export=1 mul w13, w3, w4 // n_pel = width * height clz w6, w3 mov x11, #0x0201 // ramp = bytes 01 02 add w12, w3, #1 movi v28.8h, #128 // rounding constant for addhn movrel x14, X(sm_weights) sub w6, w6, #24 // to128 = clz(width) - 24, turns 1,2,3,... 
into 128 / width-scaled coefficients movk x11, #0x0403, lsl #16 // ramp = bytes 01 02 03 04 mov w10, #16 cmp w13, #512 cset w9, gt movk x11, #0x0605, lsl #32 // ramp = bytes 01 02 03 04 05 06 add x12, x12, x2 cmp w13, #63 movk x11, #0x0807, lsl #48 // ramp = bytes 01 02 03 04 05 06 07 08 cinc w9, w9, gt // scale = (n_pel >= 64) + (n_pel > 512) ld1r {v2.16b}, [x12] // broadcast right = topleft[width + 1] add x14, x14, x9, lsl #6 // weights = sm_weights[scale] lsl x11, x11, x6 // scale ramp bytes into 128 / width units lsl w10, w10, w6 // 16 << to128, for advancing 16 pixels, width >= 16 only cmp w3, #8 b.gt 160f b.eq 80f 40: // width = 4 dup v18.2s, w11 movi v29.8b, #1 movi v27.16b, #2 ld1r {v31.2s}, [x14] // broadcast 4 horizontal weights add x6, x0, x1 // x6 = dst + stride (row 1 when storing 2 rows interleaved) lsl x1, x1, #1 // x1 = 2 * stride, store rows 0,2 via x0 and 1,3 via x6 add v27.16b, v27.16b, v29.16b usubw v0.8h, v28.8h, v18.8b // coeff = 128 - [1..4] * 128 / width uzp1 v29.4s, v27.4s, v29.4s // table indices to pack 4 scalar left values add v0.8h, v0.8h, v0.8h // coeff scaled up for addhn rounding ushll v31.8h, v31.8b, #2 // weights[x] * 4, so addhn(..., 128) gives ((x * weight + 32) >> 6) 4: // process 4 rows ldr s4, [x2, #-4]! 
// load left samples topleft[-1,-2,-3,-4] backwards tbl v4.16b, {v4.16b}, v29.16b // expand 4 left bytes into 4 packed rows usubl v5.8h, v4.8b, v2.8b usubl2 v6.8h, v4.16b, v2.16b // diff = left - right for 4 rows mul v16.8h, v0.8h, v5.8h mul v18.8h, v0.8h, v6.8h // mul = diff * coeff addhn v16.8b, v16.8h, v28.8h addhn2 v16.16b, v18.8h, v28.8h // ((diff * coeff + 128) >> 8) add v16.16b, v16.16b, v2.16b // pred = right + ((diff * coeff + 128) >> 8) usubl v24.8h, v4.8b, v16.8b usubl2 v25.8h, v4.16b, v16.16b // left - pred subs w4, w4, #4 mul v24.8h, v24.8h, v31.8h mul v25.8h, v25.8h, v31.8h addhn v24.8b, v24.8h, v28.8h addhn2 v24.16b, v25.8h, v28.8h // ((left - pred) * weight + 32) >> 6 add v24.16b, v24.16b, v16.16b // dst = pred + ((left - pred) * weight + 32) >> 6 st1 {v24.s}[0], [x0], x1 st1 {v24.s}[1], [x6], x1 st1 {v24.s}[2], [x0], x1 st1 {v24.s}[3], [x6], x1 b.gt 4b ret 80: // width = 8 fmov d18, x11 ldr d31, [x14] // load 8 horizontal weights add x6, x0, x1 lsl x1, x1, #1 usubw v0.8h, v28.8h, v18.8b // coeff = 128 - [1..8] * 128 / width movi v29.8b, #1 movi v27.16b, #2 add v0.8h, v0.8h, v0.8h add v27.16b, v27.16b, v29.16b ushll v31.8h, v31.8b, #2 // weights[x] * 4 for rounded >> 6 via addhn 8: // process 4 rows ldr s4, [x2, #-4]! 
usubl v3.8h, v4.8b, v2.8b // diff = left - right for 4 rows mul v16.8h, v0.8h, v3.h[3] mul v18.8h, v0.8h, v3.h[1] mul v17.8h, v0.8h, v3.h[2] mul v19.8h, v0.8h, v3.h[0] // mul = diff * coeff addhn v16.8b, v16.8h, v28.8h addhn v18.8b, v18.8h, v28.8h addhn2 v16.16b, v17.8h, v28.8h addhn2 v18.16b, v19.8h, v28.8h // preds for rows 3,2 and 1,0 tbl v7.16b, {v4.16b}, v27.16b tbl v6.16b, {v4.16b}, v29.16b // duplicate left for two rows add v16.16b, v16.16b, v2.16b // pred rows 3,2 add v18.16b, v18.16b, v2.16b // pred rows 1,0 usubl v20.8h, v7.8b, v16.8b usubl2 v21.8h, v7.16b, v16.16b usubl v22.8h, v6.8b, v18.8b usubl2 v23.8h, v6.16b, v18.16b // left - pred for each row subs w4, w4, #4 mul v20.8h, v20.8h, v31.8h mul v22.8h, v22.8h, v31.8h mul v21.8h, v21.8h, v31.8h mul v23.8h, v23.8h, v31.8h addhn v20.8b, v20.8h, v28.8h addhn v22.8b, v22.8h, v28.8h addhn2 v20.16b, v21.8h, v28.8h addhn2 v22.16b, v23.8h, v28.8h // weighted correction toward left add v20.16b, v20.16b, v16.16b // final rows 3,2 add v22.16b, v22.16b, v18.16b // final rows 1,0 st1 {v20.8b}, [x0], x1 st1 {v20.d}[1], [x6], x1 st1 {v22.8b}, [x0], x1 st1 {v22.d}[1], [x6], x1 b.gt 8b ret 160: // width >= 16 fmov d18, x11 ldr q31, [x14], #16 // load 16 horizontal weights for this 16-pixel block dup v29.8h, w10 // step corresponding to 16 pixels in the horizontal ramp mov x12, x2 // reset left-pointer for this strip usubw v0.8h, v28.8h, v18.8b // coeffs for x = 0..7 : 128 - [1..8] * 128 / width mov w7, w4 // remaining rows add v0.8h, v0.8h, v0.8h lsl x11, x1, #1 // 2 * stride for paired-row stores sub v1.8h, v0.8h, v29.8h // coeffs for x = 8..15 : (v0 - 16) * (128 / width) ushll v30.8h, v31.8b, #2 // weights[0..7] * 4 mov x5, x0 // row 0,2 store pointer ushll2 v31.8h, v31.16b, #2 // weights[8..15] * 4, last 2 weights are zeros! add x6, x0, x1 // row 1,3 store pointer 16: // process 4 rows for the 1st 16-pixel stripe ldr s4, [x12, #-4]! 
usubl v3.8h, v4.8b, v2.8b // diff = left - right for 4 rows ld4r {v4.16b, v5.16b, v6.16b, v7.16b}, [x12] // broadcast left samples with load // Build pred for columns 0..15 of row 3,2. mul v16.8h, v0.8h, v3.h[3] mul v17.8h, v0.8h, v3.h[2] mul v20.8h, v1.8h, v3.h[3] mul v21.8h, v1.8h, v3.h[2] addhn v16.8b, v16.8h, v28.8h addhn v17.8b, v17.8h, v28.8h addhn2 v16.16b, v20.8h, v28.8h addhn2 v17.16b, v21.8h, v28.8h // Same for row 1,0. mul v18.8h, v0.8h, v3.h[1] mul v19.8h, v0.8h, v3.h[0] mul v22.8h, v1.8h, v3.h[1] mul v23.8h, v1.8h, v3.h[0] addhn v18.8b, v18.8h, v28.8h addhn v19.8b, v19.8h, v28.8h addhn2 v18.16b, v22.8h, v28.8h addhn2 v19.16b, v23.8h, v28.8h add v16.16b, v16.16b, v2.16b // pred row 3 add v17.16b, v17.16b, v2.16b // pred row 2 add v18.16b, v18.16b, v2.16b // pred row 1 add v19.16b, v19.16b, v2.16b // pred row 0 usubl v20.8h, v7.8b, v16.8b usubl v21.8h, v6.8b, v17.8b usubl v22.8h, v5.8b, v18.8b usubl v23.8h, v4.8b, v19.8b usubl2 v24.8h, v7.16b, v16.16b usubl2 v25.8h, v6.16b, v17.16b usubl2 v26.8h, v5.16b, v18.16b usubl2 v27.8h, v4.16b, v19.16b // left - pred for rows 3,2,1,0 subs w7, w7, #4 mul v20.8h, v20.8h, v30.8h mul v21.8h, v21.8h, v30.8h mul v24.8h, v24.8h, v31.8h mul v25.8h, v25.8h, v31.8h addhn v20.8b, v20.8h, v28.8h addhn v21.8b, v21.8h, v28.8h addhn2 v20.16b, v24.8h, v28.8h addhn2 v21.16b, v25.8h, v28.8h mul v22.8h, v22.8h, v30.8h mul v23.8h, v23.8h, v30.8h mul v26.8h, v26.8h, v31.8h mul v27.8h, v27.8h, v31.8h addhn v22.8b, v22.8h, v28.8h addhn v23.8b, v23.8h, v28.8h addhn2 v22.16b, v26.8h, v28.8h addhn2 v23.16b, v27.8h, v28.8h // weighted correction over 16 columns add v20.16b, v20.16b, v16.16b // final row 3 add v21.16b, v21.16b, v17.16b // final row 2 st1 {v20.16b}, [x5], x11 add v22.16b, v22.16b, v18.16b // final row 1 st1 {v21.16b}, [x6], x11 add v23.16b, v23.16b, v19.16b // final row 0 st1 {v22.16b}, [x5], x11 st1 {v23.16b}, [x6], x11 b.gt 16b subs w3, w3, #16 // check for additional stripes b.le 1f // At this point the horizontal weights 
are zero, continue with a simpler loop. add v29.8h, v29.8h, v29.8h // advance amount for next 16-pixel block (32 * (128 / width)) 161: mov x12, x2 // reset left-pointer for this block add x0, x0, #16 // advance dst by 16 pixels sub v0.8h, v0.8h, v29.8h // advance horizontal ramp by 16 pixels mov w7, w4 // remaining rows sub v1.8h, v1.8h, v29.8h // advance horizontal ramp by 16 pixels mov x5, x0 // row 0,2 store pointer add x6, x0, x1 // row 1,3 store pointer 32: // process 4 rows for current 16-pixel stripe ldr s4, [x12, #-4]! usubl v3.8h, v4.8b, v2.8b // diff = left - right for 4 rows // Build pred for columns 0..15 of row 3,2. mul v16.8h, v0.8h, v3.h[3] mul v17.8h, v0.8h, v3.h[2] mul v20.8h, v1.8h, v3.h[3] mul v21.8h, v1.8h, v3.h[2] addhn v16.8b, v16.8h, v28.8h addhn v17.8b, v17.8h, v28.8h addhn2 v16.16b, v20.8h, v28.8h addhn2 v17.16b, v21.8h, v28.8h // Same for row 1,0. mul v18.8h, v0.8h, v3.h[1] mul v19.8h, v0.8h, v3.h[0] mul v22.8h, v1.8h, v3.h[1] mul v23.8h, v1.8h, v3.h[0] subs w7, w7, #4 addhn v18.8b, v18.8h, v28.8h addhn v19.8b, v19.8h, v28.8h addhn2 v18.16b, v22.8h, v28.8h addhn2 v19.16b, v23.8h, v28.8h add v16.16b, v16.16b, v2.16b // pred row 3 add v17.16b, v17.16b, v2.16b // pred row 2 add v18.16b, v18.16b, v2.16b // pred row 1 add v19.16b, v19.16b, v2.16b // pred row 0 st1 {v16.16b}, [x5], x11 st1 {v17.16b}, [x6], x11 st1 {v18.16b}, [x5], x11 st1 {v19.16b}, [x6], x11 b.gt 32b sub w3, w3, #16 // next 16-column stripe cbnz w3, 161b 1: ret endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_8bpc_neon, export=1 clz w6, w4 mov w11, #0x0201 // ramp = bytes 01 02 mul w13, w3, w4 // n_pel = width * height add w12, w4, #1 movi v7.8h, #128 // rounding constant for addhn sub w6, w6, #24 // to128 = clz(height) - 24 = converts row 1,2,3,... 
into 128 / height units mov w10, #8 movk w11, #0x0403, lsl #16 // ramp = bytes 01 02 03 04 sub x12, x2, x12 // &topleft[-(height + 1)] lsl w11, w11, w6 // [1..4] scaled into 128 / height units movrel x14, X(sm_weights) fmov s1, w11 ld1r {v2.16b}, [x12] // broadcast bottom = topleft[-(height + 1)] cmp w13, #512 cset w9, gt lsl w10, w10, w6 // 8 << to128 = for advancing 4 rows cmp w13, #63 usubw v0.8h, v7.8h, v1.8b // coeff = 128 - [1..4] * 128 / height, vertical coeffs for the first 4 rows cinc w9, w9, gt // scale = (n_pel >= 64) + (n_pel > 512) dup v5.8h, w10 // decrement amount for moving to the next 4 rows add v0.8h, v0.8h, v0.8h // coeff scaled up for addhn rounding add x2, x2, #1 add x14, x14, x9, lsl #6 // weights = sm_weights[scale] cmp w3, #8 b.gt 160f b.eq 80f 40: // width = 4 ld1r {v3.4s}, [x2] // load and replicate top 4 samples topleft[1..4] ("above" values) zip1 v22.8h, v0.8h, v0.8h // duplicate row coeffs: [r0,r0,r1,r1,r2,r2,r3,r3] mov v18.16b, v7.16b mov v19.16b, v7.16b usubl v4.8h, v3.8b, v2.8b // above - bottom (same for all rows) zip1 v6.4s, v22.4s, v22.4s // row coeffs for rows 0,1 packed as 32-bit chunks zip2 v22.4s, v22.4s, v22.4s // row coeffs for rows 2,3 packed as 32-bit chunks add x6, x0, x1 // x6 = dst + stride (row 1 when storing 2 rows interleaved) lsl x1, x1, #1 // x1 = 2 * stride, store rows 0,2 via x0 and 1,3 via x6 mla v18.8h, v4.8h, v6.8h mla v19.8h, v4.8h, v22.8h // 1st pred-bottom term = ((above - bottom) * off + rnd mul v5.8h, v4.8h, v5.8h // decrement to compute next pred-bottom term b 42f // skip the pred-bottom term update for 1st iteration 41: // process 4 rows - only 1 or 2 iterations: scale(w9) can only be 0 or 1! 
sub v18.8h, v18.8h, v5.8h sub v19.8h, v19.8h, v5.8h // next pred-bottom term, only from 2nd iteration 42: ldr s1, [x14], #4 // load weights[y..y+3], one byte per row uzp2 v16.16b, v18.16b, v19.16b // packed pred-bottom term for 4 rows, ((above - bottom) * off + rnd) >> log2(height) ushll v1.8h, v1.8b, #2 // weights[y..y+3] * 4 for rounded >> 6 via addhn add v16.16b, v16.16b, v2.16b // pred = bottom + interpolated(above - bottom) zip1 v23.8h, v1.8h, v1.8h // duplicate each row weight to match packed lanes zip1 v1.4s, v23.4s, v23.4s // row weights for rows 0,1 packed as 32-bit chunks zip2 v23.4s, v23.4s, v23.4s // row weights for rows 2,3 packed as 32-bit chunks usubl v20.8h, v3.8b, v16.8b usubl2 v21.8h, v3.16b, v16.16b // above - pred subs w9, w9, #1 // limited iterations when vertical weights are non-zero mul v20.8h, v20.8h, v1.8h mul v21.8h, v21.8h, v23.8h addhn v20.8b, v20.8h, v7.8h addhn2 v20.16b, v21.8h, v7.8h // ((above - pred) * w_ver + 32) >> 6 sub w4, w4, #4 add v20.16b, v20.16b, v16.16b // dst = pred + correction toward above st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 st1 {v20.s}[2], [x0], x1 st1 {v20.s}[3], [x6], x1 b.ge 41b cbz w4, 43f 44: // process last rows, 4 per iteration where the vertical weights are zero sub v18.8h, v18.8h, v5.8h sub v19.8h, v19.8h, v5.8h // next pred-bottom term uzp2 v20.16b, v18.16b, v19.16b // packed pred-bottom term for 4 rows, ((above - bottom) * off + rnd) >> log2(height) add v20.16b, v20.16b, v2.16b // pred = bottom + interpolated(above - bottom) subs w4, w4, #4 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 st1 {v20.s}[2], [x0], x1 st1 {v20.s}[3], [x6], x1 b.gt 44b 43: ret 80: // width = 8 ld1r {v3.2d}, [x2] // load and replicate the top 8 samples topleft[1..8] add x6, x0, x1 lsl x1, x1, #1 usubl v4.8h, v3.8b, v2.8b // above - bottom, 8 columns mov v16.16b, v7.16b mov v17.16b, v7.16b mov v18.16b, v7.16b mov v19.16b, v7.16b mla v16.8h, v4.8h, v0.h[0] mla v17.8h, v4.8h, v0.h[1] mla v18.8h, v4.8h, v0.h[2] mla 
v19.8h, v4.8h, v0.h[3] // 1st pred-bottom term = ((above - bottom) * off + rnd mul v5.8h, v4.8h, v5.8h // decrement to compute next pred-bottom term b 82f 81: // process 4 rows - only 1 or 2 iterations: scale(w9) can only be 0 or 1! sub v16.8h, v16.8h, v5.8h sub v17.8h, v17.8h, v5.8h sub v18.8h, v18.8h, v5.8h sub v19.8h, v19.8h, v5.8h // next pred-bottom term, only from 2nd iteration 82: ldr s1, [x14], #4 // weights for these 4 rows uzp2 v25.16b, v16.16b, v17.16b // packed pred-bottom term for rows 0,1 uzp2 v26.16b, v18.16b, v19.16b // packed pred-bottom term for rows 2,3 ushll v1.8h, v1.8b, #2 // weights * 4 for rounded >> 6 via addhn add v25.16b, v25.16b, v2.16b // pred rows 0,1 add v26.16b, v26.16b, v2.16b // pred rows 2,3 usubl v20.8h, v3.8b, v25.8b usubl2 v21.8h, v3.16b, v25.16b usubl v22.8h, v3.8b, v26.8b usubl2 v23.8h, v3.16b, v26.16b // above - pred for each row pair mul v20.8h, v20.8h, v1.h[0] mul v22.8h, v22.8h, v1.h[2] mul v21.8h, v21.8h, v1.h[1] mul v23.8h, v23.8h, v1.h[3] // (above - pred) * w_ver subs w9, w9, #1 // limited iterations when vertical weights are non-zero addhn v20.8b, v20.8h, v7.8h addhn v22.8b, v22.8h, v7.8h addhn2 v20.16b, v21.8h, v7.8h addhn2 v22.16b, v23.8h, v7.8h // ((above - pred) * w_ver + 32) >> 6 sub w4, w4, #4 add v20.16b, v20.16b, v25.16b // final rows 0,1 add v22.16b, v22.16b, v26.16b // final rows 2,3 st1 {v20.8b}, [x0], x1 st1 {v20.d}[1], [x6], x1 st1 {v22.8b}, [x0], x1 st1 {v22.d}[1], [x6], x1 b.ge 81b cbz w4, 83f 84: // process last rows, 4 per iteration where the vertical weights are zero sub v16.8h, v16.8h, v5.8h sub v17.8h, v17.8h, v5.8h sub v18.8h, v18.8h, v5.8h sub v19.8h, v19.8h, v5.8h // next pred-bottom term uzp2 v20.16b, v16.16b, v17.16b // packed pred-bottom term for rows 0,1 uzp2 v22.16b, v18.16b, v19.16b // packed pred-bottom term for rows 2,3 add v20.16b, v20.16b, v2.16b // pred rows 0,1 add v22.16b, v22.16b, v2.16b // pred rows 2,3 subs w4, w4, #4 st1 {v20.8b}, [x0], x1 st1 {v20.d}[1], [x6], x1 st1 {v22.8b}, 
[x0], x1 st1 {v22.d}[1], [x6], x1 b.gt 84b 83: ret 160: // width >= 16 stp d8, d9, [sp, #-0x20]! stp d10, d11, [sp, #0x10] lsl x11, x1, #1 // 2 * stride for paired-row stores 165: ldr q3, [x2], #16 // load 16 top samples for this 16-pixel stripe mov x7, x14 // restart row-weight pointer for this stripe lsl w10, w9, #1 // maximum 1st loop iterations: scale(w9) * 2 mov w8, w4 // row counter for this stripe mov x5, x0 add x6, x0, x1 cmp w4, #4 csel w10, w10, w9, ne // limit 1st loop iterations to 1 for Nx4 case usubl v4.8h, v3.8b, v2.8b // above[0..7] - bottom usubl2 v6.8h, v3.16b, v2.16b // above[8..15] - bottom mov v8.16b, v7.16b mov v9.16b, v7.16b mla v8.8h, v4.8h, v0.h[0] mla v9.8h, v6.8h, v0.h[0] // 1st pred-bottom for row 0 = ((above - bottom) * off + rnd mov v10.16b, v7.16b mov v11.16b, v7.16b mla v10.8h, v4.8h, v0.h[1] mla v11.8h, v6.8h, v0.h[1] // 1st pred-bottom for row 1 mov v28.16b, v7.16b mov v29.16b, v7.16b mla v28.8h, v4.8h, v0.h[2] mla v29.8h, v6.8h, v0.h[2] // 1st pred-bottom for row 2 mov v30.16b, v7.16b mov v31.16b, v7.16b mla v30.8h, v4.8h, v0.h[3] mla v31.8h, v6.8h, v0.h[3] // 1st pred-bottom for row 3 mul v4.8h, v4.8h, v5.8h mul v6.8h, v6.8h, v5.8h // decrement to compute next pred-bottom term b 162f 161: // process 4 rows for the current 16-pixel stripe sub v8.8h, v8.8h, v4.8h sub v9.8h, v9.8h, v6.8h sub v10.8h, v10.8h, v4.8h sub v11.8h, v11.8h, v6.8h sub v28.8h, v28.8h, v4.8h sub v29.8h, v29.8h, v6.8h sub v30.8h, v30.8h, v4.8h sub v31.8h, v31.8h, v6.8h // next pred-bottom term, only from 2nd iteration 162: ldr s1, [x7], #4 // weights[y..y+3] uzp2 v16.16b, v8.16b, v9.16b // pred-bottom for rows 0 uzp2 v17.16b, v10.16b, v11.16b // pred-bottom for rows 1 uzp2 v18.16b, v28.16b, v29.16b // pred-bottom for rows 2 uzp2 v19.16b, v30.16b, v31.16b // pred-bottom for rows 3 ushll v1.8h, v1.8b, #2 // weights * 4 add v16.16b, v16.16b, v2.16b // pred row 0 add v17.16b, v17.16b, v2.16b // pred row 1 add v18.16b, v18.16b, v2.16b // pred row 2 add v19.16b, 
v19.16b, v2.16b // pred row 3 // Second stage: move each pred toward the top-row sample using weights[y]. usubl v20.8h, v3.8b, v16.8b usubl v21.8h, v3.8b, v17.8b usubl v22.8h, v3.8b, v18.8b usubl v23.8h, v3.8b, v19.8b usubl2 v24.8h, v3.16b, v16.16b usubl2 v25.8h, v3.16b, v17.16b usubl2 v26.8h, v3.16b, v18.16b usubl2 v27.8h, v3.16b, v19.16b // above - pred for rows 0..3 mul v20.8h, v20.8h, v1.h[0] mul v21.8h, v21.8h, v1.h[1] mul v24.8h, v24.8h, v1.h[0] mul v25.8h, v25.8h, v1.h[1] subs w10, w10, #1 // limited iterations when vertical weights are non-zero addhn v20.8b, v20.8h, v7.8h addhn v21.8b, v21.8h, v7.8h addhn2 v20.16b, v24.8h, v7.8h addhn2 v21.16b, v25.8h, v7.8h // correction for rows 0,1 mul v22.8h, v22.8h, v1.h[2] mul v23.8h, v23.8h, v1.h[3] mul v26.8h, v26.8h, v1.h[2] mul v27.8h, v27.8h, v1.h[3] addhn v22.8b, v22.8h, v7.8h addhn v23.8b, v23.8h, v7.8h addhn2 v22.16b, v26.8h, v7.8h addhn2 v23.16b, v27.8h, v7.8h // correction for rows 2,3 sub w8, w8, #4 add v20.16b, v20.16b, v16.16b // final row 0 add v21.16b, v21.16b, v17.16b // final row 1 st1 {v20.16b}, [x5], x11 add v22.16b, v22.16b, v18.16b // final row 2 st1 {v21.16b}, [x6], x11 add v23.16b, v23.16b, v19.16b // final row 3 st1 {v22.16b}, [x5], x11 st1 {v23.16b}, [x6], x11 b.gt 161b cbz w8, 163f 164: // process last rows, 4 per iteration for the current 16-pixel stripe sub v8.8h, v8.8h, v4.8h sub v9.8h, v9.8h, v6.8h sub v10.8h, v10.8h, v4.8h sub v11.8h, v11.8h, v6.8h sub v28.8h, v28.8h, v4.8h sub v29.8h, v29.8h, v6.8h sub v30.8h, v30.8h, v4.8h sub v31.8h, v31.8h, v6.8h // next pred-bottom term uzp2 v16.16b, v8.16b, v9.16b // pred-bottom for rows 0 uzp2 v17.16b, v10.16b, v11.16b // pred-bottom for rows 1 uzp2 v18.16b, v28.16b, v29.16b // pred-bottom for rows 2 uzp2 v19.16b, v30.16b, v31.16b // pred-bottom for rows 3 subs w8, w8, #4 add v16.16b, v16.16b, v2.16b // pred row 0 add v17.16b, v17.16b, v2.16b // pred row 1 add v18.16b, v18.16b, v2.16b // pred row 2 add v19.16b, v19.16b, v2.16b // pred row 3 st1 
{v16.16b}, [x5], x11 st1 {v17.16b}, [x6], x11 st1 {v18.16b}, [x5], x11 st1 {v19.16b}, [x6], x11 b.gt 164b 163: sub w3, w3, #16 // next 16-pixel stripe add x0, x0, #16 // advance dst base by 16 columns cbnz w3, 165b ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], #0x20 ret endfunc #if 0 // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_8bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 movrel x5, ipred_smooth_tbl sub x12, x2, w4, uxtw sub w9, w9, #25 ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[3] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h uhadd v20.8h, v20.8h, v22.8h uhadd v21.8h, v21.8h, v23.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, 
[x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[7] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw sub x2, x2, #2 mov x7, #-2 ld1r {v5.16b}, [x12] // right sub x1, x1, w3, uxtw mov w9, w3 1: ld2r {v0.8b, v1.8b}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v3.16b}, [x8], #16 // top shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla 
v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor
        // NOTE: everything down to the #endif below is the tail of
        // ipred_smooth_8bpc_neon, which is compiled out by "#if 0".
        mla             v21.8h, v1.8h, v7.8h // (left flipped)
        mla             v22.8h, v0.8h, v6.8h
        mla             v23.8h, v0.8h, v7.8h
        shll            v24.8h, v4.8b, #8 // bottom*256
        shll            v25.8h, v4.8b, #8
        shll            v26.8h, v4.8b, #8
        shll            v27.8h, v4.8b, #8
        mla             v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h, v3.8h, v16.8h
        mla             v26.8h, v2.8h, v17.8h
        mla             v27.8h, v3.8h, v17.8h
        // Average the horizontal and vertical terms, then round and drop
        // the 8 fractional bits.
        uhadd           v20.8h, v20.8h, v24.8h
        uhadd           v21.8h, v21.8h, v25.8h
        uhadd           v22.8h, v22.8h, v26.8h
        uhadd           v23.8h, v23.8h, v27.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn2          v20.16b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn2          v22.16b, v23.8h, #8
        subs            w3, w3, #16
        st1             {v20.16b}, [x0], #16
        st1             {v22.16b}, [x6], #16
        b.gt            2b
        subs            w4, w4, #2
        b.le            9f
        // Rewind the top and weights_hor pointers by the saved width (w9)
        // and move both dst pointers on to the next pair of rows.
        sub             x8, x8, w9, uxtw
        sub             x10, x10, w9, uxtw
        add             x0, x0, x1
        add             x6, x6, x1
        mov             w3, w9
        b               1b
9:
        ret
endfunc

jumptable ipred_smooth_tbl
        .word 640b - ipred_smooth_tbl
        .word 320b - ipred_smooth_tbl
        .word 160b - ipred_smooth_tbl
        .word 80b - ipred_smooth_tbl
        .word 40b - ipred_smooth_tbl
endjumptable
#endif

// 32 zero bytes immediately followed by 32 0xff bytes. A 16-byte load from
// (padding_mask - n) yields 0x00 in lanes [0, n) and 0xff in lanes [n, 16).
// The edge functions below feed that mask to `bit` to overwrite every lane
// at index >= n with a padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
//                                       const pixel *const in, const int end);
//
// Doubles the density of an edge. With in[j] taken as in[end] for every
// j >= end (and for anything past the 16 bytes loaded), for i = 0..15:
//   out[2*i]     = in[i+1]
//   out[2*i + 1] = sat_u8((9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4)
// Always loads 16 bytes from in and stores 32 bytes to out; hsz (w1) is not
// read by this function.
function ipred_z1_upsample_edge_8bpc_neon, export=1
        movrel          x4, padding_mask
        ld1             {v0.16b}, [x2] // in[]
        add             x5, x2, w3, uxtw // in[end]
        sub             x4, x4, w3, uxtw
        ld1r            {v1.16b}, [x5] // padding
        ld1             {v3.16b}, [x4] // padding_mask
        movi            v31.8h, #9
        bit             v0.16b, v1.16b, v3.16b // padded in[]
        // Shifted views of the padded input; lanes shifted in come from the
        // padding vector v1.
        ext             v4.16b, v0.16b, v1.16b, #1
        ext             v5.16b, v0.16b, v1.16b, #2
        ext             v6.16b, v0.16b, v1.16b, #3
        uaddl           v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2]
        uaddl2          v17.8h, v4.16b, v5.16b
        uaddl           v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3]
        uaddl2          v19.8h, v0.16b, v6.16b
        mul             v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
        mul             v17.8h, v17.8h, v31.8h
        sub             v16.8h, v16.8h, v18.8h
        sub             v17.8h, v17.8h, v19.8h
        // Rounding shift by 4 with saturation to the unsigned 8-bit range.
        sqrshrun        v16.8b, v16.8h, #4
        sqrshrun2       v16.16b, v17.8h, #4
        // Interleave the original in[i+1] samples with the filtered ones.
        zip1            v0.16b, v4.16b, v16.16b
        zip2            v1.16b, v4.16b, v16.16b
        st1             {v0.16b, v1.16b}, [x0]
        ret
endfunc

// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
//                                       const pixel *const in);
//
// With in[-1] taken as in[0] and in[j] taken as in[sz] for j >= sz,
// for i = 0..7:
//   out[2*i]     = in[i]
//   out[2*i + 1] = sat_u8((9*(in[i] + in[i+1]) - (in[i-1] + in[i+2]) + 8) >> 4)
// 16 bytes are stored at out, and in[sz] is stored at out[16].
function ipred_z2_upsample_edge_8bpc_neon, export=1
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4, padding_mask
        ld1             {v0.16b}, [x2] // in[]
        add             x5, x2, w1, uxtw // in[sz]
        sub             x4, x4, w1, uxtw
        ld1r            {v2.16b}, [x2] // in[0] for padding
        ld1r            {v1.16b}, [x5] // padding
        ld1             {v3.16b}, [x4] // padding_mask
        movi            v31.8h, #9
        bit             v0.16b, v1.16b, v3.16b // padded in[]
        ext             v4.16b, v2.16b, v0.16b, #15 // in[i-1], lane 0 = in[0]
        ext             v5.16b, v0.16b, v1.16b, #1
        ext             v6.16b, v0.16b, v1.16b, #2
        uaddl           v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1]
        uaddl           v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2]
        mul             v16.8h, v16.8h, v31.8h // 9*(in[i+0] + in[i+1])
        sub             v16.8h, v16.8h, v18.8h
        sqrshrun        v16.8b, v16.8h, #4
        add             x5, x0, #16
        zip1            v2.16b, v0.16b, v16.16b
        st1             {v1.b}[0], [x5]
        // In case sz=8, output one single pixel in out[16].
        // (The store above is unconditional, so it also runs for sz=4.)
        st1             {v2.16b}, [x0]
        ret
endfunc

// 3-tap kernels, one 4-byte row per strength (1 and 2). Bytes 1 and 2 of a
// row are the outer-tap and centre-tap coefficients loaded by
// ipred_z1_filter_edge_8bpc_neon; each kernel (outer, centre, outer) sums
// to 16.
const edge_filter
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst

// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
//                                     const pixel *const in, const int end,
//                                     const int strength);
function ipred_z1_filter_edge_8bpc_neon, export=1
        cmp             w4, #3
        b.eq            L(fivetap) // if (strength == 3) goto fivetap
        movrel          x5, edge_filter, -3
        add             x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1
        ld1             {v31.h}[0], [x5] // kernel[1-2]
        ld1             {v0.16b}, [x2], #16
        // v30 = kernel[1] (outer taps) and v31 = kernel[2] (centre tap);
        // v30 is filled first because the second dup overwrites v31.
        dup             v30.16b, v31.b[0]
        dup             v31.16b, v31.b[1]
1:
        // in[end], is the last valid pixel.
        // We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3, #17
        ld1             {v1.16b}, [x2], #16
        b.lt            2f
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        // 3-tap: in[i]*k1 + in[i+1]*k2 + in[i+2]*k1 (v30 = k1, v31 = k2)
        umull           v4.8h, v0.8b, v30.8b
        umlal           v4.8h, v2.8b, v31.8b
        umlal           v4.8h, v3.8b, v30.8b
        umull2          v5.8h, v0.16b, v30.16b
        umlal2          v5.8h, v2.16b, v31.16b
        umlal2          v5.8h, v3.16b, v30.16b
        subs            w1, w1, #16
        mov             v0.16b, v1.16b
        rshrn           v4.8b, v4.8h, #4
        rshrn2          v4.16b, v5.8h, #4
        sub             w3, w3, #16
        st1             {v4.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5, padding_mask
        sub             w6, w3, #32
        sub             x5, x5, w3, uxtw
        add             x6, x2, w6, sxtw

        ld1             {v2.16b}, [x5] // padding_mask

        ld1r            {v1.16b}, [x6]
        bit             v0.16b, v1.16b, v2.16b // Pad v0-v1

        // Filter one block
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        umull           v4.8h, v0.8b, v30.8b
        umlal           v4.8h, v2.8b, v31.8b
        umlal           v4.8h, v3.8b, v30.8b
        umull2          v5.8h, v0.16b, v30.16b
        umlal2          v5.8h, v2.16b, v31.16b
        umlal2          v5.8h, v3.16b, v30.16b
        subs            w1, w1, #16
        rshrn           v4.8b, v4.8h, #4
        rshrn2          v4.16b, v5.8h, #4
        st1             {v4.16b}, [x0], #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1, w1, #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // 5-tap path (strength == 3): taps {2, 4, 4, 4, 2} in v29/v30/v31/v30/v29.
        sub             x2, x2, #1 // topleft -= 1
        movi            v29.16b, #2
        ld1             {v0.16b}, [x2], #16
        movi            v30.16b, #4
        movi            v31.16b, #4
        ins             v0.b[0], v0.b[1] // overwrite the byte before in[0] with in[0]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3, #18
        ld1             {v1.16b}, [x2], #16
        b.lt            2f // if (end + 1 < 19)
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        ext             v4.16b, v0.16b, v1.16b, #3
        ext             v5.16b, v0.16b, v1.16b, #4
        umull           v6.8h, v0.8b, v29.8b
        umlal           v6.8h, v2.8b, v30.8b
        umlal           v6.8h, v3.8b, v31.8b
        umlal           v6.8h, v4.8b, v30.8b
        umlal           v6.8h, v5.8b, v29.8b
        umull2          v7.8h, v0.16b, v29.16b
        umlal2          v7.8h, v2.16b, v30.16b
        umlal2          v7.8h, v3.16b, v31.16b
        umlal2          v7.8h, v4.16b, v30.16b
        umlal2          v7.8h, v5.16b, v29.16b
        subs            w1, w1, #16
        mov             v0.16b, v1.16b
        rshrn           v6.8b, v6.8h, #4
        rshrn2          v6.16b, v7.8h, #4
        sub             w3, w3, #16
        st1             {v6.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5, padding_mask, -1
        sub             w6, w3, #31
        sub             x5, x5, w3, uxtw
        add             x6, x2, w6, sxtw

        ld1             {v2.16b, v3.16b}, [x5] // padding_mask

        ld1r            {v28.16b}, [x6]
        bit             v0.16b, v28.16b, v2.16b // Pad v0-v1
        bit             v1.16b, v28.16b, v3.16b
4:
        // Filter one block
        ext             v2.16b, v0.16b, v1.16b, #1
        ext             v3.16b, v0.16b, v1.16b, #2
        ext             v4.16b, v0.16b, v1.16b, #3
        ext             v5.16b, v0.16b, v1.16b, #4
        umull           v6.8h, v0.8b, v29.8b
        umlal           v6.8h, v2.8b, v30.8b
        umlal           v6.8h, v3.8b, v31.8b
        umlal           v6.8h, v4.8b, v30.8b
        umlal           v6.8h, v5.8b, v29.8b
        umull2          v7.8h, v0.16b, v29.16b
        umlal2          v7.8h, v2.16b, v30.16b
        umlal2          v7.8h, v3.16b, v31.16b
        umlal2          v7.8h, v4.16b, v30.16b
        umlal2          v7.8h, v5.16b, v29.16b
        subs            w1, w1, #16
        mov             v0.16b, v1.16b
        mov             v1.16b, v28.16b
        rshrn           v6.8b, v6.8h, #4
        rshrn2          v6.16b, v7.8h, #4
        sub             w3, w3, #16
        st1             {v6.16b}, [x0], #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3, #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1, w1, #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret
endfunc

// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
//                                const int n);
//
// x0 = out, w1 = px, w2 = n.
// Fills out[] with px in 16-byte stores; the loop always runs at least once
// and the number of bytes written is n rounded up to a multiple of 16.
function ipred_pixel_set_8bpc_neon, export=1
        dup             v0.16b, w1
1:
        subs            w2, w2, #16
        st1             {v0.16b}, [x0], #16
        b.gt            1b
        ret
endfunc

// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
//
// x0 = dst, x1 = stride, x2 = top, w3 = width, w4 = height, w5 = dx,
// w6 = max_base_x.
// Per row: base = xpos >> 6, frac = xpos & 0x3e, and each pixel is
// (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6. Two rows are
// produced per loop iteration. As soon as the first row of a pair has
// base >= max_base_x, all remaining rows are filled with top[max_base_x].
// The width-specific code is selected through ipred_z1_fill1_tbl, indexed
// by clz(width) - 25.
function ipred_z1_fill1_8bpc_neon, export=1
        clz             w9, w3
        movrel          x8, ipred_z1_fill1_tbl
        sub             w9, w9, #25
        ldrsw           x9, [x8, w9, uxtw #2]
        add             x10, x2, w6, uxtw // top[max_base_x]
        add             x8, x8, x9
        ld1r            {v31.16b}, [x10] // padding
        mov             w7, w5
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8, w7, #6 // base
        and             w9, w7, #0x3e // frac
        add             w7, w7, w5 // xpos += dx
        cmp             w8, w6 // base >= max_base_x
        lsr             w10, w7, #6 // base
        and             w11, w7, #0x3e // frac
        b.ge            49f
        ldr             d0, [x2, w8, uxtw] // top[base]
        ldr             d2, [x2, w10, uxtw]
        dup             v4.4h, w9 // frac
        dup             v5.4h, w11
        ext             v1.8b, v0.8b, v0.8b, #1 // top[base+1]
        ext             v3.8b, v2.8b, v2.8b, #1
        usubl           v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
        usubl           v7.8h, v3.8b, v2.8b
        ushll           v16.8h, v0.8b, #6 // top[base]*64
        ushll           v17.8h, v2.8b, #6
        mla             v16.4h, v6.4h, v4.4h // + top[base+1]*frac
        mla             v17.4h, v7.4h, v5.4h
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.s}[0], [x0], x1
        add             w7, w7, w5 // xpos += dx
        subs            w4, w4, #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:
        // Past max_base_x: fill the remaining rows with the padding pixel.
        st1             {v31.s}[0], [x0], x1
        subs            w4, w4, #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8, w7, #6 // base
        and             w9, w7, #0x3e // frac
        add             w7, w7, w5 // xpos += dx
        cmp             w8, w6 // base >= max_base_x
        lsr             w10, w7, #6 // base
        and             w11, w7, #0x3e // frac
        b.ge            89f
        ldr             q0, [x2, w8, uxtw] // top[base]
        ldr             q2, [x2, w10, uxtw]
        dup             v4.8b, w9 // frac
        dup             v5.8b, w11
        sub             w9, w15, w9 // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b, w9 // 64 - frac
        dup             v7.8b, w11
        ext             v1.16b, v0.16b, v0.16b, #1 // top[base+1]
        ext             v3.16b, v2.16b, v2.16b, #1
        umull           v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
        umlal           v16.8h, v1.8b, v4.8b // + top[base+1]*frac
        umull           v17.8h, v2.8b, v7.8b
        umlal           v17.8h, v3.8b, v5.8b
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.8b}, [x0], x1
        add             w7, w7, w5 // xpos += dx
        subs            w4, w4, #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:
        st1             {v31.8b}, [x0], x1
        subs            w4, w4, #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        // Wide blocks: x0 and x13 walk two adjacent rows in 16-pixel steps;
        // x1 becomes 2*stride - width so adding it moves to the next row pair.
        mov             w12, w3

        add             x13, x0, x1
        lsl             x1, x1, #1
        sub             x1, x1, w3, uxtw
1:
        lsr             w8, w7, #6 // base
        and             w9, w7, #0x3e // frac
        add             w7, w7, w5 // xpos += dx
        cmp             w8, w6 // base >= max_base_x
        lsr             w10, w7, #6 // base
        and             w11, w7, #0x3e // frac
        b.ge            169f
        add             x8, x2, w8, uxtw
        add             x10, x2, w10, uxtw
        dup             v4.16b, w9 // frac
        dup             v5.16b, w11
        ld1             {v0.16b, v1.16b}, [x8], #32 // top[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9, w15, w9 // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b, w9 // 64 - frac
        dup             v7.16b, w11
        add             w7, w7, w5 // xpos += dx
2:
        ext             v16.16b, v0.16b, v1.16b, #1 // top[base+1]
        ext             v17.16b, v2.16b, v3.16b, #1
        subs            w3, w3, #16
        umull           v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
        umlal           v18.8h, v16.8b, v4.8b // + top[base+1]*frac
        umull2          v19.8h, v0.16b, v6.16b
        umlal2          v19.8h, v16.16b, v4.16b
        umull           v20.8h, v2.8b, v7.8b
        umlal           v20.8h, v17.8b, v5.8b
        umull2          v21.8h, v2.16b, v7.16b
        umlal2          v21.8h, v17.16b, v5.16b
        rshrn           v16.8b, v18.8h, #6
        rshrn2          v16.16b, v19.8h, #6
        rshrn           v17.8b, v20.8h, #6
        rshrn2          v17.16b, v21.8h, #6
        st1             {v16.16b}, [x0], #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        mov             v0.16b, v1.16b
        ld1             {v1.16b}, [x8], #16 // top[base]
        mov             v2.16b, v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w4, w4, #2
        b.le            9f
        add             x0, x0, x1
        add             x13, x13, x1
        mov             w3, w12
        b               1b
9:
        ret

169:
        // Past max_base_x: fill the remaining row pairs with the padding pixel.
        st1             {v31.16b}, [x0], #16
        subs            w3, w3, #16
        st1             {v31.16b}, [x13], #16
        b.gt            169b
        subs            w4, w4, #2
        b.le            9b
        add             x0, x0, x1
        add             x13, x13, x1
        mov             w3, w12
        b               169b
endfunc

jumptable ipred_z1_fill1_tbl
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b - ipred_z1_fill1_tbl
        .word 40b - ipred_z1_fill1_tbl
endjumptable

// Same register arguments as ipred_z1_fill1_8bpc_neon (x0 = dst, x1 = stride,
// x2 = top, w3 = width, w4 = height, w5 = dx, w6 = max_base_x), but
// top[base] / top[base+1] are taken from the even / odd bytes of the loaded
// vector (uzp1/uzp2), i.e. consecutive output pixels step two bytes through
// top[]. Only two code paths exist: width == 8, and the fall-through path
// labelled w == 4.
function ipred_z1_fill2_8bpc_neon, export=1
        cmp             w3, #8
        add             x10, x2, w6, uxtw // top[max_base_x]
        ld1r            {v31.16b}, [x10] // padding
        mov             w7, w5
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8, w7, #6 // base
        and             w9, w7, #0x3e // frac
        add             w7, w7, w5 // xpos += dx
        cmp             w8, w6 // base >= max_base_x
        lsr             w10, w7, #6 // base
        and             w11, w7, #0x3e // frac
        b.ge            49f
        ldr             d0, [x2, w8, uxtw] // top[base]
        ldr             d2, [x2, w10, uxtw]
        dup             v4.4h, w9 // frac
        dup             v5.4h, w11
        uzp2            v1.8b, v0.8b, v0.8b // top[base+1]
        uzp1            v0.8b, v0.8b, v0.8b // top[base]
        uzp2            v3.8b, v2.8b, v2.8b
        uzp1            v2.8b, v2.8b, v2.8b
        usubl           v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
        usubl           v7.8h, v3.8b, v2.8b
        ushll           v16.8h, v0.8b, #6 // top[base]*64
        ushll           v17.8h, v2.8b, #6
        mla             v16.4h, v6.4h, v4.4h // + top[base+1]*frac
        mla             v17.4h, v7.4h, v5.4h
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.s}[0], [x0], x1
        add             w7, w7, w5 // xpos += dx
        subs            w4, w4, #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:
        st1             {v31.s}[0], [x0], x1
        subs            w4, w4, #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8, w7, #6 // base
        and             w9, w7, #0x3e // frac
        add             w7, w7, w5 // xpos += dx
        cmp             w8, w6 // base >= max_base_x
        lsr             w10, w7, #6 // base
        and             w11, w7, #0x3e // frac
        b.ge            89f
        ldr             q0, [x2, w8, uxtw] // top[base]
        ldr             q2, [x2, w10, uxtw]
        dup             v4.8b, w9 // frac
        dup             v5.8b, w11
        sub             w9, w15, w9 // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b, w9 // 64 - frac
        dup             v7.8b, w11
        uzp2            v1.16b, v0.16b, v0.16b // top[base+1]
        uzp1            v0.16b, v0.16b, v0.16b // top[base]
        uzp2            v3.16b, v2.16b, v2.16b
        uzp1            v2.16b, v2.16b, v2.16b
        umull           v16.8h, v1.8b, v4.8b // top[base+1]*frac
        umlal           v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
        umull           v17.8h, v3.8b, v5.8b
        umlal           v17.8h, v2.8b, v7.8b
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.8b}, [x0], x1
        add             w7, w7, w5 // xpos += dx
        subs            w4, w4, #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:
        st1             {v31.8b}, [x0], x1
        subs            w4, w4, #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret
endfunc

// void ipred_reverse_8bpc_neon(pixel
//                              *dst, const pixel *const src,
//                              const int n);
//
// x0 = dst, x1 = src, w2 = n.
// Reads 16-byte blocks going downwards from src-16 and writes them
// byte-reversed going upwards from dst, so dst[i] = src[-1 - i].
// rev64 only reverses within each 64-bit half, so the two halves are stored
// swapped (high half to x0, low half to x3 = x0 + 8).
function ipred_reverse_8bpc_neon, export=1
        sub             x1, x1, #16
        add             x3, x0, #8
        mov             x4, #16
1:
        ld1             {v0.16b}, [x1]
        subs            w2, w2, #16
        rev64           v0.16b, v0.16b
        sub             x1, x1, #16
        st1             {v0.d}[1], [x0], x4
        st1             {v0.d}[0], [x3], x4
        b.gt            1b
        ret
endfunc

// Lane indices 0..15 as 16-bit values; the z2 fill functions multiply them
// by -dy to get per-column ypos, and add them to base_x to get per-column
// x positions.
const increments
        .short 0, 1, 2, 3, 4, 5, 6, 7
        .short 8, 9, 10, 11, 12, 13, 14, 15
endconst

// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dx, const int dy);
//
// x0 = dst, x1 = stride, x2 = top, x3 = left, w4 = width, w5 = height,
// w6 = dx, w7 = dy.
// For every pixel both a top[] interpolation (using xpos) and a left[]
// interpolation (using ypos) are computed; the top result is selected where
// the pixel's base_x is >= 0 and the left result elsewhere. Two rows are
// produced per iteration. Once base_x is far enough to the left that the
// whole row comes from left[], the code switches to a left-only loop.
// Dispatch on width goes through ipred_z2_fill1_tbl, indexed by
// clz(width) - 25.
function ipred_z2_fill1_8bpc_neon, export=1
        clz             w10, w4
        movrel          x9, ipred_z2_fill1_tbl
        sub             w10, w10, #25
        ldrsw           x10, [x9, w10, uxtw #2]
        mov             w8, #(1 << 6) // xpos = 1 << 6
        add             x9, x9, x10
        sub             w8, w8, w6 // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11] // increments
        neg             w7, w7 // -dy

        br              x9
40:
        AARCH64_VALID_JUMP_TARGET

        dup             v30.4h, w7 // -dy
        movi            v17.8b, #1

        mul             v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e
        add             v30.4h, v16.4h, v30.4h // -= dy

        xtn             v31.8b, v31.8h // {0,1,2,3}

        // Worst case height for w=4 is 16, but we need at least h+1 elements
        ld1             {v0.16b, v1.16b}, [x3] // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b, v30.8h // (uint8_t)ypos
        shrn            v29.8b, v30.8h, #6 // ypos >> 6
        and             v27.8b, v27.8b, v25.8b // frac_y

        add             v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        add             v30.8b, v29.8b, v17.8b // base_y + 1
        add             v28.8b, v29.8b, v19.8b // base_y + 2

        tbl             v16.8b, {v0.16b}, v29.8b // left[base_y]

        trn1            v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2

        sub             v28.8b, v26.8b, v27.8b // 64 - frac_y

        trn1            v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}

        trn1            v27.2s, v27.2s, v27.2s // frac_y
        trn1            v28.2s, v28.2s, v28.2s // 64 - frac_y

        movi            v29.8b, #2
4:
        asr             w9, w8, #6 // base_x
        dup             v6.4h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, #-4 // base_x <= -4
        asr             w11, w8, #6 // base_x
        b.le            49f

        dup             v7.4h, w8 // xpos

        ldr             d2, [x2, w9, sxtw] // top[base_x]
        ldr             d4, [x2, w11, sxtw]

        trn1            v6.2d, v6.2d, v7.2d // xpos

        //
        // Cut corners here; only doing tbl over v0 here; we only
        // seem to need the last pixel, from v1, after skipping to the
        // left-only codepath below.
        tbl             v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]

        shrn            v20.8b, v6.8h, #6 // first base_x for each row
        xtn             v6.8b, v6.8h // (uint8_t)xpos

        ext             v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
        ext             v5.8b, v4.8b, v4.8b, #1

        and             v6.8b, v6.8b, v25.8b // frac_x

        trn1            v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]

        trn1            v2.2s, v2.2s, v4.2s // top[base_x]
        trn1            v3.2s, v3.2s, v5.2s // top[base_x+1]

        sub             v7.8b, v26.8b, v6.8b // 64 - frac_x

        add             v20.8b, v20.8b, v31.8b // actual base_x

        umull           v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y

        umull           v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
        umlal           v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x

        cmge            v20.8b, v20.8b, #0 // mask: base_x >= 0 -> take the top[] result

        rshrn           v16.8b, v16.8h, #6
        rshrn           v22.8b, v22.8h, #6

        bit             v16.8b, v22.8b, v20.8b

        st1             {v16.s}[0], [x0], x1
        sub             w8, w8, w6 // xpos -= dx
        subs            w5, w5, #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        ext             v16.8b, v17.8b, v17.8b, #4 // left[base_y+2] becomes the next left[base_y]
        add             v30.8b, v30.8b, v29.8b // base_y += 2
        b               4b

49:
        // Left-only path for the remaining rows (w=4).
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]

        trn1            v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]

        umull           v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
        rshrn           v18.8b, v18.8h, #6

        st1             {v18.s}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        ext             v16.8b, v17.8b, v17.8b, #4
        add             v30.8b, v30.8b, v29.8b // base_y += 2
        b               49b

9:
        ret

80:
        AARCH64_VALID_JUMP_TARGET

        dup             v30.8h, w7 // -dy
        movi            v17.8b, #1

        mul             v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e
        add             v30.8h, v16.8h, v30.8h // -= dy

        xtn             v31.8b, v31.8h // {0,1,2,3,4,5,6,7}

        // Worst case height for w=8 is 32, but we need at least h+1 elements
        ld1             {v0.16b, v1.16b, v2.16b}, [x3] // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b, v30.8h // (uint8_t)ypos
        shrn            v29.8b, v30.8h, #6 // ypos >> 6
        and             v27.8b, v27.8b, v25.8b // frac_y

        add             v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        tbl             v18.8b, {v0.16b}, v29.8b // left[base_y]

        add             v30.8b, v29.8b, v19.8b // base_y + 2
        add             v29.8b, v29.8b, v17.8b // base_y + 1

        sub             v28.8b, v26.8b, v27.8b // 64 - frac_y

        trn1            v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}

        movi            v24.8b, #2 // 2
8:
        asr             w9, w8, #6 // base_x
        dup             v16.8h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, #-8 // base_x <= -8
        asr             w11, w8, #6 // base_x
        b.le            89f

        dup             v17.8h, w8 // xpos

        ldr             q4, [x2, w9, sxtw] // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        // Cut corners here; only doing tbl over v0-v1 here; we only
        // seem to need the last pixel, from v2, after skipping to the
        // left-only codepath below.
        tbl             v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]

        shrn            v21.8b, v16.8h, #6 // first base_x
        shrn2           v21.16b, v17.8h, #6
        xtn             v16.8b, v16.8h // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        tbl             v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]

        ext             v5.16b, v4.16b, v4.16b, #1 // top[base_x+1]
        ext             v7.16b, v6.16b, v6.16b, #1

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d, v4.2d, v6.2d // top[base_x]
        trn1            v5.2d, v5.2d, v7.2d // top[base_x+1]

        sub             v7.16b, v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x

        umull           v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull           v17.8h, v19.8b, v28.8b
        umlal           v17.8h, v20.8b, v27.8b

        umull           v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
        umlal           v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
        umull2          v23.8h, v4.16b, v7.16b
        umlal2          v23.8h, v5.16b, v16.16b

        cmge            v21.16b, v21.16b, #0

        rshrn           v6.8b, v6.8h, #6
        rshrn2          v6.16b, v17.8h, #6
        rshrn           v22.8b, v22.8h, #6
        rshrn2          v22.16b, v23.8h, #6

        bit             v6.16b, v22.16b, v21.16b

        st1             {v6.d}[0], [x0], x1
        sub             w8, w8, w6 // xpos -= dx
        subs            w5, w5, #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        mov             v18.8b, v20.8b
        add             v29.8b, v29.8b, v24.8b // base_y += 2
        add             v30.8b, v30.8b, v24.8b // base_y += 2
        b               8b

89:
        // Left-only path for the remaining rows (w=8).
        tbl             v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
        tbl             v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]

        umull           v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull           v17.8h, v19.8b, v28.8b
        umlal           v17.8h, v20.8b, v27.8b

        rshrn           v6.8b, v6.8h, #6
        rshrn2          v6.16b, v17.8h, #6

        st1             {v6.d}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        mov             v18.8b, v20.8b
        add             v29.8b, v29.8b, v24.8b // base_y += 2
        add             v30.8b, v30.8b, v24.8b // base_y += 2
        b               89b

9:
        ret

160:
        AARCH64_VALID_JUMP_TARGET

        // This path uses v8-v15, so d8-d15 are saved here and restored
        // before returning.
        stp             d8, d9, [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             x11, x11, #16 // increments

        dup             v18.8h, w7 // -dy
        movi            v17.16b, #1
        add             x3, x3, #1 // Skip past left[0]

        ld1             {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}

        mul             v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
        mul             v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy
        movi            v25.16b, #0x3e
        add             v16.8h, v16.8h, v18.8h // -= dy
        add             v18.8h, v19.8h, v18.8h

        xtn             v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
        xtn2            v31.16b, v14.8h // {8,9,10,11,12,13,14,15}

        // Worst case height is 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
        ld1r            {v15.16b}, [x2] // left[0] == top[0]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b, v16.8h // (uint8_t)ypos
        xtn2            v27.16b, v18.8h
        shrn            v29.8b, v16.8h, #6 // ypos >> 6
        shrn2           v29.16b, v18.8h, #6
        mov             v18.16b, v15.16b // left[0]
        and             v27.16b, v27.16b, v25.16b // frac_y

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        // tbx leaves lanes with an out-of-range index untouched, so those
        // lanes keep the left[0] value preloaded from v15.
        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]

        add             v30.16b, v29.16b, v19.16b // base_y + 2
        add             v29.16b, v29.16b, v17.16b // base_y + 1

        sub             v28.16b, v26.16b, v27.16b // 64 - frac_y

        movi            v24.16b, #2 // 2
16:
        asr             w9, w8, #6 // base_x
        dup             v16.8h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, #-16 // base_x <= -16
        asr             w11, w8, #6 // base_x
        b.le            169f

        dup             v17.8h, w8 // xpos

        add             x9, x2, w9, sxtw
        add             x11, x2, w11, sxtw

        ld1             {v4.16b, v5.16b}, [x9] // top[base_x]
        mov             v19.16b, v15.16b // left[0]
        ld1             {v6.16b, v7.16b}, [x11]

        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]

        mov             v20.16b, v15.16b // left[0]

        shrn            v21.8b, v16.8h, #6 // first base_x
        shrn            v22.8b, v17.8h, #6
        xtn             v16.8b, v16.8h // (uint8_t)xpos
        xtn             v17.8b, v17.8h

        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]

        trn1            v21.2d, v21.2d, v21.2d // first base_x
        trn1            v22.2d, v22.2d, v22.2d
        trn1            v16.2d, v16.2d, v16.2d // (uint8_t)xpos
        trn1            v17.2d, v17.2d, v17.2d

        ext             v5.16b, v4.16b, v5.16b, #1 // top[base_x+1]
        ext             v7.16b, v6.16b, v7.16b, #1

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y

        sub             v8.16b, v26.16b, v16.16b // 64 - frac_x
        sub             v9.16b, v26.16b, v17.16b

        umull2          v11.8h, v18.16b, v28.16b
        umlal2          v11.8h, v19.16b, v27.16b

        add             v21.16b, v21.16b, v31.16b // actual base_x
        add             v22.16b, v22.16b, v31.16b

        umull           v12.8h, v19.8b, v28.8b
        umlal           v12.8h, v20.8b, v27.8b
        umull2          v13.8h, v19.16b, v28.16b
        umlal2          v13.8h, v20.16b, v27.16b

        rshrn           v10.8b, v10.8h, #6
        rshrn2          v10.16b, v11.8h, #6
        rshrn           v11.8b, v12.8h, #6
        rshrn2          v11.16b, v13.8h, #6

        umull           v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
        umlal           v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
        umull2          v13.8h, v4.16b, v8.16b
        umlal2          v13.8h, v5.16b, v16.16b
        umull           v14.8h, v6.8b, v9.8b
        umlal           v14.8h, v7.8b, v17.8b
        umull2          v18.8h, v6.16b, v9.16b
        umlal2          v18.8h, v7.16b, v17.16b

        cmge            v21.16b, v21.16b, #0
        cmge            v22.16b, v22.16b, #0
        rshrn           v12.8b, v12.8h, #6
        rshrn2          v12.16b, v13.8h, #6
        rshrn           v13.8b, v14.8h, #6
        rshrn2          v13.16b, v18.8h, #6

        bit             v10.16b, v12.16b, v21.16b
        bit             v11.16b, v13.16b, v22.16b

        st1             {v10.16b}, [x0], x1
        subs            w5, w5, #2
        sub             w8, w8, w6 // xpos -= dx
        st1             {v11.16b}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2
        add             v30.16b, v30.16b, v24.16b // base_y += 2
        b               16b

169:
        // Left-only path for the remaining rows (w=16).
        mov             v19.16b, v15.16b
        mov             v20.16b, v15.16b
        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]

        umull           v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull2          v5.8h, v18.16b, v28.16b
        umlal2          v5.8h, v19.16b, v27.16b
        umull           v6.8h, v19.8b, v28.8b
        umlal           v6.8h, v20.8b, v27.8b
        umull2          v7.8h, v19.16b, v28.16b
        umlal2          v7.8h, v20.16b, v27.16b

        rshrn           v4.8b, v4.8h, #6
        rshrn2          v4.16b, v5.8h, #6
        rshrn           v5.8b, v6.8h, #6
        rshrn2          v5.16b, v7.8h, #6

        st1             {v4.16b}, [x0], x1
        subs            w5, w5, #2
        st1             {v5.16b}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2
        add             v30.16b, v30.16b, v24.16b // base_y += 2
        b               169b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8, d9, [sp], 0x40
        ret

320:
640:
        AARCH64_VALID_JUMP_TARGET

        // Wide blocks are processed as 16-pixel column chunks of two rows at
        // a time; v8-v15 are used, so d8-d15 are saved here.
        stp             d8, d9, [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             x11, x11, #16 // increments

        dup             v25.8h, w7 // -dy
        add             x3, x3, #1 // Skip past left[0]

        ld1             {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}

        add             x13, x0, x1 // alternating row
        lsl             x1, x1, #1 // stride *= 2
        sub             x1, x1, w4, uxtw // stride -= width

        movi            v11.8h, #8
        mul             v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
        add             v26.8h, v26.8h, v25.8h // -= dy
        mul             v25.8h, v25.8h, v11.8h // -8*dy

        xtn             v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
        xtn2            v31.16b, v14.8h // {8,9,10,11,12,13,14,15}

        // Worst case height is 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
        ld1r            {v15.16b}, [x2] // left[0] == top[0]

        mov             w12, w4 // orig w
        neg             w14, w4 // -w

1:
        mov             v23.16b, v26.16b // reset ypos

        asr             w9, w8, #6 // base_x
        dup             v16.8h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, w14 // base_x <= -w
        asr             w11, w8, #6 // base_x
        b.le            329f

        dup             v17.8h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx

        add             x9, x2, w9, sxtw
        add             x11, x2, w11, sxtw

        // Saturating narrow: keeps a strongly negative base_x negative in
        // 8 bits instead of wrapping.
        sqshrn          v21.8b, v16.8h, #6 // first base_x
        sqshrn          v22.8b, v17.8h, #6
        xtn             v16.8b, v16.8h // (uint8_t)xpos
        xtn             v17.8b, v17.8h

        ld1             {v4.16b}, [x9], #16 // top[base_x]
        ld1             {v6.16b}, [x11], #16

        trn1            v21.2d, v21.2d, v21.2d // first base_x
        trn1            v22.2d, v22.2d, v22.2d
        trn1            v16.2d, v16.2d, v16.2d // (uint8_t)xpos
        trn1            v17.2d, v17.2d, v17.2d

        movi            v10.16b, #0x3e
        movi            v11.16b, #64

        and             v16.16b, v16.16b, v10.16b // frac_x
        and             v17.16b, v17.16b, v10.16b

        sub             v8.16b, v11.16b, v16.16b // 64 - frac_x
        sub             v9.16b, v11.16b, v17.16b

        add             v21.16b, v21.16b, v31.16b // actual base_x
        add             v22.16b, v22.16b, v31.16b

2:
        add             v13.8h, v23.8h, v25.8h // ypos -= 8*dy
        movi            v12.16b, #64
        movi            v20.16b, #2
        movi            v10.16b, #0x3e

        smov            w10, v22.b[0]

        xtn             v27.8b, v23.8h // (uint8_t)ypos
        xtn2            v27.16b, v13.8h
        shrn            v29.8b, v23.8h, #6 // ypos >> 6
        shrn2           v29.16b, v13.8h, #6
        cmp             w10, #0 // base_x (bottom left) >= 0
        and             v27.16b, v27.16b, v10.16b // frac_y

        mov             v18.16b, v15.16b // left[0]

        b.ge            4f

        add             v23.8h, v13.8h, v25.8h // ypos -= 8*dy

        movi            v13.16b, #1

        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
        add             v29.16b, v29.16b, v13.16b // base_y + 1
        mov             v19.16b, v15.16b // left[0]

        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y

        ld1             {v5.16b}, [x9], #16 // top[base_x]
        ld1             {v7.16b}, [x11], #16

        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
        add             v29.16b, v29.16b, v13.16b // base_y + 2

        mov             v20.16b, v15.16b // left[0]
        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]

        umull           v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull2          v11.8h, v18.16b, v28.16b
        umlal2          v11.8h, v19.16b, v27.16b
        umull           v12.8h, v19.8b, v28.8b
        umlal           v12.8h, v20.8b, v27.8b
        umull2          v13.8h, v19.16b, v28.16b
        umlal2          v13.8h, v20.16b, v27.16b

        ext             v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
        ext             v19.16b, v6.16b, v7.16b, #1

        rshrn           v10.8b, v10.8h, #6
        rshrn2          v10.16b, v11.8h, #6
        rshrn           v11.8b, v12.8h, #6
        rshrn2          v11.16b, v13.8h, #6

        umull           v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
        umlal           v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
        umull2          v13.8h, v4.16b, v8.16b
        umlal2          v13.8h, v18.16b, v16.16b
        umull           v14.8h, v6.8b, v9.8b
        umlal           v14.8h, v19.8b, v17.8b
        umull2          v20.8h, v6.16b, v9.16b
        umlal2          v20.8h, v19.16b, v17.16b

        cmge            v18.16b, v21.16b, #0
        cmge            v19.16b, v22.16b, #0

        rshrn           v12.8b, v12.8h, #6
        rshrn2          v12.16b, v13.8h, #6
        rshrn           v13.8b, v14.8h, #6
        rshrn2          v13.16b, v20.8h, #6

        bit             v10.16b, v12.16b, v18.16b
        bit             v11.16b, v13.16b, v19.16b

        st1             {v10.16b}, [x0], #16
        subs            w4, w4, #16
        st1             {v11.16b}, [x13], #16
        b.le            3f

        movi            v10.16b, #16
        mov             v4.16b, v5.16b
        mov             v6.16b, v7.16b
        add             v21.16b, v21.16b, v10.16b // base_x += 16
        add             v22.16b, v22.16b, v10.16b
        b               2b

3:
        subs            w5, w5, #2
        b.le            9f
        movi            v10.8h, #128
        add             x0, x0, x1
        add             x13, x13, x1
        mov             w4, w12 // reset w
        add             v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
        b               1b

4:      // The rest of the row only predicted from top[]
        ld1             {v5.16b}, [x9], #16 // top[base_x]
        ld1             {v7.16b}, [x11], #16

        ext             v18.16b, v4.16b, v5.16b, #1 // top[base_x+1]
        ext             v19.16b, v6.16b, v7.16b, #1

        umull           v12.8h, v4.8b, v8.8b // top[base_x]*(64-frac_x)
        umlal           v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
        umull2          v13.8h, v4.16b, v8.16b
        umlal2          v13.8h, v18.16b, v16.16b
        umull           v14.8h, v6.8b, v9.8b
        umlal           v14.8h, v19.8b, v17.8b
        umull2          v20.8h, v6.16b, v9.16b
        umlal2          v20.8h, v19.16b, v17.16b

        rshrn           v12.8b, v12.8h, #6
        rshrn2          v12.16b, v13.8h, #6
        rshrn           v13.8b, v14.8h, #6
        rshrn2          v13.16b, v20.8h, #6

        st1             {v12.16b}, [x0], #16
        subs            w4, w4, #16
        st1             {v13.16b}, [x13], #16
        b.le            3b

        mov             v4.16b, v5.16b
        mov             v6.16b, v7.16b
        b               4b

329:    // The rest of the block only predicted from left[]
        add             x1, x1, w4, uxtw // restore stride
        mov             w12, w5 // orig remaining h
1:
        add             v13.8h, v23.8h, v25.8h // ypos -= 8*dy
        movi            v12.16b, #64
        movi            v10.16b, #0x3e

        xtn             v27.8b, v23.8h // (uint8_t)ypos
        xtn2            v27.16b, v13.8h
        shrn            v29.8b, v23.8h, #6 // ypos >> 6
        shrn2           v29.16b, v13.8h, #6
        and             v27.16b, v27.16b, v10.16b // frac_y

        mov             v18.16b, v15.16b // left[0]
        add             v23.8h, v13.8h, v25.8h // ypos -= 8*dy
        movi            v21.16b, #1

        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
        add             v29.16b, v29.16b, v21.16b // base_y + 1

        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
2:
        mov             v19.16b, v15.16b // left[0]
        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
        add             v29.16b, v29.16b, v21.16b // base_y + 2
        mov             v20.16b, v15.16b // left[0]
        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
        add             v29.16b, v29.16b, v21.16b // next base_y

        umull           v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull2          v11.8h, v18.16b, v28.16b
        umlal2          v11.8h, v19.16b, v27.16b
        umull           v12.8h, v19.8b, v28.8b
        umlal           v12.8h, v20.8b, v27.8b
        umull2          v13.8h, v19.16b, v28.16b
        umlal2          v13.8h, v20.16b, v27.16b

        rshrn           v10.8b, v10.8h, #6
        rshrn2          v10.16b, v11.8h, #6
        rshrn           v11.8b, v12.8h, #6
        rshrn2          v11.16b, v13.8h, #6

        st1             {v10.16b}, [x0], x1
        subs            w5, w5, #2
        st1             {v11.16b}, [x13], x1
        b.le            3f
        mov             v18.16b, v20.16b
        b               2b

3:
        // Column chunk finished: rewind both row pointers to the top of the
        // remaining rows and step 16 pixels to the right.
        subs            w4, w4, #16
        b.le            9f

        lsr             x1, x1, #1
        msub            x0, x1, x12, x0 // ptr -= h * stride
        msub            x13, x1, x12, x13
        lsl             x1, x1, #1
        add             x0, x0, #16
        add             x13, x13, #16
        mov             w5, w12 // reset h
        b               1b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8, d9, [sp], 0x40
        ret
endfunc

jumptable ipred_z2_fill1_tbl
        .word 640b - ipred_z2_fill1_tbl
        .word 320b - ipred_z2_fill1_tbl
        .word 160b - ipred_z2_fill1_tbl
        .word 80b - ipred_z2_fill1_tbl
        .word 40b - ipred_z2_fill1_tbl
endjumptable

// Variant of ipred_z2_fill1_8bpc_neon with the same register arguments
// (x0 = dst, x1 = stride, x2 = top, x3 = left, w4 = width, w5 = height,
// w6 = dx, w7 = dy). Here xpos starts at 2 << 6 and top[base_x] /
// top[base_x+1] are taken from the even / odd bytes of the loaded top[]
// data (uzp1/uzp2); the in-code comments call this the upsample_top case.
// Only width 8 and a fall-through path labelled 40 are implemented.
function ipred_z2_fill2_8bpc_neon, export=1
        cmp             w4, #8
        mov             w8, #(2 << 6) // xpos = 2 << 6
        sub
w8, w8, w6 // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11] // increments
        neg             w7, w7 // -dy
        b.eq            80f

40:
        dup             v30.4h, w7 // -dy
        movi            v17.8b, #1

        mul             v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e
        add             v30.4h, v16.4h, v30.4h // -= dy

        xtn             v31.8b, v31.8h // {0,1,2,3}

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.16b}, [x3] // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b, v30.8h // (uint8_t)ypos
        shrn            v29.8b, v30.8h, #6 // ypos >> 6
        and             v27.8b, v27.8b, v25.8b // frac_y

        add             v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        add             v30.8b, v29.8b, v17.8b // base_y + 1
        add             v28.8b, v29.8b, v19.8b // base_y + 2

        tbl             v16.8b, {v0.16b}, v29.8b // left[base_y]

        trn1            v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2

        sub             v28.8b, v26.8b, v27.8b // 64 - frac_y

        trn1            v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3}

        trn1            v27.2s, v27.2s, v27.2s // frac_y
        trn1            v28.2s, v28.2s, v28.2s // 64 - frac_y

        movi            v29.8b, #2
        // Column offsets are doubled: each output pixel advances two
        // positions in top[].
        add             v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6}
4:
        asr             w9, w8, #6 // base_x
        dup             v6.4h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, #-8 // base_x <= -8
        asr             w11, w8, #6 // base_x
        b.le            49f

        dup             v7.4h, w8 // xpos

        ldr             d2, [x2, w9, sxtw] // top[base_x]
        ldr             d4, [x2, w11, sxtw]

        trn1            v6.2d, v6.2d, v7.2d // xpos

        tbl             v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]

        shrn            v20.8b, v6.8h, #6 // first base_x for each row
        xtn             v6.8b, v6.8h // (uint8_t)xpos

        uzp2            v3.8b, v2.8b, v4.8b // top[base_x+1]
        uzp1            v2.8b, v2.8b, v4.8b // top[base_x]

        and             v6.8b, v6.8b, v25.8b // frac_x

        trn1            v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]

        sub             v7.8b, v26.8b, v6.8b // 64 - frac_x

        add             v20.8b, v20.8b, v31.8b // actual base_x

        umull           v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y

        umull           v22.8h, v2.8b, v7.8b // top[base_x]*(64-frac_x)
        umlal           v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x

        cmge            v20.8b, v20.8b, #0

        rshrn           v16.8b, v16.8h, #6
        rshrn           v22.8b, v22.8h, #6

        bit             v16.8b, v22.8b, v20.8b

        st1             {v16.s}[0], [x0], x1
        sub             w8, w8, w6 // xpos -= dx
        subs            w5, w5, #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        ext             v16.8b, v17.8b, v17.8b, #4
        add             v30.8b, v30.8b, v29.8b // base_y += 2
        b               4b

49:
        // Left-only path for the remaining rows.
        tbl             v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]

        trn1            v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1]

        umull           v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
        rshrn           v18.8b, v18.8h, #6

        st1             {v18.s}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        ext             v16.8b, v17.8b, v17.8b, #4
        add             v30.8b, v30.8b, v29.8b // base_y += 2
        b               49b

9:
        ret

80:
        dup             v30.8h, w7 // -dy
        movi            v17.8b, #1

        mul             v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e
        add             v30.8h, v16.8h, v30.8h // -= dy

        xtn             v31.8b, v31.8h // {0,1,2,3,4,5,6,7}

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.16b}, [x3] // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b, v30.8h // (uint8_t)ypos
        shrn            v29.8b, v30.8h, #6 // ypos >> 6
        and             v27.8b, v27.8b, v25.8b // frac_y

        add             v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        tbl             v18.8b, {v0.16b}, v29.8b // left[base_y]

        add             v30.8b, v29.8b, v19.8b // base_y + 2
        add             v29.8b, v29.8b, v17.8b // base_y + 1

        sub             v28.8b, v26.8b, v27.8b // 64 - frac_y

        trn1            v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}

        movi            v24.8b, #2 // 2
        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
8:
        asr             w9, w8, #6 // base_x
        dup             v16.8h, w8 // xpos
        sub             w8, w8, w6 // xpos -= dx
        cmp             w9, #-16 // base_x <= -16
        asr             w11, w8, #6 // base_x
        b.le            89f

        dup             v17.8h, w8 // xpos

        ldr             q4, [x2, w9, sxtw] // top[base_x]
        ldr             q6, [x2, w11, sxtw]

        tbl             v19.8b, {v0.16b}, v29.8b // left[base_y+1]

        shrn            v21.8b, v16.8h, #6 // first base_x
        shrn2           v21.16b, v17.8h, #6
        xtn             v16.8b, v16.8h // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        tbl             v20.8b, {v0.16b}, v30.8b // left[base_y+2]

        uzp2            v5.16b, v4.16b, v6.16b // top[base_x+1]
        uzp1            v4.16b, v4.16b, v6.16b // top[base_x]

        and             v16.16b, v16.16b, v25.16b // frac_x

        sub             v7.16b, v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x

        umull           v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull           v17.8h, v19.8b, v28.8b
        umlal           v17.8h, v20.8b, v27.8b

        umull           v22.8h, v4.8b, v7.8b // top[base_x]*(64-frac_x)
        umlal           v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
        umull2          v23.8h, v4.16b, v7.16b
        umlal2          v23.8h, v5.16b, v16.16b

        cmge            v21.16b, v21.16b, #0

        rshrn           v6.8b, v6.8h, #6
        rshrn2          v6.16b, v17.8h, #6
        rshrn           v22.8b, v22.8h, #6
        rshrn2          v22.16b, v23.8h, #6

        bit             v6.16b, v22.16b, v21.16b

        st1             {v6.d}[0], [x0], x1
        sub             w8, w8, w6 // xpos -= dx
        subs            w5, w5, #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        mov             v18.8b, v20.8b
        add             v29.8b, v29.8b, v24.8b // base_y += 2
        add             v30.8b, v30.8b, v24.8b // base_y += 2
        b               8b

89:
        // Left-only path for the remaining rows (w=8).
        tbl             v19.8b, {v0.16b}, v29.8b // left[base_y+1]
        tbl             v20.8b, {v0.16b}, v30.8b // left[base_y+2]

        umull           v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
        umlal           v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
        umull           v17.8h, v19.8b, v28.8b
        umlal           v17.8h, v20.8b, v27.8b

        rshrn           v6.8b, v6.8h, #6
        rshrn2          v6.16b, v17.8h, #6

        st1             {v6.d}[0], [x0], x1
        subs            w5, w5, #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        mov             v18.8b, v20.8b
        add             v29.8b, v29.8b, v24.8b // base_y += 2
        add             v30.8b, v30.8b, v24.8b // base_y += 2
        b               89b

9:
        ret
endfunc

function ipred_z2_fill3_8bpc_neon, export=1
        cmp             w4, #8
        mov             w8, #(1 << 6) // xpos = 1 << 6
        sub             w8, w8, w6 // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h}, [x11] // increments
        neg             w7, w7 // -dy
        b.eq            80f

40:
        dup             v30.4h, w7 // -dy
        movi            v17.8b, #1

        mul             v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e
        add             v30.4h, v16.4h, v30.4h // -= dy

        xtn             v31.8b, v31.8h // {0,1,2,3}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
ld1 {v0.16b, v1.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v24.8b, #4 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] ext v5.8b, v4.8b, v4.8b, #1 and v6.8b, v6.8b, v25.8b // frac_x trn1 v2.2s, v2.2s, v4.2s // top[base_x] trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 4b 49: tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], 
left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 49b 9: ret 80: dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v28.8b, v29.8b, v17.8b // base_y + 1 add v30.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} add v24.8b, v28.8b, v19.8b // base_y + 3 trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y movi v24.16b, #4 trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d 
// top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 89b 9: ret endfunc // void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_8bpc_neon, export=1 cmp w6, #64 clz w9, w3 movrel x8, ipred_z3_fill1_tbl sub w9, w9, #25 ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw // left[max_base_y] add x8, x8, x9 movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments mov w7, w5 b.gt L(ipred_z3_fill1_large_h16) br x8 40: AARCH64_VALID_JUMP_TARGET dup v29.4h, w5 // dy mul v30.4h, 
v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 trn1 v24.2s, v24.2s, v24.2s // frac trn1 v25.2s, v25.2s, v25.2s // 64 - frac 1: mov v5.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac rshrn v16.8b, v16.8h, #6 st1 {v16.s}[0], [x0], x1 subs w4, w4, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v4.8b, v5.8b, v5.8b, #4 uqadd v27.8b, v27.8b, v21.8b // base += 2 b 1b 9: ret 80: AARCH64_VALID_JUMP_TARGET dup v29.8h, w5 // dy mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 1: mov v5.8b, v31.8b mov v6.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull v17.8h, v5.8b, 
v25.8b umlal v17.8h, v6.8b, v24.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.le 9f mov v4.8b, v6.8b uqadd v27.8b, v27.8b, v21.8b // base += 2 uqadd v28.8b, v28.8b, v21.8b // base += 2 b 1b 9: ret 160: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] add v28.8h, v28.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 add v29.8h, v28.8h, v29.8h // ypos + 8*dy xtn v24.8b, v28.8h // (uint8_t)ypos xtn2 v24.16b, v29.8h uqshrn v26.8b, v28.8h, #6 // base uqshrn2 v26.16b, v29.8h, #6 and v24.16b, v24.16b, v23.16b // frac mov v4.16b, v31.16b uqadd v27.16b, v26.16b, v20.16b // base + 1 uqadd v28.16b, v26.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] 1: mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], x1 subs w4, w4, #2 st1 {v17.16b}, [x0], x1 b.le 9f mov v4.16b, v6.16b uqadd v27.16b, v27.16b, v21.16b // base += 2 uqadd v28.16b, v28.16b, v21.16b // base += 2 b 1b 9: ret 320: 640: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy mov w12, w3 add x13, x0, x1 shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e lsl 
x1, x1, #1 sub x1, x1, w3, uxtw add v30.8h, v28.8h, v30.8h // ypos // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 1: mov v26.16b, v30.16b // reset ypos 2: add v27.8h, v26.8h, v29.8h // ypos + 8*dy uqshrn v16.8b, v26.8h, #6 // base uqshrn2 v16.16b, v27.8h, #6 xtn v24.8b, v26.8h // (uint8_t)ypos xtn2 v24.16b, v27.8h umov w14, v16.b[0] and v24.16b, v24.16b, v23.16b // frac uqadd v17.16b, v16.16b, v20.16b // base + 1 cmp w14, w6 // base >= max_base_y uqadd v18.16b, v16.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac b.ge 4f mov v4.16b, v31.16b mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] subs w3, w3, #16 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], #16 st1 {v17.16b}, [x13], #16 b.le 3f add v26.8h, v27.8h, v29.8h // ypos += 16*dy b 2b 3: subs w4, w4, #2 b.le 9f movi v16.8h, #128 add x0, x0, x1 add x13, x13, x1 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 mov w3, w12 b 1b 4: subs w3, w3, #16 st1 {v31.16b}, [x0], #16 st1 {v31.16b}, [x13], #16 b.gt 4b b 3b 9: ret L(ipred_z3_fill1_large_h16): // Fallback case for max_base_y > 64; similar to the z1 // implementation. This does the filtering vertically, filling out // a 2x pixel column at a time. 
mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 mov w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw add x10, x2, w10, uxtw dup v4.16b, w9 // frac dup v5.16b, w11 ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] ld1 {v2.16b, v3.16b}, [x10], #32 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.16b, w9 // 64 - frac dup v7.16b, w11 add w7, w7, w5 // ypos += dy 2: ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w4, w4, #16 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) umull2 v19.8h, v16.16b, v4.16b umlal2 v19.8h, v0.16b, v6.16b umull v20.8h, v17.8b, v5.8b umlal v20.8h, v2.8b, v7.8b umull2 v21.8h, v17.16b, v5.16b umlal2 v21.8h, v2.16b, v7.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 rshrn2 v17.16b, v21.8h, #6 zip1 v18.16b, v16.16b, v17.16b zip2 v19.16b, v16.16b, v17.16b st1 {v18.h}[0], [x0], x1 st1 {v18.h}[1], [x13], x1 st1 {v18.h}[2], [x0], x1 st1 {v18.h}[3], [x13], x1 st1 {v18.h}[4], [x0], x1 st1 {v18.h}[5], [x13], x1 st1 {v18.h}[6], [x0], x1 st1 {v18.h}[7], [x13], x1 st1 {v19.h}[0], [x0], x1 st1 {v19.h}[1], [x13], x1 st1 {v19.h}[2], [x0], x1 st1 {v19.h}[3], [x13], x1 st1 {v19.h}[4], [x0], x1 st1 {v19.h}[5], [x13], x1 st1 {v19.h}[6], [x0], x1 st1 {v19.h}[7], [x13], x1 b.le 3f mov v0.16b, v1.16b ld1 {v1.16b}, [x8], #16 // left[base] mov v2.16b, v3.16b ld1 {v3.16b}, [x10], #16 b 2b 3: subs w3, w3, #2 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 9: ret endfunc jumptable ipred_z3_fill1_tbl .word 640b - ipred_z3_fill1_tbl .word 320b - ipred_z3_fill1_tbl .word 160b - ipred_z3_fill1_tbl .word 80b - ipred_z3_fill1_tbl .word 40b - ipred_z3_fill1_tbl endjumptable function 
ipred_z3_fill_padding_neon, export=0 cmp w3, #16 movrel x8, ipred_z3_fill_padding_tbl b.gt ipred_z3_fill_padding_wide // w3 = remaining width, w4 = constant height mov w12, w4 1: // Fill a WxH rectangle with padding. W can be any number; // this fills the exact width by filling in the largest // power of two in the remaining width, and repeating. clz w9, w3 sub w9, w9, #25 ldrsw x9, [x8, w9, uxtw #2] add x9, x8, x9 br x9 20: AARCH64_VALID_JUMP_TARGET 2: st1 {v31.h}[0], [x0], x1 subs w4, w4, #4 st1 {v31.h}[0], [x13], x1 st1 {v31.h}[0], [x0], x1 st1 {v31.h}[0], [x13], x1 b.gt 2b subs w3, w3, #2 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 40: AARCH64_VALID_JUMP_TARGET 4: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 st1 {v31.s}[0], [x0], x1 st1 {v31.s}[0], [x13], x1 b.gt 4b subs w3, w3, #4 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 80: AARCH64_VALID_JUMP_TARGET 8: st1 {v31.8b}, [x0], x1 subs w4, w4, #4 st1 {v31.8b}, [x13], x1 st1 {v31.8b}, [x0], x1 st1 {v31.8b}, [x13], x1 b.gt 8b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #8 add x13, x13, #8 mov w4, w12 b 1b 160: 320: 640: AARCH64_VALID_JUMP_TARGET 16: st1 {v31.16b}, [x0], x1 subs w4, w4, #4 st1 {v31.16b}, [x13], x1 st1 {v31.16b}, [x0], x1 st1 {v31.16b}, [x13], x1 b.gt 16b subs w3, w3, #16 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w4, w12 b 1b 9: ret endfunc jumptable ipred_z3_fill_padding_tbl .word 640b - ipred_z3_fill_padding_tbl .word 320b - ipred_z3_fill_padding_tbl .word 160b - ipred_z3_fill_padding_tbl .word 80b - ipred_z3_fill_padding_tbl .word 40b - ipred_z3_fill_padding_tbl .word 20b - 
ipred_z3_fill_padding_tbl
endjumptable

// Padding fill for widths above 16; branched to from
// ipred_z3_fill_padding_neon when w3 > 16.
//   x0  = dst
//   x1  = row step; halved on entry (the z3 fill1 large-height path doubles
//         x1 before it branches into the padding code)
//   w3  = width in bytes, w4 = number of rows
//   v31 = padding byte replicated across the vector
// Clobbers w5 and w12.
function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 16.
        lsr             x1,  x1,  #1
        mov             w12, w3                   // keep the width for the next row
        sub             x1,  x1,  w3,  uxtw       // step from end of row to next row
1:
        ands            w5,  w3,  #15
        b.eq            2f
        // If the width isn't aligned to 16, first do one 16 byte write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.16b}, [x0]
        add             x0,  x0,  w5,  uxtw
2:
        // Fill the rest of the line with aligned 16 byte writes.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc

// Z3 fill where consecutive output rows step through left[] two entries at a
// time (row y blends left[base + 2*y] with left[base + 2*y + 1]).
//
// Register usage as read by the code below (no C prototype accompanies this
// function; presumably it matches ipred_z3_fill1_8bpc_neon - TODO confirm):
//   x0 = dst, x1 = stride, x2 = left, w3 = width, w4 = height,
//   w5 = dy, w6 = max_base_y
// w3 == 8 selects the 80: path, any other value falls through to the
// w == 4 path. Two rows are written per loop iteration.
//
// Only 32 bytes of left[] are loaded (v0-v1). Every lookup starts from a copy
// of v31 (left[max_base_y] replicated) and uses tbx, so a lane whose index is
// outside the 32 byte table keeps the padding value.
function ipred_z3_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        b.eq            80f

40:     // w == 4
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2]    // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        // Pack two rows into one 8 byte vector: row 0 in the low 4 bytes,
        // row 1 in the high 4 bytes.
        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.8b,   v31.8b
        mov             v5.8b,   v31.8b
        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
        b               1b

9:
        ret

80:     // w == 8
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2]    // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        // Pack two rows into one 16 byte vector: row 0 in the low half,
        // row 1 in the high half.
        trn1            v24.2d,  v24.2d,  v24.2d  // frac
        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        // The table is exactly the two registers loaded above. Listing v2/v3
        // as well would make indices 32..63 select bytes from registers this
        // function never initialises; with a 32 byte table those lanes keep
        // the padding from v31 instead.
        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        uqadd           v26.16b, v26.16b, v21.16b // base += 4
        uqadd           v27.16b, v27.16b, v21.16b // base += 4
        b               1b

9:
        ret
endfunc


// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int filt_idx,
//                             const int max_width, const int max_height);
function ipred_filter_8bpc_neon, export=1
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5,  uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1
        lsl             x1,  x1,
#1 sxtl v20.8h, v20.8b sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b br x5 40: AARCH64_VALID_JUMP_TARGET ldur s0, [x2, #1] // top (0-3) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-3) 4: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.8b, v2.8h, #4 subs w4, w4, #2 st1 {v2.s}[0], [x0], x1 uxtl v0.8h, v2.8b st1 {v2.s}[1], [x6], x1 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #1] // top (0-7) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-7) 8: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.8b, v2.8h, #4 uxtl v1.8h, v2.8b // first block, in 16 bit mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) sqrshrun v3.8b, v3.8h, #4 subs w4, w4, #2 st2 {v2.s, v3.s}[0], [x0], x1 zip2 v0.2s, v2.2s, v3.2s st2 {v2.s, 
v3.s}[1], [x6], x1 uxtl v0.8h, v0.8b b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #1 sub x2, x2, #2 mov x7, #-2 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) uxtl v0.8h, v0.8b // left (0-1) + topleft (2) 2: ld1 {v2.16b}, [x8], #16 // top(0-15) mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) uxtl v1.8h, v2.8b // top(0-7) uxtl2 v2.8h, v2.16b // top(8-15) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.8b, v3.8h, #4 uxtl v0.8h, v3.8b // first block, in 16 bit mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.8b, v4.8h, #4 uxtl v0.8h, v4.8b // second block, in 16 bit mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.8b, v5.8h, #4 uxtl v0.8h, v5.8b // third block, in 16 bit mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v0.h[3] // 
p5(left[0]) * filter(5) mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.8b, v6.8h, #4 st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 b.le 8f ins v0.h[2], v2.h[7] ins v0.b[0], v6.b[7] ins v0.b[2], v6.b[3] b 2b 8: subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_filter_tbl .word 320b - ipred_filter_tbl .word 160b - ipred_filter_tbl .word 80b - ipred_filter_tbl .word 40b - ipred_filter_tbl endjumptable // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 ld1 {v0.8b}, [x2] clz w9, w4 movrel x6, pal_pred_tbl sub w9, w9, #25 movi v31.16b, #7 ldrsw x9, [x6, x9, lsl #2] add x2, x0, x1 add x6, x6, x9 lsl x1, x1, #1 br x6 40: AARCH64_VALID_JUMP_TARGET 4: ld1 {v1.8b}, [x3], #8 subs w5, w5, #4 ushr v3.8b, v1.8b, #4 and v2.8b, v1.8b, v31.8b zip1 v1.16b, v2.16b, v3.16b tbl v1.16b, {v0.16b}, v1.16b st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x2], x1 st1 {v1.s}[2], [x0], x1 st1 {v1.s}[3], [x2], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ld1 {v1.16b}, [x3], #16 ushr v4.16b, v1.16b, #4 and v3.16b, v1.16b, v31.16b subs w5, w5, #4 zip1 v1.16b, v3.16b, v4.16b zip2 v2.16b, v3.16b, v4.16b tbl v1.16b, {v0.16b}, v1.16b tbl v2.16b, {v0.16b}, v2.16b st1 {v1.8b}, [x0], x1 st1 {v1.d}[1], [x2], x1 st1 {v2.8b}, [x0], x1 st1 {v2.d}[1], [x2], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET 16: ld1 {v16.16b, v17.16b}, [x3], #32 and v20.16b, v16.16b, v31.16b ushr v21.16b, v16.16b, #4 and v22.16b, v17.16b, v31.16b ushr v23.16b, v17.16b, #4 subs w5, w5, #4 tbl v16.16b, {v0.16b}, v20.16b tbl v17.16b, {v0.16b}, v21.16b tbl v18.16b, {v0.16b}, v22.16b tbl v19.16b, {v0.16b}, v23.16b zip2 v21.16b, v16.16b, v17.16b st2 {v16.8b, v17.8b}, [x0], x1 zip2 v23.16b, v18.16b, v19.16b st1 {v21.16b}, [x2], x1 st2 {v18.8b, v19.8b}, [x0], x1 st1 
{v23.16b}, [x2], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ushr v21.16b, v16.16b, #4 and v20.16b, v16.16b, v31.16b ushr v23.16b, v17.16b, #4 and v22.16b, v17.16b, v31.16b ushr v25.16b, v18.16b, #4 and v24.16b, v18.16b, v31.16b ushr v27.16b, v19.16b, #4 and v26.16b, v19.16b, v31.16b subs w5, w5, #4 tbl v16.16b, {v0.16b}, v20.16b tbl v17.16b, {v0.16b}, v21.16b tbl v18.16b, {v0.16b}, v22.16b tbl v19.16b, {v0.16b}, v23.16b tbl v20.16b, {v0.16b}, v24.16b tbl v21.16b, {v0.16b}, v25.16b st2 {v16.16b, v17.16b}, [x0], x1 tbl v22.16b, {v0.16b}, v26.16b st2 {v18.16b, v19.16b}, [x2], x1 tbl v23.16b, {v0.16b}, v27.16b st2 {v20.16b, v21.16b}, [x0], x1 st2 {v22.16b, v23.16b}, [x2], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x4, x0, #32 add x6, x2, #32 64: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ushr v21.16b, v16.16b, #4 and v20.16b, v16.16b, v31.16b ushr v23.16b, v17.16b, #4 and v22.16b, v17.16b, v31.16b ushr v25.16b, v18.16b, #4 and v24.16b, v18.16b, v31.16b ushr v27.16b, v19.16b, #4 and v26.16b, v19.16b, v31.16b subs w5, w5, #2 tbl v20.16b, {v0.16b}, v20.16b tbl v21.16b, {v0.16b}, v21.16b tbl v22.16b, {v0.16b}, v22.16b tbl v23.16b, {v0.16b}, v23.16b tbl v24.16b, {v0.16b}, v24.16b tbl v25.16b, {v0.16b}, v25.16b st2 {v20.16b, v21.16b}, [x0], x1 tbl v26.16b, {v0.16b}, v26.16b st2 {v22.16b, v23.16b}, [x4], x1 tbl v27.16b, {v0.16b}, v27.16b st2 {v24.16b, v25.16b}, [x2], x1 st2 {v26.16b, v27.16b}, [x6], x1 b.gt 64b ret endfunc jumptable pal_pred_tbl .word 640b - pal_pred_tbl .word 320b - pal_pred_tbl .word 160b - pal_pred_tbl .word 80b - pal_pred_tbl .word 40b - pal_pred_tbl endjumptable // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 movrel x7, ipred_cfl_128_tbl sub w9, w9, #26 ldrsw x9, [x7, w9, uxtw #2] movi v0.8h, #128 
// dc dup v1.8h, w6 // alpha add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET 1: ld1 {v2.8h, v3.8h}, [x5], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h cmlt v4.8h, v2.8h, #0 // sign cmlt v5.8h, v3.8h, #0 add v2.8h, v2.8h, v4.8h // diff + sign add v3.8h, v3.8h, v5.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h st1 {v2.s}[0], [x0], x1 st1 {v2.s}[1], [x6], x1 subs w4, w4, #4 st1 {v3.s}[0], [x0], x1 st1 {v3.s}[1], [x6], x1 b.gt 1b ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET 1: ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x6], x1 subs w4, w4, #4 st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x6], x1 b.gt 1b ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add 
v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h subs w3, w3, #16 st1 {v2.8b, v3.8b}, [x0], #16 st1 {v4.8b, v5.8b}, [x6], #16 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret endfunc jumptable ipred_cfl_128_tbl ipred_cfl_splat_tbl: .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl endjumptable // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 movrel x7, ipred_cfl_top_tbl sub w9, w9, #26 ldrsw x9, [x7, w9, uxtw #2] dup v1.8h, w6 // alpha add x2, x2, #1 add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 br x7 4: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] b L(ipred_cfl_splat_w16) endfunc jumptable ipred_cfl_top_tbl .word 32b - ipred_cfl_top_tbl .word 16b - 
ipred_cfl_top_tbl .word 8b - ipred_cfl_top_tbl .word 4b - ipred_cfl_top_tbl endjumptable
// ^ Closing entries of a jump table that begins before this block; kept as-is.

// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha);
//
// Computes the CfL DC term as the rounded average of the `height` pixels
// stored immediately before `topleft` in memory, then branches to the
// width-specific routine looked up in ipred_cfl_splat_tbl.
//
// Registers as used below: x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height, w6 = alpha. x5 (ac) is not touched in this function.
// State handed to the routine reached through x9:
//   v0.8h = dc (broadcast), v1.8h = alpha, x6 = dst + stride, x1 = 2 * stride.
// NOTE(review): ipred_cfl_splat_tbl and its targets live outside this block;
// presumably they compute dc + alpha * ac and store the rows - confirm there.
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw              // x2 = topleft - height
        clz             w9,  w3
        clz             w8,  w4
        movrel          x10, ipred_cfl_splat_tbl
        movrel          x7,  ipred_cfl_left_tbl
        sub             w9,  w9,  #26                   // clz - 26: 32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3
        sub             w8,  w8,  #26                   // same mapping for height
        ldrsw           x9,  [x10, w9, uxtw #2]         // entries are signed 32-bit offsets from the table
        ldrsw           x8,  [x7, w8, uxtw #2]
        dup             v1.8h,   w6                     // alpha
        add             x9,  x10, x9                    // x9 = width routine, entered with "br x9" below
        add             x7,  x7,  x8                    // x7 = height handler in this function
        add             x6,  x0,  x1                    // x6 = dst + stride
        lsl             x1,  x1,  #1                    // x1 = 2 * stride
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        // ld1r replicates the 4 loaded bytes into both 32-bit lanes, so the
        // 8-byte sum is twice the 4-pixel sum; hence the shift by 3, not 2.
        ld1r            {v0.2s}, [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9
L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3            // rounded sum / 8
        dup             v0.8h,   v0.h[0]
        br              x9
L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.4h,   v0.4h,   #4            // rounded sum / 16
        dup             v0.8h,   v0.h[0]
        br              x9
L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h         // combine the two 16-pixel sums
        urshr           v2.4h,   v2.4h,   #5            // rounded sum / 32
        dup             v0.8h,   v2.h[0]
        br              x9
endfunc

// Height dispatch for ipred_cfl_left_8bpc_neon, indexed by clz(height) - 26.
jumptable ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
endjumptable

// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height,
//                          const int16_t *ac, const int alpha);
//
// Computes the CfL DC term from both edges: the `height` pixels stored before
// `topleft` and the `width` pixels stored after it (topleft itself is skipped).
//   dc = (sum + ((width + height) >> 1)) >> ctz(width + height)
// For non-square blocks the quotient still carries a factor 3 or 5, which is
// removed by multiplying with ~1/3 (0x5556 / 65536) or ~1/5 (0x3334 / 65536).
// The constants are halved because sqdmulh doubles the product before taking
// the high half.
//
// Dispatch is two-stage through ipred_cfl_tbl: x7 -> height handler (entries
// 0..3), which ends in "br x9" -> width handler (entries 4..7). Each width
// handler finishes by branching to an L(ipred_cfl_splat_w*) label that is
// defined outside this block.
// State handed on: v0.8h = dc, v1.8h = alpha, x6 = dst + stride,
// x1 = 2 * stride. x5 (ac) is not touched here.
function ipred_cfl_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw              // x2 = topleft - height
        add             w8,  w3,  w4                    // width + height
        dup             v1.8h,   w6                     // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h,  w8                     // width + height
        movrel          x7,  ipred_cfl_tbl
        rbit            w8,  w8                         // rbit(width + height)
        sub             w9,  w9,  #22                   // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                         // ctz(width + height)
        ldrsw           x9,  [x7, w9, uxtw #2]
        ldrsw           x6,  [x7, w6, uxtw #2]
        neg             w8,  w8                         // -ctz(width + height)
        add             x9,  x7,  x9                    // x9 = width handler
        add             x7,  x7,  x6                    // x7 = height handler
        ushr            v16.8h,  v16.8h,  #1            // (width + height) >> 1
        dup             v17.8h,  w8                     // -ctz(width + height)
        add             x6,  x0,  x1                    // x6 = dst + stride
        lsl             x1,  x1,  #1                    // x1 = 2 * stride
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x2], #4             // 4 pixels; x2 advances to topleft
        ins             v0.s[1], wzr                    // zero the other lane so the 8-byte sum covers 4 pixels
        add             x2,  x2,  #1                    // skip topleft: x2 = topleft + 1
        uaddlv          h0,      v0.8b
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.s}[0], [x2]
        ins             v2.s[1], wzr
        add             v0.4h,   v0.4h,   v16.4h        // add the rounding term to the first sum
        uaddlv          h2,      v2.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h        // negative count: shift right by ctz(width + height)
        b.eq            1f                              // square block: no extra factor
        // h = 8/16
        // Both multipliers are packed in w16; the shift by 2*h selects one.
        // A W-register variable shift uses the count modulo 32, so 2*h = 32
        // leaves the low half (0x3334/2), while 2*h = 16 brings down 0x5556/2.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4                    // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2], #8
        uaddlv          h0,      v0.8b
        add             x2,  x2,  #1                    // skip topleft
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq               // h == 32 -> ~1/5, otherwise ~1/3
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,      v0.16b
        add             x2,  x2,  #1                    // skip topleft
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 4/8/32
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq               // h == 4 -> ~1/5, otherwise ~1/3
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2], #32
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             x2,  x2,  #1                    // skip topleft
        add             v0.4h,   v2.4h,   v3.4h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        add             v0.4h,   v0.4h,   v16.4h
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v0.4h,   v0.4h,   v17.4h
        b.eq            1f
        // h = 8/16
        // Same packed-constant trick as the w4 path with the halves swapped:
        // 2*h = 16 selects 0x3334/2, 2*h = 32 (count taken modulo 32) keeps 0x5556/2.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4                    // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)          // as written, the w32 path uses the w16 splat entry
endfunc

// Dispatch table for ipred_cfl_8bpc_neon: entries 0..3 are indexed by
// clz(height) - 26, entries 4..7 by clz(width) - 22.
jumptable ipred_cfl_tbl
        .word L(ipred_cfl_h32) - ipred_cfl_tbl
        .word L(ipred_cfl_h16) - ipred_cfl_tbl
        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
        .word L(ipred_cfl_w32) - ipred_cfl_tbl
        .word L(ipred_cfl_w16) - ipred_cfl_tbl
        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
endjumptable

// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
//
// Fills `ac` from the 8-bit samples at `ypx` (2x2 sums, doubled), pads it
// vertically, then subtracts the rounded average of all stored values.
// Registers as used below: x0 = ac, x1 = ypx, x2 = stride, w3 = w_pad,
// w4 = h_pad, w5 = cw, w6 = ch.
// The width handler is selected by clz(cw) - 27 (16 -> 0, 8 -> 1, 4 -> 2).
// v16..v19 accumulate the running sum of everything stored to `ac`.
function ipred_cfl_ac_420_8bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2                    // h_pad * 4
        movrel          x7,  ipred_cfl_ac_420_tbl
        sub             w8,  w8,  #27
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v16.8h,  #0
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4                    // height - h_pad
        rbit            w9,  w5                         // rbit(width)
        rbit            w10, w6                         // rbit(height)
        clz             w9,  w9                         // ctz(width)
        clz             w10, w10                        // ctz(height)
        add             w9,  w9,  w10                   // log2sz
        add             x10, x1,  x2                    // x10 = second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1                    // step two input rows at a time
        neg             v31.4s,  v31.4s                 // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // Each pass reads 4 input rows of 8 bytes and emits 2 output rows of 4.
        ld1             {v0.8b},   [x1],  x2
        ld1             {v1.8b},   [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b                 // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h         // plus the row below: 2x2 sum
        shl             v0.8h,   v0.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        add             v16.8h,  v16.8h,  v0.8h
        b.gt            1b
        // Duplicate the last output row (upper half of v0) into v0 and v1
        // as the source for vertical padding.
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4                    // 4 padded rows (32 bytes) per pass
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        uaddlv          s0,      v0.8h                  // sum
        sub             x0,  x0,  w6, uxtw #3           // rewind ac by ch * 8 bytes (4 int16_t per row)
        urshl           v4.2s,   v0.2s,   v31.2s        // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from
ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v0.d}[1], [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 add v16.4h, v16.4h, v0.4h add v17.4h, v17.4h, v1.4h add v18.4h, v18.4h, v2.4h add v19.4h, v19.4h, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 add v18.8h, v18.8h, v0.8h add v19.8h, v19.8h, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w8_calc_subtract_dc): // Aggregate the sums add v0.8h, v16.8h, v17.8h add v2.8h, v18.8h, v19.8h uaddlp v0.4s, v0.8h uaddlp v2.4s, v2.8h add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] L(ipred_cfl_ac_420_w8_subtract_dc): 6: // Subtract dc from ac ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h sub v2.8h, v2.8h, v4.8h 
sub v3.8h, v3.8h, v4.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 6b ret L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_420_w16_tbl ldrsw x3, [x7, w3, uxtw #2] add x7, x7, x3 br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b ld1 {v4.16b, v5.16b}, [x1], x2 uaddlp v1.8h, v1.16b ld1 {v6.16b, v7.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b uaddlp v4.8h, v4.16b uaddlp v5.8h, v5.16b uaddlp v6.8h, v6.16b uaddlp v7.8h, v7.16b add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h add v4.8h, v4.8h, v6.8h add v5.8h, v5.8h, v7.8h shl v0.8h, v0.8h, #1 shl v1.8h, v1.8h, #1 shl v2.8h, v4.8h, #1 shl v3.8h, v5.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b ldr d5, [x1, #16] uaddlp v0.8h, v0.16b ld1 {v4.16b}, [x1], x2 uaddlp v3.4h, v3.8b ldr d7, [x10, #16] uaddlp v2.8h, v2.16b ld1 {v6.16b}, [x10], x2 uaddlp v5.4h, v5.8b uaddlp v4.8h, v4.16b uaddlp v7.4h, v7.8b uaddlp v6.8h, v6.16b add v1.4h, v1.4h, v3.4h add v0.8h, v0.8h, v2.8h add v5.4h, v5.4h, v7.4h add v4.8h, v4.8h, v6.8h shl v1.4h, v1.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v5.4h, #1 shl v2.8h, v4.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) 
L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v6.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v4.8h, v4.16b uaddlp v6.8h, v6.16b add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 uaddlp v0.4h, v0.8b ld1 {v6.8b}, [x10], x2 uaddlp v2.4h, v2.8b uaddlp v4.4h, v4.8b uaddlp v6.4h, v6.8b add v0.4h, v0.4h, v2.4h add v4.4h, v4.4h, v6.4h shl v0.4h, v0.4h, #1 shl v2.4h, v4.4h, #1 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 2b 3: // Double the height and reuse the w8 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) endfunc jumptable ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w8) - ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w4) - 
ipred_cfl_ac_420_tbl endjumptable jumptable ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl endjumptable // void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 movrel x7, ipred_cfl_ac_422_tbl sub w8, w8, #27 ldrsw x8, [x7, w8, uxtw #2] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.8b}, [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov
v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v2.8b}, [x1], x2 ld1 {v2.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_422_w16_tbl ldrsw x3, [x7, w3, uxtw #2] add x7, x7, x3 br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b uaddlp v0.8h, v0.16b uaddlp v3.4h, v3.8b uaddlp v2.8h, v2.16b shl v1.4h, v1.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v3.4h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add 
v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 uaddlp v0.4h, v0.8b uaddlp v2.4h, v2.8b shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) endfunc jumptable ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl endjumptable jumptable ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl endjumptable // void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 movrel x7, ipred_cfl_ac_444_tbl sub w8, w8, #26 ldrsw x8, [x7,
w8, uxtw #2] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.s}[0], [x1], x2 ld1 {v0.s}[1], [x10], x2 ld1 {v1.s}[0], [x1], x2 ld1 {v1.s}[1], [x10], x2 ushll v0.8h, v0.8b, #3 ushll v1.8h, v1.8b, #3 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v2.8b}, [x1], x2 ushll v0.8h, v0.8b, #3 ld1 {v3.8b}, [x10], x2 ushll v1.8h, v1.8b, #3 ushll v2.8h, v2.8b, #3 ushll v3.8h, v3.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 ushll2 v1.8h, v0.16b, #3 ushll v0.8h, v0.8b, #3 ld1 {v6.16b}, [x10], x2 ushll2 v3.8h, v2.16b, #3 ushll v2.8h, v2.8b, #3 ushll2 v5.8h, v4.16b, #3 ushll v4.8h, v4.8b, #3 ushll2 v7.8h, v6.16b, #3 ushll v6.8h, v6.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov 
v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 ld1 {v6.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v2.8h, v2.8b, #3 ushll v4.8h, v4.8b, #3 ushll v6.8h, v6.8b, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] dup v5.8h, v4.h[7] dup v7.8h, v6.h[7] subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_444_w32_tbl lsr w3, w3, #1 ldrsw x3, [x7, w3, uxtw #2] add x7, x7, x3 br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v2.16b, v3.16b}, [x1], x2 ld1 {v6.16b, v7.16b}, [x10], x2 ushll v0.8h, v2.8b, #3 ushll2 v1.8h, v2.16b, #3 ushll v2.8h, v3.8b, #3 ushll2 v3.8h, v3.16b, #3 ushll v4.8h, v6.8b, #3 ushll2 v5.8h, v6.16b, #3 ushll v6.8h, v7.8b, #3 ushll2 v7.8h, v7.16b, #3 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ldr d2, [x1, #16] ld1 {v1.16b}, [x1], x2 ldr d6, [x10, #16] ld1 {v5.16b}, [x10], x2 ushll v2.8h, v2.8b, #3 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v6.8h, v6.8b, #3 ushll 
v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v3.8h, v2.h[7] dup v7.8h, v6.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v1.16b}, [x1], x2 ld1 {v5.16b}, [x10], x2 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] dup v6.8h, v5.h[7] dup v7.8h, v5.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8b}, [x1], x2 ld1 {v4.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v4.8h, v4.8b, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] dup v5.8h, v4.h[7] dup v6.8h, v4.h[7] dup v7.8h, v4.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], 
#64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 2b 3: // Quadruple the height and reuse the w8 subtracting lsl w6, w6, #2 // Aggregate the sums, with wider intermediates earlier than in // ipred_cfl_ac_420_w8_calc_subtract_dc. uaddlp v0.4s, v16.8h uaddlp v1.4s, v17.8h uaddlp v2.4s, v18.8h uaddlp v3.4s, v19.8h add v0.4s, v0.4s, v1.4s add v2.4s, v2.4s, v3.4s add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] b L(ipred_cfl_ac_420_w8_subtract_dc) endfunc jumptable ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w8) - ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w4) - ipred_cfl_ac_444_tbl endjumptable jumptable ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl endjumptable dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/ipred16.S000066400000000000000000007215061517466257200232540ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height, // const int bitdepth_max); function ipred_dc_128_16bpc_neon, export=1 ldr w8, [sp] clz w3, w3 movrel x5, ipred_dc_128_tbl sub w3, w3, #25 ldrsw x3, [x5, w3, uxtw #2] dup v0.8h, w8 add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 urshr v0.8h, v0.8h, #1 br x5 40: AARCH64_VALID_JUMP_TARGET 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, 
[x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret endfunc jumptable ipred_dc_128_tbl .word 640b - ipred_dc_128_tbl .word 320b - ipred_dc_128_tbl .word 160b - ipred_dc_128_tbl .word 80b - ipred_dc_128_tbl .word 40b - ipred_dc_128_tbl endjumptable // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 clz w3, w3 movrel x5, ipred_v_tbl sub w3, w3, #25 ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #2 add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, 
[x2], #64 sub x1, x1, #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 b.gt 64b ret endfunc jumptable ipred_v_tbl .word 640b - ipred_v_tbl .word 320b - ipred_v_tbl .word 160b - ipred_v_tbl .word 80b - ipred_v_tbl .word 40b - ipred_v_tbl endjumptable // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_16bpc_neon, export=1 clz w3, w3 movrel x5, ipred_h_tbl sub w3, w3, #25 ldrsw x3, [x5, w3, uxtw #2] sub x2, x2, #8 add x5, x5, x3 mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET 4: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.4h}, [x0], x1 st1 {v2.4h}, [x6], x1 subs w4, w4, #4 st1 {v1.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET 16: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 
32b ret 640: AARCH64_VALID_JUMP_TARGET 64: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] stp q3, q3, [x0, #64] stp q2, q2, [x6, #64] stp q3, q3, [x0, #96] stp q2, q2, [x6, #96] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] stp q1, q1, [x0, #64] stp q0, q0, [x6, #64] stp q1, q1, [x0, #96] stp q0, q0, [x6, #96] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 64b ret endfunc jumptable ipred_h_tbl .word 640b - ipred_h_tbl .word 320b - ipred_h_tbl .word 160b - ipred_h_tbl .word 80b - ipred_h_tbl .word 40b - ipred_h_tbl endjumptable // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_16bpc_neon, export=1 clz w3, w3 movrel x5, ipred_dc_top_tbl sub w3, w3, #25 ldrsw x3, [x5, w3, uxtw #2] add x2, x2, #2 add x5, x5, x3 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.4h, v0.h[0] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h 
uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret endfunc jumptable ipred_dc_top_tbl .word 640b - ipred_dc_top_tbl .word 320b - ipred_dc_top_tbl .word 160b - ipred_dc_top_tbl .word 80b - ipred_dc_top_tbl .word 40b - ipred_dc_top_tbl endjumptable // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 clz w3, w3 clz w7, w4 movrel x5, ipred_dc_left_tbl sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 ldrsw x3, [x5, w3, uxtw #2] ldrsw x7, [x5, w7, uxtw #2] add x3, x5, x3 add x5, x5, x7 add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w4): 
AARCH64_VALID_JUMP_TARGET 1: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET 1: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h uaddlp v0.4s, v0.8h addv s0, v0.4s rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w32): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w64): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret endfunc jumptable ipred_dc_left_tbl .word L(ipred_dc_left_h64) - ipred_dc_left_tbl .word L(ipred_dc_left_h32) - ipred_dc_left_tbl .word L(ipred_dc_left_h16) - ipred_dc_left_tbl .word L(ipred_dc_left_h8) - ipred_dc_left_tbl .word L(ipred_dc_left_h4) - ipred_dc_left_tbl .word L(ipred_dc_left_w64) - ipred_dc_left_tbl .word L(ipred_dc_left_w32) - ipred_dc_left_tbl .word L(ipred_dc_left_w16) - ipred_dc_left_tbl .word L(ipred_dc_left_w8) - ipred_dc_left_tbl .word L(ipred_dc_left_w4) - ipred_dc_left_tbl endjumptable // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 add w7, w3, w4 // width + height clz w3, w3 clz w6, w4 dup v16.4s, w7 // width + height movrel x5, ipred_dc_tbl rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) ldrsw x3, [x5, w3, uxtw #2] ldrsw x6, [x5, w6, uxtw #2] neg w7, w7 // -ctz(width + height) add x3, x5, x3 add x5, x5, x6 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w7 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x3 L(ipred_dc_w4): AARCH64_VALID_JUMP_TARGET ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h cmp w4, #4 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.4h, v0.h[0] 2: st1 {v0.4h}, [x0], x1 
st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 2b ret L(ipred_dc_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x3 L(ipred_dc_w8): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h cmp w4, #8 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] 2: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h uaddlv s1, v1.8h cmp w4, #16 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32/64 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] 2: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w32): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h addp v3.8h, v3.8h, v4.8h addp v1.8h, v1.8h, v3.8h uaddlv s1, v1.8h cmp w4, #32 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 8/16/64 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, 
#17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w64): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] addp v3.8h, v3.8h, v4.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h addp v1.8h, v1.8h, v3.8h addp v20.8h, v20.8h, v22.8h addp v1.8h, v1.8h, v20.8h uaddlv s1, v1.8h cmp w4, #64 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 16/32 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret endfunc jumptable ipred_dc_tbl .word L(ipred_dc_h64) - ipred_dc_tbl .word L(ipred_dc_h32) - ipred_dc_tbl .word L(ipred_dc_h16) - ipred_dc_tbl .word L(ipred_dc_h8) - ipred_dc_tbl .word L(ipred_dc_h4) - ipred_dc_tbl .word L(ipred_dc_w64) - ipred_dc_tbl .word L(ipred_dc_w32) - ipred_dc_tbl .word L(ipred_dc_w16) - 
ipred_dc_tbl .word L(ipred_dc_w8) - ipred_dc_tbl .word L(ipred_dc_w4) - ipred_dc_tbl endjumptable // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_16bpc_neon, export=1 clz w9, w3 movrel x5, ipred_paeth_tbl sub w9, w9, #25 ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x2] add x8, x2, #2 sub x2, x2, #8 add x5, x5, x9 mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] sub v6.8h, v5.8h, v4.8h // top - topleft 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 zip1 v0.2d, v0.2d, v1.2d zip1 v2.2d, v2.2d, v3.2d add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v2.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v4.8h, v16.8h // tldiff sabd v23.8h, v4.8h, v17.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v2.8h, v17.8h umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) umin v19.8h, v21.8h, v23.8h cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v23.8h, v21.8h cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v19.8h, v17.8h bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 4b ret 80: 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.8h}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 2: sub v6.8h, v5.8h, v4.8h // top - topleft add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v1.8h add v18.8h, v6.8h, v2.8h add v19.8h, v6.8h, v3.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v5.8h, v18.8h sabd v23.8h, v5.8h, v19.8h sabd v24.8h, v4.8h, v16.8h // tldiff sabd v25.8h, v4.8h, v17.8h sabd v26.8h, v4.8h, v18.8h sabd v27.8h, v4.8h, v19.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v1.8h, v17.8h sabd v18.8h, v2.8h, v18.8h sabd v19.8h, v3.8h, v19.8h umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) umin v29.8h, v21.8h, v25.8h umin v30.8h, v22.8h, v26.8h umin v31.8h, v23.8h, v27.8h cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v25.8h, v21.8h cmge v22.8h, v26.8h, v22.8h cmge v23.8h, v27.8h, v23.8h cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v29.8h, v17.8h cmge v18.8h, v30.8h, v18.8h cmge v19.8h, v31.8h, v19.8h bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b st1 {v23.8h}, [x0], #16 st1 {v22.8h}, [x6], #16 subs w3, w3, #8 st1 {v21.8h}, [x5], #16 st1 {v20.8h}, [x10], #16 b.le 8f ld1 {v5.8h}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.8h}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_paeth_tbl .word 640b - ipred_paeth_tbl .word 320b - ipred_paeth_tbl .word 160b - ipred_paeth_tbl .word 80b - ipred_paeth_tbl .word 40b - ipred_paeth_tbl endjumptable #if 0 // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_16bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 movrel x5, ipred_smooth_tbl sub x12, x2, w4, uxtw #1 sub w9, w9, #25 ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x12] // bottom add x8, x2, #2 add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[3] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 zip1 v1.2d, v1.2d, v0.2d // left, flipped zip1 v0.2d, v3.2d, v2.2d zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b smlal v20.4s, v0.4h, v7.4h // += 
(left-right)*weights_hor smlal2 v21.4s, v0.8h, v7.8h smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v18.4h smlal2 v23.4s, v6.8h, v18.8h rshrn v20.4h, v20.4s, #9 rshrn v21.4h, v21.4s, #9 rshrn v22.4h, v22.4s, #9 rshrn v23.4h, v23.4s, #9 st1 {v20.4h}, [x0], x1 st1 {v21.4h}, [x6], x1 subs w4, w4, #4 st1 {v22.4h}, [x0], x1 st1 {v23.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[7] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor smlal2 v21.4s, v3.8h, v7.8h // (left flipped) smlal v22.4s, v2.4h, v7.4h smlal2 v23.4s, v2.8h, v7.8h smlal v24.4s, v1.4h, v7.4h smlal2 v25.4s, v1.8h, v7.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v17.4h smlal2 v23.4s, v6.8h, v17.8h smlal v24.4s, v6.4h, v18.4h smlal2 v25.4s, v6.8h, v18.8h smlal v26.4s, v6.4h, v19.4h smlal2 v27.4s, v6.8h, v19.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, 
v27.4s, #9 st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw #1 sub x1, x1, w3, uxtw #1 ld1r {v5.8h}, [x12] // right sub x2, x2, #4 mov x7, #-4 mov w9, w3 add v31.4h, v4.4h, v5.4h // bottom+right 1: ld2r {v0.8h, v1.8h}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v2.8h, v3.8h}, [x8], #32 // top ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor smlal2 v21.4s, v1.8h, v6.8h // (left flipped) smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v24.4s, v0.4h, v6.4h smlal2 v25.4s, v0.8h, v6.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v2.8h, v16.8h smlal v22.4s, v3.4h, v16.4h smlal2 v23.4s, v3.8h, v16.8h smlal v24.4s, v2.4h, v17.4h smlal2 v25.4s, v2.8h, v17.8h smlal v26.4s, v3.4h, v17.4h smlal2 v27.4s, v3.8h, v17.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, v27.4s, #9 subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw #1 sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_smooth_tbl .word 640b - ipred_smooth_tbl .word 320b - ipred_smooth_tbl .word 
160b - ipred_smooth_tbl .word 80b - ipred_smooth_tbl .word 40b - ipred_smooth_tbl endjumptable // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_16bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 movrel x5, ipred_smooth_v_tbl sub x8, x2, w4, uxtw #1 sub w9, w9, #25 ldrsw x9, [x5, w9, uxtw #2] ld1r {v4.8h}, [x8] // bottom add x2, x2, #2 add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v18.8h, v18.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v18.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v17.8h sqrdmulh v22.8h, v6.8h, v18.8h sqrdmulh v23.8h, v6.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, 
w3, uxtw #1 mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 2: ld1 {v2.8h, v3.8h}, [x2], #32 // top sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v3.8h, v16.8h sqrdmulh v22.8h, v2.8h, v17.8h sqrdmulh v23.8h, v3.8h, v17.8h sqrdmulh v24.8h, v2.8h, v18.8h sqrdmulh v25.8h, v3.8h, v18.8h sqrdmulh v26.8h, v2.8h, v19.8h sqrdmulh v27.8h, v3.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v4.8h add v26.8h, v26.8h, v4.8h add v27.8h, v27.8h, v4.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x8], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_smooth_v_tbl .word 640b - ipred_smooth_v_tbl .word 320b - ipred_smooth_v_tbl .word 160b - ipred_smooth_v_tbl .word 80b - ipred_smooth_v_tbl .word 40b - ipred_smooth_v_tbl endjumptable // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_16bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 movrel x5, ipred_smooth_h_tbl add x12, x2, w3, uxtw #1 sub w9, w9, #25 ldrsw x9, [x5, w9, uxtw #2] ld1r {v5.8h}, [x12] // right add x5, x5, x9 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left zip1 v1.2d, v1.2d, v0.2d // 
left, flipped zip1 v0.2d, v3.2d, v2.2d sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v1.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v3.8h, v3.8h, v5.8h // left-right sub v2.8h, v2.8h, v5.8h sub v1.8h, v1.8h, v5.8h sub v0.8h, v0.8h, v5.8h sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v1.8h, v7.8h sqrdmulh v23.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET sub x2, x2, #8 mov x7, #-8 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h 2: ld1 {v7.16b}, [x8], #16 // weights_hor ushll v6.8h, v7.8b, #7 // weights_hor << 7 ushll2 v7.8h, v7.16b, #7 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v2.8h, v6.8h sqrdmulh v23.8h, v2.8h, v7.8h sqrdmulh v24.8h, v1.8h, v6.8h sqrdmulh v25.8h, v1.8h, v7.8h sqrdmulh v26.8h, v0.8h, v6.8h sqrdmulh v27.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h add v24.8h, v24.8h, v5.8h 
add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v5.8h add v27.8h, v27.8h, v5.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x10], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret endfunc jumptable ipred_smooth_h_tbl .word 640b - ipred_smooth_h_tbl .word 320b - ipred_smooth_h_tbl .word 160b - ipred_smooth_h_tbl .word 80b - ipred_smooth_h_tbl .word 40b - ipred_smooth_h_tbl endjumptable #endif const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 padding_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, // const pixel *const in, const int end, // const int bitdepth_max); function ipred_z1_upsample_edge_16bpc_neon, export=1 dup v30.8h, w4 // bitdepth_max movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w3, uxtw #1 // in[end] sub x4, x4, w3, uxtw #1 ld1r {v2.8h}, [x5] // padding ld1 {v3.8h, v4.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v3.16b // padded in[] bit v1.16b, v2.16b, v4.16b ext v4.16b, v0.16b, v1.16b, #2 ext v5.16b, v1.16b, v2.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 ext v7.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] add v19.8h, 
v5.8h, v7.8h add v20.8h, v0.8h, v16.8h add v21.8h, v1.8h, v17.8h umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v23.4s, v18.8h, v31.8h umull v24.4s, v19.4h, v31.4h umull2 v25.4s, v19.8h, v31.8h usubw v22.4s, v22.4s, v20.4h usubw2 v23.4s, v23.4s, v20.8h usubw v24.4s, v24.4s, v21.4h usubw2 v25.4s, v25.4s, v21.8h sqrshrun v16.4h, v22.4s, #4 sqrshrun2 v16.8h, v23.4s, #4 sqrshrun v17.4h, v24.4s, #4 sqrshrun2 v17.8h, v25.4s, #4 smin v16.8h, v16.8h, v30.8h smin v17.8h, v17.8h, v30.8h zip1 v0.8h, v4.8h, v16.8h zip2 v1.8h, v4.8h, v16.8h zip1 v2.8h, v5.8h, v17.8h zip2 v3.8h, v5.8h, v17.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] ret endfunc // void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, // const int bitdepth_max); function ipred_z2_upsample_edge_16bpc_neon, export=1 dup v30.8h, w3 // bitdepth_max // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w1, uxtw #1 // in[sz] sub x4, x4, w1, uxtw #1 ld1r {v3.8h}, [x2] // in[0] for padding ld1r {v2.8h}, [x5] // padding ld1 {v4.8h, v5.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v4.16b // padded in[] bit v1.16b, v2.16b, v5.16b ext v4.16b, v3.16b, v0.16b, #14 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v19.4s, v16.8h, v31.8h usubw v18.4s, v18.4s, v17.4h usubw2 v19.4s, v19.4s, v17.8h sqrshrun v16.4h, v18.4s, #4 sqrshrun2 v16.8h, v19.4s, #4 add x5, x0, #2*16 smin v16.8h, v16.8h, v30.8h zip1 v4.8h, v0.8h, v16.8h zip2 v5.8h, v0.8h, v16.8h st1 {v2.h}[0], [x5] // In case sz=8, output one single pixel in out[16]. 
st1 {v4.8h, v5.8h}, [x0] ret endfunc const edge_filter .short 0, 4, 8, 0 .short 0, 5, 6, 0 // Leaving out the coeffs for strength=3 // .byte 2, 4, 4, 0 endconst // void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, const int end, // const int strength); function ipred_z1_filter_edge_16bpc_neon, export=1 cmp w4, #3 b.eq L(fivetap) // if (strength == 3) goto fivetap movrel x5, edge_filter, -6 add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) ld1 {v31.s}[0], [x5] // kernel[1-2] ld1 {v0.8h}, [x2], #16 dup v30.8h, v31.h[0] dup v31.8h, v31.h[1] 1: // in[end], is the last valid pixel. We produce 16 pixels out by // using 18 pixels in - the last pixel used is [17] of the ones // read/buffered. cmp w3, #17 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 sub w3, w3, #16 st1 {v16.8h, v17.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask sub w6, w3, #24 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h}, [x5] // padding_mask ld1r {v2.8h}, [x6] bit v0.16b, v2.16b, v3.16b // Pad v0-v1 bit v1.16b, v2.16b, v4.16b // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 st1 {v16.8h, v17.8h}, [x0], #32 b.le 9f 5: // After one block, any remaining output would only be filtering // 
padding - thus just store the padding. subs w1, w1, #16 st1 {v2.16b}, [x0], #16 b.gt 5b 9: ret L(fivetap): sub x2, x2, #2 // topleft -= 1 pixel movi v29.8h, #2 ld1 {v0.8h}, [x2], #16 movi v30.8h, #4 movi v31.8h, #4 ins v0.h[0], v0.h[1] 1: // in[end+1], is the last valid pixel. We produce 16 pixels out by // using 20 pixels in - the last pixel used is [19] of the ones // read/buffered. cmp w3, #18 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f // if (end + 1 < 19) ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, v29.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v20.8h, v20.8h, #4 urshr v21.8h, v21.8h, #4 sub w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask, -2 sub w6, w3, #23 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask ld1r {v28.8h}, [x6] bit v0.16b, v28.16b, v3.16b // Pad v0-v2 bit v1.16b, v28.16b, v4.16b bit v2.16b, v28.16b, v5.16b 4: // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, 
v29.8h
        subs w1, w1, #16
        mov v0.16b, v2.16b
        mov v1.16b, v28.16b
        mov v2.16b, v28.16b
        urshr v20.8h, v20.8h, #4
        urshr v21.8h, v21.8h, #4
        sub w3, w3, #16
        st1 {v20.8h, v21.8h}, [x0], #32
        b.le 9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp w3, #0
        b.ge 4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs w1, w1, #8
        st1 {v28.8h}, [x0], #16
        b.gt 5b
9:
        ret
endfunc

// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
//                                 const int n);
//
// Fills out[] with the 16-bit value px. Each iteration stores 8 pixels
// (16 bytes) and the loop runs while the remaining count is > 0, so the
// number of pixels written is n rounded up to a multiple of 8.
function ipred_pixel_set_16bpc_neon, export=1
        dup v0.8h, w1 // px replicated to all 8 lanes
1:
        subs w2, w2, #8
        st1 {v0.8h}, [x0], #16
        b.gt 1b
        ret
endfunc

// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
//
// Two rows are produced per iteration. For each row, base = xpos >> 6 and
// frac = xpos & 0x3e, and every output pixel is
//   (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6.
// Only the base of the first row of each pair is compared with max_base_x;
// once it is >= max_base_x, that row and every remaining row are filled with
// top[max_base_x] (the 49:/89:/169: loops never return to interpolation).
// The width-specific code path is selected through ipred_z1_fill1_tbl.
function ipred_z1_fill1_16bpc_neon, export=1
        clz w9, w3
        movrel x8, ipred_z1_fill1_tbl
        sub w9, w9, #25 // clz(width)-25: 0 for w=64 ... 4 for w=4
        ldrsw x9, [x8, w9, uxtw #2]
        add x10, x2, w6, uxtw #1 // top[max_base_x]
        add x8, x8, x9
        ld1r {v31.8h}, [x10] // padding
        mov w7, w5 // xpos = dx
        mov w15, #64
        br x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr w8, w7, #6 // base
        and w9, w7, #0x3e // frac
        add w7, w7, w5 // xpos += dx
        cmp w8, w6 // base >= max_base_x
        lsr w10, w7, #6 // base
        and w11, w7, #0x3e // frac
        b.ge 49f
        lsl w8, w8, #1 // pixel index -> byte offset
        lsl w10, w10, #1
        ldr q0, [x2, w8, uxtw] // top[base]
        ldr q2, [x2, w10, uxtw]
        dup v4.4h, w9 // frac
        dup v5.4h, w11
        ext v1.16b, v0.16b, v0.16b, #2 // top[base+1]
        ext v3.16b, v2.16b, v2.16b, #2
        sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
        sub v7.4h, v3.4h, v2.4h
        ushll v16.4s, v0.4h, #6 // top[base]*64
        ushll v17.4s, v2.4h, #6
        smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
        smlal v17.4s, v7.4h, v5.4h
        rshrn v16.4h, v16.4s, #6
        rshrn v17.4h, v17.4s, #6
        st1 {v16.4h}, [x0], x1
        add w7, w7, w5 // xpos += dx
        subs w4, w4, #2
        st1 {v17.4h}, [x0], x1
        b.gt 4b
        ret

49:
        // Padding only, for all remaining rows.
        st1 {v31.4h}, [x0], x1
        subs w4, w4, #2
        st1 {v31.4h}, [x0], x1
        b.gt 49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr w8, w7, #6 // base
        and w9, w7, #0x3e // frac
        add w7, w7, w5 // xpos += dx
        cmp w8, w6 // base >= max_base_x
        lsr w10, w7, #6 // base
        and w11, w7, #0x3e // frac
        b.ge 89f
        add x8, x2, w8, uxtw #1
        add x10, x2, w10, uxtw #1
        dup v4.8h, w9 // frac
        dup v5.8h, w11
        ld1 {v0.8h}, [x8] // top[base]
        ld1 {v2.8h}, [x10]
        sub w9, w15, w9 // 64 - frac
        sub w11, w15, w11
        ldr h1, [x8, #16] // ninth pixel, needed for top[base+1]
        ldr h3, [x10, #16]
        dup v6.8h, w9 // 64 - frac
        dup v7.8h, w11
        ext v1.16b, v0.16b, v1.16b, #2 // top[base+1]
        ext v3.16b, v2.16b, v3.16b, #2
        umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
        umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac
        umull2 v17.4s, v0.8h, v6.8h
        umlal2 v17.4s, v1.8h, v4.8h
        umull v18.4s, v2.4h, v7.4h
        umlal v18.4s, v3.4h, v5.4h
        umull2 v19.4s, v2.8h, v7.8h
        umlal2 v19.4s, v3.8h, v5.8h
        rshrn v16.4h, v16.4s, #6
        rshrn2 v16.8h, v17.4s, #6
        rshrn v17.4h, v18.4s, #6
        rshrn2 v17.8h, v19.4s, #6
        st1 {v16.8h}, [x0], x1
        add w7, w7, w5 // xpos += dx
        subs w4, w4, #2
        st1 {v17.8h}, [x0], x1
        b.gt 8b
        ret

89:
        // Padding only, for all remaining rows.
        st1 {v31.8h}, [x0], x1
        subs w4, w4, #2
        st1 {v31.8h}, [x0], x1
        b.gt 89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov w12, w3 // orig w
        add x13, x0, x1 // second row of each pair
        lsl x1, x1, #1 // stride *= 2
        sub x1, x1, w3, uxtw #1 // minus the bytes written per row
1:
        lsr w8, w7, #6 // base
        and w9, w7, #0x3e // frac
        add w7, w7, w5 // xpos += dx
        cmp w8, w6 // base >= max_base_x
        lsr w10, w7, #6 // base
        and w11, w7, #0x3e // frac
        b.ge 169f
        add x8, x2, w8, uxtw #1
        add x10, x2, w10, uxtw #1
        dup v6.8h, w9 // frac
        dup v7.8h, w11
        ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base]
        ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub w9, w15, w9 // 64 - frac
        sub w11, w15, w11
        dup v16.8h, w9 // 64 - frac
        dup v17.8h, w11
        add w7, w7, w5 // xpos += dx
2:
        // 16 pixels of each of the two rows per iteration.
        ext v18.16b, v0.16b, v1.16b, #2 // top[base+1]
        ext v19.16b, v1.16b, v2.16b, #2
        ext v20.16b, v3.16b, v4.16b, #2
        ext v21.16b, v4.16b, v5.16b, #2
        subs w3, w3, #16
        umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac)
        umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac
        umull2 v23.4s, v0.8h, v16.8h
        umlal2 v23.4s, v18.8h, v6.8h
        umull v24.4s, v1.4h, v16.4h
        umlal v24.4s, v19.4h, v6.4h
        umull2 v25.4s, v1.8h, v16.8h
        umlal2 v25.4s, v19.8h, v6.8h
        umull v26.4s, v3.4h, v17.4h
        umlal v26.4s, v20.4h, v7.4h
        umull2 v27.4s, v3.8h, v17.8h
        umlal2 v27.4s, v20.8h, v7.8h
        umull v28.4s, v4.4h, v17.4h
        umlal v28.4s, v21.4h, v7.4h
        umull2 v29.4s, v4.8h, v17.8h
        umlal2 v29.4s, v21.8h, v7.8h
        rshrn v22.4h, v22.4s, #6
        rshrn2 v22.8h, v23.4s, #6
        rshrn v23.4h, v24.4s, #6
        rshrn2 v23.8h, v25.4s, #6
        rshrn v24.4h, v26.4s, #6
        rshrn2 v24.8h, v27.4s, #6
        rshrn v25.4h, v28.4s, #6
        rshrn2 v25.8h, v29.4s, #6
        st1 {v22.8h, v23.8h}, [x0], #32
        st1 {v24.8h, v25.8h}, [x13], #32
        b.le 3f
        mov v0.16b, v2.16b // carry the last loaded vector over
        ld1 {v1.8h, v2.8h}, [x8], #32 // top[base]
        mov v3.16b, v5.16b
        ld1 {v4.8h, v5.8h}, [x10], #32
        b 2b

3:
        subs w4, w4, #2
        b.le 9f
        add x0, x0, x1
        add x13, x13, x1
        mov w3, w12 // reset w
        b 1b
9:
        ret

169:
        // Padding only, for all remaining rows, 8 pixels per store.
        st1 {v31.8h}, [x0], #16
        subs w3, w3, #8
        st1 {v31.8h}, [x13], #16
        b.gt 169b
        subs w4, w4, #2
        b.le 9b
        add x0, x0, x1
        add x13, x13, x1
        mov w3, w12 // reset w
        b 169b
endfunc

// Offsets of the per-width entry points, ordered 64, 32, 16, 8, 4 to match
// the clz(width) - 25 index computed in ipred_z1_fill1_16bpc_neon.
jumptable ipred_z1_fill1_tbl
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b - ipred_z1_fill1_tbl
        .word 40b - ipred_z1_fill1_tbl
endjumptable

// ipred_z1_fill2_16bpc_neon: variant of ipred_z1_fill1_16bpc_neon that reads
// top[] with a step of two elements per output pixel: uzp1 picks the even
// elements (used as top[base]) and uzp2 the odd ones (used as top[base+1]).
// Registers as used below: x0 = dst, x1 = stride, x2 = top, w3 = width
// (8 selects the 8-wide path, anything else the 4-wide path), w4 = height,
// w5 = dx, w6 = max_base_x.
// NOTE(review): presumably the upsampled-edge case with the same C prototype
// as ipred_z1_fill1_16bpc_neon - confirm against the caller.
//
// The padding pixel top[max_base_x] is addressed in 16-bit units and
// replicated as a halfword, consistently with ipred_z1_fill1_16bpc_neon and
// with the .4h/.8h stores in the 49:/89: padding loops below.
function ipred_z1_fill2_16bpc_neon, export=1
        cmp w3, #8
        add x10, x2, w6, uxtw #1 // top[max_base_x]
        ld1r {v31.8h}, [x10] // padding
        mov w7, w5 // xpos = dx
        mov w15, #64
        b.eq 8f

4:      // w == 4
        lsr w8, w7, #6 // base
        and w9, w7, #0x3e // frac
        add w7, w7, w5 // xpos += dx
        cmp w8, w6 // base >= max_base_x
        lsr w10, w7, #6 // base
        and w11, w7, #0x3e // frac
        b.ge 49f
        lsl w8, w8, #1 // pixel index -> byte offset
        lsl w10, w10, #1
        ldr q0, [x2, w8, uxtw] // top[base]
        ldr q2, [x2, w10, uxtw]
        dup v4.4h, w9 // frac
        dup v5.4h, w11
        uzp2 v1.8h, v0.8h, v0.8h // top[base+1]
        uzp1 v0.8h, v0.8h, v0.8h // top[base]
        uzp2 v3.8h, v2.8h, v2.8h
        uzp1 v2.8h, v2.8h, v2.8h
        sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base]
        sub v7.4h, v3.4h, v2.4h
        ushll v16.4s, v0.4h, #6 // top[base]*64
        ushll v17.4s, v2.4h, #6
        smlal v16.4s, v6.4h, v4.4h // + (top[base+1]-top[base])*frac
        smlal v17.4s, v7.4h, v5.4h
        rshrn v16.4h, v16.4s, #6
        rshrn v17.4h, v17.4s, #6
        st1 {v16.4h}, [x0], x1
        add w7, w7, w5 // xpos += dx
        subs w4, w4, #2
        st1 {v17.4h}, [x0], x1
        b.gt 4b
        ret

49:
        // Padding only, for all remaining rows.
        st1 {v31.4h}, [x0], x1
        subs w4, w4, #2
        st1 {v31.4h}, [x0], x1
        b.gt 49b
        ret

8:      // w == 8
        lsr w8, w7, #6 // base
        and w9, w7, #0x3e // frac
        add w7, w7, w5 // xpos += dx
        cmp w8, w6 // base >= max_base_x
        lsr w10, w7, #6 // base
        and w11, w7, #0x3e // frac
        b.ge 89f
        add x8, x2, w8, uxtw #1
        add x10, x2, w10, uxtw #1
        dup v4.8h, w9 // frac
        dup v5.8h, w11
        ld1 {v0.8h, v1.8h}, [x8] // top[base]
        ld1 {v2.8h, v3.8h}, [x10]
        sub w9, w15, w9 // 64 - frac
        sub w11, w15, w11
        dup v6.8h, w9 // 64 - frac
        dup v7.8h, w11
        uzp2 v20.8h, v0.8h, v1.8h // top[base+1]
        uzp1 v0.8h, v0.8h, v1.8h // top[base]
        uzp2 v21.8h, v2.8h, v3.8h
        uzp1 v2.8h, v2.8h, v3.8h
        umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac)
        umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac
        umull2 v17.4s, v0.8h, v6.8h
        umlal2 v17.4s, v20.8h, v4.8h
        umull v18.4s, v2.4h, v7.4h
        umlal v18.4s, v21.4h, v5.4h
        umull2 v19.4s, v2.8h, v7.8h
        umlal2 v19.4s, v21.8h, v5.8h
        rshrn v16.4h, v16.4s, #6
        rshrn2 v16.8h, v17.4s, #6
        rshrn v17.4h, v18.4s, #6
        rshrn2 v17.8h, v19.4s, #6
        st1 {v16.8h}, [x0], x1
        add w7, w7, w5 // xpos += dx
        subs w4, w4, #2
        st1 {v17.8h}, [x0], x1
        b.gt 8b
        ret

89:
        // Padding only, for all remaining rows.
        st1 {v31.8h}, [x0], x1
        subs w4, w4, #2
        st1 {v31.8h}, [x0], x1
        b.gt 89b
        ret
endfunc

// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
//                               const int n);
//
// Reads 8 pixels at a time from the 16 bytes just below src, walking
// downwards, and writes them to dst in reversed order, walking upwards.
// The loop runs while the remaining count is > 0, so n is effectively
// rounded up to a multiple of 8.
function ipred_reverse_16bpc_neon, export=1
        sub x1, x1, #16 // first load covers src[-8..-1]
        add x3, x0, #8 // second half of each 16 byte output group
        mov x4, #16
1:
        ld1 {v0.8h}, [x1]
        subs w2, w2, #8
        rev64 v0.8h, v0.8h // reverse within each 64 bit half
        sub x1, x1, #16
        st1 {v0.d}[1], [x0], x4 // ... and store the halves swapped
        st1 {v0.d}[0], [x3], x4
        b.gt 1b
        ret
endfunc

// Lane indices 0..7 as 16 bit values; loaded into v31 by the ipred_z2_fill*
// functions.
const increments
        .short 0, 1, 2, 3, 4, 5, 6, 7
endconst

// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const pixel *const left,
//                                const int width, const int height,
//                                const int dx, const int dy);
function ipred_z2_fill1_16bpc_neon, export=1
        clz w10, w4
        movrel x9, ipred_z2_fill1_tbl
        sub
w10, w10, #25 ldrsw x10, [x9, w10, uxtw #2] mov w8, #(1 << 6) // xpos = 1 << 6 add x9, x9, x10 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy br x9 40: AARCH64_VALID_JUMP_TARGET dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // Worst case height for w=4 is 16, but we need at least h+1 elements ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) tbl v18.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v29.16b, #4 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos // Cut corners here; only doing tbl over v0-v1 here; we only // seem to need the last pixel, from v2, after skipping to the // left-only codepath below. 
tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 4b 49: tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 49b 9: ret 80: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // Worst case height for w=8 is 32. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements movi v17.16b, #2 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... // Cut corners here; for the first row we don't expect to need to // read outside of v0. tbx v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] mov v19.16b, v15.16b // left[0] ld1 {v6.8h, v7.8h}, [x11] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] mov v20.16b, v15.16b // left[0] sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, 
v22.8h, v31.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v21.8h, v21.8h, #0 cmge v22.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 8b 89: mov v19.16b, v15.16b mov v20.16b, v15.16b tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v19.4h, v28.4h umlal v6.4s, v20.4h, v27.4h umull2 v7.4s, v19.8h, v28.8h umlal2 v7.4s, v20.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v25.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] add x13, x0, x1 // alternating row lsl x1, x1, #1 // stride *= 2 sub x1, x1, w4, uxtw #1 // stride -= width movi v11.8h, #8 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy add v26.8h, v26.8h, v25.8h // -= dy mul v25.8h, v25.8h, v11.8h // -8*dy // Worst case height is 64, but we can only fit 32 pixels into // v0-v3 usable within one tbx instruction. As long as base_y is // up to 32, we use tbx. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] mov w12, w4 // orig w neg w14, w4 // -w 1: mov v23.16b, v26.16b // reset ypos asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, w14 // base_x <= -2*w asr w11, w8, #6 // base_x b.le 169f dup v17.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 ld1 {v4.8h}, [x9], #16 // top[base_x] ld1 {v6.8h}, [x11], #16 movi v10.8h, #0x3e movi v11.8h, #64 and v16.16b, v16.16b, v10.16b // frac_x and v17.16b, v17.16b, v10.16b sub v8.8h, v11.8h, v16.8h // 64 - frac_x sub v9.8h, v11.8h, v17.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h 2: smov w10, v22.h[0] shrn v29.8b, v23.8h, #6 // ypos >> 6 movi v12.8h, #64 cmp w10, #0 // base_x (bottom left) >= 0 smov w10, v29.b[0] // base_y[0] movi v10.8h, #0x3e b.ge 4f and v27.16b, v23.16b, v10.16b // frac_y cmp w10, #(32-3) mov v18.16b, v15.16b // left[0] sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 22f 21: // base_y < 32, using tbx shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... 
movi v13.16b, #2 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) mov v19.16b, v15.16b // left[0] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] b 23f 22: // base_y >= 32, using separate loads. smov w15, v29.b[1] smov w16, v29.b[2] add x10, x3, w10, sxtw #1 smov w17, v29.b[3] add x15, x3, w15, sxtw #1 ld3 {v18.h, v19.h, v20.h}[0], [x10] smov w10, v29.b[4] add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[1], [x15] smov w15, v29.b[5] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[2], [x16] smov w16, v29.b[6] add x10, x3, w10, sxtw #1 ld3 {v18.h, v19.h, v20.h}[3], [x17] smov w17, v29.b[7] add x15, x3, w15, sxtw #1 add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[5], [x15] ld3 {v18.h, v19.h, v20.h}[6], [x16] ld3 {v18.h, v19.h, v20.h}[7], [x17] 23: ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h cmge v18.8h, v21.8h, #0 cmge v19.8h, v22.8h, #0 rshrn 
v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 bit v10.16b, v12.16b, v18.16b bit v11.16b, v13.16b, v19.16b st1 {v10.8h}, [x0], #16 subs w4, w4, #8 st1 {v11.8h}, [x13], #16 b.le 3f movi v10.8h, #8 mov v4.16b, v5.16b mov v6.16b, v7.16b add v21.8h, v21.8h, v10.8h // base_x += 8 add v22.8h, v22.8h, v10.8h b 2b 3: subs w5, w5, #2 b.le 9f movi v10.8h, #128 add x0, x0, x1 add x13, x13, x1 mov w4, w12 // reset w add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) b 1b 4: // The rest of the row only predicted from top[] ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 st1 {v12.8h}, [x0], #16 subs w4, w4, #8 st1 {v13.8h}, [x13], #16 b.le 3b mov v4.16b, v5.16b mov v6.16b, v7.16b b 4b 169: // The rest of the block only predicted from left[] add x1, x1, w4, uxtw #1 // restore stride mov w12, w5 // orig remaining h 1: movi v12.8h, #64 movi v10.8h, #0x3e shrn v29.8b, v23.8h, #6 // ypos >> 6 and v27.16b, v23.16b, v10.16b // frac_y smov w10, v29.b[0] // base_y[0] shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v23.8h, v23.8h, v25.8h // ypos -= 8*dy add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... cmp w10, #(32-1) mov v18.16b, v15.16b // left[0] movi v21.16b, #2 sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 31f tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) 2: // base_y < 32, using tbx. 
smov w10, v29.b[0] // base_y[0] mov v19.16b, v15.16b // left[0] cmp w10, #(64-4) b.gt 32f tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] add v29.16b, v29.16b, v21.16b // next base_y umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 subs w5, w5, #2 st1 {v11.8h}, [x13], x1 b.le 4f mov v18.16b, v20.16b b 2b 31: // base_y >= 32, using separate loads, loading v18 if we had to bail // in the prologue. smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #2 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld1 {v18.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld1 {v18.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld1 {v18.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld1 {v18.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld1 {v18.h}[4], [x10] add x17, x3, w17, sxtw ld1 {v18.h}[5], [x15] add v29.16b, v29.16b, v21.16b // next base_y ld1 {v18.h}[6], [x16] ld1 {v18.h}[7], [x17] 32: // base_y >= 32, using separate loads. cmp w5, #4 b.lt 34f 33: // h >= 4, preserving v18 from the previous round, loading v19-v22. 
smov w10, v29.b[0] subs w5, w5, #4 smov w15, v29.b[2] movi v10.16b, #8 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] add v29.16b, v29.16b, v10.16b // next base_y ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h umull v14.4s, v21.4h, v28.4h umlal v14.4s, v22.4h, v27.4h umull2 v18.4s, v21.8h, v28.8h umlal2 v18.4s, v22.8h, v27.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 st1 {v10.8h}, [x0], x1 cmp w5, #2 st1 {v11.8h}, [x13], x1 st1 {v12.8h}, [x0], x1 st1 {v13.8h}, [x13], x1 b.lt 4f mov v18.16b, v22.16b b.gt 33b 34: // h == 2, preserving v18 from the previous round, loading v19-v20. 
smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #4 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld2 {v19.h, v20.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld2 {v19.h, v20.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[5], [x15] ld2 {v19.h, v20.h}[6], [x16] add v29.16b, v29.16b, v21.16b // next base_y ld2 {v19.h, v20.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 st1 {v11.8h}, [x13], x1 // The h==2 case only happens once at the end, if at all. 
4:
        // One 8 pixel wide column of the left-only region is done; step to
        // the next column, or finish once the remaining width is used up.
        subs w4, w4, #8
        b.le 9f

        lsr x1, x1, #1 // x1 was restored to 2*stride at 169:
        msub x0, x1, x12, x0 // ptr -= h * stride
        msub x13, x1, x12, x13
        lsl x1, x1, #1
        add x0, x0, #16
        add x13, x13, #16
        mov w5, w12 // reset h
        b 1b
9:
        // Restore the callee-saved d8-d15 pushed on entry to this path.
        ldp d14, d15, [sp, #0x30]
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x40
        ret
endfunc

// Offsets of the per-width entry points of ipred_z2_fill1_16bpc_neon,
// ordered 64, 32, 16, 8, 4 to match its clz(width) - 25 index.
jumptable ipred_z2_fill1_tbl
        .word 640b - ipred_z2_fill1_tbl
        .word 320b - ipred_z2_fill1_tbl
        .word 160b - ipred_z2_fill1_tbl
        .word 80b - ipred_z2_fill1_tbl
        .word 40b - ipred_z2_fill1_tbl
endjumptable

// ipred_z2_fill2_16bpc_neon: z2 fill for the upsample_top case (w <= 8 and
// h <= 8 according to the comments in the body).
// Registers as used below: x0 = dst, x1 = stride, x2 = top, x3 = left,
// w4 = width (8 selects the 80: path, anything else the 40: path),
// w5 = height (row counter, two rows per iteration), w6 = dx, w7 = dy.
// NOTE(review): presumably shares the C prototype of
// ipred_z2_fill1_16bpc_neon - confirm against the caller.
// Compared with ipred_z2_fill1_16bpc_neon, xpos starts at 2 << 6, the
// per-lane base_x increments are doubled, and top[base_x] / top[base_x+1]
// are the even / odd elements of the loaded top data (uzp1 / uzp2).
function ipred_z2_fill2_16bpc_neon, export=1
        cmp w4, #8
        mov w8, #(2 << 6) // xpos = 2 << 6
        sub w8, w8, w6 // xpos -= dx

        movrel x11, increments
        ld1 {v31.8h}, [x11] // increments
        neg w7, w7 // -dy
        b.eq 80f

40:
        dup v30.4h, w7 // -dy
        movi v17.8b, #1

        mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
        movi v25.8h, #0x3e
        add v30.4h, v16.4h, v30.4h // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1 {v0.8h, v1.8h}, [x3] // left[]

        movi v26.8h, #64
        movi v19.16b, #4

        shrn v29.8b, v30.8h, #6 // ypos >> 6
        and v27.8b, v30.8b, v25.8b // frac_y

        add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        movi v23.4h, #1, lsl #8
        shl v29.8b, v29.8b, #1 // 2*base_y
        zip1 v29.8b, v29.8b, v29.8b // duplicate elements
        movi v17.8b, #2
        add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...

        add v30.8b, v29.8b, v17.8b // base_y + 1 (*2)
        add v28.8b, v29.8b, v19.8b // base_y + 2 (*2)

        tbl v18.8b, {v0.16b}, v29.8b // left[base_y]

        trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2

        sub v28.4h, v26.4h, v27.4h // 64 - frac_y

        trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3}

        trn1 v27.2d, v27.2d, v27.2d // frac_y
        trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y

        movi v29.16b, #4
        add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6}
4:
        // Two rows per iteration: row 0 in the low 64 bits of each vector,
        // row 1 in the high 64 bits.
        asr w9, w8, #6 // base_x
        dup v16.4h, w8 // xpos
        sub w8, w8, w6 // xpos -= dx
        cmp w9, #-8 // base_x <= -8
        asr w11, w8, #6 // base_x
        b.le 49f

        lsl w9, w9, #1 // pixel index -> byte offset
        lsl w11, w11, #1

        dup v17.4h, w8 // xpos

        ldr q4, [x2, w9, sxtw] // top[base_x]
        ldr q6, [x2, w11, sxtw]

        trn1 v16.2d, v16.2d, v17.2d // xpos

        tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        sshr v20.8h, v16.8h, #6 // first base_x for each row

        uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1]
        uzp1 v4.8h, v4.8h, v6.8h // top[base_x]

        and v16.16b, v16.16b, v25.16b // frac_x

        trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]

        sub v17.8h, v26.8h, v16.8h // 64 - frac_x

        add v20.8h, v20.8h, v31.8h // actual base_x

        umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2 v22.4s, v18.8h, v28.8h
        umlal2 v22.4s, v19.8h, v27.8h

        umull v23.4s, v4.4h, v17.4h // top[base_x]*(64-frac_x)
        umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
        umull2 v24.4s, v4.8h, v17.8h
        umlal2 v24.4s, v5.8h, v16.8h

        cmge v20.8h, v20.8h, #0 // mask: lanes with base_x >= 0

        rshrn v21.4h, v21.4s, #6
        rshrn2 v21.8h, v22.4s, #6
        rshrn v22.4h, v23.4s, #6
        rshrn2 v22.8h, v24.4s, #6

        bit v21.16b, v22.16b, v20.16b // take the top[] result where base_x >= 0

        st1 {v21.d}[0], [x0], x1
        sub w8, w8, w6 // xpos -= dx
        subs w5, w5, #2
        st1 {v21.d}[1], [x0], x1
        b.le 9f

        ext v18.16b, v19.16b, v19.16b, #8 // left[base_y+2] becomes the next left[base_y]
        add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b 4b

49:
        // The rest of the block is predicted from left[] only.
        tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1]

        umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2 v21.4s, v18.8h, v28.8h
        umlal2 v21.4s, v19.8h, v27.8h

        rshrn v20.4h, v20.4s, #6
        rshrn2 v20.8h, v21.4s, #6

        st1 {v20.d}[0], [x0], x1
        subs w5, w5, #2
        st1 {v20.d}[1], [x0], x1
        b.le 9f

        ext v18.16b, v19.16b, v19.16b, #8
        add v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b 49b

9:
        ret

80:
        // 8 wide path; v8-v14 are used below, so d8-d15 are saved first.
        stp d8, d9, [sp, #-0x40]!
        stp d10, d11, [sp, #0x10]
        stp d12, d13, [sp, #0x20]
        stp d14, d15, [sp, #0x30]

        dup v18.8h, w7 // -dy
        movi v17.8b, #1

        mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
        movi v25.8h, #0x3e
        add v16.8h, v16.8h, v18.8h // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1 {v0.8h, v1.8h}, [x3] // left[]

        movi v26.8h, #64
        movi v19.16b, #4

        shrn v29.8b, v16.8h, #6 // ypos >> 6
        and v27.16b, v16.16b, v25.16b // frac_y

        add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1

        movi v23.8h, #1, lsl #8
        shl v29.8b, v29.8b, #1 // 2*base_y
        zip1 v29.16b, v29.16b, v29.16b // duplicate elements
        movi v17.16b, #2
        add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        tbl v18.16b, {v0.16b}, v29.16b // left[base_y]

        add v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
        add v29.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub v28.8h, v26.8h, v27.8h // 64 - frac_y

        movi v24.16b, #4
        // Byte-wise add of the 16 bit lanes {0..7}; every value fits in the
        // low byte, so the halfword lanes become {0,2,4,6,8,10,12,14}.
        add v31.16b, v31.16b, v31.16b
8:
        // Two rows per iteration: v10 (row 0) and v11 (row 1).
        asr w9, w8, #6 // base_x
        dup v16.8h, w8 // xpos
        sub w8, w8, w6 // xpos -= dx
        cmp w9, #-16 // base_x <= -16
        asr w11, w8, #6 // base_x
        b.le 89f

        dup v17.8h, w8 // xpos

        add x9, x2, w9, sxtw #1
        add x11, x2, w11, sxtw #1

        ld1 {v4.8h, v5.8h}, [x9] // top[base_x]
        ld1 {v6.8h, v7.8h}, [x11]

        tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]

        sshr v21.8h, v16.8h, #6 // first base_x
        sshr v22.8h, v17.8h, #6

        tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1]
        uzp1 v4.8h, v4.8h, v5.8h // top[base_x]
        uzp2 v3.8h, v6.8h, v7.8h
        uzp1 v6.8h, v6.8h, v7.8h
        mov v5.16b, v2.16b
        mov v7.16b, v3.16b

        and v16.16b, v16.16b, v25.16b // frac_x
        and v17.16b, v17.16b, v25.16b

        umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y

        sub v8.8h, v26.8h, v16.8h // 64 - frac_x
        sub v9.8h, v26.8h, v17.8h

        umull2 v11.4s, v18.8h, v28.8h
        umlal2 v11.4s, v19.8h, v27.8h

        add v21.8h, v21.8h, v31.8h // actual base_x
        add v22.8h, v22.8h, v31.8h

        umull v12.4s, v19.4h, v28.4h
        umlal v12.4s, v20.4h, v27.4h
        umull2 v13.4s, v19.8h, v28.8h
        umlal2 v13.4s, v20.8h, v27.8h

        rshrn v10.4h, v10.4s, #6
        rshrn2 v10.8h, v11.4s, #6
        rshrn v11.4h, v12.4s, #6
        rshrn2 v11.8h, v13.4s, #6

        umull v12.4s, v4.4h, v8.4h // top[base_x]*(64-frac_x)
        umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x
        umull2 v13.4s, v4.8h, v8.8h
        umlal2 v13.4s, v5.8h, v16.8h
        umull v14.4s, v6.4h, v9.4h
        umlal v14.4s, v7.4h, v17.4h
        umull2 v18.4s, v6.8h, v9.8h
        umlal2 v18.4s, v7.8h, v17.8h

        cmge v21.8h, v21.8h, #0 // mask: lanes with base_x >= 0
        cmge v22.8h, v22.8h, #0

        rshrn v12.4h, v12.4s, #6
        rshrn2 v12.8h, v13.4s, #6
        rshrn v13.4h, v14.4s, #6
        rshrn2 v13.8h, v18.4s, #6

        bit v10.16b, v12.16b, v21.16b // take the top[] result where base_x >= 0
        bit v11.16b, v13.16b, v22.16b

        st1 {v10.8h}, [x0], x1
        subs w5, w5, #2
        sub w8, w8, w6 // xpos -= dx
        st1 {v11.8h}, [x0], x1
        b.le 9f

        mov v18.16b, v20.16b // left[base_y+2] becomes the next left[base_y]
        add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b 8b

89:
        // The rest of the block is predicted from left[] only.
        tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
        tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y)
        umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y
        umull2 v5.4s, v18.8h, v28.8h
        umlal2 v5.4s, v19.8h, v27.8h
        umull v6.4s, v19.4h, v28.4h
        umlal v6.4s, v20.4h, v27.4h
        umull2 v7.4s, v19.8h, v28.8h
        umlal2 v7.4s, v20.8h, v27.8h

        rshrn v4.4h, v4.4s, #6
        rshrn2 v4.8h, v5.4s, #6
        rshrn v5.4h, v6.4s, #6
        rshrn2 v5.8h, v7.4s, #6

        st1 {v4.8h}, [x0], x1
        subs w5, w5, #2
        st1 {v5.8h}, [x0], x1
        b.le 9f

        mov v18.16b, v20.16b
        add v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b 89b

9:
        // Restore the callee-saved d8-d15 pushed at 80:.
        ldp d14, d15, [sp, #0x30]
        ldp d12, d13, [sp, #0x20]
        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], 0x40
        ret
endfunc

function ipred_z2_fill3_16bpc_neon, export=1
        cmp w4, #8
        mov w8, #(1 << 6) // xpos = 1 << 6
        sub w8, w8, w6 // xpos -= dx

        movrel x11, increments
        ld1 {v31.8h}, [x11] // increments
        neg w7, w7 // -dy
        b.eq 80f

40:
        dup v30.4h, w7 // -dy
        movi v17.8b, #1

        mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy
        movi v25.8h, #0x3e
        add v30.4h, v16.4h, v30.4h // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[]

        movi v26.8h, #64
        movi v19.16b, #2

        shrn v29.8b, v30.8h, #6 // ypos >> 6
        and v27.8b, v30.8b, v25.8b // frac_y

        add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2

        movi v23.4h, #1, lsl #8
        shl v29.8b, v29.8b, #1 // 2*base_y
        movi v19.16b, #4
        zip1 v29.8b, v29.8b, v29.8b // duplicate elements
        movi v17.8b, #2
        add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ...
add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 (*2) trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2 trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v24.16b, #8 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 movi v24.16b, #8 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 4b 49: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, 
v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 49b 9: ret 80: stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy movi v17.16b, #2 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2 movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... 
add v30.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] ld1 {v6.8h, v7.8h}, [x11] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b sshr v22.8h, v16.8h, #6 // first base_x tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] sshr v23.8h, v17.8h, #6 tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v22.8h, v22.8h, v31.8h // actual base_x add v23.8h, v23.8h, v31.8h umull v12.4s, v20.4h, v28.4h umlal v12.4s, v21.4h, v27.4h umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v22.8h, v22.8h, #0 cmge v23.8h, v23.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v22.16b bit v11.16b, v13.16b, v23.16b st1 {v10.8h}, [x0], x1 subs w5, w5, 
#2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v20.4h, v28.4h umlal v6.4s, v21.4h, v27.4h umull2 v7.4s, v20.8h, v28.8h umlal2 v7.4s, v21.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret endfunc // void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_16bpc_neon, export=1 clz w9, w4 movrel x8, ipred_z3_fill1_tbl sub w9, w9, #25 ldrsw x9, [x8, w9, uxtw #2] add x10, x2, w6, uxtw #1 // left[max_base_y] add x8, x8, x9 ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // left[base] ldr q2, [x2, w10, uxtw] dup v4.8h, w9 // frac dup v5.8h, w11 ext v1.16b, v0.16b, v0.16b, #2 // 
left[base+1] ext v3.16b, v2.16b, v2.16b, #2 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 add w7, w7, w5 // xpos += dx st1 {v18.s}[2], [x0] st1 {v18.s}[3], [x13] b.le 9f sub x0, x0, x1 // ptr -= 4 * (2*stride) sub x13, x13, x1 add x0, x0, #4 add x13, x13, #4 b 4b 9: ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h}, [x8] // left[base] ld1 {v2.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 ldr h1, [x8, #16] ldr h3, [x10, #16] dup v6.8h, w9 // 64 - frac dup v7.8h, w11 ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] ext v3.16b, v2.16b, v3.16b, #2 umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v1.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v3.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v3.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h zip2 v19.8h, v16.8h, v17.8h add w7, w7, w5 // xpos += dx st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 b.le 9f sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) sub x13, x13, x1, lsl #2 add x0, x0, #4 add x13, x13, #4 b 8b 9: ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov 
w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v6.8h, w9 // frac dup v7.8h, w11 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v16.8h, w9 // 64 - frac dup v17.8h, w11 add w7, w7, w5 // ypos += dy 2: ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] ext v19.16b, v1.16b, v2.16b, #2 ext v20.16b, v3.16b, v4.16b, #2 ext v21.16b, v4.16b, v5.16b, #2 subs w4, w4, #16 umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac umull2 v23.4s, v0.8h, v16.8h umlal2 v23.4s, v18.8h, v6.8h umull v24.4s, v1.4h, v16.4h umlal v24.4s, v19.4h, v6.4h umull2 v25.4s, v1.8h, v16.8h umlal2 v25.4s, v19.8h, v6.8h umull v26.4s, v3.4h, v17.4h umlal v26.4s, v20.4h, v7.4h umull2 v27.4s, v3.8h, v17.8h umlal2 v27.4s, v20.8h, v7.8h umull v28.4s, v4.4h, v17.4h umlal v28.4s, v21.4h, v7.4h umull2 v29.4s, v4.8h, v17.8h umlal2 v29.4s, v21.8h, v7.8h rshrn v22.4h, v22.4s, #6 rshrn2 v22.8h, v23.4s, #6 rshrn v23.4h, v24.4s, #6 rshrn2 v23.8h, v25.4s, #6 rshrn v24.4h, v26.4s, #6 rshrn2 v24.8h, v27.4s, #6 rshrn v25.4h, v28.4s, #6 rshrn2 v25.8h, v29.4s, #6 zip1 v18.8h, v22.8h, v24.8h zip2 v19.8h, v22.8h, v24.8h zip1 v20.8h, v23.8h, v25.8h zip2 v21.8h, v23.8h, v25.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x13], x1 st1 {v20.s}[2], [x0], x1 st1 {v20.s}[3], [x13], x1 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x13], x1 st1 {v21.s}[2], [x0], x1 st1 {v21.s}[3], [x13], x1 b.le 3f mov v0.16b, v2.16b ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] mov v3.16b, v5.16b ld1 {v4.8h, 
v5.8h}, [x10], #32                                // operand tail of the ld1 that starts on the previous line
        b               2b
3:
        // One pass over the full height is done; w3 is decremented by 2 per pass.
        subs            w3,  w3,  #2
        b.le            9f
        lsr             x1,  x1,  #1              // temporarily halve the row step ...
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1              // ... and restore it
        add             x0,  x0,  #4              // move both row pointers 4 bytes to the right
        add             x13, x13, #4
        mov             w4,  w12                  // reload the height counter saved in w12
        b               1b
9:
        ret
endfunc

// Entry offsets for ipred_z3_fill1_16bpc_neon, stored relative to the table
// base; the function indexes the table with clz(w4) - 25.
jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl
        .word 320b - ipred_z3_fill1_tbl
        .word 160b - ipred_z3_fill1_tbl
        .word 80b - ipred_z3_fill1_tbl
        .word 40b - ipred_z3_fill1_tbl
endjumptable

// Fills the remaining w3 x w4 rectangle with the value held in v31.
// Register state expected on entry, as read by the code below:
//   x0, x13  two row pointers that are stored to alternately
//   x1       step added to x0/x13 after each store
//   w3       remaining width, w4 height
//   v31      padding value (set up by the branching caller)
// Widths above 8 are handed over to ipred_z3_fill_padding_wide.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #8
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide
        // w3 = remaining width, w4 = constant height
        mov             w12, w4
1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        // Table index is clz(w3) - 25; with w3 <= 8 on this path that selects
        // 80b for w3 == 8, 40b for 4..7 and 20b for 2..3.
        // NOTE(review): w3 == 1 would index past the six table entries -
        // presumably w3 is always even here; confirm against the callers.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x9,  x8,  x9
        br              x9

20:
        AARCH64_VALID_JUMP_TARGET
2:
        // 4 bytes per row, four rows per iteration.
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2              // flags consumed by the b.le below; lsr/msub leave them alone
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

40:
        AARCH64_VALID_JUMP_TARGET
4:
        // 8 bytes per row, four rows per iteration.
        st1             {v31.4h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
        st1             {v31.4h}, [x0],  x1
        st1             {v31.4h}, [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

80:
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
8:
        // 16 bytes per row, four rows per iteration.
        st1             {v31.8h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
        st1             {v31.8h}, [x0],  x1
        st1             {v31.8h}, [x13], x1
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret
endfunc

// Entry offsets for ipred_z3_fill_padding_neon, relative to the table base.
jumptable ipred_z3_fill_padding_tbl
        .word 640b - ipred_z3_fill_padding_tbl
        .word 320b - ipred_z3_fill_padding_tbl
        .word 160b - ipred_z3_fill_padding_tbl
        .word
80b - ipred_z3_fill_padding_tbl                   // operand of the .word that ends the previous line
        .word 40b - ipred_z3_fill_padding_tbl
        .word 20b - ipred_z3_fill_padding_tbl
endjumptable

function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 8.
        // Reached by branch from ipred_z3_fill_padding_neon; only x0 is
        // written through here. x1 arrives as the doubled row step, w3 is
        // the width, w4 the height and v31 the padding value.
        lsr             x1,  x1,  #1              // back to a single row step
        mov             w12, w3                   // keep the width for the following rows
        sub             x1,  x1,  w3,  uxtw #1    // row step minus the 2*w3 bytes written per row
1:
        ands            w5,  w3,  #7
        b.eq            2f
        // If the width isn't aligned to 8, first do one 8 pixel write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.8h}, [x0]
        add             x0,  x0,  w5,  uxtw #1
2:
        // Fill the rest of the line with aligned 8 pixel writes.
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1              // on to the start of the next row
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc

// Variant of ipred_z3_fill1_16bpc_neon that reads every second edge pixel
// (the uzp1/uzp2 below de-interleave even and odd elements), handling the
// h == 4 and h == 8 cases only.
// NOTE(review): presumably takes the same arguments as
// ipred_z3_fill1_16bpc_neon (dst, stride, left, width, height, dy,
// max_base_y) - the prototype is not spelled out in this file section.
// Register use as read by the code:
//   x0/x13  current and next output row, x1 doubled row step
//   x2      edge pixels, w3 remaining width (2 columns per pass)
//   w4      height (compared against 8), w5 position increment
//   w6      limit for base; at or beyond it the rest is padding
//   w7      running position, v31 padding value
function ipred_z3_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        // Pixels are 16 bit wide: the element index has to be scaled by 2
        // and the padding replicated as a halfword, exactly as in
        // ipred_z3_fill1_16bpc_neon. ipred_z3_fill_padding_neon stores v31
        // as .s/.4h/.8h lanes and therefore needs a full pixel per lane.
        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5
        mov             w15, #64
        add             x13, x0,  x1
        lsl             x1,  x1,  #1
        b.eq            8f

4:      // h == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // pos += delta
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1              // element index -> byte offset
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // edge[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // edge[base+1], odd elements
        uzp1            v0.8h,   v0.8h,   v0.8h   // edge[base], even elements
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // edge[base+1]-edge[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // edge[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (edge[base+1]-edge[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2              // flags consumed by the b.le below
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // pos += delta
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // undo the single post-increment (2*stride)
        sub             x13, x13, x1
        add             x0,  x0,  #4
        add             x13, x13, #4
        b               4b
9:
        ret

8:      // h == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // pos += delta
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h}, [x8]      // edge[base]
        ld1             {v2.8h, v3.8h}, [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // edge[base+1], odd elements
        uzp1            v0.8h,   v0.8h,   v1.8h   // edge[base], even elements
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // edge[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + edge[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2              // flags consumed by the b.le below
        zip1            v18.8h,  v16.8h,  v17.8h
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // pos += delta
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4
        add             x13, x13, #4
        b               8b
9:
        ret
endfunc

// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int filt_idx,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
// The macro body is instantiated once per bit depth; w8 must already hold
// bitdepth_max when the generated function is entered.
.macro filter_fn bpc
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // 64 bytes per filter set
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter\bpc\()_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h,  v16.8b           // widen the signed byte taps to 16 bit
        sxtl            v17.8h,  v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // second output row
        lsl             x1,  x1,  #1
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8               // upper clamp used by the smin instructions
.if \bpc == 10
        movi            v30.8h,  #0               // lower clamp used by the smax instructions
.endif
        br              x5
40: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #2] // top (0-3) sub x2, x2, #4 mov x7, #-4 4: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 .endif smin v2.8h, v2.8h, v31.8h subs w4, w4, #2 st1 {v2.d}[0], [x0], x1 ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] st1 {v2.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur q0, [x2, #2] // top (0-7) sub x2, x2, #4 mov x7, #-4 8: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // 
p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h smin v2.8h, v2.8h, v31.8h mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 smin v2.8h, v2.8h, v31.8h smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) 
smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) sqrshrun v3.4h, v4.4s, #4 sqrshrun2 v3.8h, v5.4s, #4 .endif smin v3.8h, v3.8h, v31.8h subs w4, w4, #2 st2 {v2.d, v3.d}[0], [x0], x1 zip2 v0.2d, v2.2d, v3.2d st2 {v2.d, v3.d}[1], [x6], x1 b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #2 sub x2, x2, #4 mov x7, #-4 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) 2: ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) .if \bpc == 10 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h smin v3.8h, v3.8h, v31.8h mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) srshr v4.8h, v4.8h, #4 smax v4.8h, v4.8h, v30.8h smin v4.8h, v4.8h, v31.8h mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, 
v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) srshr v5.8h, v5.8h, #4 smax v5.8h, v5.8h, v30.8h smin v5.8h, v5.8h, v31.8h mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 srshr v6.8h, v6.8h, #4 smax v6.8h, v6.8h, v30.8h .else smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.4h, v3.4s, #4 sqrshrun2 v3.8h, v4.4s, #4 smin v3.8h, v3.8h, v31.8h smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) smlal2 v6.4s, v16.8h, 
v1.h[3] // p0(topleft) * filter(0) smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.4h, v5.4s, #4 sqrshrun2 v4.8h, v6.4s, #4 smin v4.8h, v4.8h, v31.8h smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.4h, v24.4s, #4 sqrshrun2 v5.8h, v25.4s, #4 smin v5.8h, v5.8h, v31.8h smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.4h, v26.4s, #4 sqrshrun2 v6.8h, v27.4s, #4 .endif smin v6.8h, 
v6.8h, v31.8h                                     // operand tail of the smin that starts on the previous line
        // Carry the neighbours over to the next 16 pixel wide step:
        // v0.h[2] is read as topleft, v0.h[1] as left[0], v0.h[0] as left[1].
        ins             v0.h[2], v2.h[7]          // last top pixel of this step
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]          // last pixel of the second output row
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]          // last pixel of the first output row
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9,  uxtw #1    // next top row = start of the row just written via x6
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // reload the width counter
        b               1b
9:
        ret
endfunc

// Entry offsets relative to the table base, indexed with clz(width) - 26.
jumptable ipred_filter\bpc\()_tbl
        .word 320b - ipred_filter\bpc\()_tbl
        .word 160b - ipred_filter\bpc\()_tbl
        .word 80b - ipred_filter\bpc\()_tbl
        .word 40b - ipred_filter\bpc\()_tbl
endjumptable
.endm

// Instantiate ipred_filter_10bpc_neon and ipred_filter_12bpc_neon.
filter_fn 10
filter_fn 12

// Exported entry point: loads bitdepth_max, the ninth argument and hence
// the first word on the stack, into w8 and tail-branches to the 10 bit
// variant when it is <= 0x3ff, otherwise to the 12 bit variant.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc

// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const pal, const uint8_t *idx,
//                          const int w, const int h);
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]            // 8 palette entries, used as the tbl source
        clz             w9,  w4
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25             // table index = clz(w) - 25
        movi            v29.16b, #7               // mask for the low index of each byte
        ldrsw           x9,  [x6, w9, uxtw #2]
        movi            v31.8h,  #1, lsl #8       // 0x0100 in every halfword
        add             x6,  x6,  x9
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1              // x2 is reused as the second row pointer
        lsl             x1,  x1,  #1
4:
        ld1             {v1.8b}, [x3], #8
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4      // index held in the high nibble
        and             v2.8b,   v1.8b,   v29.8b  // index held in the low bits
        zip1            v1.16b,  v2.16b,  v3.16b  // low, high, low, high, ...
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
add v1.16b, v1.16b, v1.16b zip1 v0.16b, v1.16b, v1.16b zip2 v1.16b, v1.16b, v1.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b st1 {v0.d}[0], [x0], x1 tbl v1.16b, {v30.16b}, v1.16b st1 {v0.d}[1], [x2], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x2], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 8: ld1 {v2.16b}, [x3], #16 subs w5, w5, #4 ushr v4.16b, v2.16b, #4 and v3.16b, v2.16b, v29.16b zip1 v2.16b, v3.16b, v4.16b zip2 v3.16b, v3.16b, v4.16b add v2.16b, v2.16b, v2.16b add v3.16b, v3.16b, v3.16b zip1 v0.16b, v2.16b, v2.16b zip2 v1.16b, v2.16b, v2.16b zip1 v2.16b, v3.16b, v3.16b zip2 v3.16b, v3.16b, v3.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b tbl v1.16b, {v30.16b}, v1.16b st1 {v0.8h}, [x0], x1 tbl v2.16b, {v30.16b}, v2.16b st1 {v1.8h}, [x2], x1 tbl v3.16b, {v30.16b}, v3.16b st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x2], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 16: ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #4 ushr v7.16b, v4.16b, #4 and v6.16b, v4.16b, v29.16b ushr v3.16b, v5.16b, #4 and v2.16b, v5.16b, v29.16b zip1 v4.16b, v6.16b, v7.16b zip2 v5.16b, v6.16b, v7.16b zip1 v6.16b, v2.16b, v3.16b zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, 
{v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b st1 {v2.8h, v3.8h}, [x2], x1 tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h}, [x0], x1 st1 {v6.8h, v7.8h}, [x2], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 32: ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #2 ushr v7.16b, v4.16b, #4 and v6.16b, v4.16b, v29.16b ushr v3.16b, v5.16b, #4 and v2.16b, v5.16b, v29.16b zip1 v4.16b, v6.16b, v7.16b zip2 v5.16b, v6.16b, v7.16b zip1 v6.16b, v2.16b, v3.16b zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x2, x0, #64 64: ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #1 ushr v7.16b, v4.16b, #4 and v6.16b, v4.16b, v29.16b ushr v3.16b, v5.16b, #4 and v2.16b, v5.16b, v29.16b zip1 v4.16b, v6.16b, v7.16b zip2 v5.16b, v6.16b, v7.16b zip1 v6.16b, v2.16b, v3.16b zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b 
zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 64b ret endfunc jumptable pal_pred_tbl .word 640b - pal_pred_tbl .word 320b - pal_pred_tbl .word 160b - pal_pred_tbl .word 80b - pal_pred_tbl .word 40b - pal_pred_tbl endjumptable // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_128_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 movrel x7, ipred_cfl_128_tbl sub w9, w9, #26 ldrsw x9, [x7, w9, uxtw #2] urshr v0.8h, v31.8h, #1 dup v1.8h, w6 // alpha add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET 1: ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #4 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h 
smin v3.8h, v3.8h, v31.8h st1 {v2.d}[0], [x0], x1 st1 {v2.d}[1], [x6], x1 st1 {v3.d}[0], [x0], x1 st1 {v3.d}[1], [x6], x1 b.gt 1b ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET 1: ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #2 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x6], x1 b.gt 1b ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 subs w3, w3, #16 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha smull2 v17.4s, v2.8h, v1.8h smull v18.4s, v3.4h, v1.4h smull2 v19.4s, v3.8h, v1.8h smull v2.4s, v4.4h, v1.4h smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v20.4s, v16.4s, #0 // sign cmlt v21.4s, v17.4s, #0 cmlt v22.4s, v18.4s, #0 cmlt v23.4s, v19.4s, #0 cmlt v24.4s, v2.4s, #0 cmlt v25.4s, v3.4s, #0 cmlt v26.4s, v4.4s, #0 cmlt v27.4s, v5.4s, #0 add v16.4s, v16.4s, v20.4s // diff + sign add v17.4s, v17.4s, v21.4s add v18.4s, v18.4s, v22.4s add v19.4s, v19.4s, v23.4s add v2.4s, v2.4s, v24.4s add v3.4s, v3.4s, v25.4s add v4.4s, v4.4s, v26.4s add v5.4s, v5.4s, v27.4s rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 rshrn v6.4h, v2.4s, #6 rshrn2 v6.8h, v3.4s, #6 rshrn v7.4h, v4.4s, #6 
rshrn2 v7.8h, v5.4s, #6 add v2.8h, v16.8h, v0.8h // dc + apply_sign() add v3.8h, v17.8h, v0.8h add v4.8h, v6.8h, v0.8h add v5.8h, v7.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smax v4.8h, v4.8h, v30.8h smax v5.8h, v5.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h smin v4.8h, v4.8h, v31.8h smin v5.8h, v5.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], #32 st1 {v4.8h, v5.8h}, [x6], #32 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret endfunc jumptable ipred_cfl_128_tbl ipred_cfl_splat_tbl: .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl endjumptable // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_top_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 movrel x7, ipred_cfl_top_tbl sub w9, w9, #26 ldrsw x9, [x7, w9, uxtw #2] dup v1.8h, w6 // alpha add x2, x2, #2 add x7, x7, x9 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) endfunc jumptable ipred_cfl_top_tbl .word 32b - ipred_cfl_top_tbl .word 16b - 
ipred_cfl_top_tbl .word 8b - ipred_cfl_top_tbl .word 4b - ipred_cfl_top_tbl endjumptable // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_left_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 clz w9, w3 clz w8, w4 movrel x10, ipred_cfl_splat_tbl movrel x7, ipred_cfl_left_tbl sub w9, w9, #26 sub w8, w8, #26 ldrsw x9, [x10, w9, uxtw #2] ldrsw x8, [x7, w8, uxtw #2] dup v1.8h, w6 // alpha add x9, x10, x9 add x7, x7, x8 add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] br x9 endfunc jumptable ipred_cfl_left_tbl .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl .word L(ipred_cfl_left_h8) - ipred_cfl_left_tbl .word L(ipred_cfl_left_h4) - ipred_cfl_left_tbl endjumptable // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 add w8, w3, w4 // width + height dup v1.8h, w6 // alpha clz w9, w3 clz w6, w4 dup v16.4s, w8 // width + height movrel x7, ipred_cfl_tbl 
rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrsw x9, [x7, w9, uxtw #2] ldrsw x6, [x7, w6, uxtw #2] neg w8, w8 // -ctz(width + height) add x9, x7, x9 add x7, x7, x6 ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w8 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x9 L(ipred_cfl_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h cmp w4, #4 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x9 L(ipred_cfl_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h cmp w4, #8 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h uaddlv s2, v2.8h cmp w4, #16 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, 
v5.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v2.8h, v2.8h, v4.8h cmp w4, #32 uaddlv s2, v2.8h add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) endfunc jumptable ipred_cfl_tbl .word L(ipred_cfl_h32) - ipred_cfl_tbl .word L(ipred_cfl_h16) - ipred_cfl_tbl .word L(ipred_cfl_h8) - ipred_cfl_tbl .word L(ipred_cfl_h4) - ipred_cfl_tbl .word L(ipred_cfl_w32) - ipred_cfl_tbl .word L(ipred_cfl_w16) - ipred_cfl_tbl .word L(ipred_cfl_w8) - ipred_cfl_tbl .word L(ipred_cfl_w4) - ipred_cfl_tbl endjumptable // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 movrel x7, ipred_cfl_ac_420_tbl sub w8, w8, #27 ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, 
v0.2d, v0.2d L(ipred_cfl_ac_420_w4_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums add v24.4s, v24.4s, v25.4s add v26.4s, v26.4s, v27.4s add v0.4s, v24.4s, v26.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #3 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v1.8h, v4.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw v25.4s, v25.4s, v1.4h uaddw v26.4s, v26.4s, v2.4h uaddw v27.4s, v27.4s, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 
{v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_420_w16_tbl ldrsw x3, [x7, w3, uxtw #2] add x7, x7, x3 br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 add v0.8h, v0.8h, v4.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 add v2.8h, v2.8h, v6.8h addp v16.8h, v16.8h, v17.8h addp v18.8h, v18.8h, v19.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h add v16.8h, v16.8h, v20.8h add v18.8h, v18.8h, v22.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 shl v2.8h, v16.8h, #1 shl v3.8h, v18.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q5, [x10, #32] ld1 {v3.8h, v4.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v5.8h, v5.8h, v5.8h addp v3.8h, v3.8h, v4.8h ldr q18, [x1, #32] add v2.4h, v2.4h, v5.4h ld1 {v16.8h, v17.8h}, [x1], x2 add v0.8h, v0.8h, v3.8h ldr q21, 
[x10, #32] ld1 {v19.8h, v20.8h}, [x10], x2 addp v18.8h, v18.8h, v18.8h addp v16.8h, v16.8h, v17.8h addp v21.8h, v21.8h, v21.8h addp v19.8h, v19.8h, v20.8h add v18.4h, v18.4h, v21.4h add v16.8h, v16.8h, v19.8h shl v1.4h, v2.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v18.4h, #1 shl v2.8h, v16.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 ld1 {v4.8h}, [x1], x2 ld1 {v6.8h}, [x10], x2 addp v0.8h, v0.8h, v4.8h addp v2.8h, v2.8h, v6.8h add v0.8h, v0.8h, v2.8h shl v0.8h, v0.8h, #1 dup v1.8h, v0.h[3] dup v3.8h, v0.h[7] trn2 v2.2d, v0.2d, v3.2d trn1 v0.2d, v0.2d, v1.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw 
v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl w6, w6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc jumptable ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w8) - ipred_cfl_ac_420_tbl .word L(ipred_cfl_ac_420_w4) - ipred_cfl_ac_420_tbl endjumptable jumptable ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl endjumptable // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 movrel x7, ipred_cfl_ac_422_tbl sub w8, w8, #27 ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) 
rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw 
v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_422_w16_tbl ldrsw x3, [x7, w3, uxtw #2] add x7, x7, x3 br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q6, [x10, #32] ld1 {v4.8h, v5.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v6.8h, v6.8h, v6.8h addp v4.8h, v4.8h, v5.8h shl v1.4h, v2.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v6.4h, #2 shl v2.8h, v4.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b 
L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 addp v0.8h, v0.8h, v0.8h addp v2.8h, v2.8h, v2.8h shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) endfunc jumptable ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl endjumptable jumptable ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl endjumptable // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, 
// const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 movrel x7, ipred_cfl_ac_444_tbl sub w8, w8, #26 ldrsw x8, [x7, w8, uxtw #2] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 add x7, x7, x8 sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.4h}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.4h}, [x1], x2 ld1 {v1.d}[1], [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 shl v0.8h, v0.8h, #3 ld1 {v3.8h}, [x10], x2 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h 
uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v2.8h, v2.8h, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET movrel x7, ipred_cfl_ac_444_w32_tbl lsr w3, w3, #1 ldrsw x3, [x7, w3, uxtw #2] lsr x2, x2, #1 // Restore the stride to one line increments add x7, x7, x3 br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 shl v2.8h, v2.8h, #3 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 dup v3.8h, v2.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 
v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v0.8h, v1.8h}, [x1], x2 shl v1.8h, v1.8h, #3 shl v0.8h, v0.8h, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8h}, [x1], x2 shl v0.8h, v0.8h, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl w6, w6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc jumptable ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w32) - 
ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w8) - ipred_cfl_ac_444_tbl .word L(ipred_cfl_ac_444_w4) - ipred_cfl_ac_444_tbl endjumptable jumptable ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl endjumptable dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/itx.S000066400000000000000000003554251517466257200226110ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); // Most of the functions use the following register layout: // x0-x3 external parameters // x4 function pointer to first transform // x5 function pointer to second transform // x6 output parameter for helper function // x7 input parameter for helper function // x8 input stride for helper function // x9-x12 scratch variables for helper functions // x13 pointer to list of eob thresholds // x14 return pointer for helper function // x15 return pointer for main function // The SIMD registers most often use the following layout: // v0-v1 multiplication coefficients // v2-v7 scratch registers // v8-v15 unused // v16-v31 inputs/outputs of transforms // Potential further optimizations, that are left unimplemented for now: // - Trying to keep multiplication coefficients in registers across multiple // transform functions. (The register layout is designed to potentially // allow this.) // - Use a simplified version of the transforms themselves for cases where // we know a significant number of inputs are zero. E.g. 
if the eob value // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. const idct_coeffs, align=4 // idct4 .short 2896, 2896*8, 1567, 3784 // idct8 .short 799, 4017, 3406, 2276 // idct16 .short 401, 4076, 3166, 2598 .short 1931, 3612, 3920, 1189 // idct32 .short 201, 4091, 3035, 2751 .short 1751, 3703, 3857, 1380 .short 995, 3973, 3513, 2106 .short 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .short 101*8, 4095*8, 2967*8, -2824*8 .short 1660*8, 3745*8, 3822*8, -1474*8 .short 4076, 401, 4017, 799 .short 0, 0, 0, 0 .short 4036*8, -700*8, 2359*8, 3349*8 .short 3461*8, -2191*8, 897*8, 3996*8 .short -3166, -2598, -799, -4017 .short 0, 0, 0, 0 .short 501*8, 4065*8, 3229*8, -2520*8 .short 2019*8, 3564*8, 3948*8, -1092*8 .short 3612, 1931, 2276, 3406 .short 0, 0, 0, 0 .short 4085*8, -301*8, 2675*8, 3102*8 .short 3659*8, -1842*8, 1285*8, 3889*8 .short -3920, -1189, -3406, -2276 .short 0, 0, 0, 0 endconst const iadst4_coeffs, align=4 // .h[4-5] can be interpreted as .s[2] .short 1321, 3803, 2482, 3344, 3344, 0 endconst const iadst8_coeffs, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 // idct_coeffs .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst const iadst16_coeffs, align=4 .short 4091, 201, 3973, 995 .short 3703, 1751, 3290, 2440 .short 2751, 3035, 2106, 3513 .short 1380, 3857, 601, 4052 endconst .macro smull_smlal d0, d1, s0, s1, c0, c1, sz smull \d0\().4s, \s0\().4h, \c0 smlal \d0\().4s, \s1\().4h, \c1 .ifc \sz, .8h smull2 \d1\().4s, \s0\().8h, \c0 smlal2 \d1\().4s, \s1\().8h, \c1 .endif .endm .macro smull_smlsl d0, d1, s0, s1, c0, c1, sz smull \d0\().4s, \s0\().4h, \c0 smlsl \d0\().4s, \s1\().4h, \c1 .ifc \sz, .8h smull2 \d1\().4s, \s0\().8h, \c0 smlsl2 \d1\().4s, \s1\().8h, \c1 .endif .endm .macro sqrshrn_sz d0, s0, s1, shift, sz sqrshrn \d0\().4h, \s0\().4s, \shift .ifc \sz, .8h sqrshrn2 \d0\().8h, 
\s1\().4s, \shift .endif .endm .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 sqrdmulh \r0\sz, \r0\sz, \c sqrdmulh \r1\sz, \r1\sz, \c sqrdmulh \r2\sz, \r2\sz, \c sqrdmulh \r3\sz, \r3\sz, \c .ifnb \r4 sqrdmulh \r4\sz, \r4\sz, \c sqrdmulh \r5\sz, \r5\sz, \c sqrdmulh \r6\sz, \r6\sz, \c sqrdmulh \r7\sz, \r7\sz, \c .endif .endm .macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 .endif .ifnb \shift srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc uaddw \adddst, \adddst, \addsrc .endif .ifnb \narrowsrc sqxtun \narrowdst, \narrowsrc .endif .ifnb \store st1 {\store}, [\dst], x1 .endif .endm .macro load_add_store_8x16 dst, src mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src load_add_store v3.8b, v17.8h, , , , , , \dst, \src load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src load_add_store , 
, v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src load_add_store , , , , , , v5.8b, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src load_add_store v3.8b, v17.8h, , , , , , \dst, \src load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src load_add_store , , , , , , v5.8b, \dst, \src .endm .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src .ifnb \load ld1 {\load}[0], [\src], x1 .endif .ifnb \inssrc ins \insdst\().d[1], \inssrc\().d[0] .endif .ifnb \shift srshr \shift, \shift, #4 .endif .ifnb \load ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc 
uaddw \adddst, \adddst, \addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif .ifnb \narrowsrc sqxtun \narrowdst, \narrowsrc .endif .ifnb \store st1 {\store}[1], [\dst], x1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src load_add_store4 , , , , , , , , v7.s, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src load_add_store4 , , , , , , , , v3.s, \dst, \src .endm .macro idct_dc w, h, shift cbnz w3, 1f mov w16, #2896*8 ld1r {v16.8h}, [x2] dup v0.4h, w16 sqrdmulh v16.8h, v16.8h, v0.h[0] strh wzr, [x2] .if (\w == 2*\h) || (2*\w == \h) sqrdmulh v16.8h, v16.8h, 
v0.h[0] .endif .if \shift > 0 srshr v16.8h, v16.8h, #\shift .endif sqrdmulh v16.8h, v16.8h, v0.h[0] srshr v16.8h, v16.8h, #4 mov w4, #\h b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon 1: ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 subs w4, w4, #4 sub x0, x0, x1, lsl #2 uaddw v0.8h, v16.8h, v0.8b sqxtun v0.8b, v0.8h uaddw v1.8h, v16.8h, v1.8b st1 {v0.s}[0], [x0], x1 sqxtun v1.8b, v1.8h st1 {v0.s}[1], [x0], x1 st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 b.gt 1b ret endfunc function idct_dc_w8_neon 1: ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x0], x1 ld1 {v2.8b}, [x0], x1 uaddw v20.8h, v16.8h, v0.8b ld1 {v3.8b}, [x0], x1 sub x0, x0, x1, lsl #2 subs w4, w4, #4 uaddw v21.8h, v16.8h, v1.8b sqxtun v0.8b, v20.8h uaddw v22.8h, v16.8h, v2.8b sqxtun v1.8b, v21.8h uaddw v23.8h, v16.8h, v3.8b st1 {v0.8b}, [x0], x1 sqxtun v2.8b, v22.8h st1 {v1.8b}, [x0], x1 sqxtun v3.8b, v23.8h st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w16_neon 1: ld1 {v0.16b}, [x0], x1 ld1 {v1.16b}, [x0], x1 ld1 {v2.16b}, [x0], x1 subs w4, w4, #4 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b ld1 {v3.16b}, [x0], x1 uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b sub x0, x0, x1, lsl #2 uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h sqxtun2 v0.16b, v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h st1 {v0.16b}, [x0], x1 sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v1.16b}, [x0], x1 st1 {v2.16b}, [x0], x1 st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w32_neon 1: ld1 {v0.16b, v1.16b}, [x0], x1 subs w4, w4, #2 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b ld1 {v2.16b, v3.16b}, [x0] uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b sub x0, x0, x1 uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h 
sqxtun2 v0.16b, v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h st1 {v0.16b, v1.16b}, [x0], x1 sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v2.16b, v3.16b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w64_neon 1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] subs w4, w4, #1 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h sqxtun2 v0.16b, v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 b.gt 1b ret endfunc .macro iwht4 add v16.4h, v16.4h, v17.4h sub v21.4h, v18.4h, v19.4h sub v20.4h, v16.4h, v21.4h sshr v20.4h, v20.4h, #1 sub v18.4h, v20.4h, v17.4h sub v17.4h, v20.4h, v19.4h add v19.4h, v21.4h, v18.4h sub v16.4h, v16.4h, v17.4h .endm .macro idct_4 r0, r1, r2, r3, sz smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz sqrshrn_sz v6, v6, v7, #12, \sz sqrshrn_sz v7, v4, v5, #12, \sz smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz sqrshrn_sz v2, v2, v3, #12, \sz sqrshrn_sz v3, v4, v5, #12, \sz sqadd \r0\sz, v2\sz, v6\sz sqsub \r3\sz, v2\sz, v6\sz sqadd \r1\sz, v3\sz, v7\sz sqsub \r2\sz, v3\sz, v7\sz .endm function inv_dct_4h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .4h ret endfunc function inv_dct_8h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .8h ret endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.8h}, [x16] ssubl v3.4s, v16.4h, v18.4h smull v4.4s, v16.4h, v0.h[0] smlal v4.4s, v18.4h, v0.h[1] smlal v4.4s, 
v19.4h, v0.h[2] smull v7.4s, v17.4h, v0.h[3] saddw v3.4s, v3.4s, v19.4h smull v5.4s, v16.4h, v0.h[2] smlsl v5.4s, v18.4h, v0.h[0] smlsl v5.4s, v19.4h, v0.h[1] add \o3\().4s, v4.4s, v5.4s mul \o2\().4s, v3.4s, v0.s[2] add \o0\().4s, v4.4s, v7.4s add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s sqrshrn \o0\().4h, \o0\().4s, #12 sqrshrn \o2\().4h, \o2\().4s, #12 sqrshrn \o1\().4h, \o1\().4s, #12 sqrshrn \o3\().4h, \o3\().4s, #12 .endm function inv_adst_4h_x4_neon, export=1 iadst_4x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 v19, v18, v17, v16 ret endfunc .macro iadst_8x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.8h}, [x16] ssubl v2.4s, v16.4h, v18.4h ssubl2 v3.4s, v16.8h, v18.8h smull v4.4s, v16.4h, v0.h[0] smlal v4.4s, v18.4h, v0.h[1] smlal v4.4s, v19.4h, v0.h[2] smull2 v5.4s, v16.8h, v0.h[0] smlal2 v5.4s, v18.8h, v0.h[1] smlal2 v5.4s, v19.8h, v0.h[2] saddw v2.4s, v2.4s, v19.4h saddw2 v3.4s, v3.4s, v19.8h smull v6.4s, v16.4h, v0.h[2] smlsl v6.4s, v18.4h, v0.h[0] smlsl v6.4s, v19.4h, v0.h[1] smull2 v7.4s, v16.8h, v0.h[2] smlsl2 v7.4s, v18.8h, v0.h[0] smlsl2 v7.4s, v19.8h, v0.h[1] mul v18.4s, v2.4s, v0.s[2] mul v19.4s, v3.4s, v0.s[2] smull v2.4s, v17.4h, v0.h[3] smull2 v3.4s, v17.8h, v0.h[3] add v16.4s, v4.4s, v2.4s // out0 add v17.4s, v5.4s, v3.4s add v4.4s, v4.4s, v6.4s // out3 add v5.4s, v5.4s, v7.4s add v6.4s, v6.4s, v2.4s // out1 add v7.4s, v7.4s, v3.4s sub v4.4s, v4.4s, v2.4s // out3 sub v5.4s, v5.4s, v3.4s sqrshrn v18.4h, v18.4s, #12 sqrshrn2 v18.8h, v19.4s, #12 sqrshrn \o0\().4h, v16.4s, #12 sqrshrn2 \o0\().8h, v17.4s, #12 .ifc \o2, v17 mov v17.16b, v18.16b .endif sqrshrn \o1\().4h, v6.4s, #12 sqrshrn2 \o1\().8h, v7.4s, #12 sqrshrn \o3\().4h, v4.4s, #12 sqrshrn2 \o3\().8h, v5.4s, #12 .endm function inv_adst_8h_x4_neon, export=1 iadst_8x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 v19, v18, v17, v16 ret endfunc function inv_identity_4h_x4_neon, export=1 mov w16, 
#(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.4h, v16.4h, v0.h[0] sqrdmulh v5.4h, v17.4h, v0.h[0] sqrdmulh v6.4h, v18.4h, v0.h[0] sqrdmulh v7.4h, v19.4h, v0.h[0] sqadd v16.4h, v16.4h, v4.4h sqadd v17.4h, v17.4h, v5.4h sqadd v18.4h, v18.4h, v6.4h sqadd v19.4h, v19.4h, v7.4h ret endfunc function inv_identity_8h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.8h, v16.8h, v0.h[0] sqrdmulh v5.8h, v17.8h, v0.h[0] sqrdmulh v6.8h, v18.8h, v0.h[0] sqrdmulh v7.8h, v19.8h, v0.h[0] sqadd v16.8h, v16.8h, v4.8h sqadd v17.8h, v17.8h, v5.8h sqadd v18.8h, v18.8h, v6.8h sqadd v19.8h, v19.8h, v7.8h ret endfunc .macro identity_8x4_shift1 r0, r1, r2, r3, c .irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h sqrdmulh v2.8h, \i, \c srhadd \i, \i, v2.8h .endr .endm function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 mov x15, x30 movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v31.8h}, [x2], #16 sshr v16.4h, v16.4h, #2 sshr v17.4h, v17.4h, #2 sshr v18.4h, v18.4h, #2 sshr v19.4h, v19.4h, #2 iwht4 st1 {v31.8h}, [x2], #16 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 iwht4 ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v31.8h}, [x2], #16 blr x4 st1 {v31.8h}, [x2], #16 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 blr x5 ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 srshr v16.8h, v16.8h, #4 srshr v18.8h, v18.8h, #4 L(itx_4x4_end): sub x0, x0, x1, lsl #2 uaddw v16.8h, v16.8h, v0.8b sqxtun v0.8b, v16.8h uaddw v18.8h, v18.8h, v1.8b st1 {v0.s}[0], [x0], x1 sqxtun v1.8b, v18.8h st1 {v0.s}[1], [x0], x1 st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, 
export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct cbnz w3, 1f mov w16, #2896*8 ld1r {v16.8h}, [x2] dup v4.8h, w16 strh wzr, [x2] sqrdmulh v16.8h, v16.8h, v4.h[0] ld1 {v0.s}[0], [x0], x1 sqrdmulh v20.8h, v16.8h, v4.h[0] ld1 {v0.s}[1], [x0], x1 srshr v16.8h, v20.8h, #4 ld1 {v1.s}[0], [x0], x1 srshr v18.8h, v20.8h, #4 ld1 {v1.s}[1], [x0], x1 b L(itx_4x4_end) 1: .endif adr x4, inv_\txfm1\()_4h_x4_neon adr x5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb idct_4 \r0, \r2, \r4, \r6, \sz smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a sqrshrn_sz \r1, v2, v3, #12, \sz // t4a sqrshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a sqrshrn_sz \r3, v6, v7, #12, \sz // t5a sqrshrn_sz \r5, v2, v3, #12, \sz // t6a sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a sqadd v3\sz, \r7\sz, \r5\sz // t7 sqsub \r3\sz, \r7\sz, \r5\sz // t6a smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 sqrshrn_sz v4, v4, v5, #12, \sz // t5 sqrshrn_sz v5, v6, v7, #12, \sz // t6 sqsub \r7\sz, \r0\sz, v3\sz // out7 sqadd \r0\sz, \r0\sz, v3\sz // out0 sqadd \r1\sz, \r2\sz, v5\sz // out1 sqsub v6\sz, \r2\sz, v5\sz // out6 sqadd \r2\sz, \r4\sz, v4\sz // out2 sqsub \r5\sz, \r4\sz, v4\sz // out5 sqadd \r3\sz, \r6\sz, v2\sz // out3 sqsub \r4\sz, \r6\sz, v2\sz // 
out4 mov \r6\szb, v6\szb // out6 .endm function inv_dct_8h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b ret endfunc function inv_dct_4h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b ret endfunc .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz movrel x16, iadst8_coeffs ld1 {v0.8h, v1.8h}, [x16] smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz sqrshrn_sz v16, v2, v3, #12, \sz // t0a sqrshrn_sz v23, v4, v5, #12, \sz // t1a smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz sqrshrn_sz v18, v6, v7, #12, \sz // t2a sqrshrn_sz v21, v2, v3, #12, \sz // t3a smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz sqrshrn_sz v20, v4, v5, #12, \sz // t4a sqrshrn_sz v19, v6, v7, #12, \sz // t5a smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz sqrshrn_sz v22, v2, v3, #12, \sz // t6a sqrshrn_sz v17, v4, v5, #12, \sz // t7a sqadd v2\sz, v16\sz, v20\sz // t0 sqsub v3\sz, v16\sz, v20\sz // t4 sqadd v4\sz, v23\sz, v19\sz // t1 sqsub v5\sz, v23\sz, v19\sz // t5 sqadd v6\sz, v18\sz, v22\sz // t2 sqsub v7\sz, v18\sz, v22\sz // t6 sqadd v18\sz, v21\sz, v17\sz // t3 sqsub v19\sz, v21\sz, v17\sz // t7 smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz sqrshrn_sz v3, v16, v17, #12, \sz // t4a sqrshrn_sz v5, v20, v21, #12, \sz // t5a smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz sqrshrn_sz v7, v22, v23, #12, \sz // t6a sqrshrn_sz v19, v16, v17, #12, \sz // t7a sqadd \o0\()\sz, v2\sz, v6\sz // out0 sqsub v2\sz, v2\sz, v6\sz // t2 sqadd \o7\()\sz, v4\sz, v18\sz // out7 sqsub v4\sz, v4\sz, v18\sz // t3 sqneg \o7\()\sz, \o7\()\sz // 
out7 sqadd \o1\()\sz, v3\sz, v7\sz // out1 sqsub v3\sz, v3\sz, v7\sz // t6 sqadd \o6\()\sz, v5\sz, v19\sz // out6 sqsub v5\sz, v5\sz, v19\sz // t7 sqneg \o1\()\sz, \o1\()\sz // out1 smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) sqrshrn_sz v2, v18, v19, #12, \sz // out3 smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) sqrshrn_sz v3, v20, v21, #12, \sz // out5 sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) sqneg \o3\()\sz, v2\sz // out3 sqneg \o5\()\sz, v3\sz // out5 .endm function inv_adst_8h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h ret endfunc function inv_flipadst_8h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h ret endfunc function inv_adst_4h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h ret endfunc function inv_flipadst_4h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h ret endfunc function inv_identity_8h_x8_neon, export=1 sqshl v16.8h, v16.8h, #1 sqshl v17.8h, v17.8h, #1 sqshl v18.8h, v18.8h, #1 sqshl v19.8h, v19.8h, #1 sqshl v20.8h, v20.8h, #1 sqshl v21.8h, v21.8h, #1 sqshl v22.8h, v22.8h, #1 sqshl v23.8h, v23.8h, #1 ret endfunc function inv_identity_4h_x8_neon, export=1 sqshl v16.4h, v16.4h, #1 sqshl v17.4h, v17.4h, #1 sqshl v18.4h, v18.4h, #1 sqshl v19.4h, v19.4h, #1 sqshl v20.4h, v20.4h, #1 sqshl v21.4h, v21.4h, #1 sqshl v22.4h, v22.4h, #1 sqshl v23.4h, v23.4h, #1 ret endfunc .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_neon movi v28.8h, #0 movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64 ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] .ifc \variant, 
identity_ // The identity shl #1 and downshift srshr #1 cancel out b L(itx_8x8_epilog) .else blr x4 srshr v16.8h, v16.8h, #1 srshr v17.8h, v17.8h, #1 srshr v18.8h, v18.8h, #1 srshr v19.8h, v19.8h, #1 srshr v20.8h, v20.8h, #1 srshr v21.8h, v21.8h, #1 srshr v22.8h, v22.8h, #1 srshr v23.8h, v23.8h, #1 L(itx_8x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 blr x5 load_add_store_8x8 x0, x7 ret x15 .endif endfunc .endm def_fn_8x8_base identity_ def_fn_8x8_base .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif adr x5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else adr x4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc .endm def_fn_8x8 dct, dct def_fn_8x8 identity, identity def_fn_8x8 dct, adst def_fn_8x8 dct, flipadst def_fn_8x8 dct, identity def_fn_8x8 adst, dct def_fn_8x8 adst, adst def_fn_8x8 adst, flipadst def_fn_8x8 flipadst, dct def_fn_8x8 flipadst, adst def_fn_8x8 flipadst, flipadst def_fn_8x8 identity, dct def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst function inv_txfm_add_8x4_neon movi v30.8h, #0 movi v31.8h, #0 mov w16, #2896*8 dup v0.4h, w16 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v30.8h,v31.8h}, [x2], #32 ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2] st1 {v30.8h,v31.8h}, [x2] scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 ins v16.d[1], v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] blr x5 load_add_store_8x4 x0, x7 ret x15 endfunc function inv_txfm_add_4x8_neon movi v28.8h, #0 movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 mov w16, #2896*8 dup v0.4h, w16 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] scale_input .8h, 
v0.h[0], v16, v17, v18, v19 blr x4 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] ins v22.d[0], v18.d[1] ins v23.d[0], v19.d[1] blr x5 load_add_store_4x8 x0, x7 ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct def_fn_48 \w, \h, identity, identity def_fn_48 \w, \h, dct, adst def_fn_48 \w, \h, dct, flipadst def_fn_48 \w, \h, dct, identity def_fn_48 \w, \h, adst, dct def_fn_48 \w, \h, adst, adst def_fn_48 \w, \h, adst, flipadst def_fn_48 \w, \h, flipadst, dct def_fn_48 \w, \h, flipadst, adst def_fn_48 \w, \h, flipadst, flipadst def_fn_48 \w, \h, identity, dct def_fn_48 \w, \h, adst, identity def_fn_48 \w, \h, flipadst, identity def_fn_48 \w, \h, identity, adst def_fn_48 \w, \h, identity, flipadst .endm def_fns_48 4, 8 def_fns_48 8, 4 .macro idct_16 sz, szb idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a sqrshrn_sz v17, v2, v3, #12, \sz // t8a sqrshrn_sz v31, v4, v5, #12, \sz // t15a smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a sqrshrn_sz v23, v6, v7, #12, \sz // t9a sqrshrn_sz v25, v2, v3, #12, \sz // t14a smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a sqrshrn_sz v21, v4, v5, #12, \sz // t10a sqrshrn_sz v27, v6, v7, #12, \sz // t13a smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a sqrshrn_sz v19, v2, v3, #12, \sz // t11a sqrshrn_sz v29, 
v4, v5, #12, \sz // t12a sqsub v2\sz, v17\sz, v23\sz // t9 sqadd v17\sz, v17\sz, v23\sz // t8 sqsub v3\sz, v31\sz, v25\sz // t14 sqadd v31\sz, v31\sz, v25\sz // t15 sqsub v23\sz, v19\sz, v21\sz // t10 sqadd v19\sz, v19\sz, v21\sz // t11 sqadd v25\sz, v29\sz, v27\sz // t12 sqsub v29\sz, v29\sz, v27\sz // t13 smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a sqrshrn_sz v21, v4, v5, #12, \sz // t9a sqrshrn_sz v27, v6, v7, #12, \sz // t14a smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a sqrshrn_sz v29, v4, v5, #12, \sz // t13a neg v6.4s, v6.4s .ifc \sz, .8h neg v7.4s, v7.4s .endif sqrshrn_sz v23, v6, v7, #12, \sz // t10a sqsub v2\sz, v17\sz, v19\sz // t11a sqadd v17\sz, v17\sz, v19\sz // t8a sqsub v3\sz, v31\sz, v25\sz // t12a sqadd v31\sz, v31\sz, v25\sz // t15a sqadd v19\sz, v21\sz, v23\sz // t9 sqsub v21\sz, v21\sz, v23\sz // t10 sqsub v25\sz, v27\sz, v29\sz // t13 sqadd v27\sz, v27\sz, v29\sz // t14 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11 smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a sqrshrn_sz v4, v4, v5, #12, \sz // t11 sqrshrn_sz v5, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a sqrshrn_sz v2, v2, v3, #12, \sz // t10a sqrshrn_sz v3, v6, v7, #12, \sz // t13a sqadd v6\sz, v16\sz, v31\sz // out0 sqsub v31\sz, v16\sz, v31\sz // out15 mov v16\szb, v6\szb sqadd v23\sz, v30\sz, v17\sz // out7 sqsub v7\sz, v30\sz, v17\sz // out8 sqadd v17\sz, v18\sz, v27\sz // out1 sqsub v30\sz, v18\sz, v27\sz // out14 sqadd v18\sz, v20\sz, v3\sz // out2 sqsub v29\sz, v20\sz, v3\sz // out13 sqadd v3\sz, v28\sz, v19\sz // out6 sqsub v25\sz, v28\sz, v19\sz // out9 sqadd v19\sz, v22\sz, v5\sz // out3 sqsub v28\sz, v22\sz, v5\sz // out12 sqadd v20\sz, v24\sz, v4\sz // out4 sqsub v27\sz, v24\sz, v4\sz // out11 
sqadd v21\sz, v26\sz, v2\sz // out5 sqsub v26\sz, v26\sz, v2\sz // out10 mov v24\szb, v7\szb mov v22\szb, v3\szb .endm function inv_dct_8h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .8h, .16b ret endfunc function inv_dct_4h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .4h, .8b ret endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb movrel x16, iadst16_coeffs ld1 {v0.8h, v1.8h}, [x16] movrel x16, idct_coeffs smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 sqrshrn_sz v16, v2, v3, #12, \sz // t0 sqrshrn_sz v31, v4, v5, #12, \sz // t1 smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 sqrshrn_sz v18, v6, v7, #12, \sz // t2 sqrshrn_sz v29, v2, v3, #12, \sz // t3 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 sqrshrn_sz v20, v4, v5, #12, \sz // t4 sqrshrn_sz v27, v6, v7, #12, \sz // t5 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 sqrshrn_sz v22, v2, v3, #12, \sz // t6 sqrshrn_sz v25, v4, v5, #12, \sz // t7 smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 sqrshrn_sz v23, v6, v7, #12, \sz // t8 sqrshrn_sz v24, v2, v3, #12, \sz // t9 smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 sqrshrn_sz v21, v4, v5, #12, \sz // t10 sqrshrn_sz v26, v6, v7, #12, \sz // t11 smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 sqrshrn_sz v19, v2, v3, #12, \sz // t12 sqrshrn_sz v28, v4, v5, #12, \sz // t13 
smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 sqrshrn_sz v17, v6, v7, #12, \sz // t14 sqrshrn_sz v30, v2, v3, #12, \sz // t15 ld1 {v0.8h}, [x16] sqsub v2\sz, v16\sz, v23\sz // t8a sqadd v16\sz, v16\sz, v23\sz // t0a sqsub v3\sz, v31\sz, v24\sz // t9a sqadd v31\sz, v31\sz, v24\sz // t1a sqadd v23\sz, v18\sz, v21\sz // t2a sqsub v18\sz, v18\sz, v21\sz // t10a sqadd v24\sz, v29\sz, v26\sz // t3a sqsub v29\sz, v29\sz, v26\sz // t11a sqadd v21\sz, v20\sz, v19\sz // t4a sqsub v20\sz, v20\sz, v19\sz // t12a sqadd v26\sz, v27\sz, v28\sz // t5a sqsub v27\sz, v27\sz, v28\sz // t13a sqadd v19\sz, v22\sz, v17\sz // t6a sqsub v22\sz, v22\sz, v17\sz // t14a sqadd v28\sz, v25\sz, v30\sz // t7a sqsub v25\sz, v25\sz, v30\sz // t15a smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 sqrshrn_sz v17, v4, v5, #12, \sz // t8 sqrshrn_sz v30, v6, v7, #12, \sz // t9 smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 sqrshrn_sz v18, v2, v3, #12, \sz // t10 sqrshrn_sz v29, v4, v5, #12, \sz // t11 smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 sqrshrn_sz v27, v6, v7, #12, \sz // t12 sqrshrn_sz v20, v2, v3, #12, \sz // t13 smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 sqrshrn_sz v25, v4, v5, #12, \sz // t14 sqrshrn_sz v22, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t4 sqadd v16\sz, v16\sz, v21\sz // t0 sqsub v3\sz, v31\sz, v26\sz // t5 sqadd v31\sz, v31\sz, v26\sz // t1 sqadd v21\sz, v23\sz, v19\sz // t2 sqsub v23\sz, v23\sz, v19\sz // t6 sqadd v26\sz, v24\sz, v28\sz // t3 sqsub v24\sz, v24\sz, v28\sz // t7 sqadd v19\sz, v17\sz, v27\sz // t8a sqsub v17\sz, v17\sz, v27\sz // t12a sqadd v28\sz, v30\sz, v20\sz // t9a sqsub v30\sz, v30\sz, v20\sz // t13a sqadd v27\sz, v18\sz, 
v25\sz // t10a sqsub v18\sz, v18\sz, v25\sz // t14a sqadd v20\sz, v29\sz, v22\sz // t11a sqsub v29\sz, v29\sz, v22\sz // t15a smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a sqrshrn_sz v22, v4, v5, #12, \sz // t4a sqrshrn_sz v25, v6, v7, #12, \sz // t5a smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 sqrshrn_sz v24, v2, v3, #12, \sz // t6a sqrshrn_sz v23, v4, v5, #12, \sz // t7a smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 sqrshrn_sz v17, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 sqrshrn_sz v29, v2, v3, #12, \sz // t13 sqrshrn_sz v30, v4, v5, #12, \sz // t14 sqrshrn_sz v18, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t2a .ifc \o0, v16 sqadd \o0\sz, v16\sz, v21\sz // out0 sqsub v21\sz, v31\sz, v26\sz // t3a sqadd \o15\sz, v31\sz, v26\sz // out15 .else sqadd v4\sz, v16\sz, v21\sz // out0 sqsub v21\sz, v31\sz, v26\sz // t3a sqadd \o15\sz, v31\sz, v26\sz // out15 mov \o0\szb, v4\szb .endif sqneg \o15\sz, \o15\sz // out15 sqsub v3\sz, v29\sz, v18\sz // t15a sqadd \o13\sz, v29\sz, v18\sz // out13 sqadd \o2\sz, v17\sz, v30\sz // out2 sqsub v26\sz, v17\sz, v30\sz // t14a sqneg \o13\sz, \o13\sz // out13 sqadd \o1\sz, v19\sz, v27\sz // out1 sqsub v27\sz, v19\sz, v27\sz // t10 sqadd \o14\sz, v28\sz, v20\sz // out14 sqsub v20\sz, v28\sz, v20\sz // t11 sqneg \o1\sz, \o1\sz // out1 sqadd \o3\sz, v22\sz, v24\sz // out3 sqsub v22\sz, v22\sz, v24\sz // t6 sqadd \o12\sz, v25\sz, v23\sz // out12 sqsub v23\sz, v25\sz, v23\sz // t7 sqneg \o3\sz, \o3\sz // out3 smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23) smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) smull_smlal v6, v7, v26, v3, v0.h[0], 
v0.h[0], \sz // -> out5 (v21 or v26)
        // Remainder of the iadst_16 macro body; the start of the macro precedes
        // this span.
        sqrshrn_sz      v24, v24, v25, #12, \sz // out8
        sqrshrn_sz      v4, v4, v5, #12, \sz // out7
        sqrshrn_sz      v5, v6, v7, #12, \sz // out5
        smull_smlsl     v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
        smull_smlal     v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
        sqrshrn_sz      v26, v6, v7, #12, \sz // out10

        smull_smlsl     v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
        smull_smlal     v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
        smull_smlsl     v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)

        sqrshrn_sz      \o4, v2, v3, #12, \sz // out4
        sqrshrn_sz      v6, v6, v7, #12, \sz // out11
        sqrshrn_sz      v7, v21, v25, #12, \sz // out9
        sqrshrn_sz      \o6, v22, v23, #12, \sz // out6

        // out8/out10 were produced in v24/v26 above. With the ascending
        // register list \o8/\o10 already are v24/v26; with the reversed list
        // (\o8 == v23) they have to be copied into place.
.ifc \o8, v23
        mov             \o8\szb, v24\szb
        mov             \o10\szb, v26\szb
.endif

        sqneg           \o7\sz, v4\sz // out7
        sqneg           \o5\sz, v5\sz // out5
        sqneg           \o11\sz, v6\sz // out11
        sqneg           \o9\sz, v7\sz // out9
.endm

// 16-point ADST on v16-v31 using .8h lanes: expands iadst_16 with the output
// registers listed in ascending order (v16 ... v31).
function inv_adst_8h_x16_neon, export=1
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
        ret
endfunc

// Same expansion as inv_adst_8h_x16_neon, but the output register list is
// reversed (v31 ... v16), so the outputs land in flipped order.
function inv_flipadst_8h_x16_neon, export=1
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
        ret
endfunc

// 16-point ADST on v16-v31 using .4h lanes (register moves use .8b).
function inv_adst_4h_x16_neon, export=1
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
        ret
endfunc

// .4h variant of the flipped ADST: reversed output register list.
function inv_flipadst_4h_x16_neon, export=1
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
        ret
endfunc

// 16-point identity transform on v16-v31 (.8h lanes). For every register:
//   x = sat(sat(x + x) + sqrdmulh(x, 2*(5793-4096)*8))
// v0 holds the constant, v2 is scratch.
function inv_identity_8h_x16_neon, export=1
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.8h, v\i\().8h, v0.h[0]
        sqadd           v\i\().8h, v\i\().8h, v\i\().8h
        sqadd           v\i\().8h, v\i\().8h, v2.8h
.endr
        ret
endfunc

// Same computation as inv_identity_8h_x16_neon on the low .4h lanes only.
function inv_identity_4h_x16_neon, export=1
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4h, v\i\().4h, v0.h[0]
        sqadd           v\i\().4h, v\i\().4h, v\i\().4h
        sqadd           v\i\().4h, v\i\().4h, v2.4h
.endr
        ret
endfunc

// The identity_* macros below take \c, a by-element multiplier operand (the
// callers in this file pass v0.h[0] or v1.h[1]), and clobber v2.

// v16-v31: x = srhadd(x, sqrdmulh(x, \c) >> 1), i.e. a rounding halving add
// of x and half of the scaled term.
.macro identity_8x16_shift2 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h, \i, \c
        sshr            v2.8h, v2.8h, #1
        srhadd          \i, \i, v2.8h
.endr
.endm

// v16-v31: x = sat(x + srshr(sqrdmulh(x, \c), 1)).
.macro identity_8x16_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        sqrdmulh        v2.8h, \i, \c
        srshr           v2.8h, v2.8h, #1
        sqadd           \i, \i, v2.8h
.endr
.endm

// Same as identity_8x16_shift1, restricted to v16-v23.
.macro identity_8x8_shift1 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h, \i, \c
        srshr           v2.8h, v2.8h, #1
        sqadd           \i, \i, v2.8h
.endr
.endm

// v16-v23: x = sat(sat(x + x) + sqrdmulh(x, \c)).
.macro identity_8x8 c
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        sqrdmulh        v2.8h, \i, \c
        sqadd           \i, \i, \i
        sqadd           \i, \i, v2.8h
.endr
.endm

// def_horz_16: emits inv_txfm_horz<suffix>_16x8_neon, the first-pass routine
// for a slice of 16 input vectors.
//   scale    - nonzero: multiply the inputs with scale_input by 2896*8
//   identity - nonzero: apply identity_8x16_shift2 instead of calling x4
//   shift    - rounding right shift (srshr) applied after the x4 transform
//   suffix   - text inserted into the function name
// Registers as used by the body:
//   x7  source pointer; each loaded vector is replaced by zeros in memory
//       and the pointer advances by x8 bytes
//   x6  destination pointer; 16 vectors are stored with post-increment
//   x4  transform invoked with blr (non-identity variants only)
//   x14 keeps the return address across the blr
.macro def_horz_16 scale=0, identity=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x8_neon
        AARCH64_VALID_CALL_TARGET
        mov             x14, x30
        movi            v7.8h, #0
.if \identity
        mov             w16, #2*(5793-4096)*8
        dup             v0.4h, w16
.elseif \scale
        mov             w16, #2896*8
        dup             v0.4h, w16
.endif
        // Load v16-v31 and clear the source as we go.
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        ld1             {\i}, [x7]
        st1             {v7.8h}, [x7], x8
.endr
.if \scale
        scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
.if \identity
        identity_8x16_shift2 v0.h[0]
        b               L(horz_16x8_epilog)
.else
        blr             x4
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
        srshr           \i, \i, #\shift
.endr
        // The transpose/store tail is assembled only once, in the expansion
        // with identity == 0 and shift != 1; the other expansions branch to
        // that shared label.
.if \shift == 1
        b               L(horz_16x8_epilog)
.else
L(horz_16x8_epilog):
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5

        // Store the two transposed 8x8 halves interleaved (v16, v24, v17, ...).
.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
        st1             {\i}, [x6], #16
.endr

        ret             x14
.endif
.endif
endfunc
.endm

def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
def_horz_16 scale=0, identity=0, shift=2

// Second pass over an 8-column strip with 16 rows: loads v16-v31 from x7 with
// a stride of x8 bytes, calls the transform in x5, then expands
// load_add_store_8x16 with x6 and x7. x14 keeps the return address.
function inv_txfm_add_vert_8x16_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5
        load_add_store_8x16 x6, x7
        ret             x14
endfunc

// Shared 16x16 driver reached from the def_fn_16x16 entry points.
//   x0        destination; the second pass is run at x0 and x0 + 8
//   x2        coefficient buffer
//   w3, w13   when w3 < w13 the second first-pass slice is skipped and the
//             matching 256 bytes of the stack buffer are zeroed instead
//             (w13 is loaded from the eob_half macro argument by the callers;
//             w3 is presumably the eob value - confirm against the C caller)
//   x9        first-pass routine (called with blr)
// A 512-byte intermediate buffer lives on the stack; x15 keeps the return
// address.
function inv_txfm_add_16x16_neon
        mov             x15, x30
        sub             sp, sp, #512
        mov             x8, #16*2
.irp i, 0, 8
        add             x6, sp, #(\i*16*2)
.if \i == 8
        cmp             w3, w13
        b.lt            1f
.endif
        add             x7, x2, #(\i*2)
        blr             x9
.endr
        b               2f
1:
        // Skipped slice: clear 4 * 64 bytes at x6.
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
2:
.irp i, 0, 8
        add             x6, x0, #(\i)
        add             x7, sp, #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp, sp, #512
        ret             x15
endfunc

// def_fn_16x16: emits the exported entry point
// inv_txfm_add_<txfm1>_<txfm2>_16x16_8bpc_neon. It selects the first-pass
// routine (x9), the first transform (x4, not needed for identity), the second
// transform (x5) and the threshold x13 = eob_half, then branches to
// inv_txfm_add_16x16_neon. The dct_dct combination expands idct_dc first.
.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16, 16, 2
.endif
.ifc \txfm1, identity
        adr             x9, inv_txfm_horz_identity_16x8_neon
.else
        adr             x9, inv_txfm_horz_16x8_neon
        adr             x4, inv_\txfm1\()_8h_x16_neon
.endif
        adr             x5, inv_\txfm2\()_8h_x16_neon
        mov             x13, #\eob_half
        b               inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

// def_fn_416_base: emits inv_txfm_<variant>add_16x4_neon and
// inv_txfm_<variant>add_4x16_neon; \variant is either empty or "identity_".
.macro def_fn_416_base variant
function inv_txfm_\variant\()add_16x4_neon
        mov             x15, x30
        movi            v4.8h, #0

.ifc \variant, identity_
        // Identity variant: fill the low halves of v16-v19, then their high
        // halves, clearing the source after every load.
.irp i, v16.4h, v17.4h, v18.4h, v19.4h
        ld1             {\i}, [x2]
        st1             {v4.4h}, [x2], #8
.endr
.irp i, v16.d, v17.d, v18.d, v19.d
        ld1             {\i}[1], [x2]
        st1
{v4.4h}, [x2], #8 .endr mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, v20.4h, v21.4h, v22.4h, v23.4h ld1 {\i}, [x2] st1 {v4.4h}, [x2], #8 .endr .irp i, v20.d, v21.d, v22.d, v23.d ld1 {\i}[1], [x2] st1 {v4.4h}, [x2], #8 .endr identity_8x16_shift1 v0.h[0] b L(itx_16x4_epilog) .else .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h ld1 {\i}, [x2] st1 {v4.4h}, [x2], #8 .endr blr x4 ins v16.d[1], v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr ins v24.d[1], v28.d[0] ins v25.d[1], v29.d[0] ins v26.d[1], v30.d[0] ins v27.d[1], v31.d[0] srshr v20.8h, v24.8h, #1 srshr v21.8h, v25.8h, #1 srshr v22.8h, v26.8h, #1 srshr v23.8h, v27.8h, #1 L(itx_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x4 x6, x7 transpose_4x8h_mov v20, v21, v22, v23, v2, v3, v4, v5, v16, v17, v18, v19 blr x5 add x6, x0, #8 load_add_store_8x4 x6, x7 ret x15 .endif endfunc function inv_txfm_\variant\()add_4x16_neon mov x15, x30 movi v2.8h, #0 mov x11, #32 cmp w3, w13 b.lt 1f add x6, x2, #16 .ifc \variant, identity_ .irp i, v24.8h, v25.8h, v26.8h, v27.8h ld1 {\i}, [x6] st1 {v2.8h}, [x6], x11 .endr mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v24, v25, v26, v27, v0.h[0] .else .irp i, v16.8h, v17.8h, v18.8h, v19.8h ld1 {\i}, [x6] st1 {v2.8h}, [x6], x11 .endr blr x4 srshr v24.8h, v16.8h, #1 srshr v25.8h, v17.8h, #1 srshr v26.8h, v18.8h, #1 srshr v27.8h, v19.8h, #1 .endif transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7 ins v28.d[0], v24.d[1] ins v29.d[0], v25.d[1] ins v30.d[0], v26.d[1] ins v31.d[0], v27.d[1] b 2f 1: .irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h movi \i, #0 .endr 2: movi v2.8h, #0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h ld1 {\i}, [x2] st1 {v2.8h}, [x2], x11 .endr .ifc \variant, identity_ mov w16, #(5793-4096)*8 dup v0.4h, w16 
identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] b L(itx_4x16_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr L(itx_4x16_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] ins v22.d[0], v18.d[1] ins v23.d[0], v19.d[1] blr x5 load_add_store_4x16 x0, x6 ret x15 .endif endfunc .endm def_fn_416_base identity_ def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 .ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon .endif adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else .ifnc \txfm1, identity adr x4, inv_\txfm1\()_4h_x\w\()_neon .endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct, 29 def_fn_416 \w, \h, identity, identity, 29 def_fn_416 \w, \h, dct, adst, 29 def_fn_416 \w, \h, dct, flipadst, 29 def_fn_416 \w, \h, dct, identity, 8 def_fn_416 \w, \h, adst, dct, 29 def_fn_416 \w, \h, adst, adst, 29 def_fn_416 \w, \h, adst, flipadst, 29 def_fn_416 \w, \h, flipadst, dct, 29 def_fn_416 \w, \h, flipadst, adst, 29 def_fn_416 \w, \h, flipadst, flipadst, 29 def_fn_416 \w, \h, identity, dct, 32 def_fn_416 \w, \h, adst, identity, 8 def_fn_416 \w, \h, flipadst, identity, 8 def_fn_416 \w, \h, identity, adst, 32 def_fn_416 \w, \h, identity, flipadst, 32 .endm def_fns_416 4, 16 def_fns_416 16, 4 .macro def_fn_816_base variant function inv_txfm_\variant\()add_16x8_neon mov x15, x30 movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x2] st1 {v4.8h}, [x2], #16 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, 
v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .ifc \variant, identity_ mov w16, #2*(5793-4096)*8 dup v0.4h, w16 identity_8x16_shift1 v0.h[0] b L(itx_16x8_epilog) .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #1 .endr L(itx_16x8_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 mov x6, x0 load_add_store_8x8 x6, x7 transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23 blr x5 add x0, x0, #8 load_add_store_8x8 x0, x7 ret x15 .endif endfunc function inv_txfm_\variant\()add_8x16_neon mov x15, x30 movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 mov x11, #32 cmp w3, w13 b.lt 1f add x6, x2, #16 .ifc \variant, identity_ .irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x6] st1 {v4.8h}, [x6], x11 .endr scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 // The identity shl #1 and downshift srshr #1 cancel out .else .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x6] st1 {v4.8h}, [x6], x11 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 srshr v24.8h, v16.8h, #1 srshr v25.8h, v17.8h, #1 srshr v26.8h, v18.8h, #1 srshr v27.8h, v19.8h, #1 srshr v28.8h, v20.8h, #1 srshr v29.8h, v21.8h, #1 srshr v30.8h, v22.8h, #1 srshr v31.8h, v23.8h, #1 .endif transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr 2: movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v4.8h}, [x2], x11 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out b L(itx_8x16_epilog) .else blr x4 .irp i, v16.8h, 
v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr L(itx_8x16_epilog): transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 load_add_store_8x16 x0, x6 ret x15 .endif endfunc .endm def_fn_816_base identity_ def_fn_816_base .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .ifnc \txfm1, identity adr x4, inv_\txfm1\()_8h_x\w\()_neon .endif adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct, 43 def_fn_816 \w, \h, identity, identity, 43 def_fn_816 \w, \h, dct, adst, 43 def_fn_816 \w, \h, dct, flipadst, 43 def_fn_816 \w, \h, dct, identity, 8 def_fn_816 \w, \h, adst, dct, 43 def_fn_816 \w, \h, adst, adst, 43 def_fn_816 \w, \h, adst, flipadst, 43 def_fn_816 \w, \h, flipadst, dct, 43 def_fn_816 \w, \h, flipadst, adst, 43 def_fn_816 \w, \h, flipadst, flipadst, 43 def_fn_816 \w, \h, identity, dct, 64 def_fn_816 \w, \h, adst, identity, 8 def_fn_816 \w, \h, flipadst, identity, 8 def_fn_816 \w, \h, identity, adst, 64 def_fn_816 \w, \h, identity, flipadst, 64 .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_8h_x16_neon, export=1 movrel x16, idct_coeffs, 2*16 ld1 {v0.8h, v1.8h}, [x16] sub x16, x16, #2*16 smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a sqrshrn_sz v16, v2, v3, #12, .8h // t16a sqrshrn_sz v31, v4, v5, #12, .8h // t31a smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a sqrshrn_sz v24, v6, v7, #12, .8h // t17a sqrshrn_sz v23, v2, v3, #12, .8h // t30a smull_smlal v6, v7, v20, v27, 
v0.h[5], v0.h[4], .8h // -> t29a smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a sqrshrn_sz v20, v4, v5, #12, .8h // t18a sqrshrn_sz v27, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a sqrshrn_sz v28, v2, v3, #12, .8h // t19a sqrshrn_sz v19, v4, v5, #12, .8h // t28a smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a sqrshrn_sz v18, v6, v7, #12, .8h // t20a sqrshrn_sz v29, v2, v3, #12, .8h // t27a smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a sqrshrn_sz v26, v4, v5, #12, .8h // t21a sqrshrn_sz v21, v6, v7, #12, .8h // t26a smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a sqrshrn_sz v22, v2, v3, #12, .8h // t22a sqrshrn_sz v25, v4, v5, #12, .8h // t25a smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a sqrshrn_sz v30, v6, v7, #12, .8h // t23a sqrshrn_sz v17, v2, v3, #12, .8h // t24a ld1 {v0.8h}, [x16] sqsub v2.8h, v16.8h, v24.8h // t17 sqadd v16.8h, v16.8h, v24.8h // t16 sqsub v3.8h, v31.8h, v23.8h // t30 sqadd v31.8h, v31.8h, v23.8h // t31 sqsub v24.8h, v28.8h, v20.8h // t18 sqadd v28.8h, v28.8h, v20.8h // t19 sqadd v23.8h, v18.8h, v26.8h // t20 sqsub v18.8h, v18.8h, v26.8h // t21 sqsub v20.8h, v30.8h, v22.8h // t22 sqadd v30.8h, v30.8h, v22.8h // t23 sqadd v26.8h, v17.8h, v25.8h // t24 sqsub v17.8h, v17.8h, v25.8h // t25 sqsub v22.8h, v29.8h, v21.8h // t26 sqadd v29.8h, v29.8h, v21.8h // t27 sqadd v25.8h, v19.8h, v27.8h // t28 sqsub v19.8h, v19.8h, v27.8h // t29 smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a sqrshrn_sz v21, v4, v5, #12, .8h // t17a sqrshrn_sz v27, v6, v7, 
#12, .8h // t30a neg v2.4s, v2.4s // -> t18a neg v3.4s, v3.4s // -> t18a smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a sqrshrn_sz v19, v2, v3, #12, .8h // t18a sqrshrn_sz v24, v4, v5, #12, .8h // t29a smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a sqrshrn_sz v22, v6, v7, #12, .8h // t21a sqrshrn_sz v18, v2, v3, #12, .8h // t26a neg v4.4s, v4.4s // -> t22a neg v5.4s, v5.4s // -> t22a smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a sqrshrn_sz v17, v4, v5, #12, .8h // t22a sqrshrn_sz v20, v6, v7, #12, .8h // t25a sqsub v2.8h, v27.8h, v24.8h // t29 sqadd v27.8h, v27.8h, v24.8h // t30 sqsub v3.8h, v21.8h, v19.8h // t18 sqadd v21.8h, v21.8h, v19.8h // t17 sqsub v24.8h, v16.8h, v28.8h // t19a sqadd v16.8h, v16.8h, v28.8h // t16a sqsub v19.8h, v30.8h, v23.8h // t20a sqadd v30.8h, v30.8h, v23.8h // t23a sqsub v28.8h, v17.8h, v22.8h // t21 sqadd v17.8h, v17.8h, v22.8h // t22 sqadd v23.8h, v26.8h, v29.8h // t24a sqsub v26.8h, v26.8h, v29.8h // t27a sqadd v22.8h, v20.8h, v18.8h // t25 sqsub v20.8h, v20.8h, v18.8h // t26 sqsub v29.8h, v31.8h, v25.8h // t28a sqadd v31.8h, v31.8h, v25.8h // t31a smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 sqrshrn_sz v18, v4, v5, #12, .8h // t18a sqrshrn_sz v25, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 sqrshrn_sz v29, v2, v3, #12, .8h // t19 sqrshrn_sz v24, v4, v5, #12, .8h // t28 neg v6.4s, v6.4s // -> t20 neg v7.4s, v7.4s // -> t20 smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a sqrshrn_sz v26, v6, v7, #12, .8h // t20 sqrshrn_sz v19, v2, v3, #12, .8h // 
t27 neg v4.4s, v4.4s // -> t21a neg v5.4s, v5.4s // -> t21a smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a sqrshrn_sz v20, v4, v5, #12, .8h // t21a sqrshrn_sz v28, v6, v7, #12, .8h // t26a sqsub v2.8h, v16.8h, v30.8h // t23 sqadd v16.8h, v16.8h, v30.8h // t16 = out16 sqsub v3.8h, v31.8h, v23.8h // t24 sqadd v31.8h, v31.8h, v23.8h // t31 = out31 sqsub v23.8h, v21.8h, v17.8h // t22a sqadd v17.8h, v21.8h, v17.8h // t17a = out17 sqadd v30.8h, v27.8h, v22.8h // t30a = out30 sqsub v21.8h, v27.8h, v22.8h // t25a sqsub v27.8h, v18.8h, v20.8h // t21 sqadd v18.8h, v18.8h, v20.8h // t18 = out18 sqadd v4.8h, v29.8h, v26.8h // t19a = out19 sqsub v26.8h, v29.8h, v26.8h // t20a sqadd v29.8h, v25.8h, v28.8h // t29 = out29 sqsub v25.8h, v25.8h, v28.8h // t26 sqadd v28.8h, v24.8h, v19.8h // t28a = out28 sqsub v24.8h, v24.8h, v19.8h // t27a mov v19.16b, v4.16b // out19 smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 sqrshrn_sz v20, v4, v5, #12, .8h // t20 sqrshrn_sz v22, v6, v7, #12, .8h // t27 smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a mov v27.16b, v22.16b // t27 sqrshrn_sz v26, v4, v5, #12, .8h // t26a smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 sqrshrn_sz v21, v6, v7, #12, .8h // t21a sqrshrn_sz v22, v24, v25, #12, .8h // t22 sqrshrn_sz v25, v4, v5, #12, .8h // t25 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a sqrshrn_sz v23, v4, v5, #12, .8h // t23a sqrshrn_sz v24, v6, v7, #12, .8h // t24a ret endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x8_neon mov x14, x30 movi v7.8h, #0 lsl x8, x8, #1 .if \scale mov w16, #2896*8 dup v0.4h, w16 .endif .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, 
v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .if \scale scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_8h_x16_neon transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 .macro store1 r0, r1 st1 {\r0}, [x6], #16 st1 {\r1}, [x6], #16 add x6, x6, #32 .endm store1 v16.8h, v24.8h store1 v17.8h, v25.8h store1 v18.8h, v26.8h store1 v19.8h, v27.8h store1 v20.8h, v28.8h store1 v21.8h, v29.8h store1 v22.8h, v30.8h store1 v23.8h, v31.8h .purgem store1 sub x6, x6, #64*8 movi v7.8h, #0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in v0.h[1] scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct32_odd_8h_x16_neon transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 .macro store2 r0, r1, shift ld1 {v4.8h, v5.8h}, [x6] sqsub v7.8h, v4.8h, \r0 sqsub v6.8h, v5.8h, \r1 sqadd v4.8h, v4.8h, \r0 sqadd v5.8h, v5.8h, \r1 rev64 v6.8h, v6.8h rev64 v7.8h, v7.8h srshr v4.8h, v4.8h, #\shift srshr v5.8h, v5.8h, #\shift srshr v6.8h, v6.8h, #\shift srshr v7.8h, v7.8h, #\shift ext v6.16b, v6.16b, v6.16b, #8 st1 {v4.8h, v5.8h}, [x6], #32 ext v7.16b, v7.16b, v7.16b, #8 st1 {v6.8h, v7.8h}, [x6], #32 .endm store2 v31.8h, v23.8h, \shift store2 v30.8h, v22.8h, \shift store2 v29.8h, v21.8h, \shift store2 v28.8h, v20.8h, \shift store2 v27.8h, v19.8h, \shift store2 v26.8h, v18.8h, \shift store2 v25.8h, v17.8h, \shift store2 v24.8h, v16.8h, \shift .purgem store2 
ret x14
endfunc
.endm

// Two expansions of def_horz_32: the plain one (srshr by 2) and the "_scale"
// one (inputs pre-scaled, srshr by 1).
def_horz_32 scale=0, shift=2
def_horz_32 scale=1, shift=1, suffix=_scale

// Second pass of a 32-point DCT over an 8-column strip, added to 8-bit pixels.
//   x7  intermediate buffer, row stride x8 bytes on entry
//   x6  destination pixels, row stride x1
// x8 is doubled so the first load picks every other row; those go through
// inv_dct_8h_x16_neon and are stored back in place. The rows in between
// (starting half a doubled stride further) go through
// inv_dct32_odd_8h_x16_neon and stay in v16-v31. The combine macro then merges
// both halves. x14 keeps the return address.
function inv_txfm_add_vert_dct_8x32_neon
        mov             x14, x30
        lsl             x8, x8, #1

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7, x7, x8, lsl #4      // rewind the 16 loads

        bl              inv_dct_8h_x16_neon

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x7], x8
.endr
        sub             x7, x7, x8, lsl #4
        add             x7, x7, x8, lsr #1      // move to the in-between rows

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        sub             x7, x7, x8, lsl #4
        sub             x7, x7, x8, lsr #1      // back to the stored results
        bl              inv_dct32_odd_8h_x16_neon

        neg             x9, x8                  // negative stride for the second half
        mov             x10, x6                 // x10 reads pixels ahead of the x6 writes
        // combine: handles four output rows. Each row loads a stored vector
        // from x7 (advancing by \stride) and 8 pixels from x10, applies \op
        // (sqadd or sqsub) with \r0..\r3, rounds with srshr #4, adds the
        // pixels (uaddw), narrows with unsigned saturation (sqxtun) and
        // stores 8 bytes at x6. The loads, arithmetic and stores of the four
        // rows are interleaved.
.macro combine r0, r1, r2, r3, op, stride
        ld1             {v5.8h}, [x7], \stride
        ld1             {v2.8b}, [x10], x1
        ld1             {v6.8h}, [x7], \stride
        ld1             {v3.8b}, [x10], x1
        \op             v5.8h, v5.8h, \r0
        ld1             {v7.8h}, [x7], \stride
        ld1             {v4.8b}, [x10], x1
        srshr           v5.8h, v5.8h, #4
        \op             v6.8h, v6.8h, \r1
        uaddw           v5.8h, v5.8h, v2.8b
        srshr           v6.8h, v6.8h, #4
        \op             v7.8h, v7.8h, \r2
        sqxtun          v2.8b, v5.8h
        ld1             {v5.8h}, [x7], \stride
        uaddw           v6.8h, v6.8h, v3.8b
        srshr           v7.8h, v7.8h, #4
        \op             v5.8h, v5.8h, \r3
        st1             {v2.8b}, [x6], x1
        ld1             {v2.8b}, [x10], x1
        sqxtun          v3.8b, v6.8h
        uaddw           v7.8h, v7.8h, v4.8b
        srshr           v5.8h, v5.8h, #4
        st1             {v3.8b}, [x6], x1
        sqxtun          v4.8b, v7.8h
        uaddw           v5.8h, v5.8h, v2.8b
        st1             {v4.8b}, [x6], x1
        sqxtun          v2.8b, v5.8h
        st1             {v2.8b}, [x6], x1
.endm
        // First 16 output rows: stored values walked forwards, added to
        // v31 ... v16.
        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
        sub             x7, x7, x8
        // Last 16 output rows: stored values walked backwards (stride x9),
        // with v16 ... v31 subtracted.
        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
.purgem combine

        ret             x14
endfunc

// Threshold tables. The functions below read them with ldrh and compare each
// entry against w3 to decide whether another slice has to be processed.
const eob_32x32
        .short 36, 136, 300, 1024
endconst

const eob_16x32
        .short 36, 151, 279, 512
endconst

const eob_16x32_shortside
        .short 36, 512
endconst

const eob_8x32
        .short 43, 107, 171, 256
endconst

// 32x32 identity/identity: processes the block as 8x8 tiles. Each tile is
// loaded from x2 (row stride 2*32 bytes, source cleared), transposed and
// added to the destination with load_add_store_8x8 (shiftbits=2).
// x12 walks eob_32x32 inside a row of tiles, x13 walks it across rows of
// tiles; processing stops as soon as w3 is below the next threshold.
function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
        movi            v0.8h, #0
        movrel          x13, eob_32x32

        mov             x8, #2*32
1:
        mov             w9, #0                  // columns handled in this row of tiles
        movrel          x12, eob_32x32
2:
        add             w9, w9, #8
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v0.8h}, [x2], x8
.endr
        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        load_add_store_8x8 x0, x7, shiftbits=2
        ldrh            w11, [x12], #2
        sub             x0, x0, x1, lsl #3      // back up 8 rows ...
        add             x0, x0, #8              // ... and step 8 pixels right
        cmp             w3, w11
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3, w11
        b.lt            9f

        sub             x0, x0, w9, uxtw        // back to the left edge
        add             x0, x0, x1, lsl #3      // down 8 rows
        msub            x2, x8, x9, x2          // x2 -= x8 * x9: rewind the source
        add             x2, x2, #2*8
        b               1b
9:
        ret
endfunc

// Applies "\op reg, reg, #\shift" to each of v16-v23.
.macro shift_8_regs op, shift
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        \op             \i, \i, #\shift
.endr
.endm

// def_identity_1632: emits inv_txfm_add_identity_identity_<w>x<h>_8bpc_neon
// for the 16x32 and 32x16 sizes. \wshort and \hshort are suffixes appended to
// "eob_16x32" to pick the threshold table used inside a row of tiles (x12)
// and across rows of tiles (x13). v1.h[0] = 2896*8 feeds scale_input,
// v1.h[1] = 2*(5793-4096)*8 feeds the identity macros. The tile loop has the
// same shape as in the 32x32 identity function.
.macro def_identity_1632 w, h, wshort, hshort
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        mov             w16, #2896*8
        mov             w17, #2*(5793-4096)*8
        dup             v1.4h, w16
        movi            v0.8h, #0
        mov             v1.h[1], w17
        movrel          x13, eob_16x32\hshort

        mov             x8, #2*\h
1:
        mov             w9, #0
        movrel          x12, eob_16x32\wshort
2:
        add             w9, w9, #8
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8
.endr
        scale_input     .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23

.if \w == 16
        // 16x32
        identity_8x8_shift1 v1.h[1]
.else
        // 32x16
        shift_8_regs    sqshl, 1
        identity_8x8    v1.h[1]
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

.if \w == 16
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=4
.endif
        ldrh            w11, [x12], #2
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8
        cmp             w3, w11
        b.ge            2b

        ldrh            w11, [x13], #2
        cmp             w3, w11
        b.lt            9f

        sub             x0, x0, w9, uxtw
        add             x0, x0, x1, lsl #3
        msub            x2, x8, x9, x2
        add             x2, x2, #2*8
        b               1b
9:
        ret
endfunc
.endm

def_identity_1632 16, 32, _shortside,
def_identity_1632 32, 16, , _shortside

// def_identity_832: emits inv_txfm_add_identity_identity_<w>x<h>_8bpc_neon
// for 8x32 and 32x8. One 8x8 tile per iteration; x13 walks eob_8x32 and the
// loop ends once w3 is below the threshold read for the current tile. The
// compare is issued before load_add_store_8x8 and its flags are consumed by
// the b.lt after it.
.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
        movi            v0.8h, #0
        movrel          x13, eob_8x32

        mov             w8, #2*\h
1:
        ldrh            w12, [x13], #2
.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
        ld1             {\i}, [x2]
        st1             {v0.8h}, [x2], x8
.endr

.if \w == 8
        // 8x32
        shift_8_regs    srshr, 1
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        cmp             w3, w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        // 8 wide: the destination already moved down 8 rows; rewind the
        // source and step to the next 8 coefficients.
        sub             x2, x2, x8, lsl #3
        add             x2, x2, #2*8
.else
        // 32 wide: move the destination back up and 8 pixels to the right.
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

// 32x32 DCT/DCT. Uses a 2048-byte stack buffer. The first pass runs
// inv_txfm_horz_dct_32x8_neon for up to four slices; before each slice after
// the first, w3 is compared with the next eob_32x32 entry and, when it is
// lower, the rest of the buffer is zero-filled instead. The second pass runs
// inv_txfm_add_vert_dct_8x32_neon on four 8-column strips.
function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
        idct_dc         32, 32, 2

        mov             x15, x30
        sub             sp, sp, #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 8, 16, 24
        add             x6, sp, #(\i*32*2)
.if \i > 0
        mov             w8, #(32 - \i)          // units left to clear if we bail out
        cmp             w3, w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7, x2, #(\i*2)
        mov             x8, #32*2               // x8 is reused as the stride
        bl              inv_txfm_horz_dct_32x8_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        // Each pass clears 4 * 64 bytes and accounts for 4 units of w8.
        subs            w8, w8, #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6, x0, #(\i)
        add             x7, sp, #(\i*2)
        mov             x8, #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp, sp, #2048
        ret             x15
endfunc

// 16x32 DCT/DCT. Same structure as the 32x32 function with a 1024-byte
// buffer: the first pass is inv_txfm_horz_scale_16x8_neon with x4 set to
// inv_dct_8h_x16_neon, thresholds come from eob_16x32, and the second pass
// covers two 8-column strips.
function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
        idct_dc         16, 32, 1

        mov             x15, x30
        sub             sp, sp, #1024
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4, inv_dct_8h_x16_neon

.irp i, 0, 8, 16, 24
        add             x6, sp, #(\i*16*2)
        add             x7, x2, #(\i*2)
.if \i > 0
        mov             w8, #(32 - \i)
        cmp             w3, w12
        b.lt            1f
.if \i < 24
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8, #2*32
        bl              inv_txfm_horz_scale_16x8_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        // Each pass clears 4 * 64 bytes and accounts for 8 units of w8.
        subs            w8, w8, #8
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x6, x0, #(\i)
        add             x7, sp, #(\i*2)
        mov             x8, #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp, sp, #1024
        ret             x15
endfunc

// 32x16 DCT/DCT. The first pass runs inv_txfm_horz_scale_dct_32x8_neon for up
// to two slices (the second one only when w3 >= 36); the second pass runs
// inv_txfm_add_vert_8x16_neon with x5 = inv_dct_8h_x16_neon on four strips.
function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
        idct_dc         32, 16, 1

        mov             x15, x30
        sub             sp, sp, #1024

        adr             x5, inv_dct_8h_x16_neon

.irp i, 0, 8
        add             x6, sp, #(\i*32*2)
        add             x7, x2, #(\i*2)
.if \i > 0
        mov             w8, #(16 - \i)
        cmp             w3, #36
        b.lt            1f
.endif
        mov             x8, #2*16
        bl              inv_txfm_horz_scale_dct_32x8_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        mov             x8, #32*2
.irp i, 0, 8, 16, 24
        add             x6, x0, #(\i)
        add             x7, sp, #(\i*2)
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp, sp, #1024
        ret             x15
endfunc

// 8x32 DCT/DCT. The first pass handles 8x8 slices in a loop: load (clearing
// the source), inv_dct_8h_x8_neon, srshr by 2, transpose, store 128 bytes to
// the 512-byte stack buffer. The loop repeats while w3 is at least the
// eob_8x32 entry read for the current slice; w9 counts what is left (32 down
// in steps of 8) and the remainder of the buffer is zero-filled. The second
// pass is a single inv_txfm_add_vert_dct_8x32_neon call.
function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
        idct_dc         8, 32, 2

        mov             x15, x30
        sub             sp, sp, #512

        movrel          x13, eob_8x32

        movi            v28.8h, #0
        mov             x8, #2*32
        mov             w9, #32
        mov             x6, sp
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x2]
        st1             {v28.8h}, [x2], x8
.endr
        ldrh            w12, [x13], #2
        sub             x2, x2, x8, lsl #3      // rewind the 8 loads ...
        sub             w9, w9, #8
        add             x2, x2, #2*8            // ... and step to the next slice

        bl              inv_dct_8h_x8_neon

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        srshr           v\i\().8h, v\i\().8h, #2
.endr

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
        cmp             w3, w12
        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64

        b.ge            1b
        cbz             w9, 3f                  // nothing left to clear

        movi            v29.8h, #0
        movi            v30.8h, #0
        movi            v31.8h, #0
2:
        // Each pass clears 2 * 64 bytes, i.e. one skipped slice.
        subs            w9, w9, #8
.rept 2
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
.endr
        b.gt            2b

3:
        mov             x6, x0
        mov             x7, sp
        mov             x8, #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp, sp, #512
        ret             x15
endfunc

// 32x8 DCT/DCT. One inv_txfm_horz_dct_32x8_neon call fills the 512-byte stack
// buffer; then four 8-column strips (w9 = 0, 8, 16, 24) are loaded with a
// stride of 2*32 bytes, run through inv_dct_8h_x8_neon and added to the
// destination with load_add_store_8x8. The compare against 32 is issued
// before that macro and consumed by the b.lt after it.
function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
        idct_dc         32, 8, 2

        mov             x15, x30
        sub             sp, sp, #512

        mov             x6, sp
        mov             x7, x2
        mov             x8, #8*2
        bl              inv_txfm_horz_dct_32x8_neon

        mov             x8, #2*32
        mov             w9, #0
1:
        add             x6, x0, x9
        add             x7, sp, x9, lsl #1 // #(\i*2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9, w9, #8

        bl              inv_dct_8h_x8_neon

        cmp             w9, #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp, sp, #512
        ret             x15
endfunc

// The body of this function continues past this span.
function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

        ld1             {v0.8h, v1.8h},
[x17], #32 sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a sqadd v24.8h, v16.8h, v17.8h // t32 sqsub v25.8h, v16.8h, v17.8h // t33 sqsub v26.8h, v19.8h, v18.8h // t34 sqadd v27.8h, v19.8h, v18.8h // t35 sqadd v28.8h, v20.8h, v21.8h // t60 sqsub v29.8h, v20.8h, v21.8h // t61 sqsub v30.8h, v23.8h, v22.8h // t62 sqadd v31.8h, v23.8h, v22.8h // t63 smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a neg v2.4s, v2.4s // t34a neg v3.4s, v3.4s // t34a smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a sqrshrn_sz v26, v2, v3, #12, .8h // t34a smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a sqrshrn_sz v29, v4, v5, #12, .8h // t61a sqrshrn_sz v25, v6, v7, #12, .8h // t33a sqrshrn_sz v30, v2, v3, #12, .8h // t62a sqadd v16.8h, v24.8h, v27.8h // t32a sqsub v19.8h, v24.8h, v27.8h // t35a sqadd v17.8h, v25.8h, v26.8h // t33 sqsub v18.8h, v25.8h, v26.8h // t34 sqsub v20.8h, v31.8h, v28.8h // t60a sqadd v23.8h, v31.8h, v28.8h // t63a sqsub v21.8h, v30.8h, v29.8h // t61 sqadd v22.8h, v30.8h, v29.8h // t62 smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 sqrshrn_sz v21, v2, v3, #12, .8h // t61a sqrshrn_sz v18, v4, v5, #12, .8h // t34a smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 sqrshrn_sz v20, v6, v7, #12, .8h // t60 sqrshrn_sz v19, v2, v3, #12, .8h // t35 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 ret endfunc function inv_dct64_step2_neon movrel x16, idct_coeffs ld1 {v0.4h}, [x16] 1: // 
t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a ldr q16, [x6, #2*8*0] // t32a ldr q17, [x9, #2*8*8] // t39a ldr q18, [x9, #2*8*0] // t63a ldr q19, [x6, #2*8*8] // t56a ldr q20, [x6, #2*8*16] // t40a ldr q21, [x9, #2*8*24] // t47a ldr q22, [x9, #2*8*16] // t55a ldr q23, [x6, #2*8*24] // t48a sqadd v24.8h, v16.8h, v17.8h // t32 sqsub v25.8h, v16.8h, v17.8h // t39 sqadd v26.8h, v18.8h, v19.8h // t63 sqsub v27.8h, v18.8h, v19.8h // t56 sqsub v28.8h, v21.8h, v20.8h // t40 sqadd v29.8h, v21.8h, v20.8h // t47 sqadd v30.8h, v23.8h, v22.8h // t48 sqsub v31.8h, v23.8h, v22.8h // t55 smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a sqrshrn_sz v25, v2, v3, #12, .8h // t56a sqrshrn_sz v27, v4, v5, #12, .8h // t39a neg v6.4s, v6.4s // t40a neg v7.4s, v7.4s // t40a smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a sqrshrn_sz v31, v6, v7, #12, .8h // t40a sqrshrn_sz v28, v2, v3, #12, .8h // t55a sqadd v16.8h, v24.8h, v29.8h // t32a sqsub v19.8h, v24.8h, v29.8h // t47a sqadd v17.8h, v27.8h, v31.8h // t39 sqsub v18.8h, v27.8h, v31.8h // t40 sqsub v20.8h, v26.8h, v30.8h // t48a sqadd v23.8h, v26.8h, v30.8h // t63a sqsub v21.8h, v25.8h, v28.8h // t55 sqadd v22.8h, v25.8h, v28.8h // t56 smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 sqrshrn_sz v18, v2, v3, #12, .8h // t40a sqrshrn_sz v21, v4, v5, #12, .8h // t55a smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 sqrshrn_sz v19, v6, v7, #12, .8h // t47 sqrshrn_sz v20, v2, v3, #12, .8h // t48 str q16, [x6, #2*8*0] // t32a str q17, [x9, #2*8*0] // t39 str q18, [x6, #2*8*8] // t40a str q19, [x9, #2*8*8] // t47 str q20, [x6, #2*8*16] // t48 str q21, [x9, #2*8*16] 
// t55a str q22, [x6, #2*8*24] // t56 str q23, [x9, #2*8*24] // t63a add x6, x6, #2*8 sub x9, x9, #2*8 cmp x6, x9 b.lt 1b ret endfunc .macro load8 src, strd, zero, clear .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h .if \clear ld1 {\i}, [\src] st1 {\zero}, [\src], \strd .else ld1 {\i}, [\src], \strd .endif .endr .endm .macro store16 dst .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h st1 {\i}, [\dst], #16 .endr .endm .macro clear_upper8 .irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr .endm .macro movi_if reg, val, cond .if \cond movi \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond mov \gpr, \val dup \reg, \gpr .endif .endm .macro st1_if regs, dst, cond .if \cond st1 \regs, \dst .endif .endm .macro str_if reg, dst, cond .if \cond str \reg, \dst .endif .endm .macro stroff_if reg, dst, dstoff, cond .if \cond str \reg, \dst, \dstoff .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 mov x14, x30 mov x6, sp lsl x8, x8, #2 movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.8h, \clear clear_upper8 sub x7, x7, x8, lsl #3 add x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct_8h_x16_neon store16 x6 movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.8h, \clear clear_upper8 sub x7, x7, x8, lsl #3 lsr x8, x8, #1 sub x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct32_odd_8h_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 mov x9, #-16 .macro store_addsub r0, r1, r2, r3 ld1 {v2.8h}, [x6], #16 ld1 {v3.8h}, [x6], #16 sqadd v6.8h, v2.8h, \r0 sqsub \r0, v2.8h, \r0 
ld1 {v4.8h}, [x6], #16 sqadd v7.8h, v3.8h, \r1 sqsub \r1, v3.8h, \r1 ld1 {v5.8h}, [x6], #16 sqadd v2.8h, v4.8h, \r2 sub x6, x6, #16*4 sqsub \r2, v4.8h, \r2 st1 {v6.8h}, [x6], #16 st1 {\r0}, [x10], x9 sqadd v3.8h, v5.8h, \r3 sqsub \r3, v5.8h, \r3 st1 {v7.8h}, [x6], #16 st1 {\r1}, [x10], x9 st1 {v2.8h}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.8h}, [x6], #16 st1 {\r3}, [x10], x9 .endm store_addsub v31.8h, v30.8h, v29.8h, v28.8h store_addsub v27.8h, v26.8h, v25.8h, v24.8h store_addsub v23.8h, v22.8h, v21.8h, v20.8h store_addsub v19.8h, v18.8h, v17.8h, v16.8h .purgem store_addsub add x6, x6, #2*8*16 movrel x17, idct64_coeffs movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear add x9, x7, x8, lsl #4 // offset 16 add x10, x7, x8, lsl #3 // offset 8 sub x9, x9, x8 // offset 15 sub x11, x10, x8 // offset 7 ld1 {v16.8h}, [x7] // in1 (offset 0) ld1 {v17.8h}, [x9] // in31 (offset 15) ld1 {v18.8h}, [x10] // in17 (offset 8) ld1 {v19.8h}, [x11] // in15 (offset 7) st1_if {v7.8h}, [x7], \clear st1_if {v7.8h}, [x9], \clear st1_if {v7.8h}, [x10], \clear st1_if {v7.8h}, [x11], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear add x7, x7, x8, lsl #2 // offset 4 sub x9, x9, x8, lsl #2 // offset 11 sub x10, x7, x8 // offset 3 add x11, x9, x8 // offset 12 ld1 {v16.8h}, [x10] // in7 (offset 3) ld1 {v17.8h}, [x11] // in25 (offset 12) ld1 {v18.8h}, [x9] // in23 (offset 11) ld1 {v19.8h}, [x7] // in9 (offset 4) st1_if {v7.8h}, [x7], \clear st1_if {v7.8h}, [x9], \clear st1_if {v7.8h}, [x10], \clear st1_if {v7.8h}, [x11], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear sub x10, x10, x8, lsl #1 // offset 1 sub x9, x9, x8, lsl #1 // offset 9 add x7, x7, x8 // offset 5 add x11, x11, x8 // offset 13 ldr q16, [x10, x8] // in5 (offset 2) ldr q17, [x11] // in27 (offset 13) ldr q18, [x9, x8] // in21 (offset 10) 
ldr q19, [x7] // in11 (offset 5) stroff_if q7, [x10, x8], \clear str_if q7, [x11], \clear stroff_if q7, [x9, x8], \clear str_if q7, [x7], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear ldr q16, [x10] // in3 (offset 1) ldr q17, [x11, x8] // in29 (offset 14) ldr q18, [x9] // in19 (offset 9) ldr q19, [x7, x8] // in13 (offset 6) str_if q7, [x10], \clear stroff_if q7, [x11, x8], \clear str_if q7, [x9], \clear stroff_if q7, [x7, x8], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon sub x6, x6, #2*8*32 add x9, x6, #2*8*7 bl inv_dct64_step2_neon ret x14 endfunc .endm def_dct64_func def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x8_neon mov x14, x30 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, #2*56 mov x10, #2*64 mov x11, #-2*8*4 dup v7.8h, w12 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 .macro store_addsub src0, src1, src2, src3 sqsub v1.8h, \src0, \src1 sqadd v0.8h, \src0, \src1 sqsub v3.8h, \src2, \src3 srshl v1.8h, v1.8h, v7.8h sqadd v2.8h, \src2, \src3 srshl v0.8h, v0.8h, v7.8h srshl v3.8h, v3.8h, v7.8h rev64 v1.8h, v1.8h srshl v2.8h, v2.8h, v7.8h rev64 v3.8h, v3.8h ext v1.16b, v1.16b, v1.16b, #8 st1 {v0.8h}, [x6], x10 ext v3.16b, v3.16b, v3.16b, #8 st1 {v1.8h}, [x9], x10 st1 {v2.8h}, [x6], x10 st1 {v3.8h}, [x9], x10 .endm store_addsub v16.8h, v31.8h, v17.8h, v30.8h store_addsub v18.8h, v29.8h, v19.8h, v28.8h store_addsub v20.8h, v27.8h, v21.8h, v26.8h store_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem store_addsub sub x6, x6, x10, lsl #3 sub x9, x9, x10, lsl #3 add x6, x6, #16 sub x9, x9, #16 cmp x7, x8 b.lt 1b ret 
x14 endfunc function inv_txfm_add_vert_dct_8x64_neon mov x14, x30 lsl x8, x8, #1 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, x1, lsl #6 sub x9, x9, x1 neg x10, x1 mov x11, #-2*8*4 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8b}, [x6], x1 ld1 {v1.8b}, [x9], x10 sqadd v4.8h, \src0, \src1 ld1 {v2.8b}, [x6] sqsub v5.8h, \src0, \src1 ld1 {v3.8b}, [x9] sqadd v6.8h, \src2, \src3 sqsub v7.8h, \src2, \src3 sub x6, x6, x1 sub x9, x9, x10 srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr v6.8h, v6.8h, #4 uaddw v4.8h, v4.8h, v0.8b srshr v7.8h, v7.8h, #4 uaddw v5.8h, v5.8h, v1.8b uaddw v6.8h, v6.8h, v2.8b sqxtun v0.8b, v4.8h uaddw v7.8h, v7.8h, v3.8b sqxtun v1.8b, v5.8h st1 {v0.8b}, [x6], x1 sqxtun v2.8b, v6.8h st1 {v1.8b}, [x9], x10 sqxtun v3.8b, v7.8h st1 {v2.8b}, [x6], x1 st1 {v3.8b}, [x9], x10 .endm add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem add_dest_addsub cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 sub_sp 64*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-2 // shift bl inv_txfm_dct_clear_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl 
inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 sub_sp 64*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-1 // shift bl inv_txfm_dct_clear_scale_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 sub_sp 32*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_scale_dct_32x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #32*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 sub_sp 64*16*2+64*8*2 add x4, sp, #64*8*2 movrel x13, eob_16x32 .irp i, 0, 8 add x6, x4, #(\i*64*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #16*2 mov x12, #-2 // shift bl inv_txfm_dct_clear_8h_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 8 ldrh w12, [x13], #2 .endif .endr 
b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: adr x5, inv_dct_8h_x16_neon mov x8, #64*2 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) bl inv_txfm_add_vert_8x16_neon .endr add sp, x4, #64*16*2 ret x15 endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 sub_sp 16*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, x5, #(\i*16*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_16x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #8 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #16*32*2 ret x15 endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/itx16.S000066400000000000000000004073071517466257200227550ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, // int bitdepth_max); // Most of the functions use the following register layout: // x0-x3 external parameters // x4 function pointer to first transform // x5 function pointer to second transform // x6 output parameter for helper function // x7 input parameter for helper function // x8 input stride for helper function // x9-x12 scratch variables for helper functions // x13 pointer to list of eob thresholds // x14 return pointer for helper function // x15 return pointer for main function // The SIMD registers most often use the following layout: // v0-v1 multiplication coefficients // v2-v7 scratch registers // v8-v15 unused // v16-v31 inputs/outputs of transforms const idct_coeffs, align=4 // idct4 .int 2896, 2896*8*(1<<16), 1567, 3784 // idct8 .int 799, 4017, 3406, 2276 // idct16 .int 401, 4076, 3166, 2598 .int 1931, 3612, 3920, 1189 // idct32 .int 201, 4091, 3035, 2751 .int 1751, 3703, 3857, 1380 .int 995, 3973, 3513, 2106 .int 2440, 
3290, 4052, 601 endconst const idct64_coeffs, align=4 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) .int 4076, 401, 4017, 799 .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) .int -3166, -2598, -799, -4017 .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) .int 3612, 1931, 2276, 3406 .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) .int -3920, -1189, -3406, -2276 endconst const iadst4_coeffs, align=4 .int 1321, 3803, 2482, 3344 endconst const iadst8_coeffs, align=4 .int 4076, 401, 3612, 1931 .int 2598, 3166, 1189, 3920 // idct_coeffs .int 2896, 0, 1567, 3784 endconst const iadst16_coeffs, align=4 .int 4091, 201, 3973, 995 .int 3703, 1751, 3290, 2440 .int 2751, 3035, 2106, 3513 .int 1380, 3857, 601, 4052 endconst .macro mul_mla d, s0, s1, c0, c1 mul \d\().4s, \s0\().4s, \c0 mla \d\().4s, \s1\().4s, \c1 .endm .macro mul_mls d, s0, s1, c0, c1 mul \d\().4s, \s0\().4s, \c0 mls \d\().4s, \s1\().4s, \c1 .endm .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 sqrdmulh \r0\sz, \r0\sz, \c sqrdmulh \r1\sz, \r1\sz, \c sqrdmulh \r2\sz, \r2\sz, \c sqrdmulh \r3\sz, \r3\sz, \c .ifnb \r4 sqrdmulh \r4\sz, \r4\sz, \c sqrdmulh \r5\sz, \r5\sz, \c sqrdmulh \r6\sz, \r6\sz, \c sqrdmulh \r7\sz, \r7\sz, \c .endif .endm .macro smin_4s r0, r1, r2 smin \r0\().4s, \r1\().4s, \r2\().4s .endm .macro smax_4s r0, r1, r2 smax \r0\().4s, \r1\().4s, \r2\().4s .endm .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 .endif .ifnb \shift srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc usqadd \adddst, \addsrc .endif .ifnb \min smin \min, \min, v7.8h .endif .ifnb \store st1 
{\store}, [\dst], x1 .endif .endm .macro load_add_store_8x16 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src load_add_store v3.8h, v17.8h, , , , , \dst, \src load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src load_add_store , , , , v27.8h, v26.8h, \dst, \src load_add_store , , , , , v27.8h, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits load_add_store 
v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits load_add_store , , , , , v19.8h, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits load_add_store , , , , , v5.8h, \dst, \src, \shiftbits .endm .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src .ifnb \load ld1 {\load}[0], [\src], x1 .endif .ifnb \inssrc ins \insdst\().d[1], \inssrc\().d[0] .endif .ifnb \shift srshr \shift, \shift, #4 .endif .ifnb \load ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc usqadd \adddst, \addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif .ifnb \min smin \min, \min, v7.8h .endif .ifnb \store st1 {\store}[1], [\dst], x1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store4 v0.d, v17, v16, , , , , , \dst, \src load_add_store4 v1.d, v19, v18, , , , , , \dst, \src load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, 
\dst, \src load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src load_add_store4 , , , , , , , v23.d, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store4 v0.d, v17, v16, , , , , , \dst, \src load_add_store4 v1.d, v19, v18, , , , , , \dst, \src load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src load_add_store4 , , , , , , , v3.d, \dst, \src .endm .macro idct_dc w, h, shift cbnz w3, 1f movz w16, #2896*8, lsl #16 ld1r {v16.4s}, [x2] dup v0.2s, w16 sqrdmulh v20.4s, v16.4s, v0.s[0] str wzr, [x2] .if (\w == 2*\h) || (2*\w == \h) sqrdmulh v20.4s, v20.4s, v0.s[0] .endif .if \shift > 0 sqrshrn v16.4h, v20.4s, #\shift sqrshrn2 v16.8h, v20.4s, #\shift .else sqxtn v16.4h, v20.4s sqxtn2 v16.8h, v20.4s .endif sqrdmulh v16.8h, v16.8h, v0.h[1] srshr v16.8h, v16.8h, #4 mov w4, #\h b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 ld1 {v1.d}[0], [x0], x1 subs w4, w4, #4 ld1 {v1.d}[1], [x0], x1 usqadd v0.8h, v16.8h sub x0, x0, x1, lsl #2 usqadd v1.8h, v16.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h st1 {v0.d}[1], [x0], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 b.gt 1b ret endfunc function 
idct_dc_w8_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h}, [x0], x1 subs w4, w4, #4 ld1 {v1.8h}, [x0], x1 usqadd v0.8h, v16.8h ld1 {v2.8h}, [x0], x1 usqadd v1.8h, v16.8h ld1 {v3.8h}, [x0], x1 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h sub x0, x0, x1, lsl #2 smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h st1 {v0.8h}, [x0], x1 smin v2.8h, v2.8h, v31.8h st1 {v1.8h}, [x0], x1 smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w16_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h}, [x0], x1 subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x0], x1 usqadd v0.8h, v16.8h usqadd v1.8h, v16.8h sub x0, x0, x1, lsl #1 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h st1 {v0.8h, v1.8h}, [x0], x1 smin v3.8h, v3.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w32_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w4, w4, #1 usqadd v0.8h, v16.8h usqadd v1.8h, v16.8h usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w64_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x1, x1, #64 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 subs w4, w4, #1 usqadd v0.8h, v16.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] usqadd v1.8h, v16.8h sub x0, x0, #64 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h usqadd v4.8h, v16.8h usqadd v5.8h, v16.8h usqadd v6.8h, v16.8h usqadd v7.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h smin v4.8h, v4.8h, v31.8h smin v5.8h, v5.8h, v31.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 smin v6.8h, v6.8h, v31.8h smin v7.8h, v7.8h, v31.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 b.gt 1b ret endfunc .macro iwht4 add v16.4s, v16.4s, v17.4s sub 
v21.4s, v18.4s, v19.4s sub v20.4s, v16.4s, v21.4s sshr v20.4s, v20.4s, #1 sub v18.4s, v20.4s, v17.4s sub v17.4s, v20.4s, v19.4s add v19.4s, v21.4s, v18.4s sub v16.4s, v16.4s, v17.4s .endm .macro idct_4 r0, r1, r2, r3 mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] srshr v6.4s, v6.4s, #12 srshr v2.4s, v2.4s, #12 srshr v7.4s, v4.4s, #12 srshr v3.4s, v3.4s, #12 sqadd \r0\().4s, v2.4s, v6.4s sqsub \r3\().4s, v2.4s, v6.4s sqadd \r1\().4s, v3.4s, v7.4s sqsub \r2\().4s, v3.4s, v7.4s .endm function inv_dct_4s_x4_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s}, [x16] idct_4 v16, v17, v18, v19 ret endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.4s}, [x16] sub v3.4s, v16.4s, v18.4s mul v4.4s, v16.4s, v0.s[0] mla v4.4s, v18.4s, v0.s[1] mla v4.4s, v19.4s, v0.s[2] mul v7.4s, v17.4s, v0.s[3] add v3.4s, v3.4s, v19.4s mul v5.4s, v16.4s, v0.s[2] mls v5.4s, v18.4s, v0.s[0] mls v5.4s, v19.4s, v0.s[1] add \o3\().4s, v4.4s, v5.4s mul \o2\().4s, v3.4s, v0.s[3] add \o0\().4s, v4.4s, v7.4s add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s srshr \o0\().4s, \o0\().4s, #12 srshr \o2\().4s, \o2\().4s, #12 srshr \o1\().4s, \o1\().4s, #12 srshr \o3\().4s, \o3\().4s, #12 .endm function inv_adst_4s_x4_neon AARCH64_VALID_CALL_TARGET iadst_4x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_4s_x4_neon AARCH64_VALID_CALL_TARGET iadst_4x4 v19, v18, v17, v16 ret endfunc function inv_identity_4s_x4_neon AARCH64_VALID_CALL_TARGET movz w16, #(5793-4096)*8, lsl #16 dup v0.2s, w16 sqrdmulh v4.4s, v16.4s, v0.s[0] sqrdmulh v5.4s, v17.4s, v0.s[0] sqrdmulh v6.4s, v18.4s, v0.s[0] sqrdmulh v7.4s, v19.4s, v0.s[0] sqadd v16.4s, v16.4s, v4.4s sqadd v17.4s, v17.4s, v5.4s sqadd v18.4s, v18.4s, v6.4s sqadd v19.4s, v19.4s, v7.4s ret endfunc function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 mov x15, x30 movi v30.4s, #0 movi v31.4s, #0 ld1 
{v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 sshr v16.4s, v16.4s, #2 sshr v17.4s, v17.4s, #2 sshr v18.4s, v18.4s, #2 sshr v19.4s, v19.4s, #2 iwht4 st1 {v30.4s, v31.4s}, [x2], #32 transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 iwht4 ld1 {v0.d}[0], [x0], x1 sqxtn v16.4h, v16.4s ld1 {v0.d}[1], [x0], x1 sqxtn2 v16.8h, v17.4s ld1 {v1.d}[0], [x0], x1 sqxtn v18.4h, v18.4s ld1 {v1.d}[1], [x0], x1 sqxtn2 v18.8h, v19.4s b L(itx_4x4_end) endfunc // HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers // x0-x4 external parameters // x5 function pointer to first transform // x6 function pointer to second transform function inv_txfm_add_4x4_neon movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 blr x5 st1 {v30.4s, v31.4s}, [x2], #32 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 blr x6 ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.d}[0], [x0], x1 ld1 {v1.d}[1], [x0], x1 srshr v16.8h, v16.8h, #4 srshr v18.8h, v18.8h, #4 L(itx_4x4_end): dup v31.8h, w4 sub x0, x0, x1, lsl #2 usqadd v0.8h, v16.8h usqadd v1.8h, v18.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h st1 {v0.d}[1], [x0], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct cbnz w3, 1f movz w16, #2896*8, lsl #16 ld1r {v16.4s}, [x2] dup v4.2s, w16 str wzr, [x2] sqrdmulh v16.4s, v16.4s, v4.s[0] ld1 {v0.d}[0], [x0], x1 sqxtn v20.4h, v16.4s sqxtn2 v20.8h, v16.4s ld1 {v0.d}[1], [x0], x1 sqrdmulh v20.8h, v20.8h, v4.h[1] ld1 {v1.d}[0], [x0], x1 srshr v16.8h, v20.8h, #4 ld1 {v1.d}[1], [x0], x1 srshr v18.8h, v20.8h, #4 movi v30.8h, #0 b L(itx_4x4_end) 1: .endif adr x5, inv_\txfm1\()_4s_x4_neon movrel 
x6, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4 \r0, \r2, \r4, \r6 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 smin_4s \r, \r, v5 .endr .irp r, \r0, \r2, \r4, \r6 smax_4s \r, \r, v4 .endr mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a srshr \r1\().4s, v2.4s, #12 // t4a srshr \r7\().4s, v3.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a srshr \r5\().4s, v7.4s, #12 // t6a sqadd v2.4s, \r1\().4s, \r3\().4s // t4 sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a sqadd v3.4s, \r7\().4s, \r5\().4s // t7 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a .irp r, v2, \r1, v3, \r3 smin_4s \r, \r, v5 .endr .irp r, v2, \r1, v3, \r3 smax_4s \r, \r, v4 .endr mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 srshr v7.4s, v7.4s, #12 // t5 srshr v6.4s, v6.4s, #12 // t6 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 sqadd \r1\().4s, \r2\().4s, v6.4s // out1 sqsub v6.4s, \r2\().4s, v6.4s // out6 sqadd \r2\().4s, \r4\().4s, v7.4s // out2 sqsub \r5\().4s, \r4\().4s, v7.4s // out5 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 mov \r6\().16b, v6.16b // out6 .endm function inv_dct_4s_x8_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 
{v0.4s, v1.4s}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23 ret endfunc .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 movrel x16, iadst8_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 mul_mla v2, v23, v16, v0.s[0], v0.s[1] mul_mls v4, v23, v16, v0.s[1], v0.s[0] mul_mla v6, v21, v18, v0.s[2], v0.s[3] srshr v16.4s, v2.4s, #12 // t0a srshr v23.4s, v4.4s, #12 // t1a mul_mls v2, v21, v18, v0.s[3], v0.s[2] mul_mla v4, v19, v20, v1.s[0], v1.s[1] srshr v18.4s, v6.4s, #12 // t2a srshr v21.4s, v2.4s, #12 // t3a mul_mls v6, v19, v20, v1.s[1], v1.s[0] mul_mla v2, v17, v22, v1.s[2], v1.s[3] srshr v20.4s, v4.4s, #12 // t4a srshr v19.4s, v6.4s, #12 // t5a mul_mls v4, v17, v22, v1.s[3], v1.s[2] srshr v22.4s, v2.4s, #12 // t6a srshr v17.4s, v4.4s, #12 // t7a ld1 {v0.4s}, [x16] movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff sqadd v2.4s, v16.4s, v20.4s // t0 sqsub v3.4s, v16.4s, v20.4s // t4 mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd v4.4s, v23.4s, v19.4s // t1 sqsub v5.4s, v23.4s, v19.4s // t5 sqadd v6.4s, v18.4s, v22.4s // t2 sqsub v7.4s, v18.4s, v22.4s // t6 sqadd v18.4s, v21.4s, v17.4s // t3 sqsub v19.4s, v21.4s, v17.4s // t7 .irp r, v2, v3, v4, v5, v6, v7, v18, v19 smin_4s \r, \r, v1 .endr .irp r, v2, v3, v4, v5, v6, v7, v18, v19 smax_4s \r, \r, v20 .endr mul_mla v16, v3, v5, v0.s[3], v0.s[2] mul_mls v20, v3, v5, v0.s[2], v0.s[3] mul_mls v22, v19, v7, v0.s[3], v0.s[2] srshr v3.4s, v16.4s, #12 // t4a srshr v5.4s, v20.4s, #12 // t5a mul_mla v16, v19, v7, v0.s[2], v0.s[3] srshr v7.4s, v22.4s, #12 // t6a srshr v19.4s, v16.4s, #12 // t7a sqadd \o0\().4s, v2.4s, v6.4s // out0 sqsub v2.4s, v2.4s, v6.4s // t2 sqadd \o7\().4s, v4.4s, v18.4s // out7 sqsub v4.4s, v4.4s, v18.4s // t3 mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd \o1\().4s, v3.4s, v7.4s // out1 sqsub v3.4s, v3.4s, v7.4s // t6 sqadd \o6\().4s, v5.4s, v19.4s // out6 sqsub v5.4s, v5.4s, v19.4s // t7 // Not clipping the output registers, as they will 
be downshifted and // narrowed afterwards anyway. .irp r, v2, v4, v3, v5 smin_4s \r, \r, v1 .endr .irp r, v2, v4, v3, v5 smax_4s \r, \r, v18 .endr sqneg \o7\().4s, \o7\().4s // out7 sqneg \o1\().4s, \o1\().4s // out1 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) srshr v2.4s, v18.4s, #12 // out3 mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) srshr v3.4s, v20.4s, #12 // out5 srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) sqneg \o3\().4s, v2.4s // out3 sqneg \o5\().4s, v3.4s // out5 .endm function inv_adst_4s_x8_neon AARCH64_VALID_CALL_TARGET iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 ret endfunc function inv_flipadst_4s_x8_neon AARCH64_VALID_CALL_TARGET iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x8_neon AARCH64_VALID_CALL_TARGET sqshl v16.4s, v16.4s, #1 sqshl v17.4s, v17.4s, #1 sqshl v18.4s, v18.4s, #1 sqshl v19.4s, v19.4s, #1 sqshl v20.4s, v20.4s, #1 sqshl v21.4s, v21.4s, #1 sqshl v22.4s, v22.4s, #1 sqshl v23.4s, v23.4s, #1 ret endfunc function inv_txfm_add_8x8_neon movi v31.4s, #0 cmp w3, w13 mov x11, #32 b.lt 1f add x6, x2, #16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v31.4s}, [x6], x11 .endr blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 sqrshrn2 v24.8h, v20.4s, #1 sqrshrn2 v25.8h, v21.4s, #1 sqrshrn2 v26.8h, v22.4s, #1 sqrshrn2 v27.8h, v23.4s, #1 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h movi \i, #0 .endr 2: .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x2] st1 {v31.4s}, [x2], x11 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 
v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 mov v20.16b, v24.16b mov v21.16b, v25.16b mov v22.16b, v26.16b mov v23.16b, v27.16b blr x5 load_add_store_8x8 x0, x7 ret x15 endfunc .macro def_fn_8x8 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif movrel x5, X(inv_\txfm2\()_8h_x8_neon) mov w13, #\eob_half adr x4, inv_\txfm1\()_4s_x8_neon b inv_txfm_add_8x8_neon endfunc .endm def_fn_8x8 dct, dct, 10 def_fn_8x8 identity, identity, 10 def_fn_8x8 dct, adst, 10 def_fn_8x8 dct, flipadst, 10 def_fn_8x8 dct, identity, 4 def_fn_8x8 adst, dct, 10 def_fn_8x8 adst, adst, 10 def_fn_8x8 adst, flipadst, 10 def_fn_8x8 flipadst, dct, 10 def_fn_8x8 flipadst, adst, 10 def_fn_8x8 flipadst, flipadst, 10 def_fn_8x8 identity, dct, 4 def_fn_8x8 adst, identity, 4 def_fn_8x8 flipadst, identity, 4 def_fn_8x8 identity, adst, 4 def_fn_8x8 identity, flipadst, 4 function inv_txfm_add_8x4_neon movi v28.4s, #0 movi v29.4s, #0 movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 movz w16, #2896*8, lsl #16 dup v0.2s, w16 ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s sqxtn v20.4h, v20.4s sqxtn v21.4h, v21.4s sqxtn v22.4h, v22.4s sqxtn v23.4h, v23.4s transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 ins v16.d[1], v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] blr x5 load_add_store_8x4 x0, x7 ret x15 endfunc function inv_txfm_add_4x8_neon movz w16, #2896*8, lsl #16 movi v31.4s, #0 dup v30.2s, w16 cmp w3, w13 mov x11, #32 b.lt 1f add x6, x2, #16 .irp i, v16.4s, 
v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v31.4s}, [x6], x11 .endr scale_input .4s, v30.s[0], v16, v17, v18, v19 blr x4 sqxtn v20.4h, v16.4s sqxtn v21.4h, v17.4s sqxtn v22.4h, v18.4s sqxtn v23.4h, v19.4s transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f 1: .irp i, v20, v21, v22, v23 movi \i\().4h, #0 .endr 2: .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x2] st1 {v31.4s}, [x2], x11 .endr scale_input .4s, v30.s[0], v16, v17, v18, v19 blr x4 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 load_add_store_4x8 x0, x7 ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif adr x4, inv_\txfm1\()_4s_x\w\()_neon .if \w == 4 mov w13, #\eob_half .endif movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct, 13 def_fn_48 \w, \h, identity, identity, 13 def_fn_48 \w, \h, dct, adst, 13 def_fn_48 \w, \h, dct, flipadst, 13 def_fn_48 \w, \h, dct, identity, 4 def_fn_48 \w, \h, adst, dct, 13 def_fn_48 \w, \h, adst, adst, 13 def_fn_48 \w, \h, adst, flipadst, 13 def_fn_48 \w, \h, flipadst, dct, 13 def_fn_48 \w, \h, flipadst, adst, 13 def_fn_48 \w, \h, flipadst, flipadst, 13 def_fn_48 \w, \h, identity, dct, 16 def_fn_48 \w, \h, adst, identity, 4 def_fn_48 \w, \h, flipadst, identity, 4 def_fn_48 \w, \h, identity, adst, 16 def_fn_48 \w, \h, identity, flipadst, 16 .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_dct_4s_x16_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 idct_8 v16, v18, v20, v22, v24, v26, v28, v30 // idct_8 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v18, v20, v22, v24, v26, v28, v30 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v16, v18, v20, v22, v24, v26, v28, v30 smax 
\r\().4s, \r\().4s, v4.4s .endr ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #32 mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a srshr v17.4s, v2.4s, #12 // t8a srshr v31.4s, v3.4s, #12 // t15a mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a srshr v23.4s, v6.4s, #12 // t9a srshr v25.4s, v2.4s, #12 // t14a mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a srshr v21.4s, v3.4s, #12 // t10a srshr v27.4s, v6.4s, #12 // t13a mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a srshr v19.4s, v2.4s, #12 // t11a srshr v29.4s, v3.4s, #12 // t12a ld1 {v0.4s}, [x16] sqsub v2.4s, v17.4s, v23.4s // t9 sqadd v17.4s, v17.4s, v23.4s // t8 sqsub v3.4s, v31.4s, v25.4s // t14 sqadd v31.4s, v31.4s, v25.4s // t15 sqsub v23.4s, v19.4s, v21.4s // t10 sqadd v19.4s, v19.4s, v21.4s // t11 sqadd v25.4s, v29.4s, v27.4s // t12 sqsub v29.4s, v29.4s, v27.4s // t13 .irp r, v2, v17, v3, v31, v23, v19, v25, v29 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v17, v3, v31, v23, v19, v25, v29 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a srshr v21.4s, v7.4s, #12 // t9a srshr v27.4s, v6.4s, #12 // t14a mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a srshr v29.4s, v7.4s, #12 // t13a neg v6.4s, v6.4s srshr v23.4s, v6.4s, #12 // t10a sqsub v2.4s, v17.4s, v19.4s // t11a sqadd v17.4s, v17.4s, v19.4s // t8a sqsub v3.4s, v31.4s, v25.4s // t12a sqadd v31.4s, v31.4s, v25.4s // t15a sqadd v19.4s, v21.4s, v23.4s // t9 sqsub v21.4s, v21.4s, v23.4s // t10 sqsub v25.4s, v27.4s, v29.4s // t13 sqadd v27.4s, v27.4s, v29.4s // t14 .irp r, v2, v17, v3, v31, v19, v21, v25, v27 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v17, v3, v31, v19, v21, v25, v27 smax \r\().4s, \r\().4s, v4.4s .endr 
mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a srshr v7.4s, v7.4s, #12 // t11 srshr v6.4s, v6.4s, #12 // t12 mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a srshr v3.4s, v3.4s, #12 // t13a sqadd v1.4s, v16.4s, v31.4s // out0 sqsub v31.4s, v16.4s, v31.4s // out15 mov v16.16b, v1.16b sqadd v23.4s, v30.4s, v17.4s // out7 sqsub v1.4s, v30.4s, v17.4s // out8 sqadd v17.4s, v18.4s, v27.4s // out1 sqsub v30.4s, v18.4s, v27.4s // out14 sqadd v18.4s, v20.4s, v3.4s // out2 sqsub v29.4s, v20.4s, v3.4s // out13 sqadd v3.4s, v28.4s, v19.4s // out6 sqsub v25.4s, v28.4s, v19.4s // out9 sqadd v19.4s, v22.4s, v6.4s // out3 sqsub v28.4s, v22.4s, v6.4s // out12 sqadd v20.4s, v24.4s, v7.4s // out4 sqsub v27.4s, v24.4s, v7.4s // out11 sqadd v21.4s, v26.4s, v2.4s // out5 sqsub v26.4s, v26.4s, v2.4s // out10 mov v24.16b, v1.16b mov v22.16b, v3.16b ret endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel x16, iadst16_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 srshr v16.4s, v2.4s, #12 // t0 srshr v31.4s, v4.4s, #12 // t1 mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 srshr v18.4s, v6.4s, #12 // t2 srshr v29.4s, v2.4s, #12 // t3 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 srshr v20.4s, v4.4s, #12 // t4 srshr v27.4s, v6.4s, #12 // t5 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 ld1 {v0.4s, v1.4s}, [x16] movrel x16, idct_coeffs mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 srshr v22.4s, v2.4s, #12 // t6 srshr v25.4s, v4.4s, #12 // t7 mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 srshr v23.4s, v6.4s, #12 // t8 srshr v24.4s, 
v2.4s, #12 // t9 mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 srshr v21.4s, v4.4s, #12 // t10 srshr v26.4s, v6.4s, #12 // t11 mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 srshr v19.4s, v2.4s, #12 // t12 srshr v28.4s, v4.4s, #12 // t13 mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 srshr v17.4s, v6.4s, #12 // t14 srshr v30.4s, v2.4s, #12 // t15 ld1 {v0.4s, v1.4s}, [x16] movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqsub v2.4s, v16.4s, v23.4s // t8a sqadd v16.4s, v16.4s, v23.4s // t0a sqsub v3.4s, v31.4s, v24.4s // t9a sqadd v31.4s, v31.4s, v24.4s // t1a sqadd v23.4s, v18.4s, v21.4s // t2a sqsub v18.4s, v18.4s, v21.4s // t10a sqadd v24.4s, v29.4s, v26.4s // t3a sqsub v29.4s, v29.4s, v26.4s // t11a sqadd v21.4s, v20.4s, v19.4s // t4a sqsub v20.4s, v20.4s, v19.4s // t12a sqadd v26.4s, v27.4s, v28.4s // t5a sqsub v27.4s, v27.4s, v28.4s // t13a sqadd v19.4s, v22.4s, v17.4s // t6a sqsub v22.4s, v22.4s, v17.4s // t14a sqadd v28.4s, v25.4s, v30.4s // t7a sqsub v25.4s, v25.4s, v30.4s // t15a .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 smin_4s \r, \r, v5 .endr .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 smax_4s \r, \r, v7 .endr mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 srshr v17.4s, v4.4s, #12 // t8 srshr v30.4s, v6.4s, #12 // t9 mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 srshr v18.4s, v2.4s, #12 // t10 srshr v29.4s, v4.4s, #12 // t11 mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 srshr v27.4s, v6.4s, #12 // t12 srshr v20.4s, v2.4s, #12 // t13 mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 srshr 
v25.4s, v4.4s, #12 // t14 srshr v22.4s, v6.4s, #12 // t15 sqsub v2.4s, v16.4s, v21.4s // t4 sqadd v16.4s, v16.4s, v21.4s // t0 sqsub v3.4s, v31.4s, v26.4s // t5 sqadd v31.4s, v31.4s, v26.4s // t1 sqadd v21.4s, v23.4s, v19.4s // t2 sqsub v23.4s, v23.4s, v19.4s // t6 sqadd v26.4s, v24.4s, v28.4s // t3 sqsub v24.4s, v24.4s, v28.4s // t7 sqadd v19.4s, v17.4s, v27.4s // t8a sqsub v17.4s, v17.4s, v27.4s // t12a sqadd v28.4s, v30.4s, v20.4s // t9a sqsub v30.4s, v30.4s, v20.4s // t13a sqadd v27.4s, v18.4s, v25.4s // t10a sqsub v18.4s, v18.4s, v25.4s // t14a sqadd v20.4s, v29.4s, v22.4s // t11a sqsub v29.4s, v29.4s, v22.4s // t15a .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 smin_4s \r, \r, v5 .endr .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 smax_4s \r, \r, v7 .endr mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a srshr v22.4s, v4.4s, #12 // t4a srshr v25.4s, v6.4s, #12 // t5a mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 srshr v24.4s, v2.4s, #12 // t6a srshr v23.4s, v4.4s, #12 // t7a mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 srshr v17.4s, v6.4s, #12 // t12 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 srshr v29.4s, v2.4s, #12 // t13 srshr v30.4s, v4.4s, #12 // t14 srshr v18.4s, v6.4s, #12 // t15 sqsub v2.4s, v16.4s, v21.4s // t2a .ifc \o0, v16 sqadd \o0\().4s, v16.4s, v21.4s // out0 sqsub v21.4s, v31.4s, v26.4s // t3a sqadd \o15\().4s, v31.4s, v26.4s // out15 .else sqadd v4.4s, v16.4s, v21.4s // out0 sqsub v21.4s, v31.4s, v26.4s // t3a sqadd \o15\().4s, v31.4s, v26.4s // out15 mov \o0\().16b, v4.16b .endif sqsub v3.4s, v29.4s, v18.4s // t15a sqadd \o13\().4s, v29.4s, v18.4s // out13 sqadd \o2\().4s, v17.4s, v30.4s // out2 sqsub v26.4s, v17.4s, v30.4s // t14a sqadd \o1\().4s, v19.4s, v27.4s 
// out1 sqsub v27.4s, v19.4s, v27.4s // t10 sqadd \o14\().4s, v28.4s, v20.4s // out14 sqsub v20.4s, v28.4s, v20.4s // t11 sqadd \o3\().4s, v22.4s, v24.4s // out3 sqsub v22.4s, v22.4s, v24.4s // t6 sqadd \o12\().4s, v25.4s, v23.4s // out12 sqsub v23.4s, v25.4s, v23.4s // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. .irp r, v2, v21, v3, v26, v27, v20, v22, v23 smin_4s \r, \r, v5 .endr .irp r, v2, v21, v3, v26, v27, v20, v22, v23 smax_4s \r, \r, v7 .endr sqneg \o15\().4s, \o15\().4s // out15 sqneg \o13\().4s, \o13\().4s // out13 sqneg \o1\().4s, \o1\().4s // out1 sqneg \o3\().4s, \o3\().4s // out3 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) srshr v24.4s, v24.4s, #12 // out8 srshr v4.4s, v4.4s, #12 // out7 srshr v5.4s, v6.4s, #12 // out5 mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) srshr v26.4s, v6.4s, #12 // out10 mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) srshr \o4\().4s, v2.4s, #12 // out4 srshr v6.4s, v6.4s, #12 // out11 srshr v7.4s, v21.4s, #12 // out9 srshr \o6\().4s, v22.4s, #12 // out6 .ifc \o8, v23 mov \o8\().16b, v24.16b mov \o10\().16b, v26.16b .endif sqneg \o7\().4s, v4.4s // out7 sqneg \o5\().4s, v5.4s // out5 sqneg \o11\().4s, v6.4s // out11 sqneg \o9\().4s, v7.4s // out9 .endm function inv_adst_4s_x16_neon AARCH64_VALID_CALL_TARGET iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 ret endfunc function inv_flipadst_4s_x16_neon AARCH64_VALID_CALL_TARGET iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x16_neon 
AARCH64_VALID_CALL_TARGET movz w16, #2*(5793-4096)*8, lsl #16 dup v0.2s, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 sqrdmulh v2.4s, v\i\().4s, v0.s[0] sqadd v\i\().4s, v\i\().4s, v\i\().4s sqadd v\i\().4s, v\i\().4s, v2.4s .endr ret endfunc .macro identity_4x16_shift1 c .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s sqrdmulh v3.4s, \i, \c srshr v3.4s, v3.4s, #1 sqadd \i, \i, v3.4s .endr .endm .macro identity_4x16 c .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s sqrdmulh v3.4s, \i, \c sqadd \i, \i, \i sqadd \i, \i, v3.4s .endr .endm .macro def_horz_16 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x4_neon mov x14, x30 movi v7.4s, #0 .if \scale movz w16, #2896*8, lsl #16 dup v0.2s, w16 .endif .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr .if \scale scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif blr x4 sqrshrn v16.4h, v16.4s, #\shift sqrshrn v17.4h, v17.4s, #\shift sqrshrn v18.4h, v18.4s, #\shift sqrshrn v19.4h, v19.4s, #\shift sqrshrn2 v16.8h, v20.4s, #\shift sqrshrn2 v17.8h, v21.4s, #\shift sqrshrn2 v18.8h, v22.4s, #\shift sqrshrn2 v19.8h, v23.4s, #\shift sqrshrn v20.4h, v24.4s, #\shift sqrshrn v21.4h, v25.4s, #\shift sqrshrn v22.4h, v26.4s, #\shift sqrshrn v23.4h, v27.4s, #\shift sqrshrn2 v20.8h, v28.4s, #\shift sqrshrn2 v21.8h, v29.4s, #\shift sqrshrn2 v22.8h, v30.4s, #\shift sqrshrn2 v23.8h, v31.4s, #\shift .if \scale b L(horz_16x4_epilog) .else L(horz_16x4_epilog): transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 .irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, 
v23.8h st1 {\i}, [x6], #16 .endr ret x14 .endif endfunc .endm def_horz_16 scale=1, shift=1, suffix=_scale def_horz_16 scale=0, shift=2 function inv_txfm_add_vert_8x16_neon mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr blr x5 load_add_store_8x16 x6, x7 ret x14 endfunc function inv_txfm_add_16x16_neon mov x15, x30 sub sp, sp, #512 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12 add x6, sp, #(\i*16*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .if \i < 12 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #16*4 bl inv_txfm_horz_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #512 ret x15 endfunc const eob_16x16 .short 10, 36, 78, 256 endconst const eob_16x16_identity .short 4, 8, 12, 256 endconst .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif adr x4, inv_\txfm1\()_4s_x16_neon movrel x5, X(inv_\txfm2\()_8h_x16_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_16x16 .else movrel x13, eob_16x16_identity .endif .else .ifc \txfm2, identity movrel x13, eob_16x16_identity .else movrel x13, eob_16x16 .endif .endif b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct def_fn_16x16 identity, identity def_fn_16x16 dct, adst def_fn_16x16 dct, flipadst def_fn_16x16 dct, identity def_fn_16x16 adst, dct def_fn_16x16 adst, adst def_fn_16x16 adst, flipadst def_fn_16x16 flipadst, dct def_fn_16x16 flipadst, adst def_fn_16x16 flipadst, flipadst def_fn_16x16 identity, dct function inv_txfm_add_16x4_neon mov x15, x30 movi v4.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, 
v29.4s, v30.4s, v31.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], #16 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x4 x6, x7 sqrshrn v16.4h, v24.4s, #1 sqrshrn v17.4h, v25.4s, #1 sqrshrn v18.4h, v26.4s, #1 sqrshrn v19.4h, v27.4s, #1 sqrshrn2 v16.8h, v28.4s, #1 sqrshrn2 v17.8h, v29.4s, #1 sqrshrn2 v18.8h, v30.4s, #1 sqrshrn2 v19.8h, v31.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 add x6, x0, #16 load_add_store_8x4 x6, x7 ret x15 endfunc function inv_txfm_add_4x16_neon ldrh w12, [x13, #4] mov x15, x30 mov x11, #64 cmp w3, w12 ldrh w12, [x13, #2] b.lt 1f add x6, x2, #48 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v28.4h, v16.4s, #1 sqrshrn v29.4h, v17.4s, #1 sqrshrn v30.4h, v18.4s, #1 sqrshrn v31.4h, v19.4s, #1 transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 b 2f 1: .irp i, v28.4h, v29.4h, v30.4h, v31.4h movi \i, #0 .endr 2: cmp w3, w12 ldrh w12, [x13, #0] b.lt 1f add x6, x2, #32 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 b 2f 1: .irp i, v24.4h, v25.4h, v26.4h, v27.4h movi \i, #0 .endr 2: cmp w3, w12 b.lt 1f add x6, x2, #16 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v20.4h, v16.4s, #1 sqrshrn v21.4h, v17.4s, #1 sqrshrn v22.4h, v18.4s, #1 sqrshrn v23.4h, v19.4s, #1 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f 1: .irp i, v20.4h, v21.4h, v22.4h, v23.4h movi \i, #0 .endr 2: movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 
{\i}, [x2] st1 {v2.4s}, [x2], x11 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 load_add_store_4x16 x0, x6 ret x15 endfunc const eob_4x16 .short 13, 29, 45, 64 endconst const eob_4x16_identity1 .short 16, 32, 48, 64 endconst const eob_4x16_identity2 .short 4, 8, 12, 64 endconst .macro def_fn_416 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_4x16 .else movrel x13, eob_4x16_identity1 .endif .else .ifc \txfm2, identity movrel x13, eob_4x16_identity2 .else movrel x13, eob_4x16 .endif .endif .else adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct def_fn_416 \w, \h, identity, identity def_fn_416 \w, \h, dct, adst def_fn_416 \w, \h, dct, flipadst def_fn_416 \w, \h, dct, identity def_fn_416 \w, \h, adst, dct def_fn_416 \w, \h, adst, adst def_fn_416 \w, \h, adst, flipadst def_fn_416 \w, \h, flipadst, dct def_fn_416 \w, \h, flipadst, adst def_fn_416 \w, \h, flipadst, flipadst def_fn_416 \w, \h, identity, dct def_fn_416 \w, \h, adst, identity def_fn_416 \w, \h, flipadst, identity def_fn_416 \w, \h, identity, adst def_fn_416 \w, \h, identity, flipadst .endm def_fns_416 4, 16 def_fns_416 16, 4 function inv_txfm_add_16x8_neon mov x15, x30 stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] cmp w3, w13 mov x11, #32 b.lt 1f movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 add x6, x2, #16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 blr x4 sqrshrn v8.4h, v16.4s, #1 sqrshrn v9.4h, v17.4s, #1 sqrshrn v10.4h, v18.4s, #1 sqrshrn v11.4h, v19.4s, #1 sqrshrn2 v8.8h, v20.4s, #1 sqrshrn2 v9.8h, v21.4s, #1 sqrshrn2 v10.8h, v22.4s, #1 sqrshrn2 v11.8h, v23.4s, #1 sqrshrn v12.4h, v24.4s, #1 sqrshrn v13.4h, v25.4s, #1 sqrshrn v14.4h, v26.4s, #1 sqrshrn v15.4h, v27.4s, #1 sqrshrn2 v12.8h, v28.4s, #1 sqrshrn2 v13.8h, v29.4s, #1 sqrshrn2 v14.8h, v30.4s, #1 sqrshrn2 v15.8h, v31.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 b 2f 1: .irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h movi \i, #0 .endr 2: movz w16, #2896*8, lsl #16 dup v0.2s, w16 movi v4.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 mov v20.16b, v8.16b mov v21.16b, v9.16b mov v22.16b, v10.16b mov v23.16b, v11.16b transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 sqrshrn v8.4h, v24.4s, #1 sqrshrn v9.4h, v25.4s, #1 sqrshrn v10.4h, v26.4s, #1 sqrshrn v11.4h, v27.4s, #1 sqrshrn2 v8.8h, v28.4s, #1 sqrshrn2 
v9.8h, v29.4s, #1 sqrshrn2 v10.8h, v30.4s, #1 sqrshrn2 v11.8h, v31.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x8 x6, x7 mov v16.16b, v8.16b mov v17.16b, v9.16b mov v18.16b, v10.16b mov v19.16b, v11.16b mov v20.16b, v12.16b mov v21.16b, v13.16b mov v22.16b, v14.16b mov v23.16b, v15.16b blr x5 add x0, x0, #16 load_add_store_8x8 x0, x7 ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x15 endfunc function inv_txfm_add_8x16_neon mov x15, x30 stp d8, d9, [sp, #-0x20]! stp d10, d11, [sp, #0x10] ldrh w12, [x13, #4] mov x11, #64 cmp w3, w12 ldrh w12, [x13, #2] b.lt 1f add x6, x2, #48 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v28.4h, v16.4s, #1 sqrshrn v29.4h, v17.4s, #1 sqrshrn v30.4h, v18.4s, #1 sqrshrn v31.4h, v19.4s, #1 sqrshrn2 v28.8h, v20.4s, #1 sqrshrn2 v29.8h, v21.4s, #1 sqrshrn2 v30.8h, v22.4s, #1 sqrshrn2 v31.8h, v23.4s, #1 transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 b 2f 1: .irp i, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr 2: cmp w3, w12 ldrh w12, [x13, #0] b.lt 1f add x6, x2, #32 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 sqrshrn2 v24.8h, v20.4s, #1 sqrshrn2 v25.8h, v21.4s, #1 sqrshrn2 v26.8h, v22.4s, #1 sqrshrn2 v27.8h, v23.4s, #1 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h movi \i, #0 .endr 2: cmp w3, w12 b.lt 1f add x6, x2, #16 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v8.4h, v16.4s, #1 sqrshrn v9.4h, v17.4s, #1 sqrshrn v10.4h, v18.4s, #1 sqrshrn v11.4h, v19.4s, #1 sqrshrn2 v8.8h, v20.4s, #1 sqrshrn2 v9.8h, v21.4s, #1 sqrshrn2 v10.8h, v22.4s, #1 sqrshrn2 v11.8h, v23.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 b 2f 1: .irp i, v8.8h, v9.8h, v10.8h, v11.8h movi \i, #0 .endr 2: movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 mov v20.16b, v8.16b mov v21.16b, v9.16b mov v22.16b, v10.16b mov v23.16b, v11.16b blr x5 load_add_store_8x16 x0, x6 ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x20 ret x15 endfunc const eob_8x16 .short 10, 43, 75, 128 endconst const eob_8x16_identity1 .short 4, 64, 96, 128 endconst const eob_8x16_identity2 .short 4, 8, 12, 128 endconst .macro def_fn_816 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_8x16 .else movrel x13, eob_8x16_identity1 .endif .else .ifc \txfm2, identity movrel x13, eob_8x16_identity2 .else movrel x13, eob_8x16 .endif .endif .if \h == 8 ldrh w13, [x13] .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct def_fn_816 \w, \h, identity, identity 
def_fn_816 \w, \h, dct, adst def_fn_816 \w, \h, dct, flipadst def_fn_816 \w, \h, dct, identity def_fn_816 \w, \h, adst, dct def_fn_816 \w, \h, adst, adst def_fn_816 \w, \h, adst, flipadst def_fn_816 \w, \h, flipadst, dct def_fn_816 \w, \h, flipadst, adst def_fn_816 \w, \h, flipadst, flipadst def_fn_816 \w, \h, identity, dct def_fn_816 \w, \h, adst, identity def_fn_816 \w, \h, flipadst, identity def_fn_816 \w, \h, identity, adst def_fn_816 \w, \h, identity, flipadst .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_4s_x16_neon movrel x16, idct_coeffs, 4*16 ld1 {v0.4s, v1.4s}, [x16], #32 mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a srshr v16.4s, v2.4s, #12 // t16a srshr v31.4s, v4.4s, #12 // t31a mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a srshr v24.4s, v6.4s, #12 // t17a srshr v23.4s, v2.4s, #12 // t30a mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a srshr v20.4s, v4.4s, #12 // t18a srshr v27.4s, v6.4s, #12 // t29a mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #4*24 mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a srshr v28.4s, v2.4s, #12 // t19a srshr v19.4s, v4.4s, #12 // t28a mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a srshr v18.4s, v6.4s, #12 // t20a srshr v29.4s, v2.4s, #12 // t27a mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a srshr v26.4s, v4.4s, #12 // t21a srshr v21.4s, v6.4s, #12 // t26a mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a srshr v22.4s, v2.4s, #12 // t22a srshr v25.4s, v4.4s, #12 // t25a mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a srshr v30.4s, v6.4s, #12 // t23a srshr v17.4s, v2.4s, #12 // t24a ld1 {v0.4s, 
v1.4s}, [x16] movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 sqadd v31.4s, v31.4s, v23.4s // t31 sqsub v24.4s, v28.4s, v20.4s // t18 sqadd v28.4s, v28.4s, v20.4s // t19 sqadd v23.4s, v18.4s, v26.4s // t20 sqsub v18.4s, v18.4s, v26.4s // t21 sqsub v20.4s, v30.4s, v22.4s // t22 sqadd v30.4s, v30.4s, v22.4s // t23 sqadd v26.4s, v17.4s, v25.4s // t24 sqsub v17.4s, v17.4s, v25.4s // t25 sqsub v22.4s, v29.4s, v21.4s // t26 sqadd v29.4s, v29.4s, v21.4s // t27 sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a srshr v21.4s, v7.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a srshr v24.4s, v7.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a neg v7.4s, v7.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a srshr v17.4s, v7.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a sqsub v2.4s, v27.4s, v24.4s // t29 sqadd v27.4s, v27.4s, v24.4s // t30 sqsub v3.4s, v21.4s, v19.4s // t18 sqadd v21.4s, v21.4s, v19.4s // t17 sqsub v24.4s, v16.4s, v28.4s // t19a sqadd v16.4s, v16.4s, v28.4s // t16a sqsub v19.4s, v30.4s, v23.4s // t20a sqadd v30.4s, v30.4s, v23.4s // t23a sqsub v28.4s, v17.4s, 
v22.4s // t21 sqadd v17.4s, v17.4s, v22.4s // t22 sqadd v23.4s, v26.4s, v29.4s // t24a sqsub v26.4s, v26.4s, v29.4s // t27a sqadd v22.4s, v20.4s, v18.4s // t25 sqsub v20.4s, v20.4s, v18.4s // t26 sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 srshr v18.4s, v7.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 srshr v24.4s, v7.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 neg v7.4s, v7.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a srshr v20.4s, v7.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, v30.4s // t23 sqadd v16.4s, v16.4s, v30.4s // t16 = out16 sqsub v3.4s, v31.4s, v23.4s // t24 sqadd v31.4s, v31.4s, v23.4s // t31 = out31 sqsub v23.4s, v21.4s, v17.4s // t22a sqadd v17.4s, v21.4s, v17.4s // t17a = out17 sqadd v30.4s, v27.4s, v22.4s // t30a = out30 sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 sqadd v7.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a mov v19.16b, v7.16b // out19 .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 smin \r\().4s, 
\r\().4s, v5.4s .endr .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 srshr v20.4s, v7.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 srshr v26.4s, v7.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 srshr v25.4s, v7.4s, #12 // t25 mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a srshr v23.4s, v7.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x4_neon mov x14, x30 movi v7.4s, #0 lsl x8, x8, #1 .if \scale movz w16, #2896*8, lsl #16 dup v0.2s, w16 .endif .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .if \scale scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon // idct_16 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 .macro store1 r0, r1, r2, r3 st1 {\r0}, [x6], #16 st1 {\r1}, [x6], #16 st1 {\r2}, 
[x6], #16 st1 {\r3}, [x6], #16 .endm store1 v16.4s, v20.4s, v24.4s, v28.4s store1 v17.4s, v21.4s, v25.4s, v29.4s store1 v18.4s, v22.4s, v26.4s, v30.4s store1 v19.4s, v23.4s, v27.4s, v31.4s .purgem store1 sub x6, x6, #64*4 movi v7.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in v0.s[1] scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct32_odd_4s_x16_neon transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 .macro store2 r0, r1, r2, r3, shift ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] sqsub v4.4s, v0.4s, \r0 sqadd v0.4s, v0.4s, \r0 sqsub v5.4s, v1.4s, \r1 sqadd v1.4s, v1.4s, \r1 sqsub v6.4s, v2.4s, \r2 sqadd v2.4s, v2.4s, \r2 sqsub v7.4s, v3.4s, \r3 sqadd v3.4s, v3.4s, \r3 sqrshrn v0.4h, v0.4s, #\shift sqrshrn2 v0.8h, v1.4s, #\shift sqrshrn v1.4h, v2.4s, #\shift sqrshrn2 v1.8h, v3.4s, #\shift sqrshrn v2.4h, v7.4s, #\shift sqrshrn2 v2.8h, v6.4s, #\shift sqrshrn v3.4h, v5.4s, #\shift sqrshrn2 v3.8h, v4.4s, #\shift st1 {v0.8h, v1.8h}, [x6], #32 rev64 v2.8h, v2.8h rev64 v3.8h, v3.8h st1 {v2.8h, v3.8h}, [x6], #32 .endm store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift .purgem store2 ret x14 endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_8x32_neon mov x14, x30 lsl x8, x8, #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 bl 
X(inv_dct_8h_x16_neon) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 bl X(inv_dct32_odd_8h_x16_neon) neg x9, x8 mov x10, x6 mvni v1.8h, #0xfc, lsl #8 // 0x3ff .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride ld1 {v2.8h}, [x10], x1 ld1 {v6.8h}, [x7], \stride ld1 {v3.8h}, [x10], x1 \op v5.8h, v5.8h, \r0 ld1 {v7.8h}, [x7], \stride ld1 {v4.8h}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 usqadd v2.8h, v5.8h srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 ld1 {v5.8h}, [x7], \stride usqadd v3.8h, v6.8h smin v2.8h, v2.8h, v1.8h srshr v7.8h, v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8h}, [x6], x1 ld1 {v2.8h}, [x10], x1 usqadd v4.8h, v7.8h smin v3.8h, v3.8h, v1.8h srshr v5.8h, v5.8h, #4 st1 {v3.8h}, [x6], x1 usqadd v2.8h, v5.8h smin v4.8h, v4.8h, v1.8h st1 {v4.8h}, [x6], x1 smin v2.8h, v2.8h, v1.8h st1 {v2.8h}, [x6], x1 .endm combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 sub x7, x7, x8 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine ret x14 endfunc const eob_32x32 .short 10, 36, 78, 136, 210, 300, 406, 1024 endconst const eob_16x32 .short 10, 36, 78, 151, 215, 279, 343, 512 endconst const eob_16x32_shortside .short 10, 36, 78, 512 endconst const eob_8x32 .short 10, 43, 75, 107, 139, 171, 203, 256 endconst function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 movi v0.8h, #0 movi v1.8h, #0 movrel x13, eob_32x32, 2 mov x8, #4*32 1: mov w9, #0 movrel x12, eob_32x32, 2 2: add 
w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 load_add_store_8x8 x0, x7, shiftbits=2 ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #2*8 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .macro shift_16_regs op, shift .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 movz w16, #2896*8, lsl #16 movz w17, #2*(5793-4096)*8, lsl #16 movi v0.4s, #0 movi v1.4s, #0 movrel x13, eob_16x32\hshort, 2 mov x8, #4*\h 1: mov w9, #0 movrel x12, eob_16x32\wshort, 2 2: add w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 dup v2.2s, w16 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 mov v2.s[1], w17 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, 
v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .if \w == 16 // 16x32 identity_4x16_shift1 v2.s[1] .else // 32x16 shift_16_regs sqshl, 1 identity_4x16 v2.s[1] .endif sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 .if \w == 16 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=4 .endif ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #16 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 movi v0.4s, #0 movi v1.4s, #0 // Working on 8x8 blocks, read every other entry from eob_8x32 movrel x13, eob_8x32, 2 mov w8, #4*\h 1: // Working on 8x8 blocks, read every other entry from eob_8x32 ldrh w12, [x13], #4 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 .if \w == 8 sqrshrn v16.4h, v16.4s, #1 
sqrshrn2 v16.8h, v17.4s, #1 sqrshrn v17.4h, v18.4s, #1 sqrshrn2 v17.8h, v19.4s, #1 sqrshrn v18.4h, v20.4s, #1 sqrshrn2 v18.8h, v21.4s, #1 sqrshrn v19.4h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 sqrshrn v20.4h, v24.4s, #1 sqrshrn2 v20.8h, v25.4s, #1 sqrshrn v21.4h, v26.4s, #1 sqrshrn2 v21.8h, v27.4s, #1 sqrshrn v22.4h, v28.4s, #1 sqrshrn2 v22.8h, v29.4s, #1 sqrshrn v23.4h, v30.4s, #1 sqrshrn2 v23.8h, v31.4s, #1 .else sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 cmp w3, w12 .if \w == 8 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=3 .endif b.lt 9f .if \w == 8 sub x2, x2, x8, lsl #3 add x2, x2, #4*8 .else sub x0, x0, x1, lsl #3 add x0, x0, #2*8 .endif b 1b 9: ret endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 idct_dc 32, 32, 2 mov x15, x30 sub sp, sp, #2048 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, sp, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #2048 ret x15 endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 idct_dc 16, 32, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_4s_x16_neon .irp i, 0, 
4, 8, 12, 16, 20, 24, 28 add x6, sp, #(\i*16*2) add x7, x2, #(\i*4) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif mov x8, #4*32 bl inv_txfm_horz_scale_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #16*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 idct_dc 32, 16, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 movrel x5, X(inv_dct_8h_x16_neon) ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12 add x6, sp, #(\i*32*2) add x7, x2, #(\i*4) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .if \i < 12 ldrh w12, [x13], #2 .endif .endif mov x8, #4*16 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 idct_dc 8, 32, 2 mov x15, x30 sub sp, sp, #512 movrel x13, eob_8x32 movi v28.4s, #0 mov x8, #4*32 mov w9, #32 mov x6, sp mov x7, x2 1: .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().4s}, [x7] st1 {v28.4s}, [x7], x8 .endr ldrh w12, [x13], #2 sub w9, w9, #4 sub x7, x7, x8, lsl #3 add x7, x7, #4*4 bl inv_dct_4s_x8_neon sqrshrn v16.4h, v16.4s, #2 sqrshrn v17.4h, v17.4s, #2 sqrshrn v18.4h, v18.4s, #2 sqrshrn v19.4h, v19.4s, #2 sqrshrn2 v16.8h, v20.4s, #2 sqrshrn2 v17.8h, v21.4s, #2 sqrshrn2 v18.8h, v22.4s, #2 sqrshrn2 v19.8h, v23.4s, #2 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 cmp w3, w12 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 b.ge 1b cbz w9, 3f movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 2: subs 
w9, w9, #4 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 b.gt 2b 3: mov x6, x0 mov x7, sp mov x8, #8*2 bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 ret x15 endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 idct_dc 32, 8, 2 mov x15, x30 sub sp, sp, #512 .irp i, 0, 4 add x6, sp, #(\i*32*2) add x7, x2, #(\i*4) .if \i > 0 cmp w3, #10 b.lt 1f .endif mov x8, #8*4 bl inv_txfm_horz_dct_32x4_neon .endr b 2f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr 2: mov x8, #2*32 mov w9, #0 1: add x6, x0, x9, lsl #1 add x7, sp, x9, lsl #1 // #(\i*2) .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().8h}, [x7], x8 .endr add w9, w9, #8 bl X(inv_dct_8h_x8_neon) cmp w9, #32 load_add_store_8x8 x6, x7 b.lt 1b add sp, sp, #512 ret x15 endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a ld1 {v0.4s, v1.4s}, [x17], #32 sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a ld1 {v0.4s}, [x17], #16 sqadd v24.4s, v16.4s, v17.4s // t32 sqsub v25.4s, v16.4s, v17.4s // t33 sqsub v26.4s, v19.4s, v18.4s // t34 sqadd v27.4s, v19.4s, v18.4s // t35 sqadd v28.4s, v20.4s, v21.4s // t60 sqsub v29.4s, v20.4s, v21.4s // t61 sqsub v30.4s, v23.4s, v22.4s // t62 sqadd v31.4s, v23.4s, v22.4s // t63 .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a neg v2.4s, v2.4s // t34a mul_mls v6, v30, 
v25, v0.s[1], v0.s[0] // -> t33a srshr v26.4s, v2.4s, #12 // t34a mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a srshr v29.4s, v7.4s, #12 // t61a srshr v25.4s, v6.4s, #12 // t33a srshr v30.4s, v2.4s, #12 // t62a sqadd v16.4s, v24.4s, v27.4s // t32a sqsub v19.4s, v24.4s, v27.4s // t35a sqadd v17.4s, v25.4s, v26.4s // t33 sqsub v18.4s, v25.4s, v26.4s // t34 sqsub v20.4s, v31.4s, v28.4s // t60a sqadd v23.4s, v31.4s, v28.4s // t63a sqsub v21.4s, v30.4s, v29.4s // t61 sqadd v22.4s, v30.4s, v29.4s // t62 .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smin_4s \r, \r, v5 .endr .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smax_4s \r, \r, v4 .endr mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 srshr v21.4s, v2.4s, #12 // t61a srshr v18.4s, v7.4s, #12 // t34a mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 srshr v20.4s, v6.4s, #12 // t60 srshr v19.4s, v2.4s, #12 // t35 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 ret endfunc function inv_dct64_step2_neon movrel x16, idct_coeffs ld1 {v0.4s}, [x16] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a ldr q16, [x6, #4*4*0] // t32a ldr q17, [x9, #4*4*8] // t39a ldr q18, [x9, #4*4*0] // t63a ldr q19, [x6, #4*4*8] // t56a ldr q20, [x6, #4*4*16] // t40a ldr q21, [x9, #4*4*24] // t47a ldr q22, [x9, #4*4*16] // t55a ldr q23, [x6, #4*4*24] // t48a sqadd v24.4s, v16.4s, v17.4s // t32 sqsub v25.4s, v16.4s, v17.4s // t39 sqadd v26.4s, v18.4s, v19.4s // t63 sqsub v27.4s, v18.4s, v19.4s // t56 sqsub v28.4s, v21.4s, v20.4s // t40 sqadd v29.4s, v21.4s, v20.4s // t47 sqadd v30.4s, v23.4s, v22.4s // t48 sqsub v31.4s, v23.4s, v22.4s // t55 .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr mul_mla v2, v27, v25, v0.s[3], 
v0.s[2] // -> t56a mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a srshr v25.4s, v2.4s, #12 // t56a srshr v27.4s, v7.4s, #12 // t39a neg v6.4s, v6.4s // t40a mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a srshr v31.4s, v6.4s, #12 // t40a srshr v28.4s, v2.4s, #12 // t55a sqadd v16.4s, v24.4s, v29.4s // t32a sqsub v19.4s, v24.4s, v29.4s // t47a sqadd v17.4s, v27.4s, v31.4s // t39 sqsub v18.4s, v27.4s, v31.4s // t40 sqsub v20.4s, v26.4s, v30.4s // t48a sqadd v23.4s, v26.4s, v30.4s // t63a sqsub v21.4s, v25.4s, v28.4s // t55 sqadd v22.4s, v25.4s, v28.4s // t56 .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smin_4s \r, \r, v5 .endr .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smax_4s \r, \r, v4 .endr mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 srshr v18.4s, v2.4s, #12 // t40a srshr v21.4s, v7.4s, #12 // t55a mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 srshr v19.4s, v6.4s, #12 // t47 srshr v20.4s, v2.4s, #12 // t48 str q16, [x6, #4*4*0] // t32a str q17, [x9, #4*4*0] // t39 str q18, [x6, #4*4*8] // t40a str q19, [x9, #4*4*8] // t47 str q20, [x6, #4*4*16] // t48 str q21, [x9, #4*4*16] // t55a str q22, [x6, #4*4*24] // t56 str q23, [x9, #4*4*24] // t63a add x6, x6, #4*4 sub x9, x9, #4*4 cmp x6, x9 b.lt 1b ret endfunc .macro load8 src, strd, zero, clear .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s .if \clear ld1 {\i}, [\src] st1 {\zero}, [\src], \strd .else ld1 {\i}, [\src], \strd .endif .endr .endm .macro store16 dst .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s st1 {\i}, [\dst], #16 .endr .endm .macro clear_upper8 .irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s movi \i, #0 .endr .endm .macro movi_if reg, val, cond .if \cond movi \reg, \val .endif .endm .macro movz16dup_if 
reg, gpr, val, cond .if \cond movz \gpr, \val, lsl #16 dup \reg, \gpr .endif .endm .macro st1_if regs, dst, cond .if \cond st1 \regs, \dst .endif .endm .macro str_if reg, dst, cond .if \cond str \reg, \dst .endif .endm .macro stroff_if reg, dst, dstoff, cond .if \cond str \reg, \dst, \dstoff .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_4s_x64_neon mov x14, x30 mov x6, sp lsl x8, x8, #2 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear load8 x7, x8, v7.4s, \clear clear_upper8 sub x7, x7, x8, lsl #3 add x7, x7, x8, lsr #1 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct_4s_x16_neon // idct_16 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr store16 x6 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.4s, \clear clear_upper8 sub x7, x7, x8, lsl #3 lsr x8, x8, #1 sub x7, x7, x8, lsr #1 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct32_odd_4s_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 mov x9, #-16 movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 ld1 {v2.4s}, [x6], #16 ld1 {v3.4s}, [x6], #16 sqadd v6.4s, v2.4s, \r0 sqsub \r0, v2.4s, \r0 ld1 {v4.4s}, [x6], #16 sqadd v7.4s, v3.4s, \r1 sqsub \r1, v3.4s, \r1 smin v6.4s, v6.4s, v1.4s smin \r0, \r0, v1.4s ld1 {v5.4s}, [x6], #16 sqadd v2.4s, v4.4s, \r2 sub x6, x6, #16*4 smax v6.4s, v6.4s, v0.4s smax \r0, \r0, v0.4s sqsub \r2, v4.4s, \r2 smin v7.4s, v7.4s, v1.4s smin \r1, \r1, v1.4s st1 {v6.4s}, 
[x6], #16 st1 {\r0}, [x10], x9 smin v2.4s, v2.4s, v1.4s smin \r2, \r2, v1.4s smax v7.4s, v7.4s, v0.4s smax \r1, \r1, v0.4s sqadd v3.4s, v5.4s, \r3 sqsub \r3, v5.4s, \r3 smax v2.4s, v2.4s, v0.4s smax \r2, \r2, v0.4s smin v3.4s, v3.4s, v1.4s smin \r3, \r3, v1.4s st1 {v7.4s}, [x6], #16 st1 {\r1}, [x10], x9 smax v3.4s, v3.4s, v0.4s smax \r3, \r3, v0.4s st1 {v2.4s}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.4s}, [x6], #16 st1 {\r3}, [x10], x9 .endm store_addsub v31.4s, v30.4s, v29.4s, v28.4s store_addsub v27.4s, v26.4s, v25.4s, v24.4s store_addsub v23.4s, v22.4s, v21.4s, v20.4s store_addsub v19.4s, v18.4s, v17.4s, v16.4s .purgem store_addsub add x6, x6, #4*4*16 movrel x17, idct64_coeffs movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x9, x7, x8, lsl #4 // offset 16 add x10, x7, x8, lsl #3 // offset 8 sub x9, x9, x8 // offset 15 sub x11, x10, x8 // offset 7 ld1 {v16.4s}, [x7] // in1 (offset 0) ld1 {v17.4s}, [x9] // in31 (offset 15) ld1 {v18.4s}, [x10] // in17 (offset 8) ld1 {v19.4s}, [x11] // in15 (offset 7) st1_if {v7.4s}, [x7], \clear st1_if {v7.4s}, [x9], \clear st1_if {v7.4s}, [x10], \clear st1_if {v7.4s}, [x11], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x7, x7, x8, lsl #2 // offset 4 sub x9, x9, x8, lsl #2 // offset 11 sub x10, x7, x8 // offset 3 add x11, x9, x8 // offset 12 ld1 {v16.4s}, [x10] // in7 (offset 3) ld1 {v17.4s}, [x11] // in25 (offset 12) ld1 {v18.4s}, [x9] // in23 (offset 11) ld1 {v19.4s}, [x7] // in9 (offset 4) st1_if {v7.4s}, [x7], \clear st1_if {v7.4s}, [x9], \clear st1_if {v7.4s}, [x10], \clear st1_if {v7.4s}, [x11], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear sub x10, x10, x8, lsl #1 // offset 1 
sub x9, x9, x8, lsl #1 // offset 9 add x7, x7, x8 // offset 5 add x11, x11, x8 // offset 13 ldr q16, [x10, x8] // in5 (offset 2) ldr q17, [x11] // in27 (offset 13) ldr q18, [x9, x8] // in21 (offset 10) ldr q19, [x7] // in11 (offset 5) stroff_if q7, [x10, x8], \clear str_if q7, [x11], \clear stroff_if q7, [x9, x8], \clear str_if q7, [x7], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear ldr q16, [x10] // in3 (offset 1) ldr q17, [x11, x8] // in29 (offset 14) ldr q18, [x9] // in19 (offset 9) ldr q19, [x7, x8] // in13 (offset 6) str_if q7, [x10], \clear stroff_if q7, [x11, x8], \clear str_if q7, [x9], \clear stroff_if q7, [x7, x8], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon sub x6, x6, #4*4*32 add x9, x6, #4*4*7 bl inv_dct64_step2_neon ret x14 endfunc .endm def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x4_neon mov x14, x30 mov x7, sp add x8, sp, #4*4*(64 - 4) add x9, x6, #2*56 mov x10, #2*64 mov x11, #-4*4*4 dup v7.4s, w12 1: ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 .macro store_addsub src0, src1, src2, src3 sqsub v1.4s, \src0, \src1 sqadd v0.4s, \src0, \src1 sqsub v3.4s, \src2, \src3 srshl v1.4s, v1.4s, v7.4s sqadd v2.4s, \src2, \src3 srshl v3.4s, v3.4s, v7.4s srshl v0.4s, v0.4s, v7.4s srshl v2.4s, v2.4s, v7.4s sqxtn v3.4h, v3.4s sqxtn2 v3.8h, v1.4s sqxtn v0.4h, v0.4s sqxtn2 v0.8h, v2.4s rev64 v3.8h, v3.8h st1 {v0.8h}, [x6], x10 st1 {v3.8h}, [x9], x10 .endm store_addsub v16.4s, v31.4s, v20.4s, v27.4s store_addsub v17.4s, v30.4s, v21.4s, 
v26.4s store_addsub v18.4s, v29.4s, v22.4s, v25.4s store_addsub v19.4s, v28.4s, v23.4s, v24.4s .purgem store_addsub sub x6, x6, x10, lsl #2 sub x9, x9, x10, lsl #2 add x6, x6, #16 sub x9, x9, #16 cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon mov x14, x30 lsl x8, x8, #1 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, x1, lsl #6 sub x9, x9, x1 neg x10, x1 mov x11, #-2*8*4 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 mvni v7.8h, #0xfc, lsl #8 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8h}, [x6], x1 ld1 {v1.8h}, [x9], x10 sqadd v4.8h, \src0, \src1 ld1 {v2.8h}, [x6] sqsub \src0, \src0, \src1 ld1 {v3.8h}, [x9] sqadd v5.8h, \src2, \src3 sqsub \src2, \src2, \src3 sub x6, x6, x1 sub x9, x9, x10 srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr \src0, \src0, #4 usqadd v0.8h, v4.8h srshr \src2, \src2, #4 usqadd v1.8h, \src0 usqadd v2.8h, v5.8h smin v0.8h, v0.8h, v7.8h usqadd v3.8h, \src2 smin v1.8h, v1.8h, v7.8h st1 {v0.8h}, [x6], x1 smin v2.8h, v2.8h, v7.8h st1 {v1.8h}, [x9], x10 smin v3.8h, v3.8h, v7.8h st1 {v2.8h}, [x6], x1 st1 {v3.8h}, [x9], x10 .endm add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem add_dest_addsub cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 sub_sp 64*32*2+64*4*4 add x5, sp, #64*4*4 movrel x13, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #32*4 mov x12, #-2 // shift bl inv_txfm_dct_clear_4s_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi 
v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 bl X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 sub_sp 64*32*2+64*4*4 add x5, sp, #64*4*4 movrel x13, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #32*4 mov x12, #-1 // shift bl inv_txfm_dct_clear_scale_4s_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i*2) add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 sub_sp 32*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f ldrh w12, [x13], #2 .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 bl X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #32*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 sub_sp 64*16*2+64*4*4 add x4, sp, #64*4*4 movrel x13, eob_16x32 .irp 
i, 0, 4, 8, 12 add x6, x4, #(\i*64*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #16*4 mov x12, #-2 // shift bl inv_txfm_dct_clear_4s_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 12 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: movrel x5, X(inv_dct_8h_x16_neon) .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i*2) add x7, x4, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, x4, #64*16*2 ret x15 endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 sub_sp 16*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_4s_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*16*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 bl X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #16*32*2 ret x15 endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/loopfilter.S000066400000000000000000001350031517466257200241500ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // depending on how many pixels need to be stored, returns: // x14 = (1 << 0) : 0 pixels // x14 = (1 << 4) : inner 4 pixels // x14 = (1 << 6) : inner 6 pixels // x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0) uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0) uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1) .if \wd >= 6 uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1) uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1) .endif .if \wd >= 8 uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2) uabd v7.16b, v27.16b, v26.16b // abs(q3 - q3) .endif .if \wd >= 6 umax v4.16b, v4.16b, v5.16b .endif uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2 .if \wd >= 8 umax v6.16b, v6.16b, v7.16b .endif ushr v3.16b, v3.16b, #1 .if \wd >= 8 umax v4.16b, v4.16b, v6.16b .endif .if \wd >= 6 and v4.16b, v4.16b, v14.16b .endif umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - 
q0)) uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 umax v4.16b, v0.16b, v4.16b cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E and v1.16b, v1.16b, v2.16b // fm and v1.16b, v1.16b, v13.16b // fm && wd >= 4 .if \wd >= 6 and v14.16b, v14.16b, v1.16b // fm && wd > 4 .endif .if \wd >= 16 and v15.16b, v15.16b, v1.16b // fm && wd == 16 .endif mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 b.ne 9f // if (!fm || wd < 4) return; mov x14, #(1 << 0) ret 9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0) uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0) uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0) .if \wd >= 8 uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0) uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0) .endif umax v2.16b, v2.16b, v3.16b umax v4.16b, v4.16b, v5.16b .if \wd >= 8 umax v6.16b, v6.16b, v7.16b .endif umax v2.16b, v2.16b, v4.16b .if \wd >= 8 umax v2.16b, v2.16b, v6.16b .endif .if \wd == 16 uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0) uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0) uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0) .endif cmhs v2.16b, v10.16b, v2.16b // flat8in .if \wd == 16 uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0) uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0) uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0) .endif and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in .if \wd == 16 umax v3.16b, v3.16b, v4.16b umax v5.16b, v5.16b, v6.16b .endif mov x16, v1.d[0] mov x17, v1.d[1] .if \wd == 16 umax v7.16b, v7.16b, v8.16b umax v3.16b, v3.16b, v5.16b umax v3.16b, v3.16b, v7.16b cmhs v3.16b, v10.16b, v3.16b // flat8out .endif adds x16, x16, x17 .if \wd == 16 and v15.16b, v15.16b, v3.16b // flat8out && 
fm && wd == 16 and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out .endif b.eq 1f // skip wd == 4 case .endif movi v3.16b, #128 eor v2.16b, v22.16b, v3.16b // p1 - 128 eor v3.16b, v25.16b, v3.16b // q1 - 128 cmhi v0.16b, v0.16b, v12.16b // hev sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) usubl v2.8h, v24.8b, v23.8b movi v5.8h, #3 usubl2 v3.8h, v24.16b, v23.16b mul v2.8h, v2.8h, v5.8h mul v3.8h, v3.8h, v5.8h movi v6.16b, #4 saddw v2.8h, v2.8h, v4.8b saddw2 v3.8h, v3.8h, v4.16b movi v7.16b, #3 sqxtn v2.8b, v2.8h // f sqxtn2 v2.16b, v3.8h sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127) sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) sshr v4.16b, v4.16b, #3 // f1 sshr v5.16b, v5.16b, #3 // f2 mov v2.16b, v23.16b // p0 mov v3.16b, v24.16b // q0 neg v6.16b, v4.16b // -f1 srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 // p0 + f2, q0 - f1 usqadd v2.16b, v5.16b // out p0 usqadd v3.16b, v6.16b // out q0 neg v6.16b, v4.16b // -((f1 + 1) >> 1) bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4) mov v2.16b, v22.16b // p1 mov v3.16b, v25.16b // q1 // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1) usqadd v2.16b, v4.16b // out p1 usqadd v3.16b, v6.16b // out q1 bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 2f // skip if there's no flat8in uaddl v0.8h, v21.8b, v21.8b // p2 * 2 uaddl2 v1.8h, v21.16b, v21.16b uaddl v2.8h, v21.8b, v22.8b // p2 + p1 uaddl2 v3.8h, v21.16b, v22.16b uaddl v4.8h, v22.8b, v23.8b // p1 + p0 uaddl2 v5.8h, v22.16b, v23.16b uaddl v6.8h, v23.8b, v24.8b // p0 + q0 uaddl2 v7.8h, v23.16b, v24.16b add v8.8h, v0.8h, v2.8h add v9.8h, v1.8h, v3.8h add v10.8h, v4.8h, v6.8h add 
v11.8h, v5.8h, v7.8h uaddl v12.8h, v24.8b, v25.8b // q0 + q1 uaddl2 v13.8h, v24.16b, v25.16b add v8.8h, v8.8h, v10.8h add v9.8h, v9.8h, v11.8h sub v12.8h, v12.8h, v0.8h sub v13.8h, v13.8h, v1.8h uaddl v10.8h, v25.8b, v26.8b // q1 + q2 uaddl2 v11.8h, v25.16b, v26.16b rshrn v0.8b, v8.8h, #3 // out p1 rshrn2 v0.16b, v9.8h, #3 add v8.8h, v8.8h, v12.8h add v9.8h, v9.8h, v13.8h sub v10.8h, v10.8h, v2.8h sub v11.8h, v11.8h, v3.8h uaddl v12.8h, v26.8b, v26.8b // q2 + q2 uaddl2 v13.8h, v26.16b, v26.16b rshrn v1.8b, v8.8h, #3 // out p0 rshrn2 v1.16b, v9.8h, #3 add v8.8h, v8.8h, v10.8h add v9.8h, v9.8h, v11.8h sub v12.8h, v12.8h, v4.8h sub v13.8h, v13.8h, v5.8h rshrn v2.8b, v8.8h, #3 // out q0 rshrn2 v2.16b, v9.8h, #3 bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) add v8.8h, v8.8h, v12.8h add v9.8h, v9.8h, v13.8h bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) rshrn v3.8b, v8.8h, #3 // out q1 rshrn2 v3.16b, v9.8h, #3 bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) .elseif \wd >= 8 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 .if \wd == 8 b.eq 8f // skip if there's no flat8in .else b.eq 2f // skip if there's no flat8in .endif uaddl v0.8h, v20.8b, v21.8b // p3 + p2 uaddl2 v1.8h, v20.16b, v21.16b uaddl v2.8h, v22.8b, v25.8b // p1 + q1 uaddl2 v3.8h, v22.16b, v25.16b uaddl v4.8h, v20.8b, v22.8b // p3 + p1 uaddl2 v5.8h, v20.16b, v22.16b uaddl v6.8h, v23.8b, v26.8b // p0 + q2 uaddl2 v7.8h, v23.16b, v26.16b add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) add v9.8h, v1.8h, v1.8h uaddw v8.8h, v8.8h, v23.8b // + p0 uaddw2 v9.8h, v9.8h, v23.16b uaddw v8.8h, v8.8h, v24.8b // + q0 uaddw2 v9.8h, v9.8h, v24.16b add v8.8h, v8.8h, v4.8h add v9.8h, v9.8h, v5.8h // + p3 + p1 sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 sub v3.8h, v3.8h, v1.8h sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 sub v7.8h, v7.8h, v5.8h rshrn v10.8b, v8.8h, #3 // out p2 rshrn2 v10.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h, v9.8h, v3.8h uaddl v0.8h, v20.8b, v23.8b 
// p3 + p0 uaddl2 v1.8h, v20.16b, v23.16b uaddl v2.8h, v24.8b, v27.8b // q0 + q3 uaddl2 v3.8h, v24.16b, v27.16b rshrn v11.8b, v8.8h, #3 // out p1 rshrn2 v11.16b, v9.8h, #3 add v8.8h, v8.8h, v6.8h add v9.8h, v9.8h, v7.8h sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 sub v3.8h, v3.8h, v1.8h uaddl v4.8h, v21.8b, v24.8b // p2 + q0 uaddl2 v5.8h, v21.16b, v24.16b uaddl v6.8h, v25.8b, v27.8b // q1 + q3 uaddl2 v7.8h, v25.16b, v27.16b rshrn v12.8b, v8.8h, #3 // out p0 rshrn2 v12.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h, v9.8h, v3.8h sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 sub v7.8h, v7.8h, v5.8h uaddl v0.8h, v22.8b, v25.8b // p1 + q1 uaddl2 v1.8h, v22.16b, v25.16b uaddl v2.8h, v26.8b, v27.8b // q2 + q3 uaddl2 v3.8h, v26.16b, v27.16b rshrn v13.8b, v8.8h, #3 // out q0 rshrn2 v13.16b, v9.8h, #3 add v8.8h, v8.8h, v6.8h add v9.8h, v9.8h, v7.8h sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 sub v3.8h, v3.8h, v1.8h rshrn v0.8b, v8.8h, #3 // out q1 rshrn2 v0.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h , v9.8h, v3.8h bit v21.16b, v10.16b, v14.16b bit v22.16b, v11.16b, v14.16b bit v23.16b, v12.16b, v14.16b rshrn v1.8b, v8.8h, #3 // out q2 rshrn2 v1.16b, v9.8h, #3 bit v24.16b, v13.16b, v14.16b bit v25.16b, v0.16b, v14.16b bit v26.16b, v1.16b, v14.16b .endif 2: .if \wd == 16 mov x16, v15.d[0] mov x17, v15.d[1] adds x16, x16, x17 b.ne 1f // check if flat8out is needed mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: uaddl v2.8h, v17.8b, v17.8b // p6 + p6 uaddl2 v3.8h, v17.16b, v17.16b uaddl v4.8h, v17.8b, v18.8b // p6 + p5 uaddl2 v5.8h, v17.16b, v18.16b uaddl v6.8h, v17.8b, v19.8b // p6 + p4 uaddl2 v7.8h, v17.16b, v19.16b uaddl v8.8h, v17.8b, v20.8b // p6 + p3 uaddl2 v9.8h, v17.16b, v20.16b add v12.8h, v2.8h, v4.8h add v13.8h, v3.8h, v5.8h add v10.8h, v6.8h, v8.8h add v11.8h, v7.8h, v9.8h uaddl v6.8h, v17.8b, v21.8b // p6 + p2 uaddl2 v7.8h, 
v17.16b, v21.16b add v12.8h, v12.8h, v10.8h add v13.8h, v13.8h, v11.8h uaddl v8.8h, v17.8b, v22.8b // p6 + p1 uaddl2 v9.8h, v17.16b, v22.16b uaddl v10.8h, v18.8b, v23.8b // p5 + p0 uaddl2 v11.8h, v18.16b, v23.16b add v6.8h, v6.8h, v8.8h add v7.8h, v7.8h, v9.8h uaddl v8.8h, v19.8b, v24.8b // p4 + q0 uaddl2 v9.8h, v19.16b, v24.16b add v12.8h, v12.8h, v6.8h add v13.8h, v13.8h, v7.8h add v10.8h, v10.8h, v8.8h add v11.8h, v11.8h, v9.8h uaddl v6.8h, v20.8b, v25.8b // p3 + q1 uaddl2 v7.8h, v20.16b, v25.16b add v12.8h, v12.8h, v10.8h add v13.8h, v13.8h, v11.8h sub v6.8h, v6.8h, v2.8h sub v7.8h, v7.8h, v3.8h uaddl v2.8h, v21.8b, v26.8b // p2 + q2 uaddl2 v3.8h, v21.16b, v26.16b rshrn v0.8b, v12.8h, #4 // out p5 rshrn2 v0.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) add v13.8h, v13.8h, v7.8h sub v2.8h, v2.8h, v4.8h sub v3.8h, v3.8h, v5.8h uaddl v4.8h, v22.8b, v27.8b // p1 + q3 uaddl2 v5.8h, v22.16b, v27.16b uaddl v6.8h, v17.8b, v19.8b // p6 + p4 uaddl2 v7.8h, v17.16b, v19.16b rshrn v1.8b, v12.8h, #4 // out p4 rshrn2 v1.16b, v13.8h, #4 add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) add v13.8h, v13.8h, v3.8h sub v4.8h, v4.8h, v6.8h sub v5.8h, v5.8h, v7.8h uaddl v6.8h, v23.8b, v28.8b // p0 + q4 uaddl2 v7.8h, v23.16b, v28.16b uaddl v8.8h, v17.8b, v20.8b // p6 + p3 uaddl2 v9.8h, v17.16b, v20.16b rshrn v2.8b, v12.8h, #4 // out p3 rshrn2 v2.16b, v13.8h, #4 add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) add v13.8h, v13.8h, v5.8h sub v6.8h, v6.8h, v8.8h sub v7.8h, v7.8h, v9.8h uaddl v8.8h, v24.8b, v29.8b // q0 + q5 uaddl2 v9.8h, v24.16b, v29.16b uaddl v4.8h, v17.8b, v21.8b // p6 + p2 uaddl2 v5.8h, v17.16b, v21.16b rshrn v3.8b, v12.8h, #4 // out p2 rshrn2 v3.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) add v13.8h, v13.8h, v7.8h sub v8.8h, v8.8h, v4.8h sub v9.8h, v9.8h, v5.8h uaddl v6.8h, v25.8b, v30.8b // q1 + q6 uaddl2 v7.8h, v25.16b, v30.16b uaddl v10.8h, v17.8b, v22.8b // p6 + p1 uaddl2 v11.8h, v17.16b, v22.16b rshrn v4.8b, 
v12.8h, #4 // out p1 rshrn2 v4.16b, v13.8h, #4 add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) add v13.8h, v13.8h, v9.8h sub v6.8h, v6.8h, v10.8h sub v7.8h, v7.8h, v11.8h uaddl v8.8h, v26.8b, v30.8b // q2 + q6 uaddl2 v9.8h, v26.16b, v30.16b bif v0.16b, v18.16b, v15.16b // out p5 uaddl v10.8h, v18.8b, v23.8b // p5 + p0 uaddl2 v11.8h, v18.16b, v23.16b rshrn v5.8b, v12.8h, #4 // out p0 rshrn2 v5.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) add v13.8h, v13.8h, v7.8h sub v8.8h, v8.8h, v10.8h sub v9.8h, v9.8h, v11.8h uaddl v10.8h, v27.8b, v30.8b // q3 + q6 uaddl2 v11.8h, v27.16b, v30.16b bif v1.16b, v19.16b, v15.16b // out p4 uaddl v18.8h, v19.8b, v24.8b // p4 + q0 uaddl2 v19.8h, v19.16b, v24.16b rshrn v6.8b, v12.8h, #4 // out q0 rshrn2 v6.16b, v13.8h, #4 add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) add v13.8h, v13.8h, v9.8h sub v10.8h, v10.8h, v18.8h sub v11.8h, v11.8h, v19.8h uaddl v8.8h, v28.8b, v30.8b // q4 + q6 uaddl2 v9.8h, v28.16b, v30.16b bif v2.16b, v20.16b, v15.16b // out p3 uaddl v18.8h, v20.8b, v25.8b // p3 + q1 uaddl2 v19.8h, v20.16b, v25.16b rshrn v7.8b, v12.8h, #4 // out q1 rshrn2 v7.16b, v13.8h, #4 add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) add v13.8h, v13.8h, v11.8h sub v18.8h, v8.8h, v18.8h sub v19.8h, v9.8h, v19.8h uaddl v10.8h, v29.8b, v30.8b // q5 + q6 uaddl2 v11.8h, v29.16b, v30.16b bif v3.16b, v21.16b, v15.16b // out p2 uaddl v20.8h, v21.8b, v26.8b // p2 + q2 uaddl2 v21.8h, v21.16b, v26.16b rshrn v8.8b, v12.8h, #4 // out q2 rshrn2 v8.16b, v13.8h, #4 add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) add v13.8h, v13.8h, v19.8h sub v10.8h, v10.8h, v20.8h sub v11.8h, v11.8h, v21.8h uaddl v18.8h, v30.8b, v30.8b // q6 + q6 uaddl2 v19.8h, v30.16b, v30.16b bif v4.16b, v22.16b, v15.16b // out p1 uaddl v20.8h, v22.8b, v27.8b // p1 + q3 uaddl2 v21.8h, v22.16b, v27.16b rshrn v9.8b, v12.8h, #4 // out q3 rshrn2 v9.16b, v13.8h, #4 add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) add v13.8h, v13.8h, v11.8h 
sub v18.8h, v18.8h, v20.8h sub v19.8h, v19.8h, v21.8h bif v5.16b, v23.16b, v15.16b // out p0 rshrn v10.8b, v12.8h, #4 // out q4 rshrn2 v10.16b, v13.8h, #4 add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) add v13.8h, v13.8h, v19.8h rshrn v11.8b, v12.8h, #4 // out q5 rshrn2 v11.16b, v13.8h, #4 bif v6.16b, v24.16b, v15.16b // out q0 bif v7.16b, v25.16b, v15.16b // out q1 bif v8.16b, v26.16b, v15.16b // out q2 bif v9.16b, v27.16b, v15.16b // out q3 bif v10.16b, v28.16b, v15.16b // out q4 bif v11.16b, v29.16b, v15.16b // out q5 .endif mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels mov x14, #(1 << 6) ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels mov x14, #(1 << 4) ret .endif endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_16_wd16 bl lpf_16_wd16_neon cbz x14, 1f tbnz x14, #6, 7f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_16_wd8 bl lpf_16_wd8_neon cbz x14, 1f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon cbz x14, 1f ret x15 1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon cbz x14, 1f ret x15 1: .endm function lpf_v_4_16_neon mov x15, x30 sub x16, x0, x1, lsl #1 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 lpf_16_wd4 sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_4_16_neon mov x15, x30 sub x16, x0, #2 add x0, x16, x1, lsl #3 ld1 {v22.s}[0], [x16], x1 ld1 {v22.s}[2], [x0], x1 ld1 {v23.s}[0], [x16], x1 ld1 {v23.s}[2], [x0], x1 ld1 {v24.s}[0], [x16], x1 ld1 {v24.s}[2], [x0], x1 ld1 {v25.s}[0], [x16], x1 ld1 {v25.s}[2], [x0], x1 ld1 {v22.s}[1], [x16], x1 ld1 {v22.s}[3], [x0], x1 ld1 {v23.s}[1], [x16], x1 ld1 {v23.s}[3], [x0], x1 ld1 {v24.s}[1], [x16], x1 ld1 {v24.s}[3], 
[x0], x1 ld1 {v25.s}[1], [x16], x1 ld1 {v25.s}[3], [x0], x1 add x0, x0, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd4 sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_6_16_neon mov x15, x30 sub x16, x0, x1, lsl #1 sub x16, x16, x1 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 lpf_16_wd6 sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_6_16_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #3 ld1 {v20.d}[0], [x16], x1 ld1 {v20.d}[1], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v21.d}[1], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v26.d}[0], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v27.d}[0], [x16], x1 ld1 {v27.d}[1], [x0], x1 add x0, x0, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd6 sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 
{v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_8_16_neon mov x15, x30 sub x16, x0, x1, lsl #2 ld1 {v20.16b}, [x16], x1 // p3 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v26.16b}, [x0], x1 // q2 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v27.16b}, [x0], x1 // q3 sub x0, x0, x1, lsl #2 lpf_16_wd8 sub x16, x0, x1, lsl #1 sub x16, x16, x1 st1 {v21.16b}, [x16], x1 // p2 st1 {v24.16b}, [x0], x1 // q0 st1 {v22.16b}, [x16], x1 // p1 st1 {v25.16b}, [x0], x1 // q1 st1 {v23.16b}, [x16], x1 // p0 st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_8_16_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #3 ld1 {v20.d}[0], [x16], x1 ld1 {v20.d}[1], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v21.d}[1], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v26.d}[0], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v27.d}[0], [x16], x1 ld1 {v27.d}[1], [x0], x1 add x0, x0, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd8 sub x16, x0, x1, lsl #4 sub x16, x16, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v20.d}[0], [x16], x1 st1 {v20.d}[1], [x0], x1 st1 {v21.d}[0], [x16], x1 st1 {v21.d}[1], [x0], x1 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 
{v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 st1 {v26.d}[0], [x16], x1 st1 {v26.d}[1], [x0], x1 st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_16_16_neon mov x15, x30 sub x16, x0, x1, lsl #3 add x16, x16, x1 ld1 {v17.16b}, [x16], x1 // p6 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v18.16b}, [x16], x1 // p5 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v19.16b}, [x16], x1 // p4 ld1 {v26.16b}, [x0], x1 // q2 ld1 {v20.16b}, [x16], x1 // p3 ld1 {v27.16b}, [x0], x1 // q3 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v28.16b}, [x0], x1 // q4 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v29.16b}, [x0], x1 // q5 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v30.16b}, [x0], x1 // q6 sub x0, x0, x1, lsl #3 add x0, x0, x1 lpf_16_wd16 sub x16, x0, x1, lsl #2 sub x16, x16, x1, lsl #1 st1 {v0.16b}, [x16], x1 // p5 st1 {v6.16b}, [x0], x1 // q0 st1 {v1.16b}, [x16], x1 // p4 st1 {v7.16b}, [x0], x1 // q1 st1 {v2.16b}, [x16], x1 // p3 st1 {v8.16b}, [x0], x1 // q2 st1 {v3.16b}, [x16], x1 // p2 st1 {v9.16b}, [x0], x1 // q3 st1 {v4.16b}, [x16], x1 // p1 st1 {v10.16b}, [x0], x1 // q4 st1 {v5.16b}, [x16], x1 // p0 st1 {v11.16b}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 st1 {v21.16b}, [x16], x1 // p2 st1 {v24.16b}, [x0], x1 // q0 st1 {v22.16b}, [x16], x1 // p1 st1 
{v25.16b}, [x0], x1 // q1 st1 {v23.16b}, [x16], x1 // p0 st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_16_16_neon mov x15, x30 sub x16, x0, #8 ld1 {v16.d}[0], [x16], x1 ld1 {v24.d}[0], [x0], x1 ld1 {v17.d}[0], [x16], x1 ld1 {v25.d}[0], [x0], x1 ld1 {v18.d}[0], [x16], x1 ld1 {v26.d}[0], [x0], x1 ld1 {v19.d}[0], [x16], x1 ld1 {v27.d}[0], [x0], x1 ld1 {v20.d}[0], [x16], x1 ld1 {v28.d}[0], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v29.d}[0], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v30.d}[0], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v31.d}[0], [x0], x1 ld1 {v16.d}[1], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v17.d}[1], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v18.d}[1], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v19.d}[1], [x16], x1 ld1 {v27.d}[1], [x0], x1 ld1 {v20.d}[1], [x16], x1 ld1 {v28.d}[1], [x0], x1 ld1 {v21.d}[1], [x16], x1 ld1 {v29.d}[1], [x0], x1 ld1 {v22.d}[1], [x16], x1 ld1 {v30.d}[1], [x0], x1 ld1 {v23.d}[1], [x16], x1 ld1 {v31.d}[1], [x0], x1 transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 lpf_16_wd16 sub x0, x0, x1, lsl #4 sub x16, x0, #8 transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 st1 {v16.d}[0], [x16], x1 st1 {v6.d}[0], [x0], x1 st1 {v17.d}[0], [x16], x1 st1 {v7.d}[0], [x0], x1 st1 {v0.d}[0], [x16], x1 st1 {v8.d}[0], [x0], x1 st1 {v1.d}[0], [x16], x1 st1 {v9.d}[0], [x0], x1 st1 {v2.d}[0], [x16], x1 st1 {v10.d}[0], [x0], x1 st1 {v3.d}[0], [x16], x1 st1 {v11.d}[0], [x0], x1 st1 {v4.d}[0], [x16], x1 st1 {v30.d}[0], [x0], x1 st1 {v5.d}[0], [x16], x1 st1 {v31.d}[0], [x0], x1 st1 {v16.d}[1], [x16], x1 st1 {v6.d}[1], [x0], x1 st1 {v17.d}[1], [x16], x1 st1 {v7.d}[1], [x0], x1 st1 
{v0.d}[1], [x16], x1 st1 {v8.d}[1], [x0], x1 st1 {v1.d}[1], [x16], x1 st1 {v9.d}[1], [x0], x1 st1 {v2.d}[1], [x16], x1 st1 {v10.d}[1], [x0], x1 st1 {v3.d}[1], [x16], x1 st1 {v11.d}[1], [x0], x1 st1 {v4.d}[1], [x16], x1 st1 {v30.d}[1], [x0], x1 st1 {v5.d}[1], [x16], x1 st1 {v31.d}[1], [x0], x1 ret x15 7: sub x16, x0, x1, lsl #4 sub x16, x16, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v20.d}[0], [x16], x1 st1 {v20.d}[1], [x0], x1 st1 {v21.d}[0], [x16], x1 st1 {v21.d}[1], [x0], x1 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 st1 {v26.d}[0], [x16], x1 st1 {v26.d}[1], [x0], x1 st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc // void dav2d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av2FilterLUT *lut, const int w) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 mov x11, x30 stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] ldp w6, w7, [x2] // vmask[0], vmask[1] .ifc \type, y ldr w2, [x2, #8] // vmask[2] .endif add x5, x5, #128 // Move to sharp part of lut .ifc \type, y orr w7, w7, w2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub x4, x3, x4, lsl #2 .else sub x3, x3, #4 lsl x4, x4, #2 .endif orr w6, w6, w7 // vmask[0] |= vmask[1] 1: tst w6, #0x0f .ifc \dir, v ld1 {v0.16b}, [x4], #16 ld1 {v1.16b}, [x3], #16 .else ld2 {v0.s,v1.s}[0], [x3], x4 ld2 {v0.s,v1.s}[1], [x3], x4 ld2 {v0.s,v1.s}[2], [x3], x4 ld2 {v0.s,v1.s}[3], [x3], x4 .endif b.eq 7f // if (!(vm & bits)) continue; ld1r {v5.16b}, [x5] // sharp[0] add x5, x5, #8 movi v2.4s, #0xff dup v13.4s, w6 // vmask[0] and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word and v1.16b, v1.16b, v2.16b cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0] movi v4.16b, #1 ld1r {v6.16b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] cmtst v2.4s, v1.4s, v2.4s // L != 0 mul v1.4s, v1.4s, v4.4s // L .ifc \type, y dup v15.4s, w2 // vmask[2] .endif dup v14.4s, w7 // vmask[1] mov x16, v2.d[0] mov x17, v2.d[1] adds x16, x16, x17 b.eq 7f // if (!L) continue; neg v5.16b, v5.16b // -sharp[0] movrel x16, word_1248 ushr v12.16b, v1.16b, #4 // H ld1 {v16.4s}, [x16] sshl v3.16b, v1.16b, v5.16b // L >> sharp[0] .ifc \type, y cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits) .endif movi v7.16b, #2 umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1]) add v0.16b, v1.16b, v7.16b // L + 2 umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I add v0.16b, v0.16b, v0.16b // 2*(L + 2) cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits) add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits) and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0 .ifc \type, y tst w2, #0x0f b.eq 2f // wd16 bl lpf_\dir\()_16_16_neon b 8f 2: .endif tst w7, #0x0f b.eq 3f 
.ifc \type, y // wd8 bl lpf_\dir\()_8_16_neon .else // wd6 bl lpf_\dir\()_6_16_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_16_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment x0. // If the whole function is skipped, increment it here instead. add x0, x0, x1, lsl #4 .else 7: .endif 8: lsr w6, w6, #4 // vmask[0] >>= 4 lsr w7, w7, #4 // vmask[1] >>= 4 .ifc \type, y lsr w2, w2, #4 // vmask[2] >>= 4 .endif .ifc \dir, v add x0, x0, #16 .else // For dir h, x0 is returned incremented .endif cbnz w6, 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x11 endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_1248 .word 1, 2, 4, 8 endconst dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/loopfilter16.S000066400000000000000000001122421517466257200243170ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // depending on how many pixels need to be stored, returns: // x14 = (1 << 0) : 0 pixels // x14 = (1 << 4) : inner 4 pixels // x14 = (1 << 6) : inner 6 pixels // x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0) uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1) .if \wd >= 6 uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1) uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1) .endif .if \wd >= 8 uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2) uabd v7.8h, v27.8h, v26.8h // abs(q3 - q2) .endif .if \wd >= 6 umax v4.8h, v4.8h, v5.8h .endif uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2 .if \wd >= 8 umax v6.8h, v6.8h, v7.8h .endif ushr v3.8h, v3.8h, #1 .if \wd >= 8 umax v4.8h, v4.8h, v6.8h .endif .if \wd >= 6 and v4.16b, v4.16b, v14.16b .endif umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 umax v4.8h, v0.8h, v4.8h cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...)
<= I .else cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E and v1.16b, v1.16b, v2.16b // fm and v1.16b, v1.16b, v13.16b // fm && wd >= 4 .if \wd >= 6 and v14.16b, v14.16b, v1.16b // fm && wd > 4 .endif .if \wd >= 16 and v15.16b, v15.16b, v1.16b // fm && wd == 16 .endif mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 b.ne 9f // if (!fm || wd < 4) return; mov x14, #(1 << 0) ret 9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0) uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0) uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0) dup v9.8h, w9 // bitdepth_min_8 .if \wd >= 8 uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0) .endif umax v2.8h, v2.8h, v3.8h umax v4.8h, v4.8h, v5.8h .if \wd >= 8 umax v6.8h, v6.8h, v7.8h .endif umax v2.8h, v2.8h, v4.8h ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8 .if \wd >= 8 umax v2.8h, v2.8h, v6.8h .endif .if \wd == 16 uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0) uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0) uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0) .endif cmhs v2.8h, v10.8h, v2.8h // flat8in .if \wd == 16 uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0) uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0) uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0) .endif and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in .if \wd == 16 umax v3.8h, v3.8h, v4.8h umax v5.8h, v5.8h, v6.8h .endif mov x16, v1.d[0] mov x17, v1.d[1] .if \wd == 16 umax v7.8h, v7.8h, v8.8h umax v3.8h, v3.8h, v5.8h umax v3.8h, v3.8h, v7.8h cmhs v3.8h, v10.8h, v3.8h // flat8out .endif adds x16, x16, x17 .if \wd == 16 and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out .endif b.eq 1f // skip wd == 4 case 
.endif dup v3.8h, w8 // bitdepth_max sub v2.8h, v22.8h, v25.8h // p1 - q1 ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1 cmhi v0.8h, v0.8h, v12.8h // hev not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8) smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1) smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) sub v2.8h, v24.8h, v23.8h movi v5.8h, #3 bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) mul v2.8h, v2.8h, v5.8h movi v6.8h, #4 add v2.8h, v2.8h, v4.8h smin v2.8h, v2.8h, v3.8h // f = iclip_diff() smax v2.8h, v2.8h, v9.8h // f = iclip_diff() sqadd v4.8h, v6.8h, v2.8h // f + 4 sqadd v5.8h, v5.8h, v2.8h // f + 3 smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) sshr v4.8h, v4.8h, #3 // f1 sshr v5.8h, v5.8h, #3 // f2 movi v9.8h, #0 dup v3.8h, w8 // bitdepth_max sqadd v2.8h, v23.8h, v5.8h // p0 + f2 sqsub v6.8h, v24.8h, v4.8h // q0 - f1 srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1 smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel() smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel() smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel() smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel() bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) sqadd v2.8h, v22.8h, v4.8h // p1 + f sqsub v6.8h, v25.8h, v4.8h // q1 - f smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel() smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel() smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel() smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel() bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 2f // skip if there's no flat8in add v0.8h, v21.8h, v21.8h // p2 * 2 add v2.8h, v21.8h, v22.8h // p2 + p1 add v4.8h, v22.8h, v23.8h // p1 + p0 add v6.8h, v23.8h, v24.8h // p0 + q0 add v8.8h, 
v0.8h, v2.8h add v10.8h, v4.8h, v6.8h add v12.8h, v24.8h, v25.8h // q0 + q1 add v8.8h, v8.8h, v10.8h sub v12.8h, v12.8h, v0.8h add v10.8h, v25.8h, v26.8h // q1 + q2 urshr v0.8h, v8.8h, #3 // out p1 add v8.8h, v8.8h, v12.8h sub v10.8h, v10.8h, v2.8h add v12.8h, v26.8h, v26.8h // q2 + q2 urshr v1.8h, v8.8h, #3 // out p0 add v8.8h, v8.8h, v10.8h sub v12.8h, v12.8h, v4.8h urshr v2.8h, v8.8h, #3 // out q0 bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) add v8.8h, v8.8h, v12.8h bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) urshr v3.8h, v8.8h, #3 // out q1 bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) .elseif \wd >= 8 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 .if \wd == 8 b.eq 8f // skip if there's no flat8in .else b.eq 2f // skip if there's no flat8in .endif add v0.8h, v20.8h, v21.8h // p3 + p2 add v2.8h, v22.8h, v25.8h // p1 + q1 add v4.8h, v20.8h, v22.8h // p3 + p1 add v6.8h, v23.8h, v26.8h // p0 + q2 add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) add v9.8h, v23.8h, v24.8h // p0 + q0 add v8.8h, v8.8h, v4.8h // + p3 + p1 sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 add v8.8h, v8.8h, v9.8h // + p0 + q0 sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 urshr v10.8h, v8.8h, #3 // out p2 add v8.8h, v8.8h, v2.8h add v0.8h, v20.8h, v23.8h // p3 + p0 add v2.8h, v24.8h, v27.8h // q0 + q3 urshr v11.8h, v8.8h, #3 // out p1 add v8.8h, v8.8h, v6.8h sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 add v4.8h, v21.8h, v24.8h // p2 + q0 add v6.8h, v25.8h, v27.8h // q1 + q3 urshr v12.8h, v8.8h, #3 // out p0 add v8.8h, v8.8h, v2.8h sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 add v0.8h, v22.8h, v25.8h // p1 + q1 add v2.8h, v26.8h, v27.8h // q2 + q3 urshr v13.8h, v8.8h, #3 // out q0 add v8.8h, v8.8h, v6.8h sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 urshr v0.8h, v8.8h, #3 // out q1 add v8.8h, v8.8h, v2.8h bit v21.16b, v10.16b, v14.16b bit v22.16b, v11.16b, v14.16b bit v23.16b, v12.16b, v14.16b urshr v1.8h, v8.8h, #3 // out q2 bit v24.16b, 
v13.16b, v14.16b bit v25.16b, v0.16b, v14.16b bit v26.16b, v1.16b, v14.16b .endif 2: .if \wd == 16 mov x16, v15.d[0] mov x17, v15.d[1] adds x16, x16, x17 b.ne 1f // check if flat8out is needed mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: add v2.8h, v17.8h, v17.8h // p6 + p6 add v4.8h, v17.8h, v18.8h // p6 + p5 add v6.8h, v17.8h, v19.8h // p6 + p4 add v8.8h, v17.8h, v20.8h // p6 + p3 add v12.8h, v2.8h, v4.8h add v10.8h, v6.8h, v8.8h add v6.8h, v17.8h, v21.8h // p6 + p2 add v12.8h, v12.8h, v10.8h add v8.8h, v17.8h, v22.8h // p6 + p1 add v10.8h, v18.8h, v23.8h // p5 + p0 add v6.8h, v6.8h, v8.8h add v8.8h, v19.8h, v24.8h // p4 + q0 add v12.8h, v12.8h, v6.8h add v10.8h, v10.8h, v8.8h add v6.8h, v20.8h, v25.8h // p3 + q1 add v12.8h, v12.8h, v10.8h sub v6.8h, v6.8h, v2.8h add v2.8h, v21.8h, v26.8h // p2 + q2 urshr v0.8h, v12.8h, #4 // out p5 add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) sub v2.8h, v2.8h, v4.8h add v4.8h, v22.8h, v27.8h // p1 + q3 add v6.8h, v17.8h, v19.8h // p6 + p4 urshr v1.8h, v12.8h, #4 // out p4 add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) sub v4.8h, v4.8h, v6.8h add v6.8h, v23.8h, v28.8h // p0 + q4 add v8.8h, v17.8h, v20.8h // p6 + p3 urshr v2.8h, v12.8h, #4 // out p3 add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) sub v6.8h, v6.8h, v8.8h add v8.8h, v24.8h, v29.8h // q0 + q5 add v4.8h, v17.8h, v21.8h // p6 + p2 urshr v3.8h, v12.8h, #4 // out p2 add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) sub v8.8h, v8.8h, v4.8h add v6.8h, v25.8h, v30.8h // q1 + q6 add v10.8h, v17.8h, v22.8h // p6 + p1 urshr v4.8h, v12.8h, #4 // out p1 add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) sub v6.8h, v6.8h, v10.8h add v8.8h, v26.8h, v30.8h // q2 + q6 bif v0.16b, v18.16b, v15.16b // out p5 add v10.8h, v18.8h, v23.8h // p5 + p0 urshr v5.8h, v12.8h, #4 // out p0 add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) 
sub v8.8h, v8.8h, v10.8h add v10.8h, v27.8h, v30.8h // q3 + q6 bif v1.16b, v19.16b, v15.16b // out p4 add v18.8h, v19.8h, v24.8h // p4 + q0 urshr v6.8h, v12.8h, #4 // out q0 add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) sub v10.8h, v10.8h, v18.8h add v8.8h, v28.8h, v30.8h // q4 + q6 bif v2.16b, v20.16b, v15.16b // out p3 add v18.8h, v20.8h, v25.8h // p3 + q1 urshr v7.8h, v12.8h, #4 // out q1 add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) sub v18.8h, v8.8h, v18.8h add v10.8h, v29.8h, v30.8h // q5 + q6 bif v3.16b, v21.16b, v15.16b // out p2 add v20.8h, v21.8h, v26.8h // p2 + q2 urshr v8.8h, v12.8h, #4 // out q2 add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) sub v10.8h, v10.8h, v20.8h add v18.8h, v30.8h, v30.8h // q6 + q6 bif v4.16b, v22.16b, v15.16b // out p1 add v20.8h, v22.8h, v27.8h // p1 + q3 urshr v9.8h, v12.8h, #4 // out q3 add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) sub v18.8h, v18.8h, v20.8h bif v5.16b, v23.16b, v15.16b // out p0 urshr v10.8h, v12.8h, #4 // out q4 add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) urshr v11.8h, v12.8h, #4 // out q5 bif v6.16b, v24.16b, v15.16b // out q0 bif v7.16b, v25.16b, v15.16b // out q1 bif v8.16b, v26.16b, v15.16b // out q2 bif v9.16b, v27.16b, v15.16b // out q3 bif v10.16b, v28.16b, v15.16b // out q4 bif v11.16b, v29.16b, v15.16b // out q5 .endif mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels mov x14, #(1 << 6) ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels mov x14, #(1 << 4) ret .endif endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_8_wd16 bl lpf_8_wd16_neon cbz x14, 1f tbnz x14, #6, 7f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_8_wd8 bl lpf_8_wd8_neon cbz x14, 1f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon cbz x14, 1f ret x15 1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon cbz x14, 1f ret x15 1: .endm function lpf_v_4_8_neon mov x15, 
x30 sub x16, x0, x1, lsl #1 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 lpf_8_wd4 sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_4_8_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #2 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 add x0, x0, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd4 sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_6_8_neon mov x15, x30 sub x16, x0, x1, lsl #1 sub x16, x16, x1 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 lpf_8_wd6 sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_6_8_neon mov x15, x30 sub x16, x0, #8 add x0, x16, x1, lsl #2 ld1 {v20.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 add x0, x0, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd6 sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, 
v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_8_8_neon mov x15, x30 sub x16, x0, x1, lsl #2 ld1 {v20.8h}, [x16], x1 // p3 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v26.8h}, [x0], x1 // q2 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v27.8h}, [x0], x1 // q3 sub x0, x0, x1, lsl #2 lpf_8_wd8 sub x16, x0, x1, lsl #1 sub x16, x16, x1 st1 {v21.8h}, [x16], x1 // p2 st1 {v24.8h}, [x0], x1 // q0 st1 {v22.8h}, [x16], x1 // p1 st1 {v25.8h}, [x0], x1 // q1 st1 {v23.8h}, [x16], x1 // p0 st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_8_8_neon mov x15, x30 sub x16, x0, #8 add x0, x16, x1, lsl #2 ld1 {v20.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 add x0, x0, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd8 sub x16, x0, x1, lsl #3 sub x16, x16, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v20.8h}, [x16], x1 st1 {v24.8h}, [x0], x1 st1 {v21.8h}, [x16], x1 st1 {v25.8h}, [x0], x1 st1 {v22.8h}, [x16], x1 st1 {v26.8h}, [x0], x1 st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 
{v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_16_8_neon mov x15, x30 sub x16, x0, x1, lsl #3 add x16, x16, x1 ld1 {v17.8h}, [x16], x1 // p6 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v18.8h}, [x16], x1 // p5 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v19.8h}, [x16], x1 // p4 ld1 {v26.8h}, [x0], x1 // q2 ld1 {v20.8h}, [x16], x1 // p3 ld1 {v27.8h}, [x0], x1 // q3 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v28.8h}, [x0], x1 // q4 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v29.8h}, [x0], x1 // q5 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v30.8h}, [x0], x1 // q6 sub x0, x0, x1, lsl #3 add x0, x0, x1 lpf_8_wd16 sub x16, x0, x1, lsl #2 sub x16, x16, x1, lsl #1 st1 {v0.8h}, [x16], x1 // p5 st1 {v6.8h}, [x0], x1 // q0 st1 {v1.8h}, [x16], x1 // p4 st1 {v7.8h}, [x0], x1 // q1 st1 {v2.8h}, [x16], x1 // p3 st1 {v8.8h}, [x0], x1 // q2 st1 {v3.8h}, [x16], x1 // p2 st1 {v9.8h}, [x0], x1 // q3 st1 {v4.8h}, [x16], x1 // p1 st1 {v10.8h}, [x0], x1 // q4 st1 {v5.8h}, [x16], x1 // p0 st1 {v11.8h}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 st1 {v21.8h}, [x16], x1 // p2 st1 {v24.8h}, [x0], x1 // q0 st1 {v22.8h}, [x16], x1 // p1 st1 {v25.8h}, [x0], x1 // q1 st1 {v23.8h}, [x16], x1 // p0 st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_16_8_neon mov x15, x30 sub x16, x0, #16 ld1 {v16.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v17.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v18.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v19.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 ld1 {v20.8h}, [x16], x1 ld1 {v28.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v29.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v30.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v31.8h}, [x0], x1 transpose_8x8h v16, v17, 
v18, v19, v20, v21, v22, v23, v0, v1 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 lpf_8_wd16 sub x0, x0, x1, lsl #3 sub x16, x0, #16 transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 st1 {v16.8h}, [x16], x1 st1 {v6.8h}, [x0], x1 st1 {v17.8h}, [x16], x1 st1 {v7.8h}, [x0], x1 st1 {v0.8h}, [x16], x1 st1 {v8.8h}, [x0], x1 st1 {v1.8h}, [x16], x1 st1 {v9.8h}, [x0], x1 st1 {v2.8h}, [x16], x1 st1 {v10.8h}, [x0], x1 st1 {v3.8h}, [x16], x1 st1 {v11.8h}, [x0], x1 st1 {v4.8h}, [x16], x1 st1 {v30.8h}, [x0], x1 st1 {v5.8h}, [x16], x1 st1 {v31.8h}, [x0], x1 ret x15 7: sub x16, x0, x1, lsl #3 sub x16, x16, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v20.8h}, [x16], x1 st1 {v24.8h}, [x0], x1 st1 {v21.8h}, [x16], x1 st1 {v25.8h}, [x0], x1 st1 {v22.8h}, [x16], x1 st1 {v26.8h}, [x0], x1 st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc // void dav2d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av2FilterLUT *lut, const int w, // const int bitdepth_max) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 mov x11, x30 mov w8, w7 // bitdepth_max clz w9, w8 mov w10, #24 sub w9, w10, w9 // bitdepth_min_8 stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] ldp w6, w7, [x2] // vmask[0], vmask[1] .ifc \type, y ldr w2, [x2, #8] // vmask[2] .endif add x5, x5, #128 // Move to sharp part of lut .ifc \type, y orr w7, w7, w2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub x4, x3, x4, lsl #2 .else sub x3, x3, #4 lsl x4, x4, #2 .endif orr w6, w6, w7 // vmask[0] |= vmask[1] 1: tst w6, #0x03 .ifc \dir, v ld1 {v0.8b}, [x4], #8 ld1 {v1.8b}, [x3], #8 .else ld2 {v0.s,v1.s}[0], [x3], x4 ld2 {v0.s,v1.s}[1], [x3], x4 .endif b.eq 7f // if (!(vm & bits)) continue; ld1r {v5.8b}, [x5] // sharp[0] add x5, x5, #8 movi v2.2s, #0xff dup v13.2s, w6 // vmask[0] dup v31.8h, w9 // bitdepth_min_8 and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word and v1.8b, v1.8b, v2.8b cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0] movi v4.8b, #1 ld1r {v6.8b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] cmtst v2.2s, v1.2s, v2.2s // L != 0 mul v1.2s, v1.2s, v4.2s // L .ifc \type, y dup v15.2s, w2 // vmask[2] .endif dup v14.2s, w7 // vmask[1] mov x16, v2.d[0] cmp x16, #0 b.eq 7f // if (!L) continue; neg v5.8b, v5.8b // -sharp[0] movrel x16, word_12 ushr v12.8b, v1.8b, #4 // H ld1 {v16.2s}, [x16] sshl v3.8b, v1.8b, v5.8b // L >> sharp[0] .ifc \type, y cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits) .endif movi v7.8b, #2 umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1]) add v0.8b, v1.8b, v7.8b // L + 2 umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I add v0.8b, v0.8b, v0.8b // 2*(L + 2) cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits) uxtl v12.8h, v12.8b add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits) uxtl v11.8h, v11.8b uxtl v10.8h, v10.8b and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0 sxtl v14.8h, v14.8b sxtl v13.8h, v13.8b .ifc \type, y sxtl v15.8h, v15.8b .endif ushl v12.8h, v12.8h, v31.8h ushl v11.8h, v11.8h, v31.8h ushl 
v10.8h, v10.8h, v31.8h .ifc \type, y tst w2, #0x03 b.eq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif tst w7, #0x03 b.eq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_8_neon .else // wd6 bl lpf_\dir\()_6_8_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_8_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment x0. // If the whole function is skipped, increment it here instead. add x0, x0, x1, lsl #3 .else 7: .endif 8: lsr w6, w6, #2 // vmask[0] >>= 2 lsr w7, w7, #2 // vmask[1] >>= 2 .ifc \type, y lsr w2, w2, #2 // vmask[2] >>= 2 .endif .ifc \dir, v add x0, x0, #16 .else // For dir h, x0 is returned incremented .endif cbnz w6, 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x11 endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_12 .word 1, 2 endconst dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/mc.S000066400000000000000000003705451517466257200224040ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * Copyright © 2018, Martin Storsjo * Copyright © 2026, Arm Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" .macro avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 ld1 {\t2\().8h,\t3\().8h}, [x3], #32 add \t0\().8h, \t0\().8h, \t2\().8h add \t1\().8h, \t1\().8h, \t3\().8h sqrshrun \dst\().8b, \t0\().8h, #5 sqrshrun2 \dst\().16b, \t1\().8h, #5 .endm .macro w_avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 ld1 {\t2\().8h,\t3\().8h}, [x3], #32 sub \t2\().8h, \t0\().8h, \t2\().8h sub \t3\().8h, \t1\().8h, \t3\().8h sqdmulh \t2\().8h, \t2\().8h, v30.8h sqdmulh \t3\().8h, \t3\().8h, v30.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro mask dst, t0, t1, t2, t3 ld1 {v30.16b}, [x6], #16 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 mul v30.16b, v30.16b, v31.16b ld1 {\t2\().8h,\t3\().8h}, [x3], #32 shll v28.8h, v30.8b, #8 shll2 v29.8h, v30.16b, #8 sub \t0\().8h, \t2\().8h, \t0\().8h sub \t1\().8h, \t3\().8h, \t1\().8h sqdmulh \t0\().8h, \t0\().8h, v28.8h sqdmulh \t1\().8h, \t1\().8h, v29.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro bidir_fn type function \type\()_8bpc_neon, export=1 clz w4, w4 .ifc \type, w_avg // (a * weight + b * (16 - weight) + 128) >> 8 // = ((a - b) * weight + (b << 4) + 128) >> 8 // = ((a - b) * (weight-16) + (a << 4) + 128) >> 8 // = ((b - a) * -weight + (b 
<< 4) + 128) >> 8 // = ((((b - a) * (-weight << 11) * 2) >> 16) + b + 8) >> 4 // = ((((a - b) * ((weight-16) << 11) * 2) >> 16) + a + 8) >> 4 movi v31.8h, #16 cmp w6, #16 dup v30.8h, w6 b.lt 1f // if weight >= 16 sub v30.8h, v30.8h, v31.8h // weight - 16 b 2f 1: mov x7, x2 // swap tmp1/tmp2 mov x2, x3 mov x3, x7 neg v30.8h, v30.8h // -weight 2: shl v30.8h, v30.8h, #11 .endif .ifc \type, mask movi v31.16b, #256-2 .endif movrel x7, \type\()_tbl sub w4, w4, #25 ldrsw x4, [x7, x4, lsl #2] \type v4, v0, v1, v2, v3 add x7, x7, x4 br x7 40: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: subs w5, w5, #4 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 b.le 0f \type v5, v0, v1, v2, v3 subs w5, w5, #4 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 41: subs w5, w5, #8 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 \type v5, v0, v1, v2, v3 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 41b 80: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: st1 {v4.8b}, [x0], x1 \type v5, v0, v1, v2, v3 st1 {v4.d}[1], [x7], x1 st1 {v5.8b}, [x0], x1 subs w5, w5, #4 st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 8b 160: AARCH64_VALID_JUMP_TARGET 16: \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 \type v6, v0, v1, v2, v3 st1 {v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 st1 {v6.16b}, [x0], x1 subs w5, w5, #4 st1 {v7.16b}, [x0], x1 b.le 0f \type v4, v0, v1, v2, v3 b 16b 320: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 32: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 st1 {v4.16b,v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 subs w5, w5, #2 st1 {v6.16b,v7.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 32b 640: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 64: \type v5, v0, v1, v2, 
v3 \type v6, v0, v1, v2, v3 \type v7, v0, v1, v2, v3 \type v16, v0, v1, v2, v3 \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 \type v18, v0, v1, v2, v3 \type v19, v0, v1, v2, v3 subs w5, w5, #2 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 64b 0: ret endfunc jumptable \type\()_tbl .word 640b - \type\()_tbl .word 320b - \type\()_tbl .word 160b - \type\()_tbl .word 80b - \type\()_tbl .word 40b - \type\()_tbl endjumptable .endm bidir_fn avg bidir_fn w_avg bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 ldr w11, [sp] // sign clz w8, w4 movrel x9, w_mask_\type\()_tbl sub w8, w8, #25 ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 mov w10, #6903 dup v0.8h, w10 .if \type == 444 movi v1.16b, #64 .elseif \type == 422 dup v2.8b, w11 movi v3.8b, #129 sub v3.8b, v3.8b, v2.8b .elseif \type == 420 dup v2.8h, w11 movi v3.8h, #1, lsl #8 sub v3.8h, v3.8h, v2.8h .endif add x12, x0, x1 lsl x1, x1, #1 br x9 40: AARCH64_VALID_JUMP_TARGET 4: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v18.2d, v19.2d trn2 v25.2d, v18.2d, v19.2d add v24.8h, v24.8h, v25.8h addp v18.8h, v24.8h, v24.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 
str s18, [x6], #4 .endif st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x12], x1 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x12], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 subs w5, w5, #2 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 add v18.8h, v18.8h, v19.8h addp v18.8h, v18.8h, v18.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 str s18, [x6], #4 .endif st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x12], x1 b.gt 8b ret 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw .if \type == 444 add x10, x6, x7 lsl x7, x7, #1 sub x7, x7, w4, uxtw .elseif \type == 422 uxtw x4, w4 add x10, x6, x7 lsl x7, x7, #1 sub x7, x7, x4, lsr #1 .elseif \type == 420 uxtw x4, w4 sub x7, x7, x4, lsr #1 .endif add x9, x3, w4, uxtw #1 add x13, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 ld1 {v16.8h, v17.8h}, [x13], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sub v6.8h, v6.8h, v4.8h sub v7.8h, v7.8h, v5.8h sub v18.8h, v18.8h, v16.8h sub v19.8h, v19.8h, v17.8h abs v20.8h, v6.8h abs v21.8h, v7.8h abs v22.8h, v18.8h abs v23.8h, v19.8h uqsub v20.8h, v0.8h, v20.8h uqsub v21.8h, v0.8h, v21.8h uqsub v22.8h, v0.8h, v22.8h uqsub v23.8h, v0.8h, v23.8h ushr v20.8h, v20.8h, #8 ushr v21.8h, v21.8h, #8 ushr v22.8h, v22.8h, #8 ushr v23.8h, 
v23.8h, #8 shl v24.8h, v20.8h, #9 shl v25.8h, v21.8h, #9 shl v26.8h, v22.8h, #9 shl v27.8h, v23.8h, #9 sqdmulh v24.8h, v24.8h, v6.8h sqdmulh v25.8h, v25.8h, v7.8h sqdmulh v26.8h, v26.8h, v18.8h sqdmulh v27.8h, v27.8h, v19.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v16.8h add v27.8h, v27.8h, v17.8h sqrshrun v24.8b, v24.8h, #4 sqrshrun v25.8b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun v27.8b, v27.8h, #4 .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 uzp1 v21.16b, v22.16b, v23.16b // Ditto sub v20.16b, v1.16b, v20.16b sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b uhsub v21.8b, v3.8b, v21.8b st1 {v20.8b}, [x6], #8 st1 {v21.8b}, [x10], #8 .elseif \type == 420 add v20.8h, v20.8h, v22.8h add v21.8h, v21.8h, v23.8h addp v20.8h, v20.8h, v21.8h sub v20.8h, v3.8h, v20.8h rshrn v20.8b, v20.8h, #2 st1 {v20.8b}, [x6], #8 .endif st1 {v24.8b, v25.8b}, [x0], #16 st1 {v26.8b, v27.8b}, [x12], #16 b.gt 16b subs w5, w5, #2 add x2, x2, w4, uxtw #1 add x3, x3, w4, uxtw #1 add x13, x13, w4, uxtw #1 add x9, x9, w4, uxtw #1 add x6, x6, x7 .if \type == 444 add x10, x10, x7 .elseif \type == 422 add x10, x10, x7 .endif add x0, x0, x1 add x12, x12, x1 b.gt 161b ret endfunc jumptable w_mask_\type\()_tbl .word 640b - w_mask_\type\()_tbl .word 320b - w_mask_\type\()_tbl .word 160b - w_mask_\type\()_tbl .word 80b - w_mask_\type\()_tbl .word 40b - w_mask_\type\()_tbl endjumptable .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_8bpc_neon, export=1 movrel x6, blend_tbl clz w3, w3 sub w3, w3, #25 ldrsw x3, [x6, x3, lsl #2] add x6, x6, x3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 br x6 40: AARCH64_VALID_JUMP_TARGET 4: ld1 {v2.8b}, [x5], #8 ldr d1, [x2], #8 ldr s0, [x0] subs w4, w4, #2 ld1 {v0.s}[1], [x8] sub v3.8b, v4.8b, v2.8b umull v5.8h, v1.8b, 
v2.8b umlal v5.8h, v0.8b, v3.8b rshrn v6.8b, v5.8h, #6 st1 {v6.s}[0], [x0], x1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ld1 {v2.16b}, [x5], #16 ld1 {v1.16b}, [x2], #16 ldr d0, [x0] ld1 {v0.d}[1], [x8] sub v3.16b, v4.16b, v2.16b subs w4, w4, #2 umull v5.8h, v1.8b, v2.8b umlal v5.8h, v0.8b, v3.8b umull2 v6.8h, v1.16b, v2.16b umlal2 v6.8h, v0.16b, v3.16b rshrn v7.8b, v5.8h, #6 rshrn v16.8b, v6.8h, #6 st1 {v7.8b}, [x0], x1 st1 {v16.8b}, [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET 16: ld1 {v1.16b, v2.16b}, [x5], #32 ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v0.16b}, [x0] subs w4, w4, #2 sub v7.16b, v4.16b, v1.16b sub v20.16b, v4.16b, v2.16b ld1 {v3.16b}, [x8] umull v16.8h, v5.8b, v1.8b umlal v16.8h, v0.8b, v7.8b umull2 v17.8h, v5.16b, v1.16b umlal2 v17.8h, v0.16b, v7.16b umull v21.8h, v6.8b, v2.8b umlal v21.8h, v3.8b, v20.8b umull2 v22.8h, v6.16b, v2.16b umlal2 v22.8h, v3.16b, v20.16b rshrn v18.8b, v16.8h, #6 rshrn2 v18.16b, v17.8h, #6 rshrn v19.8b, v21.8h, #6 rshrn2 v19.16b, v22.8h, #6 st1 {v18.16b}, [x0], x1 st1 {v19.16b}, [x8], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b}, [x0] subs w4, w4, #2 ld1 {v22.16b, v23.16b}, [x8] sub v5.16b, v4.16b, v0.16b sub v6.16b, v4.16b, v1.16b sub v30.16b, v4.16b, v2.16b sub v31.16b, v4.16b, v3.16b umull v24.8h, v16.8b, v0.8b umlal v24.8h, v20.8b, v5.8b umull2 v26.8h, v16.16b, v0.16b umlal2 v26.8h, v20.16b, v5.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v21.8b, v6.8b umull2 v7.8h, v17.16b, v1.16b umlal2 v7.8h, v21.16b, v6.16b umull v27.8h, v18.8b, v2.8b umlal v27.8h, v22.8b, v30.8b umull2 v1.8h, v18.16b, v2.16b umlal2 v1.8h, v22.16b, v30.16b umull v29.8h, v19.8b, v3.8b umlal v29.8h, v23.8b, v31.8b umull2 v21.8h, v19.16b, v3.16b umlal2 v21.8h, v23.16b, v31.16b rshrn v24.8b, v24.8h, #6 rshrn2 v24.16b, v26.8h, #6 rshrn v25.8b, v28.8h, #6 rshrn2 v25.16b, v7.8h, #6 rshrn v27.8b, 
v27.8h, #6 rshrn2 v27.16b, v1.8h, #6 rshrn v28.8b, v29.8h, #6 rshrn2 v28.16b, v21.8h, #6 st1 {v24.16b, v25.16b}, [x0], x1 st1 {v27.16b, v28.16b}, [x8], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET // Reset the destination stride, which is doubled in the prologue, // for the other cases where we write alternating rows with two // pointers. lsr x1, x1, #1 64: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x0] subs w4, w4, #1 sub v5.16b, v4.16b, v0.16b sub v6.16b, v4.16b, v1.16b sub v30.16b, v4.16b, v2.16b sub v31.16b, v4.16b, v3.16b umull v24.8h, v16.8b, v0.8b umlal v24.8h, v20.8b, v5.8b umull2 v26.8h, v16.16b, v0.16b umlal2 v26.8h, v20.16b, v5.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v21.8b, v6.8b umull2 v7.8h, v17.16b, v1.16b umlal2 v7.8h, v21.16b, v6.16b umull v27.8h, v18.8b, v2.8b umlal v27.8h, v22.8b, v30.8b umull2 v1.8h, v18.16b, v2.16b umlal2 v1.8h, v22.16b, v30.16b umull v29.8h, v19.8b, v3.8b umlal v29.8h, v23.8b, v31.8b umull2 v21.8h, v19.16b, v3.16b umlal2 v21.8h, v23.16b, v31.16b rshrn v24.8b, v24.8h, #6 rshrn2 v24.16b, v26.8h, #6 rshrn v25.8b, v28.8h, #6 rshrn2 v25.16b, v7.8h, #6 rshrn v26.8b, v27.8h, #6 rshrn2 v26.16b, v1.8h, #6 rshrn v27.8b, v29.8h, #6 rshrn2 v27.16b, v21.8h, #6 st1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x0], x1 b.gt 64b ret endfunc jumptable blend_tbl .word 640b - blend_tbl .word 320b - blend_tbl .word 160b - blend_tbl .word 80b - blend_tbl .word 40b - blend_tbl endjumptable // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). 
// put_neon: width-dispatched plain row copy (no filtering).
//
// Register usage, as read from the body below:
//   x0 = destination pointer, x1 = destination stride (bytes)
//   x2 = source pointer,      x3 = source stride (bytes)
//   w5 = number of rows left to copy
//   x8 = index into put_tbl on entry (0 selects the 64-byte case, 5 the
//        2-byte case); overwritten with the loaded table offset
//   x9, x10 = scratch (jump target, then row data in the 2/4/8-byte cases)
//   v0-v3   = scratch row data in the 16/32/64-byte cases
//
// Every loop body runs at least once and repeats while the decremented row
// count is still > 0; all cases except the 64-byte one copy two rows per
// iteration.
//
// NOTE(review): put_tbl starts with the 64-wide entry, so with a 32-bit clz
// w == 64 corresponds to clz(w)-25 == 0. The "clz(w)-24" wording in the
// comment preceding this function may be stale - confirm against callers.
function put_neon, export=1
        movrel x9, put_tbl
        ldrsw x8, [x9, x8, lsl #2] // signed 32-bit offset of the case, relative to put_tbl
        add x9, x9, x8 // absolute address of the selected case
        br x9

20: // 2 bytes per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET // macro defined outside this chunk; presumably the landing pad required for the indirect branch above
2:
        ldrh w9, [x2]
        ldrh w10, [x2, x3]
        add x2, x2, x3, lsl #1 // src += 2 * src_stride
        subs w5, w5, #2 // sets the flags consumed by b.gt below
        strh w9, [x0]
        strh w10, [x0, x1]
        add x0, x0, x1, lsl #1 // dst += 2 * dst_stride (add does not touch the flags)
        b.gt 2b
        ret

40: // 4 bytes per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
4:
        ldr w9, [x2]
        ldr w10, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str w9, [x0]
        str w10, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 4b
        ret

80: // 8 bytes per row, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
8:
        ldr x9, [x2]
        ldr x10, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str x9, [x0]
        str x10, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 8b
        ret

160: // 16 bytes per row (one q register), two rows per iteration
        AARCH64_VALID_JUMP_TARGET
16:
        ldr q0, [x2]
        ldr q1, [x2, x3]
        add x2, x2, x3, lsl #1
        subs w5, w5, #2
        str q0, [x0]
        str q1, [x0, x1]
        add x0, x0, x1, lsl #1
        b.gt 16b
        ret

320: // 32 bytes per row (one q register pair), two rows per iteration
        AARCH64_VALID_JUMP_TARGET
32:
        ldp q0, q1, [x2]
        add x2, x2, x3
        stp q0, q1, [x0]
        add x0, x0, x1
        ldp q2, q3, [x2]
        add x2, x2, x3
        stp q2, q3, [x0]
        subs w5, w5, #2
        add x0, x0, x1
        b.gt 32b
        ret

640: // 64 bytes per row (two q register pairs), one row per iteration
        AARCH64_VALID_JUMP_TARGET
64:
        ldp q0, q1, [x2]
        stp q0, q1, [x0]
        ldp q2, q3, [x2, #32]
        add x2, x2, x3
        stp q2, q3, [x0, #32]
        subs w5, w5, #1
        add x0, x0, x1
        b.gt 64b
        ret
endfunc

// Offsets of the width cases relative to put_tbl itself, widest first.
// put_neon loads entry x8 with ldrsw and adds it to the table address.
jumptable put_tbl
        .word 640b - put_tbl
        .word 320b - put_tbl
        .word 160b - put_tbl
        .word 80b - put_tbl
        .word 40b - put_tbl
        .word 20b - put_tbl
endjumptable

// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
// NOTE(review): the prep_neon body that follows never reads x7 and its table
// starts at the 64-wide entry, so both assumptions above look like leftovers -
// confirm against the callers before relying on them.
function prep_neon, export=1 movrel x9, prep_tbl ldrsw x8, [x9, x8, lsl #2] movi v24.16b, #16 add x9, x9, x8 br x9 40: AARCH64_VALID_JUMP_TARGET 4: ldr s0, [x2] ldr s2, [x2, x3] add x2, x2, x3, lsl #1 ldr s1, [x2] ldr s3, [x2, x3] add x2, x2, x3, lsl #1 mov v0.s[1], v2.s[0] mov v1.s[1], v3.s[0] ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 subs w5, w5, #4 mov v2.d[0], v0.d[1] mov v3.d[0], v1.d[1] str d0, [x0] str d2, [x0, x1] add x0, x0, x1, lsl #1 str d1, [x0] str d3, [x0, x1] add x0, x0, x1, lsl #1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ldr d0, [x2] ldr d1, [x2, x3] add x2, x2, x3, lsl #1 ldr d2, [x2] ldr d3, [x2, x3] add x2, x2, x3, lsl #1 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 umull v2.8h, v2.8b, v24.8b umull v3.8h, v3.8b, v24.8b subs w5, w5, #4 str q0, [x0] str q1, [x0, x1] add x0, x0, x1, lsl #1 str q2, [x0] str q3, [x0, x1] add x0, x0, x1, lsl #1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET 16: ldr q1, [x2] ldr q3, [x2, x3] add x2, x2, x3, lsl #1 ushll v0.8h, v1.8b, #4 ushll2 v1.8h, v1.16b, #4 ldr q5, [x2] ldr q7, [x2, x3] add x2, x2, x3, lsl #1 umull v2.8h, v3.8b, v24.8b umull2 v3.8h, v3.16b, v24.16b ushll v4.8h, v5.8b, #4 ushll2 v5.8h, v5.16b, #4 umull v6.8h, v7.8b, v24.8b umull2 v7.8h, v7.16b, v24.16b subs w5, w5, #4 stp q0, q1, [x0] add x0, x0, x1 stp q2, q3, [x0] add x0, x0, x1 stp q4, q5, [x0] add x0, x0, x1 stp q6, q7, [x0] add x0, x0, x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: ldp q4, q5, [x2] add x2, x2, x3 ldp q6, q7, [x2] add x2, x2, x3 ushll v0.8h, v4.8b, #4 ushll2 v1.8h, v4.16b, #4 umull v2.8h, v5.8b, v24.8b umull2 v3.8h, v5.16b, v24.16b ushll v4.8h, v6.8b, #4 ushll2 v5.8h, v6.16b, #4 umull v6.8h, v7.8b, v24.8b umull2 v7.8h, v7.16b, v24.16b subs w5, w5, #2 stp q0, q1, [x0] stp q2, q3, [x0, #32] add x0, x0, x1 stp q4, q5, [x0] stp q6, q7, [x0, #32] add x0, x0, x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET 64: ldp q4, q5, [x2] ldp q6, q7, [x2, #32] add x2, x2, x3 ushll v0.8h, v4.8b, #4 ushll2 v1.8h, v4.16b, #4 umull v2.8h, 
v5.8b, v24.8b umull2 v3.8h, v5.16b, v24.16b ushll v4.8h, v6.8b, #4 ushll2 v5.8h, v6.16b, #4 umull v6.8h, v7.8b, v24.8b umull2 v7.8h, v7.16b, v24.16b subs w5, w5, #1 stp q0, q1, [x0] stp q2, q3, [x0, #32] stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x0, x0, x1 b.gt 64b ret endfunc jumptable prep_tbl .word 640b - prep_tbl .word 320b - prep_tbl .word 160b - prep_tbl .word 80b - prep_tbl .word 40b - prep_tbl endjumptable .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd ld1 {\d1\wd}[0], [\s1], \strd .ifnb \d2 ld1 {\d2\wd}[0], [\s0], \strd ld1 {\d3\wd}[0], [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}[0], [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}[0], [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}[0], [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}, [\s0], \strd ld1 {\d1\wd}, [\s1], \strd .ifnb \d2 ld1 {\d2\wd}, [\s0], \strd ld1 {\d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}, [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}, [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}, [\s0], \strd .endif .endm .macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro interleave_1 wd, r0, r1, r2, r3, r4 trn1 \r0\wd, \r0\wd, \r1\wd trn1 \r1\wd, \r1\wd, \r2\wd .ifnb \r3 trn1 \r2\wd, \r2\wd, \r3\wd trn1 \r3\wd, \r3\wd, \r4\wd .endif .endm .macro interleave_1_h r0, r1, r2, r3, r4 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_1_s r0, r1, r2, r3, r4 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_2 wd, r0, r1, r2, 
r3, r4, r5 trn1 \r0\wd, \r0\wd, \r2\wd trn1 \r1\wd, \r1\wd, \r3\wd trn1 \r2\wd, \r2\wd, \r4\wd trn1 \r3\wd, \r3\wd, \r5\wd .endm .macro interleave_2_s r0, r1, r2, r3, r4, r5 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 .endm .macro uxtl_b r0, r1, r2, r3, r4, r5, r6 uxtl \r0\().8h, \r0\().8b uxtl \r1\().8h, \r1\().8b .ifnb \r2 uxtl \r2\().8h, \r2\().8b uxtl \r3\().8h, \r3\().8b .endif .ifnb \r4 uxtl \r4\().8h, \r4\().8b .endif .ifnb \r5 uxtl \r5\().8h, \r5\().8b .endif .ifnb \r6 uxtl \r6\().8h, \r6\().8b .endif .endm .macro mul_mla_4tap d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] mla \d\wd, \s3\wd, v0.h[3] .endm // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. .macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] mla \d0\().4h, \s3\().4h, v0.h[3] mla \d0\().4h, \s4\().4h, v0.h[4] mla \d0\().4h, \s5\().4h, v0.h[5] mla \d0\().4h, \s6\().4h, v0.h[6] .endm .macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] .endm .macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mul \d1\().8h, \s2\().8h, v0.h[1] mla \d1\().8h, \s3\().8h, v0.h[2] mla \d1\().8h, \s4\().8h, v0.h[3] mla \d1\().8h, \s5\().8h, v0.h[4] mla \d1\().8h, \s6\().8h, v0.h[5] mla \d1\().8h, \s7\().8h, v0.h[6] .endm .macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, 
\s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mul \d1\().8h, \s3\().8h, v0.h[1] mla \d1\().8h, \s4\().8h, v0.h[2] mla \d1\().8h, \s5\().8h, v0.h[3] mla \d1\().8h, \s6\().8h, v0.h[4] mla \d1\().8h, \s7\().8h, v0.h[5] mla \d1\().8h, \s8\().8h, v0.h[6] .endm .macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] mla \d0\().4h, \s3\().4h, v0.h[3] mla \d0\().4h, \s4\().4h, v0.h[4] mla \d0\().4h, \s5\().4h, v0.h[5] mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm .macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm .macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s1\().8h, v0.h[0] mla \d1\().8h, \s2\().8h, v0.h[1] mla \d1\().8h, \s3\().8h, v0.h[2] mla \d1\().8h, \s4\().8h, v0.h[3] mla \d1\().8h, \s5\().8h, v0.h[4] mla \d1\().8h, \s6\().8h, v0.h[5] mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm .macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s2\().8h, v0.h[0] mla 
\d1\().8h, \s3\().8h, v0.h[1] mla \d1\().8h, \s4\().8h, v0.h[2] mla \d1\().8h, \s5\().8h, v0.h[3] mla \d1\().8h, \s6\().8h, v0.h[4] mla \d1\().8h, \s7\().8h, v0.h[5] mla \d1\().8h, \s8\().8h, v0.h[6] mla \d1\().8h, \s9\().8h, v0.h[7] .endm .macro sqrshrun_b shift, r0, r1, r2, r3 sqrshrun \r0\().8b, \r0\().8h, #\shift .ifnb \r1 sqrshrun \r1\().8b, \r1\().8h, #\shift .endif .ifnb \r2 sqrshrun \r2\().8b, \r2\().8h, #\shift sqrshrun \r3\().8b, \r3\().8h, #\shift .endif .endm .macro srshr_h shift, r0, r1, r2, r3 srshr \r0\().8h, \r0\().8h, #\shift .ifnb \r1 srshr \r1\().8h, \r1\().8h, #\shift .endif .ifnb \r2 srshr \r2\().8h, \r2\().8h, #\shift srshr \r3\().8h, \r3\().8h, #\shift .endif .endm .macro st_h strd, reg, lanes st1 {\reg\().h}[0], [x0], \strd st1 {\reg\().h}[1], [x8], \strd .if \lanes > 2 st1 {\reg\().h}[2], [x0], \strd st1 {\reg\().h}[3], [x8], \strd .endif .endm .macro st_s strd, r0, r1 st1 {\r0\().s}[0], [x0], \strd st1 {\r0\().s}[1], [x8], \strd .ifnb \r1 st1 {\r1\().s}[0], [x0], \strd st1 {\r1\().s}[1], [x8], \strd .endif .endm .macro st_d strd, r0, r1 st1 {\r0\().8b}, [x0], \strd st1 {\r0\().d}[1], [x8], \strd .ifnb \r1 st1 {\r1\().8b}, [x0], \strd st1 {\r1\().d}[1], [x8], \strd .endif .endm .macro shift_store_4 type, strd, r0, r1 .ifc \type, put sqrshrun_b 6, \r0, \r1 st_s \strd, \r0, \r1 .else srshr_h 2, \r0, \r1 st_d \strd, \r0, \r1 .endif .endm .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 st1 {\r0\wd}, [x0], \strd st1 {\r1\wd}, [x8], \strd .ifnb \r2 st1 {\r2\wd}, [x0], \strd st1 {\r3\wd}, [x8], \strd .endif .ifnb \r4 st1 {\r4\wd}, [x0], \strd st1 {\r5\wd}, [x8], \strd st1 {\r6\wd}, [x0], \strd st1 {\r7\wd}, [x8], \strd .endif .endm .macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro shift_store_8 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_b 6, \r0, 
\r1, \r2, \r3 st_8b \strd, \r0, \r1, \r2, \r3 .else srshr_h 2, \r0, \r1, \r2, \r3 st_16b \strd, \r0, \r1, \r2, \r3 .endif .endm .macro shift_store_16 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun \r0\().8b, \r0\().8h, #6 sqrshrun2 \r0\().16b, \r1\().8h, #6 sqrshrun \r2\().8b, \r2\().8h, #6 sqrshrun2 \r2\().16b, \r3\().8h, #6 st_16b \strd, \r0, \r2 .else srshr_h 2, \r0, \r1, \r2, \r3 st1 {\r0\().8h, \r1\().8h}, [x0], \strd st1 {\r2\().8h, \r3\().8h}, [x8], \strd .endif .endm .macro make_8tap_fn op, name, type, taps function \op\()_8tap_\name\()_8bpc_neon, export=1 mov x8, \type b \op\()_\taps\()_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. #define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps function \type\()_\taps\()_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h add \my, \my, w8 // my, 8tap_v, 4tap_v .ifc \type, prep lsl \d_strd, \d_strd, #1 .endif clz w8, \w tst \mx, #(0x7f << 14) sub w8, w8, #25 movrel x10, X(mc_subpel_filters), -8 b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) b.ne L(\type\()_\taps\()_v) b \type\()_neon L(\type\()_\taps\()_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w9 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 b.ne L(\type\()_\taps\()_hv) movrel x9, \type\()_\taps\()_h_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put ldur s0, [\xmx, #2] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd uxtl v4.8h, v4.8b uxtl v6.8h, v6.8b ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, 
v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s mul v3.4h, v3.4h, v0.h[0] mla v3.4h, v4.4h, v0.h[1] mla v3.4h, v6.4h, v0.h[2] mla v3.4h, v7.4h, v0.h[3] srshr v3.4h, v3.4h, #2 sqrshrun v3.8b, v3.8h, #4 st1 {v3.h}[0], [\dst], \d_strd st1 {v3.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET ldur s0, [\xmx, #2] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8b}, [\src], \s_strd ld1 {v20.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v20.8h, v20.8b ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 mul v16.4h, v16.4h, v0.h[0] mla v16.4h, v17.4h, v0.h[1] mla v16.4h, v18.4h, v0.h[2] mla v16.4h, v19.4h, v0.h[3] mul v20.4h, v20.4h, v0.h[0] mla v20.4h, v21.4h, v0.h[1] mla v20.4h, v22.4h, v0.h[2] mla v20.4h, v23.4h, v0.h[3] srshr v16.4h, v16.4h, #2 srshr v20.4h, v20.4h, #2 .ifc \type, put sqrshrun v16.8b, v16.8h, #4 sqrshrun v20.8b, v20.8h, #4 str s16, [\dst] str s20, [\ds2] add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd .else st1 {v16.4h}, [\dst], \d_strd st1 {v20.4h}, [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] .ifc \taps, 6tap sub \src, \src, #2 .else sub \src, \src, #3 .endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 8: ld1 {v16.8b, v17.8b}, [\src], \s_strd ld1 {v20.8b, v21.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b .ifc \taps, 6tap mul v18.8h, v16.8h, v0.h[1] mul v22.8h, v20.8h, v0.h[1] .irpc i, 23456 ext v19.16b, v16.16b, v17.16b, #(2*\i-2) ext v23.16b, v20.16b, v21.16b, #(2*\i-2) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr .else // 8tap mul v18.8h, v16.8h, 
v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 ext v19.16b, v16.16b, v17.16b, #(2*\i) ext v23.16b, v20.16b, v21.16b, #(2*\i) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr .endif subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 .ifc \type, put sqrshrun v18.8b, v18.8h, #4 sqrshrun v22.8b, v22.8h, #4 st1 {v18.8b}, [\dst], \d_strd st1 {v22.8b}, [\ds2], \d_strd .else st1 {v18.8h}, [\dst], \d_strd st1 {v22.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: // 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] .ifc \taps, 6tap sub \src, \src, #2 .else sub \src, \src, #3 .endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 lsl \d_strd, \d_strd, #1 .ifc \type, put sub \d_strd, \d_strd, \w, uxtw .else sub \d_strd, \d_strd, \w, uxtw #1 .endif 161: ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 mov \mx, \w uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b 16: .ifc \taps, 6tap mul v24.8h, v16.8h, v0.h[1] mul v25.8h, v17.8h, v0.h[1] mul v26.8h, v20.8h, v0.h[1] mul v27.8h, v21.8h, v0.h[1] .irpc i, 23456 ext v28.16b, v16.16b, v17.16b, #(2*\i-2) ext v29.16b, v17.16b, v18.16b, #(2*\i-2) ext v30.16b, v20.16b, v21.16b, #(2*\i-2) ext v31.16b, v21.16b, v22.16b, #(2*\i-2) mla v24.8h, v28.8h, v0.h[\i] mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr .else // 8tap mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] mul v27.8h, v21.8h, v0.h[0] .irpc i, 1234567 ext v28.16b, v16.16b, v17.16b, #(2*\i) ext v29.16b, v17.16b, v18.16b, #(2*\i) ext v30.16b, v20.16b, v21.16b, #(2*\i) ext v31.16b, v21.16b, v22.16b, #(2*\i) mla v24.8h, v28.8h, v0.h[\i] mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr .endif srshr v24.8h, 
v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 srshr v27.8h, v27.8h, #2 subs \mx, \mx, #16 .ifc \type, put sqrshrun v24.8b, v24.8h, #4 sqrshrun2 v24.16b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun2 v26.16b, v27.8h, #4 st1 {v24.16b}, [\dst], #16 st1 {v26.16b}, [\ds2], #16 .else st1 {v24.8h, v25.8h}, [\dst], #32 st1 {v26.8h, v27.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b ld1 {v17.8b, v18.8b}, [\src], #16 ld1 {v21.8b, v22.8b}, [\sr2], #16 uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret endfunc jumptable \type\()_\taps\()_h_tbl .word 640b - \type\()_\taps\()_h_tbl .word 320b - \type\()_\taps\()_h_tbl .word 160b - \type\()_\taps\()_h_tbl .word 80b - \type\()_\taps\()_h_tbl .word 40b - \type\()_\taps\()_h_tbl .word 20b - \type\()_\taps\()_h_tbl endjumptable function L(\type\()_\taps\()_v) cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 movrel x9, \type\()_\taps\()_v_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 mul_mla_4tap v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret 24: // 2x4 v load_h \sr2, \src, \s_strd, v6, v7 interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 mul_mla_4tap v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 
lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_h v1, v2, v3, v4, v5 interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 216: subs \h, \h, #4 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 mul_mla_4tap v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 mul_mla_4tap v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 interleave_1_s v16, v17, v18 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v16, v17 uxtl_b v18, v19, v20, v21 48: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 
shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 mul_mla_4tap v6, v1, v2, v3, v4, .8h mul_mla_4tap v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 mul_mla_4tap v1, v3, v4, v5, v6, .8h mul_mla_4tap v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 uxtl_b v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b cmp \h, #2 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl v16.8h, v1.8b uxtl v17.8h, v2.8b uxtl v18.8h, v3.8b 
uxtl v19.8h, v4.8b uxtl v20.8h, v5.8b uxtl2 v23.8h, v1.16b uxtl2 v24.8h, v2.16b uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b mul_mla_4tap v1, v16, v17, v18, v19, .8h mul_mla_4tap v16, v17, v18, v19, v20, .8h mul_mla_4tap v2, v23, v24, v25, v26, .8h mul_mla_4tap v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 uxtl v21.8h, v6.8b uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b mul_mla_4tap v1, v18, v19, v20, v21, .8h mul_mla_4tap v3, v19, v20, v21, v22, .8h mul_mla_4tap v2, v25, v26, v27, v28, .8h mul_mla_4tap v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret endfunc jumptable \type\()_\taps\()_v_tbl .word 640b - \type\()_\taps\()_v_tbl .word 320b - \type\()_\taps\()_v_tbl .word 160b - \type\()_\taps\()_v_tbl .word 80b - \type\()_\taps\()_v_tbl .word 40b - \type\()_\taps\()_v_tbl .word 20b - \type\()_\taps\()_v_tbl endjumptable function L(\type\()_\taps\()_hv) cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 movrel x9, \type\()_\taps\()_hv_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put ldur s0, [\xmx, #2] b.gt 280f ldur s1, [\xmy, #2] // 2x2, 2x4 hv sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, 
v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_\taps\()_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v28.8b, #4 .ifc \taps, 6tap smull v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] .else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] .endif sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b b 28b 0: ret x15 L(\type\()_\taps\()_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v30.8h, v30.8b ext v29.16b, v28.16b, v28.16b, #2 ext v31.16b, v30.16b, v30.16b, #2 trn1 v27.2s, v28.2s, v30.2s trn2 v30.2s, v28.2s, v30.2s trn1 v28.2s, v29.2s, v31.2s trn2 v31.2s, v29.2s, v31.2s mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla 
v27.4h, v30.4h, v0.h[2] mla v27.4h, v31.4h, v0.h[3] srshr v28.4h, v27.4h, #2 ret .endif 40: AARCH64_VALID_JUMP_TARGET ldur s0, [\xmx, #2] b.gt 480f ldur s1, [\xmy, #2] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: bl L(\type\()_\taps\()_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v28.4h, v1.h[2] smlal v3.4s, v29.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h str s2, [\dst] str s3, [\ds2] add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b mov v18.8b, v29.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 .ifc \taps, 6tap sub \sr2, \src, \s_strd sub \src, \src, \s_strd, lsl #1 .else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd .endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla 
v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] .ifc \taps, 6tap srshr v18.4h, v31.4h, #2 .else srshr v16.4h, v31.4h, #2 bl L(\type\()_\taps\()_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b .endif bl L(\type\()_\taps\()_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b bl L(\type\()_\taps\()_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: bl L(\type\()_\taps\()_filter_4) .ifc \taps, 6tap smull v2.4s, v18.4h, v1.h[1] smlal v2.4s, v19.4h, v1.h[2] smlal v2.4s, v20.4h, v1.h[3] smlal v2.4s, v21.4h, v1.h[4] smlal v2.4s, v22.4h, v1.h[5] smlal v2.4s, v28.4h, v1.h[6] smull v3.4s, v19.4h, v1.h[1] smlal v3.4s, v20.4h, v1.h[2] smlal v3.4s, v21.4h, v1.h[3] smlal v3.4s, v22.4h, v1.h[4] smlal v3.4s, v28.4h, v1.h[5] smlal v3.4s, v29.4h, v1.h[6] .else // 8tap smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v19.4h, v1.h[2] smlal v3.4s, v20.4h, v1.h[3] smlal v3.4s, v21.4h, v1.h[4] smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] .endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h str s2, [\dst] str s3, [\ds2] add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f .ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b .endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b mov v22.8b, v29.8b b 48b 0: ret x15 L(\type\()_\taps\()_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b uxtl v27.8h, v27.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla 
v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] ext v28.16b, v27.16b, v27.16b, #2 ext v29.16b, v27.16b, v27.16b, #4 ext v30.16b, v27.16b, v27.16b, #6 mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla v27.4h, v29.4h, v0.h[2] mla v27.4h, v30.4h, v0.h[3] srshr v28.4h, v31.4h, #2 srshr v29.4h, v27.4h, #2 ret 80: 160: 320: AARCH64_VALID_JUMP_TARGET b.gt 880f ld1 {v0.8b}, [\xmx] ldur s1, [\xmy, #2] .ifc \taps, 6tap sub \src, \src, #2 .else sub \src, \src, #3 .endif sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_\taps\()_filter_8_first) bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v24.4h, v1.h[2] smlal2 v5.4s, v24.8h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smlal2 v3.4s, v24.8h, v1.h[3] smlal v4.4s, v25.4h, v1.h[3] smlal2 v5.4s, v25.8h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b mov v17.16b, v24.16b mov v18.16b, v25.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, 
#8 .else add \dst, \dst, #16 .endif b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] .ifc \taps, 6tap sub \src, \src, #2 .else sub \src, \src, #3 sub \src, \src, \s_strd .endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_\taps\()_filter_8_first) .ifc \taps, 6tap mov v18.16b, v16.16b .else bl L(\type\()_\taps\()_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b .endif bl L(\type\()_\taps\()_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b bl L(\type\()_\taps\()_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: .ifc \taps, 6tap smull v2.4s, v18.4h, v1.h[1] smull2 v3.4s, v18.8h, v1.h[1] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v19.4h, v1.h[1] smull2 v5.4s, v19.8h, v1.h[1] smlal v2.4s, v19.4h, v1.h[2] smlal2 v3.4s, v19.8h, v1.h[2] smlal v4.4s, v20.4h, v1.h[2] smlal2 v5.4s, v20.8h, v1.h[2] smlal v2.4s, v20.4h, v1.h[3] smlal2 v3.4s, v20.8h, v1.h[3] smlal v4.4s, v21.4h, v1.h[3] smlal2 v5.4s, v21.8h, v1.h[3] smlal v2.4s, v21.4h, v1.h[4] smlal2 v3.4s, v21.8h, v1.h[4] smlal v4.4s, v22.4h, v1.h[4] smlal2 v5.4s, v22.8h, v1.h[4] smlal v2.4s, v22.4h, v1.h[5] smlal2 v3.4s, v22.8h, v1.h[5] smlal v4.4s, v24.4h, v1.h[5] smlal2 v5.4s, v24.8h, v1.h[5] smlal v2.4s, v24.4h, v1.h[6] smlal2 v3.4s, v24.8h, v1.h[6] smlal v4.4s, v25.4h, v1.h[6] smlal2 v5.4s, v25.8h, v1.h[6] .else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, 
v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v24.4h, v1.h[6] smlal2 v5.4s, v24.8h, v1.h[6] smlal v2.4s, v24.4h, v1.h[7] smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] .endif sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f .ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b .endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v24.16b mov v22.16b, v25.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif .ifc \taps, 6tap add \src, \src, \s_strd, lsl #1 .endif b 168b 0: ret x15 L(\type\()_\taps\()_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b .ifc \taps, 6tap mul v16.8h, v28.8h, v0.h[1] ext v25.16b, v28.16b, v29.16b, #(2*1) ext v26.16b, v28.16b, v29.16b, #(2*2) ext v27.16b, v28.16b, v29.16b, #(2*3) mla v16.8h, v25.8h, v0.h[2] mla v16.8h, v26.8h, v0.h[3] mla v16.8h, v27.8h, v0.h[4] ext v24.16b, v28.16b, v29.16b, #(2*4) ext v25.16b, v28.16b, v29.16b, #(2*5) mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] .else // 8tap mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) 
ext v26.16b, v28.16b, v29.16b, #(2*3) ext v27.16b, v28.16b, v29.16b, #(2*4) mla v16.8h, v24.8h, v0.h[1] mla v16.8h, v25.8h, v0.h[2] mla v16.8h, v26.8h, v0.h[3] mla v16.8h, v27.8h, v0.h[4] ext v24.16b, v28.16b, v29.16b, #(2*5) ext v25.16b, v28.16b, v29.16b, #(2*6) ext v26.16b, v28.16b, v29.16b, #(2*7) mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] .endif srshr v16.8h, v16.8h, #2 ret L(\type\()_\taps\()_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b .ifc \taps, 6tap mul v24.8h, v28.8h, v0.h[1] mul v25.8h, v30.8h, v0.h[1] .irpc i, 23456 ext v26.16b, v28.16b, v29.16b, #(2*\i-2) ext v27.16b, v30.16b, v31.16b, #(2*\i-2) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr .else // 8tap mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v28.16b, v29.16b, #(2*\i) ext v27.16b, v30.16b, v31.16b, #(2*\i) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr .endif srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret endfunc jumptable \type\()_\taps\()_hv_tbl .word 640b - \type\()_\taps\()_hv_tbl .word 320b - \type\()_\taps\()_hv_tbl .word 160b - \type\()_\taps\()_hv_tbl .word 80b - \type\()_\taps\()_hv_tbl .word 40b - \type\()_\taps\()_hv_tbl .word 20b - \type\()_\taps\()_hv_tbl endjumptable .endm .macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my mov w9, #16 sub w8, w9, \mx sub w9, w9, \my dup v0.16b, w8 dup v2.16b, w9 .ifc \type, prep lsl \d_strd, \d_strd, #1 .endif clz w8, \w sub w8, w8, #25 cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) movrel x9, \type\()_bilin_h_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc 
\type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: ld1r {v4.4s}, [\src], \s_strd ld1r {v6.4s}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.4h, v4.4h, v6.4h trn1 v5.4h, v5.4h, v7.4h subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.2s, v4.2s, v6.2s trn1 v5.2s, v5.2s, v7.2s subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: ld1 {v4.16b}, [\src], \s_strd ld1 {v6.16b}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #1 ext v7.16b, v6.16b, v6.16b, #1 subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umull v6.8h, v6.8b, v0.8b umlal v4.8h, v5.8b, v1.8b umlal v6.8h, v7.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v6.8b, v6.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v6.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v6.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: // 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 lsl \d_strd, \d_strd, #1 .ifc \type, put sub \d_strd, \d_strd, \w, uxtw .else sub \d_strd, \d_strd, \w, uxtw #1 .endif 161: ld1 {v16.d}[1], [\src], #8 ld1 {v20.d}[1], [\sr2], #8 mov \mx, \w 16: ld1 {v18.16b}, [\src], #16 ld1 {v22.16b}, [\sr2], #16 ext v17.16b, v16.16b, v18.16b, #8 ext v19.16b, v16.16b, v18.16b, #9 ext v21.16b, v20.16b, v22.16b, #8 ext v23.16b, v20.16b, v22.16b, #9 umull v16.8h, v17.8b, v0.8b umull2 v17.8h, v17.16b, v0.16b umull v20.8h, v21.8b, v0.8b umull2 v21.8h, v21.16b, v0.16b umlal v16.8h, v19.8b, v1.8b umlal2 v17.8h, v19.16b, v1.16b umlal v20.8h, v23.8b, v1.8b umlal2 v21.8h, v23.16b, v1.16b subs \mx, \mx, #16 .ifc \type, put uqrshrn v16.8b, v16.8h, #4 uqrshrn2 v16.16b, v17.8h, #4 uqrshrn v20.8b, v20.8h, #4 uqrshrn2 v20.16b, v21.8h, #4 st1 {v16.16b}, [\dst], #16 st1 {v20.16b}, [\ds2], #16 .else st1 {v16.8h, v17.8h}, [\dst], #32 st1 {v20.8h, v21.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret endfunc jumptable \type\()_bilin_h_tbl .word 640b - \type\()_bilin_h_tbl .word 320b - \type\()_bilin_h_tbl .word 160b - \type\()_bilin_h_tbl .word 80b - \type\()_bilin_h_tbl .word 40b - \type\()_bilin_h_tbl .word 20b - \type\()_bilin_h_tbl endjumptable function L(\type\()_bilin_v) cmp \h, #4 movrel x9, \type\()_bilin_v_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v ld1r {v16.8h}, [\src], \s_strd b.gt 24f 22: ld1r {v17.8h}, [\sr2], \s_strd ld1r {v18.8h}, [\src], \s_strd trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, 
v3.8b uqrshrn v4.8b, v4.8h, #4 str h4, [\dst] st1 {v4.h}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... v ld1r {v17.8h}, [\sr2], \s_strd ld1r {v18.8h}, [\src], \s_strd ld1r {v19.8h}, [\sr2], \s_strd ld1r {v20.8h}, [\src], \s_strd sub \h, \h, #4 trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h trn1 v18.4h, v18.4h, v19.4h trn1 v19.4h, v19.4h, v20.4h trn1 v16.2s, v16.2s, v18.2s trn1 v17.2s, v17.2s, v19.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b cmp \h, #2 uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd st1 {v4.h}[2], [\dst], \d_strd st1 {v4.h}[3], [\ds2], \d_strd b.lt 0f mov v16.8b, v20.8b b.eq 22b b 24b 0: ret .endif 40: // 4xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1r {v16.4s}, [\src], \s_strd 4: ld1r {v17.4s}, [\sr2], \s_strd ld1r {v18.4s}, [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 4b 0: ret 80: // 8xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8b}, [\src], \s_strd 8: ld1 {v17.8b}, [\sr2], \s_strd ld1 {v18.8b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull v5.8h, v17.8b, v2.8b umlal v4.8h, v17.8b, v3.8b umlal v5.8h, v18.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v5.8b, v5.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 8b 0: ret 160: // 16xN, 32xN, ... 
320: 640: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.16b}, [\src], \s_strd 2: ld1 {v17.16b}, [\sr2], \s_strd ld1 {v18.16b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull2 v5.8h, v16.16b, v2.16b umull v6.8h, v17.8b, v2.8b umull2 v7.8h, v17.16b, v2.16b umlal v4.8h, v17.8b, v3.8b umlal2 v5.8h, v17.16b, v3.16b umlal v6.8h, v18.8b, v3.8b umlal2 v7.8h, v18.16b, v3.16b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn2 v4.16b, v5.8h, #4 uqrshrn v6.8b, v6.8h, #4 uqrshrn2 v6.16b, v7.8h, #4 st1 {v4.16b}, [\dst], \d_strd st1 {v6.16b}, [\ds2], \d_strd .else st1 {v4.8h, v5.8h}, [\dst], \d_strd st1 {v6.8h, v7.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #16 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 .ifc \type, put add \dst, \dst, #16 .else add \dst, \dst, #32 .endif b 1b 0: ret endfunc jumptable \type\()_bilin_v_tbl .word 640b - \type\()_bilin_v_tbl .word 320b - \type\()_bilin_v_tbl .word 160b - \type\()_bilin_v_tbl .word 80b - \type\()_bilin_v_tbl .word 40b - \type\()_bilin_v_tbl .word 20b - \type\()_bilin_v_tbl endjumptable function L(\type\()_bilin_hv) uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b movrel x9, \type\()_bilin_hv_tbl ldrsw x8, [x9, x8, lsl #2] add x9, x9, x8 br x9 20: // 2xN hv AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1r {v28.4s}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 2: ld1r {v28.4s}, [\sr2], \s_strd ld1r {v30.4s}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.4h, v28.4h, v30.4h trn1 v29.4h, v29.4h, v31.4h umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2s, 
v16.2s, v17.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h uqrshrn v4.8b, v4.8h, #8 subs \h, \h, #2 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.le 0f trn2 v16.2s, v17.2s, v17.2s b 2b 0: ret .endif 40: // 4xN hv AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 4: ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.2s, v28.2s, v30.2s trn1 v29.2s, v29.2s, v31.2s umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2d, v16.2d, v17.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f trn2 v16.2d, v17.2d, v17.2d b 4b 0: ret 80: // 8xN, 16xN, ... 
hv 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 2: ld1 {v28.16b}, [\sr2], \s_strd ld1 {v30.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 ext v31.16b, v30.16b, v30.16b, #1 umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b umull v18.8h, v30.8b, v0.8b umlal v18.8h, v31.8b, v1.8b mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v18.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 uqrshrn v5.8b, v5.8h, #8 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 1b 0: ret endfunc jumptable \type\()_bilin_hv_tbl .word 640b - \type\()_bilin_hv_tbl .word 320b - \type\()_bilin_hv_tbl .word 160b - \type\()_bilin_hv_tbl .word 80b - \type\()_bilin_hv_tbl .word 40b - \type\()_bilin_hv_tbl .word 20b - \type\()_bilin_hv_tbl endjumptable .endm make_8tap_fn put, sharp, SHARP, 8tap filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap make_8tap_fn put, regular, REGULAR, 6tap make_8tap_fn put, smooth, SMOOTH, 6tap filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap filter_bilin_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 make_8tap_fn prep, sharp, SHARP, 8tap filter_fn prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 6, 8tap make_8tap_fn prep, regular, REGULAR, 6tap make_8tap_fn prep, smooth, SMOOTH, 6tap filter_fn 
prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 6, 6tap filter_bilin_fn prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 6 .macro load_filter_row dst, src, inc asr w13, \src, #10 add \src, \src, \inc ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon add w12, w5, #512 ld1 {v16.8b, v17.8b}, [x2], x3 load_filter_row d0, w12, w7 load_filter_row d1, w12, w7 load_filter_row d2, w12, w7 load_filter_row d3, w12, w7 load_filter_row d4, w12, w7 load_filter_row d5, w12, w7 load_filter_row d6, w12, w7 // subtract by 128 to allow using smull eor v16.8b, v16.8b, v22.8b eor v17.8b, v17.8b, v22.8b load_filter_row d7, w12, w7 ext v18.8b, v16.8b, v17.8b, #1 ext v19.8b, v16.8b, v17.8b, #2 smull v0.8h, v0.8b, v16.8b smull v1.8h, v1.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #3 ext v20.8b, v16.8b, v17.8b, #4 smull v2.8h, v2.8b, v19.8b smull v3.8h, v3.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #5 ext v19.8b, v16.8b, v17.8b, #6 smull v4.8h, v4.8b, v20.8b smull v5.8h, v5.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #7 smull v6.8h, v6.8b, v19.8b smull v7.8h, v7.8b, v18.8b addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add w5, w5, w8 ret endfunc // void dav2d_warp_affine_8x8_8bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my) .macro warp t, shift function warp_affine_8x8\t\()_8bpc_neon, export=1 ldr x4, [x4] sbfx x7, x4, #0, #16 sbfx x8, x4, #16, #16 sbfx x9, x4, #32, #16 sbfx x4, x4, #48, #16 mov w10, #8 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x2, x2, #3 movrel x11, X(mc_warp_filter), 3*64*8 mov x15, x30 .ifnb \t lsl x1, x1, #1 .endif movi v22.8b, #128 .ifb \t movi v23.8h, #128 .else movi v23.8h, #8, lsl #8 .endif bl warp_filter_horz_neon srshr v24.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v25.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v26.8h, 
v0.8h, #3 bl warp_filter_horz_neon srshr v27.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v28.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v29.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v30.8h, v0.8h, #3 1: add w14, w6, #512 bl warp_filter_horz_neon srshr v31.8h, v0.8h, #3 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 load_filter_row d2, w14, w9 load_filter_row d3, w14, w9 load_filter_row d4, w14, w9 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. smull v16.4s, v24.4h, v0.4h smlal v16.4s, v25.4h, v1.4h smlal v16.4s, v26.4h, v2.4h smlal v16.4s, v27.4h, v3.4h smlal v16.4s, v28.4h, v4.4h smlal v16.4s, v29.4h, v5.4h smlal v16.4s, v30.4h, v6.4h smlal v16.4s, v31.4h, v7.4h smull2 v17.4s, v24.8h, v0.8h smlal2 v17.4s, v25.8h, v1.8h smlal2 v17.4s, v26.8h, v2.8h smlal2 v17.4s, v27.8h, v3.8h smlal2 v17.4s, v28.8h, v4.8h smlal2 v17.4s, v29.8h, v5.8h smlal2 v17.4s, v30.8h, v6.8h smlal2 v17.4s, v31.8h, v7.8h mov v24.16b, v25.16b mov v25.16b, v26.16b sqrshrn v16.4h, v16.4s, #\shift mov v26.16b, v27.16b sqrshrn2 v16.8h, v17.4s, #\shift mov v27.16b, v28.16b mov v28.16b, v29.16b add v16.8h, v16.8h, v23.8h .ifb \t sqxtun v16.8b, v16.8h .endif mov v29.16b, v30.16b mov v30.16b, v31.16b subs w10, w10, #1 .ifnb \t st1 {v16.8h}, [x0], x1 .else st1 {v16.8b}, [x0], x1 .endif add w6, w6, w4 b.gt 1b ret x15 endfunc .endm warp , 11 warp t, 7 // void dav2d_emu_edge_8bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_8bpc_neon, export=1 ldp x8, x9, [sp] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub x12, x3, #1 // ih - 1 cmp x5, x3 sub x13, x2, #1 // iw - 1 csel x12, x12, x5, ge 
// min(y, ih - 1) cmp x4, x2 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) csel x13, x13, x4, ge // min(x, iw - 1) bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) madd x8, x12, x9, x8 // ref += iclip() * stride add x8, x8, x13 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add x10, x5, x1 // y + bh neg x5, x5 // -y sub x10, x10, x3 // y + bh - ih sub x12, x1, #1 // bh - 1 cmp x10, x1 bic x5, x5, x5, asr #63 // max(-y, 0) csel x10, x10, x12, lt // min(y + bh - ih, bh-1) cmp x5, x1 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) csel x5, x5, x12, lt // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add x11, x4, x0 // x + bw neg x4, x4 // -x sub x11, x11, x2 // x + bw - iw sub x13, x0, #1 // bw - 1 cmp x11, x0 bic x4, x4, x4, asr #63 // max(-x, 0) csel x11, x11, x13, lt // min(x + bw - iw, bw-1) cmp x4, x0 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub x1, x1, x5 // bh - top_ext madd x6, x5, x7, x6 sub x2, x0, x4 // bw - left_ext sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext sub x2, x2, x11 // center_w = bw - left_ext - right_ext mov x14, x6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left ld1r {v0.16b}, [x8] mov x12, x6 // out = dst mov x3, x4 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif mov x13, x8 add x12, x6, x4 // out = dst + left_ext mov x3, x2 1: ld1 {v0.16b, v1.16b}, [x13], #32 subs x3, x3, #32 st1 {v0.16b, v1.16b}, [x12], #32 b.gt 1b .if \need_right add x3, x8, x2 // in + center_w sub x3, x3, #1 // in + center_w - 1 add x12, x6, x4 // dst + left_ext ld1r {v0.16b}, [x3] add x12, x12, x2 // out = dst + left_ext + center_w mov x3, x11 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif subs x1, 
x1, #1 // center_h-- add x6, x6, x7 add x8, x8, x9 b.gt 0b .endm cbz x4, 2f // need_left cbz x11, 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cbz x11, 4f // !need_left + need_right v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cbz x10, 3f // need_bottom sub x8, x6, x7 // ref = dst - stride mov x4, x0 1: ld1 {v0.16b, v1.16b}, [x8], #32 mov x3, x10 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x10, x6 // dst -= bottom_ext * stride subs x4, x4, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: cbz x5, 3f // need_top msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 1: ld1 {v0.16b, v1.16b}, [x14], #32 mov x3, x5 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x5, x6 // dst -= top_ext * stride subs x0, x0, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: ret endfunc .macro SAD8xN_8bpc ldr q0, [x12] // overread by 4 ldr q1, [x13] ldr q5, [x12, x1] // overread by 4 ldr q6, [x13, x3] add x16, x12, x1, lsl #1 // PXSTRIDE(p0_stride) * 4 add x17, x13, x3, lsl #1 // PXSTRIDE(p1_stride) * 4 uabd v0.16b, v0.16b, v1.16b uabd v5.16b, v5.16b, v6.16b sub w9, w5, #4 uaddlp v2.8h, v0.16b uaddlp v4.8h, v5.16b 7: ldr q0, [x16] // overread by 4 ldr q1, [x17] ldr q5, [x16, x1] // overread by 4 ldr q6, [x17, x3] add x16, x16, x1, lsl #1 // PXSTRIDE(p0_stride) * 2 add x17, x17, x3, lsl #1 // PXSTRIDE(p1_stride) * 2 uabd v0.16b, v0.16b, v1.16b uabd v5.16b, v5.16b, v6.16b sub w9, w9, #4 uadalp v2.8h, v0.16b uadalp v4.8h, v5.16b cbnz w9, 7b add v2.8h, v2.8h, v4.8h and v2.16b, v2.16b, v3.16b // mask the last 4 uaddlv s5, v2.8h fmov w15, s5 .endm .macro SAD16xN_8bpc add x14, x12, #16 add x15, x13, #16 ldr q0, [x12] ldr q1, [x13] sub w9, w5, #4 ldr s4, [x12, #16] ldr s5, [x13, #16] add x14, x14, x1 add x15, x15, x3 ldr q16, [x12, x1] ldr q17, [x13, x3] ld1 {v4.s}[1], [x14] ld1 {v5.s}[1], [x15] uabd v0.16b, v0.16b, v1.16b uabd v16.16b, v16.16b, 
v17.16b add x16, x12, x1, lsl #1 // PXSTRIDE(p0_stride) * 4 add x17, x13, x3, lsl #1 // PXSTRIDE(p1_stride) * 4 uaddlp v2.8h, v0.16b uaddlp v3.8h, v16.16b uabdl v6.8h, v4.8b, v5.8b 7: ldr q0, [x16] ldr q1, [x17] ldr s4, [x16, #16] ldr s5, [x17, #16] add x14, x14, x1, lsl #1 add x15, x15, x3, lsl #1 ldr q16, [x16, x1] ldr q17, [x17, x3] ld1 {v4.s}[1], [x14] ld1 {v5.s}[1], [x15] uabd v0.16b, v0.16b, v1.16b uabd v16.16b, v16.16b, v17.16b sub w9, w9, #4 add x16, x16, x1, lsl #1 // PXSTRIDE(p0_stride) * 2 add x17, x17, x3, lsl #1 // PXSTRIDE(p1_stride) * 2 uadalp v2.8h, v0.16b uadalp v3.8h, v16.16b uabal v6.8h, v4.8b, v5.8b cbnz w9, 7b add v2.8h, v2.8h, v3.8h add v2.8h, v2.8h, v6.8h uaddlv s2, v2.8h fmov w15, s2 .endm .macro sad_refine_mv_8bpc_w width add w4, w4, #4 // w + 4 add w5, w5, #4 // h + 4 mul w8, w4, w5 lsl w8, w8, #1 // sad_thr mov x10, #0 // best_dx mov x11, #0 // best_dy mov x15, #0xffffffff // best_sad .if \width==8 movi v3.16b, #0xff ins v3.s[3], wzr .endif cbz w6, 2f // is_implicit // is_implicit = 1 add x12, x0, x1 add x13, x2, x3 add x12, x12, #2 // 2 * p0_stride + 2 add x13, x13, #2 // 2 * p1_stride + 2 SAD\width\()xN_8bpc sub x15, x15, x15, lsr #3 // (best_sad * 7 + 7) >> 3 cmp x15, x8 // if best_sad < sad_thr b.lt 6f 2: // is_implicit = 0 add x2, x2, x3, lsl #1 // p1_start = p1 + 4 * p1_stride mov x6, x15 // best_sad add x2, x2, #4 // p1_start = p1 + 4 * p1_stride + 4 mov x4, #-2 // y_off 3: mov x12, x0 // p0_ptr = p0_start mov x13, x2 // p1_ptr = p1_start mov x8, #-2 // x_off 4: orr x9, x4, x8 cbz x9, 5f // skip if both are 0 SAD\width\()xN_8bpc cmp x6, x15 // if (sad >= best_sad) b.ls 5f mov x6, x15 // best_sad = sad mov x10, x8 // best_dx = x_off mov x11, x4 // best_dy = y_off 5: add x8, x8, #1 // x_off++ add x12, x12, #1 // p0++ sub x13, x13, #1 // p1-- cmp x8, #2 b.le 4b add x4, x4, #1 // y_off++ add x0, x0, x1, lsr #1 // p0 += p0_stride sub x2, x2, x3, lsr #1 // p1 -= p1_stride cmp x4, #2 b.le 3b 6: strb w11, [x7] // o->y strb w10, [x7, #1] // 
o->x ret .endm // void dav2d_sad_refine_mv_8bpc_neon(const pixel *const p0, const ptrdiff_t p0_stride, // const pixel *const p1, const ptrdiff_t p1_stride, // const int w, const int h, const int is_implicit, // struct OpflOffset *const o HIGHBD_DECL_SUFFIX) function sad_refine_mv_8bpc_neon, export=1 cmp w4, #8 lsl x1, x1, #1 lsl x3, x3, #1 b.eq 1f sad_refine_mv_8bpc_w 16 1: sad_refine_mv_8bpc_w 8 endfunc // unsigned dav2d_sad8x8_8bpc_neon(const pixel *p0, const ptrdiff_t p0_stride, // const pixel *p1, const ptrdiff_t p1_stride // HIGHBD_DECL_SUFFIX) function sad8x8_8bpc_neon, export=1 ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x2], x3 ld1 {v5.8b}, [x0], x1 ld1 {v6.8b}, [x2], x3 uabdl v2.8h, v0.8b, v1.8b uabdl v4.8h, v5.8b, v6.8b .rept 3 ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x2], x3 ld1 {v5.8b}, [x0], x1 ld1 {v6.8b}, [x2], x3 uabal v2.8h, v0.8b, v1.8b uabal v4.8h, v5.8b, v6.8b .endr add v2.8h, v2.8h, v4.8h uaddlv s5, v2.8h fmov w0, s5 ret endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/mc16.S000066400000000000000000004065621517466257200225520ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #define PREP_BIAS 8192 .macro avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 ld1 {\t2\().8h,\t3\().8h}, [x3], #32 sqadd \t0\().8h, \t0\().8h, \t2\().8h sqadd \t1\().8h, \t1\().8h, \t3\().8h smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) .endm .macro w_avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 ld1 {\t2\().8h,\t3\().8h}, [x3], #32 // This difference requires a 17 bit range, and all bits are // significant for the following multiplication. 
ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v27.4s mul \t0\().4s, \t0\().4s, v27.4s mul \d1\().4s, \d1\().4s, v27.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #4 sshr \t0\().4s, \t0\().4s, #4 sshr \d1\().4s, \d1\().4s, #4 sshr \t1\().4s, \t1\().4s, #4 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h sqxtn \d0\().4h, \d0\().4s sqxtn2 \d0\().8h, \t0\().4s sqxtn \d1\().4h, \d1\().4s sqxtn2 \d1\().8h, \t1\().4s srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max smax \d0\().8h, \d0\().8h, v30.8h // 0 smax \d1\().8h, \d1\().8h, v30.8h // 0 .endm .macro mask d0, d1, t0, t1, t2, t3 ld1 {v27.16b}, [x6], #16 ld1 {\t0\().8h,\t1\().8h}, [x2], #32 neg v27.16b, v27.16b ld1 {\t2\().8h,\t3\().8h}, [x3], #32 sxtl v26.8h, v27.8b sxtl2 v27.8h, v27.16b sxtl v24.4s, v26.4h sxtl2 v25.4s, v26.8h sxtl v26.4s, v27.4h sxtl2 v27.4s, v27.8h ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v24.4s mul \t0\().4s, \t0\().4s, v25.4s mul \d1\().4s, \d1\().4s, v26.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #6 sshr \t0\().4s, \t0\().4s, #6 sshr \d1\().4s, \d1\().4s, #6 sshr \t1\().4s, \t1\().4s, #6 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 
// NOTE(review): the statements up to the first .endm are the tail of a macro
// whose header lies before this point; they are reproduced unchanged.
        uzp1      \d1\().8h, \d1\().8h, \t1\().8h // Ditto
        srshl     \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
        srshl     \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
        add       \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        add       \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
        smin      \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
        smin      \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
        smax      \d0\().8h, \d0\().8h, v30.8h // 0
        smax      \d1\().8h, \d1\().8h, v30.8h // 0
.endm

// bidir_fn type, bdmax
//
// Emits the exported function \type\()_16bpc_neon and its jump table
// \type\()_tbl.
//   type  - name of a sibling macro (instantiated below for avg, w_avg and
//           mask) that is invoked as "\type d0, d1, v0, v1, v2, v3" and whose
//           two .8h result registers d0/d1 (16 pixels) are stored here.
//   bdmax - the 32-bit register that holds bitdepth_max on entry.
//
// Registers as used by the code in this macro:
//   x0 = destination, x1 = destination stride in bytes,
//   w4 = width (only clz(w) is used, to select the loop),
//   w5 = number of rows left, w6 = splatted negated into v27 for w_avg.
// Vector constants set up before dispatch:
//   v31 = bitdepth_max and v30 = 0 (all types except avg),
//   v28/v29 = bias and shift amount as described by the inline comments.
// The first \type invocation is issued before the indirect branch, so every
// loop below stores first and only then computes the next batch.
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz       w4, w4
.ifnc \type, avg
        dup       v31.8h, \bdmax // bitdepth_max
        movi      v30.8h, #0
.endif
        // \bdmax is read here before w7 is overwritten, so \bdmax may be w7.
        clz       w7, \bdmax
        sub       w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov       w9, #1
        mov       w8, #-2*PREP_BIAS
        lsl       w9, w9, w7 // 1 << intermediate_bits
        add       w7, w7, #1
        sub       w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
        neg       w7, w7 // -(intermediate_bits+1)
        dup       v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
        dup       v29.8h, w7 // -(intermediate_bits+1)
.else
        mov       w8, #PREP_BIAS
        lsr       w8, w8, w7 // PREP_BIAS >> intermediate_bits
        neg       w7, w7 // -intermediate_bits
        dup       v28.8h, w8 // PREP_BIAS >> intermediate_bits
        dup       v29.8h, w7 // -intermediate_bits
.endif
.ifc \type, w_avg
        dup       v27.4s, w6
        neg       v27.4s, v27.4s
.endif
        movrel    x7, \type\()_tbl
        // Table index = clz(w) - 25: 0 for w == 64 down to 4 for w == 4.
        sub       w4, w4, #25
        \type     v4, v5, v0, v1, v2, v3
        ldrsw     x4, [x7, x4, lsl #2]
        add       x7, x7, x4
        br        x7
40:
        AARCH64_VALID_JUMP_TARGET
        // 4 pixels wide: v4/v5 hold four rows; x0 and x7 walk alternate rows.
        add       x7, x0, x1
        lsl       x1, x1, #1
4:
        subs      w5, w5, #4
        st1       {v4.8b}, [x0], x1
        st1       {v4.d}[1], [x7], x1
        st1       {v5.8b}, [x0], x1
        st1       {v5.d}[1], [x7], x1
        b.le      0f
        \type     v4, v5, v0, v1, v2, v3
        b         4b
80:
        AARCH64_VALID_JUMP_TARGET
        // 8 pixels wide: v4/v5 hold two rows.
        add       x7, x0, x1
        lsl       x1, x1, #1
8:
        st1       {v4.8h}, [x0], x1
        subs      w5, w5, #2
        st1       {v5.8h}, [x7], x1
        b.le      0f
        \type     v4, v5, v0, v1, v2, v3
        b         8b
160:
        AARCH64_VALID_JUMP_TARGET
        // 16 pixels wide: two \type results per iteration, i.e. two rows.
16:
        \type     v6, v7, v0, v1, v2, v3
        st1       {v4.8h, v5.8h}, [x0], x1
        subs      w5, w5, #2
        st1       {v6.8h, v7.8h}, [x0], x1
        b.le      0f
        \type     v4, v5, v0, v1, v2, v3
        b         16b
320:
        AARCH64_VALID_JUMP_TARGET
        // 32 pixels wide: one row per iteration.
32:
        \type     v6, v7, v0, v1, v2, v3
        subs      w5, w5, #1
        st1       {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.le      0f
        \type     v4, v5, v0, v1, v2, v3
        b         32b
640:
        AARCH64_VALID_JUMP_TARGET
        // 64 pixels wide: one row per iteration, x7 addresses the second
        // 64-byte half of the row.
        add       x7, x0, #64
64:
        \type     v6, v7, v0, v1, v2, v3
        \type     v16, v17, v0, v1, v2, v3
        st1       {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        \type     v18, v19, v0, v1, v2, v3
        subs      w5, w5, #1
        st1       {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le      0f
        \type     v4, v5, v0, v1, v2, v3
        b         64b
0:
        ret
endfunc

// Offsets of the per-width entry points relative to the table, widest first.
jumptable \type\()_tbl
        .word     640b - \type\()_tbl
        .word     320b - \type\()_tbl
        .word     160b - \type\()_tbl
        .word     80b - \type\()_tbl
        .word     40b - \type\()_tbl
endjumptable
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7

// NOTE(review): w_mask_fn continues past the end of this span and is
// reproduced unchanged. Text appears to be missing after the
// "mov w11, #27615" line: the code that follows uses v0, v1, v3, v29, v30 and
// x12 and branches to labels 4/40 that nothing visible defines. Restore the
// lost lines from the upstream file before assembling.
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        // Two arguments come from the stack; the second one (w9) is
        // bitdepth_max. They are read as 32-bit slots under __APPLE__ and as
        // 64-bit slots otherwise.
#ifdef __APPLE__
        ldp       w8, w9, [sp]
#else
        ldp       x8, x9, [sp]
#endif
        clz       w11, w4
        movrel    x10, w_mask_\type\()_tbl
        dup       v31.8h, w9 // bitdepth_max
        sub       w11, w11, #25
        clz       w9, w9 // clz(bitdepth_max)
        ldrsw     x11, [x10, x11, lsl #2]
        add       x10, x10, x11
        sub       w9, w9, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov       w12, #PREP_BIAS*64
        neg       w9, w9 // -sh
        mov       w11, #27615 // (64 + 1 - 38)<> mask_sh
        ushr      v21.8h, v21.8h, #10
        add       v4.4s, v4.4s, v30.4s // += PREP_BIAS*64
        add       v5.4s, v5.4s, v30.4s
        add       v6.4s, v6.4s, v30.4s
        add       v7.4s, v7.4s, v30.4s
        uxtl      v22.4s, v20.4h
        uxtl2     v23.4s, v20.8h
        uxtl      v24.4s, v21.4h
        uxtl2     v25.4s, v21.8h
        mla       v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m)
        mla       v5.4s, v17.4s, v23.4s
        mla       v6.4s, v18.4s, v24.4s
        mla       v7.4s, v19.4s, v25.4s
        srshl     v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl     v5.4s, v5.4s, v29.4s
        srshl     v6.4s, v6.4s, v29.4s
        srshl     v7.4s, v7.4s, v29.4s
        sqxtun    v4.4h, v4.4s // iclip_pixel
        sqxtun2   v4.8h, v5.4s
        sqxtun    v5.4h, v6.4s
        sqxtun2   v5.8h, v7.4s
        umin      v4.8h, v4.8h, v31.8h // iclip_pixel
        umin      v5.8h, v5.8h, v31.8h
.if \type == 444
        uzp1      v20.16b, v20.16b, v21.16b // 64 - m
        sub       v20.16b, v1.16b, v20.16b // m
        st1       {v20.16b}, [x6], #16
.elseif \type == 422
        addp      v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition)
xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v20.2d, v21.2d trn2 v25.2d, v20.2d, v21.2d add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 str s20, [x6], #4 .endif st1 {v4.8b}, [x0], x1 st1 {v4.d}[1], [x12], x1 st1 {v5.8b}, [x0], x1 st1 {v5.d}[1], [x12], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 subs w5, w5, #2 sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v7.8h ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v6.8h, v4.8h ssubl v18.4s, v7.4h, v5.4h ssubl2 v19.4s, v7.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 sshll v6.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s uxtl v22.4s, v20.4h uxtl2 v23.4s, v20.8h uxtl v24.4s, v21.4h uxtl2 v25.4s, v21.8h mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) mla v5.4s, v17.4s, v23.4s mla v6.4s, v18.4s, v24.4s mla v7.4s, v19.4s, v25.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v6.4s sqxtun2 v5.8h, v7.4s umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 
{v20.16b}, [x6], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 str s20, [x6], #4 .endif st1 {v4.8h}, [x0], x1 st1 {v5.8h}, [x12], x1 b.gt 8b ret 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw #1 .if \type == 444 add x10, x6, x7 lsl x7, x7, #1 sub x7, x7, w4, uxtw .elseif \type == 422 uxtw x4, w4 add x10, x6, x7 lsl x7, x7, #1 sub x7, x7, x4, lsr #1 .elseif \type == 420 uxtw x4, w4 sub x7, x7, x4, lsr #1 .endif add x9, x3, w4, uxtw #1 add x13, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 ld1 {v6.8h, v7.8h}, [x13], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v17.8h ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v23.4s, v16.8h, v4.8h ssubl v24.4s, v17.4h, v5.4h ssubl2 v25.4s, v17.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 sshll v26.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v16.4s, v20.4h uxtl2 v17.4s, v20.8h uxtl v28.4s, v21.4h mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) uxtl2 v16.4s, v21.8h mla v5.4s, v23.4s, v17.4s mla v26.4s, v24.4s, v28.4s mla v27.4s, v25.4s, v16.4s srshl v4.4s, v4.4s, 
v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v26.4s sqxtun2 v5.8h, v27.4s // Start of other half sabd v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) sabd v23.8h, v7.8h, v19.8h umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v18.8h, v6.8h ssubl v18.4s, v19.4h, v7.4h ssubl2 v19.4s, v19.8h, v7.8h uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() uqsub v23.8h, v0.8h, v23.8h sshll v24.4s, v6.4h, #6 // tmp1 << 6 sshll2 v25.4s, v6.8h, #6 sshll v26.4s, v7.4h, #6 sshll2 v27.4s, v7.8h, #6 ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v23.8h, v23.8h, #10 add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 add v25.4s, v25.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v6.4s, v22.4h uxtl2 v7.4s, v22.8h uxtl v28.4s, v23.4h mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) uxtl2 v6.4s, v23.8h mla v25.4s, v17.4s, v7.4s mla v26.4s, v18.4s, v28.4s mla v27.4s, v19.4s, v6.4s srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v25.4s, v25.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v6.4h, v24.4s // iclip_pixel sqxtun2 v6.8h, v25.4s sqxtun v7.4h, v26.4s sqxtun2 v7.8h, v27.4s umin v6.8h, v6.8h, v31.8h // iclip_pixel umin v7.8h, v7.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m uzp1 v21.16b, v22.16b, v23.16b sub v20.16b, v1.16b, v20.16b // m sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 uhsub v21.8b, v3.8b, v21.8b 
// NOTE(review): the statements up to the "w_mask_fn 420" instantiation are the
// tail of the w_mask_fn macro, whose earlier part lies before this point;
// they are reproduced unchanged.
        st1       {v20.8b}, [x6], #8
        st1       {v21.8b}, [x10], #8
.elseif \type == 420
        add       v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition)
        add       v21.8h, v21.8h, v23.8h
        addp      v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition)
        sub       v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n))
        rshrn     v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1       {v20.8b}, [x6], #8
.endif
        st1       {v4.8h, v5.8h}, [x0], #32
        st1       {v6.8h, v7.8h}, [x12], #32
        b.gt      16b
        // End of a row pair: step every pointer over the row that the
        // sibling pointer has just processed.
        subs      w5, w5, #2
        add       x2, x2, w4, uxtw #1
        add       x3, x3, w4, uxtw #1
        add       x13, x13, w4, uxtw #1
        add       x9, x9, w4, uxtw #1
        add       x6, x6, x7
.if \type == 444
        add       x10, x10, x7
.elseif \type == 422
        add       x10, x10, x7
.endif
        add       x0, x0, x1
        add       x12, x12, x1
        b.gt      161b
        ret
endfunc

jumptable w_mask_\type\()_tbl
        .word     640b - w_mask_\type\()_tbl
        .word     320b - w_mask_\type\()_tbl
        .word     160b - w_mask_\type\()_tbl
        .word     80b - w_mask_\type\()_tbl
        .word     40b - w_mask_\type\()_tbl
endjumptable
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420

// blend_16bpc_neon
//
// Blends a second source into the destination in place, per pixel:
//   dst = a + (((a - b) * -m + 32) >> 6)
// with a = current dst pixel, b = pixel from [x2], m = mask byte from [x5].
// The rounded multiply/shift is done with one sqrdmulh against (-m << 9).
//
// Registers as used by the code below:
//   x0 = dst (read and written), x1 = dst stride in bytes,
//   x2 = second source, read contiguously (post-incremented by the row size),
//   w3 = width (only clz(w) is used, to select the loop),
//   w4 = number of rows left,
//   x5 = mask, one byte per pixel, read contiguously,
//   x8 = dst + stride, the second-row pointer of the 4/8/16-wide loops.
function blend_16bpc_neon, export=1
        movrel    x6, blend_tbl
        clz       w3, w3
        // Table index = clz(w) - 25: 0 for w == 64 down to 4 for w == 4.
        sub       w3, w3, #25
        ldrsw     x3, [x6, x3, lsl #2]
        add       x6, x6, x3
        add       x8, x0, x1
        br        x6
40:
        AARCH64_VALID_JUMP_TARGET
        // 4 pixels wide, two rows per iteration packed into one register.
        lsl       x1, x1, #1
4:
        ld1       {v2.8b}, [x5], #8
        ld1       {v1.8h}, [x2], #16
        ldr       d0, [x0]
        neg       v2.8b, v2.8b // -m
        subs      w4, w4, #2
        ld1       {v0.d}[1], [x8]
        sxtl      v2.8h, v2.8b
        shl       v2.8h, v2.8h, #9 // -m << 9
        sub       v1.8h, v0.8h, v1.8h // a - b
        sqrdmulh  v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6
        add       v0.8h, v0.8h, v1.8h
        st1       {v0.8b}, [x0], x1
        st1       {v0.d}[1], [x8], x1
        b.gt      4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        // 8 pixels wide, two rows per iteration.
        lsl       x1, x1, #1
8:
        ld1       {v4.16b}, [x5], #16
        ld1       {v2.8h, v3.8h}, [x2], #32
        neg       v5.16b, v4.16b // -m
        ld1       {v0.8h}, [x0]
        ld1       {v1.8h}, [x8]
        sxtl      v4.8h, v5.8b
        sxtl2     v5.8h, v5.16b
        shl       v4.8h, v4.8h, #9 // -m << 9
        shl       v5.8h, v5.8h, #9
        sub       v2.8h, v0.8h, v2.8h // a - b
        sub       v3.8h, v1.8h, v3.8h
        subs      w4, w4, #2
        sqrdmulh  v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh  v3.8h, v3.8h, v5.8h
        add       v0.8h, v0.8h, v2.8h
        add       v1.8h, v1.8h, v3.8h
        st1       {v0.8h}, [x0], x1
        st1       {v1.8h}, [x8], x1
        b.gt      8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        // 16 pixels wide, two rows per iteration.
        lsl       x1, x1, #1
16:
        ld1       {v16.16b, v17.16b}, [x5], #32
        ld1       {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs      w4, w4, #2
        neg       v18.16b, v16.16b // -m
        neg       v19.16b, v17.16b
        ld1       {v0.8h, v1.8h}, [x0]
        sxtl      v16.8h, v18.8b
        sxtl2     v17.8h, v18.16b
        sxtl      v18.8h, v19.8b
        sxtl2     v19.8h, v19.16b
        ld1       {v2.8h, v3.8h}, [x8]
        shl       v16.8h, v16.8h, #9 // -m << 9
        shl       v17.8h, v17.8h, #9
        shl       v18.8h, v18.8h, #9
        shl       v19.8h, v19.8h, #9
        sub       v4.8h, v0.8h, v4.8h // a - b
        sub       v5.8h, v1.8h, v5.8h
        sub       v6.8h, v2.8h, v6.8h
        sub       v7.8h, v3.8h, v7.8h
        sqrdmulh  v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh  v5.8h, v5.8h, v17.8h
        sqrdmulh  v6.8h, v6.8h, v18.8h
        sqrdmulh  v7.8h, v7.8h, v19.8h
        add       v0.8h, v0.8h, v4.8h
        add       v1.8h, v1.8h, v5.8h
        add       v2.8h, v2.8h, v6.8h
        add       v3.8h, v3.8h, v7.8h
        st1       {v0.8h, v1.8h}, [x0], x1
        st1       {v2.8h, v3.8h}, [x8], x1
        b.gt      16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        // 32 pixels wide, one row per iteration.
32:
        ld1       {v16.16b, v17.16b}, [x5], #32
        ld1       {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs      w4, w4, #1
        neg       v18.16b, v16.16b // -m
        neg       v19.16b, v17.16b
        sxtl      v16.8h, v18.8b
        sxtl2     v17.8h, v18.16b
        sxtl      v18.8h, v19.8b
        sxtl2     v19.8h, v19.16b
        ld1       {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl       v16.8h, v16.8h, #9 // -m << 9
        shl       v17.8h, v17.8h, #9
        shl       v18.8h, v18.8h, #9
        shl       v19.8h, v19.8h, #9
        sub       v4.8h, v0.8h, v4.8h // a - b
        sub       v5.8h, v1.8h, v5.8h
        sub       v6.8h, v2.8h, v6.8h
        sub       v7.8h, v3.8h, v7.8h
        sqrdmulh  v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh  v5.8h, v5.8h, v17.8h
        sqrdmulh  v6.8h, v6.8h, v18.8h
        sqrdmulh  v7.8h, v7.8h, v19.8h
        add       v0.8h, v0.8h, v4.8h
        add       v1.8h, v1.8h, v5.8h
        add       v2.8h, v2.8h, v6.8h
        add       v3.8h, v3.8h, v7.8h
        st1       {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt      32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // 64 pixels wide: each row is written with two 64-byte post-increment
        // stores, so the stride is pre-reduced by the 128 bytes already
        // covered. The flags set by this subs are not consumed; the loop
        // branches below test the flags of later subs instructions.
        subs      x1, x1, #128
641:
        mov       w6, #64
64:
        ld1       {v16.16b, v17.16b}, [x5], #32
        ld1       {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs      w6, w6, #32
        neg       v18.16b, v16.16b // -m
        neg       v19.16b, v17.16b
        sxtl      v16.8h, v18.8b
        sxtl2     v17.8h, v18.16b
        sxtl      v18.8h, v19.8b
        sxtl2     v19.8h, v19.16b
// Remainder of the 64-pixel-wide loop of blend_16bpc_neon (the loop starts
// before this point): 32 pixels are blended per inner iteration, w6 counts
// the pixels left in the row and w4 the rows left.
        ld1       {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl       v16.8h, v16.8h, #9 // -m << 9
        shl       v17.8h, v17.8h, #9
        shl       v18.8h, v18.8h, #9
        shl       v19.8h, v19.8h, #9
        sub       v4.8h, v0.8h, v4.8h // a - b
        sub       v5.8h, v1.8h, v5.8h
        sub       v6.8h, v2.8h, v6.8h
        sub       v7.8h, v3.8h, v7.8h
        sqrdmulh  v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6
        sqrdmulh  v5.8h, v5.8h, v17.8h
        sqrdmulh  v6.8h, v6.8h, v18.8h
        sqrdmulh  v7.8h, v7.8h, v19.8h
        add       v0.8h, v0.8h, v4.8h
        add       v1.8h, v1.8h, v5.8h
        add       v2.8h, v2.8h, v6.8h
        add       v3.8h, v3.8h, v7.8h
        st1       {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt      64b
        subs      w4, w4, #1
        add       x0, x0, x1
        b.gt      641b
        ret
endfunc

// Offsets of the per-width entry points relative to the table, widest first.
jumptable blend_tbl
        .word     640b - blend_tbl
        .word     320b - blend_tbl
        .word     160b - blend_tbl
        .word     80b - blend_tbl
        .word     40b - blend_tbl
endjumptable

// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-25), i.e. the index into
// put_16bpc_tbl (0 selects the 64-wide copy, 5 the 2-wide copy).
//
// Plain row-by-row copy. Registers as used by the code below:
//   x0 = dst, x1 = dst stride in bytes, x2 = src, x3 = src stride in bytes,
//   w5 = number of rows left.
function put_16bpc_neon, export=1
        movrel    x10, put_16bpc_tbl
        ldrsw     x9, [x10, x9, lsl #2]
        add       x10, x10, x9
        br        x10

20:
        AARCH64_VALID_JUMP_TARGET
        // 2 pixels (4 bytes) per row, two rows per iteration.
2:
        ld1r      {v0.4s}, [x2], x3
        ld1r      {v1.4s}, [x2], x3
        subs      w5, w5, #2
        st1       {v0.s}[0], [x0], x1
        st1       {v1.s}[0], [x0], x1
        b.gt      2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
        // 4 pixels (8 bytes) per row, two rows per iteration.
4:
        ld1       {v0.4h}, [x2], x3
        ld1       {v1.4h}, [x2], x3
        subs      w5, w5, #2
        st1       {v0.4h}, [x0], x1
        st1       {v1.4h}, [x0], x1
        b.gt      4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        // 8 pixels per row; x8/x9 address the odd rows, strides are doubled.
        add       x8, x0, x1
        lsl       x1, x1, #1
        add       x9, x2, x3
        lsl       x3, x3, #1
8:
        ld1       {v0.8h}, [x2], x3
        ld1       {v1.8h}, [x9], x3
        subs      w5, w5, #2
        st1       {v0.8h}, [x0], x1
        st1       {v1.8h}, [x8], x1
        b.gt      8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        // 16 pixels (32 bytes) per row, copied through general registers.
16:
        ldp       x6, x7, [x2]
        ldp       x8, x9, [x2, #16]
        stp       x6, x7, [x0]
        subs      w5, w5, #1
        stp       x8, x9, [x0, #16]
        add       x2, x2, x3
        add       x0, x0, x1
        b.gt      16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        // 32 pixels (64 bytes) per row, copied through general registers.
32:
        ldp       x6, x7, [x2]
        ldp       x8, x9, [x2, #16]
        stp       x6, x7, [x0]
        ldp       x10, x11, [x2, #32]
        stp       x8, x9, [x0, #16]
        subs      w5, w5, #1
        ldp       x12, x13, [x2, #48]
        stp       x10, x11, [x0, #32]
        stp       x12, x13, [x0, #48]
        add       x2, x2, x3
        add       x0, x0, x1
        b.gt      32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // 64 pixels (128 bytes) per row, copied through q registers.
64:
        ldp       q0, q1, [x2]
        ldp       q2, q3, [x2, #32]
        stp       q0, q1, [x0]
        ldp       q4, q5, [x2, #64]
        stp       q2, q3, [x0, #32]
        ldp       q6, q7, [x2, #96]
        subs      w5, w5, #1
        stp       q4, q5, [x0, #64]
        stp       q6, q7, [x0, #96]
        add       x2, x2, x3
        add       x0, x0, x1
        b.gt      64b
        ret
endfunc

// Offsets of the per-width entry points relative to the table, widest first.
jumptable put_16bpc_tbl
        .word     640b - put_16bpc_tbl
        .word     320b - put_16bpc_tbl
        .word     160b - put_16bpc_tbl
        .word     80b - put_16bpc_tbl
        .word     40b - put_16bpc_tbl
        .word     20b - put_16bpc_tbl
endjumptable

// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-25), i.e. the index into
// prep_16bpc_tbl (0 selects the 64-wide loop, 4 the 4-wide loop), and
// w8 to intermediate_bits.
//
// Per pixel: out = (src << intermediate_bits) - PREP_BIAS.
// Registers as used by the code below:
//   x0 = output, x1 = output stride in bytes, x2 = src,
//   x3 = src stride in bytes, w5 = number of rows left.
function prep_16bpc_neon
        movrel    x10, prep_16bpc_tbl
        ldrsw     x9, [x10, x9, lsl #2]
        dup       v31.8h, w8 // intermediate_bits
        // Builds the bias from its high byte only, which equals PREP_BIAS as
        // long as the low byte of PREP_BIAS is zero.
        movi      v30.8h, #(PREP_BIAS >> 8), lsl #8
        add       x10, x10, x9
        br        x10

40:
        AARCH64_VALID_JUMP_TARGET
        // 4 pixels wide, two rows per iteration packed into one register.
        add       x9, x2, x3
        add       x10, x0, x1
        lsl       x3, x3, #1
        lsl       x1, x1, #1
4:
        ld1       {v0.4h}, [x2], x3
        ld1       {v0.d}[1], [x9], x3
        subs      w5, w5, #2
        sshl      v0.8h, v0.8h, v31.8h
        sub       v0.8h, v0.8h, v30.8h
        st1       {v0.4h}, [x0], x1
        st1       {v0.d}[1], [x10], x1
        b.gt      4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        // 8 pixels wide, two rows per iteration.
        add       x9, x2, x3
        add       x10, x0, x1
        lsl       x3, x3, #1
        lsl       x1, x1, #1
8:
        ld1       {v0.8h}, [x2], x3
        ld1       {v1.8h}, [x9], x3
        subs      w5, w5, #2
        sshl      v0.8h, v0.8h, v31.8h
        sshl      v1.8h, v1.8h, v31.8h
        sub       v0.8h, v0.8h, v30.8h
        sub       v1.8h, v1.8h, v30.8h
        st1       {v0.8h}, [x0], x1
        st1       {v1.8h}, [x10], x1
        b.gt      8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        // 16 pixels wide, two rows per iteration; only the output stride is
        // doubled, the source pointer is advanced one row at a time.
        add       x10, x0, x1
        lsl       x1, x1, #1
16:
        ldp       q0, q1, [x2]
        add       x2, x2, x3
        sshl      v0.8h, v0.8h, v31.8h
        ldp       q2, q3, [x2]
        add       x2, x2, x3
        subs      w5, w5, #2
        sshl      v1.8h, v1.8h, v31.8h
        sshl      v2.8h, v2.8h, v31.8h
        sshl      v3.8h, v3.8h, v31.8h
        sub       v0.8h, v0.8h, v30.8h
        sub       v1.8h, v1.8h, v30.8h
        sub       v2.8h, v2.8h, v30.8h
        sub       v3.8h, v3.8h, v30.8h
        st1       {v0.8h, v1.8h}, [x0], x1
        st1       {v2.8h, v3.8h}, [x10], x1
        b.gt      16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        // 32 pixels wide, one row per iteration.
32:
        ldp       q0, q1, [x2]
        sshl      v0.8h, v0.8h, v31.8h
        ldp       q2, q3, [x2, #32]
        add       x2, x2, x3
        sshl      v1.8h, v1.8h, v31.8h
        sshl      v2.8h, v2.8h, v31.8h
        sshl      v3.8h, v3.8h, v31.8h
        subs      w5, w5, #1
        sub       v0.8h, v0.8h, v30.8h
        sub       v1.8h, v1.8h, v30.8h
        sub       v2.8h, v2.8h, v30.8h
        sub       v3.8h, v3.8h, v30.8h
        st1       {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt      32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        // 64 pixels wide, one row per iteration.
64:
        ldp       q0, q1, [x2]
        subs      w5, w5, #1
        sshl      v0.8h, v0.8h, v31.8h
        ldp       q2, q3, [x2, #32]
        sshl      v1.8h, v1.8h, v31.8h
        ldp       q4, q5, [x2, #64]
        sshl      v2.8h, v2.8h, v31.8h
        sshl      v3.8h, v3.8h, v31.8h
        ldp       q6, q7, [x2, #96]
        add       x2, x2, x3
        sshl      v4.8h, v4.8h, v31.8h
        sshl      v5.8h, v5.8h, v31.8h
        sshl      v6.8h, v6.8h, v31.8h
        sshl      v7.8h, v7.8h, v31.8h
        sub       v0.8h, v0.8h, v30.8h
        sub       v1.8h, v1.8h, v30.8h
        sub       v2.8h, v2.8h, v30.8h
        sub       v3.8h, v3.8h, v30.8h
        stp       q0, q1, [x0]
        sub       v4.8h, v4.8h, v30.8h
        sub       v5.8h, v5.8h, v30.8h
        stp       q2, q3, [x0, #32]
        sub       v6.8h, v6.8h, v30.8h
        sub       v7.8h, v7.8h, v30.8h
        stp       q4, q5, [x0, #64]
        stp       q6, q7, [x0, #96]
        add       x0, x0, x1
        b.gt      64b
        ret
endfunc

// Offsets of the per-width entry points relative to the table, widest first.
jumptable prep_16bpc_tbl
        .word     640b - prep_16bpc_tbl
        .word     320b - prep_16bpc_tbl
        .word     160b - prep_16bpc_tbl
        .word     80b - prep_16bpc_tbl
        .word     40b - prep_16bpc_tbl
endjumptable


// Row loaders. Rows are fetched alternately through two pointers, \s0 and
// \s1, each post-incremented by \strd; \d0, \d2, \d4, \d6 come from \s0 and
// \d1, \d3, \d5 from \s1. Trailing destination arguments may be left blank.

// Loads lane 0 (element size given by \wd) of each destination register.
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1       {\d0\wd}[0], [\s0], \strd
        ld1       {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1       {\d2\wd}[0], [\s0], \strd
        ld1       {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1       {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1       {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1       {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Loads one whole register (arrangement \wd) per row.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1       {\d0\wd}, [\s0], \strd
        ld1       {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1       {\d2\wd}, [\s0], \strd
        ld1       {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1       {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1       {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1       {\d6\wd}, [\s0], \strd
.endif
.endm
// Loads a pair of consecutive registers per row.
.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
        ld1       {\d0\wd, \d1\wd}, [\s0], \strd
.ifnb \d2
        ld1       {\d2\wd, \d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1       {\d4\wd, \d5\wd}, [\s0], \strd
.endif
.endm
// Size-specific wrappers: one 32-bit lane, 4 halfwords, 8 halfwords and
// 16 halfwords (two registers) per row respectively.
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
        load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
.endm
// Pairs each row with the following one: after the macro, \rN holds the
// first element of the old \rN followed by the first element of \rN+1.
// \r3 and \r4 are optional; \r4 is only read.
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1      \r0\wd, \r0\wd, \r1\wd
        trn1      \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1      \r2\wd, \r2\wd, \r3\wd
        trn1      \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
.endm
// In-place unsigned minimum of one, two or four registers against \c.
// \r2 and \r3 must be given together.
.macro umin_h c, wd, r0, r1, r2, r3
        umin      \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        umin      \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        umin      \r2\wd, \r2\wd, \c\wd
        umin      \r3\wd, \r3\wd, \c\wd
.endif
.endm
// In-place subtraction of \c from one, two or four registers.
// \r2 and \r3 must be given together.
.macro sub_h c, wd, r0, r1, r2, r3
        sub       \r0\wd, \r0\wd, \c\wd
.ifnb \r1
        sub       \r1\wd, \r1\wd, \c\wd
.endif
.ifnb \r2
        sub       \r2\wd, \r2\wd, \c\wd
        sub       \r3\wd, \r3\wd, \c\wd
.endif
.endm
// Filter kernels: \d = sum of \sN * coefficient N, widening 16 -> 32 bit,
// with the coefficients taken from the halfword lanes of v0. The plain
// variants consume the low four halfwords of each source, the "2" variants
// the high four.
.macro smull_smlal_4tap d, s0, s1, s2, s3
        smull     \d\().4s, \s0\().4h, v0.h[0]
        smlal     \d\().4s, \s1\().4h, v0.h[1]
        smlal     \d\().4s, \s2\().4h, v0.h[2]
        smlal     \d\().4s, \s3\().4h, v0.h[3]
.endm
.macro smull2_smlal2_4tap d, s0, s1, s2, s3
        smull2    \d\().4s, \s0\().8h, v0.h[0]
        smlal2    \d\().4s, \s1\().8h, v0.h[1]
        smlal2    \d\().4s, \s2\().8h, v0.h[2]
        smlal2    \d\().4s, \s3\().8h, v0.h[3]
.endm
// The 6-tap kernels keep the 8-tap argument list but use only \s1..\s6 with
// coefficients 1..6; \s0 and \s7 are ignored.
.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull     \d\().4s, \s1\().4h, v0.h[1]
        smlal     \d\().4s, \s2\().4h, v0.h[2]
        smlal     \d\().4s, \s3\().4h, v0.h[3]
        smlal     \d\().4s, \s4\().4h, v0.h[4]
        smlal     \d\().4s, \s5\().4h, v0.h[5]
        smlal     \d\().4s, \s6\().4h, v0.h[6]
.endm
.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2    \d\().4s, \s1\().8h, v0.h[1]
        smlal2    \d\().4s, \s2\().8h, v0.h[2]
        smlal2    \d\().4s, \s3\().8h, v0.h[3]
        smlal2    \d\().4s, \s4\().8h, v0.h[4]
        smlal2    \d\().4s, \s5\().8h, v0.h[5]
        smlal2    \d\().4s, \s6\().8h, v0.h[6]
.endm
.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull     \d\().4s, \s0\().4h, v0.h[0]
        smlal     \d\().4s, \s1\().4h, v0.h[1]
        smlal     \d\().4s, \s2\().4h, v0.h[2]
        smlal     \d\().4s, \s3\().4h, v0.h[3]
        smlal     \d\().4s, \s4\().4h, v0.h[4]
        smlal     \d\().4s, \s5\().4h, v0.h[5]
        smlal     \d\().4s, \s6\().4h, v0.h[6]
        smlal     \d\().4s, \s7\().4h, v0.h[7]
.endm
.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
        smull2    \d\().4s, \s0\().8h, v0.h[0]
        smlal2    \d\().4s, \s1\().8h, v0.h[1]
        smlal2    \d\().4s, \s2\().8h, v0.h[2]
        smlal2    \d\().4s, \s3\().8h, v0.h[3]
        smlal2    \d\().4s, \s4\().8h, v0.h[4]
        smlal2    \d\().4s, \s5\().8h, v0.h[5]
        smlal2    \d\().4s, \s6\().8h, v0.h[6]
        smlal2    \d\().4s, \s7\().8h, v0.h[7]
.endm
// Rounding right shift by #\shift with unsigned saturating narrowing from
// 32 to 16 bit. \r1 is narrowed into the upper half of \r0 and \r3 into the
// upper half of \r2, so results end up in \r0 (and \r2).
.macro sqrshrun_h shift, r0, r1, r2, r3
        sqrshrun  \r0\().4h, \r0\().4s, #\shift
.ifnb \r1
        sqrshrun2 \r0\().8h, \r1\().4s, #\shift
.endif
.ifnb \r2
        sqrshrun  \r2\().4h, \r2\().4s, #\shift
        sqrshrun2 \r2\().8h, \r3\().4s, #\shift
.endif
.endm
// Truncating narrow from 32 to 16 bit of the pairs \r0:\r1 (into \r0) and
// optionally \r2:\r3 (into \r2).
.macro xtn_h r0, r1, r2, r3
        uzp1      \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2
.ifnb \r2
        uzp1      \r2\().8h, \r2\().8h, \r3\().8h // Ditto
.endif
.endm
// In-place signed rounding shift of two or four .4s registers by the
// per-lane amounts in \shift (negative amounts shift right).
.macro srshl_s shift, r0, r1, r2, r3
        srshl     \r0\().4s, \r0\().4s, \shift\().4s
        srshl     \r1\().4s, \r1\().4s, \shift\().4s
.ifnb \r2
        srshl     \r2\().4s, \r2\().4s, \shift\().4s
        srshl     \r3\().4s, \r3\().4s, \shift\().4s
.endif
.endm
// Store helpers. They hard-code x0 and x9 as the two row pointers (even
// lanes/registers go to [x0], odd ones to [x9]), each post-incremented by
// \strd.

// Stores two or four 32-bit lanes of \reg.
.macro st_s strd, reg, lanes
        st1       {\reg\().s}[0], [x0], \strd
        st1       {\reg\().s}[1], [x9], \strd
.if \lanes > 2
        st1       {\reg\().s}[2], [x0], \strd
        st1       {\reg\().s}[3], [x9], \strd
.endif
.endm
// Stores the low and high 64-bit halves of \r0 (and optionally \r1).
.macro st_d strd, r0, r1
        st1       {\r0\().8b}, [x0], \strd
        st1       {\r0\().d}[1], [x9], \strd
.ifnb \r1
        st1       {\r1\().8b}, [x0], \strd
        st1       {\r1\().d}[1], [x9], \strd
.endif
.endm
// Final scaling and store of 4-pixel-wide rows held as .4s accumulators.
// For \type == put: round-shift by 6, narrow with unsigned saturation and
// clamp against v31. Otherwise: round-shift by v30, narrow and subtract v29.
.macro shift_store_4 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin_h    v31, .8h, \r0, \r2
.else
        srshl_s   v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h     \r0, \r1, \r2, \r3
        sub_h     v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_d      \strd, \r0, \r2
.endm
// Stores two, four or eight whole registers, alternating between x0 and x9.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1       {\r0\wd}, [x0], \strd
        st1       {\r1\wd}, [x9], \strd
.ifnb \r2
        st1       {\r2\wd}, [x0], \strd
        st1       {\r3\wd}, [x9], \strd
.endif
.ifnb \r4
        st1       {\r4\wd}, [x0], \strd
        st1       {\r5\wd}, [x9], \strd
        st1       {\r6\wd}, [x0], \strd
        st1       {\r7\wd}, [x9], \strd
.endif
.endm
.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg    \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Same scaling as shift_store_4, for two 8-pixel-wide rows (\r0:\r1 and
// \r2:\r3), stored to [x0] and [x9].
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin_h    v31, .8h, \r0, \r2
.else
        srshl_s   v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h     \r0, \r1, \r2, \r3
        sub_h     v29, .8h, \r0, \r2 // PREP_BIAS
.endif
        st_8h     \strd, \r0, \r2
.endm
// Same scaling for one 16-pixel-wide row. The second half is moved from \r2
// into \r1 by the final umin/sub so that \r0, \r1 can be stored as one
// register pair to [\dst].
.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_h 6, \r0, \r1, \r2, \r3
        umin      \r0\().8h, \r0\().8h, v31.8h
        umin      \r1\().8h, \r2\().8h, v31.8h
.else
        srshl_s   v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
        xtn_h     \r0, \r1, \r2, \r3
        sub       \r0\().8h, \r0\().8h, v29.8h
        sub       \r1\().8h, \r2\().8h, v29.8h
.endif
        st1       {\r0\().8h, \r1\().8h}, [\dst], \strd
.endm

// Defines the exported entry point \op\()_8tap_\name\()_16bpc_neon, which
// only loads the filter selector \type into w9 and tail-branches to the
// shared implementation \op\()_\taps\()_neon.
.macro make_8tap_fn op, name, type, taps
function \op\()_8tap_\name\()_16bpc_neon, export=1
        mov       w9, \type
        b         \op\()_\taps\()_neon
endfunc
.endm

// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps function \type\()_\taps\()_neon .ifc \bdmax, w8 ldr w8, [sp] .endif mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w11 mul \my, \my, w11 add \mx, \mx, w9 // mx, 8tap_h, 4tap_h add \my, \my, w9 // my, 8tap_v, 4tap_v .ifc \type, prep lsl \d_strd, \d_strd, #1 .endif dup v31.8h, \bdmax // bitdepth_max clz \bdmax, \bdmax clz w9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 mov w12, #6 tst \mx, #(0x7f << 14) sub w9, w9, #25 add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, X(mc_subpel_filters), -8 b.ne L(\type\()_\taps\()_h) tst \my, #(0x7f << 14) b.ne L(\type\()_\taps\()_v) b \type\()_16bpc_neon L(\type\()_\taps\()_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w10 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 b.ne L(\type\()_\taps\()_hv) movrel x10, \type\()_\taps\()_h_tbl ldrsw x9, [x10, x9, lsl #2] .ifc \type, put mov w12, #34 // rounding for 10-bit mov w13, #40 // rounding for 12-bit cmp \bdmax, #2 // 10-bit: 4, 12-bit: 2 csel w12, w12, w13, ne // select rounding based on \bdmax .else neg w12, w12 // -(6 - intermediate_bits) movi v28.8h, #(PREP_BIAS >> 8), lsl #8 .endif add x10, x10, x9 dup v30.4s, w12 // rounding or shift amount br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put ldur s0, [\xmx, #2] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd mov v2.16b, v30.16b ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s smlal v2.4s, v3.4h, v0.h[0] smlal v2.4s, v4.4h, 
v0.h[1] smlal v2.4s, v6.4h, v0.h[2] smlal v2.4s, v7.4h, v0.h[3] sqshrun v2.4h, v2.4s, #6 umin v2.4h, v2.4h, v31.4h st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET ldur s0, [\xmx, #2] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8h}, [\src], \s_strd ld1 {v20.8h}, [\sr2], \s_strd .ifc \type, put mov v2.16b, v30.16b mov v3.16b, v30.16b .endif ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 .ifc \type, put smlal v2.4s, v16.4h, v0.h[0] .else smull v2.4s, v16.4h, v0.h[0] .endif smlal v2.4s, v17.4h, v0.h[1] smlal v2.4s, v18.4h, v0.h[2] smlal v2.4s, v19.4h, v0.h[3] .ifc \type, put smlal v3.4s, v20.4h, v0.h[0] .else smull v3.4s, v20.4h, v0.h[0] .endif smlal v3.4s, v21.4h, v0.h[1] smlal v3.4s, v22.4h, v0.h[2] smlal v3.4s, v23.4h, v0.h[3] .ifc \type, put sqshrun v16.4h, v2.4s, #6 sqshrun2 v16.8h, v3.4s, #6 umin v16.8h, v16.8h, v31.8h .else srshl v16.4s, v2.4s, v30.4s // -(6-intermediate_bits) srshl v20.4s, v3.4s, v30.4s // -(6-intermediate_bits) uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 sub v16.8h, v16.8h, v28.8h // PREP_BIAS .endif st1 {v16.8b}, [\dst], \d_strd st1 {v16.d}[1], [\ds2], \d_strd b.gt 4b ret 80: 160: 320: 640: // 8xN, 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] .ifc \taps, 6tap sub \src, \src, #4 .else sub \src, \src, #6 .endif add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw #1 sub \s_strd, \s_strd, #16 lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw #1 81: ld1 {v16.8h, v17.8h}, [\src], #32 ld1 {v20.8h, v21.8h}, [\sr2], #32 mov \mx, \w 8: .ifc \taps, 6tap .ifc \type, put mov v18.16b, v30.16b mov v19.16b, v30.16b smlal v18.4s, v16.4h, v0.h[1] smlal2 v19.4s, v16.8h, v0.h[1] mov v22.16b, v30.16b mov v23.16b, v30.16b smlal v22.4s, v20.4h, v0.h[1] smlal2 v23.4s, v20.8h, v0.h[1] .else smull v18.4s, v16.4h, v0.h[1] smull2 v19.4s, v16.8h, v0.h[1] smull v22.4s, v20.4h, v0.h[1] smull2 v23.4s, v20.8h, v0.h[1] .endif .irpc i, 23456 ext v24.16b, v16.16b, v17.16b, #(2*\i-2) ext v25.16b, v20.16b, v21.16b, #(2*\i-2) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr .else // 8tap .ifc \type, put mov v18.16b, v30.16b mov v19.16b, v30.16b smlal v18.4s, v16.4h, v0.h[0] smlal2 v19.4s, v16.8h, v0.h[0] mov v22.16b, v30.16b mov v23.16b, v30.16b smlal v22.4s, v20.4h, v0.h[0] smlal2 v23.4s, v20.8h, v0.h[0] .else smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] smull2 v23.4s, v20.8h, v0.h[0] .endif .irpc i, 1234567 ext v24.16b, v16.16b, v17.16b, #(2*\i) ext v25.16b, v20.16b, v21.16b, #(2*\i) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr .endif subs \mx, \mx, #8 .ifc \type, put sqshrun v18.4h, v18.4s, #6 sqshrun2 v18.8h, v19.4s, #6 sqshrun v22.4h, v22.4s, #6 sqshrun2 v22.8h, v23.4s, #6 umin v18.8h, v18.8h, v31.8h umin v22.8h, v22.8h, v31.8h .else srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) srshl 
v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 uzp1 v22.8h, v22.8h, v23.8h // Ditto sub v18.8h, v18.8h, v28.8h // PREP_BIAS sub v22.8h, v22.8h, v28.8h // PREP_BIAS .endif st1 {v18.8h}, [\dst], #16 st1 {v22.8h}, [\ds2], #16 b.le 9f mov v16.16b, v17.16b mov v20.16b, v21.16b ld1 {v17.8h}, [\src], #16 ld1 {v21.8h}, [\sr2], #16 b 8b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 81b ret endfunc jumptable \type\()_\taps\()_h_tbl .word 640b - \type\()_\taps\()_h_tbl .word 320b - \type\()_\taps\()_h_tbl .word 160b - \type\()_\taps\()_h_tbl .word 80b - \type\()_\taps\()_h_tbl .word 40b - \type\()_\taps\()_h_tbl .word 20b - \type\()_\taps\()_h_tbl endjumptable function L(\type\()_\taps\()_v) cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 .ifc \type, prep dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif movrel x10, \type\()_\taps\()_v_tbl ldrsw x9, [x10, x9, lsl #2] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) .endif add x10, x10, x9 br x10 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f smull_smlal_4tap v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 ret 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 smull_smlal_4tap v16, v1, v2, v3, v4 smull_smlal_4tap v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, 
\s_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_s v1, v2, v3, v4, v5 interleave_1_s v5, v6, v7 216: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 smull_smlal_\taps v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h v31, .8h, v24 st_s \d_strd, v24, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 smull_smlal_\taps v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4tap v6, v1, v2, v3, v4 smull_smlal_4tap v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 smull_smlal_4tap v1, v3, v4, v5, v6 smull_smlal_4tap v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 mov v16.8b, v20.8b 
mov v17.8b, v21.8b mov v18.8b, v22.8b mov v19.8b, v23.8b mov v20.8b, v24.8b mov v21.8b, v25.8b mov v22.8b, v26.8b b.eq 46f b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 ldur s0, [\xmy, #2] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4tap v16, v1, v2, v3, v4 smull2_smlal2_4tap v17, v1, v2, v3, v4 smull_smlal_4tap v18, v2, v3, v4, v5 smull2_smlal2_4tap v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 smull_smlal_4tap v16, v3, v4, v5, v6 smull2_smlal2_4tap v17, v3, v4, v5, v6 smull_smlal_4tap v18, v4, v5, v6, v7 smull2_smlal2_4tap v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23 smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_\taps v3, v17, v18, v19, v20, v21, v22, v23, v24 smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 smull_smlal_\taps v1, v18, v19, v20, v21, v22, v23, v24, v25 smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_\taps v3, v19, v20, v21, v22, v23, v24, v25, v26 smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b mov v17.16b, v21.16b mov v18.16b, v22.16b mov v19.16b, v23.16b mov v20.16b, v24.16b mov v21.16b, v25.16b mov v22.16b, v26.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v ldur s0, [\xmy, #2] sub \src, \src, \s_strd sxtl v0.8h, v0.8b load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 smull_smlal_4tap v1, v16, v18, v20, v22 smull2_smlal2_4tap v2, v16, v18, v20, v22 smull_smlal_4tap v3, v17, v19, v21, v23 smull2_smlal2_4tap v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b b 16b 
0: ret endfunc jumptable \type\()_\taps\()_v_tbl .word 640b - \type\()_\taps\()_v_tbl .word 320b - \type\()_\taps\()_v_tbl .word 160b - \type\()_\taps\()_v_tbl .word 80b - \type\()_\taps\()_v_tbl .word 40b - \type\()_\taps\()_v_tbl .word 20b - \type\()_\taps\()_v_tbl endjumptable function L(\type\()_\taps\()_hv) cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 movrel x10, \type\()_\taps\()_hv_tbl dup v30.4s, w12 // 6 - intermediate_bits ldrsw x9, [x10, x9, lsl #2] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.4s, w13 // 6 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif add x10, x10, x9 .ifc \type, put neg v29.4s, v29.4s // -(6+intermediate_bits) .endif br x10 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put ldur s0, [\xmx, #2] b.gt 280f ldur s1, [\xmy, #2] // 2x2, 2x4 hv sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) bl L(\type\()_\taps\()_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b 2: bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s umin v2.4h, v2.4h, v31.4h subs \h, \h, #2 st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
bl L(\type\()_\taps\()_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b bl L(\type\()_\taps\()_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b bl L(\type\()_\taps\()_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: bl L(\type\()_\taps\()_filter_2) ext v22.8b, v21.8b, v24.8b, #4 .ifc \taps, 6tap smull v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] .else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] .endif srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s umin v3.4h, v3.4h, v31.4h subs \h, \h, #2 st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b b 28b 0: ret x15 L(\type\()_\taps\()_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v28.16b, v27.16b, v27.16b, #2 trn1 v24.2s, v25.2s, v27.2s trn2 v27.2s, v25.2s, v27.2s trn1 v25.2s, v26.2s, v28.2s trn2 v28.2s, v26.2s, v28.2s smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v25.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s ret .endif 40: AARCH64_VALID_JUMP_TARGET ldur s0, [\xmx, #2] b.gt 480f ldur s1, [\xmy, #2] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal 
v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: bl L(\type\()_\taps\()_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v24.4h, v1.h[2] smlal v3.4s, v25.4h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s umin v2.8h, v2.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8b}, [\dst], \d_strd st1 {v2.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b mov v18.8b, v25.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 .ifc \taps, 6tap sub \sr2, \src, \s_strd sub \src, \src, \s_strd, lsl #1 .else sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd .endif add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a 
significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). .ifc \taps, 6tap xtn v18.4h, v16.4s .else xtn v16.4h, v16.4s bl L(\type\()_\taps\()_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b .endif bl L(\type\()_\taps\()_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b bl L(\type\()_\taps\()_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: bl L(\type\()_\taps\()_filter_4) .ifc \taps, 6tap smull v3.4s, v18.4h, v1.h[1] smlal v3.4s, v19.4h, v1.h[2] smlal v3.4s, v20.4h, v1.h[3] smlal v3.4s, v21.4h, v1.h[4] smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v24.4h, v1.h[6] smull v4.4s, v19.4h, v1.h[1] smlal v4.4s, v20.4h, v1.h[2] smlal v4.4s, v21.4h, v1.h[3] smlal v4.4s, v22.4h, v1.h[4] smlal v4.4s, v24.4h, v1.h[5] smlal v4.4s, v25.4h, v1.h[6] .else // 8tap smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] smull v4.4s, v17.4h, v1.h[0] smlal v4.4s, v18.4h, v1.h[1] smlal v4.4s, v19.4h, v1.h[2] smlal v4.4s, v20.4h, v1.h[3] smlal v4.4s, v21.4h, v1.h[4] smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] .endif .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s sqxtun2 v3.8h, v4.4s umin v3.8h, v3.8h, v31.8h .else rshrn v3.4h, v3.4s, #6 rshrn2 v3.8h, v4.4s, #6 sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v3.8b}, [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f .ifc \taps, 8tap mov v16.8b, v18.8b mov v17.8b, v19.8b .endif mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b mov v22.8b, v25.8b b 48b 0: ret x15 L(\type\()_\taps\()_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #4 ext v28.16b, 
v24.16b, v24.16b, #6 smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v26.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s xtn v25.4h, v25.4s ret 80: 160: 320: AARCH64_VALID_JUMP_TARGET b.gt 880f ld1 {v0.8b}, [\xmx] ldur s1, [\xmy, #2] .ifc \taps, 6tap sub \src, \src, #4 .else sub \src, \src, #6 .endif sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd .ifc \taps, 6tap smull v24.4s, v27.4h, v0.h[1] smull2 v25.4s, v27.8h, v0.h[1] .irpc i, 23456 ext v26.16b, v27.16b, v28.16b, #(2*\i-2) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr .else smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr .endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). 
uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v23.4h, v1.h[2] smlal2 v5.4s, v23.8h, v1.h[2] smlal v2.4s, v23.4h, v1.h[3] smlal2 v3.4s, v23.8h, v1.h[3] smlal v4.4s, v24.4h, v1.h[3] smlal2 v5.4s, v24.8h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b mov v17.16b, v23.16b mov v18.16b, v24.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] .ifc \taps, 6tap sub \src, \src, #4 .else sub \src, \src, #6 sub \src, \src, \s_strd .endif sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd .ifc \taps, 6tap smull v24.4s, v27.4h, v0.h[1] smull2 v25.4s, v27.8h, v0.h[1] .irpc i, 23456 ext v26.16b, v27.16b, v28.16b, #(2*\i-2) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr .else // 8tap smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr .endif srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). 
.ifc \taps, 6tap uzp1 v18.8h, v24.8h, v25.8h // Same as xtn, xtn2 .else uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_\taps\()_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b .endif bl L(\type\()_\taps\()_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b bl L(\type\()_\taps\()_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: .ifc \taps, 6tap smull v2.4s, v18.4h, v1.h[1] smull2 v3.4s, v18.8h, v1.h[1] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v19.4h, v1.h[1] smull2 v5.4s, v19.8h, v1.h[1] smlal v2.4s, v19.4h, v1.h[2] smlal2 v3.4s, v19.8h, v1.h[2] smlal v4.4s, v20.4h, v1.h[2] smlal2 v5.4s, v20.8h, v1.h[2] smlal v2.4s, v20.4h, v1.h[3] smlal2 v3.4s, v20.8h, v1.h[3] smlal v4.4s, v21.4h, v1.h[3] smlal2 v5.4s, v21.8h, v1.h[3] smlal v2.4s, v21.4h, v1.h[4] smlal2 v3.4s, v21.8h, v1.h[4] smlal v4.4s, v22.4h, v1.h[4] smlal2 v5.4s, v22.8h, v1.h[4] smlal v2.4s, v22.4h, v1.h[5] smlal2 v3.4s, v22.8h, v1.h[5] smlal v4.4s, v23.4h, v1.h[5] smlal2 v5.4s, v23.8h, v1.h[5] smlal v2.4s, v23.4h, v1.h[6] smlal2 v3.4s, v23.8h, v1.h[6] smlal v4.4s, v24.4h, v1.h[6] smlal2 v5.4s, v24.8h, v1.h[6] .else // 8tap smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_\taps\()_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v23.4h, v1.h[6] smlal2 v5.4s, v23.8h, v1.h[6] smlal 
v2.4s, v23.4h, v1.h[7] smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] .endif .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f .ifc \taps, 8tap mov v16.16b, v18.16b mov v17.16b, v19.16b .endif mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b mov v22.16b, v24.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 .ifc \taps, 6tap add \src, \src, \s_strd, lsl #1 .endif b 168b 0: ret x15 L(\type\()_\taps\()_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd .ifc \taps, 6tap smull v25.4s, v4.4h, v0.h[1] smull2 v26.4s, v4.8h, v0.h[1] smull v27.4s, v6.4h, v0.h[1] smull2 v28.4s, v6.8h, v0.h[1] .irpc i, 23456 ext v23.16b, v4.16b, v5.16b, #(2*\i-2) ext v24.16b, v6.16b, v7.16b, #(2*\i-2) smlal v25.4s, v23.4h, v0.h[\i] smlal2 v26.4s, v23.8h, v0.h[\i] smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr .else // 8tap smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] smull2 v28.4s, v6.8h, v0.h[0] .irpc i, 1234567 ext v23.16b, v4.16b, v5.16b, #(2*\i) ext v24.16b, v6.16b, v7.16b, #(2*\i) smlal v25.4s, v23.4h, v0.h[\i] smlal2 v26.4s, v23.8h, v0.h[\i] smlal v27.4s, v24.4h, v0.h[\i] 
smlal2          v28.4s, v24.8h, v0.h[\i]
.endr
.endif
        srshl           v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
        srshl           v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
        srshl           v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
        srshl           v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
        uzp1            v23.8h, v25.8h, v26.8h // Same as xtn, xtn2
        uzp1            v24.8h, v27.8h, v28.8h // Ditto
        ret
endfunc

jumptable \type\()_\taps\()_hv_tbl
        .word 640b - \type\()_\taps\()_hv_tbl
        .word 320b - \type\()_\taps\()_hv_tbl
        .word 160b - \type\()_\taps\()_hv_tbl
        .word 80b - \type\()_\taps\()_hv_tbl
        .word 40b - \type\()_\taps\()_hv_tbl
        .word 20b - \type\()_\taps\()_hv_tbl
endjumptable
.endm

// filter_bilin_fn: emits the 16 bpc bilinear entry point for one type
// (put or prep) together with its h, v and hv code paths and jump tables.
//
// Macro arguments are register bindings supplied at instantiation:
//   dst, d_strd   destination pointer and stride
//   src, s_strd   source pointer and stride
//   w, h          block width and height
//   mx / xmx      horizontal weight (32 bit / 64 bit view of one register)
//   my / xmy      vertical weight (32 bit / 64 bit view of one register)
//   bdmax         bitdepth_max; read from [sp] when it is bound to w8
//   ds2, sr2      scratch pointers used for the second row of each row pair
//
// Register roles set up by the entry code:
//   v0 = 16 - mx, v1 = mx, v2 = 16 - my, v3 = my (all as .8h)
//   bdmax is overwritten with intermediate_bits
//   w9  = clz(w) - 25, the jump table index (entry 0 is the 640 label)
//   w11 = 4 - intermediate_bits, w12 = 4 + intermediate_bits
//
// Dispatch: mx != 0 and my != 0 -> hv, mx != 0 -> h, my != 0 -> v, and with
// both zero the code tail-branches to the type-specific 16bpc_neon function
// (presumably the unfiltered path; it is defined outside this macro).
// The 2xN cases are only assembled for put.
.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8, [sp]
.endif
        dup             v1.8h, \mx
        dup             v3.8h, \my
        mov             w10, #16
        sub             w9, w10, \mx
        sub             w10, w10, \my
        dup             v0.8h, w9
        dup             v2.8h, w10
.ifc \type, prep
        // NOTE(review): presumably converts the prep stride from elements
        // to bytes - confirm against the caller.
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax, \bdmax // bitdepth_max
        clz             w9, \w
        sub             \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9, w9, #25
        sub             w11, w11, \bdmax // 4 - intermediate_bits
        add             w12, \bdmax, #4 // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_16bpc_neon

        // Horizontal-only path. v31 holds the negated first-stage shift;
        // put additionally shifts by -intermediate_bits (v30), prep instead
        // subtracts the PREP_BIAS constant held in v29.
L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        movrel          x10, \type\()_bilin_h_tbl
        dup             v31.8h, w11 // 4 - intermediate_bits
        ldrsw           x9, [x10, x9, lsl #2]
        neg             v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h, \bdmax // intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        add             x10, x10, x9
.ifc \type, put
        neg             v30.8h, v30.8h // -intermediate_bits
.endif
        br              x10

20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // Two rows per iteration: ds2/sr2 address the odd rows and both
        // strides are doubled.
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
2:
        ld1             {v4.4h}, [\src], \s_strd
        ld1             {v6.4h}, [\sr2], \s_strd
        ext             v5.8b, v4.8b, v4.8b, #2
        ext             v7.8b, v6.8b, v6.8b, #2
        // Pack both rows into one vector: lane 0 = row 0, lane 1 = row 1.
        trn1            v4.2s, v4.2s, v6.2s
        trn1            v5.2s, v5.2s, v7.2s
        subs            \h, \h, #2
        mul             v4.4h, v4.4h, v0.4h
        mla             v4.4h, v5.4h, v1.4h
        urshl           v4.4h, v4.4h, v31.4h
        urshl           v4.4h, v4.4h, v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #2
        ext             v7.16b, v6.16b, v6.16b, #2
        // Low half = row 0, high half = row 1.
        trn1            v4.2d, v4.2d, v6.2d
        trn1            v5.2d, v5.2d, v7.2d
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1
8:
        // Ninth pixel of each row, needed by the one-pixel-shifted ext below.
        ldr             h5, [\src, #16]
        ldr             h7, [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v5.16b, #2
        ext             v7.16b, v6.16b, v7.16b, #2
        subs            \h, \h, #2
        mul             v4.8h, v4.8h, v0.8h
        mla             v4.8h, v5.8h, v1.8h
        mul             v6.8h, v6.8h, v0.8h
        mla             v6.8h, v7.8h, v1.8h
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
.ifc \type, put
        urshl           v4.8h, v4.8h, v30.8h
        urshl           v6.8h, v6.8h, v30.8h
.else
        sub             v4.8h, v4.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
160:
320:
640:    // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        // The pointers advance across each row via post-increment, so the
        // doubled strides are reduced by the bytes consumed per row:
        // 2*w for both, plus the 16 byte preload for the source.
        lsl             \s_strd, \s_strd, #1
        sub             \s_strd, \s_strd, \w, uxtw #1
        sub             \s_strd, \s_strd, #16
        lsl             \d_strd, \d_strd, #1
        sub             \d_strd, \d_strd, \w, uxtw #1
161:
        ld1             {v16.8h}, [\src], #16
        ld1             {v21.8h}, [\sr2], #16
        // mx is reused as the per-row column counter; its weight already
        // lives in v0/v1.
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h}, [\src], #32
        ld1             {v22.8h, v23.8h}, [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h, v16.8h, v0.8h
        mla             v16.8h, v19.8h, v1.8h
        mul             v17.8h, v17.8h, v0.8h
        mla             v17.8h, v20.8h, v1.8h
        mul             v21.8h, v21.8h, v0.8h
        mla             v21.8h, v24.8h, v1.8h
        mul             v22.8h, v22.8h, v0.8h
        mla             v22.8h, v25.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v21.8h, v21.8h, v31.8h
        urshl           v22.8h, v22.8h, v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h, v16.8h, v30.8h
        urshl           v17.8h, v17.8h, v30.8h
        urshl           v21.8h, v21.8h, v30.8h
        urshl           v22.8h, v22.8h, v30.8h
.else
        sub             v16.8h, v16.8h, v29.8h
        sub             v17.8h, v17.8h, v29.8h
        sub             v21.8h, v21.8h, v29.8h
        sub             v22.8h, v22.8h, v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        // Carry the last loaded vectors over as the start of the next
        // 16 pixel chunk.
        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
        add             \src, \src, \s_strd
        add             \sr2, \sr2, \s_strd

        subs            \h, \h, #2
        b.gt            161b
        ret
endfunc

jumptable \type\()_bilin_h_tbl
        .word 640b - \type\()_bilin_h_tbl
        .word 320b - \type\()_bilin_h_tbl
        .word 160b - \type\()_bilin_h_tbl
        .word 80b - \type\()_bilin_h_tbl
        .word 40b - \type\()_bilin_h_tbl
        .word 20b - \type\()_bilin_h_tbl
endjumptable


// Vertical-only path. put rounds with a fixed shift of 4; prep shifts by
// 4 - intermediate_bits (v31, negated) and subtracts the PREP_BIAS constant
// in v29. The flags from the initial cmp are not consumed before the next
// flag-setting instruction in any of the paths below.
function L(\type\()_bilin_v)
        cmp             \h, #4
        movrel          x10, \type\()_bilin_v_tbl
.ifc \type, prep
        dup             v31.8h, w11 // 4 - intermediate_bits
.endif
        ldrsw           x9, [x10, x9, lsl #2]
.ifc \type, prep
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h, v31.8h // -(4-intermediate_bits)
.endif
        add             x10, x10, x9
        br              x10

20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // Flags from this cmp are consumed by the b.gt after the first load.
        cmp             \h, #2
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // 2x2 v
        ld1r            {v16.4s}, [\src], \s_strd
        b.gt            24f
22:
        ld1r            {v17.4s}, [\sr2], \s_strd
        ld1r            {v18.4s}, [\src], \s_strd
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        mul             v4.4h, v16.4h, v2.4h
        mla             v4.4h, v17.4h, v3.4h
        urshr           v4.8h, v4.8h, #4
        str             s4, [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24:     // 2x4, 2x6, 2x8, ... v
        ld1r            {v17.4s}, [\sr2], \s_strd
        ld1r            {v18.4s}, [\src], \s_strd
        ld1r            {v19.4s}, [\sr2], \s_strd
        ld1r            {v20.4s}, [\src], \s_strd
        sub             \h, \h, #4
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        trn1            v18.2s, v18.2s, v19.2s
        trn1            v19.2s, v19.2s, v20.2s
        trn1            v16.2d, v16.2d, v18.2d
        trn1            v17.2d, v17.2d, v19.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        // Remaining rows: < 2 done, == 2 finish via the 2x2 tail, > 2 loop.
        cmp             \h, #2
        urshr           v4.8h, v4.8h, #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.lt            0f
        mov             v16.8b, v20.8b
        b.eq            22b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        // v16 = rows n, n+1 and v17 = rows n+1, n+2 (one row per 64 bit half).
        trn1            v16.2d, v16.2d, v17.2d
        trn1            v17.2d, v17.2d, v18.2d
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
.endif
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v17.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v18.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Processed as 16 pixel wide column strips; my is reused to keep
        // the original height for the next strip (its weight is in v2/v3).
        mov             \my, \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h, v16.8h, v2.8h
        mla             v4.8h, v18.8h, v3.8h
        mul             v5.8h, v17.8h, v2.8h
        mla             v5.8h, v19.8h, v3.8h
        mul             v6.8h, v18.8h, v2.8h
        mla             v6.8h, v20.8h, v3.8h
        mul             v7.8h, v19.8h, v2.8h
        mla             v7.8h, v21.8h, v3.8h
        subs            \h, \h, #2
.ifc \type, put
        urshr           v4.8h, v4.8h, #4
        urshr           v5.8h, v5.8h, #4
        urshr           v6.8h, v6.8h, #4
        urshr           v7.8h, v7.8h, #4
.else
        urshl           v4.8h, v4.8h, v31.8h
        urshl           v5.8h, v5.8h, v31.8h
        urshl           v6.8h, v6.8h, v31.8h
        urshl           v7.8h, v7.8h, v31.8h
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
        sub             v6.8h, v6.8h, v29.8h
        sub             v7.8h, v7.8h, v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        subs            \w, \w, #16
        b.le            0f
        // Restore single-row strides, rewind dst by h rows and src by
        // h + 2 rows (src was advanced by the extra initial row pair),
        // then step 16 pixels (32 bytes) to the right.
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret
endfunc

jumptable \type\()_bilin_v_tbl
        .word 640b - \type\()_bilin_v_tbl
        .word 320b - \type\()_bilin_v_tbl
        .word 160b - \type\()_bilin_v_tbl
        .word 80b - \type\()_bilin_v_tbl
        .word 40b - \type\()_bilin_v_tbl
        .word 20b - \type\()_bilin_v_tbl
endjumptable

// Combined horizontal + vertical path. The horizontal pass is shifted by
// 4 - intermediate_bits (v31, negated) and kept in 16 bit lanes; the
// vertical pass accumulates in 32 bit lanes via umull/umlal. put then
// shifts by 4 + intermediate_bits (v30, negated) and narrows; prep narrows
// with a rounding shift of 4 and subtracts the PREP_BIAS constant in v29.
function L(\type\()_bilin_hv)
        movrel          x10, \type\()_bilin_hv_tbl
        dup             v31.8h, w11 // 4 - intermediate_bits
        ldrsw           x9, [x10, x9, lsl #2]
        neg             v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s, w12 // 4 + intermediate_bits
.else
        movi            v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
        add             x10, x10, x9
.ifc \type, put
        neg             v30.4s, v30.4s // -(4+intermediate_bits)
.endif
        br              x10

20:     // 2xN hv
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Horizontal pass for the first row; it seeds v16.
        ld1             {v20.4h}, [\src], \s_strd
        ext             v21.8b, v20.8b, v20.8b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

2:
        ld1             {v22.4h}, [\sr2], \s_strd
        ld1             {v24.4h}, [\src], \s_strd
        ext             v23.8b, v22.8b, v22.8b, #2
        ext             v25.8b, v24.8b, v24.8b, #2
        trn1            v22.2s, v22.2s, v24.2s
        trn1            v23.2s, v23.2s, v25.2s
        mul             v17.4h, v22.4h, v0.4h
        mla             v17.4h, v23.4h, v1.4h
        urshl           v17.4h, v17.4h, v31.4h

        // v16 = rows n, n+1 and v17 = rows n+1, n+2.
        trn1            v16.2s, v16.2s, v17.2s

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        urshl           v4.4s, v4.4s, v30.4s
        xtn             v4.4h, v4.4s
        subs            \h, \h, #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        // Keep the last filtered row as the first row of the next pair.
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        AARCH64_VALID_JUMP_TARGET
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Horizontal pass for the first row; it seeds v16.
        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h, v20.4h, v0.4h
        mla             v16.4h, v21.4h, v1.4h
        urshl           v16.4h, v16.4h, v31.4h

4:
        ld1             {v22.8h}, [\sr2], \s_strd
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d, v22.2d, v24.2d
        trn1            v23.2d, v23.2d, v25.2d
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h

        trn1            v16.2d, v16.2d, v17.2d

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        uzp1            v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        sub             v4.8h, v4.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        // Keep the last filtered row as the first row of the next pair.
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Processed as 8 pixel wide column strips; my is reused to keep
        // the original height for the next strip (its weight is in v2/v3).
        mov             \my, \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Horizontal pass for the first row of the strip; it seeds v16.
        ldr             h21, [\src, #16]
        ld1             {v20.8h}, [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h, v20.8h, v0.8h
        mla             v16.8h, v21.8h, v1.8h
        urshl           v16.8h, v16.8h, v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h}, [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h}, [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h, v22.8h, v0.8h
        mla             v17.8h, v23.8h, v1.8h
        mul             v18.8h, v24.8h, v0.8h
        mla             v18.8h, v25.8h, v1.8h
        urshl           v17.8h, v17.8h, v31.8h
        urshl           v18.8h, v18.8h, v31.8h

        umull           v4.4s, v16.4h, v2.4h
        umlal           v4.4s, v17.4h, v3.4h
        umull2          v5.4s, v16.8h, v2.8h
        umlal2          v5.4s, v17.8h, v3.8h
        umull           v6.4s, v17.4h, v2.4h
        umlal           v6.4s, v18.4h, v3.4h
        umull2          v7.4s, v17.8h, v2.8h
        umlal2          v7.4s, v18.8h, v3.8h
.ifc \type, put
        urshl           v4.4s, v4.4s, v30.4s
        urshl           v5.4s, v5.4s, v30.4s
        urshl           v6.4s, v6.4s, v30.4s
        urshl           v7.4s, v7.4s, v30.4s
        uzp1            v4.8h, v4.8h, v5.8h // Same as xtn, xtn2
        uzp1            v5.8h, v6.8h, v7.8h // Ditto
.else
        rshrn           v4.4h, v4.4s, #4
        rshrn2          v4.8h, v5.4s, #4
        rshrn           v5.4h, v6.4s, #4
        rshrn2          v5.8h, v7.4s, #4
        sub             v4.8h, v4.8h, v29.8h
        sub             v5.8h, v5.8h, v29.8h
.endif
        subs            \h, \h, #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        subs            \w, \w, #8
        b.le            0f
        // Restore single-row strides, rewind dst by h rows and src by
        // h + 2 rows, then step 8 pixels (16 bytes) to the right.
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h, \my
        add             \src, \src, #16
        add             \dst, \dst, #16
        b               1b
0:
        ret
endfunc

jumptable \type\()_bilin_hv_tbl
        .word 640b - \type\()_bilin_hv_tbl
        .word 320b - \type\()_bilin_hv_tbl
        .word 160b - \type\()_bilin_hv_tbl
        .word 80b - \type\()_bilin_hv_tbl
        .word 40b - \type\()_bilin_hv_tbl
        .word 20b - \type\()_bilin_hv_tbl
endjumptable
.endm

// Instantiations for put. Each filter_fn expansion follows the make_8tap_fn
// lines that name the same tap count (8tap for sharp, 6tap for regular and
// smooth).
make_8tap_fn    put, sharp, SHARP, 8tap
filter_fn       put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
make_8tap_fn    put, regular, REGULAR, 6tap
make_8tap_fn put,  smooth,  SMOOTH,  6tap
filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10

make_8tap_fn prep,  sharp,   SHARP,   8tap
filter_fn prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
make_8tap_fn prep,  regular, REGULAR, 6tap
make_8tap_fn prep,  smooth,  SMOOTH,  6tap
filter_fn prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
filter_bilin_fn prep, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10


// load_filter_row dst, src, inc
//
// Loads one 8-byte filter row into \dst from the table based at x11, using
// (\src >> 10) as the (sign-extended) row index, then advances \src by \inc.
// Clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm

// Horizontal pass for one row of the 8x8 warp.
//
// In:
//   x2  = source row pointer (post-incremented by x3)
//   x3  = source stride
//   w5  = horizontal filter position for this row (advanced by w8 on return)
//   w7  = filter position step between adjacent output pixels
//   w8  = filter position step between rows
//   x11 = filter table base (8 int8 coefficients per row)
//   v14 = per-lane shift for srshl, -(7 - intermediate_bits)
// Out:
//   v16.4s, v17.4s = the 8 filtered values of the row, rounding-shifted by v14
// Clobbers w12, w13, v0-v11 and v18-v23. v8-v11 are callee-saved registers in
// their low halves; the caller in this file saves d8-d15 before calling.
function warp_filter_horz_neon
        add             w12, w5,  #512

        ld1             {v16.8h, v17.8h}, [x2], x3

        // Output pixel i uses its own filter row (loaded into d<i>, widened to
        // 16 bit) applied to the 8 samples starting at sample i (ext #2*i).
        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        sxtl            v0.8h,   v0.8b
        load_filter_row d3, w12, w7
        sxtl            v1.8h,   v1.8b
        load_filter_row d4, w12, w7
        sxtl            v2.8h,   v2.8b
        load_filter_row d5, w12, w7
        sxtl            v3.8h,   v3.8b
        load_filter_row d6, w12, w7
        sxtl            v4.8h,   v4.8b
        load_filter_row d7, w12, w7
        sxtl            v5.8h,   v5.8b
        ext             v18.16b, v16.16b, v17.16b, #2*1
        smull           v8.4s,   v16.4h,  v0.4h
        smull2          v9.4s,   v16.8h,  v0.8h
        sxtl            v6.8h,   v6.8b
        ext             v19.16b, v16.16b, v17.16b, #2*2
        smull           v10.4s,  v18.4h,  v1.4h
        smull2          v11.4s,  v18.8h,  v1.8h
        sxtl            v7.8h,   v7.8b
        ext             v20.16b, v16.16b, v17.16b, #2*3
        smull           v0.4s,   v19.4h,  v2.4h
        smull2          v1.4s,   v19.8h,  v2.8h
        ext             v21.16b, v16.16b, v17.16b, #2*4
        addp            v8.4s,   v8.4s,   v9.4s
        smull           v2.4s,   v20.4h,  v3.4h
        smull2          v3.4s,   v20.8h,  v3.8h
        ext             v22.16b, v16.16b, v17.16b, #2*5
        addp            v9.4s,   v10.4s,  v11.4s
        smull           v10.4s,  v21.4h,  v4.4h
        smull2          v11.4s,  v21.8h,  v4.8h
        ext             v23.16b, v16.16b, v17.16b, #2*6
        addp            v0.4s,   v0.4s,   v1.4s
        smull           v18.4s,  v22.4h,  v5.4h
        smull2          v19.4s,  v22.8h,  v5.8h
        ext             v16.16b, v16.16b, v17.16b, #2*7
        addp            v1.4s,   v2.4s,   v3.4s
        addp            v2.4s,   v10.4s,  v11.4s
        smull           v20.4s,  v23.4h,  v6.4h
        smull2          v21.4s,  v23.8h,  v6.8h
        addp            v3.4s,   v18.4s,  v19.4s
        smull           v22.4s,  v16.4h,  v7.4h
        smull2          v23.4s,  v16.8h,  v7.8h
        addp            v4.4s,   v20.4s,  v21.4s
        addp            v5.4s,   v22.4s,  v23.4s

        // Two further rounds of pairwise adds reduce the 8 products of each
        // pixel to a single sum: v16 = pixels 0-3, v17 = pixels 4-7.
        addp            v8.4s,   v8.4s,   v9.4s
        addp            v0.4s,   v0.4s,   v1.4s
        addp            v2.4s,   v2.4s,   v3.4s
        addp            v4.4s,   v4.4s,   v5.4s

        addp            v16.4s,  v8.4s,   v0.4s
        addp            v17.4s,  v2.4s,   v4.4s

        add             w5,  w5,  w8

        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)

        ret
endfunc

// void dav2d_warp_affine_8x8_16bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my,
//         const int bitdepth_max)
//
// The macro is instantiated twice below: with an empty \t it emits
// warp_affine_8x8_16bpc_neon, which narrows with unsigned saturation, clamps
// to bitdepth_max and stores pixels; with \t = t it emits
// warp_affine_8x8t_16bpc_neon, which stores 16-bit intermediates with
// PREP_BIAS subtracted.
//
// Register use after the prologue:
//   x7, x8, x9, x4 = abcd[0], abcd[1], abcd[2], abcd[3] (sign-extended)
//   w5, w6         = running horizontal / vertical filter positions (mx, my)
//   w10            = remaining output rows (8)
//   x11            = X(mc_warp_filter) + 3*64*8
//   x15            = saved return address (x30 is overwritten by the bl calls)
//   v13, v14       = shift amounts, v15 = bitdepth_max or PREP_BIAS
//   v24-v31        = sliding window of 8 horizontally filtered rows
.macro warp t
function warp_affine_8x8\t\()_16bpc_neon, export=1
        // d8-d15 are used below and by warp_filter_horz_neon; save them.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

.ifb \t
        dup             v15.8h,  w7        // bitdepth_max
.else
        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        clz             w7,  w7
                                           // intermediate_bits = clz(bitdepth_max) - 18
.ifb \t
        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
.endif
        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
.ifb \t
        neg             w8,  w8            // -(7 + intermediate_bits)
.endif
        dup             v14.4s,  w7        // -(7 - intermediate_bits)
.ifb \t
        dup             v13.4s,  w8        // -(7 + intermediate_bits)
.endif

        // Unpack the four int16_t abcd[] entries from one 64-bit load.
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8
        // src -= 3 * src_stride + 3 pixels (6 bytes)
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #6
        movrel          x11, X(mc_warp_filter), 3*64*8
        mov             x15, x30
.ifnb \t
        // NOTE(review): presumably dst_stride is given in int16_t units for
        // the t variant and is converted to bytes here - confirm with caller.
        lsl             x1,  x1,  #1
.endif

        // Prime the window with the first 7 horizontally filtered rows.
        bl              warp_filter_horz_neon
        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
        bl              warp_filter_horz_neon
        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
        bl              warp_filter_horz_neon
        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
        bl              warp_filter_horz_neon
        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
        bl              warp_filter_horz_neon
        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
        bl              warp_filter_horz_neon
        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
        bl              warp_filter_horz_neon
        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto

1:
        // Per output row: filter one more input row into v31, load the 8
        // per-column vertical filters (position w14, step w9) and apply them.
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        // NOTE(review): transpose_8x8b_xtl is defined outside this chunk;
        // presumably it transposes the 8x8 int8 coefficients and widens them
        // with sxtl so that v<k> holds tap k for all 8 columns - confirm.
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // The row window is shifted down by one (v24 <- v25 ... v30 <- v31),
        // interleaved with the output rounding/narrowing.
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
.ifb \t
        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
.else
        rshrn           v16.4h,  v16.4s,  #7
        rshrn2          v16.8h,  v17.4s,  #7
.endif
        mov             v26.16b, v27.16b
.ifb \t
        sqxtun          v16.4h,  v16.4s
        sqxtun2         v16.8h,  v17.4s
.else
        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
.endif
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
.ifb \t
        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
        st1             {v16.8h}, [x0], x1

        add             w6,  w6,  w4
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40

        // Return through the copy of the link register taken in the prologue.
        ret             x15
endfunc
.endm

warp
warp t

// void dav2d_emu_edge_16bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
//
// Builds a bw x bh block in dst from ref, replicating edge pixels where the
// block at (x, y) extends outside the iw x ih image.
//
// Register use once the extents are computed:
//   x4 = left_ext, x11 = right_ext, x5 = top_ext, x10 = bottom_ext
//   x1 = center_h, x2 = center_w, x0 = bw
//   x6 = dst (at the first center row), x14 = backup of that pointer
//   x7 = dst_stride, x8 = ref, x9 = ref_stride
// All fill/copy loops store whole 16- or 32-pixel chunks and run while the
// remaining count is > 0, so a store may extend past the nominal width of the
// region being filled.
function emu_edge_16bpc_neon, export=1
        // ref and ref_stride are the 9th and 10th arguments, passed on the stack.
        ldp             x8,  x9,  [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13, lsl #1  // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6
        sub             x2,  x0,  x4           // bw - left_ext
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

// v_loop need_left, need_right
//
// Emits the loop over the center_h rows: optionally fills left_ext pixels
// with the first ref pixel of the row, copies center_w pixels from ref, and
// optionally fills right_ext pixels with the last copied ref pixel. Each
// expansion is specialised at assembly time by the two flags.
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.8h}, [x8]
        mov             x12, x6                // out = dst
        mov             x3,  x4
        mov             v1.16b,  v0.16b
1:
        subs            x3,  x3,  #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
        subs            x3,  x3,  #32
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2, lsl #1   // in + center_w
        sub             x3,  x3,  #2           // in + center_w - 1
        add             x12, x6,  x4, lsl #1   // dst + left_ext
        ld1r            {v0.8h}, [x3]
        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
        mov             x3,  x11
        mov             v1.16b,  v0.16b
1:
        subs            x3,  x3,  #16
        st1             {v0.8h, v1.8h}, [x12], #32
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7
        add             x8,  x8,  x9
        b.gt            0b
.endm

        // Dispatch to one of four specialised row loops.
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        cbz             x10, 3f
        // need_bottom
        // Replicate the last center row downwards, 32 pixels of width at a time.
        sub             x8,  x6,  x7           // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10, x6      // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #64          // dst += 32
        b.gt            1b

3:
        cbz             x5,  3f
        // need_top
        // Replicate the first center row (at the saved dst) upwards.
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #64          // dst += 32
        b.gt            1b

3:
        ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/mc16_sve.S000066400000000000000000001555251517466257200234270ustar00rootroot00000000000000/* * Copyright © 2024, Arm Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "src/arm/asm.S"
#include "util.S"

// Both values are written as an "imm8, lsl #8" operand pair so they can be
// used directly as a shifted immediate (e.g. `sub z0.h, z0.h, #PREP_BIAS`).
// 224 << 8 is 0xE000, i.e. -8192 when read as a 16-bit signed value.
#define PREP_BIAS       32, lsl #8      // 8192
#define PREP_BIAS_NEG   224, lsl #8     // -8192

#if HAVE_SVE2
ENABLE_SVE
ENABLE_SVE2

// No spaces in these expressions, due to gas-preprocessor. It is translated by
// -1 to save the negative offset when getting the address of `mc_subpel_filters`.
//
// Each constant packs two filter-set offsets of the form (set*15-1): one
// shifted left by 7 and one in the low 7 bits. The 8-tap code further down
// this file extracts them with `ubfx ..., #7, #7` and `and ..., #0x7F`, and
// selects the low field when the block dimension is <= 4.
// NOTE(review): 15 is presumably the number of filter rows per set in
// `mc_subpel_filters` - confirm against the table definition.
#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
#define SHARP1          (((2*15-1)<<7)|(3*15-1))

// Alignment arguments used for function entries (`align=` on `function`) and
// for `.align` before jump targets and loop heads in this file.
#define FUNC_ALIGN      2
#define JUMP_ALIGN      2
#define LOOP_ALIGN      2


// Shuffle indices to permute horizontal samples in preparation for input to
// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
// indices in the interval of [-3, 4] relative to the current sample position.
//
// Each group of 8 byte indices selects four consecutive 16-bit samples; the
// four groups start at byte offsets 0, 2, 4 and 6, i.e. at samples 0, 1, 2
// and 3 of the source vector.
const h_tbl_sve, align=4
        .byte  0,  1,  2,  3,  4,  5,  6,  7,   2,  3,  4,  5,  6,  7,  8,  9
        .byte  4,  5,  6,  7,  8,  9, 10, 11,   6,  7,  8,  9, 10, 11, 12, 13
endconst

// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
// registers contain a transposed 4x4 matrix of values. Subsequent iterations
// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
// the values of a new line.
const v_tbl_sve, align=4 .byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25 .byte 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19 .byte 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23 .byte 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27 .byte 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31 endconst .macro make_8tap_fn op, name, type, isa, jump=1 function \op\()_8tap_\name\()_16bpc_\isa, export=1, align=FUNC_ALIGN mov x9, \type mov x10, \type .if \jump b \op\()_8tap_\isa .endif endfunc .endm .macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd make_8tap_fn \type, sharp, SHARP1, \isa make_8tap_fn \type, smooth, SMOOTH1, \isa make_8tap_fn \type, regular, REGULAR1, \isa, jump=0 function \type\()_8tap_\isa, align=FUNC_ALIGN clz w8, \w mov w11, #0x4081 // (1<<14) | (1<<7) | 1 ptrue p0.b, vl16 sub w8, w8, #25 // for jump tables movrel x12, X(mc_subpel_filters) cbnz \mx, L(\type\()_8tap_h_hv_\isa) .ifc \type, prep cbz \my, prep_sve .else // put cbnz \my, L(\type\()_8tap_v_\isa) mov w9, w8 b X(put_16bpc_neon) .align JUMP_ALIGN .endif L(\type\()_8tap_v_\isa): madd \my, \my, w11, w9 movrel x13, v_tbl_sve .ifc \type, put ld1r {v5.8h}, [sp] // bdmax .else ldr \bdmax, [sp] .endif sub \src, \src, \s_strd // src - s_strd ubfx w11, \my, #7, #7 and \my, \my, #0x7F ldr q6, [x13] cmp \h, #4 csel \my, \my, w11, le sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd add \xmy, x12, \xmy, lsl #3 // subpel V filter address ldp q28, q29, [x13, #16] ld1sb {z7.h}, p0/z, [\xmy] .ifc \type, prep clz \bdmax, \bdmax sub \bdmax, \bdmax, #24 dup v5.4s, \bdmax .endif cmp \w, #8 b.lt 40f // .align JUMP_ALIGN // fallthrough 80: // V - 8xN+ ldp q30, q31, [x13, #48] .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd // d_strd *= 2 .endif .align LOOP_ALIGN 81: add \lsrc, \src, \s_strd, lsl #1 ldr q16, [\src] ldr q17, [\src, \s_strd] ldr q18, [\lsrc] ldr q19, [\lsrc, \s_strd] add \lsrc, \lsrc, 
\s_strd, lsl #1 mov \ldst, \dst ldr q20, [\lsrc] ldr q21, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 ldr q22, [\lsrc] ldr q23, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 sub w8, \h, #1 zip1 v0.8h, v16.8h, v17.8h zip2 v1.8h, v16.8h, v17.8h zip1 v2.8h, v18.8h, v19.8h zip2 v3.8h, v18.8h, v19.8h zip1 v18.8h, v20.8h, v21.8h zip2 v21.8h, v20.8h, v21.8h zip1 v24.8h, v22.8h, v23.8h zip2 v27.8h, v22.8h, v23.8h zip1 v16.4s, v0.4s, v2.4s zip2 v19.4s, v0.4s, v2.4s zip1 v22.4s, v1.4s, v3.4s zip2 v25.4s, v1.4s, v3.4s zip1 v17.4s, v18.4s, v24.4s zip2 v20.4s, v18.4s, v24.4s zip1 v23.4s, v21.4s, v27.4s zip2 v26.4s, v21.4s, v27.4s .align LOOP_ALIGN 8: ld1 {v18.16b}, [\lsrc], \s_strd movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 mov v21.16b, v18.16b mov v24.16b, v18.16b mov v27.16b, v18.16b sdot z0.d, z16.h, z7.h[0] tbl v16.16b, {v16.16b, v17.16b}, v6.16b sdot z1.d, z19.h, z7.h[0] tbl v19.16b, {v19.16b, v20.16b}, v6.16b sdot z2.d, z22.h, z7.h[0] tbl v22.16b, {v22.16b, v23.16b}, v6.16b subs w8, w8, #1 sdot z3.d, z25.h, z7.h[0] tbl v25.16b, {v25.16b, v26.16b}, v6.16b sdot z0.d, z17.h, z7.h[1] tbl v17.16b, {v17.16b, v18.16b}, v28.16b sdot z1.d, z20.h, z7.h[1] tbl v20.16b, {v20.16b, v21.16b}, v29.16b sdot z2.d, z23.h, z7.h[1] tbl v23.16b, {v23.16b, v24.16b}, v30.16b sdot z3.d, z26.h, z7.h[1] tbl v26.16b, {v26.16b, v27.16b}, v31.16b uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun2 v0.8h, v1.4s, #6 umin v0.8h, v0.8h, v5.8h .endif st1 {v0.16b}, [\ldst], \d_strd b.gt 8b movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 sdot z0.d, z16.h, z7.h[0] sdot z1.d, z19.h, z7.h[0] sdot z2.d, z22.h, z7.h[0] sdot z3.d, z25.h, z7.h[0] sdot z0.d, z17.h, z7.h[1] sdot z1.d, z20.h, z7.h[1] sdot z2.d, z23.h, z7.h[1] sdot z3.d, z26.h, z7.h[1] subs \w, \w, #8 uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, 
v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun2 v0.8h, v1.4s, #6 umin v0.8h, v0.8h, v5.8h .endif str q0, [\ldst] add \dst, \dst, #16 add \src, \src, #16 b.gt 81b ret .align JUMP_ALIGN 40: // V - 4xN, put only: 2xN .ifc \type, put lsr \d_strd, \d_strd, #1 // hword index for `st1h` .endif whilelt p1.h, wzr, \w // masking for writes cmp \h, #4 b.le 44f ldr d16, [\src] ldr d17, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d18, [\src] ldr d19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d20, [\src] ldr d21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d22, [\src] ldr d23, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 sub \h, \h, #2 zip1 v0.8h, v16.8h, v17.8h zip1 v2.8h, v18.8h, v19.8h zip1 v18.8h, v20.8h, v21.8h zip1 v24.8h, v22.8h, v23.8h zip1 v16.4s, v0.4s, v2.4s zip2 v19.4s, v0.4s, v2.4s zip1 v17.4s, v18.4s, v24.4s zip2 v20.4s, v18.4s, v24.4s .align LOOP_ALIGN 4: ldr d18, [\src] ldr d24, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 mov v21.16b, v18.16b mov v27.16b, v24.16b sdot z0.d, z16.h, z7.h[0] tbl v22.16b, {v16.16b, v17.16b}, v6.16b sdot z1.d, z19.h, z7.h[0] tbl v25.16b, {v19.16b, v20.16b}, v6.16b sdot z0.d, z17.h, z7.h[1] tbl v23.16b, {v17.16b, v18.16b}, v28.16b sdot z1.d, z20.h, z7.h[1] tbl v26.16b, {v20.16b, v21.16b}, v29.16b subs \h, \h, #2 sdot z2.d, z22.h, z7.h[0] tbl v16.16b, {v22.16b, v23.16b}, v6.16b sdot z3.d, z25.h, z7.h[0] tbl v19.16b, {v25.16b, v26.16b}, v6.16b sdot z2.d, z23.h, z7.h[1] tbl v17.16b, {v23.16b, v24.16b}, v28.16b sdot z3.d, z26.h, z7.h[1] tbl v20.16b, {v26.16b, v27.16b}, v29.16b uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS mov v1.d[0], v0.d[1] .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun v1.4h, 
v1.4s, #6 umin v0.4h, v0.4h, v5.4h umin v1.4h, v1.4h, v5.4h .endif st1h {z0.h}, p1, [\dst] st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] add \dst, \dst, \d_strd, lsl #2 b.gt 4b ldr d18, [\src] movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 mov v21.16b, v18.16b sdot z0.d, z16.h, z7.h[0] tbl v22.16b, {v16.16b, v17.16b}, v6.16b sdot z1.d, z19.h, z7.h[0] tbl v25.16b, {v19.16b, v20.16b}, v6.16b sdot z0.d, z17.h, z7.h[1] tbl v23.16b, {v17.16b, v18.16b}, v28.16b sdot z1.d, z20.h, z7.h[1] tbl v26.16b, {v20.16b, v21.16b}, v29.16b sdot z2.d, z22.h, z7.h[0] sdot z3.d, z25.h, z7.h[0] sdot z2.d, z23.h, z7.h[1] sdot z3.d, z26.h, z7.h[1] uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS mov v1.d[0], v0.d[1] .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun v1.4h, v1.4s, #6 umin v0.4h, v0.4h, v5.4h umin v1.4h, v1.4h, v5.4h .endif st1h {z0.h}, p1, [\dst] st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] ret .align JUMP_ALIGN 44: // V - 4x4, put only: 4x2, 2x4, 2x2 add \src, \src, \s_strd, lsl #1 // src - s_strd subs \h, \h, #2 ldr d16, [\src] ldr d17, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d18, [\src] ldr d19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ext v7.16b, v7.16b, v7.16b, #4 // [\xmy + 2 * 2] zip1 v0.8h, v16.8h, v17.8h zip1 v2.8h, v18.8h, v19.8h zip1 v16.4s, v0.4s, v2.4s zip2 v19.4s, v0.4s, v2.4s .ifc \type, put b.eq 42f .endif ldr d17, [\src] ldr d23, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 mov v20.16b, v17.16b mov v26.16b, v23.16b sdot z0.d, z16.h, z7.h[0] tbl v22.16b, {v16.16b, v17.16b}, v28.16b sdot z1.d, z19.h, z7.h[0] tbl v25.16b, {v19.16b, v20.16b}, v29.16b sdot z2.d, z22.h, z7.h[0] tbl v16.16b, {v22.16b, v23.16b}, v28.16b sdot z3.d, z25.h, z7.h[0] tbl v19.16b, {v25.16b, v26.16b}, v29.16b uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, 
v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS mov v1.d[0], v0.d[1] .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun v1.4h, v1.4s, #6 umin v0.4h, v0.4h, v5.4h umin v1.4h, v1.4h, v5.4h .endif st1h {z0.h}, p1, [\dst] st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] add \dst, \dst, \d_strd, lsl #2 .ifc \type, put .align JUMP_ALIGN 42: .endif ldr d17, [\src] movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 mov v20.16b, v17.16b sdot z0.d, z16.h, z7.h[0] tbl v22.16b, {v16.16b, v17.16b}, v28.16b sdot z1.d, z19.h, z7.h[0] tbl v25.16b, {v19.16b, v20.16b}, v29.16b sdot z2.d, z22.h, z7.h[0] sdot z3.d, z25.h, z7.h[0] uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS mov v1.d[0], v0.d[1] .else // put sqrshrun v0.4h, v0.4s, #6 sqrshrun v1.4h, v1.4s, #6 umin v0.4h, v0.4h, v5.4h umin v1.4h, v1.4h, v5.4h .endif st1h {z0.h}, p1, [\dst] st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] ret .align JUMP_ALIGN L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 movrel x13, h_tbl_sve sub \src, \src, #6 // src - 3 * 2 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7F cmp \w, #4 csel \mx, \mx, w10, le ldp q30, q31, [x13] add \xmx, x12, \xmx, lsl #3 // subpel H filter address cbz \my, L(\type\()_8tap_h_\isa) // HV cases madd w14, \my, w11, w9 .ifc \bdmax, w8 ldr \bdmax, [sp] .endif ubfx w11, w14, #7, #7 and w14, w14, #0x7F ld1sb {z4.h}, p0/z, [\xmx] cmp \h, #4 csel w14, w14, w11, le .ifc \type, put dup v29.8h, \bdmax .endif clz \bdmax, \bdmax add \xmy, x12, x14, lsl #3 // subpel V filter address ld1sb {z7.h}, p0/z, [\xmy] .ifc \type, put mov w10, #12 sub w10, w10, \bdmax dup v6.4s, w10 .endif sub \bdmax, \bdmax, #24 mov x15, x30 sub \src, \src, \s_strd // src - s_strd - 3 * 2 dup v5.4s, \bdmax cmp w9, SHARP1 b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 8-tap cases cmp \w, #4 b.le 40f // .align JUMP_ALIGN // fallthrough 80: // HV8 - 
8xN+ .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd // d_strd *= 2 .endif cmp \h, #4 b.le 84f sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2 .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h bl L(\type\()_hv_filter8_\isa) uzp1 v16.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v17.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v18.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v19.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v20.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v21.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v22.8h, v23.8h, v24.8h .align LOOP_ALIGN 8: ldp q24, q28, [\lsrc] smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b movi v2.2d, #0 movi v3.2d, #0 tbl v23.16b, {v24.16b}, v30.16b tbl v24.16b, {v24.16b}, v31.16b ldur q26, [\lsrc, #8] smlal v0.4s, v17.4h, v7.h[1] smlal2 v1.4s, v17.8h, v7.h[1] mov v17.16b, v18.16b add \lsrc, \lsrc, \s_strd sdot z2.d, z23.h, z4.h[0] sdot z3.d, z24.h, z4.h[0] movi v23.2d, #0 movi v24.2d, #0 tbl v25.16b, {v26.16b}, v30.16b tbl v26.16b, {v26.16b}, v31.16b smlal v0.4s, v18.4h, v7.h[2] smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b sdot z23.d, z25.h, z4.h[0] sdot z24.d, z26.h, z4.h[0] tbl v27.16b, {v28.16b}, v30.16b tbl v28.16b, {v28.16b}, v31.16b smlal v0.4s, v19.4h, v7.h[3] smlal2 v1.4s, v19.8h, v7.h[3] mov v19.16b, v20.16b subs w8, w8, #1 sdot z2.d, z25.h, z4.h[1] sdot z3.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] sdot z24.d, z28.h, z4.h[1] smlal v0.4s, v20.4h, v7.h[4] smlal2 v1.4s, v20.8h, v7.h[4] mov v20.16b, v21.16b uzp1 v3.4s, v2.4s, v3.4s uzp1 v24.4s, v23.4s, v24.4s smlal v0.4s, v21.4h, v7.h[5] smlal2 v1.4s, v21.8h, v7.h[5] mov v21.16b, v22.16b srshl v23.4s, v3.4s, v5.4s srshl v24.4s, v24.4s, v5.4s smlal v0.4s, v22.4h, v7.h[6] smlal2 v1.4s, v22.8h, v7.h[6] uzp1 v22.8h, v23.8h, v24.8h smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] .ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, 
v1.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v0.4s, v6.4s srshl v1.4s, v1.4s, v6.4s sqxtun v0.4h, v0.4s sqxtun2 v0.8h, v1.4s umin v0.8h, v0.8h, v29.8h .endif st1 {v0.8h}, [\ldst], \d_strd b.gt 8b subs \w, \w, #8 add \src, \src, #16 add \dst, \dst, #16 b.gt 81b ret x15 .align JUMP_ALIGN 40: // HV8 - 4xN, put only: 2xN .ifc \type, put lsr \d_strd, \d_strd, #1 // hword index for `st1h` .endif whilelt p1.h, wzr, \w // masking for writes ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] add \src, \src, #4 cmp \h, #4 b.le 44f sub \src, \src, \s_strd, lsl #1 // src - 3 * s_strd - 3 * 2 bl L(\type\()_hv_filter4_\isa) xtn v16.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v17.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v18.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v19.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v20.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v21.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v22.4h, v0.4s .align LOOP_ALIGN 4: ld1 {v3.16b}, [\src], \s_strd smull v24.4s, v16.4h, v7.h[0] smlal v24.4s, v17.4h, v7.h[1] tbl v2.16b, {v3.16b}, v30.16b tbl v3.16b, {v3.16b}, v31.16b movi v0.2d, #0 movi v1.2d, #0 mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v24.4s, v18.4h, v7.h[2] smlal v24.4s, v19.4h, v7.h[3] sdot z0.d, z2.h, z4.h[0] sdot z1.d, z3.h, z4.h[0] mov v18.16b, v19.16b mov v19.16b, v20.16b uzp1 v0.4s, v0.4s, v1.4s smlal v24.4s, v20.4h, v7.h[4] smlal v24.4s, v21.4h, v7.h[5] srshl v0.4s, v0.4s, v5.4s mov v20.16b, v21.16b mov v21.16b, v22.16b subs \h, \h, #1 smlal v24.4s, v22.4h, v7.h[6] xtn v22.4h, v0.4s smlal v24.4s, v22.4h, v7.h[7] .ifc \type, prep rshrn v0.4h, v24.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v24.4s, v6.4s sqxtun v0.4h, v0.4s umin v0.4h, v0.4h, v29.4h .endif st1h {z0.h}, p1, [\dst] add \dst, \dst, \d_strd, lsl #1 b.gt 4b ret x15 .align JUMP_ALIGN L(\type\()_6tap_hv_\isa): cmp \w, #4 b.le 46f // .align JUMP_ALIGN // fallthrough 80: // HV6 - 8xN+ .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd // d_strd 
*= 2 .endif cmp \h, #4 b.le 84f sub \src, \src, \s_strd // src - 2 * s_strd - 3 * 2 .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h bl L(\type\()_hv_filter8_\isa) uzp1 v16.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v17.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v18.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v19.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v20.8h, v23.8h, v24.8h .align LOOP_ALIGN 8: ldp q24, q28, [\lsrc] smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] mov v16.16b, v17.16b tbl v23.16b, {v24.16b}, v30.16b tbl v24.16b, {v24.16b}, v31.16b movi v2.2d, #0 movi v3.2d, #0 ldur q26, [\lsrc, #8] add \lsrc, \lsrc, \s_strd sdot z2.d, z23.h, z4.h[0] sdot z3.d, z24.h, z4.h[0] tbl v25.16b, {v26.16b}, v30.16b tbl v26.16b, {v26.16b}, v31.16b movi v23.2d, #0 movi v24.2d, #0 sdot z23.d, z25.h, z4.h[0] sdot z24.d, z26.h, z4.h[0] tbl v27.16b, {v28.16b}, v30.16b tbl v28.16b, {v28.16b}, v31.16b smlal v0.4s, v17.4h, v7.h[2] smlal2 v1.4s, v17.8h, v7.h[2] mov v17.16b, v18.16b sdot z2.d, z25.h, z4.h[1] sdot z3.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] sdot z24.d, z28.h, z4.h[1] smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, v19.16b uzp1 v3.4s, v2.4s, v3.4s uzp1 v24.4s, v23.4s, v24.4s smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] mov v19.16b, v20.16b srshl v23.4s, v3.4s, v5.4s srshl v24.4s, v24.4s, v5.4s smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] subs w8, w8, #1 uzp1 v20.8h, v23.8h, v24.8h smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] .ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v0.4s, v6.4s srshl v1.4s, v1.4s, v6.4s sqxtun v0.4h, v0.4s sqxtun2 v0.8h, v1.4s umin v0.8h, v0.8h, v29.8h .endif st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 subs \w, \w, #8 add \src, \src, #16 b.gt 81b ret x15 .align LOOP_ALIGN 84: // HV4 - 8x4, 8x2 mov \lsrc, \src mov \ldst, 
\dst mov w8, \h bl L(\type\()_hv_filter8_\isa) uzp1 v17.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v18.8h, v23.8h, v24.8h bl L(\type\()_hv_filter8_\isa) uzp1 v19.8h, v23.8h, v24.8h .align LOOP_ALIGN 81: ldp q24, q28, [\lsrc] ldur q26, [\lsrc, #8] add \lsrc, \lsrc, \s_strd tbl v23.16b, {v24.16b}, v30.16b tbl v24.16b, {v24.16b}, v31.16b movi v2.2d, #0 movi v3.2d, #0 sdot z2.d, z23.h, z4.h[0] sdot z3.d, z24.h, z4.h[0] tbl v25.16b, {v26.16b}, v30.16b tbl v26.16b, {v26.16b}, v31.16b movi v23.2d, #0 movi v24.2d, #0 sdot z23.d, z25.h, z4.h[0] sdot z24.d, z26.h, z4.h[0] tbl v27.16b, {v28.16b}, v30.16b tbl v28.16b, {v28.16b}, v31.16b sdot z2.d, z25.h, z4.h[1] sdot z3.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] sdot z24.d, z28.h, z4.h[1] smull v0.4s, v17.4h, v7.h[2] smull2 v1.4s, v17.8h, v7.h[2] mov v17.16b, v18.16b subs w8, w8, #1 uzp1 v3.4s, v2.4s, v3.4s uzp1 v24.4s, v23.4s, v24.4s smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, v19.16b srshl v23.4s, v3.4s, v5.4s srshl v24.4s, v24.4s, v5.4s smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] uzp1 v19.8h, v23.8h, v24.8h smlal v0.4s, v19.4h, v7.h[5] smlal2 v1.4s, v19.8h, v7.h[5] .ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v0.4s, v6.4s srshl v1.4s, v1.4s, v6.4s sqxtun v0.4h, v0.4s sqxtun2 v0.8h, v1.4s umin v0.8h, v0.8h, v29.8h .endif st1 {v0.8h}, [\ldst], \d_strd b.gt 81b subs \w, \w, #8 add \dst, \dst, #16 add \src, \src, #16 b.gt 84b ret x15 .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ldp q24, q28, [\lsrc] ldur q26, [\lsrc, #8] add \lsrc, \lsrc, \s_strd tbl v23.16b, {v24.16b}, v30.16b tbl v24.16b, {v24.16b}, v31.16b movi v2.2d, #0 movi v3.2d, #0 sdot z2.d, z23.h, z4.h[0] sdot z3.d, z24.h, z4.h[0] tbl v25.16b, {v26.16b}, v30.16b tbl v26.16b, {v26.16b}, v31.16b movi v23.2d, #0 movi v24.2d, #0 sdot z23.d, z25.h, z4.h[0] sdot z24.d, z26.h, z4.h[0] tbl v27.16b, {v28.16b}, v30.16b tbl v28.16b, {v28.16b}, v31.16b 
sdot z2.d, z25.h, z4.h[1] sdot z3.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] sdot z24.d, z28.h, z4.h[1] uzp1 v3.4s, v2.4s, v3.4s uzp1 v24.4s, v23.4s, v24.4s srshl v23.4s, v3.4s, v5.4s srshl v24.4s, v24.4s, v5.4s ret .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): ld1 {v3.16b}, [\src], \s_strd tbl v2.16b, {v3.16b}, v30.16b tbl v3.16b, {v3.16b}, v31.16b movi v0.2d, #0 movi v1.2d, #0 sdot z0.d, z2.h, z4.h[0] sdot z1.d, z3.h, z4.h[0] uzp1 v0.4s, v0.4s, v1.4s srshl v0.4s, v0.4s, v5.4s ret .align JUMP_ALIGN 46: // H4V6 - 4xN, put only: 2xN .ifc \type, put lsr \d_strd, \d_strd, #1 // hword index for `st1h` .endif whilelt p1.h, wzr, \w // masking for writes ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] add \src, \src, #4 cmp \h, #4 b.le 44f sub \src, \src, \s_strd // src - 2 * s_strd - 3 * 2 bl L(\type\()_hv_filter4_\isa) xtn v16.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v17.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v18.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v19.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v20.4h, v0.4s .align LOOP_ALIGN 4: ld1 {v3.16b}, [\src], \s_strd smull v24.4s, v16.4h, v7.h[1] smlal v24.4s, v17.4h, v7.h[2] tbl v2.16b, {v3.16b}, v30.16b tbl v3.16b, {v3.16b}, v31.16b movi v0.2d, #0 movi v1.2d, #0 sdot z0.d, z2.h, z4.h[0] sdot z1.d, z3.h, z4.h[0] mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v24.4s, v18.4h, v7.h[3] smlal v24.4s, v19.4h, v7.h[4] uzp1 v0.4s, v0.4s, v1.4s mov v18.16b, v19.16b mov v19.16b, v20.16b subs \h, \h, #1 srshl v0.4s, v0.4s, v5.4s smlal v24.4s, v20.4h, v7.h[5] xtn v20.4h, v0.4s smlal v24.4s, v20.4h, v7.h[6] .ifc \type, prep rshrn v0.4h, v24.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v24.4s, v6.4s sqxtun v0.4h, v0.4s umin v0.4h, v0.4h, v29.4h .endif st1h {z0.h}, p1, [\dst] add \dst, \dst, \d_strd, lsl #1 b.gt 4b ret x15 .align JUMP_ALIGN 44: // H4V4 - 4x4, put only: 4x2, 2x4, 2x2 bl L(\type\()_hv_filter4_\isa) xtn v17.4h, v0.4s bl L(\type\()_hv_filter4_\isa) xtn v18.4h, v0.4s bl 
L(\type\()_hv_filter4_\isa) xtn v19.4h, v0.4s .align LOOP_ALIGN 4: ld1 {v3.16b}, [\src], \s_strd smull v24.4s, v17.4h, v7.h[2] smlal v24.4s, v18.4h, v7.h[3] tbl v2.16b, {v3.16b}, v30.16b tbl v3.16b, {v3.16b}, v31.16b movi v0.2d, #0 movi v1.2d, #0 sdot z0.d, z2.h, z4.h[0] sdot z1.d, z3.h, z4.h[0] uzp1 v0.4s, v0.4s, v1.4s mov v17.16b, v18.16b mov v18.16b, v19.16b subs \h, \h, #1 srshl v0.4s, v0.4s, v5.4s smlal v24.4s, v19.4h, v7.h[4] xtn v19.4h, v0.4s smlal v24.4s, v19.4h, v7.h[5] .ifc \type, prep rshrn v0.4h, v24.4s, #6 sub z0.h, z0.h, #PREP_BIAS .else // put srshl v0.4s, v24.4s, v6.4s sqxtun v0.4h, v0.4s umin v0.4h, v0.4h, v29.4h .endif st1h {z0.h}, p1, [\dst] add \dst, \dst, \d_strd, lsl #1 b.gt 4b ret x15 .align JUMP_ALIGN L(\type\()_8tap_h_\isa): movrel x11, \type\()_8tap_h_\isa\()_tbl ldrsw x12, [x11, x8, lsl #2] .ifc \bdmax, w8 ldr \bdmax, [sp] .endif .ifc \type, prep clz \bdmax, \bdmax sub \bdmax, \bdmax, #24 dup v5.4s, \bdmax .else // put mov w9, #34 // rounding for 10-bit case mov w10, #40 // rounding for 12-bit case cmp \bdmax, #0xFFF csel w9, w9, w10, ne // select rounding based on \bdmax dup v5.8h, \bdmax dup v6.2d, x9 .endif add x11, x11, x12 ld1sb {z4.h}, p0/z, [\xmx] br x11 .align JUMP_ALIGN 20: // H - 4xN, put only: 2xN 40: AARCH64_VALID_JUMP_TARGET add \src, \src, #4 // src - 1 * 2 ext v4.16b, v4.16b, v4.16b, #4 // [\xmy + 2 * 2] .ifc \type, put lsr \d_strd, \d_strd, #1 // hword index for `st1h` .endif whilelt p1.h, wzr, \w // masking for writes .align LOOP_ALIGN 4: ldr q17, [\src] ldr q19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \type, prep movi v0.2d, #0 movi v1.2d, #0 movi v2.2d, #0 movi v3.2d, #0 .else mov v0.16b, v6.16b mov v1.16b, v6.16b mov v2.16b, v6.16b mov v3.16b, v6.16b .endif tbl v16.16b, {v17.16b}, v30.16b tbl v17.16b, {v17.16b}, v31.16b sdot z0.d, z16.h, z4.h[0] sdot z1.d, z17.h, z4.h[0] subs \h, \h, #2 tbl v18.16b, {v19.16b}, v30.16b tbl v19.16b, {v19.16b}, v31.16b sdot z2.d, z18.h, z4.h[0] sdot z3.d, z19.h, z4.h[0] uzp1 
v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v1.4s, v1.4s, v5.4s uzp1 v0.8h, v0.8h, v1.8h sub z0.h, z0.h, #PREP_BIAS mov v1.d[0], v0.d[1] .else // put sqshrun v0.4h, v0.4s, #6 sqshrun v1.4h, v1.4s, #6 umin v0.4h, v0.4h, v5.4h umin v1.4h, v1.4h, v5.4h .endif st1h {z0.h}, p1, [\dst] st1h {z1.h}, p1, [\dst, \d_strd, lsl #1] add \dst, \dst, \d_strd, lsl #2 b.gt 4b ret .align JUMP_ALIGN 80: // H - 8xN AARCH64_VALID_JUMP_TARGET .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd .endif .align LOOP_ALIGN 8: ldp q17, q21, [\src] ldur q19, [\src, #8] .ifc \type, prep movi v0.2d, #0 movi v2.2d, #0 .else mov v0.16b, v6.16b mov v2.16b, v6.16b .endif tbl v16.16b, {v17.16b}, v30.16b tbl v17.16b, {v17.16b}, v31.16b add \src, \src, \s_strd sdot z0.d, z16.h, z4.h[0] sdot z2.d, z17.h, z4.h[0] tbl v18.16b, {v19.16b}, v30.16b tbl v19.16b, {v19.16b}, v31.16b .ifc \type, prep movi v16.2d, #0 movi v17.2d, #0 .else mov v16.16b, v6.16b mov v17.16b, v6.16b .endif ldp q23, q27, [\src] ldur q25, [\src, #8] sdot z16.d, z18.h, z4.h[0] sdot z17.d, z19.h, z4.h[0] tbl v22.16b, {v23.16b}, v30.16b tbl v23.16b, {v23.16b}, v31.16b .ifc \type, prep movi v1.2d, #0 movi v3.2d, #0 .else mov v1.16b, v6.16b mov v3.16b, v6.16b .endif add \src, \src, \s_strd sdot z1.d, z22.h, z4.h[0] sdot z3.d, z23.h, z4.h[0] tbl v24.16b, {v25.16b}, v30.16b tbl v25.16b, {v25.16b}, v31.16b .ifc \type, prep movi v22.2d, #0 movi v23.2d, #0 .else mov v22.16b, v6.16b mov v23.16b, v6.16b .endif sdot z22.d, z24.h, z4.h[0] sdot z23.d, z25.h, z4.h[0] tbl v20.16b, {v21.16b}, v30.16b tbl v21.16b, {v21.16b}, v31.16b sdot z0.d, z18.h, z4.h[1] sdot z2.d, z19.h, z4.h[1] tbl v26.16b, {v27.16b}, v30.16b tbl v27.16b, {v27.16b}, v31.16b sdot z16.d, z20.h, z4.h[1] sdot z17.d, z21.h, z4.h[1] sdot z1.d, z24.h, z4.h[1] sdot z3.d, z25.h, z4.h[1] sdot z22.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] subs \h, \h, #2 uzp1 v0.4s, v0.4s, v2.4s uzp1 v2.4s, v16.4s, v17.4s uzp1 v1.4s, v1.4s, v3.4s uzp1 v3.4s, 
v22.4s, v23.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v2.4s, v2.4s, v5.4s srshl v1.4s, v1.4s, v5.4s srshl v3.4s, v3.4s, v5.4s uzp1 v0.8h, v0.8h, v2.8h uzp1 v1.8h, v1.8h, v3.8h sub z0.h, z0.h, #PREP_BIAS sub z1.h, z1.h, #PREP_BIAS .else // put sqshrun v0.4h, v0.4s, #6 sqshrun2 v0.8h, v2.4s, #6 sqshrun v1.4h, v1.4s, #6 sqshrun2 v1.8h, v3.4s, #6 umin v0.8h, v0.8h, v5.8h umin v1.8h, v1.8h, v5.8h .endif st1 {v0.16b}, [\dst], \d_strd st1 {v1.16b}, [\dst], \d_strd b.gt 8b ret .align JUMP_ALIGN 160: // H - 16xN AARCH64_VALID_JUMP_TARGET .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd .endif .align LOOP_ALIGN 16: ldp q17, q21, [\src] ldur q19, [\src, #8] .ifc \type, prep movi v0.2d, #0 movi v2.2d, #0 .else mov v0.16b, v6.16b mov v2.16b, v6.16b .endif tbl v16.16b, {v17.16b}, v30.16b tbl v17.16b, {v17.16b}, v31.16b sdot z0.d, z16.h, z4.h[0] sdot z2.d, z17.h, z4.h[0] tbl v18.16b, {v19.16b}, v30.16b tbl v19.16b, {v19.16b}, v31.16b .ifc \type, prep movi v16.2d, #0 movi v17.2d, #0 .else mov v16.16b, v6.16b mov v17.16b, v6.16b .endif ldur q25, [\src, #24] ldr q27, [\src, #32] sdot z16.d, z18.h, z4.h[0] sdot z17.d, z19.h, z4.h[0] tbl v22.16b, {v21.16b}, v30.16b tbl v23.16b, {v21.16b}, v31.16b .ifc \type, prep movi v1.2d, #0 movi v3.2d, #0 .else mov v1.16b, v6.16b mov v3.16b, v6.16b .endif add \src, \src, \s_strd sdot z1.d, z22.h, z4.h[0] sdot z3.d, z23.h, z4.h[0] tbl v24.16b, {v25.16b}, v30.16b tbl v25.16b, {v25.16b}, v31.16b .ifc \type, prep movi v22.2d, #0 movi v23.2d, #0 .else mov v22.16b, v6.16b mov v23.16b, v6.16b .endif sdot z22.d, z24.h, z4.h[0] sdot z23.d, z25.h, z4.h[0] tbl v20.16b, {v21.16b}, v30.16b tbl v21.16b, {v21.16b}, v31.16b sdot z0.d, z18.h, z4.h[1] sdot z2.d, z19.h, z4.h[1] tbl v26.16b, {v27.16b}, v30.16b tbl v27.16b, {v27.16b}, v31.16b sdot z16.d, z20.h, z4.h[1] sdot z17.d, z21.h, z4.h[1] sdot z1.d, z24.h, z4.h[1] sdot z3.d, z25.h, z4.h[1] sdot z22.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] subs \h, \h, #1 uzp1 v0.4s, v0.4s, v2.4s uzp1 v2.4s, 
v16.4s, v17.4s uzp1 v1.4s, v1.4s, v3.4s uzp1 v3.4s, v22.4s, v23.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v2.4s, v2.4s, v5.4s srshl v1.4s, v1.4s, v5.4s srshl v3.4s, v3.4s, v5.4s uzp1 v0.8h, v0.8h, v2.8h uzp1 v1.8h, v1.8h, v3.8h sub z0.h, z0.h, #PREP_BIAS sub z1.h, z1.h, #PREP_BIAS .else // put sqshrun v0.4h, v0.4s, #6 sqshrun2 v0.8h, v2.4s, #6 sqshrun v1.4h, v1.4s, #6 sqshrun2 v1.8h, v3.4s, #6 umin v0.8h, v0.8h, v5.8h umin v1.8h, v1.8h, v5.8h .endif st1 {v0.16b, v1.16b}, [\dst], \d_strd b.gt 16b ret .align JUMP_ALIGN 320: // H - 32xN+ 640: AARCH64_VALID_JUMP_TARGET .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd .endif sub \d_strd, \d_strd, \w, uxtw #1 sub \s_strd, \s_strd, \w, uxtw #1 mov w8, \w .align LOOP_ALIGN 32: ldp q17, q21, [\src] ldur q19, [\src, #8] .ifc \type, prep movi v0.2d, #0 movi v2.2d, #0 .else mov v0.16b, v6.16b mov v2.16b, v6.16b .endif tbl v16.16b, {v17.16b}, v30.16b tbl v17.16b, {v17.16b}, v31.16b sdot z0.d, z16.h, z4.h[0] sdot z2.d, z17.h, z4.h[0] tbl v18.16b, {v19.16b}, v30.16b tbl v19.16b, {v19.16b}, v31.16b .ifc \type, prep movi v16.2d, #0 movi v17.2d, #0 .else mov v16.16b, v6.16b mov v17.16b, v6.16b .endif ldur q25, [\src, #24] sdot z16.d, z18.h, z4.h[0] sdot z17.d, z19.h, z4.h[0] ldr q27, [\src, #32]! 
tbl v22.16b, {v21.16b}, v30.16b tbl v23.16b, {v21.16b}, v31.16b .ifc \type, prep movi v1.2d, #0 movi v3.2d, #0 .else mov v1.16b, v6.16b mov v3.16b, v6.16b .endif sdot z1.d, z22.h, z4.h[0] sdot z3.d, z23.h, z4.h[0] tbl v24.16b, {v25.16b}, v30.16b tbl v25.16b, {v25.16b}, v31.16b .ifc \type, prep movi v22.2d, #0 movi v23.2d, #0 .else mov v22.16b, v6.16b mov v23.16b, v6.16b .endif sdot z22.d, z24.h, z4.h[0] sdot z23.d, z25.h, z4.h[0] tbl v20.16b, {v21.16b}, v30.16b tbl v21.16b, {v21.16b}, v31.16b sdot z0.d, z18.h, z4.h[1] sdot z2.d, z19.h, z4.h[1] tbl v26.16b, {v27.16b}, v30.16b tbl v27.16b, {v27.16b}, v31.16b sdot z16.d, z20.h, z4.h[1] sdot z17.d, z21.h, z4.h[1] sdot z1.d, z24.h, z4.h[1] sdot z3.d, z25.h, z4.h[1] sdot z22.d, z26.h, z4.h[1] sdot z23.d, z27.h, z4.h[1] subs w8, w8, #16 uzp1 v0.4s, v0.4s, v2.4s uzp1 v2.4s, v16.4s, v17.4s uzp1 v1.4s, v1.4s, v3.4s uzp1 v3.4s, v22.4s, v23.4s .ifc \type, prep srshl v0.4s, v0.4s, v5.4s srshl v2.4s, v2.4s, v5.4s srshl v1.4s, v1.4s, v5.4s srshl v3.4s, v3.4s, v5.4s uzp1 v0.8h, v0.8h, v2.8h uzp1 v1.8h, v1.8h, v3.8h sub z0.h, z0.h, #PREP_BIAS sub z1.h, z1.h, #PREP_BIAS .else // put sqshrun v0.4h, v0.4s, #6 sqshrun2 v0.8h, v2.4s, #6 sqshrun v1.4h, v1.4s, #6 sqshrun2 v1.8h, v3.4s, #6 umin v0.8h, v0.8h, v5.8h umin v1.8h, v1.8h, v5.8h .endif stp q0, q1, [\dst], #32 b.gt 32b add \src, \src, \s_strd add \dst, \dst, \d_strd subs \h, \h, #1 mov w8, \w b.gt 32b ret endfunc jumptable \type\()_8tap_h_\isa\()_tbl .word 640b - \type\()_8tap_h_\isa\()_tbl .word 320b - \type\()_8tap_h_\isa\()_tbl .word 160b - \type\()_8tap_h_\isa\()_tbl .word 80b - \type\()_8tap_h_\isa\()_tbl .word 40b - \type\()_8tap_h_\isa\()_tbl .ifc \type, put .word 20b - \type\()_8tap_h_\isa\()_tbl .endif endjumptable .endm /* prep_sve: unfiltered prep path. Reads rows of 16-bit samples from [x2] (row step x3) and writes them to [x0] (row step x1, doubled below), replacing every sample s with s * (19 - (bdmax >> 8)) + PREP_BIAS_NEG. w5 is the remaining-row counter. NOTE(review): x8 is consumed as the index into prep_tbl and p0 as the governing predicate of every MAD; neither is set in this function, so both are presumably prepared by the code that branches here - confirm against the caller. */ function prep_sve movrel x9, prep_tbl ldr w10, [sp] // bdmax mov w6, #19 /* x8 = signed 32-bit entry of prep_tbl selected by the incoming index in x8 */ ldrsw x8, [x9, x8, lsl #2] sub w6, w6, w10, lsr #8 // 19 - bdmax / 256 /* x9 = address of the width-specific loop (table base + offset) */ add x9, x9, x8 /* v30 = additive term, v29 = multiplier used by the MADs below */ movi v30.8h, #PREP_BIAS_NEG dup v29.8h, w6 // 10b: 1 << 4, 12b: 1 << 2 /* double the dst row step held in x1 */ lsl x1, x1, #1 /* dispatch to the selected loop; each loop ends with its own ret */ br x9
.align JUMP_ALIGN 40: // prep - 4xN AARCH64_VALID_JUMP_TARGET .align LOOP_ALIGN 4: /* two rows per iteration, 8 bytes (one d register) per row */ ldr d0, [x2] ldr d1, [x2, x3] add x2, x2, x3, lsl #1 subs w5, w5, #2 /* MAD: z0 = z0 * z29 + z30, per 16-bit lane under p0 */ mad z0.h, p0/m, z29.h, z30.h mad z1.h, p0/m, z29.h, z30.h str d0, [x0] str d1, [x0, x1] add x0, x0, x1, lsl #1 b.gt 4b ret .align JUMP_ALIGN 80: // prep - 8xN AARCH64_VALID_JUMP_TARGET .align LOOP_ALIGN 8: /* two rows per iteration, 16 bytes per row; post-indexed by the row steps */ ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x2], x3 subs w5, w5, #2 mad z0.h, p0/m, z29.h, z30.h mad z1.h, p0/m, z29.h, z30.h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x0], x1 b.gt 8b ret .align JUMP_ALIGN 160: // prep - 16xN AARCH64_VALID_JUMP_TARGET .align LOOP_ALIGN 16: /* two rows per iteration, 32 bytes per row */ ld1 {v0.8h, v1.8h}, [x2], x3 mad z0.h, p0/m, z29.h, z30.h mad z1.h, p0/m, z29.h, z30.h subs w5, w5, #2 ld1 {v2.8h, v3.8h}, [x2], x3 mad z2.h, p0/m, z29.h, z30.h mad z3.h, p0/m, z29.h, z30.h stp q0, q1, [x0] add x0, x0, x1 stp q2, q3, [x0] add x0, x0, x1 b.gt 16b ret .align JUMP_ALIGN 320: // prep - 32xN AARCH64_VALID_JUMP_TARGET .align LOOP_ALIGN 32: /* one row (64 bytes) per iteration */ ldp q0, q1, [x2] mad z0.h, p0/m, z29.h, z30.h mad z1.h, p0/m, z29.h, z30.h ldp q2, q3, [x2, #32] subs w5, w5, #1 mad z2.h, p0/m, z29.h, z30.h mad z3.h, p0/m, z29.h, z30.h add x2, x2, x3 stp q0, q1, [x0] stp q2, q3, [x0, #32] add x0, x0, x1 b.gt 32b ret .align JUMP_ALIGN 640: // prep - 64xN AARCH64_VALID_JUMP_TARGET .align LOOP_ALIGN 64: /* one row (128 bytes) per iteration, processed in z0-z7 */ ldp q0, q1, [x2] mad z0.h, p0/m, z29.h, z30.h mad z1.h, p0/m, z29.h, z30.h ldp q2, q3, [x2, #32] mad z2.h, p0/m, z29.h, z30.h mad z3.h, p0/m, z29.h, z30.h ldp q4, q5, [x2, #64] mad z4.h, p0/m, z29.h, z30.h mad z5.h, p0/m, z29.h, z30.h ldp q6, q7, [x2, #96] add x2, x2, x3 subs w5, w5, #1 mad z6.h, p0/m, z29.h, z30.h mad z7.h, p0/m, z29.h, z30.h stp q0, q1, [x0] stp q2, q3, [x0, #32] stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x0, x0, x1 b.gt 64b ret endfunc /* prep_tbl: 32-bit offsets of the prep_sve loops relative to prep_tbl itself, ordered from the 64-wide loop (entry 0) down to the 4-wide loop (entry 4); read by the ldrsw at the top of prep_sve. */ jumptable prep_tbl .word 640b - prep_tbl .word 320b - prep_tbl .word 160b - prep_tbl .word 80b - prep_tbl .word 40b - prep_tbl endjumptable // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7), bdmax(w8) //
xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3) filter_8tap_fn prep, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7), bdmax(w8) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3) filter_8tap_fn put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3 DISABLE_SVE2 DISABLE_SVE #endif // HAVE_SVE2 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/mc_dotprod.S000066400000000000000000001766751517466257200241470ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Janne Grunau * Copyright © 2024, Martin Storsjo * Copyright © 2024, Arm Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #if HAVE_DOTPROD ENABLE_DOTPROD // No spaces in these expressions, due to gas-preprocessor. It is translated by // -1 to save the negative offset at getting the address of `mc_subpel_filters`. /* Filter-type constants handed to the shared 8-tap body in x9 by make_8tap_fn. Each one combines two (set * 15 - 1) row bases, one shifted left by 7 and one in the low 7 bits. filter_8tap_fn adds the subpel position to both via a madd with 0x4081, then splits them with ubfx #7, #7 and and #0x7F, and keeps the low field when the relevant block dimension is <= 4. */ #define REGULAR1 (((0*15-1)<<7)|(3*15-1)) #define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) #define SHARP1 (((2*15-1)<<7)|(3*15-1)) /* Alignment arguments: FUNC_ALIGN is passed as align= to the function macro, JUMP_ALIGN and LOOP_ALIGN are passed to .align in front of branch targets and loop heads. */ #define FUNC_ALIGN 2 #define JUMP_ALIGN 2 #define LOOP_ALIGN 2 /* h_tbl_neon_dotprod: TBL index vectors for the horizontal filters, laid out as 3 rows for SDOT (bytes 0..47), 2 rows for USMMLA (bytes 48..79) and 1 narrowing row (bytes 80..95). */ const h_tbl_neon_dotprod, align=4 // Shuffle indices to permute horizontal samples in preparation for // input to SDOT instructions. The 8-tap horizontal convolution uses // sample indices in the interval of [-3, 4] relative to the current // sample position. /* each row holds four overlapping 4-byte windows, advancing by one sample per window */ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 // Shuffle indices to permute horizontal samples in preparation for // input to USMMLA instructions. /* byte offset of the USMMLA rows: 3 SDOT rows * 16 bytes */ #define OFFSET_USMMLA 48 /* each row holds two 8-byte windows, advancing by two samples per window */ .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 .byte 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 // Lookup table used to help conversion of shifted 32-bit values to 8-bit. /* byte offset of the narrowing row: 48 + 2 USMMLA rows * 16 bytes */ #define OFFSET_CVT_32_8 80 /* selects bytes 1 and 2 of every 32-bit element of a two-register table (indices 0..31), producing eight 16-bit values */ .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 endconst /* v_tbl_neon_dotprod: five TBL index vectors (80 bytes) for the vertical filters; filter_8tap_fn loads them from offsets 0, 16, 32, 48 and 64 into v6, v28, v29, v30 and v31. */ const v_tbl_neon_dotprod, align=4 // Vertical convolutions are also using SDOT instructions, where a // 128-bit register contains a transposed 4x4 matrix of values.
// Subsequent iterations of the vertical convolution can reuse the // 3x4 sub-matrix from the previous loop iteration. These shuffle // indices shift and merge this 4x4 matrix with the values of a new // line. /* In every row each 4-byte group drops its first byte and appends one byte of the second table register (indices >= 16). Row 0 appends bytes 16, 20, 24, 28; rows 1-4 append the consecutive bytes 16-19, 20-23, 24-27 and 28-31 respectively. */ .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 endconst /* make_8tap_fn op, name, type, isa, jump=1: emits the exported entry point <op>_8tap_<name>_8bpc_<isa>. The entry point loads the filter-type constant <type> into x9 and, when jump is non-zero, branches to the shared body <op>_8tap_<isa>. With jump=0 no branch is emitted, so execution continues into whatever is assembled directly after the entry point; filter_8tap_fn uses this for the regular variant, which it places immediately before the shared body. */ .macro make_8tap_fn op, name, type, isa, jump=1 function \op\()_8tap_\name\()_8bpc_\isa, export=1, align=FUNC_ALIGN mov x9, \type .if \jump b \op\()_8tap_\isa .endif endfunc .endm .macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd make_8tap_fn \type, sharp, SHARP1, \isa make_8tap_fn \type, smooth, SMOOTH1, \isa make_8tap_fn \type, regular, REGULAR1, \isa, jump=0 function \type\()_8tap_\isa, align=FUNC_ALIGN clz w8, \w mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) sub w8, w8, #25 // for jump tables movrel x12, X(mc_subpel_filters) .ifc \type, prep add \wd_strd, \wd_strd, \wd_strd // d_strd is expressed in int16 units .endif cbnz \mx, L(\type\()_8tap_h_hv_\isa) cbnz \my, L(\type\()_8tap_v_\isa) b X(\type\()_neon) .align JUMP_ALIGN L(\type\()_8tap_v_\isa): madd \my, \my, w11, w9 movrel x13, v_tbl_neon_dotprod sub \src, \src, \s_strd .ifc \isa, neon_dotprod .ifc \type, prep mov w8, #0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 .else movi v4.4s, #32, lsl #8 // FILTER_WEIGHT * 128, bias for SDOT .endif .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F ldp q6, q28, [x13] cmp \h, #4 csel \my, \my, w11, le sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address ldr q29, [x13, #32] .ifc \isa, neon_dotprod movi v5.16b, #128 .endif ldr d7, [\xmy] cmp \w, #8 b.eq 80f b.lt 40f // .align JUMP_ALIGN // fallthrough 160: // V - 16xN+ ldp q30,
q31, [x13, #48] .align LOOP_ALIGN 161: mov \lsrc, \src mov \ldst, \dst sub w8, \h, #1 ldr q16, [\lsrc] ldr q17, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 ldr q18, [\lsrc] ldr q19, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 zip1 v0.16b, v16.16b, v17.16b zip2 v1.16b, v16.16b, v17.16b zip1 v2.16b, v18.16b, v19.16b zip2 v3.16b, v18.16b, v19.16b ldr q20, [\lsrc] ldr q21, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 ldr q22, [\lsrc] ldr q23, [\lsrc, \s_strd] add \lsrc, \lsrc, \s_strd, lsl #1 zip1 v18.16b, v20.16b, v21.16b zip2 v21.16b, v20.16b, v21.16b zip1 v24.16b, v22.16b, v23.16b zip2 v27.16b, v22.16b, v23.16b zip1 v16.8h, v0.8h, v2.8h zip2 v19.8h, v0.8h, v2.8h zip1 v22.8h, v1.8h, v3.8h zip2 v25.8h, v1.8h, v3.8h zip1 v17.8h, v18.8h, v24.8h zip2 v20.8h, v18.8h, v24.8h zip1 v23.8h, v21.8h, v27.8h zip2 v26.8h, v21.8h, v27.8h .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v22.16b, v22.16b, v5.16b sub v25.16b, v25.16b, v5.16b sub v17.16b, v17.16b, v5.16b sub v20.16b, v20.16b, v5.16b sub v23.16b, v23.16b, v5.16b sub v26.16b, v26.16b, v5.16b .endif .align LOOP_ALIGN 16: .ifc \isa, neon_i8mm ld1 {v18.16b}, [\lsrc], \s_strd movi v0.4s, #0 movi v1.4s, #0 movi v2.4s, #0 movi v3.4s, #0 mov v21.16b, v18.16b mov v24.16b, v18.16b mov v27.16b, v18.16b .else // neon_dotprod ld1 {v27.16b}, [\lsrc], \s_strd mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b .endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] \dot v3.4s, v25.16b, v7.4b[0] tbl v16.16b, {v16.16b, v17.16b}, v6.16b tbl v19.16b, {v19.16b, v20.16b}, v6.16b tbl v22.16b, {v22.16b, v23.16b}, v6.16b tbl v25.16b, {v25.16b, v26.16b}, v6.16b \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v20.16b, v7.4b[1] \dot v2.4s, v23.16b, v7.4b[1] \dot v3.4s, v26.16b, v7.4b[1] tbl v17.16b, {v17.16b, 
v18.16b}, v28.16b tbl v20.16b, {v20.16b, v21.16b}, v29.16b tbl v23.16b, {v23.16b, v24.16b}, v30.16b tbl v26.16b, {v26.16b, v27.16b}, v31.16b subs w8, w8, #1 uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v0.8h, v0.8h, #2 srshr v1.8h, v2.8h, #2 .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 .endif st1 {v0.8h, v1.8h}, [\ldst], \d_strd .else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 st1 {v0.16b}, [\ldst], \d_strd .endif b.gt 16b .ifc \isa, neon_i8mm movi v0.4s, #0 movi v1.4s, #0 movi v2.4s, #0 movi v3.4s, #0 .else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b .endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] \dot v3.4s, v25.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v20.16b, v7.4b[1] \dot v2.4s, v23.16b, v7.4b[1] \dot v3.4s, v26.16b, v7.4b[1] subs \w, \w, #16 uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v0.8h, v0.8h, #2 srshr v1.8h, v2.8h, #2 .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 .endif stp q0, q1, [\ldst] add \dst, \dst, #32 .else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 str q0, [\ldst] add \dst, \dst, #16 .endif add \src, \src, #16 b.gt 161b ret .align JUMP_ALIGN 80: // V - 8xN ldr d16, [\src] ldr d17, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d18, [\src] ldr d19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d20, [\src] ldr d21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr d22, [\src] ldr d23, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 subs \h, \h, #2 // for prep: sub is enough zip1 v0.16b, v16.16b, v17.16b zip1 v2.16b, v18.16b, v19.16b zip1 v18.16b, v20.16b, v21.16b zip1 v24.16b, v22.16b, v23.16b zip1 v16.8h, v0.8h, v2.8h zip2 v19.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h zip2 v20.8h, v18.8h, v24.8h .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b 
sub v17.16b, v17.16b, v5.16b sub v20.16b, v20.16b, v5.16b .endif .ifc \type, put b.eq 82f .endif .align LOOP_ALIGN 8: .ifc \isa, neon_i8mm ldr d18, [\src] movi v0.4s, #0 movi v1.4s, #0 ldr d24, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 movi v2.4s, #0 movi v3.4s, #0 mov v21.8b, v18.8b mov v27.8b, v24.8b .else // neon_dotprod ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b .endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] tbl v16.16b, {v22.16b, v23.16b}, v6.16b tbl v19.16b, {v25.16b, v26.16b}, v6.16b tbl v17.16b, {v23.16b, v24.16b}, v28.16b tbl v20.16b, {v26.16b, v27.16b}, v29.16b \dot v2.4s, v22.16b, v7.4b[0] \dot v2.4s, v23.16b, v7.4b[1] \dot v3.4s, v25.16b, v7.4b[0] \dot v3.4s, v26.16b, v7.4b[1] subs \h, \h, #2 uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v0.8h, v0.8h, #2 srshr v1.8h, v2.8h, #2 .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 .endif str q0, [\dst] str q1, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] str d1, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .endif b.gt 8b .ifc \type, put .align JUMP_ALIGN 82: .endif .ifc \isa, neon_i8mm ldr d18, [\src] movi v0.4s, #0 movi v1.4s, #0 movi v2.4s, #0 movi v3.4s, #0 mov v21.8b, v18.8b .else // neon_dotprod ldr d21, [\src] mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, 
v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] \dot v2.4s, v22.16b, v7.4b[0] \dot v2.4s, v23.16b, v7.4b[1] \dot v3.4s, v25.16b, v7.4b[0] \dot v3.4s, v26.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v0.8h, v0.8h, #2 srshr v1.8h, v2.8h, #2 .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 .endif str q0, [\dst] str q1, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] str d1, [\dst, \d_strd] .endif ret .align JUMP_ALIGN 40: // V - 4xN or 2xN (put only) .ifc \type, put cmp \w, #2 b.eq 20f .endif ldr s16, [\src] ldr s17, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr s18, [\src] ldr s19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr s20, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr s22, [\src] ldr s23, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 subs \h, \h, #2 // for prep: sub is enough zip1 v0.8b, v16.8b, v17.8b zip1 v2.8b, v18.8b, v19.8b zip1 v18.8b, v20.8b, v21.8b zip1 v24.8b, v22.8b, v23.8b zip1 v16.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v17.16b, v17.16b, v5.16b .endif .ifc \type, put b.eq 42f .endif .align LOOP_ALIGN 4: ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \isa, neon_i8mm movi v0.4s, #0 movi v1.4s, #0 .else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep 
subs \h, \h, #2 .ifc \isa, neon_i8mm rshrn v0.4h, v0.4s, #2 rshrn2 v0.8h, v1.4s, #2 .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 .endif mov v1.d[0], v0.d[1] st1 {v0.4h}, [\dst], \d_strd st1 {v1.4h}, [\dst], \d_strd .else uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 subs \h, \h, #2 fmov x8, d0 lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .endif b.gt 4b .ifc \type, put .align JUMP_ALIGN 42: .endif ldr s18, [\src] .ifc \isa, neon_i8mm movi v0.4s, #0 movi v1.4s, #0 .else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b sub v18.16b, v18.16b, v5.16b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep .ifc \isa, neon_i8mm rshrn v0.4h, v0.4s, #2 rshrn2 v0.8h, v1.4s, #2 .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 .endif mov v1.d[0], v0.d[1] st1 {v0.4h}, [\dst], \d_strd st1 {v1.4h}, [\dst], \d_strd .else uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 fmov x8, d0 lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] .endif ret .ifc \type, put .align JUMP_ALIGN 20: // V - 2xN ldr h16, [\src] ldr h17, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr h18, [\src] ldr h19, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr h20, [\src] ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 ldr h22, [\src] ldr h23, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 subs \h, \h, #2 zip1 v0.8b, v16.8b, v17.8b zip1 v2.8b, v18.8b, v19.8b zip1 v18.8b, v20.8b, v21.8b zip1 v24.8b, v22.8b, v23.8b zip1 v16.4h, v0.4h, v2.4h zip1 v17.4h, v18.4h, v24.4h .ifc \isa, neon_dotprod sub v16.8b, v16.8b, v5.8b sub v17.8b, v17.8b, v5.8b .endif b.eq 22f .align LOOP_ALIGN 2: ldr h18, [\src] ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \isa, neon_i8mm movi v0.4s, #0 movi v1.4s, #0 .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b sub v18.8b, 
v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 subs \h, \h, #2 fmov x8, d0 lsr x9, x8, #32 strh w8, [\dst] strh w9, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 b.gt 2b .align JUMP_ALIGN 22: ldr h18, [\src] .ifc \isa, neon_i8mm movi v0.4s, #0 movi v1.4s, #0 .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b \dot v0.4s, v16.16b, v7.4b[0] \dot v0.4s, v17.16b, v7.4b[1] \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 fmov x8, d0 lsr x9, x8, #32 strh w8, [\dst] strh w9, [\dst, \d_strd] ret .endif .align JUMP_ALIGN L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w9 // for HV .ifc \isa, neon_dotprod mov w13, #0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this .endif movrel x13, h_tbl_neon_dotprod sub \src, \src, #3 // src - 3 ldr q28, [x13] // for 4-tap & 8-tap H filters ubfx w15, \mx, #7, #7 and \mx, \mx, #0x7F ubfx w11, w14, #7, #7 // for HV and w14, w14, #0x7F // for HV cmp \w, #4 csel \mx, \mx, w15, le add \xmx, x12, \xmx, lsl #3 // subpel H filter address .ifc \isa, neon_dotprod movi v24.16b, #128 .endif cbz \my, L(\type\()_8tap_h_\isa) // HV cases cmp \h, #4 csel w14, w14, w11, le sub \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3 add \xmy, x12, x14, lsl #3 // subpel V filter address mov x15, x30 ldr d7, [\xmy] .ifc \type, put ldr q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion .endif // of 32b values to 8b sxtl v7.8h, v7.8b cmp w9, #SHARP1 b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 
8-tap cases sub \src, \src, \s_strd // src - s_strd * 3 - 3 cmp \w, #4 b.eq 40f .ifc \type, put b.lt 20f .endif // .align JUMP_ALIGN // fallthrough 80: // HV8 - 8xN+ ldp q29, q30, [x13, #16] ldr d26, [\xmx] .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h .ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v20.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v21.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v22.8h, v22.8h, #2 .else bl L(\type\()_hv_filter8_\isa) sshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v20.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v21.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v22.8h, v22.8h, #2 .endif .align LOOP_ALIGN 8: ldr q23, [\lsrc] add \lsrc, \lsrc, \s_strd smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b .ifc \isa, neon_i8mm movi v5.4s, #0 movi v6.4s, #0 tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b .else // neon_dotprod sub v23.16b, v23.16b, v24.16b mov v5.16b, v27.16b mov v6.16b, v27.16b .endif smlal v0.4s, v17.4h, v7.h[1] smlal2 v1.4s, v17.8h, v7.h[1] .ifc \isa, neon_i8mm tbl v4.16b, {v23.16b}, v30.16b mov v17.16b, v18.16b .else // neon_dotprod mov v17.16b, v18.16b tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b tbl v4.16b, {v23.16b}, v30.16b .endif smlal v0.4s, v18.4h, v7.h[2] smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b \dot v5.4s, v2.16b, v26.4b[0] \dot v6.4s, v3.16b, v26.4b[0] smlal v0.4s, v19.4h, v7.h[3] smlal2 v1.4s, v19.8h, v7.h[3] mov v19.16b, v20.16b \dot v5.4s, v3.16b, v26.4b[1] \dot v6.4s, v4.16b, v26.4b[1] smlal 
v0.4s, v20.4h, v7.h[4] smlal2 v1.4s, v20.8h, v7.h[4] mov v20.16b, v21.16b smlal v0.4s, v21.4h, v7.h[5] smlal2 v1.4s, v21.8h, v7.h[5] .ifc \type, prep uzp1 v23.8h, v5.8h, v6.8h .endif mov v21.16b, v22.16b smlal v0.4s, v22.4h, v7.h[6] smlal2 v1.4s, v22.8h, v7.h[6] .ifc \isa, neon_i8mm subs w8, w8, #1 .endif .ifc \type, prep .ifc \isa, neon_i8mm srshr v22.8h, v23.8h, #2 .else sshr v22.8h, v23.8h, #2 .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 .else // put .ifc \isa, neon_i8mm rshrn v22.4h, v5.4s, #2 rshrn2 v22.8h, v6.4s, #2 .else shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 .endif .ifc \isa, neon_dotprod subs w8, w8, #1 .endif .ifc \type, prep st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 .else st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 .endif add \src, \src, #8 subs \w, \w, #8 b.gt 81b ret x15 .align JUMP_ALIGN 40: // HV8 - 4xN ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 4: ld1 {v4.8b}, [\src], \s_strd smull v0.4s, v16.4h, v7.h[0] smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b .ifc \isa, neon_i8mm movi v5.4s, #0 .else mov v5.16b, v27.16b .endif mov v18.16b, v19.16b mov v19.16b, v20.16b smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, 
v21.16b mov v21.16b, v22.16b smlal v0.4s, v22.4h, v7.h[6] .ifc \isa, neon_i8mm rshrn v22.4h, v5.4s, #2 .else shrn v22.4h, v5.4s, #2 .endif smlal v0.4s, v22.4h, v7.h[7] .ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst] subs \h, \h, #1 add \dst, \dst, \d_strd .else subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd .endif b.gt 4b ret x15 .ifc \type, put .align JUMP_ALIGN 20: // HV8 - 2xN ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 2: ld1 {v4.8b}, [\src], \s_strd smull v0.4s, v16.4h, v7.h[0] smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b .ifc \isa, neon_i8mm movi v5.4s, #0 .else mov v5.16b, v27.16b .endif mov v18.16b, v19.16b mov v19.16b, v20.16b smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b smlal v0.4s, v22.4h, v7.h[6] .ifc \isa, neon_i8mm rshrn v22.4h, v5.4s, #2 .else shrn v22.4h, v5.4s, #2 .endif smlal v0.4s, v22.4h, v7.h[7] subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] add \dst, \dst, \d_strd b.gt 2b ret x15 .endif .align JUMP_ALIGN L(\type\()_6tap_hv_\isa): cmp \w, #4 b.eq 40f .ifc \type, put b.lt 20f .endif // .align JUMP_ALIGN // fallthrough 80: // HV6 - 8xN+ ldr d26, [\xmx] .ifc \isa, neon_i8mm cmp w9, #SHARP1 b.eq 88f // horizontal == SHARP1 ldp q29, q30, [x13, #(OFFSET_USMMLA)] ext v0.8b, v26.8b, v26.8b, #7 ins 
v26.d[1], v0.d[0] .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h bl L(\type\()_hv_filter6_neon_i8mm) srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter6_neon_i8mm) srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter6_neon_i8mm) srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter6_neon_i8mm) srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter6_neon_i8mm) srshr v20.8h, v22.8h, #2 .align LOOP_ALIGN 8: ld1 {v23.16b}, [\lsrc], \s_strd smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] mov v16.16b, v17.16b movi v5.4s, #0 movi v6.4s, #0 tbl v2.16b, {v23.16b}, v29.16b tbl v3.16b, {v23.16b}, v30.16b smlal v0.4s, v17.4h, v7.h[2] smlal2 v1.4s, v17.8h, v7.h[2] mov v17.16b, v18.16b usmmla v5.4s, v2.16b, v26.16b usmmla v6.4s, v3.16b, v26.16b smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, v19.16b subs w8, w8, #1 smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] uzp1 v23.8h, v5.8h, v6.8h mov v19.16b, v20.16b smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] srshr v20.8h, v23.8h, #2 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] .ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 .else tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 .endif add \src, \src, #8 subs \w, \w, #8 b.gt 81b ret x15 .align JUMP_ALIGN 88: .endif // neon_i8mm ldp q29, q30, [x13, #16] .align LOOP_ALIGN 81: mov \lsrc, \src mov \ldst, \dst mov w8, \h .ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) srshr v20.8h, v22.8h, #2 .else bl L(\type\()_hv_filter8_\isa) sshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v18.8h, 
v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) sshr v20.8h, v22.8h, #2 .endif .align LOOP_ALIGN 8: ldr q23, [\lsrc] add \lsrc, \lsrc, \s_strd smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] .ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b .endif mov v16.16b, v17.16b .ifc \isa, neon_i8mm movi v5.4s, #0 movi v6.4s, #0 .else mov v5.16b, v27.16b mov v6.16b, v27.16b .endif tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b smlal v0.4s, v17.4h, v7.h[2] smlal2 v1.4s, v17.8h, v7.h[2] tbl v4.16b, {v23.16b}, v30.16b mov v17.16b, v18.16b \dot v5.4s, v2.16b, v26.4b[0] \dot v6.4s, v3.16b, v26.4b[0] smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, v19.16b \dot v5.4s, v3.16b, v26.4b[1] \dot v6.4s, v4.16b, v26.4b[1] smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] mov v19.16b, v20.16b uzp1 v23.8h, v5.8h, v6.8h smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] .ifc \isa, neon_i8mm srshr v20.8h, v23.8h, #2 .else sshr v20.8h, v23.8h, #2 .endif subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] .ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 .else tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 .endif add \src, \src, #8 subs \w, \w, #8 b.gt 81b ret x15 .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ld1 {v4.16b}, [\lsrc], \s_strd .ifc \isa, neon_i8mm movi v22.4s, #0 movi v23.4s, #0 .else // neon_dotprod sub v4.16b, v4.16b, v24.16b mov v22.16b, v27.16b mov v23.16b, v27.16b .endif tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b \dot v22.4s, v2.16b, v26.4b[0] \dot v23.4s, v3.16b, v26.4b[0] \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] uzp1 v22.8h, v22.8h, v23.8h ret .ifc \isa, neon_i8mm .align FUNC_ALIGN L(\type\()_hv_filter6_neon_i8mm): ld1 
{v4.16b}, [\lsrc], \s_strd movi v22.4s, #0 movi v23.4s, #0 tbl v2.16b, {v4.16b}, v29.16b tbl v3.16b, {v4.16b}, v30.16b usmmla v22.4s, v2.16b, v26.16b usmmla v23.4s, v3.16b, v26.16b uzp1 v22.8h, v22.8h, v23.8h ret .endif .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd .ifc \isa, neon_i8mm movi v22.4s, #2 .else mov v22.16b, v27.16b sub v4.16b, v4.16b, v24.16b .endif tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] ret .align JUMP_ALIGN 40: // HV6 - 4xN ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 4: ld1 {v4.8b}, [\src], \s_strd smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b .ifc \isa, neon_i8mm movi v5.4s, #0 .else mov v5.16b, v27.16b .endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] .ifc \isa, neon_i8mm rshrn v20.4h, v5.4s, #2 .else shrn v20.4h, v5.4s, #2 .endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] .ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst] add \dst, \dst, \d_strd .else tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd .endif b.gt 4b ret x15 .ifc \type, put .align JUMP_ALIGN 20: // HV6 - 2xN ldur s26, [\xmx, #2] add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 2: ld1 
{v4.8b}, [\src], \s_strd smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b .ifc \isa, neon_i8mm movi v5.4s, #0 .else mov v5.16b, v27.16b .endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] .ifc \isa, neon_i8mm rshrn v20.4h, v5.4s, #2 .else shrn v20.4h, v5.4s, #2 .endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] add \dst, \dst, \d_strd b.gt 2b ret x15 .endif .align JUMP_ALIGN L(\type\()_8tap_h_\isa): movrel x11, \type\()_8tap_h_\isa\()_tbl ldrsw x8, [x11, x8, lsl #2] .ifc \type, put .ifc \isa, neon_i8mm movi v27.4s, #34 // special rounding .else mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT dup v27.4s, w10 .endif .endif add x11, x11, x8 br x11 .ifc \type, put .align JUMP_ALIGN 20: // H - 2xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 ldur s26, [\xmx, #2] .align LOOP_ALIGN 2: ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b .endif mov v4.16b, v27.16b mov v5.16b, v27.16b tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b \dot v4.4s, v2.16b, v26.4b[0] \dot v5.4s, v3.16b, v26.4b[0] uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 subs \h, \h, #2 fmov x8, d4 lsr x9, x8, #32 strh w8, [\dst] strh w9, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 b.gt 2b ret .endif .align JUMP_ALIGN 40: // H - 4xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 ldur s26, [\xmx, #2] .align LOOP_ALIGN 4: ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \type\()_\isa, prep_neon_i8mm movi v4.4s, #0 movi v5.4s, #0 .else .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b .endif mov v4.16b, v27.16b mov v5.16b, 
v27.16b .endif tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b \dot v4.4s, v2.16b, v26.4b[0] \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 .ifc \isa, neon_i8mm uzp1 v4.8h, v4.8h, v5.8h srshr v4.8h, v4.8h, #2 .else shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 .endif mov v5.d[0], v4.d[1] str d4, [\dst] str d5, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .else // put uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 subs \h, \h, #2 fmov x8, d4 lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .endif b.gt 4b ret .align JUMP_ALIGN 80: // H - 8xN AARCH64_VALID_JUMP_TARGET ldr d26, [\xmx] .ifc \isa, neon_i8mm cmp w9, #SHARP1 b.eq 88f // horizontal == SHARP1 ldp q29, q30, [x13, #(OFFSET_USMMLA)] ext v0.8b, v26.8b, v26.8b, #7 ins v26.d[1], v0.d[0] .align LOOP_ALIGN 8: ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \type, prep movi v4.4s, #0 movi v5.4s, #0 movi v20.4s, #0 movi v21.4s, #0 .else mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b mov v21.16b, v27.16b .endif tbl v1.16b, {v0.16b}, v29.16b tbl v2.16b, {v0.16b}, v30.16b tbl v17.16b, {v16.16b}, v29.16b tbl v18.16b, {v16.16b}, v30.16b usmmla v4.4s, v1.16b, v26.16b usmmla v5.4s, v2.16b, v26.16b usmmla v20.4s, v17.16b, v26.16b usmmla v21.4s, v18.16b, v26.16b uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h .ifc \type, prep srshr v4.8h, v4.8h, #2 srshr v20.8h, v20.8h, #2 subs \h, \h, #2 str q4, [\dst] str q20, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .else // put sqshrun v4.8b, v4.8h, #6 sqshrun v20.8b, v20.8h, #6 subs \h, \h, #2 str d4, [\dst] str d20, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .endif b.gt 8b ret .align JUMP_ALIGN 88: .endif // neon_i8mm ldp q29, q30, [x13, #16] .align LOOP_ALIGN 8: ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 .ifc \type\()_\isa, prep_neon_i8mm movi v4.4s, #0 movi v5.4s, #0 movi v20.4s, #0 movi v21.4s, #0 .else .ifc \isa, neon_dotprod 
sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b .endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b mov v21.16b, v27.16b .endif tbl v1.16b, {v0.16b}, v28.16b tbl v2.16b, {v0.16b}, v29.16b tbl v3.16b, {v0.16b}, v30.16b tbl v17.16b, {v16.16b}, v28.16b tbl v18.16b, {v16.16b}, v29.16b tbl v19.16b, {v16.16b}, v30.16b \dot v4.4s, v1.16b, v26.4b[0] \dot v5.4s, v2.16b, v26.4b[0] \dot v20.4s, v17.16b, v26.4b[0] \dot v21.4s, v18.16b, v26.4b[0] \dot v4.4s, v2.16b, v26.4b[1] \dot v5.4s, v3.16b, v26.4b[1] \dot v20.4s, v18.16b, v26.4b[1] \dot v21.4s, v19.16b, v26.4b[1] uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v4.8h, v4.8h, #2 srshr v20.8h, v20.8h, #2 .else sshr v4.8h, v4.8h, #2 sshr v20.8h, v20.8h, #2 .endif subs \h, \h, #2 str q4, [\dst] str q20, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .else // put sqshrun v4.8b, v4.8h, #6 sqshrun v20.8b, v20.8h, #6 subs \h, \h, #2 str d4, [\dst] str d20, [\dst, \d_strd] add \dst, \dst, \d_strd, lsl #1 .endif b.gt 8b ret .align JUMP_ALIGN 160: // H - 16xN AARCH64_VALID_JUMP_TARGET ldr d26, [\xmx] .ifc \isa, neon_i8mm cmp w9, #SHARP1 b.eq 168f // horizontal == SHARP1 ldp q29, q30, [x13, #(OFFSET_USMMLA)] ext v0.8b, v26.8b, v26.8b, #7 ins v26.d[1], v0.d[0] .align LOOP_ALIGN 16: ldr q16, [\src] ldur q17, [\src, #8] // avoid 2 register TBL for small cores add \src, \src, \s_strd .ifc \type, prep movi v6.4s, #0 movi v7.4s, #0 movi v22.4s, #0 movi v23.4s, #0 .else mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b .endif tbl v0.16b, {v16.16b}, v29.16b tbl v1.16b, {v16.16b}, v30.16b tbl v2.16b, {v17.16b}, v29.16b tbl v3.16b, {v17.16b}, v30.16b usmmla v6.4s, v0.16b, v26.16b usmmla v7.4s, v1.16b, v26.16b usmmla v22.4s, v2.16b, v26.16b usmmla v23.4s, v3.16b, v26.16b uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep srshr v6.8h, v6.8h, #2 srshr v22.8h, v22.8h, #2 subs \h, \h, #1 stp q6, q22, [\dst] add \dst, \dst, 
\d_strd .else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 st1 {v6.16b}, [\dst], \d_strd .endif b.gt 16b ret .align JUMP_ALIGN 168: .endif // neon_i8mm ldp q29, q30, [x13, #16] .align LOOP_ALIGN 16: ldr q16, [\src] ldur q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd .ifc \type\()_\isa, prep_neon_i8mm movi v6.4s, #0 movi v7.4s, #0 movi v22.4s, #0 movi v23.4s, #0 .else .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b .endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b tbl v3.16b, {v17.16b}, v28.16b tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] \dot v22.4s, v2.16b, v26.4b[0] \dot v23.4s, v3.16b, v26.4b[0] \dot v6.4s, v1.16b, v26.4b[1] \dot v7.4s, v2.16b, v26.4b[1] \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v6.8h, v6.8h, #2 srshr v22.8h, v22.8h, #2 .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 .endif subs \h, \h, #1 stp q6, q22, [\dst] add \dst, \dst, \d_strd .else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 st1 {v6.16b}, [\dst], \d_strd .endif b.gt 16b ret .align JUMP_ALIGN 320: // H - 32xN+ 640: AARCH64_VALID_JUMP_TARGET ldr d26, [\xmx] .ifc \type, put sub \d_strd, \d_strd, \w, uxtw .else sub \d_strd, \d_strd, \w, uxtw #1 .endif sub \s_strd, \s_strd, \w, uxtw mov w8, \w .ifc \isa, neon_i8mm cmp w9, #SHARP1 b.eq 328f // horizontal == SHARP1 ldp q29, q30, [x13, #(OFFSET_USMMLA)] ext v0.8b, v26.8b, v26.8b, #7 ins v26.d[1], v0.d[0] .align LOOP_ALIGN 32: ldr q16, [\src] ldur q17, [\src, #8] // avoid 2 register TBL for small cores add \src, \src, #16 .ifc \type, prep movi v6.4s, #0 movi v7.4s, #0 movi v22.4s, #0 movi v23.4s, #0 .else mov v6.16b, 
v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b .endif tbl v0.16b, {v16.16b}, v29.16b tbl v1.16b, {v16.16b}, v30.16b tbl v2.16b, {v17.16b}, v29.16b tbl v3.16b, {v17.16b}, v30.16b usmmla v6.4s, v0.16b, v26.16b usmmla v7.4s, v1.16b, v26.16b usmmla v22.4s, v2.16b, v26.16b usmmla v23.4s, v3.16b, v26.16b uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep srshr v6.8h, v6.8h, #2 srshr v22.8h, v22.8h, #2 subs w8, w8, #16 stp q6, q22, [\dst], #32 .else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs w8, w8, #16 str q6, [\dst], #16 .endif b.gt 32b add \src, \src, \s_strd add \dst, \dst, \d_strd mov w8, \w subs \h, \h, #1 b.gt 32b ret .align JUMP_ALIGN 328: .endif // neon_i8mm ldp q29, q30, [x13, #16] .align LOOP_ALIGN 32: ldr q16, [\src] ldur q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 .ifc \type\()_\isa, prep_neon_i8mm movi v6.4s, #0 movi v7.4s, #0 movi v22.4s, #0 movi v23.4s, #0 .else .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b .endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b tbl v3.16b, {v17.16b}, v28.16b tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] \dot v22.4s, v2.16b, v26.4b[0] \dot v23.4s, v3.16b, v26.4b[0] \dot v6.4s, v1.16b, v26.4b[1] \dot v7.4s, v2.16b, v26.4b[1] \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep .ifc \isa, neon_i8mm srshr v6.8h, v6.8h, #2 srshr v22.8h, v22.8h, #2 .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 .endif subs w8, w8, #16 stp q6, q22, [\dst], #32 .else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs w8, w8, #16 str q6, [\dst], #16 .endif b.gt 32b add \src, \src, \s_strd add \dst, \dst, \d_strd mov w8, \w subs \h, \h, #1 b.gt 
32b ret endfunc jumptable \type\()_8tap_h_\isa\()_tbl .word 640b - \type\()_8tap_h_\isa\()_tbl .word 320b - \type\()_8tap_h_\isa\()_tbl .word 160b - \type\()_8tap_h_\isa\()_tbl .word 80b - \type\()_8tap_h_\isa\()_tbl .word 40b - \type\()_8tap_h_\isa\()_tbl .ifc \type, put .word 20b - \type\()_8tap_h_\isa\()_tbl .endif endjumptable .endm // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn prep, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 #if HAVE_I8MM ENABLE_I8MM // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn prep, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 DISABLE_I8MM #endif // HAVE_I8MM DISABLE_DOTPROD #endif // HAVE_DOTPROD dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/msac.S000066400000000000000000000302301517466257200227100ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * Copyright © 2026, Arm Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Byte offsets of the fields accessed through the context pointer in x0.
// BUF_POS/BUF_END are loaded as one pair of 64-bit values, RNG/CNT as one pair
// of 32-bit values.
// NOTE(review): these presumably mirror the C-side context struct layout;
// confirm against its definition whenever that struct changes.
#define BUF_POS 0
#define BUF_END 8
#define DIF 16
#define RNG 24
#define CNT 28
#define ALLOW_UPDATE_CDF 32

// Decode a single bypass bit.
//   x0: pointer to the decoder context (fields at the offsets defined above)
// Returns 1 when dif < (rng << 47), otherwise 0 (after subtracting that value
// from dif). dif is then shifted left by one with a 1 shifted in, and cnt is
// decremented; when cnt becomes negative the function continues in L(refill).
function msac_decode_bool_bypass_neon, export=1
        ldp             w5, w6, [x0, #RNG]      // + CNT
        ldr             x7, [x0, #DIF]
        lsl             x5, x5, #47             // r << 47
        mov             x15, #0                 // ret = 0
        subs            x2, x7, x5              // dif - vw
        mov             x8, #1
        csel            x7, x2, x7, hs          // if (dif >= vw) dif -= vw
        cinc            x15, x15, lo            // if (dif < vw) ret |= 1
        orr             x7, x8, x7, lsl #1      // dif << bits
        subs            w6, w6, #1              // cnt -= bits
        // NOTE(review): x10 is not read by this function or by L(refill);
        // confirm whether this instruction is still required.
        adr             x10, #8
        b.mi            L(refill)
        str             x7, [x0, #DIF]
        str             w6, [x0, #CNT]
        mov             x0, x15
        ret

// Shared refill tail. msac_decode_bool_adapt_neon also branches here.
// On entry: x0 = context, x7 = dif, w6 = cnt (already decremented),
// w15 = value to return. Stores CNT and DIF (and BUF_POS when bytes were
// consumed) and returns w15 in w0.
L(refill):
        // refill
        ldp             x3, x4, [x0]            // BUF_POS, BUF_END
        add             x5, x3, #8
        subs            x5, x5, x4
        b.hi            5f                      // fewer than 8 bytes left: take the end-of-buffer path

        ldr             x8, [x3]                // next_bits
        add             w4, w6, #-48            // shift_bits = cnt + 16 (- 64)
        neg             w9, w6
        neg             w5, w4
        and             w9, w9, #7
        rev             x8, x8                  // next_bits = bswap(next_bits)
        lsr             w5, w5, #3              // num_bytes_read
        lsr             x8, x8, x4              // next_bits >>= (shift_bits & 63)

2:      // refill_end
        add             x3, x3, x5
        add             w6, w6, w5, lsl #3      // cnt += num_bits_read
        str             x3, [x0, #BUF_POS]

3:      // refill_end2
        lsr             x8, x8, x9              // throw away the lowest bits
        lsl             x8, x8, x9              // shift back in place
        eor             x7, x7, x8              // dif ^= next_bits

4:      // end
        str             w6, [x0, #CNT]
        str             x7, [x0, #DIF]

        mov             w0, w15
        ret

5:      // refill_eob
        cmp             x3, x4
        b.hs            4b                      // buf_pos >= buf_end: nothing left to read

        // Load the last 8 bytes that end at buf_end and drop the bytes that
        // lie before buf_pos (x5 holds buf_pos + 8 - buf_end here).
        ldr             x8, [x4, #-8]
        neg             w9, w6
        lsl             w5, w5, #3
        and             w9, w9, #7
        lsr             x8, x8, x5
        add             w5, w6, #-48
        rev             x8, x8
        sub             w4, w4, w3              // num_bytes_left
        lsr             x8, x8, x5
        neg             w5, w5
        lsr             w5, w5, #3
        cmp             w5, w4
        csel            w5, w5, w4, lo          // num_bytes_read
        b               2b
endfunc

// Decode several bypass bits in one call.
//   x0: pointer to the decoder context
//   w1: number of bits to decode
// Returns the decoded bits in x0, with the first decoded bit most significant.
// NOTE(review): the decode loop decrements w1 before testing it for zero, so
// a count of 0 is not handled; presumably callers always pass >= 1 - confirm.
function msac_decode_bools_bypass_neon, export=1
        ldp             w11, w6, [x0, #RNG]     // + CNT
        ldr             x7, [x0, #DIF]
        cmp             w1, w6
        b.ls            3f                      // enough bits already buffered: skip the refill

        // We can't ret directly from refill here so inline a special version
        ldp             x3, x4, [x0]            // BUF_POS, BUF_END
        add             x5, x3, #8
        subs            x5, x5, x4
        b.hi            1f

        ldr             x8, [x3]                // next_bits
        add             w4, w6, #-48            // shift_bits = cnt + 16 (- 64)
        neg             w9, w6
        neg             w5, w4
        and             w9, w9, #7
        rev             x8, x8                  // next_bits = bswap(next_bits)
        lsr             w5, w5, #3              // num_bytes_read
        lsr             x8, x8, x4              // next_bits >>= (shift_bits & 63)
        b               2f

1:      // refill_eob
        cmp             x3, x4
        b.hs            3f

        ldr             x8, [x4, #-8]
        neg             w9, w6
        lsl             w5, w5, #3
        and             w9, w9, #7
        lsr             x8, x8, x5
        add             w5, w6, #-48
        rev             x8, x8
        sub             w4, w4, w3              // num_bytes_left
        lsr             x8, x8, x5
        neg             w5, w5
        lsr             w5, w5, #3
        cmp             w5, w4
        csel            w5, w5, w4, lo          // num_bytes_read

2:      // refill_end
        add             x3, x3, x5
        add             w6, w6, w5, lsl #3      // cnt += num_bits_read
        str             x3, [x0, #BUF_POS]

        lsr             x8, x8, x9              // throw away the lowest bits
        lsl             x8, x8, x9              // shift back in place
        eor             x7, x7, x8              // dif ^= next_bits

3:
        lsl             x5, x11, #47            // r << 47
        mov             x15, #0
        mov             w3, w1                  // keep the bit count; w1 is the loop counter below
4:
        sub             w1, w1, #1              // n_bits
        subs            x2, x7, x5              // dif - vw
        lsl             x15, x15, #1            // ret <<= 1
        lsr             x5, x5, #1              // vw >> 1
        csel            x7, x2, x7, hs          // if (dif >= vw) dif -= vw
        cinc            x15, x15, lo            // if (dif < vw) ret |= 1
        cbnz            w1, 4b

        // mvn/lsl/mvn shifts dif left while filling the vacated low bits with ones.
        mvn             x7, x7
        sub             w6, w6, w3              // cnt -= bits
        lsl             x7, x7, x3              // dif << bits
        mvn             x7, x7
        str             w6, [x0, #CNT]
        str             x7, [x0, #DIF]
        mov             x0, x15
        ret
endfunc

// Decode a unary-style bypass value of at most w1 bits.
//   x0: pointer to the decoder context
//   w1: maximum number of bits to consume
// Returns in x0 the number of consecutive iterations for which dif >= vw.
// Decoding stops at the first iteration with dif < vw (that bit is still
// consumed) or after w1 iterations.
function msac_decode_unary_bypass_neon, export=1
        ldp             w11, w6, [x0, #RNG]     // + CNT
        ldr             x7, [x0, #DIF]
        cmp             w1, w6
        b.ls            3f                      // enough bits already buffered: skip the refill

        // We can't ret directly from refill here so inline a special version
        ldp             x3, x4, [x0]            // BUF_POS, BUF_END
        add             x5, x3, #8
        subs            x5, x5, x4
        b.hi            1f

        ldr             x8, [x3]                // next_bits
        add             w4, w6, #-48            // shift_bits = cnt + 16 (- 64)
        neg             w9, w6
        neg             w5, w4
        and             w9, w9, #7
        rev             x8, x8                  // next_bits = bswap(next_bits)
        lsr             w5, w5, #3              // num_bytes_read
        lsr             x8, x8, x4              // next_bits >>= (shift_bits & 63)
        b               2f

1:      // refill_eob
        cmp             x3, x4
        b.hs            3f

        ldr             x8, [x4, #-8]
        neg             w9, w6
        lsl             w5, w5, #3
        and             w9, w9, #7
        lsr             x8, x8, x5
        add             w5, w6, #-48
        rev             x8, x8
        sub             w4, w4, w3              // num_bytes_left
        lsr             x8, x8, x5
        neg             w5, w5
        lsr             w5, w5, #3
        cmp             w5, w4
        csel            w5, w5, w4, lo          // num_bytes_read

2:      // refill_end
        add             x3, x3, x5
        add             w6, w6, w5, lsl #3      // cnt += num_bits_read
        str             x3, [x0, #BUF_POS]

        lsr             x8, x8, x9              // throw away the lowest bits
        lsl             x8, x8, x9              // shift back in place
        eor             x7, x7, x8              // dif ^= next_bits

3:
        lsl             x5, x11, #47            // r << 47
        mov             x15, #0                 // ret
        mov             w3, #0                  // bit
4:
        add             w3, w3, #1
        subs            x2, x7, x5              // dif - vw
        b.lo            5f                      // dif < vw: terminating bit, stop counting
        lsr             x5, x5, #1              // vw >> 1
        mov             x7, x2                  // dif -= vw
        add             x15, x15, #1            // ret++
        cmp             w3, w1
        b.ne            4b
5:
        // mvn/lsl/mvn shifts dif left while filling the vacated low bits with ones.
        mvn             x7, x7
        sub             w6, w6, w3              // cnt -= bits
        lsl             x7, x7, x3              // dif << bits
        str             w6, [x0, #CNT]
        mvn             x7, x7
        str             x7, [x0, #DIF]
        mov             x0, x15
        ret
endfunc

// Decode one bit using an adaptive probability.
//   x0: pointer to the decoder context
//   x1: pointer to two 16-bit values: [x1] (cdf[0] below) and [x1, #2]
//       (cdf[1] / "pc" below, whose low byte is used as a counter)
// Returns w0 = 1 when dif < vw, otherwise 0.
// When the ALLOW_UPDATE_CDF field of the context is non-zero, both 16-bit
// values are updated; the shift amount comes from the external msac_rate table.
// Uses L(refill), defined in msac_decode_bool_bypass_neon, when cnt runs out.
function msac_decode_bool_adapt_neon, export=1
        ldp             w5, w6, [x0, #RNG]      // + CNT
        ldrh            w9, [x1]                // cdf[0]
        mov             w2, #0xFFF0
        ldr             x7, [x0, #DIF]
        mov             w8, #0xFFFFFFF8
        lsr             w4, w5, #8              // r >> 8
        and             w2, w2, w9, lsr #3      // (f >> 7) << 4
        add             w2, w2, #8              // p = ((f >> 7) << 4) + 8
        mul             w4, w4, w2              // (r >> 8) * p
        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
        and             w4, w8, w4, lsr #4      // v = ((r >> 8) * p >> 7) << 3
        subs            x8, x7, x4, lsl #48     // dif - vw
        sub             w5, w5, w4              // r - v
        cset            w15, lo                 // return value: dif < vw
        csel            w4, w5, w4, hs          // if (dif >= vw) v = r - v; (inverse of w15)
        csel            x7, x8, x7, hs          // if (dif >= vw) dif = dif - vw;
        clz             w5, w4                  // clz(rng)
        eor             w5, w5, #16             // d = clz(rng) ^ 16

        cbz             w10, 1f                 // CDF updates disabled: go straight to renormalisation

        ldrh            w2, [x1, #2]            // cdf[1]
        lsr             w12, w2, #8             // pc >> 8
        and             w8, w2, #0xff           // (uint8_t)count = pc & 0xff
        add             w12, w12, w12, lsl #1   // pc >> 8 * 3
        sub             w3, w2, w8, lsr #5      // pc - (count >= 32)
        movrel          x11, X(msac_rate)
        add             w8, w12, w8, lsr #4     // pc >> 8 * 3 + count >> 4
        sub             w9, w9, w15             // cdf[0] -= bit
        ldrb            w2, [x11, x8]           // rate
        sub             w11, w9, w15, lsl #15   // {cdf[0], cdf[0] - 32769}
        add             w10, w3, #1             // pc + (count < 32)
        asr             w11, w11, w2            // {cdf[0], cdf[0] - 32769} >> rate
        sub             w9, w9, w11             // cdf[0]
        strh            w10, [x1, #2]
        strh            w9, [x1]

1:
        mvn             x7, x7                  // mvn/lsl/mvn is equivalent to ((dif + 1) << d) - 1
        lsl             w4, w4, w5              // rng << d
        lsl             x7, x7, x5              // dif << d
        subs            w6, w6, w5              // cnt -= d
        mvn             x7, x7
        str             w4, [x0, #RNG]
        b.lo            L(refill)               // the subtraction borrowed: refill, which also stores CNT/DIF
        str             w6, [x0, #CNT]
        str             x7, [x0, #DIF]
        mov             w0, w15
        ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/refmvs.S000066400000000000000000000565631517466257200233100ustar00rootroot00000000000000/*
 * Copyright © 2021, VideoLAN and dav2d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm-offsets.h"
#include "src/arm/asm.S"
#include "util.S"

#define INVALID_MV 0x80008000

// void dav2d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
//
// For each of bh4 row pointers taken from rr, writes bw4 copies of the 12-byte
// entry *rmv starting at byte offset bx4 * 12 in that row.
// The store sequence is selected once, through splat_tbl, from clz(bw4) - 26,
// so bw4 must be one of 1, 2, 4, 8, 16 or 32.
// Note that 16 bytes are loaded from rmv although only the first 12 are used.
function splat_mv_neon, export=1
        ld1             {v3.16b}, [x1]
        clz             w3, w3
        movrel          x5, splat_tbl
        sub             w3, w3, #26             // table index: 0 for bw4 == 32 ... 5 for bw4 == 1
        // The ext sequence turns the 12-byte entry into v0-v2 = 48 bytes
        // holding four back-to-back copies of it.
        ext             v2.16b, v3.16b, v3.16b, #12
        ldrsw           x3, [x5, w3, uxtw #2]
        add             w2, w2, w2, lsl #1      // bx4 * 3
        ext             v0.16b, v2.16b, v3.16b, #4
        add             x3, x5, x3              // absolute address of the store sequence
        ext             v1.16b, v2.16b, v3.16b, #8
        lsl             w2, w2, #2              // bx4 * 12
        ext             v2.16b, v2.16b, v3.16b, #12
1:
        ldr             x1, [x0], #8            // next row pointer from rr
        subs            w4, w4, #1              // flags are consumed by the b.gt after the stores
        add             x1, x1, x2
        br              x3

10:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2, [x1, #8]
        b.gt            1b
        ret
20:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1, [x1, #16]
        b.gt            1b
        ret
        // The larger widths fall through into the smaller ones, 48 bytes
        // (four entries) per store.
320:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret
endfunc

// Offsets of the splat_mv_neon store sequences relative to this table,
// ordered from the widest block (32) to the narrowest (1).
jumptable splat_tbl
        .word 320b - splat_tbl
        .word 160b - splat_tbl
        .word 80b - splat_tbl
        .word 40b - splat_tbl
        .word 20b - splat_tbl
        .word 10b - splat_tbl
endjumptable

// TBL index vectors used by save_tmvs_neon, one 16-byte row per case (0-3).
// Row 0 is all out-of-range indices, which makes TBL produce zeros.
// Row 1 repeats source bytes 0-3 and 8; rows 2 and 3 repeat bytes 4-7 and 9.
const mv_tbls, align=4
        .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

// Per-byte multipliers applied to the looked-up ref_sign values in
// save_tmvs_neon: {1, 2} for each of the two candidates handled per iteration.
const mask_mult, align=4
        .byte 1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav2d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
//
// For each row from row_start8 up to row_end8, walks the candidate blocks
// between col_start8 and col_end8 and writes 5-byte output entries to rp.
// Two candidates are processed per inner iteration. Their block-size byte
// selects, through save_tmvs_tbl, both the step to the next candidate and the
// store routine (10/20/40/80/160 below) that writes that many output entries.
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b, #0
        ld1             {v31.8b}, [x3]
        movrel          x8, save_tmvs_tbl
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4, w4
        sxtw            x6, w6
        mul             w1, w1, w15             // stride *= 5
        sub             w5, w5, w7              // h = row_end8 - row_start8
        lsl             w7, w7, #1              // row_start8 <<= 1

1:
        mov             w15, #5
        and             w9, w7, #30             // (y & 15) * 2
        ldr             x9, [x2, w9, uxtw #3]   // b = rr[(y & 15) * 2]
        add             x9, x9, #12             // &b[... + 1]
        madd            x10, x4, x14, x9        // end_cand_b = &b[col_end8*2 + 1]
        madd            x9, x6, x14, x9         // cand_b = &b[x*2 + 1]

        madd            x3, x6, x15, x0         // &rp[x]

2:
        ldrb            w11, [x9, #10]          // cand_b->bs
        ld1             {v0.16b}, [x9]          // cand_b->mv
        add             x11, x8, w11, uxtw #3
        ldr             h1, [x9, #8]            // cand_b->ref
        ldr             w12, [x11]              // bw8
        mov             x15, x8                 // default so the table load below stays valid with one candidate
        add             x9, x9, w12, uxtw #1    // cand_b += bw8*2
        cmp             x9, x10
        mov             v2.8b, v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]          // cand_b->bs
        add             x16, x9, #8
        ld1             {v4.16b}, [x9]          // cand_b->mv
        add             x15, x8, w15, uxtw #3
        ld1             {v1.h}[1], [x16]        // cand_b->ref
        ldr             w12, [x15]              // bw8
        add             x9, x9, w12, uxtw #1    // cand_b += bw8*2
        trn1            v2.2d, v0.2d, v4.2d

3:
        abs             v2.8h, v2.8h            // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
        ushr            v2.8h, v2.8h, #12       // abs(mv[].xy) >> 12
        umull           v1.8h, v1.8b, v29.8b    // ref_sign[ref] * {1, 2}
        cmeq            v2.4s, v2.4s, #0        // abs(mv[].xy) < 4096 (both components shifted down to zero)
        xtn             v2.4h, v2.4s            // abs() condition to 16 bit
        and             v1.8b, v1.8b, v2.8b     // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h, v1.4h, v1.4h     // Combine condition for [1] and [0]
        umov            w16, v1.h[0]            // Extract case for first block
        umov            w17, v1.h[1]
        ldrsw           x11, [x11, #4]          // Fetch jump table entry
        ldrsw           x15, [x15, #4]
        ldr             q1, [x13, w16, uxtw #4] // Load permutation table base on case
        ldr             q5, [x13, w17, uxtw #4]
        add             x11, x8, x11            // Find jump table target
        add             x15, x8, x15
        tbl             v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b, v0.16b, v0.16b, #1
        ext             v5.16b, v4.16b, v4.16b, #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b, v0.16b, v1.16b, #4
        ext             v6.16b, v4.16b, v5.16b, #4

        // The flags tested after each call come from the preceding
        // "cmp x9, x10"; nothing in between (vector ops, loads, the store
        // routines) modifies them.
        blr             x11
        b.ge            4f  // if (cand_b >= end)
        mov             v0.16b, v4.16b
        mov             v1.16b, v5.16b
        mov             v2.16b, v6.16b
        cmp             x9, x10
        blr             x15
        b.lt            2b  // if (cand_b < end)

4:
        subs            w5, w5, #1              // h--
        add             w7, w7, #2              // y += 2
        add             x0, x0, x1              // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

        // Store routines, called with blr. Each writes N 5-byte entries from
        // v0-v2 at x3 and advances x3 by N * 5.
10:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3, x3, #5
        ret
20:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3, x3, #2*5
        ret
40:
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1, [x3, #16]
        add             x3, x3, #4*5
        ret
80:
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
        stur            q2, [x3, #(8*5-16)]
        add             x3, x3, #8*5
        ret
160:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3, #6*5
        add             x17, x3, #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2, [x3, #(16*5-16)]
        add             x3, x3, #16*5
        ret
endfunc

// Indexed by the block-size byte of a candidate, 8 bytes per entry:
//   word 0: bw8 * 12, doubled by the reader to step cand_b over bw8*2 12-byte entries
//   word 1: offset of the matching store routine relative to this table
jumptable save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 16 * 12
        .word 160b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 8 * 12
        .word 80b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 4 * 12
        .word 40b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 2 * 12
        .word 20b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
        .word 1 * 12
        .word 10b - save_tmvs_tbl
endjumptable

// void dav2d_load_tmvs_neon(const refmvs_frame *const rf, int tile_row_idx,
//                           const int col_start8, const int col_end8,
//                           const int row_start8, int row_end8)
function load_tmvs_neon, export=1
        // Symbolic names for the argument registers and for the scratch
        // registers used below.
        rf              .req x0
        tile_row_idx    .req w1
        col_start8      .req w2
        col_end8        .req w3
        row_start8      .req w4
        row_end8        .req w5
        col_start8i     .req w6
        col_end8i       .req w7
        rp_proj         .req x8
        stride5         .req x9
        wstride5        .req w9

        stp             x28, x27, [sp, #-96]!
stp x26, x25, [sp, #16] stp x24, x23, [sp, #32] stp x22, x21, [sp, #48] stp x20, x19, [sp, #64] stp x29, x30, [sp, #80] ldr w15, [rf, #RMVSF_N_TILE_THREADS] ldp w16, w17, [rf, #RMVSF_IW8] // include rf->ih8 too sub col_start8i, col_start8, #8 // col_start8 - 8 add col_end8i, col_end8, #8 // col_end8 + 8 ldr wstride5, [rf, #RMVSF_RP_STRIDE] ldr rp_proj, [rf, #RMVSF_RP_PROJ] cmp w15, #1 csel tile_row_idx, wzr, tile_row_idx, eq // if (rf->n_tile_threads == 1) tile_row_idx = 0 bic col_start8i, col_start8i, col_start8i, asr #31 // imax(col_start8 - 8, 0) cmp col_end8i, w16 csel col_end8i, col_end8i, w16, lt // imin(col_end8 + 8, rf->iw8) lsl tile_row_idx, tile_row_idx, #4 // 16 * tile_row_idx cmp row_end8, w17 csel row_end8, row_end8, w17, lt // imin(row_end8, rf->ih8) add wstride5, wstride5, wstride5, lsl #2 // stride * sizeof(refmvs_temporal_block) and w15, row_start8, #15 // row_start8 & 15 add w10, col_start8, col_start8, lsl #2 // col_start8 * sizeof(refmvs_temporal_block) smaddl rp_proj, tile_row_idx, wstride5, rp_proj // &rf->rp_proj[16 * stride * tile_row_idx] smaddl x10, w15, wstride5, x10 // ((row_start8 & 15) * stride + col_start8) * sizeof(refmvs_temporal_block) mov w15, #INVALID_MV sub w11, col_end8, col_start8 // xfill loop count add x10, x10, rp_proj // &rf->rp_proj[16 * stride * tile_row_idx + (row_start8 & 15) * stride + col_start8] add x15, x15, x15, lsl #40 // first 64b of 4 [INVALID_MV, 0]... 
patterns mov w17, #(INVALID_MV >> 8) // last 32b of 4 patterns sub w12, row_end8, row_start8 // yfill loop count ror x16, x15, #48 // second 64b of 4 patterns ldr w19, [rf, #RMVSF_N_MFMVS] 5: // yfill loop and w13, w11, #-4 // xfill 4x count by patterns mov x14, x10 // fill_ptr = row_ptr add x10, x10, stride5 // row_ptr += stride sub w12, w12, #1 // y-- cbz w13, 3f 4: // xfill loop 4x sub w13, w13, #4 // xfill 4x count -= 4 stp x15, x16, [x14] str w17, [x14, #16] add x14, x14, #20 // fill_ptr += 4 * sizeof(refmvs_temporal_block) cbnz w13, 4b 3: // up to 3 residuals tbz w11, #1, 1f str x15, [x14] strh w16, [x14, #8] add x14, x14, #10 // fill_ptr += 2 * sizeof(refmvs_temporal_block) 1: // up to 1 residual tbz w11, #0, 2f str w15, [x14] 2: cbnz w12, 5b // yfill loop cbz w19, 11f // if (!rf->n_mfmvs) skip nloop add x29, rf, #RMVSF_MFMV_REF2CUR mov w10, #0 // n = 0 movi v3.2s, #255 // 0x3FFF >> 6, for MV clamp movrel x1, div_mult_tbl 10: // nloop ldrsb w16, [x29, x10] // ref2cur = rf->mfmv_ref2cur[n] cmp w16, #-32 b.eq 9f // if (ref2cur == INVALID_REF2CUR) continue add x17, x10, #(RMVSF_MFMV_REF - RMVSF_MFMV_REF2CUR) // n - (&rf->mfmv_ref - &rf->mfmv_ref2cur) mov x20, #4 ldrb w17, [x29, x17] // ref = rf->mfmv_ref[n] ldr x13, [x29, #(RMVSF_RP_REF - RMVSF_MFMV_REF2CUR)] sub x21, x10, x10, lsl #3 // -(n * 7) smaddl x20, row_start8, wstride5, x20 // row_start8 * stride * sizeof(refmvs_temporal_block) + 4 mov w12, row_start8 // y = row_start8 add x28, x29, #(RMVSF_MFMV_REF2REF - RMVSF_MFMV_REF2CUR - 1) // &rf->mfmv_ref2ref - 1 ldr x13, [x13, x17, lsl #3] // rf->rp_ref[ref] sub x28, x28, x21 // rf->mfmv_ref2ref[n] - 1 sub w17, w17, #4 // ref_sign = ref - 4 add x13, x13, x20 // r = &rf->rp_ref[ref][row_start8 * stride].ref dup v0.2s, w17 // ref_sign 5: // yloop and w14, w12, #-8 // y_sb_align = y & ~7 mov w11, col_start8i // x = col_start8i add w15, w14, #8 // y_sb_align + 8 cmp w14, row_start8 csel w14, w14, row_start8, gt // imax(y_sb_align, row_start8) cmp w15, row_end8 
csel w15, w15, row_end8, lt // imin(y_sb_align + 8, row_end8) 4: // xloop add x23, x13, x11, lsl #2 // partial &r[x] address ldrb w22, [x23, x11] // b_ref = rb->ref cbz w22, 6f // if (!b_ref) continue ldrb w24, [x28, x22] // ref2ref = rf->mfmv_ref2ref[n][b_ref - 1] cbz w24, 6f // if (!ref2ref) continue ldrh w20, [x1, x24, lsl #1] // div_mult[ref2ref] add x23, x23, x11 // &r[x] mul w20, w20, w16 // frac = ref2cur * div_mult[ref2ref] ldur s1, [x23, #-4] // mv{y, x} = rb->mv fmov s2, w20 // frac sxtl v1.4s, v1.4h mul v1.2s, v1.2s, v2.s[0] // offset{y, x} = frac * mv{y, x} ssra v1.2s, v1.2s, #31 // offset{y, x} + (offset{y, x} >> 31) ldur w25, [x23, #-4] // b_mv = rb->mv srshr v1.2s, v1.2s, #14 // (offset{y, x} + (offset{y, x} >> 31) + 8192) >> 14 abs v2.2s, v1.2s // abs(offset{y, x}) eor v1.8b, v1.8b, v0.8b // offset{y, x} ^ ref_sign sshr v2.2s, v2.2s, #6 // abs(offset{y, x}) >> 6 cmlt v1.2s, v1.2s, #0 // sign(offset{y, x} ^ ref_sign): -1 or 0 umin v2.2s, v2.2s, v3.2s // iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6) neg v4.2s, v2.2s bsl v1.8b, v4.8b, v2.8b // apply_sign(iclip(abs(offset{y, x}) >> 6, 0, 0x3FFF >> 6)) fmov x20, d1 // offset{y, x} add w21, w12, w20 // pos_y = y + offset.y cmp w21, w14 // pos_y >= y_proj_start b.lt 1f cmp w21, w15 // pos_y < y_proj_end b.ge 1f add x26, x11, x20, asr #32 // pos_x = x + offset.x and w27, w21, #15 // pos_y & 15 add x21, x26, x26, lsl #2 // pos_x * sizeof(refmvs_temporal_block) umaddl x27, w27, wstride5, rp_proj // &rp_proj[(pos_y & 15) * stride] add x27, x27, x21 // &rp_proj[(pos_y & 15) * stride + pos_x] 3: // copy loop and w20, w11, #-8 // x_sb_align = x & ~7 sub w21, w20, #8 // x_sb_align - 8 cmp w21, col_start8 csel w21, w21, col_start8, gt // imax(x_sb_align - 8, col_start8) cmp w26, w21 // pos_x >= imax(x_sb_align - 8, col_start8) b.lt 2f add w20, w20, #16 // x_sb_align + 16 cmp w20, col_end8 csel w20, w20, col_end8, lt // imin(x_sb_align + 16, col_end8) cmp w26, w20 // pos_x < imin(x_sb_align + 16, col_end8) b.ge 2f 
str w25, [x27] // rp_proj[pos + pos_x].mv = rb->mv (b_mv) strb w24, [x27, #4] // rp_proj[pos + pos_x].ref = ref2ref 2: // search part of copy loop add w11, w11, #1 // x++ cmp w11, col_end8i // if (++x >= col_end8i) break xloop b.ge 8f ldrb w20, [x23, #5]! // rb++; rb->ref cmp w20, w22 // if (rb->ref != b_ref) break b.ne 7f ldur w21, [x23, #-4] // rb->mv.n cmp w21, w25 // if (rb->mv.n != b_mv.n) break b.ne 7f add w26, w26, #1 // pos_x++ add x27, x27, #5 // advance &rp_proj[(pos_y & 15) * stride + pos_x] b 3b // copy loop 1: // search loop add w11, w11, #1 // x++ cmp w11, col_end8i // if (++x >= col_end8i) break xloop b.ge 8f ldrb w20, [x23, #5]! // rb++; rb->ref cmp w20, w22 // if (rb->ref != b_ref) break b.ne 7f ldur w21, [x23, #-4] // rb->mv.n cmp w21, w25 // if (rb->mv.n == b_mv.n) continue b.eq 1b // search loop 7: cmp w11, col_end8i // x < col_end8i b.lt 4b // xloop 6: // continue case of xloop add w11, w11, #1 // x++ cmp w11, col_end8i // x < col_end8i b.lt 4b // xloop 8: add w12, w12, #1 // y++ add x13, x13, stride5 // r += stride cmp w12, row_end8 // y < row_end8 b.lt 5b // yloop 9: add w10, w10, #1 cmp w10, w19 // n < rf->n_mfmvs b.lt 10b // nloop 11: ldp x29, x30, [sp, #80] ldp x20, x19, [sp, #64] ldp x22, x21, [sp, #48] ldp x24, x23, [sp, #32] ldp x26, x25, [sp, #16] ldp x28, x27, [sp], #96 ret .unreq rf .unreq tile_row_idx .unreq col_start8 .unreq col_end8 .unreq row_start8 .unreq row_end8 .unreq col_start8i .unreq col_end8i .unreq rp_proj .unreq stride5 .unreq wstride5 endfunc const div_mult_tbl .hword 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340 .hword 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092 .hword 1024, 963, 910, 862, 819, 780, 744, 712 .hword 682, 655, 630, 606, 585, 564, 546, 528 endconst dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/64/util.S000066400000000000000000000301501517466257200227430ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN 
and dav2d authors * Copyright © 2015 Martin Storsjo * Copyright © 2015 Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #ifndef DAV2D_SRC_ARM_64_UTIL_S #define DAV2D_SRC_ARM_64_UTIL_S #include "config.h" #include "src/arm/asm.S" #ifndef __has_feature #define __has_feature(x) 0 #endif .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 adrp \rd, \val@PAGE add \rd, \rd, \val@PAGEOFF sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset)@PAGE add \rd, \rd, \val+(\offset)@PAGEOFF .endif #elif defined(PIC) && defined(_WIN32) .if \offset < 0 adrp \rd, \val add \rd, \rd, :lo12:\val sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif #elif __has_feature(hwaddress_sanitizer) adrp \rd, :pg_hi21_nc:\val+(\offset) movk \rd, #:prel_g3:\val+0x100000000 add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) #else ldr \rd, =\val+\offset #endif .endm .macro sub_sp space #ifdef _WIN32 .if \space > 8192 // Here, we'd need to touch two (or more) pages while decrementing // the stack pointer. 
.error "sub_sp doesn't support values over 8K at the moment" .elseif \space > 4096 /* Windows, 4K < space <= 8K: load from sp - 4096 (result discarded into xzr) so that page is touched before sp is moved down by the remaining amount. */ sub x16, sp, #4096 ldr xzr, [x16] sub sp, x16, #(\space - 4096) .else sub sp, sp, #\space .endif #else /* Elsewhere: subtract the multiple-of-4096 part and the remainder separately (presumably so each immediate fits the sub encoding; confirm). */ .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif #endif .endm /* transpose_8x8b_xtl: the lane comments name the eight input rows a-h (r0-r7); the rows are interleaved with zip1/trn1/trn2 and each result is then widened from bytes to .8h with the caller-supplied xtl instruction. */ .macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 zip1 \r0\().16b, \r0\().16b, \r1\().16b // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7 zip1 \r2\().16b, \r2\().16b, \r3\().16b // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 zip1 \r4\().16b, \r4\().16b, \r5\().16b // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7 zip1 \r6\().16b, \r6\().16b, \r7\().16b // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 trn1 \r1\().8h, \r0\().8h, \r2\().8h // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 trn2 \r3\().8h, \r0\().8h, \r2\().8h // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 trn1 \r5\().8h, \r4\().8h, \r6\().8h // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 trn2 \r7\().8h, \r4\().8h, \r6\().8h // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 trn1 \r0\().4s, \r1\().4s, \r5\().4s // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 trn2 \r2\().4s, \r1\().4s, \r5\().4s // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 trn1 \r1\().4s, \r3\().4s, \r7\().4s // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 trn2 \r3\().4s, \r3\().4s, \r7\().4s \xtl\()2 \r4\().8h, \r0\().16b \xtl \r0\().8h, \r0\().8b \xtl\()2 \r6\().8h, \r2\().16b \xtl \r2\().8h, \r2\().8b \xtl\()2 \r5\().8h, \r1\().16b \xtl \r1\().8h, \r1\().8b \xtl\()2 \r7\().8h, \r3\().16b \xtl \r3\().8h, \r3\().8b .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().8h, \r0\().8h, \r1\().8h trn2 \t9\().8h, \r0\().8h, \r1\().8h trn1 \r1\().8h, \r2\().8h, \r3\().8h trn2 \r3\().8h, \r2\().8h, \r3\().8h trn1 \r0\().8h, \r4\().8h, \r5\().8h trn2 \r5\().8h, \r4\().8h, \r5\().8h trn1 \r2\().8h, \r6\().8h,
\r7\().8h trn2 \r7\().8h, \r6\().8h, \r7\().8h trn1 \r4\().4s, \r0\().4s, \r2\().4s trn2 \r2\().4s, \r0\().4s, \r2\().4s trn1 \r6\().4s, \r5\().4s, \r7\().4s trn2 \r7\().4s, \r5\().4s, \r7\().4s trn1 \r5\().4s, \t9\().4s, \r3\().4s trn2 \t9\().4s, \t9\().4s, \r3\().4s trn1 \r3\().4s, \t8\().4s, \r1\().4s trn2 \t8\().4s, \t8\().4s, \r1\().4s trn1 \r0\().2d, \r3\().2d, \r4\().2d trn2 \r4\().2d, \r3\().2d, \r4\().2d trn1 \r1\().2d, \r5\().2d, \r6\().2d trn2 \r5\().2d, \r5\().2d, \r6\().2d trn2 \r6\().2d, \t8\().2d, \r2\().2d trn1 \r2\().2d, \t8\().2d, \r2\().2d trn1 \r3\().2d, \t9\().2d, \r7\().2d trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm .macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7 trn1 \t8\().8h, \r0\().8h, \r1\().8h trn2 \t9\().8h, \r0\().8h, \r1\().8h trn1 \r1\().8h, \r2\().8h, \r3\().8h trn2 \r3\().8h, \r2\().8h, \r3\().8h trn1 \r0\().8h, \r4\().8h, \r5\().8h trn2 \r5\().8h, \r4\().8h, \r5\().8h trn1 \r2\().8h, \r6\().8h, \r7\().8h trn2 \r7\().8h, \r6\().8h, \r7\().8h trn1 \r4\().4s, \r0\().4s, \r2\().4s trn2 \r2\().4s, \r0\().4s, \r2\().4s trn1 \r6\().4s, \r5\().4s, \r7\().4s trn2 \r7\().4s, \r5\().4s, \r7\().4s trn1 \r5\().4s, \t9\().4s, \r3\().4s trn2 \t9\().4s, \t9\().4s, \r3\().4s trn1 \r3\().4s, \t8\().4s, \r1\().4s trn2 \t8\().4s, \t8\().4s, \r1\().4s trn1 \o0\().2d, \r3\().2d, \r4\().2d trn2 \o4\().2d, \r3\().2d, \r4\().2d trn1 \o1\().2d, \r5\().2d, \r6\().2d trn2 \o5\().2d, \r5\().2d, \r6\().2d trn2 \o6\().2d, \t8\().2d, \r2\().2d trn1 \o2\().2d, \t8\().2d, \r2\().2d trn1 \o3\().2d, \t9\().2d, \r7\().2d trn2 \o7\().2d, \t9\().2d, \r7\().2d .endm .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b trn1 \r1\().16b, \r2\().16b, \r3\().16b trn2 \r3\().16b, \r2\().16b, \r3\().16b trn1 \r0\().16b, \r4\().16b, \r5\().16b trn2 \r5\().16b, \r4\().16b, \r5\().16b trn1 \r2\().16b, \r6\().16b, \r7\().16b trn2 \r7\().16b, 
\r6\().16b, \r7\().16b trn1 \r4\().8h, \r0\().8h, \r2\().8h trn2 \r2\().8h, \r0\().8h, \r2\().8h trn1 \r6\().8h, \r5\().8h, \r7\().8h trn2 \r7\().8h, \r5\().8h, \r7\().8h trn1 \r5\().8h, \t9\().8h, \r3\().8h trn2 \t9\().8h, \t9\().8h, \r3\().8h trn1 \r3\().8h, \t8\().8h, \r1\().8h trn2 \t8\().8h, \t8\().8h, \r1\().8h trn1 \r0\().4s, \r3\().4s, \r4\().4s trn2 \r4\().4s, \r3\().4s, \r4\().4s trn1 \r1\().4s, \r5\().4s, \r6\().4s trn2 \r5\().4s, \r5\().4s, \r6\().4s trn2 \r6\().4s, \t8\().4s, \r2\().4s trn1 \r2\().4s, \t8\().4s, \r2\().4s trn1 \r3\().4s, \t9\().4s, \r7\().4s trn2 \r7\().4s, \t9\().4s, \r7\().4s .endm .macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().16b, \r0\().16b, \r1\().16b trn2 \t5\().16b, \r0\().16b, \r1\().16b trn1 \t6\().16b, \r2\().16b, \r3\().16b trn2 \t7\().16b, \r2\().16b, \r3\().16b trn1 \r0\().8h, \t4\().8h, \t6\().8h trn2 \r2\().8h, \t4\().8h, \t6\().8h trn1 \r1\().8h, \t5\().8h, \t7\().8h trn2 \r3\().8h, \t5\().8h, \t7\().8h .endm .macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4h, \r0\().4h, \r1\().4h trn2 \t5\().4h, \r0\().4h, \r1\().4h trn1 \t6\().4h, \r2\().4h, \r3\().4h trn2 \t7\().4h, \r2\().4h, \r3\().4h trn1 \r0\().2s, \t4\().2s, \t6\().2s trn2 \r2\().2s, \t4\().2s, \t6\().2s trn1 \r1\().2s, \t5\().2s, \t7\().2s trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm .macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4s, \r0\().4s, \r1\().4s trn2 \t5\().4s, \r0\().4s, \r1\().4s trn1 \t6\().4s, \r2\().4s, \r3\().4s trn2 \t7\().4s, \r2\().4s, \r3\().4s trn1 \r0\().2d, \t4\().2d, \t6\().2d trn2 \r2\().2d, \t4\().2d, \t6\().2d trn1 \r1\().2d, \t5\().2d, \t7\().2d trn2 \r3\().2d, \t5\().2d, \t7\().2d .endm .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h trn1 \t6\().8h, \r2\().8h, \r3\().8h trn2 \t7\().8h, \r2\().8h, \r3\().8h trn1 \r0\().4s, \t4\().4s, \t6\().4s trn2 \r2\().4s, \t4\().4s, \t6\().4s trn1 \r1\().4s, 
\t5\().4s, \t7\().4s trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm .macro transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h trn1 \t6\().8h, \r2\().8h, \r3\().8h trn2 \t7\().8h, \r2\().8h, \r3\().8h trn1 \o0\().4s, \t4\().4s, \t6\().4s trn2 \o2\().4s, \t4\().4s, \t6\().4s trn1 \o1\().4s, \t5\().4s, \t7\().4s trn2 \o3\().4s, \t5\().4s, \t7\().4s .endm #endif /* DAV2D_SRC_ARM_64_UTIL_S */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/arm-arch.h000066400000000000000000000047301517466257200232610ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef ARM_ARM_ARCH_H #define ARM_ARM_ARCH_H /* Compatibility header to define __ARM_ARCH with older compilers */ #ifndef __ARM_ARCH #ifdef _M_ARM #define __ARM_ARCH _M_ARM #elif defined(__ARM_ARCH_8A__) || defined(_M_ARM64) #define __ARM_ARCH 8 #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ defined(__ARM_ARCH_7EM__) || defined(__ARM_ARCH_7R__) || \ defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) #define __ARM_ARCH 7 #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || \ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) #define __ARM_ARCH 6 #elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) #define __ARM_ARCH 5 #elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) #define __ARM_ARCH 4 #elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__) #define __ARM_ARCH 3 #elif defined(__ARM_ARCH_2__) #define __ARM_ARCH 2 #else #error Unknown ARM architecture version #endif #endif /* !__ARM_ARCH */ #endif /* ARM_ARM_ARCH_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/asm-offsets.h000066400000000000000000000046221517466257200240160ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef ARM_ASM_OFFSETS_H #define ARM_ASM_OFFSETS_H #include "config.h" #define FGD_SEED 0 #define FGD_AR_COEFF_LAG 92 #define FGD_AR_COEFFS_Y 96 #define FGD_AR_COEFFS_UV 120 #define FGD_AR_COEFF_SHIFT 176 #define FGD_GRAIN_SCALE_SHIFT 184 #define FGD_SCALING_SHIFT 88 #define FGD_UV_MULT 188 #define FGD_UV_LUMA_MULT 196 #define FGD_UV_OFFSET 204 #define FGD_CLIP_TO_RESTRICTED_RANGE 216 #if ARCH_AARCH64 #define RMVSF_IW8 16 #define RMVSF_IH8 20 #define RMVSF_MFMV_REF 53 #define RMVSF_MFMV_REF2CUR 56 #define RMVSF_MFMV_REF2REF 59 #define RMVSF_N_MFMVS 80 #define RMVSF_RP_REF 96 #define RMVSF_RP_PROJ 104 #define RMVSF_RP_STRIDE 112 #define RMVSF_N_TILE_THREADS 128 #endif #endif /* ARM_ASM_OFFSETS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/asm.S000066400000000000000000000252361517466257200223260ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_ARM_ASM_S #define DAV2D_SRC_ARM_ASM_S #include "config.h" #if ARCH_AARCH64 /* Map x18/w18 to invalid names so that any use of that register in the assembly sources fails to assemble (presumably because x18 is reserved by some platform ABIs; confirm). */ #define x18 do_not_use_x18 #define w18 do_not_use_w18 #if HAVE_AS_ARCH_DIRECTIVE .arch AS_ARCH_LEVEL #endif /* Each ENABLE_x / DISABLE_x pair expands to the matching .arch_extension directive when the assembler supports it, and to nothing otherwise. */ #if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE #define ENABLE_DOTPROD .arch_extension dotprod #define DISABLE_DOTPROD .arch_extension nodotprod #else #define ENABLE_DOTPROD #define DISABLE_DOTPROD #endif #if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE #define ENABLE_I8MM .arch_extension i8mm #define DISABLE_I8MM .arch_extension noi8mm #else #define ENABLE_I8MM #define DISABLE_I8MM #endif #if HAVE_AS_ARCHEXT_SVE_DIRECTIVE #define ENABLE_SVE .arch_extension sve #define DISABLE_SVE .arch_extension nosve #else #define ENABLE_SVE #define DISABLE_SVE #endif #if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE #define ENABLE_SVE2 .arch_extension sve2 #define DISABLE_SVE2 .arch_extension nosve2 #else #define ENABLE_SVE2 #define DISABLE_SVE2 #endif /* If we do support the .arch_extension directives, disable support for all * the extensions that we may use, in case they were implicitly enabled by * the .arch level. This makes it clear if we try to assemble an instruction * from an unintended extension set; we only allow assembling such instructions * within regions where we explicitly enable those extensions. */ DISABLE_DOTPROD DISABLE_I8MM DISABLE_SVE DISABLE_SVE2 /* Support macros for * - Armv8.3-A Pointer Authentication and * - Armv8.5-A Branch Target Identification * features which require emitting a .note.gnu.property section with the * appropriate architecture-dependent feature bits set. * * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be * used immediately before saving the LR register (x30) to the stack. * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring * it.
Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also * have the same value at the two points. For example: * * .global f * f: * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an * indirect call target. In particular, all symbols exported from a file must * begin with one of these macros. For example, a leaf function that does not * save LR can instead use |AARCH64_VALID_CALL_TARGET|: * * .globl return_zero * return_zero: * AARCH64_VALID_CALL_TARGET * mov x0, #0 * ret * * A non-leaf function which does not immediately save LR may need both macros * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function * may jump to an alternate implementation before setting up the stack: * * .globl with_early_jump * with_early_jump: * AARCH64_VALID_CALL_TARGET * cmp x0, #128 * b.lt .Lwith_early_jump_128 * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * .Lwith_early_jump_128: * ... * ret * * These annotations are only required with indirect calls. Private symbols that * are only the target of direct calls do not require annotations. Also note * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not * indirect jumps (BR). Indirect jumps in assembly are supported through * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. * * Although not necessary, it is safe to use these macros in 32-bit ARM * assembly. This may be used to simplify dual 32-bit and 64-bit files. 
* * References: * - "ELF for the Arm® 64-bit Architecture" * https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst * - "Providing protection for complex software" * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software */ #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) #define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc' #define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' #define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j' #else #define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET #define AARCH64_VALID_CALL_TARGET #define AARCH64_VALID_JUMP_TARGET #endif #if defined(__ARM_FEATURE_PAC_DEFAULT) #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A #define AARCH64_SIGN_LINK_REGISTER paciasp #define AARCH64_VALIDATE_LINK_REGISTER autiasp #elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else #error Pointer authentication defines no valid key! #endif #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions #error Authentication of leaf functions is enabled but not supported in dav2d!
#endif #define GNU_PROPERTY_AARCH64_PAC (1 << 1) #elif defined(__APPLE__) && defined(__arm64e__) #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else /* __ARM_FEATURE_PAC_DEFAULT */ #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER #define AARCH64_VALIDATE_LINK_REGISTER #endif /* !__ARM_FEATURE_PAC_DEFAULT */ #if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) .pushsection .note.gnu.property, "a" .balign 8 .long 4 .long 0x10 .long 0x5 .asciz "GNU" .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ .long 4 .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) .long 0 .popsection #endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ #endif /* ARCH_AARCH64 */ #if ARCH_ARM .syntax unified #ifdef __ELF__ .arch armv7-a .fpu neon .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch .section .note.GNU-stack,"",%progbits // Mark stack as non-executable #endif /* __ELF__ */ #ifdef _WIN32 #define CONFIG_THUMB 1 #else #define CONFIG_THUMB 0 #endif #if CONFIG_THUMB .thumb #define A @ #define T #else #define A #define T @ #endif /* CONFIG_THUMB */ #endif /* ARCH_ARM */ #if !defined(PIC) #if defined(__PIC__) #define PIC __PIC__ #elif defined(__pic__) #define PIC __pic__ #endif #endif #ifndef PRIVATE_PREFIX #define PRIVATE_PREFIX dav2d_ #endif #define PASTE(a,b) a ## b #define CONCAT(a,b) PASTE(a,b) #ifdef PREFIX #define EXTERN CONCAT(_,PRIVATE_PREFIX) #else #define EXTERN PRIVATE_PREFIX #endif .macro function name, export=0, align=2 .macro endfunc #ifdef __ELF__ .size \name, . 
- \name #endif #if HAVE_AS_FUNC .endfunc #endif .purgem endfunc .endm /* Remainder of the function macro: switch to .text, apply the requested alignment and emit the label(s). With export set, the EXTERN-prefixed symbol is made global (hidden on ELF, private_extern on Mach-O) in addition to the plain label, and on AArch64 the entry point starts with AARCH64_VALID_CALL_TARGET. */ .text .align \align .if \export .global EXTERN\name #ifdef __ELF__ .type EXTERN\name, %function .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name #endif EXTERN\name: .else #ifdef __ELF__ .type \name, %function #endif #if HAVE_AS_FUNC .func \name #endif .endif \name: #if ARCH_AARCH64 .if \export AARCH64_VALID_CALL_TARGET .endif #endif .endm /* const name, export=0, align=2: start a read-only data object in .rdata (Windows), .const_data (Mach-O) or .rodata (everything else), optionally exporting an EXTERN-prefixed symbol. Defines a one-shot endconst macro that records the ELF symbol size and then purges itself. */ .macro const name, export=0, align=2 .macro endconst #ifdef __ELF__ .size \name, . - \name #endif .purgem endconst .endm #if defined(_WIN32) .section .rdata #elif !defined(__MACH__) .section .rodata #else .const_data #endif .align \align .if \export .global EXTERN\name #ifdef __ELF__ .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif EXTERN\name: .endif \name: .endm /* jumptable name: open a jump table; expands to the function macro on Windows and to the const macro elsewhere, for the reasons given below. */ .macro jumptable name #ifdef _WIN32 // MS armasm64 doesn't seem to be able to create relocations for subtraction // of labels in different sections; for armasm64 (and all of Windows for // simplicity), write the jump table in the text section, to allow calculating // differences at assembly time. See // https://developercommunity.visualstudio.com/t/armasm64-unable-to-create-cross-section/10722340 // for reference. (LLVM can create such relocations, but checking for _WIN32 // for simplicity, as execute-only memory isn't relevant on Windows at the // moment.) function \name #else // For other platforms, write jump tables in a const data section, to allow // working in environments where executable memory isn't readable.
// Closes a table opened with the jumptable macro. jumptable expands to
// "function" on _WIN32 and to "const" everywhere else, so the matching
// terminator (endfunc or endconst) is selected with the same condition.
.macro endjumptable
#ifdef _WIN32
    endfunc
#else
    endconst
#endif
.endm
/*
 * DEFINE_FILTER(w, h, tmp_stride) generates cdef_filter_<w>x<h>_neon(), a
 * static C wrapper around the two NEON routines for a block of the given
 * size:
 *
 *   1. dav2d_cdef_padding<w> writes into a 16-byte aligned uint16_t stack
 *      buffer, taking dst/left/top/bottom and the edge flags as inputs.
 *   2. dav2d_cdef_filter<w> reads that buffer and writes the result to dst.
 *
 * The stack buffer holds 12 * tmp_stride + 8 elements and the pointer handed
 * to the asm starts 2 * tmp_stride + 8 elements into it.
 * NOTE(review): presumably this leaves two rows above and a margin to the
 * left for the padding the asm writes - confirm against the assembly.
 *
 * `edges` is forwarded to the filter as well; per the prototype comment above
 * this lets the asm pick a faster path for fully edged blocks.
 */
#define DEFINE_FILTER(w, h, tmp_stride)                                      \
static void                                                                  \
cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride,             \
                             const pixel (*left)[2],                         \
                             const pixel *const top,                         \
                             const pixel *const bottom,                      \
                             const int pri_strength, const int sec_strength, \
                             const int dir, const int damping,               \
                             const enum CdefEdgeFlags edges                  \
                             HIGHBD_DECL_SUFFIX)                             \
{                                                                            \
    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,);                   \
    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8;                            \
    BF(dav2d_cdef_padding##w, neon)(tmp, dst, stride,                        \
                                    left, top, bottom, h, edges);            \
    BF(dav2d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength,           \
                                   sec_strength, dir, damping, h, edges      \
                                   HIGHBD_TAIL_SUFFIX);                      \
}
BF(dav2d_cdef_find_dir, neon); c->fb[0] = cdef_filter_8x8_neon; c->fb[1] = cdef_filter_4x8_neon; c->fb[2] = cdef_filter_4x4_neon; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/cpu.c000066400000000000000000000161051517466257200223500ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * Looks up the integer sysctl called `feature` (for example
 * "hw.optional.arm.FEAT_DotProd").
 *
 * Returns the value reported by the kernel, or 0 when the lookup fails, so
 * an unknown key is treated the same as an absent feature.
 */
static int have_feature(const char *feature) {
    int value = 0;
    size_t value_size = sizeof(value);
    const int rc = sysctlbyname(feature, &value, &value_size, NULL, 0);

    return rc == 0 ? value : 0;
}
/*
 * Reports whether `flag` appears as a whole word anywhere in /proc/cpuinfo.
 *
 * A hit counts as a whole word when it is preceded by the start of the line
 * buffer or a non-graphic character, and followed by whitespace or by the
 * end of the file.
 *
 * Returns 1 when the flag is found and 0 otherwise, including when the file
 * cannot be opened or `flag` is the empty string.
 */
static unsigned parse_proc_cpuinfo(const char *flag) {
    const size_t flaglen = strlen(flag);
    // An empty needle matches everywhere and would never advance the
    // search below; treat it as "not present".
    if (!flaglen)
        return 0;

    FILE *file = fopen("/proc/cpuinfo", "r");
    if (!file)
        return 0;

    char line_buffer[120];
    const char *line;
    while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
        // check all occurrences as whole words
        const char *found = line;
        while ((found = strstr(found, flag))) {
            // <ctype.h> functions require an unsigned char value (or EOF);
            // passing a negative plain char is undefined behaviour.
            const unsigned char next = (unsigned char)found[flaglen];
            const int starts_word = found == line_buffer ||
                                    !isgraph((unsigned char)found[-1]);
            // At end of file the word may be terminated by the end of the
            // data instead of whitespace, but only if nothing follows it.
            const int ends_word = isspace(next) ||
                                  (next == '\0' && feof(file));
            if (starts_word && ends_word) {
                fclose(file);
                return 1;
            }
            found += flaglen;
        }
        // if line is incomplete seek back to avoid splitting the search
        // string into two buffers
        if (!strchr(line, '\n') && strlen(line) > flaglen) {
            // use fseek since the 64 bit fseeko is only available since
            // Android API level 24 and meson defines _FILE_OFFSET_BITS
            // by default 64
            if (fseek(file, -(long)flaglen, SEEK_CUR))
                break;
        }
    }
    fclose(file);

    return 0;
}
/*
 * ARM CPU feature bits. Every enumerator occupies its own bit so the values
 * can be OR-ed together into the unsigned mask that
 * dav2d_get_cpu_flags_arm() returns.
 */
enum CpuFlags {
    DAV2D_ARM_CPU_FLAG_NEON    = 0x01,
    DAV2D_ARM_CPU_FLAG_DOTPROD = 0x02,
    DAV2D_ARM_CPU_FLAG_I8MM    = 0x04,
    DAV2D_ARM_CPU_FLAG_SVE     = 0x08,
    DAV2D_ARM_CPU_FLAG_SVE2    = 0x10,
};
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/deblock.h" decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_y, neon)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_y, neon)); decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_uv, neon)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_uv, neon)); static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav2dDeblockDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return; c->deblock_sb[0][0] = BF(dav2d_lpf_h_sb_y, neon); c->deblock_sb[0][1] = BF(dav2d_lpf_v_sb_y, neon); c->deblock_sb[1][0] = BF(dav2d_lpf_h_sb_uv, neon); c->deblock_sb[1][1] = BF(dav2d_lpf_v_sb_uv, neon); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/filmgrain.h000066400000000000000000000222741517466257200235420ustar00rootroot00000000000000/* * Copyright © 2018, Niklas Haas * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * Copyright © 2021, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/filmgrain.h" #include "asm-offsets.h" CHECK_OFFSET(Dav2dFilmGrainData, seed, FGD_SEED); CHECK_OFFSET(Dav2dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); CHECK_OFFSET(Dav2dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); CHECK_OFFSET(Dav2dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); CHECK_OFFSET(Dav2dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); CHECK_OFFSET(Dav2dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); CHECK_OFFSET(Dav2dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); CHECK_OFFSET(Dav2dFilmGrainData, uv_mult, FGD_UV_MULT); CHECK_OFFSET(Dav2dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); CHECK_OFFSET(Dav2dFilmGrainData, uv_offset, FGD_UV_OFFSET); CHECK_OFFSET(Dav2dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); void BF(dav2d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], const Dav2dFilmGrainData *const data HIGHBD_DECL_SUFFIX); #define GEN_GRAIN_UV(suff) \ void BF(dav2d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ const entry buf_y[][GRAIN_WIDTH], \ const Dav2dFilmGrainData *const data, \ const intptr_t uv \ HIGHBD_DECL_SUFFIX) GEN_GRAIN_UV(420); GEN_GRAIN_UV(422); GEN_GRAIN_UV(444); // Use ptrdiff_t instead of int for the last few parameters, to get the // same layout of parameters on the stack across platforms. 
void BF(dav2d_fgy_32x32, neon)(pixel *const dst, const pixel *const src, const ptrdiff_t stride, const uint8_t scaling[SCALING_SIZE], const int scaling_shift, const entry grain_lut[][GRAIN_WIDTH], const int offsets[][2], const int h, const ptrdiff_t clip, const ptrdiff_t type HIGHBD_DECL_SUFFIX); static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, const ptrdiff_t stride, const Dav2dFilmGrainData *const data, const size_t pw, const uint8_t scaling[SCALING_SIZE], const entry grain_lut[][GRAIN_WIDTH], const int bh, const int row_num HIGHBD_DECL_SUFFIX) { const int rows = 1 + (data->overlap_flag && row_num > 0); // seed[0] contains the current row, seed[1] contains the previous unsigned seed[2]; for (int i = 0; i < rows; i++) { seed[i] = data->seed; seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); } int offsets[2 /* col offset */][2 /* row offset */]; // process this row in FG_BLOCK_SIZE^2 blocks for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) { if (data->overlap_flag && bx) { // shift previous offsets left for (int i = 0; i < rows; i++) offsets[1][i] = offsets[0][i]; } // update current offsets for (int i = 0; i < rows; i++) offsets[0][i] = get_random_number(8, &seed[i]); int type = 0; if (data->overlap_flag && row_num) type |= 1; /* overlap y */ if (data->overlap_flag && bx) type |= 2; /* overlap x */ BF(dav2d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, scaling, data->scaling_shift, grain_lut, offsets, bh, data->clip_to_restricted_range, type HIGHBD_TAIL_SUFFIX); } } // Use ptrdiff_t instead of int for the last few parameters, to get the // parameters on the stack with the same layout across platforms. 
#define FGUV(nm, sx, sy) \ void BF(dav2d_fguv_32x32_##nm, neon)(pixel *const dst, \ const pixel *const src, \ const ptrdiff_t stride, \ const uint8_t scaling[SCALING_SIZE], \ const Dav2dFilmGrainData *const data, \ const entry grain_lut[][GRAIN_WIDTH], \ const pixel *const luma_row, \ const ptrdiff_t luma_stride, \ const int offsets[][2], \ const ptrdiff_t h, const ptrdiff_t uv, \ const ptrdiff_t is_id, \ const ptrdiff_t type \ HIGHBD_DECL_SUFFIX); \ static void \ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ const ptrdiff_t stride, const Dav2dFilmGrainData *const data, \ const size_t pw, const uint8_t scaling[SCALING_SIZE], \ const entry grain_lut[][GRAIN_WIDTH], const int bh, \ const int row_num, const pixel *const luma_row, \ const ptrdiff_t luma_stride, const int uv, const int is_id \ HIGHBD_DECL_SUFFIX) \ { \ const int rows = 1 + (data->overlap_flag && row_num > 0); \ \ /* seed[0] contains the current row, seed[1] contains the previous */ \ unsigned seed[2]; \ for (int i = 0; i < rows; i++) { \ seed[i] = data->seed; \ seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ } \ \ int offsets[2 /* col offset */][2 /* row offset */]; \ \ /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \ for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \ if (data->overlap_flag && bx) { \ /* shift previous offsets left */ \ for (int i = 0; i < rows; i++) \ offsets[1][i] = offsets[0][i]; \ } \ \ /* update current offsets */ \ for (int i = 0; i < rows; i++) \ offsets[0][i] = get_random_number(8, &seed[i]); \ \ int type = 0; \ if (data->overlap_flag && row_num) \ type |= 1; /* overlap y */ \ if (data->overlap_flag && bx) \ type |= 2; /* overlap x */ \ if (data->chroma_scaling_from_luma) \ type |= 4; \ \ BF(dav2d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ scaling, data, grain_lut, \ luma_row + (bx << sx), luma_stride, \ offsets, bh, uv, is_id, type \ 
HIGHBD_TAIL_SUFFIX); \ } \ } FGUV(420, 1, 1); FGUV(422, 1, 0); FGUV(444, 0, 0); static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav2dFilmGrainDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return; c->generate_grain_y = BF(dav2d_generate_grain_y, neon); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_generate_grain_uv_420, neon); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_generate_grain_uv_422, neon); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_generate_grain_uv_444, neon); c->fgy_32x32xn = fgy_32x32xn_neon; c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/ipred.h000066400000000000000000000363361517466257200227010ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
/*
 * Zone 1 directional intra prediction: C driver around the NEON helpers.
 * NOTE: this sits inside an "#if ARCH_AARCH64 && 0" region, so it is
 * currently compiled out.
 *
 * `angle` is a packed argument: bits 0-8 hold the angle, bit 9 is is_sm and
 * the bits from 10 upwards enable the intra edge filter.
 *
 * The top edge is prepared into top_out in one of three ways (upsampled,
 * filtered, or copied verbatim from topleft_in[1]), padded past max_base_x
 * with the last valid pixel, and then handed to the fill1/fill2 kernel.
 * max_width and max_height are not used by this function.
 */
static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
                          const pixel *const topleft_in,
                          const int width, const int height, int angle,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    // Unpack the flags carried in the upper bits of angle.
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    int dx = dav2d_dr_intra_derivative[angle];
    pixel top_out[64 + 64 + (64+15)*2 + 16];
    int max_base_x;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, 90 - angle, is_sm) : 0;
    if (upsample_above) {
        BF(dav2d_ipred_z1_upsample_edge, neon)(top_out, width + height,
                                               topleft_in,
                                               width + imin(width, height)
                                               HIGHBD_TAIL_SUFFIX);
        max_base_x = 2 * (width + height) - 2;
        // The upsampled edge has twice the resolution, so the step doubles.
        dx <<= 1;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
        if (filter_strength) {
            BF(dav2d_ipred_z1_filter_edge, neon)(top_out, width + height,
                                                 topleft_in,
                                                 width + imin(width, height),
                                                 filter_strength);
            max_base_x = width + height - 1;
        } else {
            // No processing: use the pixels to the right of the top-left one.
            max_base_x = width + imin(width, height) - 1;
            memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel));
        }
    }
    // Entries consumed per output pixel step: 2 when upsampled, else 1.
    const int base_inc = 1 + upsample_above;
    int pad_pixels = width + 15; // max(dx >> 6) == 15
    // Replicate the last valid edge pixel beyond max_base_x.
    BF(dav2d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
                                    top_out[max_base_x], pad_pixels * base_inc);
    if (upsample_above)
        BF(dav2d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
                                       dx, max_base_x);
    else
        BF(dav2d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height,
                                       dx, max_base_x);
}
/*
 * Zone 2 directional intra prediction (90 < angle < 180): C driver around
 * the NEON helpers.
 * NOTE: this sits inside an "#if ARCH_AARCH64 && 0" region, so it is
 * currently compiled out.
 *
 * `angle` is a packed argument: bits 0-8 hold the angle, bit 9 is is_sm and
 * the bits from 10 upwards enable the intra edge filter.
 *
 * Both the top and the left edge are prepared independently (upsampled,
 * filtered, or copied); the left edge is reversed first with
 * dav2d_ipred_reverse. At most one of the two edges may be upsampled, and
 * that choice selects the fill1/fill2/fill3 kernel.
 */
static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride,
                          const pixel *const topleft_in,
                          const int width, const int height, int angle,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    // Unpack the flags carried in the upper bits of angle.
    const int is_sm = (angle >> 9) & 0x1;
    const int enable_intra_edge_filter = angle >> 10;
    angle &= 511;
    assert(angle > 90 && angle < 180);
    int dy = dav2d_dr_intra_derivative[angle - 90];
    int dx = dav2d_dr_intra_derivative[180 - angle];
    const int upsample_left = enable_intra_edge_filter ?
        get_upsample(width + height, 180 - angle, is_sm) : 0;
    const int upsample_above = enable_intra_edge_filter ?
        get_upsample(width + height, angle - 90, is_sm) : 0;
    // One stack allocation split into three (64+1)-pixel regions, in memory
    // order: flipped, top, left.
    pixel buf[3*(64+1)];
    pixel *left = &buf[2*(64+1)];
    // The asm can underread below the start of top[] and left[]; to avoid
    // surprising behaviour, make sure this is within the allocated stack space.
    pixel *top = &buf[1*(64+1)];
    pixel *flipped = &buf[0*(64+1)];

    // Top edge.
    if (upsample_above) {
        BF(dav2d_ipred_z2_upsample_edge, neon)(top, width, topleft_in
                                               HIGHBD_TAIL_SUFFIX);
        dx <<= 1;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, angle - 90, is_sm) : 0;

        if (filter_strength) {
            BF(dav2d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width),
                                                 topleft_in, width,
                                                 filter_strength);
            // Only the first max_width pixels are filtered; the remainder is
            // copied unfiltered.
            if (max_width < width)
                memcpy(&top[1 + max_width], &topleft_in[1 + max_width],
                       (width - max_width) * sizeof(pixel));
        } else {
            pixel_copy(&top[1], &topleft_in[1], width);
        }
    }
    // Left edge: same three cases, applied to the reversed left column.
    if (upsample_left) {
        flipped[0] = topleft_in[0];
        BF(dav2d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
                                      height);
        BF(dav2d_ipred_z2_upsample_edge, neon)(left, height, flipped
                                               HIGHBD_TAIL_SUFFIX);
        dy <<= 1;
    } else {
        const int filter_strength = enable_intra_edge_filter ?
            get_filter_strength(width + height, 180 - angle, is_sm) : 0;

        if (filter_strength) {
            flipped[0] = topleft_in[0];
            BF(dav2d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
                                          height);
            BF(dav2d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height),
                                                 flipped, height,
                                                 filter_strength);
            // Only the first max_height pixels are filtered; the remainder is
            // copied unfiltered.
            if (max_height < height)
                memcpy(&left[1 + max_height], &flipped[1 + max_height],
                       (height - max_height) * sizeof(pixel));
        } else {
            BF(dav2d_ipred_reverse, neon)(&left[1], &topleft_in[0],
                                          height);
        }
    }
    // Both prepared edges share the top-left corner pixel at index 0.
    top[0] = left[0] = *topleft_in;

    assert(!(upsample_above && upsample_left));
    if (!upsample_above && !upsample_left) {
        BF(dav2d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height,
                                       dx, dy);
    } else if (upsample_above) {
        BF(dav2d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height,
                                       dx, dy);
    } else /*if (upsample_left)*/ {
        BF(dav2d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height,
                                       dx, dy);
    }
}
get_upsample(width + height, angle - 180, is_sm) : 0; if (upsample_left) { flipped[0] = topleft_in[0]; BF(dav2d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], height + imax(width, height)); BF(dav2d_ipred_z1_upsample_edge, neon)(left_out, width + height, flipped, height + imin(width, height) HIGHBD_TAIL_SUFFIX); max_base_y = 2 * (width + height) - 2; dy <<= 1; } else { const int filter_strength = enable_intra_edge_filter ? get_filter_strength(width + height, angle - 180, is_sm) : 0; if (filter_strength) { flipped[0] = topleft_in[0]; BF(dav2d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], height + imax(width, height)); BF(dav2d_ipred_z1_filter_edge, neon)(left_out, width + height, flipped, height + imin(width, height), filter_strength); max_base_y = width + height - 1; } else { BF(dav2d_ipred_reverse, neon)(left_out, &topleft_in[0], height + imin(width, height)); max_base_y = height + imin(width, height) - 1; } } const int base_inc = 1 + upsample_left; // The tbx based implementation needs left[] to have 64 bytes intitialized, // the other implementation can read height + max(dy >> 6) past the end. 
int pad_pixels = imax(64 - max_base_y - 1, height + 15); BF(dav2d_ipred_pixel_set, neon)(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc); if (upsample_left) BF(dav2d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height, dy, max_base_y); else BF(dav2d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height, dy, max_base_y); } #endif static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav2dIntraPredDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return; #if ARCH_AARCH64 && 0 c->intra_pred[DC_PRED] = BF(dav2d_ipred_dc, neon); c->intra_pred[DC_128_PRED] = BF(dav2d_ipred_dc_128, neon); c->intra_pred[TOP_DC_PRED] = BF(dav2d_ipred_dc_top, neon); c->intra_pred[LEFT_DC_PRED] = BF(dav2d_ipred_dc_left, neon); c->intra_pred[HOR_PRED] = BF(dav2d_ipred_h, neon); c->intra_pred[VERT_PRED] = BF(dav2d_ipred_v, neon); c->intra_pred[PAETH_PRED] = BF(dav2d_ipred_paeth, neon); c->intra_pred[SMOOTH_PRED] = BF(dav2d_ipred_smooth, neon); #endif #if ARCH_AARCH64 && BITDEPTH == 8 c->intra_pred[SMOOTH_H_PRED] = BF(dav2d_ipred_smooth_h, neon); c->intra_pred[SMOOTH_V_PRED] = BF(dav2d_ipred_smooth_v, neon); #endif #if ARCH_AARCH64 && 0 c->intra_pred[Z1_PRED] = ipred_z1_neon; c->intra_pred[Z2_PRED] = ipred_z2_neon; c->intra_pred[Z3_PRED] = ipred_z3_neon; //c->intra_pred[DIP_PRED] = BF(dav2d_ipred_dip, neon); c->cfl_pred[DC_PRED] = BF(dav2d_ipred_cfl, neon); c->cfl_pred[DC_128_PRED] = BF(dav2d_ipred_cfl_128, neon); c->cfl_pred[TOP_DC_PRED] = BF(dav2d_ipred_cfl_top, neon); c->cfl_pred[LEFT_DC_PRED] = BF(dav2d_ipred_cfl_left, neon); c->cfl_ac[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_ipred_cfl_ac_420, neon); c->cfl_ac[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_ipred_cfl_ac_422, neon); c->cfl_ac[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_ipred_cfl_ac_444, neon); #endif #if ARCH_AARCH64 && BITDEPTH == 8 c->pal_pred = BF(dav2d_pal_pred, neon); #endif } 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/itx.h000066400000000000000000000063311517466257200223720ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/cpu.h"
#include "src/itx.h"

// Prototypes for the NEON inverse transform kernels, one macro invocation
// per transform block size (the decl_* macros are defined outside this
// file).
decl_itx17_fns( 4, 4, neon);
decl_itx16_fns( 4, 8, neon);
decl_itx16_fns( 4, 16, neon);
decl_itx16_fns( 8, 4, neon);
decl_itx16_fns( 8, 8, neon);
decl_itx16_fns( 8, 16, neon);
decl_itx2_fns ( 8, 32, neon);
decl_itx16_fns(16, 4, neon);
decl_itx16_fns(16, 8, neon);
decl_itx12_fns(16, 16, neon);
decl_itx2_fns (16, 32, neon);
decl_itx2_fns (32, 8, neon);
decl_itx2_fns (32, 16, neon);
decl_itx2_fns (32, 32, neon);

decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_16x64, neon));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_32x64, neon));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x16, neon));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x64, neon));

// Registers the NEON inverse transform functions.
//
// Nothing is registered when the CPU does not report NEON. The 4x4 WHT is
// registered for every bit depth; in a BITDEPTH == 16 build every other
// size is registered only when bpc == 10. *all_simd is set to 1 only when
// the function runs to the end, i.e. it is left untouched on both early
// returns.
// NOTE(review): the assign_* macros are defined elsewhere; they presumably
// store the function pointers into `c` — confirm against src/itx.h.
static ALWAYS_INLINE void itx_dsp_init_arm(Dav2dInvTxfmDSPContext *const c, int bpc, int *const all_simd) {
    const unsigned flags = dav2d_get_cpu_flags();

    if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return;

    assign_itx_fn( , 4, 4, wht_wht, WHT_WHT, neon);

    // High bit depth builds only have kernels for 10 bpc.
    if (BITDEPTH == 16 && bpc != 10) return;

    assign_itx16_fn( , 4, 4, neon);
    assign_itx16_fn(R, 4, 8, neon);
    assign_itx16_fn(R, 4, 16, neon);
    assign_itx16_fn(R, 8, 4, neon);
    assign_itx16_fn( , 8, 8, neon);
    assign_itx16_fn(R, 8, 16, neon);
    assign_itx2_fn (R, 8, 32, neon);
    assign_itx16_fn(R, 16, 4, neon);
    assign_itx16_fn(R, 16, 8, neon);
    assign_itx12_fn( , 16, 16, neon);
    assign_itx2_fn (R, 16, 32, neon);
    assign_itx1_fn (R, 16, 64, neon);
    assign_itx2_fn (R, 32, 8, neon);
    assign_itx2_fn (R, 32, 16, neon);
    assign_itx2_fn ( , 32, 32, neon);
    assign_itx1_fn (R, 32, 64, neon);
    assign_itx1_fn (R, 64, 16, neon);
    assign_itx1_fn (R, 64, 32, neon);
    assign_itx1_fn ( , 64, 64, neon);
    *all_simd = 1;
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/looprestoration.h000066400000000000000000001252561517466257200250410ustar00rootroot00000000000000/*
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/looprestoration.h" #if ARCH_AARCH64 void BF(dav2d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void BF(dav2d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); #else // The 8bpc version calculates things slightly differently than the reference // C version. 
That version calculates roughly this: // int16_t sum = 0; // for (int i = 0; i < 7; i++) // sum += src[idx] * fh[i]; // int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; // sum += 1 << (bitdepth + 6 - round_bits_h); // Compared to the reference C version, this is the output of the first pass // _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. // with round_offset precompensated. // The 16bpc version calculates things pretty much the same way as the // reference C version, but with the end result subtracted by // 1 << (bitdepth + 6 - round_bits_h). void BF(dav2d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], const pixel *src, const int16_t fh[8], const int w, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); // This calculates things slightly differently than the reference C version. // This version calculates roughly this: // int32_t sum = 0; // for (int i = 0; i < 7; i++) // sum += mid[idx] * fv[i]; // sum = (sum + rounding_off_v) >> round_bits_v; // This function assumes that the width is a multiple of 8. 
void BF(dav2d_wiener_filter_v, neon)(pixel *dst, int16_t **ptrs,
                                     const int16_t fv[8], const int w
                                     HIGHBD_DECL_SUFFIX);

void BF(dav2d_wiener_filter_hv, neon)(pixel *dst, const pixel (*left)[4],
                                      const pixel *src,
                                      const int16_t filter[2][8],
                                      const int w,
                                      const enum LrEdgeFlags edges,
                                      int16_t **ptrs
                                      HIGHBD_DECL_SUFFIX);

// Wiener filter driver for builds where ARCH_AARCH64 is false (this is the
// #else branch of the ARCH_AARCH64 check above).
//
// Horizontally filtered rows are kept in `hor` (6 rows of 384 int16_t).
// `ptrs` is the window of row pointers handed to the vertical (_v) and
// combined (_hv) kernels. The first rows are primed with the horizontal
// kernel only; once the window is full each _hv call filters one new row
// and writes one output row at `p`. The labels at the end flush the rows
// still pending when the input runs out: v3 issues three vertical passes,
// v2 two and v1 one.
// NOTE(review): nothing in this function rotates `ptrs` between calls, so
// the _hv/_v kernels presumably advance the window themselves — confirm
// against the assembly.
static void wiener_filter_neon(pixel *p, const ptrdiff_t stride,
                               const pixel (*left)[4], const pixel *lpf,
                               const int w, int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges
                               HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int16_t, hor, 6 * 384,);
    int16_t *ptrs[7], *rows[6];
    for (int i = 0; i < 6; i++)
        rows[i] = &hor[i * 384];
    const int16_t (*const filter)[8] = params->filter;
    const int16_t *fh = params->filter[0];
    const int16_t *fv = params->filter[1];
    // The rows used below the unit start 6 rows into lpf.
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    const pixel *src = p;
    if (edges & LR_HAVE_TOP) {
        // Two rows above the unit come from lpf; rows[0] is used twice at
        // the top of the window.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[1];
        ptrs[3] = rows[2];
        ptrs[4] = rows[2];
        ptrs[5] = rows[2];

        BF(dav2d_wiener_filter_h, neon)(rows[0], NULL, lpf, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        lpf += PXSTRIDE(stride);
        BF(dav2d_wiener_filter_h, neon)(rows[1], NULL, lpf, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);

        BF(dav2d_wiener_filter_h, neon)(rows[2], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[3];
        BF(dav2d_wiener_filter_h, neon)(rows[3], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[4];
        BF(dav2d_wiener_filter_h, neon)(rows[4], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    } else {
        // No rows above: every window slot starts on the first image row.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[0];
        ptrs[3] = rows[0];
        ptrs[4] = rows[0];
        ptrs[5] = rows[0];

        BF(dav2d_wiener_filter_h, neon)(rows[0], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[1];
        BF(dav2d_wiener_filter_h, neon)(rows[1], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[2];
        BF(dav2d_wiener_filter_h, neon)(rows[2], left, src, fh, w,
                                        edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        // From here on each call also produces an output row, so p advances.
        ptrs[6] = rows[3];
        BF(dav2d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
                                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        ptrs[6] = rows[4];
        BF(dav2d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
                                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    }

    // The newest slot is the row buffer 384 entries after ptrs[5].
    ptrs[6] = ptrs[5] + 384;
    do {
        BF(dav2d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
                                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto v3;

    // Two rows below the unit are read from lpf_bottom, with no left edge.
    BF(dav2d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs
                                     HIGHBD_TAIL_SUFFIX);
    lpf_bottom += PXSTRIDE(stride);
    p += PXSTRIDE(stride);

    BF(dav2d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs
                                     HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v1:
    BF(dav2d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);

    return;

v3:
    BF(dav2d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v2:
    BF(dav2d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    goto v1;
}
#endif

// Rotates the first n entries of both pointer arrays left by one: entry 0
// moves to slot n - 1 and every other entry moves one slot down.
static void rotate_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    int32_t *tmp32 = sumsq_ptrs[0];
    int16_t *tmp16 = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmp16;
}

// Rotates both 5-entry pointer arrays left by two: entries 0 and 1 move to
// slots 3 and 4, entries 2..4 move to slots 0..2.
static void rotate5_x2_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    int32_t *tmp32[2];
    int16_t *tmp16[2];
    for (int i = 0; i < 2; i++) {
        tmp32[i] = sumsq_ptrs[i];
        tmp16[i] = sum_ptrs[i];
    }
    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
        sum_ptrs[i] = sum_ptrs[i + 2];
    }
    for (int i = 0; i < 2; i++) {
        sumsq_ptrs[3 + i] = tmp32[i];
        sum_ptrs[3 + i] = tmp16[i];
    }
}

// Horizontal box-sum kernels (assembly). box35 fills the 3x3 and the 5x5
// sums in a single call.
void BF(dav2d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav2d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges);
void BF(dav2d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
                                     int32_t *sumsq5, int16_t *sum5,
                                     const pixel (*left)[4],
                                     const pixel *src, const int w,
                                     const enum LrEdgeFlags edges);

// The ARCH_ARM build uses separate per-row kernels; the other branch
// declares the fused variants.
#if ARCH_ARM
void dav2d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w);
void dav2d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
                               int32_t *sumsq_out, int16_t *sum_out,
                               const int w);
void dav2d_sgr_calc_row_ab1_neon(int32_t *AA, int16_t *BB, int w, int s,
                                 int bitdepth_max);
void dav2d_sgr_calc_row_ab2_neon(int32_t *AA, int16_t *BB, int w, int s,
                                 int bitdepth_max);
void BF(dav2d_sgr_finish_filter_row1, neon)(int16_t *tmp, const pixel *src,
                                            int32_t **A_ptrs, int16_t **B_ptrs,
                                            const int w);
void BF(dav2d_sgr_weighted_row1, neon)(pixel *dst,
                                       const int16_t *t1, const int w,
                                       const int wt HIGHBD_DECL_SUFFIX);
#else
void dav2d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);
void dav2d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
                              int32_t *AA, int16_t *BB,
                              const int w, const int s,
                              const int bitdepth_max);

void BF(dav2d_sgr_finish_weighted1, neon)(pixel *dst,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int w1
                                          HIGHBD_DECL_SUFFIX);
void BF(dav2d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
                                          int32_t **A_ptrs, int16_t **B_ptrs,
                                          const int w, const int h,
                                          const int w1 HIGHBD_DECL_SUFFIX);

void BF(dav2d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
                                              const ptrdiff_t src_stride,
int32_t **A_ptrs, int16_t **B_ptrs, const int w, const int h); #endif void BF(dav2d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride, int32_t **A_ptrs, int16_t **B_ptrs, const int w, const int h); void BF(dav2d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, const int16_t *t1, const int16_t *t2, const int w, const int h, const int16_t wt[2] HIGHBD_DECL_SUFFIX); static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, int32_t *sumsq_out, int16_t *sum_out, const int w, const int s, const int bitdepth_max) { #if ARCH_ARM dav2d_sgr_box3_row_v_neon(sumsq, sum, sumsq_out, sum_out, w); dav2d_sgr_calc_row_ab1_neon(sumsq_out, sum_out, w, s, bitdepth_max); #else // box3_v + calc_ab1 dav2d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); #endif rotate_neon(sumsq, sum, 3); } static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, int32_t *sumsq_out, int16_t *sum_out, const int w, const int s, const int bitdepth_max) { #if ARCH_ARM dav2d_sgr_box5_row_v_neon(sumsq, sum, sumsq_out, sum_out, w); dav2d_sgr_calc_row_ab2_neon(sumsq_out, sum_out, w, s, bitdepth_max); #else // box5_v + calc_ab2 dav2d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); #endif rotate5_x2_neon(sumsq, sum); } static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum, int32_t *AA, int16_t *BB, const pixel (*left)[4], const pixel *src, const int w, const int s, const enum LrEdgeFlags edges, const int bitdepth_max) { BF(dav2d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges); sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max); } static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride, int32_t **A_ptrs, int16_t **B_ptrs, const int w, const int w1 HIGHBD_DECL_SUFFIX) { #if ARCH_ARM ALIGN_STK_16(int16_t, tmp, 384,); BF(dav2d_sgr_finish_filter_row1, neon)(tmp, *dst, A_ptrs, B_ptrs, w); BF(dav2d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); #else 
BF(dav2d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs, w, w1 HIGHBD_TAIL_SUFFIX); #endif *dst += PXSTRIDE(stride); rotate_neon(A_ptrs, B_ptrs, 3); } #define ARM_FILTER_OUT_STRIDE 384 static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride, int32_t **A_ptrs, int16_t **B_ptrs, const int w, const int h, const int w1 HIGHBD_DECL_SUFFIX) { #if ARCH_ARM ALIGN_STK_16(int16_t, tmp, 2*ARM_FILTER_OUT_STRIDE,); BF(dav2d_sgr_finish_filter2_2rows, neon)(tmp, *dst, stride, A_ptrs, B_ptrs, w, h); BF(dav2d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); *dst += PXSTRIDE(stride); if (h > 1) { BF(dav2d_sgr_weighted_row1, neon)(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX); *dst += PXSTRIDE(stride); } #else BF(dav2d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs, w, h, w1 HIGHBD_TAIL_SUFFIX); *dst += 2*PXSTRIDE(stride); #endif rotate_neon(A_ptrs, B_ptrs, 2); } static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride, int32_t **A5_ptrs, int16_t **B5_ptrs, int32_t **A3_ptrs, int16_t **B3_ptrs, const int w, const int h, const int w0, const int w1 HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp5, 2*ARM_FILTER_OUT_STRIDE,); ALIGN_STK_16(int16_t, tmp3, 2*ARM_FILTER_OUT_STRIDE,); BF(dav2d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h); #if ARCH_ARM BF(dav2d_sgr_finish_filter_row1, neon)(tmp3, *dst, A3_ptrs, B3_ptrs, w); BF(dav2d_sgr_finish_filter_row1, neon)(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride), &A3_ptrs[1], &B3_ptrs[1], w); #else BF(dav2d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride, A3_ptrs, B3_ptrs, w, h); #endif const int16_t wt[2] = { w0, w1 }; BF(dav2d_sgr_weighted2, neon)(*dst, stride, tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX); *dst += h*PXSTRIDE(stride); rotate_neon(A5_ptrs, B5_ptrs, 2); rotate_neon(A3_ptrs, B3_ptrs, 4); } static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride, const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams 
*const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Self-guided 3x3 filter driver: uses params->sgr.s1 for the A/B
    // calculation and params->sgr.w1 as the output weight.
    //
    // Three rows of box sums (sumsq/sum) and three rows of A/B coefficients
    // are kept as rotating windows. The window is primed first (from lpf
    // when LR_HAVE_TOP is set, otherwise by reusing the first image row).
    // The main loop then consumes one input row and emits one output row
    // per iteration. The labels at the end flush the remaining output rows
    // by repeating the last available sum row.
#define ARM_BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 3 + 16,);
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    int16_t *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    int16_t *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE];
    }
    // Input is read from the same picture rows that are written (src
    // starts at dst); the rows below the unit start 6 rows into lpf.
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        // Two rows above the unit come from lpf (no left edge pixels).
        BF(dav2d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav2d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_neon(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_neon(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;
    } else {
        // No rows above: all three window slots start on the first row.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        BF(dav2d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                           w, params->sgr.s1, BITDEPTH_MAX);
        rotate_neon(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate_neon(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    // Steady state: one new input row in, one output row out.
    do {
        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the unit are read from lpf_bottom.
    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Two output rows still pending: repeat the last sum row as padding.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    // One output row still pending.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Only one input row was available: compute one more A/B row from
    // padding, rotate it in, then emit the single output row.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_neon(A_ptrs, B_ptrs, 3);
    goto output_1;
}

// Self-guided 5x5 filter driver; see the comments at the top of its body
// for the buffering scheme.
static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
                                const pixel (*left)[4], const pixel *lpf,
                                const int w, int h,
                                const LooprestorationParams *const params,
                                const enum
LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Uses params->sgr.s0 for the A/B calculation and params->sgr.w0 as
    // the output weight.
    //
    // Five rows of box sums and two rows of A/B coefficients are kept as
    // rotating windows. Input is consumed two rows at a time:
    // sgr_box5_vert_neon() is run once per row pair and sgr_finish2_neon()
    // emits two output rows (or one, for the final odd row). The labels at
    // the end pad the window by repeating the last sum row.
    ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    int16_t *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    int16_t *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE];
    }
    // Input is read from the same picture rows that are written (src
    // starts at dst); the rows below the unit start 6 rows into lpf.
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // rows[0] is used for the two topmost window slots.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       NULL, lpf, w, edges);

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_neon(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        // No rows above: all five window slots start on the first row.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        rotate_neon(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    // Steady state: two input rows in, two output rows out per iteration.
    do {
        BF(dav2d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        BF(dav2d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                       left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                           w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the unit are read from lpf_bottom.
    BF(dav2d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
                                   NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    BF(dav2d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
                                   NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], w,
params->sgr.s0, BITDEPTH_MAX); rotate_neon(A_ptrs, B_ptrs, 2); goto output_1; } static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride, const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int32_t, sumsq5_buf, ARM_BUF_STRIDE * 5 + 16,); ALIGN_STK_16(int16_t, sum5_buf, ARM_BUF_STRIDE * 5 + 16,); int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; int16_t *sum5_ptrs[5], *sum5_rows[5]; for (int i = 0; i < 5; i++) { sumsq5_rows[i] = &sumsq5_buf[i * ARM_BUF_STRIDE]; sum5_rows[i] = &sum5_buf[i * ARM_BUF_STRIDE]; } ALIGN_STK_16(int32_t, sumsq3_buf, ARM_BUF_STRIDE * 3 + 16,); ALIGN_STK_16(int16_t, sum3_buf, ARM_BUF_STRIDE * 3 + 16,); int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; int16_t *sum3_ptrs[3], *sum3_rows[3]; for (int i = 0; i < 3; i++) { sumsq3_rows[i] = &sumsq3_buf[i * ARM_BUF_STRIDE]; sum3_rows[i] = &sum3_buf[i * ARM_BUF_STRIDE]; } ALIGN_STK_16(int32_t, A5_buf, ARM_BUF_STRIDE * 2 + 16,); ALIGN_STK_16(int16_t, B5_buf, ARM_BUF_STRIDE * 2 + 16,); int32_t *A5_ptrs[2]; int16_t *B5_ptrs[2]; for (int i = 0; i < 2; i++) { A5_ptrs[i] = &A5_buf[i * ARM_BUF_STRIDE]; B5_ptrs[i] = &B5_buf[i * ARM_BUF_STRIDE]; } ALIGN_STK_16(int32_t, A3_buf, ARM_BUF_STRIDE * 4 + 16,); ALIGN_STK_16(int16_t, B3_buf, ARM_BUF_STRIDE * 4 + 16,); int32_t *A3_ptrs[4]; int16_t *B3_ptrs[4]; for (int i = 0; i < 4; i++) { A3_ptrs[i] = &A3_buf[i * ARM_BUF_STRIDE]; B3_ptrs[i] = &B3_buf[i * ARM_BUF_STRIDE]; } const pixel *src = dst; const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); if (edges & LR_HAVE_TOP) { sumsq5_ptrs[0] = sumsq5_rows[0]; sumsq5_ptrs[1] = sumsq5_rows[0]; sumsq5_ptrs[2] = sumsq5_rows[1]; sumsq5_ptrs[3] = sumsq5_rows[2]; sumsq5_ptrs[4] = sumsq5_rows[3]; sum5_ptrs[0] = sum5_rows[0]; sum5_ptrs[1] = sum5_rows[0]; sum5_ptrs[2] = sum5_rows[1]; sum5_ptrs[3] = sum5_rows[2]; sum5_ptrs[4] = sum5_rows[3]; sumsq3_ptrs[0] = sumsq3_rows[0]; sumsq3_ptrs[1] = sumsq3_rows[1]; sumsq3_ptrs[2] = 
sumsq3_rows[2]; sum3_ptrs[0] = sum3_rows[0]; sum3_ptrs[1] = sum3_rows[1]; sum3_ptrs[2] = sum3_rows[2]; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], sumsq5_rows[0], sum5_rows[0], NULL, lpf, w, edges); lpf += PXSTRIDE(stride); BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], sumsq5_rows[1], sum5_rows[1], NULL, lpf, w, edges); BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], sumsq5_rows[2], sum5_rows[2], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto vert_1; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], sumsq5_rows[3], sum5_rows[3], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); rotate_neon(A5_ptrs, B5_ptrs, 2); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto vert_2; // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set // one of them to point at the previously unused rows[4]. 
sumsq5_ptrs[3] = sumsq5_rows[4]; sum5_ptrs[3] = sum5_rows[4]; } else { sumsq5_ptrs[0] = sumsq5_rows[0]; sumsq5_ptrs[1] = sumsq5_rows[0]; sumsq5_ptrs[2] = sumsq5_rows[0]; sumsq5_ptrs[3] = sumsq5_rows[0]; sumsq5_ptrs[4] = sumsq5_rows[0]; sum5_ptrs[0] = sum5_rows[0]; sum5_ptrs[1] = sum5_rows[0]; sum5_ptrs[2] = sum5_rows[0]; sum5_ptrs[3] = sum5_rows[0]; sum5_ptrs[4] = sum5_rows[0]; sumsq3_ptrs[0] = sumsq3_rows[0]; sumsq3_ptrs[1] = sumsq3_rows[0]; sumsq3_ptrs[2] = sumsq3_rows[0]; sum3_ptrs[0] = sum3_rows[0]; sum3_ptrs[1] = sum3_rows[0]; sum3_ptrs[2] = sum3_rows[0]; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], sumsq5_rows[0], sum5_rows[0], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto vert_1; sumsq5_ptrs[4] = sumsq5_rows[1]; sum5_ptrs[4] = sum5_rows[1]; sumsq3_ptrs[2] = sumsq3_rows[1]; sum3_ptrs[2] = sum3_rows[1]; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], sumsq5_rows[1], sum5_rows[1], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); rotate_neon(A5_ptrs, B5_ptrs, 2); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto vert_2; sumsq5_ptrs[3] = sumsq5_rows[2]; sumsq5_ptrs[4] = sumsq5_rows[3]; sum5_ptrs[3] = sum5_rows[2]; sum5_ptrs[4] = sum5_rows[3]; sumsq3_ptrs[2] = sumsq3_rows[2]; sum3_ptrs[2] = sum3_rows[2]; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], sumsq5_rows[2], sum5_rows[2], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto odd; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 
sumsq5_rows[3], sum5_rows[3], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, w, 2, params->sgr.w0, params->sgr.w1 HIGHBD_TAIL_SUFFIX); if (--h <= 0) goto vert_2; // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set // one of them to point at the previously unused rows[4]. sumsq5_ptrs[3] = sumsq5_rows[4]; sum5_ptrs[3] = sum5_rows[4]; } do { BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], sumsq5_ptrs[3], sum5_ptrs[3], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); if (--h <= 0) goto odd; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], sumsq5_ptrs[4], sum5_ptrs[4], left, src, w, edges); left++; src += PXSTRIDE(stride); sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, w, 2, params->sgr.w0, params->sgr.w1 HIGHBD_TAIL_SUFFIX); } while (--h > 0); if (!(edges & LR_HAVE_BOTTOM)) goto vert_2; BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], sumsq5_ptrs[3], sum5_ptrs[3], NULL, lpf_bottom, w, edges); lpf_bottom += PXSTRIDE(stride); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); BF(dav2d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], sumsq5_ptrs[4], sum5_ptrs[4], NULL, lpf_bottom, w, edges); output_2: sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, 
BITDEPTH_MAX); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, w, 2, params->sgr.w0, params->sgr.w1 HIGHBD_TAIL_SUFFIX); return; vert_2: // Duplicate the last row twice more sumsq5_ptrs[3] = sumsq5_ptrs[2]; sumsq5_ptrs[4] = sumsq5_ptrs[2]; sum5_ptrs[3] = sum5_ptrs[2]; sum5_ptrs[4] = sum5_ptrs[2]; sumsq3_ptrs[2] = sumsq3_ptrs[1]; sum3_ptrs[2] = sum3_ptrs[1]; sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); sumsq3_ptrs[2] = sumsq3_ptrs[1]; sum3_ptrs[2] = sum3_ptrs[1]; goto output_2; odd: // Copy the last row as padding once sumsq5_ptrs[4] = sumsq5_ptrs[3]; sum5_ptrs[4] = sum5_ptrs[3]; sumsq3_ptrs[2] = sumsq3_ptrs[1]; sum3_ptrs[2] = sum3_ptrs[1]; sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, w, 2, params->sgr.w0, params->sgr.w1 HIGHBD_TAIL_SUFFIX); output_1: // Duplicate the last row twice more sumsq5_ptrs[3] = sumsq5_ptrs[2]; sumsq5_ptrs[4] = sumsq5_ptrs[2]; sum5_ptrs[3] = sum5_ptrs[2]; sum5_ptrs[4] = sum5_ptrs[2]; sumsq3_ptrs[2] = sumsq3_ptrs[1]; sum3_ptrs[2] = sum3_ptrs[1]; sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], w, params->sgr.s0, BITDEPTH_MAX); sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], w, params->sgr.s1, BITDEPTH_MAX); rotate_neon(A3_ptrs, B3_ptrs, 4); // Output only one row sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, w, 1, params->sgr.w0, params->sgr.w1 HIGHBD_TAIL_SUFFIX); return; vert_1: // Copy the last row as padding once sumsq5_ptrs[4] = sumsq5_ptrs[3]; sum5_ptrs[4] = sum5_ptrs[3]; sumsq3_ptrs[2] = sumsq3_ptrs[1]; sum3_ptrs[2] = sum3_ptrs[1]; 
sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                       w, params->sgr.s0, BITDEPTH_MAX);
    rotate_neon(A5_ptrs, B5_ptrs, 2);
    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                       w, params->sgr.s1, BITDEPTH_MAX);
    rotate_neon(A3_ptrs, B3_ptrs, 4);
    goto output_1;
}

/*
 * Install the ARM NEON loop-restoration routines into the DSP table.
 *
 * c   - table to fill; left untouched when the runtime CPU flags do not
 *       include DAV2D_ARM_CPU_FLAG_NEON.
 * bpc - bits per component; only consulted in builds where BITDEPTH != 8.
 *
 * On AArch64 the two wiener slots get dedicated filter7/filter5 routines,
 * on other ARM targets both slots share wiener_filter_neon. The three sgr
 * slots are only populated for BITDEPTH == 8 builds or when bpc == 10;
 * for any other bit depth they keep whatever value they had on entry.
 */
static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav2dLoopRestorationDSPContext *const c, int bpc) {
    const unsigned flags = dav2d_get_cpu_flags();

    /* Nothing below is usable without NEON. */
    if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return;

#if ARCH_AARCH64
    c->wiener[0] = BF(dav2d_wiener_filter7, neon);
    c->wiener[1] = BF(dav2d_wiener_filter5, neon);
#else
    c->wiener[0] = c->wiener[1] = wiener_filter_neon;
#endif
    if (BITDEPTH == 8 || bpc == 10) {
        c->sgr[0] = sgr_filter_5x5_neon;
        c->sgr[1] = sgr_filter_3x3_neon;
        c->sgr[2] = sgr_filter_mix_neon;
    }
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/mc.h000066400000000000000000000072151517466257200221670ustar00rootroot00000000000000
/*
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "src/mc.h" #include "src/cpu.h" decl_8tap_fns(neon); decl_8tap_fns(neon_dotprod); decl_8tap_fns(neon_i8mm); decl_8tap_fns(sve2); decl_mc_fn(BF(dav2d_put_bilin, neon)); decl_mct_fn(BF(dav2d_prep_bilin, neon)); decl_avg_fn(BF(dav2d_avg, neon)); decl_w_avg_fn(BF(dav2d_w_avg, neon)); decl_mask_fn(BF(dav2d_mask, neon)); decl_blend_fn(BF(dav2d_blend, neon)); decl_w_mask_fn(BF(dav2d_w_mask_444, neon)); decl_w_mask_fn(BF(dav2d_w_mask_422, neon)); decl_w_mask_fn(BF(dav2d_w_mask_420, neon)); decl_warp8x8_fn(BF(dav2d_warp_affine_8x8, neon)); decl_warp8x8t_fn(BF(dav2d_warp_affine_8x8t, neon)); decl_emu_edge_fn(BF(dav2d_emu_edge, neon)); decl_sad_refine_mv_fn(BF(dav2d_sad_refine_mv, neon)); decl_sad8x8_fn(BF(dav2d_sad8x8, neon)); static ALWAYS_INLINE void mc_dsp_init_arm(Dav2dMCDSPContext *const c) { #define init_mc_fn(type, name, suffix) \ c->mc[type] = BF(dav2d_put_##name, suffix) #define init_mct_fn(type, name, suffix) \ c->mct[type] = BF(dav2d_prep_##name, suffix) const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return; init_8tap_fns(neon); init_mc_fn (DAV2D_FILTER_BILINEAR, bilin, neon); init_mct_fn(DAV2D_FILTER_BILINEAR, bilin, neon); c->avg = BF(dav2d_avg, neon); c->w_avg = BF(dav2d_w_avg, neon); c->mask = BF(dav2d_mask, neon); c->blend = BF(dav2d_blend, neon); c->w_mask[0] = BF(dav2d_w_mask_444, neon); c->w_mask[1] = BF(dav2d_w_mask_422, neon); c->w_mask[2] = BF(dav2d_w_mask_420, neon); 
c->warp8x8 = BF(dav2d_warp_affine_8x8, neon); c->warp8x8t = BF(dav2d_warp_affine_8x8t, neon); c->emu_edge = BF(dav2d_emu_edge, neon); #if BITDEPTH == 8 c->sad_refine_mv = BF(dav2d_sad_refine_mv, neon); c->sad8x8 = BF(dav2d_sad8x8, neon); #endif #if ARCH_AARCH64 #if BITDEPTH == 8 #if HAVE_DOTPROD if (flags & DAV2D_ARM_CPU_FLAG_DOTPROD) { init_8tap_fns(neon_dotprod); } #endif // HAVE_DOTPROD #if HAVE_I8MM if (flags & DAV2D_ARM_CPU_FLAG_I8MM) { init_8tap_fns(neon_i8mm); } #endif // HAVE_I8MM #endif // BITDEPTH == 8 #if BITDEPTH == 16 #if HAVE_SVE2 if (flags & DAV2D_ARM_CPU_FLAG_SVE2) { init_8tap_fns(sve2); } #endif // HAVE_SVE2 #endif // BITDEPTH == 16 #endif // ARCH_AARCH64 } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/msac.h000066400000000000000000000044671517466257200225210ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2019, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_ARM_MSAC_H
#define DAV2D_SRC_ARM_MSAC_H

/* Entry points of the NEON symbol-decoder routines (defined outside this header). */
unsigned dav2d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
unsigned dav2d_msac_decode_bool_bypass_neon(MsacContext *s);
unsigned dav2d_msac_decode_bools_bypass_neon(MsacContext *s, unsigned n_bits);
unsigned dav2d_msac_decode_unary_bypass_neon(MsacContext *s, unsigned max_bits);

/*
 * One-argument convenience wrapper: forwards to the generic NEON unary
 * bypass decoder with max_bits fixed at 21.
 */
static inline unsigned dav2d_msac_decode_unary_bypass21_neon(MsacContext *s) {
    return dav2d_msac_decode_unary_bypass_neon(s, 21);
}

/*
 * On AArch64 the generic decoder names are redirected to the NEON symbols
 * at preprocessing time.
 *
 * NOTE(review): dav2d_msac_decode_unary_bypass6 is aliased straight to the
 * two-argument dav2d_msac_decode_unary_bypass_neon, whereas the bypass21
 * alias goes through a one-argument wrapper. Confirm that the generic
 * bypass6 call sites pass max_bits explicitly.
 */
#if ARCH_AARCH64
#define dav2d_msac_decode_bool_adapt     dav2d_msac_decode_bool_adapt_neon
#define dav2d_msac_decode_bool_bypass    dav2d_msac_decode_bool_bypass_neon
#define dav2d_msac_decode_bools_bypass   dav2d_msac_decode_bools_bypass_neon
#define dav2d_msac_decode_unary_bypass6  dav2d_msac_decode_unary_bypass_neon
#define dav2d_msac_decode_unary_bypass21 dav2d_msac_decode_unary_bypass21_neon
#endif

#endif /* DAV2D_SRC_ARM_MSAC_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/arm/refmvs.h000066400000000000000000000047471517466257200231010ustar00rootroot00000000000000
/*
 * Copyright © 2021, VideoLAN and dav2d authors
 * Copyright © 2021, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm-offsets.h" #include "src/cpu.h" #include "src/refmvs.h" #if ARCH_AARCH64 && 0 CHECK_OFFSET(refmvs_frame, iw8, RMVSF_IW8); CHECK_OFFSET(refmvs_frame, ih8, RMVSF_IH8); CHECK_OFFSET(refmvs_frame, mfmv_ref, RMVSF_MFMV_REF); CHECK_OFFSET(refmvs_frame, mfmv_ref2cur, RMVSF_MFMV_REF2CUR); CHECK_OFFSET(refmvs_frame, mfmv_ref2ref, RMVSF_MFMV_REF2REF); CHECK_OFFSET(refmvs_frame, n_mfmvs, RMVSF_N_MFMVS); CHECK_OFFSET(refmvs_frame, rp_ref, RMVSF_RP_REF); CHECK_OFFSET(refmvs_frame, rp_proj, RMVSF_RP_PROJ); CHECK_OFFSET(refmvs_frame, rp_stride, RMVSF_RP_STRIDE); CHECK_OFFSET(refmvs_frame, n_tile_threads, RMVSF_N_TILE_THREADS); #endif //decl_load_tmvs_fn(dav2d_load_tmvs_neon); decl_save_tmvs_fn(dav2d_save_tmvs_neon); decl_splat_mv_fn(dav2d_splat_mv_neon); static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav2dRefmvsDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_ARM_CPU_FLAG_NEON)) return; #if ARCH_AARCH64 && 0 c->load_tmvs = dav2d_load_tmvs_neon; #endif 
c->save_tmvs = dav2d_save_tmvs_neon; c->splat_mv = dav2d_splat_mv_neon; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ccso.h000066400000000000000000000047431517466257200217430ustar00rootroot00000000000000/* * Copyright © 2025-2026, VideoLAN and dav2d authors * Copyright © 2025-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_CCSO_H #define DAV2D_SRC_CCSO_H #include "cdef.h" #include "common/bitdepth.h" #define decl_ccso_prep_fn(name) \ void (name)(uint8_t *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, \ const_left_pixel_row_2px left, const pixel *top, const pixel *bottom, \ unsigned max_band_log2, unsigned ext_filter, unsigned quant_step, \ int edge_cfl, int bo_only, int w, int h, enum CdefEdgeFlags edges \ HIGHBD_DECL_SUFFIX) typedef decl_ccso_prep_fn(*ccso_prep_fn); #define decl_ccso_add_fn(name) \ void (name)(pixel *dst, const ptrdiff_t dst_stride, const uint8_t *idx, \ ptrdiff_t idx_stride, const uint8_t *offset_idxs, \ const int8_t *offset_lut, int w, int h, \ const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX) typedef decl_ccso_add_fn(*ccso_add_fn); typedef struct Dav2dCcsoDSPContext { ccso_prep_fn prep[3 /* 444/luma, 422, 420 */]; ccso_add_fn add; } Dav2dCcsoDSPContext; bitfn_decls(void dav2d_ccso_dsp_init, Dav2dCcsoDSPContext *c); #endif /* DAV2D_SRC_CCSO_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ccso_tmpl.c000066400000000000000000000162221517466257200227650ustar00rootroot00000000000000/* * Copyright © 2025-2026, VideoLAN and dav2d authors * Copyright © 2025-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "ccso.h" #include "tables.h" #include "common/bitdepth.h" #include "common/intops.h" static int8_t ccso_pos[7][2] = { { -1, 0 }, { 0, -1 }, { -1, -1 }, { -1, 1 }, { -1, -2 }, { 1, -2 }, { 0, 2 } }; static void padding(pixel *const tmp, const ptrdiff_t tmp_stride, const pixel *const src, const ptrdiff_t src_stride, const pixel (*left)[2], const pixel *top, const pixel *bottom, const int w, const int h, enum CdefEdgeFlags edges) { int x_min = edges & CDEF_HAVE_LEFT ? -2 : 0; int x_max = w - 1 + (edges & CDEF_HAVE_RIGHT ? 2 : 0); int y_min = edges & CDEF_HAVE_TOP ? -2 : 0; int y_max = h - 1 + (edges & CDEF_HAVE_BOTTOM ? 
2 : 0); for (int y = -2; y < h + 2; y++) { int src_y = iclip(y, y_min, y_max); for (int x = -2; x < w + 2; x++) { pixel v; int src_x = iclip(x, x_min, x_max); if (src_y < 0) { v = top[src_x + (2 + src_y) * PXSTRIDE(src_stride)]; } else if (src_y >= h) { v = bottom[src_x + (src_y - h) * PXSTRIDE(src_stride)]; } else if (src_x < 0) { v = left[src_y][2 + src_x]; } else { v = src[src_x + src_y * PXSTRIDE(src_stride)]; } tmp[x + y * tmp_stride] = v; } } } static inline unsigned ccso_score(int diff, int quant_step, unsigned edge_classifier) { if (diff > quant_step && !edge_classifier) return 2; if (diff < -quant_step) return 0; return 1; } static NOINLINE void ccso_prep_c(uint8_t *dst, ptrdiff_t dst_stride, const pixel *src, const ptrdiff_t src_stride, const pixel (*left)[2], const pixel *top, const pixel *bottom, unsigned max_band_log2, const unsigned ext_filter, const unsigned quant_step, const int edge_clf, const int bo_only, enum CdefEdgeFlags edges, const int w, const int h, const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) { const unsigned shift = bitdepth_from_max(bitdepth_max) - max_band_log2; const int dy = ccso_pos[ext_filter][0]; const int dx = ccso_pos[ext_filter][1]; const ptrdiff_t tmp_stride = 68; const ptrdiff_t luma_offset = dx + dy * tmp_stride; pixel tmp_buf[68 * 12]; // 68*12 is the maximum value of tmp_stride * (h + 4) pixel *tmp = tmp_buf + 2 * tmp_stride + 2; padding(tmp, tmp_stride, src, src_stride, left, top, bottom, w << ss_hor, h << ss_ver, edges); for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { const int x_luma = x << ss_hor; const int c = tmp[x_luma]; const int band = c >> shift; if (bo_only) dst[x] = band; else { unsigned cls0 = 0, cls1 = 0; cls0 = ccso_score(tmp[x_luma + luma_offset] - c, quant_step, edge_clf); cls1 = ccso_score(tmp[x_luma - luma_offset] - c, quant_step, edge_clf); dst[x] = (cls0 << 5) | (cls1 << 3) | band; } } tmp += tmp_stride << ss_ver; dst += dst_stride; } } #define ccso_prep_fn(ss_hor, ss_ver, name) \ 
static void ccso_prep_##name##_c(uint8_t *const dst, \ ptrdiff_t dst_stride, \ const pixel *const src, \ const ptrdiff_t src_stride, \ const pixel (*left)[2], \ const pixel *top, \ const pixel *bottom, \ const unsigned max_band_log2, \ const unsigned ext_filter, \ const unsigned quant_step, \ const int ccso_edge_clf, \ const int bo_only, \ const int w, \ const int h, \ const enum CdefEdgeFlags edges \ HIGHBD_DECL_SUFFIX) \ { \ ccso_prep_c(dst, dst_stride, src, src_stride, left, top, bottom, max_band_log2, ext_filter, \ quant_step, ccso_edge_clf, bo_only, edges, w, h, ss_hor, ss_ver HIGHBD_TAIL_SUFFIX); \ } ccso_prep_fn(1, 1, 420); ccso_prep_fn(1, 0, 422); ccso_prep_fn(0, 0, 444); static void ccso_add_c(pixel *line, const ptrdiff_t dst_stride, const uint8_t *idx_line, const ptrdiff_t idx_stride, const uint8_t *const offset_idxs, const int8_t *const offset_lut, const int w, const int h, const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX) { for (int yy = 0; yy < h; yy += 4, ll_mask++) { for (int xx = 0, bx = 0; xx < w; xx += 4, bx++) { if (ll_mask[0][0] & (1 << bx)) continue; pixel *dst = line; const uint8_t *idx = idx_line; for (int y = yy; y < yy + 4; y++) { for (int x = xx; x < xx + 4; x++) { int byte_idx = idx[x] >> 1; int half_idx = idx[x] & 1; int offset_idx = 7 & (offset_idxs[byte_idx] >> (4 * half_idx)); dst[x] = iclip_pixel(dst[x] + offset_lut[offset_idx]); } dst += PXSTRIDE(dst_stride); idx += idx_stride; } } line += PXSTRIDE(dst_stride) * 4; idx_line += idx_stride * 4; } } COLD void bitfn(dav2d_ccso_dsp_init)(Dav2dCcsoDSPContext *const c) { c->prep[0] = ccso_prep_444_c; c->prep[1] = ccso_prep_422_c; c->prep[2] = ccso_prep_420_c; c->add = ccso_add_c; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdef.h000066400000000000000000000053601517466257200217110ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_CDEF_H #define DAV2D_SRC_CDEF_H #include #include #include "common/bitdepth.h" enum CdefEdgeFlags { CDEF_HAVE_LEFT = 1 << 0, CDEF_HAVE_RIGHT = 1 << 1, CDEF_HAVE_TOP = 1 << 2, CDEF_HAVE_BOTTOM = 1 << 3, }; #ifdef BITDEPTH typedef const pixel (*const_left_pixel_row_2px)[2]; #else typedef const void *const_left_pixel_row_2px; #endif // CDEF operates entirely on pre-filter data; if bottom/right edges are // present (according to $edges), then the pre-filter data is located in // $dst. However, the edge pixels above $dst may be post-filter, so in // order to get access to pre-filter top pixels, use $top. 
#define decl_cdef_fn(name) \ void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \ const pixel *top, const pixel *bottom, \ int pri_strength, int sec_strength, \ int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) typedef decl_cdef_fn(*cdef_fn); #define decl_cdef_dir_fn(name) \ int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX) typedef decl_cdef_dir_fn(*cdef_dir_fn); typedef struct Dav2dCdefDSPContext { cdef_dir_fn dir; cdef_fn fb[3 /* 444/luma, 422, 420 */]; } Dav2dCdefDSPContext; bitfn_decls(void dav2d_cdef_dsp_init, Dav2dCdefDSPContext *c); #endif /* DAV2D_SRC_CDEF_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdef_apply.h000066400000000000000000000034041517466257200231130ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_CDEF_APPLY_H
#define DAV2D_SRC_CDEF_APPLY_H

#include "common/bitdepth.h"

#include "src/internal.h"

/*
 * Run the post-deblock in-loop filters handled by cdef_apply_tmpl.c over a
 * range of block rows.
 *
 *   tc                 task context; the frame context is taken from tc->f
 *   p                  the three plane pointers to start from
 *   lflvl              per-superblock filter data, indexed along the row
 *   by_start, by_end   half-open range of block rows, walked two at a time
 *   sbrow_start, sby   NOTE(review): these look like "first pass of this
 *                      superblock row" and the superblock-row index used to
 *                      address the saved line buffers; confirm against the
 *                      caller
 */
void bytefn(dav2d_cdef_brow)(Dav2dTaskContext *tc, pixel *const p[3],
                             const Av2Filter *lflvl, int by_start, int by_end,
                             int sbrow_start, int sby);

#endif /* DAV2D_SRC_CDEF_APPLY_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdef_apply_tmpl.c000066400000000000000000000442461517466257200241530ustar00rootroot00000000000000
/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/intops.h" #include "src/cdef_apply.h" enum Backup2x8Flags { BACKUP_2X8_Y = 1 << 0, BACKUP_2X8_UV = 1 << 1, }; static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3], const ptrdiff_t stride[2], const enum Dav2dPixelLayout layout) { const ptrdiff_t y_stride = PXSTRIDE(stride[0]); if (y_stride < 0) pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride); else pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride); if (layout != DAV2D_PIXEL_LAYOUT_I400) { const ptrdiff_t uv_stride = PXSTRIDE(stride[1]); if (uv_stride < 0) { const int uv_off = layout == DAV2D_PIXEL_LAYOUT_I420 ? 3 : 7; pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride); pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride); } else { const int uv_off = layout == DAV2D_PIXEL_LAYOUT_I420 ? 
2 : 6;
            pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
            pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
        }
    }
}

/*
 * Save the two pixels immediately to the left of column x_off, for one
 * 8-row strip, into dst[plane][row][0..1].
 *
 * dst        - destination, indexed [plane][row][x]
 * src        - per-plane pointers to the first row of the strip
 * src_stride - [0] is the luma stride, [1] the chroma stride (both are
 *              passed through PXSTRIDE() before being used as pixel offsets)
 * x_off      - column in luma pixels; shifted right by ss_hor for chroma
 * layout     - pixel layout; I400 has no chroma, I420 halves the chroma rows
 * flag       - BACKUP_2X8_Y selects luma, BACKUP_2X8_UV selects both chroma
 *              planes
 */
static void backup2x8(pixel dst[3][8][2],
                      /*const*/ pixel *const src[3],
                      const ptrdiff_t src_stride[2], int x_off,
                      const enum Dav2dPixelLayout layout,
                      const enum Backup2x8Flags flag)
{
    ptrdiff_t y_off = 0;
    if (flag & BACKUP_2X8_Y) {
        for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
            pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
    }

    // nothing more to do without chroma planes or without a chroma request
    if (layout == DAV2D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
        return;

    const int ss_ver = layout == DAV2D_PIXEL_LAYOUT_I420;
    const int ss_hor = layout != DAV2D_PIXEL_LAYOUT_I444;

    x_off >>= ss_hor;
    y_off = 0;
    for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
    }
}

/*
 * Scale a primary strength by the block variance.
 *
 * Returns 0 when var is 0. Otherwise returns
 * (strength * (4 + i) + 8) >> 4, where i is 0 for var < 64 and
 * imin(ulog2(var >> 6), 12) above that (ulog2 is presumably floor(log2);
 * confirm against common/intops.h).
 */
static int adjust_strength(const int strength, const unsigned var) {
    if (!var) return 0;
    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
    return (strength * (4 + i) + 8) >> 4;
}

/*
 * Run CCSO (when enabled in f->c->inloop_filters) and CDEF in place over the
 * block rows [by_start, by_end).
 *
 * Block coordinates (by, bx, f->bw, f->bh) are in units of 4 luma pixels:
 * the loops step by 2 and the pointers advance by 8 pixels per step. Each
 * outer iteration handles one strip of 8 luma rows, walked in units of
 * sbsz (16) blocks = 64 luma pixels, and inside those in 8x8 luma blocks.
 *
 * tc          - task context; supplies the frame context (tc->f) and the
 *               top_pre_cdef_toggle bit, which is flipped after every strip
 * p           - per-plane pointers to the first row to filter
 * lflvl       - per-superblock filter data, indexed by sbx >> 2 (cdef_idx,
 *               ccso, noskip_mask and lossless masks are read from it)
 * by_start    - first block row (a top edge is assumed when it is > 0)
 * by_end      - end block row (exclusive)
 * sbrow_start - non-zero selects f->lf.lr_db_line as the source of the rows
 *               above the first strip when more than one task context exists
 * sby         - superblock row index, used to offset into the line buffers
 */
void bytefn(dav2d_cdef_brow)(Dav2dTaskContext *const tc,
                             pixel *const p[3],
                             const Av2Filter *const lflvl,
                             const int by_start, const int by_end,
                             const int sbrow_start, const int sby)
{
    Dav2dFrameContext *const f = (Dav2dFrameContext *)tc->f;
    const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.p.bpc - 8;
    const Dav2dDSPContext *const dsp = f->dsp;
    enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
    pixel *ptrs[3] = { p[0], p[1], p[2] };
    const int sbsz = 16;
    const int sb64w = (f->bw + sbsz - 1) >> 4;
    const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
    const int on_skip_tx = f->frame_hdr->cdef.on_skiptx;
    const enum Dav2dPixelLayout layout = f->cur.p.p.layout;
    // index into the per-block-size function tables for chroma
    const int uv_idx = DAV2D_PIXEL_LAYOUT_I444 - layout;
    const int ss_hor = layout != DAV2D_PIXEL_LAYOUT_I444;
    const int ss_ver = layout == DAV2D_PIXEL_LAYOUT_I420;
    // luma direction -> chroma direction; the second row is used for I422
    static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
                                           { 7, 0, 2, 4, 5, 6, 6, 6 } };
    const uint8_t *uv_dir = uv_dirs[layout == DAV2D_PIXEL_LAYOUT_I422];
    // more than one task context: line buffers are addressed per sb row
    const int have_tt = f->c->n_tc > 1;
    const int sb128 = f->frame_hdr->sb128;
    const ptrdiff_t y_stride = PXSTRIDE(f->cur.p.stride[0]);
    const ptrdiff_t uv_stride = PXSTRIDE(f->cur.p.stride[1]);

    for (int by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
        // selects which of the two cdef_line buffers holds the rows above
        const int tf = tc->top_pre_cdef_toggle;
        const int by_idx = (by & 0x3e) >> 1;
        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;

        if ((!have_tt || sbrow_start || by + 2 < by_end) &&
            edges & CDEF_HAVE_BOTTOM)
        {
            // backup pre-filter data for next iteration
            pixel *const cdef_top_bak[3] = {
                f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
                f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
                f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
            };
            backup2lines(cdef_top_bak, ptrs, f->cur.p.stride, layout);
        }

        // two left-column backups, alternated through `bit`: one is consumed
        // by the current block while the other is filled for the next one
        ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
        edges &= ~CDEF_HAVE_LEFT;
        edges |= CDEF_HAVE_RIGHT;
        // planes whose left columns are already saved in lr_bak[bit]
        enum Backup2x8Flags prev_flag = 0;
        for (int sbx = 0, bit = 0; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
            ALIGN_STK_64(uint8_t, ccso_lut_idx, 3, [64*8]);
            const int sb256x = sbx >> 2;
            const int sb64x_idx = sbx & 3;
            const int sb64_idx = ((by & 0x30) >> 2) + sb64x_idx;
            const int cdef_idx = lflvl[sb256x].cdef_idx[sb64_idx];
            if (f->c->inloop_filters & DAV2D_INLOOPFILTER_CCSO) {
                // NOTE(review): unlike the flag built after next_sb, chroma is
                // not shifted into the UV bit here. prep() below only reads
                // luma (iptrs[0], lr_bak[bit][0]), which may be why - confirm.
                const enum Backup2x8Flags flag = lflvl[sb256x].ccso[0] | lflvl[sb256x].ccso[1] | lflvl[sb256x].ccso[2];
                const enum Backup2x8Flags do_left = flag & ~prev_flag;
                prev_flag |= flag;
                if (do_left && edges & CDEF_HAVE_LEFT) {
                    // we didn't backup the prefilter data because we didn't
                    // filter it, so do it here instead
                    backup2x8(lr_bak[bit], iptrs, f->cur.p.stride, 0, layout, do_left);
                }
                for (int pl = 0; pl < 3; pl++) {
                    if (!lflvl[sb256x].ccso[pl]) continue;

                    const Dav2dFrameHeader *const hdr = f->frame_hdr;
                    const unsigned max_band = hdr->ccso.p[pl].max_band_log2;
                    const unsigned ext_filter = hdr->ccso.p[pl].ext_filter_support;
                    const unsigned scale_idx = hdr->ccso.p[pl].scale_idx;
                    const unsigned quant = dav2d_ccso_quant_sz[scale_idx][hdr->ccso.p[pl].quant_idx];
                    const int edge_clf = hdr->ccso.p[pl].edge_clf;
                    const int bo_only = hdr->ccso.p[pl].bo_only;

                    const pixel *top, *bot;
                    ptrdiff_t offset;
                    enum CdefEdgeFlags sb_edges = edges;
                    if ((sbx + 1) * sbsz >= f->bw) sb_edges &= ~CDEF_HAVE_RIGHT;
                    const int w = imin(sbsz, f->bw - sbx * sbsz) * 4;
                    assert(w >= 0);

                    // pick the luma rows above (top) and below (bot) the strip
                    if (!have_tt) goto sb_st_y;
                    if (sbrow_start && by == by_start) {
                        offset = (sby * (4 << sb128) - 4) * y_stride + sbx * sbsz * 4;
                        top = &f->lf.lr_db_line[0][offset];
                        bot = iptrs[0] + 8 * y_stride;
                    } else if (!sbrow_start && by + 2 >= by_end) {
                        offset = sby * 4 * y_stride + sbx * sbsz * 4;
                        top = &f->lf.cdef_line[tf][0][offset];
                        offset = ((sby + 1) * (4 << sb128) - 2) * y_stride + sbx * sbsz * 4;
                        bot = &f->lf.lr_db_line[0][offset];
                    } else {
                    sb_st_y:
                        offset = have_tt * sby * 4 * y_stride + sbx * sbsz * 4;
                        top = &f->lf.cdef_line[tf][0][offset];
                        bot = iptrs[0] + 8 * y_stride;
                    }
                    // always fed from luma, whatever plane the LUT is for
                    dsp->ccso.prep[!!pl * uv_idx](ccso_lut_idx[pl], 64 >> (!!pl * ss_hor),
                                                   iptrs[0], f->cur.p.stride[0],
                                                   lr_bak[bit][0], top, bot,
                                                   max_band, ext_filter, quant, edge_clf, bo_only,
                                                   w >> (!!pl * ss_hor), 8 >> (!!pl * ss_ver), sb_edges
                                                   HIGHBD_CALL_SUFFIX);
                }
            }
            const uint16_t (*y_ll_mask)[4], (*uv_ll_mask)[4];
            if (f->frame_hdr->any_lossless /* segmentation + at least 1 lossless */) {
                y_ll_mask = (const uint16_t(*)[4]) &lflvl[sb256x].lossless_mask_y[2 * by_idx][sb64x_idx];
                uv_ll_mask = (const uint16_t(*)[4]) &lflvl[sb256x].lossless_mask_uv[2 * by_idx >> ss_ver][sb64x_idx];
            } else {
                static const uint16_t zero_ll_mask[2][4] = { { 0 } };
                assert(!f->frame_hdr->all_lossless);
                y_ll_mask = uv_ll_mask = zero_ll_mask;
            }

            // no CDEF for this unit: index unset, both strengths zero, or
            // the filter is disabled
            if (cdef_idx == -1 ||
                (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
                 !f->frame_hdr->cdef.uv_strength[cdef_idx]) ||
                !(f->c->inloop_filters & DAV2D_INLOOPFILTER_CDEF))
            {
                prev_flag = 0;
                goto next_sb;
            }
            const unsigned noskip_mask = on_skip_tx ? ~0U : lflvl[sb256x].noskip_mask[by_idx][sb64x_idx];

            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
            // bit 0: luma is filtered, bit 1: chroma is filtered
            const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);

            // strength = (primary << 2) | secondary; a secondary value of 3
            // is bumped to 4, and both are scaled up with the bit depth
            const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
            int y_sec_lvl = y_lvl & 3;
            y_sec_lvl += y_sec_lvl == 3;
            y_sec_lvl <<= bitdepth_min_8;

            const int uv_pri_lvl = (uv_lvl >> 2) << bitdepth_min_8;
            int uv_sec_lvl = uv_lvl & 3;
            uv_sec_lvl += uv_sec_lvl == 3;
            uv_sec_lvl <<= bitdepth_min_8;

            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
                 bx += 2, edges |= CDEF_HAVE_LEFT)
            {
                if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;

                // check if this 8x8 block had any coded coefficients; if not,
                // go to the next block
                const unsigned bx_mask = 3 << (bx & 14);
                const int y_lossless = (y_ll_mask[0][0] | y_ll_mask[1][0]) & bx_mask;
                const unsigned uvbx_mask = (3 >> ss_hor) << ((bx & 14) >> ss_hor);
                const int uv_lossless = (uv_ll_mask[0][0] | uv_ll_mask[!ss_ver][0]) & uvbx_mask;
                if (!(noskip_mask & bx_mask) || (y_lossless && uv_lossless)) {
                    prev_flag = 0;
                    goto next_b;
                }
                const enum Backup2x8Flags do_left = flag & ~prev_flag;
                prev_flag = flag;
                if (do_left && edges & CDEF_HAVE_LEFT) {
                    // we didn't backup the prefilter data because it wasn't
                    // there, so do it here instead
                    backup2x8(lr_bak[bit], bptrs, f->cur.p.stride, 0, layout, do_left);
                }
                if (edges & CDEF_HAVE_RIGHT) {
                    // backup pre-filter data for next iteration
                    backup2x8(lr_bak[!bit], bptrs, f->cur.p.stride, 8, layout, flag);
                }

                // dir and variance are only set, and only read, when a
                // primary strength is in use
                int dir;
                unsigned variance;
                if (y_pri_lvl || uv_pri_lvl)
                    dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
                                        &variance HIGHBD_CALL_SUFFIX);

                const pixel *top, *bot;
                ptrdiff_t offset;

                // same top/bottom row selection as for CCSO, per 8x8 block
                if (!have_tt) goto st_y;
                if (sbrow_start && by == by_start) {
                    offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
                    top = &f->lf.lr_db_line[0][offset];
                    bot = bptrs[0] + 8 * y_stride;
                } else if (!sbrow_start && by + 2 >= by_end) {
                    top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
                    const int line = (sby + 1) * (4 << sb128) - 2;
                    offset = line * y_stride + bx * 4;
                    bot = &f->lf.lr_db_line[0][offset];
                } else {
                st_y:;
                    offset = sby * 4 * y_stride;
                    top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
                    bot = bptrs[0] + 8 * y_stride;
                }
                if (y_pri_lvl) {
                    const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
                    if ((adj_y_pri_lvl || y_sec_lvl) && !y_lossless)
                        dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0], lr_bak[bit][0],
                                        top, bot, adj_y_pri_lvl, y_sec_lvl,
                                        dir, damping, edges HIGHBD_CALL_SUFFIX);
                } else if (y_sec_lvl && !y_lossless)
                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0], lr_bak[bit][0],
                                    top, bot, 0, y_sec_lvl, 0, damping,
                                    edges HIGHBD_CALL_SUFFIX);

                if (!uv_lvl || uv_lossless) goto skip_uv;
                assert(layout != DAV2D_PIXEL_LAYOUT_I400);

                const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
                for (int pl = 1; pl <= 2; pl++) {
                    if (!have_tt) goto st_uv;
                    if (sbrow_start && by == by_start) {
                        const int line = sby * (4 << sb128) - 4;
                        offset = line * uv_stride + (bx * 4 >> ss_hor);
                        top = &f->lf.lr_db_line[pl][offset];
                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
                    } else if (!sbrow_start && by + 2 >= by_end) {
                        const ptrdiff_t top_offset = sby * 8 * uv_stride +
                                                     (bx * 4 >> ss_hor);
                        top = &f->lf.cdef_line[tf][pl][top_offset];
                        const int line = (sby + 1) * (4 << sb128) - 2;
                        offset = line * uv_stride + (bx * 4 >> ss_hor);
                        bot = &f->lf.lr_db_line[pl][offset];
                    } else {
                    st_uv:;
                        const ptrdiff_t offset = sby * 8 * uv_stride;
                        top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
                    }
                    // chroma is filtered with damping reduced by one
                    dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
                                         lr_bak[bit][pl], top, bot,
                                         uv_pri_lvl, uv_sec_lvl, uvdir,
                                         damping - 1, edges HIGHBD_CALL_SUFFIX);
                }

            skip_uv:
                // swap the two left-column backups; skipped blocks (next_b)
                // do not swap
                bit ^= 1;

            next_b:
                bptrs[0] += 8;
                bptrs[1] += 8 >> ss_hor;
                bptrs[2] += 8 >> ss_hor;
            }

        next_sb:
            if (f->c->inloop_filters & DAV2D_INLOOPFILTER_CCSO) {
                const enum Backup2x8Flags flag = lflvl[sb256x].ccso[0] | ((lflvl[sb256x].ccso[1] | lflvl[sb256x].ccso[2]) << 1);
                // XXX could improve the (actual) backups prior to running filter
                // (so the right side ones) by checking whether next sb/b needs
                // it (if has_cdef or has_ccso)
                const enum Backup2x8Flags do_right = flag & ~prev_flag;
                if (do_right && (sbx + 1) * sbsz < f->bw) {
                    backup2x8(lr_bak[bit], iptrs, f->cur.p.stride, sbsz * 4, layout, do_right);
                    prev_flag |= do_right;
                }
                // apply the offsets selected by prep() to every enabled plane
                for (int pl = 0; pl < 3; pl++)
                    if (lflvl[sb256x].ccso[pl]) {
                        const int w = imin(sbsz, f->bw - sbx * sbsz) * 4;
                        dsp->ccso.add(iptrs[pl], f->cur.p.stride[!!pl],
                                      ccso_lut_idx[pl], 64 >> (!!pl * ss_hor),
                                      f->frame_hdr->ccso.p[pl].filter_off,
                                      dav2d_ccso_offset[f->frame_hdr->ccso.p[pl].scale_idx],
                                      w >> (!!pl * ss_hor), 8 >> (!!pl * ss_ver),
                                      pl ? uv_ll_mask : y_ll_mask HIGHBD_CALL_SUFFIX);
                    }
            }
            iptrs[0] += sbsz * 4;
            iptrs[1] += sbsz * 4 >> ss_hor;
            iptrs[2] += sbsz * 4 >> ss_hor;
        }

        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
        tc->top_pre_cdef_toggle ^= 1;
    }
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdef_tmpl.c000066400000000000000000000327641517466257200227500ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "config.h" #include #include "common/intops.h" #include "src/cdef.h" #include "src/tables.h" static inline int constrain(const int diff, const int threshold, const int shift) { const int adiff = abs(diff); return apply_sign(imin(adiff, imax(0, threshold - (adiff >> shift))), diff); } static inline void fill(int16_t *tmp, const ptrdiff_t stride, const int w, const int h) { /* Use a value that's a large positive number when interpreted as unsigned, * and a large negative number when interpreted as signed. */ for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) tmp[x] = INT16_MIN; tmp += stride; } } static void padding(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, const ptrdiff_t src_stride, const pixel (*left)[2], const pixel *top, const pixel *bottom, const int w, const int h, const enum CdefEdgeFlags edges) { // fill extended input buffer int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2; if (!(edges & CDEF_HAVE_TOP)) { fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2); y_start = 0; } if (!(edges & CDEF_HAVE_BOTTOM)) { fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2); y_end -= 2; } if (!(edges & CDEF_HAVE_LEFT)) { fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start); x_start = 0; } if (!(edges & CDEF_HAVE_RIGHT)) { fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start); x_end -= 2; } for (int y = y_start; y < 0; y++) { for (int x = x_start; x < x_end; x++) tmp[x + y * tmp_stride] = top[x]; top += PXSTRIDE(src_stride); } for (int y = 0; y < h; y++) for (int x = x_start; x < 0; x++) tmp[x + y * tmp_stride] = left[y][2 + x]; for (int y = 0; y < h; y++) { for (int x = (y < h) ? 
0 : x_start; x < x_end; x++) tmp[x] = src[x]; src += PXSTRIDE(src_stride); tmp += tmp_stride; } for (int y = h; y < y_end; y++) { for (int x = x_start; x < x_end; x++) tmp[x] = bottom[x]; bottom += PXSTRIDE(src_stride); tmp += tmp_stride; } } static NOINLINE void cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int pri_strength, const int sec_strength, const int dir, const int damping, const int w, int h, const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) { const ptrdiff_t tmp_stride = 12; assert((w == 4 || w == 8) && (h == 4 || h == 8)); int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4) int16_t *tmp = tmp_buf + 2 * tmp_stride + 2; padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); if (pri_strength) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1); const int pri_shift = imax(0, damping - ulog2(pri_strength)); if (sec_strength) { const int sec_shift = damping - ulog2(sec_strength); do { for (int x = 0; x < w; x++) { const int px = dst[x]; int sum = 0; int max = px, min = px; int pri_tap_k = pri_tap; for (int k = 0; k < 2; k++) { const int off1 = dav2d_cdef_directions[dir + 2][k]; // dir const int p0 = tmp[x + off1]; const int p1 = tmp[x - off1]; sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); // if pri_tap_k == 4 then it becomes 2 else it remains 3 pri_tap_k = (pri_tap_k & 3) | 2; min = umin(p0, min); max = imax(p0, max); min = umin(p1, min); max = imax(p1, max); const int off2 = dav2d_cdef_directions[dir + 4][k]; // dir + 2 const int off3 = dav2d_cdef_directions[dir + 0][k]; // dir - 2 const int s0 = tmp[x + off2]; const int s1 = tmp[x - off2]; const int s2 = tmp[x + off3]; const int s3 = tmp[x - off3]; // sec_tap starts at 2 and becomes 1 const int sec_tap = 2 - k; 
sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); min = umin(s0, min); max = imax(s0, max); min = umin(s1, min); max = imax(s1, max); min = umin(s2, min); max = imax(s2, max); min = umin(s3, min); max = imax(s3, max); } dst[x] = iclip(px + ((sum - (sum < 0) + 8) >> 4), min, max); } dst += PXSTRIDE(dst_stride); tmp += tmp_stride; } while (--h); } else { // pri_strength only do { for (int x = 0; x < w; x++) { const int px = dst[x]; int sum = 0; int pri_tap_k = pri_tap; for (int k = 0; k < 2; k++) { const int off = dav2d_cdef_directions[dir + 2][k]; // dir const int p0 = tmp[x + off]; const int p1 = tmp[x - off]; sum += pri_tap_k * constrain(p0 - px, pri_strength, pri_shift); sum += pri_tap_k * constrain(p1 - px, pri_strength, pri_shift); pri_tap_k = (pri_tap_k & 3) | 2; } dst[x] = px + ((sum - (sum < 0) + 8) >> 4); } dst += PXSTRIDE(dst_stride); tmp += tmp_stride; } while (--h); } } else { // sec_strength only assert(sec_strength); const int sec_shift = damping - ulog2(sec_strength); do { for (int x = 0; x < w; x++) { const int px = dst[x]; int sum = 0; for (int k = 0; k < 2; k++) { const int off1 = dav2d_cdef_directions[dir + 4][k]; // dir + 2 const int off2 = dav2d_cdef_directions[dir + 0][k]; // dir - 2 const int s0 = tmp[x + off1]; const int s1 = tmp[x - off1]; const int s2 = tmp[x + off2]; const int s3 = tmp[x - off2]; const int sec_tap = 2 - k; sum += sec_tap * constrain(s0 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s1 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s2 - px, sec_strength, sec_shift); sum += sec_tap * constrain(s3 - px, sec_strength, sec_shift); } dst[x] = px + ((sum - (sum < 0) + 8) >> 4); } dst += PXSTRIDE(dst_stride); tmp += tmp_stride; } while (--h); } } #define cdef_fn(w, h) \ static void 
cdef_filter_block_##w##x##h##_c(pixel *const dst, \ const ptrdiff_t stride, \ const pixel (*left)[2], \ const pixel *const top, \ const pixel *const bottom, \ const int pri_strength, \ const int sec_strength, \ const int dir, \ const int damping, \ const enum CdefEdgeFlags edges \ HIGHBD_DECL_SUFFIX) \ { \ cdef_filter_block_c(dst, stride, left, top, bottom, \ pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \ } cdef_fn(4, 4); cdef_fn(4, 8); cdef_fn(8, 8); static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, unsigned *const var HIGHBD_DECL_SUFFIX) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; int partial_sum_hv[2][8] = { { 0 } }; int partial_sum_diag[2][15] = { { 0 } }; int partial_sum_alt[4][11] = { { 0 } }; for (int y = 0; y < 8; y++) { for (int x = 0; x < 8; x++) { const int px = (img[x] >> bitdepth_min_8) - 128; partial_sum_diag[0][ y + x ] += px; partial_sum_alt [0][ y + (x >> 1)] += px; partial_sum_hv [0][ y ] += px; partial_sum_alt [1][3 + y - (x >> 1)] += px; partial_sum_diag[1][7 + y - x ] += px; partial_sum_alt [2][3 - (y >> 1) + x ] += px; partial_sum_hv [1][ x ] += px; partial_sum_alt [3][ (y >> 1) + x ] += px; } img += PXSTRIDE(stride); } unsigned cost[8] = { 0 }; for (int n = 0; n < 8; n++) { cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n]; cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n]; } cost[2] *= 105; cost[6] *= 105; static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 }; for (int n = 0; n < 7; n++) { const int d = div_table[n]; cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] + partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d; cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] + partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d; } cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105; cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105; for (int n = 0; n < 4; n++) { unsigned *const 
cost_ptr = &cost[n * 2 + 1]; for (int m = 0; m < 5; m++) *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m]; *cost_ptr *= 105; for (int m = 0; m < 3; m++) { const int d = div_table[2 * m + 1]; *cost_ptr += (partial_sum_alt[n][m] * partial_sum_alt[n][m] + partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d; } } int best_dir = 0; unsigned best_cost = cost[0]; for (int n = 1; n < 8; n++) { if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; } } *var = (best_cost - (cost[best_dir ^ 4])) >> 10; return best_dir; } #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/cdef.h" #elif ARCH_PPC64LE #include "src/ppc/cdef.h" #elif ARCH_RISCV #include "src/riscv/cdef.h" #elif ARCH_X86 #include "src/x86/cdef.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/cdef.h" #endif #endif COLD void bitfn(dav2d_cdef_dsp_init)(Dav2dCdefDSPContext *const c) { c->dir = cdef_find_dir_c; c->fb[0] = cdef_filter_block_8x8_c; c->fb[1] = cdef_filter_block_4x8_c; c->fb[2] = cdef_filter_block_4x4_c; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM cdef_dsp_init_arm(c); #elif ARCH_PPC64LE cdef_dsp_init_ppc(c); #elif ARCH_RISCV cdef_dsp_init_riscv(c); #elif ARCH_X86 cdef_dsp_init_x86(c); #elif ARCH_LOONGARCH64 cdef_dsp_init_loongarch(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdf.c000066400000000000000000011700721517466257200215430ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/frame.h" #include "src/internal.h" #include "src/tables.h" #define CDF1(x) (32768-(x)) #define CDF2(a,b) \ CDF1(a), CDF1(b) #define CDF3(a,b,c) \ CDF1(a), CDF2(b,c) #define CDF4(a,b,c,d) \ CDF1(a), CDF3(b,c,d) #define CDF5(a,b,c,d,e) \ CDF1(a), CDF4(b,c,d,e) #define CDF6(a,b,c,d,e,f) \ CDF1(a), CDF5(b,c,d,e,f) #define CDF7(a,b,c,d,e,f,g) \ CDF1(a), CDF6(b,c,d,e,f,g) #define CDF8(a,b,c,d,e,f,g,h) \ CDF1(a), CDF7(b,c,d,e,f,g,h) #define CDF9(a,b,c,d,e,f,g,h,i) \ CDF1(a), CDF8(b,c,d,e,f,g,h,i) #define CDF10(a,b,c,d,e,f,g,h,i,j) \ CDF1(a), CDF9(b,c,d,e,f,g,h,i,j) #define CDF11(a,b,c,d,e,f,g,h,i,j,k) \ CDF1(a), CDF10(b,c,d,e,f,g,h,i,j,k) #define CDF12(a,b,c,d,e,f,g,h,i,j,k,l) \ CDF1(a), CDF11(b,c,d,e,f,g,h,i,j,k,l) #define CDF13(a,b,c,d,e,f,g,h,i,j,k,l,m) \ CDF1(a), CDF12(b,c,d,e,f,g,h,i,j,k,l,m) #define CDF14(a,b,c,d,e,f,g,h,i,j,k,l,m,n) \ CDF1(a), CDF13(b,c,d,e,f,g,h,i,j,k,l,m,n) #define CDF15(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \ CDF1(a), 
CDF14(b,c,d,e,f,g,h,i,j,k,l,m,n,o) typedef struct CdfDefaultContext { CdfModeContext m; CdfMvContext mv; } CdfDefaultContext; static const CdfDefaultContext default_cdf = { .m = { .rst_switchable = { { CDF1(25542), 62 << 8 }, { CDF1(25550), 37 << 8 }, }, .rst_ns_wiener = { CDF1(9051), 32 << 8 }, .rst_pc_wiener = { CDF1(12799), 25 << 8 }, .wiener_ns_len = { { CDF1(4898), 56 << 8 }, { CDF1(6088), 26 << 8 }, }, .wiener_ns_sym = { CDF1(29286), 65 << 8 }, .wiener_ns_cf = { CDF3(16557, 24352, 29677), 6 << 8 }, .part_split = { { { CDF1(28084), 3 << 8 }, { CDF1(23755), 93 << 8 }, { CDF1(23634), 93 << 8 }, { CDF1(19368), 3 << 8 }, { CDF1(24961), 0 << 8 }, { CDF1(14941), 1 << 8 }, { CDF1(16154), 1 << 8 }, { CDF1( 5905), 0 << 8 }, { CDF1(21934), 0 << 8 }, { CDF1(10440), 26 << 8 }, { CDF1(11984), 31 << 8 }, { CDF1( 3474), 0 << 8 }, { CDF1(20492), 16 << 8 }, { CDF1( 6963), 6 << 8 }, { CDF1( 8099), 26 << 8 }, { CDF1( 1529), 0 << 8 }, { CDF1(24117), 92 << 8 }, { CDF1( 7871), 10 << 8 }, { CDF1(23604), 2 << 8 }, { CDF1( 8429), 30 << 8 }, { CDF1(27356), 2 << 8 }, { CDF1(22441), 7 << 8 }, { CDF1( 8897), 31 << 8 }, { CDF1( 6811), 61 << 8 }, { CDF1(17592), 16 << 8 }, { CDF1( 5648), 32 << 8 }, { CDF1( 5339), 26 << 8 }, { CDF1( 1082), 26 << 8 }, { CDF1(26143), 77 << 8 }, { CDF1(11379), 85 << 8 }, { CDF1(20142), 93 << 8 }, { CDF1( 7401), 8 << 8 }, { CDF1(26235), 82 << 8 }, { CDF1(23674), 78 << 8 }, { CDF1(12441), 77 << 8 }, { CDF1(10482), 75 << 8 }, { CDF1(20663), 0 << 8 }, { CDF1( 4192), 27 << 8 }, { CDF1( 5274), 33 << 8 }, { CDF1( 713), 1 << 8 }, { CDF1(28255), 75 << 8 }, { CDF1(27370), 75 << 8 }, { CDF1(23527), 0 << 8 }, { CDF1(20990), 1 << 8 }, { CDF1(26727), 0 << 8 }, { CDF1(21187), 0 << 8 }, { CDF1(25324), 0 << 8 }, { CDF1(17838), 0 << 8 }, { CDF1(26136), 0 << 8 }, { CDF1(16591), 6 << 8 }, { CDF1(19838), 1 << 8 }, { CDF1(10605), 31 << 8 }, { CDF1(22914), 1 << 8 }, { CDF1(12609), 31 << 8 }, { CDF1(11341), 0 << 8 }, { CDF1( 4556), 0 << 8 }, { CDF1(24218), 1 << 8 }, { CDF1(13059), 7 << 
8 }, { CDF1(15378), 32 << 8 }, { CDF1( 5858), 32 << 8 }, { CDF1(21644), 32 << 8 }, { CDF1( 7767), 31 << 8 }, { CDF1( 8309), 6 << 8 }, { CDF1( 1687), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(25554), 0 << 8 }, { CDF1(18892), 0 << 8 }, { CDF1(18530), 0 << 8 }, { CDF1(10806), 6 << 8 }, { CDF1(22504), 1 << 8 }, { CDF1(12140), 31 << 8 }, { CDF1(11966), 31 << 8 }, { CDF1( 4984), 30 << 8 }, { CDF1(24460), 31 << 8 }, { CDF1( 8698), 31 << 8 }, { CDF1( 9655), 31 << 8 }, { CDF1( 2563), 30 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(26227), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(17669), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(26179), 1 << 8 }, { CDF1(17889), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(23500), 1 << 8 }, { CDF1(13115), 31 << 8 }, { CDF1(15253), 31 << 8 }, { CDF1( 6458), 55 << 8 }, { CDF1(22566), 31 << 8 }, { CDF1(11497), 32 << 8 }, { CDF1(10045), 31 << 8 }, { CDF1( 3750), 26 << 8 }, }, }, .part_square = { { CDF1(18000), 7 << 8 }, { CDF1(10521), 37 << 8 }, { CDF1(11395), 62 << 8 }, { CDF1( 4419), 32 << 8 }, { CDF1(12996), 85 << 8 }, { CDF1( 8185), 55 << 8 }, { CDF1(10979), 36 << 8 }, { CDF1( 5010), 32 
<< 8 }, }, .part_dir = { { { CDF1(14644), 0 << 8 }, { CDF1(10173), 75 << 8 }, { CDF1(18529), 0 << 8 }, { CDF1(16071), 90 << 8 }, { CDF1(20263), 1 << 8 }, { CDF1(12813), 1 << 8 }, { CDF1(26612), 0 << 8 }, { CDF1(23277), 1 << 8 }, { CDF1(10594), 76 << 8 }, { CDF1( 7000), 75 << 8 }, { CDF1(20002), 1 << 8 }, { CDF1(12889), 2 << 8 }, { CDF1(13854), 76 << 8 }, { CDF1(10750), 1 << 8 }, { CDF1(18380), 1 << 8 }, { CDF1(17505), 6 << 8 }, { CDF1(14430), 7 << 8 }, { CDF1(11554), 2 << 8 }, { CDF1(20078), 1 << 8 }, { CDF1(19097), 76 << 8 }, { CDF1(15278), 2 << 8 }, { CDF1(10137), 1 << 8 }, { CDF1(21921), 7 << 8 }, { CDF1(14621), 6 << 8 }, { CDF1(19330), 2 << 8 }, { CDF1(15921), 1 << 8 }, { CDF1(26218), 1 << 8 }, { CDF1(24318), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16066), 78 << 8 }, { CDF1( 9225), 2 << 8 }, { CDF1(22849), 31 << 8 }, { CDF1(14817), 11 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(18543), 75 << 8 }, { CDF1(13210), 10 << 8 }, { CDF1(24367), 32 << 8 }, { CDF1(18417), 25 << 8 }, { CDF1(24701), 6 << 8 }, { CDF1(18911), 7 << 8 }, { CDF1(29590), 1 << 8 }, { CDF1(27778), 7 << 8 }, { CDF1( 3400), 1 << 8 }, { CDF1( 935), 90 << 8 }, { CDF1(10365), 32 << 8 }, { CDF1( 1723), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(15654), 76 << 8 }, { CDF1(13500), 75 << 8 }, { CDF1(19177), 91 << 8 }, { CDF1(14739), 90 << 8 }, { CDF1(18769), 75 << 8 }, { CDF1(13500), 1 << 8 }, { CDF1(23583), 75 << 8 }, { CDF1(20927), 75 << 8 }, { CDF1(15045), 1 << 8 }, { CDF1(10528), 0 << 8 }, { CDF1(22474), 6 << 8 }, { CDF1(14250), 0 << 8 }, { CDF1(16561), 75 << 8 }, { CDF1(11427), 76 << 
8 }, { CDF1(21874), 6 << 8 }, { CDF1(16344), 90 << 8 }, { CDF1(21566), 31 << 8 }, { CDF1(13357), 2 << 8 }, { CDF1(27355), 1 << 8 }, { CDF1(24117), 6 << 8 }, { CDF1(10901), 77 << 8 }, { CDF1( 5780), 0 << 8 }, { CDF1(19056), 37 << 8 }, { CDF1( 9141), 1 << 8 }, { CDF1(20436), 7 << 8 }, { CDF1(15693), 32 << 8 }, { CDF1(26536), 6 << 8 }, { CDF1(23667), 31 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(19241), 7 << 8 }, { CDF1(13038), 32 << 8 }, { CDF1(28903), 32 << 8 }, { CDF1(24802), 32 << 8 }, { CDF1( 9097), 32 << 8 }, { CDF1( 2749), 6 << 8 }, { CDF1(15201), 27 << 8 }, { CDF1( 4449), 6 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, .part_ext = { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(31091), 3 << 8 }, { CDF1(29638), 0 << 8 }, { CDF1(28924), 0 << 8 }, { CDF1(28653), 0 << 8 }, { CDF1(30349), 93 << 8 }, { CDF1(28265), 75 << 8 }, { CDF1(27287), 0 << 8 }, { CDF1(27721), 0 << 8 }, { CDF1(29960), 93 << 8 }, { CDF1(28345), 90 << 8 }, { CDF1(27302), 90 << 8 }, { CDF1(27252), 75 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { 
CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(30342), 1 << 8 }, { CDF1(27563), 6 << 8 }, { CDF1(26168), 6 << 8 }, { CDF1(23089), 32 << 8 }, { CDF1(30643), 0 << 8 }, { CDF1(28683), 1 << 8 }, { CDF1(28009), 0 << 8 }, { CDF1(26186), 0 << 8 }, { CDF1(29222), 1 << 8 }, { CDF1(25740), 0 << 8 }, { CDF1(24079), 6 << 8 }, { CDF1(19806), 76 << 8 }, { CDF1(29409), 90 << 8 }, { CDF1(26825), 93 << 8 }, { CDF1(25919), 93 << 8 }, { CDF1(24417), 93 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(26866), 30 << 8 }, { CDF1(24499), 25 << 8 }, { CDF1(24732), 31 << 8 }, { CDF1(23387), 25 << 8 }, { CDF1(27477), 30 << 8 }, { CDF1(25837), 31 << 8 }, { CDF1(24621), 31 << 8 }, { CDF1(23604), 5 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { 
CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(24384), 26 << 8 }, { CDF1(22113), 32 << 8 }, { CDF1(21798), 2 << 8 }, { CDF1(20067), 6 << 8 }, { CDF1(26220), 31 << 8 }, { CDF1(22997), 30 << 8 }, { CDF1(22249), 31 << 8 }, { CDF1(20091), 0 << 8 }, }, }, .part_4way = { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(25789), 31 << 8 }, { CDF1(25290), 32 << 8 }, { CDF1(24270), 31 << 8 }, { CDF1(22994), 31 << 8 }, { CDF1(25801), 1 << 8 }, { CDF1(25260), 7 << 8 }, { CDF1(24041), 1 << 8 }, { CDF1(24281), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(23038), 6 << 8 }, { CDF1(19972), 40 << 8 }, { CDF1(19723), 8 << 8 }, { CDF1(17939), 0 << 8 }, { CDF1(15574), 6 << 8 }, { CDF1(13761), 76 << 8 }, { CDF1(12917), 91 << 8 }, { CDF1(11328), 75 << 8 }, { CDF1(17295), 32 << 8 }, { CDF1(14463), 
38 << 8 }, { CDF1(14724), 6 << 8 }, { CDF1(11653), 1 << 8 }, { CDF1(13202), 32 << 8 }, { CDF1(10929), 7 << 8 }, { CDF1(10348), 7 << 8 }, { CDF1( 8276), 1 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(24728), 31 << 8 }, { CDF1(22673), 31 << 8 }, { CDF1(21033), 6 << 8 }, { CDF1(20321), 1 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(24631), 67 << 8 }, { CDF1(21363), 15 << 8 }, { CDF1(20201), 15 << 8 }, { CDF1(17529), 82 << 8 }, { CDF1(21042), 3 << 8 }, { CDF1(18640), 80 << 8 }, { CDF1(18183), 91 << 8 }, { CDF1(15590), 76 << 8 }, }, }, .region_type = { { CDF1( 2635), 25 << 8 }, { CDF1( 883), 25 << 8 }, { CDF1( 503), 50 << 8 }, { CDF1( 279), 55 << 8 }, }, .skip_mode = { { CDF1(30964), 93 << 8 }, { CDF1(21769), 0 << 8 }, { CDF1(12484), 15 << 8 }, }, 
.skip_mode_drl_idx = { { CDF1(21634), 0 << 8 }, { CDF1(17376), 0 << 8 }, { CDF1(18432), 75 << 8 }, }, .intra = { { CDF1( 1522), 1 << 8 }, { CDF1(14381), 0 << 8 }, { CDF1(10455), 25 << 8 }, { CDF1(27796), 0 << 8 }, }, .intrabc = { { CDF1(32085), 5 << 8 }, { CDF1(15172), 30 << 8 }, { CDF1( 4503), 0 << 8 }, }, .gdf = { CDF1(14593), 32 << 8 }, .cdef_idx0 = { { CDF1(29034), 32 << 8 }, { CDF1(16472), 32 << 8 }, { CDF1( 5751), 32 << 8 }, { CDF1( 3115), 31 << 8 }, }, .cdef_idx = { { CDF1(17708), 32 << 8 }, { CDF2(13413, 24899), 32 << 8 }, { CDF3(10588, 19866, 26664), 37 << 8 }, { CDF4(10131, 17874, 23876, 28766), 37 << 8 }, { CDF5( 8363, 15451, 20811, 25453, 29393), 32 << 8 }, { CDF6( 7372, 13867, 18969, 23278, 26977, 30156), 32 << 8 }, }, .ccso = { { { CDF1(18469), 62 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 4949), 37 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(23470), 37 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 6666), 37 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(22914), 37 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 6993), 37 << 8 }, { CDF1(16384), 0 << 8 }, } }, .skip_txfm = { { CDF1(25865), 25 << 8 }, { CDF1(14316), 0 << 8 }, { CDF1( 4598), 0 << 8 }, { CDF1(25612), 6 << 8 }, { CDF1(12366), 1 << 8 }, { CDF1( 3320), 90 << 8 }, }, .dpcm = { { CDF1(16384) }, { CDF1(16384) }, }, .dpcm_dir = { { CDF1(16384) }, { CDF1(16384) }, }, .intra_y_set = { CDF3(28863, 31022, 31724), 93 << 8 }, .intra_y_idx0 = { { CDF7(15175, 20075, 21728, 24098, 26405, 27655, 28860), 5 << 8 }, { CDF7(10114, 14957, 16815, 19127, 20147, 25583, 27169), 0 << 8 }, { CDF7( 5636, 9004, 10456, 12122, 12744, 20325, 25607), 0 << 8 }, }, .intra_y_idx1 = { { CDF5(12743, 18172, 20194, 23648, 26419), 6 << 8 }, { CDF5( 8976, 16084, 20827, 24595, 28496), 75 << 8 }, { CDF5( 8784, 14556, 19710, 24903, 28724), 75 << 8 }, }, .fsc = { { { CDF1(30503), 3 << 8 }, { CDF1(31244), 3 << 8 }, { CDF1(32254), 78 << 8 }, { CDF1(32324), 93 << 8 }, { CDF1(32582), 93 << 8 }, { CDF1(32691), 93 << 8 }, }, { { CDF1(27437), 0 << 8 }, { 
CDF1(27242), 90 << 8 }, { CDF1(28040), 76 << 8 }, { CDF1(27589), 76 << 8 }, { CDF1(27234), 7 << 8 }, { CDF1(23583), 62 << 8 }, }, { { CDF1(26068), 75 << 8 }, { CDF1(22635), 75 << 8 }, { CDF1(22069), 6 << 8 }, { CDF1(19218), 32 << 8 }, { CDF1(13701), 31 << 8 }, { CDF1( 4636), 38 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(32016), 15 << 8 }, { CDF1(32403), 93 << 8 }, { CDF1(32583), 15 << 8 }, { CDF1(32683), 76 << 8 }, }, }, .mrl_index = { { CDF3(29573, 31193, 32023), 78 << 8 }, { CDF3(21812, 27066, 30279), 75 << 8 }, { CDF3(16076, 23806, 28762), 1 << 8 }, }, .multi_mrl = { { CDF1(19678), 6 << 8 }, { CDF1(12287), 6 << 8 }, { CDF1( 9574), 6 << 8 }, }, .pal_y = { CDF1(30045), 62 << 8 }, .pal_sz = { CDF6( 8779, 15095, 20777, 24903, 27923, 30403), 32 << 8 }, .dip = { { CDF1(11914), 31 << 8 }, { CDF1( 5128), 31 << 8 }, { CDF1( 3555), 31 << 8 }, }, .dip_mode = { CDF5( 5753, 15408, 19070, 22631, 27578), 0 << 8 }, .cfl = { { CDF1(20441), 30 << 8 }, { CDF1(11610), 6 << 8 }, { CDF1( 4643), 0 << 8 }, }, .intra_uv_mode = { { CDF7( 9363, 20957, 22865, 24753, 26411, 27983, 30428), 31 << 8 }, { CDF7(21282, 23610, 28208, 29311, 30348, 31158, 31491), 30 << 8 }, }, .mhccp = { CDF1(15499), 30 << 8 }, .mhccp_filter_dir = { { CDF2(10923, 21845), 0 << 8 }, { CDF2( 8795, 15105), 6 << 8 }, { CDF2(10433, 15974), 32 << 8 }, { CDF2(17085, 21689), 32 << 8 }, }, .cfl_type = { CDF1(12507), 30 << 8 }, .cfl_sign = { CDF7( 2421, 4332, 11256, 12766, 21386, 28725, 32087), 62 << 8 }, .cfl_alpha = { { CDF7(21679, 25305, 30646, 31512, 32537, 32646, 32696), 62 << 8 }, { CDF7( 8262, 16302, 24082, 29422, 31398, 32286, 32525), 62 << 8 }, { CDF7(17235, 26166, 30378, 31305, 32373, 32549, 32668), 62 << 8 }, { CDF7(17618, 25732, 27865, 30338, 31125, 31522, 32238), 62 << 8 }, { CDF7(17542, 23066, 27907, 28728, 30702, 31165, 31435), 62 << 8 }, { CDF7(17675, 24802, 30468, 30783, 31841, 32264, 32422), 62 << 8 }, }, .pal_idx_identity = { { CDF2(22515, 25751), 25 << 8 }, { CDF2( 4014, 5233), 
31 << 8 }, { CDF2( 3548, 4163), 33 << 8 }, { CDF2(12999, 32756), 56 << 8 }, }, .pal_idx = { { { CDF1(28140), 90 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 8582), 6 << 8 }, { CDF1(27413), 32 << 8 }, { CDF1(30429), 93 << 8 }, }, { { CDF2(25350, 29026), 90 << 8 }, { CDF2(11363, 25273), 7 << 8 }, { CDF2( 6841, 28579), 1 << 8 }, { CDF2(21350, 26012), 6 << 8 }, { CDF2(30573, 31646), 93 << 8 }, }, { { CDF3(23706, 26962, 29060), 0 << 8 }, { CDF3( 9976, 22516, 27382), 1 << 8 }, { CDF3( 6691, 25460, 29234), 6 << 8 }, { CDF3(18909, 23925, 28403), 31 << 8 }, { CDF3(30308, 31076, 31818), 93 << 8 }, }, { { CDF4(24116, 26957, 28486, 29941), 0 << 8 }, { CDF4( 9568, 20472, 24294, 28942), 81 << 8 }, { CDF4( 5706, 25243, 28040, 30406), 76 << 8 }, { CDF4(20105, 22982, 27024, 28911), 31 << 8 }, { CDF4(30897, 31342, 31766, 32199), 93 << 8 }, }, { { CDF5(20824, 24227, 25926, 27459, 29266), 75 << 8 }, { CDF5( 8141, 18989, 21599, 26182, 28576), 75 << 8 }, { CDF5( 5252, 24340, 26450, 28438, 30625), 75 << 8 }, { CDF5(19519, 22695, 25587, 26972, 28423), 6 << 8 }, { CDF5(30383, 30890, 31247, 31653, 32150), 78 << 8 }, }, { { CDF6(21628, 24512, 25873, 27054, 28131, 29539), 80 << 8 }, { CDF6( 8028, 18264, 20613, 25424, 27112, 28906), 90 << 8 }, { CDF6( 6489, 22242, 24461, 26394, 28350, 30510), 75 << 8 }, { CDF6(22048, 24429, 26990, 27944, 28417, 29574), 76 << 8 }, { CDF6(30801, 31205, 31472, 31728, 32005, 32305), 93 << 8 }, }, { { CDF7(22471, 25083, 25984, 26893, 27654, 28750, 29903), 93 << 8 }, { CDF7( 7542, 17057, 19151, 23550, 25459, 27066, 28804), 90 << 8 }, { CDF7( 7582, 20437, 22728, 24622, 26515, 28579, 30632), 90 << 8 }, { CDF7(22102, 24144, 26916, 28151, 28846, 29212, 30153), 0 << 8 }, { CDF7(30524, 30887, 31156, 31393, 31626, 31911, 32281), 93 << 8 }, } }, .intrabc_mode = { CDF1(29993), 6 << 8 }, .intrabc_precision = { CDF1(19778), 31 << 8 }, .morph_pred = { { CDF1(31715), 55 << 8 }, { CDF1(19667), 1 << 8 }, { CDF1(10555), 91 << 8 }, }, .tip = { { CDF1(30898), 93 << 8 }, { CDF1(19665), 0 
<< 8 }, { CDF1( 9477), 15 << 8 }, }, .comp = { { CDF1(26924), 15 << 8 }, { CDF1(25000), 1 << 8 }, { CDF1(17949), 1 << 8 }, { CDF1(13581), 6 << 8 }, { CDF1( 7034), 0 << 8 }, }, .single_ref = { { { CDF1(26469), 0 << 8 }, { CDF1(28870), 30 << 8 }, { CDF1(29662), 1 << 8 }, { CDF1(29867), 6 << 8 }, { CDF1(29772), 6 << 8 }, { CDF1(29776), 26 << 8 }, }, { { CDF1(13631), 6 << 8 }, { CDF1(18185), 37 << 8 }, { CDF1(19992), 32 << 8 }, { CDF1(18462), 62 << 8 }, { CDF1(17451), 37 << 8 }, { CDF1(11578), 62 << 8 }, }, { { CDF1( 2599), 0 << 8 }, { CDF1( 5203), 31 << 8 }, { CDF1( 5185), 31 << 8 }, { CDF1( 3671), 31 << 8 }, { CDF1( 3954), 6 << 8 }, { CDF1( 1633), 5 << 8 }, }, }, .comp0_ref = { { { CDF1( 9272), 32 << 8 }, { CDF1(17175), 62 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1( 1385), 0 << 8 }, { CDF1( 4439), 31 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1( 521), 0 << 8 }, { CDF1( 1854), 25 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, .comp1_ref = { { { { CDF1(30729), 75 << 8 }, { CDF1(29403), 5 << 8 }, { CDF1(29037), 6 << 8 }, { CDF1(29355), 31 << 8 }, { CDF1(28573), 5 << 8 }, { CDF1(27396), 7 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(19315), 31 << 8 }, { CDF1(27821), 6 << 8 }, { CDF1(27892), 31 << 8 }, { CDF1(28695), 30 << 8 }, { CDF1(29637), 51 << 8 }, }, }, { { { CDF1(30432), 0 << 8 }, { CDF1(20290), 31 << 8 }, { CDF1(19855), 37 << 8 }, { CDF1(18567), 62 << 8 }, { CDF1(18331), 37 << 8 }, { CDF1(14241), 62 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1( 5725), 31 << 8 }, { CDF1(13420), 31 << 8 }, { CDF1(12780), 32 << 8 }, { CDF1(10781), 62 << 8 }, { CDF1( 6424), 62 << 8 }, }, }, { { { CDF1(11634), 31 << 8 }, { CDF1(10093), 31 << 8 }, { CDF1( 6065), 31 << 8 }, { CDF1( 5408), 31 << 8 }, { CDF1( 6411), 31 << 8 }, { CDF1( 4075), 30 << 8 }, }, { { CDF1(16384), 0 
<< 8 }, { CDF1( 898), 15 << 8 }, { CDF1( 3127), 1 << 8 }, { CDF1( 1775), 5 << 8 }, { CDF1( 1217), 6 << 8 }, { CDF1( 591), 5 << 8 }, }, }, }, .tip_mode = { CDF1(22129), 31 << 8 }, .warp = { { CDF1(25999), 1 << 8 }, { CDF1(14478), 7 << 8 }, { CDF1(10868), 6 << 8 }, { CDF1( 5256), 31 << 8 }, { CDF1( 2722), 31 << 8 }, }, .warp_newmv = { CDF1(15095), 1 << 8 }, .inter_mode = { { CDF2(10043, 11100), 6 << 8 }, { CDF2(21561, 21758), 1 << 8 }, { CDF2(25411, 25714), 0 << 8 }, { CDF2(14117, 14341), 0 << 8 }, { CDF2(18288, 18577), 0 << 8 }, }, .amvd = { { { CDF1( 5980), 15 << 8 }, { CDF1( 6091), 15 << 8 }, { CDF1( 6237), 93 << 8 }, }, { { CDF1( 861), 30 << 8 }, { CDF1( 847), 50 << 8 }, { CDF1( 1198), 66 << 8 }, }, { { CDF1( 456), 53 << 8 }, { CDF1( 431), 68 << 8 }, { CDF1( 849), 68 << 8 }, }, { { CDF1( 409), 68 << 8 }, { CDF1( 385), 68 << 8 }, { CDF1( 581), 68 << 8 }, }, { { CDF1(16246), 0 << 8 }, { CDF1( 9696), 0 << 8 }, { CDF1( 8791), 0 << 8 }, }, { { CDF1(13199), 0 << 8 }, { CDF1(10624), 0 << 8 }, { CDF1( 8586), 1 << 8 }, }, { { CDF1( 5112), 1 << 8 }, { CDF1( 3920), 3 << 8 }, { CDF1( 3668), 3 << 8 }, }, { { CDF1(12017), 76 << 8 }, { CDF1(10177), 75 << 8 }, { CDF1( 9184), 75 << 8 }, }, { { CDF1(12111), 6 << 8 }, { CDF1( 8056), 5 << 8 }, { CDF1( 6641), 27 << 8 }, } }, .bawp = { { CDF1(26456), 31 << 8 }, { CDF1( 5121), 31 << 8 }, }, .bawp_explicit = { { CDF1(26966), 31 << 8 }, { CDF1(15275), 6 << 8 }, { CDF1(14613), 31 << 8 }, }, .bawp_explicit_scale = { CDF1(21998), 32 << 8 }, .warp_extend = { { CDF1(20359), 0 << 8 }, { CDF1(20310), 75 << 8 }, { CDF1(21759), 75 << 8 }, }, .warp_causal = { { CDF1(14877), 1 << 8 }, { CDF1(12801), 0 << 8 }, { CDF1( 6885), 6 << 8 }, { CDF1( 2987), 30 << 8 }, }, .interintra = { { CDF1(20569), 1 << 8 }, { CDF1(17106), 1 << 8 }, { CDF1(20948), 1 << 8 }, { CDF1(25796), 32 << 8 }, }, .interintra_mode = { { CDF3( 1819, 16131, 26802), 32 << 8 }, { CDF3( 1442, 15840, 28441), 1 << 8 }, { CDF3( 1995, 15814, 28221), 7 << 8 }, { CDF3( 3564, 15440, 28048), 32 
<< 8 }, }, .interintra_wedge = { CDF1(16758), 0 << 8 }, .wedge_quad = { CDF3( 6511, 18144, 27374), 1 << 8 }, .wedge_angle = { { CDF4(10258, 15276, 19997, 26561), 6 << 8 }, { CDF4(14039, 19183, 26143, 30047), 6 << 8 }, { CDF4(19564, 22099, 25104, 29960), 1 << 8 }, { CDF4(13808, 17950, 25715, 29008), 7 << 8 }, }, .wedge_dist = { CDF3( 8203, 16994, 21032), 75 << 8 }, .wedge_dist2 = { CDF2(14463, 19115), 75 << 8 }, .tip_drl_idx = { { CDF1(30662), 0 << 8 }, { CDF1(23823), 6 << 8 }, { CDF1(21676), 6 << 8 }, }, .jmvd_amvd_scale_mode = { CDF2(23178, 26812), 0 << 8 }, .jmvd_scale_mode = { CDF4(23180, 24894, 26548, 29872), 1 << 8 }, .drl_idx = { { { CDF1(15721), 90 << 8 }, { CDF1(21115), 0 << 8 }, { CDF1(19567), 1 << 8 }, { CDF1(17602), 93 << 8 }, { CDF1(13319), 93 << 8 }, }, { { CDF1(18692), 93 << 8 }, { CDF1(19343), 90 << 8 }, { CDF1(18207), 90 << 8 }, { CDF1(17908), 93 << 8 }, { CDF1(18304), 93 << 8 }, }, { { CDF1(22157), 90 << 8 }, { CDF1(23233), 90 << 8 }, { CDF1(22782), 90 << 8 }, { CDF1(22353), 93 << 8 }, { CDF1(22457), 93 << 8 }, } }, .mvprec_def = { { CDF1(20650), 31 << 8 }, { CDF1(15758), 31 << 8 }, { CDF1( 4571), 31 << 8 }, }, .mvprec_rem = { { { CDF2(10923, 21845), 0 << 8 }, { CDF2(31340, 32505), 78 << 8 }, { CDF2(26039, 32175), 0 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32197, 32676), 75 << 8 }, { CDF2(28679, 32626), 0 << 8 }, } }, .warp_ref_idx = { { CDF1(18903), 0 << 8 }, { CDF1(24500), 90 << 8 }, { CDF1(25360), 75 << 8 }, }, .amvd_joint = { CDF3( 4, 17705, 32748), 1 << 8 }, .amvd_index = { { CDF7(10549, 15298, 16241, 22533, 27449, 30520, 32080), 26 << 8 }, { CDF7( 9414, 14965, 15966, 22465, 27468, 30628, 32144), 26 << 8 }, }, .warpmv_with_mvd = { CDF1(18452), 1 << 8 }, .warp_delta_prec = { { CDF1(11830), 50 << 8 }, { CDF1(13323), 25 << 8 }, { CDF1(14893), 25 << 8 }, { CDF1(13952), 25 << 8 }, { CDF1(15633), 25 << 8 }, { CDF1(16040), 25 << 8 }, { CDF1(16736), 25 << 8 }, { CDF1(17451), 40 << 8 }, { CDF1(17823), 35 << 8 }, { CDF1(18893), 5 << 8 }, { 
CDF1(16384), 0 << 8 }, { CDF1(17968), 25 << 8 }, { CDF1(18210), 32 << 8 }, { CDF1(18671), 32 << 8 }, { CDF1(20345), 15 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(19398), 25 << 8 }, { CDF1(20146), 27 << 8 }, { CDF1(22585), 26 << 8 }, { CDF1(26435), 26 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(20066), 25 << 8 }, { CDF1(21783), 28 << 8 }, { CDF1(27357), 31 << 8 }, { CDF1(31107), 56 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, .warp_delta_param = { { { CDF7( 8995, 22970, 25406, 29673, 30295, 31670, 31863), 0 << 8 }, { CDF7(13333, 24012, 26545, 30183, 30839, 31958, 32139), 0 << 8 }, }, { { CDF7( 8959, 14388, 19825, 21810, 25035, 28077, 29469), 15 << 8 }, { CDF7( 9199, 14146, 19484, 21591, 24614, 28015, 29538), 0 << 8 }, } }, .warp_delta_sign = { CDF1(14285), 93 << 8 }, .warp_interintra = { { CDF1(16384), 0 << 8 }, { CDF1(27980), 56 << 8 }, { CDF1(29163), 56 << 8 }, { CDF1(30658), 62 << 8 }, }, .txsz_lossless = { { { CDF1(16384), 1 << 8 }, { CDF1(16384), 1 << 8 }, }, { { CDF1(16384), 75 << 8 }, { CDF1(16384), 75 << 8 }, }, { { CDF1(16384), 75 << 8 }, { CDF1(16384), 75 << 8 }, }, { { CDF1(16384), 75 << 8 }, { CDF1(16384), 75 << 8 }, }, }, .tx_split = { { { { CDF1(26330), 30 << 8 }, { CDF1(29620), 0 << 8 }, { CDF1(20420), 30 << 8 }, { CDF1(21694), 31 << 8 }, { CDF1(13317), 31 << 8 }, { CDF1(15391), 31 << 8 }, { CDF1(15952), 31 << 8 }, { CDF1(14736), 31 << 8 }, { CDF1(13810), 5 << 8 }, }, { { CDF1(31646), 55 << 8 }, { CDF1(32393), 25 << 8 }, { CDF1(30802), 26 << 8 }, { CDF1(30485), 31 << 8 }, { CDF1(20759), 32 << 8 }, { CDF1(22159), 1 << 8 }, { CDF1(26832), 31 << 8 }, { CDF1(27351), 1 << 8 }, { CDF1(24696), 1 << 8 }, }, }, { { { CDF1(29308), 0 << 8 }, { CDF1(32550), 93 << 8 }, { CDF1(27963), 1 << 8 }, { CDF1(27618), 31 << 8 }, { CDF1(22367), 7 << 8 }, { CDF1(23478), 35 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(27990), 0 << 8 }, }, { { 
CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, } } }, .tx_part_2d = { { { { CDF6(32744, 32748, 32752, 32756, 32760, 32764), 60 << 8 }, { CDF6( 3006, 22888, 27132, 29972, 29976, 32724), 31 << 8 }, { CDF6( 3673, 8849, 27652, 27656, 29944, 29948), 31 << 8 }, { CDF6( 5219, 19419, 28052, 28836, 29528, 31296), 31 << 8 }, { CDF6( 3055, 19649, 22157, 27038, 27466, 31646), 31 << 8 }, { CDF6( 6044, 11255, 26170, 26493, 28585, 29584), 31 << 8 }, { CDF6( 5896, 20361, 25685, 27552, 28695, 31097), 31 << 8 }, { CDF6( 2355, 17601, 21703, 26050, 27881, 31397), 32 << 8 }, { CDF6( 4701, 13502, 24958, 26413, 28166, 30129), 32 << 8 }, { CDF6( 3319, 16449, 21641, 27154, 29260, 31512), 32 << 8 }, { CDF6( 2526, 17088, 19643, 29378, 29382, 32724), 31 << 8 }, { CDF6( 2298, 4406, 23886, 23890, 30148, 30152), 31 << 8 }, { CDF6( 1553, 16160, 18679, 27983, 29592, 32139), 32 << 8 }, { CDF6( 2316, 4714, 22731, 23797, 29514, 30077), 37 << 8 }, }, { { CDF6(10790, 22602, 32736, 32740, 32744, 32748), 32 << 8 }, { CDF6(14384, 19116, 26545, 28687, 28691, 32724), 31 << 8 }, { CDF6(13910, 21690, 26343, 26347, 28432, 28436), 31 << 8 }, { CDF6(15891, 21712, 28890, 29571, 30307, 31363), 32 << 8 }, { CDF6(15946, 19447, 20270, 23423, 23858, 31148), 32 << 8 }, { CDF6(16053, 16957, 20312, 20924, 24770, 25959), 32 << 8 }, { CDF6(18479, 23053, 26582, 26966, 27567, 29836), 62 << 8 }, { CDF6( 9312, 11882, 14354, 17792, 19827, 29477), 62 << 8 }, { CDF6( 8490, 10668, 13295, 15353, 19952, 22575), 62 << 8 }, { CDF6( 6404, 12066, 16173, 20041, 24512, 28421), 62 << 8 }, { CDF6(10019, 14455, 17658, 27012, 27016, 32724), 37 << 8 }, { CDF6( 9479, 14904, 19374, 19378, 28027, 28031), 37 << 8 }, { CDF6( 3717, 7198, 8103, 20546, 23558, 31447), 62 << 8 }, { CDF6( 4058, 5429, 8987, 13978, 25126, 26655), 62 << 8 }, }, }, { { { CDF6(32434, 32490, 32545, 
32601, 32657, 32712), 50 << 8 }, { CDF6( 1491, 14241, 29930, 32517, 32524, 32724), 57 << 8 }, { CDF6( 1719, 16525, 31000, 31004, 32575, 32579), 30 << 8 }, { CDF6( 1645, 16749, 29324, 30425, 32016, 32485), 62 << 8 }, { CDF6( 2908, 15802, 24689, 28470, 32122, 32542), 25 << 8 }, { CDF6( 3470, 17931, 25841, 29589, 31907, 32465), 43 << 8 }, { CDF6( 5638, 19594, 28693, 29977, 30703, 32154), 50 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 2734, 12129, 30342, 31805, 31844, 32724), 50 << 8 }, { CDF6( 3849, 21783, 31043, 31056, 32181, 32193), 65 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, }, { { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, } } }, .tx_part_1d = { { { { CDF1(25131), 30 << 8 }, { CDF1(24514), 30 << 8 }, }, { { CDF1(19534), 6 << 8 }, { CDF1(18637), 0 << 8 }, }, }, { { { CDF1(30226), 50 << 8 }, { CDF1(30703), 33 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, } } }, .txtp_lossless = { CDF1(16384), 0 }, .txtp_long32_dct = { { CDF1( 36), 0 << 8 }, { CDF1( 128), 
31 << 8 }, }, .txtp_intra_short_1d = { { CDF3(10692, 26586, 29231), 31 << 8 }, { CDF3(26700, 32160, 32748), 6 << 8 }, { CDF3(26915, 32411, 32748), 6 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, // unused? }, .txtp_inter_short_1d = { { { CDF3( 8347, 20254, 24536), 31 << 8 }, { CDF3(15994, 26294, 32748), 1 << 8 }, { CDF3(21212, 27810, 32748), 1 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(21815, 25084, 26230), 62 << 8 }, { CDF3(29354, 31747, 32748), 37 << 8 }, { CDF3(31614, 32529, 32748), 36 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(10501, 20590, 24181), 31 << 8 }, { CDF3(17596, 26388, 32748), 37 << 8 }, { CDF3(15407, 26475, 32732), 60 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, } }, .txtp_ext = { { CDF6( 5026, 16816, 19974, 23404, 26845, 30499), 8 << 8 }, { CDF6(14910, 25257, 26964, 29323, 30237, 31535), 0 << 8 }, { CDF6(13759, 26108, 27688, 29793, 30265, 31576), 35 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, // unused }, .txtp_ext_reduced = { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, // unused? 
}, .txtp_inter_tx_set = { { { { CDF1(14122), 25 << 8 }, { CDF1( 8962), 31 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16423), 26 << 8 }, { CDF1(23446), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(23352), 25 << 8 }, { CDF1(17069), 56 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(20835), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(30720), 3 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(28857), 17 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .txtp_inter_set0 = { { { CDF7( 8914, 10732, 12270, 14822, 17128, 19586, 21964), 25 << 8 }, { CDF7( 1160, 1555, 1972, 3414, 3962, 5151, 5908), 31 << 8 }, { CDF7(22819, 24338, 25592, 27001, 28395, 29648, 30990), 25 << 8 }, }, { { CDF7( 1140, 1725, 2324, 14653, 19072, 23618, 28109), 31 << 8 }, { CDF7( 58, 261, 587, 32317, 32556, 32626, 32708), 2 << 8 }, { CDF7(17404, 17669, 18403, 24052, 26393, 28506, 30676), 57 << 8 }, }, }, .txtp_inter_set1 = { { CDF7( 3121, 6470, 9191, 12280, 17811, 22588, 27697), 93 << 8 }, { CDF7( 338, 377, 571, 743, 7932, 11860, 17524), 1 << 8 }, { CDF7( 3314, 7625, 10639, 14206, 19363, 23456, 28033), 93 << 8 }, }, .txtp_inter_set2 = { { CDF3( 8669, 16533, 24855), 93 << 8 }, { CDF3( 9441, 16413, 25276), 90 << 8 }, { CDF3( 8767, 17611, 24876), 10 << 8 }, }, .txtp_inter_dct_idtx = { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, .txtp_inter_dct_idtx_iddct = { { { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, }, { { 
CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, }, { { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, { CDF3(8192, 16384, 24576), 0 << 8 }, }, }, .stx = { { { CDF3( 4486, 15589, 26440), 75 << 8 }, { CDF3( 2357, 9504, 16641), 5 << 8 }, { CDF3( 1364, 8034, 14431), 0 << 8 }, { CDF3( 2472, 8725, 13853), 76 << 8 }, { CDF3( 7523, 11681, 14783), 1 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, // unused? { CDF3( 8192, 16384, 24576), 0 << 8 }, // unused? { CDF3(10490, 15394, 22206), 31 << 8 }, { CDF3(13967, 16953, 23109), 31 << 8 }, { CDF3(20191, 21727, 25818), 32 << 8 }, }, }, .stx_set = { CDF6(20712, 26263, 30623, 32732, 32736, 32740), 55 << 8, }, .stx_set_adst = { CDF3(15897, 26144, 30010), 5 << 8 }, .cctx = { CDF6(14350, 14836, 16054, 17075, 19408, 28530), 62 << 8 }, .comp_mode_sameref = { { CDF3( 6386, 23344, 23348), 31 << 8 }, { CDF3(10945, 24709, 24714), 31 << 8 }, { CDF3(11517, 25230, 25258), 31 << 8 }, { CDF3( 7563, 22176, 22180), 31 << 8 }, { CDF3( 6629, 20955, 20966), 6 << 8 }, }, .comp_mode_joint = { { CDF1(24720), 0 << 8 }, { CDF1(32764), 0 << 8 }, }, .comp_mode = { { CDF4(12177, 20001, 23193, 24448), 26 << 8 }, { CDF4(21192, 25117, 27806, 27948), 31 << 8 }, { CDF4(26779, 28724, 30192, 30249), 25 << 8 }, { CDF4(12506, 17871, 21295, 21389), 31 << 8 }, { CDF4(16948, 20335, 22582, 22617), 31 << 8 }, }, .opfl = { { CDF1(11582), 31 << 8 }, { CDF1(24076), 26 << 8 }, }, .refine_mv = { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(20204), 25 << 8 }, { CDF1(17614), 43 << 8 }, { CDF1(24443), 32 << 8 }, { CDF1(16384), 0 << 8 }, }, .comp_type_masked = { { CDF1(17510), 26 << 8 }, { CDF1(10382), 0 << 8 }, { CDF1( 8336), 31 << 8 }, { CDF1( 6054), 1 << 8 }, { CDF1( 5764), 5 
<< 8 }, { CDF1( 7154), 10 << 8 }, { CDF1(15013), 26 << 8 }, { CDF1( 8426), 0 << 8 }, { CDF1( 8278), 1 << 8 }, { CDF1( 4856), 1 << 8 }, { CDF1( 3464), 0 << 8 }, { CDF1( 5295), 0 << 8 }, }, .comp_type_weighted = { CDF1(16894), 31 << 8 }, .cwp_idx = { { CDF1(21704), 56 << 8 }, { CDF1(15990), 31 << 8 }, { CDF1(12544), 57 << 8 }, { CDF1(25638), 62 << 8 }, }, .filter = { { CDF2(29975, 32748), 25 << 8 }, { CDF2( 2076, 32703), 75 << 8 }, { CDF2( 19, 1768), 15 << 8 }, { CDF2(17314, 27415), 31 << 8 }, { CDF2(31286, 31994), 0 << 8 }, { CDF2( 9581, 32608), 31 << 8 }, { CDF2( 535, 1036), 0 << 8 }, { CDF2(24819, 27722), 31 << 8 }, }, .seg_pred = { { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, }, .seg_id_ext = { { CDF1(16384) }, { CDF1(16384) }, { CDF1(16384) }, }, .seg_id = { { { CDF7( 5622, 7893, 16093, 18233, 27809, 28373, 32533) }, { CDF7(14274, 18230, 22557, 24935, 29980, 30851, 32344) }, { CDF7(27527, 28487, 28723, 28890, 32397, 32647, 32679) }, }, { { CDF7( 4096, 8192, 12288, 16384, 20480, 24576, 28672) }, { CDF7( 4096, 8192, 12288, 16384, 20480, 24576, 28672) }, { CDF7( 4096, 8192, 12288, 16384, 20480, 24576, 28672) }, } }, .delta_q = { CDF7(16594, 23325, 26424, 28225, 29358, 30099, 30613), 56 << 8 }, }, .mv = { .shell_set = { CDF1(31579), 25 << 8 }, .shell_lower = { { CDF4( 4460, 12999, 22505, 30840), 38 << 8 }, { CDF5( 7519, 18907, 25563, 29875, 31983), 27 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF6( 8680, 13723, 18208, 22686, 26722, 30020), 5 << 8 }, { CDF6( 4324, 15300, 23690, 28697, 31282, 32359), 1 << 8 }, { CDF7( 7497, 17301, 23848, 27438, 29395, 30879, 32003), 31 << 8 }, { CDF7(10667, 20239, 25883, 29670, 31400, 32153, 32579), 0 << 8 }, }, .shell_upper = { { CDF5(21329, 30564, 32589, 32649, 32708), 50 << 8 }, { CDF5(24250, 31806, 32676, 32722, 32732), 50 << 8 }, { CDF6( 4681, 9362, 14043, 18725, 23406, 28087), 0 << 8 }, { CDF6(19978, 30160, 32564, 32732, 32736, 32740), 1 << 8 }, { CDF7(19707, 28414, 31240, 31648, 32692, 32717, 32721), 
25 << 8 }, { CDF7(18469, 27427, 31562, 32652, 32724, 32728, 32732), 31 << 8 }, { CDF7(17810, 25196, 29372, 31953, 32564, 32720, 32724), 55 << 8 }, }, .shell_tip = { CDF1(16384), 0 << 8 }, .shell_offset_low = { { CDF1(14587), 36 << 8 }, { CDF1(20966), 75 << 8 }, }, .shell_offset_cl2 = { CDF1(13189), 0 << 8 }, .shell_offset_hi = { { CDF1(17943), 93 << 8 }, { CDF1(18934), 93 << 8 }, { CDF1(18928), 93 << 8 }, { CDF1(18696), 93 << 8 }, { CDF1(19044), 93 << 8 }, { CDF1(20362), 93 << 8 }, { CDF1(20426), 93 << 8 }, { CDF1(22563), 93 << 8 }, { CDF1(22190), 93 << 8 }, { CDF1(23458), 90 << 8 }, { CDF1(26227), 2 << 8 }, { CDF1(30765), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, .col_component = { { CDF1( 5663), 25 << 8 }, { CDF1( 4856), 90 << 8 }, }, .col_index = { { CDF1(13445), 1 << 8 }, { CDF1(13541), 1 << 8 }, { CDF1(14045), 1 << 8 }, { CDF1(12888), 31 << 8 }, } }, }; static const CdfCoefContext default_coef_cdf[4] = { [0] = { .skip = { { { { CDF1(25759), 31 << 8 }, { CDF1( 1099), 93 << 8 }, { CDF1( 2762), 5 << 8 }, { CDF1( 7944), 26 << 8 }, { CDF1(16230), 6 << 8 }, { CDF1(29076), 31 << 8 }, { CDF1( 8898), 1 << 8 }, { CDF1(13655), 1 << 8 }, { CDF1(22348), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(24799), 56 << 8 }, { CDF1( 3162), 75 << 8 }, { CDF1( 4813), 0 << 8 }, { CDF1(13562), 1 << 8 }, { CDF1(17989), 76 << 8 }, { CDF1(31799), 0 << 8 }, { CDF1( 5170), 75 << 8 }, { CDF1(12844), 1 << 8 }, { CDF1(22336), 6 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(18946), 62 << 8 }, { CDF1( 5651), 6 << 8 }, { CDF1( 6193), 7 << 8 }, { CDF1(15587), 7 << 8 }, { CDF1(18312), 91 << 8 }, { CDF1(31798), 80 << 8 }, { CDF1( 2961), 0 << 8 }, { CDF1(11605), 26 << 8 }, { CDF1(24657), 31 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1( 7809), 62 << 8 }, { CDF1( 3864), 33 << 8 }, { CDF1(14130), 40 << 8 }, { CDF1(16018), 32 << 8 }, { CDF1(15599), 10 << 8 }, { CDF1(28161), 32 << 8 }, { CDF1( 1300), 36 << 8 }, { CDF1(11703), 57 << 8 
}, { CDF1(24729), 33 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1( 2800), 62 << 8 }, { CDF1( 159), 32 << 8 }, { CDF1( 1725), 50 << 8 }, { CDF1(11138), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(32319), 0 << 8 }, { CDF1( 456), 63 << 8 }, { CDF1( 6416), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(32764), 0 << 8 }, { CDF1(10751), 16 << 8 }, { CDF1(17579), 16 << 8 }, { CDF1(24060), 90 << 8 }, { CDF1(28085), 5 << 8 }, { CDF1(30928), 93 << 8 }, { CDF1( 5437), 37 << 8 }, { CDF1(17819), 26 << 8 }, { CDF1(28074), 8 << 8 }, { CDF1(30432), 80 << 8 }, }, { { CDF1(32452), 62 << 8 }, { CDF1(14328), 93 << 8 }, { CDF1(18927), 93 << 8 }, { CDF1(22405), 93 << 8 }, { CDF1(25460), 93 << 8 }, { CDF1(30634), 75 << 8 }, { CDF1( 5152), 31 << 8 }, { CDF1(15362), 31 << 8 }, { CDF1(24814), 6 << 8 }, { CDF1(31865), 0 << 8 }, }, { { CDF1(31891), 62 << 8 }, { CDF1(17014), 93 << 8 }, { CDF1(19836), 93 << 8 }, { CDF1(20450), 6 << 8 }, { CDF1(25131), 1 << 8 }, { CDF1(31538), 3 << 8 }, { CDF1( 5465), 25 << 8 }, { CDF1(17440), 31 << 8 }, { CDF1(28788), 1 << 8 }, { CDF1(29135), 37 << 8 }, }, { { CDF1(31092), 62 << 8 }, { CDF1(11952), 28 << 8 }, { CDF1( 1928), 0 << 8 }, { CDF1(19906), 12 << 8 }, { CDF1(32150), 50 << 8 }, { CDF1(31347), 52 << 8 }, { CDF1( 9362), 10 << 8 }, { CDF1(29127), 50 << 8 }, { CDF1(32373), 50 << 8 }, { CDF1(27387), 60 << 8 }, }, { { CDF1(26822), 63 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .eob_bin_16 = { { CDF4( 1946, 3059, 6834, 15123), 6 << 8 }, { CDF4( 558, 835, 2739, 6534), 25 << 8 }, { CDF4( 8000, 10366, 14466, 19569), 31 << 8 }, }, .eob_bin_32 = { { CDF5( 1203, 1920, 4502, 10281, 17078), 6 << 8 }, { CDF5( 1078, 1340, 2464, 5152, 11416), 5 << 8 }, { CDF5(11670, 14271, 19287, 23919, 27084), 31 << 8 }, }, .eob_bin_64 = { { CDF6( 1848, 2357, 
4392, 7629, 13328, 21530), 32 << 8 }, { CDF6( 1476, 1703, 2435, 4245, 7520, 14818), 6 << 8 }, { CDF6(13125, 16136, 21482, 26341, 29681, 31034), 31 << 8 }, }, .eob_bin_128 = { { CDF7( 336, 988, 2984, 5382, 9800, 16995, 24488), 32 << 8 }, { CDF7( 1506, 2078, 2996, 4361, 7027, 12117, 21037), 6 << 8 }, { CDF7(10069, 12663, 18436, 24325, 28451, 30559, 31560), 31 << 8 }, }, .eob_bin_256 = { { CDF7( 321, 2376, 5427, 7836, 12726, 18552, 21084), 32 << 8 }, { CDF7( 1954, 2587, 3867, 5133, 7560, 10574, 14742), 37 << 8 }, { CDF7( 7984, 10878, 15991, 20185, 25357, 29358, 31262), 32 << 8 }, }, .eob_bin_512 = { { CDF7( 139, 5954, 13032, 16244, 20068, 24954, 25897), 32 << 8 }, { CDF7( 3412, 3896, 5734, 7524, 10117, 13773, 16441), 62 << 8 }, { CDF7(12810, 16127, 22415, 25016, 27768, 29850, 31372), 32 << 8 }, }, .eob_bin_1024 = { { CDF7( 576, 1953, 8086, 14995, 20006, 24682, 25870), 62 << 8 }, { CDF7( 2747, 3019, 3935, 4876, 6051, 8588, 10135), 57 << 8 }, { CDF7(11873, 15379, 25835, 27918, 29093, 30164, 30782), 32 << 8 }, }, .eob_hi_bit = { CDF1(16391), 0 << 8 }, .eob_base_y_tok_hf = { { { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(25475, 29789), 75 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32132, 32609), 50 << 8 }, { CDF2(32294, 32701), 75 << 8 }, { CDF2(31618, 32500), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32658, 32751), 78 << 8 }, { CDF2(32701, 32753), 93 << 8 }, { CDF2(32675, 32745), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32658, 32719), 65 << 8 }, { CDF2(32658, 32744), 67 << 8 }, { CDF2(32647, 32756), 33 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32538, 32711), 0 << 8 }, { CDF2(32138, 32453), 0 << 8 }, { CDF2(32356, 32562), 50 << 8 }, }, }, .eob_base_y_tok_lf = { { { CDF4(27486, 31140, 31779, 32064), 7 << 8 }, { CDF4(28263, 31142, 31813, 32057), 6 << 8 }, { CDF4(27578, 30405, 31202, 31448), 6 << 8 }, { CDF4(29800, 32145, 32589, 32665), 93 << 8 }, }, { { CDF4(27197, 31209, 
31698, 31944), 32 << 8 }, { CDF4(30415, 31987, 32302, 32387), 0 << 8 }, { CDF4(30766, 32160, 32461, 32564), 75 << 8 }, { CDF4(31446, 32510, 32712, 32740), 90 << 8 }, }, { { CDF4( 9881, 28329, 30354, 31217), 62 << 8 }, { CDF4(30815, 32135, 32360, 32446), 5 << 8 }, { CDF4(31896, 32058, 32423, 32525), 37 << 8 }, { CDF4(27097, 28987, 31508, 32138), 50 << 8 }, }, { { CDF4( 76, 3661, 13759, 21509), 62 << 8 }, { CDF4(31910, 32583, 32688, 32740), 93 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, { { CDF4( 298, 596, 894, 1192), 50 << 8 }, { CDF4(29008, 30611, 32490, 32683), 3 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, }, .base_y_tok_hf = { { { { CDF3(12360, 26392, 29943), 26 << 8 }, { CDF3( 8590, 25353, 30895), 6 << 8 }, }, { { CDF3( 7246, 19496, 26530), 75 << 8 }, { CDF3( 3468, 16232, 25621), 90 << 8 }, }, { { CDF3( 4008, 12605, 18928), 93 << 8 }, { CDF3( 1999, 9165, 16685), 93 << 8 }, }, { { CDF3( 3148, 9393, 14900), 93 << 8 }, { CDF3( 1110, 7696, 14122), 78 << 8 }, }, { { CDF3( 2543, 7526, 12021), 93 << 8 }, { CDF3( 2561, 5445, 10546), 93 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 
}, }, { { CDF3(28014, 31534, 32060), 5 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(13135, 23487, 28599), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 7049, 15368, 20768), 93 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 3109, 8054, 12383), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(25298, 32426, 32713), 93 << 8 }, { CDF3(14651, 31551, 32591), 93 << 8 }, }, { { CDF3(18466, 30483, 32267), 75 << 8 }, { CDF3( 306, 26643, 31787), 90 << 8 }, }, { { CDF3(12399, 26150, 30691), 90 << 8 }, { CDF3( 1131, 21210, 29129), 75 << 8 }, }, { { CDF3( 7487, 20115, 26881), 93 << 8 }, { CDF3( 656, 14639, 24430), 75 << 8 }, }, { { CDF3( 3588, 11042, 17169), 78 << 8 }, { CDF3( 756, 7796, 14913), 78 << 8 }, }, { { CDF3(26570, 32443, 32691), 78 << 8 }, { CDF3(17559, 31708, 32628), 78 << 8 }, }, { { CDF3(18312, 30354, 32288), 90 << 8 }, { CDF3( 581, 26605, 31701), 90 << 8 }, }, { { CDF3(11812, 25791, 30572), 75 << 8 }, { CDF3( 1073, 20736, 28973), 75 << 8 }, }, { { CDF3( 7165, 19304, 26609), 93 << 8 }, { CDF3( 971, 14401, 24415), 93 << 8 }, }, { { CDF3( 3978, 12366, 19160), 93 << 8 }, { CDF3( 935, 8823, 16611), 78 << 8 }, }, { { CDF3(25670, 32230, 32678), 75 << 8 }, { CDF3(19556, 31721, 32677), 0 << 8 }, }, { { CDF3(17121, 29735, 32054), 75 << 8 }, { CDF3( 1460, 26595, 31623), 90 << 8 }, }, { { CDF3(11589, 25561, 30426), 90 << 8 }, { CDF3( 1652, 20606, 28939), 78 << 8 }, }, { { CDF3( 7343, 20631, 27574), 90 << 8 }, { CDF3( 922, 15328, 25115), 75 << 8 }, }, { { CDF3( 4778, 14146, 21151), 90 << 8 }, { CDF3( 902, 9679, 18420), 78 << 8 }, }, { { CDF3(30964, 32538, 32693), 5 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(18212, 29081, 31586), 93 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(10647, 22153, 27538), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 6396, 16045, 22849), 0 << 8 }, { CDF3( 8192, 
16384, 24576), 0 << 8 }, }, { { CDF3( 3574, 10559, 16078), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(28697, 32692, 32748), 90 << 8 }, { CDF3(13717, 32316, 32740), 90 << 8 }, }, { { CDF3(24783, 32374, 32712), 90 << 8 }, { CDF3( 1229, 30871, 32600), 75 << 8 }, }, { { CDF3(19000, 30799, 32442), 75 << 8 }, { CDF3( 2583, 27380, 31887), 15 << 8 }, }, { { CDF3(10656, 24633, 30053), 75 << 8 }, { CDF3( 882, 19759, 28348), 75 << 8 }, }, { { CDF3( 4024, 11749, 17822), 75 << 8 }, { CDF3( 311, 8254, 15425), 75 << 8 }, }, { { CDF3(29761, 32708, 32748), 75 << 8 }, { CDF3(18251, 32567, 32748), 0 << 8 }, }, { { CDF3(24756, 32428, 32722), 75 << 8 }, { CDF3( 1802, 30951, 32542), 93 << 8 }, }, { { CDF3(18650, 30593, 32394), 75 << 8 }, { CDF3( 2883, 27721, 31874), 78 << 8 }, }, { { CDF3(10280, 24576, 29914), 75 << 8 }, { CDF3( 288, 19075, 28254), 1 << 8 }, }, { { CDF3( 3976, 12847, 19261), 75 << 8 }, { CDF3( 754, 8850, 16554), 3 << 8 }, }, { { CDF3(30346, 32725, 32748), 75 << 8 }, { CDF3(24555, 32672, 32748), 5 << 8 }, }, { { CDF3(24866, 32414, 32726), 75 << 8 }, { CDF3( 857, 31274, 32665), 90 << 8 }, }, { { CDF3(18664, 30690, 32404), 93 << 8 }, { CDF3( 2459, 27485, 31962), 93 << 8 }, }, { { CDF3(10302, 24628, 30393), 93 << 8 }, { CDF3( 923, 19581, 28849), 78 << 8 }, }, { { CDF3( 5108, 14969, 22262), 78 << 8 }, { CDF3( 637, 11172, 20006), 78 << 8 }, }, { { CDF3(32114, 32706, 32748), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(22647, 31591, 32522), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(14439, 26903, 30661), 6 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 7410, 19635, 26647), 1 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 3267, 10417, 16526), 92 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(29459, 32676, 32741), 26 << 8 }, { CDF3(12621, 32434, 32748), 83 << 8 }, }, { { CDF3(23667, 32112, 32701), 75 << 8 }, { CDF3( 5563, 30354, 32581), 75 << 8 }, }, { { CDF3(19383, 30660, 32564), 
75 << 8 }, { CDF3( 3845, 27375, 31697), 90 << 8 }, }, { { CDF3(10075, 24894, 29897), 90 << 8 }, { CDF3( 2073, 19727, 27880), 78 << 8 }, }, { { CDF3( 3368, 9398, 13892), 81 << 8 }, { CDF3( 571, 6307, 12048), 81 << 8 }, }, { { CDF3(30541, 32712, 32748), 18 << 8 }, { CDF3(17105, 32518, 32748), 90 << 8 }, }, { { CDF3(26380, 32437, 32736), 78 << 8 }, { CDF3( 1892, 31537, 32682), 91 << 8 }, }, { { CDF3(20557, 31281, 32473), 90 << 8 }, { CDF3( 3148, 28251, 32121), 0 << 8 }, }, { { CDF3(10605, 25976, 30389), 76 << 8 }, { CDF3( 3627, 20150, 28517), 76 << 8 }, }, { { CDF3( 3792, 11134, 15907), 81 << 8 }, { CDF3( 517, 7539, 13580), 81 << 8 }, }, { { CDF3(30408, 32748, 32752), 75 << 8 }, { CDF3(32291, 32705, 32748), 75 << 8 }, }, { { CDF3(25730, 32585, 32739), 90 << 8 }, { CDF3( 5942, 31616, 32616), 93 << 8 }, }, { { CDF3(20185, 31076, 32464), 78 << 8 }, { CDF3( 1004, 28344, 32094), 78 << 8 }, }, { { CDF3(11707, 26199, 30689), 93 << 8 }, { CDF3( 1262, 21328, 29569), 93 << 8 }, }, { { CDF3( 5035, 14930, 21229), 90 << 8 }, { CDF3( 746, 10699, 18756), 90 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(28383, 32609, 32688), 10 << 8 }, { CDF3(15428, 32067, 32704), 35 << 8 }, }, { { CDF3(15754, 30788, 32588), 90 << 8 }, { CDF3( 4880, 29631, 31606), 0 << 8 }, }, { { CDF3(10550, 24328, 32147), 75 << 8 }, { CDF3( 8738, 22209, 29673), 75 << 8 }, }, { { CDF3( 9695, 22879, 26951), 25 << 8 }, { CDF3( 1798, 15785, 27773), 75 << 8 }, }, { { CDF3( 3427, 10282, 14127), 93 << 8 }, { CDF3( 1687, 7815, 14031), 78 << 8 }, }, { { CDF3(31560, 32315, 32617), 65 << 8 }, { CDF3(12105, 31933, 32351), 75 << 8 }, }, { { 
CDF3(20436, 29949, 32063), 0 << 8 }, { CDF3( 6712, 28623, 32571), 25 << 8 }, }, { { CDF3( 9717, 28135, 31186), 15 << 8 }, { CDF3( 8249, 24519, 30362), 40 << 8 }, }, { { CDF3( 7123, 25288, 28494), 10 << 8 }, { CDF3(11556, 14880, 27228), 10 << 8 }, }, { { CDF3( 4041, 10893, 15462), 5 << 8 }, { CDF3( 794, 10029, 17278), 18 << 8 }, }, { { CDF3(28664, 32744, 32748), 0 << 8 }, { CDF3(31632, 32700, 32740), 15 << 8 }, }, { { CDF3(26708, 32579, 32742), 78 << 8 }, { CDF3( 6672, 31742, 32705), 93 << 8 }, }, { { CDF3(20833, 31645, 32639), 93 << 8 }, { CDF3( 1146, 29311, 32329), 93 << 8 }, }, { { CDF3(13676, 28227, 31721), 93 << 8 }, { CDF3( 1479, 24229, 30975), 78 << 8 }, }, { { CDF3( 5792, 15556, 21640), 91 << 8 }, { CDF3( 1677, 10836, 19468), 91 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, }, .base_y_tok_lf = { { { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1828, 16851, 24012, 28649, 30422), 43 << 8 }, { CDF5( 4, 8171, 22012, 28211, 30477), 78 << 8 }, }, { { CDF5( 6923, 16016, 21706, 27149, 29436), 3 << 8 }, { CDF5( 4, 7404, 16572, 22804, 28278), 83 << 8 }, }, { { CDF5( 5490, 8820, 15814, 20244, 24021), 92 << 8 }, { CDF5( 4, 5052, 13398, 18782, 23391), 2 << 8 }, }, { { CDF5( 3032, 8030, 13087, 17462, 21741), 0 << 8 }, { CDF5( 4, 4819, 10626, 13564, 18007), 16 << 8 }, }, { { CDF5( 2261, 6418, 9159, 11973, 15591), 78 << 8 }, { CDF5( 5, 3913, 9731, 12580, 13840), 92 << 8 }, }, { { CDF5( 2300, 5287, 8547, 12143, 15837), 90 << 8 }, { CDF5( 5, 2607, 6044, 8929, 11979), 90 << 8 }, }, { { CDF5( 1698, 5197, 8275, 11449, 12212), 18 
<< 8 }, { CDF5( 49, 2125, 4169, 7345, 10592), 0 << 8 }, }, { { CDF5( 588, 2906, 4192, 5998, 7090), 93 << 8 }, { CDF5( 1289, 1297, 3318, 4626, 5829), 93 << 8 }, }, { { CDF5(12754, 29010, 31539, 32136, 32523), 75 << 8 }, { CDF5( 5740, 26647, 31969, 32459, 32732), 10 << 8 }, }, { { CDF5( 7974, 23312, 28743, 31187, 32129), 78 << 8 }, { CDF5( 4, 19122, 28115, 31351, 32328), 90 << 8 }, }, { { CDF5( 6004, 17753, 25489, 28906, 30692), 78 << 8 }, { CDF5( 4, 11476, 20286, 25973, 29118), 0 << 8 }, }, { { CDF5( 5318, 12906, 20831, 25848, 28911), 93 << 8 }, { CDF5( 4, 8395, 15889, 22378, 27016), 90 << 8 }, }, { { CDF5( 3337, 10161, 16413, 20903, 24729), 93 << 8 }, { CDF5( 4, 6231, 13179, 19032, 23124), 78 << 8 }, }, { { CDF5( 2632, 8256, 13389, 18349, 22057), 93 << 8 }, { CDF5( 4, 5539, 10753, 15541, 19671), 90 << 8 }, }, { { CDF5( 1647, 4981, 8018, 10713, 12930), 78 << 8 }, { CDF5( 133, 2595, 5758, 9172, 12046), 78 << 8 }, }, { { CDF5(17458, 29871, 32000, 32546, 32702), 93 << 8 }, { CDF5( 6925, 27088, 31750, 32368, 32706), 93 << 8 }, }, { { CDF5(10512, 24503, 29646, 31529, 32218), 93 << 8 }, { CDF5( 862, 17944, 27174, 30714, 32074), 90 << 8 }, }, { { CDF5( 6509, 17436, 24062, 28298, 30439), 93 << 8 }, { CDF5( 730, 11602, 20450, 26039, 29031), 90 << 8 }, }, { { CDF5( 4334, 12843, 19639, 24807, 27809), 90 << 8 }, { CDF5( 1182, 8304, 16062, 22058, 26408), 93 << 8 }, }, { { CDF5( 2763, 7942, 12551, 16873, 20575), 93 << 8 }, { CDF5( 588, 5113, 10255, 14725, 18439), 93 << 8 }, }, { { CDF5(25261, 30274, 31401, 31859, 32108), 5 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11341, 22787, 27330, 29329, 30402), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7870, 16597, 22314, 26036, 27646), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9102, 15212, 19596, 22630, 24714), 5 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2627, 7584, 12537, 16148, 19228), 78 << 8 }, { CDF5( 
5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1773, 5159, 8384, 11582, 14326), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1281, 3306, 5564, 7730, 10025), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(28275, 32087, 32554, 32679, 32732), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(13124, 24281, 29249, 31039, 31932), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7252, 15503, 21990, 26021, 28329), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6766, 13251, 18520, 22234, 24944), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2022, 6282, 10559, 13831, 16913), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5( 8794, 30831, 31512, 32035, 32245), 60 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4155, 20109, 27969, 30536, 32143), 0 << 8 }, { CDF5( 4, 10300, 22300, 28132, 30839), 75 << 8 }, }, { { CDF5(13302, 23761, 27348, 29952, 31476), 1 << 8 }, { CDF5( 4, 14590, 24086, 28071, 29718), 1 << 8 }, }, { { CDF5( 6132, 14425, 20760, 25122, 27553), 76 << 8 }, { CDF5( 4, 7592, 16048, 21694, 25655), 81 << 8 }, }, { { CDF5( 3376, 10282, 14415, 19084, 23156), 1 << 8 }, { CDF5( 4, 5986, 12540, 17467, 21046), 75 << 8 }, }, { { CDF5( 2339, 7935, 12252, 15518, 18022), 75 << 8 }, { CDF5( 4, 4807, 8560, 13742, 17846), 93 << 8 }, }, { { CDF5( 1008, 5061, 9267, 11938, 15655), 91 << 8 }, { CDF5( 2687, 2712, 6941, 10360, 13721), 90 << 8 }, }, { { CDF5( 1067, 5070, 8639, 11287, 13789), 75 << 8 }, { CDF5( 4, 3968, 5983, 9072, 11954), 93 << 8 }, }, { { CDF5( 513, 3188, 4940, 6642, 8326), 93 << 8 }, { CDF5( 1598, 1604, 3335, 5390, 7186), 93 << 8 }, }, { { CDF5(18364, 30931, 32563, 32686, 32732), 78 << 8 }, { CDF5(20105, 28028, 32148, 32696, 32722), 76 << 8 }, }, { { CDF5(14335, 27700, 31502, 32326, 32673), 18 << 8 }, { CDF5( 4, 23820, 30576, 
32277, 32675), 75 << 8 }, }, { { CDF5(13107, 25752, 29932, 31638, 32381), 75 << 8 }, { CDF5( 4, 19582, 27260, 30379, 31695), 90 << 8 }, }, { { CDF5( 6458, 17966, 24490, 28327, 30455), 93 << 8 }, { CDF5( 4, 12996, 21614, 26979, 29871), 78 << 8 }, }, { { CDF5( 4362, 12372, 18835, 23784, 27252), 93 << 8 }, { CDF5( 4, 8398, 15799, 21724, 25631), 90 << 8 }, }, { { CDF5( 3430, 10462, 16282, 20919, 24340), 75 << 8 }, { CDF5( 30, 6110, 12299, 17850, 22504), 75 << 8 }, }, { { CDF5( 1926, 5825, 9243, 12462, 15052), 78 << 8 }, { CDF5( 72, 3367, 7142, 10595, 13743), 78 << 8 }, }, { { CDF5(24383, 32129, 32637, 32713, 32732), 93 << 8 }, { CDF5( 9137, 30561, 32521, 32699, 32732), 90 << 8 }, }, { { CDF5(17270, 29740, 32008, 32483, 32655), 93 << 8 }, { CDF5( 8, 25977, 31428, 32465, 32657), 75 << 8 }, }, { { CDF5(12268, 25877, 30422, 31961, 32485), 90 << 8 }, { CDF5( 12, 20569, 28767, 31457, 32359), 90 << 8 }, }, { { CDF5( 7294, 18995, 26081, 29672, 31452), 93 << 8 }, { CDF5( 181, 14139, 23494, 28458, 30918), 93 << 8 }, }, { { CDF5( 3440, 9808, 15386, 19904, 23390), 93 << 8 }, { CDF5( 139, 6455, 12703, 17797, 21832), 90 << 8 }, }, { { CDF5(28752, 31869, 32320, 32478, 32572), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(15670, 27564, 30368, 31493, 31993), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9799, 20178, 25558, 28413, 30046), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9356, 16500, 21207, 24352, 26419), 6 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2981, 9131, 14182, 18343, 21454), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2154, 6775, 11531, 15084, 18264), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1257, 3940, 6890, 9699, 12030), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(30668, 32581, 32693, 32736, 32740), 5 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 
27307), 0 << 8 }, }, { { CDF5(18504, 29402, 31702, 32283, 32519), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(10766, 22532, 27978, 30115, 31262), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 8719, 18462, 24428, 27561, 29580), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 3879, 10481, 15972, 19633, 22761), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(18644, 30427, 32255, 32544, 32664), 37 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4549, 25234, 29518, 31595, 31998), 1 << 8 }, { CDF5( 4, 12584, 25285, 29517, 31566), 76 << 8 }, }, { { CDF5(12259, 24561, 28578, 30638, 31755), 80 << 8 }, { CDF5( 4, 18211, 26628, 30288, 31028), 25 << 8 }, }, { { CDF5( 5548, 13583, 19991, 25062, 27790), 15 << 8 }, { CDF5( 4, 8981, 17065, 22847, 26409), 75 << 8 }, }, { { CDF5( 2511, 8771, 13709, 18515, 21631), 90 << 8 }, { CDF5( 4, 6091, 12402, 16401, 21384), 75 << 8 }, }, { { CDF5( 1456, 6940, 12018, 15507, 18576), 15 << 8 }, { CDF5( 3828, 4004, 7860, 12305, 16184), 91 << 8 }, }, { { CDF5( 1456, 5488, 8386, 12251, 14583), 90 << 8 }, { CDF5( 5, 3667, 7540, 10600, 13938), 75 << 8 }, }, { { CDF5( 2172, 4278, 6935, 10386, 13380), 93 << 8 }, { CDF5( 6, 2661, 6993, 9316, 12044), 90 << 8 }, }, { { CDF5( 854, 2756, 4422, 6402, 8143), 75 << 8 }, { CDF5( 4, 1978, 3940, 6072, 7460), 93 << 8 }, }, { { CDF5(27597, 32474, 32665, 32730, 32734), 16 << 8 }, { CDF5(24628, 29500, 31920, 32665, 32732), 2 << 8 }, }, { { CDF5(16858, 28844, 31500, 32377, 32690), 90 << 8 }, { CDF5( 4, 27050, 31611, 32598, 32708), 1 << 8 }, }, { { CDF5(17412, 28372, 31092, 32127, 32538), 75 << 8 }, { CDF5( 4, 23035, 29401, 31561, 32362), 1 << 8 }, }, { { CDF5( 6754, 18144, 24912, 28603, 30596), 75 << 8 }, { CDF5( 4, 15859, 24392, 28503, 30570), 0 << 8 }, }, { { CDF5( 4015, 12511, 19603, 24562, 27900), 80 << 8 }, { CDF5( 4, 9194, 17016, 22337, 26137), 75 << 8 }, }, { { 
CDF5( 3458, 10380, 16144, 21113, 24683), 90 << 8 }, { CDF5( 5, 6767, 13223, 18657, 22519), 15 << 8 }, }, { { CDF5( 1682, 5432, 8769, 12173, 14844), 93 << 8 }, { CDF5( 7, 3903, 7423, 10655, 13533), 93 << 8 }, }, { { CDF5(28336, 32620, 32715, 32736, 32740), 80 << 8 }, { CDF5( 9058, 31414, 32677, 32736, 32740), 0 << 8 }, }, { { CDF5(23033, 31648, 32521, 32700, 32732), 75 << 8 }, { CDF5( 75, 29218, 32255, 32646, 32732), 0 << 8 }, }, { { CDF5(17992, 29745, 32048, 32579, 32724), 75 << 8 }, { CDF5( 95, 26324, 31408, 32426, 32663), 90 << 8 }, }, { { CDF5( 9671, 22972, 28881, 31303, 32233), 93 << 8 }, { CDF5( 132, 18991, 27475, 30900, 32140), 75 << 8 }, }, { { CDF5( 3245, 10146, 15682, 20055, 23306), 90 << 8 }, { CDF5( 715, 6956, 13155, 18158, 21881), 78 << 8 }, }, { { CDF5(30282, 32463, 32643, 32689, 32720), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(18363, 29970, 31740, 32329, 32529), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11103, 21929, 26883, 29338, 30615), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(10010, 15981, 20125, 22717, 24545), 26 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2197, 6613, 12544, 16341, 19413), 32 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 977, 5129, 8874, 12334, 16079), 15 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1326, 3745, 5773, 7906, 10325), 35 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(31687, 32702, 32738, 32742, 32746), 5 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(23101, 31425, 32294, 32485, 32560), 76 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(14043, 26486, 29937, 30917, 31636), 6 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11590, 21553, 26089, 28623, 29781), 32 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 
3345, 9687, 15060, 19237, 21946), 33 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(17981, 30263, 31343, 31835, 32228), 52 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 541, 19909, 25694, 31571, 32707), 56 << 8 }, { CDF5( 5, 3008, 18509, 26661, 30500), 6 << 8 }, }, { { CDF5( 6280, 16644, 22197, 27386, 30311), 31 << 8 }, { CDF5( 3289, 9702, 18719, 20867, 24166), 42 << 8 }, }, { { CDF5( 2433, 8354, 15330, 20115, 23238), 2 << 8 }, { CDF5( 21, 9300, 13157, 16064, 22107), 7 << 8 }, }, { { CDF5( 3059, 5529, 10689, 14670, 16660), 87 << 8 }, { CDF5( 36, 4415, 9690, 12203, 15756), 10 << 8 }, }, { { CDF5( 2528, 7188, 10559, 12046, 15715), 90 << 8 }, { CDF5( 228, 1917, 4701, 8762, 12870), 17 << 8 }, }, { { CDF5( 1318, 4984, 6874, 11801, 12832), 85 << 8 }, { CDF5( 772, 1543, 4630, 8014, 10032), 90 << 8 }, }, { { CDF5( 1741, 4692, 6205, 8476, 10822), 85 << 8 }, { CDF5( 287, 1649, 4446, 6812, 8891), 10 << 8 }, }, { { CDF5( 819, 2068, 3940, 5617, 6164), 93 << 8 }, { CDF5( 20, 1444, 2749, 3441, 5300), 92 << 8 }, }, { { CDF5(22627, 30107, 31954, 32695, 32732), 6 << 8 }, { CDF5( 9634, 23470, 31648, 32544, 32656), 40 << 8 }, }, { { CDF5(14808, 27300, 31048, 32355, 32547), 31 << 8 }, { CDF5( 4, 26706, 31394, 32590, 32717), 16 << 8 }, }, { { CDF5(14111, 25840, 29676, 31196, 32100), 2 << 8 }, { CDF5( 2240, 18421, 26730, 29569, 30965), 5 << 8 }, }, { { CDF5( 3320, 16652, 23243, 27800, 29610), 6 << 8 }, { CDF5( 3241, 11944, 18304, 22700, 26752), 3 << 8 }, }, { { CDF5( 2775, 10402, 16795, 21420, 24155), 82 << 8 }, { CDF5( 42, 7547, 13763, 18308, 22325), 83 << 8 }, }, { { CDF5( 1526, 9483, 15296, 20232, 22733), 7 << 8 }, { CDF5( 3770, 6135, 11174, 17138, 20531), 93 << 8 }, }, { { CDF5( 1566, 4208, 7493, 9981, 11307), 75 << 8 }, { CDF5( 10, 2707, 5347, 8016, 11209), 91 << 8 }, }, { { CDF5(29051, 32611, 32732, 32736, 32740), 81 << 8 }, { CDF5(11740, 31606, 32673, 32736, 32740), 15 << 8 }, }, { { CDF5(23082, 31405, 32517, 32736, 
32740), 15 << 8 }, { CDF5( 1955, 29285, 32394, 32685, 32732), 15 << 8 }, }, { { CDF5(16727, 28474, 31461, 32407, 32710), 75 << 8 }, { CDF5( 1043, 24584, 30321, 31899, 32456), 75 << 8 }, }, { { CDF5( 7685, 20689, 27553, 30514, 31627), 90 << 8 }, { CDF5( 312, 15641, 24900, 29282, 31344), 78 << 8 }, }, { { CDF5( 2935, 8636, 13745, 17661, 20515), 76 << 8 }, { CDF5( 22, 6005, 11121, 15180, 18215), 81 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 607, 3034, 7889, 15777, 23666), 75 << 8 }, { CDF5( 546, 1092, 5461, 8738, 19661), 75 << 8 }, }, { { CDF5( 482, 9638, 11083, 11565, 
13493), 75 << 8 }, { CDF5( 840, 2101, 3781, 12603, 16384), 50 << 8 }, }, { { CDF5( 273, 7646, 7919, 10377, 13653), 50 << 8 }, { CDF5( 390, 975, 1560, 11703, 15409), 50 << 8 }, }, { { CDF5( 819, 1365, 2185, 5734, 6554), 50 << 8 }, { CDF5( 356, 534, 890, 1603, 7480), 50 << 8 }, }, { { CDF5( 426, 2128, 4256, 5532, 6383), 50 << 8 }, { CDF5( 226, 452, 1130, 3390, 4520), 50 << 8 }, }, { { CDF5( 936, 2185, 3433, 4057, 5305), 0 << 8 }, { CDF5( 213, 426, 851, 1702, 1915), 50 << 8 }, }, { { CDF5( 1130, 6780, 11299, 13559, 15819), 0 << 8 }, { CDF5( 840, 1260, 1680, 2521, 3361), 50 << 8 }, }, { { CDF5( 840, 2521, 2941, 3781, 6722), 75 << 8 }, { CDF5( 607, 910, 1820, 2427, 2731), 25 << 8 }, }, { { CDF5( 2760, 9884, 19144, 26446, 28761), 75 << 8 }, { CDF5( 6378, 8797, 14295, 25731, 30349), 50 << 8 }, }, { { CDF5( 1022, 5182, 10071, 14523, 20142), 10 << 8 }, { CDF5( 1625, 5578, 14671, 19283, 23851), 40 << 8 }, }, { { CDF5( 1493, 14043, 16621, 19742, 20896), 40 << 8 }, { CDF5( 79, 5792, 10632, 14519, 17217), 75 << 8 }, }, { { CDF5( 1896, 5958, 11645, 14353, 16249), 0 << 8 }, { CDF5( 973, 3244, 6164, 8111, 11031), 0 << 8 }, }, { { CDF5( 2777, 4999, 8331, 10552, 13329), 75 << 8 }, { CDF5( 799, 2398, 5195, 7593, 11189), 25 << 8 }, }, { { CDF5( 1771, 5314, 8856, 12399, 15056), 75 << 8 }, { CDF5( 2048, 6144, 14336, 16384, 18432), 50 << 8 }, }, { { CDF5( 509, 3735, 7980, 10017, 11715), 75 << 8 }, { CDF5( 537, 2149, 5551, 6088, 8416), 75 << 8 }, }, { { CDF5(24857, 32089, 32690, 32716, 32732), 0 << 8 }, { CDF5(10411, 28445, 32559, 32722, 32732), 18 << 8 }, }, { { CDF5(15845, 30289, 32552, 32660, 32732), 40 << 8 }, { CDF5( 7773, 29346, 31862, 32519, 32732), 40 << 8 }, }, { { CDF5( 9766, 24287, 28720, 30969, 32382), 82 << 8 }, { CDF5( 585, 20320, 29044, 32023, 32449), 28 << 8 }, }, { { CDF5( 7627, 18644, 24717, 29519, 31779), 0 << 8 }, { CDF5( 2718, 15100, 26426, 29748, 31560), 0 << 8 }, }, { { CDF5( 2451, 8257, 11869, 16556, 19523), 12 << 8 }, { CDF5( 256, 4651, 10581, 13824, 16341), 92 << 
8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, }, .dc_sign = { { { { CDF1(15831), 93 << 8 }, { CDF1(13632), 75 << 8 }, { CDF1(19041), 75 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .br_y_tok_hf = { { CDF3(22305, 28743, 30345), 6 << 8 }, { CDF3(22663, 29948, 31320), 78 << 8 }, { CDF3(19776, 28658, 30435), 78 << 8 }, { CDF3(15436, 25313, 28181), 78 << 8 }, { CDF3(11214, 20671, 24854), 78 << 8 }, { CDF3( 8548, 16982, 21766), 78 << 8 }, { CDF3( 5729, 11993, 17176), 78 << 8 }, }, 
.br_y_tok_lf = { { CDF3( 7943, 14193, 20775), 32 << 8 }, { CDF3(14297, 22400, 26238), 91 << 8 }, { CDF3(10557, 18683, 22550), 90 << 8 }, { CDF3( 8289, 16068, 18454), 92 << 8 }, { CDF3( 5258, 10730, 13709), 90 << 8 }, { CDF3( 3933, 8166, 10680), 90 << 8 }, { CDF3( 2465, 5325, 6625), 90 << 8 }, { CDF3(10865, 16430, 19691), 6 << 8 }, { CDF3(14571, 22733, 26106), 15 << 8 }, { CDF3(14072, 23021, 25971), 75 << 8 }, { CDF3(11558, 20253, 23235), 78 << 8 }, { CDF3( 8603, 16200, 19466), 93 << 8 }, { CDF3( 6641, 13086, 16612), 78 << 8 }, { CDF3( 4240, 9043, 11946), 93 << 8 }, }, .bob_base_y_tok = { { { CDF2( 9917, 17060), 90 << 8 }, { CDF2(13841, 21928), 91 << 8 }, { CDF2(11228, 19107), 90 << 8 }, }, { { CDF2(11194, 18097), 15 << 8 }, { CDF2(13829, 21552), 91 << 8 }, { CDF2(15625, 22723), 1 << 8 }, }, { { CDF2(10164, 15059), 1 << 8 }, { CDF2(11087, 15990), 0 << 8 }, { CDF2(12184, 18484), 31 << 8 }, }, }, .br_y_tok_idtx = { { { CDF3(10358, 16536, 21006), 15 << 8 }, { CDF3(10820, 18219, 22881), 91 << 8 }, { CDF3(10100, 15687, 20193), 75 << 8 }, { CDF3(10388, 15552, 19869), 75 << 8 }, { CDF3( 7467, 14671, 18379), 90 << 8 }, { CDF3( 5068, 8607, 12235), 15 << 8 }, { CDF3( 3545, 6569, 9269), 93 << 8 }, }, { { CDF3(10846, 17226, 21443), 0 << 8 }, { CDF3(13398, 20355, 24290), 15 << 8 }, { CDF3(11910, 18796, 23075), 1 << 8 }, { CDF3(12559, 18882, 23077), 76 << 8 }, { CDF3( 8733, 17495, 21561), 90 << 8 }, { CDF3( 4080, 7646, 11249), 5 << 8 }, { CDF3( 3404, 6120, 8842), 75 << 8 }, }, { { CDF3( 7328, 12456, 16707), 91 << 8 }, { CDF3( 8772, 14457, 18914), 2 << 8 }, { CDF3( 8195, 13709, 17968), 2 << 8 }, { CDF3(10966, 15804, 19497), 7 << 8 }, { CDF3( 5374, 13909, 17992), 7 << 8 }, { CDF3( 2314, 4570, 7480), 31 << 8 }, { CDF3( 1645, 3387, 5133), 76 << 8 }, }, }, .base_y_tok_idtx = { { { CDF3(28343, 29890, 30977), 93 << 8 }, { CDF3(20601, 26193, 28764), 93 << 8 }, { CDF3(19490, 23791, 27048), 75 << 8 }, { CDF3(16423, 19493, 22007), 78 << 8 }, { CDF3(12176, 17688, 21070), 1 << 8 }, { 
CDF3(11254, 15066, 18960), 1 << 8 }, { CDF3( 7135, 9594, 11748), 1 << 8 }, }, { { CDF3(30897, 31794, 32178), 93 << 8 }, { CDF3(20986, 28222, 30421), 15 << 8 }, { CDF3(18659, 24176, 27934), 0 << 8 }, { CDF3(15633, 18666, 21224), 3 << 8 }, { CDF3( 9693, 15878, 20636), 1 << 8 }, { CDF3(10083, 14089, 18515), 6 << 8 }, { CDF3( 6265, 8352, 10523), 1 << 8 }, }, { { CDF3(32379, 32564, 32633), 75 << 8 }, { CDF3(22177, 29474, 30765), 1 << 8 }, { CDF3(18538, 23452, 26801), 1 << 8 }, { CDF3(14706, 16590, 18239), 75 << 8 }, { CDF3( 6954, 12300, 15663), 7 << 8 }, { CDF3( 5210, 7282, 11087), 6 << 8 }, { CDF3( 2614, 3781, 4955), 75 << 8 }, }, }, .sign_idtx = { { { CDF1(15560), 93 << 8 }, { CDF1(24775), 93 << 8 }, { CDF1( 7540), 93 << 8 }, { CDF1(27844), 0 << 8 }, { CDF1( 3545), 75 << 8 }, { CDF1(28880), 82 << 8 }, { CDF1( 4886), 80 << 8 }, { CDF1(32178), 6 << 8 }, { CDF1( 1204), 75 << 8 }, }, { { CDF1(14810), 3 << 8 }, { CDF1(26650), 75 << 8 }, { CDF1( 5063), 90 << 8 }, { CDF1(29212), 15 << 8 }, { CDF1( 1994), 78 << 8 }, { CDF1(29106), 90 << 8 }, { CDF1( 3267), 90 << 8 }, { CDF1(32118), 0 << 8 }, { CDF1( 537), 93 << 8 }, }, { { CDF1(11533), 31 << 8 }, { CDF1(28709), 76 << 8 }, { CDF1( 2095), 90 << 8 }, { CDF1(31598), 90 << 8 }, { CDF1( 331), 93 << 8 }, { CDF1(31760), 91 << 8 }, { CDF1( 319), 78 << 8 }, { CDF1(32511), 78 << 8 }, { CDF1( 60), 43 << 8 }, }, }, .skip_v = { { CDF1( 1439), 78 << 8 }, { CDF1( 6191), 0 << 8 }, { CDF1(14610), 1 << 8 }, { CDF1( 180), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 7648), 90 << 8 }, { CDF1(16148), 90 << 8 }, { CDF1(24565), 90 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, .eob_base_uv_tok_hf = { { CDF2(10923, 21845), 0 << 8 }, { CDF2(31214, 32437), 93 << 8 }, { CDF2(31888, 32447), 78 << 8 }, { CDF2(30612, 32073), 93 << 8 }, }, .base_uv_tok_hf = { { CDF3(26904, 32102, 32598), 0 << 8 }, { CDF3(15749, 28898, 31610), 90 << 8 }, { CDF3( 9106, 21329, 26962), 90 << 8 }, { CDF3( 4828, 
12923, 18983), 75 << 8 }, { CDF3(27779, 32406, 32689), 90 << 8 }, { CDF3(17414, 30077, 32025), 90 << 8 }, { CDF3( 9228, 22296, 27767), 81 << 8 }, { CDF3( 4564, 12734, 19144), 90 << 8 }, { CDF3(29238, 32489, 32693), 80 << 8 }, { CDF3(19819, 30853, 32222), 25 << 8 }, { CDF3( 9314, 19318, 25346), 6 << 8 }, { CDF3( 3060, 10265, 16088), 5 << 8 }, }, .br_uv_tok_hf = { { CDF3(20014, 26541, 29552), 7 << 8 }, { CDF3(20674, 27680, 30329), 78 << 8 }, { CDF3(16228, 24293, 28314), 75 << 8 }, { CDF3( 9580, 16283, 20959), 75 << 8 }, }, .eob_base_uv_tok_lf = { { CDF4(28950, 31443, 32009, 32257), 75 << 8 }, { CDF4(29916, 31919, 32224, 32441), 6 << 8 }, { CDF4(28902, 30805, 31579, 31816), 2 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, .base_uv_tok_lf = { { CDF5(14076, 26464, 29938, 31308, 31828), 6 << 8 }, { CDF5( 7520, 21227, 27766, 30312, 31477), 75 << 8 }, { CDF5( 4377, 13290, 19811, 24220, 27064), 90 << 8 }, { CDF5( 1682, 5139, 8601, 11973, 15046), 90 << 8 }, { CDF5(15235, 28605, 31367, 32151, 32451), 6 << 8 }, { CDF5(10256, 24586, 29775, 31465, 32137), 93 << 8 }, { CDF5( 5918, 15629, 22317, 26602, 29101), 90 << 8 }, { CDF5( 2015, 5704, 9835, 13705, 17299), 76 << 8 }, { CDF5(26420, 31955, 32312, 32430, 32526), 75 << 8 }, { CDF5(16374, 29560, 31531, 32023, 32291), 30 << 8 }, { CDF5( 7197, 15954, 20986, 24934, 27737), 6 << 8 }, { CDF5( 4820, 9488, 11701, 14065, 16248), 11 << 8 }, } }, [1] = { .skip = { { { { CDF1(30296), 5 << 8 }, { CDF1( 2871), 75 << 8 }, { CDF1( 5650), 76 << 8 }, { CDF1(13405), 26 << 8 }, { CDF1(19531), 1 << 8 }, { CDF1(30186), 5 << 8 }, { CDF1( 8778), 31 << 8 }, { CDF1(17506), 0 << 8 }, { CDF1(26467), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(31530), 30 << 8 }, { CDF1( 2361), 78 << 8 }, { CDF1( 4301), 75 << 8 }, { CDF1(14660), 31 << 8 }, { CDF1(19138), 6 << 8 }, { CDF1(31475), 5 << 8 }, { CDF1( 5031), 0 << 8 }, { CDF1(16054), 31 << 8 }, { CDF1(27047), 5 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(31362), 25 << 8 }, { CDF1( 2991), 75 << 8 }, { 
CDF1( 6803), 1 << 8 }, { CDF1(19942), 32 << 8 }, { CDF1(22017), 6 << 8 }, { CDF1(32325), 5 << 8 }, { CDF1( 5502), 0 << 8 }, { CDF1(17059), 31 << 8 }, { CDF1(27657), 31 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(27230), 62 << 8 }, { CDF1( 604), 80 << 8 }, { CDF1( 4108), 0 << 8 }, { CDF1(15740), 62 << 8 }, { CDF1(18869), 27 << 8 }, { CDF1(32126), 32 << 8 }, { CDF1( 4208), 31 << 8 }, { CDF1(18962), 32 << 8 }, { CDF1(28315), 31 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(23062), 62 << 8 }, { CDF1( 740), 33 << 8 }, { CDF1( 9511), 42 << 8 }, { CDF1(22294), 62 << 8 }, { CDF1(21507), 93 << 8 }, { CDF1(31946), 57 << 8 }, { CDF1( 2406), 62 << 8 }, { CDF1(18145), 31 << 8 }, { CDF1(24883), 33 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(32764), 0 << 8 }, { CDF1(10589), 91 << 8 }, { CDF1(17200), 7 << 8 }, { CDF1(20721), 1 << 8 }, { CDF1(23311), 6 << 8 }, { CDF1(28906), 5 << 8 }, { CDF1( 4037), 26 << 8 }, { CDF1(14946), 26 << 8 }, { CDF1(24129), 0 << 8 }, { CDF1(32128), 78 << 8 }, }, { { CDF1(32577), 31 << 8 }, { CDF1(13622), 93 << 8 }, { CDF1(16203), 90 << 8 }, { CDF1(19412), 90 << 8 }, { CDF1(23766), 75 << 8 }, { CDF1(29589), 3 << 8 }, { CDF1( 2774), 26 << 8 }, { CDF1(14855), 26 << 8 }, { CDF1(24976), 0 << 8 }, { CDF1(31738), 75 << 8 }, }, { { CDF1(32415), 62 << 8 }, { CDF1(15422), 93 << 8 }, { CDF1(18741), 91 << 8 }, { CDF1(20940), 75 << 8 }, { CDF1(25667), 76 << 8 }, { CDF1(31116), 0 << 8 }, { CDF1( 3281), 31 << 8 }, { CDF1(17757), 31 << 8 }, { CDF1(27707), 1 << 8 }, { CDF1(29599), 7 << 8 }, }, { { CDF1(32182), 62 << 8 }, { CDF1(13890), 90 << 8 }, { CDF1(17539), 18 << 8 }, { CDF1(21145), 15 << 8 }, { CDF1(26833), 93 << 8 }, { CDF1(31582), 1 << 8 }, { CDF1( 4193), 27 << 8 }, { CDF1(21696), 32 << 8 }, { CDF1(29128), 6 << 8 }, { CDF1(26841), 62 << 8 }, }, { { CDF1(31681), 62 << 8 }, { CDF1( 9846), 32 << 8 }, { CDF1(21469), 32 << 8 }, { CDF1(24254), 7 << 8 }, { CDF1(29068), 92 << 8 }, { CDF1(31736), 32 << 8 }, { CDF1( 8875), 62 << 8 }, { CDF1(25927), 60 << 8 }, { 
CDF1(29086), 60 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .eob_bin_16 = { { CDF4( 1839, 2995, 6796, 15126), 30 << 8 }, { CDF4( 1354, 1787, 4813, 9327), 30 << 8 }, { CDF4( 8298, 11869, 17297, 24569), 30 << 8 }, }, .eob_bin_32 = { { CDF5( 1143, 1778, 3854, 8166, 13337), 31 << 8 }, { CDF5( 1109, 1433, 2804, 5652, 11224), 31 << 8 }, { CDF5( 8685, 11412, 16486, 22345, 27930), 31 << 8 }, }, .eob_bin_64 = { { CDF6( 939, 1381, 3142, 6536, 13617, 21799), 31 << 8 }, { CDF6( 1338, 1662, 2777, 5268, 9184, 15736), 31 << 8 }, { CDF6( 9398, 11810, 17045, 22537, 27448, 30795), 31 << 8 }, }, .eob_bin_128 = { { CDF7( 654, 1068, 2257, 5580, 12362, 21455, 26067), 32 << 8 }, { CDF7( 1852, 2243, 3554, 5549, 9085, 14604, 21559), 32 << 8 }, { CDF7( 9713, 12146, 17484, 22818, 27372, 30388, 31811), 31 << 8 }, }, .eob_bin_256 = { { CDF7( 790, 1184, 2218, 5022, 11432, 20289, 23689), 57 << 8 }, { CDF7( 2998, 3363, 4528, 6371, 9250, 13525, 18730), 32 << 8 }, { CDF7( 9611, 12008, 17284, 22532, 26744, 29711, 31730), 31 << 8 }, }, .eob_bin_512 = { { CDF7( 1311, 2019, 3578, 6876, 13781, 23541, 24922), 62 << 8 }, { CDF7( 5415, 5926, 7647, 10113, 13081, 17084, 20509), 57 << 8 }, { CDF7( 9906, 12482, 17574, 22657, 27655, 30234, 31821), 32 << 8 }, }, .eob_bin_1024 = { { CDF7( 634, 1777, 3873, 6293, 12653, 23825, 24712), 57 << 8 }, { CDF7( 4894, 5819, 8219, 11022, 14406, 18919, 21479), 57 << 8 }, { CDF7( 9216, 12549, 17630, 21723, 25862, 28760, 30440), 37 << 8 }, }, .eob_hi_bit = { CDF1(18326), 0 << 8 }, .eob_base_y_tok_hf = { { { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(26028, 30728), 3 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(31077, 32105), 85 << 8 }, { CDF2(31889, 32651), 93 << 8 }, { CDF2(31276, 32538), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32624, 32752), 93 << 8 }, { CDF2(32688, 32756), 93 << 8 }, { CDF2(32695, 32756), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32677, 32743), 68 << 8 }, { CDF2(32051, 
32739), 7 << 8 }, { CDF2(32291, 32748), 32 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32710, 32756), 78 << 8 }, { CDF2(32700, 32756), 13 << 8 }, { CDF2(24989, 32282), 62 << 8 }, }, }, .eob_base_y_tok_lf = { { { CDF4(28595, 31926, 32420, 32582), 91 << 8 }, { CDF4(28506, 31820, 32509, 32686), 0 << 8 }, { CDF4(28680, 31966, 32503, 32626), 0 << 8 }, { CDF4(27591, 32110, 32578, 32712), 75 << 8 }, }, { { CDF4(28550, 32010, 32465, 32593), 1 << 8 }, { CDF4(31254, 32502, 32660, 32713), 75 << 8 }, { CDF4(31600, 32549, 32688, 32735), 75 << 8 }, { CDF4(31747, 32600, 32712, 32740), 78 << 8 }, }, { { CDF4(27239, 31085, 31883, 32247), 7 << 8 }, { CDF4(31836, 32524, 32640, 32686), 0 << 8 }, { CDF4(32293, 32547, 32627, 32711), 43 << 8 }, { CDF4(24966, 28087, 29127, 31988), 50 << 8 }, }, { { CDF4(22871, 30729, 31682, 32000), 37 << 8 }, { CDF4(32054, 32674, 32735, 32740), 90 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, { { CDF4( 7975, 22586, 27023, 29960), 62 << 8 }, { CDF4(31368, 32346, 32628, 32738), 6 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, }, .base_y_tok_hf = { { { { CDF3(12507, 26077, 30060), 30 << 8 }, { CDF3( 7550, 25098, 30550), 30 << 8 }, }, { { CDF3( 8009, 20715, 27321), 3 << 8 }, { CDF3( 3223, 17464, 26650), 3 << 8 }, }, { { CDF3( 4305, 13150, 20144), 93 << 8 }, { CDF3( 1952, 10128, 18324), 3 << 8 }, }, { { CDF3( 3098, 9586, 15580), 93 << 8 }, { CDF3( 1089, 7195, 13810), 93 << 8 }, }, { { CDF3( 1728, 6104, 10373), 93 << 8 }, { CDF3( 1235, 4736, 9683), 93 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 
16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(27562, 31970, 32298), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(11965, 23262, 28482), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 6254, 14791, 20469), 78 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 1646, 6619, 10696), 93 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(25334, 31967, 32621), 90 << 8 }, { CDF3(15582, 31040, 32462), 78 << 8 }, }, { { CDF3(18907, 30472, 32314), 90 << 8 }, { CDF3( 953, 26945, 31805), 75 << 8 }, }, { { CDF3(13516, 26908, 30904), 75 << 8 }, { CDF3( 1549, 21936, 29542), 0 << 8 }, }, { { CDF3( 8297, 20991, 27437), 78 << 8 }, { CDF3( 557, 15598, 25261), 0 << 8 }, }, { { CDF3( 3980, 11626, 17598), 3 << 8 }, { CDF3( 918, 8073, 15400), 3 << 8 }, }, { { CDF3(26391, 32035, 32595), 75 << 8 }, { CDF3(17672, 31342, 32493), 0 << 8 }, }, { { CDF3(19580, 30651, 32319), 75 << 8 }, { CDF3( 589, 27593, 31900), 75 << 8 }, }, { { CDF3(13734, 27193, 31043), 0 << 8 }, { CDF3( 1624, 22701, 29917), 0 << 8 }, }, { { CDF3( 8311, 21239, 27850), 3 << 8 }, { CDF3( 510, 15971, 25802), 0 << 8 }, }, { { CDF3( 4217, 12744, 19511), 3 << 8 }, { CDF3( 717, 9081, 16951), 3 << 8 }, }, { { CDF3(25094, 32069, 32616), 5 << 8 }, { CDF3(17839, 31778, 32640), 5 << 8 }, }, { { CDF3(19672, 30814, 32312), 75 << 8 }, { CDF3( 2025, 28482, 32043), 75 << 8 }, }, { { CDF3(14800, 28188, 31442), 0 << 8 }, { CDF3( 1221, 24055, 30573), 0 << 8 }, }, { { CDF3( 8995, 22820, 28992), 3 << 8 }, { CDF3( 
1167, 17916, 27275), 3 << 8 }, }, { { CDF3( 4844, 14798, 22123), 78 << 8 }, { CDF3( 839, 10591, 19419), 3 << 8 }, }, { { CDF3(30845, 32589, 32713), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(19409, 29802, 31864), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(11114, 22871, 27848), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 6092, 16202, 22429), 5 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 3247, 9679, 15027), 90 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27264, 32622, 32748), 75 << 8 }, { CDF3(13182, 32042, 32728), 75 << 8 }, }, { { CDF3(22849, 32019, 32682), 75 << 8 }, { CDF3( 491, 29944, 32540), 75 << 8 }, }, { { CDF3(17618, 30067, 32261), 75 << 8 }, { CDF3( 1861, 25960, 31490), 75 << 8 }, }, { { CDF3(10036, 24382, 29998), 75 << 8 }, { CDF3( 1347, 18931, 28166), 78 << 8 }, }, { { CDF3( 5051, 14703, 21347), 75 << 8 }, { CDF3( 259, 10600, 19087), 0 << 8 }, }, { { CDF3(29595, 32710, 32748), 75 << 8 }, { CDF3(19929, 32553, 32748), 0 << 8 }, }, { { CDF3(24721, 32388, 32722), 75 << 8 }, { CDF3( 1482, 30913, 32625), 75 << 8 }, }, { { CDF3(18729, 30720, 32485), 0 << 8 }, { CDF3( 1645, 27442, 31977), 0 << 8 }, }, { { CDF3(10807, 25638, 30749), 75 << 8 }, { CDF3( 1150, 20387, 29227), 3 << 8 }, }, { { CDF3( 5546, 16487, 23567), 0 << 8 }, { CDF3( 403, 12249, 21372), 0 << 8 }, }, { { CDF3(30695, 32738, 32748), 80 << 8 }, { CDF3(25061, 32689, 32748), 0 << 8 }, }, { { CDF3(26547, 32580, 32747), 75 << 8 }, { CDF3( 1334, 31785, 32708), 75 << 8 }, }, { { CDF3(20996, 31697, 32662), 0 << 8 }, { CDF3( 3682, 29397, 32392), 78 << 8 }, }, { { CDF3(13209, 28091, 31805), 3 << 8 }, { CDF3( 840, 23372, 30922), 3 << 8 }, }, { { CDF3( 7389, 20059, 27005), 0 << 8 }, { CDF3( 332, 15296, 25093), 0 << 8 }, }, { { CDF3(32282, 32726, 32748), 15 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(23211, 31638, 32504), 90 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(13980, 26668, 30388), 1 << 
8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 7795, 19288, 25941), 6 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 3877, 11700, 17890), 1 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(28880, 32680, 32748), 0 << 8 }, { CDF3(12427, 32347, 32748), 90 << 8 }, }, { { CDF3(24025, 32081, 32704), 75 << 8 }, { CDF3( 2065, 30727, 32657), 76 << 8 }, }, { { CDF3(19872, 30769, 32395), 0 << 8 }, { CDF3( 1210, 27547, 31961), 76 << 8 }, }, { { CDF3(10410, 24769, 29817), 1 << 8 }, { CDF3( 2247, 19228, 27805), 1 << 8 }, }, { { CDF3( 3997, 11545, 16952), 75 << 8 }, { CDF3( 146, 7698, 14688), 1 << 8 }, }, { { CDF3(30905, 32733, 32748), 75 << 8 }, { CDF3(18768, 32634, 32748), 76 << 8 }, }, { { CDF3(26894, 32528, 32731), 76 << 8 }, { CDF3( 1978, 31675, 32684), 76 << 8 }, }, { { CDF3(20750, 31222, 32412), 1 << 8 }, { CDF3( 3259, 28753, 32073), 1 << 8 }, }, { { CDF3(10948, 25238, 30225), 0 << 8 }, { CDF3( 733, 20129, 28689), 1 << 8 }, }, { { CDF3( 3995, 12602, 18654), 0 << 8 }, { CDF3( 997, 9216, 16560), 75 << 8 }, }, { { CDF3(31712, 32750, 32754), 75 << 8 }, { CDF3(31152, 32718, 32748), 75 << 8 }, }, { { CDF3(28091, 32642, 32748), 75 << 8 }, { CDF3( 5217, 32006, 32702), 78 << 8 }, }, { { CDF3(22886, 31816, 32647), 0 << 8 }, { CDF3( 2358, 29840, 32394), 75 << 8 }, }, { { CDF3(13616, 27730, 31451), 0 << 8 }, { CDF3( 1022, 23059, 30343), 0 << 8 }, }, { { CDF3( 6026, 17154, 23829), 75 << 8 }, { CDF3( 837, 12628, 21443), 90 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27869, 32588, 32732), 1 << 8 }, { CDF3(11934, 31678, 32720), 78 << 8 }, }, { { CDF3(20773, 
31428, 32627), 75 << 8 }, { CDF3( 813, 29336, 32408), 0 << 8 }, }, { { CDF3(16679, 28979, 31835), 6 << 8 }, { CDF3( 2825, 24549, 30651), 1 << 8 }, }, { { CDF3( 8393, 21384, 27275), 1 << 8 }, { CDF3( 95, 15687, 24751), 6 << 8 }, }, { { CDF3( 3024, 8983, 13215), 75 << 8 }, { CDF3( 476, 5931, 11675), 90 << 8 }, }, { { CDF3(29793, 32715, 32748), 31 << 8 }, { CDF3(15137, 32136, 32735), 0 << 8 }, }, { { CDF3(23697, 31828, 32635), 1 << 8 }, { CDF3( 1247, 29941, 32474), 31 << 8 }, }, { { CDF3(15925, 28680, 31655), 6 << 8 }, { CDF3( 2639, 25018, 30693), 6 << 8 }, }, { { CDF3( 8039, 21380, 27690), 80 << 8 }, { CDF3( 2054, 16597, 25993), 0 << 8 }, }, { { CDF3( 3566, 10567, 15908), 90 << 8 }, { CDF3( 229, 7289, 14138), 1 << 8 }, }, { { CDF3(31282, 32752, 32756), 75 << 8 }, { CDF3(32072, 32728, 32748), 90 << 8 }, }, { { CDF3(26939, 32527, 32733), 75 << 8 }, { CDF3( 6585, 31625, 32669), 78 << 8 }, }, { { CDF3(19878, 30839, 32397), 78 << 8 }, { CDF3( 1559, 27814, 31953), 78 << 8 }, }, { { CDF3(11443, 25746, 30681), 78 << 8 }, { CDF3( 474, 20515, 28997), 78 << 8 }, }, { { CDF3( 5430, 15504, 22268), 93 << 8 }, { CDF3( 484, 11281, 19715), 93 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, }, .base_y_tok_lf = { { { { CDF5( 2114, 21141, 25369, 30654, 32239), 15 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 549, 13587, 23108, 28767, 30410), 10 << 8 }, { CDF5( 4, 5842, 19871, 26925, 30302), 1 << 8 }, }, { { CDF5( 4153, 12881, 20886, 25152, 28465), 5 << 8 }, { CDF5( 4, 9887, 17074, 23240, 26922), 1 << 8 }, }, { { CDF5( 3509, 9661, 15222, 20437, 24828), 3 << 8 }, { CDF5( 4, 5559, 13171, 
19109, 22949), 1 << 8 }, }, { { CDF5( 2887, 7713, 11843, 15697, 19332), 18 << 8 }, { CDF5( 4, 5122, 9599, 14742, 18968), 76 << 8 }, }, { { CDF5( 2828, 6215, 10303, 13224, 16380), 18 << 8 }, { CDF5( 4, 2931, 7161, 10506, 14230), 18 << 8 }, }, { { CDF5( 1614, 5662, 8479, 11366, 13422), 18 << 8 }, { CDF5( 4, 1770, 4011, 7227, 9349), 17 << 8 }, }, { { CDF5( 1307, 3944, 6304, 9227, 11645), 83 << 8 }, { CDF5( 4, 2665, 4779, 7323, 9602), 93 << 8 }, }, { { CDF5( 1058, 2942, 4397, 5803, 6862), 78 << 8 }, { CDF5( 8, 1344, 2527, 4536, 6155), 78 << 8 }, }, { { CDF5(13961, 27727, 31548, 32432, 32640), 30 << 8 }, { CDF5( 2708, 23637, 31303, 32335, 32585), 1 << 8 }, }, { { CDF5( 8468, 21640, 27998, 30901, 32015), 90 << 8 }, { CDF5( 4, 16662, 26837, 30837, 32112), 90 << 8 }, }, { { CDF5( 6388, 17049, 24419, 28383, 30593), 90 << 8 }, { CDF5( 4, 12537, 21264, 26680, 30075), 5 << 8 }, }, { { CDF5( 4692, 13240, 20421, 25309, 28482), 78 << 8 }, { CDF5( 4, 9230, 16499, 22332, 26503), 0 << 8 }, }, { { CDF5( 3442, 10509, 16259, 21231, 24617), 0 << 8 }, { CDF5( 4, 6258, 12848, 18162, 22620), 90 << 8 }, }, { { CDF5( 2789, 8106, 13273, 17690, 21420), 78 << 8 }, { CDF5( 4, 5418, 10448, 15448, 19662), 83 << 8 }, }, { { CDF5( 1801, 4801, 7786, 10991, 13154), 93 << 8 }, { CDF5( 25, 3162, 6409, 9255, 12072), 3 << 8 }, }, { { CDF5(18092, 29844, 32040, 32571, 32719), 75 << 8 }, { CDF5( 9164, 26106, 31175, 32321, 32602), 78 << 8 }, }, { { CDF5(11343, 24700, 29814, 31603, 32215), 93 << 8 }, { CDF5( 22, 18800, 27925, 31139, 32203), 90 << 8 }, }, { { CDF5( 6970, 17708, 24782, 28661, 30778), 93 << 8 }, { CDF5( 38, 12260, 21656, 27173, 30241), 75 << 8 }, }, { { CDF5( 4769, 13169, 19793, 24809, 28254), 78 << 8 }, { CDF5( 145, 8816, 16919, 22877, 26986), 78 << 8 }, }, { { CDF5( 2805, 8155, 12943, 17359, 21110), 78 << 8 }, { CDF5( 566, 5051, 10431, 15290, 19445), 78 << 8 }, }, { { CDF5(21153, 29831, 31499, 32049, 32276), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11827, 
24672, 29069, 30619, 31334), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6970, 16794, 23049, 26961, 28915), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6107, 13557, 19154, 23110, 25619), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2702, 7985, 12910, 17340, 20552), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1993, 5600, 9439, 13043, 16257), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1269, 3987, 6708, 9519, 11780), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(26630, 32076, 32532, 32672, 32731), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(13939, 25333, 29988, 31536, 32142), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6772, 15540, 22058, 26174, 28801), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5996, 13722, 19150, 22923, 25872), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2372, 6802, 11003, 14855, 18017), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(17303, 31326, 31526, 32376, 32710), 11 << 8 }, { CDF5( 260, 16384, 22105, 27307, 28867), 68 << 8 }, }, { { CDF5( 1845, 17089, 26116, 30050, 31620), 75 << 8 }, { CDF5( 4, 7809, 21186, 27970, 30787), 75 << 8 }, }, { { CDF5( 9621, 20802, 26452, 29388, 31088), 0 << 8 }, { CDF5( 4, 15476, 24099, 28279, 30173), 1 << 8 }, }, { { CDF5( 5542, 13743, 19899, 24736, 27946), 75 << 8 }, { CDF5( 4, 8467, 16948, 22772, 26471), 0 << 8 }, }, { { CDF5( 3700, 10363, 15560, 20281, 23726), 75 << 8 }, { CDF5( 4, 6038, 11885, 17198, 21363), 15 << 8 }, }, { { CDF5( 2418, 7197, 12060, 15605, 19277), 93 << 8 }, { CDF5( 4, 4658, 9217, 13468, 17379), 78 << 8 }, }, { { CDF5( 1804, 6314, 9128, 12718, 15539), 91 << 8 }, { CDF5( 1372, 1648, 6025, 9656, 13223), 78 << 8 }, }, { { CDF5( 1735, 4846, 
7989, 10786, 13487), 78 << 8 }, { CDF5( 4, 3323, 6453, 9627, 12335), 93 << 8 }, }, { { CDF5( 1072, 3013, 4878, 6783, 8518), 93 << 8 }, { CDF5( 10, 1926, 3795, 5732, 7600), 93 << 8 }, }, { { CDF5(18024, 30489, 32251, 32626, 32732), 90 << 8 }, { CDF5(19896, 24724, 31657, 32534, 32732), 5 << 8 }, }, { { CDF5(11660, 26630, 30824, 32187, 32598), 75 << 8 }, { CDF5( 4, 21429, 29602, 31983, 32544), 0 << 8 }, }, { { CDF5(12486, 24964, 29580, 31503, 32248), 15 << 8 }, { CDF5( 4, 19295, 27630, 30822, 31918), 1 << 8 }, }, { { CDF5( 6927, 17619, 24441, 28334, 30439), 78 << 8 }, { CDF5( 4, 12994, 22001, 27172, 29927), 75 << 8 }, }, { { CDF5( 4541, 12600, 19151, 24114, 27363), 93 << 8 }, { CDF5( 4, 8631, 16251, 21921, 25807), 0 << 8 }, }, { { CDF5( 3464, 9852, 15685, 20588, 24106), 78 << 8 }, { CDF5( 4, 6580, 12812, 18079, 22278), 0 << 8 }, }, { { CDF5( 2085, 5889, 9302, 12624, 15262), 93 << 8 }, { CDF5( 69, 3305, 6891, 10477, 13651), 78 << 8 }, }, { { CDF5(23748, 31870, 32594, 32721, 32732), 90 << 8 }, { CDF5(10783, 29998, 32378, 32682, 32732), 78 << 8 }, }, { { CDF5(17733, 30083, 32212, 32605, 32725), 75 << 8 }, { CDF5( 187, 25993, 31535, 32484, 32683), 75 << 8 }, }, { { CDF5(12827, 26254, 30707, 32112, 32556), 75 << 8 }, { CDF5( 92, 21424, 29165, 31621, 32404), 75 << 8 }, }, { { CDF5( 7651, 19635, 26664, 30009, 31616), 78 << 8 }, { CDF5( 27, 14981, 24532, 29277, 31387), 75 << 8 }, }, { { CDF5( 3603, 10277, 15911, 20395, 23870), 78 << 8 }, { CDF5( 350, 6796, 13207, 18418, 22380), 3 << 8 }, }, { { CDF5(28587, 32126, 32526, 32643, 32695), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(16637, 28727, 31322, 32097, 32381), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9765, 21318, 26832, 29587, 30810), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6949, 15290, 21217, 24889, 27171), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 3604, 10158, 15795, 20231, 23348), 0 
<< 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2312, 7277, 11902, 16059, 19358), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1616, 4713, 7922, 10951, 13675), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(30272, 32591, 32703, 32736, 32740), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(19778, 30226, 32065, 32498, 32646), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11082, 23345, 28543, 30633, 31593), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7718, 18010, 24057, 27609, 29598), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 3560, 10218, 15775, 20026, 23024), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(22472, 31406, 32415, 32617, 32686), 37 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4579, 20789, 27200, 30357, 31565), 0 << 8 }, { CDF5( 4, 12804, 24534, 29241, 31305), 7 << 8 }, }, { { CDF5(10553, 22962, 27982, 30214, 31394), 1 << 8 }, { CDF5( 4, 17230, 25117, 28842, 30403), 0 << 8 }, }, { { CDF5( 5244, 13272, 19764, 24354, 27332), 15 << 8 }, { CDF5( 4, 8510, 16126, 21941, 25809), 0 << 8 }, }, { { CDF5( 3263, 9944, 15196, 19855, 22889), 78 << 8 }, { CDF5( 4, 6173, 11683, 16946, 21240), 75 << 8 }, }, { { CDF5( 1953, 7145, 11524, 15201, 18227), 5 << 8 }, { CDF5( 290, 4454, 9436, 13541, 17364), 75 << 8 }, }, { { CDF5( 1722, 5876, 9313, 13099, 15693), 75 << 8 }, { CDF5( 4, 3730, 7620, 11131, 14444), 75 << 8 }, }, { { CDF5( 1665, 4813, 7575, 10457, 12877), 0 << 8 }, { CDF5( 4, 3095, 6224, 9491, 12462), 90 << 8 }, }, { { CDF5( 1050, 3459, 5369, 7424, 9155), 93 << 8 }, { CDF5( 125, 1583, 3405, 5421, 7274), 15 << 8 }, }, { { CDF5(25676, 31984, 32605, 32712, 32732), 75 << 8 }, { CDF5(22804, 26009, 31701, 32573, 32732), 0 << 8 }, }, { { CDF5(13894, 27451, 31145, 32228, 32639), 15 << 8 }, { CDF5( 4, 
23912, 30395, 32138, 32610), 75 << 8 }, }, { { CDF5(14722, 26873, 30554, 31901, 32382), 81 << 8 }, { CDF5( 4, 19859, 27062, 30489, 31792), 6 << 8 }, }, { { CDF5( 6399, 16967, 23871, 27811, 30028), 78 << 8 }, { CDF5( 4, 13114, 21748, 26707, 29694), 0 << 8 }, }, { { CDF5( 4205, 12741, 19447, 23962, 26806), 76 << 8 }, { CDF5( 4, 8506, 15656, 21414, 25696), 78 << 8 }, }, { { CDF5( 3566, 10323, 16256, 20824, 24096), 75 << 8 }, { CDF5( 4, 6984, 13002, 18282, 22495), 90 << 8 }, }, { { CDF5( 1775, 5940, 9908, 13605, 16182), 0 << 8 }, { CDF5( 33, 4084, 7837, 11117, 14306), 80 << 8 }, }, { { CDF5(26506, 32458, 32715, 32736, 32740), 90 << 8 }, { CDF5( 8374, 30740, 32600, 32729, 32733), 93 << 8 }, }, { { CDF5(20516, 31118, 32508, 32699, 32732), 75 << 8 }, { CDF5( 9, 27963, 32073, 32637, 32732), 75 << 8 }, }, { { CDF5(15612, 28327, 31610, 32454, 32687), 80 << 8 }, { CDF5( 26, 24066, 30443, 32138, 32600), 80 << 8 }, }, { { CDF5( 8459, 21222, 27807, 30732, 31943), 78 << 8 }, { CDF5( 26, 16663, 26041, 30121, 31822), 75 << 8 }, }, { { CDF5( 4168, 12043, 18257, 22879, 26034), 75 << 8 }, { CDF5( 44, 8409, 15622, 20993, 24796), 0 << 8 }, }, { { CDF5(30650, 32519, 32673, 32723, 32732), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(17529, 29628, 31828, 32359, 32545), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(10542, 21115, 26318, 29161, 30805), 7 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(10247, 15768, 19711, 22296, 24028), 31 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2397, 7449, 12493, 16590, 19724), 56 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1869, 5577, 9489, 12835, 15631), 56 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1228, 3968, 6037, 8766, 10801), 55 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(31955, 32716, 32740, 32744, 32748), 75 << 8 }, { CDF5( 5461, 10923, 16384, 
21845, 27307), 0 << 8 }, }, { { CDF5(22693, 31709, 32467, 32593, 32659), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(14735, 26877, 30393, 31770, 32325), 1 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9390, 20635, 25906, 28733, 30212), 6 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4403, 11798, 18101, 22593, 25468), 5 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(17498, 29424, 31225, 31893, 32132), 62 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4465, 19403, 25248, 29392, 31099), 7 << 8 }, { CDF5( 4, 11650, 23783, 29177, 30996), 7 << 8 }, }, { { CDF5( 9152, 21002, 26411, 29197, 30838), 6 << 8 }, { CDF5( 387, 15169, 23639, 27517, 29225), 7 << 8 }, }, { { CDF5( 4263, 12796, 18125, 23216, 25968), 76 << 8 }, { CDF5( 4, 8811, 16017, 21391, 24903), 1 << 8 }, }, { { CDF5( 2963, 7692, 13120, 17437, 20742), 75 << 8 }, { CDF5( 4, 5759, 11337, 16128, 19807), 78 << 8 }, }, { { CDF5( 2381, 7283, 11060, 13175, 15657), 78 << 8 }, { CDF5( 30, 3641, 7204, 11473, 15621), 76 << 8 }, }, { { CDF5( 1314, 5852, 7870, 11057, 13558), 75 << 8 }, { CDF5( 1846, 2126, 5191, 9398, 12426), 75 << 8 }, }, { { CDF5( 1838, 5614, 8157, 11023, 13394), 93 << 8 }, { CDF5( 711, 875, 4561, 6006, 8631), 31 << 8 }, }, { { CDF5( 1166, 3234, 5698, 8017, 9616), 93 << 8 }, { CDF5( 25, 1054, 2149, 3082, 4062), 0 << 8 }, }, { { CDF5(26289, 31974, 32589, 32707, 32732), 6 << 8 }, { CDF5(15601, 27297, 32026, 32579, 32732), 30 << 8 }, }, { { CDF5(14722, 28032, 31407, 32465, 32660), 92 << 8 }, { CDF5( 4, 25486, 30863, 32285, 32652), 81 << 8 }, }, { { CDF5(14143, 25751, 29446, 31254, 32093), 75 << 8 }, { CDF5( 54, 19128, 26757, 29947, 31382), 1 << 8 }, }, { { CDF5( 5488, 15901, 22490, 26764, 29217), 75 << 8 }, { CDF5( 61, 12497, 20739, 25857, 28681), 6 << 8 }, }, { { CDF5( 4022, 11170, 17442, 21682, 24925), 75 << 8 }, { CDF5( 4, 7890, 14887, 20318, 24465), 75 << 8 }, }, { 
{ CDF5( 3011, 9192, 14601, 19495, 22381), 0 << 8 }, { CDF5( 328, 6291, 12024, 16828, 20336), 1 << 8 }, }, { { CDF5( 1563, 4884, 8005, 10967, 13252), 90 << 8 }, { CDF5( 14, 3264, 6674, 9678, 12501), 90 << 8 }, }, { { CDF5(28318, 32551, 32737, 32741, 32745), 75 << 8 }, { CDF5( 8123, 31532, 32680, 32736, 32740), 76 << 8 }, }, { { CDF5(21238, 31149, 32509, 32707, 32732), 75 << 8 }, { CDF5( 138, 28353, 32132, 32659, 32732), 0 << 8 }, }, { { CDF5(16317, 28616, 31700, 32471, 32706), 1 << 8 }, { CDF5( 645, 23978, 30123, 31941, 32494), 83 << 8 }, }, { { CDF5( 8177, 21167, 27635, 30495, 31817), 93 << 8 }, { CDF5( 240, 16105, 25295, 29540, 31448), 75 << 8 }, }, { { CDF5( 3393, 9936, 15268, 19487, 22467), 76 << 8 }, { CDF5( 100, 7031, 13173, 17962, 21368), 1 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 
5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(14193, 27782, 31321, 32004, 32607), 50 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1210, 13319, 22982, 27380, 30780), 37 << 8 }, { CDF5( 4, 7290, 20938, 26753, 30532), 32 << 8 }, }, { { CDF5( 6129, 19017, 24319, 27326, 29222), 32 << 8 }, { CDF5( 108, 12264, 17594, 22318, 25161), 57 << 8 }, }, { { CDF5( 3672, 16362, 17673, 20077, 22838), 26 << 8 }, { CDF5( 1389, 4066, 11635, 17622, 22074), 7 << 8 }, }, { { CDF5( 2395, 9381, 12019, 14390, 17169), 40 << 8 }, { CDF5( 123, 3545, 5521, 12207, 15924), 33 << 8 }, }, { { CDF5( 2084, 5758, 9241, 11873, 14327), 93 << 8 }, { CDF5( 14, 4141, 8239, 10165, 13119), 93 << 8 }, }, { { CDF5( 1510, 4925, 7765, 10264, 12295), 18 << 8 }, { CDF5( 424, 2326, 5212, 9049, 12105), 93 << 8 }, }, { { CDF5( 1310, 4491, 6944, 9377, 11560), 17 << 8 }, { CDF5( 796, 2092, 4230, 6299, 9460), 18 << 8 }, }, { { CDF5( 575, 2082, 3357, 4723, 5504), 93 << 8 }, { CDF5( 407, 1183, 2575, 3784, 5063), 90 << 8 }, }, { { CDF5(21859, 29753, 31615, 32585, 32702), 31 << 8 }, { CDF5( 3511, 29157, 31832, 32567, 32701), 65 << 8 }, }, { { CDF5(13469, 26644, 30586, 31697, 32163), 27 << 8 }, { CDF5( 43, 24491, 30511, 31649, 32156), 32 << 8 }, }, { { CDF5(11517, 23927, 28667, 30593, 31651), 1 << 8 }, { CDF5( 25, 17446, 24435, 28271, 30325), 6 << 8 }, }, { { CDF5( 6058, 16232, 22050, 26218, 27912), 1 << 8 }, { CDF5( 168, 8904, 17397, 23318, 27175), 75 << 8 }, }, { { CDF5( 3216, 9575, 15671, 19813, 23338), 76 << 8 }, { CDF5( 65, 6797, 12411, 18393, 22224), 1 << 8 }, }, { { CDF5( 2705, 8581, 12260, 16653, 20249), 90 << 8 }, { CDF5( 396, 5255, 11135, 16441, 20215), 0 << 8 }, }, { { CDF5( 1193, 3568, 6140, 8638, 10114), 93 << 8 }, { CDF5( 11, 2647, 5137, 7270, 9301), 75 << 8 }, }, { { CDF5(27677, 32597, 32738, 32742, 32746), 26 << 8 }, { CDF5( 9101, 31116, 32717, 32736, 32740), 2 << 8 }, }, { { CDF5(17896, 30207, 32257, 32653, 
32732), 75 << 8 }, { CDF5( 325, 26949, 31766, 32617, 32732), 16 << 8 }, }, { { CDF5(13007, 26339, 30456, 31958, 32494), 75 << 8 }, { CDF5( 638, 21532, 29168, 31580, 32346), 80 << 8 }, }, { { CDF5( 6432, 18636, 25684, 29741, 31417), 0 << 8 }, { CDF5( 399, 14022, 22914, 27926, 30488), 1 << 8 }, }, { { CDF5( 2460, 7366, 11813, 15406, 18038), 90 << 8 }, { CDF5( 141, 5139, 9696, 13649, 16881), 75 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, }, .dc_sign = { { { { CDF1(16540), 93 << 8 }, { CDF1(14804), 93 << 8 }, { CDF1(18312), 93 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 
8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .br_y_tok_hf = { { CDF3(24190, 30560, 31576), 30 << 8 }, { CDF3(24366, 30921, 31758), 78 << 8 }, { CDF3(21555, 29858, 31096), 3 << 8 }, { CDF3(16661, 26574, 28953), 78 << 8 }, { CDF3(11988, 21738, 25479), 93 << 8 }, { CDF3( 8920, 17446, 22214), 93 << 8 }, { CDF3( 5802, 12294, 17499), 78 << 8 }, }, .br_y_tok_lf = { { CDF3(11992, 17977, 23847), 37 << 8 }, { CDF3(14229, 22329, 26075), 90 << 8 }, { CDF3(11443, 20390, 22820), 76 << 8 }, { CDF3( 8397, 16223, 19264), 90 << 8 }, { CDF3( 5827, 11883, 14653), 90 << 8 }, { CDF3( 4416, 9159, 11664), 90 << 8 }, { CDF3( 2812, 6024, 7386), 93 << 8 }, { CDF3(16755, 23759, 26700), 5 << 8 }, { CDF3(17293, 25940, 28518), 93 << 8 }, { CDF3(15547, 25297, 27304), 93 << 8 }, { CDF3(12584, 22225, 24378), 93 << 8 }, { CDF3( 9185, 17517, 19977), 93 << 8 }, { CDF3( 7015, 13961, 16744), 93 << 8 }, { CDF3( 4509, 9609, 11858), 93 << 8 }, }, .bob_base_y_tok = { { { CDF2(16673, 23002), 75 << 8 }, { CDF2(19890, 25933), 90 << 8 }, { CDF2(18355, 25595), 75 << 8 }, }, { { CDF2(19619, 25865), 0 << 8 }, { CDF2(22421, 27836), 0 << 8 }, { CDF2(22018, 27887), 0 << 8 }, }, { { CDF2(22395, 27670), 31 << 8 }, { CDF2(24275, 28656), 31 << 8 }, { CDF2(22595, 28026), 1 << 8 }, }, }, .br_y_tok_idtx = { { { CDF3(11825, 18784, 22974), 0 << 8 }, { CDF3(10844, 17662, 22290), 0 << 8 }, { CDF3(10616, 16959, 21303), 90 << 8 }, { CDF3(11817, 17622, 21664), 90 << 8 }, { CDF3( 8267, 15623, 19902), 90 << 8 }, { CDF3( 5350, 9801, 14462), 5 << 8 }, { CDF3( 3443, 6585, 9559), 75 << 8 }, }, { { CDF3(13939, 20762, 24451), 31 << 8 }, { CDF3(16243, 23219, 26714), 31 << 8 }, { CDF3(15074, 22210, 25984), 6 << 8 }, { CDF3(16107, 22725, 26296), 6 << 8 }, { CDF3(10893, 20273, 24826), 6 << 8 }, { CDF3( 5645, 10671, 15100), 30 << 8 }, { CDF3( 4335, 8679, 12492), 31 << 8 }, }, { { CDF3(14250, 21163, 24941), 37 << 8 }, { CDF3(17217, 23934, 27433), 32 << 8 }, { CDF3(17090, 23993, 27574), 37 << 8 }, 
{ CDF3(19725, 25425, 28697), 37 << 8 }, { CDF3(12700, 23765, 28251), 37 << 8 }, { CDF3( 6542, 11855, 17108), 37 << 8 }, { CDF3( 5101, 10468, 15443), 37 << 8 }, }, }, .base_y_tok_idtx = { { { CDF3(28035, 30893, 31756), 78 << 8 }, { CDF3(18097, 26957, 29476), 75 << 8 }, { CDF3(15075, 22557, 26893), 0 << 8 }, { CDF3(11632, 16975, 20254), 0 << 8 }, { CDF3( 5659, 12007, 16445), 0 << 8 }, { CDF3( 3855, 7854, 12411), 1 << 8 }, { CDF3( 2360, 4192, 5868), 0 << 8 }, }, { { CDF3(31009, 32323, 32574), 78 << 8 }, { CDF3(21342, 29678, 31451), 78 << 8 }, { CDF3(16940, 25075, 29268), 0 << 8 }, { CDF3(12426, 17800, 21533), 0 << 8 }, { CDF3( 6079, 13099, 18572), 1 << 8 }, { CDF3( 4249, 8763, 13561), 6 << 8 }, { CDF3( 2473, 4631, 6923), 5 << 8 }, }, { { CDF3(32079, 32614, 32706), 75 << 8 }, { CDF3(22634, 30655, 31942), 0 << 8 }, { CDF3(16704, 25079, 29917), 31 << 8 }, { CDF3(12795, 17039, 20729), 6 << 8 }, { CDF3( 5362, 11212, 18340), 37 << 8 }, { CDF3( 3513, 6916, 12694), 31 << 8 }, { CDF3( 1883, 3540, 5856), 6 << 8 }, }, }, .sign_idtx = { { { CDF1(17010), 0 << 8 }, { CDF1(25832), 0 << 8 }, { CDF1( 7718), 3 << 8 }, { CDF1(29537), 1 << 8 }, { CDF1( 8925), 31 << 8 }, { CDF1(30595), 75 << 8 }, { CDF1( 4677), 90 << 8 }, { CDF1(32471), 90 << 8 }, { CDF1( 3023), 1 << 8 }, }, { { CDF1(17528), 25 << 8 }, { CDF1(27808), 0 << 8 }, { CDF1( 6222), 0 << 8 }, { CDF1(29747), 1 << 8 }, { CDF1( 2848), 31 << 8 }, { CDF1(31635), 75 << 8 }, { CDF1( 2513), 75 << 8 }, { CDF1(32514), 75 << 8 }, { CDF1( 474), 18 << 8 }, }, { { CDF1(19157), 31 << 8 }, { CDF1(30551), 31 << 8 }, { CDF1( 2788), 31 << 8 }, { CDF1(30752), 37 << 8 }, { CDF1( 701), 26 << 8 }, { CDF1(32565), 15 << 8 }, { CDF1( 359), 8 << 8 }, { CDF1(32515), 87 << 8 }, { CDF1( 223), 37 << 8 }, }, }, .skip_v = { { CDF1( 1341), 93 << 8 }, { CDF1( 7768), 75 << 8 }, { CDF1(16044), 1 << 8 }, { CDF1( 104), 60 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1( 8426), 0 << 8 }, { CDF1(16919), 0 << 8 }, { CDF1(25248), 0 << 8 }, { CDF1( 8448), 
50 << 8 }, { CDF1(27163), 50 << 8 }, { CDF1(27594), 50 << 8 }, }, .eob_base_uv_tok_hf = { { CDF2(10923, 21845), 0 << 8 }, { CDF2(31595, 32587), 93 << 8 }, { CDF2(32266, 32710), 93 << 8 }, { CDF2(32287, 32698), 93 << 8 }, }, .base_uv_tok_hf = { { CDF3(26264, 32282, 32667), 75 << 8 }, { CDF3(16468, 29532, 31912), 78 << 8 }, { CDF3( 8915, 21647, 27531), 0 << 8 }, { CDF3( 4341, 12415, 18782), 75 << 8 }, { CDF3(27171, 32473, 32714), 90 << 8 }, { CDF3(18547, 30679, 32297), 75 << 8 }, { CDF3( 9723, 23037, 28561), 0 << 8 }, { CDF3( 4310, 12399, 18745), 75 << 8 }, { CDF3(29747, 32607, 32733), 75 << 8 }, { CDF3(21347, 31575, 32509), 90 << 8 }, { CDF3(10548, 22298, 28335), 80 << 8 }, { CDF3( 2124, 9709, 17598), 90 << 8 }, }, .br_uv_tok_hf = { { CDF3(24442, 29827, 31535), 75 << 8 }, { CDF3(22490, 28954, 31256), 93 << 8 }, { CDF3(17882, 25670, 29377), 93 << 8 }, { CDF3(10664, 17735, 22437), 75 << 8 }, }, .eob_base_uv_tok_lf = { { CDF4(29843, 32183, 32562, 32675), 75 << 8 }, { CDF4(29975, 31731, 32100, 32264), 12 << 8 }, { CDF4(30096, 31949, 32460, 32611), 27 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, .base_uv_tok_lf = { { CDF5(15362, 27650, 30927, 32014, 32397), 6 << 8 }, { CDF5( 9556, 22999, 28733, 31004, 31942), 75 << 8 }, { CDF5( 5238, 14626, 21567, 26015, 28727), 78 << 8 }, { CDF5( 2196, 6512, 10780, 14847, 18294), 93 << 8 }, { CDF5(15569, 28727, 31706, 32423, 32650), 0 << 8 }, { CDF5(11506, 25824, 30517, 31948, 32443), 75 << 8 }, { CDF5( 6211, 16772, 23701, 27732, 29961), 90 << 8 }, { CDF5( 2461, 7239, 11755, 15869, 19210), 90 << 8 }, { CDF5(26951, 32136, 32556, 32654, 32696), 75 << 8 }, { CDF5(17691, 30215, 31944, 32399, 32550), 75 << 8 }, { CDF5( 9349, 19307, 25385, 29027, 30821), 6 << 8 }, { CDF5( 8528, 15111, 18457, 21676, 23767), 37 << 8 }, } }, [2] = { .skip = { { { { CDF1(28021), 3 << 8 }, { CDF1( 4675), 75 << 8 }, { CDF1( 6407), 75 << 8 }, { CDF1(12870), 0 << 8 }, { CDF1(18069), 1 << 8 }, { CDF1(26728), 30 << 8 }, { CDF1( 4204), 6 << 8 }, { 
CDF1(11853), 75 << 8 }, { CDF1(19479), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(30300), 0 << 8 }, { CDF1( 4727), 3 << 8 }, { CDF1( 5333), 75 << 8 }, { CDF1(13460), 6 << 8 }, { CDF1(17424), 6 << 8 }, { CDF1(28814), 30 << 8 }, { CDF1( 4777), 0 << 8 }, { CDF1(12522), 0 << 8 }, { CDF1(20439), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(31298), 25 << 8 }, { CDF1( 6336), 0 << 8 }, { CDF1( 7488), 76 << 8 }, { CDF1(19193), 31 << 8 }, { CDF1(21405), 31 << 8 }, { CDF1(31491), 5 << 8 }, { CDF1( 5723), 0 << 8 }, { CDF1(15055), 1 << 8 }, { CDF1(23518), 1 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(31807), 56 << 8 }, { CDF1( 1393), 75 << 8 }, { CDF1( 3549), 2 << 8 }, { CDF1(18574), 32 << 8 }, { CDF1(19795), 32 << 8 }, { CDF1(31792), 36 << 8 }, { CDF1( 5941), 31 << 8 }, { CDF1(16460), 32 << 8 }, { CDF1(25194), 31 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(32278), 61 << 8 }, { CDF1( 2582), 31 << 8 }, { CDF1(13992), 31 << 8 }, { CDF1(25993), 32 << 8 }, { CDF1(26283), 77 << 8 }, { CDF1(32056), 0 << 8 }, { CDF1( 6989), 32 << 8 }, { CDF1(17824), 32 << 8 }, { CDF1(24231), 32 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(32764), 0 << 8 }, { CDF1( 9384), 91 << 8 }, { CDF1(12245), 7 << 8 }, { CDF1(14958), 75 << 8 }, { CDF1(18080), 30 << 8 }, { CDF1(23099), 30 << 8 }, { CDF1( 2191), 90 << 8 }, { CDF1( 9674), 1 << 8 }, { CDF1(17537), 1 << 8 }, { CDF1(32587), 90 << 8 }, }, { { CDF1(32729), 18 << 8 }, { CDF1(12523), 93 << 8 }, { CDF1(13844), 93 << 8 }, { CDF1(16079), 93 << 8 }, { CDF1(21004), 90 << 8 }, { CDF1(25973), 0 << 8 }, { CDF1( 1860), 15 << 8 }, { CDF1( 9626), 1 << 8 }, { CDF1(18115), 1 << 8 }, { CDF1(31918), 25 << 8 }, }, { { CDF1(32727), 43 << 8 }, { CDF1(14244), 93 << 8 }, { CDF1(15808), 91 << 8 }, { CDF1(18087), 75 << 8 }, { CDF1(23066), 90 << 8 }, { CDF1(27617), 0 << 8 }, { CDF1( 1932), 0 << 8 }, { CDF1(12274), 1 << 8 }, { CDF1(23957), 6 << 8 }, { CDF1(30007), 7 << 8 }, }, { { CDF1(32605), 56 << 8 }, { CDF1(14434), 92 << 8 }, { CDF1(21625), 78 << 8 }, { 
CDF1(21326), 2 << 8 }, { CDF1(25757), 5 << 8 }, { CDF1(30059), 8 << 8 }, { CDF1( 2485), 26 << 8 }, { CDF1(15041), 6 << 8 }, { CDF1(26473), 5 << 8 }, { CDF1(25895), 63 << 8 }, }, { { CDF1(32562), 56 << 8 }, { CDF1(15140), 7 << 8 }, { CDF1(23809), 7 << 8 }, { CDF1(26086), 31 << 8 }, { CDF1(29383), 2 << 8 }, { CDF1(31155), 16 << 8 }, { CDF1( 4393), 51 << 8 }, { CDF1(17678), 33 << 8 }, { CDF1(25607), 56 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .eob_bin_16 = { { CDF4( 3946, 6406, 13325, 26004), 30 << 8 }, { CDF4( 2616, 3402, 7857, 13406), 30 << 8 }, { CDF4(13678, 17362, 23556, 28557), 30 << 8 }, }, .eob_bin_32 = { { CDF5( 3277, 4780, 9255, 16544, 21853), 31 << 8 }, { CDF5( 1655, 2364, 4679, 10029, 18955), 26 << 8 }, { CDF5(15549, 19007, 24651, 29173, 31742), 6 << 8 }, }, .eob_bin_64 = { { CDF6( 2024, 3239, 6969, 13073, 21617, 27643), 31 << 8 }, { CDF6( 1514, 1937, 3810, 7727, 13471, 23183), 31 << 8 }, { CDF6(14959, 18244, 24321, 29177, 31773, 32497), 1 << 8 }, }, .eob_bin_128 = { { CDF7( 1672, 2600, 5666, 11986, 21596, 28733, 30851), 31 << 8 }, { CDF7( 2101, 2605, 4469, 8056, 13125, 19491, 26379), 31 << 8 }, { CDF7(15041, 18279, 24430, 28935, 31545, 32439, 32672), 6 << 8 }, }, .eob_bin_256 = { { CDF7( 1556, 2520, 5142, 10585, 19507, 28407, 30571), 32 << 8 }, { CDF7( 3608, 4081, 5802, 8998, 13007, 18326, 23738), 32 << 8 }, { CDF7(14007, 17061, 22822, 27233, 30196, 31899, 32583), 6 << 8 }, }, .eob_bin_512 = { { CDF7( 1654, 2720, 5317, 10565, 19306, 29790, 30875), 32 << 8 }, { CDF7( 7237, 7980, 10164, 13730, 18015, 23652, 27137), 32 << 8 }, { CDF7(14503, 17149, 22592, 27035, 30013, 31699, 32526), 7 << 8 }, }, .eob_bin_1024 = { { CDF7( 2101, 3304, 6486, 12539, 21781, 30341, 30738), 32 << 8 }, { CDF7(11349, 12231, 14967, 18532, 22406, 26663, 28302), 57 << 8 }, { CDF7(15081, 17873, 23055, 27251, 30053, 31491, 32305), 32 << 8 }, }, .eob_hi_bit = { CDF1(19673), 78 << 8 }, .eob_base_y_tok_hf = { { { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 
21845), 0 << 8 }, { CDF2(28958, 32194), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32132, 32735), 25 << 8 }, { CDF2(32555, 32745), 93 << 8 }, { CDF2(32458, 32729), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32617, 32754), 93 << 8 }, { CDF2(32699, 32756), 93 << 8 }, { CDF2(32695, 32756), 93 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32684, 32756), 93 << 8 }, { CDF2(32552, 32756), 10 << 8 }, { CDF2(32589, 32756), 75 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32687, 32756), 93 << 8 }, { CDF2(32716, 32756), 91 << 8 }, { CDF2(31504, 32715), 56 << 8 }, }, }, .eob_base_y_tok_lf = { { { CDF4(29573, 32066, 32477, 32625), 76 << 8 }, { CDF4(28827, 32168, 32643, 32735), 90 << 8 }, { CDF4(29362, 32299, 32653, 32722), 75 << 8 }, { CDF4(29765, 32457, 32711, 32740), 90 << 8 }, }, { { CDF4(29324, 32291, 32617, 32714), 0 << 8 }, { CDF4(31938, 32652, 32728, 32740), 75 << 8 }, { CDF4(31949, 32673, 32738, 32742), 75 << 8 }, { CDF4(31802, 32666, 32744, 32748), 78 << 8 }, }, { { CDF4(28567, 31870, 32438, 32629), 6 << 8 }, { CDF4(32100, 32663, 32732, 32740), 75 << 8 }, { CDF4(32648, 32724, 32744, 32748), 83 << 8 }, { CDF4(25565, 32000, 32473, 32709), 60 << 8 }, }, { { CDF4(26412, 30605, 31764, 32230), 37 << 8 }, { CDF4(32167, 32698, 32744, 32748), 90 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, { { CDF4(24457, 29221, 30907, 31726), 62 << 8 }, { CDF4(32052, 32690, 32737, 32741), 90 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, }, .base_y_tok_hf = { { { { CDF3(16644, 29920, 32210), 0 << 8 }, { CDF3( 8472, 27835, 32053), 8 << 8 }, }, { { CDF3(12684, 27992, 31958), 15 << 8 }, { CDF3( 1352, 23985, 31355), 0 << 8 }, }, { { CDF3( 6935, 20786, 28614), 90 << 8 }, { CDF3( 1491, 14175, 25464), 18 << 8 }, }, { { CDF3( 4284, 13084, 20147), 93 << 8 }, { CDF3( 506, 8831, 17788), 90 << 8 }, }, { { CDF3( 2473, 16075, 19166), 90 << 8 }, { CDF3( 2070, 
4484, 8278), 42 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(31137, 32587, 32685), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(13651, 25517, 30395), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 7870, 18277, 25280), 78 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 3433, 9986, 16852), 17 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(25281, 32546, 32739), 90 << 8 }, { CDF3(13926, 31641, 32699), 75 << 8 }, }, { { CDF3(19473, 31423, 32614), 90 << 8 }, { CDF3( 98, 28106, 32308), 90 << 8 }, }, { { CDF3(14245, 28210, 31761), 78 << 8 }, { CDF3( 186, 23144, 30686), 75 << 8 }, }, { { CDF3( 8755, 22554, 29121), 78 << 8 }, { CDF3( 312, 17032, 27163), 78 << 8 }, }, { { CDF3( 5207, 15042, 22350), 78 << 8 }, { CDF3( 383, 10485, 19689), 78 << 8 }, }, { { CDF3(27697, 32637, 32746), 90 << 8 }, { CDF3(19828, 32280, 32729), 0 << 8 }, }, { { CDF3(20895, 31709, 32601), 93 << 8 }, { CDF3( 706, 29097, 32430), 75 << 8 }, }, { { CDF3(14671, 28816, 32008), 3 << 8 }, { CDF3( 547, 23747, 31087), 75 << 8 }, }, { { CDF3( 8923, 23097, 29710), 93 << 8 }, { CDF3( 
217, 17350, 27724), 75 << 8 }, }, { { CDF3( 5346, 16046, 24143), 78 << 8 }, { CDF3( 362, 12099, 22064), 78 << 8 }, }, { { CDF3(26664, 32524, 32729), 0 << 8 }, { CDF3(19929, 32241, 32732), 5 << 8 }, }, { { CDF3(20429, 31546, 32587), 78 << 8 }, { CDF3( 871, 29224, 32427), 75 << 8 }, }, { { CDF3(15091, 29302, 32181), 93 << 8 }, { CDF3( 420, 24224, 31315), 75 << 8 }, }, { { CDF3( 9650, 24609, 30553), 93 << 8 }, { CDF3( 716, 18899, 28798), 78 << 8 }, }, { { CDF3( 6228, 18247, 26011), 93 << 8 }, { CDF3( 488, 12960, 23566), 93 << 8 }, }, { { CDF3(31458, 32658, 32742), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(20693, 30846, 32296), 90 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(12771, 25313, 29851), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 7602, 19024, 25601), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 4692, 14076, 20351), 15 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(26646, 32618, 32748), 75 << 8 }, { CDF3(12985, 32023, 32728), 75 << 8 }, }, { { CDF3(21678, 31945, 32675), 93 << 8 }, { CDF3( 46, 29649, 32565), 0 << 8 }, }, { { CDF3(16813, 29938, 32304), 75 << 8 }, { CDF3( 734, 25488, 31518), 0 << 8 }, }, { { CDF3(10047, 24573, 30320), 78 << 8 }, { CDF3( 680, 18741, 28408), 78 << 8 }, }, { { CDF3( 5428, 16010, 23347), 75 << 8 }, { CDF3( 282, 11635, 21016), 78 << 8 }, }, { { CDF3(29492, 32711, 32748), 90 << 8 }, { CDF3(22856, 32581, 32748), 90 << 8 }, }, { { CDF3(23814, 32473, 32732), 75 << 8 }, { CDF3( 505, 30842, 32654), 75 << 8 }, }, { { CDF3(17970, 30664, 32542), 0 << 8 }, { CDF3( 591, 26988, 31983), 0 << 8 }, }, { { CDF3(10731, 25646, 30915), 75 << 8 }, { CDF3( 716, 19981, 29261), 78 << 8 }, }, { { CDF3( 5897, 17573, 25080), 75 << 8 }, { CDF3( 178, 12954, 22725), 90 << 8 }, }, { { CDF3(30940, 32740, 32748), 0 << 8 }, { CDF3(27493, 32693, 32748), 5 << 8 }, }, { { CDF3(25557, 32571, 32738), 90 << 8 }, { CDF3( 1156, 31536, 32692), 90 << 8 }, }, { { CDF3(19676, 31438, 32631), 75 
<< 8 }, { CDF3( 1853, 28395, 32266), 78 << 8 }, }, { { CDF3(12168, 27322, 31648), 93 << 8 }, { CDF3( 374, 21862, 30426), 78 << 8 }, }, { { CDF3( 6908, 19494, 27071), 78 << 8 }, { CDF3( 113, 14821, 25176), 90 << 8 }, }, { { CDF3(32313, 32737, 32748), 25 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(23711, 32006, 32620), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(16041, 28992, 31635), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 9990, 23637, 29213), 5 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 5762, 16053, 22500), 5 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27184, 32672, 32748), 1 << 8 }, { CDF3(10613, 32252, 32738), 75 << 8 }, }, { { CDF3(22470, 32091, 32716), 75 << 8 }, { CDF3( 949, 30238, 32643), 80 << 8 }, }, { { CDF3(17935, 30422, 32413), 0 << 8 }, { CDF3( 674, 26736, 32008), 80 << 8 }, }, { { CDF3(11011, 25612, 30643), 90 << 8 }, { CDF3( 997, 19957, 28943), 90 << 8 }, }, { { CDF3( 4754, 13518, 19616), 75 << 8 }, { CDF3( 30, 10027, 18226), 76 << 8 }, }, { { CDF3(29734, 32729, 32748), 0 << 8 }, { CDF3(17868, 32552, 32748), 0 << 8 }, }, { { CDF3(25168, 32565, 32739), 76 << 8 }, { CDF3( 525, 31274, 32689), 76 << 8 }, }, { { CDF3(19155, 30941, 32493), 75 << 8 }, { CDF3( 1986, 27740, 32084), 75 << 8 }, }, { { CDF3(10959, 25516, 30465), 75 << 8 }, { CDF3( 233, 20155, 29122), 81 << 8 }, }, { { CDF3( 4681, 14503, 21334), 0 << 8 }, { CDF3( 483, 10679, 18885), 75 << 8 }, }, { { CDF3(31907, 32752, 32756), 0 << 8 }, { CDF3(31874, 32719, 32748), 75 << 8 }, }, { { CDF3(26499, 32598, 32736), 90 << 8 }, { CDF3( 4327, 31612, 32678), 93 << 8 }, }, { { CDF3(20964, 31523, 32599), 75 << 8 }, { CDF3( 1451, 28828, 32263), 90 << 8 }, }, { { CDF3(12822, 27283, 31369), 78 << 8 }, { CDF3( 336, 21868, 30060), 75 << 8 }, }, { { CDF3( 6252, 17633, 24699), 93 << 8 }, { CDF3( 465, 12979, 22414), 93 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 
16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27513, 32649, 32748), 6 << 8 }, { CDF3(12533, 32080, 32727), 0 << 8 }, }, { { CDF3(22267, 31858, 32685), 80 << 8 }, { CDF3( 475, 29885, 32524), 15 << 8 }, }, { { CDF3(17673, 30194, 32223), 1 << 8 }, { CDF3( 1643, 25673, 31224), 1 << 8 }, }, { { CDF3(10022, 24035, 29658), 75 << 8 }, { CDF3( 276, 18877, 27751), 6 << 8 }, }, { { CDF3( 4529, 13192, 18915), 76 << 8 }, { CDF3( 285, 9212, 16975), 76 << 8 }, }, { { CDF3(29967, 32707, 32748), 6 << 8 }, { CDF3(15326, 32376, 32746), 15 << 8 }, }, { { CDF3(24351, 32244, 32717), 1 << 8 }, { CDF3( 1649, 31057, 32684), 0 << 8 }, }, { { CDF3(17832, 30408, 32381), 6 << 8 }, { CDF3( 1876, 26558, 31696), 31 << 8 }, }, { { CDF3(10499, 24843, 30081), 75 << 8 }, { CDF3( 1028, 19460, 28542), 1 << 8 }, }, { { CDF3( 5050, 14590, 21227), 75 << 8 }, { CDF3( 92, 10933, 19446), 6 << 8 }, }, { { CDF3(31604, 32752, 32756), 75 << 8 }, { CDF3(32163, 32722, 32748), 90 << 8 }, }, { { CDF3(26537, 32585, 32739), 75 << 8 }, { CDF3( 5518, 31656, 32695), 90 << 8 }, }, { { CDF3(20484, 31396, 32601), 75 << 8 }, { CDF3( 1028, 28479, 32227), 78 << 8 }, }, { { CDF3(12807, 27291, 31420), 93 << 8 }, { CDF3( 469, 22186, 30263), 75 << 8 }, }, { { CDF3( 6722, 18261, 25292), 90 << 8 }, { CDF3( 57, 13631, 22880), 75 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, }, 
.base_y_tok_lf = { { { { CDF5( 3361, 19325, 24366, 31088, 31928), 50 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1632, 15337, 24150, 28503, 30563), 2 << 8 }, { CDF5( 4, 5295, 20829, 28126, 30850), 0 << 8 }, }, { { CDF5( 6632, 16063, 23298, 27040, 29721), 3 << 8 }, { CDF5( 4, 11374, 19507, 24923, 28568), 25 << 8 }, }, { { CDF5( 4909, 11090, 17838, 22385, 26279), 15 << 8 }, { CDF5( 35, 6017, 15862, 22480, 26459), 0 << 8 }, }, { { CDF5( 3589, 9811, 14531, 19054, 22695), 3 << 8 }, { CDF5( 4, 6364, 9855, 16927, 21591), 0 << 8 }, }, { { CDF5( 2743, 7444, 12418, 15058, 19040), 0 << 8 }, { CDF5( 6, 2870, 8495, 12527, 16693), 18 << 8 }, }, { { CDF5( 1596, 6378, 10210, 14940, 17169), 93 << 8 }, { CDF5( 4, 1626, 3090, 7519, 9718), 18 << 8 }, }, { { CDF5( 648, 4541, 7588, 11572, 14642), 5 << 8 }, { CDF5( 5, 3893, 6075, 9405, 12116), 18 << 8 }, }, { { CDF5( 1519, 3450, 5055, 6173, 7170), 8 << 8 }, { CDF5( 5, 2567, 3706, 7533, 9113), 90 << 8 }, }, { { CDF5(14288, 28805, 31683, 32583, 32700), 78 << 8 }, { CDF5( 64, 24516, 31405, 32371, 32655), 6 << 8 }, }, { { CDF5( 8353, 22447, 29049, 31579, 32315), 78 << 8 }, { CDF5( 4, 16982, 27998, 31526, 32421), 0 << 8 }, }, { { CDF5( 7498, 18802, 25718, 29605, 31419), 75 << 8 }, { CDF5( 4, 13992, 23750, 28392, 31093), 0 << 8 }, }, { { CDF5( 5931, 15139, 22182, 27418, 29974), 78 << 8 }, { CDF5( 4, 12206, 21157, 26583, 29002), 0 << 8 }, }, { { CDF5( 4417, 12408, 19565, 24937, 27743), 0 << 8 }, { CDF5( 6, 8793, 17130, 22092, 26855), 1 << 8 }, }, { { CDF5( 3697, 10293, 16641, 21345, 25242), 78 << 8 }, { CDF5( 4, 7551, 14182, 20667, 25647), 18 << 8 }, }, { { CDF5( 3788, 10092, 15480, 20531, 23568), 76 << 8 }, { CDF5( 30, 4783, 10714, 16396, 20863), 0 << 8 }, }, { { CDF5(18808, 30564, 32414, 32708, 32732), 93 << 8 }, { CDF5( 8341, 26149, 31856, 32628, 32726), 3 << 8 }, }, { { CDF5(13189, 27414, 31625, 32526, 32712), 90 << 8 }, { CDF5( 95, 21665, 30252, 32221, 32614), 3 << 8 }, }, { { CDF5( 9837, 22542, 28890, 31327, 
32298), 78 << 8 }, { CDF5( 6, 16453, 26512, 30728, 32308), 75 << 8 }, }, { { CDF5( 7017, 18407, 25212, 29652, 31622), 78 << 8 }, { CDF5( 31, 11962, 22617, 28211, 31109), 75 << 8 }, }, { { CDF5( 5446, 14294, 21338, 26300, 29677), 78 << 8 }, { CDF5( 156, 8220, 17040, 23841, 28514), 78 << 8 }, }, { { CDF5(24803, 31286, 32079, 32381, 32540), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(12821, 26214, 30050, 31373, 31972), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7896, 17264, 23904, 27850, 29752), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7045, 14770, 19995, 23659, 26108), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2310, 6418, 12150, 16868, 20631), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1672, 5533, 10030, 13735, 17567), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1219, 4524, 7028, 10416, 13388), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(29688, 32425, 32651, 32731, 32735), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(15856, 26584, 30786, 32053, 32570), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 8823, 17890, 24454, 28342, 30615), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7001, 15331, 21333, 25485, 28372), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 3542, 10300, 15274, 19865, 23271), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(20460, 32339, 32513, 32662, 32732), 13 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 283, 17174, 26580, 30375, 31638), 75 << 8 }, { CDF5( 4, 5576, 21745, 28213, 30821), 75 << 8 }, }, { { CDF5(11048, 22170, 26485, 29197, 30921), 0 << 8 }, { CDF5( 4, 16199, 25093, 29139, 30504), 0 << 8 }, }, { { CDF5( 6377, 13511, 19779, 
25102, 28132), 0 << 8 }, { CDF5( 4, 7377, 18122, 23594, 27200), 78 << 8 }, }, { { CDF5( 4500, 10287, 15895, 21180, 24672), 0 << 8 }, { CDF5( 16, 5691, 12262, 17802, 21695), 0 << 8 }, }, { { CDF5( 2790, 7843, 12604, 16931, 20564), 93 << 8 }, { CDF5( 4, 4821, 9847, 13980, 18278), 91 << 8 }, }, { { CDF5( 1381, 7439, 10964, 14802, 18073), 76 << 8 }, { CDF5( 1522, 2450, 6619, 10499, 14157), 16 << 8 }, }, { { CDF5( 2025, 5643, 8935, 12150, 14843), 90 << 8 }, { CDF5( 224, 2917, 6661, 10512, 13337), 90 << 8 }, }, { { CDF5( 1368, 3825, 6654, 9038, 11091), 90 << 8 }, { CDF5( 41, 2582, 5312, 7720, 10479), 90 << 8 }, }, { { CDF5(20410, 31676, 32514, 32692, 32732), 80 << 8 }, { CDF5(25631, 26346, 32305, 32649, 32732), 6 << 8 }, }, { { CDF5( 9017, 26371, 30820, 32288, 32624), 75 << 8 }, { CDF5( 4, 20642, 29893, 32138, 32626), 0 << 8 }, }, { { CDF5(11957, 24371, 29434, 31605, 32329), 90 << 8 }, { CDF5( 4, 19715, 28037, 30945, 32081), 1 << 8 }, }, { { CDF5( 6587, 17392, 24549, 28765, 30910), 75 << 8 }, { CDF5( 4, 13017, 22846, 28010, 30563), 75 << 8 }, }, { { CDF5( 4820, 13076, 20235, 25274, 28367), 90 << 8 }, { CDF5( 4, 9517, 18022, 23590, 27505), 75 << 8 }, }, { { CDF5( 3839, 11042, 17152, 22327, 25816), 93 << 8 }, { CDF5( 4, 7434, 14558, 20495, 24720), 90 << 8 }, }, { { CDF5( 3178, 9048, 14018, 18246, 21456), 93 << 8 }, { CDF5( 41, 4587, 9592, 14488, 18984), 75 << 8 }, }, { { CDF5(24602, 32389, 32700, 32736, 32740), 75 << 8 }, { CDF5( 7626, 30290, 32589, 32730, 32734), 75 << 8 }, }, { { CDF5(17206, 30451, 32404, 32696, 32732), 90 << 8 }, { CDF5( 4, 26159, 31880, 32631, 32732), 75 << 8 }, }, { { CDF5(12799, 26717, 31223, 32384, 32678), 75 << 8 }, { CDF5( 4, 21576, 29603, 32028, 32595), 0 << 8 }, }, { { CDF5( 7923, 20676, 27760, 30909, 32092), 78 << 8 }, { CDF5( 10, 15320, 25351, 29888, 31811), 78 << 8 }, }, { { CDF5( 4552, 13193, 20089, 24958, 28270), 78 << 8 }, { CDF5( 29, 9104, 17079, 23044, 27203), 78 << 8 }, }, { { CDF5(29032, 32327, 32595, 32682, 32720), 75 << 8 }, { CDF5( 
5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(16965, 29286, 31651, 32307, 32527), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9346, 21389, 27317, 29974, 31327), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6585, 15649, 21693, 25674, 28044), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 3417, 10093, 16159, 21188, 24606), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2304, 7456, 12293, 16563, 20267), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1811, 5284, 8853, 12277, 15769), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(30975, 32679, 32738, 32742, 32746), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(20345, 30725, 32278, 32643, 32726), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(12243, 25011, 29788, 31583, 32260), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 8494, 19928, 26600, 29501, 31142), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4992, 13924, 20358, 24637, 27524), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(24594, 32389, 32641, 32718, 32732), 37 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2358, 19217, 26997, 30649, 31793), 1 << 8 }, { CDF5( 4, 10990, 24500, 28869, 31218), 5 << 8 }, }, { { CDF5(12447, 24328, 28566, 30469, 31468), 1 << 8 }, { CDF5( 4, 18559, 25929, 29452, 30837), 1 << 8 }, }, { { CDF5( 6059, 14350, 21101, 25538, 28284), 0 << 8 }, { CDF5( 4, 8626, 17947, 23810, 27621), 1 << 8 }, }, { { CDF5( 3725, 10288, 16087, 20666, 24016), 75 << 8 }, { CDF5( 4, 6533, 12288, 17848, 22413), 80 << 8 }, }, { { CDF5( 2171, 7502, 12172, 15887, 19156), 0 << 8 }, { CDF5( 7, 5094, 9919, 14712, 18887), 90 << 8 }, }, { { CDF5( 2182, 6455, 10199, 13836, 17142), 75 << 8 }, { CDF5( 4, 
4141, 8300, 12354, 15957), 75 << 8 }, }, { { CDF5( 1639, 5232, 8372, 11445, 14349), 75 << 8 }, { CDF5( 10, 3077, 6545, 10446, 13509), 0 << 8 }, }, { { CDF5( 1084, 4304, 6721, 9558, 11647), 75 << 8 }, { CDF5( 545, 1846, 4045, 6508, 8619), 15 << 8 }, }, { { CDF5(24344, 31910, 32550, 32682, 32732), 86 << 8 }, { CDF5(26511, 26812, 32257, 32605, 32732), 6 << 8 }, }, { { CDF5(11758, 27192, 31302, 32386, 32693), 76 << 8 }, { CDF5( 4, 22871, 30546, 32311, 32672), 76 << 8 }, }, { { CDF5(14135, 26534, 30495, 32044, 32475), 1 << 8 }, { CDF5( 4, 20045, 27213, 30377, 31787), 5 << 8 }, }, { { CDF5( 7076, 18263, 25043, 28687, 30681), 78 << 8 }, { CDF5( 4, 13464, 22336, 27466, 30144), 0 << 8 }, }, { { CDF5( 4633, 13655, 20720, 25269, 27732), 90 << 8 }, { CDF5( 4, 8679, 16355, 22272, 26693), 90 << 8 }, }, { { CDF5( 3748, 10999, 17182, 22075, 25415), 75 << 8 }, { CDF5( 4, 7508, 14197, 19726, 24108), 90 << 8 }, }, { { CDF5( 2184, 7460, 12244, 16404, 19473), 75 << 8 }, { CDF5( 69, 5366, 10343, 14774, 18517), 76 << 8 }, }, { { CDF5(25069, 32379, 32715, 32736, 32740), 90 << 8 }, { CDF5( 5129, 30522, 32636, 32736, 32740), 75 << 8 }, }, { { CDF5(19064, 30841, 32497, 32710, 32732), 75 << 8 }, { CDF5( 4, 27337, 32042, 32655, 32732), 0 << 8 }, }, { { CDF5(15344, 28138, 31636, 32491, 32707), 0 << 8 }, { CDF5( 10, 23734, 30387, 32192, 32638), 0 << 8 }, }, { { CDF5( 8553, 21612, 28244, 31035, 32141), 93 << 8 }, { CDF5( 4, 16669, 26367, 30466, 32062), 75 << 8 }, }, { { CDF5( 4707, 13756, 20729, 25449, 28537), 75 << 8 }, { CDF5( 9, 9798, 18016, 23799, 27538), 75 << 8 }, }, { { CDF5(30472, 32430, 32636, 32704, 32730), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(16740, 29432, 31724, 32306, 32530), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9770, 21395, 26819, 29645, 31040), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5883, 13836, 19460, 22882, 25424), 6 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 
27307), 0 << 8 }, }, { { CDF5( 3255, 9742, 15113, 19977, 23209), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2318, 6734, 11462, 15492, 18969), 91 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 1344, 4886, 8490, 11147, 13803), 1 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(31970, 32735, 32740, 32744, 32748), 15 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(21766, 31615, 32549, 32682, 32732), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(14071, 27537, 31162, 32180, 32564), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9394, 22451, 28320, 30624, 31619), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6357, 16098, 22083, 26080, 28701), 76 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(22670, 31040, 32062, 32396, 32494), 62 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6204, 18887, 26106, 29994, 31527), 1 << 8 }, { CDF5( 4, 13216, 24238, 29218, 31178), 0 << 8 }, }, { { CDF5(10730, 22881, 27929, 30084, 31344), 25 << 8 }, { CDF5( 4, 18763, 26180, 29027, 30353), 26 << 8 }, }, { { CDF5( 4419, 14204, 19910, 24823, 27593), 6 << 8 }, { CDF5( 4, 9746, 17528, 23820, 27251), 0 << 8 }, }, { { CDF5( 3252, 8648, 14582, 18530, 21957), 31 << 8 }, { CDF5( 4, 6941, 13319, 18486, 23105), 75 << 8 }, }, { { CDF5( 2193, 8919, 13231, 16176, 19402), 75 << 8 }, { CDF5( 957, 4227, 8679, 14052, 18540), 0 << 8 }, }, { { CDF5( 1055, 5756, 9129, 12574, 14838), 26 << 8 }, { CDF5( 822, 4183, 8160, 12989, 16983), 75 << 8 }, }, { { CDF5( 2072, 7295, 11937, 15990, 18808), 90 << 8 }, { CDF5( 1296, 1658, 4422, 5572, 8265), 26 << 8 }, }, { { CDF5( 1733, 4681, 7680, 10654, 12528), 75 << 8 }, { CDF5( 4, 1678, 2994, 4855, 6537), 75 << 8 }, }, { { CDF5(24889, 31894, 32576, 32692, 32732), 6 << 8 }, { CDF5( 7489, 28023, 32440, 32702, 32732), 12 << 8 }, 
}, { { CDF5(13420, 28082, 31481, 32420, 32693), 2 << 8 }, { CDF5( 4, 24196, 30708, 32346, 32682), 5 << 8 }, }, { { CDF5(14158, 26101, 30271, 31760, 32366), 5 << 8 }, { CDF5( 4, 20337, 27469, 30511, 31808), 5 << 8 }, }, { { CDF5( 6545, 17433, 24600, 28570, 30488), 15 << 8 }, { CDF5( 4, 14301, 23067, 27600, 30048), 0 << 8 }, }, { { CDF5( 4611, 12681, 19425, 24119, 27368), 75 << 8 }, { CDF5( 6, 8765, 16411, 22190, 26422), 90 << 8 }, }, { { CDF5( 3293, 10520, 16126, 21429, 23703), 0 << 8 }, { CDF5( 13, 7300, 14277, 19168, 23448), 75 << 8 }, }, { { CDF5( 2464, 7343, 11864, 15738, 18418), 90 << 8 }, { CDF5( 85, 4192, 8349, 12391, 15795), 0 << 8 }, }, { { CDF5(26166, 32502, 32728, 32736, 32740), 0 << 8 }, { CDF5( 4561, 31126, 32673, 32736, 32740), 76 << 8 }, }, { { CDF5(19676, 30981, 32490, 32712, 32732), 75 << 8 }, { CDF5( 5, 27917, 32248, 32677, 32732), 1 << 8 }, }, { { CDF5(15573, 28642, 31918, 32572, 32718), 1 << 8 }, { CDF5( 33, 24002, 30459, 32142, 32635), 80 << 8 }, }, { { CDF5( 8718, 22161, 28870, 31388, 32324), 76 << 8 }, { CDF5( 37, 17061, 26380, 30433, 31990), 75 << 8 }, }, { { CDF5( 4574, 13326, 20007, 24664, 27510), 75 << 8 }, { CDF5( 75, 9240, 17009, 22503, 26129), 80 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 
10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(15828, 27456, 29866, 30952, 31512), 62 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4322, 17878, 25258, 29337, 30946), 31 << 8 }, { CDF5( 4, 11985, 23302, 27957, 30495), 6 << 8 }, }, { { CDF5( 9877, 22124, 26984, 29481, 30923), 1 << 8 }, { CDF5( 4, 18007, 23542, 27862, 29724), 7 << 8 }, }, { { CDF5( 4896, 15934, 20448, 22985, 25552), 6 << 8 }, { CDF5( 882, 5246, 14586, 20929, 25166), 31 << 8 }, }, { { CDF5( 2897, 11697, 15855, 19987, 22265), 1 << 8 }, { CDF5( 4, 5352, 8285, 14883, 19798), 1 << 8 }, }, { { CDF5( 2550, 7215, 11328, 15219, 17984), 3 << 8 }, { CDF5( 4, 4270, 8893, 12038, 15001), 18 << 8 }, }, { { CDF5( 1976, 5938, 10048, 13248, 15220), 1 << 8 }, { CDF5( 74, 2872, 5881, 8719, 12203), 3 << 8 }, }, { { CDF5( 1557, 5182, 7687, 10041, 12530), 18 << 8 }, { CDF5( 302, 3173, 6064, 9146, 12139), 16 << 8 }, }, { { CDF5( 1041, 2835, 4612, 6619, 7701), 75 << 8 }, { CDF5( 284, 1622, 3979, 5622, 7317), 75 << 8 }, }, { { CDF5(23459, 31826, 32523, 32706, 32732), 32 << 8 }, { CDF5( 4370, 28739, 32302, 32698, 32732), 30 << 8 }, }, { { CDF5(14211, 27544, 31308, 32252, 32647), 0 << 8 }, { CDF5( 4, 24453, 30550, 32209, 32593), 1 << 8 }, }, { { CDF5(13549, 26000, 29944, 31604, 32248), 1 << 8 }, { CDF5( 4, 19875, 26915, 30003, 31480), 6 << 8 }, }, { { CDF5( 6688, 18459, 24591, 28101, 29825), 1 << 8 }, { CDF5( 956, 10175, 19300, 25085, 28308), 1 << 8 }, }, { { CDF5( 4149, 12273, 18024, 22204, 25189), 0 << 
8 }, { CDF5( 4, 7907, 14615, 20513, 24277), 0 << 8 }, }, { { CDF5( 3084, 9362, 14515, 19060, 22086), 0 << 8 }, { CDF5( 881, 5812, 11992, 16891, 21229), 1 << 8 }, }, { { CDF5( 2270, 6094, 9436, 13046, 15065), 90 << 8 }, { CDF5( 4, 2628, 6124, 9042, 11413), 1 << 8 }, }, { { CDF5(26213, 32584, 32715, 32736, 32740), 31 << 8 }, { CDF5( 9595, 31175, 32663, 32736, 32740), 81 << 8 }, }, { { CDF5(19100, 30551, 32351, 32675, 32732), 80 << 8 }, { CDF5( 42, 27427, 31958, 32640, 32732), 1 << 8 }, }, { { CDF5(14907, 27403, 31007, 32163, 32585), 0 << 8 }, { CDF5( 74, 23383, 30040, 31948, 32492), 6 << 8 }, }, { { CDF5( 8140, 21140, 27523, 30537, 31800), 1 << 8 }, { CDF5( 601, 15748, 25003, 29331, 31277), 1 << 8 }, }, { { CDF5( 3878, 11230, 17295, 21736, 24654), 75 << 8 }, { CDF5( 267, 7839, 14608, 19822, 23499), 75 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 
5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, }, .dc_sign = { { { { CDF1(17575), 75 << 8 }, { CDF1(14224), 78 << 8 }, { CDF1(19801), 93 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .br_y_tok_hf = { { CDF3(29897, 32392, 32630), 30 << 8 }, { CDF3(27484, 32113, 32486), 3 << 8 }, { CDF3(23979, 31279, 32028), 3 << 8 }, { CDF3(19126, 28997, 30760), 3 << 8 }, { CDF3(14329, 25240, 28412), 0 << 8 }, { CDF3(10861, 20773, 25861), 75 << 8 }, { CDF3( 7274, 15044, 21663), 0 << 8 }, }, .br_y_tok_lf = { { CDF3(13319, 21082, 26725), 37 << 8 }, { CDF3(15058, 23563, 27312), 75 << 8 }, { CDF3(12093, 21552, 24381), 91 << 8 }, { CDF3( 9847, 18914, 21611), 76 << 8 }, { CDF3( 6984, 14207, 17984), 75 << 8 }, { CDF3( 5580, 11627, 15411), 75 << 8 }, { CDF3( 3907, 8590, 10683), 81 << 8 }, { CDF3(19858, 26499, 28738), 31 << 8 }, { CDF3(19608, 28127, 30429), 0 << 8 }, { CDF3(17478, 27390, 29446), 0 << 8 }, { CDF3(14848, 25124, 27561), 0 << 8 }, { CDF3(11376, 21082, 24410), 75 << 8 }, { CDF3( 9179, 17862, 21918), 90 << 8 }, { CDF3( 6435, 13300, 17660), 75 << 8 }, }, .bob_base_y_tok = { { { CDF2(17507, 24673), 90 << 8 }, { CDF2(19003, 25281), 90 << 8 }, { CDF2(18718, 27145), 0 << 8 }, }, { { CDF2(22679, 29259), 15 << 8 }, { CDF2(24245, 30291), 0 << 8 }, { CDF2(25590, 31051), 0 << 8 }, }, { { CDF2(26844, 31178), 6 << 8 }, { CDF2(28524, 31813), 1 << 8 }, { CDF2(29430, 32097), 1 << 8 }, }, }, .br_y_tok_idtx = { { { CDF3(15586, 23809, 28480), 31 << 8 }, { CDF3(13137, 22338, 27776), 26 << 8 }, { CDF3(12960, 20989, 27007), 1 << 8 }, { CDF3(14487, 22004, 26872), 1 << 8 }, { CDF3( 8977, 19506, 25832), 1 << 8 }, { CDF3( 6689, 13456, 20960), 7 << 8 }, { CDF3( 3660, 8043, 13467), 
0 << 8 }, }, { { CDF3(19250, 25820, 29038), 31 << 8 }, { CDF3(20479, 27597, 30639), 31 << 8 }, { CDF3(19284, 26800, 30037), 26 << 8 }, { CDF3(19120, 26676, 30033), 31 << 8 }, { CDF3(14799, 24819, 29256), 31 << 8 }, { CDF3(10041, 18099, 24102), 31 << 8 }, { CDF3( 6659, 13728, 19983), 31 << 8 }, }, { { CDF3(18981, 24347, 27397), 32 << 8 }, { CDF3(22083, 28198, 30532), 32 << 8 }, { CDF3(21589, 28346, 30716), 32 << 8 }, { CDF3(22576, 28726, 31113), 32 << 8 }, { CDF3(17332, 27607, 30702), 37 << 8 }, { CDF3(10788, 18280, 24206), 32 << 8 }, { CDF3( 7269, 15134, 21954), 62 << 8 }, }, }, .base_y_tok_idtx = { { { CDF3(29171, 31341, 32069), 3 << 8 }, { CDF3(19097, 27499, 29866), 75 << 8 }, { CDF3(16660, 23539, 28386), 0 << 8 }, { CDF3(13020, 18986, 22564), 5 << 8 }, { CDF3( 6762, 13469, 18997), 31 << 8 }, { CDF3( 5230, 9452, 15176), 32 << 8 }, { CDF3( 3282, 5921, 7884), 31 << 8 }, }, { { CDF3(31506, 32472, 32667), 75 << 8 }, { CDF3(20403, 29740, 31743), 3 << 8 }, { CDF3(14938, 24923, 30084), 0 << 8 }, { CDF3(10891, 19070, 24198), 30 << 8 }, { CDF3( 4812, 12877, 20442), 31 << 8 }, { CDF3( 3269, 8035, 14577), 31 << 8 }, { CDF3( 2373, 5157, 8269), 31 << 8 }, }, { { CDF3(32371, 32695, 32745), 75 << 8 }, { CDF3(21598, 31049, 32391), 0 << 8 }, { CDF3(14515, 26494, 31233), 31 << 8 }, { CDF3(11239, 19165, 24574), 31 << 8 }, { CDF3( 4137, 12390, 21147), 32 << 8 }, { CDF3( 2658, 7002, 14001), 31 << 8 }, { CDF3( 2391, 5537, 8704), 26 << 8 }, }, }, .sign_idtx = { { { CDF1(20565), 0 << 8 }, { CDF1(28478), 25 << 8 }, { CDF1( 8011), 15 << 8 }, { CDF1(31569), 56 << 8 }, { CDF1( 9937), 31 << 8 }, { CDF1(32166), 78 << 8 }, { CDF1( 4360), 83 << 8 }, { CDF1(32715), 38 << 8 }, { CDF1( 2253), 43 << 8 }, }, { { CDF1(21616), 26 << 8 }, { CDF1(29474), 25 << 8 }, { CDF1( 8169), 25 << 8 }, { CDF1(31873), 56 << 8 }, { CDF1( 6252), 31 << 8 }, { CDF1(32360), 18 << 8 }, { CDF1( 2458), 0 << 8 }, { CDF1(32719), 52 << 8 }, { CDF1( 682), 68 << 8 }, }, { { CDF1(22816), 31 << 8 }, { CDF1(30531), 26 << 8 }, { 
CDF1( 5700), 31 << 8 }, { CDF1(31507), 62 << 8 }, { CDF1( 1539), 62 << 8 }, { CDF1(32572), 26 << 8 }, { CDF1( 1015), 32 << 8 }, { CDF1(32694), 35 << 8 }, { CDF1( 412), 37 << 8 }, }, }, .skip_v = { { CDF1( 777), 93 << 8 }, { CDF1( 6871), 75 << 8 }, { CDF1(15620), 0 << 8 }, { CDF1( 1146), 35 << 8 }, { CDF1(14000), 25 << 8 }, { CDF1(24576), 50 << 8 }, { CDF1( 4536), 90 << 8 }, { CDF1(13035), 75 << 8 }, { CDF1(21755), 75 << 8 }, { CDF1( 5906), 35 << 8 }, { CDF1(18536), 25 << 8 }, { CDF1(26466), 50 << 8 }, }, .eob_base_uv_tok_hf = { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32086, 32691), 93 << 8 }, { CDF2(32538, 32743), 93 << 8 }, { CDF2(32512, 32730), 93 << 8 }, }, .base_uv_tok_hf = { { CDF3(26928, 32457, 32720), 75 << 8 }, { CDF3(16472, 30015, 32195), 75 << 8 }, { CDF3( 9599, 22905, 28625), 75 << 8 }, { CDF3( 4866, 13616, 20435), 75 << 8 }, { CDF3(27711, 32490, 32726), 75 << 8 }, { CDF3(16379, 30167, 32197), 75 << 8 }, { CDF3( 9085, 22146, 28034), 1 << 8 }, { CDF3( 3893, 11359, 17565), 75 << 8 }, { CDF3(30658, 32684, 32748), 90 << 8 }, { CDF3(22171, 31876, 32636), 90 << 8 }, { CDF3(10726, 23535, 29193), 17 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, .br_uv_tok_hf = { { CDF3(26645, 31153, 32251), 78 << 8 }, { CDF3(23320, 29627, 31652), 90 << 8 }, { CDF3(18265, 26040, 29663), 75 << 8 }, { CDF3(11140, 18504, 23213), 1 << 8 }, }, .eob_base_uv_tok_lf = { { CDF4(30964, 32463, 32673, 32733), 75 << 8 }, { CDF4(31731, 32623, 32730, 32740), 1 << 8 }, { CDF4(30879, 32453, 32716, 32740), 16 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, .base_uv_tok_lf = { { CDF5(14853, 28192, 31400, 32275, 32557), 1 << 8 }, { CDF5(10499, 24975, 30136, 31803, 32399), 75 << 8 }, { CDF5( 5543, 15727, 22759, 27080, 29613), 75 << 8 }, { CDF5( 2214, 6634, 11179, 15534, 19289), 90 << 8 }, { CDF5(15032, 28602, 31421, 32234, 32548), 0 << 8 }, { CDF5(10857, 24883, 29885, 31603, 32255), 0 << 8 }, { CDF5( 5068, 14275, 20979, 25484, 28338), 0 << 8 }, { CDF5( 1825, 5574, 9437, 13284, 16813), 93 
<< 8 }, { CDF5(28668, 32450, 32696, 32736, 32740), 15 << 8 }, { CDF5(17386, 30911, 32385, 32652, 32732), 90 << 8 }, { CDF5( 9471, 20994, 27290, 30512, 31784), 15 << 8 }, { CDF5( 7211, 13700, 19148, 23635, 26679), 0 << 8 }, } }, [3] = { .skip = { { { { CDF1(21443), 16 << 8 }, { CDF1( 4688), 0 << 8 }, { CDF1( 4478), 90 << 8 }, { CDF1( 7588), 0 << 8 }, { CDF1( 9776), 1 << 8 }, { CDF1(14682), 1 << 8 }, { CDF1( 421), 90 << 8 }, { CDF1( 2639), 76 << 8 }, { CDF1( 5035), 40 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(27522), 0 << 8 }, { CDF1( 5380), 78 << 8 }, { CDF1( 5185), 90 << 8 }, { CDF1(10193), 0 << 8 }, { CDF1(13111), 1 << 8 }, { CDF1(22246), 31 << 8 }, { CDF1( 1317), 0 << 8 }, { CDF1( 5966), 90 << 8 }, { CDF1(10326), 75 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(30197), 6 << 8 }, { CDF1( 8536), 0 << 8 }, { CDF1( 7061), 75 << 8 }, { CDF1(15977), 6 << 8 }, { CDF1(17416), 6 << 8 }, { CDF1(28033), 6 << 8 }, { CDF1( 2336), 0 << 8 }, { CDF1( 8267), 76 << 8 }, { CDF1(13804), 76 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(32558), 56 << 8 }, { CDF1( 3796), 0 << 8 }, { CDF1( 4485), 76 << 8 }, { CDF1(16972), 32 << 8 }, { CDF1(17777), 37 << 8 }, { CDF1(29879), 35 << 8 }, { CDF1( 3523), 1 << 8 }, { CDF1(10339), 2 << 8 }, { CDF1(17710), 7 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(32683), 93 << 8 }, { CDF1( 6378), 32 << 8 }, { CDF1(13628), 2 << 8 }, { CDF1(24546), 76 << 8 }, { CDF1(26525), 90 << 8 }, { CDF1(31671), 75 << 8 }, { CDF1( 5036), 31 << 8 }, { CDF1(11209), 32 << 8 }, { CDF1(17036), 7 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(32754), 50 << 8 }, { CDF1(12951), 93 << 8 }, { CDF1(15900), 93 << 8 }, { CDF1(14312), 16 << 8 }, { CDF1(17581), 15 << 8 }, { CDF1(19079), 18 << 8 }, { CDF1( 507), 93 << 8 }, { CDF1( 3382), 28 << 8 }, { CDF1( 7041), 18 << 8 }, { CDF1(32482), 81 << 8 }, }, { { CDF1(32755), 68 << 8 }, { CDF1(11625), 93 << 8 }, { CDF1(10365), 93 << 8 }, { CDF1(11240), 93 << 8 }, { CDF1(13860), 75 << 8 }, { CDF1(18669), 0 << 8 }, { CDF1( 620), 93 << 8 }, { 
CDF1( 4574), 76 << 8 }, { CDF1(11493), 83 << 8 }, { CDF1(31754), 75 << 8 }, }, { { CDF1(32749), 93 << 8 }, { CDF1(13494), 93 << 8 }, { CDF1(13995), 90 << 8 }, { CDF1(14723), 90 << 8 }, { CDF1(19850), 0 << 8 }, { CDF1(24076), 1 << 8 }, { CDF1( 816), 75 << 8 }, { CDF1( 8011), 6 << 8 }, { CDF1(18757), 11 << 8 }, { CDF1(32020), 3 << 8 }, }, { { CDF1(32697), 37 << 8 }, { CDF1(12925), 90 << 8 }, { CDF1(14605), 80 << 8 }, { CDF1(15058), 77 << 8 }, { CDF1(23741), 18 << 8 }, { CDF1(25610), 6 << 8 }, { CDF1( 1293), 5 << 8 }, { CDF1(11938), 1 << 8 }, { CDF1(22991), 32 << 8 }, { CDF1(26438), 65 << 8 }, }, { { CDF1(32644), 37 << 8 }, { CDF1(16042), 82 << 8 }, { CDF1(24056), 7 << 8 }, { CDF1(24203), 7 << 8 }, { CDF1(27963), 5 << 8 }, { CDF1(29458), 5 << 8 }, { CDF1( 2804), 57 << 8 }, { CDF1(19130), 43 << 8 }, { CDF1(26311), 60 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .eob_bin_16 = { { CDF4( 5029, 12046, 26017, 32453), 31 << 8 }, { CDF4( 3860, 5738, 8894, 18233), 31 << 8 }, { CDF4(29060, 30345, 32212, 32705), 27 << 8 }, }, .eob_bin_32 = { { CDF5( 7189, 11732, 21773, 29775, 32060), 31 << 8 }, { CDF5( 3833, 7634, 15540, 24684, 30675), 31 << 8 }, { CDF5(26250, 28453, 31497, 32596, 32732), 32 << 8 }, }, .eob_bin_64 = { { CDF6( 3765, 6333, 13340, 20831, 28830, 32289), 31 << 8 }, { CDF6( 5045, 7407, 13926, 20909, 27015, 31664), 31 << 8 }, { CDF6(24202, 26714, 30623, 32371, 32728, 32732), 6 << 8 }, }, .eob_bin_128 = { { CDF7( 4122, 6840, 13278, 20906, 28131, 31450, 32305), 31 << 8 }, { CDF7( 4575, 6063, 10938, 17191, 23126, 27885, 31584), 31 << 8 }, { CDF7(23490, 25861, 30005, 32096, 32701, 32720, 32724), 1 << 8 }, }, .eob_bin_256 = { { CDF7( 3574, 5683, 11604, 19611, 27344, 30852, 31552), 31 << 8 }, { CDF7( 7197, 7949, 12414, 18002, 23115, 27424, 30391), 32 << 8 }, { CDF7(23383, 25747, 29846, 31947, 32613, 32720, 32724), 6 << 8 }, }, .eob_bin_512 = { { CDF7( 3753, 5634, 10147, 17539, 26473, 32201, 32398), 6 << 8 }, { CDF7(11196, 12269, 16359, 22348, 27230, 30729, 31805), 32 << 8 }, { 
CDF7(22240, 24551, 29042, 31545, 32498, 32720, 32724), 7 << 8 }, }, .eob_bin_1024 = { { CDF7( 5838, 8921, 14454, 21162, 27888, 32275, 32424), 31 << 8 }, { CDF7(15377, 16920, 20643, 25175, 28586, 31294, 32008), 32 << 8 }, { CDF7(23067, 25626, 29608, 31655, 32465, 32687, 32716), 7 << 8 }, }, .eob_hi_bit = { CDF1(20364), 3 << 8 }, .eob_base_y_tok_hf = { { { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(10923, 21845), 0 << 8 }, { CDF2(31358, 32747), 18 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32389, 32626), 75 << 8 }, { CDF2(32642, 32756), 93 << 8 }, { CDF2(32629, 32756), 18 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32686, 32756), 93 << 8 }, { CDF2(32726, 32756), 93 << 8 }, { CDF2(32724, 32756), 15 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32700, 32756), 93 << 8 }, { CDF2(32577, 32756), 40 << 8 }, { CDF2(32626, 32753), 88 << 8 }, }, { { CDF2(10923, 21845), 0 << 8 }, { CDF2(32708, 32756), 93 << 8 }, { CDF2(32603, 32749), 91 << 8 }, { CDF2(32644, 32756), 42 << 8 }, }, }, .eob_base_y_tok_lf = { { { CDF4(30609, 32413, 32637, 32716), 5 << 8 }, { CDF4(30333, 32610, 32744, 32748), 1 << 8 }, { CDF4(31067, 32497, 32676, 32711), 1 << 8 }, { CDF4(32062, 32748, 32752, 32756), 36 << 8 }, }, { { CDF4(31289, 32570, 32714, 32740), 1 << 8 }, { CDF4(32248, 32709, 32744, 32748), 75 << 8 }, { CDF4(32234, 32719, 32744, 32748), 75 << 8 }, { CDF4(32449, 32740, 32744, 32748), 3 << 8 }, }, { { CDF4(31348, 32506, 32654, 32724), 6 << 8 }, { CDF4(32301, 32703, 32744, 32748), 90 << 8 }, { CDF4(32692, 32739, 32744, 32748), 7 << 8 }, { CDF4(20480, 26624, 28672, 30720), 50 << 8 }, }, { { CDF4(30132, 32214, 32554, 32667), 7 << 8 }, { CDF4(32297, 32711, 32744, 32748), 90 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, { { CDF4(28223, 31648, 32311, 32527), 37 << 8 }, { CDF4(32184, 32695, 32744, 32748), 90 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 
<< 8 }, }, }, .base_y_tok_hf = { { { { CDF3(18405, 30002, 32555), 40 << 8 }, { CDF3(14824, 31208, 31988), 50 << 8 }, }, { { CDF3(15397, 31584, 32373), 75 << 8 }, { CDF3( 1347, 29476, 32319), 65 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(31972, 32470, 32748), 28 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(22231, 32012, 32721), 40 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 4681, 16384, 30427), 50 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(26034, 32672, 32748), 93 << 8 }, { CDF3(12576, 32175, 32745), 90 << 8 }, }, { { CDF3(22551, 32372, 32748), 93 << 8 }, { CDF3( 20, 30978, 32718), 15 << 8 }, }, { { CDF3(17805, 31280, 32648), 93 << 8 }, { CDF3( 1557, 28114, 32425), 3 << 8 }, }, { { CDF3(13292, 28676, 32147), 93 << 8 
}, { CDF3( 555, 23489, 31233), 0 << 8 }, }, { { CDF3(10339, 22768, 29435), 35 << 8 }, { CDF3( 758, 15761, 26377), 35 << 8 }, }, { { CDF3(29142, 32726, 32748), 15 << 8 }, { CDF3(17698, 32623, 32748), 25 << 8 }, }, { { CDF3(24984, 32622, 32748), 93 << 8 }, { CDF3( 600, 31789, 32745), 15 << 8 }, }, { { CDF3(18241, 32005, 32740), 78 << 8 }, { CDF3( 3053, 28834, 32573), 93 << 8 }, }, { { CDF3(11582, 29556, 32181), 18 << 8 }, { CDF3( 5757, 25240, 31661), 33 << 8 }, }, { { CDF3( 3072, 21504, 29184), 65 << 8 }, { CDF3( 643, 12208, 26343), 35 << 8 }, }, { { CDF3(29915, 32732, 32748), 78 << 8 }, { CDF3(24589, 32741, 32748), 0 << 8 }, }, { { CDF3(25072, 32653, 32747), 15 << 8 }, { CDF3( 2766, 32408, 32743), 78 << 8 }, }, { { CDF3(19064, 32047, 32688), 90 << 8 }, { CDF3( 6208, 29694, 32582), 75 << 8 }, }, { { CDF3(16118, 29137, 32325), 90 << 8 }, { CDF3( 3624, 26555, 32028), 6 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(32165, 32689, 32748), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(20616, 31528, 32613), 78 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(17095, 29686, 32489), 18 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(13280, 23529, 32046), 75 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 1725, 3449, 31043), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27252, 32687, 32748), 0 << 8 }, { CDF3(12250, 32394, 32747), 0 << 8 }, }, { { CDF3(22953, 32345, 32732), 93 << 8 }, { CDF3( 100, 31081, 32711), 75 << 8 }, }, { { CDF3(18289, 31130, 32616), 90 << 8 }, { CDF3( 1043, 27525, 32293), 15 << 8 }, }, { { CDF3(12030, 27356, 31773), 75 << 8 }, { CDF3( 1328, 21750, 30500), 18 << 8 }, }, { { CDF3( 8454, 22057, 29080), 90 << 8 }, { CDF3( 312, 16537, 27228), 76 << 8 }, }, { { CDF3(29639, 32739, 32748), 0 << 8 }, { CDF3(24459, 32557, 32748), 78 << 8 }, }, { { CDF3(24256, 32629, 32748), 75 << 8 }, { CDF3( 586, 31663, 32726), 5 << 8 }, }, { { CDF3(18917, 
31583, 32708), 0 << 8 }, { CDF3( 1616, 28584, 32470), 75 << 8 }, }, { { CDF3(13655, 29545, 32407), 75 << 8 }, { CDF3( 1396, 23860, 31469), 90 << 8 }, }, { { CDF3( 9270, 24668, 30796), 91 << 8 }, { CDF3( 1688, 18438, 28979), 91 << 8 }, }, { { CDF3(31096, 32751, 32755), 0 << 8 }, { CDF3(28942, 32716, 32748), 5 << 8 }, }, { { CDF3(25750, 32644, 32748), 75 << 8 }, { CDF3( 465, 32242, 32741), 90 << 8 }, }, { { CDF3(20669, 32346, 32741), 90 << 8 }, { CDF3( 2885, 29990, 32664), 75 << 8 }, }, { { CDF3(15748, 30594, 32632), 90 << 8 }, { CDF3( 372, 26420, 32273), 90 << 8 }, }, { { CDF3(10002, 25783, 31458), 81 << 8 }, { CDF3( 869, 20294, 30019), 92 << 8 }, }, { { CDF3(32513, 32752, 32756), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(24556, 32348, 32716), 90 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(19561, 30597, 32461), 1 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(11689, 27786, 31427), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(26428, 32701, 32748), 5 << 8 }, { CDF3( 8484, 32368, 32747), 0 << 8 }, }, { { CDF3(22687, 32407, 32740), 75 << 8 }, { CDF3( 390, 30961, 32716), 5 << 8 }, }, { { CDF3(18394, 31398, 32670), 75 << 8 }, { CDF3( 853, 27383, 32300), 75 << 8 }, }, { { CDF3(12738, 28040, 31842), 75 << 8 }, { CDF3( 1044, 21941, 30518), 75 << 8 }, }, { { CDF3( 6990, 19000, 26174), 78 << 8 }, { CDF3( 118, 14015, 24791), 75 << 8 }, }, { { CDF3(28826, 32741, 32748), 31 << 8 }, { CDF3(19295, 32513, 32748), 5 << 8 }, }, { { CDF3(24880, 32674, 32744), 76 << 8 }, { CDF3( 2546, 31692, 32741), 80 << 8 }, }, { { CDF3(19559, 31494, 32644), 76 << 8 }, { CDF3( 1553, 28807, 32451), 81 << 8 }, }, { { CDF3(12667, 27705, 31740), 5 << 8 }, { CDF3( 706, 22624, 30721), 0 << 8 }, }, { { CDF3( 7658, 20890, 27914), 75 << 8 }, { CDF3( 423, 16424, 25634), 91 << 8 }, }, { { CDF3(31332, 32752, 32756), 0 << 8 }, { CDF3(32634, 32730, 32748), 75 << 8 }, 
}, { { CDF3(26024, 32656, 32746), 75 << 8 }, { CDF3( 6440, 32017, 32721), 78 << 8 }, }, { { CDF3(21950, 32128, 32733), 90 << 8 }, { CDF3( 2501, 29939, 32539), 75 << 8 }, }, { { CDF3(15537, 30098, 32412), 75 << 8 }, { CDF3( 951, 25283, 31648), 3 << 8 }, }, { { CDF3( 8361, 23816, 30093), 76 << 8 }, { CDF3( 1474, 18475, 28692), 91 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, { { { CDF3(27339, 32708, 32748), 81 << 8 }, { CDF3(11572, 32394, 32748), 8 << 8 }, }, { { CDF3(23878, 32548, 32744), 5 << 8 }, { CDF3( 461, 31124, 32705), 0 << 8 }, }, { { CDF3(19326, 31492, 32668), 75 << 8 }, { CDF3( 664, 28301, 32374), 75 << 8 }, }, { { CDF3(13120, 27541, 31646), 76 << 8 }, { CDF3( 550, 22504, 30562), 76 << 8 }, }, { { CDF3( 6365, 17893, 24908), 15 << 8 }, { CDF3( 608, 12570, 22302), 90 << 8 }, }, { { CDF3(29384, 32732, 32748), 6 << 8 }, { CDF3(13817, 32574, 32748), 0 << 8 }, }, { { CDF3(24520, 32525, 32747), 76 << 8 }, { CDF3( 1775, 31839, 32739), 6 << 8 }, }, { { CDF3(19335, 31747, 32688), 6 << 8 }, { CDF3( 3566, 28880, 32362), 6 << 8 }, }, { { CDF3(12397, 27427, 31587), 1 << 8 }, { CDF3( 509, 22594, 30553), 0 << 8 }, }, { { CDF3( 7148, 19219, 26315), 75 << 8 }, { CDF3( 677, 13710, 23764), 76 << 8 }, }, { { CDF3(31236, 32752, 32756), 80 << 8 }, { CDF3(32618, 32731, 32748), 90 << 8 }, }, { { CDF3(27068, 32675, 32748), 75 << 8 }, { CDF3( 5945, 32111, 32744), 90 << 8 }, }, { { CDF3(22852, 32207, 32731), 75 << 8 }, { CDF3( 2177, 30282, 32570), 75 << 8 }, }, { { CDF3(14481, 29134, 32134), 0 << 8 }, { CDF3( 2589, 24920, 31424), 0 << 8 }, }, { { CDF3( 8171, 21620, 28271), 75 << 8 }, { CDF3( 169, 16674, 
26696), 75 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, }, }, .base_y_tok_lf = { { { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 100, 16721, 28276, 31260, 32398), 1 << 8 }, { CDF5( 4, 6986, 23211, 29768, 31892), 1 << 8 }, }, { { CDF5( 6124, 20746, 26659, 30297, 32090), 80 << 8 }, { CDF5( 4, 20450, 26906, 29984, 32052), 42 << 8 }, }, { { CDF5( 2483, 9881, 20654, 28573, 31617), 7 << 8 }, { CDF5( 4478, 5170, 20007, 29247, 31101), 12 << 8 }, }, { { CDF5( 4402, 12594, 21642, 24821, 29711), 60 << 8 }, { CDF5( 119, 12229, 15790, 22676, 27425), 28 << 8 }, }, { { CDF5( 5174, 12072, 22420, 24145, 25869), 0 << 8 }, { CDF5( 728, 2913, 5825, 21117, 22574), 90 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(18225, 31062, 32659, 32736, 32740), 31 << 8 }, { CDF5( 747, 25457, 32313, 32736, 32740), 43 << 8 }, }, { { CDF5( 9428, 30477, 32339, 32668, 32696), 18 << 8 }, { CDF5( 4, 21160, 31076, 32661, 32732), 0 << 8 }, }, { { CDF5(14256, 29803, 32043, 32616, 32732), 75 << 8 }, { CDF5( 10, 23629, 31540, 32549, 32730), 32 << 8 }, }, { { CDF5( 8535, 25300, 30787, 31549, 32463), 85 << 8 }, { CDF5( 1351, 18749, 29052, 31248, 32599), 90 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 3855, 9638, 17348, 19275, 23130), 50 
<< 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(20884, 32507, 32740, 32744, 32748), 15 << 8 }, { CDF5(11439, 30717, 32718, 32736, 32740), 83 << 8 }, }, { { CDF5(16651, 32234, 32676, 32736, 32740), 15 << 8 }, { CDF5( 2783, 29097, 32503, 32736, 32740), 80 << 8 }, }, { { CDF5(14119, 29823, 32164, 32617, 32692), 12 << 8 }, { CDF5( 2682, 23298, 31008, 32517, 32684), 88 << 8 }, }, { { CDF5(10503, 29407, 31508, 31928, 32348), 25 << 8 }, { CDF5( 886, 19484, 30111, 30997, 31882), 75 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(29595, 31985, 32553, 32665, 32732), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(16090, 27467, 31184, 32157, 32704), 1 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11881, 22576, 26436, 31160, 32105), 27 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4886, 21845, 24001, 27882, 28169), 60 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(29540, 32735, 32740, 32744, 32748), 6 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(21479, 29463, 32511, 32736, 32740), 93 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(15056, 26273, 29816, 32178, 32473), 25 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 
27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(10519, 30400, 31391, 32327, 32713), 15 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 86, 20785, 28528, 31103, 32087), 0 << 8 }, { CDF5( 4, 6067, 24484, 29795, 31615), 0 << 8 }, }, { { CDF5(10869, 20589, 26954, 30327, 31505), 0 << 8 }, { CDF5( 4, 15477, 26105, 29564, 31257), 1 << 8 }, }, { { CDF5( 4914, 15045, 23040, 27717, 30490), 0 << 8 }, { CDF5( 4, 10307, 19393, 25816, 29383), 15 << 8 }, }, { { CDF5( 2819, 10379, 17295, 23231, 27208), 75 << 8 }, { CDF5( 2939, 6238, 14472, 21322, 25358), 1 << 8 }, }, { { CDF5( 2510, 8572, 13721, 19650, 24615), 91 << 8 }, { CDF5( 4, 6558, 12858, 16531, 22847), 1 << 8 }, }, { { CDF5( 2098, 8373, 11493, 16534, 21082), 10 << 8 }, { CDF5( 371, 3300, 8699, 14469, 19163), 76 << 8 }, }, { { CDF5( 1409, 6107, 9513, 13859, 18674), 40 << 8 }, { CDF5( 1882, 3210, 8081, 12842, 17048), 85 << 8 }, }, { { CDF5( 780, 5461, 7802, 10923, 13263), 0 << 8 }, { CDF5( 537, 4835, 8058, 10744, 13967), 90 << 8 }, }, { { CDF5(21042, 31954, 32667, 32736, 32740), 18 << 8 }, { CDF5(28136, 28288, 32626, 32720, 32732), 57 << 8 }, }, { { CDF5( 7851, 28094, 31782, 32677, 32732), 78 << 8 }, { CDF5( 4, 21935, 31329, 32631, 32732), 1 << 8 }, }, { { CDF5(12762, 26468, 31109, 32414, 32693), 90 << 8 }, { CDF5( 4, 21556, 29551, 32049, 32643), 0 << 8 }, }, { { CDF5(10020, 22866, 29258, 31456, 32395), 90 << 8 }, { CDF5( 4, 17576, 27446, 31017, 32403), 0 << 8 }, }, { { CDF5( 7348, 19414, 26844, 30566, 31941), 90 << 8 }, { CDF5( 4, 12308, 22780, 28574, 31279), 75 << 8 }, }, { { CDF5( 6457, 17740, 24197, 28585, 30902), 78 << 8 }, { CDF5( 769, 11802, 20680, 26000, 30085), 15 << 8 }, }, { { CDF5( 4625, 14072, 20960, 25388, 28438), 40 << 8 }, { CDF5( 185, 8492, 16246, 20861, 25568), 90 << 8 }, }, { { CDF5(24491, 32563, 32740, 32744, 32748), 90 << 8 }, { CDF5(10193, 31065, 32692, 32736, 32740), 75 << 8 }, }, 
{ { CDF5(18023, 31654, 32683, 32736, 32740), 78 << 8 }, { CDF5( 4, 28229, 32493, 32736, 32740), 0 << 8 }, }, { { CDF5(15126, 29669, 32385, 32717, 32732), 3 << 8 }, { CDF5( 4, 25310, 31745, 32662, 32732), 0 << 8 }, }, { { CDF5(11803, 26851, 31619, 32578, 32732), 90 << 8 }, { CDF5( 4, 21347, 30308, 32362, 32732), 75 << 8 }, }, { { CDF5( 8413, 22123, 28902, 31454, 32484), 91 << 8 }, { CDF5( 533, 15815, 26833, 31091, 32338), 1 << 8 }, }, { { CDF5(30851, 32572, 32702, 32736, 32740), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(15898, 29692, 32156, 32573, 32722), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 9275, 21031, 27592, 31095, 32259), 78 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 7586, 16864, 22782, 27672, 30277), 83 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 4779, 14336, 21504, 25259, 27307), 25 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(32033, 32744, 32748, 32752, 32756), 3 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(21905, 31682, 32689, 32736, 32740), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(16223, 28310, 31943, 32665, 32704), 2 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(10587, 25458, 29995, 32012, 32516), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(20744, 31573, 32432, 32575, 32732), 37 << 8 }, { CDF5( 2048, 24576, 26624, 28672, 30720), 50 << 8 }, }, { { CDF5( 1512, 20783, 28537, 31233, 32030), 0 << 8 }, { CDF5( 4, 9686, 25465, 29825, 31669), 0 << 8 }, 
}, { { CDF5(11418, 22879, 28443, 30589, 31691), 0 << 8 }, { CDF5( 4, 18611, 26971, 30180, 31489), 1 << 8 }, }, { { CDF5( 5931, 14893, 21314, 26301, 29130), 0 << 8 }, { CDF5( 4, 10485, 20060, 26727, 29964), 2 << 8 }, }, { { CDF5( 4511, 11376, 17124, 22328, 26530), 90 << 8 }, { CDF5( 109, 4550, 13218, 19861, 24578), 6 << 8 }, }, { { CDF5( 2402, 9192, 14384, 19305, 22990), 75 << 8 }, { CDF5( 4, 5323, 10546, 15982, 20848), 76 << 8 }, }, { { CDF5( 3030, 8069, 12594, 17208, 20711), 3 << 8 }, { CDF5( 4, 4225, 9697, 14319, 18246), 75 << 8 }, }, { { CDF5( 2223, 6369, 10286, 13595, 18323), 83 << 8 }, { CDF5( 175, 2961, 7374, 11755, 15610), 3 << 8 }, }, { { CDF5( 1367, 6180, 9659, 13156, 15544), 90 << 8 }, { CDF5( 505, 2440, 6344, 10284, 13648), 92 << 8 }, }, { { CDF5(22323, 32120, 32586, 32723, 32732), 6 << 8 }, { CDF5(28677, 28707, 32573, 32736, 32740), 31 << 8 }, }, { { CDF5( 9067, 28461, 31803, 32646, 32732), 90 << 8 }, { CDF5( 4, 23713, 31500, 32628, 32732), 6 << 8 }, }, { { CDF5(14440, 27462, 31270, 32400, 32671), 75 << 8 }, { CDF5( 4, 21690, 28829, 31630, 32453), 5 << 8 }, }, { { CDF5( 8061, 20968, 27850, 30659, 31948), 93 << 8 }, { CDF5( 4, 16746, 25908, 29977, 31832), 5 << 8 }, }, { { CDF5( 5820, 16451, 23783, 28293, 30655), 90 << 8 }, { CDF5( 4, 10996, 20608, 26697, 30042), 5 << 8 }, }, { { CDF5( 4853, 14377, 20953, 25793, 29037), 75 << 8 }, { CDF5( 4, 10112, 18364, 24708, 28619), 75 << 8 }, }, { { CDF5( 3528, 10929, 17584, 23009, 26629), 78 << 8 }, { CDF5( 5, 8177, 15921, 21527, 25499), 75 << 8 }, }, { { CDF5(24757, 32619, 32740, 32744, 32748), 1 << 8 }, { CDF5( 8837, 31099, 32688, 32736, 32740), 78 << 8 }, }, { { CDF5(19775, 31747, 32667, 32736, 32740), 75 << 8 }, { CDF5( 4, 28933, 32511, 32732, 32736), 0 << 8 }, }, { { CDF5(17105, 29988, 32352, 32704, 32732), 75 << 8 }, { CDF5( 5, 26112, 31741, 32633, 32732), 0 << 8 }, }, { { CDF5(10631, 24891, 30498, 32167, 32638), 93 << 8 }, { CDF5( 4, 20962, 29880, 32217, 32689), 76 << 8 }, }, { { CDF5( 7259, 19563, 26854, 
30459, 32035), 90 << 8 }, { CDF5( 161, 13687, 24042, 29325, 31661), 75 << 8 }, }, { { CDF5(31243, 32617, 32724, 32736, 32740), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(15771, 29814, 32176, 32618, 32719), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 8991, 20335, 27006, 30538, 31920), 1 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11004, 16889, 22530, 26291, 28694), 32 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 2048, 4096, 16384, 20480, 28672), 50 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(32375, 32744, 32748, 32752, 32756), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(21853, 32148, 32719, 32736, 32740), 90 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(14100, 28596, 31850, 32559, 32685), 85 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5(11320, 25619, 29193, 31576, 32172), 75 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(17586, 29249, 30954, 31637, 31943), 60 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6607, 19220, 27249, 31067, 32015), 15 << 8 }, { CDF5( 4, 12693, 25132, 30060, 31735), 0 << 8 }, }, { { CDF5(11218, 23056, 28975, 31289, 32045), 5 << 8 }, { CDF5( 4, 16611, 26400, 28905, 30652), 1 << 8 }, }, { { CDF5( 4973, 15646, 22529, 27020, 30234), 75 << 8 }, { CDF5( 4, 11786, 18820, 25595, 28661), 75 << 8 }, }, { { CDF5( 4994, 11005, 17116, 23193, 25485), 0 << 8 }, { CDF5( 10, 5820, 14233, 19478, 24366), 75 << 8 }, }, { { CDF5( 1327, 8818, 
15358, 18969, 22525), 25 << 8 }, { CDF5( 1517, 5910, 11125, 16349, 21591), 0 << 8 }, }, { { CDF5( 123, 5065, 11340, 15150, 17222), 16 << 8 }, { CDF5( 1786, 5857, 10807, 14291, 19911), 75 << 8 }, }, { { CDF5( 2928, 7753, 12931, 17083, 20510), 75 << 8 }, { CDF5( 549, 2673, 5713, 7772, 12679), 3 << 8 }, }, { { CDF5( 1125, 4606, 9305, 11931, 14884), 13 << 8 }, { CDF5( 12, 3709, 6058, 8777, 11932), 85 << 8 }, }, { { CDF5(19675, 31285, 32524, 32698, 32732), 11 << 8 }, { CDF5(11455, 26740, 32279, 32717, 32732), 3 << 8 }, }, { { CDF5(10345, 28211, 31805, 32644, 32732), 3 << 8 }, { CDF5( 4, 23277, 31209, 32545, 32732), 8 << 8 }, }, { { CDF5(13559, 26348, 31030, 32344, 32653), 6 << 8 }, { CDF5( 4, 19849, 28463, 31321, 32304), 83 << 8 }, }, { { CDF5( 7549, 20488, 27158, 30433, 31903), 75 << 8 }, { CDF5( 4, 15187, 24493, 29279, 31267), 0 << 8 }, }, { { CDF5( 4919, 15070, 22336, 27218, 29792), 75 << 8 }, { CDF5( 296, 9560, 18580, 24640, 28571), 75 << 8 }, }, { { CDF5( 4449, 12778, 20939, 26844, 28917), 5 << 8 }, { CDF5( 4, 8819, 16299, 21272, 25505), 75 << 8 }, }, { { CDF5( 2662, 8971, 15434, 20936, 24242), 77 << 8 }, { CDF5( 5, 7370, 13045, 17765, 21686), 90 << 8 }, }, { { CDF5(23376, 32614, 32740, 32744, 32748), 5 << 8 }, { CDF5( 7252, 31056, 32698, 32736, 32740), 78 << 8 }, }, { { CDF5(18958, 31560, 32655, 32736, 32740), 90 << 8 }, { CDF5( 7, 28722, 32521, 32728, 32732), 76 << 8 }, }, { { CDF5(15525, 29433, 32224, 32682, 32732), 90 << 8 }, { CDF5( 5, 25379, 31608, 32598, 32732), 1 << 8 }, }, { { CDF5(10133, 24671, 30659, 32294, 32704), 75 << 8 }, { CDF5( 194, 19014, 28420, 31496, 32473), 75 << 8 }, }, { { CDF5( 6037, 17899, 25577, 29484, 31446), 75 << 8 }, { CDF5( 675, 12416, 22211, 28031, 30591), 75 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 
<< 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, { { { CDF5(20160, 28303, 31010, 31935, 32259), 65 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 6387, 21131, 28651, 31386, 32236), 51 << 8 }, { CDF5( 4, 14343, 25805, 30386, 31978), 5 << 8 }, }, { { CDF5(10794, 23651, 28414, 30514, 31623), 26 << 8 }, { CDF5( 4, 18792, 26345, 29842, 31433), 26 << 8 }, }, { { CDF5( 7250, 18857, 23745, 26494, 29185), 31 << 8 }, { CDF5( 59, 5934, 16720, 25540, 28810), 51 << 8 }, }, { { CDF5( 4853, 13443, 19345, 23384, 26041), 0 << 8 }, { CDF5( 4, 7130, 12348, 18932, 24173), 75 << 8 }, }, { { CDF5( 2667, 9653, 13863, 18817, 21760), 1 << 8 }, { CDF5( 4, 5477, 10652, 16140, 19594), 1 << 8 }, }, { { CDF5( 2595, 7918, 13026, 17820, 20007), 77 << 8 }, { CDF5( 508, 3110, 7370, 10565, 14704), 2 << 8 }, }, { { CDF5( 1961, 6366, 9476, 12294, 15249), 90 << 8 }, { CDF5( 753, 2600, 6724, 10877, 14165), 15 << 8 }, }, { { CDF5( 1420, 4356, 6620, 8937, 10997), 90 << 8 }, { CDF5( 
120, 1977, 4498, 6986, 9746), 75 << 8 }, }, { { CDF5(20741, 31938, 32628, 32728, 32732), 35 << 8 }, { CDF5( 9417, 28087, 32436, 32701, 32732), 3 << 8 }, }, { { CDF5(12348, 28499, 31976, 32606, 32720), 0 << 8 }, { CDF5( 4, 24379, 31121, 32398, 32684), 0 << 8 }, }, { { CDF5(13275, 26774, 30839, 32177, 32535), 16 << 8 }, { CDF5( 4, 21429, 28887, 31453, 32269), 26 << 8 }, }, { { CDF5( 7379, 21206, 27561, 30526, 31713), 1 << 8 }, { CDF5( 703, 12333, 22878, 28540, 30860), 1 << 8 }, }, { { CDF5( 5216, 15396, 22184, 26885, 29262), 75 << 8 }, { CDF5( 24, 9666, 18359, 24699, 28006), 0 << 8 }, }, { { CDF5( 4332, 12591, 18916, 23721, 26816), 3 << 8 }, { CDF5( 499, 7000, 14338, 19317, 23450), 0 << 8 }, }, { { CDF5( 3594, 9556, 14336, 19090, 22322), 75 << 8 }, { CDF5( 17, 3572, 9288, 13324, 15737), 0 << 8 }, }, { { CDF5(24009, 32537, 32740, 32744, 32748), 75 << 8 }, { CDF5(10094, 31466, 32706, 32736, 32740), 0 << 8 }, }, { { CDF5(19318, 31691, 32676, 32736, 32740), 80 << 8 }, { CDF5( 319, 28584, 32312, 32719, 32732), 75 << 8 }, }, { { CDF5(15897, 29451, 32143, 32648, 32732), 75 << 8 }, { CDF5( 219, 25102, 31471, 32488, 32732), 75 << 8 }, }, { { CDF5(10046, 24716, 30457, 32055, 32572), 0 << 8 }, { CDF5( 503, 18762, 28275, 31529, 32438), 75 << 8 }, }, { { CDF5( 5398, 16028, 23301, 27427, 29942), 76 << 8 }, { CDF5( 497, 11148, 20015, 25561, 28959), 76 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 
16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, { { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, }, }, }, .dc_sign = { { { { CDF1(20108), 1 << 8 }, { CDF1(14675), 0 << 8 }, { CDF1(20547), 93 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, { { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, }, }, .br_y_tok_hf = { { CDF3(31501, 32621, 32738), 55 << 8 }, { CDF3(29651, 32500, 32698), 6 << 8 }, { CDF3(27204, 32153, 32586), 32 << 8 }, { CDF3(22529, 30751, 32001), 31 << 8 }, { CDF3(17048, 27341, 30583), 15 << 8 }, { CDF3(12558, 24036, 28617), 6 << 8 }, { CDF3( 9501, 18960, 24998), 13 << 8 }, }, .br_y_tok_lf = { { CDF3(13965, 20979, 26518), 62 << 8 }, { CDF3(17417, 26343, 29434), 6 << 8 }, { CDF3(14569, 24464, 27796), 6 << 8 }, { CDF3(12532, 22823, 25671), 11 << 8 }, { CDF3( 8586, 17291, 21825), 6 << 8 }, { CDF3( 6326, 13118, 17907), 6 << 8 }, { CDF3( 4106, 9173, 12623), 15 << 8 }, { CDF3(24175, 29267, 31202), 35 << 8 }, { CDF3(22491, 30103, 31663), 1 << 8 }, { CDF3(19961, 29271, 31158), 6 << 8 }, { CDF3(17023, 27102, 29910), 6 << 8 }, { CDF3(13897, 23531, 27451), 1 << 8 }, { CDF3(11233, 19762, 24691), 7 << 8 }, { CDF3( 7378, 14545, 19934), 7 << 8 }, }, .bob_base_y_tok = { { { CDF2(20211, 31417), 5 << 8 }, { CDF2(22000, 30309), 5 
<< 8 }, { CDF2(24619, 32031), 32 << 8 }, }, { { CDF2(25412, 31853), 32 << 8 }, { CDF2(26802, 31283), 32 << 8 }, { CDF2(29204, 32091), 32 << 8 }, }, { { CDF2(30348, 32633), 32 << 8 }, { CDF2(31094, 32686), 40 << 8 }, { CDF2(31665, 32664), 7 << 8 }, }, }, .br_y_tok_idtx = { { { CDF3(32317, 32467, 32618), 62 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3(26624, 28672, 30720), 50 << 8 }, { CDF3(32099, 32322, 32545), 50 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(31284, 32482, 32636), 62 << 8 }, { CDF3(31915, 32524, 32727), 62 << 8 }, { CDF3(32202, 32681, 32724), 60 << 8 }, { CDF3(32241, 32717, 32748), 60 << 8 }, { CDF3(30064, 32611, 32737), 62 << 8 }, { CDF3(29352, 32394, 32715), 62 << 8 }, { CDF3(21876, 31279, 32303), 60 << 8 }, }, { { CDF3(32083, 32670, 32719), 60 << 8 }, { CDF3(32154, 32631, 32700), 60 << 8 }, { CDF3(31950, 32686, 32727), 60 << 8 }, { CDF3(32416, 32734, 32748), 60 << 8 }, { CDF3(31835, 32703, 32746), 62 << 8 }, { CDF3(30941, 32582, 32737), 60 << 8 }, { CDF3(28688, 32549, 32724), 60 << 8 }, }, }, .base_y_tok_idtx = { { { CDF3(31026, 32443, 32748), 25 << 8 }, { CDF3(22538, 31251, 32718), 5 << 8 }, { CDF3(22228, 26294, 32627), 0 << 8 }, { CDF3(20903, 23313, 25862), 38 << 8 }, { CDF3(11627, 16067, 32345), 25 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, { { CDF3(31817, 32581, 32735), 25 << 8 }, { CDF3(19499, 31019, 32650), 0 << 8 }, { CDF3(13910, 24210, 32424), 31 << 8 }, { CDF3(12714, 20186, 29593), 62 << 8 }, { CDF3( 4822, 12648, 29509), 32 << 8 }, { CDF3( 2380, 7451, 16774), 75 << 8 }, { CDF3( 3072, 10791, 15675), 50 << 8 }, }, { { CDF3(32485, 32734, 32748), 75 << 8 }, { CDF3(19059, 32009, 32725), 0 << 8 }, { CDF3( 9309, 28748, 32484), 32 << 8 }, { CDF3( 5722, 17292, 29536), 62 << 8 }, { CDF3( 2186, 8637, 26136), 57 << 8 }, { CDF3( 1951, 5879, 15659), 33 << 8 }, { CDF3( 2166, 5695, 11591), 36 << 8 }, }, }, 
.sign_idtx = { { { CDF1(18478), 31 << 8 }, { CDF1(30410), 26 << 8 }, { CDF1( 3274), 16 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(32138), 0 << 8 }, { CDF1( 697), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(24519), 32 << 8 }, { CDF1(32572), 26 << 8 }, { CDF1( 653), 13 << 8 }, { CDF1(32404), 50 << 8 }, { CDF1(16384), 0 << 8 }, { CDF1(32753), 83 << 8 }, { CDF1( 78), 86 << 8 }, { CDF1(32612), 50 << 8 }, { CDF1(16384), 0 << 8 }, }, { { CDF1(27765), 62 << 8 }, { CDF1(32723), 78 << 8 }, { CDF1( 368), 31 << 8 }, { CDF1(32138), 50 << 8 }, { CDF1( 2521), 50 << 8 }, { CDF1(32764), 35 << 8 }, { CDF1( 22), 53 << 8 }, { CDF1(32577), 50 << 8 }, { CDF1(16384), 0 << 8 }, }, }, .skip_v = { { CDF1( 606), 90 << 8 }, { CDF1( 6309), 0 << 8 }, { CDF1(13697), 1 << 8 }, { CDF1( 801), 10 << 8 }, { CDF1(10893), 25 << 8 }, { CDF1(19895), 50 << 8 }, { CDF1( 6221), 76 << 8 }, { CDF1(13720), 75 << 8 }, { CDF1(20139), 75 << 8 }, { CDF1( 7333), 60 << 8 }, { CDF1(20045), 25 << 8 }, { CDF1(24485), 50 << 8 }, }, .eob_base_uv_tok_hf = { { CDF2(10923, 21845), 0 << 8 }, { CDF2(31987, 32697), 93 << 8 }, { CDF2(32662, 32756), 90 << 8 }, { CDF2(32487, 32698), 60 << 8 }, }, .base_uv_tok_hf = { { CDF3(26130, 32392, 32726), 75 << 8 }, { CDF3(13730, 29626, 32253), 75 << 8 }, { CDF3( 9442, 23537, 29252), 6 << 8 }, { CDF3( 4255, 13986, 21188), 35 << 8 }, { CDF3(24285, 32033, 32659), 0 << 8 }, { CDF3(11954, 28100, 31763), 75 << 8 }, { CDF3( 7399, 19439, 26435), 5 << 8 }, { CDF3( 4096, 12081, 19339), 77 << 8 }, { CDF3(30583, 31894, 32331), 90 << 8 }, { CDF3(18905, 31508, 32138), 25 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, { CDF3( 8192, 16384, 24576), 0 << 8 }, }, .br_uv_tok_hf = { { CDF3(26977, 31598, 32364), 50 << 8 }, { CDF3(24721, 30575, 32124), 93 << 8 }, { CDF3(18803, 26659, 30610), 90 << 8 }, { CDF3(11757, 20385, 25284), 5 << 8 }, }, .eob_base_uv_tok_lf = { { CDF4(31345, 32511, 32676, 32725), 76 << 8 }, { CDF4(31940, 32413, 32531, 32650), 50 << 8 }, { 
CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, { CDF4( 6554, 13107, 19661, 26214), 0 << 8 }, }, .base_uv_tok_lf = { { CDF5( 7986, 24484, 30470, 32123, 32478), 12 << 8 }, { CDF5( 6385, 22268, 29781, 31884, 32479), 6 << 8 }, { CDF5( 4563, 14337, 22377, 27527, 30631), 75 << 8 }, { CDF5( 1768, 5903, 10730, 16633, 21440), 90 << 8 }, { CDF5( 7554, 22427, 29258, 31475, 32386), 5 << 8 }, { CDF5( 6209, 20694, 28786, 31444, 32330), 90 << 8 }, { CDF5( 4185, 12361, 19887, 25739, 29491), 93 << 8 }, { CDF5( 1583, 5456, 9702, 15370, 20474), 90 << 8 }, { CDF5(30183, 32316, 32574, 32639, 32703), 90 << 8 }, { CDF5(15147, 31729, 32521, 32669, 32719), 35 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, { CDF5( 5461, 10923, 16384, 21845, 27307), 0 << 8 }, } } }; /* Dimension helpers: update_cdf_Nd() invokes op(n, type, name) once for every innermost row of an N-dimensional CDF array. All size arguments but the last are outer array extents; the last one is forwarded to op as its first argument. The 2d/3d/4d variants iterate with the fixed loop variables j/k/l respectively, and later size arguments may refer to them (see the uses of 1 + j and k + 1 in the lists that follow). */ #define update_cdf_1d(n1d, type, name, op) op(n1d, type, name) #define update_cdf_2d(n1d, n2d, type, name, op) \ for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, type, name[j], op) #define update_cdf_3d(n1d, n2d, n3d, type, name, op) \ for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, type, name[k], op) #define update_cdf_4d(n1d, n2d, n3d, n4d, type, name, op) \ for (int l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, type, name[l], op) /* Applies op to every CDF of one motion-vector context; type is the CdfContext member to walk (update_cdfs passes dmv and mv). */ #define update_mv_cdfs(type, op) \ update_cdf_1d(1, type, shell_set, op); \ update_cdf_2d(7, (9 + j) >> 1, type, shell_lower, op); \ update_cdf_2d(7, imin(10 + j, 14) >> 1, type, shell_upper, op); \ update_cdf_1d(1, type, shell_tip, op); \ update_cdf_2d(2, 1, type, shell_offset_low, op); \ update_cdf_1d(1, type, shell_offset_cl2, op); \ update_cdf_2d(16, 1, type, shell_offset_hi, op); \ update_cdf_2d(2, 1, type, col_component, op); \ update_cdf_2d(4, 1, type, col_index, op) /* Applies op to every CDF of a CdfContext. The m, coef and dmv entries listed first are always processed; between_kf_nonkf_condition is then expanded as a statement (the users in this file pass either nothing or an early return for key/intra frames), followed by the remaining m entries and mv. */ #define update_cdfs(op, between_kf_nonkf_condition) \ update_cdf_2d(2, 1, m, rst_switchable, op); \ update_cdf_1d(1, m, rst_ns_wiener, op); \ update_cdf_1d(1, m, rst_pc_wiener, op); \ update_cdf_2d(2, 1, m, wiener_ns_len, op); \ update_cdf_1d(1, m, wiener_ns_sym, op); \ update_cdf_1d(3, m, wiener_ns_cf,
op); \ update_cdf_3d(2, 64, 1, m, part_split, op); \ update_cdf_2d(8, 1, m, part_square, op); \ update_cdf_3d(2, 64, 1, m, part_dir, op); \ update_cdf_3d(2, 64, 1, m, part_ext, op); \ update_cdf_3d(2, 64, 1, m, part_4way, op); \ update_cdf_2d(3, 1, m, intrabc, op); \ update_cdf_1d(1, m, gdf, op); \ update_cdf_2d(4, 1, m, cdef_idx0, op); \ update_cdf_2d(6, 1 + j, m, cdef_idx, op); \ update_cdf_3d(3, 4, 1, m, ccso, op); \ update_cdf_2d(6, 1, m, skip_txfm, op); \ update_cdf_2d(2, 1, m, dpcm, op); \ update_cdf_2d(2, 1, m, dpcm_dir, op); \ update_cdf_1d(3, m, intra_y_set, op); \ update_cdf_2d(3, 7, m, intra_y_idx0, op); \ update_cdf_2d(3, 5, m, intra_y_idx1, op); \ update_cdf_3d(4, 6, 1, m, fsc, op); \ update_cdf_2d(3, 3, m, mrl_index, op); \ update_cdf_2d(3, 1, m, multi_mrl, op); \ update_cdf_1d(1, m, pal_y, op); \ update_cdf_1d(6, m, pal_sz, op); \ update_cdf_2d(3, 1, m, dip, op); \ update_cdf_1d(5, m, dip_mode, op); \ update_cdf_2d(3, 1, m, cfl, op); \ update_cdf_2d(2, 7, m, intra_uv_mode, op); \ update_cdf_1d(1, m, mhccp, op); \ update_cdf_2d(4, 2, m, mhccp_filter_dir, op); \ update_cdf_1d(1, m, cfl_type, op); \ update_cdf_1d(7, m, cfl_sign, op); \ update_cdf_2d(6, 7, m, cfl_alpha, op); \ update_cdf_2d(4, 2, m, pal_idx_identity, op); \ update_cdf_3d(7, 5, k + 1, m, pal_idx, op); \ update_cdf_1d(1, m, intrabc_mode, op); \ update_cdf_1d(1, m, intrabc_precision, op); \ update_cdf_2d(3, 1, m, morph_pred, op); \ update_cdf_3d(4, 2, 1, m, txsz_lossless, op); \ update_cdf_4d(2, 2, 9, 1, m, tx_split, op); \ update_cdf_4d(2, 2, 14, 6, m, tx_part_2d, op); \ update_cdf_4d(2, 2, 2, 1, m, tx_part_1d, op); \ update_cdf_1d(1, m, txtp_lossless, op); \ update_cdf_2d(2, 1, m, txtp_long32_dct, op); \ update_cdf_2d(4, 3, m, txtp_intra_short_1d, op); \ update_cdf_3d(3, 4, 3, m, txtp_inter_short_1d, op); \ update_cdf_2d(4, 6, m, txtp_ext, op); \ update_cdf_2d(4, 1, m, txtp_ext_reduced, op); \ update_cdf_4d(2, 3, 4, 1, m, txtp_inter_tx_set, op); \ update_cdf_3d(2, 3, 7, m,
txtp_inter_set0, op); \ update_cdf_2d(3, 7, m, txtp_inter_set1, op); \ update_cdf_2d(3, 3, m, txtp_inter_set2, op); \ update_cdf_3d(3, 4, 1, m, txtp_inter_dct_idtx, op); \ update_cdf_3d(3, 4, 3, m, txtp_inter_dct_idtx_iddct, op); \ update_cdf_3d(2, 5, 3, m, stx, op); \ update_cdf_1d(3, m, stx_set_adst, op); \ update_cdf_1d(6, m, stx_set, op); \ update_cdf_1d(6, m, cctx, op); \ \ update_cdf_4d(2, N_TX_SIZES, 10, 1, coef, skip, op); \ update_cdf_2d(3, 4, coef, eob_bin_16, op); \ update_cdf_2d(3, 5, coef, eob_bin_32, op); \ update_cdf_2d(3, 6, coef, eob_bin_64, op); \ update_cdf_2d(3, 7, coef, eob_bin_128, op); \ update_cdf_2d(3, 7, coef, eob_bin_256, op); \ update_cdf_2d(3, 7, coef, eob_bin_512, op); \ update_cdf_2d(3, 7, coef, eob_bin_1024, op); \ update_cdf_1d(1, coef, eob_hi_bit, op); \ update_cdf_3d(5, 4, 2, coef, eob_base_y_tok_hf, op); \ update_cdf_4d(5, 20, 2, 3, coef, base_y_tok_hf, op); \ update_cdf_2d(7, 3, coef, br_y_tok_hf, op); \ update_cdf_3d(5, 4, 4, coef, eob_base_y_tok_lf, op); \ update_cdf_4d(5, 33, 2, 5, coef, base_y_tok_lf, op); \ update_cdf_2d(14, 3, coef, br_y_tok_lf, op); \ update_cdf_4d(2, 2, 3, 1, coef, dc_sign, op); \ update_cdf_3d(3, 3, 2, coef, bob_base_y_tok, op); \ update_cdf_3d(3, 7, 3, coef, br_y_tok_idtx, op); \ update_cdf_3d(3, 7, 3, coef, base_y_tok_idtx, op); \ update_cdf_3d(3, 9, 1, coef, sign_idtx, op); \ update_cdf_2d(12, 1, coef, skip_v, op); \ update_cdf_2d(4, 2, coef, eob_base_uv_tok_hf, op); \ update_cdf_2d(12, 3, coef, base_uv_tok_hf, op); \ update_cdf_2d(4, 3, coef, br_uv_tok_hf, op); \ update_cdf_2d(4, 4, coef, eob_base_uv_tok_lf, op); \ update_cdf_2d(12, 5, coef, base_uv_tok_lf, op); \ \ update_cdf_2d(3, 1, m, seg_id_ext, op); \ update_cdf_3d(2, 3, 7, m, seg_id, op); \ update_cdf_1d(7, m, delta_q, op); \ \ update_mv_cdfs(dmv, op); \ \ between_kf_nonkf_condition; \ \ update_cdf_2d(4, 1, m, region_type, op); \ update_cdf_2d(3, 1, m, skip_mode, op); \ update_cdf_2d(3, 1, m, skip_mode_drl_idx, op); \ update_cdf_2d(4, 1, m,
intra, op); \ update_cdf_2d(3, 1, m, tip, op); \ update_cdf_2d(5, 1, m, comp, op); \ update_cdf_3d(3, 6, 1, m, single_ref, op); \ update_cdf_3d(3, 6, 1, m, comp0_ref, op); \ update_cdf_4d(3, 2, 6, 1, m, comp1_ref, op); \ update_cdf_1d(1, m, tip_mode, op); \ update_cdf_2d(5, 1, m, warp, op); \ update_cdf_1d(1, m, warp_newmv, op); \ update_cdf_2d(5, 2, m, inter_mode, op); \ update_cdf_3d(9, 3, 1, m, amvd, op); \ update_cdf_2d(2, 1, m, bawp, op); \ update_cdf_2d(3, 1, m, bawp_explicit, op); \ update_cdf_1d(1, m, bawp_explicit_scale, op); \ update_cdf_2d(3, 1, m, warp_extend, op); \ update_cdf_2d(4, 1, m, warp_causal, op); \ update_cdf_2d(4, 1, m, interintra, op); \ update_cdf_2d(4, 3, m, interintra_mode, op); \ update_cdf_1d(1, m, interintra_wedge, op); \ update_cdf_1d(3, m, wedge_quad, op); \ update_cdf_2d(4, 4, m, wedge_angle, op); \ update_cdf_1d(3, m, wedge_dist, op); \ update_cdf_1d(2, m, wedge_dist2, op); \ update_cdf_2d(3, 1, m, tip_drl_idx, op); \ update_cdf_1d(2, m, jmvd_amvd_scale_mode, op); \ update_cdf_1d(4, m, jmvd_scale_mode, op); \ update_cdf_3d(3, 5, 1, m, drl_idx, op); \ update_cdf_2d(3, 1, m, mvprec_def, op); \ update_cdf_3d(2, 3, 2, m, mvprec_rem, op); \ update_cdf_2d(3, 1, m, warp_ref_idx, op); \ update_cdf_1d(3, m, amvd_joint, op); \ update_cdf_2d(2, 7, m, amvd_index, op); \ update_cdf_1d(1, m, warpmv_with_mvd, op); \ update_cdf_2d(N_BS_SIZES, 1, m, warp_delta_prec, op); \ update_cdf_3d(2, 2, 7, m, warp_delta_param, op); \ update_cdf_1d(1, m, warp_delta_sign, op); \ update_cdf_2d(4, 1, m, warp_interintra, op); \ update_cdf_2d(5, 3, m, comp_mode_sameref, op); \ update_cdf_2d(2, 1, m, comp_mode_joint, op); \ update_cdf_2d(5, 4, m, comp_mode, op); \ update_cdf_2d(2, 1, m, opfl, op); \ update_cdf_2d(11, 1, m, refine_mv, op); \ update_cdf_2d(12, 1, m, comp_type_masked, op); \ update_cdf_1d(1, m, comp_type_weighted, op); \ update_cdf_2d(4, 1, m, cwp_idx, op); \ update_cdf_2d(8, 2, m, filter, op); \ \ update_cdf_2d(3, 1, m, seg_pred, op); \ \
update_mv_cdfs(mv, op) void dav2d_cdf_reset_count(const Dav2dFrameHeader *const hdr, CdfContext *const dst) { #define reset_count(n1d, type, name) \ do { \ uint8_t *const count = (uint8_t*)&dst->type.name[n1d]; \ *count = (*count * 3) >> 2; \ } while (0) update_cdfs(reset_count, if (IS_KEY_OR_INTRA(hdr)) return); #undef reset_count } void dav2d_cdf_shift(CdfContext *const dst, const CdfContext *const src, const int n_tiles_log2) { #define shift_store(n1d, type, name) \ do { \ for (int n = 0; n < n1d; n++) { \ dst->type.name[n] = src->type.name[n] >> n_tiles_log2; \ } \ uint8_t *const dst_count = (uint8_t*)&dst->type.name[n1d]; \ const uint8_t *const src_count = (const uint8_t*)&src->type.name[n1d]; \ dst_count[0] = src_count[0] >> n_tiles_log2; \ dst_count[1] = src_count[1]; /* don't modify para */ \ } while (0) update_cdfs(shift_store,); #undef shift_store } void dav2d_cdf_shift_accumulate(CdfContext *const dst, const CdfContext *const src, const int n_tiles_log2) { #define shift_accumulate(n1d, type, name) \ do { \ for (int n = 0; n < n1d; n++) { \ dst->type.name[n] += src->type.name[n] >> n_tiles_log2; \ } \ uint8_t *const dst_count = (uint8_t*)&dst->type.name[n1d]; \ const uint8_t *const src_count = (const uint8_t*)&src->type.name[n1d]; \ *dst_count += *src_count >> n_tiles_log2; \ /* ignore para beyond first tile */ \ } while (0) update_cdfs(shift_accumulate,); #undef shift_accumulate } void dav2d_cdf_pri_sec_average(CdfContext *const dst, const CdfThreadContext *const src1, const CdfThreadContext *const src2) { #define pri_sec_average(n1d, type, name) \ do { \ for (int n = 0; n <= n1d; n++) { \ /* for n == n1d this averages count without modifying para */ \ dst->type.name[n] = (src1##type->name[n] * 7 + \ src2##type->name[n] * 1 + 4) >> 3; \ } \ } while (0) const CdfModeContext *src1m, *src2m; const CdfMvContext *src1mv, *src1dmv, *src2mv, *src2dmv; const CdfCoefContext *src1coef, *src2coef; #define assign_src(num) \ if (src##num->ref) { \ src##num##m = 
&src##num->data.cdf->m; \ src##num##mv = &src##num->data.cdf->mv; \ src##num##dmv = &src##num->data.cdf->dmv; \ src##num##coef = &src##num->data.cdf->coef; \ } else { \ src##num##m = &default_cdf.m; \ src##num##mv = src##num##dmv = &default_cdf.mv; \ src##num##coef = &default_coef_cdf[src##num->data.qcat]; \ } assign_src(1); assign_src(2); #undef assign_src update_cdfs(pri_sec_average,); #undef pri_sec_average } /* * CDF threading wrappers. */ void dav2d_cdf_thread_init_static(CdfThreadContext *const cdf, const unsigned qidx) { cdf->ref = NULL; cdf->data.qcat = (qidx > 90) + (qidx > 140) + (qidx > 190); } void dav2d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const src) { if (src->ref) { memcpy(dst, src->data.cdf, sizeof(*dst)); } else { dst->coef = default_coef_cdf[src->data.qcat]; dst->m = default_cdf.m; dst->mv = default_cdf.mv; dst->dmv = default_cdf.mv; } } int dav2d_cdf_thread_alloc(Dav2dContext *const c, CdfThreadContext *const cdf, const int have_frame_mt) { cdf->ref = dav2d_ref_create_using_pool(c->cdf_pool, sizeof(CdfContext) + sizeof(atomic_uint)); if (!cdf->ref) return DAV2D_ERR(ENOMEM); cdf->data.cdf = cdf->ref->data; if (have_frame_mt) { cdf->progress = (atomic_uint *) &cdf->data.cdf[1]; atomic_init(cdf->progress, 0); } return 0; } void dav2d_cdf_thread_ref(CdfThreadContext *const dst, CdfThreadContext *const src) { *dst = *src; if (src->ref) dav2d_ref_inc(src->ref); } void dav2d_cdf_thread_unref(CdfThreadContext *const cdf) { memset(&cdf->data, 0, sizeof(*cdf) - offsetof(CdfThreadContext, data)); dav2d_ref_dec(&cdf->ref); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cdf.h000066400000000000000000000223671517466257200215520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_CDF_H #define DAV2D_SRC_CDF_H #include #include "src/levels.h" #include "src/ref.h" #include "src/thread_data.h" /* Buffers padded to [4]/[8] for SIMD where needed. */ /* TODO: Reorganize structs to minimize alignment padding. 
*/ /* CDFs addressed as member m of CdfContext by the update macros in cdf.c. Each innermost row stores the CDF entries followed by one uint16_t slot that cdf.c accesses as two bytes: an adaptation count and a byte it calls para. */ typedef struct CdfModeContext { ALIGN(uint16_t rst_switchable[2][2], 4); ALIGN(uint16_t rst_pc_wiener[2], 4); ALIGN(uint16_t rst_ns_wiener[2], 4); ALIGN(uint16_t wiener_ns_len[2][2], 4); ALIGN(uint16_t wiener_ns_sym[2], 4); ALIGN(uint16_t wiener_ns_cf[4], 8); ALIGN(uint16_t part_split[2][64][2], 4); ALIGN(uint16_t part_square[8][2], 4); ALIGN(uint16_t part_dir[2][64][2], 4); ALIGN(uint16_t part_ext[2][64][2], 4); ALIGN(uint16_t part_4way[2][64][2], 4); ALIGN(uint16_t region_type[4][2], 4); ALIGN(uint16_t intrabc[3][2], 4); ALIGN(uint16_t gdf[2], 4); ALIGN(uint16_t cdef_idx0[4][2], 4); ALIGN(uint16_t cdef_idx[6][7+1], 16); ALIGN(uint16_t ccso[3][4][2], 4); ALIGN(uint16_t skip_txfm[6][2], 4); ALIGN(uint16_t dpcm[2][2], 4); ALIGN(uint16_t dpcm_dir[2][2], 4); ALIGN(uint16_t intra_y_set[4], 8); ALIGN(uint16_t intra_y_idx0[3][8], 16); ALIGN(uint16_t intra_y_idx1[3][6+2], 16); ALIGN(uint16_t fsc[4][6][2], 4); ALIGN(uint16_t mrl_index[3][4], 8); ALIGN(uint16_t multi_mrl[3][2], 4); ALIGN(uint16_t pal_y[2], 4); ALIGN(uint16_t pal_sz[7+1], 16); ALIGN(uint16_t dip[3][2], 4); ALIGN(uint16_t dip_mode[6+2], 16); ALIGN(uint16_t cfl[3][2], 4); ALIGN(uint16_t intra_uv_mode[2][8], 16); ALIGN(uint16_t mhccp[2], 4); ALIGN(uint16_t mhccp_filter_dir[4][3+1], 8); ALIGN(uint16_t cfl_type[2], 4); ALIGN(uint16_t cfl_sign[8], 16); ALIGN(uint16_t cfl_alpha[6][8], 16); ALIGN(uint16_t pal_idx_identity[4][3+1], 8); ALIGN(uint16_t pal_idx[7][5][8], 16); ALIGN(uint16_t intrabc_mode[2], 4); ALIGN(uint16_t intrabc_precision[2], 4); ALIGN(uint16_t morph_pred[3][2], 4); ALIGN(uint16_t txsz_lossless[4][2][2], 4); ALIGN(uint16_t tx_split[2][2][9][2], 4); ALIGN(uint16_t tx_part_2d[2][2][14][7+1], 16); ALIGN(uint16_t tx_part_1d[2][2][2][2], 4); ALIGN(uint16_t txtp_lossless[2], 4); ALIGN(uint16_t txtp_long32_dct[2][2], 4); ALIGN(uint16_t txtp_intra_short_1d[4][4], 8); ALIGN(uint16_t txtp_inter_short_1d[3][4][4], 8); ALIGN(uint16_t txtp_ext[4][7+1], 16); ALIGN(uint16_t txtp_ext_reduced[4][2], 4);
ALIGN(uint16_t txtp_inter_tx_set[2][3][4][2], 4); ALIGN(uint16_t txtp_inter_set0[2][3][8], 16); ALIGN(uint16_t txtp_inter_set1[3][8], 16); ALIGN(uint16_t txtp_inter_set2[3][4], 8); ALIGN(uint16_t txtp_inter_dct_idtx[3][4][2], 4); /* NOTE(review): the other rows of width 4 in this struct request alignment 8; confirm that 4 is intended for the next row */ ALIGN(uint16_t txtp_inter_dct_idtx_iddct[3][4][4], 4); ALIGN(uint16_t stx[2][5][4], 8); ALIGN(uint16_t stx_set_adst[4], 8); ALIGN(uint16_t stx_set[7+1], 16); ALIGN(uint16_t cctx[7+1], 16); /* NOTE(review): the next three rows request alignments 2/8/8 where same-shaped rows above use 4/16/16; confirm this is intended */ ALIGN(uint16_t seg_id_ext[3][2], 2); ALIGN(uint16_t seg_id[2][3][8], 8); ALIGN(uint16_t delta_q[8], 8); /* inter/switch */ ALIGN(uint16_t skip_mode[3][2], 4); ALIGN(uint16_t skip_mode_drl_idx[3][2], 4); ALIGN(uint16_t intra[4][2], 4); ALIGN(uint16_t tip[3][2], 4); ALIGN(uint16_t comp[5][2], 4); ALIGN(uint16_t single_ref[3][6][2], 4); ALIGN(uint16_t comp0_ref[3][6][2], 4); ALIGN(uint16_t comp1_ref[3][2][6][2], 4); ALIGN(uint16_t tip_mode[2], 4); ALIGN(uint16_t warp[5][2], 4); ALIGN(uint16_t warp_newmv[2], 4); ALIGN(uint16_t inter_mode[5][3+1], 8); ALIGN(uint16_t amvd[9][3][2], 4); ALIGN(uint16_t bawp[2][2], 4); ALIGN(uint16_t bawp_explicit[3][2], 4); ALIGN(uint16_t bawp_explicit_scale[2], 4); ALIGN(uint16_t warp_extend[3][2], 4); ALIGN(uint16_t warp_causal[4][2], 4); ALIGN(uint16_t interintra[4][2], 4); ALIGN(uint16_t interintra_mode[4][4], 8); ALIGN(uint16_t interintra_wedge[2], 4); ALIGN(uint16_t wedge_quad[4], 8); ALIGN(uint16_t wedge_angle[4][5+3], 16); ALIGN(uint16_t wedge_dist2[3+1], 8); ALIGN(uint16_t wedge_dist[4], 8); ALIGN(uint16_t tip_drl_idx[3][2], 4); ALIGN(uint16_t jmvd_amvd_scale_mode[3+1], 8); ALIGN(uint16_t jmvd_scale_mode[5+3], 16); ALIGN(uint16_t drl_idx[3][5][2], 4); ALIGN(uint16_t mvprec_def[3][2], 4); ALIGN(uint16_t mvprec_rem[2][3][3+1], 8); ALIGN(uint16_t warp_ref_idx[3][2], 4); ALIGN(uint16_t amvd_joint[4], 8); ALIGN(uint16_t amvd_index[2][8], 16); ALIGN(uint16_t warpmv_with_mvd[2], 4); ALIGN(uint16_t warp_delta_prec[N_BS_SIZES][2], 4); ALIGN(uint16_t warp_delta_param[2][2][8], 16); ALIGN(uint16_t warp_delta_sign[2], 4);
ALIGN(uint16_t warp_interintra[4][2], 4); ALIGN(uint16_t comp_mode_sameref[5][4], 8); ALIGN(uint16_t comp_mode_joint[2][2], 4); ALIGN(uint16_t comp_mode[5][5+3], 16); ALIGN(uint16_t opfl[2][2], 4); ALIGN(uint16_t refine_mv[11][2], 4); ALIGN(uint16_t comp_type_masked[12][2], 4); ALIGN(uint16_t comp_type_weighted[2], 4); ALIGN(uint16_t cwp_idx[4][2], 4); ALIGN(uint16_t filter[8][4], 8); ALIGN(uint16_t seg_pred[3][2], 4); } CdfModeContext; /* CDFs addressed as member coef of CdfContext. cdf.c falls back to default_coef_cdf[qcat] for this group when no reference CDF exists, with qcat in 0..3 derived from qidx. */ typedef struct CdfCoefContext { ALIGN(uint16_t skip[2][5][10][2], 4); ALIGN(uint16_t eob_bin_16[3][5+3], 16); ALIGN(uint16_t eob_bin_32[3][6+2], 16); ALIGN(uint16_t eob_bin_64[3][7+1], 16); ALIGN(uint16_t eob_bin_128[3][8], 16); ALIGN(uint16_t eob_bin_256[3][8], 16); ALIGN(uint16_t eob_bin_512[3][8], 16); ALIGN(uint16_t eob_bin_1024[3][8], 16); ALIGN(uint16_t eob_hi_bit[2], 4); ALIGN(uint16_t eob_base_y_tok_hf[5][4][3+1], 8); ALIGN(uint16_t base_y_tok_hf[5][20][2][4], 8); ALIGN(uint16_t br_y_tok_hf[7][4], 8); ALIGN(uint16_t eob_base_y_tok_lf[5][4][5+3], 16); ALIGN(uint16_t base_y_tok_lf[5][33][2][6+2], 16); ALIGN(uint16_t br_y_tok_lf[14][4], 8); ALIGN(uint16_t dc_sign[2][2][3][2], 4); ALIGN(uint16_t bob_base_y_tok[3][3][3+1], 8); ALIGN(uint16_t br_y_tok_idtx[3][7][4], 8); ALIGN(uint16_t base_y_tok_idtx[3][7][4], 8); ALIGN(uint16_t sign_idtx[3][9][2], 4); ALIGN(uint16_t skip_v[12][2], 4); ALIGN(uint16_t eob_base_uv_tok_hf[4][3+1], 8); ALIGN(uint16_t base_uv_tok_hf[12][4], 8); ALIGN(uint16_t br_uv_tok_hf[4][4], 8); ALIGN(uint16_t eob_base_uv_tok_lf[4][5+3], 16); ALIGN(uint16_t base_uv_tok_lf[12][6+2], 16); } CdfCoefContext; /* CDFs walked by update_mv_cdfs() in cdf.c; CdfContext holds two instances, mv and dmv, and both start from default_cdf.mv when no reference CDF exists. */ typedef struct CdfMvContext { ALIGN(uint16_t shell_lower[7][8], 16); ALIGN(uint16_t shell_upper[7][8], 16); ALIGN(uint16_t shell_set[2], 4); ALIGN(uint16_t shell_tip[2], 4); ALIGN(uint16_t shell_offset_low[2][2], 4); ALIGN(uint16_t shell_offset_cl2[2], 4); ALIGN(uint16_t shell_offset_hi[16][2], 4); ALIGN(uint16_t col_component[2][2], 4); ALIGN(uint16_t col_index[4][2], 4); } CdfMvContext; /* Complete set of adaptive CDFs: coefficient, mode and the two motion-vector groups. */ typedef struct CdfContext
{ CdfCoefContext coef; CdfModeContext m; CdfMvContext mv, dmv; } CdfContext; /* Handle to a CDF set. With ref != NULL, data.cdf points at a pooled CdfContext; with ref == NULL, data.qcat selects one of the static default tables. progress is only written by dav2d_cdf_thread_alloc() when have_frame_mt is set, in which case it points at an atomic_uint placed directly after the CdfContext in the same allocation. */ typedef struct CdfThreadContext { Dav2dRef *ref; ///< allocation origin union { CdfContext *cdf; // if ref != NULL unsigned qcat; // if ref == NULL, from static CDF tables } data; atomic_uint *progress; } CdfThreadContext; void dav2d_cdf_reset_count(const Dav2dFrameHeader *hdr, CdfContext *dst); void dav2d_cdf_shift(CdfContext *dst, const CdfContext *src, int n_tiles_log2); void dav2d_cdf_shift_accumulate(CdfContext *dst, const CdfContext *src, int n_tiles_log2); void dav2d_cdf_pri_sec_average(CdfContext *dst, const CdfThreadContext *src1, const CdfThreadContext *src2); void dav2d_cdf_thread_init_static(CdfThreadContext *cdf, unsigned qidx); int dav2d_cdf_thread_alloc(Dav2dContext *c, CdfThreadContext *cdf, const int have_frame_mt); void dav2d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src); void dav2d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src); void dav2d_cdf_thread_unref(CdfThreadContext *cdf); #endif /* DAV2D_SRC_CDF_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cpu.c000066400000000000000000000076751517466257200216050ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include "src/cpu.h" #include "src/log.h" #ifdef _WIN32 #include #endif #ifdef __APPLE__ #include #include #endif #if HAVE_UNISTD_H #include #endif #if HAVE_PTHREAD_GETAFFINITY_NP #include #if HAVE_PTHREAD_NP_H #include #endif #if defined(__FreeBSD__) #define cpu_set_t cpuset_t #endif #endif #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #include #endif unsigned dav2d_cpu_flags = 0U; unsigned dav2d_cpu_flags_mask = ~0U; COLD void dav2d_init_cpu(void) { #if HAVE_ASM && !__has_feature(memory_sanitizer) // memory sanitizer is inherently incompatible with asm #if ARCH_AARCH64 || ARCH_ARM dav2d_cpu_flags = dav2d_get_cpu_flags_arm(); #elif ARCH_LOONGARCH dav2d_cpu_flags = dav2d_get_cpu_flags_loongarch(); #elif ARCH_PPC64LE dav2d_cpu_flags = dav2d_get_cpu_flags_ppc(); #elif ARCH_RISCV dav2d_cpu_flags = dav2d_get_cpu_flags_riscv(); #elif ARCH_X86 dav2d_cpu_flags = dav2d_get_cpu_flags_x86(); #endif #endif } COLD void dav2d_set_cpu_flags_mask(const unsigned mask) { dav2d_cpu_flags_mask = mask; } COLD int dav2d_num_logical_processors(Dav2dContext *const c) { #ifdef _WIN32 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) GROUP_AFFINITY affinity; if 
(GetThreadGroupAffinity(GetCurrentThread(), &affinity)) { int num_processors = 1; while (affinity.Mask &= affinity.Mask - 1) num_processors++; return num_processors; } #else SYSTEM_INFO system_info; GetNativeSystemInfo(&system_info); return system_info.dwNumberOfProcessors; #endif #elif HAVE_PTHREAD_GETAFFINITY_NP && defined(CPU_COUNT) cpu_set_t affinity; if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) return CPU_COUNT(&affinity); #elif defined(__APPLE__) int num_processors; size_t length = sizeof(num_processors); if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0)) return num_processors; #elif defined(_SC_NPROCESSORS_ONLN) return (int)sysconf(_SC_NPROCESSORS_ONLN); #endif if (c) dav2d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n"); return 1; } COLD unsigned long dav2d_getauxval(unsigned long type) { #if HAVE_GETAUXVAL return getauxval(type); #elif HAVE_ELF_AUX_INFO unsigned long aux = 0; int ret = elf_aux_info(type, &aux, sizeof(aux)); if (ret != 0) errno = ret; return aux; #else errno = ENOSYS; return 0; #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/cpu.h000066400000000000000000000106511517466257200215760ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_CPU_H
#define DAV2D_SRC_CPU_H

#include "config.h"

#include "common/attributes.h"

#include "dav2d/common.h"
#include "dav2d/dav2d.h"

/* Pull in the flag definitions and detection routine of the one
 * architecture this build targets. */
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/cpu.h"
#elif ARCH_LOONGARCH
#include "src/loongarch/cpu.h"
#elif ARCH_PPC64LE
#include "src/ppc/cpu.h"
#elif ARCH_RISCV
#include "src/riscv/cpu.h"
#elif ARCH_X86
#include "src/x86/cpu.h"
#endif

/* Global flag word and mask; dav2d_get_cpu_flags() returns their bitwise
 * AND (plus compile-time defaults when TRIM_DSP_FUNCTIONS is set). */
EXTERN unsigned dav2d_cpu_flags;
EXTERN unsigned dav2d_cpu_flags_mask;

void dav2d_init_cpu(void);
DAV2D_API void dav2d_set_cpu_flags_mask(unsigned mask);
int dav2d_num_logical_processors(Dav2dContext *c);
unsigned long dav2d_getauxval(unsigned long);

/**
 * Build the set of CPU flags implied by the compiler's predefined feature
 * macros for the target architecture.
 *
 * Everything is decided by the preprocessor, so the result is a constant for
 * a given build. On x86 each tier also sets the flags of the tiers below it
 * (AVX512ICL -> AVX2 -> SSE4.1 -> SSSE3 -> SSE2). LoongArch has no branch
 * here and therefore yields 0.
 *
 * @return Bitwise OR of the DAV2D_*_CPU_FLAG_* values selected at compile
 *         time, or 0 if none apply.
 */
static ALWAYS_INLINE unsigned dav2d_get_default_cpu_flags(void) {
    unsigned flags = 0;

#if ARCH_AARCH64 || ARCH_ARM
#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
    flags |= DAV2D_ARM_CPU_FLAG_NEON;
#endif
#ifdef __ARM_FEATURE_DOTPROD
    flags |= DAV2D_ARM_CPU_FLAG_DOTPROD;
#endif
#ifdef __ARM_FEATURE_MATMUL_INT8
    flags |= DAV2D_ARM_CPU_FLAG_I8MM;
#endif
#if ARCH_AARCH64
#ifdef __ARM_FEATURE_SVE
    flags |= DAV2D_ARM_CPU_FLAG_SVE;
#endif
#ifdef __ARM_FEATURE_SVE2
    flags |= DAV2D_ARM_CPU_FLAG_SVE2;
#endif
#endif /* ARCH_AARCH64 */
#elif ARCH_PPC64LE
#if defined(__VSX__)
    flags |= DAV2D_PPC_CPU_FLAG_VSX;
#endif
#if defined(__POWER9_VECTOR__)
    flags |= DAV2D_PPC_CPU_FLAG_PWR9;
#endif
#elif ARCH_RISCV
#if defined(__riscv_v)
    flags |= DAV2D_RISCV_CPU_FLAG_V;
#endif
#elif ARCH_X86
/* The AVX512ICL tier requires every one of the feature macros below. */
#if defined(__AVX512F__) && defined(__AVX512CD__) && \
    defined(__AVX512BW__) && defined(__AVX512DQ__) && \
    defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
    defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
    defined(__AVX512BITALG__) && defined(__GFNI__) && \
    defined(__VAES__) && defined(__VPCLMULQDQ__)
    flags |= DAV2D_X86_CPU_FLAG_AVX512ICL |
             DAV2D_X86_CPU_FLAG_AVX2 |
             DAV2D_X86_CPU_FLAG_SSE41 |
             DAV2D_X86_CPU_FLAG_SSSE3 |
             DAV2D_X86_CPU_FLAG_SSE2;
#elif defined(__AVX2__)
    flags |= DAV2D_X86_CPU_FLAG_AVX2 |
             DAV2D_X86_CPU_FLAG_SSE41 |
             DAV2D_X86_CPU_FLAG_SSSE3 |
             DAV2D_X86_CPU_FLAG_SSE2;
#elif defined(__SSE4_1__) || defined(__AVX__)
    flags |= DAV2D_X86_CPU_FLAG_SSE41 |
             DAV2D_X86_CPU_FLAG_SSSE3 |
             DAV2D_X86_CPU_FLAG_SSE2;
#elif defined(__SSSE3__)
    flags |= DAV2D_X86_CPU_FLAG_SSSE3 |
             DAV2D_X86_CPU_FLAG_SSE2;
#elif ARCH_X86_64 || defined(__SSE2__) || \
      (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
    flags |= DAV2D_X86_CPU_FLAG_SSE2;
#endif
#endif

    return flags;
}

/**
 * Return the CPU flags currently in effect.
 *
 * @return dav2d_cpu_flags restricted by dav2d_cpu_flags_mask. When
 *         TRIM_DSP_FUNCTIONS is enabled, the compile-time defaults from
 *         dav2d_get_default_cpu_flags() are OR'ed in afterwards, so the
 *         mask cannot clear them.
 */
static ALWAYS_INLINE unsigned dav2d_get_cpu_flags(void) {
    unsigned flags = dav2d_cpu_flags & dav2d_cpu_flags_mask;

#if TRIM_DSP_FUNCTIONS
/* Since this function is inlined, unconditionally setting a flag here will
 * enable dead code elimination in the calling function. */
    flags |= dav2d_get_default_cpu_flags();
#endif

    return flags;
}

#endif /* DAV2D_SRC_CPU_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ctx.c000066400000000000000000000045111517466257200215760ustar00rootroot00000000000000/* * Copyright © 2024-2026, VideoLAN and dav2d authors * Copyright © 2024-2026, Two Orioles, LLC * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include "ctx.h" static void memset_w1(void *const ptr, const int value) { set_ctx1((uint8_t *) ptr, 0, value); } static void memset_w2(void *const ptr, const int value) { set_ctx2((uint8_t *) ptr, 0, value); } static void memset_w4(void *const ptr, const int value) { set_ctx4((uint8_t *) ptr, 0, value); } static void memset_w8(void *const ptr, const int value) { set_ctx8((uint8_t *) ptr, 0, value); } static void memset_w16(void *const ptr, const int value) { set_ctx16((uint8_t *) ptr, 0, value); } static void memset_w32(void *const ptr, const int value) { set_ctx32((uint8_t *) ptr, 0, value); } static void memset_w64(void *const ptr, const int value) { set_ctx64((uint8_t *) ptr, 0, value); } const dav2d_memset_pow2_fn dav2d_memset_pow2[7] = { memset_w1, memset_w2, memset_w4, memset_w8, memset_w16, memset_w32, memset_w64, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ctx.h000066400000000000000000000071711517466257200216100ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_CTX_H #define DAV2D_SRC_CTX_H #include #include "common/attributes.h" #include "common/intops.h" PACKED(union alias64 { uint64_t u64; uint8_t u8[8]; }) ATTR_ALIAS; PACKED(union alias32 { uint32_t u32; uint8_t u8[4]; }) ATTR_ALIAS; PACKED(union alias16 { uint16_t u16; uint8_t u8[2]; }) ATTR_ALIAS; union alias8 { uint8_t u8; } ATTR_ALIAS; typedef void (*dav2d_memset_pow2_fn)(void *ptr, int value); EXTERN const dav2d_memset_pow2_fn dav2d_memset_pow2[7]; static inline void dav2d_memset_likely_pow2(void *const ptr, const int value, const int n) { assert(n >= 1 && n <= 64); if ((n&(n-1)) == 0) { dav2d_memset_pow2[ulog2(n)](ptr, value); } else { memset(ptr, value, n); } } // For smaller sizes use multiplication to broadcast bytes. memset misbehaves on the smaller sizes. // For the larger sizes, we want to use memset to get access to vector operations. 
/*
 * set_ctxN(var, off, val): store N bytes starting at &var[off].
 *
 * The 1/2/4/8-byte forms do a single integer store of val multiplied by a
 * 0x01-per-byte constant, so for val in 0..255 every stored byte equals val.
 * They expand to a bare assignment expression (no trailing semicolon, no
 * do/while wrapper).
 */
#define set_ctx1(var, off, val) \
    ((union alias8 *) &(var)[off])->u8 = (val) * 0x01
#define set_ctx2(var, off, val) \
    ((union alias16 *) &(var)[off])->u16 = (val) * 0x0101
#define set_ctx4(var, off, val) \
    ((union alias32 *) &(var)[off])->u32 = (val) * 0x01010101U
#define set_ctx8(var, off, val) \
    ((union alias64 *) &(var)[off])->u64 = (val) * 0x0101010101010101ULL
/* The 16/32/64-byte forms are statement macros around memset(). */
#define set_ctx16(var, off, val) do { \
        memset(&(var)[off], val, 16); \
    } while (0)
#define set_ctx32(var, off, val) do { \
        memset(&(var)[off], val, 32); \
    } while (0)
#define set_ctx64(var, off, val) do { \
        memset(&(var)[off], val, 64); \
    } while (0)

/*
 * case_set(var): switch on var (0..6) and invoke set_ctx(set_ctxN) with the
 * setter for N = 1 << var bytes. set_ctx is not defined in this header; the
 * code using case_set has to provide it. Any other value of var hits
 * assert(0).
 */
#define case_set(var) \
    switch (var) { \
    case 0: set_ctx(set_ctx1); break; \
    case 1: set_ctx(set_ctx2); break; \
    case 2: set_ctx(set_ctx4); break; \
    case 3: set_ctx(set_ctx8); break; \
    case 4: set_ctx(set_ctx16); break; \
    case 5: set_ctx(set_ctx32); break; \
    case 6: set_ctx(set_ctx64); break; \
    default: assert(0); \
    }
/* Same as case_set, but only var 0..4 (1 to 16 bytes) is accepted. */
#define case_set_upto16(var) \
    switch (var) { \
    case 0: set_ctx(set_ctx1); break; \
    case 1: set_ctx(set_ctx2); break; \
    case 2: set_ctx(set_ctx4); break; \
    case 3: set_ctx(set_ctx8); break; \
    case 4: set_ctx(set_ctx16); break; \
    default: assert(0); \
    }

#endif /* DAV2D_SRC_CTX_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/data.c000066400000000000000000000120011517466257200217020ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "config.h"

/* NOTE(review): the bare #include directives below have no header name in
 * the reviewed copy; they are kept as found. */
#include
#include
#include
#include

#include "dav2d/data.h"

#include "common/attributes.h"
#include "common/validate.h"

#include "src/data.h"
#include "src/ref.h"

/**
 * Allocate a reference-counted buffer of sz bytes and attach it to buf.
 *
 * On success buf->ref, buf->data and buf->sz describe the new buffer, and
 * buf->m is reset to defaults with m.size = sz.
 *
 * @param buf Data descriptor to fill in; validated to be non-NULL.
 * @param sz  Requested size in bytes; sizes above SIZE_MAX / 2 are rejected.
 *
 * @return Writable pointer to the buffer, or NULL on invalid input,
 *         oversized request or allocation failure.
 */
uint8_t *dav2d_data_create_internal(Dav2dData *const buf, const size_t sz) {
    validate_input_or_ret(buf != NULL, NULL);

    if (sz > SIZE_MAX / 2) return NULL;
    buf->ref = dav2d_ref_create(ALLOC_DAV2DDATA, sz);
    if (!buf->ref) return NULL;
    buf->data = buf->ref->const_data;
    buf->sz = sz;
    dav2d_data_props_set_defaults(&buf->m);
    buf->m.size = sz;

    return buf->ref->data;
}

/**
 * Wrap caller-owned memory in buf without copying it.
 *
 * A Dav2dRef is allocated and initialised with ptr, free_callback and
 * cookie; buf->m is reset to defaults with m.size = sz.
 *
 * @param buf           Data descriptor to fill in; must not be NULL.
 * @param ptr           Start of the wrapped memory; must not be NULL.
 * @param sz            Size of the wrapped memory; must be <= SIZE_MAX / 2.
 * @param free_callback Stored in the reference together with cookie; must
 *                      not be NULL.
 * @param cookie        Opaque pointer stored alongside free_callback.
 *
 * @return 0 on success, DAV2D_ERR(EINVAL) for a rejected argument,
 *         DAV2D_ERR(ENOMEM) if the reference cannot be allocated.
 */
int dav2d_data_wrap_internal(Dav2dData *const buf, const uint8_t *const ptr,
                             const size_t sz,
                             void (*const free_callback)(const uint8_t *data,
                                                         void *cookie),
                             void *const cookie)
{
    validate_input_or_ret(buf != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(ptr != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(free_callback != NULL, DAV2D_ERR(EINVAL));

    if (sz > SIZE_MAX / 2) return DAV2D_ERR(EINVAL);
    Dav2dRef *const ref = dav2d_malloc(ALLOC_DAV2DDATA, sizeof(Dav2dRef));
    if (!ref) return DAV2D_ERR(ENOMEM);

    buf->ref = dav2d_ref_init(ref, ptr, free_callback, cookie, 1);
    buf->data = ptr;
    buf->sz = sz;
    dav2d_data_props_set_defaults(&buf->m);
    buf->m.size = sz;

    return 0;
}

/**
 * Attach caller-owned user data to buf->m.user_data.
 *
 * @param buf           Data descriptor to update; must not be NULL.
 * @param user_data     Pointer stored as m.user_data.data (not checked
 *                      against NULL here).
 * @param free_callback Stored in the new reference together with cookie;
 *                      must not be NULL.
 * @param cookie        Opaque pointer stored alongside free_callback.
 *
 * @return 0 on success, DAV2D_ERR(EINVAL) for a rejected argument,
 *         DAV2D_ERR(ENOMEM) if the reference cannot be allocated.
 *
 * NOTE(review): a user_data reference already present in buf is overwritten
 * without being released here — confirm callers only pass a buf without one.
 */
int dav2d_data_wrap_user_data_internal(Dav2dData *const buf,
                                       const uint8_t *const user_data,
                                       void (*const free_callback)(const uint8_t *user_data,
                                                                   void *cookie),
                                       void *const cookie)
{
    validate_input_or_ret(buf != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(free_callback != NULL, DAV2D_ERR(EINVAL));

    Dav2dRef *const ref = dav2d_malloc(ALLOC_DAV2DDATA, sizeof(Dav2dRef));
    if (!ref) return DAV2D_ERR(ENOMEM);

    buf->m.user_data.ref = dav2d_ref_init(ref, user_data, free_callback, cookie, 1);
    buf->m.user_data.data = user_data;

    return 0;
}

/**
 * Make dst a second handle to the data described by src.
 *
 * Increments the buffer reference and the user_data reference of src (each
 * only if present), then copies the whole descriptor.
 *
 * @param dst Destination; asserted non-NULL and with data == NULL.
 * @param src Source; asserted non-NULL.
 */
void dav2d_data_ref(Dav2dData *const dst, const Dav2dData *const src) {
    assert(dst != NULL);
    assert(dst->data == NULL);
    assert(src != NULL);

    if (src->ref) {
        assert(src->data != NULL);
        dav2d_ref_inc(src->ref);
    }
    if (src->m.user_data.ref) dav2d_ref_inc(src->m.user_data.ref);
    *dst = *src;
}

/**
 * Replace the properties in dst with a copy of src.
 *
 * The user_data reference held by dst is released first; after the copy the
 * (now shared) user_data reference is incremented if it is not NULL.
 *
 * NOTE(review): the release happens before the increment, so calling this
 * with dst == src looks unsafe if dav2d_ref_dec() can free the reference —
 * confirm that no caller aliases the two.
 */
void dav2d_data_props_copy(Dav2dDataProps *const dst,
                           const Dav2dDataProps *const src)
{
    assert(dst != NULL);
    assert(src != NULL);

    dav2d_ref_dec(&dst->user_data.ref);
    *dst = *src;
    if (dst->user_data.ref) dav2d_ref_inc(dst->user_data.ref);
}

/**
 * Reset props to its default state: all bytes zero, except
 * timestamp = INT64_MIN and offset = -1.
 *
 * No reference is released; any user_data reference stored in props is
 * simply overwritten.
 */
void dav2d_data_props_set_defaults(Dav2dDataProps *const props) {
    assert(props != NULL);

    memset(props, 0, sizeof(*props));
    props->timestamp = INT64_MIN;
    props->offset = -1;
}

/**
 * Reset props to defaults and drop its user_data reference.
 */
void dav2d_data_props_unref_internal(Dav2dDataProps *const props) {
    validate_input(props != NULL);

    // Save the reference first: set_defaults() wipes the field.
    struct Dav2dRef *user_data_ref = props->user_data.ref;
    dav2d_data_props_set_defaults(props);
    dav2d_ref_dec(&user_data_ref);
}

/**
 * Drop the buffer and user_data references held by buf and leave buf
 * zeroed with default properties.
 */
void dav2d_data_unref_internal(Dav2dData *const buf) {
    validate_input(buf != NULL);

    // Save the user_data reference first: the memset below wipes the field.
    struct Dav2dRef *user_data_ref = buf->m.user_data.ref;
    if (buf->ref) {
        validate_input(buf->data != NULL);
        dav2d_ref_dec(&buf->ref);
    }
    memset(buf, 0, sizeof(*buf));
    dav2d_data_props_set_defaults(&buf->m);
    dav2d_ref_dec(&user_data_ref);
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/data.h000066400000000000000000000051151517466257200217170ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors *
Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_DATA_H
#define DAV2D_SRC_DATA_H

#include "dav2d/data.h"

/**
 * Make dst refer to the same data as src: the buffer and user_data
 * reference counts of src are increased (when present) and the descriptor
 * is copied. dst must not already carry data.
 */
void dav2d_data_ref(Dav2dData *dst, const Dav2dData *src);

/**
 * Copy the source properties to the destination and increase the
 * user_data's reference count (if it's not NULL). The user_data reference
 * previously held by the destination is released first.
 */
void dav2d_data_props_copy(Dav2dDataProps *dst, const Dav2dDataProps *src);

/**
 * Reset props to zero with timestamp = INT64_MIN and offset = -1. No
 * reference is released.
 */
void dav2d_data_props_set_defaults(Dav2dDataProps *props);

/**
 * Allocate a reference-counted buffer of sz bytes for buf.
 *
 * @return Writable pointer to the buffer, or NULL on failure.
 */
uint8_t *dav2d_data_create_internal(Dav2dData *buf, size_t sz);

/**
 * Wrap caller-owned memory [ptr, ptr + sz) in buf; free_callback and
 * user_data are stored in the reference created for it.
 *
 * @return 0 on success, a DAV2D_ERR() code otherwise.
 */
int dav2d_data_wrap_internal(Dav2dData *buf, const uint8_t *ptr, size_t sz,
                             void (*free_callback)(const uint8_t *data,
                                                   void *user_data),
                             void *user_data);

/**
 * Attach caller-owned user_data to buf's properties; free_callback and
 * cookie are stored in the reference created for it.
 *
 * @return 0 on success, a DAV2D_ERR() code otherwise.
 */
int dav2d_data_wrap_user_data_internal(Dav2dData *buf,
                                       const uint8_t *user_data,
                                       void (*free_callback)(const uint8_t *user_data,
                                                             void *cookie),
                                       void *cookie);

/** Release the references held by buf and reset it. */
void dav2d_data_unref_internal(Dav2dData *buf);

/** Release the user_data reference held by props and reset it to defaults. */
void dav2d_data_props_unref_internal(Dav2dDataProps *props);

#endif /* DAV2D_SRC_DATA_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/dav2d.rc.in000066400000000000000000000022041517466257200225640ustar00rootroot00000000000000#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"

#include

1 VERSIONINFO
FILETYPE VFT_DLL
FILEOS VOS_NT_WINDOWS32
PRODUCTVERSION PROJECT_VERSION_NUMBER
FILEVERSION API_VERSION_NUMBER
BEGIN
  BLOCK "StringFileInfo"
  BEGIN
    BLOCK "040904E4"
    BEGIN
      VALUE "CompanyName", "VideoLAN"
      VALUE "ProductName", "dav2d"
      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
      VALUE "FileVersion", API_VERSION_NUMBER_STR
      VALUE "FileDescription", "dav2d " PROJECT_VERSION_NUMBER_STR " - AV2 decoder"
      VALUE "InternalName", "dav2d"
      VALUE "OriginalFilename", "libdav2d.dll"
      VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav2d Authors"
    END
  END
  BLOCK "VarFileInfo"
  BEGIN
    VALUE "Translation", 0x409, 1252
  END
END
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/db_apply.h000066400000000000000000000041601517466257200225770ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef DAV2D_SRC_DB_APPLY_H
#define DAV2D_SRC_DB_APPLY_H

/* NOTE(review): the bare #include below has no header name in the reviewed
 * copy; it is kept as found. */
#include

#include "common/bitdepth.h"

#include "src/internal.h"
#include "src/levels.h"

/* All three entry points are declared through bytefn(), so their symbol
 * names come from common/bitdepth.h.
 * NOTE(review): presumably one instance per bit depth — confirm there. */

/* NOTE(review): judging by the names, the next two functions deblock the
 * column edges and the row edges of superblock row `sby` — their
 * definitions are outside the reviewed excerpt; confirm before relying on
 * this. */
void bytefn(dav2d_deblock_sbrow_cols)(const Dav2dFrameContext *f,
                                      pixel *const p[3], Av2Filter *lflvl,
                                      int sby, int start_of_tile_row);
void bytefn(dav2d_deblock_sbrow_rows)(const Dav2dFrameContext *f,
                                      pixel *const p[3], Av2Filter *lflvl,
                                      int sby);

/**
 * Copy rows of the planes in src for superblock row sby into the
 * f->lf.lr_db_line buffers.
 *
 * A plane is backed up when CDEF is enabled in the sequence header or when
 * that plane's bit is set in f->lf.restore_planes; chroma planes are
 * skipped for the I400 layout.
 */
void bytefn(dav2d_copy_db)(Dav2dFrameContext *const f,
                           /*const*/ pixel *const src[3], int sby);

#endif /* DAV2D_SRC_DB_APPLY_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/db_apply_tmpl.c000066400000000000000000001104441517466257200236310ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include "common/intops.h" #include "src/db_apply.h" #include "src/lr_apply.h" #include "src/quantizer.h" static unsigned deblock_quant_thr(const int hbd, const int qidx) { const int qmax = 255 + 48 * hbd; return (dav2d_dq_lookup(iclip(qidx, 0, qmax)) + 4) >> (3 + 6); } static unsigned deblock_side_thr(const int hbd, const int qidx) { const int bitdepth_min_8 = 2 * hbd; const int q_ind = iclip(qidx - 24 * bitdepth_min_8, 0, 296 - 1); const int side_thr = dav2d_deblock_side_thresholds[q_ind]; return imax(side_thr + (1 << 4 >> bitdepth_min_8), 0) >> (5 - bitdepth_min_8); } static void init_deblock_thr_lut_y(const Dav2dFrameHeader *const frame_hdr, const int hbd, const int dir, const int qidx, pixel lut[2][16]) { const int qmax = 255 + 48 * hbd; for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) { const int yac = frame_hdr->segmentation.enabled ? iclip(qidx + frame_hdr->segmentation.d.delta_q[i], 0, qmax) : qidx; const int dir_yac = yac + 8 * frame_hdr->deblock.delta_q_y[dir]; lut[0][i] = deblock_quant_thr(hbd, dir_yac); lut[1][i] = deblock_side_thr(hbd, dir_yac); } } static void init_deblock_thr_lut_uv(const Dav2dFrameHeader *const frame_hdr, const int hbd, const int qidx, pixel lut[2][2][16]) { const int qmax = 255 + 48 * hbd; for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) { const int yac = frame_hdr->segmentation.enabled ? 
iclip(qidx + frame_hdr->segmentation.d.delta_q[i], 0, qmax) : qidx; const int uac = yac + frame_hdr->quant.uac_delta + 8 * frame_hdr->deblock.delta_q_u; lut[0][0][i] = deblock_quant_thr(hbd, uac); lut[0][1][i] = deblock_side_thr(hbd, uac); const int vac = yac + frame_hdr->quant.vac_delta + 8 * frame_hdr->deblock.delta_q_v; lut[1][0][i] = deblock_quant_thr(hbd, vac); lut[1][1][i] = deblock_side_thr(hbd, vac); } } // The deblock buffer stores 12 rows of pixels. A superblock block will // contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above // and 2 below) the final 4 rows are used to swap the bottom of the last // stripe with the top of the next super block row. static void backup_db(const Dav2dFrameContext *const f, pixel *dst, const pixel *src, const ptrdiff_t stride, const int ss_ver, const int sb128, int row, const int row_h, const int w, const int h, const int ss_hor, const int lr_backup) { const int cdef_backup = !lr_backup; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver; src += (stripe_h - 2) * PXSTRIDE(stride); if (f->c->n_tc == 1) { if (row) { const int top = 4 << sb128; // Copy the top part of the stored loop filtered pixels from the // previous sb row needed above the first stripe of this sb row. 
pixel_copy(&dst[PXSTRIDE(stride) * 0], &dst[PXSTRIDE(stride) * top], w); pixel_copy(&dst[PXSTRIDE(stride) * 1], &dst[PXSTRIDE(stride) * (top + 1)], w); pixel_copy(&dst[PXSTRIDE(stride) * 2], &dst[PXSTRIDE(stride) * (top + 2)], w); pixel_copy(&dst[PXSTRIDE(stride) * 3], &dst[PXSTRIDE(stride) * (top + 3)], w); } dst += 4 * PXSTRIDE(stride); } while (row + stripe_h <= row_h) { for (int i = 0; i < 4; i++) { pixel_copy(dst, src, w); dst += PXSTRIDE(stride); src += PXSTRIDE(stride); } row += stripe_h; // unmodified stripe_h for the 1st stripe stripe_h = 64 >> ss_ver; src += (stripe_h - 4) * PXSTRIDE(stride); } } void bytefn(dav2d_copy_db)(Dav2dFrameContext *const f, /*const*/ pixel *const src[3], const int sby) { const int have_tt = f->c->n_tc > 1; const int offset = 8 * !!sby; const ptrdiff_t *const stride = f->cur.p.stride; const int tt_off = have_tt * sby * (4 << f->frame_hdr->sb128); pixel *const dst[3] = { f->lf.lr_db_line[0] + tt_off * PXSTRIDE(stride[0]), f->lf.lr_db_line[1] + tt_off * PXSTRIDE(stride[1]), f->lf.lr_db_line[2] + tt_off * PXSTRIDE(stride[1]) }; // TODO Also check block level restore type to reduce copying. 
// Transposes one column of the per-row lossless bit masks into per-column
// masks.
//
//   dst_mask - 17 entries; entry 0 receives the previous call's last column
//              (dst_mask[16 >> ss_hor]), entries 1..(16 >> ss_hor) receive
//              the transposed columns
//   src_mask - one row of four 16-bit words per 4-pixel row; only word `x64`
//              of each row is read
//   ss_hor   - halves the number of columns produced when 1
//   ss_ver   - halves the number of rows consumed when 1
//
// Bit `y` of dst_mask[x + 1] is bit `x` of src_mask[y][x64].
// TODO: Use a faster bit matrix transpose implementation
static void transpose_lossless_mask(uint16_t dst_mask[17],
                                    const uint16_t (*const src_mask)[4],
                                    const int x64, const int ss_hor, const int ss_ver)
{
    const int n_cols = 16 >> ss_hor;
    const int n_rows = 16 >> ss_ver;

    // Keep the last column of the previous block as the left neighbour.
    dst_mask[0] = dst_mask[n_cols];

    for (int col = 1; col <= n_cols; col++) {
        const unsigned col_bit = 1u << (col - 1);
        unsigned bits = 0;

        // Walk the rows from the highest down so that row r lands in bit r.
        for (int row = n_rows - 1; row >= 0; row--) {
            bits = (bits << 1) | !!(src_mask[row][x64] & col_bit);
        }
        dst_mask[col] = bits;
    }
}
ptrdiff_t seg_stride, const uint16_t (*const mask)[5][4], const pixel thr_lut[2][16], pixel *const left_q_thr, pixel *const left_side_thr, const int y64, const int ss_hor, const int ss_ver, const int w4, const int h4) { const int mask_idx = y64 >> ss_ver; const int mask_shift = y64 & ss_ver ? 8 : 0; for (int y4 = 0; y4 < h4; y4++) { int prev_q_thr = left_q_thr[y4]; int prev_side_thr = left_side_thr[y4]; for (int x4 = 0; x4 < w4; x4++) { const int seg_id = segmap[x4 + y4 * seg_stride]; const int cur_q_thr = thr_lut[0][seg_id]; const int cur_side_thr = thr_lut[1][seg_id]; const int subpu = 3 * ((mask[x4][4][mask_idx] >> (mask_shift + y4)) & 1); int edge_q_thr; int edge_side_thr; if (cur_q_thr && prev_q_thr) edge_q_thr = (cur_q_thr + prev_q_thr + 1) >> 1; else edge_q_thr = cur_q_thr | prev_q_thr; if (cur_side_thr && prev_side_thr) edge_side_thr = (cur_side_thr + prev_side_thr + 1) >> 1; else edge_side_thr = cur_side_thr | prev_side_thr; // store transposed q_thr_dst[x4 * dst_stride + y4] = edge_q_thr >> subpu; side_thr_dst[x4 * dst_stride + y4] = edge_side_thr >> subpu; prev_q_thr = cur_q_thr; prev_side_thr = cur_side_thr; } left_q_thr[y4] = prev_q_thr; left_side_thr[y4] = prev_side_thr; } } static void setup_thr_rows_sb64(pixel *const q_thr_dst, pixel *const side_thr_dst, const ptrdiff_t dst_stride, const uint8_t *const segmap, const ptrdiff_t seg_stride, const uint16_t (*const mask)[5][4], const pixel thr_lut[2][16], const pixel above_thr_lut[2][16], const int sb64x, const int ss_hor, const int ss_ver, const int w4, const int h4) { const int mask_idx = sb64x >> ss_hor; const int mask_shift = sb64x & ss_hor ? 
8 : 0; pixel above_q_thr[16] = { 0 }; pixel above_side_thr[16] = { 0 }; if (above_thr_lut) { for (int x4 = 0; x4 < w4; x4++) { const int seg_id = segmap[x4 - seg_stride]; above_q_thr[x4] = above_thr_lut[0][seg_id]; above_side_thr[x4] = above_thr_lut[1][seg_id]; } } for (int x4 = 0; x4 < w4; x4++) { int prev_q_thr = above_q_thr[x4]; int prev_side_thr = above_side_thr[x4]; for (int y4 = 0; y4 < h4; y4++) { const int seg_id = segmap[x4 + y4 * seg_stride]; const int cur_q_thr = thr_lut[0][seg_id]; const int cur_side_thr = thr_lut[1][seg_id]; const int subpu = 3 * ((mask[y4][4][mask_idx] >> (mask_shift + x4)) & 1); int edge_q_thr; int edge_side_thr; if (cur_q_thr && prev_q_thr) edge_q_thr = (cur_q_thr + prev_q_thr + 1) >> 1; else edge_q_thr = cur_q_thr | prev_q_thr; if (cur_side_thr && prev_side_thr) edge_side_thr = (cur_side_thr + prev_side_thr + 1) >> 1; else edge_side_thr = cur_side_thr | prev_side_thr; q_thr_dst[x4 + y4 * dst_stride] = edge_q_thr >> subpu; side_thr_dst[x4 + y4 * dst_stride] = edge_side_thr >> subpu; prev_q_thr = cur_q_thr; prev_side_thr = cur_side_thr; } } } static inline void filter_plane_cols_y(const Dav2dFrameContext *const f, const int have_left, const uint16_t (*const mask)[5][4], const uint16_t (*const ll_mask), const pixel *q_thr, const pixel *side_thr, pixel *dst, const ptrdiff_t ls, const int y64, const int w4, const int h4, int tile_edge) { const Dav2dDSPContext *const dsp = f->dsp; // filter edges between columns (e.g. 
block1 | block2) for (int x = 0; x < w4; x++, q_thr += 16, side_thr += 16) { if (!have_left && !x) continue; uint16_t hmask[4] = { mask[x][0][y64], mask[x][1][y64], mask[x][2][y64], mask[x][3][y64] }; dsp->lf.deblock_sb[0][0](&dst[x * 4], ls, hmask, ll_mask + x, q_thr, side_thr, tile_edge, h4 HIGHBD_CALL_SUFFIX); tile_edge = 0; } } static inline void filter_plane_rows_y(const Dav2dFrameContext *const f, const int have_top, const uint16_t (*const mask)[5][4], const uint16_t (*const ll_mask), const pixel *q_thr, const pixel *side_thr, pixel *dst, const ptrdiff_t ls, const int sb64x, const int w4, const int h4) { const Dav2dDSPContext *const dsp = f->dsp; // block1 // filter edges between rows (e.g. ------) // block2 for (int y = 0; y < h4; y++, dst += 4 * PXSTRIDE(ls), q_thr += 16, side_thr += 16) { if (!have_top && !y) continue; const uint16_t vmask[4] = { mask[y][0][sb64x], mask[y][1][sb64x], mask[y][2][sb64x], mask[y][3][sb64x] }; dsp->lf.deblock_sb[0][1](dst, ls, vmask, ll_mask + y, q_thr, side_thr, !y, w4 HIGHBD_CALL_SUFFIX); } } static inline void filter_plane_cols_uv(const Dav2dFrameContext *const f, const int have_left, const uint16_t (*const mask)[5][4], const uint16_t (*const ll_mask), const pixel *u_q_thr, const pixel *u_side_thr, const pixel *v_q_thr, const pixel *v_side_thr, pixel *const u, pixel *const v, const ptrdiff_t ls, const int y64, const int w4, const int h4, int tile_edge, const int ss_ver) { const Dav2dDSPContext *const dsp = f->dsp; const int apply_u = f->frame_hdr->deblock.level_u; const int apply_v = f->frame_hdr->deblock.level_v; const int mask_idx = y64 >> ss_ver; const int mask_shift = y64 & ss_ver ? 8 : 0; const int bytes_mask = ss_ver ? 0xff : 0xffff; // filter edges between columns (e.g. 
block1 | block2) for (int x = 0; x < w4; x++, u_q_thr += 16, u_side_thr += 16, v_q_thr += 16, v_side_thr += 16) { if (!have_left && !x) continue; uint16_t hmask[3] = { (mask[x][0][mask_idx] >> mask_shift) & bytes_mask, (mask[x][1][mask_idx] >> mask_shift) & bytes_mask, (mask[x][2][mask_idx] >> mask_shift) & bytes_mask, }; if (apply_u) dsp->lf.deblock_sb[1][0](&u[x * 4], ls, hmask, ll_mask + x, u_q_thr, u_side_thr, tile_edge, h4 HIGHBD_CALL_SUFFIX); if (apply_v) dsp->lf.deblock_sb[1][0](&v[x * 4], ls, hmask, ll_mask + x, v_q_thr, v_side_thr, tile_edge, h4 HIGHBD_CALL_SUFFIX); tile_edge = 0; } } static inline void filter_plane_rows_uv(const Dav2dFrameContext *const f, const int have_top, const uint16_t (*const mask)[5][4], const uint16_t (*const ll_mask), const pixel *u_q_thr, const pixel *u_side_thr, const pixel *v_q_thr, const pixel *v_side_thr, pixel *const u, pixel *const v, const ptrdiff_t ls, const int sb64x, const int w4, const int h4, const int ss_hor) { const Dav2dDSPContext *const dsp = f->dsp; ptrdiff_t off_l = 0; const int apply_u = f->frame_hdr->deblock.level_u; const int apply_v = f->frame_hdr->deblock.level_v; const int mask_idx = sb64x >> ss_hor; const int mask_shift = sb64x & ss_hor ? 8 : 0; const int bytes_mask = ss_hor ? 0xff : 0xffff; // block1 // filter edges between rows (e.g. 
------) // block2 for (int y = 0; y < h4; y++, off_l += 4 * PXSTRIDE(ls), u_q_thr += 16, u_side_thr += 16, v_q_thr += 16, v_side_thr += 16) { if (!have_top && !y) continue; const uint16_t vmask[3] = { (mask[y][0][mask_idx] >> mask_shift) & bytes_mask, (mask[y][1][mask_idx] >> mask_shift) & bytes_mask, (mask[y][2][mask_idx] >> mask_shift) & bytes_mask, }; if (apply_u) dsp->lf.deblock_sb[1][1](&u[off_l], ls, vmask, ll_mask + y, u_q_thr, u_side_thr, !y, w4 HIGHBD_CALL_SUFFIX); if (apply_v) dsp->lf.deblock_sb[1][1](&v[off_l], ls, vmask, ll_mask + y, v_q_thr, v_side_thr, !y, w4 HIGHBD_CALL_SUFFIX); } } static const uint8_t placeholder_segmap[16] = { 0 }; static void deblock_sbrow64_cols(const Dav2dFrameContext *const f, pixel *const p[3], Av2Filter *const lflvl, int y64, const int start_of_tile_row) { int x64, have_left; const int sb128 = f->frame_hdr->sb128; const int starty4 = (y64 * 16) & 0x30; const int sbl2 = 4 + f->frame_hdr->sb128; const int halign = (f->bh + 63) & ~63; const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444; const int h4 = imin(f->bh - y64 * 16, 16); const int uv_h4 = h4 >> ss_ver; const int hbd = f->seq_hdr->hbd; const int y64idx = (y64 & 3) << 2; const ptrdiff_t seg_stride = f->cur_segmap ? f->b4_stride : 0; const uint8_t *const segmap = f->cur_segmap ? 
&f->cur_segmap[y64 * 16 * seg_stride] : NULL; // fix lpf strength at tile col boundaries if (f->frame_hdr->tip.frame_mode != 2) { const uint8_t *lpf_y = &f->lf.tx_db_right_edge[0][y64 * 16]; const uint8_t *lpf_uv = &f->lf.tx_db_right_edge[1][y64 * 16 >> ss_ver]; for (int tile_col = 1;; tile_col++) { const int sbx = f->frame_hdr->tiling.t.col_start_sb[tile_col]; if ((sbx << sbl2) >= f->bw) break; const int bx4 = (sbx << sbl2) & 0x30, cbx4 = bx4 >> ss_hor; const int x256 = sbx >> (2 - sb128); uint16_t (*const y_hmask)[4] = lflvl[x256].filter_y[0][bx4]; int sidx = y64 & 3; for (int y4 = 0; y4 < h4; y4++) { const unsigned smask = 1 << y4; const int idx = 3 * !!(y_hmask[3][sidx] & smask) + 2 * !!(y_hmask[2][sidx] & smask) + !!(y_hmask[1][sidx] & smask); y_hmask[3][sidx] &= ~smask; y_hmask[2][sidx] &= ~smask; y_hmask[1][sidx] &= ~smask; y_hmask[0][sidx] &= ~smask; y_hmask[imin(idx, lpf_y[y4])][sidx] |= smask; } lpf_y += halign; if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400) { const int uv_endy4 = (starty4 >> ss_ver) + uv_h4; uint16_t (*const uv_hmask)[4] = lflvl[x256].filter_uv[0][cbx4]; sidx = (y64 & 3) >> ss_ver; for (int y4 = starty4 >> ss_ver; y4 < uv_endy4; y4++) { const unsigned smask = 1 << (y4 & 0xf); const int idx = 2 * !!(uv_hmask[2][sidx] & smask) + !!(uv_hmask[1][sidx] & smask); uv_hmask[2][sidx] &= ~smask; uv_hmask[1][sidx] &= ~smask; uv_hmask[0][sidx] &= ~smask; uv_hmask[imin(idx, lpf_uv[y4 - (starty4 >> ss_ver)])][sidx] |= smask; } } lpf_uv += halign >> ss_ver; } // fix lpf strength at tile row boundaries if (start_of_tile_row) { const BlockContext *a; int x256; for (x256 = 0, a = &f->a[f->sb256w * (start_of_tile_row - 1)]; x256 < f->sb256w; x256++, a++) { uint16_t (*const y_vmask)[4] = lflvl[x256].filter_y[1][starty4]; const int w = imin(64, f->bw - (x256 << 6)); for (int i = 0; i < w; i++) { const int sidx = i >> 4; const unsigned smask = 1 << (i & 0xf); const int idx = 3 * !!(y_vmask[3][sidx] & smask) + 2 * !!(y_vmask[2][sidx] & smask) + 
!!(y_vmask[1][sidx] & smask); y_vmask[3][sidx] &= ~smask; y_vmask[2][sidx] &= ~smask; y_vmask[1][sidx] &= ~smask; y_vmask[0][sidx] &= ~smask; y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; } if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400) { const int cw = w >> ss_hor; uint16_t (*const uv_vmask)[4] = lflvl[x256].filter_uv[1][starty4 >> ss_ver]; for (int i = 0; i < cw; i++) { const int sidx = i >> 4; const unsigned smask = 1 << (i & 0xf); const int idx = 2 * !!(uv_vmask[2][sidx] & smask) + !!(uv_vmask[1][sidx] & smask); uv_vmask[2][sidx] &= ~smask; uv_vmask[1][sidx] &= ~smask; uv_vmask[0][sidx] &= ~smask; uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask; } } } } } // Crop deblock size on the bottom of the frame if ((y64 + 1) * 16 + 4 > f->bh) { // For luma, we crop 32 long tx edges that overhang by 24 pixels. // Frame dimensions are multiples of 8 so we only need to crop a single row. const int luma_crop_y4 = starty4 + h4 - 2; // check if this was handled by the previous sb row if (luma_crop_y4 >= 0) { for (int x256 = 0; x256 < f->sb256w; x256++) { const int w = imin(64, f->bw - (x256 << 6)); uint16_t (*const y_vmask)[4] = lflvl[x256].filter_y[1][luma_crop_y4]; for (int i = 0; i < (w + 15) >> 4; i++) { unsigned mask = y_vmask[3][i]; y_vmask[3][i] = 0; y_vmask[2][i] |= mask; } } } } if (f->frame_hdr->deblock.level_y[0]) { int l_qidx = -1; // left q_idx pixel lut[2][16]; pixel edge_q_thr[16 * 16]; pixel edge_side_thr[16 * 16]; pixel left_q_thr[16] = { 0 }; pixel left_side_thr[16] = { 0 }; uint16_t ll_mask[17] = { 0 }; pixel *ptr; int tile_col = 1; int tile_end = f->frame_hdr->tiling.t.col_start_sb[tile_col] << sbl2; for (ptr = p[0], have_left = 0, x64 = 0; x64 < (f->bw + 15) >> 4; x64++, have_left = 1, ptr += 64) { if (x64 * 16 > tile_end) { tile_col++; tile_end = f->frame_hdr->tiling.t.col_start_sb[tile_col] << sbl2; } const Av2Filter *const col_lflvl = &lflvl[x64 >> 2]; const int cur_qidx = col_lflvl->qidx[(x64 & 3) + y64idx]; if (cur_qidx != l_qidx) { 
init_deblock_thr_lut_y(f->frame_hdr, hbd, 0, cur_qidx, lut); l_qidx = cur_qidx; } const uint8_t *const col_seg = segmap ? &segmap[x64 * 16] : placeholder_segmap; setup_thr_cols_sb64(edge_q_thr, edge_side_thr, 16, col_seg, seg_stride, &col_lflvl->filter_y[0][(x64 & 3) * 16], lut, left_q_thr, left_side_thr, y64 & 3, 0, 0, imin(f->bw - x64 * 16, 16), h4); transpose_lossless_mask(ll_mask, &col_lflvl->lossless_mask_y[starty4], x64 & 3, 0, 0); filter_plane_cols_y(f, have_left, &col_lflvl->filter_y[0][(x64 & 3) * 16], ll_mask, edge_q_thr, edge_side_thr, ptr, f->cur.p.stride[0], y64 & 3, imin(16, f->bw - x64 * 16), h4, tile_end == x64 * 16); } } if (!f->frame_hdr->deblock.level_u && !f->frame_hdr->deblock.level_v) return; int prev_qidx = -1; pixel lut[2][2][16]; const ptrdiff_t uv_seg_stride = f->lf.segmap_uv ? f->lf.uv_segmap_stride : 0; const uint8_t *const uv_segmap = f->lf.segmap_uv ? &f->lf.segmap_uv[y64 * (16 >> ss_ver) * uv_seg_stride] : NULL; pixel edge_q_thr[2][16 * 16]; pixel edge_side_thr[2][16 * 16]; pixel left_q_thr[2][16] = { 0 }; pixel left_side_thr[2][16] = { 0 }; uint16_t ll_mask[17] = { 0 }; ptrdiff_t uv_off; int tile_col = 1; int tile_end = f->frame_hdr->tiling.t.col_start_sb[tile_col] << sbl2; for (uv_off = 0, have_left = 0, x64 = 0; x64 < (f->bw + 15) >> 4; x64++, have_left = 1, uv_off += 64 >> ss_hor) { if (x64 * 16 > tile_end) { tile_col++; tile_end = f->frame_hdr->tiling.t.col_start_sb[tile_col] << sbl2; } const Av2Filter *const col_lflvl = &lflvl[x64 >> 2]; const int cur_qidx = col_lflvl->qidx[(x64 & 3) + y64idx]; if (cur_qidx != prev_qidx) { init_deblock_thr_lut_uv(f->frame_hdr, hbd, cur_qidx, lut); prev_qidx = cur_qidx; } const uint8_t *const col_seg = uv_segmap ? 
// Filters the edges between rows for one 64-pixel-high band of the frame,
// luma first, then chroma.
//
//   p     - plane pointers at the top-left of band `y64`
//   lflvl - filter masks of the current row of Av2Filter entries; each entry
//           is indexed with x64 >> 2, and the entries of the row above are
//           reached by stepping back f->sb256w entries
//   y64   - index of the 64-row band inside the frame
//
// The band is walked in 64-pixel-wide columns. For each column the threshold
// LUTs are rebuilt only when the column's qidx differs from the previous
// column's, the per-unit thresholds are derived, and the row-edge filter is
// run. Luma is skipped when deblock.level_y[1] is 0; chroma is skipped when
// both deblock.level_u and deblock.level_v are 0.
static void deblock_sbrow64_rows(const Dav2dFrameContext *const f,
                                 pixel *const p[3], Av2Filter *const lflvl,
                                 int y64)
{
    int x64;
    // Don't filter outside the frame
    const int have_top = y64 > 0;
    // First 4x4 row of this band inside its Av2Filter entry (0, 16, 32 or 48).
    const int starty4 = (y64 * 16) & 0x30;
    const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420;
    const int ss_hor = f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444;
    // Band height in 4x4 units, clipped against f->bh.
    const int h4 = imin(f->bh - y64 * 16, 16);
    const int uv_h4 = h4 >> ss_ver;
    const int hbd = f->seq_hdr->hbd;
    // qidx[] is addressed as (x64 & 3) + 4 * (y64 & 3).
    const int y64idx = (y64 & 3) << 2;
    // Entry holding the band above: the same entry unless this band is the
    // first one of its entry, in which case it is one row of entries back.
    const Av2Filter *const a_lflvl = have_top ? &lflvl[-f->sb256w * (starty4 == 0)] : NULL;
    // qidx row of the band above, i.e. (y64 - 1) & 3, scaled like y64idx.
    const int a_y64idx = ((y64 + 3) & 3) << 2;
    // Without a segmentation map a stride of 0 is used together with
    // placeholder_segmap below.
    const ptrdiff_t seg_stride = f->cur_segmap ? f->b4_stride : 0;
    const uint8_t *const segmap = f->cur_segmap ? &f->cur_segmap[y64 * 16 * seg_stride] : NULL;

    if (f->frame_hdr->deblock.level_y[1]) {
        pixel *ptr;
        // qidx the current / above LUT was last built for (-1 = not built)
        int l_qidx = -1, al_qidx = -1;
        pixel lut[2][16];
        pixel a_lut[2][16];
        // NULL when there is no band above, so the helpers skip the top edge.
        pixel (*const a_lut_ptr)[16] = a_lflvl ? a_lut : NULL;
        pixel edge_q_thr[16 * 16];
        pixel edge_side_thr[16 * 16];
        // [0] = row above the band, [1 + y] = row y of the band
        uint16_t ll_mask[17] = { 0 };
        for (ptr = p[0], x64 = 0; x64 < (f->bw + 15) >> 4; x64++, ptr += 64) {
            const Av2Filter *const col_lflvl = &lflvl[x64 >> 2];
            for (int y = 0; y < h4; y++)
                ll_mask[y + 1] = col_lflvl->lossless_mask_y[starty4 + y][x64 & 3];
            const int cur_qidx = col_lflvl->qidx[(x64 & 3) + y64idx];
            if (cur_qidx != l_qidx) {
                init_deblock_thr_lut_y(f->frame_hdr, hbd, 1, cur_qidx, lut);
                l_qidx = cur_qidx;
            }
            if (a_lut_ptr) {
                // (starty4 + 63) & 63 is the row just above starty4, wrapping
                // to row 63 of the entry above when starty4 is 0.
                ll_mask[0] = a_lflvl[x64 >> 2].lossless_mask_y[(starty4 + 63) & 63][x64 & 3];
                const int a_qidx = a_lflvl[x64 >> 2].qidx[(x64 & 3) + a_y64idx];
                if (a_qidx != al_qidx) {
                    init_deblock_thr_lut_y(f->frame_hdr, hbd, 1, a_qidx, a_lut_ptr);
                    al_qidx = a_qidx;
                }
            }
            const uint8_t *const col_seg = segmap ? &segmap[x64 * 16] : placeholder_segmap;
            setup_thr_rows_sb64(edge_q_thr, edge_side_thr, 16, col_seg, seg_stride,
                                &col_lflvl->filter_y[1][starty4], lut, a_lut_ptr,
                                x64 & 3, 0, 0, imin(f->bw - x64 * 16, 16), h4);
            filter_plane_rows_y(f, have_top, &col_lflvl->filter_y[1][starty4], ll_mask,
                                edge_q_thr, edge_side_thr, ptr, f->cur.p.stride[0],
                                x64 & 3, imin(16, f->bw - x64 * 16), h4);
        }
    }

    if (!f->frame_hdr->deblock.level_u && !f->frame_hdr->deblock.level_v)
        return;

    // Chroma: same walk as luma, with one LUT / threshold table per plane
    // ([0] = U, [1] = V) and sizes scaled by the subsampling shifts.
    const ptrdiff_t uv_seg_stride = f->lf.segmap_uv ? f->lf.uv_segmap_stride : 0;
    const uint8_t *const uv_segmap = f->lf.segmap_uv ? &f->lf.segmap_uv[y64 * (16 >> ss_ver) * uv_seg_stride] : NULL;
    // qidx the current / above LUT was last built for (-1 = not built)
    int l_qidx = -1, al_qidx = -1;
    pixel lut[2][2][16];
    pixel a_lut[2][2][16];
    pixel (*const a_lut_ptr)[2][16] = a_lflvl ? a_lut : NULL;
    pixel edge_q_thr[2][16 * 16];
    pixel edge_side_thr[2][16 * 16];
    uint16_t ll_mask[17] = { 0 };
    ptrdiff_t uv_off;
    for (uv_off = 0, x64 = 0; x64 < (f->bw + 15) >> 4; x64++, uv_off += 64 >> ss_hor) {
        const Av2Filter *const col_lflvl = &lflvl[x64 >> 2];
        for (int y = 0; y < uv_h4; y++)
            ll_mask[y + 1] = col_lflvl->lossless_mask_uv[(starty4 >> ss_ver) + y][x64 & 3];
        const int cur_qidx = col_lflvl->qidx[(x64 & 3) + y64idx];
        if (cur_qidx != l_qidx) {
            init_deblock_thr_lut_uv(f->frame_hdr, hbd, cur_qidx, lut);
            l_qidx = cur_qidx;
        }
        if (a_lut_ptr) {
            ll_mask[0] = a_lflvl[x64 >> 2].lossless_mask_uv[((starty4 + 63) & 63) >> ss_ver][x64 & 3];
            const int a_qidx = a_lflvl[x64 >> 2].qidx[(x64 & 3) + a_y64idx];
            if (a_qidx != al_qidx) {
                init_deblock_thr_lut_uv(f->frame_hdr, hbd, a_qidx, a_lut_ptr);
                al_qidx = a_qidx;
            }
        }
        const uint8_t *const col_seg = uv_segmap ? &uv_segmap[x64 * (16 >> ss_hor)] : placeholder_segmap;
        const int uv_w4 = imin(f->bw - x64 * 16, 16) >> ss_hor;
        setup_thr_rows_sb64(edge_q_thr[0], edge_side_thr[0], 16, col_seg, uv_seg_stride,
                            &col_lflvl->filter_uv[1][starty4 >> ss_ver], lut[0],
                            a_lut_ptr ? a_lut_ptr[0] : NULL,
                            x64 & 3, ss_hor, ss_ver, uv_w4, uv_h4);
        setup_thr_rows_sb64(edge_q_thr[1], edge_side_thr[1], 16, col_seg, uv_seg_stride,
                            &col_lflvl->filter_uv[1][starty4 >> ss_ver], lut[1],
                            a_lut_ptr ? a_lut_ptr[1] : NULL,
                            x64 & 3, ss_hor, ss_ver, uv_w4, uv_h4);
        // NOTE(review): the width passed here rounds up before shifting,
        // while uv_w4 above rounds down; they agree only when the width is
        // even in 4x4 units - confirm that this always holds.
        filter_plane_rows_uv(f, have_top, &col_lflvl->filter_uv[1][starty4 >> ss_ver], ll_mask,
                             edge_q_thr[0], edge_side_thr[0], edge_q_thr[1], edge_side_thr[1],
                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
                             x64 & 3, (imin(16, f->bw - x64 * 16) + ss_hor) >> ss_hor, uv_h4, ss_hor);
    }
}

// Runs the column-edge pass over every 64-row band of superblock row `sby`.
//
// A superblock row holds 1 << sb128 bands; the last row of the frame is
// clipped against (f->bh + 15) >> 4. `start_of_tile_row` is forwarded to the
// first band only and is 0 for the following ones. The plane pointers are
// advanced by 64 luma rows (64 >> ss_ver chroma rows) per band.
void bytefn(dav2d_deblock_sbrow_cols)(const Dav2dFrameContext *const f,
                                      pixel *const p[3], Av2Filter *const lflvl,
                                      int sby, int start_of_tile_row)
{
    const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420;
    const int y64_start = sby << f->frame_hdr->sb128;
    const int y64_end = imin((sby + 1) << f->frame_hdr->sb128, (f->bh + 15) >> 4);
    pixel *ptrs[3] = { p[0], p[1], p[2] };
    for (int y64 = y64_start; y64 < y64_end; y64++) {
        deblock_sbrow64_cols(f, ptrs, lflvl, y64, start_of_tile_row);
        start_of_tile_row = 0;
        ptrs[0] += 64 * PXSTRIDE(f->cur.p.stride[0]);
        // `*` binds tighter than `>>`: (64 * stride) >> ss_ver
        ptrs[1] += 64 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
        ptrs[2] += 64 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
    }
}

// Runs the row-edge pass over every 64-row band of superblock row `sby`.
//
// Uses the same band range and pointer stepping as the column-edge pass.
void bytefn(dav2d_deblock_sbrow_rows)(const Dav2dFrameContext *const f,
                                      pixel *const p[3], Av2Filter *const lflvl,
                                      int sby)
{
    const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420;
    const int y64_start = sby << f->frame_hdr->sb128;
    const int y64_end = imin((sby + 1) << f->frame_hdr->sb128, (f->bh + 15) >> 4);
    pixel *ptrs[3] = { p[0], p[1], p[2] };
    for (int y64 = y64_start; y64 < y64_end; y64++) {
        deblock_sbrow64_rows(f, ptrs, lflvl, y64);
        ptrs[0] += 64 * PXSTRIDE(f->cur.p.stride[0]);
        // `*` binds tighter than `>>`: (64 * stride) >> ss_ver
        ptrs[1] += 64 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
        ptrs[2] += 64 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
    }
}
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_DEBLOCK_H #define DAV2D_SRC_DEBLOCK_H #include #include #include "common/bitdepth.h" #include "src/levels.h" #define decl_deblock_sb_fn(name) \ void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *mask, \ const uint16_t *ll_mask, \ const pixel *q_thr, const pixel *side_thr, int edge, \ int w HIGHBD_DECL_SUFFIX) typedef decl_deblock_sb_fn(*deblock_sb_fn); typedef struct Dav2dDeblockDSPContext { /* * dimension 1: plane (0=luma, 1=chroma) * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v) * * dst/stride are aligned by 32 */ deblock_sb_fn deblock_sb[2][2]; } Dav2dDeblockDSPContext; bitfn_decls(void dav2d_deblock_dsp_init, Dav2dDeblockDSPContext *c); #endif /* DAV2D_SRC_DEBLOCK_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/deblock_tmpl.c000066400000000000000000000256361517466257200234520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/attributes.h" #include "common/intops.h" #include "src/deblock.h" static const int8_t max_width_y[4] = { 1, 3, 6, 8 }; static const int8_t max_width_uv[3] = { 1, 3, 4 }; static const int8_t q_first[5] = { 45, 40, 32 }; static const int8_t q_thresh_mults[8] = { 32, 25, 19, 19, 0, 18, 0, 17 }; static const int8_t w_mult[8] = { 85, 51, 37, 28, 0, 20, 0, 15 }; static int filter_choice(const pixel *const s, const pixel *const t, const ptrdiff_t stride, const int max_width_neg, const int max_width_pos, unsigned q_thr, unsigned side_thr) { unsigned deriv_s, deriv_t; unsigned second_derivs_buf[4]; unsigned *second_deriv = &second_derivs_buf[2]; for (int dist = -2; dist < 2; dist++) { deriv_s = abs(s[(dist - 1) * stride] - (s[dist * stride] << 1) + s[(dist + 1) * stride]); deriv_t = abs(t[(dist - 1) * stride] - (t[dist * stride] << 1) + t[(dist + 1) * stride]); second_deriv[dist] = (deriv_s + deriv_t + 1) >> 1; } const unsigned high_deriv = umax(second_deriv[-2], second_deriv[1]); if (high_deriv > side_thr) return 0; if (max_width_pos == 1) return 1; const unsigned side_thr2 = side_thr >> 2; unsigned transition = second_deriv[-1] + second_deriv[0]; if (high_deriv > side_thr2) return 1; if (transition > q_thr * 4) return 1; const unsigned side_thr3 = side_thr >> 3; if (high_deriv > side_thr3) return 2; if (transition > q_thr * 3) return 2; const unsigned end_thr = (side_thr * 3) >> 4; // if !(chroma && edge) if 
(max_width_neg >= 3) { deriv_s = abs(s[-1 * stride] - s[-4 * stride] - 3 * (s[-1 * stride] - s[-2 * stride])); deriv_t = abs(t[-1 * stride] - t[-4 * stride] - 3 * (t[-1 * stride] - t[-2 * stride])); if (((deriv_s + deriv_t + 1) >> 1) > end_thr) return 2; } deriv_s = abs(s[0] - s[3 * stride] - 3 * (s[0] - s[stride])); deriv_t = abs(t[0] - t[3 * stride] - 3 * (t[0] - t[stride])); if (((deriv_s + deriv_t + 1) >> 1) > end_thr) return 2; if (max_width_pos == 3) return 3; transition <<= 4; int prev_dist = 3; for (int dist = 4; dist <= max_width_pos; dist += 2) { const unsigned q_thr4 = q_thr * q_first[(dist - 4) >> 1]; const unsigned end_thr4 = (side_thr * dist) >> 4; if (transition > q_thr4) return prev_dist; const int dist2 = imin(7, dist); // if !(luma && edge && dist2 == 8) if (max_width_neg >= dist2) { deriv_s = abs(s[-stride] - s[(-dist2 - 1) * stride] - dist2 * (s[-stride] - s[-2 * stride])); deriv_t = abs(t[-stride] - t[(-dist2 - 1) * stride] - dist2 * (t[-stride] - t[-2 * stride])); if (((deriv_s + deriv_t + 1) >> 1) > end_thr4) return prev_dist; } deriv_s = abs(s[0] - s[dist2 * stride] - dist2 * (s[0] - s[stride])); deriv_t = abs(t[0] - t[dist2 * stride] - dist2 * (t[0] - t[stride])); if (((deriv_s + deriv_t + 1) >> 1) > end_thr4) return prev_dist; prev_dist = dist; } return max_width_pos; } static NOINLINE void deblock(pixel *dst, unsigned q_thr, unsigned side_thr, const ptrdiff_t stridea, const ptrdiff_t strideb, const int max_width_pos, const int max_width_neg, const int pos_lossless, const int neg_lossless HIGHBD_DECL_SUFFIX) { const int width = filter_choice(dst, dst + 3 * stridea, strideb, max_width_neg, max_width_pos, q_thr, side_thr); const int width_neg = imin(width, max_width_neg); const int width_pos = width; if (width_pos < 1) return; const int q_thr_clamp = q_thr * q_thresh_mults[width - 1]; for (int i = 0; i < 4; i++, dst += stridea) { const int delta_m2 = iclip(4 * (3 * (dst[0] - dst[-1 * strideb]) - (dst[strideb] - dst[-2 * strideb])), 
-q_thr_clamp, q_thr_clamp); if (!neg_lossless) { const int delta_m2_neg = delta_m2 * w_mult[width_neg - 1]; for (int j = 0; j < width_neg; j++) { pixel *const dst_pix = &dst[(-j - 1) * strideb]; const int diff = (delta_m2_neg * (width_neg - j) + (1 << 10)) >> 11; *dst_pix = iclip(*dst_pix + diff, 0, BITDEPTH_MAX); } } if (!pos_lossless) { int delta_m2_pos = delta_m2 * w_mult[width_pos - 1]; for (int j = 0; j < width_pos; j++) { pixel *const dst_pix = &dst[j * strideb]; const int diff = (delta_m2_pos * (width_pos - j) + (1 << 10)) >> 11; *dst_pix = iclip(*dst_pix - diff, 0, BITDEPTH_MAX); } } } } static void deblock_h_sb64y_c(pixel *dst, const ptrdiff_t stride, const uint16_t *const vmask, const uint16_t *const ll_mask, const pixel *q_thr, const pixel *side_thr, const int edge, const int h HIGHBD_DECL_SUFFIX) { const unsigned vm = vmask[0] | vmask[1] | vmask[2] | vmask[3]; for (unsigned y = 1; vm & ~(y - 1); y <<= 1, dst += 4 * PXSTRIDE(stride), q_thr++, side_thr++) { if (vm & y) { const int idx = (vmask[3] & y) ? 3 : (vmask[2] & y) ? 2 : !!(vmask[1] & y); const int max_width_pos = max_width_y[idx]; const int max_width_neg = edge ? imin(6, max_width_pos) : max_width_pos; const int pos_lossless = !!(ll_mask[1] & y); const int neg_lossless = !!(ll_mask[0] & y); deblock(dst, *q_thr, *side_thr, PXSTRIDE(stride), 1, max_width_pos, max_width_neg, pos_lossless, neg_lossless HIGHBD_TAIL_SUFFIX); } } } static void deblock_v_sb64y_c(pixel *dst, const ptrdiff_t stride, const uint16_t *const vmask, const uint16_t *const ll_mask, const pixel *q_thr, const pixel *side_thr, const int edge, const int w HIGHBD_DECL_SUFFIX) { const unsigned vm = vmask[0] | vmask[1] | vmask[2] | vmask[3]; for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, q_thr++, side_thr++) { if (vm & x) { const int idx = (vmask[3] & x) ? 3 : (vmask[2] & x) ? 2 : !!(vmask[1] & x); const int max_width_pos = max_width_y[idx]; const int max_width_neg = edge ? 
imin(6, max_width_pos) : max_width_pos; const int pos_lossless = !!(ll_mask[1] & x); const int neg_lossless = !!(ll_mask[0] & x); deblock(dst, *q_thr, *side_thr, 1, PXSTRIDE(stride), max_width_pos, max_width_neg, pos_lossless, neg_lossless HIGHBD_TAIL_SUFFIX); } } } static void deblock_h_sb64uv_c(pixel *dst, const ptrdiff_t stride, const uint16_t *const vmask, const uint16_t *const ll_mask, const pixel *q_thr, const pixel *side_thr, const int edge, const int h HIGHBD_DECL_SUFFIX) { const unsigned vm = vmask[0] | vmask[1] | vmask[2]; for (unsigned y = 1; vm & ~(y - 1); y <<= 1, dst += 4 * PXSTRIDE(stride), q_thr++, side_thr++) { if (vm & y) { const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y); const int max_width_pos = max_width_uv[idx]; const int max_width_neg = edge ? imin(2, max_width_pos) : max_width_pos; const int pos_lossless = !!(ll_mask[1] & y); const int neg_lossless = !!(ll_mask[0] & y); deblock(dst, *q_thr, *side_thr, PXSTRIDE(stride), 1, max_width_pos, max_width_neg, pos_lossless, neg_lossless HIGHBD_TAIL_SUFFIX); } } } static void deblock_v_sb64uv_c(pixel *dst, const ptrdiff_t stride, const uint16_t *const vmask, const uint16_t *const ll_mask, const pixel *q_thr, const pixel *side_thr, const int edge, const int w HIGHBD_DECL_SUFFIX) { const unsigned vm = vmask[0] | vmask[1] | vmask[2]; for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, q_thr++, side_thr++) { if (vm & x) { const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x); const int max_width_pos = max_width_uv[idx]; const int max_width_neg = edge ? 
imin(2, max_width_pos) : max_width_pos; const int pos_lossless = !!(ll_mask[1] & x); const int neg_lossless = !!(ll_mask[0] & x); deblock(dst, *q_thr, *side_thr, 1, PXSTRIDE(stride), max_width_pos, max_width_neg, pos_lossless, neg_lossless HIGHBD_TAIL_SUFFIX); } } } #if HAVE_ASM #if ARCH_X86 #include "src/x86/deblock.h" #endif #endif COLD void bitfn(dav2d_deblock_dsp_init)(Dav2dDeblockDSPContext *const c) { c->deblock_sb[0][0] = deblock_h_sb64y_c; c->deblock_sb[0][1] = deblock_v_sb64y_c; c->deblock_sb[1][0] = deblock_h_sb64uv_c; c->deblock_sb[1][1] = deblock_v_sb64uv_c; #if HAVE_ASM #if ARCH_X86 deblock_dsp_init_x86(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/debug.h000066400000000000000000000037561517466257200221050ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
#ifndef DAV2D_SRC_DEBUG_H
#define DAV2D_SRC_DEBUG_H

/* Master switch for per-block debug tracing; set to 1 to enable. */
#define DEBUG_BLOCK_INFO 0

#if DEBUG_BLOCK_INFO
/* Expands to its arguments followed by a comma, so that an extra leading
 * parameter/argument (e.g. `depth`) exists only in debug builds. */
#define DB_ONLY(...) __VA_ARGS__,
/* True for blocks of the frame with offset 0 whose 4x4 coordinates lie in
 * [0, 16) in both directions. Arguments and the whole expansion are
 * parenthesized so the macro composes safely with !, ||, ?: etc. */
#define BLOCK_TO_DEBUG_S(frame_offset, by, bx) \
    ((frame_offset) == 0 && (by) >= 0 && (by) < 16 && (bx) >= 0 && (bx) < 16)
/* Same test applied to the current block; requires `f` and `t` in scope. */
#define BLOCK_TO_DEBUG BLOCK_TO_DEBUG_S(f->frame_hdr->frame_offset, t->by, t->bx)
/* printf() restricted to traced blocks. Wrapped in do/while(0) so that it is
 * a single statement, exactly like the disabled variant below: safe in an
 * unbraced if/else and always requires the trailing semicolon. */
#define DEBUG_BLOCK_printf(...) \
    do { \
        if (BLOCK_TO_DEBUG) { \
            printf(__VA_ARGS__); \
        } \
    } while (0)
#else
#define DB_ONLY(...)
#define BLOCK_TO_DEBUG 0
#define DEBUG_BLOCK_printf(...) do { } while (0)
#endif

/* Switch for pixel-level debug output; consumers are outside this header. */
#define DEBUG_B_PIXELS 0

#endif /* DAV2D_SRC_DEBUG_H */
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #include #include "dav2d/data.h" #include "common/frame.h" #include "common/intops.h" #include "src/ctx.h" #include "src/decode.h" #include "src/env.h" #include "src/filmgrain.h" #include "src/log.h" #include "src/quantizer.h" #include "src/recon.h" #include "src/ref.h" #include "src/tables.h" #include "src/thread_task.h" #include "src/warpmv.h" #include "src/wedge.h" static void init_quant_tables(const Dav2dFrameHeader *const frame_hdr, const int qidx, uint32_t (*dq)[3][2]) { // and then ac == dc for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) { const int yac = frame_hdr->segmentation.enabled ? 
qidx + frame_hdr->segmentation.d.delta_q[i] : qidx; const int ydc = yac + frame_hdr->quant.ydc_delta; const int uac = yac + frame_hdr->quant.uac_delta; const int udc = yac + frame_hdr->quant.udc_delta; const int vac = yac + frame_hdr->quant.vac_delta; const int vdc = yac + frame_hdr->quant.vdc_delta; dq[i][0][0] = dav2d_dq_lookup(ydc); dq[i][0][1] = dav2d_dq_lookup(yac); dq[i][1][0] = dav2d_dq_lookup(udc); dq[i][1][1] = dav2d_dq_lookup(uac); dq[i][2][0] = dav2d_dq_lookup(vdc); dq[i][2][1] = dav2d_dq_lookup(vac); } } static inline void init_wiener(Dav2dFrameContext *const f) { const enum Dav2dRestorationType type = f->frame_hdr->restoration.p[0].type; if (type == DAV2D_RESTORATION_NONE) return; int qidx = f->frame_hdr->quant.yac; f->lf.base_q = dav2d_dq_lookup(f->frame_hdr->quant.yac); int idx = 3; if (qidx < 130) { idx = 0; } else if (qidx < 190) { idx = 1; } else if (qidx < 220) { idx = 2; } if (type == DAV2D_RESTORATION_NS_WIENER || type == DAV2D_RESTORATION_SWITCHABLE) { int num_classes_idx = f->frame_hdr->restoration.p[0].ns.num_classes_idx; if (num_classes_idx) f->lf.ns_subclass_lut = dav2d_pc_wiener_sub_classify_ns[idx][num_classes_idx - 1]; } if (type == DAV2D_RESTORATION_PC_WIENER || type == DAV2D_RESTORATION_SWITCHABLE) { f->lf.pc_subclass_lut = dav2d_pc_wiener_sub_classify[idx]; f->lf.pc_filters = dav2d_pc_wiener_filters[idx]; } } static inline void read_amvd(Dav2dTileState *const ts, mv *const mv) { const int joint = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.amvd_joint, 3); if (!joint) { mv->n = 0; return; } if (joint & 2) { const int s = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.amvd_index[0], 7); mv->y = s < 3 ? 2 + s * 2 : 1 << s; } else mv->y = 0; if (joint & 1) { const int s = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.amvd_index[1], 7); mv->x = s < 3 ? 
2 + s * 2 : 1 << s; } else mv->x = 0; } // mv_prec=0..6 for {8,4,2,f,h,q,e}pel static inline void read_mv_residual(Dav2dTileState *const ts, CdfMvContext *const cdf_mv, mv *const mv, const int mv_prec) { int sh_class; const int n_syms = 9 + mv_prec, h_syms = n_syms >> 1; if (dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->shell_set)) { const int h_syms2 = n_syms - h_syms; sh_class = h_syms + 1 + dav2d_msac_decode_symbol_adapt8(&ts->msac, cdf_mv->shell_upper[mv_prec], imin(h_syms2, 7)); if (mv_prec + sh_class == 21) sh_class += dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.mv.shell_tip); } else { sh_class = dav2d_msac_decode_symbol_adapt8(&ts->msac, cdf_mv->shell_lower[mv_prec], h_syms); } int sh_index; if (sh_class < 2) { sh_index = dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->shell_offset_low[sh_class]); } else if (sh_class == 2) { sh_index = dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->shell_offset_cl2); if (sh_index) { sh_index += dav2d_msac_decode_bool_bypass(&ts->msac); if (sh_index == 2) sh_index += dav2d_msac_decode_bool_bypass(&ts->msac); } } else { sh_index = 0; for (int i = 0, m = 1; i < sh_class; i++, m <<= 1) { sh_index |= m * dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->shell_offset_hi[i]); } } if (sh_class) sh_index += 1 << sh_class; if (!sh_index) { mv->n = 0; return; } int pair_index = 0; if (sh_index >= 2) { pair_index = dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->col_component[0]); if (pair_index && sh_index >= 4) { pair_index += dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->col_component[1]); if (pair_index == 2 && sh_index >= 6) pair_index += dav2d_msac_decode_uniform(&ts->msac, (sh_index >> 1) - 1); } } const int sh = 6 - mv_prec; if (pair_index * 2 == sh_index) { mv->x = mv->y = (sh_index >> 1) << sh; } else { const int b = dav2d_msac_decode_bool_adapt(&ts->msac, cdf_mv->col_index[imin(sh_class, 3)]); if (b) { mv->y = pair_index << sh; mv->x = (sh_index - pair_index) << sh; } else { mv->x = pair_index << sh; mv->y = (sh_index 
/*
 * Maps a coded difference index back to a value in [0, max) given a
 * prediction `ref`.
 *
 * Within a window around ref, odd diffs land above ref and even diffs below
 * it, alternating outwards (ref, ref+1, ref-1, ref+2, ...). The window is
 * 2*ref wide when ref lies in the lower half of the range and
 * 2*(max-ref-1) wide otherwise. Diffs beyond the window map straight through
 * (lower half) or mirrored from the top (upper half). ref == 0 is the
 * identity and ref >= max-1 is a pure mirror.
 */
static int neg_deinterleave(int diff, int ref, int max) {
    if (ref == 0)
        return diff;
    if (ref >= max - 1)
        return max - diff - 1;

    const int lower_half = 2 * ref < max;
    const int window = lower_half ? 2 * ref : 2 * (max - ref - 1);

    if (diff > window)
        return lower_half ? diff : max - (diff + 1);

    const int odd = diff & 1;
    const int dist = (diff + odd) >> 1;
    return odd ? ref + dist : ref - dist;
}
This block will // instead be handled later on as a top/left candidate. int off = bs(r2)[0] <= r2->ox4 + init_odd; // at the top/right edge, we want to include blocks if they // intersect with us. Otherwise, they will be handled as top/right // candidates further down. const int tr_ext = (t->bx + bw4) & (f->sb_step - 1) && (ra[(t->bx + bw4) >> 1].ox4 || init_odd); do { const int off8 = (t->bx + off) >> 1, odd = (t->bx + off) & 1; const int ioff = off - ra[off8].ox4 - odd; add_sample(ioff, 0, 1, -1, &ra[off8]); // the +1 prevents us from re-iterating over the same block // multiple times. Since we round down in the indexing a few // lines up and offset from the actual candidate position here, // this happens to work. off = ioff + bs(&ra[off8])[0] + 1; } while (off < w4 + tr_ext && np < 8); have_topright = 1; } have_topright &= bw4 <= 16 && t->bx + bw4 + !is_not_sb_boundary < t->ts->tiling.col_end && (!(t->by & (f->sb_step - 1)) || // top sb boundary ((t->bx + bw4) & (f->sb_step - 1) && // right sb boundary ra[(t->bx + bw4) & 127].mv[0].y != INVALID_MV)); } if (np < 8 && have_left) { const refmvs_block *r2 = &r[(t->bx - 1) & 127]; int off = -r2->oy4; have_topleft &= !off; do { add_sample(0, off, -1, 1, &r2[off * 128]); off += bs(&r2[off * 128])[1]; } while (off < h4 && np < 8); } else have_topleft = 0; if (is_not_sb_boundary) { if (np < 8 && have_topleft) // top/left add_sample(0, 0, -1, -1, &ra[((t->bx - 1) & 127)]); if (np < 8 && have_topright) // top/right add_sample(bw4, 0, 1, -1, &ra[((t->bx + bw4) & 127)]); } else { if (np < 8 && have_topleft) { // top/left const refmvs_block *const r2 = t->bx & (f->sb_step - 1) ? 
/*
 * Derives the warp parameters of the current block by extending the motion
 * model of one neighbouring block.
 *
 * t            - task context; supplies the block position (bx, by) and the
 *                neighbour storage (t->rt)
 * x_off, y_off - offset of the neighbour relative to (bx, by). x_off >= 0
 *                requires y_off == -1 (asserted); otherwise either
 *                x_off == -1 or the block sits on a superblock row boundary
 *                (asserted).
 * b_dim        - dimension entry of the current block; [0]/[1] are used as
 *                width/height in 4px units, [2]/[3] enter the shift amounts
 *                (presumably the log2 sizes - TODO confirm against
 *                dav2d_block_dimensions)
 * b            - current block; its mv[0] and ref.ref[0] are read
 * wmp          - output; matrix[] and type are overwritten
 */
static void extend_warpmv(Dav2dTaskContext *const t,
                          const int x_off, const int y_off,
                          const uint8_t *const b_dim,
                          const Av2Block *const b,
                          Dav2dWarpedMotionParams *const wmp)
{
    const Dav2dFrameContext *const f = t->f;
    // Locate the neighbour. A row -1 neighbour of a block on a superblock row
    // boundary is fetched from the separate above-row storage (rt.ra, indexed
    // at half resolution, or rt.ra_tl for the top-left corner of the
    // superblock); everything else comes from the 64-row x 128-column ring
    // in rt.r.
    const refmvs_block *const r = y_off == -1 && !(t->by & (f->sb_step - 1)) ?
        x_off < 0 && !(t->bx & (f->sb_step - 1)) ? &t->rt.ra_tl :
                                                   &t->rt.ra[(t->bx + x_off) >> 1] :
        &t->rt.r[((t->by + y_off) & 63) * 128 + ((t->bx + x_off) & 127)];
    int32_t *const m = wmp->matrix;
    // Seed the matrix from the neighbour's motion model.
    if (r->mf & 2) {
        // neighbour stores its own matrix; fall back to the default
        // parameters if that matrix was marked invalid
        if (r->warp_type == DAV2D_WM_TYPE_INVALID)
            memcpy(m, &dav2d_default_wm_params.matrix, sizeof(*m) * 6);
        else
            memcpy(m, r->m, sizeof(*m) * 6);
    } else if (r->mf & 1) {
        // neighbour flagged with mf bit 0: use the frame-level global motion
        // matrix of the current block's reference
        memcpy(m, t->f->frame_hdr->gmv.m[b->ref.ref[0]].matrix, sizeof(*m) * 6);
    } else {
        // plain translation: default non-translational part, offset taken
        // from the neighbour's MV (second MV if its first reference is not
        // the one used by the current block)
        memcpy(&m[2], &dav2d_default_wm_params.matrix[2], sizeof(*m) * 4);
        const int ref = r->ref.ref[0] != b->ref.ref[0];
        m[0] = r->mv[ref].x * (1 << 13);
        m[1] = r->mv[ref].y * (1 << 13);
    }

    // extend warpmv using (quasi-)matrix from neighbour
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    // sample point (sx, sy) inside the current block, and that point
    // displaced by the block's own MV, both scaled by 1 << 16
    const int sx = t->bx * 4 + 2 * bw4 - 1, sy = t->by * 4 + 2 * bh4 - 1;
    const int64_t px = ((int64_t) sx << 16) + b->mv[0].x * (1 << 13);
    const int64_t py = ((int64_t) sy << 16) + b->mv[0].y * (1 << 13);
    if (x_off >= 0) {
        assert(y_off == -1);
        // Neighbour is in the row above: evaluate its model at (sx, ay) on
        // the row just above the block and re-derive the y-dependent
        // coefficients m[3]/m[5] from the remaining difference.
        const int ay = t->by * 4 - 1, sh = 1 + b_dim[3];
        const int64_t apx = (int64_t) m[2] * sx + (int64_t) m[3] * ay + m[0];
        const int64_t apy = (int64_t) m[4] * sx + (int64_t) m[5] * ay + m[1];
        const int m3 = (int) ((px - apx + bh4 - (px < apx)) >> sh);
        const int m5 = (int) ((py - apy + bh4 - (py < apy)) >> sh);
        // round to a multiple of 64 and clamp; the diagonal term m[5] is
        // rounded/clamped around 0x10000 instead of around 0
        m[3] = iclip((m3 + 0x20 - (m3 < 0)) & ~0x3f, -0x7fc0, 0x7fc0);
        m[5] = iclip((m5 + 0x20 - (m5 < 0x10000)) & ~0x3f, 0x8040, 0x17fc0);
    } else {
        assert(x_off == -1 || !(t->by & (t->f->sb_step - 1)));
        // Neighbour is to the left: evaluate its model at (ax, sy) on the
        // column just left of the block and re-derive the x-dependent
        // coefficients m[2]/m[4].
        const int ax = t->bx * 4 - 1, sh = 1 + b_dim[2];
        const int64_t lpx = (int64_t) m[2] * ax + (int64_t) m[3] * sy + m[0];
        const int64_t lpy = (int64_t) m[4] * ax + (int64_t) m[5] * sy + m[1];
        const int m2 = (int) ((px - lpx + bw4 - (px < lpx)) >> sh);
        const int m4 = (int) ((py - lpy + bw4 - (py < lpy)) >> sh);
        // same rounding/clamping as above, with m[2] as the diagonal term
        m[2] = iclip((m2 + 0x20 - (m2 < 0x10000)) & ~0x3f, 0x8040, 0x17fc0);
        m[4] = iclip((m4 + 0x20 - (m4 < 0)) & ~0x3f, -0x7fc0, 0x7fc0);
    }

    // let the helper finish the matrix from the block's MV and position, then
    // classify it; a failing shear-parameter derivation marks it invalid
    dav2d_set_affine_mv2d(bw4, bh4, b->mv[0], wmp, t->bx, t->by);
    wmp->type = dav2d_get_shear_params(wmp) ? DAV2D_WM_TYPE_INVALID : warp_type(m);
}
prev_h : v - (v <= prev_h); } } ptrdiff_t off = strides[0]; for (int n = 1; n < lim1; n++, off += strides[0]) { copy = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.pal_idx_identity[copy], 2); if (copy == 2) { // FIXME if dir=0, maybe use memcpy()? for (int m = 0; m < lim2; m++) pal_idx[off + m * strides[1]] = pal_idx[off - strides[0] + m * strides[1]]; } else { const int v = dav2d_msac_decode_symbol_adapt8(&ts->msac, pal_cdf[0], pal_sz - 1); const int next_v = pal_idx[off] = !v ? prev_v : v - (v <= prev_v); if (copy == 1) { // FIXME if dir=0, maybe use memset()? for (int m = 1; m < lim2; m++) pal_idx[off + m * strides[1]] = next_v; } else { int prev_tl = prev_v, prev_l = next_v; for (int m = 1; m < lim2; m++) { int prev_t = pal_idx[off - strides[0] + m * strides[1]]; int ctx; if (prev_t == prev_l) { ctx = 3 + (prev_tl == prev_l); } else { ctx = 1 + (prev_t == prev_tl || prev_l == prev_tl); } const int v = dav2d_msac_decode_symbol_adapt8(&ts->msac, pal_cdf[ctx], pal_sz - 1); int p; switch (ctx) { default: assert(0); case 1: { switch (v) { case 0: case 1: p = v == dir ? prev_l : prev_t; break; case 2: p = prev_tl; break; default: { const int s1 = prev_l < prev_t; const int s2 = prev_l < prev_tl; const int s3 = prev_t < prev_tl; p = v - (v <= prev_l + s1 + s2) - (v <= prev_t + s3 + !s1) - (v <= prev_tl + !s2 + !s3); break; }} break; } case 2: { const int prev_l_or_t = prev_l + prev_t - prev_tl; switch (v) { case 0: p = prev_tl; break; case 1: p = prev_l_or_t; break; default: { const int s = prev_l_or_t < prev_tl; p = v - (v <= prev_l_or_t + s) - (v <= prev_tl + !s); break; }} break; } case 3: { switch (v) { case 0: p = prev_l; break; case 1: p = prev_tl; break; default: { const int s = prev_l < prev_tl; p = v - (v <= prev_l + s) - (v <= prev_tl + !s); break; }} break; } case 4: p = !v ? 
prev_l : v - (v <= prev_l); break; } prev_l = pal_idx[off + m * strides[1]] = p; prev_tl = prev_t; } } prev_v = next_v; } } t->c->pal_dsp.pal_idx_finish(pal_out, pal_idx, sz[2], sz[3], sz[0], sz[1]); return 0; } static inline unsigned get_prev_frame_segid(const Dav2dFrameContext *const f, const int by, const int bx, const int w4, int h4, const uint8_t *ref_seg_map, const ptrdiff_t stride) { assert(f->frame_hdr->primary_ref_frame != DAV2D_PRIMARY_REF_NONE); unsigned seg_id = 8; ref_seg_map += by * stride + bx; do { for (int x = 0; x < w4; x++) seg_id = imin(seg_id, ref_seg_map[x]); ref_seg_map += stride; } while (--h4 > 0 && seg_id); assert(seg_id < 8); return seg_id; } #if DEBUG_BLOCK_INFO static void debug_warp_matrix(const int depth, const Dav2dFrameContext *const f, const Dav2dTaskContext *const t, const Av2Block *const b, const int r) { #define signabs(v) v < 0 ? '-' : ' ', abs(v) DEBUG_BLOCK_printf("%*s[ %c%x, %c%x | %c%x, %c%x, %c%x, %c%x ],t=%d " "mv=y:%d,x:%d\n", depth, "", signabs(t->warpmv[r].matrix[0]), signabs(t->warpmv[r].matrix[1]), signabs(t->warpmv[r].matrix[2]), signabs(t->warpmv[r].matrix[3]), signabs(t->warpmv[r].matrix[4]), signabs(t->warpmv[r].matrix[5]), t->warpmv[r].type, b->mv[r].y, b->mv[r].x); #undef signabs } #else #define debug_warp_matrix(...) #endif static inline void splat_oneref_mv(DB_ONLY(const int depth) const Dav2dFrameContext *const f, Dav2dTaskContext *const t, const enum BlockSize bs, const Av2Block *const b, const int by4, const int bw4, const int bh4) { refmvs_block *const s_dst = &t->rt.r[by4 * 128 + (t->bx & 127)]; refmvs_block s_src; const ptrdiff_t t_stride = f->rf.rp_stride; refmvs_temporal_block *const t_dst = f->seq_hdr->ref_frame_mvs && b->ref.ref[0] != TIP_FRAME ? 
&f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)] : NULL; refmvs_temporal_block t_src; t_src.ref.ref[0] = t_src.ref.ref[1] = s_src.ref.ref[0] = b->ref.ref[0]; s_src.ref.ref[1] = -1; s_src.mv[1].y = INVALID_MV; s_src.bs = bs; s_src.subpel_filter = b->filter; if (b->motion_mode > MM_INTERINTRA || (b->inter_mode == GLOBALMV && imin(bw4, bh4) > 1 && f->frame_hdr->gmv.m[b->ref.ref[0]].type > DAV2D_WM_TYPE_TRANSLATION)) { assert(bw4 > 1 && bh4 > 1); const Dav2dWarpedMotionParams *wm; if (b->motion_mode > MM_INTERINTRA) { s_src.lmv[0] = b->mv[0]; s_src.lmv[1].y = INVALID_MV; s_src.mf = 2; wm = &t->warpmv[0]; memcpy(s_src.m, wm->matrix, sizeof(int32_t) * 6); s_src.warp_type = wm->type; } else { s_src.mv[0] = b->mv[0]; s_src.mf = 1; wm = &f->frame_hdr->gmv.m[b->ref.ref[0]]; } const int32_t *const mat = wm->matrix; const int64_t mvx = (int64_t) (mat[2] - 0x10000) * (t->bx + 1) * 4 + (int64_t) mat[3] * (t->by + 1) * 4 + mat[0]; const int64_t mvy = (int64_t) mat[4] * (t->bx + 1) * 4 + mat[1] + (int64_t) (mat[5] - 0x10000) * (t->by + 1) * 4; f->c->refmvs_dsp.splat_warpmv(s_dst, &s_src, t_dst, t_stride, &t_src, mvy, mvx, wm, bw4, bh4); } else { s_src.mv[0] = b->mv[0]; s_src.mf = b->inter_mode == GLOBALMV && imin(bw4, bh4) > 1; // this is invalid for TIP, but that will be overwritten in tip_pred() t_src.mv.mv[0] = t_src.mv.mv[1] = quantize_mv(b->mv[0]); if (t_src.mv.mv[0].n == INVALID_TRAJ) t_src.ref.pair = -1; f->c->refmvs_dsp.splat_mv(s_dst, &s_src, t_dst, t_stride, &t_src, bw4, bh4); } } static inline void splat_intrabc_mv(DB_ONLY(const int depth) const Dav2dFrameContext *const f, Dav2dTaskContext *const t, const enum BlockSize bs, const Av2Block *const b, const int by4, const int bw4, const int bh4) { refmvs_block *const s_dst = &t->rt.r[by4 * 128 + (t->bx & 127)]; refmvs_block s_src = (refmvs_block) { .ref.pair = -1, .mv[0] = b->mv[0], .mv[1].y = INVALID_MV, .bs = bs, .mf = 0, }; const ptrdiff_t t_stride = f->rf.rp_stride; refmvs_temporal_block *const t_dst = 
f->seq_hdr->ref_frame_mvs ? &f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)] : NULL; refmvs_temporal_block t_src = { .ref.pair = -1, .mv.n = INVALID_TRAJ * 0x10001U, }; f->c->refmvs_dsp.splat_mv(s_dst, &s_src, t_dst, t_stride, &t_src, bw4, bh4); if (t->f->seq_hdr->refmv_bank) dav2d_refmvs_bank_add(&t->rt, bs, t->by, t->bx, b); } static inline void splat_tworef_mv(DB_ONLY(const int depth) const Dav2dFrameContext *const f, Dav2dTaskContext *const t, const enum BlockSize bs, const Av2Block *const b, const int by4, const int bw4, const int bh4) { refmvs_block *const s_dst = &t->rt.r[by4 * 128 + (t->bx & 127)]; refmvs_block s_src; const int t_swap = !!(f->rf.ref_flip & (1ULL << (b->ref.ref[0] * 8 + b->ref.ref[1]))); const ptrdiff_t t_stride = f->rf.rp_stride; const int opfl = b->inter_mode >= OPFL_NEARMV_NEARMV; const int refinemv = b->refine_mv && b->comp_type == COMP_INTER_AVG; refmvs_temporal_block *t_dst = f->seq_hdr->ref_frame_mvs && (!opfl || !refinemv) ? &f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)] : NULL; refmvs_temporal_block t_src; s_src.ref.ref[0] = t_src.ref.ref[t_swap] = b->ref.ref[0]; s_src.ref.ref[1] = t_src.ref.ref[!t_swap] = b->ref.ref[1]; s_src.bs = bs; s_src.subpel_filter = b->filter; s_src.mf = b->cwp_idx << 2; const uint8_t *const mask = b->comp_type == COMP_INTER_WEDGE ? 
WEDGE_TMVP(bs, bw4, bh4, b->wedge_idx) : NULL; if (b->motion_mode > MM_INTERINTRA || (b->inter_mode == GLOBALMV_GLOBALMV && imin(bw4, bh4) > 1 && (f->frame_hdr->gmv.m[b->ref.ref[0]].type > DAV2D_WM_TYPE_TRANSLATION || f->frame_hdr->gmv.m[b->ref.ref[1]].type > DAV2D_WM_TYPE_TRANSLATION))) { assert(bw4 > 1 && bh4 > 1); const Dav2dWarpedMotionParams *wm1, *wm2; if (b->motion_mode > MM_INTERINTRA) { COPY2MV(s_src.lmv, b->mv); s_src.mf |= 2; wm1 = &t->warpmv[0]; wm2 = &t->warpmv[1]; memcpy(s_src.m, wm1->matrix, sizeof(int32_t) * 6); s_src.warp_type = wm1->type; } else { COPY2MV(s_src.mv, b->mv); s_src.mf |= 1; wm1 = &f->frame_hdr->gmv.m[b->ref.ref[0]]; wm2 = &f->frame_hdr->gmv.m[b->ref.ref[1]]; } const int32_t *const mat1 = wm1->matrix; const int32_t *const mat2 = wm2->matrix; const int64_t mvx1 = (int64_t) (mat1[2] - 0x10000) * (t->bx + 1) * 4 + (int64_t) mat1[3] * (t->by + 1) * 4 + mat1[0]; const int64_t mvy1 = (int64_t) mat1[4] * (t->bx + 1) * 4 + mat1[1] + (int64_t) (mat1[5] - 0x10000) * (t->by + 1) * 4; const int64_t mvx2 = (int64_t) (mat2[2] - 0x10000) * (t->bx + 1) * 4 + (int64_t) mat2[3] * (t->by + 1) * 4 + mat2[0]; const int64_t mvy2 = (int64_t) mat2[4] * (t->bx + 1) * 4 + mat2[1] + (int64_t) (mat2[5] - 0x10000) * (t->by + 1) * 4; // FIXME for compound-warp_causal-newmv^2, do we need a 2nd matrix? 
f->c->refmvs_dsp.splat_comp_warpmv(s_dst, &s_src, t_dst, t_stride, &t_src, mvy1, mvx1, mvy2, mvx2, wm1, wm2, bw4, bh4, t_swap, mask, b->wedge_sign ^ t_swap); } else { COPY2MV(s_src.mv, b->mv); s_src.mf |= b->inter_mode == GLOBALMV_GLOBALMV && imin(bw4, bh4) > 1; t_src.mv.mv[0] = quantize_mv(b->mv[t_swap]); t_src.mv.mv[1] = quantize_mv(b->mv[!t_swap]); if (!mask) { if (t_src.mv.mv[0].n == INVALID_TRAJ) { if (t_src.mv.mv[1].n == INVALID_TRAJ) { t_src.ref.pair = -1; } else { t_src.mv.mv[0] = t_src.mv.mv[1]; t_src.ref.ref[0] = t_src.ref.ref[1]; } } else if (t_src.mv.mv[1].n == INVALID_TRAJ) { t_src.mv.mv[1] = t_src.mv.mv[0]; t_src.ref.ref[1] = t_src.ref.ref[0]; } f->c->refmvs_dsp.splat_mv(s_dst, &s_src, t_dst, t_stride, &t_src, bw4, bh4); } else { f->c->refmvs_dsp.splat_comp_wedgemv(s_dst, &s_src, t_dst, t_stride, &t_src, bw4, bh4, mask, b->wedge_sign ^ t_swap); } } } static inline void splat_intraref(const Dav2dContext *const c, const Dav2dFrameContext *const f, Dav2dTaskContext *const t, const enum BlockSize bs, const int by4, const int bw4, const int bh4) { refmvs_block *const s_dst = &t->rt.r[by4 * 128 + (t->bx & 127)]; refmvs_block s_src = (refmvs_block) { .ref.pair = -1, .mv[0].y = INVALID_MV, .mv[1].y = INVALID_MV, .bs = bs, .mf = 0, }; const ptrdiff_t t_stride = f->rf.rp_stride; refmvs_temporal_block *const t_dst = f->seq_hdr->ref_frame_mvs ? 
&f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)] : NULL; refmvs_temporal_block t_src = { .ref.pair = -1, .mv.n = INVALID_TRAJ * 0x10001U, }; c->refmvs_dsp.splat_mv(s_dst, &s_src, t_dst, t_stride, &t_src, bw4, bh4); if (t->f->seq_hdr->refmv_bank) dav2d_refmvs_bank_update(&t->rt, bs, t->by, t->bx); } static void mc_lowest_px(int *const dst, const int by4, const int bh4, const int mvy, const int ss_ver, const struct ScalableMotionParams *const smp) { const int v_mul = 4 >> ss_ver; if (!smp->scale) { const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver); *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy); } else { int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver); const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8; y = apply_sign64((llabs(tmp) + 128) >> 8, tmp) + 32; const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4; *dst = imax(*dst, bottom); } } static ALWAYS_INLINE void affine_lowest_px(int *const dst, const uint8_t *const b_dim, const int by, const int bx, const Dav2dWarpedMotionParams *const wmp, const int ss_ver, const int ss_hor) { const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; const int32_t *const mat = wmp->matrix; const int y = b_dim[1] * v_mul - 8; // lowest y const int src_y = by * 4 + ((y + 4) << ss_ver); const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; // check left- and right-most blocks for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) { // calculate transformation relative to center of 8x8 block in // luma pixel units const int src_x = bx * 4 + ((x + 4) << ss_hor); const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int dy = (int) (mvy >> 16) - 4; *dst = imax(*dst, dy + 4 + 8); } } static inline void affine_lowest_px_luma(Dav2dTaskContext *const t, int *const dst, const uint8_t *const b_dim, const Dav2dWarpedMotionParams *const wmp) { affine_lowest_px(dst, b_dim, t->by, t->bx, wmp, 0, 0); } static inline void 
affine_lowest_px_chroma(Dav2dTaskContext *const t, int *const dst, const uint8_t *const b_dim, const Dav2dWarpedMotionParams *const wmp) { const Dav2dFrameContext *const f = t->f; affine_lowest_px(dst, b_dim, t->cby, t->cbx, wmp, f->ss_ver, f->ss_hor); } static void derive_lowest_px(Dav2dTaskContext *const t, const enum BlockSize lbs, const enum BlockSize cbs, Av2Block *const b) { const Dav2dFrameContext *const f = t->f; Dav2dTileState *const ts = t->ts; const int has_chroma = cbs != BS_INVALID; // keep track of motion vectors for each reference const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; int (*const lowest_px)[2] = ts->lowest_pixel[sby]; const uint8_t *const b_dim = dav2d_block_dimensions[lbs]; const int bw4 = b_dim[0], bh4 = b_dim[1]; const int has_sub8x8_chroma = has_chroma && cbs != lbs && imin(bw4, bh4) < 16; const int has_regular_chroma = has_chroma && !has_sub8x8_chroma; const int ss_ver = f->ss_ver; int cbw4, cbh4; const uint8_t *c_b_dim; if (has_chroma) { c_b_dim = dav2d_block_dimensions[cbs]; cbw4 = c_b_dim[0]; cbh4 = c_b_dim[1]; } // y assert(lbs != BS_INVALID); if (b->ref.ref[1] == -1 && b->ref.ref[0] != TIP_FRAME) { if (!f->frame_hdr->force_integer_mv && ((b->inter_mode == GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[0]]) || (b->motion_mode >= MM_WARP_CAUSAL && t->warpmv[0].type > DAV2D_WM_TYPE_INVALID))) { affine_lowest_px_luma(t, &lowest_px[b->ref.ref[0]][0], b_dim, b->motion_mode >= MM_WARP_CAUSAL ? &t->warpmv[0] : &f->frame_hdr->gmv.m[b->ref.ref[0]]); if (has_regular_chroma) affine_lowest_px_chroma(t, &lowest_px[b->ref.ref[0]][1], c_b_dim, b->motion_mode >= MM_WARP_CAUSAL ? 
&t->warpmv[0] : &f->frame_hdr->gmv.m[b->ref.ref[0]]); } else { mc_lowest_px(&lowest_px[b->ref.ref[0]][0], t->by, bh4, b->mv[0].y, 0, &f->svc[b->ref.ref[0]][1]); if (has_regular_chroma) mc_lowest_px(&lowest_px[b->ref.ref[0]][1], t->cby, cbh4, b->mv[0].y, ss_ver, &f->svc[b->ref.ref[0]][1]); } if (b->motion_mode > MM_INTERINTRA) { // backup warpmatrix memcpy(b->mtxbak, t->warpmv[0].matrix, sizeof(int32_t) * 6); } } else if (b->ref.ref[0] == TIP_FRAME) { const int h4 = imin(bh4, f->bh - t->cby); const int w4 = imin(bw4, f->bw - t->cbx); int opfl = f->seq_hdr->tip_refine_mv && (f->frame_hdr->tip.frame_mode == 1 || f->frame_hdr->tip.subpel_filter == DAV2D_FILTER_8TAP_SHARP); const union refpair ref = f->rf.tip.ref; const int refine = opfl && f->frame_hdr->tip.frame_mode == 1 && f->refdist[ref.ref[0]] == -f->refdist[ref.ref[1]]; const int step = 2 << (f->frame_hdr->tip.frame_mode == 2 /* frame */ ? !opfl : ((!opfl && imin(bw4, bh4) >= 4) || b->bs == BS_256x256)); opfl &= !!f->seq_hdr->opfl_refine && f->frame_hdr->has_bothside_refs; const ptrdiff_t t_stride = f->rf.rp_stride; for (int y = 0; y < h4; y += step) { const ptrdiff_t off_y8 = (((t->by + y) & (f->sb_step - 1)) >> 1) * t_stride; for (int x = 0; x < w4; x += step) { const ptrdiff_t off_8x8 = off_y8 + ((t->bx + x) >> 1); mv tmv = t->rt.rp_proj[off_8x8].mv; if (tmv.y == INVALID_MV) tmv.n = 0; for (int i = 0; i < 2; i++) { const int64_t tmpy = tmv.y * (int64_t) f->rf.tip.sf[i]; const int tipmvy = iclip((int)((tmpy + 0x2000 - (tmpy < 0)) >> 14), -0xffff, 0xffff); const int mvy = iclip(tipmvy + b->mv[0].y, -0xffff, 0xffff); mc_lowest_px(&lowest_px[ref.ref[i]][0], t->by + y, step, mvy | opfl | refine, 0, &f->svc[ref.ref[i]][1]); if (has_regular_chroma) mc_lowest_px(&lowest_px[ref.ref[i]][1], t->by + y, step, mvy | opfl | refine, ss_ver, &f->svc[ref.ref[i]][1]); } } } } else if (b->inter_mode >= OPFL_NEARMV_NEARMV || (b->refine_mv && b->comp_type == COMP_INTER_AVG)) { // the "| 1" here is to force the mv into a 
subpel-position and ensure // the 4px subpel filter overhang is applied in mc_lowest_px(). This // matches the 4px overhang that refinemv and/or opfl can generate, // regardless of the input MV. for (int i = 0; i < 2; i++) { mc_lowest_px(&lowest_px[b->ref.ref[i]][0], t->by, bh4, b->mv[i].y | 1, 0, &f->svc[b->ref.ref[i]][1]); if (has_regular_chroma) mc_lowest_px(&lowest_px[b->ref.ref[i]][1], t->cby, cbh4, b->mv[i].y | 1, ss_ver, &f->svc[b->ref.ref[i]][1]); } } else { for (int i = 0; i < 2; i++) { if ((b->inter_mode == GLOBALMV_GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[i]]) || (b->motion_mode == MM_WARP_CAUSAL && t->warpmv[i].type > DAV2D_WM_TYPE_INVALID)) { affine_lowest_px_luma(t, &lowest_px[b->ref.ref[i]][0], b_dim, b->motion_mode == MM_WARP_CAUSAL ? &t->warpmv[i] : &f->frame_hdr->gmv.m[b->ref.ref[i]]); if (has_regular_chroma) affine_lowest_px_chroma(t, &lowest_px[b->ref.ref[i]][1], c_b_dim, b->motion_mode == MM_WARP_CAUSAL ? &t->warpmv[i] : &f->frame_hdr->gmv.m[b->ref.ref[i]]); } else { mc_lowest_px(&lowest_px[b->ref.ref[i]][0], t->by, bh4, b->mv[i].y, 0, &f->svc[b->ref.ref[i]][1]); if (has_regular_chroma) mc_lowest_px(&lowest_px[b->ref.ref[i]][1], t->cby, cbh4, b->mv[i].y, ss_ver, &f->svc[b->ref.ref[i]][1]); } if (b->motion_mode > MM_INTERINTRA) { // backup warpmatrix. We write one of them into our neighbour // block on the right, which is OK since comp-warpmv blocks are // always at least 8px wide, so that entry is unused. assert(bw4 >= 2); memcpy(b[i].mtxbak, t->warpmv[i].matrix, sizeof(int32_t) * 6); } } } if (has_sub8x8_chroma) { const int ch4 = imin(cbh4, f->bh - t->cby); const int cw4 = imin(cbw4, f->bw - t->cbx); const refmvs_block *r = &t->rt.r[(t->cby & 63) * 128 + (t->cbx & 127)]; for (int y = 0; y < ch4; y++, r += 128) { for (int x = 0; x < cw4; x++) { // grab ref/MV/filter from spatial refmvs const refmvs_block *const r2 = &r[x]; if (r2->ox4 || r2->oy4) continue; const union mv mv = r2->mf & 2 ? 
r2->lmv[0] : r2->mv[0];
                const uint8_t *const sdim = dav2d_block_dimensions[r2->bs];
                // NOTE(review): lowest_px is indexed by the neighbour's
                // reference (r2->ref.ref[0]) while the scaling context is
                // taken from the current block (b->ref.ref[0]) - confirm
                // that this is intended.
                mc_lowest_px(&lowest_px[r2->ref.ref[0]][1], t->cby + y,
                             sdim[1], mv.y, ss_ver, &f->svc[b->ref.ref[0]][1]);
            }
        }
    }
}

// Resolves the final motion vector(s) and/or warp matrices of block b,
// registers them through the refmvs bank/warp/splat helpers and then,
// depending on the current pass, either records the lowest referenced pixel
// rows (PASS_MVRES) or hands the block to the bitdepth-specific
// reconstruction function (f->bd_fn.recon_b).
//
// lbs and cbs are the luma and chroma block sizes. lbs == BS_INVALID means
// the block has no luma; cbs is then used as the block size (bs).
//
// Returns 0 after the PASS_MVRES work, otherwise whatever bd_fn.recon_b()
// returns.
static int recon_b(Dav2dTaskContext *const t, DB_ONLY(const int depth)
                   const enum BlockSize lbs, const enum BlockSize cbs,
                   Av2Block *const b)
{
    const Dav2dFrameContext *const f = t->f;

    // Pure reconstruction pass: no MV resolution is done here, only the
    // per-task warp matrices are rebuilt from the backup kept in the block.
    if (t->task_thread.pass == PASS_RECON) {
        if (b->motion_mode > MM_INTERINTRA) {
            // restore warp matrices
            const int comp = b->ref.ref[1] != -1;
            for (int n = 0; n <= comp; n++) {
                // the second matrix of a compound block is read from b[1],
                // i.e. the block entry following b
                memcpy(t->warpmv[n].matrix, b[n].mtxbak, sizeof(int32_t) * 6);
                t->warpmv[n].type = dav2d_get_shear_params(&t->warpmv[n]) ?
                    DAV2D_WM_TYPE_INVALID : warp_type(t->warpmv[n].matrix);
            }
        }
        return f->bd_fn.recon_b(t, DB_ONLY(depth) lbs,
                                (const enum BlockSize[2]){ cbs, cbs }, b);
    }

    const enum BlockSize bs = lbs == BS_INVALID ? cbs : lbs;
    assert(bs != BS_INVALID);
    Dav2dTileState *const ts = t->ts;
    const int have_top = t->by > ts->tiling.row_start;
    const int have_left = t->bx > ts->tiling.col_start;
    const uint8_t *const b_dim = dav2d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    // block dimensions clipped to the frame (f->bw / f->bh)
    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
    const int by4 = t->by & 63;
    const int has_luma = lbs != BS_INVALID;

    // resolve motion vector and/or warp matrix
    if (has_luma && b->intrabc) {
        refmvs_candidate mvstack[6];
        int n_mvs;
        dav2d_refmvs_find(&t->rt, mvstack, NULL, &n_mvs,
                          (union refpair) { .pair = -1 }, bs, 0, t->by, t->bx);
#if DEBUG_BLOCK_INFO
        if (BLOCK_TO_DEBUG) {
            printf("%*sfind_mv_refs(intra)\n", depth, "");
            for (int n = 0; n < n_mvs; n++)
                printf("%*smv[%d/%d]: y=%d,x=%d,w=%d\n", depth + 1, "",
                       n, n_mvs, mvstack[n].mv[0].y, mvstack[n].mv[0].x,
                       mvstack[n].weight);
        }
#endif
        // b->mv[0] as found on entry is kept as 'diff' and added back onto
        // the selected candidate below (unless is_refmv is set)
        const union mv diff = b->mv[0];
        b->mv[0] = mvstack[b->drl_idx[0]].mv[0];
        if (!b->mv[0].n) {
            // I don't know if this can actually happen, but AVM has code here
            // to force the refmv to a nonzero value
            const int sbsz = 64 << f->frame_hdr->sb128;
            if (t->by - f->sb_step < ts->tiling.row_start) {
                b->mv[0].x = -(8 * (sbsz + 256));
            } else {
                b->mv[0].y = -(8 * sbsz);
            }
        }
        if (!b->is_refmv) {
            if (!b->is_qpel) fix_int_mv_precision(&b->mv[0]);
            b->mv[0].x += diff.x;
            b->mv[0].y += diff.y;
        }
        // NOTE(review): the disabled section below uses has_chroma, ss_hor
        // and ss_ver, none of which are declared in the visible part of this
        // function; it would need them before it could be re-enabled.
#if 0
        // clip intrabc motion vector to decoded parts of current tile
        int border_left = ts->tiling.col_start * 4;
        int border_top  = ts->tiling.row_start * 4;
        if (has_chroma) {
            if (bw4 < 2 && ss_hor)
                border_left += 4;
            if (bh4 < 2 && ss_ver)
                border_top  += 4;
        }
        int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
        int src_top    = t->by * 4 + (b->mv[0].y >> 3);
        int src_right  = src_left + bw4 * 4;
        int src_bottom = src_top  + bh4 * 4;
        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;

        // check against left or right tile boundary and adjust if necessary
        if (src_left < border_left) {
            src_right += border_left - src_left;
            src_left  += border_left - src_left;
        } else if (src_right > border_right) {
            src_left  -= src_right - border_right;
            src_right -= src_right - border_right;
        }
        // check against top tile boundary and adjust if necessary
        if (src_top < border_top) {
            src_bottom += border_top - src_top;
            src_top    += border_top - src_top;
        }

        const int sbx = (t->bx >> (4 + f->frame_hdr->sb128)) << (6 + f->frame_hdr->sb128);
        const int sby = (t->by >> (4 + f->frame_hdr->sb128)) << (6 + f->frame_hdr->sb128);
        const int sb_size = 1 << (6 + f->frame_hdr->sb128);
        // check for overlap with current superblock
        if (src_bottom > sby && src_right > sbx) {
            if (src_top - border_top >= src_bottom - sby) {
                // if possible move src up into the previous superblock row
                src_top    -= src_bottom - sby;
                src_bottom -= src_bottom - sby;
            } else if (src_left - border_left >= src_right - sbx) {
                // if possible move src left into the previous superblock
                src_left  -= src_right - sbx;
                src_right -= src_right - sbx;
            }
        }
        // move src up if it is below current superblock row
        if (src_bottom > sby + sb_size) {
            src_top    -= src_bottom - (sby + sb_size);
            src_bottom -= src_bottom - (sby + sb_size);
        }
        // error out if mv still overlaps with the current superblock
        if (src_bottom > sby && src_right > sbx)
            return -1;

        b->mv[0].x = (src_left - t->bx * 4) * 8;
        b->mv[0].y = (src_top  - t->by * 4) * 8;
#endif
        DEBUG_BLOCK_printf("%*sfinal 2dmv: y=%d,x=%d\n",
                           depth, "", b->mv[0].y, b->mv[0].x);
        splat_intrabc_mv(DB_ONLY(depth) f, t, bs, b, by4, bw4, bh4);
    } else if (!b->intra) {
        assert(has_luma);
        if (b->ref.ref[1] == -1) {
            // single reference
            if (b->inter_mode == GLOBALMV) {
                b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv.m[b->ref.ref[0]],
                                      t->bx, t->by, bw4, bh4, f->bw, f->bh,
                                      f->frame_hdr);
            } else {
                refmvs_candidate mvstack[6];
                int32_t warp[4][7];
                int n_mvs[2];
                // the warp candidate list is only requested for non-TIP
                // references with inter_mode > NEWMV
                dav2d_refmvs_find(&t->rt, mvstack,
                                  b->ref.ref[0] != TIP_FRAME && b->inter_mode > NEWMV ?
                                      warp : NULL,
                                  n_mvs,
                                  (union refpair) { .ref = { b->ref.ref[0], -1 }},
                                  bs, 0, t->by, t->bx);
#if DEBUG_BLOCK_INFO
                if (BLOCK_TO_DEBUG) {
                    printf("%*sfind_mv_refs(%d,-1)\n", depth, "", b->ref.ref[0]);
                    for (int n = 0; n < n_mvs[0]; n++)
                        printf("%*smv[%d/%d]: y=%d,x=%d,w=%d,y_off=%d,x_off=%d\n",
                               depth + 1, "", n, n_mvs[0],
                               mvstack[n].mv[0].y, mvstack[n].mv[0].x,
                               mvstack[n].weight, mvstack[n].y_off,
                               mvstack[n].x_off);
                    if (b->ref.ref[0] != TIP_FRAME && b->inter_mode > NEWMV)
                        for (int n = 0; n < n_mvs[1]; n++)
                            printf("%*swarp[%d/%d]: %d, %d, %d, %d, %d, %d, t=%d\n",
                                   depth + 1, "", n, n_mvs[1],
                                   warp[n][0], warp[n][1], warp[n][2],
                                   warp[n][3], warp[n][4], warp[n][5],
                                   warp[n][6]);
                }
#endif
                // value found in b->mv[0] on entry; added onto the predictor
                // for the modes handled right below
                const union mv diff = b->mv[0];
                b->mv[0] = b->inter_mode == WARPMV ?
                    get_warpmv_2d(warp[b->warp_ref_idx], t->bx, t->by, bw4, bh4,
                                  f->bw, f->bh,
                                  b->warpmv_with_mvd ? b->mv_prec : 6) :
                    mvstack[b->drl_idx[0]].mv[0];
                if (b->inter_mode == NEWMV || b->inter_mode == WARPNEWMV ||
                    (b->inter_mode == WARPMV && b->warpmv_with_mvd))
                {
                    if (!b->amvd && b->mv_prec <= 3)
                        mv_reduce_prec(&b->mv[0], b->mv_prec);
                    b->mv[0].x += diff.x;
                    b->mv[0].y += diff.y;
                }

                if (b->motion_mode == MM_WARP_DELTA) {
                    int32_t *const m = t->warpmv[0].matrix;
                    // b->matrix[n] == -0x80 ends the list of present deltas
                    for (int n = 0; n < 4 && b->matrix[n] != -0x80; n++) {
                        if (b->matrix[n]) {
                            // n == 0 and n == 3 (m[2] and m[5]) are clipped
                            // around 0x10000, n == 1 and n == 2 around 0
                            const int base = ((n - 1U) >= 2U) * 0x10000;
                            m[2 + n] = iclip(warp[b->warp_ref_idx][n + 2] +
                                                 b->matrix[n] * (1 << 10),
                                             base - 0x7fc0, base + 0x7fc0);
                        } else {
                            m[2 + n] = warp[b->warp_ref_idx][n + 2];
                        }
                    }
                    // third delta marked absent: m[5]/m[4] are derived from
                    // m[2]/m[3] instead
                    if (b->matrix[2] == -0x80) {
                        m[5] = m[2];
                        m[4] = -m[3];
                    }
                    dav2d_set_affine_mv2d(bw4, bh4, b->mv[0], &t->warpmv[0],
                                          t->bx, t->by);
                    t->warpmv[0].type = dav2d_get_shear_params(&t->warpmv[0]) ?
                        DAV2D_WM_TYPE_INVALID : warp_type(t->warpmv[0].matrix);
                } else if (b->motion_mode == MM_WARP_CAUSAL) {
                    derive_warpmv(t, have_top, have_left, bw4, bh4, w4, h4,
                                  b->ref.ref[0], b->mv[0], &t->warpmv[0]);
                } else if (b->motion_mode == MM_WARP_EXTEND) {
                    // true when the block sits in the first row of its
                    // superblock; the row above is then read from t->rt.ra
                    const int is_sb_boundary = !(t->by & (f->sb_step - 1));
                    int y_off = 0, x_off = 0; // invalid
                    if (mvstack[b->drl_idx[0]].x_off == -1 ||
                        mvstack[b->drl_idx[0]].y_off == -1)
                    {
                        y_off = mvstack[b->drl_idx[0]].y_off;
                        x_off = mvstack[b->drl_idx[0]].x_off;
                        const refmvs_block *r;
                        const int sb_mask = f->sb_step - 1;
                        if (is_sb_boundary && y_off == -1) {
                            r = t->bx & sb_mask || x_off >= 0 ?
                                &t->rt.ra[(t->bx + x_off) >> 1] : &t->rt.ra_tl;
                        } else {
                            r = &t->rt.r[((t->by + y_off) & 63) * 128 +
                                         ((t->bx + x_off) & 127)];
                        }
                        // a TIP_FRAME neighbour resets the offsets to the
                        // "invalid" value so the fallback search below runs
                        if (r->ref.ref[0] == TIP_FRAME)
                            x_off = y_off = 0;
                    }
                    // candidate neighbours: left column at the block's top
                    // row (tml) and bottom row (bml), row above at the
                    // block's left column (lmt) and right column (rmt)
                    const refmvs_block *const tml = !have_left ? NULL :
                        &t->rt.r[(t->by & 63) * 128 + ((t->bx - 1) & 127)];
                    const refmvs_block *const bml =
                        (!have_left || t->by + bh4 > ts->tiling.row_end) ? NULL :
                        &t->rt.r[((t->by + bh4 - 1) & 63) * 128 + ((t->bx - 1) & 127)];
                    const refmvs_block *const lmt = !have_top ? NULL :
                        is_sb_boundary ? &t->rt.ra[(t->bx & ~1) >> 1] :
                        &t->rt.r[((t->by - 1) & 63) * 128 + ((t->bx) & 127)];
                    const refmvs_block *const rmt =
                        (!have_top || t->bx + bw4 > ts->tiling.col_end) ? NULL :
                        is_sb_boundary ? &t->rt.ra[((t->bx & ~1) + bw4 - 2) >> 1] :
                        &t->rt.r[((t->by - 1) & 63) * 128 + ((t->bx + bw4 - 1) & 127)];
                    // first neighbour (in the order bml, rmt, tml, lmt) that
                    // uses the same reference as this block wins
                    if (x_off || y_off) {
                        /* do nothing */
                    } else if (bml && (bml->ref.ref[0] == b->ref.ref[0] ||
                                       bml->ref.ref[1] == b->ref.ref[0]))
                    {
                        y_off = bh4 - 1;
                        x_off = -1;
                    } else if (rmt && (rmt->ref.ref[0] == b->ref.ref[0] ||
                                       rmt->ref.ref[1] == b->ref.ref[0]))
                    {
                        y_off = -1;
                        x_off = -(t->bx & is_sb_boundary) + bw4 - (1 + is_sb_boundary);
                    } else if (tml && (tml->ref.ref[0] == b->ref.ref[0] ||
                                       tml->ref.ref[1] == b->ref.ref[0]))
                    {
                        y_off = 0;
                        x_off = -1;
                    } else if (lmt && (lmt->ref.ref[0] == b->ref.ref[0] ||
                                       lmt->ref.ref[1] == b->ref.ref[0]))
                    {
                        y_off = -1;
                        x_off = -(t->bx & is_sb_boundary);
                    }
                    if (y_off || x_off)
                        extend_warpmv(t, x_off, y_off, b_dim, b, &t->warpmv[0]);
                    else
                        t->warpmv[0].type = DAV2D_WM_TYPE_INVALID;
                }
            }
        } else if (b->skip_mode) {
            // skip mode: both MVs and cwp_idx are copied from the candidate
            refmvs_candidate mvstack[6];
            int n_mvs;
            dav2d_refmvs_find(&t->rt, mvstack, NULL, &n_mvs, b->ref, bs, 1,
                              t->by, t->bx);
#if DEBUG_BLOCK_INFO
            if (BLOCK_TO_DEBUG) {
                printf("%*sfind_mv_refs(%d,%d)\n", depth, "",
                       b->ref.ref[0], b->ref.ref[1]);
                for (int n = 0; n < n_mvs; n++)
                    printf("%*smv[%d/%d]: y=%d,x=%d,y2=%d,x2=%d,w=%d,cwp=%d\n",
                           depth + 1, "", n, n_mvs,
                           mvstack[n].mv[0].y, mvstack[n].mv[0].x,
                           mvstack[n].mv[1].y, mvstack[n].mv[1].x,
                           mvstack[n].weight, mvstack[n].cwp_idx);
            }
#endif
            COPY2MV(b->mv, mvstack[b->drl_idx[0]].mv);
            b->cwp_idx = mvstack[b->drl_idx[0]].cwp_idx;
        } else {
            // compound (two references)
            if (b->inter_mode != GLOBALMV_GLOBALMV) {
                refmvs_candidate mvstack[6];
                int n_mvs[2];
                if (b->inter_mode > NEARMV_NEWMV) {
                    // joint candidate list for the reference pair
                    dav2d_refmvs_find(&t->rt, mvstack, NULL, &n_mvs[0],
                                      b->ref, bs, 0, t->by, t->bx);
                } else if (b->ref.ref[0] == b->ref.ref[1]) {
                    // same reference twice: one single-ref list serves both
                    // slots
                    dav2d_refmvs_find(&t->rt, mvstack, NULL, &n_mvs[0],
                                      (union refpair) { .ref = { b->ref.ref[0], -1 } },
                                      bs, 0, t->by, t->bx);
                    for (int n = 0; n < 6; n++) {
                        mvstack[n].mv[1] = mvstack[n].mv[0];
                        // weight is multiplied by 0x101; the debug output
                        // below reads it back as one byte per list
                        mvstack[n].weight *= 0x101;
                    }
                    n_mvs[1] = n_mvs[0];
                } else {
                    // two independent single-ref lists, merged into mvstack:
                    // mv[1] from the second list, its weight in bits 8 and up
                    dav2d_refmvs_find(&t->rt, mvstack, NULL, &n_mvs[0],
                                      (union refpair) { .ref = { b->ref.ref[0], -1 } },
                                      bs, 0, t->by, t->bx);
                    refmvs_candidate mvstack2[6];
                    dav2d_refmvs_find(&t->rt, mvstack2, NULL, &n_mvs[1],
                                      (union refpair) { .ref = { b->ref.ref[1], -1 } },
                                      bs, 0, t->by, t->bx);
                    for (int n = 0; n < 6; n++) {
                        mvstack[n].mv[1] = mvstack2[n].mv[0];
                        mvstack[n].weight = (mvstack[n].weight & 0xff) |
                                            mvstack2[n].weight << 8;
                    }
                }
#if DEBUG_BLOCK_INFO
                if (BLOCK_TO_DEBUG) {
                    printf("%*sfind_mv_refs(%d,%d)\n", depth, "",
                           b->ref.ref[0], b->ref.ref[1]);
                    if (b->inter_mode <= NEARMV_NEWMV) {
                        for (int drl = 0; drl < 2; drl++)
                            for (int n = 0; n < n_mvs[drl]; n++)
                                printf("%*smv[%d:%d/%d]: y=%d,x=%d,w=%d\n",
                                       depth + 1, "", drl, n, n_mvs[drl],
                                       mvstack[n].mv[drl].y, mvstack[n].mv[drl].x,
                                       (mvstack[n].weight >> (8 * drl)) & 0xff);
                    } else {
                        for (int n = 0; n < n_mvs[0]; n++)
                            printf("%*smv[%d/%d]: y=%d,x=%d,y2=%d,x2=%d,w=%d\n",
                                   depth + 1, "", n, n_mvs[0],
                                   mvstack[n].mv[0].y, mvstack[n].mv[0].x,
                                   mvstack[n].mv[1].y, mvstack[n].mv[1].x,
                                   mvstack[n].weight);
                    }
                }
#endif
                for (int n = 0; n < 2; n++) {
                    // value found in b->mv[n] on entry; only added back when
                    // this half of the compound mode is NEWMV
                    const union mv diff = b->mv[n];
                    b->mv[n] = mvstack[b->drl_idx[n]].mv[n];
                    const enum InterPredMode m =
                        dav2d_comp_inter_pred_modes[b->inter_mode - NEARMV_NEARMV][n];
                    if (m != NEWMV) continue;
                    // b->mv_prec holds one 4-bit precision value per MV
                    const int mv_prec = (b->mv_prec >> (n * 4)) & 0xf;
                    if (!b->amvd && mv_prec <= 3)
                        mv_reduce_prec(&b->mv[n], mv_prec);
                    b->mv[n].x += diff.x;
                    b->mv[n].y += diff.y;
                }

                if (b->motion_mode == MM_WARP_CAUSAL) {
                    for (int i = 0; i < 2; i++)
                        derive_warpmv(t, have_top, have_left, bw4, bh4, w4, h4,
                                      b->ref.ref[i], b->mv[i], &t->warpmv[i]);
                }
            } else {
                for (int n = 0; n < 2; n++)
                    b->mv[n] = get_gmv_2d(&f->frame_hdr->gmv.m[b->ref.ref[n]],
                                          t->bx, t->by, bw4, bh4, f->bw, f->bh,
                                          f->frame_hdr);
            }
        }

        const int is_comp = b->ref.ref[1] != -1;
        if (b->motion_mode > MM_INTERINTRA) {
            for (int i = 0; i <= is_comp; i++)
debug_warp_matrix(depth, f, t, b, i); } else if (is_comp) { DEBUG_BLOCK_printf("%*sfinal 2dmv: y=%d,x=%d | y=%d,x=%d\n", depth, "", b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x); } else { DEBUG_BLOCK_printf("%*sfinal 2dmv: y=%d,x=%d\n", depth, "", b->mv[0].y, b->mv[0].x); } if (t->f->seq_hdr->refmv_bank) dav2d_refmvs_bank_add(&t->rt, bs, t->by, t->bx, b); if (b->motion_mode > MM_INTERINTRA) { int res = 0; if (t->warpmv[0].type != DAV2D_WM_TYPE_INVALID) res = dav2d_refmvs_warp_add(&t->rt, t->warpmv, DB_ONLY(t->by, t->bx) b->ref.ref[0]); if (!res && is_comp && t->warpmv[1].type != DAV2D_WM_TYPE_INVALID) dav2d_refmvs_warp_add(&t->rt, &t->warpmv[1], DB_ONLY(t->by, t->bx) b->ref.ref[1]); } if (b->ref.ref[1] != -1) { splat_tworef_mv(DB_ONLY(depth) f, t, bs, b, by4, bw4, bh4); } else { splat_oneref_mv(DB_ONLY(depth) f, t, bs, b, by4, bw4, bh4); } } else if (has_luma && (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)) { splat_intraref(f->c, f, t, bs, by4, bw4, bh4); } if (t->task_thread.pass == PASS_MVRES) { if (!b->intra) derive_lowest_px(t, lbs, cbs, b); return 0; } assert(t->task_thread.pass & PASS_RECON); return f->bd_fn.recon_b(t, DB_ONLY(depth) lbs, (const enum BlockSize[2]){ cbs, cbs }, b); } static const uint8_t size_group_lookup[] = { [BS_4x4] = 0, [BS_4x8] = 0, [BS_8x4] = 0, [BS_8x8] = 1, [BS_8x16] = 1, [BS_16x8] = 1, [BS_16x16] = 2, [BS_16x32] = 2, [BS_32x16] = 2, [BS_32x32] = 3, [BS_32x64] = 3, [BS_64x32] = 3, [BS_64x64] = 3, [BS_64x128] = 3, [BS_128x64] = 3, [BS_128x128] = 3, [BS_128x256] = 3, [BS_256x128] = 3, [BS_256x256] = 3, [BS_4x16] = 0, [BS_16x4] = 0, [BS_8x32] = 1, [BS_32x8] = 1, [BS_16x64] = 2, [BS_64x16] = 2, [BS_4x32] = 1, [BS_32x4]= 1, [BS_8x64] = 2, [BS_64x8] = 2, [BS_4x64] = 2, [BS_64x4] = 2, }; static void read_tx_part(Dav2dTaskContext *const t, DB_ONLY(const int depth) Av2Block *const b, const enum BlockSize bs) { Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; const uint8_t *const b_dim = 
dav2d_block_dimensions[bs]; const int bw4 = b_dim[0], bh4 = b_dim[1]; b->tx_part = TX_PARTITION_NONE; if (f->frame_hdr->segmentation.lossless[b->seg_id]) { b->tx_size_ll = 0; if (bs != BS_4x4 && ((b->intra && !b->intrabc) ? b->fsc : !b->skip_txfm)) { const int szctx = size_group_lookup[bs]; const int inter = !b->intra || b->intrabc; b->tx_size_ll = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txsz_lossless[szctx][inter]); DEBUG_BLOCK_printf("%*sPost-lltxsz[ctx=%d|%d,%d]: r=%d\n", depth, "", szctx, inter, b->tx_size_ll, ts->msac.rng); } } else if (!b->skip_txfm) { if (f->frame_hdr->txfm_mode == DAV2D_TX_SWITCHABLE && bs != BS_4x4 && imax(bw4, bh4) <= 16) { const int inter = !b->intra || b->intrabc; static const uint8_t size_to_tx_part_group_lookup[] = { [BS_64x64] = 7, [BS_64x32] = 6, [BS_64x16] = 8, [BS_64x8] = 8, [BS_64x4] = 8, [BS_32x64] = 6, [BS_32x32] = 5, [BS_32x16] = 4, [BS_32x8] = 8, [BS_32x4] = 8, [BS_16x64] = 8, [BS_16x32] = 4, [BS_16x16] = 3, [BS_16x8] = 2, [BS_16x4] = 8, [BS_8x64] = 8, [BS_8x32] = 8, [BS_8x16] = 2, [BS_8x8] = 1, [BS_8x4] = 0, [BS_4x64] = 8, [BS_4x32] = 8, [BS_4x16] = 8, [BS_4x8] = 0, [BS_4x4] = 0, }; const int szctx = size_to_tx_part_group_lookup[bs]; int is_split = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.tx_split[b->fsc][inter][szctx]); if (is_split) { if (imin(bw4, bh4) >= 2) { static const uint8_t size_to_tx_type_group_vh_lookup[] = { [BS_64x64] = 9, [BS_64x32] = 8, [BS_64x16] = 13, [BS_64x8] = 11, [BS_32x64] = 7, [BS_32x32] = 6, [BS_32x16] = 5, [BS_32x8] = 11, [BS_16x64] = 12, [BS_16x32] = 4, [BS_16x16] = 3, [BS_16x8] = 2, [BS_8x64] = 10, [BS_8x32] = 10, [BS_8x16] = 1, [BS_8x8] = 0, }; const int ctx = size_to_tx_type_group_vh_lookup[bs]; b->tx_part = 1 + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.tx_part_2d[b->fsc][inter][ctx], 6); } else if (imax(bw4, bh4) >= 4) { const int ctx = bw4 >= 4; const int tx_part_4way = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.tx_part_1d[b->fsc][inter][ctx]); b->tx_part = 
TX_PARTITION_H + ctx + tx_part_4way * 2; } else { assert(bs == BS_4x8 || bs == BS_8x4); b->tx_part = bs == BS_4x8 ? TX_PARTITION_H : TX_PARTITION_V; } } DEBUG_BLOCK_printf("%*sPost-txpart[ctx=%d|%d|%d,%d]: r=%d\n", depth, "", b->fsc, inter, szctx, b->tx_part, ts->msac.rng); } } } static int read_wedge_idx(Dav2dTileState *const ts) { static const int8_t wedge_angle_dist2idx[20][4] = { { -1, 0, 1, 2 }, // WEDGE_0 { 3, 4, 5, 6 }, // WEDGE_14 { 7, 8, 9, 10 }, // WEDGE_27 { 11, 12, 13, 14 }, // WEDGE_45 { 15, 16, 17, 18 }, // WEDGE_63 { -1, 19, 20, 21 }, // WEDGE_90 { 22, 23, 24, 25 }, // WEDGE_117 { 26, 27, 28, 29 }, // WEDGE_135 { 30, 31, 32, 33 }, // WEDGE_153 { 34, 35, 36, 37 }, // WEDGE_166 { -1, 38, 39, 40 }, // WEDGE_180 { -1, 41, 42, 43 }, // WEDGE_194 { -1, 44, 45, 46 }, // WEDGE_207 { -1, 47, 48, 49 }, // WEDGE_225 { -1, 50, 51, 52 }, // WEDGE_243 { -1, 53, 54, 55 }, // WEDGE_270 { -1, 56, 57, 58 }, // WEDGE_297 { -1, 59, 60, 61 }, // WEDGE_315 { -1, 62, 63, 64 }, // WEDGE_333 { -1, 65, 66, 67 }, // WEDGE_346 }; const int quad = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.wedge_quad, 3); const int angle = 5 * quad + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.wedge_angle[quad], 4); const int dist = (angle - 1U >= 9U || angle == 5) ? 
1 + dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.wedge_dist2, 2) : dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.wedge_dist, 3); return wedge_angle_dist2idx[angle][dist]; } static inline void jmvd_scale(union mv *const mv, const int amvd, const int jmvd_scale_mode) { if (amvd) { switch (jmvd_scale_mode) { default: assert(0); case 0: break; case 1: mv->y *= 2; mv->x *= 2; break; case 2: mv->y /= 2; mv->x /= 2; break; } } else { switch (jmvd_scale_mode) { default: assert(0); case 0: break; case 1: mv->y *= 2; break; case 2: mv->x *= 2; break; case 3: mv->y /= 2; break; case 4: mv->x /= 2; break; } } } static int decode_b(Dav2dTaskContext *const t, DB_ONLY(const int depth) const enum BlockSize lbs, const enum BlockSize cbs) { const enum BlockSize bs = lbs == BS_INVALID ? cbs : lbs; assert(bs != BS_INVALID); Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; Av2Block b_mem, *const b = f->c->task_thread.n_passes > 1 ? &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem; const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int bx4 = t->bx & 63, by4 = t->by & 63; const int bw4 = b_dim[0], bh4 = b_dim[1]; const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); const int have_left = t->bx > ts->tiling.col_start; const int have_top = t->by > ts->tiling.row_start; const int has_luma = lbs != BS_INVALID, has_chroma = cbs != BS_INVALID; int ss_hor, ss_ver, cbx4, cby4, cbw4, cbh4, cw4, ch4; if (has_chroma) { ss_ver = f->ss_ver; ss_hor = f->ss_hor; cbx4 = (t->cbx & 63) >> ss_hor; cby4 = (t->cby & 63) >> ss_ver; const uint8_t *const cb_dim = dav2d_block_dimensions[cbs]; cbw4 = cb_dim[0] >> ss_hor; cbh4 = cb_dim[1] >> ss_ver; assert(cbw4 >= 1 && cbh4 >= 1); cw4 = imin(cbw4, (f->bw - t->cbx) >> ss_hor); ch4 = imin(cbh4, (f->bh - t->cby) >> ss_ver); assert(cw4 >= 1 && ch4 >= 1); } DEBUG_BLOCK_printf("%*sdecode_b[y=%d,x=%d,bs=%dx%d,plane=%s]: r=%d\n", depth - 1, "", t->by, t->bx, bw4 * 4, bh4 * 4, !has_chroma ? 
"y" : !has_luma ? "uv" : "yuv", ts->msac.rng); if (!(t->task_thread.pass & PASS_ENTROPY)) { return recon_b(t, DB_ONLY(depth) lbs, cbs, b); } if (has_luma) b->bs = lbs; if (has_chroma) b->cbs = cbs; // segment_id (if seg_feature for skip/ref/gmv is enabled) int seg_pred = 0; if (f->frame_hdr->segmentation.enabled) { if (!has_luma) { b->seg_id = f->cur_segmap[t->bx + t->by * f->b4_stride]; } else if (!f->frame_hdr->segmentation.update_map) { if (f->prev_segmap) { unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4, f->prev_segmap, f->b4_stride); if (seg_id >= 16) return -1; b->seg_id = seg_id; } else { b->seg_id = 0; } } else if (f->frame_hdr->segmentation.preskip) { if (f->frame_hdr->segmentation.temporal && (seg_pred = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + t->l.seg_pred[by4]]))) { // temporal predicted seg_id if (f->prev_segmap) { unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4, f->prev_segmap, f->b4_stride); if (seg_id >= 16) return -1; b->seg_id = seg_id; } else { b->seg_id = 0; } } else { int seg_ctx; const unsigned pred_seg_id = get_cur_frame_segid(t->by, t->bx, have_top, have_left, &seg_ctx, f->cur_segmap, f->b4_stride); const unsigned ext_flag = f->seq_hdr->segmentation.ext ? dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.seg_id_ext[seg_ctx]) : 0; const unsigned diff = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.seg_id[ext_flag][seg_ctx], 7) + (ext_flag << 3); const unsigned last_active_seg_id = f->frame_hdr->segmentation.last_active_segid; b->seg_id = neg_deinterleave(diff, pred_seg_id, last_active_seg_id + 1); if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error? if (b->seg_id >= DAV2D_MAX_SEGMENTS) b->seg_id = 0; // error? 
} if (!has_luma) DEBUG_BLOCK_printf("%*sPost-segid[%d]: r=%d\n", depth, "", b->seg_id, ts->msac.rng); } } else { b->seg_id = 0; } // cross-sb boundary neighbours const BlockContext *nx[2]; int xoff[2], idx = 0; if (have_left && t->by + bh4 <= ts->tiling.row_end) { nx[0] = &t->l; xoff[0] = by4 + bh4 - 1; idx++; } if (have_top && t->bx + bw4 <= ts->tiling.col_end) { nx[idx] = t->a; xoff[idx] = bx4 + bw4 - 1; idx++; } if (idx < 2 && have_left) { nx[idx] = &t->l; xoff[idx] = by4; idx++; } if (idx < 2) { nx[idx] = t->a; xoff[idx] = bx4; if (!idx) { nx[idx + 1] = t->a; xoff[idx + 1] = bx4; } idx += have_top; } // skip_mode if (!((f->frame_hdr->segmentation.d.globalmv_mask | f->frame_hdr->segmentation.d.skip_mask) & (1 << b->seg_id)) && f->frame_hdr->skip_mode_enabled && bw4 * bh4 > 2 && !t->intra_region) { const int ctx = nx[0]->skip_mode[xoff[0]] + nx[1]->skip_mode[xoff[1]]; b->skip_mode = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip_mode[ctx]); DEBUG_BLOCK_printf("%*sPost-skip_mode[ctx=%d,%d]: r=%d\n", depth, "", ctx, b->skip_mode, ts->msac.rng); } else { b->skip_mode = 0; } if (b->skip_mode) { b->intra = 0; } else if (IS_INTER_OR_SWITCH(f->frame_hdr) && !t->intra_region) { if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400 && lbs != cbs) { // mixed-intra/inter regions in inter frames with chroma planes // of different sizes (in AVM language: with offsets) are inter b->intra = 0; } else { const int ictx = get_intra_ctx(nx, xoff, idx); b->intra = !dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]); DEBUG_BLOCK_printf("%*sPost-is_inter[ctx=%d,%d]: r=%d\n", depth, "", ictx, !b->intra, ts->msac.rng); } } else { b->intra = 1; } const BlockContext *nb[2]; int boff[2]; if (has_luma) { b->intrabc = 0; // get "spatial neighbours", depending on edge availability; // do not cross SB boundaries vertically const int have_top_in_sb = !!(t->by & (f->sb_step - 1)); int idx = 0; if (have_left && bh4 == h4) { nb[0] = &t->l; boff[0] = by4 + bh4 - 1; idx++; } if 
(have_top_in_sb && bw4 == w4) { nb[idx] = t->a; boff[idx] = bx4 + bw4 - 1; idx++; } if (have_left && idx < 2) { nb[idx] = &t->l; boff[idx] = by4; idx++; } if (have_top_in_sb && idx < 2) { nb[idx] = t->a; boff[idx] = bx4; idx++; } if (idx < 2) { boff[idx] = -1; if (!idx) boff[1] = -1; } if (f->frame_hdr->allow_intrabc && imin(bw4, bh4) < 16 && b->intra && !t->intra_region) { const int ctx = (boff[0] == -1 ? 0 : nb[0]->intrabc[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->intrabc[boff[1]]); b->intrabc = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc[ctx]); DEBUG_BLOCK_printf("%*sPost-intrabc[ctx=%d,%d]: r=%d\n", depth, "", ctx, b->intrabc, ts->msac.rng); } } const int intrabc = has_luma && b->intrabc; // skip_txfm if (f->frame_hdr->segmentation.d.skip_mask & (1 << b->seg_id)) { b->skip_txfm = 1; } else if (b->intra && !intrabc) { if (has_luma) b->skip_txfm = 0; } else { const int ctx = nx[0]->skip_txfm[xoff[0]] + nx[1]->skip_txfm[xoff[1]] + b->skip_mode * 3; b->skip_txfm = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip_txfm[ctx]); DEBUG_BLOCK_printf("%*sPost-skip_txfm[ctx=%d,%d]: r=%d\n", depth, "", ctx, b->skip_txfm, ts->msac.rng); } const int skip_txfm = has_luma && b->skip_txfm; // segment_id if (f->frame_hdr->segmentation.enabled && f->frame_hdr->segmentation.update_map && !f->frame_hdr->segmentation.preskip) { if (!has_luma) { b->seg_id = f->cur_segmap[t->bx + t->by * f->b4_stride]; } else if (!b->skip_txfm && f->frame_hdr->segmentation.temporal && (seg_pred = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + t->l.seg_pred[by4]]))) { // temporal predicted seg_id if (f->prev_segmap) { unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4, f->prev_segmap, f->b4_stride); if (seg_id >= 16) return -1; b->seg_id = seg_id; } else { b->seg_id = 0; } } else { int seg_ctx; const unsigned pred_seg_id = get_cur_frame_segid(t->by, t->bx, have_top, have_left, &seg_ctx, f->cur_segmap, f->b4_stride); if (b->skip_txfm && 
!f->frame_hdr->any_lossless) { b->seg_id = pred_seg_id; } else { const unsigned ext_flag = f->seq_hdr->segmentation.ext ? dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.seg_id_ext[seg_ctx]) : 0; const unsigned diff = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.seg_id[ext_flag][seg_ctx], 7) + (ext_flag << 3); const unsigned last_active_seg_id = f->frame_hdr->segmentation.last_active_segid; b->seg_id = neg_deinterleave(diff, pred_seg_id, last_active_seg_id + 1); if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error? } if (b->seg_id >= DAV2D_MAX_SEGMENTS) b->seg_id = 0; // error? } if (has_luma) DEBUG_BLOCK_printf("%*sPost-segid[%d]: r=%d\n", depth, "", b->seg_id, ts->msac.rng); } if (has_luma) { // FIXME some of these can be pre-calculated at the start of a frame // FIXME gdf block size can be 64 pixels when tiles are split at odd places const int gdf_sz_log2 = f->frame_hdr->frame_type == DAV2D_FRAME_TYPE_KEY ? 1 : imax(1, f->frame_hdr->sb128); const int gdf_bs = 16 << gdf_sz_log2; if (!((t->bx | t->by) & (gdf_bs - 1))) { int idx = ((t->by & 48) >> 2) + ((t->bx & 48) >> 4); int flag; if (f->frame_hdr->gdf.enabled == DAV2D_ADAPTIVE && imax(f->cur.p.p.w, f->cur.p.p.h) > 4 * gdf_bs) { flag = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.gdf); DEBUG_BLOCK_printf("%*sPost-gdf[y=%d,x=%d,gdf=%d]: r=%d\n", depth, "", t->by, t->bx, flag, ts->msac.rng); } else flag = !!f->frame_hdr->gdf.enabled; dav2d_memset_pow2[gdf_sz_log2](&t->lf_mask->gdf[idx], flag); if (gdf_bs >= 32) { dav2d_memset_pow2[gdf_sz_log2](&t->lf_mask->gdf[idx+4], flag); if (gdf_bs == 64) { dav2d_memset_pow2[gdf_sz_log2](&t->lf_mask->gdf[idx+8], flag); dav2d_memset_pow2[gdf_sz_log2](&t->lf_mask->gdf[idx+12], flag); } } } } // cdef index if (f->frame_hdr->cdef.enabled && (!skip_txfm || f->frame_hdr->cdef.on_skiptx)) { const int idx = ((t->bx & 0x30) >> 4) + ((t->by & 0x30) >> 2); int8_t *const cdef_ptr = &t->lf_mask->cdef_idx[idx]; if (*cdef_ptr == -1) { int v; if 
(f->frame_hdr->cdef.n_strengths == 1) { v = 0; } else { const int left_cdef_idx = t->bx - 16 < ts->tiling.col_start ? -1 : idx & 3 ? cdef_ptr[-1] : t->lf_mask[-1].cdef_idx[idx + 3]; const int top_cdef_idx = !((t->by & ~15) & (f->sb_step - 1)) ? -1 : idx & 0xc ? cdef_ptr[-4] : t->lf_mask[-f->sb256w].cdef_idx[idx + 12]; // cdef_idx=-1: --, 0: true, 1-7: false, edge combo -> context // ctx=0: false/false, false/--, --/false, --/-- // ctx=1: false/true, true/false // ctx=2: true/--, --/true, true/true [same coded block] // ctx=3: true/true [different coded block] int ctx; if ((left_cdef_idx | top_cdef_idx) != -1) { // both edges are available ctx = !left_cdef_idx + !top_cdef_idx; // FIXME this should only be done when both edges are *not* // from the same coded block ctx += ctx == 2; } else { ctx = !(left_cdef_idx & top_cdef_idx) * 2; } if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.cdef_idx0[ctx])) { v = 0; } else if (f->frame_hdr->cdef.n_strengths == 2) { v = 1; } else { const int rem = f->frame_hdr->cdef.n_strengths - 3; v = 1 + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.cdef_idx[rem], rem + 1); } DEBUG_BLOCK_printf("%*sPost-cdef_idx[ctx=%d,%d]: r=%d\n", depth, "", ctx, v, ts->msac.rng); } const int splat_idx = imax(0, b_dim[2] - 4); dav2d_memset_pow2[splat_idx](cdef_ptr, v); if (bh4 >= 32) { dav2d_memset_pow2[splat_idx](&cdef_ptr[4], v); if (bh4 == 64) { dav2d_memset_pow2[splat_idx](&cdef_ptr[8], v); dav2d_memset_pow2[splat_idx](&cdef_ptr[12], v); } } } } // ccso if (has_luma && !((t->bx | t->by) & 63)) { const ptrdiff_t ccso_idx = 3 * ((t->bx >> 6) + (t->by >> 6) * f->sb256w); for (int p = 0; p < 3; p++) { if (!f->frame_hdr->ccso.p[p].enabled) continue; if (f->frame_hdr->ccso.p[p].sb_reuse) { t->lf_mask->ccso[p] = f->prev_ccsomap[p][ccso_idx + p]; } else { // for left/left-bottom [if no overhang] context: // ctx=0: --/--, false/--, --/false, false/false // ctx=1: false/true, true/false // ctx=2: true/--, --/true, true/true [same coded block] // 
ctx=3: true/true [different coded block] const int ctx = t->bx - 64 >= ts->tiling.col_start ? t->lf_mask[-1].ccso[p] * 2 : 0; t->lf_mask->ccso[p] = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ccso[p][ctx]); DEBUG_BLOCK_printf("%*sPost-ccso[pl=%c,ctx=%d,%d]: r=%d\n", depth, "", "yuv"[p], ctx, t->lf_mask->ccso[p], ts->msac.rng); } if (f->cur_ccsomap) f->cur_ccsomap[ccso_idx + p] = t->lf_mask->ccso[p]; } } // delta-q/lf if (has_luma && !((t->bx | t->by) & (63 >> (2 - f->frame_hdr->sb128)))) { const int prev_qidx = ts->last_qidx; const int have_delta_q = f->frame_hdr->delta.q.present && (bs != f->root_bs || !b->skip_txfm); if (have_delta_q) { int delta_q = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.delta_q, 7); if (delta_q == 7) { const int n_bits = 1 + dav2d_msac_decode_bools_bypass(&ts->msac, 3); delta_q = dav2d_msac_decode_bools_bypass(&ts->msac, n_bits) + 1 + (1 << n_bits); } if (delta_q) { if (dav2d_msac_decode_bool_bypass(&ts->msac)) delta_q = -delta_q; delta_q *= 1 << f->frame_hdr->delta.q.res_log2; } ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255); DEBUG_BLOCK_printf("%*sPost-delta_q[%d->%d]: r=%d\n", depth, "", delta_q >> f->frame_hdr->delta.q.res_log2, ts->last_qidx, ts->msac.rng); } const int new_qidx = ts->last_qidx; if (new_qidx == f->frame_hdr->quant.yac) { // assign frame-wide q values to this sb ts->dq = f->dq; } else if (new_qidx != prev_qidx) { // find sb-specific quant parameters init_quant_tables(f->frame_hdr, new_qidx, ts->dqmem); ts->dq = ts->dqmem; } uint16_t *qidx_ptr = &t->lf_mask->qidx[(bx4 >> 4) + ((by4 & 0x30) >> 2)]; const int sbsz64 = 1 << f->frame_hdr->sb128; for (int y64 = 0; y64 < sbsz64; y64++) { for (int x64 = 0; x64 < sbsz64; x64++) qidx_ptr[x64] = new_qidx; qidx_ptr += 4; } } b->fsc = 0; // intra/inter-specific stuff int midx = 0xff; // intra/luma directional intra prediction index, if set if (b->intra && !intrabc) { static const uint8_t reordered_nondir_y_mode[] = { DC_PRED, SMOOTH_PRED, SMOOTH_V_PRED, 
SMOOTH_H_PRED, PAETH_PRED, }; static const uint8_t reordered_dir_y_mode[] = { DIAG_DOWN_LEFT_PRED, VERT_LEFT_PRED, VERT_PRED, VERT_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED, HOR_DOWN_PRED, HOR_PRED, HOR_UP_PRED, }; if (has_luma) { b->dpcm[0] = f->frame_hdr->segmentation.lossless[b->seg_id] && dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.dpcm[0]); if (b->dpcm[0]) { if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.dpcm_dir[0])) { b->y_mode = HOR_PRED; midx = 45; } else { b->y_mode = VERT_PRED; midx = 17; } b->mrl_index = 0; b->multi_mrl = 0; b->y_angle = 0; DEBUG_BLOCK_printf("%*sPost-ydpcm[dir=%d]: r=%d\n", depth, "", b->y_mode == HOR_PRED, ts->msac.rng); } else { const int y_set = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.intra_y_set, 3); int y_mode_idx, y_mode_ctx; if (!y_set) { y_mode_ctx = (w4 == bw4 && t->a->midx[bx4 + bw4 - 1] != 0xff) + (h4 == bh4 && t->l.midx[by4 + bh4 - 1] != 0xff); y_mode_idx = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.intra_y_idx0[y_mode_ctx], 7); if (y_mode_idx == 7) y_mode_idx += dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.intra_y_idx1[y_mode_ctx], 5); } else { y_mode_idx = y_set * 16 - 3 + dav2d_msac_decode_bools_bypass(&ts->msac, 4); } if (y_mode_idx < 5) { b->y_mode = reordered_nondir_y_mode[y_mode_idx]; b->y_angle = 0; } else { const int dir_y_mode_idx = y_mode_idx - 5; static const uint8_t default_mode_list_y[] = { 17, 45, 3, 10, 24, 31, 38, 52, // (-2, +2) 15, 19, 43, 47, 1, 5, 8, 12, 22, 26, 29, 33, 36, 40, 50, 54, // (-1, +1) 16, 18, 44, 46, 2, 4, 9, 11, 23, 25, 30, 32, 37, 39, 51, 53, // (-3, +3) 14, 20, 42, 48, 0, 6, 7, 13, 21, 27, 28, 34, 35, 41, 49, 55 }; uint8_t custom_mode_list_y[56]; const uint8_t *reorder = default_mode_list_y; if (bw4 * bh4 > 2) { // modes are reordered if neighbour (above/left) modes used // directional intra prediction modes uint64_t mask = 0; uint8_t *ptr = custom_mode_list_y; *ptr = -1; if (h4 == bh4 && t->l.midx[by4 + bh4 - 1] != 0xff) { const int lmidx = 
t->l.midx[by4 + bh4 - 1]; *ptr++ = lmidx; mask |= 1ULL << lmidx; } if (w4 == bw4 && t->a->midx[bx4 + bw4 - 1] != 0xff) { const int amidx = t->a->midx[bx4 + bw4 - 1]; if (amidx != custom_mode_list_y[0]) { *ptr++ = amidx; mask |= 1ULL << amidx; } } int n_dirs = (int)(ptr - custom_mode_list_y); if (n_dirs > 0) { reorder = custom_mode_list_y; if (bw4 * bh4 > 4 && dir_y_mode_idx >= n_dirs) { // add surrounding [-3..+3] angles for (int i = 1; i < 5; i++) { for (int n = 0; n < n_dirs; n++) { const int cmidx = custom_mode_list_y[n]; for (int delta = -i, j = 0; j < 2; delta = +i, j++) { // FIXME replace modulo with fastdiv const int dmidx = (cmidx + delta + 56) % 56; if (!(mask & (1ULL << dmidx))) { *ptr++ = dmidx; mask |= 1ULL << dmidx; } } } } } n_dirs = (int)(ptr - custom_mode_list_y); if (dir_y_mode_idx >= n_dirs) { // remainder of modes in default order for (unsigned long n = 0; n < ARRAY_SIZE(default_mode_list_y); n++) { const int fmidx = default_mode_list_y[n]; const uint64_t bit = 1ULL << fmidx; if (!(mask & bit)) *ptr++ = fmidx; } } } } const int dir_y_mode_reord = midx = reorder[dir_y_mode_idx]; // FIXME division/modulo can be replaced with fastdiv b->y_mode = reordered_dir_y_mode[dir_y_mode_reord / 7]; b->y_angle = dir_y_mode_reord % 7 - 3; } DEBUG_BLOCK_printf("%*sPost-intra_y_mode[set=%d,idx=%d,ctx=%d,mode=%d,angle=%d]: r=%d\n", depth, "", y_set, y_mode_idx, y_set > 0 ? 
-1 : y_mode_ctx, b->y_mode, b->y_angle, ts->msac.rng); } // =min(5,floor(log2(bw4+bh4)*1.99-1.62)) or // = floor(log2(bw4+bh4)*1.55-0.555) - or anything in between if (imax(bw4, bh4) <= 8 && f->seq_hdr->idtx_intra) { static const uint8_t fsc_bsize_groups[N_BS_SIZES] = { [BS_32x32] = 5, [BS_32x16] = 5, [BS_32x8] = 4, [BS_32x4] = 4, [BS_16x32] = 5, [BS_16x16] = 4, [BS_16x8] = 3, [BS_16x4] = 3, [BS_8x32] = 4, [BS_8x16] = 3, [BS_8x8] = 2, [BS_8x4] = 1, [BS_4x32] = 4, [BS_4x16] = 3, [BS_4x8] = 1, [BS_4x4] = 0, }; const int sz_ctx = fsc_bsize_groups[bs]; const int ctx = (IS_INTER_OR_SWITCH(f->frame_hdr) && !t->intra_region) ? 3 : (boff[0] == -1 ? 0 : nb[0]->fsc[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->fsc[boff[1]]); b->fsc = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.fsc[ctx][sz_ctx]); DEBUG_BLOCK_printf("%*sPost-fsc[ctx=%d|%d,%d]: r=%d\n", depth, "", ctx, sz_ctx, b->fsc, ts->msac.rng); } b->mrl_index = b->multi_mrl = 0; if (!b->dpcm[0] && midx != 0xff /* directional mode */ && f->seq_hdr->mrls) { const int ctx = (boff[0] == -1 ? 0 : nb[0]->mrl[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->mrl[boff[1]]); b->mrl_index = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.mrl_index[ctx], 3); DEBUG_BLOCK_printf("%*sPost-mrl_index[ctx=%d,%d]: r=%d\n", depth, "", ctx, b->mrl_index, ts->msac.rng); if (b->mrl_index > 0) { const int ctx2 = (boff[0] == -1 ? 0 : nb[0]->multi_mrl[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->multi_mrl[boff[1]]); b->multi_mrl = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.multi_mrl[ctx2]); DEBUG_BLOCK_printf("%*sPost-multi_line_mrl[ctx=%d,%d]: r=%d\n", depth, "", ctx2, b->multi_mrl, ts->msac.rng); } } } if (has_chroma) { b->dpcm[1] = f->frame_hdr->segmentation.lossless[b->seg_id] && dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.dpcm[1]); if (b->dpcm[1]) { b->uv_mode = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.dpcm_dir[1]) ? 
HOR_PRED : VERT_PRED; if (lbs == BS_INVALID) midx = t->luma_intra_dir_mode_map[(t->by & 15) * 16 + (t->bx & 15)]; const int uv_mode_idx = b->uv_mode == HOR_PRED ? 45 : 17; b->uv_angle = abs(midx - uv_mode_idx) >= 4 ? 0 : (midx % 7) - 3; DEBUG_BLOCK_printf("%*sPost-uvdpcm[dir=%d]: r=%d\n", depth, "", b->uv_mode == HOR_PRED, ts->msac.rng); } else { const int ll = f->frame_hdr->segmentation.lossless[b->seg_id]; const int mhccp_allowed = f->seq_hdr->mhccp && imax(cbw4, cbh4) <= (ll ? 1 : 8) && cbw4 * cbh4 >= 2 - ll; const int cfl_allowed = (f->seq_hdr->cfl || mhccp_allowed) && (imax(bw4, bh4) > 16 || !t->sdp_cfl_disallowed) && imax(cbw4, cbh4) <= (ll ? 1 : 16); int is_cfl = 0, uv_mode_idx, cfl_ctx, uv_mode_ctx; if (cfl_allowed) { cfl_ctx = (t->a->uvmode[cbx4] == CFL_PRED) + (t->l.uvmode[cby4] == CFL_PRED); is_cfl = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.cfl[cfl_ctx]); } if (is_cfl) { b->uv_mode = CFL_PRED; b->uv_angle = 0; } else { if (lbs == BS_INVALID) midx = t->luma_intra_dir_mode_map[(t->by & 15) * 16 + (t->bx & 15)]; uv_mode_ctx = midx != 0xff; uv_mode_idx = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.intra_uv_mode[uv_mode_ctx], 7); if (uv_mode_idx == 7) uv_mode_idx += dav2d_msac_decode_bools_bypass(&ts->msac, 3); if (uv_mode_idx > 12) return -1; if (uv_mode_idx < uv_mode_ctx) { b->uv_mode = reordered_dir_y_mode[midx / 7]; b->uv_angle = (midx % 7) - 3; } else { if (uv_mode_idx - uv_mode_ctx < 5) { b->uv_mode = reordered_nondir_y_mode[uv_mode_idx - uv_mode_ctx]; b->uv_angle = 0; } else { static const uint8_t default_mode_list_uv[] = { VERT_PRED, HOR_PRED, DIAG_DOWN_LEFT_PRED, DIAG_DOWN_RIGHT_PRED, VERT_LEFT_PRED, VERT_RIGHT_PRED, HOR_DOWN_PRED, HOR_UP_PRED, }; static const uint8_t intra_dir_mode_y_to_uv_idx[] = { 2, 4, 0, 5, 3, 6, 1, 7 }; int idx = uv_mode_idx - 5 - uv_mode_ctx; idx += uv_mode_ctx && idx >= intra_dir_mode_y_to_uv_idx[midx / 7]; b->uv_mode = default_mode_list_uv[idx]; b->uv_angle = 0; } } } 
DEBUG_BLOCK_printf("%*sPost-intra_uv_mode[cfl=%d,idx=%d,ctx=%d|%d,mode=%d,angle=%d]: r=%d\n", depth, "", is_cfl, is_cfl ? -1 : uv_mode_idx, cfl_allowed ? cfl_ctx : -1, is_cfl ? -1 : uv_mode_ctx, b->uv_mode, b->uv_angle, ts->msac.rng); if (b->uv_mode == CFL_PRED) { memset(b->cfl_alpha, 0, sizeof(b->cfl_alpha)); if (mhccp_allowed && (!f->seq_hdr->cfl || dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.mhccp))) { const int sz_ctx = size_group_lookup[bs]; b->cfl_type = CFL_MHCCP; b->cfl_mh_dir = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.mhccp_filter_dir[sz_ctx], 2); } else { b->cfl_type = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.cfl_type); if (b->cfl_type == CFL_EXPLICIT) { const int sign = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.cfl_sign, 7) + 1; const int sign_u = sign * 0x56 >> 8; const int sign_v = sign - sign_u * 3; assert(sign_u == sign / 3); if (sign_u) { const int ctx = (sign_u == 2) * 3 + sign_v; b->cfl_alpha[0] = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.cfl_alpha[ctx], 7) + 1; if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0]; } if (sign_v) { const int ctx = (sign_v == 2) * 3 + sign_u; b->cfl_alpha[1] = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.cfl_alpha[ctx], 7) + 1; if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1]; } } } DEBUG_BLOCK_printf("%*sPost-cfl[type=%d,%s=%d|%d]: r=%d\n", depth, "", b->cfl_type, b->cfl_type == CFL_MHCCP ? "mhdir" : "alpha", b->cfl_type == CFL_MHCCP ? 
b->cfl_mh_dir : b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng); } } } if (has_luma) { b->pal_sz = 0; if (f->frame_hdr->allow_screen_content_tools && b->y_mode == DC_PRED && imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4) { const int use_y_pal = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_y); if (use_y_pal) { f->bd_fn.read_pal_plane(DB_ONLY(depth) t, b, bx4, by4); } else DEBUG_BLOCK_printf("%*sPost-ypal[0]: r=%d\n", depth, "", ts->msac.rng); } b->dip = 0; if (b->y_mode == DC_PRED && f->seq_hdr->intra_dip && !b->pal_sz && imin(bw4, bh4) >= 2 && bw4 * bh4 >= 8) { const int ctx = (boff[0] == -1 ? 0 : nb[0]->dip[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->dip[boff[1]]); b->dip = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.dip[ctx]); if (b->dip) { const int tp = dav2d_msac_decode_bool_bypass(&ts->msac); const int m = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.dip_mode, 5); b->dip = (tp << 4) | (m + 1); } DEBUG_BLOCK_printf("%*sPost-dip[ctx=%d,%d,tp=%d,mode=%d]: r=%d\n", depth, "", ctx, !!b->dip, b->dip >> 4, (b->dip - !!b->dip) & 7, ts->msac.rng); } if (b->pal_sz) { uint8_t *pal_idx; if (f->c->task_thread.n_passes > 1) { const int p = !!(t->task_thread.pass & PASS_ENTROPY); assert(ts->frame_thread[p].pal_idx); pal_idx = ts->frame_thread[p].pal_idx; ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; } else pal_idx = t->scratch.pal_idx_y; read_pal_indices(t, pal_idx, b->pal_sz, (int[4]) { w4 * 4, h4 * 4, bw4 * 4, bh4 * 4 }); DEBUG_BLOCK_printf("%*sPost-y-pal-indices: r=%d\n", depth, "", ts->msac.rng); } read_tx_part(t, DB_ONLY(depth) b, bs); } // reconstruction if (has_luma) { b->is_sm[0].a = sm_flag(t->a, bx4); b->is_sm[0].l = sm_flag(&t->l, by4); } if (has_chroma) { b->is_sm[1].a = sm_uv_flag(t->a, cbx4); b->is_sm[1].l = sm_uv_flag(&t->l, cby4); } if (t->task_thread.pass == PASS_ENTROPY) { f->bd_fn.read_coef_blocks(t, DB_ONLY(depth) lbs, cbs, b); } else { const int res = recon_b(t, DB_ONLY(depth) lbs, cbs, b); if (res < 0) return res; } if (has_luma) { // 
update contexts BlockContext *edge = t->a; for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { #define set_ctx(rep_macro) \ rep_macro(edge->fsc, off, b->fsc); \ rep_macro(edge->mode, off, b->y_mode); \ rep_macro(edge->midx, off, midx); \ rep_macro(edge->mrl, off, !!b->mrl_index); \ rep_macro(edge->multi_mrl, off, b->multi_mrl); \ rep_macro(edge->dip, off, !!b->dip); \ rep_macro(edge->pal_sz, off, b->pal_sz); \ rep_macro(edge->seg_pred, off, seg_pred); \ rep_macro(edge->skip_mode, off, 0); \ rep_macro(edge->intra, off, 1); \ rep_macro(edge->intrabc, off, 0); \ rep_macro(edge->morph_pred, off, 0); \ rep_macro(edge->skip_txfm, off, b->skip_txfm); \ if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \ rep_macro(edge->amvd, off, 0); \ rep_macro(edge->mvprec, off, 0); \ rep_macro(edge->motion_mode, off, 0); \ rep_macro(edge->comp_type, off, COMP_INTER_NONE); \ rep_macro(edge->ref[0], off, ((uint8_t) -1)); \ rep_macro(edge->ref[1], off, ((uint8_t) -1)); \ } case_set(b_dim[2 + i]); #undef set_ctx } } if (has_luma && b->pal_sz) f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4); if (has_chroma) { uint8_t uv_mode = b->uv_mode; dav2d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], uv_mode); dav2d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], uv_mode); } } else if (b->intrabc) { // intra block copy b->is_refmv = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc_mode); for (b->drl_idx[0] = 0; b->drl_idx[0] < f->frame_hdr->max_bvp_drl_bits; b->drl_idx[0]++) { if (!dav2d_msac_decode_bool_bypass(&ts->msac)) break; } b->ref.pair = -1; b->is_qpel = !f->frame_hdr->force_integer_mv; if (!b->is_refmv && !f->frame_hdr->force_integer_mv) { b->is_qpel = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc_precision); } if (!b->is_refmv) { read_mv_residual(ts, &ts->cdf.dmv, &b->mv[0], 3 + 2 * b->is_qpel); if (b->mv[0].y) { const int s = dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[0].y = -b->mv[0].y; } if (b->mv[0].x) { const int s = 
dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[0].x = -b->mv[0].x; } DEBUG_BLOCK_printf("%*sPost-mvdiff[y:%d,x:%d]: r=%d\n", depth, "", b->mv[0].y, b->mv[0].x, ts->msac.rng); } int morphctx = -1; b->morph_pred = 0; if (!(f->frame_hdr->frame_type & 1) && f->seq_hdr->bawp && f->frame_hdr->allow_screen_content_tools) { morphctx = (boff[0] == -1 ? 0 : nb[0]->morph_pred[boff[0]]) + (boff[1] == -1 ? 0 : nb[1]->morph_pred[boff[1]]); b->morph_pred = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.morph_pred[morphctx]); } DEBUG_BLOCK_printf("%*sPost-intrabc_info[mode=%d,drl=%d," "prec=%d,morphctx=%d,morph=%d]: r=%d\n", depth, "", b->is_refmv, b->drl_idx[0], b->is_qpel, morphctx, b->morph_pred, ts->msac.rng); read_tx_part(t, DB_ONLY(depth) b, bs); // reconstruction if (t->task_thread.pass == PASS_ENTROPY) { f->bd_fn.read_coef_blocks(t, DB_ONLY(depth) lbs, cbs, b); b->filter = DAV2D_FILTER_BILINEAR; } else { const int res = recon_b(t, DB_ONLY(depth) lbs, cbs, b); if (res < 0) return res; } if (has_luma) { BlockContext *edge = t->a; for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { #define set_ctx(rep_macro) \ rep_macro(edge->fsc, off, 0); \ rep_macro(edge->mode, off, DC_PRED); \ rep_macro(edge->midx, off, 0xff); \ rep_macro(edge->mrl, off, 0); \ rep_macro(edge->multi_mrl, off, 0); \ rep_macro(edge->dip, off, 0); \ rep_macro(edge->pal_sz, off, 0); \ rep_macro(edge->seg_pred, off, seg_pred); \ rep_macro(edge->skip_mode, off, 0); \ rep_macro(edge->intrabc, off, 1); \ rep_macro(edge->morph_pred, off, b->morph_pred); \ rep_macro(edge->intra, off, 1); \ rep_macro(edge->skip_txfm, off, b->skip_txfm); \ if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \ rep_macro(edge->amvd, off, 0); \ rep_macro(edge->mvprec, off, 0); \ rep_macro(edge->comp_type, off, COMP_INTER_NONE); \ rep_macro(edge->motion_mode, off, 0); \ rep_macro(edge->ref[0], off, ((uint8_t) -1)); \ rep_macro(edge->ref[1], off, ((uint8_t) -1)); \ } case_set(b_dim[2 + i]); #undef set_ctx } } if (has_chroma) { 
dav2d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED); dav2d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED); } } else { // inter-specific mode/mv coding int is_comp, has_subpel_filter, is_tip = 0; if (!b->skip_mode && f->frame_hdr->tip.frame_mode && cbs == lbs && imax(bw4, bh4) >= 2) { const int ctx = (idx < 1 ? 0 : nx[0]->ref[0][xoff[0]] == TIP_FRAME) + (idx < 2 ? 0 : nx[1]->ref[0][xoff[1]] == TIP_FRAME); is_tip = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.tip[ctx]); DEBUG_BLOCK_printf("%*sPost-tip[ctx=%d,%d]: r=%d\n", depth, "", ctx, is_tip, ts->msac.rng); } if (b->skip_mode) { is_comp = 1; } else if (!is_tip && !((f->frame_hdr->segmentation.d.globalmv_mask | f->frame_hdr->segmentation.d.skip_mask) & (1 << b->seg_id)) && f->frame_hdr->switchable_comp_refs && bw4 * bh4 >= 4) { const int ctx = get_comp_ctx(nx, xoff, idx, &f->refdir_with_intra[1]); is_comp = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp[ctx]); } else { is_comp = 0; } static const uint8_t mv_prec_tbl[][3] = { { 3, 1, 0 }, { 4, 3, 1 }, }; int mvprec_def = 1; b->amvd = 0; b->motion_mode = MM_TRANSLATION; b->refine_mv = 0; if (b->skip_mode) { const int max_drl_bits = f->frame_hdr->max_drl_bits; b->drl_idx[0] = 0; for (int ctx = 0; b->drl_idx[0] < max_drl_bits; b->drl_idx[0]++, ctx += ctx < 2) { if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip_mode_drl_idx[ctx])) { break; } } DEBUG_BLOCK_printf("%*sPost-drl[%d,%d]: r=%d\n", depth, "", b->drl_idx[0], b->drl_idx[0], ts->msac.rng); b->ref.pair = f->skip_mode_refs.pair; for (int n = 0; n < idx; n++) { if (nx[n]->ref[0][xoff[n]] == TIP_FRAME) { b->ref.ref[0] = imin(f->rf.tip.ref.ref[0], f->rf.tip.ref.ref[1]); b->ref.ref[1] = imax(f->rf.tip.ref.ref[0], f->rf.tip.ref.ref[1]); break; } else if (nx[n]->ref[1][xoff[n]] != -1) { b->ref.ref[0] = nx[n]->ref[0][xoff[n]]; b->ref.ref[1] = nx[n]->ref[1][xoff[n]]; break; } else if (nx[n]->ref[0][xoff[n]] != -1) break; } b->comp_type = COMP_INTER_AVG; b->inter_mode = NEARMV_NEARMV; 
has_subpel_filter = 0; } else if (is_comp) { const int n_refs = f->frame_hdr->n_ref_frames; if (n_refs > 1) { const int same_refs = f->seq_hdr->num_same_ref_comp; int n = 0; uint8_t cnt[9] = { 0 }; if (idx > 0) { cnt[nx[0]->ref[0][xoff[0]] + 1]++; cnt[nx[0]->ref[1][xoff[0]] + 1]++; if (idx > 1) { cnt[nx[1]->ref[0][xoff[1]] + 1]++; cnt[nx[1]->ref[1][xoff[1]] + 1]++; } } int cnt_rem = idx * 2 - cnt[0] - cnt[8]; for (int i = 0, maybe_same_ref = !!same_refs, dir; i < n_refs + n - 2 + maybe_same_ref; i++) { int bit; const int cnt_cur = cnt[i + 1]; cnt_rem -= cnt_cur; if (!n && (i == 2 || (i >= n_refs - 2 && i + 1 >= same_refs))) { bit = 1; } else { const int ctx = iclip(cnt_cur - cnt_rem + 1, 0, 2); uint16_t *const cdf = n == 0 ? ts->cdf.m.comp0_ref[ctx][i] : ts->cdf.m.comp1_ref[ctx][dir ^ f->refdir[i]][i]; bit = dav2d_msac_decode_bool_adapt(&ts->msac, cdf); } if (bit) { b->ref.ref[n++] = i; if (n == 2) break; dir = f->refdir[i]; } if (maybe_same_ref) { assert(i < same_refs); maybe_same_ref = !bit && i + 1 < same_refs; if (bit) { i--; cnt_rem += cnt_cur; } } } if (n < 2) { b->ref.ref[1] = n_refs - 1; if (!n) b->ref.ref[0] = n_refs - 1 - (same_refs < n_refs); } } else { b->ref.pair = 0; } DEBUG_BLOCK_printf("%*sPost-ref[%d,%d]: r=%d\n", depth, "", b->ref.ref[0], b->ref.ref[1], ts->msac.rng); const int have_top_right = t->bx + bw4 <= ts->tiling.col_end; const int have_bottom_left = t->by + bh4 <= ts->tiling.row_end; const int comp_ctx = get_compref_ctx(t->a, &t->l, by4, bx4, have_top, have_left, have_top_right, have_bottom_left, b_dim, b->ref, f->rf.tip.ref); if (b->ref.ref[0] == b->ref.ref[1]) { b->inter_mode = NEARMV_NEARMV + dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.comp_mode_sameref[comp_ctx], 3); b->inter_mode += b->inter_mode > NEARMV_NEWMV; // skip newmv_nearmv } else { const int joint_ctx = f->refdist[b->ref.ref[0]] != -f->refdist[b->ref.ref[1]]; if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_mode_joint[joint_ctx])) { b->inter_mode = 
JOINT_NEWMV; } else { b->inter_mode = NEARMV_NEARMV + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.comp_mode[comp_ctx], 4); } } if (f->frame_hdr->opfl_refine_type == 1 /* switchable */ && b->inter_mode != GLOBALMV_GLOBALMV && imin(bw4, bh4) >= 2 && f->refdir[b->ref.ref[0]] != f->refdir[b->ref.ref[1]]) { const int ctx = b->inter_mode > NEARMV_NEARMV; if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.opfl[ctx])) b->inter_mode += 6 - (b->inter_mode >= GLOBALMV_GLOBALMV); } DEBUG_BLOCK_printf("%*sPost-comp_inter_mode[ctx=%d,%d]: r=%d\n", depth, "", comp_ctx, b->inter_mode, ts->msac.rng); #define NEWMV_MASK ((1 << NEARMV_NEWMV) | \ (1 << NEWMV_NEARMV) | \ (1 << NEWMV_NEWMV) | \ (1 << JOINT_NEWMV) | \ (1 << OPFL_NEARMV_NEWMV) | \ (1 << OPFL_NEWMV_NEARMV) | \ (1 << OPFL_NEWMV_NEWMV) | \ (1 << OPFL_JOINT_NEWMV)) const int is_newmv_mode = (1 << b->inter_mode) & NEWMV_MASK; #undef NEWMV_MASK if (f->seq_hdr->adaptive_mvd && is_newmv_mode) { static uint8_t amvd_mode_context[] = { [NEARMV_NEWMV - NEARMV_NEWMV] = 0, [NEWMV_NEARMV - NEARMV_NEWMV] = 1, [OPFL_NEARMV_NEWMV - NEARMV_NEWMV] = 2, [OPFL_NEWMV_NEARMV - NEARMV_NEWMV] = 3, [JOINT_NEWMV - NEARMV_NEWMV] = 5, [OPFL_JOINT_NEWMV - NEARMV_NEWMV] = 6, [NEWMV_NEWMV - NEARMV_NEWMV] = 7, [OPFL_NEWMV_NEWMV - NEARMV_NEWMV] = 8, }; const int mode_ctx = amvd_mode_context[b->inter_mode - NEARMV_NEWMV]; const int ctx = (nx[0]->ref[0][xoff[0]] == b->ref.ref[0] && nx[0]->amvd[xoff[0]]) + (nx[1]->ref[0][xoff[1]] == b->ref.ref[0] && nx[1]->amvd[xoff[1]]); b->amvd = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.amvd[mode_ctx][ctx]); DEBUG_BLOCK_printf("%*sPost-amvd[ctx=%d|%d,%d]: r=%d\n", depth, "", mode_ctx, ctx, b->amvd, ts->msac.rng); } int jmvd_scale_mode = 0; if (b->inter_mode == JOINT_NEWMV || b->inter_mode == OPFL_JOINT_NEWMV) { jmvd_scale_mode = b->amvd ? 
dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.jmvd_amvd_scale_mode, 2) : dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.jmvd_scale_mode, 4); DEBUG_BLOCK_printf("%*sPost-jmvd_scale_mode[%d]: r=%d\n", depth, "", jmvd_scale_mode, ts->msac.rng); } if (b->inter_mode == NEWMV_NEWMV && imin(bw4, bh4) > 1 && !f->frame_hdr->force_integer_mv && b->ref.ref[0] != b->ref.ref[1] && f->frame_hdr->opfl_refine_type != 2 /* always */ && f->frame_hdr->motion_modes & (1 << MM_WARP_CAUSAL)) { const int is_sb_boundary = !(t->by & (f->sb_step - 1)); const int ref1 = b->ref.ref[0], ref2 = b->ref.ref[1]; #define match_ref(dir, off, refidx) \ (t->dir ref[0][off] == refidx || t->dir ref[1][off] == refidx) #define match_refs(refidx) \ (match_ref(l., by4, refidx) || \ (t->by + bh4 <= ts->tiling.row_end && \ match_ref(l., by4 + bh4 - 1, refidx)) || \ (is_sb_boundary ? \ (match_ref(a_sb_cache., bx4 & ~1, refidx) || \ (((t->bx + bw4 - 2) & ~1) < ts->tiling.col_end && \ match_ref(a_sb_cache., (bx4 + bw4 - 2) & ~1, refidx))) : \ (match_ref(a->, bx4, refidx) || \ (t->bx + bw4 <= ts->tiling.col_end && \ match_ref(a->, bx4 + bw4 - 1, refidx))))) if (match_refs(ref1) && match_refs(ref2)) { #undef match_refs #undef match_ref const enum MotionMode x1 = boff[0] == -1 ? MM_TRANSLATION : nb[0]->motion_mode[boff[0]], x2 = boff[1] == -1 ? 
MM_TRANSLATION : nb[1]->motion_mode[boff[1]]; const int cs_ctx = (x1 >= MM_WARP_CAUSAL || x2 >= MM_WARP_CAUSAL) + (x1 == MM_WARP_CAUSAL) + (x2 == MM_WARP_CAUSAL); if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_causal[cs_ctx])) { b->motion_mode = MM_WARP_CAUSAL; } DEBUG_BLOCK_printf("%*sPost-comp_newmv_warp[%d]: r=%d\n", depth, "", b->motion_mode, ts->msac.rng); } } // drl memset(b->drl_idx, 0, sizeof(b->drl_idx)); if (b->inter_mode != GLOBALMV_GLOBALMV) { const int n_drls = 1 + (b->inter_mode <= NEARMV_NEWMV); const int max_drl_bits = f->frame_hdr->max_drl_bits; for (int r = 0, n = 0, ctx = 0; r < n_drls; r++) { for (; n < max_drl_bits; n++, ctx += ctx < 2) { if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.drl_idx[ctx][comp_ctx])) { break; } } b->drl_idx[r] = n; n = b->inter_mode == NEARMV_NEARMV && b->ref.ref[0] == b->ref.ref[1] ? b->drl_idx[0] + (b->drl_idx[0] < max_drl_bits) : 0; ctx = imin(n, 2); } if (n_drls == 1) b->drl_idx[1] = b->drl_idx[0]; DEBUG_BLOCK_printf("%*sPost-drl[%d,%d]: r=%d\n", depth, "", b->drl_idx[0], b->drl_idx[1], ts->msac.rng); } // mv precision b->mv_prec = 3 + f->frame_hdr->mv_precision; if (b->mv_prec > 3 && !b->amvd && f->seq_hdr->flex_mvres && is_newmv_mode) { const int mvprec1 = boff[0] == -1 ? 0 : nb[0]->mvprec[boff[0]]; const int mvprec2 = boff[1] == -1 ? 0 : nb[1]->mvprec[boff[1]]; const int ctx1 = (mvprec1 & 1) + (mvprec2 & 1); if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.mvprec_def[ctx1])) { const int ctx2 = (mvprec1 | mvprec2) >> 1; const int idx = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.mvprec_rem[ctx2][b->mv_prec - 4], 2); b->mv_prec = mv_prec_tbl[b->mv_prec == 6][idx]; mvprec_def = 2; } DEBUG_BLOCK_printf("%*sPost-mv_precision[ctx=%d|%d|%d,%d]: r=%d\n", depth, "", ctx1, mvprec_def == 1 ? -1 : (mvprec1 | mvprec2) >> 1, mvprec_def == 1 ? 
-1 : f->frame_hdr->mv_precision - 1, b->mv_prec, ts->msac.rng); } if (b->inter_mode != GLOBALMV_GLOBALMV) { int start = 0, end = 2; int refdist[2]; if (b->inter_mode == JOINT_NEWMV || b->inter_mode == OPFL_JOINT_NEWMV) { refdist[0] = f->absrefdist[b->ref.ref[0]]; refdist[1] = f->absrefdist[b->ref.ref[1]]; start = refdist[0] < refdist[1]; if (f->refdir[b->ref.ref[0]] ^ f->refdir[b->ref.ref[1]]) refdist[1] = -refdist[1]; end = start + 1; } enum InterPredMode m[2]; int n; int sum_mvd = 0, nnzc = 0; ZERO2MV(&b->mv); for (n = start; n < end; n++) { m[n] = dav2d_comp_inter_pred_modes[b->inter_mode - NEARMV_NEARMV][n]; if (m[n] != NEWMV) continue; if (b->amvd) { read_amvd(ts, &b->mv[n]); // nnzc remains zero if amvd=1, so mvd_sign_derive=>0 } else { read_mv_residual(ts, &ts->cdf.mv, &b->mv[n], b->mv_prec); sum_mvd += b->mv[n].y + b->mv[n].x; nnzc += !!b->mv[n].y + !!b->mv[n].x; } } if (b->inter_mode != NEARMV_NEARMV && b->inter_mode != OPFL_NEARMV_NEARMV) { #define BIDIR_NEWMV_MASK ((1 << NEWMV_NEWMV) | \ (1 << OPFL_NEWMV_NEWMV) | \ (1 << JOINT_NEWMV) | \ (1 << OPFL_JOINT_NEWMV)) if (!f->seq_hdr->mvd_sign_derive || b->drl_idx[0] || b->drl_idx[1] || nnzc < 3 * (end - start) - 2 || f->frame_hdr->allow_screen_content_tools || f->frame_hdr->mv_precision == 3 || b->mv_prec >= 5 || !((1 << b->inter_mode) & BIDIR_NEWMV_MASK) || b->motion_mode != MM_TRANSLATION) { // this means nnzc2 never reaches nnzc below, so the // sign-derive condition is never invoked nnzc = 5; } #undef BIDIR_NEWMV_MASK sum_mvd >>= (6 - b->mv_prec); int nnzc2 = 0; for (n = start; n < end; n++) { if (m[n] != NEWMV) continue; if (b->mv[n].y) { const int s = ++nnzc2 == nnzc ? sum_mvd & 1 : dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[n].y = -b->mv[n].y; } if (b->mv[n].x) { const int s = ++nnzc2 == nnzc ? 
sum_mvd & 1 : dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[n].x = -b->mv[n].x; } DEBUG_BLOCK_printf("%*sPost-mvdiff[%d,y:%d,x:%d]: r=%d\n", depth, "", n, b->mv[n].y, b->mv[n].x, ts->msac.rng); } if (b->inter_mode == JOINT_NEWMV || b->inter_mode == OPFL_JOINT_NEWMV) { n &= 1; // "the one not handled above" b->mv_prec = (6 << (n * 4)) | (b->mv_prec << (!n * 4)); b->mv[n] = dav2d_mv_projection(b->mv[!n], refdist[1], refdist[0], -0xffff, 0xffff); jmvd_scale(&b->mv[n], b->amvd, jmvd_scale_mode); } else { b->mv_prec *= 0x11; } } } if (f->seq_hdr->refine_mv && imin(bw4, bh4) >= 2 && bw4 * bh4 > 4 && b->inter_mode != GLOBALMV_GLOBALMV && f->refdist[b->ref.ref[0]] == -f->refdist[b->ref.ref[1]] && !f->svc[b->ref.ref[0]][0].scale && !f->svc[b->ref.ref[1]][0].scale && (f->frame_hdr->opfl_refine_type != 1 /* switchable */ || !((1 << b->inter_mode) & ((1 << NEARMV_NEWMV) | (1 << NEWMV_NEARMV) | (1 << NEWMV_NEWMV) | (1 << JOINT_NEWMV))))) { if ((1 << b->inter_mode) & ((1 << NEARMV_NEARMV) | (1 << OPFL_NEARMV_NEARMV) | (1 << OPFL_JOINT_NEWMV))) { b->refine_mv = 2; // implicitly enabled } else { const int ctx = b->inter_mode - NEARMV_NEARMV; b->refine_mv = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.refine_mv[ctx]); DEBUG_BLOCK_printf("%*sPost-refinemv[ctx=%d,%d]: r=%d\n", depth, "", ctx + 1, b->refine_mv, ts->msac.rng); } } has_subpel_filter = b->inter_mode <= JOINT_NEWMV /* no opfl */ && !b->refine_mv && b->motion_mode == MM_TRANSLATION && (b->inter_mode != GLOBALMV_GLOBALMV || imin(bw4, bh4) == 1); b->comp_type = COMP_INTER_AVG; if (b->inter_mode <= JOINT_NEWMV /* no opfl */ && b->refine_mv != 1 /* disabled, or implicitly enabled */ && !(b->inter_mode == JOINT_NEWMV && b->amvd) && f->seq_hdr->masked_compound && imin(bw4, bh4) >= 2) { const int ffr = f->furthest_future_refidx; #define comptype_ctx(num) \ num >= idx ? 0 : nx[num]->ref[1][xoff[num]] != -1 ? 
\ nx[num]->comp_type[xoff[num]] > COMP_INTER_AVG : \ (nx[num]->ref[0][xoff[num]] == ffr) * 2 const int cctx0 = comptype_ctx(0), cctx1 = comptype_ctx(1); #undef comptype_ctx const int ctx = cctx0 + cctx1 + (cctx0 && cctx1) + (f->absrefdist[b->ref.ref[0]] == f->absrefdist[b->ref.ref[1]]) * 6; const int has_mask = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_type_masked[ctx]); if (has_mask) { if (imax(bw4, bh4) <= 16 && !dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_type_weighted)) { b->comp_type = COMP_INTER_WEDGE; b->wedge_idx = read_wedge_idx(ts); b->wedge_sign = dav2d_msac_decode_bool_bypass(&ts->msac); } else { b->comp_type = COMP_INTER_SEG; b->mask_sign = dav2d_msac_decode_bool_bypass(&ts->msac); } } DEBUG_BLOCK_printf("%*sPost-comp_inter_type[ctx=%d,%d,%c=%d|%d]: r=%d\n", depth, "", ctx, b->comp_type - 1, "?wm"[b->comp_type - 1], b->comp_type == COMP_INTER_AVG ? -1 : b->comp_type == COMP_INTER_WEDGE ? b->wedge_idx : b->mask_sign, b->comp_type == COMP_INTER_WEDGE ? 
b->wedge_sign : -1, ts->msac.rng); } b->cwp_idx = 8; if (!b->refine_mv && !jmvd_scale_mode && f->seq_hdr->cwp && b->comp_type == COMP_INTER_AVG && (b->inter_mode == NEARMV_NEARMV || b->inter_mode == JOINT_NEWMV)) { int n; for (n = 0; n < 4; n++) { if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.cwp_idx[n])) { break; } } static const int8_t cwp_weighting_factor[2][5] = { { 8, 12, 4, 10, 6 }, { 8, 12, 4, 20, -4 }, }; b->cwp_idx = cwp_weighting_factor[!(f->refdir[b->ref.ref[0]] ^ f->refdir[b->ref.ref[1]])][n]; DEBUG_BLOCK_printf("%*sPost-compweightpred_idx[%d]: r=%d\n", depth, "", b->cwp_idx, ts->msac.rng); } } else { b->comp_type = COMP_INTER_NONE; // ref if ((f->frame_hdr->segmentation.d.globalmv_mask | f->frame_hdr->segmentation.d.skip_mask) & (1 << b->seg_id)) { b->ref.ref[0] = 0; } else { if (is_tip) { b->ref.ref[0] = TIP_FRAME; b->cwp_idx = dav2d_tip_wts[f->frame_hdr->tip.global_wtd_idx]; } else { const int n_refs = f->frame_hdr->n_ref_frames; int i = 0; if (n_refs > 1) { uint8_t cnt[9] = { 0 }; if (idx > 0) { cnt[nx[0]->ref[0][xoff[0]] + 1]++; cnt[nx[0]->ref[1][xoff[0]] + 1]++; if (idx > 1) { cnt[nx[1]->ref[0][xoff[1]] + 1]++; cnt[nx[1]->ref[1][xoff[1]] + 1]++; } } int cnt_rem = idx * 2 - cnt[0] - cnt[8]; do { const int cnt_cur = cnt[i + 1]; cnt_rem -= cnt_cur; const int ctx = iclip(cnt_cur - cnt_rem + 1, 0, 2); if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.single_ref[ctx][i])) { break; } } while (++i < n_refs - 1); } b->ref.ref[0] = i; } DEBUG_BLOCK_printf("%*sPost-ref[%d,-1]: r=%d\n", depth, "", b->ref.ref[0], ts->msac.rng); } b->ref.ref[1] = -1; const int have_top_right = t->bx + bw4 <= ts->tiling.col_end; const int have_bottom_left = t->by + bh4 <= ts->tiling.row_end; const int sngl_ctx = get_snglref_ctx(t->a, &t->l, by4, bx4, have_top, have_left, have_top_right, have_bottom_left, b_dim, b->ref.ref[0]); const int is_sb_boundary = !(t->by & (f->sb_step - 1)); if ((f->frame_hdr->segmentation.d.globalmv_mask | 
f->frame_hdr->segmentation.d.skip_mask) & (1 << b->seg_id)) { b->inter_mode = GLOBALMV; } else if (is_tip) { b->inter_mode = NEARMV + 2 * dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.tip_mode); DEBUG_BLOCK_printf("%*sPost-tip_mode[%d]: r=%d\n", depth, "", b->inter_mode, ts->msac.rng); } else { int allow_warp = 0; if (imin(bw4, bh4) >= 2 && f->frame_hdr->warp_motion) { const int ctx = get_warp_ctx(t->a, &t->a_sb_cache, &t->l, by4, bx4, have_top, have_left, is_sb_boundary ? ((t->bx + bw4 - 2) & ~1) < ts->tiling.col_end : have_top_right, have_bottom_left, is_sb_boundary, b_dim, b->ref.ref[0]); allow_warp = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp[ctx]); } if (allow_warp) { b->inter_mode = !f->frame_hdr->force_integer_mv && !dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_newmv) ? WARPNEWMV : WARPMV; } else { b->inter_mode = NEARMV + dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.inter_mode[sngl_ctx], 2); } DEBUG_BLOCK_printf("%*sPost-single_inter_mode[ctx=%d,%d]: r=%d\n", depth, "", sngl_ctx, b->inter_mode, ts->msac.rng); } if (f->seq_hdr->adaptive_mvd && b->inter_mode == NEWMV) { const int ctx = (nx[0]->ref[0][xoff[0]] == b->ref.ref[0] && nx[0]->amvd[xoff[0]]) + (nx[1]->ref[0][xoff[1]] == b->ref.ref[0] && nx[1]->amvd[xoff[1]]); b->amvd = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.amvd[4][ctx]); DEBUG_BLOCK_printf("%*sPost-amvd[ctx=4|%d,%d]: r=%d\n", depth, "", ctx, b->amvd, ts->msac.rng); } b->warp_ref_idx = 0; b->warpmv_with_mvd = 0; b->bawp[0] = b->bawp[1] = 0; if (is_tip) { /* do nothing */ } else if (b->inter_mode <= NEWMV) { // block-adaptive weighted prediction if (f->frame_hdr->bawp && b->inter_mode != GLOBALMV && imin(bw4, bh4) >= 2 && !f->svc[b->ref.ref[0]][0].scale) { b->bawp[0] = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.bawp[0]); if (b->bawp[0]) { const int ctx = b->inter_mode == NEWMV ? 
2 - b->amvd : 0; b->bawp[0] += dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.bawp_explicit[ctx]); if (b->bawp[0] == 2) { b->bawp[0] += dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.bawp_explicit_scale); b->bawp[0] |= ctx << 2; } if (has_chroma) b->bawp[1] = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.bawp[1]); } DEBUG_BLOCK_printf("%*sPost-bawp[%d,%d]: r=%d\n", depth, "", b->bawp[0] & 3, b->bawp[1], ts->msac.rng); } // inter-intra (motion-mode) if (f->frame_hdr->motion_modes & (1 << MM_INTERINTRA) && !b->bawp[0] && bw4 * bh4 > 2 && imax(bw4, bh4) <= 16 && b->inter_mode >= NEARMV && b->inter_mode <= NEWMV) { const int ctx = size_group_lookup[bs]; if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra[ctx])) { b->motion_mode = MM_INTERINTRA; b->interintra_mode = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.interintra_mode[ctx], 3); b->wedge_idx = -1; if (imin(bw4, bh4) > 1 && dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra_wedge)) { b->wedge_idx = read_wedge_idx(ts); } } DEBUG_BLOCK_printf("%*sPost-interintra[%d,%d,%d]: r=%d\n", depth, "", b->motion_mode, b->motion_mode ? b->interintra_mode : -1, b->motion_mode ? b->wedge_idx : -1, ts->msac.rng); } } else { // motion mode b->motion_mode = MM_WARP_DELTA; if (b->inter_mode == WARPNEWMV) { const int ref = b->ref.ref[0]; #define match_ref(dir, off) \ (t->dir ref[0][off] == ref || t->dir ref[1][off] == ref) const int has_cs_ext = match_ref(l., by4) || (t->by + bh4 <= ts->tiling.row_end && match_ref(l., by4 + bh4 - 1)) || (is_sb_boundary ? (match_ref(a_sb_cache., bx4 & ~1) || (((t->bx + bw4 - 2) & ~1) < ts->tiling.col_end && match_ref(a_sb_cache., (bx4 + bw4 - 2) & ~1))) : (match_ref(a->, bx4) || (t->bx + bw4 <= ts->tiling.col_end && match_ref(a->, bx4 + bw4 - 1)))); #undef match_ref b->motion_mode = MM_WARP_DELTA; if (has_cs_ext) { const enum MotionMode x1 = boff[0] == -1 ? MM_TRANSLATION : nb[0]->motion_mode[boff[0]], x2 = boff[1] == -1 ? 
MM_TRANSLATION : nb[1]->motion_mode[boff[1]]; const int ext_ctx = (x1 >= MM_WARP_CAUSAL) + (x2 >= MM_WARP_CAUSAL); const unsigned mm = f->frame_hdr->motion_modes; if (mm & (1 << MM_WARP_EXTEND) && dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_extend[ext_ctx])) { b->motion_mode = MM_WARP_EXTEND; } else if ((mm & (3 << MM_WARP_CAUSAL)) == (3 << MM_WARP_CAUSAL)) { const int cs_ctx = (ext_ctx > 0) + (x1 == MM_WARP_CAUSAL) + (x2 == MM_WARP_CAUSAL); b->motion_mode = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_causal[cs_ctx]) ? MM_WARP_CAUSAL : MM_WARP_DELTA; } else { b->motion_mode = mm & (1 << MM_WARP_CAUSAL) ? MM_WARP_CAUSAL : MM_WARP_DELTA; } DEBUG_BLOCK_printf("%*sPost-sngl_newmv_warp[%d]: r=%d\n", depth, "", b->motion_mode, ts->msac.rng); } } if (b->motion_mode == MM_WARP_DELTA) { for (; b->warp_ref_idx < 3; b->warp_ref_idx++) { if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_ref_idx[b->warp_ref_idx])) { break; } } DEBUG_BLOCK_printf("%*sPost-warp_ref_idx[%d/%d]: r=%d\n", depth, "", b->warp_ref_idx, 4, ts->msac.rng); } if (b->inter_mode == WARPMV && b->warp_ref_idx < 2) { b->warpmv_with_mvd = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warpmv_with_mvd); DEBUG_BLOCK_printf("%*sPost-warpmv_with_mvd[%d]: r=%d\n", depth, "", b->warpmv_with_mvd, ts->msac.rng); } } // drl b->drl_idx[0] = 0; if (b->inter_mode != WARPMV && b->inter_mode != GLOBALMV) { const int max_drl_bits = f->frame_hdr->max_drl_bits; int n = 0; for (int ctx = 0; n < max_drl_bits; n++, ctx += ctx < 2) { if (!dav2d_msac_decode_bool_adapt(&ts->msac, is_tip ? ts->cdf.m.tip_drl_idx[ctx] : ts->cdf.m.drl_idx[ctx][sngl_ctx])) { break; } } b->drl_idx[0] = n; DEBUG_BLOCK_printf("%*sPost-drl[%d,-1]: r=%d\n", depth, "", b->drl_idx[0], ts->msac.rng); } // mv precision b->mv_prec = 3 + f->frame_hdr->mv_precision; if (b->mv_prec > 3 && !b->amvd && f->seq_hdr->flex_mvres && (b->inter_mode == NEWMV || b->inter_mode == WARPNEWMV)) { const int mvprec1 = boff[0] == -1 ? 
0 : nb[0]->mvprec[boff[0]]; const int mvprec2 = boff[1] == -1 ? 0 : nb[1]->mvprec[boff[1]]; const int ctx1 = (mvprec1 & 1) + (mvprec2 & 1); if (!dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.mvprec_def[ctx1])) { const int ctx2 = (mvprec1 | mvprec2) >> 1; const int idx = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.mvprec_rem[ctx2][b->mv_prec - 4], 2); b->mv_prec = mv_prec_tbl[b->mv_prec == 6][idx]; mvprec_def = 2; } DEBUG_BLOCK_printf("%*sPost-mv_precision[ctx=%d|%d|%d,%d]: r=%d\n", depth, "", ctx1, mvprec_def == 1 ? -1 : (mvprec1 | mvprec2) >> 1, mvprec_def == 1 ? -1 : f->frame_hdr->mv_precision - 1, b->mv_prec, ts->msac.rng); } if (b->inter_mode == NEWMV || b->inter_mode == WARPNEWMV || (b->inter_mode == WARPMV && b->warpmv_with_mvd)) { int nnzc, nnzc2 = 0, sum_mvd; if (b->amvd) { read_amvd(ts, &b->mv[0]); nnzc = 3; // see comment a few lines down } else { read_mv_residual(ts, &ts->cdf.mv, &b->mv[0], b->mv_prec); nnzc = !!b->mv[0].x + !!b->mv[0].y; sum_mvd = (b->mv[0].x + b->mv[0].y) >> (6 - b->mv_prec); if (b->inter_mode == WARPMV || !nnzc || !f->seq_hdr->mvd_sign_derive || b->motion_mode != MM_TRANSLATION || f->frame_hdr->allow_screen_content_tools || f->frame_hdr->mv_precision == 3 || b->mv_prec >= 5) { // this means nnzc2 never reaches nnzc below, so the // sign-derive condition is never invoked nnzc = 3; } } if (b->mv[0].y) { const int s = ++nnzc2 == nnzc ? sum_mvd & 1 : dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[0].y = -b->mv[0].y; } if (b->mv[0].x) { const int s = ++nnzc2 == nnzc ? 
sum_mvd & 1 : dav2d_msac_decode_bool_bypass(&ts->msac); if (s) b->mv[0].x = -b->mv[0].x; } DEBUG_BLOCK_printf("%*sPost-mvdiff[%d,y:%d,x:%d]: r=%d\n", depth, "", 0, b->mv[0].y, b->mv[0].x, ts->msac.rng); } if (b->inter_mode == WARPNEWMV && b->motion_mode == MM_WARP_DELTA && ((f->seq_hdr->six_param_warp_delta && b->warp_ref_idx == 1) || b->warp_ref_idx == 0)) { const int prec = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_delta_prec[bs]); const int np = f->seq_hdr->six_param_warp_delta && b->warp_ref_idx == 1 ? 4 : 2; const int step = 2 >> prec; for (int n = 0; n < np; n++) { const int ctx = n - 1U > 1U; b->matrix[n] = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.warp_delta_param[0][!ctx], 7); if (b->matrix[n] == 7 && prec) b->matrix[n] += dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.warp_delta_param[1][!ctx], 7); if (b->matrix[n]) { const int sign = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_delta_sign); if (sign) b->matrix[n] = -b->matrix[n]; b->matrix[n] *= step; } } if (np == 2) b->matrix[2] = -0x80; // "invalid" DEBUG_BLOCK_printf("%*sPost-warp_param_signal[%d,%d,%d,%d]: r=%d\n", depth, "", b->matrix[0] >> !prec, b->matrix[1] >> !prec, (np == 4) ? b->matrix[2] >> !prec : 0, (np == 4) ? b->matrix[3] >> !prec : 0, ts->msac.rng); } else if (b->motion_mode == MM_WARP_DELTA) { memset(b->matrix, 0, sizeof(b->matrix)); } b->warp_ii = 0; if (b->inter_mode == WARPMV && imin(bw4, bh4) >= 2 && imax(bw4, bh4) <= 16) { const int ctx = size_group_lookup[bs]; if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.warp_interintra[ctx])) { b->warp_ii = 1; b->interintra_mode = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.interintra_mode[ctx], 3); b->wedge_idx = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra_wedge) ? read_wedge_idx(ts) : -1; } DEBUG_BLOCK_printf("%*sPost-warp_ii[%d,%d,%d]: r=%d\n", depth, "", b->warp_ii, b->warp_ii ? b->interintra_mode : -1, b->warp_ii ? 
b->wedge_idx : -1, ts->msac.rng); } has_subpel_filter = !is_tip && b->inter_mode <= NEWMV && (b->inter_mode != GLOBALMV || imin(bw4, bh4) == 1); } // subpel filter if (b->skip_mode || b->ref.ref[0] == TIP_FRAME || b->refine_mv || b->inter_mode >= OPFL_NEARMV_NEARMV) { assert(!has_subpel_filter); b->filter = DAV2D_FILTER_8TAP_SHARP; } else if (f->frame_hdr->subpel_filter_mode == DAV2D_FILTER_SWITCHABLE) { if (has_subpel_filter) { const int ctx = get_filter_ctx(nb, boff, b->ref); b->filter = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.filter[ctx], DAV2D_N_SWITCHABLE_FILTERS - 1); DEBUG_BLOCK_printf("%*sPost-subpelfilter[ctx=%d,%d]: r=%d\n", depth, "", ctx, b->filter, ts->msac.rng); } else b->filter = DAV2D_FILTER_8TAP_REGULAR; } else { b->filter = f->frame_hdr->subpel_filter_mode; } read_tx_part(t, DB_ONLY(depth) b, bs); // reconstruction if (t->task_thread.pass == PASS_ENTROPY) { f->bd_fn.read_coef_blocks(t, DB_ONLY(depth) lbs, cbs, b); } else { const int res = recon_b(t, DB_ONLY(depth) lbs, cbs, b); if (res < 0) return res; } // context updates BlockContext *edge = t->a; for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) { #define set_ctx(rep_macro) \ rep_macro(edge->seg_pred, off, seg_pred); \ rep_macro(edge->skip_mode, off, b->skip_mode); \ rep_macro(edge->intra, off, 0); \ rep_macro(edge->intrabc, off, 0); \ rep_macro(edge->morph_pred, off, 0); \ rep_macro(edge->midx, off, 0xff); \ rep_macro(edge->fsc, off, 0); \ rep_macro(edge->skip_txfm, off, b->skip_txfm); \ rep_macro(edge->pal_sz, off, 0); \ rep_macro(edge->comp_type, off, b->comp_type); \ rep_macro(edge->filter, off, b->filter); \ rep_macro(edge->mode, off, b->inter_mode); \ rep_macro(edge->mrl, off, 0); \ rep_macro(edge->multi_mrl, off, 0); \ rep_macro(edge->dip, off, 0); \ rep_macro(edge->ref[0], off, ((uint8_t) b->ref.ref[0])); \ rep_macro(edge->ref[1], off, ((uint8_t) b->ref.ref[1])); \ rep_macro(edge->motion_mode, off, b->motion_mode); \ rep_macro(edge->amvd, off, b->amvd); \ 
rep_macro(edge->mvprec, off, mvprec_def) case_set(b_dim[2 + i]); #undef set_ctx } if (has_chroma) { dav2d_memset_pow2[ulog2(cbw4)](&t->a->uvmode[cbx4], DC_PRED); dav2d_memset_pow2[ulog2(cbh4)](&t->l.uvmode[cby4], DC_PRED); } } // update contexts if (f->frame_hdr->segmentation.enabled) { int seg_id = b->seg_id; if (has_luma) { const ptrdiff_t seg_stride = f->b4_stride; uint8_t *seg_ptr = &f->cur_segmap[t->by * seg_stride + t->bx]; #define set_ctx(rep_macro) \ for (int y = 0; y < bh4; y++) { \ rep_macro(seg_ptr, 0, seg_id); \ seg_ptr += seg_stride; \ } case_set(b_dim[2]); #undef set_ctx } if (has_chroma && (f->frame_hdr->deblock.level_u || f->frame_hdr->deblock.level_v)) { const ptrdiff_t seg_stride = f->lf.uv_segmap_stride; uint8_t *seg_ptr = &f->lf.segmap_uv[(t->cby >> ss_ver) * seg_stride + (t->cbx >> ss_hor)]; #define set_ctx(rep_macro) \ for (int y = 0; y < cbh4; y++) { \ rep_macro(seg_ptr, 0, seg_id); \ seg_ptr += seg_stride; \ } case_set(ulog2(cbw4)); #undef set_ctx } if (f->frame_hdr->segmentation.lossless[seg_id]) { if (has_luma) { uint16_t (*const lossless)[4] = &t->lf_mask->lossless_mask_y[by4]; const uint64_t mask = (~0ULL >> (64 - bw4)) << bx4; const unsigned mask1 = (unsigned) (mask & 0xffff); const unsigned mask2 = (unsigned) ((mask >> 16) & 0xffff); const unsigned mask3 = (unsigned) ((mask >> 32) & 0xffff); const unsigned mask4 = (unsigned) ((mask >> 48)); for (int y = 0; y < bh4; y++) { if (mask1) lossless[y][0] |= mask1; if (mask2) lossless[y][1] |= mask2; if (mask3) lossless[y][2] |= mask3; if (mask4) lossless[y][3] |= mask4; } } if (has_chroma) { uint16_t (*const lossless)[4] = &t->lf_mask->lossless_mask_uv[cby4]; const uint64_t mask = (~0ULL >> (64 - cbw4)) << cbx4; const uint64_t ss_mask = ss_hor ? 
0xff : 0xffff; const unsigned mask1 = (unsigned) (mask & ss_mask); const unsigned mask2 = (unsigned) ((mask >> (16 >> ss_hor)) & ss_mask); const unsigned mask3 = (unsigned) ((mask >> (32 >> ss_hor)) & ss_mask); const unsigned mask4 = (unsigned) ((mask >> (48 >> ss_hor)) & ss_mask); for (int y = 0; y < cbh4; y++) { if (mask1) lossless[y][0] |= mask1; if (mask2) lossless[y][1] |= mask2; if (mask3) lossless[y][2] |= mask3; if (mask4) lossless[y][3] |= mask4; } } } } if (f->frame_hdr->deblock.level_y[0] || f->frame_hdr->deblock.level_y[1]) { if (has_luma) { dav2d_create_db_mask(t->lf_mask->filter_y, b, lbs, t->bx, t->by, f->bw, f->bh, f->cur.p.p.layout, 0, &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4], f->frame_hdr, f->seq_hdr); } if (has_chroma && (f->frame_hdr->deblock.level_u || f->frame_hdr->deblock.level_v)) { dav2d_create_db_mask(t->lf_mask->filter_uv, b, cbs, t->cbx, t->cby, f->bw, f->bh, f->cur.p.p.layout, 1, &t->a->tx_lpf_uv[cbx4], &t->l.tx_lpf_uv[cby4], f->frame_hdr, f->seq_hdr); } } if (has_luma && !b->skip_txfm) { uint16_t (*noskip_mask)[4] = &t->lf_mask->noskip_mask[by4 >> 1]; const unsigned mask = (~0U >> imax(0, 32 - bw4)) << (bx4 & 15); const int bx_idx = (bx4 & 0x30) >> 4; for (int y = 0; y < bh4; y += 2, noskip_mask++) { (*noskip_mask)[bx_idx] |= mask; if (bw4 >= 32) { assert(mask == ~0U); (*noskip_mask)[bx_idx + 1] = mask; if (bw4 == 64) (*noskip_mask)[2] = (*noskip_mask)[3] = mask; } } } if (f->seq_hdr->sdp && f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400 && cbs == BS_INVALID) { const ptrdiff_t off = (t->by & 15) * 16 + (t->bx & 15); uint8_t *dirmap = &t->luma_intra_dir_mode_map[off]; uint8_t *fscmap = &t->luma_fsc_map[off]; const int bh4_max16 = imin(bh4, 16); #define set_ctx(rep_macro) \ for (int y = 0; y < bh4_max16; y++) { \ rep_macro(dirmap, 0, midx); \ rep_macro(fscmap, 0, b->fsc); \ dirmap += 16; \ fscmap += 16; \ } case_set(b_dim[2]); #undef set_ctx } return 0; } #if __has_feature(memory_sanitizer) #include static int 
checked_decode_b(Dav2dTaskContext *const t, DB_ONLY(const int depth) const enum BlockSize lbs, const enum BlockSize cbs) { const Dav2dFrameContext *const f = t->f; const int err = decode_b(t, DB_ONLY(depth) lbs, cbs); enum BlockSize bs[2] = { lbs, cbs }; if (err == 0 && t->task_thread.pass & PASS_RECON) for (int i = 0; i < 2; i++) { if (bs[i] == BS_INVALID) continue; const uint8_t *const b_dim = dav2d_block_dimensions[bs[i]]; const int bw4 = b_dim[0], bh4 = b_dim[1]; const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); const ptrdiff_t stride = f->cur.p.stride[i]; int bx = i ? t->cbx : t->bx, by = i ? t->cby : t->by; for (int p = i; p < 1 + i * 2; p++) { const int ss_ver = p && f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p && f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444; const int width = w4 << (2 - ss_hor + (bw4 == ss_hor)); const int height = h4 << (2 - ss_ver + (bh4 == ss_ver)); bx &= ~ss_hor; by &= ~ss_ver; const uint8_t *data = f->cur.p.data[p] + (by << (2 - ss_ver)) * stride + (bx << (2 - ss_hor + !!f->seq_hdr->hbd)); for (int y = 0; y < height; data += stride, y++) { const size_t line_sz = width << !!f->seq_hdr->hbd; if (__msan_test_shadow(data, line_sz) != -1) { fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n", p, bx, by, w4, h4, y); __msan_check_mem_is_initialized(data, line_sz); } } } } return err; } #define decode_b checked_decode_b #endif /* defined(__has_feature) */ // dir_ptr is a limited recursive partition tree indicator for (ext)sdp as well // as sdp/cfl delay limits. // For partitions with luma, the 24th bit indicates whether the parent partition // prohibits extsdp in child partitions. // Also for luma, the lower 24 bits are aggregated as the split-direction of // the child partitions (1: hor, 2: ver, 3: mixed, -1: none] in bit 0-7, the // next sub-partition (1: hor, 2: ver, 3: mixed, -1: none) in bit 16-23, and // the child's partition choices in bit 8-15. 
// Decode one node of the (recursive) block partition tree.
//
// The node covers luma block size `lbs` and chroma block size `cbs`; either
// may be BS_INVALID, in which case that plane type is not coded at this node
// (the effective size `bs` is then taken from the other one). In passes that
// include PASS_ENTROPY the partition type is read from the bitstream (or
// inferred at frame edges / from dir_ptr); otherwise it is replayed from the
// per-tile frame_thread partition buffer. PARTITION_NONE calls decode_b(),
// all other partition types recurse into decode_sb() for each child while
// temporarily advancing t->bx/t->by.
//
// Returns 0 on success, -1 as soon as a nested decode_sb()/decode_b() fails.
//
// dir_ptr is a limited recursive partition tree indicator for (ext)sdp as well
// as sdp/cfl delay limits.
// For partitions with luma, bit 24 (1 << 24) indicates whether the parent
// partition prohibits extsdp in child partitions.
// Also for luma, the lower 24 bits are aggregated as the split-direction of
// the child partitions (1: hor, 2: ver, 3: mixed, -1: none) in bit 0-7, the
// next sub-partition (1: hor, 2: ver, 3: mixed, -1: none) in bit 16-23, and
// the child's partition choices in bit 8-15.
// For chroma, these choices can then be used to infer the partition/direction.
// sdp/cfl delay restrictions can also be calculated using these values.
static int decode_sb(Dav2dTaskContext *const t, DB_ONLY(const int depth)
                     const enum BlockSize lbs, enum BlockSize cbs,
                     int *const dir_ptr)
{
    const enum BlockSize bs = lbs == BS_INVALID ? cbs : lbs;
    assert(bs != BS_INVALID);
    const Dav2dFrameContext *const f = t->f;
    Dav2dTileState *const ts = t->ts;
    // block width/height (the debug print below multiplies them by 4 to get
    // pixels), plus their halves (h*) and quarters (q*)
    const uint8_t *const b_dim = dav2d_block_dimensions[bs];
    const int bw4 = b_dim[0], bh4 = b_dim[1];
    const int hw4 = bw4 >> 1, hh4 = bh4 >> 1;
    const int qw4 = hw4 >> 1, qh4 = hh4 >> 1;
    // does the second half of the block still start inside the frame?
    const int have_h_split = f->bw > t->bx + hw4;
    const int have_v_split = f->bh > t->by + hh4;
    // cbs may be cleared below for intra regions; remember the caller's value
    const enum BlockSize cbs_orig = cbs;

    // key/intraonly frames always apply SDP at the 64x64 boundary
    // (luma tree first, then a separate chroma tree sharing the same `dir`)
    if (lbs == BS_64x64 && cbs == BS_64x64 && f->seq_hdr->sdp &&
        !(f->frame_hdr->frame_type & 1))
    {
        int dir = 0;
        if (decode_sb(t, DB_ONLY(depth) lbs, BS_INVALID, &dir))
            return -1;
        return decode_sb(t, DB_ONLY(depth) BS_INVALID, cbs, &dir);
    }

    DEBUG_BLOCK_printf("%*sdecode_sb[y=%d,x=%d,bs=%dx%d,plane=%s]: r=%d\n",
                       depth - 1, "", t->by, t->bx, bw4 * 4, bh4 * 4,
                       cbs == BS_INVALID ? "y" : lbs == BS_INVALID ? "uv" : "yuv",
                       ts->msac.rng);

    // Per block size: the child block size for each split direction and
    // split depth (-1 where no such child exists), and context offsets used
    // for the split (ctx[0]) and direction (ctx[1]) symbols.
    static const struct PartitionConstants {
        // FIXME part[0][split] and part[1][split] are identical, maybe
        // we can save a byte by merging these together
        int8_t part[2 /* h, v */][4 /* half, quarter, eighth, split */];
        int8_t ctx[2 /* _, direction */];
    } subb[] = {
        [BS_256x256] = { { { BS_256x128, -1, -1, BS_128x128 },
                           { BS_128x256, -1, -1, BS_128x128 } }, { 9, 12 }, },
        [BS_256x128] = { { { -1, -1, -1, -1 },
                           { BS_128x128, -1, -1, -1 } }, { 8, -1 /* 11 */ }, },
        [BS_128x256] = { { { BS_128x128, -1, -1, -1 },
                           { -1, -1, -1, -1 } }, { 7, -1 /* 10 */ }, },
        [BS_128x128] = { { { BS_128x64, -1, -1, BS_64x64 },
                           { BS_64x128, -1, -1, BS_64x64 } }, { 6, 9 }, },
        [BS_128x64] = { { { -1, -1, -1, -1 },
                          { BS_64x64, -1, -1, -1 } }, { 5, -1 /* 8 */ }, },
        [BS_64x128] = { { { BS_64x64, -1, -1, -1 },
                          { -1, -1, -1, -1 } }, { 4, -1 /* 7 */ }, },
        [BS_64x64] = { { { BS_64x32, BS_64x16, BS_64x8, BS_32x32 },
                         { BS_32x64, BS_16x64, BS_8x64, BS_32x32 } }, { 3, 6 }, },
        [BS_64x32] = { { { BS_64x16, BS_64x8, BS_64x4, BS_32x16 },
                         { BS_32x32, BS_16x32, BS_8x32, BS_32x16 } }, { 3, 5 }, },
        [BS_64x16] = { { { BS_64x8, BS_64x4, -1, BS_32x8 },
                         { BS_32x16, BS_16x16, BS_8x16, BS_32x8 } }, { 15, 14 }, },
        [BS_64x8] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, 0 }, },
        [BS_64x4] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, -1 }, },
        [BS_32x64] = { { { BS_32x32, BS_32x16, BS_32x8, BS_16x32 },
                         { BS_16x64, BS_8x64, BS_4x64, BS_16x32 } }, { 3, 4 }, },
        [BS_32x32] = { { { BS_32x16, BS_32x8, BS_32x4, BS_16x16 },
                         { BS_16x32, BS_8x32, BS_4x32, BS_16x16 } }, { 2, 3 }, },
        [BS_32x16] = { { { BS_32x8, BS_32x4, -1, BS_16x8 },
                         { BS_16x16, BS_8x16, BS_4x16, BS_16x8 } }, { 2, 2 }, },
        [BS_32x8] = { { { BS_32x4, -1, -1, -1 },
                        { BS_16x8, BS_8x8, BS_4x8, BS_16x4 } }, { 13, 14 }, },
        [BS_32x4] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, -1 }, },
        [BS_16x64] = { { { BS_16x32, BS_16x16, BS_16x8, BS_8x32 },
                         { BS_8x64, BS_4x64, -1, BS_8x32 } }, { 14, 13 }, },
        [BS_16x32] = { { { BS_16x16, BS_16x8, BS_16x4, BS_8x16 },
                         { BS_8x32, BS_4x32, -1, BS_8x16 } }, { 2, 1 }, },
        [BS_16x16] = { { { BS_16x8, BS_16x4, -1, BS_8x8 },
                         { BS_8x16, BS_4x16, -1, BS_8x8 } }, { 1, 0 }, },
        [BS_16x8] = { { { BS_16x4, -1, -1, -1 },
                        { BS_8x8, BS_4x8, -1, BS_8x4 } }, { 1, 2 }, },
        [BS_16x4] = { { { -1, -1, -1, -1 },
                        { BS_8x4, -1, -1, -1 } }, { 11, -1 }, },
        [BS_8x64] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, 0 }, },
        [BS_8x32] = { { { BS_8x16, BS_8x8, BS_8x4, BS_4x16 },
                        { BS_4x32, -1, -1, -1 } }, { 12, 13 }, },
        [BS_8x16] = { { { BS_8x8, BS_8x4, -1, BS_4x8 },
                        { BS_4x16, -1, -1, -1 } }, { 1, 1 }, },
        [BS_8x8] = { { { BS_8x4, -1, -1, -1 },
                       { BS_4x8, -1, -1, -1 } }, { 0, 0 }, },
        [BS_8x4] = { { { -1, -1, -1, -1 },
                       { BS_4x4, -1, -1, -1 } }, { 0, -1 }, },
        [BS_4x64] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, -1 }, },
        [BS_4x32] = { { { -1, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 0, -1 }, },
        [BS_4x16] = { { { BS_4x8, -1, -1, -1 },
                        { -1, -1, -1, -1 } }, { 10, -1 }, },
        [BS_4x8] = { { { BS_4x4, -1, -1, -1 },
                       { -1, -1, -1, -1 } }, { 0, -1 }, },
        [BS_4x4] = { { { -1, -1, -1, -1 },
                       { -1, -1, -1, -1 } }, { -1, -1 }, },
    };

    // pl: 1 when this node has no luma (chroma-only tree), used as CDF and
    // partition-context index
    const int pl = lbs == BS_INVALID;
    const struct PartitionConstants *const pcc = &subb[bs];
    enum BlockPartition bp = PARTITION_INVALID;
    // only assigned (and only used) in passes that include PASS_ENTROPY
    int bx4, by4;
    if (t->task_thread.pass & PASS_ENTROPY) {
        bx4 = t->bx & 63;
        by4 = t->by & 63;

        // FIXME some of the code below needs to be tested for 4:2:2 w/ SDP=1
        const int eff_ss_ver = f->ss_ver & (lbs == BS_INVALID);
        const int eff_ss_hor = f->ss_hor & (lbs == BS_INVALID);
        const int bwh4ss[2] = { bw4 >> eff_ss_hor, bh4 >> eff_ss_ver };
        assert(bwh4ss[0] >= 1 && bwh4ss[1] >= 1);
        int dir = -1;
        if (imax(bwh4ss[0], bwh4ss[1]) == 1 ||
            // 1:8/1:16 partitions don't recurse (normatively)
            (pcc->part[0][0] & pcc->part[1][0]) == -1)
        {
            bp = PARTITION_NONE;
        } else if (!have_h_split || !have_v_split) {
            // block crosses the frame edge: the partition may be implied
            if (bw4 == bh4) {
                dir = have_v_split;
                bp = !have_v_split ? PARTITION_H : PARTITION_V;
            } else if (bw4 > bh4) {
                if (!have_h_split || f->bh <= t->by + qh4) {
                    dir = 1;
                    bp = PARTITION_V;
                }
            } else if (bh4 > bw4) {
                if (!have_v_split || f->bw <= t->bx + qw4) {
                    dir = 0;
                    bp = PARTITION_H;
                }
            }
        }
        if (bp == PARTITION_INVALID) {
#if DEBUG_BLOCK_INFO
            if (0 && bs == f->root_bs)
                printf("poc=%d,y=%d,x=%d,bs=%d,r=%d\n",
                       f->frame_hdr->frame_offset, t->by, t->bx, bs, ts->msac.rng);
#endif
            if (cbs == BS_64x64 && lbs == BS_INVALID &&
                ((*dir_ptr & 0xff) == 0xff ||
                 (*dir_ptr & 0x30003) == 0x10002 ||
                 (*dir_ptr & 0x30003) == 0x20001))
            {
                // F164: infer SDP chroma partitioning at 64x64 level
                if ((*dir_ptr & 0xff) == 0xff) {
                    bp = PARTITION_NONE; // if luma did not split, don't split chroma
                } else {
                    // if luma split one way, and all children split another way,
                    // then copy the initial first-way split for chroma
                    dir = (*dir_ptr & 0x30003) == 0x10002;
                    bp = (*dir_ptr >> 8) & 0xff;
                }
            } else {
                const int mix_inter = IS_INTER_OR_SWITCH(f->frame_hdr) && !t->intra_region;
                const int ctx1 = get_partition_ctx(t->a, &t->l, b_dim, pl, by4, bx4);
                const int ctx2 = ctx1 + pcc->ctx[0] * 4;
                // cannot split 4x8/8x4 blocks in mixed-intra/inter regions of
                // inter frames, since 4x4 block sizes in such regions are invalid
                const int is_split = mix_inter && b_dim[2] + b_dim[3] == 1 ? 0 :
                    (!have_h_split || !have_v_split) ||
                    dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.part_split[pl][ctx2]);
                if (!is_split) {
                    bp = PARTITION_NONE;
                } else {
                    if ((bs == BS_128x128 || bs == BS_256x256) && have_v_split && have_h_split) {
                        assert(lbs == cbs || f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I400);
                        const int ctx3 = ctx1 + (bs == BS_256x256) * 4;
                        const int is_square =
                            dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.part_square[ctx3]);
                        if (is_square)
                            bp = PARTITION_SPLIT;
                    } else if (imax(bw4, bh4) >= 32) {
                        // large non-square block: the only split halves the longer side
                        assert(lbs == cbs || f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I400);
                        assert(bw4 != bh4);
                        bp = bw4 > bh4 ? PARTITION_V : PARTITION_H;
                    }
                    if (bp == PARTITION_INVALID) {
                        // split - find direction
                        const int aspect = 1 << f->seq_hdr->max_pb_aspect_ratio_log2;
                        assert(bw4 * aspect >= bh4 && bh4 * aspect >= bw4);
                        // would the halved block still satisfy the aspect limit?
                        const int v_aspect = bw4 * aspect >= bh4 * 2;
                        const int h_aspect = bh4 * aspect >= bw4 * 2;
                        assert(v_aspect || h_aspect);
                        if (imin(bwh4ss[0], bwh4ss[1]) == 1) {
                            dir = bwh4ss[0] > bwh4ss[1];
                        } else if (!(v_aspect && h_aspect)) {
                            dir = v_aspect;
                        } else {
                            const int ctx4 = ctx1 + pcc->ctx[1] * 4;
                            dir = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.part_dir[pl][ctx4]);
                        }
                        assert(pcc->part[dir][0] != -1);
                        bp = dir ? PARTITION_V : PARTITION_H;
                        if (imax(bw4, bh4) <= 16) {
                            // v3/h3 [ext-partition]
                            // FIXME do we need to keep track of mix_inter and
                            // resulting block sizes here to ensure we don't
                            // get 4x4 blocks?
                            const int bwh4ss2[2] = { bw4 >> f->ss_hor, bh4 >> f->ss_ver };
                            const int has_hv3 = f->seq_hdr->ext_partitions &&
                                bwh4ss[!dir] >= 4 && bwh4ss[dir] >= 2 &&
                                b_dim[!dir] * aspect >= b_dim[dir] * 4 &&
                                (cbs != lbs || (bwh4ss2[!dir] >= 4 && bwh4ss2[dir] >= 2) ||
                                 (dir ? (lbs == BS_32x8 ? have_v_split : t->bx + qw4 * 3 < f->bw) :
                                        (lbs == BS_8x32 ? have_h_split : t->by + qh4 * 3 < f->bh)));
                            const int has_hv4ab = bwh4ss[!dir] >= 8 &&
                                f->seq_hdr->uneven_4way_partitions &&
                                b_dim[!dir] * aspect >= b_dim[dir] * 8 &&
                                (cbs != lbs || bwh4ss2[!dir] >= 8 ||
                                 (dir ? (t->bx + (qw4 >> 1) * 7 < f->bw) :
                                        (t->by + (qh4 >> 1) * 7 < f->bh)));
                            if (has_hv3 || has_hv4ab) {
                                assert(pcc->part[dir][1] != -1);
                                const int ctx5 = get_partition2_ctx(t->a, &t->l, b_dim, pl, dir, by4, bx4);
                                const int ctx6 = ctx5 + pcc->ctx[0] * 4;
                                const int is_ext =
                                    dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.part_ext[pl][ctx6]);
                                if (is_ext) {
                                    bp = dir ? PARTITION_V3 : PARTITION_H3;
                                    if (has_hv4ab) {
                                        assert(pcc->part[dir][2] != -1);
                                        // 4-way is implied when the 3-way split is unavailable
                                        const int is_4way = !has_hv3 ||
                                            dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.part_4way[pl][ctx6]);
                                        if (is_4way) {
                                            const int is_a_or_b = dav2d_msac_decode_bool_bypass(&ts->msac);
                                            bp = PARTITION_H4A + dir * 2 + is_a_or_b;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
#if DEBUG_BLOCK_INFO
        static const char *const names[] = {
            [PARTITION_NONE]="none", [PARTITION_H4A]="h4a", [PARTITION_H4B]="h4b",
            [PARTITION_V4A]="v4a", [PARTITION_V4B]="v4b", [PARTITION_SPLIT]="s",
            [PARTITION_H]="h", [PARTITION_V]="v", [PARTITION_H3]="h3", [PARTITION_V3]="v3",
        };
        DEBUG_BLOCK_printf("%*sread_partition[y=%d,x=%d,bs=%dx%d,bp=%d|%s]: r=%d\n",
                           depth, "", t->by, t->bx, 4 * bw4, 4 * bh4, bp, names[bp], ts->msac.rng);
#endif
        dir += dir != -1; // -1 for PARTITION_NONE, 1 or 2 for hor/ver
        // F157 "limit SDP-imposed CfL delay"
        if (lbs == BS_INVALID && cbs == BS_64x64)
            t->sdp_cfl_disallowed = dir != -1 && dir != (*dir_ptr & 0x3);
        // publish this node's direction (bits 0-7) and partition (bits 8+)
        // to the caller
        *dir_ptr |= (uint8_t) dir | (bp << 8);

        int unmix_bit = 0;
        // NOTE(review): `(cbs | lbs) != BS_INVALID` presumably relies on
        // BS_INVALID being an all-ones value so the OR tests both at once -
        // confirm against the enum definition.
        if (IS_INTER_OR_SWITCH(f->frame_hdr) && f->seq_hdr->ext_sdp &&
            (cbs | lbs) != BS_INVALID && bp != PARTITION_NONE &&
            !(*dir_ptr & (1 << 24)) && // parent partition limits recursive extsdp
            bp < PARTITION_H4A && imin(bw4, bh4) >= 2 &&
            bs != f->root_bs && imax(bw4, bh4) <= 16)
        {
            const int sz = b_dim[2] + b_dim[3];
            const int ctx = iclip(sz - 4, 0, 3) + (sz == 4);
            t->intra_region = unmix_bit =
                !dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.region_type[ctx]);
            DEBUG_BLOCK_printf("%*sis_mixed_region[ctx=%d,%d]: r=%d\n",
                               depth, "", ctx, !t->intra_region, ts->msac.rng);
            // in an intra region the children are luma-only; chroma is
            // decoded once for the whole region at the end of this function
            if (t->intra_region)
                cbs = BS_INVALID;
        }
        // multi-pass decoding: store the partition (bit 7 = intra region
        // starts here) so later passes can replay it without entropy decoding
        if (f->c->task_thread.n_passes > 1)
            *ts->frame_thread[0].partition[0]++ = bp | (unmix_bit << 7);
    } else {
        // replay the partition recorded by the entropy pass
        bp = *ts->frame_thread[1].partition[t->task_thread.pass == PASS_MVRES]++;
        if (bp & 0x80) {
            assert(!t->intra_region);
            t->intra_region = 1;
            cbs = BS_INVALID;
            bp &= 0x7f;
        }
    }

    if (bs == cbs) {
        t->cbx = t->bx;
        t->cby = t->by;
    }

    // can child partitions split?
    // (if the block is not larger than these limits in either dimension,
    // bit 24 is set in the value handed to the children)
    static const uint8_t lim[][2] = {
        [PARTITION_NONE] = { 1, 1 },
        [PARTITION_H] = { 1, 2 },
        [PARTITION_V] = { 2, 1 },
        [PARTITION_H3] = { 2, 4 },
        [PARTITION_V3] = { 4, 2 },
        [PARTITION_H4A] = { 1, 8 },
        [PARTITION_H4B] = { 1, 8 },
        [PARTITION_V4A] = { 8, 1 },
        [PARTITION_V4B] = { 8, 1 },
        [PARTITION_SPLIT] = { 2, 2 },
    };
    const uint8_t *const l = lim[bp];
    int child_dir = (bw4 <= l[0] || bh4 <= l[1]) << 24;

    // In every case below t->bx/t->by are advanced to each child in turn and
    // restored before leaving; children starting outside the frame are
    // skipped.
    switch (bp) {
    case PARTITION_NONE:
        if (decode_b(t, DB_ONLY(depth + 1) lbs, cbs))
            return -1;
        if (t->task_thread.pass & PASS_ENTROPY) {
            // record the block size in the above/left partition contexts
            if ((cbs | lbs) != BS_INVALID) {
                BlockContext *edge = t->a;
#define set_ctx(rep_macro) \
    rep_macro(edge->partition[0], off, (uint8_t) ~(b_dim[i] - 1)); \
    rep_macro(edge->partition[1], off, (uint8_t) ~(b_dim[i] - 1))
                for (int i = 0, off = bx4; i < 2; i++, off = by4, edge = &t->l) {
                    case_set(b_dim[2 + i]);
                }
#undef set_ctx
            } else {
                dav2d_memset_pow2[b_dim[2]](&t->a->partition[pl][bx4], (uint8_t) ~(b_dim[0] - 1));
                dav2d_memset_pow2[b_dim[3]](&t->l.partition[pl][by4], (uint8_t) ~(b_dim[1] - 1));
            }
        }
        break;
    case PARTITION_V: {
        assert(hw4 > 0);
        // sub4: chroma can be split along with luma; otherwise the chroma
        // block is handed to the last child as a whole
        const int sub4 = bs == cbs && (hw4 >> f->ss_hor) > 0;
        assert(sub4 || !pl);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][0],
                      sub4 ? pcc->part[1][0] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->bx + hw4 >= f->bw) break;
        t->bx += hw4;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][0],
                      sub4 ? pcc->part[1][0] : cbs, &child_dir))
        {
            return -1;
        }
        t->bx -= hw4;
        break;
    }
    case PARTITION_H: {
        assert(hh4 > 0);
        const int sub4 = bs == cbs && (hh4 >> f->ss_ver) > 0;
        assert(sub4 || !pl);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][0],
                      sub4 ? pcc->part[0][0] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->by + hh4 >= f->bh) break;
        t->by += hh4;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][0],
                      sub4 ? pcc->part[0][0] : cbs, &child_dir))
        {
            return -1;
        }
        t->by -= hh4;
        break;
    }
    case PARTITION_SPLIT: {
        assert(have_v_split && have_h_split && cbs == lbs);
        const enum BlockSize sbs = pcc->part[0][3];
        // four quadrants: top-left, top-right, bottom-left, bottom-right
        if (decode_sb(t, DB_ONLY(depth + 1) sbs, sbs, &child_dir))
            return -1;
        t->bx += hw4;
        if (decode_sb(t, DB_ONLY(depth + 1) sbs, sbs, &child_dir))
            return -1;
        t->bx -= hw4;
        t->by += hh4;
        if (decode_sb(t, DB_ONLY(depth + 1) sbs, sbs, &child_dir))
            return -1;
        t->bx += hw4;
        if (decode_sb(t, DB_ONLY(depth + 1) sbs, sbs, &child_dir))
            return -1;
        t->bx -= hw4;
        t->by -= hh4;
        break;
    }
    case PARTITION_V3: {
        // quarter-width column, two stacked half-width blocks, quarter-width
        // column
        assert(qw4 > 0 && hh4 > 0);
        const int sub4 = bs == cbs && (qw4 >> f->ss_hor) > 0 && (hh4 >> f->ss_ver) > 0;
        assert(sub4 || !pl);
        const int i_3only = cbs == BS_INVALID || (!sub4 && bs != BS_32x8);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][1],
                      i_3only ? BS_INVALID : pcc->part[1][1], &child_dir))
        {
            return -1;
        }
        if (t->bx + qw4 >= f->bw) break;
        t->bx += qw4;
        if (!i_3only) t->cbx = t->bx;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][3],
                      sub4 ? pcc->part[1][3] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->by + hh4 < f->bh) {
            t->by += hh4;
            if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][3],
                          i_3only ? BS_INVALID : pcc->part[1][sub4 * 3], &child_dir))
            {
                return -1;
            }
            t->by -= hh4;
        }
        if (t->bx + hw4 >= f->bw) {
            t->bx -= qw4;
            break;
        }
        t->bx += hw4;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][1],
                      i_3only ? cbs : pcc->part[1][1], &child_dir))
        {
            return -1;
        }
        t->bx -= 3 * qw4;
        break;
    }
    case PARTITION_H3: {
        // quarter-height row, two side-by-side half-height blocks,
        // quarter-height row
        assert(qh4 > 0 && hw4 > 0);
        const int sub4 = bs == cbs && (qh4 >> f->ss_ver) > 0 && (hw4 >> f->ss_hor) > 0;
        assert(sub4 || !pl);
        const int i_3only = cbs == BS_INVALID || (!sub4 && bs != BS_8x32);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][1],
                      i_3only ? BS_INVALID : pcc->part[0][1], &child_dir))
        {
            return -1;
        }
        if (t->by + qh4 >= f->bh) break;
        t->by += qh4;
        if (!i_3only) t->cby = t->by;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][3],
                      sub4 ? pcc->part[0][3] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->bx + hw4 < f->bw) {
            t->bx += hw4;
            if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][3],
                          i_3only ? BS_INVALID : pcc->part[0][sub4 * 3], &child_dir))
            {
                return -1;
            }
            t->bx -= hw4;
        }
        if (t->by + hh4 >= f->bh) {
            t->by -= qh4;
            break;
        }
        t->by += hh4;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][1],
                      i_3only ? cbs : pcc->part[0][1], &child_dir))
        {
            return -1;
        }
        t->by -= 3 * qh4;
        break;
    }
    case PARTITION_V4A:
    case PARTITION_V4B: {
        // four columns of widths 1:2:4:1 (4A) or 1:4:2:1 (4B), in eighths of
        // the block width
        const int ew4 = qw4 >> 1;
        assert(ew4 > 0);
        const int sub4 = bs == cbs && (ew4 >> f->ss_hor) > 0;
        assert(sub4 || !pl);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][2],
                      sub4 ? pcc->part[1][2] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->bx + ew4 >= f->bw) break;
        t->bx += ew4;
        const int var = bp - PARTITION_V4A; // v4b: 1, v4a: 0
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][!var],
                      sub4 ? pcc->part[1][!var] : -1, &child_dir))
        {
            return -1;
        }
        const int w4a = qw4 << var, w4b = hw4 >> var;
        if (t->bx + w4a >= f->bw) {
            t->bx -= ew4;
            break;
        }
        t->bx += w4a;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][var],
                      sub4 ? pcc->part[1][var] : -1, &child_dir))
        {
            return -1;
        }
        if (t->bx + w4b >= f->bw) {
            t->bx -= ew4 + w4a;
            break;
        }
        t->bx += w4b;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[1][2],
                      sub4 ? pcc->part[1][2] : cbs, &child_dir))
        {
            return -1;
        }
        t->bx -= 7 * ew4;
        break;
    }
    case PARTITION_H4A:
    case PARTITION_H4B: {
        // four rows of heights 1:2:4:1 (4A) or 1:4:2:1 (4B), in eighths of
        // the block height
        const int eh4 = qh4 >> 1;
        assert(eh4 > 0);
        const int sub4 = bs == cbs && (eh4 >> f->ss_ver) > 0;
        assert(sub4 || !pl);
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][2],
                      sub4 ? pcc->part[0][2] : BS_INVALID, &child_dir))
        {
            return -1;
        }
        if (t->by + eh4 >= f->bh) break;
        t->by += eh4;
        const int var = bp - PARTITION_H4A; // h4b: 1, h4a: 0
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][!var],
                      sub4 ? pcc->part[0][!var] : -1, &child_dir))
        {
            return -1;
        }
        const int h4a = qh4 << var, h4b = hh4 >> var;
        if (t->by + h4a >= f->bh) {
            t->by -= eh4;
            break;
        }
        t->by += h4a;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][var],
                      sub4 ? pcc->part[0][var] : -1, &child_dir))
        {
            return -1;
        }
        if (t->by + h4b >= f->bh) {
            t->by -= eh4 + h4a;
            break;
        }
        t->by += h4b;
        if (decode_sb(t, DB_ONLY(depth + 1) pl ? BS_INVALID : pcc->part[0][2],
                      sub4 ? pcc->part[0][2] : cbs, &child_dir))
        {
            return -1;
        }
        t->by -= 7 * eh4;
        break;
    }
    default: assert(0);
    }

    // report the children's aggregated direction in bits 16-23
    *dir_ptr |= (child_dir & 0xff) << 16;
    // end of an intra region: decode the chroma block that was withheld from
    // the (luma-only) children, then leave the region
    if (t->intra_region && cbs_orig != BS_INVALID) {
        t->cbx = t->bx;
        t->cby = t->by;
        if (decode_b(t, DB_ONLY(depth + 1) BS_INVALID, cbs_orig))
            return -1;
        t->intra_region = 0;
    }

    return 0;
}

// Reset a BlockContext (above/left neighbour state) to its initial values.
//
// The loop-filter transform-size arrays are always reset. For TIP frames
// (is_tip_frame != 0) nothing else is touched. Otherwise all remaining
// per-position arrays are filled with their defaults; `keyframe` selects
// between the intra defaults (intra = 1, mode = DC_PRED) and the inter
// defaults (intra = 0, ref = -1, comp_type = 0, mode = NEARMV).
static void reset_context(BlockContext *const ctx, const int keyframe,
                          const int is_tip_frame)
{
    memset(ctx->tx_lpf_y, 3, sizeof(ctx->tx_lpf_y));
    memset(ctx->tx_lpf_uv, 2, sizeof(ctx->tx_lpf_uv));
    if (is_tip_frame) return;

    memset(ctx->midx, 0xff, sizeof(ctx->midx));
    // `keyframe` doubles as the fill value: every position is marked intra
    // on keyframes and non-intra otherwise
    memset(ctx->intra, keyframe, sizeof(ctx->intra));
    memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
    if (keyframe)
        memset(ctx->mode, DC_PRED, sizeof(ctx->mode));
    memset(ctx->partition, 0, sizeof(ctx->partition));
    memset(ctx->skip_txfm, 0, sizeof(ctx->skip_txfm));
    memset(ctx->skip_mode, 0, sizeof(ctx->skip_mode));
    if (!keyframe) {
        memset(ctx->ref, -1, sizeof(ctx->ref));
        memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
        memset(ctx->mode, NEARMV, sizeof(ctx->mode));
    }
    memset(ctx->mrl, 0, sizeof(ctx->mrl));
    memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
    memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
    // DAV2D_N_SWITCHABLE_FILTERS is one past the last switchable filter
    // index read in decode_b, i.e. "no filter known"
    memset(ctx->filter, DAV2D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
    memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
    memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
}
memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz)); } // { Y+U+V, Y+U } * 4 static const uint8_t ss_size_mul[4][2] = { [DAV2D_PIXEL_LAYOUT_I400] = { 4, 4 }, [DAV2D_PIXEL_LAYOUT_I420] = { 6, 5 }, [DAV2D_PIXEL_LAYOUT_I422] = { 8, 6 }, [DAV2D_PIXEL_LAYOUT_I444] = { 12, 8 }, }; static void setup_tile(Dav2dTileState *const ts, const Dav2dFrameContext *const f, const uint8_t *const data, const size_t sz, const int tile_row, const int tile_col, const unsigned tile_start_off) { const int col_sb_start = f->frame_hdr->tiling.t.col_start_sb[tile_col]; const int col_sb_end = f->frame_hdr->tiling.t.col_start_sb[tile_col + 1]; const int row_sb_start = f->frame_hdr->tiling.t.row_start_sb[tile_row]; const int row_sb_end = f->frame_hdr->tiling.t.row_start_sb[tile_row + 1]; const int sb_shift = f->sb_shift; const uint8_t *const size_mul = ss_size_mul[f->cur.p.p.layout]; for (int p = 0; p < 2; p++) { ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ? &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] >> 3] : NULL; ts->frame_thread[p].pal = f->frame_thread.pal ? &f->frame_thread.pal[(size_t)tile_start_off >> 6] : NULL; ts->frame_thread[p].cbi = f->frame_thread.cbi ? &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] >> 6] : NULL; ts->frame_thread[p].cf = f->frame_thread.cf ? (uint8_t*)f->frame_thread.cf + (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : NULL; ts->frame_thread[p].partition[0] = ts->frame_thread[p].partition[1] = f->frame_thread.partition ? 
(uint8_t*)f->frame_thread.partition + ((size_t)tile_start_off >> (4 - f->seq_hdr->sdp)) : NULL; } dav2d_cdf_thread_copy(&ts->cdf, &f->in_cdf); ts->last_qidx = f->frame_hdr->quant.yac; dav2d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); #if DEBUG_BLOCK_INFO struct { int by, bx; } tmem = { .by = row_sb_start << sb_shift, .bx = col_sb_start << sb_shift, }, *const t = &tmem; DEBUG_BLOCK_printf("decode_tile[tilerow=%d/col=%d,y=%d-%d,x=%d-%d,size=%td]: r=%d\n", tile_row, tile_col, row_sb_start << sb_shift, imin(row_sb_end << sb_shift, f->bh), col_sb_start << sb_shift, imin(col_sb_end << sb_shift, f->bw), sz, ts->msac.rng); #endif ts->tiling.row = tile_row; ts->tiling.col = tile_col; ts->tiling.col_start = col_sb_start << sb_shift; ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw); ts->tiling.row_start = row_sb_start << sb_shift; ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh); for (int pl = 0; pl < 3; pl++) { if (f->frame_hdr->restoration.p[pl].type == DAV2D_RESTORATION_NS_WIENER || f->frame_hdr->restoration.p[pl].type == DAV2D_RESTORATION_SWITCHABLE) { struct NsWienerBank *const bank = &ts->ns_wiener_bank[pl]; const int8_t (*const cf_range)[2] = pl ? dav2d_ns_wiener_coef_range_uv : dav2d_ns_wiener_coef_range_y; memset(bank->bank_size, 0, sizeof(bank->bank_size)); memset(bank->bank_idx, 0, sizeof(bank->bank_idx)); const int n_classes = f->frame_hdr->restoration.p[pl].ns.num_classes; for (int n = 0; n < n_classes; n++) { for (int m = 0; m < 16 + !!pl * 2; m++) { bank->filter[0][n][m] = cf_range[m][1] + ((1 << cf_range[m][0]) >> 1); } } } } if (f->c->n_tc > 1) { for (int p = 0; p < 3; p++) atomic_init(&ts->progress[p], row_sb_start); } } static inline int decode_4way(MsacContext *const s, const int ref, uint16_t *const cdf, int n_bits) { assert(n_bits >= 4); const int bin = dav2d_msac_decode_symbol_adapt4(s, cdf, 3); const int rem = dav2d_msac_decode_bools_bypass(s, n_bits + bin + !bin - 4); const int v = (bin ? 
(1 << (n_bits + bin - 4)) : 0) + rem; const int n = 1 << n_bits; return ref * 2 <= n ? inv_recenter(ref, v) : n - 1 - inv_recenter(n - 1 - ref, v); } static void read_restoration_info(Dav2dTaskContext *const t, Av2RestorationUnit *const lr, const int p, const enum Dav2dRestorationType frame_type) { const Dav2dFrameContext *const f = t->f; Dav2dTileState *const ts = t->ts; if (frame_type == DAV2D_RESTORATION_SWITCHABLE) { assert(!p); if (dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.rst_switchable[0])) { lr->type = DAV2D_RESTORATION_NONE; } else { const int type = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.rst_switchable[1]); lr->type = type ? DAV2D_RESTORATION_PC_WIENER : DAV2D_RESTORATION_NS_WIENER; } } else { assert(!p || frame_type == DAV2D_RESTORATION_NS_WIENER); uint16_t *const cdf = frame_type == DAV2D_RESTORATION_NS_WIENER ? ts->cdf.m.rst_ns_wiener : ts->cdf.m.rst_pc_wiener; const int type = dav2d_msac_decode_bool_adapt(&ts->msac, cdf); lr->type = type ? frame_type : DAV2D_RESTORATION_NONE; } const struct Dav2dNSWienerPlane *const pd = &f->frame_hdr->restoration.p[p].ns; if (lr->type == DAV2D_RESTORATION_NS_WIENER && !pd->frame_filters_on) { const int n_classes = pd->num_classes; unsigned exact_match_mask = 0; struct NsWienerBank *const bank = &ts->ns_wiener_bank[p]; uint8_t bank_refs[16]; for (int n = 0, mask = 1; n < n_classes; n++, mask <<= 1) { const int exact_match = dav2d_msac_decode_bool_bypass(&ts->msac); const int bank_size = bank->bank_size[n]; int r; for (r = 0; r < bank_size - 1; r++) { const int found = dav2d_msac_decode_bool_bypass(&ts->msac); if (found) break; } r = (bank->bank_idx[n] - r) & 3; exact_match_mask |= mask * exact_match; bank_refs[n] = r; } const unsigned *const masks = p ? dav2d_subset_masks_uv : dav2d_subset_masks_y; const int8_t (*const cf_range)[2] = p ? 
dav2d_ns_wiener_coef_range_uv : dav2d_ns_wiener_coef_range_y; for (int n = 0; n < n_classes; n++, exact_match_mask >>= 1) { const int r = bank_refs[n]; int8_t *const filter = lr->ns_filter[n]; const int8_t *const ref_filter = bank->filter[r][n]; if (exact_match_mask & 1) { memcpy(filter, ref_filter, 16 + 2 * !!p); if (!bank->bank_size[n]) bank->bank_size[n] = 1; continue; } memset(filter, 0, 16 + !!p * 2); int s; for (s = 0; s < 3 - !!p; s++) { const int found = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.wiener_ns_len[!!p]); if (!found) break; } const unsigned mask = masks[s]; const int asym = p && s && dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.wiener_ns_sym); for (int i = 0, m = mask; i < 16 + !!p * 2; i++, m >>= 1) { if (!(m & 1)) continue; filter[i] = decode_4way(&ts->msac, ref_filter[i] - cf_range[i][1], ts->cdf.m.wiener_ns_cf, cf_range[i][0]) + cf_range[i][1]; if (asym && i >= 6) { filter[i + 1] = filter[i]; i++; m >>= 1; } } const int bidx = bank->bank_idx[n] = (1 + bank->bank_idx[n]) & 3; memcpy(bank->filter[bidx][n], filter, sizeof(*filter) * (16 + 2 * !!p)); bank->bank_size[n] += bank->bank_size[n] < 4; } } } // modeled after the equivalent function in aomdec:decodeframe.c static int check_trailing_bits_after_symbol_coder(const MsacContext *const msac) { // check marker bit (single 1), followed by zeroes const int n_bits = -(msac->cnt + 14); assert(n_bits <= 0); // this assumes we errored out when cnt <= -15 in caller const int n_bytes = (n_bits + 7) >> 3; const uint8_t *p = &msac->buf_pos[n_bytes]; const int pattern = 128 >> ((n_bits - 1) & 7); if ((p[-1] & (2 * pattern - 1)) != pattern) return 1; // check remainder zero bytes for (; p < msac->buf_end; p++) if (*p) return 1; return 0; } static int tip_frame_recon_sb(Dav2dTaskContext *const t, const enum BlockSize bs, const enum BlockSize cbs) { const Dav2dFrameContext *const f = t->f; Av2Block b = { .bs = bs, .cbs = cbs, .intra = 0, .intrabc = 0, .seg_id = 0, .skip_mode = 0, .skip_txfm = 1, 
.tx_part = TX_PARTITION_NONE, .mv[0].y = f->frame_hdr->tip.gmv.y, .mv[0].x = f->frame_hdr->tip.gmv.x, .inter_mode = NEARMV, .ref.ref[0] = TIP_FRAME, .ref.ref[1] = -1, .motion_mode = MM_TRANSLATION, .filter = f->frame_hdr->tip.subpel_filter, .cwp_idx = dav2d_tip_wts[f->frame_hdr->tip.global_wtd_idx], }; const int by4 = t->by & 63, bx4 = t->bx & 63; t->cbx = t->bx; t->cby = t->by; if (t->task_thread.pass == PASS_MVRES) { derive_lowest_px(t, bs, cbs, &b); return 0; } assert(t->task_thread.pass & PASS_RECON); if (f->frame_hdr->tip.apply_filter) { dav2d_create_db_mask(t->lf_mask->filter_y, &b, bs, t->bx, t->by, f->bw, f->bh, f->cur.p.p.layout, 0, &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4], f->frame_hdr, f->seq_hdr); if (cbs != BS_INVALID) dav2d_create_db_mask(t->lf_mask->filter_uv, &b, bs, t->bx, t->by, f->bw, f->bh, f->cur.p.p.layout, 1, &t->a->tx_lpf_uv[bx4 >> f->ss_hor], &t->l.tx_lpf_uv[by4 >> f->ss_ver], f->frame_hdr, f->seq_hdr); const int qidx = f->frame_hdr->quant.yac; uint16_t *qidx_ptr = &t->lf_mask->qidx[(bx4 >> 4) + ((by4 & 0x30) >> 2)]; const int sbsz64 = f->sb_step >> 4; for (int y64 = 0; y64 < sbsz64; y64++) { for (int x64 = 0; x64 < sbsz64; x64++) qidx_ptr[x64] = qidx; qidx_ptr += 4; } } return f->bd_fn.recon_b(t, DB_ONLY(0) bs, (const enum BlockSize[2]) { cbs, cbs }, &b); } int dav2d_decode_tile_sbrow(Dav2dTaskContext *const t) { const Dav2dFrameContext *const f = t->f; const enum BlockSize root_bs = f->root_bs; const enum BlockSize c_root_bs = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I400 ? 
BS_INVALID : root_bs; Dav2dTileState *const ts = t->ts; const Dav2dContext *const c = f->c; const int sb_step = f->sb_step; const int tile_row = ts->tiling.row, tile_col = ts->tiling.col; // FIXME turn into assert by not scheduling tip frame entropy parsing tasks if (f->frame_hdr->tip.frame_mode == 2 && t->task_thread.pass == PASS_ENTROPY) return 0; if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { if (t->task_thread.pass & PASS_MVRES) { dav2d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start, ts->tiling.col_end, ts->tiling.row_start, ts->tiling.row_end, t->by >> f->sb_shift, ts->tiling.row); } else if (t->task_thread.pass == PASS_RECON) { t->rt.rp_proj = &f->rf.rp_proj[(t->by >> 1) * f->rf.rp_stride]; } } if (t->task_thread.pass == PASS_MVRES && IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) { const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; int (*const lowest_px)[2] = ts->lowest_pixel[sby]; for (int n = 0; n < 7; n++) for (int m = 0; m < 2; m++) lowest_px[n][m] = INT_MIN; } if (t->task_thread.pass & PASS_MVRES && f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { dav2d_refmvs_load_tmvs(&f->rf, ts->tiling.row, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); } const int sb256y = t->by >> 6; if (!(t->task_thread.pass & PASS_ENTROPY) || f->frame_hdr->tip.frame_mode == 2) { if (f->frame_hdr->tip.frame_mode == 2) reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), 1); for (t->bx = ts->tiling.col_start; t->bx < ts->tiling.col_end; t->bx += sb_step) { if (t->task_thread.pass & PASS_RECON) { memset(t->is_coded, 0, sizeof(t->is_coded)); t->lf_mask = f->lf.mask + (t->bx >> 6) + sb256y * f->sb256w; t->a = f->a + tile_row * f->sb256w + (t->bx >> 6); } if (t->task_thread.pass & PASS_MVRES && (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)) { dav2d_refmvs_reset_sb(&t->rt, t->by, t->bx); } if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; int dir = 0; if 
(f->frame_hdr->tip.frame_mode == 2) { tip_frame_recon_sb(t, root_bs, c_root_bs); } else { if (decode_sb(t, DB_ONLY(1) root_bs, c_root_bs, &dir)) return 1; } if (t->task_thread.pass & PASS_MVRES && (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)) { dav2d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt, t->bx >> 1, (t->bx + sb_step) >> 1, t->by >> 1, (t->by + sb_step) >> 1); } } if (t->task_thread.pass & PASS_RECON && c->n_tc > 1) f->bd_fn.backup_prefilter_data(t); return 0; } reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), 0); for (t->bx = ts->tiling.col_start; t->bx < ts->tiling.col_end; t->bx += sb_step) { memset(t->is_coded, 0, sizeof(t->is_coded)); t->lf_mask = f->lf.mask + (t->bx >> 6) + sb256y * f->sb256w; t->a = f->a + tile_row * f->sb256w + (t->bx >> 6); if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; switch (root_bs) { default: assert(0); case BS_64x64: { const int idx = ((t->bx & 0x30) >> 4) + ((t->by & 0x30) >> 2); t->lf_mask->cdef_idx[idx] = -1; break; } case BS_128x128: { const int idx = ((t->bx & 32) >> 4) + ((t->by & 32) >> 2); memset(&t->lf_mask->cdef_idx[idx + 0], -1, 2); memset(&t->lf_mask->cdef_idx[idx + 4], -1, 2); break; } case BS_256x256: memset(t->lf_mask->cdef_idx, -1, 16); break; } if (t->task_thread.pass & PASS_MVRES && (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)) { dav2d_refmvs_reset_sb(&t->rt, t->by, t->bx); } // Restoration filter const int sbsz = f->sb_step * 4; for (int p = 0, ss_ver = 0, ss_hor = 0; p < 3; p++, ss_ver = f->ss_ver, ss_hor = f->ss_hor) { if (f->frame_hdr->restoration.p[p].type == DAV2D_RESTORATION_NONE) continue; const int tx = 4 * (t->bx - ts->tiling.col_start) >> ss_hor; const int ty = 4 * (t->by - ts->tiling.row_start) >> ss_ver; const int unit_sz_log2 = f->frame_hdr->restoration.unit_size[!!p]; const int unit_sz = 1 << unit_sz_log2; const unsigned mask = unit_sz - 1; if ((tx | ty) & mask) continue; const int tw = ts->tiling.col_end * 4 >> ss_hor; const int 
th = ts->tiling.row_end * 4 >> ss_ver; const int half_unit = unit_sz >> 1; // Round half up at frame boundaries, if there's more than one // restoration unit const int fx = 4 * t->bx >> ss_hor, fy = t->by * 4 >> ss_ver; if ((ty && fy + half_unit > th) || (tx && fx + half_unit > tw)) continue; const enum Dav2dRestorationType frame_type = f->frame_hdr->restoration.p[p].type; // FIXME many of these values can be pre-calculated at frame-level const int sbw = sbsz >> ss_hor, sbh = sbsz >> ss_ver; const int lruw = imax(1, imin(tw - fx + half_unit, sbw) >> unit_sz_log2); const int lruh = imax(1, imin(th - fy + half_unit, sbh) >> unit_sz_log2); const int vsh = unit_sz_log2 - 7 + ss_ver; const int hsh = unit_sz_log2 - 7 + ss_hor; assert(vsh >= 0 && hsh >= 0); // FIXME: unit_sz_log2 can be 6 int sb_idx = (t->by >> 6) * f->sb256w + (t->bx >> 6); // TODO: store restoration data sequentially instead int start_unit_idx = ((t->by & 0x30) >> 2) + ((t->bx & 0x30) >> 4); // FIXME I think lruh is always 1, so this loop may be eliminated for (int y = 0; y < lruh; y++, sb_idx += f->sb256w << vsh) { for (int x = 0; x < lruw; x++) { int unit_idx = start_unit_idx; // TODO: + x * ... + y * ...; Av2RestorationUnit *const lr = &f->lf.lr_mask[sb_idx + (x << hsh)].lr[p][unit_idx]; read_restoration_info(t, lr, p, frame_type); DEBUG_BLOCK_printf("Post-restoration[p=%d,type=%d]: r=%d\n", p, lr->type, ts->msac.rng); } } } int dir = 0; t->sdp_cfl_disallowed = 0; if (IS_INTER_OR_SWITCH(f->frame_hdr)) { // for some contexts related to warp-motion, AVM uses 8x8 (instead // of 4x4) context resolution when we cross SB boundaries. However, // the way this is implemented means we sometimes go outside the // bounds of our own block into data that has already been written // into by our neighbour blocks. For example, if we access "top" at // 8x8 resolution for x=25, this may round to x=24 (which our left- // neighbour just overwrote). 
To workaround this, we keep a copy of // all affected context bits at SB boundaries. See AVM #1091. memcpy(t->a_sb_cache.ref[0], t->a->ref[0], 64); memcpy(t->a_sb_cache.ref[1], t->a->ref[1], 64); if (t->by > ts->tiling.row_start) memcpy(t->a_sb_cache.motion_mode, t->a->motion_mode, 64); } if (decode_sb(t, DB_ONLY(1) root_bs, c_root_bs, &dir)) return 1; if (t->task_thread.pass & PASS_MVRES && (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)) { dav2d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt, t->bx >> 1, (t->bx + sb_step) >> 1, t->by >> 1, (t->by + sb_step) >> 1); } } // backup pre-loopfilter pixels for intra prediction of the next sbrow if (t->task_thread.pass & PASS_RECON && c->n_tc > 1) f->bd_fn.backup_prefilter_data(t); // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix" // up the initial value in neighbour tiles when running the loopfilter int align_h = (f->bh + 63) & ~63; memcpy(&f->lf.tx_db_right_edge[0][align_h * tile_col + t->by], &t->l.tx_lpf_y[t->by & 0x30], sb_step); const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; align_h >>= ss_ver; memcpy(&f->lf.tx_db_right_edge[1][align_h * tile_col + (t->by >> ss_ver)], &t->l.tx_lpf_uv[(t->by & 0x30) >> ss_ver], sb_step >> ss_ver); // error out on symbol decoder overread if (ts->msac.cnt <= -15) return 1; return c->strict_std_compliance && f->frame_hdr->tip.frame_mode != 2 && (t->by >> f->sb_shift) + 1 >= f->frame_hdr->tiling.t.row_start_sb[tile_row + 1] && check_trailing_bits_after_symbol_coder(&ts->msac); } int dav2d_decode_frame_init(Dav2dFrameContext *const f) { const Dav2dContext *const c = f->c; int retval = DAV2D_ERR(ENOMEM); if (f->sbh > f->lf.start_of_tile_row_sz) { dav2d_free(f->lf.start_of_tile_row); f->lf.start_of_tile_row = dav2d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t)); if (!f->lf.start_of_tile_row) { f->lf.start_of_tile_row_sz = 0; goto error; } f->lf.start_of_tile_row_sz = f->sbh; } for (int tile_row = 0, sby = 0; tile_row < 
f->frame_hdr->tiling.t.rows; tile_row++) { f->lf.start_of_tile_row[sby++] = (tile_row << 1) | 1; while (sby < f->frame_hdr->tiling.t.row_start_sb[tile_row + 1]) f->lf.start_of_tile_row[sby++] = tile_row << 1; } const int n_ts = f->frame_hdr->tiling.t.cols * f->frame_hdr->tiling.t.rows; if (n_ts != f->n_ts) { if (c->task_thread.n_passes > 1) { dav2d_free(f->frame_thread.tile_start_off); f->frame_thread.tile_start_off = dav2d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts); if (!f->frame_thread.tile_start_off) { f->n_ts = 0; goto error; } } dav2d_free_aligned(f->ts); f->ts = dav2d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32); if (!f->ts) goto error; f->n_ts = n_ts; } const int a_sz = f->sb256w * f->frame_hdr->tiling.t.rows; if (a_sz != f->a_sz) { dav2d_free(f->a); f->a = dav2d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz); if (!f->a) { f->a_sz = 0; goto error; } f->a_sz = a_sz; } const int num_sb256 = f->sb256w * f->sb256h; const uint8_t *const size_mul = ss_size_mul[f->cur.p.p.layout]; const int hbd = !!f->seq_hdr->hbd; if (c->n_fc > 1) { const int lowest_pixel_mem_sz = f->frame_hdr->tiling.t.cols * f->sbh; if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) { dav2d_free(f->tile_thread.lowest_pixel_mem); f->tile_thread.lowest_pixel_mem = dav2d_malloc(ALLOC_TILE, lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem)); if (!f->tile_thread.lowest_pixel_mem) { f->tile_thread.lowest_pixel_mem_sz = 0; goto error; } f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz; } int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem; for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.t.rows; tile_row++, tile_row_base += f->frame_hdr->tiling.t.cols) { const int tile_row_sb_h = f->frame_hdr->tiling.t.row_start_sb[tile_row + 1] - f->frame_hdr->tiling.t.row_start_sb[tile_row]; for (int tile_col = 0; tile_col < f->frame_hdr->tiling.t.cols; tile_col++) { f->ts[tile_row_base + tile_col].lowest_pixel = 
lowest_pixel_ptr; lowest_pixel_ptr += tile_row_sb_h; } } } if (c->task_thread.n_passes > 1) { const unsigned sb_step4 = f->sb_step * 4; int tile_idx = 0; for (int tile_row = 0; tile_row < f->frame_hdr->tiling.t.rows; tile_row++) { const unsigned row_off = f->frame_hdr->tiling.t.row_start_sb[tile_row] * sb_step4 * f->sb256w * 256; const unsigned b_diff = (f->frame_hdr->tiling.t.row_start_sb[tile_row + 1] - f->frame_hdr->tiling.t.row_start_sb[tile_row]) * sb_step4; for (int tile_col = 0; tile_col < f->frame_hdr->tiling.t.cols; tile_col++) { f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff * f->frame_hdr->tiling.t.col_start_sb[tile_col] * sb_step4; } } const int cbi_sz = num_sb256 * size_mul[0]; if (cbi_sz != f->frame_thread.cbi_sz) { dav2d_free_aligned(f->frame_thread.cbi); f->frame_thread.cbi = dav2d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) * cbi_sz * 64 * 64 / 4, 64); if (!f->frame_thread.cbi) { f->frame_thread.cbi_sz = 0; goto error; } f->frame_thread.cbi_sz = cbi_sz; } const int cf_sz = (num_sb256 * size_mul[0]) << hbd; if (cf_sz != f->frame_thread.cf_sz) { dav2d_free_aligned(f->frame_thread.cf); f->frame_thread.cf = dav2d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 256 * 256 / 2, 64); if (!f->frame_thread.cf) { f->frame_thread.cf_sz = 0; goto error; } memset(f->frame_thread.cf, 0, (size_t)cf_sz * 256 * 256 / 2); f->frame_thread.cf_sz = cf_sz; } const int part_sz = num_sb256 * 4096 << f->seq_hdr->sdp; if (part_sz != f->frame_thread.part_sz) { dav2d_free(f->frame_thread.partition); f->frame_thread.partition = dav2d_malloc(ALLOC_BLOCK, (size_t)part_sz); if (!f->frame_thread.partition) { f->frame_thread.part_sz = 0; goto error; } f->frame_thread.part_sz = part_sz; } if (f->frame_hdr->allow_screen_content_tools) { const int pal_sz = num_sb256 << hbd; if (pal_sz != f->frame_thread.pal_sz) { dav2d_free_aligned(f->frame_thread.pal); f->frame_thread.pal = dav2d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) * pal_sz * 32 * 32, 64); if 
(!f->frame_thread.pal) { f->frame_thread.pal_sz = 0; goto error; } f->frame_thread.pal_sz = pal_sz; } const int pal_idx_sz = num_sb256 * size_mul[1]; if (pal_idx_sz != f->frame_thread.pal_idx_sz) { dav2d_free_aligned(f->frame_thread.pal_idx); f->frame_thread.pal_idx = dav2d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) * pal_idx_sz * 256 * 256 / 8, 64); if (!f->frame_thread.pal_idx) { f->frame_thread.pal_idx_sz = 0; goto error; } f->frame_thread.pal_idx_sz = pal_idx_sz; } } else if (f->frame_thread.pal) { dav2d_freep_aligned(&f->frame_thread.pal); dav2d_freep_aligned(&f->frame_thread.pal_idx); f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0; } } // update allocation of block contexts for above ptrdiff_t y_stride = f->cur.p.stride[0], uv_stride = f->cur.p.stride[1]; if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] || uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] || f->sbh != f->lf.cdef_buf_sbh) { dav2d_free_aligned(f->lf.cdef_line_buf); size_t alloc_sz = 64; alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh; alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh; uint8_t *ptr = f->lf.cdef_line_buf = dav2d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32); if (!ptr) { f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0; goto error; } ptr += 32; if (y_stride < 0) { f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1); f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3); } else { f->lf.cdef_line[0][0] = ptr + y_stride * 0; f->lf.cdef_line[1][0] = ptr + y_stride * 2; } ptr += llabs(y_stride) * f->sbh * 4; if (uv_stride < 0) { f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1); f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3); f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5); f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7); } else { f->lf.cdef_line[0][1] = ptr + uv_stride * 0; f->lf.cdef_line[0][2] = ptr + uv_stride * 2; f->lf.cdef_line[1][1] = ptr + uv_stride * 4; f->lf.cdef_line[1][2] = ptr + 
uv_stride * 6; } f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4; f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8; f->lf.cdef_buf_sbh = f->sbh; } const int sb256 = f->frame_hdr->sb128; const int num_lines = c->n_tc > 1 ? f->sbh * 4 << sb256 : 20; // FIXME double this with threading enabled const int n_tile_rows_m1 = f->frame_hdr->tiling.t.rows - 1; y_stride = f->cur.p.stride[0], uv_stride = f->cur.p.stride[1]; if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] || uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1] || y_stride * 6 * n_tile_rows_m1 != f->lf.lr_buf_plane_sz[2] || uv_stride * 4 * n_tile_rows_m1 != f->lf.lr_buf_plane_sz[3]) { dav2d_free_aligned(f->lf.lr_line_buf); // lr simd may overread the input, so slightly over-allocate the db buffer size_t alloc_sz = 128; alloc_sz += (size_t)llabs(y_stride) * num_lines; alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2; alloc_sz += (size_t)llabs(y_stride) * n_tile_rows_m1 * 6; alloc_sz += (size_t)llabs(uv_stride) * n_tile_rows_m1 * 4; uint8_t *ptr = f->lf.lr_line_buf = dav2d_alloc_aligned(ALLOC_LR, alloc_sz, 64); if (!ptr) { f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0; goto error; } ptr += 64; if (y_stride < 0) f->lf.lr_db_line[0] = ptr - y_stride * (num_lines - 1); else f->lf.lr_db_line[0] = ptr; ptr += llabs(y_stride) * num_lines; if (uv_stride < 0) { f->lf.lr_db_line[1] = ptr - uv_stride * (num_lines * 1 - 1); f->lf.lr_db_line[2] = ptr - uv_stride * (num_lines * 2 - 1); } else { f->lf.lr_db_line[1] = ptr; f->lf.lr_db_line[2] = ptr + uv_stride * num_lines; } ptr += llabs(uv_stride) * num_lines * 2; // FIXME make the below work with negative stride f->lf.lr_cdef_line[0] = ptr; ptr += llabs(y_stride) * n_tile_rows_m1 * 6; f->lf.lr_cdef_line[1] = ptr; f->lf.lr_cdef_line[2] = ptr + llabs(uv_stride) * n_tile_rows_m1 * 2; f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines; f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2; f->lf.lr_buf_plane_sz[2] = (int) y_stride * 
n_tile_rows_m1 * 6; f->lf.lr_buf_plane_sz[3] = (int) uv_stride * n_tile_rows_m1 * 4; } // update allocation for loopfilter masks if (num_sb256 != f->lf.mask_sz) { dav2d_free(f->lf.mask); f->lf.mask = dav2d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb256); // over-allocate by 3 bytes since some of the SIMD implementations // index this from the level type and can thus over-read by up to 3 if (!f->lf.mask) { f->lf.mask_sz = 0; goto error; } if (c->task_thread.n_passes > 1) { dav2d_free(f->frame_thread.b); f->frame_thread.b = dav2d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) * num_sb256 * 64 * 64); if (!f->frame_thread.b) { f->lf.mask_sz = 0; goto error; } } f->lf.mask_sz = num_sb256; } memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb256); if (num_sb256 != f->lf.lr_mask_sz) { dav2d_free(f->lf.lr_mask); f->lf.lr_mask = dav2d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * num_sb256); if (!f->lf.lr_mask) { f->lf.lr_mask_sz = 0; goto error; } f->lf.lr_mask_sz = num_sb256; } memset(f->lf.lr_mask, 0, sizeof(*f->lf.lr_mask) * num_sb256); init_wiener(f); f->lf.restore_planes = ((f->frame_hdr->restoration.p[0].type != DAV2D_RESTORATION_NONE || f->frame_hdr->gdf.enabled) << 0) + ((f->frame_hdr->restoration.p[1].type != DAV2D_RESTORATION_NONE) << 1) + ((f->frame_hdr->restoration.p[2].type != DAV2D_RESTORATION_NONE) << 2); if (f->frame_hdr->gdf.enabled) { int ref_dst_idx = 0; if (IS_INTER_OR_SWITCH(f->frame_hdr)) { int max_dist = 0; for (int i = 0; i < imin(f->frame_hdr->n_ref_frames, 2); i++) max_dist = imax(max_dist, f->absrefdist[i]); const uint8_t ref_dst_idx_tbl[12] = { 5, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 5 }; ref_dst_idx = ref_dst_idx_tbl[imin(max_dist, 11)]; } f->lf.gdf_ref_dst_idx = ref_dst_idx; } size_t prefilter_data_size_y, prefilter_data_size_uv; if (c->n_tc == 1) { prefilter_data_size_y = prefilter_data_size_uv = 0; f->prefilter_data_full_frame = 1; } else if (f->frame_hdr->allow_intrabc) { const int bh_align7 = (f->bh + 7) & ~7; prefilter_data_size_y = 
f->cur.p.stride[0] * bh_align7 * 4; const int bh_align15 = (f->bh + 15) & ~15; prefilter_data_size_uv = f->cur.p.stride[1] * bh_align15 * 4 >> f->ss_ver; f->prefilter_data_full_frame = 1; } else { prefilter_data_size_y = f->cur.p.stride[0] * f->frame_hdr->tiling.t.rows; prefilter_data_size_uv = f->cur.p.stride[1] * f->frame_hdr->tiling.t.rows; f->prefilter_data_full_frame = 0; } const size_t prefilter_data_sz = prefilter_data_size_y + 2 * prefilter_data_size_uv; if (prefilter_data_sz > f->prefilter_data_sz) { if (f->prefilter_data_sz) // otherwise it's a pointer into f->cur dav2d_free_aligned(f->prefilter_data[0]); if (prefilter_data_sz) { uint8_t *ptr = f->prefilter_data[0] = dav2d_alloc_aligned(ALLOC_IPRED, prefilter_data_sz, 64); if (!ptr) { f->prefilter_data_sz = 0; goto error; } if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400) { ptr += prefilter_data_size_y; f->prefilter_data[1] = ptr; ptr += prefilter_data_size_uv; f->prefilter_data[2] = ptr; } } f->prefilter_data_sz = prefilter_data_sz; } const int re_sz = f->sb256h * f->frame_hdr->tiling.t.cols; if (re_sz != f->lf.re_sz) { dav2d_free(f->lf.tx_db_right_edge[0]); f->lf.tx_db_right_edge[0] = dav2d_malloc(ALLOC_LF, re_sz * 64 * 2); if (!f->lf.tx_db_right_edge[0]) { f->lf.re_sz = 0; goto error; } f->lf.tx_db_right_edge[1] = f->lf.tx_db_right_edge[0] + re_sz * 64; f->lf.re_sz = re_sz; } // init ref mvs if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { const int ret = dav2d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr, f->refpoc, f->mvs, f->refrefpoc, f->refcnt, f->ref_mvs, f->c->n_tc > 1, f->c->n_fc > 1); if (ret < 0) goto error; } // setup dequant tables init_quant_tables(f->frame_hdr, f->frame_hdr->quant.yac, f->dq); if (f->frame_hdr->quant.qm.enabled) for (int i = 0; i < N_RECT_TX_SIZES; i++) { f->qm[i][0] = dav2d_qm_tbl[f->frame_hdr->quant.qm.y[0]][0][i]; f->qm[i][1] = dav2d_qm_tbl[f->frame_hdr->quant.qm.u[0]][1][i]; f->qm[i][2] = dav2d_qm_tbl[f->frame_hdr->quant.qm.v[0]][1][i]; } 
else memset(f->qm, 0, sizeof(f->qm)); /* Init loopfilter pointers. Increasing NULL pointers is technically UB, * so just point the chroma pointers in 4:0:0 to the luma plane here to * avoid having additional in-loop branches in various places. We never * dereference those pointers so it doesn't really matter what they * point at, as long as the pointers are valid. */ const int has_chroma = f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400; f->lf.p[0] = f->cur.p.data[0]; f->lf.p[1] = f->cur.p.data[has_chroma ? 1 : 0]; f->lf.p[2] = f->cur.p.data[has_chroma ? 2 : 0]; if (c->n_tc == 1) { f->prefilter_data[0] = f->cur.p.data[0]; f->prefilter_data[1] = f->cur.p.data[1]; f->prefilter_data[2] = f->cur.p.data[2]; } else if (f->frame_hdr->allow_intrabc) { f->cur.p.data[0] = f->prefilter_data[0]; f->cur.p.data[1] = f->prefilter_data[1]; f->cur.p.data[2] = f->prefilter_data[2]; } if (c->n_tc > 1) { for (int n = 0; n < f->sb256w * f->frame_hdr->tiling.t.rows; n++) reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_hdr->tip.frame_mode == 2); } retval = 0; error: return retval; } int dav2d_decode_frame_init_cdf(Dav2dFrameContext *const f) { const Dav2dContext *const c = f->c; int retval = DAV2D_ERR(EINVAL); if (f->use_pri_sec_cdf) { dav2d_cdf_pri_sec_average(f->in_cdf.data.cdf, &f->src_cdf[0], &f->src_cdf[1]); } if (!f->frame_hdr->disable_cdf_update) dav2d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf); // parse individual tiles per tile group int tile_row = 0, tile_col = 0; f->task_thread.update_set = 0; for (int i = 0; i < f->n_tile_data; i++) { const uint8_t *data = f->tile[i].data.data; size_t size = f->tile[i].data.sz; for (int j = f->tile[i].start; j <= f->tile[i].end; j++) { size_t tile_sz; if (j == f->tile[i].end) { tile_sz = size; } else { if (f->frame_hdr->tiling.n_bytes > size) goto error; tile_sz = 0; for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++) tile_sz |= (unsigned)*data++ << (k * 8); tile_sz++; size -= f->frame_hdr->tiling.n_bytes; if 
(tile_sz > size) goto error; } setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++, c->task_thread.n_passes > 1 ? f->frame_thread.tile_start_off[j] : 0); if (tile_col == f->frame_hdr->tiling.t.cols) { tile_col = 0; tile_row++; } if (j == f->frame_hdr->tiling.update && !f->frame_hdr->disable_cdf_update && !(f->seq_hdr->avg_cdf_type && f->frame_hdr->tiling.t.log2_cols + f->frame_hdr->tiling.t.log2_rows)) { f->task_thread.update_set = 1; } data += tile_sz; size -= tile_sz; } } atomic_store(&f->task_thread.entropy_task_counter, f->frame_hdr->tiling.t.cols * f->frame_hdr->tiling.t.rows); retval = 0; error: return retval; } void dav2d_decode_tip_frame_init(Dav2dFrameContext *const f) { const struct Dav2dTileInfo *const ti = &f->frame_hdr->tiling.t; const int sb_shift = f->sb_shift; for (int tile_row = 0, tile = 0; tile_row < ti->rows; tile_row++) { for (int tile_col = 0; tile_col < ti->cols; tile_col++, tile++) { const int col_sb_start = ti->col_start_sb[tile_col]; const int col_sb_end = ti->col_start_sb[tile_col + 1]; const int row_sb_start = ti->row_start_sb[tile_row]; const int row_sb_end = ti->row_start_sb[tile_row + 1]; Dav2dTileState *const ts = &f->ts[tile]; ts->tiling.row = tile_row; ts->tiling.col = tile_col; ts->tiling.col_start = col_sb_start << sb_shift; ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw); ts->tiling.row_start = row_sb_start << sb_shift; ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh); if (f->c->n_tc > 1) for (int p = 0; p < 3; p++) atomic_init(&ts->progress[p], row_sb_start); } } atomic_store(&f->task_thread.entropy_task_counter, f->frame_hdr->tiling.t.cols * f->frame_hdr->tiling.t.rows); } int dav2d_decode_frame_main(Dav2dFrameContext *const f) { const Dav2dContext *const c = f->c; int retval = DAV2D_ERR(EINVAL); assert(f->c->n_tc == 1); Dav2dTaskContext *const t = &c->tc[f - c->fc]; t->f = f; t->task_thread.pass = PASS_ALL; for (int n = 0; n < f->sb256w * f->frame_hdr->tiling.t.rows; n++) reset_context(&f->a[n], 
IS_KEY_OR_INTRA(f->frame_hdr), f->frame_hdr->tip.frame_mode == 2); for (int tile_row = 0; tile_row < f->frame_hdr->tiling.t.rows; tile_row++) { const int sby_start = f->frame_hdr->tiling.t.row_start_sb[tile_row]; const int sbh_end = imin(f->frame_hdr->tiling.t.row_start_sb[tile_row + 1], f->sbh); for (int sby = sby_start; sby < sbh_end; sby++) { t->by = sby << (4 + f->frame_hdr->sb128); const int by_end = (t->by + f->sb_step) >> 1; if (f->frame_hdr->use_ref_frame_mvs) { dav2d_refmvs_load_tmvs(&f->rf, tile_row, 0, f->bw >> 1, t->by >> 1, by_end); } for (int tile_col = 0; tile_col < f->frame_hdr->tiling.t.cols; tile_col++) { t->ts = &f->ts[tile_row * f->frame_hdr->tiling.t.cols + tile_col]; if (dav2d_decode_tile_sbrow(t)) goto error; } } // post filters (deblock + cdef + ccso + ...) // do this after completing full tiles, so that intra bc works correctly for (int sby = sby_start; sby < sbh_end; sby++) { f->bd_fn.filter_sbrow(f, sby); } } retval = 0; error: return retval; } void dav2d_decode_frame_exit(Dav2dFrameContext *const f, int retval) { const Dav2dContext *const c = f->c; if (f->cur.p.data[0]) atomic_init(&f->task_thread.error, 0); if (c->task_thread.n_passes > 1 && retval && f->frame_thread.cf) { memset(f->frame_thread.cf, 0, (size_t)f->frame_thread.cf_sz * 256 * 256 / 2); } for (int i = 0; i < 7; i++) { if (f->refp[i].p.frame_hdr) { if (!retval && c->n_fc > 1 && c->strict_std_compliance && atomic_load(&f->refp[i].progress[2]) == FRAME_ERROR) { retval = DAV2D_ERR(EINVAL); atomic_store(&f->task_thread.error, 1); atomic_store(&f->cur.progress[2], FRAME_ERROR); } dav2d_thread_picture_unref(&f->refp[i]); } dav2d_ref_dec(&f->ref_mvs_ref[i]); } dav2d_thread_picture_unref(&f->cur); dav2d_cdf_thread_unref(&f->in_cdf); if (f->frame_hdr && f->use_pri_sec_cdf) { dav2d_cdf_thread_unref(&f->src_cdf[0]); dav2d_cdf_thread_unref(&f->src_cdf[1]); } if (f->frame_hdr && !f->frame_hdr->disable_cdf_update) { if (f->out_cdf.progress) atomic_store(f->out_cdf.progress, retval == 0 ? 
1 : TILE_ERROR); dav2d_cdf_thread_unref(&f->out_cdf); } dav2d_ref_dec(&f->cur_segmap_ref); dav2d_ref_dec(&f->prev_segmap_ref); dav2d_mem_pool_push(f->c->segmap_uv_pool, f->lf.segmap_uv); f->lf.segmap_uv = NULL; dav2d_ref_dec(&f->mvs_ref); dav2d_ref_dec(&f->cur_ccsomap_ref); for (int p = 0; p < 3; p++) dav2d_ref_dec(&f->prev_ccsomap_ref[p]); dav2d_ref_dec(&f->seq_hdr_ref); dav2d_ref_dec(&f->frame_hdr_ref); for (int i = 0; i < f->n_tile_data; i++) dav2d_data_unref_internal(&f->tile[i].data); f->task_thread.retval = retval; } int dav2d_decode_frame(Dav2dFrameContext *const f) { const Dav2dContext *const c = f->c; assert(c->n_fc == 1); // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task // threads also. Not sure it makes a measurable difference. int res = dav2d_decode_frame_init(f); if (!res) { if (f->frame_hdr->tip.frame_mode != 2) { res = dav2d_decode_frame_init_cdf(f); } else { dav2d_decode_tip_frame_init(f); } } // wait until all threads have completed if (!res) { if (c->n_tc > 1) { const int n_passes = c->task_thread.n_passes; for (int p = 0; p < n_passes && !res; p++) res = dav2d_task_create_tile_sbrow(f, p, 1); pthread_mutex_lock(&f->task_thread.ttd->lock); pthread_cond_signal(&f->task_thread.ttd->cond); if (!res) { while (!f->task_thread.done[0] || atomic_load(&f->task_thread.task_counter) > 0) { pthread_cond_wait(&f->task_thread.cond, &f->task_thread.ttd->lock); } } pthread_mutex_unlock(&f->task_thread.ttd->lock); res = f->task_thread.retval; } else { res = dav2d_decode_frame_main(f); if (!res && !f->frame_hdr->disable_cdf_update && (f->task_thread.update_set || f->seq_hdr->avg_cdf_type)) { const int shift = f->frame_hdr->tiling.t.log2_cols + f->frame_hdr->tiling.t.log2_rows; if (shift && f->seq_hdr->avg_cdf_type) { const int n_tiles = 1 << shift; dav2d_cdf_shift(f->out_cdf.data.cdf, &f->ts[0].cdf, shift); for (int n = 1; n < n_tiles; n++) dav2d_cdf_shift_accumulate(f->out_cdf.data.cdf, &f->ts[n].cdf, shift); } else { memcpy(f->out_cdf.data.cdf, 
&f->ts[f->frame_hdr->tiling.update].cdf, sizeof(CdfContext)); } dav2d_cdf_reset_count(f->frame_hdr, f->out_cdf.data.cdf); } } } dav2d_decode_frame_exit(f, res); res = f->task_thread.retval; f->n_tile_data = 0; return res; } int dav2d_submit_frame(Dav2dContext *const c) { Dav2dFrameContext *f; int res = -1; #if 0 // wait for c->out_delayed[next] and move into c->out if visible Dav2dThreadPicture *out_delayed; #endif if (c->n_fc > 1) { pthread_mutex_lock(&c->task_thread.lock); const unsigned next = c->frame_thread.next++; if (c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; f = &c->fc[next]; while (f->n_tile_data != 0) pthread_cond_wait(&f->task_thread.cond, &c->task_thread.lock); // FIXME forward error code for frame to dpb->res #if 0 out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { #endif if (f->frame_thread.scheduled) { unsigned first = atomic_load(&c->task_thread.first); if (first + 1U < c->n_fc) atomic_fetch_add(&c->task_thread.first, 1U); else atomic_store(&c->task_thread.first, 0); atomic_compare_exchange_strong(&c->task_thread.reset_task_cur, &first, UINT_MAX); if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; } #if 0 const int error = f->task_thread.retval; if (error) { f->task_thread.retval = 0; c->cached_error = error; dav2d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); dav2d_thread_picture_unref(out_delayed); } else if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[2], memory_order_relaxed); if ((out_delayed->visible || c->output_invisible_frames) && progress != FRAME_ERROR) { dav2d_thread_picture_ref(&c->out, out_delayed); c->event_flags |= dav2d_picture_get_event_flags(out_delayed); } dav2d_thread_picture_unref(out_delayed); } #endif } else { f = c->fc; } struct OutputQueue *q = NULL; f->seq_hdr = c->seq_hdr; f->seq_hdr_ref = c->seq_hdr_ref; dav2d_ref_inc(f->seq_hdr_ref); f->frame_hdr 
= c->frame_hdr; f->frame_hdr_ref = c->frame_hdr_ref; c->frame_hdr = NULL; c->frame_hdr_ref = NULL; f->dsp = &c->dsp[f->seq_hdr->hbd]; const int bpc = 8 + 2 * f->seq_hdr->hbd; if (!f->dsp->ipred.intra_pred[DC_PRED]) { Dav2dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd]; switch (bpc) { #define assign_bitdepth_case(bd) \ dav2d_ccso_dsp_init_##bd##bpc(&dsp->ccso); \ dav2d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \ dav2d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \ dav2d_itx_dsp_init_##bd##bpc(&dsp->itx); \ dav2d_stx_dsp_init_##bd##bpc(&dsp->stx); \ dav2d_deblock_dsp_init_##bd##bpc(&dsp->lf); \ dav2d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \ dav2d_mc_dsp_init_##bd##bpc(&dsp->mc); \ dav2d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \ break #if CONFIG_8BPC case 8: assign_bitdepth_case(8); #endif #if CONFIG_16BPC case 10: case 12: assign_bitdepth_case(16); #endif #undef assign_bitdepth_case default: dav2d_log(c, "Compiled without support for %d-bit decoding\n", 8 + 2 * f->seq_hdr->hbd); res = DAV2D_ERR(ENOPROTOOPT); goto error; } } #define assign_bitdepth_case(bd) \ f->bd_fn.recon_b = dav2d_recon_b_##bd##bpc; \ f->bd_fn.filter_sbrow = dav2d_filter_sbrow_##bd##bpc; \ f->bd_fn.filter_sbrow_deblock_cols = dav2d_filter_sbrow_deblock_cols_##bd##bpc; \ f->bd_fn.filter_sbrow_deblock_rows = dav2d_filter_sbrow_deblock_rows_##bd##bpc; \ f->bd_fn.filter_sbrow_cdef = dav2d_filter_sbrow_cdef_##bd##bpc; \ f->bd_fn.filter_sbrow_lr = dav2d_filter_sbrow_lr_##bd##bpc; \ f->bd_fn.backup_prefilter_data = dav2d_backup_prefilter_data_##bd##bpc; \ f->bd_fn.read_coef_blocks = dav2d_read_coef_blocks_##bd##bpc; \ f->bd_fn.copy_pal_block_y = dav2d_copy_pal_block_y_##bd##bpc; \ f->bd_fn.read_pal_plane = dav2d_read_pal_plane_##bd##bpc if (!f->seq_hdr->hbd) { #if CONFIG_8BPC assign_bitdepth_case(8); #endif } else { #if CONFIG_16BPC assign_bitdepth_case(16); #endif } #undef assign_bitdepth_case int ref_coded_width[7]; if (IS_INTER_OR_SWITCH(f->frame_hdr)) { for (int i = 0; i < 7; i++) { const 
int refidx = f->frame_hdr->refidx[i]; if (!c->refs[refidx].p.p.data[0] || f->frame_hdr->width * 2 < c->refs[refidx].p.p.p.w || f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h || f->frame_hdr->width > c->refs[refidx].p.p.p.w * 16 || f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 || f->seq_hdr->layout != c->refs[refidx].p.p.p.layout || bpc != c->refs[refidx].p.p.p.bpc) { for (int j = 0; j < i; j++) dav2d_thread_picture_unref(&f->refp[j]); res = DAV2D_ERR(EINVAL); goto error; } dav2d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p); ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width; if (f->frame_hdr->width != c->refs[refidx].p.p.p.w || f->frame_hdr->height != c->refs[refidx].p.p.p.h) { #define scale_fac(ref_sz, this_sz) \ ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz)) f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w, f->frame_hdr->width); f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h, f->frame_hdr->height); f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4; f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4; } else { f->svc[i][0].scale = f->svc[i][1].scale = 0; } f->gmv_warp_allowed[i] = f->frame_hdr->gmv.m[i].type > DAV2D_WM_TYPE_TRANSLATION && !f->frame_hdr->force_integer_mv && !dav2d_get_shear_params(&f->frame_hdr->gmv.m[i]) && !f->svc[i][0].scale; } } // setup entropy const int p_ref_idx = f->frame_hdr->primary_ref_frame; if (p_ref_idx == DAV2D_PRIMARY_REF_NONE) { dav2d_cdf_thread_init_static(&f->in_cdf, f->frame_hdr->quant.yac); f->use_pri_sec_cdf = 0; } else { const int s_ref_idx = f->frame_hdr->secondary_ref_frame; const int pri_ref = f->frame_hdr->refidx[p_ref_idx]; f->use_pri_sec_cdf = s_ref_idx != DAV2D_PRIMARY_REF_NONE && f->frame_hdr->frame_type == DAV2D_FRAME_TYPE_INTER && f->seq_hdr->avg_cdf && !f->seq_hdr->avg_cdf_type && f->frame_hdr->tip.frame_mode != 2; if (!f->use_pri_sec_cdf) { dav2d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]); } else { const int sec_ref = f->frame_hdr->refidx[s_ref_idx]; res = 
dav2d_cdf_thread_alloc(c, &f->in_cdf, c->n_fc > 1); if (res < 0) goto error; dav2d_cdf_thread_ref(&f->src_cdf[0], &c->cdf[pri_ref]); dav2d_cdf_thread_ref(&f->src_cdf[1], &c->cdf[sec_ref]); } } if (!f->frame_hdr->disable_cdf_update) { res = dav2d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1); if (res < 0) goto error; } // FIXME qsort so tiles are in order (for frame threading) if (f->n_tile_data_alloc < c->n_tile_data) { dav2d_free(f->tile); assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile)); f->tile = dav2d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile)); if (!f->tile) { f->n_tile_data_alloc = f->n_tile_data = 0; res = DAV2D_ERR(ENOMEM); goto error; } f->n_tile_data_alloc = c->n_tile_data; } if (f->frame_hdr->tip.frame_mode == 2) { assert(!c->n_tile_data); f->n_tile_data = -1; // any non-zero value for threading } else { memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile)); memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile)); f->n_tile_data = c->n_tile_data; c->n_tile_data = 0; } // allocate frame res = dav2d_thread_picture_alloc(c, f, bpc); if (res < 0) goto error; if (f->frame_hdr->film_grain.present && c->fgm[f->frame_hdr->film_grain.id]) { f->cur.p.fgm_ref = c->fgm[f->frame_hdr->film_grain.id]; dav2d_ref_inc(f->cur.p.fgm_ref); f->cur.p.fgm = f->cur.p.fgm_ref->data; } if (c->ci_ref) { f->cur.p.ci_ref = c->ci_ref; dav2d_ref_inc(f->cur.p.ci_ref); f->cur.p.ci = f->cur.p.ci_ref->data; } // move f->cur into output queue if (f->frame_hdr->show_immediate || c->output_invisible_frames) { q = dav2d_queue_output(c, &f->cur); #if 0 c->event_flags |= dav2d_picture_get_event_flags(&f->cur); #endif } // ss_ver is set for 4:2:0, and ss_hor for 4:2:0 & 4:2:2 f->ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; f->ss_hor = f->cur.p.p.layout - 1 < (unsigned) DAV2D_PIXEL_LAYOUT_I444 - 1; f->root_bs = (const uint8_t[]) { BS_64x64, BS_128x128, BS_256x256 }[f->frame_hdr->sb128]; f->bw = ((f->frame_hdr->width + 7) >> 3) << 1; f->bh = ((f->frame_hdr->height + 
7) >> 3) << 1; f->sb256w = (f->bw + 63) >> 6; f->sb256h = (f->bh + 63) >> 6; f->sb_shift = 4 + f->frame_hdr->sb128; f->sb_step = 16 << f->frame_hdr->sb128; f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift; f->b4_stride = (f->bw + 63) & ~63; f->bitdepth_max = (1 << f->cur.p.p.bpc) - 1; atomic_init(&f->task_thread.error, 0); const int n_passes = c->task_thread.n_passes; const int cols = f->frame_hdr->tiling.t.cols; const int rows = f->frame_hdr->tiling.t.rows; atomic_store(&f->task_thread.task_counter, cols * rows * n_passes + f->sbh * imin(n_passes, 2)); // ref_mvs if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { if (f->seq_hdr->ref_frame_mvs) { f->mvs_ref = dav2d_ref_create_using_pool(c->refmvs_pool, sizeof(*f->mvs) * f->sb256h * 32 * (f->b4_stride >> 1)); if (!f->mvs_ref) { res = DAV2D_ERR(ENOMEM); goto error; } f->mvs = f->mvs_ref->data; } else { f->mvs_ref = NULL; } if (IS_INTER_OR_SWITCH(f->frame_hdr)) { const int poc = f->cur.p.frame_hdr->frame_offset; // we use -2 here so it doesn't match b->ref==-1, which means intra int furthest_future_refidx = -2; for (int i = 0; i < 7; i++) { f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset; const int delta = f->refdist[i] = get_poc_diff(f->seq_hdr->order_hint_n_bits, f->refpoc[i], poc); f->absrefdist[i] = abs(delta); f->refdir[i] = delta > 0; if (delta > 0 && (furthest_future_refidx < 0 || f->refdist[furthest_future_refidx] < delta)) { furthest_future_refidx = i; } } f->furthest_future_refidx = furthest_future_refidx; } else { memset(f->refpoc, 0, sizeof(f->refpoc)); } if (f->frame_hdr->use_ref_frame_mvs) { for (int i = 0; i < 7; i++) { const int refidx = f->frame_hdr->refidx[i]; const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1; const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1; if (c->refs[refidx].refmvs != NULL && ref_w == f->bw && ref_h == f->bh) { f->ref_mvs_ref[i] = c->refs[refidx].refmvs; dav2d_ref_inc(f->ref_mvs_ref[i]); f->ref_mvs[i] = c->refs[refidx].refmvs->data; } else { 
f->ref_mvs[i] = NULL; f->ref_mvs_ref[i] = NULL; } memcpy(f->refrefpoc[i], c->refs[refidx].refpoc, sizeof(*f->refrefpoc)); f->refcnt[i] = f->refp[i].p.frame_hdr->n_ref_frames; } } else { memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref)); } } else { f->mvs_ref = NULL; memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref)); } // segmap if (f->frame_hdr->segmentation.enabled) { // By default, the previous segmentation map is not initialised. f->prev_segmap_ref = NULL; f->prev_segmap = NULL; // We might need a previous frame's segmentation map. This // happens if there is either no update or a temporal update. if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) { const int pri_ref = f->frame_hdr->primary_ref_frame; assert(pri_ref != DAV2D_PRIMARY_REF_NONE); const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1; const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1; if (ref_w == f->bw && ref_h == f->bh) { f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap; if (f->prev_segmap_ref) { dav2d_ref_inc(f->prev_segmap_ref); f->prev_segmap = f->prev_segmap_ref->data; } } } const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 64 * f->sb256h; f->cur_segmap_ref = dav2d_ref_create_using_pool(c->segmap_pool, segmap_size); if (!f->cur_segmap_ref) { if (f->prev_segmap_ref) dav2d_ref_dec(&f->prev_segmap_ref); res = DAV2D_ERR(ENOMEM); goto error; } f->cur_segmap = f->cur_segmap_ref->data; if (!f->frame_hdr->segmentation.update_map && !f->prev_segmap_ref) { // We need a fresh segmentation map, zero out the segmentation map memset(f->cur_segmap, 0, segmap_size); } f->lf.uv_segmap_stride = f->sb256w * (64 >> f->ss_hor); const size_t segmap_uv_size = sizeof(*f->lf.segmap_uv) * f->lf.uv_segmap_stride * f->sb256h * (64 >> f->ss_ver); void *buf = dav2d_mem_pool_pop(c->segmap_uv_pool, segmap_uv_size); if (!buf) { res = DAV2D_ERR(ENOMEM); goto error; } f->lf.segmap_uv = buf; } else { f->cur_segmap = NULL; f->cur_segmap_ref = NULL; 
f->prev_segmap_ref = NULL; f->lf.segmap_uv = NULL; f->lf.uv_segmap_stride = 0; } // CCSO map for (int p = 0; p < 3; p++) f->prev_ccsomap_ref[p] = NULL; if (f->frame_hdr->ccso.enabled) { const int n_planes = f->seq_hdr->layout == DAV2D_PIXEL_LAYOUT_I400 ? 1 : 3; for (int p = 0; p < n_planes; p++) { if (!f->frame_hdr->ccso.p[p].sb_reuse) continue; const int ref = f->frame_hdr->ccso.p[p].refidx; f->prev_ccsomap_ref[p] = c->refs[f->frame_hdr->refidx[ref]].ccsomap; dav2d_ref_inc(f->prev_ccsomap_ref[p]); f->prev_ccsomap[p] = f->prev_ccsomap_ref[p]->data; } if (f->frame_hdr->ccso.p[0].sb_reuse && (n_planes == 1 || (f->frame_hdr->ccso.p[1].sb_reuse && f->frame_hdr->ccso.p[2].sb_reuse && f->frame_hdr->ccso.p[0].refidx == f->frame_hdr->ccso.p[1].refidx && f->frame_hdr->ccso.p[0].refidx == f->frame_hdr->ccso.p[2].refidx))) { f->cur_ccsomap_ref = f->prev_ccsomap_ref[0]; dav2d_ref_inc(f->cur_ccsomap_ref); f->cur_ccsomap = NULL; } else { const size_t ccsomap_size = sizeof(*f->cur_ccsomap) * 3 * f->sb256w * f->sb256h; f->cur_ccsomap_ref = dav2d_ref_create_using_pool(c->ccsomap_pool, ccsomap_size); if (!f->cur_ccsomap_ref) { res = DAV2D_ERR(ENOMEM); goto error; } f->cur_ccsomap = f->cur_ccsomap_ref->data; } } else { f->cur_ccsomap = NULL; f->cur_ccsomap_ref = NULL; } // skipmode f->skip_mode_refs.ref[0] = 0; f->skip_mode_refs.ref[1] = f->frame_hdr->skip_mode_enabled && f->frame_hdr->n_ref_frames > 1 && abs(f->absrefdist[0] - f->absrefdist[1]) <= 1; // update references etc. 
const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags; for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { if (c->refs[i].p.p.frame_hdr) dav2d_thread_picture_unref(&c->refs[i].p); dav2d_thread_picture_ref(&c->refs[i].p, &f->cur); dav2d_cdf_thread_unref(&c->cdf[i]); if (!f->frame_hdr->disable_cdf_update) { dav2d_cdf_thread_ref(&c->cdf[i], &f->out_cdf); } else { dav2d_cdf_thread_ref(&c->cdf[i], &f->in_cdf); } dav2d_ref_dec(&c->refs[i].segmap); c->refs[i].segmap = f->frame_hdr->segmentation.update_map ? f->cur_segmap_ref : f->prev_segmap_ref; if (c->refs[i].segmap) dav2d_ref_inc(c->refs[i].segmap); dav2d_ref_dec(&c->refs[i].refmvs); if (IS_INTER_OR_SWITCH(f->frame_hdr)) { c->refs[i].refmvs = f->mvs_ref; if (f->mvs_ref) dav2d_ref_inc(f->mvs_ref); } c->refs[i].ccsomap = f->cur_ccsomap_ref; if (f->cur_ccsomap_ref) dav2d_ref_inc(f->cur_ccsomap_ref); memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc)); } } if (c->n_fc == 1) { if ((res = dav2d_decode_frame(f)) < 0) { for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { if (c->refs[i].p.p.frame_hdr) dav2d_thread_picture_unref(&c->refs[i].p); dav2d_cdf_thread_unref(&c->cdf[i]); dav2d_ref_dec(&c->refs[i].segmap); dav2d_ref_dec(&c->refs[i].refmvs); } } goto error; } } else { dav2d_task_frame_init(f); f->frame_thread.scheduled = 1; pthread_mutex_unlock(&c->task_thread.lock); } return 0; error: atomic_init(&f->task_thread.error, 1); dav2d_cdf_thread_unref(&f->in_cdf); if (f->use_pri_sec_cdf) { dav2d_cdf_thread_unref(&f->src_cdf[0]); dav2d_cdf_thread_unref(&f->src_cdf[1]); } if (!f->frame_hdr->disable_cdf_update) dav2d_cdf_thread_unref(&f->out_cdf); for (int i = 0; i < 7; i++) { if (f->refp[i].p.frame_hdr) dav2d_thread_picture_unref(&f->refp[i]); dav2d_ref_dec(&f->ref_mvs_ref[i]); } // FIXME is it correct to re-report error messages during picture output // if we already reported them during picture input? 
if (q) q->res = res; dav2d_thread_picture_unref(&f->cur); dav2d_ref_dec(&f->cur_segmap_ref); dav2d_ref_dec(&f->prev_segmap_ref); dav2d_ref_dec(&f->mvs_ref); dav2d_ref_dec(&f->cur_ccsomap_ref); for (int p = 0; p < 3; p++) dav2d_ref_dec(&f->prev_ccsomap_ref[p]); dav2d_ref_dec(&f->seq_hdr_ref); dav2d_ref_dec(&f->frame_hdr_ref); #if 0 dav2d_data_props_copy(&c->cached_error_props, &c->in.m); #endif for (int i = 0; i < f->n_tile_data; i++) dav2d_data_unref_internal(&f->tile[i].data); f->n_tile_data = 0; if (c->n_fc > 1) pthread_mutex_unlock(&c->task_thread.lock); return res; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/decode.h000066400000000000000000000030661517466257200222340ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_DECODE_H #define DAV2D_SRC_DECODE_H #include "src/internal.h" int dav2d_submit_frame(Dav2dContext *c); #endif /* DAV2D_SRC_DECODE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/derivation.h000066400000000000000000000047551517466257200231630ustar00rootroot00000000000000/* * Copyright © 2026, VideoLAN and dav2d authors * Copyright © 2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "common/attributes.h" #include "common/intops.h" #include "src/tables.h" static inline int derive_alpha(const int num, const int den, int alpha) { const int max = (2 << 8) - 1; if (num && den) { const int num_abs = abs(num); const int shift_n = ulog2(num_abs); assert(den >= 0); const int shift_d = ulog2(den); const int e_d = den - (1U << shift_d); int f_d, f_n; if (shift_d > 7) f_d = (e_d + (1 << (shift_d - 8))) >> (shift_d - 7); else f_d = e_d << (7 - shift_d); if (shift_n > 7) f_n = (num_abs + (1 << (shift_n - 8))) >> (shift_n - 7); else f_n = num_abs << (7 - shift_n); const int shift_add = shift_d - shift_n - 8; if (shift_add <= 1) { const int shift0 = 9 + 7 + shift_add; const int tmp_alpha = shift0 < 0 ? max : imin((dav2d_div_recip[f_d] * f_n) >> shift0, max); if (tmp_alpha) alpha = apply_sign(tmp_alpha, num); } } return alpha; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/dip_tables.c000066400000000000000000000754621517466257200231230ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "src/dip_tables.h" const uint16_t dav2d_dip_weights[6][64][11] = { { { 3104, 6856, 4308, 3992, 4172, 5748, 4628, 4108, 4108, 4044, 4092 }, { 3380, 6076, 6392, 3568, 4352, 3952, 4876, 4316, 4156, 4000, 4088 }, { 4032, 3956, 7448, 4496, 4120, 3908, 4240, 4496, 4304, 4064, 4088 }, { 4148, 3660, 5628, 6904, 3704, 4116, 3892, 4360, 4428, 4216, 4100 }, { 4028, 4216, 3712, 7608, 4744, 4128, 3912, 4160, 4408, 4120, 4112 }, { 4040, 4296, 3596, 5636, 7112, 4084, 4028, 4008, 4364, 3892, 4092 }, { 4084, 4148, 4096, 3708, 8336, 4092, 4076, 3952, 4300, 4256, 4112 }, { 4096, 4088, 4260, 3272, 7448, 4116, 4072, 3964, 4244, 5468, 4120 }, { 3288, 5132, 5004, 3868, 4196, 4968, 6264, 4076, 4240, 4032, 4072 }, { 3732, 4620, 6248, 4136, 4196, 3776, 5100, 4884, 4344, 4012, 4096 }, { 4012, 3944, 5916, 5580, 4072, 3884, 4008, 4872, 4648, 4092, 4108 }, { 4024, 4128, 4308, 6904, 4508, 4056, 3740, 4372, 4816, 4160, 4132 }, { 4016, 4336, 3568, 6124, 6184, 4088, 3872, 3980, 4732, 4104, 4144 }, { 
4064, 4252, 3836, 4344, 7816, 4088, 4008, 3828, 4568, 4212, 4148 }, { 4092, 4164, 4140, 3356, 8088, 4100, 4056, 3828, 4424, 4748, 4160 }, { 4100, 4128, 4192, 3324, 7224, 4140, 4052, 3908, 4280, 5620, 4180 }, { 3940, 4200, 5076, 4108, 4108, 3704, 6632, 4988, 4268, 4032, 4100 }, { 3956, 4004, 5524, 4852, 4096, 3812, 4512, 5532, 4708, 4028, 4132 }, { 3992, 4064, 4700, 6060, 4392, 3984, 3728, 4804, 5148, 4108, 4164 }, { 3996, 4320, 3732, 6200, 5524, 4048, 3764, 4028, 5180, 4176, 4192 }, { 4040, 4344, 3656, 4848, 7176, 4080, 3932, 3716, 4912, 4248, 4200 }, { 4076, 4244, 3964, 3632, 8072, 4112, 4016, 3688, 4644, 4512, 4200 }, { 4088, 4184, 4136, 3224, 7912, 4140, 4028, 3772, 4444, 5000, 4216 }, { 4092, 4144, 4156, 3372, 7160, 4196, 4004, 3920, 4248, 5612, 4240 }, { 4124, 3856, 4784, 4448, 4040, 3752, 5016, 6584, 4344, 4028, 4184 }, { 4020, 3832, 4736, 5316, 4196, 4068, 3836, 5516, 5380, 4048, 4200 }, { 4000, 4152, 3956, 5980, 4956, 4060, 3752, 4200, 5684, 4148, 4272 }, { 4012, 4352, 3572, 5308, 6456, 4056, 3928, 3588, 5356, 4236, 4292 }, { 4060, 4300, 3792, 4000, 7784, 4092, 4036, 3532, 4892, 4404, 4272 }, { 4080, 4232, 4036, 3284, 8168, 4140, 4032, 3648, 4572, 4704, 4260 }, { 4084, 4188, 4136, 3200, 7816, 4188, 4004, 3792, 4356, 5100, 4272 }, { 4092, 4148, 4156, 3412, 7144, 4228, 3988, 3948, 4180, 5556, 4292 }, { 4056, 3776, 4432, 4692, 4056, 4264, 3628, 6728, 5304, 4052, 4168 }, { 4044, 3852, 4100, 5460, 4476, 4224, 3724, 4600, 6264, 4108, 4312 }, { 4016, 4200, 3568, 5540, 5644, 4084, 3992, 3536, 5932, 4216, 4428 }, { 4044, 4284, 3616, 4520, 7192, 4060, 4120, 3364, 5228, 4320, 4408 }, { 4076, 4224, 3928, 3520, 8104, 4116, 4112, 3528, 4696, 4508, 4348 }, { 4088, 4184, 4108, 3156, 8168, 4172, 4060, 3700, 4396, 4804, 4328 }, { 4100, 4156, 4168, 3232, 7736, 4208, 4024, 3828, 4240, 5140, 4324 }, { 4112, 4120, 4184, 3456, 7128, 4248, 3988, 3968, 4124, 5508, 4320 }, { 4068, 3804, 4124, 4780, 4188, 4328, 3656, 4984, 6984, 4100, 4124 }, { 4080, 3896, 3700, 5308, 4916, 4244, 
3984, 3620, 6680, 4184, 4560 }, { 4052, 4168, 3436, 4980, 6328, 4116, 4168, 3300, 5700, 4272, 4640 }, { 4064, 4200, 3704, 3992, 7656, 4088, 4204, 3440, 4884, 4384, 4528 }, { 4092, 4152, 4024, 3312, 8208, 4136, 4148, 3664, 4408, 4596, 4432 }, { 4092, 4124, 4168, 3168, 8072, 4196, 4076, 3808, 4192, 4872, 4388 }, { 4100, 4108, 4204, 3308, 7608, 4240, 4028, 3916, 4104, 5172, 4364 }, { 4120, 4088, 4204, 3512, 7064, 4264, 4000, 4012, 4048, 5480, 4348 }, { 4116, 3868, 3868, 4748, 4432, 4216, 4128, 3452, 7656, 4164, 4504 }, { 4112, 3900, 3492, 4968, 5464, 4264, 4188, 3284, 6280, 4252, 4960 }, { 4080, 4100, 3440, 4448, 6872, 4188, 4216, 3448, 5172, 4336, 4860 }, { 4088, 4124, 3784, 3688, 7848, 4144, 4188, 3696, 4468, 4476, 4652 }, { 4100, 4080, 4080, 3280, 8104, 4172, 4132, 3860, 4136, 4688, 4516 }, { 4104, 4068, 4212, 3256, 7880, 4220, 4068, 3948, 3996, 4932, 4456 }, { 4112, 4076, 4232, 3404, 7464, 4248, 4020, 4028, 3960, 5196, 4416 }, { 4132, 4076, 4212, 3576, 6984, 4276, 3984, 4104, 3948, 5460, 4384 }, { 4128, 3908, 3704, 4608, 4792, 4208, 4260, 3284, 6552, 4200, 5516 }, { 4144, 3872, 3428, 4568, 5972, 4312, 4216, 3512, 5380, 4316, 5436 }, { 4124, 3988, 3504, 4044, 7144, 4276, 4192, 3756, 4584, 4432, 5092 }, { 4104, 4044, 3832, 3552, 7800, 4232, 4136, 3940, 4136, 4600, 4784 }, { 4108, 4028, 4108, 3360, 7856, 4232, 4072, 4036, 3936, 4808, 4604 }, { 4120, 4028, 4228, 3388, 7612, 4232, 4048, 4064, 3876, 5020, 4528 }, { 4132, 4056, 4232, 3512, 7256, 4248, 4012, 4116, 3860, 5228, 4480 }, { 4140, 4080, 4204, 3652, 6856, 4276, 3980, 4172, 3864, 5464, 4440 } }, { { 3196, 6148, 4212, 4060, 4108, 7016, 3988, 4180, 4084, 4076, 4080 }, { 3052, 6036, 5800, 3620, 4244, 5604, 4420, 4148, 4108, 4048, 4076 }, { 3688, 4136, 7604, 3692, 4240, 4852, 4492, 4220, 4096, 4048, 4080 }, { 4072, 3316, 6884, 5556, 3832, 4480, 4412, 4300, 4080, 4136, 4088 }, { 3972, 3976, 4516, 7448, 4036, 4240, 4320, 4368, 4056, 4108, 4104 }, { 3916, 4372, 3584, 6628, 5900, 4116, 4220, 4408, 4052, 3848, 4112 }, { 
4008, 4164, 4064, 4392, 7780, 4096, 4124, 4412, 4076, 3916, 4124 }, { 4056, 4012, 4400, 3424, 7476, 4096, 4076, 4388, 4128, 4980, 4116 }, { 3076, 4588, 4560, 3996, 4144, 6984, 5644, 3840, 4192, 4072, 4064 }, { 3300, 4444, 5592, 3776, 4244, 5924, 5452, 4184, 4120, 4052, 4072 }, { 3668, 3760, 6504, 3964, 4256, 5124, 5176, 4472, 4072, 4060, 4088 }, { 3884, 3592, 5988, 5156, 4144, 4584, 4884, 4672, 4052, 4096, 4104 }, { 3900, 4044, 4600, 6200, 4524, 4228, 4624, 4784, 4052, 4072, 4124 }, { 3908, 4292, 4008, 5644, 5828, 4056, 4388, 4832, 4072, 3984, 4140 }, { 3984, 4192, 4244, 4268, 7016, 4004, 4204, 4808, 4128, 4160, 4148 }, { 4036, 4076, 4432, 3656, 6836, 3996, 4116, 4744, 4192, 4908, 4160 }, { 3776, 4152, 4460, 4048, 4164, 4664, 7560, 4016, 4156, 4064, 4076 }, { 3720, 3920, 5076, 3944, 4256, 4852, 6520, 4648, 4052, 4056, 4092 }, { 3864, 3568, 5568, 4084, 4324, 4700, 5792, 5048, 4040, 4064, 4104 }, { 3964, 3620, 5268, 4744, 4360, 4388, 5268, 5268, 4060, 4092, 4116 }, { 3956, 4024, 4536, 5260, 4716, 4104, 4868, 5356, 4096, 4096, 4140 }, { 3960, 4224, 4260, 4884, 5564, 3952, 4544, 5340, 4164, 4100, 4160 }, { 4000, 4164, 4428, 4076, 6296, 3900, 4320, 5236, 4252, 4292, 4172 }, { 4044, 4052, 4552, 3732, 6200, 3904, 4184, 5124, 4316, 4844, 4200 }, { 4128, 4124, 4272, 4088, 4184, 3612, 6836, 5932, 3756, 4060, 4156 }, { 3980, 3944, 4576, 4048, 4284, 4076, 6108, 6028, 3920, 4052, 4136 }, { 4008, 3684, 4844, 4108, 4372, 4252, 5540, 6100, 4048, 4068, 4132 }, { 4048, 3656, 4760, 4404, 4456, 4232, 5096, 6116, 4152, 4092, 4136 }, { 4036, 3900, 4440, 4636, 4704, 4096, 4752, 6060, 4240, 4124, 4156 }, { 4032, 4092, 4356, 4420, 5192, 3964, 4488, 5924, 4340, 4168, 4176 }, { 4040, 4108, 4512, 3964, 5620, 3888, 4296, 5724, 4444, 4352, 4200 }, { 4064, 4024, 4640, 3756, 5588, 3872, 4168, 5524, 4496, 4768, 4244 }, { 4056, 4120, 4156, 4100, 4200, 4020, 4548, 7864, 3876, 4060, 4160 }, { 4016, 4044, 4252, 4088, 4296, 4088, 4676, 7280, 4228, 4048, 4128 }, { 4040, 3876, 4388, 4104, 4376, 4172, 
4612, 7000, 4384, 4064, 4144 }, { 4076, 3752, 4448, 4184, 4452, 4216, 4480, 6824, 4476, 4088, 4160 }, { 4072, 3800, 4384, 4264, 4568, 4212, 4328, 6648, 4552, 4136, 4176 }, { 4060, 3928, 4384, 4176, 4780, 4140, 4196, 6440, 4636, 4216, 4196 }, { 4060, 4016, 4500, 3936, 5008, 4032, 4100, 6168, 4704, 4380, 4232 }, { 4080, 4000, 4632, 3792, 5052, 3944, 4036, 5896, 4740, 4696, 4276 }, { 4008, 4112, 4100, 4100, 4200, 4316, 3512, 7256, 5564, 4068, 3936 }, { 4012, 4088, 4100, 4096, 4276, 4260, 3744, 7064, 5396, 4056, 4056 }, { 4048, 3976, 4168, 4084, 4340, 4252, 3848, 6984, 5252, 4064, 4140 }, { 4080, 3816, 4296, 4068, 4384, 4284, 3888, 6904, 5156, 4088, 4184 }, { 4092, 3748, 4364, 4072, 4404, 4324, 3892, 6776, 5116, 4144, 4212 }, { 4080, 3792, 4396, 4060, 4424, 4316, 3888, 6600, 5104, 4248, 4236 }, { 4068, 3908, 4456, 3956, 4520, 4224, 3884, 6360, 5096, 4392, 4276 }, { 4080, 3984, 4552, 3844, 4612, 4088, 3880, 6076, 5076, 4636, 4324 }, { 4064, 4088, 4088, 4080, 4184, 4208, 3796, 5112, 7528, 4068, 3940 }, { 4052, 4076, 4060, 4076, 4244, 4276, 3636, 5796, 6712, 4064, 4164 }, { 4064, 3988, 4104, 4072, 4272, 4312, 3576, 6200, 6216, 4076, 4268 }, { 4088, 3848, 4236, 4036, 4276, 4340, 3592, 6392, 5928, 4100, 4308 }, { 4108, 3744, 4352, 4012, 4224, 4380, 3636, 6440, 5760, 4168, 4320 }, { 4108, 3748, 4396, 4020, 4172, 4380, 3700, 6376, 5636, 4268, 4336 }, { 4100, 3832, 4412, 3996, 4184, 4320, 3752, 6232, 5532, 4404, 4372 }, { 4104, 3932, 4468, 3908, 4280, 4192, 3788, 6024, 5420, 4596, 4424 }, { 4088, 4068, 4100, 4056, 4180, 4128, 4056, 3984, 7576, 4068, 4840 }, { 4080, 4040, 4080, 4052, 4216, 4232, 3756, 4860, 6988, 4072, 4764 }, { 4088, 3996, 4088, 4068, 4204, 4284, 3608, 5444, 6568, 4092, 4708 }, { 4104, 3884, 4200, 4036, 4172, 4332, 3556, 5804, 6280, 4132, 4656 }, { 4128, 3772, 4340, 3988, 4108, 4356, 3592, 5972, 6084, 4196, 4612 }, { 4148, 3736, 4396, 4000, 4024, 4348, 3676, 6012, 5924, 4292, 4592 }, { 4144, 3796, 4380, 4032, 3988, 4308, 3736, 5980, 5764, 4416, 4600 }, { 4148, 
3868, 4412, 3972, 4068, 4228, 3768, 5872, 5600, 4572, 4628 } }, { { 3084, 7992, 3728, 4180, 4076, 5664, 4056, 4104, 4076, 4100, 4096 }, { 2972, 7832, 5864, 3488, 4300, 4064, 4324, 4044, 4084, 4068, 4112 }, { 3880, 4572, 8464, 3480, 4272, 3956, 4228, 4064, 4060, 4068, 4112 }, { 4268, 3160, 7432, 6012, 3516, 4184, 4104, 4100, 4056, 4220, 4104 }, { 4052, 3992, 4300, 8464, 3612, 4156, 4104, 4116, 4068, 4192, 4092 }, { 3972, 4424, 3232, 7272, 6012, 4088, 4120, 4116, 4064, 3744, 4100 }, { 4068, 4160, 3948, 4368, 8368, 4080, 4120, 4116, 4060, 3776, 4104 }, { 4100, 4052, 4340, 3176, 7944, 4060, 4140, 4096, 4060, 5084, 4100 }, { 2796, 7656, 3912, 4112, 4096, 5176, 5260, 3812, 4140, 4096, 4108 }, { 3044, 7608, 5988, 3468, 4304, 3660, 4844, 3952, 4080, 4068, 4132 }, { 3880, 4636, 8336, 3592, 4236, 3740, 4416, 4052, 4032, 4084, 4140 }, { 4224, 3324, 7240, 6108, 3536, 4064, 4184, 4104, 4028, 4220, 4120 }, { 4040, 4040, 4276, 8336, 3784, 4100, 4148, 4124, 4032, 4168, 4104 }, { 3976, 4424, 3284, 7096, 6132, 4044, 4164, 4112, 4044, 3776, 4104 }, { 4072, 4176, 3932, 4368, 8272, 4036, 4160, 4108, 4036, 3872, 4112 }, { 4100, 4076, 4304, 3224, 7928, 4020, 4176, 4096, 4028, 5084, 4112 }, { 3268, 7352, 4068, 4052, 4112, 3308, 6880, 3780, 4116, 4096, 4120 }, { 3276, 7352, 6124, 3448, 4308, 2892, 5460, 4044, 4016, 4072, 4160 }, { 3948, 4648, 8272, 3704, 4196, 3420, 4600, 4132, 3992, 4100, 4160 }, { 4228, 3468, 7048, 6200, 3556, 3896, 4252, 4132, 4004, 4224, 4136 }, { 4064, 4092, 4240, 8240, 3916, 3992, 4200, 4112, 4024, 4160, 4116 }, { 4000, 4424, 3316, 6952, 6216, 3976, 4212, 4092, 4028, 3808, 4116 }, { 4080, 4192, 3924, 4344, 8240, 3988, 4208, 4080, 4028, 3944, 4124 }, { 4092, 4096, 4284, 3248, 7912, 3992, 4216, 4088, 4004, 5096, 4132 }, { 3532, 7032, 4224, 3992, 4132, 2444, 6616, 5148, 3772, 4096, 4172 }, { 3440, 7096, 6232, 3448, 4304, 2420, 5428, 4648, 3868, 4076, 4192 }, { 4012, 4652, 8152, 3840, 4148, 3144, 4620, 4352, 3932, 4116, 4188 }, { 4236, 3596, 6872, 6312, 3568, 3732, 4288, 
4180, 3996, 4232, 4152 }, { 4096, 4132, 4212, 8136, 4052, 3884, 4248, 4092, 4036, 4152, 4128 }, { 4036, 4428, 3352, 6808, 6312, 3900, 4252, 4060, 4032, 3844, 4132 }, { 4100, 4212, 3916, 4312, 8208, 3912, 4272, 4044, 4012, 4012, 4144 }, { 4100, 4104, 4272, 3260, 7896, 3936, 4272, 4064, 3980, 5116, 4156 }, { 3460, 6728, 4364, 3944, 4140, 2780, 4832, 6904, 3724, 4104, 4184 }, { 3484, 6824, 6312, 3468, 4288, 2412, 4812, 5340, 3920, 4084, 4196 }, { 4024, 4676, 7992, 4004, 4096, 3012, 4508, 4516, 4008, 4136, 4192 }, { 4236, 3720, 6680, 6392, 3592, 3592, 4316, 4184, 4032, 4236, 4164 }, { 4124, 4164, 4196, 7992, 4196, 3788, 4288, 4064, 4048, 4144, 4140 }, { 4068, 4424, 3392, 6664, 6376, 3832, 4296, 4036, 4028, 3884, 4148 }, { 4116, 4228, 3916, 4288, 8168, 3844, 4316, 4044, 3988, 4080, 4164 }, { 4116, 4116, 4260, 3280, 7848, 3852, 4328, 4068, 3956, 5140, 4176 }, { 3424, 6412, 4484, 3908, 4144, 3060, 3976, 6600, 5024, 4108, 4016 }, { 3488, 6552, 6360, 3512, 4272, 2480, 4444, 5268, 4524, 4092, 4160 }, { 4020, 4704, 7800, 4164, 4056, 2916, 4436, 4460, 4248, 4148, 4200 }, { 4232, 3840, 6504, 6440, 3648, 3448, 4376, 4116, 4128, 4236, 4180 }, { 4140, 4188, 4212, 7832, 4344, 3692, 4348, 4036, 4056, 4144, 4168 }, { 4084, 4416, 3452, 6520, 6444, 3760, 4344, 4040, 4004, 3920, 4172 }, { 4124, 4240, 3924, 4272, 8104, 3784, 4340, 4072, 3960, 4148, 4188 }, { 4120, 4128, 4256, 3312, 7800, 3804, 4332, 4096, 3932, 5168, 4196 }, { 3492, 6108, 4588, 3884, 4144, 2964, 4340, 4632, 6920, 4112, 3968 }, { 3500, 6264, 6360, 3580, 4244, 2448, 4564, 4464, 5432, 4108, 4196 }, { 3996, 4720, 7576, 4308, 4040, 2836, 4472, 4204, 4568, 4164, 4256 }, { 4220, 3960, 6340, 6424, 3744, 3332, 4400, 4064, 4204, 4240, 4236 }, { 4156, 4212, 4248, 7608, 4508, 3600, 4380, 4032, 4060, 4144, 4208 }, { 4088, 4400, 3528, 6360, 6488, 3700, 4372, 4068, 3968, 3968, 4208 }, { 4120, 4248, 3940, 4280, 8008, 3724, 4356, 4116, 3916, 4224, 4216 }, { 4116, 4144, 4244, 3376, 7704, 3760, 4336, 4128, 3908, 5200, 4228 }, { 3528, 5820, 
4680, 3876, 4144, 2940, 4660, 3480, 7272, 4116, 4644 }, { 3500, 5964, 6280, 3668, 4220, 2444, 4712, 3848, 5836, 4128, 4548 }, { 3956, 4700, 7320, 4420, 4052, 2792, 4536, 3976, 4740, 4188, 4476 }, { 4200, 4052, 6188, 6296, 3888, 3264, 4392, 4024, 4200, 4248, 4392 }, { 4172, 4232, 4320, 7316, 4668, 3516, 4364, 4080, 3988, 4168, 4328 }, { 4108, 4392, 3628, 6200, 6472, 3620, 4364, 4112, 3916, 4040, 4288 }, { 4124, 4260, 3964, 4312, 7848, 3664, 4360, 4140, 3876, 4312, 4288 }, { 4120, 4172, 4232, 3468, 7560, 3700, 4352, 4136, 3876, 5240, 4292 } }, { { 3088, 6264, 4140, 4268, 4084, 6584, 4372, 4096, 4100, 4072, 4080 }, { 2948, 5652, 5500, 4332, 4116, 5460, 4844, 4072, 4104, 4048, 4080 }, { 3488, 3520, 6840, 4828, 4100, 5088, 4924, 4144, 4080, 4052, 4088 }, { 3804, 2428, 6236, 6408, 3928, 4952, 4912, 4200, 4072, 4120, 4092 }, { 3760, 2572, 4516, 7672, 4348, 4888, 4872, 4244, 4076, 4112, 4092 }, { 3748, 2760, 3716, 6728, 6168, 4764, 4848, 4268, 4104, 3956, 4088 }, { 3808, 2772, 3940, 4368, 8136, 4648, 4804, 4284, 4124, 4156, 4104 }, { 3844, 2920, 4068, 3188, 7952, 4528, 4744, 4280, 4120, 5372, 4148 }, { 2988, 4736, 4384, 4432, 4068, 6216, 6216, 3824, 4152, 4068, 4080 }, { 3104, 4056, 5064, 4960, 4048, 5300, 6336, 4052, 4092, 4048, 4096 }, { 3456, 2604, 5572, 5748, 4080, 4968, 6280, 4240, 4036, 4060, 4116 }, { 3680, 1608, 5240, 6696, 4276, 4860, 6184, 4364, 4016, 4096, 4132 }, { 3708, 1352, 4396, 6936, 5148, 4804, 6092, 4440, 4044, 4096, 4128 }, { 3760, 1352, 3940, 5724, 6872, 4676, 6012, 4492, 4100, 4112, 4120 }, { 3792, 1528, 3812, 4088, 8120, 4532, 5892, 4524, 4136, 4576, 4144 }, { 3808, 1832, 3656, 3368, 7848, 4412, 5716, 4524, 4136, 5628, 4220 }, { 3628, 4044, 4324, 4588, 4072, 4248, 7832, 4156, 4080, 4068, 4108 }, { 3568, 3196, 4684, 5272, 4096, 4196, 7368, 4580, 4004, 4052, 4140 }, { 3752, 1944, 4876, 6028, 4268, 4164, 7148, 4760, 3980, 4060, 4172 }, { 3884, 1032, 4600, 6568, 4748, 4164, 7048, 4836, 4000, 4096, 4188 }, { 3920, 664, 4068, 6216, 5908, 4160, 6952, 4880, 
4060, 4140, 4188 }, { 3968, 568, 3744, 4952, 7432, 4092, 6824, 4924, 4112, 4324, 4200 }, { 3972, 776, 3504, 3776, 8272, 4016, 6632, 4944, 4152, 4868, 4244 }, { 3936, 1144, 3260, 3324, 7992, 3964, 6376, 4932, 4160, 5744, 4332 }, { 3960, 3720, 4244, 4644, 4112, 3504, 6940, 6028, 3744, 4068, 4192 }, { 3848, 2804, 4424, 5308, 4232, 3684, 6688, 6012, 3888, 4060, 4208 }, { 3948, 1688, 4448, 5900, 4568, 3736, 6652, 5896, 3996, 4072, 4244 }, { 4044, 840, 4152, 6108, 5308, 3748, 6700, 5796, 4076, 4116, 4272 }, { 4100, 456, 3688, 5420, 6648, 3716, 6712, 5740, 4132, 4220, 4304 }, { 4128, 384, 3340, 4296, 7944, 3672, 6648, 5724, 4180, 4504, 4332 }, { 4108, 536, 3068, 3456, 8496, 3656, 6464, 5684, 4228, 5060, 4388 }, { 4048, 840, 2836, 3232, 8184, 3652, 6232, 5596, 4280, 5796, 4460 }, { 3960, 3576, 4176, 4608, 4184, 3872, 4772, 7752, 4000, 4068, 4180 }, { 3936, 2724, 4196, 5172, 4420, 3772, 5156, 7160, 4312, 4072, 4228 }, { 4016, 1784, 4080, 5544, 4944, 3712, 5440, 6776, 4448, 4104, 4312 }, { 4116, 1000, 3788, 5348, 5980, 3668, 5652, 6552, 4496, 4160, 4388 }, { 4180, 616, 3360, 4516, 7384, 3620, 5756, 6440, 4528, 4304, 4436 }, { 4192, 540, 2956, 3648, 8400, 3588, 5756, 6360, 4560, 4664, 4480 }, { 4160, 664, 2640, 3164, 8720, 3576, 5664, 6232, 4616, 5196, 4532 }, { 4108, 856, 2444, 3156, 8336, 3568, 5556, 6028, 4712, 5828, 4580 }, { 3968, 3560, 4096, 4532, 4268, 4064, 3808, 7068, 5688, 4076, 4024 }, { 3980, 2840, 3980, 4916, 4644, 3860, 4288, 6632, 5700, 4096, 4224 }, { 4040, 2076, 3756, 4992, 5388, 3756, 4612, 6344, 5636, 4148, 4400 }, { 4144, 1400, 3476, 4420, 6696, 3680, 4832, 6200, 5600, 4212, 4496 }, { 4204, 1016, 3088, 3592, 8008, 3644, 4956, 6116, 5572, 4420, 4548 }, { 4200, 888, 2668, 3052, 8748, 3616, 5000, 6020, 5588, 4804, 4580 }, { 4184, 908, 2340, 2900, 8816, 3596, 4996, 5872, 5612, 5308, 4624 }, { 4144, 996, 2212, 3052, 8368, 3584, 4980, 5652, 5660, 5848, 4660 }, { 4020, 3632, 3992, 4412, 4368, 3964, 4100, 4696, 7768, 4096, 4108 }, { 4024, 3020, 3796, 4564, 4908, 
3832, 4288, 4792, 7320, 4140, 4460 }, { 4068, 2444, 3508, 4280, 5908, 3724, 4484, 4780, 7112, 4204, 4636 }, { 4144, 1892, 3212, 3496, 7320, 3660, 4632, 4768, 7032, 4304, 4696 }, { 4200, 1492, 2860, 2820, 8400, 3636, 4708, 4760, 7000, 4568, 4704 }, { 4208, 1272, 2484, 2572, 8880, 3624, 4736, 4736, 6968, 4960, 4720 }, { 4192, 1192, 2196, 2676, 8784, 3612, 4740, 4668, 6936, 5412, 4732 }, { 4156, 1184, 2116, 2932, 8336, 3596, 4744, 4584, 6872, 5864, 4756 }, { 4052, 3684, 3896, 4268, 4476, 3896, 4432, 3196, 8024, 4128, 5100 }, { 4056, 3184, 3624, 4160, 5212, 3804, 4480, 3400, 7816, 4200, 5212 }, { 4096, 2740, 3300, 3580, 6392, 3704, 4580, 3472, 7804, 4300, 5184 }, { 4152, 2284, 3016, 2788, 7664, 3636, 4668, 3504, 7852, 4468, 5116 }, { 4204, 1864, 2716, 2336, 8496, 3604, 4728, 3528, 7884, 4768, 5036 }, { 4228, 1568, 2412, 2308, 8780, 3584, 4756, 3556, 7856, 5132, 5000 }, { 4204, 1416, 2188, 2532, 8624, 3592, 4728, 3604, 7780, 5516, 4976 }, { 4160, 1340, 2140, 2832, 8240, 3588, 4700, 3684, 7616, 5888, 4976 } }, { { 3760, 7016, 3760, 4140, 4084, 6184, 3840, 4108, 4084, 4096, 4088 }, { 3224, 7900, 4904, 3832, 4152, 4708, 4112, 4048, 4104, 4072, 4080 }, { 3720, 5724, 7276, 3616, 4180, 4192, 4108, 4092, 4096, 4060, 4084 }, { 4248, 3840, 7736, 4936, 3812, 4100, 4056, 4132, 4080, 4132, 4084 }, { 4168, 3908, 5652, 7240, 3564, 4104, 4052, 4152, 4068, 4160, 4088 }, { 4020, 4384, 3916, 7764, 4664, 4064, 4076, 4152, 4064, 3952, 4092 }, { 4032, 4328, 3916, 5676, 6948, 4060, 4100, 4136, 4060, 3800, 4100 }, { 4120, 4120, 4428, 3604, 7848, 4016, 4144, 4108, 4056, 4612, 4104 }, { 3512, 5820, 3916, 4088, 4104, 6904, 4644, 3876, 4116, 4088, 4080 }, { 3512, 7304, 4408, 3968, 4112, 5296, 4308, 3988, 4096, 4080, 4080 }, { 3652, 6584, 6172, 3748, 4132, 4404, 4132, 4076, 4092, 4068, 4084 }, { 4044, 4788, 7464, 4336, 3948, 4180, 3988, 4140, 4068, 4092, 4092 }, { 4160, 4080, 6604, 6096, 3656, 4136, 3984, 4148, 4048, 4132, 4100 }, { 4056, 4264, 4928, 7364, 4032, 4112, 4036, 4136, 4040, 4080, 4104 
}, { 4024, 4384, 4176, 6600, 5548, 4080, 4084, 4116, 4028, 4000, 4116 }, { 4084, 4260, 4328, 4724, 6952, 4060, 4116, 4096, 4012, 4396, 4124 }, { 3796, 5056, 3976, 4072, 4116, 5780, 6376, 3700, 4120, 4084, 4068 }, { 3696, 6424, 4200, 4000, 4116, 5612, 4896, 3976, 4064, 4080, 4084 }, { 3684, 6744, 5388, 3820, 4116, 4808, 4264, 4080, 4072, 4076, 4096 }, { 3920, 5568, 6904, 4028, 4008, 4356, 4004, 4108, 4068, 4084, 4112 }, { 4096, 4512, 7016, 5236, 3732, 4208, 3948, 4104, 4048, 4132, 4124 }, { 4084, 4276, 5836, 6680, 3728, 4168, 3996, 4064, 4040, 4152, 4124 }, { 4052, 4344, 4760, 6888, 4560, 4136, 4056, 4048, 4020, 4148, 4136 }, { 4072, 4332, 4412, 5740, 5852, 4108, 4100, 4040, 3988, 4352, 4156 }, { 4112, 4620, 3992, 4064, 4128, 4516, 7080, 4580, 3872, 4084, 4108 }, { 3856, 5636, 4096, 4008, 4120, 5360, 5740, 4160, 3992, 4088, 4088 }, { 3700, 6488, 4844, 3884, 4092, 5104, 4724, 4056, 4060, 4092, 4104 }, { 3836, 6004, 6268, 3876, 4036, 4584, 4248, 4000, 4092, 4092, 4120 }, { 4048, 4960, 7048, 4624, 3796, 4292, 4064, 3976, 4068, 4132, 4144 }, { 4112, 4424, 6468, 5964, 3612, 4196, 4032, 3956, 4044, 4188, 4156 }, { 4100, 4348, 5388, 6744, 4000, 4144, 4072, 3956, 4012, 4224, 4168 }, { 4104, 4332, 4736, 6296, 4976, 4112, 4124, 3960, 3972, 4348, 4184 }, { 4124, 4384, 4004, 4064, 4128, 4256, 5996, 6312, 3672, 4084, 4124 }, { 3984, 5088, 4016, 4024, 4108, 4848, 6200, 4776, 3924, 4100, 4092 }, { 3772, 6020, 4472, 3940, 4072, 5024, 5460, 4124, 4048, 4108, 4108 }, { 3760, 6100, 5676, 3840, 4028, 4744, 4760, 3904, 4092, 4108, 4140 }, { 3956, 5316, 6776, 4240, 3856, 4428, 4336, 3860, 4072, 4136, 4172 }, { 4108, 4632, 6760, 5340, 3624, 4240, 4160, 3852, 4036, 4200, 4188 }, { 4156, 4388, 5940, 6328, 3752, 4136, 4136, 3872, 3988, 4252, 4204 }, { 4148, 4344, 5164, 6424, 4404, 4092, 4152, 3912, 3944, 4348, 4216 }, { 4064, 4256, 4020, 4060, 4112, 4372, 4696, 7160, 4332, 4092, 3996 }, { 4032, 4728, 3976, 4040, 4080, 4536, 5900, 5648, 4020, 4112, 4080 }, { 3868, 5540, 4240, 3984, 4044, 
4768, 5912, 4520, 4028, 4124, 4124 }, { 3768, 5932, 5180, 3852, 4020, 4720, 5328, 4000, 4056, 4124, 4160 }, { 3872, 5500, 6360, 4024, 3904, 4516, 4736, 3856, 4032, 4144, 4196 }, { 4048, 4856, 6776, 4836, 3692, 4312, 4372, 3840, 3992, 4204, 4220 }, { 4148, 4492, 6296, 5828, 3684, 4172, 4208, 3872, 3936, 4260, 4240 }, { 4184, 4396, 5596, 6232, 4096, 4076, 4156, 3916, 3904, 4344, 4248 }, { 4064, 4212, 4032, 4052, 4088, 4332, 4268, 6184, 5924, 4100, 3884 }, { 4048, 4512, 3972, 4044, 4052, 4420, 5244, 6176, 4448, 4116, 4100 }, { 3956, 5084, 4132, 4008, 4020, 4576, 5804, 5188, 4060, 4136, 4180 }, { 3852, 5604, 4800, 3908, 3996, 4592, 5644, 4408, 3988, 4144, 4208 }, { 3844, 5528, 5868, 3948, 3932, 4480, 5140, 4048, 3964, 4160, 4232 }, { 3952, 5048, 6552, 4480, 3776, 4340, 4660, 3952, 3920, 4204, 4252 }, { 4076, 4652, 6440, 5348, 3708, 4208, 4344, 3968, 3864, 4260, 4276 }, { 4152, 4504, 5916, 5904, 3936, 4096, 4164, 3996, 3852, 4340, 4280 }, { 4108, 4216, 4032, 4056, 4060, 4184, 4380, 4604, 7000, 4108, 4400 }, { 4100, 4384, 3984, 4064, 4012, 4340, 4728, 5948, 5124, 4132, 4324 }, { 4064, 4728, 4088, 4044, 3988, 4452, 5380, 5700, 4216, 4152, 4332 }, { 3976, 5212, 4532, 3976, 3972, 4472, 5596, 4964, 3960, 4160, 4316 }, { 3880, 5432, 5388, 3948, 3948, 4392, 5332, 4444, 3888, 4172, 4312 }, { 3896, 5176, 6200, 4244, 3852, 4280, 4916, 4204, 3840, 4208, 4316 }, { 3980, 4812, 6404, 4940, 3744, 4164, 4560, 4128, 3812, 4264, 4324 }, { 4080, 4612, 6100, 5556, 3840, 4068, 4300, 4084, 3812, 4348, 4324 } }, { { 3300, 6744, 4516, 4400, 4156, 5172, 4368, 4132, 4220, 4080, 4080 }, { 3676, 5424, 6504, 4336, 4284, 4104, 4252, 4152, 4288, 4068, 4072 }, { 4032, 4052, 7304, 4688, 4292, 4188, 4044, 4108, 4296, 4080, 4072 }, { 4112, 3812, 6392, 5916, 4128, 4212, 4008, 4072, 4304, 4132, 4068 }, { 4044, 4064, 4952, 7016, 4324, 4216, 3996, 4076, 4312, 4096, 4060 }, { 4032, 4112, 4432, 6564, 5452, 4200, 4020, 4084, 4312, 3884, 4060 }, { 4052, 4048, 4600, 5300, 6580, 4196, 4048, 4076, 4300, 3880, 4064 
}, { 4060, 4056, 4688, 4596, 6344, 4180, 4096, 4044, 4284, 4736, 4072 }, { 3456, 5300, 5260, 4624, 4224, 4728, 5004, 4092, 4336, 4080, 4048 }, { 3880, 4532, 6360, 5004, 4316, 4136, 4176, 4224, 4404, 4068, 4048 }, { 4032, 3996, 6468, 5396, 4392, 4240, 3960, 4116, 4440, 4068, 4048 }, { 4048, 3960, 5984, 5844, 4504, 4260, 3928, 4060, 4452, 4064, 4048 }, { 4024, 4060, 5396, 6168, 4752, 4260, 3928, 4056, 4444, 4024, 4048 }, { 4024, 4056, 5120, 6104, 5124, 4248, 3968, 4036, 4444, 3980, 4048 }, { 4028, 4032, 5056, 5892, 5260, 4252, 4012, 4020, 4444, 4112, 4056 }, { 4028, 4084, 4936, 5744, 4976, 4232, 4088, 4008, 4396, 4580, 4072 }, { 3868, 4684, 5460, 4808, 4288, 3988, 5284, 4284, 4360, 4072, 4048 }, { 3980, 4240, 6184, 5300, 4396, 4156, 4028, 4280, 4476, 4060, 4052 }, { 4044, 3968, 6168, 5584, 4520, 4248, 3880, 4116, 4520, 4048, 4052 }, { 4044, 3944, 5916, 5764, 4664, 4280, 3884, 4044, 4528, 4032, 4052 }, { 4024, 4012, 5584, 5960, 4756, 4288, 3900, 4020, 4520, 4028, 4056 }, { 4016, 4048, 5364, 6100, 4768, 4272, 3948, 3984, 4520, 4064, 4064 }, { 4020, 4044, 5284, 6116, 4660, 4268, 3996, 3972, 4500, 4212, 4076 }, { 4016, 4104, 5120, 6100, 4408, 4264, 4076, 3972, 4456, 4548, 4092 }, { 4008, 4436, 5468, 4908, 4356, 3872, 4780, 4908, 4272, 4064, 4080 }, { 4016, 4136, 6092, 5396, 4484, 4200, 3896, 4268, 4548, 4052, 4064 }, { 4064, 3936, 6116, 5612, 4572, 4256, 3840, 4072, 4564, 4044, 4076 }, { 4060, 3880, 5984, 5724, 4668, 4284, 3876, 4004, 4564, 4028, 4080 }, { 4036, 3960, 5684, 5964, 4644, 4292, 3920, 3956, 4568, 4052, 4084 }, { 4024, 4032, 5420, 6216, 4516, 4264, 3980, 3916, 4564, 4120, 4092 }, { 4028, 4052, 5328, 6264, 4380, 4264, 4020, 3916, 4536, 4264, 4104 }, { 4016, 4112, 5196, 6188, 4224, 4272, 4080, 3928, 4492, 4520, 4116 }, { 3960, 4316, 5444, 4944, 4404, 4124, 4092, 5340, 4384, 4060, 4072 }, { 4016, 4096, 6044, 5444, 4504, 4244, 3912, 4124, 4640, 4052, 4080 }, { 4052, 3928, 6152, 5608, 4548, 4280, 3864, 3980, 4588, 4048, 4108 }, { 4064, 3836, 6088, 5700, 4604, 
4292, 3892, 3956, 4572, 4036, 4112 }, { 4048, 3896, 5780, 6004, 4512, 4280, 3972, 3888, 4588, 4076, 4108 }, { 4028, 4004, 5436, 6356, 4324, 4256, 4036, 3848, 4584, 4164, 4116 }, { 4024, 4076, 5268, 6424, 4220, 4252, 4072, 3848, 4560, 4292, 4124 }, { 4012, 4164, 5180, 6232, 4160, 4260, 4116, 3880, 4528, 4504, 4132 }, { 3960, 4236, 5412, 4976, 4408, 4188, 3980, 4912, 5044, 4068, 3968 }, { 4008, 4092, 6020, 5468, 4468, 4252, 4020, 3952, 4696, 4060, 4112 }, { 4036, 3972, 6152, 5604, 4492, 4276, 3940, 3892, 4568, 4056, 4160 }, { 4064, 3864, 6152, 5692, 4508, 4268, 3948, 3904, 4548, 4052, 4160 }, { 4056, 3884, 5860, 6028, 4376, 4248, 4028, 3856, 4564, 4100, 4148 }, { 4040, 3984, 5460, 6424, 4192, 4240, 4092, 3812, 4576, 4192, 4144 }, { 4024, 4104, 5212, 6520, 4108, 4240, 4120, 3816, 4572, 4308, 4144 }, { 4020, 4200, 5108, 6296, 4104, 4228, 4160, 3848, 4548, 4492, 4148 }, { 3988, 4196, 5388, 4980, 4368, 4136, 4120, 4284, 5632, 4076, 3988 }, { 3996, 4128, 5976, 5476, 4420, 4252, 4088, 3948, 4592, 4068, 4208 }, { 4024, 4044, 6116, 5612, 4424, 4260, 4008, 3896, 4476, 4068, 4232 }, { 4056, 3936, 6132, 5708, 4404, 4232, 4012, 3888, 4496, 4072, 4212 }, { 4068, 3908, 5900, 6028, 4264, 4216, 4064, 3864, 4520, 4128, 4192 }, { 4048, 4004, 5496, 6408, 4112, 4216, 4120, 3836, 4524, 4212, 4184 }, { 4024, 4140, 5180, 6520, 4052, 4224, 4148, 3836, 4524, 4328, 4180 }, { 4024, 4216, 5060, 6312, 4084, 4228, 4172, 3868, 4516, 4488, 4184 }, { 3988, 4208, 5268, 4968, 4308, 4116, 4108, 4092, 5544, 4088, 4472 }, { 3992, 4176, 5856, 5468, 4368, 4220, 4092, 4104, 4408, 4084, 4388 }, { 4012, 4124, 6012, 5620, 4348, 4236, 4064, 3964, 4344, 4092, 4340 }, { 4048, 4036, 6044, 5732, 4288, 4208, 4056, 3936, 4404, 4116, 4288 }, { 4072, 3992, 5892, 5956, 4188, 4192, 4080, 3920, 4432, 4168, 4260 }, { 4056, 4048, 5552, 6264, 4072, 4208, 4108, 3900, 4452, 4244, 4244 }, { 4032, 4172, 5220, 6392, 4044, 4208, 4152, 3900, 4460, 4344, 4232 }, { 4052, 4216, 5072, 6244, 4076, 4204, 4164, 3932, 4452, 4492, 4240 } } 
}; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/dip_tables.h000066400000000000000000000031221517466257200231100ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_DIP_H #define DAV2D_SRC_DIP_H #include #include "common/attributes.h" EXTERN const uint16_t dav2d_dip_weights[6][64][11]; #endif /* DAV2D_SRC_DIP_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/env.h000066400000000000000000000401101517466257200215700ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_ENV_H #define DAV2D_SRC_ENV_H #include #include #include #include "src/levels.h" #include "src/refmvs.h" #include "src/tables.h" typedef struct BlockContext { uint8_t ALIGN(fsc[64], 8); uint8_t ALIGN(mode[64], 8); uint8_t ALIGN(midx[64], 8); uint8_t ALIGN(mrl[64], 8); uint8_t ALIGN(multi_mrl[64], 8); uint8_t ALIGN(dip[64], 8); uint8_t ALIGN(lcoef[64], 8); uint8_t ALIGN(ccoef[2][64], 8); uint8_t ALIGN(seg_pred[64], 8); uint8_t ALIGN(skip_txfm[64], 8); uint8_t ALIGN(skip_mode[64], 8); uint8_t ALIGN(intra[64], 8); uint8_t ALIGN(intrabc[64], 8); uint8_t ALIGN(morph_pred[64], 8); uint8_t ALIGN(comp_type[64], 8); int8_t ALIGN(ref[2][64], 8); // -1 means intra uint8_t ALIGN(motion_mode[64], 8); uint8_t ALIGN(amvd[64], 8); uint8_t ALIGN(mvprec[64], 8); uint8_t ALIGN(filter[64], 8); // DAV2D_N_SWITCHABLE_FILTERS=3 means unset uint8_t ALIGN(tx_lpf_y[64], 8); uint8_t ALIGN(tx_lpf_uv[64], 8); uint8_t ALIGN(partition[2][64], 8); uint8_t ALIGN(uvmode[64], 8); uint8_t ALIGN(pal_sz[64], 8); } BlockContext; struct SBEdgeCtx { int8_t ref[2][64]; uint8_t motion_mode[64]; }; static inline int get_intra_ctx(const BlockContext *nx[2], const int xoff[2], const int n_ctx) { if (!n_ctx) return 0; const int i = n_ctx - 1; const int sum = (nx[0]->intra[xoff[0]] && !nx[0]->intrabc[xoff[0]]) + (nx[i]->intra[xoff[i]] && !nx[i]->intrabc[xoff[i]]); return sum + (sum == n_ctx); } static inline int sm_flag(const BlockContext *const b, const int idx) { if (!b->intra[idx]) return 0; const enum IntraPredMode m = b->mode[idx]; return m == SMOOTH_PRED || m == SMOOTH_H_PRED || m == SMOOTH_V_PRED; } static inline int sm_uv_flag(const BlockContext *const b, const int idx) { const enum IntraPredMode m = b->uvmode[idx]; return m == SMOOTH_PRED || m == SMOOTH_H_PRED || m == SMOOTH_V_PRED; } static inline int get_partition_ctx(const BlockContext *const a, const BlockContext *const l, const uint8_t *const b_dim, const int plane, const int yb4, const int xb4) { return 
/*
 * Build a 2-bit context from the partition bytes stored along one block edge.
 *
 * dir == 0 (horizontal) reads the left context `l` at yb4 and yb4 + half the
 * value of b_dim[1]; any other dir (vertical) reads the above context `a` at
 * xb4 and xb4 + half the value of b_dim[0]. The bit tested in each byte is
 * number (b_dim[3] - 2) for horizontal and (b_dim[2] - 2) for vertical.
 *
 * Bit 0 of the result comes from the entry at the half-way offset, bit 1 from
 * the entry at the block origin.
 */
static inline int get_partition2_ctx(const BlockContext *const a,
                                     const BlockContext *const l,
                                     const uint8_t *const b_dim,
                                     const int plane, const int dir,
                                     const int yb4, const int xb4)
{
    const int vertical = dir != 0;

    // Pick the edge, start position and block dimensions for this direction.
    const uint8_t *const edge = vertical ? a->partition[plane]
                                         : l->partition[plane];
    const int pos = vertical ? xb4 : yb4;
    const int half = b_dim[vertical ? 0 : 1] >> 1;
    const int shift = b_dim[vertical ? 2 : 3] - 2;

    const int bit_mid = (edge[pos + half] >> shift) & 1;
    const int bit_start = (edge[pos] >> shift) & 1;

    return bit_mid + (bit_start << 1);
}
nb[1]->filter[boff[1]] : DAV2D_N_SWITCHABLE_FILTERS; if (flt0 == flt1 || flt1 == DAV2D_N_SWITCHABLE_FILTERS) { return comp * 4 + flt0; } else if (flt0 == DAV2D_N_SWITCHABLE_FILTERS) { return comp * 4 + flt1; } else { return comp * 4 + DAV2D_N_SWITCHABLE_FILTERS; } } static inline int get_comp_ctx(const BlockContext *nx[2], const int xoff[2], const int n_ctx, const int8_t *const refdir) { switch (n_ctx) { default: assert(0); case 2: { const int refa2 = nx[0]->ref[1][xoff[0]]; const int refb2 = nx[1]->ref[1][xoff[1]]; if (refa2 == -1) { const int refa1 = nx[0]->ref[0][xoff[0]]; if (refb2 == -1) { const int refb1 = nx[1]->ref[0][xoff[1]]; return (refdir[refa1] == 1) ^ (refdir[refb1] == 1); } else return 2 + (!nx[0]->intrabc[xoff[0]] && refdir[refa1]); } else if (refb2 == -1) { const int refb1 = nx[1]->ref[0][xoff[1]]; return 2 + (!nx[1]->intrabc[xoff[1]] && refdir[refb1]); } else return 4; } case 1: { const int ref2 = nx[0]->ref[1][xoff[0]]; if (ref2 == -1) { const int ref1 = nx[0]->ref[0][xoff[0]]; return !nx[0]->intrabc[xoff[0]] && refdir[ref1]; } else return 3; } case 0: return 1; } } static inline int get_warp_ctx(const BlockContext *const a, const struct SBEdgeCtx *const a_sb_cache, const BlockContext *const l, const int yb4, const int xb4, const int have_top, const int have_left, const int have_top_right, const int have_bottom_left, const unsigned top_is_at_tile_boundary, const uint8_t *const b_dim, const int ref) { int ctx = 0; #define add_matching(dir, idx) do { \ ctx += (dir->ref[0][idx] == ref || dir->ref[1][idx] == ref) && \ dir->motion_mode[idx] >= 2; \ } while (0) if (have_top) { if (top_is_at_tile_boundary) { add_matching(a_sb_cache, xb4 & ~1); if (have_top_right && b_dim[0] >= 4) add_matching(a_sb_cache, (xb4 + b_dim[0] - 2) & ~1); } else { add_matching(a, xb4); if (have_top_right) add_matching(a, xb4 + b_dim[0] - 1); } } if (have_left) { add_matching(l, yb4); if (have_bottom_left) add_matching(l, yb4 + b_dim[1] - 1); } #undef add_matching return ctx; } 
static inline int get_snglref_ctx(const BlockContext *const a, const BlockContext *const l, const int yb4, const int xb4, const int have_top, const int have_left, const int have_top_right, const int have_bottom_left, const uint8_t *const b_dim, const int ref) { int row = 0, col = 0, newmv = 0; #define NEWMV0_MODE_MASK ((1 << NEWMV) | \ (1 << NEWMV_NEARMV) | \ (1 << NEWMV_NEWMV) | \ (1 << JOINT_NEWMV) | \ (1 << OPFL_NEWMV_NEARMV) | \ (1 << OPFL_NEWMV_NEWMV) | \ (1 << OPFL_JOINT_NEWMV)) // the joint_newmv modes are missing in NEWMV1_MODE_MASK, // see compound_ref1_mode() in AVM #define NEWMV1_MODE_MASK ((1 << NEARMV_NEWMV) | \ (1 << NEWMV_NEWMV) | \ (1 << OPFL_NEARMV_NEWMV) | \ (1 << OPFL_NEWMV_NEWMV)) #define add_matching(dir, cnt, idx) do { \ if (dir->ref[0][idx] == ref) { \ cnt++; \ newmv += !!((1 << dir->mode[idx]) & NEWMV0_MODE_MASK); \ } else if (dir->ref[1][idx] == ref) { \ cnt++; \ newmv += !!((1 << dir->mode[idx]) & NEWMV1_MODE_MASK); \ } \ } while (0) if (have_top) { add_matching(a, col, xb4); if (have_top_right) add_matching(a, col, xb4 + b_dim[0] - 1); } if (have_left) { add_matching(l, row, yb4); if (have_bottom_left) add_matching(l, row, yb4 + b_dim[1] - 1); } #undef NEWMV0_MODE_MASK #undef NEWMV1_MODE_MASK #undef add_matching return !!row + !!col + 2 * !!newmv; } static inline int get_compref_ctx(const BlockContext *const a, const BlockContext *const l, const int yb4, const int xb4, const int have_top, const int have_left, const int have_top_right, const int have_bottom_left, const uint8_t *const b_dim, const union refpair ref, const union refpair tip) { int row = 0, col = 0, newmv = 0; #define NEWMV_MODE_MASK ((1 << NEWMV) | \ (1 << NEARMV_NEWMV) | \ (1 << NEWMV_NEARMV) | \ (1 << NEWMV_NEWMV) | \ (1 << JOINT_NEWMV) | \ (1 << OPFL_NEARMV_NEWMV) | \ (1 << OPFL_NEWMV_NEARMV) | \ (1 << OPFL_NEWMV_NEWMV) | \ (1 << OPFL_JOINT_NEWMV)) #define add_matching(dir, cnt, idx) do { \ if (dir->ref[0][idx] == TIP_FRAME && \ tip.ref[0] == ref.ref[0] && tip.ref[1] == 
/*
 * Signed difference poc0 - poc1, wrapped to order_hint_n_bits bits.
 *
 * The raw difference is reduced to its low order_hint_n_bits bits and the
 * top bit of that field is interpreted as the sign, giving a result in
 * [-(1 << (n - 1)), (1 << (n - 1)) - 1]. Returns 0 when order_hint_n_bits
 * is 0.
 */
static inline int get_poc_diff(const int order_hint_n_bits,
                               const int poc0, const int poc1)
{
    if (order_hint_n_bits == 0)
        return 0;

    const int delta = poc0 - poc1;
    const int sign_bit = 1 << (order_hint_n_bits - 1);

    // Low bits give the magnitude part; the sign bit contributes -sign_bit.
    const int low_part = delta & (sign_bit - 1);
    const int sign_part = delta & sign_bit;

    return low_part - sign_part;
}
cur_seg_map[-stride] : 0; } } static inline void fix_int_mv_precision(mv *const mv) { mv->x = (mv->x - (mv->x >> 15) + 3) & ~7U; mv->y = (mv->y - (mv->y >> 15) + 3) & ~7U; } static inline void fix_mv_precision(const Dav2dFrameHeader *const hdr, mv *const mv) { if (hdr->force_integer_mv) { fix_int_mv_precision(mv); } else if (hdr->mv_precision < 3) { mv->x = (mv->x - (mv->x >> 15)) & ~1U; mv->y = (mv->y - (mv->y >> 15)) & ~1U; } } // mv_prec=0..6 for {8,4,2,f,h,q,e}pel static inline void mv_reduce_prec(mv *const mv, const int mv_prec) { if (mv_prec == 6) return; const int rnd = 32 >> mv_prec; mv->x = mv->x + rnd - (mv->x > 0); mv->y = mv->y + rnd - (mv->y > 0); const unsigned mask = ~(rnd * 2U - 1); mv->x &= mask; mv->y &= mask; } static inline mv get_warpmv_2d(const int32_t *const matrix, const int bx4, const int by4, const int bw4, const int bh4, const int iw4, const int ih4, const int mv_precision) { const int x = bx4 * 4 + bw4 * 2 - 1; const int y = by4 * 4 + bh4 * 2 - 1; const int64_t xc = (matrix[2] - (1 << 16)) * (int64_t) x + matrix[3] * (int64_t) y + matrix[0]; const int64_t yc = (matrix[5] - (1 << 16)) * (int64_t) y + matrix[4] * (int64_t) x + matrix[1]; const int not_epel = mv_precision < 6, shift = 13 + not_epel; const int rnd = (1 << shift) >> 1, max = 0xffff - not_epel; union mv res = (mv) { .y = iclip(apply_sign64(((llabs(yc) + rnd) >> shift) << not_epel, yc), -max, +max), .x = iclip(apply_sign64(((llabs(xc) + rnd) >> shift) << not_epel, xc), -max, +max), }; res.y = iclip(res.y, -(by4 + bh4 + 4) * 32, (ih4 - by4 + 4) * 32); res.x = iclip(res.x, -(bx4 + bw4 + 4) * 32, (iw4 - bx4 + 4) * 32); return res; } static inline mv get_gmv_2d(const Dav2dWarpedMotionParams *const gmv, const int bx4, const int by4, const int bw4, const int bh4, const int iw4, const int ih4, const Dav2dFrameHeader *const hdr) { switch (gmv->type) { case DAV2D_WM_TYPE_ROT_ZOOM: assert(gmv->matrix[5] == gmv->matrix[2]); assert(gmv->matrix[4] == -gmv->matrix[3]); // fall-through 
default: case DAV2D_WM_TYPE_AFFINE: { mv res = get_warpmv_2d(gmv->matrix, bx4, by4, bw4, bh4, iw4, ih4, hdr->mv_precision + 3); if (hdr->force_integer_mv) fix_int_mv_precision(&res); return res; } case DAV2D_WM_TYPE_TRANSLATION: { mv res = (mv) { .y = gmv->matrix[0] >> 13, .x = gmv->matrix[1] >> 13, }; res.y = iclip(res.y, -(by4 + bh4 + 4) * 32, (ih4 - by4 + 4) * 32); res.x = iclip(res.x, -(bx4 + bw4 + 4) * 32, (iw4 - bx4 + 4) * 32); if (hdr->force_integer_mv) fix_int_mv_precision(&res); return res; } case DAV2D_WM_TYPE_IDENTITY: return (mv) { .x = 0, .y = 0 }; } } static inline enum Dav2dWarpedMotionType warp_type(const int32_t *const mtx) { if (mtx[2] != mtx[5] || mtx[3] != -mtx[4]) return DAV2D_WM_TYPE_AFFINE; if (mtx[2] != 0x10000 || mtx[3]) return DAV2D_WM_TYPE_ROT_ZOOM; return mtx[0] | mtx[1] ? DAV2D_WM_TYPE_TRANSLATION : DAV2D_WM_TYPE_IDENTITY; } #endif /* DAV2D_SRC_ENV_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ext/000077500000000000000000000000001517466257200214335ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ext/x86/000077500000000000000000000000001517466257200220605ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ext/x86/x86inc.asm000066400000000000000000001661061517466257200237130ustar00rootroot00000000000000;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner ;* Anton Mitrofanov ;* Fiona Glaser ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. 
;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x86inc.asm assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used. %ifndef private_prefix %error private_prefix not defined %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN32 0 %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN32 1 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN32 1 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN32 1 %define WIN64 1 %else %define UNIX64 1 %endif %else %ifidn __OUTPUT_FORMAT__,win32 %define WIN32 1 %endif %endif %define FORMAT_ELF 0 %define FORMAT_MACHO 0 %define FORMAT_OBJ 0 %ifidn __OUTPUT_FORMAT__,elf %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,macho %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho32 %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,obj %define FORMAT_OBJ 1 %elifidn 
__OUTPUT_FORMAT__,obj2 %define FORMAT_OBJ 1 %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif ; Use VEX-encoding even in non-AVX functions %ifndef FORCE_VEX_ENCODING %define FORCE_VEX_ENCODING 0 %endif %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,win32 SECTION .rdata align=%1 %elif WIN64 SECTION .rdata align=%1 %elifidn __OUTPUT_FORMAT__,aout SECTION .text %else SECTION .rodata align=%1 %endif %endmacro %if ARCH_X86_64 %define PIC 1 ; always use PIC on x86-64 default rel %elifidn __OUTPUT_FORMAT__,win32 %define PIC 0 ; PIC isn't used on 32-bit Windows %elifndef PIC %define PIC 0 %endif %define HAVE_PRIVATE_EXTERN 1 %ifdef __NASM_VERSION_ID__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 %define HAVE_PRIVATE_EXTERN 0 %endif %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most use cases. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. 
; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. ; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %define %2q %2 %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h 
t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro LEA 2 %if ARCH_X86_64 lea %1, [%2] %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 add %1, -$+1+%2 %else mov %1, %2 %endif %endmacro ; Repeats an instruction/operation for multiple arguments. ; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" %macro REPX 2-* ; operation, args %xdefine %%f(x) %1 %rep %0 - 1 %rotate 1 %%f(%1) %endrep %endmacro %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %if ARCH_X86_64 == 0 %define movsxd movifnidn %endif %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assertion ``%1'' failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 
%xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) ; Large stack allocations on Windows need to use stack probing in order ; to guarantee that all stack memory is committed before accessing it. ; This is done by ensuring that the guard page(s) at the end of the ; currently committed pages are touched prior to any pages beyond that. %if WIN64 %assign STACK_PROBE_SIZE 8192 %elifidn __OUTPUT_FORMAT__, win32 %assign STACK_PROBE_SIZE 4096 %else %assign STACK_PROBE_SIZE 0 %endif %macro PROBE_STACK 1 ; stack_size %if STACK_PROBE_SIZE %assign %%i STACK_PROBE_SIZE %rep %1 / STACK_PROBE_SIZE mov eax, [rsp-%%i] %assign %%i %%i+STACK_PROBE_SIZE %endrep %endif %endmacro %macro RESET_STACK_STATE 0 %ifidn rstk, rsp %assign stack_offset stack_offset - stack_size_padded %else %xdefine rstk rsp %endif %assign stack_size 0 %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs RESET_STACK_STATE %ifnum %2 %if mmsize != 8 %assign xmm_regs_used %2 %endif %endif %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) PROBE_STACK stack_size_padded SUB rsp, 
stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) PROBE_STACK stack_size_padded mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 ; Reserve an additional register for storing the original stack pointer, but avoid using ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) %if ARCH_X86_64 && regs_used == 7 %assign regs_used 8 %elif ARCH_X86_64 == 0 && regs_used == 1 %assign regs_used 2 %endif %endif %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. 
%assign regs_used 5 + UNIX64 * 3 %endif %endif %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R14, 96 DECLARE_REG 12, R15, 104 DECLARE_REG 13, R12, 112 DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro ; Push XMM registers to the stack. If no argument is specified all used register ; will be pushed, otherwise only push previously unpushed registers. %macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed %if mmsize != 8 %if %0 == 2 %assign %%pushed %2 %assign xmm_regs_used %1 %elif %0 == 1 %assign %%pushed xmm_regs_used %assign xmm_regs_used %1 %else %assign %%pushed 0 %endif ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
%if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif %assign %%pushed %%pushed - high_mm_regs - 8 %if %%pushed < 0 %assign %%pushed 0 %endif %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8 %if %%regs_to_push > 0 ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32 %assign %%i %%pushed + 8 %rep %%regs_to_push movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endif %endmacro ; Allocated stack space for XMM registers and push all, or a subset, of those %macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved RESET_STACK_STATE %if mmsize != 8 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 + high_mm_regs %if %0 == 2 ASSERT %2 >= %1 %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8 %else %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8 %endif %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
%assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endif %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i xmm_regs_used - high_mm_regs %rep %%xmm_regs_on_stack %assign %%i %%i-1 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 + high_mm_regs movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 + high_mm_regs movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL RESET_STACK_STATE %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R14, 48 DECLARE_REG 12, R15, 56 DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4, %3 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1-2 RESET_STACK_STATE %if mmsize != 8 %assign xmm_regs_used %1 %endif %endmacro %macro WIN64_RESTORE_XMM 0 RESET_STACK_STATE %endmacro %macro WIN64_PUSH_XMM 0-2 %if mmsize != 8 && %0 >= 1 %assign xmm_regs_used %1 %endif %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue || cpuflag(ssse3) RET %else rep ret %endif annotate_function_size %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %if notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 
%endif ret annotate_function_size %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %if notcpuflag(ssse3) %%branch_instr equ $ %xdefine last_branch_adr %%branch_instr %endif %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif annotate_function_size %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
; Declare a function whose symbol is built from private_prefix and which gets
; hidden/private_extern visibility where the object format supports it.
;   name            - function name; the current cpuflags SUFFIX is appended
;   [PROLOGUE args] - optional, handed on to cglobal_internal ("" when omitted)
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro

; Same as cglobal, but the symbol is built from public_prefix, is not marked
; hidden, and is exported where cglobal_internal emits an export directive.
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro

; Shared implementation of cglobal and cvisible.
;   1st argument - 1 for a private symbol (cglobal), 0 for a public one (cvisible)
;   2nd argument - function name with the cpuflags suffix already appended
;   3rd argument - optional PROLOGUE arguments; "" means PROLOGUE is not invoked
; The cglobaled_<name> marker records how a symbol was introduced:
; 1 = cglobal/cvisible, 2 = cextern, 3 = cextern_naked.
%macro cglobal_internal 2-3+
    ; current_function still names the preceding function here, so this
    ; annotates the size of that function before a new one is started.
    annotate_function_size
    ; First time this name is seen: redefine it to its mangled symbol so that
    ; later uses of the plain name refer to the mangled version.
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    ; Remember the function and its section for annotate_function_size and cglobal_label.
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    ; Declare the symbol global, with format-specific type/visibility attributes.
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    ; Public symbols are additionally exported for DLL builds and OBJ output.
    %if WIN32 && !%1
        %ifdef BUILDING_DLL
            export %2
        %endif
    %elif FORMAT_OBJ && !%1
        export %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
; The symbol name is current_function followed by the label text; the label
; itself is then defined at the current position.
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

; Import a symbol defined elsewhere under private_prefix: the plain name is
; redefined to the mangled symbol and recorded as cglobaled_<name> = 2.
%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 2
    extern %1
%endmacro

; Like cextern, but without the prefix. This should be used for symbols from external libraries.
; The name is only passed through mangle() when PREFIX is defined, and is
; recorded as cglobaled_<name> = 3.
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 3
    extern %1
%endmacro

; Define a global data symbol under private_prefix (hidden/private_extern where
; supported). The optional second argument is emitted on the same line as the
; label, so it can carry the data definition itself.
%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro

%if FORMAT_ELF
    ; The GNU linker assumes the stack is executable by default.
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]

    ; The property note below is only emitted for NASM >= 2.14.03 on x86-64.
    %ifdef __NASM_VERSION_ID__
        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
            %if ARCH_X86_64
                ; Control-flow Enforcement Technology (CET) properties.
                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
                dd 0x00000004  ; n_namesz
                dd gprsize + 8 ; n_descsz
                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
                db "GNU",0     ; n_name
                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
                dd 0x00000004  ; pr_datasz
                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
                dd 0x00000000  ; pr_padding
            %endif
        %endif
    %endif
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
; Emits a symbol-size annotation for current_function. It only produces output
; when assembling with YASM (__YASM_VER__ defined), a function has been started
; (current_function defined) and the output format is ELF; otherwise it
; expands to nothing.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                ; Switch to the section recorded when the function was declared,
                ; mark the current position as the end of the function, emit the
                ; size as (end - start), then switch back via __SECT__.
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags
; Each flag owns one bit and additionally ORs in the flags listed after it, so
; enabling a flag also enables everything that flag is declared to include.

%assign cpuflags_mmx       (1<<0)
%assign cpuflags_mmx2      (1<<1)  | cpuflags_mmx
%assign cpuflags_3dnow     (1<<2)  | cpuflags_mmx
%assign cpuflags_3dnowext  (1<<3)  | cpuflags_3dnow
%assign cpuflags_sse       (1<<4)  | cpuflags_mmx2
%assign cpuflags_sse2      (1<<5)  | cpuflags_sse
%assign cpuflags_sse2slow  (1<<6)  | cpuflags_sse2
%assign cpuflags_lzcnt     (1<<7)  | cpuflags_sse2
%assign cpuflags_sse3      (1<<8)  | cpuflags_sse2
%assign cpuflags_ssse3     (1<<9)  | cpuflags_sse3
%assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
%assign cpuflags_sse42     (1<<11) | cpuflags_sse4
%assign cpuflags_aesni     (1<<12) | cpuflags_sse42
%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
%assign cpuflags_avx       (1<<15) | cpuflags_sse42
%assign cpuflags_xop       (1<<16) | cpuflags_avx
%assign cpuflags_fma4      (1<<17) | cpuflags_avx
%assign cpuflags_fma3      (1<<18) | cpuflags_avx
%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ

%assign cpuflags_cache32   (1<<24)
%assign cpuflags_cache64   (1<<25)
%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
%assign cpuflags_atom      (1<<27)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
; With f = cpuflags_<x>: (cpuflags & f) ^ f is zero exactly when every bit of f
; is set in cpuflags. Subtracting 1 turns that zero into -1, whereas any other
; result (at most 28 bits wide, given the table above) stays non-negative, so
; bit 31 of the difference is the answer and is extracted as 0 or 1.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
; Logical inverse of cpuflag(x).
%define notcpuflag(x) (cpuflag(x) ^ 1)

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
; Effects, as far as this macro itself shows:
;   - cpuflags becomes the OR of all cpuflags_<name> values passed in (0 if none)
;   - cpuname becomes the passed names joined with underscores (undefined if none)
;   - SUFFIX becomes an underscore followed by cpuname (empty if none)
;   - avx_enabled, mova, movu and movnta may be overridden depending on the flags
;   - the NOP flavour used for alignment padding is selected
%macro INIT_CPUFLAGS 0-*
    ; Start from a clean state: empty suffix, no cpu name, no flags.
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        ; Fold every argument into cpuname and cpuflags.
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        ; 128-bit code without sse2 and 256-bit code without avx2 use the
        ; float-domain move mnemonics instead of the current definitions.
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        ; The "aligned" function variant turns unaligned moves into aligned
        ; ones; otherwise sse3 without ssse3 maps movu to lddqu.
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    ; Select the NOP style: ALIGNMODE when __NASM_VERSION_ID__ is defined,
    ; the CPU directive otherwise (presumably YASM - confirm).
    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VERSION_ID__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro DEFINE_MMREGS 1 ; mmtype %assign %%prev_mmregs 0 %ifdef num_mmregs %assign %%prev_mmregs num_mmregs %endif %assign num_mmregs 8 %if ARCH_X86_64 && mmsize >= 16 %assign num_mmregs 16 %if cpuflag(avx512) || mmsize == 64 %assign num_mmregs 32 %endif %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1 %+ %%i CAT_XDEFINE nn%1, %%i, %%i %assign %%i %%i+1 %endrep %if %%prev_mmregs > num_mmregs %rep %%prev_mmregs - num_mmregs CAT_UNDEF m, %%i CAT_UNDEF nn %+ mmtype, %%i %assign %%i %%i+1 %endrep %endif %xdefine mmtype %1 %endmacro ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg %if ARCH_X86_64 && cpuflag(avx512) %assign %%i %1 %rep 16-%1 %assign %%i_high %%i+16 SWAP %%i, %%i_high %assign %%i %%i+1 %endrep %endif %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define mova movq %define movu movq %define movh movd %define movnta movntq INIT_CPUFLAGS %1 DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled FORCE_VEX_ENCODING %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS xmm %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif %xdefine bcstw 1to8 %xdefine bcstd 1to4 %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION %xdefine bcstw 1to16 %xdefine bcstd 1to8 %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_ZMM %1 %define mmsize 64 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq 
INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION %xdefine bcstw 1to32 %xdefine bcstd 1to16 %xdefine bcstq 1to8 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymmzmm%1 ymm%1 %define zmmmm%1 mm%1 %define zmmxmm%1 xmm%1 %define zmmymm%1 ymm%1 %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %define zm%1 zmm %+ m%1 %endmacro %assign i 0 %rep 32 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. %macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE nn, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... 
SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

; Swap a chain of registers given by number: for each adjacent pair (%1, %2)
; the m<N> aliases are exchanged and the nn<reg> entries are updated to
; match, then the argument list is rotated to process the next pair.
%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
    %rotate 1
    %endrep
%endmacro

; Swap a chain of registers given by name: every argument is translated
; through its nn<name> symbol and the resulting list is forwarded to
; SWAP_INTERNAL_NUM.
%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
    %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
; %1 (optional) == name to save under; defaults to current_function
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        %xdefine %%tmp m %+ %%i
        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
        %assign %%i %%i+1
    %endrep
%endmacro

; Counterpart of SAVE_MM_PERMUTATION. Nothing happens unless the <name>_m0
; symbol expands to a number, i.e. unless a permutation was saved under that
; name.
%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %xdefine %%tmp %%f %+ 0
    %ifnum %%tmp
        DEFINE_MMREGS mmtype
        %assign %%i 0
        ; first pass: collect the permuted registers in temporaries ...
        %rep num_mmregs
            %xdefine %%tmp %%f %+ %%i
            CAT_XDEFINE %%m, %%i, m %+ %%tmp
            %assign %%i %%i+1
        %endrep
        ; ... second pass: install them as m<N> and refresh the nn<reg> entries
        %rep num_mmregs
            %assign %%i %%i-1
            CAT_XDEFINE m, %%i, %%m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro

; %1 == callee name with the cpuflags suffix appended
; %2 == plain callee name
; Emits the actual call and then loads the mm permutation saved under the
; name that was called.
%macro call_internal 2
    %xdefine %%i %2
    %define %%j %%i
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %elif FORMAT_ELF
        %if ARCH_X86_64
            %if cglobaled_%2 >= 2
                ; Always emit PLT relocations when calling external functions,
                ; the linker will eliminate unnecessary PLT indirections anyway.
                %define %%j %%i wrt ..plt
            %endif
        %elif PIC && cglobaled_%2 == 3
            ; Go through the GOT for functions declared using cextern_naked with
            ; PIC, as such functions presumably exists in external libraries.
            extern _GLOBAL_OFFSET_TABLE_
            LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
            %define %%j [eax+%%i wrt ..got]
        %endif
    %endif
    call %%j
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
; (-128 fits in a sign-extended 8-bit immediate, +128 does not)
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

; sizeof<reg> / regnumof<reg> tables: mm0-7 (8 bytes) and xmm/ymm/zmm0-31
; (16/32/64 bytes).
%assign i 0
%rep 32
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
        CAT_XDEFINE regnumofmm, i, i
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    CAT_XDEFINE sizeofzmm, i, 64
    CAT_XDEFINE regnumofxmm, i, i
    CAT_XDEFINE regnumofymm, i, i
    CAT_XDEFINE regnumofzmm, i, i
    %assign i i+1
%endrep
%undef i

;%1 == text of the instruction being emulated (used in the error message)
;%2 == destination operand
;%3+: operands that must not be identical to the destination
%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
    %ifnum sizeof%7
        %assign __sizeofreg sizeof%7
    %elifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 8+%4
            %assign __emulate_avx 1
        %endif
    %endif
    %ifnidn %2, fnord
        %ifdef cpuname
            %if notcpuflag(%2)
                %error use of ``%1'' %2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                %error use of ``%1'' sse2 instruction in cpuname function: current_function
            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
                %error use of ``%1'' avx2 instruction in cpuname function: current_function
            %elif __sizeofreg == 16 && notcpuflag(sse)
                %error use of ``%1'' sse instruction in cpuname function: current_function
            %elif __sizeofreg == 32 && notcpuflag(avx)
                %error use of ``%1'' avx instruction in cpuname function: current_function
            %elif __sizeofreg == 64 && notcpuflag(avx512)
                %error use of ``%1'' avx512 instruction in cpuname function: current_function
            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
                %ifnid %6       ; but sse4 is required for memory operands
                    %if notcpuflag(sse4)
                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
                    %endif
                %endif
            %endif
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %7
        %xdefine __src2 %8
        %if %5 && %4 == 0
            %ifnidn %6, %7
                %ifidn %6, %8
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %elifnnum sizeof%8
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %endif
            %endif
        %endif
        %ifnidn %6, __src1
            %if %0 >= 9
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
            %else
                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
            %endif
            %if __sizeofreg == 8
                MOVQ %6, __src1
            %elif %3
                MOVAPS %6, __src1
            %else
                MOVDQA %6, __src1
            %endif
        %endif
        %if %0 >= 9
            %1 %6, __src2, %9
        %else
            %1 %6, __src2
        %endif
    %elif %0 >= 9
        %if avx_enabled && __sizeofreg >= 16 && %4 == 1
            %ifnnum regnumof%7
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8, %9
            %else
                __instr %6, %7, %8, %9
            %endif
        %else
            __instr %6, %7, %8, %9
        %endif
    %elif %0 == 8
        %if avx_enabled && __sizeofreg >= 16 && %4 == 0
            %xdefine __src1 %7
            %xdefine __src2 %8
            %if %5
                %ifnum regnumof%7
                    %ifnum regnumof%8
                        %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
                            ; Most VEX-encoded instructions require an additional byte to encode when
                            ; src2 is a high register (e.g. m8..15). If the instruction is commutative
                            ; we can swap src1 and src2 when doing so reduces the instruction length.
                            %xdefine __src1 %8
                            %xdefine __src2 %7
                        %endif
                    %endif
                %elifnum regnumof%8 ; put memory operands in src2 when possible
                    %xdefine __src1 %8
                    %xdefine __src2 %7
                %else
                    %assign __emulate_avx 1
                %endif
            %elifnnum regnumof%7
                ; EVEX allows imm8 shift instructions to be used with memory operands,
                ; but VEX does not. This handles those special cases.
                %ifnnum %8
                    %assign __emulate_avx 1
                %elif notcpuflag(avx512)
                    %assign __emulate_avx 1
                %endif
            %endif
            %if __emulate_avx ; a separate load is required
                %if %3
                    vmovaps %6, %7
                %else
                    vmovdqa %6, %7
                %endif
                __instr %6, %6, %8
            %else
                __instr %6, __src1, __src2
            %endif
        %else
            __instr %6, %7, %8
        %endif
    %elif %0 == 7
        %if avx_enabled && __sizeofreg >= 16 && %5
            %xdefine __src1 %6
            %xdefine __src2 %7
            %ifnum regnumof%6
                %ifnum regnumof%7
                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
                        %xdefine __src1 %7
                        %xdefine __src2 %6
                    %endif
                %endif
            %endif
            __instr %6, __src1, __src2
        %else
            __instr %6, %7
        %endif
    %else
        __instr %6
    %endif
%endmacro

;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
; Defines a macro named after the instruction which forwards its 1-5
; operands, together with the properties above, to RUN_AVX_INSTR. Unused
; operand slots default to fnord, which is how the operand count is detected.
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
AVX_INSTR pclmulqdq, clmul, 0, 1, 0
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4, 1
AVX_INSTR roundps, sse4, 1
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse, 1
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1
AVX_INSTR sqrtps, sse, 1
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse, 1
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2, 1
AVX_INSTR ucomiss, sse, 1
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfmul, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0

;%1 == instruction
;%2 == minimal instruction set
; Defines a macro named after the instruction that takes 2 or 3 operands. When
; cpuname is defined and the required cpuflag is missing from the current
; function it raises an %error; the instruction itself is then emitted with
; the operands that were actually supplied.
%macro GPR_INSTR 2
    %macro %1 2-5 fnord, %1, %2
        %ifdef cpuname
            %if notcpuflag(%5)
                %error use of ``%4'' %5 instruction in cpuname function: current_function
            %endif
        %endif
        %ifidn %3, fnord
            %4 %1, %2
        %else
            %4 %1, %2, %3
        %endif
    %endmacro
%endmacro

GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR crc32, sse42
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2

; base-4 constants for shuffles
; For every byte value i, j is the four base-4 digits of i written as a
; decimal number; the q000/q00/q0/q prefixes pad j to four digits, so that
; (assuming CAT_XDEFINE a, b, c defines the symbol a<b> as c) qABCD == i.
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

;%1 == 4-operand instruction (emitted with a v prefix when xop is available)
;%2 == first instruction of the 2-instruction fallback, applied to (dst, src1, src2)
;%3 == second instruction of the fallback, applied to (dst, src3)
; The fallback overwrites dst before src3 is read, hence it is rejected when
; dst and src3 are the same operand.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd

; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
;%1 == instruction prefix (e.g. fmadd)
;%2+: type suffixes (e.g. pd, ps); one macro named <prefix><suffix> is
;     defined per suffix
%macro FMA4_INSTR 2-*
    %push fma4_instr
    %xdefine %$prefix %1
    %rep %0 - 1
        %macro %$prefix%2 4-6 %$prefix, %2
            %if notcpuflag(fma3) && notcpuflag(fma4)
                %error use of ``%5%6'' fma instruction in cpuname function: current_function
            %elif cpuflag(fma4)
                v%5%6 %1, %2, %3, %4
            %elifidn %1, %2
                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
                %ifnum sizeof%3
                    v%{5}213%6 %2, %3, %4
                %else
                    v%{5}132%6 %2, %4, %3
                %endif
            %elifidn %1, %3
                v%{5}213%6 %3, %2, %4
            %elifidn %1, %4
                v%{5}231%6 %4, %2, %3
            %else
                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
            %endif
        %endmacro
        %rotate 1
    %endrep
    %pop
%endmacro

FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss

; Macros for converting VEX instructions to equivalent EVEX ones.
; The EVEX form is emitted when one of the first three operands is a register
; numbered 16 or higher or wider than 32 bytes, or when prefer_evex is set and
; avx512 is available; otherwise the VEX form is used.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
    %macro %1 2-7 fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            %define %%args %1, %2
        %elifidn %4, fnord
            %define %%args %1, %2, %3
        %else
            %define %%args %1, %2, %3, %4
        %endif
        %assign %%evex_required cpuflag(avx512) & %7
        %ifnum regnumof%1
            %if regnumof%1 >= 16 || sizeof%1 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%2
            %if regnumof%2 >= 16 || sizeof%2 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %ifnum regnumof%3
            %if regnumof%3 >= 16 || sizeof%3 > 32
                %assign %%evex_required 1
            %endif
        %endif
        %if %%evex_required
            %6 %%args
        %else
            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
        %endif
    %endmacro
%endmacro

EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/fg_apply.h000066400000000000000000000047451517466257200226170ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef DAV2D_SRC_FG_APPLY_H
#define DAV2D_SRC_FG_APPLY_H

#include "dav2d/picture.h"

#include "common/bitdepth.h"

#include "src/filmgrain.h"

/*
 * Declares an array parameter. With BITDEPTH defined the parameter keeps its
 * element type and dimensions; without it (bitdepth-agnostic includers) it
 * degrades to an untyped pointer.
 */
#ifdef BITDEPTH
# define array_decl(type, name, sz) type name sz
#else
# define array_decl(type, name, sz) void *name
#endif

/* Applies film grain from `in` to `out` for the whole picture. */
bitfn_decls(void dav2d_apply_grain,
            const Dav2dFilmGrainDSPContext *const dsp,
            Dav2dPicture *const out, const Dav2dPicture *const in);

/*
 * Fills the scaling and grain lookup tables used by dav2d_apply_grain_row and
 * copies the planes of `in` that receive no grain over to `out`.
 */
bitfn_decls(void dav2d_prep_grain,
            const Dav2dFilmGrainDSPContext *const dsp,
            Dav2dPicture *const out, const Dav2dPicture *const in,
            array_decl(uint8_t, scaling, [3][SCALING_SIZE]),
            array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]));

/* Applies film grain to block row `row`, using tables filled by dav2d_prep_grain. */
bitfn_decls(void dav2d_apply_grain_row,
            const Dav2dFilmGrainDSPContext *const dsp,
            Dav2dPicture *const out, const Dav2dPicture *const in,
            array_decl(const uint8_t, scaling, [3][SCALING_SIZE]),
            array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]),
            const int row);

#endif /* DAV2D_SRC_FG_APPLY_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/fg_apply_tmpl.c000066400000000000000000000236451517466257200236460ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, Niklas Haas
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "dav2d/common.h" #include "dav2d/picture.h" #include "common/intops.h" #include "common/bitdepth.h" #include "src/fg_apply.h" static void generate_scaling(const int bitdepth, const uint8_t points[][2], const int num, uint8_t scaling[SCALING_SIZE]) { #if BITDEPTH == 8 const int shift_x = 0; const int scaling_size = SCALING_SIZE; #else assert(bitdepth > 8); const int shift_x = bitdepth - 8; const int scaling_size = 1 << bitdepth; #endif if (num == 0) { memset(scaling, 0, scaling_size); return; } // Fill up the preceding entries with the initial value memset(scaling, points[0][1], points[0][0] << shift_x); // Linearly interpolate the values in the middle for (int i = 0; i < num - 1; i++) { const int bx = points[i][0]; const int by = points[i][1]; const int ex = points[i+1][0]; const int ey = points[i+1][1]; const int dx = ex - bx; const int dy = ey - by; assert(dx > 0); const int delta = dy * ((0x10000 + (dx >> 1)) / dx); for (int x = 0, d = 0x8000; x < dx; x++) { scaling[(bx + x) << shift_x] = by + (d >> 16); d += delta; } } // Fill up the remaining entries with the final value const int n = points[num - 1][0] << shift_x; memset(&scaling[n], points[num - 1][1], 
scaling_size - n); #if BITDEPTH != 8 const int pad = 1 << shift_x, rnd = pad >> 1; for (int i = 0; i < num - 1; i++) { const int bx = points[i][0] << shift_x; const int ex = points[i+1][0] << shift_x; const int dx = ex - bx; for (int x = 0; x < dx; x += pad) { const int range = scaling[bx + x + pad] - scaling[bx + x]; for (int n = 1, r = rnd; n < pad; n++) { r += range; scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x); } } } #endif } #ifndef UNIT_TEST void bitfn(dav2d_prep_grain)(const Dav2dFilmGrainDSPContext *const dsp, Dav2dPicture *const out, const Dav2dPicture *const in, uint8_t scaling[3][SCALING_SIZE], entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH]) { const Dav2dFilmGrainData *const data = out->fgm; #if BITDEPTH != 8 const int bitdepth_max = (1 << out->p.bpc) - 1; #endif const unsigned seed = out->frame_hdr->film_grain.seed; // Generate grain LUTs as needed dsp->generate_grain_y(grain_lut[0], data, seed HIGHBD_TAIL_SUFFIX); // always needed if (data->num_points[1] || data->chroma_scaling_from_luma) dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0], data, seed, 0 HIGHBD_TAIL_SUFFIX); if (data->num_points[2] || data->chroma_scaling_from_luma) dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0], data, seed, 1 HIGHBD_TAIL_SUFFIX); // Generate scaling LUTs as needed if (data->num_points[0] || data->chroma_scaling_from_luma) generate_scaling(in->p.bpc, data->points[0], data->num_points[0], scaling[0]); if (data->num_points[1]) generate_scaling(in->p.bpc, data->points[1], data->num_points[1], scaling[1]); if (data->num_points[2]) generate_scaling(in->p.bpc, data->points[2], data->num_points[2], scaling[2]); // Copy over the non-modified planes assert(out->stride[0] == in->stride[0]); if (!data->num_points[0]) { const ptrdiff_t stride = out->stride[0]; const ptrdiff_t sz = out->p.h * stride; if (sz < 0) memcpy((uint8_t*) out->data[0] + sz - stride, (uint8_t*) in->data[0] + sz - stride, -sz); else memcpy(out->data[0], 
in->data[0], sz); } if (in->p.layout != DAV2D_PIXEL_LAYOUT_I400 && !data->chroma_scaling_from_luma) { assert(out->stride[1] == in->stride[1]); const int ss_ver = in->p.layout == DAV2D_PIXEL_LAYOUT_I420; const ptrdiff_t stride = out->stride[1]; const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride; if (sz < 0) { if (!data->num_points[1]) memcpy((uint8_t*) out->data[1] + sz - stride, (uint8_t*) in->data[1] + sz - stride, -sz); if (!data->num_points[2]) memcpy((uint8_t*) out->data[2] + sz - stride, (uint8_t*) in->data[2] + sz - stride, -sz); } else { if (!data->num_points[1]) memcpy(out->data[1], in->data[1], sz); if (!data->num_points[2]) memcpy(out->data[2], in->data[2], sz); } } } void bitfn(dav2d_apply_grain_row)(const Dav2dFilmGrainDSPContext *const dsp, Dav2dPicture *const out, const Dav2dPicture *const in, const uint8_t scaling[3][SCALING_SIZE], const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH], const int row) { // Synthesize grain for the affected planes const Dav2dFilmGrainData *const data = out->fgm; const int ss_y = in->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_x = in->p.layout != DAV2D_PIXEL_LAYOUT_I444; const int cpw = (out->p.w + ss_x) >> ss_x; const int is_id = 0; //out->seq_hdr->mtrx == DAV2D_MC_IDENTITY; const int bs = 16 << data->block_size; pixel *const luma_src = ((pixel *) in->data[0]) + row * bs * PXSTRIDE(in->stride[0]); #if BITDEPTH != 8 const int bitdepth_max = (1 << out->p.bpc) - 1; #endif const unsigned seed = out->frame_hdr->film_grain.seed; if (data->num_points[0]) { const int bh = imin(out->p.h - row * bs, bs); dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * bs * PXSTRIDE(out->stride[0]), luma_src, out->stride[0], data, seed, out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); } if (!data->num_points[1] && !data->num_points[2] && !data->chroma_scaling_from_luma) { return; } const int bh = (imin(out->p.h - row * bs, bs) + ss_y) >> ss_y; // extend padding pixels if (out->p.w & ss_x) { pixel *ptr = 
luma_src; for (int y = 0; y < bh; y++) { ptr[out->p.w] = ptr[out->p.w - 1]; ptr += PXSTRIDE(in->stride[0]) << ss_y; } } const ptrdiff_t uv_off = row * bs * PXSTRIDE(out->stride[1]) >> ss_y; if (data->chroma_scaling_from_luma) { for (int pl = 0; pl < 2; pl++) dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, ((const pixel *) in->data[1 + pl]) + uv_off, in->stride[1], data, seed, cpw, scaling[0], grain_lut[1 + pl], bh, row, luma_src, in->stride[0], pl, is_id HIGHBD_TAIL_SUFFIX); } else { for (int pl = 0; pl < 2; pl++) if (data->num_points[1 + pl]) dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, ((const pixel *) in->data[1 + pl]) + uv_off, in->stride[1], data, seed, cpw, scaling[1 + pl], grain_lut[1 + pl], bh, row, luma_src, in->stride[0], pl, is_id HIGHBD_TAIL_SUFFIX); } } void bitfn(dav2d_apply_grain)(const Dav2dFilmGrainDSPContext *const dsp, Dav2dPicture *const out, const Dav2dPicture *const in) { ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); #if ARCH_X86_64 && BITDEPTH == 8 ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]); #else uint8_t scaling[3][SCALING_SIZE]; #endif const int bs = 16 << out->fgm->block_size; const int rows = (out->p.h + bs - 1) / bs; bitfn(dav2d_prep_grain)(dsp, out, in, scaling, grain_lut); for (int row = 0; row < rows; row++) bitfn(dav2d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row); } #endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/filmgrain.h000066400000000000000000000066251517466257200227650ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_FILM_GRAIN_H #define DAV2D_SRC_FILM_GRAIN_H #include "common/bitdepth.h" #include "src/levels.h" #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 #if !defined(BITDEPTH) || BITDEPTH == 8 #define SCALING_SIZE 256 typedef int8_t entry; #else #define SCALING_SIZE 4096 typedef int16_t entry; #endif #define decl_generate_grain_y_fn(name) \ void (name)(entry buf[][GRAIN_WIDTH], \ const Dav2dFilmGrainData *const data, unsigned seed \ HIGHBD_DECL_SUFFIX) typedef decl_generate_grain_y_fn(*generate_grain_y_fn); #define decl_generate_grain_uv_fn(name) \ void (name)(entry buf[][GRAIN_WIDTH], \ const entry buf_y[][GRAIN_WIDTH], \ const Dav2dFilmGrainData *const data, unsigned seed, \ const intptr_t uv HIGHBD_DECL_SUFFIX) typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); #define decl_fgy_32x32xn_fn(name) \ void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ const Dav2dFilmGrainData *data, unsigned seed, \ size_t pw, const uint8_t scaling[SCALING_SIZE], \ const entry 
grain_lut[][GRAIN_WIDTH], \ int bh, int row_num HIGHBD_DECL_SUFFIX) typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); #define decl_fguv_32x32xn_fn(name) \ void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ const Dav2dFilmGrainData *data, unsigned seed, size_t pw, \ const uint8_t scaling[SCALING_SIZE], \ const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ const pixel *luma_row, ptrdiff_t luma_stride, \ int uv_pl, int is_id HIGHBD_DECL_SUFFIX) typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); typedef struct Dav2dFilmGrainDSPContext { generate_grain_y_fn generate_grain_y; generate_grain_uv_fn generate_grain_uv[3]; fgy_32x32xn_fn fgy_32x32xn; fguv_32x32xn_fn fguv_32x32xn[3]; } Dav2dFilmGrainDSPContext; bitfn_decls(void dav2d_film_grain_dsp_init, Dav2dFilmGrainDSPContext *c); #endif /* DAV2D_SRC_FILM_GRAIN_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/filmgrain_tmpl.c000066400000000000000000000462701517466257200240140ustar00rootroot00000000000000/* * Copyright © 2018-2026, Niklas Haas * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "common/attributes.h" #include "common/intops.h" #include "src/filmgrain.h" #include "src/tables.h" #define SUB_GRAIN_WIDTH 44 #define SUB_GRAIN_HEIGHT 38 static inline int get_random_number(const int bits, unsigned *const state) { const int r = *state; unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; *state = (r >> 1) | (bit << 15); return (*state >> (16 - bits)) & ((1 << bits) - 1); } static inline int round2(const int x, const uint64_t shift) { return (x + ((1 << shift) >> 1)) >> shift; } static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], const Dav2dFilmGrainData *const data, unsigned seed HIGHBD_DECL_SUFFIX) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; const int grain_ctr = 128 << bitdepth_min_8; const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; for (int y = 0; y < GRAIN_HEIGHT; y++) { for (int x = 0; x < GRAIN_WIDTH; x++) { const int value = get_random_number(11, &seed); buf[y][x] = round2(dav2d_gaussian_sequence[ value ], shift); } } const int ar_pad = 3; const int ar_lag = data->ar_coeff_lag; for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { const int8_t *coeff = data->ar_coeffs[0]; int sum = 0; for (int dy = -ar_lag; dy <= 0; dy++) { for (int dx = -ar_lag; dx <= ar_lag; dx++) { if (!dx && !dy) break; sum += *(coeff++) * buf[y + dy][x + dx]; } } const int grain = 
buf[y][x] + round2(sum, data->ar_coeff_shift); buf[y][x] = iclip(grain, grain_min, grain_max); } } } static NOINLINE void generate_grain_uv_c(entry buf[][GRAIN_WIDTH], const entry buf_y[][GRAIN_WIDTH], const Dav2dFilmGrainData *const data, unsigned seed, const intptr_t uv, const int subx, const int suby HIGHBD_DECL_SUFFIX) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; seed ^= (uv ? 0x49d8 : 0xb524); const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; const int grain_ctr = 128 << bitdepth_min_8; const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; for (int y = 0; y < chromaH; y++) { for (int x = 0; x < chromaW; x++) { const int value = get_random_number(11, &seed); buf[y][x] = round2(dav2d_gaussian_sequence[ value ], shift); } } const int ar_pad = 3; const int ar_lag = data->ar_coeff_lag; for (int y = ar_pad; y < chromaH; y++) { for (int x = ar_pad; x < chromaW - ar_pad; x++) { const int8_t *coeff = data->ar_coeffs[1 + uv]; int sum = 0; for (int dy = -ar_lag; dy <= 0; dy++) { for (int dx = -ar_lag; dx <= ar_lag; dx++) { // For the final (current) pixel, we need to add in the // contribution from the luma grain texture if (!dx && !dy) { if (!data->num_points[0]) break; int luma = 0; const int lumaX = ((x - ar_pad) << subx) + ar_pad; const int lumaY = ((y - ar_pad) << suby) + ar_pad; for (int i = 0; i <= suby; i++) { for (int j = 0; j <= subx; j++) { luma += buf_y[lumaY + i][lumaX + j]; } } luma = round2(luma, subx + suby); sum += luma * (*coeff); break; } sum += *(coeff++) * buf[y + dy][x + dx]; } } const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); buf[y][x] = iclip(grain, grain_min, grain_max); } } } #define gnuv_ss_fn(nm, ss_x, ss_y) \ static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ generate_grain_uv_c(buf, buf_y, data, seed, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ } 
// Chroma grain generators for the three subsampled layouts
// (arguments: name suffix, horizontal subsampling, vertical subsampling)
gnuv_ss_fn(420, 1, 1);
gnuv_ss_fn(422, 1, 0);
gnuv_ss_fn(444, 0, 0);

// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
//
// bx/by select which cached offsets are used (offsets[bx][by]) and move the
// sampling position by bx/by whole blocks of size bs (shifted down by the
// subsampling); off[0] is the y offset and off[1] the x offset.
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
                               const int bs, const int offsets[2][2][2],
                               const int subx, const int suby,
                               const int bx, const int by,
                               const int x, const int y)
{
    const int *const off = offsets[bx][by];
    const int offx = 3 + (2 >> subx) * (3 + off[1]);
    const int offy = 3 + (2 >> suby) * (3 + off[0]);
    return grain_lut[offy + y + (bs >> suby) * by]
                    [offx + x + (bs >> subx) * bx];
}

// Applies luma film grain to one row of blocks: reads pixels from src_row,
// adds grain sampled from grain_lut and scaled through scaling[], and stores
// the clipped result in dst_row.
//
// The row is walked in steps of bs = 16 << data->block_size up to pw, and bh
// lines are processed per block. When data->overlap_flag is set, the first
// (up to) two lines/columns of a block with a neighbour above/to the left are
// blended with that neighbour's grain before being applied.
//
// NOTE(review): pw and bh look like the row width and height in pixels, and
// stride like a byte stride (it is only used through PXSTRIDE) -- confirm
// against the caller.
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
                          const ptrdiff_t stride,
                          const Dav2dFilmGrainData *const data,
                          const unsigned in_seed, const size_t pw,
                          const uint8_t scaling[SCALING_SIZE],
                          const entry grain_lut[][GRAIN_WIDTH],
                          const int bh, const int row_num HIGHBD_DECL_SUFFIX)
{
    // the previous row is only needed when overlapping and not on row 0
    const int rows = 1 + (data->overlap_flag && row_num > 0);

    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    const int bs = 16 << data->block_size;

    // output clipping range
    int min_value, max_value;
    if (data->clip_to_restricted_range) {
        min_value = 16 << bitdepth_min_8;
        max_value = 235 << bitdepth_min_8;
    } else {
        min_value = 0;
        max_value = BITDEPTH_MAX;
    }

    // seed[0] contains the current row, seed[1] contains the previous
    unsigned seed[2];
    for (int i = 0; i < rows; i++) {
        seed[i] = in_seed;
        seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
    }

    assert(stride % (bs * sizeof(pixel)) == 0);

    int offsets[2 /* col offset */][2 /* row offset */][2 /* y, x */];

    // process this row in bs^2 blocks
    for (unsigned bx = 0; bx < pw; bx += bs) {
        const int bw = imin(bs, (int) pw - bx);

        if (data->overlap_flag && bx) {
            // shift previous offsets left
            for (int i = 0; i < rows; i++)
                for (int n = 0; n < 2; n++)
                    offsets[1][i][n] = offsets[0][i][n];
        }

        // update current offsets
        // (each offset is followed by three 16-bit draws whose results are
        // discarded; they only advance the generator state)
        for (int i = 0; i < rows; i++)
            for (int n = 0; n < 2; n++) {
                offsets[0][i][n] = (3 - data->block_size) * get_random_number(9, &seed[i]) >> 6;
                for (int m = 0; m < 3; m++)
                    get_random_number(16, &seed[i]);
            }

        // x/y block offsets to compensate for overlapped regions
        const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0;
        const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0;

        // overlap blending weights, indexed by the position inside the
        // overlap: { weight of the old (neighbouring) grain, weight of the
        // current grain }
        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };

        // Scales `grain` by scaling[source pixel], adds it to the source
        // pixel and stores the clipped sum. Declares locals, so it can be
        // used once per enclosing block scope.
#define add_noise_y(x, y, grain) \
        const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
        pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
        const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
        *dst = iclip(*src + noise, min_value, max_value);

        for (int y = ystart; y < bh; y++) {
            // Non-overlapped image region (straightforward)
            for (int x = xstart; x < bw; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 0, x, y);
                add_noise_y(x, y, grain);
            }

            // Special case for overlapped column
            for (int x = 0; x < xstart; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 0, x, y);
                int old = sample_lut(grain_lut, bs, offsets, 0, 0, 1, 0, x, y);
                grain = round2(old * w[x][0] + grain * w[x][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }
        }

        for (int y = 0; y < ystart; y++) {
            // Special case for overlapped row (sans corner)
            for (int x = xstart; x < bw; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 0, x, y);
                int old = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 1, x, y);
                grain = round2(old * w[y][0] + grain * w[y][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }

            // Special case for doubly-overlapped corner
            for (int x = 0; x < xstart; x++) {
                // Blend the top pixel with the top left block
                int top = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 1, x, y);
                int old = sample_lut(grain_lut, bs, offsets, 0, 0, 1, 1, x, y);
                top = round2(old * w[x][0] + top * w[x][1], 5);
                top = iclip(top, grain_min, grain_max);

                // Blend the current pixel with the left block
                int grain = sample_lut(grain_lut, bs, offsets, 0, 0, 0, 0, x, y);
                old = sample_lut(grain_lut, bs, offsets, 0, 0, 1, 0, x, y);
                grain = round2(old * w[x][0] + grain * w[x][1], 5);
                grain = iclip(grain, grain_min, grain_max);

                // Mix the two rows together and apply grain
                grain = round2(top * w[y][0] + grain * w[y][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_y(x, y, grain);
            }
        }
    }
}

// Applies chroma film grain to one row of blocks. Same structure as
// fgy_32x32xn_c, with block sizes and overlap widths reduced by the
// subsampling flags sx/sy, and with the scaling table indexed by a value
// derived from the co-located luma (see add_noise_uv below).
//
// uv selects the per-plane entries of data->uv_luma_mult, data->uv_mult and
// data->uv_offset. is_id only changes the restricted-range upper clip value
// (235 instead of 240).
// NOTE(review): is_id presumably flags identity matrix coefficients --
// confirm against the caller.
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
               const ptrdiff_t stride, const Dav2dFilmGrainData *const data,
               const unsigned in_seed, const size_t pw,
               const uint8_t scaling[SCALING_SIZE],
               const entry grain_lut[][GRAIN_WIDTH], const int bh,
               const int row_num, const pixel *const luma_row,
               const ptrdiff_t luma_stride, const int uv, const int is_id,
               const int sx, const int sy HIGHBD_DECL_SUFFIX)
{
    // the previous row is only needed when overlapping and not on row 0
    const int rows = 1 + (data->overlap_flag && row_num > 0);

    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int grain_ctr = 128 << bitdepth_min_8;
    const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
    const int bs = 16 << data->block_size;

    // output clipping range
    int min_value, max_value;
    if (data->clip_to_restricted_range) {
        min_value = 16 << bitdepth_min_8;
        max_value = (is_id ? 235 : 240) << bitdepth_min_8;
    } else {
        min_value = 0;
        max_value = BITDEPTH_MAX;
    }

    // seed[0] contains the current row, seed[1] contains the previous
    unsigned seed[2];
    for (int i = 0; i < rows; i++) {
        seed[i] = in_seed;
        seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8;
        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
    }

    assert(stride % (bs * sizeof(pixel)) == 0);

    int offsets[2 /* col offset */][2 /* row offset */][2 /* y, x */];

    // process this row in bs^2 blocks (subsampled)
    for (unsigned bx = 0; bx < pw; bx += bs >> sx) {
        const int bw = imin(bs >> sx, (int)(pw - bx));
        if (data->overlap_flag && bx) {
            // shift previous offsets left
            for (int i = 0; i < rows; i++)
                for (int n = 0; n < 2; n++)
                    offsets[1][i][n] = offsets[0][i][n];
        }

        // update current offsets
        // (each offset is followed by three 16-bit draws whose results are
        // discarded; they only advance the generator state)
        for (int i = 0; i < rows; i++)
            for (int n = 0; n < 2; n++) {
                offsets[0][i][n] = (3 - data->block_size) * get_random_number(9, &seed[i]) >> 6;
                for (int m = 0; m < 3; m++)
                    get_random_number(16, &seed[i]);
            }

        // x/y block offsets to compensate for overlapped regions
        const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0;
        const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0;

        // overlap blending weights: the first index is the subsampling flag
        // of the blended direction (a subsampled direction overlaps by a
        // single sample and therefore has only one weight pair), the second
        // the position inside the overlap
        static const int w[2 /* sub */][2 /* off */][2] = {
            { { 27, 17 }, { 17, 27 } },
            { { 23, 22 } },
        };

        // Looks up the co-located luma sample (averaged with its right
        // neighbour when sx is set). Unless data->chroma_scaling_from_luma
        // is set, that value is combined with the chroma source pixel via
        // uv_luma_mult/uv_mult/uv_offset. The result indexes scaling[]; the
        // scaled grain is added to the source pixel and the clipped sum is
        // stored. Declares locals, so it can be used once per enclosing
        // block scope.
#define add_noise_uv(x, y, grain) \
            const int lx = (bx + x) << sx; \
            const int ly = y << sy; \
            const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
            pixel avg = luma[0]; \
            if (sx) \
                avg = (avg + luma[1] + 1) >> 1; \
            const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
            pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
            int val = avg; \
            if (!data->chroma_scaling_from_luma) { \
                const int combined = avg * data->uv_luma_mult[uv] + \
                                     *src * data->uv_mult[uv]; \
                val = iclip_pixel( (combined >> 6) + \
                                   (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
            } \
            const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
            *dst = iclip(*src + noise, min_value, max_value);

        for (int y = ystart; y < bh; y++) {
            // Non-overlapped image region (straightforward)
            for (int x = xstart; x < bw; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 0, x, y);
                add_noise_uv(x, y, grain);
            }

            // Special case for overlapped column
            for (int x = 0; x < xstart; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 0, x, y);
                int old = sample_lut(grain_lut, bs, offsets, sx, sy, 1, 0, x, y);
                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }
        }

        for (int y = 0; y < ystart; y++) {
            // Special case for overlapped row (sans corner)
            for (int x = xstart; x < bw; x++) {
                int grain = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 0, x, y);
                int old = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 1, x, y);
                grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }

            // Special case for doubly-overlapped corner
            for (int x = 0; x < xstart; x++) {
                // Blend the top pixel with the top left block
                int top = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 1, x, y);
                int old = sample_lut(grain_lut, bs, offsets, sx, sy, 1, 1, x, y);
                top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5);
                top = iclip(top, grain_min, grain_max);

                // Blend the current pixel with the left block
                int grain = sample_lut(grain_lut, bs, offsets, sx, sy, 0, 0, x, y);
                old = sample_lut(grain_lut, bs, offsets, sx, sy, 1, 0, x, y);
                grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5);
                grain = iclip(grain, grain_min, grain_max);

                // Mix the two rows together and apply to image
                grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5);
                grain = iclip(grain, grain_min, grain_max);
                add_noise_uv(x, y, grain);
            }
        }
    }
}

// Defines fguv_32x32xn_<nm>_c, a wrapper with the public chroma signature
// that forwards to fguv_32x32xn_c with fixed subsampling flags.
#define fguv_ss_fn(nm, ss_x, ss_y) \
static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
    fguv_32x32xn_c(dst_row, src_row, stride, data, seed, pw, scaling, grain_lut, \
                   bh, row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
                   HIGHBD_TAIL_SUFFIX); \
}

fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);

// The "&& 0" makes this condition always false, so the architecture-specific
// headers are never included.
#if HAVE_ASM && 0
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/filmgrain.h"
#elif ARCH_X86
#include "src/x86/filmgrain.h"
#endif
#endif

// Fills the DSP context with the C implementations defined in this file; the
// per-layout arrays are indexed by DAV2D_PIXEL_LAYOUT_I4xx - 1.
COLD void bitfn(dav2d_film_grain_dsp_init)(Dav2dFilmGrainDSPContext *const c) {
    c->generate_grain_y = generate_grain_y_c;
    c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
    c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
    c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;

    c->fgy_32x32xn = fgy_32x32xn_c;
    c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
    c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
    c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;

    // Always false because of the "&& 0": the assembly overrides are never
    // installed.
#if HAVE_ASM && 0
#if ARCH_AARCH64 || ARCH_ARM
    film_grain_dsp_init_arm(c);
#elif ARCH_X86
    film_grain_dsp_init_x86(c);
#endif
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/gdf_tables.c000066400000000000000000014573721517466257200231140ustar00rootroot00000000000000/*
 * Copyright © 2026, VideoLAN
and dav2d authors * Copyright © 2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include "gdf_tables.h" const uint16_t dav2d_gdf_alpha[6][6][22][4] = { { { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 386, 511, 386 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 43, 511, 43, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 10, 111, 10, 111 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 386, 511, 386, 511 }, { 511, 511, 511, 511 }, { 511, 43, 511, 43 }, { 111, 10, 111, 10 }, { 25, 25, 25, 25 }, { 28, 28, 28, 28 }, { 2, 2, 2, 2 }, { 2, 2, 2, 2 }, }, { { 478, 120, 478, 120 }, { 511, 511, 511, 511 }, { 232, 109, 232, 109 }, { 511, 127, 511, 127 }, { 511, 511, 511, 511 }, { 375, 15, 375, 15 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 120, 478, 120, 478 }, { 511, 511, 511, 511 }, { 109, 232, 109, 232 }, { 127, 511, 127, 511 }, { 15, 375, 15, 375 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 8, 8, 8, 8 }, { 10, 10, 10, 10 }, { 511, 511, 511, 511 }, }, { { 511, 103, 511, 103 }, { 511, 114, 511, 114 }, { 511, 4, 511, 4 }, { 511, 13, 511, 13 }, { 511, 204, 511, 11 }, { 511, 511, 511, 511 }, { 511, 11, 511, 204 }, { 204, 511, 11, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 11, 511, 204, 511 }, { 103, 511, 103, 511 }, { 114, 511, 114, 511 }, { 4, 511, 4, 511 }, { 13, 511, 13, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 66, 66, 66, 66 }, { 22, 22, 22, 22 }, { 10, 10, 10, 10 }, { 147, 147, 147, 147 }, }, { { 511, 511, 511, 511 }, { 511, 5, 511, 5 }, { 511, 6, 511, 6 }, { 511, 511, 511, 511 }, { 511, 511, 39, 16 }, { 18, 36, 18, 36 }, { 39, 16, 511, 511 }, { 511, 511, 16, 39 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 16, 39, 511, 511 }, { 511, 511, 511, 511 }, { 5, 511, 5, 511 }, { 6, 511, 6, 511 }, { 511, 511, 511, 511 }, { 36, 18, 36, 18 }, { 
511, 511, 511, 511 }, { 52, 52, 52, 52 }, { 72, 72, 72, 72 }, { 69, 69, 69, 69 }, { 78, 78, 78, 78 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 12, 511, 12 }, { 511, 286, 511, 286 }, { 511, 511, 511, 25 }, { 16, 64, 16, 64 }, { 511, 25, 511, 511 }, { 511, 511, 25, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 25, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 12, 511, 12, 511 }, { 286, 511, 286, 511 }, { 64, 16, 64, 16 }, { 511, 511, 511, 511 }, { 201, 201, 201, 201 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 17, 25, 17, 25 }, { 511, 511, 511, 511 }, { 511, 511, 52, 35 }, { 511, 64, 511, 64 }, { 52, 35, 511, 511 }, { 511, 511, 35, 52 }, { 511, 511, 26, 26 }, { 511, 178, 511, 178 }, { 26, 26, 511, 511 }, { 35, 52, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 25, 17, 25, 17 }, { 511, 511, 511, 511 }, { 64, 511, 64, 511 }, { 178, 511, 178, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 3, 3, 3, 3 }, } }, { { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 197, 145, 197, 145 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 17, 17 }, { 511, 40, 511, 40 }, { 17, 17, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 145, 197, 145, 197 }, { 511, 511, 511, 511 }, { 40, 511, 40, 511 }, { 511, 511, 511, 511 }, { 335, 335, 335, 335 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 98, 511, 98, 511 }, { 511, 511, 511, 511 }, { 118, 145, 118, 145 }, { 511, 511, 511, 12 }, { 511, 511, 511, 511 }, { 511, 12, 511, 511 }, { 511, 511, 12, 511 }, { 511, 511, 511, 511 }, { 26, 33, 26, 33 }, { 511, 511, 511, 511 }, { 12, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 98, 511, 98 }, { 511, 511, 511, 511 
}, { 145, 118, 145, 118 }, { 511, 511, 511, 511 }, { 33, 26, 33, 26 }, { 14, 14, 14, 14 }, { 12, 12, 12, 12 }, { 171, 171, 171, 171 }, { 48, 48, 48, 48 }, }, { { 6, 93, 6, 93 }, { 511, 6, 511, 6 }, { 511, 7, 511, 7 }, { 85, 130, 85, 130 }, { 511, 511, 6, 511 }, { 511, 39, 511, 39 }, { 6, 511, 511, 511 }, { 511, 511, 511, 6 }, { 511, 511, 26, 26 }, { 511, 511, 511, 511 }, { 26, 26, 511, 511 }, { 511, 6, 511, 511 }, { 93, 6, 93, 6 }, { 6, 511, 6, 511 }, { 7, 511, 7, 511 }, { 130, 85, 130, 85 }, { 39, 511, 39, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 9, 9, 9, 9 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 9, 511, 9 }, { 511, 511, 511, 511 }, { 511, 1, 511, 17 }, { 2, 511, 2, 511 }, { 511, 17, 511, 1 }, { 1, 511, 17, 511 }, { 511, 511, 511, 511 }, { 12, 47, 12, 47 }, { 511, 511, 511, 511 }, { 17, 511, 1, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 9, 511, 9, 511 }, { 511, 511, 511, 511 }, { 511, 2, 511, 2 }, { 47, 12, 47, 12 }, { 511, 511, 511, 511 }, { 183, 183, 183, 183 }, { 92, 92, 92, 92 }, { 511, 511, 511, 511 }, }, { { 511, 6, 511, 6 }, { 511, 4, 511, 4 }, { 511, 7, 511, 7 }, { 511, 9, 511, 9 }, { 511, 511, 511, 26 }, { 511, 131, 511, 131 }, { 511, 26, 511, 511 }, { 511, 511, 26, 511 }, { 511, 511, 9, 9 }, { 511, 511, 511, 511 }, { 9, 9, 511, 511 }, { 26, 511, 511, 511 }, { 6, 511, 6, 511 }, { 4, 511, 4, 511 }, { 7, 511, 7, 511 }, { 9, 511, 9, 511 }, { 131, 511, 131, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 12, 511, 12 }, { 511, 23, 511, 23 }, { 14, 511, 14, 511 }, { 511, 511, 511, 511 }, { 511, 511, 13, 511 }, { 511, 511, 511, 511 }, { 13, 511, 511, 511 }, { 511, 511, 511, 13 }, { 511, 511, 511, 511 }, { 511, 20, 511, 20 }, { 511, 511, 511, 511 }, { 511, 13, 511, 511 }, { 12, 511, 12, 511 }, { 23, 511, 23, 511 }, { 511, 14, 511, 14 }, { 511, 511, 511, 511 }, { 511, 511, 
511, 511 }, { 20, 511, 20, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, }, { { { 511, 511, 511, 511 }, { 138, 511, 138, 511 }, { 511, 511, 511, 511 }, { 238, 511, 238, 511 }, { 511, 511, 511, 511 }, { 59, 511, 59, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 24, 47, 24, 47 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 138, 511, 138 }, { 511, 511, 511, 511 }, { 511, 238, 511, 238 }, { 511, 59, 511, 59 }, { 47, 24, 47, 24 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 242, 242, 242, 242 }, { 74, 74, 74, 74 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 158, 511, 158 }, { 133, 511, 133, 511 }, { 511, 4, 511, 14 }, { 511, 11, 511, 11 }, { 511, 14, 511, 4 }, { 4, 511, 14, 511 }, { 511, 511, 511, 511 }, { 511, 52, 511, 52 }, { 511, 511, 511, 511 }, { 14, 511, 4, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 158, 511, 158, 511 }, { 511, 133, 511, 133 }, { 11, 511, 11, 511 }, { 52, 511, 52, 511 }, { 467, 467, 467, 467 }, { 511, 511, 511, 511 }, { 28, 28, 28, 28 }, { 33, 33, 33, 33 }, }, { { 511, 214, 511, 214 }, { 511, 511, 511, 511 }, { 79, 194, 79, 194 }, { 511, 289, 511, 289 }, { 511, 72, 37, 24 }, { 43, 21, 43, 21 }, { 37, 24, 511, 72 }, { 72, 511, 24, 37 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 24, 37, 72, 511 }, { 214, 511, 214, 511 }, { 511, 511, 511, 511 }, { 194, 79, 194, 79 }, { 289, 511, 289, 511 }, { 21, 43, 21, 43 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 115, 115, 115, 115 }, { 23, 23, 23, 23 }, }, { { 511, 211, 511, 211 }, { 511, 511, 511, 511 }, { 132, 511, 132, 511 }, { 511, 511, 511, 511 }, { 511, 511, 32, 25 }, { 56, 8, 56, 8 }, { 32, 25, 511, 511 }, { 511, 511, 25, 32 }, { 511, 511, 14, 14 }, { 511, 67, 511, 67 }, { 14, 14, 511, 511 }, { 25, 32, 511, 511 }, { 211, 511, 211, 511 }, { 511, 511, 511, 511 }, { 511, 132, 511, 
132 }, { 511, 511, 511, 511 }, { 8, 56, 8, 56 }, { 67, 511, 67, 511 }, { 89, 89, 89, 89 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 6, 511, 6 }, { 511, 10, 511, 10 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 24 }, { 511, 511, 511, 511 }, { 511, 24, 511, 511 }, { 511, 511, 24, 511 }, { 511, 511, 6, 6 }, { 511, 511, 511, 511 }, { 6, 6, 511, 511 }, { 24, 511, 511, 511 }, { 6, 511, 6, 511 }, { 10, 511, 10, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 26, 511, 26, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 26, 511, 26 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 6, 6, 6, 6 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, }, { { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 115, 511, 115, 511 }, { 511, 511, 511, 511 }, { 511, 173, 511, 173 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 14, 14 }, { 26, 51, 26, 51 }, { 14, 14, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 115, 511, 115 }, { 173, 511, 173, 511 }, { 51, 26, 51, 26 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 511, 511, 511, 511 }, { 384, 384, 384, 384 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 153, 511, 153 }, { 511, 511, 67, 12 }, { 511, 511, 511, 511 }, { 67, 12, 511, 511 }, { 511, 511, 12, 67 }, { 511, 511, 511, 511 }, { 30, 37, 30, 37 }, { 511, 511, 511, 511 }, { 12, 67, 511, 511 }, { 511, 511, 
511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 153, 511, 153, 511 }, { 511, 511, 511, 511 }, { 37, 30, 37, 30 }, { 511, 511, 511, 511 }, { 12, 12, 12, 12 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 106, 11, 106, 11 }, { 511, 511, 5, 34 }, { 511, 511, 511, 511 }, { 5, 34, 511, 511 }, { 511, 511, 34, 5 }, { 511, 511, 9, 9 }, { 511, 44, 511, 44 }, { 9, 9, 511, 511 }, { 34, 5, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 11, 106, 11, 106 }, { 511, 511, 511, 511 }, { 44, 511, 44, 511 }, { 3, 3, 3, 3 }, { 7, 7, 7, 7 }, { 4, 4, 4, 4 }, { 511, 511, 511, 511 }, }, { { 511, 142, 511, 142 }, { 511, 9, 511, 9 }, { 511, 5, 511, 5 }, { 511, 511, 511, 511 }, { 511, 511, 511, 10 }, { 511, 68, 511, 68 }, { 511, 10, 511, 511 }, { 511, 511, 10, 511 }, { 511, 511, 46, 46 }, { 511, 28, 511, 28 }, { 46, 46, 511, 511 }, { 10, 511, 511, 511 }, { 142, 511, 142, 511 }, { 9, 511, 9, 511 }, { 5, 511, 5, 511 }, { 511, 511, 511, 511 }, { 68, 511, 68, 511 }, { 28, 511, 28, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 13, 511, 13 }, { 511, 511, 511, 511 }, { 511, 37, 511, 37 }, { 511, 511, 50, 8 }, { 511, 511, 511, 511 }, { 50, 8, 511, 511 }, { 511, 511, 8, 50 }, { 511, 511, 2, 2 }, { 511, 71, 511, 71 }, { 2, 2, 511, 511 }, { 8, 50, 511, 511 }, { 511, 511, 511, 511 }, { 13, 511, 13, 511 }, { 511, 511, 511, 511 }, { 37, 511, 37, 511 }, { 511, 511, 511, 511 }, { 71, 511, 71, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 24, 511, 24, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 35, 35 }, { 511, 511, 511, 511 }, { 35, 35, 511, 511 }, { 511, 511, 511, 511 }, { 511, 
511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 24, 511, 24 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 13, 13, 13, 13 }, { 12, 12, 12, 12 }, }, }, { { { 511, 511, 511, 511 }, { 511, 120, 511, 120 }, { 511, 511, 511, 511 }, { 323, 511, 323, 511 }, { 511, 511, 49, 511 }, { 511, 511, 511, 511 }, { 49, 511, 511, 511 }, { 511, 511, 511, 49 }, { 511, 511, 17, 17 }, { 511, 45, 511, 45 }, { 17, 17, 511, 511 }, { 511, 49, 511, 511 }, { 511, 511, 511, 511 }, { 120, 511, 120, 511 }, { 511, 511, 511, 511 }, { 511, 323, 511, 323 }, { 511, 511, 511, 511 }, { 45, 511, 45, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 53, 53, 53, 53 }, { 47, 47, 47, 47 }, }, { { 511, 511, 511, 511 }, { 511, 231, 511, 231 }, { 105, 511, 105, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 14 }, { 511, 511, 511, 511 }, { 511, 14, 511, 511 }, { 511, 511, 14, 511 }, { 511, 511, 511, 511 }, { 511, 53, 511, 53 }, { 511, 511, 511, 511 }, { 14, 511, 511, 511 }, { 511, 511, 511, 511 }, { 231, 511, 231, 511 }, { 511, 105, 511, 105 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 53, 511, 53, 511 }, { 9, 9, 9, 9 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 238, 511, 238 }, { 119, 511, 119, 511 }, { 511, 4, 10, 7 }, { 511, 511, 511, 511 }, { 10, 7, 511, 4 }, { 4, 511, 7, 10 }, { 511, 511, 29, 29 }, { 511, 42, 511, 42 }, { 29, 29, 511, 511 }, { 7, 10, 4, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 238, 511, 238, 511 }, { 511, 119, 511, 119 }, { 511, 511, 511, 511 }, { 42, 511, 42, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 12, 12, 12, 12 }, { 9, 9, 9, 9 }, }, { { 511, 5, 511, 5 }, { 511, 5, 511, 5 }, { 511, 5, 511, 5 }, { 511, 210, 511, 210 }, { 511, 511, 20, 12 }, { 511, 67, 511, 67 }, { 20, 12, 511, 511 }, { 511, 511, 12, 20 }, { 511, 511, 47, 47 }, { 511, 511, 511, 511 }, { 47, 47, 511, 511 }, { 
12, 20, 511, 511 }, { 5, 511, 5, 511 }, { 5, 511, 5, 511 }, { 5, 511, 5, 511 }, { 210, 511, 210, 511 }, { 67, 511, 67, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 71, 71, 71, 71 }, { 511, 511, 511, 511 }, }, { { 511, 4, 511, 4 }, { 511, 5, 511, 5 }, { 511, 10, 511, 10 }, { 511, 511, 511, 511 }, { 511, 1, 511, 23 }, { 511, 511, 511, 511 }, { 511, 23, 511, 1 }, { 1, 511, 23, 511 }, { 511, 511, 7, 7 }, { 511, 40, 511, 40 }, { 7, 7, 511, 511 }, { 23, 511, 1, 511 }, { 4, 511, 4, 511 }, { 5, 511, 5, 511 }, { 10, 511, 10, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 40, 511, 40, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 25, 511, 25, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 25, 511, 25 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 11, 11, 11, 11 }, { 14, 14, 14, 14 }, { 16, 16, 16, 16 }, { 511, 511, 511, 511 }, } }, { { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 163, 511, 163 }, { 342, 511, 342, 511 }, { 511, 511, 511, 511 }, { 511, 10, 511, 10 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 163, 511, 163, 511 }, { 511, 342, 511, 342 }, { 10, 511, 10, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 34, 34, 34, 34 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 123, 511, 123 }, { 40, 511, 511, 14 }, { 511, 511, 511, 511 }, { 511, 14, 40, 511 }, { 511, 40, 14, 511 }, { 511, 511, 511, 511 }, { 511, 
49, 511, 49 }, { 511, 511, 511, 511 }, { 14, 511, 511, 40 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 123, 511, 123, 511 }, { 511, 511, 511, 511 }, { 49, 511, 49, 511 }, { 511, 511, 511, 511 }, { 429, 429, 429, 429 }, { 62, 62, 62, 62 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 114, 511, 114, 511 }, { 511, 511, 511, 511 }, { 511, 57, 511, 511 }, { 511, 9, 511, 9 }, { 511, 511, 511, 57 }, { 57, 511, 511, 511 }, { 14, 14, 46, 46 }, { 511, 511, 511, 511 }, { 46, 46, 14, 14 }, { 511, 511, 57, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 114, 511, 114 }, { 511, 511, 511, 511 }, { 9, 511, 9, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 4, 511, 4 }, { 16, 7, 16, 7 }, { 151, 511, 151, 511 }, { 511, 511, 511, 511 }, { 511, 511, 11, 511 }, { 40, 18, 40, 18 }, { 11, 511, 511, 511 }, { 511, 511, 511, 11 }, { 511, 511, 20, 20 }, { 511, 511, 511, 511 }, { 20, 20, 511, 511 }, { 511, 11, 511, 511 }, { 4, 511, 4, 511 }, { 7, 16, 7, 16 }, { 511, 151, 511, 151 }, { 511, 511, 511, 511 }, { 18, 40, 18, 40 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 7, 511, 7 }, { 511, 511, 511, 511 }, { 511, 12, 511, 12 }, { 511, 511, 511, 511 }, { 511, 511, 24, 23 }, { 5, 511, 5, 511 }, { 24, 23, 511, 511 }, { 511, 511, 23, 24 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 23, 24, 511, 511 }, { 7, 511, 7, 511 }, { 511, 511, 511, 511 }, { 12, 511, 12, 511 }, { 511, 511, 511, 511 }, { 511, 5, 511, 5 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, }, { { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 4 }, { 30, 511, 30, 511 }, { 511, 4, 511, 511 }, { 511, 511, 4, 511 }, 
{ 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 4, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 511, 30, 511, 30 }, { 511, 511, 511, 511 }, { 511, 511, 511, 511 }, { 75, 75, 75, 75 }, { 21, 21, 21, 21 }, { 511, 511, 511, 511 }, }, }, }; const int16_t dav2d_gdf_weight[6][6][3][22][4] = { { { { { -5, 0, -5, 0 }, { 13, 0, 13, 0 }, { -19, 0, -19, 0 }, { 15, 0, 15, 0 }, { 19, 0, -24, 0 }, { -17, -36, -17, -36 }, { -24, 0, 19, 0 }, { 0, 19, 0, -24 }, { -39, -39, 39, 39 }, { 103, 327, 103, 327 }, { 39, 39, -39, -39 }, { 0, -24, 0, 19 }, { 0, -5, 0, -5 }, { 0, 13, 0, 13 }, { 0, -19, 0, -19 }, { 0, 15, 0, 15 }, { -36, -17, -36, -17 }, { 327, 103, 327, 103 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -11, 0, -11 }, { 0, 33, 0, 33 }, { 0, -60, 0, -60 }, { 0, 91, 0, 91 }, { 0, 11, 0, -12 }, { -29, -138, -29, -138 }, { 0, -12, 0, 11 }, { 11, 0, -12, 0 }, { -22, -22, 17, 17 }, { 36, 136, 36, 136 }, { 17, 17, -22, -22 }, { -12, 0, 11, 0 }, { -11, 0, -11, 0 }, { 33, 0, 33, 0 }, { -60, 0, -60, 0 }, { 91, 0, 91, 0 }, { -138, -29, -138, -29 }, { 136, 36, 136, 36 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -15, -31, -15, -31 }, { 0, 0, 0, 0 }, { 13, 46, 13, 46 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -49, -12, -49, -12 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -31, -15, -31, -15 }, { 46, 13, 46, 13 }, { -12, -49, -12, -49 }, { 59, 59, 59, 59 }, { 57, 57, 57, 57 }, { 35, 35, 35, 35 }, { 22, 22, 22, 22 }, }, }, { { { -4, 0, -4, 0 }, { 10, 0, 10, 0 }, { -14, 0, -14, 0 }, { -2, 0, -2, 0 }, { 7, 0, -44, 0 }, { 23, 80, 23, 80 }, { -44, 0, 7, 0 }, { 0, 7, 0, -44 }, { -41, -41, 126, 126 }, { -110, 135, -110, 135 }, { 126, 126, -41, -41 }, { 0, -44, 0, 7 }, { 0, -4, 0, -4 }, { 0, 10, 0, 10 }, { 0, -14, 0, -14 }, { 0, -2, 0, -2 }, { 80, 
23, 80, 23 }, { 135, -110, 135, -110 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -9, 0, -9 }, { 0, 9, 0, 9 }, { 0, -8, 0, -8 }, { 0, -22, 0, -22 }, { 0, 2, 0, -42 }, { 34, 244, 34, 244 }, { 0, -42, 0, 2 }, { 2, 0, -42, 0 }, { 1, 1, 122, 122 }, { -53, 139, -53, 139 }, { 122, 122, 1, 1 }, { -42, 0, 2, 0 }, { -9, 0, -9, 0 }, { 9, 0, 9, 0 }, { -8, 0, -8, 0 }, { -22, 0, -22, 0 }, { 244, 34, 244, 34 }, { 139, -53, 139, -53 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 55, 49, 55, 49 }, { 0, 0, 0, 0 }, { 1, 56, 1, 56 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -105, -71, -105, -71 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 49, 55, 49, 55 }, { 56, 1, 56, 1 }, { -71, -105, -71, -105 }, { -12, -12, -12, -12 }, { 61, 61, 61, 61 }, { 51, 51, 51, 51 }, { -19, -19, -19, -19 }, }, }, { { { -20, 0, -20, 0 }, { 8, 0, 8, 0 }, { -16, 0, -16, 0 }, { -10, 0, -10, 0 }, { -16, 0, -24, 0 }, { 14, -36, 14, -36 }, { -24, 0, -16, 0 }, { 0, -16, 0, -24 }, { -27, -27, 52, 52 }, { -21, 106, -21, 106 }, { 52, 52, -27, -27 }, { 0, -24, 0, -16 }, { 0, -20, 0, -20 }, { 0, 8, 0, 8 }, { 0, -16, 0, -16 }, { 0, -10, 0, -10 }, { -36, 14, -36, 14 }, { 106, -21, 106, -21 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -14, 0, -14 }, { 0, -24, 0, -24 }, { 0, 462, 0, 462 }, { 0, 172, 0, 172 }, { 0, -15, 0, 472 }, { 33, -32, 33, -32 }, { 0, 472, 0, -15 }, { -15, 0, 472, 0 }, { 19, 19, 143, 143 }, { -126, 125, -126, 125 }, { 143, 143, 19, 19 }, { 472, 0, -15, 0 }, { -14, 0, -14, 0 }, { -24, 0, -24, 0 }, { 462, 0, 462, 0 }, { 172, 0, 172, 0 }, { -32, 33, -32, 33 }, { 125, -126, 125, -126 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -5, -265, -5, -265 }, { 0, 0, 0, 0 }, { 6, -2, 6, -2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 87, -194, 87, 
-194 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -265, -5, -265, -5 }, { -2, 6, -2, 6 }, { -194, 87, -194, 87 }, { -5, -5, -5, -5 }, { -4, -4, -4, -4 }, { 0, 0, 0, 0 }, { -4, -4, -4, -4 }, }, }, { { { -19, 0, -19, 0 }, { 3, 0, 3, 0 }, { -11, 0, -11, 0 }, { 11, 0, 11, 0 }, { -27, 0, -79, 0 }, { -99, 29, -99, 29 }, { -79, 0, -27, 0 }, { 0, -27, 0, -79 }, { -22, -22, 18, 18 }, { -4, 90, -4, 90 }, { 18, 18, -22, -22 }, { 0, -79, 0, -27 }, { 0, -19, 0, -19 }, { 0, 3, 0, 3 }, { 0, -11, 0, -11 }, { 0, 11, 0, 11 }, { 29, -99, 29, -99 }, { 90, -4, 90, -4 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -15, 0, -15 }, { 0, 511, 0, 511 }, { 0, 372, 0, 372 }, { 0, -23, 0, -23 }, { 0, -2, 0, 422 }, { 173, 243, 173, 243 }, { 0, 422, 0, -2 }, { -2, 0, 422, 0 }, { -21, -21, 84, 84 }, { -114, 110, -114, 110 }, { 84, 84, -21, -21 }, { 422, 0, -2, 0 }, { -15, 0, -15, 0 }, { 511, 0, 511, 0 }, { 372, 0, 372, 0 }, { -23, 0, -23, 0 }, { 243, 173, 243, 173 }, { 110, -114, 110, -114 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 39, 87, 39, 87 }, { 0, 0, 0, 0 }, { -394, -13, -394, -13 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -37, 51, -37, 51 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 87, 39, 87, 39 }, { -13, -394, -13, -394 }, { 51, -37, 51, -37 }, { 41, 41, 41, 41 }, { 33, 33, 33, 33 }, { -24, -24, -24, -24 }, { -20, -20, -20, -20 }, }, }, { { { -46, 0, -46, 0 }, { 15, 0, 15, 0 }, { -25, 0, -25, 0 }, { 10, 0, 10, 0 }, { -39, 0, -23, 0 }, { 144, 76, 144, 76 }, { -23, 0, -39, 0 }, { 0, -39, 0, -23 }, { 1, 1, 50, 50 }, { 75, 36, 75, 36 }, { 50, 50, 1, 1 }, { 0, -23, 0, -39 }, { 0, -46, 0, -46 }, { 0, 15, 0, 15 }, { 0, -25, 0, -25 }, { 0, 10, 0, 10 }, { 76, 144, 76, 144 }, { 36, 75, 36, 75 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -8, 0, -8 }, { 0, -6, 0, -6 }, { 
0, 511, 0, 511 }, { 0, -28, 0, -28 }, { 0, -14, 0, 289 }, { 509, 237, 509, 237 }, { 0, 289, 0, -14 }, { -14, 0, 289, 0 }, { 1, 1, 58, 58 }, { -94, -16, -94, -16 }, { 58, 58, 1, 1 }, { 289, 0, -14, 0 }, { -8, 0, -8, 0 }, { -6, 0, -6, 0 }, { 511, 0, 511, 0 }, { -28, 0, -28, 0 }, { 237, 509, 237, 509 }, { -16, -94, -16, -94 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -21, -152, -21, -152 }, { 0, 0, 0, 0 }, { -94, -46, -94, -46 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -95, 55, -95, 55 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -152, -21, -152, -21 }, { -46, -94, -46, -94 }, { 55, -95, 55, -95 }, { 2, 2, 2, 2 }, { 5, 5, 5, 5 }, { -6, -6, -6, -6 }, { -2, -2, -2, -2 }, }, }, { { { -35, 0, -35, 0 }, { -3, 0, -3, 0 }, { 511, 0, 511, 0 }, { -47, 0, -47, 0 }, { -40, 0, 318, 0 }, { -57, -88, -57, -88 }, { 318, 0, -40, 0 }, { 0, -40, 0, 318 }, { 30, 30, -280, -280 }, { 135, -76, 135, -76 }, { -280, -280, 30, 30 }, { 0, 318, 0, -40 }, { 0, -35, 0, -35 }, { 0, -3, 0, -3 }, { 0, 511, 0, 511 }, { 0, -47, 0, -47 }, { -88, -57, -88, -57 }, { -76, 135, -76, 135 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -32, 0, -32 }, { 0, -11, 0, -11 }, { 0, 511, 0, 511 }, { 0, -31, 0, -31 }, { 0, 5, 0, 320 }, { 16, 239, 16, 239 }, { 0, 320, 0, 5 }, { 5, 0, 320, 0 }, { -45, -45, 66, 66 }, { -154, -69, -154, -69 }, { 66, 66, -45, -45 }, { 320, 0, 5, 0 }, { -32, 0, -32, 0 }, { -11, 0, -11, 0 }, { 511, 0, 511, 0 }, { -31, 0, -31, 0 }, { 239, 16, 239, 16 }, { -69, -154, -69, -154 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 91, 17, 91, 17 }, { 0, 0, 0, 0 }, { -36, 87, -36, 87 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -6, -49, -6, -49 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 17, 91, 17, 91 }, { 87, -36, 87, -36 
}, { -49, -6, -49, -6 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 12, 12, 12, 12 }, }, }, }, { { { { -3, 0, -3, 0 }, { 6, 0, 6, 0 }, { -11, 0, -11, 0 }, { 26, 0, 26, 0 }, { -5, 0, 23, 0 }, { -21, 37, -21, 37 }, { 23, 0, -5, 0 }, { 0, -5, 0, 23 }, { 13, 13, -246, -246 }, { 17, -315, 17, -315 }, { -246, -246, 13, 13 }, { 0, 23, 0, -5 }, { 0, -3, 0, -3 }, { 0, 6, 0, 6 }, { 0, -11, 0, -11 }, { 0, 26, 0, 26 }, { 37, -21, 37, -21 }, { -315, 17, -315, 17 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -16, 0, -16 }, { 0, 9, 0, 9 }, { 0, -25, 0, -25 }, { 0, 28, 0, 28 }, { 0, -25, 0, 13 }, { -25, 26, -25, 26 }, { 0, 13, 0, -25 }, { -25, 0, 13, 0 }, { 6, 6, -112, -112 }, { 29, -23, 29, -23 }, { -112, -112, 6, 6 }, { 13, 0, -25, 0 }, { -16, 0, -16, 0 }, { 9, 0, 9, 0 }, { -25, 0, -25, 0 }, { 28, 0, 28, 0 }, { 26, -25, 26, -25 }, { -23, 29, -23, 29 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 17, 20, 17, 20 }, { 0, 0, 0, 0 }, { 7, 30, 7, 30 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -12, 32, -12, 32 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 20, 17, 20, 17 }, { 30, 7, 30, 7 }, { 32, -12, 32, -12 }, { -3, -3, -3, -3 }, { 24, 24, 24, 24 }, { -2, -2, -2, -2 }, { 4, 4, 4, 4 }, }, }, { { { -12, 0, -12, 0 }, { 19, 0, 19, 0 }, { -54, 0, -54, 0 }, { 79, 0, 79, 0 }, { -64, 0, 0, 0 }, { -4, 23, -4, 23 }, { 0, 0, -64, 0 }, { 0, -64, 0, 0 }, { 114, 114, 61, 61 }, { -12, -126, -12, -126 }, { 61, 61, 114, 114 }, { 0, 0, 0, -64 }, { 0, -12, 0, -12 }, { 0, 19, 0, 19 }, { 0, -54, 0, -54 }, { 0, 79, 0, 79 }, { 23, -4, 23, -4 }, { -126, -12, -126, -12 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 3, 0, 3 }, { 0, -5, 0, -5 }, { 0, 18, 0, 18 }, { 0, -36, 0, -36 }, { 0, 20, 0, 263 }, { 6, -15, 6, -15 }, { 0, 263, 0, 20 }, { 20, 0, 263, 0 }, { -30, -30, -9, -9 }, { 266, 329, 266, 329 }, { -9, -9, -30, -30 
}, { 263, 0, 20, 0 }, { 3, 0, 3, 0 }, { -5, 0, -5, 0 }, { 18, 0, 18, 0 }, { -36, 0, -36, 0 }, { -15, 6, -15, 6 }, { 329, 266, 329, 266 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -52, -39, -52, -39 }, { 0, 0, 0, 0 }, { -57, -74, -57, -74 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 156, 336, 156, 336 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -39, -52, -39, -52 }, { -74, -57, -74, -57 }, { 336, 156, 336, 156 }, { -16, -16, -16, -16 }, { -13, -13, -13, -13 }, { 6, 6, 6, 6 }, { 1, 1, 1, 1 }, }, }, { { { 85, 0, 85, 0 }, { 25, 0, 25, 0 }, { 10, 0, 10, 0 }, { -83, 0, -83, 0 }, { 58, 0, 83, 0 }, { 45, -14, 45, -14 }, { 83, 0, 58, 0 }, { 0, 58, 0, 83 }, { -16, -16, 18, 18 }, { -146, -123, -146, -123 }, { 18, 18, -16, -16 }, { 0, 83, 0, 58 }, { 0, 85, 0, 85 }, { 0, 25, 0, 25 }, { 0, 10, 0, 10 }, { 0, -83, 0, -83 }, { -14, 45, -14, 45 }, { -123, -146, -123, -146 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 33, 0, 33 }, { 0, -336, 0, -336 }, { 0, -294, 0, -294 }, { 0, 61, 0, 61 }, { 0, 14, 0, 17 }, { -1, -248, -1, -248 }, { 0, 17, 0, 14 }, { 14, 0, 17, 0 }, { 12, 12, -411, -411 }, { 30, -203, 30, -203 }, { -411, -411, 12, 12 }, { 17, 0, 14, 0 }, { 33, 0, 33, 0 }, { -336, 0, -336, 0 }, { -294, 0, -294, 0 }, { 61, 0, 61, 0 }, { -248, -1, -248, -1 }, { -203, 30, -203, 30 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 69, 61, 69, 61 }, { 0, 0, 0, 0 }, { -11, 75, -11, 75 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 168, 29, 168, 29 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 61, 69, 61, 69 }, { 75, -11, 75, -11 }, { 29, 168, 29, 168 }, { -25, -25, -25, -25 }, { 123, 123, 123, 123 }, { -1, -1, -1, -1 }, { 9, 9, 9, 9 }, }, }, { { { 21, 0, 21, 0 }, { 8, 0, 8, 0 }, { 2, 0, 2, 0 }, { 8, 0, 8, 0 }, { 61, 0, 
-11, 0 }, { -167, -81, -167, -81 }, { -11, 0, 61, 0 }, { 0, 61, 0, -11 }, { -1, -1, 0, 0 }, { 123, -38, 123, -38 }, { 0, 0, -1, -1 }, { 0, -11, 0, 61 }, { 0, 21, 0, 21 }, { 0, 8, 0, 8 }, { 0, 2, 0, 2 }, { 0, 8, 0, 8 }, { -81, -167, -81, -167 }, { -38, 123, -38, 123 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 11, 0, 11 }, { 0, -3, 0, -3 }, { 0, -512, 0, -512 }, { 0, -1, 0, -1 }, { 0, -307, 0, -468 }, { -291, -6, -291, -6 }, { 0, -468, 0, -307 }, { -307, 0, -468, 0 }, { 79, 79, 7, 7 }, { 12, -512, 12, -512 }, { 7, 7, 79, 79 }, { -468, 0, -307, 0 }, { 11, 0, 11, 0 }, { -3, 0, -3, 0 }, { -512, 0, -512, 0 }, { -1, 0, -1, 0 }, { -6, -291, -6, -291 }, { -512, 12, -512, 12 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 41, 192, 41, 192 }, { 0, 0, 0, 0 }, { -220, 146, -220, 146 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -15, -178, -15, -178 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 192, 41, 192, 41 }, { 146, -220, 146, -220 }, { -178, -15, -178, -15 }, { -8, -8, -8, -8 }, { 34, 34, 34, 34 }, { -35, -35, -35, -35 }, { -1, -1, -1, -1 }, }, }, { { { 78, 0, 78, 0 }, { -31, 0, -31, 0 }, { 22, 0, 22, 0 }, { 31, 0, 31, 0 }, { 71, 0, -73, 0 }, { -48, -169, -48, -169 }, { -73, 0, 71, 0 }, { 0, 71, 0, -73 }, { 37, 37, 3, 3 }, { -31, -94, -31, -94 }, { 3, 3, 37, 37 }, { 0, -73, 0, 71 }, { 0, 78, 0, 78 }, { 0, -31, 0, -31 }, { 0, 22, 0, 22 }, { 0, 31, 0, 31 }, { -169, -48, -169, -48 }, { -94, -31, -94, -31 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, 469, 0, 469 }, { 0, -123, 0, 511 }, { 19, 51, 19, 51 }, { 0, 511, 0, -123 }, { -123, 0, 511, 0 }, { -4, -4, 511, 511 }, { -58, 74, -58, 74 }, { 511, 511, -4, -4 }, { 511, 0, -123, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { 469, 0, 469, 0 }, { 51, 19, 51, 19 }, { 74, 
-58, 74, -58 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -21, 15, -21, 15 }, { 0, 0, 0, 0 }, { -63, -246, -63, -246 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -33, 87, -33, 87 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 15, -21, 15, -21 }, { -246, -63, -246, -63 }, { 87, -33, 87, -33 }, { 38, 38, 38, 38 }, { -53, -53, -53, -53 }, { 8, 8, 8, 8 }, { 0, 0, 0, 0 }, }, }, { { { 216, 0, 216, 0 }, { -28, 0, -28, 0 }, { -512, 0, -512, 0 }, { -124, 0, -124, 0 }, { 34, 0, -512, 0 }, { -53, 26, -53, 26 }, { -512, 0, 34, 0 }, { 0, 34, 0, -512 }, { 51, 51, -50, -50 }, { 10, 144, 10, 144 }, { -50, -50, 51, 51 }, { 0, -512, 0, 34 }, { 0, 216, 0, 216 }, { 0, -28, 0, -28 }, { 0, -512, 0, -512 }, { 0, -124, 0, -124 }, { 26, -53, 26, -53 }, { 144, 10, 144, 10 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, -174, 0, -174 }, { 0, 92, 0, 92 }, { 0, -189, 0, 291 }, { -42, 186, -42, 186 }, { 0, 291, 0, -189 }, { -189, 0, 291, 0 }, { -147, -147, 67, 67 }, { -88, 411, -88, 411 }, { 67, 67, -147, -147 }, { 291, 0, -189, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { -174, 0, -174, 0 }, { 92, 0, 92, 0 }, { 186, -42, 186, -42 }, { 411, -88, 411, -88 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 93, 71, 93, 71 }, { 0, 0, 0, 0 }, { -57, -89, -57, -89 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 22, 371, 22, 371 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 71, 93, 71, 93 }, { -89, -57, -89, -57 }, { 371, 22, 371, 22 }, { -3, -3, -3, -3 }, { 2, 2, 2, 2 }, { 2, 2, 2, 2 }, { 10, 10, 10, 10 }, }, }, }, { { { { 6, 0, 6, 0 }, { -9, 0, -9, 0 }, { 16, 0, 16, 0 }, { -1, 0, -1, 0 }, { 38, 0, 4, 0 }, { -73, -23, -73, -23 }, { 4, 0, 38, 0 }, { 0, 38, 0, 4 }, { -74, -74, -35, -35 }, { 195, 
157, 195, 157 }, { -35, -35, -74, -74 }, { 0, 4, 0, 38 }, { 0, 6, 0, 6 }, { 0, -9, 0, -9 }, { 0, 16, 0, 16 }, { 0, -1, 0, -1 }, { -23, -73, -23, -73 }, { 157, 195, 157, 195 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -10, 0, -10 }, { 0, 5, 0, 5 }, { 0, -9, 0, -9 }, { 0, -2, 0, -2 }, { 0, -19, 0, -8 }, { 12, 16, 12, 16 }, { 0, -8, 0, -19 }, { -19, 0, -8, 0 }, { -8, -8, -4, -4 }, { -119, -139, -119, -139 }, { -4, -4, -8, -8 }, { -8, 0, -19, 0 }, { -10, 0, -10, 0 }, { 5, 0, 5, 0 }, { -9, 0, -9, 0 }, { -2, 0, -2, 0 }, { 16, 12, 16, 12 }, { -139, -119, -139, -119 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 29, 5, 29, 5 }, { 0, 0, 0, 0 }, { 11, 36, 11, 36 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -203, -344, -203, -344 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 5, 29, 5, 29 }, { 36, 11, 36, 11 }, { -344, -203, -344, -203 }, { -5, -5, -5, -5 }, { -9, -9, -9, -9 }, { 5, 5, 5, 5 }, { 8, 8, 8, 8 }, }, }, { { { -8, 0, -8, 0 }, { 1, 0, 1, 0 }, { -13, 0, -13, 0 }, { 17, 0, 17, 0 }, { -29, 0, 1, 0 }, { -19, -140, -19, -140 }, { 1, 0, -29, 0 }, { 0, -29, 0, 1 }, { 7, 7, 7, 7 }, { -20, -6, -20, -6 }, { 7, 7, 7, 7 }, { 0, 1, 0, -29 }, { 0, -8, 0, -8 }, { 0, 1, 0, 1 }, { 0, -13, 0, -13 }, { 0, 17, 0, 17 }, { -140, -19, -140, -19 }, { -6, -20, -6, -20 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -8, 0, -8 }, { 0, 29, 0, 29 }, { 0, -39, 0, -39 }, { 0, -12, 0, -12 }, { 0, 126, 0, 362 }, { 49, -41, 49, -41 }, { 0, 362, 0, 126 }, { 126, 0, 362, 0 }, { -67, -67, -22, -22 }, { -110, 276, -110, 276 }, { -22, -22, -67, -67 }, { 362, 0, 126, 0 }, { -8, 0, -8, 0 }, { 29, 0, 29, 0 }, { -39, 0, -39, 0 }, { -12, 0, -12, 0 }, { -41, 49, -41, 49 }, { 276, -110, 276, -110 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 32, 22, 32, 22 
}, { 0, 0, 0, 0 }, { 14, -293, 14, -293 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -63, -386, -63, -386 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 22, 32, 22, 32 }, { -293, 14, -293, 14 }, { -386, -63, -386, -63 }, { -9, -9, -9, -9 }, { -13, -13, -13, -13 }, { 15, 15, 15, 15 }, { 15, 15, 15, 15 }, }, }, { { { 6, 0, 6, 0 }, { 17, 0, 17, 0 }, { -34, 0, -34, 0 }, { 15, 0, 15, 0 }, { 52, 0, 96, 0 }, { -16, 87, -16, 87 }, { 96, 0, 52, 0 }, { 0, 52, 0, 96 }, { -22, -22, -37, -37 }, { -55, -74, -55, -74 }, { -37, -37, -22, -22 }, { 0, 96, 0, 52 }, { 0, 6, 0, 6 }, { 0, 17, 0, 17 }, { 0, -34, 0, -34 }, { 0, 15, 0, 15 }, { 87, -16, 87, -16 }, { -74, -55, -74, -55 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -15, 0, -15 }, { 0, 21, 0, 21 }, { 0, -7, 0, -7 }, { 0, -40, 0, -40 }, { 0, -27, 0, 365 }, { -51, 256, -51, 256 }, { 0, 365, 0, -27 }, { -27, 0, 365, 0 }, { -20, -20, 58, 58 }, { -98, 197, -98, 197 }, { 58, 58, -20, -20 }, { 365, 0, -27, 0 }, { -15, 0, -15, 0 }, { 21, 0, 21, 0 }, { -7, 0, -7, 0 }, { -40, 0, -40, 0 }, { 256, -51, 256, -51 }, { 197, -98, 197, -98 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -2, -13, -2, -13 }, { 0, 0, 0, 0 }, { -32, -134, -32, -134 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -135, -64, -135, -64 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -13, -2, -13, -2 }, { -134, -32, -134, -32 }, { -64, -135, -64, -135 }, { -2, -2, -2, -2 }, { -14, -14, -14, -14 }, { 11, 11, 11, 11 }, { -14, -14, -14, -14 }, }, }, { { { -6, 0, -6, 0 }, { -20, 0, -20, 0 }, { 28, 0, 28, 0 }, { -1, 0, -1, 0 }, { -38, 0, -70, 0 }, { -99, 32, -99, 32 }, { -70, 0, -38, 0 }, { 0, -38, 0, -70 }, { 9, 9, -138, -138 }, { 2, 146, 2, 146 }, { -138, -138, 9, 9 }, { 0, -70, 0, -38 }, { 0, -6, 0, -6 }, { 0, -20, 0, -20 }, { 0, 28, 0, 28 }, { 0, -1, 0, -1 }, { 32, -99, 32, -99 
}, { 146, 2, 146, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -34, 0, -34 }, { 0, 26, 0, 26 }, { 0, 14, 0, 14 }, { 0, -61, 0, -61 }, { 0, -1, 0, 414 }, { -83, 511, -83, 511 }, { 0, 414, 0, -1 }, { -1, 0, 414, 0 }, { -52, -52, 302, 302 }, { -72, 480, -72, 480 }, { 302, 302, -52, -52 }, { 414, 0, -1, 0 }, { -34, 0, -34, 0 }, { 26, 0, 26, 0 }, { 14, 0, 14, 0 }, { -61, 0, -61, 0 }, { 511, -83, 511, -83 }, { 480, -72, 480, -72 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 57, 90, 57, 90 }, { 0, 0, 0, 0 }, { -189, 286, -189, 286 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 16, 100, 16, 100 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 90, 57, 90, 57 }, { 286, -189, 286, -189 }, { 100, 16, 100, 16 }, { -25, -25, -25, -25 }, { 3, 3, 3, 3 }, { -3, -3, -3, -3 }, { 2, 2, 2, 2 }, }, }, { { { -69, 0, -69, 0 }, { 23, 0, 23, 0 }, { -18, 0, -18, 0 }, { -51, 0, -51, 0 }, { -45, 0, 77, 0 }, { 47, 105, 47, 105 }, { 77, 0, -45, 0 }, { 0, -45, 0, 77 }, { -40, -40, 94, 94 }, { -4, 34, -4, 34 }, { 94, 94, -40, -40 }, { 0, 77, 0, -45 }, { 0, -69, 0, -69 }, { 0, 23, 0, 23 }, { 0, -18, 0, -18 }, { 0, -51, 0, -51 }, { 105, 47, 105, 47 }, { 34, -4, 34, -4 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, -61, 0, -61 }, { 0, 72, 0, 72 }, { 0, -13, 0, 511 }, { 28, -58, 28, -58 }, { 0, 511, 0, -13 }, { -13, 0, 511, 0 }, { -32, -32, 511, 511 }, { -58, 131, -58, 131 }, { 511, 511, -32, -32 }, { 511, 0, -13, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { -61, 0, -61, 0 }, { 72, 0, 72, 0 }, { -58, 28, -58, 28 }, { 131, -58, 131, -58 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -10, -222, -10, -222 }, { 0, 0, 0, 0 }, { -163, -28, -163, -28 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -97, 129, 
-97, 129 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -222, -10, -222, -10 }, { -28, -163, -28, -163 }, { 129, -97, 129, -97 }, { 2, 2, 2, 2 }, { -88, -88, -88, -88 }, { 2, 2, 2, 2 }, { -6, -6, -6, -6 }, }, }, { { { 132, 0, 132, 0 }, { -72, 0, -72, 0 }, { 89, 0, 89, 0 }, { -285, 0, -285, 0 }, { 152, 0, -7, 0 }, { 31, 10, 31, 10 }, { -7, 0, 152, 0 }, { 0, 152, 0, -7 }, { 65, 65, -139, -139 }, { -61, -83, -61, -83 }, { -139, -139, 65, 65 }, { 0, -7, 0, 152 }, { 0, 132, 0, 132 }, { 0, -72, 0, -72 }, { 0, 89, 0, 89 }, { 0, -285, 0, -285 }, { 10, 31, 10, 31 }, { -83, -61, -83, -61 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, -402, 0, -402 }, { 0, -194, 0, -194 }, { 0, 142, 0, -174 }, { 20, -22, 20, -22 }, { 0, -174, 0, 142 }, { 142, 0, -174, 0 }, { 94, 94, -55, -55 }, { 32, 157, 32, 157 }, { -55, -55, 94, 94 }, { -174, 0, 142, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { -402, 0, -402, 0 }, { -194, 0, -194, 0 }, { -22, 20, -22, 20 }, { 157, 32, 157, 32 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -512, 29, -512, 29 }, { 0, 0, 0, 0 }, { 10, -23, 10, -23 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -13, -64, -13, -64 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 29, -512, 29, -512 }, { -23, 10, -23, 10 }, { -64, -13, -64, -13 }, { 108, 108, 108, 108 }, { -2, -2, -2, -2 }, { -2, -2, -2, -2 }, { -4, -4, -4, -4 }, }, }, }, { { { { 4, 0, 4, 0 }, { -2, 0, -2, 0 }, { 6, 0, 6, 0 }, { -29, 0, -29, 0 }, { 17, 0, -23, 0 }, { 6, -51, 6, -51 }, { -23, 0, 17, 0 }, { 0, 17, 0, -23 }, { -37, -37, 206, 206 }, { 90, 323, 90, 323 }, { 206, 206, -37, -37 }, { 0, -23, 0, 17 }, { 0, 4, 0, 4 }, { 0, -2, 0, -2 }, { 0, 6, 0, 6 }, { 0, -29, 0, -29 }, { -51, 6, -51, 6 }, { 323, 90, 323, 90 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 
}, { { 0, -19, 0, -19 }, { 0, 0, 0, 0 }, { 0, -8, 0, -8 }, { 0, -16, 0, -16 }, { 0, -33, 0, 19 }, { -22, 58, -22, 58 }, { 0, 19, 0, -33 }, { -33, 0, 19, 0 }, { 20, 20, -60, -60 }, { -15, 8, -15, 8 }, { -60, -60, 20, 20 }, { 19, 0, -33, 0 }, { -19, 0, -19, 0 }, { 0, 0, 0, 0 }, { -8, 0, -8, 0 }, { -16, 0, -16, 0 }, { 58, -22, 58, -22 }, { 8, -15, 8, -15 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -12, -12, -12, -12 }, { 0, 0, 0, 0 }, { -21, -35, -21, -35 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 17, -59, 17, -59 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -12, -12, -12, -12 }, { -35, -21, -35, -21 }, { -59, 17, -59, 17 }, { -5, -5, -5, -5 }, { -15, -15, -15, -15 }, { -10, -10, -10, -10 }, { -14, -14, -14, -14 }, }, }, { { { 3, 0, 3, 0 }, { 24, 0, 24, 0 }, { -12, 0, -12, 0 }, { 7, 0, 7, 0 }, { 50, 0, -87, 0 }, { 15, -1, 15, -1 }, { -87, 0, 50, 0 }, { 0, 50, 0, -87 }, { -88, -88, -48, -48 }, { -12, 37, -12, 37 }, { -48, -48, -88, -88 }, { 0, -87, 0, 50 }, { 0, 3, 0, 3 }, { 0, 24, 0, 24 }, { 0, -12, 0, -12 }, { 0, 7, 0, 7 }, { -1, 15, -1, 15 }, { 37, -12, 37, -12 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -1, 0, -1 }, { 0, 1, 0, 1 }, { 0, -21, 0, -21 }, { 0, 64, 0, 64 }, { 0, -24, 0, -357 }, { -3, 30, -3, 30 }, { 0, -357, 0, -24 }, { -24, 0, -357, 0 }, { 53, 53, 7, 7 }, { -237, -435, -237, -435 }, { 7, 7, 53, 53 }, { -357, 0, -24, 0 }, { -1, 0, -1, 0 }, { 1, 0, 1, 0 }, { -21, 0, -21, 0 }, { 64, 0, 64, 0 }, { 30, -3, 30, -3 }, { -435, -237, -435, -237 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 44, 32, 44, 32 }, { 0, 0, 0, 0 }, { 18, 24, 18, 24 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 50, -112, 50, -112 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 32, 44, 32, 44 }, { 24, 18, 24, 18 
}, { -112, 50, -112, 50 }, { -25, -25, -25, -25 }, { 93, 93, 93, 93 }, { 3, 3, 3, 3 }, { 9, 9, 9, 9 }, }, }, { { { 3, 0, 3, 0 }, { 22, 0, 22, 0 }, { 5, 0, 5, 0 }, { -70, 0, -70, 0 }, { 62, 0, 122, 0 }, { 41, -77, 41, -77 }, { 122, 0, 62, 0 }, { 0, 62, 0, 122 }, { -47, -47, 325, 325 }, { -123, 230, -123, 230 }, { 325, 325, -47, -47 }, { 0, 122, 0, 62 }, { 0, 3, 0, 3 }, { 0, 22, 0, 22 }, { 0, 5, 0, 5 }, { 0, -70, 0, -70 }, { -77, 41, -77, 41 }, { 230, -123, 230, -123 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -7, 0, -7 }, { 0, 12, 0, 12 }, { 0, -19, 0, -19 }, { 0, 449, 0, 449 }, { 0, 2, 0, 182 }, { -6, 4, -6, 4 }, { 0, 182, 0, 2 }, { 2, 0, 182, 0 }, { -3, -3, 124, 124 }, { -14, 405, -14, 405 }, { 124, 124, -3, -3 }, { 182, 0, 2, 0 }, { -7, 0, -7, 0 }, { 12, 0, 12, 0 }, { -19, 0, -19, 0 }, { 449, 0, 449, 0 }, { 4, -6, 4, -6 }, { 405, -14, 405, -14 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -62, -32, -62, -32 }, { 0, 0, 0, 0 }, { -12, -134, -12, -134 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -117, 330, -117, 330 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -32, -62, -32, -62 }, { -134, -12, -134, -12 }, { 330, -117, 330, -117 }, { -32, -32, -32, -32 }, { -97, -97, -97, -97 }, { -62, -62, -62, -62 }, { 20, 20, 20, 20 }, }, }, { { { -21, 0, -21, 0 }, { -25, 0, -25, 0 }, { 16, 0, 16, 0 }, { -31, 0, -31, 0 }, { -58, 0, 59, 0 }, { -19, 161, -19, 161 }, { 59, 0, -58, 0 }, { 0, -58, 0, 59 }, { 47, 47, -70, -70 }, { -8, 4, -8, 4 }, { -70, -70, 47, 47 }, { 0, 59, 0, -58 }, { 0, -21, 0, -21 }, { 0, -25, 0, -25 }, { 0, 16, 0, 16 }, { 0, -31, 0, -31 }, { 161, -19, 161, -19 }, { 4, -8, 4, -8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 47, 0, 47 }, { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, 26, 0, 26 }, { 0, 0, 0, -473 }, { -10, -174, -10, -174 }, { 0, -473, 0, 0 }, { 0, 0, -473, 0 
}, { 50, 50, -136, -136 }, { 43, -512, 43, -512 }, { -136, -136, 50, 50 }, { -473, 0, 0, 0 }, { 47, 0, 47, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { 26, 0, 26, 0 }, { -174, -10, -174, -10 }, { -512, 43, -512, 43 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 52, 226, 52, 226 }, { 0, 0, 0, 0 }, { 112, 178, 112, 178 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 94, -224, 94, -224 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 226, 52, 226, 52 }, { 178, 112, 178, 112 }, { -224, 94, -224, 94 }, { -16, -16, -16, -16 }, { 62, 62, 62, 62 }, { -11, -11, -11, -11 }, { 4, 4, 4, 4 }, }, }, { { { 45, 0, 45, 0 }, { -15, 0, -15, 0 }, { 15, 0, 15, 0 }, { -11, 0, -11, 0 }, { 75, 0, 91, 0 }, { -14, -51, -14, -51 }, { 91, 0, 75, 0 }, { 0, 75, 0, 91 }, { -63, -63, 89, 89 }, { -5, 44, -5, 44 }, { 89, 89, -63, -63 }, { 0, 91, 0, 75 }, { 0, 45, 0, 45 }, { 0, -15, 0, -15 }, { 0, 15, 0, 15 }, { 0, -11, 0, -11 }, { -51, -14, -51, -14 }, { 44, -5, 44, -5 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -34, 0, -34 }, { 0, 511, 0, 511 }, { 0, -67, 0, -67 }, { 0, 180, 0, 180 }, { 0, -94, 0, 495 }, { -12, -108, -12, -108 }, { 0, 495, 0, -94 }, { -94, 0, 495, 0 }, { -20, -20, 486, 486 }, { -11, 503, -11, 503 }, { 486, 486, -20, -20 }, { 495, 0, -94, 0 }, { -34, 0, -34, 0 }, { 511, 0, 511, 0 }, { -67, 0, -67, 0 }, { 180, 0, 180, 0 }, { -108, -12, -108, -12 }, { 503, -11, 503, -11 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 26, 511, 26, 511 }, { 0, 0, 0, 0 }, { -22, -33, -22, -33 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 33, 243, 33, 243 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 511, 26, 511, 26 }, { -33, -22, -33, -22 }, { 243, 33, 243, 33 }, { -13, -13, -13, -13 }, { -1, -1, -1, -1 }, { 6, 6, 6, 6 }, { 5, 5, 5, 
5 }, }, }, { { { -120, 0, -120, 0 }, { 62, 0, 62, 0 }, { -87, 0, -87, 0 }, { 426, 0, 426, 0 }, { -153, 0, -80, 0 }, { -16, -28, -16, -28 }, { -80, 0, -153, 0 }, { 0, -153, 0, -80 }, { -48, -48, 482, 482 }, { 72, 95, 72, 95 }, { 482, 482, -48, -48 }, { 0, -80, 0, -153 }, { 0, -120, 0, -120 }, { 0, 62, 0, 62 }, { 0, -87, 0, -87 }, { 0, 426, 0, 426 }, { -28, -16, -28, -16 }, { 95, 72, 95, 72 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 466, 0, 466 }, { 0, 238, 0, 238 }, { 0, 109, 0, 109 }, { 0, -87, 0, 148 }, { -23, -12, -23, -12 }, { 0, 148, 0, -87 }, { -87, 0, 148, 0 }, { -63, -63, 53, 53 }, { -21, -113, -21, -113 }, { 53, 53, -63, -63 }, { 148, 0, -87, 0 }, { 511, 0, 511, 0 }, { 466, 0, 466, 0 }, { 238, 0, 238, 0 }, { 109, 0, 109, 0 }, { -12, -23, -12, -23 }, { -113, -21, -113, -21 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -512, 20, -512, 20 }, { 0, 0, 0, 0 }, { 9, -8, 9, -8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -18, -37, -18, -37 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 20, -512, 20, -512 }, { -8, 9, -8, 9 }, { -37, -18, -37, -18 }, { -5, -5, -5, -5 }, { -8, -8, -8, -8 }, { 54, 54, 54, 54 }, { 63, 63, 63, 63 }, }, }, }, { { { { 6, 0, 6, 0 }, { 1, 0, 1, 0 }, { 10, 0, 10, 0 }, { -38, 0, -38, 0 }, { 27, 0, -72, 0 }, { 58, -53, 58, -53 }, { -72, 0, 27, 0 }, { 0, 27, 0, -72 }, { -47, -47, 293, 293 }, { -114, 366, -114, 366 }, { 293, 293, -47, -47 }, { 0, -72, 0, 27 }, { 0, 6, 0, 6 }, { 0, 1, 0, 1 }, { 0, 10, 0, 10 }, { 0, -38, 0, -38 }, { -53, 58, -53, 58 }, { 366, -114, 366, -114 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 18, 0, 18 }, { 0, -18, 0, -18 }, { 0, 9, 0, 9 }, { 0, 9, 0, 9 }, { 0, 20, 0, -18 }, { 27, -20, 27, -20 }, { 0, -18, 0, 20 }, { 20, 0, -18, 0 }, { 9, 9, 140, 140 }, { -22, 35, -22, 35 }, { 140, 140, 9, 9 }, { -18, 0, 20, 0 }, { 
18, 0, 18, 0 }, { -18, 0, -18, 0 }, { 9, 0, 9, 0 }, { 9, 0, 9, 0 }, { -20, 27, -20, 27 }, { 35, -22, 35, -22 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -2, -16, -2, -16 }, { 0, 0, 0, 0 }, { -11, 45, -11, 45 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 24, 154, 24, 154 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -16, -2, -16, -2 }, { 45, -11, 45, -11 }, { 154, 24, 154, 24 }, { 6, 6, 6, 6 }, { 16, 16, 16, 16 }, { -11, -11, -11, -11 }, { -9, -9, -9, -9 }, }, }, { { { -3, 0, -3, 0 }, { 30, 0, 30, 0 }, { -32, 0, -32, 0 }, { -19, 0, -19, 0 }, { 46, 0, -26, 0 }, { 48, -55, 48, -55 }, { -26, 0, 46, 0 }, { 0, 46, 0, -26 }, { -105, -105, 6, 6 }, { -88, 345, -88, 345 }, { 6, 6, -105, -105 }, { 0, -26, 0, 46 }, { 0, -3, 0, -3 }, { 0, 30, 0, 30 }, { 0, -32, 0, -32 }, { 0, -19, 0, -19 }, { -55, 48, -55, 48 }, { 345, -88, 345, -88 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 7, 0, 7 }, { 0, -10, 0, -10 }, { 0, 2, 0, 2 }, { 0, -8, 0, -8 }, { 0, 19, 0, 256 }, { 11, -18, 11, -18 }, { 0, 256, 0, 19 }, { 19, 0, 256, 0 }, { 3, 3, 4, 4 }, { -9, 299, -9, 299 }, { 4, 4, 3, 3 }, { 256, 0, 19, 0 }, { 7, 0, 7, 0 }, { -10, 0, -10, 0 }, { 2, 0, 2, 0 }, { -8, 0, -8, 0 }, { -18, 11, -18, 11 }, { 299, -9, 299, -9 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -31, -41, -31, -41 }, { 0, 0, 0, 0 }, { -26, 22, -26, 22 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -12, 279, -12, 279 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -41, -31, -41, -31 }, { 22, -26, 22, -26 }, { 279, -12, 279, -12 }, { -66, -66, -66, -66 }, { -10, -10, -10, -10 }, { 6, 6, 6, 6 }, { 8, 8, 8, 8 }, }, }, { { { 2, 0, 2, 0 }, { 11, 0, 11, 0 }, { 0, 0, 0, 0 }, { -59, 0, -59, 0 }, { 35, 0, 100, 0 }, { 1, -7, 1, -7 }, { 100, 0, 35, 0 }, { 0, 35, 0, 100 }, 
{ -13, -13, 185, 185 }, { -49, 324, -49, 324 }, { 185, 185, -13, -13 }, { 0, 100, 0, 35 }, { 0, 2, 0, 2 }, { 0, 11, 0, 11 }, { 0, 0, 0, 0 }, { 0, -59, 0, -59 }, { -7, 1, -7, 1 }, { 324, -49, 324, -49 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -21, 0, -21 }, { 0, 33, 0, 33 }, { 0, -36, 0, -36 }, { 0, -1, 0, -1 }, { 0, 232, 0, 440 }, { 17, -81, 17, -81 }, { 0, 440, 0, 232 }, { 232, 0, 440, 0 }, { -79, -79, 150, 150 }, { -112, 361, -112, 361 }, { 150, 150, -79, -79 }, { 440, 0, 232, 0 }, { -21, 0, -21, 0 }, { 33, 0, 33, 0 }, { -36, 0, -36, 0 }, { -1, 0, -1, 0 }, { -81, 17, -81, 17 }, { 361, -112, 361, -112 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -47, -43, -47, -43 }, { 0, 0, 0, 0 }, { -61, 62, -61, 62 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 84, 154, 84, 154 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -43, -47, -43, -47 }, { 62, -61, 62, -61 }, { 154, 84, 154, 84 }, { -4, -4, -4, -4 }, { 0, 0, 0, 0 }, { -68, -68, -68, -68 }, { -44, -44, -44, -44 }, }, }, { { { -23, 0, -23, 0 }, { -9, 0, -9, 0 }, { 0, 0, 0, 0 }, { -4, 0, -4, 0 }, { -65, 0, -19, 0 }, { 5, 108, 5, 108 }, { -19, 0, -65, 0 }, { 0, -65, 0, -19 }, { -18, -18, -15, -15 }, { 47, 118, 47, 118 }, { -15, -15, -18, -18 }, { 0, -19, 0, -65 }, { 0, -23, 0, -23 }, { 0, -9, 0, -9 }, { 0, 0, 0, 0 }, { 0, -4, 0, -4 }, { 108, 5, 108, 5 }, { 118, 47, 118, 47 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, 97, 0, 97 }, { 0, 15, 0, -448 }, { -18, -347, -18, -347 }, { 0, -448, 0, 15 }, { 15, 0, -448, 0 }, { 49, 49, -158, -158 }, { 76, -108, 76, -108 }, { -158, -158, 49, 49 }, { -448, 0, 15, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { 97, 0, 97, 0 }, { -347, -18, -347, -18 }, { -108, 76, -108, 76 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 
0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 40, 166, 40, 166 }, { 0, 0, 0, 0 }, { -10, 92, -10, 92 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 104, 28, 104, 28 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 166, 40, 166, 40 }, { 92, -10, 92, -10 }, { 28, 104, 28, 104 }, { -1, -1, -1, -1 }, { 57, 57, 57, 57 }, { -33, -33, -33, -33 }, { -5, -5, -5, -5 }, }, }, { { { -55, 0, -55, 0 }, { 20, 0, 20, 0 }, { -11, 0, -11, 0 }, { -38, 0, -38, 0 }, { -44, 0, 12, 0 }, { 25, 53, 25, 53 }, { 12, 0, -44, 0 }, { 0, -44, 0, 12 }, { -5, -5, 70, 70 }, { 34, 72, 34, 72 }, { 70, 70, -5, -5 }, { 0, 12, 0, -44 }, { 0, -55, 0, -55 }, { 0, 20, 0, 20 }, { 0, -11, 0, -11 }, { 0, -38, 0, -38 }, { 53, 25, 53, 25 }, { 72, 34, 72, 34 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, -45, 0, -45 }, { 0, 352, 0, 443 }, { 8, -18, 8, -18 }, { 0, 443, 0, 352 }, { 352, 0, 443, 0 }, { -50, -50, 374, 374 }, { -8, 468, -8, 468 }, { 374, 374, -50, -50 }, { 443, 0, 352, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { -45, 0, -45, 0 }, { -18, 8, -18, 8 }, { 468, -8, 468, -8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -10, 318, -10, 318 }, { 0, 0, 0, 0 }, { 105, 18, 105, 18 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 206, -378, 206, -378 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 318, -10, 318, -10 }, { 18, 105, 18, 105 }, { -378, 206, -378, 206 }, { 8, 8, 8, 8 }, { 60, 60, 60, 60 }, { -20, -20, -20, -20 }, { 2, 2, 2, 2 }, }, }, { { { -104, 0, -104, 0 }, { 62, 0, 62, 0 }, { -87, 0, -87, 0 }, { 392, 0, 392, 0 }, { -154, 0, -80, 0 }, { -2, -67, -2, -67 }, { -80, 0, -154, 0 }, { 0, -154, 0, -80 }, { -47, -47, 156, 156 }, { 89, 136, 89, 136 }, { 156, 156, -47, -47 }, { 0, -80, 0, -154 }, { 0, -104, 
0, -104 }, { 0, 62, 0, 62 }, { 0, -87, 0, -87 }, { 0, 392, 0, 392 }, { -67, -2, -67, -2 }, { 136, 89, 136, 89 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, -282, 0, -282 }, { 0, -103, 0, -103 }, { 0, 149, 0, -190 }, { 23, -9, 23, -9 }, { 0, -190, 0, 149 }, { 149, 0, -190, 0 }, { 87, 87, -62, -62 }, { 28, 94, 28, 94 }, { -62, -62, 87, 87 }, { -190, 0, 149, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { -282, 0, -282, 0 }, { -103, 0, -103, 0 }, { -9, 23, -9, 23 }, { 94, 28, 94, 28 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -512, 22, -512, 22 }, { 0, 0, 0, 0 }, { 12, -18, 12, -18 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -21, -16, -21, -16 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 22, -512, 22, -512 }, { -18, 12, -18, 12 }, { -16, -21, -16, -21 }, { 41, 41, 41, 41 }, { 34, 34, 34, 34 }, { 48, 48, 48, 48 }, { -3, -3, -3, -3 }, }, }, }, { { { { 16, 0, 16, 0 }, { -14, 0, -14, 0 }, { 24, 0, 24, 0 }, { -55, 0, -55, 0 }, { 35, 0, 7, 0 }, { 59, 46, 59, 46 }, { 7, 0, 35, 0 }, { 0, 35, 0, 7 }, { -55, -55, -14, -14 }, { -145, 55, -145, 55 }, { -14, -14, -55, -55 }, { 0, 7, 0, 35 }, { 0, 16, 0, 16 }, { 0, -14, 0, -14 }, { 0, 24, 0, 24 }, { 0, -55, 0, -55 }, { 46, 59, 46, 59 }, { 55, -145, 55, -145 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -11, 0, -11 }, { 0, -2, 0, -2 }, { 0, 7, 0, 7 }, { 0, 7, 0, 7 }, { 0, -19, 0, -2 }, { -33, -60, -33, -60 }, { 0, -2, 0, -19 }, { -19, 0, -2, 0 }, { -2, -2, 8, 8 }, { 11, 89, 11, 89 }, { 8, 8, -2, -2 }, { -2, 0, -19, 0 }, { -11, 0, -11, 0 }, { -2, 0, -2, 0 }, { 7, 0, 7, 0 }, { 7, 0, 7, 0 }, { -60, -33, -60, -33 }, { 89, 11, 89, 11 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -5, 4, -5, 4 }, { 0, 0, 0, 0 }, { -14, -78, -14, -78 
}, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -20, -133, -20, -133 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 4, -5, 4, -5 }, { -78, -14, -78, -14 }, { -133, -20, -133, -20 }, { 3, 3, 3, 3 }, { 35, 35, 35, 35 }, { -4, -4, -4, -4 }, { -5, -5, -5, -5 }, }, }, { { { 2, 0, 2, 0 }, { -17, 0, -17, 0 }, { 15, 0, 15, 0 }, { 1, 0, 1, 0 }, { 63, 0, 54, 0 }, { -55, 25, -55, 25 }, { 54, 0, 63, 0 }, { 0, 63, 0, 54 }, { 44, 44, -101, -101 }, { 65, -512, 65, -512 }, { -101, -101, 44, 44 }, { 0, 54, 0, 63 }, { 0, 2, 0, 2 }, { 0, -17, 0, -17 }, { 0, 15, 0, 15 }, { 0, 1, 0, 1 }, { 25, -55, 25, -55 }, { -512, 65, -512, 65 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -7, 0, -7 }, { 0, -9, 0, -9 }, { 0, -10, 0, -10 }, { 0, 65, 0, 65 }, { 0, -44, 0, -289 }, { -65, 53, -65, 53 }, { 0, -289, 0, -44 }, { -44, 0, -289, 0 }, { 43, 43, 41, 41 }, { 61, -107, 61, -107 }, { 41, 41, 43, 43 }, { -289, 0, -44, 0 }, { -7, 0, -7, 0 }, { -9, 0, -9, 0 }, { -10, 0, -10, 0 }, { 65, 0, 65, 0 }, { 53, -65, 53, -65 }, { -107, 61, -107, 61 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -27, -37, -27, -37 }, { 0, 0, 0, 0 }, { 10, -33, 10, -33 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -73, -32, -73, -32 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -37, -27, -37, -27 }, { -33, 10, -33, 10 }, { -32, -73, -32, -73 }, { 3, 3, 3, 3 }, { -6, -6, -6, -6 }, { 27, 27, 27, 27 }, { -8, -8, -8, -8 }, }, }, { { { 3, 0, 3, 0 }, { -21, 0, -21, 0 }, { 48, 0, 48, 0 }, { -9, 0, -9, 0 }, { -4, 0, 29, 0 }, { -38, -368, -38, -368 }, { 29, 0, -4, 0 }, { 0, -4, 0, 29 }, { -65, -65, -308, -308 }, { 124, -93, 124, -93 }, { -308, -308, -65, -65 }, { 0, 29, 0, -4 }, { 0, 3, 0, 3 }, { 0, -21, 0, -21 }, { 0, 48, 0, 48 }, { 0, -9, 0, -9 }, { -368, -38, -368, -38 }, { -93, 124, -93, 124 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 
{ 0, 0, 0, 0 }, }, { { 0, -12, 0, -12 }, { 0, -14, 0, -14 }, { 0, 10, 0, 10 }, { 0, -17, 0, -17 }, { 0, 52, 0, 47 }, { -56, 50, -56, 50 }, { 0, 47, 0, 52 }, { 52, 0, 47, 0 }, { -195, -195, 33, 33 }, { 45, 114, 45, 114 }, { 33, 33, -195, -195 }, { 47, 0, 52, 0 }, { -12, 0, -12, 0 }, { -14, 0, -14, 0 }, { 10, 0, 10, 0 }, { -17, 0, -17, 0 }, { 50, -56, 50, -56 }, { 114, 45, 114, 45 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -56, -83, -56, -83 }, { 0, 0, 0, 0 }, { 33, 47, 33, 47 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -116, 27, -116, 27 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -83, -56, -83, -56 }, { 47, 33, 47, 33 }, { 27, -116, 27, -116 }, { 9, 9, 9, 9 }, { -6, -6, -6, -6 }, { -4, -4, -4, -4 }, { -13, -13, -13, -13 }, }, }, { { { 9, 0, 9, 0 }, { 107, 0, 107, 0 }, { -36, 0, -36, 0 }, { 14, 0, 14, 0 }, { 8, 0, 101, 0 }, { 90, 142, 90, 142 }, { 101, 0, 8, 0 }, { 0, 8, 0, 101 }, { 16, 16, 222, 222 }, { -42, -62, -42, -62 }, { 222, 222, 16, 16 }, { 0, 101, 0, 8 }, { 0, 9, 0, 9 }, { 0, 107, 0, 107 }, { 0, -36, 0, -36 }, { 0, 14, 0, 14 }, { 142, 90, 142, 90 }, { -62, -42, -62, -42 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -512, 0, -512 }, { 0, -512, 0, -512 }, { 0, 23, 0, 23 }, { 0, 8, 0, 8 }, { 0, 7, 0, -142 }, { 135, -358, 135, -358 }, { 0, -142, 0, 7 }, { 7, 0, -142, 0 }, { 39, 39, -254, -254 }, { 125, -365, 125, -365 }, { -254, -254, 39, 39 }, { -142, 0, 7, 0 }, { -512, 0, -512, 0 }, { -512, 0, -512, 0 }, { 23, 0, 23, 0 }, { 8, 0, 8, 0 }, { -358, 135, -358, 135 }, { -365, 125, -365, 125 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -79, -102, -79, -102 }, { 0, 0, 0, 0 }, { 152, 89, 152, 89 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -41, -4, -41, -4 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 
0, 0, 0, 0 }, { -102, -79, -102, -79 }, { 89, 152, 89, 152 }, { -4, -41, -4, -41 }, { 1, 1, 1, 1 }, { -18, -18, -18, -18 }, { 4, 4, 4, 4 }, { 4, 4, 4, 4 }, }, }, { { { 46, 0, 46, 0 }, { -12, 0, -12, 0 }, { 14, 0, 14, 0 }, { -9, 0, -9, 0 }, { 60, 0, 71, 0 }, { -115, -91, -115, -91 }, { 71, 0, 60, 0 }, { 0, 60, 0, 71 }, { 23, 23, -4, -4 }, { -51, -52, -51, -52 }, { -4, -4, 23, 23 }, { 0, 71, 0, 60 }, { 0, 46, 0, 46 }, { 0, -12, 0, -12 }, { 0, 14, 0, 14 }, { 0, -9, 0, -9 }, { -91, -115, -91, -115 }, { -52, -51, -52, -51 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, -512, 0, -512 }, { 0, 41, 0, 41 }, { 0, -512, 0, -512 }, { 0, -39, 0, -39 }, { 0, 17, 0, -462 }, { -512, 0, -512, 0 }, { 0, -462, 0, 17 }, { 17, 0, -462, 0 }, { 41, 41, -97, -97 }, { 106, -174, 106, -174 }, { -97, -97, 41, 41 }, { -462, 0, 17, 0 }, { -512, 0, -512, 0 }, { 41, 0, 41, 0 }, { -512, 0, -512, 0 }, { -39, 0, -39, 0 }, { 0, -512, 0, -512 }, { -174, 106, -174, 106 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -48, -194, -48, -194 }, { 0, 0, 0, 0 }, { 356, -34, 356, -34 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -14, 129, -14, 129 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -194, -48, -194, -48 }, { -34, 356, -34, 356 }, { 129, -14, 129, -14 }, { 7, 7, 7, 7 }, { -31, -31, -31, -31 }, { 7, 7, 7, 7 }, { 2, 2, 2, 2 }, }, }, { { { -89, 0, -89, 0 }, { 38, 0, 38, 0 }, { -34, 0, -34, 0 }, { 6, 0, 6, 0 }, { -105, 0, -87, 0 }, { 263, -35, 263, -35 }, { -87, 0, -105, 0 }, { 0, -105, 0, -87 }, { -75, -75, 106, 106 }, { 50, 113, 50, 113 }, { 106, 106, -75, -75 }, { 0, -87, 0, -105 }, { 0, -89, 0, -89 }, { 0, 38, 0, 38 }, { 0, -34, 0, -34 }, { 0, 6, 0, 6 }, { -35, 263, -35, 263 }, { 113, 50, 113, 50 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 511, 0, 511 }, { 0, 511, 0, 511 }, { 0, 281, 0, 281 }, { 0, 104, 0, 104 }, { 0, 
-189, 0, 511 }, { 274, -22, 274, -22 }, { 0, 511, 0, -189 }, { -189, 0, 511, 0 }, { -110, -110, 148, 148 }, { -8, -180, -8, -180 }, { 148, 148, -110, -110 }, { 511, 0, -189, 0 }, { 511, 0, 511, 0 }, { 511, 0, 511, 0 }, { 281, 0, 281, 0 }, { 104, 0, 104, 0 }, { -22, 274, -22, 274 }, { -180, -8, -180, -8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, }, { { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -13, -33, -13, -33 }, { 0, 0, 0, 0 }, { 511, 29, 511, 29 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 11, 18, 11, 18 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { -33, -13, -33, -13 }, { 29, 511, 29, 511 }, { 18, 11, 18, 11 }, { 12, 12, 12, 12 }, { 12, 12, 12, 12 }, { -69, -69, -69, -69 }, { -10, -10, -10, -10 }, }, }, }, }; const int16_t dav2d_gdf_bias[6][6][3] = { { { 1925, -13583, 11215 }, { -3674, -1502, 8347 }, { -3652, -2643, 363 }, { -2178, -2054, -3486 }, { -1328, -2508, 2097 }, { -2124, -2024, -3121 }, }, { { -1067, -432, -4 }, { 9262, -3515, -3754 }, { -7059, 6034, -7748 }, { -6571, 91, 3451 }, { -5335, -1251, 12069 }, { -4763, 1391, -4988 }, }, { { 1762, -4143, 417 }, { -5345, 1826, -255 }, { -208, -3312, 9374 }, { -2768, -284, 264 }, { -1907, -6576, -1333 }, { -2787, -2222, 1078 }, }, { { 17278, 1136, -976 }, { -3596, 335, -11414 }, { -10642, -8146, -42 }, { -1286, -626, 1415 }, { -3309, -2694, -3776 }, { -2461, -3889, -2859 }, }, { { 10217, -10258, -835 }, { 3431, -3551, -6685 }, { -1138, -2845, 12088 }, { -847, -1156, 295 }, { -2479, -3801, -737 }, { -3460, -951, -3323 }, }, { { -12046, 10279, -209 }, { 9790, -177, -6841 }, { -3332, 12423, -2475 }, { -2864, -4263, -7763 }, { -3309, -2744, -5688 }, { 908, -3477, -929 }, } }; const int8_t dav2d_gdf_intra_error[6][4096] = { { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 
0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -3, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -3, -2, -2, -2, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, 0, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -3, -3, -2, -2, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, 0, -1, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, -2, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 
0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, 0, -1, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, 
-1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -2, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -2, -1, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 
-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -3, -3, -3, -2, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -4, -3, -3, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -4, -4, -3, -3, -3, -3, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, 
-3, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -2, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -4, -3, -3, -3, -3, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -3, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, 0, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, -3, -3, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -3, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, 0, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 3, 3, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 3, 4, 4, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, -2, -2, -2, -1, -1, 0, 0, 0, 0, -1, -1, -1, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -2, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -3, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -3, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -2, -2, -2, -2, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4, 4, -4, -3, -3, -3, -3, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, 0, 0, -1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -2, -2, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -2, -2, -2, -2, -1, -1, -2, -1, -1, -1, 0, 0, 0, 0, -1, -1, -2, -2, -3, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, -4, -4, -4, -3, -3, -3, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, -4, -4, -4, -3, -3, -3, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -3, -3, -3, -3, -3, -3, -2, -2, -1, -1, 0, -1, -1, 
-1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, -1, -1, -1, -1, -1, -3, -3, -3, -3, -2, -2, -2, -1, -1, 0, 0, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, -1, -1, -2, -2, -2, -1, -1, -2, -2, -1, -1, -1, -1, 0, 0, 0, -1, -1, -2, -2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, -4, -4, -4, -3, -3, -3, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -4, -4, -4, -3, -3, -3, -2, -2, -1, -1, 0, 0, -1, -1, -1, -1, -4, -4, -3, -3, -3, -3, -2, -2, -1, -1, 0, -1, -1, -1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, -1, -1, -1, -1, -1, -3, -3, -3, -3, -2, -2, -2, -1, -1, 0, 0, 0, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, -4, -4, -4, -3, -3, -3, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -4, -4, -3, -3, -3, -3, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, -4, -4, -3, -3, -3, -3, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -4, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -3, -3, -3, -3, -3, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, -4, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, -3, -3, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -3, -2, -2, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, -2, -2, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 }, { -8, -8, -8, -8, -8, -8, -9, -9, -9, -9, -8, -8, -7, -7, -6, -3, -9, -9, -8, -8, -9, -9, -10, -10, -9, -9, -9, -8, -8, -8, -7, -5, -9, -9, -9, -9, -9, -10, -10, -10, -9, -9, -9, -8, -8, -8, -8, -7, -9, -9, -9, -9, -9, -9, -9, -9, -8, -8, -8, -8, -8, -8, -8, -8, -9, -8, -9, -9, -8, -8, -8, -7, -7, -7, -6, -7, -7, -8, -8, -8, -8, -8, -8, -8, -7, -7, -6, -6, -5, -5, -6, -6, -7, -8, -8, -8, -8, -7, -7, -7, -6, -5, -4, -4, -3, -4, -5, -6, -6, -7, -8, -8, -7, -6, -6, -5, -4, -3, -2, -2, -1, -2, -4, -5, -6, -7, -7, -8, -6, -5, -4, -3, -2, -1, 0, 1, 0, -1, -2, -4, -5, -6, -7, -7, -4, -3, -2, -1, 0, 1, 3, 3, 2, 1, -1, -3, -4, -5, -6, -7, -3, -2, 0, 1, 2, 4, 4, 4, 3, 2, 0, -1, -3, -4, -5, -6, -1, 0, 2, 3, 5, 5, 5, 5, 5, 3, 1, 0, -2, -3, -4, -5, 0, 2, 4, 6, 6, 6, 6, 6, 5, 4, 2, 0, -1, -2, -3, -3, 
2, 4, 5, 7, 7, 7, 7, 7, 6, 5, 3, 1, 0, -1, -2, -3, 3, 5, 6, 7, 7, 7, 7, 7, 6, 6, 4, 2, 1, 0, -1, -2, 4, 6, 7, 7, 7, 7, 7, 7, 6, 6, 5, 4, 3, 2, 1, 1, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -6, -6, -5, -4, -2, -7, -6, -6, -6, -6, -7, -8, -8, -8, -8, -7, -7, -7, -6, -5, -4, -7, -6, -7, -7, -7, -8, -8, -8, -8, -8, -7, -7, -7, -7, -6, -6, -7, -6, -7, -7, -7, -7, -7, -7, -7, -7, -6, -6, -6, -7, -7, -7, -7, -6, -7, -7, -7, -7, -6, -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -6, -6, -6, -6, -6, -5, -5, -5, -5, -5, -5, -6, -6, -7, -7, -7, -6, -5, -5, -5, -5, -4, -3, -3, -3, -4, -5, -6, -6, -7, -7, -6, -6, -5, -4, -4, -3, -2, -1, -1, -2, -3, -5, -6, -6, -7, -7, -5, -4, -4, -3, -2, -1, 0, 1, 0, -1, -2, -4, -5, -6, -6, -6, -4, -3, -2, -2, 0, 1, 2, 2, 2, 0, -1, -3, -4, -5, -5, -6, -3, -2, -1, 0, 1, 3, 4, 4, 3, 1, 0, -2, -3, -4, -4, -5, -2, -1, 1, 2, 3, 4, 5, 5, 4, 2, 0, -1, -2, -3, -3, -4, -1, 0, 2, 4, 5, 5, 6, 5, 5, 3, 1, -1, -2, -2, -2, -3, 0, 2, 4, 5, 6, 6, 6, 6, 5, 4, 2, 0, -1, -2, -2, -2, 2, 4, 5, 6, 7, 6, 6, 6, 5, 5, 3, 1, 0, -1, -1, -2, 3, 5, 6, 6, 7, 6, 6, 6, 5, 5, 4, 3, 2, 1, 0, 0, -4, -4, -4, -5, -4, -5, -5, -5, -5, -6, -6, -5, -5, -4, -3, -1, -5, -5, -5, -5, -5, -6, -6, -6, -6, -7, -6, -6, -6, -5, -4, -3, -5, -5, -5, -6, -6, -6, -7, -7, -7, -7, -6, -6, -6, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -7, -6, -6, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -6, -6, -6, -6, -5, -5, -5, -5, -5, -6, -6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -5, -5, -6, -6, -6, -5, -5, -5, -4, -4, -4, -3, -3, -3, -3, -4, -5, -5, -6, -6, -6, -5, -5, -4, -4, -4, -3, -2, -1, -1, -2, -3, -4, -5, -6, -6, -6, -5, -4, -3, -3, -2, -2, -1, 0, 0, -1, -2, -4, -5, -5, -5, -5, -4, -3, -3, -2, -1, 0, 2, 2, 2, 0, -1, -3, -4, -5, -5, -5, -3, -2, -1, 0, 0, 2, 3, 3, 3, 1, -1, -2, -3, -3, -4, -4, -2, -1, 0, 1, 2, 3, 4, 4, 3, 1, 0, -2, -2, -2, -3, -3, -1, 0, 1, 3, 4, 5, 5, 4, 4, 2, 0, -1, -2, -2, -2, -3, 0, 1, 3, 4, 5, 5, 5, 5, 4, 3, 1, -1, -2, -2, -2, -2, 1, 3, 4, 5, 5, 6, 5, 5, 4, 3, 2, 1, -1, -1, -1, -2, 
2, 4, 5, 5, 6, 5, 5, 4, 4, 4, 3, 2, 1, 0, 0, -1, -3, -3, -3, -4, -3, -4, -4, -4, -4, -4, -5, -4, -4, -3, -2, -1, -4, -3, -4, -4, -4, -5, -5, -5, -5, -5, -5, -5, -4, -4, -3, -2, -4, -4, -4, -4, -5, -5, -6, -6, -6, -6, -6, -5, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -6, -6, -6, -5, -5, -4, -4, -4, -5, -5, -4, -4, -4, -4, -4, -5, -5, -5, -5, -5, -4, -4, -4, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -5, -5, -5, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -4, -4, -5, -5, -5, -5, -5, -4, -4, -3, -3, -3, -2, -1, -1, -2, -3, -4, -5, -5, -5, -5, -4, -4, -3, -3, -2, -2, -1, 0, 0, -1, -3, -4, -4, -5, -5, -5, -4, -3, -2, -2, -1, 0, 1, 2, 1, 0, -2, -3, -4, -4, -4, -4, -3, -2, -2, -1, 0, 1, 2, 3, 2, 0, -1, -2, -3, -3, -3, -4, -2, -1, -1, 0, 1, 3, 3, 4, 3, 1, -1, -2, -2, -2, -3, -3, -2, -1, 0, 2, 3, 4, 4, 4, 3, 2, 0, -2, -2, -2, -2, -3, -1, 0, 2, 3, 4, 5, 5, 4, 3, 2, 0, -1, -2, -2, -2, -2, 0, 2, 3, 4, 5, 5, 5, 4, 3, 2, 1, 0, -1, -1, -1, -2, 1, 3, 4, 5, 5, 5, 4, 4, 3, 2, 1, 0, 0, 0, -1, -1, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -3, -2, -2, -1, -2, -3, -3, -3, -4, -4, -4, -4, -5, -5, -5, -4, -3, -3, -2, -1, -3, -3, -3, -3, -4, -4, -5, -5, -5, -5, -5, -4, -3, -3, -3, -3, -3, -3, -3, -3, -4, -4, -5, -5, -5, -5, -4, -3, -3, -4, -4, -4, -3, -3, -3, -3, -4, -4, -4, -5, -5, -4, -3, -3, -4, -4, -4, -4, -3, -3, -3, -3, -3, -4, -4, -4, -4, -3, -3, -3, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -2, -2, -3, -3, -4, -4, -4, -4, -4, -4, -3, -3, -3, -2, -2, -1, -1, -2, -3, -4, -4, -4, -4, -4, -4, -3, -3, -2, -2, -2, -1, 0, 0, -1, -3, -3, -4, -4, -4, -4, -3, -3, -2, -2, -1, -1, 0, 1, 1, 0, -2, -3, -3, -4, -4, -4, -3, -2, -2, -1, 0, 0, 2, 2, 2, 0, -1, -2, -2, -3, -3, -3, -2, -2, -1, 0, 1, 2, 3, 3, 2, 0, -1, -2, -2, -2, -3, -3, -2, -1, 0, 1, 2, 3, 4, 3, 2, 1, -1, -2, -2, -2, -2, -3, -1, 0, 1, 2, 4, 4, 4, 4, 3, 1, 0, -1, -1, -2, -2, -2, 0, 1, 2, 4, 4, 4, 4, 3, 2, 1, 0, 0, -1, -1, -2, -2, 1, 2, 3, 4, 4, 4, 4, 3, 2, 1, 0, 0, 0, -1, -1, -1, -1, -2, -2, -3, -2, -2, -2, 
-3, -3, -3, -3, -3, -2, -2, -1, 0, -1, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4, -3, -2, -2, -2, -1, -2, -2, -2, -2, -3, -4, -4, -4, -5, -4, -4, -3, -2, -2, -3, -2, -2, -2, -2, -2, -3, -3, -4, -5, -5, -4, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -3, -3, -3, -4, -4, -4, -3, -2, -3, -3, -3, -3, -3, -2, -2, -2, -2, -3, -3, -3, -4, -3, -2, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -3, -3, -3, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -1, -1, -2, -2, -3, -3, -3, -4, -4, -3, -3, -2, -2, -2, -1, -1, 0, 0, -1, -2, -3, -3, -3, -3, -3, -3, -2, -2, -1, -1, -1, 0, 1, 1, -1, -2, -2, -3, -3, -3, -3, -2, -2, -1, -1, 0, 0, 1, 2, 1, 0, -1, -2, -2, -3, -3, -3, -2, -1, -1, 0, 0, 1, 2, 2, 1, 0, -1, -2, -2, -2, -2, -3, -2, -1, 0, 0, 2, 3, 3, 3, 2, 0, -1, -2, -2, -2, -2, -2, -2, -1, 0, 2, 3, 3, 3, 3, 2, 0, 0, -1, -1, -2, -2, -2, -1, 0, 2, 3, 3, 3, 3, 2, 1, 1, 0, -1, -1, -1, -2, -2, 0, 2, 3, 3, 3, 3, 3, 2, 1, 1, 0, -1, -1, -1, -1, -2, 0, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 0, -1, -1, -2, -2, -3, -2, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -1, -1, -1, -2, -3, -3, -4, -4, -3, -3, -2, -2, -2, -2, -2, -1, -1, -1, -1, -2, -2, -3, -4, -4, -3, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -3, -4, -3, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -2, -2, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, -3, -3, -3, -3, -2, -2, -1, -1, -1, -1, 0, 0, -1, -2, -2, -2, -3, -3, -3, -3, -2, -2, -1, -1, 0, 0, 1, 1, -1, -1, -2, -2, -2, -3, -3, -2, -2, -1, -1, 0, 0, 1, 1, 1, 0, -1, -2, -2, -2, -3, -3, -2, -1, -1, 0, 0, 1, 1, 1, 1, 0, -1, -2, -2, -2, -2, -3, -2, -1, -1, 0, 1, 2, 2, 2, 1, 0, -1, -1, -2, -2, -2, -2, -2, -1, 0, 1, 2, 2, 2, 2, 1, 0, 0, -1, -1, -2, -2, -2, -1, 0, 1, 2, 3, 2, 2, 1, 1, 0, 0, -1, -1, -1, -2, -2, 0, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0, -1, -1, -1, -2, -2, 0, -1, -2, -2, -2, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, 
-2, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -2, -2, -3, -2, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -2, -2, -2, -1, 0, 0, 0, 0, -1, -1, -2, -2, -1, -1, -1, -1, -2, -2, -2, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, 0, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, 0, 1, 1, 0, 0, -1, -2, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 1, 1, 1, 1, 0, -1, -1, -2, -2, -2, -2, -1, 0, 0, 0, 1, 1, 1, 1, 1, 0, -1, -1, -1, -2, -2, -2, -1, 0, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, -1, -2, -2, -1, -1, 1, 1, 2, 2, 2, 1, 1, 0, 0, -1, -1, -2, -2, -2, -1, 0, 1, 2, 2, 2, 1, 1, 0, 0, -1, -1, -1, -1, -2, -2, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, 2, 2, 1, 0, 0, -1, -1, -2, -1, -1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 0, 0, -1, -1, -2, -1, -1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 0, 0, -1, -1, -2, -2, -1, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0, -1, -1, -1, -2, -2, -1, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0, -1, -1, -1, -2, -2, -1, 0, 1, 1, 1, 2, 1, 1, 0, 0, -1, -1, -1, -2, -2, -1, 0, 1, 1, 2, 2, 1, 0, 0, 0, -1, -1, -1, -1, -2, -2, 0, 0, -1, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, 0, 0, -1, -1, -2, -2, -2, -1, -2, -2, -1, -1, -1, 0, 1, 1, 1, 1, 1, 0, -1, -1, -1, -2, -2, -1, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, -2, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -2, -1, -1, 0, 0, 0, 
-1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, -1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, -1, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 0, -1, -1, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 2, 1, 0, 0, -1, -1, -1, 0, 1, 1, 1, 2, 2, 3, 3, 3, 2, 1, 0, 0, -1, -1, -1, 0, 1, 1, 1, 2, 3, 3, 3, 3, 2, 1, 0, -1, -1, -1, -1, 0, 0, 1, 1, 2, 3, 3, 2, 2, 2, 0, -1, -1, -1, -1, -2, -1, 0, 1, 1, 2, 2, 2, 1, 1, 0, 0, -1, -1, -1, -2, -1, 0, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -2, -2, 0, 0, -1, -1, -2, -2, -2, -2, -2, -1, -2, -2, -1, -1, -1, 0, 1, 1, 1, 0, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 1, 1, 1, 1, 1, 0, -1, -2, -2, -3, -2, -2, -2, -1, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 0, -1, -2, -2, -2, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, -1, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 2, 2, 1, 1, 0, -1, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 3, 2, 1, 0, 0, 0, -1, 0, 1, 2, 2, 3, 3, 4, 4, 3, 3, 1, 0, 0, 0, 0, -1, 0, 1, 1, 2, 3, 3, 3, 3, 3, 2, 1, 0, 0, 0, -1, -1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0, 0, -1, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 1, 0, 0, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, 0, 1, 1, 1, 1, 0, -1, -2, -2, -3, -3, -3, -3, -2, -2, -1, 0, 1, 1, 1, 1, 1, 0, -1, -2, -3, -3, -3, -3, -3, -2, -1, 0, 2, 2, 2, 1, 1, 1, 0, -2, -3, -3, -3, -3, -2, -1, 0, 0, 1, 1, 2, 1, 2, 1, 0, -2, -3, -3, -3, -2, -1, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, -1, -2, -3, -2, -1, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0, -2, -2, -1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 2, 0, -1, -1, 0, 1, 1, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 2, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 2, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 1, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 0, 0, 1, 2, 3, 
3, 3, 3, 4, 4, 4, 3, 2, 1, 1, 1, 0, 0, 1, 2, 2, 3, 3, 4, 5, 5, 4, 3, 2, 1, 1, 0, 0, -1, 0, 1, 2, 3, 3, 4, 4, 4, 3, 3, 2, 1, 0, 0, 0, -1, 0, 1, 2, 2, 3, 3, 3, 3, 2, 2, 1, 0, 0, 0, -1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 1, 0, 0, -1, -2, -2, -3, -3, -3, -3, -3, -3, -2, -2, -1, -1, 1, 1, 1, 1, 0, -1, -2, -3, -3, -4, -4, -3, -3, -2, -1, -1, 2, 2, 2, 1, 1, 0, -1, -3, -4, -4, -4, -4, -3, -2, -1, 0, 2, 2, 2, 2, 2, 1, -1, -2, -4, -4, -4, -4, -2, -1, 0, 0, 2, 2, 2, 2, 2, 1, 0, -2, -3, -4, -4, -3, -1, 0, 0, 0, 2, 2, 3, 3, 2, 1, 0, -1, -3, -3, -3, -2, 0, 0, 1, 1, 2, 2, 3, 3, 3, 3, 1, -1, -2, -2, -1, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 2, 0, -1, -1, 0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 3, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 3, 4, 4, 3, 2, 3, 3, 3, 2, 2, 2, 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 2, 2, 2, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 4, 3, 2, 2, 1, 1, 0, 2, 3, 3, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 1, 0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 4, 4, 2, 1, 1, 0, 0, -1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 1, 0, 0, 0, -1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, -1, -2, -3, -3, -3, -3, -3, -4, -4, -3, -2, -2, -1, 2, 1, 1, 1, 0, -1, -3, -4, -4, -4, -4, -4, -4, -3, -2, -1, 2, 2, 2, 2, 1, 0, -2, -3, -5, -5, -5, -5, -4, -3, -2, -1, 3, 2, 2, 2, 2, 1, -1, -3, -4, -5, -5, -4, -3, -2, -1, 0, 3, 3, 3, 2, 2, 1, -1, -2, -4, -5, -4, -3, -2, -1, 0, 1, 3, 3, 3, 3, 3, 1, 0, -2, -3, -4, -3, -2, -1, 0, 1, 1, 3, 3, 3, 4, 4, 3, 1, -1, -2, -3, -2, -1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 3, 2, 0, -1, -1, -1, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 3, 2, 1, 1, 2, 2, 3, 2, 2, 1, 3, 3, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 3, 3, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 4, 3, 2, 2, 2, 1, 2, 3, 4, 4, 5, 5, 6, 6, 5, 5, 4, 3, 2, 2, 1, 0, 2, 3, 3, 4, 5, 5, 5, 5, 5, 4, 3, 2, 2, 1, 1, 0, 1, 2, 3, 3, 3, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, -1, 0, 1, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, -1, -1, -2, -3, -4, -4, -4, -4, -5, -5, -4, -3, -3, -2, 2, 2, 1, 0, -1, -2, -3, -5, -5, -5, -5, 
-5, -4, -4, -3, -1, 3, 3, 2, 2, 1, -1, -2, -4, -5, -6, -6, -6, -5, -4, -3, -1, 4, 3, 3, 2, 2, 0, -2, -3, -5, -6, -6, -5, -4, -3, -1, 0, 5, 4, 4, 3, 2, 1, -1, -3, -4, -5, -5, -4, -3, -1, 0, 1, 5, 4, 4, 4, 3, 1, 0, -2, -3, -4, -4, -3, -1, 0, 1, 2, 4, 4, 4, 4, 4, 2, 1, -1, -2, -3, -2, -1, 0, 1, 2, 2, 4, 4, 5, 5, 4, 4, 2, 0, 0, -1, -1, 0, 1, 2, 2, 2, 4, 4, 5, 5, 5, 4, 3, 2, 1, 1, 2, 3, 3, 4, 3, 2, 4, 4, 5, 5, 5, 5, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 4, 4, 5, 5, 5, 5, 5, 4, 4, 5, 5, 4, 4, 4, 3, 3, 3, 4, 5, 5, 6, 6, 5, 5, 5, 6, 5, 5, 4, 3, 3, 3, 2, 3, 4, 5, 5, 5, 6, 6, 6, 6, 5, 5, 4, 3, 3, 3, 1, 3, 4, 4, 5, 5, 6, 6, 6, 6, 5, 4, 4, 3, 3, 2, 0, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, -1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, -1, -2, -3, -4, -5, -5, -6, -6, -6, -6, -5, -4, -4, -3, 2, 2, 1, 0, -1, -3, -4, -5, -6, -6, -6, -6, -6, -5, -4, -3, 3, 3, 3, 2, 1, -1, -3, -5, -6, -7, -7, -7, -6, -5, -4, -2, 5, 4, 3, 3, 2, 0, -2, -4, -5, -6, -7, -6, -5, -4, -2, -1, 6, 5, 4, 3, 2, 1, -1, -3, -4, -5, -6, -5, -4, -2, -1, 0, 6, 6, 5, 4, 3, 2, 0, -2, -3, -4, -4, -3, -2, 0, 1, 2, 6, 6, 6, 5, 5, 2, 1, -1, -2, -3, -3, -2, 0, 1, 2, 3, 6, 6, 6, 6, 5, 4, 2, 0, 0, -1, -1, 0, 1, 2, 3, 3, 6, 6, 6, 6, 6, 5, 4, 2, 1, 0, 1, 3, 3, 4, 4, 3, 5, 6, 6, 6, 6, 6, 5, 4, 4, 3, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 6, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 4, 5, 6, 6, 7, 7, 6, 6, 6, 7, 6, 6, 5, 5, 5, 5, 4, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 6, 6, 5, 5, 5, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7, 6, 6, 5, 5, 5, 4, 1, 3, 4, 4, 5, 5, 5, 6, 5, 5, 4, 4, 4, 4, 4, 4, -1, 1, 2, 3, 3, 3, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2 }, { -7, -12, -16, -17, -18, -18, -18, -18, -18, -18, -18, -18, -18, -17, -13, -10, -6, -10, -13, -14, -14, -14, -14, -13, -13, -14, -13, -13, -13, -11, -8, -5, -5, -8, -10, -11, -11, -10, -10, -10, -9, -9, -9, -9, -9, -7, -4, -2, -4, -7, -8, -8, -8, -8, -7, -7, -7, -7, -7, -7, -7, -6, -3, 0, -3, -6, -7, -7, -6, -6, -6, -6, -6, -6, -6, -5, -5, -4, -1, 1, -3, -4, -5, -5, -5, -5, -5, -5, -5, -4, -4, -3, -3, -2, 1, 3, -3, -3, 
-3, -4, -4, -5, -5, -4, -3, -2, -2, -1, -1, 0, 2, 5, -2, -2, -2, -3, -3, -4, -3, -3, -2, 0, 0, 1, 1, 2, 4, 7, -1, -1, -2, -2, -2, -2, -2, -1, 0, 2, 2, 3, 3, 4, 7, 9, 0, -1, -1, -1, -1, -1, 0, 1, 3, 4, 5, 5, 5, 6, 9, 11, 0, 0, 0, 1, 1, 0, 1, 3, 5, 6, 7, 7, 8, 9, 11, 13, 1, 1, 1, 2, 2, 3, 4, 6, 8, 9, 9, 10, 10, 12, 13, 16, 2, 2, 3, 3, 4, 6, 7, 9, 10, 12, 12, 12, 13, 14, 16, 18, 3, 3, 4, 5, 6, 8, 10, 12, 13, 14, 15, 15, 16, 17, 19, 20, 4, 4, 5, 7, 9, 11, 13, 15, 16, 17, 18, 18, 19, 20, 21, 22, 6, 6, 7, 9, 11, 13, 15, 17, 19, 20, 20, 21, 21, 23, 24, 24, -3, -7, -12, -14, -15, -15, -15, -16, -17, -17, -17, -17, -17, -17, -15, -12, -2, -6, -10, -11, -11, -12, -12, -12, -12, -13, -13, -13, -13, -12, -10, -7, -3, -6, -8, -9, -9, -9, -9, -9, -9, -9, -10, -10, -9, -8, -6, -4, -3, -6, -7, -8, -8, -8, -7, -7, -7, -7, -8, -8, -7, -6, -5, -2, -3, -5, -6, -7, -7, -6, -6, -6, -6, -6, -6, -6, -6, -5, -3, -1, -3, -5, -6, -6, -6, -5, -5, -5, -4, -4, -4, -4, -4, -3, -1, 1, -2, -4, -5, -5, -5, -4, -4, -4, -3, -3, -2, -2, -2, -1, 0, 2, -2, -3, -3, -4, -3, -4, -3, -3, -1, 0, 0, 0, 0, 1, 2, 3, -2, -2, -2, -3, -3, -3, -2, -1, 1, 2, 2, 2, 2, 2, 3, 5, -1, -1, -1, -2, -2, -1, 0, 1, 3, 4, 4, 4, 3, 3, 5, 7, 0, -1, -1, -1, -1, 0, 1, 3, 5, 6, 6, 5, 5, 6, 7, 9, 0, 0, 0, 1, 1, 2, 3, 5, 7, 7, 7, 7, 7, 8, 9, 11, 1, 1, 1, 2, 3, 4, 5, 7, 9, 9, 9, 9, 9, 10, 11, 13, 1, 1, 2, 3, 5, 6, 8, 9, 11, 11, 11, 11, 12, 13, 14, 16, 2, 2, 4, 5, 7, 9, 11, 12, 13, 14, 14, 14, 15, 16, 17, 19, 4, 4, 5, 7, 9, 11, 13, 15, 16, 16, 16, 17, 18, 19, 20, 22, -2, -4, -7, -10, -11, -12, -13, -14, -15, -16, -17, -17, -17, -17, -16, -13, -1, -4, -7, -8, -9, -10, -10, -11, -11, -12, -13, -13, -13, -12, -11, -9, -1, -4, -7, -8, -8, -8, -8, -9, -9, -10, -10, -11, -10, -9, -8, -6, -1, -4, -6, -7, -7, -7, -7, -7, -8, -8, -9, -9, -8, -7, -6, -4, -1, -4, -6, -6, -7, -7, -6, -6, -6, -7, -7, -7, -6, -5, -4, -2, -1, -4, -5, -6, -6, -6, -5, -5, -5, -5, -5, -4, -4, -3, -2, -1, -1, -4, -5, -5, -5, -5, -5, -4, -3, -3, -3, -3, -2, -2, -1, 0, -1, 
-3, -4, -4, -4, -4, -3, -3, -1, -1, 0, 0, -1, -1, 0, 1, -1, -2, -3, -3, -3, -3, -2, -1, 1, 1, 1, 1, 1, 0, 1, 2, -1, -2, -2, -2, -2, -1, 0, 1, 3, 3, 3, 3, 2, 2, 2, 4, 0, -1, -1, -2, -2, 0, 1, 3, 4, 5, 4, 4, 3, 3, 4, 5, 1, 0, 0, -1, -1, 1, 2, 4, 5, 6, 6, 5, 4, 5, 6, 7, 2, 0, 0, 0, 1, 2, 4, 6, 7, 7, 6, 6, 6, 7, 8, 10, 3, 1, 1, 2, 3, 4, 6, 8, 8, 8, 8, 8, 8, 9, 11, 12, 4, 1, 3, 4, 4, 6, 8, 10, 10, 10, 11, 11, 11, 12, 13, 15, 5, 3, 4, 5, 7, 9, 11, 12, 13, 13, 13, 14, 14, 15, 16, 18, -1, -3, -6, -7, -9, -10, -12, -13, -15, -16, -17, -17, -17, -17, -16, -16, -1, -3, -5, -7, -8, -9, -10, -11, -12, -13, -13, -14, -14, -13, -12, -12, 0, -3, -5, -6, -7, -8, -8, -9, -10, -11, -11, -12, -11, -11, -10, -9, 0, -2, -4, -6, -6, -7, -7, -8, -8, -9, -10, -10, -9, -9, -7, -7, 0, -2, -4, -5, -6, -6, -6, -6, -7, -8, -8, -8, -7, -6, -5, -5, 1, -2, -4, -5, -5, -5, -5, -5, -5, -6, -6, -5, -5, -4, -4, -4, 1, -1, -4, -5, -5, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, 2, -1, -3, -4, -4, -3, -3, -2, -2, -1, -1, -1, -2, -2, -2, -2, 2, -2, -2, -3, -2, -2, -2, -1, 0, 1, 1, 0, 0, -1, -1, -1, 2, -1, -2, -2, -2, -1, 0, 1, 2, 2, 2, 1, 1, 0, 0, 1, 4, 0, -1, -2, -1, 0, 1, 3, 3, 3, 3, 2, 1, 1, 2, 2, 5, 1, -1, -1, -1, 1, 2, 4, 4, 4, 4, 3, 2, 2, 3, 4, 6, 2, 0, -1, 0, 2, 3, 5, 5, 5, 4, 4, 4, 4, 5, 6, 6, 2, 1, 1, 2, 3, 5, 6, 6, 6, 5, 5, 5, 6, 8, 9, 7, 3, 2, 2, 3, 5, 6, 7, 7, 7, 7, 7, 8, 9, 10, 11, 7, 5, 3, 4, 5, 7, 8, 9, 9, 10, 10, 10, 10, 12, 13, 14, 1, -2, -5, -7, -8, -9, -11, -13, -14, -16, -17, -18, -18, -17, -16, -16, 1, -1, -4, -6, -7, -8, -10, -11, -13, -14, -15, -15, -15, -14, -14, -14, 1, -1, -4, -5, -6, -7, -8, -9, -11, -12, -13, -13, -12, -12, -12, -12, 1, -1, -3, -4, -5, -6, -7, -8, -9, -10, -11, -11, -11, -10, -10, -10, 2, 0, -2, -4, -5, -5, -5, -6, -8, -9, -9, -9, -9, -8, -8, -8, 2, 0, -2, -3, -4, -4, -4, -5, -6, -7, -7, -7, -6, -6, -6, -7, 3, 1, -2, -3, -4, -4, -4, -4, -4, -5, -4, -4, -4, -4, -5, -6, 3, 1, -1, -3, -3, -3, -2, -2, -2, -2, -2, -3, -3, -3, -4, -5, 4, 2, -1, -2, -2, -2, -1, 0, 0, 
0, -1, -2, -2, -2, -3, -3, 5, 2, -1, -2, -2, -1, 1, 1, 1, 1, 0, -1, -1, -1, -2, -2, 6, 3, 0, -1, -1, 0, 2, 2, 2, 2, 1, 0, 0, 0, -1, -1, 7, 4, 0, -1, -1, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 1, 8, 5, 1, 0, 0, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, 3, 9, 6, 3, 0, 1, 3, 4, 4, 4, 4, 3, 3, 3, 4, 4, 5, 9, 7, 4, 1, 3, 4, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 8, 5, 3, 4, 5, 6, 6, 6, 6, 7, 7, 8, 8, 10, 10, 2, 0, -3, -4, -6, -8, -10, -12, -14, -15, -16, -17, -17, -16, -16, -16, 2, 0, -2, -3, -5, -7, -9, -11, -12, -14, -15, -15, -15, -15, -14, -14, 3, 0, -2, -3, -4, -5, -8, -9, -11, -12, -13, -13, -13, -13, -13, -14, 3, 1, -1, -2, -3, -5, -7, -8, -10, -11, -12, -12, -12, -12, -12, -13, 3, 1, -1, -2, -3, -4, -5, -7, -8, -9, -10, -10, -10, -10, -11, -11, 3, 2, 0, -2, -3, -4, -4, -5, -7, -8, -8, -8, -8, -8, -9, -9, 4, 2, 0, -1, -2, -3, -3, -4, -5, -6, -6, -6, -6, -7, -7, -8, 4, 3, 1, 0, -2, -3, -2, -3, -3, -3, -3, -4, -5, -5, -6, -7, 5, 4, 2, 0, -1, -1, -1, -1, -1, -2, -2, -3, -4, -4, -5, -6, 6, 4, 3, 1, -1, -1, 0, 0, 0, -1, -2, -2, -3, -3, -4, -5, 7, 6, 3, 1, -1, 0, 1, 1, 1, 0, -1, -1, -2, -2, -3, -4, 9, 7, 4, 1, 0, 1, 2, 2, 1, 0, 0, 0, -1, -1, -2, -2, 10, 7, 4, 2, 1, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 11, 8, 5, 3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 1, 2, 11, 9, 7, 4, 2, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 11, 9, 7, 5, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, 7, 2, 0, -1, -2, -3, -6, -8, -10, -12, -14, -15, -15, -16, -15, -15, -15, 2, 0, -1, -1, -2, -4, -7, -9, -12, -13, -14, -15, -15, -14, -14, -14, 3, 1, 0, -1, -2, -3, -6, -8, -11, -12, -13, -13, -13, -13, -13, -14, 3, 2, 1, 0, -1, -2, -5, -7, -9, -11, -12, -12, -12, -12, -12, -13, 3, 2, 1, 0, -1, -2, -4, -6, -8, -9, -10, -10, -10, -11, -11, -11, 4, 2, 1, 0, -1, -2, -3, -5, -7, -8, -8, -8, -9, -9, -10, -10, 4, 2, 1, 1, 0, -2, -3, -4, -5, -6, -6, -7, -7, -8, -8, -9, 5, 3, 2, 1, 0, -1, -2, -3, -3, -4, -5, -5, -6, -7, -7, -8, 6, 5, 4, 3, 1, 0, -1, -1, -2, -3, -4, -4, -5, -6, -6, -7, 7, 6, 5, 4, 2, 0, 0, 0, -1, -2, -3, -4, -4, -5, -5, -6, 8, 7, 6, 5, 2, 1, 1, 1, -1, 
-2, -2, -3, -3, -4, -5, -5, 10, 8, 7, 5, 3, 2, 2, 1, 0, -1, -1, -2, -3, -3, -4, -4, 11, 9, 8, 6, 4, 2, 2, 1, 0, 0, -1, -1, -1, -2, -3, -4, 12, 10, 9, 7, 4, 3, 3, 2, 1, 0, 0, 0, 0, -1, -1, -2, 12, 11, 10, 8, 5, 4, 3, 2, 2, 1, 1, 1, 1, 1, 0, 0, 13, 11, 10, 8, 6, 4, 4, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 0, 0, 0, -1, -2, -5, -8, -11, -12, -13, -13, -14, -14, -14, -14, 2, 1, 1, 0, 0, -1, -4, -7, -10, -12, -13, -13, -14, -14, -14, -14, 3, 2, 1, 1, 0, -1, -2, -5, -9, -11, -12, -13, -13, -13, -13, -13, 4, 2, 2, 1, 1, 0, -1, -4, -8, -10, -11, -11, -11, -12, -12, -13, 4, 3, 2, 2, 1, 0, -1, -2, -7, -9, -9, -10, -10, -11, -11, -11, 4, 3, 3, 2, 2, 1, 0, -2, -5, -7, -8, -8, -9, -9, -10, -10, 5, 3, 3, 2, 2, 1, 0, -1, -4, -6, -6, -7, -7, -8, -9, -9, 5, 4, 3, 3, 3, 2, 0, 0, -2, -4, -4, -5, -6, -7, -7, -8, 7, 6, 5, 4, 4, 3, 1, 0, -1, -3, -4, -4, -5, -6, -6, -7, 8, 7, 6, 5, 5, 3, 2, 1, -1, -2, -3, -4, -4, -5, -6, -6, 9, 8, 7, 7, 6, 5, 2, 1, 0, -2, -2, -3, -4, -4, -5, -5, 10, 9, 8, 8, 7, 5, 3, 2, 0, -2, -2, -3, -3, -4, -4, -5, 11, 10, 10, 9, 8, 6, 4, 2, 0, -1, -2, -2, -2, -3, -4, -4, 12, 11, 11, 10, 8, 6, 4, 2, 0, -1, -1, -1, -1, -2, -3, -3, 13, 12, 12, 10, 9, 7, 5, 2, 1, 0, 0, 0, 0, -1, -1, -1, 13, 12, 11, 10, 8, 7, 5, 3, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, -1, -1, -2, -5, -7, -9, -10, -11, -12, -13, -13, -14, 2, 1, 1, 1, 0, -1, -2, -3, -6, -9, -11, -12, -13, -13, -13, -14, 3, 2, 2, 1, 1, 0, -1, -2, -4, -8, -10, -11, -12, -13, -13, -14, 4, 3, 2, 2, 1, 0, -1, -2, -4, -6, -9, -10, -11, -12, -12, -13, 4, 3, 2, 2, 1, 1, 0, -2, -3, -6, -7, -9, -10, -11, -11, -11, 5, 3, 3, 2, 2, 1, 0, -1, -3, -4, -6, -7, -9, -10, -10, -10, 5, 4, 3, 3, 2, 1, 0, -1, -2, -3, -5, -6, -8, -9, -9, -9, 6, 5, 4, 4, 3, 2, 1, -1, -1, -2, -4, -5, -6, -7, -8, -8, 7, 6, 5, 5, 4, 3, 3, 0, 0, -2, -3, -4, -5, -6, -7, -7, 8, 7, 7, 6, 5, 5, 4, 1, 0, -1, -3, -3, -4, -5, -6, -6, 9, 8, 8, 7, 7, 6, 5, 2, 0, -1, -2, -3, -4, -4, -5, -6, 11, 10, 9, 9, 8, 7, 6, 2, 0, -1, -2, -2, -3, -4, -4, -5, 12, 11, 11, 10, 9, 8, 6, 3, 0, -1, -1, -2, 
-3, -3, -4, -4, 13, 12, 12, 11, 10, 9, 7, 4, 1, 0, -1, -1, -2, -3, -3, -3, 14, 13, 13, 12, 10, 9, 7, 4, 1, 0, 0, 0, -1, -2, -2, -3, 13, 13, 12, 12, 10, 9, 7, 5, 2, 1, 0, 0, -1, -1, -1, -2, 0, -1, -1, -1, -2, -2, -2, -3, -4, -5, -7, -8, -10, -12, -13, -14, 1, 0, 0, -1, -1, -2, -2, -2, -3, -4, -6, -9, -11, -13, -13, -14, 2, 1, 1, 0, 0, -1, -2, -2, -2, -3, -5, -8, -10, -12, -13, -14, 3, 2, 1, 1, 0, -1, -1, -2, -2, -3, -4, -6, -9, -11, -12, -13, 4, 3, 2, 1, 1, 0, -1, -1, -2, -2, -3, -5, -8, -10, -11, -12, 4, 3, 2, 2, 1, 0, 0, -1, -1, -1, -1, -5, -8, -10, -10, -11, 5, 4, 3, 2, 2, 1, 0, -1, -1, 0, -1, -4, -7, -9, -9, -10, 6, 5, 4, 3, 2, 2, 1, 0, 0, 0, -1, -4, -6, -8, -8, -9, 7, 6, 5, 4, 4, 3, 2, 2, 1, 1, -1, -3, -5, -7, -7, -8, 8, 7, 7, 6, 5, 5, 4, 3, 2, 1, 0, -2, -4, -6, -6, -7, 10, 8, 8, 8, 7, 6, 5, 4, 3, 1, -1, -2, -4, -5, -5, -6, 11, 10, 10, 9, 9, 8, 6, 5, 3, 1, 0, -2, -3, -4, -4, -6, 12, 12, 11, 11, 10, 9, 7, 6, 3, 1, 0, -1, -3, -3, -4, -5, 13, 13, 13, 12, 11, 10, 8, 6, 4, 2, 0, -1, -2, -3, -3, -5, 14, 14, 13, 13, 12, 11, 9, 7, 5, 2, 1, -1, -2, -2, -3, -4, 14, 14, 14, 13, 12, 11, 9, 7, 5, 3, 1, 0, -1, -2, -3, -3, -2, -3, -3, -3, -3, -3, -3, -3, -4, -4, -5, -7, -9, -11, -13, -14, -1, -2, -2, -2, -3, -3, -3, -3, -3, -3, -3, -5, -8, -11, -12, -14, 0, -1, -1, -1, -2, -2, -3, -3, -3, -3, -2, -3, -7, -10, -12, -13, 1, 0, 0, -1, -1, -2, -3, -3, -3, -2, -1, -2, -6, -9, -11, -13, 2, 1, 0, 0, -1, -2, -2, -3, -2, -2, -1, -1, -5, -8, -10, -12, 3, 2, 1, 0, 0, -1, -2, -2, -2, -1, 0, -1, -5, -7, -10, -11, 4, 3, 2, 1, 0, 0, -1, -2, -1, 0, 1, -1, -4, -7, -9, -11, 5, 4, 3, 2, 1, 0, 0, 0, 0, 1, 1, -1, -4, -6, -8, -10, 6, 5, 4, 3, 2, 2, 1, 1, 2, 2, 1, -1, -3, -5, -7, -9, 7, 6, 5, 5, 4, 4, 4, 3, 3, 2, 1, 0, -2, -4, -6, -8, 9, 8, 7, 7, 7, 6, 5, 5, 4, 3, 2, 0, -2, -4, -5, -7, 10, 10, 9, 9, 9, 8, 6, 6, 4, 3, 2, 0, -1, -3, -5, -6, 12, 11, 11, 11, 10, 9, 8, 7, 5, 3, 2, 1, -1, -2, -4, -6, 13, 13, 13, 12, 11, 10, 9, 7, 6, 4, 2, 1, -1, -2, -4, -5, 14, 14, 14, 13, 12, 11, 10, 8, 6, 4, 2, 1, -1, 
-2, -3, -5, 16, 15, 15, 14, 13, 12, 10, 9, 7, 5, 3, 1, 0, -2, -3, -4, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -6, -7, -9, -10, -12, -3, -4, -4, -4, -4, -4, -5, -5, -4, -4, -4, -4, -6, -9, -11, -12, -2, -2, -3, -3, -4, -4, -4, -4, -4, -3, -2, -2, -4, -8, -11, -12, -1, -1, -2, -2, -3, -4, -4, -4, -4, -3, -1, -1, -2, -7, -9, -12, 0, 0, -1, -2, -3, -3, -4, -4, -3, -2, 0, 0, -1, -6, -9, -11, 1, 0, -1, -1, -2, -3, -3, -3, -2, -1, 0, 1, -1, -5, -8, -11, 2, 1, 0, 0, -1, -2, -2, -2, -1, 0, 1, 1, -1, -4, -7, -10, 3, 2, 1, 1, 0, -1, -1, 0, 1, 1, 2, 1, -1, -4, -6, -9, 4, 3, 3, 2, 1, 1, 1, 1, 2, 3, 2, 1, -1, -3, -6, -9, 6, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 1, 0, -2, -5, -8, 7, 6, 6, 6, 6, 6, 5, 5, 4, 4, 3, 2, 0, -2, -4, -7, 8, 8, 8, 8, 8, 7, 7, 6, 5, 4, 3, 2, 0, -1, -4, -6, 10, 10, 10, 10, 10, 9, 8, 7, 6, 5, 3, 2, 1, -1, -4, -6, 12, 12, 12, 12, 11, 10, 9, 8, 7, 5, 3, 2, 1, -1, -3, -5, 14, 14, 14, 13, 12, 11, 10, 9, 7, 5, 3, 2, 1, -1, -3, -5, 16, 16, 16, 15, 14, 13, 11, 10, 8, 6, 4, 2, 0, -1, -3, -5, -7, -7, -7, -7, -7, -7, -8, -7, -7, -7, -7, -7, -6, -7, -7, -8, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -5, -5, -6, -7, -8, -10, -5, -5, -5, -5, -5, -6, -6, -6, -5, -4, -3, -3, -4, -6, -8, -10, -3, -4, -4, -4, -5, -5, -5, -5, -4, -3, -2, -2, -2, -5, -8, -10, -2, -2, -3, -3, -4, -5, -5, -4, -3, -2, 0, 0, -1, -3, -6, -9, -1, -1, -2, -3, -3, -4, -4, -3, -2, -1, 1, 1, 0, -2, -6, -9, 1, 0, -1, -2, -3, -3, -3, -2, -1, 0, 1, 1, 1, -1, -5, -9, 2, 1, 0, 0, -1, -1, -1, 0, 1, 1, 2, 2, 1, -1, -5, -8, 3, 2, 2, 1, 1, 1, 1, 2, 2, 3, 3, 2, 1, -1, -4, -7, 4, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 1, 0, -3, -7, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 3, 1, 0, -3, -7, 6, 6, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 0, -3, -6, 8, 8, 9, 9, 9, 9, 8, 7, 6, 5, 4, 3, 2, 0, -2, -6, 10, 10, 11, 11, 11, 10, 9, 8, 7, 6, 4, 3, 2, 0, -2, -5, 13, 13, 13, 13, 12, 11, 10, 9, 8, 6, 4, 3, 2, 0, -2, -5, 16, 16, 16, 15, 14, 13, 12, 11, 9, 7, 5, 3, 2, 0, -2, -5, -10, -10, -10, -11, -12, -12, -12, -12, -11, -11, -10, -8, -7, -6, -6, -6, -9, 
-8, -8, -9, -9, -10, -10, -9, -8, -8, -8, -8, -6, -6, -7, -7, -7, -7, -7, -7, -8, -8, -8, -7, -6, -5, -5, -5, -5, -5, -6, -8, -6, -6, -6, -6, -6, -7, -7, -6, -5, -3, -3, -3, -3, -4, -6, -8, -5, -5, -4, -5, -5, -5, -6, -5, -3, -2, -1, -1, -1, -2, -5, -8, -3, -3, -3, -4, -4, -4, -4, -3, -2, -1, 0, 0, 0, -1, -4, -7, -2, -2, -2, -3, -3, -3, -3, -2, 0, 1, 2, 2, 2, 0, -3, -6, 0, 0, -1, -1, -2, -2, -1, 0, 1, 2, 2, 2, 2, 1, -2, -6, 1, 1, 1, 0, 0, 1, 1, 2, 3, 3, 3, 3, 2, 1, -2, -6, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 3, 3, 3, 1, -2, -6, 4, 3, 4, 4, 5, 5, 5, 5, 4, 4, 4, 4, 3, 2, -2, -6, 5, 5, 6, 6, 6, 7, 7, 6, 5, 5, 5, 4, 3, 2, -2, -6, 6, 7, 8, 8, 8, 8, 8, 7, 6, 6, 5, 4, 3, 2, -2, -6, 8, 9, 10, 10, 10, 10, 9, 8, 7, 7, 6, 4, 3, 2, -1, -5, 11, 12, 13, 12, 12, 11, 10, 9, 8, 7, 6, 5, 3, 2, -2, -6, 15, 16, 16, 15, 14, 13, 12, 11, 10, 8, 6, 4, 3, 1, -2, -6, -15, -15, -15, -15, -16, -16, -16, -15, -15, -13, -11, -9, -7, -6, -5, -6, -13, -13, -13, -13, -13, -14, -14, -13, -12, -11, -10, -8, -7, -5, -6, -6, -12, -11, -11, -11, -11, -12, -11, -10, -8, -7, -7, -7, -6, -4, -5, -6, -11, -10, -9, -9, -9, -9, -9, -7, -5, -4, -4, -5, -4, -3, -4, -5, -9, -8, -7, -7, -7, -7, -7, -5, -3, -2, -2, -2, -3, -2, -3, -5, -8, -7, -6, -6, -6, -5, -5, -3, -2, 0, -1, -1, -1, -1, -3, -5, -6, -4, -4, -4, -4, -4, -3, -2, 0, 1, 1, 1, 1, 0, -1, -4, -3, -3, -2, -2, -2, -2, -1, 0, 2, 3, 2, 3, 3, 2, 0, -3, -2, -1, 0, 0, 0, 0, 1, 2, 3, 3, 3, 4, 4, 2, 1, -3, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 3, 1, -4, 0, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 4, 3, 1, -4, 2, 4, 5, 5, 6, 6, 7, 6, 6, 6, 6, 5, 5, 4, 1, -5, 4, 5, 7, 7, 8, 8, 8, 7, 7, 7, 6, 6, 5, 4, 1, -5, 5, 7, 8, 9, 9, 10, 9, 9, 8, 7, 7, 6, 5, 4, 1, -5, 9, 10, 11, 12, 12, 11, 11, 10, 9, 8, 7, 6, 5, 4, 0, -5, 13, 14, 15, 15, 14, 14, 13, 12, 11, 9, 8, 6, 5, 3, -1, -5, -20, -20, -19, -19, -19, -18, -18, -17, -16, -14, -12, -10, -7, -5, -4, -4, -18, -18, -17, -17, -17, -16, -16, -15, -13, -12, -10, -8, -7, -5, -4, -5, -17, -16, -15, -15, -15, -15, -14, -12, -10, -10, -9, -7, 
-6, -4, -3, -5, -15, -14, -13, -13, -13, -13, -12, -9, -7, -7, -6, -6, -5, -3, -2, -4, -14, -13, -12, -11, -11, -10, -9, -6, -4, -4, -4, -4, -3, -2, -1, -3, -12, -11, -10, -9, -8, -7, -6, -3, -2, -1, -2, -2, -1, -1, -1, -3, -10, -9, -8, -7, -6, -4, -3, -1, 0, 0, 0, 0, 0, 1, 0, -2, -8, -6, -5, -4, -3, -2, -1, 1, 2, 2, 2, 2, 2, 2, 1, -1, -6, -4, -2, -1, 0, 0, 1, 2, 4, 4, 3, 3, 3, 3, 2, 0, -4, -2, -1, 1, 2, 2, 3, 4, 5, 5, 5, 5, 5, 4, 3, 1, -1, 0, 1, 3, 3, 4, 4, 5, 6, 6, 6, 6, 6, 4, 4, 0, 0, 1, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7, 6, 5, 4, -1, 1, 3, 5, 6, 7, 7, 8, 8, 7, 7, 7, 7, 7, 6, 4, -2, 3, 5, 6, 8, 9, 9, 9, 9, 8, 8, 8, 7, 7, 6, 4, -3, 6, 8, 10, 11, 11, 12, 11, 10, 10, 9, 8, 7, 7, 6, 3, -4, 11, 12, 13, 14, 15, 14, 13, 13, 11, 10, 9, 8, 7, 6, 1, -4 }, { -26, -29, -31, -33, -34, -32, -30, -31, -30, -28, -27, -28, -30, -30, -26, -22, -21, -26, -29, -30, -29, -28, -26, -24, -24, -23, -22, -24, -27, -25, -22, -17, -16, -20, -26, -26, -25, -24, -22, -20, -19, -18, -20, -22, -24, -21, -17, -12, -13, -16, -21, -22, -21, -20, -18, -16, -15, -16, -18, -20, -21, -18, -13, -8, -10, -13, -17, -18, -17, -16, -15, -13, -12, -14, -16, -19, -18, -15, -10, -5, -8, -10, -12, -14, -13, -12, -11, -10, -10, -12, -15, -17, -16, -12, -7, -2, -6, -7, -9, -9, -9, -8, -7, -7, -8, -11, -14, -15, -13, -10, -5, -1, -4, -5, -6, -5, -5, -4, -3, -4, -6, -8, -11, -12, -11, -7, -3, 1, -1, -3, -3, -2, -1, 0, 0, -1, -4, -6, -9, -9, -7, -4, 0, 2, 2, 1, 1, 2, 2, 3, 2, 0, -3, -6, -8, -6, -4, -1, 0, 2, 5, 4, 4, 5, 5, 5, 4, 2, -1, -5, -6, -4, -2, 1, 2, 3, 8, 7, 7, 8, 8, 8, 6, 4, 0, -5, -6, -2, 0, 2, 4, 5, 11, 10, 10, 11, 10, 10, 8, 5, 1, -3, -5, -1, 1, 3, 4, 7, 15, 14, 14, 13, 13, 11, 9, 6, 2, -2, -3, 0, 2, 4, 6, 9, 20, 18, 18, 17, 15, 13, 10, 7, 1, -1, -1, 3, 6, 7, 10, 12, 25, 23, 22, 20, 17, 15, 11, 6, 0, -2, -1, 4, 7, 9, 12, 15, -27, -30, -33, -34, -35, -33, -31, -30, -28, -26, -24, -25, -27, -26, -23, -18, -23, -27, -30, -31, -31, -30, -27, -25, -23, -22, -21, -23, -25, -23, -19, -13, -19, -23, -27, -28, -27, 
-26, -24, -21, -20, -18, -19, -21, -22, -19, -16, -9, -15, -19, -24, -25, -24, -23, -21, -18, -16, -15, -17, -19, -19, -16, -12, -5, -11, -15, -20, -20, -20, -19, -18, -15, -13, -12, -15, -17, -16, -13, -8, -1, -9, -12, -15, -16, -16, -15, -14, -12, -10, -10, -13, -15, -13, -10, -5, 1, -7, -9, -11, -10, -10, -10, -9, -8, -7, -8, -12, -13, -11, -7, -2, 2, -5, -7, -7, -7, -6, -5, -5, -4, -4, -6, -10, -10, -8, -5, 0, 3, -3, -4, -4, -4, -3, -2, -1, 0, -2, -4, -8, -7, -5, -2, 2, 4, -1, -1, -1, -1, 0, 1, 2, 2, -1, -4, -6, -4, -2, 0, 2, 4, 2, 1, 1, 2, 3, 4, 4, 3, 0, -3, -4, -1, 1, 3, 4, 5, 4, 4, 4, 5, 6, 6, 6, 5, 2, -2, -3, 0, 2, 4, 6, 6, 7, 7, 7, 8, 9, 9, 8, 7, 3, -1, -2, 2, 3, 5, 7, 8, 11, 10, 11, 11, 11, 11, 10, 8, 4, 1, 0, 2, 4, 6, 8, 10, 16, 15, 15, 15, 14, 13, 11, 8, 4, 1, 2, 5, 7, 8, 10, 13, 22, 21, 20, 19, 18, 15, 12, 7, 2, 0, 3, 6, 8, 9, 13, 15, -29, -31, -34, -36, -35, -33, -30, -28, -26, -24, -22, -24, -25, -24, -20, -12, -25, -29, -31, -32, -32, -30, -27, -25, -23, -21, -20, -22, -22, -21, -16, -9, -22, -26, -28, -29, -29, -29, -25, -22, -20, -17, -18, -20, -20, -17, -12, -6, -18, -22, -25, -26, -26, -25, -24, -20, -17, -14, -16, -17, -17, -14, -8, -2, -14, -18, -21, -22, -22, -22, -21, -17, -14, -12, -14, -15, -14, -11, -5, 0, -11, -14, -17, -18, -18, -18, -17, -13, -11, -10, -12, -13, -12, -8, -2, 2, -9, -11, -11, -12, -12, -12, -11, -10, -8, -7, -10, -11, -9, -5, 1, 3, -7, -8, -8, -8, -8, -7, -6, -6, -4, -5, -8, -8, -6, -3, 3, 4, -5, -5, -5, -5, -4, -3, -3, -2, -1, -3, -5, -5, -3, 0, 3, 5, -3, -3, -3, -2, -2, -1, 0, 1, 0, -3, -5, -1, 0, 3, 4, 6, -1, -1, -1, 0, 1, 2, 3, 3, 1, -2, -3, 1, 3, 5, 5, 6, 1, 1, 2, 2, 3, 4, 5, 5, 3, -1, -2, 2, 4, 7, 7, 7, 4, 4, 4, 5, 6, 7, 7, 7, 4, 0, 0, 3, 5, 7, 8, 9, 7, 7, 8, 8, 9, 9, 9, 8, 5, 1, 2, 4, 6, 8, 9, 10, 12, 12, 12, 12, 12, 12, 11, 9, 5, 2, 4, 6, 8, 9, 11, 13, 18, 17, 17, 17, 16, 14, 13, 9, 4, 2, 4, 8, 9, 11, 13, 15, -30, -32, -34, -37, -34, -32, -29, -28, -25, -23, -20, -22, -22, -20, -15, -8, -27, -30, -32, -33, -33, 
-30, -27, -25, -22, -19, -18, -20, -19, -17, -12, -5, -24, -28, -29, -30, -30, -28, -25, -22, -20, -17, -16, -18, -17, -14, -9, -3, -20, -24, -26, -27, -27, -26, -24, -20, -17, -15, -15, -16, -15, -11, -6, -1, -17, -20, -22, -23, -23, -23, -22, -18, -15, -12, -14, -14, -12, -8, -3, 1, -13, -16, -18, -18, -19, -19, -18, -15, -12, -9, -12, -11, -9, -5, 0, 2, -11, -12, -13, -13, -14, -14, -13, -12, -9, -7, -10, -10, -7, -3, 2, 4, -9, -9, -9, -9, -9, -9, -8, -7, -5, -5, -8, -7, -4, 0, 4, 5, -7, -7, -6, -6, -5, -5, -4, -3, -1, -3, -4, -4, -1, 3, 4, 6, -4, -5, -4, -3, -3, -2, -1, 0, 1, -2, -3, 0, 3, 4, 5, 7, -2, -3, -2, -2, -1, 0, 1, 2, 2, -1, -2, 2, 6, 6, 7, 7, 0, -1, 0, 0, 1, 2, 3, 4, 3, 0, 0, 4, 7, 8, 8, 8, 2, 2, 2, 3, 4, 5, 6, 6, 4, 1, 1, 5, 8, 9, 10, 10, 5, 4, 5, 6, 6, 7, 8, 8, 5, 2, 3, 6, 8, 10, 11, 12, 9, 8, 9, 9, 10, 10, 10, 9, 6, 3, 4, 8, 9, 11, 12, 14, 15, 14, 14, 14, 14, 13, 12, 9, 5, 2, 5, 8, 10, 12, 14, 16, -30, -33, -34, -36, -34, -32, -29, -27, -23, -21, -18, -19, -17, -15, -10, -6, -28, -31, -32, -33, -32, -30, -27, -25, -21, -18, -16, -17, -15, -12, -7, -4, -26, -29, -30, -30, -30, -27, -25, -23, -19, -16, -15, -15, -13, -10, -4, -2, -22, -25, -27, -28, -27, -26, -23, -20, -17, -14, -14, -13, -11, -7, -2, 0, -18, -21, -23, -24, -24, -23, -21, -18, -15, -12, -12, -11, -9, -4, 0, 1, -15, -17, -19, -19, -20, -19, -19, -16, -12, -9, -11, -9, -7, -2, 1, 3, -12, -13, -14, -15, -15, -15, -14, -13, -9, -7, -9, -7, -4, 0, 3, 4, -10, -11, -10, -10, -10, -10, -9, -8, -6, -4, -7, -4, -2, 2, 4, 6, -8, -8, -7, -7, -6, -6, -5, -4, -2, -3, -2, -1, 1, 4, 5, 7, -6, -6, -5, -5, -4, -3, -3, -1, 0, -2, -1, 2, 5, 6, 6, 7, -4, -4, -4, -3, -2, -1, 0, 1, 1, -1, 0, 4, 7, 8, 8, 8, -2, -2, -2, -1, 0, 1, 2, 3, 2, 0, 2, 6, 9, 9, 10, 10, 0, 0, 1, 1, 2, 3, 4, 5, 4, 1, 3, 8, 10, 11, 11, 11, 3, 3, 3, 3, 4, 5, 6, 7, 5, 3, 5, 9, 11, 12, 13, 13, 7, 6, 6, 7, 7, 8, 9, 8, 6, 3, 6, 9, 12, 13, 15, 15, 12, 12, 11, 11, 11, 11, 11, 8, 5, 3, 7, 9, 12, 14, 15, 17, -28, -32, -35, -34, -34, -31, -29, 
-26, -22, -19, -15, -15, -13, -10, -7, -5, -27, -30, -33, -32, -31, -29, -27, -24, -20, -17, -14, -13, -11, -7, -4, -3, -25, -28, -30, -30, -29, -27, -25, -22, -18, -15, -13, -12, -9, -5, -2, -1, -22, -25, -27, -26, -26, -25, -23, -20, -16, -13, -12, -10, -7, -3, 0, 1, -18, -21, -23, -23, -23, -22, -21, -18, -14, -11, -10, -8, -5, -1, 1, 2, -15, -18, -19, -19, -19, -19, -18, -16, -12, -9, -9, -7, -3, 0, 2, 4, -13, -14, -15, -15, -15, -15, -14, -13, -10, -7, -7, -5, -1, 2, 3, 5, -11, -12, -12, -12, -11, -10, -9, -8, -6, -4, -4, -2, 1, 3, 5, 6, -8, -10, -9, -8, -7, -6, -6, -5, -3, -2, 0, 2, 4, 5, 6, 7, -7, -7, -6, -6, -5, -4, -3, -2, 0, -1, 2, 5, 7, 7, 7, 8, -5, -5, -5, -4, -3, -2, -2, 0, 1, 0, 4, 7, 9, 9, 9, 10, -3, -3, -3, -2, -2, -1, 0, 2, 2, 1, 5, 9, 11, 11, 11, 12, -1, -1, -1, 0, 0, 1, 2, 4, 3, 2, 7, 11, 12, 13, 13, 13, 1, 1, 1, 2, 2, 3, 4, 6, 4, 4, 8, 12, 14, 14, 15, 15, 5, 5, 5, 5, 5, 6, 7, 8, 5, 5, 8, 12, 14, 16, 17, 18, 10, 9, 9, 9, 9, 9, 10, 8, 5, 5, 8, 11, 13, 16, 18, 19, -26, -31, -33, -33, -32, -30, -28, -25, -20, -16, -11, -10, -9, -7, -6, -4, -25, -29, -31, -30, -30, -28, -26, -23, -18, -14, -11, -9, -7, -5, -3, -2, -22, -26, -28, -28, -27, -26, -24, -21, -17, -12, -9, -8, -5, -3, -1, 1, -19, -23, -25, -25, -24, -23, -22, -19, -15, -10, -8, -7, -3, -1, 1, 2, -16, -20, -21, -21, -21, -20, -19, -18, -13, -9, -7, -6, -2, 0, 2, 4, -14, -18, -18, -18, -18, -17, -17, -15, -11, -8, -7, -4, 0, 2, 3, 5, -13, -14, -15, -14, -15, -14, -13, -12, -9, -7, -5, -2, 1, 3, 4, 6, -11, -12, -12, -11, -10, -10, -8, -8, -5, -3, -2, 0, 3, 4, 6, 7, -9, -10, -10, -9, -7, -6, -6, -5, -3, -1, 2, 4, 6, 6, 7, 8, -7, -8, -7, -6, -5, -4, -4, -3, -1, 0, 5, 7, 9, 9, 9, 9, -6, -6, -6, -5, -4, -3, -2, -1, 1, 2, 7, 9, 10, 11, 11, 11, -4, -4, -4, -3, -3, -2, -1, 1, 2, 3, 8, 11, 12, 12, 13, 13, -2, -2, -2, -2, -1, 0, 1, 3, 3, 4, 10, 13, 14, 14, 15, 15, 0, 0, 0, 1, 1, 2, 3, 4, 4, 5, 11, 14, 16, 16, 17, 17, 4, 3, 3, 4, 4, 4, 5, 6, 5, 7, 11, 15, 17, 19, 19, 19, 8, 8, 7, 7, 7, 7, 8, 7, 5, 7, 
11, 14, 17, 19, 20, 20, -26, -29, -31, -31, -30, -29, -26, -23, -19, -13, -9, -8, -7, -6, -5, -4, -23, -27, -29, -28, -28, -27, -25, -21, -16, -11, -8, -7, -5, -4, -2, -1, -21, -24, -26, -25, -25, -24, -23, -19, -14, -8, -6, -5, -3, -2, 0, 1, -17, -20, -22, -22, -22, -21, -20, -18, -12, -6, -5, -4, -2, 0, 1, 3, -15, -18, -19, -19, -19, -18, -18, -16, -10, -5, -4, -3, -1, 1, 3, 4, -13, -16, -16, -16, -16, -15, -15, -13, -7, -4, -3, -1, 1, 2, 4, 6, -12, -13, -13, -13, -13, -12, -12, -10, -5, -3, -2, 0, 2, 4, 5, 7, -11, -12, -11, -10, -10, -9, -7, -6, -2, 0, 0, 2, 4, 5, 7, 8, -9, -10, -9, -8, -7, -6, -5, -3, 0, 1, 4, 6, 7, 8, 8, 9, -8, -8, -7, -6, -5, -4, -3, -2, 1, 2, 6, 9, 10, 10, 10, 10, -6, -6, -6, -5, -4, -3, -2, 0, 2, 4, 8, 11, 12, 12, 13, 13, -4, -5, -4, -4, -3, -2, -1, 1, 3, 6, 10, 12, 13, 14, 14, 14, -3, -3, -2, -2, -1, 0, 1, 3, 4, 7, 11, 14, 15, 16, 16, 16, -1, -1, 0, 0, 1, 2, 3, 5, 5, 9, 13, 16, 17, 18, 18, 18, 3, 2, 2, 3, 3, 4, 5, 6, 6, 9, 14, 17, 19, 20, 21, 20, 7, 7, 6, 6, 6, 7, 7, 7, 7, 10, 14, 17, 19, 21, 21, 21, -24, -28, -30, -30, -29, -28, -26, -22, -17, -12, -8, -6, -6, -5, -5, -4, -22, -25, -27, -27, -27, -26, -24, -20, -15, -9, -6, -4, -4, -3, -2, -1, -19, -22, -24, -24, -23, -23, -22, -18, -13, -6, -5, -3, -2, -1, 0, 1, -16, -19, -21, -21, -20, -20, -19, -16, -11, -5, -4, -2, -1, 0, 2, 3, -15, -17, -19, -18, -18, -17, -16, -14, -9, -4, -3, -1, 0, 1, 3, 5, -13, -15, -16, -16, -16, -15, -14, -11, -7, -4, -2, 0, 1, 2, 4, 6, -11, -13, -13, -13, -13, -12, -11, -9, -4, -2, -1, 1, 2, 4, 6, 7, -10, -12, -11, -10, -9, -8, -7, -4, 0, 0, 2, 4, 4, 5, 7, 8, -9, -10, -9, -8, -7, -6, -4, -2, 1, 1, 5, 7, 8, 9, 9, 9, -8, -8, -7, -6, -5, -4, -2, 0, 2, 2, 7, 10, 11, 12, 11, 11, -6, -6, -6, -5, -4, -3, -1, 1, 2, 4, 9, 12, 13, 14, 14, 13, -5, -5, -4, -4, -3, -2, 0, 2, 3, 6, 10, 14, 15, 15, 15, 16, -3, -3, -3, -2, -1, 0, 1, 3, 4, 7, 12, 15, 17, 17, 17, 17, -1, -1, -1, 0, 1, 2, 3, 5, 5, 9, 13, 17, 19, 19, 19, 19, 2, 2, 2, 3, 4, 4, 6, 6, 6, 10, 15, 19, 21, 22, 22, 22, 
6, 6, 6, 6, 6, 7, 7, 8, 7, 11, 16, 19, 22, 23, 23, 23, -23, -26, -28, -28, -28, -27, -24, -20, -16, -11, -6, -6, -6, -5, -4, -3, -20, -24, -25, -26, -25, -24, -22, -18, -13, -8, -5, -5, -4, -3, -2, -1, -17, -21, -23, -23, -22, -21, -20, -16, -12, -5, -3, -3, -2, -1, 0, 1, -15, -19, -21, -21, -20, -19, -17, -14, -10, -4, -2, -1, 0, 1, 2, 3, -14, -17, -18, -18, -18, -16, -15, -12, -8, -3, -2, 0, 1, 2, 4, 5, -12, -15, -16, -16, -15, -13, -12, -10, -5, -2, 0, 1, 2, 3, 5, 7, -11, -12, -13, -14, -13, -11, -10, -7, -2, 0, 1, 2, 3, 5, 6, 8, -10, -11, -11, -10, -9, -8, -6, -3, 1, 2, 3, 4, 5, 6, 8, 9, -8, -9, -9, -8, -7, -5, -3, 0, 3, 4, 7, 8, 9, 9, 10, 10, -7, -8, -7, -6, -5, -3, -1, 2, 4, 6, 10, 12, 13, 13, 13, 12, -6, -6, -6, -5, -4, -2, 0, 3, 5, 7, 11, 14, 16, 17, 16, 15, -5, -5, -4, -4, -3, -1, 1, 3, 5, 8, 12, 16, 18, 18, 18, 17, -3, -3, -3, -2, -1, 0, 2, 4, 5, 9, 13, 17, 19, 19, 19, 20, -2, -1, -1, 0, 0, 2, 3, 5, 6, 10, 14, 18, 20, 21, 21, 21, 1, 1, 2, 2, 3, 4, 5, 6, 7, 11, 16, 20, 23, 24, 23, 23, 5, 5, 5, 6, 7, 7, 7, 7, 8, 11, 17, 20, 23, 24, 25, 24, -21, -24, -26, -26, -26, -24, -21, -18, -15, -11, -8, -8, -7, -6, -5, -4, -19, -22, -23, -24, -23, -22, -19, -16, -13, -8, -7, -6, -5, -4, -3, -1, -16, -20, -21, -22, -21, -20, -17, -14, -11, -6, -5, -4, -3, -2, 0, 1, -15, -18, -19, -20, -19, -17, -15, -12, -9, -5, -3, -2, -1, 1, 2, 3, -13, -16, -17, -17, -16, -15, -13, -10, -6, -3, -2, 0, 1, 2, 4, 5, -12, -14, -15, -15, -14, -12, -10, -8, -4, -2, 0, 1, 3, 4, 5, 7, -10, -11, -12, -13, -11, -10, -8, -5, -1, 0, 2, 3, 4, 5, 7, 8, -9, -10, -10, -9, -8, -7, -4, -1, 2, 2, 4, 4, 6, 7, 8, 10, -8, -9, -9, -7, -5, -3, 0, 3, 4, 5, 7, 9, 9, 10, 10, 11, -7, -7, -7, -5, -3, -1, 1, 5, 6, 8, 11, 13, 13, 14, 14, 14, -6, -6, -6, -4, -2, 0, 3, 6, 7, 10, 13, 16, 18, 18, 18, 17, -5, -5, -4, -3, -1, 1, 4, 7, 8, 10, 15, 18, 20, 21, 20, 20, -3, -3, -2, -1, 0, 3, 5, 8, 9, 11, 16, 19, 21, 22, 23, 22, -1, -1, -1, 0, 2, 4, 7, 8, 9, 12, 16, 20, 22, 23, 24, 24, 1, 1, 2, 3, 4, 6, 9, 9, 10, 14, 18, 22, 
24, 26, 26, 26, 5, 5, 5, 6, 7, 8, 9, 10, 11, 14, 19, 23, 25, 26, 27, 26, -19, -22, -24, -24, -24, -22, -18, -16, -13, -10, -10, -11, -10, -9, -8, -7, -17, -20, -21, -22, -22, -20, -17, -14, -12, -8, -9, -8, -7, -6, -4, -3, -15, -19, -20, -20, -19, -18, -15, -12, -10, -7, -7, -6, -5, -3, -1, 1, -13, -17, -18, -18, -17, -16, -13, -11, -8, -6, -5, -4, -2, -1, 1, 3, -12, -15, -16, -16, -15, -13, -10, -8, -5, -4, -3, -1, 0, 2, 4, 5, -10, -13, -14, -13, -13, -11, -8, -6, -2, -3, -1, 1, 2, 4, 6, 7, -8, -10, -11, -11, -10, -8, -6, -3, 0, -1, 1, 2, 4, 5, 7, 9, -7, -8, -8, -8, -7, -5, -2, 1, 1, 1, 2, 4, 5, 7, 8, 10, -7, -7, -7, -6, -4, -1, 2, 4, 4, 4, 7, 8, 9, 10, 10, 11, -6, -6, -5, -4, -2, 1, 4, 7, 6, 8, 11, 12, 13, 14, 14, 14, -5, -5, -4, -2, 0, 3, 6, 8, 8, 11, 14, 16, 17, 18, 18, 18, -4, -4, -3, -1, 1, 4, 8, 9, 10, 13, 16, 19, 21, 22, 22, 21, -3, -2, -1, 0, 3, 6, 9, 10, 11, 14, 17, 20, 23, 25, 25, 24, -1, 0, 1, 3, 5, 8, 11, 11, 12, 14, 18, 22, 24, 26, 27, 26, 1, 2, 3, 5, 8, 10, 12, 12, 14, 16, 20, 24, 27, 28, 28, 28, 5, 6, 7, 8, 10, 12, 13, 14, 15, 17, 21, 25, 27, 28, 29, 29, -17, -20, -21, -22, -21, -19, -16, -13, -11, -10, -11, -12, -13, -13, -11, -9, -15, -18, -19, -20, -19, -17, -15, -12, -10, -8, -10, -11, -10, -9, -7, -5, -14, -17, -17, -18, -17, -15, -13, -11, -8, -7, -9, -8, -7, -5, -3, -1, -12, -15, -15, -15, -15, -13, -11, -9, -6, -6, -7, -6, -4, -3, 0, 2, -10, -13, -13, -13, -13, -11, -9, -7, -4, -5, -5, -3, -2, 0, 3, 4, -9, -11, -12, -11, -10, -8, -6, -4, -2, -3, -2, -1, 1, 3, 5, 7, -7, -9, -9, -9, -8, -6, -4, -2, -1, -2, 0, 1, 3, 5, 7, 9, -6, -6, -6, -6, -5, -2, 0, 1, 1, 0, 1, 3, 5, 7, 9, 10, -6, -6, -5, -4, -2, 1, 3, 4, 3, 3, 6, 7, 8, 9, 10, 12, -6, -5, -3, -2, 0, 4, 6, 7, 5, 7, 10, 12, 13, 13, 14, 13, -4, -4, -3, -1, 2, 6, 8, 9, 8, 10, 14, 16, 17, 17, 18, 17, -3, -2, -1, 0, 4, 8, 10, 10, 10, 13, 16, 19, 21, 22, 22, 21, -2, -1, 0, 2, 6, 10, 11, 12, 12, 16, 18, 22, 23, 25, 26, 25, 0, 1, 2, 5, 8, 12, 13, 13, 14, 17, 20, 23, 26, 27, 28, 27, 3, 4, 5, 8, 12, 14, 
15, 15, 16, 19, 22, 26, 28, 30, 30, 29, 6, 7, 9, 12, 15, 16, 16, 17, 18, 20, 23, 27, 29, 31, 31, 29, -15, -18, -19, -19, -18, -17, -15, -12, -10, -9, -11, -14, -15, -16, -15, -13, -13, -16, -17, -17, -16, -15, -13, -11, -9, -8, -10, -12, -13, -12, -10, -8, -12, -14, -15, -15, -14, -13, -11, -9, -7, -7, -9, -10, -9, -8, -6, -3, -10, -13, -13, -13, -12, -11, -9, -7, -6, -6, -8, -7, -6, -5, -2, 0, -9, -11, -11, -11, -10, -8, -7, -5, -4, -5, -6, -5, -4, -2, 1, 3, -7, -9, -9, -8, -7, -6, -5, -4, -2, -4, -4, -3, -1, 2, 4, 6, -6, -7, -7, -6, -4, -3, -2, -1, -1, -2, -1, 0, 2, 4, 6, 8, -5, -4, -4, -4, -2, 0, 1, 2, 1, 0, 0, 2, 4, 6, 8, 10, -4, -4, -3, -1, 1, 3, 4, 5, 3, 2, 4, 6, 7, 8, 10, 12, -4, -3, -2, 0, 3, 6, 7, 7, 5, 6, 9, 11, 12, 13, 13, 13, -3, -2, -1, 1, 5, 8, 9, 8, 7, 9, 13, 15, 17, 17, 17, 15, -2, -1, 0, 3, 7, 10, 10, 10, 10, 12, 16, 19, 20, 21, 21, 19, 0, 1, 2, 6, 10, 12, 13, 12, 12, 16, 19, 21, 24, 25, 24, 23, 1, 2, 5, 8, 12, 14, 15, 14, 15, 18, 21, 24, 26, 27, 28, 26, 3, 5, 8, 12, 15, 17, 18, 17, 19, 22, 25, 27, 30, 31, 31, 29, 7, 9, 13, 16, 19, 19, 19, 19, 20, 23, 26, 28, 31, 33, 33, 30, -14, -16, -16, -17, -16, -14, -13, -11, -10, -9, -12, -15, -17, -19, -19, -17, -11, -14, -14, -14, -14, -13, -11, -9, -8, -8, -10, -13, -15, -15, -14, -13, -10, -12, -12, -12, -11, -11, -9, -8, -7, -7, -9, -11, -12, -11, -9, -8, -8, -10, -10, -10, -9, -8, -7, -7, -6, -6, -8, -9, -8, -7, -4, -3, -7, -9, -9, -8, -7, -6, -5, -5, -4, -5, -7, -7, -6, -3, -1, 0, -5, -7, -6, -6, -5, -4, -3, -3, -3, -4, -5, -4, -3, 0, 2, 3, -4, -5, -4, -4, -2, -1, -1, 0, -1, -2, -2, -2, 0, 3, 5, 6, -3, -2, -2, -1, 0, 1, 2, 3, 1, 0, 0, 1, 3, 6, 7, 9, -2, -1, -1, 1, 3, 4, 5, 5, 3, 2, 3, 4, 6, 8, 9, 11, -2, -1, 0, 3, 5, 7, 8, 7, 5, 5, 8, 10, 11, 11, 12, 13, -2, 0, 1, 5, 8, 9, 10, 9, 7, 8, 12, 14, 16, 16, 15, 14, -1, 1, 4, 7, 10, 11, 11, 11, 10, 11, 15, 18, 20, 20, 19, 17, 1, 2, 6, 9, 12, 13, 14, 13, 12, 15, 18, 21, 24, 23, 22, 20, 2, 5, 9, 12, 15, 16, 16, 15, 15, 18, 21, 24, 26, 27, 25, 23, 5, 9, 14, 16, 
18, 20, 19, 19, 19, 22, 26, 28, 30, 32, 30, 27, 10, 14, 18, 21, 22, 22, 22, 21, 22, 25, 28, 30, 33, 34, 33, 30, -12, -14, -14, -15, -14, -12, -11, -10, -10, -10, -12, -16, -19, -21, -22, -23, -10, -11, -12, -12, -11, -11, -9, -8, -8, -9, -11, -13, -16, -18, -19, -18, -8, -9, -10, -10, -9, -9, -8, -7, -6, -7, -9, -11, -14, -15, -14, -12, -6, -8, -8, -8, -7, -6, -6, -5, -5, -6, -7, -9, -11, -10, -9, -7, -5, -6, -6, -6, -5, -4, -4, -4, -4, -5, -6, -8, -7, -6, -4, -3, -3, -4, -4, -4, -3, -2, -2, -1, -2, -4, -5, -5, -5, -3, 0, 1, -2, -2, -2, -2, -1, 0, 0, 1, 0, -2, -3, -3, -2, 1, 3, 5, -1, 0, 0, 1, 2, 2, 3, 4, 2, 0, 0, 0, 2, 4, 7, 8, -1, 0, 2, 3, 4, 5, 7, 6, 4, 3, 2, 3, 5, 7, 10, 11, 0, 1, 3, 5, 7, 9, 10, 8, 6, 5, 7, 8, 9, 10, 12, 13, 0, 1, 4, 7, 9, 11, 12, 10, 8, 8, 10, 13, 14, 15, 14, 15, 1, 4, 7, 10, 12, 13, 13, 13, 11, 11, 14, 17, 19, 20, 17, 16, 2, 6, 10, 13, 14, 15, 15, 14, 14, 14, 17, 20, 23, 23, 20, 18, 5, 10, 13, 15, 17, 17, 18, 17, 16, 18, 21, 24, 26, 26, 24, 21, 9, 14, 17, 20, 21, 21, 21, 20, 20, 22, 26, 29, 30, 31, 29, 26, 14, 19, 23, 24, 25, 25, 24, 23, 22, 25, 28, 31, 33, 34, 32, 29 }, { -53, -51, -49, -45, -41, -37, -32, -32, -34, -35, -38, -41, -41, -37, -30, -14, -50, -48, -46, -43, -39, -35, -31, -29, -31, -33, -35, -39, -39, -34, -26, -12, -46, -45, -43, -40, -37, -33, -29, -26, -28, -30, -33, -36, -36, -30, -22, -9, -43, -41, -39, -36, -33, -31, -27, -24, -26, -28, -31, -34, -32, -27, -19, -6, -39, -37, -34, -31, -29, -28, -26, -23, -25, -26, -28, -31, -29, -24, -17, -4, -35, -32, -29, -26, -24, -23, -23, -22, -23, -24, -26, -29, -26, -20, -14, -3, -30, -28, -24, -19, -18, -20, -20, -19, -20, -21, -24, -27, -24, -17, -12, -1, -25, -23, -20, -15, -14, -17, -17, -16, -16, -18, -21, -23, -19, -14, -8, 1, -20, -19, -17, -14, -13, -15, -15, -14, -12, -13, -16, -18, -15, -10, -4, 4, -15, -14, -14, -12, -12, -14, -15, -13, -11, -11, -12, -15, -12, -8, -1, 7, -8, -9, -10, -10, -10, -13, -14, -12, -10, -9, -10, -13, -10, -6, 4, 10, -1, -3, -5, -6, -8, -11, 
-12, -10, -8, -7, -8, -10, -9, -5, 6, 13, 4, 2, 0, -2, -5, -8, -10, -9, -7, -5, -5, -7, -7, -3, 8, 14, 7, 6, 5, 3, 0, -5, -7, -7, -5, -3, -1, -3, -3, -1, 8, 15, 11, 11, 9, 8, 5, 0, -3, -2, -1, 1, 4, 3, 2, 3, 8, 16, 15, 14, 12, 11, 9, 4, 2, 2, 4, 5, 8, 8, 6, 6, 10, 16, -47, -45, -43, -40, -35, -31, -27, -26, -30, -34, -38, -41, -39, -34, -26, -15, -44, -42, -40, -37, -33, -29, -25, -23, -28, -32, -36, -39, -36, -30, -22, -13, -41, -39, -36, -33, -30, -27, -24, -21, -27, -30, -34, -37, -33, -26, -19, -11, -37, -35, -32, -29, -27, -25, -22, -19, -25, -28, -32, -34, -29, -23, -16, -8, -33, -30, -28, -24, -21, -21, -21, -19, -23, -26, -30, -31, -26, -19, -13, -6, -29, -26, -22, -14, -11, -17, -18, -18, -20, -23, -28, -28, -22, -16, -11, -4, -25, -21, -14, -6, -5, -12, -16, -16, -18, -21, -25, -25, -18, -12, -8, -2, -19, -17, -10, -4, -4, -10, -13, -13, -14, -17, -21, -20, -14, -9, -4, 0, -15, -13, -10, -5, -5, -10, -12, -12, -11, -13, -17, -15, -10, -5, 0, 3, -10, -10, -8, -6, -6, -10, -12, -11, -10, -10, -14, -12, -8, -3, 4, 7, -4, -6, -6, -6, -6, -9, -11, -10, -9, -9, -11, -11, -7, -1, 7, 10, 1, 0, -3, -4, -4, -7, -9, -9, -8, -7, -9, -10, -6, 1, 10, 13, 5, 4, 1, -1, -2, -5, -8, -8, -6, -6, -7, -8, -4, 2, 12, 14, 9, 8, 5, 2, 1, -2, -6, -6, -5, -4, -4, -5, -2, 4, 14, 16, 12, 11, 9, 7, 4, 2, -2, -3, -2, 0, 1, 0, 1, 5, 15, 16, 16, 15, 13, 11, 9, 5, 1, 1, 2, 4, 6, 5, 5, 7, 15, 16, -41, -40, -38, -34, -30, -27, -23, -23, -30, -36, -39, -39, -36, -29, -20, -15, -38, -36, -34, -31, -28, -25, -21, -21, -28, -33, -37, -38, -32, -25, -18, -14, -35, -32, -29, -27, -25, -23, -20, -18, -26, -31, -35, -35, -29, -22, -16, -12, -31, -28, -25, -20, -20, -21, -19, -17, -24, -29, -33, -32, -25, -19, -14, -10, -27, -23, -17, -11, -12, -17, -18, -17, -22, -27, -31, -29, -21, -15, -11, -7, -22, -17, -7, -4, -6, -12, -16, -16, -19, -24, -28, -25, -17, -12, -9, -5, -18, -10, -4, 0, -2, -7, -14, -14, -17, -22, -25, -21, -14, -9, -5, -3, -13, -7, -2, 1, 0, -5, -11, -11, -13, -18, -21, -16, -10, 
-5, -2, 0, -9, -6, -3, 0, 0, -5, -10, -10, -10, -13, -16, -11, -6, -1, 1, 3, -6, -5, -3, -1, -1, -5, -9, -9, -9, -10, -13, -9, -3, 2, 5, 6, -3, -4, -3, -2, -1, -4, -8, -9, -8, -8, -11, -7, -1, 4, 9, 9, 2, -1, -1, -1, 0, -3, -7, -8, -7, -7, -9, -6, 0, 6, 11, 12, 7, 3, 1, 1, 1, -1, -6, -7, -6, -6, -8, -5, 2, 8, 14, 14, 9, 7, 4, 3, 3, 1, -4, -6, -4, -4, -6, -3, 3, 9, 15, 15, 13, 11, 8, 6, 6, 4, -1, -3, -2, 0, -1, 0, 5, 11, 17, 16, 16, 15, 12, 10, 9, 7, 2, -1, 1, 3, 4, 4, 7, 12, 17, 16, -36, -34, -31, -28, -25, -23, -20, -21, -33, -37, -37, -36, -31, -25, -17, -14, -32, -30, -26, -24, -23, -21, -19, -18, -30, -35, -36, -34, -28, -23, -15, -14, -29, -25, -21, -19, -20, -20, -18, -17, -28, -33, -35, -32, -25, -20, -14, -13, -25, -20, -15, -15, -16, -18, -17, -16, -25, -31, -33, -28, -22, -17, -12, -12, -20, -13, -10, -8, -11, -15, -16, -15, -23, -29, -31, -24, -19, -14, -11, -9, -15, -9, -4, -3, -6, -11, -14, -14, -20, -26, -27, -21, -15, -11, -8, -6, -10, -5, -2, 0, -2, -7, -12, -12, -17, -23, -23, -17, -11, -7, -5, -4, -7, -4, 0, 1, 0, -3, -9, -9, -12, -19, -18, -13, -7, -4, -2, 0, -5, -3, 0, 3, 2, -1, -6, -7, -8, -14, -13, -8, -3, 0, 2, 3, -3, -1, 0, 3, 3, 1, -4, -6, -7, -11, -10, -5, 0, 3, 5, 5, -1, 0, 1, 3, 4, 1, -3, -6, -6, -9, -8, -3, 2, 6, 9, 9, 1, 1, 2, 4, 4, 2, -2, -5, -5, -7, -7, -1, 4, 8, 11, 11, 6, 4, 4, 4, 5, 3, -1, -5, -4, -6, -6, 0, 7, 11, 13, 13, 9, 7, 5, 6, 6, 4, -1, -4, -3, -4, -5, 2, 8, 13, 15, 14, 13, 10, 8, 8, 8, 6, 2, -2, -1, 0, -2, 3, 10, 15, 16, 15, 16, 14, 12, 11, 11, 9, 5, 1, 2, 3, 3, 6, 12, 16, 17, 16, -30, -28, -25, -24, -21, -20, -18, -20, -33, -36, -35, -31, -26, -21, -16, -14, -26, -23, -21, -20, -19, -18, -16, -16, -31, -34, -33, -29, -25, -20, -15, -13, -22, -18, -16, -16, -17, -16, -14, -14, -29, -33, -31, -27, -23, -19, -14, -12, -17, -13, -12, -13, -14, -15, -14, -13, -27, -31, -29, -24, -20, -16, -12, -12, -13, -10, -9, -7, -10, -12, -13, -13, -25, -29, -26, -21, -17, -13, -11, -11, -10, -7, -5, -3, -5, -8, -11, -11, -22, -26, -23, 
-18, -13, -10, -9, -8, -8, -4, -2, -1, -2, -4, -8, -9, -18, -23, -19, -14, -9, -7, -6, -4, -6, -3, 0, 1, 0, -1, -4, -7, -13, -18, -15, -10, -6, -3, -2, -1, -3, -1, 1, 3, 3, 2, -1, -5, -9, -13, -10, -5, -2, 0, 2, 2, -1, 1, 4, 5, 5, 5, 1, -3, -7, -11, -7, -2, 1, 4, 5, 5, 1, 3, 5, 7, 6, 5, 2, -3, -6, -9, -5, 0, 4, 6, 8, 8, 3, 5, 6, 7, 7, 6, 2, -3, -5, -7, -3, 3, 7, 9, 10, 10, 7, 7, 7, 8, 8, 7, 3, -2, -3, -6, -1, 5, 9, 12, 12, 12, 10, 8, 8, 9, 9, 7, 3, -2, -2, -4, 0, 7, 12, 14, 14, 14, 12, 11, 10, 10, 10, 9, 5, 0, 0, -1, 1, 8, 14, 16, 16, 14, 16, 14, 13, 13, 12, 11, 7, 3, 3, 3, 4, 10, 15, 17, 17, 15, -24, -23, -22, -21, -19, -17, -16, -19, -32, -32, -30, -26, -23, -18, -14, -13, -20, -19, -18, -17, -16, -15, -14, -15, -30, -31, -28, -25, -22, -18, -13, -13, -17, -15, -15, -14, -13, -13, -12, -13, -27, -29, -27, -24, -21, -17, -13, -12, -13, -12, -11, -10, -11, -11, -11, -12, -26, -27, -25, -22, -19, -15, -12, -11, -11, -9, -7, -7, -8, -9, -10, -11, -24, -25, -22, -19, -15, -13, -11, -10, -10, -8, -5, -3, -5, -6, -9, -10, -22, -23, -19, -15, -12, -10, -9, -9, -8, -5, -3, -1, -2, -4, -7, -8, -19, -19, -16, -11, -8, -7, -7, -6, -5, -2, 0, 1, 0, 0, -3, -6, -14, -15, -11, -7, -5, -4, -3, -2, -1, 1, 3, 4, 4, 4, 1, -4, -9, -11, -7, -3, -1, 0, 1, 2, 1, 3, 5, 6, 6, 6, 3, -3, -7, -8, -4, 0, 2, 4, 4, 5, 4, 6, 7, 8, 8, 8, 5, -2, -5, -6, -1, 3, 5, 6, 7, 7, 6, 8, 9, 9, 9, 8, 5, -2, -4, -4, 1, 5, 8, 9, 9, 10, 8, 9, 10, 10, 10, 9, 6, -1, -2, -3, 4, 8, 10, 11, 11, 11, 11, 10, 11, 11, 11, 10, 6, 0, -2, -1, 5, 10, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 8, 2, 1, 1, 7, 12, 15, 15, 15, 14, 15, 15, 14, 14, 13, 12, 10, 4, 3, 4, 9, 14, 17, 17, 16, 15, -20, -20, -19, -18, -17, -16, -14, -18, -27, -27, -25, -23, -20, -16, -14, -13, -17, -17, -16, -15, -15, -14, -12, -14, -25, -25, -24, -22, -19, -16, -13, -13, -15, -13, -12, -12, -12, -11, -10, -12, -23, -23, -23, -22, -19, -15, -13, -12, -13, -10, -9, -9, -9, -10, -9, -10, -21, -22, -21, -20, -17, -14, -12, -11, -11, -9, -7, -6, -7, -8, -9, 
-9, -20, -20, -19, -17, -14, -13, -10, -10, -9, -7, -5, -4, -5, -7, -8, -8, -19, -18, -16, -13, -11, -10, -9, -9, -7, -5, -3, -2, -3, -4, -5, -7, -17, -15, -13, -10, -8, -7, -7, -6, -4, -2, 0, 1, 0, -1, -2, -5, -13, -11, -8, -5, -5, -5, -4, -2, 0, 2, 4, 4, 4, 3, 1, -3, -8, -7, -4, -2, -1, -1, 0, 1, 3, 5, 7, 7, 7, 6, 4, -2, -6, -4, -1, 1, 3, 3, 3, 3, 5, 8, 9, 9, 9, 8, 5, -1, -4, -2, 2, 4, 5, 6, 6, 6, 7, 9, 10, 10, 10, 10, 6, 0, -2, 0, 5, 7, 8, 8, 8, 8, 9, 11, 12, 12, 11, 10, 7, 1, 0, 2, 7, 9, 10, 10, 11, 11, 11, 12, 13, 13, 12, 11, 8, 2, 1, 3, 9, 11, 12, 13, 13, 12, 13, 13, 14, 14, 13, 13, 10, 4, 3, 5, 10, 14, 14, 15, 14, 13, 16, 15, 15, 15, 15, 14, 12, 7, 6, 8, 12, 16, 16, 16, 15, 15, -18, -18, -18, -18, -17, -15, -13, -14, -21, -21, -22, -20, -18, -15, -13, -13, -15, -14, -14, -14, -14, -13, -11, -11, -18, -20, -21, -20, -18, -15, -13, -12, -13, -11, -11, -12, -12, -11, -8, -9, -16, -18, -20, -19, -17, -15, -12, -12, -12, -10, -9, -9, -10, -9, -7, -8, -15, -16, -18, -18, -16, -14, -11, -11, -10, -8, -7, -7, -8, -8, -7, -7, -13, -15, -16, -15, -14, -13, -10, -10, -9, -7, -5, -5, -6, -6, -5, -6, -13, -14, -13, -12, -11, -10, -8, -8, -7, -5, -4, -4, -3, -3, -4, -5, -11, -11, -10, -8, -7, -7, -7, -6, -3, -2, 0, 0, 0, 0, 0, -2, -8, -7, -6, -5, -5, -5, -4, -3, 1, 3, 4, 4, 4, 3, 3, -1, -5, -3, -2, -1, -2, -2, -1, 1, 4, 6, 7, 7, 7, 7, 5, 0, -2, 0, 1, 2, 1, 1, 1, 3, 6, 8, 9, 9, 9, 9, 6, 1, 1, 2, 3, 4, 4, 4, 4, 5, 8, 10, 11, 12, 11, 10, 8, 2, 2, 4, 6, 7, 7, 7, 6, 7, 10, 12, 13, 13, 13, 12, 9, 3, 3, 5, 8, 9, 9, 9, 9, 9, 12, 13, 14, 14, 14, 13, 10, 5, 5, 7, 10, 11, 11, 12, 12, 11, 14, 14, 15, 15, 15, 14, 12, 7, 7, 9, 13, 14, 14, 14, 13, 13, 16, 16, 16, 17, 16, 15, 14, 10, 9, 11, 14, 16, 16, 15, 14, 14, -16, -16, -17, -17, -16, -13, -10, -10, -17, -18, -20, -18, -15, -14, -12, -13, -14, -14, -14, -14, -13, -11, -8, -7, -14, -17, -19, -18, -16, -14, -12, -12, -12, -12, -11, -12, -11, -9, -5, -6, -11, -15, -18, -18, -16, -14, -11, -11, -11, -10, -9, -10, -10, -8, -4, -5, -9, 
-14, -16, -16, -15, -14, -11, -10, -10, -8, -8, -8, -8, -7, -4, -4, -9, -12, -14, -14, -13, -13, -10, -9, -9, -7, -7, -6, -6, -5, -3, -3, -7, -11, -11, -12, -11, -11, -8, -8, -6, -5, -5, -4, -3, -2, -1, -2, -6, -8, -9, -9, -8, -8, -6, -6, -3, -2, -1, -1, 0, 1, 2, 0, -3, -4, -5, -5, -6, -5, -5, -3, 2, 3, 4, 4, 4, 4, 6, 1, 0, -1, -1, -2, -3, -3, -1, 0, 5, 6, 7, 7, 7, 8, 9, 3, 2, 2, 2, 1, 0, -1, 0, 3, 8, 9, 9, 9, 9, 10, 10, 4, 3, 4, 4, 4, 3, 1, 2, 5, 10, 11, 12, 12, 12, 12, 11, 6, 5, 6, 6, 6, 6, 5, 4, 7, 12, 13, 14, 14, 14, 14, 12, 9, 6, 8, 9, 9, 8, 8, 7, 9, 13, 14, 15, 15, 15, 15, 13, 10, 7, 9, 11, 11, 11, 10, 10, 11, 15, 16, 16, 16, 16, 16, 15, 12, 9, 11, 14, 14, 13, 13, 12, 12, 16, 17, 18, 18, 18, 17, 17, 15, 12, 14, 16, 16, 16, 15, 14, 14, -15, -16, -16, -16, -13, -9, -6, -7, -14, -15, -17, -15, -14, -12, -11, -12, -14, -13, -14, -14, -11, -6, -3, -5, -12, -15, -16, -15, -14, -13, -11, -11, -13, -12, -12, -12, -9, -5, -1, -4, -10, -13, -15, -15, -14, -13, -11, -10, -12, -11, -10, -10, -8, -4, 0, -2, -8, -12, -15, -14, -14, -12, -10, -9, -10, -9, -9, -9, -6, -3, 0, -1, -6, -11, -13, -13, -13, -12, -9, -8, -9, -8, -7, -6, -4, -1, 1, -1, -5, -9, -10, -11, -11, -10, -8, -7, -7, -6, -5, -4, -2, 1, 3, 0, -4, -7, -8, -9, -9, -8, -6, -5, -3, -2, -2, -1, 1, 4, 6, 2, 0, -4, -5, -6, -6, -6, -4, -3, 2, 2, 3, 4, 5, 7, 10, 3, 2, 0, -2, -3, -4, -3, -2, 0, 5, 6, 7, 7, 8, 11, 14, 6, 4, 3, 1, -1, -2, -1, 0, 3, 8, 9, 9, 9, 11, 13, 16, 10, 5, 5, 4, 2, 0, 0, 2, 5, 10, 11, 12, 12, 14, 15, 17, 13, 6, 7, 6, 5, 4, 2, 4, 7, 12, 13, 14, 15, 16, 16, 18, 14, 7, 9, 9, 9, 7, 5, 6, 9, 14, 15, 16, 17, 18, 18, 18, 16, 9, 11, 11, 11, 10, 9, 9, 11, 15, 17, 18, 18, 19, 19, 20, 17, 11, 12, 14, 14, 13, 13, 12, 12, 17, 18, 19, 20, 20, 21, 22, 20, 15, 15, 17, 17, 17, 16, 15, 15, -15, -16, -16, -15, -9, -5, -3, -6, -11, -13, -15, -14, -12, -11, -10, -10, -14, -14, -14, -12, -7, -2, 0, -4, -9, -12, -13, -14, -13, -12, -10, -10, -13, -12, -12, -10, -6, 0, 2, -2, -8, -11, -12, -13, -13, -12, -10, -9, -12, 
-11, -10, -9, -4, 1, 3, -2, -7, -10, -12, -12, -13, -12, -10, -8, -10, -10, -9, -7, -3, 2, 4, -1, -6, -10, -10, -11, -11, -11, -9, -7, -9, -8, -7, -5, -1, 2, 5, 0, -4, -8, -9, -10, -10, -10, -7, -5, -7, -6, -5, -3, 1, 4, 7, 1, -3, -7, -8, -9, -9, -8, -6, -4, -3, -2, -2, 0, 3, 6, 9, 3, 0, -4, -6, -7, -7, -6, -5, -3, 2, 2, 3, 5, 7, 10, 14, 4, 4, 1, -3, -5, -5, -4, -2, 1, 5, 6, 7, 8, 10, 14, 18, 8, 5, 4, 0, -2, -2, -2, 0, 3, 8, 9, 10, 10, 13, 18, 20, 11, 7, 6, 3, 0, -1, 0, 2, 5, 11, 11, 12, 13, 16, 20, 22, 15, 8, 9, 7, 4, 2, 2, 4, 7, 13, 14, 15, 17, 18, 21, 23, 18, 9, 11, 10, 7, 6, 4, 6, 9, 14, 16, 18, 19, 20, 22, 24, 20, 10, 12, 13, 11, 10, 8, 9, 11, 15, 18, 19, 20, 22, 23, 25, 22, 12, 14, 15, 14, 14, 13, 13, 13, 17, 19, 20, 22, 23, 25, 28, 25, 16, 16, 18, 18, 17, 16, 15, 16, -16, -16, -15, -10, -5, -1, -2, -3, -8, -12, -13, -12, -11, -10, -9, -9, -14, -14, -13, -8, -3, 2, 1, -2, -6, -10, -12, -12, -11, -10, -9, -9, -13, -12, -11, -7, -1, 4, 3, -1, -5, -8, -11, -12, -11, -11, -9, -8, -12, -11, -10, -5, 0, 5, 4, 0, -4, -7, -10, -11, -12, -11, -9, -7, -10, -9, -8, -4, 1, 6, 6, 0, -3, -6, -8, -10, -11, -10, -8, -6, -9, -8, -6, -2, 2, 7, 7, 1, -3, -5, -7, -9, -10, -9, -7, -5, -7, -6, -4, 0, 4, 8, 9, 2, -2, -5, -6, -7, -8, -8, -6, -3, -3, -2, -1, 2, 6, 10, 11, 4, 0, -3, -5, -6, -7, -6, -4, -2, 2, 3, 4, 7, 10, 14, 15, 6, 4, 1, -3, -5, -6, -5, -2, 2, 5, 6, 7, 10, 13, 18, 19, 10, 7, 5, -1, -3, -4, -2, 1, 5, 8, 9, 11, 13, 17, 22, 22, 13, 8, 8, 4, -1, -1, 0, 3, 6, 11, 12, 13, 15, 20, 25, 25, 16, 10, 11, 8, 3, 1, 2, 5, 8, 13, 14, 16, 18, 22, 26, 27, 19, 11, 13, 12, 8, 5, 5, 7, 10, 15, 16, 18, 21, 23, 27, 28, 21, 12, 14, 14, 12, 10, 9, 11, 13, 16, 18, 20, 22, 24, 27, 29, 23, 14, 16, 17, 16, 14, 13, 14, 15, 17, 20, 21, 23, 25, 29, 31, 26, 16, 17, 19, 19, 18, 17, 17, 17, -18, -16, -12, -6, 0, 1, -1, -2, -5, -9, -11, -10, -9, -9, -8, -9, -17, -14, -10, -4, 2, 4, 1, -1, -3, -8, -10, -10, -10, -9, -9, -8, -15, -13, -8, -2, 4, 6, 4, 0, -2, -6, -10, -11, -10, -10, -9, -7, -13, -11, -6, 
-1, 5, 7, 5, 1, -1, -4, -8, -10, -11, -10, -8, -6, -11, -9, -6, 0, 6, 9, 6, 2, 1, -3, -7, -9, -10, -9, -8, -5, -10, -8, -4, 1, 6, 10, 7, 3, 1, -2, -6, -8, -9, -8, -6, -3, -8, -6, -2, 3, 8, 11, 8, 4, 2, -1, -5, -7, -8, -7, -4, -2, -3, -2, 1, 5, 10, 13, 11, 6, 4, 0, -4, -6, -6, -6, -3, 0, 2, 3, 6, 9, 14, 18, 15, 8, 7, 3, -3, -5, -6, -4, -1, 4, 5, 6, 9, 12, 18, 22, 19, 12, 9, 7, 0, -4, -4, -2, 2, 7, 8, 9, 12, 15, 21, 26, 22, 15, 10, 11, 5, -1, -2, 0, 5, 9, 10, 11, 15, 18, 25, 29, 25, 17, 12, 13, 10, 4, 2, 4, 8, 11, 13, 14, 17, 21, 27, 31, 27, 19, 13, 15, 14, 10, 6, 8, 11, 14, 15, 15, 19, 23, 28, 32, 29, 21, 14, 16, 17, 14, 11, 11, 15, 17, 16, 18, 22, 25, 28, 32, 30, 23, 15, 17, 19, 18, 16, 16, 18, 19, 18, 20, 23, 26, 28, 32, 32, 25, 16, 18, 20, 21, 20, 20, 21, 22, -18, -15, -8, -1, 2, 1, -1, 0, -2, -6, -8, -8, -8, -8, -8, -8, -17, -13, -6, 2, 6, 4, 2, 1, -1, -4, -8, -8, -9, -9, -8, -7, -16, -12, -5, 3, 8, 7, 4, 2, 1, -3, -7, -9, -10, -9, -8, -6, -14, -9, -3, 4, 8, 9, 6, 3, 2, -1, -6, -9, -10, -9, -7, -4, -12, -8, -2, 4, 9, 10, 7, 4, 4, 1, -4, -8, -9, -8, -5, -2, -10, -6, 0, 5, 10, 11, 8, 5, 5, 2, -3, -7, -8, -6, -2, 1, -8, -4, 2, 6, 12, 12, 9, 7, 6, 3, -2, -6, -7, -3, 0, 3, -3, -1, 4, 8, 13, 14, 11, 8, 7, 4, -2, -5, -5, -1, 2, 6, 1, 4, 8, 13, 17, 19, 14, 10, 10, 6, -1, -4, -4, 0, 5, 10, 4, 8, 12, 16, 22, 23, 18, 13, 12, 9, 2, -2, -2, 3, 8, 13, 6, 10, 14, 19, 25, 26, 22, 17, 14, 13, 6, 1, 0, 6, 11, 16, 9, 12, 16, 22, 28, 29, 24, 19, 14, 15, 12, 6, 5, 10, 15, 19, 11, 14, 19, 25, 31, 31, 26, 19, 15, 17, 16, 12, 11, 14, 19, 22, 13, 17, 22, 27, 33, 33, 27, 21, 15, 18, 18, 17, 14, 17, 22, 24, 15, 19, 24, 29, 35, 34, 28, 22, 15, 18, 21, 20, 19, 20, 25, 27, 18, 22, 26, 29, 34, 35, 30, 24, 17, 20, 23, 24, 24, 24, 27, 27, -18, -10, -2, 3, 3, 1, 0, 1, 1, -3, -6, -6, -7, -7, -7, -6, -16, -8, 1, 6, 7, 4, 2, 2, 2, -1, -6, -7, -7, -8, -7, -5, -15, -6, 2, 8, 9, 7, 5, 4, 4, 1, -4, -7, -8, -8, -6, -2, -13, -4, 3, 9, 11, 9, 7, 5, 5, 2, -2, -7, -8, -6, -2, 1, -11, -3, 4, 9, 12, 11, 8, 6, 
6, 4, -1, -6, -6, -2, 1, 4, -9, -1, 5, 10, 13, 12, 9, 7, 8, 6, 0, -5, -3, 1, 4, 7, -7, 1, 6, 11, 13, 12, 10, 8, 9, 7, 2, -2, 0, 4, 7, 9, -2, 4, 8, 12, 15, 14, 11, 9, 10, 9, 4, -1, 2, 6, 9, 12, 2, 7, 13, 17, 20, 19, 15, 12, 13, 11, 3, 1, 5, 9, 14, 16, 5, 10, 16, 21, 24, 23, 19, 15, 16, 14, 5, 3, 8, 14, 18, 20, 7, 12, 18, 24, 27, 25, 21, 18, 17, 15, 10, 6, 11, 18, 22, 23, 9, 14, 21, 27, 30, 28, 24, 20, 16, 18, 15, 12, 15, 22, 26, 26, 11, 16, 23, 30, 32, 30, 25, 21, 16, 20, 19, 17, 19, 25, 29, 29, 13, 19, 26, 33, 35, 32, 26, 22, 18, 21, 21, 20, 21, 27, 31, 32, 15, 22, 30, 36, 38, 34, 27, 23, 19, 22, 24, 23, 25, 29, 33, 34, 17, 24, 33, 39, 39, 36, 29, 25, 21, 23, 27, 27, 29, 31, 34, 34, -17, -3, 3, 4, 3, 0, 2, 3, 3, 0, -3, -4, -5, -6, -5, -3, -15, -1, 6, 8, 6, 4, 3, 4, 5, 2, -2, -4, -5, -5, -2, 1, -14, 1, 8, 10, 9, 7, 6, 5, 6, 4, 0, -5, -5, -2, 2, 5, -12, 3, 9, 12, 12, 10, 8, 6, 8, 6, 2, -4, -2, 2, 6, 9, -10, 5, 10, 13, 13, 11, 9, 7, 9, 9, 4, -1, 2, 7, 9, 11, -8, 6, 11, 14, 14, 12, 10, 8, 11, 11, 6, 3, 6, 10, 11, 13, -6, 8, 12, 15, 15, 13, 11, 9, 12, 13, 9, 8, 10, 12, 13, 15, -2, 9, 13, 16, 17, 15, 13, 10, 14, 15, 11, 12, 14, 16, 18, 20, 3, 13, 18, 20, 21, 19, 15, 14, 16, 17, 14, 16, 17, 20, 22, 25, 5, 16, 21, 24, 24, 22, 19, 16, 18, 19, 17, 19, 21, 23, 27, 29, 7, 19, 23, 27, 27, 24, 21, 18, 19, 20, 20, 21, 25, 27, 31, 33, 9, 21, 26, 30, 29, 27, 24, 20, 20, 22, 22, 23, 28, 30, 34, 35, 11, 23, 29, 33, 32, 30, 25, 22, 22, 24, 24, 25, 30, 34, 36, 36, 14, 26, 33, 36, 36, 31, 26, 23, 23, 25, 27, 27, 31, 35, 38, 38, 16, 29, 37, 39, 38, 33, 27, 24, 25, 26, 30, 31, 33, 37, 40, 40, 18, 33, 40, 42, 40, 35, 29, 26, 27, 28, 31, 34, 36, 38, 42, 41 } }; const int8_t dav2d_gdf_inter_error[5][6][1000] = { { { 6, 5, 4, 3, 2, 3, 4, 5, 5, 5, 6, 5, 4, 3, 2, 3, 3, 3, 3, 3, 6, 4, 3, 2, 2, 2, 3, 2, 2, 3, 4, 3, 2, 1, 1, 1, 1, 1, 2, 2, 3, 2, 1, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 0, -1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 
1, 1, 0, 0, -1, 0, 0, 6, 5, 4, 3, 2, 2, 4, 4, 4, 5, 5, 5, 3, 2, 1, 2, 3, 3, 4, 4, 4, 4, 3, 2, 1, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1, 1, 1, 2, 2, 2, 3, 2, 1, 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 1, 0, 0, -1, 0, 0, 2, 1, 1, 1, 1, 1, 0, -1, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, 0, 0, 0, 1, 1, 1, 1, 0, -1, -1, 0, 5, 5, 4, 3, 2, 1, 3, 4, 5, 4, 3, 3, 3, 1, 1, 2, 2, 4, 4, 4, 3, 2, 2, 1, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 2, 3, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 3, 2, 2, 1, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 1, 1, 1, 0, 0, -1, -1, -1, 1, 0, 1, 1, 1, 1, 0, -1, -1, -1, -1, 0, 1, 1, 1, 1, 0, -1, -2, -2, 4, 3, 3, 2, 1, 1, 3, 3, 5, 4, 4, 3, 2, 2, 1, 2, 3, 3, 4, 4, 3, 2, 2, 1, 1, 2, 2, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3, 2, 2, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 0, 0, -1, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 1, 1, 0, -1, -1, -2, 4, 3, 2, 2, 1, 1, 3, 3, 3, 4, 3, 3, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, 4, 3, 2, 1, 0, 0, 1, 1, 1, 2, 3, 3, 1, 1, 0, 0, 0, 1, 1, 1, 3, 2, 1, 0, 0, 0, 0, 0, 1, 1, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, 0, 0, 0, 0, -1, 4, 3, 2, 1, 0, -1, -1, 0, 0, 1, 3, 2, 1, 1, 0, -1, -1, -1, 0, 1, 3, 2, 1, 1, 0, -1, -1, -1, 0, 0, 3, 2, 1, 1, 0, -1, -1, -1, 0, 0, 2, 2, 1, 0, 0, -1, -1, -1, -1, -1, 1, 2, 1, 0, 0, 0, -1, -1, 0, -1, 1, 1, 1, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, -1, -1, 0, -1, -1, -1, -2, -1, -1, -2, -1, -1, -1, 0, 0, -1, -3, -3, -2, -2, -2, -1, -1, 0, -1, -1, 5, 4, 2, 2, 1, -1, -2, -1, -1, 0, 3, 2, 2, 2, 0, -1, -2, -2, -1, 0, 3, 3, 2, 1, 0, -1, 
-2, -2, -1, -1, 3, 2, 1, 1, 0, -1, -2, -2, -1, -1, 2, 2, 1, 1, 0, -1, -2, -2, -1, -1, 1, 2, 1, 1, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1, -3, -2, -2, -2, -2, -1, -1, -1, -1, -1, -4, -4, -4, -3, -2, -1, -1, 0, -1, -1, 10, 9, 7, 5, 2, -1, -2, -2, -2, -1, 7, 5, 2, 1, 0, -1, -2, -2, -2, -1, 3, 2, 1, 1, 0, -1, -2, -2, -2, -1, 2, 1, 1, 1, 0, -1, -2, -2, -2, -2, 1, 1, 1, 1, 0, -1, -2, -2, -2, -1, 1, 2, 2, 1, 0, 0, -1, -2, -2, -1, 1, 1, 1, 1, 0, 0, -1, -2, -2, -1, 0, 0, 0, 0, 0, 0, -1, -1, -2, -1, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -1, -1, -1, 0, -1, -1, 15, 14, 13, 11, 9, 6, 3, -1, -3, -3, 12, 11, 9, 7, 4, 1, -2, -2, -2, -2, 8, 7, 4, 2, 1, -1, -2, -3, -3, -2, 4, 2, 1, 1, 0, -1, -2, -3, -2, -2, 2, 1, 1, 1, 0, -1, -2, -2, -2, -1, 1, 1, 0, 0, 0, 0, -1, -2, -2, -2, 0, 0, 0, 0, 0, 0, -1, -2, -2, -2, -1, -1, -1, 0, 0, 0, -1, -1, -2, -2, -2, -1, -2, -1, -1, -1, -1, -1, -1, -2, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0 }, { -5, -5, -4, -3, -2, -1, -1, 0, 0, 0, -5, -5, -4, -3, -2, -1, 0, 0, 0, 0, -4, -4, -3, -3, -2, -1, 0, 1, 1, 1, -2, -2, -1, -1, -2, -1, 0, 1, 1, 1, -1, 0, 0, 0, 0, 0, 0, 1, 1, 1, -1, 0, 1, 1, 1, 0, 0, 1, 1, 1, -1, 0, 2, 2, 1, 1, 1, 1, 1, 1, -1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 2, 4, 5, 4, 4, 3, 3, 2, 2, 2, 5, 6, 6, 6, 5, 4, 4, 3, 3, 2, -4, -4, -3, -2, -1, -1, 0, 0, 0, 0, -4, -4, -3, -2, -1, 0, 0, 0, 0, 1, -4, -4, -3, -3, -1, 0, 0, 1, 1, 1, -3, -2, -2, -2, -1, 0, 1, 1, 1, 1, -2, -1, 0, 0, -1, 0, 1, 1, 1, 1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 0, 1, 3, 3, 3, 2, 2, 1, 1, 1, 3, 4, 5, 4, 4, 3, 3, 2, 2, 2, 6, 6, 6, 6, 5, 4, 4, 3, 3, 2, -4, -3, -2, -1, 0, 0, 0, 0, 0, -1, -4, -3, -3, -2, -1, 0, 0, 0, 1, 0, -3, -3, -3, -2, -1, 0, 0, 1, 1, 1, -3, -3, -2, -2, -1, 0, 1, 1, 1, 1, -2, -1, -1, -1, -1, 0, 1, 1, 1, 1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 3, 3, 3, 2, 2, 2, 1, 1, 4, 4, 5, 4, 4, 4, 3, 3, 2, 1, 5, 6, 6, 5, 5, 4, 4, 4, 3, 2, -3, -2, -1, 0, 0, 0, 0, 0, 0, -1, 
-3, -3, -2, -1, 0, 0, 0, 0, 0, 0, -3, -3, -3, -2, -1, 0, 0, 1, 1, 1, -3, -2, -2, -2, -1, 0, 1, 1, 1, 1, -2, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 3, 3, 3, 3, 2, 1, 1, 1, 3, 4, 4, 4, 4, 3, 3, 2, 1, 1, 5, 5, 5, 4, 4, 4, 4, 3, 2, 1, -3, -2, -1, 1, 1, 1, 0, 0, 0, -1, -3, -2, -1, 0, 0, 0, 0, 0, 0, 0, -3, -2, -2, -1, 0, 0, 0, 0, 1, 1, -2, -2, -2, -1, 0, 0, 0, 1, 1, 1, -2, -1, -1, -1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 3, 3, 3, 3, 2, 1, 1, 1, 5, 4, 4, 4, 3, 3, 3, 2, 1, 1, -3, -1, 0, 0, 1, 1, 0, 0, -1, -1, -3, -2, -1, 0, 1, 0, 0, 0, 0, 0, -2, -2, -1, 0, 0, 0, 0, 0, 0, 0, -2, -2, -2, -1, 0, 0, 0, 0, 1, 1, -2, -1, -1, -1, 0, 0, 1, 1, 1, 1, -1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 1, 1, 0, 0, 1, 4, 4, 4, 3, 2, 2, 1, 1, 1, 1, -2, -1, 0, 0, 0, 0, -1, -1, -2, -2, -2, -1, 0, 0, 0, 0, -1, -1, -1, -1, -2, -2, -1, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 0, 1, -2, -1, -1, 0, 0, 0, 0, 0, 1, 1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0, 0, 0, 0, 1, 4, 3, 3, 2, 1, 0, 0, 0, 0, 1, -2, -1, -1, 0, 0, -1, -1, -2, -3, -3, -2, -1, -1, 0, 0, -1, -2, -2, -2, -2, -2, -2, -1, -1, 0, -1, -1, -1, -1, 0, -2, -2, -1, -1, -1, -1, -1, 0, 0, 0, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, -1, -1, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 3, 3, 2, 1, 0, 0, 0, 0, 0, 1, -2, -1, -1, -1, -1, -1, -2, -3, -4, -4, -2, -2, -1, -1, -1, -2, -2, -2, -3, -2, -2, -2, -1, -1, -1, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 0, -1, -1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 3, 2, 1, 1, 0, 0, -1, 0, 0, 1, -2, -1, -1, -2, -2, -2, -3, -4, -4, -5, -2, -2, -2, 
-2, -2, -2, -2, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, -1, -1, -1, -1, -1, 0, 1, 2, 2, 2, -1, -1, -1, -1, 0, 0, 1, 2, 2, 3, 1, 0, 0, 0, 0, 0, 0, 1, 2, 3, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2 }, { 0, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 0, -1, 0, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 2, 2, 2, 0, -1, -1, 2, 2, 1, 2, 2, 2, 1, 0, -1, -1, 1, 1, 1, 1, 1, 2, 1, 0, -1, -1, 0, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, -1, -2, -2, -1, -1, -3, -2, -1, -1, -1, -2, -3, -3, -2, -1, -5, -3, -2, -1, -2, -2, -3, -3, -3, -2, -6, -5, -3, -3, -3, -4, -4, -4, -4, -3, 0, -1, -1, -2, -2, -2, -2, -2, -1, 1, 1, 0, 0, -1, 0, 0, -1, -1, 0, 0, 2, 1, 0, 1, 1, 1, 0, -1, -1, 0, 3, 2, 1, 2, 2, 2, 0, -1, -1, -1, 2, 2, 2, 2, 2, 2, 0, -1, -1, -1, 0, 0, 0, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -2, -1, -1, -2, -2, -2, -2, -2, -2, -3, -2, -2, -1, -3, -3, -3, -2, -2, -3, -3, -4, -2, 0, -5, -4, -3, -2, -3, -4, -4, -4, -2, -1, 0, 0, -1, -1, -1, -2, -2, -1, 0, 1, 1, 0, -1, -1, -1, -1, -1, -1, 0, 1, 2, 1, 0, 0, 0, 0, -1, -1, -1, 0, 3, 2, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 2, 2, 2, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -2, -1, -2, -2, -2, -2, -3, -3, -4, -4, -2, -1, -3, -3, -2, -2, -3, -4, -4, -4, -2, -1, 1, 0, -1, -1, -1, -1, -1, 0, 1, 2, 2, 0, -1, -1, -1, -1, -1, -1, 1, 2, 2, 1, 0, 0, 0, -1, -1, 0, 0, 1, 3, 1, 1, 0, 0, 0, -1, -1, 0, 0, 2, 2, 1, 1, 1, 0, -1, -1, -1, -1, 1, 1, 1, 1, 0, 0, -1, -1, -1, -1, 1, 0, 0, 0, 0, -1, -2, -1, -2, -2, 0, 0, -1, -1, -1, -2, -3, -3, -2, -2, -1, -1, -2, -1, -2, -3, -4, -4, -4, -3, -2, -2, -2, -1, -2, -3, -4, -5, -5, -4, 0, 0, -1, -1, -1, -1, -1, 1, 2, 3, 1, 0, 0, 0, -1, -1, 0, 1, 2, 3, 2, 1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, -1, -1, -1, -1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -2, 1, 0, 0, 0, -1, -2, -2, -2, 
-2, -2, 0, 0, 0, 0, -2, -3, -3, -4, -4, -4, 0, 0, 0, -1, -2, -3, -4, -4, -5, -5, -1, -1, -1, -1, -1, -1, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 0, 2, 4, 5, 0, 1, 0, 0, 0, 0, 1, 3, 3, 3, 1, 1, 1, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 1, 1, 0, 0, -1, -2, -2, -2, -2, -2, 1, 0, 0, -1, -2, -3, -3, -3, -3, -3, 0, 0, 0, -1, -2, -3, -3, -4, -4, -5, -2, -1, -1, -1, -1, -1, 0, 2, 4, 7, -1, -1, 0, -1, -1, 0, 1, 3, 5, 6, -1, 0, 0, 0, 0, 1, 3, 4, 5, 5, 0, 0, 1, 0, 1, 2, 3, 4, 3, 4, 0, 1, 2, 1, 1, 2, 3, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, -1, -2, -2, -2, -2, -1, -1, 1, 0, -1, -2, -3, -3, -3, -3, -3, -3, 0, 0, -1, -2, -3, -3, -4, -4, -4, -5, -2, -1, -1, -2, -2, -1, 1, 3, 5, 8, -1, -1, -1, -1, -1, 0, 2, 4, 7, 8, -1, 0, 0, 0, 0, 1, 3, 6, 6, 7, 0, 0, 1, -1, 0, 2, 4, 5, 5, 5, 0, 1, 1, 0, 1, 3, 4, 3, 3, 3, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 1, 1, 0, -1, 0, 0, 0, 1, 1, 1, 1, 0, -1, -2, -1, -2, -2, -1, -1, -1, 1, 0, -2, -3, -3, -3, -3, -3, -3, -3, 0, -1, -2, -3, -3, -4, -4, -4, -4, -4, -1, -1, -2, -2, -2, 0, 1, 4, 6, 9, -1, -1, -1, -2, -1, 1, 3, 5, 8, 9, 0, 0, -1, -2, -1, 2, 4, 7, 8, 8, 0, 0, -1, -2, -1, 3, 5, 6, 6, 7, 0, 0, -1, -1, 0, 2, 4, 4, 5, 5, 0, 0, 0, -1, -1, 1, 3, 3, 3, 3, 0, 0, -1, 0, 0, 1, 1, 1, 2, 2, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 1, -1, -2, -2, -2, -2, -3, -2, -2, -2, 0, -2, -3, -3, -4, -4, -4, -4, -4, -4, -1, -2, -2, -3, -2, 0, 2, 5, 7, 10, 0, -2, -2, -3, -2, 1, 3, 6, 9, 11, 0, -1, -2, -3, -2, 2, 5, 7, 9, 10, 0, -1, -2, -3, -1, 2, 6, 7, 8, 8, 0, -1, -2, -2, -2, 1, 5, 6, 6, 6, 0, -1, -1, -2, -2, 1, 3, 4, 4, 4, 0, -1, -1, -1, 0, 1, 2, 2, 3, 2, 0, -1, -1, 0, 0, 0, 0, 0, 1, 1, 0, -1, -2, -2, -2, -2, -2, -2, -1, -1, 1, -1, -3, -3, -3, -3, -3, -3, -3, -3 }, { 5, 2, 2, 3, 3, 4, 3, 1, -1, -2, 6, 3, 3, 3, 3, 4, 3, 1, 0, -2, 4, 4, 4, 4, 3, 3, 3, 1, 0, -2, 3, 2, 2, 2, 2, 3, 3, 1, -1, -2, 1, 0, 0, 0, 1, 2, 2, 0, -2, -3, -1, -2, -1, -1, -1, 0, 1, -1, -2, -4, -3, -3, -3, -3, -2, 
-1, -1, -2, -4, -5, -5, -5, -5, -5, -4, -3, -2, -3, -4, -5, -6, -6, -6, -6, -5, -4, -3, -4, -5, -6, -7, -7, -7, -6, -5, -5, -4, -4, -5, -6, 7, 4, 2, 3, 4, 4, 3, 2, 0, -1, 7, 5, 3, 4, 4, 4, 3, 1, 0, -1, 6, 5, 5, 5, 4, 3, 2, 1, 0, -1, 4, 4, 3, 3, 3, 3, 2, 1, 0, -2, 2, 2, 2, 1, 1, 1, 2, 0, -1, -3, 1, 0, 0, -1, -1, 0, 1, -1, -2, -4, 0, -2, -2, -2, -2, -1, -1, -2, -3, -5, -2, -3, -3, -3, -3, -3, -2, -3, -4, -5, -3, -4, -4, -4, -4, -4, -3, -4, -4, -5, -4, -4, -5, -4, -4, -4, -3, -3, -4, -6, 7, 4, 2, 3, 3, 3, 2, 2, 0, -1, 6, 6, 4, 4, 4, 4, 2, 1, 0, -1, 6, 5, 5, 5, 4, 3, 2, 1, 0, -1, 5, 5, 4, 4, 3, 3, 2, 1, 0, -1, 3, 3, 3, 2, 2, 2, 2, 0, -1, -2, 2, 2, 1, 1, 0, 0, 0, -1, -2, -3, 1, 1, 0, -1, -1, -1, 0, -2, -3, -4, 0, -1, -1, -1, -2, -1, -1, -3, -4, -5, -1, -2, -2, -2, -2, -2, -2, -3, -4, -5, -2, -2, -3, -3, -3, -2, -2, -3, -4, -6, 6, 5, 3, 1, 2, 2, 2, 1, 0, 0, 6, 5, 4, 3, 3, 3, 2, 1, 0, -1, 6, 5, 4, 4, 4, 3, 2, 0, -1, -1, 5, 5, 4, 4, 3, 2, 2, 0, -1, -2, 4, 4, 3, 3, 3, 2, 2, 0, -1, -2, 3, 2, 2, 2, 1, 1, 1, -1, -2, -2, 2, 1, 1, 1, 0, 0, 0, -1, -3, -3, 1, 1, 0, 0, 0, -1, -1, -2, -3, -5, 0, 0, 0, -1, -1, -1, -1, -3, -4, -5, 0, -1, -1, -1, -1, -1, -1, -3, -4, -6, 6, 5, 3, 1, 1, 2, 1, 1, 1, 1, 5, 4, 4, 2, 1, 2, 2, 0, 0, 0, 5, 4, 4, 3, 3, 2, 1, 0, -1, -1, 5, 4, 3, 3, 2, 2, 0, 0, -1, -2, 4, 3, 3, 2, 2, 1, 0, -1, -2, -2, 3, 3, 2, 2, 1, 0, 0, -1, -2, -2, 2, 2, 1, 1, 1, -1, -1, -3, -3, -4, 1, 1, 1, 0, 0, -1, -2, -3, -4, -5, 1, 0, 0, 0, -1, -1, -2, -3, -4, -5, 0, 0, -1, -1, -1, -2, -2, -4, -4, -6, 5, 4, 4, 2, 0, 0, 1, 2, 3, 2, 5, 4, 3, 2, 1, 1, 2, 2, 2, 1, 5, 4, 3, 2, 1, 1, 1, 1, 1, 0, 4, 4, 3, 2, 1, 1, 1, 0, 0, -1, 3, 3, 2, 2, 1, 0, 0, 0, -1, -2, 3, 2, 2, 1, 1, -1, -1, -1, -2, -3, 2, 1, 1, 0, 0, -2, -2, -4, -4, -4, 1, 1, 0, 0, -1, -2, -3, -5, -5, -5, 1, 0, -1, -1, -2, -3, -4, -5, -6, -6, 0, -1, -1, -2, -3, -3, -4, -6, -5, -7, 5, 4, 4, 2, 0, -1, 1, 4, 5, 5, 5, 4, 3, 2, 1, -1, 2, 4, 3, 3, 4, 4, 3, 2, 1, 1, 2, 3, 2, 2, 4, 3, 3, 2, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 1, 1, 1, 1, 0, -1, 3, 2, 1, 
1, 0, 0, 0, 0, -1, -1, 2, 1, 0, 0, -2, -2, -2, -3, -3, -4, 1, 0, 0, -1, -2, -3, -4, -5, -6, -6, 1, 0, -1, -2, -3, -4, -5, -6, -6, -6, 0, -1, -2, -3, -4, -5, -6, -6, -6, -7, 5, 4, 4, 2, -1, -1, 1, 5, 7, 7, 5, 4, 4, 2, 1, 1, 2, 5, 6, 6, 5, 4, 3, 2, 2, 2, 3, 4, 4, 4, 4, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 1, 1, 1, 2, 2, 2, 1, 3, 2, 1, 1, 0, 0, 0, 1, 1, 0, 2, 1, 0, -1, -2, -2, -2, -1, -2, -3, 1, 0, -1, -2, -3, -4, -4, -4, -4, -5, 0, -1, -2, -3, -4, -5, -5, -6, -6, -6, 0, -1, -3, -4, -5, -6, -6, -7, -6, -7, 6, 4, 3, 2, 2, 1, 2, 6, 10, 10, 5, 4, 3, 3, 3, 3, 4, 6, 8, 8, 5, 4, 3, 3, 3, 4, 4, 6, 7, 6, 4, 4, 3, 3, 3, 3, 3, 4, 5, 4, 4, 3, 2, 2, 1, 1, 1, 2, 4, 3, 3, 2, 1, 1, 0, 0, 0, 1, 2, 1, 2, 1, 0, -1, -2, -2, -2, -1, 0, -1, 1, 0, -1, -3, -3, -4, -4, -3, -3, -3, 0, -1, -2, -4, -5, -5, -6, -5, -5, -5, 0, -2, -3, -4, -5, -6, -7, -6, -5, -7, 6, 4, 4, 4, 4, 4, 5, 7, 10, 13, 5, 4, 4, 4, 5, 5, 6, 7, 10, 11, 5, 4, 4, 4, 5, 5, 6, 6, 8, 10, 4, 4, 3, 3, 4, 3, 3, 3, 5, 8, 4, 3, 2, 2, 2, 1, 1, 1, 3, 5, 3, 2, 2, 1, 1, 0, 0, 0, 2, 3, 2, 1, 1, 0, -1, -2, -2, -1, 1, 1, 1, 0, -1, -2, -3, -3, -3, -2, -1, -1, 0, -1, -3, -4, -5, -5, -5, -4, -3, -2, 0, -2, -4, -5, -6, -6, -6, -4, -3, -5 }, { -8, -9, -9, -10, -8, -6, -4, -5, -7, -8, -6, -8, -8, -7, -6, -5, -2, -4, -6, -7, -5, -7, -7, -6, -5, -4, -2, -3, -4, -5, -5, -7, -7, -6, -4, -3, -1, -2, -3, -3, -5, -6, -6, -5, -3, -2, 0, -1, -2, -2, -5, -6, -6, -5, -3, -2, 0, 0, 0, -1, -4, -6, -6, -4, -3, -2, 1, 2, 2, 1, -4, -5, -5, -5, -3, -1, 2, 3, 3, 3, -4, -5, -5, -5, -3, 0, 3, 4, 5, 5, -4, -5, -5, -4, -3, 0, 3, 5, 6, 7, -7, -9, -10, -11, -11, -7, -5, -4, -4, -4, -6, -8, -8, -9, -9, -6, -4, -3, -3, -3, -4, -7, -7, -7, -7, -5, -3, -2, -2, -1, -4, -6, -7, -7, -6, -4, -2, -2, -2, 0, -4, -6, -6, -6, -5, -3, -1, -1, 0, 1, -4, -6, -6, -6, -4, -2, 0, 1, 1, 2, -3, -5, -6, -6, -4, -2, 1, 2, 3, 4, -2, -5, -5, -5, -3, -1, 2, 3, 4, 5, -2, -5, -5, -5, -3, -1, 2, 4, 6, 7, -1, -4, -4, -4, -4, -1, 2, 4, 6, 8, -7, -9, -11, -11, -11, -9, -6, -2, -2, -2, -6, -8, -8, -9, 
-10, -8, -5, -2, -1, 0, -4, -6, -7, -7, -8, -7, -4, -2, -1, 0, -4, -6, -7, -6, -7, -6, -4, -1, 0, 1, -3, -5, -6, -6, -6, -4, -2, -1, 1, 2, -3, -5, -6, -6, -5, -3, -1, 1, 3, 3, -3, -4, -5, -5, -5, -2, 0, 3, 4, 5, -2, -4, -5, -5, -4, -2, 1, 4, 5, 6, -1, -4, -4, -4, -3, -2, 1, 4, 6, 8, 1, -2, -3, -3, -3, -2, 1, 4, 5, 8, -8, -10, -11, -11, -11, -10, -7, -1, -1, 0, -6, -8, -9, -9, -10, -9, -6, -1, 0, 0, -4, -6, -7, -8, -8, -8, -5, -1, 1, 1, -3, -5, -6, -6, -7, -7, -5, -1, 1, 2, -3, -5, -5, -6, -6, -5, -3, 0, 2, 3, -2, -4, -5, -5, -5, -5, -2, 2, 3, 4, -2, -4, -5, -5, -5, -3, 0, 3, 5, 5, -1, -3, -4, -4, -4, -3, 0, 3, 5, 7, 0, -3, -3, -3, -3, -2, 0, 3, 5, 8, 2, -1, -2, -2, -2, -2, 0, 2, 4, 9, -8, -10, -11, -12, -11, -9, -6, -2, -1, 0, -6, -8, -9, -9, -10, -9, -5, -2, 0, 0, -4, -6, -7, -8, -8, -8, -5, -2, 0, 1, -3, -5, -6, -6, -7, -7, -4, -2, 1, 2, -2, -4, -5, -5, -5, -5, -4, -1, 2, 3, -1, -3, -4, -5, -5, -4, -2, 0, 3, 4, -1, -3, -4, -4, -4, -3, -1, 1, 4, 5, 0, -2, -4, -4, -3, -3, -1, 1, 4, 7, 1, -1, -2, -2, -2, -2, -1, 1, 4, 7, 3, 1, 0, -1, -1, -1, 0, 1, 4, 8, -7, -10, -11, -12, -11, -9, -6, -3, -2, 0, -6, -8, -9, -10, -10, -8, -5, -2, -1, 1, -4, -6, -7, -8, -8, -8, -5, -2, 0, 2, -2, -5, -5, -6, -7, -7, -5, -3, 0, 3, -2, -4, -4, -5, -5, -6, -5, -2, 1, 3, -1, -3, -4, -4, -4, -4, -3, -1, 3, 4, 0, -2, -3, -4, -4, -3, -2, 0, 4, 5, 1, -2, -3, -3, -3, -2, -1, 0, 4, 6, 3, 0, -1, -2, -2, -2, 0, 1, 4, 7, 5, 2, 1, 0, 0, 0, 1, 1, 4, 8, -7, -10, -12, -13, -13, -10, -7, -5, -3, 0, -5, -8, -10, -11, -11, -10, -6, -5, -3, 1, -3, -6, -7, -8, -9, -9, -6, -4, -2, 2, -2, -4, -5, -7, -8, -8, -6, -4, -1, 3, -1, -3, -4, -5, -5, -6, -5, -3, 0, 3, 0, -3, -4, -4, -4, -4, -3, -1, 2, 4, 1, -2, -3, -4, -3, -3, -1, 0, 2, 5, 2, -1, -3, -3, -2, -2, 0, 1, 3, 6, 4, 1, -1, -1, -1, -1, 1, 1, 3, 6, 7, 4, 2, 1, 1, 1, 2, 2, 4, 7, -6, -11, -12, -13, -13, -11, -8, -7, -5, 0, -4, -8, -10, -11, -11, -10, -7, -6, -4, 0, -3, -5, -7, -8, -9, -9, -6, -5, -4, 1, -1, -4, -5, -6, -7, -8, -6, -4, -2, 2, 0, -3, -4, -4, -5, 
-6, -4, -2, -1, 3, 1, -2, -3, -3, -3, -3, -2, 0, 1, 3, 1, -2, -3, -3, -2, -2, 0, 1, 1, 4, 3, -1, -2, -2, -1, -1, 1, 1, 2, 5, 6, 2, 0, 0, 0, 0, 2, 2, 3, 6, 9, 5, 3, 2, 2, 2, 3, 2, 3, 7, -5, -10, -12, -13, -13, -11, -10, -9, -6, -1, -3, -7, -9, -10, -11, -10, -8, -7, -5, -1, -1, -4, -6, -8, -8, -8, -6, -5, -4, 1, 0, -2, -4, -5, -6, -6, -5, -4, -3, 1, 2, -1, -2, -3, -4, -5, -3, -2, -2, 2, 3, 0, -1, -1, -2, -2, 0, 0, 0, 3, 4, 0, -1, -1, 0, 0, 2, 1, 0, 3, 6, 1, 0, 0, 1, 1, 2, 1, 1, 4, 8, 4, 2, 2, 3, 3, 3, 2, 2, 5, 11, 7, 5, 5, 5, 5, 4, 2, 3, 6, -5, -9, -11, -12, -12, -12, -11, -10, -8, -2, -2, -6, -8, -9, -10, -9, -8, -8, -7, -2, 1, -3, -5, -6, -7, -7, -6, -5, -5, 0, 2, -1, -3, -4, -4, -4, -4, -4, -3, 1, 4, 1, 0, -1, -1, -2, -1, -1, -2, 1, 5, 3, 2, 1, 1, 1, 1, 0, -1, 2, 7, 4, 2, 3, 3, 3, 3, 1, -1, 3, 9, 4, 3, 4, 4, 4, 4, 2, 0, 4, 11, 7, 5, 5, 6, 5, 5, 3, 1, 5, 14, 10, 8, 8, 8, 7, 6, 3, 3, 6 }, { 1, 1, 1, -1, -2, -10, -14, -15, -11, -3, 0, 1, 0, -1, -1, -8, -12, -12, -8, -1, 0, 0, 0, -1, -1, -6, -9, -8, -4, 1, -1, -1, -1, -1, -1, -3, -6, -4, 1, 3, -1, -1, -1, -1, 0, 0, -2, 1, 5, 6, 0, -1, -1, 0, 1, 1, 1, 3, 6, 7, 0, -1, 1, 4, 5, 4, 4, 6, 7, 8, 1, 2, 5, 8, 10, 8, 6, 7, 6, 6, 2, 6, 10, 12, 13, 11, 7, 7, 6, 6, 4, 10, 13, 15, 15, 12, 9, 7, 6, 7, -2, -1, 0, 0, -2, -10, -14, -14, -9, -1, -2, -1, 0, 0, -1, -8, -11, -11, -6, 0, -2, -1, 0, 0, 0, -6, -9, -8, -2, 2, -2, -1, 0, 0, 0, -4, -6, -4, 1, 4, -1, 0, 0, 0, 1, -1, -2, 0, 5, 6, 0, 0, 1, 1, 2, 2, 1, 3, 6, 7, -1, 1, 3, 5, 7, 5, 3, 5, 6, 7, -1, 4, 6, 8, 11, 8, 5, 6, 6, 7, 2, 8, 11, 12, 12, 10, 7, 7, 6, 7, 3, 12, 14, 13, 12, 11, 9, 7, 7, 8, -6, -4, -2, -2, -2, -10, -13, -13, -9, -1, -5, -3, -2, -1, -1, -8, -11, -11, -6, 1, -6, -3, -1, 0, -1, -6, -9, -5, -1, 3, -7, -2, -1, 0, 0, -4, -4, -1, 2, 4, -6, -2, -1, 0, 1, -2, -2, 1, 4, 6, -4, -1, 1, 2, 3, 1, 1, 3, 6, 7, -5, 0, 2, 5, 7, 4, 3, 4, 6, 7, -7, 3, 6, 8, 9, 6, 4, 5, 6, 8, -6, 6, 10, 11, 10, 8, 6, 6, 7, 9, -1, 7, 12, 12, 10, 9, 8, 7, 8, 8, -12, -8, -6, -4, -3, -9, -13, -12, -8, 0, 
-11, -6, -4, -3, -2, -8, -11, -8, -3, 2, -10, -5, -4, -2, -2, -6, -7, -3, 3, 4, -10, -4, -3, -2, -1, -4, -3, 1, 5, 6, -9, -4, -2, -1, 0, -3, 0, 3, 5, 6, -9, -2, -1, 1, 2, 0, 1, 3, 5, 7, -12, -2, 1, 4, 4, 2, 1, 4, 6, 8, -15, -2, 5, 6, 5, 4, 3, 4, 6, 9, -13, -1, 6, 8, 6, 6, 5, 6, 7, 9, -1, 0, 7, 10, 6, 8, 7, 7, 8, 9, -15, -14, -11, -8, -6, -9, -12, -11, -5, 2, -13, -12, -8, -6, -5, -9, -10, -5, 1, 5, -11, -9, -6, -5, -4, -8, -7, 0, 5, 8, -10, -7, -5, -5, -4, -6, -4, 3, 6, 8, -8, -6, -4, -3, -3, -5, -2, 3, 5, 7, -10, -5, -2, -1, 0, -1, 0, 2, 5, 7, -16, -8, -1, 1, 0, 0, 0, 3, 5, 8, -17, -8, 0, 3, 1, 2, 3, 4, 6, 9, -12, -7, 0, 4, 2, 3, 6, 6, 7, 9, 2, 0, 2, 5, 2, 4, 7, 7, 8, 9, -13, -17, -16, -13, -9, -13, -14, -8, -3, 4, -10, -13, -13, -11, -9, -12, -12, -5, 2, 8, -8, -10, -10, -9, -8, -11, -10, -3, 4, 10, -6, -7, -8, -7, -7, -10, -6, -1, 6, 10, -5, -4, -6, -5, -5, -7, -3, 1, 5, 8, -6, -4, -4, -3, -3, -4, -1, 1, 4, 8, -11, -7, -2, -1, -2, -2, 0, 2, 5, 10, -10, -7, -1, 0, -2, -1, 2, 3, 5, 10, -5, -5, -1, 1, -1, 1, 4, 5, 7, 11, 3, 1, 2, 1, -1, 2, 4, 6, 8, 11, -11, -15, -19, -16, -14, -15, -20, -13, -2, 7, -8, -11, -15, -15, -13, -14, -19, -11, 0, 10, -5, -8, -12, -13, -12, -13, -16, -9, 2, 12, -3, -5, -8, -11, -11, -12, -13, -7, 4, 11, -1, -4, -6, -9, -10, -10, -7, -2, 3, 8, -3, -4, -5, -6, -6, -5, -3, -1, 3, 8, -5, -3, -2, -3, -4, -3, 0, 1, 3, 10, -5, -3, -1, -1, -3, -1, 1, 3, 5, 10, 0, -2, 1, -1, -2, 0, 2, 4, 7, 10, 5, 3, 4, 2, -2, 0, 2, 4, 6, 10, -9, -13, -17, -18, -15, -16, -22, -23, -10, 7, -6, -9, -13, -16, -15, -15, -19, -20, -6, 10, -3, -6, -10, -15, -14, -14, -16, -16, -3, 11, -1, -4, -7, -12, -13, -13, -12, -11, -1, 10, 0, -3, -6, -9, -12, -11, -6, -4, 1, 7, -2, -3, -5, -6, -6, -6, -2, -1, 1, 2, -3, -2, -2, -2, -4, -3, 0, 1, 3, 7, -2, -1, 0, 0, -2, -1, 1, 2, 5, 10, 2, 1, 1, 1, -2, -1, 1, 3, 5, 9, 6, 5, 4, 1, -2, 0, 1, 3, 5, 8, -6, -11, -15, -18, -16, -16, -21, -21, -16, 3, -3, -7, -11, -16, -16, -16, -17, -17, -12, 6, -1, -4, -8, -14, -15, -16, -14, -13, -7, 7, 
0, -2, -6, -11, -13, -14, -10, -7, -3, 7, 0, -3, -5, -9, -11, -12, -6, -1, 1, 6, -1, -3, -4, -4, -4, -6, -2, 0, 1, -5, -3, -2, -2, -2, -2, -2, 0, 2, 4, 1, 0, 0, 1, 0, -1, 0, 1, 3, 6, 10, 3, 2, 2, 1, -1, 0, 2, 4, 6, 9, 7, 6, 5, 1, -1, 0, 1, 4, 5, 9, -4, -9, -14, -18, -19, -19, -18, -14, -8, 2, -1, -6, -10, -15, -17, -17, -15, -10, -4, 4, 0, -3, -7, -13, -14, -16, -12, -6, -1, 6, 1, -2, -6, -11, -12, -14, -9, -3, 2, 6, 0, -2, -6, -9, -9, -11, -5, 1, 4, 6, -1, -3, -4, -4, -2, -2, 0, 3, 2, -3, -2, -2, -1, -2, -1, 0, 2, 4, 6, -1, -1, 0, 1, 1, 0, 1, 4, 6, 8, 11, 4, 3, 4, 2, 0, 2, 4, 7, 8, 11, 7, 5, 5, 2, 0, 2, 5, 7, 7, 10 }, }, { { 1, 1, 0, 0, 1, 1, -1, -1, 0, 0, 1, 1, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 2, 1, 1, 1, 0, 0, -1, -1, -1, -1, 3, 2, 1, 1, 1, 1, 0, -1, -1, -1, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, -1, -2, -2, 0, 0, 0, 2, 1, 1, 0, -1, -3, -4, -3, -1, 0, 2, 1, 0, 0, -2, -3, -5, -5, -3, -1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, 0, 0, 0, 0, 1, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 2, 2, 1, 0, 0, 0, -1, -1, -1, -1, 2, 2, 1, 1, 0, 0, 0, -1, -1, -1, 2, 2, 1, 0, 0, -1, 0, -1, -1, -1, 2, 1, 1, 0, -1, -2, -2, -1, 0, -1, 2, 1, 0, 0, -1, -3, -4, -3, -2, 0, 1, 0, -1, -1, -2, -4, -4, -4, -3, -2, 1, 1, 1, 1, 0, 0, 0, -1, -1, -1, 0, 0, 1, 0, 0, 0, 0, -1, -1, -1, 1, 0, 1, 0, 0, 0, 0, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 2, 1, 1, 1, 0, -1, -1, -1, -1, -1, 2, 2, 1, 1, 0, -1, -1, -1, -1, -1, 2, 2, 1, 1, 0, -1, -1, -1, -1, -1, 2, 1, 1, 0, -1, -2, -3, -2, -1, -1, 2, 1, 0, 0, -2, -3, -3, -3, -2, -1, 0, 0, -1, -1, -2, -4, -4, -4, -3, -2, 1, 1, 1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, -1, -1, -1, -1, -1, 2, 1, 1, 0, 0, -1, -1, -1, -1, -1, 2, 1, 1, 0, 0, -1, -2, -1, -1, -1, 2, 1, 1, 0, -1, -2, -2, -2, -1, 0, 1, 1, 0, -1, -1, -2, -3, -2, -2, -1, 1, 0, -1, -2, -2, -3, -3, -3, 
-2, -2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 2, 1, 1, 1, 0, -1, -1, 0, -1, -1, 2, 1, 1, 1, 0, -1, -1, 0, 0, -1, 1, 1, 0, 0, 0, -1, -1, -1, 0, 0, 1, 0, 0, 0, -1, -2, -2, -1, -1, 0, 0, 0, -1, -1, -2, -2, -3, -2, -2, -1, 0, 0, 1, 1, 2, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1, 0, 0, 0, 0, -1, -1, -1, 1, 1, 1, 1, 0, 0, 0, 0, -1, -1, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, 0, -1, -1, -2, -2, -1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 0, -1, -1, -1, 1, 1, 0, 0, 0, 0, 0, -1, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 3, 2, 1, 1, 0, 0, 1, 2, 3, 3, 3, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 1, 0, 0, -1, 0, 0, 0, 1, 2, 2, 1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 0, -1, -1, -2, 1, 1, 1, 1, 1, 1, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 4, 4, 3, 2, 1, 1, 0, 1, 2, 3, 4, 3, 2, 1, 1, -1, 0, 1, 1, 2, 3, 3, 1, 0, 0, -1, 0, 0, 0, 1, 2, 2, 1, 0, -1, -2, 0, 0, 0, 1, 1, 1, 0, 0, -1, -1, 1, 1, 0, 1, 1, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 3, 2, 1, 1, 1, 2, 3, 3, 4, 4, 2, 1, 1, 0, 0, 1, 2, 2, 3, 3, 2, 0, -1, -1, 0, 0, 1, 1, 2, 1, 0, 0, -1, -2, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, 1, 0, 0, 1, 1, 0, 0, 0, 0, -1, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, -1, -2, -3, -3, -3, -3, -3, 1, 0, 0, -1, -1, -2, -3, -3, -3, -3, 1, 1, 1, 1, -1, 
-2, -2, -2, -3, -3, 1, 1, 2, 2, 1, -1, -2, -2, -2, -3, 2, 2, 2, 2, 2, 1, -1, -2, -2, -3, 2, 3, 3, 3, 3, 2, 1, -1, -2, -3, 2, 3, 3, 4, 4, 4, 2, 0, -2, -3, 2, 3, 4, 4, 5, 4, 3, 0, -2, -4, 2, 3, 4, 4, 4, 4, 1, -1, -2, -4, 2, 3, 3, 3, 3, 2, 0, -2, -2, -3, 0, 0, 0, -1, -2, -3, -3, -3, -3, -2, 0, 0, 0, -1, -1, -2, -3, -3, -2, -2, 0, 0, 0, 0, -1, -2, -2, -2, -2, -2, 0, 0, 0, 0, -1, -1, -2, -2, -2, -3, 0, 1, 1, 1, 1, 0, -1, -2, -2, -3, 0, 1, 1, 2, 2, 2, 0, -1, -2, -3, 1, 1, 2, 3, 3, 3, 1, -1, -2, -3, 0, 1, 2, 3, 4, 3, 2, 0, -2, -4, 1, 2, 3, 3, 3, 3, 1, -1, -2, -4, 0, 1, 1, 2, 2, 1, 0, -1, -2, -3, 0, 0, -1, -1, -1, -2, -3, -2, -2, -2, 0, 0, -1, -1, -2, -2, -3, -2, -2, -2, 0, 0, 0, -1, -1, -2, -2, -2, -2, -2, 0, 0, 0, 0, -1, -2, -2, -2, -2, -2, 0, 0, 0, 0, 0, -1, -1, -2, -2, -2, 0, 0, 0, 0, 1, 1, 0, -2, -2, -3, 0, 0, 1, 1, 2, 2, 1, -1, -2, -3, 0, 0, 1, 2, 2, 2, 1, -1, -3, -4, 0, 0, 1, 1, 1, 1, 0, -1, -2, -3, 0, 0, 0, 0, 1, 1, -1, -1, -2, -3, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, 0, -1, -1, -1, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -2, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, 0, 0, 0, 0, -1, -1, -2, -2, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, -1, 0, 0, 0, 1, 1, 0, -1, -2, -3, -1, 0, 0, 0, 0, 0, 0, -1, -2, -2, -1, -1, -1, 0, 0, 0, 0, 0, -1, -2, 0, -1, -1, -2, -2, -2, -1, -1, 0, 0, 0, -1, -1, -2, -2, -1, -1, -1, 0, 0, 0, 0, -1, -2, -2, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, 0, -1, -1, -1, -2, -2, -1, -1, 0, 1, 0, 0, -1, -1, -2, -2, -1, 0, 0, 1, 0, 0, 0, -1, -2, -1, -1, 0, 0, 1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, -1, 0, 0, 0, 1, 1, 1, 0, 0, 0, -1, -1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 
0, 1, 1, 0, 0, -1, -1, -2, -1, -1, 0, 1, 1, 1, 0, -1, -1, -1, -1, -1, 0, 1, 1, 1, 1, 0, -1, -1, -1, 0, 0, 1, 1, 1, 1, 0, -1, -1, -1, 0, 0, 1, 1, 1, 1, 1, 0, -1, -1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, 0, 0, 0, 1, 1, 0, 0, -1, -1, -1, -1, 0, 0, 1, 1, 0, 0, -1, -2, -2, -2, -1, 0, 2, 1, 1, 0, -1, -2, -2, -2, -1, 0, 2, 1, 1, 0, -1, -2, -2, -1, -1, 1, 1, 1, 1, 0, -1, -2, -2, -1, 0, 1, 1, 1, 1, 1, 0, -1, -1, -1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, -1, -1, -1, -1, 1, 1, 1, 1, 0, -1, -2, -2, -2, -1, 2, 1, 0, 0, -1, -2, -3, -2, -2, -1, 2, 1, 0, 0, -2, -3, -3, -3, -2, -1, 2, 1, 0, -1, -2, -3, -3, -3, -2, -1, 1, 1, 0, 0, -1, -3, -3, -3, -1, 0, 1, 1, 1, 0, 0, -2, -2, -2, -1, 1, 1, 1, 1, 1, 1, 0, -1, -1, 0, 1, 0, 1, 1, 1, 2, 1, 1, 0, 0, 1, 0, 1, 1, 2, 2, 2, 1, 0, 0, 0, 2, 2, 1, 1, 1, 0, -1, -2, -3, -2, 2, 1, 1, 1, 0, -1, -3, -3, -3, -2, 2, 1, 1, 0, -1, -2, -3, -3, -3, -3, 2, 1, 0, 0, -2, -3, -4, -4, -4, -2, 1, 1, 0, -1, -3, -4, -4, -5, -3, -2, 1, 1, 0, -1, -2, -3, -4, -4, -2, -2, 1, 1, 0, 0, -1, -2, -3, -3, -2, -1, 1, 1, 1, 1, 0, -1, -2, -2, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, 0, 0, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1 }, { 0, -1, -2, -1, -1, 0, 0, 0, -2, -5, 0, -1, -2, -1, 0, 0, 1, 0, -2, -6, 0, -1, -2, -2, -1, 1, 1, 0, -1, -6, 0, -1, -2, -1, 0, 1, 2, 1, -1, -6, 1, 0, -1, 0, 1, 1, 0, 1, -1, -6, 1, 0, 1, 2, 2, 2, 0, -1, -2, -4, 0, 0, 1, 3, 4, 3, 1, -1, -2, -3, -1, 0, 1, 3, 4, 4, 2, 1, 0, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, -2, -2, -1, -1, 0, -1, -1, 0, 0, -1, -2, -2, -1, 0, 0, 0, -1, -4, 0, -1, -2, -2, -1, 0, 1, 0, -1, -5, 1, -1, -2, -2, -1, 0, 0, 0, -2, -5, 1, -1, -2, -2, -1, 0, 1, 0, -2, -5, 1, 0, -1, -1, 0, 1, 0, -1, -2, -4, 1, 0, 0, 1, 2, 2, 0, -2, -2, -3, 0, -1, 0, 2, 3, 3, 1, 0, -1, -2, -1, -1, 0, 1, 3, 3, 2, 1, 0, 0, 0, -1, -1, 0, 1, 1, 1, 1, 1, 1, -1, -1, -2, -2, -2, -1, -1, -1, 0, 
1, 0, -1, -2, -2, -2, -1, 0, 0, 0, -3, 1, -1, -2, -3, -2, -1, 0, 0, -1, -3, 1, 0, -2, -2, -2, -1, 0, 0, -2, -3, 1, 0, -1, -2, -1, -1, 0, -1, -3, -3, 2, 0, 0, -1, -1, 0, 0, -1, -3, -3, 1, 0, 0, 0, 1, 2, 0, -1, -2, -2, 0, 0, 0, 0, 2, 3, 1, 0, 0, 0, 0, -1, -1, 0, 2, 2, 1, 1, 1, 2, 0, 0, -1, 0, 1, 0, 0, 0, 2, 2, 0, -1, -1, -2, -2, -2, -2, 0, 1, 2, 0, -1, -2, -3, -2, -1, -1, 0, 0, -1, 1, -1, -2, -3, -2, -1, 0, 0, -1, -1, 1, 0, -1, -2, -2, -1, 0, -1, -1, -1, 2, 1, -1, -1, -2, -1, 0, -1, -2, -1, 2, 1, 0, 0, -1, 0, 0, -1, -2, -1, 1, 0, 0, 0, 0, 1, 1, -1, -1, -1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 0, -1, -1, 0, 1, 1, 0, 1, 2, 3, 0, 0, -1, -1, 0, -1, -1, 0, 2, 3, 0, 0, -1, -2, -2, -2, -1, 0, 1, 3, -1, -2, -3, -3, -3, -2, -1, 0, 0, 0, 0, -1, -2, -3, -2, -2, -1, 0, 0, 0, 1, 0, -2, -2, -2, -2, -1, 0, 0, 0, 2, 1, -1, -1, -2, -2, -1, 0, 0, 0, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, -1, -1, -1, 0, 0, 1, 2, 3, 1, 0, -1, -1, -1, -1, -1, 0, 2, 3, 1, 1, 0, -1, -2, -2, -1, 0, 2, 3, -3, -4, -4, -3, -3, -2, -1, 0, 0, 1, -1, -3, -3, -3, -3, -2, -1, 0, 0, 1, 0, -2, -2, -3, -3, -2, -1, 0, 0, 1, 0, -1, -1, -1, -2, -2, -1, 0, 0, 1, 1, 0, 0, 0, -1, -1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2, 3, 2, 1, 0, 0, 0, -1, -1, 0, 2, 3, 2, 2, 1, 0, -1, -1, -1, 0, 1, 3, -4, -5, -5, -4, -4, -3, -2, -1, 0, 1, -2, -3, -3, -4, -3, -3, -2, -1, 0, 1, -1, -1, -2, -3, -3, -3, -2, -1, 0, 1, 0, 0, -1, -1, -2, -2, -2, -1, -1, 1, 1, 1, 0, 0, -1, -1, 0, -1, 0, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 1, 1, 3, 2, 1, 1, 1, 0, -1, 0, 1, 1, 3, 2, 2, 1, 0, 0, -1, 0, 1, 1, 3, 3, 2, 1, 0, -1, -1, 0, 0, 1, -5, -6, -5, -5, -4, -4, -2, -1, -1, 1, -2, -3, -4, -4, -3, -3, -2, -2, -1, 0, 0, -1, -2, -2, -2, -2, -2, -2, -1, 0, 1, 0, -1, -1, -1, -1, -1, -2, -2, 0, 3, 2, 1, 0, 0, 0, 1, -1, -1, 0, 4, 3, 2, 2, 1, 1, 1, -1, 0, 0, 4, 4, 3, 2, 2, 1, 0, 0, 0, 1, 5, 4, 3, 3, 2, 1, -1, -1, 1, 1, 5, 5, 4, 3, 2, 0, -1, 
-1, 0, 1, 5, 5, 3, 3, 1, -1, -1, -1, 0, 0, -4, -5, -5, -5, -4, -3, -3, -2, -1, 0, -2, -3, -3, -3, -3, -3, -2, -3, -2, 0, 0, 0, -1, -1, -2, -2, -2, -3, -2, -1, 2, 2, 1, 0, 0, 0, -1, -2, -2, -1, 4, 4, 3, 2, 2, 1, 0, -1, -2, -1, 6, 5, 5, 4, 3, 2, 0, -2, -1, 0, 6, 6, 5, 5, 4, 2, -1, -1, 0, 0, 7, 6, 6, 5, 4, 1, -1, -1, 0, 1, 7, 7, 6, 5, 3, 0, -1, -1, 0, 1, 6, 6, 5, 4, 2, -1, -2, -1, -1, 1, -3, -4, -4, -4, -4, -3, -3, -3, -2, 0, -1, -2, -2, -2, -3, -2, -3, -3, -3, -1, 2, 1, 0, 0, -1, -1, -1, -3, -3, -2, 4, 3, 3, 2, 1, 1, 0, -2, -3, -2, 6, 5, 5, 4, 4, 2, 0, -1, -2, -1, 8, 7, 6, 6, 5, 2, -1, -2, -1, -1, 8, 8, 8, 7, 5, 2, -2, -1, -1, 0, 8, 8, 8, 7, 5, 1, -2, -1, 0, 0, 8, 8, 7, 7, 3, 0, -2, -1, 0, 0, 8, 8, 7, 4, 2, -1, -2, -1, 0, 1 }, { -4, -5, -5, -6, -6, -5, -5, -4, -3, -2, -3, -4, -5, -5, -5, -4, -3, -2, -1, 0, -2, -4, -4, -4, -4, -2, -1, 0, 1, 2, -2, -3, -4, -4, -2, -1, 0, 2, 3, 4, -1, -3, -3, -3, -2, 0, 1, 3, 5, 6, 0, -2, -3, -2, -1, 0, 2, 3, 6, 7, 0, -1, -2, -2, -1, 1, 3, 3, 6, 9, 1, 0, -1, -1, 0, 2, 4, 5, 8, 10, 2, 1, 0, 0, 1, 3, 5, 7, 9, 11, 3, 2, 1, 1, 2, 3, 5, 7, 10, 12, -3, -4, -4, -5, -6, -5, -5, -4, -4, -6, -2, -3, -4, -4, -4, -4, -3, -2, -3, -4, -2, -3, -3, -3, -3, -2, -1, -1, -2, -2, -1, -3, -3, -3, -2, -1, 0, 1, 0, 0, 0, -2, -3, -3, -2, 0, 2, 3, 3, 2, 0, -1, -3, -3, -1, 1, 3, 4, 4, 4, 1, -1, -2, -2, -1, 2, 3, 4, 5, 5, 1, -1, -2, -2, 0, 2, 4, 5, 6, 6, 0, -1, -2, -2, -1, 1, 3, 4, 6, 7, 1, -1, -1, -1, 0, 1, 3, 4, 7, 8, -2, -2, -3, -4, -5, -6, -6, -7, -8, -8, -1, -2, -3, -3, -4, -5, -5, -5, -6, -6, 0, -2, -2, -3, -2, -3, -3, -4, -4, -5, 0, -1, -2, -2, -2, -1, -1, -2, -2, -3, 1, -1, -2, -2, -2, 0, 0, 0, 0, -1, 1, 0, -1, -2, -1, 1, 2, 2, 1, 1, 2, 0, -1, -2, -1, 2, 3, 3, 2, 2, 2, 1, -1, -1, -1, 1, 3, 3, 3, 3, 1, 0, -1, -1, -1, 0, 2, 3, 3, 4, 1, 0, -1, -1, -1, 0, 1, 2, 4, 4, 0, -1, -2, -3, -4, -6, -8, -9, -9, -9, 0, -1, -1, -2, -4, -5, -7, -7, -7, -7, 1, -1, -1, -2, -3, -4, -5, -5, -6, -6, 1, 0, -1, -2, -2, -3, -3, -4, -4, -5, 2, 0, -1, -2, -2, -1, -1, -2, -3, -3, 2, 
1, 0, -1, -1, 0, 0, 0, -1, -2, 3, 1, 0, 0, -1, 1, 1, 1, 0, 0, 3, 2, 1, 0, -1, 0, 1, 1, 1, 0, 2, 1, 0, 0, -1, 0, 0, 1, 1, 1, 2, 1, 0, 0, -1, -1, 0, 1, 1, 2, 1, 0, 0, -1, -3, -5, -7, -8, -8, -8, 1, 0, 0, -1, -2, -4, -6, -8, -8, -8, 2, 0, 0, -1, -2, -4, -5, -6, -6, -7, 2, 1, 0, -1, -2, -3, -4, -5, -5, -6, 3, 2, 1, 0, -1, -2, -3, -3, -4, -4, 3, 2, 2, 1, 0, 0, -1, -2, -3, -3, 3, 2, 2, 2, 1, 1, 0, -1, -2, -2, 3, 3, 3, 2, 1, 0, 0, -1, -1, -2, 4, 3, 3, 2, 1, 0, -1, -1, -1, -1, 3, 3, 2, 2, 1, -1, -1, -1, -1, 0, 1, 0, 0, 0, -2, -3, -5, -6, -7, -8, 1, 0, 0, 0, -1, -3, -4, -6, -7, -8, 2, 0, 0, 0, -1, -2, -4, -5, -6, -7, 2, 1, 0, 0, 0, -2, -3, -4, -5, -6, 3, 2, 1, 0, 0, -1, -2, -3, -4, -5, 3, 2, 2, 1, 0, 0, 0, -2, -3, -4, 3, 3, 3, 2, 1, 1, 0, -2, -3, -4, 3, 3, 3, 3, 3, 1, 0, -1, -2, -3, 4, 3, 4, 4, 3, 1, 0, -1, -2, -3, 4, 4, 4, 3, 2, 1, 0, -1, -2, -2, 0, 0, 0, 0, 0, -1, -3, -5, -6, -7, 1, 0, 0, 0, 0, -1, -2, -4, -6, -7, 1, 0, 0, 0, 0, 0, -2, -4, -5, -7, 2, 0, 0, -1, 0, 0, -1, -3, -5, -6, 2, 1, 0, -1, -1, 0, 0, -2, -4, -5, 3, 1, 1, 0, -1, 0, 1, -1, -3, -5, 3, 2, 1, 1, 1, 1, 1, -1, -3, -4, 3, 3, 3, 2, 2, 2, 1, -1, -3, -4, 4, 4, 4, 4, 3, 2, 1, -1, -2, -3, 4, 4, 5, 4, 3, 2, 1, -1, -2, -3, -1, 0, 0, 0, 0, -1, -1, -3, -5, -7, -1, -1, -1, -1, -1, 0, -1, -2, -5, -7, -1, -1, -1, -1, -1, 0, 0, -1, -4, -6, 0, -1, -2, -2, -2, -1, 0, 0, -3, -5, 1, -1, -2, -2, -2, -1, 1, 1, -2, -5, 1, -1, -1, -2, -2, -1, 1, 1, -2, -4, 2, 0, -1, -1, -1, 0, 1, 1, -2, -4, 2, 1, 1, 0, 1, 1, 1, 0, -2, -4, 3, 3, 3, 2, 2, 2, 1, 0, -2, -4, 5, 5, 4, 3, 3, 2, 1, 0, -2, -3, -3, -2, -2, -1, 0, 0, 0, -2, -6, -9, -3, -2, -2, -2, -1, -1, 1, 0, -4, -7, -3, -3, -3, -3, -2, -1, 1, 1, -2, -7, -2, -3, -3, -3, -3, -2, 0, 2, 0, -5, -2, -2, -3, -3, -3, -1, 1, 3, 1, -3, -1, -2, -3, -3, -2, -1, 1, 3, 1, -2, 0, -2, -2, -2, -1, 0, 1, 2, 0, -3, 1, -1, -1, 0, 0, 1, 1, 1, 0, -3, 2, 1, 1, 1, 1, 1, 1, 1, -1, -3, 4, 4, 3, 3, 3, 2, 1, 1, -1, -3, -7, -5, -4, -3, -2, -2, -2, -3, -6, -9, -6, -5, -5, -4, -3, -2, -1, -1, -5, -8, -6, -5, -5, -4, 
-4, -2, 0, 1, -2, -7, -5, -5, -5, -4, -4, -2, 0, 1, 0, -5, -5, -4, -4, -4, -3, -1, 1, 2, 1, -3, -4, -4, -4, -3, -2, 0, 2, 3, 2, -2, -3, -3, -3, -2, -1, 1, 2, 3, 1, -2, -2, -2, -1, -1, 0, 1, 2, 2, 1, -3, 0, 0, 0, 1, 1, 1, 2, 1, 0, -3, 3, 2, 3, 3, 3, 2, 2, 1, 0, -3 }, { -10, -10, -10, -10, -10, -9, -9, -9, -6, 1, -8, -9, -8, -8, -8, -8, -8, -7, -5, 2, -7, -6, -6, -6, -5, -6, -6, -6, -3, 3, -5, -4, -4, -3, -3, -3, -4, -4, -1, 4, -2, -2, -1, 0, -1, -1, -2, -3, 0, 5, 1, 1, 2, 2, 1, 0, -2, -1, 1, 6, 3, 3, 3, 3, 2, 1, -1, 0, 2, 9, 5, 5, 5, 5, 3, 0, -2, 0, 3, 9, 7, 7, 7, 5, 3, -1, -2, 0, 3, 10, 10, 9, 8, 5, 1, -2, -3, 1, 4, 10, -11, -10, -9, -9, -8, -8, -8, -7, -5, 1, -9, -8, -8, -7, -7, -7, -7, -6, -4, 2, -8, -7, -7, -6, -5, -5, -5, -4, -3, 3, -6, -6, -5, -4, -3, -3, -3, -3, 0, 4, -3, -4, -3, -1, 0, -1, -1, -1, 1, 5, -2, -1, 0, 2, 2, 1, 0, 0, 2, 6, 0, 0, 2, 3, 3, 2, -1, 1, 4, 8, 2, 2, 3, 4, 3, 1, -1, 1, 4, 9, 4, 4, 5, 5, 3, 0, -1, 2, 5, 9, 7, 6, 6, 4, 1, -1, -1, 3, 6, 10, -12, -10, -8, -8, -7, -6, -6, -5, -3, 1, -11, -10, -8, -7, -6, -5, -5, -4, -2, 2, -9, -9, -8, -7, -6, -4, -4, -3, -1, 3, -8, -7, -6, -5, -4, -3, -3, -2, 1, 4, -4, -4, -4, -3, -1, -1, -1, 0, 2, 5, -3, -2, -2, 0, 1, 1, 0, 2, 4, 6, -2, -1, 0, 1, 2, 1, 0, 3, 5, 8, -1, 0, 1, 3, 2, 1, 0, 3, 5, 9, 1, 2, 3, 4, 2, 0, 0, 3, 6, 9, 4, 4, 4, 4, 1, 0, 1, 4, 7, 10, -11, -10, -8, -7, -6, -5, -4, -2, -1, 1, -11, -9, -8, -7, -6, -5, -4, -2, 0, 2, -9, -9, -8, -7, -5, -4, -4, -1, 1, 3, -7, -8, -7, -6, -5, -3, -2, 0, 2, 4, -5, -4, -4, -4, -2, -1, 0, 2, 3, 5, -3, -3, -3, -2, 0, 1, 1, 3, 5, 6, -2, -2, -2, -1, 1, 1, 1, 4, 6, 7, -1, -1, 0, 1, 1, 0, 1, 4, 6, 8, 0, 1, 1, 2, 1, 0, 2, 5, 7, 8, 2, 2, 2, 2, 1, 1, 2, 6, 8, 9, -11, -9, -7, -6, -5, -3, -3, -1, 0, 1, -10, -8, -7, -5, -4, -2, -2, 0, 1, 2, -9, -8, -7, -5, -4, -3, -1, 1, 2, 3, -7, -7, -6, -5, -4, -3, 0, 2, 3, 4, -4, -4, -3, -3, -2, -1, 2, 3, 4, 4, -3, -3, -3, -2, -1, 0, 3, 4, 5, 6, -2, -2, -2, -1, 0, 0, 2, 4, 6, 7, -1, -1, -1, -1, 0, 0, 2, 4, 6, 7, 0, 0, 0, 0, 0, 0, 2, 4, 6, 
8, 1, 1, 0, 1, 1, 1, 3, 5, 7, 8, -11, -9, -7, -6, -5, -2, -1, 0, 0, 0, -10, -8, -6, -5, -3, -1, 0, 1, 1, 2, -8, -7, -6, -5, -3, -1, 0, 2, 2, 2, -6, -6, -6, -4, -3, -1, 1, 2, 3, 3, -4, -4, -3, -2, -1, 0, 2, 3, 4, 4, -3, -3, -3, -2, 1, 1, 3, 4, 5, 6, -2, -2, -2, -1, 1, 1, 3, 4, 6, 7, -1, -2, -1, -1, 0, 0, 2, 4, 5, 7, 0, -1, -1, -1, 0, 0, 2, 4, 5, 7, 1, -1, -1, 0, 0, 0, 3, 4, 6, 8, -12, -10, -7, -5, -3, -1, -1, 0, 0, 0, -10, -8, -6, -4, -2, -1, 0, 0, 1, 1, -9, -7, -6, -4, -2, -1, 1, 1, 2, 2, -7, -6, -5, -3, -1, 0, 1, 2, 2, 3, -5, -4, -3, -2, 1, 2, 3, 4, 4, 4, -3, -3, -2, -1, 2, 3, 5, 6, 6, 7, -2, -2, -1, 0, 3, 3, 4, 5, 7, 9, -2, -1, -1, 0, 2, 2, 4, 5, 7, 9, 0, -1, -1, 0, 1, 1, 3, 4, 6, 9, 1, -1, -1, 0, 1, 1, 3, 4, 6, 9, -12, -9, -6, -3, -1, -2, -2, -1, -1, 0, -9, -7, -5, -3, 0, -1, -2, -1, 0, 1, -9, -7, -4, -3, -1, -1, -1, 0, 1, 2, -7, -5, -4, -2, -1, -1, 0, 2, 2, 3, -4, -3, -2, -1, 1, 1, 3, 3, 4, 4, -3, -2, -1, 1, 3, 3, 5, 6, 6, 7, -2, -1, 0, 2, 4, 4, 5, 6, 7, 9, -1, 0, 1, 2, 4, 4, 5, 6, 7, 10, 0, 0, 1, 2, 3, 3, 4, 5, 7, 10, 1, 0, 1, 2, 2, 2, 4, 5, 7, 11, -12, -9, -5, -2, -1, -2, -4, -3, -3, -2, -10, -7, -4, -2, 0, -2, -3, -3, -1, 0, -8, -6, -3, -2, 0, -2, -2, -1, 0, 2, -5, -4, -3, -1, 0, -1, -1, 0, 1, 3, -3, -2, -1, 1, 1, 1, 2, 2, 3, 4, -2, -1, 0, 2, 3, 3, 4, 5, 6, 6, -2, 0, 2, 3, 4, 4, 5, 6, 7, 8, -1, 1, 2, 3, 4, 5, 5, 6, 8, 10, 0, 1, 2, 3, 4, 4, 5, 6, 8, 11, 0, 1, 2, 3, 3, 3, 4, 5, 7, 11, -13, -9, -5, -2, -1, -3, -4, -5, -6, -6, -10, -7, -4, -1, -1, -3, -4, -4, -4, -3, -8, -6, -3, 0, -1, -2, -3, -3, -2, -1, -5, -3, -1, 0, 0, -1, -1, -1, -1, 0, -3, -1, 0, 2, 1, 1, 1, 1, 1, 2, -2, 1, 2, 3, 3, 2, 2, 3, 3, 4, -1, 1, 3, 4, 4, 4, 4, 4, 5, 6, 0, 1, 3, 5, 5, 5, 5, 5, 6, 7, 0, 1, 3, 5, 5, 5, 5, 5, 7, 7, -1, 0, 1, 3, 3, 3, 3, 4, 5, 8 }, { 13, 14, 12, 9, 11, 13, 6, 4, 5, 6, 12, 14, 11, 9, 10, 13, 7, 3, 2, 7, 11, 13, 11, 9, 10, 13, 7, 1, 0, 5, 10, 12, 11, 9, 9, 11, 5, 0, -3, 4, 10, 10, 10, 6, 4, 6, 3, -2, -6, 1, 10, 9, 8, 4, 2, 2, 1, -3, -7, -1, 11, 9, 7, 4, 2, 4, 1, -3, -6, 
-2, 11, 9, 6, 5, 4, 5, 1, -3, -4, -1, 13, 9, 7, 8, 8, 7, 2, -3, -2, 0, 15, 10, 9, 10, 12, 10, 3, -2, -1, 1, 13, 14, 12, 8, 9, 7, 4, 2, 2, 1, 13, 14, 12, 9, 9, 10, 7, 5, 5, 6, 11, 13, 12, 10, 9, 10, 8, 5, 5, 8, 11, 11, 11, 9, 8, 10, 7, 5, 4, 7, 10, 9, 10, 7, 5, 7, 5, 2, 1, 4, 10, 8, 8, 5, 3, 3, 2, -1, -2, 2, 9, 8, 7, 5, 3, 5, 2, -1, -2, 1, 10, 8, 6, 5, 4, 5, 3, -1, -2, 0, 12, 8, 6, 7, 6, 6, 3, -1, -2, 0, 13, 7, 7, 7, 8, 7, 2, -1, -2, 0, 13, 13, 10, 9, 7, 5, 2, 2, 0, -2, 13, 12, 11, 10, 8, 8, 6, 4, 4, 3, 12, 12, 11, 10, 9, 9, 7, 5, 5, 6, 11, 10, 10, 10, 8, 9, 7, 5, 5, 7, 9, 8, 9, 8, 5, 6, 5, 3, 2, 5, 8, 5, 6, 6, 2, 3, 2, 1, 1, 3, 7, 5, 5, 5, 2, 3, 2, 0, 0, 2, 7, 5, 5, 6, 3, 4, 2, 0, -1, 0, 9, 6, 5, 6, 5, 5, 2, -1, -1, -1, 10, 7, 5, 6, 6, 5, 2, -1, -1, -2, 12, 10, 7, 6, 5, 3, 1, 0, -2, -5, 12, 10, 8, 8, 7, 6, 4, 3, 2, 1, 11, 10, 8, 8, 8, 7, 5, 4, 4, 4, 10, 8, 8, 9, 7, 7, 5, 4, 4, 6, 7, 6, 6, 7, 5, 5, 5, 3, 3, 5, 5, 3, 4, 5, 1, 2, 2, 1, 1, 3, 4, 3, 3, 4, 2, 2, 1, 0, -1, 1, 5, 3, 3, 4, 3, 2, 0, -1, -1, 0, 6, 4, 4, 4, 3, 3, 0, -1, -2, -2, 7, 5, 5, 4, 4, 3, 0, -2, -3, -4, 10, 8, 6, 5, 2, -1, -4, -5, -7, -8, 11, 8, 7, 6, 4, 0, -1, -2, -3, -4, 10, 8, 7, 6, 4, 1, 0, 0, -1, 0, 8, 7, 6, 5, 4, 1, 0, 1, 0, 3, 5, 5, 4, 3, 3, 0, 0, 1, 1, 3, 3, 2, 1, 2, 1, -1, -2, -1, -1, 1, 3, 1, 1, 1, 0, -3, -3, -3, -3, -2, 4, 1, 1, 1, 0, -2, -3, -4, -5, -4, 4, 2, 2, 2, 0, -2, -3, -5, -6, -6, 5, 4, 3, 3, 1, -1, -3, -5, -7, -8, 9, 6, 5, 3, 1, -1, -6, -8, -9, -9, 9, 6, 5, 3, 2, 0, -4, -5, -6, -4, 8, 6, 4, 3, 2, 0, -3, -4, -3, 1, 6, 5, 3, 2, 1, 1, -2, -3, -1, 3, 4, 3, 1, 0, 0, 1, -2, -3, -1, 2, 1, -1, -2, -1, -2, 0, -3, -5, -3, 0, 1, -2, -3, -3, -4, -3, -5, -7, -5, -2, 0, -2, -3, -3, -4, -3, -5, -8, -7, -5, 1, -1, -1, -2, -3, -2, -6, -8, -9, -7, 3, 1, 0, 0, -1, -1, -5, -8, -10, -9, 8, 3, 2, 2, -1, -4, -7, -9, -10, -12, 7, 3, 1, 1, -1, -4, -5, -6, -7, -6, 6, 2, 0, 1, -1, -3, -4, -5, -3, 0, 5, 1, -1, 0, -1, -3, -4, -4, -3, 1, 2, -2, -1, 0, -2, -2, -3, -5, -3, 1, -1, -4, -4, -2, -4, -4, -4, -6, -4, 0, 
-3, -6, -6, -5, -7, -8, -6, -8, -7, -2, -4, -6, -7, -6, -8, -9, -8, -9, -8, -4, -2, -4, -6, -6, -7, -8, -8, -9, -10, -7, -1, -3, -4, -5, -6, -7, -8, -9, -11, -9, 6, 1, 1, 1, -1, -6, -9, -10, -12, -15, 5, 1, 0, 1, -2, -5, -7, -8, -9, -10, 4, -1, -1, 1, -2, -5, -5, -6, -6, -4, 2, -2, -1, 1, -1, -5, -4, -6, -6, -3, 0, -3, -1, 1, -2, -3, -3, -7, -6, -3, -3, -5, -4, -1, -3, -5, -5, -8, -7, -4, -5, -7, -6, -4, -7, -10, -9, -9, -9, -7, -6, -8, -7, -6, -8, -11, -10, -10, -10, -9, -5, -7, -8, -7, -9, -11, -11, -10, -11, -11, -5, -6, -7, -7, -8, -10, -10, -11, -12, -12, 7, 1, 1, 2, -1, -8, -10, -11, -13, -17, 5, -1, 1, 2, -1, -7, -8, -9, -10, -13, 4, -2, 1, 2, -1, -6, -6, -7, -7, -8, 1, -3, 1, 2, -1, -5, -5, -6, -7, -8, 0, -3, 1, 2, -1, -4, -4, -7, -8, -8, -3, -5, -1, 0, -3, -4, -5, -8, -10, -10, -5, -7, -4, -3, -6, -11, -8, -9, -11, -12, -6, -8, -6, -4, -8, -13, -10, -10, -12, -12, -7, -9, -8, -6, -9, -13, -12, -11, -12, -13, -7, -8, -8, -7, -9, -12, -13, -12, -12, -13, 7, 1, 3, 4, 1, -9, -11, -11, -15, -18, 5, 0, 3, 5, 2, -7, -9, -8, -11, -14, 3, 0, 4, 5, 2, -7, -6, -5, -8, -10, 2, 0, 4, 5, 1, -5, -4, -5, -8, -10, 1, 0, 4, 4, 1, -4, -3, -5, -8, -10, -1, -2, 3, 2, -1, -4, -4, -7, -10, -11, -3, -3, 1, 1, -3, -12, -9, -8, -11, -12, -5, -5, 0, 0, -5, -15, -14, -9, -12, -13, -8, -8, -4, -1, -5, -16, -16, -11, -12, -13, -12, -11, -9, -3, -7, -17, -19, -12, -13, -13 }, }, { { -2, -2, -1, 0, 1, 2, 4, 7, 11, 15, -2, -1, 0, 0, 1, 2, 4, 7, 10, 14, -2, -1, 0, 1, 1, 2, 4, 7, 10, 13, -2, -1, 0, 1, 2, 2, 4, 6, 9, 12, -2, -1, 0, 1, 2, 3, 4, 6, 9, 12, -2, -1, 0, 1, 2, 3, 4, 6, 8, 11, -2, -1, 0, 1, 2, 3, 4, 5, 8, 11, -2, -1, 0, 1, 2, 4, 4, 5, 8, 10, -2, -1, 0, 1, 2, 4, 5, 5, 7, 10, -1, -2, -1, 1, 2, 3, 5, 5, 6, 9, -2, -3, -2, -1, 0, 0, 2, 4, 7, 11, -2, -2, -2, -1, 0, 1, 2, 4, 7, 10, -1, -2, -2, -1, 0, 1, 2, 4, 6, 9, -1, -2, -2, -1, 0, 1, 2, 4, 6, 9, -1, -1, -2, -1, 0, 1, 2, 4, 5, 8, -1, -1, -2, -1, 0, 1, 2, 3, 5, 8, -1, -1, -2, -1, 0, 2, 2, 3, 5, 7, -1, -1, -1, -1, 0, 2, 3, 3, 5, 7, 0, -1, 
-1, -1, 0, 1, 3, 3, 4, 6, 0, 0, -1, -2, 0, 1, 2, 3, 3, 5, -2, -2, -2, -2, -2, -1, 0, 2, 4, 7, -1, -2, -2, -1, -1, -1, 0, 2, 3, 6, -1, -1, -2, -1, -1, -1, 0, 2, 3, 6, -1, -1, -1, -1, -1, -1, 0, 1, 3, 5, -1, -1, -1, -1, -1, -1, 0, 1, 3, 5, 0, -1, -1, -1, -1, -1, 0, 1, 3, 4, 0, -1, -1, -1, -1, -1, 0, 1, 2, 4, 0, -1, -1, -1, -1, -1, 0, 1, 2, 3, 0, 0, -1, -1, 0, -1, 0, 0, 1, 3, 0, 0, 0, -1, 0, -1, -1, 0, 0, 2, -1, -1, -2, -1, -1, -1, -1, -1, 1, 3, -1, -1, -2, -1, -1, -1, -1, -1, 1, 3, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 0, -1, -1, -1, -1, -1, 0, -1, 0, 2, 0, -1, -1, -1, -1, 0, 0, -1, 0, 2, 0, -1, -1, -1, -1, 0, 0, -1, 0, 1, 0, 0, -1, -1, -1, 0, 0, 0, -1, 1, 0, 0, -1, -1, -1, 0, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -3, -2, -1, -1, -2, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -1, 0, -1, 0, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 1, 0, -1, -1, -1, -1, -1, 0, 0, 0, 1, 0, -1, -1, -1, -1, 0, 0, 0, 0, 1, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -2, -1, 0, 0, 0, -1, -2, -2, -3, -2, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 1, 0, 0, 0, 0, -1, -1, -1, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, 0, 0, 1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, 0, -2, -1, 0, 0, 0, -1, -2, -2, -2, -1, -2, -2, -1, 0, 0, -1, -2, -2, -2, -2, 1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -2, -2, -1, 0, 0, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, 0, -1, -2, -2, -2, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 0, 0, 0, 0, -1, 0, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 1, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 1, 0, -1, -1, 4, 3, 3, 2, 1, 0, 0, 0, 0, 0, 3, 3, 3, 2, 1, 0, 0, 0, 0, 0, 2, 3, 3, 2, 1, 1, 0, 0, -1, -1, 2, 2, 2, 2, 1, 1, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 1, 1, 1, 1, 0, -1, -1, -1, -1, 0, 1, 1, 1, 0, 0, -1, -1, -1, -1, 0, 1, 1, 1, 0, 0 }, { 3, 2, 1, 0, -1, 0, 0, 1, 2, 2, 3, 2, 1, 0, -1, 0, 0, 1, 1, 2, 2, 2, 1, 0, -1, -1, 0, 0, 0, 1, 2, 1, 1, 0, -1, -1, -1, -1, 0, 0, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -2, -1, -1, -1, -1, -1, 0, -1, -1, -2, -2, -2, -1, -1, 0, -2, 0, -1, -2, -2, -2, -2, -1, 0, -1, -2, -1, -2, -2, -3, -3, -2, -1, -1, -1, -2, -3, -3, -3, -3, -3, -2, -2, -1, -2, -3, 2, 2, 1, 0, 0, 0, 1, 1, 2, 3, 3, 2, 1, 0, -1, 0, 0, 1, 1, 2, 2, 2, 1, 0, -1, -1, 0, 0, 0, 1, 1, 2, 1, 0, -1, -1, -1, -1, -1, 0, 1, 1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -2, -1, -1, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -4, -3, -3, -3, -3, -2, -2, -2, -3, -3, 2, 1, 0, 0, 0, 0, 1, 1, 2, 4, 2, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 2, 1, 1, -1, -1, -1, 0, 0, 1, 1, 2, 1, 1, -1, -1, -1, -1, -1, 0, 1, 1, 1, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -3, -2, -2, -2, -2, -3, -2, -2, -2, -2, -4, -3, -2, -2, -2, -2, -2, -2, -3, -3, 1, 0, 0, 0, 0, 1, 1, 2, 3, 4, 1, 1, 0, 0, 0, 0, 1, 1, 2, 3, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, -1, -1, -1, 
-1, -1, 0, 0, 0, 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -4, -3, -2, -2, -2, -2, -2, -2, -3, -3, 1, 0, 0, 0, 0, 1, 2, 3, 4, 4, 1, 1, 0, 0, 0, 1, 2, 3, 3, 3, 2, 1, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, -1, -1, -1, -1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 0, -1, 0, 0, -1, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -2, -3, -2, -2, -2, -3, -2, -1, -1, -1, -2, -2, -2, -3, -3, 1, 0, 0, 0, 0, 2, 3, 4, 4, 5, 1, 1, 0, 0, 1, 1, 2, 3, 4, 4, 1, 1, 1, 0, 1, 1, 2, 2, 3, 3, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, -1, -1, -1, -1, -1, 1, 0, 0, 0, 0, -1, -2, -2, -2, -1, 0, 0, 0, 0, -1, -2, -2, -2, -2, -2, -1, -1, 0, 0, -1, -2, -2, -2, -2, -2, -2, -1, 0, 0, -1, -2, -2, -2, -2, -3, 1, 0, 0, -1, 0, 1, 2, 4, 5, 5, 1, 1, 0, 0, 0, 1, 2, 3, 4, 4, 1, 1, 1, 0, 1, 1, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, 1, 1, 1, 0, 0, -1, -1, -1, -1, -1, 1, 1, 0, 0, -1, -2, -2, -2, -2, -2, 0, 0, 0, 0, -1, -2, -2, -2, -2, -3, -1, 0, 0, 0, 0, -2, -2, -2, -2, -3, 0, 0, -1, -1, 0, 1, 2, 3, 4, 5, 1, 1, 0, 0, 0, 0, 2, 3, 4, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, -1, 0, 0, 0, -1, 1, 1, 1, 0, 0, -1, -1, -1, -1, -2, 1, 1, 1, 0, -1, -2, -2, -2, -2, -2, 1, 1, 0, 0, -1, -2, -2, -2, -2, -3, 0, 1, 0, 0, -1, -2, -2, -2, -3, -3, 0, 0, -1, -1, -1, 1, 2, 3, 4, 5, 1, 0, 0, 0, 0, 0, 1, 2, 3, 4, 1, 1, 1, 1, 0, 0, 0, 1, 2, 3, 1, 1, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, -1, -1, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, -1, -1, -1, 1, 1, 1, 0, -1, -2, -2, -2, -2, -2, 1, 1, 1, 0, -1, -1, -2, -2, -3, -3, 1, 0, 0, 0, -1, -1, -2, -3, -3, -3, 0, -1, -1, -1, -1, 0, 1, 2, 3, 4, 1, 0, -1, -1, -1, -1, 0, 1, 2, 3, 1, 0, 0, 0, 0, -1, 0, 0, 0, 2, 1, 1, 1, 1, 0, 0, -1, -1, -1, 1, 1, 
1, 1, 1, 1, 0, -1, -1, 0, 1, 1, 1, 1, 1, 0, -1, -1, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, -1, 1, 1, 1, 0, -1, -1, -1, -1, -2, -2, 1, 1, 0, 0, -1, -2, -2, -2, -3, -3, 1, 0, 0, 0, -1, -1, -2, -3, -3, -4 }, { -5, -4, -3, -2, -2, -2, -3, -4, -5, -6, -5, -4, -3, -2, -1, -2, -3, -4, -5, -5, -4, -3, -3, -3, -2, -2, -2, -4, -4, -3, -2, -3, -3, -3, -2, -1, -2, -2, -2, -1, -2, -2, -3, -2, -1, 0, -1, -1, 0, 0, -2, -2, -2, -1, 0, 1, 0, 0, 1, 2, -1, -2, -1, -1, 1, 1, 1, 2, 2, 2, -1, -1, -1, -1, 0, 1, 1, 1, 1, 2, 0, -1, -2, -1, 0, 0, 0, 0, 0, 1, 0, -1, -2, -2, -1, -1, 0, -1, 0, 0, -6, -4, -3, -2, -1, -1, -1, -2, -3, -3, -6, -4, -3, -2, -2, -1, -1, -2, -3, -3, -4, -4, -4, -3, -3, -1, -2, -3, -2, -1, -2, -3, -3, -3, -2, -1, -2, -2, -1, 0, -1, -2, -2, -2, -1, 0, 0, 0, 0, 1, -1, -1, -2, -1, 0, 1, 1, 1, 2, 2, -1, -1, -1, -1, 0, 1, 1, 1, 2, 2, -1, -1, -1, -1, 0, 0, 0, 0, 1, 2, 0, -1, -1, -1, -1, -1, -1, 0, 0, 1, 0, -1, -2, -2, -2, -2, -2, -1, 0, 0, -6, -5, -3, -2, -1, 0, 0, 0, -1, -2, -6, -5, -3, -2, -1, -1, 0, -1, -1, -1, -5, -4, -4, -3, -2, -1, -1, -1, -1, 0, -3, -3, -3, -3, -2, 0, -1, -1, 0, 1, -2, -2, -2, -2, -1, 0, 0, 0, 1, 2, -1, -1, -2, -2, -1, 0, 1, 1, 2, 3, -1, -1, -1, -1, -1, 0, 0, 1, 2, 3, 0, -1, -1, -1, -1, -1, -1, 0, 1, 2, 0, 0, -1, -2, -1, -1, -1, 0, 0, 1, 1, 0, -1, -2, -2, -2, -2, -1, 0, 1, -6, -5, -4, -2, -1, 0, 1, 0, 0, 0, -6, -5, -4, -2, -1, 0, 0, 0, 0, 0, -5, -5, -3, -3, -2, 0, 0, 0, 0, 1, -3, -3, -3, -2, -1, 0, 0, 0, 1, 2, -2, -2, -2, -1, 0, 0, 0, 1, 2, 2, -1, -1, -1, -1, 0, 0, 1, 1, 2, 3, -1, -1, -1, -1, 0, 0, 0, 1, 2, 3, 0, 0, 0, -1, -1, -1, -1, 0, 1, 2, 1, 0, 0, -1, -1, -1, -1, 0, 1, 1, 2, 0, 0, -1, -1, -2, -2, -1, 0, 1, -6, -5, -4, -2, -1, 0, 0, 1, 1, 0, -6, -5, -4, -3, -2, 0, 0, 1, 1, 1, -6, -5, -3, -3, -2, 0, 0, 1, 1, 1, -4, -3, -3, -2, -1, 0, 0, 1, 1, 2, -2, -2, -2, -1, 0, 0, 1, 2, 2, 3, -1, -1, -1, 0, 0, 0, 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 1, 1, 0, 0, 0, 0, 0, 1, 2, 3, 2, 1, 1, 0, 0, -1, -1, 0, 1, 2, 2, 1, 1, 0, -1, -1, -1, 0, 0, 0, -7, -5, -4, 
-3, -2, -1, 0, 1, 1, 1, -6, -5, -4, -3, -2, -1, 0, 1, 1, 1, -5, -4, -4, -3, -2, -1, 0, 1, 1, 1, -3, -3, -3, -2, -1, -1, 0, 1, 1, 1, -2, -2, -1, 0, 0, 0, 1, 2, 2, 2, -1, 0, 1, 1, 1, 0, 1, 2, 2, 3, 1, 1, 1, 2, 1, 0, 1, 1, 2, 3, 2, 2, 2, 2, 1, 0, 0, 1, 2, 3, 3, 3, 2, 1, 0, 0, 0, 0, 1, 1, 3, 3, 2, 1, 0, -1, -1, -1, -1, 0, -6, -5, -4, -3, -3, -2, -1, 0, 1, 1, -6, -5, -4, -3, -3, -2, -1, 0, 1, 1, -5, -4, -3, -3, -2, -2, -1, 0, 1, 1, -3, -3, -2, -2, -1, -1, 0, 1, 1, 1, -1, -1, 0, 1, 0, 0, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 1, 0, 1, 1, 1, 3, 4, 4, 4, 3, 1, 0, 0, 1, 1, 2, 4, 4, 4, 2, 1, 0, 0, 0, 0, 1, 5, 4, 3, 2, 1, -1, -2, -1, -1, 0, -6, -5, -4, -3, -3, -2, -2, -1, 1, 1, -6, -5, -4, -3, -3, -3, -2, -1, 1, 1, -4, -3, -3, -3, -2, -3, -2, -1, 1, 1, -2, -2, -2, -1, -1, -1, -1, 0, 1, 1, -1, 0, 0, 1, 1, 0, 1, 1, 2, 2, 1, 2, 3, 3, 2, 1, 1, 1, 2, 3, 3, 4, 5, 4, 2, 1, 1, 1, 1, 2, 5, 5, 5, 4, 2, 0, 0, 0, 1, 2, 6, 6, 5, 3, 2, 0, -1, -1, 0, 1, 6, 6, 5, 3, 1, -1, -2, -1, -1, -1, -6, -5, -4, -4, -4, -3, -2, -1, 0, 1, -6, -5, -4, -3, -3, -3, -2, -1, 0, 1, -4, -3, -2, -2, -2, -2, -2, -1, 0, 1, -2, -1, -1, 0, -1, -1, -1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4, 2, 2, 2, 1, 1, 2, 2, 4, 5, 5, 4, 3, 2, 1, 0, 1, 2, 6, 6, 6, 4, 2, 1, 0, 0, 0, 1, 7, 7, 6, 4, 2, 0, -1, -1, 0, 0, 7, 7, 6, 4, 1, -1, -2, -1, -1, -1, -6, -5, -4, -4, -4, -4, -3, -1, 0, 1, -5, -4, -4, -3, -3, -3, -3, -1, 0, 1, -4, -3, -2, -2, -1, -2, -2, -1, 0, 1, -2, -1, 0, -1, 0, 0, 0, 0, 1, 1, 0, 1, 1, -1, 1, 1, 1, 1, 1, 2, 3, 4, 3, 1, 2, 2, 2, 1, 1, 2, 5, 6, 6, 4, 3, 3, 1, 0, 1, 2, 7, 8, 7, 5, 3, 1, 0, 0, 0, 1, 8, 8, 7, 4, 2, 1, 0, -1, 0, 0, 9, 8, 6, 4, 1, 0, -1, -1, -1, -1 }, { 4, 4, 5, 4, 4, 3, 3, 5, 9, 10, 4, 4, 4, 4, 3, 2, 3, 4, 7, 8, 4, 4, 4, 3, 2, 1, 1, 3, 5, 6, 4, 4, 4, 3, 2, 1, 1, 2, 3, 4, 3, 3, 3, 2, 1, 1, 1, 1, 2, 2, 3, 2, 2, 1, -1, -1, 0, 0, 1, 1, 2, 1, 1, -1, -2, -2, -3, -2, -1, -1, 1, 1, 0, -2, -3, -3, -4, -3, -3, -3, 1, 0, -1, -3, -4, -4, -4, -4, -5, -5, 0, -1, -2, -4, -4, -5, -6, -6, -7, -7, 
4, 4, 4, 4, 3, 2, 2, 4, 7, 8, 4, 4, 4, 4, 2, 1, 1, 3, 5, 6, 4, 4, 4, 3, 2, 1, 1, 2, 4, 4, 4, 4, 3, 3, 2, 1, 1, 2, 3, 3, 3, 3, 3, 2, 1, 1, 1, 1, 2, 1, 3, 2, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, -1, -2, -2, -2, -2, -2, -2, 2, 1, 0, -2, -3, -3, -3, -3, -3, -4, 1, 0, -1, -3, -3, -3, -4, -4, -4, -5, 0, -1, -2, -3, -4, -5, -5, -5, -6, -6, 3, 3, 3, 3, 2, 1, 1, 2, 5, 6, 4, 4, 3, 3, 2, 1, 1, 2, 4, 4, 4, 4, 4, 3, 2, 2, 2, 2, 3, 3, 4, 4, 4, 3, 2, 1, 1, 2, 2, 1, 3, 3, 3, 2, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 0, 0, 0, -1, -1, 2, 1, 1, 0, -2, -2, -2, -2, -3, -3, 2, 1, 0, -1, -2, -3, -3, -3, -4, -4, 1, 0, -1, -2, -3, -3, -3, -4, -5, -5, 1, 0, -1, -2, -3, -4, -4, -5, -6, -7, 4, 4, 3, 2, 1, 0, 0, 1, 3, 4, 5, 4, 4, 3, 2, 1, 1, 2, 2, 2, 4, 4, 4, 4, 3, 1, 2, 2, 1, 1, 4, 4, 3, 3, 2, 1, 1, 1, 1, 0, 4, 3, 3, 3, 1, 1, 1, 0, 0, -1, 2, 2, 2, 2, 0, 0, 0, -1, -1, -2, 2, 1, 1, 0, -1, -2, -2, -3, -3, -3, 2, 1, 0, 0, -1, -2, -3, -3, -4, -5, 1, 0, 0, -1, -2, -3, -3, -4, -5, -6, 0, 0, -1, -1, -2, -3, -4, -5, -7, -8, 5, 4, 3, 2, 1, 0, 0, 0, 2, 2, 5, 4, 4, 3, 3, 1, 1, 1, 1, 1, 5, 4, 4, 3, 3, 1, 1, 1, 0, 0, 4, 4, 3, 3, 2, 0, 0, 0, 0, -1, 3, 3, 3, 2, 2, 0, 0, -1, -1, -1, 2, 2, 2, 2, 1, 0, -1, -2, -2, -2, 2, 1, 1, 0, -1, -1, -2, -3, -3, -4, 1, 1, 0, 0, -1, -1, -3, -3, -4, -5, 1, 0, 0, -1, -1, -2, -3, -4, -5, -6, 0, 0, -1, -1, -2, -2, -4, -5, -6, -8, 5, 4, 3, 2, 1, 0, 0, 0, 1, 1, 5, 4, 4, 3, 2, 1, 0, 0, 0, 0, 5, 4, 4, 3, 2, 0, 0, 0, 0, -1, 4, 4, 3, 3, 2, 1, 0, -1, -1, -1, 3, 3, 3, 2, 2, 0, 0, -1, -1, -2, 2, 2, 2, 1, 1, 0, 0, -1, -2, -2, 2, 1, 1, 0, 0, -1, -2, -3, -3, -4, 1, 1, 0, 0, 0, -1, -2, -3, -4, -5, 0, 0, 0, 0, -1, -1, -3, -4, -5, -6, 0, -1, -1, -1, -1, -2, -3, -5, -6, -7, 5, 4, 3, 3, 1, 1, 0, 0, 0, 0, 5, 4, 4, 3, 2, 1, 0, 0, 0, -1, 5, 4, 4, 3, 2, 1, 1, 0, -1, -1, 4, 4, 4, 3, 2, 1, 1, 0, -1, -2, 3, 3, 3, 2, 2, 1, 1, 0, -1, -2, 2, 1, 1, 1, 0, 0, 0, -1, -2, -2, 1, 0, 0, 0, -1, -1, -2, -3, -4, -4, 0, 0, 0, -1, -1, -1, -2, -4, -4, -5, -1, -1, -1, -1, -1, -1, -2, -4, -5, -6, -2, -2, -2, -2, -2, -2, -3, -5, 
-6, -8, 5, 4, 3, 2, 1, 1, 0, 0, 0, -1, 5, 4, 3, 3, 2, 1, 1, 0, 0, -1, 5, 4, 3, 3, 2, 2, 2, 0, 0, -1, 4, 3, 3, 2, 2, 2, 2, 0, -1, -1, 2, 2, 2, 2, 1, 1, 1, 0, -1, -2, 1, 0, 0, 0, 0, 0, 0, -1, -2, -2, 0, -1, -1, -2, -2, -1, -2, -3, -3, -4, -1, -1, -2, -2, -2, -2, -2, -3, -4, -5, -2, -2, -3, -3, -3, -3, -3, -4, -5, -6, -3, -4, -4, -4, -3, -3, -4, -5, -6, -8, 4, 4, 3, 1, 1, 1, 1, 0, -1, -1, 5, 4, 3, 2, 2, 2, 2, 0, 0, -1, 4, 4, 3, 3, 2, 2, 2, 1, 0, -1, 3, 3, 2, 2, 2, 2, 2, 1, 0, -2, 1, 1, 1, 1, 1, 1, 1, 0, -1, -2, 0, -1, -1, -1, -1, 0, 0, -1, -2, -3, -1, -2, -2, -2, -2, -2, -2, -3, -4, -4, -2, -3, -3, -3, -3, -3, -3, -3, -4, -5, -4, -4, -5, -4, -4, -3, -3, -4, -5, -6, -6, -6, -5, -5, -5, -5, -5, -6, -6, -8, 4, 3, 2, 1, 1, 1, 1, 0, -1, -2, 4, 4, 3, 2, 2, 2, 2, 1, -1, -2, 3, 3, 3, 3, 2, 2, 2, 1, -1, -2, 2, 2, 2, 2, 2, 2, 2, 1, 0, -2, 0, 0, 0, 1, 1, 1, 1, 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, -1, -2, -3, -2, -3, -2, -2, -2, -2, -2, -3, -3, -4, -4, -4, -4, -4, -3, -3, -3, -4, -4, -5, -6, -6, -6, -5, -5, -4, -5, -5, -5, -6, -8, -8, -7, -7, -6, -6, -6, -7, -7, -8 }, { -5, -7, -13, -15, -13, -9, -7, -7, -8, -11, -5, -6, -7, -8, -7, -6, -4, -4, -5, -8, -6, -6, -7, -6, -5, -3, -2, -2, -3, -7, -6, -6, -7, -7, -5, -2, 0, 0, -1, -5, -6, -6, -6, -6, -4, 0, 2, 2, 1, -2, -5, -5, -5, -4, -2, 1, 3, 4, 3, 0, -5, -4, -4, -2, 0, 3, 4, 5, 4, 2, -4, -4, -2, 0, 2, 4, 5, 6, 6, 4, -3, -3, -1, 1, 4, 5, 6, 7, 7, 5, -3, -2, 1, 4, 7, 7, 7, 9, 8, 7, -3, -4, -8, -12, -13, -10, -7, -7, -7, -9, -2, -3, -3, -5, -7, -6, -5, -4, -5, -8, -2, -3, -4, -5, -4, -4, -3, -3, -4, -6, -2, -2, -3, -4, -4, -1, 0, -1, -2, -5, -2, -2, -2, -3, -3, 0, 2, 1, 0, -3, -1, -1, -1, -1, -1, 1, 4, 3, 2, -1, 0, -1, 0, 1, 1, 3, 5, 5, 3, 2, 0, 0, 1, 2, 3, 4, 5, 6, 5, 3, 1, 0, 2, 3, 5, 5, 6, 7, 6, 5, 2, 0, 1, 4, 7, 8, 8, 8, 8, 6, -4, -3, -5, -12, -13, -10, -7, -6, -6, -8, -2, -1, -1, -3, -8, -7, -5, -5, -5, -7, -1, -1, -2, -2, -3, -5, -4, -3, -4, -6, -1, -1, -2, -2, -3, -2, -2, -2, -3, -5, 0, -1, -1, -1, -2, 0, 1, 0, -1, -3, 1, 0, 0, 
0, 0, 2, 3, 2, 1, -1, 1, 0, 1, 2, 3, 4, 4, 4, 2, 0, 2, 1, 2, 4, 4, 5, 5, 5, 4, 2, 3, 2, 3, 4, 6, 7, 6, 6, 5, 4, 4, 2, 3, 5, 8, 9, 8, 7, 6, 5, -6, -5, -6, -10, -12, -11, -7, -6, -6, -7, -3, -2, -2, -3, -8, -8, -6, -5, -5, -7, -1, 0, -1, -1, -3, -5, -5, -4, -5, -6, 0, 0, 0, -1, -1, -3, -3, -3, -4, -5, 1, 0, 0, 0, -1, 0, -1, -1, -2, -3, 2, 1, 1, 2, 2, 2, 2, 1, -1, -2, 3, 2, 2, 4, 4, 4, 3, 2, 1, -1, 3, 2, 3, 5, 5, 5, 4, 4, 3, 1, 4, 3, 4, 5, 6, 7, 5, 5, 4, 3, 5, 4, 4, 6, 9, 10, 7, 6, 5, 5, -8, -7, -8, -11, -12, -10, -7, -5, -5, -6, -5, -4, -4, -5, -8, -8, -6, -4, -5, -6, -2, -1, -2, -2, -4, -6, -5, -4, -4, -5, 0, 0, -1, -1, -2, -4, -4, -3, -4, -5, 1, 1, 0, 0, -1, -1, -3, -3, -3, -4, 2, 2, 2, 2, 0, 0, -1, -1, -2, -3, 3, 2, 2, 4, 3, 1, 1, 0, -1, -2, 3, 2, 4, 5, 5, 3, 2, 1, 1, 0, 4, 3, 4, 5, 6, 5, 3, 2, 2, 1, 5, 4, 4, 6, 8, 9, 6, 4, 3, 3, -9, -9, -9, -11, -12, -11, -7, -4, -4, -5, -5, -6, -5, -6, -9, -8, -6, -4, -3, -5, -3, -3, -3, -3, -5, -6, -5, -4, -4, -5, -1, -2, -2, -3, -3, -5, -5, -4, -4, -5, 1, 0, -1, -1, -1, -2, -3, -3, -3, -4, 2, 1, 1, 0, 0, 0, -1, -2, -2, -3, 2, 2, 2, 2, 2, 1, 0, -1, -1, -2, 2, 2, 3, 4, 3, 1, 1, 1, 0, -1, 3, 3, 3, 5, 5, 4, 2, 1, 1, 1, 5, 4, 4, 5, 7, 8, 5, 3, 2, 2, -11, -11, -10, -11, -13, -11, -8, -4, -2, -3, -8, -8, -8, -8, -10, -9, -6, -4, -2, -3, -6, -6, -6, -6, -7, -7, -5, -3, -2, -3, -4, -5, -5, -6, -6, -5, -4, -3, -2, -3, -3, -3, -4, -5, -3, -3, -3, -2, -2, -2, -1, -2, -2, -2, -1, 0, -1, -1, 0, -1, 0, -1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 2, 3, 4, 3, 3, 2, 3, 3, 4, 4, 3, 5, 6, 7, 5, 4, 3, 3, -12, -12, -10, -11, -12, -10, -5, -2, -1, -1, -9, -10, -9, -8, -9, -8, -4, -2, -1, -1, -7, -8, -8, -7, -7, -6, -3, -2, 0, -1, -6, -6, -7, -6, -5, -4, -3, -1, 0, -1, -5, -5, -5, -5, -2, -1, -1, 0, 0, 0, -3, -4, -3, -2, -1, 1, 1, 1, 1, 1, -2, -2, -1, 0, 0, 2, 3, 2, 2, 2, 1, 0, 0, 1, 1, 3, 4, 3, 3, 3, 4, 3, 2, 2, 3, 4, 4, 4, 4, 4, 6, 5, 4, 4, 6, 7, 6, 5, 4, 4, -14, -12, -11, -10, -11, -9, -5, 0, 2, 2, -11, -11, -9, -8, -8, -6, -3, 
0, 1, 2, -9, -9, -8, -6, -5, -5, -2, 0, 1, 2, -8, -7, -7, -6, -4, -3, -1, 1, 2, 2, -6, -6, -6, -5, -2, 0, 1, 2, 3, 3, -5, -5, -4, -4, -1, 3, 4, 3, 4, 4, -4, -4, -3, -2, 0, 4, 5, 5, 5, 5, -1, -2, -1, 0, 0, 4, 6, 6, 6, 6, 3, 2, 1, 2, 3, 4, 6, 6, 6, 6, 6, 6, 4, 4, 6, 7, 8, 6, 5, 6, -15, -13, -11, -9, -9, -8, -4, 0, 3, 6, -13, -11, -9, -7, -6, -5, -2, 2, 4, 6, -11, -10, -8, -6, -4, -3, 1, 3, 5, 7, -9, -8, -7, -6, -3, -1, 2, 4, 5, 6, -8, -7, -7, -5, -3, 1, 4, 5, 6, 7, -7, -6, -5, -5, -2, 3, 7, 7, 7, 8, -5, -5, -4, -4, -1, 4, 8, 8, 8, 9, -3, -3, -3, -2, -1, 3, 8, 9, 10, 10, 1, 0, 0, 1, 1, 2, 6, 8, 8, 9, 5, 5, 3, 4, 4, 5, 6, 6, 6, 7 }, { -11, -14, -9, -8, -12, -17, -14, -12, -14, -18, -9, -10, -4, -4, -11, -17, -12, -10, -13, -16, -5, -7, 0, -3, -10, -16, -10, -9, -10, -12, -3, -4, 3, -1, -9, -14, -7, -7, -8, -9, -1, 0, 5, 0, -3, -7, -3, -4, -6, -6, 0, 2, 8, 2, -1, -6, 0, -2, -6, -8, 0, 2, 7, 4, -2, -7, -2, -4, -7, -9, 0, 1, 6, 3, -2, -9, -4, -6, -10, -13, 0, 1, 5, 3, -2, -10, -8, -9, -12, -17, 0, 1, 3, 2, -3, -12, -12, -13, -16, -21, -7, -10, -7, -7, -9, -13, -13, -10, -11, -13, -8, -9, -6, -7, -10, -13, -11, -9, -10, -11, -6, -8, -3, -5, -9, -13, -10, -8, -9, -7, -5, -6, -2, -4, -9, -12, -7, -7, -5, -4, -3, -2, 2, -2, -4, -6, -4, -5, -3, -2, -1, 0, 4, 0, -3, -6, -2, -2, -4, -2, 0, 1, 5, 1, -3, -7, -3, -4, -5, -4, 0, 1, 4, 1, -4, -7, -4, -6, -8, -8, -1, 1, 4, 0, -4, -9, -8, -9, -11, -14, -1, 0, 3, 0, -5, -11, -11, -12, -14, -18, -5, -8, -6, -6, -8, -9, -9, -10, -9, -8, -6, -9, -7, -8, -9, -10, -9, -9, -9, -7, -8, -9, -6, -7, -10, -10, -8, -8, -7, -4, -7, -7, -4, -6, -9, -10, -6, -8, -4, -1, -4, -4, 0, -2, -4, -5, -3, -6, -1, 2, -2, -1, 2, 0, -3, -4, -2, -5, -2, 3, -2, -1, 2, 0, -3, -6, -3, -6, -4, 2, -2, 0, 2, 0, -4, -6, -4, -8, -6, -3, -2, 0, 2, 0, -4, -8, -6, -10, -8, -9, -1, 0, 2, -1, -5, -9, -9, -12, -13, -15, -4, -6, -5, -5, -6, -6, -7, -8, -9, -6, -6, -8, -7, -7, -8, -8, -8, -8, -8, -5, -7, -9, -7, -8, -9, -8, -7, -7, -6, -2, -7, -8, -6, -6, -8, -8, -6, -6, -3, 0, 
-5, -5, -2, -3, -4, -5, -3, -3, -1, 3, -3, -2, 1, 0, -3, -4, -2, -2, 0, 4, -2, -1, 1, 0, -4, -4, -3, -3, -2, 3, -2, -1, 1, 0, -4, -5, -3, -5, -3, -1, -2, 0, 1, 0, -4, -6, -6, -8, -7, -7, -1, 0, 1, 0, -4, -7, -8, -10, -12, -13, -3, -3, -3, -3, -2, -2, -5, -7, -7, -5, -4, -6, -5, -5, -4, -3, -6, -6, -6, -4, -6, -7, -6, -6, -5, -3, -6, -5, -4, -2, -6, -7, -6, -6, -4, -3, -5, -4, -2, 0, -6, -5, -3, -3, -1, -1, -3, -2, 0, 2, -3, -2, 0, -1, 0, 0, -1, 0, 2, 3, -1, -1, 0, 0, 0, -1, -2, -1, 1, 2, -1, 0, 1, 0, -1, -1, -3, -1, 0, 0, 0, 1, 1, 1, -1, -3, -5, -4, -4, -6, 1, 1, 1, 0, -2, -4, -7, -8, -9, -12, 0, 1, 1, 1, 1, -1, -2, -3, -3, -3, -1, -1, -1, -1, 0, -1, -2, -3, -2, -2, -3, -3, -2, -2, -1, -2, -2, -2, -1, 0, -4, -4, -2, -3, -2, -3, -2, -1, 0, 2, -4, -3, -2, -2, 0, 0, 0, 0, 2, 4, -2, 0, 2, 1, 1, 0, 2, 2, 4, 5, 1, 2, 3, 2, 1, 0, 2, 3, 4, 5, 2, 3, 4, 2, 1, 0, 1, 3, 3, 3, 3, 4, 4, 3, 1, -1, -1, 1, 0, -2, 4, 3, 3, 1, 0, -3, -4, -4, -5, -8, 4, 4, 5, 4, 5, 4, 0, -1, -1, -1, 3, 3, 4, 3, 3, 3, 0, -1, 0, 0, 1, 2, 3, 2, 2, 2, 0, -1, 0, 1, 0, 1, 2, 1, 2, 3, 0, 0, 1, 3, -1, 1, 2, 2, 1, 2, 1, 0, 2, 4, 2, 3, 4, 4, 4, 5, 4, 2, 4, 6, 4, 6, 6, 5, 6, 6, 6, 4, 6, 7, 5, 7, 6, 5, 6, 6, 6, 4, 6, 6, 6, 7, 6, 5, 5, 4, 4, 3, 3, 0, 6, 6, 4, 3, 2, 0, 0, 0, -2, -5, 10, 8, 7, 6, 6, 5, 1, -1, -1, 0, 9, 7, 6, 5, 5, 4, 1, -1, 0, 1, 7, 6, 5, 4, 4, 3, 1, -1, 0, 2, 5, 4, 4, 3, 3, 3, 1, -1, 0, 3, 4, 4, 4, 3, 1, 2, 1, -1, 1, 3, 6, 6, 6, 5, 5, 6, 4, 1, 3, 6, 8, 8, 8, 7, 8, 9, 7, 4, 6, 9, 8, 8, 8, 7, 8, 8, 7, 5, 6, 8, 8, 7, 8, 7, 7, 7, 6, 4, 5, 2, 7, 6, 6, 5, 4, 3, 2, 1, 1, -3, 12, 10, 9, 9, 9, 7, 2, -1, 0, 0, 11, 9, 8, 7, 8, 6, 1, -2, 0, 1, 10, 8, 6, 4, 5, 5, 1, -3, -1, 1, 8, 6, 4, 3, 3, 4, 0, -3, -1, 1, 7, 5, 4, 2, 1, 3, 1, -3, -1, 2, 9, 8, 6, 4, 4, 6, 3, -1, 1, 5, 10, 10, 8, 7, 8, 9, 6, 2, 4, 9, 10, 10, 8, 8, 9, 10, 6, 4, 6, 10, 10, 10, 9, 9, 9, 9, 6, 4, 7, 5, 10, 9, 8, 7, 8, 6, 4, 3, 3, 0, 14, 13, 12, 12, 14, 9, 1, -2, 1, 1, 13, 11, 10, 8, 11, 7, 0, -3, 0, 0, 11, 10, 8, 5, 5, 5, 0, -4, -2, -1, 10, 9, 6, 
3, 3, 4, 0, -5, -3, -1, 9, 7, 5, 2, 1, 3, 0, -5, -4, 0, 11, 10, 7, 4, 4, 5, 1, -4, -3, 2, 12, 11, 9, 7, 8, 10, 4, -2, 1, 7, 12, 11, 10, 9, 10, 11, 5, 0, 4, 9, 12, 12, 11, 10, 12, 12, 6, 3, 7, 7, 13, 12, 11, 10, 12, 11, 6, 5, 5, 3 }, }, { { 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 1, 0, 0, -1, -1, -1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, -1, -2, -1, 1, 0, 0, -1, -1, -1, -1, -2, -2, -2, 1, 0, -1, -1, -1, -1, -1, -2, -2, -2, 0, 0, -1, -1, -2, -1, -1, -1, -2, -2, 0, -1, -1, -2, -2, -2, -1, -2, -2, -2, 0, -1, -1, -2, -2, -2, -2, -2, -1, -2, 0, 0, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, -1, 0, 1, 0, 0, -1, -1, -1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, -1, -1, -2, 1, 0, -1, -1, -1, -1, -1, -2, -2, -2, 0, 0, -1, -1, -2, -1, -1, -2, -2, -2, 0, 0, -1, -2, -2, -1, -1, -2, -1, -1, 0, 0, -1, -2, -2, -2, -2, -1, -1, -1, 0, 0, -1, -2, -2, -2, -1, -1, -1, -1, 0, 0, 0, -1, -2, -1, -1, 0, 0, 0, 0, -1, -1, -2, -2, -2, -1, 0, 0, 0, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, -1, -1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, -1, -1, -2, 1, 0, 0, -1, -1, -1, -1, -1, -2, -2, 0, 0, -1, -1, -2, -1, -1, -1, -2, -2, 0, 0, -1, -1, -2, -2, -2, -1, -1, -1, 0, 0, -1, -2, -2, -2, -1, -1, -1, -1, 0, 0, -1, -1, -2, -1, -1, 0, 0, 0, 0, 0, 0, -1, -2, -1, -1, 0, 1, 1, 0, -1, -1, -2, -3, -2, -1, 0, 0, -2, 1, 1, 0, 0, -1, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 1, 1, 0, 0, -1, -1, -1, -1, -1, -2, 1, 0, 0, -1, -1, -1, -1, -1, -2, -2, 0, 0, 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, -1, -2, -2, -1, -1, -1, -1, 0, 0, -1, -1, -2, -1, -1, 0, 0, 0, 0, 0, 0, -1, -2, -1, 0, 0, 0, 0, 1, 0, 0, -1, -1, -1, 0, 1, 1, 1, 0, -1, -1, -2, -3, -3, -1, 0, -1, -1, 1, 1, 0, -1, -1, -2, 0, 0, 0, 0, 1, 1, 0, 0, 0, -1, -1, 0, -1, -1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 1, 0, 0, 0, -1, -1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -2, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 
-1, 0, 1, 1, 1, 1, 0, 0, -1, -1, 0, 1, 1, 1, 1, 0, 0, -1, -2, -3, -2, -1, 0, -1, -1, 1, 1, 0, 0, -1, -2, -1, 0, 0, -2, 1, 1, 0, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 0, 0, -1, -1, -1, -1, -1, 1, 0, 0, 0, -1, -1, -1, -1, -1, -1, 1, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 0, 0, -1, -1, -2, -2, -1, 0, -1, -1, 1, 1, 1, 0, -1, -1, 0, 0, -1, -2, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, 1, 1, 0, 0, 0, 0, 0, 0, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, -1, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, -1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0, 0, -1, -1, -1, 0, 0, -1, -1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -2, 1, 1, 1, 1, 0, 0, 0, 0, -1, -2, 1, 1, 1, 1, 0, 0, 0, 0, -1, -1, 1, 1, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 1, 1, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0, 0, 0, 0, 0, -1, -1, 1, 1, 1, 1, 1, 1, 0, 0, -1, -2, 1, 1, 1, 1, 1, 1, 0, 0, -1, -1, 1, 1, 1, 1, 1, 0, 0, 0, -1, -1, 1, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, -1, 1, 2, 1, 1, 0, 0, 1, 0, 0, 0, 1, 3, 2, 2, 1, 1, 1, 1, 0, 0, 2, 3, 3, 3, 2, 1, 1, 1, 0, 1, 2, 3, 4, 3, 3, 2, 1, 1, 1, 1, 1, 1, 0, 0, -1, -1, 1, 1, 1, 1, 1, 1, 1, 0, -1, -2, 1, 1, 1, 1, 1, 1, 1, 0, -1, -1, 1, 1, 1, 1, 1, 1, 0, 0, -1, -2, 1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 0, 1, 0, 0, 0, 2, 3, 2, 2, 1, 1, 1, 1, 0, 1, 2, 4, 3, 3, 2, 2, 1, 1, 1, 1, 3, 4, 4, 3, 3, 2 }, { -1, -1, -1, -1, -2, -2, -3, -3, -4, -4, -1, -1, -1, -1, -1, -2, -2, -3, -3, -3, -3, -3, -2, -1, -1, -1, -2, -2, -1, -1, -3, -3, -3, -2, -2, -2, -1, 0, 0, 1, -3, -3, -2, -2, -2, -2, -1, 0, 1, 2, -2, -2, -2, -2, -1, -1, 0, 1, 2, 2, -2, -1, -1, -1, -1, 0, 0, 1, 1, 2, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 
1, -1, 0, -1, -2, -2, -2, -3, -4, -4, -4, -2, -2, -1, -1, -2, -2, -3, -3, -3, -3, -2, -2, -2, -2, -2, -1, -2, -2, -1, -1, -3, -3, -2, -2, -2, -2, -2, -1, 0, 2, -3, -3, -2, -2, -2, -2, -1, 0, 1, 1, -3, -2, -2, -2, -1, 0, 0, 1, 2, 2, -2, -1, -1, -1, 0, 0, 1, 1, 1, 2, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 0, 0, 1, 1, -1, 0, -1, -2, -2, -3, -3, -4, -4, -5, -1, -1, -1, -2, -2, -3, -3, -4, -4, -4, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -3, -2, -2, -2, -2, -2, -2, -1, 0, 0, -3, -3, -3, -2, -2, -1, -1, 0, 1, 1, -3, -2, -2, -2, -1, 0, 1, 2, 2, 2, -1, -2, -1, -1, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 4, 2, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, -1, -2, -2, -3, -4, -4, -5, -6, -1, -1, -1, -1, -2, -2, -4, -4, -5, -4, -2, -2, -1, -1, -2, -2, -2, -3, -3, -3, -2, -2, -2, -1, -2, -2, -2, -1, -1, -1, -3, -3, -2, -2, -1, -1, 0, 0, 0, 0, -2, -2, -2, -1, -1, 0, 1, 1, 1, 2, -1, -1, -1, -1, 0, 0, 1, 1, 2, 2, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 3, 2, 1, 1, 0, 0, 1, 1, 1, 1, 4, 3, 2, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, -1, -3, -4, -5, -6, -6, -1, -1, 0, -1, -1, -2, -3, -4, -5, -5, -2, -1, -1, -1, -1, -2, -3, -3, -3, -3, -2, -2, -2, -1, -1, -1, -2, -2, -2, -1, -2, -2, -2, -1, 0, 0, -1, 0, 0, 0, -1, -1, -2, -1, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 0, 0, 1, 1, 1, 1, 4, 3, 2, 1, 1, 0, 0, 1, 1, 1, 5, 4, 3, 2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, -1, -2, -3, -5, -6, -6, -1, 0, 0, 0, 0, -2, -3, -4, -4, -4, -2, -1, -1, 0, 0, -1, -2, -3, -3, -3, -2, -2, -1, -1, 0, -1, -2, -2, -2, -1, -2, -2, -1, -1, 0, 0, -1, 0, 0, 1, -1, 0, -1, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 4, 4, 3, 2, 1, 0, 0, 0, 1, 1, 5, 5, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -1, -2, -3, -4, -6, -1, 0, 0, 0, 0, -1, -2, -3, -3, -4, -2, -1, -1, 0, 0, -1, -1, -2, -2, -2, -2, -2, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 1, 0, 0, 0, 1, 1, 1, 3, 3, 2, 1, 1, 
0, 0, 0, 1, 1, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, -1, 0, 1, 1, 0, 0, -1, -2, -3, -3, -1, 0, 0, 0, 0, 0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, -1, -1, -1, -2, -1, -1, 0, 0, 1, 1, 0, 0, 0, -1, 0, -1, 0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 1, 1, 4, 4, 3, 2, 1, 0, 0, 0, 1, 1, 5, 5, 4, 3, 2, 1, 0, 0, 0, 0, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, -1, 0, 1, 1, 1, 1, 0, -1, -2, -2, -2, -1, 0, 0, 0, 0, 0, 0, -1, -1, -2, -1, -1, 0, 1, 1, 1, 1, 0, 0, -2, -1, -1, -1, 1, 1, 1, 1, 2, 2, -1, -1, -1, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 1, 0, 0, 0, 0, 1, 4, 5, 4, 2, 1, 1, 0, 0, 0, 0, 6, 6, 5, 3, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, -1, -1, -1, 0, 0, 1, 1, 1, 1, 2, -1, -2, -1, 0, 1, 1, 1, 2, 2, 2, -1, -1, 0, 0, 1, 1, 2, 2, 2, 2, -1, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 0, 1, 1, 3, 4, 4, 2, 1, 0, 0, 0, 0, 0, 5, 6, 4, 4, 2, 1, 0, 0, 0, 0 }, { -5, -4, -3, -2, -1, -1, -1, -1, -1, -1, -4, -3, -3, -1, 0, 0, 0, -1, -1, -1, -4, -3, -2, 0, 0, 1, 0, -1, -2, -2, -3, -2, -2, 0, 0, 1, 0, -2, -3, -3, -3, -2, -1, 0, 0, 0, 0, -2, -4, -6, -3, -1, -1, 0, 0, 0, -1, -3, -5, -7, -3, -2, -2, 0, 0, 0, -1, -3, -6, -8, -3, -2, -2, 0, 0, 0, -1, -3, -6, -8, -3, -2, -1, 1, 1, 0, -1, -3, -5, -8, -2, -1, -1, 1, 1, 0, -1, -3, -5, -7, -5, -4, -4, -3, -2, -2, -1, -1, -1, -1, -4, -4, -3, -3, -1, 0, 0, -1, -1, -1, -4, -4, -2, -2, -1, 0, 0, 0, -1, -1, -3, -3, -2, -1, 0, 0, 1, -1, -2, -2, -3, -3, -2, -1, 0, 0, 0, -1, -3, -4, -3, -3, -2, -1, 0, 0, 0, -2, -4, -5, -3, -2, -2, -1, 0, 0, 0, -2, -4, -6, -2, -2, -2, 0, 0, 0, 0, -2, -4, -6, -1, -1, -1, 1, 0, 0, 0, -1, -3, -6, -1, 0, 0, 1, 1, 1, 1, -1, -3, -5, -5, -5, -4, -4, -3, -2, -2, -2, -1, -1, -5, -5, -4, -3, -3, -2, -1, -1, -1, 0, -5, -4, -3, -3, -2, -1, 0, 0, -1, -1, -4, -4, -3, -2, -1, -1, 0, 0, -1, -1, -4, -4, -2, -2, -1, 0, 0, 0, -2, -3, -4, -4, -2, -2, -1, 0, 0, -1, -3, -4, 
-3, -3, -1, -1, 0, 0, 0, -1, -3, -5, -2, -2, 0, 0, 0, 0, 0, -1, -3, -5, -1, -1, 1, 1, 1, 0, 0, 0, -2, -4, 0, 0, 1, 1, 1, 1, 1, 1, -1, -3, -6, -4, -4, -4, -4, -3, -2, -1, -2, -1, -5, -4, -4, -3, -3, -3, -2, -2, -1, -1, -5, -4, -3, -3, -3, -2, -1, -1, -1, -1, -5, -4, -3, -2, -2, -1, -1, 0, -1, -1, -5, -5, -3, -2, -2, -1, 0, 0, -1, -2, -4, -4, -2, -2, -1, 0, 0, 0, -1, -3, -4, -4, -2, -1, 0, 0, 0, 0, -2, -3, -3, -3, -1, 0, 0, 0, 0, 0, -1, -2, -2, -2, 0, 1, 1, 1, 1, 1, 0, -1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 0, -5, -4, -3, -3, -3, -3, -2, -1, -1, -2, -5, -3, -3, -3, -3, -2, -2, -2, -1, -2, -6, -3, -3, -2, -2, -2, -2, -1, -1, -1, -6, -3, -2, -2, -2, -2, -1, 0, -1, -1, -5, -3, -2, -2, -1, -1, -1, 0, -1, -1, -5, -2, -1, -1, 0, 0, 0, 0, -1, -1, -5, -2, -2, -1, 0, 0, 0, 1, 0, -1, -4, -2, -1, 0, 0, 1, 1, 1, 0, 0, -3, -1, 0, 0, 1, 1, 1, 1, 1, 1, -1, 0, 1, 1, 1, 1, 1, 2, 2, 2, -3, -2, -2, -2, -2, -2, -2, -1, -1, -1, -4, -2, -2, -2, -1, -2, -2, -2, -1, -1, -4, -2, -2, -1, -1, -1, -2, -1, -1, -1, -4, -2, -1, -1, -1, -1, -1, -1, -1, -1, -4, -1, -1, 0, 0, 0, -1, 0, 0, -1, -4, -1, -1, 0, 1, 1, 0, 0, 0, 0, -4, -1, -1, 0, 1, 1, 1, 1, 1, 1, -4, -1, -1, -1, 0, 1, 1, 1, 1, 1, -3, -1, 0, 0, 1, 1, 1, 2, 2, 2, -2, 0, 1, 1, 1, 1, 1, 2, 2, 3, -1, 0, 0, 0, 0, 0, -1, -1, -1, -2, -1, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 1, 1, 1, 0, -1, -1, -1, -1, 0, 0, 1, 1, 1, 1, 0, -1, -1, 0, -1, 0, 1, 1, 2, 1, 0, 0, 0, 0, -2, -1, 0, 1, 2, 2, 0, 0, 1, 1, -2, -2, -1, 0, 1, 2, 0, 0, 1, 1, -2, -2, -1, -1, 1, 1, 0, 1, 1, 2, -2, -1, 0, 0, 1, 1, 1, 1, 2, 2, -1, 0, 1, 1, 1, 1, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 1, 0, 0, -2, 2, 2, 3, 3, 3, 2, 1, 0, 0, -2, 2, 3, 3, 4, 3, 3, 1, 0, 0, -2, 2, 3, 3, 4, 4, 3, 1, 0, 0, 0, 2, 3, 3, 4, 4, 3, 1, 0, 0, 0, -1, 0, 1, 3, 4, 3, 1, 0, 0, 0, -3, -2, -1, 0, 3, 2, 1, 0, 0, 1, -3, -2, -1, 0, 2, 2, 0, 0, 1, 1, -2, -1, -1, 0, 0, 1, 0, 0, 1, 2, -1, 0, 0, 1, 1, 1, 1, 1, 2, 2, 4, 5, 5, 5, 5, 4, 3, 2, 0, -2, 4, 5, 6, 6, 5, 4, 3, 2, -1, -2, 4, 5, 6, 6, 6, 5, 3, 2, 0, -2, 4, 5, 6, 7, 6, 5, 3, 2, 1, -2, 
3, 4, 6, 7, 6, 5, 3, 2, 1, -1, 0, 1, 3, 6, 6, 4, 2, 1, 0, -1, -2, -1, 0, 3, 5, 4, 2, 1, 0, 0, -3, -2, -1, 1, 3, 3, 1, 0, 0, 0, -2, -1, 0, 1, 2, 2, 1, 0, 0, 1, -1, -1, 0, 1, 1, 1, 0, 0, 1, 2, 7, 8, 8, 8, 7, 6, 5, 2, 1, -1, 7, 8, 9, 8, 8, 7, 5, 2, 0, -1, 6, 8, 9, 9, 8, 7, 5, 3, 0, -1, 6, 8, 9, 9, 8, 7, 5, 3, 0, -2, 5, 7, 9, 9, 8, 6, 4, 3, 0, -2, 2, 3, 6, 9, 8, 6, 4, 2, 1, -2, 0, 0, 3, 6, 7, 5, 3, 1, 0, -2, -1, -1, 0, 3, 5, 4, 2, 1, 0, -2, -1, -1, 0, 1, 3, 2, 1, 0, 0, 0, -1, -1, 1, 1, 2, 1, 1, 0, 0, 1 }, { 1, 1, 1, 0, 0, -1, -1, 2, 7, 12, 1, 1, 1, 0, 0, -1, 0, 3, 6, 10, 1, 1, 1, 1, 0, -1, 0, 1, 4, 7, 1, 1, 1, 0, -1, -2, -1, 0, 3, 5, 1, 1, 0, -1, -2, -2, -1, 2, 3, 3, 1, 0, -1, -2, -2, -2, 0, 1, 1, 2, 2, 0, -2, -2, -3, -3, -2, -1, -1, 0, 1, -1, -2, -3, -4, -4, -4, -3, -3, -2, 0, -2, -4, -4, -5, -6, -6, -5, -5, -3, -1, -3, -5, -6, -6, -7, -7, -8, -6, -5, 2, 2, 1, 0, -1, -2, 0, 3, 6, 9, 2, 2, 2, 1, 0, -1, 2, 4, 6, 7, 3, 2, 2, 1, 0, 0, 2, 3, 5, 5, 3, 2, 1, 1, -1, -1, 0, 3, 3, 3, 2, 2, 1, 0, -1, 0, 0, 2, 2, 1, 2, 1, 0, -1, -1, -1, 0, 0, 0, 0, 2, 1, -1, -2, -3, -3, -2, -2, -1, -2, 1, 0, -2, -3, -4, -4, -4, -3, -3, -3, 0, -1, -3, -4, -4, -5, -5, -5, -5, -4, -1, -2, -4, -5, -6, -6, -7, -8, -7, -6, 3, 3, 2, 0, -1, -2, 1, 3, 5, 7, 3, 3, 2, 1, 0, 0, 3, 4, 4, 4, 3, 3, 2, 2, 1, 2, 3, 3, 3, 2, 3, 2, 2, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, -1, -1, -1, 0, 0, -1, -1, 2, 1, 0, -2, -2, -2, -2, -2, -2, -2, 1, 0, -1, -2, -3, -4, -4, -4, -4, -4, 1, -1, -2, -3, -4, -4, -5, -6, -6, -5, 0, -1, -3, -4, -5, -5, -6, -7, -8, -7, 4, 3, 1, 1, 1, -1, 1, 2, 3, 4, 4, 3, 2, 2, 1, 1, 2, 3, 2, 2, 3, 3, 3, 2, 1, 2, 2, 2, 1, 1, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 1, 0, 0, 0, 0, -1, -1, 2, 1, 1, 0, -1, 0, -1, -1, -2, -2, 2, 1, 0, -1, -2, -2, -3, -3, -3, -3, 1, 1, 0, -2, -3, -3, -4, -5, -5, -4, 1, 0, -1, -2, -3, -4, -5, -6, -7, -6, 0, -1, -2, -3, -4, -5, -6, -7, -8, -8, 3, 3, 2, 2, 1, -1, 0, 1, 1, 2, 3, 3, 2, 2, 1, 0, 1, 1, 1, 1, 3, 3, 3, 2, 1, 1, 1, 0, 0, -1, 3, 3, 3, 2, 1, 0, 0, 
-1, -1, -1, 2, 2, 2, 1, 1, 0, 0, -1, -2, -2, 2, 2, 1, 1, -1, -1, -1, -2, -2, -2, 2, 1, 1, 0, -1, -2, -3, -4, -4, -3, 1, 1, 0, 0, -2, -2, -3, -4, -5, -4, 1, 0, 0, -1, -2, -2, -4, -5, -6, -6, 0, -1, -1, -1, -2, -3, -5, -6, -7, -8, 4, 3, 3, 2, 1, 0, 0, 0, 0, 1, 4, 3, 3, 2, 2, 0, 0, 0, 0, -1, 4, 3, 3, 2, 2, 0, 0, -1, -1, -1, 3, 3, 3, 3, 2, 0, 0, -1, -2, -2, 2, 2, 2, 2, 1, 1, 0, -1, -2, -2, 2, 2, 1, 2, 1, 0, -1, -2, -2, -2, 2, 1, 1, 1, 0, -1, -2, -3, -3, -3, 1, 1, 1, 1, 0, -1, -3, -4, -4, -4, 1, 0, 0, 0, -1, -1, -4, -5, -5, -6, 0, 0, 0, 0, -1, -2, -5, -6, -6, -7, 4, 4, 3, 2, 2, 1, 0, -1, -1, -1, 4, 4, 3, 3, 2, 1, 0, -1, -1, -1, 4, 4, 4, 3, 2, 1, 0, -1, -1, -1, 3, 3, 3, 3, 2, 2, 1, -1, -2, -2, 2, 2, 2, 2, 2, 2, 1, -1, -2, -2, 2, 1, 1, 1, 1, 1, 0, -2, -2, -3, 1, 1, 1, 0, 0, -1, -1, -3, -3, -3, 1, 1, 0, 0, -1, -1, -2, -4, -4, -4, 0, 0, 0, -1, -1, -2, -4, -5, -5, -6, -2, -1, -1, -1, -2, -2, -5, -6, -6, -7, 4, 4, 3, 3, 2, 2, 1, 0, -1, -1, 4, 4, 4, 3, 2, 2, 1, 0, -1, -1, 4, 4, 4, 3, 3, 3, 2, 0, -1, -2, 3, 3, 3, 3, 3, 2, 1, 0, -1, -2, 2, 2, 2, 2, 2, 2, 1, 0, -2, -2, 2, 1, 1, 0, 0, 0, 0, -1, -2, -3, 1, 1, 0, -1, -1, -1, -1, -2, -3, -3, 0, 0, -1, -1, -2, -2, -2, -4, -4, -4, -1, -1, -2, -2, -3, -3, -3, -5, -5, -6, -3, -3, -3, -3, -3, -3, -5, -6, -7, -7, 4, 4, 4, 4, 3, 3, 2, 0, -1, -1, 4, 5, 5, 4, 3, 3, 2, 1, -1, -1, 3, 4, 4, 4, 3, 3, 2, 0, -1, -1, 3, 3, 3, 3, 3, 3, 2, 0, -1, -2, 2, 2, 1, 1, 2, 2, 2, 0, -1, -2, 1, 0, 0, 0, 0, 0, 0, -1, -2, -2, 0, -1, -1, -2, -1, -1, -1, -2, -2, -3, -1, -2, -2, -3, -2, -2, -2, -3, -3, -4, -3, -3, -3, -4, -3, -3, -2, -4, -5, -5, -5, -5, -5, -4, -4, -4, -4, -5, -6, -7, 4, 4, 4, 4, 3, 3, 2, 0, -1, -1, 5, 5, 4, 4, 4, 3, 2, 1, 0, -1, 4, 4, 4, 4, 4, 4, 3, 1, 0, -1, 2, 3, 3, 3, 3, 3, 3, 1, 0, -1, 1, 1, 1, 1, 2, 2, 2, 1, 0, -1, 0, -1, -1, 0, 0, 0, 1, 0, -1, -2, -3, -2, -2, -2, -1, -1, 0, -1, -2, -2, -4, -3, -3, -3, -2, -2, -1, -2, -3, -3, -7, -5, -4, -4, -3, -3, -2, -3, -4, -4, -9, -7, -6, -5, -4, -4, -4, -5, -5, -6 }, { -6, -10, -10, -10, -9, -9, -10, 
-11, -12, -12, -5, -8, -9, -8, -7, -6, -6, -7, -8, -8, -3, -6, -7, -7, -5, -3, -4, -5, -6, -7, -2, -5, -6, -5, -4, -1, -2, -3, -5, -5, 0, -3, -4, -4, -2, 0, 0, 0, -1, -2, 1, -2, -3, -3, -1, 2, 3, 2, 1, 0, 4, -1, -3, -2, 0, 3, 4, 4, 2, 1, 5, 2, -2, -2, 0, 3, 5, 5, 4, 2, 7, 4, 1, 0, 0, 3, 6, 7, 6, 5, 9, 7, 4, 1, 1, 3, 6, 8, 9, 11, -4, -8, -8, -8, -8, -9, -10, -11, -12, -13, -2, -6, -7, -7, -6, -6, -7, -9, -10, -11, -1, -5, -6, -6, -4, -4, -5, -7, -9, -9, 0, -3, -5, -5, -3, -2, -4, -6, -7, -7, 1, -2, -3, -3, -1, 0, -1, -3, -4, -5, 3, -1, -2, -2, 0, 1, 2, 0, -1, -2, 4, 0, -1, -1, 0, 3, 3, 2, 0, -2, 6, 2, -1, -1, 0, 3, 4, 3, 1, -1, 7, 5, 2, -1, 1, 3, 5, 5, 3, 2, 9, 6, 4, 0, 1, 3, 5, 7, 6, 5, -1, -4, -6, -6, -7, -9, -10, -11, -12, -13, 0, -4, -5, -5, -5, -6, -8, -9, -10, -11, 1, -3, -5, -4, -4, -5, -7, -8, -10, -11, 2, -2, -4, -4, -3, -3, -5, -7, -8, -9, 2, 0, -2, -2, -1, -1, -3, -5, -5, -5, 3, 1, 0, 0, 0, 1, 0, -2, -3, -4, 5, 2, 0, 0, 0, 2, 2, 0, -2, -3, 6, 4, 0, 0, 0, 2, 2, 1, -1, -2, 8, 5, 2, 0, 0, 3, 4, 2, 1, -1, 9, 7, 4, 1, -1, 2, 5, 5, 4, 3, 0, -1, -3, -4, -5, -7, -8, -9, -11, -12, 1, -1, -3, -4, -5, -6, -7, -8, -10, -11, 2, 0, -2, -3, -4, -5, -7, -8, -9, -10, 2, 1, -2, -3, -3, -4, -6, -7, -8, -9, 3, 2, 0, -1, -1, -2, -4, -5, -6, -6, 4, 3, 2, 1, 0, 0, -2, -3, -4, -4, 6, 4, 2, 1, 0, 1, 0, -2, -3, -4, 7, 5, 2, 1, 0, 1, 1, -1, -2, -3, 8, 6, 3, 0, 0, 2, 1, 0, -1, -2, 10, 7, 4, 2, -1, 1, 3, 2, 2, 1, 1, 0, -1, -2, -3, -4, -7, -8, -9, -11, 2, 1, 0, -1, -2, -3, -6, -7, -9, -10, 3, 2, 1, 0, -2, -3, -6, -7, -8, -9, 3, 2, 1, 1, -2, -2, -5, -6, -7, -7, 3, 3, 2, 2, -1, -1, -4, -4, -5, -5, 4, 4, 4, 3, 1, 0, -2, -3, -4, -5, 6, 5, 4, 3, 1, 1, -1, -2, -3, -4, 8, 6, 4, 2, 1, 1, 0, -2, -3, -3, 8, 6, 4, 2, 1, 1, 0, -1, -2, -2, 10, 7, 5, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, -1, -2, -3, -5, -7, -9, -10, 2, 2, 1, 0, -1, -2, -5, -6, -8, -9, 3, 3, 2, 1, -1, -2, -4, -6, -7, -8, 3, 3, 3, 2, -1, -1, -4, -5, -6, -6, 4, 3, 3, 3, 0, 0, -2, -4, -4, -4, 5, 5, 5, 5, 2, 1, -1, -3, -3, -4, 7, 6, 6, 5, 2, 1, 
0, -2, -3, -4, 8, 7, 6, 4, 2, 1, 0, -1, -2, -3, 8, 7, 5, 3, 2, 1, 0, -1, -2, -3, 9, 7, 6, 3, 2, 1, 0, 0, -1, -1, 0, 0, -1, -1, -2, -3, -4, -7, -9, -11, 2, 1, 1, 0, -1, -2, -3, -6, -8, -9, 2, 2, 1, 1, -1, -2, -3, -5, -7, -8, 3, 2, 2, 2, 0, -1, -2, -4, -6, -7, 3, 3, 3, 3, 2, 1, -1, -3, -4, -4, 5, 6, 6, 5, 4, 2, 1, -1, -3, -4, 8, 8, 7, 6, 5, 4, 3, 0, -2, -3, 9, 8, 7, 6, 5, 4, 3, 1, -1, -3, 10, 8, 7, 6, 5, 4, 3, 1, -1, -2, 10, 9, 7, 5, 4, 3, 2, 1, 0, -1, -1, -1, -3, -4, -4, -3, -3, -6, -10, -11, 1, 0, -1, -2, -3, -3, -3, -5, -8, -10, 2, 1, 1, -1, -2, -2, -2, -4, -7, -9, 3, 2, 2, 0, -1, -2, -1, -3, -6, -7, 3, 3, 2, 2, 0, 0, 0, -1, -3, -5, 5, 5, 5, 4, 3, 2, 2, 1, -2, -4, 8, 8, 7, 6, 5, 4, 4, 2, 0, -3, 10, 9, 8, 7, 6, 4, 4, 3, 1, -3, 11, 9, 8, 7, 6, 5, 5, 4, 1, -2, 12, 10, 8, 7, 6, 5, 5, 4, 2, -1, -3, -5, -6, -6, -4, -4, -4, -6, -9, -12, -1, -2, -4, -4, -4, -3, -2, -4, -7, -11, 1, 0, -2, -3, -3, -2, -2, -2, -5, -9, 2, 1, 0, -2, -2, -2, -1, -1, -4, -7, 3, 2, 1, 0, 0, 0, 0, 0, -2, -4, 4, 4, 3, 3, 1, 1, 2, 2, 0, -3, 7, 7, 6, 5, 3, 3, 3, 3, 2, -2, 9, 9, 8, 7, 5, 4, 5, 4, 3, -2, 10, 9, 9, 8, 6, 5, 5, 5, 4, -1, 12, 10, 9, 8, 7, 6, 5, 5, 4, -1, -8, -8, -7, -6, -5, -4, -4, -5, -8, -12, -5, -6, -6, -6, -4, -3, -2, -3, -6, -10, -1, -3, -4, -5, -4, -3, -2, -1, -4, -8, 0, -1, -3, -3, -3, -2, -1, 0, -3, -6, 2, 1, -1, -2, -1, -1, 0, 0, 0, -3, 4, 3, 2, 1, 0, 0, 1, 3, 1, -2, 6, 6, 4, 3, 2, 2, 3, 4, 3, -1, 8, 8, 6, 5, 4, 3, 4, 5, 5, 0, 9, 9, 8, 7, 5, 5, 5, 6, 6, 1, 11, 9, 9, 8, 7, 6, 6, 6, 6, 0 }, { 2, 1, 3, 1, -3, -11, -12, -12, -15, -17, 2, 1, 4, 2, -3, -10, -8, -9, -12, -15, 2, 2, 5, 2, -3, -8, -3, -6, -9, -12, 2, 2, 5, 2, -2, -7, -1, -4, -7, -10, 2, 1, 5, 2, -2, -6, -1, -3, -6, -9, 0, 0, 3, 0, -3, -5, -2, -5, -8, -9, -4, -3, 3, -2, -8, -11, -5, -7, -11, -12, -6, -4, 2, -2, -9, -14, -7, -9, -12, -14, -9, -7, -1, -3, -10, -14, -8, -11, -13, -15, -10, -11, -6, -7, -12, -15, -9, -11, -14, -16, 2, 1, 2, 0, -5, -9, -10, -10, -12, -15, 2, 1, 2, 0, -5, -8, -7, -8, -10, -13, 1, 1, 3, 1, -4, 
-6, -3, -6, -7, -10, 0, 1, 3, 1, -4, -5, -1, -4, -5, -7, -1, 0, 4, 1, -3, -5, -1, -3, -3, -5, -4, -2, 1, -1, -4, -5, -2, -4, -4, -5, -8, -5, -2, -4, -9, -10, -6, -6, -7, -8, -9, -7, -3, -5, -10, -12, -7, -8, -9, -10, -9, -8, -4, -5, -10, -11, -7, -9, -11, -12, -7, -8, -6, -6, -9, -11, -8, -9, -11, -13, 1, 0, 1, -1, -5, -7, -8, -9, -10, -13, 0, 0, 1, -1, -5, -6, -5, -7, -8, -11, 0, 0, 1, -1, -4, -5, -3, -5, -5, -8, -1, 0, 2, 0, -4, -4, -2, -3, -3, -5, -2, -1, 2, 0, -3, -4, -2, -2, -1, -3, -6, -4, 0, -2, -4, -4, -3, -3, -2, -2, -9, -7, -4, -6, -8, -9, -6, -6, -5, -4, -10, -9, -6, -7, -9, -10, -8, -7, -6, -6, -9, -8, -6, -6, -8, -9, -7, -7, -8, -8, -6, -7, -6, -6, -7, -8, -7, -7, -8, -10, 1, 0, 1, -1, -4, -6, -7, -7, -9, -11, 0, 0, 1, -1, -3, -5, -5, -6, -7, -9, 0, 0, 1, -1, -3, -3, -3, -4, -4, -5, -2, -1, 1, -1, -3, -3, -2, -2, -2, -3, -3, -2, 0, -1, -3, -3, -2, -1, -1, -2, -7, -5, -2, -3, -3, -3, -3, -3, -1, -1, -10, -8, -5, -6, -7, -6, -6, -6, -4, -4, -10, -9, -6, -6, -8, -7, -7, -7, -6, -5, -8, -8, -6, -6, -7, -7, -6, -7, -7, -6, -6, -6, -5, -5, -6, -6, -6, -6, -7, -7, 1, 1, 1, 1, -1, -3, -5, -6, -8, -10, 0, 0, 1, 1, 0, -2, -4, -4, -4, -5, -1, 0, 0, 0, 0, -1, -2, -3, -2, -2, -2, -1, 0, 0, 1, 0, -1, -2, -1, -1, -3, -3, -1, 0, 1, 0, 0, -1, 0, 0, -7, -5, -3, -2, 0, -1, -2, -2, 0, 0, -9, -7, -5, -4, -2, -3, -4, -5, -3, -1, -8, -7, -5, -5, -2, -3, -5, -5, -4, -2, -6, -6, -4, -4, -2, -3, -5, -5, -5, -4, -4, -3, -2, -2, 0, -2, -4, -5, -5, -5, 2, 2, 2, 2, 0, -2, -3, -3, -4, -7, 1, 2, 3, 2, 1, 0, 1, 1, 0, -2, 1, 1, 2, 2, 1, 1, 3, 3, 3, 1, 0, 0, 2, 3, 2, 1, 3, 3, 4, 2, -2, -1, 1, 2, 1, 1, 3, 3, 4, 3, -5, -3, -1, 0, 0, 0, 1, 0, 1, 1, -6, -5, -3, -2, -1, -2, -1, -1, 0, 1, -5, -4, -3, -2, -1, -2, -1, -1, -1, 0, -3, -2, -2, 0, 0, -1, -1, -1, -1, -1, 0, 1, 1, 2, 2, 0, -1, -1, -1, -2, 4, 3, 4, 3, 2, 0, 0, -1, -1, -5, 3, 4, 5, 6, 6, 5, 4, 3, 3, 0, 3, 4, 5, 6, 6, 6, 5, 4, 4, 4, 2, 4, 6, 6, 7, 6, 5, 4, 5, 5, 1, 2, 4, 5, 6, 5, 4, 2, 4, 6, -2, -1, 2, 2, 2, 2, 1, -1, 0, 1, -3, -1, 0, 
1, 2, 1, 0, -2, -1, 1, -2, -1, 0, 1, 2, 2, 0, -2, -1, 0, 1, 1, 1, 2, 3, 2, 1, -2, -1, -1, 4, 4, 4, 4, 5, 3, 1, -1, -1, -1, 6, 6, 6, 6, 4, 2, 2, 1, 0, -3, 6, 6, 7, 8, 8, 7, 6, 4, 4, 3, 6, 7, 7, 7, 8, 8, 6, 4, 6, 6, 5, 6, 7, 7, 8, 8, 6, 3, 4, 6, 3, 5, 7, 6, 6, 6, 4, 1, 3, 5, 0, 1, 4, 2, 2, 2, 1, -2, -1, 1, 0, 1, 2, 2, 2, 2, 0, -3, -2, 0, 1, 2, 2, 2, 3, 2, 0, -3, -2, 0, 3, 3, 3, 3, 4, 3, 0, -2, -2, -1, 7, 7, 6, 5, 6, 4, 1, -1, -1, 0, 9, 8, 9, 9, 7, 5, 3, 2, 2, -1, 10, 9, 9, 9, 10, 9, 6, 4, 5, 4, 10, 9, 9, 8, 10, 9, 6, 3, 5, 7, 9, 9, 8, 7, 9, 9, 5, 1, 3, 6, 7, 9, 8, 5, 6, 7, 3, 0, 2, 4, 3, 4, 5, 2, 2, 2, 0, -3, -2, 0, 3, 4, 4, 2, 2, 2, -1, -3, -2, -1, 3, 4, 3, 2, 3, 3, -1, -3, -3, -1, 5, 6, 4, 3, 4, 3, -1, -3, -2, -1, 9, 9, 7, 7, 8, 5, 1, -1, -1, 0, 12, 10, 9, 10, 12, 9, 5, 4, 5, 4, 13, 11, 10, 9, 12, 11, 5, 2, 5, 6, 13, 11, 9, 8, 11, 10, 3, -2, 1, 6, 12, 11, 9, 7, 9, 8, 2, -4, -2, 4, 12, 11, 8, 5, 5, 6, 1, -4, -3, 2, 6, 9, 6, 1, 1, 2, -1, -5, -5, -1, 6, 7, 5, 1, 2, 2, -2, -5, -5, -2, 7, 7, 4, 1, 4, 3, -2, -5, -4, -2, 9, 9, 4, 3, 7, 5, -1, -4, -2, -2, 13, 11, 9, 11, 12, 8, 1, -2, 0, 0 }, }, { { 0, 0, -1, -1, 0, 0, 1, 1, 2, 2, 0, 0, -1, -1, 0, 0, 1, 1, 2, 2, 0, 0, -1, -1, -1, 0, 0, 1, 2, 2, -1, 0, -1, -1, -1, -1, 0, 1, 1, 1, -1, -1, -1, -1, -1, -1, 0, 0, 1, 1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -2, -2, -1, -1, -1, -1, -1, 0, 0, 0, -2, -2, -2, -2, -1, -1, -1, -1, -1, 0, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, 0, -1, -1, -2, -2, 1, 1, 0, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0, -1, -1, 0, 1, 1, 2, 2, 0, 0, 0, -1, -1, 0, 0, 1, 1, 2, 0, 0, 0, 0, -1, -1, 0, 1, 1, 1, -1, -1, 0, 0, -1, -1, 0, 0, 1, 1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, 0, -1, -1, -1, -1, 0, -2, -2, -1, -1, 0, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, -1, 0, 0, 1, 2, 2, 1, 0, 1, 0, -1, -1, 0, 1, 1, 2, 0, 0, 0, 0, -1, -1, 0, 0, 1, 1, 
0, -1, 0, 0, 0, 0, 0, 0, 0, 1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, 0, -1, -2, -2, -2, 2, 1, 0, 0, 0, 1, 1, 2, 2, 3, 2, 2, 1, 0, 0, 0, 1, 1, 2, 3, 2, 2, 1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -2, -1, -1, -1, -1, 0, -1, -1, -2, -1, 2, 2, 1, 0, 0, 1, 1, 2, 2, 3, 2, 2, 1, 0, 0, 1, 1, 2, 2, 3, 2, 2, 2, 1, 0, 0, 1, 2, 2, 3, 2, 2, 2, 1, 0, 0, 1, 1, 2, 3, 1, 1, 1, 1, 1, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, -2, -1, -1, -1, -1, 0, 0, -1, -1, -1, 3, 2, 1, 0, 0, 1, 1, 2, 2, 3, 2, 2, 1, 1, 0, 1, 1, 2, 2, 3, 2, 2, 2, 1, 0, 0, 1, 2, 2, 3, 2, 2, 2, 1, 0, 0, 0, 1, 2, 3, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, -1, -1, -1, 0, 0, 1, 1, 0, 0, 0, -1, -1, -1, -1, 0, 0, 1, 0, -1, -1, 3, 2, 1, 1, 0, 1, 1, 2, 3, 3, 2, 2, 2, 1, 0, 0, 1, 2, 2, 3, 2, 2, 2, 1, 0, 0, 1, 1, 2, 3, 1, 2, 2, 1, 1, 0, 0, 1, 2, 3, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, -1, -1, -1, 0, 0, 1, 1, 1, 0, 0, 3, 2, 1, 1, 0, 0, 1, 2, 3, 4, 3, 3, 2, 1, 0, 0, 1, 2, 3, 3, 2, 2, 2, 1, 0, 0, 1, 1, 2, 3, 2, 2, 2, 1, 0, 0, 0, 1, 2, 3, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, -1, -1, 0, 0, 1, 1, 1, 1, 1, 1, 3, 2, 2, 1, 0, 1, 1, 2, 3, 4, 3, 3, 2, 1, 0, 0, 1, 2, 3, 3, 2, 2, 1, 1, 0, 0, 1, 1, 2, 3, 2, 1, 1, 0, 0, 0, 1, 1, 2, 3, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, -1, 0, 0, 1, 1, 1, 1, 1, 1, 2, -1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 3, 2, 
1, 0, 0, 1, 2, 2, 3, 4, 3, 2, 1, 0, 0, 0, 1, 2, 3, 3, 2, 2, 1, 0, 0, 0, 1, 2, 2, 3, 2, 1, 0, 0, 0, 0, 1, 1, 2, 3, 1, 1, 0, 0, 0, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 0, 0, 1, 1, 1, 2, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 0, 1, 2, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, { 3, 2, 1, 1, 1, 1, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 2, 3, 2, 1, 0, 0, 0, 0, 1, 2, 3, 3, 1, 0, 0, -1, 0, 0, 1, 2, 3, 3, 0, 0, -1, -1, 0, 0, 1, 2, 3, 3, 0, -1, -1, -1, -1, 0, 1, 2, 2, 2, -1, -1, -1, -2, -1, 0, 1, 1, 2, 2, -1, -2, -2, -2, -1, 0, 0, 1, 1, 1, -1, -2, -2, -2, -1, -1, 0, 0, 1, 1, 4, 3, 2, 2, 1, 2, 2, 2, 2, 2, 4, 3, 2, 1, 1, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, 2, 2, 3, 2, 1, 1, 0, 0, 1, 1, 2, 3, 3, 2, 1, 0, 0, 0, 1, 2, 3, 3, 3, 1, 0, 0, 0, 0, 1, 2, 3, 3, 3, 0, 0, -1, -1, 0, 1, 2, 2, 3, 3, -1, -1, -1, -1, 0, 1, 2, 2, 3, 3, 0, -1, -1, -1, 0, 1, 1, 2, 2, 2, -1, -2, -1, -1, 0, 0, 1, 1, 1, 1, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 2, 1, 2, 2, 2, 2, 2, 4, 3, 2, 1, 1, 1, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 2, 2, 2, 3, 2, 1, 1, 0, 1, 2, 2, 2, 3, 3, 1, 0, 0, 0, 0, 2, 2, 3, 3, 3, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 0, -1, -1, 0, 0, 1, 2, 3, 3, 3, 0, -1, -1, 0, 0, 1, 2, 2, 2, 2, -1, -1, -1, -1, 0, 1, 1, 2, 1, 1, 5, 4, 2, 2, 2, 2, 2, 2, 2, 2, 5, 4, 2, 2, 1, 2, 2, 2, 2, 2, 4, 3, 2, 1, 1, 1, 2, 2, 2, 2, 3, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 3, 0, 0, 0, 0, 1, 2, 2, 2, 3, 3, 0, 0, 0, 0, 1, 2, 2, 3, 3, 3, 0, -1, -1, 0, 1, 1, 2, 2, 3, 3, -1, -1, -1, 0, 1, 1, 2, 2, 2, 2, -1, -1, -1, 0, 1, 1, 1, 2, 1, 1, 5, 4, 3, 2, 2, 2, 2, 2, 3, 2, 5, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 1, 1, 2, 2, 2, 2, 2, 3, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 0, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 0, 0, 0, 1, 2, 2, 3, 3, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 0, 0, 5, 4, 2, 1, 1, 2, 2, 2, 3, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 2, 2, 
2, 2, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 0, -1, 0, 0, 0, 1, 1, 1, 1, 1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, 4, 3, 1, 0, 0, 1, 1, 2, 2, 2, 4, 3, 2, 1, 1, 1, 1, 2, 2, 2, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, -1, 0, -1, -1, -2, -2, -1, 0, -1, 0, 0, -1, 0, -1, -2, -3, -3, 3, 2, 0, -1, 0, 1, 1, 2, 3, 2, 3, 3, 1, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 0, -1, -1, -1, 0, 0, -1, -1, -1, -1, 0, 0, 0, -1, -1, -1, -2, -2, -2, -2, 0, 0, -1, -1, -1, -1, -2, -3, -3, -3, -1, -1, -1, -1, -1, -1, -2, -3, -4, -4, 1, 1, 0, -1, 0, 0, 1, 2, 3, 2, 1, 1, 0, 0, -1, 0, 1, 2, 2, 2, 0, 0, 0, -1, -1, 0, 1, 1, 2, 2, -1, -1, -1, -1, -1, 0, 0, 1, 1, 2, -1, -1, -2, -1, -1, 0, 0, 1, 1, 1, -1, -2, -2, -2, -1, 0, 0, 0, 1, 1, -1, -2, -2, -2, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -3, -3, -4, -1, -1, -1, -1, -2, -2, -2, -3, -4, -4, 0, -1, -1, -1, -1, 0, 1, 2, 3, 2, -1, -1, -1, -1, -1, 0, 1, 2, 2, 2, -1, -2, -2, -1, -1, 0, 1, 1, 2, 2, -2, -2, -2, -2, -1, 0, 1, 1, 2, 2, -2, -2, -2, -2, -1, -1, 0, 1, 1, 2, -2, -3, -2, -2, -1, -1, 0, 0, 1, 1, -2, -2, -2, -2, -1, 0, 0, -1, 0, 0, -2, -2, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -2, -3, -4, -2, -2, -2, -2, -2, -2, -2, -2, -4, -5 }, { 5, 5, 3, 2, 0, 0, 0, 0, 0, 4, 6, 6, 5, 3, 1, -1, -1, -1, -1, 1, 7, 7, 4, 4, 2, 0, -1, -1, -1, -1, 8, 6, 2, 2, 3, 1, 0, 0, -1, -1, 8, 7, 5, 4, 3, 2, 1, 0, 0, 0, 7, 6, 6, 5, 3, 2, 2, 1, 0, 0, 6, 5, 4, 3, 2, 2, 2, 2, 2, 2, 4, 3, 2, 1, 1, 1, 2, 2, 3, 3, 3, 2, 1, 0, 0, 1, 0, 1, 2, 3, 1, 0, 0, -1, -2, -1, 0, 0, 1, 2, 2, 2, 2, 1, 0, 0, 0, 1, 1, 3, 4, 3, 3, 2, 0, -1, -1, 0, 0, 1, 5, 5, 2, 2, 1, 0, -1, -1, -1, -1, 5, 4, 0, 1, 1, 0, 0, -1, -1, -1, 5, 6, 
3, 2, 2, 1, 1, 0, 0, 0, 5, 5, 5, 4, 3, 2, 1, 1, 0, 0, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 3, 3, 1, 1, 0, 0, 0, 1, 2, 2, 3, 3, 0, 0, -1, -1, -1, 0, 1, 1, 2, 3, 0, 0, 0, 1, 0, 1, 1, 1, 1, 3, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 3, 2, 1, -1, -1, 0, 0, 0, 3, 3, 1, 0, 1, 0, -1, 0, 0, 0, 2, 3, 2, 1, 1, 1, 0, 0, 0, 0, 2, 3, 4, 3, 2, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 3, 3, 3, 0, 0, 0, -1, 0, 0, 1, 2, 3, 3, -2, -2, -1, 0, 1, 1, 1, 1, 2, 3, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, -1, 0, 0, 1, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, -1, -1, 0, 0, 0, 1, 2, 3, 3, 3, -1, -1, 0, 0, 0, 0, 1, 2, 3, 3, -3, -3, -2, 0, 1, 2, 2, 2, 2, 3, -3, -2, -2, -1, 0, 1, 1, 1, 2, 2, -2, -2, -1, -1, 0, 0, 0, 1, 1, 1, -2, -2, -1, 0, 0, 0, 0, 1, 1, 1, -2, -2, -1, 0, 0, -1, 0, 1, 1, 1, -2, -2, -1, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, 0, 0, 0, 1, 1, 2, 1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 2, 0, -1, -1, 0, 0, 0, 1, 2, 2, 3, -4, -3, -2, -1, 0, 1, 2, 2, 2, 3, -4, -3, -3, -1, 0, 1, 1, 2, 2, 2, -4, -3, -3, -2, -1, 0, 1, 1, 2, 2, -4, -3, -3, -2, -1, 0, 0, 1, 2, 2, -4, -3, -3, -2, -1, 0, 0, 1, 2, 1, -3, -3, -3, -2, -1, 0, 0, 1, 2, 2, -2, -2, -2, -2, -1, 0, 0, 1, 1, 2, -1, -2, -2, -1, 0, 0, 0, 1, 1, 1, -1, -1, -2, -1, 0, 0, 0, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, -5, -4, -3, -2, 0, 1, 1, 2, 2, 3, -5, -4, -3, -2, -1, 0, 1, 1, 2, 2, -5, -4, -4, -2, -1, 0, 1, 1, 2, 2, -4, -4, -4, -3, -1, 0, 0, 1, 2, 2, -4, -4, -4, -3, -2, -1, 0, 1, 2, 2, -3, -3, -3, -3, -2, -1, 0, 0, 1, 2, -2, -2, -2, -2, -1, 0, 0, 0, 0, 1, -1, -2, -2, -2, -1, 0, -1, -1, 0, 0, 0, -1, -1, -1, -1, 0, -1, -1, -1, 0, -4, -3, -2, -1, 0, 1, 2, 3, 3, 3, -5, -4, -3, -2, -1, 0, 1, 2, 3, 3, -5, -4, -3, -2, -1, 0, 1, 2, 2, 3, -6, -5, -4, -2, -1, 0, 0, 1, 2, 2, -6, -5, -4, -3, -2, -1, 0, 1, 2, 2, 
-5, -5, -4, -3, -2, -1, 0, 1, 1, 2, -4, -4, -4, -3, -2, -1, 0, 0, 1, 1, -2, -3, -3, -3, -2, -1, 0, 0, 0, 0, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, 0, -1, -1, -2, -2, -1, -1, -2, -2, -2, -4, -3, -2, -1, 0, 1, 1, 2, 3, 3, -5, -4, -3, -2, -1, 0, 1, 2, 2, 3, -6, -4, -3, -2, -1, 0, 1, 1, 2, 3, -6, -5, -4, -3, -1, -1, 0, 1, 2, 2, -6, -5, -4, -3, -2, -1, 0, 0, 1, 2, -5, -5, -4, -3, -2, -1, 0, 0, 1, 1, -4, -5, -4, -3, -2, -1, 0, 0, 0, 1, -3, -3, -3, -3, -3, -1, -1, -1, -1, -1, -2, -2, -3, -3, -3, -2, -2, -2, -3, -3, -1, -1, -1, -2, -2, -2, -2, -3, -4, -4, -5, -3, -2, -1, -1, 0, 1, 1, 2, 3, -6, -4, -3, -2, -1, 0, 1, 1, 2, 3, -6, -5, -3, -2, -1, 0, 0, 1, 1, 2, -6, -5, -4, -3, -2, -1, 0, 0, 1, 2, -6, -5, -4, -3, -2, -1, 0, 0, 1, 1, -5, -5, -4, -3, -2, -1, 0, 0, 0, 1, -4, -4, -4, -3, -3, -2, -1, 0, 0, 0, -3, -3, -3, -3, -3, -2, -1, -1, -2, -2, -2, -2, -2, -3, -3, -3, -2, -3, -4, -5, -1, -1, -2, -2, -2, -3, -3, -3, -5, -7 }, { -4, -1, 0, 1, 1, 1, 0, 1, 1, 0, -4, -2, 0, 1, 2, 2, 1, 0, -1, -2, -4, -2, 0, 1, 2, 1, 0, -1, -3, -5, -4, -2, 0, 1, 1, -1, -2, -3, -4, -6, -5, -2, 0, 0, -1, -3, -4, -5, -6, -8, -5, -3, -1, 0, -2, -4, -5, -6, -8, -10, -6, -3, -1, -1, -2, -4, -6, -8, -9, -10, -5, -4, -2, -2, -2, -4, -7, -8, -10, -11, -6, -4, -2, -2, -2, -4, -6, -8, -10, -11, -7, -4, -2, -2, -2, -3, -5, -7, -9, -11, -5, -3, -1, 1, 1, 2, 2, 3, 4, 4, -5, -3, 0, 1, 2, 3, 3, 2, 2, 2, -5, -3, 0, 2, 3, 3, 2, 1, 0, 0, -5, -3, -1, 2, 2, 1, 0, -1, -2, -2, -6, -4, -1, 1, 1, -1, -2, -3, -4, -4, -6, -4, -1, 0, 0, -2, -3, -4, -5, -5, -7, -5, -2, 0, -1, -2, -4, -5, -5, -5, -7, -6, -2, -1, -1, -2, -4, -5, -5, -5, -7, -5, -3, -1, -1, -1, -3, -4, -4, -5, -8, -5, -2, -1, -1, -1, -2, -3, -4, -5, -5, -3, -1, 1, 1, 2, 2, 3, 3, 4, -5, -3, -1, 1, 2, 2, 2, 2, 2, 2, -5, -3, -1, 2, 2, 2, 2, 1, 1, 0, -5, -4, -1, 2, 2, 1, 0, 0, -1, -1, -6, -4, -1, 1, 1, 0, -1, -2, -2, -2, -7, -5, -2, 0, 0, -1, -2, -3, -3, -2, -7, -5, -3, -1, 0, -1, -2, -3, -3, -2, -7, -6, -4, -1, 0, -1, -2, -2, -2, -1, -8, -6, -4, -1, -1, -1, -1, -1, 
-1, -1, -9, -6, -4, -2, -1, -1, -1, -1, -1, -1, -5, -3, -1, 0, 1, 2, 2, 3, 4, 5, -5, -3, -1, 1, 2, 2, 2, 2, 3, 3, -5, -3, -1, 1, 2, 2, 2, 1, 1, 1, -5, -4, -2, 1, 1, 1, 1, 0, 0, 0, -5, -4, -2, 0, 0, 0, -1, -1, -1, 0, -7, -5, -3, -1, 0, -1, -1, -1, -1, 0, -7, -6, -4, -1, -1, -1, -1, -1, 0, 0, -7, -7, -5, -2, 0, 0, -1, 0, 0, 1, -8, -6, -6, -3, -1, 0, 0, 0, 1, 2, -9, -7, -5, -4, -1, 0, 0, 0, 1, 2, -4, -3, -1, 0, 1, 2, 3, 4, 5, 6, -4, -3, -1, 0, 1, 2, 3, 4, 4, 5, -4, -3, -1, 0, 1, 2, 3, 3, 3, 3, -4, -3, -1, 0, 1, 2, 2, 2, 2, 2, -4, -3, -2, 0, 0, 1, 1, 1, 1, 2, -5, -4, -3, -2, -1, 0, 0, 1, 2, 2, -5, -6, -4, -2, -1, 0, 0, 1, 2, 3, -6, -6, -5, -3, -1, 0, 1, 1, 2, 3, -7, -6, -6, -4, -1, 0, 1, 1, 2, 3, -9, -8, -6, -5, -2, 0, 0, 1, 2, 4, -3, -2, -1, 0, 1, 3, 4, 5, 6, 7, -3, -1, -1, 0, 1, 3, 4, 5, 6, 6, -3, -1, -1, 0, 1, 3, 4, 5, 5, 5, -2, -1, -1, 0, 1, 3, 4, 4, 4, 4, -3, -1, -1, 0, 0, 1, 2, 2, 3, 4, -3, -3, -2, -1, -1, 1, 1, 2, 3, 4, -4, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -4, -3, -1, 0, 1, 2, 3, 4, -6, -5, -5, -4, -2, 0, 0, 1, 3, 4, -9, -7, -6, -5, -3, -1, 0, 1, 2, 4, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7, 0, 0, 0, 1, 1, 3, 4, 5, 6, 7, 0, 1, 1, 1, 1, 3, 5, 5, 6, 6, 0, 1, 1, 1, 1, 3, 4, 4, 5, 5, 0, 1, 1, 0, 0, 1, 2, 3, 4, 5, -1, 0, 0, -1, -1, 0, 1, 3, 4, 5, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, -2, -2, -3, -3, -2, -1, 0, 1, 3, 5, -4, -4, -4, -3, -3, -1, 0, 1, 3, 4, -7, -6, -5, -4, -3, -2, -1, 0, 2, 4, 3, 3, 2, 2, 1, 1, 3, 4, 5, 7, 3, 3, 3, 2, 1, 1, 3, 5, 6, 7, 3, 3, 3, 2, 1, 2, 3, 5, 6, 6, 3, 3, 3, 2, 1, 1, 3, 4, 5, 6, 2, 2, 2, 1, -1, 0, 1, 4, 5, 6, 1, 0, 0, -1, -1, -1, 1, 3, 5, 6, 0, -1, -2, -2, -2, -1, 0, 2, 4, 6, -1, -2, -3, -3, -3, -2, -1, 1, 3, 5, -3, -4, -4, -4, -3, -2, -1, 1, 2, 4, -6, -6, -5, -4, -4, -3, -1, 0, 2, 4, 7, 6, 4, 2, 1, 1, 2, 3, 4, 6, 7, 6, 4, 2, 1, 1, 2, 3, 5, 6, 7, 6, 4, 2, 1, 0, 2, 4, 5, 6, 6, 5, 3, 1, 1, 0, 2, 4, 5, 7, 4, 3, 2, 0, -1, 0, 1, 3, 6, 7, 2, 1, 0, -1, -2, -1, 1, 3, 5, 7, 1, -1, -2, -2, -2, -2, 0, 2, 4, 6, 0, -2, -3, -4, -3, -2, -1, 1, 3, 5, -3, -4, 
-4, -4, -3, -3, -1, 1, 3, 5, -5, -6, -6, -5, -4, -3, -2, 0, 3, 5, 13, 10, 7, 4, 2, 1, 1, 2, 3, 5, 12, 10, 7, 4, 2, 1, 1, 2, 4, 5, 12, 10, 7, 4, 2, 1, 1, 3, 4, 6, 11, 9, 6, 3, 1, 0, 1, 3, 5, 6, 9, 8, 5, 2, 0, 0, 1, 3, 5, 7, 7, 5, 3, 1, -1, -1, 0, 3, 5, 6, 5, 3, 1, -1, -2, -1, 0, 2, 5, 7, 3, 1, -1, -3, -3, -2, -1, 2, 4, 6, 0, -1, -3, -4, -4, -3, -1, 1, 4, 6, -3, -4, -4, -5, -4, -3, -2, 1, 3, 5 }, { -3, 0, 3, 6, 7, 8, 9, 10, 10, 9, -4, 0, 3, 5, 6, 8, 8, 8, 7, 6, -5, -2, 2, 5, 5, 6, 6, 5, 4, 4, -6, -3, 1, 4, 4, 4, 4, 3, 2, 2, -7, -4, 1, 2, 2, 2, 1, 0, 0, 0, -8, -5, -1, 0, 0, -1, -1, -2, -2, -2, -9, -7, -4, -2, -2, -3, -4, -4, -3, -4, -11, -8, -6, -4, -4, -5, -5, -6, -6, -7, -13, -11, -8, -6, -6, -6, -7, -8, -9, -9, -16, -13, -11, -8, -7, -8, -9, -10, -11, -12, -1, 2, 4, 6, 7, 8, 9, 10, 11, 12, -2, 1, 3, 5, 6, 7, 8, 9, 9, 9, -4, 0, 2, 4, 5, 6, 6, 6, 6, 6, -5, -2, 1, 4, 4, 4, 4, 3, 3, 4, -7, -4, 0, 2, 2, 2, 1, 1, 1, 3, -8, -5, -2, 0, 0, -1, -1, -1, 0, 2, -9, -6, -4, -2, -2, -2, -2, -2, -1, 0, -10, -8, -5, -4, -4, -4, -3, -3, -2, -2, -12, -10, -7, -5, -5, -5, -5, -5, -4, -4, -14, -12, -9, -6, -6, -6, -6, -6, -6, -6, 0, 1, 3, 5, 5, 7, 8, 10, 12, 13, -1, 1, 3, 4, 5, 7, 8, 9, 10, 11, -3, 1, 2, 4, 5, 6, 7, 8, 8, 8, -4, -1, 1, 3, 4, 5, 5, 5, 5, 6, -6, -4, -1, 2, 3, 2, 2, 2, 3, 4, -7, -5, -2, 0, 0, 0, 0, 1, 2, 3, -8, -6, -4, -1, -1, -1, -1, 0, 2, 2, -9, -7, -5, -3, -3, -2, -1, -1, 0, 1, -11, -8, -6, -4, -3, -3, -3, -2, -1, -1, -13, -11, -8, -5, -4, -4, -4, -3, -3, -2, 0, 0, 1, 3, 4, 6, 8, 9, 11, 12, -1, 0, 1, 3, 4, 6, 8, 9, 10, 11, -2, -1, 0, 3, 4, 5, 7, 8, 9, 9, -4, -2, 0, 2, 4, 5, 6, 6, 7, 7, -5, -4, -1, 1, 3, 3, 3, 4, 5, 5, -7, -5, -3, -1, 0, 1, 1, 2, 3, 4, -8, -6, -4, -2, -1, 0, 0, 2, 3, 3, -9, -7, -5, -2, -2, -1, 0, 1, 2, 2, -11, -9, -6, -3, -2, -1, 0, 0, 0, 1, -14, -11, -8, -4, -3, -2, -2, -1, -1, -1, 0, 0, 0, 1, 2, 4, 7, 9, 10, 12, -1, -1, 0, 0, 1, 4, 6, 8, 9, 10, -3, -2, -1, 0, 1, 3, 6, 7, 8, 9, -3, -2, -1, 0, 1, 3, 5, 6, 7, 7, -4, -3, -2, -1, 0, 2, 3, 4, 5, 6, -5, -4, 
-4, -2, 0, 0, 2, 3, 4, 5, -7, -6, -5, -4, -1, 0, 1, 2, 3, 4, -9, -8, -6, -4, -2, 0, 0, 1, 2, 3, -11, -9, -7, -5, -2, -1, 0, 0, 1, 1, -14, -11, -9, -7, -3, -2, -1, -1, -1, 0, 2, 1, 0, 1, 2, 3, 6, 8, 10, 12, 0, -1, -1, 0, 1, 3, 6, 7, 9, 10, -2, -2, -1, 0, 1, 2, 5, 7, 8, 9, -3, -2, -1, 0, 1, 2, 4, 6, 6, 7, -4, -3, -2, -1, 0, 1, 3, 4, 5, 5, -5, -4, -3, -3, 0, 0, 1, 2, 3, 4, -6, -6, -5, -5, -2, -1, 0, 2, 3, 3, -8, -7, -7, -6, -2, -2, -1, 1, 2, 3, -11, -9, -8, -6, -3, -2, -1, 0, 1, 2, -14, -12, -10, -8, -4, -3, -2, -1, -1, 1, 3, 3, 2, 2, 2, 2, 4, 7, 9, 10, 1, 1, 1, 1, 2, 2, 4, 6, 8, 10, -1, -1, 0, 0, 1, 1, 3, 5, 7, 8, -2, -1, -1, 0, 0, 0, 2, 4, 6, 7, -3, -2, -1, -1, -1, 0, 1, 3, 4, 5, -5, -4, -3, -3, -2, -2, -1, 1, 3, 4, -6, -6, -6, -5, -4, -3, -2, -1, 2, 3, -8, -7, -8, -7, -5, -4, -3, -2, 1, 3, -10, -9, -9, -7, -6, -5, -4, -3, 0, 1, -14, -12, -10, -9, -7, -6, -5, -4, -1, 0, 5, 5, 4, 4, 4, 3, 2, 4, 6, 9, 3, 3, 3, 3, 3, 2, 2, 3, 6, 8, 0, 1, 1, 2, 2, 1, 1, 3, 5, 7, -1, 0, 0, 1, 1, 1, 0, 1, 4, 6, -2, -1, 0, 0, 0, -1, -2, 0, 3, 5, -4, -3, -2, -2, -2, -2, -2, -1, 3, 4, -5, -5, -6, -5, -4, -4, -3, -2, 1, 3, -7, -7, -8, -7, -6, -5, -5, -3, 0, 3, -9, -10, -10, -8, -7, -6, -5, -4, -1, 2, -13, -12, -11, -10, -9, -8, -6, -5, -3, 1, 8, 7, 7, 6, 4, 2, 1, 1, 3, 5, 5, 5, 5, 5, 4, 2, 1, 2, 4, 6, 3, 3, 3, 3, 3, 1, 1, 1, 3, 6, 1, 1, 2, 2, 2, 1, 0, -1, 2, 5, -1, 0, 1, 1, 0, -1, -2, -2, 1, 4, -2, -1, -1, -1, -1, -3, -4, -2, 1, 3, -4, -3, -4, -4, -5, -5, -5, -4, 0, 3, -5, -5, -6, -6, -6, -6, -6, -5, -2, 2, -8, -8, -9, -8, -8, -8, -8, -6, -3, 1, -12, -12, -11, -10, -10, -10, -9, -7, -4, 0, 13, 11, 9, 7, 4, 2, 0, 0, 0, 0, 11, 9, 7, 6, 3, 1, 0, 1, 1, 2, 8, 6, 4, 4, 2, 0, 0, -1, 1, 3, 5, 3, 2, 2, 1, -1, -2, -2, 0, 4, 4, 2, 1, 0, -1, -3, -3, -3, 0, 3, 2, 0, -2, -2, -3, -5, -5, -3, 0, 3, 0, -2, -4, -6, -6, -6, -6, -5, -2, 2, -2, -4, -7, -8, -8, -8, -7, -6, -3, 1, -4, -7, -9, -9, -10, -10, -9, -7, -5, 0, -8, -10, -12, -12, -13, -13, -12, -9, -6, -2 }, { -16, -13, -11, -11, -17, -14, -5, -4, -11, 
-13, -13, -11, -9, -8, -13, -11, -5, -4, -7, -11, -10, -8, -6, -5, -10, -9, -4, -3, -5, -9, -10, -7, -4, -2, -7, -6, -3, -2, -4, -8, -9, -7, -4, -1, -3, -2, -2, -1, -2, -7, -10, -7, -4, -1, -3, 0, 0, 1, 0, -6, -10, -7, -4, -2, -4, 0, 1, 3, 2, -3, -13, -9, -5, -4, -5, -1, 3, 5, 4, -2, -17, -13, -8, -7, -7, -2, 3, 5, 5, -1, -20, -16, -12, -10, -10, -4, 2, 5, 4, 0, -15, -12, -11, -8, -13, -12, -7, -6, -10, -10, -12, -11, -9, -7, -11, -11, -7, -6, -9, -10, -8, -8, -6, -5, -10, -10, -7, -6, -8, -10, -9, -7, -4, -3, -7, -8, -7, -5, -6, -9, -9, -7, -4, -1, -3, -3, -4, -4, -5, -8, -8, -6, -4, -1, -3, 0, -1, -1, -3, -8, -9, -7, -4, -2, -4, 0, 1, 2, -1, -6, -11, -7, -5, -3, -5, -1, 2, 3, 1, -4, -15, -12, -8, -6, -6, -3, 2, 3, 2, -3, -20, -15, -11, -10, -9, -5, 1, 3, 2, -2, -12, -11, -9, -7, -10, -10, -8, -8, -8, -7, -9, -10, -8, -6, -9, -10, -9, -8, -9, -9, -5, -7, -6, -5, -9, -10, -8, -8, -9, -9, -5, -6, -4, -3, -7, -8, -8, -7, -8, -10, -6, -6, -4, -2, -3, -4, -5, -5, -7, -9, -6, -5, -3, -1, -3, 0, -2, -2, -6, -8, -7, -5, -3, -1, -3, -1, 1, 0, -3, -7, -8, -6, -4, -3, -4, -2, 1, 1, -2, -6, -14, -10, -8, -5, -6, -4, 1, 1, -1, -4, -19, -14, -11, -9, -9, -5, 0, 1, 0, -3, -8, -9, -8, -6, -8, -8, -7, -7, -6, -4, -5, -8, -7, -5, -8, -9, -8, -7, -7, -7, -1, -5, -5, -5, -7, -8, -8, -8, -8, -8, -1, -3, -4, -3, -6, -8, -7, -7, -8, -9, -1, -3, -3, -2, -3, -4, -5, -6, -7, -9, -3, -4, -2, -1, -2, -1, -2, -4, -7, -8, -4, -4, -2, -2, -2, -1, -1, -2, -5, -8, -5, -5, -3, -3, -3, -2, 0, -1, -4, -6, -11, -9, -7, -5, -5, -3, 0, 0, -2, -5, -17, -14, -10, -9, -8, -5, 0, 0, -1, -3, -3, -5, -6, -4, -5, -5, -4, -3, -2, -1, 0, -3, -5, -5, -6, -6, -6, -5, -4, -3, 3, 0, -3, -4, -6, -6, -6, -6, -6, -6, 3, 1, -1, -3, -5, -6, -6, -6, -7, -8, 2, 0, -1, -2, -3, -3, -5, -6, -7, -8, 0, -1, -1, -1, -1, -1, -3, -4, -6, -8, -2, -2, -2, -2, -2, -1, -1, -3, -5, -7, -4, -3, -3, -3, -2, -1, -1, -1, -4, -6, -9, -7, -7, -5, -4, -2, 0, 0, -2, -4, -15, -12, -10, -8, -6, -3, 0, 0, -1, -3, -1, -2, -3, -3, -2, 0, 0, 1, 2, 
1, 4, 1, -2, -3, -3, -1, -1, 0, 0, 0, 6, 4, 0, -2, -3, -2, -2, -2, -1, -2, 6, 4, 2, -1, -3, -2, -3, -3, -3, -4, 5, 3, 2, 1, -1, 0, -3, -4, -4, -5, 3, 2, 1, 0, 0, 0, -1, -3, -4, -6, 1, 0, -1, -1, 0, 1, 1, -1, -3, -5, -1, -1, -2, -2, -1, 1, 1, 0, -2, -4, -5, -5, -5, -4, -3, 0, 1, 1, 0, -3, -14, -10, -9, -7, -4, -1, 0, 1, 0, -2, 1, 0, -1, -1, -1, 1, 2, 3, 3, 2, 4, 2, 0, -1, -1, 1, 1, 1, 2, 1, 6, 4, 1, 0, -1, 0, 0, 0, 0, -1, 7, 5, 2, 1, -1, 0, -1, -2, -2, -2, 6, 5, 3, 3, 1, 0, -1, -2, -3, -2, 5, 3, 3, 4, 3, 2, 1, -2, -3, -2, 3, 2, 1, 2, 3, 3, 3, 0, -2, -2, 1, 0, 0, 1, 2, 3, 3, 1, -1, -1, -4, -3, -2, -1, 2, 3, 3, 2, 1, 0, -11, -8, -7, -5, -1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 5, 4, 5, 5, 5, 3, 2, 1, 0, 1, 4, 4, 4, 4, 4, 5, 3, 1, 0, 0, 3, 4, 3, 3, 3, 6, 4, 2, 1, 0, 2, 2, 2, 3, 2, 7, 5, 3, 3, 1, 1, 1, 2, 2, 2, 7, 5, 5, 4, 3, 3, 3, 2, 2, 2, 6, 4, 4, 5, 6, 6, 5, 4, 2, 2, 4, 3, 3, 4, 5, 6, 6, 5, 4, 3, 0, 1, 2, 3, 4, 6, 6, 5, 5, 2, -7, -4, -2, 1, 2, 4, 6, 5, 5, 3, 2, 1, 0, -1, 2, 7, 8, 8, 8, 8, 3, 0, -1, -2, 1, 6, 8, 8, 7, 7, 4, 0, -2, -3, 0, 3, 5, 6, 6, 6, 5, 2, -2, -2, -1, 2, 2, 3, 5, 5, 7, 3, 1, 0, 0, 1, 1, 2, 4, 5, 8, 4, 3, 2, 3, 3, 2, 3, 4, 4, 8, 5, 4, 4, 5, 6, 4, 4, 5, 5, 7, 5, 4, 4, 6, 7, 6, 5, 6, 6, 3, 4, 4, 4, 6, 8, 7, 6, 8, 8, -2, 2, 2, 2, 3, 7, 8, 7, 8, 8, 1, -1, -2, -2, 1, 9, 12, 12, 11, 11, 1, -2, -4, -3, 0, 7, 10, 10, 10, 10, 2, -3, -5, -3, -1, 3, 5, 6, 9, 9, 3, -2, -6, -4, -1, 2, 2, 3, 5, 8, 5, 0, -3, -2, -1, 1, 1, 2, 4, 7, 7, 3, -1, 0, 2, 2, 1, 2, 4, 7, 9, 4, 1, 2, 4, 5, 4, 4, 6, 9, 8, 6, 2, 3, 5, 8, 5, 6, 9, 10, 5, 6, 4, 3, 5, 9, 7, 8, 11, 12, 3, 7, 3, 2, 5, 11, 10, 10, 12, 13 }, } }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/gdf_tables.h000066400000000000000000000034511517466257200231010ustar00rootroot00000000000000/* * Copyright © 2026, VideoLAN and dav2d authors * Copyright © 2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_GDF_TABLES_H #define DAV2D_SRC_GDF_TABLES_H #include #include "common/intops.h" EXTERN const uint16_t dav2d_gdf_alpha[6][6][22][4]; EXTERN const int16_t dav2d_gdf_weight[6][6][3][22][4]; EXTERN const int16_t dav2d_gdf_bias[6][6][3]; EXTERN const int8_t dav2d_gdf_intra_error[6][4096]; EXTERN const int8_t dav2d_gdf_inter_error[5][6][1000]; #endif /* DAV2D_SRC_GDF_TABLES_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/getbits.c000066400000000000000000000124001517466257200224350ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include "common/intops.h" #include "src/getbits.h" void dav2d_init_get_bits(GetBits *const c, const uint8_t *const data, const size_t sz) { assert(sz); c->ptr = c->ptr_start = data; c->ptr_end = &c->ptr_start[sz]; c->state = 0; c->bits_left = 0; c->error = 0; } unsigned dav2d_get_bit(GetBits *const c) { if (!c->bits_left) { if (c->ptr >= c->ptr_end) { c->error = 1; } else { const unsigned state = *c->ptr++; c->bits_left = 7; c->state = (uint64_t) state << 57; return state >> 7; } } const uint64_t state = c->state; c->bits_left--; c->state = state << 1; return (unsigned) (state >> 63); } static inline void refill(GetBits *const c, const int n) { assert(c->bits_left >= 0 && c->bits_left < 32); unsigned state = 0; do { if (c->ptr >= c->ptr_end) { c->error = 1; if (state) break; return; } state = (state << 8) | *c->ptr++; c->bits_left += 8; } while (n > c->bits_left); c->state |= (uint64_t) state << (64 - c->bits_left); } #define GET_BITS(name, type, type64) \ type name(GetBits *const c, const int n) { \ assert(n > 0 && n <= 32); \ /* Unsigned cast avoids refill after eob */ \ if ((unsigned) n > (unsigned) c->bits_left) \ refill(c, n); \ const uint64_t state = c->state; \ c->bits_left -= n; \ c->state = state << n; \ return (type) ((type64) state >> (64 - n)); \ } GET_BITS(dav2d_get_bits, unsigned, uint64_t) GET_BITS(dav2d_get_sbits, int, int64_t) unsigned dav2d_get_uleb128(GetBits *const c) { uint64_t val = 0; unsigned i = 0, more; do { const int v = dav2d_get_bits(c, 8); more = v & 0x80; val |= ((uint64_t) (v & 0x7F)) << i; i += 7; } while (more && i < 56); if (val > UINT32_MAX || more) { c->error = 1; return 0; } return (unsigned) val; } unsigned dav2d_get_golomb(GetBits *const c, const unsigned k) { unsigned bits; assert(k < 32); for (bits = 0; bits < 32 - k; bits++) if (!dav2d_get_bit(c)) break; if (bits + k == 32) return ~0U; return (bits << k) | dav2d_get_bits(c, k); } unsigned dav2d_get_uniform(GetBits *const c, const unsigned 
max) { // Output in range [0..max-1] // max must be > 1, or else nothing is read from the bitstream assert(max > 1); const int l = ulog2(max) + 1; assert(l > 1); const unsigned m = (1U << l) - max; const unsigned v = dav2d_get_bits(c, l - 1); return v < m ? v : (v << 1) - m + dav2d_get_bit(c); } unsigned dav2d_get_vlc(GetBits *const c) { if (dav2d_get_bit(c)) return 0; int n_bits = 0; do { if (++n_bits == 32) return UINT32_MAX; } while (!dav2d_get_bit(c)); return ((1U << n_bits) - 1) + dav2d_get_bits(c, n_bits); } unsigned dav2d_get_bits_subexp_u(GetBits *const c, const unsigned ref, const unsigned n, const int k) { unsigned v = 0; for (int i = 0;; i++) { const int b = i ? k + i - 1 : k; const int a = 1 << b; if (n <= v + 3 * a) { v += dav2d_get_uniform(c, n - v); break; } if (!dav2d_get_bit(c)) { v += dav2d_get_bits(c, b); break; } v += a; } return ref * 2 <= n ? inv_recenter(ref, v) : n - 1 - inv_recenter(n - 1 - ref, v); } int dav2d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) { const int off = n - 1, n2 = n + off; return (int) dav2d_get_bits_subexp_u(c, ref + off, n2, 3) - off; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/getbits.h000066400000000000000000000063131517466257200224500ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_GETBITS_H #define DAV2D_SRC_GETBITS_H #include #include typedef struct GetBits { uint64_t state; int bits_left, error; const uint8_t *ptr, *ptr_start, *ptr_end; } GetBits; void dav2d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz); unsigned dav2d_get_bit(GetBits *c); unsigned dav2d_get_bits(GetBits *c, int n); int dav2d_get_sbits(GetBits *c, int n); unsigned dav2d_get_uleb128(GetBits *c); // Output in range 0..max-1 unsigned dav2d_get_golomb(GetBits *c, unsigned k); unsigned dav2d_get_uniform(GetBits *c, unsigned max); unsigned dav2d_get_vlc(GetBits *c); unsigned dav2d_get_bits_subexp_u(GetBits *c, unsigned ref, unsigned n, int k); int dav2d_get_bits_subexp(GetBits *c, int ref, unsigned n); static inline unsigned dav2d_get_ref_uniform(GetBits *c, const unsigned max, const unsigned def) { if (!dav2d_get_bit(c)) return def; const unsigned res = dav2d_get_uniform(c, max - 1); return res + (res >= def); } // Discard bits from the buffer until we're next byte-aligned. 
static inline void dav2d_bytealign_get_bits(GetBits *c) { // bits_left is never more than 7, because it is only incremented // by refill(), called by dav2d_get_bits and that never reads more // than 7 bits more than it needs. // // If this wasn't true, we would need to work out how many bits to // discard (bits_left % 8), subtract that from bits_left and then // shift state right by that amount. assert(c->bits_left <= 7); c->bits_left = 0; c->state = 0; } // Return the current bit position relative to the start of the buffer. static inline unsigned dav2d_get_bits_pos(const GetBits *c) { return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left; } #endif /* DAV2D_SRC_GETBITS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ibp.c000066400000000000000000000047211517466257200215550ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "common/intops.h" #include "src/ibp.h" #include "src/levels.h" #include "src/tables.h" uint8_t dav2d_ibp_weights[7][16][16]; static inline unsigned fast_div32(const unsigned num, const unsigned den) { unsigned shift = ulog2(den); const unsigned rem = den - (1 << shift); // Quantize fractional part of divisor between powers of two (0..128) const unsigned idx = ((rem << 7) + (1 << (shift - 1))) >> shift; assert(idx <= 128); shift += 2; const unsigned res = ((num * dav2d_div_recip[idx]) + ((1 << shift) >> 1)) >> shift; assert(res < 256); return res; } COLD void dav2d_init_ibp_weights(void) { static const int dr_dy_q6[7] = { 682, 256, 170, 128, 81, 64, 50 }; for (int m = 0; m < 7; m++) { const int dy = dr_dy_q6[m]; for (int y = 0; y < 16; y++) { const int yy = (y + 1) << 6; int y_pos = dy; for (int x = 0; x < 16; x++, y_pos += dy) { dav2d_ibp_weights[m][y][x] = fast_div32(y_pos, yy + y_pos); } } } } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ibp.h000066400000000000000000000030721517466257200215600ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_IBP_H #define DAV2D_SRC_IBP_H EXTERN uint8_t dav2d_ibp_weights[7][16][16]; void dav2d_init_ibp_weights(void); #endif /* DAV2D_SRC_IBP_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/internal.h000066400000000000000000000410611517466257200226220ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_INTERNAL_H
#define DAV2D_SRC_INTERNAL_H

/* NOTE(review): the header name of the following #include is missing in this
 * copy of the file; atomic_int/atomic_uint are used below, so it is
 * presumably <stdatomic.h> - confirm against the repository. */
#include
#include "ccso.h"

#include "dav2d/data.h"

/* Forward declarations, needed by the src/ headers included below. */
typedef struct Dav2dFrameContext Dav2dFrameContext;
typedef struct Dav2dTileState Dav2dTileState;
typedef struct Dav2dTaskContext Dav2dTaskContext;
typedef struct Dav2dTask Dav2dTask;

#include "common/attributes.h"

#include "src/cdef.h"
#include "src/cdf.h"
#include "src/data.h"
#include "src/env.h"
#include "src/filmgrain.h"
#include "src/intra_edge.h"
#include "src/ipred.h"
#include "src/itx.h"
#include "src/levels.h"
#include "src/lf_mask.h"
#include "src/deblock.h"
#include "src/looprestoration.h"
#include "src/mc.h"
#include "src/msac.h"
#include "src/pal.h"
#include "src/picture.h"
#include "src/recon.h"
#include "src/refmvs.h"
#include "src/stx.h"
#include "src/thread.h"

/* Bundle of the per-module DSP contexts; Dav2dContext keeps one instance per
 * supported bit depth (see Dav2dContext.dsp). */
typedef struct Dav2dDSPContext {
    Dav2dFilmGrainDSPContext fg;
    Dav2dIntraPredDSPContext ipred;
    Dav2dMCDSPContext mc;
    Dav2dInvTxfmDSPContext itx;
    Dav2dStxDSPContext stx;
    Dav2dDeblockDSPContext lf;
    Dav2dCcsoDSPContext ccso;
    Dav2dCdefDSPContext cdef;
    Dav2dLoopRestorationDSPContext lr;
} Dav2dDSPContext;

/* One buffered chunk of tile data.
 * NOTE(review): start/end are presumably the first and last tile index the
 * chunk covers - confirm against the OBU parser. */
struct Dav2dTileGroup {
    Dav2dData data;
    int start, end;
};

/* Kind of work carried by a Dav2dTask (and by TaskThreadData.delayed_fg). */
enum TaskType {
    DAV2D_TASK_TYPE_INIT,
    DAV2D_TASK_TYPE_INIT_CDF,
    DAV2D_TASK_TYPE_TILE_ENTROPY,
    DAV2D_TASK_TYPE_ENTROPY_PROGRESS,
    DAV2D_TASK_TYPE_TILE_MV_RESOLUTION,
    DAV2D_TASK_TYPE_TILE_RECONSTRUCTION,
    DAV2D_TASK_TYPE_DEBLOCK_COLS,
    DAV2D_TASK_TYPE_DEBLOCK_ROWS,
    DAV2D_TASK_TYPE_CDEF,
    DAV2D_TASK_TYPE_LOOP_RESTORATION,
    DAV2D_TASK_TYPE_RECONSTRUCTION_PROGRESS,
    DAV2D_TASK_TYPE_FG_PREP,
    DAV2D_TASK_TYPE_FG_APPLY,
};

/* Top-level decoder state: frame/task contexts, buffered headers and tile
 * data, the output queue, shared task-thread data, reference state, DSP
 * tables and user-visible settings. */
struct Dav2dContext {
    Dav2dFrameContext *fc;
    unsigned n_fc;

    Dav2dTaskContext *tc;
    unsigned n_tc;

    // cache of OBUs that make up a single frame before we submit them
    // to a frame worker to be decoded
    struct Dav2dTileGroup *tile;
    int n_tile_data_alloc;
    int n_tile_data;
    int n_tiles;
    Dav2dMemPool *seq_hdr_pool;
    Dav2dRef *seq_hdr_ref;
    Dav2dSequenceHeader *seq_hdr;
    Dav2dMemPool *frame_hdr_pool;
    Dav2dRef *frame_hdr_ref;
    Dav2dFrameHeader *frame_hdr;

    Dav2dRef *content_light_ref;
    Dav2dContentLightLevel *content_light;
    Dav2dRef *mastering_display_ref;
    Dav2dMasteringDisplay *mastering_display;
    Dav2dRef *itut_t35_ref;
    Dav2dITUTT35 *itut_t35;
    int n_itut_t35;

    // decoded output picture queue
    Dav2dData in;
    struct OutputQueue {
        Dav2dThreadPicture p;
        int res; // FIXME event/frame_flags
    } *dpb;
    // output buffer management
    int dpb_in, dpb_out, dpb_sz, dpb_poc, drain;
    atomic_int flush_mem, *flush;
    struct {
#if 0
        Dav2dThreadPicture *out_delayed;
#endif
        unsigned next;
    } frame_thread;

    // task threading (refer to tc[] for per_thread thingies)
    struct TaskThreadData {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        atomic_uint first;
        unsigned cur;
        // This is used for delayed reset of the task cur pointer when
        // such operation is needed but the thread doesn't enter a critical
        // section (typically when executing the next sbrow task locklessly).
        // See src/thread_task.c:reset_task_cur().
        atomic_uint reset_task_cur;
        atomic_int cond_signaled;
        struct {
            int exec, finished;
            pthread_cond_t cond;
            const Dav2dPicture *in;
            Dav2dPicture *out;
            enum TaskType type;
            atomic_int progress[2]; /* [0]=started, [1]=completed */
            // the 8 bpc and 16 bpc buffers overlay each other
            union {
                struct {
                    ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
                    ALIGN(uint8_t scaling_8bpc[3][256], 64);
                };
                struct {
                    ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16);
                    ALIGN(uint8_t scaling_16bpc[3][4096], 64);
                };
            };
        } delayed_fg;
        int inited;
        // how block decoding (entropy, motion vector resolving, reconstruction)
        // is split
        // 1: all in a single pass
        // 2: first entropy decoding as one pass, then motion vector resolving +
        //    reconstruction in a 2nd pass
        // 3: each in a separate pass
        int n_passes;
    } task_thread;

    // reference/entropy state
    Dav2dMemPool *segmap_pool;
    Dav2dMemPool *refmvs_pool;
    Dav2dMemPool *ccsomap_pool;
    struct {
        Dav2dThreadPicture p;
        Dav2dRef *segmap;
        Dav2dRef *refmvs;
        Dav2dRef *ccsomap;
        uint8_t refpoc[7];
    } refs[8];
    Dav2dMemPool *cdf_pool;
    CdfThreadContext cdf[8];
    Dav2dMemPool *fgm_pool;
    Dav2dRef *fgm[8];
    Dav2dMemPool *ci_pool;
    Dav2dRef *ci_ref;

    // uv segmap, not part of the reference state
    Dav2dMemPool *segmap_uv_pool;

    Dav2dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
    Dav2dPalDSPContext pal_dsp;
    Dav2dRefmvsDSPContext refmvs_dsp;

    Dav2dPicAllocator allocator;
    int apply_grain;
    int operating_point;
    unsigned operating_point_idc;
    int all_layers;
    int max_spatial_id;
    unsigned frame_size_limit;
    int strict_std_compliance;
    int output_invisible_frames;
    enum Dav2dInloopFilterType inloop_filters;
    enum Dav2dDecodeFrameType decode_frame_type;
#if 0
    enum PictureFlags frame_flags;
    enum Dav2dEventFlags event_flags;
    Dav2dDataProps cached_error_props;
    int cached_error;
#endif

    Dav2dLogger logger;

    Dav2dMemPool *picture_pool;
    Dav2dMemPool *pic_ctx_pool;
};

/* One unit of work in a frame's task queue. */
struct Dav2dTask {
    unsigned frame_idx;         // frame thread id
    enum TaskType type;         // task work
    int sby;                    // sbrow

    // task dependencies
    int recon_progress, deblock_progress;
    int deps_skip;
    struct Dav2dTask *next; // only used in task queue
};

/* Per-frame decoding state: headers, reference pictures and motion fields,
 * CDF contexts, bit-depth specific function pointers, frame-threading
 * buffers, loop-filter state and the frame's task queue. */
struct Dav2dFrameContext {
    Dav2dRef *seq_hdr_ref;
    Dav2dSequenceHeader *seq_hdr;
    Dav2dRef *frame_hdr_ref;
    Dav2dFrameHeader *frame_hdr;
    Dav2dThreadPicture refp[7];
    Dav2dThreadPicture cur;
    Dav2dRef *mvs_ref;
    refmvs_temporal_block *mvs, *ref_mvs[7];
    Dav2dRef *ref_mvs_ref[7];
    Dav2dRef *cur_segmap_ref, *prev_segmap_ref;
    uint8_t *cur_segmap;
    const uint8_t *prev_segmap;
    Dav2dRef *cur_ccsomap_ref, *prev_ccsomap_ref[3];
    uint8_t *cur_ccsomap;
    const uint8_t *prev_ccsomap[3];
    uint8_t refpoc[7], refrefpoc[7][7], refcnt[7];
    // refdir_with_intra[] aliases refdir_intra followed by refdir[]
    union {
        int8_t refdir_with_intra[1 /* intra */ + 7 + 1 /* tip */];
        struct {
            int8_t refdir_intra;
            uint8_t refdir[7 + 1 /* tip */];
        };
    };
    int8_t furthest_future_refidx;
    uint8_t absrefdist[7];
    int8_t refdist[7];
    union refpair skip_mode_refs;
    uint8_t gmv_warp_allowed[7];
    int use_pri_sec_cdf;
    CdfThreadContext in_cdf, out_cdf, src_cdf[2];
    struct Dav2dTileGroup *tile;
    int n_tile_data_alloc;
    int n_tile_data;

    // for scalable references
    struct ScalableMotionParams {
        int scale; // if no scaling, this is 0
        int step;
    } svc[7][2 /* x, y */];

    const Dav2dContext *c;
    Dav2dTileState *ts;
    int n_ts;
    const Dav2dDSPContext *dsp;
    // function pointers for block reconstruction and in-loop filtering
    struct {
        recon_b_fn recon_b;
        filter_sbrow_fn filter_sbrow;
        filter_sbrow_fn filter_sbrow_deblock_cols;
        filter_sbrow_fn filter_sbrow_deblock_rows;
        void (*filter_sbrow_cdef)(Dav2dTaskContext *tc, int sby);
        filter_sbrow_fn filter_sbrow_lr;
        backup_prefilter_data_fn backup_prefilter_data;
        read_coef_blocks_fn read_coef_blocks;
        copy_pal_block_fn copy_pal_block_y;
        read_pal_plane_fn read_pal_plane;
    } bd_fn;

    size_t prefilter_data_sz;
    pixel *prefilter_data[3];
    int prefilter_data_full_frame;

    ptrdiff_t b4_stride;
    int bw, bh, sb256w, sb256h, sbh, sb_shift, sb_step;
    int ss_ver, ss_hor;
    uint32_t dq[DAV2D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
    const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */];
    BlockContext *a;
    int a_sz /* w*tile_rows */;
    refmvs_frame rf;
    int bitdepth_max;
    enum BlockSize root_bs;

    struct {
        int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
        atomic_int entropy_progress;
        atomic_int deblock_progress; // in sby units
        atomic_uint *frame_progress, *copy_db_progress;
        // indexed using t->by * f->b4_stride + t->bx
        Av2Block *b;
        struct CodedBlockInfo {
            int16_t eob[3 /* plane */];
            uint16_t txtp[3 /* plane */];
        } *cbi;
        // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
        pixel (*pal)[8 /* idx */];
        // iterated over inside tile state
        uint8_t *pal_idx;
        coef *cf;
        uint8_t *partition;
        int prog_sz;
        int cbi_sz, pal_sz, pal_idx_sz, cf_sz, part_sz;
        // start offsets per tile
        unsigned *tile_start_off;
        int scheduled;
    } frame_thread;

    // loopfilter
    struct {
        Av2Filter *mask;
        Av2Restoration *lr_mask;
        uint8_t *segmap_uv;
        int mask_sz /* w*h */, lr_mask_sz, uv_segmap_sz;
        ptrdiff_t uv_segmap_stride;
        int cdef_buf_plane_sz[2]; /* stride*sbh*4 */
        int cdef_buf_sbh;
        /* 0-1: (stride*sbh*4) << sb128 if n_tc > 1, else stride*4;
         * 2-3: stride*(n_tile_rows-1)*4 if n_tc==1, double that otherwise */
        int lr_buf_plane_sz[4];
        int re_sz /* h */;
        int base_q;
        int gdf_ref_dst_idx;
        const uint8_t *ns_subclass_lut;
        const uint8_t *pc_subclass_lut;
        const int16_t (*pc_filters)[13];
        uint8_t *tx_db_right_edge[2];
        uint8_t *cdef_line_buf, *lr_line_buf;
        pixel *cdef_line[2 /* pre, post */][3 /* plane */];
        pixel *lr_db_line[3 /* plane */];
        pixel *lr_cdef_line[3 /* plane */];

        // in-loop filter per-frame state keeping
        uint8_t *start_of_tile_row;
        int start_of_tile_row_sz;
        pixel *p[3];
        int restore_planes; // enum LrRestorePlanes
    } lf;

    struct {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        struct TaskThreadData *ttd;
        struct Dav2dTask *tasks, *tile_tasks[3], init_task;
        int num_tasks, num_tile_tasks;
        atomic_int init_done;
        atomic_int done[2];
        int retval;
        int update_set; // whether we need to update CDF reference
        atomic_int error;
        atomic_int task_counter, entropy_task_counter;
        struct Dav2dTask *task_head, *task_tail;
        // Points to the task directly before the cur pointer in the queue.
        // This cur pointer is theoretical here, we actually keep track of the
        // "prev_t" variable. This is needed to not lose the tasks in
        // [head;cur-1] when picking one for execution.
        struct Dav2dTask *task_cur_prev;
        struct { // async task insertion
            atomic_int merge;
            pthread_mutex_t lock;
            Dav2dTask *head, *tail;
        } pending_tasks;
    } task_thread;

    // threading (refer to tc[] for per-thread things)
    struct FrameTileThreadData {
        int (*lowest_pixel_mem)[7][2];
        int lowest_pixel_mem_sz;
    } tile_thread;
};

/* Per-tile decoding state: entropy decoder and CDFs, tile geometry, per-pass
 * progress, frame-threading cursors, dequantizer tables and loop-restoration
 * references. */
struct Dav2dTileState {
    CdfContext cdf;
    MsacContext msac;

    struct {
        int col_start, col_end, row_start, row_end; // in 4px units
        int col, row; // in tile units
    } tiling;

    // in sby units, TILE_ERROR after a decoding error
    atomic_int progress[3 /* pass */];
    struct {
        uint8_t *pal_idx;
        pixel (*pal)[8];
        struct CodedBlockInfo *cbi;
        coef *cf;
        uint8_t *partition[2];
    } frame_thread[2 /* 0: reconstruction, 1: entropy */];

    // in fullpel units, [0] = Y, [1] = UV, used for progress requirements
    // each entry is one tile-sbrow; middle index is refidx
    int (*lowest_pixel)[7][2];

    uint32_t dqmem[DAV2D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
    const uint32_t (*dq)[3][2];
    int last_qidx;
    Av2RestorationUnit *lr_ref[3];
    struct NsWienerBank {
        uint8_t bank_size[16], bank_idx[16];
        int8_t filter[4][16][32];
    } ns_wiener_bank[3];
};

struct Dav2dTaskContext {
    const Dav2dContext *c;
    const Dav2dFrameContext *f;
    Dav2dTileState *ts;
    int bx, by, cbx, cby, sdp_cfl_disallowed, intra_region;
    BlockContext l, *a;
    struct SBEdgeCtx a_sb_cache;
    uint64_t is_coded[2 /* luma, chroma */][64];
    struct {
        int col_start;
        int row_start;
        union {
            struct {
                int alpha, beta;
            } bawp[3 /* plane */];
        };
    } pb;
    refmvs_tile rt;

    // chroma backups
    uint16_t /*enum TxfmType*/ chroma_txtp[16 * 16][2]; // why 2?
int16_t chroma_eob[16 * 16][2]; coef *cf_uv; ALIGN(union, 64) { int16_t cf_y_8bpc [32 * 32]; int32_t cf_y_16bpc[32 * 32]; }; ALIGN(union, 64) { int16_t cf_uv_8bpc [2][64 * 64]; int32_t cf_uv_16bpc[2][64 * 64]; }; union { uint8_t al_pal_8bpc [2 /* a/l */][64 /* bx/y4 */][8 /* palette_idx */]; uint16_t al_pal_16bpc[2 /* a/l */][64 /* bx/y4 */][8 /* palette_idx */]; }; uint8_t luma_intra_dir_mode_map[16 * 16]; uint8_t luma_fsc_map[16 * 16]; ALIGN(union, 64) { struct { int16_t compinter[2][64 * 64]; uint8_t seg_mask[64 * 64]; union { uint8_t p_8bpc[2][24 * 128]; uint16_t p_16bpc[2][24 * 96]; }; union { // stride=192 for non-SVC, or 320 for SVC uint8_t emu_edge_8bpc [320 * (256 + 7)]; uint16_t emu_edge_16bpc[320 * (256 + 7)]; }; }; struct { uint8_t pal_idx_y[64 * 64]; union { struct { uint8_t interintra_8bpc[64 * 64]; uint8_t edge_8bpc[64 * 8 + 2 * 1 + 2 * 9]; ALIGN(uint8_t pal_8bpc[8 /* palette_idx */], 8); }; struct { uint16_t interintra_16bpc[64 * 64]; uint16_t edge_16bpc[64 * 8 + 2 * 1 + 3 * 6]; ALIGN(uint16_t pal_16bpc[8 /* palette_idx */], 16); }; }; union { int8_t levels[33 * 33]; struct { uint8_t pal_order[64][8]; uint8_t pal_ctx[64]; }; }; }; } scratch; uint8_t txtp_map[16 * 16]; // inter-only union { Dav2dWarpedMotionParams warpmv[2]; // refined mvs after tip/opfl/refinemv union mv rmv[16 * 16 /* y * 16 + x */][2 /* refined, tmv */][2 /* ref */]; }; Av2Filter *lf_mask; int top_pre_cdef_toggle; uint8_t u_has_cf; struct { enum { PASS_ENTROPY = 1 << 0, PASS_MVRES = 1 << 1, PASS_RECON = 1 << 2, PASS_ALL = PASS_ENTROPY | PASS_MVRES | PASS_RECON, } pass; struct thread_data td; struct TaskThreadData *ttd; struct FrameTileThreadData *fttd; int flushed; int die; } task_thread; }; struct OutputQueue *dav2d_queue_output(Dav2dContext *c, Dav2dThreadPicture *p); #endif /* DAV2D_SRC_INTERNAL_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/intra_edge.h000066400000000000000000000043521517466257200231110ustar00rootroot00000000000000/* * Copyright © 2018-2026, 
VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * Edge availability bit flags. Bits 0-2 are the TOP_HAS_RIGHT flags and
 * bits 3-5 the LEFT_HAS_BOTTOM flags, one per chroma layout (I444, I422,
 * I420). The EDGE_ALL_* entries are the unions of those groups.
 */
enum EdgeFlags {
    /* top edge has pixels to its right */
    EDGE_I444_TOP_HAS_RIGHT   = 0x01,
    EDGE_I422_TOP_HAS_RIGHT   = 0x02,
    EDGE_I420_TOP_HAS_RIGHT   = 0x04,

    /* left edge has pixels below it */
    EDGE_I444_LEFT_HAS_BOTTOM = 0x08,
    EDGE_I422_LEFT_HAS_BOTTOM = 0x10,
    EDGE_I420_LEFT_HAS_BOTTOM = 0x20,

    /* aggregate masks */
    EDGE_ALL_TOP_HAS_RIGHT    = EDGE_I420_TOP_HAS_RIGHT |
                                EDGE_I422_TOP_HAS_RIGHT |
                                EDGE_I444_TOP_HAS_RIGHT,
    EDGE_ALL_LEFT_HAS_BOTTOM  = EDGE_I420_LEFT_HAS_BOTTOM |
                                EDGE_I422_LEFT_HAS_BOTTOM |
                                EDGE_I444_LEFT_HAS_BOTTOM,
    EDGE_ALL_TR_AND_BL        = EDGE_ALL_LEFT_HAS_BOTTOM |
                                EDGE_ALL_TOP_HAS_RIGHT,
};
// These flags are OR'ed with the angle parameter in intra predictors.
// They encode intra mode features such as edge availability, multi-line MRL,
// DIP, IBP, edge filtering, and whether we're luma or chroma.
//
// The lowest flag is bit 9, so the angle itself lives in bits 0-8
// (dav2d_prepare_intra_edges() extracts it with "& 511").
//
// ANGLE_IS_LUMA - z1-3 use this flag to switch between a 4-tap
//                 luma filter and a 2-tap (bilinear) chroma filter
// ANGLE_DIP_FLAG – enables directional intra prediction (DIP) mode
// ANGLE_HAS_TOP_FLAG – top reference edge is available for prediction
// ANGLE_HAS_LEFT_FLAG – left reference edge is available for prediction
// ANGLE_MULTI_MRL_FLAG – enables multi-line MRL mode
// ANGLE_MRL_IDX – multi-reference line (MRL) index bits (bits 13–14)
// ANGLE_IBP_FLAG – enables intra bi-prediction for the current block
// ANGLE_USE_EDGE_FILTER_FLAG – apply edge filtering (convolution) to reference samples
// ANGLE_SMOOTH_TOP_EDGE_FLAG – indicates smooth top edge; use reduced filter strength
// ANGLE_SMOOTH_LEFT_EDGE_FLAG – indicates smooth left edge; use reduced filter strength
#define ANGLE_IS_LUMA (1 << 19)
#define ANGLE_DIP_FLAG (1 << 18)
#define ANGLE_HAS_TOP_FLAG (1 << 17)
#define ANGLE_HAS_LEFT_FLAG (1 << 16)
#define ANGLE_MULTI_MRL_FLAG (1 << 15)
#define ANGLE_MRL_IDX_SHIFT (13)
#define ANGLE_MRL_IDX_MASK (3 << ANGLE_MRL_IDX_SHIFT)
#define ANGLE_IBP_FLAG (1 << 12)
#define ANGLE_USE_EDGE_FILTER_FLAG (1 << 11)
#define ANGLE_SMOOTH_TOP_EDGE_FLAG (1 << 10)
#define ANGLE_SMOOTH_LEFT_EDGE_FLAG (1 << 9)

// CFL flag word layout:
// - bits 0-1 (CFL_FLT_TYPE mask): one of the CFL_FLT_TYPE_* values;
// - bits 2-4: CFL_HAS_TOP, CFL_HAS_LEFT, CFL_IS_TOP_SB_EDGE;
// - bits 11-15 (CFL_ALPHA_U_MASK) and bits 27-31 (CFL_ALPHA_V_MASK):
//   two CFL_ALPHA_LOG2-bit wide fields.
//   NOTE(review): presumably the U and V alpha values; confirm at the
//   packing site.
#define CFL_FLT_TYPE_UNIFORM (0)
#define CFL_FLT_TYPE_VSTRIP (1)
#define CFL_FLT_TYPE_GAUSS (2)
#define CFL_FLT_TYPE (0x3)
#define CFL_HAS_TOP (1 << 2)
#define CFL_HAS_LEFT (1 << 3)
#define CFL_IS_TOP_SB_EDGE (1 << 4)
#define CFL_ALPHA_LOG2 (5)
#define CFL_ALPHA_U_SHIFT (16 - CFL_ALPHA_LOG2)
#define CFL_ALPHA_V_SHIFT (32 - CFL_ALPHA_LOG2)
#define CFL_ALPHA_U_MASK (((1U << CFL_ALPHA_LOG2) - 1U) << CFL_ALPHA_U_SHIFT)
#define CFL_ALPHA_V_MASK (((1U << CFL_ALPHA_LOG2) - 1U) << CFL_ALPHA_V_SHIFT)

/*
 * Intra prediction.
 * - angle carries the prediction angle (in degrees) for directional intra
 *   predictors in its low bits, OR'ed with the ANGLE_* flags above.
 *   Non-directional predictors do not use the angle value but may still
 *   read flags from it (e.g. the DC predictors test ANGLE_IBP_FLAG);
 * - topleft is the same as the argument given to dav2d_prepare_intra_edges(),
 *   see ipred_prepare.h for more detailed documentation.
 */
#define decl_angular_ipred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
            int width, int height, int angle, int max_width, int max_height \
            HIGHBD_DECL_SUFFIX)
typedef decl_angular_ipred_fn(*angular_ipred_fn);

/* CFL - explicit and implicit alpha */
/*
 * Does chroma-from-luma prediction for both chroma planes.
 * - w/hpad is the edge of the width/height that extends outside the visible
 *   portion of the frame in 4px units;
 * dst[x,y] = alpha * ac[x,y] + dc
 */
#define decl_cfl_pred_fn(name) \
void (name)(pixel *const *ptrs, const ptrdiff_t *stride, int wpad, \
            int hpad, int w, int h, unsigned flags HIGHBD_DECL_SUFFIX)
typedef decl_cfl_pred_fn(*cfl_pred_fn);

/* CFL - multi-hypothesis cross component prediction (MHCCP) */
/*
 * max luma size
 * = 2 * align64(max_top_w) + max_bl_size + 2 * max_left
 * = 2 * align64(130) + 64 * 64 + 2 * 128
 * = 2 * 192 + 2^12 + 256
 * = 384 + 4096 + 256 = 4736
 */
#define CFL_MHCCP_MAX_LUMA_SIZE 4736
#define decl_cfl_gen_y_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_top_stride, \
            const pixel *src, const pixel *top_sb_edge, ptrdiff_t src_stride, \
            int refw, int refh, int tw, int th, int flags)
typedef decl_cfl_gen_y_fn(*cfl_gen_y_fn);

/*
 * max edge samples: the worst case is the same whether the top edges or
 * the left edges are counted twice:
 * = 2*(1tl+64a+64tr)+64l+64bl = 2tl+64a+64tr+2*(64l+64bl)
 * = 2tl+128a+128tr+64l+64bl = 2tl+64a+64tr+128l+128bl
 * = 386
 */
#define CFL_MHCCP_MAX_EDGE_SAMPLES 386
#define decl_cfl_gen_mat_fn(name) \
void (name)(int32_t mat[3][3], uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES], \
            const pixel *y, ptrdiff_t y_top_stride, int refw, int refh, \
            int edge_flags HIGHBD_DECL_SUFFIX)
typedef decl_cfl_gen_mat_fn(*cfl_gen_mat_fn);

/*
 * alpha[3] = gauss elimination(mat <- imat)
 */
#define decl_cfl_calc_alphas_fn(name) \
void (name)(int alpha[3], const pixel *c, const pixel *top_sb_edge, \
            ptrdiff_t stride, int w, int h, int32_t mat[3][3], \
            const uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES], \
            int edge_flags HIGHBD_DECL_SUFFIX)
typedef decl_cfl_calc_alphas_fn(*cfl_calc_alphas_fn);

/*
 * dst[x,y] = alpha[0] * (luma[center/above/left] >> 3) +
 *            alpha[1] * ((luma[center]/8)^2 + (1 << (bd/2)) >> 3) +
 *            alpha[2] << (bd/2)
 */
#define decl_cfl_mhccp_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *src, \
            ptrdiff_t src_top_stride, int w, int h, const int alpha[3], \
            int edge_flags HIGHBD_DECL_SUFFIX)
typedef decl_cfl_mhccp_pred_fn(*cfl_mhccp_pred_fn);

/*
 * dst[x,y] = pal[idx[x,y]]
 * - palette indices are [0-7]
 * - only 16-byte alignment is guaranteed for idx.
 */
#define decl_pal_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \
            const uint8_t *idx, int w, int h)
typedef decl_pal_pred_fn(*pal_pred_fn);

/*
 * Function table for intra prediction. intra_pred[] is indexed by the
 * mode value returned from dav2d_prepare_intra_edges(); the remaining
 * arrays are indexed as noted next to each dimension.
 */
typedef struct Dav2dIntraPredDSPContext {
    angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];

    // cfl explicit / implicit
    cfl_pred_fn cfl_pred[2 /* explicit, implicit */][3 /* 420, 422, 444 */];

    // cfl mhccp
    cfl_gen_y_fn cfl_gen_y[3 /* 420, 422, 444 */][3 /* cfl_ds_filter_type */];
    cfl_gen_mat_fn cfl_gen_mat[3 /* CflMhDir */];
    cfl_calc_alphas_fn cfl_calc_alphas;
    cfl_mhccp_pred_fn cfl_mhccp_pred[3 /* CflMhDir */];

    // palette
    pal_pred_fn pal_pred;
} Dav2dIntraPredDSPContext;

bitfn_decls(void dav2d_intra_pred_dsp_init, Dav2dIntraPredDSPContext *c);
/*
 * Luma intra edge preparation.
 *
 * x/y/w/h are in luma block (4px) units:
 * - x and y are the absolute block positions in the image;
 * - w and h are the *dependent tile* boundary positions. In practice, w is
 *   the horizontal tile end and h is the vertical image end. The
 *   implementation requires x < w and y < h, and never reads more than
 *   (w - x) * 4 top pixels or (h - y) * 4 left pixels directly above or
 *   beside the block.
 *
 * n_tr/n_bl are the number of 4px units available to the top/right and
 * bottom/left of the block. A value <= 0 means that edge is unavailable, in
 * which case it is filled by repeating the last available edge pixel.
 *
 * dst and stride are pointers to the top/left position of the current block,
 * and can be used to locate the top, left, top/left, top/right and bottom/left
 * edge pointers also.
 *
 * prefilter_toplevel_sb_edge, if non-NULL, replaces the rows above the block
 * as the source of all top edge pixels; it is indexed at [x * 4].
 *
 * mode is the intra prediction mode as coded in the bitstream. The return value
 * is this same mode, converted to an index in the DSP functions.
 *
 * tw/th are the size of the transform block in block (4px) units.
 *
 * intra_flags holds the absolute prediction angle in its low 9 bits, OR'ed
 * with the ANGLE_* flags from ipred.h. The flags read here are top/left edge
 * availability (ANGLE_HAS_TOP_FLAG / ANGLE_HAS_LEFT_FLAG), the MRL index and
 * multi-line MRL flag, DIP, IBP and the edge filter enable.
 *
 * topleft_out is a pointer to scratch memory that will be filled with the edge
 * pixels. The memory array should have space to be indexed in the [-2*w,2*w]
 * range, in the following order:
 * - [0] will be the top/left edge pixel;
 * - [1..w] will be the top edge pixels (1 being left-most, w being right-most);
 * - [w+1..2*w] will be the top/right edge pixels;
 * - [-1..-w] will be the left edge pixels (-1 being top-most, -w being bottom-
 *   most);
 * - [-w-1..-2*w] will be the bottom/left edge pixels.
 * Each edge may remain uninitialized if it is not used by the returned mode
 * index. If edges are not available (because the edge position is outside the
 * tile dimensions or because the flags indicate lack of edge availability),
 * they will be extended from nearby edges as defined by the av2 spec.
 *
 * NOTE(review): the layout above describes the case without MRL. For the
 * Z1/Z2/Z3 modes with a non-zero MRL index the implementation starts the top
 * edge at [mrl_idx + 1] and the left edge at [-(mrl_idx + 1)], and it also
 * writes a second set of edge pixels at an offset of
 * (tw + th) * 8 + 3 * mrl_idx + 1 pixels (tw/th in 4px units). The required
 * buffer size for that case is not visible here; confirm against the
 * callers' allocation.
 */
enum IntraPredMode
bytefn(dav2d_prepare_intra_edges)(DB_ONLY(const int print_dbg)
                                  int x, int y, int w, int h,
                                  int n_tr, int n_bl,
                                  const pixel *dst, ptrdiff_t stride,
                                  const pixel *prefilter_toplevel_sb_edge,
                                  enum IntraPredMode mode,
                                  int tw, int th, int intra_flags,
                                  pixel *topleft_out HIGHBD_DECL_SUFFIX);
// Maps DC_PRED / PAETH_PRED to the predictor variant that matches which
// edges are available.
static const uint8_t mode_conv[2 /* is_paeth */][2 /* have_left */][2 /* have_top */] = {
    [0 /*DC_PRED*/]    = { { DC_128_PRED, TOP_DC_PRED }, { LEFT_DC_PRED, DC_PRED } },
    [1 /*PAETH_PRED*/] = { { DC_128_PRED, VERT_PRED }, { HOR_PRED, PAETH_PRED } },
};

// Set of edges a predictor reads from the prepared edge buffer.
typedef struct EdgeMask {
    uint8_t needs_left:1;
    uint8_t needs_top:1;
    uint8_t needs_topleft:1;
    uint8_t needs_topright:1;
    uint8_t needs_bottomleft:1;
} EdgeMask;

// Edge requirements per implementation mode (the value returned by
// dav2d_prepare_intra_edges()).
static const EdgeMask intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
    [DC_PRED]       = { .needs_top = 1, .needs_left = 1 },
    [VERT_PRED]     = { .needs_top = 1 },
    [HOR_PRED]      = { .needs_left = 1 },
    [LEFT_DC_PRED]  = { .needs_left = 1 },
    [TOP_DC_PRED]   = { .needs_top = 1 },
    [DC_128_PRED]   = { 0 },
    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1, .needs_topleft = 1 },
    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1, .needs_topleft = 1 },
    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topright = 1, .needs_bottomleft = 1 },
    [SMOOTH_V_PRED] = { .needs_top = 1, .needs_bottomleft = 1 },
    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_topright = 1 },
    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
    [DIP_PRED]      = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1, .needs_topright = 1, .needs_bottomleft = 1 }
};

/*
 * Converts the coded intra mode into the implementation mode and copies the
 * edge pixels that mode needs into topleft_out.
 *
 * x, y, w, h and tw4/th4 are in 4px units; n_tr/n_bl are the number of 4px
 * units available to the top/right and bottom/left. intra_flags carries the
 * angle in its low 9 bits plus ANGLE_* flags. Returns the implementation
 * mode.
 */
enum IntraPredMode
bytefn(dav2d_prepare_intra_edges)(DB_ONLY(const int print_dbg)
                                  const int x, const int y,
                                  const int w, const int h,
                                  const int n_tr, const int n_bl,
                                  const pixel *const dst,
                                  const ptrdiff_t stride,
                                  const pixel *prefilter_toplevel_sb_edge,
                                  enum IntraPredMode mode,
                                  const int tw4, const int th4,
                                  const int intra_flags,
                                  pixel *const topleft_out
                                  HIGHBD_DECL_SUFFIX)
{
    // NOTE(review): bitdepth_max is not a named parameter here; presumably
    // it is supplied by HIGHBD_DECL_SUFFIX / the bitdepth template.
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    assert(y < h && x < w);

    // Unpack intra_flags.
    int is_dir = 0;
    const int enable_edge_filter = !!(intra_flags & ANGLE_USE_EDGE_FILTER_FLAG);
    const int angle = intra_flags & 511;
    const int apply_dip = !!(intra_flags & ANGLE_DIP_FLAG);
    const int apply_ibp = !!(intra_flags & ANGLE_IBP_FLAG);
    const int mrl_idx = (intra_flags & ANGLE_MRL_IDX_MASK) >> ANGLE_MRL_IDX_SHIFT;
    const int mrl_mul = !!(intra_flags & ANGLE_MULTI_MRL_FLAG);
    const int have_left = !!(intra_flags & ANGLE_HAS_LEFT_FLAG);
    const int have_top = !!(intra_flags & ANGLE_HAS_TOP_FLAG);
    int tl_filter = 0;

    switch (mode) {
    case VERT_PRED:
    case HOR_PRED:
    case DIAG_DOWN_LEFT_PRED:
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
    case HOR_UP_PRED:
    case VERT_LEFT_PRED: {
        // Directional modes are re-classified purely by angle:
        // <90 -> Z1 (needs top or IBP, else VERT), 90 -> VERT,
        // (90,180) -> Z2, 180 -> HOR, >180 -> Z3 (needs left or IBP, else HOR).
        is_dir = 1;
        if (angle <= 90)
            mode = angle < 90 && (have_top || apply_ibp) ? Z1_PRED : VERT_PRED;
        else if (angle < 180)
            mode = Z2_PRED;
        else
            mode = angle > 180 && (have_left || apply_ibp) ? Z3_PRED : HOR_PRED;
        // The unsigned range check is true only for Z1_PRED..Z1_PRED+2.
        // NOTE(review): relies on Z1/Z2/Z3 being consecutive enum values.
        tl_filter = (unsigned) mode - Z1_PRED <= 2U && have_left && have_top &&
                    !mrl_idx && enable_edge_filter && tw4 + th4 >= 6;
        break;
    }
    case DC_PRED:
        mode = apply_dip ? DIP_PRED : mode_conv[0][have_left][have_top];
        break;
    case PAETH_PRED:
        assert(!apply_dip);
        mode = mode_conv[1][have_left][have_top];
        break;
    default:
        break;
    }
    assert(!mrl_idx || is_dir);

    EdgeMask e = intra_prediction_edges[mode];
    if ((mode == Z1_PRED || mode == Z3_PRED) && apply_ibp)
        e = intra_prediction_edges[DIP_PRED]; // all edges

    // dst_top / dst_top2 / top_stride are only initialized when the top row
    // exists and is either needed directly or needed as the fill value for a
    // missing left edge. Every later read is guarded by the same conditions.
    const pixel *dst_top, *dst_top2;
    ptrdiff_t top_stride;
    if (have_top &&
        ((e.needs_top | e.needs_topleft | e.needs_topright) ||
         ((e.needs_left | e.needs_bottomleft) && !have_left)))
    {
        if (prefilter_toplevel_sb_edge) {
            // A single saved row stands in for every line above the block,
            // hence the zero stride.
            dst_top = dst_top2 = &prefilter_toplevel_sb_edge[x * 4];
            top_stride = 0;
        } else {
            // dst_top is the selected MRL line, dst_top2 the adjacent line.
            dst_top = &dst[-((mrl_idx + 1) * PXSTRIDE(stride))];
            dst_top2 = &dst[-PXSTRIDE(stride)];
            top_stride = stride;
        }
    }

    const int tw = tw4 << 2, th = th4 << 2;
    // in case of multi-mrl: ptr1 will point to the mrl_idx line, which
    // contains one topleft pixel, mrl_idx pixels between top/left and
    // top-most left pixel, then width pixels above, followed by height
    // pixels and 2*mrl_idx pixels top/right. Then the left edge of the
    // adjacent (non-mrl_idx) line, which contains width+height pixels.
    const int diag_mrl_idx = (unsigned) mode - Z1_PRED <= 2U ? mrl_idx : 0;
    // Offset of the second (adjacent-line) edge set inside topleft_out.
    const int e_stride = (tw + th) * 2 + diag_mrl_idx * 3 + 1;

    if (e.needs_left || tl_filter) {
        // sz: pixels to produce for the primary left edge;
        // sz2: pixels for the adjacent-line copy (multi-MRL only).
        int sz = e.needs_left ? th : 1, sz2 = th;
        if (e.needs_bottomleft) {
            sz += apply_dip ? th >> 2 : is_dir ? tw + 2 * diag_mrl_idx : 1 /* smooth */;
            sz2 = sz - 2 * diag_mrl_idx;
        }
        // Left edges are stored top-to-bottom at decreasing indices.
        pixel *const left = &topleft_out[-(diag_mrl_idx + 1)];
        pixel *const left2 = &topleft_out[e_stride - 1];
        if (have_left) {
            int px_have = e.needs_left ? imin(th, (h - y) << 2) : 1;
            int i;
            for (i = 0; i < px_have; i++)
                left[-i] = dst[PXSTRIDE(stride) * i - 1 - mrl_idx];
            if (e.needs_bottomleft && n_bl > 0) {
                px_have += imin(n_bl << 2, sz - th);
                for (; i < px_have; i++)
                    left[-i] = dst[PXSTRIDE(stride) * i - 1 - mrl_idx];
            }
            // Pad the remainder with the last pixel copied above.
            if (px_have < sz)
                pixel_set(&left[1 - sz], left[1 - i], sz - px_have);
            if (mrl_mul) {
                px_have = imin(px_have, sz2);
                for (int i = 0; i < px_have; i++)
                    left2[-i] = dst[PXSTRIDE(stride) * i - 1];
                // "i" below is the outer counter (the loop-local one is out
                // of scope). Padding only happens when px_have was not
                // clamped, in which case the outer i equals px_have.
                if (px_have < sz2)
                    pixel_set(&left2[1 - sz2], left2[1 - i], sz2 - px_have);
            }
        } else {
            // No left edge: use the first top pixel, or mid-grey + 1.
            pixel_set(&left[1 - sz], have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1, sz);
            if (mrl_mul)
                pixel_set(&left2[1 - sz2], have_top ? *dst_top2 : ((1 << bitdepth) >> 1) + 1, sz2);
        }
#if DEBUG_BLOCK_INFO
        if (print_dbg) {
            hex_dump(&left[1 - sz], 0, sz, 1, "l");
            if (mrl_mul)
                hex_dump(&left2[1 - sz2], 0, sz2, 1, "l2");
        }
#endif
    } else if (e.needs_bottomleft) {
        // Only a single bottom/left pixel is needed here.
        assert(mode == SMOOTH_V_PRED);
        pixel *const bottom_left = &topleft_out[-(1 + th)];
        if (!have_left) {
            *bottom_left = have_top ? *dst_top : ((1 << bitdepth) >> 1) + 1;
        } else if (n_bl <= 0) {
            // Bottom/left unavailable: repeat the last available left pixel.
            *bottom_left = dst[PXSTRIDE(stride) * (imin(th, (h - y) << 2) - 1) - 1];
        } else {
            *bottom_left = dst[PXSTRIDE(stride) * th - 1];
        }
#if DEBUG_BLOCK_INFO
        if (print_dbg)
            hex_dump(bottom_left, 0, 1, 1, "bl");
#endif
    }

    if (e.needs_top || tl_filter) {
        // Mirror of the left-edge handling above, for the top edge.
        int sz = e.needs_top ? tw : 1, sz2 = tw;
        if (e.needs_topright) {
            sz += apply_dip ? tw >> 2 : is_dir ? th + 2 * diag_mrl_idx : 1 /* smooth */;
            sz2 = sz - 2 * diag_mrl_idx;
        }
        pixel *const top = &topleft_out[diag_mrl_idx + 1];
        pixel *const top2 = &topleft_out[e_stride + 1];
        if (have_top) {
            int px_have = e.needs_top ? imin(tw, (w - x) << 2) : 1;
            pixel_copy(top, dst_top, px_have);
            if (e.needs_topright && n_tr > 0) {
                px_have += imin(n_tr << 2, sz - tw);
                pixel_copy(top + tw, dst_top + tw, px_have - tw);
            }
            // Pad the remainder with the last available top pixel.
            if (px_have < sz)
                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
            if (mrl_mul) {
                px_have = imin(px_have, sz2);
                pixel_copy(top2, dst_top2, px_have);
                if (px_have < sz2)
                    pixel_set(top2 + px_have, top2[px_have - 1], sz2 - px_have);
            }
        } else {
            // No top edge: use the first left pixel, or mid-grey - 1.
            pixel_set(top, have_left ? dst[-(1 + mrl_idx)] : ((1 << bitdepth) >> 1) - 1, sz);
            if (mrl_mul)
                pixel_set(top2, have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1, sz2);
        }
#if DEBUG_BLOCK_INFO
        if (print_dbg) {
            hex_dump(top, 0, sz, 1, "t");
            if (mrl_mul)
                hex_dump(top2, 0, sz2, 1, "t2");
        }
#endif
    } else if (e.needs_topright) {
        // Only a single top/right pixel is needed here.
        assert(mode == SMOOTH_H_PRED);
        pixel *const top_right = &topleft_out[1 + tw];
        if (!have_top) {
            *top_right = have_left ? dst[-1] : ((1 << bitdepth) >> 1) - 1;
        } else if (n_tr <= 0) {
            // Top/right unavailable: repeat the last available top pixel.
            *top_right = dst_top[imin(tw, (w - x) << 2) - 1];
        } else {
            *top_right = dst_top[tw];
        }
#if DEBUG_BLOCK_INFO
        if (print_dbg)
            hex_dump(top_right, 0, 1, 1, "tr");
#endif
    }

    if (e.needs_topleft) {
        assert(diag_mrl_idx == mrl_idx);
        if (have_top && have_left) {
            // Fill the 2 * mrl_idx + 1 corner pixels around topleft_out[0]:
            // negative indices walk down the column left of the block,
            // non-negative indices walk right along the selected top line.
            for (int i = -mrl_idx; i < 0; i++)
                topleft_out[i] = dst_top[-(mrl_idx + 1) + (-i) * PXSTRIDE(top_stride)];
            for (int i = 0; i <= mrl_idx; i++)
                topleft_out[i] = dst_top[-(mrl_idx + 1 - i)];
        } else {
            // Corner not available: replicate left, else top, else mid-grey.
            int v;
            if (have_left)
                v = dst[-(1 + mrl_idx)];
            else
                v = have_top ? *dst_top : (1 << bitdepth) >> 1;
            pixel_set(&topleft_out[-mrl_idx], v, 2 * mrl_idx + 1);
        }
        // Corner pixel of the adjacent-line edge set; written whether or not
        // mrl_mul is set.
        topleft_out[e_stride] = have_left ? have_top ? dst_top2[-1] : dst[-1] :
                                have_top ? *dst_top2 : (1 << bitdepth) >> 1;
#if DEBUG_BLOCK_INFO
        if (print_dbg) {
            hex_dump(&topleft_out[-mrl_idx], 0, 2 * mrl_idx + 1, 1, "tl");
            if (mrl_mul)
                hex_dump(&topleft_out[e_stride], 0, 1, 1, "tl2");
        }
#endif
        if (tl_filter) {
            // 3-tap corner smoothing with weights 5/6/5 over 16, rounded.
            const int c = topleft_out[0] + (topleft_out[-1] + topleft_out[0] + topleft_out[1]) * 5;
            topleft_out[0] = (c + 8) >> 4;
        }
    }

    return mode;
}
*/ #include "config.h" #include #include #include "common/attributes.h" #include "common/intops.h" #include "src/derivation.h" #include "src/dip_tables.h" #include "src/ibp.h" #include "src/ipred.h" #include "src/tables.h" typedef struct { int8_t a; uint8_t b, c; int8_t d; } DRFilter4Tap; static const DRFilter4Tap dr_interp_filter[32] = { { 0, 128, 0, 0 }, { -2, 127, 4, -1 }, { -3, 125, 8, -2 }, { -5, 123, 13, -3 }, { -6, 121, 17, -4 }, { -7, 118, 22, -5 }, { -9, 116, 27, -6 }, { -9, 112, 32, -7 }, { -10, 109, 37, -8 }, { -11, 106, 41, -8 }, { -11, 102, 46, -9 }, { -12, 98, 52, -10 }, { -12, 94, 56, -10 }, { -12, 90, 61, -11 }, { -12, 85, 66, -11 }, { -12, 81, 71, -12 }, { -12, 76, 76, -12 }, { -12, 71, 81, -12 }, { -11, 66, 85, -12 }, { -11, 61, 90, -12 }, { -10, 56, 94, -12 }, { -10, 52, 98, -12 }, { -9, 46, 102, -11 }, { -8, 41, 106, -11 }, { -8, 37, 109, -10 }, { -7, 32, 112, -9 }, { -6, 27, 116, -9 }, { -5, 22, 118, -7 }, { -4, 17, 121, -6 }, { -3, 13, 123, -5 }, { -2, 8, 125, -3 }, { -1, 4, 127, -2 } }; static NOINLINE void splat_dc(pixel *dst, const ptrdiff_t stride, const int width, int height, const int dc) { do { for (int x = 0; x < width; x++) dst[x] = dc; dst += PXSTRIDE(stride); } while (--height); } static unsigned dc_gen_top(const pixel *const topleft, const int width, const int skip) { unsigned dc = width >> (1 + skip); for (int i = 0; i < width; i += 1 + skip) dc += topleft[1 + i]; return dc >> ctz(width >> skip); } static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const unsigned dc = dc_gen_top(topleft, width, 0); if (a & ANGLE_IBP_FLAG) { const int h = height >> 2; const uint8_t *w_y = &dav2d_dc_ibp_weights[h]; for (int y = 0; y < h; y++) { const int wy = 128 - w_y[y]; const int dc_wy = dc * w_y[y]; for (int x = 0; x < width; x++) { dst[x] = (topleft[x + 1] * wy + dc_wy + 64) >> 7; } dst += 
PXSTRIDE(stride); } height -= h; } splat_dc(dst, stride, width, height, dc); } static unsigned dc_gen_left(const pixel *const topleft, const int height, const int skip) { unsigned dc = height >> (1 + skip); for (int i = 0; i < height; i += 1 + skip) dc += topleft[-(1 + i)]; return dc >> ctz(height >> skip); } static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const unsigned dc = dc_gen_left(topleft, height, 0); if (a & ANGLE_IBP_FLAG) { const int w = width >> 2; const uint8_t *w_x = &dav2d_dc_ibp_weights[w]; for (int y = 0; y < height; y++) { const int left = topleft[-(y + 1)]; for (int x = 0; x < w; x++) { dst[x] = (left * (128 - w_x[x]) + dc * w_x[x] + 64) >> 7; } dst += PXSTRIDE(stride); } dst -= PXSTRIDE(stride) * height; width -= w; dst += w; } splat_dc(dst, stride, width, height, dc); } static inline unsigned fast_div32_dc(const unsigned num, const unsigned den) { assert(den > 0 && den <= 255); int shift = ulog2(den); const int rem = den - (1 << shift); const int idx = rem << (7 - shift); assert(idx <= 128); shift += 9; return ((num * dav2d_div_recip[idx]) + ((1 << shift) >> 1)) >> shift; } static unsigned dc_gen(const pixel *const topleft, const int width, const int height, const int hskip, const int vskip HIGHBD_DECL_SUFFIX) { const int n_pel = (width >> hskip) + (height >> vskip); unsigned dc = 0; for (int i = 0; i < width; i += 1 + hskip) dc += topleft[i + 1]; for (int i = 0; i < height; i += 1 + vskip) dc += topleft[-(i + 1)]; if (!(n_pel & (n_pel - 1))) return (dc + (width >> hskip)) >> ctz(n_pel); return iclip_pixel(fast_div32_dc(dc, n_pel)); } static void ipred_dc_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, int width, int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const unsigned dc = dc_gen(topleft, width, height, 0, 0 HIGHBD_TAIL_SUFFIX); if 
(a & ANGLE_IBP_FLAG) { pixel *const p_dst = dst; const int h = height >> 2; const int w = width >> 2; const int x_start = width < height ? w : 0; const uint8_t *const w_y = &dav2d_dc_ibp_weights[h]; for (int y = 0; y < h; y++) { const int wy = 128 - w_y[y]; const int dc_wy = dc * w_y[y]; for (int x = x_start; x < width; x++) { dst[x] = (topleft[x + 1] * wy + dc_wy + 64) >> 7; } dst += PXSTRIDE(stride); } const int y_start = width >= height ? h : 0; dst = p_dst + y_start * PXSTRIDE(stride); const uint8_t *const w_x = &dav2d_dc_ibp_weights[w]; for (int y = y_start; y < height; y++) { const int left = topleft[-(y + 1)]; for (int x = 0; x < w; x++) { dst[x] = (left * (128 - w_x[x]) + dc * w_x[x] + 64) >> 7; } dst += PXSTRIDE(stride); } dst = p_dst + (h * PXSTRIDE(stride) + w); width -= w; height -= h; } splat_dc(dst, stride, width, height, dc); } static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { #if BITDEPTH == 16 const int dc = (bitdepth_max + 1) >> 1; #else const int dc = 128; #endif splat_dc(dst, stride, width, height, dc); } static void ipred_v_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, const int angle, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int mrl_mul = !!(angle & ANGLE_MULTI_MRL_FLAG); const pixel *const top = &topleft[1]; if (mrl_mul) { // Safe maximum size for edge buffers const int e_stride = (width + height) * 2 + 1; const pixel *const top2 = &topleft[1 + e_stride]; for (int x = 0; x < width; x++) { dst[x] = (top[x] + top2[x] + 1) >> 1; } const pixel *const edge = dst; int y = 1; do { dst += PXSTRIDE(stride); pixel_copy(dst, edge, width); } while (++y < height); return; } for (int y = 0; y < height; y++) { pixel_copy(dst, top, width); dst += PXSTRIDE(stride); } } static void ipred_h_c(pixel *dst, const ptrdiff_t stride, 
const pixel *const topleft, const int width, const int height, const int angle, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int mrl_mul = !!(angle & ANGLE_MULTI_MRL_FLAG); const pixel *left = &topleft[-1]; if (mrl_mul) { // Safe maximum size for edge buffers const int e_stride = (width + height) * 2 + 1; const pixel *left2 = &topleft[e_stride - 1]; for (int y = 0; y < height; y++) { const int v = (left[-y] + left2[-y] + 1) >> 1; pixel_set(dst, v, width); dst += PXSTRIDE(stride); } return; } for (int y = 0; y < height; y++) { pixel_set(dst, left[-y], width); dst += PXSTRIDE(stride); } } static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride, const pixel *const tl_ptr, const int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int topleft = tl_ptr[0]; for (int y = 0; y < height; y++) { const int left = tl_ptr[-(y + 1)]; for (int x = 0; x < width; x++) { const int top = tl_ptr[1 + x]; const int base = left + top - topleft; const int ldiff = abs(left - base); const int tdiff = abs(top - base); const int tldiff = abs(topleft - base); dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left : tdiff <= tldiff ? 
top : topleft; } dst += PXSTRIDE(stride); } } static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int bwl2 = ulog2(width), bhl2 = ulog2(height); const int rnd_ver = height >> 1; const int rnd_hor = width >> 1; const int n_pel = width * height; const int scale = (n_pel >= 64) + (n_pel > 512); const uint8_t *const weights = dav2d_sm_weights[scale]; const int right = topleft[width + 1], bottom = topleft[-(height + 1)]; for (int y = 0; y < height; y++) { const int left = topleft[-(y + 1)]; const int diff_hor = left - right; const int off_ver = height - 1 - y; const int w_ver = weights[y]; for (int x = 0; x < width; x++) { const int above = topleft[1 + x]; const int mul_ver = (above - bottom) * off_ver; const int mul_hor = diff_hor * (width - 1 - x); int pred_ver = bottom + ((mul_ver + rnd_ver) >> bhl2); int pred_hor = right + ((mul_hor + rnd_hor) >> bwl2); pred_ver += ((above - pred_ver) * w_ver + 32) >> 6; pred_hor += ((left - pred_hor) * weights[x] + 32) >> 6; dst[x] = (pred_ver + pred_hor + 1) >> 1; } dst += PXSTRIDE(stride); } } static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int bhl2 = ulog2(height); const int rnd = height >> 1; const int n_pel = width * height; const int scale = (n_pel >= 64) + (n_pel > 512); const uint8_t *const weights = dav2d_sm_weights[scale]; const int bottom = topleft[-(height + 1)]; for (int y = 0; y < height; y++) { const int off = height - 1 - y; const int w_ver = weights[y]; for (int x = 0; x < width; x++) { const int above = topleft[1 + x]; const int mul = (above - bottom) * off; const int pred = bottom + ((mul + rnd) >> bhl2); dst[x] = pred + (((above - pred) * w_ver + 32) >> 6); } dst += PXSTRIDE(stride); } } static 
void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, const int a, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int bwl2 = ulog2(width); const int rnd = width >> 1; const int n_pel = width * height; const int scale = (n_pel >= 64) + (n_pel > 512); const uint8_t *const weights = dav2d_sm_weights[scale]; const int right = topleft[width + 1]; for (int y = 0; y < height; y++) { const int left = topleft[-(y + 1)]; const int diff = left - right; for (int x = 0; x < width; x++) { const int mul = diff * (width - 1 - x); const int pred = right + ((mul + rnd) >> bwl2); dst[x] = pred + (((left - pred) * weights[x] + 32) >> 6); } dst += PXSTRIDE(stride); } } static NOINLINE int get_filter_strength(const int wh, const int angle, const int is_sm) { if (is_sm) { if (wh <= 8) { if (angle >= 64) return 2; if (angle >= 40) return 1; } else if (wh <= 16) { if (angle >= 48) return 2; if (angle >= 20) return 1; } else if (wh <= 24) { if (angle >= 4) return 3; } else { return 3; } } else { if (wh <= 8) { if (angle >= 56) return 1; } else if (wh <= 16) { if (angle >= 40) return 1; } else if (wh <= 24) { if (angle >= 32) return 3; if (angle >= 16) return 2; if (angle >= 8) return 1; } else if (wh <= 32) { if (angle >= 32) return 3; if (angle >= 4) return 2; return 1; } else { return 3; } } return 0; } static NOINLINE void filter_edge(pixel *const out, const int sz, const int lim_from, const int lim_to, const pixel *const in, const int from, const int to, const int strength) { static const uint8_t kernel[3][5] = { { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } }; assert(strength > 0); int i = 0; for (; i < imin(sz, lim_from); i++) out[i] = in[iclip(i, from, to - 1)]; for (; i < imin(lim_to, sz); i++) { int s = 0; for (int j = 0; j < 5; j++) s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j]; out[i] = (s + 8) >> 4; } for (; i < sz; i++) out[i] = in[iclip(i, from, to - 1)]; } 
static void ibp_blend(pixel *dst, const ptrdiff_t stride, const pixel *tmp /* stride=64 */, const int width, const int height, const int inv, const uint8_t weights[16][16] HIGHBD_DECL_SUFFIX) { const int x_shift = width >> (4 + 1); const int y_shift = height >> (4 + 1); for (int y = 0; y < height; y++) { const int wy = y >> y_shift; for (int x = 0; x < width; x++) { const int wx = x >> x_shift; const int weight = weights[inv ? wx : wy][inv ? wy : wx]; dst[x] = (tmp[x] * (128 - weight) + dst[x] * weight + 64) >> 7; } dst += PXSTRIDE(stride); tmp += 64; } } static decl_angular_ipred_fn(ipred_z3_c); static void ipred_z1_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in, const int width, const int height, int angle, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int angle_flags = angle & ~(511 | ANGLE_IBP_FLAG); const int is_luma = angle & ANGLE_IS_LUMA; const int is_sm_t = !!(angle & ANGLE_SMOOTH_TOP_EDGE_FLAG); const int enable_intra_edge_filter = !!(angle & ANGLE_USE_EDGE_FILTER_FLAG); const int enable_ibp = !!(angle & ANGLE_IBP_FLAG); const int mrl_idx = (angle & ANGLE_MRL_IDX_MASK) >> ANGLE_MRL_IDX_SHIFT; const int mrl_mul = !!(angle & ANGLE_MULTI_MRL_FLAG); const int have_top = !!(angle & ANGLE_HAS_TOP_FLAG); angle &= 511; assert(angle < 90); if (mrl_mul) { const int e_stride = (width + height) * 2 + mrl_idx * 3 + 1; const pixel *tl2 = &topleft_in[e_stride]; pixel tmp[64 * 64]; assert(is_luma); ipred_z1_c(tmp, 64 * sizeof(pixel), topleft_in, width, height, angle | (mrl_idx << ANGLE_MRL_IDX_SHIFT) | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); ipred_z1_c(dst, stride, tl2, width, height, angle | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) dst[x] = (tmp[y * 64 + x] + dst[x] + 1) >> 1; dst += PXSTRIDE(stride); } return; } const int dx = dav2d_dr_intra_derivative[angle]; const int max_base_x = (width + height) - 1 + (mrl_idx << 1); // 
Buffer organization: // - 1 pixel left padding; // - 1 pixel top/left; // - mrl_idx pixels extra between top/left and top (max 3); // - width pixels top; // - height pixels top/right; // - 2 * mrl_idx pixels extra between top/right and right padding (max 2 * 3); // - 2 pixels right padding. pixel filt[1 + 1 + 3 + 64 + 64 + 2 * 3 + 2], *const top = &filt[2 + mrl_idx]; const int str = enable_intra_edge_filter && have_top && !mrl_idx ? get_filter_strength(width + height, 90 - angle, is_sm_t) : 0; const int sz = 1 + mrl_idx + width + height + mrl_idx * 2; if (str) { filter_edge(&filt[1], sz, 1, sz + max_width - width, topleft_in, 0, sz, str); } else { pixel_copy(&filt[1], topleft_in, sz); } filt[0] = filt[1]; filt[sz + 2] = filt[sz + 1] = filt[sz]; for (int y = 0, xpos = dx * (1 + mrl_idx); y < height; y++, xpos += dx) { int base = xpos >> 6; if (base > max_base_x) { for (; y < height; y++) { pixel_set(&dst[y * PXSTRIDE(stride)], top[max_base_x], width); } break; } const int shift = (xpos & 0x3F) >> 1; const DRFilter4Tap f = dr_interp_filter[shift]; for (int x = 0; x < width; x++, base++) { if (base > max_base_x) { pixel_set(&dst[y * PXSTRIDE(stride) + x], top[max_base_x], width - x); break; } if (is_luma) { const int v = f.a * top[base - 1] + f.b * top[base] + f.c * top[base + 1] + f.d * top[base + 2]; dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 64) >> 7); } else { const int v = (32 - shift) * top[base] + shift * top[base + 1]; dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5); } } } if (enable_ibp) { // I've observed the following values here: // angle | mode_idx | intra mode | angle_delta | comment // 84 | 0 | vert_pred | -2 | // 73 | 1 | vert_left | +2 | // 67 | 2 | vert_left | 0 | // 61 | 3 | vert_left | -2 | // 51 | 4 | diag_down_left | +2 | // 45 | 5 | diag_down_left | 0 | // 39 | 6 | diag_down_left | -2 | // 29 | 6 | diag_down_right | +2 | wide_angle_remap // 23 | 6 | diag_down_right | 0 | wide_angle_remap const int mode_idx = imin(10 - (angle >> 
3), 6); pixel tmp[64 * 64]; ipred_z3_c(tmp, 64 * sizeof(pixel), topleft_in, width, height, (180 + angle) | angle_flags, max_width, max_height HIGHBD_TAIL_SUFFIX); ibp_blend(dst, stride, tmp, width, height, 0, dav2d_ibp_weights[mode_idx] HIGHBD_TAIL_SUFFIX); } } static void ipred_z2_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in, const int width, const int height, int angle, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int mrl_mul = !!(angle & ANGLE_MULTI_MRL_FLAG); const int is_luma = angle & ANGLE_IS_LUMA; const int is_sm_l = !!(angle & ANGLE_SMOOTH_LEFT_EDGE_FLAG); const int is_sm_t = !!(angle & ANGLE_SMOOTH_TOP_EDGE_FLAG); const int enable_intra_edge_filter = !!(angle & ANGLE_USE_EDGE_FILTER_FLAG); const int mrl_idx = (angle & ANGLE_MRL_IDX_MASK) >> ANGLE_MRL_IDX_SHIFT; const int have_top = !!(angle & ANGLE_HAS_TOP_FLAG); const int have_left = !!(angle & ANGLE_HAS_LEFT_FLAG); angle &= 511; assert(angle > 90 && angle < 180); if (mrl_mul) { const int e_stride = (width + height) * 2 + mrl_idx * 3 + 1; const pixel *tl2 = &topleft_in[e_stride]; pixel tmp[64 * 64]; assert(is_luma); ipred_z2_c(tmp, 64 * sizeof(pixel), topleft_in, width, height, angle | (mrl_idx << ANGLE_MRL_IDX_SHIFT) | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); ipred_z2_c(dst, stride, tl2, width, height, angle | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) dst[x] = (tmp[y * 64 + x] + dst[x] + 1) >> 1; dst += PXSTRIDE(stride); } return; } const int dy = dav2d_dr_intra_derivative[angle - 90]; const int dx = dav2d_dr_intra_derivative[180 - angle]; // Buffer organization: // - 1 pixels top|left padding; // - 1 pixel top/left; // - mrl_idx pixels extra between top/left and left|top (max 3); // - height|width pixels left|top; // - 1 pixel bottom|right padding. 
pixel filt[1 + 1 + 3 + 64 + 1], *const top = &filt[mrl_idx]; const int str_t = enable_intra_edge_filter && have_top && !mrl_idx ? get_filter_strength(width + height, angle - 90, is_sm_t) : 0; const int sz_t = 1 + width + mrl_idx; if (str_t) { filter_edge(&filt[1], sz_t, 1, sz_t + max_width - width, topleft_in, 0, sz_t, str_t); } else { pixel_copy(&filt[1], topleft_in, sz_t); } filt[0] = filt[1]; filt[sz_t + 1] = filt[sz_t]; pixel filt2[1 + 1 + 3 + 64 + 1], *const left = &filt2[height + 2]; const int str_l = enable_intra_edge_filter && have_left && !mrl_idx ? get_filter_strength(width + height, 180 - angle, is_sm_l) : 0; const int sz_l = 1 + height + mrl_idx; if (str_l) { filter_edge(&filt2[1], sz_l, height - max_height, sz_l - 1, &topleft_in[-height], 0, sz_l, str_l); } else { pixel_copy(&filt2[1], &topleft_in[-(height + mrl_idx)], sz_l); } filt2[1 + sz_l] = filt2[sz_l]; filt2[0] = filt2[1]; for (int y = 0; y < height; y++) { const int ypos = y + 1; int xpos = -(ypos + mrl_idx) * dx; int x; for (x = 0; x < width && xpos < -(64 * (1 + mrl_idx)); x++, xpos += 64) { const int xpos_l = x + 1; const int ypos_l = (y << 6) - (xpos_l + mrl_idx) * dy; const int base_y = ypos_l >> 6; assert(base_y >= -(1 + mrl_idx)); const int shift = (ypos_l & 0x3F) >> 1; if (is_luma) { const int v = dr_interp_filter[shift].a * left[-(base_y + 1)] + dr_interp_filter[shift].b * left[-(base_y + 2)] + dr_interp_filter[shift].c * left[-(base_y + 3)] + dr_interp_filter[shift].d * left[-(base_y + 4)]; dst[x] = iclip_pixel((v + 64) >> 7); } else { const int v = (32 - shift) * left[-(base_y + 2)] + shift * left[-(base_y + 3)]; dst[x] = iclip_pixel((v + 16) >> 5); } } for (; x < width; x++, xpos += 64) { const int base_x = xpos >> 6; const int shift = (xpos & 0x3F) >> 1; if (is_luma) { const int v = dr_interp_filter[shift].a * top[base_x + 1] + dr_interp_filter[shift].b * top[base_x + 2] + dr_interp_filter[shift].c * top[base_x + 3] + dr_interp_filter[shift].d * top[base_x + 4]; dst[x] = 
iclip_pixel((v + 64) >> 7); } else { const int v = (32 - shift) * top[base_x + 2] + shift * top[base_x + 3]; dst[x] = iclip_pixel((v + 16) >> 5); } } dst += PXSTRIDE(stride); } } static void ipred_z3_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in, const int width, const int height, int angle, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int angle_flags = angle & ~(511 | ANGLE_IBP_FLAG); const int is_luma = angle & ANGLE_IS_LUMA; const int is_sm_l = !!(angle & ANGLE_SMOOTH_LEFT_EDGE_FLAG); const int enable_intra_edge_filter = !!(angle & ANGLE_USE_EDGE_FILTER_FLAG); const int have_left = !!(angle & ANGLE_HAS_LEFT_FLAG); const int enable_ibp = !!(angle & ANGLE_IBP_FLAG); const int mrl_idx = (angle & ANGLE_MRL_IDX_MASK) >> ANGLE_MRL_IDX_SHIFT; const int mrl_mul = !!(angle & ANGLE_MULTI_MRL_FLAG); angle &= 511; assert(angle > 180); if (mrl_mul) { const int e_stride = (width + height) * 2 + mrl_idx * 3 + 1; const pixel *tl2 = &topleft_in[e_stride]; pixel tmp[64 * 64]; assert(is_luma); ipred_z3_c(tmp, 64 * sizeof(pixel), topleft_in, width, height, angle | (mrl_idx << ANGLE_MRL_IDX_SHIFT) | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); ipred_z3_c(dst, stride, tl2, width, height, angle | ANGLE_IS_LUMA, max_width, max_height HIGHBD_TAIL_SUFFIX); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) dst[x] = (tmp[y * 64 + x] + dst[x] + 1) >> 1; dst += PXSTRIDE(stride); } return; } const int dy = dav2d_dr_intra_derivative[270 - angle]; const int max_base_y = width + height - 1 + (mrl_idx << 1); // Buffer organization: // - 1 pixel top padding; // - 1 pixel top/left; // - mrl_idx pixels extra between top/left and left (max 3); // - height pixels left; // - width pixels bottom/left; // - 2 * mrl_idx pixels extra between bottom/left and bottom padding (max 2 * 3); // - 2 pixels bottom padding. 
pixel filt[1 + 1 + 3 + 64 + 64 + 2 * 3 + 2]; pixel *const left = &filt[1 + width + height + mrl_idx * 2]; const int n_px = width + height; const int str = enable_intra_edge_filter && !mrl_idx && have_left ? get_filter_strength(n_px, angle - 180, is_sm_l) : 0; const int sz = 1 + mrl_idx + width + height + mrl_idx * 2; if (str) { filter_edge(&filt[2], sz, height - max_height, sz - 1, &topleft_in[1 - sz], 0, sz, str); } else { pixel_copy(&filt[2], &topleft_in[1 - sz], sz); } filt[0] = filt[1] = filt[2]; filt[sz + 2] = filt[sz + 1]; int ypos = dy * (1 + mrl_idx); for (int x = 0; x < width; x++, ypos += dy) { const int shift = (ypos & 0x3F) >> 1; const DRFilter4Tap f = dr_interp_filter[shift]; for (int y = 0, base = ypos >> 6; y < height; y++, base++) { if (base <= max_base_y) { if (is_luma) { const int v = f.a * left[-(base - 1)] + f.b * left[-base] + f.c * left[-(base + 1)] + f.d * left[-(base + 2)]; dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 64) >> 7); } else { const int v = (32 - shift) * left[-base] + shift * left[-(base + 1)]; dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5); } } else { do { dst[y * PXSTRIDE(stride) + x] = left[-max_base_y]; } while (++y < height); break; } } } if (enable_ibp) { // I've observed the following values here: // angle | mode_idx | intra mode | angle_delta | comment // 186 | 0 | hor_pred | +2 | // 197 | 1 | hor_up_pred | -2 | // 203 | 2 | hor_up_pred | 0 | // 209 | 3 | hor_up_pred | +2 | // 219 | 4 | diag_down_right | -2 | // 225 | 5 | diag_down_right | 0 | // 231 | 6 | diag_down_right | +2 | // 241 | 6 | diag_down_left | -2 | wide_angle_remap // 247 | 6 | diag_down_left | 0 | wide_angle_remap // 253 | 6 | diag_down_left | +2 | wide_angle_remap const int mode_idx = imin((angle - 183) >> 3, 6); pixel tmp[64 * 64]; ipred_z1_c(tmp, 64 * sizeof(pixel), topleft_in, width, height, (angle - 180) | angle_flags, max_width, max_height HIGHBD_TAIL_SUFFIX); ibp_blend(dst, stride, tmp, width, height, 1, dav2d_ibp_weights[mode_idx] 
HIGHBD_TAIL_SUFFIX); } } /* CFL EXPLICIT / IMPLICIT */ static NOINLINE void cfl_pred(pixel *const ptrs[6], const ptrdiff_t *stride, const int wpad, const int hpad, const int w, const int h, const unsigned flags, const int implicit, const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) { assert(wpad >= 0 && wpad * 4 < w / 2); assert(hpad >= 0 && hpad * 4 < h / 2); const int has_t = flags & CFL_HAS_TOP; const int has_l = flags & CFL_HAS_LEFT; const int xlim = w - 4 * wpad, ylim = h - 4 * hpad; const int skiph = w == 64, skipv = h == 64; const pixel *const ytop = ptrs[0], *const utop = ptrs[1], *const vtop = ptrs[2]; const pixel *ypx = ptrs[3], *uleft = ptrs[4] - 1, *vleft = ptrs[5] - 1; const ptrdiff_t ystride = PXSTRIDE(stride[0]), cstride = PXSTRIDE(stride[1]); int dc[3] = { 0 }, x, y; int n_top = 0, n_left = 0, i = 0; int sum_x = 0, sum_xx = 0, sum_y[2] = { 0 }, sum_xy[2] = { 0 }; pixel edge[3][8]; if (implicit) { if (has_t && has_l) { if (w > h * 2) { n_top = 8; n_left = 0; } else if (h > w * 2) { n_top = 0; n_left = 8; } else { n_top = 4; n_left = 4; } } else { n_top = has_t ? imin(8, w) : 0; n_left = has_l ? imin(8, h) : 0; } } if (has_l) { const pixel *yleft = ypx - (1 + ss_hor); int step; if (n_left) step = h >> ctz(n_left); int l; for (y = 0; y < ylim; y++) { if (!(ss_hor | ss_ver)) { l = yleft[0] << 3; } else if (!ss_ver) { if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) l = yleft[0] << 3; else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) l = (yleft[-1] + 2 * yleft[0] + yleft[1]) << 1; else // CFL_FLT_TYPE_UNIFORM l = (yleft[0] + yleft[1]) << 2; } else { if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) l = yleft[-1] + 4 * yleft[0] + yleft[1] + yleft[y ? 
-ystride : 0] + yleft[ystride]; else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) l = yleft[-1] + 2 * yleft[0] + yleft[1] + yleft[-1 + ystride] + 2 * yleft[ystride] + yleft[1 + ystride]; else // CFL_FLT_TYPE_UNIFORM l = (yleft[0] + yleft[1] + yleft[ystride] + yleft[1 + ystride]) << 1; } if (!skipv || !(y & 1)) { dc[0] += l; dc[1] += uleft[0]; dc[2] += vleft[0]; } if (implicit && n_left && !((y & (step - 1)) ^ (step >> 1))) { edge[0][i] = l >> 3; edge[1][i] = uleft[0]; edge[2][i] = vleft[0]; i++; } yleft += ystride << ss_ver; uleft += cstride; vleft += cstride; } for (y = ylim; y < h; y++) { if (!skipv || !(y & 1)) { dc[0] += l; dc[1] += uleft[-cstride]; dc[2] += vleft[-cstride]; } if (implicit && n_left && !((y & (step - 1)) ^ (step >> 1))) { edge[0][i] = l >> 3; edge[1][i] = uleft[-cstride]; edge[2][i] = vleft[-cstride]; i++; } } } if (has_t) { int step; if (n_top) step = w >> ctz(n_top); int l; for (x = 0; x < xlim; x++) { const int xl = x << ss_hor; if (!(ss_hor | ss_ver)) { l = ytop[xl] << 3; } else if (!ss_ver) { if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) l = ytop[xl] << 3; else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) l = (ytop[imax(0, xl - 1)] + 2 * ytop[xl] + ytop[xl + 1]) << 1; else // CFL_FLT_TYPE_UNIFORM l = (ytop[xl] + ytop[xl + 1]) << 2; } else { const int is_top_sb_edge = flags & CFL_IS_TOP_SB_EDGE; const ptrdiff_t bottom = is_top_sb_edge ? 
0 : ystride; if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) { l = ytop[imax(0, xl - 1)] + 4 * ytop[xl] + ytop[xl + 1] + ytop[xl - bottom] + ytop[xl + bottom]; } else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) { l = ytop[imax(0, xl - 1)] + 2 * ytop[xl] + ytop[xl + 1] + ytop[imax(0, xl - 1) + bottom] + 2 * ytop[xl + bottom] + ytop[xl + 1 + bottom]; } else { // CFL_FLT_TYPE_UNIFORM l = (ytop[xl] + ytop[xl + 1] + ytop[xl + bottom] + ytop[xl + 1 + bottom]) << 1; } } if (!skiph || !(x & 1)) { dc[0] += l; dc[1] += utop[x]; dc[2] += vtop[x]; } if (implicit && n_top && !((x & (step - 1)) ^ (step >> 1))) { edge[0][i] = l >> 3; edge[1][i] = utop[x]; edge[2][i] = vtop[x]; i++; } } for (x = xlim; x < w; x++) { if (!skiph || !(x & 1)) { dc[0] += l; dc[1] += utop[xlim - 1]; dc[2] += vtop[xlim - 1]; } if (implicit && n_top && !((x & (step - 1)) ^ (step >> 1))) { edge[0][i] = l >> 3; edge[1][i] = utop[xlim - 1]; edge[2][i] = vtop[xlim - 1]; i++; } } } if (!has_t && !has_l) { dc[0] = 4 << bitdepth_from_max(bitdepth_max); dc[1] = (BITDEPTH_MAX + 1) >> 1; dc[2] = (BITDEPTH_MAX + 1) >> 1; } else { const int npx = (has_t ? w >> skiph : 0) + (has_l ? 
h >> skipv : 0); if (!(npx & (npx - 1))) { dc[0] = (dc[0] + (npx >> 1)) >> ctz(npx); dc[1] = (dc[1] + (npx >> 1)) >> ctz(npx); dc[2] = (dc[2] + (npx >> 1)) >> ctz(npx); } else { dc[0] = fast_div32_dc(dc[0], npx); dc[1] = fast_div32_dc(dc[1], npx); dc[2] = fast_div32_dc(dc[2], npx); } } int alpha[2]; if (implicit) { assert(i == n_top + n_left); for (int i = 0; i < n_top + n_left; i++) { sum_x += edge[0][i]; sum_y[0] += edge[1][i]; sum_y[1] += edge[2][i]; sum_xx += edge[0][i] * edge[0][i]; sum_xy[0] += edge[0][i] * edge[1][i]; sum_xy[1] += edge[0][i] * edge[2][i]; } const int count_l2 = ctz(n_top + n_left); const int den = sum_xx - (int)(((int64_t)sum_x * sum_x) >> count_l2); for (int pl = 0; pl < 2; pl++) { int num = sum_xy[pl] - (int)(((int64_t)sum_x * sum_y[pl]) >> count_l2); alpha[pl] = derive_alpha(num, den, 0); } } else { const int shu = CFL_ALPHA_U_SHIFT - 5; const int shv = CFL_ALPHA_V_SHIFT - 5; alpha[0] = ((int16_t) (flags & CFL_ALPHA_U_MASK)) >> shu; alpha[1] = ((int32_t) (flags & CFL_ALPHA_V_MASK)) >> shv; } pixel *dst[2] = { ptrs[4], ptrs[5] }; for (int pl = 0; pl < 2; pl++) { if (!alpha[pl]) splat_dc(dst[pl], stride[1], w, h, dc[1 + pl]); } for (y = 0; y < ylim; y++) { for (x = 0; x < xlim; x++) { int ac; const int xl = x << ss_hor; const int left = imax(xl & -64, xl - 1); if (!(ss_hor | ss_ver)) { ac = ypx[x] << 3; } else if (!ss_ver) { if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) ac = ypx[xl] << 3; else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) ac = (ypx[left] + 2 * ypx[xl] + ypx[xl + 1]) << 1; else // CFL_FLT_TYPE_UNIFORM ac = (ypx[xl] + ypx[xl + 1]) << 2; } else { const ptrdiff_t bot = xl + ystride; if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_GAUSS) { const ptrdiff_t top = (y & 31) == 0 ? 
xl : (xl - ystride); ac = ypx[left] + 4 * ypx[xl] + ypx[xl + 1] + ypx[top] + ypx[bot]; } else if ((flags & CFL_FLT_TYPE) == CFL_FLT_TYPE_VSTRIP) { ac = ypx[left] + 2 * ypx[xl] + ypx[xl + 1] + ypx[left + ystride] + 2 * ypx[bot] + ypx[bot + 1]; } else { // CFL_FLT_TYPE_UNIFORM ac = (ypx[xl] + ypx[xl + 1] + ypx[bot] + ypx[bot + 1]) << 1; } } ac -= dc[0]; for (int pl = 0; pl < 2; pl++) { if (alpha[pl]) { int diff = alpha[pl] * ac; int val = dc[1 + pl] + apply_sign((abs(diff) + 1024) >> 11, diff); dst[pl][x] = iclip_pixel(val); } } } for (int pl = 0; pl < 2; pl++) if (alpha[pl]) { for (int xpad = x; xpad < w; xpad++) dst[pl][xpad] = dst[pl][x - 1]; dst[pl] += cstride; } ypx += ystride << ss_ver; } for (int pl = 0; pl < 2; pl++) if (alpha[pl]) for (y = ylim; y < h; y++) { memcpy(dst[pl], &dst[pl][-(1 + y - ylim) * cstride], w * sizeof(pixel)); dst[pl] += cstride; } } #define cfl_explicit_fn(fmt, ss_hor, ss_ver) \ static void cfl_explicit_##fmt##_c(pixel *const *ptrs, \ const ptrdiff_t *const stride, \ const int wpad, const int hpad, \ const int w, const int h, \ const unsigned flags HIGHBD_DECL_SUFFIX) \ { \ cfl_pred(ptrs, stride, wpad, hpad, w, h, flags, 0, ss_hor, ss_ver HIGHBD_TAIL_SUFFIX); \ } cfl_explicit_fn(420, 1, 1) cfl_explicit_fn(422, 1, 0) cfl_explicit_fn(444, 0, 0) #define cfl_implicit_fn(fmt, ss_hor, ss_ver) \ static void cfl_implicit_##fmt##_c(pixel *const *ptrs, \ const ptrdiff_t *const stride, \ const int wpad, const int hpad, \ const int w, const int h, \ const unsigned flags HIGHBD_DECL_SUFFIX) \ { \ cfl_pred(ptrs, stride, wpad, hpad, w, h, flags, 1, ss_hor, ss_ver HIGHBD_TAIL_SUFFIX); \ } cfl_implicit_fn(420, 1, 1) cfl_implicit_fn(422, 1, 0) cfl_implicit_fn(444, 0, 0) /* CFL MHCCP */ static NOINLINE void cfl_gen_y_420_c(pixel *dst, const ptrdiff_t dst_top_stride, const pixel *src, const pixel *const top_sb_edge, const ptrdiff_t src_stride, const int refw, const int refh, const int tw, const int th, const int flags, const int filter_type) { const int 
has_t = flags & CFL_HAS_TOP; const int has_l = flags & CFL_HAS_LEFT; const int dir = flags & CFL_DIR_ALL; const int n_left = has_l ? 1 + (dir == CFL_DIR_LEFT): 0; const int n_top = has_t ? 1 + (dir == CFL_DIR_TOP) : 0; pixel *dst_left = dst + n_top * dst_top_stride + 64 * 64; src -= n_left << 1; // :: #define FILTER_UNIFORM(src) \ (src[c] + src[r] + src[b + c] + src[b + r]) >> 2 // ::: #define FILTER_VSTRIP(src) \ (src[l] + 2 * src[c] + src[r] + src[b + l] + 2 * src[b + c] + src[b + r]) >> 3 // -|- #define FILTER_GAUSS(src, top) \ (src[l] + 4 * src[c] + src[r] + (top)[c] + src[b + c]) >> 3 // tl+t+tr if (has_t) { const pixel *top = top_sb_edge ? top_sb_edge - n_left * 2 : src - n_top * 2 * src_stride; const ptrdiff_t b = !top_sb_edge ? src_stride : 0; ptrdiff_t t = n_top == 1 ? -b : 0; for (int y = 0; y < n_top; y++) { int x = 0; for (; x < n_left; x++) { const int c = x * 2, r = c + 1; const int l = (n_left & 1) ? c - 1 : imax(c - 1, 0); switch (filter_type) { case CFL_FLT_TYPE_UNIFORM: dst_left[x] = FILTER_UNIFORM(top); break; case CFL_FLT_TYPE_VSTRIP: dst_left[x] = FILTER_VSTRIP(top); break; case CFL_FLT_TYPE_GAUSS: dst_left[x] = FILTER_GAUSS(top, &top[t]); break; } } for (; x < refw; x++) { const int c = x * 2, r = c + 1; const int l = n_left ? c - 1 : imax(c - 1, 0); switch (filter_type) { case CFL_FLT_TYPE_UNIFORM: dst[x - n_left] = FILTER_UNIFORM(top); break; case CFL_FLT_TYPE_VSTRIP: dst[x - n_left] = FILTER_VSTRIP(top); break; case CFL_FLT_TYPE_GAUSS: dst[x - n_left] = FILTER_GAUSS(top, &top[t]); break; } } if (!top_sb_edge) { top += 2 * src_stride; t = -src_stride; } dst_left += n_left; dst += dst_top_stride; } } // l+blk const ptrdiff_t b = src_stride; const pixel *top = has_t ? top_sb_edge ? top_sb_edge - n_left * 2 : src - src_stride : src; for (int y = 0; y < th; y++) { int x = 0; for (; x < n_left; x++) { const int c = x * 2, r = c + 1; const int l = (n_left & 1) ? 
c - 1 : imax(c - 1, 0); switch (filter_type) { case CFL_FLT_TYPE_UNIFORM: dst_left[x] = FILTER_UNIFORM(src); break; case CFL_FLT_TYPE_VSTRIP: dst_left[x] = FILTER_VSTRIP(src); break; case CFL_FLT_TYPE_GAUSS: dst_left[x] = FILTER_GAUSS(src, top); break; } } for (; x < n_left + tw; x++) { const int c = x * 2, r = c + 1; const int l = (n_left & 1) ? c - 1 : imax(c - 1, 0); switch (filter_type) { case CFL_FLT_TYPE_UNIFORM: dst[x - n_left] = FILTER_UNIFORM(src); break; case CFL_FLT_TYPE_VSTRIP: dst[x - n_left] = FILTER_VSTRIP(src); break; case CFL_FLT_TYPE_GAUSS: dst[x - n_left] = FILTER_GAUSS(src, top); break; } } src += src_stride << 1; top = src - src_stride; dst_left += n_left; dst += tw; } // bl const int n_bl = refh - th; for (int y = 0; y < n_bl; y++) { for (int x = 0; x < n_left; x++) { const int c = x * 2, r = c + 1; const int l = (n_left & 1) ? c - 1 : imax(c - 1, 0); switch (filter_type) { case CFL_FLT_TYPE_UNIFORM: dst_left[x] = FILTER_UNIFORM(src); break; case CFL_FLT_TYPE_VSTRIP: dst_left[x] = FILTER_VSTRIP(src); break; case CFL_FLT_TYPE_GAUSS: dst_left[x] = FILTER_GAUSS(src, top); break; } } src += src_stride << 1; top = src - src_stride; dst_left += n_left; } } #define cfl_gen_y_420_fn(ucflt, lcflt) \ static void \ cfl_gen_y_420_##lcflt##_c(pixel *const dst, const ptrdiff_t dst_top_stride, \ const pixel *const src, const pixel *const top_sb_edge, \ ptrdiff_t const src_stride, const int refw, const int refh, \ int const tw, int const th, int flags) \ { \ cfl_gen_y_420_c(dst, PXSTRIDE(dst_top_stride), src, top_sb_edge, \ PXSTRIDE(src_stride), refw, refh, tw, th, flags, \ CFL_FLT_TYPE_##ucflt); \ } #define cfl_gen_y_fn(fmt) \ cfl_gen_y_##fmt##_fn(UNIFORM, uniform) \ cfl_gen_y_##fmt##_fn(VSTRIP, vstrip) \ cfl_gen_y_##fmt##_fn(GAUSS, gauss) cfl_gen_y_fn(420) #define SQRND(v) (((v) * (v) + mid) >> bd) #define GEN_MATRIX() \ do { \ mat[0][0] += v0 * v0; \ mat[0][1] += v0 * v1; \ mat[0][2] += v0 << (bd - 1); \ mat[1][1] += v1 * v1; \ mat[1][2] += v1 << (bd - 1); 
\ } while (0); static void cfl_gen_mat_c(int32_t mat[3][3], uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES], const pixel *y, const ptrdiff_t y_top_stride, const int refw, const int refh, const int edge_flags, const enum CflMhDir dir HIGHBD_DECL_SUFFIX) { const int bd = bitdepth_from_max(bitdepth_max); const int mid = 1 << (bd - 1); const int has_t = !!(edge_flags & CFL_HAS_TOP); const int has_l = !!(edge_flags & CFL_HAS_LEFT); const int dir_t = dir == CFL_DIR_TOP; const int dir_l = dir == CFL_DIR_LEFT; const int n_top = has_t ? 1 + dir_t : 0; const int n_left = has_l ? 1 + dir_l : 0; const pixel *left = y + n_top * y_top_stride + 64 * 64; int n = 0; if (has_t) { for (int i = 0; i < n_left; i++, n++) { const int v0 = left[i]; const int v1 = SQRND(!i ? left[i + (dir_t | dir_l)] : y[0]); imat[0][n] = v0; imat[1][n] = v1; GEN_MATRIX(); } const int start = !dir_l && !has_l; for (int i = start; i < refw - n_left - 1 - !start; i++, n++) { const int v0 = y[i]; const int v1 = SQRND(y[dir_t * y_top_stride + i + dir_l]); imat[0][n] = v0; imat[1][n] = v1; GEN_MATRIX(); } } if (has_l) { const int start = dir_t && !has_t; for (int i = 1 - start; i < refh - start - 1; i++, n++) { const int v0 = left[i * n_left]; const int v1 = SQRND(left[(i + dir_t) * n_left + dir_l]); imat[0][n] = v0; imat[1][n] = v1; GEN_MATRIX(); } } mat[2][2] = n << ((bd - 1) << 1); const int nl2 = 31 - clz(n); const int mat_sh = 22 - 2 * bd - nl2 - !!(n & ((1 << nl2) - 1)); if (mat_sh > 0) for (int i = 0; i < 3; i++) for (int j = i; j < 3; j++) mat[i][j] <<= mat_sh; else if (mat_sh < 0) for (int i = 0; i < 3; i++) for (int j = i; j < 3; j++) mat[i][j] >>= -mat_sh; mat[0][0] += 2 << (bd - 8); mat[1][1] += 2 << (bd - 8); mat[2][2] += 2 << (bd - 8); mat[1][0] = mat[0][1]; mat[2][0] = mat[0][2]; mat[2][1] = mat[1][2]; } #define cfl_gen_mat_fn(name, dir) \ static void \ cfl_gen_mat_##name##_c(int32_t mat[3][3], uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES], \ const pixel *const y, const ptrdiff_t y_top_stride, \ 
const int refw, const int refh, const int edge_flags \ HIGHBD_DECL_SUFFIX) \ { \ cfl_gen_mat_c(mat, imat, y, PXSTRIDE(y_top_stride), refw, refh, \ edge_flags, dir HIGHBD_TAIL_SUFFIX); \ } cfl_gen_mat_fn(c, CFL_DIR_CENTER) cfl_gen_mat_fn(t, CFL_DIR_TOP) cfl_gen_mat_fn(l, CFL_DIR_LEFT) static void get_div_scale_sh(int d, int *scale, int *sh) { d = imax(1, abs(d)); *sh = ulog2(d); // 1. Normalize D into fixed-point format with 14 fractional bits. const int nsh = *sh - 14; if (nsh >= 0) { const int rnd = (nsh > 0) ? 1 << (nsh - 1) : 0; d = (d + rnd) >> nsh; } else { d <<= -nsh; } // 2. Clip the scaled value to make sure it's within the valid range // [1, 2), represented as: [1 << 14, (1 << 15) - 1]. // The rounding in 1. may push the value out of range, so clipping is needed. // XXX looks like this could just be an imin d = iclip(d, 1, 0x7fff); // 3. Extract the fractional part of the normalized denominator d. d &= (1 << 14) - 1; const int idx = d >> 11; const uint8_t coefw = dav2d_div_scale_sh_coefw[idx]; const uint16_t bias = dav2d_div_scale_sh_bias[idx]; d -= dav2d_div_scale_sh_offset[idx]; *scale = (((coefw * ((d * d) >> 14)) >> 8) - (d >> 1) + bias) << 2; } /* * Approximate ((a * b) + round) >> shift using only 32-bit intermediates. * Strategy: * 1. Right-shift a and/or b by sh1/sh2 so that (bits(a)-sh1)+(bits(b)-sh2) <= 31. * 2. Compensate in the final right shift: adj = sh - (ash + bsh). * 3. Perform symmetric round-to-nearest (round half away from zero). * This keeps all intermediates in 32-bit while keeping the mean error low. */ static int mul32(int a, int b, int sh) { const int a2 = ulog2(abs(a) | 1) + 1; const int b2 = ulog2(abs(b) | 1) + 1; // 1. Decide how many bits to drop in total to avoid mul overflow const int drop = a2 + b2 > 29 ? a2 + b2 - 29 : 0; // 2. 
Split the drop across a and b to minimize error const int ash = drop >> 1; const int bsh = drop - ash; const int adj = sh - (ash + bsh); const int mul = (a >> ash) * (b >> bsh); if (adj <= 0) return mul; assert(adj <= 29); // 3. Final right shift with symmetric rounding to nearest const unsigned bias = 1U << (adj - 1); return mul >= 0 ? (int) (((unsigned) mul + bias) >> adj) : -(int) (((unsigned) -mul + bias) >> adj); } static void cfl_calc_alphas_c(int alpha[3], const pixel *const c, const pixel *const top_sb_edge, ptrdiff_t stride, const int refw, const int refh, int32_t mat[3][3], const uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES], const int edge_flags HIGHBD_DECL_SUFFIX) { const int bd = bitdepth_from_max(bitdepth_max); const int has_t = !!(edge_flags & CFL_HAS_TOP); const int has_l = !!(edge_flags & CFL_HAS_LEFT); int n = 0; if (has_t) { const pixel *const top = top_sb_edge ? top_sb_edge - has_l : c - PXSTRIDE(stride) - has_l; const int start = !has_l; for (int i = start; i < refw - 1 - !start; i++, n++) { alpha[0] += imat[0][n] * top[i]; alpha[1] += imat[1][n] * top[i]; alpha[2] += top[i] << (bd - 1); } } if (has_l) { for (int i = !has_t; i < refh - 1 - has_t; i++, n++) { const int v = c[i * PXSTRIDE(stride) - 1]; alpha[0] += imat[0][n] * v; alpha[1] += imat[1][n] * v; alpha[2] += v << (bd - 1); } } const int nl2 = 31 - clz(n); const int mat_sh = 22 - 2 * bd - nl2 - !!(n & ((1 << nl2) - 1)); if (mat_sh > 0) { alpha[0] <<= mat_sh; alpha[1] <<= mat_sh; alpha[2] <<= mat_sh; } else if (mat_sh < 0) { alpha[0] >>= -mat_sh; alpha[1] >>= -mat_sh; alpha[2] >>= -mat_sh; } // alpha holds the results of the system of linear equations // Gaussian elimination int tmp[3][2], scale, sh; // row0 get_div_scale_sh(mat[0][0], &scale, &sh); tmp[0][0] = mul32(mat[0][1], scale, sh); tmp[0][1] = mul32(mat[0][2], scale, sh); alpha[0] = mul32(alpha[0], scale, sh); tmp[1][0] = mat[1][1] - mul32(mat[1][0], tmp[0][0], 16); tmp[1][1] = mat[1][2] - mul32(mat[1][0], tmp[0][1], 16); alpha[1] 
-= mul32(mat[1][0], alpha[0], 16); tmp[2][0] = mat[2][1] - mul32(mat[2][0], tmp[0][0], 16); tmp[2][1] = mat[2][2] - mul32(mat[2][0], tmp[0][1], 16); alpha[2] -= mul32(mat[2][0], alpha[0], 16); // row1 get_div_scale_sh(tmp[1][0], &scale, &sh); tmp[1][1] = mul32(tmp[1][1], scale, sh); alpha[1] = mul32(alpha[1], scale, sh); tmp[2][1] -= mul32(tmp[2][0], tmp[1][1], 16); alpha[2] -= mul32(tmp[2][0], alpha[1], 16); // row2 get_div_scale_sh(tmp[2][1], &scale, &sh); alpha[2] = mul32(alpha[2], scale, sh); alpha[1] -= mul32(tmp[1][1], alpha[2], 16); alpha[0] -= mul32(tmp[0][0], alpha[1], 16) + mul32(tmp[0][1], alpha[2], 16); } static void cfl_mhccp_pred_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *src, const ptrdiff_t src_top_stride, const int w, const int h, const int alpha[3], const int edge_flags, const enum CflMhDir dir HIGHBD_DECL_SUFFIX) { const int bd = bitdepth_from_max(bitdepth_max); const int mid = 1 << (bd - 1); const int has_t = !!(edge_flags & CFL_HAS_TOP); const int has_l = !!(edge_flags & CFL_HAS_LEFT); const int dir_t = dir == CFL_DIR_TOP; const int dir_l = dir == CFL_DIR_LEFT; const int n_top = has_t ? 1 + dir_t : 0; const int n_left = has_l ? 1 + dir_l : 0; const pixel *const left = src + 64 * 64 + n_left * n_top; const int a2v2 = mul32(alpha[2], mid, 16); int y = 0; for (; y < dir_t && has_t; y++) { for (int x = 0; x < w; x++) { const int v0 = src[x - src_top_stride]; const int v1 = SQRND(src[x]); dst[x] = iclip_pixel(mul32(alpha[0], v0, 16) + mul32(alpha[1], v1, 16) + a2v2); } src += w; dst += dst_stride; } for (; y < h; y++) { int x = 0; for (; x < dir_l && has_l; x++) { const int v0 = left[y * n_left + dir_l]; const int v1 = SQRND(src[0]); dst[0] = iclip_pixel(mul32(alpha[0], v0, 16) + mul32(alpha[1], v1, 16) + a2v2); } for (; x < w; x++) { const int v0 = src[dir_t ? x - ((!!y) | has_t) * w : dir_l ? 
imax(x - 1, 0) : x]; const int v1 = SQRND(src[x]); dst[x] = iclip_pixel(mul32(alpha[0], v0, 16) + mul32(alpha[1], v1, 16) + a2v2); } src += w; dst += dst_stride; } } #define cfl_mhccp_pred_fn(name, dir) \ static void \ cfl_mhccp_pred_##name##_c(pixel *dst, ptrdiff_t dst_stride, \ const pixel *src, const ptrdiff_t src_top_stride, \ int w, int h, const int alpha[3], int edge_flags \ HIGHBD_DECL_SUFFIX) \ { \ cfl_mhccp_pred_c(dst, PXSTRIDE(dst_stride), src, PXSTRIDE(src_top_stride), \ w, h, alpha, edge_flags, dir HIGHBD_TAIL_SUFFIX); \ } cfl_mhccp_pred_fn(c, CFL_DIR_CENTER) cfl_mhccp_pred_fn(t, CFL_DIR_TOP) cfl_mhccp_pred_fn(l, CFL_DIR_LEFT) static void pal_pred_c(pixel *dst, const ptrdiff_t stride, const pixel *const pal, const uint8_t *idx, const int w, const int h) { assert(w * h >= 64); // >= 4x16, >= 8x8 or >= 16x4; 4x4/4x8/8x4 are not allowed for (int y = 0; y < h; y++) { for (int x = 0; x < w; x += 2) { const int i = *idx++; assert(!(i & 0x88)); dst[x + 0] = pal[i & 7]; dst[x + 1] = pal[i >> 4]; } dst += PXSTRIDE(stride); } } static void ipred_dip_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, const int width, const int height, int mode, const int max_width, const int max_height HIGHBD_DECL_SUFFIX) { const int trans = !!(mode & 16); const int wd = width >> 2; const int hd = height >> 2; const int wl2 = ulog2(wd); const int hl2 = ulog2(hd); const int wrnd = width >> 3; const int hrnd = height >> 3; const int i_t = 1 + 4 * trans; const int i_l = 5 - 4 * trans; pixel in[11]; int sum; int in_sum = in[0] = topleft[0]; const pixel *tl = &topleft[1]; for (int i = 0; i < 4; i++) { sum = 0; for (int x = 0; x < wd; x++) sum += *tl++; in_sum += in[i_t + i] = (sum + wrnd) >> wl2; } tl = &topleft[-1]; for (int i = 0; i < 4; i++) { sum = 0; for (int y = 0; y < hd; y++) sum += *tl--; in_sum += in[i_l + i] = (sum + hrnd) >> hl2; } sum = 0; for (int x = 0; x < wd; x++) sum += topleft[x + width + 1]; in_sum += in[9 + trans] = (sum + wrnd) >> wl2; sum = 0; for 
(int y = 0; y < hd; y++) sum += topleft[-(y + height + 1)]; in_sum += in[10 - trans] = (sum + hrnd) >> hl2; const int m = mode & 7; assert(m < 6); int uwl2 = wl2 - 1; int dwl2 = 0; if (uwl2 < 0) { dwl2 = -uwl2; uwl2 = 0; } const int step_x = 1 << uwl2; const int dw = 1 << dwl2; int uhl2 = hl2 - 1; int dhl2 = 0; if (uhl2 < 0) { dhl2 = -uhl2; uhl2 = 0; } const int step_y = 1 << uhl2; const int dh = 1 << dhl2; const int grid_h = 8 >> dhl2; const int grid_w = 8 >> dwl2; // Run DIP prediction at each coarse grid position int y = step_y - 1; for (int gy = 0; gy < grid_h; gy++) { const int iy = gy * dh; int x = step_x - 1; for (int gx = 0; gx < grid_w; gx++) { const int ix = gx * dw; const int idx = trans ? (ix * 8 + iy) : (iy * 8 + ix); int sum = 0; for (int i = 0; i < 11; i++) { sum += dav2d_dip_weights[m][idx][i] * in[i]; } dst[y * PXSTRIDE(stride) + x] = iclip_pixel(((sum + 2048) >> 12) - in_sum); x += step_x; } y += step_y; } if (step_x > 1) { // Horizontal interpolation between coarse DIP samples y = step_y - 1; for (int gy = 0; gy < grid_h; gy++) { int p1 = topleft[-(y + 1)]; int x = 0; for (int gx = 0; gx < grid_w; gx++) { const int p0 = p1; p1 = dst[y * PXSTRIDE(stride) + x + step_x - 1]; for (int z = 0; z < step_x - 1; z++) { const int z1 = z + 1; dst[y * PXSTRIDE(stride) + x + z] = (p0 * (step_x - z1) + (p1 * z1)) >> uwl2; } x += step_x; } y += step_y; } } if (step_y > 1) { // Vertical interpolation between coarse DIP samples. 
for (int x = 0; x < width; x++) { int p1 = topleft[x + 1]; y = 0; for (int gy = 0; gy < grid_h; gy++) { const int p0 = p1; p1 = dst[(y + step_y - 1) * PXSTRIDE(stride) + x]; for (int z = 0; z < step_y - 1; z++) { const int z1 = z + 1; dst[(y + z) * PXSTRIDE(stride) + x] = (p0 * (step_y - z1) + (p1 * z1)) >> uhl2; } y += step_y; } } } } #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/ipred.h" #elif ARCH_RISCV #include "src/riscv/ipred.h" #elif ARCH_X86 #include "src/x86/ipred.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/ipred.h" #endif #endif COLD void bitfn(dav2d_intra_pred_dsp_init)(Dav2dIntraPredDSPContext *const c) { c->intra_pred[DC_PRED ] = ipred_dc_c; c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c; c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c; c->intra_pred[HOR_PRED ] = ipred_h_c; c->intra_pred[VERT_PRED ] = ipred_v_c; c->intra_pred[PAETH_PRED ] = ipred_paeth_c; c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c; c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c; c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c; c->intra_pred[Z1_PRED ] = ipred_z1_c; c->intra_pred[Z2_PRED ] = ipred_z2_c; c->intra_pred[Z3_PRED ] = ipred_z3_c; c->intra_pred[DIP_PRED ] = ipred_dip_c; c->cfl_pred[CFL_EXPLICIT][DAV2D_PIXEL_LAYOUT_I420 - 1] = cfl_explicit_420_c; c->cfl_pred[CFL_EXPLICIT][DAV2D_PIXEL_LAYOUT_I422 - 1] = cfl_explicit_422_c; c->cfl_pred[CFL_EXPLICIT][DAV2D_PIXEL_LAYOUT_I444 - 1] = cfl_explicit_444_c; c->cfl_pred[CFL_IMPLICIT][DAV2D_PIXEL_LAYOUT_I420 - 1] = cfl_implicit_420_c; c->cfl_pred[CFL_IMPLICIT][DAV2D_PIXEL_LAYOUT_I422 - 1] = cfl_implicit_422_c; c->cfl_pred[CFL_IMPLICIT][DAV2D_PIXEL_LAYOUT_I444 - 1] = cfl_implicit_444_c; #define assign_cfl_mhccp(dir, name) \ c->cfl_gen_mat[dir] = cfl_gen_mat_##name##_c; \ c->cfl_mhccp_pred[dir] = cfl_mhccp_pred_##name##_c; c->cfl_gen_y[DAV2D_PIXEL_LAYOUT_I420 - 1][0] = cfl_gen_y_420_uniform_c; c->cfl_gen_y[DAV2D_PIXEL_LAYOUT_I420 - 1][1] = cfl_gen_y_420_vstrip_c; 
c->cfl_gen_y[DAV2D_PIXEL_LAYOUT_I420 - 1][2] = cfl_gen_y_420_gauss_c; c->cfl_calc_alphas = cfl_calc_alphas_c; assign_cfl_mhccp(CFL_DIR_CENTER, c); assign_cfl_mhccp(CFL_DIR_TOP , t); assign_cfl_mhccp(CFL_DIR_LEFT , l); c->pal_pred = pal_pred_c; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM intra_pred_dsp_init_arm(c); #elif ARCH_RISCV intra_pred_dsp_init_riscv(c); #elif ARCH_X86 intra_pred_dsp_init_x86(c); #elif ARCH_LOONGARCH64 intra_pred_dsp_init_loongarch(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/itx.h000066400000000000000000000052261517466257200216150ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_ITX_H #define DAV2D_SRC_ITX_H #include #include "common/bitdepth.h" #include "src/levels.h" #define decl_cctx_fn(name) \ void (name)(coef *u, coef *v, const int16_t angle[3], \ size_t sz HIGHBD_DECL_SUFFIX) typedef decl_cctx_fn(*cctx_fn); #define decl_itxfm_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, \ enum TxfmType txtp, int eob HIGHBD_DECL_SUFFIX) typedef decl_itxfm_fn(*itxfm_fn); #define decl_itx_w_fns(w, ext) \ decl_itxfm_fn(BF(dav2d_inv_txfm_add_##w##x4, ext)); \ decl_itxfm_fn(BF(dav2d_inv_txfm_add_##w##x8, ext)); \ decl_itxfm_fn(BF(dav2d_inv_txfm_add_##w##x16, ext)); \ decl_itxfm_fn(BF(dav2d_inv_txfm_add_##w##x32, ext)); \ decl_itxfm_fn(BF(dav2d_inv_txfm_add_##w##x64, ext)) #define decl_itx_fns(ext) \ decl_itx_w_fns( 4, ext); \ decl_itx_w_fns( 8, ext); \ decl_itx_w_fns(16, ext); \ decl_itx_w_fns(32, ext); \ decl_itx_w_fns(64, ext) typedef struct Dav2dInvTxfmDSPContext { cctx_fn cctx; itxfm_fn itxfm_add[N_RECT_TX_SIZES]; } Dav2dInvTxfmDSPContext; bitfn_decls(void dav2d_itx_dsp_init, Dav2dInvTxfmDSPContext *c); #define assign_itx_fn(pfx, w, h, ext) \ c->itxfm_add[pfx##TX_##w##X##h] = BF(dav2d_inv_txfm_add_##w##x##h, ext) #endif /* DAV2D_SRC_ITX_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/itx_1d.c000066400000000000000000000321261517466257200221730ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights 
reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include "common/intops.h" #include "src/itx_1d.h" static const int8_t dct8_kernel[4 * 4] = { 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89, }; static const int8_t dct16_kernel[8 * 8] = { 90, 87, 80, 70, 57, 43, 26, 9, 87, 57, 9, -43, -80, -90, -70, -26, 80, 9, -70, -87, -26, 57, 90, 43, 70, -43, -87, 9, 90, 26, -80, -57, 57, -80, -26, 90, -9, -87, 43, 70, 43, -90, 57, 26, -87, 70, 9, -80, 26, -70, 90, -80, 43, 9, -57, 87, 9, -26, 43, -57, 70, -80, 87, -90, }; static const int8_t dct32_kernel[16 * 16] = { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 47, 39, 30, 22, 13, 4, 90, 82, 67, 47, 22, -4, -30, -54, -73, -85, -90, -88, -78, -61, -39, -13, 88, 67, 30, -13, -54, -82, -90, -78, -47, -4, 39, 73, 90, 85, 61, 22, 85, 47, -13, -67, -90, -73, -22, 39, 82, 88, 54, -4, -61, -90, -78, -30, 82, 22, -54, -90, -61, 13, 78, 85, 30, -47, -90, -67, 4, 73, 88, 39, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 30, 90, 54, -39, -90, -47, 73, -30, -90, -22, 78, 67, -39, -90, -13, 82, 61, -47, -88, -4, 85, 54, 67, -54, -78, 39, 85, -22, -90, 4, 90, 13, -88, -30, 82, 47, -73, -61, 61, -73, -47, 82, 30, -88, -13, 90, -4, -90, 22, 85, -39, -78, 54, 67, 54, -85, -4, 88, -47, -61, 82, 13, -90, 39, 67, -78, -22, 90, -30, -73, 47, -90, 39, 54, -90, 30, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 39, -88, 73, -4, -67, 90, -47, -30, 85, -78, 13, 61, -90, 54, 22, -82, 30, -78, 90, -61, 4, 54, -88, 82, -39, -22, 73, -90, 67, -13, -47, 85, 22, -61, 85, -90, 73, -39, -4, 47, -78, 90, -82, 54, -13, -30, 67, -88, 13, -39, 61, -78, 88, -90, 85, -73, 54, -30, 4, 22, -47, 67, -82, 90, 4, -13, 22, -30, 39, -47, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, }; static const int8_t adst4_kernel[4 * 4] = { 18, 50, 75, 89, 50, 89, 18, -75, 75, 18, -89, 50, 89, -75, 50, -18, }; static const int8_t adst8_kernel[8 * 8] = { 11, 34, 54, 71, 84, 88, 79, 50, 28, 74, 89, 68, 17, -44, -83, -69, 44, 89, 48, -41, -89, -44, 50, 81, 58, 76, -34, -86, 10, 88, 6, 
-84, 70, 39, -87, 1, 86, -44, -59, 78, 79, -12, -66, 87, -35, -44, 86, -62, 86, -58, 12, 38, -75, 88, -74, 40, 89, -86, 79, -70, 58, -44, 29, -14, }; static const int8_t adst16_kernel[16 * 16] = { 8, 25, 41, 55, 67, 77, 84, 88, 89, 87, 81, 73, 62, 48, 33, 17, 17, 48, 73, 87, 88, 77, 55, 25, -8, -41, -67, -84, -89, -81, -62, -33, 25, 67, 88, 81, 48, 0, -48, -81, -88, -67, -25, 25, 67, 88, 81, 48, 33, 81, 84, 41, -25, -77, -87, -48, 17, 73, 88, 55, -8, -67, -89, -62, 41, 88, 62, -17, -81, -77, -8, 67, 87, 33, -48, -89, -55, 25, 84, 73, 48, 88, 25, -67, -81, 0, 81, 67, -25, -88, -48, 48, 88, 25, -67, -81, 55, 81, -17, -89, -25, 77, 62, -48, -84, 8, 88, 33, -73, -67, 41, 87, 62, 67, -55, -73, 48, 77, -41, -81, 33, 84, -25, -87, 17, 88, -8, -89, 67, 48, -81, -25, 88, 0, -88, 25, 81, -48, -67, 67, 48, -81, -25, 88, 73, 25, -89, 33, 67, -77, -17, 88, -41, -62, 81, 8, -87, 48, 55, -84, 77, 0, -77, 77, 0, -77, 77, 0, -77, 77, 0, -77, 77, 0, -77, 77, 81, -25, -48, 88, -67, 0, 67, -88, 48, 25, -81, 81, -25, -48, 88, -67, 84, -48, -8, 62, -88, 77, -33, -25, 73, -89, 67, -17, -41, 81, -87, 55, 87, -67, 33, 8, -48, 77, -89, 81, -55, 17, 25, -62, 84, -88, 73, -41, 88, -81, 67, -48, 25, 0, -25, 48, -67, 81, -88, 88, -81, 67, -48, 25, 89, -88, 87, -84, 81, -77, 73, -67, 62, -55, 48, -41, 33, -25, 17, -8, }; static const int8_t flipadst4_kernel[4 * 4] = { 89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89, }; static const int8_t flipadst16_kernel[16 * 16] = { 89, 88, 87, 84, 81, 77, 73, 67, 62, 55, 48, 41, 33, 25, 17, 8, 88, 81, 67, 48, 25, 0, -25, -48, -67, -81, -88, -88, -81, -67, -48, -25, 87, 67, 33, -8, -48, -77, -89, -81, -55, -17, 25, 62, 84, 88, 73, 41, 84, 48, -8, -62, -88, -77, -33, 25, 73, 89, 67, 17, -41, -81, -87, -55, 81, 25, -48, -88, -67, 0, 67, 88, 48, -25, -81, -81, -25, 48, 88, 67, 77, 0, -77, -77, 0, 77, 77, 0, -77, -77, 0, 77, 77, 0, -77, -77, 73, -25, -89, -33, 67, 77, -17, -88, -41, 62, 81, -8, -87, -48, 55, 84, 67, -48, -81, 25, 88, 0, -88, 
-25, 81, 48, -67, -67, 48, 81, -25, -88, 62, -67, -55, 73, 48, -77, -41, 81, 33, -84, -25, 87, 17, -88, -8, 89, 55, -81, -17, 89, -25, -77, 62, 48, -84, -8, 88, -33, -73, 67, 41, -87, 48, -88, 25, 67, -81, 0, 81, -67, -25, 88, -48, -48, 88, -25, -67, 81, 41, -88, 62, 17, -81, 77, -8, -67, 87, -33, -48, 89, -55, -25, 84, -73, 33, -81, 84, -41, -25, 77, -87, 48, 17, -73, 88, -55, -8, 67, -89, 62, 25, -67, 88, -81, 48, 0, -48, 81, -88, 67, -25, -25, 67, -88, 81, -48, 17, -48, 73, -87, 88, -77, 55, -25, -8, 41, -67, 84, -89, 81, -62, 33, 8, -25, 41, -55, 67, -77, 84, -88, 89, -87, 81, -73, 62, -48, 33, -17, }; static const int8_t ddt8_kernel[8 * 8] = { 4, 6, 22, 57, 96, 103, 78, 56, 7, 14, 48, 94, 73, -17, -79, -96, 15, 36, 85, 76, -43, -80, 7, 98, 33, 77, 88, -26, -69, 56, 56, -77, 65, 100, 0, -73, 55, 15, -82, 54, 98, 45, -86, 34, 20, -66, 79, -33, 106, -57, -23, 54, -71, 75, -56, 19, 80, -98, 82, -66, 53, -41, 26, -6, }; static const int8_t ddt16_kernel[16 * 16] = { 12, 17, 37, 45, 47, 60, 64, 82, 89, 100, 92, 84, 69, 50, 51, 44, 15, 23, 49, 60, 60, 74, 70, 73, 48, 9, -35, -71, -83, -79, -89, -95, 19, 30, 60, 69, 61, 64, 40, 3, -53, -99, -91, -46, 2, 47, 73, 124, 23, 38, 69, 73, 49, 28, -19, -80, -96, -45, 42, 88, 75, 14, -17,-126, 30, 48, 75, 66, 19, -31, -79, -91, -5, 84, 71, -16, -78, -60, -45, 108, 39, 61, 75, 40, -29, -87, -78, 10, 89, 36, -69, -67, 18, 67, 89, -81, 51, 76, 61, -8, -77, -82, 11, 94, 16, -81, -22, 79, 50, -37,-103, 54, 66, 87, 29, -65, -83, 4, 92, 18, -83, 4, 85, -22, -85, -6, 97, -30, 78, 83, -18, -91, -16, 88, 28, -84, 12, 73, -60, -46, 81, 49, -83, 16, 88, 59, -67, -57, 75, 54, -85, -5, 75, -60, -17, 84, -43, -80, 71, -6, 94, 19, -96, 21, 93, -55, -41, 80, -51, -17, 77, -68, -6, 98, -56, 1, 97, -30, -83, 86, 3, -77, 82, -17, -43, 76, -70, 15, 53, -99, 44, 3, 93, -73, -28, 81, -92, 29, 39, -70, 81, -55, 11, 46, -81, 90, -31, -4, 83, -99, 40, 8, -74, 88, -83, 47, -14, -21, 56, -83, 88, -71, 22, 5, 68, -99, 84, -69, 32, 3, -37, 55, -75, 81, -83, 
82, -69, 48, -11, -3, 50, -76, 83, -90, 97, -86, 83, -68, 67, -56, 49, -40, 32, -19, 5, 2, }; static NOINLINE void inv_dct_1d_c(int32_t *const c, const ptrdiff_t stride, const int8_t *mat, const int n) { int32_t a[16], b[16]; const int k = n * 2 - 1; assert(stride > 0); for (int i = 0; i < n; i++) { int sum = 0; for (int j = 1; j <= k; j += 2) sum += *mat++ * c[j * stride]; a[i] = c[i * 2 * stride]; b[i] = sum; } for (int i = 0; i < n; i++) { c[(i ) * stride] = a[i] + b[i]; c[(k - i) * stride] = a[i] - b[i]; } } static NOINLINE void inv_dct4_1d_c(int32_t *const c, const ptrdiff_t stride) { const int a0 = c[0 * stride] * 64 + c[2 * stride] * 64; const int a1 = c[0 * stride] * 64 - c[2 * stride] * 64; const int b0 = c[1 * stride] * 83 + c[3 * stride] * 35; const int b1 = c[1 * stride] * 35 - c[3 * stride] * 83; c[0 * stride] = a0 + b0; c[1 * stride] = a1 + b1; c[2 * stride] = a1 - b1; c[3 * stride] = a0 - b0; } static NOINLINE void inv_dct8_1d_c(int32_t *const c, const ptrdiff_t stride) { inv_dct4_1d_c(c, 2 * stride); inv_dct_1d_c(c, stride, dct8_kernel, 4); } static NOINLINE void inv_dct16_1d_c(int32_t *const c, const ptrdiff_t stride) { inv_dct8_1d_c(c, 2 * stride); inv_dct_1d_c(c, stride, dct16_kernel, 8); } static void inv_dct32_1d_c(int32_t *const c, const ptrdiff_t stride) { inv_dct16_1d_c(c, 2 * stride); inv_dct_1d_c(c, stride, dct32_kernel, 16); } static NOINLINE void inv_dst_1d_c(int32_t *c, ptrdiff_t stride, const int8_t *mat, const int n, const int f) { int32_t sums[16]; assert(stride > 0); for (int i = 0; i < n; i++) { int sum = 0; for (int j = 0; j < n; j++) sum += *mat++ * c[j * stride]; sums[i] = sum; } if (f) { c += f * stride; stride = -stride; } for (int i = 0; i < n; i++) c[i * stride] = sums[i]; } #define inv_dst_1d(type, kernel, sz, flip) \ static void inv_##type##sz##_1d_c(int32_t *const c, const ptrdiff_t stride) { \ inv_dst_1d_c(c, stride, kernel##sz##_kernel, sz, flip ? 
sz - 1 : 0); \ } \ inv_dst_1d(adst, adst, 4, 0); inv_dst_1d(adst, adst, 8, 0); inv_dst_1d(adst, adst, 16, 0); inv_dst_1d(flipadst, flipadst, 4, 0); inv_dst_1d(flipadst, adst, 8, 1); inv_dst_1d(flipadst, flipadst, 16, 0); inv_dst_1d(ddt, ddt, 8, 0); inv_dst_1d(ddt, ddt, 16, 0); inv_dst_1d(flipddt, ddt, 8, 1); inv_dst_1d(flipddt, ddt, 16, 1); static void inv_identity4_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); for (int i = 0; i < 4; i++) { c[stride * i] *= 128; } } static void inv_identity8_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); for (int i = 0; i < 8; i++) { c[stride * i] *= 181; } } static void inv_identity16_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); for (int i = 0; i < 16; i++) { c[stride * i] *= 256; } } static void inv_identity32_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); for (int i = 0; i < 32; i++) c[stride * i] *= 362; } const itx_1d_fn dav2d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES - 1] = { [TX_4X4] = { [DCT] = inv_dct4_1d_c, [ADST] = inv_adst4_1d_c, [FLIPADST] = inv_flipadst4_1d_c, [IDENTITY] = inv_identity4_1d_c, }, [TX_8X8] = { [DCT] = inv_dct8_1d_c, [ADST] = inv_adst8_1d_c, [FLIPADST] = inv_flipadst8_1d_c, [IDENTITY] = inv_identity8_1d_c, [DDT] = inv_ddt8_1d_c, [FLIPDDT] = inv_flipddt8_1d_c, }, [TX_16X16] = { [DCT] = inv_dct16_1d_c, [ADST] = inv_adst16_1d_c, [FLIPADST] = inv_flipadst16_1d_c, [IDENTITY] = inv_identity16_1d_c, [DDT] = inv_ddt16_1d_c, [FLIPDDT] = inv_flipddt16_1d_c, }, [TX_32X32] = { [DCT] = inv_dct32_1d_c, [IDENTITY] = inv_identity32_1d_c, }, [TX_64X64] = { [DCT] = inv_dct32_1d_c, }, }; void dav2d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) { assert(stride > 0); const int in0 = c[0 * stride], in1 = c[1 * stride]; const int in2 = c[2 * stride], in3 = c[3 * stride]; const int t0 = in0 + in1; const int t2 = in2 - in3; const int t4 = (t0 - t2) >> 1; const int t3 = t4 - in3; const int t1 = t4 - in1; c[0 * stride] = t0 - t3; c[1 * 
stride] = t3; c[2 * stride] = t1; c[3 * stride] = t2 + t1; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/itx_1d.h000066400000000000000000000033551517466257200222020ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include "src/levels.h" #ifndef DAV2D_SRC_ITX_1D_H #define DAV2D_SRC_ITX_1D_H typedef void (*itx_1d_fn)(int32_t *c, ptrdiff_t stride); EXTERN const itx_1d_fn dav2d_tx1d_fns[N_TX_SIZES][N_TX_1D_TYPES - 1]; void dav2d_inv_wht4_1d_c(int32_t *c, ptrdiff_t stride); #endif /* DAV2D_SRC_ITX_1D_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/itx_tmpl.c000066400000000000000000000253071517466257200226460ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include "common/attributes.h" #include "common/intops.h" #include "src/itx.h" #include "src/itx_1d.h" #include "src/scan.h" #include "src/tables.h" static void cctx_c(coef *const u, coef *const v, const int16_t angle[3], const size_t sz HIGHBD_DECL_SUFFIX) { assert(!(sz & (sz - 1)) && sz >= 4 * 4 && sz <= 32 * 32); const int bd = bitdepth_from_max(bitdepth_max); const int min = -(1 << (bd + 7)); const int max = (1 << (bd + 7)) - 1; const int sina = angle[0]; const int cosa = angle[1]; assert(angle[2] == -sina); for (size_t i = 0; i < sz; i++) { const int a = u[i] * cosa - v[i] * sina; const int b = u[i] * sina + v[i] * cosa; u[i] = iclip((a + 128 - (a < 0)) >> 8, min, max); v[i] = iclip((b + 128 - (b < 0)) >> 8, min, max); } } static void residual_add(pixel *dst, const ptrdiff_t stride, const int32_t *c, const int w, const int h, const int rnd, const int shift, const enum TxfmType txtp HIGHBD_DECL_SUFFIX) { const int dpcm_flag = txtp >> 8; assert(!dpcm_flag || (txtp & 0xe7) == IDTX || (txtp & 0xe7) == WHT_WHT); switch (dpcm_flag) { default: assert(0); case 0: for (int y = 0; y < h; y++, dst += PXSTRIDE(stride)) for (int x = 0; x < w; x++) dst[x] = iclip_pixel(dst[x] + ((*c++ + rnd) >> shift)); break; case 1: for (int y = 0; y < h; y++, dst += PXSTRIDE(stride)) for (int x = 0, acc = 0; x < w; x++) { acc += (*c++ + rnd) >> shift; dst[x] = iclip_pixel(dst[x] + acc); } break; case 2: for (int x = 0; x < w; x++, c++, dst++) for (int y = 0, acc = 0; y < h; y++) { acc += (c[y * w] + rnd) >> shift; dst[y * PXSTRIDE(stride)] = iclip_pixel(dst[y * PXSTRIDE(stride)] + acc); } break; } } static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, coef *const coeff, const enum TxfmType txtp, const int eob HIGHBD_DECL_SUFFIX) { int32_t tmp[4 * 4], *c = tmp; for (int y = 0; y < 4; y++, c += 4) { for (int x = 0; x < 4; x++) c[x] = coeff[y + x * 4] >> 3; dav2d_inv_wht4_1d_c(c, 1); } memset(coeff, 0, 
sizeof(*coeff) * 4 * 4);
    for (int x = 0; x < 4; x++)
        dav2d_inv_wht4_1d_c(&tmp[x], 4);
    residual_add(dst, stride, tmp, 4, 4, 0, 0, txtp HIGHBD_TAIL_SUFFIX);
}

/*
 * Generic C inverse transform: rebuilds the residual of one transform block
 * from `coeff` and adds it to the pixels at `dst`.
 *
 *   dst, stride  destination pixels, updated in place (clipped via
 *                iclip_pixel in the paths written out below)
 *   coeff        input coefficients; cleared by the paths visible here
 *                (coeff[0] in the DC-only case, sw * sh entries otherwise)
 *   txtp         transform type: bits 0-2 pick the first 1D transform,
 *                bits 3-4 the TxClass, bits 5-7 the second 1D transform
 *   eob          bounds how much of the first pass is run; asserted to be
 *                in [0, 32 * 32)
 *   tx           index into dav2d_txfm_dimensions[] / dav2d_tx_shift[]
 */
static NOINLINE void
inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, coef *const coeff,
               const enum TxfmType txtp, const int eob,
               const /*enum RectTxfmSize*/ int tx HIGHBD_DECL_SUFFIX)
{
    /* WHT_WHT has its own routine and is only expected for TX_4X4. */
    if ((txtp & 0xff) == WHT_WHT) {
        assert(tx == TX_4X4);
        inv_txfm_add_wht_wht_4x4_c(dst, stride, coeff, txtp, eob
                                   HIGHBD_TAIL_SUFFIX);
        return;
    }

    const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx];
    const uint8_t *const tx_shift = dav2d_tx_shift[tx];
    /* t_dim->w/h are scaled by 4 here; the result must be a power of two
     * in the range 4..64 (see the asserts). */
    const int w = 4 * t_dim->w, h = 4 * t_dim->h;
    assert(!(w & (w - 1)) && w >= 4 && w <= 64);
    assert(!(h & (h - 1)) && h >= 4 && h <= 64);
    assert(eob >= 0 && eob < 32 * 32);
    /* Set when lw + lh is odd; such blocks get an extra 181/256 scale
     * (181/256 ~= 0.707) on their input coefficients. */
    const int is_rect2 = (t_dim->lw + t_dim->lh) & 1;

    /* eob is non-negative, so the sum is zero only when eob == 0 and
     * txtp == 0. */
    if (eob + txtp == 0) { // DC-only DCT_DCT
        const int shift_p1 = tx_shift[0];
        const int shift = shift_p1 + tx_shift[1] - 12;
        const int rnd = (1 << (shift - 1)) + shift_p1 - 6;

        int dc = coeff[0];
        coeff[0] = 0;
        if (is_rect2)
            dc = (dc * 181 + 128) >> 8;
        dc = (dc + rnd) >> shift;
        /* The same offset is added to every pixel of the w x h block. */
        for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
            for (int x = 0; x < w; x++)
                dst[x] = iclip_pixel(dst[x] + dc);
        return;
    }

    const itx_1d_fn first_1d_fn = dav2d_tx1d_fns[t_dim->lw][txtp & 7];
    const itx_1d_fn second_1d_fn = dav2d_tx1d_fns[t_dim->lh][(txtp >> 5) & 7];
    /* At most 32 values per dimension are transformed; a 64-sized dimension
     * is produced by duplicating outputs in the final stage below. */
    const int sh = imin(h, 32), sw = imin(w, 32);
#if BITDEPTH == 8
    const int row_clip_min = INT16_MIN;
#else
    const int row_clip_min = (int) ((unsigned) ~bitdepth_max << 7);
#endif
    /* ~min == -min - 1, i.e. the clip range is [min, -min - 1]. */
    const int row_clip_max = ~row_clip_min;

    int32_t tmp[32 * 32], *c = tmp;
    int col = 0;
    const enum TxClass tx_class = (txtp >> 3) & 0x3;
    /* First pass: each iteration gathers sw coefficients (read from coeff
     * with a step of sh), runs the first 1D transform on them and stores
     * the result as the next sw entries of tmp. */
    if (tx_class == TX_CLASS_2D) {
        const uint16_t *last_eob = dav2d_last_eob_per_col.table +
                                   dav2d_last_eob_per_col.offset[tx];
        /* Runs in groups of four iterations; after each group, continues
         * only while eob exceeds the next table entry. */
        do {
            if (is_rect2)
                for (int x = 0; x < sw; x++)
                    c[x] = (coeff[col + x * sh] * 181 + 128) >> 8;
            else
                for (int x = 0; x < sw; x++)
                    c[x] = coeff[col + x * sh];
            first_1d_fn(c, 1);
            c += sw;
        } while ((++col & 3) || eob > *last_eob++);
    } else {
        /* Non-2D classes: derive the last index to process from eob. */
        int last_nz_col;
        if (tx_class == TX_CLASS_H)
            last_nz_col = imin(sh - 1, eob);
        else if (tx_class == TX_CLASS_V)
            last_nz_col = eob >> (t_dim->lw + 2);
        else /* TX_CLASS_2D_INV */
            last_nz_col = sh - 1;
        assert(last_nz_col < sh);
        do {
            if (is_rect2)
                for (int x = 0; x < sw; x++)
                    c[x] = (coeff[col + x * sh] * 181 + 128) >> 8;
            else
                for (int x = 0; x < sw; x++)
                    c[x] = coeff[col + x * sh];
            first_1d_fn(c, 1);
            c += sw;
        } while (++col <= last_nz_col);
    }
    /* Zero the part of tmp that the first pass did not write. */
    if (col < sh)
        memset(c, 0, sizeof(*c) * (sh - col) * sw);

    memset(coeff, 0, sizeof(*coeff) * sw * sh);
    /* Intermediate rounding shift and clip between the two passes. */
    int shift = tx_shift[0];
    int rnd = (1 << shift) >> 1;
    for (int i = 0; i < sw * sh; i++)
        tmp[i] = iclip((tmp[i] + rnd) >> shift, row_clip_min, row_clip_max);

    /* Second pass: one call per x, starting at tmp[x], second argument sw. */
    for (int x = 0; x < sw; x++)
        second_1d_fn(&tmp[x], sw);

    shift = tx_shift[1];
    rnd = (1 << shift) >> 1;
    c = tmp;
    /* Handle idct64 upsampling */
    if (w > sw) {
        if (h > sh) {
            /* Both dimensions upsampled: each value covers a 2x2 group. */
            for (int y = 0; y < h; y += 2, dst += PXSTRIDE(stride) * 2) {
                pixel *const dst2 = dst + PXSTRIDE(stride);
                for (int x = 0; x < w; x += 2) {
                    const int cf = (*c++ + rnd) >> shift;
                    dst[x + 0] = iclip_pixel(dst[x + 0] + cf);
                    dst[x + 1] = iclip_pixel(dst[x + 1] + cf);
                    dst2[x + 0] = iclip_pixel(dst2[x + 0] + cf);
                    dst2[x + 1] = iclip_pixel(dst2[x + 1] + cf);
                }
            }
        } else {
            /* Width only: each value covers two horizontal neighbours. */
            for (int y = 0; y < h; y++, dst += PXSTRIDE(stride))
                for (int x = 0; x < w; x += 2) {
                    const int cf = (*c++ + rnd) >> shift;
                    dst[x + 0] = iclip_pixel(dst[x + 0] + cf);
                    dst[x + 1] = iclip_pixel(dst[x + 1] + cf);
                }
        }
    } else if (h > sh) {
        /* Height only: each value covers two vertically adjacent pixels. */
        for (int y = 0; y < h; y += 2, dst += PXSTRIDE(stride) * 2) {
            pixel *const dst2 = dst + PXSTRIDE(stride);
            for (int x = 0; x < w; x++) {
                const int cf = (*c++ + rnd) >> shift;
                dst[x] = iclip_pixel(dst[x] + cf);
                dst2[x] = iclip_pixel(dst2[x] + cf);
            }
        }
    } else {
        /* No upsampling; residual_add() is defined outside this view and
         * presumably applies rnd/shift and adds the result to dst. */
        residual_add(dst, stride, c, w, h, rnd, shift, txtp HIGHBD_TAIL_SUFFIX);
    }
}

/* Defines inv_txfm_add_<w>x<h>_c(), a fixed-size wrapper that forwards to
 * inv_txfm_add_c() with tx = <pfx>TX_<w>X<h> (pfx is empty or R). */
#define inv_txfm_fn(pfx, w, h) \
static void \
inv_txfm_add_##w##x##h##_c(pixel *dst, const ptrdiff_t stride, \
                           coef *const coeff, const enum TxfmType txtp, \
                           const int eob HIGHBD_DECL_SUFFIX) \
{ \
    inv_txfm_add_c(dst, stride, coeff, txtp, eob, pfx##TX_##w##X##h \
                   HIGHBD_TAIL_SUFFIX); \
}

inv_txfm_fn( , 4, 4)
inv_txfm_fn(R, 4, 8)
inv_txfm_fn(R, 4, 16)
inv_txfm_fn(R, 4, 32)
inv_txfm_fn(R, 4, 64)
inv_txfm_fn(R, 8, 4)
inv_txfm_fn( , 8, 8)
inv_txfm_fn(R, 8, 16)
inv_txfm_fn(R, 8, 32)
inv_txfm_fn(R, 8, 64)
inv_txfm_fn(R, 16, 4)
inv_txfm_fn(R, 16, 8)
inv_txfm_fn( , 16, 16)
inv_txfm_fn(R, 16, 32)
inv_txfm_fn(R, 16, 64)
inv_txfm_fn(R, 32, 4)
inv_txfm_fn(R, 32, 8)
inv_txfm_fn(R, 32, 16)
inv_txfm_fn( , 32, 32)
inv_txfm_fn(R, 32, 64)
inv_txfm_fn(R, 64, 4)
inv_txfm_fn(R, 64, 8)
inv_txfm_fn(R, 64, 16)
inv_txfm_fn(R, 64, 32)
inv_txfm_fn( , 64, 64)

#if HAVE_ASM
#if ARCH_X86
#include "src/x86/itx.h"
#endif
#endif

/*
 * Fills the inverse-transform DSP context: sets c->cctx and points every
 * c->itxfm_add[] entry at the matching C wrapper defined above. When built
 * with HAVE_ASM and ARCH_X86, itx_dsp_init_x86(c) is called afterwards
 * (presumably to install optimized versions).
 */
COLD void bitfn(dav2d_itx_dsp_init)(Dav2dInvTxfmDSPContext *const c) {
#define assign_itx(w, h, pfx) \
    c->itxfm_add[pfx##TX_##w##X##h] = inv_txfm_add_##w##x##h##_c

    c->cctx = cctx_c;
    assign_itx( 4, 4, );
    assign_itx( 4, 8, R);
    assign_itx( 4, 16, R);
    assign_itx( 4, 32, R);
    assign_itx( 4, 64, R);
    assign_itx( 8, 4, R);
    assign_itx( 8, 8, );
    assign_itx( 8, 16, R);
    assign_itx( 8, 32, R);
    assign_itx( 8, 64, R);
    assign_itx(16, 4, R);
    assign_itx(16, 8, R);
    assign_itx(16, 16, );
    assign_itx(16, 32, R);
    assign_itx(16, 64, R);
    assign_itx(32, 4, R);
    assign_itx(32, 8, R);
    assign_itx(32, 16, R);
    assign_itx(32, 32, );
    assign_itx(32, 64, R);
    assign_itx(64, 4, R);
    assign_itx(64, 8, R);
    assign_itx(64, 16, R);
    assign_itx(64, 32, R);
    assign_itx(64, 64, );

#if HAVE_ASM
#if ARCH_X86
    itx_dsp_init_x86(c);
#endif
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/levels.h000066400000000000000000000207351517466257200223050ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1.
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef DAV2D_SRC_LEVELS_H
#define DAV2D_SRC_LEVELS_H

/* NOTE(review): the next #include has no header name in this copy of the
 * file (its argument appears to have been lost) - restore it from upstream. */
#include

#include "dav2d/headers.h"

#include "common/attributes.h"

/* Metadata OBU type codes. */
enum ObuMetaType {
    OBU_META_HDR_CLL     = 1,
    OBU_META_HDR_MDCV    = 2,
    OBU_META_SCALABILITY = 3,
    OBU_META_ITUT_T35    = 4,
    OBU_META_TIMECODE    = 5,
};

/* Square transform sizes. */
enum TxfmSize {
    TX_4X4,
    TX_8X8,
    TX_16X16,
    TX_32X32,
    TX_64X64,
    N_TX_SIZES,
};

/* Rectangular transform sizes; numbering continues after the square sizes,
 * so both enums can index the same tables. */
enum RectTxfmSize {
    RTX_4X8 = N_TX_SIZES,
    RTX_8X4,
    RTX_8X16,
    RTX_16X8,
    RTX_16X32,
    RTX_32X16,
    RTX_32X64,
    RTX_64X32,
    RTX_4X16,
    RTX_16X4,
    RTX_8X32,
    RTX_32X8,
    RTX_16X64,
    RTX_64X16,
    RTX_4X32,
    RTX_32X4,
    RTX_8X64,
    RTX_64X8,
    RTX_4X64,
    RTX_64X4,
    N_RECT_TX_SIZES
};

/* 1D transform kinds; these values are packed into enum TxfmType below. */
enum Tx1dType {
    DCT,
    IDENTITY,
    ADST,
    FLIPADST,
    DDT,
    FLIPDDT,
    WHT,
    N_TX_1D_TYPES,
};

/* Transform class; stored in bits 3-4 of enum TxfmType. */
enum TxClass {
    TX_CLASS_2D,
    TX_CLASS_2D_INV, /* inverse coefficient order */
    TX_CLASS_H,
    TX_CLASS_V,
};

/* Packs one transform type as
 *   bits 0-2: horizontal 1D type, bits 3-4: TxClass, bits 5+: vertical 1D type.
 * TX_TYPE_ENUM_2D(HOR, VER) produces the enumerator named VER_HOR. */
#define TX_TYPE_ENUM(NAME, HOR_1D, VER_1D, CLASS) \
    NAME = (HOR_1D) | (TX_CLASS_##CLASS << 3) | (VER_1D << 5)
#define TX_TYPE_ENUM_2D(HOR_1D, VER_1D) \
    TX_TYPE_ENUM(VER_1D##_##HOR_1D, HOR_1D, VER_1D, 2D)

/* 2D transform types. DCT_DCT evaluates to 0 because DCT and TX_CLASS_2D
 * are both 0. */
enum TxfmType {
    TX_TYPE_ENUM_2D(DCT, DCT),
    TX_TYPE_ENUM_2D(DCT, ADST),
    TX_TYPE_ENUM_2D(ADST, DCT),
    TX_TYPE_ENUM_2D(ADST, ADST),
    TX_TYPE_ENUM_2D(DCT, FLIPADST),
    TX_TYPE_ENUM_2D(FLIPADST, DCT),
    TX_TYPE_ENUM_2D(FLIPADST, FLIPADST),
    TX_TYPE_ENUM_2D(FLIPADST, ADST),
    TX_TYPE_ENUM_2D(ADST, FLIPADST),
    TX_TYPE_ENUM(IDTX, IDENTITY, IDENTITY, 2D),
    TX_TYPE_ENUM(IDTX_INV, IDENTITY, IDENTITY, 2D_INV),
    TX_TYPE_ENUM(V_DCT, IDENTITY, DCT, V),
    TX_TYPE_ENUM(H_DCT, DCT, IDENTITY, H),
    TX_TYPE_ENUM(V_ADST, IDENTITY, ADST, V),
    TX_TYPE_ENUM(H_ADST, ADST, IDENTITY, H),
    TX_TYPE_ENUM(V_FLIPADST, IDENTITY, FLIPADST, V),
    TX_TYPE_ENUM(H_FLIPADST, FLIPADST, IDENTITY, H),
    TX_TYPE_ENUM_2D(WHT, WHT),

    // when the ddt sequence header bit is enabled
    TX_TYPE_ENUM_2D(DCT, DDT),
    TX_TYPE_ENUM_2D(DCT, FLIPDDT),
    TX_TYPE_ENUM_2D(IDENTITY, DDT),
    TX_TYPE_ENUM_2D(IDENTITY, FLIPDDT),
    TX_TYPE_ENUM_2D(DDT, DDT),
    TX_TYPE_ENUM_2D(DDT, FLIPDDT),
    TX_TYPE_ENUM_2D(DDT, DCT),
    TX_TYPE_ENUM_2D(DDT, IDENTITY),
    TX_TYPE_ENUM_2D(FLIPDDT, DDT),
    TX_TYPE_ENUM_2D(FLIPDDT, FLIPDDT),
    TX_TYPE_ENUM_2D(FLIPDDT, DCT),
    TX_TYPE_ENUM_2D(FLIPDDT, IDENTITY),

    // when one side is 4-point, we can mix adst/ddt
    TX_TYPE_ENUM_2D(ADST, DDT),
    TX_TYPE_ENUM_2D(ADST, FLIPDDT),
    TX_TYPE_ENUM_2D(FLIPADST, DDT),
    TX_TYPE_ENUM_2D(FLIPADST, FLIPDDT),
    TX_TYPE_ENUM_2D(DDT, ADST),
    TX_TYPE_ENUM_2D(FLIPDDT, ADST),
    TX_TYPE_ENUM_2D(DDT, FLIPADST),
    TX_TYPE_ENUM_2D(FLIPDDT, FLIPADST),
};
#undef TX_TYPE_ENUM_2D
#undef TX_TYPE_ENUM

/* Intra prediction modes. The entries after N_IMPL_INTRA_PRED_MODES reuse
 * earlier numeric values: LEFT_DC_PRED starts again at DIAG_DOWN_LEFT_PRED
 * and DIP_PRED shares its value with CFL_PRED. */
enum IntraPredMode {
    DC_PRED,
    VERT_PRED,
    HOR_PRED,
    DIAG_DOWN_LEFT_PRED,
    DIAG_DOWN_RIGHT_PRED,
    VERT_RIGHT_PRED,
    HOR_DOWN_PRED,
    HOR_UP_PRED,
    VERT_LEFT_PRED,
    SMOOTH_PRED,
    SMOOTH_V_PRED,
    SMOOTH_H_PRED,
    PAETH_PRED,
    N_INTRA_PRED_MODES,
    CFL_PRED = N_INTRA_PRED_MODES,
    N_UV_INTRA_PRED_MODES,
    N_IMPL_INTRA_PRED_MODES = N_UV_INTRA_PRED_MODES,
    LEFT_DC_PRED = DIAG_DOWN_LEFT_PRED,
    TOP_DC_PRED,
    DC_128_PRED,
    Z1_PRED,
    Z2_PRED,
    Z3_PRED,
    DIP_PRED = N_INTRA_PRED_MODES,
};

/* Intra modes available for inter-intra prediction. */
enum InterIntraPredMode {
    II_DC_PRED,
    II_VERT_PRED,
    II_HOR_PRED,
    II_SMOOTH_PRED,
    N_INTER_INTRA_PRED_MODES,
};

/* Block partition types. */
enum BlockPartition {
    PARTITION_INVALID = -1,
    PARTITION_NONE,     // [ ]
    PARTITION_H,        // [-]
    PARTITION_V,        // [|]
    PARTITION_H3,       // 4x4 -> 4x1[top], 2x2 [left], 2x2 [right], 4x1[bottom]
    PARTITION_V3,       // transpose of H3
    PARTITION_H4A,      // Nx8 -> Nx1,Nx2,Nx4,Nx1
    PARTITION_H4B,      // Nx8 -> Nx1,Nx4,Nx2,Nx1
    PARTITION_V4A,      // transpose of H4A
    PARTITION_V4B,      // transpose of H4B
    PARTITION_SPLIT,    // [+]
    N_PARTITIONS,
};

/* Transform partition types within a block. */
enum TxPartition {
    TX_PARTITION_NONE,
    TX_PARTITION_SPLIT,
    TX_PARTITION_H,
    TX_PARTITION_V,
    TX_PARTITION_H4,
    TX_PARTITION_V4,
    TX_PARTITION_H5,
    TX_PARTITION_V5,
};

/* Block sizes, listed from 256x256 down to 4x4. */
enum BlockSize {
    BS_INVALID = -1,
    BS_256x256,
    BS_256x128,
    BS_128x256,
    BS_128x128,
    BS_128x64,
    BS_64x128,
    BS_64x64,
    BS_64x32,
    BS_64x16,
    BS_64x8,
    BS_64x4,
    BS_32x64,
    BS_32x32,
    BS_32x16,
    BS_32x8,
    BS_32x4,
    BS_16x64,
    BS_16x32,
    BS_16x16,
    BS_16x8,
    BS_16x4,
    BS_8x64,
    BS_8x32,
    BS_8x16,
    BS_8x8,
    BS_8x4,
    BS_4x64,
    BS_4x32,
    BS_4x16,
    BS_4x8,
    BS_4x4,
    N_BS_SIZES,
};

/* Single-reference inter modes; numbering starts at 13, which equals
 * N_INTRA_PRED_MODES above. */
enum InterPredMode {
    NEARMV = 13,
    GLOBALMV,
    NEWMV,
    WARPMV,
    WARPNEWMV,
};

/* Compound inter modes; numbering starts at 18, directly after WARPNEWMV. */
enum CompInterPredMode {
    NEARMV_NEARMV = 18,
    NEARMV_NEWMV,
    NEWMV_NEARMV,
    GLOBALMV_GLOBALMV,
    NEWMV_NEWMV,
    JOINT_NEWMV,
    OPFL_NEARMV_NEARMV,
    OPFL_NEARMV_NEWMV,
    OPFL_NEWMV_NEARMV,
    OPFL_NEWMV_NEWMV,
    OPFL_JOINT_NEWMV,
};

/* Compound prediction blending type. */
enum CompInterType {
    COMP_INTER_NONE,
    COMP_INTER_AVG,
    COMP_INTER_WEDGE,
    COMP_INTER_SEG,
};

/* Motion vector: two 32-bit components that can also be accessed as one
 * 64-bit value through `n`. */
typedef union mv {
    struct {
        int32_t y, x;
    };
    uint64_t n;
} mv;
CHECK_SIZE(mv, 8);

#define INVALID_MV 0x200000 // applied to mv.y

/* Helpers that copy, compare or clear a pair of motion vectors at once. */
#define COPY2MV(dst, src) memcpy(dst, src, 2 * sizeof(union mv))
#define CMP2MV(src1, src2) memcmp(src1, src2, 2 * sizeof(union mv))
#define ZERO2MV(dst) memset(dst, 0, 2 * sizeof(union mv))

/* Pair of reference indices, also accessible as one 16-bit value. */
PACKED(typedef union refpair {
    int8_t ref[2];
    int16_t pair;
}) ALIGN(refpair, 2);
CHECK_SIZE(refpair, 2);

/* Motion mode of an inter block. */
enum MotionMode {
    MM_TRANSLATION,
    MM_INTERINTRA,
    MM_WARP_CAUSAL,
    MM_WARP_DELTA,
    MM_WARP_EXTEND,
};

/* Chroma-from-luma variant. */
enum CflType {
    CFL_EXPLICIT,
    CFL_IMPLICIT,
    CFL_MHCCP,
};

/* Direction selector stored in Av2Block.cfl_mh_dir. */
enum CflMhDir {
    CFL_DIR_CENTER,
    CFL_DIR_TOP,
    CFL_DIR_LEFT,
    CFL_DIR_ALL,
};

/* Reference index value denoting the TIP frame. */
#define TIP_FRAME 7

/* Per-block decoded state: common fields first, then a union holding
 * either the intra or the inter fields. */
typedef struct Av2Block {
    int8_t bs, cbs;
    uint8_t intra, intrabc, seg_id, skip_mode, skip_txfm, tx_part, fsc, tx_size_ll;
    union refpair ref;
    union {
        struct {
            // it's also possible to access this using mv[0]
            union mv intrabc_mv;
            uint8_t dpcm[2], y_mode, mrl_index, multi_mrl, dip;
            uint8_t morph_pred, is_refmv, is_qpel; // for intrabc
            uint8_t uv_mode, pal_sz;
            int8_t y_angle, uv_angle, cfl_type;
            union {
                int8_t cfl_alpha[2];
                uint8_t cfl_mh_dir; // enum CflMhDir
            };
            struct { int a, l; } is_sm[2 /* luma, chroma */];
        }; // intra
        struct {
            union mv mv[2];
            int8_t wedge_idx, wedge_sign; // -1 for no wedge
            uint8_t mask_sign, interintra_mode;
            int8_t matrix[4];
            uint8_t drl_idx[2];
            uint8_t warp_ref_idx, warpmv_with_mvd;
            uint8_t comp_type, inter_mode, motion_mode, warp_ii;
            int8_t cwp_idx, mv_prec, amvd;
            uint8_t bawp[2], filter;
            uint8_t refine_mv; // 1 = enabled, 2 = implicitly enabled
            int32_t mtxbak[6]; // for frame-mt only
        }; // inter
    };
} Av2Block;

#endif /* DAV2D_SRC_LEVELS_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/lf_mask.c000066400000000000000000000332601517466257200224170ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "config.h"

/* NOTE(review): the next #include has no header name in this copy of the
 * file (its argument appears to have been lost) - restore it from upstream. */
#include

#include "common/intops.h"

#include "src/ctx.h"
#include "src/levels.h"
#include "src/lf_mask.h"
#include "src/tables.h"

/*
 * Marks the left outer edge of a block. For each of the h4 rows starting at
 * by4, the row's bit is set in masks[level][word], where
 * level = min(bwl4c, l[y]), word = row >> 4 and the bit index is row & 15.
 * Afterwards l[0..h4) is filled with bwl4c via dav2d_memset_likely_pow2().
 */
static ALWAYS_INLINE void
mask_outer_edge_l(uint16_t (*const masks)[4], const int by4,
                  const int h4, const int bwl4c, uint8_t *const l)
{
    assert((unsigned) bwl4c <= 3U);

    // left block edge
    uint64_t mask = 1ULL << by4;
    for (int y = 0; y < h4; y++, mask <<= 1) {
        const int sidx = (by4 + y) >> 4;
        const unsigned smask = (unsigned) (mask >> (sidx << 4));
        masks[imin(bwl4c, l[y])][sidx] |= smask;
    }
    dav2d_memset_likely_pow2(l, bwl4c, h4);
}

/*
 * Marks the top outer edge of a block; same scheme as mask_outer_edge_l()
 * but over the w4 columns starting at bx4, using and then overwriting a[].
 */
static ALWAYS_INLINE void
mask_outer_edge_t(uint16_t (*const masks)[4], const int bx4,
                  const int w4, const int bhl4c, uint8_t *const a)
{
    assert((unsigned) bhl4c <= 3U);

    // top block edge
    uint64_t mask = 1ULL << bx4;
    for (int x = 0; x < w4; x++, mask <<= 1) {
        const int sidx = (bx4 + x) >> 4;
        const unsigned smask = (unsigned) (mask >> (sidx << 4));
        masks[imin(bhl4c, a[x])][sidx] |= smask;
    }
    dav2d_memset_likely_pow2(a, bhl4c, w4);
}

/*
 * ORs the 64-bit row mask `inner` (split into four 16-bit words) into
 * masks[0][bx4 + x][twl4c][] for x = xoff, xoff + hstep, ... while x < w4.
 */
static ALWAYS_INLINE void
mask_inner_edges_v(uint16_t (*const masks)[64][5][4], const uint64_t inner,
                   const int bx4, const int w4, const int twl4c,
                   const int xoff, const int hstep)
{
    assert((unsigned) twl4c <= 3U);

    // inner (tx) left|right edges
    const unsigned inner1 = (unsigned) (inner & 0xffff);
    const unsigned inner2 = (unsigned) ((inner >> 16) & 0xffff);
    const unsigned inner3 = (unsigned) ((inner >> 32) & 0xffff);
    const unsigned inner4 = (unsigned) ((inner >> 48));
    for (int x = xoff; x < w4; x += hstep) {
        if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1;
        if (inner2) masks[0][bx4 + x][twl4c][1] |= inner2;
        if (inner3) masks[0][bx4 + x][twl4c][2] |= inner3;
        if (inner4) masks[0][bx4 + x][twl4c][3] |= inner4;
    }
}

/*
 * ORs the 64-bit column mask `inner` (split into four 16-bit words) into
 * masks[1][by4 + y][thl4c][] for y = yoff, yoff + vstep, ... while y < h4.
 */
static ALWAYS_INLINE void
mask_inner_edges_h(uint16_t (*const masks)[64][5][4], const uint64_t inner,
                   const int by4, const int h4, const int thl4c,
                   const int yoff, const int vstep)
{
    assert((unsigned) thl4c <= 3U);

    //            top
    // inner (tx) --- edges
    //           bottom
    const unsigned inner1 = (unsigned) (inner & 0xffff);
    const unsigned inner2 = (unsigned) ((inner >> 16) & 0xffff);
    const unsigned inner3 = (unsigned) ((inner >> 32) & 0xffff);
    const unsigned inner4 = (unsigned) ((inner >> 48));
    for (int y = yoff; y < h4; y += vstep) {
        if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
        if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
        if (inner3) masks[1][by4 + y][thl4c][2] |= inner3;
        if (inner4) masks[1][by4 + y][thl4c][3] |= inner4;
    }
}

/*
 * Marks the outer and inner transform edges of one block for the given
 * transform partition. The edge levels are t_dim->lw / t_dim->lh capped by
 * hlim / vlim. Three layouts are handled:
 *  - partitions below TX_PARTITION_H5: uniform grid with step tw4 x th4;
 *  - TX_PARTITION_H5: slices of height th4, 2 * th4, th4 stacked vertically;
 *  - TX_PARTITION_V5: the transposed layout.
 */
static inline void
mask_edges_part(uint16_t (*const masks)[64][5][4],
                const int by4, const int bx4, const int w4, const int h4,
                const enum TxPartition tx_part, const TxfmInfo *const t_dim,
                const int hlim, const int vlim,
                uint8_t *const a, uint8_t *const l)
{
    const int tw4 = t_dim->w, th4 = t_dim->h;
    const int twl4c = imin(hlim, t_dim->lw), thl4c = imin(vlim, t_dim->lh);

    if (tx_part < TX_PARTITION_H5) {
        mask_outer_edge_l(masks[0][bx4], by4, h4, twl4c, l);
        mask_outer_edge_t(masks[1][by4], bx4, w4, thl4c, a);
        if (w4 > tw4) {
            /* h4 consecutive bits starting at bit by4 */
            const uint64_t inner = (~0ULL >> (64 - h4)) << by4;
            mask_inner_edges_v(masks, inner, bx4, w4, twl4c, tw4, tw4);
        }
        if (h4 > th4) {
            /* w4 consecutive bits starting at bit bx4 */
            const uint64_t inner = (~0ULL >> (64 - w4)) << bx4;
            mask_inner_edges_h(masks, inner, by4, h4, thl4c, th4, th4);
        }
    } else if (tx_part == TX_PARTITION_H5) {
        assert(th4 * 4 >= h4 && tw4 * 2 >= w4);
        mask_outer_edge_t(masks[1][by4], bx4, w4, thl4c, a);
        /* Left edge: the top and bottom slices use twl4c, the middle slice
         * (height 2 * th4) uses one level more, capped at hlim. */
        mask_outer_edge_l(masks[0][bx4], by4, imin(th4, h4), twl4c, l);
        if (h4 > th4) {
            mask_outer_edge_l(masks[0][bx4], by4 + th4,
                              imin(2 * th4, h4 - th4),
                              imin(twl4c + 1, hlim), &l[th4]);
            if (h4 > th4 * 3)
                mask_outer_edge_l(masks[0][bx4], by4 + th4 * 3,
                                  imin(th4, h4 - 3 * th4), twl4c,
                                  &l[th4 * 3]);
        }
        /* Horizontal inner edges at y = th4 and y = 3 * th4. */
        const uint64_t inner = (~0ULL >> (64 - w4)) << bx4;
        mask_inner_edges_h(masks, inner, by4, h4, thl4c, th4, th4 * 2);
        /* Vertical inner edges only on rows outside the middle slice. */
        const uint64_t inner_a = (~0ULL >> (64 - h4)) << by4;
        const uint64_t inner_b = (~0ULL >> (64 - th4 * 2)) << (by4 + th4);
        const uint64_t inner_c = inner_a & ~inner_b;
        mask_inner_edges_v(masks, inner_c, bx4, w4, twl4c, tw4, tw4);
    } else {
        assert(tx_part == TX_PARTITION_V5 && tw4 * 4 >= w4 && th4 * 2 >= h4);
        mask_outer_edge_l(masks[0][bx4], by4, h4, twl4c, l);
        /* Top edge: the left and right slices use thl4c, the middle slice
         * (width 2 * tw4) uses one level more, capped at vlim. */
        mask_outer_edge_t(masks[1][by4], bx4, imin(tw4, w4), thl4c, a);
        if (w4 > tw4) {
            mask_outer_edge_t(masks[1][by4], bx4 + tw4,
                              imin(2 * tw4, w4 - tw4),
                              imin(thl4c + 1, vlim), &a[tw4]);
            if (w4 > tw4 * 3)
                mask_outer_edge_t(masks[1][by4], bx4 + tw4 * 3,
                                  imin(tw4, w4 - 3 * tw4), thl4c,
                                  &a[tw4 * 3]);
        }
        /* Vertical inner edges at x = tw4 and x = 3 * tw4. */
        const uint64_t inner = (~0ULL >> (64 - h4)) << by4;
        mask_inner_edges_v(masks, inner, bx4, w4, twl4c, tw4, tw4 * 2);
        /* Horizontal inner edges only on columns outside the middle slice. */
        const uint64_t inner_a = (~0ULL >> (64 - w4)) << bx4;
        const uint64_t inner_b = (~0ULL >> (64 - tw4 * 2)) << (bx4 + tw4);
        const uint64_t inner_c = inner_a & ~inner_b;
        mask_inner_edges_h(masks, inner_c, by4, h4, thl4c, th4, th4);
    }
}

/*
 * Marks edges inside a block on a regular grid with step hsz (vertical
 * edges) and vsz (horizontal edges); a step of 0 disables that direction.
 * Bits are ORed into level twl4c / thl4c. In addition, at positions where
 * (offset & ds_sub_pu_mask) is non-zero, the bits that were not already set
 * at that level are also ORed into level index 4.
 */
static ALWAYS_INLINE void
mask_subpu_edges(uint16_t (*const masks)[64][5][4],
                 const int by4, const int bx4, const int w4, const int h4,
                 const int twl4c, const int thl4c,
                 const int hsz, const int vsz, const int ds_sub_pu_mask)
{
    assert(!(hsz & (hsz - 1)) && hsz >= 0 && hsz <= 8);
    assert(!(vsz & (vsz - 1)) && vsz >= 0 && vsz <= 8);
    assert((unsigned) thl4c <= 2U && (unsigned) twl4c <= 2U);
    assert(ds_sub_pu_mask == 15 || ds_sub_pu_mask == 0);

    if (hsz) {
        // inner (subpu) left|right edges
        const uint64_t inner = (~0ULL >> (64 - h4)) << by4;
        const unsigned inner0 = (unsigned) (inner & 0xffff);
        const unsigned inner1 = (unsigned) ((inner >> 16) & 0xffff);
        const unsigned inner2 = (unsigned) ((inner >> 32) & 0xffff);
        const unsigned inner3 = (unsigned) ((inner >> 48));
        for (int x = hsz; x < w4; x += hsz) {
/* a: direction plane, b + c: position, d: level, e: 16-bit word index.
 * `m` keeps the previous value so only newly set bits reach level 4. */
#define mask_subpu(a, b, c, d, e) \
            if (inner##e) { \
                const unsigned m = masks[a][b + c][d][e]; \
                masks[a][b + c][d][e] |= inner##e; \
                if (c & ds_sub_pu_mask) \
                    masks[a][b + c][4][e] |= inner##e & ~m; \
            }
            mask_subpu(0, bx4, x, twl4c, 0);
            mask_subpu(0, bx4, x, twl4c, 1);
            mask_subpu(0, bx4, x, twl4c, 2);
            mask_subpu(0, bx4, x, twl4c, 3);
        }
    }

    if (vsz) {
        //               top
        // inner (subpu) --- edges
        //              bottom
        const uint64_t inner = (~0ULL >> (64 - w4)) << bx4;
        const unsigned inner0 = (unsigned) (inner & 0xffff);
        const unsigned inner1 = (unsigned) ((inner >> 16) & 0xffff);
        const unsigned inner2 = (unsigned) ((inner >> 32) & 0xffff);
        const unsigned inner3 = (unsigned) ((inner >> 48));
        for (int y = vsz; y < h4; y += vsz) {
            mask_subpu(1, by4, y, thl4c, 0);
            mask_subpu(1, by4, y, thl4c, 1);
            mask_subpu(1, by4, y, thl4c, 2);
            mask_subpu(1, by4, y, thl4c, 3);
#undef mask_subpu
        }
    }
}

/*
 * Returns the sub-PU level for a block, or max_lvl when no special case
 * applies: intra blocks, frame_hdr->deblock.sub_pu unset, or an inter block
 * that matches none of the TIP / compound conditions below.
 */
static int subpu_flt_lvl(const Dav2dSequenceHeader *const seq_hdr,
                         const Dav2dFrameHeader *const frame_hdr,
                         const enum BlockSize bs,
                         const int bw4, const int bh4,
                         const Av2Block *const b, const int max_lvl)
{
    if (b->intra || !frame_hdr->deblock.sub_pu) {
        /* do nothing */
    } else if (b->ref.ref[0] == TIP_FRAME) {
        /* TIP reference: result is 1 or 2 depending on the TIP settings. */
        const int opfl = seq_hdr->tip_refine_mv &&
            (frame_hdr->tip.frame_mode == 1 ||
             frame_hdr->tip.subpel_filter == DAV2D_FILTER_8TAP_SHARP);
        return 1 + (frame_hdr->tip.frame_mode == 2 /* frame */ ? !opfl :
                    ((!opfl && imin(bw4, bh4) >= 4) || bs == BS_256x256));
    } else if (b->ref.ref[1] != -1) {
        /* Second reference present (compound prediction). */
        if (b->inter_mode >= OPFL_NEARMV_NEARMV) {
            return 1 - (bs == BS_8x8);
        } else if (b->refine_mv && b->comp_type == COMP_INTER_AVG) {
            return 2;
        }
    }
    return max_lvl;
}

/*
 * Adds the deblocking edge bits of one block to `masks`.
 *
 *   masks          [0] = column (vertical-edge) plane, [1] = row plane
 *   b, bs          block and its size
 *   bx, by         block position; only the low 6 bits are used for the
 *                  position inside `masks`
 *   iw, ih         limits used to clip the block width/height
 *   layout, chroma select chroma subsampling (none when chroma == 0)
 *   a, l           above/left level arrays, read and then updated
 *   frame_hdr, seq_hdr  header fields consulted for lossless segments and
 *                       the sub-PU settings
 */
void dav2d_create_db_mask(uint16_t (*const masks)[64][5][4],
                          const Av2Block *const b,
                          const enum BlockSize bs,
                          const int bx, const int by,
                          const int iw, const int ih,
                          const enum Dav2dPixelLayout layout,
                          const int chroma,
                          uint8_t *const a, uint8_t *const l,
                          const Dav2dFrameHeader *const frame_hdr,
                          const Dav2dSequenceHeader *const seq_hdr)
{
    const int ss_ver = chroma && layout == DAV2D_PIXEL_LAYOUT_I420;
    const int ss_hor = chroma && layout != DAV2D_PIXEL_LAYOUT_I444;
    const uint8_t *const b_dim = dav2d_block_dimensions[bs];
    /* Clip the block to iw/ih, then convert to (subsampled) units. */
    const int bw4 = imin(iw - bx, b_dim[0]) >> ss_hor;
    const int bh4 = imin(ih - by, b_dim[1]) >> ss_ver;
    const int bx4 = (bx & 63) >> ss_hor;
    const int by4 = (by & 63) >> ss_ver;
    assert(bw4 > 0 && bh4 > 0);

    /* 3 is the "no sub-PU handling" value (see the check further down). */
    const int subpu_l2 = subpu_flt_lvl(seq_hdr, frame_hdr, bs,
                                       b_dim[0], b_dim[1], b, 3);
    const int ds_subpu_mask = (frame_hdr->tip.frame_mode != 2) * 15;
    int twl4c, thl4c;
    const int lossless = frame_hdr->segmentation.lossless[b->seg_id];
    if (b->intra || !b->skip_txfm) {
        /* Block has transforms: mark outer and inner transform edges. */
        const enum TxPartition tx_part = chroma ? TX_PARTITION_NONE : b->tx_part;
        enum RectTxfmSize tx;
        if (lossless) {
            tx = !chroma && b->tx_size_ll ? dav2d_max_txfm_size_for_bs[bs][3] :
                                            (int) TX_4X4;
        } else {
            tx = chroma ?
                 dav2d_max_txfm_size_for_bs[bs][DAV2D_PIXEL_LAYOUT_I444 - layout] :
                 dav2d_tx_part_tbl[bs][tx_part];
        }
        const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx];
        mask_edges_part(masks, by4, bx4, bw4, bh4, tx_part, t_dim,
                        iclip(subpu_l2 - ss_hor, 0, 3 - chroma),
                        iclip(subpu_l2 - ss_ver, 0, 3 - chroma), a, l);
        twl4c = imin(subpu_l2, t_dim->lw);
        thl4c = imin(subpu_l2, t_dim->lh);
    } else {
        /* Inter block with skip_txfm set: only the outer block edges. */
        int hlim, vlim;
        if (lossless) {
            hlim = 0;
            vlim = 0;
        } else {
            hlim = iclip(imin(subpu_l2, b_dim[2]) - ss_hor, 0, 3 - chroma);
            vlim = iclip(imin(subpu_l2, b_dim[3]) - ss_ver, 0, 3 - chroma);
        }
        mask_outer_edge_l(masks[0][bx4], by4, bh4, hlim, l);
        mask_outer_edge_t(masks[1][by4], bx4, bw4, vlim, a);
        twl4c = thl4c = subpu_l2;
    }

    if (subpu_l2 != 3) {
        /* Reduce the level by one in subsampled directions, unless it is
         * already 0. */
        const int h_subpu_l2 = twl4c - (ss_hor && twl4c);
        const int v_subpu_l2 = thl4c - (ss_ver && thl4c);
        mask_subpu_edges(masks, by4, bx4, bw4, bh4,
                         h_subpu_l2, v_subpu_l2,
                         (1 << subpu_l2) >> ss_hor,
                         (1 << subpu_l2) >> ss_ver,
                         // this variable isn't subsampled for some reason
                         ds_subpu_mask);
    }
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/lf_mask.h000066400000000000000000000054521517466257200224260ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1.
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef DAV2D_SRC_LF_MASK_H
#define DAV2D_SRC_LF_MASK_H

/* NOTE(review): the next two #include directives have no header name in
 * this copy of the file (their arguments appear to have been lost) -
 * restore them from upstream. */
#include
#include

#include "src/levels.h"

/* Loop-restoration parameters of a single restoration unit. */
typedef struct Av2RestorationUnit {
    uint8_t /* enum Dav2dRestorationType */ type;
    int8_t ns_filter[16][32];
} Av2RestorationUnit;

// each struct describes one 256x256 area
typedef struct Av2Filter {
    // each bit is 1 col
    /* Edge masks indexed [direction][position][level][16-bit word]; the
     * level dimension has five entries and the four words together hold
     * 64 bits. */
    uint16_t filter_y[2 /* 0=col, 1=row */][64][5][4];
    uint16_t filter_uv[2 /* 0=col, 1=row */][64][5][4];
    uint16_t qidx[16];
    uint8_t gdf[16];
    int8_t cdef_idx[16]; // -1 means "unset"
    uint8_t ccso[3];
    uint16_t noskip_mask[32][4]; // for 8x8 blocks, but stored on a 4x8 basis
    uint16_t lr_noskip_mask[64][4]; // for 4x4 blocks
    uint16_t lossless_mask_y[64][4];
    uint16_t lossless_mask_uv[64][4];
} Av2Filter;

// each struct describes one 256x256 area (1, 4, or 16 SBs)
typedef struct Av2Restoration {
    Av2RestorationUnit lr[3][16];
} Av2Restoration;

// for luma, set layout to 400; for chroma, select the appropriate layout
/* Adds the deblocking edge bits of block `b` (size `bs`, position bx/by,
 * clipped against iw/ih) to `masks`; `a` and `l` are the above/left level
 * arrays, which the function reads and updates. */
void dav2d_create_db_mask(uint16_t (*const masks)[64][5][4],
                          const Av2Block *b, enum BlockSize bs,
                          int bx, int by, int iw, int ih,
                          enum Dav2dPixelLayout layout, int chroma,
                          uint8_t *a, uint8_t *l,
                          const Dav2dFrameHeader *frame_hdr,
                          const Dav2dSequenceHeader *seq_hdr);

#endif /* DAV2D_SRC_LF_MASK_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/lib.c000066400000000000000000000706421517466257200215560ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "vcs_version.h"

/* NOTE(review): several #include directives below have no header name in
 * this copy of the file (their arguments appear to have been lost) -
 * restore them from upstream. */
#include
#include

#if defined(__linux__) && HAVE_DLSYM
#include
#endif

#include "dav2d/dav2d.h"
#include "dav2d/data.h"

#include "common/validate.h"

#include "src/cpu.h"
#include "src/fg_apply.h"
#include "src/ibp.h"
#include "src/internal.h"
#include "src/log.h"
#include "src/obu.h"
#include "src/quantizer.h"
#include "src/ref.h"
#include "src/thread_task.h"
#include "src/wedge.h"

/* Process-wide initialisation; dav2d_open() runs it through pthread_once(). */
static COLD void init_internal(void) {
    dav2d_init_cpu();
    dav2d_init_ibp_weights();
    dav2d_init_ii_wedge_masks();
    dav2d_init_qm_tables();
    dav2d_init_thread();
}

/* Returns the DAV2D_VERSION string. */
COLD const char *dav2d_version(void) {
    return DAV2D_VERSION;
}

/* Returns the API version packed as (major << 16) | (minor << 8) | patch. */
COLD unsigned dav2d_version_api(void) {
    return (DAV2D_API_VERSION_MAJOR << 16) |
           (DAV2D_API_VERSION_MINOR <<  8) |
           (DAV2D_API_VERSION_PATCH <<  0);
}

/* Fills `s` with the default settings. A value of 0 for n_threads or
 * max_frame_delay makes get_num_threads() choose the count itself. */
COLD void dav2d_default_settings(Dav2dSettings *const s) {
    s->n_threads = 0;
    s->max_frame_delay = 0;
    s->apply_grain = 1;
    s->allocator.cookie = NULL;
    s->allocator.alloc_picture_callback = dav2d_default_picture_alloc;
    s->allocator.release_picture_callback = dav2d_default_picture_release;
    s->logger.cookie = NULL;
    s->logger.callback = dav2d_log_default_callback;
    s->operating_point = 0;
    s->all_layers = 1; // just until the tests are adjusted
    s->frame_size_limit = 0;
    s->strict_std_compliance = 0;
    s->output_invisible_frames = 0;
    s->inloop_filters = DAV2D_INLOOPFILTER_ALL;
    s->decode_frame_type = DAV2D_DECODEFRAMETYPE_ALL;
}

static void close_internal(Dav2dContext **const c_out, int flush);

/* Extra bytes to add to the requested thread stack size: non-zero only on
 * Linux/glibc with dlsym() available, and only when the
 * __pthread_get_minstack symbol can be resolved. Otherwise evaluates to 0. */
#if defined(__linux__) && HAVE_DLSYM && defined(__GLIBC__)
NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
    /* glibc has an issue where the size of the TLS is subtracted from the stack
     * size instead of allocated separately. As a result the specified stack
     * size may be insufficient when used in an application with large amounts
     * of TLS data. The following is a workaround to compensate for that.
     * See https://sourceware.org/bugzilla/show_bug.cgi?id=11787 */
    size_t (*const get_minstack)(const pthread_attr_t*) =
        dlsym(RTLD_DEFAULT, "__pthread_get_minstack");
    if (get_minstack)
        return get_minstack(thread_attr) - PTHREAD_STACK_MIN;

    return 0;
}
#else
#define get_stack_size_internal(attr) (0)
#endif

/*
 * Derives the thread-context count (*n_tc) and frame-context count (*n_fc)
 * from the settings.
 *  - *n_tc: s->n_threads if non-zero, otherwise the logical processor count
 *    clipped to [1, DAV2D_MAX_THREADS].
 *  - *n_fc: min(s->max_frame_delay, *n_tc) if max_frame_delay is non-zero,
 *    otherwise ceil(sqrt(*n_tc)) from the lookup table, capped at 8.
 * `c` is only forwarded to dav2d_num_logical_processors();
 * dav2d_get_frame_delay() passes NULL.
 */
static COLD void get_num_threads(Dav2dContext *const c, const Dav2dSettings *const s,
                                 unsigned *n_tc, unsigned *n_fc)
{
    /* ceil(sqrt(n)) */
    static const uint8_t fc_lut[49] = {
        1,                                     /*     1 */
        2, 2, 2,                               /*  2- 4 */
        3, 3, 3, 3, 3,                         /*  5- 9 */
        4, 4, 4, 4, 4, 4, 4,                   /* 10-16 */
        5, 5, 5, 5, 5, 5, 5, 5, 5,             /* 17-25 */
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,       /* 26-36 */
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */
    };
    *n_tc = s->n_threads ? s->n_threads :
        iclip(dav2d_num_logical_processors(c), 1, DAV2D_MAX_THREADS);
    *n_fc = s->max_frame_delay ? umin(s->max_frame_delay, *n_tc) :
            *n_tc < 50 ? fc_lut[*n_tc - 1] : 8; // min(8, ceil(sqrt(n)))
}

/* Returns the frame-context count that dav2d_open() would derive from `s`,
 * or DAV2D_ERR(EINVAL) if `s` is NULL or n_threads / max_frame_delay are
 * out of range. */
COLD int dav2d_get_frame_delay(const Dav2dSettings *const s) {
    unsigned n_tc, n_fc;
    validate_input_or_ret(s != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->n_threads >= 0 &&
                          s->n_threads <= DAV2D_MAX_THREADS, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->max_frame_delay >= 0 &&
                          s->max_frame_delay <= DAV2D_MAX_FRAME_DELAY, DAV2D_ERR(EINVAL));

    get_num_threads(NULL, s, &n_tc, &n_fc);

    return n_fc;
}

COLD int dav2d_open(Dav2dContext **const c_out, const Dav2dSettings *const s) {
    static pthread_once_t initted = PTHREAD_ONCE_INIT;
    pthread_once(&initted, init_internal);

    validate_input_or_ret(c_out != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->n_threads >= 0 &&
                          s->n_threads <= DAV2D_MAX_THREADS, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->max_frame_delay >= 0 &&
                          s->max_frame_delay <= DAV2D_MAX_FRAME_DELAY, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
                          DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->allocator.release_picture_callback != NULL,
                          DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->operating_point >= 0 &&
                          s->operating_point <= 31, DAV2D_ERR(EINVAL));
    validate_input_or_ret(s->decode_frame_type >= DAV2D_DECODEFRAMETYPE_ALL &&
                          s->decode_frame_type <= DAV2D_DECODEFRAMETYPE_KEY, DAV2D_ERR(EINVAL));

    pthread_attr_t thread_attr;
    if (pthread_attr_init(&thread_attr)) return DAV2D_ERR(ENOMEM);
    size_t stack_size = 1024 * 1024 + get_stack_size_internal(&thread_attr);

    pthread_attr_setstacksize(&thread_attr, stack_size);

    Dav2dContext *const c = *c_out = dav2d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64);
    if (!c) goto error;
    memset(c, 0, sizeof(*c));

    c->allocator = s->allocator;
    c->logger = s->logger;
    c->apply_grain = s->apply_grain;
    c->operating_point = s->operating_point;
    c->all_layers = s->all_layers;
    c->frame_size_limit = s->frame_size_limit;
    c->strict_std_compliance = s->strict_std_compliance;
    c->output_invisible_frames =
s->output_invisible_frames; c->inloop_filters = s->inloop_filters; c->decode_frame_type = s->decode_frame_type; #if 0 dav2d_data_props_set_defaults(&c->cached_error_props); #endif if (dav2d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) || dav2d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) || dav2d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) || dav2d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_uv_pool) || dav2d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) || dav2d_mem_pool_init(ALLOC_CCSOMAP, &c->ccsomap_pool) || dav2d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) || dav2d_mem_pool_init(ALLOC_CDF, &c->cdf_pool) || dav2d_mem_pool_init(ALLOC_CDF, &c->fgm_pool) || dav2d_mem_pool_init(ALLOC_CDF, &c->ci_pool)) { goto error; } if (c->allocator.alloc_picture_callback == dav2d_default_picture_alloc && c->allocator.release_picture_callback == dav2d_default_picture_release) { if (c->allocator.cookie) goto error; if (dav2d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error; c->allocator.cookie = c->picture_pool; } else if (c->allocator.alloc_picture_callback == dav2d_default_picture_alloc || c->allocator.release_picture_callback == dav2d_default_picture_release) { goto error; } /* On 32-bit systems extremely large frame sizes can cause overflows in * dav2d_decode_frame() malloc size calculations. Prevent that from occuring * by enforcing a maximum frame size limit, chosen to roughly correspond to * the largest size possible to decode without exhausting virtual memory. 
*/ if (sizeof(size_t) < 8 && s->frame_size_limit - 1 >= 8192 * 8192) { c->frame_size_limit = 8192 * 8192; if (s->frame_size_limit) dav2d_log(c, "Frame size limit reduced from %u to %u.\n", s->frame_size_limit, c->frame_size_limit); } c->flush = &c->flush_mem; atomic_init(c->flush, 0); get_num_threads(c, s, &c->n_tc, &c->n_fc); c->fc = dav2d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32); if (!c->fc) goto error; memset(c->fc, 0, sizeof(*c->fc) * c->n_fc); c->tc = dav2d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64); if (!c->tc) goto error; memset(c->tc, 0, sizeof(*c->tc) * c->n_tc); if (c->n_tc > 1) { if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error; if (pthread_cond_init(&c->task_thread.cond, NULL)) { pthread_mutex_destroy(&c->task_thread.lock); goto error; } if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) { pthread_cond_destroy(&c->task_thread.cond); pthread_mutex_destroy(&c->task_thread.lock); goto error; } c->task_thread.cur = c->n_fc; atomic_init(&c->task_thread.reset_task_cur, UINT_MAX); atomic_init(&c->task_thread.cond_signaled, 0); c->task_thread.inited = 1; } c->task_thread.n_passes = 1 + (c->n_tc > 1) + (c->n_fc > 1); #if 0 if (c->n_fc > 1) { const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc; c->frame_thread.out_delayed = dav2d_malloc(ALLOC_THREAD_CTX, out_delayed_sz); if (!c->frame_thread.out_delayed) goto error; memset(c->frame_thread.out_delayed, 0, out_delayed_sz); } #endif c->dpb_sz = c->n_fc + 16; c->dpb = dav2d_malloc(ALLOC_THREAD_CTX, sizeof(*c->dpb) * c->dpb_sz); if (!c->dpb) goto error; memset(c->dpb, 0, sizeof(*c->dpb) * c->dpb_sz); for (unsigned n = 0; n < c->n_fc; n++) { Dav2dFrameContext *const f = &c->fc[n]; if (c->n_tc > 1) { if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error; if (pthread_cond_init(&f->task_thread.cond, NULL)) { pthread_mutex_destroy(&f->task_thread.lock); goto error; } if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, 
NULL)) { pthread_cond_destroy(&f->task_thread.cond); pthread_mutex_destroy(&f->task_thread.lock); goto error; } } f->c = c; f->task_thread.ttd = &c->task_thread; f->refdir_intra = -1; f->refdir[TIP_FRAME] = 1; } for (unsigned m = 0; m < c->n_tc; m++) { Dav2dTaskContext *const t = &c->tc[m]; t->f = &c->fc[0]; t->task_thread.ttd = &c->task_thread; t->c = c; memset(t->cf_y_16bpc, 0, sizeof(t->cf_y_16bpc)); memset(t->cf_uv_16bpc, 0, sizeof(t->cf_uv_16bpc)); if (c->n_tc > 1) { if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error; if (pthread_cond_init(&t->task_thread.td.cond, NULL)) { pthread_mutex_destroy(&t->task_thread.td.lock); goto error; } if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav2d_worker_task, t)) { pthread_cond_destroy(&t->task_thread.td.cond); pthread_mutex_destroy(&t->task_thread.td.lock); goto error; } t->task_thread.td.inited = 1; } } dav2d_pal_dsp_init(&c->pal_dsp); dav2d_refmvs_dsp_init(&c->refmvs_dsp); pthread_attr_destroy(&thread_attr); return 0; error: if (c) close_internal(c_out, 0); pthread_attr_destroy(&thread_attr); return DAV2D_ERR(ENOMEM); } static struct OutputQueue *queue_append(Dav2dContext *const c, Dav2dThreadPicture *const p) { struct OutputQueue *const q = &c->dpb[c->dpb_in++]; dav2d_thread_picture_ref(&q->p, p); q->res = 0; if (c->dpb_in == c->dpb_sz) c->dpb_in = 0; assert(c->dpb_in != c->dpb_out); assert(!c->dpb_poc || c->dpb_poc != p->p.frame_hdr->frame_offset); c->dpb_poc = p->p.frame_hdr->frame_offset; return q; } static void queue_flush(Dav2dContext *const c) { if (!c->seq_hdr) return; const int nb = c->seq_hdr->order_hint_n_bits; int mask = 0; for (;;) { int cand_n = -1, cand_poc; for (int n = 0, m = 1; n < 8; n++, m <<= 1) { if (mask & m) continue; if (!c->refs[n].p.p.data[0]) continue; const Dav2dFrameHeader *const hdr = c->refs[n].p.p.frame_hdr; assert(hdr); if (!hdr->show_implicit) continue; const int ipoc = hdr->frame_offset; if (get_poc_diff(nb, ipoc, c->dpb_poc) > 0 && (cand_n == -1 || 
/* Tail of queue_flush(): second half of the candidate test (this slot's POC
 * compares lower than the current candidate's), then the per-round append. */
get_poc_diff(nb, ipoc, cand_poc) < 0))
            {
                cand_n = n;
                cand_poc = ipoc;
            }
        }
        // no slot qualified in this round: nothing left to queue
        if (cand_n == -1) break;
        queue_append(c, &c->refs[cand_n].p);
        mask |= 1 << cand_n;
    }
}

/*
 * Queue picture p for output, together with implicitly shown reference
 * pictures that belong around it in POC order.
 *
 * With c->output_invisible_frames set, p is appended directly. Otherwise:
 *   1. repeatedly append the reference-slot picture (has data, show_implicit
 *      set, not yet taken in this call) whose frame_offset compares > 0
 *      against c->dpb_poc and < 0 against the running bound, which starts at
 *      p's frame_offset; each round keeps the lowest such candidate;
 *   2. append p itself;
 *   3. repeatedly append a reference-slot picture whose frame_offset compares
 *      exactly 1 above the last queued POC.
 * (Comparisons use get_poc_diff(), defined elsewhere; presumably a signed POC
 * distance -- confirm.)
 *
 * Requires c->seq_hdr to be set (asserted).
 * Returns the ring entry created for p.
 */
struct OutputQueue *dav2d_queue_output(Dav2dContext *const c, Dav2dThreadPicture *const p) {
    assert(c->seq_hdr);
    if (c->output_invisible_frames)
        return queue_append(c, p);

    // FIXME the remainder of this code is not multi-layer-compatible yet
    const int nb = c->seq_hdr->order_hint_n_bits;
    const int poc = p->p.frame_hdr->frame_offset;
    // bit n set: reference slot n has already been queued by this call
    unsigned mask = 0;
    for (;;) {
        int cand_n = -1, cand_poc = poc;
        for (int n = 0, m = 1; n < 8; n++, m <<= 1) {
            if (mask & m) continue;
            if (!c->refs[n].p.p.data[0]) continue;
            const Dav2dFrameHeader *const hdr = c->refs[n].p.p.frame_hdr;
            assert(hdr);
            if (!hdr->show_implicit) continue;
            const int ipoc = hdr->frame_offset;
            if (get_poc_diff(nb, ipoc, c->dpb_poc) > 0 &&
                get_poc_diff(nb, ipoc, cand_poc) < 0)
            {
                cand_n = n;
                cand_poc = ipoc;
            }
        }
        if (cand_n == -1) break;
        queue_append(c, &c->refs[cand_n].p);
        mask |= 1 << cand_n;
    }
    struct OutputQueue *const q = queue_append(c, p);
    // immediately-adjacent future refs after the trigger frame
    for (;;) {
        int n, m;
        for (n = 0, m = 1; n < 8; n++, m <<= 1) {
            if (mask & m) continue;
            if (!c->refs[n].p.p.data[0]) continue;
            const Dav2dFrameHeader *const hdr = c->refs[n].p.p.frame_hdr;
            assert(hdr);
            if (!hdr->show_implicit) continue;
            const int ipoc = hdr->frame_offset;
            // c->dpb_poc was updated by the most recent queue_append()
            if (get_poc_diff(nb, ipoc, c->dpb_poc) == 1) break;
        }
        // the scan ran off the end: no adjacent picture is left
        if (n == 8) break;
        queue_append(c, &c->refs[n].p);
        mask |= 1 << n;
    }
    return q;
}

/*
 * Tell whether pic carries film-grain data that has an effect.
 *
 * Returns non-zero when pic->fgm is non-NULL and either any of
 * num_points[0..2] is non-zero, or both clip_to_restricted_range and
 * chroma_scaling_from_luma are set. Returns 0 otherwise.
 */
static int has_grain(const Dav2dPicture *const pic)
{
    const Dav2dFilmGrainData *fgdata = pic->fgm;
    return fgdata && (fgdata->num_points[0] || fgdata->num_points[1] || fgdata->num_points[2] ||
           (fgdata->clip_to_restricted_range && fgdata->chroma_scaling_from_luma));
}

/*
 * Pop the picture at the head of the output ring into *out.
 *
 * Returns:
 *   DAV2D_ERR(EAGAIN)  ring empty while not draining, or the head picture's
 *                      progress[2] is neither UINT_MAX nor FRAME_ERROR yet;
 *   DAV2D_EOF          ring empty while draining (c->drain is cleared);
 *   otherwise          the result of dav2d_apply_grain() for the head picture.
 *
 * With a single frame context (n_fc == 1) the head picture is always treated
 * as complete. While draining with several frame contexts, the function
 * blocks under c->task_thread.lock: it walks the frame contexts round-robin
 * from c->frame_thread.next, waits until each has n_tile_data == 0, and
 * re-reads the progress value after each one.
 */
static int output_image(Dav2dContext *const c, Dav2dPicture *const out)
{
    if (c->dpb_in == c->dpb_out) {
        if (!c->drain) return DAV2D_ERR(EAGAIN);
        c->drain = 0;
        return DAV2D_EOF;
    }

    // frame-threading completion condition
    struct OutputQueue *const q = &c->dpb[c->dpb_out];
    unsigned progress = c->n_fc == 1 ? UINT_MAX :
        atomic_load_explicit(&q->p.progress[2], memory_order_relaxed);
    if (c->drain && c->n_fc > 1) {
        pthread_mutex_lock(&c->task_thread.lock);
        while (progress != FRAME_ERROR && progress != UINT_MAX) {
            const unsigned next = c->frame_thread.next++;
            if (c->frame_thread.next == c->n_fc)
                c->frame_thread.next = 0;
            Dav2dFrameContext *const f = &c->fc[next];
            while (f->n_tile_data != 0) {
                pthread_cond_wait(&f->task_thread.cond,
                                  &c->task_thread.lock);
            }
            progress = atomic_load_explicit(&q->p.progress[2], memory_order_relaxed);
        }
        pthread_mutex_unlock(&c->task_thread.lock);
    }
    if (progress != FRAME_ERROR && progress != UINT_MAX)
        return DAV2D_ERR(EAGAIN);

    // consume the head entry; the read index wraps at c->dpb_sz
    c->dpb_out++;
    if (c->dpb_out == c->dpb_sz) c->dpb_out = 0;

    // FIXME if decoding had an error, report it back to the user here
    const int res = dav2d_apply_grain(c, out, &q->p.p);
    // drop the ring's own reference now that the picture was handed over
    dav2d_thread_picture_unref(&q->p);
    return res;
}

/*
 * Non-blocking check whether output_image() could deliver a picture now.
 *
 * Returns 0 when the ring is empty; otherwise non-zero when the head
 * picture's progress[2] is UINT_MAX or FRAME_ERROR (always the case with
 * n_fc == 1).
 */
static int output_picture_ready(Dav2dContext *const c) {
    if (c->dpb_out == c->dpb_in) return 0;
    struct OutputQueue *const q = &c->dpb[c->dpb_out];
    const unsigned progress = c->n_fc == 1 ?
UINT_MAX : atomic_load_explicit(&q->p.progress[2], memory_order_relaxed);
    // tail of output_picture_ready(): head picture finished or failed
    return progress == FRAME_ERROR || progress == UINT_MAX;
}

/*
 * Feed the pending input buffer c->in to dav2d_parse_obus() until a picture
 * is ready at the head of the output ring or the input is used up.
 *
 * A non-negative parse result is a byte count: the buffer is advanced by it
 * and released once empty. A negative result releases the whole buffer.
 *
 * Returns 0 when a picture is ready (even if the last parse call failed) or
 * when the input ran out; otherwise the negative parse result.
 */
static int gen_picture(Dav2dContext *const c)
{
    Dav2dData *const in = &c->in;

    if (output_picture_ready(c))
        return 0;

    while (in->sz > 0) {
        const ptrdiff_t res = dav2d_parse_obus(c, in);
        if (res < 0) {
            dav2d_data_unref_internal(in);
        } else {
            assert((size_t)res <= in->sz);
            in->sz -= res;
            in->data += res;
            if (!in->sz) dav2d_data_unref_internal(in);
        }
        // readiness is checked before the error so a finished picture wins
        if (output_picture_ready(c))
            break;
        if (res < 0)
            return (int)res;
    }

    return 0;
}

/*
 * Hand a buffer of input data to the decoder, or start draining.
 *
 * c:  decoder context (must not be NULL).
 * in: input data; NULL sets c->drain and returns 0.
 *
 * Returns:
 *   0                  data accepted (or drain requested);
 *   DAV2D_EOF          data offered while draining;
 *   DAV2D_ERR(EAGAIN)  the previously sent buffer (c->in) is still pending.
 * The size check (0 < sz <= SIZE_MAX / 2) is handed DAV2D_ERR(EINVAL).
 *
 * On acceptance the context takes its own reference (dav2d_data_ref) and the
 * caller's reference in *in is released (dav2d_data_unref).
 */
int dav2d_send_data(Dav2dContext *const c, Dav2dData *const in)
{
    validate_input_or_ret(c != NULL, DAV2D_ERR(EINVAL));
    if (!in) {
        c->drain = 1;
        return 0;
    } else if (c->drain) {
        return DAV2D_EOF;
    }
    validate_input_or_ret(in->sz > 0 && in->sz <= SIZE_MAX / 2, DAV2D_ERR(EINVAL));

    if (c->in.data)
        return DAV2D_ERR(EAGAIN);
    dav2d_data_ref(&c->in, in);
    dav2d_data_unref(in);

    return 0;
}

/*
 * Fetch the next decoded picture into *out.
 *
 * Runs gen_picture() on the pending input; a negative result is returned
 * as-is. While draining, queue_flush() first moves the remaining implicitly
 * shown reference pictures into the output ring. The final result is that
 * of output_image().
 */
int dav2d_get_picture(Dav2dContext *const c, Dav2dPicture *const out)
{
    validate_input_or_ret(c != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(out != NULL, DAV2D_ERR(EINVAL));

    int res = gen_picture(c);
    if (res < 0)
        return res;

    if (c->drain)
        queue_flush(c);
    return output_image(c, out);
}

/*
 * Produce in *out the picture `in` with film grain applied.
 *
 * When has_grain(in) is false or c->apply_grain is 0, *out simply becomes a
 * new reference to `in` and 0 is returned. Otherwise a copy is allocated
 * with dav2d_picture_alloc_copy() and grain is applied to it: through
 * dav2d_task_delayed_fg() when more than one task context exists, else
 * directly by the bit-depth specific routine.
 *
 * Returns 0 on success or the negative result of dav2d_picture_alloc_copy(),
 * in which case *out is unreferenced. A bit depth without a compiled-in
 * routine reaches abort().
 */
int dav2d_apply_grain(Dav2dContext *const c, Dav2dPicture *const out,
                      const Dav2dPicture *const in)
{
    validate_input_or_ret(c != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(out != NULL, DAV2D_ERR(EINVAL));
    validate_input_or_ret(in != NULL, DAV2D_ERR(EINVAL));

    if (!has_grain(in) || !c->apply_grain) {
        dav2d_picture_ref(out, in);
        return 0;
    }

    int res = dav2d_picture_alloc_copy(c, out, in);
    if (res < 0) goto error;

    if (c->n_tc > 1) {
        dav2d_task_delayed_fg(c, out, in);
    } else {
        switch (out->p.bpc) {
#if CONFIG_8BPC
        case 8:
            dav2d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
            break;
#endif
#if CONFIG_16BPC
        case 10:
        case 12:
            // (bpc >> 1) - 4 maps 10 -> index 1 and 12 -> index 2
            dav2d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
            break;
#endif
        default: abort();
        }
    }

    return 0;

error:
    dav2d_picture_unref_internal(out);
    return res;
}

/*
 * Reset the decoder to a clean state between streams or seeks.
 *
 * Releases the pending input, every picture still in the output ring, all 8
 * reference slots (picture, segmap, refmvs, CDF, film-grain ref) and the
 * header/metadata references, and resets the ring indices and the drain
 * flag. With a single frame and task context that is all.
 *
 * Otherwise *c->flush is raised for the duration of the rest: under
 * c->task_thread.lock the function waits until every task context reports
 * `flushed`, then clears the per-frame task lists and scheduler state; with
 * several frame contexts each one is then exited through
 * dav2d_decode_frame_exit(f, -1) and its per-frame state is reset.
 */
void dav2d_flush(Dav2dContext *const c) {
    dav2d_data_unref_internal(&c->in);
    for (int n = 0; n < c->dpb_sz; n++)
        if (c->dpb[n].p.p.data[0])
            dav2d_thread_picture_unref(&c->dpb[n].p);
    c->dpb_in = c->dpb_out = c->drain = 0;

    for (int i = 0; i < 8; i++) {
        if (c->refs[i].p.p.frame_hdr)
            dav2d_thread_picture_unref(&c->refs[i].p);
        dav2d_ref_dec(&c->refs[i].segmap);
        dav2d_ref_dec(&c->refs[i].refmvs);
        dav2d_cdf_thread_unref(&c->cdf[i]);
        dav2d_ref_dec(&c->fgm[i]);
    }
    dav2d_ref_dec(&c->ci_ref);
    c->frame_hdr = NULL;
    c->seq_hdr = NULL;
    dav2d_ref_dec(&c->seq_hdr_ref);

    c->mastering_display = NULL;
    c->content_light = NULL;
    c->itut_t35 = NULL;
    c->n_itut_t35 = 0;
    dav2d_ref_dec(&c->mastering_display_ref);
    dav2d_ref_dec(&c->content_light_ref);
    dav2d_ref_dec(&c->itut_t35_ref);

    // no worker threads and no frame threading: nothing further to reset
    if (c->n_fc == 1 && c->n_tc == 1) return;
    atomic_store(c->flush, 1);

    if (c->n_tc > 1) {
        pthread_mutex_lock(&c->task_thread.lock);
        // stop running tasks in worker threads
        for (unsigned i = 0; i < c->n_tc; i++) {
            Dav2dTaskContext *const tc = &c->tc[i];
            while (!tc->task_thread.flushed) {
                pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock);
            }
        }
        for (unsigned i = 0; i < c->n_fc; i++) {
            c->fc[i].task_thread.task_head = NULL;
            c->fc[i].task_thread.task_tail = NULL;
            c->fc[i].task_thread.task_cur_prev = NULL;
            c->fc[i].task_thread.pending_tasks.head = NULL;
            c->fc[i].task_thread.pending_tasks.tail = NULL;
            atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
        }
        atomic_init(&c->task_thread.first, 0);
        c->task_thread.cur = c->n_fc;
        atomic_store(&c->task_thread.reset_task_cur, UINT_MAX);
        atomic_store(&c->task_thread.cond_signaled, 0);
        pthread_mutex_unlock(&c->task_thread.lock);
    }

    if (c->n_fc > 1) {
        // visit the frame contexts starting at frame_thread.next, wrapping at n_fc
        for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
            if (next == c->n_fc) next = 0;
            Dav2dFrameContext *const f = &c->fc[next];
            dav2d_decode_frame_exit(f, -1);
            f->n_tile_data = 0;
            f->task_thread.retval = 0;
            f->task_thread.error = 0;
f->frame_thread.scheduled = 0; #if 0 Dav2dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.frame_hdr) { dav2d_thread_picture_unref(out_delayed); } #endif } c->frame_thread.next = 0; } atomic_store(c->flush, 0); } COLD void dav2d_close(Dav2dContext **const c_out) { validate_input(c_out != NULL); #if TRACK_HEAP_ALLOCATIONS dav2d_log_alloc_stats(*c_out); #endif close_internal(c_out, 1); } static COLD void close_internal(Dav2dContext **const c_out, int flush) { Dav2dContext *const c = *c_out; if (!c) return; if (flush) dav2d_flush(c); if (c->tc) { struct TaskThreadData *ttd = &c->task_thread; if (ttd->inited) { pthread_mutex_lock(&ttd->lock); for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++) c->tc[n].task_thread.die = 1; pthread_cond_broadcast(&ttd->cond); pthread_mutex_unlock(&ttd->lock); for (unsigned n = 0; n < c->n_tc; n++) { Dav2dTaskContext *const pf = &c->tc[n]; if (!pf->task_thread.td.inited) break; pthread_join(pf->task_thread.td.thread, NULL); pthread_cond_destroy(&pf->task_thread.td.cond); pthread_mutex_destroy(&pf->task_thread.td.lock); } pthread_cond_destroy(&ttd->delayed_fg.cond); pthread_cond_destroy(&ttd->cond); pthread_mutex_destroy(&ttd->lock); } dav2d_free_aligned(c->tc); } for (unsigned n = 0; c->fc && n < c->n_fc; n++) { Dav2dFrameContext *const f = &c->fc[n]; // clean-up threading stuff dav2d_free(f->tile_thread.lowest_pixel_mem); dav2d_free(f->frame_thread.b); dav2d_free_aligned(f->frame_thread.cbi); dav2d_free_aligned(f->frame_thread.pal_idx); dav2d_free_aligned(f->frame_thread.cf); dav2d_free(f->frame_thread.tile_start_off); dav2d_free_aligned(f->frame_thread.pal); dav2d_free(f->frame_thread.partition); if (c->n_tc > 1) { pthread_mutex_destroy(&f->task_thread.pending_tasks.lock); pthread_cond_destroy(&f->task_thread.cond); pthread_mutex_destroy(&f->task_thread.lock); } dav2d_free(f->frame_thread.frame_progress); dav2d_free(f->task_thread.tasks); dav2d_free(f->task_thread.tile_tasks[0]); 
dav2d_free_aligned(f->ts); if (f->prefilter_data_sz) dav2d_free_aligned(f->prefilter_data[0]); dav2d_free(f->a); dav2d_free(f->tile); dav2d_free(f->lf.mask); dav2d_free(f->lf.lr_mask); dav2d_free(f->lf.tx_db_right_edge[0]); dav2d_free(f->lf.start_of_tile_row); dav2d_free_aligned(f->rf.rp_proj); dav2d_free_aligned(f->lf.cdef_line_buf); dav2d_free_aligned(f->lf.lr_line_buf); } dav2d_free_aligned(c->fc); #if 0 if (c->n_fc > 1 && c->frame_thread.out_delayed) { for (unsigned n = 0; n < c->n_fc; n++) if (c->frame_thread.out_delayed[n].p.frame_hdr) dav2d_thread_picture_unref(&c->frame_thread.out_delayed[n]); dav2d_free(c->frame_thread.out_delayed); } #endif for (int n = 0; n < c->n_tile_data; n++) dav2d_data_unref_internal(&c->tile[n].data); dav2d_free(c->tile); for (int n = 0; n < 8; n++) { dav2d_cdf_thread_unref(&c->cdf[n]); if (c->refs[n].p.p.frame_hdr) dav2d_thread_picture_unref(&c->refs[n].p); dav2d_ref_dec(&c->refs[n].refmvs); dav2d_ref_dec(&c->refs[n].segmap); } dav2d_ref_dec(&c->seq_hdr_ref); dav2d_ref_dec(&c->frame_hdr_ref); dav2d_ref_dec(&c->mastering_display_ref); dav2d_ref_dec(&c->content_light_ref); dav2d_ref_dec(&c->itut_t35_ref); dav2d_mem_pool_end(c->seq_hdr_pool); dav2d_mem_pool_end(c->frame_hdr_pool); dav2d_mem_pool_end(c->segmap_pool); dav2d_mem_pool_end(c->segmap_uv_pool); dav2d_mem_pool_end(c->refmvs_pool); dav2d_mem_pool_end(c->ccsomap_pool); dav2d_mem_pool_end(c->cdf_pool); dav2d_mem_pool_end(c->fgm_pool); dav2d_mem_pool_end(c->ci_pool); dav2d_mem_pool_end(c->picture_pool); dav2d_mem_pool_end(c->pic_ctx_pool); dav2d_freep_aligned(c_out); } #if 0 int dav2d_get_event_flags(Dav2dContext *const c, enum Dav2dEventFlags *const flags) { validate_input_or_ret(c != NULL, DAV2D_ERR(EINVAL)); validate_input_or_ret(flags != NULL, DAV2D_ERR(EINVAL)); *flags = c->event_flags; c->event_flags = 0; return 0; } int dav2d_get_decode_error_data_props(Dav2dContext *const c, Dav2dDataProps *const out) { validate_input_or_ret(c != NULL, DAV2D_ERR(EINVAL)); 
validate_input_or_ret(out != NULL, DAV2D_ERR(EINVAL));

    // (disabled code) hand the cached props to the caller and reset the cache
    dav2d_data_props_unref_internal(out);
    *out = c->cached_error_props;
    dav2d_data_props_set_defaults(&c->cached_error_props);

    return 0;
}
#endif

/* Public wrapper: forwards to dav2d_picture_unref_internal(). */
void dav2d_picture_unref(Dav2dPicture *const p) {
    dav2d_picture_unref_internal(p);
}

/* Public wrapper: forwards buf and sz to dav2d_data_create_internal() and
 * returns its result. */
uint8_t *dav2d_data_create(Dav2dData *const buf, const size_t sz) {
    return dav2d_data_create_internal(buf, sz);
}

/* Public wrapper: forwards all arguments unchanged to
 * dav2d_data_wrap_internal() and returns its result. */
int dav2d_data_wrap(Dav2dData *const buf, const uint8_t *const ptr,
                    const size_t sz,
                    void (*const free_callback)(const uint8_t *data,
                                                void *user_data),
                    void *const user_data)
{
    return dav2d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
}

/* Public wrapper: forwards all arguments unchanged to
 * dav2d_data_wrap_user_data_internal() and returns its result. */
int dav2d_data_wrap_user_data(Dav2dData *const buf,
                              const uint8_t *const user_data,
                              void (*const free_callback)(const uint8_t *user_data,
                                                          void *cookie),
                              void *const cookie)
{
    return dav2d_data_wrap_user_data_internal(buf,
                                              user_data,
                                              free_callback,
                                              cookie);
}

/* Public wrapper: forwards to dav2d_data_unref_internal(). */
void dav2d_data_unref(Dav2dData *const buf) {
    dav2d_data_unref_internal(buf);
}

/* Public wrapper: forwards to dav2d_data_props_unref_internal(). */
void dav2d_data_props_unref(Dav2dDataProps *const props) {
    dav2d_data_props_unref_internal(props);
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/log.c000066400000000000000000000037301517466257200215630ustar00rootroot00000000000000
/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include "dav2d/dav2d.h" #include "common/validate.h" #include "src/internal.h" #include "src/log.h" #if CONFIG_LOG COLD void dav2d_log_default_callback(void *const cookie, const char *const format, va_list ap) { vfprintf(stderr, format, ap); } COLD void dav2d_log(Dav2dContext *const c, const char *const format, ...) { assert(c != NULL); if (!c->logger.callback) return; va_list ap; va_start(ap, format); c->logger.callback(c->logger.cookie, format, ap); va_end(ap); } #endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/log.h000066400000000000000000000035121517466257200215660ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LOG_H #define DAV2D_SRC_LOG_H #include "config.h" #include #include "dav2d/dav2d.h" #include "common/attributes.h" #if CONFIG_LOG #define dav2d_log dav2d_log void dav2d_log_default_callback(void *cookie, const char *format, va_list ap); void dav2d_log(Dav2dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3); #else #define dav2d_log_default_callback NULL #define dav2d_log(...) do { } while(0) #endif #endif /* DAV2D_SRC_LOG_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/000077500000000000000000000000001517466257200226075ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/cdef.S000066400000000000000000002100621517466257200236350ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Loongson Technology Corporation Limited * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/loongarch/loongson_asm.S" // static int cdef_find_dir_lsx(const pixel *img, const ptrdiff_t stride, // unsigned *const var HIGHBD_DECL_SUFFIX) // param: img: a0, stride: a1, var: a2 function cdef_find_dir_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 li.d a3, 128 vreplgr2vr.w vr31, a3 // hv: vr0-vr3 diag: vr4-vr11 alt: vr12-vr23 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, \ vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr20, vr21, vr22, vr23 vxor.v \i, \i, \i .endr .CFDL01: // 8 // 0 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vadd.w vr4, vr4, vr24 //diag[0][y+x] vadd.w vr5, vr5, vr25 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr12, vr12, vr26 vadd.w vr12, vr12, vr27 //alt[0][y+(x>>1)] vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr0, a3, 0 //hv[0][y] vadd.w vr15, vr15, vr26 vadd.w vr15, vr15, vr27 //alt[1][3+y-(x>>1)] vpermi.w vr15, vr15, 0x1b vadd.w vr9, vr9, vr24 vadd.w vr8, vr8, vr25 vpermi.w vr8, vr8, 0x1b vpermi.w vr9, vr9, 0x1b //diag[1][7+y-x] vxor.v vr28, vr28, vr28 vxor.v vr29, vr29, vr29 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 vextrins.w vr18, vr28, 0x30 vshuf4i.w vr19, vr28, 0x39 vextrins.w vr19, vr29, 0x30 vshuf4i.w vr20, vr29, 0x39 //alt[2][3-(y>>1)+7] vinsgr2vr.w vr20, zero, 3 vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vadd.w vr21, vr21, vr24 vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x] add.d a0, a0, a1 // 1 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 
vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsrl.v vr28, vr4, 4 //1-4 vbsrl.v vr29, vr5, 4 //5-8 vextrins.w vr28, vr5, 0x30 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vbsll.v vr5, vr29, 4 vextrins.w vr5, vr28, 0x03 vextrins.w vr6, vr29, 0x03 vextrins.w vr28, vr4, 0x30 vshuf4i.w vr4, vr28, 0x93 vbsrl.v vr28, vr12, 4 vextrins.w vr28, vr13, 0x30 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.w vr13, vr28, 0x03 vextrins.w vr28, vr12, 0x30 vshuf4i.w vr12, vr28, 0x93 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr0, a3, 1 //hv[0][y] vbsrl.v vr28, vr15, 4 vextrins.w vr28, vr16, 0x30 vpermi.w vr28, vr28, 0x1b vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vextrins.w vr16, vr28, 0x00 vextrins.w vr28, vr15, 0x00 vshuf4i.w vr15, vr28, 0x6c vbsrl.v vr28, vr8, 4 //4321 vbsrl.v vr29, vr9, 4 //8765 vextrins.w vr28, vr9, 0x30 vpermi.w vr28, vr28, 0x1b vpermi.w vr29, vr29, 0x1b vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] vextrins.w vr10, vr29, 0x00 vextrins.w vr29, vr28, 0x00 vshuf4i.w vr9, vr29, 0x6c vextrins.w vr28, vr8, 0x00 vshuf4i.w vr8, vr28, 0x6c vbsll.v vr28, vr19, 4 vextrins.w vr28, vr18, 0x03 vbsll.v vr29, vr20, 4 vextrins.w vr29, vr19, 0x03 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] vextrins.w vr18, vr28, 0x30 vextrins.w vr28, vr29, 0x00 vshuf4i.w vr19, vr28, 0x39 vbsrl.v vr20, vr29, 4 vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vadd.w vr21, vr21, vr24 vadd.w vr22, vr22, vr25 //alt[3][(y>>1)+x] add.d a0, a0, a1 // 2 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 
0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsrl.v vr28, vr4, 8 vbsrl.v vr29, vr5, 8 vextrins.d vr28, vr5, 0x10 //2-5 vextrins.d vr29, vr6, 0x10 //6-9 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vextrins.d vr4, vr28, 0x10 vextrins.d vr5, vr28, 0x01 vextrins.d vr5, vr29, 0x10 vextrins.d vr6, vr29, 0x01 vbsrl.v vr28, vr12, 8 vextrins.d vr28, vr13, 0x10 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.d vr12, vr28, 0x10 vextrins.d vr13, vr28, 0x01 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr0, a3, 2 //hv[0][y] vbsrl.v vr28, vr15, 8 vextrins.d vr28, vr16, 0x10 vpermi.w vr28, vr28, 0x1b vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vpermi.w vr28, vr28, 0x1b vextrins.d vr15, vr28, 0x10 vextrins.d vr16, vr28, 0x01 vbsrl.v vr28, vr8, 8 vextrins.d vr28, vr9, 0x10 vbsrl.v vr29, vr9, 8 vextrins.d vr29, vr10, 0x10 vpermi.w vr28, vr28, 0x1b //5432 vpermi.w vr29, vr29, 0x1b //9876 vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 vpermi.w vr28, vr28, 0x1b vpermi.w vr29, vr29, 0x1b vextrins.d vr8, vr28, 0x10 vextrins.d vr9, vr28, 0x01 vextrins.d vr9, vr29, 0x10 vextrins.d vr10, vr29, 0x01 //diag[1][7+y-x] vbsrl.v vr28, vr18, 8 vextrins.d vr28, vr19, 0x10 //2345 vbsrl.v vr29, vr19, 8 vextrins.d vr29, vr20, 0x10 //6789 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 vextrins.d vr18, vr28, 0x10 vextrins.d vr19, vr28, 0x01 vextrins.d vr19, vr29, 0x10 vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+7] vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsrl.v vr28, vr21, 4 vextrins.w vr28, vr22, 0x30 //1234 vbsrl.v vr29, 
vr22, 4 //5678 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] vextrins.w vr23, vr29, 0x03 vextrins.w vr29, vr28, 0x33 vshuf4i.w vr22, vr29, 0x93 vextrins.w vr28, vr21, 0x30 vshuf4i.w vr21, vr28, 0x93 add.d a0, a0, a1 // 3 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsll.v vr28, vr5, 4 vextrins.w vr28, vr4, 0x03 //3456 vbsll.v vr29, vr6, 4 vextrins.w vr29, vr5, 0x03 //78910 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vextrins.w vr4, vr28, 0x30 vextrins.w vr28, vr29, 0x00 vshuf4i.w vr5, vr28, 0x39 vbsrl.v vr6, vr29, 4 vbsll.v vr28, vr13, 4 vextrins.w vr28, vr12, 0x03 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.w vr12, vr28, 0x30 vbsrl.v vr13, vr28, 4 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr0, a3, 3 //hv[0][y] vbsll.v vr28, vr16, 4 vextrins.w vr28, vr15, 0x03 vpermi.w vr28, vr28, 0x1b //6543 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vextrins.w vr15, vr28, 0x33 vshuf4i.w vr16, vr28, 0xc6 vinsgr2vr.w vr16, zero, 3 vbsll.v vr28, vr9, 4 vextrins.w vr28, vr8, 0x03 //3456 vbsll.v vr29, vr10, 4 vextrins.w vr29, vr9, 0x03 //78910 vpermi.w vr28, vr28, 0x1b //6543 vpermi.w vr29, vr29, 0x1b //10987 vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] vextrins.w vr8, vr28, 0x33 vextrins.w vr28, vr29, 0x33 vshuf4i.w vr9, vr28, 0xc6 vshuf4i.w vr10, vr29, 0xc6 vinsgr2vr.w vr10, zero, 3 vbsrl.v vr28, vr18, 8 vextrins.d vr28, vr19, 0x10 //2345 vbsrl.v vr29, vr19, 8 vextrins.d vr29, vr20, 0x10 //6789 vadd.w vr28, vr28, vr24 vadd.w vr29, 
vr29, vr25 vextrins.d vr18, vr28, 0x10 vextrins.d vr19, vr28, 0x01 vextrins.d vr19, vr29, 0x10 vextrins.d vr20, vr29, 0x01 //alt[2][3-(y>>1)+7] vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsrl.v vr28, vr21, 4 vextrins.w vr28, vr22, 0x30 //1234 vbsrl.v vr29, vr22, 4 //5678 vextrins.w vr29, vr23, 0x30 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] vextrins.w vr23, vr29, 0x03 vextrins.w vr29, vr28, 0x33 vshuf4i.w vr22, vr29, 0x93 vextrins.w vr28, vr21, 0x30 vshuf4i.w vr21, vr28, 0x93 add.d a0, a0, a1 // 4 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vadd.w vr5, vr5, vr24 //diag[0][y+x] vadd.w vr6, vr6, vr25 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr13, vr13, vr26 vadd.w vr13, vr13, vr27 //alt[0][y+(x>>1)] vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr1, a3, 0 //hv[0][y] vpermi.w vr16, vr16, 0x1b vadd.w vr16, vr16, vr26 vadd.w vr16, vr16, vr27 //alt[1][3+y-(x>>1)] vpermi.w vr16, vr16, 0x1b vpermi.w vr9, vr9, 0x1b vpermi.w vr10, vr10, 0x1b vadd.w vr10, vr10, vr24 vadd.w vr9, vr9, vr25 vpermi.w vr9, vr9, 0x1b vpermi.w vr10, vr10, 0x1b //diag[1][7+y-x] vbsrl.v vr28, vr18, 4 vextrins.w vr28, vr19, 0x30 //1234 vbsrl.v vr29, vr19, 4 vextrins.w vr29, vr20, 0x30 //5678 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] vextrins.w vr20, vr29, 0x03 vextrins.w vr29, vr28, 0x33 vshuf4i.w vr19, vr29, 0x93 vbsll.v vr18, vr28, 4 vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsrl.v vr28, vr21, 8 vextrins.d vr28, vr22, 0x10 vbsrl.v vr29, vr22, 8 vextrins.d vr29, vr23, 0x10 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 
vextrins.d vr21, vr28, 0x10 vextrins.d vr22, vr28, 0x01 vextrins.d vr22, vr29, 0x10 vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x] add.d a0, a0, a1 // 5 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsrl.v vr28, vr5, 4 //5-8 vbsrl.v vr29, vr6, 4 //9-12 vextrins.w vr28, vr6, 0x30 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vextrins.w vr7, vr29, 0x03 vextrins.w vr29, vr28, 0x33 vshuf4i.w vr6, vr29, 0x93 vextrins.w vr28, vr5, 0x30 vshuf4i.w vr5, vr28, 0x93 vbsrl.v vr28, vr13, 4 vextrins.w vr28, vr14, 0x30 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.w vr14, vr28, 0x03 vextrins.w vr28, vr13, 0x30 vshuf4i.w vr13, vr28, 0x93 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr1, a3, 1 //hv[0][y] vbsrl.v vr28, vr16, 4 vextrins.w vr28, vr17, 0x30 vpermi.w vr28, vr28, 0x1b vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vextrins.w vr17, vr28, 0x00 vextrins.w vr28, vr16, 0x00 vshuf4i.w vr16, vr28, 0x6c vbsrl.v vr28, vr9, 4 vbsrl.v vr29, vr10, 4 vextrins.w vr28, vr10, 0x30 vpermi.w vr28, vr28, 0x1b //8-5 vpermi.w vr29, vr29, 0x1b //12-9 vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] vextrins.w vr11, vr29, 0x00 vextrins.w vr29, vr28, 0x00 vshuf4i.w vr10, vr29, 0x6c vextrins.w vr28, vr9, 0x00 vshuf4i.w vr9, vr28, 0x6c vbsrl.v vr28, vr18, 4 vextrins.w vr28, vr19, 0x30 //1234 vbsrl.v vr29, vr19, 4 vextrins.w vr29, vr20, 0x30 //5678 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[2][3-(y>>1)+7] vextrins.w vr20, vr29, 0x03 vextrins.w vr29, vr28, 0x33 vshuf4i.w 
vr19, vr29, 0x93 vbsll.v vr18, vr28, 4 vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsrl.v vr28, vr21, 8 vextrins.d vr28, vr22, 0x10 vbsrl.v vr29, vr22, 8 vextrins.d vr29, vr23, 0x10 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 vextrins.d vr21, vr28, 0x10 vextrins.d vr22, vr28, 0x01 vextrins.d vr22, vr29, 0x10 vextrins.d vr23, vr29, 0x01 //alt[3][(y>>1)+x] add.d a0, a0, a1 // 6 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsrl.v vr28, vr5, 8 vbsrl.v vr29, vr6, 8 vextrins.d vr28, vr6, 0x10 //6-9 vextrins.d vr29, vr7, 0x10 //10-13 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vextrins.d vr5, vr28, 0x10 vextrins.d vr6, vr28, 0x01 vextrins.d vr6, vr29, 0x10 vextrins.d vr7, vr29, 0x01 vbsrl.v vr28, vr13, 8 vextrins.d vr28, vr14, 0x10 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.d vr13, vr28, 0x10 vextrins.d vr14, vr28, 0x01 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr1, a3, 2 //hv[0][y] vbsrl.v vr28, vr16, 8 vextrins.d vr28, vr17, 0x10 vpermi.w vr28, vr28, 0x1b vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vpermi.w vr28, vr28, 0x1b vextrins.d vr16, vr28, 0x10 vextrins.d vr17, vr28, 0x01 vbsrl.v vr28, vr9, 8 vextrins.d vr28, vr10, 0x10 vbsrl.v vr29, vr10, 8 vextrins.d vr29, vr11, 0x10 vpermi.w vr28, vr28, 0x1b //9876 vpermi.w vr29, vr29, 0x1b //13-10 vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 vpermi.w vr28, vr28, 0x1b vpermi.w vr29, vr29, 0x1b vextrins.d vr9, vr28, 0x10 vextrins.d vr10, vr28, 0x01 vextrins.d vr10, vr29, 0x10 vextrins.d vr11, vr29, 
0x01 //diag[1][7+y-x] vadd.w vr18, vr18, vr24 //0123 vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+7] vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsll.v vr28, vr22, 4 vextrins.w vr28, vr21, 0x03 //3456 vbsll.v vr29, vr23, 4 vextrins.w vr29, vr22, 0x03 //78910 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] vextrins.w vr21, vr28, 0x30 vextrins.w vr28, vr29, 0x00 vshuf4i.w vr22, vr28, 0x39 vbsrl.v vr23, vr29, 4 add.d a0, a0, a1 // 7 fld.d f24, a0, 0 //img vpermi.w vr25, vr24, 0x01 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr24, vr24, 0 vsllwil.hu.bu vr25, vr25, 0 vsllwil.hu.bu vr25, vr25, 0 vsub.w vr24, vr24, vr31 //px vsub.w vr25, vr25, vr31 vbsll.v vr28, vr6, 4 vextrins.w vr28, vr5, 0x03 //78910 vbsll.v vr29, vr7, 4 vextrins.w vr29, vr6, 0x03 //11-14 vadd.w vr28, vr28, vr24 //diag[0][y+x] vadd.w vr29, vr29, vr25 vextrins.w vr5, vr28, 0x30 vextrins.w vr28, vr29, 0x00 vshuf4i.w vr6, vr28, 0x39 vbsrl.v vr7, vr29, 4 vbsll.v vr28, vr14, 4 vextrins.w vr28, vr13, 0x03 vpackev.w vr26, vr25, vr24 vpackod.w vr27, vr25, vr24 vpermi.w vr26, vr26, 0xd8 //px0246 vpermi.w vr27, vr27, 0xd8 //px1357 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[0][y+(x>>1)] vextrins.w vr13, vr28, 0x30 vbsrl.v vr14, vr28, 4 vhaddw.d.w vr28, vr24, vr24 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr25, vr25 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr1, a3, 3 //hv[0][y] vbsll.v vr28, vr17, 4 vextrins.w vr28, vr16, 0x03 vpermi.w vr28, vr28, 0x1b //10987 vadd.w vr28, vr28, vr26 vadd.w vr28, vr28, vr27 //alt[1][3+y-(x>>1)] vextrins.w vr16, vr28, 0x33 vshuf4i.w vr17, vr28, 0xc6 vinsgr2vr.w vr17, zero, 3 vbsll.v vr28, vr10, 4 vextrins.w vr28, vr9, 0x03 //7-10 vbsll.v vr29, vr11, 4 vextrins.w vr29, vr10, 0x03 //11-14 vpermi.w vr28, vr28, 0x1b //10-7 vpermi.w vr29, vr29, 0x1b //14-11 vadd.w vr29, vr29, vr24 vadd.w vr28, vr28, vr25 //diag[1][7+y-x] vextrins.w vr9, vr28, 0x33 vextrins.w vr28, vr29, 0x33 
vshuf4i.w vr10, vr28, 0xc6 vshuf4i.w vr11, vr29, 0xc6 vinsgr2vr.w vr11, zero, 3 vadd.w vr18, vr18, vr24 //0123 vadd.w vr19, vr19, vr25 //4567 alt[2][3-(y>>1)+7] vadd.w vr2, vr2, vr24 vadd.w vr3, vr3, vr25 //hv[1][x] vbsll.v vr28, vr22, 4 vextrins.w vr28, vr21, 0x03 //3456 vbsll.v vr29, vr23, 4 vextrins.w vr29, vr22, 0x03 //78910 vadd.w vr28, vr28, vr24 vadd.w vr29, vr29, vr25 //alt[3][(y>>1)+x] vextrins.w vr21, vr28, 0x30 vextrins.w vr28, vr29, 0x00 vshuf4i.w vr22, vr28, 0x39 vbsrl.v vr23, vr29, 4 add.d a0, a0, a1 vxor.v vr24, vr24, vr24 //unsigned cost[8] vxor.v vr25, vr25, vr25 vmul.w vr26, vr0, vr0 vmul.w vr27, vr1, vr1 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vhaddw.d.w vr28, vr27, vr27 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vmul.w vr26, vr2, vr2 vmul.w vr27, vr3, vr3 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 vhaddw.d.w vr28, vr27, vr27 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a5, vr28, 0 add.d a4, a4, a5 li.d a6, 105 mul.w a3, a3, a6 mul.w a4, a4, a6 vinsgr2vr.w vr24, a3, 2 vinsgr2vr.w vr25, a4, 2 vxor.v vr30, vr30, vr30 //div_table vxor.v vr31, vr31, vr31 li.d t0, 840 vinsgr2vr.w vr30, t0, 0 li.d t0, 420 vinsgr2vr.w vr30, t0, 1 li.d t0, 280 vinsgr2vr.w vr30, t0, 2 li.d t0, 210 vinsgr2vr.w vr30, t0, 3 li.d t0, 168 vinsgr2vr.w vr31, t0, 0 li.d t0, 140 vinsgr2vr.w vr31, t0, 1 li.d t0, 120 vinsgr2vr.w vr31, t0, 2 vbsll.v vr27, vr7, 4 vextrins.w vr27, vr6, 0x03 vpermi.w vr27, vr27, 0x1b vmul.w vr26, vr4, vr4 vmadd.w vr26, vr27, vr27 vmul.w vr26, vr26, vr30 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a3, vr28, 0 vbsll.v vr27, vr6, 4 vpermi.w vr27, vr27, 0x1b vmul.w vr26, vr5, vr5 vmadd.w vr26, vr27, vr27 vmul.w vr26, vr26, vr31 vextrins.w vr26, vr31, 0x33 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 //cost[0] vbsll.v vr27, vr11, 4 vextrins.w vr27, vr10, 0x03 vpermi.w vr27, vr27, 
0x1b vmul.w vr26, vr8, vr8 vmadd.w vr26, vr27, vr27 vmul.w vr26, vr26, vr30 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 vbsll.v vr27, vr10, 4 vpermi.w vr27, vr27, 0x1b vmul.w vr26, vr9, vr9 vmadd.w vr26, vr27, vr27 vmul.w vr26, vr26, vr31 vextrins.w vr26, vr31, 0x33 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a5, vr28, 0 add.d a4, a4, a5 //cost[4] vpickve2gr.w a5, vr5, 3 mul.w a5, a5, a5 mul.w a5, a5, a6 add.w a3, a3, a5 vinsgr2vr.w vr24, a3, 0 vpickve2gr.w a5, vr9, 3 mul.w a5, a5, a5 mul.w a5, a5, a6 add.w a4, a4, a5 vinsgr2vr.w vr25, a4, 0 //n=0 vpickve2gr.w a3, vr24, 1 vmul.w vr26, vr13, vr13 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 vpickve2gr.w a5, vr12, 3 mul.w a5, a5, a5 add.d a3, a3, a4 add.d a3, a3, a5 mul.w a3, a3, a6 //*cost_ptr vextrins.w vr29, vr30, 0x01 vextrins.w vr29, vr30, 0x13 vextrins.w vr29, vr31, 0x21 vextrins.w vr29, vr31, 0x33 vbsll.v vr27, vr14, 4 vpermi.w vr27, vr27, 0x1b vmul.w vr28, vr12, vr12 vextrins.w vr28, vr31, 0x33 vmadd.w vr28, vr27, vr27 vmul.w vr26, vr28, vr29 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr24, a3, 1 //n=1 vpickve2gr.w a3, vr24, 3 vmul.w vr26, vr16, vr16 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 vpickve2gr.w a5, vr15, 3 mul.w a5, a5, a5 add.d a3, a3, a4 add.d a3, a3, a5 mul.w a3, a3, a6 //*cost_ptr vbsll.v vr27, vr17, 4 vpermi.w vr27, vr27, 0x1b vmul.w vr28, vr15, vr15 vextrins.w vr28, vr31, 0x33 vmadd.w vr28, vr27, vr27 vmul.w vr26, vr28, vr29 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 add.d a3, a3, a4 vinsgr2vr.w vr24, a3, 3 //n=2 vpickve2gr.w a3, vr25, 1 vmul.w vr26, vr19, vr19 vhaddw.d.w vr28, vr26, vr26 vhaddw.q.d vr28, vr28, vr28 vpickve2gr.d a4, vr28, 0 vpickve2gr.w a5, vr18, 3 mul.w a5, a5, a5 add.d a3, a3, a4 add.d a3, a3, a5 mul.w a3, a3, a6 //*cost_ptr vbsll.v 
// cdef_fill tmp, stride, w, h
//
// Fills a rectangle of 16-bit entries with the padding value.
//   \tmp    - byte address of the first entry; advanced by one row per
//             iteration, so on exit it points \h rows further down
//   \stride - row pitch in 16-bit entries (added twice per row to get bytes)
//   \w      - entries per row; handled as (w >> 3) full vectors followed by
//             the 4-, 2- and 1-entry remainders selected by the low bits
//   \h      - row count; nothing is written when it is zero
// The vector/double/single stores take their data from vr18 (f18 is its low
// part), which the callers in this file splat with -16384 beforehand; only the
// single-entry tail loads the constant -16384 itself.
// Clobbers t0, t1 and s6.
.macro cdef_fill tmp, stride, w, h
    beqz            \h,     700f            //h == 0: nothing to do
    or              t0,     zero,   zero    //y
100:
    or              t1,     zero,   zero    //xx: byte offset inside the row
    srai.d          s6,     \w,     3       //x: number of 8-entry groups
    beqz            s6,     300f
200:
    vstx            vr18,   \tmp,   t1      //8 entries (16 bytes)
    addi.d          t1,     t1,     16
    addi.d          s6,     s6,     -1
    bnez            s6,     200b
300:
    andi            s6,     \w,     4
    beqz            s6,     400f
    fstx.d          f18,    \tmp,   t1      //4 entries (8 bytes)
    addi.d          t1,     t1,     8
400:
    andi            s6,     \w,     2
    beqz            s6,     500f
    fstx.s          f18,    \tmp,   t1      //2 entries (4 bytes)
    addi.d          t1,     t1,     4
500:
    andi            s6,     \w,     1
    beqz            s6,     600f
    li.w            s6,     -16384
    stx.h           s6,     \tmp,   t1      //1 entry (2 bytes)
    addi.d          t1,     t1,     2
600:
    add.d           \tmp,   \tmp,   \stride //next row: stride entries
    add.d           \tmp,   \tmp,   \stride //  = 2 * stride bytes
    addi.d          t0,     t0,     1
    blt             t0,     \h,     100b
700:
.endm
t0 addi.d t0, t0, 2 addi.d t4, t4, 1 9: add.d a3, a3, a1 addi.d t7, t7, 1 bnez t7, 4b 90: // y < h beqz s1, 12f beqz t5, 12f or t7, zero, zero //y 10: or t4, t5, t5 //data index x 11: slli.d t3, t7, 1 addi.d t3, t3, 2 add.d t3, t3, t4 ldx.bu t1, a2, t3 mul.w t3, t7, s5 add.d t3, t3, t4 slli.d t3, t3, 1 stx.h t1, s4, t3 addi.d t4, t4, 1 bnez t4, 11b addi.d t7, t7, 1 bne t7, s1, 10b 12: // y = 0 ; y < h or s0, s4, s4 beqz s1, 20f or s6, a0, a0 or t7, zero, zero //y srai.d t4, t6, 3 //loop max 13: or t0, zero, zero //loop param or t3, t0, t0 //data index src or t1, t0, t0 //data index tmp beqz t4, 16f 15: // /8 fldx.d f18, s6, t3 vsllwil.hu.bu vr18, vr18, 0 vstx vr18, s0, t1 addi.d t3, t3, 8 addi.d t1, t1, 16 addi.d t0, t0, 1 blt t0, t4, 15b 16: // &4 andi t0, t6, 4 beqz t0, 17f fldx.s f18, s6, t3 vsllwil.hu.bu vr18, vr18, 0 fstx.d f18, s0, t1 addi.d t3, t3, 4 addi.d t1, t1, 8 17: // &2 andi t0, t6, 2 beqz t0, 19f ldx.bu t2, s6, t3 stx.h t2, s0, t1 addi.d t3, t3, 1 addi.d t1, t1, 2 ldx.bu t2, s6, t3 stx.h t2, s0, t1 addi.d t3, t3, 1 addi.d t1, t1, 2 19: // src+ tmp+ add.d s6, s6, a1 add.d s0, s0, s5 add.d s0, s0, s5 addi.d t7, t7, 1 blt t7, s1, 13b // y = h ; y < y_end 20: beq s1, t8, 27f or t7, s1, s1 //y sub.d t4, t6, t5 srai.d t4, t4, 3 add.d t4, t4, t5 //8 loop max 21: or t0, t5, t5 //xx or t3, t0, t0 //data index bottom slli.d t1, t0, 1 //data index tmp beq t5, t4, 23f 22: // /8 fldx.d f18, a4, t3 vsllwil.hu.bu vr18, vr18, 0 vstx vr18, s0, t1 addi.d t3, t3, 8 addi.d t1, t1, 16 addi.d t0, t0, 1 blt t0, t4, 22b 23: // &4 sub.d t0, t6, t5 andi t0, t0, 4 beqz t0, 24f fldx.s f18, a4, t3 vsllwil.hu.bu vr18, vr18, 0 fstx.d f18, s0, t1 addi.d t3, t3, 4 addi.d t1, t1, 8 24: // &2 sub.d t0, t6, t5 andi t0, t0, 2 beqz t0, 26f ldx.bu t2, a4, t3 stx.h t2, s0, t1 addi.d t3, t3, 1 addi.d t1, t1, 2 ldx.bu t2, a4, t3 stx.h t2, s0, t1 addi.d t3, t3, 1 addi.d t1, t1, 2 26: // bottom+ tmp+ add.d a4, a4, a1 add.d s0, s0, s5 add.d s0, s0, s5 addi.d t7, t7, 1 blt t7, t8, 21b 27: // 
// cdef_sec_init
//
// Set-up for the "secondary strength only" filter loops.
// Inputs:
//   a6 - sec_strength
//   a7 - dir
//   s7 - damping
//   s1 - row count of the block (moved to t2 before s1 is reused)
// Outputs:
//   vr18 - sec_strength replicated to every halfword
//   vr19 - sec_shift = damping - (31 - clz(sec_strength)), replicated
//   t2   - loop counter for the row loop that follows
//   s1   - byte offset for off02 (entry dir+4, first byte of the table pair)
//   s2   - byte offset for off12 (entry dir+4, second byte)
//   t0   - byte offset for off03 (entry dir+0, first byte)
//   s3   - byte offset for off13 (entry dir+0, second byte)
// dav2d_cdef_directions stores two signed bytes per entry, hence the index is
// doubled before the lookup; the fetched offsets are doubled again because the
// scratch buffer they index holds 16-bit entries.
// Clobbers t3.
.macro cdef_sec_init
    clz.w           t3,     a6
    li.w            t2,     31
    sub.w           t3,     t2,     t3      //31 - clz(sec_strength)
    sub.w           t3,     s7,     t3      //sec_shift

    vreplgr2vr.h    vr18,   a6              //sec_strength
    vreplgr2vr.h    vr19,   t3              //sec_shift

    or              t2,     s1,     s1      //dowhile loop param

    addi.d          s2,     a7,     4
    slli.d          s2,     s2,     1       //directions dir+4
    slli.d          s3,     a7,     1       //directions dir+0

    la.local        t0,     dav2d_cdef_directions
    add.d           s2,     t0,     s2
    ld.b            s1,     s2,     0       //off02
    ld.b            s2,     s2,     1       //off12

    add.d           s3,     t0,     s3
    ld.b            t0,     s3,     0       //off03 (table base no longer needed)
    ld.b            s3,     s3,     1       //off13

    slli.d          s1,     s1,     1       //entry offsets -> byte offsets
    slli.d          s2,     s2,     1
    slli.d          t0,     t0,     1
    slli.d          s3,     s3,     1
.endm
//static NOINLINE void cdef_filter_block_lsx
//    (pixel *dst, const ptrdiff_t dst_stride,
//     const pixel (*left)[2], const pixel *const top,
//     const int pri_strength, const int sec_strength,
//     const int dir, const int damping, const int w, int h,
//     const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
// w=4 h=4
//param: dst:a0, dst_stride:a1, left:a2, top:a3, bottom:a4, pri_strength:a5
//sec_strength:a6, dir:a7, damping:s7, w:s0, h:s1, edges:s2
//
// damping and edges are read from the caller's stack (sp+0 and sp+8); w and h
// are constants of this entry point.
// Frame: 64 bytes for s0-s7 plus a 288-byte scratch area (tmp). s4 points into
// tmp two rows and two columns in, rows being s5 = 12 16-bit entries apart.
// Flow: pad the missing borders of tmp, copy the available pixels into tmp
// (cdef_padding_data), then run one of three row loops depending on which of
// pri_strength / sec_strength is non-zero.
function cdef_filter_block_4x4_8bpc_lsx
    ld.w            t0,     sp,     0       //damping (stack argument)
    ld.w            t1,     sp,     8       //edges (stack argument)
    addi.d          sp,     sp,     -(64+288)
    st.d            s0,     sp,     0
    st.d            s1,     sp,     8
    st.d            s2,     sp,     16
    st.d            s3,     sp,     24
    st.d            s4,     sp,     32
    st.d            s5,     sp,     40
    st.d            s6,     sp,     48
    st.d            s7,     sp,     56

    li.w            s0,     4               //w
    li.w            s1,     4               //h
    or              s2,     t1,     t1      //edges
    or              s7,     t0,     t0      //damping

    li.d            s5,     12              //tmp_stride
    addi.d          s4,     sp,     64
    slli.d          t0,     s5,     1
    addi.d          t0,     t0,     2
    slli.d          t0,     t0,     1       //(2 * tmp_stride + 2) entries, in bytes
    add.d           s4,     s4,     t0      //ptr tmp

    // Constants shared with the helper macros.
    vxor.v          vr23,   vr23,   vr23    //0 in every halfword
    li.w            t2,     1
    vreplgr2vr.h    vr20,   t2              //1 in every halfword
    vaddi.hu        vr21,   vr20,   2       //3 in every halfword
    vaddi.hu        vr22,   vr20,   1       //2 in every halfword

    li.w            t0,     -16384
    vreplgr2vr.h    vr18,   t0              //padding

    li.w            t5,     -2              //x_start
    addi.d          t6,     s0,     2       //x_end
    li.w            t7,     -2              //y_start
    addi.d          t8,     s1,     2       //y_end
    li.w            t2,     2               //border thickness used by the fills

    // edges & 4 (CDEF_HAVE_TOP) clear: pad the two rows above, y_start = 0
    andi            t4,     s2,     4
    bnez            t4,     1f
    slli.d          t3,     s5,     2       //two tmp rows in bytes
    addi.d          t4,     s4,     -4      //two entries to the left
    sub.d           t4,     t4,     t3
    addi.d          t3,     s0,     4       //w + 4 entries per row
    cdef_fill       t4,     s5,     t3,     t2
    or              t7,     zero,   zero

1:  // edges & 8 (CDEF_HAVE_BOTTOM) clear: pad the two rows below, y_end -= 2
    andi            t4,     s2,8
    bnez            t4,     2f
    mul.w           t3,     s1,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3      //row h of tmp
    addi.d          t4,     t4,     -4
    li.d            t3,     8               //w + 4 entries per row
    cdef_fill       t4,     s5,     t3,     t2
    addi.d          t8,     t8,     -2

2:  // edges & 1 (CDEF_HAVE_LEFT) clear: pad the two columns on the left, x_start = 0
    andi            t4,     s2,1
    bnez            t4,     3f
    mul.w           t3,     t7,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3      //row y_start of tmp
    addi.d          t4,     t4,     -4
    sub.d           t3,     t8,     t7      //y_end - y_start rows
    cdef_fill       t4,     s5,     t2,     t3
    or              t5,     zero,   zero

3:  // edges & 2 (CDEF_HAVE_RIGHT) clear: pad the two columns on the right, x_end -= 2
    andi            t4,     s2,2
    bnez            t4,     40f
    mul.w           t3,     t7,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3
    addi.d          t4,     t4,     8       //column w (4 entries * 2 bytes)
    sub.d           t3,     t8,     t7
    cdef_fill       t4,     s5,     t2,     t3
    addi.d          t6,     t6,     -2

40:
    cdef_padding_data

    beqz            a5,     33f             //pri_strength == 0: secondary only

28: //if (pri_strength)
    li.w            t0,     4
    andi            t1,     a5,     1
    sub.d           t0,     t0,     t1      //pri_tap = 4 - (pri_strength & 1)

    clz.w           t1,     a5
    li.d            t2,     31              //also consumed by cdef_pri_sec_init
    sub.w           t1,     t2,     t1      //31 - clz(pri_strength)
    sub.w           t1,     s7,     t1      //damping - that value
    blt             t1,     zero,   281f
    or              t1,     t1,     t1
    b               282f
281:
    or              t1,     zero,   zero    //t1: pri_shift (negative clamped to 0)
282:
    beqz            a6,     31f             //sec_strength == 0: primary only

29: //if (sec_strength)
    cdef_pri_sec_init

30: // primary + secondary: one destination row per iteration
    fld.s           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0       //bytes -> halfwords
    vpermi.w        vr0,    vr0,    0x44    //repeat the 4 pixels in both halves

    vxor.v          vr1,    vr1,    vr1     //sum
    vor.v           vr2,    vr0,    vr0     //max
    vor.v           vr3,    vr0,    vr0     //min
    vor.v           vr15,   vr4,    vr4     //pri_tap_k

    sub.d           t4,     s4,     a2
    sub.d           t5,     s4,     a3
    fldx.d          f5,     s4,     a2      //p0_00
    fld.d           f6,     t4,     0       //p0_01
    fldx.d          f7,     s4,     a3      //p0_10
    fld.d           f8,     t5,     0       //p0_11

    cdef_process_data_w4    vr9,    vr10
    cdef_calc_sum_tapchange_w4
    cdef_calc_maxmin_w4

    sub.d           t4,     s4,     s1      //tmp[-off02]
    sub.d           t5,     s4,     t0      //tmp[-off03]
    fldx.d          f5,     s4,     s1      //s0_00
    fld.d           f6,     t4,     0       //s0_01
    fldx.d          f7,     s4,     t0      //s0_02
    fld.d           f8,     t5,     0       //s0_03

    cdef_process_data_w4    vr18,   vr19
    cdef_calc_sum_no_tapchange_w4   vr22
    cdef_calc_maxmin_w4

    sub.d           t4,     s4,     s2      //tmp[-off12]
    sub.d           t5,     s4,     s3      //tmp[-off13]
    fldx.d          f5,     s4,     s2      //s0_10
    fld.d           f6,     t4,     0       //s0_11
    fldx.d          f7,     s4,     s3      //s0_12
    fld.d           f8,     t5,     0       //s0_13

    cdef_process_data_w4    vr18,   vr19
    cdef_calc_sum_no_tapchange_w4   vr20
    cdef_calc_maxmin_w4

    // fold the upper 64 bits onto the lower 64 bits
    vshuf4i.w       vr5,    vr1,    0x0e
    vshuf4i.w       vr6,    vr3,    0x0e
    vshuf4i.w       vr7,    vr2,    0x0e
    vadd.h          vr1,    vr1,    vr5
    vmin.hu         vr3,    vr6,    vr3
    vmax.h          vr2,    vr7,    vr2

    cdef_calc_dst
    iclip_vrh       vr5,    vr3,    vr2,    vr16,   vr17,   vr5
    vsrlni.b.h      vr5,    vr5,    0       //halfwords -> bytes
    fst.s           f5,     a0,     0       //store 4 pixels

    add.d           a0,     a0,     a1      //next dst row
    add.d           s4,     s4,     s5      //next tmp row
    add.d           s4,     s4,     s5      //  (2 * tmp_stride bytes)

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     30b
    b               35f

31: // pri_strength only
    cdef_pri_init

32:
    fld.s           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0
    vpermi.w        vr0,    vr0,    0x44

    vxor.v          vr1,    vr1,    vr1     //sum
    vor.v           vr15,   vr4,    vr4     //pri_tap_k

    sub.d           t4,     s4,     a2
    sub.d           t5,     s4,     a3
    fldx.d          f5,     s4,     a2      //p0_00
    fld.d           f6,     t4,     0       //p0_01
    fldx.d          f7,     s4,     a3      //p0_10
    fld.d           f8,     t5,     0       //p0_11

    cdef_process_data_w4    vr9,    vr10
    cdef_calc_sum_tapchange_w4

    vshuf4i.w       vr5,    vr1,    0x0e
    vadd.h          vr1,    vr1,    vr5

    cdef_calc_dst
    vsrlni.b.h      vr5,    vr5,    0
    fst.s           f5,     a0,     0

    add.d           a0,     a0,     a1
    add.d           s4,     s4,     s5
    add.d           s4,     s4,     s5

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     32b
    b               35f

33: // sec_strength only
    cdef_sec_init

34:
    fld.s           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0
    vpermi.w        vr0,    vr0,    0x44
    vxor.v          vr1,    vr1,    vr1     //sum

    sub.d           t4,     s4,     s1      //tmp[-off02]
    sub.d           t5,     s4,     t0      //tmp[-off03]
    fldx.d          f5,     s4,     s1      //s0_00
    fld.d           f6,     t4,     0       //s0_01
    fldx.d          f7,     s4,     t0      //s0_02
    fld.d           f8,     t5,     0       //s0_03

    cdef_process_data_w4    vr18,   vr19
    cdef_calc_sum_no_tapchange_w4   vr22

    sub.d           t4,     s4,     s2      //tmp[-off12]
    sub.d           t5,     s4,     s3      //tmp[-off13]
    fldx.d          f5,     s4,     s2      //s0_10
    fld.d           f6,     t4,     0       //s0_11
    fldx.d          f7,     s4,     s3      //s0_12
    fld.d           f8,     t5,     0       //s0_13

    cdef_process_data_w4    vr18,   vr19
    cdef_calc_sum_no_tapchange_w4   vr20

    vshuf4i.w       vr5,    vr1,    0x0e
    vadd.h          vr1,    vr1,    vr5

    cdef_calc_dst
    vsrlni.b.h      vr5,    vr5,    0
    fst.s           f5,     a0,     0

    add.d           a0,     a0,     a1
    add.d           s4,     s4,     s5
    add.d           s4,     s4,     s5

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     34b

35: // restore the saved registers and release the frame
    ld.d            s0,     sp,     0
    ld.d            s1,     sp,     8
    ld.d            s2,     sp,     16
    ld.d            s3,     sp,     24
    ld.d            s4,     sp,     32
    ld.d            s5,     sp,     40
    ld.d            s6,     sp,     48
    ld.d            s7,     sp,     56
    addi.d          sp,     sp,     (64+288)
endfunc
vr15, vr4, vr4 //pri_tap_k sub.d t4, s4, a2 sub.d t5, s4, a3 fldx.d f5, s4, a2 //p0_00 fld.d f6, t4, 0 //p0_01 fldx.d f7, s4, a3 //p0_10 fld.d f8, t5, 0 //p0_11 cdef_process_data_w4 vr9, vr10 cdef_calc_sum_tapchange_w4 cdef_calc_maxmin_w4 sub.d t4, s4, s1 //tmp[-off02] sub.d t5, s4, t0 //tmp[-off03] fldx.d f5, s4, s1 //s0_00 fld.d f6, t4, 0 //s0_01 fldx.d f7, s4, t0 //s0_02 fld.d f8, t5, 0 //s0_03 cdef_process_data_w4 vr18, vr19 cdef_calc_sum_no_tapchange_w4 vr22 cdef_calc_maxmin_w4 sub.d t4, s4, s2 //tmp[-off12] sub.d t5, s4, s3 //tmp[-off13] fldx.d f5, s4, s2 //s0_10 fld.d f6, t4, 0 //s0_11 fldx.d f7, s4, s3 //s0_12 fld.d f8, t5, 0 //s0_13 cdef_process_data_w4 vr18, vr19 cdef_calc_sum_no_tapchange_w4 vr20 cdef_calc_maxmin_w4 vshuf4i.w vr5, vr1, 0x0e vshuf4i.w vr6, vr3, 0x0e vshuf4i.w vr7, vr2, 0x0e vadd.h vr1, vr1, vr5 vmin.hu vr3, vr6, vr3 vmax.h vr2, vr7, vr2 cdef_calc_dst iclip_vrh vr5, vr3, vr2, vr16, vr17, vr5 vsrlni.b.h vr5, vr5, 0 fst.s f5, a0, 0 add.d a0, a0, a1 add.d s4, s4, s5 add.d s4, s4, s5 addi.d t2, t2, -1 blt zero, t2, 30b b 35f 31: // pri_strength only cdef_pri_init 32: fld.s f0, a0, 0 //px vsllwil.hu.bu vr0, vr0, 0 vpermi.w vr0, vr0, 0x44 vxor.v vr1, vr1, vr1 //sum vor.v vr15, vr4, vr4 //pri_tap_k sub.d t4, s4, a2 sub.d t5, s4, a3 fldx.d f5, s4, a2 //p0_00 fld.d f6, t4, 0 //p0_01 fldx.d f7, s4, a3 //p0_10 fld.d f8, t5, 0 //p0_11 cdef_process_data_w4 vr9, vr10 cdef_calc_sum_tapchange_w4 vshuf4i.w vr5, vr1, 0x0e vadd.h vr1, vr1, vr5 cdef_calc_dst vsrlni.b.h vr5, vr5, 0 fst.s f5, a0, 0 add.d a0, a0, a1 add.d s4, s4, s5 add.d s4, s4, s5 addi.d t2, t2, -1 blt zero, t2, 32b b 35f 33: // sec_strength only cdef_sec_init 34: fld.s f0, a0, 0 //px vsllwil.hu.bu vr0, vr0, 0 vpermi.w vr0, vr0, 0x44 vxor.v vr1, vr1, vr1 //sum sub.d t4, s4, s1 //tmp[-off02] sub.d t5, s4, t0 //tmp[-off03] fldx.d f5, s4, s1 //s0_00 fld.d f6, t4, 0 //s0_01 fldx.d f7, s4, t0 //s0_02 fld.d f8, t5, 0 //s0_03 cdef_process_data_w4 vr18, vr19 cdef_calc_sum_no_tapchange_w4 vr22 sub.d 
// cdef_filter_block_8x8_8bpc_lsx: same structure as the 4x4/4x8 entry points
// with w = 8, h = 8.
// Register arguments follow the comment on the 4x4 variant: a0 dst,
// a1 dst_stride, a2 left, a3 top, a4 bottom, a5 pri_strength, a6 sec_strength,
// a7 dir. damping and edges are read from the caller's stack (sp+0, sp+8).
// Each row is 8 pixels, so the taps are loaded as full 128-bit vectors and the
// *_w8 helper macros are used; no half-vector folding is needed before
// cdef_calc_dst.
function cdef_filter_block_8x8_8bpc_lsx
    ld.w            t0,     sp,     0       //damping (stack argument)
    ld.w            t1,     sp,     8       //edges (stack argument)
    addi.d          sp,     sp,     -(64+288)
    st.d            s0,     sp,     0
    st.d            s1,     sp,     8
    st.d            s2,     sp,     16
    st.d            s3,     sp,     24
    st.d            s4,     sp,     32
    st.d            s5,     sp,     40
    st.d            s6,     sp,     48
    st.d            s7,     sp,     56

    li.w            s0,     8               //w
    li.w            s1,     8               //h
    or              s2,     t1,     t1      //edges
    or              s7,     t0,     t0      //damping

    // cdef_filter_block_kernel
    li.d            s5,     12              //tmp_stride
    addi.d          s4,     sp,     64
    slli.d          t0,     s5,     1
    addi.d          t0,     t0,     2
    slli.d          t0,     t0,     1       //(2 * tmp_stride + 2) entries, in bytes
    add.d           s4,     s4,     t0      //ptr tmp

    // Constants shared with the helper macros.
    vxor.v          vr23,   vr23,   vr23    //0 in every halfword
    li.w            t2,     1
    vreplgr2vr.h    vr20,   t2              //1 in every halfword
    vaddi.hu        vr21,   vr20,   2       //3 in every halfword
    vaddi.hu        vr22,   vr20,   1       //2 in every halfword

    li.w            t0,     -16384
    vreplgr2vr.h    vr18,   t0              //padding

    li.w            t5,     -2              //x_start
    addi.d          t6,     s0,     2       //x_end
    li.w            t7,     -2              //y_start
    addi.d          t8,     s1,     2       //y_end
    li.w            t2,     2               //border thickness used by the fills

    // edges & 4 (CDEF_HAVE_TOP) clear: pad the two rows above, y_start = 0
    andi            t4,     s2,     4
    bnez            t4,     1f
    slli.d          t3,     s5,     2       //two tmp rows in bytes
    addi.d          t4,     s4,     -4      //two entries to the left
    sub.d           t4,     t4,     t3
    addi.d          t3,     s0,     4       //w + 4 entries per row
    cdef_fill       t4,     s5,     t3,     t2
    or              t7,     zero,   zero

1:  // edges & 8 (CDEF_HAVE_BOTTOM) clear: pad the two rows below, y_end -= 2
    andi            t4,     s2,8
    bnez            t4,     2f
    mul.w           t3,     s1,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3      //row h of tmp
    addi.d          t4,     t4,     -4
    li.d            t3,     12              //w + 4 entries per row
    cdef_fill       t4,     s5,     t3,     t2
    addi.d          t8,     t8,     -2

2:  // edges & 1 (CDEF_HAVE_LEFT) clear: pad the two columns on the left, x_start = 0
    andi            t4,     s2,1
    bnez            t4,     3f
    mul.w           t3,     t7,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3      //row y_start of tmp
    addi.d          t4,     t4,     -4
    sub.d           t3,     t8,     t7      //y_end - y_start rows
    li.d            t2,     2
    cdef_fill       t4,     s5,     t2,     t3
    or              t5,     zero,   zero

3:  // edges & 2 (CDEF_HAVE_RIGHT) clear: pad the two columns on the right, x_end -= 2
    andi            t4,     s2,2
    bnez            t4,     40f
    mul.w           t3,     t7,     s5
    slli.d          t3,     t3,     1
    add.d           t4,     s4,     t3
    addi.d          t4,     t4,     16      //column w (8 entries * 2 bytes)
    sub.d           t3,     t8,     t7
    li.d            t2,     2
    cdef_fill       t4,     s5,     t2,     t3
    addi.d          t6,     t6,     -2

40:
    cdef_padding_data

    beqz            a5,     33f             //pri_strength == 0: secondary only

28: //if (pri_strength)
    li.w            t0,     4
    andi            t1,     a5,     1
    sub.d           t0,     t0,     t1      //pri_tap = 4 - (pri_strength & 1)

    //edit
    clz.w           t1,     a5
    li.d            t2,     31              //also consumed by cdef_pri_sec_init
    sub.w           t3,     t2,     t1      //31 - clz(pri_strength)
    sub.w           t3,     s7,     t3      //damping - that value
    or              t1,     zero,   zero    //t1: pri_shift, 0 unless t3 >= 0
    blt             t3,     zero,   281f
    or              t1,     t3,     t3
281:
    beqz            a6,     31f             //sec_strength == 0: primary only

29: //if (sec_strength)
    cdef_pri_sec_init

301: // primary + secondary: one destination row per iteration
    fld.d           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0       //bytes -> halfwords

    vxor.v          vr1,    vr1,    vr1     //sum
    vor.v           vr2,    vr0,    vr0     //max
    vor.v           vr3,    vr0,    vr0     //min
    vor.v           vr15,   vr4,    vr4     //pri_tap_k

    sub.d           t4,     s4,     a2
    sub.d           t5,     s4,     a3
    vldx            vr5,    s4,     a2
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     a3
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr9,    vr10
    cdef_calc_sum_tapchange_w8
    cdef_calc_maxmin_w8

    //s 00-03
    sub.d           t4,     s4,     s1      //tmp[-off02]
    sub.d           t5,     s4,     t0      //tmp[-off03]
    vldx            vr5,    s4,     s1
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     t0
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr18,   vr19
    cdef_calc_sum_no_tapchange_w8   vr22
    cdef_calc_maxmin_w8

    //s 10-13
    sub.d           t4,     s4,     s2      //tmp[-off12]
    sub.d           t5,     s4,     s3      //tmp[-off13]
    vldx            vr5,    s4,     s2
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     s3
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr18,   vr19
    cdef_calc_sum_no_tapchange_w8   vr20
    cdef_calc_maxmin_w8

    cdef_calc_dst
    iclip_vrh       vr5,    vr3,    vr2,    vr16,   vr17,   vr5
    vsrlni.b.h      vr5,    vr5,    0       //halfwords -> bytes
    fst.d           f5,     a0,     0       //store 8 pixels

    add.d           a0,     a0,     a1      //next dst row
    add.d           s4,     s4,     s5      //next tmp row
    add.d           s4,     s4,     s5      //  (2 * tmp_stride bytes)

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     301b
    b               35f

31: // pri_strength only
    cdef_pri_init

32:
    fld.d           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0

    vxor.v          vr1,    vr1,    vr1     //sum
    vor.v           vr15,   vr4,    vr4     //pri_tap_k

    sub.d           t4,     s4,     a2
    sub.d           t5,     s4,     a3
    vldx            vr5,    s4,     a2
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     a3
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr9,    vr10
    cdef_calc_sum_tapchange_w8

    cdef_calc_dst
    vsrlni.b.h      vr5,    vr5,    0
    fst.d           f5,     a0,     0

    add.d           a0,     a0,     a1
    add.d           s4,     s4,     s5
    add.d           s4,     s4,     s5

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     32b
    b               35f

33: // sec_strength only
    cdef_sec_init

34:
    fld.d           f0,     a0,     0       //px
    vsllwil.hu.bu   vr0,    vr0,    0
    vxor.v          vr1,    vr1,    vr1     //sum

    sub.d           t4,     s4,     s1      //tmp[-off02]
    sub.d           t5,     s4,     t0      //tmp[-off03]
    vldx            vr5,    s4,     s1
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     t0
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr18,   vr19
    cdef_calc_sum_no_tapchange_w8   vr22

    sub.d           t4,     s4,     s2      //tmp[-off12]
    sub.d           t5,     s4,     s3      //tmp[-off13]
    vldx            vr5,    s4,     s2
    vld             vr6,    t4,     0
    vldx            vr7,    s4,     s3
    vld             vr8,    t5,     0

    cdef_process_data_w8    vr18,   vr19
    cdef_calc_sum_no_tapchange_w8   vr20

    cdef_calc_dst
    vsrlni.b.h      vr5,    vr5,    0
    fst.d           f5,     a0,     0

    add.d           a0,     a0,     a1
    add.d           s4,     s4,     s5
    add.d           s4,     s4,     s5

    addi.d          t2,     t2,     -1
    blt             zero,   t2,     34b

35: // restore the saved registers and release the frame
    ld.d            s0,     sp,     0
    ld.d            s1,     sp,     8
    ld.d            s2,     sp,     16
    ld.d            s3,     sp,     24
    ld.d            s4,     sp,     32
    ld.d            s5,     sp,     40
    ld.d            s6,     sp,     48
    ld.d            s7,     sp,     56
    addi.d          sp,     sp,     (64+288)
endfunc
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LOONGARCH_CDEF_H #define DAV2D_SRC_LOONGARCH_CDEF_H #include "config.h" #include "src/cdef.h" #include "src/cpu.h" decl_cdef_dir_fn(BF(dav2d_cdef_find_dir, lsx)); decl_cdef_fn(BF(dav2d_cdef_filter_block_4x4, lsx)); decl_cdef_fn(BF(dav2d_cdef_filter_block_4x8, lsx)); decl_cdef_fn(BF(dav2d_cdef_filter_block_8x8, lsx)); static ALWAYS_INLINE void cdef_dsp_init_loongarch(Dav2dCdefDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; #if BITDEPTH == 8 c->dir = BF(dav2d_cdef_find_dir, lsx); c->fb[0] = BF(dav2d_cdef_filter_block_8x8, lsx); c->fb[1] = BF(dav2d_cdef_filter_block_4x8, lsx); c->fb[2] = BF(dav2d_cdef_filter_block_4x4, lsx); #endif } #endif /* DAV2D_SRC_LOONGARCH_CDEF_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/cpu.c000066400000000000000000000036531517466257200235510ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "common/attributes.h" #include "src/cpu.h" #include "src/loongarch/cpu.h" #if HAVE_GETAUXVAL #include #define LA_HWCAP_LSX ( 1 << 4 ) #define LA_HWCAP_LASX ( 1 << 5 ) #endif COLD unsigned dav2d_get_cpu_flags_loongarch(void) { unsigned flags = dav2d_get_default_cpu_flags(); #if HAVE_GETAUXVAL unsigned long hw_cap = dav2d_getauxval(AT_HWCAP); flags |= (hw_cap & LA_HWCAP_LSX) ? DAV2D_LOONGARCH_CPU_FLAG_LSX : 0; flags |= (hw_cap & LA_HWCAP_LASX) ? DAV2D_LOONGARCH_CPU_FLAG_LASX : 0; #endif return flags; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/cpu.h000066400000000000000000000031601517466257200235470ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_LOONGARCH_CPU_H
#define DAV2D_SRC_LOONGARCH_CPU_H

/*
 * LoongArch CPU feature bits. Each enumerator occupies a distinct bit, so
 * values can be ORed into one `unsigned` mask and tested with `&`.
 */
enum CpuFlags {
    DAV2D_LOONGARCH_CPU_FLAG_LSX = 1 << 0,  /* bit 0 */
    DAV2D_LOONGARCH_CPU_FLAG_LASX = 1 << 1, /* bit 1 */
};

/* Returns a mask built from the CpuFlags bits above. */
unsigned dav2d_get_cpu_flags_loongarch(void);

#endif /* DAV2D_SRC_LOONGARCH_CPU_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/deblock.h000066400000000000000000000043041517466257200243640ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LOONGARCH_DEBLOCK_H #define DAV2D_SRC_LOONGARCH_DEBLOCK_H #include "src/cpu.h" #include "src/deblock.h" decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_y, lsx)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_y, lsx)); decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_uv, lsx)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_uv, lsx)); static ALWAYS_INLINE void deblock_dsp_init_loongarch(Dav2dDeblockDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; #if BITDEPTH == 8 c->deblock_sb[0][0] = BF(dav2d_lpf_h_sb_y, lsx); c->deblock_sb[0][1] = BF(dav2d_lpf_v_sb_y, lsx); c->deblock_sb[1][0] = BF(dav2d_lpf_h_sb_uv, lsx); c->deblock_sb[1][1] = BF(dav2d_lpf_v_sb_uv, lsx); #endif } #endif /* DAV2D_SRC_LOONGARCH_DEBLOCK_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/ipred.S000066400000000000000000003726461517466257200240600ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Loongson Technology 
Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/loongarch/loongson_asm.S"

// ipred_dc_gen topleft, width, height
// Sums the \width bytes starting at \topleft+1 and the \height bytes ending
// at \topleft-1, adds (width+height)/2 for rounding, divides by
// (width+height) and leaves the result in t0.
// Clobbers t0-t4 and vr0.
.macro ipred_dc_gen topleft, width, height
    add.d t0, \width, \height //dc
    srai.d t0, t0, 1
    addi.d t3, \topleft,1
    or t1, zero, zero //data index
    srai.d t2, \width, 4 //loop param
    beqz t2, 2f
1: // width/16
    vldx vr0, t3, t1
    // widening pairwise adds reduce the 16 bytes to a single sum
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vhaddw.qu.du vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    addi.d t1, t1, 16
    addi.d t2, t2, -1
    bnez t2, 1b
    b 4f
2: // &8
    andi t2, \width, 8
    beqz t2, 3f
    // clear vr0 so the lanes above the 8 loaded bytes add nothing
    vxor.v vr0, vr0, vr0
    fldx.d f0, t3, t1
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    addi.d t1, t1, 8
    b 4f
3: // &4
    andi t2, \width, 4
    beqz t2, 4f
    vxor.v vr0, vr0, vr0
    fldx.s f0, t3, t1
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vpickve2gr.wu t4, vr0, 0
    add.d t0, t0, t4
    addi.d t1, t1, 4
4:
    // second half: walk backwards from \topleft over \height bytes
    addi.d t3, \topleft,0
    srai.d t2, \height, 4 //loop param
    beqz t2, 8f
7: // height/16
    addi.d t3, t3, -16
    vld vr0, t3, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vhaddw.qu.du vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    addi.d t2, t2, -1
    bnez t2, 7b
    b 10f
8: // &8
    andi t2, \height, 8
    beqz t2, 9f
    addi.d t3, t3, -8
    vxor.v vr0, vr0, vr0
    fld.d f0, t3, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    b 10f
9: // &4
    andi t2, \height, 4
    beqz t2, 10f
    addi.d t3, t3, -4
    vxor.v vr0, vr0, vr0
    fld.s f0, t3, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vpickve2gr.wu t4, vr0, 0
    add.d t0, t0, t4
10:
    // shift out the power-of-two factor of (width + height)
    add.d t1, \width, \height
    ctz.w t1, t1
    sra.w t0, t0, t1
    // w != h
    // a non-square block still needs the remaining division, done as a
    // multiply by a 16.16 reciprocal: 0x5556 ~= 65536/3, 0x3334 ~= 65536/5.
    // 0x3334 is chosen when 2*height < width or 2*width < height.
    beq \width, \height, 16f
    add.d t2, \height, \height
    add.d t3, \width, \width
    slt t2, t2, \width
    slt t3, t3, \height
    or t2, t2, t3
    li.w t3, 0x3334
    maskeqz t1, t3, t2
    li.w t3, 0x5556
    masknez t2, t3, t2
    or t1, t1, t2
    mul.w t0, t0, t1
    srai.w t0, t0, 16
16:
.endm

// ipred_splat_dc dst, stride, width, height, dc
// Writes \height rows starting at \dst, \stride bytes apart, each filled
// with the byte value \dc. The row store is picked by testing \width
// against 64, 32, 16 in turn, falling back to 8 bytes; widths <= 4 store
// 4 bytes per row.
// \height is counted down to zero. Clobbers t1, t2, t4, t5 and vr0.
.macro ipred_splat_dc dst, stride, width, height, dc
    li.w t1, 4
    blt t1, \width, 2f
    // replicate dc into all four bytes of a word
    li.w t1, 0x01010101
    mulw.d.wu t1, \dc, t1
    beqz \height, 7f
    or t2, \dst, \dst
1: // width <= 4
    st.w t1, t2, 0
    add.d t2, t2, \stride
    addi.d \height, \height, -1
    bnez \height, 1b
    b 7f
2: //width > 4
    // replicate dc into every byte of vr0
    li.d t1, 0x0101010101010101
    mul.d t1, \dc, t1
    vreplgr2vr.d vr0, t1
    or t4, \dst, \dst
    beqz \height, 7f
3:
    andi t5, \width, 64
    beqz t5, 4f
    vst vr0, t4, 0
    vst vr0, t4, 16
    vst vr0, t4, 32
    vst vr0, t4, 48
    b 6f
4:
    andi t5, \width, 32
    beqz t5, 41f
    vst vr0, t4, 0
    vst vr0, t4, 16
    b 6f
41:
    andi t5, \width, 16
    beqz t5, 5f
    vst vr0, t4, 0
    b 6f
5:
    fst.d f0, t4, 0
6:
    add.d t4, t4, \stride
    addi.d \height, \height, -1
    bnez \height, 3b
7:
.endm

// ipred_dc_gen_top topleft, width
// Rounded average of the \width bytes starting at \topleft+1, left in t0.
// Clobbers t0-t3 and vr0.
.macro ipred_dc_gen_top topleft, width
    srai.d t0, \width, 1
    addi.d t1, \topleft,1
    srai.d t2, \width, 4
    beqz t2, 2f
1:
    vld vr0, t1, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vhaddw.qu.du vr0, vr0, vr0
    vpickve2gr.du t3, vr0, 0
    add.d t0, t0, t3
    addi.d t1, t1, 16
    addi.d t2, t2, -1
    bnez t2, 1b
    b 4f
2: // &8
    andi t2, \width, 8
    beqz t2, 3f
    vxor.v vr0, vr0, vr0
    fld.d f0, t1, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vpickve2gr.du t2, vr0, 0
    add.d t0, t0, t2
    addi.d t1, t1, 8
    b 4f
3: // &4
    andi t2, \width, 4
    beqz t2, 4f
    vxor.v vr0, vr0, vr0
    fld.s f0, t1, 0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vpickve2gr.du t2, vr0, 0
    add.d t0, t0, t2
    addi.d t1, t1, 4
4:
    // divide by width via its trailing-zero count
    ctz.w t1, \width
    sra.w t0, t0, t1
.endm

// ipred_dc_gen_left topleft, height
// Rounded average of the \height bytes ending at \topleft-1, left in t0.
// The \topleft register itself is moved backwards while summing.
// Clobbers t0, t1, t2, t4 and vr0.
.macro ipred_dc_gen_left topleft, height
    srai.d t0, \height, 1
    srai.d t2, \height, 4 //loop param
    beqz t2, 8f
7: // height/16
    addi.d \topleft,\topleft,-16
    vld vr0, \topleft,0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vhaddw.qu.du vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    addi.d t2, t2, -1
    bnez t2, 7b
    b 10f
8: // &8
    andi t2, \height, 8
    beqz t2, 9f
    addi.d \topleft,\topleft,-8
    vxor.v vr0, vr0, vr0
    fld.d f0, \topleft,0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vhaddw.du.wu vr0, vr0, vr0
    vpickve2gr.du t4, vr0, 0
    add.d t0, t0, t4
    b 10f
9: // &4
    andi t2, \height, 4
    beqz t2, 10f
    addi.d \topleft,\topleft,-4
    vxor.v vr0, vr0, vr0
    fld.s f0, \topleft,0
    vhaddw.hu.bu vr0, vr0, vr0
    vhaddw.wu.hu vr0, vr0, vr0
    vpickve2gr.wu t4, vr0, 0
    add.d t0, t0, t4
10:
    // divide by height via its trailing-zero count
    ctz.w t1, \height
    sra.w t0, t0, t1
.endm

// void ipred_dc_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height
//                   HIGHBD_DECL_SUFFIX)
// Registers used: a0 = dst, a1 = stride, a2 = topleft, a3 = width,
// a4 = height. The DC value is produced in t0 and splatted over the block.
function ipred_dc_8bpc_lsx
    ipred_dc_gen a2, a3, a4
    ipred_splat_dc a0, a1, a3, a4, t0
endfunc

// void ipred_dc_128_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
// Fills the block with the constant 128; a2 (topleft) is not read.
function ipred_dc_128_8bpc_lsx
    li.w t0, 128
    ipred_splat_dc a0, a1, a3, a4, t0
endfunc

// void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
//                     const pixel *const topleft,
//                     const int width, const int height, const int a,
//                     const int max_width, const int max_height
//                     HIGHBD_DECL_SUFFIX)
// DC value comes from the a3 bytes after a2 only.
function ipred_dc_top_8bpc_lsx
    ipred_dc_gen_top a2, a3
    ipred_splat_dc a0, a1, a3, a4, t0
endfunc

// void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const topleft,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
// DC value comes from the a4 bytes before a2 only.
function ipred_dc_left_8bpc_lsx
    ipred_dc_gen_left a2, a4
    ipred_splat_dc a0, a1, a3, a4, t0
endfunc

// pixel_set_8bpc dst_ptr, src_ptr, width
// Broadcasts the byte at \src_ptr and stores it to \dst_ptr. \width is
// tested against 64, 32, 16, 8, 4 in that order and only the first match
// stores, so one call writes exactly that many bytes.
// Clobbers a5 and vr0.
.macro pixel_set_8bpc dst_ptr, src_ptr, width
    vldrepl.b vr0, \src_ptr, 0
1:
    andi a5, \width, 64
    beqz a5, 2f
    vst vr0, \dst_ptr, 0
    vst vr0, \dst_ptr, 16
    vst vr0, \dst_ptr, 32
    vst vr0, \dst_ptr, 48
    b 6f
2:
    andi a5, \width, 32
    beqz a5, 3f
    vst vr0, \dst_ptr, 0
    vst vr0, \dst_ptr, 16
    b 6f
3:
    andi a5, \width, 16
    beqz a5, 4f
    vst vr0, \dst_ptr, 0
    b 6f
4:
    andi a5, \width, 8
    beqz a5, 5f
    fst.d f0, \dst_ptr, 0
    b 6f
5:
    andi a5, \width, 4
    beqz a5, 6f
    fst.s f0, \dst_ptr, 0
6:
.endm

// void
ipred_h_c(pixel *dst, const ptrdiff_t stride,
//                const pixel *const topleft,
//                const int width, const int height, const int a,
//                const int max_width, const int max_height
//                HIGHBD_DECL_SUFFIX)
// NOTE(review): the first line above is the tail of a `// void ...`
// prototype comment that begins on the preceding line of this file.
// Registers used: a0 = dst, a1 = stride, a2 = topleft, a3 = width,
// a4 = height. Each row is filled with the byte one position further
// back from a2 than the previous row.
function ipred_h_8bpc_lsx
    beqz a4, .IPRED_H_END
.IPRED_H_LOOP:
    addi.d a2, a2, -1
    pixel_set_8bpc a0, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_H_LOOP
.IPRED_H_END:
endfunc

// pixel_copy_8bpc dst_ptr, src_ptr, width
// Copies bytes from \src_ptr to \dst_ptr. \width is tested against
// 64, 32, 16, 8, 4 in that order and only the first match copies.
// Clobbers a5 and vr0-vr3.
.macro pixel_copy_8bpc dst_ptr, src_ptr, width
1:
    andi a5, \width, 64
    beqz a5, 2f
    vld vr0, \src_ptr, 0
    vld vr1, \src_ptr, 16
    vld vr2, \src_ptr, 32
    vld vr3, \src_ptr, 48
    vst vr0, \dst_ptr, 0
    vst vr1, \dst_ptr, 16
    vst vr2, \dst_ptr, 32
    vst vr3, \dst_ptr, 48
    b 6f
2:
    andi a5, \width, 32
    beqz a5, 3f
    vld vr0, \src_ptr, 0
    vld vr1, \src_ptr, 16
    vst vr0, \dst_ptr, 0
    vst vr1, \dst_ptr, 16
    b 6f
3:
    andi a5, \width, 16
    beqz a5, 4f
    vld vr0, \src_ptr, 0
    vst vr0, \dst_ptr, 0
    b 6f
4:
    andi a5, \width, 8
    beqz a5, 5f
    fld.d f0, \src_ptr, 0
    fst.d f0, \dst_ptr, 0
    b 6f
5:
    andi a5, \width, 4
    beqz a5, 6f
    fld.s f0, \src_ptr, 0
    fst.s f0, \dst_ptr, 0
6:
.endm

// void ipred_v_lsx(pixel *dst, const ptrdiff_t stride,
//                  const pixel *const topleft,
//                  const int width, const int height, const int a,
//                  const int max_width, const int max_height
//                  HIGHBD_DECL_SUFFIX)
// Copies the a3 bytes starting at a2+1 into each of the a4 output rows.
function ipred_v_8bpc_lsx
    beqz a4, .IPRED_V_END
    addi.d a2, a2, 1
.IPRED_V_LOOP:
    pixel_copy_8bpc a0, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_V_LOOP
.IPRED_V_END:
endfunc

// void ipred_paeth_lsx(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const tl_ptr,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
// Per pixel, with topleft = tl_ptr[0], left = tl_ptr[-1-y], top = tl_ptr[1+x]:
//   tdiff  = |topleft - left|
//   ldiff  = |topleft - top|
//   tldiff = |2*topleft - (left + top)|
//   out    = (ldiff <= tdiff && ldiff <= tldiff) ? left
//          : (tdiff <= tldiff)                   ? top : topleft
// vr0 = topleft, vr1 = left (reloaded per row), vr2/vr9 = low/high 8 top
// pixels, all widened to 16-bit lanes. The row width is dispatched on
// a3 & 64/32/16/8/4, first match only; there is no height == 0 guard.
function ipred_paeth_8bpc_lsx
    vldrepl.b vr0, a2, 0 //topleft
    vsllwil.hu.bu vr0, vr0, 0
    or a6, a2, a2
    addi.d a7, a2, 1
.IPRED_PAETH_H_LOOP:
    addi.d a6, a6, -1
    vldrepl.b vr1, a6, 0 //left
    vsllwil.hu.bu vr1, vr1, 0
.IPRED_PAETH_W_LOOP64:
    andi a5, a3, 64
    beqz a5, .IPRED_PAETH_W_LOOP32
    // pixels 0..15
    vld vr2, a7, 0 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 0
    // pixels 16..31
    vld vr2, a7, 16 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 16
    // pixels 32..47
    vld vr2, a7, 32 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 32
    // pixels 48..63
    vld vr2, a7, 48 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 48
    b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP32:
    andi a5, a3, 32
    beqz a5, .IPRED_PAETH_W_LOOP16
    // pixels 0..15
    vld vr2, a7, 0 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 0
    // pixels 16..31
    vld vr2, a7, 16 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 16
    b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP16:
    andi a5, a3, 16
    beqz a5, .IPRED_PAETH_W_LOOP8
    vld vr2, a7, 0 //top
    vpermi.w vr9, vr2, 0x0e
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr9, vr9, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vabsd.hu vr10, vr0, vr9
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vadd.h vr11, vr1, vr9
    vabsd.hu vr6, vr3, vr6 //tldiff
    vabsd.hu vr11, vr3, vr11 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    vsle.hu vr12, vr5, vr11
    vbitsel.v vr7, vr0, vr9, vr12
    vsle.hu vr12, vr10, vr5
    vsle.hu vr8, vr10, vr11
    vand.v vr12, vr12, vr8
    vbitsel.v vr12, vr7, vr1, vr12
    vsrlni.b.h vr12, vr12, 0
    vpermi.w vr12, vr3, 0x44
    vst vr12, a0, 0
    b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP8:
    andi a5, a3, 8
    beqz a5, .IPRED_PAETH_W_LOOP4
    fld.d f2, a7, 0 //top
    vsllwil.hu.bu vr2, vr2, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vabsd.hu vr6, vr3, vr6 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    fst.d f3, a0, 0
    b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOP4:
    andi a5, a3, 4
    beqz a5, .IPRED_PAETH_W_LOOPEND
    fld.s f2, a7, 0 //top
    vsllwil.hu.bu vr2, vr2, 0
    vabsd.hu vr5, vr0, vr1 //tdiff
    vabsd.hu vr4, vr0, vr2 //ldiff
    vadd.h vr3, vr0, vr0
    vadd.h vr6, vr1, vr2
    vabsd.hu vr6, vr3, vr6 //tldiff
    vsle.hu vr3, vr5, vr6
    vbitsel.v vr7, vr0, vr2, vr3
    vsle.hu vr3, vr4, vr5
    vsle.hu vr8, vr4, vr6
    vand.v vr3, vr3, vr8
    vbitsel.v vr3, vr7, vr1, vr3
    vsrlni.b.h vr3, vr3, 0
    fst.s f3, a0, 0
    b .IPRED_PAETH_W_LOOPEND
.IPRED_PAETH_W_LOOPEND:
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_PAETH_H_LOOP
endfunc

// Smoothing weights. The table for block size bs starts at byte offset bs
// (2 + 2 + 4 + 8 + 16 + 32 + 64 bytes in total), which is why the smooth
// predictors below index it with `dav2d_sm_weights + width/height`.
const dav2d_sm_weights
    .byte 0, 0
    // bs = 2
    .byte 255, 128
    // bs = 4
    .byte 255, 149, 85, 64
    // bs = 8
    .byte 255, 197, 146, 105, 73, 50, 37, 32
    // bs = 16
    .byte 255, 225, 196, 170, 145, 123, 102, 84
    .byte 68, 54, 43, 33, 26, 20, 17, 16
    // bs = 32
    .byte 255, 240, 225, 210, 196, 182, 169, 157
    .byte 145, 133, 122, 111, 101, 92, 83, 74
    .byte 66, 59, 52, 45, 39, 34, 29, 25
    .byte 21, 17, 14, 12, 10, 9, 8, 8
    // bs = 64
    .byte 255, 248, 240, 233, 225, 218, 210, 203
    .byte 196, 189, 182, 176, 169, 163, 156, 150
    .byte 144, 138, 133, 127, 121, 116, 111, 106
    .byte 101, 96, 91, 86, 82, 77, 73, 69
    .byte 65, 61, 57, 54, 50, 47, 44, 41
    .byte 38, 35, 32, 29, 27, 25, 22, 20
    .byte 18, 16, 15, 13, 12, 10, 9, 8
    .byte 7, 6, 6, 5, 5, 4, 4, 4
endconst

// void ipred_smooth_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
// With right = topleft[width], bottom = topleft[-height],
// hor = weights + width, ver = weights + height:
//   dst[x] = ((256-hor[x])*right + hor[x]*topleft[-1-y]
//           + (256-ver[y])*bottom + ver[y]*topleft[1+x] + 256) >> 9
// Works on 4 pixels per inner iteration in 32-bit lanes.
function ipred_smooth_8bpc_lsx
    la.local a5, dav2d_sm_weights
    add.d a6, a5, a3 //hor
    add.d a5, a5, a4 //ver
    add.d a7, a2, a3
    sub.d t0, a2, a4
    vldrepl.b vr0, a7, 0 //right
    vldrepl.b vr1, t0, 0 //bottom
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.wu.hu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.wu.hu vr1, vr1, 0
    li.w t0, 256
    vreplgr2vr.w vr6, t0
    addi.d t0, a2, 1 //ptr topleft[x]
    addi.d t3, a2, -1 //ptr topleft[y]
.IPRED_SMOOTH_H_LOOP:
    vldrepl.b vr2, a5, 0 //ver[y]
    vldrepl.b vr3, t3, 0 //topleft[y]
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.wu.hu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.wu.hu vr3, vr3, 0
    vsub.w vr7, vr6, vr2 //256-ver[y]
    or t1, zero, zero //xx
    srai.d t2, a3, 2 //loop max
.IPRED_SMOOTH_W_LOOP:
    fldx.s f4, t0, t1 //topleft[x]
    fldx.s f5, a6, t1 //hor[x]
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.wu.hu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.wu.hu vr5, vr5, 0
    vsub.w vr8, vr6, vr5 //256-hor[x]
    vmul.w vr9, vr8, vr0
    vmadd.w vr9, vr5, vr3
    vmadd.w vr9, vr7, vr1
    vmadd.w vr9, vr2, vr4 //pred
    // + 256 (vr6) is the rounding term for the >> 9 below
    vadd.w vr9, vr9, vr6
    vsrlni.h.w vr9, vr9, 9
    vsrlni.b.h vr9, vr9, 0
    fstx.s f9, a0, t1
    addi.d t1, t1, 4
    addi.d t2, t2, -1
    bnez t2, .IPRED_SMOOTH_W_LOOP
.IPRED_SMOOTH_W_LOOP_END:
    addi.d t3, t3, -1
    addi.d a5, a5, 1
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_SMOOTH_H_LOOP
endfunc

// void ipred_smooth_v_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
// dst[x] = ((256-ver[y])*bottom + ver[y]*topleft[1+x] + 128) >> 8,
// computed in 16-bit lanes, 8 pixels per iteration (4 when width < 8).
function ipred_smooth_v_8bpc_lsx
    la.local a5, dav2d_sm_weights
    add.d a5, a5, a4 //ver
    sub.d t0, a2, a4
    vldrepl.b vr0, t0, 0 //bottom
    vsllwil.hu.bu vr0, vr0, 0
    li.w t0, 256
    vreplgr2vr.h vr2, t0
    li.w t0, 128
    vreplgr2vr.h vr3, t0
    addi.d t0, a2, 1 //ptr topleft[x]
.IPRED_SMOOTH_V_H_LOOP:
    vldrepl.b vr1, a5, 0 //ver[y]
    vsllwil.hu.bu vr1, vr1, 0
    vsub.h vr5, vr2, vr1 //256-ver[y]
    or t1, zero, zero //xx
    srai.d t2, a3, 3 //loop max
    beqz t2, .IPRED_SMOOTH_V_W_LOOP4
.IPRED_SMOOTH_V_W_LOOP8:
    fldx.d f4, t0, t1 //topleft[x]
    vsllwil.hu.bu vr4, vr4, 0
    vmul.h vr6, vr5, vr0
    vmadd.h vr6, vr1, vr4 //pred
    vadd.h vr6, vr6, vr3
    vsrlni.b.h vr6, vr6, 8
    fstx.d f6, a0, t1
    addi.d t1, t1, 8
    addi.d t2, t2, -1
    bnez t2, .IPRED_SMOOTH_V_W_LOOP8
    b .IPRED_SMOOTH_V_W_LOOP_END
.IPRED_SMOOTH_V_W_LOOP4:
    fldx.s f4, t0, t1 //topleft[x]
    vsllwil.hu.bu vr4, vr4, 0
    vmul.h vr6, vr5, vr0
    vmadd.h vr6, vr1, vr4 //pred
    vadd.h vr6, vr6, vr3
    // arithmetic shift, but only the low byte of each lane is kept below
    vsrai.h vr6, vr6, 8
    vsrlni.b.h vr6, vr6, 0
    fstx.s f6, a0, t1
    addi.d t1, t1, 4
.IPRED_SMOOTH_V_W_LOOP_END:
    addi.d a5, a5, 1
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_SMOOTH_V_H_LOOP
endfunc

// void ipred_smooth_h_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
// dst[x] = ((256-hor[x])*right + hor[x]*topleft[-1-y] + 128) >> 8,
// computed in 16-bit lanes, 8 pixels per iteration (4 when width < 8).
function ipred_smooth_h_8bpc_lsx
    la.local a5, dav2d_sm_weights
    add.d a6, a5, a3 //hor
    add.d a7, a2, a3
    vldrepl.b vr0, a7, 0 //right
    vsllwil.hu.bu vr0, vr0, 0
    li.w t0, 256
    vreplgr2vr.h vr1, t0
    li.w t0, 128
    vreplgr2vr.h vr2, t0
    addi.d t3, a2, -1 //ptr topleft[y]
.IPRED_SMOOTH_H_H_LOOP:
    vldrepl.b vr3, t3, 0 //topleft[y]
    vsllwil.hu.bu vr3, vr3, 0
    or t1, zero, zero //xx
    srai.d t2, a3, 3 //loop max
    beqz t2, .IPRED_SMOOTH_H_W_LOOP4
.IPRED_SMOOTH_H_W_LOOP8:
    fldx.d f5, a6, t1 //hor[x]
    vsllwil.hu.bu vr5, vr5, 0
    vsub.h vr4, vr1, vr5 //256-hor[x]
    vmul.h vr6, vr4, vr0
    vmadd.h vr6, vr5, vr3 //pred
    vadd.h vr6, vr6, vr2
    vsrlni.b.h vr6, vr6, 8
    fstx.d f6, a0, t1
    addi.d t1, t1, 8
    addi.d t2, t2, -1
    bnez t2, .IPRED_SMOOTH_H_W_LOOP8
    b .IPRED_SMOOTH_W_H_LOOP_END
.IPRED_SMOOTH_H_W_LOOP4:
    fldx.s f5, a6, t1 //hor[x]
    vsllwil.hu.bu vr5, vr5, 0
    vsub.h vr4, vr1, vr5 //256-hor[x]
    vmul.h vr6, vr4, vr0
    vmadd.h vr6, vr5, vr3 //pred
    vadd.h vr6, vr6, vr2
    // arithmetic shift, but only the low byte of each lane is kept below
    vsrai.h vr6, vr6, 8
    vsrlni.b.h vr6, vr6, 0
    fstx.s f6, a0, t1
    addi.d t1, t1, 4
.IPRED_SMOOTH_W_H_LOOP_END:
    addi.d t3, t3, -1
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .IPRED_SMOOTH_H_H_LOOP
endfunc

// void pal_pred_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const pal, const uint8_t *idx,
//                   const int w, const int h)
// Registers used: a0 = dst, a1 = stride, a2 = pal, a3 = idx, a4 = w, a5 = h.
// Every idx byte carries two palette indices: bits 0-2 give the first
// pixel, the byte shifted right by 4 gives the second. The 8 palette bytes
// loaded from a2 are used as the vshuf.b lookup table.
// Widths 4..32 emit four rows per iteration (a7 = h >> 2 iterations);
// the width-64 path emits one row per iteration and counts a5 down.
function pal_pred_8bpc_lsx
    srai.d a7, a5, 2
.PAL_PRED_WLOOP4:
    andi a6, a4, 4
    beqz a6, .PAL_PRED_WLOOP8
    fld.d f0, a3, 0
    vsrli.b vr1, vr0, 4
    vandi.b vr2, vr0, 7
    // interleave low/high indices back into pixel order
    vilvl.b vr0, vr1, vr2
    fld.d f1, a2, 0
    vshuf.b vr2, vr1, vr1, vr0
    vstelm.w vr2, a0, 0, 0
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 1
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 2
    add.d a0, a0, a1
    vstelm.w vr2, a0, 0, 3
    add.d a0, a0, a1
    addi.d a3, a3, 8
    addi.d a7, a7, -1
    bnez a7, .PAL_PRED_WLOOP4
    b .PAL_PRED_END
.PAL_PRED_WLOOP8:
    andi a6, a4, 8
    beqz a6, .PAL_PRED_WLOOP16
    vld vr0, a3, 0
    vsrli.b vr1, vr0, 4
    vandi.b vr2, vr0, 7
    vilvl.b vr0, vr1, vr2
    vilvh.b vr3, vr1, vr2
    fld.d f1, a2, 0
    vshuf.b vr0, vr1, vr1, vr0
    vshuf.b vr3, vr1, vr1, vr3
    vstelm.d vr0, a0, 0, 0
    add.d a0, a0, a1
    vstelm.d vr0, a0, 0, 1
    add.d a0, a0, a1
    vstelm.d vr3, a0, 0, 0
    add.d a0, a0, a1
    vstelm.d vr3, a0, 0, 1
    add.d a0, a0, a1
    addi.d a3, a3, 16
    addi.d a7, a7, -1
    bnez a7, .PAL_PRED_WLOOP8
    b .PAL_PRED_END
.PAL_PRED_WLOOP16:
    andi a6, a4, 16
    beqz a6, .PAL_PRED_WLOOP32
    vld vr0, a3, 0
    vld vr1, a3, 16
    fld.d f6, a2, 0
    vsrli.b vr2, vr0, 4
    vandi.b vr3, vr0, 7
    vsrli.b vr4, vr1, 4
    vandi.b vr5, vr1, 7
    vilvl.b vr0, vr2, vr3
    vilvh.b vr1, vr2, vr3
    vilvl.b vr2, vr4, vr5
    vilvh.b vr3, vr4, vr5
    vshuf.b vr0, vr6, vr6, vr0
    vshuf.b vr1, vr6, vr6, vr1
    vshuf.b vr2, vr6, vr6, vr2
    vshuf.b vr3, vr6, vr6, vr3
    vst vr0, a0, 0
    add.d a0, a0, a1
    vst vr1, a0, 0
    add.d a0, a0, a1
    vst vr2, a0, 0
    add.d a0, a0, a1
    vst vr3, a0, 0
    add.d a0, a0, a1
    addi.d a3, a3, 32
    addi.d a7, a7, -1
    bnez a7, .PAL_PRED_WLOOP16
    b .PAL_PRED_END
.PAL_PRED_WLOOP32:
    andi a6, a4, 32
    beqz a6, .PAL_PRED_WLOOP64
    vld vr0, a3, 0
    vld vr1, a3, 16
    vld vr2, a3, 32
    vld vr3, a3, 48
    fld.d f4, a2, 0
    vsrli.b vr5, vr0, 4
    vandi.b vr6, vr0, 7
    vsrli.b vr7, vr1, 4
    vandi.b vr8, vr1, 7
    vsrli.b vr9, vr2, 4
    vandi.b vr10, vr2, 7
    vsrli.b vr11, vr3, 4
    vandi.b vr12, vr3, 7
    vilvl.b vr0, vr5, vr6
    vilvh.b vr1, vr5, vr6
    vilvl.b vr2, vr7, vr8
    vilvh.b vr3, vr7, vr8
    vilvl.b vr5, vr9, vr10
    vilvh.b vr6, vr9, vr10
    vilvl.b vr7, vr11, vr12
    vilvh.b vr8, vr11, vr12
    vshuf.b vr0, vr4, vr4, vr0
    vshuf.b vr1, vr4, vr4, vr1
    vshuf.b vr2, vr4, vr4, vr2
    vshuf.b vr3, vr4, vr4, vr3
    vshuf.b vr5, vr4, vr4, vr5
    vshuf.b vr6, vr4, vr4, vr6
    vshuf.b vr7, vr4, vr4, vr7
    vshuf.b vr8, vr4, vr4, vr8
    vst vr0, a0, 0
    vst vr1, a0, 16
    add.d a0, a0, a1
    vst vr2, a0, 0
    vst vr3, a0, 16
    add.d a0, a0, a1
    vst vr5, a0, 0
    vst vr6, a0, 16
    add.d a0, a0, a1
    vst vr7, a0, 0
    vst vr8, a0, 16
    add.d a0, a0, a1
    addi.d a3, a3, 64
    addi.d a7, a7, -1
    bnez a7, .PAL_PRED_WLOOP32
    b .PAL_PRED_END
.PAL_PRED_WLOOP64:
    vld vr0, a3, 0
    vld vr1, a3, 16
    fld.d f2, a2, 0
    vsrli.b vr3, vr0, 4
    vandi.b vr4, vr0, 7
    vsrli.b vr5, vr1, 4
    vandi.b vr6, vr1, 7
    vilvl.b vr0, vr3, vr4
    vilvh.b vr1, vr3, vr4
    vilvl.b vr3, vr5, vr6
    vilvh.b vr4, vr5, vr6
    vshuf.b vr0, vr2, vr2, vr0
    vshuf.b vr1, vr2, vr2, vr1
    vshuf.b vr3, vr2, vr2, vr3
    vshuf.b vr4, vr2, vr2, vr4
    vst vr0, a0, 0
    vst vr1, a0, 16
    vst vr3, a0, 32
    vst vr4, a0, 48
    add.d a0, a0, a1
    addi.d a3, a3, 32
    addi.d a5, a5, -1
    bnez a5, .PAL_PRED_WLOOP64
.PAL_PRED_END:
endfunc

// apply_sign_vrh v, s, vrzero, vrt0, out
// Per 16-bit lane: out = (s < vrzero) ? -v : v.
// Overwrites \v, \s and \vrt0.
.macro apply_sign_vrh v, s, vrzero, vrt0 ,out
    vslt.h \vrt0, \s, \vrzero
    vandn.v \s, \vrt0, \v
    vsigncov.h \v, \vrt0, \v
    vor.v \out, \s, \v
.endm

// iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
// Per 16-bit lane: out = in0 clamped to [in1, in2] (signed compare).
// Overwrites \in0, \tmp0 and \tmp1.
.macro iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
    vmin.h \tmp0, \in2, \in0
    vslt.h \in0, \in0, \in1
    vand.v \tmp1, \in0, \in1
    vandn.v \tmp0, \in0, \tmp0
    vor.v \out, \tmp1, \tmp0
.endm

// ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
// For each of \h rows and each 16-bit value in \ac:
//   diff   = alpha * ac[x]
//   dst[x] = clamp(dc + sign(diff) * ((|diff| + 32) >> 6), 0, 255)
// \ac advances by 2 * \w bytes per row; \dst by \stride.
// Processes 8 pixels per step, or a single group of 4 when \w < 8.
// \h is counted down to zero; \ac and \dst are advanced in place.
// Clobbers t1-t4 and vr0-vr8.
.macro ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
    vreplgr2vr.h vr2, \alpha
    vreplgr2vr.h vr7, \dc
    li.w t1, 32
    vreplgr2vr.h vr3, t1
    vxor.v vr4, vr4, vr4
    li.w t1, 255
    vreplgr2vr.h vr6, t1
    add.d t4, \w, \w
1:
    or t1, zero, zero
    or t2, zero, zero
    srai.d t3, \w, 3
    beqz t3, 3f
2:
    vldx vr0, \ac, t1
    vmul.h vr1, vr2, vr0
    // |diff| + |32|
    vadda.h vr0, vr1, vr3
    vsrai.h vr0, vr0, 6
    apply_sign_vrh vr0, vr1, vr4, vr5, vr0
    vadd.h vr1, vr0, vr7
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
    vsrlni.b.h vr0, vr0, 0
    fstx.d f0, \dst, t2
    addi.d t1, t1, 16
    addi.d t2, t2, 8
    addi.d t3, t3, -1
    bnez t3, 2b
    b 4f
3:
    fld.d f0, \ac, 0
    vmul.h vr1, vr2, vr0
    vadda.h vr0, vr1, vr3
    vsrai.h vr0, vr0, 6
    apply_sign_vrh vr0, vr1, vr4, vr5, vr0
    vadd.h vr1, vr0, vr7
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
    vsrlni.b.h vr0, vr0, 0
    fst.s f0, \dst, 0
4:
    add.d \ac, \ac, t4
    add.d \dst, \dst, \stride
    addi.d \h, \h, -1
    bnez \h, 1b
.endm

// The four CfL entry points differ only in how the DC value in t0 is
// obtained. Registers passed on: a0 = dst, a1 = stride, a3 = w, a4 = h,
// a5 = ac, a6 = alpha; a2 is the topleft pointer for the DC macros.
function ipred_cfl_8bpc_lsx
    ipred_dc_gen a2, a3, a4
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_top_8bpc_lsx
    ipred_dc_gen_top a2, a3
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_left_8bpc_lsx
    ipred_dc_gen_left a2, a4
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

function ipred_cfl_128_8bpc_lsx
    li.w t0, 128
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc

// Signed filter-intra taps: five sets (arr0..arr4), each 7 rows of 8 bytes.
const dav2d_filter_intra_taps_lsx
    //arr0 8*7
    .byte -6, -5, -3, -3, -4, -3, -3, -3
    .byte 10, 2, 1, 1, 6, 2, 2, 1
    .byte 0, 10, 1, 1, 0, 6, 2, 2
    .byte 0, 0, 10, 2, 0, 0, 6, 2
    .byte 0, 0, 0, 10, 0, 0, 0, 6
    .byte 12, 9, 7, 5, 2, 2, 2, 3
    .byte 0, 0, 0, 0, 12, 9, 7, 5
    //arr1
    .byte -10, -6, -4, -2, -10, -6, -4, -2
    .byte 16, 0, 0, 0, 16, 0, 0, 0
    .byte 0, 16, 0, 0, 0, 16, 0, 0
    .byte 0, 0, 16, 0, 0, 0, 16, 0
    .byte 0, 0, 0, 16, 0, 0, 0, 16
    .byte 10, 6, 4, 2, 0, 0, 0, 0
    .byte 0, 0, 0, 0, 10, 6, 4, 2
    //arr2
    .byte -8, -8, -8, -8, -4, -4, -4, -4
    .byte 8, 0, 0, 0, 4, 0, 0, 0
    .byte 0, 8, 0, 0, 0, 4, 0, 0
    .byte 0, 0, 8, 0, 0, 0, 4, 0
    .byte 0, 0, 0, 8, 0, 0, 0, 4
    .byte 16, 16, 16, 16, 0, 0, 0, 0
    .byte 0, 0, 0, 0, 16, 16, 16, 16
    //arr3
    .byte -2, -1, -1, 0, -1, -1, -1, -1
    .byte 8, 3, 2, 1, 4, 3, 2, 2
    .byte 0, 8, 3, 2, 0, 4, 3, 2
    .byte 0, 0, 8, 3, 0, 0, 4, 3
    .byte 0, 0, 0, 8, 0, 0, 0, 4
    .byte 10, 6, 4, 2, 3, 4, 4, 3
    .byte 0, 0, 0, 0, 10, 6, 4, 3
    //arr4
    .byte -12, -10, -9, -8, -10, -9, -8, -7
    .byte 14, 0, 0, 0, 12, 1, 0, 0
    .byte 0, 14, 0, 0, 0, 12, 0, 0
    .byte 0, 0, 14, 0, 0, 0, 12, 1
    .byte 0, 0, 0, 14, 0, 0, 0, 12
    .byte 14, 12, 11, 10, 0, 0, 1, 1
    .byte 0, 0, 0, 0, 14, 12, 11, 9
endconst

// ipred_filter_load_p
// Broadcasts seven bytes into vr0..vr6 and zero-extends them to 16-bit
// lanes: vr0 = t0[0], vr1..vr4 = a7[0..3], vr5 = t1[0], vr6 = t1[-1].
// NOTE(review): t0, a7 and t1 are set up by callers outside this view;
// presumably they point at the top-left, top and left neighbours - confirm.
.macro ipred_filter_load_p
    vldrepl.b vr0, t0, 0
    vldrepl.b vr1, a7, 0
    vldrepl.b vr2, a7, 1
    vldrepl.b vr3, a7, 2
    vldrepl.b vr4, a7, 3
    vldrepl.b vr5, t1, 0
    vldrepl.b vr6, t1, -1
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
.endm

// ipred_filter_loadx_p
// Same as ipred_filter_load_p except that vr6 is the byte at t1 + a1
// instead of t1[-1]. Additionally clobbers t3.
.macro ipred_filter_loadx_p
    vldrepl.b vr0, t0, 0
    vldrepl.b vr1, a7, 0
    vldrepl.b vr2, a7, 1
    vldrepl.b vr3, a7, 2
    vldrepl.b vr4, a7, 3
    vldrepl.b vr5, t1, 0
    ldx.bu t3, t1, a1
    vreplgr2vr.b vr6, t3
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr2, vr2, 0
    vsllwil.hu.bu vr3, vr3, 0
    vsllwil.hu.bu vr4, vr4, 0
    vsllwil.hu.bu vr5, vr5, 0
    vsllwil.hu.bu vr6, vr6, 0
.endm

// ipred_filter_load_fltptr
// Loads seven consecutive 8-byte rows from a6 into vr7..vr13 and
// sign-extends each byte to a 16-bit lane.
// NOTE(review): a6 presumably points at one arrN set of
// dav2d_filter_intra_taps_lsx - confirm against the caller.
.macro ipred_filter_load_fltptr
    fld.d f7, a6, 0
    fld.d f8, a6, 8
    fld.d f9, a6, 16
    fld.d f10, a6, 24
    fld.d f11, a6, 32
    fld.d f12, a6, 40
    fld.d f13, a6, 48
    vsllwil.h.b vr7, vr7, 0
    vsllwil.h.b vr8, vr8, 0
    vsllwil.h.b vr9, vr9, 0
    vsllwil.h.b vr10, vr10, 0
    vsllwil.h.b vr11, vr11, 0
    vsllwil.h.b vr12, vr12, 0
    vsllwil.h.b vr13, vr13, 0
.endm
// ipred_filter_calc_acc: evaluate one 4x2 filter-intra block.
// Computes sum(k = 0..6) taps<k> * p<k> per 16-bit lane (taps in vr7-vr13,
// pixels in vr0-vr6), adds 8, shifts right by 4 (arithmetic), clamps to
// [vr14, vr15] and narrows to bytes. The 8 result pixels end up in the low
// 8 bytes of vr8. Clobbers vr7, vr9 and vr10.
.macro ipred_filter_calc_acc
    vmul.h          vr7, vr7, vr0
    vmadd.h         vr7, vr8, vr1
    vmadd.h         vr7, vr9, vr2
    vmadd.h         vr7, vr10, vr3
    vmadd.h         vr7, vr11, vr4
    vmadd.h         vr7, vr12, vr5
    vmadd.h         vr7, vr13, vr6
    vaddi.hu        vr7, vr7, 8
    vsrai.h         vr7, vr7, 4
    iclip_pixel_vrh vr7, vr14, vr15, vr9, vr10, vr8
    vsrlni.b.h      vr8, vr8, 0
.endm

// ipred_filter_4x2_first: predict the leftmost 4x2 block of a row pair.
// Expects t0 = top-left, a7 = top row, t1 = left pixels (see
// ipred_filter_load_p), a6 = filter taps, a0 = output, a1 = stride.
// Writes 4 pixels at a0 and 4 pixels at a0 + a1. Leaves t3 = a0 + 2 * a1.
.macro ipred_filter_4x2_first
    ipred_filter_load_p
    or              t3, a0, a0 //*ptr
    ipred_filter_load_fltptr
    ipred_filter_calc_acc
    fst.s           f8, t3, 0               // upper row: result bytes 0-3
    add.d           t3, t3, a1
    vstelm.w        vr8, t3, 0, 1           // lower row: result bytes 4-7
    add.d           t3, t3, a1
.endm

// ipred_filter_4x2_next: predict the 4x2 block whose left column is \x
// (\x = 4, 8, 12, ...). The left neighbours are the pixels just written by
// the previous block at a0 + \x - 1 (upper row) and one stride below; the top
// pointer a7 is advanced by 4 and the top-left pixel is taken from a7 - 1.
// Must be invoked in increasing \x order because a7 is updated in place.
.macro ipred_filter_4x2_next x
    addi.d          t1, a0, \x-1
    addi.d          a7, a7, 4
    addi.d          t0, a7, -1
    ipred_filter_loadx_p
    addi.d          t3, a0, \x
    ipred_filter_load_fltptr
    ipred_filter_calc_acc
    fst.s           f8, t3, 0
    add.d           t3, t3, a1
    vstelm.w        vr8, t3, 0, 1
    add.d           t3, t3, a1
.endm

// void ipred_filter_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft_in,
//                       const int width, const int height, int filt_idx,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
//
// Register use: a0 = dst, a1 = stride, a2 = topleft_in, a3 = width,
// a4 = height, a5 = filt_idx (low 9 bits) on entry and the row counter y
// afterwards. a6 and a7 are overwritten (filter pointer / top-row pointer), so
// max_width and max_height are not used.
// The block is produced two rows at a time; each row pair is split into 4x2
// blocks handled left to right. Widths 4, 8, 16 and 32 are dispatched on
// ctz(width); any other width falls through to the 4-wide path.
function ipred_filter_8bpc_lsx
    andi            a5, a5, 511
    la.local        a6, dav2d_filter_intra_taps_lsx
    li.w            a7, 56                  // bytes per filter (7 rows x 8 taps)
    mul.w           a7, a7, a5
    add.d           a6, a6, a7 //*filter
    addi.d          a7, a2, 1 //*top
    or              a5, zero, zero //y
    vxor.v          vr14, vr14, vr14        // lower clip bound (0)
    li.w            t0, 255
    vreplgr2vr.h    vr15, t0                // upper clip bound
.FILTER_LOOP_H:
    sub.d           t0, a2, a5 //*topleft
    addi.d          t1, t0, -1 //left
    ctz.w           t2, a3
    addi.d          t3, t2, -2
    beqz            t3, .FILTER_LOOP_W4
    addi.d          t3, t2, -3
    beqz            t3, .FILTER_LOOP_W8
    addi.d          t3, t2, -4
    beqz            t3, .FILTER_LOOP_W16
    addi.d          t3, t2, -5
    beqz            t3, .FILTER_LOOP_W32
.FILTER_LOOP_W4:
    ipred_filter_4x2_first
    b               .FILTER_LOOP_W_END
.FILTER_LOOP_W8:
    ipred_filter_4x2_first
    ipred_filter_4x2_next 4
    b               .FILTER_LOOP_W_END
.FILTER_LOOP_W16:
    ipred_filter_4x2_first
    ipred_filter_4x2_next 4
    ipred_filter_4x2_next 8
    ipred_filter_4x2_next 12
    b               .FILTER_LOOP_W_END
.FILTER_LOOP_W32:
    ipred_filter_4x2_first
    ipred_filter_4x2_next 4
    ipred_filter_4x2_next 8
    ipred_filter_4x2_next 12
    ipred_filter_4x2_next 16
    ipred_filter_4x2_next 20
    ipred_filter_4x2_next 24
    ipred_filter_4x2_next 28
.FILTER_LOOP_W_END:
    add.d           a7, a0, a1              // next top row = lower row just written
    add.d           t2, a1, a1
    add.d           a0, a0, t2              // dst += 2 * stride
    addi.d          a5, a5, 2
    blt             a5, a4, .FILTER_LOOP_H
endfunc

// Directional-prediction step table, read by ipred_z1_8bpc_lsx as the
// 16-bit entry at byte offset (angle & 0xFFE). The per-entry comments list
// the angles that map to each value.
const dav2d_dr_intra_derivative
    // Values that are 0 will never be used
    .short 0 // Angles:
    .short 1023, 0 // 3, 93, 183
    .short 547 // 6, 96, 186
    .short 372, 0, 0 // 9, 99, 189
    .short 273 // 14, 104, 194
    .short 215, 0 // 17, 107, 197
    .short 178 // 20, 110, 200
    .short 151, 0 // 23, 113, 203 (113 & 203 are base angles)
    .short 132 // 26, 116, 206
    .short 116, 0 // 29, 119, 209
    .short 102, 0 // 32, 122, 212
    .short 90 // 36, 126, 216
    .short 80, 0 // 39, 129, 219
    .short 71 // 42, 132, 222
    .short 64, 0 // 45, 135, 225 (45 & 135 are base angles)
    .short 57 // 48, 138, 228
    .short 51, 0 // 51, 141, 231
    .short 45, 0 // 54, 144, 234
    .short 40 // 58, 148, 238
    .short 35, 0 // 61, 151, 241
    .short 31 // 64, 154, 244
    .short 27, 0 // 67, 157, 247 (67 & 157 are base angles)
    .short 23 // 70, 160, 250
    .short 19, 0 // 73, 163, 253
    .short 15, 0 // 76, 166, 256
    .short 11, 0 // 81, 171, 261
    .short 7 // 84, 174, 264
    .short 3 // 87, 177, 267
endconst

// 4-tap edge upsampling kernel (-1, 9, 9, -1), repeated twice so one vector
// multiply covers two output samples.
const z1_upsample_edge_kernel
    .short -1, 9, 9, -1, -1, 9, 9, -1
endconst

// Taps 0-3 of the edge smoothing kernels, one 16-byte row per filter
// strength 1..3 (the row is selected with (strength - 1) * 16), each repeated
// twice.
const ipred_filter_edge_kernel1
    .short 0, 4, 8, 4, 0, 4, 8, 4
    .short 0, 5, 6, 5, 0, 5, 6, 5
    .short 2, 4, 4, 4, 2, 4, 4, 4
endconst

// Tap 4 of the same kernels, broadcast to all lanes; non-zero only for
// strength 3.
const ipred_filter_edge_kernel2
    .short 0, 0, 0, 0, 0, 0, 0, 0
    .short 0, 0, 0, 0, 0, 0, 0, 0
    .short 2, 2, 2, 2, 2, 2, 2, 2
endconst

// z1_upsample_edge_calc_loop: upsample 8 edge pixels to 16 output bytes.
// Inputs: vr7 = source bytes, vr11/vr12/vr13 = vr7 shifted right by 1/2/3
// bytes (prepared by the caller), vr0 = kernel, vr15/vr16 = clip bounds.
// Each of the four vectors is widened, multiplied by the kernel and summed
// horizontally in groups of four; the eight sums are then packed, rounded
// (>> 4), clipped and interleaved with the unfiltered pixels into vr13.
.macro z1_upsample_edge_calc_loop
    vsllwil.hu.bu   vr10, vr7, 0
    vsllwil.hu.bu   vr11, vr11, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    vmul.h          vr10, vr10, vr0
    vmul.h          vr11, vr11, vr0
    vmul.h          vr12, vr12, vr0
    vmul.h          vr13, vr13, vr0
    vhaddw.w.h      vr10, vr10, vr10
    vhaddw.w.h      vr11, vr11, vr11
    vhaddw.w.h      vr12, vr12, vr12
    vhaddw.w.h      vr13, vr13, vr13
    vhaddw.d.w      vr10, vr10, vr10
    vhaddw.d.w      vr11, vr11, vr11
    vhaddw.d.w      vr12, vr12, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vpackev.h       vr10, vr11,
vr10
    vpackev.h       vr11, vr13, vr12
    vpackev.w       vr12, vr11, vr10 //s:01234567
    vsrari.h        vr12, vr12, 4
    iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
    vsrlni.b.h      vr12, vr12, 0 //out: 13579...
    vbsrl.v         vr11, vr7, 1 //out:02468...
    vilvl.b         vr13, vr12, vr11        // interleave: original, filtered, ...
.endm

// z1_upsample_edge_data_init1: build the three byte-shifted copies of vr7
// that z1_upsample_edge_calc_loop expects, then run it. Used where all
// source bytes read by the kernel are valid pixels.
.macro z1_upsample_edge_data_init1
    vbsrl.v         vr11, vr7, 1
    vbsrl.v         vr12, vr7, 2
    vbsrl.v         vr13, vr7, 3
    z1_upsample_edge_calc_loop
.endm

// z1_upsample_edge_data_init2: as data_init1, but after shifting, the last
// one (vr12) or two (vr13) bytes of the low 8 are overwritten with a copy of
// the byte before them (vextrins.b copies element <low nibble> to element
// <high nibble>). This looks like end-of-edge clamping by pixel replication -
// confirm against the C reference.
.macro z1_upsample_edge_data_init2
    vbsrl.v         vr11, vr7, 1
    vbsrl.v         vr12, vr7, 2
    vextrins.b      vr12, vr12, 0x76
    vbsrl.v         vr13, vr7, 3
    vextrins.b      vr13, vr13, 0x65
    vextrins.b      vr13, vr13, 0x75
    z1_upsample_edge_calc_loop
.endm

// z1_upsample_edge_calc_other: upsample variant that computes a single
// kernel sum from the first four bytes of vr7 and broadcasts it to all eight
// filtered outputs before rounding, clipping and interleaving with vr7.
// Callers in this file load vr7 with one replicated byte, so every input
// pixel is identical.
.macro z1_upsample_edge_calc_other
    vsllwil.hu.bu   vr10, vr7, 0
    vmul.h          vr10, vr10, vr0
    vhaddw.w.h      vr10, vr10, vr10
    vhaddw.d.w      vr10, vr10, vr10
    vreplvei.h      vr12, vr10, 0 //s0-s7
    vsrari.h        vr12, vr12, 4
    iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
    vsrlni.b.h      vr12, vr12, 0
    vilvl.b         vr13, vr12, vr7
.endm

// z1_filter_edge_calc_loop1: first stage of the 5-tap edge filter.
// Inputs: vr10-vr13 = four already-widened (16-bit) pixel vectors, each
// offset by one pixel from the previous one; vr1 = taps 0-3 (repeated
// twice). Produces the eight partial sums s0..s7 as 16-bit lanes in vr10.
.macro z1_filter_edge_calc_loop1
    vmul.h          vr10, vr10, vr1
    vmul.h          vr11, vr11, vr1
    vmul.h          vr12, vr12, vr1
    vmul.h          vr13, vr13, vr1
    vhaddw.w.h      vr10, vr10, vr10
    vhaddw.w.h      vr11, vr11, vr11
    vhaddw.w.h      vr12, vr12, vr12
    vhaddw.w.h      vr13, vr13, vr13
    vhaddw.d.w      vr10, vr10, vr10
    vhaddw.d.w      vr11, vr11, vr11
    vhaddw.d.w      vr12, vr12, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vpackev.h       vr10, vr11, vr10
    vpackev.h       vr11, vr13, vr12
    vpackev.w       vr10, vr11, vr10 //s:01234567
.endm

// z1_filter_edge_calc_loop2: second stage of the 5-tap edge filter.
// Inputs: vr10 = partial sums from calc_loop1, vr13 = bytes of the pixel that
// pairs with tap 4 (set up by the caller), vr6 = tap 4 broadcast.
// Adds the fifth product, rounds (>> 4) and narrows; the 8 filtered pixels
// are left in the low 8 bytes of vr12.
.macro z1_filter_edge_calc_loop2
    vsllwil.hu.bu   vr13, vr13, 0
    vmadd.h         vr10, vr13, vr6
    vsrari.h        vr12, vr10, 4
    vsrlni.b.h      vr12, vr12, 0 //out: 0-7
.endm

// z1_filter_edge_calc_other: edge-filter variant for a vector prepared by
// the caller in vr10 (bytes). Lane 0 of the result uses the tap sum over
// pixels 0-3, lanes 1-7 all reuse the tap sum over pixels 4-7, and pixel 1 is
// used with tap 4 for every lane. Result: 8 bytes in the low half of vr12.
.macro z1_filter_edge_calc_other
    vsllwil.hu.bu   vr10, vr10, 0
    vmul.h          vr11, vr10, vr1
    vhaddw.w.h      vr11, vr11, vr11
    vhaddw.d.w      vr11, vr11, vr11
    vreplvei.h      vr12, vr11, 4           // broadcast the pixels 4-7 sum
    vextrins.h      vr12, vr11, 0x00        // lane 0 <- pixels 0-3 sum
    vreplvei.h      vr13, vr10, 1
    vmadd.h         vr12, vr13, vr6
    vsrari.h        vr12, vr12, 4
    vsrlni.b.h      vr12, vr12, 0 //out: 0-7
.endm

// z1_filter_edge_data_init1: prepare the four input vectors for the first
// 8 outputs of an edge and run calc_loop1. vr10 is vr7 shifted up by one byte
// with byte 0 set to vr7's first byte, i.e. the first pixel is duplicated in
// front of the data; vr11 = vr7, vr12/vr13 = vr7 shifted down by 1/2 bytes.
.macro z1_filter_edge_data_init1
    vbsll.v         vr10, vr7, 1
    vextrins.b      vr10, vr10, 0x01
    vbsrl.v         vr12, vr7, 1
    vbsrl.v         vr13, vr7, 2
    vsllwil.hu.bu   vr10, vr10, 0
    vsllwil.hu.bu   vr11, vr7, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    z1_filter_edge_calc_loop1
.endm

// z1_filter_edge_data_init2: prepare inputs for an interior group of 8
// outputs: vr10 = vr7 and vr11/vr12/vr13 = vr7 shifted down by 1/2/3 bytes,
// all widened, then run calc_loop1.
.macro z1_filter_edge_data_init2
    vbsrl.v         vr11, vr7, 1
    vbsrl.v         vr12, vr7, 2
    vbsrl.v         vr13, vr7, 3
    vsllwil.hu.bu   vr10, vr7, 0
    vsllwil.hu.bu   vr11, vr11, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    z1_filter_edge_calc_loop1
.endm

// z1_filter_edge_data_init3: as data_init2, but byte 7 of the 3-byte-shifted
// copy is replaced by its byte 6 before widening (apparently clamping the
// final tap at the end of the edge - confirm against the C reference).
.macro z1_filter_edge_data_init3
    vbsrl.v         vr11, vr7, 1
    vbsrl.v         vr12, vr7, 2
    vbsrl.v         vr13, vr7, 3
    vextrins.b      vr13, vr13, 0x76
    vsllwil.hu.bu   vr10, vr7, 0
    vsllwil.hu.bu   vr11, vr11, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    z1_filter_edge_calc_loop1
.endm

// z1_filter_edge_data_init4: combination of data_init1 (first pixel
// duplicated in front) and the byte 6 -> byte 7 replication on the last
// shifted copy; used by the width-4 paths where one 8-pixel group covers both
// ends of the data.
.macro z1_filter_edge_data_init4
    vbsll.v         vr10, vr7, 1
    vextrins.b      vr10, vr10, 0x01
    vbsrl.v         vr12, vr7, 1
    vbsrl.v         vr13, vr7, 2
    vextrins.b      vr13, vr13, 0x76
    vsllwil.hu.bu   vr10, vr10, 0
    vsllwil.hu.bu   vr11, vr7, 0
    vsllwil.hu.bu   vr12, vr12, 0
    vsllwil.hu.bu   vr13, vr13, 0
    z1_filter_edge_calc_loop1
.endm

// pixel_set_8bpc_allw: fill \width bytes at \dst_ptr with the byte stored at
// \src_ptr.
//   dst_ptr, src_ptr - destination buffer and address of the fill value
//   width            - number of bytes to write (any non-negative count)
//   tmp0, tmp1       - scratch registers (tmp1 is the running byte offset)
// Writes 16 bytes at a time, then handles the remainder by testing the
// 8/4/2/1 bits of \width. Clobbers vr10.
.macro pixel_set_8bpc_allw dst_ptr, src_ptr, width, tmp0, tmp1
    vldrepl.b       vr10, \src_ptr, 0
    or              \tmp1, zero, zero
    srai.d          \tmp0, \width, 4
    beqz            \tmp0, 2f
1:
    vstx            vr10, \dst_ptr, \tmp1
    addi.d          \tmp1, \tmp1, 16
    addi.d          \tmp0, \tmp0, -1
    bnez            \tmp0, 1b
2:
    andi            \tmp0, \width, 8
    beqz            \tmp0, 3f
    fstx.d          f10, \dst_ptr, \tmp1
    addi.d          \tmp1, \tmp1, 8
3:
    andi            \tmp0, \width, 4
    beqz            \tmp0, 4f
    fstx.s          f10, \dst_ptr, \tmp1
    addi.d          \tmp1, \tmp1, 4
4:
    andi            \tmp0, \width, 2
    beqz            \tmp0, 5f
    ldx.bu          \tmp0, \src_ptr, zero
    stx.b           \tmp0, \dst_ptr, \tmp1
    addi.d          \tmp1, \tmp1, 1
    stx.b           \tmp0, \dst_ptr, \tmp1
    addi.d          \tmp1, \tmp1, 1
5:
    andi            \tmp0, \width, 1
    beqz            \tmp0, 6f
    ldx.bu          \tmp0, \src_ptr, zero
    stx.b           \tmp0, \dst_ptr, \tmp1
6:
.endm

// void ipred_z1_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft_in,
//                   const int width, const int height, int angle,
//                   const int max_width, const int max_height
//                   HIGHBD_DECL_SUFFIX)
function ipred_z1_8bpc_lsx
    addi.d          a2, a2, 1 //&topleft_in[1]
    addi.d          sp, sp, -128
    or              t2, sp, sp //top_out
    srai.d          a6, a5, 9
    andi            a6, a6, 1 //is_sum
    srai.d          a7, a5, 10 //enable_intra_edge_filter
    andi
a5, a5, 511 la.local t0, dav2d_dr_intra_derivative andi t1, a5, 0xFFE ldx.hu t1, t0, t1 //dx beqz a7, .IPRED_Z1_NOTUA add.d t3, a3, a4 li.w t4, 90 sub.w t4, t4, a5 // ipred_get_upsample t5:upsample_above li.w t6, 16 sra.d t6, t6, a6 bge t6, t3, .Z1_GETUS1 addi.d t5, zero, 0 b .Z1_GETUS2 .Z1_GETUS1: addi.d t5, zero, 1 .Z1_GETUS2: li.w t6, 40 blt t4, t6, .Z1_GETUS3 addi.d t6, zero, 0 b .Z1_GETUS4 .Z1_GETUS3: addi.d t6, zero, 1 .Z1_GETUS4: and t5, t5, t6 beqz t5, .IPRED_Z1_NOTUA la.local t0, z1_upsample_edge_kernel vld vr0, t0, 0 //kernel vxor.v vr15, vr15, vr15 li.w t0, 255 vreplgr2vr.h vr16, t0 .Z1_UEDGE_W4: andi t6, a3, 4 beqz t6, .Z1_UEDGE_W8 .Z1_UEDGE_W4_H4: andi t6, a4, 4 beqz t6, .Z1_UEDGE_W4_H8 //0-6 vld vr7, a2, -1 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 0 vstelm.w vr13, t2, 8, 2 vstelm.h vr13, t2, 12, 6 ld.bu t7, a2, 7 st.b t7, t2, 14 b .Z1_UEDGE_END .Z1_UEDGE_W4_H8: andi t6, a4, 8 beqz t6, .Z1_UEDGE_W4_H16 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init2 vst vr13, t2, 0 //8-10 vldrepl.b vr7, a2, 7 z1_upsample_edge_calc_other vstelm.w vr13, t2, 16, 0 vstelm.h vr13, t2, 20, 2 ld.bu t7, a2, 7 st.b t7, t2, 22 b .Z1_UEDGE_END .Z1_UEDGE_W4_H16: andi t6, a4, 16 beqz t6, .Z1_UEDGE_W4_H32 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init2 vst vr13, t2, 0 //8-15 vldrepl.b vr7, a2, 7 z1_upsample_edge_calc_other vst vr13, t2, 16 //16-18 vstelm.w vr13, t2, 32, 0 vstelm.h vr13, t2, 36, 2 ld.bu t7, a2, 7 st.b t7, t2, 38 b .Z1_UEDGE_END .Z1_UEDGE_W4_H32: andi t6, a4, 32 beqz t6, .Z1_UEDGE_W4_H64 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init2 vst vr13, t2, 0 //8-15 vldrepl.b vr7, a2, 7 z1_upsample_edge_calc_other vst vr13, t2, 16 vst vr13, t2, 32 //16-23 vst vr13, t2, 48 //24-31 //32-34 vstelm.w vr13, t2, 64, 0 vstelm.h vr13, t2, 68, 2 ld.bu t7, a2, 7 st.b t7, t2, 70 b .Z1_UEDGE_END .Z1_UEDGE_W4_H64: //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init2 vst vr13, t2, 0 //8-15 
vldrepl.b vr7, a2, 7 z1_upsample_edge_calc_other vst vr13, t2, 16 vst vr13, t2, 32 //16-23 vst vr13, t2, 48 //24-31 vst vr13, t2, 64 //32-39 vst vr13, t2, 80 //40-47 vst vr13, t2, 96 //48-55 vst vr13, t2, 112 //56-63 //64-66 vstelm.w vr13, t2, 128, 0 vstelm.h vr13, t2, 132, 2 ld.bu t7, a2, 7 st.b t7, t2, 134 b .Z1_UEDGE_END .Z1_UEDGE_W8: andi t6, a3, 8 beqz t6, .Z1_UEDGE_W16 .Z1_UEDGE_W8_H4: andi t6, a4, 4 beqz t6, .Z1_UEDGE_W8_H8 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x32 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x21 vextrins.b vr13, vr13, 0x31 z1_upsample_edge_calc_loop vstelm.w vr13, t2, 16, 0 vstelm.h vr13, t2, 20, 2 ld.bu t7, a2, 11 st.b t7, t2, 22 b .Z1_UEDGE_END .Z1_UEDGE_W8_H8: andi t6, a4, 8 beqz t6, .Z1_UEDGE_W8_H16 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-14 vld vr7, a2, 7 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 16 vstelm.w vr13, t2, 24, 2 vstelm.h vr13, t2, 28, 6 ld.bu t7, a2, 15 st.b t7, t2, 30 b .Z1_UEDGE_END .Z1_UEDGE_W8_H16: andi t6, a4, 16 beqz t6, .Z1_UEDGE_W8_H32 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init2 vst vr13, t2, 16 //16-22 vldrepl.b vr7, a2, 15 z1_upsample_edge_calc_other fst.d f13, t2, 32 vstelm.w vr13, t2, 40, 2 vstelm.h vr13, t2, 44, 6 ld.bu t7, a2, 15 st.b t7, t2, 46 b .Z1_UEDGE_END .Z1_UEDGE_W8_H32: andi t6, a4, 32 beqz t6, .Z1_UEDGE_W8_H64 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init2 vst vr13, t2, 16 //16-23 vldrepl.b vr7, a2, 15 z1_upsample_edge_calc_other vst vr13, t2, 32 vst vr13, t2, 48 //24-31 //32-38 fst.d f13, t2, 64 vstelm.w vr13, t2, 72, 2 vstelm.h vr13, t2, 76, 6 ld.bu t7, a2, 15 st.b t7, t2, 78 b .Z1_UEDGE_END .Z1_UEDGE_W8_H64: //0-7 vld vr7, a2, 
-1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init2 vst vr13, t2, 16 //16-23 vldrepl.b vr7, a2, 15 z1_upsample_edge_calc_other vst vr13, t2, 32 vst vr13, t2, 48 //24-31 vst vr13, t2, 64 //32-39 vst vr13, t2, 80 //40-47 vst vr13, t2, 96 //48-55 vst vr13, t2, 112 //56-63 //64-70 fst.d f13, t2, 128 vstelm.w vr13, t2, 136, 2 vstelm.h vr13, t2, 140, 6 ld.bu t7, a2, 15 st.b t7, t2, 142 b .Z1_UEDGE_END .Z1_UEDGE_W16: andi t6, a3, 16 beqz t6, .Z1_UEDGE_W32 .Z1_UEDGE_W16_H4: andi t6, a4, 4 beqz t6, .Z1_UEDGE_W16_H8 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-18 vld vr7, a2, 15 z1_upsample_edge_data_init1 vstelm.w vr13, t2, 32, 0 vstelm.h vr13, t2, 36, 2 ld.bu t7, a2, 19 st.b t7, t2, 38 b .Z1_UEDGE_END .Z1_UEDGE_W16_H8: andi t6, a4, 8 beqz t6, .Z1_UEDGE_W16_H16 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-22 vld vr7, a2, 15 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 32 vstelm.w vr13, t2, 40, 2 vstelm.h vr13, t2, 44, 6 ld.bu t7, a2, 23 st.b t7, t2, 46 b .Z1_UEDGE_END .Z1_UEDGE_W16_H16: andi t6, a4, 16 beqz t6, .Z1_UEDGE_W16_H32 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-30 vld vr7, a2, 23 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 48 vstelm.w vr13, t2, 56, 2 vstelm.h vr13, t2, 60, 6 ld.bu t7, a2, 31 st.b t7, t2, 62 b .Z1_UEDGE_END .Z1_UEDGE_W16_H32: andi t6, a4, 32 beqz t6, .Z1_UEDGE_W16_H64 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst 
vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init2 vst vr13, t2, 48 //32-39 vldrepl.b vr7, a2, 31 z1_upsample_edge_calc_other vst vr13, t2, 64 //40-46 fst.d f13, t2, 80 vstelm.w vr13, t2, 88, 2 vstelm.h vr13, t2, 92, 6 ld.bu t7, a2, 31 st.b t7, t2, 94 b .Z1_UEDGE_END .Z1_UEDGE_W16_H64: //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init2 vst vr13, t2, 48 //32-39 vldrepl.b vr7, a2, 31 z1_upsample_edge_calc_other vst vr13, t2, 64 vst vr13, t2, 80 //40-47 vst vr13, t2, 96 //48-55 vst vr13, t2, 112 //56-63 vst vr13, t2, 128 //64-71 //72-78 fst.d f13, t2, 144 vstelm.w vr13, t2, 152, 2 vstelm.h vr13, t2, 156, 6 ld.bu t7, a2, 31 st.b t7, t2, 158 b .Z1_UEDGE_END .Z1_UEDGE_W32: andi t6, a3, 32 beqz t6, .Z1_UEDGE_W64 .Z1_UEDGE_W32_H8: andi t6, a4, 8 beqz t6, .Z1_UEDGE_W32_H16 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-38 vld vr7, a2, 31 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 64 vstelm.w vr13, t2, 72, 2 vstelm.h vr13, t2, 76, 6 ld.bu t7, a2, 39 st.b t7, t2, 78 b .Z1_UEDGE_END .Z1_UEDGE_W32_H16: andi t6, a4, 16 beqz t6, .Z1_UEDGE_W32_H32 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 
//40-46 vld vr7, a2, 39 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 80 vstelm.w vr13, t2, 88, 2 vstelm.h vr13, t2, 92, 6 ld.bu t7, a2, 47 st.b t7, t2, 94 b .Z1_UEDGE_END .Z1_UEDGE_W32_H32: andi t6, a4, 32 beqz t6, .Z1_UEDGE_W32_H64 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 //40-47 vld vr7, a2, 39 z1_upsample_edge_data_init1 vst vr13, t2, 80 //48-55 vld vr7, a2, 47 z1_upsample_edge_data_init1 vst vr13, t2, 96 //56-62 vld vr7, a2, 55 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vextrins.b vr12, vr12, 0x76 vbsrl.v vr13, vr7, 3 z1_upsample_edge_calc_loop fst.d f13, t2, 112 vstelm.w vr13, t2, 120, 2 vstelm.h vr13, t2, 124, 6 ld.bu t7, a2, 63 st.b t7, t2, 126 b .Z1_UEDGE_END .Z1_UEDGE_W32_H64: //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 //40-47 vld vr7, a2, 39 z1_upsample_edge_data_init1 vst vr13, t2, 80 //48-55 vld vr7, a2, 47 z1_upsample_edge_data_init1 vst vr13, t2, 96 //56-63 vld vr7, a2, 55 z1_upsample_edge_data_init2 vst vr13, t2, 112 //64-71 vldrepl.b vr7, a2, 63 z1_upsample_edge_calc_other vst vr13, t2, 128 vst vr13, t2, 144 //72-79 vst vr13, t2, 160 //80-87 //88-94 fst.d f13, t2, 176 vstelm.w vr13, t2, 184, 2 vstelm.h vr13, t2, 188, 6 ld.bu t7, a2, 63 st.b t7, t2, 190 b .Z1_UEDGE_END .Z1_UEDGE_W64: .Z1_UEDGE_W64_H16: andi t6, a4, 16 beqz t6, .Z1_UEDGE_W64_H32 //0-7 vld vr7, a2, -1 
z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 //40-47 vld vr7, a2, 39 z1_upsample_edge_data_init1 vst vr13, t2, 80 //48-55 vld vr7, a2, 47 z1_upsample_edge_data_init1 vst vr13, t2, 96 //56-63 vld vr7, a2, 55 z1_upsample_edge_data_init1 vst vr13, t2, 112 //64-71 vld vr7, a2, 63 z1_upsample_edge_data_init1 vst vr13, t2, 128 //72-78 vld vr7, a2, 71 z1_upsample_edge_data_init2 fst.d f13, t2, 144 vstelm.w vr13, t2, 152, 2 vstelm.h vr13, t2, 156, 6 ld.bu t7, a2, 79 st.b t7, t2, 158 b .Z1_UEDGE_END .Z1_UEDGE_W64_H32: andi t6, a4, 32 beqz t6, .Z1_UEDGE_W64_H64 //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 //40-47 vld vr7, a2, 39 z1_upsample_edge_data_init1 vst vr13, t2, 80 //48-55 vld vr7, a2, 47 z1_upsample_edge_data_init1 vst vr13, t2, 96 //56-63 vld vr7, a2, 55 z1_upsample_edge_data_init1 vst vr13, t2, 112 //64-71 vld vr7, a2, 63 z1_upsample_edge_data_init1 vst vr13, t2, 128 //72-79 vld vr7, a2, 71 z1_upsample_edge_data_init1 vst vr13, t2, 144 //80-87 vld vr7, a2, 79 z1_upsample_edge_data_init1 vst vr13, t2, 160 //88-94 vld vr7, a2, 87 z1_upsample_edge_data_init2 fst.d f13, t2, 176 vstelm.w vr13, t2, 184, 2 vstelm.h vr13, t2, 188, 6 ld.bu t7, a2, 95 st.b t7, t2, 190 b .Z1_UEDGE_END .Z1_UEDGE_W64_H64: //0-7 vld vr7, a2, -1 z1_upsample_edge_data_init1 vst vr13, t2, 0 //8-15 vld vr7, a2, 7 z1_upsample_edge_data_init1 vst vr13, t2, 16 //16-23 vld vr7, a2, 15 z1_upsample_edge_data_init1 vst vr13, t2, 32 //24-31 vld vr7, 
a2, 23 z1_upsample_edge_data_init1 vst vr13, t2, 48 //32-39 vld vr7, a2, 31 z1_upsample_edge_data_init1 vst vr13, t2, 64 //40-47 vld vr7, a2, 39 z1_upsample_edge_data_init1 vst vr13, t2, 80 //48-55 vld vr7, a2, 47 z1_upsample_edge_data_init1 vst vr13, t2, 96 //56-63 vld vr7, a2, 55 z1_upsample_edge_data_init1 vst vr13, t2, 112 //64-71 vld vr7, a2, 63 z1_upsample_edge_data_init1 vst vr13, t2, 128 //72-79 vld vr7, a2, 71 z1_upsample_edge_data_init1 vst vr13, t2, 144 //80-87 vld vr7, a2, 79 z1_upsample_edge_data_init1 vst vr13, t2, 160 //88-95 vld vr7, a2, 87 z1_upsample_edge_data_init1 vst vr13, t2, 176 //96-103 vld vr7, a2, 95 z1_upsample_edge_data_init1 vst vr13, t2, 192 //104-111 vld vr7, a2, 103 z1_upsample_edge_data_init1 vst vr13, t2, 208 //112-119 vld vr7, a2, 111 z1_upsample_edge_data_init1 vst vr13, t2, 224 //120-126 vld vr7, a2, 119 z1_upsample_edge_data_init2 fst.d f13, t2, 240 vstelm.w vr13, t2, 248, 2 vstelm.h vr13, t2, 252, 6 ld.bu t7, a2, 127 st.b t7, t2, 254 b .Z1_UEDGE_END .Z1_UEDGE_END: //upsample_edge end or a7, t2, t2 //top add.d t0, a3, a4 slli.d t0, t0, 1 addi.d t0, t0, -2 //max_base_x slli.d t1, t1, 1 b .IPRED_Z1_UA_END .IPRED_Z1_NOTUA: or t5, zero, zero //upsample_above=0 beqz a7, .IPRED_Z1_NOTFS add.d a7, a3, a4 //w+h li.w t4, 90 sub.d t4, t4, a5 // ipred_get_filter_strength a6:filter_strength beqz a6, .Z1_GETFS20 .Z1_GETFS10: //wh<=8 addi.d t6, a7, -8 blt zero, t6, .Z1_GETFS11 addi.d t6, t4, -64 blt t6, zero, .Z1_GETFS101 ori a6, zero, 2 b .Z1_GETFS40 .Z1_GETFS101: addi.d t6, t4, -40 blt t6, zero, .Z1_GETFS30 ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS11: //wh<=16 addi.d t6, a7, -16 blt zero, t6, .Z1_GETFS12 addi.d t6, t4, -48 blt t6, zero, .Z1_GETFS111 ori a6, zero, 2 b .Z1_GETFS40 .Z1_GETFS111: addi.d t6, t4, -20 blt t6, zero, .Z1_GETFS30 ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS12: //wh<=24 addi.d t6, a7, -24 blt zero, t6, .Z1_GETFS13 addi.d t6, t4, -4 blt t6, zero, .Z1_GETFS30 ori a6, zero, 3 b .Z1_GETFS40 .Z1_GETFS13: ori a6, zero, 3 b 
.Z1_GETFS40 .Z1_GETFS20: //wh<=8 addi.d t6, a7, -8 blt zero, t6, .Z1_GETFS21 addi.d t6, t4, -56 blt t6, zero, .Z1_GETFS30 ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS21: //wh<=16 addi.d t6, a7, -16 blt zero, t6, .Z1_GETFS22 addi.d t6, t4, -40 blt t6, zero, .Z1_GETFS30 ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS22: //wh<=24 addi.d t6, a7, -24 blt zero, t6, .Z1_GETFS23 addi.d t6, t4, -32 blt t6, zero, .Z1_GETFS221 ori a6, zero, 3 b .Z1_GETFS40 .Z1_GETFS221: addi.d t6, t4, -16 blt t6, zero, .Z1_GETFS222 ori a6, zero, 2 b .Z1_GETFS40 .Z1_GETFS222: addi.d t6, t4, -8 blt t6, zero, .Z1_GETFS30 ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS23: //wh<=32 addi.d t6, a7, -32 blt zero, t6, .Z1_GETFS24 addi.d t6, t4, -32 blt t6, zero, .Z1_GETFS231 ori a6, zero, 3 b .Z1_GETFS40 .Z1_GETFS231: addi.d t6, t4, -4 blt t6, zero, .Z1_GETFS232 ori a6, zero, 2 b .Z1_GETFS40 .Z1_GETFS232: ori a6, zero, 1 b .Z1_GETFS40 .Z1_GETFS24: ori a6, zero, 3 b .Z1_GETFS40 .Z1_GETFS30: or a6, zero, zero .Z1_GETFS40: beqz a6, .IPRED_Z1_NOTFS .IPRED_Z1_IFFS: // filter_edge addi.d a6, a6, -1 slli.d a6, a6, 4 la.local t0, ipred_filter_edge_kernel1 vldx vr1, t0, a6 //kernel[0-3] la.local t0, ipred_filter_edge_kernel2 vldx vr6, t0, a6 //kernel[4] .IPRED_Z1_FS_W4: andi t0, a3, 4 beqz t0, .IPRED_Z1_FS_W8 .IPRED_Z1_FS_W4_H4: andi t0, a4, 4 beqz t0, .IPRED_Z1_FS_W4_H8 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init4 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W4_H8: andi t0, a4, 8 beqz t0, .IPRED_Z1_FS_W4_H16 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init4 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-11 vreplvei.b vr10, vr7, 8 vextrins.b vr10, vr7, 0x07 z1_filter_edge_calc_other fst.s f12, t2, 8 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W4_H16: andi t0, a4, 16 beqz t0, .IPRED_Z1_FS_W4_H32 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init4 vbsrl.v vr13, vr7, 
3 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vreplvei.b vr10, vr7, 8 vextrins.b vr10, vr7, 0x07 z1_filter_edge_calc_other fst.d f12, t2, 8 //16-19 vreplvei.b vr12, vr12, 1 fst.s f12, t2, 16 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W4_H32: andi t0, a4, 32 beqz t0, .IPRED_Z1_FS_W4_H64 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init4 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vreplvei.b vr10, vr7, 8 vextrins.b vr10, vr7, 0x07 z1_filter_edge_calc_other fst.d f12, t2, 8 //16-23 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 16 fst.d f12, t2, 24 //24-31 fst.s f12, t2, 32 //32-35 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W4_H64: //0-7 vld vr7, a2, -1 z1_filter_edge_data_init4 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vreplvei.b vr10, vr7, 8 vextrins.b vr10, vr7, 0x07 z1_filter_edge_calc_other fst.d f12, t2, 8 //16-23 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 16 fst.d f12, t2, 24 //24-31 fst.d f12, t2, 32 //32-39 fst.d f12, t2, 40 //40-47 fst.d f12, t2, 48 //48-55 fst.d f12, t2, 56 //56-63 fst.s f12, t2, 64 //64-67 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W8: andi t0, a3, 8 beqz t0, .IPRED_Z1_FS_W16 .IPRED_Z1_FS_W8_H4: andi t0, a4, 4 beqz t0, .IPRED_Z1_FS_W8_H8 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-11 vld vr7, a2, 6 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x32 vsllwil.hu.bu vr10, vr7, 0 vsllwil.hu.bu vr11, vr11, 0 vsllwil.hu.bu vr12, vr12, 0 vsllwil.hu.bu vr13, vr13, 0 z1_filter_edge_calc_loop1 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x21 vextrins.b vr13, vr13, 0x31 z1_filter_edge_calc_loop2 fst.s f12, t2, 8 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W8_H8: andi t0, a4, 8 beqz t0, .IPRED_Z1_FS_W8_H16 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 
z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W8_H16: andi t0, a4, 16 beqz t0, .IPRED_Z1_FS_W8_H32 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 16 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W8_H32: andi t0, a4, 32 beqz t0, .IPRED_Z1_FS_W8_H64 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 16 //24-31 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 24 //32-39 fst.d f12, t2, 32 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W8_H64: //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 16 //24-31 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 24 fst.d f12, t2, 32 //32-39 fst.d f12, t2, 40 //40-47 fst.d f12, t2, 48 //48-55 fst.d f12, t2, 56 //56-63 fst.d f12, t2, 64 //64-71 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W16: andi t0, a3, 16 beqz t0, .IPRED_Z1_FS_W32 .IPRED_Z1_FS_W16_H4: andi t0, a4, 4 beqz t0, .IPRED_Z1_FS_W16_H8 //0-7 vld vr7, a2, -1 
z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-19 vld vr7, a2, 14 vbsrl.v vr11, vr7, 1 vbsrl.v vr12, vr7, 2 vbsrl.v vr13, vr7, 3 vextrins.b vr13, vr13, 0x32 vsllwil.hu.bu vr10, vr7, 0 vsllwil.hu.bu vr11, vr11, 0 vsllwil.hu.bu vr12, vr12, 0 vsllwil.hu.bu vr13, vr13, 0 z1_filter_edge_calc_loop1 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x21 vextrins.b vr13, vr13, 0x31 z1_filter_edge_calc_loop2 fst.s f12, t2, 16 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W16_H8: andi t0, a4, 8 beqz t0, .IPRED_Z1_FS_W16_H16 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W16_H16: andi t0, a4, 16 beqz t0, .IPRED_Z1_FS_W16_H32 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W16_H32: andi t0, a4, 32 beqz t0, .IPRED_Z1_FS_W16_H64 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 
z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 32 //40-47 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 40 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W16_H64: //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 32 //40-47 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 40 fst.d f12, t2, 48 //48-55 fst.d f12, t2, 56 //56-63 fst.d f12, t2, 64 //64-71 fst.d f12, t2, 72 //72-81 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W32: andi t0, a3, 32 beqz t0, .IPRED_Z1_FS_W64 .IPRED_Z1_FS_W32_H8: andi t0, a4, 8 beqz t0, .IPRED_Z1_FS_W32_H16 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 b 
.IPRED_Z1_FS_END .IPRED_Z1_FS_W32_H16: andi t0, a4, 16 beqz t0, .IPRED_Z1_FS_W32_H32 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W32_H32: andi t0, a4, 32 beqz t0, .IPRED_Z1_FS_W32_H64 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 //48-55 vld vr7, a2, 46 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 48 //56-63 vld vr7, a2, 54 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 56 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W32_H64: //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 
z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 //48-55 vld vr7, a2, 46 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 48 //56-63 vld vr7, a2, 54 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 56 //64-71 vreplvei.b vr10, vr7, 9 vextrins.b vr10, vr7, 0x08 z1_filter_edge_calc_other fst.d f12, t2, 64 //72-89 vreplvei.b vr12, vr12, 1 fst.d f12, t2, 72 fst.d f12, t2, 80 //80-87 fst.d f12, t2, 88 //88-95 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W64: .IPRED_Z1_FS_W64_H16: andi t0, a4, 16 beqz t0, .IPRED_Z1_FS_W64_H32 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 //48-55 vld vr7, a2, 46 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d 
f12, t2, 48 //56-63 vld vr7, a2, 54 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 56 //64-71 vld vr7, a2, 62 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 64 //72-79 vld vr7, a2, 70 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 72 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W64_H32: andi t0, a4, 32 beqz t0, .IPRED_Z1_FS_W64_H64 //0-7 vld vr7, a2, -1 z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 //48-55 vld vr7, a2, 46 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 48 //56-63 vld vr7, a2, 54 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 56 //64-71 vld vr7, a2, 62 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 64 //72-79 vld vr7, a2, 70 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 72 //80-87 vld vr7, a2, 78 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 80 //88-95 vld vr7, a2, 86 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 88 b .IPRED_Z1_FS_END .IPRED_Z1_FS_W64_H64: //0-7 vld vr7, a2, -1 
z1_filter_edge_data_init1 vbsrl.v vr13, vr7, 3 z1_filter_edge_calc_loop2 fst.d f12, t2, 0 //8-15 vld vr7, a2, 6 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 8 //16-23 vld vr7, a2, 14 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 16 //24-31 vld vr7, a2, 22 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 24 //32-39 vld vr7, a2, 30 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 32 //40-47 vld vr7, a2, 38 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 40 //48-55 vld vr7, a2, 46 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 48 //56-63 vld vr7, a2, 54 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 56 //64-71 vld vr7, a2, 62 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 64 //72-79 vld vr7, a2, 70 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 72 //80-87 vld vr7, a2, 78 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 80 //88-95 vld vr7, a2, 86 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 88 //96-103 vld vr7, a2, 94 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 96 //104-111 vld vr7, a2, 102 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 104 //112-119 vld vr7, a2, 110 z1_filter_edge_data_init2 vbsrl.v vr13, vr7, 4 z1_filter_edge_calc_loop2 fst.d f12, t2, 112 //120-127 vld vr7, a2, 118 z1_filter_edge_data_init3 vbsrl.v vr13, vr7, 4 vextrins.b vr13, vr13, 0x65 vextrins.b vr13, vr13, 0x75 z1_filter_edge_calc_loop2 fst.d f12, t2, 120 .IPRED_Z1_FS_END: addi.d t0, a7, -1 //max_base_x or a7, t2, t2 //top b .IPRED_Z1_UA_END .IPRED_Z1_NOTFS: or a7, a2, a2 
//top // imin_gr blt a3, a4, .Z1_IMIN1 or t0, a4, a4 b .Z1_IMIN2 .Z1_IMIN1: or t0, a3, a3 .Z1_IMIN2: add.d t0, a3, t0 addi.d t0, t0, -1 //max_base_x .IPRED_Z1_UA_END: //st dst, t1:dx a2 a6 t6 t7 beqz t5, .Z1_UA0 li.w a5, 64 vreplgr2vr.h vr0, a5 vsrai.h vr7, vr0, 1 or t2, zero, zero //y or t3, t1, t1 //xpos .Z1_LOOPY: andi t4, t3, 0x3e //frac vreplgr2vr.h vr1, t4 vsub.h vr2, vr0, vr1 or a6, zero, zero //x or a2, zero, zero //base_num srai.d t6, t3, 6 //base or t7, t6, t6 bge t7, t0, .Z1_LOOPX .Z1_BASENUM: addi.d a2, a2, 1 addi.d t7, t7, 2 blt t7, t0, .Z1_BASENUM .Z1_LOOPX: blt a2, a3, .Z1_LOOPX_BASEMAX srai.d t8, a3, 3 //loop param beqz t8, .Z1_LOOPX_W4 .Z1_LOOPX_W8: add.d t5, a7, t6 vld vr3, t5, 0 vpickev.b vr5, vr3, vr3 //0 2 4 6... vpickod.b vr6, vr3, vr3 //1 3 5 7... vsllwil.hu.bu vr5, vr5, 0 vsllwil.hu.bu vr6, vr6, 0 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.d f3, a0, a6 addi.d a6, a6, 8 addi.d t6, t6, 16 addi.d t8, t8, -1 bnez t8, .Z1_LOOPX_W8 b .Z1_LOOPY_END .Z1_LOOPX_W4: vldx vr3, a7, t6 vsllwil.hu.bu vr3, vr3, 0 vpickev.h vr5, vr3, vr3 //0 2 4 6... vpickod.h vr6, vr3, vr3 //1 3 5 7... vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.s f3, a0, a6 b .Z1_LOOPY_END .Z1_LOOPX_BASEMAX: srai.d t8, a2, 3 //loop param beqz t8, .Z1_LOOPX_BASEMAX4 .Z1_LOOPX_BASEMAX8: add.d t5, a7, t6 vld vr3, t5, 0 vpickev.b vr5, vr3, vr3 //0 2 4 6... vpickod.b vr6, vr3, vr3 //1 3 5 7... vsllwil.hu.bu vr5, vr5, 0 vsllwil.hu.bu vr6, vr6, 0 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.d f3, a0, a6 addi.d a6, a6, 8 addi.d t6, t6, 16 addi.d t8, t8, -1 bnez t8, .Z1_LOOPX_BASEMAX8 .Z1_LOOPX_BASEMAX4: andi t8, a2, 4 beqz t8, .Z1_LOOPX_BASEMAX2 vldx vr3, a7, t6 vsllwil.hu.bu vr3, vr3, 0 vpickev.h vr5, vr3, vr3 //0 2 4 6... vpickod.h vr6, vr3, vr3 //1 3 5 7... 
vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.s f3, a0, a6 addi.d a6, a6, 4 addi.d t6, t6, 8 .Z1_LOOPX_BASEMAX2: andi t8, a2, 2 beqz t8, .Z1_LOOPX_BASEMAX1 vldx vr3, a7, t6 vsllwil.hu.bu vr3, vr3, 0 vpickev.h vr5, vr3, vr3 //0 2 4 6... vpickod.h vr6, vr3, vr3 //1 3 5 7... vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 vpickve2gr.bu t7, vr3, 0 vpickve2gr.bu t8, vr3, 1 stx.b t7, a0, a6 addi.d a6, a6, 1 stx.b t8, a0, a6 addi.d a6, a6, 1 addi.d t6, t6, 4 .Z1_LOOPX_BASEMAX1: andi t8, a2, 1 beqz t8, .Z1_LOOPX_BASEMAX_MSET add.d a2, a7, t6 sub.d t7, a5, t4 ld.bu t8, a2, 0 mul.w t7, t7, t8 ld.bu t8, a2, 1 mul.w t8, t8, t4 add.d t7, t7, t8 addi.d t7, t7, 32 srai.d t7, t7, 6 stx.b t7, a0, a6 addi.d a6, a6, 1 .Z1_LOOPX_BASEMAX_MSET: //memset add.d t6, a0, a6 //dst add.d t7, a7, t0 //src sub.d a2, a3, a6 //size pixel_set_8bpc_allw t6, t7, a2, t8, t4 .Z1_LOOPY_END: addi.d t2, t2, 1 add.d a0, a0, a1 add.d t3, t3, t1 blt t2, a4, .Z1_LOOPY b .Z1_END .Z1_UA0: li.w a5, 64 vreplgr2vr.h vr0, a5 vsrai.h vr7, vr0, 1 or t2, zero, zero //y or t3, t1, t1 //xpos .Z1_UA0_LOOPY: andi t4, t3, 0x3e //frac vreplgr2vr.h vr1, t4 vsub.h vr2, vr0, vr1 or a6, zero, zero //x srai.d t6, t3, 6 //base sub.d a2, t0, t6 //a2:base_num blt a2, zero, .Z1_UA0_BASENUM b .Z1_UA0_LOOPX .Z1_UA0_BASENUM: or a2, zero, zero .Z1_UA0_LOOPX: blt a2, a3, .Z1_UA0_LOOPX_BASEMAX srai.d t8, a3, 3 //loop param beqz t8, .Z1_UA0_LOOPX_W4 .Z1_UA0_LOOPX_W8: add.d t5, a7, t6 vld vr5, t5, 0 vld vr6, t5, 1 vsllwil.hu.bu vr5, vr5, 0 vsllwil.hu.bu vr6, vr6, 0 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.d f3, a0, a6 addi.d a6, a6, 8 addi.d t6, t6, 8 addi.d t8, t8, -1 bnez t8, .Z1_UA0_LOOPX_W8 b .Z1_UA0_LOOPY_END .Z1_UA0_LOOPX_W4: vldx vr5, a7, t6 vsllwil.hu.bu vr5, vr5, 0 vbsrl.v vr6, vr5, 2 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, 
vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.s f3, a0, a6 b .Z1_UA0_LOOPY_END .Z1_UA0_LOOPX_BASEMAX: srai.d t8, a2, 3 //loop param beqz t8, .Z1_UA0_LOOPX_BASEMAX4 .Z1_UA0_LOOPX_BASEMAX8: add.d t5, a7, t6 vld vr5, t5, 0 vld vr6, t5, 1 vsllwil.hu.bu vr5, vr5, 0 vsllwil.hu.bu vr6, vr6, 0 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.d f3, a0, a6 addi.d a6, a6, 8 addi.d t6, t6, 8 addi.d t8, t8, -1 bnez t8, .Z1_UA0_LOOPX_BASEMAX8 .Z1_UA0_LOOPX_BASEMAX4: andi t8, a2, 4 beqz t8, .Z1_UA0_LOOPX_BASEMAX2 vldx vr5, a7, t6 vsllwil.hu.bu vr5, vr5, 0 vbsrl.v vr6, vr5, 2 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 fstx.s f3, a0, a6 addi.d a6, a6, 4 addi.d t6, t6, 4 .Z1_UA0_LOOPX_BASEMAX2: andi t8, a2, 2 beqz t8, .Z1_UA0_LOOPX_BASEMAX1 vldx vr5, a7, t6 vsllwil.hu.bu vr5, vr5, 0 vbsrl.v vr6, vr5, 2 vmul.h vr3, vr5, vr2 vmadd.h vr3, vr6, vr1 vadd.h vr3, vr3, vr7 vsrai.h vr3, vr3, 6 vsrlni.b.h vr3, vr3, 0 vpickve2gr.bu t7, vr3, 0 vpickve2gr.bu t8, vr3, 1 stx.b t7, a0, a6 addi.d a6, a6, 1 stx.b t8, a0, a6 addi.d a6, a6, 1 addi.d t6, t6, 2 .Z1_UA0_LOOPX_BASEMAX1: andi t8, a2, 1 beqz t8, .Z1_UA0_LOOPX_BASEMAX_MSET add.d a2, a7, t6 sub.d t7, a5, t4 ld.bu t8, a2, 0 mul.w t7, t7, t8 ld.bu t8, a2, 1 mul.w t8, t8, t4 add.d t7, t7, t8 addi.d t7, t7, 32 srai.d t7, t7, 6 stx.b t7, a0, a6 addi.d a6, a6, 1 .Z1_UA0_LOOPX_BASEMAX_MSET: //memset add.d t6, a0, a6 //dst add.d t7, a7, t0 //src sub.d a2, a3, a6 //size pixel_set_8bpc_allw t6, t7, a2, t8, t4 .Z1_UA0_LOOPY_END: addi.d t2, t2, 1 add.d a0, a0, a1 add.d t3, t3, t1 blt t2, a4, .Z1_UA0_LOOPY .Z1_END: addi.d sp, sp, 128 endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/ipred.h000066400000000000000000000100241517466257200240600ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Loongson Technology Corporation Limited * All rights 
reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef DAV2D_SRC_LOONGARCH_IPRED_H
#define DAV2D_SRC_LOONGARCH_IPRED_H

#include "config.h"
#include "src/ipred.h"
#include "src/cpu.h"
#include "src/tables.h"

/*
 * Fixed-point constants. 0x5556 == ceil(2^16 / 3) and 0x3334 == ceil(2^16 / 5),
 * and BASE_SHIFT matches the 2^16 scale. Nothing in this header references
 * them. NOTE(review): presumably they are reciprocal multipliers consumed by
 * code that includes this header; confirm against the users.
 */
#define MULTIPLIER_1x2 0x5556
#define MULTIPLIER_1x4 0x3334
#define BASE_SHIFT 16

/*
 * init_fn(type0, type1, name, suffix) stores the function
 * BF(dav2d_<name>, <suffix>) into c->type0[type1]. The macro relies on a
 * variable named `c` being in scope at the expansion site.
 */
#define init_fn(type0, type1, name, suffix) \
    c->type0[type1] = BF(dav2d_##name, suffix)

/* Registers an entry in the c->intra_pred[] table. */
#define init_angular_ipred_fn(type, name, suffix) \
    init_fn(intra_pred, type, name, suffix)

/* Registers an entry in the c->cfl_pred[] table. */
#define init_cfl_pred_fn(type, name, suffix) \
    init_fn(cfl_pred, type, name, suffix)

/*
 * Prototypes of the LSX routines. Note that dav2d_ipred_filter is declared
 * here but is not installed by intra_pred_dsp_init_loongarch() below.
 */
decl_angular_ipred_fn(BF(dav2d_ipred_dc, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_dc_128, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_dc_top, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_dc_left, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_h, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_v, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_paeth, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_smooth, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_smooth_v, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_smooth_h, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_filter, lsx));
decl_angular_ipred_fn(BF(dav2d_ipred_z1, lsx));

decl_cfl_pred_fn(BF(dav2d_ipred_cfl, lsx));
decl_cfl_pred_fn(BF(dav2d_ipred_cfl_128, lsx));
decl_cfl_pred_fn(BF(dav2d_ipred_cfl_top, lsx));
decl_cfl_pred_fn(BF(dav2d_ipred_cfl_left, lsx));

decl_pal_pred_fn(BF(dav2d_pal_pred, lsx));

/*
 * Installs the LoongArch LSX intra-prediction routines into the DSP context.
 *
 * c: DSP context whose intra_pred[], cfl_pred[] and pal_pred members are
 *    overwritten for the supported predictors.
 *
 * The context is left untouched when the CPU flags do not report LSX, and
 * nothing is installed unless the file is compiled with BITDEPTH == 8.
 */
static ALWAYS_INLINE void intra_pred_dsp_init_loongarch(Dav2dIntraPredDSPContext *const c) {
    const unsigned flags = dav2d_get_cpu_flags();

    /* Without LSX, keep whatever implementations the caller already set. */
    if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return;

#if BITDEPTH == 8
    init_angular_ipred_fn(DC_PRED, ipred_dc, lsx);
    init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, lsx);
    init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, lsx);
    init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, lsx);
    init_angular_ipred_fn(HOR_PRED, ipred_h, lsx);
    init_angular_ipred_fn(VERT_PRED, ipred_v, lsx);
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, lsx);
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, lsx);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, lsx);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, lsx);
    /* Disabled: this header declares no dav2d_ipred_dip LSX routine. */
    //init_angular_ipred_fn(DIP_PRED, ipred_dip, lsx);
    init_angular_ipred_fn(Z1_PRED, ipred_z1, lsx);

    init_cfl_pred_fn(DC_PRED, ipred_cfl, lsx);
    init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, lsx);
    init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, lsx);
    init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, lsx);

    c->pal_pred = BF(dav2d_pal_pred, lsx);
#endif
}

#endif /* DAV2D_SRC_LOONGARCH_IPRED_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/itx.S000066400000000000000000005442301517466257200235470ustar00rootroot00000000000000/*
 * Copyright © 2023, VideoLAN and dav2d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

// Opens a 64-byte stack frame and spills f24-f31 into it.
// NOTE(review): presumably done because these registers are callee-saved
// under the LoongArch calling convention; confirm against the ABI.
.macro PUSH_REG
    addi.d          sp,     sp,     -64
    fst.d           f24,    sp,     0
    fst.d           f25,    sp,     8
    fst.d           f26,    sp,     16
    fst.d           f27,    sp,     24
    fst.d           f28,    sp,     32
    fst.d           f29,    sp,     40
    fst.d           f30,    sp,     48
    fst.d           f31,    sp,     56
.endm

// Inverse of PUSH_REG: reloads f24-f31 and releases the 64-byte frame.
.macro POP_REG
    fld.d           f24,    sp,     0
    fld.d           f25,    sp,     8
    fld.d           f26,    sp,     16
    fld.d           f27,    sp,     24
    fld.d           f28,    sp,     32
    fld.d           f29,    sp,     40
    fld.d           f30,    sp,     48
    fld.d           f31,    sp,     56
    addi.d          sp,     sp,     64
.endm

// Lowers sp by (number + 64) bytes and then runs PUSH_REG, which lowers it
// by a further 64 bytes and saves f24-f31 at the new stack top.
// Clobbers t0. Must be paired with free_space using the same number.
.macro malloc_space number
    li.w            t0,     \number
    sub.d           sp,     sp,     t0
    addi.d          sp,     sp,     -64
    PUSH_REG
.endm

// Undoes malloc_space: restores f24-f31, then raises sp by (number + 64).
// Clobbers t0.
.macro free_space number
    POP_REG
    li.w            t0,     \number
    add.d           sp,     sp,     t0
    addi.d          sp,     sp,     64
.endm

// Halfword butterfly applied in place to vr0-vr3, with vr4 and vr5 as
// scratch. Going by the macro name this is the 4-point inverse
// Walsh-Hadamard step; it is used twice by the 4x4 WHT function below.
.macro iwht4
    vadd.h          vr0,    vr0,    vr1
    vsub.h          vr4,    vr2,    vr3
    vsub.h          vr5,    vr0,    vr4
    vsrai.h         vr5,    vr5,    1
    vsub.h          vr2,    vr5,    vr1
    vsub.h          vr1,    vr5,    vr3
    vadd.h          vr3,    vr4,    vr2
    vsub.h          vr0,    vr0,    vr1
.endm

// Adds two vectors of eight halfword residuals (in4, in5) to four rows of
// four 8-bit pixels (low word of in0..in3) and stores the result at a0.
//   in0..in3: pixel rows 0..3; in0 and in2 are overwritten
//   in4:      residuals for rows 0-1
//   in5:      residuals for rows 2-3
// NOTE(review): vstelmx.w is not defined in this chunk; presumably it
// advances a0 by a1 before storing the selected word. Confirm in
// loongson_asm.S.
.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w         \in0,   \in1,   \in0  // 0 1 2 3 4 5 6 7 x ...
    vilvl.w         \in2,   \in3,   \in2  // 8 9 10 11 12 13 14 15 x ...
    vsllwil.hu.bu   \in0,   \in0,   0     // zero-extend bytes to halfwords
    vsllwil.hu.bu   \in2,   \in2,   0
    vadd.h          \in0,   \in4,   \in0
    vadd.h          \in2,   \in5,   \in2
    vssrani.bu.h    \in2,   \in0,   0     // saturate to u8: rows 0-1 low, rows 2-3 high
    vstelm.w        \in2,   a0,     0,    0
    vstelmx.w       \in2,   a0,     a1,   1
    vstelmx.w       \in2,   a0,     a1,   2
    vstelmx.w       \in2,   a0,     a1,   3
.endm

// Loads four destination rows (a0, a0 + a1, t2, t2 + a1) into vr0-vr3 and
// hands them to DST_ADD_W4 together with the residual vectors in0 and in1.
// The caller must have set t2 to the address of row 2.
.macro VLD_DST_ADD_W4 in0, in1
    vld             vr0,    a0,     0
    vldx            vr1,    a0,     a1
    vld             vr2,    t2,     0
    vldx            vr3,    t2,     a1
    DST_ADD_W4      vr0, vr1, vr2, vr3, \in0, \in1
.endm

// 4x4 WHT/WHT inverse transform with add, 8 bpc.
// Register use, as seen in the body:
//   a0: destination pixels; four rows are read, updated and written back
//   a1: byte offset between destination rows
//   a2: 32 bytes of halfword coefficients; overwritten with zeros
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld             vr0,    a2,     0
    vld             vr2,    a2,     16

    vxor.v          vr20,   vr20,   vr20  // vr20 = 0, used to clear the coefficients
    vsrai.h         vr0,    vr0,    2
    vsrai.h         vr2,    vr2,    2

    vst             vr20,   a2,     0

    vpickod.d       vr1,    vr0,    vr0   // upper four halfwords of vr0
    vpickod.d       vr3,    vr2,    vr2   // upper four halfwords of vr2

    vst             vr20,   a2,     16

    iwht4                                 // first pass

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    iwht4                                 // second pass

    vilvl.d         vr4,    vr1,    vr0   // pack results 0 and 1
    vilvl.d         vr5,    vr3,    vr2   // pack results 2 and 3
    alsl.d          t2,     a1,     a0,   1  // t2 = a0 + 2 * a1 (row 2)
    VLD_DST_ADD_W4 vr4, vr5
endfunc

// Transform constants stored as 32-bit words.
const idct_coeffs, align=4
    .word 2896, 2896*8, 1567, 3784
    .word 799, 4017, 3406, 2276
    .word 401, 4076, 3166, 2598
    .word 1931, 3612, 3920, 1189
    .word 201, 4091, 3035, 2751
    .word 1751, 3703, 3857, 1380
    .word 995, 3973, 3513, 2106
    .word 2440, 3290, 4052, 601
endconst

// Rounding arithmetic right shift of four halfword vectors by the same
// immediate: outN = round(inN >> shift).
.macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
    vsrari.h        \out0,  \in0,   \shift
    vsrari.h        \out1,  \in1,   \shift
    vsrari.h        \out2,  \in2,   \shift
    vsrari.h        \out3,  \in3,   \shift
.endm

// Eight-register variant of vsrari_h_x4.
.macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                   out1, out2, out3, out4, out5, out6, out7, shift
    vsrari.h        \out0,  \in0,   \shift
    vsrari.h        \out1,  \in1,   \shift
    vsrari.h        \out2,  \in2,   \shift
    vsrari.h        \out3,  \in3,   \shift
    vsrari.h        \out4,  \in4,   \shift
    vsrari.h        \out5,  \in5,   \shift
    vsrari.h        \out6,  \in6,   \shift
    vsrari.h        \out7,  \in7,   \shift
.endm

// Widening multiply-accumulate: computes in0 * in2 + in1 * in3 per
// halfword lane as 32-bit values.
//   sz == .4h: out0 receives lanes 0-3 in order; out1 is left holding the
//              odd-lane sums and should be treated as scratch.
//   otherwise: out0 receives lanes 0-3 and out1 lanes 4-7; vr22 is
//              clobbered as a temporary.
.macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz
    vmulwev.w.h     \out0,  \in0,   \in2  // even lanes
    vmulwod.w.h     \out1,  \in0,   \in2  // odd lanes
    vmaddwev.w.h    \out0,  \in1,   \in3
    vmaddwod.w.h    \out1,  \in1,   \in3
.ifc \sz, .4h
    vilvl.w         \out0,  \out1,  \out0 // re-interleave even/odd into lane order
.else
    vilvl.w         vr22,   \out1,  \out0
    vilvh.w         \out1,  \out1,  \out0
    vor.v           \out0,  vr22,   vr22  // register move
.endif
.endm

// Same values as idct_coeffs, stored as 16-bit halfwords.
const idct_coeffs_h, align=4
    .short 2896, 2896*8, 1567, 3784
    .short 799, 4017, 3406, 2276
    .short 401, 4076, 3166, 2598
    .short 1931, 3612, 3920, 1189
    .short 201, 4091, 3035, 2751
    .short 1751, 3703, 3857, 1380
    .short 995, 3973, 3513, 2106
    .short 2440, 3290, 4052, 601
endconst

// Multipliers loaded by inv_adst4_lsx and inv_adst_8x4_lsx.
const iadst4_coeffs, align=4
    .word 1321, 3803, 2482, 3344
endconst

// 4-point inverse DCT on halfword vectors.
//   t0 = (in0 + in2) * 2896           t1 = (in0 - in2) * 2896
//   t3 = in1 * 3784 + in3 * 1567      t2 = in1 * 1567 - in3 * 3784
// each narrowed with a rounding, saturating shift right by 12, then
//   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3
// using saturating halfword add/sub.
// sz selects .4h (four lanes) or .8h (eight lanes); see vmulev_vmaddod_lsx.
// Clobbers the GPR t0, vr16-vr21, vr22 (for sz other than .4h), and the
// in0 and in2 registers, which are reused as scratch. Outputs are written
// only after every input has been read, so out registers may alias inputs.
.macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz
    la.local        t0,     idct_coeffs_h

    vldrepl.h       vr20,   t0,     0     // 2896
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz
    vneg.h          vr21,   vr20          // -2896
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz
    vssrarni.h.w    vr18,   vr16,   12    // t0
    vssrarni.h.w    vr19,   vr17,   12    // t1

    vldrepl.h       vr20,   t0,     4     // 1567
    vldrepl.h       vr21,   t0,     6     // 3784
    vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz
    vneg.h          vr21,   vr21          // -3784
    vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz
    vssrarni.h.w    vr16,   \in0,   12    // t3
    vssrarni.h.w    vr17,   \in2,   12    // t2

    vsadd.h         \out0,  vr18,   vr16
    vsadd.h         \out1,  vr19,   vr17
    vssub.h         \out2,  vr19,   vr17
    vssub.h         \out3,  vr18,   vr16
.endm

// In-place 4-point inverse DCT of vr0-vr3, four halfword lanes per register.
functionl inv_dct_4h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h
endfuncl

// In-place 4-point inverse DCT of vr0-vr3, eight halfword lanes per register.
functionl inv_dct_8h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h
endfuncl

// Core of the 4-point inverse ADST on 32-bit lanes, before any rounding.
// Expects vr20 = 1321, vr21 = 3803, vr22 = 2482, vr23 = 3344 (the callers
// in this file load them from iadst4_coeffs). Uses vr15-vr19 as scratch.
// Outputs are written only after every input has been read, so the out
// registers may alias the in registers.
.macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3
    vsub.w          vr16,   \in0,   \in2  // in0-in2
    vmul.w          vr17,   \in0,   vr20  // in0*1321
    vmul.w          vr19,   \in0,   vr22  // in0*2482
    vmul.w          vr18,   \in1,   vr23  // in1*3344
    vmadd.w         vr17,   \in2,   vr21  // in0*1321+in2*3803
    vmsub.w         vr19,   \in2,   vr20  // in0*2482-in2*1321
    vadd.w          vr16,   vr16,   \in3  // in0-in2+in3
    vmadd.w         vr17,   \in3,   vr22  // in0*1321+in2*3803+in3*2482
    vmsub.w         vr19,   \in3,   vr21  // in0*2482-in2*1321-in3*3803
    vadd.w          vr15,   vr17,   vr19
    vmul.w          \out2,  vr16,   vr23  // out[2] 8 9 10 11
    vadd.w          \out0,  vr17,   vr18  // out[0] 0 1 2 3
    vadd.w          \out1,  vr19,   vr18  // out[1] 4 5 6 7
    vsub.w          \out3,  vr15,   vr18  // out[3] 12 13 14 15
.endm

// 4-point inverse ADST of the low four halfword lanes of in0..in3.
// The inputs are sign-extended into vr0-vr3 (written in that order, so a
// caller passing anything other than vr0-vr3 must make sure no input is
// overwritten before it is read), transformed with inv_adst4_core_lsx and
// narrowed back to halfwords with a rounding, saturating shift by 12.
// Each out register ends up with the same four results in both halves.
// Clobbers t0, vr0-vr3 and vr15-vr23.
.macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local        t0,     iadst4_coeffs
    vldrepl.w       vr20,   t0,     0     // 1321
    vldrepl.w       vr21,   t0,     4     // 3803
    vldrepl.w       vr22,   t0,     8     // 2482
    vldrepl.w       vr23,   t0,     12    // 3344

    vsllwil.w.h     vr0,    \in0,   0
    vsllwil.w.h     vr1,    \in1,   0
    vsllwil.w.h     vr2,    \in2,   0
    vsllwil.w.h     vr3,    \in3,   0
    inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3
    vssrarni.h.w    \out0,  \out0,  12
    vssrarni.h.w    \out1,  \out1,  12
    vssrarni.h.w    \out2,  \out2,  12
    vssrarni.h.w    \out3,  \out3,  12
.endm

// In-place 4-point inverse ADST of vr0-vr3 (four lanes each).
functionl inv_adst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

// Same transform with the outputs written in reverse register order.
functionl inv_flipadst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

// 4-point inverse ADST of all eight halfword lanes of in0..in3.
// The low four lanes are processed first with their results parked in
// vr10-vr13; the high four lanes are then widened in place and processed.
// The final narrowing shift packs low-lane results into the low half and
// high-lane results into the high half of each out register.
// Clobbers t0, vr10-vr13, vr15-vr23 and the in registers.
.macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local        t0,     iadst4_coeffs
    vldrepl.w       vr20,   t0,     0     // 1321
    vldrepl.w       vr21,   t0,     4     // 3803
    vldrepl.w       vr22,   t0,     8     // 2482
    vldrepl.w       vr23,   t0,     12    // 3344

    vsllwil.w.h     vr10,   \in0,   0     // in0
    vsllwil.w.h     vr11,   \in1,   0     // in1
    vsllwil.w.h     vr12,   \in2,   0     // in2
    vsllwil.w.h     vr13,   \in3,   0     // in3
    inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vexth.w.h       \in0,   \in0          // in0
    vexth.w.h       \in1,   \in1          // in1
    vexth.w.h       \in2,   \in2          // in2
    vexth.w.h       \in3,   \in3          // in3
    inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3

    vssrarni.h.w    \out0,  vr10,   12
    vssrarni.h.w    \out1,  vr11,   12
    vssrarni.h.w    \out2,  vr12,   12
    vssrarni.h.w    \out3,  vr13,   12
.endm

// In-place 4-point inverse ADST of vr0-vr3 (eight lanes each).
functionl inv_adst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

// Same transform with the outputs written in reverse register order.
functionl inv_flipadst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

// Identity transform on the low four halfword lanes of vr0-vr3:
//   x = sat(x + round(x * 1697 >> 12))
// vr0/vr1 and vr2/vr3 are packed pairwise so two rows are handled per
// vector, then unpacked again at the end.
// Clobbers t0, vr16-vr20, vr22 and vr23.
functionl inv_identity_4h_x4_lsx
    li.w            t0,     1697
    vreplgr2vr.h    vr20,   t0

    vilvl.d         vr0,    vr1,    vr0   // rows 0 and 1 in one vector
    vilvl.d         vr2,    vr3,    vr2   // rows 2 and 3 in one vector
    vmulwev.w.h     vr16,   vr0,    vr20
    vmulwod.w.h     vr17,   vr0,    vr20
    vmulwev.w.h     vr18,   vr2,    vr20
    vmulwod.w.h     vr19,   vr2,    vr20
    vilvl.w         vr1,    vr17,   vr16  // restore lane order, low half
    vilvh.w         vr3,    vr17,   vr16  // restore lane order, high half
    vilvl.w         vr22,   vr19,   vr18
    vilvh.w         vr23,   vr19,   vr18
    vssrarni.h.w    vr3,    vr1,    12
    vssrarni.h.w    vr23,   vr22,   12
    vsadd.h         vr0,    vr3,    vr0   // t0
    vsadd.h         vr2,    vr23,   vr2   // t2

    vilvh.d         vr1,    vr0,    vr0   // t1
    vilvh.d         vr3,    vr2,    vr2   // t3
endfuncl

.macro inv_identity4_lsx1 in0, in1, in2, out0, out1
    vsllwil.w.h     vr16,   \in0,   0
    vexth.w.h       vr17,   \in1
    vmul.w
vr18, vr16, \in2
    vmul.w          vr19, vr17, \in2
    vsrari.w        vr18, vr18, 12
    vsrari.w        vr19, vr19, 12
    vadd.w          \out0, vr18, vr16
    vadd.w          \out1, vr19, vr17
    vssrarni.h.w    \out1, \out0, 1
.endm

// 4-point identity transform on vr0..vr3, eight 16-bit lanes each.
// Every lane becomes x + ((x * 1697 + 2048) >> 12) with saturating
// narrowing and a saturating add.
// Clobbers: t0, vr16-vr23.
functionl inv_identity_8h_x4_lsx
    li.w            t0, 1697
    vreplgr2vr.h    vr20, t0

    // Widening multiplies of the even/odd lanes, re-interleaved into the
    // original lane order before the rounding narrow.
    vmulwev.w.h     vr16, vr0, vr20
    vmulwod.w.h     vr17, vr0, vr20
    vmulwev.w.h     vr18, vr1, vr20
    vmulwod.w.h     vr19, vr1, vr20
    vilvl.w         vr21, vr17, vr16
    vilvh.w         vr22, vr17, vr16
    vilvl.w         vr23, vr19, vr18
    vilvh.w         vr16, vr19, vr18
    vssrarni.h.w    vr22, vr21, 12
    vssrarni.h.w    vr16, vr23, 12
    vsadd.h         vr0, vr22, vr0      // t0
    vsadd.h         vr1, vr16, vr1      // t1

    vmulwev.w.h     vr16, vr2, vr20
    vmulwod.w.h     vr17, vr2, vr20
    vmulwev.w.h     vr18, vr3, vr20
    vmulwod.w.h     vr19, vr3, vr20
    vilvl.w         vr21, vr17, vr16
    vilvh.w         vr22, vr17, vr16
    vilvl.w         vr23, vr19, vr18
    vilvh.w         vr16, vr19, vr18
    vssrarni.h.w    vr22, vr21, 12
    vssrarni.h.w    vr16, vr23, 12
    vsadd.h         vr2, vr22, vr2      // t2
    vsadd.h         vr3, vr16, vr3      // t3
endfuncl

// Variant of the 4-point identity that also folds in a rounding >>1:
// each of vr0..vr3 is run through inv_identity4_lsx1 with the 32-bit
// constant 1697 in vr20 and vr21 as scratch.
// Clobbers: t0, vr16-vr21.
functionl inv_identity_8h_x4_lsx1
    li.w            t0, 1697
    vreplgr2vr.w    vr20, t0
.irp i, vr0, vr1, vr2, vr3
    // Arguments: in0, in1, in2 (multiplier), out0 (scratch), out1 (result).
    inv_identity4_lsx1 \i, \i, vr20, vr21, \i
.endr
endfuncl

// Shared 4x4 body.
// In: a0 = dst, a1 = stride, a2 = coefficient buffer (32 bytes, zeroed
// here), t7 / t8 = addresses of the first / second 1-D transform.
// Both transforms are reached through jirl, so ra is parked in t6 around
// each call.
functionl inv_txfm_add_4x4_lsx
    vxor.v          vr23, vr23, vr23
    vld             vr0, a2, 0
    vld             vr2, a2, 16
    vilvh.d         vr1, vr0, vr0
    vilvh.d         vr3, vr2, vr2
    vst             vr23, a2, 0         // clear the coefficients
    vst             vr23, a2, 16

    move            t6, ra
    jirl            ra, t7, 0           // first pass
    move            ra, t6

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    move            t6, ra
    jirl            ra, t8, 0           // second pass
    move            ra, t6

    vilvl.d         vr4, vr1, vr0
    vilvl.d         vr5, vr3, vr2
    vsrari.h        vr4, vr4, 4
    vsrari.h        vr5, vr5, 4
    alsl.d          t2, a1, a0, 1       // t2 = dst + 2 * stride
    VLD_DST_ADD_W4  vr4, vr5
endfuncl

// DC-only helper: reads and clears the DC coefficient at a2, loads four
// destination rows into vr10..vr13 (a0, a0+a1, t2, t2+a1) and leaves the
// final residual, replicated in every 16-bit lane, in vr20.
// Clobbers: t2, vr0-vr2.
.macro idct_dc w, h, shift
    ld.h            t2, a2, 0           // dc
    vldi            vr0, 0x8b5          // 181
    vreplgr2vr.w    vr1, t2
    vldi            vr20, 0x880         // 128
    vmul.w          vr2, vr0, vr1       // dc * 181
    st.h            zero, a2, 0
    vsrari.w        vr2, vr2, 8         // (dc * 181 + 128) >> 8
    vld             vr10, a0, 0         // 0 1 2 3 4 5 6 7
.if (2*\w == \h) || (2*\h == \w)
    vmul.w          vr2, vr0, vr2
    vsrari.w        vr2, vr2, 8         // (dc * 181 + 128) >> 8
.endif
.if \shift>0
    vsrari.w        vr2, vr2, \shift    // (dc + rnd) >> shift
.endif
    vldx            vr11, a0, a1        // 8 9 10 11 12 13 14 15
    alsl.d          t2, a1, a0, 1
vmadd.w vr20, vr2, vr0 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 vssrarni.h.w vr20, vr20, 12 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 .endm .macro fun4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, 1f idct_dc 4, 4, 0 DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20 b .\txfm1\()_\txfm2\()_4X4_END 1: .endif la.local t7, inv_\txfm1\()_4h_x4_lsx la.local t8, inv_\txfm2\()_4h_x4_lsx b inv_txfm_add_4x4_lsx .\txfm1\()_\txfm2\()_4X4_END: endfunc .endm fun4x4 dct, dct fun4x4 identity, identity fun4x4 adst, dct fun4x4 dct, adst fun4x4 adst, adst fun4x4 dct, flipadst fun4x4 flipadst, adst fun4x4 adst, flipadst fun4x4 flipadst, dct fun4x4 flipadst, flipadst fun4x4 dct, identity fun4x4 identity, dct fun4x4 flipadst, identity fun4x4 identity, flipadst fun4x4 identity, adst fun4x4 adst, identity const iadst8_coeffs_h, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst .macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz la.local t0, iadst8_coeffs_h vldrepl.h vr20, t0, 0 // 4076 vldrepl.h vr21, t0, 2 // 401 vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz vssrarni.h.w vr17, vr16, 12 // t0a vssrarni.h.w vr19, vr18, 12 // t1a vldrepl.h vr20, t0, 4 // 3612 vldrepl.h vr21, t0, 6 // 1931 vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz vssrarni.h.w vr16, vr0, 12 // t2a vssrarni.h.w vr18, vr7, 12 // t3a vldrepl.h vr20, t0, 8 // 2598 vldrepl.h vr21, t0, 10 // 3166 vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz vssrarni.h.w vr0, vr2, 12 // t4a vssrarni.h.w vr7, vr5, 12 // t5a vldrepl.h vr20, t0, 12 // 1189 vldrepl.h vr21, t0, 14 // 3920 vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz vneg.h 
vr20, vr20 vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz vssrarni.h.w vr2, vr3, 12 // t6a vssrarni.h.w vr5, vr4, 12 // t7a vsadd.h vr3, vr17, vr0 // t0 vssub.h vr4, vr17, vr0 // t4 vsadd.h vr1, vr19, vr7 // t1 vssub.h vr6, vr19, vr7 // t5 vsadd.h vr17, vr16, vr2 // t2 vssub.h vr19, vr16, vr2 // t6 vsadd.h vr0, vr18, vr5 // t3 vssub.h vr7, vr18, vr5 // t7 la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz vssrarni.h.w vr5, vr16, 12 // t4a vssrarni.h.w vr2, vr18, 12 // t5a vneg.h vr21, vr21 vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz vssrarni.h.w vr16, vr4, 12 // t7a vssrarni.h.w vr18, vr6, 12 // t6a vsadd.h vr4, vr5, vr18 // out1 vssub.h vr19, vr5, vr18 // t6 vsadd.h vr20, vr1, vr0 // out7 vssub.h vr18, vr1, vr0 // t3 vsadd.h \out0, vr3, vr17 // out0 vssub.h vr5, vr3, vr17 // t2 vsadd.h \out6, vr2, vr16 // out6 vssub.h vr23, vr2, vr16 // t7 vsllwil.w.h vr3, vr20, 0 // out7 vexth.w.h \out7, vr20 // out7 vsllwil.w.h vr21, vr4, 0 // out1 vexth.w.h \out1, vr4 // out1 vneg.w vr3, vr3 vneg.w \out7, \out7 vneg.w vr21, vr21 vneg.w \out1, \out1 vssrarni.h.w \out7, vr3, 0 vssrarni.h.w \out1, vr21, 0 la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz vsrari.w vr16, vr16, 12 vsrari.w \out3, \out3, 12 vneg.w vr16, vr16 vneg.w \out3, \out3 vssrarni.h.w \out3, vr16, 0 // out3 vssrarni.h.w \out4, vr17, 12 // out4 vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz vssrarni.h.w \out2, vr16, 12 // out2 vsrari.w vr17, vr17, 12 vsrari.w \out5, \out5, 12 vneg.w vr17, vr17 vneg.w \out5, \out5 vssrarni.h.w \out5, vr17, 0 // 
out5 .endm functionl inv_adst_8h_x8_lsx inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_flipadst_8h_x8_lsx inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h endfuncl functionl inv_adst_4h_x8_lsx inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_flipadst_4h_x8_lsx inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h endfuncl .macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 8 // 799 vldrepl.h vr21, t0, 10 // 4017 vmulev_vmaddod_lsx \in1, \in7, vr21, vr20, vr16, vr17, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx \in1, \in7, vr20, vr21, vr18, vr19, \sz vssrarni.h.w vr17, vr16, 12 // t7a vssrarni.h.w vr19, vr18, 12 // t4a vldrepl.h vr20, t0, 12 // 3406 vldrepl.h vr21, t0, 14 // 2276 vmulev_vmaddod_lsx \in5, \in3, vr21, vr20, \in1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx \in5, \in3, vr20, vr21, \in7, vr18, \sz vssrarni.h.w vr16, \in1, 12 // t6a vssrarni.h.w vr18, \in7, 12 // t5a vssub.h \in7, vr19, vr18 // t5a vsadd.h vr18, vr19, vr18 // t4 vssub.h \in5, vr17, vr16 // t6a vsadd.h vr16, vr17, vr16 // t7 vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx \in5, \in7, vr20, vr20, \in1, vr17, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx \in5, \in7, vr20, vr21, vr23, vr19, \sz vssrarni.h.w vr17, \in1, 12 // t6 vssrarni.h.w vr19, vr23, 12 // t5 vssub.h \in7, \in0, vr16 //c[7] vsadd.h \in0, \in0, vr16 //c[0] vssub.h \in5, \in4, vr19 //c[5] vsadd.h vr23, \in4, vr19 //c[2] vssub.h \in4, \in6, vr18 //c[4] vsadd.h \in3, \in6, vr18 //c[3] vssub.h \in6, \in2, vr17 //c[6] vsadd.h \in1, \in2, vr17 //c[1] vor.v \in2, vr23, vr23 .endm functionl inv_dct_8h_x8_lsx inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h endfuncl functionl inv_dct_4h_x8_lsx inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h endfuncl .macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu 
vr0, \in0, 0 vsllwil.hu.bu vr1, \in1, 0 vsllwil.hu.bu vr2, \in2, 0 vsllwil.hu.bu vr3, \in3, 0 vadd.h vr0, \in4, vr0 vadd.h vr1, \in5, vr1 vadd.h vr2, \in6, vr2 vadd.h vr3, \in7, vr3 vssrani.bu.h vr1, vr0, 0 vssrani.bu.h vr3, vr2, 0 vstelm.d vr1, a0, 0, 0 vstelmx.d vr1, a0, a1, 1 vstelmx.d vr3, a0, a1, 0 vstelmx.d vr3, a0, a1, 1 .endm .macro VLD_DST_ADD_W8 in0, in1, in2, in3 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 .endm functionl inv_identity_8h_x8_lsx .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsadd.h \i, \i, \i .endr endfuncl functionl inv_identity_4h_x8_lsx .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsadd.h \i, \i, \i .endr endfuncl .macro def_fn_8x8_base variant functionl inv_txfm_\variant\()add_8x8_lsx vxor.v vr23, vr23, vr23 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .irp i, 0, 16, 32, 48, 64, 80, 96, 112 vst vr23, a2, \i .endr .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out b .itx_8x8_epilog .else move t6, ra jirl ra, t7, 0 move ra, t6 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 1 .endr .itx_8x8_epilog: LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 add.d a0, a0, a1 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 .endif endfuncl .endm def_fn_8x8_base identity_ def_fn_8x8_base .macro fn8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx .ifc \txfm1\()_\txfm2, dct_dct bnez a3, .NO_HAS_DCONLY_8x8 idct_dc 8, 8, 1 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 b .\txfm1\()_\txfm2\()_8X8_END 
.NO_HAS_DCONLY_8x8: .endif la.local t8, inv_\txfm2\()_8h_x8_lsx .ifc \txfm1, identity b inv_txfm_identity_add_8x8_lsx .else la.local t7, inv_\txfm1\()_8h_x8_lsx b inv_txfm_add_8x8_lsx .endif .\txfm1\()_\txfm2\()_8X8_END: endfunc .endm fn8x8 dct, dct fn8x8 identity, identity fn8x8 dct, adst fn8x8 dct, flipadst fn8x8 dct, identity fn8x8 adst, dct fn8x8 adst, adst fn8x8 adst, flipadst fn8x8 flipadst, dct fn8x8 flipadst, adst fn8x8 flipadst, flipadst fn8x8 identity, dct fn8x8 adst, identity fn8x8 flipadst, identity fn8x8 identity, adst fn8x8 identity, flipadst .macro rect2_lsx in0, in1, out0 vsllwil.w.h vr22, \in0, 0 // in1 vexth.w.h \in0, \in0 // in1 vmul.w vr22, vr22, \in1 vmul.w \out0, \in0, \in1 vssrarni.h.w \out0, vr22, 12 .endm .macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5 vilvl.h \tmp0, \in1, \in0 vilvl.h \tmp1, \in3, \in2 vilvl.w \tmp2, \tmp1, \tmp0 vilvh.w \tmp3, \tmp1, \tmp0 vilvl.h \tmp0, \in5, \in4 vilvl.h \tmp1, \in7, \in6 vilvl.w \tmp4, \tmp1, \tmp0 vilvh.w \tmp5, \tmp1, \tmp0 vilvl.d \out0, \tmp4, \tmp2 vilvh.d \out1, \tmp4, \tmp2 vilvl.d \out2, \tmp5, \tmp3 vilvh.d \out3, \tmp5, \tmp3 .endm functionl inv_txfm_add_8x4_lsx vxor.v vr23, vr23, vr23 vld vr0, a2, 0 vld vr2, a2, 16 vld vr4, a2, 32 vld vr6, a2, 48 .irp i, 0, 16, 32, 48 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 rect2_lsx vr0, vr23, vr0 rect2_lsx vr2, vr23, vr2 rect2_lsx vr4, vr23, vr4 rect2_lsx vr6, vr23, vr6 vilvh.d vr1, vr0, vr0 vilvh.d vr3, vr2, vr2 vilvh.d vr5, vr4, vr4 vilvh.d vr7, vr6, vr6 move t6, ra jirl ra, t7, 0 move ra, t6 LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \ vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 endfuncl .macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \ out5, out6, 
out7, tmp0, tmp1, tmp2, tmp3
    vilvl.h         \tmp0, \in1, \in0
    vilvl.h         \tmp1, \in3, \in2
    vilvh.h         \tmp2, \in1, \in0
    vilvh.h         \tmp3, \in3, \in2
    vilvl.w         \out0, \tmp1, \tmp0
    vilvh.w         \out2, \tmp1, \tmp0
    vilvl.w         \out4, \tmp3, \tmp2
    vilvh.w         \out6, \tmp3, \tmp2
    // Odd outputs are the upper 8 bytes of the even ones ...
    vbsrl.v         \out1, \out0, 8
    vbsrl.v         \out3, \out2, 8
    vbsrl.v         \out5, \out4, 8
    vbsrl.v         \out7, \out6, 8
    // ... and the even outputs get their upper 64 bits cleared.
    vinsgr2vr.d     \out0, zero, 1
    vinsgr2vr.d     \out2, zero, 1
    vinsgr2vr.d     \out4, zero, 1
    vinsgr2vr.d     \out6, zero, 1
.endm

// Shared 4x8 body.
// In: a0 = dst, a1 = stride, a2 = coefficient buffer (64 bytes, zeroed
// here), t7 / t8 = addresses of the first / second 1-D transform.
// Coefficients are pre-scaled by 2896 with a rounding >>12 (rect2_lsx)
// before the first pass. ra is parked in t6 around each jirl call.
functionl inv_txfm_add_4x8_lsx
    vxor.v          vr23, vr23, vr23
    vld             vr0, a2, 0
    vld             vr1, a2, 16
    vld             vr2, a2, 32
    vld             vr3, a2, 48
.irp i, 0, 16, 32, 48
    vst             vr23, a2, \i
.endr

    li.w            t0, 2896
    vreplgr2vr.w    vr23, t0
    rect2_lsx       vr0, vr23, vr0
    rect2_lsx       vr1, vr23, vr1
    rect2_lsx       vr2, vr23, vr2
    rect2_lsx       vr3, vr23, vr3

    move            t6, ra
    jirl            ra, t7, 0           // first pass
    move            ra, t6

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    move            t6, ra
    jirl            ra, t8, 0           // second pass
    move            ra, t6

    // Pack two 4-pixel rows per vector, then apply the final rounding >>4.
    vilvl.d         vr0, vr1, vr0
    vilvl.d         vr1, vr3, vr2
    vilvl.d         vr2, vr5, vr4
    vilvl.d         vr3, vr7, vr6
    vsrari_h_x4     vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4

    alsl.d          t2, a1, a0, 1
    VLD_DST_ADD_W4  vr16, vr17
    add.d           a0, a1, a0
    alsl.d          t2, a1, a0, 1
    VLD_DST_ADD_W4  vr18, vr19
endfuncl

// Emits inv_txfm_add_<txfm1>_<txfm2>_8x4_8bpc_lsx.
// In: a0 = dst, a1 = stride, a2 = coefficients, a3 = eob.
// For dct_dct with eob == 0 only the DC coefficient is processed.
.macro fn8x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx
    // The "\()" separator is required here: without the backslash the
    // string expands to "dct()_dct", never equals "dct_dct", and the
    // DC-only path is silently dropped.
.ifc \txfm1\()_\txfm2, dct_dct
    bnez            a3, .NO_HAS_DCONLY_8x4
    idct_dc         8, 4, 0
    // idct_dc leaves the replicated residual in vr20.
    DST_ADD_W8      vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
    b               .\txfm1\()_\txfm2\()_8X4_END
.NO_HAS_DCONLY_8x4:
.endif
    la.local        t7, inv_\txfm1\()_4h_x8_lsx
    la.local        t8, inv_\txfm2\()_8h_x4_lsx
    b               inv_txfm_add_8x4_lsx
.\txfm1\()_\txfm2\()_8X4_END:
endfunc
.endm

fn8x4 dct, dct
fn8x4 identity, identity
fn8x4 dct, adst
fn8x4 dct, flipadst
fn8x4 dct, identity
fn8x4 adst, dct
fn8x4 adst, adst
fn8x4 adst, flipadst
fn8x4 flipadst, dct
fn8x4 flipadst, adst
fn8x4 flipadst, flipadst
fn8x4 identity, dct
fn8x4 adst, identity
fn8x4 flipadst, identity
fn8x4 identity, adst
fn8x4 identity, flipadst

// Emits inv_txfm_add_<txfm1>_<txfm2>_4x8_8bpc_lsx.
// In: a0 = dst, a1 = stride, a2 = coefficients, a3 = eob.
// For dct_dct with eob == 0 only the DC coefficient is processed.
.macro fn4x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx
    // See fn8x4: the "\()" separator is needed for the comparison to match.
.ifc \txfm1\()_\txfm2, dct_dct
    bnez            a3, .NO_HAS_DCONLY_4x8
    idct_dc         4, 8, 0
    DST_ADD_W4      vr10, vr11, vr12, vr13, vr20, vr20
    add.d           a0, a0, a1
    alsl.d          t2, a1, a0, 1
    // Rows 4..7 take the same residual, which is still in vr20.
    VLD_DST_ADD_W4  vr20, vr20
    b               .\txfm1\()_\txfm2\()_4X8_END
.NO_HAS_DCONLY_4x8:
.endif
    la.local        t7, inv_\txfm1\()_8h_x4_lsx
    la.local        t8, inv_\txfm2\()_4h_x8_lsx
    b               inv_txfm_add_4x8_lsx
.\txfm1\()_\txfm2\()_4X8_END:
endfunc
.endm

fn4x8 dct, dct
fn4x8 identity, identity
fn4x8 dct, adst
fn4x8 dct, flipadst
fn4x8 dct, identity
fn4x8 adst, dct
fn4x8 adst, adst
fn4x8 adst, flipadst
fn4x8 flipadst, dct
fn4x8 flipadst, adst
fn4x8 flipadst, flipadst
fn4x8 identity, dct
fn4x8 adst, identity
fn4x8 flipadst, identity
fn4x8 identity, adst
fn4x8 identity, flipadst

// out0 = sat(in3 + narrow((in0 * in2 + 2048) >> 12))
// out1 = sat(in4 + narrow((in1 * in2 + 2048) >> 12))
// in0 / in1 hold 16-bit lanes, in2 holds 32-bit multipliers.
// Clobbers: vr4-vr7.
.macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
    vsllwil.w.h     vr4, \in0, 0
    vexth.w.h       vr5, \in0
    vsllwil.w.h     vr6, \in1, 0
    vexth.w.h       vr7, \in1
    vmul.w          vr4, vr4, \in2
    vmul.w          vr5, vr5, \in2
    vmul.w          vr6, vr6, \in2
    vmul.w          vr7, vr7, \in2
    vssrarni.h.w    vr5, vr4, 12
    vssrarni.h.w    vr7, vr6, 12
    vsadd.h         \out0, vr5, \in3
    vsadd.h         \out1, vr7, \in4
.endm

// 32-bit multiply-accumulate of two 16-bit vectors:
//   out0 = lo(in0) * in2 + lo(in1) * in3
//   out1 = hi(in0) * in2 + hi(in1) * in3
// Clobbers: vr22, vr23.
.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h     vr22, \in0, 0
    vexth.w.h       vr23, \in0
    vmul.w          \out0, vr22, \in2
    vmul.w          \out1, vr23, \in2
    vsllwil.w.h     vr22, \in1, 0
    vexth.w.h       vr23, \in1
    vmadd.w         \out0, vr22, \in3
    vmadd.w         \out1, vr23, \in3
.endm

// Same as vmul_vmadd_w but subtracting the second product:
//   out0 = lo(in0) * in2 - lo(in1) * in3
//   out1 = hi(in0) * in2 - hi(in1) * in3
// Clobbers: vr22, vr23.
.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h     vr22, \in0, 0
    vexth.w.h       vr23, \in0
    vmul.w          \out0, vr22, \in2
    vmul.w          \out1, vr23, \in2
    vsllwil.w.h     vr22, \in1, 0
    vexth.w.h       vr23, \in1
    vmsub.w         \out0, vr22, \in3
    vmsub.w         \out1, vr23, \in3
.endm

.macro inv_dct16_lsx sz
    // Even-indexed inputs go through the 8-point DCT first.
    inv_dct8_lsx    vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz

    la.local        t0, idct_coeffs_h

    vldrepl.h       vr20, t0, 16        // 401
    vldrepl.h       vr21, t0, 18        // 4076
    vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz
    vneg.h          vr21, vr21
    vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz
    vssrarni.h.w    vr17, vr16, 12      // t15a
    vssrarni.h.w    vr19, vr18, 12      // t8a

    vldrepl.h
vr20, t0, 20 // 3166 -> 1583 vldrepl.h vr21, t0, 22 // 2598 -> 1299 vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz vssrarni.h.w vr16, vr1, 12 // t14a vssrarni.h.w vr18, vr15, 12 // t9a vldrepl.h vr20, t0, 24 // 1931 vldrepl.h vr21, t0, 26 // 3612 vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz vssrarni.h.w vr1, vr7, 12 // t13a vssrarni.h.w vr15, vr9, 12 // t10a vldrepl.h vr20, t0, 28 // 3920 vldrepl.h vr21, t0, 30 // 1189 vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz vssrarni.h.w vr7, vr5, 12 // t12a vssrarni.h.w vr9, vr11, 12 // t11a vsadd.h vr5, vr19, vr18 // t8 vssub.h vr11, vr19, vr18 // t9 vssub.h vr3, vr9, vr15 // t10 vsadd.h vr13, vr9, vr15 // t11 vsadd.h vr18, vr7, vr1 // t12 vssub.h vr19, vr7, vr1 // t13 vssub.h vr9, vr17, vr16 // t14 vsadd.h vr15, vr17, vr16 // t15 vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz vssrarni.h.w vr16, vr1, 12 // t14a vssrarni.h.w vr17, vr7, 12 // t9a vneg.h vr21, vr21 vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz vneg.w vr1, vr1 vneg.w vr9, vr9 vssrarni.h.w vr7, vr11, 12 // t13a vssrarni.h.w vr1, vr9, 12 // t10a vsadd.h vr9, vr5, vr13 // t8a vssub.h vr11, vr5, vr13 // t11a vssub.h vr3, vr15, vr18 // t12a vsadd.h vr19, vr15, vr18 // t15a vsadd.h vr5, vr17, vr1 // t9 vssub.h vr13, vr17, vr1 // t10 vssub.h vr15, vr16, vr7 // t13 vsadd.h vr18, vr16, vr7 // t14 vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz vssrarni.h.w vr7, vr1, 12 // t13a 
vssrarni.h.w vr16, vr17, 12 // t10a vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz vssrarni.h.w vr23, vr13, 12 // t12 vssrarni.h.w vr17, vr15, 12 // t11 vssub.h vr15, vr0, vr19 // c[15] vsadd.h vr0, vr0, vr19 // c[0] vsadd.h vr1, vr2, vr18 // c[1] vssub.h vr20, vr2, vr18 // c[14] vsadd.h vr2, vr4, vr7 // c[2] vssub.h vr13, vr4, vr7 // c[13] vsadd.h vr3, vr6, vr23 // c[3] vssub.h vr21, vr6, vr23 // c[12] vsadd.h vr4, vr8, vr17 // c[4] vssub.h vr11, vr8, vr17 // c[11] vsadd.h vr7, vr14, vr9 // c[7] vssub.h vr8, vr14, vr9 // c[8] vsadd.h vr6, vr12, vr5 // c[6] vssub.h vr9, vr12, vr5 // c[9] vsadd.h vr5, vr10, vr16 // c[5] vssub.h vr10, vr10, vr16 // c[10] vor.v vr14, vr20, vr20 vor.v vr12, vr21, vr21 .endm functionl inv_dct_8h_x16_lsx inv_dct16_lsx .8h endfuncl functionl inv_dct_4h_x16_lsx inv_dct16_lsx .4h endfuncl .macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6 ,in7 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in0, \in1 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in2, \in3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in4, \in5 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W4 \in6, \in7 .endm .macro def_fn_4x16_base txfm functionl inv_txfm_\txfm\()add_4x16_lsx PUSH_REG blt a3, t5, 416f vld vr0, a2, 16 vld vr1, a2, 48 vld vr2, a2, 80 vld vr3, a2, 112 vxor.v vr23, vr23, vr23 .irp i, 16, 48, 80, 112 vst vr23, a2, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm, identity_ vsrari.h vr0, vr0, 1 vsrari.h vr1, vr1, 1 vsrari.h vr2, vr2, 1 vsrari.h vr3, vr3, 1 .endif LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \ vr27, vr14, vr28, vr10, vr11, vr12, vr13 416: bge a3, t5, 416416f .irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28 vxor.v \i, \i, \i .endr 416416: vld vr0, a2, 0 vld vr1, a2, 32 vld vr2, a2, 64 vld vr3, a2, 96 vxor.v vr23, vr23, vr23 .irp i, 0, 32, 64, 96 vst vr23, a2, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm, 
identity_
    vsrari.h        vr0, vr0, 1
    vsrari.h        vr1, vr1, 1
    vsrari.h        vr2, vr2, 1
    vsrari.h        vr3, vr3, 1
.endif

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    // Move the rows kept in vr24..vr28 by the other half's transpose into
    // vr10..vr13 and vr15, next to vr8, vr9 and vr14, for the 16-point pass.
    vor.v           vr10, vr24, vr24
    vor.v           vr11, vr25, vr25
    vor.v           vr12, vr26, vr26
    vor.v           vr13, vr27, vr27
    vor.v           vr15, vr28, vr28

    move            t6, ra
    jirl            ra, t8, 0           // second pass
    move            ra, t6

    // Pack two 4-pixel rows per vector, then apply the final rounding >>4.
    vilvl.d         vr16, vr1, vr0
    vilvl.d         vr17, vr3, vr2
    vilvl.d         vr18, vr5, vr4
    vilvl.d         vr19, vr7, vr6
    vilvl.d         vr20, vr9, vr8
    vilvl.d         vr21, vr11, vr10
    vilvl.d         vr22, vr13, vr12
    vilvl.d         vr23, vr15, vr14
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari.h        \i, \i, 4
.endr

    VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    POP_REG
endfuncl
.endm

def_fn_4x16_base identity_
def_fn_4x16_base

// Emits inv_txfm_add_<txfm1>_<txfm2>_4x16_8bpc_lsx.
// In: a0 = dst, a1 = stride, a2 = coefficients, a3 = eob.
// eob_half is handed to the shared body in t5, which compares it with a3
// to decide whether the second half of the coefficients is processed.
// For dct_dct with eob == 0 only the DC coefficient is processed.
.macro fn4x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx
    // The "\()" separator is required here: without the backslash the
    // string expands to "dct()_dct", never equals "dct_dct", and the
    // DC-only path is silently dropped.
.ifc \txfm1\()_\txfm2, dct_dct
    bnez            a3, .NO_HAS_DCONLY_4x16
    idct_dc         4, 16, 1
    // idct_dc leaves the replicated residual in vr20.
    DST_ADD_W4      vr10, vr11, vr12, vr13, vr20, vr20
.rept 3
    add.d           a0, a1, a0
    alsl.d          t2, a1, a0, 1
    VLD_DST_ADD_W4  vr20, vr20
.endr
    b               .\txfm1\()_\txfm2\()_4X16_END
.NO_HAS_DCONLY_4x16:
.endif
    li.w            t5, \eob_half
    la.local        t8, inv_\txfm2\()_4h_x16_lsx
.ifc \txfm1, identity
    // The identity first pass uses the "_lsx1" variant of the transform.
    la.local        t7, inv_\txfm1\()_8h_x4_lsx1
    b               inv_txfm_identity_add_4x16_lsx
.else
    la.local        t7, inv_\txfm1\()_8h_x4_lsx
    b               inv_txfm_add_4x16_lsx
.endif
.\txfm1\()_\txfm2\()_4X16_END:
endfunc
.endm

fn4x16 dct, dct, 29
fn4x16 identity, identity, 29
fn4x16 dct, adst, 29
fn4x16 dct, flipadst, 29
fn4x16 dct, identity, 8
fn4x16 adst, dct, 29
fn4x16 adst, adst, 29
fn4x16 adst, flipadst, 29
fn4x16 flipadst, dct, 29
fn4x16 flipadst, adst, 29
fn4x16 flipadst, flipadst, 29
fn4x16 identity, dct, 32
fn4x16 adst, identity, 8
fn4x16 flipadst, identity, 8
fn4x16 identity, adst, 32
fn4x16 identity, flipadst, 32

.macro inv_identity16_lsx in0, in1, in2, out0, sz
.ifc \sz, .8h
    vsllwil.w.h     vr16, \in0, 0
    vexth.w.h       vr17, \in0
    vmul.w          vr16, vr16, \in1
    vmul.w          vr17, vr17, \in1
    vsadd.h         \in2, \in2, \in2
vssrarni.h.w vr17, vr16, 11 vsadd.h \out0, vr17, \in2 .else vsllwil.w.h vr16, \in0, 0 vmul.w vr16, vr16, \in1 vsadd.h \in2, \in2, \in2 vssrarni.h.w vr16, vr16, 11 vsadd.h \out0, vr16, \in2 .endif .endm .macro inv_identity16_lsx1 in0, in1, in2, out0 vsllwil.w.h vr16, \in0, 0 vexth.w.h vr17, \in1 vmul.w vr18, vr16, \in2 vmul.w vr19, vr17, \in2 vsrari.w vr18, vr18, 11 vsrari.w vr19, vr19, 11 vslli.w vr16, vr16, 1 vslli.w vr17, vr17, 1 vadd.w vr16, vr18, vr16 vadd.w \out0, vr19, vr17 vssrarni.h.w \out0, vr16, 1 .endm functionl inv_identity_8h_x16_lsx li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx \i, vr20, \i, \i, .8h .endr endfuncl functionl inv_identity_4h_x16_lsx li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx \i, vr20, \i, \i, .4h .endr endfuncl functionl inv_identity_8h_x16_lsx1 li.w t0, 1697 vreplgr2vr.w vr20, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_identity16_lsx1 \i, \i, vr20, \i .endr endfuncl const iadst16_coeffs_h, align=4 .short 4091, 201, 3973, 995 .short 3703, 1751, 3290, 2440 .short 2751, 3035, 2106, 3513 .short 1380, 3857, 601, 4052 endconst .macro inv_adst16_lsx txfm, sz la.local t0, iadst16_coeffs_h vldrepl.h vr20, t0, 0 // 4091 vldrepl.h vr21, t0, 2 // 201 vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz vssrarni.h.w vr18, vr16, 12 // t0 vssrarni.h.w vr19, vr17, 12 // t1 vldrepl.h vr20, t0, 4 // 3973 vldrepl.h vr21, t0, 6 // 995 vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz vssrarni.h.w vr0, vr16, 12 // t2 vssrarni.h.w vr15, vr17, 12 // t3 vldrepl.h vr20, t0, 8 // 3703 vldrepl.h vr21, t0, 10 // 1751 vmulev_vmaddod_lsx 
vr11, vr4, vr20, vr21, vr16, vr2, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz vssrarni.h.w vr2, vr16, 12 // t4 vssrarni.h.w vr13, vr17, 12 // t5 vldrepl.h vr20, t0, 12 // 3290 -> 1645 vldrepl.h vr21, t0, 14 // 2440 -> 1220 vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz vssrarni.h.w vr4, vr16, 12 // t6 vssrarni.h.w vr11, vr17, 12 // t7 vldrepl.h vr20, t0, 16 // 2751 vldrepl.h vr21, t0, 18 // 3035 vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz vssrarni.h.w vr6, vr16, 12 // t8 vssrarni.h.w vr9, vr17, 12 // t9 vldrepl.h vr20, t0, 20 // 2106 vldrepl.h vr21, t0, 22 // 3513 vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz vssrarni.h.w vr7, vr16, 12 // t10 vssrarni.h.w vr8, vr17, 12 // t11 vldrepl.h vr20, t0, 24 // 1380 vldrepl.h vr21, t0, 26 // 3857 vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz vssrarni.h.w vr5, vr16, 12 // t12 vssrarni.h.w vr10, vr17, 12 // t13 vldrepl.h vr20, t0, 28 // 601 vldrepl.h vr21, t0, 30 // 4052 vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz vssrarni.h.w vr3, vr16, 12 // t14 vssrarni.h.w vr12, vr17, 12 // t15 vsadd.h vr1, vr18, vr6 // t0a vssub.h vr14, vr18, vr6 // t8a vsadd.h vr16, vr19, vr9 // t1a vssub.h vr17, vr19, vr9 // t9a vsadd.h vr6, vr0, vr7 // t2a vssub.h vr18, vr0, vr7 // t10a vsadd.h vr9, vr15, vr8 // t3a vssub.h vr19, vr15, vr8 // t11a vsadd.h vr0, vr2, vr5 // t4a vssub.h vr7, vr2, vr5 // t12a vsadd.h vr8, vr13, vr10 // t5a vssub.h vr15, vr13, vr10 // t13a vsadd.h vr2, vr4, vr3 // t6a vssub.h vr5, vr4, vr3 // t14a vsadd.h vr10, vr11, vr12 // t7a vssub.h vr13, vr11, vr12 // t15a 
la.local t0, idct_coeffs_h vldrepl.h vr20, t0, 8 // 799 vldrepl.h vr21, t0, 10 // 4017 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz vssrarni.h.w vr11, vr3, 12 // t8 vssrarni.h.w vr12, vr4, 12 // t9 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz vssrarni.h.w vr14, vr3, 12 // t13 vssrarni.h.w vr17, vr4, 12 // t12 vldrepl.h vr20, t0, 12 // 3406 vldrepl.h vr21, t0, 14 // 2276 vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz vssrarni.h.w vr7, vr3, 12 // t10 vssrarni.h.w vr15, vr4, 12 // t11 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz vssrarni.h.w vr18, vr3, 12 // t15 vssrarni.h.w vr19, vr4, 12 // t14 vsadd.h vr5, vr1, vr0 // t0 vssub.h vr13, vr1, vr0 // t4 vsadd.h vr3, vr16, vr8 // t1 vssub.h vr4, vr16, vr8 // t5 vsadd.h vr0, vr6, vr2 // t2 vssub.h vr1, vr6, vr2 // t6 vsadd.h vr8, vr9, vr10 // t3 vssub.h vr16, vr9, vr10 // t7 vsadd.h vr2, vr11, vr17 // t8a vssub.h vr6, vr11, vr17 // t12a vsadd.h vr9, vr12, vr14 // t9a vssub.h vr10, vr12, vr14 // t13a vsadd.h vr11, vr7, vr19 // t10a vssub.h vr17, vr7, vr19 // t14a vsadd.h vr12, vr15, vr18 // t11a vssub.h vr14, vr15, vr18 // t15a vldrepl.h vr20, t0, 4 // 1567 vldrepl.h vr21, t0, 6 // 3784 vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz vssrarni.h.w vr18, vr7, 12 // t4a vssrarni.h.w vr19, vr15, 12 // t5a vneg.h vr21, vr21 vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz vssrarni.h.w vr4, vr7, 12 // t7a vssrarni.h.w vr13, vr15, 12 // t6a vneg.h vr20, vr20 vmulev_vmaddod_lsx vr6, vr10, 
vr21, vr20, vr7, vr1, \sz vneg.h vr21, vr21 vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz vssrarni.h.w vr1, vr7, 12 // t12 vssrarni.h.w vr16, vr15, 12 // t13 vneg.h vr21, vr21 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz vneg.h vr20, vr20 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz vssrarni.h.w vr6, vr7, 12 // t15 vssrarni.h.w vr10, vr15, 12 // t14 vssub.h vr17, vr5, vr0 // t2a vsadd.h vr14, vr5, vr0 // out[0] vssub.h vr7, vr3, vr8 // t3a vsadd.h vr15, vr3, vr8 // out[15] vsllwil.w.h vr22, vr15, 0 vexth.w.h vr15, vr15 vneg.w vr22, vr22 vneg.w vr15, vr15 vssrarni.h.w vr15, vr22, 0 // out[15] vsadd.h vr3, vr19, vr4 // out[12] vssub.h vr8, vr19, vr4 // t7 vssub.h vr0, vr18, vr13 // t6 vsadd.h vr5, vr18, vr13 // out[3] vsllwil.w.h vr22, vr5, 0 vexth.w.h vr5, vr5 vneg.w vr22, vr22 vneg.w vr5, vr5 vssrarni.h.w vr5, vr22, 0 // out[3] vsadd.h vr13, vr9, vr12 // out[14] vssub.h vr19, vr9, vr12 // t11 vssub.h vr4, vr2, vr11 // t10 vsadd.h vr18, vr2, vr11 // out[1] vsllwil.w.h vr22, vr18, 0 vexth.w.h vr18, vr18 vneg.w vr22, vr22 vneg.w vr18, vr18 vssrarni.h.w vr18, vr22, 0 // out[1] vsadd.h vr2, vr1, vr10 // out[2] vssub.h vr11, vr1, vr10 // t14a vssub.h vr12, vr16, vr6 // t15a vsadd.h vr9, vr16, vr6 // out[13] vsllwil.w.h vr22, vr9, 0 vexth.w.h vr9, vr9 vneg.w vr22, vr22 vneg.w vr9, vr9 vssrarni.h.w vr9, vr22, 0 // out[13] vldrepl.h vr20, t0, 0 // 2896 vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz vneg.h vr21, vr20 vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz vssrarni.h.w vr1, vr16, 12 // out[8] vsrari.w vr6, vr6, 12 vsrari.w vr10, vr10, 12 vneg.w vr6, vr6 vneg.w vr10, vr10 vssrarni.h.w vr10, vr6, 0 // out[7] vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz vssrarni.h.w vr7, vr6, 12 // out[4] vsrari.w vr16, vr16, 12 vsrari.w vr17, vr17, 12 vneg.w vr16, vr16 vneg.w vr17, vr17 vssrarni.h.w vr17, vr16, 0 // out[11] vmulev_vmaddod_lsx vr4, vr19, vr20, 
vr21, vr16, vr0, \sz vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz vssrarni.h.w vr8, vr6, 12 // out[6] vsrari.w vr16, vr16, 12 vsrari.w vr0, vr0, 12 vneg.w vr16, vr16 vneg.w vr0, vr0 vssrarni.h.w vr0, vr16, 0 // out[9] vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz vssrarni.h.w vr19, vr16, 12 // out[10] vsrari.w vr6, vr6, 12 vsrari.w vr4, vr4, 12 vneg.w vr6, vr6 vneg.w vr4, vr4 vssrarni.h.w vr4, vr6, 0 // out[5] .ifc \txfm, adst vor.v vr12, vr3, vr3 vor.v vr3, vr5, vr5 vor.v vr5, vr4, vr4 vor.v vr4, vr7, vr7 vor.v vr7, vr10, vr10 vor.v vr10, vr19, vr19 vor.v vr6, vr8, vr8 vor.v vr8, vr1, vr1 vor.v vr11, vr17, vr17 vor.v vr20, vr13, vr13 vor.v vr13, vr9, vr9 vor.v vr9, vr0, vr0 vor.v vr0, vr14, vr14 vor.v vr14, vr20, vr20 vor.v vr1, vr18, vr18 .else vor.v vr6, vr0, vr0 vor.v vr0, vr15, vr15 vor.v vr15, vr14, vr14 vor.v vr14, vr18, vr18 vor.v vr11, vr7, vr7 vor.v vr7, vr1, vr1 vor.v vr1, vr13, vr13 vor.v vr13, vr2, vr2 vor.v vr2, vr9, vr9 vor.v vr9, vr8, vr8 vor.v vr8, vr10, vr10 vor.v vr10, vr4, vr4 vor.v vr4, vr17, vr17 vor.v vr12, vr5, vr5 vor.v vr5, vr19, vr19 .endif .endm // inv_adst16_lsx functionl inv_adst_8h_x16_lsx inv_adst16_lsx adst, 8h endfuncl functionl inv_flipadst_8h_x16_lsx inv_adst16_lsx flipadst, 8h endfuncl functionl inv_adst_4h_x16_lsx inv_adst16_lsx adst, 4h endfuncl functionl inv_flipadst_4h_x16_lsx inv_adst16_lsx flipadst, 4h endfuncl .macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \ in9, in10, in11, in12, in13, in14, in15 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in0, \in1, \in2, \in3 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in4, \in5, \in6, \in7 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in8, \in9, \in10, \in11 add.d a0, a1, a0 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W8 \in12, \in13, \in14, \in15 .endm .macro def_base_8x16 txfm1 functionl inv_txfm_\txfm1\()add_8x16_lsx blt a3, t5, 816f vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, 
vr5, vr6, vr7 vxor.v vr23, vr23, vr23 .irp i, 16, 48, 80, 112, 144, 176, 208, 240 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .ifc \txfm1, identity_ LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 .else move t6, ra jirl ra, t7, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 .endif 816: bge a3, t5, 816816f .irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v \i, \i, \i .endr 816816: vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vxor.v vr23, vr23, vr23 .irp i, 0, 32, 64, 96, 128, 160, 192, 224 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .ifc \txfm1, identity_ .else move t6, ra jirl ra, t7, 0 move ra, t6 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vsrari.h \i, \i, 1 .endr .endif LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vor.v vr0, vr0, vr0 vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 endfuncl .endm def_base_8x16 identity_ def_base_8x16 .macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 vsllwil.hu.bu vr4, \in0, 0 vexth.hu.bu vr0, \in0 vsllwil.hu.bu vr5, \in1, 0 vexth.hu.bu vr1, \in1 vsllwil.hu.bu 
vr6, \in2, 0 vexth.hu.bu vr2, \in2 vsllwil.hu.bu vr7, \in3, 0 vexth.hu.bu vr3, \in3 vadd.h vr4, vr4, \in4 vadd.h vr0, vr0, \in5 vadd.h vr5, vr5, \in6 vadd.h vr1, vr1, \in7 vadd.h vr6, vr6, \in8 vadd.h vr2, vr2, \in9 vadd.h vr7, vr7, \in10 vadd.h vr3, vr3, \in11 vssrani.bu.h vr0, vr4, 0 vssrani.bu.h vr1, vr5, 0 vssrani.bu.h vr2, vr6, 0 vssrani.bu.h vr3, vr7, 0 vst vr0, a0, 0 vstx vr1, a0, a1 vst vr2, t2, 0 vstx vr3, t2, a1 .endm .macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ \in4, \in5, \in6, \in7 .endm .macro def_fn_16x8 txfm1 functionl inv_txfm_\txfm1\()add_16x8_lsx PUSH_REG vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vxor.v vr23, vr23, vr23 .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \ 176, 192, 208, 224, 240 vst vr23, a2, \i .endr li.w t0, 2896 vreplgr2vr.w vr23, t0 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 rect2_lsx \i, vr23, \i .endr move t6, ra jirl ra, t7, 0 move ra, t6 .ifnc \txfm1, identity_ .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 vsrari.h \i, \i, 1 .endr .endif LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 move t6, ra jirl ra, t8, 0 move ra, t6 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 alsl.d t2, a1, a0, 1 VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11 alsl.d a0, a1, a0, 2 
// Tail of the def_fn_16x8 body: t2 = a0 + 2 * a1, then four more output
// rows (vr28..vr31 / vr12..vr15) are added to the destination.
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15
    POP_REG
endfuncl
.endm

// Instantiate the two shared 16x8 bodies:
// inv_txfm_identity_add_16x8_lsx and inv_txfm_add_16x8_lsx.
def_fn_16x8 identity_
def_fn_16x8

/*
 * fun16x8 txfm1, txfm2
 *
 * Emits the entry point inv_txfm_add_<txfm1>_<txfm2>_16x8_8bpc_lsx.
 *   - dct_dct only: when a3 is zero the DC-only shortcut is taken
 *     (idct_dc, then vr20 is added to two groups of four rows).
 *     NOTE(review): a3 is presumably the eob argument and vr20 the
 *     broadcast DC term left by idct_dc; confirm against idct_dc.
 *   - otherwise t7 receives the address of the 16-point routine for txfm1
 *     (identity uses inv_identity_8h_x16_lsx1), t8 the address of the
 *     8-point routine for txfm2, and control branches to the shared body.
 */
.macro fun16x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez a3, .NO_HAS_DCONLY_16x8
    idct_dc 16, 8, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
    // advance four rows and handle the second group of four rows
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
    b .\txfm1\()_\txfm2\()_16x8_END
.NO_HAS_DCONLY_16x8:
.endif
    la.local t7, inv_\txfm1\()_8h_x16_lsx
.ifc \txfm1, identity
    // identity overrides the address loaded just above
    la.local t7, inv_identity_8h_x16_lsx1
.endif
    la.local t8, inv_\txfm2\()_8h_x8_lsx
.ifc \txfm1, identity
    b inv_txfm_identity_add_16x8_lsx
.else
    b inv_txfm_add_16x8_lsx
.endif
.\txfm1\()_\txfm2\()_16x8_END:
endfunc
.endm

fun16x8 dct, dct
fun16x8 identity, identity
fun16x8 dct, adst
fun16x8 dct, flipadst
fun16x8 dct, identity
fun16x8 adst, dct
fun16x8 adst, adst
fun16x8 adst, flipadst
fun16x8 flipadst, dct
fun16x8 flipadst, adst
fun16x8 flipadst, flipadst
fun16x8 identity, dct
fun16x8 adst, identity
fun16x8 flipadst, identity
fun16x8 identity, adst
fun16x8 identity, flipadst

/*
 * fun8x16 txfm1, txfm2, eob_half
 *
 * Emits the entry point inv_txfm_add_<txfm1>_<txfm2>_8x16_8bpc_lsx.
 *   - dct_dct only: DC-only shortcut when a3 is zero (four groups of rows).
 *   - otherwise t5 = eob_half (the threshold the shared 8x16 body compares
 *     a3 against), t7 = 8-point routine for txfm1 (not loaded for identity),
 *     t8 = 16-point routine for txfm2, then branch to the shared body.
 */
.macro fun8x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez a3, .NO_HAS_DCONLY_8x16
    idct_dc 8, 16, 1
    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
.rept 3
    add.d a0, a1, a0
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
.endr
    b .\txfm1\()_\txfm2\()_8x16_END
.NO_HAS_DCONLY_8x16:
.endif
    li.w t5, \eob_half
.ifnc \txfm1, identity
    la.local t7, inv_\txfm1\()_8h_x8_lsx
.endif
    la.local t8, inv_\txfm2\()_8h_x16_lsx
.ifc \txfm1, identity
    b inv_txfm_identity_add_8x16_lsx
.else
    b inv_txfm_add_8x16_lsx
.endif
.\txfm1\()_\txfm2\()_8x16_END:
endfunc
.endm

fun8x16 dct, dct, 43
fun8x16 identity, identity, 43
fun8x16 dct, adst, 43
fun8x16 dct, flipadst, 43
fun8x16 dct, identity, 8
fun8x16 adst, dct, 43
fun8x16 adst, adst, 43
fun8x16 adst, flipadst, 43
fun8x16 flipadst, dct, 43
fun8x16 flipadst, adst, 43
fun8x16 flipadst, flipadst, 43
fun8x16 identity, dct, 64
fun8x16 adst, identity, 8
fun8x16 flipadst, identity, 8
fun8x16 identity, adst, 64
fun8x16 identity, flipadst, 64

/*
 * inv_txfm_add_16x16_lsx: shared 16x16 body reached from the fun16x16
 * entry points.
 *
 * Registers read:
 *   a0 = destination base, a1 = byte offset between destination rows
 *   a2 = coefficient buffer (loaded, then overwritten with zeros)
 *   a3 = value compared against t5 to decide whether the second half of
 *        the first pass is needed
 *   t7 = address of the first-pass routine, t8 = second-pass routine
 *        (both called through jirl with ra preserved in t6)
 * A 512-byte scratch area is addressed from sp + 64.
 */
functionl inv_txfm_add_16x16_lsx
    malloc_space 512
    addi.d t1, sp, 64
    addi.d t2, a2, 0
    // First pass: up to two slices, each 16 vectors at a 32-byte stride.
.rept 2
    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    // clear the coefficients that were just loaded
    vxor.v vr23, vr23, vr23
.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \
        384, 416, 448, 480
    vst vr23, a2, \i
.endr
    move t6, ra
    jirl ra, t7, 0
    move ra, t6
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    // intermediate rounding shift by 2
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h \i, \i, 2
.endr
    vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d t1, t1, 256
    addi.d a2, a2, 16
    // a3 below the threshold: skip the remaining slice
    blt a3, t5, 1616f
.endr
1616:
    bge a3, t5, 16161616f
    // The second slice was skipped: zero its 256 bytes of scratch
    // (sp + 320 = sp + 64 + 256).
    addi.d t1, sp, 320
    vxor.v vr23, vr23, vr23
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240
    vst vr23, t1, \i
.endr
16161616:
    // Second pass: two slices, transformed in place in the scratch area.
    addi.d t1, sp, 64
.rept 2
    vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    move t6, ra
    jirl ra, t8, 0
    move ra, t6
    vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d t1, t1, 16
.endr
    // Final rounding shift by 4 and accumulation into the destination,
    // four rows per iteration (128 bytes of scratch each).
    alsl.d t2, a1, a0, 1
    addi.d t1, sp, 64
.rept 4
    vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
    VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    addi.d t1, t1, 128
.endr
    free_space 512
endfuncl

/*
 * fun16x16 txfm1, txfm2, eob_half
 *
 * Emits the entry point inv_txfm_add_<txfm1>_<txfm2>_16x16_8bpc_lsx.
 * dct_dct takes the DC-only shortcut when a3 is zero; every other case
 * loads t5 = eob_half, t7 / t8 = 16-point routines for txfm1 / txfm2 and
 * branches to inv_txfm_add_16x16_lsx.
 */
.macro fun16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez a3, .NO_HAS_DCONLY_16x16
    idct_dc 16, 16, 2
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
.rept 3
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .\txfm1\()_\txfm2\()_16x16_END
.NO_HAS_DCONLY_16x16:
.endif
    li.w t5, \eob_half
    la.local t7, inv_\txfm1\()_8h_x16_lsx
    la.local t8, inv_\txfm2\()_8h_x16_lsx
    b inv_txfm_add_16x16_lsx
.\txfm1\()_\txfm2\()_16x16_END:
endfunc
.endm

fun16x16 dct, dct, 36
fun16x16 adst, adst, 36
fun16x16 adst, dct, 36
fun16x16 dct, adst, 36
fun16x16 flipadst, dct, 36
fun16x16 dct, flipadst, 36
fun16x16 adst, flipadst, 36
fun16x16 flipadst, adst, 36

/*
 * dct_8x32_core_lsx
 *
 *   in1          base register for the four vst_x8 result stores
 *   in2          base register for the two vld_x8 loads further down
 *   vld_st0/1    offsets of those two loads, vld_stride their stride
 *   vst_st0..3   offsets of the four stores, vst_stride their stride
 *   transpose8x8 when non-blank, each group of eight result vectors goes
 *                through LSX_TRANSPOSE8x8_H before it is stored
 *   shift        when non-blank, results get a rounding right shift by
 *                this amount before they are stored
 *
 * The first stage reads vr0..vr7 and vr19, vr24..vr30 and forms
 * t16a..t31a with coefficient pairs taken from idct_coeffs.
 */
.macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \
                         vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \
                         transpose8x8, shift
    la.local t0, idct_coeffs
    vldrepl.w vr20, t0, 64 // 201
    vldrepl.w vr21, t0, 68 // 4091
    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
    vssrarni.h.w vr9, vr8, 12 // t31a
    vssrarni.h.w vr10, vr11, 12 // t16a
    vldrepl.w vr20, t0, 72 // 3035
    vldrepl.w vr21, t0, 76 // 2751
    vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
    vssrarni.h.w vr0, vr8, 12 // t30a
    vssrarni.h.w vr30, vr11, 12 // t17a
    vldrepl.w vr20, t0, 80 // 1751
    vldrepl.w vr21, t0, 84 // 3703
    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
    vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
    vssrarni.h.w vr7, vr8, 12 // t29a
    vssrarni.h.w vr19, vr11, 12 // t18a
    vldrepl.w vr20, t0, 88 // 3857
    vldrepl.w vr21, t0, 92 // 1380
    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
    vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
    vssrarni.h.w vr4, vr8, 12 // t28a
    vssrarni.h.w vr26, vr11, 12 // t19a
    vldrepl.w vr20, t0, 96 // 995
vldrepl.w vr21, t0, 100 // 3973 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 vssrarni.h.w vr3, vr8, 12 // t27a vssrarni.h.w vr27, vr11, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 vssrarni.h.w vr2, vr8, 12 // t26a vssrarni.h.w vr28, vr11, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 vssrarni.h.w vr5, vr8, 12 // t25a vssrarni.h.w vr25, vr11, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 vssrarni.h.w vr6, vr8, 12 // t24a vssrarni.h.w vr24, vr11, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 vssrarni.h.w vr7, vr4, 12 // t30a vssrarni.h.w vr0, vr11, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 vssrarni.h.w vr9, vr4, 12 // t18a vssrarni.h.w vr2, vr11, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 vssrarni.h.w vr29, vr4, 12 // t26a 
vssrarni.h.w vr6, vr11, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 vssrarni.h.w vr8, vr4, 12 // t22a vssrarni.h.w vr24, vr11, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 vssrarni.h.w vr5, vr3, 12 // t29a vssrarni.h.w vr2, vr11, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 vssrarni.h.w vr7, vr3, 12 // t28 vssrarni.h.w vr24, vr11, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 vssrarni.h.w vr28, vr3, 12 // t20 vssrarni.h.w vr25, vr11, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 vssrarni.h.w vr30, vr3, 12 // t21a vssrarni.h.w vr1, vr11, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, vr7, vr25 // t28a 
vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 vssrarni.h.w vr5, vr1, 12 // t20 vssrarni.h.w vr7, vr11, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 vssrarni.h.w vr25, vr1, 12 // t21a vssrarni.h.w vr6, vr11, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 vssrarni.h.w vr19, vr1, 12 // t22 vssrarni.h.w vr10, vr11, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 vssrarni.h.w vr31, vr1, 12 // t23a vssrarni.h.w vr8, vr11, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb 
\shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // c[19] vsadd.h vr14, vr16, vr4 // c[13] vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 .endif .ifnb \shift .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vsrari.h \i, \i, \shift .endr .endif vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 .endm const eob_32x32 .short 36, 136, 300, 1024 endconst const eob_8x32 .short 43, 107, 171, 256 endconst const eob_16x32 .short 36, 151, 279, 512 endconst .macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr4, vr10, 0 vsllwil.hu.bu vr5, vr11, 0 vsllwil.hu.bu vr6, vr12, 0 vsllwil.hu.bu vr7, vr13, 0 vexth.hu.bu vr10, vr10 vexth.hu.bu vr11, vr11 vexth.hu.bu vr12, vr12 vexth.hu.bu vr13, vr13 vadd.h vr4, vr4, \in0 vadd.h vr10, vr10, \in1 
// Tail of the DST_ADD_W32 body: add the remaining residual operands to the
// widened destination pixels, pack back to bytes with unsigned saturation
// and store 32 bytes to each of the two rows addressed by a0 and t2.
    vadd.h vr5, vr5, \in2
    vadd.h vr11, vr11, \in3
    vadd.h vr6, vr6, \in4
    vadd.h vr12, vr12, \in5
    vadd.h vr7, vr7, \in6
    vadd.h vr13, vr13, \in7
    vssrani.bu.h vr10, vr4, 0
    vssrani.bu.h vr11, vr5, 0
    vssrani.bu.h vr12, vr6, 0
    vssrani.bu.h vr13, vr7, 0
    vst vr10, a0, 0
    vst vr11, a0, 16
    vst vr12, t2, 0
    vst vr13, t2, 16
.endm

/*
 * idct_dc_w32 w, h, shift
 *
 * DC-only setup for 32-pixel-wide blocks.
 *   - reads the 16-bit DC coefficient at a2 and clears it in memory
 *   - scales it by 181 with a rounding shift by 8; once more when one
 *     dimension is exactly twice the other; then an optional rounding
 *     shift by the shift argument
 *   - final step: vr20 = (128 + dc * 181) rounding-shifted right by 12,
 *     narrowed to halfwords
 *   - interleaved with the arithmetic, preloads two destination rows:
 *     vr10 / vr11 from a0 and vr12 / vr13 from t2 = a0 + a1
 * Leaves vr20, vr10..vr13 and t2 ready for DST_ADD_W32.
 */
.macro idct_dc_w32 w, h, shift
    ld.h t2, a2, 0 // dc
    vldi vr0, 0x8b5 // 181
    vreplgr2vr.w vr1, t2
    vldi vr20, 0x880 // 128
    vmul.w vr2, vr0, vr1 // dc * 181
    st.h zero, a2, 0
    add.d t2, a0, a1
    vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8
    vld vr13, t2, 16
.if (2*\w == \h) || (2*\h == \w)
    vmul.w vr2, vr2, vr0
    vsrari.w vr2, vr2, 8
.endif
.if \shift>0
    vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift
.endif
    vld vr11, a0, 16
    vmadd.w vr20, vr2, vr0
    vld vr12, t2, 0
    vssrarni.h.w vr20, vr20, 12
    vld vr10, a0, 0
.endm

/*
 * inv_txfm_add_dct_dct_32x8_8bpc_lsx
 *
 * a0 = destination, a1 = row offset, a2 = coefficients, a3 = value tested
 * for the DC-only shortcut (NOTE(review): presumably eob; confirm against
 * the C prototype).
 */
function inv_txfm_add_dct_dct_32x8_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_32x8
    // DC-only: add vr20 to 8 rows, two rows per DST_ADD_W32.
    idct_dc_w32 32, 8, 2
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.rept 3
    alsl.d a0, a1, a0, 1
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_32X8_END
.NO_HAS_DCONLY_32x8:
    // t1 = result buffer at sp + 64, t3 = 256-byte staging area behind it.
    malloc_space 512+256
    addi.d t1, sp, 64
    addi.d t2, a2, 0
    addi.d t3, sp, 64
    addi.d t3, t3, 512
    // Vectors at offsets 0, 32, 64, ...: load, clear in memory, run the
    // 16-point DCT and park the result at t3.
    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    inv_dct16_lsx .8h
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    // Vectors at offsets 16, 48, 80, ...: load into the registers that
    // dct_8x32_core_lsx reads, and clear them in memory as well.
    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
    addi.d t2, sp, 64
// Remainder of inv_txfm_add_dct_dct_32x8_8bpc_lsx.
// Second pass: four groups of eight vectors (64-byte stride) go through the
// 8-point DCT, get a rounding shift by 4 and are written back in place.
.rept 4
    vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h \i, \i, 4
.endr
    vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    addi.d t2, t2, 16
.endr
    // Accumulate into the destination, two rows (128 bytes of residual)
    // per iteration.
    addi.d t0, sp, 64
.rept 4
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d a0, a1, a0, 1
    addi.d t0, t0, 128
.endr
    free_space 512+256
.DCT_DCT_32X8_END:
endfunc

/*
 * inv_txfm_add_dct_dct_32x16_8bpc_lsx
 *
 * Same register interface as the 32x8 variant. Every loaded coefficient is
 * additionally scaled through rect2_lsx with the constant 2896 before the
 * first pass.
 */
function inv_txfm_add_dct_dct_32x16_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_32x16
    // DC-only: add vr20 to 16 rows, two rows per DST_ADD_W32.
    idct_dc_w32 32, 16, 1
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.rept 7
    alsl.d a0, a1, a0, 1
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_32X16_END
.NO_HAS_DCONLY_32x16:
    malloc_space 1024+256 // 32*16*2+256
    // t1 = result buffer at sp + 64, t3 = 256-byte staging area behind it.
    addi.d t1, sp, 64
    addi.d t2, a2, 0
    addi.d t3, sp, 64
    addi.d t3, t3, 1024
    // First pass: two slices; each advances t2 by 16 and t1 by 512 bytes.
.rept 2
    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    li.w t0, 2896
    vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx \i, vr23, \i
.endr
    inv_dct16_lsx .8h
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    la.local t0, idct_coeffs
    vldrepl.w vr23, t0, 0 // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    rect2_lsx \i, vr23, \i
.endr
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1
    addi.d t2, t2, 16
    addi.d t1, t1, 512
.endr
    // Second pass: 16-point DCT in place, rounding shift by 4.
    addi.d t2, sp, 64
.rept 4
    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_dct16_lsx .8h
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h \i, \i, 4
.endr
    vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d t2, t2, 16
.endr
    // Accumulate into the destination, two rows per iteration.
    addi.d t0, sp, 64
.rept 8
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d a0, a1, a0, 1
    addi.d t0, t0, 128
.endr
    free_space 1024+256
.DCT_DCT_32X16_END:
endfunc

/*
 * inv_txfm_add_dct_dct_32x32_8bpc_lsx
 *
 * Same register interface as the 32x8 variant. The first pass walks the
 * eob_32x32 threshold table: after each 512-byte slice it continues only
 * while a3 is at least the current table entry, and slices that were not
 * produced are zero-filled before the second pass.
 */
function inv_txfm_add_dct_dct_32x32_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_32x32
    // DC-only: add vr20 to 32 rows, two rows per DST_ADD_W32.
    idct_dc_w32 32, 32, 2
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.rept 15
    alsl.d a0, a1, a0, 1
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_32X32_END
.NO_HAS_DCONLY_32x32:
    malloc_space 2560 // 32*32*2+512
    addi.d t1, sp, 64
    addi.d t2, a2, 0
    // t3 = sp + 64 + 2048: staging area behind the 2048-byte result buffer
    addi.d t3, sp, 1024
    addi.d t3, t3, 1024
    addi.d t3, t3, 64
    la.local t8, eob_32x32
.DCT_DCT_EOB_32x32:
    // t7 = threshold that decides whether another slice follows this one
    ld.h t7, t8, 0
    addi.d t8, t8, 2
    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    inv_dct16_lsx .8h
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vxor.v vr31, vr31, vr31
    vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
    addi.d t2, t2, 16
    addi.d t1, t1, 512
    bge a3, t7, .DCT_DCT_EOB_32x32
    // Zero-fill the slices that were not produced, last slice first; each
    // slice is 512 bytes, written as two 256-byte halves.
    la.local t8, eob_32x32
    vxor.v vr31, vr31, vr31
    ld.h t7, t8, 4
    bge a3, t7, .DCT_DCT_EOB_32x32_END // a3>=t7
    vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d t1, sp, 256+64
    vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 2
    bge a3, t7, .DCT_DCT_EOB_32x32_END
    vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 0
    bge a3, t7, .DCT_DCT_EOB_32x32_END
    vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
.DCT_DCT_EOB_32x32_END:
    // Second pass: four groups; no transpose, rounding shift by 4, results
    // written back into the buffer at sp + 64.
    addi.d t2, sp, 64
    addi.d t1, sp, 64
.rept 4
    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_dct16_lsx .8h
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4
    addi.d t2, t2, 16
    addi.d t1, t1, 16
.endr
    // Accumulate into the destination, two rows per iteration.
    addi.d t0, sp, 64
.rept 16
    add.d t2, a0, a1
    vld vr10, a0, 0
    vld vr11, a0, 16
    vld vr12, t2, 0
    vld vr13, t2, 16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d a0, a1, a0, 1
    addi.d t0, t0, 128
.endr
// End of inv_txfm_add_dct_dct_32x32_8bpc_lsx: release the scratch area.
    free_space 2560 // 32*32*2+512
.DCT_DCT_32X32_END:
endfunc

/*
 * dct_8x8_tx64_core_lsx
 *
 * 8-point butterfly that reads only four inputs: in0..in3. Apart from the
 * optional rect2 scaling (which touches all eight), in4..in7 are written
 * before they are read, so they serve as scratch. in0..in3 are overwritten
 * as well.
 *
 *   out0..out7  receive c[0]..c[7]
 *   rect2       pass rect2_lsx to pre-scale the inputs with 2896
 *
 * temp: vr8, vr9, vr10, vr12, vr20, vr21, vr22, vr23
 * NOTE(review): vr12 is not written directly by this body; it is listed
 * in the original temp set, presumably because callers pass it as an out
 * register; confirm.
 */
.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                             out1, out2, out3, out4, out5, out6, out7, rect2
    la.local t0, idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w vr23, t0, 0 // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif
    la.local t0, idct_coeffs
    // in2 -> t2 / t3: widen both halves to 32 bit, multiply, narrow back
    vldrepl.w vr20, t0, 8 // 1567
    vldrepl.w vr21, t0, 12 // 3784
    vsllwil.w.h vr22, \in2, 0
    vexth.w.h vr23, \in2
    vmul.w vr8, vr22, vr20
    vmul.w vr10, vr23, vr20
    vmul.w \in2, vr22, vr21
    vmul.w vr9, vr23, vr21
    vssrarni.h.w vr10, vr8, 12 // t2
    vssrarni.h.w vr9, \in2, 12 // t3
    // in0 * 2896, kept in in2 and combined with t2 / t3
    vldrepl.w vr20, t0, 0 // 2896
    vsllwil.w.h vr22, \in0, 0
    vexth.w.h vr23, \in0
    vmul.w vr8, vr22, vr20
    vmul.w \in2, vr23, vr20
    vssrarni.h.w \in2, vr8, 12
    vsadd.h vr8, \in2, vr9 // c[0]
    vssub.h vr9, \in2, vr9 // c[3]
    vsadd.h \in0, \in2, vr10 // c[1]
    vssub.h vr10, \in2, vr10 // c[2]
    // inv_dct8_1d_internal_c tx64
    // in1 in3
    vldrepl.w vr20, t0, 16 // 799
    vldrepl.w vr21, t0, 20 // 4017
    vsllwil.w.h vr22, \in1, 0
    vexth.w.h vr23, \in1
    vmul.w \in2, vr22, vr21
    vmul.w \in4, vr23, vr21
    vmul.w \in1, vr22, vr20
    vmul.w \in6, vr23, vr20
    vssrarni.h.w \in4, \in2, 12 // t7a
    vssrarni.h.w \in6, \in1, 12 // t4a
    vldrepl.w vr20, t0, 24 // 3406
    vldrepl.w vr21, t0, 28 // 2276
    vsllwil.w.h vr22, \in3, 0
    vexth.w.h vr23, \in3
    // the second coefficient is applied negated
    vneg.w vr21, vr21
    vmul.w \in2, vr22, vr20
    vmul.w \in1, vr23, vr20
    vmul.w \in3, vr22, vr21
    vmul.w \in7, vr23, vr21
    vssrarni.h.w \in1, \in2, 12 // t6a
    vssrarni.h.w \in7, \in3, 12 // t5a
    vsadd.h \in3, \in6, \in7 // t4
    vssub.h \in6, \in6, \in7 // t5a
    vsadd.h \in5, \in4, \in1 // t7
    vssub.h \in4, \in4, \in1 // t6a
    vldrepl.w vr20, t0, 0 // 2896
    vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1
    vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7
    vssrarni.h.w \in1, vr21, 12 // t6
    vssrarni.h.w \in7, \in2, 12 // t5
    // final butterflies into the caller-selected output registers
    vsadd.h \out0, vr8, \in5 // c[0]
    vssub.h \out7, vr8, \in5 // c[7]
    vsadd.h \out1, \in0, \in1 // c[1]
    vssub.h \out6, \in0, \in1 // c[6]
    vsadd.h \out2, vr10, \in7 // c[2]
    vssub.h \out5, vr10, \in7 // c[5]
    vsadd.h \out3, vr9, \in3 // c[3]
    vssub.h \out4, vr9, \in3 // c[4]
.endm

/*
 * dct_8x16_tx64_core_lsx rect2
 *
 * 16-point butterfly built on dct_8x8_tx64_core_lsx. The even half is
 * computed first from vr0, vr2, vr4, vr6 into vr11..vr18; the odd half
 * then reads only vr1, vr3, vr5, vr7.
 *
 * input: in0, in1, in2, in3, in4, in5, in6, in7 (fixed)
 *        vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
 *        in8, in9, in10, in11, in12, in13, in14, in15
 *        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
 * output: out0, out1, out2, out3, out4, out5, out6, out7 (fixed)
 *         vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
 *         out8, out9, out10, out11, out12, out13, out14, out15
 *         vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
 */
.macro dct_8x16_tx64_core_lsx rect2
    dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
                          vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2
    // in1 in3 in5 in7 in9 in11 in13 in15
    // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
    la.local t0, idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w vr23, t0, 0 // 2896
.irp i, vr1, vr3, vr5, vr7, vr24, vr26, vr28, vr30
    rect2_lsx \i, vr23, \i
.endr
.endif
    // vr1 -> t15a / t8a
    vldrepl.w vr20, t0, 32 // 401
    vldrepl.w vr21, t0, 36 // 4076
    vsllwil.w.h vr22, vr1, 0
    vexth.w.h vr23, vr1
    vmul.w vr0, vr22, vr21
    vmul.w vr10, vr23, vr21
    vmul.w vr1, vr22, vr20
    vmul.w vr29, vr23, vr20
    vssrarni.h.w vr10, vr0, 12 // t15a
    vssrarni.h.w vr29, vr1, 12 // t8a
    // vr7 -> t14a / t9a (second coefficient negated)
    vldrepl.w vr20, t0, 40 // 3166 -> 1583
    vldrepl.w vr21, t0, 44 // 2598 -> 1299
    vsllwil.w.h vr22, vr7, 0
    vexth.w.h vr23, vr7
    vneg.w vr21, vr21
    vmul.w vr0, vr22, vr20
    vmul.w vr30, vr23, vr20
    vmul.w vr7, vr22, vr21
    vmul.w vr31, vr23, vr21
    vssrarni.h.w vr30, vr0, 12 // t14a
    vssrarni.h.w vr31, vr7, 12 // t9a
    // vr5 -> t13a / t10a
    vldrepl.w vr20, t0, 48 // 1931
    vldrepl.w vr21, t0, 52 // 3612
    vsllwil.w.h vr22, vr5, 0
    vexth.w.h vr23, vr5
    vmul.w vr0, vr22, vr21
    vmul.w vr24, vr23, vr21
    vmul.w vr5, vr22, vr20
    vmul.w vr25, vr23, vr20
    vssrarni.h.w vr24, vr0, 12 // t13a
    vssrarni.h.w vr25, vr5, 12 // t10a
    // vr3 -> t12a / t11a (second coefficient negated)
    vldrepl.w vr20, t0, 56 // 3920
    vldrepl.w vr21, t0, 60 // 1189
    vsllwil.w.h vr22, vr3, 0
    vexth.w.h vr23, vr3
    vneg.w vr21, vr21
    vmul.w vr0, vr22, vr20
vmul.w vr26, vr23, vr20 vmul.w vr3, vr22, vr21 vmul.w vr27, vr23, vr21 vssrarni.h.w vr26, vr0, 12 // t12a vssrarni.h.w vr27, vr3, 12 // t11a // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 vsadd.h vr28, vr29, vr31 // t8 vssub.h vr19, vr29, vr31 // t9 vssub.h vr29, vr27, vr25 // t10 vsadd.h vr9, vr27, vr25 // t11 vsadd.h vr31, vr26, vr24 // t12 vssub.h vr25, vr26, vr24 // t13 vssub.h vr27, vr10, vr30 // t14 vsadd.h vr24, vr10, vr30 // t15 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 vssrarni.h.w vr26, vr0, 12 // t14a vssrarni.h.w vr30, vr1, 12 // t9a vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 vneg.w vr0, vr0 vneg.w vr19, vr19 vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 vssrarni.h.w vr19, vr0, 12 // t10a vssrarni.h.w vr27, vr1, 12 // t13a vsadd.h vr25, vr28, vr9 // t8a vssub.h vr29, vr28, vr9 // t11a vssub.h vr28, vr24, vr31 // t12a vsadd.h vr10, vr24, vr31 // t15a vsadd.h vr9, vr30, vr19 // t9 vssub.h vr31, vr30, vr19 // t10 vssub.h vr30, vr26, vr27 // t13 vsadd.h vr24, vr26, vr27 // t14 vldrepl.w vr20, t0, 0 // 2896 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 vssrarni.h.w vr26, vr0, 12 // t13a vssrarni.h.w vr27, vr1, 12 // t10a vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 vssrarni.h.w vr31, vr0, 12 // t12 vssrarni.h.w vr30, vr1, 12 // t11 // vr11 vr12 ... 
vr18 vsadd.h vr28, vr14, vr31 // c[3] vssub.h vr29, vr14, vr31 // c[12] vsadd.h vr20, vr15, vr30 // c[4] vssub.h vr21, vr15, vr30 // c[11] vsadd.h vr14, vr16, vr27 // c[5] vssub.h vr23, vr16, vr27 // c[10] vsadd.h vr15, vr17, vr9 // c[6] vssub.h vr30, vr17, vr9 // c[9] vsadd.h vr16, vr18, vr25 // c[7] vssub.h vr27, vr18, vr25 // c[8] vsadd.h vr17, vr13, vr26 // c[2] vssub.h vr26, vr13, vr26 // c[13] vsadd.h vr18, vr12, vr24 // c[1] vssub.h vr25, vr12, vr24 // c[14] vsadd.h vr22, vr11, vr10 // c[0] vssub.h vr24, vr11, vr10 // c[15] .endm // dct_8x16_tx64_core_lsx .macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 vsllwil.w.h vr22, \in0, 0 vexth.w.h vr23, \in0 vmul.w \tmp0, vr22, \in1 vmul.w \out0, vr23, \in1 vmul.w \tmp1, vr22, \in2 vmul.w \out1, vr23, \in2 vssrarni.h.w \out0, \tmp0, 12 vssrarni.h.w \out1, \tmp1, 12 .endm const idct64_coeffs, align=4 .word 101, 4095, 2967, -2824 .word 1660, 3745, 3822, -1474 .word 4076, 401, 4017, 799 .word 4036, -700, 2359, 3349 .word 3461, -2191, 897, 3996 .word -3166, -2598, -799, -4017 .word 501, 4065, 3229, -2520 .word 2019, 3564, 3948, -1092 .word 3612, 1931, 2276, 3406 .word 4085, -301, 2675, 3102 .word 3659, -1842, 1285, 3889 .word -3920, -1189, -3406, -2276 endconst .macro dct64_step1_lsx vldrepl.w vr20, t0, 0 // 101 vldrepl.w vr21, t0, 4 // 4095 vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a vldrepl.w vr20, t0, 8 // 2967 vldrepl.w vr21, t0, 12 // -2824 vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a vldrepl.w vr20, t0, 16 // 1660 vldrepl.w vr21, t0, 20 // 3745 vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a vldrepl.w vr20, t0, 24 // 3822 vldrepl.w vr21, t0, 28 // -1474 vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a vsadd.h vr0, vr8, vr11 // t32 vssub.h vr1, vr8, vr11 // t33 vssub.h vr2, vr15, vr12 // t34 vsadd.h vr3, vr15, vr12 // t35 vsadd.h vr4, vr14, vr13 // t60 vssub.h vr5, vr14, vr13 // 
t61 vssub.h vr6, vr9, vr10 // t62 vsadd.h vr7, vr9, vr10 // t63 vldrepl.w vr20, t0, 32 // 4076 vldrepl.w vr21, t0, 36 // 401 vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 vssrarni.h.w vr10, vr9, 12 // t62a vssrarni.h.w vr11, vr13, 12 // t33a vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 vneg.w vr9, vr9 vneg.w vr1, vr1 vssrarni.h.w vr6, vr13, 12 // t61a vssrarni.h.w vr1, vr9, 12 // t34a vsadd.h vr2, vr0, vr3 // t32a vssub.h vr5, vr0, vr3 // t35a vsadd.h vr9, vr11, vr1 // t33 vssub.h vr13, vr11, vr1 // t34 vssub.h vr0, vr7, vr4 // t60a vsadd.h vr3, vr7, vr4 // t63a vssub.h vr1, vr10, vr6 // t61 vsadd.h vr11, vr10, vr6 // t62 vldrepl.w vr20, t0, 40 // 4017 vldrepl.w vr21, t0, 44 // 799 vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 vssrarni.h.w vr4, vr8, 12 // t61a vssrarni.h.w vr7, vr12, 12 // t34a vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 vssrarni.h.w vr6, vr8, 12 // t60 vssrarni.h.w vr10, vr12, 12 // t35 vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3 .endm // dct64_step1 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a .macro dct64_step2_lsx vld vr0, t5, 0 // t32a vld vr2, t4, 0 // t63a vld vr3, t5, 16*8 // t56a vld vr1, t4, 16*8 // t39a vld vr4, t5, 16*16 // t40a vld vr6, t4, 16*16 // t55a vld vr7, t5, 16*24 // t48a vld vr5, t4, 16*24 // t47a vsadd.h vr8, vr0, vr1 // t32 vssub.h vr9, vr0, vr1 // t39 vsadd.h vr10, vr2, vr3 // t63 vssub.h vr11, vr2, vr3 // t56 vssub.h vr12, vr5, vr4 // t40 vsadd.h vr13, vr5, vr4 // t47 vsadd.h vr14, vr7, vr6 // t48 vssub.h vr15, vr7, vr6 // t55 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 
vssrarni.h.w vr2, vr0, 12 // t56a vssrarni.h.w vr3, vr1, 12 // t39a vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 vneg.w vr0, vr0 vneg.w vr4, vr4 vssrarni.h.w vr5, vr1, 12 // t55a vssrarni.h.w vr4, vr0, 12 // t40a vsadd.h vr9, vr8, vr13 // t32a vssub.h vr11, vr8, vr13 // t47a vsadd.h vr6, vr3, vr4 // t39 vssub.h vr7, vr3, vr4 // t40 vssub.h vr12, vr10, vr14 // t48a vsadd.h vr15, vr10, vr14 // t63a vssub.h vr0, vr2, vr5 // t55 vsadd.h vr1, vr2, vr5 // t56 vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 vssrarni.h.w vr13, vr8, 12 // t40a vssrarni.h.w vr4, vr3, 12 // t55a vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 vssrarni.h.w vr10, vr8, 12 // t47 vssrarni.h.w vr14, vr3, 12 // t48 // t32a t39 t40a t47 t48 t55a t56 t63a // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 vst vr9, t5, 0 // t32a vst vr6, t4, 0 // t39 vst vr13, t5, 16*8 // t40a vst vr10, t4, 16*8 // t47 vst vr14, t5, 16*16 // t48 vst vr4, t4, 16*16 // t55a vst vr1, t5, 16*24 // t56 vst vr15, t4, 16*24 // t63a .endm // dct64_step2_lsx .macro dct64_step3_lsx // t0 t1 t2 t3 t4 t5 t6 t7 vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 vld vr9, t5, 16*24 // t56 vld vr6, t5, 16*24+16 // t57a vld vr13, t5, 16*24+32 // t58 vld vr10, t5, 16*24+48 // t59a vld vr14, t4, 16*24-48 // t60 vld vr4, t4, 16*24-32 // t61a vld vr1, t4, 16*24-16 // t62 vld vr15, t4, 16*24 // t63a vsadd.h vr20, vr2, vr15 // c[0] vssub.h vr21, vr2, vr15 // c[63] vsadd.h vr22, vr3, vr1 // c[1] vssub.h vr23, vr3, vr1 // c[62] vsadd.h vr24, vr7, vr4 // c[2] vssub.h vr25, vr7, vr4 // c[61] vsadd.h vr26, vr8, vr14 // c[3] vssub.h vr27, vr8, vr14 // c[60] vsadd.h vr28, vr11, vr10 // c[4] vssub.h vr29, vr11, vr10 // c[59] vsadd.h vr30, vr12, vr13 // c[5] vssub.h vr31, vr12, vr13 // c[58] vsadd.h vr2, vr16, vr6 // c[6] vssub.h vr15, vr16, vr6 // c[57] vsadd.h vr1, vr17, vr9 // c[7] 
vssub.h vr3, vr17, vr9 // c[56] .endm // dct64_step3_lsx .macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 dct64_step3_lsx .ifnb \transpose8x8 LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 .endif .ifnb \shift .irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 vsrari.h \i, \i, \shift .endr .endif vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 .endm // dct64_step4_lsx .macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 fld.d f4, t0, 0 fldx.d f5, t0, a1 fld.d f6, t6, 0 fldx.d f7, t6, a1 alsl.d t0, a1, t0, 2 alsl.d t6, a1, t6, 2 fld.d f8, t0, 0 fldx.d f9, t0, a1 fld.d f10, t6, 0 fldx.d f11, t6, a1 .irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 vsllwil.hu.bu \i, \i, 0 .endr vsrari.h vr20, \in0, 4 vsrari.h vr22, \in1, 4 vsrari.h vr24, \in2, 4 vsrari.h vr26, \in3, 4 vsrari.h vr28, \in4, 4 vsrari.h vr30, \in5, 4 vsrari.h vr2, \in6, 4 vsrari.h vr1, \in7, 4 vadd.h vr4, vr4, vr20 vadd.h vr5, vr5, vr22 vadd.h vr6, vr6, vr24 vadd.h vr7, vr7, vr26 vadd.h vr8, vr8, vr28 vadd.h vr9, vr9, vr30 vadd.h vr10, vr10, vr2 vadd.h vr11, vr11, vr1 vssrani.bu.h vr5, vr4, 0 vssrani.bu.h vr7, vr6, 0 vssrani.bu.h vr9, vr8, 0 vssrani.bu.h vr11, vr10, 0 vstelm.d vr5, t1, 0, 0 vstelm.d vr5, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr7, t1, 0, 0 vstelm.d vr7, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr9, t1, 0, 0 vstelm.d vr9, t2, 0, 1 alsl.d t1, a1, t1, 1 alsl.d t2, a1, t2, 1 vstelm.d vr11, t1, 0, 0 vstelm.d vr11, t2, 0, 1 .endm // dct64_step5_lsx .macro dct_8x32_tx64_new_lsx vld_loc0, stride0, 
vld_loc1, stride1, rect2 vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 dct_8x16_tx64_core_lsx \rect2 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 vxor.v vr31, vr31, vr31 vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 la.local t0, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, t0, 0 // 2896 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 rect2_lsx \i, vr23, \i .endr .endif vldrepl.w vr20, t0, 64 // 201 vldrepl.w vr21, t0, 68 // 4091 vsllwil.w.h vr22, vr0, 0 vexth.w.h vr23, vr0 vmul.w vr8, vr22, vr21 vmul.w vr9, vr23, vr21 vmul.w vr0, vr22, vr20 vmul.w vr10, vr23, vr20 vssrarni.h.w vr9, vr8, 12 // t31a vssrarni.h.w vr10, vr0, 12 // t16a vldrepl.w vr20, t0, 72 // 3035 vldrepl.w vr21, t0, 76 // 2751 vsllwil.w.h vr22, vr7, 0 vexth.w.h vr23, vr7 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr0, vr23, vr20 vmul.w vr7, vr22, vr21 vmul.w vr30, vr23, vr21 vssrarni.h.w vr0, vr8, 12 // t30a vssrarni.h.w vr30, vr7, 12 // t17a vldrepl.w vr20, t0, 80 // 1751 vldrepl.w vr21, t0, 84 // 3703 vsllwil.w.h vr22, vr4, 0 vexth.w.h vr23, vr4 vmul.w vr8, vr22, vr21 vmul.w vr7, vr23, vr21 vmul.w vr4, vr22, vr20 vmul.w vr19, vr23, vr20 vssrarni.h.w vr7, vr8, 12 // t29a vssrarni.h.w vr19, vr4, 12 // t18a vldrepl.w vr20, t0, 88 // 3857 vldrepl.w vr21, t0, 92 // 1380 vsllwil.w.h vr22, vr3, 0 vexth.w.h vr23, vr3 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr4, vr23, vr20 vmul.w vr3, vr22, vr21 vmul.w vr26, vr23, vr21 vssrarni.h.w vr4, vr8, 12 // t28a vssrarni.h.w vr26, vr3, 12 // t19a vldrepl.w vr20, t0, 96 // 995 vldrepl.w vr21, t0, 100 // 3973 vsllwil.w.h vr22, vr2, 0 vexth.w.h vr23, vr2 vmul.w vr8, vr22, vr21 vmul.w vr3, vr23, vr21 vmul.w vr2, vr22, vr20 vmul.w vr27, vr23, vr20 vssrarni.h.w vr3, vr8, 12 // 
t27a vssrarni.h.w vr27, vr2, 12 // t20a vldrepl.w vr20, t0, 104 // 3513 vldrepl.w vr21, t0, 108 // 2106 vsllwil.w.h vr22, vr5, 0 vexth.w.h vr23, vr5 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr2, vr23, vr20 vmul.w vr5, vr22, vr21 vmul.w vr28, vr23, vr21 vssrarni.h.w vr2, vr8, 12 // t26a vssrarni.h.w vr28, vr5, 12 // t21a vldrepl.w vr20, t0, 112 // 2440 -> 1220 vldrepl.w vr21, t0, 116 // 3290 -> 1645 vsllwil.w.h vr22, vr6, 0 vexth.w.h vr23, vr6 vmul.w vr8, vr22, vr21 vmul.w vr5, vr23, vr21 vmul.w vr6, vr22, vr20 vmul.w vr25, vr23, vr20 vssrarni.h.w vr5, vr8, 12 // t25a vssrarni.h.w vr25, vr6, 12 // t22a vldrepl.w vr20, t0, 120 // 4052 vldrepl.w vr21, t0, 124 // 601 vsllwil.w.h vr22, vr1, 0 vexth.w.h vr23, vr1 vneg.w vr21, vr21 vmul.w vr8, vr22, vr20 vmul.w vr6, vr23, vr20 vmul.w vr1, vr22, vr21 vmul.w vr24, vr23, vr21 vssrarni.h.w vr6, vr8, 12 // t24a vssrarni.h.w vr24, vr1, 12 // t23a vsadd.h vr1, vr10, vr30 // t16 vssub.h vr29, vr10, vr30 // t17 vssub.h vr8, vr26, vr19 // t18 vsadd.h vr31, vr26, vr19 // t19 vsadd.h vr10, vr27, vr28 // t20 vssub.h vr30, vr27, vr28 // t21 vssub.h vr19, vr24, vr25 // t22 vsadd.h vr26, vr24, vr25 // t23 vsadd.h vr27, vr6, vr5 // t24 vssub.h vr28, vr6, vr5 // t25 vssub.h vr24, vr3, vr2 // t26 vsadd.h vr25, vr3, vr2 // t27 vsadd.h vr5, vr4, vr7 // t28 vssub.h vr6, vr4, vr7 // t29 vssub.h vr2, vr9, vr0 // t30 vsadd.h vr3, vr9, vr0 // t31 vldrepl.w vr20, t0, 16 // 799 vldrepl.w vr21, t0, 20 // 4017 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 vssrarni.h.w vr7, vr4, 12 // t30a vssrarni.h.w vr0, vr11, 12 // t17a vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 vneg.w vr4, vr4 vneg.w vr9, vr9 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 vssrarni.h.w vr9, vr4, 12 // t18a vssrarni.h.w vr2, vr11, 12 // t29a vldrepl.w vr20, t0, 24 // 3406 -> 1703 vldrepl.w vr21, t0, 28 // 2276 -> 1138 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 vssrarni.h.w vr29, 
vr4, 12 // t26a vssrarni.h.w vr6, vr11, 12 // t21a vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 vneg.w vr4, vr4 vneg.w vr8, vr8 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 vssrarni.h.w vr8, vr4, 12 // t22a vssrarni.h.w vr24, vr11, 12 // t25a vsadd.h vr4, vr1, vr31 // t16a vssub.h vr30, vr1, vr31 // t19a vsadd.h vr19, vr0, vr9 // t17 vssub.h vr28, vr0, vr9 // t18 vssub.h vr1, vr26, vr10 // t20a vsadd.h vr31, vr26, vr10 // t23a vssub.h vr0, vr8, vr6 // t21 vsadd.h vr9, vr8, vr6 // t22 vsadd.h vr10, vr27, vr25 // t24a vssub.h vr26, vr27, vr25 // t27a vsadd.h vr6, vr24, vr29 // t25 vssub.h vr8, vr24, vr29 // t26 vssub.h vr25, vr3, vr5 // t28a vsadd.h vr27, vr3, vr5 // t31a vssub.h vr24, vr7, vr2 // t29 vsadd.h vr29, vr7, vr2 // t30 vldrepl.w vr20, t0, 8 // 1567 vldrepl.w vr21, t0, 12 // 3784 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 vssrarni.h.w vr5, vr3, 12 // t29a vssrarni.h.w vr2, vr11, 12 // 18a vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 vssrarni.h.w vr7, vr3, 12 // t28 vssrarni.h.w vr24, vr11, 12 // t19 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 vneg.w vr3, vr3 vneg.w vr28, vr28 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 vssrarni.h.w vr28, vr3, 12 // t20 vssrarni.h.w vr25, vr11, 12 // t27 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 vneg.w vr3, vr3 vneg.w vr30, vr30 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 vssrarni.h.w vr30, vr3, 12 // t21a vssrarni.h.w vr1, vr11, 12 // t26a vsadd.h vr3, vr4, vr31 // t16 vssub.h vr26, vr4, vr31 // t23 vsadd.h vr0, vr19, vr9 // t17a vssub.h vr8, vr19, vr9 // t22a vsadd.h vr4, vr2, vr30 // t18 vssub.h vr31, vr2, vr30 // t21 vsadd.h vr9, vr24, vr28 // t19a vssub.h vr19, vr24, vr28 // t20a vssub.h vr2, vr27, vr10 // t24 vsadd.h vr30, vr27, vr10 // t31 vssub.h vr24, vr29, vr6 // t25a vsadd.h vr28, vr29, vr6 // t30a vssub.h vr10, vr5, vr1 // t26 vsadd.h vr27, vr5, vr1 // t29 vssub.h vr6, vr7, vr25 // t27a vsadd.h vr29, 
vr7, vr25 // t28a vldrepl.w vr20, t0, 0 // 2896 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 vssrarni.h.w vr5, vr1, 12 // t20 vssrarni.h.w vr7, vr11, 12 // t27 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 vssrarni.h.w vr25, vr1, 12 // t21a vssrarni.h.w vr6, vr11, 12 // t26a vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 vssrarni.h.w vr19, vr1, 12 // t22 vssrarni.h.w vr10, vr11, 12 // t25 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 vssrarni.h.w vr31, vr1, 12 // t23a vssrarni.h.w vr8, vr11, 12 // t24a // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr30 // c[0] vssub.h vr2, vr11, vr30 // c[31] vsadd.h vr24, vr12, vr28 // c[1] vssub.h vr26, vr12, vr28 // c[30] vsadd.h vr11, vr13, vr27 // c[2] vssub.h vr30, vr13, vr27 // c[29] vsadd.h vr12, vr14, vr29 // c[3] vssub.h vr28, vr14, vr29 // c[28] vsadd.h vr13, vr15, vr7 // c[4] vssub.h vr27, vr15, vr7 // c[27] vsadd.h vr14, vr16, vr6 // c[5] vssub.h vr29, vr16, vr6 // c[26] vsadd.h vr7, vr17, vr10 // c[6] vssub.h vr15, vr17, vr10 // c[25] vsadd.h vr6, vr18, vr8 // c[7] vssub.h vr16, vr18, vr8 // c[24] vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 vsadd.h vr1, vr11, vr31 // c[8] vssub.h vr2, vr11, vr31 // c[23] vsadd.h vr24, vr12, vr19 // c[9] vssub.h vr26, vr12, vr19 // c[22] vsadd.h vr11, vr13, vr25 // c[10] vssub.h vr30, vr13, vr25 // c[21] vsadd.h vr12, vr14, vr5 // c[11] vssub.h vr28, vr14, vr5 // c[20] vsadd.h vr13, vr15, vr9 // c[12] vssub.h vr27, vr15, vr9 // c[19] vsadd.h vr14, vr16, vr4 // c[13] 
vssub.h vr29, vr16, vr4 // c[18] vsadd.h vr7, vr17, vr0 // c[14] vssub.h vr15, vr17, vr0 // c[17] vsadd.h vr6, vr18, vr3 // c[15] vssub.h vr16, vr18, vr3 // c[16] vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 .endm // dct_8x32_tx64_new_lsx .macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7 vsllwil.hu.bu vr4, vr10, 0 vsllwil.hu.bu vr5, vr11, 0 vsllwil.hu.bu vr6, vr12, 0 vsllwil.hu.bu vr7, vr13, 0 vexth.hu.bu vr10, vr10 vexth.hu.bu vr11, vr11 vexth.hu.bu vr12, vr12 vexth.hu.bu vr13, vr13 vadd.h vr4, vr4, \in0 vadd.h vr10, vr10, \in1 vadd.h vr5, vr5, \in2 vadd.h vr11, vr11, \in3 vadd.h vr6, vr6, \in4 vadd.h vr12, vr12, \in5 vadd.h vr7, vr7, \in6 vadd.h vr13, vr13, \in7 vssrani.bu.h vr10, vr4, 0 vssrani.bu.h vr11, vr5, 0 vssrani.bu.h vr12, vr6, 0 vssrani.bu.h vr13, vr7, 0 vst vr10, a0, 0 vst vr11, a0, 16 vst vr12, a0, 32 vst vr13, a0, 48 .endm .macro idct_dc_w64 w, h, shift ld.h t2, a2, 0 vldi vr0, 0x8b5 vreplgr2vr.w vr1, t2 vldi vr20, 0x880 vmul.w vr2, vr0, vr1 st.h zero, a2, 0 vsrari.w vr2, vr2, 8 vld vr13, a0, 48 .if (2*\w == \h) || (2*\h == \w) vmul.w vr2, vr2, vr0 vsrari.w vr2, vr2, 8 .endif .if \shift>0 vsrari.w vr2, vr2, \shift .endif vld vr11, a0, 16 vmadd.w vr20, vr2, vr0 vld vr12, a0, 32 vssrarni.h.w vr20, vr20, 12 vld vr10, a0, 0 .endm function inv_txfm_add_dct_dct_64x64_8bpc_lsx bnez a3, .NO_HAS_DCONLY_64x64 idct_dc_w64 64, 64, 2 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 li.w t3, 63 .loop63: add.d a0, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 addi.d t3, t3, -1 blt zero, t3, .loop63 b .DCT_DCT_64X64_END .NO_HAS_DCONLY_64x64: malloc_space 64*32*2+512+512 .macro dct64x64_core1_lsx shift, rect2 //addi.d t2, a2, \in0 //addi.d t7, t7, \in1 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t6, t3, 512 add.d t5, t6, zero dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2 la.local 
t0, idct64_coeffs vxor.v vr31, vr31, vr31 //addi.d a4, a2, \in2 // 32 ... // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a vld vr0, a4, 128*0 // in1 vld vr1, a4, 128*15 // in31 vld vr2, a4, 128*8 // in17 vld vr3, a4, 128*7 // in15 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*0 vst vr31, a4, 128*15 vst vr31, a4, 128*8 vst vr31, a4, 128*7 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a vld vr0, a4, 128*3 // in7 vld vr1, a4, 128*12 // in25 vld vr2, a4, 128*11 // in23 vld vr3, a4, 128*4 // in9 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*3 vst vr31, a4, 128*12 vst vr31, a4, 128*11 vst vr31, a4, 128*4 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a vld vr0, a4, 128*2 // in5 vld vr1, a4, 128*13 // in27 vld vr2, a4, 128*10 // in21 vld vr3, a4, 128*5 // in11 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*2 vst vr31, a4, 128*13 vst vr31, a4, 128*10 vst vr31, a4, 128*5 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld vr0, a4, 128*1 // in3 vld vr1, a4, 128*14 // in29 vld vr2, a4, 128*9 // in19 vld vr3, a4, 128*6 // in13 la.local a6, idct_coeffs .ifc \rect2, rect2_lsx vldrepl.w vr23, a6, 0 // 2896 .irp i, vr0, vr1, vr2, vr3 rect2_lsx \i, vr23, \i .endr .endif vst vr31, a4, 128*1 vst vr31, a4, 128*14 vst vr31, a4, 128*9 vst vr31, a4, 128*6 dct64_step1_lsx la.local t0, idct_coeffs addi.d t4, t5, 16*7 // t32a/t39/t40a/t47/t48/t55a/t56/t63a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // 
t34a/t37/t42a/t45/t50/t53a/t58/t61a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 dct64_step2_lsx li.w t4, 64*32*2+64+512 add.d t5, t4, sp addi.d t4, t5, 16*7 dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128 addi.d t5, t5, -16*8 addi.d t4, t4, -16*8 addi.d t3, t3, 128 dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128 addi.d t5, t5, -16*8 addi.d t4, t4, -16*8 addi.d t3, t3, 128 dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128 .endm la.local t8, eob_32x32 addi.d t2, a2, 0 addi.d t7, sp, 64 addi.d t7, t7, 0 addi.d a4, a2, 64 .DCT_DCT_EOB_64x64: ld.h a5, t8, 0 addi.d t8, t8, 2 dct64x64_core1_lsx 2, no_rect2 addi.d t2, t2, 16 addi.d t7, t7, 128*8 addi.d a4, a4, 16 bge a3, a5, .DCT_DCT_EOB_64x64 la.local t8, eob_32x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*3+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*2+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_64x64_END li.d t1, 1024*1+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr .DCT_DCT_EOB_64x64_END: .macro dct64x64_core2_lsx in0, in1, rect2 addi.d t2, sp, 64+\in0 addi.d t7, sp, 64+\in0 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t6, t3, 512 add.d t5, t6, zero addi.d t2, t2, 1024 addi.d t2, t2, 1024 dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2 la.local t0, idct64_coeffs addi.d t2, sp, 64+64*2+\in0 addi.d t4, t2, 256*7 addi.d t4, 
t4, 256 vld vr0, t2, 256*0 // in1 vld vr1, t4, 256*7 // in31 vld vr2, t4, 256*0 // in17 vld vr3, t2, 256*7 // in15 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*3 // in7 vld vr1, t4, 256*4 // in25 vld vr2, t4, 256*3 // in23 vld vr3, t2, 256*4 // in9 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*2 // in5 vld vr1, t4, 256*5 // in27 vld vr2, t4, 256*2 // in21 vld vr3, t2, 256*5 // in11 dct64_step1_lsx addi.d t0, t0, 48 addi.d t6, t6, 128 vld vr0, t2, 256*1 // in3 vld vr1, t4, 256*6 // in29 vld vr2, t4, 256*1 // in19 vld vr3, t2, 256*6 // in13 dct64_step1_lsx la.local t0, idct_coeffs addi.d t4, t5, 16*7 // t32a/t39/t40a/t47/t48/t55a/t56/t63a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t34a/t37/t42a/t45/t50/t53a/t58/t61a dct64_step2_lsx addi.d t5, t5, 16 addi.d t4, t4, -16 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 dct64_step2_lsx li.w t4, 64*32*2+64+512 add.d t5, t4, sp addi.d t4, t5, 16*7 addi.d a0, a0, \in1 // 0 - 7, 56 -63 dct64_step3_lsx li.w t8, 0 mul.w t0, t8, a1 add.d t0, a0, t0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 56 mul.w t0, t8, a1 add.d t0, a0, t0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 8 - 15, 48 - 55 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 8 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 48 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 16 - 23, 40 - 47 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 16 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, 
a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 40 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 // 24 - 31, 32 - 39 addi.d t3, t3, 128 addi.d t4, t4, -16*8 addi.d t5, t5, -16*8 dct64_step3_lsx li.w t8, 24 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 li.w t8, 32 mul.w t0, t8, a1 add.d t0, t0, a0 alsl.d t6, a1, t0, 1 addi.d t1, t0, 0 add.d t2, t0, a1 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 .endm dct64x64_core2_lsx 16*0, 0, no_rect2 dct64x64_core2_lsx 16*1, 8, no_rect2 dct64x64_core2_lsx 16*2, 8, no_rect2 dct64x64_core2_lsx 16*3, 8, no_rect2 dct64x64_core2_lsx 16*4, 8, no_rect2 dct64x64_core2_lsx 16*5, 8, no_rect2 dct64x64_core2_lsx 16*6, 8, no_rect2 dct64x64_core2_lsx 16*7, 8, no_rect2 free_space 64*32*2+512+512 .DCT_DCT_64X64_END: endfunc function inv_txfm_add_dct_dct_64x32_8bpc_lsx bnez a3, .NO_HAS_DCONLY_64x32 idct_dc_w64 64, 32, 1 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 li.w t3, 31 .loop31: add.d a0, a0, a1 vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 addi.d t3, t3, -1 blt zero, t3, .loop31 b .DCT_DCT_64X32_END .NO_HAS_DCONLY_64x32: malloc_space 64*32*2+512+512 la.local t8, eob_32x32 addi.d t2, a2, 0 addi.d t7, sp, 64 addi.d t7, t7, 0 addi.d a4, a2, 64 .DCT_DCT_EOB_64x32: ld.h a5, t8, 0 addi.d t8, t8, 2 dct64x64_core1_lsx 1, rect2_lsx addi.d t2, t2, 16 addi.d t7, t7, 128*8 addi.d a4, a4, 16 bge a3, a5, .DCT_DCT_EOB_64x32 la.local t8, eob_32x32 vxor.v vr31, vr31, vr31 ld.h t7, t8, 4 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*3+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, 
vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 2 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*2+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr ld.h t7, t8, 0 bge a3, t7, .DCT_DCT_EOB_64x32_END li.d t1, 1024*1+64 add.d t0, sp, t1 .rept 4 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 addi.d t0, t0, 256 .endr .DCT_DCT_EOB_64x32_END: addi.d t2, sp, 64 li.w t4, 64*32*2+64 add.d t3, sp, t4 addi.d t5, sp, 64 addi.d t5, t5, 1024 addi.d t5, t5, 1024 .rept 8 vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d t4, t2, 1024 addi.d t4, t4, 1024 vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 inv_dct16_lsx no_rect2 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 addi.d t4, t2, 128 vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 addi.d t4, t4, 1024 addi.d t4, t4, 1024 vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4 addi.d t2, t2, 16 addi.d t5, t5, 16 addi.d t1, t1, 16 .endr addi.d t2, sp, 64 li.w t3, 32 .loop32: vld vr10, a0, 0 vld vr11, a0, 16 vld vr12, a0, 32 vld vr13, a0, 48 vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 add.d a0, a0, a1 addi.d t2, t2, 128 addi.d t3, t3, -1 blt zero, t3, .loop32 free_space 64*32*2+512+512 .DCT_DCT_64X32_END: endfunc .macro VLD_DST_ADD_W8_H32 in0 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, t3, 64 add.d a0, a1, a0 alsl.d t2, a1, t2, 2 vld vr4, t3, 0 vld vr5, t3, 16 vld vr6, t3, 32 vld vr7, t3, 48 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 addi.d t3, sp, \in0 add.d a0, a1, a0 alsl.d t2, a1, t2, 2 .endm function inv_txfm_add_dct_dct_8x32_8bpc_lsx 
// Body of inv_txfm_add_dct_dct_8x32_8bpc_lsx; the `function` directive that
// opens it is the statement directly above this span.
// Register use visible here: a0/t2 are the addresses handed to the DST_ADD
// macros, a1 is the step added to them, a2 is the coefficient buffer (loaded,
// then cleared), a3 is tested against zero and against the eob_8x32 table.
// NOTE(review): presumably dst, stride, coeff and eob of the C itx prototype;
// confirm against the caller.
    bnez a3, .NO_HAS_DCONLY_8x32

    // a3 == 0: DC-only path. idct_dc, DST_ADD_W8 and VLD_DST_ADD_W8 are
    // defined outside this span.
    idct_dc 8, 32, 2
    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
.rept 7
    add.d a0, a1, a0
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_8X32_END

.NO_HAS_DCONLY_8x32:
    malloc_space 512
    la.local t8, eob_8x32               // t8 walks the threshold table
    addi.d t3, sp, 64                   // t3 = scratch write cursor
    addi.d t2, a2, 0

    // First pass. Each iteration takes 8 vectors from a2 (vld_x8 presumably
    // takes base, start offset, byte step, registers), runs the 8-point DCT,
    // applies a rounding shift by 2, clears the source coefficients,
    // transposes 8x8 and stores the result at t3.
.DCT_DCT_EOB_8x32:
    ld.h t7, t8, 0                      // t7 = threshold for this iteration
    addi.d t8, t8, 2
    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h \i, \i, 2
.endr
    vxor.v vr31, vr31, vr31
    vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    addi.d a2, a2, 16
    addi.d t3, t3, 128
    bge a3, t7, .DCT_DCT_EOB_8x32       // repeat while a3 >= threshold

    // The loop advanced t3 by 128 bytes per iteration. Re-test a3 against
    // the thresholds from table entry 2 down to entry 0 and zero each
    // 128-byte scratch slot that the loop did not reach.
    la.local t8, eob_8x32
    vxor.v vr31, vr31, vr31
    ld.h t7, t8, 4
    bge a3, t7, .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 2
    bge a3, t7, .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 0
    bge a3, t7, .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // Second pass over the scratch area at sp+64: the vectors at offsets
    // 0, 32, 64, ... go through inv_dct16_lsx and are written back, then the
    // vectors at offsets 16, 48, 80, ... are loaded for dct_8x32_core_lsx
    // (defined outside this span).
.DCT_DCT_EOB_8x32_END:
    addi.d t2, sp, 64
    addi.d t3, sp, 64
    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_dct16_lsx .8h
    vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4

    // Add the result to the destination. VLD_DST_ADD_W8_H32 reads from t3
    // and finishes by setting t3 = sp + <argument>, so each argument names
    // the scratch offset used by the following invocation.
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
    addi.d t3, sp, 64
    VLD_DST_ADD_W8_H32 320
    VLD_DST_ADD_W8_H32 448
    VLD_DST_ADD_W8_H32 192
    VLD_DST_ADD_W8_H32 0
    free_space 512
.DCT_DCT_8X32_END:
endfunc

function inv_txfm_add_identity_identity_8x32_8bpc_lsx
// Body of inv_txfm_add_identity_identity_8x32_8bpc_lsx; the `function`
// directive that opens it is the statement directly above this span.
// Each loop iteration loads 8 vectors from a2, clears them in the buffer,
// applies a rounding shift by 1, transposes into vr16..vr23, applies a
// rounding shift by 2 and hands the rows to VLD_DST_ADD_W8 (defined outside
// this span). The loop repeats while a3 >= the eob_8x32 entry read into t6.
// NOTE(review): a0/a1/a2/a3 are presumably dst, stride, coeff and eob;
// confirm against the caller.
    la.local t7, eob_8x32
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
.IDENTITY_IDENTITY_EOB_8x32:
    ld.h t6, t7, 0                      // t6 = threshold for this iteration
    addi.d t7, t7, 2
    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vxor.v vr23, vr23, vr23
    vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h \i, \i, 1
.endr
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari.h \i, \i, 2
.endr
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    add.d a0, a1, a0
    alsl.d t2, a1, a0, 1
    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
    add.d a0, a1, a0
    alsl.d t2, a1, a0, 1
    addi.d a2, a2, 16
    bge a3, t6, .IDENTITY_IDENTITY_EOB_8x32
endfunc

// def_fn_16x4_base txfm
// Emits the shared helper inv_txfm_<txfm>add_16x4_lsx. With txfm equal to
// `identity_` the first stage is done inline through inv_identity16_lsx;
// otherwise the first-stage routine is called through the address in t7.
// The second-stage routine is always called through the address in t8.
// Both addresses are loaded by the entry points before they branch here.
.macro def_fn_16x4_base txfm
functionl inv_txfm_\txfm\()add_16x4_lsx
    vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14

.ifc \txfm, identity_
    li.w t0, 1697
    vreplgr2vr.w vr20, t0
.irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
    inv_identity16_lsx \i, vr20, \i, \i, .8h
.endr
    // Copy the upper 64 bits of each even register into the low half of the
    // following odd register.
    vilvh.d vr1, vr0, vr0
    vilvh.d vr3, vr2, vr2
    vilvh.d vr5, vr4, vr4
    vilvh.d vr7, vr6, vr6
    vilvh.d vr9, vr8, vr8
    vilvh.d vr11, vr10, vr10
    vilvh.d vr13, vr12, vr12
    vilvh.d vr15, vr14, vr14
.else
    vilvh.d vr1, vr0, vr0
    vilvh.d vr3, vr2, vr2
    vilvh.d vr5, vr4, vr4
    vilvh.d vr7, vr6, vr6
    vilvh.d vr9, vr8, vr8
    vilvh.d vr11, vr10, vr10
    vilvh.d vr13, vr12, vr12
    vilvh.d vr15, vr14, vr14
    // Indirect call through t7; ra is parked in t6 across the call.
    move t6, ra
    jirl ra, t7, 0
    move ra, t6
.endif

    // Clear the coefficients that were loaded above.
    vxor.v vr23, vr23, vr23
    vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23

    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21
    LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \
                       vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21

    // First group of four vectors: rounding shift by 1, second-stage call
    // through t8, rounding shift by 4 into vr8..vr11.
    vsrari.h vr0, vr0, 1
    vsrari.h vr1, vr1, 1
    vsrari.h vr2, vr2, 1
    vsrari.h vr3, vr3, 1
    move t6, ra
    jirl ra, t8, 0
    move ra, t6
    vsrari.h vr8, vr0, 4
    vsrari.h vr9, vr1, 4
    vsrari.h vr10, vr2, 4
// Remainder of the helper emitted by def_fn_16x4_base: finish the rounding
// shift of the first group, then push the second group (vr4..vr7) through
// the same steps (rounding shift by 1, call through t8 with ra parked in t6,
// rounding shift by 4 into vr16..vr19) and add both groups to the
// destination.
    vsrari.h vr11, vr3, 4
    vsrari.h vr0, vr4, 1
    vsrari.h vr1, vr5, 1
    vsrari.h vr2, vr6, 1
    vsrari.h vr3, vr7, 1
    move t6, ra
    jirl ra, t8, 0
    move ra, t6
    vsrari.h vr16, vr0, 4
    vsrari.h vr17, vr1, 4
    vsrari.h vr18, vr2, 4
    vsrari.h vr19, vr3, 4
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
    VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19
endfuncl
.endm

def_fn_16x4_base identity_
def_fn_16x4_base

// fn_16x4 txfm1, txfm2
// Emits the entry point inv_txfm_add_<txfm1>_<txfm2>_16x4_8bpc_lsx. It loads
// the first-stage routine address into t7 (skipped when txfm1 is identity)
// and the second-stage routine address into t8, then branches to the shared
// helper generated above. Only the dct_dct instance gets the a3 == 0
// shortcut, so .NO_HAS_DCONLY_16x4 is defined exactly once.
.macro fn_16x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez a3, .NO_HAS_DCONLY_16x4
    idct_dc 16, 4, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
    b .\txfm1\()_\txfm2\()_16x4_END
.NO_HAS_DCONLY_16x4:
.endif

.ifnc \txfm1, identity
    la.local t7, inv_\txfm1\()_4h_x16_lsx
.endif
    la.local t8, inv_\txfm2\()_8h_x4_lsx

.ifc \txfm1, identity
    b inv_txfm_identity_add_16x4_lsx
.else
    b inv_txfm_add_16x4_lsx
.endif
.\txfm1\()_\txfm2\()_16x4_END:
endfunc
.endm

fn_16x4 dct, dct
fn_16x4 identity, identity
fn_16x4 adst, dct

// VLD_DST_ADD_W16_H32 in0
// Runs the following sequence twice: load four vectors from t3 and four from
// t5, pass them through vsrari_h_x8 with shift 4, hand them to
// VLD_DST_ADD_W16, advance a0 and t2 by 4 * a1 and t3/t5 by 64 bytes.
// Finally repoints t3 = sp + in0 and t5 = sp + in0 + 512 for the next
// invocation. vsrari_h_x8 and VLD_DST_ADD_W16 are defined outside this span.
.macro VLD_DST_ADD_W16_H32 in0
    vld vr14, t3, 0
    vld vr15, t3, 16
    vld vr16, t3, 32
    vld vr17, t3, 48
    vld vr18, t5, 0
    vld vr19, t5, 16
    vld vr20, t5, 32
    vld vr21, t5, 48
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    alsl.d a0, a1, a0, 2                // a0 += 4 * a1
    alsl.d t2, a1, t2, 2                // t2 += 4 * a1
    addi.d t3, t3, 64
    addi.d t5, t5, 64
    vld vr14, t3, 0
    vld vr15, t3, 16
    vld vr16, t3, 32
    vld vr17, t3, 48
    vld vr18, t5, 0
    vld vr19, t5, 16
    vld vr20, t5, 32
    vld vr21, t5, 48
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    alsl.d a0, a1, a0, 2
    alsl.d t2, a1, t2, 2
    addi.d t3, sp, \in0
    addi.d t5, sp, \in0+512
.endm

// inv_txfm_add_dct_dct_16x32_8bpc_lsx
// Register use visible here: a0/t2 are the addresses handed to the DST_ADD
// macros, a1 is the step added to them, a2 is the coefficient buffer
// (loaded, then cleared), a3 is tested against zero and against eob_16x32.
// NOTE(review): presumably dst, stride, coeff and eob of the C itx
// prototype; confirm against the caller.
function inv_txfm_add_dct_dct_16x32_8bpc_lsx
    bnez a3, .NO_HAS_DCONLY_16x32

    // a3 == 0: DC-only path (idct_dc and DST_ADD_W16 are defined outside
    // this span).
    idct_dc 16, 32, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20
.rept 7
    alsl.d a0, a1, a0, 2                // a0 += 4 * a1
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b .DCT_DCT_16x32_END

.NO_HAS_DCONLY_16x32:
    malloc_space 512+512
    addi.d t3, sp, 64                   // t3 = scratch write cursor
    la.local t8, eob_16x32              // t8 walks the threshold table

    // First pass. Each iteration loads 16 vectors from a2, clears them in
    // the buffer, applies rect2_lsx with the constant 2896, runs the
    // 16-point DCT, transposes both 8x8 halves and applies a rounding shift
    // by 1. vr0..vr7 are stored at t3 and vr8..vr15 at t3 + 512.
.DCT_DCT_EOB_16x32:
    ld.h t7, t8, 0                      // t7 = threshold for this iteration
    addi.d t8, t8, 2
    vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v vr31, vr31, vr31
.irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960
    vst vr31, a2, \i
.endr
    li.w t0, 2896
    vreplgr2vr.w vr23, t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx \i, vr23, \i
.endr
    inv_dct16_lsx .8h
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h \i, \i, 1
.endr
    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d a2, a2, 16
    addi.d t3, t3, 128
    bge a3, t7, .DCT_DCT_EOB_16x32      // repeat while a3 >= threshold

    // The loop advanced t3 by 128 bytes per iteration. Re-test a3 against
    // the thresholds from table entry 2 down to entry 0 and zero the pair of
    // 128-byte scratch slots (low half and +512 half) that the loop did not
    // reach.
    la.local t8, eob_16x32
    vxor.v vr31, vr31, vr31
    ld.h t7, t8, 4
    bge a3, t7, .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 2
    bge a3, t7, .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    ld.h t7, t8, 0
    bge a3, t7, .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    // The comma after the step argument is written explicitly here, as in
    // every other vst_x8 call, instead of relying on blank-separated macro
    // arguments.
    vst_x8 sp, 64+512+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // Second pass, once per 512-byte half of the scratch area: vectors at
    // offsets 0, 32, 64, ... go through inv_dct16_lsx and are written back,
    // then the vectors at offsets 16, 48, 80, ... are loaded for
    // dct_8x32_core_lsx (defined outside this span). The trailing empty
    // macro arguments are kept as they were.
.DCT_DCT_EOB_16x32_END:
    addi.d t7, sp, 64
.rept 2
    vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_dct16_lsx .8h
    vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, ,
    addi.d t7, t7, 512
.endr

    // Add the result to the destination. Each VLD_DST_ADD_W16_H32 argument
    // is the scratch offset that the following invocation reads from.
    alsl.d t2, a1, a0, 1                // t2 = a0 + 2 * a1
    addi.d t3, sp, 64
    addi.d t5, sp, 512+64
    VLD_DST_ADD_W16_H32 320
    VLD_DST_ADD_W16_H32 448
    VLD_DST_ADD_W16_H32 192
    VLD_DST_ADD_W16_H32 0
    free_space 512+512
.DCT_DCT_16x32_END:
endfunc

// xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
// Widening 16-bit to 32-bit multiply-accumulate: out0 receives
// in0*in2 + in1*in3 for the even halfword lanes, out1 the same for the odd
// halfword lanes.
.macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
    xvmulwev.w.h \out0, \in0, \in2
    xvmulwod.w.h \out1, \in0, \in2
    xvmaddwev.w.h \out0, \in1, \in3
    xvmaddwod.w.h \out1, \in1, \in3
.endm

// xvsrari_h_x16 in0..in15, out0..out15, shift
// Applies xvsrari.h (rounding arithmetic right shift on halfwords) by
// `shift` to sixteen registers, writing outN from inN.
.macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3, \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15, shift
    xvsrari.h \out0, \in0, \shift
    xvsrari.h \out1, \in1, \shift
    xvsrari.h \out2, \in2, \shift
    xvsrari.h \out3, \in3, \shift
    xvsrari.h \out4, \in4, \shift
    xvsrari.h \out5, \in5, \shift
    xvsrari.h \out6, \in6, \shift
    xvsrari.h \out7, \in7, \shift
    xvsrari.h \out8, \in8, \shift
    xvsrari.h \out9, \in9, \shift
    xvsrari.h \out10, \in10, \shift
    xvsrari.h \out11, \in11, \shift
    xvsrari.h \out12, \in12, \shift
    xvsrari.h \out13, \in13, \shift
    xvsrari.h \out14, \in14, \shift
    xvsrari.h \out15, \in15, \shift
.endm

// xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
// Two pairs of 128-bit lane permutes. in0 and in1 are copied to tmp0 and
// tmp1 first, presumably so that out0/out1 may be the same registers as
// in0/in1. out0..out3 are also read by xvpermi.q, so their incoming contents
// matter.
.macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
    xvor.v \tmp0, \in0, \in0
    xvor.v \tmp1, \in1, \in1
    xvpermi.q \out0, \in2, 0x02
    xvpermi.q \out1, \in3, 0x02
    xvpermi.q \out2, \tmp0, 0x31
    xvpermi.q \out3, \tmp1, 0x31
.endm

// DST_ADD_W16_LASX in0..in7
// Zero-extends the bytes of in0..in3 to halfwords in xr0..xr3 and adds
// in4..in7 to them. The source line is wrapped inside the last xvadd.h, so
// that statement is left open here and its final operand follows on the next
// source line together with the rest of the macro.
.macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7
    vext2xv.hu.bu xr0, \in0
    vext2xv.hu.bu xr1, \in1
    vext2xv.hu.bu xr2, \in2
    vext2xv.hu.bu xr3, \in3
    xvadd.h xr0, xr0, \in4
    xvadd.h xr1, xr1, \in5
    xvadd.h xr2, xr2, \in6
    xvadd.h xr3, xr3,
\in7 xvssrani.bu.h xr1, xr0, 0 xvssrani.bu.h xr3, xr2, 0 xvpermi.d xr0, xr1, 0b11011000 xvpermi.d xr2, xr3, 0b11011000 xvpermi.d xr1, xr0, 0b00001110 xvpermi.d xr3, xr2, 0b00001110 vst vr0, a0, 0 vstx vr1, a0, a1 vst vr2, t2, 0 vstx vr3, t2, a1 .endm .macro XVLD_DST_ADD_W16 in0, in1, in2, in3 vld vr0, a0, 0 vldx vr1, a0, a1 vld vr2, t2, 0 vldx vr3, t2, a1 DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3 .endm .macro inv_adst16_lasx la.local t0, iadst16_coeffs_h xvldrepl.h xr20, t0, 0 // 4091 xvldrepl.h xr21, t0, 2 // 201 xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19 xvilvl.w xr15, xr18, xr16 xvilvl.w xr0, xr19, xr17 xvilvh.w xr18, xr18, xr16 xvilvh.w xr19, xr19, xr17 xvssrarni.h.w xr18, xr15, 12 // t0 xvssrarni.h.w xr19, xr0, 12 // t1 xvldrepl.h xr20, t0, 4 // 3973 xvldrepl.h xr21, t0, 6 // 995 xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15 xvilvl.w xr13, xr0, xr16 xvilvl.w xr2, xr15, xr17 xvilvh.w xr0, xr0, xr16 xvilvh.w xr15, xr15, xr17 xvssrarni.h.w xr0, xr13, 12 // t2 xvssrarni.h.w xr15, xr2, 12 // t3 xvldrepl.h xr20, t0, 8 // 3703 xvldrepl.h xr21, t0, 10 // 1751 xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13 xvilvl.w xr11, xr2, xr16 xvilvl.w xr4, xr13, xr17 xvilvh.w xr2, xr2, xr16 xvilvh.w xr13, xr13, xr17 xvssrarni.h.w xr2, xr11, 12 // t4 xvssrarni.h.w xr13, xr4, 12 // t5 xvldrepl.h xr20, t0, 12 // 3290 -> 1645 xvldrepl.h xr21, t0, 14 // 2440 -> 1220 xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11 xvilvl.w xr9, xr4, xr16 xvilvl.w xr6, xr11, xr17 xvilvh.w xr4, xr4, xr16 xvilvh.w xr11, xr11, xr17 xvssrarni.h.w xr4, xr9, 12 // t6 xvssrarni.h.w xr11, xr6, 12 // t7 xvldrepl.h xr20, t0, 16 // 2751 xvldrepl.h xr21, t0, 18 // 3035 
xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9 xvilvl.w xr7, xr6, xr16 xvilvl.w xr8, xr9, xr17 xvilvh.w xr6, xr6, xr16 xvilvh.w xr9, xr9, xr17 xvssrarni.h.w xr6, xr7, 12 // t8 xvssrarni.h.w xr9, xr8, 12 // t9 xvldrepl.h xr20, t0, 20 // 2106 xvldrepl.h xr21, t0, 22 // 3513 xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8 xvilvl.w xr5, xr7, xr16 xvilvl.w xr10, xr8, xr17 xvilvh.w xr7, xr7, xr16 xvilvh.w xr8, xr8, xr17 xvssrarni.h.w xr7, xr5, 12 // t10 xvssrarni.h.w xr8, xr10, 12 // t11 xvldrepl.h xr20, t0, 24 // 1380 xvldrepl.h xr21, t0, 26 // 3857 xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10 xvilvl.w xr3, xr5, xr16 xvilvl.w xr12, xr10, xr17 xvilvh.w xr5, xr5, xr16 xvilvh.w xr10, xr10, xr17 xvssrarni.h.w xr5, xr3, 12 // t12 xvssrarni.h.w xr10, xr12, 12 // t13 xvldrepl.h xr20, t0, 28 // 601 xvldrepl.h xr21, t0, 30 // 4052 xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12 xvilvl.w xr1, xr3, xr16 xvilvl.w xr14, xr12, xr17 xvilvh.w xr3, xr3, xr16 xvilvh.w xr12, xr12, xr17 xvssrarni.h.w xr3, xr1, 12 // t14 xvssrarni.h.w xr12, xr14, 12 // t15 xvsadd.h xr1, xr18, xr6 // t0a xvssub.h xr14, xr18, xr6 // t8a xvsadd.h xr16, xr19, xr9 // t1a xvssub.h xr17, xr19, xr9 // t9a xvsadd.h xr6, xr0, xr7 // t2a xvssub.h xr18, xr0, xr7 // t10a xvsadd.h xr9, xr15, xr8 // t3a xvssub.h xr19, xr15, xr8 // t11a xvsadd.h xr0, xr2, xr5 // t4a xvssub.h xr7, xr2, xr5 // t12a xvsadd.h xr8, xr13, xr10 // t5a xvssub.h xr15, xr13, xr10 // t13a xvsadd.h xr2, xr4, xr3 // t6a xvssub.h xr5, xr4, xr3 // t14a xvsadd.h xr10, xr11, xr12 // t7a xvssub.h xr13, xr11, xr12 // t15a la.local t0, idct_coeffs_h xvldrepl.h xr20, t0, 8 // 799 xvldrepl.h xr21, t0, 10 // 4017 xvmulev_xvmaddod_lasx xr14, 
xr17, xr21, xr20, xr3, xr11 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12 xvilvl.w xr14, xr11, xr3 xvilvl.w xr17, xr12, xr4 xvilvh.w xr11, xr11, xr3 xvilvh.w xr12, xr12, xr4 xvssrarni.h.w xr11, xr14, 12 // t8 xvssrarni.h.w xr12, xr17, 12 // t9 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17 xvilvl.w xr15, xr14, xr3 xvilvl.w xr7, xr17, xr4 xvilvh.w xr14, xr14, xr3 xvilvh.w xr17, xr17, xr4 xvssrarni.h.w xr14, xr15, 12 // t13 xvssrarni.h.w xr17, xr7, 12 // t12 xvldrepl.h xr20, t0, 12 // 3406 xvldrepl.h xr21, t0, 14 // 2276 xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15 xvilvl.w xr18, xr7, xr3 xvilvl.w xr19, xr15, xr4 xvilvh.w xr7, xr7, xr3 xvilvh.w xr15, xr15, xr4 xvssrarni.h.w xr7, xr18, 12 // t10 xvssrarni.h.w xr15, xr19, 12 // t11 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19 xvilvl.w xr13, xr18, xr3 xvilvl.w xr5, xr19, xr4 xvilvh.w xr18, xr18, xr3 xvilvh.w xr19, xr19, xr4 xvssrarni.h.w xr18, xr13, 12 // t15 xvssrarni.h.w xr19, xr5, 12 // t14 xvsadd.h xr5, xr1, xr0 // t0 xvssub.h xr13, xr1, xr0 // t4 xvsadd.h xr3, xr16, xr8 // t1 xvssub.h xr4, xr16, xr8 // t5 xvsadd.h xr0, xr6, xr2 // t2 xvssub.h xr1, xr6, xr2 // t6 xvsadd.h xr8, xr9, xr10 // t3 xvssub.h xr16, xr9, xr10 // t7 xvsadd.h xr2, xr11, xr17 // t8a xvssub.h xr6, xr11, xr17 // t12a xvsadd.h xr9, xr12, xr14 // t9a xvssub.h xr10, xr12, xr14 // t13a xvsadd.h xr11, xr7, xr19 // t10a xvssub.h xr17, xr7, xr19 // t14a xvsadd.h xr12, xr15, xr18 // t11a xvssub.h xr14, xr15, xr18 // t15a la.local t0, idct_coeffs_h xvldrepl.h xr20, t0, 4 // 1567 xvldrepl.h xr21, t0, 6 // 3784 xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19 
xvilvl.w xr13, xr18, xr7 xvilvl.w xr4, xr19, xr15 xvilvh.w xr18, xr18, xr7 xvilvh.w xr19, xr19, xr15 xvssrarni.h.w xr18, xr13, 12 // t4a xvssrarni.h.w xr19, xr4, 12 // t5a xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13 xvilvl.w xr16, xr4, xr7 xvilvl.w xr1, xr13, xr15 xvilvh.w xr4, xr4, xr7 xvilvh.w xr13, xr13, xr15 xvssrarni.h.w xr4, xr16, 12 // t7a xvssrarni.h.w xr13, xr1, 12 // t6a xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16 xvilvl.w xr6, xr1, xr7 xvilvl.w xr10, xr16, xr15 xvilvh.w xr1, xr1, xr7 xvilvh.w xr16, xr16, xr15 xvssrarni.h.w xr1, xr6, 12 // t12 xvssrarni.h.w xr16, xr10, 12 // t13 xvneg.h xr21, xr21 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6 xvneg.h xr20, xr20 xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10 xvilvl.w xr14, xr6, xr7 xvilvl.w xr17, xr10, xr15 xvilvh.w xr6, xr6, xr7 xvilvh.w xr10, xr10, xr15 xvssrarni.h.w xr6, xr14, 12 // t15 xvssrarni.h.w xr10, xr17, 12 // t14 xvsadd.h xr14, xr5, xr0 // out[0] xvssub.h xr17, xr5, xr0 // t2a xvssub.h xr7, xr3, xr8 // t3a xvsadd.h xr15, xr3, xr8 // out[15] xvsllwil.w.h xr22, xr15, 0 xvexth.w.h xr15, xr15 xvneg.w xr22, xr22 xvneg.w xr15, xr15 xvssrarni.h.w xr15, xr22, 0 // out[15] xvssub.h xr7, xr3, xr8 // t3a xvsadd.h xr3, xr19, xr4 // out[12] xvssub.h xr8, xr19, xr4 // t7 xvssub.h xr0, xr18, xr13 // t6 xvsadd.h xr5, xr18, xr13 // out[3] xvsllwil.w.h xr22, xr5, 0 xvexth.w.h xr5, xr5 xvneg.w xr22, xr22 xvneg.w xr5, xr5 xvssrarni.h.w xr5, xr22, 0 // out[3] xvsadd.h xr13, xr9, xr12 // out[14] xvssub.h xr19, xr9, xr12 // t11 xvssub.h xr4, xr2, xr11 // t10 xvsadd.h xr18, xr2, xr11 // out[1] xvsllwil.w.h xr22, xr18, 0 xvexth.w.h xr18, xr18 xvneg.w xr22, xr22 xvneg.w xr18, xr18 xvssrarni.h.w xr18, xr22, 0 // out[1] xvsadd.h xr2, xr1, xr10 // out[2] xvssub.h xr11, xr1, xr10 // t14a xvssub.h 
xr12, xr16, xr6 // t15a xvsadd.h xr9, xr16, xr6 // out[13] xvsllwil.w.h xr22, xr9, 0 xvexth.w.h xr9, xr9 xvneg.w xr22, xr22 xvneg.w xr9, xr9 xvssrarni.h.w xr9, xr22, 0 // out[13] xvldrepl.h xr20, t0, 0 // 2896 xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10 xvneg.h xr21, xr20 xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1 xvilvl.w xr17, xr10, xr6 xvilvl.w xr7, xr1, xr16 xvilvh.w xr10, xr10, xr6 xvilvh.w xr1, xr1, xr16 xvssrarni.h.w xr1, xr7, 12 // out[8] xvsrari.w xr17, xr17, 12 xvsrari.w xr10, xr10, 12 xvneg.w xr17, xr17 xvneg.w xr10, xr10 xvssrarni.h.w xr10, xr17, 0 // out[7] xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17 xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7 xvilvl.w xr0, xr17, xr16 xvilvl.w xr8, xr7, xr6 xvilvh.w xr17, xr17, xr16 xvilvh.w xr7, xr7, xr6 xvssrarni.h.w xr7, xr8, 12 // out[4] xvsrari.w xr0, xr0, 12 xvsrari.w xr17, xr17, 12 xvneg.w xr0, xr0 xvneg.w xr17, xr17 xvssrarni.h.w xr17, xr0, 0 // out[11] xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0 xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8 xvilvl.w xr4, xr0, xr16 xvilvl.w xr19, xr8, xr6 xvilvh.w xr0, xr0, xr16 xvilvh.w xr8, xr8, xr6 xvssrarni.h.w xr8, xr19, 12 // out[6] xvsrari.w xr4, xr4, 12 xvsrari.w xr0, xr0, 12 xvneg.w xr4, xr4 xvneg.w xr0, xr0 xvssrarni.h.w xr0, xr4, 0 // out[9] xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4 xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19 xvilvl.w xr11, xr4, xr6 xvilvl.w xr12, xr19, xr16 xvilvh.w xr4, xr4, xr6 xvilvh.w xr19, xr19, xr16 xvssrarni.h.w xr19, xr12, 12 // out[10] xvsrari.w xr11, xr11, 12 xvsrari.w xr4, xr4, 12 xvneg.w xr11, xr11 xvneg.w xr4, xr4 xvssrarni.h.w xr4, xr11, 0 // out[5] .endm function inv_txfm_add_adst_adst_16x16_8bpc_lasx PUSH_REG xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15 inv_adst16_lasx LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ xr20, 
xr21, xr22, xr23, xr24, xr25, xr26, xr27 LASX_TRANSPOSE8x8_H xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27 xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2 xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21 xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21 xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21 xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21 inv_adst16_lasx xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ xr14, xr18, xr11, xr5, xr7, xr4, xr8, xr10, \ xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4 xvxor.v xr23, xr23, xr23 .irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480 xvst xr23, a2, \i .endr alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17 alsl.d a0, a1, a0, 2 alsl.d t2, a1, a0, 1 XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15 POP_REG endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/itx.h000066400000000000000000000115611517466257200235700ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_LOONGARCH_ITX_H
#define DAV2D_SRC_LOONGARCH_ITX_H

#include "src/cpu.h"
#include "src/itx.h"

/*
 * Prototypes for the LSX inverse-transform entry points that the init
 * function in this header installs.
 *
 * The decl_itx*_fns(w, h, ext) lines declare a whole family of transforms
 * for one block size; the decl_itx_fn(BF(name, ext)) lines declare a single
 * routine each. The decl_* and BF macros are not defined in this header --
 * presumably they come from "src/itx.h"; confirm there how many prototypes
 * each family macro expands to.
 */
decl_itx17_fns( 4,  4, lsx);
decl_itx16_fns( 4,  8, lsx);
decl_itx16_fns( 4, 16, lsx);
decl_itx16_fns( 8,  4, lsx);
decl_itx16_fns( 8,  8, lsx);
decl_itx16_fns( 8, 16, lsx);
decl_itx2_fns ( 8, 32, lsx);
decl_itx16_fns(16,  8, lsx);

/* 16x4: only three transform pairs are declared individually. */
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_16x4, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_identity_identity_16x4, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_adst_dct_16x4, lsx));

/* 16x16: dct/adst/flipadst combinations declared one by one. */
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_adst_adst_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_adst_dct_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_adst_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_flipadst_dct_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_flipadst_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_adst_flipadst_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_flipadst_adst_16x16, lsx));
decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_16x32, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_32x8, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_32x16, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_32x32, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x32, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x64, lsx)); decl_itx_fn(BF(dav2d_inv_txfm_add_adst_adst_16x16, lasx)); static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav2dInvTxfmDSPContext *const c, int bpc) { #if BITDEPTH == 8 const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; if (BITDEPTH != 8 ) return; assign_itx17_fn( , 4, 4, lsx); assign_itx16_fn(R, 4, 8, lsx); assign_itx16_fn(R, 4, 16, lsx); assign_itx16_fn(R, 8, 4, lsx); assign_itx16_fn( , 8, 8, lsx); assign_itx16_fn(R, 8, 16, lsx); assign_itx2_fn (R, 8, 32, lsx); assign_itx16_fn(R, 16, 8, lsx); assign_itx1_fn (R, 64, 32, lsx); assign_itx1_fn ( , 64, 64, lsx); c->itxfm_add[RTX_16X4][DCT_DCT] = dav2d_inv_txfm_add_dct_dct_16x4_8bpc_lsx; c->itxfm_add[RTX_16X4][IDTX] = dav2d_inv_txfm_add_identity_identity_16x4_8bpc_lsx; c->itxfm_add[RTX_16X4][DCT_ADST] = dav2d_inv_txfm_add_adst_dct_16x4_8bpc_lsx; c->itxfm_add[TX_16X16][DCT_DCT] = dav2d_inv_txfm_add_dct_dct_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][ADST_ADST] = dav2d_inv_txfm_add_adst_adst_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][DCT_ADST] = dav2d_inv_txfm_add_adst_dct_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][ADST_DCT] = dav2d_inv_txfm_add_dct_adst_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav2d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav2d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][FLIPADST_ADST] = dav2d_inv_txfm_add_adst_flipadst_16x16_8bpc_lsx; c->itxfm_add[TX_16X16][ADST_FLIPADST] = dav2d_inv_txfm_add_flipadst_adst_16x16_8bpc_lsx; c->itxfm_add[RTX_16X32][DCT_DCT] = dav2d_inv_txfm_add_dct_dct_16x32_8bpc_lsx; c->itxfm_add[RTX_32X8][DCT_DCT] = 
dav2d_inv_txfm_add_dct_dct_32x8_8bpc_lsx; c->itxfm_add[RTX_32X16][DCT_DCT] = dav2d_inv_txfm_add_dct_dct_32x16_8bpc_lsx; c->itxfm_add[TX_32X32][DCT_DCT] = dav2d_inv_txfm_add_dct_dct_32x32_8bpc_lsx; if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LASX)) return; if (BITDEPTH != 8 ) return; c->itxfm_add[TX_16X16][ADST_ADST] = dav2d_inv_txfm_add_adst_adst_16x16_8bpc_lasx; #endif } #endif /* DAV2D_SRC_LOONGARCH_ITX_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/loongson_asm.S000066400000000000000000000532671517466257200254460ustar00rootroot00000000000000/********************************************************************* * Copyright (c) 2022 Loongson Technology Corporation Limited * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn) * Shiyou Yin(yinshiyou-hf@loongson.cn) * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *********************************************************************/ /* * This file is a LoongArch assembly helper file and available under ISC * license. It provides a large number of macros and alias to simplify * writing assembly code, especially for LSX and LASX optimizations. * * Any one can modify it or add new features for his/her own purposes. * Contributing a patch will be appreciated as it might be useful for * others as well. 
Send patches to loongson contributor mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */

/* Version of this helper file, split into the three fields described above. */
#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0

/*
 * Default for the `align` argument of the `function` and `const` macros
 * below; the value is handed unchanged to the `.align` directive.
 * NOTE(review): presumably interpreted as a power-of-two exponent on this
 * target (i.e. 32-byte alignment) -- confirm against the assembler manual.
 */
#define DEFAULT_ALIGN    5

/* Set prefix as needed. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX   dav2d_
#endif

/* Two-level paste so that macro arguments are expanded before `##` joins them. */
#define PASTE(a,b)       a ## b
#define CONCAT(a,b)      PASTE(a,b)

/*
 * ASM_PREF is the text glued in front of every symbol emitted by `function`:
 * PRIVATE_PREFIX, with an extra leading underscore when PREFIX is defined.
 */
#ifdef PREFIX
#define ASM_PREF         CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF         PRIVATE_PREFIX
#endif

/*
 * function name [, align]
 *
 * Opens a function called ASM_PREF<name>: switches to .text, applies
 * `.align \align`, marks the symbol .globl and .hidden, gives it type
 * @function and places its label.
 *
 * As a side effect it defines a one-shot `endfunc` macro. `endfunc` emits
 * `jirl $r0, $r1, 0x0` (jump to the address held in $r1 with the link value
 * written to $r0), records the symbol size with `.size`, and then removes
 * itself with `.purgem`, so every `function` must be closed by exactly one
 * `endfunc` before the next `function` is opened.
 */
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl    $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
    .purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.hidden ASM_PREF\name ;
.type  ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm

/*
 * const name [, align]
 *
 * Opens a data object in .rodata: applies `.align \align` and places the
 * label `name`. Unlike `function`, the label is used as written (no
 * ASM_PREF) and no .globl/.hidden directive is emitted here.
 *
 * Defines a one-shot `endconst` macro that records the object size with
 * `.size` and then purges itself.
 */
.macro  const name, align=DEFAULT_ALIGN
    .macro endconst
    .size  \name, . - \name
    .purgem endconst
    .endm
.section .rodata
.align   \align
\name:
.endm

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

/*
 * Every alias maps a bare register name onto its `$`-prefixed spelling, so
 * the rest of the code can write e.g. `a0` instead of `$a0`.
 */

/* Integer registers: a0-a7, t0-t8, s0-s8, plus zero, sp and ra. */
#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp   $sp
#define ra   $ra

/* Floating-point registers: fa0-fa7 and ft0-ft12 (the ft list continues below). */
#define fa0  $fa0
#define fa1  $fa1
#define fa2  $fa2
#define fa3  $fa3
#define fa4  $fa4
#define fa5  $fa5
#define fa6  $fa6
#define fa7  $fa7
#define ft0  $ft0
#define ft1  $ft1
#define ft2  $ft2
#define ft3  $ft3
#define ft4  $ft4
#define ft5  $ft5
#define ft6  $ft6
#define ft7  $ft7
#define ft8  $ft8
#define ft9  $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
/* Remaining floating-point aliases by name: ft13-ft15 and fs0-fs7. */
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0  $fs0
#define fs1  $fs1
#define fs2  $fs2
#define fs3  $fs3
#define fs4  $fs4
#define fs5  $fs5
#define fs6  $fs6
#define fs7  $fs7

/* Floating-point registers by number: f0-f31. */
#define f0  $f0
#define f1  $f1
#define f2  $f2
#define f3  $f3
#define f4  $f4
#define f5  $f5
#define f6  $f6
#define f7  $f7
#define f8  $f8
#define f9  $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31

/* Vector registers used by the LSX (v-prefixed) instructions: vr0-vr31. */
#define vr0  $vr0
#define vr1  $vr1
#define vr2  $vr2
#define vr3  $vr3
#define vr4  $vr4
#define vr5  $vr5
#define vr6  $vr6
#define vr7  $vr7
#define vr8  $vr8
#define vr9  $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31

/* Vector registers used by the LASX (xv-prefixed) instructions: xr0-xr31. */
#define xr0  $xr0
#define xr1  $xr1
#define xr2  $xr2
#define xr3  $xr3
#define xr4  $xr4
#define xr5  $xr5
#define xr6  $xr6
#define xr7  $xr7
#define xr8  $xr8
#define xr9  $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize
instructions
 *============================================================================
 */

/*
 * Description : Dot product of adjacent element pairs
 *               (byte inputs for the .h.* forms, halfword inputs for .w.h)
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - twice the width of the inputs
 * Details     : the even-indexed products are written to vd first, then the
 *               odd-indexed products are accumulated on top, so the previous
 *               contents of vd are discarded.
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu      \vd,    \vj,    \vk
    vmaddwod.h.bu     \vd,    \vj,    \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b    \vd,    \vj,    \vk
    vmaddwod.h.bu.b   \vd,    \vj,    \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h       \vd,    \vj,    \vk
    vmaddwod.w.h      \vd,    \vj,    \vk
.endm

/* LASX counterparts of the three macros above. */
.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu    \xd,    \xj,    \xk
    xvmaddwod.h.bu   \xd,    \xj,    \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b    \xd,  \xj,    \xk
    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h       \xd,  \xj,    \xk
    xvmaddwod.w.h      \xd,  \xj,    \xk
.endm

/*
 * Description : Dot product of adjacent element pairs, accumulated into vd
 *               (byte inputs for the .h.* forms, halfword inputs for .w.h)
 * Arguments   : Inputs  - vj, vk, and the running sum already held in vd
 *               Outputs - vd
 *               Return Type - twice size of input
 * Details     : both steps are multiply-add, so vd must hold a meaningful
 *               value (e.g. zero or a previous partial sum) on entry.
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu     \vd,    \vj,    \vk
    vmaddwod.h.bu     \vd,    \vj,    \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b   \vd,    \vj,    \vk
    vmaddwod.h.bu.b   \vd,    \vj,    \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h      \vd,    \vj,    \vk
    vmaddwod.w.h      \vd,    \vj,    \vk
.endm

/* LASX counterparts (no xvdp2add.h.bu form is defined in this group). */
.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b   \xd,  \xj,    \xk
    xvmaddwod.h.bu.b   \xd,  \xj,    \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h      \xd,  \xj,    \xk
    xvmaddwod.w.h      \xd,  \xj,    \xk
.endm

/*
 * Description : Clamp each element vj[i] to the range vk[i] ~ va[i]
 *               (vk is the lower bound, va the upper bound)
 *               clip: vd = min(max(vj, vk), va), element-wise and signed
 * Details     : the max is applied first, so if vk[i] > va[i] the result
 *               is va[i].
 */
.macro vclip.h  vd,  vj, vk, va
    vmax.h    \vd,  \vj,   \vk
    vmin.h    \vd,  \vd,   \va
.endm

.macro vclip.w  vd,  vj, vk, va
    vmax.w    \vd,  \vj,   \vk
    vmin.w    \vd,  \vd,   \va
.endm

.macro xvclip.h  xd,  xj, xk, xa
    xvmax.h    \xd,  \xj,   \xk
    xvmin.h    \xd,  \xd,   \xa
.endm

.macro xvclip.w  xd,  xj, xk, xa
    xvmax.w    \xd,  \xj,   \xk
    xvmin.w    \xd,  \xd,   \xa
.endm

/*
 * Description : Range element vj[i] to 0 ~ 255
 * clip255: vj < 255 ? vj : 255 && vj > 0 ?
vj : 0
 */
/*
 * Implementation: a signed max with 0 removes negative values, then an
 * unsigned saturate with immediate 7 caps what is left, giving the
 * 0 ~ 255 range stated above.
 */
.macro vclip255.h  vd, vj
    vmaxi.h   \vd,   \vj,   0
    vsat.hu   \vd,   \vd,   7
.endm

.macro vclip255.w  vd, vj
    vmaxi.w   \vd,   \vj,   0
    vsat.wu   \vd,   \vd,   7
.endm

.macro xvclip255.h  xd, xj
    xvmaxi.h   \xd,   \xj,   0
    xvsat.hu   \xd,   \xd,   7
.endm

.macro xvclip255.w  xd, xj
    xvmaxi.w   \xd,   \xj,   0
    xvsat.wu   \xd,   \xd,   7
.endm

/*
 * Description : Store one element of a vector at an indexed address
 * vd : Data vector to be stored
 * rk : Address of data storage
 * ra : Offset of address
 * si : Index of data in vd
 *
 * Side effect : rk is advanced by ra BEFORE the store (rk += ra, then the
 *               element is written at offset 0 from the new rk), so rk is
 *               modified by every invocation.
 *
 * NOTE(review): the parameter name `ra` is also one of the register aliases
 * #define'd earlier in this file; presumably the preprocessor rewrites the
 * parameter and its `\ra` uses consistently -- confirm before renaming or
 * reusing this pattern.
 */
.macro vstelmx.b vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.b   \vd,  \rk,  0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.h   \vd,  \rk,  0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.w   \vd,  \rk,  0, \si
.endm

.macro vstelmx.d  vd, rk, ra, si
    add.d      \rk,  \rk,  \ra
    vstelm.d   \vd,  \rk,  0, \si
.endm

/* Register copy xd = xj, expressed as a bitwise OR of xj with itself (LSX). */
.macro vmov xd, xj
    vor.v  \xd,  \xj,  \xj
.endm

/* Register copy xd = xj, expressed as a bitwise OR of xj with itself (LASX). */
.macro xmov xd, xj
    xvor.v  \xd,  \xj,  \xj
.endm

/* LASX form of vstelmx.d; it advances rk by ra in the same way. */
.macro xvstelmx.d  xd, rk, ra, si
    add.d      \rk, \rk,  \ra
    xvstelm.d  \xd, \rk,  0, \si
.endm

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128, v256 elements with stride.
*/ .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 fld.s \out0, \src, 0 fldx.s \out1, \src, \stride fldx.s \out2, \src, \stride2 fldx.s \out3, \src, \stride3 .endm .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 fld.d \out0, \src, 0 fldx.d \out1, \src, \stride fldx.d \out2, \src, \stride2 fldx.d \out3, \src, \stride3 .endm .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 vld \out0, \src, 0 vldx \out1, \src, \stride vldx \out2, \src, \stride2 vldx \out3, \src, \stride3 .endm .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 xvld \out0, \src, 0 xvldx \out1, \src, \stride xvldx \out2, \src, \stride2 xvldx \out3, \src, \stride3 .endm /* * Description : Transpose 4x4 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 vilvl.h \tmp0, \in1, \in0 vilvl.h \tmp1, \in3, \in2 vilvl.w \out0, \tmp1, \tmp0 vilvh.w \out2, \tmp1, \tmp0 vilvh.d \out1, \out0, \out0 vilvh.d \out3, \out0, \out2 .endm /* * Description : Transpose 4x4 block with word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 * Details : * Example : * 1, 2, 3, 4 1, 5, 9,13 * 5, 6, 7, 8 to 2, 6,10,14 * 9,10,11,12 =====> 3, 7,11,15 * 13,14,15,16 4, 8,12,16 */ .macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 vilvl.w \tmp0, \in1, \in0 vilvh.w \out1, \in1, \in0 vilvl.w \tmp1, \in3, \in2 vilvh.w \out3, \in3, \in2 vilvl.d \out0, \tmp1, \tmp0 vilvl.d \out2, \out3, \out1 vilvh.d \out3, \out3, \out1 vilvh.d \out1, \tmp1, \tmp0 .endm /* * Description : Transpose 8x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, 
out1, \ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ tmp3, tmp4, tmp5, tmp6, tmp7 vilvl.h \tmp0, \in6, \in4 vilvl.h \tmp1, \in7, \in5 vilvl.h \tmp2, \in2, \in0 vilvl.h \tmp3, \in3, \in1 vilvl.h \tmp4, \tmp1, \tmp0 vilvh.h \tmp5, \tmp1, \tmp0 vilvl.h \tmp6, \tmp3, \tmp2 vilvh.h \tmp7, \tmp3, \tmp2 vilvh.h \tmp0, \in6, \in4 vilvh.h \tmp1, \in7, \in5 vilvh.h \tmp2, \in2, \in0 vilvh.h \tmp3, \in3, \in1 vpickev.d \out0, \tmp4, \tmp6 vpickod.d \out1, \tmp4, \tmp6 vpickev.d \out2, \tmp5, \tmp7 vpickod.d \out3, \tmp5, \tmp7 vilvl.h \tmp4, \tmp1, \tmp0 vilvh.h \tmp5, \tmp1, \tmp0 vilvl.h \tmp6, \tmp3, \tmp2 vilvh.h \tmp7, \tmp3, \tmp2 vpickev.d \out4, \tmp4, \tmp6 vpickod.d \out5, \tmp4, \tmp6 vpickev.d \out6, \tmp5, \tmp7 vpickod.d \out7, \tmp5, \tmp7 .endm /* * Description : Transpose 16x8 block with byte elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15, \ out0, out1, out2, out3, out4, out5, out6, out7,\ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 xvilvl.b \tmp0, \in2, \in0 xvilvl.b \tmp1, \in3, \in1 xvilvl.b \tmp2, \in6, \in4 xvilvl.b \tmp3, \in7, \in5 xvilvl.b \tmp4, \in10, \in8 xvilvl.b \tmp5, \in11, \in9 xvilvl.b \tmp6, \in14, \in12 xvilvl.b \tmp7, \in15, \in13 xvilvl.b \out0, \tmp1, \tmp0 xvilvh.b \out1, \tmp1, \tmp0 xvilvl.b \out2, \tmp3, \tmp2 xvilvh.b \out3, \tmp3, \tmp2 xvilvl.b \out4, \tmp5, \tmp4 xvilvh.b \out5, \tmp5, \tmp4 xvilvl.b \out6, \tmp7, \tmp6 xvilvh.b \out7, \tmp7, \tmp6 xvilvl.w \tmp0, \out2, \out0 xvilvh.w \tmp2, \out2, \out0 xvilvl.w \tmp4, \out3, \out1 xvilvh.w \tmp6, \out3, \out1 xvilvl.w \tmp1, \out6, \out4 xvilvh.w \tmp3, \out6, \out4 xvilvl.w \tmp5, \out7, \out5 xvilvh.w \tmp7, \out7, \out5 xvilvl.d \out0, \tmp1, \tmp0 xvilvh.d \out1, \tmp1, \tmp0 xvilvl.d \out2, \tmp3, \tmp2 xvilvh.d \out3, \tmp3, \tmp2 xvilvl.d \out4, 
\tmp5, \tmp4 xvilvh.d \out5, \tmp5, \tmp4 xvilvl.d \out6, \tmp7, \tmp6 xvilvh.d \out7, \tmp7, \tmp6 .endm /* * Description : Transpose 4x4 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 xvilvl.h \tmp0, \in1, \in0 xvilvl.h \tmp1, \in3, \in2 xvilvl.w \out0, \tmp1, \tmp0 xvilvh.w \out2, \tmp1, \tmp0 xvilvh.d \out1, \out0, \out0 xvilvh.d \out3, \out0, \out2 .endm /* * Description : Transpose 4x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 xvilvl.h \tmp0, \in2, \in0 xvilvl.h \tmp1, \in3, \in1 xvilvl.h \out2, \tmp1, \tmp0 xvilvh.h \out3, \tmp1, \tmp0 xvilvl.d \out0, \out2, \out2 xvilvh.d \out1, \out2, \out2 xvilvl.d \out2, \out3, \out3 xvilvh.d \out3, \out3, \out3 .endm /* * Description : Transpose 8x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7, \ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 xvilvl.h \tmp0, \in6, \in4 xvilvl.h \tmp1, \in7, \in5 xvilvl.h \tmp2, \in2, \in0 xvilvl.h \tmp3, \in3, \in1 xvilvl.h \tmp4, \tmp1, \tmp0 xvilvh.h \tmp5, \tmp1, \tmp0 xvilvl.h \tmp6, \tmp3, \tmp2 xvilvh.h \tmp7, \tmp3, \tmp2 xvilvh.h \tmp0, \in6, \in4 xvilvh.h \tmp1, \in7, \in5 xvilvh.h \tmp2, \in2, \in0 xvilvh.h \tmp3, \in3, \in1 xvpickev.d \out0, \tmp4, \tmp6 xvpickod.d \out1, \tmp4, \tmp6 xvpickev.d \out2, \tmp5, \tmp7 xvpickod.d \out3, \tmp5, \tmp7 xvilvl.h \tmp4, \tmp1, \tmp0 xvilvh.h \tmp5, \tmp1, \tmp0 xvilvl.h \tmp6, \tmp3, \tmp2 xvilvh.h \tmp7, \tmp3, \tmp2 xvpickev.d \out4, \tmp4, \tmp6 xvpickod.d \out5, \tmp4, \tmp6 
xvpickev.d \out6, \tmp5, \tmp7
    xvpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1, tmp2 (overwritten)
 * Note        : out1 is written before in2/in3 are read, so out1 must not
 *               name the same register as in2 or in3.
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3
    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1
    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1
    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 * Details     :
 * Example     :
 *          1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13,  1,5, 9,13
 *          5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14,  2,6,10,14
 *          9,10,11,12,  9,10,11,12 =====> 3,7,11,15,  3,7,11,15
 *         13,14,15,16, 13,14,15,16        4,8,12,16,  4,8,12,16
 * Note        : out1 is written before in2/in3 are read, so out1 must not
 *               name the same register as in2 or in3.
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.w \tmp0, \in1, \in0
    xvilvh.w \out1, \in1, \in0
    xvilvl.w \tmp1, \in3, \in2
    xvilvh.w \out3, \in3, \in2
    xvilvl.d \out0, \tmp1, \tmp0
    xvilvl.d \out2, \out3, \out1
    xvilvh.d \out3, \out3, \out1
    xvilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0, tmp1, tmp2, tmp3 (overwritten)
 * Example     : LASX_TRANSPOSE8x8_W
 *         in0 : 1,2,3,4,5,6,7,8
 *         in1 : 2,2,3,4,5,6,7,8
 *         in2 : 3,2,3,4,5,6,7,8
 *         in3 : 4,2,3,4,5,6,7,8
 *         in4 : 5,2,3,4,5,6,7,8
 *         in5 : 6,2,3,4,5,6,7,8
 *         in6 : 7,2,3,4,5,6,7,8
 *         in7 : 8,2,3,4,5,6,7,8
 *
 *        out0 : 1,2,3,4,5,6,7,8
 *        out1 : 2,2,2,2,2,2,2,2
 *        out2 : 3,3,3,3,3,3,3,3
 *        out3 : 4,4,4,4,4,4,4,4
 *        out4 : 5,5,5,5,5,5,5,5
 *        out5 : 6,6,6,6,6,6,6,6
 *        out6 : 7,7,7,7,7,7,7,7
 *        out7 : 8,8,8,8,8,8,8,8
 * Note        : out0-out3 are written before in4-in7 are read, so out0-out3
 *               must not name the same registers as in4-in7.
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    /* rows 0-3 */
    xvilvl.w \tmp0, \in2, \in0
    xvilvl.w \tmp1, \in3, \in1
    xvilvh.w \tmp2, \in2, \in0
    xvilvh.w \tmp3, \in3, \in1
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out1, \tmp1, \tmp0
    xvilvl.w \out2, \tmp3, \tmp2
    xvilvh.w \out3, \tmp3, \tmp2
    /* rows 4-7 */
    xvilvl.w \tmp0, \in6, \in4
    xvilvl.w \tmp1, \in7, \in5
    xvilvh.w \tmp2, \in6, \in4
    xvilvh.w \tmp3, \in7, \in5
    xvilvl.w \out4, \tmp1, \tmp0
    xvilvh.w \out5, \tmp1, \tmp0
    xvilvl.w \out6, \tmp3, \tmp2
    xvilvh.w \out7, \tmp3, \tmp2
    /* keep copies of out0-out3: they are overwritten by the first four
     * xvpermi.q below but still needed as sources for out4-out7
     * (xmov is not defined in this part of the file) */
    xmov \tmp0, \out0
    xmov \tmp1, \out1
    xmov \tmp2, \out2
    xmov \tmp3, \out3
    xvpermi.q \out0, \out4, 0x02
    xvpermi.q \out1, \out5, 0x02
    xvpermi.q \out2, \out6, 0x02
    xvpermi.q \out3, \out7, 0x02
    xvpermi.q \out4, \tmp0, 0x31
    xvpermi.q \out5, \tmp1, 0x31
    xvpermi.q \out6, \tmp2, 0x31
    xvpermi.q \out7, \tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 * Example     : LASX_TRANSPOSE4x4_D
 *         in0 : 1,2,3,4
 *         in1 : 1,2,3,4
 *         in2 : 1,2,3,4
 *         in3 : 1,2,3,4
 *
 *        out0 : 1,1,1,1
 *        out1 : 2,2,2,2
 *        out2 : 3,3,3,3
 *        out3 : 4,4,4,4
 * Note        : out1 is written before in2/in3 are read, so out1 must not
 *               name the same register as in2 or in3.
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.d \tmp0, \in1, \in0
    xvilvh.d \out1, \in1, \in0
    xvilvh.d \tmp1, \in3, \in2
    xvilvl.d \out2, \in3, \in2
    /* xvor.v with identical sources is a plain register copy */
    xvor.v \out0, \tmp0, \tmp0
    xvor.v \out3, \tmp1, \tmp1
    xvpermi.q \out0, \out2, 0x02
    xvpermi.q \out2, \tmp0, 0x31
    xvpermi.q \out3, \out1, 0x31
    xvpermi.q \out1, \tmp1, 0x02
.endm
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/loongson_util.S000066400000000000000000000204111517466257200256240ustar00rootroot00000000000000/******************************************************************************
 * Copyright © 2024, VideoLAN and dav2d authors
 * Copyright © 2024, Loongson Technology Corporation Limited
 * All rights reserved.
*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#ifndef DAV2D_SRC_LOONGSON_UTIL_S
#define DAV2D_SRC_LOONGSON_UTIL_S

#ifndef DEFAULT_ALIGN
#define DEFAULT_ALIGN    5
#endif

// functionl: the trailing "l" stands for "local". It opens a function in
// .text whose symbol is marked .hidden and typed @function, aligned with
// ".align \align" (DEFAULT_ALIGN unless overridden).
// It also defines a one-shot macro "endfuncl" that must close the function:
// endfuncl emits "jirl $r0, $r1, 0x0" (presumably the return through ra, so a
// body may simply fall through to endfuncl), sets the symbol size, and then
// purges itself so the next functionl can define it again.
.macro functionl name, align=DEFAULT_ALIGN
    .macro endfuncl
        jirl    $r0, $r1, 0x0
        .size   \name, . - \name
        .purgem endfuncl
    .endm
    .text ;
    .align \align ;
    .hidden \name ;
    .type \name, @function ;
\name: ;
.endm

// TRANSPOSE_4x16B: byte shuffle over four LSX vectors built from
// vpackev/vpackod at byte and then half-word granularity.
//   in0-in3 : inputs, overwritten with the result
//   in4-in7 : scratch (overwritten)
// NOTE(review): the exact element layout of the result is not documented
// here; check against a caller before relying on it.
.macro TRANSPOSE_4x16B in0, in1 ,in2, in3, in4, in5, in6, in7
    vpackev.b \in4, \in1, \in0
    vpackod.b \in5, \in1, \in0
    vpackev.b \in6, \in3, \in2
    vpackod.b \in7, \in3, \in2

    vpackev.h \in0, \in6, \in4
    vpackod.h \in2, \in6, \in4
    vpackev.h \in1, \in7, \in5
    vpackod.h \in3, \in7, \in5
.endm

// TRANSPOSE_8x16B: byte shuffle over eight LSX vectors built from
// vpackev/vpackod at byte, half-word and word granularity.
//   in0-in7 : inputs, overwritten with the result
//   in8-in9 : scratch (overwritten)
// The statement order matters: several registers are reused as soon as their
// previous value has been consumed.
.macro TRANSPOSE_8x16B in0, in1, in2, in3, in4, in5, in6, in7, in8, in9
    vpackev.b \in8, \in1, \in0
    vpackod.b \in9, \in1, \in0
    vpackev.b \in1, \in3, \in2
    vpackod.b \in3, \in3, \in2
    vpackev.b \in0, \in5, \in4
    vpackod.b \in5, \in5, \in4
    vpackev.b \in2, \in7, \in6
    vpackod.b \in7, \in7, \in6

    vpackev.h \in4, \in2, \in0
    vpackod.h \in2, \in2, \in0
    vpackev.h \in6, \in7, \in5
    vpackod.h \in7, \in7, \in5
    vpackev.h \in5, \in3, \in9
    vpackod.h \in9, \in3, \in9
    vpackev.h \in3, \in1, \in8
    vpackod.h \in8, \in1, \in8

    vpackev.w \in0, \in4, \in3
    vpackod.w \in4, \in4, \in3
    vpackev.w \in1, \in6, \in5
    vpackod.w \in5, \in6, \in5
    vpackod.w \in6, \in2, \in8
    vpackev.w \in2, \in2, \in8
    vpackev.w \in3, \in7, \in9
    vpackod.w \in7, \in7, \in9
.endm

// vld_x8: load eight LSX vectors in0-in7 from base register \src at offsets
// \start + \stride * k (k = 0..7). The offsets are written as immediate
// expressions, so \start and \stride are expected to be assemble-time
// constants.
.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vld \in0, \src, \start
    vld \in1, \src, \start+(\stride*1)
    vld \in2, \src, \start+(\stride*2)
    vld \in3, \src, \start+(\stride*3)
    vld \in4, \src, \start+(\stride*4)
    vld \in5, \src, \start+(\stride*5)
    vld \in6, \src, \start+(\stride*6)
    vld \in7, \src, \start+(\stride*7)
.endm

// vst_x8: store eight LSX vectors in0-in7 at \start + \stride * k (k = 0..7)
// from base register \src (the base of the destination, despite its name).
.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vst \in0, \src, \start
    vst \in1, \src, \start+(\stride*1)
    vst \in2, \src, \start+(\stride*2)
    vst \in3, \src, \start+(\stride*3)
    vst \in4, \src, \start+(\stride*4)
    vst \in5, \src, \start+(\stride*5)
    vst \in6, \src, \start+(\stride*6)
    vst \in7, \src, \start+(\stride*7)
.endm

// vld_x16: sixteen-vector variant of vld_x8 (k = 0..15).
.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vld \in8, \src, \start+(\stride*8)
    vld \in9, \src, \start+(\stride*9)
    vld \in10, \src, \start+(\stride*10)
    vld \in11, \src, \start+(\stride*11)
    vld \in12, \src, \start+(\stride*12)
    vld \in13, \src, \start+(\stride*13)
    vld \in14, \src, \start+(\stride*14)
    vld \in15, \src, \start+(\stride*15)
.endm

// vst_x16: sixteen-vector variant of vst_x8 (k = 0..15).
.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vst \in8, \src, \start+(\stride*8)
    vst \in9, \src, \start+(\stride*9)
    vst \in10, \src, \start+(\stride*10)
    vst \in11, \src, \start+(\stride*11)
    vst \in12, \src, \start+(\stride*12)
    vst \in13, \src, \start+(\stride*13)
    vst \in14, \src, \start+(\stride*14)
    vst \in15, \src, \start+(\stride*15)
.endm

// xvld_x8: LASX counterpart of vld_x8. The offsets are the same multiples
// \stride * k (k = 0..7), spelled with shifts and adds for some k.
.macro xvld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    xvld \in0, \src, \start
    xvld \in1, \src, \start+(\stride)
    xvld \in2, \src, \start+(\stride<<1)
    xvld \in3, \src, \start+(\stride<<1)+(\stride)
    xvld \in4, \src, \start+(\stride<<2)
    xvld \in5, \src, \start+(\stride<<2)+(\stride)
    xvld \in6, \src, \start+(\stride*6)
    xvld \in7, \src, \start+(\stride<<3)-(\stride)
.endm

// xvst_x8: LASX counterpart of vst_x8 (\src is the destination base).
.macro xvst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    xvst \in0, \src, \start
    xvst \in1, \src, \start+(\stride)
    xvst \in2, \src, \start+(\stride<<1)
    xvst \in3, \src, \start+(\stride<<1)+(\stride)
    xvst \in4, \src, \start+(\stride<<2)
    xvst \in5, \src, \start+(\stride<<2)+(\stride)
    xvst \in6, \src, \start+(\stride*6)
    xvst \in7, \src, \start+(\stride<<3)-(\stride)
.endm

// xvld_x16: sixteen-vector variant of xvld_x8 (k = 0..15).
.macro xvld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15

    xvld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    xvld \in8, \src, \start+(\stride<<3)
    xvld \in9, \src, \start+(\stride<<3)+(\stride)
    xvld \in10, \src, \start+(\stride*10)
    xvld \in11, \src, \start+(\stride*11)
    xvld \in12, \src, \start+(\stride*12)
    xvld \in13, \src, \start+(\stride*13)
    xvld \in14, \src, \start+(\stride*14)
    xvld \in15, \src, \start+(\stride<<4)-(\stride)
.endm

// xvst_x16: sixteen-vector variant of xvst_x8 (k = 0..15).
.macro xvst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
                in8, in9, in10, in11, in12, in13, in14, in15

    xvst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    xvst \in8, \src, \start+(\stride<<3)
    xvst \in9, \src, \start+(\stride<<3)+(\stride)
    xvst \in10, \src, \start+(\stride*10)
    xvst \in11, \src, \start+(\stride*11)
    xvst \in12, \src, \start+(\stride*12)
    xvst \in13, \src, \start+(\stride*13)
    xvst \in14, \src, \start+(\stride*14)
    xvst \in15, \src, \start+(\stride<<4)-(\stride)
.endm

#endif /* DAV2D_SRC_LOONGSON_UTIL_S */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/loopfilter.S000066400000000000000000001262021517466257200251150ustar00rootroot00000000000000/*
 * Copyright © 2023, VideoLAN and dav2d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

// FILTER wd: emits the local function lpf_16_wd<wd>_lsx, the loop-filter
// core working on 16 byte lanes at once.
//
// Register roles, as labelled by the inline comments below:
//   vr17..vr23 : p6, p5, p4, p3, p2, p1, p0
//   vr24..vr30 : q0, q1, q2, q3, q4, q5, q6
//   vr10, vr11, vr12 : thresholds E, I and the hev threshold
//   vr13, vr14, vr15 : per-lane masks "wd >= 4", "wd > 4", "wd == 16";
//                      they are set up by the caller, outside this macro
//   vr0..vr9   : scratch
//
// depending on how many pixels need to be stored, returns:
// t4 = (1 << 0) : 0 pixels
// t4 = (1 << 4) : inner 4 pixels
// t4 = (1 << 6) : inner 6 pixels
// t4 = 0 : all pixels
.macro FILTER wd
functionl lpf_16_wd\wd\()_lsx
    vabsd.bu vr0, vr22, vr23 // abs(p1 - p0)
    vabsd.bu vr1, vr25, vr24 // abs(q1 - q0)
    vabsd.bu vr2, vr23, vr24 // abs(p0 - q0)
    vabsd.bu vr3, vr22, vr25 // abs(p1 - q1)
.if \wd >= 6
    vabsd.bu vr4, vr21, vr22 // abs(p2 - p1)
    vabsd.bu vr5, vr26, vr25 // abs(q2 - q1)
.endif
.if \wd >= 8
    vabsd.bu vr6, vr20, vr21 // abs(p3 - p2)
    vabsd.bu vr7, vr27, vr26 // abs(q3 - q2)
.endif
.if \wd >= 6
    vmax.bu vr4, vr4, vr5
.endif
    vsadd.bu vr2, vr2, vr2 // abs(p0 - q0) * 2
.if \wd >= 8
    vmax.bu vr6, vr6, vr7
.endif
    vsrli.b vr3, vr3, 1 // abs(p1 - q1) >> 1
.if \wd >= 8
    vmax.bu vr4, vr4, vr6
.endif
.if \wd >= 6
    vand.v vr4, vr4, vr14
.endif
    vmax.bu vr0, vr0, vr1 // max(abs(p1 - p0), abs(q1 - q0))
    vsadd.bu vr2, vr2, vr3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
    vmax.bu vr4, vr0, vr4
    vsle.bu vr1, vr4, vr11 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
    vsle.bu vr1, vr0, vr11 // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
    vsle.bu vr2, vr2, vr10 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
    vand.v vr1, vr1, vr2 // fm
    vand.v vr1, vr1, vr13 // fm && wd >= 4
.if \wd >= 6
    vand.v vr14, vr14, vr1 // fm && wd > 4
.endif
.if \wd >= 16
    vand.v vr15, vr15, vr1 // fm && wd == 16
.endif
    // fold the 128-bit mask into the scalar t6 to test whether any lane is
    // selected (presumably relies on mask bytes being 0x00/0xff)
    vhaddw.qu.du vr8, vr1, vr1
    vpickve2gr.du t6, vr8, 0
    bnez t6, 9f // if (!fm || wd < 4) return;
    li.w t4, 1 << 0
    jirl zero, ra, 0x00
9:
.if \wd >= 6
    vabsd.bu vr2, vr21, vr23 // abs(p2 - p0)
    vabsd.bu vr3, vr22, vr23 // abs(p1 - p0)
    vabsd.bu vr4, vr25, vr24 // abs(q1 - q0)
    vabsd.bu vr5, vr26, vr24 // abs(q2 - q0)
.if \wd >= 8
    vabsd.bu vr6, vr20, vr23 // abs(p3 - p0)
    vabsd.bu vr7, vr27, vr24 // abs(q3 - q0)
.endif
    vmax.bu vr2, vr2, vr3
    vmax.bu vr4, vr4, vr5
.if \wd >= 8
    vmax.bu vr6, vr6, vr7
.endif
    vmax.bu vr2, vr2, vr4
.if \wd >= 8
    vmax.bu vr2, vr2, vr6
.endif

.if \wd == 16
    vabsd.bu vr3, vr17, vr23 // abs(p6 - p0)
    vabsd.bu vr4, vr18, vr23 // abs(p5 - p0)
    vabsd.bu vr5, vr19, vr23 // abs(p4 - p0)
.endif
    vslei.bu vr2, vr2, 1 // flat8in
.if \wd == 16
    vabsd.bu vr6, vr28, vr24 // abs(q4 - q0)
    vabsd.bu vr7, vr29, vr24 // abs(q5 - q0)
    vabsd.bu vr8, vr30, vr24 // abs(q6 - q0)
.endif
    vand.v vr14, vr2, vr14 // flat8in && fm && wd > 4
    vandn.v vr1, vr14, vr1 // fm && wd >= 4 && !flat8in
.if \wd == 16
    vmax.bu vr3, vr3, vr4
    vmax.bu vr5, vr5, vr6
.endif
    vhaddw.qu.du vr9, vr1, vr1
.if \wd == 16
    vmax.bu vr7, vr7, vr8
    vmax.bu vr3, vr3, vr5
    vmax.bu vr3, vr3, vr7
    vslei.bu vr3, vr3, 1 // flat8out
.endif
    vpickve2gr.du t6, vr9, 0
.if \wd == 16
    vand.v vr15, vr15, vr3 // flat8out && fm && wd == 16
    vand.v vr15, vr15, vr14 // flat8out && flat8in && fm && wd == 16
    vandn.v vr14, vr15, vr14 // flat8in && fm && wd >= 4 && !flat8out
.endif
    beqz t6, 1f // skip wd == 4 case
.endif
    vxori.b vr2, vr22, 128 // p1 - 128
    vxori.b vr3, vr25, 128 // q1 - 128
    vslt.bu vr0, vr12, vr0 // hev
    vssub.b vr2, vr2, vr3 // iclip_diff(p1 - q1)
    vand.v vr4, vr2, vr0 // if (hev) iclip_diff(p1 - q1)
vandn.v vr0, vr0, vr1 // (fm && wd >= 4 && !hev) vxor.v vr5, vr5, vr5 vaddi.hu vr5, vr5, 3 vsubwev.h.bu vr2, vr24, vr23 vsubwod.h.bu vr3, vr24, vr23 vmul.h vr2, vr2, vr5 vmul.h vr3, vr3, vr5 vxor.v vr6, vr6, vr6 vaddwev.h.b vr7, vr4, vr6 vaddwod.h.b vr6, vr4, vr6 vadd.h vr2, vr2, vr7 vadd.h vr3, vr3, vr6 vssrani.b.h vr2, vr2, 0 vssrani.b.h vr3, vr3, 0 vilvl.b vr2, vr3, vr2 // f vxor.v vr6, vr6, vr6 vaddi.bu vr5, vr6, 3 vaddi.bu vr6, vr6, 4 // 4 vsadd.b vr4, vr6, vr2 // imin(f + 4, 127) vsadd.b vr5, vr5, vr2 // imin(f + 3, 127) vsrai.b vr4, vr4, 3 // f1 vsrai.b vr5, vr5, 3 // f2 vaddi.bu vr2, vr23, 0 // p0 vaddi.bu vr3, vr24, 0 // q0 vxori.b vr2, vr2, 128 vxori.b vr3, vr3, 128 vsadd.b vr2, vr2, vr5 // p0 + f2 out p0 vssub.b vr3, vr3, vr4 // q0 - f1 out q0 vxori.b vr2, vr2, 128 vxori.b vr3, vr3, 128 vsrari.b vr4, vr4, 1 // (f1 + 1) >> 1 vbitsel.v vr23, vr23, vr2, vr1 // if (fm && wd >= 4) vbitsel.v vr24, vr24, vr3, vr1 // if (fm && wd >= 4) vaddi.bu vr2, vr22, 0 // p1 vaddi.bu vr3, vr25, 0 // q1 vxori.b vr2, vr2, 128 vxori.b vr3, vr3, 128 vsadd.b vr2, vr2, vr4 // out p1 vssub.b vr3, vr3, vr4 // out q1 vxori.b vr2, vr2, 128 vxori.b vr3, vr3, 128 vbitsel.v vr22, vr22, vr2, vr0 // if (fm && wd >= 4 && !hev) vbitsel.v vr25, vr25, vr3, vr0 // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 vhaddw.qu.du vr0, vr14, vr14 vpickve2gr.du t6, vr0, 0 beqz t6, 2f // skip if there's no flat8in vaddwev.h.bu vr0, vr21, vr21 vaddwod.h.bu vr1, vr21, vr21 // p2 * 2 vaddwev.h.bu vr2, vr21, vr22 vaddwod.h.bu vr3, vr21, vr22 // p2 + p1 vaddwev.h.bu vr4, vr22, vr23 vaddwod.h.bu vr5, vr22, vr23 // p1 + p0 vaddwev.h.bu vr6, vr23, vr24 vaddwod.h.bu vr7, vr23, vr24 // p0 + q0 vadd.h vr8, vr0, vr2 vadd.h vr9, vr1, vr3 vadd.h vr10, vr4, vr6 vadd.h vr11, vr5, vr7 vaddwev.h.bu vr12, vr24, vr25 vaddwod.h.bu vr13, vr24, vr25 // q0 + q1 vadd.h vr8, vr8, vr10 vadd.h vr9, vr9, vr11 vsub.h vr12, vr12, vr0 vsub.h vr13, vr13, vr1 vaddwev.h.bu vr10, vr25, vr26 vaddwod.h.bu vr11, vr25, vr26 // q1 + q2 
vssrlrni.bu.h vr0, vr8, 3 vssrlrni.bu.h vr1, vr9, 3 vilvl.b vr0, vr1, vr0 // out p1 vadd.h vr8, vr8, vr12 vadd.h vr9, vr9, vr13 vsub.h vr10, vr10, vr2 vsub.h vr11, vr11, vr3 vaddwev.h.bu vr12, vr26, vr26 // q2 + q2 vaddwod.h.bu vr13, vr26, vr26 vssrlrni.bu.h vr1, vr8, 3 vssrlrni.bu.h vr2, vr9, 3 vilvl.b vr1, vr2, vr1 // out p0 vadd.h vr8, vr8, vr10 vadd.h vr9, vr9, vr11 vsub.h vr12, vr12, vr4 vsub.h vr13, vr13, vr5 vssrlrni.bu.h vr2, vr8, 3 vssrlrni.bu.h vr3, vr9, 3 vilvl.b vr2, vr3, vr2 // out q0 vbitsel.v vr22, vr22, vr0, vr14 vadd.h vr8, vr8, vr12 vadd.h vr9, vr9, vr13 vbitsel.v vr23, vr23, vr1, vr14 vssrlrni.bu.h vr3, vr8, 3 vssrlrni.bu.h vr4, vr9, 3 vilvl.b vr3, vr4, vr3 vbitsel.v vr24, vr24, vr2, vr14 vbitsel.v vr25, vr25, vr3, vr14 .elseif \wd >= 8 vhaddw.qu.du vr0, vr14, vr14 vpickve2gr.du t6, vr0, 0 .if \wd == 8 beqz t6, 8f // skip if there's no flat8in .else beqz t6, 2f // skip if there's no flat8in .endif vaddwev.h.bu vr0, vr20, vr21 vaddwod.h.bu vr1, vr20, vr21 // p3 + p2 vaddwev.h.bu vr2, vr22, vr25 vaddwod.h.bu vr3, vr22, vr25 // p1 + q1 vaddwev.h.bu vr4, vr20, vr22 vaddwod.h.bu vr5, vr20, vr22 // p3 + p1 vaddwev.h.bu vr6, vr23, vr26 vaddwod.h.bu vr7, vr23, vr26 // p0 + q2 vadd.h vr8, vr0, vr0 vadd.h vr9, vr1, vr1 // 2 * (p3 + p2) vxor.v vr10, vr10, vr10 vaddwev.h.bu vr11, vr23, vr10 vaddwod.h.bu vr12, vr23, vr10 vaddwev.h.bu vr13, vr24, vr10 vaddwod.h.bu vr10, vr24, vr10 vadd.h vr8, vr8, vr11 // + p0 vadd.h vr9, vr9, vr12 vadd.h vr8, vr8, vr13 // + q0 vadd.h vr9, vr9, vr10 vadd.h vr8, vr8, vr4 vadd.h vr9, vr9, vr5 // + p3 + p1 vsub.h vr2, vr2, vr0 vsub.h vr3, vr3, vr1 // p1 + q1 - p3 - p2 vsub.h vr6, vr6, vr4 vsub.h vr7, vr7, vr5 // p0 + q2 - p3 - p1 vssrlrni.bu.h vr10, vr8, 3 vssrlrni.bu.h vr11, vr9, 3 vilvl.b vr10, vr11, vr10 // out p2 vadd.h vr8, vr8, vr2 vadd.h vr9, vr9, vr3 vaddwev.h.bu vr0, vr20, vr23 vaddwod.h.bu vr1, vr20, vr23 // p3 + p0 vaddwev.h.bu vr2, vr24, vr27 vaddwod.h.bu vr3, vr24, vr27 // q0 + q3 vssrlrni.bu.h vr11, vr8, 3 
vssrlrni.bu.h vr12, vr9, 3 vilvl.b vr11, vr12, vr11 // out p1 vadd.h vr8, vr8, vr6 vadd.h vr9, vr9, vr7 vsub.h vr2, vr2, vr0 // q0 + q3 - p3 - p0 vsub.h vr3, vr3, vr1 vaddwev.h.bu vr4, vr21, vr24 // p2 + q0 vaddwod.h.bu vr5, vr21, vr24 vaddwev.h.bu vr6, vr25, vr27 // q1 + q3 vaddwod.h.bu vr7, vr25, vr27 vssrlrni.bu.h vr12, vr8, 3 vssrlrni.bu.h vr13, vr9, 3 vilvl.b vr12, vr13, vr12 // out p0 vadd.h vr8, vr8, vr2 vadd.h vr9, vr9, vr3 vsub.h vr6, vr6, vr4 // q1 + q3 - p2 - q0 vsub.h vr7, vr7, vr5 vaddwev.h.bu vr0, vr22, vr25 // p1 + q1 vaddwod.h.bu vr1, vr22, vr25 vaddwev.h.bu vr2, vr26, vr27 vaddwod.h.bu vr3, vr26, vr27 // q2 + q3 vssrlrni.bu.h vr13, vr8, 3 vssrlrni.bu.h vr4, vr9, 3 vilvl.b vr13, vr4, vr13 // out q0 vadd.h vr8, vr8, vr6 vadd.h vr9, vr9, vr7 vsub.h vr2, vr2, vr0 // q2 + q3 - p1 - q1 vsub.h vr3, vr3, vr1 vssrlrni.bu.h vr0, vr8, 3 vssrlrni.bu.h vr1, vr9, 3 vilvl.b vr0, vr1, vr0 // out q1 vadd.h vr8, vr8, vr2 vadd.h vr9, vr9, vr3 vbitsel.v vr21, vr21, vr10, vr14 vbitsel.v vr22, vr22, vr11, vr14 vbitsel.v vr23, vr23, vr12, vr14 vbitsel.v vr24, vr24, vr13, vr14 vssrlrni.bu.h vr1, vr8, 3 vssrlrni.bu.h vr2, vr9, 3 vilvl.b vr1, vr2, vr1 // out q2 vbitsel.v vr25, vr25, vr0, vr14 vbitsel.v vr26, vr26, vr1, vr14 .endif 2: .if \wd == 16 vhaddw.qu.du vr2, vr15, vr15 vpickve2gr.du t6, vr2, 0 bnez t6, 1f // check if flat8out is needed vhaddw.qu.du vr2, vr14, vr14 vpickve2gr.du t6, vr2, 0 beqz t6, 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: vaddwev.h.bu vr2, vr17, vr17 // p6 + p6 vaddwod.h.bu vr3, vr17, vr17 vaddwev.h.bu vr4, vr17, vr18 vaddwod.h.bu vr5, vr17, vr18 // p6 + p5 vaddwev.h.bu vr6, vr17, vr19 vaddwod.h.bu vr7, vr17, vr19 // p6 + p4 vaddwev.h.bu vr8, vr17, vr20 vaddwod.h.bu vr9, vr17, vr20 // p6 + p3 vadd.h vr12, vr2, vr4 vadd.h vr13, vr3, vr5 vadd.h vr10, vr6, vr8 vadd.h vr11, vr7, vr9 vaddwev.h.bu vr6, vr17, vr21 vaddwod.h.bu vr7, vr17, vr21 // p6 + p2 vadd.h vr12, vr12, vr10 
vadd.h vr13, vr13, vr11 vaddwev.h.bu vr8, vr17, vr22 vaddwod.h.bu vr9, vr17, vr22 // p6 + p1 vaddwev.h.bu vr10, vr18, vr23 vaddwod.h.bu vr11, vr18, vr23 // p5 + p0 vadd.h vr6, vr6, vr8 vadd.h vr7, vr7, vr9 vaddwev.h.bu vr8, vr19, vr24 vaddwod.h.bu vr9, vr19, vr24 // p4 + q0 vadd.h vr12, vr12, vr6 vadd.h vr13, vr13, vr7 vadd.h vr10, vr10, vr8 vadd.h vr11, vr11, vr9 vaddwev.h.bu vr6, vr20, vr25 vaddwod.h.bu vr7, vr20, vr25 // p3 + q1 vadd.h vr12, vr12, vr10 vadd.h vr13, vr13, vr11 vsub.h vr6, vr6, vr2 vsub.h vr7, vr7, vr3 vaddwev.h.bu vr2, vr21, vr26 vaddwod.h.bu vr3, vr21, vr26 // p2 + q2 vssrlrni.bu.h vr0, vr12, 4 vssrlrni.bu.h vr1, vr13, 4 vilvl.b vr0, vr1, vr0 // out p5 vadd.h vr12, vr12, vr6 vadd.h vr13, vr13, vr7 // - (p6 + p6) + (p3 + q1) vsub.h vr2, vr2, vr4 vsub.h vr3, vr3, vr5 vaddwev.h.bu vr4, vr22, vr27 vaddwod.h.bu vr5, vr22, vr27 // p1 + q3 vaddwev.h.bu vr6, vr17, vr19 vaddwod.h.bu vr7, vr17, vr19 // p6 + p4 vssrlrni.bu.h vr1, vr12, 4 vssrlrni.bu.h vr8, vr13, 4 vilvl.b vr1, vr8, vr1 // out p4 vadd.h vr12, vr12, vr2 vadd.h vr13, vr13, vr3 // - (p6 + p5) + (p2 + q2) vsub.h vr4, vr4, vr6 vsub.h vr5, vr5, vr7 vaddwev.h.bu vr6, vr23, vr28 vaddwod.h.bu vr7, vr23, vr28 // p0 + q4 vaddwev.h.bu vr8, vr17, vr20 vaddwod.h.bu vr9, vr17, vr20 // p6 + p3 vssrlrni.bu.h vr2, vr12, 4 vssrlrni.bu.h vr10, vr13, 4 vilvl.b vr2, vr10, vr2 // out p3 vadd.h vr12, vr12, vr4 vadd.h vr13, vr13, vr5 // - (p6 + p4) + (p1 + q3) vsub.h vr6, vr6, vr8 vsub.h vr7, vr7, vr9 vaddwev.h.bu vr8, vr24, vr29 vaddwod.h.bu vr9, vr24, vr29 // q0 + q5 vaddwev.h.bu vr4, vr17, vr21 vaddwod.h.bu vr5, vr17, vr21 // p6 + p2 vssrlrni.bu.h vr3, vr12, 4 vssrlrni.bu.h vr11, vr13, 4 vilvl.b vr3, vr11, vr3 // out p2 vadd.h vr12, vr12, vr6 vadd.h vr13, vr13, vr7 // - (p6 + p3) + (p0 + q4) vsub.h vr8, vr8, vr4 vsub.h vr9, vr9, vr5 vaddwev.h.bu vr6, vr25, vr30 vaddwod.h.bu vr7, vr25, vr30 // q1 + q6 vaddwev.h.bu vr10, vr17, vr22 vaddwod.h.bu vr11, vr17, vr22 // p6 + p1 vssrlrni.bu.h vr4, vr12, 4 vssrlrni.bu.h 
vr5, vr13, 4 vilvl.b vr4, vr5, vr4 // out p1 vadd.h vr12, vr12, vr8 vadd.h vr13, vr13, vr9 // - (p6 + p2) + (q0 + q5) vsub.h vr6, vr6, vr10 vsub.h vr7, vr7, vr11 vaddwev.h.bu vr8, vr26, vr30 vaddwod.h.bu vr9, vr26, vr30 // q2 + q6 vbitsel.v vr0, vr18, vr0, vr15 // out p5 vaddwev.h.bu vr10, vr18, vr23 vaddwod.h.bu vr11, vr18, vr23 // p5 + p0 vssrlrni.bu.h vr5, vr12, 4 vssrlrni.bu.h vr18, vr13, 4 vilvl.b vr5, vr18, vr5 // out p0 vadd.h vr12, vr12, vr6 vadd.h vr13, vr13, vr7 // - (p6 + p1) + (q1 + q6) vsub.h vr8, vr8, vr10 vsub.h vr9, vr9, vr11 vaddwev.h.bu vr10, vr27, vr30 vaddwod.h.bu vr11, vr27, vr30 // q3 + q6 vbitsel.v vr1, vr19, vr1, vr15 // out p4 vaddwev.h.bu vr18, vr19, vr24 vaddwod.h.bu vr19, vr19, vr24 // p4 + q0 vssrlrni.bu.h vr6, vr12, 4 vssrlrni.bu.h vr7, vr13, 4 vilvl.b vr6, vr7, vr6 // out q0 vadd.h vr12, vr12, vr8 vadd.h vr13, vr13, vr9 // - (p5 + p0) + (q2 + q6) vsub.h vr10, vr10, vr18 vsub.h vr11, vr11, vr19 vaddwev.h.bu vr8, vr28, vr30 vaddwod.h.bu vr9, vr28, vr30 // q4 + q6 vbitsel.v vr2, vr20, vr2, vr15 // out p3 vaddwev.h.bu vr18, vr20, vr25 vaddwod.h.bu vr19, vr20, vr25 // p3 + q1 vssrlrni.bu.h vr7, vr12, 4 vssrlrni.bu.h vr20, vr13, 4 vilvl.b vr7, vr20, vr7 // out q1 vadd.h vr12, vr12, vr10 vadd.h vr13, vr13, vr11 // - (p4 + q0) + (q3 + q6) vsub.h vr18, vr8, vr18 vsub.h vr19, vr9, vr19 vaddwev.h.bu vr10, vr29, vr30 vaddwod.h.bu vr11, vr29, vr30 // q5 + q6 vbitsel.v vr3, vr21, vr3, vr15 // out p2 vaddwev.h.bu vr20, vr21, vr26 vaddwod.h.bu vr21, vr21, vr26 // p2 + q2 vssrlrni.bu.h vr8, vr12, 4 vssrlrni.bu.h vr9, vr13, 4 vilvl.b vr8, vr9, vr8 // out q2 vadd.h vr12, vr12, vr18 vadd.h vr13, vr13, vr19 // - (p3 + q1) + (q4 + q6) vsub.h vr10, vr10, vr20 vsub.h vr11, vr11, vr21 vaddwev.h.bu vr18, vr30, vr30 vaddwod.h.bu vr19, vr30, vr30 // q6 + q6 vbitsel.v vr4, vr22, vr4, vr15 // out p1 vaddwev.h.bu vr20, vr22, vr27 vaddwod.h.bu vr21, vr22, vr27 // p1 + q3 vssrlrni.bu.h vr9, vr12, 4 vssrlrni.bu.h vr22, vr13, 4 vilvl.b vr9, vr22, vr9 // out q3 vadd.h 
vr12, vr12, vr10
    vadd.h vr13, vr13, vr11 // - (p2 + q2) + (q5 + q6)
    vsub.h vr18, vr18, vr20
    vsub.h vr19, vr19, vr21
    vbitsel.v vr5, vr23, vr5, vr15 // out p0
    vssrlrni.bu.h vr10, vr12, 4
    vssrlrni.bu.h vr23, vr13, 4
    vilvl.b vr10, vr23, vr10 // out q4
    vadd.h vr12, vr12, vr18
    vadd.h vr13, vr13, vr19 // - (p1 + q3) + (q6 + q6)
    vssrlrni.bu.h vr11, vr12, 4
    vssrlrni.bu.h vr12, vr13, 4
    vilvl.b vr11, vr12, vr11 // out q5
    // select filtered vs. original q0..q5 per lane with the vr15 mask
    vbitsel.v vr6, vr24, vr6, vr15
    vbitsel.v vr7, vr25, vr7, vr15
    vbitsel.v vr8, vr26, vr8, vr15
    vbitsel.v vr9, vr27, vr9, vr15
    vbitsel.v vr10, vr28, vr10, vr15
    vbitsel.v vr11, vr29, vr11, vr15
.endif

    // t4 = 0: the caller stores all pixels
    li.w t4, 0
    jirl zero, ra, 0x00
.if \wd == 16
7:
    // Return to a shorter epilogue, writing only the inner 6 pixels
    li.w t4, 1 << 6
    jirl zero, ra, 0x00
.endif
.if \wd >= 8
8:
    // Return to a shorter epilogue, writing only the inner 4 pixels
    li.w t4, 1 << 4
    jirl zero, ra, 0x00
.endif
endfuncl
.endm

// Instantiate lpf_16_wd16_lsx, lpf_16_wd8_lsx, lpf_16_wd6_lsx and
// lpf_16_wd4_lsx.
FILTER 16
FILTER 8
FILTER 6
FILTER 4

// LPF_16_WD16: call lpf_16_wd16_lsx from inside an lpf_* function and
// dispatch on its t4 result. ra is saved in t7 because bl overwrites it.
//   t4 == 0          -> continue after the macro (local label 1)
//   t4 has bit 6 set -> branch to the enclosing function's label 7
//   t4 has bit 4 set -> branch to the enclosing function's label 8
//   otherwise        -> return from the enclosing function
// The enclosing function must therefore define local labels 7 and 8 after
// the macro invocation.
.macro LPF_16_WD16
    move t7, ra
    bl lpf_16_wd16_lsx
    move ra, t7
    beqz t4, 1f
    andi t5, t4, 1 << 6
    bnez t5, 7f
    andi t5, t4, 1 << 4
    bnez t5, 8f
    jirl zero, ra, 0x00
1:
.endm

// LPF_16_WD8: same as LPF_16_WD16 for lpf_16_wd8_lsx; only bit 4 is tested,
// so the enclosing function must define local label 8.
.macro LPF_16_WD8
    move t7, ra
    bl lpf_16_wd8_lsx
    move ra, t7
    beqz t4, 1f
    andi t5, t4, 1 << 4
    bnez t5, 8f
    jirl zero, ra, 0x00
1:
.endm

// LPF_16_WD6: call lpf_16_wd6_lsx; continue after the macro when t4 == 0,
// otherwise return from the enclosing function.
.macro LPF_16_WD6
    move t7, ra
    bl lpf_16_wd6_lsx
    move ra, t7
    beqz t4, 1f
    jirl zero, ra, 0x00
1:
.endm

// LPF_16_WD4: call lpf_16_wd4_lsx; continue after the macro when t4 == 0,
// otherwise return from the enclosing function.
.macro LPF_16_WD4
    move t7, ra
    bl lpf_16_wd4_lsx
    move ra, t7
    beqz t4, 1f
    jirl zero, ra, 0x00
1:
.endm

// lpf_v_4_16_lsx: wd4 filter for 16 lanes across a horizontal edge.
//   a0 : address of the q0 row, a1 : row stride (as used by the loads below)
// Loads p1/p0 from the two rows before a0 and q0/q1 from a0 and the next
// row, runs the wd4 core, and stores the four rows back unless the core
// reported that nothing has to be written.
// Clobbers t3 here, plus t4/t5/t7 through LPF_16_WD4.
functionl lpf_v_4_16_lsx
    slli.d t3, a1, 1
    sub.d t3, a0, t3 // t3 = a0 - 2 * a1

    vld vr22, t3, 0 // p1
    vldx vr23, t3, a1 // p0
    vld vr24, a0, 0 // q0
    vldx vr25, a0, a1 // q1

    LPF_16_WD4

    vst vr22, t3, 0 // p1
    vstx vr23, t3, a1 // p0
    vst vr24, a0, 0 // q0
    vstx vr25, a0, a1 // q1
endfuncl

// lpf_h_4_16_lsx: wd4 filter across a vertical edge. Each fld.s/fldx.s loads
// the 4 bytes starting at a0 - 2 of one row; t3 advances by two rows per
// alsl.d.
functionl lpf_h_4_16_lsx
    addi.d t3, a0, -2
    fld.s f22, t3, 0
    fldx.s f23, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f24, t3, 0
    fldx.s f25, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f17, t3, 0
    fldx.s f18, t3, a1
    alsl.d t3, a1, t3, 1
    fld.s f19, t3, 0
    fldx.s f20, t3, a1
    alsl.d t3, a1, t3, 1
    vilvl.w vr22, vr17,
vr22 vilvl.w vr23, vr18, vr23 vilvl.w vr24, vr19, vr24 vilvl.w vr25, vr20, vr25 fld.s f17, t3, 0 fldx.s f18, t3, a1 alsl.d t3, a1, t3, 1 fld.s f19, t3, 0 fldx.s f20, t3, a1 alsl.d t3, a1, t3, 1 fld.s f26, t3, 0 fldx.s f27, t3, a1 alsl.d t3, a1, t3, 1 fld.s f28, t3, 0 fldx.s f29, t3, a1 alsl.d t3, a1, t3, 1 vilvl.w vr17, vr26, vr17 vilvl.w vr18, vr27, vr18 vilvl.w vr19, vr28, vr19 vilvl.w vr20, vr29, vr20 vilvl.d vr22, vr17, vr22 vilvl.d vr23, vr18, vr23 vilvl.d vr24, vr19, vr24 vilvl.d vr25, vr20, vr25 addi.d a0, t3, 2 TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 LPF_16_WD4 slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -2 .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 1 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 2 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 3 add.d a0, a0, a1 .endr addi.d a0, a0, 2 endfuncl functionl lpf_v_6_16_lsx slli.d t3, a1, 1 sub.d t3, a0, t3 sub.d s0, t3, a1 vld vr21, s0, 0 // p2 vldx vr22, s0, a1 // p1 alsl.d s0, a1, s0, 1 vld vr23, s0, 0 // p0 vldx vr24, s0, a1 // q0 alsl.d s0, a1, s0, 1 vld vr25, s0, 0 // q1 vldx vr26, s0, a1 // q2 LPF_16_WD6 vst vr22, t3, 0 // p1 vstx vr23, t3, a1 // p0 vst vr24, a0, 0 // q0 vstx vr25, a0, a1 // q1 endfuncl functionl lpf_h_6_16_lsx addi.d t3, a0, -4 fld.d f20, t3, 0 fldx.d f21, t3, a1 alsl.d t3, a1, t3, 1 fld.d f22, t3, 0 fldx.d f23, t3, a1 alsl.d t3, a1, t3, 1 fld.d f24, t3, 0 fldx.d f25, t3, a1 alsl.d t3, a1, t3, 1 fld.d f26, t3, 0 fldx.d f27, t3, a1 alsl.d t3, a1, t3, 1 fld.d f16, t3, 0 fldx.d f17, t3, a1 alsl.d t3, a1, t3, 1 fld.d f18, t3, 0 fldx.d f19, t3, a1 alsl.d t3, a1, t3, 1 fld.d f28, t3, 0 fldx.d f29, t3, a1 alsl.d t3, a1, t3, 1 fld.d f30, t3, 0 fldx.d f31, t3, a1 alsl.d t3, a1, t3, 1 vilvl.d vr20, vr16, vr20 vilvl.d vr21, vr17, vr21 vilvl.d vr22, vr18, vr22 vilvl.d vr23, 
vr19, vr23 vilvl.d vr24, vr28, vr24 vilvl.d vr25, vr29, vr25 vilvl.d vr26, vr30, vr26 vilvl.d vr27, vr31, vr27 addi.d a0, t3, 4 TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 LPF_16_WD6 slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -2 .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 1 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 2 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 3 add.d a0, a0, a1 .endr addi.d a0, a0, 2 endfuncl functionl lpf_v_8_16_lsx slli.d t3, a1, 2 sub.d s0, a0, t3 vld vr20, s0, 0 // p3 vldx vr21, s0, a1 // p2 alsl.d s0, a1, s0, 1 vld vr22, s0, 0 // p1 vldx vr23, s0, a1 // p0 alsl.d s0, a1, s0, 1 vld vr24, s0, 0 // q0 vldx vr25, s0, a1 // q1 alsl.d s0, a1, s0, 1 vld vr26, s0, 0 // q2 vldx vr27, s0, a1 // q3 LPF_16_WD8 sub.d t3, a0, t3 add.d t3, t3, a1 // -3 vst vr21, t3, 0 // p2 vstx vr22, t3, a1 // p1 alsl.d t3, a1, t3, 1 vst vr23, t3, 0 // p0 vstx vr24, t3, a1 // q0 alsl.d t3, a1, t3, 1 vst vr25, t3, 0 // q1 vstx vr26, t3, a1 // q2 jirl zero, ra, 0x00 8: slli.d t3, a1, 1 sub.d t3, a0, t3 vst vr22, t3, 0 // p1 vstx vr23, t3, a1 // p0 alsl.d t3, a1, t3, 1 vst vr24, t3, 0 // q0 vstx vr25, t3, a1 // q1 endfuncl functionl lpf_h_8_16_lsx addi.d t3, a0, -4 fld.d f20, t3, 0 fldx.d f21, t3, a1 alsl.d t3, a1, t3, 1 fld.d f22, t3, 0 fldx.d f23, t3, a1 alsl.d t3, a1, t3, 1 fld.d f24, t3, 0 fldx.d f25, t3, a1 alsl.d t3, a1, t3, 1 fld.d f26, t3, 0 fldx.d f27, t3, a1 alsl.d t3, a1, t3, 1 fld.d f16, t3, 0 fldx.d f17, t3, a1 alsl.d t3, a1, t3, 1 fld.d f18, t3, 0 fldx.d f19, t3, a1 alsl.d t3, a1, t3, 1 fld.d f28, t3, 0 fldx.d f29, t3, a1 alsl.d t3, a1, t3, 1 fld.d f30, t3, 0 fldx.d f31, t3, a1 alsl.d t3, a1, t3, 1 vilvl.d vr20, vr16, vr20 vilvl.d vr21, vr17, vr21 vilvl.d vr22, vr18, vr22 vilvl.d vr23, vr19, vr23 vilvl.d vr24, vr28, vr24 
vilvl.d vr25, vr29, vr25 vilvl.d vr26, vr30, vr26 vilvl.d vr27, vr31, vr27 addi.d a0, t3, 4 TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 LPF_16_WD8 slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -4 .irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 vstelm.d \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 vstelm.d \i, a0, 0, 1 add.d a0, a0, a1 .endr addi.d a0, a0, 4 jirl zero, ra, 0x00 8: slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -2 .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 1 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 2 add.d a0, a0, a1 .endr .irp i, vr22, vr23, vr24, vr25 vstelm.w \i, a0, 0, 3 add.d a0, a0, a1 .endr addi.d a0, a0, 2 endfuncl functionl lpf_v_16_16_lsx slli.d t3, a1, 3 sub.d s0, a0, t3 add.d s0, s0, a1 vld vr17, s0, 0 // p6 vldx vr18, s0, a1 // p5 alsl.d s0, a1, s0, 1 vld vr19, s0, 0 // p4 vldx vr20, s0, a1 // p3 alsl.d s0, a1, s0, 1 vld vr21, s0, 0 // p2 vldx vr22, s0, a1 // p1 alsl.d s0, a1, s0, 1 vld vr23, s0, 0 // p0 vldx vr24, s0, a1 // q0 alsl.d s0, a1, s0, 1 vld vr25, s0, 0 // q1 vldx vr26, s0, a1 // q2 alsl.d s0, a1, s0, 1 vld vr27, s0, 0 // q3 vldx vr28, s0, a1 // q4 alsl.d s0, a1, s0, 1 vld vr29, s0, 0 // q5 vldx vr30, s0, a1 // q6 LPF_16_WD16 sub.d s0, a0, t3 alsl.d s0, a1, s0, 1 vst vr0, s0, 0 // p5 vstx vr1, s0, a1 // p4 alsl.d s0, a1, s0, 1 vst vr2, s0, 0 // p3 vstx vr3, s0, a1 // p2 alsl.d s0, a1, s0, 1 vst vr4, s0, 0 // p1 vstx vr5, s0, a1 // p0 alsl.d s0, a1, s0, 1 vst vr6, s0, 0 // q0 vstx vr7, s0, a1 // q1 alsl.d s0, a1, s0, 1 vst vr8, s0, 0 // q2 vstx vr9, s0, a1 // q3 alsl.d s0, a1, s0, 1 vst vr10, s0, 0 // q4 vstx vr11, s0, a1 // q5 jirl zero, ra, 0x00 7: slli.d t3, a1, 1 add.d t3, t3, a1 sub.d 
s0, a0, t3 vst vr21, s0, 0 // p2 vstx vr22, s0, a1 // p1 alsl.d s0, a1, s0, 1 vst vr23, s0, 0 // p0 vstx vr24, s0, a1 // q0 alsl.d s0, a1, s0, 1 vst vr25, s0, 0 // q1 vstx vr26, s0, a1 // q2 jirl zero, ra, 0x00 8: slli.d t3, a1, 1 sub.d s0, a0, t3 vst vr22, s0, 0 // p1 vstx vr23, s0, a1 // p0 alsl.d s0, a1, s0, 1 vst vr24, s0, 0 // q0 vstx vr25, s0, a1 // q1 endfuncl functionl lpf_h_16_16_lsx addi.d t3, a0, -8 vld vr16, t3, 0 vldx vr17, t3, a1 alsl.d t3, a1, t3, 1 vld vr18, t3, 0 vldx vr19, t3, a1 alsl.d t3, a1, t3, 1 vld vr20, t3, 0 vldx vr21, t3, a1 alsl.d t3, a1, t3, 1 vld vr22, t3, 0 vldx vr23, t3, a1 alsl.d t3, a1, t3, 1 vld vr24, t3, 0 vldx vr25, t3, a1 alsl.d t3, a1, t3, 1 vld vr26, t3, 0 vldx vr27, t3, a1 alsl.d t3, a1, t3, 1 vld vr28, t3, 0 vldx vr29, t3, a1 alsl.d t3, a1, t3, 1 vld vr30, t3, 0 vldx vr31, t3, a1 alsl.d t3, a1, t3, 1 .macro SWAPD in0, in1 vaddi.bu vr0, \in0, 0 vilvl.d \in0, \in1, \in0 vilvh.d \in1, \in1, vr0 .endm SWAPD vr16, vr24 SWAPD vr17, vr25 SWAPD vr18, vr26 SWAPD vr19, vr27 SWAPD vr20, vr28 SWAPD vr21, vr29 SWAPD vr22, vr30 SWAPD vr23, vr31 addi.d a0, t3, 8 TRANSPOSE_8x16B vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr0, vr1 TRANSPOSE_8x16B vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, vr0, vr1 LPF_16_WD16 slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_8x16B vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5, vr18, vr19 TRANSPOSE_8x16B vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31, vr18, vr19 addi.d t3, a0, -8 .irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5 vstelm.d \i, t3, 0, 0 add.d t3, t3, a1 .endr .irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5 vstelm.d \i, t3, 0, 1 add.d t3, t3, a1 .endr .irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31 vstelm.d \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31 vstelm.d \i, a0, 0, 1 add.d a0, a0, a1 .endr jirl zero, ra, 0x00 7: slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -4 .irp i, vr20, 
vr21, vr22, vr23, vr24, vr25, vr26, vr27 vstelm.d \i, a0, 0, 0 add.d a0, a0, a1 .endr .irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27 vstelm.d \i, a0, 0, 1 add.d a0, a0, a1 .endr addi.d a0, a0, 4 jirl zero, ra, 0x00 8: slli.d t3, a1, 4 sub.d a0, a0, t3 TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29 addi.d a0, a0, -2 .irp i, 0, 1, 2, 3 vstelm.w vr22, a0, 0, \i add.d a0, a0, a1 vstelm.w vr23, a0, 0, \i add.d a0, a0, a1 vstelm.w vr24, a0, 0, \i add.d a0, a0, a1 vstelm.w vr25, a0, 0, \i add.d a0, a0, a1 .endr addi.d a0, a0, 2 endfuncl .macro PUSH_REG addi.d sp, sp, -64-8 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 st.d s0, sp, 64 .endm .macro POP_REG fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 ld.d s0, sp, 64 addi.d sp, sp, 64+8 .endm const mask_1248 .word 1, 2, 4, 8 endconst .macro LPF_FUNC DIR, TYPE function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx PUSH_REG move t8, ra vld vr0, a2, 0 //vmask vpickve2gr.wu t0, vr0, 0 vpickve2gr.wu t1, vr0, 1 .ifc \TYPE, y vpickve2gr.wu t2, vr0, 2 .endif addi.d a5, a5, 128 // Move to sharp part of lut .ifc \TYPE, y or t1, t1, t2 // vmask[1] |= vmaks[2] .endif slli.d a4, a4, 2 .ifc \DIR, v sub.d a4, a3, a4 .else addi.d a3, a3, -4 .endif or t0, t0, t1 // vmaks[0] |= vmask[1] 1: andi t3, t0, 0x0f .ifc \DIR, v vld vr0, a4, 0 // l[-b4_stride][] addi.d a4, a4, 16 vld vr1, a3, 0 // l[0][] addi.d a3, a3, 16 .else fld.d f0, a3, 0 fldx.d f1, a3, a4 alsl.d a3, a4, a3, 1 fld.d f2, a3, 0 fldx.d f3, a3, a4 alsl.d a3, a4, a3, 1 vilvl.w vr1, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.d vr0, vr2, vr1 vilvh.d vr1, vr2, vr1 .endif beqz t3, 7f //l[0][] ? 
l[0][] : l[-b4_stride][] vseqi.b vr2, vr1, 0 vbitsel.v vr1, vr1, vr0, vr2 li.w t3, 0xff vreplgr2vr.w vr3, t3 vand.v vr1, vr1, vr3 vshuf4i.b vr1, vr1, 0x00 // L -- 1 0 2 0 vseqi.w vr2, vr1, 0 // 0 -1 0 -1 vseqi.w vr2, vr2, 0 // L != 0 -- -1 0 -1 0 vhaddw.qu.du vr3, vr2, vr2 vpickve2gr.du t4, vr3, 0 beqz t4, 7f // if (!L) continue la.local t3, mask_1248 // bits x vld vr16, t3, 0 vreplgr2vr.w vr13, t0 // vmask[0] vreplgr2vr.w vr14, t1 // vmaks[1] vand.v vr13, vr13, vr16 vseqi.w vr13, vr13, 0 vseqi.w vr13, vr13, 0 // if (vmask[0] & x) vand.v vr13, vr13, vr2 // vmask[0] &= L != 0 vand.v vr14, vr14, vr16 vseqi.w vr14, vr14, 0 vseqi.w vr14, vr14, 0 // if (vmask[1] & x) .ifc \TYPE, y vreplgr2vr.w vr15, t2 // vmask[2] vand.v vr15, vr15, vr16 vseqi.w vr15, vr15, 0 vseqi.w vr15, vr15, 0 // if (vmask[2] & x) .endif vldrepl.b vr5, a5, 0 // sharp[0] addi.d t5, a5, 8 vldrepl.b vr6, t5, 0 // sharp[1] vsrl.b vr3, vr1, vr5 // L >> sharp[0] vsrli.b vr12, vr1, 4 // H vmin.bu vr3, vr3, vr6 // imin(L >> sharp[0], sharp[1]) vaddi.bu vr0, vr1, 2 // L + 2 vmaxi.bu vr11, vr3, 1 // imax(imin(), 1) = limit = I vslli.b vr0, vr0, 1 // 2*(L + 2) vadd.b vr10, vr0, vr11 // 2*(L + 2) + limit = E .ifc \TYPE, y andi t3, t2, 0x0f beqz t3, 2f //wd16 bl lpf_\DIR\()_16_16_lsx b 8f 2: .endif andi t3, t1, 0x0f beqz t3, 3f .ifc \TYPE, y // wd8 bl lpf_\DIR\()_8_16_lsx .else // wd6 bl lpf_\DIR\()_6_16_lsx .endif b 8f 3: // wd4 bl lpf_\DIR\()_4_16_lsx .ifc \DIR, h b 8f 7: // For dir h, the functions above increment a0. // If the whole function is skipped, increment it here instead. 
alsl.d a0, a1, a0, 4 .else 7: .endif 8: srli.d t0, t0, 4 srli.d t1, t1, 4 .ifc \TYPE, y srli.d t2, t2, 4 .endif .ifc \DIR, v addi.d a0, a0, 16 .else // For dir h, a0 is returned incremented .endif bnez t0, 1b move ra, t8 POP_REG endfunc .endm LPF_FUNC h, y LPF_FUNC v, y LPF_FUNC h, uv LPF_FUNC v, uv dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/looprestoration.h000066400000000000000000000115771517466257200262360ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_LOONGARCH_LOOPRESTORATION_H #define DAV2D_SRC_LOONGARCH_LOOPRESTORATION_H #include "common/intops.h" #include "src/cpu.h" #include "src/looprestoration.h" void dav2d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride, const uint8_t (*const left)[4], const uint8_t *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_wiener_filter_lasx(uint8_t *p, const ptrdiff_t stride, const uint8_t (*const left)[4], const uint8_t *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride, const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_sgr_filter_3x3_lasx(pixel *p, const ptrdiff_t p_stride, const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride, const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride, const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); void dav2d_sgr_filter_mix_lasx(pixel *p, const ptrdiff_t p_stride, const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav2dLoopRestorationDSPContext *const c, int bpc) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & 
DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; #if BITDEPTH == 8 c->wiener[0] = c->wiener[1] = dav2d_wiener_filter_lsx; c->sgr[0] = dav2d_sgr_filter_5x5_lsx; c->sgr[1] = dav2d_sgr_filter_3x3_lsx; c->sgr[2] = dav2d_sgr_filter_mix_lsx; #endif if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LASX)) return; #if BITDEPTH == 8 c->wiener[0] = c->wiener[1] = dav2d_wiener_filter_lasx; c->sgr[1] = dav2d_sgr_filter_3x3_lasx; #endif } #endif /* DAV2D_SRC_LOONGARCH_LOOPRESTORATION_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/mc.S000066400000000000000000006625231517466257200233500ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/loongarch/loongson_asm.S" /* static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *src, const ptrdiff_t src_stride, const int16_t *const abcd, int mx, int my HIGHBD_DECL_SUFFIX) */ .macro vld_filter_row dst, src, inc addi.w t3, \src, 512 srai.w t3, t3, 10 add.w \src, \src, \inc addi.w t3, t3, 64 slli.w t3, t3, 3 fldx.d \dst, t4, t3 .endm .macro warp_filter_horz_lsx addi.w t5, a5, 0 vld vr10, a2, 0 add.d a2, a2, a3 vld_filter_row f0, t5, t0 vld_filter_row f1, t5, t0 vld_filter_row f2, t5, t0 vld_filter_row f3, t5, t0 vld_filter_row f4, t5, t0 vld_filter_row f5, t5, t0 vld_filter_row f6, t5, t0 vld_filter_row f7, t5, t0 vxor.v vr10, vr10, vr20 vbsrl.v vr8, vr10, 1 vbsrl.v vr9, vr10, 2 vilvl.d vr8, vr8, vr10 vilvl.d vr0, vr1, vr0 vmulwev.h.b vr11, vr8, vr0 vmulwod.h.b vr12, vr8, vr0 vbsrl.v vr8, vr10, 3 vbsrl.v vr19, vr10, 4 vilvl.d vr8, vr8, vr9 vilvl.d vr2, vr3, vr2 vmulwev.h.b vr13, vr8, vr2 vmulwod.h.b vr14, vr8, vr2 vbsrl.v vr8, vr10, 5 vbsrl.v vr9, vr10, 6 vilvl.d vr8, vr8, vr19 vilvl.d vr4, vr5, vr4 vmulwev.h.b vr15, vr8, vr4 vmulwod.h.b vr16, vr8, vr4 vbsrl.v vr8, vr10, 7 vilvl.d vr8, vr8, vr9 vilvl.d vr6, vr7, vr6 vmulwev.h.b vr17, vr8, vr6 vmulwod.h.b vr18, vr8, vr6 vadd.h vr11, vr11, vr12 vadd.h vr13, vr13, vr14 vadd.h vr15, vr15, vr16 vadd.h vr17, vr17, vr18 vpickev.h vr12, vr13, vr11 vpickod.h vr14, vr13, vr11 vpickev.h vr16, vr17, vr15 vpickod.h vr18, vr17, vr15 vadd.h vr11, vr12, vr14 vadd.h 
vr15, vr16, vr18 vpickev.h vr12, vr15, vr11 vpickod.h vr14, vr15, vr11 vadd.h vr11, vr12, vr14 add.d a5, a5, t1 .endm .macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7 vilvl.b \in0, \in1, \in0 vilvl.b \in2, \in3, \in2 vilvl.b \in4, \in5, \in4 vilvl.b \in6, \in7, \in6 vpackev.h \in1, \in2, \in0 vpackod.h \in3, \in2, \in0 vpackev.h \in5, \in6, \in4 vpackod.h \in7, \in6, \in4 vpackev.w \in0, \in5, \in1 vpackod.w \in2, \in5, \in1 vpackev.w \in1, \in7, \in3 vpackod.w \in3, \in7, \in3 vexth.h.b \in4, \in0 vsllwil.h.b \in0, \in0, 0 vexth.h.b \in5, \in1 vsllwil.h.b \in1, \in1, 0 vexth.h.b \in6, \in2 vsllwil.h.b \in2, \in2, 0 vexth.h.b \in7, \in3 vsllwil.h.b \in3, \in3, 0 .endm .macro warp t, shift function warp_affine_8x8\t\()_8bpc_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 ld.h t0, a4, 0 ld.h t1, a4, 2 ld.h t2, a4, 4 ld.h a4, a4, 6 li.d t7, 8 alsl.w t3, a3, a3, 1 sub.d a2, a2, t3 addi.d a2, a2, -3 la.local t4, dav2d_mc_warp_filter .ifnb \t slli.d a1, a1, 1 .endif li.w t3, 128 vreplgr2vr.b vr20, t3 .ifb \t vreplgr2vr.h vr21, t3 .else li.w t3, 2048 vreplgr2vr.h vr21, t3 .endif warp_filter_horz_lsx vsrari.h vr24, vr11, 3 warp_filter_horz_lsx vsrari.h vr25, vr11, 3 warp_filter_horz_lsx vsrari.h vr26, vr11, 3 warp_filter_horz_lsx vsrari.h vr27, vr11, 3 warp_filter_horz_lsx vsrari.h vr28, vr11, 3 warp_filter_horz_lsx vsrari.h vr29, vr11, 3 warp_filter_horz_lsx vsrari.h vr30, vr11, 3 1: addi.d t6, a6, 0 warp_filter_horz_lsx vsrari.h vr31, vr11, 3 vld_filter_row f0, t6, t2 vld_filter_row f1, t6, t2 vld_filter_row f2, t6, t2 vld_filter_row f3, t6, t2 vld_filter_row f4, t6, t2 vld_filter_row f5, t6, t2 vld_filter_row f6, t6, t2 vld_filter_row f7, t6, t2 transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vmulwev.w.h vr16, vr24, vr0 vmulwod.w.h vr17, vr24, vr0 vmaddwev.w.h vr16, vr25, vr1 vmaddwod.w.h vr17, vr25, vr1 
vmaddwev.w.h vr16, vr26, vr2 vmaddwod.w.h vr17, vr26, vr2 vmaddwev.w.h vr16, vr27, vr3 vmaddwod.w.h vr17, vr27, vr3 vmaddwev.w.h vr16, vr28, vr4 vmaddwod.w.h vr17, vr28, vr4 vmaddwev.w.h vr16, vr29, vr5 vmaddwod.w.h vr17, vr29, vr5 vmaddwev.w.h vr16, vr30, vr6 vmaddwod.w.h vr17, vr30, vr6 vmaddwev.w.h vr16, vr31, vr7 vmaddwod.w.h vr17, vr31, vr7 vssrarni.h.w vr16, vr16, \shift vssrarni.h.w vr17, vr17, \shift vilvl.h vr16, vr17, vr16 vadd.h vr16, vr16, vr21 vor.v vr24, vr25, vr25 vor.v vr25, vr26, vr26 vor.v vr26, vr27, vr27 vor.v vr27, vr28, vr28 vor.v vr28, vr29, vr29 vor.v vr29, vr30, vr30 vor.v vr30, vr31, vr31 .ifb \t vssrarni.bu.h vr16, vr16, 0 .endif addi.d t7, t7, -1 .ifnb \t vst vr16, a0, 0 .else vstelm.d vr16, a0, 0, 0 .endif add.d a0, a1, a0 add.d a6, a6, a4 blt zero, t7, 1b fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc .endm warp , 11 warp t, 7 .macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3 xvshuf.b xr2, \in0, \in0, \in2 addi.w t4, \in1, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr3, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr4, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr5, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] addi.w t4, t3, 512 srai.w t4, t4, 10 addi.w t4, t4, 64 slli.w t4, t4, 3 vldx vr6, t5, t4 add.w t3, t3, t0 // tmx += abcd[0] xvinsve0.d xr3, xr5, 1 xvinsve0.d xr3, xr4, 2 xvinsve0.d xr3, xr6, 3 xvmulwev.h.bu.b xr4, xr2, xr3 xvmulwod.h.bu.b xr5, xr2, xr3 xvilvl.d xr2, xr5, xr4 xvilvh.d xr3, xr5, xr4 xvhaddw.w.h xr2, xr2, xr2 xvhaddw.w.h xr3, xr3, xr3 xvhaddw.d.w xr2, xr2, xr2 xvhaddw.d.w xr3, xr3, xr3 xvhaddw.q.d xr2, xr2, xr2 xvhaddw.q.d xr3, xr3, xr3 xvextrins.w \out0, xr2, \out1 xvextrins.w \out2, xr3, \out3 .endm .macro 
FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1 add.w \in0, \in0, \in1 addi.w t6, \in0, 512 srai.w t6, t6, 10 addi.w t6, t6, 64 slli.w t6, t6, 3 fldx.d f1, t5, t6 add.w t2, t2, t7 addi.w t6, t2, 512 srai.w t6, t6, 10 addi.w t6, t6, 64 slli.w t6, t6, 3 fldx.d f2, t5, t6 vilvl.d vr0, vr2, vr1 vext2xv.h.b xr0, xr0 xvmulwev.w.h xr3, \in2, xr0 xvmaddwod.w.h xr3, \in2, xr0 xvhaddw.d.w xr3, xr3, xr3 xvhaddw.q.d xr3, xr3, xr3 xvextrins.w \out0, xr3, \out1 .endm const shuf0 .byte 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 .byte 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10 endconst const warp_sh .rept 2 .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 .endr .rept 2 .byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 .endr endconst .macro warp_lasx t, shift function warp_affine_8x8\t\()_8bpc_lasx addi.d sp, sp, -16 ld.h t0, a4, 0 // abcd[0] ld.h t1, a4, 2 // abcd[1] fst.d f24, sp, 0 fst.d f25, sp, 8 alsl.w t2, a3, a3, 1 addi.w t3, a5, 0 la.local t4, warp_sh la.local t5, dav2d_mc_warp_filter sub.d a2, a2, t2 addi.d a2, a2, -3 vld vr0, a2, 0 xvld xr24, t4, 0 xvld xr25, t4, 32 la.local t2, shuf0 xvld xr1, t2, 0 xvpermi.q xr0, xr0, 0x00 xvaddi.bu xr9, xr1, 4 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, 
xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30 xvsrarni.h.w xr12, xr7, 3 xvsrarni.h.w xr13, xr8, 3 xvsrarni.h.w xr14, xr10, 3 xvsrarni.h.w xr15, xr11, 3 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 
FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10 add.w a5, a5, t1 or t3, a5, a5 add.d a2, a2, a3 vld vr0, a2, 0 xvpermi.q xr0, xr0, 0x00 FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20 FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20 xvsrarni.h.w xr16, xr7, 3 xvsrarni.h.w xr17, xr8, 3 xvsrarni.h.w xr18, xr10, 3 xvsrarni.h.w xr19, xr11, 3 addi.w t2, a6, 0 // my ld.h t7, a4, 4 // abcd[2] ld.h t8, a4, 6 // abcd[3] .ifnb \t slli.d a1, a1, 1 .endif // y = 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, \shift xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, 
xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif xvaddi.bu xr25, xr25, 2 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 xvextrins.h xr24, xr25, 
0x70 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr20, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr20, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr20, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr20, 0x30 xvshuf.b xr12, xr16, xr12, xr24 xvshuf.b xr13, xr17, xr13, xr24 xvshuf.b xr14, xr18, xr14, xr24 xvshuf.b xr15, xr19, xr15, xr24 add.w a6, a6, t8 addi.w t2, a6, 0 FILTER_WARP_CLIP_LASX t2, zero, xr12, xr21, 0x00 FILTER_WARP_CLIP_LASX t2, t7, xr13, xr21, 0x10 FILTER_WARP_CLIP_LASX t2, t7, xr14, xr21, 0x20 FILTER_WARP_CLIP_LASX t2, t7, xr15, xr21, 0x30 .ifnb \t xvssrarni.h.w xr21, xr20, \shift alsl.d a0, a1, a0, 1 xvpermi.q xr22, xr21, 0x01 vilvl.h vr23, vr22, vr21 vilvh.h vr21, vr22, vr21 vst vr23, a0, 0 vstx vr21, a0, a1 .else xvssrarni.hu.w xr21, xr20, 11 xvssrlni.bu.h xr22, xr21, 0 xvpermi.q xr23, xr22, 0x01 vilvl.b vr21, vr23, vr22 add.d a0, a0, a1 fst.d f21, a0, 0 add.d a0, a0, a1 vstelm.d vr21, a0, 0, 1 .endif fld.d f24, sp, 0 fld.d f25, sp, 8 addi.d sp, sp, 16 endfunc .endm warp_lasx , 11 warp_lasx t, 7 /* static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, const int weight HIGHBD_DECL_SUFFIX) */ #define bpc8_sh 5 // sh = intermediate_bits + 1 #define bpcw8_sh 8 // sh = intermediate_bits + 4 #define bpc_sh bpc8_sh #define bpcw_sh bpcw8_sh function avg_8bpc_lsx addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .AVG_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 // The jump addresses are relative to AVG_LSX_JRTABLE add.d t1, t1, t2 // Get absolute address jirl $r0, t1, 0 .align 3 .AVG_LSX_JRTABLE: .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE .hword .AVG_W64_LSX - .AVG_LSX_JRTABLE .hword .AVG_W32_LSX - .AVG_LSX_JRTABLE .hword .AVG_W16_LSX - .AVG_LSX_JRTABLE .hword .AVG_W8_LSX - .AVG_LSX_JRTABLE .hword .AVG_W4_LSX - .AVG_LSX_JRTABLE .AVG_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vadd.h vr2, vr0, vr1 vssrarni.bu.h vr3, vr2, bpc_sh vstelm.w vr3, a0, 0, 0 
add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a0, a1 blt zero, a5, .AVG_W4_LSX b .AVG_END_LSX .AVG_W8_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vssrarni.bu.h vr5, vr4, bpc_sh addi.w a5, a5, -2 addi.d a2, a2, 32 vstelm.d vr5, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr5, a0, 0, 1 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .AVG_W8_LSX b .AVG_END_LSX .AVG_W16_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vssrarni.bu.h vr5, vr4, bpc_sh addi.w a5, a5, -1 addi.d a2, a2, 32 vst vr5, a0, 0 addi.d a3, a3, 32 add.d a0, a0, a1 blt zero, a5, .AVG_W16_LSX b .AVG_END_LSX .AVG_W32_LSX: vld vr0, a2, 0 vld vr2, a2, 16 vld vr4, a2, 32 vld vr6, a2, 48 vld vr1, a3, 0 vld vr3, a3, 16 vld vr5, a3, 32 vld vr7, a3, 48 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vadd.h vr4, vr4, vr5 vadd.h vr6, vr6, vr7 vssrarni.bu.h vr2, vr0, bpc_sh vssrarni.bu.h vr6, vr4, bpc_sh addi.w a5, a5, -1 addi.d a2, a2, 64 vst vr2, a0, 0 vst vr6, a0, 16 addi.d a3, a3, 64 add.d a0, a0, a1 blt zero, a5, .AVG_W32_LSX b .AVG_END_LSX .AVG_W64_LSX: .rept 4 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vssrarni.bu.h vr2, vr0, bpc_sh addi.d a2, a2, 32 addi.d a3, a3, 32 vst vr2, a0, 0 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .AVG_W64_LSX b .AVG_END_LSX .AVG_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vadd.h vr0, vr0, vr1 vadd.h vr2, vr2, vr3 vssrarni.bu.h vr2, vr0, bpc_sh addi.d a2, a2, 32 addi.d a3, a3, 32 vst vr2, a0, 0 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .AVG_W128_LSX .AVG_END_LSX: endfunc function avg_8bpc_lasx clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .AVG_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 
jirl $r0, t1, 0 .align 3 .AVG_LASX_JRTABLE: .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE .hword .AVG_W64_LASX - .AVG_LASX_JRTABLE .hword .AVG_W32_LASX - .AVG_LASX_JRTABLE .hword .AVG_W16_LASX - .AVG_LASX_JRTABLE .hword .AVG_W8_LASX - .AVG_LASX_JRTABLE .hword .AVG_W4_LASX - .AVG_LASX_JRTABLE .AVG_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 vadd.h vr0, vr0, vr1 vssrarni.bu.h vr1, vr0, bpc_sh vstelm.w vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr1, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a0, a1 blt zero, a5, .AVG_W4_LASX b .AVG_END_LASX .AVG_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 xvadd.h xr2, xr0, xr1 xvssrarni.bu.h xr1, xr2, bpc_sh xvstelm.d xr1, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr1, a0, 0, 2 addi.w a5, a5, -2 addi.d a2, a2, 32 addi.d a3, a3, 32 add.d a0, a1, a0 blt zero, a5, .AVG_W8_LASX b .AVG_END_LASX .AVG_W16_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvadd.h xr4, xr0, xr1 xvadd.h xr5, xr2, xr3 xvssrarni.bu.h xr5, xr4, bpc_sh xvpermi.d xr2, xr5, 0xd8 xvpermi.d xr3, xr5, 0x8d vst vr2, a0, 0 vstx vr3, a0, a1 addi.w a5, a5, -2 addi.d a2, a2, 64 addi.d a3, a3, 64 alsl.d a0, a1, a0, 1 blt zero, a5, .AVG_W16_LASX b .AVG_END_LASX .AVG_W32_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr1, a3, 0 xvld xr3, a3, 32 xvadd.h xr4, xr0, xr1 xvadd.h xr5, xr2, xr3 xvssrarni.bu.h xr5, xr4, bpc_sh xvpermi.d xr6, xr5, 0xd8 xvst xr6, a0, 0 addi.w a5, a5, -1 addi.d a2, a2, 64 addi.d a3, a3, 64 add.d a0, a0, a1 blt zero, a5, .AVG_W32_LASX b .AVG_END_LASX .AVG_W64_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr4, a2, 64 xvld xr6, a2, 96 xvld xr1, a3, 0 xvld xr3, a3, 32 xvld xr5, a3, 64 xvld xr7, a3, 96 xvadd.h xr0, xr0, xr1 xvadd.h xr2, xr2, xr3 xvadd.h xr4, xr4, xr5 xvadd.h xr6, xr6, xr7 xvssrarni.bu.h xr2, xr0, bpc_sh xvssrarni.bu.h xr6, xr4, bpc_sh xvpermi.d xr1, xr2, 0xd8 xvpermi.d xr3, xr6, 0xd8 xvst xr1, a0, 0 xvst xr3, a0, 32 addi.w a5, a5, -1 addi.d a2, a2, 128 addi.d a3, a3, 128 add.d a0, a0, a1 blt zero, a5, .AVG_W64_LASX b 
.AVG_END_LASX .AVG_W128_LASX: xvld xr0, a2, 0 xvld xr2, a2, 32 xvld xr4, a2, 64 xvld xr6, a2, 96 xvld xr8, a2, 128 xvld xr10, a2, 160 xvld xr12, a2, 192 xvld xr14, a2, 224 xvld xr1, a3, 0 xvld xr3, a3, 32 xvld xr5, a3, 64 xvld xr7, a3, 96 xvld xr9, a3, 128 xvld xr11, a3, 160 xvld xr13, a3, 192 xvld xr15, a3, 224 xvadd.h xr0, xr0, xr1 xvadd.h xr2, xr2, xr3 xvadd.h xr4, xr4, xr5 xvadd.h xr6, xr6, xr7 xvadd.h xr8, xr8, xr9 xvadd.h xr10, xr10, xr11 xvadd.h xr12, xr12, xr13 xvadd.h xr14, xr14, xr15 xvssrarni.bu.h xr2, xr0, bpc_sh xvssrarni.bu.h xr6, xr4, bpc_sh xvssrarni.bu.h xr10, xr8, bpc_sh xvssrarni.bu.h xr14, xr12, bpc_sh xvpermi.d xr1, xr2, 0xd8 xvpermi.d xr3, xr6, 0xd8 xvpermi.d xr5, xr10, 0xd8 xvpermi.d xr7, xr14, 0xd8 xvst xr1, a0, 0 xvst xr3, a0, 32 xvst xr5, a0, 64 xvst xr7, a0, 96 addi.w a5, a5, -1 addi.d a2, a2, 256 addi.d a3, a3, 256 add.d a0, a0, a1 blt zero, a5, .AVG_W128_LASX .AVG_END_LASX: endfunc function w_avg_8bpc_lsx addi.d t8, a0, 0 li.w t2, 16 sub.w t2, t2, a6 // 16 - weight vreplgr2vr.h vr21, a6 vreplgr2vr.h vr22, t2 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .W_AVG_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .W_AVG_LSX_JRTABLE: .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W64_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W32_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W16_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W8_LSX - .W_AVG_LSX_JRTABLE .hword .W_AVG_W4_LSX - .W_AVG_LSX_JRTABLE .W_AVG_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vmulwev.w.h vr2, vr0, vr21 vmulwod.w.h vr3, vr0, vr21 vmaddwev.w.h vr2, vr1, vr22 vmaddwod.w.h vr3, vr1, vr22 vssrarni.hu.w vr3, vr2, bpcw_sh vssrlni.bu.h vr1, vr3, 0 vpickod.w vr4, vr2, vr1 vilvl.b vr0, vr4, vr1 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a1, a0 blt zero, a5, .W_AVG_W4_LSX b .W_AVG_END_LSX .W_AVG_W8_LSX: vld vr0, a2, 0 vld vr1, a3, 0 vmulwev.w.h vr2, vr0, vr21 vmulwod.w.h 
vr3, vr0, vr21
    /* Rest of one .W_AVG_W8_LSX iteration of w_avg_8bpc_lsx. The even-lane
     * products with vr21 are already in vr2; the operands above complete the
     * odd-lane multiply into vr3. vr21 / vr22 were filled at function entry
     * with a6 and (16 - a6), replicated per halfword. */
    vmaddwev.w.h vr2, vr1, vr22
    vmaddwod.w.h vr3, vr1, vr22
    /* Rounding, saturating narrow of the 32-bit sums by bpcw_sh (macro defined
     * earlier in the file), followed by a saturating narrow to bytes. */
    vssrarni.hu.w vr3, vr2, bpcw_sh
    vssrlni.bu.h vr1, vr3, 0
    /* Even and odd lanes were computed separately; presumably this pair puts
     * the bytes back into their original order before the store. */
    vpickod.w vr4, vr2, vr1
    vilvl.b vr0, vr4, vr1
    /* Write 8 bytes at a0, then step: a5 is the remaining-row counter, a2/a3
     * advance by 16 bytes (8 halfwords) and a0 by the stride in a1. */
    fst.d f0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 16
    addi.d a3, a3, 16
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W8_LSX
    b .W_AVG_END_LSX
/* Width 16: each iteration reads 32 bytes (16 halfwords) from a2 and from a3,
 * forms in1 * vr21 + in2 * vr22 per element and writes 16 bytes at a0. */
.W_AVG_W16_LSX:
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    /* Presumably re-interleaves the even/odd results into element order. */
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W16_LSX
    b .W_AVG_END_LSX
/* Width 32: the 16-byte kernel above is unrolled twice per row with .rept,
 * moving a0 forward by 16 after each copy. */
.W_AVG_W32_LSX:
    .rept 2
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a0, a0, 16
    .endr
    addi.w a5, a5, -1
    /* t8 holds the start of the current output row (copied from a0 at function
     * entry); advance it by the stride and restart a0 from there. */
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W32_LSX
    b .W_AVG_END_LSX
/* Width 64: same kernel, unrolled four times per row. */
.W_AVG_W64_LSX:
    .rept 4
    vld vr0, a2, 0
    vld vr2, a2, 16
    vld vr1, a3, 0
    vld vr3, a3, 16
    vmulwev.w.h vr4, vr0, vr21
    vmulwod.w.h vr5, vr0, vr21
    vmulwev.w.h vr6, vr2, vr21
    vmulwod.w.h vr7, vr2, vr21
    vmaddwev.w.h vr4, vr1, vr22
    vmaddwod.w.h vr5, vr1, vr22
    vmaddwev.w.h vr6, vr3, vr22
    vmaddwod.w.h vr7, vr3, vr22
    vssrarni.hu.w vr6, vr4, bpcw_sh
    vssrarni.hu.w vr7, vr5, bpcw_sh
    vssrlrni.bu.h vr7, vr6, 0
    vshuf4i.w vr8, vr7, 0x4E
    vilvl.b vr0, vr8, vr7
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a0, a0,
16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W64_LSX b .W_AVG_END_LSX .W_AVG_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr2, a2, 16 vld vr1, a3, 0 vld vr3, a3, 16 vmulwev.w.h vr4, vr0, vr21 vmulwod.w.h vr5, vr0, vr21 vmulwev.w.h vr6, vr2, vr21 vmulwod.w.h vr7, vr2, vr21 vmaddwev.w.h vr4, vr1, vr22 vmaddwod.w.h vr5, vr1, vr22 vmaddwev.w.h vr6, vr3, vr22 vmaddwod.w.h vr7, vr3, vr22 vssrarni.hu.w vr6, vr4, bpcw_sh vssrarni.hu.w vr7, vr5, bpcw_sh vssrlrni.bu.h vr7, vr6, 0 vshuf4i.w vr8, vr7, 0x4E vilvl.b vr0, vr8, vr7 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a0, a0, 16 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W128_LSX .W_AVG_END_LSX: endfunc function w_avg_8bpc_lasx addi.d t8, a0, 0 li.w t2, 16 sub.w t2, t2, a6 // 16 - weight xvreplgr2vr.h xr21, a6 xvreplgr2vr.h xr22, t2 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .W_AVG_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .W_AVG_LASX_JRTABLE: .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W64_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W32_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W16_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W8_LASX - .W_AVG_LASX_JRTABLE .hword .W_AVG_W4_LASX - .W_AVG_LASX_JRTABLE .W_AVG_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 xvpermi.d xr2, xr0, 0xD8 xvpermi.d xr3, xr1, 0xD8 xvilvl.h xr4, xr3, xr2 xvmulwev.w.h xr0, xr4, xr21 xvmaddwod.w.h xr0, xr4, xr22 xvssrarni.hu.w xr1, xr0, bpcw_sh xvssrlni.bu.h xr0, xr1, 0 fst.s f0, a0, 0 add.d a0, a0, a1 xvstelm.w xr0, a0, 0, 4 addi.w a5, a5, -2 addi.d a2, a2, 16 addi.d a3, a3, 16 add.d a0, a1, a0 blt zero, a5, .W_AVG_W4_LASX b .W_AVG_END_LASX .W_AVG_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 xvmulwev.w.h xr2, xr0, xr21 xvmulwod.w.h xr3, xr0, xr21 xvmaddwev.w.h xr2, xr1, xr22 xvmaddwod.w.h xr3, xr1, xr22 xvssrarni.hu.w xr3, xr2, bpcw_sh xvssrlni.bu.h xr1, xr3, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 xvstelm.d 
xr0, a0, 0, 0
    /* Rest of one .W_AVG_W8_LASX iteration of w_avg_8bpc_lasx. The operands
     * above complete the store of 64-bit element 0 of xr0 at a0; element 2
     * goes to the following row, so one iteration emits two 8-byte rows. */
    add.d a0, a0, a1
    xvstelm.d xr0, a0, 0, 2
    /* a5 is the remaining-row counter; a2/a3 advance by 32 bytes
     * (16 halfwords, i.e. two rows of 8). */
    addi.w a5, a5, -2
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W8_LASX
    b .W_AVG_END_LASX
/* Width 16: one row per iteration. 16 halfwords are read from a2 and from a3,
 * combined as in1 * xr21 + in2 * xr22 (xr21 / xr22 hold a6 and 16 - a6,
 * replicated per halfword at function entry), rounded by bpcw_sh and
 * narrowed to 16 bytes. */
.W_AVG_W16_LASX:
    xvld xr0, a2, 0
    xvld xr1, a3, 0
    xvmulwev.w.h xr2, xr0, xr21
    xvmulwod.w.h xr3, xr0, xr21
    xvmaddwev.w.h xr2, xr1, xr22
    xvmaddwod.w.h xr3, xr1, xr22
    xvssrarni.hu.w xr3, xr2, bpcw_sh
    xvssrlni.bu.h xr1, xr3, 0
    xvpickod.w xr4, xr2, xr1
    xvilvl.b xr0, xr4, xr1
    /* 0xD8 reorders the 64-bit lanes as 0,2,1,3 so the 16 result bytes sit in
     * the low 128 bits, which vst then writes. */
    xvpermi.d xr1, xr0, 0xD8
    vst vr1, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W16_LASX
    /* Fix: leave through this function's own end label. The original branched
     * to .W_AVG_END_LSX, which is defined inside w_avg_8bpc_lsx; every other
     * width path here uses .W_AVG_END_LASX. */
    b .W_AVG_END_LASX
/* Width 32: one row per iteration, 32 halfwords from each input, one 32-byte
 * store. */
.W_AVG_W32_LASX:
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
    xvmulwod.w.h xr5, xr0, xr21
    xvmulwev.w.h xr6, xr2, xr21
    xvmulwod.w.h xr7, xr2, xr21
    xvmaddwev.w.h xr4, xr1, xr22
    xvmaddwod.w.h xr5, xr1, xr22
    xvmaddwev.w.h xr6, xr3, xr22
    xvmaddwod.w.h xr7, xr3, xr22
    xvssrarni.hu.w xr6, xr4, bpcw_sh
    xvssrarni.hu.w xr7, xr5, bpcw_sh
    xvssrlni.bu.h xr7, xr6, 0
    /* Presumably restores element order after the even/odd split and the
     * per-128-bit-lane narrowing. */
    xvshuf4i.w xr8, xr7, 0x4E
    xvilvl.b xr9, xr8, xr7
    xvpermi.d xr0, xr9, 0xD8
    xvst xr0, a0, 0
    addi.w a5, a5, -1
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    add.d a0, a0, a1
    blt zero, a5, .W_AVG_W32_LASX
    b .W_AVG_END_LASX
/* Width 64: the 32-byte kernel is unrolled twice per row with .rept, moving a0
 * forward by 32 after each copy. */
.W_AVG_W64_LASX:
    .rept 2
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
    xvmulwod.w.h xr5, xr0, xr21
    xvmulwev.w.h xr6, xr2, xr21
    xvmulwod.w.h xr7, xr2, xr21
    xvmaddwev.w.h xr4, xr1, xr22
    xvmaddwod.w.h xr5, xr1, xr22
    xvmaddwev.w.h xr6, xr3, xr22
    xvmaddwod.w.h xr7, xr3, xr22
    xvssrarni.hu.w xr6, xr4, bpcw_sh
    xvssrarni.hu.w xr7, xr5, bpcw_sh
    xvssrlni.bu.h xr7, xr6, 0
    xvshuf4i.w xr8, xr7, 0x4E
    xvilvl.b xr9, xr8, xr7
    xvpermi.d xr0, xr9, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a0, a0, 32
    .endr
    addi.w a5, a5, -1
    /* t8 holds the start of the current output row (copied from a0 at function
     * entry); advance it by the stride and restart a0 from there. */
    add.d t8, t8, a1
    add.d a0, t8, zero
    blt zero, a5, .W_AVG_W64_LASX
    b .W_AVG_END_LASX
/* Width 128: same kernel, unrolled four times per row. */
.W_AVG_W128_LASX:
    .rept 4
    xvld xr0, a2, 0
    xvld xr2, a2, 32
    xvld xr1, a3, 0
    xvld xr3, a3, 32
    xvmulwev.w.h xr4, xr0, xr21
xvmulwod.w.h xr5, xr0, xr21 xvmulwev.w.h xr6, xr2, xr21 xvmulwod.w.h xr7, xr2, xr21 xvmaddwev.w.h xr4, xr1, xr22 xvmaddwod.w.h xr5, xr1, xr22 xvmaddwev.w.h xr6, xr3, xr22 xvmaddwod.w.h xr7, xr3, xr22 xvssrarni.hu.w xr6, xr4, bpcw_sh xvssrarni.hu.w xr7, xr5, bpcw_sh xvssrlni.bu.h xr7, xr6, 0 xvshuf4i.w xr8, xr7, 0x4E xvilvl.b xr9, xr8, xr7 xvpermi.d xr0, xr9, 0xD8 xvst xr0, a0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a0, a0, 32 .endr addi.w a5, a5, -1 add.d t8, t8, a1 add.d a0, t8, zero blt zero, a5, .W_AVG_W128_LASX .W_AVG_END_LASX: endfunc #undef bpc_sh #undef bpcw_sh #define mask_sh 10 /* static void mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, const uint8_t *mask HIGHBD_DECL_SUFFIX) */ function mask_8bpc_lsx vldi vr21, 0x440 // 64 vxor.v vr19, vr19, vr19 addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .MASK_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .MASK_LSX_JRTABLE: .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE .hword .MASK_W64_LSX - .MASK_LSX_JRTABLE .hword .MASK_W32_LSX - .MASK_LSX_JRTABLE .hword .MASK_W16_LSX - .MASK_LSX_JRTABLE .hword .MASK_W8_LSX - .MASK_LSX_JRTABLE .hword .MASK_W4_LSX - .MASK_LSX_JRTABLE .MASK_W4_LSX: vld vr0, a2, 0 vld vr1, a3, 0 fld.d f22, a6, 0 vilvl.b vr2, vr19, vr22 vsub.h vr3, vr21, vr2 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vssrarni.hu.w vr5, vr4, mask_sh vssrlrni.bu.h vr1, vr5, 0 vpickod.w vr4, vr2, vr1 vilvl.b vr0, vr4, vr1 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 addi.d a2, a2, 16 addi.d a3, a3, 16 addi.d a6, a6, 8 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W4_LSX b .MASK_END_LSX .MASK_W8_LSX: vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h 
vr5, vr0, vr2
    /* Rest of one .MASK_W8_LSX iteration of mask_8bpc_lsx. vr2 / vr12 hold the
     * mask bytes from a6 widened to halfwords (interleaved with the zero
     * register vr19); vr3 / vr13 hold vr21 - mask, where vr21 was loaded with
     * 64 at function entry. The multiply-accumulate below therefore forms
     * in1 * m + in2 * (64 - m) in 32-bit lanes. */
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    /* Rounding, saturating narrow by mask_sh (10), then narrow to bytes. */
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    /* Presumably re-interleaves the even/odd results into element order. */
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    /* Two 8-byte rows per iteration: low half at a0, high half one stride on. */
    fst.d f0, a0, 0
    add.d a0, a0, a1
    vstelm.d vr0, a0, 0, 1
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -2
    blt zero, a5, .MASK_W8_LSX
    b .MASK_END_LSX
/* Width 16: one row per iteration. Reads 16 halfwords from a2 and from a3 and
 * 16 mask bytes from a6, writes 16 bytes at a0. */
.MASK_W16_LSX:
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0
    /* Widen the mask bytes to halfwords (vr19 is zero) and build 64 - mask. */
    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12
    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    add.d a0, a0, a1
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W16_LSX
    b .MASK_END_LSX
/* Width 32: the 16-byte kernel is unrolled twice per row with .rept, moving a0
 * forward by 16 after each copy. */
.MASK_W32_LSX:
    .rept 2
    vld vr0, a2, 0
    vld vr10, a2, 16
    vld vr1, a3, 0
    vld vr11, a3, 16
    vld vr22, a6, 0
    vilvl.b vr2, vr19, vr22
    vilvh.b vr12, vr19, vr22
    vsub.h vr3, vr21, vr2
    vsub.h vr13, vr21, vr12
    vmulwev.w.h vr4, vr0, vr2
    vmulwod.w.h vr5, vr0, vr2
    vmulwev.w.h vr14, vr10, vr12
    vmulwod.w.h vr15, vr10, vr12
    vmaddwev.w.h vr4, vr1, vr3
    vmaddwod.w.h vr5, vr1, vr3
    vmaddwev.w.h vr14, vr11, vr13
    vmaddwod.w.h vr15, vr11, vr13
    vssrarni.hu.w vr14, vr4, mask_sh
    vssrarni.hu.w vr15, vr5, mask_sh
    vssrlrni.bu.h vr15, vr14, 0
    vshuf4i.w vr6, vr15, 0x4E
    vilvl.b vr0, vr6, vr15
    vst vr0, a0, 0
    addi.d a2, a2, 32
    addi.d a3, a3, 32
    addi.d a6, a6, 16
    addi.d a0, a0, 16
    .endr
    /* t8 holds the start of the current output row (copied from a0 at function
     * entry); advance it by the stride and restart a0 from there. */
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W32_LSX
    b .MASK_END_LSX
/* Width 64: same kernel; the repeat count follows on the next source line. */
.MASK_W64_LSX:
    .rept
4 vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 addi.d a0, a0, 16 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W64_LSX b .MASK_END_LSX .MASK_W128_LSX: .rept 8 vld vr0, a2, 0 vld vr10, a2, 16 vld vr1, a3, 0 vld vr11, a3, 16 vld vr22, a6, 0 vilvl.b vr2, vr19, vr22 vilvh.b vr12, vr19, vr22 vsub.h vr3, vr21, vr2 vsub.h vr13, vr21, vr12 vmulwev.w.h vr4, vr0, vr2 vmulwod.w.h vr5, vr0, vr2 vmulwev.w.h vr14, vr10, vr12 vmulwod.w.h vr15, vr10, vr12 vmaddwev.w.h vr4, vr1, vr3 vmaddwod.w.h vr5, vr1, vr3 vmaddwev.w.h vr14, vr11, vr13 vmaddwod.w.h vr15, vr11, vr13 vssrarni.hu.w vr14, vr4, mask_sh vssrarni.hu.w vr15, vr5, mask_sh vssrlrni.bu.h vr15, vr14, 0 vshuf4i.w vr6, vr15, 0x4E vilvl.b vr0, vr6, vr15 vst vr0, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 addi.d a0, a0, 16 .endr add.d t8, t8, a1 add.d a0, t8, zero addi.w a5, a5, -1 blt zero, a5, .MASK_W128_LSX .MASK_END_LSX: endfunc function mask_8bpc_lasx xvldi xr21, 0x440 // 64 xvxor.v xr19, xr19, xr19 addi.d t8, a0, 0 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .MASK_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t2, t0, 0 add.d t1, t1, t2 jirl $r0, t1, 0 .align 3 .MASK_LASX_JRTABLE: .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE .hword .MASK_W64_LASX - .MASK_LASX_JRTABLE .hword .MASK_W32_LASX - .MASK_LASX_JRTABLE .hword .MASK_W16_LASX - .MASK_LASX_JRTABLE .hword .MASK_W8_LASX - .MASK_LASX_JRTABLE .hword .MASK_W4_LASX - 
.MASK_LASX_JRTABLE .MASK_W4_LASX: vld vr0, a2, 0 vld vr1, a3, 0 fld.d f22, a6, 0 vilvl.h vr4, vr1, vr0 vilvh.h vr14, vr1, vr0 vilvl.b vr2, vr19, vr22 vsub.h vr3, vr21, vr2 xvpermi.q xr14, xr4, 0x20 vilvl.h vr5, vr3, vr2 vilvh.h vr15, vr3, vr2 xvpermi.q xr15, xr5, 0x20 xvmulwev.w.h xr0, xr14, xr15 xvmaddwod.w.h xr0, xr14, xr15 xvssrarni.hu.w xr1, xr0, mask_sh xvssrlni.bu.h xr2, xr1, 0 fst.s f2, a0, 0 add.d a0, a0, a1 xvstelm.w xr2, a0, 0, 4 addi.d a2, a2, 16 addi.d a3, a3, 16 addi.d a6, a6, 8 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W4_LASX b .MASK_END_LASX .MASK_W8_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 vld vr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvsub.h xr3, xr21, xr2 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvssrarni.hu.w xr5, xr4, mask_sh xvssrlni.bu.h xr1, xr5, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 fst.d f0, a0, 0 add.d a0, a0, a1 xvstelm.d xr0, a0, 0, 2 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .MASK_W8_LASX b .MASK_END_LASX .MASK_W16_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 vld vr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvsub.h xr3, xr21, xr2 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 xvssrarni.hu.w xr5, xr4, mask_sh xvssrlni.bu.h xr1, xr5, 0 xvpickod.w xr4, xr2, xr1 xvilvl.b xr0, xr4, xr1 xvpermi.d xr1, xr0, 0xD8 vst vr1, a0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 16 add.d a0, a0, a1 addi.w a5, a5, -1 blt zero, a5, .MASK_W16_LASX b .MASK_END_LASX .MASK_W32_LASX: xvld xr0, a2, 0 xvld xr10, a2, 32 xvld xr1, a3, 0 xvld xr11, a3, 32 xvld xr22, a6, 0 vext2xv.hu.bu xr2, xr22 xvpermi.q xr4, xr22, 0x01 vext2xv.hu.bu xr12, xr4 xvsub.h xr3, xr21, xr2 xvsub.h xr13, xr21, xr12 xvmulwev.w.h xr4, xr0, xr2 xvmulwod.w.h xr5, xr0, xr2 xvmulwev.w.h xr14, xr10, xr12 xvmulwod.w.h xr15, xr10, xr12 xvmaddwev.w.h xr4, xr1, xr3 xvmaddwod.w.h xr5, xr1, xr3 
    /* Rest of one .MASK_W32_LASX iteration of mask_8bpc_lasx. xr14 / xr15
     * already hold in1 * m for the second 16 elements; add in2 * (64 - m),
     * where xr13 = xr21 - xr12 and xr21 was loaded with 64 at function entry. */
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    /* Rounding, saturating narrow by mask_sh (10), then narrow to bytes. */
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    /* Presumably restores element order after the even/odd split and the
     * per-128-bit-lane narrowing. */
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    /* One 32-byte output row, then advance: 64 bytes per input, 32 mask bytes,
     * one stride (a1) on the output; a5 is the remaining-row counter. */
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    add.d a0, a0, a1
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W32_LASX
    b .MASK_END_LASX
/* Width 64: the 32-byte kernel is unrolled twice per row with .rept, moving a0
 * forward by 32 after each copy. */
.MASK_W64_LASX:
    .rept 2
    xvld xr0, a2, 0
    xvld xr10, a2, 32
    xvld xr1, a3, 0
    xvld xr11, a3, 32
    xvld xr22, a6, 0
    /* Widen 32 mask bytes to halfwords in two halves: the low 16 bytes
     * directly, the other 16 after xvpermi.q brings them into the low 128
     * bits of xr4 (NOTE(review): confirm the 0x01 selector semantics). */
    vext2xv.hu.bu xr2, xr22
    xvpermi.q xr4, xr22, 0x01
    vext2xv.hu.bu xr12, xr4
    xvsub.h xr3, xr21, xr2
    xvsub.h xr13, xr21, xr12
    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmulwev.w.h xr14, xr10, xr12
    xvmulwod.w.h xr15, xr10, xr12
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    addi.d a0, a0, 32
    .endr
    /* t8 holds the start of the current output row (copied from a0 at function
     * entry); advance it by the stride and restart a0 from there. */
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
    blt zero, a5, .MASK_W64_LASX
    b .MASK_END_LASX
/* Width 128: same kernel, unrolled four times per row. */
.MASK_W128_LASX:
    .rept 4
    xvld xr0, a2, 0
    xvld xr10, a2, 32
    xvld xr1, a3, 0
    xvld xr11, a3, 32
    xvld xr22, a6, 0
    vext2xv.hu.bu xr2, xr22
    xvpermi.q xr4, xr22, 0x01
    vext2xv.hu.bu xr12, xr4
    xvsub.h xr3, xr21, xr2
    xvsub.h xr13, xr21, xr12
    xvmulwev.w.h xr4, xr0, xr2
    xvmulwod.w.h xr5, xr0, xr2
    xvmulwev.w.h xr14, xr10, xr12
    xvmulwod.w.h xr15, xr10, xr12
    xvmaddwev.w.h xr4, xr1, xr3
    xvmaddwod.w.h xr5, xr1, xr3
    xvmaddwev.w.h xr14, xr11, xr13
    xvmaddwod.w.h xr15, xr11, xr13
    xvssrarni.hu.w xr14, xr4, mask_sh
    xvssrarni.hu.w xr15, xr5, mask_sh
    xvssrlni.bu.h xr15, xr14, 0
    xvshuf4i.w xr6, xr15, 0x4E
    xvilvl.b xr1, xr6, xr15
    xvpermi.d xr0, xr1, 0xD8
    xvst xr0, a0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 32
    addi.d a0, a0, 32
    .endr
    add.d t8, t8, a1
    add.d a0, t8, zero
    addi.w a5, a5, -1
blt zero, a5, .MASK_W128_LASX .MASK_END_LASX: endfunc /* static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, uint8_t *mask, const int sign, const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) */ function w_mask_420_8bpc_lsx addi.d sp, sp, -24 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 vldi vr20, 0x440 vreplgr2vr.h vr21, a7 vldi vr22, 0x426 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .WMASK420_LSX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t8, t0, 0 add.d t1, t1, t8 jirl $r0, t1, 0 .align 3 .WMASK420_LSX_JRTABLE: .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W64_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W32_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W16_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W8_LSX - .WMASK420_LSX_JRTABLE .hword .WMASK420_W4_LSX - .WMASK420_LSX_JRTABLE .WMASK420_W4_LSX: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a3, 0 vld vr3, a3, 16 addi.w a5, a5, -4 vabsd.h vr4, vr0, vr2 vabsd.h vr5, vr1, vr3 vaddi.hu vr4, vr4, 8 vaddi.hu vr5, vr5, 8 vsrli.h vr4, vr4, 8 vsrli.h vr5, vr5, 8 vadd.h vr4, vr4, vr22 vadd.h vr5, vr5, vr22 vmin.hu vr6, vr4, vr20 vmin.hu vr7, vr5, vr20 vsub.h vr8, vr20, vr6 vsub.h vr9, vr20, vr7 vmulwev.w.h vr4, vr6, vr0 vmulwod.w.h vr5, vr6, vr0 vmulwev.w.h vr10, vr7, vr1 vmulwod.w.h vr11, vr7, vr1 vmaddwev.w.h vr4, vr8, vr2 vmaddwod.w.h vr5, vr8, vr2 vmaddwev.w.h vr10, vr9, vr3 vmaddwod.w.h vr11, vr9, vr3 vilvl.w vr0, vr5, vr4 vilvh.w vr1, vr5, vr4 vilvl.w vr2, vr11, vr10 vilvh.w vr3, vr11, vr10 vssrarni.hu.w vr1, vr0, 10 vssrarni.hu.w vr3, vr2, 10 vssrlni.bu.h vr3, vr1, 0 vstelm.w vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 2 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 3 add.d a0, a0, a1 vpickev.h vr0, vr7, vr6 vpickod.h vr1, vr7, vr6 vadd.h vr0, vr0, vr1 vshuf4i.h vr0, vr0, 0xd8 vhaddw.w.h vr2, vr0, vr0 vpickev.h vr2, vr2, vr2 vsub.h vr2, vr2, vr21 vaddi.hu vr2, vr2, 2 vssrani.bu.h 
vr2, vr2, 2 vstelm.w vr2, a6, 0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W4_LSX b .END_W420 .WMASK420_W8_LSX: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a3, 0 vld vr3, a3, 16 addi.w a5, a5, -2 vabsd.h vr4, vr0, vr2 vabsd.h vr5, vr1, vr3 vaddi.hu vr4, vr4, 8 vaddi.hu vr5, vr5, 8 vsrli.h vr4, vr4, 8 vsrli.h vr5, vr5, 8 vadd.h vr4, vr4, vr22 vadd.h vr5, vr5, vr22 vmin.hu vr6, vr4, vr20 vmin.hu vr7, vr5, vr20 vsub.h vr8, vr20, vr6 vsub.h vr9, vr20, vr7 vmulwev.w.h vr4, vr6, vr0 vmulwod.w.h vr5, vr6, vr0 vmulwev.w.h vr10, vr7, vr1 vmulwod.w.h vr11, vr7, vr1 vmaddwev.w.h vr4, vr8, vr2 vmaddwod.w.h vr5, vr8, vr2 vmaddwev.w.h vr10, vr9, vr3 vmaddwod.w.h vr11, vr9, vr3 vssrarni.hu.w vr10, vr4, 10 vssrarni.hu.w vr11, vr5, 10 vssrlni.bu.h vr11, vr10, 0 vshuf4i.w vr0, vr11, 0x4E vilvl.b vr3, vr0, vr11 vstelm.d vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr3, a0, 0, 1 add.d a0, a0, a1 vpickev.h vr0, vr7, vr6 vpickod.h vr1, vr7, vr6 vadd.h vr0, vr0, vr1 vilvh.d vr2, vr0, vr0 vadd.h vr2, vr2, vr0 vsub.h vr2, vr2, vr21 vaddi.hu vr2, vr2, 2 vssrani.bu.h vr2, vr2, 2 vstelm.w vr2, a6, 0, 0 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W8_LSX b .END_W420 .WMASK420_W16_LSX: vld vr0, a2, 0 vld vr1, a2, 16 alsl.d a2, a4, a2, 1 vld vr2, a2, 0 vld vr3, a2, 16 vld vr4, a3, 0 vld vr5, a3, 16 alsl.d a3, a4, a3, 1 vld vr6, a3, 0 vld vr7, a3, 16 vabsd.h vr8, vr0, vr4 vabsd.h vr9, vr1, vr5 vabsd.h vr10, vr2, vr6 vabsd.h vr11, vr3, vr7 vaddi.hu vr8, vr8, 8 vaddi.hu vr9, vr9, 8 vaddi.hu vr10, vr10, 8 vaddi.hu vr11, vr11, 8 vsrli.h vr8, vr8, 8 vsrli.h vr9, vr9, 8 vsrli.h vr10, vr10, 8 vsrli.h vr11, vr11, 8 vadd.h vr8, vr8, vr22 vadd.h vr9, vr9, vr22 vadd.h vr10, vr10, vr22 vadd.h vr11, vr11, vr22 vmin.hu vr12, vr8, vr20 vmin.hu vr13, vr9, vr20 vmin.hu vr14, vr10, vr20 vmin.hu vr15, vr11, vr20 vsub.h vr16, vr20, vr12 vsub.h vr17, vr20, vr13 vsub.h vr18, vr20, vr14 vsub.h vr19, vr20, vr15 vmulwev.w.h vr8, vr12, vr0 vmulwod.w.h vr9, vr12, 
vr0 vmulwev.w.h vr10, vr13, vr1 vmulwod.w.h vr11, vr13, vr1 vmulwev.w.h vr23, vr14, vr2 vmulwod.w.h vr24, vr14, vr2 vmulwev.w.h vr25, vr15, vr3 vmulwod.w.h vr26, vr15, vr3 vmaddwev.w.h vr8, vr16, vr4 vmaddwod.w.h vr9, vr16, vr4 vmaddwev.w.h vr10, vr17, vr5 vmaddwod.w.h vr11, vr17, vr5 vmaddwev.w.h vr23, vr18, vr6 vmaddwod.w.h vr24, vr18, vr6 vmaddwev.w.h vr25, vr19, vr7 vmaddwod.w.h vr26, vr19, vr7 vssrarni.hu.w vr10, vr8, 10 vssrarni.hu.w vr11, vr9, 10 vssrarni.hu.w vr25, vr23, 10 vssrarni.hu.w vr26, vr24, 10 vssrlni.bu.h vr11, vr10, 0 vssrlni.bu.h vr26, vr25, 0 vshuf4i.w vr0, vr11, 0x4E vshuf4i.w vr1, vr26, 0x4E vilvl.b vr3, vr0, vr11 vilvl.b vr7, vr1, vr26 vst vr3, a0, 0 vstx vr7, a0, a1 vpickev.h vr0, vr13, vr12 vpickod.h vr1, vr13, vr12 vpickev.h vr2, vr15, vr14 vpickod.h vr3, vr15, vr14 vadd.h vr4, vr0, vr1 vadd.h vr5, vr2, vr3 vadd.h vr4, vr4, vr5 vsub.h vr4, vr4, vr21 vssrarni.bu.h vr4, vr4, 2 vstelm.d vr4, a6, 0, 0 alsl.d a2, a4, a2, 1 alsl.d a3, a4, a3, 1 alsl.d a0, a1, a0, 1 addi.d a6, a6, 8 addi.w a5, a5, -2 blt zero, a5, .WMASK420_W16_LSX b .END_W420 .WMASK420_W32_LSX: .WMASK420_W64_LSX: .WMASK420_W128_LSX: .LOOP_W32_420_LSX: add.d t1, a2, zero add.d t2, a3, zero add.d t3, a0, zero add.d t4, a6, zero alsl.d t5, a4, t1, 1 alsl.d t6, a4, t2, 1 or t7, a4, a4 .W32_420_LSX: vld vr0, t1, 0 vld vr1, t1, 16 vld vr2, t2, 0 vld vr3, t2, 16 vld vr4, t5, 0 vld vr5, t5, 16 vld vr6, t6, 0 vld vr7, t6, 16 addi.d t1, t1, 32 addi.d t2, t2, 32 addi.d t5, t5, 32 addi.d t6, t6, 32 addi.w t7, t7, -16 vabsd.h vr8, vr0, vr2 vabsd.h vr9, vr1, vr3 vabsd.h vr10, vr4, vr6 vabsd.h vr11, vr5, vr7 vaddi.hu vr8, vr8, 8 vaddi.hu vr9, vr9, 8 vaddi.hu vr10, vr10, 8 vaddi.hu vr11, vr11, 8 vsrli.h vr8, vr8, 8 vsrli.h vr9, vr9, 8 vsrli.h vr10, vr10, 8 vsrli.h vr11, vr11, 8 vadd.h vr8, vr8, vr22 vadd.h vr9, vr9, vr22 vadd.h vr10, vr10, vr22 vadd.h vr11, vr11, vr22 vmin.hu vr12, vr8, vr20 vmin.hu vr13, vr9, vr20 vmin.hu vr14, vr10, vr20 vmin.hu vr15, vr11, vr20 vsub.h vr16, vr20, vr12 
vsub.h vr17, vr20, vr13 vsub.h vr18, vr20, vr14 vsub.h vr19, vr20, vr15 vmulwev.w.h vr8, vr12, vr0 vmulwod.w.h vr9, vr12, vr0 vmulwev.w.h vr10, vr13, vr1 vmulwod.w.h vr11, vr13, vr1 vmulwev.w.h vr23, vr14, vr4 vmulwod.w.h vr24, vr14, vr4 vmulwev.w.h vr25, vr15, vr5 vmulwod.w.h vr26, vr15, vr5 vmaddwev.w.h vr8, vr16, vr2 vmaddwod.w.h vr9, vr16, vr2 vmaddwev.w.h vr10, vr17, vr3 vmaddwod.w.h vr11, vr17, vr3 vmaddwev.w.h vr23, vr18, vr6 vmaddwod.w.h vr24, vr18, vr6 vmaddwev.w.h vr25, vr19, vr7 vmaddwod.w.h vr26, vr19, vr7 vssrarni.hu.w vr10, vr8, 10 vssrarni.hu.w vr11, vr9, 10 vssrarni.hu.w vr25, vr23, 10 vssrarni.hu.w vr26, vr24, 10 vssrlni.bu.h vr11, vr10, 0 vssrlni.bu.h vr26, vr25, 0 vshuf4i.w vr8, vr11, 0x4E vshuf4i.w vr9, vr26, 0x4E vilvl.b vr3, vr8, vr11 vilvl.b vr7, vr9, vr26 vst vr3, t3, 0 vstx vr7, a1, t3 addi.d t3, t3, 16 vpickev.h vr8, vr13, vr12 vpickod.h vr9, vr13, vr12 vpickev.h vr10, vr15, vr14 vpickod.h vr11, vr15, vr14 vadd.h vr8, vr8, vr9 vadd.h vr10, vr10, vr11 vadd.h vr12, vr8, vr10 vsub.h vr12, vr12, vr21 vssrarni.bu.h vr12, vr12, 2 vstelm.d vr12, t4, 0, 0 addi.d t4, t4, 8 bne t7, zero, .W32_420_LSX alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 alsl.d a0, a1, a0, 1 srai.w t8, a4, 1 add.d a6, a6, t8 addi.w a5, a5, -2 blt zero, a5, .LOOP_W32_420_LSX .END_W420: fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 addi.d sp, sp, 24 endfunc function w_mask_420_8bpc_lasx xvldi xr20, 0x440 xvreplgr2vr.h xr21, a7 xvldi xr22, 0x426 clz.w t0, a4 li.w t1, 24 sub.w t0, t0, t1 la.local t1, .WMASK420_LASX_JRTABLE alsl.d t0, t0, t1, 1 ld.h t8, t0, 0 add.d t1, t1, t8 jirl $r0, t1, 0 .align 3 .WMASK420_LASX_JRTABLE: .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W64_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W32_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W16_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W8_LASX - .WMASK420_LASX_JRTABLE .hword .WMASK420_W4_LASX - .WMASK420_LASX_JRTABLE .WMASK420_W4_LASX: xvld xr0, a2, 0 xvld xr1, a3, 0 
addi.w a5, a5, -4 xvabsd.h xr2, xr0, xr1 xvaddi.hu xr2, xr2, 8 xvsrli.h xr2, xr2, 8 xvadd.h xr2, xr2, xr22 xvmin.hu xr3, xr2, xr20 xvsub.h xr4, xr20, xr3 xvmulwev.w.h xr5, xr3, xr0 xvmulwod.w.h xr6, xr3, xr0 xvmaddwev.w.h xr5, xr4, xr1 xvmaddwod.w.h xr6, xr4, xr1 xvilvl.w xr7, xr6, xr5 xvilvh.w xr8, xr6, xr5 xvssrarni.hu.w xr8, xr7, 10 xvssrlni.bu.h xr9, xr8, 0 vstelm.w vr9, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr9, a0, 0, 1 add.d a0, a0, a1 xvstelm.w xr9, a0, 0, 4 add.d a0, a0, a1 xvstelm.w xr9, a0, 0, 5 add.d a0, a0, a1 xvhaddw.w.h xr3, xr3, xr3 xvpermi.d xr4, xr3, 0xb1 xvadd.h xr3, xr3, xr4 xvpickev.h xr3, xr3, xr3 xvsub.h xr3, xr3, xr21 xvssrarni.bu.h xr3, xr3, 2 vstelm.h vr3, a6, 0, 0 xvstelm.h xr3, a6, 2, 8 addi.d a2, a2, 32 addi.d a3, a3, 32 addi.d a6, a6, 4 blt zero, a5, .WMASK420_W4_LASX b .END_W420_LASX .WMASK420_W8_LASX: xvld xr0, a2, 0 xvld xr1, a2, 32 xvld xr2, a3, 0 xvld xr3, a3, 32 addi.w a5, a5, -4 xvabsd.h xr4, xr0, xr2 xvabsd.h xr5, xr1, xr3 xvaddi.hu xr4, xr4, 8 xvaddi.hu xr5, xr5, 8 xvsrli.h xr4, xr4, 8 xvsrli.h xr5, xr5, 8 xvadd.h xr4, xr4, xr22 xvadd.h xr5, xr5, xr22 xvmin.hu xr6, xr4, xr20 xvmin.hu xr7, xr5, xr20 xvsub.h xr8, xr20, xr6 xvsub.h xr9, xr20, xr7 xvmulwev.w.h xr10, xr6, xr0 xvmulwod.w.h xr11, xr6, xr0 xvmulwev.w.h xr12, xr7, xr1 xvmulwod.w.h xr13, xr7, xr1 xvmaddwev.w.h xr10, xr8, xr2 xvmaddwod.w.h xr11, xr8, xr2 xvmaddwev.w.h xr12, xr9, xr3 xvmaddwod.w.h xr13, xr9, xr3 xvssrarni.hu.w xr12, xr10, 10 xvssrarni.hu.w xr13, xr11, 10 xvssrlni.bu.h xr13, xr12, 0 xvshuf4i.w xr1, xr13, 0x4E xvilvl.b xr17, xr1, xr13 vstelm.d vr17, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 2 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 1 add.d a0, a0, a1 xvstelm.d xr17, a0, 0, 3 add.d a0, a0, a1 xvhaddw.w.h xr6, xr6, xr6 xvhaddw.w.h xr7, xr7, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.q xr9, xr8, 0x01 vadd.h vr8, vr8, vr9 vsub.h vr8, vr8, vr21 vssrarni.bu.h vr8, vr8, 2 vstelm.d vr8, a6, 0, 0 addi.d a2, a2, 64 addi.d a3, a3, 64 addi.d a6, a6, 8 blt zero, a5, 
.WMASK420_W8_LASX
    /* The label above is the target operand of the loop branch that closes the
     * width-8 path of w_mask_420_8bpc_lasx; fall out to the common exit. */
    b .END_W420_LASX
/* Width 16: two rows per iteration (a5 -= 2). xr0 / xr1 are rows 0 and 1 read
 * from a2, xr2 / xr3 the same rows from a3. Constants set at function entry:
 * xr20 = xvldi 0x440 (64), xr22 = xvldi 0x426 (NOTE(review): presumably 38),
 * xr21 = a7 replicated per halfword. */
.WMASK420_W16_LASX:
    xvld xr0, a2, 0
    xvld xr1, a2, 32
    xvld xr2, a3, 0
    xvld xr3, a3, 32
    addi.w a5, a5, -2
    /* Per-element weight m = min(((|in1 - in2| + 8) >> 8) + xr22, xr20). */
    xvabsd.h xr4, xr0, xr2
    xvabsd.h xr5, xr1, xr3
    xvaddi.hu xr4, xr4, 8
    xvaddi.hu xr5, xr5, 8
    xvsrli.h xr4, xr4, 8
    xvsrli.h xr5, xr5, 8
    xvadd.h xr4, xr4, xr22
    xvadd.h xr5, xr5, xr22
    xvmin.hu xr4, xr4, xr20
    xvmin.hu xr5, xr5, xr20
    /* xr6 / xr7 = xr20 - m, the complementary weight. */
    xvsub.h xr6, xr20, xr4
    xvsub.h xr7, xr20, xr5
    /* Blend: in1 * m + in2 * (xr20 - m) in 32-bit lanes, rounding shift by 10,
     * then saturating narrow to bytes. */
    xvmulwev.w.h xr8, xr4, xr0
    xvmulwod.w.h xr9, xr4, xr0
    xvmulwev.w.h xr10, xr5, xr1
    xvmulwod.w.h xr11, xr5, xr1
    xvmaddwev.w.h xr8, xr6, xr2
    xvmaddwod.w.h xr9, xr6, xr2
    xvmaddwev.w.h xr10, xr7, xr3
    xvmaddwod.w.h xr11, xr7, xr3
    xvssrarni.hu.w xr10, xr8, 10
    xvssrarni.hu.w xr11, xr9, 10
    xvssrlni.bu.h xr11, xr10, 0
    /* Presumably restores element order after the even/odd split. */
    xvshuf4i.w xr8, xr11, 0x4E
    xvilvl.b xr15, xr8, xr11
    xvpermi.d xr16, xr15, 0xd8
    /* Low 128 bits go to the first row; the two halves are then swapped and
     * the other 16 bytes go to the next row. */
    vst vr16, a0, 0
    add.d a0, a0, a1
    xvpermi.q xr16, xr16, 0x01
    vst vr16, a0, 0
    add.d a0, a0, a1
    /* Mask output: sum horizontally adjacent weights of each row, add the two
     * rows, subtract xr21, rounding shift by 2, and store 8 bytes at a6. */
    xvhaddw.w.h xr4, xr4, xr4
    xvhaddw.w.h xr5, xr5, xr5
    xvadd.h xr4, xr5, xr4
    xvpickev.h xr6, xr4, xr4
    xvpermi.d xr7, xr6, 0x08
    vsub.h vr7, vr7, vr21
    vssrarni.bu.h vr7, vr7, 2
    vstelm.d vr7, a6, 0, 0
    addi.d a2, a2, 64
    addi.d a3, a3, 64
    addi.d a6, a6, 8
    blt zero, a5, .WMASK420_W16_LASX
    b .END_W420_LASX
/* Widths 32, 64 and 128 share one implementation: an outer loop over row
 * pairs and an inner loop over 16-column chunks. */
.WMASK420_W32_LASX:
.WMASK420_W64_LASX:
.WMASK420_W128_LASX:
.LOOP_W32_420_LASX:
    /* Working copies for this row pair: t1 / t2 walk the current row of a2 /
     * a3, t5 / t6 start 2 * a4 bytes later (the following row of a4
     * halfwords), t3 is the output cursor, t4 the mask cursor and t7 the
     * number of columns left (initialised from a4). */
    add.d t1, a2, zero
    add.d t2, a3, zero
    add.d t3, a0, zero
    add.d t4, a6, zero
    alsl.d t5, a4, t1, 1
    alsl.d t6, a4, t2, 1
    or t7, a4, a4
.W32_420_LASX:
    /* One chunk: 16 halfwords from each of the two rows of both inputs. */
    xvld xr0, t1, 0
    xvld xr1, t2, 0
    xvld xr2, t5, 0
    xvld xr3, t6, 0
    addi.d t1, t1, 32
    addi.d t2, t2, 32
    addi.d t5, t5, 32
    addi.d t6, t6, 32
    addi.w t7, t7, -16
    /* Same weight derivation as the width-16 path; here xr6 / xr7 keep m and
     * xr8 / xr9 hold xr20 - m. */
    xvabsd.h xr4, xr0, xr1
    xvabsd.h xr5, xr2, xr3
    xvaddi.hu xr4, xr4, 8
    xvaddi.hu xr5, xr5, 8
    xvsrli.h xr4, xr4, 8
    xvsrli.h xr5, xr5, 8
    xvadd.h xr4, xr4, xr22
    xvadd.h xr5, xr5, xr22
    xvmin.hu xr6, xr4, xr20
    xvmin.hu xr7, xr5, xr20
    xvsub.h xr8, xr20, xr6
    xvsub.h xr9, xr20, xr7
    xvmulwev.w.h xr10, xr6, xr0
    xvmulwod.w.h xr11, xr6, xr0
    xvmulwev.w.h xr12, xr7, xr2
    xvmulwod.w.h xr13, xr7, xr2
    xvmaddwev.w.h xr10, xr8, xr1
xvmaddwod.w.h xr11, xr8, xr1 xvmaddwev.w.h xr12, xr9, xr3 xvmaddwod.w.h xr13, xr9, xr3 xvssrarni.hu.w xr12, xr10, 10 xvssrarni.hu.w xr13, xr11, 10 xvssrlni.bu.h xr13, xr12, 0 xvshuf4i.w xr10, xr13, 0x4E xvilvl.b xr17, xr10, xr13 xvpermi.d xr18, xr17, 0x08 xvpermi.d xr19, xr17, 0x0d vst vr18, t3, 0 vstx vr19, t3, a1 addi.d t3, t3, 16 xvhaddw.w.h xr6, xr6, xr6 xvhaddw.w.h xr7, xr7, xr7 xvadd.h xr6, xr7, xr6 xvpickev.h xr7, xr6, xr6 xvpermi.d xr8, xr7, 0x08 vsub.h vr9, vr8, vr21 vssrarni.bu.h vr9, vr9, 2 vstelm.d vr9, t4, 0, 0 addi.d t4, t4, 8 bne t7, zero, .W32_420_LASX alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 alsl.d a0, a1, a0, 1 srai.w t8, a4, 1 add.d a6, a6, t8 addi.w a5, a5, -2 blt zero, a5, .LOOP_W32_420_LASX .END_W420_LASX: endfunc #undef bpc_sh #undef bpcw_sh .macro vhaddw.d.h in0 vhaddw.w.h \in0, \in0, \in0 vhaddw.d.w \in0, \in0, \in0 .endm .macro vhaddw.q.w in0 vhaddw.d.w \in0, \in0, \in0 vhaddw.q.d \in0, \in0, \in0 .endm .macro PUT_H_8W in0 vshuf.b vr2, \in0, \in0, vr6 vshuf.b vr3, \in0, \in0, vr7 vshuf.b vr4, \in0, \in0, vr8 vmulwev.h.bu.b vr12, vr2, vr10 vmulwev.h.bu.b vr13, vr3, vr11 vmulwev.h.bu.b vr14, vr3, vr10 vmulwev.h.bu.b vr15, vr4, vr11 vmaddwod.h.bu.b vr12, vr2, vr10 vmaddwod.h.bu.b vr13, vr3, vr11 vmaddwod.h.bu.b vr14, vr3, vr10 vmaddwod.h.bu.b vr15, vr4, vr11 vadd.h vr12, vr12, vr13 vadd.h vr14, vr14, vr15 vhaddw.w.h vr12, vr12, vr12 vhaddw.w.h vr14, vr14, vr14 vpickev.h \in0, vr14, vr12 vadd.h \in0, \in0, vr9 .endm const subpel_h_shuf0 .byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20 endconst const subpel_h_shuf1 .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 endconst const subpel_h_shuf2 .byte 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 .byte 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 endconst const subpel_h_shuf3 .byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 .byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 endconst .macro FILTER_8TAP_8W in0 vshuf.b vr13, \in0, \in0, vr7 vshuf.b vr14, 
\in0, \in0, vr11 vshuf.b vr15, \in0, \in0, vr12 vmulwev.h.bu.b vr16, vr13, vr8 vmulwev.h.bu.b vr17, vr14, vr10 vmulwev.h.bu.b vr18, vr14, vr8 vmulwev.h.bu.b vr19, vr15, vr10 vmaddwod.h.bu.b vr16, vr13, vr8 vmaddwod.h.bu.b vr17, vr14, vr10 vmaddwod.h.bu.b vr18, vr14, vr8 vmaddwod.h.bu.b vr19, vr15, vr10 vadd.h vr16, vr16, vr17 vadd.h vr18, vr18, vr19 vhaddw.w.h vr16, vr16, vr16 vhaddw.w.h \in0, vr18, vr18 vssrarni.h.w \in0, vr16, 2 .endm .macro PUT_8TAP_8BPC_LSX lable li.w t0, 4 la.local t6, dav2d_mc_subpel_filters slli.d t2, a3, 1 //src_stride*2 add.d t3, t2, a3 //src_stride*3 slli.d t4, t2, 1 //src_stride*4 bnez a6, .l_\lable\()put_h //mx bnez a7, .l_\lable\()put_v //my clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_hv0_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_hv0_jtable: .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_64w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_32w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_16w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_8w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_4w - .l_\lable\()put_hv0_jtable .dword .l_\lable\()put_hv0_2w - .l_\lable\()put_hv0_jtable .l_\lable\()put_hv0_2w: vldrepl.h vr0, a2, 0 add.d a2, a2, a3 vldrepl.h vr1, a2, 0 vstelm.h vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr1, a0, 0, 0 add.d a2, a2, a3 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_2w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_4w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fst.s f0, a0, 0 fstx.s f1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_4w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_8w: fld.d f0, a2, 0 fldx.d f1, a2, a3 fst.d f0, a0, 0 fstx.d f1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_8w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_16w: vld vr0, 
a2, 0 vldx vr1, a2, a3 vst vr0, a0, 0 vstx vr1, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_16w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_32w: vld vr0, a2, 0 vld vr1, a2, 16 add.d a2, a2, a3 vld vr2, a2, 0 vld vr3, a2, 16 vst vr0, a0, 0 vst vr1, a0, 16 add.d a0, a0, a1 vst vr2, a0, 0 vst vr3, a0, 16 add.d a2, a2, a3 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_32w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_64w: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 add.d a2, a2, a3 vld vr4, a2, 0 vld vr5, a2, 16 vld vr6, a2, 32 vld vr7, a2, 48 add.d a2, a2, a3 vst vr0, a0, 0 vst vr1, a0, 16 vst vr2, a0, 32 vst vr3, a0, 48 add.d a0, a0, a1 vst vr4, a0, 0 vst vr5, a0, 16 vst vr6, a0, 32 vst vr7, a0, 48 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_64w b .l_\lable\()end_put_8tap .l_\lable\()put_hv0_128w: vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a2, 32 vld vr3, a2, 48 vld vr4, a2, 64 vld vr5, a2, 80 vld vr6, a2, 96 vld vr7, a2, 112 add.d a2, a2, a3 vld vr8, a2, 0 vld vr9, a2, 16 vld vr10, a2, 32 vld vr11, a2, 48 vld vr12, a2, 64 vld vr13, a2, 80 vld vr14, a2, 96 vld vr15, a2, 112 add.d a2, a2, a3 vst vr0, a0, 0 vst vr1, a0, 16 vst vr2, a0, 32 vst vr3, a0, 48 vst vr4, a0, 64 vst vr5, a0, 80 vst vr6, a0, 96 vst vr7, a0, 112 add.d a0, a0, a1 vst vr8, a0, 0 vst vr9, a0, 16 vst vr10, a0, 32 vst vr11, a0, 48 vst vr12, a0, 64 vst vr13, a0, 80 vst vr14, a0, 96 vst vr15, a0, 112 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv0_128w b .l_\lable\()end_put_8tap .l_\lable\()put_h: bnez a7, .l_\lable\()put_hv //if(fh) && if (fv) ld.d t5, sp, 0 //filter_type andi t1, t5, 3 blt t0, a4, .l_\lable\()put_h_idx_fh andi t1, t5, 1 addi.w t1, t1, 3 .l_\lable\()put_h_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t7, t6, t1 //fh's offset li.w t1, 34 vreplgr2vr.h vr9, t1 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 
la.local t5, .l_\lable\()put_h_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_h_jtable: .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_64w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_32w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_16w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_8w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_4w - .l_\lable\()put_h_jtable .dword .l_\lable\()put_h_2w - .l_\lable\()put_h_jtable .l_\lable\()put_h_2w: addi.d t7, t7, 2 addi.d a2, a2, -1 vldrepl.w vr8, t7, 0 la.local t7, subpel_h_shuf0 vld vr7, t7, 0 .l_\lable\()put_h_2w_loop: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 vshuf.b vr0, vr1, vr0, vr7 vdp2.h.bu.b vr1, vr0, vr8 vhaddw.w.h vr0, vr1, vr1 vpickev.h vr0, vr0, vr0 vadd.h vr0, vr0, vr9 vssrani.bu.h vr0, vr0, 6 vstelm.h vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr0, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_h_2w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_h_4w: addi.d t7, t7, 2 addi.d a2, a2, -1 vldrepl.w vr8, t7, 0 la.local t7, subpel_h_shuf1 vld vr7, t7, 0 .l_\lable\()put_h_4w_loop: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 vshuf.b vr0, vr0, vr0, vr7 vshuf.b vr1, vr1, vr1, vr7 vmulwev.h.bu.b vr2, vr0, vr8 vmulwev.h.bu.b vr3, vr1, vr8 vmaddwod.h.bu.b vr2, vr0, vr8 vmaddwod.h.bu.b vr3, vr1, vr8 vhaddw.w.h vr0, vr2, vr2 vhaddw.w.h vr1, vr3, vr3 vpickev.h vr0, vr1, vr0 vadd.h vr0, vr0, vr9 vssrani.bu.h vr0, vr0, 6 vstelm.w vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 add.d a0, a0, a1 addi.d a5, a5, -2 bnez a5, .l_\lable\()put_h_4w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_h_8w: fld.d f10, t7, 0 vreplvei.w vr11, vr10, 1 vreplvei.w vr10, vr10, 0 la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vaddi.bu vr7, vr6, 4 vaddi.bu vr8, vr6, 8 addi.d a2, a2, -3 .l_\lable\()put_h_8w_loop: vld vr0, a2, 0 vldx vr1, a2, a3 add.d a2, a2, t2 PUT_H_8W vr0 PUT_H_8W vr1 vssrani.bu.h vr1, 
vr0, 6 vstelm.d vr1, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr1, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_h_8w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_h_16w: .l_\lable\()put_h_32w: .l_\lable\()put_h_64w: .l_\lable\()put_h_128w: fld.d f10, t7, 0 vreplvei.w vr11, vr10, 1 vreplvei.w vr10, vr10, 0 la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vaddi.bu vr7, vr6, 4 vaddi.bu vr8, vr6, 8 addi.d a2, a2, -3 addi.d t0, a2, 0 //src addi.w t5, a5, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()put_h_16w_loop: vld vr0, a2, 0 vld vr1, a2, 8 add.d a2, a2, a3 PUT_H_8W vr0 PUT_H_8W vr1 vssrani.bu.h vr1, vr0, 6 vst vr1, a0, 0 add.d a0, a0, a1 addi.d a5, a5, -1 bnez a5, .l_\lable\()put_h_16w_loop addi.d a2, t0, 16 addi.d t0, t0, 16 addi.d a0, t8, 16 addi.d t8, t8, 16 addi.w a5, t5, 0 addi.w a4, a4, -16 bnez a4, .l_\lable\()put_h_16w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v: ld.d t1, sp, 0 //filter_type srli.w t1, t1, 2 blt t0, a5, .l_\lable\()put_v_idx_fv andi t1, t1, 1 addi.w t1, t1, 3 .l_\lable\()put_v_idx_fv: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a7, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fv's offset vldrepl.d vr8, t1, 0 sub.d a2, a2, t3 vilvl.h vr8, vr8, vr8 vreplvei.w vr9, vr8, 1 vreplvei.w vr10, vr8, 2 vreplvei.w vr11, vr8, 3 vreplvei.w vr8, vr8, 0 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_v_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_v_jtable: .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_64w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_32w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_16w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_8w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_4w - .l_\lable\()put_v_jtable .dword .l_\lable\()put_v_2w - .l_\lable\()put_v_jtable .l_\lable\()put_v_2w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t2 add.d a2, a2, t3 fld.s f3, a2, 0 
fldx.s f4, a2, a3 fldx.s f5, a2, t2 fldx.s f6, a2, t3 add.d a2, a2, t4 vilvl.h vr0, vr1, vr0 //0 1 vilvl.h vr1, vr2, vr1 //1 2 vilvl.b vr0, vr1, vr0 //01 12 vilvl.h vr2, vr3, vr2 //2 3 vilvl.h vr3, vr4, vr3 //3 4 vilvl.b vr1, vr3, vr2 //23 34 vilvl.h vr2, vr5, vr4 //4 5 vilvl.h vr3, vr6, vr5 //5 6 vilvl.b vr2, vr3, vr2 //45 56 .l_\lable\()put_v_2w_loop: fld.s f7, a2, 0 vilvl.h vr3, vr7, vr6 //6 7 fldx.s f6, a2, a3 add.d a2, a2, t2 vilvl.h vr4, vr6, vr7 //7 8 vilvl.b vr3, vr4, vr3 //67 78 vmulwev.h.bu.b vr12, vr0, vr8 vmulwev.h.bu.b vr13, vr1, vr9 vmulwev.h.bu.b vr14, vr2, vr10 vmulwev.h.bu.b vr15, vr3, vr11 vmaddwod.h.bu.b vr12, vr0, vr8 vmaddwod.h.bu.b vr13, vr1, vr9 vmaddwod.h.bu.b vr14, vr2, vr10 vmaddwod.h.bu.b vr15, vr3, vr11 vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr3, 0 vadd.h vr12, vr12, vr13 vadd.h vr12, vr12, vr14 vadd.h vr12, vr12, vr15 vssrarni.bu.h vr12, vr12, 6 vstelm.h vr12, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr12, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_2w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v_4w: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t2 add.d a2, a2, t3 fld.s f3, a2, 0 fldx.s f4, a2, a3 fldx.s f5, a2, t2 fldx.s f6, a2, t3 add.d a2, a2, t4 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr2, vr1 vilvl.b vr0, vr1, vr0 vilvl.w vr1, vr3, vr2 vilvl.w vr2, vr4, vr3 vilvl.b vr1, vr2, vr1 vilvl.w vr2, vr5, vr4 vilvl.w vr3, vr6, vr5 vilvl.b vr2, vr3, vr2 .l_\lable\()put_v_4w_loop: fld.s f7, a2, 0 vilvl.w vr3, vr7, vr6 fldx.s f6, a2, a3 add.d a2, a2, t2 vilvl.w vr4, vr6, vr7 vilvl.b vr3, vr4, vr3 vmulwev.h.bu.b vr12, vr0, vr8 vmulwev.h.bu.b vr13, vr1, vr9 vmulwev.h.bu.b vr14, vr2, vr10 vmulwev.h.bu.b vr15, vr3, vr11 vmaddwod.h.bu.b vr12, vr0, vr8 vmaddwod.h.bu.b vr13, vr1, vr9 vmaddwod.h.bu.b vr14, vr2, vr10 vmaddwod.h.bu.b vr15, vr3, vr11 vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr3, 0 vadd.h vr12, vr12, vr13 vadd.h vr12, vr12, vr14 vadd.h vr12, vr12, vr15 vssrarni.bu.h vr12, vr12, 6 
vstelm.w vr12, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr12, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_4w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_v_8w: .l_\lable\()put_v_16w: .l_\lable\()put_v_32w: .l_\lable\()put_v_64w: .l_\lable\()put_v_128w: addi.d t0, a2, 0 //src addi.d t5, a5, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()put_v_8w_loop0: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t2 add.d a2, a2, t3 fld.d f3, a2, 0 fldx.d f4, a2, a3 fldx.d f5, a2, t2 fldx.d f6, a2, t3 add.d a2, a2, t4 vilvl.b vr0, vr1, vr0 //0 1 vilvl.b vr1, vr2, vr1 //1 2 vilvl.b vr2, vr3, vr2 //2 3 vilvl.b vr3, vr4, vr3 //3 4 vilvl.b vr4, vr5, vr4 //4 5 vilvl.b vr5, vr6, vr5 //5 6 .l_\lable\()put_v_8w_loop: fld.d f7, a2, 0 vilvl.b vr12, vr7, vr6 //6 7 fldx.d f6, a2, a3 add.d a2, a2, t2 vilvl.b vr13, vr6, vr7 //7 8 vmulwev.h.bu.b vr14, vr0, vr8 vmulwev.h.bu.b vr15, vr1, vr8 vmulwev.h.bu.b vr16, vr2, vr9 vmulwev.h.bu.b vr17, vr3, vr9 vmulwev.h.bu.b vr18, vr4, vr10 vmulwev.h.bu.b vr19, vr5, vr10 vmulwev.h.bu.b vr20, vr12, vr11 vmulwev.h.bu.b vr21, vr13, vr11 vmaddwod.h.bu.b vr14, vr0, vr8 vmaddwod.h.bu.b vr15, vr1, vr8 vmaddwod.h.bu.b vr16, vr2, vr9 vmaddwod.h.bu.b vr17, vr3, vr9 vmaddwod.h.bu.b vr18, vr4, vr10 vmaddwod.h.bu.b vr19, vr5, vr10 vmaddwod.h.bu.b vr20, vr12, vr11 vmaddwod.h.bu.b vr21, vr13, vr11 vaddi.hu vr0, vr2, 0 vaddi.hu vr1, vr3, 0 vaddi.hu vr2, vr4, 0 vaddi.hu vr3, vr5, 0 vaddi.hu vr4, vr12, 0 vaddi.hu vr5, vr13, 0 vadd.h vr14, vr14, vr16 vadd.h vr14, vr14, vr18 vadd.h vr14, vr14, vr20 vadd.h vr15, vr15, vr17 vadd.h vr15, vr15, vr19 vadd.h vr15, vr15, vr21 vssrarni.bu.h vr15, vr14, 6 vstelm.d vr15, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr15, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_v_8w_loop addi.d a2, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.d a5, t5, 0 addi.w a4, a4, -8 bnez a4, .l_\lable\()put_v_8w_loop0 b .l_\lable\()end_put_8tap .l_\lable\()put_hv: ld.d t5, sp, 0 //filter_type andi t1, t5, 3 
blt t0, a4, .l_\lable\()put_hv_idx_fh andi t1, t5, 1 addi.w t1, t1, 3 .l_\lable\()put_hv_idx_fh: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset vldrepl.d vr8, t1, 0 ld.d t1, sp, 0 //filter_type srli.w t1, t1, 2 blt t0, a5, .l_\lable\()put_hv_idx_fv andi t1, t1, 1 addi.w t1, t1, 3 .l_\lable\()put_hv_idx_fv: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a7, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fv's offset vldrepl.d vr9, t1, 0 vexth.h.b vr9, vr9 sub.d a2, a2, t3 addi.d a2, a2, -3 clz.w t1, a4 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()put_hv_jtable alsl.d t1, t1, t5, 3 ld.d t6, t1, 0 add.d t5, t5, t6 jirl $r0, t5, 0 .align 3 .l_\lable\()put_hv_jtable: .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_64w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_32w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_16w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_8w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_4w - .l_\lable\()put_hv_jtable .dword .l_\lable\()put_hv_2w - .l_\lable\()put_hv_jtable .l_\lable\()put_hv_2w: addi.d a2, a2, 2 vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 la.local t1, subpel_h_shuf0 vld vr7, t1, 0 vbsrl.v vr8, vr8, 2 vreplvei.w vr8, vr8, 0 //fv vreplvei.w vr14, vr9, 1 vreplvei.w vr15, vr9, 2 vreplvei.w vr16, vr9, 3 vreplvei.w vr9, vr9, 0 vshuf.b vr0, vr1, vr0, vr7 vshuf.b vr1, vr3, vr2, vr7 vshuf.b vr2, vr5, vr4, vr7 vshuf.b vr3, vr6, vr6, vr7 vmulwev.h.bu.b vr10, vr0, vr8 vmulwev.h.bu.b vr11, vr1, vr8 vmulwev.h.bu.b vr12, vr2, vr8 vmulwev.h.bu.b vr13, vr3, vr8 vmaddwod.h.bu.b vr10, vr0, vr8 vmaddwod.h.bu.b vr11, vr1, vr8 vmaddwod.h.bu.b vr12, vr2, vr8 vmaddwod.h.bu.b vr13, vr3, vr8 vhaddw.w.h vr0, vr10, vr10 vhaddw.w.h vr1, vr11, vr11 vssrarni.h.w vr1, vr0, 2 //h0 h1 h2 h3 vhaddw.w.h 
vr2, vr12, vr12 vhaddw.w.h vr3, vr13, vr13 vssrarni.h.w vr3, vr2, 2 //h4 h5 h6 ~ vbsrl.v vr2, vr1, 4 vextrins.w vr2, vr3, 0x30 //h1 h2 h3 h4 vilvl.h vr4, vr2, vr1 //h0 h1 h1 h2 -- vilvh.h vr5, vr2, vr1 //h2 h3 h3 h4 -- vbsrl.v vr6, vr3, 4 vilvl.h vr6, vr6, vr3 //h4 h5 h5 h6 -- vbsrl.v vr3, vr3, 8 //h6 ~ .l_\lable\()put_hv_2w_loop: vld vr0, a2, 0 vldx vr2, a2, a3 add.d a2, a2, t2 vshuf.b vr0, vr2, vr0, vr7 vdp2.h.bu.b vr17, vr0, vr8 vhaddw.w.h vr17, vr17, vr17 vssrarni.h.w vr17, vr17, 2 //h7 h8 vextrins.w vr3, vr17, 0x10 //h6 h7 vilvl.h vr3, vr17, vr3 //h6 h7 h7 h8 -- vmulwev.w.h vr18, vr4, vr9 vmulwev.w.h vr19, vr5, vr14 vmulwev.w.h vr20, vr6, vr15 vmulwev.w.h vr21, vr3, vr16 vmaddwod.w.h vr18, vr4, vr9 vmaddwod.w.h vr19, vr5, vr14 vmaddwod.w.h vr20, vr6, vr15 vmaddwod.w.h vr21, vr3, vr16 vaddi.hu vr4, vr5, 0 vaddi.hu vr5, vr6, 0 vaddi.hu vr6, vr3, 0 vbsrl.v vr3, vr17, 4 //h8 ~ vadd.w vr18, vr18, vr19 vadd.w vr18, vr18, vr20 vadd.w vr18, vr18, vr21 vssrarni.hu.w vr0, vr18, 10 vssrani.bu.h vr0, vr0, 0 vstelm.h vr0, a0, 0, 0 add.d a0, a0, a1 vstelm.h vr0, a0, 0, 1 add.d a0, a0, a1 addi.d a5, a5, -2 bnez a5, .l_\lable\()put_hv_2w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_hv_4w: addi.d a2, a2, 2 //ignore leading 0 vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 la.local t1, subpel_h_shuf1 vld vr7, t1, 0 vbsrl.v vr8, vr8, 2 vreplvei.w vr8, vr8, 0 //fv vreplvei.w vr17, vr9, 0 vreplvei.w vr18, vr9, 1 vreplvei.w vr19, vr9, 2 vreplvei.w vr20, vr9, 3 //DAV2D_FILTER_8TAP_RND vshuf.b vr0, vr0, vr0, vr7 vshuf.b vr1, vr1, vr1, vr7 vshuf.b vr2, vr2, vr2, vr7 vshuf.b vr3, vr3, vr3, vr7 vshuf.b vr4, vr4, vr4, vr7 vshuf.b vr5, vr5, vr5, vr7 vshuf.b vr6, vr6, vr6, vr7 vmulwev.h.bu.b vr10, vr0, vr8 vmulwev.h.bu.b vr11, vr1, vr8 vmulwev.h.bu.b vr12, vr2, vr8 vmulwev.h.bu.b vr13, vr3, vr8 vmulwev.h.bu.b vr14, vr4, vr8 vmulwev.h.bu.b vr15, vr5, vr8 vmulwev.h.bu.b vr16, vr6, vr8 
vmaddwod.h.bu.b vr10, vr0, vr8 vmaddwod.h.bu.b vr11, vr1, vr8 vmaddwod.h.bu.b vr12, vr2, vr8 vmaddwod.h.bu.b vr13, vr3, vr8 vmaddwod.h.bu.b vr14, vr4, vr8 vmaddwod.h.bu.b vr15, vr5, vr8 vmaddwod.h.bu.b vr16, vr6, vr8 vhaddw.w.h vr10, vr10, vr10 vhaddw.w.h vr11, vr11, vr11 vhaddw.w.h vr12, vr12, vr12 vhaddw.w.h vr13, vr13, vr13 vhaddw.w.h vr14, vr14, vr14 vhaddw.w.h vr15, vr15, vr15 vhaddw.w.h vr16, vr16, vr16 vssrarni.h.w vr10, vr10, 2 //h0 vssrarni.h.w vr11, vr11, 2 //h1 vssrarni.h.w vr12, vr12, 2 //h2 vssrarni.h.w vr13, vr13, 2 //h3 vssrarni.h.w vr14, vr14, 2 //h4 vssrarni.h.w vr15, vr15, 2 //h5 vssrarni.h.w vr16, vr16, 2 //h6 //h0 vilvl.h vr0, vr11, vr10 //01 vilvl.h vr1, vr13, vr12 //23 vilvl.h vr2, vr15, vr14 //45 //h1 vilvl.h vr4, vr12, vr11 //12 vilvl.h vr5, vr14, vr13 //34 vilvl.h vr6, vr16, vr15 //56 .l_\lable\()put_hv_4w_loop: vld vr9, a2, 0 vldx vr10, a2, a3 add.d a2, a2, t2 //DAV2D_FILTER_8TAP_CLIP vshuf.b vr9, vr9, vr9, vr7 vshuf.b vr10, vr10, vr10, vr7 vmulwev.h.bu.b vr11, vr9, vr8 vmulwev.h.bu.b vr12, vr10, vr8 vmaddwod.h.bu.b vr11, vr9, vr8 vmaddwod.h.bu.b vr12, vr10, vr8 vhaddw.w.h vr11, vr11, vr11 vhaddw.w.h vr12, vr12, vr12 vssrarni.h.w vr11, vr11, 2 //h7 vssrarni.h.w vr12, vr12, 2 //h8 vilvl.h vr3, vr11, vr16 //67 vilvl.h vr13, vr12, vr11 //78 vmulwev.w.h vr9, vr0, vr17 vmulwev.w.h vr10, vr1, vr18 vmulwev.w.h vr14, vr2, vr19 vmulwev.w.h vr15, vr3, vr20 vmaddwod.w.h vr9, vr0, vr17 vmaddwod.w.h vr10, vr1, vr18 vmaddwod.w.h vr14, vr2, vr19 vmaddwod.w.h vr15, vr3, vr20 vadd.w vr16, vr9, vr10 vadd.w vr16, vr16, vr14 vadd.w vr16, vr16, vr15 vmulwev.w.h vr9, vr4, vr17 vmulwev.w.h vr10, vr5, vr18 vmulwev.w.h vr14, vr6, vr19 vmulwev.w.h vr15, vr13, vr20 vmaddwod.w.h vr9, vr4, vr17 vmaddwod.w.h vr10, vr5, vr18 vmaddwod.w.h vr14, vr6, vr19 vmaddwod.w.h vr15, vr13, vr20 vadd.w vr21, vr9, vr10 vadd.w vr21, vr21, vr14 vadd.w vr21, vr21, vr15 vssrarni.hu.w vr21, vr16, 10 vssrani.bu.h vr21, vr21, 0 //cache vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, 
vr3, 0 vaddi.hu vr4, vr5, 0 vaddi.hu vr5, vr6, 0 vaddi.hu vr6, vr13, 0 vaddi.hu vr16, vr12, 0 vstelm.w vr21, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr21, a0, 0, 1 add.d a0, a0, a1 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv_4w_loop b .l_\lable\()end_put_8tap .l_\lable\()put_hv_8w: .l_\lable\()put_hv_16w: .l_\lable\()put_hv_32w: .l_\lable\()put_hv_64w: .l_\lable\()put_hv_128w: addi.d sp, sp, -8*8 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 addi.d t0, a2, 0 //src addi.d t5, a5, 0 //h addi.d t8, a0, 0 //dst la.local t1, subpel_h_shuf1 vld vr7, t1, 0 vaddi.bu vr11, vr7, 4 vaddi.bu vr12, vr7, 8 vreplvei.w vr10, vr8, 1 vreplvei.w vr8, vr8, 0 vreplvei.w vr20, vr9, 1 vreplvei.w vr21, vr9, 2 vreplvei.w vr22, vr9, 3 vreplvei.w vr9, vr9, 0 .l_\lable\()put_hv_8w_loop0: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t2 add.d a2, a2, t3 vld vr3, a2, 0 vldx vr4, a2, a3 vldx vr5, a2, t2 vldx vr6, a2, t3 add.d a2, a2, t4 FILTER_8TAP_8W vr0 //h0 FILTER_8TAP_8W vr1 //h1 FILTER_8TAP_8W vr2 //h2 FILTER_8TAP_8W vr3 //h3 FILTER_8TAP_8W vr4 //h4 FILTER_8TAP_8W vr5 //h5 FILTER_8TAP_8W vr6 //h6 //h0' low part vilvl.h vr23, vr1, vr0 //01 vilvl.h vr24, vr3, vr2 //23 vilvl.h vr25, vr5, vr4 //45 //h0' high part vilvh.h vr26, vr1, vr0 //01 vilvh.h vr27, vr3, vr2 //23 vilvh.h vr28, vr5, vr4 //45 //h1' low part vilvl.h vr29, vr2, vr1 //12 vilvl.h vr30, vr4, vr3 //34 vilvl.h vr31, vr6, vr5 //56 //h1' high part vilvh.h vr0, vr2, vr1 //12 vilvh.h vr1, vr4, vr3 //34 vilvh.h vr2, vr6, vr5 //56 .l_\lable\()put_hv_8w_loop: vld vr3, a2, 0 vldx vr4, a2, a3 add.d a2, a2, t2 FILTER_8TAP_8W vr3 //h7 FILTER_8TAP_8W vr4 //h8 //h0' low part vilvl.h vr16, vr3, vr6 //67 ~low vmulwev.w.h vr13, vr23, vr9 vmulwev.w.h vr14, vr24, vr20 vmulwev.w.h vr15, vr25, vr21 vmulwev.w.h vr17, vr16, vr22 vmaddwod.w.h vr13, vr23, vr9 vmaddwod.w.h vr14, vr24, vr20 vmaddwod.w.h vr15, vr25, vr21 vmaddwod.w.h vr17, vr16, vr22 vadd.w vr13, 
vr13, vr14 vadd.w vr13, vr13, vr15 vadd.w vr13, vr13, vr17 //cache vaddi.hu vr23, vr24, 0 vaddi.hu vr24, vr25, 0 vaddi.hu vr25, vr16, 0 //h0' high part vilvh.h vr17, vr3, vr6 //67 ~high vmulwev.w.h vr14, vr26, vr9 vmulwev.w.h vr15, vr27, vr20 vmulwev.w.h vr16, vr28, vr21 vmulwev.w.h vr18, vr17, vr22 vmaddwod.w.h vr14, vr26, vr9 vmaddwod.w.h vr15, vr27, vr20 vmaddwod.w.h vr16, vr28, vr21 vmaddwod.w.h vr18, vr17, vr22 vadd.w vr14, vr14, vr15 vadd.w vr14, vr14, vr16 vadd.w vr14, vr14, vr18 vssrarni.hu.w vr14, vr13, 10 vssrarni.bu.h vr5, vr14, 0 vstelm.d vr5, a0, 0, 0 add.d a0, a0, a1 //cache vaddi.hu vr26, vr27, 0 vaddi.hu vr27, vr28, 0 vaddi.hu vr28, vr17, 0 vaddi.hu vr6, vr4, 0 vilvl.h vr5, vr4, vr3 //78 ~low vilvh.h vr4, vr4, vr3 //78 ~high //h1' low part vmulwev.w.h vr13, vr29, vr9 vmulwev.w.h vr14, vr30, vr20 vmulwev.w.h vr15, vr31, vr21 vmulwev.w.h vr16, vr5, vr22 vmaddwod.w.h vr13, vr29, vr9 vmaddwod.w.h vr14, vr30, vr20 vmaddwod.w.h vr15, vr31, vr21 vmaddwod.w.h vr16, vr5, vr22 vadd.w vr13, vr13, vr14 vadd.w vr13, vr13, vr15 vadd.w vr13, vr13, vr16 //cache vaddi.hu vr29, vr30, 0 vaddi.hu vr30, vr31, 0 vaddi.hu vr31, vr5, 0 //h1' high part vmulwev.w.h vr14, vr0, vr9 vmulwev.w.h vr15, vr1, vr20 vmulwev.w.h vr16, vr2, vr21 vmulwev.w.h vr17, vr4, vr22 vmaddwod.w.h vr14, vr0, vr9 vmaddwod.w.h vr15, vr1, vr20 vmaddwod.w.h vr16, vr2, vr21 vmaddwod.w.h vr17, vr4, vr22 vadd.w vr14, vr14, vr15 vadd.w vr14, vr14, vr16 vadd.w vr14, vr14, vr17 vssrarni.hu.w vr14, vr13, 10 vssrarni.bu.h vr5, vr14, 0 vstelm.d vr5, a0, 0, 0 add.d a0, a0, a1 //cache vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr4, 0 addi.w a5, a5, -2 bnez a5, .l_\lable\()put_hv_8w_loop addi.d a2, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 8 addi.d t8, t8, 8 addi.d a5, t5, 0 addi.w a4, a4, -8 bnez a4, .l_\lable\()put_hv_8w_loop0 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 8*8 
// Common exit of PUT_8TAP_8BPC_LSX: the width-specific loops of the macro
// branch here once all rows have been written.
.l_\lable\()end_put_8tap:
.endm

// ---------------------------------------------------------------------------
// put_8tap_<name>_8bpc_lsx entry points.
//
// Every entry point follows the same pattern:
//   1. reserve a 16-byte stack slot,
//   2. store a small integer filter code at sp+0,
//   3. expand PUT_8TAP_8BPC_LSX, which reloads that code from sp+0,
//   4. release the stack slot.
//
// The macro body derives the horizontal filter index from (code & 3) and the
// vertical one from (code >> 2), so the code is h | (v << 2). Judging by the
// function names, 0/1/2 stand for regular/smooth/sharp and a two-part name
// is <horizontal>_<vertical>.
//
// The macro argument is spliced into every local label (.l_\lable\()...),
// so each expansion must receive a distinct value; the filter code doubles
// as that value here.
//
// NOTE(review): `function`/`endfunc` are defined outside this chunk;
// presumably they emit the symbol prologue and the return - confirm there.
// ---------------------------------------------------------------------------

// code 0: h = 0, v = 0 (the zero register is stored directly, no li.w needed)
function put_8tap_regular_8bpc_lsx
    addi.d sp, sp, -16
    st.d zero, sp, 0
    PUT_8TAP_8BPC_LSX 0
    addi.d sp, sp, 16
endfunc

// code 1: h = 1, v = 0
function put_8tap_smooth_regular_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 1
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 1
    addi.d sp, sp, 16
endfunc

// code 2: h = 2, v = 0
function put_8tap_sharp_regular_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 2
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 2
    addi.d sp, sp, 16
endfunc

// code 4: h = 0, v = 1
function put_8tap_regular_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 4
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 4
    addi.d sp, sp, 16
endfunc

// code 5: h = 1, v = 1
function put_8tap_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 5
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 5
    addi.d sp, sp, 16
endfunc

// code 6: h = 2, v = 1
function put_8tap_sharp_smooth_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 6
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 6
    addi.d sp, sp, 16
endfunc

// code 8: h = 0, v = 2
function put_8tap_regular_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 8
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 8
    addi.d sp, sp, 16
endfunc

// code 9: h = 1, v = 2
function put_8tap_smooth_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 9
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 9
    addi.d sp, sp, 16
endfunc

// code 10: h = 2, v = 2
function put_8tap_sharp_8bpc_lsx
    addi.d sp, sp, -16
    li.w t0, 10
    st.d t0, sp, 0
    PUT_8TAP_8BPC_LSX 10
    addi.d sp, sp, 16
endfunc

// 16-byte index table: bytes 0..7 followed by bytes 1..8, i.e. two 8-byte
// windows offset by one byte.
// NOTE(review): its users are outside this chunk; presumably it serves as a
// vshuf.b control vector - confirm at the use sites.
const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst

// PREP_H_8W in0 - filter one 128-bit vector of source bytes (LSX).
//
// Inputs (must be set up by the invoking code):
//   \in0       source bytes; overwritten with the result
//   vr6/7/8    byte-shuffle controls for the three vshuf.b steps
//   vr22/vr23  signed byte multipliers
//              (presumably taps 0-3 and taps 4-7 of the 8-tap filter, each
//              replicated per word - the setup is not visible here, confirm
//              at the call sites)
// Output:
//   \in0       eight signed 16-bit values, each a sum of eight byte
//              products, rounded and shifted right by 2
// Clobbers:
//   vr2-vr4, vr12-vr15
.macro PREP_H_8W in0
    // Rearrange the source bytes into three byte layouts.
    vshuf.b vr2, \in0, \in0, vr6
    vshuf.b vr3, \in0, \in0, vr7
    vshuf.b vr4, \in0, \in0, vr8
    // Even-indexed bytes: unsigned byte x signed byte -> 16-bit products.
    vmulwev.h.bu.b vr12, vr2, vr22
    vmulwev.h.bu.b vr13, vr3, vr23
    vmulwev.h.bu.b vr14, vr3, vr22
    vmulwev.h.bu.b vr15, vr4, vr23
    // Odd-indexed bytes are multiplied and accumulated on top, so every
    // halfword now holds the sum of two adjacent products.
    vmaddwod.h.bu.b vr12, vr2, vr22
    vmaddwod.h.bu.b vr13, vr3, vr23
    vmaddwod.h.bu.b vr14, vr3, vr22
    vmaddwod.h.bu.b vr15, vr4, vr23
    // Combine the vr22 and vr23 contributions.
    vadd.h vr12, vr12, vr13
    vadd.h vr14, vr14, vr15
    // Add neighbouring halfwords into 32-bit sums.
    vhaddw.w.h vr12, vr12, vr12
    vhaddw.w.h \in0, vr14, vr14
    // Rounding shift right by 2 and saturating narrow to 16 bits: the sums
    // from vr12 go to the low half of \in0, the sums already held in \in0
    // go to the high half.
    vssrarni.h.w \in0, vr12, 2
.endm

// PREP_HV_8W_LASX in0 - 256-bit (LASX) counterpart of PREP_H_8W.
//
// Inputs (must be set up by the invoking code):
//   \in0          source bytes; overwritten with the result
//   xr19/20/21    byte-shuffle controls
//   xr22/xr23     signed byte multipliers
// Output:
//   \in0          signed 16-bit sums, rounded and shifted right by 2
// Clobbers:
//   xr4-xr7, xr9-xr11
.macro PREP_HV_8W_LASX in0
    // Rearrange the source bytes into three byte layouts.
    xvshuf.b xr4, \in0, \in0, xr19
    xvshuf.b xr5, \in0, \in0, xr20
    xvshuf.b xr6, \in0, \in0, xr21
    // Even-indexed bytes: unsigned byte x signed byte -> 16-bit products.
    xvmulwev.h.bu.b xr7, xr4, xr22
    xvmulwev.h.bu.b xr9, xr5, xr23
    xvmulwev.h.bu.b xr10, xr5, xr22
    xvmulwev.h.bu.b xr11, xr6, xr23
xvmaddwod.h.bu.b xr7, xr4, xr22 xvmaddwod.h.bu.b xr9, xr5, xr23 xvmaddwod.h.bu.b xr10, xr5, xr22 xvmaddwod.h.bu.b xr11, xr6, xr23 xvadd.h xr7, xr7, xr9 xvadd.h xr9, xr10, xr11 xvhaddw.w.h xr7, xr7, xr7 xvhaddw.w.h \in0, xr9, xr9 xvssrarni.h.w \in0, xr7, 2 .endm .macro PREP_8TAP_8BPC_LASX lable li.w t0, 4 la.local t6, dav2d_mc_subpel_filters slli.d t2, a2, 1 //src_stride*2 add.d t3, t2, a2 //src_stride*3 slli.d t4, t2, 1 bnez a5, .l_\lable\()h_lasx //mx bnez a6, .l_\lable\()v_lasx clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_hv0_jtable_lasx alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_hv0_jtable_lasx: .hword .l_\lable\()hv0_128w_lasx - .l_\lable\()prep_hv0_jtable_lasx .hword .l_\lable\()hv0_64w_lasx - .l_\lable\()prep_hv0_jtable_lasx .hword .l_\lable\()hv0_32w_lasx - .l_\lable\()prep_hv0_jtable_lasx .hword .l_\lable\()hv0_16w_lasx - .l_\lable\()prep_hv0_jtable_lasx .hword .l_\lable\()hv0_8w_lasx - .l_\lable\()prep_hv0_jtable_lasx .hword .l_\lable\()hv0_4w_lasx - .l_\lable\()prep_hv0_jtable_lasx .l_\lable\()hv0_4w_lasx: fld.s f0, a1, 0 fldx.s f1, a1, a2 fldx.s f2, a1, t2 fldx.s f3, a1, t3 add.d a1, a1, t4 xvpackev.w xr0, xr1, xr0 xvpackev.w xr1, xr3, xr2 xvpermi.q xr0, xr1, 0x02 xvsllwil.hu.bu xr0, xr0, 4 xvst xr0, a0, 0 addi.d a0, a0, 32 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_4w_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv0_8w_lasx: fld.d f0, a1, 0 fldx.d f1, a1, a2 fldx.d f2, a1, t2 fldx.d f3, a1, t3 add.d a1, a1, t4 xvpermi.q xr0, xr1, 0x02 xvpermi.q xr2, xr3, 0x02 xvsllwil.hu.bu xr0, xr0, 4 xvsllwil.hu.bu xr2, xr2, 4 xvst xr0, a0, 0 xvst xr2, a0, 32 addi.d a0, a0, 64 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_8w_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv0_16w_lasx: vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 vldx vr3, a1, t3 add.d a1, a1, t4 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 xvslli.h xr0, xr0, 4 xvslli.h 
xr1, xr1, 4 xvslli.h xr2, xr2, 4 xvslli.h xr3, xr3, 4 xvst xr0, a0, 0 xvst xr1, a0, 32 xvst xr2, a0, 64 xvst xr3, a0, 96 addi.d a0, a0, 128 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_16w_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv0_32w_lasx: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvpermi.d xr4, xr0, 0xD8 xvpermi.d xr5, xr1, 0xD8 xvpermi.d xr6, xr2, 0xD8 xvpermi.d xr7, xr3, 0xD8 xvpermi.d xr10, xr0, 0x32 xvpermi.d xr11, xr1, 0x32 xvpermi.d xr12, xr2, 0x32 xvpermi.d xr13, xr3, 0x32 xvsllwil.hu.bu xr0, xr4, 4 xvsllwil.hu.bu xr1, xr5, 4 xvsllwil.hu.bu xr2, xr6, 4 xvsllwil.hu.bu xr3, xr7, 4 xvsllwil.hu.bu xr4, xr10, 4 xvsllwil.hu.bu xr5, xr11, 4 xvsllwil.hu.bu xr6, xr12, 4 xvsllwil.hu.bu xr7, xr13, 4 xvst xr0, a0, 0 xvst xr4, a0, 32 xvst xr1, a0, 64 xvst xr5, a0, 96 xvst xr2, a0, 128 xvst xr6, a0, 160 xvst xr3, a0, 192 xvst xr7, a0, 224 addi.d a0, a0, 256 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_32w_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv0_64w_lasx: .l_\lable\()hv0_128w_lasx: addi.d t0, a1, 0 addi.d t5, a4, 0 srli.w t7, a3, 5 slli.w t7, t7, 6 addi.d t8, a0, 0 .l_\lable\()hv0_32_loop_lasx: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvpermi.d xr4, xr0, 0xD8 xvpermi.d xr5, xr1, 0xD8 xvpermi.d xr6, xr2, 0xD8 xvpermi.d xr7, xr3, 0xD8 xvpermi.d xr10, xr0, 0x32 xvpermi.d xr11, xr1, 0x32 xvpermi.d xr12, xr2, 0x32 xvpermi.d xr13, xr3, 0x32 xvsllwil.hu.bu xr0, xr4, 4 xvsllwil.hu.bu xr1, xr5, 4 xvsllwil.hu.bu xr2, xr6, 4 xvsllwil.hu.bu xr3, xr7, 4 xvsllwil.hu.bu xr4, xr10, 4 xvsllwil.hu.bu xr5, xr11, 4 xvsllwil.hu.bu xr6, xr12, 4 xvsllwil.hu.bu xr7, xr13, 4 xvst xr0, a0, 0 xvst xr4, a0, 32 add.d t1, a0, t7 xvst xr1, t1, 0 xvst xr5, t1, 32 add.d t1, t1, t7 xvst xr2, t1, 0 xvst xr6, t1, 32 add.d t1, t1, t7 xvst xr3, t1, 0 xvst xr7, t1, 32 add.d a0, t1, t7 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv0_32_loop_lasx addi.d a1, t0, 32 addi.d t0, t0, 32 addi.d a0, t8, 64 addi.d 
t8, t8, 64 addi.d a4, t5, 0 addi.d a3, a3, -32 bnez a3, .l_\lable\()hv0_32_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()h_lasx: bnez a6, .l_\lable\()hv_lasx //if(fh) && if (fv) andi t1, a7, 3 blt t0, a3, .l_\lable\()h_idx_fh_lasx andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()h_idx_fh_lasx: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset xvldrepl.d xr22, t1, 0 addi.d a1, a1, -3 clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_h_jtable_lasx alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_h_jtable_lasx: .hword .l_\lable\()h_128w_lasx - .l_\lable\()prep_h_jtable_lasx .hword .l_\lable\()h_64w_lasx - .l_\lable\()prep_h_jtable_lasx .hword .l_\lable\()h_32w_lasx - .l_\lable\()prep_h_jtable_lasx .hword .l_\lable\()h_16w_lasx - .l_\lable\()prep_h_jtable_lasx .hword .l_\lable\()h_8w_lasx - .l_\lable\()prep_h_jtable_lasx .hword .l_\lable\()h_4w_lasx - .l_\lable\()prep_h_jtable_lasx .l_\lable\()h_4w_lasx: addi.d a1, a1, 2 la.local t7, subpel_h_shuf1 vld vr7, t7, 0 xvreplve0.q xr7, xr7 xvbsrl.v xr22, xr22, 2 xvreplve0.w xr22, xr22 .l_\lable\()h_4w_loop_lasx: vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 vldx vr3, a1, t3 add.d a1, a1, t4 xvpermi.q xr1, xr0, 0x20 xvpermi.q xr3, xr2, 0x20 xvshuf.b xr1, xr1, xr1, xr7 xvshuf.b xr3, xr3, xr3, xr7 xvmulwev.h.bu.b xr0, xr1, xr22 xvmulwev.h.bu.b xr2, xr3, xr22 xvmaddwod.h.bu.b xr0, xr1, xr22 xvmaddwod.h.bu.b xr2, xr3, xr22 xvhaddw.w.h xr0, xr0, xr0 xvhaddw.w.h xr2, xr2, xr2 xvssrarni.h.w xr2, xr0, 2 xvpermi.d xr2, xr2, 0xd8 xvst xr2, a0, 0 addi.d a0, a0, 32 addi.w a4, a4, -4 bnez a4, .l_\lable\()h_4w_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()h_8w_lasx: la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vbsrl.v vr23, vr22, 4 //fh xvreplve0.w xr23, xr23 xvreplve0.w xr22, xr22 xvreplve0.q xr19, xr6 xvaddi.bu xr20, xr19, 4 xvaddi.bu xr21, xr19, 8 .l_\lable\()h_8w_loop_lasx: xvld xr0, a1, 
0 xvldx xr1, a1, a2 add.d a1, a1, t2 xvpermi.q xr0, xr1, 0x02 PREP_HV_8W_LASX xr0 xvst xr0, a0, 0 addi.d a0, a0, 32 addi.d a4, a4, -2 bnez a4, .l_\lable\()h_8w_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()h_16w_lasx: la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vbsrl.v vr23, vr22, 4 //fh xvreplve0.w xr23, xr23 xvreplve0.w xr22, xr22 xvreplve0.q xr19, xr6 xvaddi.bu xr20, xr19, 4 xvaddi.bu xr21, xr19, 8 .l_\lable\()h_16w_loop_lasx: xvld xr0, a1, 0 xvld xr1, a1, 8 add.d a1, a1, a2 xvpermi.q xr0, xr1, 0x02 PREP_HV_8W_LASX xr0 xvst xr0, a0, 0 xvld xr0, a1, 0 xvld xr1, a1, 8 add.d a1, a1, a2 xvpermi.q xr0, xr1, 0x02 PREP_HV_8W_LASX xr0 xvst xr0, a0, 32 addi.d a0, a0, 64 addi.w a4, a4, -2 bnez a4, .l_\lable\()h_16w_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()h_32w_lasx: .l_\lable\()h_64w_lasx: .l_\lable\()h_128w_lasx: la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vbsrl.v vr23, vr22, 4 //fh xvreplve0.w xr23, xr23 xvreplve0.w xr22, xr22 xvreplve0.q xr19, xr6 xvaddi.bu xr20, xr19, 4 xvaddi.bu xr21, xr19, 8 addi.d t5, a1, 0 //src addi.d t6, a3, 0 //w slli.w t7, a3, 1 //store offset addi.d t8, a0, 0 //dst .l_\lable\()h_16_loop_lasx: xvld xr0, a1, 0 xvld xr1, a1, 8 xvpermi.q xr0, xr1, 0x02 PREP_HV_8W_LASX xr0 xvst xr0, a0, 0 xvld xr0, a1, 16 xvld xr1, a1, 24 xvpermi.q xr0, xr1, 0x02 PREP_HV_8W_LASX xr0 xvst xr0, a0, 32 addi.d a0, a0, 64 addi.d a1, a1, 32 addi.d a3, a3, -32 bnez a3, .l_\lable\()h_16_loop_lasx add.d a1, t5, a2 add.d t5, t5, a2 add.d a0, t8, t7 add.d t8, t8, t7 addi.d a3, t6, 0 addi.d a4, a4, -1 bnez a4, .l_\lable\()h_16_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv_lasx: andi t1, a7, 3 blt t0, a3, .l_\lable\()hv_idx_fh_lasx andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()hv_idx_fh_lasx: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset xvldrepl.d xr22, t1, 0 srli.w a7, a7, 2 blt t0, a4, .l_\lable\()hv_idx_fv_lasx andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()hv_idx_fv_lasx: 
addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset xvldrepl.d xr8, a7, 0 xvsllwil.h.b xr8, xr8, 0 sub.d a1, a1, t3 addi.d a1, a1, -1 //ignore leading 0s beq a3, t0, .l_\lable\()hv_4w_lasx addi.d a1, a1, -2 b .l_\lable\()hv_8w_lasx .l_\lable\()hv_4w_lasx: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 xvldx xr3, a1, t3 add.d a1, a1, t4 xvld xr4, a1, 0 xvldx xr5, a1, a2 xvldx xr6, a1, t2 la.local t1, subpel_h_shuf2 xvld xr7, t1, 0 vbsrl.v vr22, vr22, 2 xvreplve0.w xr22, xr22 xvreplve0.q xr8, xr8 xvrepl128vei.w xr12, xr8, 0 xvrepl128vei.w xr13, xr8, 1 xvrepl128vei.w xr14, xr8, 2 xvrepl128vei.w xr15, xr8, 3 xvilvl.d xr0, xr1, xr0 xvilvl.d xr2, xr3, xr2 xvilvl.d xr4, xr5, xr4 xvreplve0.q xr0, xr0 xvreplve0.q xr2, xr2 xvreplve0.q xr4, xr4 xvreplve0.q xr6, xr6 xvshuf.b xr0, xr0, xr0, xr7 xvshuf.b xr2, xr2, xr2, xr7 xvshuf.b xr4, xr4, xr4, xr7 xvshuf.b xr6, xr6, xr6, xr7 xvmulwev.h.bu.b xr1, xr0, xr22 xvmulwev.h.bu.b xr3, xr2, xr22 xvmulwev.h.bu.b xr5, xr4, xr22 xvmulwev.h.bu.b xr9, xr6, xr22 xvmaddwod.h.bu.b xr1, xr0, xr22 xvmaddwod.h.bu.b xr3, xr2, xr22 xvmaddwod.h.bu.b xr5, xr4, xr22 xvmaddwod.h.bu.b xr9, xr6, xr22 xvhaddw.w.h xr1, xr1, xr1 // a0 b0 a1 b1 c0 d0 c1 d1 xvhaddw.w.h xr3, xr3, xr3 // a2 b2 a3 b3 c2 d2 c3 d3 xvhaddw.w.h xr5, xr5, xr5 // a4 b4 a5 b5 c4 d4 c5 d5 xvhaddw.w.h xr9, xr9, xr9 // a6 b6 - - c6 d6 - - xvssrarni.h.w xr3, xr1, 2 // a0 b0 a1 b1 a2 b2 a3 b3 c0 d0 c1 d1 c2 d2 c3 d3 xvssrarni.h.w xr9, xr5, 2 // a4 b4 a5 b5 a6 b6 - - c4 d4 c5 d5 c6 d6 - - xvbsrl.v xr4, xr3, 4 xvextrins.w xr4, xr9, 0x30 // a1 b1 a2 b2 a3 b3 a4 b4 c1 d1 c2 d2 c3 d3 c4 d4 xvilvl.h xr5, xr4, xr3 // a0 a1 b0 b1 a1 a2 b1 b2 c0 c1 d0 d1 c1 c2 d1 d2 xvilvh.h xr6, xr4, xr3 // a2 a3 b2 b3 a3 a4 b3 b4 c2 c3 d2 d3 c3 c4 d3 d4 xvbsrl.v xr10, xr9, 4 // a5 b5 a6 b6 - - - - c5 d5 c6 d6 - - - - xvilvl.h xr11, xr10, xr9 // a4 a5 b4 b5 a5 a6 b5 b6 c4 c5 d4 d5 c5 c6 d5 d6 .l_\lable\()hv_w4_loop_lasx: xvmulwev.w.h xr16, 
xr5, xr12 //a0 a1 (h0) xvmulwev.w.h xr17, xr6, xr12 //a2 a3 (h1) xvmulwev.w.h xr18, xr6, xr13 //a2 a3 (h0) xvmulwev.w.h xr19, xr11, xr13 //a4 a5 (h1) xvmulwev.w.h xr20, xr11, xr14 //a4 a5 (h0) xvmaddwod.w.h xr16, xr5, xr12 // xvmaddwod.w.h xr17, xr6, xr12 // xvmaddwod.w.h xr18, xr6, xr13 // xvmaddwod.w.h xr19, xr11, xr13 // xvmaddwod.w.h xr20, xr11, xr14 // xvaddi.wu xr5, xr11, 0 xvadd.w xr16, xr16, xr18 //a0 a1 + a2 a3 xvldx xr18, a1, t3 //a7 b7 c7 d7 add.d a1, a1, t4 xvadd.w xr17, xr17, xr19 //a2 a3 + a4 a5 xvld xr19, a1, 0 //a8 b8 c8 d8 xvadd.w xr16, xr16, xr20 //a0 a1 + a2 a3 + a4 a5 xvldx xr20, a1, a2 //a9 b9 c9 d9 xvilvl.d xr18, xr19, xr18 xvreplve0.q xr18, xr18 xvldx xr19, a1, t2 //aa ba ca da xvilvl.d xr20, xr19, xr20 xvreplve0.q xr20, xr20 xvshuf.b xr18, xr18, xr18, xr7 xvshuf.b xr20, xr20, xr20, xr7 xvmulwev.h.bu.b xr21, xr18, xr22 xvmulwev.h.bu.b xr23, xr20, xr22 xvmaddwod.h.bu.b xr21, xr18, xr22 xvmaddwod.h.bu.b xr23, xr20, xr22 xvhaddw.w.h xr21, xr21, xr21 //a7 b7 a8 b8 c7 d7 c8 d8 xvhaddw.w.h xr23, xr23, xr23 //a9 b9 aa ba c9 d9 ca da xvssrarni.h.w xr23, xr21, 2 //a7 b7 a8 b8 a9 b9 aa ba c7 d7 c8 d8 c9 d9 ca da xvbsll.v xr0, xr23, 4 xvextrins.w xr0, xr9, 0x02 //a6 b6 a7 b7 a8 b8 a9 b9 c6 d6 c7 d7 c8 d8 c9 d9 xvilvl.h xr6, xr23, xr0 //a6 a7 b6 b7 a7 a8 b7 b8 c6 c7 d6 d7 c7 c8 d7 d8 xvilvh.h xr11, xr23, xr0 //a8 a9 b8 b9 a9 aa b9 ba c8 c9 d8 d9 c9 ca d9 da xvbsrl.v xr9, xr23, 4 xvmulwev.w.h xr1 , xr6, xr14 //a6 a7 (h0) xvmulwev.w.h xr2 , xr6, xr15 //a6 a7 (h1) xvmulwev.w.h xr3 , xr11, xr15 //a8 a9 (h1) xvmaddwod.w.h xr1 , xr6, xr14 xvmaddwod.w.h xr2 , xr6, xr15 xvmaddwod.w.h xr3 , xr11, xr15 xvadd.w xr17, xr17, xr1 //a2 a3 + a4 a5 + a6 a7 xvadd.w xr16, xr16, xr2 //a0 a1 + a2 a3 + a4 a5 + a6 a7 xvadd.w xr17, xr17, xr3 //a2 a3 + a4 a5 + a6 a7 + a8 a9 xvssrarni.h.w xr17, xr16, 6 //a01 b01 a12 b12 a23 b23 a34 b34 c01 d01 c12 d12 c23 d23 c34 d34 xvpermi.d xr17, xr17, 0xd8 //a01 b01 a12 b12 c01 d01 c12 d12 a23 b23 a34 b34 c23 d23 c34 d34 xvshuf4i.w xr17, 
xr17, 0xd8 xvst xr17, a0, 0 addi.d a0, a0, 32 addi.d a4, a4, -4 bnez a4, .l_\lable\()hv_w4_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()hv_8w_lasx: addi.d sp, sp, -4*8 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 la.local t1, subpel_h_shuf1 vld vr19, t1, 0 addi.d t0, a1, 0 addi.d t5, a4, 0 slli.w t7, a3, 1 // store offset addi.d t8, a0, 0 xvreplve0.q xr19, xr19 xvaddi.bu xr20, xr19, 4 xvaddi.bu xr21, xr19, 8 vbsrl.v vr23, vr22, 4 xvreplve0.w xr22, xr22 //f0f1f2f3 xvreplve0.w xr23, xr23 //f4f5f6f7 xvreplve0.q xr8, xr8 xvrepl128vei.w xr24, xr8, 0 xvrepl128vei.w xr25, xr8, 1 xvrepl128vei.w xr26, xr8, 2 xvrepl128vei.w xr27, xr8, 3 .l_\lable\()hv_8w_loop0_lasx: xvld xr0, a1, 0 xvldx xr1, a1, a2 xvldx xr2, a1, t2 add.d a1, a1, t3 xvld xr3, a1, 0 xvldx xr4, a1, a2 xvldx xr5, a1, t2 xvldx xr6, a1, t3 add.d a1, a1, t4 xvpermi.q xr0, xr3, 0x02 //0 3 xvpermi.q xr1, xr4, 0x02 //1 4 xvpermi.q xr2, xr5, 0x02 //2 5 xvpermi.q xr3, xr6, 0x02 //3 6 PREP_HV_8W_LASX xr0 //a0b0c0d0 e0f0g0h0 a3b3c3d3 e3f3g3h3 PREP_HV_8W_LASX xr1 //a1b1c1d1 e1f1g1h1 a4b4c4d4 e4f4g4h4 PREP_HV_8W_LASX xr2 //a2b2c2d2 e2f2g2h2 a5b5c5d5 e5f5g5h5 PREP_HV_8W_LASX xr3 //a3b3c3d3 e3f3g3h3 a6b6c6d6 e6f6g6h6 xvpermi.d xr0, xr0, 0xd8 xvpermi.d xr1, xr1, 0xd8 xvpermi.d xr2, xr2, 0xd8 xvpermi.d xr18, xr3, 0xd8 xvilvl.h xr12, xr1, xr0 //a0a1b0b1c0c1d0d1 e0e1f0f1g0g1h0h1 xvilvh.h xr13, xr1, xr0 //a3a4b3b4c3c4d3d4 e3e4f3f4g3g4h3h4 xvilvl.h xr14, xr2, xr1 //a1a2b1b2c1c2d1d2 e1e2f1f2g1g2h1h2 xvilvh.h xr15, xr2, xr1 //a4a5b4b5c4c5d4d5 e4e5f4f5g4g5h4h5 xvilvl.h xr16, xr18, xr2 //a2a3b2b3c2c3d2d3 e2e3f2f3g2g3h2h3 xvilvh.h xr17, xr18, xr2 //a5a6b5b6c5c6d5d6 e5e6f5f6g5g6h5h6 .l_\lable\()hv_8w_loop_lasx: xvld xr0, a1, 0 xvldx xr1, a1, a2 add.d a1, a1, t2 xvpermi.q xr0, xr1, 0x02 //7 8 PREP_HV_8W_LASX xr0 //a7b7c7d7e7f7g7h7 a8b8c8d8e8f8g8h8 xvpermi.q xr3, xr0, 0x03 //a6b6c6d6e6f6g6h6 a7b7c7d7e7f7g7h7 xvpermi.d xr3, xr3, 0xd8 //a6b6c6d6a7b7c7d7 e6f6g6h6e7f7g7h7 xvpermi.d xr1, xr0, 0xd8 
//a7b7c7d7a8b8c8d8 e7f7g7h7e8f8g8h8 xvilvl.h xr18, xr1, xr3 //a6a7b6b7c6c7d6d7 e6e7f6f7g6g7h6h7 xvilvh.h xr2, xr1, xr3 //a7a8b7b8c7c8d7d8 e7e8f7f8g7g8h7h8 xvaddi.hu xr3, xr0, 0 xvmulwev.w.h xr4, xr12, xr24 //01 xvmulwev.w.h xr5, xr14, xr24 //12 xvmulwev.w.h xr6, xr16, xr25 //23 xvmulwev.w.h xr7, xr13, xr25 //34 xvmulwev.w.h xr8, xr15, xr26 //45 xvmulwev.w.h xr9, xr17, xr26 //56 xvmulwev.w.h xr10, xr18, xr27 //67 xvmulwev.w.h xr11, xr2, xr27 //78 xvmaddwod.w.h xr4, xr12, xr24 //01 xvmaddwod.w.h xr5, xr14, xr24 //12 xvmaddwod.w.h xr6, xr16, xr25 //23 xvmaddwod.w.h xr7, xr13, xr25 //34 xvmaddwod.w.h xr8, xr15, xr26 //45 xvmaddwod.w.h xr9, xr17, xr26 //56 xvmaddwod.w.h xr10, xr18, xr27 //67 xvmaddwod.w.h xr11, xr2, xr27 //78 xvadd.w xr4, xr4, xr6 xvadd.w xr5, xr5, xr7 xvadd.w xr4, xr4, xr8 xvadd.w xr5, xr5, xr9 xvadd.w xr4, xr4, xr10 xvadd.w xr5, xr5, xr11 xvaddi.hu xr12, xr16, 0 //01 <-- 23 xvaddi.hu xr14, xr13, 0 //12 <-- 34 xvaddi.hu xr16, xr15, 0 //23 <-- 45 xvaddi.hu xr13, xr17, 0 //34 <-- 56 xvaddi.hu xr15, xr18, 0 //45 <-- 67 xvaddi.hu xr17, xr2, 0 //56 <-- 78 xvssrarni.h.w xr5, xr4, 6 xvpermi.d xr5, xr5, 0xd8 vst vr5, a0, 0 xvpermi.q xr5, xr5, 0x11 vstx vr5, a0, t7 alsl.d a0, t7, a0, 1 addi.d a4, a4, -2 bnez a4, .l_\lable\()hv_8w_loop_lasx addi.d a1, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 16 addi.d t8, t8, 16 addi.d a4, t5, 0 addi.d a3, a3, -8 bnez a3, .l_\lable\()hv_8w_loop0_lasx fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 addi.d sp, sp, 4*8 b .l_\lable\()end_pre_8tap_lasx .l_\lable\()v_lasx: srli.w a7, a7, 2 blt t0, a4, .l_\lable\()v_idx_fv_lasx andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()v_idx_fv_lasx: addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset xvldrepl.d xr8, a7, 0 xvrepl128vei.h xr12, xr8, 0 xvrepl128vei.h xr13, xr8, 1 xvrepl128vei.h xr14, xr8, 2 xvrepl128vei.h xr15, xr8, 3 sub.d a1, a1, t3 beq a3, t0, .l_\lable\()v_4w_lasx addi.w t0, t0, 4 beq a3, t0, 
.l_\lable\()v_8w_lasx blt t0, a3, .l_\lable\()v_16w_lasx .l_\lable\()v_4w_lasx: la.local t6, subpel_h_shuf3 xvld xr11, t6, 0 fld.s f0, a1, 0 //a0b0c0d0 fldx.s f1, a1, a2 //a1b1c1d1 fldx.s f2, a1, t2 //a2b2c2d2 add.d a1, a1, t3 fld.s f3, a1, 0 //a3b3c3d3 fldx.s f4, a1, a2 //a4b4c4d4 fldx.s f5, a1, t2 //a5b5c5d5 fldx.s f6, a1, t3 //a6b6c6d6 vilvl.w vr0, vr1, vr0 //01 vilvl.w vr1, vr3, vr2 //23 vilvl.d vr0, vr1, vr0 //0123 vilvl.w vr2, vr5, vr4 //45 vilvl.d vr1, vr2, vr1 //2345 xvpermi.q xr0, xr1, 0x02 //0123 2345 xvbsrl.v xr1, xr0, 4 //123- 345- xvpermi.q xr4, xr6, 0x02 xvextrins.w xr1, xr4, 0x30 //1234 3456 xvilvl.b xr2, xr1, xr0 //0112 2334 //a0a1b0b1c0c1d0d1 a1a2b1b2c1c2d1d2 a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 xvilvh.b xr3, xr1, xr0 //2334 4556 //a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 .l_\lable\()v_4w_loop_lasx: add.d a1, a1, t4 fld.s f0, a1, 0 //a7b7c7d7 fldx.s f1, a1, a2 //a8b8c8d8 fldx.s f4, a1, t2 //a9b9c9d9 fldx.s f5, a1, t3 //aabacada vilvl.w vr7, vr0, vr6 //67 vilvl.w vr10, vr4, vr1 //89 vextrins.w vr7, vr1, 0x20//678- vextrins.w vr10, vr5, 0x20//89a- xvpermi.q xr7, xr10, 0x02//678- 89a- xvshuf.b xr4, xr7, xr7, xr11 //67 78 89 9a //a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 a8a9b8b9c8c9d8d9 a9aab9bac9cad9da xvpermi.q xr7, xr3, 0x11 //4556 xvpermi.q xr7, xr4, 0x02 //45 56 67 78 //a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 xvmulwev.h.bu.b xr16, xr2, xr12 xvmulwev.h.bu.b xr17, xr3, xr13 xvmulwev.h.bu.b xr18, xr7, xr14 xvmulwev.h.bu.b xr19, xr4, xr15 xvmaddwod.h.bu.b xr16, xr2, xr12 xvmaddwod.h.bu.b xr17, xr3, xr13 xvmaddwod.h.bu.b xr18, xr7, xr14 xvmaddwod.h.bu.b xr19, xr4, xr15 xvadd.h xr16, xr16, xr17 xvadd.h xr16, xr16, xr18 xvadd.h xr16, xr16, xr19 xvsrari.h xr16, xr16, 2 xvaddi.bu xr2, xr7, 0 xvaddi.bu xr3, xr4, 0 xvaddi.bu xr6, xr5, 0 xvst xr16, a0, 0 addi.d a0, a0, 32 addi.w a4, a4, -4 bnez a4, .l_\lable\()v_4w_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()v_8w_lasx: fld.d f0, a1, 0 fldx.d f1, a1, a2 
fldx.d f2, a1, t2 add.d a1, a1, t3 fld.d f3, a1, 0 fldx.d f4, a1, a2 fldx.d f5, a1, t2 fldx.d f6, a1, t3 xvpermi.q xr0, xr1, 0x02 xvpermi.q xr1, xr2, 0x02 xvilvl.b xr0, xr1, xr0 //01 12 xvpermi.q xr2, xr3, 0x02 xvpermi.q xr3, xr4, 0x02 xvilvl.b xr2, xr3, xr2 //23 34 xvpermi.q xr4, xr5, 0x02 xvpermi.q xr5, xr6, 0x02 xvilvl.b xr4, xr5, xr4 //45 56 .l_\lable\()v_8w_loop_lasx: add.d a1, a1, t4 fld.d f7, a1, 0 //7 fldx.d f10, a1, a2 //8 fldx.d f11, a1, t2 //9 fldx.d f18, a1, t3 //a xvpermi.q xr6, xr7, 0x02 xvpermi.q xr7, xr10, 0x02 xvilvl.b xr6, xr7, xr6 //67 78 xvpermi.q xr10, xr11, 0x02 xvpermi.q xr11, xr18, 0x02 xvilvl.b xr10, xr11, xr10 //89 9a xvmulwev.h.bu.b xr1, xr0, xr12 xvmulwev.h.bu.b xr3, xr2, xr13 xvmulwev.h.bu.b xr5, xr4, xr14 xvmulwev.h.bu.b xr7, xr6, xr15 xvmulwev.h.bu.b xr9, xr2, xr12 xvmulwev.h.bu.b xr11, xr4, xr13 xvmulwev.h.bu.b xr16, xr6, xr14 xvmulwev.h.bu.b xr17, xr10, xr15 xvmaddwod.h.bu.b xr1, xr0, xr12 xvmaddwod.h.bu.b xr3, xr2, xr13 xvmaddwod.h.bu.b xr5, xr4, xr14 xvmaddwod.h.bu.b xr7, xr6, xr15 xvmaddwod.h.bu.b xr9, xr2, xr12 xvmaddwod.h.bu.b xr11, xr4, xr13 xvmaddwod.h.bu.b xr16, xr6, xr14 xvmaddwod.h.bu.b xr17, xr10, xr15 xvadd.h xr1, xr1, xr3 xvadd.h xr1, xr1, xr5 xvadd.h xr1, xr1, xr7 xvadd.h xr9, xr9, xr11 xvadd.h xr9, xr9, xr16 xvadd.h xr9, xr9, xr17 xvaddi.bu xr0, xr4, 0 xvaddi.bu xr2, xr6, 0 xvaddi.bu xr4, xr10, 0 xvaddi.bu xr6, xr18, 0 xvsrari.h xr1, xr1, 2 xvsrari.h xr9, xr9, 2 xvst xr1, a0, 0 xvst xr9, a0, 32 addi.d a0, a0, 64 addi.w a4, a4, -4 bnez a4, .l_\lable\()v_8w_loop_lasx b .l_\lable\()end_pre_8tap_lasx .l_\lable\()v_16w_lasx: addi.d t0, a0, 0 //dst addi.d t5, a1, 0 //src slli.w t7, a3, 1 //w addi.d t8, a4, 0 //h .l_\lable\()v_16w_loop0_lasx: vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 add.d a1, a1, t3 vld vr3, a1, 0 vldx vr4, a1, a2 vldx vr5, a1, t2 vldx vr6, a1, t3 add.d a1, a1, t4 xvpermi.d xr0, xr0, 0xd8 xvpermi.d xr1, xr1, 0xd8 xvpermi.d xr2, xr2, 0xd8 xvpermi.d xr3, xr3, 0xd8 xvpermi.d xr4, xr4, 0xd8 xvpermi.d 
xr5, xr5, 0xd8 xvpermi.d xr6, xr6, 0xd8 xvilvl.b xr0, xr1, xr0 //01 xvilvl.b xr1, xr2, xr1 //12 xvilvl.b xr2, xr3, xr2 //23 xvilvl.b xr3, xr4, xr3 //34 xvilvl.b xr4, xr5, xr4 //45 xvilvl.b xr5, xr6, xr5 //56 .l_\lable\()v_16w_loop_lasx: vld vr7, a1, 0 //7 vldx vr10, a1, a2 //8 add.d a1, a1, t2 xvpermi.d xr7, xr7, 0xd8 xvpermi.d xr10, xr10, 0xd8 xvilvl.b xr6, xr7, xr6 //67 xvilvl.b xr7, xr10, xr7 //78 xvmulwev.h.bu.b xr9, xr0, xr12 xvmulwev.h.bu.b xr11, xr2, xr13 xvmulwev.h.bu.b xr16, xr4, xr14 xvmulwev.h.bu.b xr17, xr6, xr15 xvmulwev.h.bu.b xr18, xr1, xr12 xvmulwev.h.bu.b xr19, xr3, xr13 xvmulwev.h.bu.b xr20, xr5, xr14 xvmulwev.h.bu.b xr21, xr7, xr15 xvmaddwod.h.bu.b xr9, xr0, xr12 xvmaddwod.h.bu.b xr11, xr2, xr13 xvmaddwod.h.bu.b xr16, xr4, xr14 xvmaddwod.h.bu.b xr17, xr6, xr15 xvmaddwod.h.bu.b xr18, xr1, xr12 xvmaddwod.h.bu.b xr19, xr3, xr13 xvmaddwod.h.bu.b xr20, xr5, xr14 xvmaddwod.h.bu.b xr21, xr7, xr15 xvadd.h xr9, xr9, xr11 xvadd.h xr9, xr9, xr16 xvadd.h xr9, xr9, xr17 xvadd.h xr11, xr18, xr19 xvadd.h xr11, xr11, xr20 xvadd.h xr11, xr11, xr21 xvsrari.h xr9, xr9, 2 xvsrari.h xr11, xr11, 2 xvaddi.bu xr0, xr2, 0 xvaddi.bu xr1, xr3, 0 xvaddi.bu xr2, xr4, 0 xvaddi.bu xr3, xr5, 0 xvaddi.bu xr4, xr6, 0 xvaddi.bu xr5, xr7, 0 xvaddi.bu xr6, xr10, 0 xvst xr9, a0, 0 xvstx xr11, a0, t7 alsl.d a0, t7, a0, 1 addi.d a4, a4, -2 bnez a4, .l_\lable\()v_16w_loop_lasx addi.d a3, a3, -16 addi.d a0, t0, 32 addi.d t0, t0, 32 addi.d a1, t5, 16 addi.d t5, t5, 16 addi.d a4, t8, 0 bnez a3, .l_\lable\()v_16w_loop0_lasx .l_\lable\()end_pre_8tap_lasx: .endm function prep_8tap_regular_8bpc_lasx addi.w a7, zero, 0 PREP_8TAP_8BPC_LASX 0 endfunc function prep_8tap_smooth_regular_8bpc_lasx addi.w a7, zero, 1 PREP_8TAP_8BPC_LASX 1 endfunc function prep_8tap_sharp_regular_8bpc_lasx addi.w a7, zero, 2 PREP_8TAP_8BPC_LASX 2 endfunc function prep_8tap_regular_smooth_8bpc_lasx addi.w a7, zero, 4 PREP_8TAP_8BPC_LASX 4 endfunc function prep_8tap_smooth_8bpc_lasx addi.w a7, zero, 5 PREP_8TAP_8BPC_LASX 
5 endfunc function prep_8tap_sharp_smooth_8bpc_lasx addi.w a7, zero, 6 PREP_8TAP_8BPC_LASX 6 endfunc function prep_8tap_regular_sharp_8bpc_lasx addi.w a7, zero, 8 PREP_8TAP_8BPC_LASX 8 endfunc function prep_8tap_smooth_sharp_8bpc_lasx addi.w a7, zero, 9 PREP_8TAP_8BPC_LASX 9 endfunc function prep_8tap_sharp_8bpc_lasx addi.w a7, zero, 10 PREP_8TAP_8BPC_LASX 10 endfunc .macro PREP_8TAP_8BPC_LSX lable li.w t0, 4 la.local t6, dav2d_mc_subpel_filters la.local t7, shufb1 vld vr23, t7, 0 slli.d t2, a2, 1 //src_stride*2 add.d t3, t2, a2 //src_stride*3 slli.d t4, t2, 1 bnez a5, .l_\lable\()h_lsx //mx bnez a6, .l_\lable\()v_lsx clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_hv0_jtable_lsx alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_hv0_jtable_lsx: .hword .l_\lable\()hv0_128w_lsx - .l_\lable\()prep_hv0_jtable_lsx .hword .l_\lable\()hv0_64w_lsx - .l_\lable\()prep_hv0_jtable_lsx .hword .l_\lable\()hv0_32w_lsx - .l_\lable\()prep_hv0_jtable_lsx .hword .l_\lable\()hv0_16w_lsx - .l_\lable\()prep_hv0_jtable_lsx .hword .l_\lable\()hv0_8w_lsx - .l_\lable\()prep_hv0_jtable_lsx .hword .l_\lable\()hv0_4w_lsx - .l_\lable\()prep_hv0_jtable_lsx .l_\lable\()hv0_4w_lsx: fld.s f0, a1, 0 fldx.s f1, a1, a2 add.d a1, a1, t2 vilvl.w vr0, vr1, vr0 vsllwil.hu.bu vr0, vr0, 4 vst vr0, a0, 0 addi.d a0, a0, 16 addi.d a4, a4, -2 bnez a4, .l_\lable\()hv0_4w_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()hv0_8w_lsx: fld.d f0, a1, 0 fldx.d f1, a1, a2 add.d a1, a1, t2 vsllwil.hu.bu vr0, vr0, 4 vsllwil.hu.bu vr1, vr1, 4 vst vr0, a0, 0 vst vr1, a0, 16 addi.d a0, a0, 32 addi.d a4, a4, -2 bnez a4, .l_\lable\()hv0_8w_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()hv0_16w_lsx: vld vr0, a1, 0 vldx vr1, a1, a2 add.d a1, a1, t2 vsllwil.hu.bu vr2, vr0, 4 vsllwil.hu.bu vr4, vr1, 4 vexth.hu.bu vr3, vr0 vexth.hu.bu vr5, vr1 vslli.h vr3, vr3, 4 vslli.h vr5, vr5, 4 vst vr2, a0, 0 vst vr3, a0, 16 vst vr4, a0, 32 vst vr5, a0, 48 addi.d a0, a0, 64 
addi.d a4, a4, -2 bnez a4, .l_\lable\()hv0_16w_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()hv0_32w_lsx: .l_\lable\()hv0_64w_lsx: .l_\lable\()hv0_128w_lsx: addi.d t0, a1, 0 addi.d t5, a4, 0 srli.w t7, a3, 4 slli.w t7, t7, 5 addi.d t8, a0, 0 .l_\lable\()hv0_16_loop_lsx: vld vr0, a1, 0 vldx vr1, a1, a2 add.d a1, a1, t2 vsllwil.hu.bu vr2, vr0, 4 vsllwil.hu.bu vr3, vr1, 4 vexth.hu.bu vr0, vr0 vexth.hu.bu vr1, vr1 vslli.h vr0, vr0, 4 vslli.h vr1, vr1, 4 vst vr2, a0, 0 vst vr0, a0, 16 add.d a0, a0, t7 vst vr3, a0, 0 vst vr1, a0, 16 add.d a0, a0, t7 addi.d a4, a4, -2 bnez a4, .l_\lable\()hv0_16_loop_lsx addi.d a1, t0, 16 addi.d t0, t0, 16 addi.d a0, t8, 32 addi.d t8, t8, 32 addi.d a4, t5, 0 addi.d a3, a3, -16 bnez a3, .l_\lable\()hv0_16_loop_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()h_lsx: bnez a6, .l_\lable\()hv_lsx //if(fh) && if (fv) andi t1, a7, 3 blt t0, a3, .l_\lable\()h_idx_fh_lsx andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()h_idx_fh_lsx: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset vldrepl.d vr23, t1, 0 addi.d a1, a1, -3 clz.w t1, a3 li.w t5, 24 sub.w t1, t1, t5 la.local t5, .l_\lable\()prep_h_jtable_lsx alsl.d t1, t1, t5, 1 ld.h t8, t1, 0 add.d t5, t5, t8 jirl $r0, t5, 0 .align 3 .l_\lable\()prep_h_jtable_lsx: .hword .l_\lable\()h_128w_lsx - .l_\lable\()prep_h_jtable_lsx .hword .l_\lable\()h_64w_lsx - .l_\lable\()prep_h_jtable_lsx .hword .l_\lable\()h_32w_lsx - .l_\lable\()prep_h_jtable_lsx .hword .l_\lable\()h_16w_lsx - .l_\lable\()prep_h_jtable_lsx .hword .l_\lable\()h_8w_lsx - .l_\lable\()prep_h_jtable_lsx .hword .l_\lable\()h_4w_lsx - .l_\lable\()prep_h_jtable_lsx .l_\lable\()h_4w_lsx: addi.d a1, a1, 2 la.local t7, subpel_h_shuf1 vld vr7, t7, 0 vbsrl.v vr23, vr23, 2 vreplvei.w vr23, vr23, 0 .l_\lable\()h_4w_loop_lsx: vld vr0, a1, 0 vldx vr1, a1, a2 add.d a1, a1, t2 vshuf.b vr0, vr0, vr0, vr7 vshuf.b vr1, vr1, vr1, vr7 vmulwev.h.bu.b vr2, vr0, vr23 vmulwev.h.bu.b vr3, vr1, vr23 
vmaddwod.h.bu.b vr2, vr0, vr23 vmaddwod.h.bu.b vr3, vr1, vr23 vhaddw.w.h vr0, vr2, vr2 vhaddw.w.h vr1, vr3, vr3 vssrarni.h.w vr1, vr0, 2 vst vr1, a0, 0 addi.d a0, a0, 16 addi.w a4, a4, -2 bnez a4, .l_\lable\()h_4w_loop_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()h_8w_lsx: vreplvei.w vr22, vr23, 0 //fh vreplvei.w vr23, vr23, 1 la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vaddi.bu vr7, vr6, 4 vaddi.bu vr8, vr6, 8 .l_\lable\()h_8w_loop_lsx: vld vr0, a1, 0 vldx vr1, a1, a2 add.d a1, a1, t2 PREP_H_8W vr0 PREP_H_8W vr1 vst vr0, a0, 0 vst vr1, a0, 16 addi.d a0, a0, 32 addi.d a4, a4, -2 bnez a4, .l_\lable\()h_8w_loop_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()h_16w_lsx: .l_\lable\()h_32w_lsx: .l_\lable\()h_64w_lsx: .l_\lable\()h_128w_lsx: vreplvei.w vr22, vr23, 0 //fh vreplvei.w vr23, vr23, 1 la.local t7, subpel_h_shuf1 vld vr6, t7, 0 vaddi.bu vr7, vr6, 4 vaddi.bu vr8, vr6, 8 srli.w t7, a3, 4 slli.w t6, t7, 5 .l_\lable\()h_16w_loop0_lsx: addi.d t0, a1, 0 //src addi.d t5, a4, 0 //h addi.d t8, a0, 0 //dst .l_\lable\()h_16w_loop_lsx: vld vr0, a1, 0 vld vr1, a1, 8 add.d a1, a1, a2 PREP_H_8W vr0 PREP_H_8W vr1 vst vr0, a0, 0 vst vr1, a0, 16 add.d a0, a0, t6 addi.d t5, t5, -1 bnez t5, .l_\lable\()h_16w_loop_lsx addi.d a1, t0, 16 addi.d a0, t8, 32 addi.w t7, t7, -1 bnez t7, .l_\lable\()h_16w_loop0_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()hv_lsx: andi t1, a7, 3 blt t0, a3, .l_\lable\()hv_idx_fh_lsx andi t1, a7, 1 addi.w t1, t1, 3 .l_\lable\()hv_idx_fh_lsx: addi.w t5, zero, 120 mul.w t1, t1, t5 addi.w t5, a5, -1 slli.w t5, t5, 3 add.w t1, t1, t5 add.d t1, t6, t1 //fh's offset vldrepl.d vr8, t1, 0 srli.w a7, a7, 2 blt t0, a4, .l_\lable\()hv_idx_fv_lsx andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()hv_idx_fv_lsx: addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset vldrepl.d vr9, a7, 0 vsllwil.h.b vr9, vr9, 0 sub.d a1, a1, t3 addi.d a1, a1, -3 beq a3, t0, .l_\lable\()hv_4w_lsx b .l_\lable\()hv_8w_lsx 
.l_\lable\()hv_4w_lsx: addi.d a1, a1, 2 //ignore leading 0s vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 add.d a1, a1, t3 vld vr3, a1, 0 vldx vr4, a1, a2 vldx vr5, a1, t2 vldx vr6, a1, t3 add.d a1, a1, t4 la.local t1, subpel_h_shuf1 vld vr7, t1, 0 vbsrl.v vr8, vr8, 2 vreplvei.w vr8, vr8, 0 //fv vreplvei.w vr17, vr9, 0 vreplvei.w vr18, vr9, 1 vreplvei.w vr19, vr9, 2 vreplvei.w vr20, vr9, 3 //DAV2D_FILTER_8TAP_RND vshuf.b vr0, vr0, vr0, vr7 vshuf.b vr1, vr1, vr1, vr7 vshuf.b vr2, vr2, vr2, vr7 vshuf.b vr3, vr3, vr3, vr7 vshuf.b vr4, vr4, vr4, vr7 vshuf.b vr5, vr5, vr5, vr7 vshuf.b vr6, vr6, vr6, vr7 vmulwev.h.bu.b vr10, vr0, vr8 vmulwev.h.bu.b vr11, vr1, vr8 vmulwev.h.bu.b vr12, vr2, vr8 vmulwev.h.bu.b vr13, vr3, vr8 vmulwev.h.bu.b vr14, vr4, vr8 vmulwev.h.bu.b vr15, vr5, vr8 vmulwev.h.bu.b vr16, vr6, vr8 vmaddwod.h.bu.b vr10, vr0, vr8 vmaddwod.h.bu.b vr11, vr1, vr8 vmaddwod.h.bu.b vr12, vr2, vr8 vmaddwod.h.bu.b vr13, vr3, vr8 vmaddwod.h.bu.b vr14, vr4, vr8 vmaddwod.h.bu.b vr15, vr5, vr8 vmaddwod.h.bu.b vr16, vr6, vr8 vhaddw.w.h vr10, vr10, vr10 vhaddw.w.h vr11, vr11, vr11 vhaddw.w.h vr12, vr12, vr12 vhaddw.w.h vr13, vr13, vr13 vhaddw.w.h vr14, vr14, vr14 vhaddw.w.h vr15, vr15, vr15 vhaddw.w.h vr16, vr16, vr16 vssrarni.h.w vr10, vr10, 2 //h0 vssrarni.h.w vr11, vr11, 2 //h1 vssrarni.h.w vr12, vr12, 2 //h2 vssrarni.h.w vr13, vr13, 2 //h3 vssrarni.h.w vr14, vr14, 2 //h4 vssrarni.h.w vr15, vr15, 2 //h5 vssrarni.h.w vr16, vr16, 2 //h6 //h0 vilvl.h vr0, vr11, vr10 //01 vilvl.h vr1, vr13, vr12 //23 vilvl.h vr2, vr15, vr14 //45 //h1 vilvl.h vr4, vr12, vr11 //12 vilvl.h vr5, vr14, vr13 //34 vilvl.h vr6, vr16, vr15 //56 .l_\lable\()hv_w4_loop_lsx: vld vr9, a1, 0 vldx vr10, a1, a2 add.d a1, a1, t2 //DAV2D_FILTER_8TAP_CLIP vshuf.b vr9, vr9, vr9, vr7 vshuf.b vr10, vr10, vr10, vr7 vmulwev.h.bu.b vr11, vr9, vr8 vmulwev.h.bu.b vr12, vr10, vr8 vmaddwod.h.bu.b vr11, vr9, vr8 vmaddwod.h.bu.b vr12, vr10, vr8 vhaddw.w.h vr11, vr11, vr11 vhaddw.w.h vr12, vr12, vr12 vssrarni.h.w vr11, 
vr11, 2 //7h vssrarni.h.w vr12, vr12, 2 //h8 vilvl.h vr3, vr11, vr16 //67 vilvl.h vr13, vr12, vr11 //78 vmulwev.w.h vr9, vr0, vr17 vmulwev.w.h vr10, vr1, vr18 vmulwev.w.h vr14, vr2, vr19 vmulwev.w.h vr15, vr3, vr20 vmaddwod.w.h vr9, vr0, vr17 vmaddwod.w.h vr10, vr1, vr18 vmaddwod.w.h vr14, vr2, vr19 vmaddwod.w.h vr15, vr3, vr20 vadd.w vr16, vr9, vr10 vadd.w vr16, vr16, vr14 vadd.w vr16, vr16, vr15 vmulwev.w.h vr9, vr4, vr17 vmulwev.w.h vr10, vr5, vr18 vmulwev.w.h vr14, vr6, vr19 vmulwev.w.h vr15, vr13, vr20 vmaddwod.w.h vr9, vr4, vr17 vmaddwod.w.h vr10, vr5, vr18 vmaddwod.w.h vr14, vr6, vr19 vmaddwod.w.h vr15, vr13, vr20 vadd.w vr21, vr9, vr10 vadd.w vr21, vr21, vr14 vadd.w vr21, vr21, vr15 vssrarni.h.w vr21, vr16, 6 //cache vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr3, 0 vaddi.hu vr4, vr5, 0 vaddi.hu vr5, vr6, 0 vaddi.hu vr6, vr13, 0 vaddi.hu vr16, vr12, 0 vst vr21, a0, 0 addi.d a0, a0, 16 addi.d a4, a4, -2 bnez a4, .l_\lable\()hv_w4_loop_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()hv_8w_lsx: .l_\lable\()hv_16w_lsx: .l_\lable\()hv_32w_lsx: .l_\lable\()hv_64w_lsx: .l_\lable\()hv_128w_lsx: addi.d sp, sp, -8*8 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 addi.d t0, a1, 0 //src addi.d t5, a4, 0 //h addi.d t8, a0, 0 //dst slli.w t6, a3, 1 la.local t1, subpel_h_shuf1 vld vr7, t1, 0 vaddi.bu vr11, vr7, 4 vaddi.bu vr12, vr7, 8 vreplvei.w vr10, vr8, 1 vreplvei.w vr8, vr8, 0 vreplvei.w vr20, vr9, 1 vreplvei.w vr21, vr9, 2 vreplvei.w vr22, vr9, 3 vreplvei.w vr9, vr9, 0 .l_\lable\()prep_hv_8w_loop0_lsx: vld vr0, a1, 0 vldx vr1, a1, a2 vldx vr2, a1, t2 add.d a1, a1, t3 vld vr3, a1, 0 vldx vr4, a1, a2 vldx vr5, a1, t2 vldx vr6, a1, t3 add.d a1, a1, t4 FILTER_8TAP_8W vr0 //h0 FILTER_8TAP_8W vr1 //h1 FILTER_8TAP_8W vr2 //h2 FILTER_8TAP_8W vr3 //h3 FILTER_8TAP_8W vr4 //h4 FILTER_8TAP_8W vr5 //h5 FILTER_8TAP_8W vr6 //h6 //h0' low part vilvl.h vr23, vr1, vr0 //01 
vilvl.h vr24, vr3, vr2 //23 vilvl.h vr25, vr5, vr4 //45 //h0' high part vilvh.h vr26, vr1, vr0 //01 vilvh.h vr27, vr3, vr2 //23 vilvh.h vr28, vr5, vr4 //45 //h1' low part vilvl.h vr29, vr2, vr1 //12 vilvl.h vr30, vr4, vr3 //34 vilvl.h vr31, vr6, vr5 //56 //h1' high part vilvh.h vr0, vr2, vr1 //12 vilvh.h vr1, vr4, vr3 //34 vilvh.h vr2, vr6, vr5 //56 .l_\lable\()prep_hv_8w_loop_lsx: vld vr3, a1, 0 vldx vr4, a1, a2 add.d a1, a1, t2 FILTER_8TAP_8W vr3 //h7 FILTER_8TAP_8W vr4 //h8 //h0' low part vilvl.h vr16, vr3, vr6 //67 ~low vmulwev.w.h vr13, vr23, vr9 vmulwev.w.h vr14, vr24, vr20 vmulwev.w.h vr15, vr25, vr21 vmulwev.w.h vr17, vr16, vr22 vmaddwod.w.h vr13, vr23, vr9 vmaddwod.w.h vr14, vr24, vr20 vmaddwod.w.h vr15, vr25, vr21 vmaddwod.w.h vr17, vr16, vr22 vadd.w vr13, vr13, vr14 vadd.w vr13, vr13, vr15 vadd.w vr13, vr13, vr17 //cache vaddi.hu vr23, vr24, 0 vaddi.hu vr24, vr25, 0 vaddi.hu vr25, vr16, 0 //h0' high part vilvh.h vr17, vr3, vr6 //67 ~high vmulwev.w.h vr14, vr26, vr9 vmulwev.w.h vr15, vr27, vr20 vmulwev.w.h vr16, vr28, vr21 vmulwev.w.h vr18, vr17, vr22 vmaddwod.w.h vr14, vr26, vr9 vmaddwod.w.h vr15, vr27, vr20 vmaddwod.w.h vr16, vr28, vr21 vmaddwod.w.h vr18, vr17, vr22 vadd.w vr14, vr14, vr15 vadd.w vr14, vr14, vr16 vadd.w vr14, vr14, vr18 vssrarni.h.w vr14, vr13, 6 vst vr14, a0, 0 add.d a0, a0, t6 //cache vaddi.hu vr26, vr27, 0 vaddi.hu vr27, vr28, 0 vaddi.hu vr28, vr17, 0 vaddi.hu vr6, vr4, 0 vilvl.h vr5, vr4, vr3 //78 ~low vilvh.h vr4, vr4, vr3 //78 ~high //h1' low part vmulwev.w.h vr13, vr29, vr9 vmulwev.w.h vr14, vr30, vr20 vmulwev.w.h vr15, vr31, vr21 vmulwev.w.h vr16, vr5, vr22 vmaddwod.w.h vr13, vr29, vr9 vmaddwod.w.h vr14, vr30, vr20 vmaddwod.w.h vr15, vr31, vr21 vmaddwod.w.h vr16, vr5, vr22 vadd.w vr13, vr13, vr14 vadd.w vr13, vr13, vr15 vadd.w vr13, vr13, vr16 //cache vaddi.hu vr29, vr30, 0 vaddi.hu vr30, vr31, 0 vaddi.hu vr31, vr5, 0 //h1' high part vmulwev.w.h vr14, vr0, vr9 vmulwev.w.h vr15, vr1, vr20 vmulwev.w.h vr16, vr2, vr21 vmulwev.w.h 
vr17, vr4, vr22 vmaddwod.w.h vr14, vr0, vr9 vmaddwod.w.h vr15, vr1, vr20 vmaddwod.w.h vr16, vr2, vr21 vmaddwod.w.h vr17, vr4, vr22 vadd.w vr14, vr14, vr15 vadd.w vr14, vr14, vr16 vadd.w vr14, vr14, vr17 vssrarni.h.w vr14, vr13, 6 vst vr14, a0, 0 add.d a0, a0, t6 //cache vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr4, 0 addi.w a4, a4, -2 bnez a4, .l_\lable\()prep_hv_8w_loop_lsx addi.d a1, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 16 addi.d t8, t8, 16 addi.d a4, t5, 0 addi.w a3, a3, -8 bnez a3, .l_\lable\()prep_hv_8w_loop0_lsx fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 8*8 b .l_\lable\()end_pre_8tap_lsx .l_\lable\()v_lsx: srli.w a7, a7, 2 blt t0, a4, .l_\lable\()v_idx_fv_lsx andi a7, a7, 1 addi.w a7, a7, 3 .l_\lable\()v_idx_fv_lsx: addi.w t5, zero, 120 mul.w a7, a7, t5 addi.w t5, a6, -1 slli.w t5, t5, 3 add.w a7, a7, t5 add.d a7, t6, a7 //fv's offset vldrepl.d vr8, a7, 0 vilvl.h vr8, vr8, vr8 vreplvei.w vr9, vr8, 1 vreplvei.w vr10, vr8, 2 vreplvei.w vr11, vr8, 3 vreplvei.w vr8, vr8, 0 sub.d a1, a1, t3 beq a3, t0, .l_\lable\()v_4w_lsx blt t0, a3, .l_\lable\()v_8w_lsx .l_\lable\()v_4w_lsx: fld.s f0, a1, 0 fldx.s f1, a1, a2 fldx.s f2, a1, t2 add.d a1, a1, t3 fld.s f3, a1, 0 fldx.s f4, a1, a2 fldx.s f5, a1, t2 fldx.s f6, a1, t3 add.d a1, a1, t4 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr2, vr1 vilvl.b vr0, vr1, vr0 //0 1 1 2 vilvl.w vr1, vr3, vr2 vilvl.w vr2, vr4, vr3 vilvl.b vr1, vr2, vr1 //2 3 3 4 vilvl.w vr2, vr5, vr4 vilvl.w vr3, vr6, vr5 vilvl.b vr2, vr3, vr2 //4 5 5 6 .l_\lable\()v_4w_loop_lsx: fld.s f7, a1, 0 vilvl.w vr3, vr7, vr6 fldx.s f6, a1, a2 add.d a1, a1, t2 vilvl.w vr4, vr6, vr7 vilvl.b vr3, vr4, vr3 //6 7 7 8 vmulwev.h.bu.b vr12, vr0, vr8 vmulwev.h.bu.b vr13, vr1, vr9 vmulwev.h.bu.b vr14, vr2, vr10 vmulwev.h.bu.b vr15, vr3, vr11 vmaddwod.h.bu.b vr12, vr0, vr8 vmaddwod.h.bu.b vr13, vr1, vr9 vmaddwod.h.bu.b vr14, vr2, vr10 vmaddwod.h.bu.b 
vr15, vr3, vr11 vaddi.hu vr0, vr1, 0 vaddi.hu vr1, vr2, 0 vaddi.hu vr2, vr3, 0 vadd.h vr12, vr12, vr13 vadd.h vr12, vr12, vr14 vadd.h vr12, vr12, vr15 vsrari.h vr12, vr12, 2 vst vr12, a0, 0 addi.d a0, a0, 16 addi.w a4, a4, -2 bnez a4, .l_\lable\()v_4w_loop_lsx b .l_\lable\()end_pre_8tap_lsx .l_\lable\()v_8w_lsx: addi.d t0, a1, 0 addi.d t5, a4, 0 addi.d t8, a0, 0 slli.w t6, a3, 1 .l_\lable\()v_8w_loop0_lsx: fld.d f0, a1, 0 fldx.d f1, a1, a2 fldx.d f2, a1, t2 add.d a1, a1, t3 fld.d f3, a1, 0 fldx.d f4, a1, a2 fldx.d f5, a1, t2 fldx.d f6, a1, t3 add.d a1, a1, t4 vilvl.b vr0, vr1, vr0 //0 1 vilvl.b vr1, vr2, vr1 //1 2 vilvl.b vr2, vr3, vr2 //2 3 vilvl.b vr3, vr4, vr3 //3 4 vilvl.b vr4, vr5, vr4 //4 5 vilvl.b vr5, vr6, vr5 //5 6 .l_\lable\()v_8w_loop_lsx: fld.d f7, a1, 0 vilvl.b vr12, vr7, vr6 //6 7 fldx.d f6, a1, a2 add.d a1, a1, t2 vilvl.b vr13, vr6, vr7 //7 8 vmulwev.h.bu.b vr14, vr0, vr8 vmulwev.h.bu.b vr15, vr1, vr8 vmulwev.h.bu.b vr16, vr2, vr9 vmulwev.h.bu.b vr17, vr3, vr9 vmulwev.h.bu.b vr18, vr4, vr10 vmulwev.h.bu.b vr19, vr5, vr10 vmulwev.h.bu.b vr20, vr12, vr11 vmulwev.h.bu.b vr21, vr13, vr11 vmaddwod.h.bu.b vr14, vr0, vr8 vmaddwod.h.bu.b vr15, vr1, vr8 vmaddwod.h.bu.b vr16, vr2, vr9 vmaddwod.h.bu.b vr17, vr3, vr9 vmaddwod.h.bu.b vr18, vr4, vr10 vmaddwod.h.bu.b vr19, vr5, vr10 vmaddwod.h.bu.b vr20, vr12, vr11 vmaddwod.h.bu.b vr21, vr13, vr11 vaddi.hu vr0, vr2, 0 vaddi.hu vr1, vr3, 0 vaddi.hu vr2, vr4, 0 vaddi.hu vr3, vr5, 0 vaddi.hu vr4, vr12, 0 vaddi.hu vr5, vr13, 0 vadd.h vr14, vr14, vr16 vadd.h vr14, vr14, vr18 vadd.h vr14, vr14, vr20 vadd.h vr15, vr15, vr17 vadd.h vr15, vr15, vr19 vadd.h vr15, vr15, vr21 vsrari.h vr14, vr14, 2 vsrari.h vr15, vr15, 2 vst vr14, a0, 0 add.d a0, a0, t6 vst vr15, a0, 0 add.d a0, a0, t6 addi.w a4, a4, -2 bnez a4, .l_\lable\()v_8w_loop_lsx addi.d a1, t0, 8 addi.d t0, t0, 8 addi.d a0, t8, 16 addi.d t8, t8, 16 addi.d a4, t5, 0 addi.d a3, a3, -8 bnez a3, .l_\lable\()v_8w_loop0_lsx .l_\lable\()end_pre_8tap_lsx: .endm function 
prep_8tap_regular_8bpc_lsx
    /*
     * prep_8tap_<type>_8bpc_lsx entry points.
     *
     * Each one only loads a filter-type code into a7 and expands
     * PREP_8TAP_8BPC_LSX. Inside that macro the horizontal filter index is
     * derived from (a7 & 3) and the vertical one from (a7 >> 2), so the codes
     * used here are h | (v << 2) with h, v in {0, 1, 2}.
     * The macro argument is only spliced into the macro's local label names
     * (.l_\lable\()...) so that every expansion gets unique labels.
     */
    addi.w a7, zero, 0
    PREP_8TAP_8BPC_LSX 0
endfunc

function prep_8tap_smooth_regular_8bpc_lsx
    addi.w a7, zero, 1
    PREP_8TAP_8BPC_LSX 1
endfunc

function prep_8tap_sharp_regular_8bpc_lsx
    addi.w a7, zero, 2
    PREP_8TAP_8BPC_LSX 2
endfunc

function prep_8tap_regular_smooth_8bpc_lsx
    addi.w a7, zero, 4
    PREP_8TAP_8BPC_LSX 4
endfunc

function prep_8tap_smooth_8bpc_lsx
    addi.w a7, zero, 5
    PREP_8TAP_8BPC_LSX 5
endfunc

function prep_8tap_sharp_smooth_8bpc_lsx
    addi.w a7, zero, 6
    PREP_8TAP_8BPC_LSX 6
endfunc

function prep_8tap_regular_sharp_8bpc_lsx
    addi.w a7, zero, 8
    PREP_8TAP_8BPC_LSX 8
endfunc

function prep_8tap_smooth_sharp_8bpc_lsx
    addi.w a7, zero, 9
    PREP_8TAP_8BPC_LSX 9
endfunc

function prep_8tap_sharp_8bpc_lsx
    addi.w a7, zero, 10
    PREP_8TAP_8BPC_LSX 10
endfunc

/*
 * static void blend_lsx(pixel *dst, const ptrdiff_t dst_stride,
                         const pixel *tmp, const int w, int h,
                         const uint8_t *mask)
 *
 * Per-pixel masked blend, one row per loop iteration:
 *     dst[x] = sat_u8((tmp[x] * m + dst[x] * (64 - m) + 32) >> 6),  m = mask[x]
 *
 * Register use as read by the code below:
 *     a0 = dst (advanced by a1 each row)     a1 = dst_stride
 *     a2 = tmp (advanced by w each row)      a3 = w, only used for dispatch
 *     a4 = h   (row counter)                 a5 = mask (advanced by w each row)
 *     vr23 = 64 replicated in every byte
 *
 * NOTE(review): every path loads full 16-byte vectors from dst/tmp/mask, so
 * the w=4 and w=8 paths read past the bytes they use - presumably the
 * buffers are padded; confirm against the callers.
 */
function blend_8bpc_lsx
    addi.d t8, zero, 64
    vreplgr2vr.b vr23, t8
    // Jump-table index = clz(w) - 26: w=32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3
    clz.w t0, a3
    li.w t1, 26
    sub.w t0, t0, t1
    la.local t1, .BLEND_LSX_JRTABLE
    alsl.d t0, t0, t1, 1 // t0 = table + index * 2 (entries are halfwords)
    ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
    add.d t1, t1, t2 // Get absolute address
    jirl $r0, t1, 0

    .align 3
.BLEND_LSX_JRTABLE:
    .hword .BLEND_W32_LSX - .BLEND_LSX_JRTABLE
    .hword .BLEND_W16_LSX - .BLEND_LSX_JRTABLE
    .hword .BLEND_W8_LSX - .BLEND_LSX_JRTABLE
    .hword .BLEND_W4_LSX - .BLEND_LSX_JRTABLE

    // w == 4: widen the low 8 bytes to 16 bit, blend, store 4 bytes
.BLEND_W4_LSX:
    vld vr0, a0, 0 // dst
    vld vr1, a2, 0 // tmp
    vld vr2, a5, 0 // mask
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr4, vr2, 0
    vmul.h vr1, vr1, vr4 //b*m
    vsub.b vr3, vr23, vr2 // 64 - m
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr3, vr3, 0
    vmadd.h vr1, vr0, vr3 // += dst * (64 - m)
    vssrarni.bu.h vr1, vr1, 6 // rounding >> 6, saturate to u8
    vstelm.w vr1, a0, 0, 0
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 4
    addi.d a5, a5, 4
    blt zero, a4, .BLEND_W4_LSX
    b .BLEND_END_LSX

    // w == 8: same arithmetic as w == 4, store 8 bytes
.BLEND_W8_LSX:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vld vr2, a5, 0
    vsllwil.hu.bu vr1, vr1, 0
    vsllwil.hu.bu vr4, vr2, 0
    vmul.h vr1, vr1, vr4 //b*m
    vsub.b vr3, vr23, vr2
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr3, vr3, 0
    vmadd.h vr1, vr0, vr3
    vssrarni.bu.h vr1, vr1, 6
    vstelm.d vr1, a0, 0, 0
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 8
    addi.d a5, a5, 8
    blt zero, a4, .BLEND_W8_LSX
    b .BLEND_END_LSX

    // w == 16: low half is processed in vr1, high half (vexth) in vr5
.BLEND_W16_LSX:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vld vr2, a5, 0
    vexth.hu.bu vr5, vr1
    vsllwil.hu.bu vr1, vr1, 0
    vexth.hu.bu vr6, vr2
    vsllwil.hu.bu vr4, vr2, 0
    vmul.h vr1, vr1, vr4 //b*m
    vmul.h vr5, vr5, vr6 //b*m
    vsub.b vr3, vr23, vr2
    vexth.hu.bu vr7, vr0
    vexth.hu.bu vr8, vr3
    vmadd.h vr5, vr7, vr8
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr3, vr3, 0
    vmadd.h vr1, vr0, vr3
    vssrarni.bu.h vr5, vr1, 6 // narrow both halves into one 16-byte result
    vst vr5, a0, 0
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 16
    addi.d a5, a5, 16
    blt zero, a4, .BLEND_W16_LSX
    b .BLEND_END_LSX

    // w == 32: the w == 16 sequence applied to bytes 0..15, then to 16..31
.BLEND_W32_LSX:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vld vr2, a5, 0
    vexth.hu.bu vr5, vr1
    vsllwil.hu.bu vr1, vr1, 0
    vexth.hu.bu vr6, vr2
    vsllwil.hu.bu vr4, vr2, 0
    vmul.h vr1, vr1, vr4 //b*m
    vmul.h vr5, vr5, vr6 //b*m
    vsub.b vr3, vr23, vr2
    vexth.hu.bu vr7, vr0
    vexth.hu.bu vr8, vr3
    vmadd.h vr5, vr7, vr8
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr3, vr3, 0
    vmadd.h vr1, vr0, vr3
    vssrarni.bu.h vr5, vr1, 6
    vst vr5, a0, 0

    /* second 16 pixels of the row */
    vld vr0, a0, 16
    vld vr1, a2, 16
    vld vr2, a5, 16
    vexth.hu.bu vr5, vr1
    vsllwil.hu.bu vr1, vr1, 0
    vexth.hu.bu vr6, vr2
    vsllwil.hu.bu vr4, vr2, 0
    vmul.h vr1, vr1, vr4 //b*m
    vmul.h vr5, vr5, vr6 //b*m
    vsub.b vr3, vr23, vr2
    vexth.hu.bu vr7, vr0
    vexth.hu.bu vr8, vr3
    vmadd.h vr5, vr7, vr8
    vsllwil.hu.bu vr0, vr0, 0
    vsllwil.hu.bu vr3, vr3, 0
    vmadd.h vr1, vr0, vr3
    vssrarni.bu.h vr5, vr1, 6
    vst vr5, a0, 16
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 32
    addi.d a5, a5, 32
    blt zero, a4, .BLEND_W32_LSX // last case: falls through to the end label
.BLEND_END_LSX:
endfunc

/*
 * Blend weight table used by the blend_v / blend_h kernels.
 * Entries are byte pairs; in the kernels the first byte of a pair multiplies
 * the dst pixel and the second the tmp pixel. The block labelled N starts at
 * byte offset 2 * N (2 -> 4, 4 -> 8, 8 -> 16, 16 -> 32, 32 -> 64).
 */
const obmc_masks_la
/* Unused */
.byte 0, 0, 0, 0
/* 2 */
.byte 45, 19, 64, 0
/* 4 */
.byte 39, 25, 50, 14, 59, 5, 64, 0
/* 8 */
.byte 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
/* 16 */
.byte 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
.byte 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
/* 32 */
.byte 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
.byte 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
.byte 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
endconst

/*
 * static void blend_v_lsx(pixel *dst, const ptrdiff_t dst_stride,
                           const pixel *tmp, const int w, int h)
 *
 * Blends tmp into dst with weights that depend only on the column. The
 * weights are the (dst weight, tmp weight) byte pairs stored in
 * obmc_masks_la at byte offset 2 * w:
 *     dst[x] = sat_u8((dst[x] * wd[x] + tmp[x] * wt[x] + 32) >> 6)
 * Only the left 3/4 of each row is stored (1, 3, 6, 12, 24 pixels for
 * w = 2, 4, 8, 16, 32).
 *
 * Register use as read by the code below:
 *     a0 = dst (advanced by a1 each row)   a1 = dst_stride
 *     a2 = tmp (advanced by w each row)    a3 = w, only used for dispatch
 *     a4 = h   (row counter)               t8 = obmc_masks_la
 */
function blend_v_8bpc_lsx
    la.local t8, obmc_masks_la
    // Jump-table index = clz(w) - 26: w=32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3, 2 -> 4
    clz.w t0, a3
    li.w t1, 26
    sub.w t0, t0, t1
    la.local t1, .BLEND_V_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
    add.d t1, t1, t2 // Get absolute address
    jirl $r0, t1, 0

    .align 3
.BLEND_V_LSX_JRTABLE:
    .hword .BLEND_V_W32_LSX - .BLEND_V_LSX_JRTABLE
    .hword .BLEND_V_W16_LSX - .BLEND_V_LSX_JRTABLE
    .hword .BLEND_V_W8_LSX - .BLEND_V_LSX_JRTABLE
    .hword .BLEND_V_W4_LSX - .BLEND_V_LSX_JRTABLE
    .hword .BLEND_V_W2_LSX - .BLEND_V_LSX_JRTABLE
    // Sixth entry: presumably padding so the table is 12 bytes long and the
    // code that follows stays 4-byte aligned.
    .hword .BLEND_V_W2_LSX_1 - .BLEND_V_LSX_JRTABLE //Instructions must be 4-byte aligned

    // w == 2: scalar path, a single pixel per row
.BLEND_V_W2_LSX:
    ld.bu t6, t8, 4 // dst weight (table byte 4)
    ld.bu t7, t8, 5 // tmp weight (table byte 5)
.BLEND_V_W2_LSX_1:
    ld.bu t0, a0, 0
    ld.bu t1, a2, 0
    mul.d t0, t0, t6
    mul.d t1, t1, t7
    addi.d t0, t0, 32 // rounding term for the >> 6
    add.d t0, t0, t1
    srli.d t0, t0, 6
    st.b t0, a0, 0
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 2
    // NOTE(review): a5 is not a parameter in the prototype above and is not
    // read anywhere in this function - looks like a leftover; confirm.
    addi.d a5, a5, 2
    blt zero, a4, .BLEND_V_W2_LSX_1
    b .BLEND_V_END_LSX

    // w == 4: weights at table offset 8, 3 pixels stored per row
.BLEND_V_W4_LSX:
    vld vr20, t8, 8
.BLEND_V_W4_LSX_1:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0 // interleave: dst0 tmp0 dst1 tmp1 ...
    // presumably a per-halfword dot product of adjacent byte pairs
    // (vdp2.h.bu is not defined in the visible part of the file)
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.h vr1, a0, 0, 0 // pixels 0..1
    vstelm.b vr1, a0, 2, 2 // pixel 2
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 4
    blt zero, a4, .BLEND_V_W4_LSX_1
    b .BLEND_V_END_LSX

    // w == 8: weights at table offset 16, 6 pixels stored per row
.BLEND_V_W8_LSX:
    vld vr20, t8, 16
.BLEND_V_W8_LSX_1:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.w vr1, a0, 0, 0 // pixels 0..3
    vstelm.h vr1, a0, 4, 2 // pixels 4..5
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 8
    blt zero, a4, .BLEND_V_W8_LSX_1
    b .BLEND_V_END_LSX

    // w == 16: weights at table offsets 32 and 48, 12 pixels stored per row
.BLEND_V_W16_LSX:
    vld vr20, t8, 32
    vld vr21, t8, 48
.BLEND_V_W16_LSX_1:
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr2, vr1, vr0
    vilvh.b vr3, vr1, vr0
    vmulwev.h.bu vr4, vr2, vr20 // even bytes: dst * dst weight
    vmulwev.h.bu vr5, vr3, vr21
    vmaddwod.h.bu vr4, vr2, vr20 // odd bytes: += tmp * tmp weight
    vmaddwod.h.bu vr5, vr3, vr21
    vssrarni.bu.h vr5, vr4, 6
    vstelm.d vr5, a0, 0, 0 // pixels 0..7
    vstelm.w vr5, a0, 8, 2 // pixels 8..11
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 16
    blt zero, a4, .BLEND_V_W16_LSX_1
    b .BLEND_V_END_LSX

    // w == 32: weights at table offsets 64, 80, 96, 24 pixels stored per row
.BLEND_V_W32_LSX:
    vld vr20, t8, 64
    vld vr21, t8, 80
    vld vr22, t8, 96
.BLEND_V_W32_LSX_1:
    vld vr0, a0, 0
    vld vr1, a0, 16
    vld vr2, a2, 0
    vld vr3, a2, 16
    vilvl.b vr4, vr2, vr0
    vmulwev.h.bu vr7, vr4, vr20
    vilvh.b vr5, vr2, vr0
    vmulwev.h.bu vr8, vr5, vr21
    vilvl.b vr6, vr3, vr1 // only the low half of the second 16 bytes is needed
    vmulwev.h.bu vr9, vr6, vr22
    vmaddwod.h.bu vr7, vr4, vr20
    vmaddwod.h.bu vr8, vr5, vr21
    vmaddwod.h.bu vr9, vr6, vr22
    vssrarni.bu.h vr8, vr7, 6
    vssrarni.bu.h vr9, vr9, 6
    vst vr8, a0, 0 // pixels 0..15
    vstelm.d vr9, a0, 16, 0 // pixels 16..23
    addi.w a4, a4, -1
    add.d a0, a0, a1
    addi.d a2, a2, 32
    blt zero, a4, .BLEND_V_W32_LSX_1 // last case: falls through to the end label
.BLEND_V_END_LSX:
endfunc

/*
 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride,
                           const pixel *tmp, const int w, int h)
 *
 * Blends tmp into dst with one weight pair per row. The (dst weight,
 * tmp weight) byte pair of the current row is read from obmc_masks_la
 * starting at byte offset 2 * h and replicated across the vector:
 *     dst[x] = sat_u8((dst[x] * wd + tmp[x] * wt + 32) >> 6)
 * Only the first (h >> 1) + (h >> 2) rows are processed.
 *
 * Register use as read by the code below:
 *     a0 = dst (advanced by a1 each row)   a1 = dst_stride
 *     a2 = tmp (advanced by w each row)    a3 = w, only used for dispatch
 *     t8 = pointer to the current row's weight pair
 *     a4 = on entry h; rewritten to the end pointer for t8
 */
function blend_h_8bpc_lsx
    la.local t8, obmc_masks_la
    alsl.d t8, a4, t8, 1 // t8 = obmc_masks_la + 2 * h
    srli.d t0, a4, 1
    srli.d t1, a4, 2
    add.d a4, t0, t1 // h = (h * 3) >> 2;
    slli.d a4, a4, 1 // 2 bytes of weights per row
    add.d a4, a4, t8 // a4 = one past the last weight pair to use
    // Jump-table index = clz(w) - 24: w=128 -> 0, 64 -> 1, ..., 2 -> 6, 1 -> 7
    clz.w t0, a3
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .BLEND_H_LSX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
    add.d t1, t1, t2 // Get absolute address
    jirl $r0, t1, 0

    .align 3
.BLEND_H_LSX_JRTABLE:
    .hword .BLEND_H_W128_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W64_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W32_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W16_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W8_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W4_LSX - .BLEND_H_LSX_JRTABLE
    .hword .BLEND_H_W2_LSX - .BLEND_H_LSX_JRTABLE
    // Index 7 jumps straight to the end label, i.e. nothing is blended
    .hword .BLEND_H_END_LSX - .BLEND_H_LSX_JRTABLE //Instructions must be 4-byte aligned

    // w == 2
.BLEND_H_W2_LSX:
    vldrepl.h vr20, t8, 0 // this row's weight pair in every halfword
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0 // interleave: dst0 tmp0 dst1 tmp1 ...
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.h vr1, a0, 0, 0
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 2
    blt t8, a4, .BLEND_H_W2_LSX
    b .BLEND_H_END_LSX

    // w == 4
.BLEND_H_W4_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.w vr1, a0, 0, 0
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 4
    blt t8, a4, .BLEND_H_W4_LSX
    b .BLEND_H_END_LSX

    // w == 8
.BLEND_H_W8_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.d vr1, a0, 0, 0
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 8
    blt t8, a4, .BLEND_H_W8_LSX
    b .BLEND_H_END_LSX

    // w == 16: even-byte multiply plus odd-byte multiply-add per halfword
.BLEND_H_W16_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr2, vr1, vr0
    vilvh.b vr3, vr1, vr0
    vmulwev.h.bu vr4, vr2, vr20
    vmulwev.h.bu vr5, vr3, vr20
    vmaddwod.h.bu vr4, vr2, vr20
    vmaddwod.h.bu vr5, vr3, vr20
    vssrarni.bu.h vr5, vr4, 6
    vst vr5, a0, 0
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 16
    blt t8, a4, .BLEND_H_W16_LSX
    b .BLEND_H_END_LSX

    // w == 32: two 16-byte vectors per row
.BLEND_H_W32_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a0, 16
    vld vr2, a2, 0
    vld vr3, a2, 16
    vilvl.b vr4, vr2, vr0
    vilvh.b vr5, vr2, vr0
    vilvl.b vr6, vr3, vr1
    vilvh.b vr3, vr3, vr1
    vmulwev.h.bu vr7, vr4, vr20
    vmulwev.h.bu vr8, vr5, vr20
    vmulwev.h.bu vr9, vr6, vr20
    vmulwev.h.bu vr0, vr3, vr20
    vmaddwod.h.bu vr7, vr4, vr20
    vmaddwod.h.bu vr8, vr5, vr20
    vmaddwod.h.bu vr9, vr6, vr20
    vmaddwod.h.bu vr0, vr3, vr20
    vssrarni.bu.h vr8, vr7, 6
    vssrarni.bu.h vr0, vr9, 6
    vst vr8, a0, 0
    vst vr0, a0, 16
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 32
    blt t8, a4, .BLEND_H_W32_LSX
    b .BLEND_H_END_LSX

    // w == 64: four 16-byte vectors per row
.BLEND_H_W64_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a0, 16
    vld vr2, a0, 32
    vld vr3, a0, 48
    vld vr4, a2, 0
    vld vr5, a2, 16
    vld vr6, a2, 32
    vld vr7, a2, 48
    vilvl.b vr8, vr4, vr0
    vilvh.b vr9, vr4, vr0
    vilvl.b vr10, vr5, vr1
    vilvh.b vr11, vr5, vr1
    vilvl.b vr12, vr6, vr2
    vilvh.b vr13, vr6, vr2
    vilvl.b vr14, vr7, vr3
    vilvh.b vr15, vr7, vr3
    vmulwev.h.bu vr0, vr8, vr20
    vmulwev.h.bu vr1, vr9, vr20
    vmulwev.h.bu vr2, vr10, vr20
    vmulwev.h.bu vr3, vr11, vr20
    vmulwev.h.bu vr4, vr12, vr20
    vmulwev.h.bu vr5, vr13, vr20
    vmulwev.h.bu vr6, vr14, vr20
    vmulwev.h.bu vr7, vr15, vr20
    vmaddwod.h.bu vr0, vr8, vr20
    vmaddwod.h.bu vr1, vr9, vr20
    vmaddwod.h.bu vr2, vr10, vr20
    vmaddwod.h.bu vr3, vr11, vr20
    vmaddwod.h.bu vr4, vr12, vr20
    vmaddwod.h.bu vr5, vr13, vr20
    vmaddwod.h.bu vr6, vr14, vr20
    vmaddwod.h.bu vr7, vr15, vr20
    vssrarni.bu.h vr1, vr0, 6
    vssrarni.bu.h vr3, vr2, 6
    vssrarni.bu.h vr5, vr4, 6
    vssrarni.bu.h vr7, vr6, 6
    vst vr1, a0, 0
    vst vr3, a0, 16
    vst vr5, a0, 32
    vst vr7, a0, 48
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 64
    blt t8, a4, .BLEND_H_W64_LSX
    b .BLEND_H_END_LSX

    // w == 128: the w == 64 sequence on bytes 0..63, then on bytes 64..127
.BLEND_H_W128_LSX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a0, 16
    vld vr2, a0, 32
    vld vr3, a0, 48
    vld vr4, a2, 0
    vld vr5, a2, 16
    vld vr6, a2, 32
    vld vr7, a2, 48
    vilvl.b vr8, vr4, vr0
    vilvh.b vr9, vr4, vr0
    vilvl.b vr10, vr5, vr1
    vilvh.b vr11, vr5, vr1
    vilvl.b vr12, vr6, vr2
    vilvh.b vr13, vr6, vr2
    vilvl.b vr14, vr7, vr3
    vilvh.b vr15, vr7, vr3
    vmulwev.h.bu vr0, vr8, vr20
    vmulwev.h.bu vr1, vr9, vr20
    vmulwev.h.bu vr2, vr10, vr20
    vmulwev.h.bu vr3, vr11, vr20
    vmulwev.h.bu vr4, vr12, vr20
    vmulwev.h.bu vr5, vr13, vr20
    vmulwev.h.bu vr6, vr14, vr20
    vmulwev.h.bu vr7, vr15, vr20
    vmaddwod.h.bu vr0, vr8, vr20
    vmaddwod.h.bu vr1, vr9, vr20
    vmaddwod.h.bu vr2, vr10, vr20
    vmaddwod.h.bu vr3, vr11, vr20
    vmaddwod.h.bu vr4, vr12, vr20
    vmaddwod.h.bu vr5, vr13, vr20
    vmaddwod.h.bu vr6, vr14, vr20
    vmaddwod.h.bu vr7, vr15, vr20
    vssrarni.bu.h vr1, vr0, 6
    vssrarni.bu.h vr3, vr2, 6
    vssrarni.bu.h vr5, vr4, 6
    vssrarni.bu.h vr7, vr6, 6
    vst vr1, a0, 0
    vst vr3, a0, 16
    vst vr5, a0, 32
    vst vr7, a0, 48

    /* second */
    vld vr0, a0, 64
    vld vr1, a0, 80
    vld vr2, a0, 96
    vld vr3, a0, 112
    vld vr4, a2, 64
    vld vr5, a2, 80
    vld vr6, a2, 96
    vld vr7, a2, 112
    vilvl.b vr8, vr4, vr0
    vilvh.b vr9, vr4, vr0
    vilvl.b vr10, vr5, vr1
    vilvh.b vr11, vr5, vr1
    vilvl.b vr12, vr6, vr2
    vilvh.b vr13, vr6, vr2
    vilvl.b vr14, vr7, vr3
    vilvh.b vr15, vr7, vr3
    vmulwev.h.bu vr0, vr8, vr20
    vmulwev.h.bu vr1, vr9, vr20
    vmulwev.h.bu vr2, vr10, vr20
    vmulwev.h.bu vr3, vr11, vr20
    vmulwev.h.bu vr4, vr12, vr20
    vmulwev.h.bu vr5, vr13, vr20
    vmulwev.h.bu vr6, vr14, vr20
    vmulwev.h.bu vr7, vr15, vr20
    vmaddwod.h.bu vr0, vr8, vr20
    vmaddwod.h.bu vr1, vr9, vr20
    vmaddwod.h.bu vr2, vr10, vr20
    vmaddwod.h.bu vr3, vr11, vr20
    vmaddwod.h.bu vr4, vr12, vr20
    vmaddwod.h.bu vr5, vr13, vr20
    vmaddwod.h.bu vr6, vr14, vr20
    vmaddwod.h.bu vr7, vr15, vr20
    vssrarni.bu.h vr1, vr0, 6
    vssrarni.bu.h vr3, vr2, 6
    vssrarni.bu.h vr5, vr4, 6
    vssrarni.bu.h vr7, vr6, 6
    vst vr1, a0, 64
    vst vr3, a0, 80
    vst vr5, a0, 96
    vst vr7, a0, 112
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 128
    blt t8, a4, .BLEND_H_W128_LSX
    // NOTE(review): this branch targets the label on the very next line
    b .BLEND_H_END_LSX
.BLEND_H_END_LSX:
endfunc

/*
 * static void blend_h_lasx(pixel *dst, const ptrdiff_t dst_stride,
                            const pixel *tmp, const int w, int h)
 */
function blend_h_8bpc_lasx
    la.local t8, obmc_masks_la
    alsl.d t8, a4, t8, 1
    srli.d t0, a4, 1
    srli.d t1, a4, 2
    add.d a4, t0, t1 // h = (h * 3) >> 2;
    slli.d a4, a4, 1
    add.d a4, a4, t8
    clz.w t0, a3
    li.w t1, 24
    sub.w t0, t0, t1
    la.local t1, .BLEND_H_LASX_JRTABLE
    alsl.d t0, t0, t1, 1
    ld.h t2, t0, 0 // The jump addresses are relative to JRTABLE
    add.d t1, t1, t2 // Get absolute address
    jirl $r0, t1, 0

    .align 3
.BLEND_H_LASX_JRTABLE:
    .hword .BLEND_H_W128_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W64_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W32_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W16_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W8_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W4_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_W2_LASX - .BLEND_H_LASX_JRTABLE
    .hword .BLEND_H_END_LASX - .BLEND_H_LASX_JRTABLE //Instructions must be 4-byte aligned
.BLEND_H_W2_LASX:
    vldrepl.h vr20, t8, 0
    vld vr0, a0, 0
    vld vr1, a2, 0
    vilvl.b vr0, vr1, vr0
    vdp2.h.bu vr1, vr0, vr20
    vssrarni.bu.h vr1, vr1, 6
    vstelm.h vr1, a0, 0, 0
    addi.d t8, t8, 2
    add.d a0, a0, a1
    addi.d a2, a2, 2
    blt t8, a4, .BLEND_H_W2_LASX
    b .BLEND_H_END_LASX
/*
 * Width-specialised row loops of blend_h_8bpc_lasx (widths 4, 8, 16, 32, 64
 * and the opening loads of the 128-wide loop).
 *
 * Register state shared by every loop, as established by the function entry:
 *   a0 = dst, a1 = dst_stride, a2 = tmp
 *   t8 = current position in obmc_masks_la (one halfword consumed per row)
 *   a4 = end of the mask range; each loop repeats while t8 < a4
 *
 * Every loop follows the same recipe: replicate the row's mask halfword,
 * interleave dst and tmp bytes, multiply-accumulate each byte pair with the
 * mask bytes, then narrow back to bytes with a rounding shift by 6 and
 * unsigned saturation.
 */
.BLEND_H_W4_LASX:
    vldrepl.h       vr20, t8, 0             // replicate this row's mask halfword
    vld             vr0, a0, 0              // dst
    vld             vr1, a2, 0              // tmp
    vilvl.b         vr0, vr1, vr0           // interleave dst/tmp bytes
    vdp2.h.bu       vr1, vr0, vr20
    vssrarni.bu.h   vr1, vr1, 6
    vstelm.w        vr1, a0, 0, 0           // store 4 pixels
    addi.d          t8, t8, 2               // next mask entry
    add.d           a0, a0, a1              // dst += dst_stride
    addi.d          a2, a2, 4               // tmp += 4
    blt             t8, a4, .BLEND_H_W4_LASX
    b               .BLEND_H_END_LASX
.BLEND_H_W8_LASX:
    vldrepl.h       vr20, t8, 0
    vld             vr0, a0, 0
    vld             vr1, a2, 0
    vilvl.b         vr0, vr1, vr0
    vdp2.h.bu       vr1, vr0, vr20
    vssrarni.bu.h   vr1, vr1, 6
    vstelm.d        vr1, a0, 0, 0           // store 8 pixels
    addi.d          t8, t8, 2
    add.d           a0, a0, a1
    addi.d          a2, a2, 8               // tmp += 8
    blt             t8, a4, .BLEND_H_W8_LASX
    b               .BLEND_H_END_LASX
.BLEND_H_W16_LASX:
    // 16 pixels fit in one 128-bit register, so this width uses the LSX
    // (vr) instruction forms even inside the LASX function.
    vldrepl.h       vr20, t8, 0
    vld             vr0, a0, 0
    vld             vr1, a2, 0
    vilvl.b         vr2, vr1, vr0           // low 8 dst/tmp byte pairs
    vilvh.b         vr3, vr1, vr0           // high 8 dst/tmp byte pairs
    vmulwev.h.bu    vr4, vr2, vr20          // even bytes * mask
    vmulwev.h.bu    vr5, vr3, vr20
    vmaddwod.h.bu   vr4, vr2, vr20          // += odd bytes * mask
    vmaddwod.h.bu   vr5, vr3, vr20
    vssrarni.bu.h   vr5, vr4, 6
    vst             vr5, a0, 0              // store 16 pixels
    addi.d          t8, t8, 2
    add.d           a0, a0, a1
    addi.d          a2, a2, 16              // tmp += 16
    // Branch targets must be this function's own labels; the identically
    // shaped loop in blend_h_8bpc_lsx is a different function.
    blt             t8, a4, .BLEND_H_W16_LASX
    b               .BLEND_H_END_LASX
.BLEND_H_W32_LASX:
    // From 32 pixels upwards the 256-bit LASX (xr) registers are used.
    xvldrepl.h      xr20, t8, 0
    xvld            xr0, a0, 0              // dst
    xvld            xr1, a2, 0              // tmp
    xvilvl.b        xr2, xr1, xr0
    xvilvh.b        xr3, xr1, xr0
    xvmulwev.h.bu   xr4, xr2, xr20
    xvmulwev.h.bu   xr5, xr3, xr20
    xvmaddwod.h.bu  xr4, xr2, xr20
    xvmaddwod.h.bu  xr5, xr3, xr20
    xvssrarni.bu.h  xr5, xr4, 6
    xvst            xr5, a0, 0              // store 32 pixels
    addi.d          t8, t8, 2
    add.d           a0, a0, a1
    addi.d          a2, a2, 32              // tmp += 32
    blt             t8, a4, .BLEND_H_W32_LASX
    b               .BLEND_H_END_LASX
.BLEND_H_W64_LASX:
    xvldrepl.h      xr20, t8, 0
    xvld            xr0, a0, 0              // dst[0..31]
    xvld            xr1, a0, 32             // dst[32..63]
    xvld            xr2, a2, 0              // tmp[0..31]
    xvld            xr3, a2, 32             // tmp[32..63]
    xvilvl.b        xr4, xr2, xr0
    xvilvh.b        xr5, xr2, xr0
    xvilvl.b        xr6, xr3, xr1
    xvilvh.b        xr7, xr3, xr1
    xvmulwev.h.bu   xr0, xr4, xr20
    xvmulwev.h.bu   xr1, xr5, xr20
    xvmulwev.h.bu   xr2, xr6, xr20
    xvmulwev.h.bu   xr3, xr7, xr20
    xvmaddwod.h.bu  xr0, xr4, xr20
    xvmaddwod.h.bu  xr1, xr5, xr20
    xvmaddwod.h.bu  xr2, xr6, xr20
    xvmaddwod.h.bu  xr3, xr7, xr20
    xvssrarni.bu.h  xr1, xr0, 6
    xvssrarni.bu.h  xr3, xr2, 6
    xvst            xr1, a0, 0
    xvst            xr3, a0, 32
    addi.d          t8, t8, 2
    add.d           a0, a0, a1
    addi.d          a2, a2, 64              // tmp += 64
    blt             t8, a4, .BLEND_H_W64_LASX
    b               .BLEND_H_END_LASX
.BLEND_H_W128_LASX:
    xvldrepl.h      xr20, t8, 0
    xvld            xr0, a0, 0
    xvld            xr1, a0, 32
    xvld            xr2, a0,
64 xvld xr3, a0, 96 xvld xr4, a2, 0 xvld xr5, a2, 32 xvld xr6, a2, 64 xvld xr7, a2, 96 xvilvl.b xr8, xr4, xr0 xvilvh.b xr9, xr4, xr0 xvilvl.b xr10, xr5, xr1 xvilvh.b xr11, xr5, xr1 xvilvl.b xr12, xr6, xr2 xvilvh.b xr13, xr6, xr2 xvilvl.b xr14, xr7, xr3 xvilvh.b xr15, xr7, xr3 xvmulwev.h.bu xr0, xr8, xr20 xvmulwev.h.bu xr1, xr9, xr20 xvmulwev.h.bu xr2, xr10, xr20 xvmulwev.h.bu xr3, xr11, xr20 xvmulwev.h.bu xr4, xr12, xr20 xvmulwev.h.bu xr5, xr13, xr20 xvmulwev.h.bu xr6, xr14, xr20 xvmulwev.h.bu xr7, xr15, xr20 xvmaddwod.h.bu xr0, xr8, xr20 xvmaddwod.h.bu xr1, xr9, xr20 xvmaddwod.h.bu xr2, xr10, xr20 xvmaddwod.h.bu xr3, xr11, xr20 xvmaddwod.h.bu xr4, xr12, xr20 xvmaddwod.h.bu xr5, xr13, xr20 xvmaddwod.h.bu xr6, xr14, xr20 xvmaddwod.h.bu xr7, xr15, xr20 xvssrarni.bu.h xr1, xr0, 6 xvssrarni.bu.h xr3, xr2, 6 xvssrarni.bu.h xr5, xr4, 6 xvssrarni.bu.h xr7, xr6, 6 xvst xr1, a0, 0 xvst xr3, a0, 32 xvst xr5, a0, 64 xvst xr7, a0, 96 addi.d t8, t8, 2 add.d a0, a0, a1 addi.d a2, a2, 128 blt t8, a4, .BLEND_H_W128_LASX b .BLEND_H_END_LASX .BLEND_H_END_LASX: endfunc /* * a1=16 | a2=8 | a3=4 * temp reg: a4 */ .macro PIXEL_COPY_LSX _dst, _src, _size blt \_size, a1, 8f 16: vld vr0, \_src, 0 vst vr0, \_dst, 0 addi.d \_size, \_size, -16 addi.d \_dst, \_dst, 16 addi.d \_src, \_src, 16 blt a1, \_size, 16b 8: blt \_size, a2, 14f ld.d a4, \_src, 0 st.d a4, \_dst, 0 addi.d \_size, \_size, -8 addi.d \_dst, \_dst, 8 addi.d \_src, \_src, 8 14: blt \_size, a3, 11f ld.w a4, \_src, 0 st.w a4, \_dst, 0 addi.d \_size, \_size, -4 addi.d \_dst, \_dst, 4 addi.d \_src, \_src, 4 11: beqz \_size, 110f 111: ld.b a4, \_src, 0 st.b a4, \_dst, 0 addi.d \_size, \_size, -1 addi.d \_dst, \_dst, 1 addi.d \_src, \_src, 1 bnez \_size, 111b 110: .endm /* * a1=16 | a2=8 | a3=4 */ .macro PIXEL_SET_LSX _dst, _vsrc, _size blt \_size, a1, 8f 16: vst \_vsrc, \_dst, 0 addi.d \_size, \_size, -16 addi.d \_dst, \_dst, 16 blt a1, \_size, 16b 8: blt \_size, a2, 14f vstelm.d \_vsrc, \_dst, 0, 0 addi.d \_size, \_size, -8 addi.d 
\_dst, \_dst, 8 14: blt \_size, a3, 11f vstelm.w \_vsrc, \_dst, 0, 0 addi.d \_size, \_size, -4 addi.d \_dst, \_dst, 4 11: beqz \_size, 110f 111: vstelm.b \_vsrc, \_dst, 0, 0 addi.d \_size, \_size, -1 addi.d \_dst, \_dst, 1 bnez \_size, 111b 110: .endm /* * temp reg: a4 a5 t2 t3 vr0 */ .macro DEGE_LOOP need_left, need_right 0: addi.d t2, t6, 0 // dst addi.d t3, t7, 0 // src .if \need_left vldrepl.b vr0, t3, 0 addi.d a5, t0, 0 PIXEL_SET_LSX t2, vr0, a5 .endif addi.d a5, t4, 0 PIXEL_COPY_LSX t2, t3, a5 .if \need_right vldrepl.b vr0, t3, -1 addi.d a5, t1, 0 PIXEL_SET_LSX t2, vr0, a5 .endif addi.d t5, t5, -1 add.d t7, t7, t8 add.d t6, t6, a7 bnez t5, 0b .endm /* * static void emu_edge_c(const intptr_t bw, const intptr_t bh, * const intptr_t iw, const intptr_t ih, * const intptr_t x, const intptr_t y, * pixel *dst, const ptrdiff_t dst_stride, * const pixel *ref, const ptrdiff_t ref_stride) */ function emu_edge_8bpc_lsx vxor.v vr23, vr23, vr23 // zero addi.d t0, a3, -1 // ih - 1 addi.d t1, a2, -1 // iw - 1 vreplgr2vr.w vr22, t0 vinsgr2vr.w vr22, t1, 1 vreplgr2vr.w vr0, a5 vinsgr2vr.w vr0, a4, 1 // [0] - h | [1] - w vclip.w vr2, vr0, vr23, vr22 vpickve2gr.w t0, vr2, 0 ld.d t2, sp, 0 ld.d t8, sp, 8 // ref_stride mul.w t0, t0, t8 vpickve2gr.w t1, vr2, 1 add.d t2, t2, t1 add.d t7, t0, t2 // ref addi.d t0, a0, -1 // bw - 1 addi.d t1, a1, -1 // bh - 1 vreplgr2vr.w vr21, t0 vreplgr2vr.w vr22, t1 vilvl.d vr21, vr22, vr21 sub.d t2, zero, a4 // -x add.d t3, a0, a4 sub.d t3, t3, a2 // x + bw - iw sub.d t4, zero, a5 // -y add.d t5, a1, a5 sub.d t5, t5, a3 // y + bh - ih vreplgr2vr.w vr0, t2 vinsgr2vr.w vr0, t3, 1 vinsgr2vr.w vr0, t4, 2 vinsgr2vr.w vr0, t5, 3 vclip.w vr2, vr0, vr23, vr21 vpickve2gr.w t0, vr2, 0 // left_ext vpickve2gr.w t1, vr2, 1 // right_ext vpickve2gr.w t2, vr2, 2 // top_ext vpickve2gr.w t3, vr2, 3 // bottom_ext mul.w t6, t2, a7 add.d t4, t0, t1 add.d t5, t2, t3 sub.d t4, a0, t4 // center_w sub.d t5, a1, t5 // center_h addi.d a1, zero, 16 addi.d a2, zero, 8 addi.d 
a3, zero, 4 add.d t6, t6, a6 // blk beqz t0, 2f // need_left beqz t1, 3f // need_left + need_right DEGE_LOOP 1, 1 b 5f 2: // !need_left beqz t1, 4f // !need_left + need_right DEGE_LOOP 0, 1 b 5f 3: // need_left + !need_right DEGE_LOOP 1, 0 b 5f 4: // !need_left + !need_right DEGE_LOOP 0, 0 5: vpickve2gr.w t2, vr2, 2 // top_ext vpickve2gr.w t3, vr2, 3 // bottom_ext sub.d t7, a7, a0 // dst_stride - bw mul.w t8, t2, a7 beqz t3, 2f // need_bottom sub.d t0, t6, a7 // &dst[-PXSTRIDE(dst_stride)] 1: addi.d t1, t0, 0 addi.d a5, a0, 0 PIXEL_COPY_LSX t6, t1, a5 add.d t6, t6, t7 addi.d t3, t3, -1 bnez t3, 1b 2: beqz t2, 3f // need_top add.d t8, t8, a6 // blk 1: addi.d t1, t8, 0 addi.d a5, a0, 0 PIXEL_COPY_LSX a6, t1, a5 add.d a6, a6, t7 addi.d t2, t2, -1 bnez t2, 1b 3: endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/mc.h000066400000000000000000000070251517466257200233630ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LOONGARCH_MC_H #define DAV2D_SRC_LOONGARCH_MC_H #include "config.h" #include "src/mc.h" #include "src/cpu.h" #define init_mc_fn(type, name, suffix) \ c->mc[type] = BF(dav2d_put_##name, suffix) #define init_mct_fn(type, name, suffix) \ c->mct[type] = BF(dav2d_prep_##name, suffix) decl_avg_fn(BF(dav2d_avg, lsx)); decl_w_avg_fn(BF(dav2d_w_avg, lsx)); decl_mask_fn(BF(dav2d_mask, lsx)); decl_warp8x8_fn(BF(dav2d_warp_affine_8x8, lsx)); decl_warp8x8t_fn(BF(dav2d_warp_affine_8x8t, lsx)); decl_w_mask_fn(BF(dav2d_w_mask_420, lsx)); decl_blend_fn(BF(dav2d_blend, lsx)); decl_blend_dir_fn(BF(dav2d_blend_v, lsx)); decl_blend_dir_fn(BF(dav2d_blend_h, lsx)); decl_emu_edge_fn(BF(dav2d_emu_edge, lsx)); decl_8tap_fns(lsx); decl_avg_fn(BF(dav2d_avg, lasx)); decl_w_avg_fn(BF(dav2d_w_avg, lasx)); decl_mask_fn(BF(dav2d_mask, lasx)); decl_warp8x8_fn(BF(dav2d_warp_affine_8x8, lasx)); decl_warp8x8t_fn(BF(dav2d_warp_affine_8x8t, lasx)); decl_w_mask_fn(BF(dav2d_w_mask_420, lasx)); decl_blend_dir_fn(BF(dav2d_blend_h, lasx)); decl_8tap_gen(mct, prep, lasx); static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav2dMCDSPContext *const c) { #if BITDEPTH == 8 const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; c->avg = BF(dav2d_avg, lsx); c->w_avg = BF(dav2d_w_avg, lsx); c->mask = BF(dav2d_mask, lsx); c->warp8x8 = BF(dav2d_warp_affine_8x8, lsx); c->warp8x8t = BF(dav2d_warp_affine_8x8t, lsx); c->w_mask[2] = 
BF(dav2d_w_mask_420, lsx); c->blend = BF(dav2d_blend, lsx); c->blend_v = BF(dav2d_blend_v, lsx); c->blend_h = BF(dav2d_blend_h, lsx); c->emu_edge = BF(dav2d_emu_edge, lsx); init_8tap_fns(lsx); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LASX)) return; c->avg = BF(dav2d_avg, lasx); c->w_avg = BF(dav2d_w_avg, lasx); c->mask = BF(dav2d_mask, lasx); c->warp8x8 = BF(dav2d_warp_affine_8x8, lasx); c->warp8x8t = BF(dav2d_warp_affine_8x8t, lasx); c->w_mask[2] = BF(dav2d_w_mask_420, lasx); c->blend_h = BF(dav2d_blend_h, lasx); init_8tap_gen(mct, lasx); #endif } #endif /* DAV2D_SRC_LOONGARCH_MC_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/msac.S000066400000000000000000000557171517466257200236750ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "loongson_asm.S" const min_prob .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 endconst const ph_0xff00 .rept 8 .short 0xff00 .endr endconst .macro decode_symbol_adapt w addi.d sp, sp, -48 vldrepl.h vr0, a0, 24 //rng fst.s f0, sp, 0 //val==0 vld vr1, a1, 0 //cdf vldrepl.d vr2, a0, 16 //dif ld.w t1, a0, 32 //allow_update_cdf la.local t2, min_prob addi.d t2, t2, 30 slli.w t3, a2, 1 sub.d t2, t2, t3 vld vr3, t2, 0 //min_prob vsrli.h vr4, vr0, 8 //r = s->rng >> 8 vslli.h vr4, vr4, 8 //r << 8 vsrli.h vr5, vr1, 6 vslli.h vr5, vr5, 7 vmuh.hu vr5, vr4, vr5 vadd.h vr5, vr5, vr3 //v addi.d t8, sp, 2 vst vr5, t8, 0 //store v vreplvei.h vr20, vr2, 3 //c vsle.hu vr6, vr5, vr20 vmskltz.h vr10, vr6 beqz t1, .renorm\()\w // update_cdf alsl.d t1, a2, a1, 1 ld.h t2, t1, 0 //count srli.w t3, t2, 4 //count >> 4 addi.w t3, t3, 4 li.w t5, 2 sltu t5, t5, a2 add.w t3, t3, t5 //rate sltui t5, t2, 32 add.w t2, t2, t5 //count + (count < 32) vreplgr2vr.h vr9, t3 vseq.h vr7, vr7, vr7 vavgr.hu vr5, vr6, vr7 //i >= val ? 
-1 : 32768 vsub.h vr5, vr5, vr1 vsub.h vr8, vr1, vr6 vsra.h vr5, vr5, vr9 vadd.h vr8, vr8, vr5 .if \w == 4 fst.d f8, a1, 0 .else vst vr8, a1, 0 .endif st.h t2, t1, 0 .renorm\()\w: vpickve2gr.h t3, vr10, 0 ctz.w a7, t3 // ret alsl.d t3, a7, t8, 1 ld.hu t4, t3, 0 // v ld.hu t5, t3, -2 // u sub.w t5, t5, t4 // rng slli.d t4, t4, 48 vpickve2gr.d t6, vr2, 0 sub.d t6, t6, t4 // dif clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 ld.w t0, a0, 28 //cnt sll.w t5, t5, t4 sub.w t7, t0, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu t0, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, a7 addi.d sp, sp, 48 .endm function msac_decode_symbol_adapt4_lsx decode_symbol_adapt 4 endfunc function msac_decode_symbol_adapt8_lsx decode_symbol_adapt 8 endfunc function msac_decode_bool_lsx ld.w t0, a0, 24 // rng srli.w a1, a1, 6 ld.d t1, a0, 16 // dif srli.w t2, t0, 8 // r >> 8 mul.w t2, t2, a1 ld.w a5, a0, 28 // cnt srli.w t2, t2, 1 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // ret xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, 
t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) // renorm clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc function msac_decode_bool_equi_lsx ld.w t0, a0, 24 // rng ld.d t1, a0, 16 // dif ld.w a5, a0, 28 // cnt srli.w t2, t0, 8 // r >> 8 slli.w t2, t2, 7 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // ret xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) // renorm clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w 
t2, t2, 3 // num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc function msac_decode_bool_adapt_lsx ld.hu a3, a1, 0 // cdf[0] /f ld.w t0, a0, 24 // rng ld.d t1, a0, 16 // dif srli.w t2, t0, 8 // r >> 8 srli.w a7, a3, 6 mul.w t2, t2, a7 ld.w a4, a0, 32 // allow_update_cdf ld.w a5, a0, 28 // cnt srli.w t2, t2, 1 addi.w t2, t2, 4 // v slli.d t3, t2, 48 // vw sltu t4, t1, t3 move t8, t4 // bit xori t4, t4, 1 maskeqz t6, t3, t4 // if (ret) vw sub.d t6, t1, t6 // dif slli.w t5, t2, 1 sub.w t5, t0, t5 // r - 2v maskeqz t7, t5, t4 // if (ret) r - 2v add.w t5, t2, t7 // v(rng) beqz a4, .renorm // update_cdf ld.hu t0, a1, 2 // cdf[1] srli.w t1, t0, 4 addi.w t1, t1, 4 // rate sltui t2, t0, 32 // count < 32 add.w t0, t0, t2 // count + (count < 32) sub.w a3, a3, t8 // cdf[0] -= bit slli.w t4, t8, 15 sub.w t7, a3, t4 // cdf[0] - bit - 32768 sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate sub.w t7, a3, t7 // cdf[0] st.h t7, a1, 0 st.h t0, a1, 2 .renorm: clz.w t4, t5 // d xori t4, t4, 16 // d sll.d t6, t6, t4 sll.w t5, t5, t4 sub.w t7, a5, t4 // cnt-d st.w t5, a0, 24 // store rng bgeu a5, t4, 9f // refill ld.d t0, a0, 0 // buf_pos ld.d t1, a0, 8 // buf_end addi.d t2, t0, 8 bltu t1, t2, 2f ld.d t3, t0, 0 // next_bits addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) nor t3, t3, t3 sub.w t2, zero, t1 revb.d t3, t3 // next_bits = bswap(next_bits) srli.w t2, t2, 3 // 
num_bytes_read srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) b 3f 1: addi.w t3, t7, -48 srl.d t3, t3, t3 // pad with ones b 4f 2: bgeu t0, t1, 1b ld.d t3, t1, -8 // next_bits sub.w t2, t2, t1 sub.w t1, t1, t0 // num_bytes_left slli.w t2, t2, 3 srl.d t3, t3, t2 addi.w t2, t7, -48 nor t3, t3, t3 sub.w t4, zero, t2 revb.d t3, t3 srli.w t4, t4, 3 srl.d t3, t3, t2 sltu t2, t1, t4 maskeqz t1, t1, t2 masknez t2, t4, t2 or t2, t2, t1 // num_bytes_read 3: slli.w t1, t2, 3 add.d t0, t0, t2 add.w t7, t7, t1 // cnt += num_bits_read st.d t0, a0, 0 4: or t6, t6, t3 // dif |= next_bits 9: st.w t7, a0, 28 // store cnt st.d t6, a0, 16 // store dif move a0, t8 endfunc .macro HI_TOK allow_update_cdf .\allow_update_cdf\()_hi_tok_lsx_start: .if \allow_update_cdf == 1 ld.hu a4, a1, 0x06 // cdf[3] .endif vor.v vr1, vr0, vr0 vsrli.h vr1, vr1, 0x06 // cdf[val] >> EC_PROB_SHIFT vstelm.h vr2, sp, 0, 0 // -0x1a vand.v vr2, vr2, vr4 // (8 x rng) & 0xff00 vslli.h vr1, vr1, 0x07 vmuh.hu vr1, vr1, vr2 vadd.h vr1, vr1, vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val); vst vr1, sp, 0x02 // -0x18 vssub.hu vr1, vr1, vr3 // v - c vseqi.h vr1, vr1, 0 .if \allow_update_cdf == 1 addi.d t4, a4, 0x50 srli.d t4, t4, 0x04 sltui t7, a4, 32 add.w a4, a4, t7 vreplgr2vr.h vr7, t4 vavgr.hu vr9, vr8, vr1 vsub.h vr9, vr9, vr0 vsub.h vr0, vr0, vr1 vsra.h vr9, vr9, vr7 vadd.h vr0, vr0, vr9 vstelm.d vr0, a1, 0, 0 st.h a4, a1, 0x06 .endif vmsknz.b vr7, vr1 movfr2gr.s t4, f7 ctz.w t4, t4 // loop_times * 2 addi.d t7, t4, 2 ldx.hu t6, sp, t4 // u ldx.hu t5, sp, t7 // v addi.w t3, t3, 0x05 addi.w t4, t4, -0x05 // if t4 == 3, continue sub.w t6, t6, t5 // u - v , rng for ctx_norm slli.d t5, t5, 0x30 // (ec_win)v << (EC_WIN_SIZE - 16) sub.d t1, t1, t5 // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)) // Init ctx_norm param clz.w t7, t6 xori t7, t7, 0x1f xori t7, t7, 0x0f // d = 15 ^ (31 ^ clz(rng)); sll.d t1, t1, t7 // dif << d sll.d t6, t6, t7 // rng << d // update vr2 8 x rng vreplgr2vr.h vr2, t6 
vreplvei.h vr2, vr2, 0 st.w t6, a0, 0x18 // store rng move t0, t2 sub.w t2, t2, t7 // cnt - d bgeu t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // if ((unsigned)cnt < (unsigned)d) goto ctx_norm_end // Step into ctx_fill ld.d t5, a0, 0x00 // buf_pos ld.d t6, a0, 0x08 // end_pos addi.d t7, t5, 0x08 // buf_pos + 8 sub.d t7, t7, t6 // (buf_pos + 8) - end_pos blt zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob // (end_pos - buf_pos) >= 8 ld.d t6, t5, 0x00 // load buf_pos[0]~buf_pos[7] addi.w t7, t2, -0x30 // cnt - 0x30 nor t6, t6, t6 // not buf data revb.d t6, t6 // Byte reversal srl.d t6, t6, t7 // Replace left shift with right shift sub.w t7, zero, t7 // neg srli.w t7, t7, 0x03 // Loop times or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob: bge t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one // end_pos - buf_pos < 8 && buf_pos < end_pos ld.d t0, t6, -0x08 slli.d t7, t7, 0x03 srl.d t6, t0, t7 // Retrieve the buf data and remove the excess data addi.w t7, t2, -0x30 // cnt - 0x30 nor t6, t6, t6 // not revb.d t6, t6 // Byte reversal srl.d t6, t6, t7 // Replace left shift with right shift sub.w t7, zero, t7 // neg or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c ld.d t6, a0, 0x08 // end_pos srli.w t7, t7, 0x03 // Loop times sub.d t6, t6, t5 // end_pos - buf_pos slt t0, t6, t7 maskeqz a3, t6, t0 // min(loop_times, end_pos - buf_pos) masknez t0, t7, t0 or t7, a3, t0 b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one: // buf_pos >= end_pos addi.w t7, t2, -0x10 andi t7, t7, 0xf nor t0, zero, zero srl.d t0, t0, t7 or t1, t1, t0 // dif |= ~(~(ec_win)0xff << c); b .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end: add.d t5, t5, t7 // buf_pos + Loop_times st.d t5, a0, 0x00 // Store buf_pos alsl.w t2, t7, t2, 0x03 // update cnt 
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end: srli.d t7, t1, 0x30 vreplgr2vr.h vr3, t7 // broadcast the high 16 bits of dif add.w t3, t4, t3 // update control parameter beqz t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times. blt zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3 .\allow_update_cdf\()_hi_tok_lsx_end: addi.d t3, t3, 0x1e st.d t1, a0, 0x10 // store dif st.w t2, a0, 0x1c // store cnt srli.w a0, t3, 0x01 // tok addi.d sp, sp, 0x1a .endm /** * @param unsigned dav2d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) * * Reg Alloction * * vr0: cdf; * * vr1: temp; * * vr2: rng; * * vr3: dif; * * vr4: const 0xff00ff00...ff00ff00; * * vr5: const 0x0004080c; * * vr6: const 0; * * t0: allow_update_cdf, tmp; * * t1: dif; * * t2: cnt; * * t3: 0xffffffe8, outermost control parameter; * * t4: loop time * * t5: v, buf_pos, temp; * * t6: u, rng, end_pos, buf, temp; * * t7: temp; */ function msac_decode_hi_tok_lsx fld.d f0, a1, 0 // Load cdf[0]~cdf[3] vldrepl.h vr2, a0, 0x18 // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid vldrepl.h vr3, a0, 0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16) ld.w t0, a0, 0x20 // allow_update_cdf la.local t7, ph_0xff00 vld vr4, t7, 0x00 // 0xff00ff00...ff00ff00 la.local t7, min_prob vld vr5, t7, 12 * 2 // 0x0004080c vxor.v vr6, vr6, vr6 // const 0 ld.d t1, a0, 0x10 // dif ld.w t2, a0, 0x1c // cnt orn t3, t3, t3 srli.d t3, t3, 32 addi.d t3, t3, -0x17 // 0xffffffe8 vseq.h vr8, vr8, vr8 addi.d sp, sp, -0x1a // alloc stack beqz t0, .hi_tok_lsx_no_update_cdf HI_TOK 1 jirl zero, ra, 0x0 .hi_tok_lsx_no_update_cdf: HI_TOK 0 endfunc dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/msac.h000066400000000000000000000047641517466257200237160ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_LOONGARCH_MSAC_H #define DAV2D_SRC_LOONGARCH_MSAC_H unsigned dav2d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav2d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav2d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf); unsigned dav2d_msac_decode_bool_lsx(MsacContext *s, unsigned f); unsigned dav2d_msac_decode_bool_equi_lsx(MsacContext *s); unsigned dav2d_msac_decode_hi_tok_lsx(MsacContext *s, uint16_t *cdf); #define dav2d_msac_decode_symbol_adapt4 dav2d_msac_decode_symbol_adapt4_lsx #define dav2d_msac_decode_symbol_adapt8 dav2d_msac_decode_symbol_adapt8_lsx #define dav2d_msac_decode_bool_adapt dav2d_msac_decode_bool_adapt_lsx #define dav2d_msac_decode_bool dav2d_msac_decode_bool_lsx #define dav2d_msac_decode_bool_equi dav2d_msac_decode_bool_equi_lsx #define dav2d_msac_decode_hi_tok dav2d_msac_decode_hi_tok_lsx #endif /* DAV2D_SRC_LOONGARCH_MSAC_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/loongarch/refmvs.S000066400000000000000000000660601517466257200242450ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Loongson Technology Corporation Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
/*
 * LSX version of:
 *
 * static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
 *                        const int bx4, const int bw4, int bh4)
 *
 * Register use on entry (argument order of the prototype above):
 *   a0 = rr   (array of row pointers, one 8-byte pointer consumed per row)
 *   a1 = rmv  (the 12-byte block to replicate)
 *   a2 = bx4  (converted below into a byte offset, bx4 * 12)
 *   a3 = bw4  (selects the store routine through clz.w(bw4) - 26, i.e.
 *              32 -> index 0 ... 1 -> index 5 of .SPLAT_LSX_JRTABLE)
 *   a4 = bh4  (row counter; every routine is a bottom-tested loop, so at
 *              least one row is always written)
 *
 * vr1/vr2/vr3 are built as three successive 16-byte slices of the 12-byte
 * pattern repeated back to back (48 bytes = 4 blocks), so wide stores can be
 * issued without re-shuffling. The setup instructions are interleaved between
 * scalar and vector work; their order is kept as is.
 */

function splat_mv_lsx
    vld           vr0,    a1,    0          // 0 1 ... 11 ...
    clz.w         t4,     a3
    vaddi.bu      vr1,    vr0,   0
    addi.w        t4,     t4,    -26        // jump table index
    vextrins.w    vr1,    vr0,   0x30       // 0 1 2 ... 11 0 1 2 3
    la.local      t5,     .SPLAT_LSX_JRTABLE
    vbsrl.v       vr2,    vr1,   4          // 4 5 6 7...11 0 1 2 3 0 0 0 0
    alsl.d        t6,     t4,    t5,    1   // &table[index] (2 bytes per entry)
    vextrins.w    vr2,    vr0,   0x31       // 4 5 6 7...11 0 1 2 3 4 5 6 7
    ld.h          t7,     t6,    0          // signed offset relative to the table
    vbsrl.v       vr3,    vr2,   4          // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d         t8,     t5,    t7         // absolute address of the routine
    alsl.d        a2,     a2,    a2,    1   // bx4 * 3
    vextrins.w    vr3,    vr0,   0x32       // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w        a2,     a2,    2          // bx4 * 12 (byte offset into each row)
    jirl          $r0,    t8,    0

// Entries are ordered from the widest block (index 0) to the narrowest.
.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX  - .SPLAT_LSX_JRTABLE

// 1 block per row: 8 + 4 = 12 bytes (f1/f3 are the low parts of vr1/vr3).
.SPLAT_W1_LSX:
    ld.d          t3,     a0,    0          // row pointer = *rr
    addi.d        a0,     a0,    8          // rr++
    addi.d        a4,     a4,    -1         // bh4--
    add.d         t3,     t3,    a2         // + bx4 * 12

    fst.d         f1,     t3,    0
    fst.s         f3,     t3,    8
    blt           zero,   a4,    .SPLAT_W1_LSX
    b             .splat_end

// 2 blocks per row: 16 + 8 = 24 bytes.
.SPLAT_W2_LSX:
    ld.d          t3,     a0,    0
    addi.d        a0,     a0,    8
    addi.d        a4,     a4,    -1
    add.d         t3,     t3,    a2

    vst           vr1,    t3,    0
    fst.d         f2,     t3,    16
    blt           zero,   a4,    .SPLAT_W2_LSX
    b             .splat_end

// 4 blocks per row: 3 * 16 = 48 bytes.
.SPLAT_W4_LSX:
    ld.d          t3,     a0,    0
    addi.d        a0,     a0,    8
    addi.d        a4,     a4,    -1
    add.d         t3,     t3,    a2

    vst           vr1,    t3,    0
    vst           vr2,    t3,    16
    vst           vr3,    t3,    32
    blt           zero,   a4,    .SPLAT_W4_LSX
    b             .splat_end

// 8 blocks per row: 6 * 16 = 96 bytes.
.SPLAT_W8_LSX:
    ld.d          t3,     a0,    0
    addi.d        a0,     a0,    8
    addi.d        a4,     a4,    -1
    add.d         t3,     t3,    a2

    vst           vr1,    t3,    0
    vst           vr2,    t3,    16
    vst           vr3,    t3,    32

    vst           vr1,    t3,    48
    vst           vr2,    t3,    64
    vst           vr3,    t3,    80
    blt           zero,   a4,    .SPLAT_W8_LSX
    b             .splat_end

// 16 blocks per row: 2 * 96 = 192 bytes.
.SPLAT_W16_LSX:
    ld.d          t3,     a0,    0
    addi.d        a0,     a0,    8
    addi.d        a4,     a4,    -1
    add.d         t3,     t3,    a2

.rept 2
    vst           vr1,    t3,    0
    vst           vr2,    t3,    16
    vst           vr3,    t3,    32

    vst           vr1,    t3,    48
    vst           vr2,    t3,    64
    vst           vr3,    t3,    80

    addi.d        t3,     t3,    96
.endr

    blt           zero,   a4,    .SPLAT_W16_LSX
    b             .splat_end

// 32 blocks per row: 4 * 96 = 384 bytes. Falls through to .splat_end.
.SPLAT_W32_LSX:
    ld.d          t3,     a0,    0
    addi.d        a0,     a0,    8
    addi.d        a4,     a4,    -1
    add.d         t3,     t3,    a2

.rept 4
    vst           vr1,    t3,    0
    vst           vr2,    t3,    16
    vst           vr3,    t3,    32

    vst           vr1,    t3,    48
    vst           vr2,    t3,    64
    vst           vr3,    t3,    80

    addi.d        t3,     t3,    96
.endr

    blt           zero,   a4,    .SPLAT_W32_LSX

.splat_end:
endfunc
/*
 * LOAD_SET_LOOP is_odd
 *
 * Fills a rectangle of 5-byte entries with a 32-bit marker value.
 * For every row it stores the word in t7 at byte offsets 0, 5, 10, ...
 * below col_w * 5; only the first 4 bytes of each 5-byte slot are written.
 *
 * Inputs (all set up by the expanding code, load_tmvs_lsx in this file):
 *   t3 = address of the first entry of the first row (advanced by t2 per row)
 *   t2 = row pitch in bytes
 *   t6 = col_w, number of entries per row
 *   t7 = 32-bit value to store
 *   t5 = current row index, a5 = end row (loop runs while t5 < a5)
 *
 * is_odd = 1: one entry is stored first, then entries are stored in pairs.
 * is_odd = 0: entries are stored in pairs only.
 * NOTE: both the pair loop and the row loop are bottom-tested, so they run
 * at least once whenever they are reached.
 *
 * temp reg: a6 a7
 */
.macro LOAD_SET_LOOP is_odd
    slli.d        a6,     t6,    2
    add.d         a6,     a6,    t6         // col_w * 5
0:
    addi.d        a7,     zero,  0          // x (byte offset inside the row)
.if \is_odd
    stx.w         t7,     t3,    a7         // leading single entry
    addi.d        a7,     a7,    5
    bge           a7,     a6,    2f         // row finished (col_w == 1)
.endif
1:
    stx.w         t7,     t3,    a7
    addi.d        a7,     a7,    5
    stx.w         t7,     t3,    a7
    addi.d        a7,     a7,    5
    blt           a7,     a6,    1b
2:
    add.d         t3,     t3,    t2         // next row
    addi.d        t5,     t5,    1
    blt           t5,     a5,    0b
.endm
vreplgr2vr.h vr21, t8 vxor.v vr18, vr18, vr18 // zero vsub.h vr20, vr18, vr21 vpickev.b vr12, vr7, vr6 vpickod.b vr13, vr7, vr6 vpickev.b vr14, vr9, vr8 vpickod.b vr15, vr9, vr8 vpickve2gr.d s6, vr17, 0 // rf->rp_ref 5: vld vr10, t5, 0 // ref2ref [1...7] vpickve2gr.b t8, vr0, 8 // ref2cur vbsrl.v vr0, vr0, 1 addi.w t4, t8, 32 beqz t4, 8f // INVALID_REF2CUR vreplgr2vr.h vr23, t8 vshuf.b vr6, vr14, vr12, vr10 vshuf.b vr7, vr15, vr13, vr10 vilvl.b vr8, vr7, vr6 vmulwev.w.h vr6, vr8, vr23 vmulwod.w.h vr7, vr8, vr23 vpickve2gr.b s0, vr0, 4 // ref slli.d t8, s0, 3 ldx.d s1, s6, t8 // rf->rp_ref[ref] addi.d s0, s0, -4 // ref_sign vreplgr2vr.h vr19, s0 add.d s1, s1, t6 // &rf->rp_ref[ref][row_start8 * stride] addi.d s2, a4, 0 // y vilvl.w vr8, vr7, vr6 vilvh.w vr9, vr7, vr6 6: // for (int y = row_start8; andi s3, s2, 0xff8 addi.d s4, s3, 8 blt a4, s3, 0f addi.d s3, a4, 0 // y_proj_start 0: blt s4, a5, 0f addi.d s4, a5, 0 // y_proj_end 0: addi.d s5, t0, 0 // x 7: // for (int x = col_start8i; slli.d a7, s5, 2 add.d a7, a7, s5 add.d a7, s1, a7 // rb vld vr3, a7, 0 // [rb] vpickve2gr.b t4, vr3, 4 // b_ref beqz t4, .end_x vreplve.b vr11, vr10, t4 vpickve2gr.b t7, vr11, 4 // ref2ref beqz t7, .end_x vsllwil.w.h vr4, vr3, 0 vreplgr2vr.w vr6, t4 vshuf.w vr6, vr9, vr8 // frac vmul.w vr5, vr6, vr4 vsrai.w vr4, vr5, 31 vadd.w vr4, vr4, vr5 vssrarni.h.w vr4, vr4, 14 vclip.h vr4, vr4, vr20, vr21 // offset vxor.v vr5, vr4, vr19 // offset.x ^ ref_sign vori.b vr5, vr5, 0x1 // offset.x ^ ref_sign vabsd.h vr4, vr4, vr18 vsrli.h vr4, vr4, 6 // abs(offset.x) >> 6 vsigncov.h vr4, vr5, vr4 // apply_sign vpickve2gr.h s0, vr4, 0 add.d s0, s2, s0 // pos_y blt s0, s3, .n_posy bge s0, s4, .n_posy andi s0, s0, 0xf mul.w s0, s0, t2 // pos vpickve2gr.h t7, vr4, 1 add.d t7, t7, s5 // pos_x add.d s0, t3, s0 // rp_proj + pos .loop_posx: andi t4, s5, 0xff8 // x_sb_align blt t7, a2, .n_posx addi.d t8, t4, -8 blt t7, t8, .n_posx bge t7, a3, .n_posx addi.d t4, t4, 16 bge t7, t4, .n_posx slli.d t4, t7, 2 add.d 
t4, t4, t7 // pos_x * 5 add.d t4, s0, t4 // rp_proj[pos + pos_x] vstelm.w vr3, t4, 0, 0 vstelm.b vr11, t4, 4, 4 .n_posx: addi.d s5, s5, 1 // x + 1 bge s5, t1, .ret_posx addi.d a7, a7, 5 // rb + 1 vld vr4, a7, 0 // [rb] vseq.b vr5, vr4, vr3 vpickve2gr.d t8, vr5, 0 cto.d t8, t8 blt t8, s7, 7b addi.d t7, t7, 1 // pos_x + 1 /* Core computing loop expansion(sencond) */ andi t4, s5, 0xff8 // x_sb_align blt t7, a2, .n_posx addi.d t8, t4, -8 blt t7, t8, .n_posx bge t7, a3, .n_posx addi.d t4, t4, 16 bge t7, t4, .n_posx slli.d t4, t7, 2 add.d t4, t4, t7 // pos_x * 5 add.d t4, s0, t4 // rp_proj[pos + pos_x] vstelm.w vr3, t4, 0, 0 vstelm.b vr11, t4, 4, 4 addi.d s5, s5, 1 // x + 1 bge s5, t1, .ret_posx addi.d a7, a7, 5 // rb + 1 vld vr4, a7, 0 // [rb] vseq.b vr5, vr4, vr3 vpickve2gr.d t8, vr5, 0 cto.d t8, t8 blt t8, s7, 7b addi.d t7, t7, 1 // pos_x + 1 /* Core computing loop expansion(third) */ andi t4, s5, 0xff8 // x_sb_align blt t7, a2, .n_posx addi.d t8, t4, -8 blt t7, t8, .n_posx bge t7, a3, .n_posx addi.d t4, t4, 16 bge t7, t4, .n_posx slli.d t4, t7, 2 add.d t4, t4, t7 // pos_x * 5 add.d t4, s0, t4 // rp_proj[pos + pos_x] vstelm.w vr3, t4, 0, 0 vstelm.b vr11, t4, 4, 4 addi.d s5, s5, 1 // x + 1 bge s5, t1, .ret_posx addi.d a7, a7, 5 // rb + 1 vld vr4, a7, 0 // [rb] vseq.b vr5, vr4, vr3 vpickve2gr.d t8, vr5, 0 cto.d t8, t8 blt t8, s7, 7b addi.d t7, t7, 1 // pos_x + 1 b .loop_posx .n_posy: addi.d s5, s5, 1 // x + 1 bge s5, t1, .ret_posx addi.d a7, a7, 5 // rb + 1 vld vr4, a7, 0 // [rb] vseq.b vr5, vr4, vr3 vpickve2gr.d t8, vr5, 0 cto.d t8, t8 blt t8, s7, 7b addi.d s5, s5, 1 // x + 1 bge s5, t1, .ret_posx addi.d a7, a7, 5 // rb + 1 vld vr4, a7, 0 // [rb] vseq.b vr5, vr4, vr3 vpickve2gr.d t8, vr5, 0 cto.d t8, t8 blt t8, s7, 7b b .n_posy .end_x: addi.d s5, s5, 1 // x + 1 blt s5, t1, 7b .ret_posx: add.d s1, s1, t2 // r + stride addi.d s2, s2, 1 // y + 1 blt s2, a5, 6b 8: addi.d a6, a6, 1 // n + 1 addi.d t5, t5, 7 // mfmv_ref2ref(offset) + 7 blt a6, s8, 5b .end_load: ld.d s0, sp, 
// void dav2d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
//
// Register use on entry (argument order of the prototype above):
//   a0 = rp, a1 = stride, a2 = rr, a3 = ref_sign,
//   a4 = col_end8, a5 = row_end8, a6 = col_start8, a7 = row_start8
//
// Structure:
//   1: per-row setup (row pointer from rr, candidate range, output pointer)
//   2: fetch up to two candidate blocks per iteration
//   3: evaluate both candidates with vector code and build the 5-byte
//      output pattern (replicated in vr0/vr1/vr2 and vr4/vr5/vr6)
//   10/20/40/80/160: store routines writing 1/2/4/8/16 output entries;
//      they are entered with "jirl ra, ..." and return with "jirl zero, ra",
//      which is why the caller's ra is parked in t0 for the whole function.
//
// .save_tevs_tbl holds one 4-byte record per block-size value (cand_b->bs):
//   hword 0 = bytes to advance cand_b divided by 2 (bt2 * 12)
//   hword 1 = distance from the table back to the matching store routine
function save_tmvs_lsx
    addi.d        sp,     sp,    -0x28
    st.d          s0,     sp,    0x00
    st.d          s1,     sp,    0x08
    st.d          s2,     sp,    0x10
    st.d          s3,     sp,    0x18
    st.d          s4,     sp,    0x20
    move          t0,     ra                 // ra is clobbered by the internal calls

    vxor.v        vr10,   vr10,  vr10        // all-zero constant
    vld           vr11,   a3,    0           // Load ref_sign[0] ~ Load ref_sign[7]
    la.local      t2,     .save_tevs_tbl
    la.local      s1,     mask_mult
    la.local      t7,     mv_tbls
    vld           vr9,    s1,    0           // Load mask_mult
    vslli.d       vr11,   vr11,  8           // 0, ref_sign[0], ... ,ref_sign[6]
    la.local      s3,     mask_mv0
    vld           vr8,    s3,    0           // Load mask_mv0
    la.local      s4,     mask_mv1
    vld           vr7,    s4,    0           // Load mask_mv1
    li.d          s0,     5
    li.d          t8,     12 * 2
    mul.d         a1,     a1,    s0          // stride *= 5
    sub.d         a5,     a5,    a7          // h = row_end8 - row_start8
    slli.d        a7,     a7,    1           // row_start8 <<= 1
1:
    li.d          s0,     5
    andi          t3,     a7,    30          // (y & 15) * 2
    slli.d        s4,     t3,    3           // * sizeof(pointer)
    ldx.d         t3,     a2,    s4          // b = rr[(y & 15) * 2]
    addi.d        t3,     t3,    12          // &b[... + 1]
    mul.d         s4,     a4,    t8
    add.d         t4,     s4,    t3          // end_cand_b = &b[col_end8*2 + 1]
    mul.d         s3,     a6,    t8
    add.d         t3,     s3,    t3          // cand_b = &b[x*2 + 1]
    mul.d         s4,     a6,    s0
    add.d         a3,     s4,    a0          // &rp[x]
2:
    /* First cand_b */
    ld.b          t5,     t3,    10          // cand_b->bs
    vld           vr0,    t3,    0           // cand_b->mv and ref
    alsl.d        t5,     t5,    t2,    2    // bt2 index
    ld.h          s3,     t3,    8           // cand_b->ref
    ld.h          t6,     t5,    0           // bt2
    move          s0,     t2                 // valid table address in case there is no second candidate
    alsl.d        t3,     t6,    t3,    1    // Next cand_b += bt2 * 2
    vor.v         vr2,    vr0,   vr0
    vinsgr2vr.h   vr1,    s3,    0
    move          t1 ,    t3                 // remember where the second candidate would start
    bge           t3,     t4,    3f

    /* Next cand_b */
    ld.b          s0,     t3,    10          // cand_b->bs
    vld           vr4,    t3,    0           // cand_b->mv and ref
    alsl.d        s0,     s0,    t2,    2    // bt2 index
    ld.h          s4,     t3,    8           // cand_b->ref
    ld.h          t6,     s0,    0           // bt2
    alsl.d        t3,     t6,    t3,    1    // Next cand_b += bt2*2
    vpackev.d     vr2,    vr4,   vr0         // a0.mv[0] a0.mv[1] a1.mv[0], a1.mv[1]
    vinsgr2vr.h   vr1,    s4,    1           // a0.ref[0] a0.ref[1], a1.ref[0], a1.ref[1]
3:
    vabsd.h       vr2,    vr2,   vr10        // abs(mv[].xy)
    vsle.b        vr16,   vr10,  vr1
    vand.v        vr1,    vr16,  vr1
    vshuf.b       vr1,    vr11,  vr11,  vr1  // ref_sign[ref]
    vsrli.h       vr2,    vr2,   12          // abs(mv[].xy) >> 12
    vilvl.b       vr1,    vr1,   vr1
    vmulwev.h.bu  vr1,    vr1,   vr9         // ref_sign[ref] * {1, 2}
    vseqi.w       vr2,    vr2,   0           // abs(mv[].xy) <= 4096
    vpickev.h     vr2,    vr2,   vr2         // abs() condition to 16 bit
    vand.v        vr1,    vr2,   vr1         // h[0-3] contains conditions for mv[0-1]
    vhaddw.wu.hu  vr1,    vr1,   vr1         // Combine condition for [1] and [0]
    vpickve2gr.wu s1,     vr1,   0           // Extract case for first block
    vpickve2gr.wu s2,     vr1,   1           // Extract case for second block
    ld.hu         t5,     t5,    2           // Fetch jump table entry
    ld.hu         s0,     s0,    2
    alsl.d        s3,     s1,    t7,    4    // Load permutation table base on case
    vld           vr1,    s3,    0
    alsl.d        s4,     s2,    t7,    4
    vld           vr5,    s4,    0
    sub.d         t5,     t2,    t5          // Find jump table target
    sub.d         s0,     t2,    s0

    vshuf.b       vr0,    vr0,   vr0,   vr1  // Permute cand_b to output refmvs_temporal_block
    vshuf.b       vr4,    vr4,   vr4,   vr5
    vsle.b        vr16,   vr10,  vr1
    vand.v        vr0,    vr16,  vr0

    vsle.b        vr17,   vr10,  vr5
    vand.v        vr4,    vr17,  vr4
    // v1 follows on v0, with another 3 full repetitions of the pattern.
    vshuf.b       vr1,    vr0,   vr0,   vr8  // 1, 2, 3, ... , 15, 16
    vshuf.b       vr5,    vr4,   vr4,   vr8  // 1, 2, 3, ... , 15, 16
    // v2 ends with 3 complete repetitions of the pattern.
    vshuf.b       vr2,    vr1,   vr0,   vr7
    vshuf.b       vr6,    vr5,   vr4,   vr7  // 4, 5, 6, 7, ... , 12, 13, 14, 15, 16, 17, 18, 19

    jirl          ra,     t5,    0           // store the first candidate
    bge           t1 ,    t4,    4f          // if (cand_b >= end)
    vor.v         vr0,    vr4,   vr4         // move the second candidate's pattern into place
    vor.v         vr1,    vr5,   vr5
    vor.v         vr2,    vr6,   vr6
    jirl          ra,     s0,    0           // store the second candidate
    blt           t3,     t4,    2b          // if (cand_b < end)

4:
    addi.d        a5,     a5,    -1          // h--
    addi.d        a7,     a7,    2           // y += 2
    add.d         a0,     a0,    a1          // rp += stride
    blt           zero,   a5,    1b

    ld.d          s0,     sp,    0x00
    ld.d          s1,     sp,    0x08
    ld.d          s2,     sp,    0x10
    ld.d          s3,     sp,    0x18
    ld.d          s4,     sp,    0x20
    addi.d        sp,     sp,    0x28

    move          ra,     t0                 // restore the caller's return address
    jirl          zero,   ra,    0x00

// Store routines. a3 is the output pointer and is advanced by
// (number of entries) * 5 bytes before returning.
10:
    addi.d        s1,     a3,    4
    vstelm.w      vr0,    a3,    0,     0    // .mv
    vstelm.b      vr0,    s1,    0,     4    // .ref
    addi.d        a3,     a3,    5
    jirl          zero,   ra,    0x00
20:
    addi.d        s1,     a3,    8
    vstelm.d      vr0,    a3,    0,     0    // .mv
    vstelm.h      vr0,    s1,    0,     4    // .ref
    addi.d        a3,     a3,    2 * 5
    jirl          zero,   ra,    0x00
40:
    vst           vr0,    a3,    0
    vstelm.w      vr1,    a3,    0x10,  0
    addi.d        a3,     a3,    4 * 5
    jirl          zero,   ra,    0x00

80:
    vst           vr0,    a3,    0
    vst           vr1,    a3,    0x10        // This writes 6 full entries plus 2 extra bytes
    vst           vr2,    a3,    5 * 8 - 16  // Write the last few, overlapping with the first write.
    addi.d        a3,     a3,    8 * 5
    jirl          zero,   ra,    0x00
160:
    addi.d        s1,     a3,    6 * 5
    addi.d        s2,     a3,    12 * 5
    vst           vr0,    a3,    0
    vst           vr1,    a3,    0x10        // This writes 6 full entries plus 2 extra bytes
    vst           vr0,    a3,    6 * 5
    vst           vr1,    a3,    6 * 5 + 16  // Write another 6 full entries, slightly overlapping with the first set
    vstelm.d      vr0,    s2,    0,     0    // Write 8 bytes (one full entry) after the first 12
    vst           vr2,    a3,    5 * 16 - 16 // Write the last 3 entries
    addi.d        a3,     a3,    16 * 5
    jirl          zero,   ra,    0x00

// Indexed by cand_b->bs (4 bytes per record, see the header comment).
.save_tevs_tbl:
    .hword 16 * 12   // bt2 * 12, 12 is sizeof(refmvs_block)
    .hword .save_tevs_tbl - 160b
    .hword 16 * 12
    .hword .save_tevs_tbl - 160b
    .hword 8 * 12
    .hword .save_tevs_tbl -  80b
    .hword 8 * 12
    .hword .save_tevs_tbl -  80b
    .hword 8 * 12
    .hword .save_tevs_tbl -  80b
    .hword 8 * 12
    .hword .save_tevs_tbl -  80b
    .hword 4 * 12
    .hword .save_tevs_tbl -  40b
    .hword 4 * 12
    .hword .save_tevs_tbl -  40b
    .hword 4 * 12
    .hword .save_tevs_tbl -  40b
    .hword 4 * 12
    .hword .save_tevs_tbl -  40b
    .hword 2 * 12
    .hword .save_tevs_tbl -  20b
    .hword 2 * 12
    .hword .save_tevs_tbl -  20b
    .hword 2 * 12
    .hword .save_tevs_tbl -  20b
    .hword 2 * 12
    .hword .save_tevs_tbl -  20b
    .hword 2 * 12
    .hword .save_tevs_tbl -  20b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
    .hword 1 * 12
    .hword .save_tevs_tbl -  10b
endfunc
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LOONGARCH_REFMVS_H #define DAV2D_SRC_LOONGARCH_REFMVS_H #include "src/cpu.h" #include "src/refmvs.h" decl_splat_mv_fn(dav2d_splat_mv_lsx); decl_load_tmvs_fn(dav2d_load_tmvs_lsx); decl_save_tmvs_fn(dav2d_save_tmvs_lsx); static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav2dRefmvsDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_LOONGARCH_CPU_FLAG_LSX)) return; c->splat_mv = dav2d_splat_mv_lsx; c->load_tmvs = dav2d_load_tmvs_lsx; c->save_tmvs = dav2d_save_tmvs_lsx; } #endif /* DAV2D_SRC_LOONGARCH_REFMVS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/looprestoration.h000066400000000000000000000105341517466257200242520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
/*
 * Bit flags telling a loop-restoration filter which neighbouring pixels
 * exist around the area being filtered. Flags are combined with bitwise OR.
 */
enum LrEdgeFlags {
    LR_HAVE_LEFT              = 0x01,
    LR_HAVE_RIGHT             = 0x02,
    LR_HAVE_TOP               = 0x04,
    LR_HAVE_BOTTOM            = 0x08,
    /*
     * The top (or bottom) edge lies on a tile row boundary and holds
     * post-cdef/ccso data rather than pre-cdef/ccso data. Such an edge
     * carries 4 lines of pixel data instead of 2.
     */
    LR_HAVE_TOP_INTEGRATED    = 0x10,
    LR_HAVE_BOTTOM_INTEGRATED = 0x20,
};
ns_wiener_multi; wienerfilter_fn pc_wiener; gdf_prep_fn gdf_prep; gdf_add_fn gdf_add; } Dav2dLoopRestorationDSPContext; bitfn_decls(void dav2d_loop_restoration_dsp_init, Dav2dLoopRestorationDSPContext *c, int bpc); #endif /* DAV2D_SRC_LOOPRESTORATION_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/looprestoration_tmpl.c000066400000000000000000001034141517466257200253010ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include "common/attributes.h" #include "common/bitdepth.h" #include "common/intops.h" #include "src/looprestoration.h" #include "src/tables.h" #include "src/gdf_tables.h" // 64 + 6 + 6 #define REST_UNIT_STRIDE (76) // 64 / 4 #define CLASS_BUF_SIZE (16) // 64 / 2 + 1 #define GRADIENT_BUF_STRIDE (33) // C // E A F // 6 2 7 // 8 4 0 5 9 // D B 3 1 1 3 B D // 9 5 0 4 8 // 7 2 6 // F A E // C static const int8_t wiener_ns_config_y[16][2] = { { 1, 0 }, { 0, 1 }, { 2, 0 }, { 0, 2 }, { 1, 1 }, { -1, 1 }, { 2, 1 }, { 2, -1 }, { 1, 2 }, { 1, -2 }, { 3, 0 }, { 0, 3 }, { 4, 0 }, { 0, 4 }, { 3, 3 }, { 3, -3 }, }; // 4 // 2 0 3 // 5 1 1 5 // 3 0 2 // 4 static const int8_t wiener_ns_config_uv[6][2] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 }, { 2, 0 }, { 0, 2 }, }; // 9 // 5 1 6 // B 3 2 A // 7 0 4 // 8 static const int8_t wiener_ns_config_uv_from_y[12][2] = { { 1, 0 }, { -1, 0 }, { 0, 1 }, { 0, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 }, { 1, -1 }, { 2, 0 }, { -2, 0 }, { 0, 2 }, { 0, -2 }, }; // A // 6 2 7 // 8 4 0 5 9 // B 3 1 C 1 3 B // 9 5 0 4 8 // 7 2 6 // A static const int8_t pc_wiener_config[12][2] = { { 1, 0 }, { 0, 1 }, { 2, 0 }, { 0, 2 }, { 1, 1 }, { -1, 1 }, { 2, 1 }, { 2, -1 }, { 1, 2 }, { 1, -2 }, { 3, 0 }, { 0, 3 }, }; // 0 // 1 // 2 // 3 // 4 5 6 // 7 8 9 A B // C D E F G G F E D C // B A 9 8 7 // 6 5 4 // 3 // 2 // 1 // 0 static const int8_t gdf_coords[18][2] = { { 6, 0 }, { 5, 0 }, { 4, 0 }, { 3, 0 }, { 2, 1 }, { 2, 0 }, { 2, -1 }, { 1, 2 }, { 1, 1 }, { 1, 0 }, { 1, -1 }, { 1, -2 }, { 0, 6 }, { 0, 5 }, { 0, 4 }, { 0, 3 }, { 0, 2 }, { 0, 1 } }; static const uint16_t pc_wiener_normalizer[ 4 ] = { 3739, 3273, 3074, 7 }; static const int16_t mode_weights[ 4 ][ 3 ] = { { -527, 15325, 321 }, { 26436, -17705, 17905 }, { 366, -147, -194 }, { 202, -267, -179 } }; static const int16_t mode_offsets[ 4 ] = { -547, -21565, -573, -680 }; static void backup_row(pixel *dst, const pixel *src, const pixel *left, const int w, const int 
edge_len, const enum LrEdgeFlags edges) { if (edges & LR_HAVE_LEFT) for (int x = -edge_len; x < 0; x++) dst[x] = left[x + 6]; else for (int x = -edge_len; x < 0; x++) dst[x] = src[0]; for (int x = 0; x < w; x++) dst[x] = src[x]; if (edges & LR_HAVE_RIGHT) for (int x = w; x < w + edge_len; x++) dst[x] = src[x]; else for (int x = w; x < w + edge_len; x++) dst[x] = src[w - 1]; } static void backup_row_lpf(pixel *dst, const pixel *src, const int w, const int edge_len, const enum LrEdgeFlags edges) { if (edges & LR_HAVE_LEFT) { for (int x = -edge_len; x < 0; x++) dst[x] = src[x]; } else { for (int x = -edge_len; x < 0; x++) dst[x] = src[0]; } for (int x = 0; x < w; x++) dst[x] = src[x]; if (edges & LR_HAVE_RIGHT) { for (int x = w; x < w + edge_len; x++) dst[x] = src[x]; } else { for (int x = w; x < w + edge_len; x++) dst[x] = src[w - 1]; } } static void backup_row_luma(pixel *dst, const pixel *src, const ptrdiff_t src_stride, const int w, const enum LrEdgeFlags edges, const int ss_hor, const int ss_ver, const int cfl_ds_flt) { if (!ss_ver) { backup_row_lpf(dst, src, w, 4, edges); return; } const pixel *src2 = &src[PXSTRIDE(src_stride)]; switch (cfl_ds_flt) { case 0: for (int x = 0; x < w; x += 1 + ss_hor) dst[x] = (src[x] + src2[x] + src[x + 1] + src2[x + 1]) >> 2; break; case 1: for (int x = 0; x < w; x++) dst[x] = (src[x] + src2[x]) >> 1; break; default: assert(0); // fall-through in non-debug mode case 2: for (int x = 0; x < w; x++) dst[x] = src[x]; break; } if (edges & LR_HAVE_LEFT) { switch (cfl_ds_flt) { case 0: for (int x = -4; x < 0; x++) dst[x] = (src[x] + src2[x] + src[x + 1] + src2[x + 1]) >> 2; break; case 1: for (int x = -4; x < 0; x++) dst[x] = (src[x] + src2[x]) >> 1; break; default: assert(0); // fall-through in non-debug mode case 2: for (int x = -4; x < 0; x++) dst[x] = src[x]; break; } } else { for (int x = -4; x < 0; x++) dst[x] = dst[0]; } if (edges & LR_HAVE_RIGHT) { switch (cfl_ds_flt) { case 0: for (int x = w; x < w + 4; x++) dst[x] = (src[x] + 
/*
 * Non-separable Wiener filter for a luma area, single filter set.
 *
 * The filter is applied in place on `p` (w x h pixels, row pitch `stride`).
 * Each output pixel is
 *     m + sum_i filter[i] * (a_i + b_i - 2 * m)      (7 fractional bits)
 * where m is the unfiltered centre pixel and a_i / b_i are the two pixels at
 * +/- wiener_ns_config_y[i] (16 point-symmetric tap pairs, reaching up to
 * 4 rows / 4 columns away).
 *
 * Because the filter runs in place, unfiltered rows are kept in a ring of
 * 9 padded scratch rows. ptrs[0..8] always refer to rows y-4 .. y+4 of the
 * row currently being filtered (ptrs[4] is the centre row); several entries
 * may alias the same scratch row where rows are replicated.
 *
 *   left       - saved pixels left of the area, one 6-pixel row per line
 *   lpf        - rows above the area (4 rows with LR_HAVE_TOP_INTEGRATED,
 *                2 rows with LR_HAVE_TOP)
 *   lpf_bottom - rows below the area, used with LR_HAVE_BOTTOM
 *   ll_mask    - per 4-row group bit mask; a set bit `bx` skips the 4-pixel
 *                wide column group bx
 */
static void ns_wiener_single_y_c(pixel *p, const ptrdiff_t stride,
                                 const pixel (*left)[6],
                                 const pixel *lpf, const pixel *lpf_bottom,
                                 const int w, int h,
                                 const WienerParams *params,
                                 const enum LrEdgeFlags edges,
                                 const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX)
{
    const int8_t *filter = params->single.filter;
    pixel row_buffers[9][REST_UNIT_STRIDE];
    pixel *bak_rows[9];
    const pixel *ptrs[9];

    // Leave 6 pixels of headroom in front of every scratch row so that
    // negative x indices (left padding) stay inside the buffer.
    for (int i = 0; i < 9; i++)
        bak_rows[i] = row_buffers[i] + 6;

    // Row 0 of the area.
    backup_row(bak_rows[4], p, left[0], w, 4, edges);
    ptrs[4] = bak_rows[4];

    if (edges & LR_HAVE_TOP_INTEGRATED) {
        // Four real rows are available above the area.
        for (int i = 0; i < 4; i++) {
            backup_row_lpf(bak_rows[i], lpf, w, 4, edges);
            lpf += PXSTRIDE(stride);
            ptrs[i] = bak_rows[i];
        }
    } else if (edges & LR_HAVE_TOP) {
        // y = -2,-1
        backup_row_lpf(bak_rows[2], lpf, w, 4, edges);
        ptrs[2] = bak_rows[2];
        backup_row_lpf(bak_rows[3], lpf + PXSTRIDE(stride), w, 4, edges);
        ptrs[3] = bak_rows[3];
        // y = -3,-4
        ptrs[0] = ptrs[1] = ptrs[2];
    } else {
        // Nothing above: replicate row 0 upwards.
        ptrs[0] = ptrs[1] = ptrs[2] = ptrs[3] = ptrs[4];
    }

    // Rows 1..3 of the area.
    backup_row(bak_rows[5], p + PXSTRIDE(stride), left[1], w, 4, edges);
    ptrs[5] = bak_rows[5];
    backup_row(bak_rows[6], p + 2*PXSTRIDE(stride), left[2], w, 4, edges);
    ptrs[6] = bak_rows[6];
    backup_row(bak_rows[7], p + 3*PXSTRIDE(stride), left[3], w, 4, edges);
    ptrs[7] = bak_rows[7];

    // Next scratch row to be overwritten; cycles through 8, 0, 1, ..., 8.
    int bak_idx = 8;
    for (int y = 0; y < h; y++) {
        // Fetch row y + 4 before row y is overwritten with filtered output.
        if (y + 4 < h) {
            // Still inside the area.
            backup_row(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), left[y + 4], w, 4, edges);
            ptrs[8] = bak_rows[bak_idx];
        } else if (edges & LR_HAVE_BOTTOM_INTEGRATED) {
            // Rows below the area are read directly from the picture.
            backup_row_lpf(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), w, 4, edges);
            ptrs[8] = bak_rows[bak_idx];
        } else if (y + 2 < h && edges & LR_HAVE_BOTTOM) {
            // Only the first two rows below the area come from lpf_bottom.
            int offset_y = y + 4 - h;
            assert(offset_y < 2);
            backup_row_lpf(bak_rows[bak_idx], lpf_bottom + offset_y * PXSTRIDE(stride), w, 4, edges);
            ptrs[8] = bak_rows[bak_idx];
        } else {
            // No more data: replicate the last available row.
            ptrs[8] = ptrs[7];
        }
        // Advanced even when no row was written, so the ring position
        // stays in step with y.
        if (++bak_idx == 9) bak_idx = 0;

        for (int bx = 0; bx < (w >> 2); bx++) {
            if (ll_mask[y >> 2][0] & (1 << bx)) continue;
            for (int x = bx * 4; x < bx * 4 + 4; x++) {
                const int m = ptrs[4][x];
                int s = m << 7;
                for (int i = 0; i < 16; i++) {
                    const int dy = wiener_ns_config_y[i][0];
                    const int dx = wiener_ns_config_y[i][1];
                    const int diff = ptrs[4 + dy][x + dx] + ptrs[4 - dy][x - dx] - 2 * m;
                    s += diff * filter[i];
                }
                const int v = (s + 64) >> 7;
                p[x] = iclip_pixel(v);
            }
        }

        // Slide the 9-row window down by one row.
        for (int r = 0; r < 8; r++)
            ptrs[r] = ptrs[r+1];
        p += PXSTRIDE(stride);
    }
}
backup_row_luma(bak_rows[1][2], luma, lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][2] = bak_rows[1][2]; if (edges & LR_HAVE_TOP_INTEGRATED) { backup_row_luma(bak_rows[1][0], params->single.luma - 4 * PXSTRIDE(lstride), lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][0] = bak_rows[1][0]; backup_row_luma(bak_rows[1][1], params->single.luma - 2 * PXSTRIDE(lstride), lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][1] = bak_rows[1][1]; } else if (edges & LR_HAVE_TOP) { backup_row_luma(bak_rows[1][0], params->single.luma_top, 0, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][0] = bak_rows[1][0]; backup_row_luma(bak_rows[1][1], params->single.luma_top, lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][1] = bak_rows[1][1]; } else { ptrs[1][0] = ptrs[1][1] = ptrs[1][2]; } backup_row_luma(bak_rows[1][3], luma + (1 << ss_ver) * PXSTRIDE(lstride), lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][3] = bak_rows[1][3]; int lbak_idx = 4; for (int y = 0; y < h; y++) { if (y + 2 < h) { backup_row(bak_rows[0][bak_idx], p + 2*PXSTRIDE(stride), left[y + 2], w, 2, edges); ptrs[0][4] = bak_rows[0][bak_idx]; } else if (edges & LR_HAVE_BOTTOM_INTEGRATED) { backup_row_lpf(bak_rows[0][bak_idx], p + 2*PXSTRIDE(stride), w, 2, edges); ptrs[0][4] = bak_rows[0][bak_idx]; } else if (edges & LR_HAVE_BOTTOM) { int offset_y = y + 2 - h; assert(offset_y < 2); backup_row_lpf(bak_rows[0][bak_idx], lpf_bottom + offset_y * PXSTRIDE(stride), w, 2, edges); ptrs[0][4] = bak_rows[0][bak_idx]; } else { ptrs[0][4] = ptrs[0][3]; } if (++bak_idx == 5) bak_idx = 0; if (ptrs[0][4] == ptrs[0][3]) { ptrs[1][4] = ptrs[1][3]; } else if (y + 2 == h && !(edges & LR_HAVE_BOTTOM_INTEGRATED)) { backup_row_luma(bak_rows[1][lbak_idx], params->single.luma_bottom, lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][4] = 
bak_rows[1][lbak_idx]; } else if (y + 1 == h && !(edges & LR_HAVE_BOTTOM_INTEGRATED)) { backup_row_luma(bak_rows[1][lbak_idx], params->single.luma_bottom + PXSTRIDE(lstride), 0, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][4] = bak_rows[1][lbak_idx]; } else { backup_row_luma(bak_rows[1][lbak_idx], luma + (2 << ss_ver) * PXSTRIDE(lstride), lstride, w << ss_hor, edges, ss_hor, ss_ver, params->single.ds_flt); ptrs[1][4] = bak_rows[1][lbak_idx]; } if (++lbak_idx == 5) lbak_idx = 0; for (int bx = 0; bx < (w >> 2); bx++) { if (ll_mask[y >> 2][0] & (1 << bx)) continue; for (int x = bx * 4; x < bx * 4 + 4; x++) { const int m = ptrs[0][2][x]; int s = m << 7; for (int i = 0; i < 6; i++) { const int dy = wiener_ns_config_uv[i][0]; const int dx = wiener_ns_config_uv[i][1]; const int diff = ptrs[0][2 + dy][x + dx] + ptrs[0][2 - dy][x - dx] - 2 * m; s += diff * filter[i]; } const int l = ptrs[1][2][x << ss_hor]; for (int i = 0; i < 12; i++) { const int dy = wiener_ns_config_uv_from_y[i][0]; const int dx = wiener_ns_config_uv_from_y[i][1]; const int diff = ptrs[1][2 + dy][(x + dx) * (1 << ss_hor)] - l; s += diff * filter[6 + i]; } const int v = (s + 64) >> 7; p[x] = iclip_pixel(v); } } for (int r = 0; r < 4; r++) ptrs[0][r] = ptrs[0][r + 1]; for (int r = 0; r < 4; r++) ptrs[1][r] = ptrs[1][r + 1]; p += PXSTRIDE(stride); luma += PXSTRIDE(lstride) << ss_ver; } } static int get_qval_given_tskip(int qstep, int tskip, int i, int bitdepth_min_8) { qstep = (qstep + (1 << bitdepth_min_8 >> 1)) >> bitdepth_min_8; int prod = (tskip * qstep + 128) >> 8; int qval = mode_weights[i][0] * (tskip << 5) + mode_weights[i][1] * qstep + mode_weights[i][2] * prod; int abs_qval = abs(qval); qval = apply_sign((abs_qval + (1 << 12)) >> 13, qval); qval = 255 * (mode_offsets[i] + qval); return qval; } static int get_class_lut_idx(const pixel *ptrs[10], const uint16_t *noskip_mask, const int base_q, const int bx, const int by, const int bh, const int bitdepth_min_8) { int f[3] = {0, 
0, 0}; int s = 0; for (int dy = -1; dy <= 4; dy++) { for (int dx = -1; dx <= 4; dx++) { const int x = (bx << 2) + dx; const int y = 4 + dy; const int m = ptrs[y][x]; const int up = ptrs[y - 1][x]; const int down = ptrs[y + 1][x]; const int vert = up - 2 * m + down; const int up_right = ptrs[y - 1][x + 1]; const int down_left = ptrs[y + 1][x - 1]; const int anti_diag = up_right - 2 * m + down_left; const int down_right = ptrs[y + 1][x + 1]; const int up_left = ptrs[y - 1][x - 1]; const int diag = up_left - 2 * m + down_right; f[0] += abs(vert); f[1] += abs(anti_diag); f[2] += abs(diag); } } // count skip masks for center, sides, and corners. const uint8_t num_pixels[3] = {16, 4, 1}; for (int dy = -1; dy <= 1; dy++) { for (int dx = -1; dx <= 1; dx++) { const int edge = !!dy + !!dx; const int fx = iclip((bx & 15) + dx, 0, 15); const int fy = iclip(by + dy, 0, bh - 1); s += num_pixels[edge] * !((noskip_mask[fy] >> fx) & 1); } } const int rnd = 1 << bitdepth_min_8 >> 1; for (int i = 0; i < 3; i++) { f[i] = (f[i] * pc_wiener_normalizer[i] + rnd) >> bitdepth_min_8; } s = s * pc_wiener_normalizer[3]; int qval = (imax(0, get_qval_given_tskip(base_q, s, 0, bitdepth_min_8)) + (1 << 13)) >> 14; qval = imin(qval, 255) >> 5; int lut_idx = qval << 9; for (int i = 0; i < 3; i++) { qval = (imax(0, f[i] + get_qval_given_tskip(base_q, s, i + 1, bitdepth_min_8)) + (1 << 13)) >> 14; qval = imin(qval, 255) >> 5; lut_idx |= qval << (3 * (2-i)); } return lut_idx; } static void wiener_multi(pixel *p, const ptrdiff_t stride, const pixel (*left)[6], const pixel *lpf, const pixel *lpf_bottom, const int w, int h, const int8_t (*filters_user)[18], const int16_t (*filters_pretrained)[13], const uint8_t *subclass_lut, const uint16_t *noskip_mask, const int base_q, const enum LrEdgeFlags edges, const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; uint8_t classes[CLASS_BUF_SIZE]; pixel row_buffers[10][REST_UNIT_STRIDE]; pixel 
*bak_rows[10]; const pixel *ptrs[10]; for (int i = 0; i < 10; i++) bak_rows[i] = row_buffers[i] + 6; backup_row(bak_rows[4], p, left[0], w, 4, edges); ptrs[4] = bak_rows[4]; if (edges & LR_HAVE_TOP_INTEGRATED) { for (int i = 0; i < 4; i++) { backup_row_lpf(bak_rows[i], lpf, w, 4, edges); lpf += PXSTRIDE(stride); ptrs[i] = bak_rows[i]; } } else if (edges & LR_HAVE_TOP) { // y = -2,-1 backup_row_lpf(bak_rows[2], lpf, w, 4, edges); ptrs[2] = bak_rows[2]; backup_row_lpf(bak_rows[3], lpf + PXSTRIDE(stride), w, 4, edges); ptrs[3] = bak_rows[3]; // y = -3,-4 ptrs[0] = ptrs[1] = ptrs[2]; } else { ptrs[0] = ptrs[1] = ptrs[2] = ptrs[3] = ptrs[4]; } backup_row(bak_rows[5], p + PXSTRIDE(stride), left[1], w, 4, edges); ptrs[5] = bak_rows[5]; backup_row(bak_rows[6], p + 2*PXSTRIDE(stride), left[2], w, 4, edges); ptrs[6] = bak_rows[6]; backup_row(bak_rows[7], p + 3*PXSTRIDE(stride), left[3], w, 4, edges); ptrs[7] = bak_rows[7]; int bak_idx = 8; const int bh = h >> 2; const int bw = w >> 2; for (int by = 0; by < bh; by++) { // Backup an extra row to compute class if (by + 1 < bh) { // TODO: don't backup lines twice backup_row(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), left[(by << 2) + 4], w, 4, edges); ptrs[8] = bak_rows[bak_idx]; backup_row(bak_rows[9], p + 5*PXSTRIDE(stride), left[(by << 2) + 5], w, 4, edges); ptrs[9] = bak_rows[9]; } else if (edges & LR_HAVE_BOTTOM_INTEGRATED) { backup_row_lpf(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), w, 4, edges); ptrs[8] = bak_rows[bak_idx]; backup_row_lpf(bak_rows[9], p + 5*PXSTRIDE(stride), w, 4, edges); ptrs[9] = bak_rows[9]; } else if (edges & LR_HAVE_BOTTOM) { backup_row_lpf(bak_rows[bak_idx], lpf_bottom + 0 * PXSTRIDE(stride), w, 4, edges); ptrs[8] = bak_rows[bak_idx]; backup_row_lpf(bak_rows[9], lpf_bottom + 1 * PXSTRIDE(stride), w, 4, edges); ptrs[9] = bak_rows[9]; } else { ptrs[8] = ptrs[7]; ptrs[9] = ptrs[7]; } for (int bx = 0; bx < bw; bx++) { int lut_idx = get_class_lut_idx(ptrs, noskip_mask, base_q, bx, by, bh, bitdepth_min_8); 
// TODO: Convert these 2 lookup tables to a Pre compute a single lookup table ahead of time int cls = dav2d_pc_weiner_lut_to_class[lut_idx]; classes[bx] = subclass_lut[cls]; } for (int y = by << 2; y < (by << 2) + 4; y++) { if (y + 4 < h) { backup_row(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), left[y + 4], w, 4, edges); ptrs[8] = bak_rows[bak_idx]; } else if (edges & LR_HAVE_BOTTOM_INTEGRATED) { backup_row_lpf(bak_rows[bak_idx], p + 4*PXSTRIDE(stride), w, 4, edges); ptrs[8] = bak_rows[bak_idx]; } else if (y + 2 < h && edges & LR_HAVE_BOTTOM) { int offset_y = y + 4 - h; assert(offset_y < 2); backup_row_lpf(bak_rows[bak_idx], lpf_bottom + offset_y * PXSTRIDE(stride), w, 4, edges); ptrs[8] = bak_rows[bak_idx]; } else { ptrs[8] = ptrs[7]; } if (++bak_idx == 9) bak_idx = 0; for (int bx = 0; bx < bw; bx++) { if (ll_mask[y >> 2][0] & (1 << bx)) continue; if (filters_user) { const int8_t *filter = filters_user[classes[bx]]; for (int x = bx << 2; x < (bx << 2) + 4; x++) { const int m = ptrs[4][x]; int s = m << 7; for (int i = 0; i < 16; i++) { const int dy = wiener_ns_config_y[i][0]; const int dx = wiener_ns_config_y[i][1]; const int diff = ptrs[4 + dy][x + dx] + ptrs[4 - dy][x - dx] - 2 * m; s += diff * filter[i]; } const int v = (s + 64) >> 7; p[x] = iclip_pixel(v); } } else { const int16_t *filter = filters_pretrained[classes[bx]]; for (int x = bx << 2; x < (bx << 2) + 4; x++) { int s = ptrs[4][x] * filter[12]; for (int i = 0; i < 12; i++) { const int dy = pc_wiener_config[i][0]; const int dx = pc_wiener_config[i][1]; s += filter[i] * (ptrs[4 + dy][x + dx] + ptrs[4 - dy][x - dx]); } const int v = (s + 64) >> 7; p[x] = iclip_pixel(v); } } } for (int r = 0; r < 8; r++) ptrs[r] = ptrs[r+1]; p += PXSTRIDE(stride); } } } static void ns_wiener_multi_c(pixel *p, const ptrdiff_t stride, const pixel (*left)[6], const pixel *lpf, const pixel *lpf_bottom, const int w, int h, const WienerParams *params, const enum LrEdgeFlags edges, const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX) { 
wiener_multi(p, stride, left, lpf, lpf_bottom, w, h, params->multi.filters.user, NULL, params->multi.subclass_lut, params->multi.noskip_mask, params->multi.base_q, edges, ll_mask HIGHBD_TAIL_SUFFIX); } static void pc_wiener_c(pixel *p, const ptrdiff_t stride, const pixel (*left)[6], const pixel *lpf, const pixel *lpf_bottom, const int w, int h, const WienerParams *params, const enum LrEdgeFlags edges, const uint16_t (*ll_mask)[4] HIGHBD_DECL_SUFFIX) { wiener_multi(p, stride, left, lpf, lpf_bottom, w, h, NULL, params->multi.filters.pretrained, params->multi.subclass_lut, params->multi.noskip_mask, params->multi.base_q, edges, ll_mask HIGHBD_TAIL_SUFFIX); } // Sum 2x2 rows of gradients and store in dst static void compute_gradient_row(uint16_t (*dst)[4], const pixel **src, const int w, const int shift) { for (int x1 = 0; x1 < w + 2; x1 += 2) { const int8_t offs[4][2] = { { 1, 0 }, { 0, 1 }, { 1, 1 }, { -1, 1 } }; for (int d = 0; d < 4; d++) { int grad = 0; for (int x2 = 0; x2 < 2; x2++) { int x = x1 + x2; for (int y = 0; y < 2; y++) { int dy = offs[d][0]; int dx = offs[d][1]; int a = src[y - 1 - dy][x - 1 - dx] >> shift; int b = src[y - 1][x - 1] >> shift; int c = src[y - 1 + dy][x - 1 + dx] >> shift; grad += abs(b * 2 - a - c); } } dst[x1 >> 1][d] = grad; } } } static void gdf_prep_c(int8_t *dst, const ptrdiff_t dst_stride, const pixel *p, const ptrdiff_t stride, const pixel (*left)[6], const pixel *lpf, const pixel *lpf_bottom, const int w, const int h, const int ref_dst_idx, const int qp_idx, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { const int bitdepth = bitdepth_from_max(bitdepth_max); const int down_shift = bitdepth == 12 ? 2 : 0; const int up_shift = bitdepth == 8 ? 
2 : 0; uint16_t grad[2][GRADIENT_BUF_STRIDE][4]; pixel row_buffers[13][REST_UNIT_STRIDE]; pixel *bak_rows[13]; const pixel *ptrs[13]; for (int i = 0; i < 13; i++) bak_rows[i] = row_buffers[i] + 6; backup_row(bak_rows[6], p, left[0], w, 6, edges); ptrs[6] = bak_rows[6]; if (edges & LR_HAVE_TOP_INTEGRATED) { for (int n = 0; n < 6; n++) { backup_row_lpf(bak_rows[n], lpf + n * PXSTRIDE(stride), w, 6, edges); ptrs[n] = bak_rows[n]; } } else if (edges & LR_HAVE_TOP) { // y = -2,-1 backup_row_lpf(bak_rows[4], lpf, w, 6, edges); ptrs[4] = bak_rows[4]; backup_row_lpf(bak_rows[5], lpf + PXSTRIDE(stride), w, 6, edges); ptrs[5] = bak_rows[5]; // y = -3,-4,-5,-6 ptrs[0] = ptrs[1] = ptrs[2] = ptrs[3] = ptrs[4]; } else { ptrs[0] = ptrs[1] = ptrs[2] = ptrs[3] = ptrs[4] = ptrs[5] = ptrs[6]; } int bak_idx = 7; for (int y = 1; y < 6; y++, bak_idx++) { backup_row(bak_rows[bak_idx], p + y * PXSTRIDE(stride), left[y], w, 6, edges); ptrs[bak_idx] = bak_rows[bak_idx]; } const int8_t *error_lut; int scale; compute_gradient_row(grad[0], &ptrs[6], w, down_shift); int grad_bit = 1; if (ref_dst_idx == 0) { error_lut = dav2d_gdf_intra_error[qp_idx]; scale = 8; } else { error_lut = dav2d_gdf_inter_error[ref_dst_idx - 1][qp_idx]; scale = 5; } for (int y = 0; y < h; y++) { if (y + 6 < h) { backup_row(bak_rows[bak_idx], p + 6*PXSTRIDE(stride), left[y + 6], w, 6, edges); ptrs[12] = bak_rows[bak_idx]; } else if (edges & LR_HAVE_BOTTOM_INTEGRATED) { backup_row_lpf(bak_rows[bak_idx], p + 6*PXSTRIDE(stride), w, 6, edges); ptrs[12] = bak_rows[bak_idx]; } else if (y + 4 < h && edges & LR_HAVE_BOTTOM) { int offset_y = y + 6 - h; assert(offset_y < 2); backup_row_lpf(bak_rows[bak_idx], lpf_bottom + offset_y * PXSTRIDE(stride), w, 6, edges); ptrs[12] = bak_rows[bak_idx]; } else { ptrs[12] = ptrs[11]; } if (++bak_idx == 13) bak_idx = 0; if ((y & 1) == 0) { compute_gradient_row(grad[grad_bit], &ptrs[8], w, down_shift); grad_bit ^= 1; } for (int x1 = 0; x1 < w; x1 += 2) { // TODO: Don't recompute the same 
grad_sum/shared_vals on odd rows. int grad_sums[4] = { 0, 0, 0, 0 }; int shared_vals[3]; for (int d = 0; d < 4; d++) { int hx = x1 >> 1; // Compute gradients over a 4x4 region grad_sums[d] = grad[0][hx][d] + grad[0][hx + 1][d] + grad[1][hx][d] + grad[1][hx + 1][d]; } int cls = (grad_sums[0] <= grad_sums[1]) | ((grad_sums[2] <= grad_sums[3]) << 1); for (int idx = 0; idx < 3; idx++) shared_vals[idx] = dav2d_gdf_bias[ref_dst_idx][qp_idx][idx]; for (int d = 0; d < 4; d++) { const int k = d + 18; const int alpha = dav2d_gdf_alpha[ref_dst_idx][qp_idx][k][cls]; const int v = imin(grad_sums[d] >> (4 - up_shift), alpha); for (int idx = 0; idx < 3; idx++) shared_vals[idx] += v * dav2d_gdf_weight[ref_dst_idx][qp_idx][idx][k][cls]; } for (int x2 = 0; x2 < 2; x2++) { int x = x1 + x2; int idx_vals[3]; int m = ptrs[6][x] >> down_shift; for (int idx = 0; idx < 3; idx++) idx_vals[idx] = shared_vals[idx]; for (int k = 0; k < 18; k++) { const int alpha = dav2d_gdf_alpha[ref_dst_idx][qp_idx][k][cls]; const int dy = gdf_coords[k][0]; const int dx = gdf_coords[k][1]; const int a = ptrs[6 - dy][x - dx] >> down_shift; const int b = ptrs[6 + dy][x + dx] >> down_shift; const int above = iclip((a - m) * (1 << up_shift), -alpha, alpha); const int below = iclip((b - m) * (1 << up_shift), -alpha, alpha); const int v = iclip(above + below, -512, 511); for (int idx = 0; idx < 3; idx++) idx_vals[idx] += v * dav2d_gdf_weight[ref_dst_idx][qp_idx][idx][k][cls]; } int full_idx = 0; for (int idx = 0; idx < 3; idx++) { int v = idx_vals[idx] * scale; v = apply_sign((abs(v) + (1 << 14)) >> 15, v); int sub_idx = iclip(v, -scale, scale - 1) + scale; full_idx = full_idx * scale * 2 + sub_idx; } dst[x] = error_lut[full_idx]; } } for (int r = 0; r < 12; r++) ptrs[r] = ptrs[r+1]; dst += dst_stride; p += PXSTRIDE(stride); } } static void gdf_add_c(pixel *p_line, const ptrdiff_t stride, const int8_t *err_line, const ptrdiff_t err_stride, const int w, const int h, const int scale, const uint16_t (*ll_mask)[4] 
HIGHBD_DECL_SUFFIX) { const int shift = 12 - bitdepth_from_max(bitdepth_max); const int rnd = 1 << shift >> 1; for (int by = 0; by < h >> 2; by++) { for (int bx = 0; bx < w >> 2; bx++) { if (ll_mask[by][0] & (1 << bx)) continue; pixel *p = p_line; const int8_t *err = err_line; for (int y = by * 4; y < by * 4 + 4; y++) { for (int x = bx * 4; x < bx * 4 + 4; x++) { int diff = err[x] * scale; p[x] = iclip_pixel(p[x] + apply_sign((abs(diff) + rnd) >> shift, diff)); } p += PXSTRIDE(stride); err += err_stride; } } p_line += PXSTRIDE(stride) * 4; err_line += err_stride * 4; } } #if HAVE_ASM && 0 #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/looprestoration.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/looprestoration.h" #elif ARCH_PPC64LE #include "src/ppc/looprestoration.h" #elif ARCH_X86 #include "src/x86/looprestoration.h" #endif #endif COLD void bitfn(dav2d_loop_restoration_dsp_init)(Dav2dLoopRestorationDSPContext *const c, const int bpc) { c->ns_wiener_single[0] = ns_wiener_single_y_c; c->ns_wiener_single[1] = ns_wiener_single_uv_c; c->ns_wiener_multi = ns_wiener_multi_c; c->pc_wiener = pc_wiener_c; c->gdf_add = gdf_add_c; c->gdf_prep = gdf_prep_c; #if HAVE_ASM && 0 #if ARCH_AARCH64 || ARCH_ARM loop_restoration_dsp_init_arm(c, bpc); #elif ARCH_LOONGARCH64 loop_restoration_dsp_init_loongarch(c, bpc); #elif ARCH_PPC64LE loop_restoration_dsp_init_ppc(c, bpc); #elif ARCH_X86 loop_restoration_dsp_init_x86(c, bpc); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/lr_apply.h000066400000000000000000000034711517466257200226330ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_LR_APPLY_H #define DAV2D_SRC_LR_APPLY_H #include #include #include "common/bitdepth.h" #include "src/internal.h" enum LrRestorePlanes { LR_RESTORE_Y = 1 << 0, LR_RESTORE_U = 1 << 1, LR_RESTORE_V = 1 << 2, }; void bytefn(dav2d_lr_sbrow)(Dav2dFrameContext *const f, pixel *const dst[3], int sby); #endif /* DAV2D_SRC_LR_APPLY_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/lr_apply_tmpl.c000066400000000000000000000376561517466257200236760ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/intops.h" #include "src/lr_apply.h" enum FirstSbInTileRow { FIRST_SB_NONE = 0, FIRST_SB_TOP, FIRST_SB_BOTTOM, }; static void lr_stripe(const Dav2dFrameContext *const f, pixel *p, const pixel (*left)[6], int x, int y, const int plane, const int w, const int row_h, const Av2RestorationUnit *const lr, enum LrEdgeFlags edges, const enum FirstSbInTileRow first_sby_in_tile_row, const int tile_row_m1) { const Dav2dDSPContext *const dsp = f->dsp; const struct Dav2dNSWienerPlane *const pd = &f->frame_hdr->restoration.p[plane].ns; const int chroma = !!plane; const int ss_ver = chroma & (f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420); const int ss_hor = chroma & (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444); const ptrdiff_t stride = f->cur.p.stride[chroma]; const int sby = (y + (y ? 
8 << ss_ver : 0)) >> (6 - ss_ver + f->frame_hdr->sb128); const int have_tt = f->c->n_tc > 1; const pixel *lpf = f->lf.lr_db_line[plane] + have_tt * (sby * (4 << f->frame_hdr->sb128) - 4) * PXSTRIDE(stride) + x; const pixel *top = ((edges & (LR_HAVE_TOP | LR_HAVE_TOP_INTEGRATED)) == (LR_HAVE_TOP | LR_HAVE_TOP_INTEGRATED)) ? f->lf.lr_cdef_line[plane] + tile_row_m1 * (6 - 4 * chroma) * PXSTRIDE(stride) + x : NULL; const int sb256x = (x << ss_hor) >> 8; const int sb64x_idx = ((x << ss_hor) >> 6) & 3; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = imin((64 - 8 * !!first_sby_in_tile_row) >> ss_ver, row_h - y); int ref_dst_idx = f->lf.gdf_ref_dst_idx; int qp_idx = f->frame_hdr->gdf.qp_idx; int gdf_scale = f->frame_hdr->gdf.scale; wienerfilter_fn wiener_fn = NULL; WienerParams wiener_params; uint16_t noskip_mask[64 + 2]; int multi_wiener = 0; const enum Dav2dRestorationType wiener_type = (f->c->inloop_filters & DAV2D_INLOOPFILTER_WIENER) ? lr->type : DAV2D_RESTORATION_NONE; if (wiener_type == DAV2D_RESTORATION_NS_WIENER) { if (pd->frame_filters_on) { if (pd->num_classes == 1) { wiener_fn = dsp->lr.ns_wiener_single[chroma]; wiener_params.single.filter = pd->filter[0]; } else { multi_wiener = 1; wiener_fn = dsp->lr.ns_wiener_multi; wiener_params.multi.base_q = f->lf.base_q; wiener_params.multi.subclass_lut = f->lf.ns_subclass_lut; wiener_params.multi.filters.user = pd->filter; } } else { wiener_fn = dsp->lr.ns_wiener_single[chroma]; wiener_params.single.filter = lr->ns_filter[0]; } } else if (wiener_type == DAV2D_RESTORATION_PC_WIENER) { multi_wiener = 1; wiener_fn = dsp->lr.pc_wiener; wiener_params.multi.base_q = f->lf.base_q; wiener_params.multi.subclass_lut = f->lf.pc_subclass_lut; wiener_params.multi.filters.pretrained = f->lf.pc_filters; } if (multi_wiener) { wiener_params.multi.noskip_mask = noskip_mask; for (int by = y >> 2, r = 0; by < row_h >> 2; by++, r++) { int by_idx = by & 63; // TODO: add outer loop so we don't compute and 
offset by the same sb256_idx most iterations int sb256_idx = f->sb256w * (by >> 6) + sb256x; uint16_t* noskip_row = f->lf.mask[sb256_idx].lr_noskip_mask[by_idx]; noskip_mask[r] = noskip_row[sb64x_idx]; // extend masks on the right edge if (!(edges & LR_HAVE_RIGHT) && w & 63) { const int shift = ((w >> 2) & 15) - 1; const int mask = noskip_mask[r]; const int edge = mask >> shift; noskip_mask[r] |= edge << (shift + 1); } } } const ptrdiff_t lstride = f->cur.p.stride[0]; const pixel *llpf = f->lf.lr_db_line[0] + have_tt * (sby * (4 << f->frame_hdr->sb128) - 4) * PXSTRIDE(lstride) + x * 2; if (chroma) { wiener_params.single.ss_ver = ss_ver; wiener_params.single.ss_hor = ss_hor; wiener_params.single.stride = lstride; wiener_params.single.ds_flt = f->seq_hdr->cfl_ds_filter_index; } int8_t gdf_err[64*64]; while (y + stripe_h <= row_h) { if (chroma) { wiener_params.single.luma = &((pixel *) f->lf.p[0])[(x << ss_hor) + (y << ss_ver) * PXSTRIDE(lstride)]; wiener_params.single.luma_top = llpf; wiener_params.single.luma_bottom = llpf + 6 * PXSTRIDE(lstride); } // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h) edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; const int inc = (edges & (LR_HAVE_TOP | LR_HAVE_TOP_INTEGRATED | LR_HAVE_BOTTOM_INTEGRATED)) == LR_HAVE_TOP && y + 8 < (f->bh * 4 >> ss_ver) ? 8 : 0; int sb256_idx = f->sb256w * (((y << ss_ver) + inc) >> 8) + sb256x; int gdf = !plane && (f->c->inloop_filters & DAV2D_INLOOPFILTER_GDF) && f->lf.mask[sb256_idx].gdf[(((y + inc) >> 4) & 12) + sb64x_idx]; if (gdf) { dsp->lr.gdf_prep(gdf_err, 64, p, stride, left, top ? top : lpf, lpf + 6 * PXSTRIDE(stride), w, stripe_h, ref_dst_idx, qp_idx, edges HIGHBD_CALL_SUFFIX); } const int y4 = ((y << ss_ver) & 255) >> 2; const uint16_t (*ll_mask)[4]; uint16_t ll_uv_mask_mem[16][4]; if (!plane || !ss_hor) { ll_mask = plane ? 
(const uint16_t(*)[4]) &f->lf.mask[sb256_idx].lossless_mask_uv[y4][sb64x_idx] : (const uint16_t(*)[4]) &f->lf.mask[sb256_idx].lossless_mask_y[y4][sb64x_idx]; } else { ll_mask = (const uint16_t(*)[4]) &f->lf.mask[sb256_idx].lossless_mask_uv[y4 >> ss_ver][sb64x_idx]; const int init_y = y >> 2; for (int yy = init_y; yy < (y + stripe_h) >> 2; yy++, ll_mask++) ll_uv_mask_mem[yy - init_y][0] = ll_mask[0][0] | (ll_mask[0][1] << 8); ll_mask = ll_uv_mask_mem; } if (wiener_fn) { wiener_fn(p, stride, left, top ? top + (2 * !chroma) * PXSTRIDE(stride) : lpf, lpf + 6 * PXSTRIDE(stride), w, stripe_h, &wiener_params, edges, ll_mask HIGHBD_CALL_SUFFIX); if (multi_wiener) wiener_params.multi.noskip_mask += stripe_h >> 2; } if (gdf) { dsp->lr.gdf_add(p, stride, gdf_err, 64, w, stripe_h, gdf_scale, ll_mask HIGHBD_CALL_SUFFIX); } edges &= ~(LR_HAVE_BOTTOM_INTEGRATED | LR_HAVE_TOP_INTEGRATED); left += stripe_h; y += stripe_h; p += stripe_h * PXSTRIDE(stride); edges |= LR_HAVE_TOP; stripe_h = imin(64 >> ss_ver, row_h - y); if (stripe_h == 0) break; lpf += 4 * PXSTRIDE(stride); llpf += 4 * PXSTRIDE(lstride); top = NULL; } } static void backupNxU(pixel (*dst)[6], const pixel *src, const ptrdiff_t src_stride, int u, const int n) { for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride)) pixel_copy(&dst[0][6 - n], src - n, n); } static void lr_sbrow(const Dav2dFrameContext *const f, pixel *p, const int y, const int w, const int h, const int row_h, const int plane, const enum FirstSbInTileRow first_sby_in_tile_row, const int tile_row_m1) { const int chroma = !!plane; const int ss_ver = chroma & (f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420); const int ss_hor = chroma & (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444); const ptrdiff_t p_stride = f->cur.p.stride[chroma]; const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!plane]; const int unit_size = 1 << unit_size_log2; const int half_unit_size = unit_size >> 1; // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y) const 
int row_y = y + ((8 >> ss_ver) * !first_sby_in_tile_row); // FIXME This is an ugly hack to lookup the proper AV2Filter unit for // chroma planes. Question: For Multithreaded decoding, is it better // to store the chroma LR information with collocated Luma information? // In other words. For a chroma restoration unit locate at 128,128 and // with a 4:2:0 chroma subsampling, do we store the filter information at // the AV2Filter unit located at (128,128) or (256,256) // TODO Support chroma subsampling. const int shift_hor = 8 - ss_hor; /* maximum sbrow height is 256 + 8 rows offset */ ALIGN_STK_16(pixel, pre_lr_border, 2, [256 + 8][6]); const Av2RestorationUnit *lr[2]; enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT; if (first_sby_in_tile_row == FIRST_SB_TOP) edges |= LR_HAVE_BOTTOM_INTEGRATED; if (first_sby_in_tile_row == FIRST_SB_BOTTOM && y > 0) edges |= LR_HAVE_TOP_INTEGRATED; int aligned_unit_pos = row_y & ~(unit_size - 1); if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h) aligned_unit_pos -= unit_size; aligned_unit_pos <<= ss_ver; const int sb_idx = (aligned_unit_pos >> 8) * f->sb256w; const int unit_idx = ((aligned_unit_pos >> 6) & 0x3) << 2; lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx]; int restore = 1; // TODO: restore logic for disabling backups int x = 0, bit = 0; for (; x + 64 < w; p += 64, edges |= LR_HAVE_LEFT, bit ^= 1) { const int next_x = x + 64; int next_iter_lru_start_x = next_x & ~(unit_size - 1); if (next_iter_lru_start_x && w - next_iter_lru_start_x < half_unit_size) next_iter_lru_start_x -= unit_size; const int next_u_idx = unit_idx + ((next_iter_lru_start_x >> (shift_hor - 2)) & 3); lr[!bit] = &f->lf.lr_mask[sb_idx + (next_iter_lru_start_x >> shift_hor)].lr[plane][next_u_idx]; const int restore_next = 1; // FIXME could backup 4px for luma if gdf=off if (restore_next) backupNxU(pre_lr_border[bit], p + 64, p_stride, row_h - y, plane ? 
2 : 6); if (restore) lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, 64, row_h, lr[bit], edges, first_sby_in_tile_row, tile_row_m1); x = next_x; restore = restore_next; } if (restore) { edges &= ~LR_HAVE_RIGHT; const int end_w = w - x; lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, end_w, row_h, lr[bit], edges, first_sby_in_tile_row, tile_row_m1); } } static inline void copyNlines(pixel *dst, const pixel *src, const ptrdiff_t stride, const int n) { memcpy(dst, src, stride * n); } void bytefn(dav2d_lr_sbrow)(Dav2dFrameContext *const f, pixel *const dst[3], const int sby) { // TODO: strips starting at each tile row need to be shorted, not just the first row. const ptrdiff_t *const dst_stride = f->cur.p.stride; const int restore_planes = f->lf.restore_planes; const int not_last = sby + 1 < f->sbh; int first_sby_in_tile_row = f->lf.start_of_tile_row[sby]; const int tile_row = first_sby_in_tile_row >> 1; first_sby_in_tile_row &= 1; if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444; const int h = f->bh * 4 >> ss_ver; const int w = f->bw * 4 >> ss_hor; const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->frame_hdr->sb128); const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h); int offset_uv = 8 * !!sby >> ss_ver; int y_stripe = (sby << ((6 - ss_ver) + f->frame_hdr->sb128)) - offset_uv; if (sby && first_sby_in_tile_row) { if (restore_planes & LR_RESTORE_U) { copyNlines(&f->lf.lr_cdef_line[1][2 * PXSTRIDE(dst_stride[1]) * (tile_row - 1)], dst[1] - 2 * PXSTRIDE(dst_stride[1]), dst_stride[1], 2); lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, y_stripe + (8 >> ss_ver), 1, first_sby_in_tile_row * FIRST_SB_TOP, tile_row - 1); } if (restore_planes & LR_RESTORE_V) { copyNlines(&f->lf.lr_cdef_line[2][2 * PXSTRIDE(dst_stride[1]) * (tile_row - 1)], dst[2] - 2 * PXSTRIDE(dst_stride[1]), dst_stride[1], 2); 
lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, y_stripe + (8 >> ss_ver), 2, first_sby_in_tile_row * FIRST_SB_TOP, tile_row - 1); } offset_uv = 0; y_stripe += 8 >> ss_ver; } if (restore_planes & LR_RESTORE_U) lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, row_h, 1, first_sby_in_tile_row * FIRST_SB_BOTTOM, tile_row - 1); if (restore_planes & LR_RESTORE_V) lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, row_h, 2, first_sby_in_tile_row * FIRST_SB_BOTTOM, tile_row - 1); } if (restore_planes & LR_RESTORE_Y) { const int h = f->bh * 4; const int w = f->bw * 4; const int next_row_y = (sby + 1) << (6 + f->frame_hdr->sb128); int row_h = imin(next_row_y - 8 * not_last, h); int offset_y = 8 * !!sby; int y_stripe = (sby << (6 + f->frame_hdr->sb128)) - offset_y; if (sby && first_sby_in_tile_row) { copyNlines(&f->lf.lr_cdef_line[0][6 * PXSTRIDE(dst_stride[0]) * (tile_row - 1)], dst[0] - 6 * PXSTRIDE(dst_stride[0]), dst_stride[0], 6); lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w, h, y_stripe + 8, 0, first_sby_in_tile_row * FIRST_SB_TOP, tile_row - 1); offset_y = 0; y_stripe += 8; } lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w, h, row_h, 0, first_sby_in_tile_row * FIRST_SB_BOTTOM, tile_row - 1); } } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/mc.h000066400000000000000000000163411517466257200214100ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_MC_H #define DAV2D_SRC_MC_H #include #include #include "common/bitdepth.h" #include "src/levels.h" #define decl_mc_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_mc_fn(*mc_fn); #define decl_mc_scaled_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX) typedef decl_mc_scaled_fn(*mc_scaled_fn); #define decl_warp8x8_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_warp8x8_fn(*warp8x8_fn); #define decl_mct_fn(name) \ void (name)(int16_t *tmp, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_mct_fn(*mct_fn); #define decl_ext_warp4x4_fn(name) \ void (name)(pixel 
*dst, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_ext_warp4x4_fn(*ext_warp4x4_fn); #define decl_mct_scaled_fn(name) \ void (name)(int16_t *tmp, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX) typedef decl_mct_scaled_fn(*mct_scaled_fn); #define decl_warp8x8t_fn(name) \ void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \ const pixel *src, ptrdiff_t src_stride, \ const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_warp8x8t_fn(*warp8x8t_fn); #define decl_ext_warp4x4t_fn(name) \ void (name)(int16_t *tmp, const ptrdiff_t tmp_stride, \ const pixel *src, ptrdiff_t src_stride, \ int mx, int my HIGHBD_DECL_SUFFIX) typedef decl_ext_warp4x4t_fn(*ext_warp4x4t_fn); #define decl_avg_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const int16_t *tmp1, const int16_t *tmp2, int w, int h \ HIGHBD_DECL_SUFFIX) typedef decl_avg_fn(*avg_fn); #define decl_w_avg_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight \ HIGHBD_DECL_SUFFIX) typedef decl_w_avg_fn(*w_avg_fn); #define decl_mask_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \ const uint8_t *mask HIGHBD_DECL_SUFFIX) typedef decl_mask_fn(*mask_fn); #define decl_w_mask_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const int16_t *tmp1, const int16_t *tmp2, int w, int h, \ uint8_t *mask, ptrdiff_t mask_stride, int sign HIGHBD_DECL_SUFFIX) typedef decl_w_mask_fn(*w_mask_fn); #define decl_blend_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \ int w, int h, const uint8_t *mask) typedef decl_blend_fn(*blend_fn); #define decl_emu_edge_fn(name) \ void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \ pixel *dst, ptrdiff_t dst_stride, const pixel 
*src, ptrdiff_t src_stride)
typedef decl_emu_edge_fn(*emu_edge_fn);

/* In-place per-pixel transform dst = clip((alpha * dst + beta) >> 8). */
#define decl_morph_fn(name) \
void (name)(pixel *dst, ptrdiff_t dst_stride, int alpha, int beta, \
            int w, int h HIGHBD_DECL_SUFFIX)
typedef decl_morph_fn(*morph_fn);

/* Integer offset selected by the SAD search (rows in y, columns in x). */
struct OpflOffset {
    int8_t y, x;
};

/*
 * SAD-based search over mirrored offsets of p0/p1; the winning offset is
 * stored in *o.
 */
#define decl_sad_refine_mv_fn(name) \
void (name)(const pixel *p0, ptrdiff_t p0_stride, \
            const pixel *p1, ptrdiff_t p1_stride, \
            int w, int h, int is_implicit, \
            struct OpflOffset *o HIGHBD_DECL_SUFFIX)
typedef decl_sad_refine_mv_fn(*sad_refine_mv_fn);

/*
 * Two signed bytes that can also be viewed as one uint16_t; opfl_derive_mv
 * reads i8[0] / i8[1] as the weights applied to p0 / p1.
 */
PACKED(union aliasi16 {
    uint16_t u16;
    int8_t i8[2];
}) ATTR_ALIAS;

/* Sums of gradient products accumulated per bs x bs sub-block. */
struct OpflRegressionData {
    int32_t su2, suv, sv2, suw, svw;
};

/* Fills one OpflRegressionData per bs x bs sub-block of the w x h block. */
#define decl_opfl_derive_mv_fn(name) \
void (name)(struct OpflRegressionData *out, \
            const pixel *p0, ptrdiff_t p0_stride, \
            const pixel *p1, ptrdiff_t p1_stride, \
            int w, int h, int bs, const union aliasi16 d HIGHBD_DECL_SUFFIX)
typedef decl_opfl_derive_mv_fn(*opfl_derive_mv_fn);

/* Sum of absolute differences over an 8x8 block. */
#define decl_sad8x8_fn(name) \
unsigned (name)(const pixel *p0, ptrdiff_t p0_stride, \
                const pixel *p1, ptrdiff_t p1_stride HIGHBD_DECL_SUFFIX)
typedef decl_sad8x8_fn(*sad8x8_fn);

/* Declares the regular/smooth/sharp 8-tap variants of one function family. */
#define decl_8tap_gen(decl_name, fn_name, opt) \
    decl_##decl_name##_fn(BF(dav2d_##fn_name##_8tap_regular, opt)); \
    decl_##decl_name##_fn(BF(dav2d_##fn_name##_8tap_smooth,  opt)); \
    decl_##decl_name##_fn(BF(dav2d_##fn_name##_8tap_sharp,   opt))

/* Declares both the put (mc) and prep (mct) 8-tap families for opt. */
#define decl_8tap_fns(opt) \
    decl_8tap_gen(mc,  put,  opt); \
    decl_8tap_gen(mct, prep, opt)

/* Registers the three 8-tap variants via an init_<name>_fn helper macro. */
#define init_8tap_gen(name, opt) \
    init_##name##_fn(DAV2D_FILTER_8TAP_REGULAR, 8tap_regular, opt); \
    init_##name##_fn(DAV2D_FILTER_8TAP_SMOOTH,  8tap_smooth,  opt); \
    init_##name##_fn(DAV2D_FILTER_8TAP_SHARP,   8tap_sharp,   opt)

#define init_8tap_fns(opt) \
    init_8tap_gen(mc,  opt); \
    init_8tap_gen(mct, opt)

/* Function-pointer table for motion compensation, filled by dav2d_mc_dsp_init. */
typedef struct Dav2dMCDSPContext {
    mc_fn mc[DAV2D_N_FILTERS];
    mc_scaled_fn mc_scaled[DAV2D_N_FILTERS];
    mct_fn mct[DAV2D_N_FILTERS];
    mct_scaled_fn mct_scaled[DAV2D_N_FILTERS];
    avg_fn avg;
    w_avg_fn w_avg;
    mask_fn mask;
    w_mask_fn w_mask[3 /* 444, 422,
420 */];
    blend_fn blend;
    warp8x8_fn warp8x8;           /* affine warp, pixel output */
    warp8x8t_fn warp8x8t;         /* affine warp, int16_t intermediate output */
    ext_warp4x4_fn ext_warp4x4;
    ext_warp4x4t_fn ext_warp4x4t;
    emu_edge_fn emu_edge;
    morph_fn morph;
    opfl_derive_mv_fn opfl_derive_mv;
    sad_refine_mv_fn sad_refine_mv;
    sad8x8_fn sad8x8;
} Dav2dMCDSPContext;

bitfn_decls(void dav2d_mc_dsp_init, Dav2dMCDSPContext *c);

#endif /* DAV2D_SRC_MC_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/mc_tmpl.c000066400000000000000000001207541517466257200224430ustar00rootroot00000000000000
/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "config.h" #include #include #include "common/attributes.h" #include "common/intops.h" #include "src/mc.h" #include "src/tables.h" #if BITDEPTH == 8 #define get_intermediate_bits(bitdepth_max) 4 // Output in interval [-5132, 9212], fits in int16_t as is #define PREP_BIAS 0 #else // 4 for 10 bits/component, 2 for 12 bits/component #define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max)) // Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit) // Subtract a bias to ensure the output fits in int16_t #define PREP_BIAS 8192 #endif static NOINLINE void put_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *src, const ptrdiff_t src_stride, const int w, int h) { do { pixel_copy(dst, src, w); dst += dst_stride; src += src_stride; } while (--h); } static NOINLINE void prep_c(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, const ptrdiff_t src_stride, const int w, int h HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); do { for (int x = 0; x < w; x++) tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS; tmp += tmp_stride; src += src_stride; } while (--h); } #define FILTER_8TAP(src, x, F, stride) \ (F[0] * src[x + -3 * stride] + \ F[1] * src[x + -2 * stride] + \ F[2] * src[x + -1 * stride] + \ F[3] * src[x + +0 * stride] + \ F[4] * src[x + +1 * stride] + \ F[5] * src[x + +2 * stride] + \ F[6] * src[x + +3 * stride] + \ F[7] * src[x + +4 * stride]) #define FILTER_8TAP2(src, x, F) \ (F[0] * src[0][x] + \ F[1] * src[1][x] + \ F[2] * src[2][x] + \ F[3] * src[3][x] + \ F[4] * src[4][x] + \ F[5] * src[5][x] + \ F[6] * src[6][x] + \ F[7] * src[7][x]) #define DAV2D_FILTER_8TAP_RND(src, x, F, stride, sh) \ ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) #define DAV2D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \ ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh)) #define DAV2D_FILTER_8TAP_RND3(src, x, F, sh) \ ((FILTER_8TAP2(src, x, F) + ((1 << (sh)) >> 1)) >> 
(sh))

/* Rounded 8-tap results clipped to the valid pixel range. */
#define DAV2D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(DAV2D_FILTER_8TAP_RND(src, x, F, stride, sh))

#define DAV2D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
    iclip_pixel(DAV2D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))

#define DAV2D_FILTER_8TAP_CLIP3(src, x, F, sh) \
    iclip_pixel(DAV2D_FILTER_8TAP_RND3(src, x, F, sh))

/*
 * Declares a local `fh` holding the horizontal filter taps, or NULL when the
 * fractional position is 0. Requires `filter_type` and `w` in scope:
 * filter_type == -1 selects the ext-warp filter table; otherwise blocks wider
 * than 4 use row `filter_type` of the subpel table and narrower blocks use
 * row 3 + (filter_type & 1).
 */
#define GET_H_FILTER(mx) \
    const int8_t *const fh = !(mx) ? NULL : filter_type == -1 ? \
        dav2d_ext_warp_filter[(mx) - 1] : w > 4 ? \
        dav2d_mc_subpel_filters[filter_type][(mx) - 1] : \
        dav2d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]

/* Vertical counterpart of GET_H_FILTER: declares `fv`, keyed on `h`. */
#define GET_V_FILTER(my) \
    const int8_t *const fv = !(my) ? NULL : filter_type == -1 ? \
        dav2d_ext_warp_filter[(my) - 1] : h > 4 ? \
        dav2d_mc_subpel_filters[filter_type][(my) - 1] : \
        dav2d_mc_subpel_filters[3 + (filter_type & 1)][(my) - 1]

/* Declares both `fh` and `fv` from the enclosing function's mx / my. */
#define GET_FILTERS() \
    GET_H_FILTER(mx); \
    GET_V_FILTER(my)

/*
 * 8-tap subpel "put": filters src into dst pixels. Depending on which of
 * fh / fv are non-NULL this runs a 2-D, horizontal-only, vertical-only or
 * plain-copy path. filter_type < 0 (ext warp) uses one extra bit of filter
 * precision (bits = 7 instead of 6).
 */
static NOINLINE void
put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
           const pixel *src, ptrdiff_t src_stride,
           const int w, int h, const int mx, const int my,
           const int filter_type HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int bits = 6 + (filter_type < 0);
    // rounding for the horizontal-only path: both rounding steps folded into one
    const int intermediate_rnd = ((1 << bits) >> 1) +
                                 ((1 << (bits - intermediate_bits)) >> 1);

    GET_FILTERS();
    // from here on both strides are in pixel units
    dst_stride = PXSTRIDE(dst_stride);
    src_stride = PXSTRIDE(src_stride);

    assert(!(w & (w - 1)) && w >= 2 && w <= 64); // w2/h2 used by sub8x8 chroma?
assert(!(h & (h - 1)) && h >= 2 && h <= 64);

    if (fh) {
        if (fv) {
            // 2-D: filter h + 7 rows horizontally into mid (row stride 64),
            // starting 3 rows above the block, then filter mid vertically
            int tmp_h = h + 7;
            int16_t mid[64 * (64 + 7)], *mid_ptr = mid;

            src -= src_stride * 3;
            do {
                for (int x = 0; x < w; x++)
                    mid_ptr[x] = DAV2D_FILTER_8TAP_RND(src, x, fh, 1,
                                                       bits - intermediate_bits);

                mid_ptr += 64;
                src += src_stride;
            } while (--tmp_h);

            mid_ptr = mid + 64 * 3;
            do {
                for (int x = 0; x < w; x++)
                    dst[x] = DAV2D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 64,
                                                    bits + intermediate_bits);

                mid_ptr += 64;
                dst += dst_stride;
            } while (--h);
        } else {
            // horizontal only
            do {
                for (int x = 0; x < w; x++) {
                    dst[x] = DAV2D_FILTER_8TAP_CLIP2(src, x, fh, 1,
                                                     intermediate_rnd, bits);
                }

                dst += dst_stride;
                src += src_stride;
            } while (--h);
        }
    } else if (fv) {
        // vertical only
        do {
            for (int x = 0; x < w; x++)
                dst[x] = DAV2D_FILTER_8TAP_CLIP(src, x, fv, src_stride, bits);

            dst += dst_stride;
            src += src_stride;
        } while (--h);
    } else
        // integer position: straight copy (strides already in pixel units)
        put_c(dst, dst_stride, src, src_stride, w, h);
}

/* 4x4 "extended warp" put: put_8tap_c with filter_type -1 (ext-warp filters). */
static void ext_warp4x4_c(pixel *dst, const ptrdiff_t dst_stride,
                          const pixel *const src, const ptrdiff_t src_stride,
                          const int mx, const int my HIGHBD_DECL_SUFFIX)
{
    put_8tap_c(dst, dst_stride, src, src_stride, 4, 4, mx, my,
               -1 HIGHBD_TAIL_SUFFIX);
}

/*
 * Scaled 8-tap "put". mx/my and dx/dy carry 10 fractional bits: bits 9..6
 * select the filter phase and the part above bit 9 is the integer source
 * position. Horizontally filtered rows are kept in a rotating window of
 * eight row buffers (mid_ptrs) that the vertical filter reads.
 */
static NOINLINE void
put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
                  const pixel *src, ptrdiff_t src_stride,
                  const int w, int h, const int mx, int my,
                  const int dx, const int dy, const int filter_type
                  HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
    int16_t mid[8][64];
    int16_t *mid_ptrs[8];
    // index of the last source row filtered so far; -8 forces the first
    // output row to fill all eight window rows
    int in_y = -8;
    src_stride = PXSTRIDE(src_stride);

    assert(!(w & (w - 1)) && w >= 2 && w <= 64);
    assert(!(h & (h - 1)) && h >= 2 && h <= 64);

    for (int i = 0; i < 8; i++)
        mid_ptrs[i] = mid[i];

    src -= src_stride * 3;
    for (int y = 0; y < h; y++) {
        int x;
        int src_y = my >> 10;
        GET_V_FILTER((my & 0x3ff) >> 6);

        while (in_y < src_y) {
            int imx = mx, ioff = 0;
            // recycle the oldest row buffer as the newest one
            int16_t *mid_ptr = mid_ptrs[0];

            for (int i = 0; i < 7; i++)
                mid_ptrs[i] = mid_ptrs[i + 1];
mid_ptrs[7] = mid_ptr; for (x = 0; x < w; x++) { GET_H_FILTER(imx >> 6); mid_ptr[x] = fh ? DAV2D_FILTER_8TAP_RND(src, ioff, fh, 1, 6 - intermediate_bits) : src[ioff] << intermediate_bits; imx += dx; ioff += imx >> 10; imx &= 0x3ff; } src += src_stride; in_y++; } for (x = 0; x < w; x++) dst[x] = fv ? DAV2D_FILTER_8TAP_CLIP3(mid_ptrs, x, fv, 6 + intermediate_bits) : iclip_pixel((mid_ptrs[3][x] + intermediate_rnd) >> intermediate_bits); my += dy; dst += PXSTRIDE(dst_stride); } } static NOINLINE void prep_8tap_c(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, const int w, int h, const int mx, const int my, const int filter_type HIGHBD_DECL_SUFFIX) { const int bits = 6 + (filter_type < 0); const int intermediate_bits = get_intermediate_bits(bitdepth_max); GET_FILTERS(); src_stride = PXSTRIDE(src_stride); assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); if (fh) { if (fv) { int tmp_h = h + 7; int16_t mid[64 * (64 + 7)], *mid_ptr = mid; src -= src_stride * 3; do { for (int x = 0; x < w; x++) mid_ptr[x] = DAV2D_FILTER_8TAP_RND(src, x, fh, 1, bits - intermediate_bits); mid_ptr += 64; src += src_stride; } while (--tmp_h); mid_ptr = mid + 64 * 3; do { for (int x = 0; x < w; x++) { int t = DAV2D_FILTER_8TAP_RND(mid_ptr, x, fv, 64, bits) - PREP_BIAS; assert(t >= INT16_MIN && t <= INT16_MAX); tmp[x] = t; } mid_ptr += 64; tmp += tmp_stride; } while (--h); } else { do { for (int x = 0; x < w; x++) tmp[x] = DAV2D_FILTER_8TAP_RND(src, x, fh, 1, bits - intermediate_bits) - PREP_BIAS; tmp += tmp_stride; src += src_stride; } while (--h); } } else if (fv) { do { for (int x = 0; x < w; x++) tmp[x] = DAV2D_FILTER_8TAP_RND(src, x, fv, src_stride, bits - intermediate_bits) - PREP_BIAS; tmp += tmp_stride; src += src_stride; } while (--h); } else prep_c(tmp, tmp_stride, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); } static void ext_warp4x4t_c(int16_t *const tmp, const ptrdiff_t tmp_stride, const pixel *const src, const 
ptrdiff_t src_stride, const int mx,
                           const int my HIGHBD_DECL_SUFFIX)
{
    // 4x4 "extended warp" prep: prep_8tap_c with filter_type -1
    prep_8tap_c(tmp, tmp_stride, src, src_stride, 4, 4, mx, my,
                -1 HIGHBD_TAIL_SUFFIX);
}

/*
 * Scaled 8-tap "prep": same row-window scheme as put_8tap_scaled_c, but the
 * vertical stage stores int16_t intermediates (minus PREP_BIAS) in tmp.
 * mx/my and dx/dy carry 10 fractional bits.
 */
static NOINLINE void
prep_8tap_scaled_c(int16_t *tmp, const ptrdiff_t tmp_stride,
                   const pixel *src, ptrdiff_t src_stride,
                   const int w, int h, const int mx, int my,
                   const int dx, const int dy, const int filter_type
                   HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[8][64];
    int16_t *mid_ptrs[8];
    // -8 forces the first output row to fill all eight window rows
    int in_y = -8;
    src_stride = PXSTRIDE(src_stride);

    assert(!(w & (w - 1)) && w >= 4 && w <= 64);
    assert(!(h & (h - 1)) && h >= 4 && h <= 64);

    for (int i = 0; i < 8; i++)
        mid_ptrs[i] = mid[i];

    src -= src_stride * 3;
    for (int y = 0; y < h; y++) {
        int x;
        int src_y = my >> 10;
        GET_V_FILTER((my & 0x3ff) >> 6);

        while (in_y < src_y) {
            int imx = mx, ioff = 0;
            // recycle the oldest row buffer as the newest one
            int16_t *mid_ptr = mid_ptrs[0];

            for (int i = 0; i < 7; i++)
                mid_ptrs[i] = mid_ptrs[i + 1];
            mid_ptrs[7] = mid_ptr;

            for (x = 0; x < w; x++) {
                GET_H_FILTER(imx >> 6);
                mid_ptr[x] = fh ? DAV2D_FILTER_8TAP_RND(src, ioff, fh, 1,
                                                        6 - intermediate_bits) :
                                  src[ioff] << intermediate_bits;
                // advance the horizontal position by dx (10 fractional bits)
                imx += dx;
                ioff += imx >> 10;
                imx &= 0x3ff;
            }

            src += src_stride;
            in_y++;
        }

        for (x = 0; x < w; x++)
            tmp[x] = (fv ?
DAV2D_FILTER_8TAP_RND3(mid_ptrs, x, fv, 6) :
                      mid_ptrs[3][x]) - PREP_BIAS;

        my += dy;
        tmp += tmp_stride;
    }
}

/*
 * Instantiates the four public-signature wrappers (put, put_scaled, prep,
 * prep_scaled) for one 8-tap filter type; each forwards to the shared
 * implementation with `type` as filter_type.
 */
#define filter_fns(name, type) \
static void put_8tap_##name##_c(pixel *const dst, \
                                const ptrdiff_t dst_stride, \
                                const pixel *const src, \
                                const ptrdiff_t src_stride, \
                                const int w, const int h, \
                                const int mx, const int my \
                                HIGHBD_DECL_SUFFIX) \
{ \
    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
               type HIGHBD_TAIL_SUFFIX); \
} \
static void put_8tap_##name##_scaled_c(pixel *const dst, \
                                       const ptrdiff_t dst_stride, \
                                       const pixel *const src, \
                                       const ptrdiff_t src_stride, \
                                       const int w, const int h, \
                                       const int mx, const int my, \
                                       const int dx, const int dy \
                                       HIGHBD_DECL_SUFFIX) \
{ \
    put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
                      type HIGHBD_TAIL_SUFFIX); \
} \
static void prep_8tap_##name##_c(int16_t *const tmp, \
                                 const ptrdiff_t tmp_stride, \
                                 const pixel *const src, \
                                 const ptrdiff_t src_stride, \
                                 const int w, const int h, \
                                 const int mx, const int my \
                                 HIGHBD_DECL_SUFFIX) \
{ \
    prep_8tap_c(tmp, tmp_stride, src, src_stride, w, h, mx, my, \
                type HIGHBD_TAIL_SUFFIX); \
} \
static void prep_8tap_##name##_scaled_c(int16_t *const tmp, \
                                        const ptrdiff_t tmp_stride, \
                                        const pixel *const src, \
                                        const ptrdiff_t src_stride, \
                                        const int w, const int h, \
                                        const int mx, const int my, \
                                        const int dx, const int dy \
                                        HIGHBD_DECL_SUFFIX) \
{ \
    prep_8tap_scaled_c(tmp, tmp_stride, src, src_stride, w, h, mx, my, dx, dy, \
                       type HIGHBD_TAIL_SUFFIX); \
}

filter_fns(regular, DAV2D_FILTER_8TAP_REGULAR)
filter_fns(smooth,  DAV2D_FILTER_8TAP_SMOOTH)
filter_fns(sharp,   DAV2D_FILTER_8TAP_SHARP)

/* Bilinear interpolation between src[x] and src[x + stride]; mxy is in 1/16ths. */
#define FILTER_BILIN(src, x, mxy, stride) \
    (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))

#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))

#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))

/* Two-row-pointer variant: interpolates between src1[x] and src2[x]. */
#define FILTER_BILIN2(src1, src2, \
x, mxy) \ (16 * src1[x] + ((mxy) * (src2[x] - src1[x]))) #define FILTER_BILIN_RND2(src1, src2, x, mxy, sh) \ ((FILTER_BILIN2(src1, src2, x, mxy) + ((1 << (sh)) >> 1)) >> (sh)) #define FILTER_BILIN_CLIP2(src1, src2, x, mxy, sh) \ iclip_pixel(FILTER_BILIN_RND2(src1, src2, x, mxy, sh)) static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, const int w, int h, const int mx, const int my HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); const int intermediate_rnd = (1 << intermediate_bits) >> 1; dst_stride = PXSTRIDE(dst_stride); src_stride = PXSTRIDE(src_stride); assert(!(w & (w - 1)) && w >= 2 && w <= 64); // h=24 can happen for refinemv slices of height 16 and 8 pixels padding assert((!(h & (h - 1)) && h >= 2 && h <= 64) || h == 24); if (mx) { if (my) { int16_t mid[64 * (64 + 7)], *mid_ptr = mid; int tmp_h = h + 1; do { for (int x = 0; x < w; x++) mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, 4 - intermediate_bits); mid_ptr += 64; src += src_stride; } while (--tmp_h); mid_ptr = mid; do { for (int x = 0; x < w; x++) dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 64, 4 + intermediate_bits); mid_ptr += 64; dst += dst_stride; } while (--h); } else { do { for (int x = 0; x < w; x++) { const int px = FILTER_BILIN_RND(src, x, mx, 1, 4 - intermediate_bits); dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits); } dst += dst_stride; src += src_stride; } while (--h); } } else if (my) { do { for (int x = 0; x < w; x++) dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4); dst += dst_stride; src += src_stride; } while (--h); } else put_c(dst, dst_stride, src, src_stride, w, h); } static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, const int w, int h, const int mx, int my, const int dx, const int dy HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); int16_t mid[2][64]; int in_y = -2; assert(!(w & (w - 
1)) && w >= 2 && w <= 64); assert(!(h & (h - 1)) && h >= 2 && h <= 64); do { int x; int y = my >> 10; int16_t *mid1 = mid[y & 1]; int16_t *mid2 = mid[(y & 1) ^ 1]; int dmy = my & 0x3ff; while (in_y < y) { int imx = mx, ioff = 0; int16_t *mid_ptr = mid[in_y & 1]; for (x = 0; x < w; x++) { mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, 4 - intermediate_bits); imx += dx; ioff += imx >> 10; imx &= 0x3ff; } src += PXSTRIDE(src_stride); in_y++; } for (x = 0; x < w; x++) dst[x] = FILTER_BILIN_CLIP2(mid1, mid2, x, dmy >> 6, 4 + intermediate_bits); my += dy; dst += PXSTRIDE(dst_stride); } while (--h); } static void prep_bilin_c(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, const int w, int h, const int mx, const int my HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); src_stride = PXSTRIDE(src_stride); assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); if (mx) { if (my) { int16_t mid[64 * (64 + 1)], *mid_ptr = mid; int tmp_h = h + 1; do { for (int x = 0; x < w; x++) mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1, 4 - intermediate_bits); mid_ptr += 64; src += src_stride; } while (--tmp_h); mid_ptr = mid; do { for (int x = 0; x < w; x++) tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 64, 4) - PREP_BIAS; mid_ptr += 64; tmp += tmp_stride; } while (--h); } else { do { for (int x = 0; x < w; x++) tmp[x] = FILTER_BILIN_RND(src, x, mx, 1, 4 - intermediate_bits) - PREP_BIAS; tmp += tmp_stride; src += src_stride; } while (--h); } } else if (my) { do { for (int x = 0; x < w; x++) tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride, 4 - intermediate_bits) - PREP_BIAS; tmp += tmp_stride; src += src_stride; } while (--h); } else prep_c(tmp, tmp_stride, src, src_stride, w, h HIGHBD_TAIL_SUFFIX); } static void prep_bilin_scaled_c(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, const int w, int h, const int mx, int my, const int dx, const int dy 
HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); int16_t mid[2][64]; int in_y = -2; assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); do { int x; int y = my >> 10; int16_t *mid1 = mid[y & 1]; int16_t *mid2 = mid[(y & 1) ^ 1]; int dmy = my & 0x3ff; while (in_y < y) { int imx = mx, ioff = 0; int16_t *mid_ptr = mid[in_y & 1]; for (x = 0; x < w; x++) { mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1, 4 - intermediate_bits); imx += dx; ioff += imx >> 10; imx &= 0x3ff; } src += PXSTRIDE(src_stride); in_y++; } for (x = 0; x < w; x++) tmp[x] = FILTER_BILIN_RND2(mid1, mid2, x, dmy >> 6, 4) - PREP_BIAS; my += dy; tmp += tmp_stride; } while (--h); } static void avg_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); const int sh = intermediate_bits + 1; const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2; assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); do { for (int x = 0; x < w; x++) dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh); tmp1 += w; tmp2 += w; dst += PXSTRIDE(dst_stride); } while (--h); } static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, const int weight HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); const int sh = intermediate_bits + 4; const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16; assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); do { for (int x = 0; x < w; x++) dst[x] = iclip_pixel((tmp1[x] * weight + tmp2[x] * (16 - weight) + rnd) >> sh); tmp1 += w; tmp2 += w; dst += PXSTRIDE(dst_stride); } while (--h); } static void mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, 
const uint8_t *mask HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); const int sh = intermediate_bits + 6; const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); do { for (int x = 0; x < w; x++) dst[x] = iclip_pixel((tmp1[x] * mask[x] + tmp2[x] * (64 - mask[x]) + rnd) >> sh); tmp1 += w; tmp2 += w; mask += w; dst += PXSTRIDE(dst_stride); } while (--h); } static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h, const uint8_t *mask) { assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); do { for (int x = 0; x < w; x++) dst[x] = ((dst[x] * (64 - mask[x]) + tmp[x] * mask[x]) + 32) >> 6; dst += PXSTRIDE(dst_stride); tmp += w; mask += w; } while (--h); } static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, const int w, int h, uint8_t *mask, const ptrdiff_t mask_stride, const int sign, const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX) { // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows, // and then load this intermediate to calculate final value for odd rows const int intermediate_bits = get_intermediate_bits(bitdepth_max); const int bitdepth = bitdepth_from_max(bitdepth_max); const int sh = intermediate_bits + 6; const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; const int mask_sh = bitdepth + intermediate_bits - 4; const int mask_rnd = 1 << (mask_sh - 5); assert(!(w & (w - 1)) && w >= 4 && w <= 64); assert(!(h & (h - 1)) && h >= 4 && h <= 64); // w<64 means multiple lines can be written together (they live adjacent // in memory). This is only not true for w=64 luma blocks, where horizontally // adjacent blocks may make up a single chroma (w=64 after subsampling) block. 
assert(mask_stride == (w >> ss_hor) || mask_stride == w);
    assert((w == 64 && ss_hor) || mask_stride == (w >> ss_hor));

    do {
        for (int x = 0; x < w; x++) {
            // blend weight for tmp1: 38 plus the scaled absolute difference,
            // capped at 64
            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) +
                                      mask_rnd) >> mask_sh), 64);
            dst[x] = iclip_pixel((tmp1[x] * m +
                                  tmp2[x] * (64 - m) + rnd) >> sh);

            if (ss_hor) {
                // horizontally subsampled mask: process the pixel pair
                x++;

                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) +
                                          mask_rnd) >> mask_sh), 64);
                dst[x] = iclip_pixel((tmp1[x] * n +
                                      tmp2[x] * (64 - n) + rnd) >> sh);

                if (h & ss_ver) {
                    // second row of a 2x2 group: combine with the stored 2x1 sum
                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
                } else if (ss_ver) {
                    // first row of a 2x2 group: stash the 2x1 sum
                    mask[x >> 1] = m + n;
                } else {
                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
                }
            } else {
                mask[x] = m;
            }
        }

        tmp1 += w;
        tmp2 += w;
        dst += PXSTRIDE(dst_stride);
        // with vertical subsampling the mask row advances every other line
        if (!ss_ver || (h & 1)) mask += mask_stride;
    } while (--h);
}

/* Instantiates a w_mask entry point with fixed chroma subsampling flags. */
#define w_mask_fns(ssn, ss_hor, ss_ver) \
static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
                             const int16_t *const tmp1, const int16_t *const tmp2, \
                             const int w, const int h, uint8_t *mask, \
                             const ptrdiff_t mask_stride, \
                             const int sign HIGHBD_DECL_SUFFIX) \
{ \
    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, mask_stride, sign, \
             ss_hor, ss_ver HIGHBD_TAIL_SUFFIX); \
}

w_mask_fns(444, 0, 0);
w_mask_fns(422, 1, 0);
w_mask_fns(420, 1, 1);

#undef w_mask_fns

/* 8-tap warp filter over src[x - 3*stride] .. src[x + 4*stride], rounded. */
#define FILTER_WARP_RND(src, x, F, stride, sh) \
    ((F[0] * src[x - 3 * stride] + \
      F[1] * src[x - 2 * stride] + \
      F[2] * src[x - 1 * stride] + \
      F[3] * src[x + 0 * stride] + \
      F[4] * src[x + 1 * stride] + \
      F[5] * src[x + 2 * stride] + \
      F[6] * src[x + 3 * stride] + \
      F[7] * src[x + 4 * stride] + \
      ((1 << (sh)) >> 1)) >> (sh))

#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))

/*
 * 8x8 affine warp to pixels: 15 rows are filtered horizontally into mid
 * (8 wide), then 8 output rows are filtered vertically. The filter is chosen
 * per pixel from the running positions, stepped by abcd[0..3].
 */
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[15 * 8], *mid_ptr = mid;

    src -= 3 *
PXSTRIDE(src_stride); for (int y = 0; y < 15; y++, mx += abcd[1]) { for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { const int8_t *const filter = dav2d_mc_warp_filter[3*64 + ((tmx + 512) >> 10)]; mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 7 - intermediate_bits); } src += PXSTRIDE(src_stride); mid_ptr += 8; } mid_ptr = &mid[3 * 8]; for (int y = 0; y < 8; y++, my += abcd[3]) { for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { const int8_t *const filter = dav2d_mc_warp_filter[3*64 + ((tmy + 512) >> 10)]; dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8, 7 + intermediate_bits); } mid_ptr += 8; dst += PXSTRIDE(dst_stride); } } static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, const ptrdiff_t src_stride, const int16_t *const abcd, int mx, int my HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); int16_t mid[15 * 8], *mid_ptr = mid; src -= 3 * PXSTRIDE(src_stride); for (int y = 0; y < 15; y++, mx += abcd[1]) { for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) { const int8_t *const filter = dav2d_mc_warp_filter[64*3 + ((tmx + 512) >> 10)]; mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 7 - intermediate_bits); } src += PXSTRIDE(src_stride); mid_ptr += 8; } mid_ptr = &mid[3 * 8]; for (int y = 0; y < 8; y++, my += abcd[3]) { for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) { const int8_t *const filter = dav2d_mc_warp_filter[64*3 + ((tmy + 512) >> 10)]; tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS; } mid_ptr += 8; tmp += tmp_stride; } } static void emu_edge_c(const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, const pixel *ref, const ptrdiff_t ref_stride) { // find offset in reference of visible block to copy ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) + iclip((int) x, 0, (int) iw - 1); // number of pixels to extend (left, right, top, 
bottom) const int left_ext = iclip((int) -x, 0, (int) bw - 1); const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1); assert(left_ext + right_ext < bw); const int top_ext = iclip((int) -y, 0, (int) bh - 1); const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1); assert(top_ext + bottom_ext < bh); // copy visible portion first pixel *blk = dst + top_ext * PXSTRIDE(dst_stride); const int center_w = (int) (bw - left_ext - right_ext); const int center_h = (int) (bh - top_ext - bottom_ext); for (int y = 0; y < center_h; y++) { pixel_copy(blk + left_ext, ref, center_w); // extend left edge for this line if (left_ext) pixel_set(blk, blk[left_ext], left_ext); // extend right edge for this line if (right_ext) pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1], right_ext); ref += PXSTRIDE(ref_stride); blk += PXSTRIDE(dst_stride); } // copy top blk = dst + top_ext * PXSTRIDE(dst_stride); for (int y = 0; y < top_ext; y++) { pixel_copy(dst, blk, bw); dst += PXSTRIDE(dst_stride); } // copy bottom dst += center_h * PXSTRIDE(dst_stride); for (int y = 0; y < bottom_ext; y++) { pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw); dst += PXSTRIDE(dst_stride); } } static void morph_c(pixel *dst, const ptrdiff_t dst_stride, const int alpha, const int beta, const int w, const int h HIGHBD_DECL_SUFFIX) { assert(!(w & (w - 1)) && !(h & (h - 1))); assert(w >= 4 && w <= 64 && h >= 4 && h <= 64); assert(alpha > -512 && alpha < 512); for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { dst[x] = iclip_pixel((alpha * dst[x] + beta) >> 8); } dst += PXSTRIDE(dst_stride); } } static int sad_nxn(const pixel *p0, const ptrdiff_t p0_stride, const pixel *p1, const ptrdiff_t p1_stride, const int w, const int h, const int bd_min8) { int sad = 0; for (int y = 0; y < h; y += 2) { for (int x = 0; x < w; x++) { sad += abs(p0[x] - p1[x]); } p0 += PXSTRIDE(p0_stride) * 2; p1 += PXSTRIDE(p1_stride) * 2; } return sad >> bd_min8; } static void sad_refine_mv_c(const 
pixel *const p0, const ptrdiff_t p0_stride,
                            const pixel *const p1, const ptrdiff_t p1_stride,
                            const int w, const int h, const int is_implicit,
                            struct OpflOffset *const o HIGHBD_DECL_SUFFIX)
{
    // Searches integer offsets in [-2, 2]^2, applied with opposite signs to
    // p0 and p1, for the smallest SAD and stores the winner in *o.
    // p0 / p1 are addressed from (2, 2), so both must provide a 2-pixel
    // border on every side of the (w + 4) x (h + 4) SAD window.
    const int bd_min8 = bitdepth_from_max(bitdepth_max) - 8;
    assert(w == 8 || w == 16);
    assert(h == 8 || h == 16);
    const int sadw = w + 4, sadh = h + 4;
    const unsigned sad_thr = sadw * sadh * 2;
    unsigned best_sad = ~0U;
    int best_dx = 0, best_dy = 0;
    if (is_implicit) {
        // centre position: scaled to 7/8 so a neighbour has to be clearly
        // better to win, with an early exit when it is already below threshold
        best_sad = sad_nxn(&p0[2 * PXSTRIDE(p0_stride) + 2], p0_stride,
                           &p1[2 * PXSTRIDE(p1_stride) + 2], p1_stride,
                           sadw, sadh, bd_min8);
        best_sad = (best_sad * 7 + 7) >> 3;
        if (best_sad < sad_thr) goto end;
    }
    // NOTE(review): when !is_implicit the zero offset is never evaluated, so
    // the result is always one of the 24 non-zero offsets -- confirm intended.
    for (int y_off = -2; y_off <= 2; y_off++) {
        for (int x_off = -2; x_off <= 2; x_off++) {
            if (!(x_off | y_off)) continue;
            const unsigned sad =
                sad_nxn(&p0[(2 + y_off) * PXSTRIDE(p0_stride) + (2 + x_off)],
                        p0_stride,
                        &p1[(2 - y_off) * PXSTRIDE(p1_stride) + (2 - x_off)],
                        p1_stride, sadw, sadh, bd_min8);
            // strict improvement only: ties keep the earlier candidate
            if (sad >= best_sad) continue;
            best_sad = sad;
            best_dx = x_off;
            best_dy = y_off;
        }
    }
end:
    assert(best_sad != ~0U);
    o->y = best_dy;
    o->x = best_dx;
}

/*
 * Builds the regression sums for flow-derived sub-pixel offsets: one
 * OpflRegressionData per bs x bs sub-block of the w x h block, written to
 * consecutive entries of out in raster order.
 */
static void opfl_derive_mv_c(struct OpflRegressionData *out,
                             const pixel *p0, const ptrdiff_t p0_stride,
                             const pixel *p1, const ptrdiff_t p1_stride,
                             const int w, const int h, const int bs,
                             const union aliasi16 d HIGHBD_DECL_SUFFIX)
{
#if BITDEPTH != 8
    const int bd_min8 = bitdepth_from_max(bitdepth_max) - 8;
    const int rnd = (1 << bd_min8) >> 1;
#endif
    assert(bs == 4 || bs == 8);
    assert(bs == 8 || (w == 8 && h == 8));
    assert(!(w & (w - 1)));
    assert(h == 8 || h == 16);
    assert(w >= 8 && w <= 64);

    // distance-weighted pixel difference & regular pixel difference
    // (both buffers use a fixed row stride of 64)
    int16_t tmp0[64 * 16], tmp1[64 * 16];
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int p0p = p0[y * PXSTRIDE(p0_stride) + x];
            const int p1p = p1[y * PXSTRIDE(p1_stride) + x];
            const int v = d.i8[0] * p0p + d.i8[1] * p1p;
#if BITDEPTH == 8
            tmp0[y * 64 + x] = v;
            tmp1[y * 64 + x] = p0p - p1p;
#else
            tmp0[y *
64 + x] = (v + rnd - (v < 0)) >> bd_min8;
            tmp1[y * 64 + x] = (p0p - p1p + rnd - (p1p > p0p)) >> bd_min8;
            // (the "- (negative)" term makes the rounding symmetric around 0)
#endif
        }
    }

    // subpel gradient in both directions
    int16_t gx0[64 * 16], gy0[64 * 16];
    // horizontal taps are clamped to the current 16-column strip, vertical
    // taps to the block's rows
    for (int bx = 0; bx < w; bx += 16) {
        const int x_end = imin(bx + 16, w);
        const int min_x = bx & ~15, max_x = x_end - 1;
        const int min_y = 0, max_y = h - 1;
        for (int y = 0; y < h; y++) {
            for (int x = bx; x < x_end; x++) {
                const int p0 = tmp0[y * 64 + imax(min_x, x - 2)];
                const int p1 = tmp0[y * 64 + imax(min_x, x - 1)];
                const int p2 = tmp0[y * 64 + imin(max_x, x + 1)];
                const int p3 = tmp0[y * 64 + imin(max_x, x + 2)];
                // e1: a +-1 neighbour was clamped, so the gradient is doubled
                const int e1 = x + 1 > max_x || x - 1 < min_x;
                const int x0 = ((p2 - p1) * 42 + (p3 - p0) * -5) * (1 + e1);
                gx0[y * 64 + x] = (x0 + 63 + (x0 > 0)) >> 7;

                const int q0 = tmp0[imax(min_y, y - 2) * 64 + x];
                const int q1 = tmp0[imax(min_y, y - 1) * 64 + x];
                const int q2 = tmp0[imin(max_y, y + 1) * 64 + x];
                const int q3 = tmp0[imin(max_y, y + 2) * 64 + x];
                const int e2 = y + 1 > max_y || y - 1 < min_y;
                const int y0 = ((q2 - q1) * 42 + (q3 - q0) * -5) * (1 + e2);
                gy0[y * 64 + x] = (y0 + 63 + (y0 > 0)) >> 7;
            }
        }
    }

    // set up regression data for flow-derived sub-pixel offset
    for (int y = 0; y < h; y += bs) {
        for (int x = 0; x < w; x += bs, out++) {
            // su2 / sv2 start at bs * bs rather than 0
            int su2 = bs * bs, suv = 0, sv2 = bs * bs, suw = 0, svw = 0;
            for (int py = y; py < y + bs; py++) {
                for (int px = x; px < x + bs; px++) {
                    const int u = gx0[py * 64 + px];
                    const int v = gy0[py * 64 + px];
                    // note: this local `w` shadows the block-width parameter
                    const int w = tmp1[py * 64 + px];
                    su2 += u * u;
                    suv += u * v;
                    sv2 += v * v;
                    suw += u * w;
                    svw += v * w;
                }
            }
            out->su2 = su2;
            out->suv = suv;
            out->sv2 = sv2;
            out->suw = suw;
            out->svw = svw;
        }
    }
}

/*
 * Sum of absolute differences over a full 8x8 block (every row), scaled down
 * by (bitdepth - 8) bits.
 */
static unsigned sad8x8_c(const pixel *p0, const ptrdiff_t p0_stride,
                         const pixel *p1, const ptrdiff_t p1_stride
                         HIGHBD_DECL_SUFFIX)
{
    const int bd_min8 = bitdepth_from_max(bitdepth_max) - 8;
    unsigned sad = 0;
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            sad += abs(p0[x] - p1[x]);
        p0 += PXSTRIDE(p0_stride);
        p1 +=
PXSTRIDE(p1_stride); } return sad >> bd_min8; } #if HAVE_ASM #if ARCH_AARCH64 #include "src/arm/mc.h" #elif ARCH_X86 #include "src/x86/mc.h" #endif #endif COLD void bitfn(dav2d_mc_dsp_init)(Dav2dMCDSPContext *const c) { #define init_mc_fns(type, name) do { \ c->mc [type] = put_##name##_c; \ c->mc_scaled [type] = put_##name##_scaled_c; \ c->mct [type] = prep_##name##_c; \ c->mct_scaled[type] = prep_##name##_scaled_c; \ } while (0) init_mc_fns(DAV2D_FILTER_8TAP_REGULAR, 8tap_regular); init_mc_fns(DAV2D_FILTER_8TAP_SHARP, 8tap_sharp); init_mc_fns(DAV2D_FILTER_8TAP_SMOOTH, 8tap_smooth); init_mc_fns(DAV2D_FILTER_BILINEAR, bilin); c->avg = avg_c; c->w_avg = w_avg_c; c->mask = mask_c; c->blend = blend_c; c->w_mask[0] = w_mask_444_c; c->w_mask[1] = w_mask_422_c; c->w_mask[2] = w_mask_420_c; c->warp8x8 = warp_affine_8x8_c; c->warp8x8t = warp_affine_8x8t_c; c->ext_warp4x4 = ext_warp4x4_c; c->ext_warp4x4t = ext_warp4x4t_c; c->emu_edge = emu_edge_c; c->morph = morph_c; c->opfl_derive_mv = opfl_derive_mv_c; c->sad_refine_mv = sad_refine_mv_c; c->sad8x8 = sad8x8_c; #if HAVE_ASM #if ARCH_AARCH64 mc_dsp_init_arm(c); #elif ARCH_X86 mc_dsp_init_x86(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/mem.c000066400000000000000000000240771517466257200215670ustar00rootroot00000000000000/* * Copyright © 2020-2026, VideoLAN and dav2d authors * Copyright © 2020-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

/* NOTE(review): the header name is missing from the next directive in this
 * copy of the file - presumably lost during extraction; confirm upstream. */
#include

#include "src/internal.h"

#if TRACK_HEAP_ALLOCATIONS
/* NOTE(review): header name missing here as well; confirm upstream. */
#include

#include "src/log.h"

/* Bytes reserved in front of every dav2d_malloc()/dav2d_realloc() block for
 * the tracking header. */
#define DEFAULT_ALIGN 16

/* Tracking header stored immediately before the pointer handed to the caller. */
typedef struct {
    size_t sz;                 /* size requested by the caller */
    unsigned align;            /* distance from the real allocation start to the user pointer */
    enum AllocationType type;  /* statistics bucket this allocation is counted in */
} Dav2dAllocationData;

/* Per-type counters; guarded by track_alloc_mutex. */
typedef struct {
    size_t curr_sz;
    size_t peak_sz;
    unsigned num_allocs;
    unsigned num_reuses;
} AllocStats;

static AllocStats tracked_allocs[N_ALLOC_TYPES];
static size_t curr_total_sz;
static size_t peak_total_sz;
static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Record a fresh allocation and return the pointer to hand to the caller.
 *
 * ptr is the start of the real allocation (which must have `align` spare
 * bytes at the front) or NULL. The header is written just below ptr + align
 * and the per-type and global current/peak sizes are updated under the
 * mutex. Returns ptr + align, or NULL if ptr was NULL.
 */
static void *track_alloc(const enum AllocationType type, char *ptr,
                         const size_t sz, const size_t align)
{
    /* the header has to fit in the padding in front of the user pointer */
    assert(align >= sizeof(Dav2dAllocationData));

    if (ptr) {
        ptr += align;
        Dav2dAllocationData *const d = &((Dav2dAllocationData*)ptr)[-1];
        AllocStats *const s = &tracked_allocs[type];

        d->sz = sz;
        d->align = (unsigned)align;
        d->type = type;

        pthread_mutex_lock(&track_alloc_mutex);
        s->num_allocs++;
        s->curr_sz += sz;
        if (s->curr_sz > s->peak_sz)
            s->peak_sz = s->curr_sz;
        curr_total_sz += sz;
        if (curr_total_sz > peak_total_sz)
            peak_total_sz = curr_total_sz;
        pthread_mutex_unlock(&track_alloc_mutex);
    }
    return ptr;
}

/*
 * Remove a tracked allocation from the current-size counters and return the
 * start of the real allocation (user pointer minus the stored align).
 * ptr must be non-NULL: the header in front of it is read unconditionally.
 */
static void *track_free(char *const ptr) {
    const Dav2dAllocationData *const d = &((Dav2dAllocationData*)ptr)[-1];
    const size_t sz = d->sz;

    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[d->type].curr_sz -= sz;
    curr_total_sz -= sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    return ptr - d->align;
}

/* Count one reuse of a pooled buffer for the given allocation type. */
static void dav2d_track_reuse(const enum AllocationType type) {
    pthread_mutex_lock(&track_alloc_mutex);
    tracked_allocs[type].num_reuses++;
    pthread_mutex_unlock(&track_alloc_mutex);
}

/* Tracked malloc(): over-allocates by DEFAULT_ALIGN bytes for the header. */
void *dav2d_malloc(const enum AllocationType type, const size_t sz) {
    void *const ptr = malloc(sz + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

/* Tracked aligned allocation: over-allocates by `align` bytes so the header
 * fits in front of the returned pointer. */
void *dav2d_alloc_aligned(const enum AllocationType type,
                          const size_t sz, const size_t align)
{
    void *const ptr = dav2d_alloc_aligned_internal(sz + align, align);
    return track_alloc(type, ptr, sz, align);
}

/*
 * Tracked realloc() for blocks obtained from dav2d_malloc()/dav2d_realloc().
 * A NULL ptr behaves like dav2d_malloc(). If realloc() fails, NULL is
 * returned and the statistics are left untouched.
 */
void *dav2d_realloc(const enum AllocationType type,
                    void *ptr, const size_t sz)
{
    if (!ptr)
        return dav2d_malloc(type, sz);
    ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN);
    if (ptr)
        /* un-count the old size using the header now located in the new
         * block; track_alloc() below re-counts it with the new size */
        ptr = track_free((char*)ptr + DEFAULT_ALIGN);
    return track_alloc(type, ptr, sz, DEFAULT_ALIGN);
}

/* Free a block returned by dav2d_malloc()/dav2d_realloc(); NULL is ignored. */
void dav2d_free(void *ptr) {
    if (ptr)
        free(track_free(ptr));
}

/* Free a block returned by dav2d_alloc_aligned(); NULL is ignored. */
void dav2d_free_aligned(void *ptr) {
    if (ptr) {
        dav2d_free_aligned_internal(track_free(ptr));
    }
}

/* qsort() comparator: ascending peak_sz. The AllocStats member is the first
 * field of the element type sorted in dav2d_log_alloc_stats(), so the
 * element pointers can be read as AllocStats pointers. */
static COLD int cmp_stats(const void *const a, const void *const b) {
    const size_t a_sz = ((const AllocStats*)a)->peak_sz;
    const size_t b_sz = ((const AllocStats*)b)->peak_sz;
    return a_sz < b_sz ? -1 : a_sz > b_sz;
}

/* Insert spaces as thousands separators for better readability.
 * Recurses on value / 1000 and appends each 3-digit group; returns the
 * number of characters written so far. */
static COLD int format_tsep(char *const s, const size_t n, const size_t value) {
    if (value < 1000)
        return snprintf(s, n, "%u", (unsigned)value);

    const int len = format_tsep(s, n, value / 1000);
    assert((size_t)len < n);
    return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000));
}

/*
 * Log a table of allocation statistics: one row per allocation type that had
 * at least one allocation, largest peak size first, followed by a totals row.
 */
COLD void dav2d_log_alloc_stats(Dav2dContext *const c) {
    static const char *const type_names[N_ALLOC_TYPES] = {
        [ALLOC_BLOCK ] = "Block data",
        [ALLOC_CDEF ] = "CDEF line buffers",
        [ALLOC_CDF ] = "CDF contexts",
        [ALLOC_COEF ] = "Coefficient data",
        [ALLOC_COMMON_CTX] = "Common context data",
        [ALLOC_DAV2DDATA ] = "Dav2dData",
        [ALLOC_IPRED ] = "Intra pred edges",
        [ALLOC_LF ] = "Loopfilter data",
        [ALLOC_LR ] = "Looprestoration data",
        [ALLOC_OBU_HDR ] = "OBU headers",
        [ALLOC_OBU_META ] = "OBU metadata",
        [ALLOC_PAL ] = "Palette data",
        [ALLOC_PIC ] = "Picture buffers",
        [ALLOC_PIC_CTX ] = "Picture context data",
        [ALLOC_REFMVS ] = "Reference mv data",
        [ALLOC_SEGMAP ] = "Segmentation maps",
        [ALLOC_CCSOMAP ] = "CCSO maps",
        [ALLOC_THREAD_CTX] = "Thread context data",
        [ALLOC_TILE ] = "Tile data",
    };

    struct {
        AllocStats stats;
        enum AllocationType type;
    } data[N_ALLOC_TYPES];
    unsigned total_allocs = 0;
    unsigned total_reuses = 0;

    /* take a consistent snapshot of all counters under the lock */
    pthread_mutex_lock(&track_alloc_mutex);
    for (int i = 0; i < N_ALLOC_TYPES; i++) {
        AllocStats *const s = &data[i].stats;
        *s = tracked_allocs[i];
        data[i].type = i;
        total_allocs += s->num_allocs;
        total_reuses += s->num_reuses;
    }
    size_t total_sz = peak_total_sz;
    pthread_mutex_unlock(&track_alloc_mutex);

    /* Sort types by memory usage */
    qsort(&data, N_ALLOC_TYPES, sizeof(*data), cmp_stats);

    /* NOTE(review): total_sz is not checked for zero before this division -
     * confirm that is acceptable for a run without any tracked bytes */
    const double inv_total_share = 100.0 / total_sz;
    char total_sz_buf[32];
    /* width of the size column: formatted total plus 4 characters of padding */
    const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz);

    dav2d_log(c, "\n Type Allocs Reuses Share Peak size\n"
                 "---------------------------------------------------------------------\n");
    /* the sort is ascending, so walk backwards to print the largest first */
    for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) {
        const AllocStats *const s = &data[i].stats;
        if (s->num_allocs) {
            const double share = s->peak_sz * inv_total_share;
            char sz_buf[32];
            format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz);
            dav2d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type],
                      s->num_allocs, s->num_reuses, share, sz_len, sz_buf);
        }
    }
    dav2d_log(c, "---------------------------------------------------------------------\n"
                 "%31u%10u %s\n",
              total_allocs, total_reuses, total_sz_buf);
}
#endif /* TRACK_HEAP_ALLOCATIONS */

/* Destroy the pool's mutex and free the pool object itself. Called by
 * whichever path drops ref_cnt to zero. */
static COLD void mem_pool_destroy(Dav2dMemPool *const pool) {
    pthread_mutex_destroy(&pool->lock);
    dav2d_free(pool);
}

/*
 * Return a buffer obtained from dav2d_mem_pool_pop() to the pool; NULL is
 * ignored. While the pool is live the buffer is pushed onto the free list.
 * Once dav2d_mem_pool_end() has been called the buffer is freed instead, and
 * the pool is destroyed when the last reference goes away.
 */
void dav2d_mem_pool_push(Dav2dMemPool *const pool, void *const ptr) {
    if (!ptr)
        return;

    pthread_mutex_lock(&pool->lock);
    /* the pool header lives 64 bytes before the pointer given to users */
    Dav2dMemPoolBuffer *const buf = (Dav2dMemPoolBuffer*)((uintptr_t)ptr - 64);
    /* ref_cnt is sampled under the lock and used after unlocking */
    const int ref_cnt = --pool->ref_cnt;
    if (!pool->end) {
        buf->next = pool->buf;
        pool->buf = buf;
        pthread_mutex_unlock(&pool->lock);
        /* a live pool still holds the reference set in dav2d_mem_pool_init() */
        assert(ref_cnt > 0);
    } else {
        pthread_mutex_unlock(&pool->lock);
        dav2d_free_aligned(buf);
        if (!ref_cnt) mem_pool_destroy(pool);
    }
}

/*
 * Get a buffer of `size` usable bytes from the pool. The head of the free
 * list is reused when its recorded size matches; otherwise it is freed and a
 * new 64-byte-aligned buffer (with a 64-byte header) is allocated.
 * Returns NULL on allocation failure.
 */
void *dav2d_mem_pool_pop(Dav2dMemPool *const pool, const size_t size) {
    pthread_mutex_lock(&pool->lock);
    Dav2dMemPoolBuffer *buf = pool->buf;
    /* every outstanding buffer holds one reference on the pool */
    pool->ref_cnt++;

    if (buf) {
        pool->buf = buf->next;
        pthread_mutex_unlock(&pool->lock);
        if (buf->size != size) {
            /* Reallocate if the size has changed */
            dav2d_free_aligned(buf);
            goto alloc;
        }
#if TRACK_HEAP_ALLOCATIONS
        dav2d_track_reuse(pool->type);
#endif
    } else {
        pthread_mutex_unlock(&pool->lock);
alloc:
        buf = dav2d_alloc_aligned(pool->type, size + 64, 64);
        if (!buf) {
            /* undo the reference taken above; this may have been the last one */
            pthread_mutex_lock(&pool->lock);
            const int ref_cnt = --pool->ref_cnt;
            pthread_mutex_unlock(&pool->lock);
            if (!ref_cnt) mem_pool_destroy(pool);
            return NULL;
        }
        buf->size = size;
    }

    return (void*)((uintptr_t)buf + 64);
}

/*
 * Allocate and initialise an empty pool holding one reference. On success
 * *ppool is set and 0 is returned; on failure *ppool is set to NULL and
 * DAV2D_ERR(ENOMEM) is returned.
 */
COLD int dav2d_mem_pool_init(const enum AllocationType type,
                             Dav2dMemPool **const ppool)
{
    Dav2dMemPool *const pool = dav2d_malloc(ALLOC_COMMON_CTX,
                                            sizeof(Dav2dMemPool));
    if (pool) {
        if (!pthread_mutex_init(&pool->lock, NULL)) {
            pool->buf = NULL;
            pool->ref_cnt = 1;
            pool->end = 0;
#if TRACK_HEAP_ALLOCATIONS
            pool->type = type;
#endif
            *ppool = pool;
            return 0;
        }
        dav2d_free(pool);
    }
    *ppool = NULL;
    return DAV2D_ERR(ENOMEM);
}

/*
 * Shut a pool down; NULL is ignored. Drops the reference taken at init,
 * marks the pool as ended, and frees every buffer on the free list. The pool
 * object is destroyed here if no buffers are outstanding; otherwise the last
 * dav2d_mem_pool_push() destroys it.
 */
COLD void dav2d_mem_pool_end(Dav2dMemPool *const pool) {
    if (pool) {
        pthread_mutex_lock(&pool->lock);
        /* detach the whole free list so it can be freed outside the lock */
        Dav2dMemPoolBuffer *buf = pool->buf;
        const int ref_cnt = --pool->ref_cnt;
        pool->buf = NULL;
        pool->end = 1;
        pthread_mutex_unlock(&pool->lock);

        while (buf) {
            void *const ptr = buf;
            /* read the link before freeing the node */
            buf = buf->next;
            dav2d_free_aligned(ptr);
        }
        if (!ref_cnt) mem_pool_destroy(pool);
    }
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/mem.h000066400000000000000000000112171517466257200215640ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_MEM_H
#define DAV2D_SRC_MEM_H

/* Set to 1 to build the allocation-statistics wrappers in mem.c instead of
 * the pass-through macros defined further down. */
#define TRACK_HEAP_ALLOCATIONS 0

/* NOTE(review): the header name is missing from the next directive in this
 * copy of the file - presumably lost during extraction; confirm upstream. */
#include

#if defined(_WIN32) || HAVE_MEMALIGN
/* NOTE(review): header name missing here as well; confirm upstream. */
#include
#endif

#include "dav2d/dav2d.h"

#include "common/attributes.h"

#include "src/thread.h"

/* Buckets used to attribute allocations in the tracking statistics. */
enum AllocationType {
    ALLOC_BLOCK,
    ALLOC_CDEF,
    ALLOC_CDF,
    ALLOC_COEF,
    ALLOC_COMMON_CTX,
    ALLOC_DAV2DDATA,
    ALLOC_IPRED,
    ALLOC_LF,
    ALLOC_LR,
    ALLOC_OBU_HDR,
    ALLOC_OBU_META,
    ALLOC_PAL,
    ALLOC_PIC,
    ALLOC_PIC_CTX,
    ALLOC_REFMVS,
    ALLOC_SEGMAP,
    ALLOC_CCSOMAP,
    ALLOC_THREAD_CTX,
    ALLOC_TILE,
    N_ALLOC_TYPES,
};

/* Header placed at the start of every pooled buffer. */
typedef struct Dav2dMemPoolBuffer {
    struct Dav2dMemPoolBuffer *next; /* next entry on the pool's free list */
    size_t size;                     /* usable size the buffer was allocated with */
} Dav2dMemPoolBuffer;

/* Free list of same-sized buffers; the fields are accessed under `lock` in
 * mem.c. */
typedef struct Dav2dMemPool {
    pthread_mutex_t lock;
    Dav2dMemPoolBuffer *buf; /* head of the free list */
    int ref_cnt;             /* 1 for the pool itself + 1 per outstanding buffer */
    int end;                 /* set once dav2d_mem_pool_end() has been called */
#if TRACK_HEAP_ALLOCATIONS
    enum AllocationType type;
#endif
} Dav2dMemPool;

// TODO: Move this to a common location?
// Round x up to a multiple of a; the mask form is only valid for
// power-of-two a.
#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1))

/*
 * Allocate align-byte aligned memory. The return value can be released
 * by calling the dav2d_free_aligned() function.
 *
 * align must be a power of two (asserted). The backend is selected at
 * compile time; the last fallback over-allocates with malloc() and stores
 * the original pointer just before the aligned address.
 */
static inline void *dav2d_alloc_aligned_internal(const size_t sz, const size_t align) {
    assert(!(align & (align - 1)));
#ifdef _WIN32
    return _aligned_malloc(sz, align);
#elif HAVE_POSIX_MEMALIGN
    void *ptr;
    if (posix_memalign(&ptr, align, sz)) return NULL;
    return ptr;
#elif HAVE_MEMALIGN
    return memalign(align, sz);
#elif HAVE_ALIGNED_ALLOC
    // The C11 standard specifies that the size parameter
    // must be an integral multiple of alignment.
    return aligned_alloc(align, ROUND_UP(sz, align));
#else
    // room for the worst-case alignment padding plus the saved base pointer
    void *const buf = malloc(sz + align + sizeof(void *));
    if (!buf) return NULL;

    void *const ptr = (void *)(((uintptr_t)buf + sizeof(void *) + align - 1) & ~(align - 1));
    // remember the malloc() result so it can be passed to free() later
    ((void **)ptr)[-1] = buf;
    return ptr;
#endif
}

/* Release memory obtained from dav2d_alloc_aligned_internal(), using the
 * counterpart of whichever backend was compiled in. */
static inline void dav2d_free_aligned_internal(void *ptr) {
#ifdef _WIN32
    _aligned_free(ptr);
#elif HAVE_POSIX_MEMALIGN || HAVE_MEMALIGN || HAVE_ALIGNED_ALLOC
    free(ptr);
#else
    // fallback layout: the real allocation start is stored just before ptr
    if (ptr) free(((void **)ptr)[-1]);
#endif
}

#if TRACK_HEAP_ALLOCATIONS
/* Tracking build: real functions implemented in mem.c. */
void *dav2d_malloc(enum AllocationType type, size_t sz);
void *dav2d_realloc(enum AllocationType type, void *ptr, size_t sz);
void *dav2d_alloc_aligned(enum AllocationType type, size_t sz, size_t align);
void dav2d_free(void *ptr);
void dav2d_free_aligned(void *ptr);
void dav2d_log_alloc_stats(Dav2dContext *c);
#else
/* Non-tracking build: the type argument is dropped by the preprocessor.
 * The dav2d_mem_pool_init macro is function-like, so it also rewrites the
 * prototype below (and any definition that includes this header) to the
 * one-parameter form. */
#define dav2d_mem_pool_init(type, pool) dav2d_mem_pool_init(pool)
#define dav2d_malloc(type, sz) malloc(sz)
#define dav2d_realloc(type, ptr, sz) realloc(ptr, sz)
#define dav2d_alloc_aligned(type, sz, align) dav2d_alloc_aligned_internal(sz, align)
#define dav2d_free(ptr) free(ptr)
#define dav2d_free_aligned(ptr) dav2d_free_aligned_internal(ptr)
#endif /* TRACK_HEAP_ALLOCATIONS */

void dav2d_mem_pool_push(Dav2dMemPool *pool, void *ptr);
void *dav2d_mem_pool_pop(Dav2dMemPool *pool, size_t size);
int dav2d_mem_pool_init(enum AllocationType type, Dav2dMemPool **pool);
void dav2d_mem_pool_end(Dav2dMemPool *pool);

/* Free the aligned allocation that *ptr points to and reset it to NULL.
 * ptr is the address of the pointer variable, passed as void * so any
 * pointer type can be used; a NULL *ptr is left as is. */
static inline void dav2d_freep_aligned(void *ptr) {
    void **mem = (void **) ptr;
    if (*mem) {
        dav2d_free_aligned(*mem);
        *mem = NULL;
    }
}

#endif /* DAV2D_SRC_MEM_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/meson.build000066400000000000000000000270101517466257200227750ustar00rootroot00000000000000# Copyright © 2018-2019, VideoLAN and dav2d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1.
Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#
# Build definition for the dav2d library
#

# libdav2d source files
# (bitdepth-independent; compiled once into the final library)
libdav2d_sources = files(
    'cdf.c',
    'cpu.c',
    'ctx.c',
    'data.c',
    'decode.c',
    'dip_tables.c',
    'getbits.c',
    'gdf_tables.c',
    'ibp.c',
    'itx_1d.c',
    'lf_mask.c',
    'lib.c',
    'log.c',
    'mem.c',
    'msac.c',
    'obu.c',
    'pal.c',
    'picture.c',
    'quantizer.c',
    'ref.c',
    'refmvs.c',
    'scan.c',
    'stx_tables.c',
    'tables.c',
    'thread_task.c',
    'warpmv.c',
    'wedge.c',
)

# libdav2d bitdepth source files
# These files are compiled for each bitdepth with
# `BITDEPTH` defined to the currently built bitdepth.
libdav2d_tmpl_sources = files(
    'ccso_tmpl.c',
    'cdef_apply_tmpl.c',
    'cdef_tmpl.c',
    'db_apply_tmpl.c',
    'deblock_tmpl.c',
    'fg_apply_tmpl.c',
    'filmgrain_tmpl.c',
    'ipred_prepare_tmpl.c',
    'ipred_tmpl.c',
    'itx_tmpl.c',
    'looprestoration_tmpl.c',
    'lr_apply_tmpl.c',
    'mc_tmpl.c',
    'recon_tmpl.c',
    'stx_tmpl.c',
)

# Per-sub-architecture template sources (dict: subarch name -> files);
# only filled in by the ppc64le branch below.
libdav2d_arch_tmpl_sources = {}

libdav2d_bitdepth_objs = []

# ASM specific sources
libdav2d_asm_objs = []
# Arch-specific flags
# (dict: subarch name -> extra c_args, looked up when building the
# per-subarch helper libraries further down)
arch_flags = {}
if is_asm_enabled
    if (host_machine.cpu_family() == 'aarch64' or
        host_machine.cpu_family().startswith('arm'))

        libdav2d_sources += files(
            'arm/cpu.c',
        )
        if (host_machine.cpu_family() == 'aarch64' or
            host_machine.cpu() == 'arm64')
            libdav2d_sources_asm = files(
                # itx.S is used for both 8 and 16 bpc.
                'arm/64/itx.S',
                'arm/64/msac.S',
                'arm/64/refmvs.S',
            )

            if dav2d_bitdepths.contains('8')
                libdav2d_sources_asm += files(
                    'arm/64/cdef.S',
                    'arm/64/filmgrain.S',
                    'arm/64/ipred.S',
                    'arm/64/loopfilter.S',
                    'arm/64/mc.S',
                    'arm/64/mc_dotprod.S',
                )
            endif

            if dav2d_bitdepths.contains('16')
                libdav2d_sources_asm += files(
                    'arm/64/cdef16.S',
                    'arm/64/filmgrain16.S',
                    'arm/64/ipred16.S',
                    'arm/64/itx16.S',
                    'arm/64/loopfilter16.S',
                    'arm/64/mc16.S',
                    'arm/64/mc16_sve.S',
                )
            endif
        elif host_machine.cpu_family().startswith('arm')
            libdav2d_sources_asm = files(
                # itx.S is used for both 8 and 16 bpc.
                'arm/32/itx.S',
                'arm/32/msac.S',
                'arm/32/refmvs.S',
            )

            if dav2d_bitdepths.contains('8')
                libdav2d_sources_asm += files(
                    'arm/32/cdef.S',
                    'arm/32/filmgrain.S',
                    'arm/32/ipred.S',
                    'arm/32/loopfilter.S',
                    'arm/32/mc.S',
                )
            endif

            if dav2d_bitdepths.contains('16')
                libdav2d_sources_asm += files(
                    'arm/32/cdef16.S',
                    'arm/32/filmgrain16.S',
                    'arm/32/ipred16.S',
                    'arm/32/itx16.S',
                    'arm/32/loopfilter16.S',
                    'arm/32/mc16.S',
                )
            endif
        endif

        # With gas-preprocessor the .S files go through the gaspp generator;
        # otherwise they are compiled as ordinary library sources.
        if use_gaspp
            libdav2d_asm_objs = gaspp_gen.process(libdav2d_sources_asm)
        else
            libdav2d_sources += libdav2d_sources_asm
        endif
    elif host_machine.cpu_family().startswith('x86')

        libdav2d_sources += files(
            'x86/cpu.c',
        )

        # NASM source files
        libdav2d_sources_asm = files(
            'x86/cpuid.asm',
            'x86/msac.asm',
            'x86/pal.asm',
            'x86/refmvs.asm',
            'x86/cdef_avx2.asm',
            'x86/cdef_sse.asm',
        )

        if dav2d_bitdepths.contains('8')
            libdav2d_sources_asm += files(
                'x86/cdef_avx512.asm',
                'x86/filmgrain_avx512.asm',
                'x86/ipred_avx512.asm',
                'x86/loopfilter_avx512.asm',
                'x86/mc_avx512.asm',
                'x86/filmgrain_avx2.asm',
                'x86/ipred_avx2.asm',
                'x86/itx_avx2.asm',
                'x86/deblock_avx2.asm',
                'x86/mc_avx2.asm',
                'x86/filmgrain_sse.asm',
                'x86/ipred_sse.asm',
                'x86/loopfilter_sse.asm',
                'x86/mc_sse.asm',
            )
        endif

        if dav2d_bitdepths.contains('16')
            libdav2d_sources_asm += files(
                'x86/cdef16_avx512.asm',
                'x86/filmgrain16_avx512.asm',
                'x86/ipred16_avx512.asm',
                'x86/loopfilter16_avx512.asm',
                'x86/mc16_avx512.asm',
                'x86/cdef16_avx2.asm',
                'x86/filmgrain16_avx2.asm',
                'x86/ipred16_avx2.asm',
                'x86/loopfilter16_avx2.asm',
                'x86/mc16_avx2.asm',
                'x86/cdef16_sse.asm',
                'x86/filmgrain16_sse.asm',
                'x86/ipred16_sse.asm',
                'x86/loopfilter16_sse.asm',
                'x86/mc16_sse.asm',
            )
        endif

        # Compile the ASM sources with NASM
        libdav2d_asm_objs = nasm_gen.process(libdav2d_sources_asm)
    elif host_machine.cpu_family().startswith('loongarch')
        libdav2d_sources += files(
            'loongarch/cpu.c',
        )

        libdav2d_sources_asm = files(
            'loongarch/cdef.S',
            'loongarch/ipred.S',
            'loongarch/mc.S',
            'loongarch/loopfilter.S',
            'loongarch/msac.S',
            'loongarch/refmvs.S',
            'loongarch/itx.S',
        )
        # NOTE(review): unlike the arm branch, the .S sources are appended to
        # libdav2d_asm_objs here rather than to libdav2d_sources - confirm
        # this is intended.
        libdav2d_asm_objs += libdav2d_sources_asm
    elif host_machine.cpu() == 'ppc64le'
        # Each sub-arch gets its own flags and its own set of template files.
        arch_flags += {'vsx': ['-maltivec', '-mvsx', '-DDAV2D_VSX']}
        libdav2d_sources += files(
            'ppc/cpu.c',
        )
        libdav2d_arch_tmpl_sources += {'vsx': files(
            'ppc/cdef_tmpl.c',
        )}
        arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV2D_PWR9']}
        libdav2d_arch_tmpl_sources += {'pwr9': files(
            'ppc/itx_tmpl.c',
            'ppc/loopfilter_tmpl.c',
            'ppc/mc_tmpl.c',
        )}
    elif host_machine.cpu_family().startswith('riscv')
        libdav2d_sources += files(
            'riscv/cpu.c',
        )
        if host_machine.cpu_family() == 'riscv64'
            # the riscv .S files are added directly to the library sources
            libdav2d_sources += files(
                'riscv/64/cpu.S',
                'riscv/64/pal.S',
            )

            if dav2d_bitdepths.contains('8')
                libdav2d_sources += files(
                    'riscv/64/cdef.S',
                    'riscv/64/ipred.S',
                    'riscv/64/itx.S',
                    'riscv/64/mc.S',
                )
            endif

            if dav2d_bitdepths.contains('16')
                libdav2d_sources += files(
                    'riscv/64/cdef16.S',
                    'riscv/64/ipred16.S',
                    'riscv/64/mc16.S',
                )
            endif
        endif
    endif
endif



libdav2d_rc_obj = []
libdav2d_flags = []
api_export_flags = []

#
# Windows .rc file and API export flags
#

if host_machine.system() == 'windows'
    # resource file and DLL export define are only used for non-static builds
    if get_option('default_library') != 'static'
        rc_file = configure_file(
            input : 'dav2d.rc.in',
            output : 'dav2d.rc',
            configuration : rc_data
        )

        libdav2d_rc_obj = winmod.compile_resources(rc_file)

        api_export_flags = ['-DDAV2D_BUILDING_DLL']
    endif

    if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc')
        # We don't expect to reference data members from other DLLs without
        # dllimport attributes. Set the -mcmodel=small flag, which avoids
        # generating indirection via .refptr. for all potentially
        # dllimported variable references.
        libdav2d_flags += '-mcmodel=small'
    endif
endif



#
# Library definitions
#

# Helper library for each bitdepth
# NOTE(review): libdav2d_bitdepth_objs was already initialised to [] near the
# top of this file; this second assignment looks redundant - confirm.
libdav2d_bitdepth_objs = []
foreach bitdepth : dav2d_bitdepths
    libdav2d_bitdepth_objs += static_library(
        'dav2d_bitdepth_@0@'.format(bitdepth),
        libdav2d_tmpl_sources, config_h_target,
        include_directories: dav2d_inc_dirs,
        dependencies : [stdatomic_dependencies],
        c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav2d_flags,
        install : false,
        build_by_default : false,
    ).extract_all_objects(recursive: true)
endforeach

# Helper library for each bitdepth and architecture-specific flags
foreach bitdepth : dav2d_bitdepths
    foreach subarch : libdav2d_arch_tmpl_sources.keys()
        libdav2d_bitdepth_objs += static_library(
            'dav2d_arch_bitdepth_@0@_@1@'.format(bitdepth,subarch),
            libdav2d_arch_tmpl_sources[subarch], config_h_target,
            include_directories: dav2d_inc_dirs,
            dependencies : [stdatomic_dependencies],
            c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav2d_flags + arch_flags.get(subarch, []),
            install : false,
            build_by_default : false,
        ).extract_all_objects(recursive: true)
    endforeach
endforeach

# The final dav2d library
# (no soversion is set on Windows)
if host_machine.system() == 'windows'
    dav2d_soversion = ''
else
    dav2d_soversion = dav2d_api_version_major
endif

libdav2d = library('dav2d',
    libdav2d_sources,
    libdav2d_asm_objs,
    libdav2d_rc_obj,
    rev_target,
    config_h_target,

    objects : [
        libdav2d_bitdepth_objs,
    ],

    include_directories : dav2d_inc_dirs,
    dependencies : [
        stdatomic_dependencies,
        thread_dependency,
        thread_compat_dep,
        libdl_dependency,
    ],
    c_args : [libdav2d_flags, api_export_flags],
    version : dav2d_soname_version,
    soversion : dav2d_soversion,
    install : true,
)

# Dependency object that links against the library and adds ../include
dav2d_dep = declare_dependency(link_with: libdav2d,
    include_directories : include_directories('../include')
)

#
# Generate pkg-config .pc file
#
pkg_mod = import('pkgconfig')
pkg_mod.generate(libraries: libdav2d,
    version: meson.project_version(),
    name: 'libdav2d',
    filebase: 'dav2d',
    description: 'AV2 decoding library'
)
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/msac.c000066400000000000000000000215361517466257200217310ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"

/* NOTE(review): the header name is missing from the next directive in this
 * copy of the file - presumably lost during extraction; confirm upstream. */
#include

#include "common/intops.h"

#include "src/msac.h"

/*
 * CDF adaptation shift amounts. The first index is the high byte of the CDF
 * counter word (pc >> 8) and the second is count >> 4, as used by the
 * *_adapt_c() decoders below. The 125 rows enumerate 5 x 5 x 5 combinations:
 * the first column changes every 25 rows, the second every 5 rows and the
 * third every row.
 */
const uint8_t dav2d_msac_rate[125][3] = {
    { 4, 5, 6 }, { 4, 5, 5 }, { 4, 5, 4 }, { 4, 5, 7 }, { 4, 5, 7 },
    { 4, 4, 6 }, { 4, 4, 5 }, { 4, 4, 4 }, { 4, 4, 7 }, { 4, 4, 7 },
    { 4, 3, 6 }, { 4, 3, 5 }, { 4, 3, 4 }, { 4, 3, 7 }, { 4, 3, 7 },
    { 4, 6, 6 }, { 4, 6, 5 }, { 4, 6, 4 }, { 4, 6, 7 }, { 4, 6, 7 },
    { 4, 6, 6 }, { 4, 6, 5 }, { 4, 6, 4 }, { 4, 6, 7 }, { 4, 6, 7 },
    { 3, 5, 6 }, { 3, 5, 5 }, { 3, 5, 4 }, { 3, 5, 7 }, { 3, 5, 7 },
    { 3, 4, 6 }, { 3, 4, 5 }, { 3, 4, 4 }, { 3, 4, 7 }, { 3, 4, 7 },
    { 3, 3, 6 }, { 3, 3, 5 }, { 3, 3, 4 }, { 3, 3, 7 }, { 3, 3, 7 },
    { 3, 6, 6 }, { 3, 6, 5 }, { 3, 6, 4 }, { 3, 6, 7 }, { 3, 6, 7 },
    { 3, 6, 6 }, { 3, 6, 5 }, { 3, 6, 4 }, { 3, 6, 7 }, { 3, 6, 7 },
    { 2, 5, 6 }, { 2, 5, 5 }, { 2, 5, 4 }, { 2, 5, 7 }, { 2, 5, 7 },
    { 2, 4, 6 }, { 2, 4, 5 }, { 2, 4, 4 }, { 2, 4, 7 }, { 2, 4, 7 },
    { 2, 3, 6 }, { 2, 3, 5 }, { 2, 3, 4 }, { 2, 3, 7 }, { 2, 3, 7 },
    { 2, 6, 6 }, { 2, 6, 5 }, { 2, 6, 4 }, { 2, 6, 7 }, { 2, 6, 7 },
    { 2, 6, 6 }, { 2, 6, 5 }, { 2, 6, 4 }, { 2, 6, 7 }, { 2, 6, 7 },
    { 5, 5, 6 }, { 5, 5, 5 }, { 5, 5, 4 }, { 5, 5, 7 }, { 5, 5, 7 },
    { 5, 4, 6 }, { 5, 4, 5 }, { 5, 4, 4 }, { 5, 4, 7 }, { 5, 4, 7 },
    { 5, 3, 6 }, { 5, 3, 5 }, { 5, 3, 4 }, { 5, 3, 7 }, { 5, 3, 7 },
    { 5, 6, 6 }, { 5, 6, 5 }, { 5, 6, 4 }, { 5, 6, 7 }, { 5, 6, 7 },
    { 5, 6, 6 }, { 5, 6, 5 }, { 5, 6, 4 }, { 5, 6, 7 }, { 5, 6, 7 },
    { 5, 5, 6 }, { 5, 5, 5 }, { 5, 5, 4 }, { 5, 5, 7 }, { 5, 5, 7 },
    { 5, 4, 6 }, { 5, 4, 5 }, { 5, 4, 4 }, { 5, 4, 7 }, { 5, 4, 7 },
    { 5, 3, 6 }, { 5, 3, 5 }, { 5, 3, 4 }, { 5, 3, 7 }, { 5, 3, 7 },
    { 5, 6, 6 }, { 5, 6, 5 }, { 5, 6, 4 }, { 5, 6, 7 }, { 5, 6, 7 },
    { 5, 6, 6 }, { 5, 6, 5 }, { 5, 6, 4 }, { 5, 6, 7 }, { 5, 6, 7 },
};

/*
 * Per-symbol offsets subtracted from the CDF in the symbol decoder, indexed
 * by [n_symbols - 1][symbol]. Entries at and beyond n_symbols are 65535,
 * which clamps the computed probability to 0 in
 * dav2d_msac_decode_symbol_adapt_c() and thereby ends its search loop.
 */
const uint16_t ALIGN(dav2d_msac_min_prob[7][8], 16) = {
    { 63, 65535, 65535, 65535, 65535, 65535, 65535, 65535 },
    { 47, 87, 65535, 65535, 65535, 65535, 65535, 65535 },
    { 31, 63, 95, 65535, 65535, 65535, 65535, 65535 },
    { 31, 55, 79, 103, 65535, 65535, 65535, 65535 },
    { 23, 47, 63, 87, 111, 65535, 65535, 65535 },
    { 23, 39, 55, 79, 95, 111, 65535, 65535 },
    { 15, 31, 47, 63, 79, 95, 111, 65535 },
};

/*
 * Pull more input bytes into the dif window. Each byte is XORed into dif at
 * bit position c, which starts at 40 - cnt and drops by 8 per byte; reading
 * stops when c goes negative or the input is exhausted. cnt is then
 * recomputed from the final c.
 */
static inline void ctx_refill(MsacContext *const s) {
    const uint8_t *buf_pos = s->buf_pos;
    const uint8_t *buf_end = s->buf_end;
    int c = 40 - s->cnt;
    uint64_t dif = s->dif;
    do {
        /* at end of input the remaining window bits are left unchanged */
        if (buf_pos >= buf_end)
            break;
        dif ^= (uint64_t)*buf_pos++ << c;
        c -= 8;
    } while (c >= 0);
    s->dif = dif;
    s->cnt = 40 - c;
    s->buf_pos = buf_pos;
}

/*
 * Decode n_bits (1..32) bypass bits in one call.
 * Returns the bits packed MSB-first, first decoded bit on top.
 */
unsigned dav2d_msac_decode_bools_bypass_c(MsacContext *const s, const unsigned n_bits) {
    assert(n_bits > 0 && n_bits <= 32);
    if ((unsigned)s->cnt < n_bits)
        ctx_refill(s);
    const uint64_t r = s->rng;
    uint64_t dif = s->dif;
    assert(!(r & 1));
    assert((dif >> 48) < r);
    /* threshold for the first bit: half of the range, aligned with the top
     * 16 bits of dif; halved again for every following bit */
    uint64_t vw = r << 47;
    unsigned ret = 0;
    for (unsigned n = 0; n < n_bits; n++) {
        ret <<= 1;
        /* a bit decodes as 1 when dif is below the threshold */
        if (dif >= vw)
            dif -= vw;
        else
            ret |= 1;
        vw >>= 1;
    }
    /* shift out the consumed bits, filling the low bits with ones */
    s->dif = ((dif + 1) << n_bits) - 1;
    s->cnt -= n_bits;
    return ret;
}

/*
 * Decode a unary-coded bypass value of at most max_bits bits.
 * Returns the number of leading iterations in which dif was at or above the
 * threshold, capped at max_bits.
 */
unsigned dav2d_msac_decode_unary_bypass_c(MsacContext *const s, const unsigned max_bits) {
    assert(max_bits == 5 || max_bits == 6 || max_bits == 21);
    if ((unsigned)s->cnt < max_bits)
        ctx_refill(s);
    const uint64_t r = s->rng;
    uint64_t dif = s->dif;
    assert(!(r & 1));
    assert((dif >> 48) < r);
    uint64_t vw = r << 47;
    unsigned ret = 0, bit;
    for (bit = 0; bit < max_bits; bit++) {
        if (dif >= vw) {
            dif -= vw;
            vw >>= 1;
            ret++;
        } else {
            /* the terminating bit is consumed as well */
            bit++;
            break;
        }
    }
    /* bit now holds the number of bits consumed */
    s->dif = ((dif + 1) << bit) - 1;
    s->cnt -= bit;
    return ret;
}

/* Takes updated dif and range values, renormalizes them so that
 * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
 * necessary), and stores them back in the decoder context.
 * dif: The new value of dif.
 * rng: The new value of the range.
 */
static inline void ctx_norm(MsacContext *const s, const uint64_t dif,
                            const unsigned rng)
{
    /* shift needed to bring the top set bit of rng up to bit 15 */
    const unsigned d = 15 ^ (31 ^ clz(rng));
    const unsigned cnt = s->cnt;
    assert(rng <= 65535U);
    s->dif = ((dif + 1) << d) - 1;
    s->rng = rng << d;
    s->cnt = cnt - d;
    if (cnt < d) // unsigned compare avoids redundant refills at eob
        ctx_refill(s);
}

/* Decode a single binary value.
 * f: The probability that the bit is one
 * Return: The value decoded (0 or 1).
 */
static unsigned dav2d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
    const unsigned r = s->rng;
    uint64_t dif = s->dif;
    assert((dif >> 48) < r);
    /* quantize f and split the range at v */
    const unsigned p = ((f >> 7) << 4) + 8;
    unsigned v = ((r >> 8) * p >> 7) << 3;
    const uint64_t vw = (uint64_t)v << 48;
    const unsigned ret = dif >= vw;
    /* branchless select: when ret is set, subtract the threshold from dif
     * and continue with the upper sub-range (r - v) instead of v */
    dif -= ret * vw;
    v += ret * (r - 2 * v);
    ctx_norm(s, dif, v);
    return !ret;
}

/* Decodes a symbol given an inverse cumulative distribution function (CDF)
 * table in Q15.
 * cdf holds n_symbols entries followed by a counter word at cdf[n_symbols];
 * when s->allow_update_cdf is set, both are adapted after decoding.
 */
unsigned dav2d_msac_decode_symbol_adapt_c(MsacContext *const s,
                                          uint16_t *const cdf,
                                          const size_t n_symbols)
{
    const unsigned c = s->dif >> 48, r = s->rng >> 8;
    unsigned u, v = s->rng, val = -1;
    /* NOTE(review): the row pointer is formed before the n_symbols bound is
     * asserted, and there is no lower-bound check - confirm callers never
     * pass 0. */
    const uint16_t *const min_prob = dav2d_msac_min_prob[n_symbols - 1];
    assert(n_symbols <= 7);
    /* search for the first symbol whose lower range bound v is <= c;
     * u keeps the previous bound, i.e. the top of the selected interval */
    do {
        val++;
        u = v;
        const unsigned p = imax((cdf[val] | 127) - min_prob[val], 0);
        v = (r * p >> 10) << 3;
    } while (c < v);
    assert(u <= s->rng);
    ctx_norm(s, s->dif - ((uint64_t)v << 48), u - v);

    if (s->allow_update_cdf) {
        /* counter word: low byte = adaptation count, high byte = rate row */
        const unsigned pc = cdf[n_symbols];
        const unsigned count = (uint8_t)pc;
        assert(count <= 32);
        const int rate = dav2d_msac_rate[pc >> 8][count >> 4] + (n_symbols > 2);
        unsigned i;
        /* entries below the decoded symbol move towards 32768 ... */
        for (i = 0; i < val; i++)
            cdf[i] += (32768 - cdf[i]) >> rate;
        /* ... and entries from the decoded symbol onwards move towards 0 */
        for (; i < n_symbols; i++)
            cdf[i] -= cdf[i] >> rate;
        /* the count saturates at 32 */
        cdf[n_symbols] = pc + (count < 32);
    }
    return val;
}

/*
 * Decode one adaptive boolean. cdf[0] is the probability passed to the bool
 * decoder and cdf[1] is the counter word; both are updated when
 * s->allow_update_cdf is set.
 */
unsigned dav2d_msac_decode_bool_adapt_c(MsacContext *const s,
                                        uint16_t *const cdf)
{
    const unsigned bit = dav2d_msac_decode_bool_c(s, *cdf);

    if (s->allow_update_cdf) {
        // update_cdf() specialized for boolean CDFs
        const unsigned pc = cdf[1];
        const unsigned count = (uint8_t)pc;
        const int rate = dav2d_msac_rate[pc >> 8][count >> 4];
        if (bit)
            cdf[0] += (32768 - cdf[0]) >> rate;
        else
            cdf[0] -= cdf[0] >> rate;
        cdf[1] = pc + (count < 32);
    }
    return bit;
}

/*
 * Initialise a decoder context over the sz bytes at data and prime the dif
 * window with the first input bytes.
 * disable_cdf_update_flag: when non-zero, the *_adapt_c() decoders leave the
 * CDFs untouched.
 */
void dav2d_msac_init(MsacContext *const s, const uint8_t *const data,
                     const size_t sz, const int disable_cdf_update_flag)
{
    s->buf_pos = data;
    s->buf_end = data + sz;
    s->dif = (~(uint64_t)0) >> 1;
    s->rng = 0x8000;
    s->cnt = -15;
    s->allow_update_cdf = !disable_cdf_update_flag;
    ctx_refill(s);

#if HAVE_ASM && ARCH_X86_64
    /* C defaults for the function pointers, set before the x86 init runs */
    s->unary_bypass6 = dav2d_msac_decode_unary_bypass_c;
    /* NOTE(review): dav2d_msac_decode_unary_bypass21_c is neither defined in
     * this file nor declared in the visible part of msac.h - presumably it
     * comes from src/x86/msac.h; confirm. */
    s->unary_bypass21 = dav2d_msac_decode_unary_bypass21_c;

    msac_dsp_init_x86(s);
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/msac.h000066400000000000000000000101751517466257200217330ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_MSAC_H #define DAV2D_SRC_MSAC_H #include #include #include "common/intops.h" typedef struct MsacContext { const uint8_t *buf_pos; const uint8_t *buf_end; uint64_t dif; unsigned rng; int cnt; int allow_update_cdf; #if HAVE_ASM && ARCH_X86_64 unsigned (*unary_bypass6)(struct MsacContext *s, unsigned max_bits); unsigned (*unary_bypass21)(struct MsacContext *s); #endif } MsacContext; EXTERN const uint8_t dav2d_msac_rate[125 /* para */][3 /* count */]; EXTERN const uint16_t dav2d_msac_min_prob[7 /* n_symbols*/][8]; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/msac.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/msac.h" #elif ARCH_X86 #include "src/x86/msac.h" #endif #endif void dav2d_msac_init(MsacContext *s, const uint8_t *data, size_t sz, int disable_cdf_update_flag); unsigned dav2d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav2d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf); unsigned dav2d_msac_decode_bools_bypass_c(MsacContext *s, unsigned n_bits); unsigned dav2d_msac_decode_unary_bypass_c(MsacContext *s, unsigned max_bits); /* Supported n_symbols ranges: adapt4: 1-3, adapt8: 1-7 */ #ifndef dav2d_msac_decode_symbol_adapt4 #define dav2d_msac_decode_symbol_adapt4 dav2d_msac_decode_symbol_adapt_c #endif #ifndef dav2d_msac_decode_symbol_adapt8 #define dav2d_msac_decode_symbol_adapt8 dav2d_msac_decode_symbol_adapt_c #endif #ifndef dav2d_msac_decode_bool_adapt 
#define dav2d_msac_decode_bool_adapt dav2d_msac_decode_bool_adapt_c #endif #ifndef dav2d_msac_decode_bool_bypass #define dav2d_msac_decode_bool_bypass dav2d_msac_decode_bool_bypass_c #endif #ifndef dav2d_msac_decode_bools_bypass #define dav2d_msac_decode_bools_bypass dav2d_msac_decode_bools_bypass_c #endif #ifndef dav2d_msac_decode_unary_bypass6 #define dav2d_msac_decode_unary_bypass6 dav2d_msac_decode_unary_bypass_c #endif #ifndef dav2d_msac_decode_unary_bypass21 #define dav2d_msac_decode_unary_bypass21 dav2d_msac_decode_unary_bypass21_c #endif static inline unsigned dav2d_msac_decode_bool_bypass_c(MsacContext *const s) { return dav2d_msac_decode_bools_bypass_c(s, 1); } static inline unsigned dav2d_msac_decode_unary_bypass21_c(MsacContext *const s) { return dav2d_msac_decode_unary_bypass_c(s, 21); } static inline int dav2d_msac_decode_uniform(MsacContext *const s, const unsigned n) { assert(n > 0); const int l = ulog2(n) + 1; assert(l > 1); const unsigned m = (1 << l) - n; const unsigned v = dav2d_msac_decode_bools_bypass(s, l - 1); return v < m ? v : (v << 1) - m + dav2d_msac_decode_bool_bypass(s); } #endif /* DAV2D_SRC_MSAC_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/obu.c000066400000000000000000003424771517466257200216050ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #include "dav2d/data.h" #include "common/frame.h" #include "common/intops.h" #include "common/validate.h" #include "src/decode.h" #include "src/getbits.h" #include "src/levels.h" #include "src/log.h" #include "src/obu.h" #include "src/ref.h" #include "src/thread_task.h" #include "src/warpmv.h" static int check_trailing_bits(GetBits *const gb, const int strict_std_compliance) { const int trailing_one_bit = dav2d_get_bit(gb); if (gb->error) return DAV2D_ERR(EINVAL); if (!strict_std_compliance) return 0; if (!trailing_one_bit || gb->state) return DAV2D_ERR(EINVAL); ptrdiff_t size = gb->ptr_end - gb->ptr; while (size > 0 && gb->ptr[size - 1] == 0) size--; if (size) return DAV2D_ERR(EINVAL); return 0; } static inline int tile_log2(const int sz, const int tgt) { int k; for (k = 0; (sz << k) < tgt; k++) ; return k; } static NOINLINE void parse_tile_info(struct Dav2dTileInfo *const thdr, GetBits *const gb, const int sbmul, const int sb128, const int seq_sb128, const int w, const int h, const int level, const int tier) { thdr->uniform = dav2d_get_bit(gb); // the limits are calculated based on a frame's sb128, and rounded-up // width/height variables 
(aligned to sbsz) const int sbsz_min1 = (64 << sb128) - 1; const int sbsz_log2 = 6 + sb128; const int sbw = (w + sbsz_min1) >> sbsz_log2; const int sbh = (h + sbsz_min1) >> sbsz_log2; const int w_adj = (level >= 18) + (level >= 14 && tier); const int max_tile_width_sb = 4096 >> (sbsz_log2 - w_adj); const int sz_adj = (level >= 14) + (level >= 18) + (level >= 14 && tier); const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2 - sz_adj); thdr->min_log2_cols = tile_log2(max_tile_width_sb, sbw); thdr->max_log2_cols = tile_log2(1, imin(sbw, DAV2D_MAX_TILE_COLS)); thdr->max_log2_rows = tile_log2(1, imin(sbh, DAV2D_MAX_TILE_ROWS)); const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh), thdr->min_log2_cols); if (thdr->uniform) { // but the (uniform) tile distribution is done based on "full" SBs only, // which can be less than the rounded-up versions above. Also, this is // done based on the sequence header's sb128 (not the frame's), which // can be different for keyframes const int seq_sbsz_log2 = 6 + seq_sb128; const int fsbw = imax(1, (w + 7) >> seq_sbsz_log2); const int fsbh = imax(1, (h + 7) >> seq_sbsz_log2); for (thdr->log2_cols = thdr->min_log2_cols; thdr->log2_cols < thdr->max_log2_cols && dav2d_get_bit(gb); thdr->log2_cols++) ; const int tile_w = imax(1, fsbw >> thdr->log2_cols); int extra = imax(0, fsbw - (tile_w << thdr->log2_cols)); thdr->cols = 0; for (int sbx = 0; sbx < fsbw; sbx += tile_w + (extra > 0), thdr->cols++, extra--) { thdr->col_start_sb[thdr->cols] = sbx * sbmul; } thdr->min_log2_rows = imax(min_log2_tiles - thdr->log2_cols, 0); for (thdr->log2_rows = thdr->min_log2_rows; thdr->log2_rows < thdr->max_log2_rows && dav2d_get_bit(gb); thdr->log2_rows++) ; const int tile_h = imax(1, fsbh >> thdr->log2_rows); extra = imax(0, fsbh - (tile_h << thdr->log2_rows)); thdr->rows = 0; for (int sby = 0; sby < fsbh; sby += tile_h + (extra > 0), thdr->rows++, extra--) { thdr->row_start_sb[thdr->rows] = sby * sbmul; } } else { thdr->cols = 0; 
int widest_tile = 0, max_tile_area_sb = sbw * sbh; for (int sbx = 0; sbx < sbw && thdr->cols < DAV2D_MAX_TILE_COLS; thdr->cols++) { const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb); const int tile_w = (tile_width_sb > 1) ? 1 + dav2d_get_uniform(gb, tile_width_sb) : 1; thdr->col_start_sb[thdr->cols] = sbx; sbx += tile_w; widest_tile = imax(widest_tile, tile_w); } thdr->log2_cols = tile_log2(1, thdr->cols); if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1; const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1); thdr->rows = 0; for (int sby = 0; sby < sbh && thdr->rows < DAV2D_MAX_TILE_ROWS; thdr->rows++) { const int tile_height_sb = imin(sbh - sby, max_tile_height_sb); const int tile_h = (tile_height_sb > 1) ? 1 + dav2d_get_uniform(gb, tile_height_sb) : 1; thdr->row_start_sb[thdr->rows] = sby; sby += tile_h; } thdr->log2_rows = tile_log2(1, thdr->rows); } thdr->col_start_sb[thdr->cols] = sbw; thdr->row_start_sb[thdr->rows] = sbh; } static ALWAYS_INLINE void parse_seg_info(Dav2dSegmentationDataSet *const seg, GetBits *const gb, const int n_seg) { for (int n = 0, m = 1; n < n_seg; n++, m <<= 1) { if (dav2d_get_bit(gb)) { seg->delta_q_mask |= m; seg->delta_q[n] = iclip(dav2d_get_sbits(gb, 10), -351, 351); } seg->skip_mask |= m * dav2d_get_bit(gb); seg->globalmv_mask |= m * dav2d_get_bit(gb); } } static NOINLINE int parse_seq_hdr(Dav2dSequenceHeader *const hdr, GetBits *const gb, const int strict_std_compliance) { #define DEBUG_SEQ_HDR 0 #if DEBUG_SEQ_HDR const unsigned init_bit_pos = dav2d_get_bits_pos(gb); #endif memset(hdr, 0, sizeof(*hdr)); hdr->id = dav2d_get_vlc(gb); hdr->profile = dav2d_get_bits(gb, 5); if (hdr->profile > 2) goto error; hdr->reduced_still_picture_header = dav2d_get_bit(gb); hdr->level = dav2d_get_bits(gb, 5); if (hdr->level >= 4 && !hdr->reduced_still_picture_header) hdr->tier = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-profile_stillpic_level_tier[profile:%d,reducedhdr:%d," "level:%d,tier:%d]: 
off=%u\n", hdr->profile, hdr->reduced_still_picture_header, hdr->level, hdr->tier, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->layout = dav2d_get_vlc(gb); if (hdr->layout > 3) goto error; #if DEBUG_SEQ_HDR printf("SEQHDR: post-layout[%d]: off=%u\n", hdr->layout, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->layout = dav2d_layouts[hdr->layout]; switch (hdr->layout) { case DAV2D_PIXEL_LAYOUT_I420: case DAV2D_PIXEL_LAYOUT_I400: hdr->ss_hor = hdr->ss_ver = 1; break; case DAV2D_PIXEL_LAYOUT_I422: hdr->ss_hor = 1; hdr->ss_ver = 0; default: break; } hdr->hbd = dav2d_get_vlc(gb); if (hdr->hbd > 2) goto error; if (hdr->hbd < 2) hdr->hbd ^= 1; #if DEBUG_SEQ_HDR printf("SEQHDR: post-bitdepth[%d]: off=%u\n", 8 + 2 * hdr->hbd, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (hdr->reduced_still_picture_header) { hdr->still_picture = 1; hdr->monotonic = 1; } else { hdr->lcr_id = dav2d_get_bits(gb, 3); hdr->still_picture = dav2d_get_bit(gb); hdr->max_tlayer_id = dav2d_get_bits(gb, 2); hdr->max_mlayer_id = dav2d_get_bits(gb, 3); hdr->monotonic = dav2d_get_bit(gb); } #if DEBUG_SEQ_HDR printf("SEQHDR: post-seqlcrid_stillpic_maxtmlayerid[lcrid:%d,stillpic:%d," "maxtlayerid:%d,maxmlayerid:%d,monotonic:%d]: off=%u\n", hdr->lcr_id, hdr->still_picture, hdr->max_tlayer_id, hdr->max_mlayer_id, hdr->monotonic, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->width_n_bits = dav2d_get_bits(gb, 4) + 1; hdr->height_n_bits = dav2d_get_bits(gb, 4) + 1; hdr->max_width = dav2d_get_bits(gb, hdr->width_n_bits) + 1; hdr->max_height = dav2d_get_bits(gb, hdr->height_n_bits) + 1; #if DEBUG_SEQ_HDR printf("SEQHDR: post-size[bits:%dx%d,max:%dx%d]: off=%u\n", hdr->width_n_bits, hdr->height_n_bits, hdr->max_width, hdr->max_height, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->crop.enabled = dav2d_get_bit(gb); if (hdr->crop.enabled) { hdr->crop.left = dav2d_get_vlc(gb); hdr->crop.right = dav2d_get_vlc(gb); hdr->crop.top = dav2d_get_vlc(gb); hdr->crop.bottom = dav2d_get_vlc(gb); } #if 
DEBUG_SEQ_HDR printf("SEQHDR: post-cropwindow[%d,l:%d,r:%d,t:%d,b:%d]: off=%u\n", hdr->crop.enabled, hdr->crop.left, hdr->crop.right, hdr->crop.top, hdr->crop.bottom, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (!hdr->reduced_still_picture_header) { hdr->max_display_model_info_present = dav2d_get_bit(gb); if (hdr->max_display_model_info_present) hdr->max_initial_display_delay = dav2d_get_bits(gb, 4); hdr->decoder_model_info_present = dav2d_get_bit(gb); hdr->max_decoder_buffer_delay = 70000; hdr->max_encoder_buffer_delay = 20000; if (hdr->decoder_model_info_present) { hdr->num_units_in_decoding_tick = dav2d_get_bits(gb, 32); hdr->max_decoder_buffer_delay = dav2d_get_vlc(gb); hdr->max_encoder_buffer_delay = dav2d_get_vlc(gb); } else hdr->num_units_in_decoding_tick = 1; #if DEBUG_SEQ_HDR printf("SEQHDR: post-decodermodel[maxdisplaymodel:%d,decodermodel:%d]: off=%u\n", hdr->max_display_model_info_present, hdr->decoder_model_info_present, dav2d_get_bits_pos(gb) - init_bit_pos); #endif } if (hdr->max_tlayer_id) { hdr->tlayer_dependency_present = dav2d_get_bit(gb); if (hdr->tlayer_dependency_present) { for (unsigned n = 1; n < hdr->max_tlayer_id; n++) hdr->tlayer_dependencies[n] = dav2d_get_bits(gb, n); } else { for (unsigned n = 1, mask = ~0U; n < hdr->max_tlayer_id; n++, mask <<= 1) hdr->tlayer_dependencies[n] = ~mask; } } if (hdr->max_mlayer_id) { hdr->mlayer_dependency_present = dav2d_get_bit(gb); if (hdr->mlayer_dependency_present) { for (unsigned n = 1; n < hdr->max_mlayer_id; n++) hdr->mlayer_dependencies[n] = dav2d_get_bits(gb, n); } else { for (unsigned n = 1, mask = ~0U; n < hdr->max_mlayer_id; n++, mask <<= 1) hdr->mlayer_dependencies[n] = ~mask; } } #if DEBUG_SEQ_HDR printf("SEQHDR: post-layerdesc[%d]: off=%u\n", hdr->mlayer_dependency_present, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->sb128 = dav2d_get_bit(gb) ? 
2 : dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-sbsz[%dx%d]: off=%u\n", 64 << hdr->sb128, 64 << hdr->sb128, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) { hdr->sdp = dav2d_get_bit(gb); if (hdr->sdp && !hdr->reduced_still_picture_header) hdr->ext_sdp = dav2d_get_bit(gb); } hdr->ext_partitions = dav2d_get_bit(gb); if (hdr->ext_partitions) hdr->uneven_4way_partitions = dav2d_get_bit(gb); hdr->max_pb_aspect_ratio_log2 = dav2d_get_bit(gb) ? 1 + dav2d_get_bit(gb) : 3; #if DEBUG_SEQ_HDR printf("SEQHDR: post-partition[sdp:%d,extsdp:%d,extpart:%d," "uneven4way:%d,maxpbaspectratio:%d]: off=%u\n", hdr->sdp, hdr->ext_sdp, hdr->ext_partitions, hdr->uneven_4way_partitions, hdr->max_pb_aspect_ratio_log2, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->segmentation.ext = dav2d_get_bit(gb); hdr->segmentation.info_present = dav2d_get_bit(gb); if (hdr->segmentation.info_present) { hdr->segmentation.adaptive = dav2d_get_bit(gb); parse_seg_info(&hdr->segmentation.d, gb, 8 << hdr->segmentation.ext); } #if DEBUG_SEQ_HDR printf("SEQHDR: post-segmentation[extseg:%d,seginfo:%d]: off=%u\n", hdr->segmentation.ext, hdr->segmentation.info_present, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->intra_dip = dav2d_get_bit(gb); // data-driven intra prediction hdr->intra_edge_filter = dav2d_get_bit(gb); hdr->mrls = dav2d_get_bit(gb); hdr->cfl = dav2d_get_bit(gb); if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) hdr->cfl_ds_filter_index = dav2d_get_bits(gb, 2); hdr->mhccp = dav2d_get_bit(gb); hdr->ibp = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-intratools[dip:%d,edgefilter:%d,mrl:%d,cfl:%d," "cfldsfilter:%d,mhccp:%d,ibp:%d]: off=%u\n", hdr->intra_dip, hdr->intra_edge_filter, hdr->mrls, hdr->cfl, hdr->cfl_ds_filter_index, hdr->mhccp, hdr->ibp, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (hdr->reduced_still_picture_header) { hdr->motion_modes = 1 << MM_TRANSLATION; } else { hdr->motion_modes = (1 << MM_TRANSLATION); 
for (int n = 2; n <= 16; n <<= 1) hdr->motion_modes |= n * dav2d_get_bit(gb); if (hdr->motion_modes & ~(1 << MM_TRANSLATION)) hdr->frame_motion_modes_present = dav2d_get_bit(gb); if (hdr->motion_modes & (1 << MM_WARP_DELTA)) hdr->six_param_warp_delta = dav2d_get_bit(gb); hdr->masked_compound = dav2d_get_bit(gb); hdr->ref_frame_mvs = dav2d_get_bit(gb); if (hdr->ref_frame_mvs) hdr->reduced_ref_frame_mvs_mode = dav2d_get_bit(gb); hdr->order_hint_n_bits = dav2d_get_bits(gb, 4) + 1; #if DEBUG_SEQ_HDR printf("SEQHDR: post-interframetools[mm:%x,fmm:%d,6pwarp:%d," "maskcomp:%d,refmvs:%d,redrefmvs:%d,pocbits:%d]: off=%u\n", hdr->motion_modes, hdr->frame_motion_modes_present, hdr->six_param_warp_delta, hdr->masked_compound, hdr->ref_frame_mvs, hdr->reduced_ref_frame_mvs_mode, hdr->order_hint_n_bits, dav2d_get_bits_pos(gb) - init_bit_pos); #endif } hdr->refmv_bank = dav2d_get_bit(gb); hdr->drl_reorder = dav2d_get_bit(gb) ? 0 : 2 - dav2d_get_bit(gb); if (hdr->reduced_still_picture_header) { hdr->ref_frames = 2; hdr->def_max_drl_bits = 1; } else { hdr->explicit_ref_frame_map = dav2d_get_bit(gb); hdr->ref_frames = dav2d_get_bit(gb) ? dav2d_get_bits(gb, 4) + 1 : 8; hdr->ref_frames_log2 = hdr->ref_frames <= 2 ? 
hdr->ref_frames - 1 : 1 + ulog2(hdr->ref_frames - 1); hdr->number_of_bits_for_lt_frame_id = dav2d_get_bits(gb, 3); hdr->def_max_drl_bits = dav2d_get_uniform(gb, 5) + 1; hdr->allow_frame_max_drl_bits = dav2d_get_bit(gb); } hdr->def_max_bvp_drl_bits = dav2d_get_uniform(gb, 3) + 1; hdr->allow_max_bvp_drl_bits = dav2d_get_bit(gb); if (!hdr->reduced_still_picture_header) hdr->num_same_ref_comp = dav2d_get_bits(gb, 2); #if DEBUG_SEQ_HDR printf("SEQHDR: post-refs[bank:%d,drlreorder:%d,explrefmap:%d," "nrefs:%d,nbitsltfid:%d,drlbits:%d,fdrlbits:%d,bvpdrlbits:%d," "fbvpdrlbits:%d,numsamerefcomp:%d]: off=%u\n", hdr->refmv_bank, hdr->drl_reorder, hdr->explicit_ref_frame_map, hdr->ref_frames, hdr->number_of_bits_for_lt_frame_id, hdr->def_max_drl_bits, hdr->allow_frame_max_drl_bits, hdr->def_max_bvp_drl_bits, hdr->allow_max_bvp_drl_bits, hdr->num_same_ref_comp, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (!hdr->reduced_still_picture_header) { hdr->tip = dav2d_get_bit(gb) ? 1 + dav2d_get_bit(gb) : 0; if (hdr->tip) hdr->tip_hole_fill = dav2d_get_bit(gb); hdr->mv_traj = dav2d_get_bit(gb); } hdr->bawp = dav2d_get_bit(gb); if (!hdr->reduced_still_picture_header) { hdr->cwp = dav2d_get_bit(gb); hdr->imp_msk_bld = dav2d_get_bit(gb); hdr->db_sub_pu = dav2d_get_bit(gb); if (hdr->tip == 1 && hdr->db_sub_pu) hdr->tip_explicit_qp = dav2d_get_bit(gb); } #if DEBUG_SEQ_HDR printf("SEQHDR: post-intertools1[tip:%d,tipholefill:%d,mvtraj:%d,bawp:%d," "cwp:%d,impmskbld:%d,lfsubpu:%d,tipqp:%d]: off=%u\n", hdr->tip, hdr->tip_hole_fill, hdr->mv_traj, hdr->bawp, hdr->cwp, hdr->imp_msk_bld, hdr->db_sub_pu, hdr->tip_explicit_qp, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (!hdr->reduced_still_picture_header) { hdr->opfl_refine = dav2d_get_bits(gb, 2); hdr->refine_mv = dav2d_get_bit(gb); if (hdr->tip && (hdr->opfl_refine || hdr->refine_mv)) hdr->tip_refine_mv = dav2d_get_bit(gb); hdr->bru = dav2d_get_bit(gb); hdr->adaptive_mvd = dav2d_get_bit(gb); hdr->mvd_sign_derive = dav2d_get_bit(gb); 
hdr->flex_mvres = dav2d_get_bit(gb); if (!hdr->reduced_still_picture_header) hdr->global_motion = dav2d_get_bit(gb); hdr->short_refresh_frame_flags = dav2d_get_bit(gb); } #if DEBUG_SEQ_HDR printf("SEQHDR: post-intertools2[opflrefine:%d,refinemv:%d,tiprefinemv:%d," "bru:%d,adaptivemvd:%d,mvdsignderive:%d,flexmvres:%d,gmv:%d," "shortrefeshmsk:%d]: off=%u\n", hdr->opfl_refine, hdr->refine_mv, hdr->tip_refine_mv, hdr->bru, hdr->adaptive_mvd, hdr->mvd_sign_derive, hdr->flex_mvres, hdr->global_motion, hdr->short_refresh_frame_flags, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (hdr->reduced_still_picture_header) { hdr->screen_content_tools = DAV2D_ADAPTIVE; hdr->force_integer_mv = DAV2D_ADAPTIVE; } else { hdr->screen_content_tools = dav2d_get_bit(gb) ? DAV2D_ADAPTIVE : dav2d_get_bit(gb); hdr->force_integer_mv = hdr->screen_content_tools ? dav2d_get_bit(gb) ? DAV2D_ADAPTIVE : dav2d_get_bit(gb) : DAV2D_ADAPTIVE; #if DEBUG_SEQ_HDR printf("SEQHDR: post-screentools[scc:%d,forceintmv:%d]: off=%u\n", hdr->screen_content_tools, hdr->force_integer_mv, dav2d_get_bits_pos(gb) - init_bit_pos); #endif } hdr->fsc = dav2d_get_bit(gb); hdr->idtx_intra = hdr->fsc || dav2d_get_bit(gb); hdr->ist[0] = dav2d_get_bit(gb); hdr->ist[1] = dav2d_get_bit(gb); if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) hdr->chroma_dctonly = dav2d_get_bit(gb); if (!hdr->reduced_still_picture_header) hdr->inter_ddt = dav2d_get_bit(gb); hdr->reduced_tx_part_set = dav2d_get_bit(gb); if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) hdr->cctx = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-txgrptools[fsc:%d,idtxintra:%d,ist:%d,interist:%d," "chromadctonly:%d,interddt:%d,reducedtxtpset:%d,cctx:%d]: off=%u\n", hdr->fsc, hdr->idtx_intra, hdr->ist[0], hdr->ist[1], hdr->chroma_dctonly, hdr->inter_ddt, hdr->reduced_tx_part_set, hdr->cctx, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->tcq = dav2d_get_bit(gb); if (hdr->tcq && !hdr->reduced_still_picture_header) hdr->tcq += dav2d_get_bit(gb); if (hdr->tcq != 1) 
hdr->parity_hiding = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-coef[tcq:%d,parityhiding:%d]: off=%u\n", hdr->tcq, hdr->parity_hiding, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->avg_cdf = hdr->reduced_still_picture_header || dav2d_get_bit(gb); if (hdr->avg_cdf) hdr->avg_cdf_type = hdr->reduced_still_picture_header || dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-cdfbits[avgcdf:%d,cdftype:%d]: off=%u\n", hdr->avg_cdf, hdr->avg_cdf_type, dav2d_get_bits_pos(gb) - init_bit_pos); #endif if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) hdr->separate_uv_delta_q = dav2d_get_bit(gb); hdr->equal_ac_dc_q = dav2d_get_bit(gb); if (!hdr->equal_ac_dc_q) { hdr->base_ydc_dq = dav2d_get_bits(gb, 5) - 23; hdr->ydc_dq_enabled = dav2d_get_bit(gb); } if (hdr->layout != DAV2D_PIXEL_LAYOUT_I400) { if (!hdr->equal_ac_dc_q) { hdr->base_uvdc_dq = dav2d_get_bits(gb, 5) - 23; hdr->uvdc_dq_enabled = dav2d_get_bit(gb); } hdr->base_uvac_dq = dav2d_get_bits(gb, 5) - 23; hdr->uvac_dq_enabled = dav2d_get_bit(gb); if (hdr->equal_ac_dc_q) hdr->base_uvdc_dq = hdr->base_uvac_dq; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-quantflags[sepuvdq:%d,aceqdc:%d,ydcdq:%d," "fydcdq:%d,uvdcdq:%d,fuvdcdq:%d,uvacdq:%d,fuvacdq:%d]: off=%u\n", hdr->separate_uv_delta_q, hdr->equal_ac_dc_q, hdr->base_ydc_dq, hdr->ydc_dq_enabled, hdr->base_uvdc_dq, hdr->uvdc_dq_enabled, hdr->base_uvac_dq, hdr->uvac_dq_enabled, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->disable_loopfilters_across_tiles = dav2d_get_bit(gb); hdr->cdef = dav2d_get_bit(gb); hdr->gdf = dav2d_get_bit(gb); if (hdr->gdf && !hdr->sb128) hdr->gdf_unit_matches_sbsz = dav2d_get_bit(gb); hdr->restoration = dav2d_get_bit(gb); if (hdr->restoration) { unsigned no_pc_wiener = dav2d_get_bit(gb); unsigned no_ns_wiener_y = dav2d_get_bit(gb); hdr->rst_disable_mask[0] = (no_ns_wiener_y << 1) | no_pc_wiener; if (dav2d_get_bit(gb)) { hdr->rst_disable_mask[1] = (dav2d_get_bit(gb) << 1) | 1; } else { hdr->rst_disable_mask[1] = 
hdr->rst_disable_mask[0] | 1; } } hdr->ccso = dav2d_get_bit(gb); if (hdr->ccso) hdr->ccso_unit_matches_sbsz = dav2d_get_bit(gb); hdr->cdef_on_skiptx = hdr->reduced_still_picture_header ? DAV2D_ADAPTIVE : dav2d_get_bit(gb) ? 1 : dav2d_get_bit(gb) ? 0 : DAV2D_ADAPTIVE; hdr->df_par_bits = 2 + dav2d_get_bits(gb, 2); #if DEBUG_SEQ_HDR printf("SEQHDR: post-inloopfilters[disablelfacrosstiles:%d,cdef:%d,gdf:%d,rst:%d," "lrdisablemsk:%d,%d,cccso:%d,cdefonskiptxfm:%d,dfparbits:%d]: off=%u\n", hdr->disable_loopfilters_across_tiles, hdr->cdef, hdr->gdf, hdr->restoration, hdr->rst_disable_mask[0] << 1, hdr->rst_disable_mask[1] << 1, hdr->ccso, hdr->cdef_on_skiptx, hdr->df_par_bits, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->tiling.present = dav2d_get_bit(gb); if (hdr->tiling.present) { hdr->tiling.present += dav2d_get_bit(gb); parse_tile_info(&hdr->tiling.t, gb, 1, hdr->sb128, hdr->sb128, hdr->max_width, hdr->max_height, hdr->level, hdr->tier); } #if DEBUG_SEQ_HDR printf("SEQHDR: post-tileinfo[%d,%dx%d]: off=%u\n", hdr->tiling.present, hdr->tiling.t.cols, hdr->tiling.t.rows, dav2d_get_bits_pos(gb) - init_bit_pos); #endif hdr->film_grain_present = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-filmgrain[%d]: off=%u\n", hdr->film_grain_present, dav2d_get_bits_pos(gb) - init_bit_pos); #endif // extension is all bits until the trailing one, so // since we don't care about the contents anyway, we // can skip & ignore it if we don't care about strict // conformance if (!gb->error && !strict_std_compliance && !DEBUG_SEQ_HDR) return 0; const int has_extension = dav2d_get_bit(gb); #if DEBUG_SEQ_HDR ptrdiff_t extension_bits = 0; #endif if (has_extension) { // extension is all following bits except trailing one & any following zeroes const uint8_t *ptr = &gb->ptr_end[-1], *cur = &gb->ptr[-(1 + (gb->bits_left >> 3))]; while (ptr >= cur && !*ptr) ptr--; if (ptr < cur) goto error; int n = 8, m = 0x80, byte = *ptr; for (; n >= 0; n++, m >>= 1) { if (byte & m) break; } 
assert(n >= 0); #if DEBUG_SEQ_HDR extension_bits = (&ptr[1] - gb->ptr) * 8 + gb->bits_left - n; #endif // set up bitreader to skip all extension bits, but not the trailing one // and any following zeroes gb->ptr = &ptr[1]; gb->state = (uint64_t) byte << (64 - n); gb->bits_left = n; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-extension[%d,nbits=%td]: off=%u\n", has_extension, extension_bits, dav2d_get_bits_pos(gb) - init_bit_pos); #endif return check_trailing_bits(gb, strict_std_compliance); error: return DAV2D_ERR(EINVAL); } int dav2d_parse_sequence_header(Dav2dSequenceHeader *const out, const uint8_t *const ptr, const size_t sz) { validate_input_or_ret(out != NULL, DAV2D_ERR(EINVAL)); validate_input_or_ret(ptr != NULL, DAV2D_ERR(EINVAL)); validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV2D_ERR(EINVAL)); GetBits gb; dav2d_init_get_bits(&gb, ptr, sz); int res = DAV2D_ERR(ENOENT); do { dav2d_get_bit(&gb); // obu_forbidden_bit const enum Dav2dObuType type = dav2d_get_bits(&gb, 4); const int has_extension = dav2d_get_bit(&gb); const int has_length_field = dav2d_get_bit(&gb); dav2d_get_bits(&gb, 1 + 8 * has_extension); // ignore const uint8_t *obu_end = gb.ptr_end; if (has_length_field) { const size_t len = dav2d_get_uleb128(&gb); if (len > (size_t)(obu_end - gb.ptr)) return DAV2D_ERR(EINVAL); obu_end = gb.ptr + len; } if (type == DAV2D_OBU_SEQ_HDR) { if ((res = parse_seq_hdr(out, &gb, 0)) < 0) return res; if (gb.ptr > obu_end) return DAV2D_ERR(EINVAL); dav2d_bytealign_get_bits(&gb); } if (gb.error) return DAV2D_ERR(EINVAL); assert(gb.state == 0 && gb.bits_left == 0); gb.ptr = obu_end; } while (gb.ptr < gb.ptr_end); return res; } static int read_frame_size(Dav2dContext *const c, GetBits *const gb) { Dav2dFrameHeader *const hdr = c->frame_hdr; if (hdr->frame_size_override && IS_INTER_OR_SWITCH(hdr)) { for (int i = 0; i < hdr->n_ref_frames; i++) { if (dav2d_get_bit(gb)) { const Dav2dThreadPicture *const ref = &c->refs[hdr->refidx[i]].p; if (!ref->p.frame_hdr) return 
-1; const Dav2dFrameHeader *const refhdr = ref->p.frame_hdr; hdr->width = refhdr->width; hdr->height = refhdr->height; return 0; } } } const Dav2dSequenceHeader *const seqhdr = c->seq_hdr; if (hdr->frame_size_override) { hdr->width = dav2d_get_bits(gb, seqhdr->width_n_bits) + 1; hdr->height = dav2d_get_bits(gb, seqhdr->height_n_bits) + 1; } else { hdr->width = seqhdr->max_width; hdr->height = seqhdr->max_height; } return 0; } static int get_ref_frames(Dav2dContext *const c, const int have_resolution) { const Dav2dSequenceHeader *const seqhdr = c->seq_hdr; Dav2dFrameHeader *const hdr = c->frame_hdr; struct Score { int score; uint8_t poc; int8_t pocdiff; uint16_t qidx; uint8_t mlayer; int8_t res_ratio_log2; } ref_info[8]; uint8_t sort_idx[8]; int n_refs = 0, have_fwd_refs = 0; const unsigned poc = hdr->frame_offset; for (int n = 0; n < 8 && !have_fwd_refs; n++) { if (!c->refs[n].p.p.frame_hdr) continue; have_fwd_refs = get_poc_diff(seqhdr->order_hint_n_bits, poc, c->refs[n].p.p.frame_hdr->frame_offset) < 0; } const int mlayer = hdr->mlayer_id, tlayer = hdr->tlayer_id; const int w = hdr->width, h = hdr->height; int minq = 512, maxq = -1; const Dav2dFrameHeader *last_refhdr = NULL; for (int n = 0; n < 8; n++) { struct Score *const r = &ref_info[n]; const Dav2dFrameHeader *const refhdr = c->refs[n].p.p.frame_hdr; if (!refhdr || refhdr == last_refhdr) continue; if (seqhdr->tlayer_dependency_present) { if (!(seqhdr->tlayer_dependencies[tlayer] & (1 << refhdr->tlayer_id))) continue; } else { if (tlayer < refhdr->tlayer_id) continue; } r->mlayer = refhdr->mlayer_id; if (seqhdr->mlayer_dependency_present) { if (!(seqhdr->mlayer_dependencies[mlayer] & (1 << r->mlayer))) continue; } else { if (mlayer < r->mlayer) continue; } if (have_resolution && (2 * w < refhdr->width || 2 * h < refhdr->height || w > 16 * refhdr->width || h > 16 * refhdr->height)) { continue; } r->res_ratio_log2 = -ulog2(refhdr->width * refhdr->height); r->poc = refhdr->frame_offset; r->pocdiff = 
get_poc_diff(seqhdr->order_hint_n_bits, poc, r->poc); r->qidx = refhdr->quant.yac; const int tdist = abs(r->pocdiff) + mlayer - r->mlayer; r->score = have_fwd_refs ? (tdist << 6) : 128 - (128 >> (imin(tdist, 6))) + imax(tdist - 6, 0); r->score += r->res_ratio_log2 * (1 << 5) + r->qidx; int m; for (m = 0; m < n_refs; m++) { const struct Score *const r2 = &ref_info[sort_idx[m]]; if (r->score == r2->score && r->poc == r2->poc && r->mlayer == r2->mlayer) { break; } } if (m < n_refs) continue; // ref already exists maxq = imax(r->qidx, maxq); minq = imin(r->qidx, minq); for (; m > 0; m--) { const int idx = sort_idx[m - 1]; const struct Score *const r2 = &ref_info[idx]; if (r2->score <= r->score) break; sort_idx[m] = idx; } sort_idx[m] = n; n_refs++; last_refhdr = refhdr; } if (n_refs == 8) { const int q_thr = (maxq + minq + 1) >> 1; int maxpocdiff[2] = { 0, 0 }, num[2] = { 0, 0 }, furthest_idx[2]; for (int n = 0; n < 8; n++) { const struct Score *const r = &ref_info[sort_idx[n]]; if (r->qidx < q_thr) continue; if (r->pocdiff > 0) { if (r->pocdiff > maxpocdiff[0]) { maxpocdiff[0] = r->pocdiff; furthest_idx[0] = n; } num[0]++; } else if (r->pocdiff < 0) { if (r->pocdiff < maxpocdiff[1]) { maxpocdiff[1] = r->pocdiff; furthest_idx[1] = n; } num[1]++; } } const int idx = num[0] > num[1] ? furthest_idx[0] : num[0] < num[1] ? furthest_idx[1] : furthest_idx[maxpocdiff[0] < -maxpocdiff[1]]; if (idx < 7) { memmove(&sort_idx[idx], &sort_idx[idx + 1], 7 - idx); sort_idx[7] = idx; } } for (int n = 0; n < 7; n++) hdr->refidx[n] = sort_idx[n < n_refs ? 
n : 0]; return imin(7, n_refs); } static void find_tip_ref_frames(const Dav2dContext *const c, Dav2dFrameHeader *const hdr, const Dav2dSequenceHeader *const seqhdr) { // tip const int n_refs = hdr->n_ref_frames; if (n_refs == 1) { hdr->tip.ref[0] = hdr->tip.ref[1] = 0; return; } const unsigned poc = hdr->frame_offset; uint8_t order[7]; int8_t refdist[7]; int n_past = 0; // temporal ordering of refs for (int n = 0; n < n_refs; n++) { const unsigned refpoc = c->refs[hdr->refidx[n]].p.p.frame_hdr->frame_offset; const int dist = refdist[n] = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); int m; for (m = n; m > 0 && refdist[order[m - 1]] > dist; m--) order[m] = order[m - 1]; order[m] = n; n_past += dist < 0; } if (n_past == n_refs) { // all refs are in the past, select nearest (last) 2 hdr->tip.ref[0] = order[n_refs - 1]; hdr->tip.ref[1] = order[n_refs - 2]; } else if (!n_past) { // all refs are in the future, select nearest (first) 2 hdr->tip.ref[0] = order[0]; hdr->tip.ref[1] = order[1]; } else { // temporally mixed refs, select the closest to the current one hdr->tip.ref[0] = order[n_past - 1]; hdr->tip.ref[1] = order[n_past]; } } static void derive_pri_sec_ref(const Dav2dContext *const c, int refs[2]) { const Dav2dSequenceHeader *const seqhdr = c->seq_hdr; const Dav2dFrameHeader *const hdr = c->frame_hdr; refs[0] = DAV2D_PRIMARY_REF_NONE; int best_qdiff[2], best_pocdiff[2], best_poc[2], best = 0; const int qidx = hdr->quant.yac, poc = hdr->frame_offset; const int nbits = seqhdr->order_hint_n_bits; for (int i = 0; i < hdr->n_ref_frames; i++) { const Dav2dFrameHeader *const refhdr = c->refs[hdr->refidx[i]].p.p.frame_hdr; if (!refhdr || IS_KEY_OR_INTRA(refhdr)) continue; const int ref_qidx = refhdr->quant.yac, qdiff = abs(ref_qidx - qidx); const int ref_poc = refhdr->frame_offset; const int pocdiff = abs(get_poc_diff(nbits, poc, ref_poc)); for (int n = 0, m = best; n < 2; n++, m = !m) { if (refs[m] == DAV2D_PRIMARY_REF_NONE || qdiff < best_qdiff[m] || (qdiff == 
best_qdiff[m] && (pocdiff < best_pocdiff[m] || (pocdiff == best_pocdiff[m] && get_poc_diff(nbits, best_poc[m], ref_poc) < 0)))) { refs[!best] = i; best_pocdiff[!best] = pocdiff; best_qdiff[!best] = qdiff; best_poc[!best] = ref_poc; if (!n) best = !best; break; } } } if (best) { const int tmp = refs[0]; refs[0] = refs[1]; refs[1] = tmp; } } static NOINLINE void parse_tile_info_frmhdr(Dav2dFrameHeader *const hdr, const Dav2dSequenceHeader *const seqhdr, GetBits *const gb) { // tile data hdr->sb128 = IS_INTER_OR_SWITCH(hdr) ? seqhdr->sb128 : !!seqhdr->sb128; int sbmul, reuse_allowed = 0; if (seqhdr->tiling.present) { const int sbsz_min1 = (64 << hdr->sb128) - 1; const int sbsz_log2 = 6 + hdr->sb128; const int sbw = (hdr->width + sbsz_min1) >> sbsz_log2; const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2; if (!seqhdr->tiling.t.uniform) { const int seq_sbsz_min1 = (64 << seqhdr->sb128) - 1; const int seq_sbsz_log2 = 6 + seqhdr->sb128; const int seq_sbw = (seqhdr->max_width + seq_sbsz_min1) >> seq_sbsz_log2; const int seq_sbh = (seqhdr->max_height + seq_sbsz_min1) >> seq_sbsz_log2; reuse_allowed = seq_sbw == sbw && seq_sbh == sbh; } else { const int tile_w = (sbw + seqhdr->tiling.t.cols - 1) >> seqhdr->tiling.t.log2_cols; const int tile_h = (sbh + seqhdr->tiling.t.rows - 1) >> seqhdr->tiling.t.log2_rows; reuse_allowed = tile_w * (seqhdr->tiling.t.cols - 1) < sbw && tile_h * (seqhdr->tiling.t.rows - 1) < sbh; } } if (reuse_allowed && (seqhdr->tiling.present == 1 || (seqhdr->tiling.present == DAV2D_ADAPTIVE && dav2d_get_bit(gb)))) { hdr->tiling.t = seqhdr->tiling.t; if (hdr->sb128 != seqhdr->sb128) { assert(hdr->sb128 == 1 && seqhdr->sb128 == 2 && IS_KEY_OR_INTRA(hdr)); sbmul = 2; for (int n = 0; n < hdr->tiling.t.rows; n++) hdr->tiling.t.row_start_sb[n] *= 2; for (int n = 0; n < hdr->tiling.t.cols; n++) hdr->tiling.t.col_start_sb[n] *= 2; } else sbmul = 1; } else { sbmul = seqhdr->sb128 == 2 && IS_KEY_OR_INTRA(hdr) ? 
2 : 1; parse_tile_info(&hdr->tiling.t, gb, sbmul, hdr->sb128, seqhdr->sb128, hdr->width, hdr->height, seqhdr->level, seqhdr->tier); } if (sbmul == 2) { hdr->tiling.t.row_start_sb[hdr->tiling.t.rows] = (hdr->height + 127) >> 7; hdr->tiling.t.col_start_sb[hdr->tiling.t.cols] = (hdr->width + 127) >> 7; } } static void rescale_matrix(int32_t *const dm, const int32_t *const sm, int in_dist, int out_dist) { int shift, inv_in_dist = dav2d_resolve_divisor_32(abs(in_dist), &shift); if (inv_in_dist >= 512) { inv_in_dist >>= 1; shift--; } if (in_dist < 0) inv_in_dist = -inv_in_dist; const int rnd = (1 << shift) >> 1; for (int n = 0; n < 2; n++) { const int r = iclip(sm[n], -0x400000, 0x400000) * inv_in_dist; const int t = ((r + rnd - (r < 0)) >> shift) * out_dist; const int d = (t + 0x1000 - (t < 0)) & ~0x1fff; dm[n] = iclip(d, -0x7ffe000, +0x7ffe000); } for (int n = 2; n < 6; n++) { const int b = 0x10000 * ((unsigned) (n - 3) > 1U); const int r = (sm[n] - b) * inv_in_dist; const int t = ((r + rnd - (r < 0)) >> shift) * out_dist; const int d = (t + 32 - (t < 0)) & ~63; dm[n] = b + iclip(d, -0x7fc0, +0x7fc0); } } static int parse_frame_hdr(Dav2dContext *const c, GetBits *const gb, const enum Dav2dObuType obu_type) { #define DEBUG_FRAME_HDR 0 #if DEBUG_FRAME_HDR const uint8_t *const init_ptr = &gb->ptr[-!!(gb->bits_left & 7)]; #endif const Dav2dSequenceHeader *const seqhdr = c->seq_hdr; Dav2dFrameHeader *const hdr = c->frame_hdr; hdr->id = dav2d_get_vlc(gb); if (hdr->id) goto error; const int seqhdr_idx = dav2d_get_vlc(gb); if (seqhdr_idx != seqhdr->id) goto error; #if DEBUG_FRAME_HDR printf("HDR: post-ids[f:%d,s:%d]: off=%td\n", hdr->id, seqhdr->id, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->show_existing_frame = obu_type == DAV2D_OBU_SEF; if (hdr->show_existing_frame) { hdr->existing_frame_idx = dav2d_get_bits(gb, seqhdr->ref_frames_log2); if (hdr->existing_frame_idx >= seqhdr->ref_frames) goto error; if (dav2d_get_bit(gb)) { // FIXME poc } // FIXME filmgrain #if 
DEBUG_FRAME_HDR printf("HDR: post-existing_frame_idx[%d]: off=%td\n", hdr->existing_frame_idx, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif return 0; } if (seqhdr->reduced_still_picture_header) { hdr->frame_type = DAV2D_FRAME_TYPE_KEY; hdr->show_immediate = 1; } else { switch (obu_type) { case DAV2D_OBU_CLOSED_LOOP_KF: case DAV2D_OBU_OPEN_LOOP_KF: hdr->frame_type = DAV2D_FRAME_TYPE_KEY; break; case DAV2D_OBU_RAS: case DAV2D_OBU_SWITCH: hdr->frame_type = DAV2D_FRAME_TYPE_SWITCH; break; default: if (!dav2d_get_bit(gb)) { hdr->frame_type = DAV2D_FRAME_TYPE_INTRA; break; } // fall-through case DAV2D_OBU_LEADING_TIP: case DAV2D_OBU_TIP: case DAV2D_OBU_BRIDGE: hdr->frame_type = DAV2D_FRAME_TYPE_INTER; break; } hdr->ltr_id = -1; if (hdr->frame_type == DAV2D_FRAME_TYPE_KEY) { if (seqhdr->number_of_bits_for_lt_frame_id) hdr->ltr_id = dav2d_get_bits(gb, seqhdr->number_of_bits_for_lt_frame_id) - 1; } else if (obu_type == DAV2D_OBU_RAS || obu_type == DAV2D_OBU_OPEN_LOOP_KF) { if (seqhdr->number_of_bits_for_lt_frame_id) { hdr->n_ref_frames = dav2d_get_bits(gb, 3); for (int n = 0; n < hdr->n_ref_frames; n++) hdr->refidx[n] = dav2d_get_bits(gb, seqhdr->number_of_bits_for_lt_frame_id); } } if (obu_type != DAV2D_OBU_BRIDGE) { if (obu_type != DAV2D_OBU_OPEN_LOOP_KF) hdr->show_immediate = dav2d_get_bit(gb); if (!hdr->show_immediate && !seqhdr->monotonic) hdr->show_implicit = dav2d_get_bit(gb); } #if DEBUG_FRAME_HDR printf("HDR: post-frametype_bits[type:%d,ltrid:%d,show:%d|%d]: off=%td\n", hdr->frame_type, hdr->ltr_id, hdr->show_immediate, hdr->show_implicit, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } hdr->primary_ref_frame = DAV2D_PRIMARY_REF_NONE; if (!seqhdr->reduced_still_picture_header) { hdr->frame_size_override = hdr->frame_type == DAV2D_FRAME_TYPE_SWITCH ? 
1 : dav2d_get_bit(gb); hdr->frame_offset = dav2d_get_bits(gb, seqhdr->order_hint_n_bits); int did_signal_pri_ref = -1; if (hdr->frame_type == DAV2D_FRAME_TYPE_INTER) { hdr->primary_ref_signaled = did_signal_pri_ref = dav2d_get_bit(gb); if (obu_type != DAV2D_OBU_LEADING_TIP && obu_type != DAV2D_OBU_TIP) hdr->cross_frame_context = dav2d_get_bit(gb); if (did_signal_pri_ref) hdr->primary_ref_frame = dav2d_get_bits(gb, 3); } #if DEBUG_FRAME_HDR printf("HDR: post-frame_size_override_flag[%d,poc:%d,p_ref:%d|%d]: off=%td\n", hdr->frame_size_override, hdr->frame_offset, did_signal_pri_ref, hdr->primary_ref_frame, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } // FIXME special cases for bridge and ras frames if (obu_type == DAV2D_OBU_CLOSED_LOOP_KF && !seqhdr->max_mlayer_id) { hdr->refresh_frame_flags = (1 << seqhdr->ref_frames) - 1; } else if (obu_type == DAV2D_OBU_OPEN_LOOP_KF || seqhdr->max_mlayer_id) { if (seqhdr->short_refresh_frame_flags) { hdr->refresh_frame_flags = 1 << dav2d_get_bits(gb, seqhdr->ref_frames_log2); } else { hdr->refresh_frame_flags = dav2d_get_bits(gb, seqhdr->ref_frames); } } else if (hdr->frame_type != DAV2D_FRAME_TYPE_SWITCH && seqhdr->short_refresh_frame_flags) { const int refresh = dav2d_get_bit(gb); if (refresh) { const int refresh_idx = dav2d_get_bits(gb, seqhdr->ref_frames_log2); if (refresh_idx >= seqhdr->ref_frames) goto error; hdr->refresh_frame_flags = 1 << refresh_idx; } } else { hdr->refresh_frame_flags = dav2d_get_bits(gb, seqhdr->ref_frames); } #if DEBUG_FRAME_HDR printf("HDR: post-refresh_frame_flags[%x]: off=%td\n", hdr->refresh_frame_flags, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (IS_INTER_OR_SWITCH(hdr)) { if (hdr->frame_type == DAV2D_FRAME_TYPE_SWITCH || seqhdr->explicit_ref_frame_map) { // explicit ref frame signaling hdr->n_ref_frames = dav2d_get_bits(gb, 3); if (hdr->n_ref_frames > imin(7, seqhdr->ref_frames)) goto error; for (int n = 0; n < hdr->n_ref_frames; n++) { hdr->refidx[n] = dav2d_get_bits(gb, 
seqhdr->ref_frames_log2); if (hdr->refidx[n] >= seqhdr->ref_frames) goto error; } } else { // implicit ref frame scoring (this will fill hdr->refidx[]) hdr->n_ref_frames = get_ref_frames(c, 0); } const unsigned poc = hdr->frame_offset; for (int n = 0; n < hdr->n_ref_frames; n++) { const int pocdiff = get_poc_diff(seqhdr->order_hint_n_bits, poc, c->refs[hdr->refidx[n]].p.p.frame_hdr->frame_offset); hdr->has_future_refs |= pocdiff < 0; hdr->has_past_refs |= pocdiff > 0; } hdr->has_bothside_refs = hdr->has_future_refs && hdr->has_past_refs; #if DEBUG_FRAME_HDR printf("HDR: post-refs[explicit:%d,refs:%d,%d,%d,%d,%d,%d,%d]: off=%td\n", seqhdr->explicit_ref_frame_map, hdr->refidx[0], hdr->refidx[1], hdr->refidx[2], hdr->refidx[3], hdr->refidx[4], hdr->refidx[5], hdr->refidx[6], (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } if (read_frame_size(c, gb) < 0) goto error; #if DEBUG_FRAME_HDR printf("HDR: post-framesize[%dx%d]: off=%td\n", hdr->width, hdr->height, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (IS_INTER_OR_SWITCH(hdr)) { if (hdr->frame_type == DAV2D_FRAME_TYPE_INTER && !seqhdr->explicit_ref_frame_map) { // include resolution constraints hdr->n_ref_frames = get_ref_frames(c, 1); #if DEBUG_FRAME_HDR printf("HDR: post-refs2[refs:%d,%d,%d,%d,%d,%d,%d]: off=%td\n", hdr->refidx[0], hdr->refidx[1], hdr->refidx[2], hdr->refidx[3], hdr->refidx[4], hdr->refidx[5], hdr->refidx[6], (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } // FIXME bru if (seqhdr->ref_frame_mvs) hdr->use_ref_frame_mvs = dav2d_get_bit(gb); hdr->tmvp_sample_step = 1 + (hdr->use_ref_frame_mvs && hdr->n_ref_frames > 1 && seqhdr->sb128 && dav2d_get_bit(gb)); #if DEBUG_FRAME_HDR printf("HDR: post-refmvbits[%d,step:%d]: off=%td\n", hdr->use_ref_frame_mvs, hdr->tmvp_sample_step, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->tip.subpel_filter = DAV2D_FILTER_8TAP_SHARP; if (seqhdr->tip && hdr->n_ref_frames > 1 && hdr->use_ref_frame_mvs) { if (obu_type == DAV2D_OBU_TIP || obu_type == 
DAV2D_OBU_LEADING_TIP) { hdr->tip.frame_mode = 2; // output hdr->opfl_refine_type = 2 * (seqhdr->opfl_refine && seqhdr->tip_refine_mv); } else { hdr->tip.frame_mode = dav2d_get_bit(gb); // 1: ref, or 0: disabled hdr->opfl_refine_type = seqhdr->opfl_refine < 3 /* auto */ ? seqhdr->opfl_refine : dav2d_get_bit(gb) ? 1 /* switchable */ : 2 * dav2d_get_bit(gb) /* all or none */; } if (hdr->tip.frame_mode) { if (seqhdr->tip_hole_fill) hdr->tip.hole_fill = dav2d_get_bit(gb); if (!hdr->has_bothside_refs || !seqhdr->tip_refine_mv || (!seqhdr->opfl_refine && !seqhdr->refine_mv)) { hdr->tip.global_wtd_idx = dav2d_get_bits(gb, 3); } if (hdr->tip.frame_mode == 2) { if (!dav2d_get_bit(gb)) { hdr->tip.gmv.y = dav2d_get_bits(gb, 4); hdr->tip.gmv.x = dav2d_get_bits(gb, 4); if (hdr->tip.gmv.y && dav2d_get_bit(gb)) hdr->tip.gmv.y = -hdr->tip.gmv.y; if (hdr->tip.gmv.x && dav2d_get_bit(gb)) hdr->tip.gmv.x = -hdr->tip.gmv.x; } hdr->tip.subpel_filter = dav2d_get_bit(gb) ? DAV2D_FILTER_8TAP_SHARP : dav2d_get_bit(gb) ? DAV2D_FILTER_8TAP_REGULAR : DAV2D_FILTER_8TAP_SMOOTH; } } find_tip_ref_frames(c, hdr, seqhdr); } else { hdr->opfl_refine_type = seqhdr->opfl_refine < 3 /* auto */ ? seqhdr->opfl_refine : dav2d_get_bit(gb) ? 
1 /* switchable */ : 2 * dav2d_get_bit(gb) /* all or none */; } #if DEBUG_FRAME_HDR printf("HDR: post-refinemv-tip[opfl/refine:%d,tip:%d,holefill:%d," "glbwt:%d,gmv:y=%d,x=%d,interpfilt:%d]: off=%td\n", hdr->opfl_refine_type, hdr->tip.frame_mode, hdr->tip.hole_fill, hdr->tip.global_wtd_idx, hdr->tip.gmv.y, hdr->tip.gmv.x, hdr->tip.subpel_filter, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (hdr->tip.frame_mode == 2) { if (seqhdr->db_sub_pu) { hdr->deblock.sub_pu = dav2d_get_bit(gb); if (hdr->deblock.sub_pu) { hdr->tip.apply_filter = dav2d_get_bit(gb); if (hdr->tip.apply_filter) { hdr->deblock.level_y[0] = 1; hdr->deblock.level_y[1] = 1; hdr->deblock.level_u = 1; hdr->deblock.level_v = 1; } } } #if DEBUG_FRAME_HDR printf("HDR: post-tip_deblock[lfsubpu:%d,apply:%d]: off=%td\n", hdr->deblock.sub_pu, hdr->tip.apply_filter, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (seqhdr->tip_explicit_qp) { // FIXME yac and (sometimes) u/v ac delta } else { const Dav2dFrameHeader *const ref1hdr = c->refs[hdr->refidx[hdr->tip.ref[0]]].p.p.frame_hdr; const Dav2dFrameHeader *const ref2hdr = c->refs[hdr->refidx[hdr->tip.ref[1]]].p.p.frame_hdr; hdr->quant.yac = (ref1hdr->quant.yac + ref2hdr->quant.yac + 1) >> 1; } // FIXME this is read further down if (hdr->tip.apply_filter) { parse_tile_info_frmhdr(hdr, seqhdr, gb); #if DEBUG_FRAME_HDR printf("HDR: post-tiling[%dx%dtiles,%dbytes]: off=%td\n", hdr->tiling.t.cols, hdr->tiling.t.rows, 0, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } else { hdr->sb128 = IS_INTER_OR_SWITCH(hdr) ? 
seqhdr->sb128 : !!seqhdr->sb128; hdr->tiling.t.rows = hdr->tiling.t.cols = 1; const int shift = 6 + hdr->sb128; hdr->tiling.t.col_start_sb[0] = 0; hdr->tiling.t.col_start_sb[1] = (hdr->width + ((1 << shift) - 1)) >> shift; hdr->tiling.t.row_start_sb[0] = 0; hdr->tiling.t.row_start_sb[1] = (hdr->height + ((1 << shift) - 1)) >> shift; } hdr->disable_cdf_update = 1; int refs[2]; derive_pri_sec_ref(c, refs); hdr->primary_ref_frame = refs[0]; hdr->secondary_ref_frame = refs[1]; goto grain; } } hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV2D_ADAPTIVE ? dav2d_get_bit(gb) : seqhdr->screen_content_tools; if (hdr->allow_screen_content_tools) hdr->force_integer_mv = seqhdr->force_integer_mv == DAV2D_ADAPTIVE ? dav2d_get_bit(gb) : seqhdr->force_integer_mv; #if DEBUG_FRAME_HDR printf("HDR: post-screencontent[sctools:%d,forceintmv:%d]: off=%td\n", hdr->allow_screen_content_tools, hdr->force_integer_mv, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->allow_intrabc = dav2d_get_bit(gb); if (hdr->allow_intrabc) { if (IS_KEY_OR_INTRA(hdr)) hdr->allow_global_intrabc = dav2d_get_bit(gb); hdr->allow_local_intrabc = !hdr->allow_global_intrabc || dav2d_get_bit(gb); } if (hdr->allow_intrabc) { hdr->max_bvp_drl_bits = seqhdr->allow_max_bvp_drl_bits ? dav2d_get_ref_uniform(gb, 3, seqhdr->def_max_bvp_drl_bits) + 1 : seqhdr->def_max_bvp_drl_bits; } #if DEBUG_FRAME_HDR printf("HDR: post-ibc[intrabc:%d,global:%d,local:%d,drlbits:%d]: off=%td\n", hdr->allow_intrabc, hdr->allow_global_intrabc, hdr->allow_local_intrabc, hdr->max_bvp_drl_bits, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (IS_INTER_OR_SWITCH(hdr)) { hdr->max_drl_bits = seqhdr->allow_frame_max_drl_bits ? dav2d_get_ref_uniform(gb, 3, seqhdr->def_max_drl_bits) + 1 : seqhdr->def_max_drl_bits; if (!hdr->force_integer_mv) hdr->mv_precision = dav2d_get_bit(gb) ? 2 : 1 + 2 * dav2d_get_bit(gb); hdr->subpel_filter_mode = dav2d_get_bit(gb) ? 
DAV2D_FILTER_SWITCHABLE : dav2d_get_bits(gb, 2); if (seqhdr->frame_motion_modes_present) { hdr->motion_modes = 1; for (int n = 2; n <= 16; n <<= 1) if ((seqhdr->motion_modes & n) && dav2d_get_bit(gb)) hdr->motion_modes |= n; } else { hdr->motion_modes = seqhdr->motion_modes; } #if DEBUG_FRAME_HDR printf("HDR: post-frametype-specific-bits[drlbits:%d,mvprec:%d,flt:%d,mm:%x]: off=%td\n", hdr->max_drl_bits, hdr->mv_precision, hdr->subpel_filter_mode, hdr->motion_modes, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } hdr->disable_cdf_update = dav2d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-disable_cdf_update[%d]: off=%td\n", hdr->disable_cdf_update, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif parse_tile_info_frmhdr(hdr, seqhdr, gb); if (hdr->tiling.t.log2_cols || hdr->tiling.t.log2_rows) { if (!seqhdr->avg_cdf_type) hdr->tiling.update = dav2d_get_bits(gb, hdr->tiling.t.log2_cols + hdr->tiling.t.log2_rows); if (hdr->tiling.update >= hdr->tiling.t.cols * hdr->tiling.t.rows) goto error; hdr->tiling.n_bytes = dav2d_get_bits(gb, 2) + 1; } #if DEBUG_FRAME_HDR printf("HDR: post-tiling[%dx%dtiles,%dbytes]: off=%td\n", hdr->tiling.t.cols, hdr->tiling.t.rows, hdr->tiling.n_bytes, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // quant data hdr->quant.yac = dav2d_get_bits(gb, 8 + !!seqhdr->hbd); if (seqhdr->ydc_dq_enabled && dav2d_get_bit(gb)) hdr->quant.ydc_delta = dav2d_get_sbits(gb, 7); if (seqhdr->layout != DAV2D_PIXEL_LAYOUT_I400 && (seqhdr->uvdc_dq_enabled || seqhdr->uvac_dq_enabled)) { // If the sequence header says that delta_q might be different // for U, V, we must check whether it actually is for this // frame. const int diff_uv_delta = seqhdr->separate_uv_delta_q ? 
dav2d_get_bit(gb) : 0; if (seqhdr->uvdc_dq_enabled && dav2d_get_bit(gb)) hdr->quant.udc_delta = dav2d_get_sbits(gb, 7); if (seqhdr->uvac_dq_enabled && dav2d_get_bit(gb)) hdr->quant.uac_delta = dav2d_get_sbits(gb, 7); if (diff_uv_delta) { if (seqhdr->uvdc_dq_enabled && dav2d_get_bit(gb)) hdr->quant.vdc_delta = dav2d_get_sbits(gb, 7); if (seqhdr->uvac_dq_enabled && dav2d_get_bit(gb)) hdr->quant.vac_delta = dav2d_get_sbits(gb, 7); } else { hdr->quant.vdc_delta = hdr->quant.udc_delta; hdr->quant.vac_delta = hdr->quant.uac_delta; } } hdr->secondary_ref_frame = DAV2D_PRIMARY_REF_NONE; if (IS_INTER_OR_SWITCH(hdr)) { int refs[2]; derive_pri_sec_ref(c, refs); if (!hdr->primary_ref_signaled) hdr->primary_ref_frame = refs[0]; if (hdr->primary_ref_frame != DAV2D_PRIMARY_REF_NONE) hdr->secondary_ref_frame = refs[refs[1] != hdr->primary_ref_frame]; } #if DEBUG_FRAME_HDR printf("HDR: post-quant[yac:%d,deltas=ydc:%d,uac:%d/dc:%d,vac:%d/dc:%d]: off=%td\n", hdr->quant.yac, hdr->quant.ydc_delta, hdr->quant.uac_delta, hdr->quant.udc_delta, hdr->quant.vac_delta, hdr->quant.vdc_delta, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // segmentation data hdr->segmentation.enabled = dav2d_get_bit(gb); if (hdr->segmentation.enabled) { if (seqhdr->segmentation.info_present && (!seqhdr->segmentation.adaptive || dav2d_get_bit(gb))) { hdr->segmentation.d = seqhdr->segmentation.d; } else { parse_seg_info(&hdr->segmentation.d, gb, 8 << seqhdr->segmentation.ext); } if (hdr->primary_ref_frame == DAV2D_PRIMARY_REF_NONE) { hdr->segmentation.update_map = 1; } else { hdr->segmentation.update_map = dav2d_get_bit(gb); if (hdr->segmentation.update_map && hdr->frame_type != DAV2D_FRAME_TYPE_SWITCH) { hdr->segmentation.temporal = dav2d_get_bit(gb); } } unsigned m = hdr->segmentation.d.skip_mask | hdr->segmentation.d.globalmv_mask; hdr->segmentation.preskip = !!m; m |= hdr->segmentation.d.delta_q_mask; hdr->segmentation.last_active_segid = m ? 
ulog2(m) : -1; } #if DEBUG_FRAME_HDR printf("HDR: post-segmentation[%d]: off=%td\n", hdr->segmentation.enabled, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->quant.qm.enabled = dav2d_get_bit(gb); if (hdr->quant.qm.enabled) { hdr->quant.qm.num = hdr->segmentation.enabled ? dav2d_get_bits(gb, 2) + 1 : 1; for (int n = 0; n < hdr->quant.qm.num; n++) { hdr->quant.qm.y[n] = dav2d_get_bits(gb, 4); if (seqhdr->layout != DAV2D_PIXEL_LAYOUT_I400) { if (dav2d_get_bit(gb)) { hdr->quant.qm.u[n] = hdr->quant.qm.v[n] = hdr->quant.qm.y[n]; } else { hdr->quant.qm.u[n] = dav2d_get_bits(gb, 4); hdr->quant.qm.v[n] = seqhdr->separate_uv_delta_q ? dav2d_get_bits(gb, 4) : hdr->quant.qm.u[n]; } } } } #if DEBUG_FRAME_HDR printf("HDR: post-qm[%d]: off=%td\n", hdr->quant.qm.enabled, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // delta q if (hdr->quant.yac) { hdr->delta.q.present = dav2d_get_bit(gb); if (hdr->delta.q.present) hdr->delta.q.res_log2 = dav2d_get_bits(gb, 2); } #if DEBUG_FRAME_HDR printf("HDR: post-delta_q[%d]: off=%td\n", hdr->delta.q.present, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // derive lossless flags const int delta_lossless = !hdr->quant.ydc_delta && !hdr->quant.udc_delta && !hdr->quant.uac_delta && !hdr->quant.vdc_delta && !hdr->quant.vac_delta; hdr->all_lossless = 1; hdr->any_lossless = 0; for (int i = 0; i < DAV2D_MAX_SEGMENTS; i++) { hdr->segmentation.qidx[i] = hdr->segmentation.enabled ? iclip_u8(hdr->quant.yac + hdr->segmentation.d.delta_q[i]) : hdr->quant.yac; hdr->segmentation.lossless[i] = !hdr->segmentation.qidx[i] && delta_lossless; hdr->all_lossless &= hdr->segmentation.lossless[i]; hdr->any_lossless |= hdr->segmentation.lossless[i]; // FIXME when using qm & segmentaiton, there are also some // bits here which qm to use per seg } if (!hdr->all_lossless) hdr->tcq = seqhdr->tcq == DAV2D_ADAPTIVE ? 
dav2d_get_bit(gb) : seqhdr->tcq; if (!hdr->all_lossless && !hdr->tcq && seqhdr->parity_hiding) hdr->parity_hiding = dav2d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-tcq_parity[tcq:%d,par:%d]: off=%td\n", hdr->tcq, hdr->parity_hiding, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // deblock if (!hdr->all_lossless) { if (hdr->frame_type == DAV2D_FRAME_TYPE_INTER && seqhdr->db_sub_pu) hdr->deblock.sub_pu = dav2d_get_bit(gb); hdr->deblock.level_y[0] = dav2d_get_bit(gb); hdr->deblock.level_y[1] = dav2d_get_bit(gb); if (seqhdr->layout != DAV2D_PIXEL_LAYOUT_I400 && (hdr->deblock.level_y[0] || hdr->deblock.level_y[1])) { hdr->deblock.level_u = dav2d_get_bit(gb); hdr->deblock.level_v = dav2d_get_bit(gb); } const int bits = seqhdr->df_par_bits, off = 1 << (bits - 1); if (hdr->deblock.level_y[0] && dav2d_get_bit(gb)) hdr->deblock.delta_q_y[0] = dav2d_get_bits(gb, bits) - off; if (hdr->deblock.level_y[1]) hdr->deblock.delta_q_y[1] = dav2d_get_bit(gb) ? (int)dav2d_get_bits(gb, bits) - off : hdr->deblock.delta_q_y[0]; if (hdr->deblock.level_u && dav2d_get_bit(gb)) hdr->deblock.delta_q_u = dav2d_get_bits(gb, bits) - off; if (hdr->deblock.level_v && dav2d_get_bit(gb)) hdr->deblock.delta_q_v = dav2d_get_bits(gb, bits) - off; } #if DEBUG_FRAME_HDR printf("HDR: post-deblock[lfsubpu:%d,y:%d|%d,u:%d,v:%d,dqy:%d|%d,dqu:%d,dqv:%d]: off=%td\n", hdr->deblock.sub_pu, hdr->deblock.level_y[0], hdr->deblock.level_y[1], hdr->deblock.level_u, hdr->deblock.level_v, hdr->deblock.delta_q_y[0], hdr->deblock.delta_q_y[1], hdr->deblock.delta_q_u, hdr->deblock.delta_q_v, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (!hdr->all_lossless && seqhdr->gdf /* && not large-scale tiles */) { const int gdf_bs = 128 << (hdr->sb128 == 2); hdr->gdf.enabled = seqhdr->reduced_still_picture_header || dav2d_get_bit(gb); if (hdr->gdf.enabled) { if (imax(hdr->width, hdr->height) > gdf_bs) hdr->gdf.enabled += dav2d_get_bit(gb); const int qp_base = IS_KEY_OR_INTRA(hdr) ? 
85 : 110; const int qp_diff = hdr->quant.yac - qp_base - 48 * seqhdr->hbd; const int qp_idx_offset = dav2d_get_bits(gb, 2); hdr->gdf.qp_idx = iclip((qp_diff - 37)/25, 0, 2) + qp_idx_offset; hdr->gdf.scale = dav2d_get_bits(gb, 2) + 1; } #if DEBUG_FRAME_HDR printf("HDR: post-gdf[%d]: off=%td\n", hdr->gdf.enabled, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } // cdef if (!hdr->all_lossless && seqhdr->cdef) { hdr->cdef.enabled = seqhdr->reduced_still_picture_header || dav2d_get_bit(gb); if (hdr->cdef.enabled) { hdr->cdef.damping = dav2d_get_bits(gb, 2) + 3; hdr->cdef.n_strengths = dav2d_get_bits(gb, 3) + 1; hdr->cdef.on_skiptx = seqhdr->cdef_on_skiptx == DAV2D_ADAPTIVE ? dav2d_get_bit(gb) : seqhdr->cdef_on_skiptx; for (int i = 0; i < hdr->cdef.n_strengths; i++) { hdr->cdef.y_strength[i] = dav2d_get_bits(gb, 6 - 4 * dav2d_get_bit(gb)); if (seqhdr->layout != DAV2D_PIXEL_LAYOUT_I400) hdr->cdef.uv_strength[i] = dav2d_get_bits(gb, 6 - 4 * dav2d_get_bit(gb)); } } #if DEBUG_FRAME_HDR printf("HDR: post-cdef[%d]: off=%td\n", hdr->cdef.enabled, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } const int n_bits = hdr->n_ref_frames <= 2 ? 
hdr->n_ref_frames - 1 : 1 + ulog2(hdr->n_ref_frames - 1); // restoration if (!hdr->all_lossless && seqhdr->restoration) { for (int p = 0; p < 3; p++) { const unsigned disable_mask = seqhdr->rst_disable_mask[!!p]; if (disable_mask == 0) { hdr->restoration.p[p].type = dav2d_get_bits(gb, 2); } else if (disable_mask == 3) { hdr->restoration.p[p].type = DAV2D_RESTORATION_NONE; } else { hdr->restoration.p[p].type = dav2d_get_bit(gb) * (3 - disable_mask); } if (hdr->restoration.p[p].type >= DAV2D_RESTORATION_NS_WIENER) { struct Dav2dNSWienerPlane *const pd = &hdr->restoration.p[p].ns; pd->frame_filters_on = dav2d_get_bit(gb); if (pd->frame_filters_on) { if (IS_INTER_OR_SWITCH(hdr)) pd->temporal = dav2d_get_bit(gb); if (pd->temporal) { int ref = 0; if (n_bits) { ref = hdr->restoration.p[p].ns.refidx = dav2d_get_bits(gb, n_bits); if (ref >= hdr->n_ref_frames) goto error; } const Dav2dFrameHeader *const refhdr = c->refs[hdr->refidx[ref]].p.p.frame_hdr; if (!refhdr) goto error; const struct Dav2dNSWienerPlane *rpd = &refhdr->restoration.p[p].ns; if (!rpd->frame_filters_on && p) rpd = &refhdr->restoration.p[3 - p].ns; // U <-> V if (!rpd->frame_filters_on) goto error; pd->num_classes_idx = rpd->num_classes_idx; pd->num_classes = rpd->num_classes; } else { const int val = dav2d_get_bits(gb, 3); pd->num_classes_idx = val; pd->num_classes = 1 + val + imax(val - 3, 0) + imax(val - 5, 0) * 2; } } else { pd->num_classes_idx = 0; pd->num_classes = 1; } } } hdr->restoration.unit_size[0] = 9; if (hdr->restoration.p[0].type) { if (dav2d_get_bit(gb)) { hdr->restoration.unit_size[0]--; } else if (hdr->sb128 < 2 && !dav2d_get_bit(gb)) { hdr->restoration.unit_size[0] -= 2 + (!hdr->sb128 && !dav2d_get_bit(gb)); } assert(hdr->restoration.unit_size[0] >= 6 + hdr->sb128); } const int ss = seqhdr->layout != DAV2D_PIXEL_LAYOUT_I444; hdr->restoration.unit_size[1] = 9 - ss; if (hdr->restoration.p[1].type || hdr->restoration.p[2].type) { if (dav2d_get_bit(gb)) { hdr->restoration.unit_size[1]--; } 
else if (hdr->sb128 < 2 && !dav2d_get_bit(gb)) { hdr->restoration.unit_size[1] -= 2 + (!hdr->sb128 && !dav2d_get_bit(gb)); } // this can trigger for 422 if (hdr->restoration.unit_size[1] < 6 - seqhdr->ss_ver) goto error; assert(hdr->restoration.unit_size[1] >= 6 + hdr->sb128 - imax(seqhdr->ss_hor, seqhdr->ss_ver)); } for (int p = 0; p < 3; p++) { int8_t ref_filters[48][18]; struct Dav2dNSWienerPlane *const pd = &hdr->restoration.p[p].ns; if (!pd->frame_filters_on) continue; const int n_feat = 16 + 2 * !!p; const int n_ref_filters = seqhdr->rst_disable_mask[!!p] & 1 ? 16 : 48 - pd->num_classes; if (pd->temporal) { const Dav2dFrameHeader *const ref_hdr = c->refs[hdr->refidx[pd->refidx]].p.p.frame_hdr; const struct Dav2dNSWienerPlane *rpd = &ref_hdr->restoration.p[p].ns; if (!rpd->frame_filters_on) { assert(p); rpd = &ref_hdr->restoration.p[3 - p].ns; } assert(rpd->frame_filters_on); for (int n = 0; n < pd->num_classes; n++) memcpy(pd->filter[n], rpd->filter[n], n_feat); continue; } int i = 0; for (int r = 0; r < hdr->n_ref_frames; r++) { const Dav2dFrameHeader *const ref_hdr = c->refs[hdr->refidx[r]].p.p.frame_hdr; for (int dir = (const int8_t[]){ 0, +1, -1 }[p], p2 = p;; p2 += dir, dir = 0) { const struct Dav2dNSWienerPlane *const rpd = &ref_hdr->restoration.p[p2].ns; if (rpd->frame_filters_on) { const int n_classes = imin(n_ref_filters - i, rpd->num_classes); for (int n = 0; n < n_classes; n++) memcpy(ref_filters[i++], rpd->filter[n], n_feat); } if (!dir) break; } } const int n_filters = seqhdr->rst_disable_mask[!!p] & 1 ? 
16 : 64; const int n_classes = pd->num_classes; uint8_t grp_cnt[3], grp_ref_cnt[3] = { 0 }; assert(n_classes > 0); grp_cnt[0] = n_classes; grp_cnt[1] = i; grp_cnt[2] = n_filters - (grp_cnt[0] + grp_cnt[1]); uint8_t filter_refs[64]; int pred_grp = 2 - (grp_cnt[1] > 2); const int nnz_grps = 1 + !!grp_cnt[1] + !!grp_cnt[2]; for (int n = 0; n < n_classes; n++) { int group; if (nnz_grps == 1 || !dav2d_get_bit(gb)) { group = pred_grp; } else if (nnz_grps == 2) { group = 2 - !grp_cnt[2] - pred_grp; } else if (dav2d_get_bit(gb)) { group = 2 - (pred_grp == 2); } else { group = pred_grp == 0; } if (++grp_ref_cnt[group] + (group < pred_grp) > grp_ref_cnt[pred_grp]) pred_grp = group; const int base = grp_cnt[0] * !!group + grp_cnt[1] * (group == 2); const int range = group ? grp_cnt[group] : n + 1; filter_refs[n] = base + (range == 1 ? 0 : dav2d_get_bits_subexp_u(gb, range >> 1, range, 4)); } unsigned exact_match_mask = 0; // FIXME use dav2d_get_bits() for (int n = 0, mask = 1; n < n_classes; n++, mask <<= 1) { exact_match_mask |= mask * dav2d_get_bit(gb); } const unsigned *const masks = p ? dav2d_subset_masks_uv : dav2d_subset_masks_y; const int8_t (*const cf_range)[2] = p ? dav2d_ns_wiener_coef_range_uv : dav2d_ns_wiener_coef_range_y; static const uint8_t shuffled_index[] = { 16, 7, 58, 21, 12, 61, 26, 38, 18, 30, 50, 45, 23, 49, 43, 62, 42, 54, 27, 36, 17, 44, 32, 34, 4, 24, 52, 31, 37, 11, 33, 19, 35, 6, 22, 53, 63, 25, 41, 47, 1, 59, 0, 28, 40, 55, 48, 8, 5, 51, 9, 46, 56, 60, 15, 2, 13, 14, 57, 29, 3, 20, 39, 10 }; static const int8_t zero[18] = { 0 }; for (int n = 0; n < n_classes; n++, exact_match_mask >>= 1) { const int r = filter_refs[n]; int8_t *const filter = hdr->restoration.p[p].ns.filter[n]; const int8_t *const ref_filter = !r ? zero : r < n_classes ? hdr->restoration.p[p].ns.filter[r - 1] : r < n_classes + grp_cnt[1] ? 
ref_filters[r - n_classes] : dav2d_wiener_ns_filters[shuffled_index[r - n_classes - grp_cnt[1]]]; if (exact_match_mask & 1) { memcpy(filter, ref_filter, 16 + 2 * !!p); continue; } memset(filter, 0, 16 + !!p * 2); int s; for (s = 0; s < 3 - !!p; s++) { const int found = dav2d_get_bit(gb); if (!found) break; } const unsigned mask = masks[s]; // FIXME read sym bit (chroma only) if ref filter subset "s" is // assymetric and has space for (int i = 0, m = mask; i < 16 + !!p * 2; i++, m >>= 1) { if (!(m & 1)) continue; const int nbits = cf_range[i][0]; filter[i] = (int) dav2d_get_bits_subexp_u(gb, ref_filter[i] - cf_range[i][1], 1 << nbits, nbits - 3) + cf_range[i][1]; // FIXME if sym is set and this coef is assymetric, insert an // extra coef here } } } #if DEBUG_FRAME_HDR printf("HDR: post-restoration[y:%d,u:%d,v:%d]: off=%td\n", hdr->restoration.p[0].type, hdr->restoration.p[1].type, hdr->restoration.p[2].type, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } if (!hdr->all_lossless && seqhdr->ccso) { hdr->ccso.enabled = seqhdr->reduced_still_picture_header || dav2d_get_bit(gb); if (hdr->ccso.enabled) { const int n_planes = seqhdr->layout == DAV2D_PIXEL_LAYOUT_I400 ? 
1 : 3; for (int p = 0; p < n_planes; p++) { hdr->ccso.p[p].enabled = dav2d_get_bit(gb); if (!hdr->ccso.p[p].enabled) continue; if (IS_INTER_OR_SWITCH(hdr)) { hdr->ccso.p[p].reuse = dav2d_get_bit(gb); hdr->ccso.p[p].sb_reuse = dav2d_get_bit(gb); if (hdr->ccso.p[p].reuse || hdr->ccso.p[p].sb_reuse) { int ref = 0; if (n_bits) { hdr->ccso.p[p].refidx = ref = dav2d_get_bits(gb, n_bits); if (hdr->ccso.p[p].refidx >= hdr->n_ref_frames) goto error; } const Dav2dFrameHeader *const refhdr = c->refs[hdr->refidx[ref]].p.p.frame_hdr; if (!refhdr) goto error; if (hdr->ccso.p[p].reuse) { const int w4 = (hdr->width + 3) >> 2; const int h4 = (hdr->height + 3) >> 2; const int rw4 = (refhdr->width + 3) >> 2; const int rh4 = (refhdr->height + 3) >> 2; if (w4 != rw4 || h4 != rh4 || !refhdr->ccso.p[p].enabled) goto error; } } } if (!hdr->ccso.p[p].reuse) { hdr->ccso.p[p].bo_only = dav2d_get_bit(gb); const int si = hdr->ccso.p[p].scale_idx = dav2d_get_bits(gb, 2); if (hdr->ccso.p[p].bo_only) { hdr->ccso.p[p].max_band_log2 = dav2d_get_bits(gb, 3); } else { const int qi = hdr->ccso.p[p].quant_idx = dav2d_get_bits(gb, 2); hdr->ccso.p[p].ext_filter_support = dav2d_get_bits(gb, 3); if (hdr->ccso.p[p].ext_filter_support == 7) goto error; if (dav2d_ccso_quant_sz[si][qi]) hdr->ccso.p[p].edge_clf = dav2d_get_bit(gb); hdr->ccso.p[p].max_band_log2 = dav2d_get_bits(gb, 2); } const int n_edge_off_intervals = hdr->ccso.p[p].bo_only ? 
1 : 3 - hdr->ccso.p[p].edge_clf; const int max_band = 1 << hdr->ccso.p[p].max_band_log2; memset(hdr->ccso.p[p].filter_off, 0, sizeof(hdr->ccso.p[p].filter_off)); for (int n = 0; n < n_edge_off_intervals; n++) { uint8_t *filter_off = &hdr->ccso.p[p].filter_off[n * 16]; for (int m = 0; m < n_edge_off_intervals; m++, filter_off += 4) { for (int o = 0; o < max_band; o++) { int off = 0; for (; off < 7; off++) if (!dav2d_get_bit(gb)) break; filter_off[o >> 1] |= off << (4 * (o & 1)); } } } } else { const Dav2dFrameHeader *const refhdr = c->refs[hdr->refidx[hdr->ccso.p[p].refidx]].p.p.frame_hdr; memcpy(&hdr->ccso.p[p].bo_only, &refhdr->ccso.p[p].bo_only, sizeof(hdr->ccso.p[p]) - (offsetof(Dav2dFrameHeader, ccso.p[p].bo_only) - offsetof(Dav2dFrameHeader, ccso.p[p].enabled))); } } } #if DEBUG_FRAME_HDR printf("HDR: post-ccso[%d]: off=%td\n", hdr->ccso.enabled, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } if (!hdr->all_lossless) hdr->txfm_mode = dav2d_get_bit(gb) ? DAV2D_TX_SWITCHABLE : DAV2D_TX_LARGEST; if (IS_INTER_OR_SWITCH(hdr)) { hdr->switchable_comp_refs = dav2d_get_bit(gb); hdr->skip_mode_enabled = dav2d_get_bit(gb); if (seqhdr->bawp) hdr->bawp = dav2d_get_bit(gb); if (seqhdr->motion_modes & (1 << MM_WARP_DELTA)) hdr->warp_motion = dav2d_get_bit(gb); } hdr->reduced_txtp_set = dav2d_get_bits(gb, 2); #if DEBUG_FRAME_HDR printf("HDR: post-modebits[tx:%d,refmode:%d,skipmode:%d,bawp:%d,warp:%d,redtxset:%d]: off=%td\n", hdr->txfm_mode, hdr->switchable_comp_refs, hdr->skip_mode_enabled, hdr->bawp, hdr->warp_motion, hdr->reduced_txtp_set, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif for (int i = 0; i < 7; i++) hdr->gmv.m[i] = dav2d_default_wm_params; if (IS_INTER_OR_SWITCH(hdr)) { if (seqhdr->global_motion && dav2d_get_bit(gb)) { hdr->gmv.ref = dav2d_get_uniform(gb, hdr->n_ref_frames + 1); const int32_t *ref_base_mat; int in_dist; if (hdr->gmv.ref == hdr->n_ref_frames) { ref_base_mat = dav2d_default_wm_params.matrix; in_dist = 1; } else { const int refidx = 
hdr->refidx[hdr->gmv.ref]; const Dav2dFrameHeader *const refhdr = c->refs[refidx].p.p.frame_hdr; if (!refhdr->n_ref_frames) { ref_base_mat = dav2d_default_wm_params.matrix; in_dist = 1; } else { hdr->gmv.refref = refhdr->n_ref_frames == 1 ? 0 : dav2d_get_uniform(gb, refhdr->n_ref_frames); ref_base_mat = refhdr->gmv.m[hdr->gmv.refref].matrix; in_dist = get_poc_diff(seqhdr->order_hint_n_bits, refhdr->frame_offset, c->refs[refidx].refpoc[hdr->gmv.refref]); } } for (int i = 0; i < hdr->n_ref_frames; i++) { hdr->gmv.m[i].type = !dav2d_get_bit(gb) ? DAV2D_WM_TYPE_IDENTITY : dav2d_get_bit(gb) ? DAV2D_WM_TYPE_ROT_ZOOM : DAV2D_WM_TYPE_AFFINE; if (hdr->gmv.m[i].type == DAV2D_WM_TYPE_IDENTITY) continue; int32_t *const mat = hdr->gmv.m[i].matrix; int32_t ref_mat[6]; const int out_dist = get_poc_diff(seqhdr->order_hint_n_bits, hdr->frame_offset, c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset); rescale_matrix(ref_mat, ref_base_mat, in_dist, out_dist); if (hdr->gmv.m[i].type >= DAV2D_WM_TYPE_ROT_ZOOM) { mat[2] = (1 << 16) + 64 * dav2d_get_bits_subexp(gb, (ref_mat[2] - (1 << 16)) >> 6, 512); mat[3] = 64 * dav2d_get_bits_subexp(gb, ref_mat[3] >> 6, 512); } if (hdr->gmv.m[i].type == DAV2D_WM_TYPE_AFFINE) { mat[4] = 64 * dav2d_get_bits_subexp(gb, ref_mat[4] >> 6, 512); mat[5] = (1 << 16) + 64 * dav2d_get_bits_subexp(gb, (ref_mat[5] - (1 << 16)) >> 6, 512); } else { mat[4] = -mat[3]; mat[5] = mat[2]; } mat[0] = dav2d_get_bits_subexp(gb, ref_mat[0] >> 13, 0x4000) * 8192; mat[1] = dav2d_get_bits_subexp(gb, ref_mat[1] >> 13, 0x4000) * 8192; #if DEBUG_FRAME_HDR printf("HDR: post-gmv[%d]matrix[%d,%d|%d,%d,%d,%d,t=%d]: off=%td\n", i, mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], hdr->gmv.m[i].type, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } } #if DEBUG_FRAME_HDR printf("HDR: post-gmv: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } grain: if (seqhdr->film_grain_present && (hdr->show_immediate || hdr->show_implicit)) { hdr->film_grain.present = 
seqhdr->reduced_still_picture_header || dav2d_get_bit(gb); if (hdr->film_grain.present) { hdr->film_grain.id = dav2d_get_bits(gb, 3); hdr->film_grain.seed = dav2d_get_bits(gb, 16); } #if DEBUG_FRAME_HDR printf("HDR: post-filmgrain[%d]: off=%td\n", hdr->film_grain.present, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } return 0; error: dav2d_log(c, "Error parsing frame header\n"); return DAV2D_ERR(EINVAL); } static int parse_fgm_hdr(Dav2dContext *const c, GetBits *const gb) { #define DEBUG_FGM_HDR 0 #if DEBUG_FGM_HDR const uint8_t *const init_ptr = gb->ptr; #endif const unsigned mask = dav2d_get_bits(gb, 8); enum Dav2dPixelLayout layout = dav2d_get_vlc(gb); if (layout > 3) goto error; #if DEBUG_FGM_HDR printf("FGM: post-init[mask=0x%x,layout=%d]: off=%td\n", mask, layout, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif layout = dav2d_layouts[layout]; if (layout != c->seq_hdr->layout) goto error; for (int idx = 0, m = 1; idx < 8; idx++, m <<= 1) { if (!(mask & m)) continue; if (c->fgm[idx]) dav2d_ref_dec(&c->fgm[idx]); c->fgm[idx] = dav2d_ref_create_using_pool(c->fgm_pool, sizeof(Dav2dFilmGrainData)); Dav2dFilmGrainData *const fgd = c->fgm[idx]->data; memset(fgd, 0, sizeof(*fgd)); int num_pl = 1; if (layout != DAV2D_PIXEL_LAYOUT_I400) { fgd->chroma_scaling_from_luma = dav2d_get_bit(gb); if (!fgd->chroma_scaling_from_luma) num_pl = 3; } for (int pl = 0; pl < num_pl; pl++) { fgd->num_points[pl] = dav2d_get_bits(gb, 4); if (fgd->num_points[pl] > 14) goto error; if (!fgd->num_points[pl]) continue; const int index_bits = 1 + dav2d_get_bits(gb, 3); const int scaling_bits = 5 + dav2d_get_bits(gb, 2); for (int i = 0, base = 0; i < fgd->num_points[pl]; i++) { base += dav2d_get_bits(gb, index_bits); if (base > 255) goto error; fgd->points[pl][i][0] = base; fgd->points[pl][i][1] = dav2d_get_bits(gb, scaling_bits); } #if DEBUG_FGM_HDR printf("FGM: post-scaling_points[id=%d,pl=%d,cnt=%d,bits=%d|%d]: off=%td\n", idx, pl, fgd->num_points[pl], index_bits, scaling_bits, 
(gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } if (layout == DAV2D_PIXEL_LAYOUT_I420 && !!fgd->num_points[1] != !!fgd->num_points[2]) { goto error; } fgd->scaling_shift = dav2d_get_bits(gb, 2) + 8; fgd->ar_coeff_lag = dav2d_get_bits(gb, 2); const int num_pos = 2 * fgd->ar_coeff_lag * (fgd->ar_coeff_lag + 1); for (int pl = 0; pl < 3; pl++) { if (!fgd->num_points[pl] && (!pl || !fgd->chroma_scaling_from_luma)) continue; // chroma has one more point const int num_pl_pos = num_pos + !!pl * !!fgd->num_points[0]; const int coef_bits = 5 + dav2d_get_bits(gb, 2); for (int i = 0; i < num_pl_pos; i++) fgd->ar_coeffs[pl][i] = dav2d_get_bits(gb, coef_bits) - 128; #if DEBUG_FGM_HDR printf("FGM: post-ar_coefs[id=%d,pl=%d,cnt=%d->%d,bits=%d]: off=%td\n", idx, pl, fgd->ar_coeff_lag, num_pl_pos, coef_bits, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } fgd->ar_coeff_shift = dav2d_get_bits(gb, 2) + 6; fgd->grain_scale_shift = dav2d_get_bits(gb, 2); for (int pl = 0; pl < 2; pl++) { if (!fgd->num_points[1 + pl]) continue; fgd->uv_mult[pl] = dav2d_get_bits(gb, 8) - 128; fgd->uv_luma_mult[pl] = dav2d_get_bits(gb, 8) - 128; fgd->uv_offset[pl] = dav2d_get_bits(gb, 9) - 256; } fgd->overlap_flag = dav2d_get_bit(gb); fgd->clip_to_restricted_range = dav2d_get_bit(gb); if (fgd->clip_to_restricted_range) fgd->mc_identity = dav2d_get_bit(gb); fgd->block_size = dav2d_get_bit(gb); #if DEBUG_FGM_HDR printf("FGM: post-data[id=%d,sh=%d|%"PRIu64"|%d,uvm=%d|%d|%d|%d|%d|%d," "overlap=%d,clip=%d,mcid=%d,bs=%d]: off=%td\n", idx, fgd->scaling_shift, fgd->ar_coeff_shift, fgd->grain_scale_shift, fgd->uv_mult[0], fgd->uv_luma_mult[0], fgd->uv_offset[0], fgd->uv_mult[1], fgd->uv_luma_mult[1], fgd->uv_offset[1], fgd->overlap_flag, fgd->clip_to_restricted_range, fgd->mc_identity, fgd->block_size, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } return 0; error: return -1; } static int parse_ci_hdr(Dav2dContext *const c, GetBits *const gb) { #define DEBUG_CI_HDR 0 #if DEBUG_CI_HDR const uint8_t 
*const init_ptr = gb->ptr; #endif Dav2dContentInterpretation *const ci = c->ci_ref->data; memset(ci, 0, sizeof(*ci)); ci->scan_type = dav2d_get_bits(gb, 2); ci->color_description_present = dav2d_get_bit(gb); ci->chroma_sample_position_present = dav2d_get_bit(gb); ci->aspect_ratio_info_present = dav2d_get_bit(gb); ci->timing_info_present = dav2d_get_bit(gb); ci->extension_present = dav2d_get_bit(gb); dav2d_get_bit(gb); // reserved #if DEBUG_CI_HDR printf("CI: post-flags[scan=%d,colordesc=%d,chrsamplepos=%d," "aspectratio=%d,timinginfo=%d,extension=%d]: off=%td\n", ci->scan_type, ci->color_description_present, ci->chroma_sample_position_present, ci->aspect_ratio_info_present, ci->timing_info_present, ci->extension_present, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (ci->color_description_present) { ci->color.type = dav2d_get_golomb(gb, 2); switch (ci->color.type) { case DAV2D_COLOR_DESC_EXPLICIT: ci->color.pri = dav2d_get_bits(gb, 8); ci->color.trc = dav2d_get_bits(gb, 8); ci->color.mtrx = dav2d_get_bits(gb, 8); break; case DAV2D_COLOR_DESC_BT709SDR: ci->color.pri = DAV2D_COLOR_PRI_BT709; ci->color.trc = DAV2D_TRC_BT709; ci->color.mtrx = DAV2D_MC_BT470BG; break; case DAV2D_COLOR_DESC_BT2100PQ: ci->color.pri = DAV2D_COLOR_PRI_BT2020; ci->color.trc = DAV2D_TRC_SMPTE2084; ci->color.mtrx = DAV2D_MC_BT2020_NCL; break; case DAV2D_COLOR_DESC_BT2100HLG: ci->color.pri = DAV2D_COLOR_PRI_BT2020; ci->color.trc = DAV2D_TRC_BT2020_10BIT; ci->color.mtrx = DAV2D_MC_BT2020_NCL; break; case DAV2D_COLOR_DESC_SRGB: ci->color.pri = DAV2D_COLOR_PRI_BT709; ci->color.trc = DAV2D_TRC_SRGB; ci->color.mtrx = DAV2D_MC_IDENTITY; break; case DAV2D_COLOR_DESC_SRGBSYCC: ci->color.pri = DAV2D_COLOR_PRI_BT709; ci->color.trc = DAV2D_TRC_SRGB; ci->color.mtrx = DAV2D_MC_BT470BG; break; default: ci->color.pri = DAV2D_COLOR_PRI_UNKNOWN; ci->color.trc = DAV2D_TRC_UNKNOWN; ci->color.mtrx = DAV2D_MC_UNKNOWN; break; } ci->color.range = dav2d_get_bit(gb); #if DEBUG_CI_HDR printf("CI: 
post-colordesc[id=%d,pri=%d,trc=%d,mtrx=%d,rng=%d]: off=%td\n", ci->color.type, ci->color.pri, ci->color.trc, ci->color.mtrx, ci->color.range, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } else { ci->color.pri = DAV2D_COLOR_PRI_UNKNOWN; ci->color.mtrx = DAV2D_MC_UNKNOWN; ci->color.trc = DAV2D_TRC_UNKNOWN; } if (ci->chroma_sample_position_present) { ci->chr[0] = dav2d_get_vlc(gb); ci->chr[1] = ci->scan_type == DAV2D_SCAN_TYPE_PROGRESSIVE ? ci->chr[0] : dav2d_get_vlc(gb); #if DEBUG_CI_HDR printf("CI: post-chromasampleposition[chr=%d/%d]: off=%td\n", ci->chr[0], ci->chr[1], (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } else { ci->chr[0] = ci->chr[1] = DAV2D_CHR_UNKNOWN; } if (ci->aspect_ratio_info_present) { ci->sar.type = dav2d_get_bits(gb, 8); switch (ci->sar.type) { case DAV2D_SAR_UNKNOWN: break; #define case_sar(width, height) \ case DAV2D_SAR_##width##_##height: \ ci->sar.w = width; \ ci->sar.h = height; \ break case_sar(1, 1); case_sar(12, 11); case_sar(10, 11); case_sar(16, 11); case_sar(40, 33); case_sar(24, 11); case_sar(20, 11); case_sar(32, 11); case_sar(80, 33); case_sar(18, 11); case_sar(15, 11); case_sar(64, 33); case_sar(160, 99); case_sar(4, 3); case_sar(3, 2); case_sar(2, 1); #undef case_sar case DAV2D_SAR_EXPLICIT: ci->sar.w = dav2d_get_vlc(gb); ci->sar.h = dav2d_get_vlc(gb); break; default: goto error; } #if DEBUG_CI_HDR printf("CI: post-sampleaspectratio[id=%d,sar=%d:%d]: off=%td\n", ci->sar.type, ci->sar.w, ci->sar.h, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } if (ci->timing_info_present) { ci->timing.num_units_in_display_tick = dav2d_get_bits(gb, 32); ci->timing.time_scale = dav2d_get_bits(gb, 32); if (!ci->timing.num_units_in_display_tick || !ci->timing.time_scale) goto error; ci->timing.equal_elemental_interval = dav2d_get_bit(gb); if (ci->timing.equal_elemental_interval) { const unsigned t = dav2d_get_vlc(gb); if (t == ~0U) goto error; ci->timing.num_ticks_per_elemental_duration = t + 1; } #if DEBUG_CI_HDR printf("CI: 
post-timinginfo[nuidt:%d,ts:%d,eei:%d,ntped:%d]: off=%td\n", ci->timing.num_units_in_display_tick, ci->timing.time_scale, ci->timing.equal_elemental_interval, ci->timing.num_ticks_per_elemental_duration, (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif } return 0; error: return -1; } static void parse_tile_hdr(Dav2dContext *const c, GetBits *const gb) { const int n_tiles = c->frame_hdr->tiling.t.cols * c->frame_hdr->tiling.t.rows; const int have_tile_pos = n_tiles > 1 ? dav2d_get_bit(gb) : 0; if (have_tile_pos) { const int n_bits = c->frame_hdr->tiling.t.log2_cols + c->frame_hdr->tiling.t.log2_rows; c->tile[c->n_tile_data].start = dav2d_get_bits(gb, n_bits); c->tile[c->n_tile_data].end = dav2d_get_bits(gb, n_bits); } else { c->tile[c->n_tile_data].start = 0; c->tile[c->n_tile_data].end = n_tiles - 1; } } ptrdiff_t dav2d_parse_obus(Dav2dContext *const c, Dav2dData *const in) { GetBits gb; int res; dav2d_init_get_bits(&gb, in->data, in->sz); // length field const size_t len = dav2d_get_uleb128(&gb); dav2d_bytealign_get_bits(&gb); if (len > (size_t)(gb.ptr_end - gb.ptr)) goto error; gb.ptr_end = gb.ptr + len; const int has_extension = dav2d_get_bit(&gb); const enum Dav2dObuType type = dav2d_get_bits(&gb, 5); const int tlayer_id = dav2d_get_bits(&gb, 2); int mlayer_id = 0, xlayer_id = 0; if (has_extension) { mlayer_id = dav2d_get_bits(&gb, 3); xlayer_id = dav2d_get_bits(&gb, 5); } if (gb.error) goto error; // We must have read a whole number of bytes at this point (1 byte // for the header and whole bytes at a time when reading the // leb128 length field). 
assert(gb.bits_left == 0); // skip obu not belonging to the selected temporal/spatial layer if (type != DAV2D_OBU_SEQ_HDR && type != DAV2D_OBU_TD && has_extension && c->operating_point_idc != 0) { const int in_temporal_layer = 1; //(c->operating_point_idc >> temporal_id) & 1; const int in_spatial_layer = 1; //(c->operating_point_idc >> (spatial_id + 8)) & 1; if (!in_temporal_layer || !in_spatial_layer) return gb.ptr_end - gb.ptr_start; } #define DEBUG_OBU_HDR 0 if (DEBUG_OBU_HDR) printf("OBU type=%d size=%td\n", type, gb.ptr_end - gb.ptr); switch (type) { case DAV2D_OBU_SEQ_HDR: { Dav2dRef *ref = dav2d_ref_create_using_pool(c->seq_hdr_pool, sizeof(Dav2dSequenceHeader)); if (!ref) return DAV2D_ERR(ENOMEM); Dav2dSequenceHeader *seq_hdr = ref->data; if ((res = parse_seq_hdr(seq_hdr, &gb, c->strict_std_compliance)) < 0) { dav2d_log(c, "Error parsing sequence header\n"); dav2d_ref_dec(&ref); goto error; } const int op_idx = 0; //c->operating_point < seq_hdr->num_operating_points ? c->operating_point : 0; c->operating_point_idc = 0;//seq_hdr->operating_points[op_idx].idc; const unsigned spatial_mask = c->operating_point_idc >> 8; c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; // If we have read a sequence header which is different from // the old one, this is a new video sequence and can't use any // previous state. Free that state. 
if (!c->seq_hdr) { c->frame_hdr = NULL; #if 0 c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE; #endif } else if (memcmp(seq_hdr, c->seq_hdr, sizeof(Dav2dSequenceHeader))) { c->frame_hdr = NULL; c->mastering_display = NULL; c->content_light = NULL; dav2d_ref_dec(&c->mastering_display_ref); dav2d_ref_dec(&c->content_light_ref); for (int i = 0; i < 8; i++) { if (c->refs[i].p.p.frame_hdr) dav2d_thread_picture_unref(&c->refs[i].p); dav2d_ref_dec(&c->refs[i].segmap); dav2d_ref_dec(&c->refs[i].refmvs); dav2d_cdf_thread_unref(&c->cdf[i]); dav2d_ref_dec(&c->fgm[i]); } dav2d_ref_dec(&c->ci_ref); #if 0 c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE; // If operating_parameter_info changed, signal it } else if (memcmp(seq_hdr->operating_parameter_info, c->seq_hdr->operating_parameter_info, sizeof(seq_hdr->operating_parameter_info))) { c->frame_flags |= PICTURE_FLAG_NEW_OP_PARAMS_INFO; #endif } dav2d_ref_dec(&c->seq_hdr_ref); c->seq_hdr_ref = ref; c->seq_hdr = seq_hdr; break; } case DAV2D_OBU_OPEN_LOOP_KF: case DAV2D_OBU_CLOSED_LOOP_KF: case DAV2D_OBU_LEADING_TILE_GRP: case DAV2D_OBU_TILE_GRP: case DAV2D_OBU_SWITCH: case DAV2D_OBU_LEADING_SEF: case DAV2D_OBU_SEF: case DAV2D_OBU_LEADING_TIP: case DAV2D_OBU_TIP: case DAV2D_OBU_BRIDGE: case DAV2D_OBU_RAS: { if (!c->seq_hdr) goto error; if (!c->frame_hdr_ref) { c->frame_hdr_ref = dav2d_ref_create_using_pool(c->frame_hdr_pool, sizeof(Dav2dFrameHeader)); if (!c->frame_hdr_ref) return DAV2D_ERR(ENOMEM); } #ifndef NDEBUG // ensure that the reference is writable assert(dav2d_ref_is_writable(c->frame_hdr_ref)); #endif c->frame_hdr = c->frame_hdr_ref->data; memset(c->frame_hdr, 0, sizeof(*c->frame_hdr)); c->frame_hdr->tlayer_id = tlayer_id; c->frame_hdr->mlayer_id = mlayer_id; c->frame_hdr->xlayer_id = xlayer_id; const int first_tile = type == DAV2D_OBU_SEF || type == DAV2D_OBU_TIP || type == DAV2D_OBU_BRIDGE || dav2d_get_bit(&gb); const int has_hdr = first_tile || dav2d_get_bit(&gb); // FIXME if not first tile, we can skip re-parsing the header 
and // instead skip the header data and move on to block data directly if (has_hdr && (res = parse_frame_hdr(c, &gb, type)) < 0) { c->frame_hdr = NULL; goto error; } for (int n = 0; n < c->n_tile_data; n++) dav2d_data_unref_internal(&c->tile[n].data); c->n_tile_data = 0; c->n_tiles = 0; if (type == DAV2D_OBU_SEF || type == DAV2D_OBU_TIP || type == DAV2D_OBU_BRIDGE /* || bru frame inactive */) { // This is actually a frame header OBU so read the // trailing bit and check for overrun. if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) { c->frame_hdr = NULL; goto error; } } if (c->frame_size_limit && (int64_t)c->frame_hdr->width * c->frame_hdr->height > c->frame_size_limit) { dav2d_log(c, "Frame size %dx%d exceeds limit %u\n", c->frame_hdr->width, c->frame_hdr->height, c->frame_size_limit); c->frame_hdr = NULL; return DAV2D_ERR(ERANGE); } if (type == DAV2D_OBU_SEF || type == DAV2D_OBU_TIP || type == DAV2D_OBU_BRIDGE /* || bru frame inactive */) { break; } if (c->n_tile_data_alloc < c->n_tile_data + 1) { if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error; struct Dav2dTileGroup *tile = dav2d_realloc(ALLOC_TILE, c->tile, (c->n_tile_data + 1) * sizeof(*c->tile)); if (!tile) goto error; c->tile = tile; memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile)); c->n_tile_data_alloc = c->n_tile_data + 1; } parse_tile_hdr(c, &gb); // Align to the next byte boundary and check for overrun. 
dav2d_bytealign_get_bits(&gb); if (gb.error) goto error; dav2d_data_ref(&c->tile[c->n_tile_data].data, in); c->tile[c->n_tile_data].data.data = gb.ptr; c->tile[c->n_tile_data].data.sz = (size_t)(gb.ptr_end - gb.ptr); // ensure tile groups are in order and sane, see 6.10.1 if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end || c->tile[c->n_tile_data].start != c->n_tiles) { for (int i = 0; i <= c->n_tile_data; i++) dav2d_data_unref_internal(&c->tile[i].data); c->n_tile_data = 0; c->n_tiles = 0; goto error; } c->n_tiles += 1 + c->tile[c->n_tile_data].end - c->tile[c->n_tile_data].start; c->n_tile_data++; break; } case DAV2D_OBU_FGM: { parse_fgm_hdr(c, &gb); if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) goto error; break; } case DAV2D_OBU_CONTENT_INTERP: { if (c->ci_ref) dav2d_ref_dec(&c->ci_ref); c->ci_ref = dav2d_ref_create_using_pool(c->ci_pool, sizeof(Dav2dContentInterpretation)); parse_ci_hdr(c, &gb); if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) goto error; break; } case DAV2D_OBU_METADATA: { #define DEBUG_OBU_METADATA 0 #if DEBUG_OBU_METADATA const uint8_t *const init_ptr = gb.ptr; #endif // obu metadta type field const enum ObuMetaType meta_type = dav2d_get_uleb128(&gb); if (gb.error) goto error; switch (meta_type) { case OBU_META_HDR_CLL: { Dav2dRef *ref = dav2d_ref_create(ALLOC_OBU_META, sizeof(Dav2dContentLightLevel)); if (!ref) return DAV2D_ERR(ENOMEM); Dav2dContentLightLevel *const content_light = ref->data; content_light->max_content_light_level = dav2d_get_bits(&gb, 16); #if DEBUG_OBU_METADATA printf("CLLOBU: max-content-light-level: %d [off=%td]\n", content_light->max_content_light_level, (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif content_light->max_frame_average_light_level = dav2d_get_bits(&gb, 16); #if DEBUG_OBU_METADATA printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n", content_light->max_frame_average_light_level, (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif if (check_trailing_bits(&gb, 
c->strict_std_compliance) < 0) { dav2d_ref_dec(&ref); goto error; } dav2d_ref_dec(&c->content_light_ref); c->content_light = content_light; c->content_light_ref = ref; break; } case OBU_META_HDR_MDCV: { Dav2dRef *ref = dav2d_ref_create(ALLOC_OBU_META, sizeof(Dav2dMasteringDisplay)); if (!ref) return DAV2D_ERR(ENOMEM); Dav2dMasteringDisplay *const mastering_display = ref->data; for (int i = 0; i < 3; i++) { mastering_display->primaries[i][0] = dav2d_get_bits(&gb, 16); mastering_display->primaries[i][1] = dav2d_get_bits(&gb, 16); #if DEBUG_OBU_METADATA printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i, mastering_display->primaries[i][0], mastering_display->primaries[i][1], (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif } mastering_display->white_point[0] = dav2d_get_bits(&gb, 16); #if DEBUG_OBU_METADATA printf("MDCVOBU: white-point-x: %d [off=%td]\n", mastering_display->white_point[0], (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif mastering_display->white_point[1] = dav2d_get_bits(&gb, 16); #if DEBUG_OBU_METADATA printf("MDCVOBU: white-point-y: %d [off=%td]\n", mastering_display->white_point[1], (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif mastering_display->max_luminance = dav2d_get_bits(&gb, 32); #if DEBUG_OBU_METADATA printf("MDCVOBU: max-luminance: %d [off=%td]\n", mastering_display->max_luminance, (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif mastering_display->min_luminance = dav2d_get_bits(&gb, 32); #if DEBUG_OBU_METADATA printf("MDCVOBU: min-luminance: %d [off=%td]\n", mastering_display->min_luminance, (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) { dav2d_ref_dec(&ref); goto error; } dav2d_ref_dec(&c->mastering_display_ref); c->mastering_display = mastering_display; c->mastering_display_ref = ref; break; } case OBU_META_ITUT_T35: { ptrdiff_t payload_size = gb.ptr_end - gb.ptr; // Don't take into account all the trailing bits for payload_size while (payload_size > 0 && 
!gb.ptr[payload_size - 1]) payload_size--; // trailing_zero_bit x 8 payload_size--; // trailing_one_bit + trailing_zero_bit x 7 int country_code_extension_byte = 0; const int country_code = dav2d_get_bits(&gb, 8); payload_size--; if (country_code == 0xFF) { country_code_extension_byte = dav2d_get_bits(&gb, 8); payload_size--; } if (payload_size <= 0 || gb.ptr[payload_size] != 0x80) { dav2d_log(c, "Malformed ITU-T T.35 metadata message format\n"); break; } if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error; struct Dav2dITUTT35 *itut_t35 = dav2d_realloc(ALLOC_OBU_META, c->itut_t35, (c->n_itut_t35 + 1) * sizeof(*c->itut_t35)); if (!itut_t35) goto error; c->itut_t35 = itut_t35; memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35)); struct itut_t35_ctx_context *itut_t35_ctx; if (!c->n_itut_t35) { assert(!c->itut_t35_ref); itut_t35_ctx = dav2d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context)); if (!itut_t35_ctx) goto error; c->itut_t35_ref = dav2d_ref_init(&itut_t35_ctx->ref, c->itut_t35, dav2d_picture_free_itut_t35, itut_t35_ctx, 0); } else { assert(c->itut_t35_ref && atomic_load(&c->itut_t35_ref->ref_cnt) == 1); itut_t35_ctx = c->itut_t35_ref->user_data; c->itut_t35_ref->const_data = (uint8_t *)c->itut_t35; } itut_t35_ctx->itut_t35 = c->itut_t35; itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1; Dav2dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35]; itut_t35_metadata->payload = dav2d_malloc(ALLOC_OBU_META, payload_size); if (!itut_t35_metadata->payload) goto error; itut_t35_metadata->country_code = country_code; itut_t35_metadata->country_code_extension_byte = country_code_extension_byte; itut_t35_metadata->payload_size = payload_size; // We know that we've read a whole number of bytes and that the // payload is within the OBU boundaries, so just use memcpy() assert(gb.bits_left == 0); memcpy(itut_t35_metadata->payload, gb.ptr, payload_size); c->n_itut_t35++; break; } case OBU_META_SCALABILITY: case OBU_META_TIMECODE: 
// ignore metadata OBUs we don't care about break; default: // print a warning but don't fail for unknown types if (meta_type > 31) // Types 6 to 31 are "Unregistered user private", so ignore them. dav2d_log(c, "Unknown Metadata OBU type %d\n", meta_type); break; } break; } case DAV2D_OBU_TD: #if 0 c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT; #endif break; case DAV2D_OBU_PADDING: // ignore OBUs we don't care about break; default: // print a warning but don't fail for unknown types dav2d_log(c, "Unknown OBU type %d of size %td\n", type, gb.ptr_end - gb.ptr); break; } if (c->seq_hdr && c->frame_hdr) { // FIXME handle bridge/bru also const int frame_without_data = c->frame_hdr->tip.frame_mode == 2; if (c->frame_hdr->show_existing_frame) { if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error; switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) { case DAV2D_FRAME_TYPE_INTER: case DAV2D_FRAME_TYPE_SWITCH: if (c->decode_frame_type > DAV2D_DECODEFRAMETYPE_REFERENCE) goto skip; break; case DAV2D_FRAME_TYPE_INTRA: if (c->decode_frame_type > DAV2D_DECODEFRAMETYPE_INTRA) goto skip; // fall-through default: break; } if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error; if (c->strict_std_compliance && !c->refs[c->frame_hdr->existing_frame_idx].p.showable) { goto error; } dav2d_queue_output(c, &c->refs[c->frame_hdr->existing_frame_idx].p); #if 0 dav2d_picture_copy_props(&c->out.p, c->content_light, c->content_light_ref, c->mastering_display, c->mastering_display_ref, c->itut_t35, c->itut_t35_ref, c->n_itut_t35, &in->m); // Must be removed from the context after being attached to the frame dav2d_ref_dec(&c->itut_t35_ref); c->itut_t35 = NULL; c->n_itut_t35 = 0; c->event_flags |= dav2d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p); #endif if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV2D_FRAME_TYPE_KEY) { const int r = c->frame_hdr->existing_frame_idx; 
c->refs[r].p.showable = 0; for (int i = 0; i < 8; i++) { if (i == r) continue; if (c->refs[i].p.p.frame_hdr) dav2d_thread_picture_unref(&c->refs[i].p); dav2d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p); dav2d_cdf_thread_unref(&c->cdf[i]); dav2d_cdf_thread_ref(&c->cdf[i], &c->cdf[r]); dav2d_ref_dec(&c->refs[i].segmap); c->refs[i].segmap = c->refs[r].segmap; if (c->refs[r].segmap) dav2d_ref_inc(c->refs[r].segmap); dav2d_ref_dec(&c->refs[i].refmvs); } } c->frame_hdr = NULL; } else if (c->n_tiles == c->frame_hdr->tiling.t.cols * c->frame_hdr->tiling.t.rows || frame_without_data) { switch (c->frame_hdr->frame_type) { case DAV2D_FRAME_TYPE_INTER: case DAV2D_FRAME_TYPE_SWITCH: if (c->decode_frame_type > DAV2D_DECODEFRAMETYPE_REFERENCE || (c->decode_frame_type == DAV2D_DECODEFRAMETYPE_REFERENCE && !c->frame_hdr->refresh_frame_flags)) goto skip; break; case DAV2D_FRAME_TYPE_INTRA: if (c->decode_frame_type > DAV2D_DECODEFRAMETYPE_INTRA || (c->decode_frame_type == DAV2D_DECODEFRAMETYPE_REFERENCE && !c->frame_hdr->refresh_frame_flags)) goto skip; // fall-through default: break; } if (!frame_without_data && !c->n_tile_data) goto error; if ((res = dav2d_submit_frame(c)) < 0) return res; assert(!c->n_tile_data); c->frame_hdr = NULL; c->n_tiles = 0; } } return gb.ptr_end - gb.ptr_start; skip: // update refs with only the headers in case we skip the frame for (int i = 0; i < 8; i++) { if (c->frame_hdr->refresh_frame_flags & (1 << i)) { dav2d_thread_picture_unref(&c->refs[i].p); c->refs[i].p.p.frame_hdr = c->frame_hdr; c->refs[i].p.p.seq_hdr = c->seq_hdr; c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref; c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref; dav2d_ref_inc(c->frame_hdr_ref); dav2d_ref_inc(c->seq_hdr_ref); } } dav2d_ref_dec(&c->frame_hdr_ref); c->frame_hdr = NULL; c->n_tiles = 0; return gb.ptr_end - gb.ptr_start; error: #if 0 dav2d_data_props_copy(&c->cached_error_props, &in->m); #endif dav2d_log(c, gb.error ? 
"Overrun in OBU bit buffer\n" : "Error parsing OBU data\n"); return DAV2D_ERR(EINVAL); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/obu.h000066400000000000000000000031301517466257200215660ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_OBU_H #define DAV2D_SRC_OBU_H #include "dav2d/data.h" #include "src/internal.h" ptrdiff_t dav2d_parse_obus(Dav2dContext *c, Dav2dData *in); #endif /* DAV2D_SRC_OBU_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/pal.c000066400000000000000000000053361517466257200215620ustar00rootroot00000000000000/* * Copyright © 2023-2026, VideoLAN and dav2d authors * Copyright © 2023-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include "common/attributes.h" #include "src/pal.h" // fill invisible edges and pack to 4-bit (2 pixels per byte) static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src, const int bw, const int bh, const int w, const int h) { assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1))); assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1))); assert(w >= 4 && w <= bw && !(w & 3)); assert(h >= 4 && h <= bh && !(h & 3)); const int dst_w = w / 2; const int dst_bw = bw / 2; for (int y = 0; y < h; y++, src += bw, dst += dst_bw) { for (int x = 0; x < dst_w; x++) dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4); if (dst_w < dst_bw) memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w); } if (h < bh) { const uint8_t *const last_row = &dst[-dst_bw]; for (int y = h; y < bh; y++, dst += dst_bw) memcpy(dst, last_row, dst_bw); } } #if HAVE_ASM #if ARCH_RISCV #include "riscv/pal.h" #elif ARCH_X86 #include "x86/pal.h" #endif #endif COLD void dav2d_pal_dsp_init(Dav2dPalDSPContext *const c) { c->pal_idx_finish = pal_idx_finish_c; #if HAVE_ASM #if ARCH_RISCV pal_dsp_init_riscv(c); #elif ARCH_X86 pal_dsp_init_x86(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/pal.h000066400000000000000000000034711517466257200215650ustar00rootroot00000000000000/* * Copyright © 2023-2026, VideoLAN and dav2d authors * Copyright © 2023-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_PAL_H #define DAV2D_SRC_PAL_H #include #define decl_pal_idx_finish_fn(name) \ void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h) typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn); typedef struct Dav2dPalDSPContext { pal_idx_finish_fn pal_idx_finish; } Dav2dPalDSPContext; void dav2d_pal_dsp_init(Dav2dPalDSPContext *dsp); #endif /* DAV2D_SRC_PAL_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/picture.c000066400000000000000000000313111517466257200224510ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #include #include "common/intops.h" #include "common/validate.h" #include "src/internal.h" #include "src/log.h" #include "src/picture.h" #include "src/ref.h" #include "src/thread.h" #include "src/thread_task.h" int dav2d_default_picture_alloc(Dav2dPicture *const p, void *const cookie) { const int hbd = p->p.bpc > 8; const int aligned_w = (p->p.w + 127) & ~127; const int aligned_h = (p->p.h + 127) & ~127; const int has_chroma = p->p.layout != DAV2D_PIXEL_LAYOUT_I400; const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; ptrdiff_t y_stride = aligned_w << hbd; ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0; /* Due to how mapping of addresses to sets works in most L1 and L2 cache * implementations, strides of multiples of certain power-of-two numbers * may cause multiple rows of the same superblock to map to the same set, * causing evictions of previous rows resulting in a reduction in cache * hit rate. Avoid that by slightly padding the stride when necessary. 
*/ if (!(y_stride & 1023)) y_stride += DAV2D_PICTURE_ALIGNMENT; if (!(uv_stride & 1023) && has_chroma) uv_stride += DAV2D_PICTURE_ALIGNMENT; p->stride[0] = y_stride; p->stride[1] = uv_stride; const size_t y_sz = y_stride * aligned_h; const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); const size_t pic_size = y_sz + 2 * uv_sz; uint8_t *const buf = dav2d_mem_pool_pop(cookie, pic_size + DAV2D_PICTURE_ALIGNMENT); if (!buf) return DAV2D_ERR(ENOMEM); p->allocator_data = buf; p->data[0] = buf; p->data[1] = has_chroma ? buf + y_sz : NULL; p->data[2] = has_chroma ? buf + y_sz + uv_sz : NULL; return 0; } void dav2d_default_picture_release(Dav2dPicture *const p, void *const cookie) { dav2d_mem_pool_push(cookie, p->allocator_data); } struct pic_ctx_context { Dav2dPicAllocator allocator; Dav2dPicture pic; Dav2dRef ref; void *extra_data[]; }; static void free_buffer(const uint8_t *const data, void *const user_data) { struct pic_ctx_context *pic_ctx = (struct pic_ctx_context*)data; pic_ctx->allocator.release_picture_callback(&pic_ctx->pic, pic_ctx->allocator.cookie); dav2d_mem_pool_push(user_data, pic_ctx); } void dav2d_picture_free_itut_t35(const uint8_t *const data, void *const user_data) { struct itut_t35_ctx_context *itut_t35_ctx = user_data; for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++) dav2d_free(itut_t35_ctx->itut_t35[i].payload); dav2d_free(itut_t35_ctx->itut_t35); dav2d_free(itut_t35_ctx); } static int picture_alloc(Dav2dContext *const c, Dav2dPicture *const p, const int w, const int h, Dav2dSequenceHeader *const seq_hdr, Dav2dRef *const seq_hdr_ref, Dav2dFrameHeader *const frame_hdr, Dav2dRef *const frame_hdr_ref, const int bpc, const Dav2dDataProps *const props, Dav2dPicAllocator *const p_allocator, void **const extra_ptr) { if (p->data[0]) { dav2d_log(c, "Picture already allocated!\n"); return -1; } assert(bpc > 0 && bpc <= 16); size_t extra = c->n_fc > 1 ? 
sizeof(atomic_int) * 3 : 0; struct pic_ctx_context *pic_ctx = dav2d_mem_pool_pop(c->pic_ctx_pool, extra + sizeof(struct pic_ctx_context)); if (!pic_ctx) return DAV2D_ERR(ENOMEM); p->p.w = w; p->p.h = h; p->seq_hdr = seq_hdr; p->frame_hdr = frame_hdr; p->p.layout = seq_hdr->layout; p->p.bpc = bpc; dav2d_data_props_set_defaults(&p->m); const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); if (res < 0) { dav2d_mem_pool_push(c->pic_ctx_pool, pic_ctx); return res; } pic_ctx->allocator = *p_allocator; pic_ctx->pic = *p; p->ref = dav2d_ref_init(&pic_ctx->ref, pic_ctx, free_buffer, c->pic_ctx_pool, 0); p->seq_hdr_ref = seq_hdr_ref; if (seq_hdr_ref) dav2d_ref_inc(seq_hdr_ref); p->frame_hdr_ref = frame_hdr_ref; if (frame_hdr_ref) dav2d_ref_inc(frame_hdr_ref); if (extra && extra_ptr) *extra_ptr = &pic_ctx->extra_data; return 0; } void dav2d_picture_copy_props(Dav2dPicture *const p, Dav2dContentLightLevel *const content_light, Dav2dRef *const content_light_ref, Dav2dMasteringDisplay *const mastering_display, Dav2dRef *const mastering_display_ref, Dav2dITUTT35 *const itut_t35, Dav2dRef *itut_t35_ref, size_t n_itut_t35, const Dav2dDataProps *const props) { dav2d_data_props_copy(&p->m, props); dav2d_ref_dec(&p->content_light_ref); p->content_light_ref = content_light_ref; p->content_light = content_light; if (content_light_ref) dav2d_ref_inc(content_light_ref); dav2d_ref_dec(&p->mastering_display_ref); p->mastering_display_ref = mastering_display_ref; p->mastering_display = mastering_display; if (mastering_display_ref) dav2d_ref_inc(mastering_display_ref); dav2d_ref_dec(&p->itut_t35_ref); p->itut_t35_ref = itut_t35_ref; p->itut_t35 = itut_t35; p->n_itut_t35 = n_itut_t35; if (itut_t35_ref) dav2d_ref_inc(itut_t35_ref); } int dav2d_thread_picture_alloc(Dav2dContext *const c, Dav2dFrameContext *const f, const int bpc) { Dav2dThreadPicture *const p = &f->cur; const int res = picture_alloc(c, &p->p, f->frame_hdr->width, f->frame_hdr->height, f->seq_hdr, 
f->seq_hdr_ref, f->frame_hdr, f->frame_hdr_ref, bpc, &f->tile[0].data.m, &c->allocator, (void **) &p->progress); if (res) return res; // Don't clear these flags from c->frame_flags if the frame is not going to be output. // This way they will be added to the next visible frame too. const int flags_mask = ((f->frame_hdr->show_immediate || c->output_invisible_frames) /*&& c->max_spatial_id == f->frame_hdr->spatial_id*/) ? 0 : (PICTURE_FLAG_NEW_SEQUENCE | PICTURE_FLAG_NEW_OP_PARAMS_INFO); #if 0 p->flags = c->frame_flags; c->frame_flags &= flags_mask; #endif p->visible = f->frame_hdr->show_immediate; p->showable = f->frame_hdr->show_immediate || f->frame_hdr->show_implicit; if (p->visible) { // Only add HDR10+ and T35 metadata when show frame flag is enabled dav2d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, c->mastering_display, c->mastering_display_ref, c->itut_t35, c->itut_t35_ref, c->n_itut_t35, &f->tile[0].data.m); // Must be removed from the context after being attached to the frame dav2d_ref_dec(&c->itut_t35_ref); c->itut_t35 = NULL; c->n_itut_t35 = 0; } else { dav2d_data_props_copy(&p->p.m, &f->tile[0].data.m); } if (c->n_fc > 1) { atomic_init(&p->progress[0], 0); atomic_init(&p->progress[1], 0); atomic_init(&p->progress[2], 0); } return res; } int dav2d_picture_alloc_copy(Dav2dContext *const c, Dav2dPicture *const dst, const Dav2dPicture *const src) { struct pic_ctx_context *const pic_ctx = (struct pic_ctx_context*)src->ref->const_data; const int res = picture_alloc(c, dst, src->p.w, src->p.h, src->seq_hdr, src->seq_hdr_ref, src->frame_hdr, src->frame_hdr_ref, src->p.bpc, &src->m, &pic_ctx->allocator, NULL); if (res) return res; dav2d_picture_copy_props(dst, src->content_light, src->content_light_ref, src->mastering_display, src->mastering_display_ref, src->itut_t35, src->itut_t35_ref, src->n_itut_t35, &src->m); if (src->fgm_ref) { dst->fgm_ref = src->fgm_ref; dav2d_ref_inc(dst->fgm_ref); dst->fgm = src->fgm; } if (src->ci_ref) { 
dst->ci_ref = src->ci_ref; dav2d_ref_inc(dst->ci_ref); dst->ci = src->ci; } return 0; } void dav2d_picture_ref(Dav2dPicture *const dst, const Dav2dPicture *const src) { assert(dst != NULL); assert(dst->data[0] == NULL); assert(src != NULL); if (src->ref) { assert(src->data[0] != NULL); dav2d_ref_inc(src->ref); } if (src->frame_hdr_ref) dav2d_ref_inc(src->frame_hdr_ref); if (src->seq_hdr_ref) dav2d_ref_inc(src->seq_hdr_ref); if (src->m.user_data.ref) dav2d_ref_inc(src->m.user_data.ref); if (src->content_light_ref) dav2d_ref_inc(src->content_light_ref); if (src->mastering_display_ref) dav2d_ref_inc(src->mastering_display_ref); if (src->itut_t35_ref) dav2d_ref_inc(src->itut_t35_ref); if (src->fgm_ref) dav2d_ref_inc(src->fgm_ref); if (src->ci_ref) dav2d_ref_inc(src->ci_ref); *dst = *src; } void dav2d_picture_move_ref(Dav2dPicture *const dst, Dav2dPicture *const src) { assert(dst != NULL); assert(dst->data[0] == NULL); assert(src != NULL); if (src->ref) assert(src->data[0] != NULL); *dst = *src; memset(src, 0, sizeof(*src)); } void dav2d_thread_picture_ref(Dav2dThreadPicture *const dst, const Dav2dThreadPicture *const src) { dav2d_picture_ref(&dst->p, &src->p); dst->visible = src->visible; dst->showable = src->showable; dst->progress = src->progress; dst->flags = src->flags; } void dav2d_thread_picture_move_ref(Dav2dThreadPicture *const dst, Dav2dThreadPicture *const src) { dav2d_picture_move_ref(&dst->p, &src->p); dst->visible = src->visible; dst->showable = src->showable; dst->progress = src->progress; dst->flags = src->flags; memset(src, 0, sizeof(*src)); } void dav2d_picture_unref_internal(Dav2dPicture *const p) { validate_input(p != NULL); if (p->ref) { validate_input(p->data[0] != NULL); dav2d_ref_dec(&p->ref); } dav2d_ref_dec(&p->seq_hdr_ref); dav2d_ref_dec(&p->frame_hdr_ref); dav2d_ref_dec(&p->m.user_data.ref); dav2d_ref_dec(&p->content_light_ref); dav2d_ref_dec(&p->mastering_display_ref); dav2d_ref_dec(&p->itut_t35_ref); dav2d_ref_dec(&p->fgm_ref); 
dav2d_ref_dec(&p->ci_ref); memset(p, 0, sizeof(*p)); dav2d_data_props_set_defaults(&p->m); } void dav2d_thread_picture_unref(Dav2dThreadPicture *const p) { dav2d_picture_unref_internal(&p->p); p->progress = NULL; } enum Dav2dEventFlags dav2d_picture_get_event_flags(const Dav2dThreadPicture *const p) { if (!p->flags) return 0; enum Dav2dEventFlags flags = 0; if (p->flags & PICTURE_FLAG_NEW_SEQUENCE) flags |= DAV2D_EVENT_FLAG_NEW_SEQUENCE; if (p->flags & PICTURE_FLAG_NEW_OP_PARAMS_INFO) flags |= DAV2D_EVENT_FLAG_NEW_OP_PARAMS_INFO; return flags; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/picture.h000066400000000000000000000102001517466257200224500ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_PICTURE_H #define DAV2D_SRC_PICTURE_H #include #include "src/thread.h" #include "dav2d/picture.h" #include "src/thread_data.h" #include "src/ref.h" enum PlaneType { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPE_BLOCK, PLANE_TYPE_ALL, }; enum PictureFlags { PICTURE_FLAG_NEW_SEQUENCE = 1 << 0, PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1, PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2, }; typedef struct Dav2dThreadPicture { Dav2dPicture p; int visible; // This can be set for inter frames, non-key intra frames, or for invisible // keyframes that have not yet been made visible using the show-existing-frame // mechanism. int showable; enum PictureFlags flags; // [0] block data (segmentation map and ccso values) // [1] motion vectors // [2] pixel data atomic_uint *progress; } Dav2dThreadPicture; typedef struct Dav2dPictureBuffer { void *data; struct Dav2dPictureBuffer *next; } Dav2dPictureBuffer; /* * Allocate a picture with custom border size. */ int dav2d_thread_picture_alloc(Dav2dContext *c, Dav2dFrameContext *f, const int bpc); /** * Allocate a picture with identical metadata to an existing picture. */ int dav2d_picture_alloc_copy(Dav2dContext *c, Dav2dPicture *dst, const Dav2dPicture *src); /** * Create a copy of a picture. 
*/ void dav2d_picture_ref(Dav2dPicture *dst, const Dav2dPicture *src); void dav2d_thread_picture_ref(Dav2dThreadPicture *dst, const Dav2dThreadPicture *src); void dav2d_thread_picture_move_ref(Dav2dThreadPicture *dst, Dav2dThreadPicture *src); void dav2d_thread_picture_unref(Dav2dThreadPicture *p); /** * Move a picture reference. */ void dav2d_picture_move_ref(Dav2dPicture *dst, Dav2dPicture *src); int dav2d_default_picture_alloc(Dav2dPicture *p, void *cookie); void dav2d_default_picture_release(Dav2dPicture *p, void *cookie); void dav2d_picture_unref_internal(Dav2dPicture *p); struct itut_t35_ctx_context { Dav2dITUTT35 *itut_t35; size_t n_itut_t35; Dav2dRef ref; }; void dav2d_picture_free_itut_t35(const uint8_t *data, void *user_data); void dav2d_picture_copy_props(Dav2dPicture *p, Dav2dContentLightLevel *content_light, Dav2dRef *content_light_ref, Dav2dMasteringDisplay *mastering_display, Dav2dRef *mastering_display_ref, Dav2dITUTT35 *itut_t35, Dav2dRef *itut_t35_ref, size_t n_itut_t35, const Dav2dDataProps *props); /** * Get event flags from picture flags. */ enum Dav2dEventFlags dav2d_picture_get_event_flags(const Dav2dThreadPicture *p); #endif /* DAV2D_SRC_PICTURE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/000077500000000000000000000000001517466257200214155ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/cdef.h000066400000000000000000000047731517466257200225020ustar00rootroot00000000000000/* * Copyright © 2019, Luca Barbato * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "common/bitdepth.h" #include "common/intops.h" #include "src/cdef.h" #include "src/cpu.h" #define cdef_vsx_fn(w, h) \ void dav2d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ const ptrdiff_t dst_stride, \ const pixel (*left)[2], \ const pixel *const top, \ const pixel *const bottom, \ const int pri_strength, \ const int sec_strength, \ const int dir, \ const int damping, \ const enum CdefEdgeFlags edges) cdef_vsx_fn(4, 4); cdef_vsx_fn(4, 8); cdef_vsx_fn(8, 8); static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav2dCdefDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_PPC_CPU_FLAG_VSX)) return; #if BITDEPTH == 8 c->fb[0] = dav2d_cdef_filter_8x8_vsx; c->fb[1] = dav2d_cdef_filter_4x8_vsx; c->fb[2] = dav2d_cdef_filter_4x4_vsx; #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/cdef_tmpl.c000066400000000000000000000462201517466257200235220ustar00rootroot00000000000000/* * Copyright © 2019, Luca Barbato * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/ppc/dav2d_types.h" #include "src/ppc/cdef.h" #if BITDEPTH == 8 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, const uint16_t shift) { const i16x8 zero = vec_splat_s16(0); if (!threshold) return zero; const i16x8 abs_diff = vec_abs(diff); const b16x8 mask = vec_cmplt(diff, zero); const i16x8 thr = vec_splats(threshold); const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift))); const i16x8 max = vec_max(zero, sub); const i16x8 min = vec_min(abs_diff, max); const i16x8 neg = vec_sub(zero, min); return vec_sel(min, neg, mask); } static inline void copy4xN(uint16_t *tmp, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, const uint8_t *const bottom, const int w, const int h, const enum CdefEdgeFlags edges) { const u16x8 fill = vec_splats((uint16_t)INT16_MAX); u16x8 l0; u16x8 l1; int y_start = -2, y_end = h + 2; // Copy top and bottom first if (!(edges & CDEF_HAVE_TOP)) { l0 = fill; l1 = fill; y_start = 0; } else { l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2)); l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2)); } vec_st(l0, 0, tmp - 2 * 8); vec_st(l1, 0, tmp - 1 * 8); if (!(edges & CDEF_HAVE_BOTTOM)) { l0 = fill; l1 = fill; y_end -= 2; } else { l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2)); l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2)); } vec_st(l0, 0, tmp + (h + 0) * 8); vec_st(l1, 0, tmp + (h + 1) * 8); int y_with_left_edge = 0; if (!(edges & CDEF_HAVE_LEFT)) { u16x8 l = u8h_to_u16(vec_vsx_ld(0, src)); vec_vsx_st(l, 0, tmp + 2); y_with_left_edge = 1; } for (int y = y_with_left_edge; y < h; y++) { u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride)); vec_st(l, 0, tmp + y * 8); } if (!(edges & CDEF_HAVE_LEFT)) { for (int y = y_start; y < y_end; y++) { tmp[y * 8] = INT16_MAX; tmp[1 + y * 8] = INT16_MAX; } } else { for (int y = 0; y < h; y++) { tmp[y * 8] = left[y][0]; tmp[1 + y * 8] = left[y][1]; } } if (!(edges & 
CDEF_HAVE_RIGHT)) { for (int y = y_start; y < y_end; y++) { tmp[- 2 + (y + 1) * 8] = INT16_MAX; tmp[- 1 + (y + 1) * 8] = INT16_MAX; } } } static inline void copy8xN(uint16_t *tmp, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, const uint8_t *const bottom, const int w, const int h, const enum CdefEdgeFlags edges) { const u16x8 fill = vec_splats((uint16_t)INT16_MAX); u16x8 l0h, l0l; u16x8 l1h, l1l; int y_start = -2, y_end = h + 2; // Copy top and bottom first if (!(edges & CDEF_HAVE_TOP)) { l0h = fill; l0l = fill; l1h = fill; l1l = fill; y_start = 0; } else { u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2); u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2); l0h = u8h_to_u16(l0); l0l = u8l_to_u16(l0); l1h = u8h_to_u16(l1); l1l = u8l_to_u16(l1); } vec_st(l0h, 0, tmp - 4 * 8); vec_st(l0l, 0, tmp - 3 * 8); vec_st(l1h, 0, tmp - 2 * 8); vec_st(l1l, 0, tmp - 1 * 8); if (!(edges & CDEF_HAVE_BOTTOM)) { l0h = fill; l0l = fill; l1h = fill; l1l = fill; y_end -= 2; } else { u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2); u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2); l0h = u8h_to_u16(l0); l0l = u8l_to_u16(l0); l1h = u8h_to_u16(l1); l1l = u8l_to_u16(l1); } vec_st(l0h, 0, tmp + (h + 0) * 16); vec_st(l0l, 0, tmp + (h + 0) * 16 + 8); vec_st(l1h, 0, tmp + (h + 1) * 16); vec_st(l1l, 0, tmp + (h + 1) * 16 + 8); int y_with_left_edge = 0; if (!(edges & CDEF_HAVE_LEFT)) { u8x16 l = vec_vsx_ld(0, src); u16x8 lh = u8h_to_u16(l); u16x8 ll = u8l_to_u16(l); vec_vsx_st(lh, 0, tmp + 2); vec_vsx_st(ll, 0, tmp + 8 + 2); y_with_left_edge = 1; } for (int y = y_with_left_edge; y < h; y++) { u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride); u16x8 lh = u8h_to_u16(l); u16x8 ll = u8l_to_u16(l); vec_st(lh, 0, tmp + y * 16); vec_st(ll, 0, tmp + 8 + y * 16); } if (!(edges & CDEF_HAVE_LEFT)) { for (int y = y_start; y < y_end; y++) { tmp[y * 16] = INT16_MAX; tmp[1 + y * 16] = INT16_MAX; } } else { for (int y = 0; y < h; y++) { tmp[y * 16] 
= left[y][0]; tmp[1 + y * 16] = left[y][1]; } } if (!(edges & CDEF_HAVE_RIGHT)) { for (int y = y_start; y < y_end; y++) { tmp[- 6 + (y + 1) * 16] = INT16_MAX; tmp[- 5 + (y + 1) * 16] = INT16_MAX; } } } static inline i16x8 max_mask(i16x8 a, i16x8 b) { const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX); const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX); const i16x8 val = vec_sel(a, b, mask); return vec_max(val, b); } #define LOAD_PIX(addr) \ const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \ i16x8 sum = vec_splat_s16(0); #define LOAD_PIX4(addr) \ const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \ const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \ const i16x8 px = vec_xxpermdi(a, b, 0); \ i16x8 sum = vec_splat_s16(0); #define LOAD_DIR(p, addr, o0, o1) \ const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \ const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \ const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \ const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1); #define LOAD_DIR4(p, addr, o0, o1) \ LOAD_DIR(p ## a, addr, o0, o1) \ LOAD_DIR(p ## b, addr + 8, o0, o1) \ const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \ const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \ const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \ const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0); #define CONSTRAIN(p, strength, shift) \ const i16x8 p ## _d0 = vec_sub(p ## 0, px); \ const i16x8 p ## _d1 = vec_sub(p ## 1, px); \ const i16x8 p ## _d2 = vec_sub(p ## 2, px); \ const i16x8 p ## _d3 = vec_sub(p ## 3, px); \ \ i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \ i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \ i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \ i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift); #define SETUP_MINMAX \ i16x8 max = px; \ i16x8 min = px; \ #define MIN_MAX(p) \ max = max_mask(p ## 0, max); \ min = vec_min(p ## 0, min); \ max = max_mask(p ## 1, max); \ min = vec_min(p ## 
1, min); \ max = max_mask(p ## 2, max); \ min = vec_min(p ## 2, min); \ max = max_mask(p ## 3, max); \ min = vec_min(p ## 3, min); #define MAKE_TAPS \ const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \ const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \ const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd)); #define PRI_0_UPDATE_SUM(p) \ sum = vec_madd(tap0, p ## _c0, sum); \ sum = vec_madd(tap0, p ## _c1, sum); \ sum = vec_madd(tap1, p ## _c2, sum); \ sum = vec_madd(tap1, p ## _c3, sum); #define UPDATE_SUM(p) \ const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \ const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \ sum = vec_add(sum, p ## sum0); \ sum = vec_add(sum, p ## sum1); #define SEC_0_UPDATE_SUM(p) \ sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \ sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \ sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \ sum = vec_madd(vec_splat_s16(2), p ## _c3, sum); #define BIAS \ i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \ bias = vec_sub(vec_splat_s16(8), bias); \ #define STORE4 \ dst[0] = vdst[0]; \ dst[1] = vdst[1]; \ dst[2] = vdst[2]; \ dst[3] = vdst[3]; \ \ tmp += 8; \ dst += PXSTRIDE(dst_stride); \ dst[0] = vdst[4]; \ dst[1] = vdst[5]; \ dst[2] = vdst[6]; \ dst[3] = vdst[7]; \ \ tmp += 8; \ dst += PXSTRIDE(dst_stride); #define STORE4_CLAMPED \ BIAS \ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ i16x8 vdst = vec_max(vec_min(unclamped, max), min); \ STORE4 #define STORE4_UNCLAMPED \ BIAS \ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ STORE4 #define STORE8 \ dst[0] = vdst[0]; \ dst[1] = vdst[1]; \ dst[2] = vdst[2]; \ dst[3] = vdst[3]; \ dst[4] = vdst[4]; \ dst[5] = vdst[5]; \ dst[6] = vdst[6]; \ dst[7] = vdst[7]; \ \ tmp += 16; \ dst += PXSTRIDE(dst_stride); #define STORE8_CLAMPED \ BIAS \ i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ i16x8 vdst = 
vec_max(vec_min(unclamped, max), min); \ STORE8 #define STORE8_UNCLAMPED \ BIAS \ i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \ STORE8 #define DIRECTIONS(w, tmp_stride) \ static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \ { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \ { 0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \ { 0 * tmp_stride + 1, 0 * tmp_stride + 2 }, \ { 0 * tmp_stride + 1, 1 * tmp_stride + 2 }, \ { 1 * tmp_stride + 1, 2 * tmp_stride + 2 }, \ { 1 * tmp_stride + 0, 2 * tmp_stride + 1 }, \ { 1 * tmp_stride + 0, 2 * tmp_stride + 0 }, \ { 1 * tmp_stride + 0, 2 * tmp_stride - 1 } \ }; DIRECTIONS(4, 8) DIRECTIONS(8, 16) static inline void filter_4xN(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int pri_strength, const int sec_strength, const int dir, const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int off1 = cdef_directions4[dir][0]; const int off1_1 = cdef_directions4[dir][1]; const int off2 = cdef_directions4[(dir + 2) & 7][0]; const int off3 = cdef_directions4[(dir + 6) & 7][0]; const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; MAKE_TAPS for (int y = 0; y < h / 2; y++) { LOAD_PIX4(tmp) SETUP_MINMAX // Primary pass LOAD_DIR4(p, tmp, off1, off1_1) CONSTRAIN(p, pri_strength, pri_shift) MIN_MAX(p) PRI_0_UPDATE_SUM(p) // Secondary pass 1 LOAD_DIR4(s, tmp, off2, off3) CONSTRAIN(s, sec_strength, sec_shift) MIN_MAX(s) SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR4(s2, tmp, off2_1, off3_1) CONSTRAIN(s2, sec_strength, sec_shift) MIN_MAX(s2) UPDATE_SUM(s2) // Store STORE4_CLAMPED } } static inline void filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const 
int h, const int pri_strength, const int dir, const int pri_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int off1 = cdef_directions4[dir][0]; const int off1_1 = cdef_directions4[dir][1]; MAKE_TAPS for (int y = 0; y < h / 2; y++) { LOAD_PIX4(tmp) // Primary pass LOAD_DIR4(p, tmp, off1, off1_1) CONSTRAIN(p, pri_strength, pri_shift) PRI_0_UPDATE_SUM(p) STORE4_UNCLAMPED } } static inline void filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int sec_strength, const int dir, const int sec_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int off2 = cdef_directions4[(dir + 2) & 7][0]; const int off3 = cdef_directions4[(dir + 6) & 7][0]; const int off2_1 = cdef_directions4[(dir + 2) & 7][1]; const int off3_1 = cdef_directions4[(dir + 6) & 7][1]; for (int y = 0; y < h / 2; y++) { LOAD_PIX4(tmp) // Secondary pass 1 LOAD_DIR4(s, tmp, off2, off3) CONSTRAIN(s, sec_strength, sec_shift) SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR4(s2, tmp, off2_1, off3_1) CONSTRAIN(s2, sec_strength, sec_shift) UPDATE_SUM(s2) STORE4_UNCLAMPED } } static inline void filter_8xN(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int pri_strength, const int sec_strength, const int dir, const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int off1 = cdef_directions8[dir][0]; const int off1_1 = cdef_directions8[dir][1]; const int off2 = cdef_directions8[(dir + 2) & 7][0]; const int off3 = cdef_directions8[(dir + 6) & 7][0]; const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; MAKE_TAPS for (int y = 0; y < h; y++) { LOAD_PIX(tmp) 
SETUP_MINMAX // Primary pass LOAD_DIR(p, tmp, off1, off1_1) CONSTRAIN(p, pri_strength, pri_shift) MIN_MAX(p) PRI_0_UPDATE_SUM(p) // Secondary pass 1 LOAD_DIR(s, tmp, off2, off3) CONSTRAIN(s, sec_strength, sec_shift) MIN_MAX(s) SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR(s2, tmp, off2_1, off3_1) CONSTRAIN(s2, sec_strength, sec_shift) MIN_MAX(s2) UPDATE_SUM(s2) // Store STORE8_CLAMPED } } static inline void filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int pri_strength, const int dir, const int pri_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int off1 = cdef_directions8[dir][0]; const int off1_1 = cdef_directions8[dir][1]; MAKE_TAPS for (int y = 0; y < h; y++) { LOAD_PIX(tmp) // Primary pass LOAD_DIR(p, tmp, off1, off1_1) CONSTRAIN(p, pri_strength, pri_shift) PRI_0_UPDATE_SUM(p) STORE8_UNCLAMPED } } static inline void filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, const pixel *const bottom, const int w, const int h, const int sec_strength, const int dir, const int sec_shift, const enum CdefEdgeFlags edges, uint16_t *tmp) { const int off2 = cdef_directions8[(dir + 2) & 7][0]; const int off3 = cdef_directions8[(dir + 6) & 7][0]; const int off2_1 = cdef_directions8[(dir + 2) & 7][1]; const int off3_1 = cdef_directions8[(dir + 6) & 7][1]; for (int y = 0; y < h; y++) { LOAD_PIX(tmp) // Secondary pass 1 LOAD_DIR(s, tmp, off2, off3) CONSTRAIN(s, sec_strength, sec_shift) SEC_0_UPDATE_SUM(s) // Secondary pass 2 LOAD_DIR(s2, tmp, off2_1, off3_1) CONSTRAIN(s2, sec_strength, sec_shift) UPDATE_SUM(s2) STORE8_UNCLAMPED } } #define cdef_fn(w, h, tmp_stride) \ void dav2d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ const ptrdiff_t dst_stride, \ const pixel (*left)[2], \ const pixel *const top, \ const pixel *const bottom, \ 
const int pri_strength, \ const int sec_strength, \ const int dir, \ const int damping, \ const enum CdefEdgeFlags edges) \ { \ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \ if (pri_strength) { \ const int pri_shift = imax(0, damping - ulog2(pri_strength)); \ if (sec_strength) { \ const int sec_shift = damping - ulog2(sec_strength); \ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ sec_strength, dir, pri_shift, sec_shift, edges, tmp); \ } else { \ filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ dir, pri_shift, edges, tmp); \ } \ } else { \ const int sec_shift = damping - ulog2(sec_strength); \ filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \ dir, sec_shift, edges, tmp); \ } \ } cdef_fn(4, 4, 8); cdef_fn(4, 8, 8); cdef_fn(8, 8, 16); #endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/cpu.c000066400000000000000000000037651517466257200223630ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2019, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "common/attributes.h" #include "src/cpu.h" #include "src/ppc/cpu.h" #define HAVE_AUX ((HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO) && ARCH_PPC64LE) #if HAVE_AUX #include #endif COLD unsigned dav2d_get_cpu_flags_ppc(void) { unsigned flags = dav2d_get_default_cpu_flags(); #if HAVE_AUX unsigned long hw_cap = dav2d_getauxval(AT_HWCAP); unsigned long hw_cap2 = dav2d_getauxval(AT_HWCAP2); flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV2D_PPC_CPU_FLAG_VSX : 0; flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? DAV2D_PPC_CPU_FLAG_PWR9 : 0; #endif return flags; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/cpu.h000066400000000000000000000031561517466257200223620ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2019, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_PPC_CPU_H
#define DAV2D_SRC_PPC_CPU_H

/*
 * PowerPC CPU feature flags. Each enumerator is a distinct single bit, so the
 * values can be OR-ed together into one unsigned mask and tested with '&'.
 */
enum CpuFlags {
    DAV2D_PPC_CPU_FLAG_VSX = 1 << 0,  /* bit 0: VSX vector code may be used */
    DAV2D_PPC_CPU_FLAG_PWR9 = 1 << 1, /* bit 1: code built for POWER9 (suffix "pwr9") may be used */
};

/*
 * Returns the mask of DAV2D_PPC_CPU_FLAG_* bits for the running CPU.
 * Takes no arguments; the result is a plain unsigned bit mask.
 */
unsigned dav2d_get_cpu_flags_ppc(void);

#endif /* DAV2D_SRC_PPC_CPU_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/dav2d_types.h000066400000000000000000000050641517466257200240170ustar00rootroot00000000000000/*
 * Copyright © 2019, VideoLAN and dav2d authors
 * Copyright © 2019, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_PPC_TYPES_H #define DAV2D_SRC_PPC_TYPES_H #include #undef pixel #define u8x16 vector unsigned char #define i8x16 vector signed char #define b8x16 vector bool char #define u16x8 vector unsigned short #define i16x8 vector signed short #define b16x8 vector bool short #define u32x4 vector unsigned int #define i32x4 vector signed int #define b32x4 vector bool int #define u64x2 vector unsigned long long #define i64x2 vector signed long long #define b64x2 vector bool long long #define i8h_to_i16(v) ((i16x8) vec_unpackh((i8x16)v)) #define i8l_to_i16(v) ((i16x8) vec_unpackl((i8x16)v)) #define u8h_to_i16(v) ((i16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) #define u8l_to_i16(v) ((i16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) #define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) #define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) #define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0))) #define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v)) #define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0))) #define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v)) #endif /* DAV2D_SRC_PPC_TYPES_H */ 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/deblock.h000066400000000000000000000040611517466257200231720ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/deblock.h" decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_y, pwr9)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_y, pwr9)); decl_deblock_sb_fn(BF(dav2d_lpf_h_sb_uv, pwr9)); decl_deblock_sb_fn(BF(dav2d_lpf_v_sb_uv, pwr9)); static ALWAYS_INLINE void deblock_dsp_init_ppc(Dav2dDeblockDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_PPC_CPU_FLAG_PWR9)) return; #if BITDEPTH == 8 c->deblock_sb[0][0] = BF(dav2d_lpf_h_sb_y, pwr9); c->deblock_sb[0][1] = BF(dav2d_lpf_v_sb_y, pwr9); c->deblock_sb[1][0] = BF(dav2d_lpf_h_sb_uv, pwr9); c->deblock_sb[1][1] = BF(dav2d_lpf_v_sb_uv, pwr9); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/itx.h000066400000000000000000000050761517466257200224020ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2023, Luca Barbato * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/itx.h" decl_itx17_fns( 4, 4, pwr9); decl_itx16_fns( 4, 8, pwr9); decl_itx16_fns( 4, 16, pwr9); decl_itx16_fns( 8, 4, pwr9); decl_itx16_fns( 8, 8, pwr9); decl_itx16_fns( 8, 16, pwr9); decl_itx2_fns ( 8, 32, pwr9); decl_itx16_fns(16, 4, pwr9); decl_itx16_fns(16, 8, pwr9); decl_itx12_fns(16, 16, pwr9); decl_itx2_fns (16, 32, pwr9); decl_itx2_fns (32, 8, pwr9); decl_itx2_fns (32, 16, pwr9); decl_itx2_fns (32, 32, pwr9); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_16x64, pwr9)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_32x64, pwr9)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x16, pwr9)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x32, pwr9)); decl_itx_fn(BF(dav2d_inv_txfm_add_dct_dct_64x64, pwr9)); static ALWAYS_INLINE void itx_dsp_init_ppc(Dav2dInvTxfmDSPContext *const c, const int bpc) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_PPC_CPU_FLAG_PWR9)) return; #if BITDEPTH == 8 assign_itx17_fn( , 4, 4, pwr9); assign_itx16_fn(R, 4, 8, pwr9); assign_itx16_fn(R, 8, 4, pwr9); assign_itx16_fn(, 8, 8, pwr9); assign_itx16_fn(R, 4, 16, pwr9); assign_itx16_fn(R, 16, 4, pwr9); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/itx_tmpl.c000066400000000000000000002134521517466257200234300ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Luca Barbato * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/ppc/dav2d_types.h" #include "src/ppc/itx.h" #include "src/ppc/utils.h" #if BITDEPTH == 8 #define LOAD_4(src, stride, a, b, c, d) \ { \ uint8_t *s = src; \ a = vec_xl(0, s); \ s += stride; \ b = vec_xl(0, s); \ s += stride; \ c = vec_xl(0, s); \ s += stride; \ d = vec_xl(0, s); \ } #define LOAD_DECLARE_2_I16(src, a, b) \ i16x8 a = vec_xl(0, src); \ i16x8 b = vec_xl(0, src + 8); #define UNPACK_DECLARE_4_I16_I32(sa, sb, a, b, c, d) \ i32x4 a = i16h_to_i32(sa); \ i32x4 b = i16l_to_i32(sa); \ i32x4 c = i16h_to_i32(sb); \ i32x4 d = i16l_to_i32(sb); #define LOAD_COEFF_4(coeff) \ LOAD_DECLARE_2_I16(coeff, c01, c23) \ UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) #define LOAD_SCALE_COEFF_4x8(coeff, scale) \ LOAD_DECLARE_2_I16(coeff, c04, c15) \ LOAD_DECLARE_2_I16(coeff+16, c26, c37) \ i16x8 c01 = (i16x8)vec_mergeh((i64x2)c04, (i64x2)c15); \ i16x8 c23 = (i16x8)vec_mergeh((i64x2)c26, (i64x2)c37); \ i16x8 c45 = (i16x8)vec_mergel((i64x2)c04, (i64x2)c15); \ i16x8 c67 = (i16x8)vec_mergel((i64x2)c26, (i64x2)c37); \ c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \ c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \ UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \ c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \ c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \ UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7) #define LOAD_SCALE_COEFF_8x4(coeff, scale) \ LOAD_DECLARE_2_I16(coeff, c01, c23) \ LOAD_DECLARE_2_I16(coeff+16, c45, c67) \ c01 = vec_mradds(c01, scale, vec_splat_s16(0)); \ c23 = vec_mradds(c23, scale, vec_splat_s16(0)); \ UNPACK_DECLARE_4_I16_I32(c01, c23, c0, c1, c2, c3) \ c45 = vec_mradds(c45, scale, vec_splat_s16(0)); \ c67 = vec_mradds(c67, scale, vec_splat_s16(0)); \ UNPACK_DECLARE_4_I16_I32(c45, c67, c4, c5, c6, c7) #define LOAD_COEFF_8x8(coeff) \ LOAD_DECLARE_2_I16(coeff, c0, c1) \ LOAD_DECLARE_2_I16(coeff+16, c2, c3) \ LOAD_DECLARE_2_I16(coeff+32, c4, c5) \ LOAD_DECLARE_2_I16(coeff+48, c6, c7) \ UNPACK_DECLARE_4_I16_I32(c0, c1, c0h, c0l, 
c1h, c1l) \ UNPACK_DECLARE_4_I16_I32(c2, c3, c2h, c2l, c3h, c3l) \ UNPACK_DECLARE_4_I16_I32(c4, c5, c4h, c4l, c5h, c5l) \ UNPACK_DECLARE_4_I16_I32(c6, c7, c6h, c6l, c7h, c7l) \ #define LOAD_COEFF_4x16(coeff) \ LOAD_DECLARE_2_I16(coeff, a0b0, c0d0) \ LOAD_DECLARE_2_I16(coeff+16, a1b1, c1d1) \ LOAD_DECLARE_2_I16(coeff+32, a2b2, c2d2) \ LOAD_DECLARE_2_I16(coeff+48, a3b3, c3d3) \ UNPACK_DECLARE_4_I16_I32(a0b0, c0d0, cA0, cB0, cC0, cD0) \ UNPACK_DECLARE_4_I16_I32(a1b1, c1d1, cA1, cB1, cC1, cD1) \ UNPACK_DECLARE_4_I16_I32(a2b2, c2d2, cA2, cB2, cC2, cD2) \ UNPACK_DECLARE_4_I16_I32(a3b3, c3d3, cA3, cB3, cC3, cD3) #define LOAD_DECLARE_4(src, stride, a, b, c, d) \ u8x16 a, b, c, d; \ LOAD_4(src, stride, a, b, c, d) #define STORE_LEN(l, dst, stride, a, b, c, d) \ { \ uint8_t *dst2 = dst; \ vec_xst_len(a, dst2, l); \ dst2 += stride; \ vec_xst_len(b, dst2, l); \ dst2 += stride; \ vec_xst_len(c, dst2, l); \ dst2 += stride; \ vec_xst_len(d, dst2, l); \ } #define STORE_4(dst, stride, a, b, c, d) \ STORE_LEN(4, dst, stride, a, b, c, d) #define STORE_8(dst, stride, ab, cd, ef, gh) \ STORE_LEN(8, dst, stride, ab, cd, ef, gh) #define STORE_16(dst, stride, l0, l1, l2, l3) \ { \ uint8_t *dst##2 = dst; \ vec_xst(l0, 0, dst##2); \ dst##2 += stride; \ vec_xst(l1, 0, dst##2); \ dst##2 += stride; \ vec_xst(l2, 0, dst##2); \ dst##2 += stride; \ vec_xst(l3, 0, dst##2); \ } #define APPLY_COEFF_4(a, b, c, d, c01, c23) \ { \ u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \ u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); \ \ c01 = vec_adds(c01, vec_splat_s16(8)); \ c23 = vec_adds(c23, vec_splat_s16(8)); \ c01 = vec_sra(c01, vec_splat_u16(4)); \ c23 = vec_sra(c23, vec_splat_u16(4)); \ \ i16x8 abs = u8h_to_i16(ab); \ i16x8 cds = u8h_to_i16(cd); \ \ abs = vec_adds(abs, c01); \ cds = vec_adds(cds, c23); \ \ a = vec_packsu(abs, abs); \ c = vec_packsu(cds, cds); \ \ b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); \ d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); \ } #define APPLY_COEFF_8x4(ab, cd, c01, c23) 
\ { \ i16x8 abs = u8h_to_i16(ab); \ i16x8 cds = u8h_to_i16(cd); \ c01 = vec_adds(c01, vec_splat_s16(8)); \ c23 = vec_adds(c23, vec_splat_s16(8)); \ c01 = vec_sra(c01, vec_splat_u16(4)); \ c23 = vec_sra(c23, vec_splat_u16(4)); \ \ abs = vec_adds(abs, c01); \ cds = vec_adds(cds, c23); \ \ ab = vec_packsu(abs, abs); \ cd = vec_packsu(cds, cds); \ } #define APPLY_COEFF_16x4(a, b, c, d, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ i16x8 ah = u8h_to_i16(a); \ i16x8 al = u8l_to_i16(a); \ i16x8 bh = u8h_to_i16(b); \ i16x8 bl = u8l_to_i16(b); \ i16x8 ch = u8h_to_i16(c); \ i16x8 cl = u8l_to_i16(c); \ i16x8 dh = u8h_to_i16(d); \ i16x8 dl = u8l_to_i16(d); \ SCALE_ROUND_4(c00c01, c02c03, c04c05, c06c07, vec_splat_s16(8), vec_splat_u16(4)) \ SCALE_ROUND_4(c08c09, c10c11, c12c13, c14c15, vec_splat_s16(8), vec_splat_u16(4)) \ \ ah = vec_adds(ah, c00c01); \ al = vec_adds(al, c02c03); \ bh = vec_adds(bh, c04c05); \ bl = vec_adds(bl, c06c07); \ ch = vec_adds(ch, c08c09); \ cl = vec_adds(cl, c10c11); \ dh = vec_adds(dh, c12c13); \ dl = vec_adds(dl, c14c15); \ \ a = vec_packsu(ah, al); \ b = vec_packsu(bh, bl); \ c = vec_packsu(ch, cl); \ d = vec_packsu(dh, dl); \ } #define IDCT_4_INNER(c0, c1, c2, c3) \ { \ i32x4 o0 = vec_add(c0, c2); \ i32x4 o1 = vec_sub(c0, c2); \ \ i32x4 v2896 = vec_splats(2896); \ i32x4 v1567 = vec_splats(1567); \ i32x4 v3784 = vec_splats(3784); \ i32x4 v2048 = vec_splats(2048); \ \ o0 = vec_mul(o0, v2896); \ o1 = vec_mul(o1, v2896); \ \ i32x4 o2a = vec_mul(c1, v1567); \ i32x4 o2b = vec_mul(c3, v3784); \ i32x4 o3a = vec_mul(c1, v3784); \ i32x4 o3b = vec_mul(c3, v1567); \ \ i32x4 o2 = vec_sub(o2a, o2b); \ i32x4 o3 = vec_add(o3a, o3b); \ \ u32x4 v12 = vec_splat_u32(12); \ \ o0 = vec_add(o0, v2048); \ o1 = vec_add(o1, v2048); \ o2 = vec_add(o2, v2048); \ o3 = vec_add(o3, v2048); \ \ o0 = vec_sra(o0, v12); \ o1 = vec_sra(o1, v12); \ o2 = vec_sra(o2, v12); \ o3 = vec_sra(o3, v12); \ \ c0 = vec_add(o0, o3); \ c1 = vec_add(o1, o2); \ c2 = 
vec_sub(o1, o2); \ c3 = vec_sub(o0, o3); \ \ } #define dct4_for_dct8(c0, c1, c2, c3, c03, c12) \ IDCT_4_INNER(c0, c1, c2, c3) \ c03 = vec_packs(c0, c3); \ c12 = vec_packs(c1, c2); \ #define dct_4_in(c0, c1, c2, c3, c01, c23) \ { \ IDCT_4_INNER(c0, c1, c2, c3) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c0 = i16h_to_i32(c01); \ c1 = i16l_to_i32(c01); \ c2 = i16h_to_i32(c23); \ c3 = i16l_to_i32(c23); \ } #define dct_4_out(c0, c1, c2, c3, c01, c23) \ IDCT_4_INNER(c0, c1, c2, c3) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ #define IDENTITY_4(c01, c23) \ { \ i16x8 v1697 = vec_splats((int16_t)(1697*8)); \ i16x8 o01 = vec_mradds(c01, v1697, vec_splat_s16(0)); \ i16x8 o23 = vec_mradds(c23, v1697, vec_splat_s16(0)); \ c01 = vec_adds(c01, o01); \ c23 = vec_adds(c23, o23); \ } #define identity_4_in(c0, c1, c2, c3, c01, c23) \ { \ IDENTITY_4(c01, c23) \ c0 = i16h_to_i32(c01); \ c1 = i16l_to_i32(c01); \ c2 = i16h_to_i32(c23); \ c3 = i16l_to_i32(c23); \ } #define identity_4_out(c0, c1, c2, c3, c01, c23) \ { \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ IDENTITY_4(c01, c23) \ } #define ADST_INNER_4(c0, c1, c2, c3, oc0, oc1, oc2, oc3) \ { \ i32x4 v1321 = vec_splats(1321); \ i32x4 v3803 = vec_splats(3803); \ i32x4 v2482 = vec_splats(2482); \ i32x4 v3344 = vec_splats(3344); \ i32x4 v2048 = vec_splats(2048); \ i32x4 i0_v1321 = vec_mul(c0, v1321); \ i32x4 i0_v2482 = vec_mul(c0, v2482); \ i32x4 i0_v3803 = vec_mul(c0, v3803); \ i32x4 i1 = vec_mul(c1, v3344); \ i32x4 i2_v1321 = vec_mul(c2, v1321); \ i32x4 i2_v2482 = vec_mul(c2, v2482); \ i32x4 i2_v3803 = vec_mul(c2, v3803); \ i32x4 i3_v1321 = vec_mul(c3, v1321); \ i32x4 i3_v2482 = vec_mul(c3, v2482); \ i32x4 i3_v3803 = vec_mul(c3, v3803); \ \ i32x4 n1 = vec_sub(i1, v2048); \ i1 = vec_add(i1, v2048); \ \ \ i32x4 o0 = vec_add(i0_v1321, i2_v3803); \ i32x4 o1 = vec_sub(i0_v2482, i2_v1321); \ i32x4 o2 = vec_sub(c0, c2); \ i32x4 o3 = vec_add(i0_v3803, i2_v2482); \ \ o0 = vec_add(o0, i3_v2482); \ o1 = 
vec_sub(o1, i3_v3803); \ o2 = vec_add(o2, c3); \ o3 = vec_sub(o3, i3_v1321); \ \ o0 = vec_add(o0, i1); \ o1 = vec_add(o1, i1); \ o2 = vec_mul(o2, v3344); \ o3 = vec_sub(o3, n1); \ \ o2 = vec_add(o2, v2048); \ \ oc0 = vec_sra(o0, vec_splat_u32(12)); \ oc1 = vec_sra(o1, vec_splat_u32(12)); \ oc2 = vec_sra(o2, vec_splat_u32(12)); \ oc3 = vec_sra(o3, vec_splat_u32(12)); \ } #define adst_4_in(c0, c1, c2, c3, c01, c23) \ { \ ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \ } #define flipadst_4_in(c0, c1, c2, c3, c01, c23) \ { \ ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \ } #define adst_4_out(c0, c1, c2, c3, c01, c23) \ { \ ADST_INNER_4(c0, c1, c2, c3, c0, c1, c2, c3) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ } #define flipadst_4_out(c0, c1, c2, c3, c01, c23) \ { \ ADST_INNER_4(c0, c1, c2, c3, c3, c2, c1, c0) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ } static void dc_only_4xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) { int dc = coeff[0]; const int rnd = (1 << shift) >> 1; if (is_rect2) dc = (dc * 181 + 128) >> 8; dc = (dc * 181 + 128) >> 8; dc = (dc + rnd) >> shift; dc = (dc * 181 + 128 + 2048) >> 12; i16x8 vdc = vec_splats((int16_t)dc); coeff[0] = 0; for (int i = 0; i < n; i++, dst += 4 * stride) { LOAD_DECLARE_4(dst, stride, a, b, c, d) i16x8 as = u8h_to_i16(a); i16x8 bs = u8h_to_i16(b); i16x8 cs = u8h_to_i16(c); i16x8 ds = u8h_to_i16(d); as = vec_adds(as, vdc); bs = vec_adds(bs, vdc); cs = vec_adds(cs, vdc); ds = vec_adds(ds, vdc); a = vec_packsu(as, as); b = vec_packsu(bs, bs); c = vec_packsu(cs, cs); d = vec_packsu(ds, ds); STORE_4(dst, stride, a, b, c, d) } } static void dc_only_8xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) { int dc = coeff[0]; const int rnd = (1 << shift) >> 1; if (is_rect2) dc = (dc * 181 + 128) >> 8; dc = (dc * 181 + 128) >> 8; dc = (dc + rnd) >> shift; dc = (dc * 181 + 128 + 2048) >> 12; i16x8 vdc = 
vec_splats((int16_t)dc); coeff[0] = 0; for (int i = 0; i < n; i++, dst += 4 * stride) { LOAD_DECLARE_4(dst, stride, a, b, c, d) i16x8 as = u8h_to_i16(a); i16x8 bs = u8h_to_i16(b); i16x8 cs = u8h_to_i16(c); i16x8 ds = u8h_to_i16(d); as = vec_adds(as, vdc); bs = vec_adds(bs, vdc); cs = vec_adds(cs, vdc); ds = vec_adds(ds, vdc); a = vec_packsu(as, as); b = vec_packsu(bs, bs); c = vec_packsu(cs, cs); d = vec_packsu(ds, ds); STORE_8(dst, stride, a, b, c, d) } } static void dc_only_16xN(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, int n, int is_rect2, int shift) { int dc = coeff[0]; const int rnd = (1 << shift) >> 1; if (is_rect2) dc = (dc * 181 + 128) >> 8; dc = (dc * 181 + 128) >> 8; dc = (dc + rnd) >> shift; dc = (dc * 181 + 128 + 2048) >> 12; i16x8 vdc = vec_splats((int16_t)dc); coeff[0] = 0; for (int i = 0; i < n; i++, dst += 4 * stride) { LOAD_DECLARE_4(dst, stride, a, b, c, d) i16x8 ah = u8h_to_i16(a); i16x8 bh = u8h_to_i16(b); i16x8 ch = u8h_to_i16(c); i16x8 dh = u8h_to_i16(d); i16x8 al = u8l_to_i16(a); i16x8 bl = u8l_to_i16(b); i16x8 cl = u8l_to_i16(c); i16x8 dl = u8l_to_i16(d); ah = vec_adds(ah, vdc); bh = vec_adds(bh, vdc); ch = vec_adds(ch, vdc); dh = vec_adds(dh, vdc); al = vec_adds(al, vdc); bl = vec_adds(bl, vdc); cl = vec_adds(cl, vdc); dl = vec_adds(dl, vdc); a = vec_packsu(ah, al); b = vec_packsu(bh, bl); c = vec_packsu(ch, cl); d = vec_packsu(dh, dl); STORE_16(dst, stride, a, b, c, d) } } void dav2d_inv_txfm_add_dct_dct_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob) { assert(eob >= 0); if (eob < 1) { return dc_only_4xN(dst, stride, coeff, 1, 0, 0); } LOAD_COEFF_4(coeff) dct_4_in(c0, c1, c2, c3, c01, c23) TRANSPOSE4_I32(c0, c1, c2, c3) memset(coeff, 0, sizeof(*coeff) * 4 * 4); dct_4_out(c0, c1, c2, c3, c01, c23) LOAD_DECLARE_4(dst, stride, a, b, c, d) APPLY_COEFF_4(a, b, c, d, c01, c23) STORE_4(dst, stride, a, b, c, d) } void dav2d_inv_txfm_add_wht_wht_4x4_8bpc_pwr9(pixel *dst, const ptrdiff_t 
stride, coef *const coeff, const int eob) { LOAD_COEFF_4(coeff) u32x4 v2 = vec_splat_u32(2); c0 = vec_sra(c0, v2); c1 = vec_sra(c1, v2); c2 = vec_sra(c2, v2); c3 = vec_sra(c3, v2); i32x4 t0 = vec_add(c0, c1); i32x4 t2 = vec_sub(c2, c3); i32x4 t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1)); i32x4 t3 = vec_sub(t4, c3); i32x4 t1 = vec_sub(t4, c1); c0 = vec_sub(t0, t3); c1 = t3; c2 = t1; c3 = vec_add(t2, t1); memset(coeff, 0, sizeof(*coeff) * 4 * 4); TRANSPOSE4_I32(c0, c1, c2, c3) t0 = vec_add(c0, c1); t2 = vec_sub(c2, c3); t4 = vec_sra(vec_sub(t0, t2), vec_splat_u32(1)); t3 = vec_sub(t4, c3); t1 = vec_sub(t4, c1); c0 = vec_sub(t0, t3); c1 = t3; c2 = t1; c3 = vec_add(t2, t1); c01 = vec_packs(c0, c1); c23 = vec_packs(c2, c3); LOAD_DECLARE_4(dst, stride, a, b, c, d) u8x16 ab = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); u8x16 cd = (u8x16)vec_mergeh((u32x4)c, (u32x4)d); i16x8 abs = u8h_to_i16(ab); i16x8 cds = u8h_to_i16(cd); abs = vec_adds(abs, c01); cds = vec_adds(cds, c23); a = vec_packsu(abs, abs); c = vec_packsu(cds, cds); b = (u8x16)vec_mergeo((u32x4)a, (u32x4)a); d = (u8x16)vec_mergeo((u32x4)c, (u32x4)c); STORE_4(dst, stride, a, b, c, d) } #define inv_txfm_fn4x4(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_4x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_COEFF_4(coeff) \ type1##_4_in(c0, c1, c2, c3, c01, c23) \ memset(coeff, 0, sizeof(*coeff) * 4 * 4); \ TRANSPOSE4_I32(c0, c1, c2, c3) \ type2##_4_out(c0, c1, c2, c3, c01, c23) \ LOAD_DECLARE_4(dst, stride, a, b, c, d) \ APPLY_COEFF_4(a, b, c, d, c01, c23) \ STORE_4(dst, stride, a, b, c, d) \ } inv_txfm_fn4x4(adst, dct ) inv_txfm_fn4x4(dct, adst ) inv_txfm_fn4x4(dct, flipadst) inv_txfm_fn4x4(flipadst, dct ) inv_txfm_fn4x4(adst, flipadst) inv_txfm_fn4x4(flipadst, adst ) inv_txfm_fn4x4(identity, dct ) inv_txfm_fn4x4(dct, identity) inv_txfm_fn4x4(identity, flipadst) inv_txfm_fn4x4(flipadst, identity) inv_txfm_fn4x4(identity, adst ) inv_txfm_fn4x4(adst, 
identity) inv_txfm_fn4x4(identity, identity) inv_txfm_fn4x4(adst, adst ) inv_txfm_fn4x4(flipadst, flipadst) #define IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \ dct4_for_dct8(c0, c2, c4, c6, c03, c12) \ \ i32x4 v799 = vec_splats(799); \ i32x4 v4017 = vec_splats(4017); \ i32x4 v3406 = vec_splats(3406); \ i32x4 v2276 = vec_splats(2276); \ i32x4 v2048 = vec_splats(2048); \ u32x4 v12 = vec_splat_u32(12); \ \ i32x4 c1v799 = vec_mul(c1, v799); \ i32x4 c7v4017 = vec_mul(c7, v4017); \ i32x4 c5v3406 = vec_mul(c5, v3406); \ i32x4 c3v2276 = vec_mul(c3, v2276); \ i32x4 c5v2276 = vec_mul(c5, v2276); \ i32x4 c3v3406 = vec_mul(c3, v3406); \ i32x4 c1v4017 = vec_mul(c1, v4017); \ i32x4 c7v799 = vec_mul(c7, v799); \ \ i32x4 t4a = vec_subs(c1v799, c7v4017); \ i32x4 t5a = vec_subs(c5v3406, c3v2276); \ i32x4 t6a = vec_adds(c5v2276, c3v3406); \ i32x4 t7a = vec_adds(c1v4017, c7v799); \ \ t4a = vec_adds(t4a, v2048); \ t5a = vec_adds(t5a, v2048); \ t6a = vec_adds(t6a, v2048); \ t7a = vec_adds(t7a, v2048); \ \ t4a = vec_sra(t4a, v12); \ t7a = vec_sra(t7a, v12); \ t5a = vec_sra(t5a, v12); \ t6a = vec_sra(t6a, v12); \ \ i16x8 t7at4a = vec_packs(t7a, t4a); \ i16x8 t6at5a = vec_packs(t6a, t5a); \ \ i16x8 t7t4 = vec_adds(t7at4a, t6at5a); \ t6at5a = vec_subs(t7at4a, t6at5a); \ \ t6a = i16h_to_i32(t6at5a); \ t5a = i16l_to_i32(t6at5a); \ \ i32x4 t6 = vec_add(t6a, t5a); \ i32x4 t5 = vec_sub(t6a, t5a); \ \ t6 = vec_mul(t6, vec_splats(181)); \ t5 = vec_mul(t5, vec_splats(181)); \ t6 = vec_add(t6, vec_splats(128)); \ t5 = vec_add(t5, vec_splats(128)); \ \ t6 = vec_sra(t6, vec_splat_u32(8)); \ t5 = vec_sra(t5, vec_splat_u32(8)); \ \ i16x8 t6t5 = vec_packs(t6, t5); \ \ c74 = vec_subs(c03, t7t4); \ c65 = vec_subs(c12, t6t5); \ c03 = vec_adds(c03, t7t4); \ c12 = vec_adds(c12, t6t5); \ #define UNPACK_4_I16_I32(t0, t1, t2, t3) \ t0 = i16h_to_i32(t0##t1); \ t1 = i16l_to_i32(t0##t1); \ t2 = i16h_to_i32(t2##t3); \ t3 = i16l_to_i32(t2##t3); #define UNPACK_PAIR_I16_I32(hi, lo, v) \ hi = 
i16h_to_i32(v); \ lo = i16l_to_i32(v); \ #define dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, ...) \ { \ i16x8 c0##c3, c1##c2, c7##c4, c6##c5; \ IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c0##c3, c1##c2, c7##c4, c6##c5) \ UNPACK_4_I16_I32(c0, c3, c1, c2) \ UNPACK_4_I16_I32(c7, c4, c6, c5) \ } #define dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ { \ i16x8 c03, c12, c74, c65; \ IDCT_8_INNER(c0, c1, c2, c3, c4, c5, c6, c7, c03, c12, c74, c65) \ c01 = (i16x8)vec_mergeh((u64x2)c03, (u64x2)c12); \ c23 = (i16x8)vec_mergel((u64x2)c12, (u64x2)c03); \ c45 = (i16x8)vec_mergel((u64x2)c74, (u64x2)c65); \ c67 = (i16x8)vec_mergeh((u64x2)c65, (u64x2)c74); \ } #define dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ dct_8_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h,) \ dct_8_in(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l,) \ } #define dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ i16x8 c03h, c12h, c74h, c65h; \ i16x8 c03l, c12l, c74l, c65l; \ { \ IDCT_8_INNER(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c03h, c12h, c74h, c65h) \ } \ { \ IDCT_8_INNER(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c03l, c12l, c74l, c65l) \ } \ c0 = (i16x8)vec_mergeh((u64x2)c03h, (u64x2)c03l); \ c3 = (i16x8)vec_mergel((u64x2)c03h, (u64x2)c03l); \ c1 = (i16x8)vec_mergeh((u64x2)c12h, (u64x2)c12l); \ c2 = (i16x8)vec_mergel((u64x2)c12h, (u64x2)c12l); \ c7 = (i16x8)vec_mergeh((u64x2)c74h, (u64x2)c74l); \ c4 = (i16x8)vec_mergel((u64x2)c74h, (u64x2)c74l); \ c6 = (i16x8)vec_mergeh((u64x2)c65h, (u64x2)c65l); \ c5 = (i16x8)vec_mergel((u64x2)c65h, (u64x2)c65l); \ } #define IDENTITY_8(c01, c23, c45, c67) \ { \ c01 = vec_adds(c01, c01); \ c23 = vec_adds(c23, c23); \ c45 = vec_adds(c45, c45); \ c67 = vec_adds(c67, c67); \ } #define identity_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ { \ IDENTITY_8(c01, c23, c45, c67) \ 
UNPACK_PAIR_I16_I32(c0, c1, c01) \ UNPACK_PAIR_I16_I32(c2, c3, c23) \ UNPACK_PAIR_I16_I32(c4, c5, c45) \ UNPACK_PAIR_I16_I32(c6, c7, c67) \ } #define identity_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c45 = vec_packs(c4, c5); \ c67 = vec_packs(c6, c7); \ IDENTITY_8(c01, c23, c45, c67) #define identity_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ IDENTITY_8(c0, c1, c2, c3) \ IDENTITY_8(c4, c5, c6, c7) \ UNPACK_PAIR_I16_I32(c0h, c0l, c0) \ UNPACK_PAIR_I16_I32(c1h, c1l, c1) \ UNPACK_PAIR_I16_I32(c2h, c2l, c2) \ UNPACK_PAIR_I16_I32(c3h, c3l, c3) \ UNPACK_PAIR_I16_I32(c4h, c4l, c4) \ UNPACK_PAIR_I16_I32(c5h, c5l, c5) \ UNPACK_PAIR_I16_I32(c6h, c6l, c6) \ UNPACK_PAIR_I16_I32(c7h, c7l, c7) \ } #define PACK_4(c0, c1, c2, c3, \ c0h, c1h, c2h, c3h, \ c0l, c1l, c2l, c3l) \ { \ c0 = vec_packs(c0h, c0l); \ c1 = vec_packs(c1h, c1l); \ c2 = vec_packs(c2h, c2l); \ c3 = vec_packs(c3h, c3l); \ } #define DECLARE_PACK_4(c0, c1, c2, c3, \ c0h, c1h, c2h, c3h, \ c0l, c1l, c2l, c3l) \ i16x8 c0, c1, c2, c3; \ PACK_4(c0, c1, c2, c3, c0h, c1h, c2h, c3h, c0l, c1l, c2l, c3l); #define PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ { \ c0 = vec_packs(c0h, c0l); \ c1 = vec_packs(c1h, c1l); \ c2 = vec_packs(c2h, c2l); \ c3 = vec_packs(c3h, c3l); \ c4 = vec_packs(c4h, c4l); \ c5 = vec_packs(c5h, c5l); \ c6 = vec_packs(c6h, c6l); \ c7 = vec_packs(c7h, c7l); \ } #define identity_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ IDENTITY_8(c0, c1, c2, c3) \ IDENTITY_8(c4, c5, c6, c7) \ } #define DECLARE_SPLAT_I32(val) \ i32x4 v##val = vec_splats(val); #define 
DECLARE_MUL_PAIR_I32(ca, cb, va, vb) \ i32x4 ca##va = vec_mul(ca, va); \ i32x4 cb##vb = vec_mul(cb, vb); \ i32x4 ca##vb = vec_mul(ca, vb); \ i32x4 cb##va = vec_mul(cb, va); #define ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \ r0 = vec_adds(ca##va, cb##vb); \ r1 = vec_subs(ca##vb, cb##va); #define DECLARE_ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) \ i32x4 r0, r1; \ ADD_SUB_PAIR(r0, r1, ca, cb, va, vb) #define SCALE_ROUND_4(a, b, c, d, rnd, shift) \ a = vec_adds(a, rnd); \ b = vec_adds(b, rnd); \ c = vec_adds(c, rnd); \ d = vec_adds(d, rnd); \ a = vec_sra(a, shift); \ b = vec_sra(b, shift); \ c = vec_sra(c, shift); \ d = vec_sra(d, shift); #define ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ o0, o1, o2, o3, o4, o5, o6, o7) \ { \ DECLARE_SPLAT_I32(4076) \ DECLARE_SPLAT_I32(401) \ \ DECLARE_SPLAT_I32(3612) \ DECLARE_SPLAT_I32(1931) \ \ DECLARE_SPLAT_I32(2598) \ DECLARE_SPLAT_I32(3166) \ \ DECLARE_SPLAT_I32(1189) \ DECLARE_SPLAT_I32(3920) \ \ DECLARE_SPLAT_I32(3784) \ DECLARE_SPLAT_I32(1567) \ \ DECLARE_SPLAT_I32(2048) \ u32x4 v12 = vec_splat_u32(12); \ \ DECLARE_MUL_PAIR_I32(c7, c0, v4076, v401) \ DECLARE_MUL_PAIR_I32(c5, c2, v3612, v1931) \ DECLARE_MUL_PAIR_I32(c3, c4, v2598, v3166) \ DECLARE_MUL_PAIR_I32(c1, c6, v1189, v3920) \ \ DECLARE_ADD_SUB_PAIR(t0a, t1a, c7, c0, v4076, v401) \ DECLARE_ADD_SUB_PAIR(t2a, t3a, c5, c2, v3612, v1931) \ DECLARE_ADD_SUB_PAIR(t4a, t5a, c3, c4, v2598, v3166) \ DECLARE_ADD_SUB_PAIR(t6a, t7a, c1, c6, v1189, v3920) \ \ SCALE_ROUND_4(t0a, t1a, t2a, t3a, v2048, v12) \ SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \ \ i32x4 t0 = vec_add(t0a, t4a); \ i32x4 t1 = vec_add(t1a, t5a); \ i32x4 t2 = vec_add(t2a, t6a); \ i32x4 t3 = vec_add(t3a, t7a); \ i32x4 t4 = vec_sub(t0a, t4a); \ i32x4 t5 = vec_sub(t1a, t5a); \ i32x4 t6 = vec_sub(t2a, t6a); \ i32x4 t7 = vec_sub(t3a, t7a); \ \ i16x8 t0t1 = vec_packs(t0, t1); \ i16x8 t2t3 = vec_packs(t2, t3); \ i16x8 t4t5 = vec_packs(t4, t5); \ i16x8 t6t7 = vec_packs(t6, t7); \ \ UNPACK_4_I16_I32(t4, t5, t6, t7) \ 
UNPACK_4_I16_I32(t0, t1, t2, t3) \ \ DECLARE_MUL_PAIR_I32(t4, t5, v3784, v1567) \ DECLARE_MUL_PAIR_I32(t7, t6, v3784, v1567) \ \ ADD_SUB_PAIR(t4a, t5a, t4, t5, v3784, v1567) \ ADD_SUB_PAIR(t7a, t6a, t7, t6, v1567, v3784) \ \ SCALE_ROUND_4(t4a, t5a, t6a, t7a, v2048, v12) \ \ o0 = vec_add(t0, t2); \ o1 = vec_add(t4a, t6a); \ o7 = vec_add(t1, t3); \ o6 = vec_add(t5a, t7a); \ t2 = vec_sub(t0, t2); \ t3 = vec_sub(t1, t3); \ t6 = vec_sub(t4a, t6a); \ t7 = vec_sub(t5a, t7a); \ \ i16x8 o7##o1 = vec_packs(o7, o1); \ i16x8 o0##o6 = vec_packs(o0, o6); \ t2t3 = vec_packs(t2, t3); \ t6t7 = vec_packs(t6, t7); \ \ UNPACK_4_I16_I32(t2, t3, t6, t7) \ UNPACK_4_I16_I32(o7, o1, o0, o6) \ \ o7 = -o7; \ o1 = -o1; \ \ o3 = vec_add(t2, t3); \ o4 = vec_sub(t2, t3); \ o5 = vec_sub(t6, t7); \ o2 = vec_add(t6, t7); \ \ i32x4 v181 = vec_splats(181); \ i32x4 v128 = vec_splats(128); \ u32x4 v8 = vec_splat_u32(8); \ \ o2 = vec_mul(o2, v181); \ o3 = vec_mul(o3, v181); \ o4 = vec_mul(o4, v181); \ o5 = vec_mul(o5, v181); \ \ SCALE_ROUND_4(o2, o3, o4, o5, v128, v8) \ \ o3 = -o3; \ o5 = -o5; \ } #define adst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ {\ ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0, c1, c2, c3, c4, c5, c6, c7) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c45 = vec_packs(c4, c5); \ c67 = vec_packs(c6, c7); \ UNPACK_PAIR_I16_I32(c0, c1, c01) \ UNPACK_PAIR_I16_I32(c2, c3, c23) \ UNPACK_PAIR_I16_I32(c4, c5, c45) \ UNPACK_PAIR_I16_I32(c6, c7, c67) \ } #define adst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ {\ ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0, c1, c2, c3, c4, c5, c6, c7) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c45 = vec_packs(c4, c5); \ c67 = vec_packs(c6, c7); \ } #define adst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \ 
ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ } #define adst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h) \ ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ } #define flipadst_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ {\ ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c7, c6, c5, c4, c3, c2, c1, c0) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c45 = vec_packs(c4, c5); \ c67 = vec_packs(c6, c7); \ UNPACK_PAIR_I16_I32(c0, c1, c01) \ UNPACK_PAIR_I16_I32(c2, c3, c23) \ UNPACK_PAIR_I16_I32(c4, c5, c45) \ UNPACK_PAIR_I16_I32(c6, c7, c67) \ } #define flipadst_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ {\ ADST_INNER_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c7, c6, c5, c4, c3, c2, c1, c0) \ c01 = vec_packs(c0, c1); \ c23 = vec_packs(c2, c3); \ c45 = vec_packs(c4, c5); \ c67 = vec_packs(c6, c7); \ } #define flipadst_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \ ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \ } #define flipadst_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ { \ ADST_INNER_8(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c7h, c6h, c5h, c4h, c3h, c2h, c1h, c0h) \ ADST_INNER_8(c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c7l, c6l, c5l, c4l, c3l, c2l, c1l, c0l) \ PACK_8(c0, c1, c2, c3, c4, c5, c6, c7, \ c0h, c1h, c2h, c3h, c4h, c5h, 
c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ } void dav2d_inv_txfm_add_dct_dct_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob) { i16x8 v = vec_splats((int16_t)(2896*8)); if (eob < 1) { return dc_only_4xN(dst, stride, coeff, 2, 1, 0); } LOAD_SCALE_COEFF_4x8(coeff, v) dct_4_in(c0, c1, c2, c3, c01, c23) dct_4_in(c4, c5, c6, c7, c45, c67) memset(coeff, 0, sizeof(*coeff) * 4 * 8); TRANSPOSE4_I32(c0, c1, c2, c3); TRANSPOSE4_I32(c4, c5, c6, c7); dct_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) LOAD_DECLARE_4(dst, stride, a, b, cc, d) LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh) APPLY_COEFF_4(a, b, cc, d, c01, c23) APPLY_COEFF_4(e, f, g, hh, c45, c67) STORE_4(dst, stride, a, b, cc, d) STORE_4(dst + 4 * stride, stride, e, f, g, hh) } #define inv_txfm_fn4x8(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_4x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ i16x8 v = vec_splats((int16_t)(2896*8)); \ LOAD_SCALE_COEFF_4x8(coeff, v) \ type1##_4_in(c0, c1, c2, c3, c01, c23) \ type1##_4_in(c4, c5, c6, c7, c45, c67) \ memset(coeff, 0, sizeof(*coeff) * 4 * 8); \ TRANSPOSE4_I32(c0, c1, c2, c3); \ TRANSPOSE4_I32(c4, c5, c6, c7); \ type2##_8_out(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ LOAD_DECLARE_4(dst, stride, a, b, c, d) \ LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ APPLY_COEFF_4(a, b, c, d, c01, c23) \ APPLY_COEFF_4(e, f, g, h, c45, c67) \ STORE_4(dst, stride, a, b, c, d) \ STORE_4(dst + 4 * stride, stride, e, f, g, h) \ } inv_txfm_fn4x8(adst, dct ) inv_txfm_fn4x8(dct, adst ) inv_txfm_fn4x8(dct, flipadst) inv_txfm_fn4x8(flipadst, dct ) inv_txfm_fn4x8(adst, flipadst) inv_txfm_fn4x8(flipadst, adst ) inv_txfm_fn4x8(identity, dct ) inv_txfm_fn4x8(dct, identity) inv_txfm_fn4x8(identity, flipadst) inv_txfm_fn4x8(flipadst, identity) inv_txfm_fn4x8(identity, adst ) inv_txfm_fn4x8(adst, identity) inv_txfm_fn4x8(identity, identity) 
inv_txfm_fn4x8(adst, adst ) inv_txfm_fn4x8(flipadst, flipadst) void dav2d_inv_txfm_add_dct_dct_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob) { i16x8 v = vec_splats((int16_t)(2896*8)); if (eob < 1) { return dc_only_8xN(dst, stride, coeff, 1, 1, 0); } LOAD_SCALE_COEFF_8x4(coeff, v) dct_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) memset(coeff, 0, sizeof(*coeff) * 8 * 4); TRANSPOSE4_I32(c0, c1, c2, c3) TRANSPOSE4_I32(c4, c5, c6, c7) dct_4_out(c0, c1, c2, c3, c01, c23) dct_4_out(c4, c5, c6, c7, c45, c67) LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); APPLY_COEFF_8x4(ae, bf, c04, c15) APPLY_COEFF_8x4(cg, dh, c26, c37) STORE_8(dst, stride, ae, bf, cg, dh) } #define inv_txfm_fn8x4(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_8x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ i16x8 v = vec_splats((int16_t)(2896*8)); \ LOAD_SCALE_COEFF_8x4(coeff, v) \ type1##_8_in(c0, c1, c2, c3, c4, c5, c6, c7, c01, c23, c45, c67) \ memset(coeff, 0, sizeof(*coeff) * 8 * 4); \ TRANSPOSE4_I32(c0, c1, c2, c3) \ TRANSPOSE4_I32(c4, c5, c6, c7) \ type2##_4_out(c0, c1, c2, c3, c01, c23) \ type2##_4_out(c4, c5, c6, c7, c45, c67) \ LOAD_DECLARE_4(dst, stride, ae, bf, cg, dh) \ i16x8 c04 = (i16x8)vec_mergeh((u64x2)c01, (u64x2)c45); \ i16x8 c15 = (i16x8)vec_mergel((u64x2)c01, (u64x2)c45); \ i16x8 c26 = (i16x8)vec_mergeh((u64x2)c23, (u64x2)c67); \ i16x8 c37 = (i16x8)vec_mergel((u64x2)c23, (u64x2)c67); \ APPLY_COEFF_8x4(ae, bf, c04, c15) \ APPLY_COEFF_8x4(cg, dh, c26, c37) \ STORE_8(dst, stride, ae, bf, cg, dh) \ } inv_txfm_fn8x4(adst, dct ) inv_txfm_fn8x4(dct, adst ) inv_txfm_fn8x4(dct, flipadst) inv_txfm_fn8x4(flipadst, dct ) inv_txfm_fn8x4(adst, flipadst) 
inv_txfm_fn8x4(flipadst, adst ) inv_txfm_fn8x4(identity, dct ) inv_txfm_fn8x4(dct, identity) inv_txfm_fn8x4(identity, flipadst) inv_txfm_fn8x4(flipadst, identity) inv_txfm_fn8x4(identity, adst ) inv_txfm_fn8x4(adst, identity) inv_txfm_fn8x4(identity, identity) inv_txfm_fn8x4(adst, adst ) inv_txfm_fn8x4(flipadst, flipadst) void dav2d_inv_txfm_add_dct_dct_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob) { if (eob < 1) { return dc_only_8xN(dst, stride, coeff, 2, 0, 1); } LOAD_COEFF_8x8(coeff) dct_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c0, c1, c2, c3, c4, c5, c6, c7) memset(coeff, 0, sizeof(*coeff) * 8 * 8); SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c0l, c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) dct_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, c0, c1, c2, c3, c4, c5, c6, c7) LOAD_DECLARE_4(dst, stride, a, b, cc, d) LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, hh) APPLY_COEFF_8x4(a, b, c0, c1) APPLY_COEFF_8x4(cc, d, c2, c3) APPLY_COEFF_8x4(e, f, c4, c5) APPLY_COEFF_8x4(g, hh, c6, c7) STORE_8(dst, stride, a, b, cc, d) STORE_8(dst + 4 * stride, stride, e, f, g, hh) } #define inv_txfm_fn8x8(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_COEFF_8x8(coeff) \ type1##_8x2_in(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ SCALE_ROUND_4(c0h, c1h, c2h, c3h, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c4h, c5h, c6h, c7h, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c0l, 
c1l, c2l, c3l, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c4l, c5l, c6l, c7l, vec_splat_s32(1), vec_splat_u32(1)) \ memset(coeff, 0, sizeof(*coeff) * 8 * 8); \ TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ LOAD_DECLARE_4(dst, stride, a, b, c, d) \ LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ APPLY_COEFF_8x4(a, b, c0, c1) \ APPLY_COEFF_8x4(c, d, c2, c3) \ APPLY_COEFF_8x4(e, f, c4, c5) \ APPLY_COEFF_8x4(g, h, c6, c7) \ STORE_8(dst, stride, a, b, c, d) \ STORE_8(dst + 4 * stride, stride, e, f, g, h) \ } inv_txfm_fn8x8(adst, dct ) inv_txfm_fn8x8(dct, adst ) inv_txfm_fn8x8(dct, flipadst) inv_txfm_fn8x8(flipadst, dct ) inv_txfm_fn8x8(adst, flipadst) inv_txfm_fn8x8(flipadst, adst ) inv_txfm_fn8x8(dct, identity) inv_txfm_fn8x8(flipadst, identity) inv_txfm_fn8x8(adst, identity) inv_txfm_fn8x8(adst, adst ) inv_txfm_fn8x8(flipadst, flipadst) // identity + scale is a no op #define inv_txfm_fn8x8_identity(type2) \ void dav2d_inv_txfm_add_identity_##type2##_8x8_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_COEFF_8x8(coeff) \ memset(coeff, 0, sizeof(*coeff) * 8 * 8); \ TRANSPOSE8_I32(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l) \ type2##_8x2_out(c0h, c1h, c2h, c3h, c4h, c5h, c6h, c7h, \ c0l, c1l, c2l, c3l, c4l, c5l, c6l, c7l, \ c0, c1, c2, c3, c4, c5, c6, c7) \ LOAD_DECLARE_4(dst, stride, a, b, c, d) \ LOAD_DECLARE_4(dst + 4 * stride, stride, e, f, g, h) \ APPLY_COEFF_8x4(a, b, c0, c1) \ APPLY_COEFF_8x4(c, d, c2, c3) \ APPLY_COEFF_8x4(e, f, c4, c5) \ APPLY_COEFF_8x4(g, h, c6, c7) \ STORE_8(dst, stride, a, b, c, d) \ STORE_8(dst + 4 * stride, stride, e, f, g, h) \ } inv_txfm_fn8x8_identity(dct ) inv_txfm_fn8x8_identity(flipadst) inv_txfm_fn8x8_identity(adst ) 
inv_txfm_fn8x8_identity(identity) #define CLIP16_I32_8(a, b, c, d, e, f, g, h, \ ab, cd, ef, gh) \ { \ ab = vec_packs(a, b); \ cd = vec_packs(c, d); \ ef = vec_packs(e, f); \ gh = vec_packs(g, h); \ UNPACK_PAIR_I16_I32(a, b, ab) \ UNPACK_PAIR_I16_I32(c, d, cd) \ UNPACK_PAIR_I16_I32(e, f, ef) \ UNPACK_PAIR_I16_I32(g, h, gh) \ } #define MUL_4_INPLACE(a, b, c, d, v) \ a = vec_mul(a, v); \ b = vec_mul(b, v); \ c = vec_mul(c, v); \ d = vec_mul(d, v); \ #define IDENTITY_16_V(v) \ { \ i16x8 v_ = vec_adds(v, v); \ v = vec_mradds(v, v1697_16, v_); \ } #define IDENTITY_16_INNER(c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ i16x8 v1697_16 = vec_splats((int16_t)(1697*16)); \ IDENTITY_16_V(c00c01) \ IDENTITY_16_V(c02c03) \ IDENTITY_16_V(c04c05) \ IDENTITY_16_V(c06c07) \ IDENTITY_16_V(c08c09) \ IDENTITY_16_V(c10c11) \ IDENTITY_16_V(c12c13) \ IDENTITY_16_V(c14c15) \ } #define IDENTITY_16_4_I32(a, b, c, d) \ { \ i32x4 a2 = vec_add(a, a); \ i32x4 b2 = vec_add(b, b); \ i32x4 c2 = vec_add(c, c); \ i32x4 d2 = vec_add(d, d); \ MUL_4_INPLACE(a, b, c, d, v1697) \ SCALE_ROUND_4(a, b, c, d, v1024, vec_splat_u32(11)); \ a = vec_add(a2, a); \ b = vec_add(b2, b); \ c = vec_add(c2, c); \ d = vec_add(d2, d); \ } #define identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ DECLARE_SPLAT_I32(1697) \ DECLARE_SPLAT_I32(1024) \ IDENTITY_16_4_I32(c00, c01, c02, c03) \ IDENTITY_16_4_I32(c04, c05, c06, c07) \ IDENTITY_16_4_I32(c08, c09, c10, c11) \ IDENTITY_16_4_I32(c12, c13, c14, c15) \ } #define identity_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ c00, c02, c04, c06, c08, c10, c12, c14, \ c01, c03, c05, c07, c09, c11, c13, c15) \ IDENTITY_16_INNER(c00c01, c02c03, c04c05, 
c06c07, \ c08c09, c10c11, c12c13, c14c15) \ } #define IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c03, c01c02, c07c04, c06c05, \ c08c11, c09c10, c14c13, c15c12) \ IDCT_8_INNER(c00, c02, c04, c06, c08, c10, c12, c14, \ c00c03, c01c02, c07c04, c06c05) \ DECLARE_SPLAT_I32(128) \ DECLARE_SPLAT_I32(181) \ DECLARE_SPLAT_I32(401) \ DECLARE_SPLAT_I32(4076) \ DECLARE_SPLAT_I32(3166) \ DECLARE_SPLAT_I32(2598) \ DECLARE_SPLAT_I32(1931) \ DECLARE_SPLAT_I32(3612) \ DECLARE_SPLAT_I32(3920) \ DECLARE_SPLAT_I32(1189) \ DECLARE_SPLAT_I32(1567) \ DECLARE_SPLAT_I32(3784) \ \ DECLARE_MUL_PAIR_I32(c01, c15, v401, v4076) \ DECLARE_MUL_PAIR_I32(c09, c07, v3166, v2598) \ DECLARE_MUL_PAIR_I32(c05, c11, v1931, v3612) \ DECLARE_MUL_PAIR_I32(c13, c03, v3920, v1189) \ \ DECLARE_ADD_SUB_PAIR(t15a, t08a, c01, c15, v4076, v401) \ DECLARE_ADD_SUB_PAIR(t14a, t09a, c09, c07, v2598, v3166) \ DECLARE_ADD_SUB_PAIR(t13a, t10a, c05, c11, v3612, v1931) \ DECLARE_ADD_SUB_PAIR(t12a, t11a, c13, c03, v1189, v3920) \ \ SCALE_ROUND_4(t15a, t08a, t14a, t09a, v2048, v12) \ SCALE_ROUND_4(t13a, t10a, t12a, t11a, v2048, v12) \ \ CLIP16_I32_8(t15a, t08a, t14a, t09a, \ t13a, t10a, t12a, t11a, \ c08c11, c09c10, c14c13, c15c12) \ DECLARE_ADD_SUB_PAIR(t08, t09, t08a, t09a,,) \ DECLARE_ADD_SUB_PAIR(t11, t10, t11a, t10a,,) \ DECLARE_ADD_SUB_PAIR(t12, t13, t12a, t13a,,) \ DECLARE_ADD_SUB_PAIR(t15, t14, t15a, t14a,,) \ \ CLIP16_I32_8(t08, t09, t11, t10, \ t12, t13, t15, t14, \ c08c11, c09c10, c14c13, c15c12) \ \ DECLARE_MUL_PAIR_I32(t14, t09, v1567, v3784) \ DECLARE_MUL_PAIR_I32(t13, t10, v1567, v3784) \ \ ADD_SUB_PAIR(t14a, t09a, t14, t09, v3784, v1567) \ ADD_SUB_PAIR(t10a, t13a, t13, t10, v3784, v1567) \ t10a = -t10a; \ \ SCALE_ROUND_4(t14a, t09a, t13a, t10a, v2048, v12) \ \ ADD_SUB_PAIR(t08a, t11a, t08, t11,,) \ ADD_SUB_PAIR(t09, t10, t09a, t10a,,) \ ADD_SUB_PAIR(t15a, t12a, t15, t12,,) \ ADD_SUB_PAIR(t14, t13, t14a, t13a,,) \ \ CLIP16_I32_8(t08a, t11a, t09, t10, \ 
t15a, t12a, t14, t13, \ c08c11, c09c10, c14c13, c15c12) \ ADD_SUB_PAIR(t13a, t10a, t13, t10,,); \ ADD_SUB_PAIR(t12, t11, t12a, t11a,,); \ \ MUL_4_INPLACE(t13a, t10a, t12, t11, v181); \ SCALE_ROUND_4(t13a, t10a, t12, t11, v128, vec_splat_u32(8)) \ \ DECLARE_PACK_4(t15at12, t14t13a, t08at11, t09t10a, \ t15a, t14, t08a, t09, \ t12, t13a, t11, t10a) \ \ c15c12 = vec_subs(c00c03, t15at12); \ c14c13 = vec_subs(c01c02, t14t13a); \ c08c11 = vec_subs(c07c04, t08at11); \ c09c10 = vec_subs(c06c05, t09t10a); \ c00c03 = vec_adds(c00c03, t15at12); \ c01c02 = vec_adds(c01c02, t14t13a); \ c07c04 = vec_adds(c07c04, t08at11); \ c06c05 = vec_adds(c06c05, t09t10a); \ #define dct_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ \ i16x8 c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12; \ IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \ c00c01 = (i16x8)vec_mergeh((u64x2)c00c03, (u64x2)c01c02); \ c02c03 = (i16x8)vec_mergel((u64x2)c01c02, (u64x2)c00c03); \ c04c05 = (i16x8)vec_mergel((u64x2)c07c04, (u64x2)c06c05); \ c06c07 = (i16x8)vec_mergeh((u64x2)c06c05, (u64x2)c07c04); \ c08c09 = (i16x8)vec_mergeh((u64x2)c08c11, (u64x2)c09c10); \ c10c11 = (i16x8)vec_mergel((u64x2)c09c10, (u64x2)c08c11); \ c12c13 = (i16x8)vec_mergel((u64x2)c15c12, (u64x2)c14c13); \ c14c15 = (i16x8)vec_mergeh((u64x2)c14c13, (u64x2)c15c12); \ #define dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \ \ IDCT_16_INNER(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00c03, c01c02, c07c04, c06c05, c08c11, c09c10, c14c13, c15c12) \ UNPACK_PAIR_I16_I32(c00, c03, c00c03) \ UNPACK_PAIR_I16_I32(c01, c02, c01c02) \ UNPACK_PAIR_I16_I32(c07, c04, c07c04) \ 
UNPACK_PAIR_I16_I32(c06, c05, c06c05) \ UNPACK_PAIR_I16_I32(c08, c11, c08c11) \ UNPACK_PAIR_I16_I32(c09, c10, c09c10) \ UNPACK_PAIR_I16_I32(c14, c13, c14c13) \ UNPACK_PAIR_I16_I32(c15, c12, c15c12) \ #define dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ dct_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ dct_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ dct_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ dct_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) #define PACK_4x4(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ c00c01 = vec_packs(c00, c04); c02c03 = vec_packs(c08, c12); \ c04c05 = vec_packs(c01, c05); c06c07 = vec_packs(c09, c13); \ c08c09 = vec_packs(c02, c06); c10c11 = vec_packs(c10, c14); \ c12c13 = vec_packs(c03, c07); c14c15 = vec_packs(c11, c15); \ } #define dct_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ IDCT_4_INNER(c00, c01, c02, c03) \ IDCT_4_INNER(c04, c05, c06, c07) \ IDCT_4_INNER(c08, c09, c10, c11) \ IDCT_4_INNER(c12, c13, c14, c15) \ \ PACK_4x4(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ } #define IDENTITY_4_I32(a, b, c, d) \ { \ DECLARE_SPLAT_I32(5793) \ DECLARE_SPLAT_I32(2048) \ MUL_4_INPLACE(a, b, c, d, v5793) \ SCALE_ROUND_4(a, b, c, d, v2048, vec_splat_u32(12)) \ } #define identity_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ { \ IDENTITY_4_I32(cA0, cA1, cA2, cA3) \ IDENTITY_4_I32(cB0, cB1, cB2, cB3) \ IDENTITY_4_I32(cC0, cC1, cC2, cC3) \ IDENTITY_4_I32(cD0, cD1, cD2, cD3) \ } #define identity_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, 
c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ PACK_4x4(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ IDENTITY_4(c00c01, c02c03) \ IDENTITY_4(c04c05, c06c07) \ IDENTITY_4(c08c09, c10c11) \ IDENTITY_4(c12c13, c14c15) \ } #define adst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ adst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ adst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ adst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ adst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) #define adst_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_4(c00, c01, c02, c03, c00, c01, c02, c03) \ ADST_INNER_4(c04, c05, c06, c07, c04, c05, c06, c07) \ ADST_INNER_4(c08, c09, c10, c11, c08, c09, c10, c11) \ ADST_INNER_4(c12, c13, c14, c15, c12, c13, c14, c15) \ \ PACK_4x4(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ } #define flipadst_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ flipadst_4_in(cA0, cA1, cA2, cA3, a0b0, c0d0) \ flipadst_4_in(cB0, cB1, cB2, cB3, a1b1, c1d1) \ flipadst_4_in(cC0, cC1, cC2, cC3, a2b2, c2d2) \ flipadst_4_in(cD0, cD1, cD2, cD3, a3b3, c3d3) #define flipadst_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_4(c00, c01, c02, c03, c03, c02, c01, c00) \ ADST_INNER_4(c04, c05, c06, c07, c07, c06, c05, c04) \ ADST_INNER_4(c08, c09, c10, c11, c11, c10, c09, c08) 
\ ADST_INNER_4(c12, c13, c14, c15, c15, c14, c13, c12) \ \ PACK_4x4(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ } #define ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ o00, o01, o02, o03, o04, o05, o06, o07, \ o08, o09, o10, o11, o12, o13, o14, o15, \ c00c01, c02c03, c04c05, c06c07) \ DECLARE_SPLAT_I32(2048); \ u32x4 v12 = vec_splat_u32(12); \ DECLARE_SPLAT_I32(4091) \ DECLARE_SPLAT_I32(201) \ DECLARE_SPLAT_I32(3973) \ DECLARE_SPLAT_I32(995) \ DECLARE_SPLAT_I32(3703) \ DECLARE_SPLAT_I32(1751) \ DECLARE_SPLAT_I32(3290) \ DECLARE_SPLAT_I32(2440) \ DECLARE_SPLAT_I32(2751) \ DECLARE_SPLAT_I32(3035) \ DECLARE_SPLAT_I32(2106) \ DECLARE_SPLAT_I32(3513) \ DECLARE_SPLAT_I32(1380) \ DECLARE_SPLAT_I32(3857) \ DECLARE_SPLAT_I32(601) \ DECLARE_SPLAT_I32(4052) \ \ DECLARE_MUL_PAIR_I32(c15, c00, v4091, v201) \ DECLARE_MUL_PAIR_I32(c13, c02, v3973, v995) \ DECLARE_MUL_PAIR_I32(c11, c04, v3703, v1751) \ DECLARE_MUL_PAIR_I32(c09, c06, v3290, v2440) \ DECLARE_MUL_PAIR_I32(c07, c08, v2751, v3035) \ DECLARE_MUL_PAIR_I32(c05, c10, v2106, v3513) \ DECLARE_MUL_PAIR_I32(c03, c12, v1380, v3857) \ DECLARE_MUL_PAIR_I32(c01, c14, v601, v4052) \ \ DECLARE_ADD_SUB_PAIR(t00, t01, c15, c00, v4091, v201);\ DECLARE_ADD_SUB_PAIR(t02, t03, c13, c02, v3973, v995) \ DECLARE_ADD_SUB_PAIR(t04, t05, c11, c04, v3703, v1751) \ DECLARE_ADD_SUB_PAIR(t06, t07, c09, c06, v3290, v2440) \ DECLARE_ADD_SUB_PAIR(t08, t09, c07, c08, v2751, v3035) \ DECLARE_ADD_SUB_PAIR(t10, t11, c05, c10, v2106, v3513) \ DECLARE_ADD_SUB_PAIR(t12, t13, c03, c12, v1380, v3857) \ DECLARE_ADD_SUB_PAIR(t14, t15, c01, c14, v601, v4052) \ \ SCALE_ROUND_4(t00, t01, t02, t03, v2048, v12) \ SCALE_ROUND_4(t04, t05, t06, t07, v2048, v12) \ SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \ SCALE_ROUND_4(t12, t13, t14, t15, v2048, v12) \ \ DECLARE_ADD_SUB_PAIR(t00a, t08a, t00, t08,,) \ 
DECLARE_ADD_SUB_PAIR(t01a, t09a, t01, t09,,) \ DECLARE_ADD_SUB_PAIR(t02a, t10a, t02, t10,,) \ DECLARE_ADD_SUB_PAIR(t03a, t11a, t03, t11,,) \ DECLARE_ADD_SUB_PAIR(t04a, t12a, t04, t12,,) \ DECLARE_ADD_SUB_PAIR(t05a, t13a, t05, t13,,) \ DECLARE_ADD_SUB_PAIR(t06a, t14a, t06, t14,,) \ DECLARE_ADD_SUB_PAIR(t07a, t15a, t07, t15,,) \ \ CLIP16_I32_8(t00a, t08a, t01a, t09a, t02a, t10a, t03a, t11a, \ c00c01, c02c03, c04c05, c06c07); \ CLIP16_I32_8(t04a, t12a, t05a, t13a, t06a, t14a, t07a, t15a, \ c00c01, c02c03, c04c05, c06c07); \ \ DECLARE_SPLAT_I32(4017) \ DECLARE_SPLAT_I32(799) \ DECLARE_SPLAT_I32(2276) \ DECLARE_SPLAT_I32(3406) \ \ DECLARE_MUL_PAIR_I32(t08a, t09a, v4017, v799); \ DECLARE_MUL_PAIR_I32(t10a, t11a, v2276, v3406); \ DECLARE_MUL_PAIR_I32(t13a, t12a, v799, v4017); \ DECLARE_MUL_PAIR_I32(t15a, t14a, v3406, v2276); \ \ ADD_SUB_PAIR(t08, t09, t08a, t09a, v4017, v799); \ ADD_SUB_PAIR(t10, t11, t10a, t11a, v2276, v3406); \ ADD_SUB_PAIR(t13, t12, t13a, t12a, v799, v4017); \ ADD_SUB_PAIR(t15, t14, t15a, t14a, v3406, v2276); \ \ SCALE_ROUND_4(t08, t09, t10, t11, v2048, v12) \ SCALE_ROUND_4(t13, t12, t15, t14, v2048, v12) \ \ ADD_SUB_PAIR(t00, t04, t00a, t04a,,); \ ADD_SUB_PAIR(t01, t05, t01a, t05a,,); \ ADD_SUB_PAIR(t02, t06, t02a, t06a,,); \ ADD_SUB_PAIR(t03, t07, t03a, t07a,,); \ ADD_SUB_PAIR(t08a, t12a, t08, t12,,); \ ADD_SUB_PAIR(t09a, t13a, t09, t13,,); \ ADD_SUB_PAIR(t10a, t14a, t10, t14,,); \ ADD_SUB_PAIR(t11a, t15a, t11, t15,,); \ \ CLIP16_I32_8(t00, t04, t01, t05, t02, t06, t03, t07, \ c00c01, c02c03, c04c05, c06c07) \ CLIP16_I32_8(t08a, t12a, t09a, t13a, t10a, t14a, t11a, t15a, \ c00c01, c02c03, c04c05, c06c07) \ \ DECLARE_SPLAT_I32(3784) \ DECLARE_SPLAT_I32(1567) \ \ DECLARE_MUL_PAIR_I32(t04, t05, v3784, v1567) \ DECLARE_MUL_PAIR_I32(t07, t06, v1567, v3784) \ DECLARE_MUL_PAIR_I32(t12a, t13a, v3784, v1567) \ DECLARE_MUL_PAIR_I32(t15a, t14a, v1567, v3784) \ \ ADD_SUB_PAIR(t04a, t05a, t04, t05, v3784, v1567) \ ADD_SUB_PAIR(t07a, t06a, t07, t06, v1567, v3784) \ 
ADD_SUB_PAIR(t12, t13, t12a, t13a, v3784, v1567) \ ADD_SUB_PAIR(t15, t14, t15a, t14a, v1567, v3784) \ \ SCALE_ROUND_4(t04a, t05a, t07a, t06a, v2048, v12) \ SCALE_ROUND_4(t12, t13, t15, t14, v2048, v12) \ \ ADD_SUB_PAIR(o00, t02a, t00, t02,,) \ ADD_SUB_PAIR(o15, t03a, t01, t03,,) \ ADD_SUB_PAIR(o03, t06, t04a, t06a,,) \ ADD_SUB_PAIR(o12, t07, t05a, t07a,,) \ ADD_SUB_PAIR(o01, t10, t08a, t10a,,) \ ADD_SUB_PAIR(o14, t11, t09a, t11a,,) \ ADD_SUB_PAIR(o02, t14a, t12, t14,,) \ ADD_SUB_PAIR(o13, t15a, t13, t15,,) \ \ CLIP16_I32_8(o00, t02a, o15, t03a, o03, t06, o12, t07, \ c00c01, c02c03, c04c05, c06c07) \ CLIP16_I32_8(o01, t10, o14, t11, o02, t14a, o13, t15a, \ c00c01, c02c03, c04c05, c06c07) \ \ DECLARE_SPLAT_I32(181) \ DECLARE_SPLAT_I32(128) \ u32x4 v8 = vec_splat_u32(8); \ \ ADD_SUB_PAIR(o07, o08, t02a, t03a,,) \ ADD_SUB_PAIR(o04, o11, t06, t07,,) \ ADD_SUB_PAIR(o06, o09, t10, t11,,) \ ADD_SUB_PAIR(o05, o10, t14a, t15a,,) \ \ MUL_4_INPLACE(o07, o08, o04, o11, v181) \ MUL_4_INPLACE(o06, o09, o05, o10, v181) \ \ SCALE_ROUND_4(o07, o08, o04, o11, v128, v8) \ SCALE_ROUND_4(o06, o09, o05, o10, v128, v8) \ \ o01 = -o01; \ o03 = -o03; \ o05 = -o05; \ o07 = -o07; \ o09 = -o09; \ o11 = -o11; \ o13 = -o13; \ o15 = -o15; \ #define adst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07) \ } #define adst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07) 
\ PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ c00, c02, c04, c06, c08, c10, c12, c14, \ c01, c03, c05, c07, c09, c11, c13, c15) \ } #define flipadst_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \ c00c01, c02c03, c04c05, c06c07) \ } #define flipadst_16_out(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ { \ ADST_INNER_16(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, \ c15, c14, c13, c12, c11, c10, c09, c08, c07, c06, c05, c04, c03, c02, c01, c00, \ c00c01, c02c03, c04c05, c06c07) \ PACK_8(c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15, \ c00, c02, c04, c06, c08, c10, c12, c14, \ c01, c03, c05, c07, c09, c11, c13, c15) \ } void dav2d_inv_txfm_add_dct_dct_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob HIGHBD_DECL_SUFFIX) { if (eob < 1) { return dc_only_4xN(dst, stride, coeff, 4, 0, 1); } LOAD_COEFF_4x16(coeff) dct_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) memset(coeff, 0, sizeof(*coeff) * 4 * 16); SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) dct_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, a0b0, c0d0, 
a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); STORE_4(dst, stride, l00, l01, l02, l03); STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); } #define inv_txfm_fn4x16(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_4x16_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_COEFF_4x16(coeff) \ type1##_4x4_in(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ memset(coeff, 0, sizeof(*coeff) * 4 * 16); \ SCALE_ROUND_4(cA0, cB0, cC0, cD0, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(cA1, cB1, cC1, cD1, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(cA2, cB2, cC2, cD2, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(cA3, cB3, cC3, cD3, vec_splat_s32(1), vec_splat_u32(1)) \ TRANSPOSE4x16_I32(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3) \ type2##_16_out(cA0, cA1, cA2, cA3, cB0, cB1, cB2, cB3, \ cC0, cC1, cC2, cC3, cD0, cD1, cD2, cD3, \ a0b0, c0d0, a1b1, c1d1, a2b2, c2d2, a3b3, c3d3) \ LOAD_DECLARE_4(dst, stride, l00, l01, l02, l03) \ LOAD_DECLARE_4(dst + 4 * stride, stride, l04, l05, l06, l07) \ LOAD_DECLARE_4(dst + 8 * stride, stride, l08, l09, l10, l11) \ LOAD_DECLARE_4(dst + 12 * stride, stride, l12, l13, l14, l15) \ APPLY_COEFF_4(l00, l01, l02, l03, a0b0, c0d0); \ APPLY_COEFF_4(l04, l05, l06, l07, a1b1, c1d1); \ APPLY_COEFF_4(l08, l09, l10, l11, a2b2, c2d2); \ 
APPLY_COEFF_4(l12, l13, l14, l15, a3b3, c3d3); \ STORE_4(dst, stride, l00, l01, l02, l03); \ STORE_4(dst + 4 * stride, stride, l04, l05, l06, l07); \ STORE_4(dst + 8 * stride, stride, l08, l09, l10, l11); \ STORE_4(dst + 12 * stride, stride, l12, l13, l14, l15); \ } inv_txfm_fn4x16(adst, dct ) inv_txfm_fn4x16(dct, adst ) inv_txfm_fn4x16(dct, flipadst) inv_txfm_fn4x16(flipadst, dct ) inv_txfm_fn4x16(adst, flipadst) inv_txfm_fn4x16(flipadst, adst ) inv_txfm_fn4x16(identity, dct ) inv_txfm_fn4x16(dct, identity) inv_txfm_fn4x16(identity, flipadst) inv_txfm_fn4x16(flipadst, identity) inv_txfm_fn4x16(identity, adst ) inv_txfm_fn4x16(adst, identity) inv_txfm_fn4x16(identity, identity) inv_txfm_fn4x16(adst, adst ) inv_txfm_fn4x16(flipadst, flipadst) void dav2d_inv_txfm_add_dct_dct_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, int16_t *const coeff, const int eob) { if (eob < 1) { return dc_only_16xN(dst, stride, coeff, 1, 0, 1); } LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \ LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) dct_16_in(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) memset(coeff, 0, sizeof(*coeff) * 16 * 4); SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) TRANSPOSE4_I32(c00, c01, c02, c03); TRANSPOSE4_I32(c04, c05, c06, c07); TRANSPOSE4_I32(c08, c09, c10, c11); TRANSPOSE4_I32(c12, c13, c14, c15); 
dct_4x4_out(c00, c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14, c15, c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) APPLY_COEFF_16x4(l0, l1, l2, l3, c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) STORE_16(dst, stride, l0, l1, l2, l3) } #define inv_txfm_fn16x4(type1, type2) \ void dav2d_inv_txfm_add_##type1##_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \ LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \ UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \ UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \ UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \ type1##_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ memset(coeff, 0, sizeof(*coeff) * 16 * 4); \ SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \ TRANSPOSE4_I32(c00, c01, c02, c03); \ TRANSPOSE4_I32(c04, c05, c06, c07); \ TRANSPOSE4_I32(c08, c09, c10, c11); \ TRANSPOSE4_I32(c12, c13, c14, c15); \ type2##_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15); \ LOAD_DECLARE_4(dst, stride, l0, l1, l2, l3) \ APPLY_COEFF_16x4(l0, l1, l2, l3, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ STORE_16(dst, stride, l0, l1, l2, l3) \ } inv_txfm_fn16x4(adst, dct ) 
inv_txfm_fn16x4(dct, adst ) inv_txfm_fn16x4(dct, flipadst) inv_txfm_fn16x4(flipadst, dct ) inv_txfm_fn16x4(adst, flipadst) inv_txfm_fn16x4(flipadst, adst ) inv_txfm_fn16x4(dct, identity) inv_txfm_fn16x4(flipadst, identity) inv_txfm_fn16x4(adst, identity) inv_txfm_fn16x4(identity, identity) inv_txfm_fn16x4(adst, adst ) inv_txfm_fn16x4(flipadst, flipadst) #define inv_txfm_fn16x4_identity(type2) \ void dav2d_inv_txfm_add_identity_##type2##_16x4_8bpc_pwr9(uint8_t *dst, const ptrdiff_t stride, \ int16_t *const coeff, const int eob) \ { \ LOAD_DECLARE_2_I16(coeff, c00c01, c02c03) \ LOAD_DECLARE_2_I16(coeff+16, c04c05, c06c07) \ LOAD_DECLARE_2_I16(coeff+32, c08c09, c10c11) \ LOAD_DECLARE_2_I16(coeff+48, c12c13, c14c15) \ UNPACK_DECLARE_4_I16_I32(c00c01, c02c03, c00, c01, c02, c03) \ UNPACK_DECLARE_4_I16_I32(c04c05, c06c07, c04, c05, c06, c07) \ UNPACK_DECLARE_4_I16_I32(c08c09, c10c11, c08, c09, c10, c11) \ UNPACK_DECLARE_4_I16_I32(c12c13, c14c15, c12, c13, c14, c15) \ identity_16_in(c00, c01, c02, c03, c04, c05, c06, c07, \ c08, c09, c10, c11, c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, c08c09, c10c11, c12c13, c14c15) \ memset(coeff, 0, sizeof(*coeff) * 16 * 4); \ SCALE_ROUND_4(c00, c01, c02, c03, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c04, c05, c06, c07, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c08, c09, c10, c11, vec_splat_s32(1), vec_splat_u32(1)) \ SCALE_ROUND_4(c12, c13, c14, c15, vec_splat_s32(1), vec_splat_u32(1)) \ CLIP16_I32_8(c00, c01, c02, c03, c04, c05, c06, c07, c00c01, c02c03, c04c05, c06c07) \ CLIP16_I32_8(c08, c09, c10, c11, c12, c13, c14, c15, c08c09, c10c11, c12c13, c14c15) \ TRANSPOSE4_I32(c00, c01, c02, c03); \ TRANSPOSE4_I32(c04, c05, c06, c07); \ TRANSPOSE4_I32(c08, c09, c10, c11); \ TRANSPOSE4_I32(c12, c13, c14, c15); \ type2##_4x4_out(c00, c01, c02, c03, \ c04, c05, c06, c07, \ c08, c09, c10, c11, \ c12, c13, c14, c15, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15); \ LOAD_DECLARE_4(dst, 
stride, l0, l1, l2, l3) \ APPLY_COEFF_16x4(l0, l1, l2, l3, \ c00c01, c02c03, c04c05, c06c07, \ c08c09, c10c11, c12c13, c14c15) \ STORE_16(dst, stride, l0, l1, l2, l3) \ } inv_txfm_fn16x4_identity(dct) inv_txfm_fn16x4_identity(adst) inv_txfm_fn16x4_identity(flipadst) #endif // BITDEPTH dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/loopfilter_tmpl.c000066400000000000000000001540721517466257200250050ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * Copyright © 2024, Luca Barbato * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #undef NDEBUG #include #include #include "common/attributes.h" #include "common/intops.h" #include "src/ppc/dav2d_types.h" #include "src/ppc/loopfilter.h" #if BITDEPTH == 8 #define LOAD4_H(idx) \ u8x16 idx##0 = vec_xl(0, dst); /* p1_0 p0_0 q0_0 q1_0 */ \ dst += stridea; \ u8x16 idx##1 = vec_xl(0, dst); /* p1_1 p0_1 q0_1 q1_1 */ \ dst += stridea; \ u8x16 idx##2 = vec_xl(0, dst); /* p1_2 p0_2 q0_2 q1_2 */ \ dst += stridea; \ u8x16 idx##3 = vec_xl(0, dst); /* p1_3 p0_3 q0_3 q1_3 */ \ // return idx##_01 and idx##_23 #define LOAD4_H_SINGLE(idx) \ LOAD4_H(idx) \ \ u8x16 idx##_01 = vec_mergeh(idx##0, idx##1); /* p1_0 p1_1 p0_0 p0_1 q0_0 q0_1 q1_0 q1_1 */ \ u8x16 idx##_23 = vec_mergeh(idx##2, idx##3); /* p1_2 p1_3 p0_2 p0_3 q0_2 q0_3 q1_2 q1_3 */ #define DECLARE_ADD_16HL(r, a, b) \ u16x8 r##h = vec_add(a##h, b##h); \ u16x8 r##l = vec_add(a##l, b##l); #define ADD_16HL(r, a, b) \ r##h = vec_add(a##h, b##h); \ r##l = vec_add(a##l, b##l); #define ADD_AND_SHIFT4(v) \ v##h = vec_sr(vec_add(v##h, v4u16), v3u16); \ v##l = vec_sr(vec_add(v##l, v4u16), v3u16); #define ADD_AND_SHIFT8(v) \ v##h = vec_sr(vec_add(v##h, v8u16), v4u16); \ v##l = vec_sr(vec_add(v##l, v8u16), v4u16); #define PACK_AND_SEL(v, m) \ vec_sel(v, vec_pack(o##v##h, o##v##l), m) #define UNPACK_16(v) \ u16x8 v##h = u8h_to_u16(v); \ u16x8 v##l = u8l_to_u16(v); #define APPLY_4 \ b8x16 hev = vec_cmpgt(max_a_p1p0_q1q0, H); \ \ i8x16 ps1 = (i8x16)vec_xor(p1, s); \ i8x16 ps0 = (i8x16)vec_xor(p0, s); \ i8x16 qs0 = (i8x16)vec_xor(q0, s); \ i8x16 qs1 = (i8x16)vec_xor(q1, s); \ i8x16 f0 = vec_and(vec_subs(ps1, qs1), hev); \ i16x8 q0sh = (i16x8)q0h; \ i16x8 q0sl = (i16x8)q0l; \ i16x8 p0sh = (i16x8)p0h; \ i16x8 p0sl = (i16x8)p0l; \ i16x8 f0h = i8h_to_i16(f0); \ i16x8 f0l = i8l_to_i16(f0); \ i16x8 d0h = vec_sub(q0sh, p0sh); \ i16x8 d0l = vec_sub(q0sl, p0sl); \ u8x16 v3u8 = vec_splat_u8(3); \ i16x8 d0h_2 = vec_add(d0h, d0h); \ i16x8 d0l_2 = vec_add(d0l, d0l); \ u8x16 v4u8 = vec_splat_u8(4); \ i16x8 f0_d0h = 
vec_add(d0h, f0h); \ i16x8 f0_d0l = vec_add(d0l, f0l); \ i16x8 fh = vec_add(d0h_2, f0_d0h); \ i16x8 fl = vec_add(d0l_2, f0_d0l); \ i8x16 f = vec_packs(fh, fl); \ i8x16 f1 = vec_adds(f, (i8x16)v4u8); \ i8x16 f2 = vec_adds(f, (i8x16)v3u8); \ f1 = vec_sra(f1, v3u8); \ f2 = vec_sra(f2, v3u8); \ f1 = vec_and(f1, fm); \ f2 = vec_and(f2, fm); \ i8x16 f3 = vec_adds(f1, (i8x16)v1u8); \ b8x16 m3 = vec_and(~hev, (b8x16)fm); \ f3 = vec_sra(f3, v1u8); \ f3 = vec_and(f3, m3); \ i8x16 op0s = vec_adds(ps0, f2); \ i8x16 oq0s = vec_subs(qs0, f1); \ i8x16 oq1s = vec_subs(qs1, f3); \ i8x16 op1s = vec_adds(ps1, f3); \ p0 = (u8x16)vec_xor(op0s, s); \ q0 = (u8x16)vec_xor(oq0s, s); \ q1 = (u8x16)vec_xor(oq1s, s); \ p1 = (u8x16)vec_xor(op1s, s); #define APPLY_8 \ DECLARE_ADD_16HL(p1p0, p1, p0) \ DECLARE_ADD_16HL(p2q0, p2, q0) \ DECLARE_ADD_16HL(q1q2, q1, q2) \ DECLARE_ADD_16HL(p3p3, p3, p3) \ DECLARE_ADD_16HL(q0q3, q0, q3) \ DECLARE_ADD_16HL(p3p2, p3, p2) \ DECLARE_ADD_16HL(p1q1, p1, q1) \ DECLARE_ADD_16HL(p3p0, p3, p0) \ DECLARE_ADD_16HL(p0q2, p0, q2) \ DECLARE_ADD_16HL(q1q3, q1, q3) \ DECLARE_ADD_16HL(q3q3, q3, q3) \ DECLARE_ADD_16HL(q0q1q2q3, q0q3, q1q2) \ DECLARE_ADD_16HL(p2p1p0q0, p1p0, p2q0) \ DECLARE_ADD_16HL(p3p3p3p2, p3p3, p3p2) \ DECLARE_ADD_16HL(p3p3p1q1, p3p3, p1q1) \ DECLARE_ADD_16HL(p3p0q1q2, p3p0, q1q2) \ DECLARE_ADD_16HL(p1p0q1q3, p1p0, q1q3) \ DECLARE_ADD_16HL(p0q2q3q3, p0q2, q3q3) \ \ DECLARE_ADD_16HL(op2, p3p3p3p2, p2p1p0q0) \ DECLARE_ADD_16HL(op1, p3p3p1q1, p2p1p0q0) \ DECLARE_ADD_16HL(op0, p3p0q1q2, p2p1p0q0) \ DECLARE_ADD_16HL(oq0, p2p1p0q0, q0q1q2q3) \ DECLARE_ADD_16HL(oq1, p1p0q1q3, q0q1q2q3) \ DECLARE_ADD_16HL(oq2, p0q2q3q3, q0q1q2q3) \ \ ADD_AND_SHIFT4(op2) \ ADD_AND_SHIFT4(op1) \ ADD_AND_SHIFT4(op0) \ ADD_AND_SHIFT4(oq0) \ ADD_AND_SHIFT4(oq1) \ ADD_AND_SHIFT4(oq2) \ \ p2 = PACK_AND_SEL(p2, apply_8); \ p1 = PACK_AND_SEL(p1, apply_8); \ p0 = PACK_AND_SEL(p0, apply_8); \ q0 = PACK_AND_SEL(q0, apply_8); \ q1 = PACK_AND_SEL(q1, apply_8); \ q2 = PACK_AND_SEL(q2, 
apply_8); #define APPLY_16 \ DECLARE_ADD_16HL(p6p6, p6, p6) \ DECLARE_ADD_16HL(p6p5, p6, p5) \ DECLARE_ADD_16HL(p6p4, p6, p4) \ DECLARE_ADD_16HL(p4p3, p4, p3) \ DECLARE_ADD_16HL(p2p1, p2, p1) \ DECLARE_ADD_16HL(p2q2, p2, q2) \ DECLARE_ADD_16HL(p3q1, p3, q1) \ DECLARE_ADD_16HL(p0q0, p0, q0) \ DECLARE_ADD_16HL(p0q1, p0, q1) \ DECLARE_ADD_16HL(p1q3, p1, q3) \ DECLARE_ADD_16HL(p1q0, p1, q0) \ DECLARE_ADD_16HL(p1q5, p1, q5) \ DECLARE_ADD_16HL(q3q4, q3, q4) \ DECLARE_ADD_16HL(q2q5, q2, q5) \ DECLARE_ADD_16HL(q1q6, q1, q6) \ DECLARE_ADD_16HL(q0q1, q0, q1) \ DECLARE_ADD_16HL(q6q6, q6, q6) \ DECLARE_ADD_16HL(q2q6, q2, q6) \ DECLARE_ADD_16HL(q3q6, q3, q6) \ DECLARE_ADD_16HL(q4q6, q4, q6) \ DECLARE_ADD_16HL(p5q0, p5, q0) \ \ DECLARE_ADD_16HL(p6q2, p6, q2) \ DECLARE_ADD_16HL(p6p6p6p4, p6p6, p6p4) \ DECLARE_ADD_16HL(p6p5p2p1, p6p5, p2p1) \ DECLARE_ADD_16HL(p4p3p0q0, p4p3, p0q0) \ DECLARE_ADD_16HL(p2q2p3q1, p2q2, p3q1) \ DECLARE_ADD_16HL(p6p5p6p6, p6p5, p6p6) \ DECLARE_ADD_16HL(p6p5p3q1, p6p5, p3q1) \ DECLARE_ADD_16HL(p6p6p1q3, p6p6, p1q3) \ DECLARE_ADD_16HL(q2q5q3q4, q2q5, q3q4) \ DECLARE_ADD_16HL(p2p1q1q6, p2p1, q1q6) \ DECLARE_ADD_16HL(p0q0q3q6, p0q0, q3q6) \ DECLARE_ADD_16HL(q4q6q6q6, q4q6, q6q6) \ u16x8 q5q6q6q6h = vec_madd(v3u16, q6h, q5h); \ u16x8 q5q6q6q6l = vec_madd(v3u16, q6l, q5l); \ DECLARE_ADD_16HL(p0q0q1q6, p0q0, q1q6) \ DECLARE_ADD_16HL(p0q1q3q4, p0q1, q3q4) \ \ DECLARE_ADD_16HL(p6q2p2p1, p6q2, p2p1) \ DECLARE_ADD_16HL(p1q0q2q5, p1q0, q2q5) \ DECLARE_ADD_16HL(p0q1p5q0, p0q1, p5q0) \ DECLARE_ADD_16HL(q0q1q2q6, q0q1, q2q6) \ DECLARE_ADD_16HL(p3q1q2q6, p3q1, q2q6) \ DECLARE_ADD_16HL(q2q6q4q6, q2q6, q4q6) \ DECLARE_ADD_16HL(q3q6p1q5, q3q6, p1q5) \ \ DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6, p4p3p0q0, p2p1q1q6) \ DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0, p6p5p2p1, p4p3p0q0) \ DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4, p2p1q1q6, q2q5q3q4) \ DECLARE_ADD_16HL(q2q5q3q4q4q6q6q6, q2q5q3q4, q4q6q6q6) \ DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p2q2p3q1, p6p5p2p1p4p3p0q0, p2q2p3q1) \ 
DECLARE_ADD_16HL(p6p6p6p4p6p5p2p1p4p3p0q0, p6p6p6p4, p6p5p2p1p4p3p0q0) \ DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6q2q5q3q4, p4p3p0q0p2p1q1q6, q2q5q3q4) \ DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4p0q0q3q6, p2p1q1q6q2q5q3q4, p0q0q3q6) \ DECLARE_ADD_16HL(p0q0q1q6q2q5q3q4q4q6q6q6, p0q0q1q6, q2q5q3q4q4q6q6q6) \ DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p0q1q3q4, p6p5p2p1p4p3p0q0, p0q1q3q4) \ \ DECLARE_ADD_16HL(op5, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p6p6) \ DECLARE_ADD_16HL(op4, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p3q1) \ DECLARE_ADD_16HL(op3, p6p6p6p4, p6p5p2p1p4p3p0q0p2q2p3q1) \ DECLARE_ADD_16HL(op2, p6p6p1q3, p6p5p2p1p4p3p0q0p2q2p3q1) \ DECLARE_ADD_16HL(op1, p6p5p2p1p4p3p0q0p0q1q3q4, p6q2p2p1) \ DECLARE_ADD_16HL(op0, p6p5p2p1p4p3p0q0p0q1q3q4, p1q0q2q5) \ DECLARE_ADD_16HL(oq0, p4p3p0q0p2p1q1q6q2q5q3q4, p0q1p5q0) \ DECLARE_ADD_16HL(oq1, p4p3p0q0p2p1q1q6q2q5q3q4, q0q1q2q6) \ DECLARE_ADD_16HL(oq2, p2p1q1q6q2q5q3q4p0q0q3q6, p3q1q2q6) \ DECLARE_ADD_16HL(oq3, p2p1q1q6q2q5q3q4p0q0q3q6, q2q6q4q6) \ DECLARE_ADD_16HL(oq4, p0q0q1q6q2q5q3q4q4q6q6q6, q3q6p1q5) \ DECLARE_ADD_16HL(oq5, p0q0q1q6q2q5q3q4q4q6q6q6, q5q6q6q6) \ \ ADD_AND_SHIFT8(op5) \ ADD_AND_SHIFT8(op4) \ ADD_AND_SHIFT8(op3) \ ADD_AND_SHIFT8(op2) \ ADD_AND_SHIFT8(op1) \ ADD_AND_SHIFT8(op0) \ ADD_AND_SHIFT8(oq0) \ ADD_AND_SHIFT8(oq1) \ ADD_AND_SHIFT8(oq2) \ ADD_AND_SHIFT8(oq3) \ ADD_AND_SHIFT8(oq4) \ ADD_AND_SHIFT8(oq5) \ \ p5 = PACK_AND_SEL(p5, apply_16); \ p4 = PACK_AND_SEL(p4, apply_16); \ p3 = PACK_AND_SEL(p3, apply_16); \ p2 = PACK_AND_SEL(p2, apply_16); \ p1 = PACK_AND_SEL(p1, apply_16); \ p0 = PACK_AND_SEL(p0, apply_16); \ q0 = PACK_AND_SEL(q0, apply_16); \ q1 = PACK_AND_SEL(q1, apply_16); \ q2 = PACK_AND_SEL(q2, apply_16); \ q3 = PACK_AND_SEL(q3, apply_16); \ q4 = PACK_AND_SEL(q4, apply_16); \ q5 = PACK_AND_SEL(q5, apply_16); \ static inline void store_h_4(u8x16 out, uint8_t *dst, int stridea) { u8x16 out1 = (u8x16)vec_splat((u32x4)out, 1); u8x16 out2 = (u8x16)vec_splat((u32x4)out, 2); u8x16 out3 = (u8x16)vec_splat((u32x4)out, 3); vec_xst_len(out, 
dst, 4); dst += stridea; vec_xst_len(out1, dst, 4); dst += stridea; vec_xst_len(out2, dst, 4); dst += stridea; vec_xst_len(out3, dst, 4); } static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea) { u8x16 out1 = (u8x16)vec_mergel((u64x2)outa, (u64x2)outa); u8x16 out3 = (u8x16)vec_mergel((u64x2)outb, (u64x2)outb); vec_xst_len(outa, dst, 6); dst += stridea; vec_xst_len(out1, dst, 6); dst += stridea; vec_xst_len(outb, dst, 6); dst += stridea; vec_xst_len(out3, dst, 6); } // Assume a layout {v}0 {v}1 {v}2 {v}3, produces {v}01 {v}23 #define MERGEH_4(v) \ u8x16 v##01 = vec_mergeh(v##0, v##1); \ u8x16 v##23 = vec_mergeh(v##2, v##3); #define MERGEL_4(v) \ u8x16 v##01 = vec_mergel(v##0, v##1); \ u8x16 v##23 = vec_mergel(v##2, v##3); // produce {v}0123h #define MERGEH_U16_0123(v) \ u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); #define MERGEHL_U16_0123(v) \ u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23); #define MERGE_U16_0123(v) \ u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); \ u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23); // produce {ac,bd}0123h{dir} #define MERGEH_U32_LINE(dir) \ u32x4 ac0123h##dir = vec_mergeh((u32x4)a0123##dir, (u32x4)c0123##dir); \ u32x4 bd0123h##dir = vec_mergeh((u32x4)b0123##dir, (u32x4)d0123##dir); #define MERGEL_U32_LINE(dir) \ u32x4 ac0123l##dir = vec_mergel((u32x4)a0123##dir, (u32x4)c0123##dir); \ u32x4 bd0123l##dir = vec_mergel((u32x4)b0123##dir, (u32x4)d0123##dir); // produce the pair of mergeh/mergel of {ac,bd}01234{dira}{dirb} #define MERGE_U32(oh, ol, dira, dirb) \ oh = (u8x16)vec_mergeh(ac0123##dira##dirb, bd0123##dira##dirb); \ ol = (u8x16)vec_mergel(ac0123##dira##dirb, bd0123##dira##dirb); #define MERGEHL_U8(a, b) \ u8x16 a##b##h = vec_mergeh(a, b); \ u8x16 a##b##l = vec_mergel(a, b); #define MERGEHL_U16(out, a, b) \ u8x16 out##h = (u8x16)vec_mergeh((u16x8)a, (u16x8)b); \ u8x16 out##l = (u8x16)vec_mergel((u16x8)a, (u16x8)b); #define MERGEHL_U32(out, a, b) \ u8x16 out##h = 
(u8x16)vec_mergeh((u32x4)a, (u32x4)b); \ u8x16 out##l = (u8x16)vec_mergel((u32x4)a, (u32x4)b); static inline void loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, const ptrdiff_t stridea, b32x4 apply) { dst -= 2; uint8_t *dst2 = dst; u8x16 p1, p0, q0, q1; LOAD4_H(a) dst += stridea; LOAD4_H(b) dst += stridea; LOAD4_H(c) dst += stridea; LOAD4_H(d) MERGEH_4(a) MERGEH_4(b) MERGEH_4(c) MERGEH_4(d) MERGEH_U16_0123(a) MERGEH_U16_0123(b) MERGEH_U16_0123(c) MERGEH_U16_0123(d) MERGEH_U32_LINE(h) MERGEL_U32_LINE(h) MERGE_U32(p1, p0, h, h) MERGE_U32(q0, q1, l, h) const u8x16 zero = vec_splat_u8(0); const u8x16 v1u8 = vec_splat_u8(1); const b8x16 s = (b8x16)vec_splats((uint8_t)128); const u8x16 a_p1_p0 = vec_absd(p1, p0); const u8x16 a_q1_q0 = vec_absd(q1, q0); const u8x16 a_p0_q0 = vec_absd(p0, q0); const u8x16 a_p1_q1 = vec_absd(p1, q1); u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); const u8x16 cmp_I = max_a_p1p0_q1q0; cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); const b8x16 ltI = vec_cmple(cmp_I, I); const b8x16 ltE = vec_cmple(cmp_E, E); b8x16 fm = vec_and(ltI, ltE); fm = vec_and(fm, (b8x16)apply); if (vec_all_eq(fm, zero)) return; UNPACK_16(p0) UNPACK_16(q0) APPLY_4 u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ... u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ... u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ... u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ... u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ... 
u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab); u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd); u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd); if (apply[0]) { store_h_4(outa, dst2, stridea); } dst2 += 4 * stridea; if (apply[1]) { store_h_4(outb, dst2, stridea); } dst2 += 4 * stridea; if (apply[2]) { store_h_4(outc, dst2, stridea); } dst2 += 4 * stridea; if (apply[3]) { store_h_4(outd, dst2, stridea); } } static inline void loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, const ptrdiff_t stridea, b32x4 apply, b32x4 m6) { uint8_t *dst2 = dst - 2; dst -= 3; u8x16 p2, p1, p0, q0, q1, q2; LOAD4_H(a) dst += stridea; LOAD4_H(b) dst += stridea; LOAD4_H(c) dst += stridea; LOAD4_H(d) MERGEH_4(a) MERGEH_4(b) MERGEH_4(c) MERGEH_4(d) MERGE_U16_0123(a) MERGE_U16_0123(b) MERGE_U16_0123(c) MERGE_U16_0123(d) MERGEH_U32_LINE(h) MERGEL_U32_LINE(h) MERGEH_U32_LINE(l) MERGE_U32(p2, p1, h, h) MERGE_U32(p0, q0, l, h) MERGE_U32(q1, q2, h, l) const u8x16 F = vec_splat_u8(1); const u8x16 zero = vec_splat_u8(0); const u16x8 v3u16 = vec_splat_u16(3); const u16x8 v4u16 = vec_splat_u16(4); const u8x16 v1u8 = vec_splat_u8(1); const b8x16 s = (b8x16)vec_splats((uint8_t)128); const u8x16 a_p1_p0 = vec_absd(p1, p0); const u8x16 a_q1_q0 = vec_absd(q1, q0); const u8x16 a_p0_q0 = vec_absd(p0, q0); const u8x16 a_p1_q1 = vec_absd(p1, q1); const u8x16 a_p2_p1 = vec_absd(p2, p1); const u8x16 a_q2_q1 = vec_absd(q2, q1); const u8x16 a_p2_p0 = vec_absd(p2, p0); const u8x16 a_q2_q0 = vec_absd(q2, q0); u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); u8x16 cmp_I_m6 = max_a_p2p1_q2q1; u8x16 cmp_I_m4 = max_a_p1p0_q1q0; cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6); u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6); const b8x16 ltE = vec_cmple(cmp_E, E); const b8x16 ltI = 
vec_cmple(cmp_I, I); b8x16 fm = vec_and(ltI, ltE); fm = vec_and(fm, (b8x16)apply); if (vec_all_eq(fm, zero)) return; UNPACK_16(p2) UNPACK_16(p1) UNPACK_16(p0) UNPACK_16(q0) UNPACK_16(q1) UNPACK_16(q2) m6 = vec_and(m6, (b32x4)fm); u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6); b8x16 apply_4 = vec_andc(fm, apply_6); if (vec_any_ne(apply_4, zero)) { APPLY_4 } if (vec_any_ne(apply_6, zero)) { DECLARE_ADD_16HL(p2p2, p2, p2) DECLARE_ADD_16HL(p2p1, p2, p1) DECLARE_ADD_16HL(p1p0, p1, p0) DECLARE_ADD_16HL(p0q0, p0, q0) DECLARE_ADD_16HL(q0q1, q0, q1) DECLARE_ADD_16HL(q1q2, q1, q2) DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0) DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0) DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2) DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1) u16x8 q1q2q2q2h = q2h * 3 + q1h; u16x8 q1q2q2q2l = q2l * 3 + q1l; DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0) DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1) DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1) DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2) ADD_AND_SHIFT4(op1) ADD_AND_SHIFT4(op0) ADD_AND_SHIFT4(oq0) ADD_AND_SHIFT4(oq1) p1 = PACK_AND_SEL(p1, apply_6); p0 = PACK_AND_SEL(p0, apply_6); q0 = PACK_AND_SEL(q0, apply_6); q1 = PACK_AND_SEL(q1, apply_6); } u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ... u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ... u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ... u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ... u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ... 
u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab); u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd); u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd); if (apply[0]) { store_h_4(outa, dst2, stridea); } dst2 += 4 * stridea; if (apply[1]) { store_h_4(outb, dst2, stridea); } dst2 += 4 * stridea; if (apply[2]) { store_h_4(outc, dst2, stridea); } dst2 += 4 * stridea; if (apply[3]) { store_h_4(outd, dst2, stridea); } } static inline void loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, const ptrdiff_t stridea, b32x4 apply, b32x4 m8) { uint8_t *dst2 = dst - 3; dst -= 4; u8x16 p3, p2, p1, p0, q0, q1, q2, q3; LOAD4_H(a) dst += stridea; LOAD4_H(b) dst += stridea; LOAD4_H(c) dst += stridea; LOAD4_H(d) MERGEH_4(a) MERGEH_4(b) MERGEH_4(c) MERGEH_4(d) MERGE_U16_0123(a) MERGE_U16_0123(b) MERGE_U16_0123(c) MERGE_U16_0123(d) MERGEH_U32_LINE(h) MERGEL_U32_LINE(h) MERGEH_U32_LINE(l) MERGEL_U32_LINE(l) MERGE_U32(p3, p2, h, h) MERGE_U32(p1, p0, l, h) MERGE_U32(q0, q1, h, l) MERGE_U32(q2, q3, l, l) const u8x16 F = vec_splat_u8(1); const u8x16 zero = vec_splat_u8(0); const u16x8 v3u16 = vec_splat_u16(3); const u16x8 v4u16 = vec_splat_u16(4); const u8x16 v1u8 = vec_splat_u8(1); const b8x16 s = (b8x16)vec_splats((uint8_t)128); const u8x16 a_p1_p0 = vec_absd(p1, p0); const u8x16 a_q1_q0 = vec_absd(q1, q0); const u8x16 a_p0_q0 = vec_absd(p0, q0); const u8x16 a_p1_q1 = vec_absd(p1, q1); const u8x16 a_p2_p1 = vec_absd(p2, p1); const u8x16 a_q2_q1 = vec_absd(q2, q1); const u8x16 a_p2_p0 = vec_absd(p2, p0); const u8x16 a_q2_q0 = vec_absd(q2, q0); const u8x16 a_p3_p0 = vec_absd(p3, p0); const u8x16 a_q3_q0 = vec_absd(q3, q0); const u8x16 a_p3_p2 = vec_absd(p3, p2); const u8x16 a_q3_q2 = vec_absd(q3, q2); u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, 
a_q2_q0); u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); u8x16 cmp_I_m4 = max_a_p1p0_q1q0; cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8); u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8); const b8x16 ltE = vec_cmple(cmp_E, E); const b8x16 ltI = vec_cmple(cmp_I, I); b8x16 fm = vec_and(ltI, ltE); fm = vec_and(fm, (b8x16)apply); if (vec_all_eq(fm, zero)) return; #define UNPACK_16(v) \ u16x8 v##h = u8h_to_u16(v); \ u16x8 v##l = u8l_to_u16(v); UNPACK_16(p3) UNPACK_16(p2) UNPACK_16(p1) UNPACK_16(p0) UNPACK_16(q0) UNPACK_16(q1) UNPACK_16(q2) UNPACK_16(q3) m8 = vec_and(m8, (b32x4)fm); u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8); b8x16 apply_4 = vec_andc(fm, apply_8); if (vec_any_ne(apply_4, zero)) { APPLY_4 } if (vec_any_ne(apply_8, zero)) { APPLY_8 } MERGEHL_U8(p2, p1) // A0 A1 A2 A3 B0 B1 B2 B3 MERGEHL_U8(p0, q0) MERGEHL_U8(q1, q2) MERGEHL_U16(ab_p2p1p0q0, p2p1h, p0q0h) // A0 p2 p1 p0 q0 | A1 p2 p1 p0 q0 | A2 ... // B0 ... MERGEHL_U16(cd_p2p1p0q0, p2p1l, p0q0l) // C0 ... // D0 ... MERGEHL_U16(ab_q1q2, q1q2h, q1q2h) // A0 q1 q2 q1 q2 | A1 q1 q2 ... // B0 ... MERGEHL_U16(cd_q1q2, q1q2l, q1q2l) // C0 ... // D0 ... MERGEHL_U32(a, ab_p2p1p0q0h, ab_q1q2h) // A0 p2 p1 p0 q0 q1 q2 q1 q2 | A1 .. // A2 ... | A3 ... MERGEHL_U32(b, ab_p2p1p0q0l, ab_q1q2l) // B0 ... // C2 ... MERGEHL_U32(c, cd_p2p1p0q0h, cd_q1q2h) // C0 ... // C2 MERGEHL_U32(d, cd_p2p1p0q0l, cd_q1q2l) // D0 .. // D2 .. 
if (apply[0]) {
        store_h_8(ah, al, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[1]) {
        store_h_8(bh, bl, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[2]) {
        store_h_8(ch, cl, dst2, stridea);
    }
    dst2 += 4 * stridea;
    if (apply[3]) {
        store_h_8(dh, dl, dst2, stridea);
    }
}

/*
 * Wide ("16") loop filter for the horizontal filtering direction.
 *
 * Pixels are loaded with LOAD4_H and rearranged with the MERGE* macros into
 * the vectors p6..p0 / q0..q6, the filter masks are derived, and the result
 * is rearranged back and written as 12 bytes per row to 16 consecutive rows
 * starting at dst - 6.
 *
 * dst      pointer to the first pixel on the q side of the edge
 * E, I     per-lane limits; a lane is filtered only when cmp_E <= E and
 *          cmp_I <= I
 * H        not referenced directly here; presumably consumed by APPLY_4
 *          (macro defined outside this excerpt - TODO confirm)
 * stridea  distance in bytes between consecutive rows
 * apply    per-32-bit-lane enable mask
 * m8, m16  per-32-bit-lane masks selecting the 8-wide and 16-wide variants
 */
static inline void loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                                        const ptrdiff_t stridea, b32x4 apply,
                                        b32x4 m8, b32x4 m16)
{
    uint8_t *dst2 = dst -6 ;   /* store position: 6 pixels left of the edge */
    dst -= 7;                  /* load position: 7 pixels left of the edge */
    u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
    u8x16 p6, p5, p4, q4, q5, q6;

    LOAD4_H(a)
    dst += stridea;
    LOAD4_H(b)
    dst += stridea;
    LOAD4_H(c)
    dst += stridea;
    LOAD4_H(d)

    /* Rearrangement, first half: produces p6..p0 and q0. */
    {
        MERGEH_4(a)
        MERGEH_4(b)
        MERGEH_4(c)
        MERGEH_4(d)

        MERGE_U16_0123(a)
        MERGE_U16_0123(b)
        MERGE_U16_0123(c)
        MERGE_U16_0123(d)

        MERGEH_U32_LINE(h)
        MERGEL_U32_LINE(h)
        MERGEH_U32_LINE(l)
        MERGEL_U32_LINE(l)

        MERGE_U32(p6, p5, h, h)
        MERGE_U32(p4, p3, l, h)
        MERGE_U32(p2, p1, h, l)
        MERGE_U32(p0, q0, l, l)
    }

    /* Rearrangement, second half: produces q1..q6. */
    {
        MERGEL_4(a)
        MERGEL_4(b)
        MERGEL_4(c)
        MERGEL_4(d)

        MERGE_U16_0123(a)
        MERGE_U16_0123(b)
        MERGE_U16_0123(c)
        MERGE_U16_0123(d)

        MERGEH_U32_LINE(h)
        MERGEL_U32_LINE(h)
        MERGEH_U32_LINE(l)

        MERGE_U32(q1, q2, h, h)
        MERGE_U32(q3, q4, l, h)
        MERGE_U32(q5, q6, h, l)
    }

    /* Several of these constants are not referenced by name below; they are
     * presumably picked up by the APPLY_* macros - TODO confirm. */
    const u8x16 F = vec_splat_u8(1);
    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3);
    const u16x8 v4u16 = vec_splat_u16(4);
    const u16x8 v8u16 = vec_splat_u16(8);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    /* Absolute differences used by the edge and flatness tests. */
    const u8x16 a_p6_p0 = vec_absd(p6, p0);
    const u8x16 a_p5_p0 = vec_absd(p5, p0);
    const u8x16 a_p4_p0 = vec_absd(p4, p0);
    const u8x16 a_q4_q0 = vec_absd(q4, q0);
    const u8x16 a_q5_q0 = vec_absd(q5, q0);
    const u8x16 a_q6_q0 = vec_absd(q6, q0);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);
    const u8x16 a_p3_p0 = vec_absd(p3, p0);
    const u8x16 a_q3_q0 = vec_absd(q3, q0);
    const u8x16 a_p3_p2 = vec_absd(p3, p2);
    const u8x16 a_q3_q2 = vec_absd(q3, q2);

    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
    const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0);
    const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0);
    const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0);

    b32x4 m8_16 = vec_or(m8, m16);   /* lanes using either wide variant */

    u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
    u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;

    /* cmp_E = saturating(2 * |p0 - q0| + (|p1 - q1| >> 1)) */
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    /* The outer-tap differences only count in lanes selected by m8 or m16. */
    cmp_I_m8 = vec_and(cmp_I_m8, (b8x16)m8_16);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);

    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);

    /* fm: lanes that pass both limits and are enabled by the caller. */
    b8x16 fm = vec_and(ltI, ltE);
    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0);

    m8_16 = vec_and(m8_16, (b32x4)fm);
    m16 = vec_and(m16, (b32x4)fm);

    cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
    cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out);

    /* "Flat" means every listed difference against p0/q0 is <= F (== 1). */
    b8x16 flat8in = vec_cmple(cmp_flat8in, F);
    b8x16 flat8out = vec_cmple(cmp_flat8out, F);
    flat8in = vec_and(flat8in, (b8x16)m8_16);
    flat8out = vec_and(flat8out, (b8x16)m16);

    b8x16 apply_16 = vec_and(flat8out, flat8in);    /* inner and outer flat */
    b8x16 apply_8 = vec_andc(flat8in, flat8out);    /* inner flat only */

    UNPACK_16(p6)
    UNPACK_16(p5)
    UNPACK_16(p4)
    UNPACK_16(p3)
    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)

    /* Remaining filtered lanes fall back to the 4-tap path. */
    b8x16 apply_4 = vec_and(fm, vec_nor(apply_16, apply_8));

    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)
    UNPACK_16(q3)
    UNPACK_16(q4)
    UNPACK_16(q5)
    UNPACK_16(q6)

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_16, zero)) {
        APPLY_16
    }

    if (vec_any_ne(apply_8, zero)) {
        APPLY_8
    }

    /* Rearrange p5..q5 back into row order for the stores below. */
    MERGEHL_U8(p5, p4)
    MERGEHL_U8(p3, p2)
    MERGEHL_U8(p1, p0)
    MERGEHL_U8(q0, q1)
    MERGEHL_U8(q2, q3)
    MERGEHL_U8(q4, q5)

    MERGEHL_U16(ab_p5p4p3p2, p5p4h, p3p2h)
    MERGEHL_U16(cd_p5p4p3p2, p5p4l, p3p2l)
    MERGEHL_U16(ab_p1p0q0q1, p1p0h, q0q1h)
    MERGEHL_U16(cd_p1p0q0q1, p1p0l, q0q1l)
    MERGEHL_U16(ab_q2q3q4q5, q2q3h, q4q5h)
    MERGEHL_U16(cd_q2q3q4q5, q2q3l, q4q5l)

    MERGEHL_U32(a_p5p4p3p2q2q3q4q5, ab_p5p4p3p2h, ab_q2q3q4q5h) // A0 p5p4p3p2 q2q3q4q5 A1
                                                                // A2 A3
    MERGEHL_U32(a_p1p0q0q1q2q3q4q5, ab_p1p0q0q1h, ab_q2q3q4q5h) // A0 p1p0q0q1 q2q3q4q5 A1
                                                                // A2 A3
    MERGEHL_U32(b_p5p4p3p2q2q3q4q5, ab_p5p4p3p2l, ab_q2q3q4q5l) // B0 p5p4p3p2 q2q3q4q5 B1
                                                                // B2 B3
    MERGEHL_U32(b_p1p0q0q1q2q3q4q5, ab_p1p0q0q1l, ab_q2q3q4q5l) // B0 p1p0q0q1 q2q3q4q5 B1
                                                                // B2 B3
    MERGEHL_U32(c_p5p4p3p2q2q3q4q5, cd_p5p4p3p2h, cd_q2q3q4q5h) // C0 p5p4p3p2 q2q3q4q5 C1
                                                                // C2 C3
    MERGEHL_U32(c_p1p0q0q1q2q3q4q5, cd_p1p0q0q1h, cd_q2q3q4q5h) // C0 p1p0q0q1 q2q3q4q5 C1
                                                                // C2 C3
    MERGEHL_U32(d_p5p4p3p2q2q3q4q5, cd_p5p4p3p2l, cd_q2q3q4q5l) // D0 p5p4p3p2 q2q3q4q5 D1
                                                                // D2 D3
    MERGEHL_U32(d_p1p0q0q1q2q3q4q5, cd_p1p0q0q1l, cd_q2q3q4q5l) // D0 p1p0q0q1 q2q3q4q5 D1
                                                                // D2 D3

    /* Each store writes the 12 bytes p5..q5 of one row. */
    MERGEHL_U32(a01, a_p5p4p3p2q2q3q4q5h, a_p1p0q0q1q2q3q4q5h) // A0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // A1
    vec_xst_len(a01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(a01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(a23, a_p5p4p3p2q2q3q4q5l, a_p1p0q0q1q2q3q4q5l) // A2
                                                               // A3
    vec_xst_len(a23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(a23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(b01, b_p5p4p3p2q2q3q4q5h, b_p1p0q0q1q2q3q4q5h) // B0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // B1
    vec_xst_len(b01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(b01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(b23, b_p5p4p3p2q2q3q4q5l, b_p1p0q0q1q2q3q4q5l) // B2
                                                               // B3
    vec_xst_len(b23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(b23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(c01, c_p5p4p3p2q2q3q4q5h, c_p1p0q0q1q2q3q4q5h) // C0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // C1
    vec_xst_len(c01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(c01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(c23, c_p5p4p3p2q2q3q4q5l, c_p1p0q0q1q2q3q4q5l) // C2
                                                               // C3
    vec_xst_len(c23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(c23l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(d01, d_p5p4p3p2q2q3q4q5h, d_p1p0q0q1q2q3q4q5h) // D0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
                                                               // D1
    vec_xst_len(d01h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(d01l, dst2, 12);
    dst2 += stridea;
    MERGEHL_U32(d23, d_p5p4p3p2q2q3q4q5l, d_p1p0q0q1q2q3q4q5l) // D2
                                                               // D3
    vec_xst_len(d23h, dst2, 12);
    dst2 += stridea;
    vec_xst_len(d23l, dst2, 12);
    dst2 += stridea;
}

/*
 * 4-tap loop filter for the vertical filtering direction.
 *
 * Loads the four 16-pixel rows p1, p0, q0, q1 around dst, builds the filter
 * mask fm, runs APPLY_4 and stores the four rows back. Returns without
 * storing when no lane passes the mask.
 *
 * dst      pointer to the first row on the q side of the edge
 * E, I     per-lane limits compared against cmp_E / cmp_I
 * H        not referenced directly here; presumably consumed by APPLY_4
 *          (TODO confirm)
 * strideb  distance in bytes between consecutive rows
 * apply    per-32-bit-lane enable mask
 */
static inline void loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                                       const ptrdiff_t strideb, b32x4 apply)
{
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;

    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);

    const u8x16 zero = vec_splat_u8(0);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);

    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    const u8x16 cmp_I = max_a_p1p0_q1q0;

    /* cmp_E = saturating(2 * |p0 - q0| + (|p1 - q1| >> 1)) */
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    const b8x16 ltI = vec_cmple(cmp_I, I);
    const b8x16 ltE = vec_cmple(cmp_E, E);
    b8x16 fm = vec_and(ltI, ltE);

    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

    UNPACK_16(p0)
    UNPACK_16(q0)

    APPLY_4

    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(p1, 0, p1d);
}

/*
 * 6-tap loop filter for the vertical filtering direction: reads the rows
 * p2..q2 and stores p1, p0, q0 and q1.
 */
static inline void loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                                       const ptrdiff_t strideb, b32x4 apply,
                                       b32x4 m6)
{
    uint8_t *p2d = dst + strideb * -3;
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst
+ strideb * +1;
    uint8_t *q2d = dst + strideb * +2;

    u8x16 p2 = vec_xl(0, p2d);
    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);
    u8x16 q2 = vec_xl(0, q2d);

    /* Some constants are not referenced by name below; they are presumably
     * picked up by the APPLY_4 / ADD_AND_SHIFT4 macros - TODO confirm. */
    const u8x16 F = vec_splat_u8(1);
    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3);
    const u16x8 v4u16 = vec_splat_u16(4);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);

    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
    const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
    u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
    u8x16 cmp_I_m6 = max_a_p2p1_q2q1;
    u8x16 cmp_I_m4 = max_a_p1p0_q1q0;

    /* cmp_E = saturating(2 * |p0 - q0| + (|p1 - q1| >> 1)) */
    cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
    /* The p2/q2 differences only count in lanes selected by m6. */
    cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6);
    u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6);

    const b8x16 ltE = vec_cmple(cmp_E, E);
    const b8x16 ltI = vec_cmple(cmp_I, I);

    /* fm: lanes that pass both limits and are enabled by the caller. */
    b8x16 fm = vec_and(ltI, ltE);
    fm = vec_and(fm, (b8x16)apply);
    if (vec_all_eq(fm, zero))
        return;

    UNPACK_16(p2)
    UNPACK_16(p1)
    UNPACK_16(p0)
    UNPACK_16(q0)
    UNPACK_16(q1)
    UNPACK_16(q2)

    m6 = vec_and(m6, (b32x4)fm);

    /* apply_6: flat lanes (differences against p0/q0 <= F) selected by m6;
     * every other filtered lane takes the 4-tap path. */
    u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
    b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6);
    b8x16 apply_4 = vec_andc(fm, apply_6);

    if (vec_any_ne(apply_4, zero)) {
        APPLY_4
    }

    if (vec_any_ne(apply_6, zero)) {
        /* Partial sums for the 6-tap outputs, built pairwise. The exact
         * expansion of DECLARE_ADD_16HL is outside this excerpt. */
        DECLARE_ADD_16HL(p2p2, p2, p2)
        DECLARE_ADD_16HL(p2p1, p2, p1)
        DECLARE_ADD_16HL(p1p0, p1, p0)
        DECLARE_ADD_16HL(p0q0, p0, q0)
        DECLARE_ADD_16HL(q0q1, q0, q1)
        DECLARE_ADD_16HL(q1q2, q1, q2)
        DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0)
        DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0)
        DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2)
        DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1)
        u16x8 q1q2q2q2h = q2h * 3 + q1h;
        u16x8 q1q2q2q2l = q2l * 3 + q1l;

        DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0)
        DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1)
        DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1)
        DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2)

        ADD_AND_SHIFT4(op1)
        ADD_AND_SHIFT4(op0)
        ADD_AND_SHIFT4(oq0)
        ADD_AND_SHIFT4(oq1)

        /* Take the new value only in lanes where apply_6 is set. */
        p1 = PACK_AND_SEL(p1, apply_6);
        p0 = PACK_AND_SEL(p0, apply_6);
        q0 = PACK_AND_SEL(q0, apply_6);
        q1 = PACK_AND_SEL(q1, apply_6);
    }

    vec_xst(p0, 0, p0d);
    vec_xst(q0, 0, q0d);
    vec_xst(q1, 0, q1d);
    vec_xst(p1, 0, p1d);
}

/*
 * 8-tap loop filter for the vertical filtering direction: reads the rows
 * p3..q3 and stores p2..q2.
 *
 * dst      pointer to the first row on the q side of the edge
 * E, I     per-lane limits compared against cmp_E / cmp_I
 * H        not referenced directly here; presumably consumed by APPLY_4
 *          (TODO confirm)
 * strideb  distance in bytes between consecutive rows
 * apply    per-32-bit-lane enable mask
 * m8       per-32-bit-lane mask selecting the 8-wide variant
 */
static inline void loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
                                       const ptrdiff_t strideb, b32x4 apply,
                                       b32x4 m8)
{
    uint8_t *p3d = dst + strideb * -4;
    uint8_t *p2d = dst + strideb * -3;
    uint8_t *p1d = dst + strideb * -2;
    uint8_t *p0d = dst + strideb * -1;
    uint8_t *q0d = dst + strideb * +0;
    uint8_t *q1d = dst + strideb * +1;
    uint8_t *q2d = dst + strideb * +2;
    uint8_t *q3d = dst + strideb * +3;

    u8x16 p3 = vec_xl(0, p3d);
    u8x16 p2 = vec_xl(0, p2d);
    u8x16 p1 = vec_xl(0, p1d);
    u8x16 p0 = vec_xl(0, p0d);
    u8x16 q0 = vec_xl(0, q0d);
    u8x16 q1 = vec_xl(0, q1d);
    u8x16 q2 = vec_xl(0, q2d);
    u8x16 q3 = vec_xl(0, q3d);

    const u8x16 F = vec_splat_u8(1);
    const u8x16 zero = vec_splat_u8(0);
    const u16x8 v3u16 = vec_splat_u16(3);
    const u16x8 v4u16 = vec_splat_u16(4);
    const u8x16 v1u8 = vec_splat_u8(1);
    const b8x16 s = (b8x16)vec_splats((uint8_t)128);

    const u8x16 a_p1_p0 = vec_absd(p1, p0);
    const u8x16 a_q1_q0 = vec_absd(q1, q0);
    const u8x16 a_p0_q0 = vec_absd(p0, q0);
    const u8x16 a_p1_q1 = vec_absd(p1, q1);
    const u8x16 a_p2_p1 = vec_absd(p2, p1);
    const u8x16 a_q2_q1 = vec_absd(q2, q1);
    const u8x16 a_p2_p0 = vec_absd(p2, p0);
    const u8x16 a_q2_q0 = vec_absd(q2, q0);
    const u8x16 a_p3_p0 = vec_absd(p3, p0);
    const u8x16 a_q3_q0 = vec_absd(q3, q0);
    const u8x16 a_p3_p2 = vec_absd(p3, p2);
    const u8x16 a_q3_q2 = vec_absd(q3, q2);

    u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
    u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2,
a_q3_q2); u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); u8x16 cmp_I_m4 = max_a_p1p0_q1q0; cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8); u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8); const b8x16 ltE = vec_cmple(cmp_E, E); const b8x16 ltI = vec_cmple(cmp_I, I); b8x16 fm = vec_and(ltI, ltE); fm = vec_and(fm, (b8x16)apply); if (vec_all_eq(fm, zero)) return; #define UNPACK_16(v) \ u16x8 v##h = u8h_to_u16(v); \ u16x8 v##l = u8l_to_u16(v); UNPACK_16(p3) UNPACK_16(p2) UNPACK_16(p1) UNPACK_16(p0) UNPACK_16(q0) UNPACK_16(q1) UNPACK_16(q2) UNPACK_16(q3) m8 = vec_and(m8, (b32x4)fm); u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8); b8x16 apply_4 = vec_andc(fm, apply_8); if (vec_any_ne(apply_4, zero)) { APPLY_4 } if (vec_any_ne(apply_8, zero)) { APPLY_8 } vec_xst(p0, 0, p0d); vec_xst(q0, 0, q0d); vec_xst(q1, 0, q1d); vec_xst(p1, 0, p1d); vec_xst(q2, 0, q2d); vec_xst(p2, 0, p2d); } static inline void loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H, const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16) { uint8_t *p6d = dst + strideb * -7; uint8_t *p5d = dst + strideb * -6; uint8_t *p4d = dst + strideb * -5; uint8_t *p3d = dst + strideb * -4; uint8_t *p2d = dst + strideb * -3; uint8_t *p1d = dst + strideb * -2; uint8_t *p0d = dst + strideb * -1; uint8_t *q0d = dst + strideb * +0; uint8_t *q1d = dst + strideb * +1; uint8_t *q2d = dst + strideb * +2; uint8_t *q3d = dst + strideb * +3; uint8_t *q4d = dst + strideb * +4; uint8_t *q5d = dst + strideb * +5; uint8_t *q6d = dst + strideb * +6; u8x16 p6 = vec_xl(0, p6d); u8x16 p5 = vec_xl(0, p5d); u8x16 p4 = vec_xl(0, p4d); u8x16 p3 = 
vec_xl(0, p3d); u8x16 p2 = vec_xl(0, p2d); u8x16 p1 = vec_xl(0, p1d); u8x16 p0 = vec_xl(0, p0d); u8x16 q0 = vec_xl(0, q0d); u8x16 q1 = vec_xl(0, q1d); u8x16 q2 = vec_xl(0, q2d); u8x16 q3 = vec_xl(0, q3d); u8x16 q4 = vec_xl(0, q4d); u8x16 q5 = vec_xl(0, q5d); u8x16 q6 = vec_xl(0, q6d); const u8x16 F = vec_splat_u8(1); const u8x16 zero = vec_splat_u8(0); const u16x8 v3u16 = vec_splat_u16(3); const u16x8 v4u16 = vec_splat_u16(4); const u16x8 v8u16 = vec_splat_u16(8); const u8x16 v1u8 = vec_splat_u8(1); const b8x16 s = (b8x16)vec_splats((uint8_t)128); const u8x16 a_p6_p0 = vec_absd(p6, p0); const u8x16 a_p5_p0 = vec_absd(p5, p0); const u8x16 a_p4_p0 = vec_absd(p4, p0); const u8x16 a_q4_q0 = vec_absd(q4, q0); const u8x16 a_q5_q0 = vec_absd(q5, q0); const u8x16 a_q6_q0 = vec_absd(q6, q0); const u8x16 a_p1_p0 = vec_absd(p1, p0); const u8x16 a_q1_q0 = vec_absd(q1, q0); const u8x16 a_p0_q0 = vec_absd(p0, q0); const u8x16 a_p1_q1 = vec_absd(p1, q1); const u8x16 a_p2_p1 = vec_absd(p2, p1); const u8x16 a_q2_q1 = vec_absd(q2, q1); const u8x16 a_p2_p0 = vec_absd(p2, p0); const u8x16 a_q2_q0 = vec_absd(q2, q0); const u8x16 a_p3_p0 = vec_absd(p3, p0); const u8x16 a_q3_q0 = vec_absd(q3, q0); const u8x16 a_p3_p2 = vec_absd(p3, p2); const u8x16 a_q3_q2 = vec_absd(q3, q2); u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1); u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2); u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0); const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0); const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0); const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0); const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0); const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0); b32x4 m8_16 = vec_or(m8, m16); u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0); u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2); u8x16 cmp_I_m4 = max_a_p1p0_q1q0; cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E); cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8_16); u8x16 cmp_I = 
vec_max(cmp_I_m4, cmp_I_m8); const b8x16 ltE = vec_cmple(cmp_E, E); const b8x16 ltI = vec_cmple(cmp_I, I); b8x16 fm = vec_and(ltI, ltE); fm = vec_and(fm, (b8x16)apply); if (vec_all_eq(fm, zero)) return; u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0); u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0); m8_16 = vec_and(m8_16, (b32x4)fm); m16 = vec_and(m16, (b32x4)fm); cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in); cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out); b8x16 flat8in = vec_cmple(cmp_flat8in, F); b8x16 flat8out = vec_cmple(cmp_flat8out, F); flat8in = vec_and(flat8in, (b8x16)m8_16); flat8out = vec_and(flat8out, (b8x16)m16); b8x16 apply_16 = vec_and(flat8out, flat8in); b8x16 apply_8 = vec_andc(flat8in, flat8out); UNPACK_16(p6) UNPACK_16(p5) UNPACK_16(p4) UNPACK_16(p3) UNPACK_16(p2) UNPACK_16(p1) UNPACK_16(p0) b8x16 apply_4 = vec_nor(apply_16, apply_8); UNPACK_16(q0) UNPACK_16(q1) UNPACK_16(q2) UNPACK_16(q3) UNPACK_16(q4) UNPACK_16(q5) UNPACK_16(q6) if (vec_any_ne(apply_4, zero)) { APPLY_4 } if (vec_any_ne(apply_16, zero)) { APPLY_16 } if (vec_any_ne(apply_8, zero)) { APPLY_8 } vec_xst(p5, 0, p5d); vec_xst(p4, 0, p4d); vec_xst(p3, 0, p3d); vec_xst(p2, 0, p2d); vec_xst(p1, 0, p1d); vec_xst(p0, 0, p0d); vec_xst(q0, 0, q0d); vec_xst(q1, 0, q1d); vec_xst(q2, 0, q2d); vec_xst(q3, 0, q3d); vec_xst(q4, 0, q4d); vec_xst(q5, 0, q5d); } #if defined(DAV2D_VSX) #define LPF(fn) BF(dav2d_lpf_##fn, vsx) #elif defined(DAV2D_PWR9) #define LPF(fn) BF(dav2d_lpf_##fn, pwr9) #endif void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride, const uint32_t *const vmask, const uint8_t (*l)[4], ptrdiff_t b4_stride, const Av2FilterLUT *lut, const int h) { unsigned vm = vmask[0] | vmask[1] | vmask[2]; u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]); u32x4 vm1 = vec_splats(vmask[1]); u32x4 vm2 = vec_splats(vmask[2]); u32x4 mm = (u32x4){1, 2, 4, 8}; const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp); const u8x16 s0 = vec_splat(sharp, 0); const u8x16 
s1 = vec_splat(sharp, 8);
    const u32x4 v4u32 = vec_splat_u32(4);
    const u32x4 zero = vec_splat_u32(0);
    const u8x16 v1u8 = vec_splat_u8(1);
    const u8x16 v2u8 = vec_splat_u8(2);
    const u8x16 v4u8 = vec_splat_u8(4);
    const uint8_t (*pl)[4] = &l[-1];   // start one entry before l so l[-1] and l[0] load together

    // Permute pattern: replicate byte 0 of each 32-bit lane across that lane.
    const u8x16 spread = (u8x16){
        0x00, 0x00, 0x00, 0x00,
        0x04, 0x04, 0x04, 0x04,
        0x08, 0x08, 0x08, 0x08,
        0x0c, 0x0c, 0x0c, 0x0c,
    };

    // Each iteration consumes 4 mask bits and advances dst by 16 rows.
    // mm is shifted in step with vm so it always selects the current 4 bits.
    for (;
        vm;
        vm >>= 4,
        mm = vec_sl(mm, v4u32),
        dst += 4 * 4 * PXSTRIDE(stride),
        pl += 4 * b4_stride) {
        if (!(vm & 0x0f)) continue;
        u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
        u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
        u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
        u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);

        u32x4 Lac = vec_mergeh(la, lc); // la[-1] lc[-1] la[0] lc[0]
        u32x4 Lbd = vec_mergeh(lb, ld); // lb[-1] ld[-1] lb[0] ld[0]

        u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
        u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
        u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]

        u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
        u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]

        b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);

        u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }

        u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...

        // Per-lane all-ones where the corresponding mask bit is set.
        b32x4 m16 = vec_cmpeq(wd16, mm);
        b32x4 m8 = vec_cmpeq(wd8, mm);
        b32x4 m4 = vec_cmpeq(wd4, mm);

        b32x4 apply = vec_cmpne((u32x4)L, zero);

        // Nothing to do when the level is zero in all four lanes.
        if (vec_all_eq((u32x4)L, zero))
            continue;

        u8x16 I = vec_sr(L, s0); // L >> sharp[0]
        u8x16 H = vec_sr(L, v4u8); // L >> 4
        I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
        u8x16 E = vec_add(L, v2u8); // L + 2
        I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
        E = vec_add(E, E); // 2 * (L + 2)
        E = vec_add(E, I); // 2 * (L + 2) + limit

        // Filter only lanes that have both a mask bit and a non-zero level.
        apply = vec_and(m4, apply);

        // Use the widest variant requested by any lane; narrower lanes are
        // handled inside it through the m8 / m16 masks.
        if (vec_any_ne(wd16, zero)) {
            loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
        } else if (vec_any_ne(wd8, zero)) {
            loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
        } else { // wd4 == 0 already tested
            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
        }
    }
}

/*
 * Luma loop filter, vertical filtering direction.
 *
 * Walks the combined mask vmask[0] | vmask[1] | vmask[2] four bits at a time,
 * advancing dst by 16 pixels per step, and dispatches each group to
 * loop_filter_v_16_all, loop_filter_v_8_all or loop_filter_v_4_all.
 * The parameter w is not referenced in the body.
 */
void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
                 const uint32_t *const vmask,
                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
                 const Av2FilterLUT *lut, const int w)
{
    unsigned vm = vmask[0] | vmask[1] | vmask[2];

    u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]);
    u32x4 vm1 = vec_splats(vmask[1]);
    u32x4 vm2 = vec_splats(vmask[2]);

    u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
    u8x16 s0 = vec_splat(sharp, 0);
    u8x16 s1 = vec_splat(sharp, 8);
    u32x4 mm = (u32x4){1, 2, 4, 8};
    u32x4 v4u32 = vec_splat_u32(4);
    u32x4 zero = vec_splat_u32(0);
    u8x16 v1u8 = vec_splat_u8(1);
    u8x16 v2u8 = vec_splat_u8(2);
    u8x16 v4u8 = vec_splat_u8(4);
    const uint8_t (*pl)[4] = l;
    const uint8_t (*plb4)[4] = l - b4_stride;   // level entries one b4 row back
    // Permute pattern: replicate byte 0 of each 32-bit lane across that lane.
    const u8x16 spread = (u8x16){
        0x00, 0x00, 0x00, 0x00,
        0x04, 0x04, 0x04, 0x04,
        0x08, 0x08, 0x08, 0x08,
        0x0c, 0x0c, 0x0c, 0x0c,
    };

    for (;
        vm;
        vm >>= 4,
        mm = vec_sl(mm, v4u32),
        dst += 4 * 4,
        pl += 4,
        plb4 += 4) {
        if (!(vm & 0x0f)) continue;
        u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
        u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);

        u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
        u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
        u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);

        u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }

        u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...

        // Per-lane all-ones where the corresponding mask bit is set.
        b32x4 m16 = vec_cmpeq(wd16, mm);
        b32x4 m8 = vec_cmpeq(wd8, mm);
        b32x4 m4 = vec_cmpeq(wd4, mm);

        b32x4 apply = vec_cmpne((u32x4)L, zero);

        // Nothing to do when the level is zero in all four lanes.
        if (vec_all_eq((u32x4)L, zero))
            continue;

        u8x16 I = vec_sr(L, s0); // L >> sharp[0]
        u8x16 H = vec_sr(L, v4u8); // L >> 4
        I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
        u8x16 E = vec_add(L, v2u8); // L + 2
        I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
        E = vec_add(E, E); // 2 * (L + 2)
        E = vec_add(E, I); // 2 * (L + 2) + limit

        // Filter only lanes that have both a mask bit and a non-zero level.
        apply = vec_and(apply, m4);

        if (vec_any_ne(wd16, zero)) {
            loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
        } else if (vec_any_ne(wd8, zero)) {
            loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
        } else {
            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
        }
    }
}

/*
 * Chroma loop filter, horizontal filtering direction.
 *
 * Walks the combined mask vmask[0] | vmask[1] four bits at a time, advancing
 * dst by 16 rows per step, and dispatches each group to loop_filter_h_6_all
 * or loop_filter_h_4_all. The parameter h is not referenced in the body.
 */
void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
                  const uint32_t *const vmask,
                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
                  const Av2FilterLUT *lut, const int h)
{
    unsigned vm = vmask[0] | vmask[1];
    u32x4 vm0 = vec_splats(vm);
    u32x4 vm1 = vec_splats(vmask[1]);
    u32x4 mm = (u32x4){1, 2, 4, 8};

    const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
    const u8x16 s0 = vec_splat(sharp, 0);
    const u8x16 s1 = vec_splat(sharp, 8);
    const u32x4 v4u32 = vec_splat_u32(4);
    const u32x4 zero = vec_splat_u32(0);
    const u8x16 v1u8 = vec_splat_u8(1);
    const u8x16 v2u8 = vec_splat_u8(2);
    const u8x16 v4u8 = vec_splat_u8(4);
    const uint8_t (*pl)[4] = &l[-1];   // start one entry before l so l[-1] and l[0] load together

    // Permute pattern: replicate byte 0 of each 32-bit lane across that lane.
    const u8x16 spread = (u8x16){
        0x00, 0x00, 0x00, 0x00,
        0x04, 0x04, 0x04, 0x04,
        0x08, 0x08, 0x08, 0x08,
        0x0c, 0x0c, 0x0c, 0x0c,
    };

    for (;
        vm;
        vm >>= 4,
        mm = vec_sl(mm, v4u32),
        dst += 4 * 4 * PXSTRIDE(stride),
        pl += 4 * b4_stride) {
        if (!(vm & 0x0f)) continue;
        u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
        u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
        u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
        u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);

        u32x4 Lac = vec_mergeh(la, lc); // la[-1] lc[-1] la[0] lc[0]
        u32x4 Lbd = vec_mergeh(lb, ld); // lb[-1] ld[-1] lb[0] ld[0]

        u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
        u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]

        u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
        u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]

        b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);

        u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }

        u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...

        b32x4 m6 = vec_cmpeq(wd6, mm);
        b32x4 m4 = vec_cmpeq(wd4, mm);

        b32x4 apply = vec_cmpne((u32x4)L, zero);

        // Nothing to do when the level is zero in all four lanes.
        if (vec_all_eq((u32x4)L, zero))
            continue;

        u8x16 I = vec_sr(L, s0); // L >> sharp[0]
        u8x16 H = vec_sr(L, v4u8); // L >> 4
        I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
        u8x16 E = vec_add(L, v2u8); // L + 2
        I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
        E = vec_add(E, E); // 2 * (L + 2)
        E = vec_add(E, I); // 2 * (L + 2) + limit

        // Filter only lanes that have both a mask bit and a non-zero level.
        apply = vec_and(m4, apply);

        if (vec_any_ne(wd6, zero)) {
            loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6); // 6-tap variant; m6 marks the lanes that requested it
        } else { // wd4 == 0 already tested
            loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply); // 4-tap variant for every lane
        }
    }
}

/*
 * Chroma loop filter, vertical filtering direction.
 *
 * Walks the combined mask vmask[0] | vmask[1] four bits at a time, advancing
 * dst by 16 pixels per step, and dispatches each group to loop_filter_v_6_all
 * or loop_filter_v_4_all. The parameter w is not referenced in the body.
 */
void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride,
                  const uint32_t *const vmask,
                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
                  const Av2FilterLUT *lut, const int w)
{
    unsigned vm = vmask[0] | vmask[1];
    u32x4 vm0 = vec_splats(vm);
    u32x4 vm1 = vec_splats(vmask[1]);

    u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
    u8x16 s0 = vec_splat(sharp, 0);
    u8x16 s1 = vec_splat(sharp, 8);
    u32x4 mm = (u32x4){1, 2, 4, 8};
    u32x4 v4u32 = vec_splat_u32(4);
    u32x4 zero = vec_splat_u32(0);
    u8x16 v1u8 = vec_splat_u8(1);
    u8x16 v2u8 = vec_splat_u8(2);
    u8x16 v4u8 = vec_splat_u8(4);
    const uint8_t (*pl)[4] = l;
    const uint8_t (*plb4)[4] = l - b4_stride;   // level entries one b4 row back
    // Permute pattern: replicate byte 0 of each 32-bit lane across that lane.
    const u8x16 spread = (u8x16){
        0x00, 0x00, 0x00, 0x00,
        0x04, 0x04, 0x04, 0x04,
        0x08, 0x08, 0x08, 0x08,
        0x0c, 0x0c, 0x0c, 0x0c,
    };

    for (;
        vm;
        vm >>= 4,
        mm = vec_sl(mm, v4u32),
        dst += 4 * 4,
        pl += 4,
        plb4 += 4) {
        if (!(vm & 0x0f)) continue;
        u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
        u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);

        u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
        u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]

        b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);

        u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }

        u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...

        b32x4 m6 = vec_cmpeq(wd6, mm);
        b32x4 m4 = vec_cmpeq(wd4, mm);

        b32x4 apply = vec_cmpne((u32x4)L, zero);

        // Nothing to do when the level is zero in all four lanes.
        if (vec_all_eq((u32x4)L, zero))
            continue;

        u8x16 I = vec_sr(L, s0); // L >> sharp[0]
        u8x16 H = vec_sr(L, v4u8); // L >> 4
        I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
        u8x16 E = vec_add(L, v2u8); // L + 2
        I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
        E = vec_add(E, E); // 2 * (L + 2)
        E = vec_add(E, I); // 2 * (L + 2) + limit

        // Filter only lanes that have both a mask bit and a non-zero level.
        apply = vec_and(apply, m4);

        if (vec_any_ne(wd6, zero)) {
            loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6);
        } else {
            loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
        }
    }
}

#endif // BITDEPTH
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/looprestoration.h000066400000000000000000000041741517466257200250370ustar00rootroot00000000000000/*
 * Copyright © 2019, VideoLAN and dav2d authors
 * Copyright © 2019, Michail Alvanos
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common/intops.h"

#include "src/cpu.h"
#include "src/looprestoration.h"

/* Wiener filter implemented elsewhere for the VSX target. */
void dav2d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges);

/*
 * Installs the PowerPC loop-restoration routines into c.
 *
 * Does nothing unless the CPU flags report VSX. For BITDEPTH == 8 both
 * wiener[] entries are pointed at dav2d_wiener_filter_vsx; for other bit
 * depths nothing is installed. The parameter bpc is not referenced.
 */
static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav2dLoopRestorationDSPContext *const c, const int bpc) {
    const unsigned flags = dav2d_get_cpu_flags();

    if (!(flags & DAV2D_PPC_CPU_FLAG_VSX)) return;

#if BITDEPTH == 8
    c->wiener[0] = c->wiener[1] = dav2d_wiener_filter_vsx;
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/mc.h000066400000000000000000000035611517466257200221720ustar00rootroot00000000000000/*
 * Copyright © 2024, VideoLAN and dav2d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"
#include "src/mc.h"

/* POWER9 blend entry points, declared through the project's decl_* macros. */
decl_blend_fn(BF(dav2d_blend, pwr9));
decl_blend_dir_fn(BF(dav2d_blend_h, pwr9));
decl_blend_dir_fn(BF(dav2d_blend_v, pwr9));

/*
 * Installs the PowerPC motion-compensation routines into c.
 *
 * Does nothing unless the CPU flags report PWR9. For BITDEPTH == 8 the
 * blend, blend_h and blend_v entries are set to their pwr9 implementations;
 * for other bit depths nothing is installed.
 */
static ALWAYS_INLINE void mc_dsp_init_ppc(Dav2dMCDSPContext *const c) {
    const unsigned flags = dav2d_get_cpu_flags();

    if (!(flags & DAV2D_PPC_CPU_FLAG_PWR9)) return;

#if BITDEPTH == 8
    c->blend = BF(dav2d_blend, pwr9);
    c->blend_h = BF(dav2d_blend_h, pwr9);
    c->blend_v = BF(dav2d_blend_v, pwr9);
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/mc_tmpl.c000066400000000000000000000434421517466257200232230ustar00rootroot00000000000000/*
 * Copyright © 2024, VideoLAN and dav2d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "common/attributes.h" #include "src/ppc/mc.h" #include "src/tables.h" #include "src/ppc/dav2d_types.h" #if BITDEPTH == 8 #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) typedef void (*blend_line)(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride); #define BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) \ { \ u16x8 anm0 = vec_mule(ab0, nm_m0); \ u16x8 anm1 = vec_mule(ab1, nm_m1); \ u16x8 anm2 = vec_mule(ab2, nm_m2); \ u16x8 anm3 = vec_mule(ab3, nm_m3); \ \ u16x8 bm0 = vec_mulo(ab0, nm_m0); \ u16x8 bm1 = vec_mulo(ab1, nm_m1); \ u16x8 bm2 = vec_mulo(ab2, nm_m2); \ u16x8 bm3 = vec_mulo(ab3, nm_m3); \ \ d0_u16 = vec_add(anm0, bm0); \ d1_u16 = vec_add(anm1, bm1); \ d2_u16 = vec_add(anm2, bm2); \ d3_u16 = vec_add(anm3, bm3); \ \ d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \ d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \ d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \ d3_u16 = vec_add(d3_u16, vec_splats((uint16_t)32)); \ \ d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \ d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \ d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \ d3_u16 = vec_sr(d3_u16, vec_splat_u16(6)); \ } #define BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_2) \ { \ u16x8 anm0 = vec_mule(ab0, nm_m0); \ u16x8 anm1 = vec_mule(ab1, nm_m1); \ u16x8 anm2 = vec_mule(ab2, nm_m2); \ \ u16x8 bm0 = vec_mulo(ab0, nm_m0); \ u16x8 bm1 = vec_mulo(ab1, nm_m1); \ u16x8 bm2 = vec_mulo(ab2, nm_m2); \ \ d0_u16 = vec_add(anm0, bm0); \ d1_u16 = vec_add(anm1, bm1); \ d2_u16 = vec_add(anm2, bm2); \ \ d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \ d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \ d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \ \ d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \ d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \ d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \ } #define BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m1) \ { \ u16x8 
anm0 = vec_mule(ab0, nm_m0); \ u16x8 anm1 = vec_mule(ab1, nm_m1); \ \ u16x8 bm0 = vec_mulo(ab0, nm_m0); \ u16x8 bm1 = vec_mulo(ab1, nm_m1); \ \ d0_u16 = vec_add(anm0, bm0); \ d1_u16 = vec_add(anm1, bm1); \ \ d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \ d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \ \ d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \ d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \ } static void blend4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 a2 = vec_xl(0, dst + 2 * stride); u8x16 a3 = vec_xl(0, dst + 3 * stride); u8x16 m0 = vec_xl(0, mask); u8x16 m1 = vec_xl(0, mask + 4); u8x16 m2 = vec_xl(0, mask + 2 * 4); u8x16 m3 = vec_xl(0, mask + 3 * 4); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + 4); u8x16 b2 = vec_xl(0, tmp + 2 * 4); u8x16 b3 = vec_xl(0, tmp + 3 * 4); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 nm1 = vec_sub(v64u8, m1); u8x16 nm2 = vec_sub(v64u8, m2); u8x16 nm3 = vec_sub(v64u8, m3); u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd u8x16 nm_m0 = vec_mergeh(nm0, m0); u8x16 nm_m1 = vec_mergeh(nm1, m1); u8x16 nm_m2 = vec_mergeh(nm2, m2); u8x16 nm_m3 = vec_mergeh(nm3, m3); u16x8 d0_u16, d1_u16, d2_u16, d3_u16; BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3); u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16); u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16); u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16); u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16); vec_xst_len(d0, dst, 4); vec_xst_len(d1, dst + stride, 4); vec_xst_len(d2, dst + 2 * stride, 4); vec_xst_len(d3, dst + 3 * stride, 4); } static void blend8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, 
dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 a2 = vec_xl(0, dst + 2 * stride); u8x16 a3 = vec_xl(0, dst + 3 * stride); u8x16 m0 = vec_xl(0, mask); u8x16 m1 = vec_xl(0, mask + 8); u8x16 m2 = vec_xl(0, mask + 2 * 8); u8x16 m3 = vec_xl(0, mask + 3 * 8); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + 8); u8x16 b2 = vec_xl(0, tmp + 2 * 8); u8x16 b3 = vec_xl(0, tmp + 3 * 8); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 nm1 = vec_sub(v64u8, m1); u8x16 nm2 = vec_sub(v64u8, m2); u8x16 nm3 = vec_sub(v64u8, m3); u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd u8x16 nm_m0 = vec_mergeh(nm0, m0); u8x16 nm_m1 = vec_mergeh(nm1, m1); u8x16 nm_m2 = vec_mergeh(nm2, m2); u8x16 nm_m3 = vec_mergeh(nm3, m3); u16x8 d0_u16, d1_u16, d2_u16, d3_u16; BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3); u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16); u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16); u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16); u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16); vec_xst_len(d0, dst, 8); vec_xst_len(d1, dst + stride, 8); vec_xst_len(d2, dst + 2 * stride, 8); vec_xst_len(d3, dst + 3 * stride, 8); } static inline void blend16_lines(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 a2 = vec_xl(0, dst + 2 * stride); u8x16 a3 = vec_xl(0, dst + 3 * stride); u8x16 m0 = vec_xl(0, mask); u8x16 m1 = vec_xl(0, mask + mstride); u8x16 m2 = vec_xl(0, mask + 2 * mstride); u8x16 m3 = vec_xl(0, mask + 3 * mstride); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + mstride); u8x16 b2 = vec_xl(0, tmp + 2 * mstride); u8x16 b3 = vec_xl(0, tmp + 3 * mstride); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 nm1 = vec_sub(v64u8, m1); u8x16 nm2 = vec_sub(v64u8, m2); u8x16 nm3 = 
vec_sub(v64u8, m3); u8x16 ab0 = vec_mergeh(a0, b0); u8x16 ab1 = vec_mergeh(a1, b1); u8x16 ab2 = vec_mergeh(a2, b2); u8x16 ab3 = vec_mergeh(a3, b3); u8x16 nm_m0 = vec_mergeh(nm0, m0); u8x16 nm_m1 = vec_mergeh(nm1, m1); u8x16 nm_m2 = vec_mergeh(nm2, m2); u8x16 nm_m3 = vec_mergeh(nm3, m3); u16x8 d0h_u16, d1h_u16, d2h_u16, d3h_u16; u16x8 d0l_u16, d1l_u16, d2l_u16, d3l_u16; BLEND_LINES4(d0h_u16, d1h_u16, d2h_u16, d3h_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) ab0 = vec_mergel(a0, b0); ab1 = vec_mergel(a1, b1); ab2 = vec_mergel(a2, b2); ab3 = vec_mergel(a3, b3); nm_m0 = vec_mergel(nm0, m0); nm_m1 = vec_mergel(nm1, m1); nm_m2 = vec_mergel(nm2, m2); nm_m3 = vec_mergel(nm3, m3); BLEND_LINES4(d0l_u16, d1l_u16, d2l_u16, d3l_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16); u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16); u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16); u8x16 d3 = (u8x16)vec_pack(d3h_u16, d3l_u16); vec_xst(d0, 0,dst); vec_xst(d1, 0,dst + stride); vec_xst(d2, 0,dst + 2 * stride); vec_xst(d3, 0,dst + 3 * stride); } static void blend16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend16_lines(dst, tmp, mask, stride, 16); } static void blend32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { for (int i = 0; i < 2; i++, dst += 16, tmp += 16, mask += 16) { blend16_lines(dst, tmp, mask, stride, 32); } } static blend_line blend_funcs[4] = { blend4, blend8, blend16, blend32 }; void dav2d_blend_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h, const uint8_t *mask) { assert(w <= 32); blend_line blend = blend_funcs[ctz(w) - 2]; for (int y = 0; y < h; y+=4) { blend(dst, tmp, mask, PXSTRIDE(dst_stride)); dst += 4 * PXSTRIDE(dst_stride); tmp += 4 * w; mask += 4 * w; } } static inline void blend_v_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = 
vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 m0 = vec_xl(0, mask); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + mstride); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd u8x16 nm_m0 = vec_mergeh(nm0, m0); u16x8 d0_u16, d1_u16; BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m0); u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16); u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16); vec_xst_len(d0, dst, l); vec_xst_len(d1, dst + stride, l); } static inline void blend_v_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 m0 = vec_xl(0, mask); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + mstride); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 ab0 = vec_mergeh(a0, b0); u8x16 ab1 = vec_mergeh(a1, b1); u8x16 nm_m0 = vec_mergeh(nm0, m0); u16x8 d0h_u16, d1h_u16; u16x8 d0l_u16, d1l_u16; BLEND_LINES2(d0h_u16, d1h_u16, ab0, ab1, nm_m0, nm_m0) ab0 = vec_mergel(a0, b0); ab1 = vec_mergel(a1, b1); nm_m0 = vec_mergel(nm0, m0); BLEND_LINES2(d0l_u16, d1l_u16, ab0, ab1,nm_m0, nm_m0) u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16); u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16); vec_xst_len(d0, dst, l); vec_xst_len(d1, dst + stride, l); } static void blend_v3(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_v_h(dst, tmp, mask, stride, 4, 3); } static void blend_v6(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_v_h(dst, tmp, mask, stride, 8, 6); } static void blend_v12(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_v_hl(dst, tmp, mask, stride, 16, 12); } static void blend_v24(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_v_hl(dst, tmp, mask, stride, 32, 16); blend_v_h(dst + 16, tmp + 16, mask + 16, stride, 32, 8); } static void blend_v1(uint8_t 
*dst, const uint8_t *tmp, const uint8_t *mask, int stride) { dst[0] = blend_px(dst[0], tmp[0], mask[0]); dst[stride] = blend_px(dst[stride], tmp[2], mask[0]); } static blend_line blend_v_funcs[5] = { blend_v1, blend_v3, blend_v6, blend_v12, blend_v24 }; void dav2d_blend_v_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h) { const uint8_t *const mask = &dav2d_obmc_masks[w]; assert(w <= 32); blend_line blend = blend_v_funcs[ctz(w) - 1]; for (int y = 0; y < h; y+=2) { blend(dst, tmp, mask, PXSTRIDE(dst_stride)); dst += 2 * PXSTRIDE(dst_stride); tmp += 2 * w; } } static inline void blend_h_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 a2 = vec_xl(0, dst + 2 * stride); u8x16 m = vec_xl(0, mask); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + mstride); u8x16 b2 = vec_xl(0, tmp + 2 * mstride); u8x16 m0 = vec_splat(m, 0); u8x16 m1 = vec_splat(m, 1); u8x16 m2 = vec_splat(m, 2); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 nm1 = vec_sub(v64u8, m1); u8x16 nm2 = vec_sub(v64u8, m2); u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd u8x16 nm_m0 = vec_mergeh(nm0, m0); u8x16 nm_m1 = vec_mergeh(nm1, m1); u8x16 nm_m2 = vec_mergeh(nm2, m2); u16x8 d0_u16, d1_u16, d2_u16; BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2); u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16); u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16); u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16); vec_xst_len(d0, dst, l); vec_xst_len(d1, dst + stride, l); vec_xst_len(d2, dst + 2 * stride, l); } static inline void blend_h_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride) { u8x16 v64u8 = vec_splats((uint8_t)64); u8x16 a0 = vec_xl(0, dst); u8x16 a1 = vec_xl(0, dst + stride); u8x16 a2 = vec_xl(0, 
dst + 2 * stride); u8x16 m = vec_xl(0, mask); u8x16 b0 = vec_xl(0, tmp); u8x16 b1 = vec_xl(0, tmp + mstride); u8x16 b2 = vec_xl(0, tmp + 2 * mstride); u8x16 m0 = vec_splat(m, 0); u8x16 m1 = vec_splat(m, 1); u8x16 m2 = vec_splat(m, 2); u8x16 nm0 = vec_sub(v64u8, m0); u8x16 nm1 = vec_sub(v64u8, m1); u8x16 nm2 = vec_sub(v64u8, m2); u8x16 ab0 = vec_mergeh(a0, b0); u8x16 ab1 = vec_mergeh(a1, b1); u8x16 ab2 = vec_mergeh(a2, b2); u8x16 nm_m0 = vec_mergeh(nm0, m0); u8x16 nm_m1 = vec_mergeh(nm1, m1); u8x16 nm_m2 = vec_mergeh(nm2, m2); u16x8 d0h_u16, d1h_u16, d2h_u16; u16x8 d0l_u16, d1l_u16, d2l_u16; BLEND_LINES3(d0h_u16, d1h_u16, d2h_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2) ab0 = vec_mergel(a0, b0); ab1 = vec_mergel(a1, b1); ab2 = vec_mergel(a2, b2); nm_m0 = vec_mergel(nm0, m0); nm_m1 = vec_mergel(nm1, m1); nm_m2 = vec_mergel(nm2, m2); BLEND_LINES3(d0l_u16, d1l_u16, d2l_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2) u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16); u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16); u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16); vec_xst(d0, 0, dst); vec_xst(d1, 0,dst + stride); vec_xst(d2, 0,dst + 2 * stride); } static void blend_h2(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { for (int y = 0; y < 3; y++) { const int m = *mask++; for (int x = 0; x < 2; x++) { dst[x] = blend_px(dst[x], tmp[x], m); } dst += stride; tmp += 2; } } static void blend_h4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_h_h(dst, tmp, mask, stride, 4, 4); } static void blend_h8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_h_h(dst, tmp, mask, stride, 8, 8); } static void blend_h16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_h_hl(dst, tmp, mask, stride, 16); } static void blend_h32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { blend_h_hl(dst, tmp, mask, stride, 32); blend_h_hl(dst + 16, tmp + 16, mask, stride, 32); } static void blend_h64(uint8_t *dst, 
const uint8_t *tmp, const uint8_t *mask, int stride) { blend_h_hl(dst, tmp, mask, stride, 64); blend_h_hl(dst + 16, tmp + 16, mask, stride, 64); blend_h_hl(dst + 32, tmp + 32, mask, stride, 64); blend_h_hl(dst + 48, tmp + 48, mask, stride, 64); } static void blend_h128(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) { for (int i = 0; i < 2; i++, dst += 64, tmp += 64) { blend_h_hl(dst, tmp, mask, stride, 128); blend_h_hl(dst + 16, tmp + 16, mask, stride, 128); blend_h_hl(dst + 32, tmp + 32, mask, stride, 128); blend_h_hl(dst + 48, tmp + 48, mask, stride, 128); } } static blend_line blend_h_funcs[7] = { blend_h2, blend_h4, blend_h8, blend_h16, blend_h32, blend_h64, blend_h128 }; void dav2d_blend_h_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h) { const uint8_t *mask = &dav2d_obmc_masks[h]; h = (h * 3) >> 2; assert(w <= 128); blend_line blend = blend_h_funcs[ctz(w) - 1]; if (h == 1) { const int m = *mask++; for (int x = 0; x < w; x++) { dst[x] = blend_px(dst[x], tmp[x], m); } } else for (int y = 0; y < h; y+=3) { blend(dst, tmp, mask, PXSTRIDE(dst_stride)); dst += 3 * PXSTRIDE(dst_stride); tmp += 3 * w; mask += 3; } } #endif // BITDEPTH dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ppc/utils.h000066400000000000000000000072151517466257200227330ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Luca Barbato * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_PPC_UTILS_H #define DAV2D_SRC_PPC_UTILS_H #include "src/ppc/dav2d_types.h" #define assert_eq(a, b) \ if ((a) != (b)) \ printf("%d: %d vs %d\n", __LINE__, a, b); \ assert((a) == (b)); #define MERGE_I32(a, b, h, l) \ { \ h = vec_mergeh(a, b); \ l = vec_mergel(a, b); \ } #define DECLARE_MERGE_I32(a, b, h, l) \ i32x4 h, l; \ MERGE_I32(a, b, h, l) // Transpose a 4x4 matrix of i32x4 vectors #define TRANSPOSE4_I32(c0, c1, c2, c3) \ { \ DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ \ MERGE_I32(m02h, m13h, c0, c1) \ MERGE_I32(m02l, m13l, c2, c3) \ } // Transpose a 8x8 matrix of i32x4 vectors #define TRANSPOSE8_I32(c0, c1, c2, c3, c4, c5, c6, c7, \ c8, c9, cA, cB, cC, cD, cE, cF) \ { \ DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ DECLARE_MERGE_I32(c4, c6, m46h, m46l) \ DECLARE_MERGE_I32(c5, c7, m57h, m57l) \ DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \ DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \ DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \ DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \ \ MERGE_I32(m02h, m13h, c0, c1) \ MERGE_I32(m02l, m13l, c2, c3) \ MERGE_I32(m46h, m57h, c8, c9) \ MERGE_I32(m46l, m57l, cA, cB) \ MERGE_I32(m8Ah, m9Bh, c4, c5) \ 
MERGE_I32(m8Al, m9Bl, c6, c7) \ MERGE_I32(mCEh, mDFh, cC, cD) \ MERGE_I32(mCEl, mDFl, cE, cF) \ } // Transpose a 4x16 matrix of i32x4 vectors #define TRANSPOSE4x16_I32(c0, c1, c2, c3, c4, c5, c6, c7, \ c8, c9, cA, cB, cC, cD, cE, cF) \ { \ DECLARE_MERGE_I32(c0, c2, m02h, m02l) \ DECLARE_MERGE_I32(c1, c3, m13h, m13l) \ DECLARE_MERGE_I32(c4, c6, m46h, m46l) \ DECLARE_MERGE_I32(c5, c7, m57h, m57l) \ DECLARE_MERGE_I32(c8, cA, m8Ah, m8Al) \ DECLARE_MERGE_I32(c9, cB, m9Bh, m9Bl) \ DECLARE_MERGE_I32(cC, cE, mCEh, mCEl) \ DECLARE_MERGE_I32(cD, cF, mDFh, mDFl) \ \ MERGE_I32(m02h, m13h, c0, c1) \ MERGE_I32(m02l, m13l, c2, c3) \ MERGE_I32(m46h, m57h, c4, c5) \ MERGE_I32(m46l, m57l, c6, c7) \ MERGE_I32(m8Ah, m9Bh, c8, c9) \ MERGE_I32(m8Al, m9Bl, cA, cB) \ MERGE_I32(mCEh, mDFh, cC, cD) \ MERGE_I32(mCEl, mDFl, cE, cF) \ } #endif // DAV2D_SRC_PPC_UTILS_H dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/quantizer.c000066400000000000000000005405661517466257200230410ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/attributes.h" #include "src/quantizer.h" int dav2d_dq_lookup(int qidx) { if (!qidx) return 64; qidx--; const int shift = qidx / 24; qidx %= 24; static const uint8_t dq_lookup_tbl[] = { 40, 41, 43, 44, 45, 47, 48, 49, 51, 52, 54, 55, 57, 59, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, }; return dq_lookup_tbl[qidx] << shift; } static const uint8_t qm_tbl_32x16[][2][512] = { { { 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 
123, 121, 120, 119, 123, 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, }, { 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 
52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, }, }, { { 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 44, 42, 41, 41, 42, 42, 44, 
48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201, }, { 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 
48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, }, }, { { 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 
80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, }, { 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32, 36, 
38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, }, }, { { 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 
63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, 
91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173, }, { 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 
95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99, 99, 102, }, }, { { 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 
139, 142, 144, 148, 151, 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160, }, { 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87, 64, 61, 60, 58, 
57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96, 96, 98, }, }, { { 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 
79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, }, { 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 
73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, }, }, { { 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, 
49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133, }, { 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 
61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, }, }, { { 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 
42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, }, { 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 
57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, }, }, { { 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 38, 37, 37, 
37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, }, { 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 
53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69, 71, 71, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 
34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76, 76, 79, }, { 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 
47, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64, 64, 65, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 
33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, }, { 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 
47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, }, { 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 
46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 
31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43, 43, 44, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, }, }, }; static const uint8_t qm_tbl_32x32_t[][2][528] = { { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33, 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39, 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38, 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43, 44, 44, 49, 52, 55, 
59, 65, 67, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80, 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200, 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 
151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242, }, { 32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37, 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48, 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49, 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46, 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106, 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109, 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 
69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118, }, }, { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36, 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38, 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184, 104, 99, 98, 95, 94, 
93, 91, 90, 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219, }, { 32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39, 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42, 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53, 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46, 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 
62, 60, 61, 63, 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, }, }, { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35, 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36, 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38, 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80, 82, 
85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163, 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, }, { 32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39, 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42, 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50, 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, 67, 70, 71, 
75, 77, 78, 82, 84, 85, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108, }, }, { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35, 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56, 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 
65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150, 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, }, { 32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37, 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40, 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 
54, 57, 59, 61, 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, 101, 104, }, }, { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35, 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36, 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34, 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42, 42, 
46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137, 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140, 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, }, { 32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34, 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43, 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 49, 47, 
47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63, 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58, 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, }, }, { { 32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 
33, 33, 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34, 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 
96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, }, { 32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34, 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43, 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 
64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, }, }, { { 32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 
68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134, }, { 32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38, 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57, 
57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54, 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, }, }, { { 32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 
90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, 109, 114, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35, 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 
61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53, 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, }, }, { { 32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50, 
50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52, 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50, 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31, 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 
45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50, 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48, 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, }, }, { { 32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 
39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 
48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, }, }, { { 32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 34, 34, 
34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31, 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 39, 39, 40, 40, 40, 41, 41, 41, 41, 
42, 44, 44, 44, 45, 47, 47, 47, 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 
35, 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, }, }, { { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, }, { 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, }, }, }; const uint8_t *dav2d_qm_tbl[16][2][N_RECT_TX_SIZES]; static uint8_t qm_tbl_4x4[15][2][16]; static uint8_t qm_tbl_4x8[15][2][32]; static uint8_t qm_tbl_4x16[15][2][64]; static uint8_t qm_tbl_8x4[15][2][32]; static uint8_t qm_tbl_8x8[15][2][64]; static uint8_t qm_tbl_8x16[15][2][128]; static uint8_t qm_tbl_8x32[15][2][256]; static uint8_t qm_tbl_16x4[15][2][64]; static uint8_t qm_tbl_16x8[15][2][128]; static uint8_t qm_tbl_16x16[15][2][256]; static uint8_t qm_tbl_16x32[15][2][512]; static uint8_t qm_tbl_32x8[15][2][256]; static uint8_t qm_tbl_32x32[15][2][1024]; static void subsample(uint8_t *dst, const uint8_t *const src, const int h, const int hstep, const int vstep) { for (int y = 0; y < h; y += vstep) for (int x = 0; x < 32; x += hstep) *dst++ = src[y * 32 + x]; } static void transpose(uint8_t *const dst, const uint8_t *const src, const int w, const int h) { for (int y = 0, y_off = 0; y < h; y++, y_off += w) for (int x = 0, x_off = 0; x < w; x++, x_off += h) dst[x_off + y] = src[y_off + x]; } static void untriangle(uint8_t *dst, const uint8_t *src, const int sz) { for (int y = 0; y < sz; y++) { memcpy(dst, src, y + 1); const uint8_t *src_ptr = &src[y]; for (int x = y + 1; x < sz; x++) { src_ptr += x; dst[x] = *src_ptr; } dst += sz; src += y + 1; } } COLD void dav2d_init_qm_tables(void) { // This function is guaranteed to be called only once for (int i = 0; i < 15; i++) for (int j = 0; j < 2; j++) { // note that the w/h in the assignment is inverted, this is on purpose // because we store coefficients transposed 
dav2d_qm_tbl[i][j][RTX_4X8 ] = qm_tbl_8x4[i][j]; dav2d_qm_tbl[i][j][RTX_8X4 ] = qm_tbl_4x8[i][j]; dav2d_qm_tbl[i][j][RTX_4X16 ] = qm_tbl_16x4[i][j]; dav2d_qm_tbl[i][j][RTX_16X4 ] = qm_tbl_4x16[i][j]; dav2d_qm_tbl[i][j][RTX_8X16 ] = qm_tbl_16x8[i][j]; dav2d_qm_tbl[i][j][RTX_16X8 ] = qm_tbl_8x16[i][j]; dav2d_qm_tbl[i][j][RTX_8X32 ] = qm_tbl_32x8[i][j]; dav2d_qm_tbl[i][j][RTX_32X8 ] = qm_tbl_8x32[i][j]; dav2d_qm_tbl[i][j][RTX_16X32] = qm_tbl_32x16[i][j]; dav2d_qm_tbl[i][j][RTX_32X16] = qm_tbl_16x32[i][j]; dav2d_qm_tbl[i][j][ TX_4X4 ] = qm_tbl_4x4[i][j]; dav2d_qm_tbl[i][j][ TX_8X8 ] = qm_tbl_8x8[i][j]; dav2d_qm_tbl[i][j][ TX_16X16] = qm_tbl_16x16[i][j]; dav2d_qm_tbl[i][j][ TX_32X32] = qm_tbl_32x32[i][j]; untriangle(qm_tbl_32x32[i][j], qm_tbl_32x32_t[i][j], 32); subsample(qm_tbl_4x4[i][j], &qm_tbl_32x32[i][j][32*3+3], 32, 8, 8); subsample(qm_tbl_8x4[i][j], &qm_tbl_32x16[i][j][32*1+1], 16, 4, 4); subsample(qm_tbl_8x8[i][j], &qm_tbl_32x32[i][j][32*1+1], 32, 4, 4); subsample(qm_tbl_16x4[i][j], &qm_tbl_32x16[i][j][32*1+0], 16, 2, 4); subsample(qm_tbl_16x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 2, 2); subsample(qm_tbl_16x16[i][j], &qm_tbl_32x32[i][j][32*0+0], 32, 2, 2); subsample(qm_tbl_32x8[i][j], &qm_tbl_32x16[i][j][32*0+0], 16, 1, 2); transpose(qm_tbl_4x8[i][j], qm_tbl_8x4[i][j], 8, 4); transpose(qm_tbl_4x16[i][j], qm_tbl_16x4[i][j], 16, 4); transpose(qm_tbl_8x16[i][j], qm_tbl_16x8[i][j], 16, 8); transpose(qm_tbl_8x32[i][j], qm_tbl_32x8[i][j], 32, 8); transpose(qm_tbl_16x32[i][j], qm_tbl_32x16[i][j], 32, 16); dav2d_qm_tbl[i][j][ TX_64X64] = dav2d_qm_tbl[i][j][ TX_32X32]; dav2d_qm_tbl[i][j][RTX_64X32] = dav2d_qm_tbl[i][j][ TX_32X32]; dav2d_qm_tbl[i][j][RTX_64X16] = dav2d_qm_tbl[i][j][RTX_32X16]; dav2d_qm_tbl[i][j][RTX_32X64] = dav2d_qm_tbl[i][j][ TX_32X32]; dav2d_qm_tbl[i][j][RTX_16X64] = dav2d_qm_tbl[i][j][RTX_16X32]; } // dav2d_qm_tbl[15][*][*] == NULL } 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/quantizer.h000066400000000000000000000032221517466257200230250ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_QUANTIZER_H #define DAV2D_SRC_QUANTIZER_H #include "src/levels.h" EXTERN const uint8_t *dav2d_qm_tbl[16][2][N_RECT_TX_SIZES]; void dav2d_init_qm_tables(void); int dav2d_dq_lookup(int qidx); #endif /* DAV2D_SRC_QUANTIZER_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/recon.h000066400000000000000000000072311517466257200221150ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_RECON_H #define DAV2D_SRC_RECON_H #include "src/debug.h" #include "src/internal.h" #include "src/levels.h" #define decl_recon_b_fn(name) \ int (name)(Dav2dTaskContext *t, DB_ONLY(int depth) \ enum BlockSize bs, const enum BlockSize cbs[2], Av2Block *b) typedef decl_recon_b_fn(*recon_b_fn); #define decl_filter_sbrow_fn(name) \ void (name)(Dav2dFrameContext *f, int sby) typedef decl_filter_sbrow_fn(*filter_sbrow_fn); #define decl_backup_prefilter_data_fn(name) \ void (name)(Dav2dTaskContext *t) typedef decl_backup_prefilter_data_fn(*backup_prefilter_data_fn); #define decl_read_coef_blocks_fn(name) \ int (name)(Dav2dTaskContext *t, DB_ONLY(int depth) \ enum BlockSize lbs, enum BlockSize cbs, Av2Block *b) typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn); #define decl_copy_pal_block_fn(name) \ void (name)(Dav2dTaskContext *t, int bx4, int by4, int bw4, int bh4) typedef decl_copy_pal_block_fn(*copy_pal_block_fn); #define decl_read_pal_plane_fn(name) \ void (name)(DB_ONLY(const int depth) Dav2dTaskContext *t, \ Av2Block *b, int bx4, int by4) typedef decl_read_pal_plane_fn(*read_pal_plane_fn); decl_recon_b_fn(dav2d_recon_b_8bpc); decl_recon_b_fn(dav2d_recon_b_16bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_8bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_16bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_deblock_cols_8bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_deblock_cols_16bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_deblock_rows_8bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_deblock_rows_16bpc); void dav2d_filter_sbrow_cdef_8bpc(Dav2dTaskContext *tc, int sby); void dav2d_filter_sbrow_cdef_16bpc(Dav2dTaskContext *tc, int sby); decl_filter_sbrow_fn(dav2d_filter_sbrow_lr_8bpc); decl_filter_sbrow_fn(dav2d_filter_sbrow_lr_16bpc); decl_backup_prefilter_data_fn(dav2d_backup_prefilter_data_8bpc); decl_backup_prefilter_data_fn(dav2d_backup_prefilter_data_16bpc); decl_read_coef_blocks_fn(dav2d_read_coef_blocks_8bpc); 
decl_read_coef_blocks_fn(dav2d_read_coef_blocks_16bpc); decl_copy_pal_block_fn(dav2d_copy_pal_block_y_8bpc); decl_copy_pal_block_fn(dav2d_copy_pal_block_y_16bpc); decl_read_pal_plane_fn(dav2d_read_pal_plane_8bpc); decl_read_pal_plane_fn(dav2d_read_pal_plane_16bpc); #endif /* DAV2D_SRC_RECON_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/recon_tmpl.c000066400000000000000000006007031517466257200231470ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include "common/attributes.h" #include "common/bitdepth.h" #include "common/dump.h" #include "common/frame.h" #include "common/intops.h" #include "src/cdef_apply.h" #include "src/ctx.h" #include "src/ipred_prepare.h" #include "src/itx_1d.h" #include "src/derivation.h" #include "src/db_apply.h" #include "src/lr_apply.h" #include "src/recon.h" #include "src/scan.h" #include "src/stx_tables.h" #include "src/tables.h" #include "src/warpmv.h" #include "src/wedge.h" static inline unsigned decode_exp_golomb(MsacContext *const s, const int k) { const int length = dav2d_msac_decode_unary_bypass21(s) + k; const int x = (1 << length) + dav2d_msac_decode_bools_bypass(s, length); return x - (1 << k); } static inline int decode_hr(MsacContext *const s, const int hr_avg) { const int m = ulog2(iclip(hr_avg, 2, 64)); // 1..6 const int cmax = imin(m + 4, 6); // 5 or 6 const int q = dav2d_msac_decode_unary_bypass6(s, cmax); const int rem = (q == cmax) ? 
decode_exp_golomb(s, m + 1) : dav2d_msac_decode_bools_bypass(s, m); return rem + (q << m); } static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim, const enum BlockSize bs, const uint8_t *const a, const uint8_t *const l, const int plane, const int u_has_cf, const enum Dav2dPixelLayout layout) { const uint8_t *const b_dim = dav2d_block_dimensions[bs]; if (plane) { const int ss_ver = layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = layout != DAV2D_PIXEL_LAYOUT_I444; const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw || b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh; unsigned ca, cl; #define MERGE_CTX(dir, type, no_val) \ c##dir = ((union alias##type*)dir)->u##type != no_val; \ break switch (t_dim->lw) { /* For some reason the MSVC CRT _wassert() function is not flagged as * __declspec(noreturn), so when using those headers the compiler will * expect execution to continue after an assertion has been triggered * and will therefore complain about the use of uninitialized variables * when compiled in debug mode if we put the default case at the end. */ default: assert(0); /* fall-through */ case TX_4X4: MERGE_CTX(a, 8, 0x40); case TX_8X8: MERGE_CTX(a, 16, 0x4040); case TX_16X16: MERGE_CTX(a, 32, 0x40404040U); case TX_32X32: MERGE_CTX(a, 64, 0x4040404040404040ULL); case TX_64X64: ca = (*(const uint64_t *) a | *(const uint64_t *) &a[8]) != 0x4040404040404040ULL; } switch (t_dim->lh) { default: assert(0); /* fall-through */ case TX_4X4: MERGE_CTX(l, 8, 0x40); case TX_8X8: MERGE_CTX(l, 16, 0x4040); case TX_16X16: MERGE_CTX(l, 32, 0x40404040U); case TX_32X32: MERGE_CTX(l, 64, 0x4040404040404040ULL); case TX_64X64: cl = (((union alias64*)l)->u64 | ((union alias64*)&l[8])->u64) != 0x4040404040404040ULL; } #undef MERGE_CTX const int offset = plane == 1 ? 
6 : 6 * u_has_cf + not_one_blk * 3; return offset + ca + cl; } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) { return 0; } else { unsigned la, ll; #define MERGE_CTX(dir, type, tx) \ if (tx == TX_64X64) { \ uint64_t tmp = ((union alias64*)dir)->u64; \ tmp |= ((union alias64*)&dir[8])->u64; \ l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \ } else \ l##dir = ((union alias##type*)dir)->u##type; \ if (tx == TX_32X32) l##dir |= ((union alias32*)&dir[4])->u32; \ if (tx >= TX_16X16) l##dir |= l##dir >> 16; \ if (tx >= TX_8X8) l##dir |= l##dir >> 8; \ break switch (t_dim->lw) { default: assert(0); /* fall-through */ case TX_4X4: MERGE_CTX(a, 8, TX_4X4); case TX_8X8: MERGE_CTX(a, 16, TX_8X8); case TX_16X16: MERGE_CTX(a, 32, TX_16X16); case TX_32X32: MERGE_CTX(a, 32, TX_32X32); case TX_64X64: MERGE_CTX(a, 32, TX_64X64); } switch (t_dim->lh) { default: assert(0); /* fall-through */ case TX_4X4: MERGE_CTX(l, 8, TX_4X4); case TX_8X8: MERGE_CTX(l, 16, TX_8X8); case TX_16X16: MERGE_CTX(l, 32, TX_16X16); case TX_32X32: MERGE_CTX(l, 32, TX_32X32); case TX_64X64: MERGE_CTX(l, 32, TX_64X64); } #undef MERGE_CTX return (umin(la & 0x3F, 4) + umin(ll & 0x3F, 4) + 3) >> 1; } } static inline unsigned get_dc_sign_ctx(const TxfmInfo *const t_dim, const uint8_t *const a, const uint8_t *const l) { uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL; #if ARCH_X86_64 && defined(__GNUC__) /* Coerce compilers into producing better code. For some reason * every x86-64 compiler is awful at handling 64-bit constants. 
*/ __asm__("" : "+r"(mask), "+r"(mul)); #endif uint64_t t = 0; const uint8_t *edge = a; for (int dir = 0, len = t_dim->lw; dir < 2; dir++, edge = l, len = t_dim->lh) { switch(len) { default: assert(0); /* fall-through */ case TX_4X4: t += *edge >> 6; break; case TX_8X8: t += (((union alias16*)edge)->u16 & (uint32_t) mask) >> 6; break; case TX_16X16: t += (((union alias32*)edge)->u32 & (uint32_t) mask) >> 6; break; case TX_64X64: t += (((union alias64*)&edge[8])->u64 & mask) >> 6; // fall-through case TX_32X32: t += (((union alias64*)edge)->u64 & mask) >> 6; break; } } t *= mul; const int s = (int) (t >> 56) - t_dim->w - t_dim->h; return (s != 0) + (s > 0); } static inline unsigned get_lo_ctx(const int8_t *const levels, const enum TxClass tx_class, unsigned *const hi_mag_ptr, const unsigned xy, const int plane, const ptrdiff_t stride) { const int chroma = !!plane; #define add(v) do { \ const unsigned val = v; \ lo_mag += imin(val, lim); \ hi_mag += imin(val, 5); \ } while (0) unsigned lo_freq = xy < (chroma ? 1 : tx_class == TX_CLASS_2D ? 4 : 2); unsigned lim = lo_freq ? 
5 : 3; unsigned lo_mag = 0, hi_mag = 0; add(levels[0 * stride + 1]); add(levels[1 * stride + 0]); // for the initial token: // br(l) = min(R, l) + min(B, l) - where l is 3 [hi-freq] or 5 [lo-freq] // brhvc(l) = (br(l) + 1) >> 1 // br2dc(l) = (br(l) + min(RB, l) + 1) >> 1 // br2dl(l) = (br(l) + min(RB, l) + min(B2, l) + min(R2, l) + 1) >> 1 // brvl(l1, l2) = (br(l1) + min(B2, l2) + min(B3, l2) + min(B4, l2) + 1) >> 1 // brhl(l1, l2) = (br(l1) + min(R2, l2) + min(R3, l2) + min(R4, l2) + 1) >> 1 // luma, hf: 2d: x + y < 6: min(br2dl(3), 4) + 0 // x + y < 8: min(br2dl(3), 4) + 5 // else: min(br2dl(3), 4) + 10 // h: min(brhl(3, 3), 4) + 15 // v: min(brvl(3, 3), 4) + 15 // luma, lf: 2d: is_dc: min(br2dl(5), 8) + 0 // x + y < 2: min(br2dl(5), 6) + 9 // else: min(br2dl(5), 4) + 16 // h: x == 0: min(brhl(5, 3), 6) + 21 // else: min(brhl(5, 3), 4) + 28 // v: y == 0: min(brvl(5, 3), 6) + 21 // else: min(brvl(5, 3), 4) + 28 // chroma, hf: 2d: min(br2dc(3), 3) + (plane == u ? 0 : 4) // hv: brhvc(3) + 8 // chroma, lf: 2d: min(br2dc(5), 3) + (plane == u ? 0 : 4) // hv: min(brhvc(5), 3) + 8 unsigned offset; if (tx_class == TX_CLASS_2D) { add(levels[1 * stride + 1]); if (!chroma) { lo_mag += imin(levels[0 * stride + 2], lim) + imin(levels[2 * stride + 0], lim); if (lo_freq) { offset = !xy ? 0 : xy < 2 ? 9 : 16; lim = !xy ? 8 : xy < 2 ? 6 : 4; } else { offset = xy < 6 ? 0 : xy < 8 ? 5 : 10; lim = 4; } } else { lim = 3; offset = plane == 1 ? 0 : 4; } } else { if (!chroma) { lim = 3; add(levels[0 * stride + 2]); lo_mag += imin(levels[0 * stride + 3], 3) + imin(levels[0 * stride + 4], 3); if (lo_freq) { offset = !xy ? 21 : 28; lim = !xy ? 6 : 4; } else { offset = 15; lim = 4; } } else { offset = 8; lim = 3; } } // for the base_range component: // br = min(R, 5) + min(B, 5) // br2d = (br + min(RB, 5) + 1) >> 1 // brh = (br + min(R2, 5) + 1) >> 1 // brv = (br + min(B2, 5) + 1) >> 1 // luma, hf: 2d: min(br2d, 6) // h: min(brh, 6) // v: min(brv, 6) // luma, lf: 2d: min(br2d, 6) + (is_dc ? 
0 : 7) // h: min(brh, 6) + 7 // v: min(brv, 6) + 7 // chroma, hf: 2d: min(br2d, 3) // hv: min((br + 1) >> 1, 3) // chroma, lf: N/A *hi_mag_ptr = (!chroma && lo_freq & (xy > 0 || tx_class != TX_CLASS_2D) ? 7 : 0) + umin((hi_mag + 1) >> 1, chroma ? 3 : 6); return offset + umin((lo_mag + 1) >> 1, lim); } static inline unsigned get_lo_ctx_idtx(const int8_t *const levels, unsigned *const hi_mag_ptr, const ptrdiff_t stride) { #define lim 3 unsigned lo_mag = 0, hi_mag = 0; add(levels[ 0 * stride - 1]); add(levels[-1 * stride + 0]); #undef lim #undef add *hi_mag_ptr = imin(hi_mag, 6); return lo_mag; } static inline unsigned get_sign_ctx_idtx(const int8_t *const levels, const ptrdiff_t stride) { const int sum = levels[ 0 * stride - 1] + levels[-1 * stride + 0] + levels[-1 * stride - 1]; const int offset = *levels > 3 ? 2 : 0; switch (sum) { case -3: return offset + 6; case -2: case -1: return offset + 2; default: assert(0); case 0: return 0; case 1: case 2: return offset + 1; case 3: return offset + 5; } } static inline int tcq_next_state(const int state, const int abs_level) { // bit0-1 are shifted in from bit1-2 // bit2 is set based on bit0 ^ bit2 & (abs_level & 1) // bit31 is to always set it to 0 if tcq is disabled return (((state & 0x4) ^ (((abs_level & 1) ^ (state & 0x1)) << 2)) | ((state & 0x6) >> 1) | -0x80000000) & (state >> 31); } static int decode_coefs(Dav2dTaskContext *const t, DB_ONLY(const int depth) uint8_t *const a, uint8_t *const l, const enum RectTxfmSize tx, const enum BlockSize bs, const int sdp_active, const Av2Block *const b, const int plane, coef *cf, enum TxfmType *const txtp, uint8_t *res_ctx) { Dav2dTileState *const ts = t->ts; const int chroma = !!plane; // FIXME perhaps make this an inlined function arg? 
const int intra = b->intra && (sdp_active || !b->intrabc); const Dav2dFrameContext *const f = t->f; const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id]; const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; #if DEBUG_BLOCK_INFO const int dbg = BLOCK_TO_DEBUG && plane > -1 && 1; #define DEBUG_CF_printf(...) \ if (dbg) printf(__VA_ARGS__) #else #define DEBUG_CF_printf(...) #endif DEBUG_CF_printf("%*sdecode_cf[y=%d,x=%d,pl=%d,tx=%dx%d]: r=%d\n", depth - 1, "", t->by, t->bx, plane, t_dim->w * 4, t_dim->h * 4, ts->msac.rng); // does this block have any non-zero coefficients const int sctx = (b->fsc && !chroma && f->seq_hdr->fsc) ? 9 : get_skip_ctx(t_dim, bs, a, l, plane, t->u_has_cf, f->cur.p.p.layout); const int all_skip = dav2d_msac_decode_bool_adapt(&ts->msac, (plane == 2 ? ts->cdf.coef.skip_v : ts->cdf.coef.skip[!intra || b->fsc][t_dim->ctx])[sctx]); DEBUG_CF_printf("%*sPost-all_zero[ctx=%d|%d|%d,%d]: r=%d\n", depth, "", plane == 2 ? -1 : (!intra || b->fsc), t_dim->ctx, sctx, all_skip, ts->msac.rng); if (all_skip) { *res_ctx = 0x40; *txtp = (!chroma && b->fsc) ? IDTX : lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */ return -1; } // find end-of-block (eob) int eob; const int slw = imin(t_dim->lw, TX_32X32), slh = imin(t_dim->lh, TX_32X32); const int tx2dszctx = slw + slh; const int eob_ctx = chroma ? 
2 : !intra; switch (tx2dszctx) { #define case_sz(sz, bin, bits, eb) \ case sz: { \ uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[eob_ctx]; \ eob = dav2d_msac_decode_symbol_adapt8(&ts->msac, eob_bin_cdf, bits); \ if (eb && eob == 7) { \ eob += dav2d_msac_decode_bools_bypass(&ts->msac, eb); \ if (bin == 512 && eob == 10) return INT_MIN; \ } \ break; \ } case_sz(0, 16, 4, 0); case_sz(1, 32, 5, 0); case_sz(2, 64, 6, 0); case_sz(3, 128, 7, 0); case_sz(4, 256, 7, 1); case_sz(5, 512, 7, 2); case_sz(6, 1024, 7, 2); #undef case_sz } DEBUG_CF_printf("%*sPost-eob_bin_%d[ctx=%d,%d]: r=%d\n", depth, "", 16 << tx2dszctx, eob_ctx, eob, ts->msac.rng); if (eob > 1) { const int eob_hi_bit = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.eob_hi_bit); const int eob_bin = eob - 2; eob = eob_hi_bit | 2; if (eob_bin) eob = (eob << eob_bin) | dav2d_msac_decode_bools_bypass(&ts->msac, eob_bin); DEBUG_CF_printf("%*sPost-eob[%d]: r=%d\n", depth, "", eob, ts->msac.rng); } assert(eob >= 0 && eob < (16 << tx2dszctx)); // transform type (chroma: derived, luma: explicitly coded) static const uint8_t txtp_long_tbl[2][2][4] = { { { V_DCT, V_ADST, V_FLIPADST, IDTX }, { H_DCT, H_ADST, H_FLIPADST, IDTX }, }, { { DCT_DCT, ADST_DCT, FLIPADST_DCT, H_DCT }, { DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT }, }, }; if (lossless) { // if luma but inter, this can be coded if (chroma) { if (intra) { const int y_fsc = !sdp_active ? b->fsc : t->luma_fsc_map[(t->cby & 15) * 16 + (t->cbx & 15)]; *txtp = y_fsc ? IDTX : WHT_WHT; } else { assert(*txtp == WHT_WHT || *txtp == IDTX || *txtp == IDTX_INV); *txtp &= 0xe7; // IDTX_INV -> IDTX } } else if (intra) { *txtp = b->fsc ? IDTX : WHT_WHT; } else if (t_dim->max == TX_4X4) { *txtp = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_lossless) ? 
IDTX : WHT_WHT; } else { *txtp = IDTX; } } else if (chroma) { if (f->seq_hdr->chroma_dctonly) { *txtp = DCT_DCT; } else { // inferred from either the luma txtp (inter) or a LUT (intra) if (intra) *txtp = dav2d_txtp_from_uvmode[b->uv_mode]; if ((t_dim->w >= 8 && *txtp & 0x02 /* horizontal is (flip)adst */) || (t_dim->h >= 8 && *txtp & 0x40 /* vertical is (flip)adst */) || (tx == (int) TX_16X16 && ((*txtp & 0x47) == 0x41 /* (flip)adst ver, identity hor */ || (*txtp & 0xe2) == 0x22 /* identity ver, (flip)adst hor */))) { *txtp = DCT_DCT; } else if (*txtp == IDTX_INV) *txtp = IDTX; } } else if (intra) { if (t_dim->sub == TX_32X32 /* 64x64, 64x32 or 32x64 */) { *txtp = DCT_DCT; } else if (b->fsc) { *txtp = IDTX; } else if (!eob /* dc-only */ || tx == (enum RectTxfmSize)TX_32X32) { *txtp = DCT_DCT; } else if (t_dim->max >= TX_32X32 /* {64,32}x{16,8,4} */) { // long64/32 const int long_dct = t_dim->max == TX_64X64 || dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_long32_dct[0]); const int short_idx = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.txtp_intra_short_1d[t_dim->min], 3); *txtp = txtp_long_tbl[long_dct][t_dim->w < t_dim->h][short_idx]; } else if (f->frame_hdr->reduced_txtp_set == 2) { // ext_tx_set_dct_idtx *txtp = DCT_DCT; } else { // ext_new_tx_set const int sz_ctx = (t_dim->lw + t_dim->lh) >> 1; assert(sz_ctx < 3); const int tx_idx = f->frame_hdr->reduced_txtp_set ? 
dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_ext_reduced[t_dim->min]) : dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.txtp_ext[t_dim->min], 6); static const uint8_t /*enum TxfmType*/ md_idx2type[][13][7] = { { { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_ADST, H_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, V_DCT, V_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_ADST, H_DCT, H_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_ADST, H_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, V_ADST, V_FLIPADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_ADST, H_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_ADST, H_DCT, H_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, V_DCT, V_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_ADST, V_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_ADST, H_ADST }, { DCT_DCT, ADST_ADST, DCT_ADST, V_DCT, H_DCT, V_ADST, H_ADST }, }, { { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, ADST_FLIPADST, FLIPADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_ADST, FLIPADST_DCT, ADST_FLIPADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, FLIPADST_ADST, ADST_FLIPADST }, { DCT_DCT, 
ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, V_DCT, H_DCT, H_ADST }, }, { { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, FLIPADST_ADST, ADST_FLIPADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, FLIPADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, FLIPADST_DCT, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, DCT_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, V_DCT, H_DCT, V_ADST }, }, }; *txtp = md_idx2type[sz_ctx][b->y_mode][tx_idx]; } } else { if (t_dim->sub == TX_32X32 /* 64x64, 64x32 or 32x64 */) { *txtp = DCT_DCT; } else { const int y = eob >> (2 + slw), x = eob & ((4 << slw) - 1); const int xy = x + y; const int ctx = xy < 2 ? 1 : xy > 4 * (imin(8, t_dim->w) + imin(8, t_dim->h)) - 4 ? 
2 : 0; if (tx == (enum RectTxfmSize)TX_32X32) { *txtp = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_inter_dct_idtx[ctx][TX_32X32]) ? DCT_DCT : IDTX; } else if (t_dim->max >= TX_32X32 /* {64,32}x{16,8,4} */) { // long64/32 const int long_dct = t_dim->max == TX_64X64 || dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_long32_dct[1]); const int short_idx = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.txtp_inter_short_1d[ctx] [t_dim->min], 3); *txtp = txtp_long_tbl[long_dct][t_dim->w < t_dim->h][short_idx]; } else if (f->frame_hdr->reduced_txtp_set == 1 || f->frame_hdr->reduced_txtp_set == 2) { *txtp = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_inter_dct_idtx[ctx][t_dim->min]) ? DCT_DCT : IDTX; } else if (f->frame_hdr->reduced_txtp_set == 3) { const int tx_idx = dav2d_msac_decode_symbol_adapt4( &ts->msac, ts->cdf.m.txtp_inter_dct_idtx_iddct[ctx][t_dim->min], 3); static const uint8_t txtp_dct_idtx_iddct_tbl[4] = { DCT_DCT, V_DCT, H_DCT, IDTX }; *txtp = txtp_dct_idtx_iddct_tbl[tx_idx]; } else { const int setidx = tx == (enum RectTxfmSize)TX_16X16; const int set = dav2d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.txtp_inter_tx_set[setidx][ctx] [t_dim->min]); if (!set) { *txtp = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.txtp_inter_set0[setidx][ctx], 7); } else if (setidx) { *txtp = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.txtp_inter_set2[ctx], 3) + 8; } else { *txtp = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.txtp_inter_set1[ctx], 7) + 8; } static const uint8_t txtp_inv_tbl[][16] = { { IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, { IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST }, }; *txtp = txtp_inv_tbl[setidx][*txtp]; } } } DEBUG_CF_printf("%*sPost-txtp[%s/%s]: r=%d\n", depth, "", 
dav2d_tx1d_names[*txtp & 7], dav2d_tx1d_names[*txtp >> 5], ts->msac.rng); const enum TxClass tx_class = (*txtp >> 3) & 0x3; // secondary transform int stx_type = 0; if (f->seq_hdr->ist[!intra] && !chroma) { if (intra) { if (eob >= 1 && b->y_mode != PAETH_PRED && (*txtp == DCT_DCT || *txtp == ADST_ADST)) { int lim; if (tx == (enum RectTxfmSize)TX_8X8 && *txtp == DCT_DCT) lim = 20; else if (t_dim->min >= TX_8X8) lim = *txtp == DCT_DCT ? 32 : 20; else lim = 8; stx_type = eob < lim; } } else { stx_type = t_dim->min >= TX_16X16 && *txtp == DCT_DCT && eob >= 3 && eob < 32; } if (stx_type) { stx_type = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.stx[!intra][t_dim->min], 3); int stx_set = 0; if (stx_type && intra) { if (t_dim->min >= TX_8X8 && *txtp == ADST_ADST) { static const uint8_t inv_most_probable_stx_mapping_adst[][7] = { { 3, 1, 0, 2 }, // DC_PRED { 1, 3, 0, 2 }, // V_PRED { 1, 3, 0, 2 }, // H_PRED { 1, 3, 0, 2 }, // D45_PRED { 0, 2, 3, 1 }, // D135_PRED { 2, 1, 0, 3 }, // D113_PRED { 2, 1, 0, 3 }, // D157_PRED { 1, 0, 3, 2 }, // D203_PRED { 1, 0, 3, 2 }, // D67_PRED { 3, 1, 0, 2 }, // SMOOTH_PRED { 1, 3, 0, 2 }, // SMOOTH_V_PRED { 1, 3, 0, 2 }, // SMOOTH_H_PRED }; stx_set = dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.stx_set_adst, 3); stx_set = inv_most_probable_stx_mapping_adst[b->y_mode][stx_set]; } else { static const uint8_t inv_most_probable_stx_mapping[][7] = { { 6, 1, 0, 5, 4, 3, 2 }, // DC_PRED { 1, 6, 0, 4, 2, 5, 3 }, // V_PRED { 1, 6, 0, 4, 2, 5, 3 }, // H_PRED { 2, 6, 0, 5, 1, 4, 3 }, // D45_PRED { 3, 4, 6, 1, 0, 2, 5 }, // D135_PRED { 4, 1, 3, 6, 0, 5, 2 }, // D113_PRED { 4, 1, 3, 6, 0, 5, 2 }, // D157_PRED { 5, 0, 6, 2, 1, 4, 3 }, // D203_PRED { 5, 0, 6, 2, 1, 4, 3 }, // D67_PRED { 6, 1, 0, 5, 4, 3, 2 }, // SMOOTH_PRED { 1, 6, 0, 4, 2, 5, 3 }, // SMOOTH_V_PRED { 1, 6, 0, 4, 2, 5, 3 }, // SMOOTH_H_PRED }; stx_set = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.stx_set, 6); stx_set = 
inv_most_probable_stx_mapping[b->y_mode][stx_set]; } stx_set += 7 * (*txtp == ADST_ADST); *txtp |= stx_set << 10; } *txtp |= stx_type << 8; DEBUG_CF_printf("%*sPost-stx[type=%d,set=%d]: r=%d\n", depth, "", stx_type, stx_set, ts->msac.rng); } } else if (f->seq_hdr->cctx && plane == 1 && eob >= intra && !lossless && (f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420 || t_dim->max < 8)) { const int cctx = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.cctx, 6); DEBUG_CF_printf("%*sPost-cctx[%d]: r=%d\n", depth, "", cctx, ts->msac.rng); *txtp |= cctx << 8; } // base tokens unsigned cul_level = 0; int dc_tok; const int tcq_enabled = !chroma && f->frame_hdr->tcq && tx_class == TX_CLASS_2D && !lossless; int hr_avg = 0, tcq_state = tcq_enabled * -0x80000000; const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL; int dq_shift = tcq_enabled + 3 + imax(0, t_dim->ctx - 2); const uint32_t *const dq_tbl = ts->dq[b->seg_id][plane]; const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.p.bpc)); unsigned dc_sign_level = 1 << 6; if (f->seq_hdr->fsc && (!intra || b->fsc) && *txtp == IDTX && !chroma) { assert(!stx_type); *txtp = IDTX_INV; int8_t *const levels = t->scratch.levels; const ptrdiff_t stride = 1 + (4 << slh); memset(levels, 0, stride * ((4 << slw) + 1)); const uint16_t *scan = dav2d_scans[tx]; const int sz_ctx = imin(t_dim->ctx, 2); const int sz = (16 << tx2dszctx) - 1; const int bob = sz - eob; unsigned ctx = (bob > 2 << tx2dszctx) + (bob > 4 << tx2dszctx); uint16_t (*hi_cdf)[4] = ts->cdf.coef.br_y_tok_idtx[sz_ctx]; int tok = 1 + dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.coef.bob_base_y_tok[sz_ctx][ctx], 2); if (tok == 3) { tok += dav2d_msac_decode_symbol_adapt4(&ts->msac, hi_cdf[0], 3); } const unsigned shift = slh + 2; const unsigned mask = (4 << slh) - 1; int rc = scan[bob]; int x = rc >> shift, y = rc & mask; cf[rc] = levels[(1 + x) * stride + (y + 1)] = tok; DEBUG_CF_printf("%*sPost-bob_tok[pos=%d,ctx=%d|%d|%d,plane=%s,%d]: r=%d\n", 
depth, "", bob, sz_ctx, ctx, tok < 3 ? -1 : 0, chroma ? "uv" : "y", tok, ts->msac.rng); uint16_t (*lo_cdf)[4] = ts->cdf.coef.base_y_tok_idtx[sz_ctx]; for (int i = bob + 1; i <= sz; i++) { rc = scan[i]; x = rc >> shift; y = rc & mask; int8_t *const level = &levels[(1 + x) * stride + (1 + y)]; unsigned hr_ctx; ctx = get_lo_ctx_idtx(level, &hr_ctx, stride); int tok = dav2d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); if (tok == 3) { tok += dav2d_msac_decode_symbol_adapt4(&ts->msac, hi_cdf[hr_ctx], 3); } cf[rc] = *level = tok; DEBUG_CF_printf("%*sPost-tok[pos=%d,ctx=%d|%d|%d,plane=%s,%d]: r=%d\n", depth, "", i, sz_ctx, ctx, tok < 3 ? -1 : hr_ctx, chroma ? "uv" : "y", tok, ts->msac.rng); } int hr_avg = 0; uint16_t (*sign_cdf)[2] = ts->cdf.coef.sign_idtx[sz_ctx]; const unsigned dq = dq_tbl[1]; // FIXME qm dq_shift -= tcq_enabled; for (int i = bob; i <= sz; i++) { rc = scan[i]; int tok = cf[rc]; if (!tok) continue; x = rc >> shift; y = rc & mask; int8_t *const level = &levels[(1 + x) * stride + (1 + y)]; ctx = get_sign_ctx_idtx(level, stride); int sign = dav2d_msac_decode_bool_adapt(&ts->msac, sign_cdf[ctx]); if (!i) dc_sign_level = (sign - 1) & (2 << 6); DEBUG_CF_printf("%*sPost-sign[pos=%d,ctx=%d|%d,plane=%s,%d]: r=%d\n", depth, "", i, sz_ctx, ctx, chroma ? "uv" : "y", sign, ts->msac.rng); *level = 1 - 2 * sign; // residual int val; if (tok >= 6) { const int hr = decode_hr(&ts->msac, hr_avg); tok += hr; DEBUG_CF_printf("%*sPost-residual[pos=%d,%d->%d]: r=%d\n", depth, "", i, hr, tok, ts->msac.rng); hr_avg = (hr_avg + hr) >> 1; tok &= 0xfffff; val = (((tok * dq) & 0xffffff) + 4) >> dq_shift; val = umin(val, cf_max + sign); } else { val = (tok * dq + 4) >> dq_shift; } cul_level += tok; cf[rc] = sign ? 
-val : val; } goto end; } else if (eob) { int8_t *const levels = t->scratch.levels; #define DECODE_COEFS_CLASS(tx_class, xy, is_stx) \ int lim, tok; \ uint16_t (*hi_cdf)[4]; \ union { \ uint16_t (*hf)[4], (*lf)[8]; \ } lo_cdf;\ unsigned ctx = 1 + (eob > 2 << tx2dszctx) + (eob > 4 << tx2dszctx); \ if (eob >= hi_to_low_tx) { \ uint16_t (*eob_cdf)[4]; \ lim = 3; \ if (!chroma) { \ eob_cdf = ts->cdf.coef.eob_base_y_tok_hf[t_dim->ctx]; \ hi_cdf = ts->cdf.coef.br_y_tok_hf; \ lo_cdf.hf = ts->cdf.coef.base_y_tok_hf[t_dim->ctx][0]; \ } else { \ eob_cdf = ts->cdf.coef.eob_base_uv_tok_hf; \ hi_cdf = ts->cdf.coef.br_uv_tok_hf; \ lo_cdf.hf = ts->cdf.coef.base_uv_tok_hf; \ } \ tok = 1 + dav2d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2); \ } else { \ uint16_t (*eob_cdf)[8]; \ lim = 5; \ if (!chroma) { \ eob_cdf = ts->cdf.coef.eob_base_y_tok_lf[t_dim->ctx]; \ hi_cdf = ts->cdf.coef.br_y_tok_lf; \ lo_cdf.lf = ts->cdf.coef.base_y_tok_lf[t_dim->ctx][0]; \ } else { \ eob_cdf = ts->cdf.coef.eob_base_uv_tok_lf; \ hi_cdf = NULL; \ lo_cdf.lf = ts->cdf.coef.base_uv_tok_lf; \ } \ tok = 1 + dav2d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 4); \ } \ unsigned rc; \ unsigned x, y; \ int8_t *level; \ if (tx_class == TX_CLASS_2D) \ rc = scan[eob], x = rc >> shift, y = rc & mask; \ else if (tx_class == TX_CLASS_H) \ /* Transposing reduces the stride and padding requirements */ \ x = eob & mask, y = eob >> shift, rc = eob; \ else /* tx_class == TX_CLASS_V */ \ x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \ if (tok == lim && hi_cdf) { \ tok += dav2d_msac_decode_symbol_adapt4(&ts->msac, \ hi_cdf[lim == 5 ? 7 : 0], 3); \ } \ DEBUG_CF_printf("%*sPost-eob_tok[pos=%d,ctx=%d|%d|%d,freq=%s,plane=%s,%d]: r=%d\n", \ depth, "", eob, t_dim->ctx, ctx, \ tok < lim || !hi_cdf ? -1 : lim == 5 ? 7 : 0, \ lim == 5 ? "lo" : "hi", chroma ? "uv" : "y", \ tok, ts->msac.rng); \ tcq_state = tcq_next_state(tcq_state, tok); \ cf[is_stx ? 
(unsigned)eob : rc] = tok; \ if (tx_class == TX_CLASS_2D) \ level = levels + rc; \ else \ level = levels + x * stride + y; \ *level = tok; \ for (int i = eob - 1;; i--) { /* ac */ \ if (i == hi_to_low_tx - 1) { \ lim = 5; \ if (!chroma) { \ hi_cdf = ts->cdf.coef.br_y_tok_lf; \ lo_cdf.lf = ts->cdf.coef.base_y_tok_lf[t_dim->ctx][0]; \ } else { \ hi_cdf = NULL; \ lo_cdf.lf = ts->cdf.coef.base_uv_tok_lf; \ } \ } \ if (!i) break; \ if (tx_class == TX_CLASS_2D) \ rc = scan[i], x = rc >> shift, y = rc & mask; \ else if (tx_class == TX_CLASS_H) \ x = i & mask, y = i >> shift, rc = i; \ else /* tx_class == TX_CLASS_V */ \ x = i & mask, y = i >> shift, rc = (x << shift2) | y; \ assert(x < 32 && y < 32); \ if (tx_class == TX_CLASS_2D) \ level = levels + rc; \ else \ level = levels + x * stride + y; \ unsigned hr_ctx; \ ctx = get_lo_ctx(level, tx_class, &hr_ctx, xy, plane, stride); \ const int tcq = (tcq_state & 2) >> 1; \ const int lo_cdf_idx = ctx * (2 - chroma) + tcq; \ if (lim == 5) \ tok = dav2d_msac_decode_symbol_adapt8(&ts->msac, lo_cdf.lf[lo_cdf_idx], 5); \ else \ tok = dav2d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf.hf[lo_cdf_idx], 3); \ if (tok == lim && hi_cdf) \ tok += dav2d_msac_decode_symbol_adapt4(&ts->msac, hi_cdf[hr_ctx], 3); \ DEBUG_CF_printf("%*sPost-tok[pos=%d,ctx=%d|%d|%d|%d,freq=%s,plane=%s,%d]: r=%d\n", \ depth, "", i, t_dim->ctx, ctx, tcq, \ tok < lim || !hi_cdf ? -1 : hr_ctx, \ lim == 5 ? "lo" : "hi", \ chroma ? "uv" : "y", tok, ts->msac.rng); \ tcq_state = tcq_next_state(tcq_state, tok); \ *level = tok; \ cf[is_stx ? 
(unsigned)i : rc] = tok; \ } \ /* dc */ \ unsigned hr_ctx; \ ctx = get_lo_ctx(levels, tx_class, &hr_ctx, 0, plane, stride); \ const int tcq = (tcq_state & 2) >> 1; \ const int lo_cdf_idx = ctx * (2 - chroma) + tcq; \ if (lim == 5) \ dc_tok = dav2d_msac_decode_symbol_adapt8(&ts->msac, lo_cdf.lf[lo_cdf_idx], 5); \ else \ dc_tok = dav2d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf.hf[lo_cdf_idx], 3); \ if (dc_tok == lim && hi_cdf) { \ dc_tok += dav2d_msac_decode_symbol_adapt4(&ts->msac, hi_cdf[hr_ctx], 3); \ } \ DEBUG_CF_printf("%*sPost-dc_tok[pos=0,ctx=%d|%d|%d|%d,freq=%s,plane=%s,%d]: r=%d\n", \ depth, "", t_dim->ctx, ctx, tcq, \ dc_tok < lim || !hi_cdf ? -1 : hr_ctx, \ lim == 5 ? "lo" : "hi", chroma ? "uv" : "y", \ dc_tok, ts->msac.rng); \ tcq_state = tcq_enabled * -0x80000000; \ const unsigned ac_dq = dq_tbl[1]; /* FIXME qm */ \ for (int i = eob; i > 0; i--) { \ if (tx_class == TX_CLASS_2D) \ rc = is_stx ? i : scan[i]; \ else if (tx_class == TX_CLASS_H) \ y = i >> shift, rc = i; \ else /* tx_class == TX_CLASS_V */ \ x = i & mask, y = i >> shift, rc = (x << shift2) | y; \ int tok = cf[rc]; \ if (!tok) { \ tcq_state = tcq_next_state(tcq_state, 0); \ continue; \ } \ int sign; \ if (tx_class == TX_CLASS_2D || y > 0 || chroma) { \ sign = dav2d_msac_decode_bool_bypass(&ts->msac); \ DEBUG_CF_printf("%*sPost-%ssign[pos=%d,%d]: r=%d\n", \ depth, "", (tx_class != TX_CLASS_2D && \ !y) ? "dc_" : "", i, sign, \ ts->msac.rng); \ } else { \ sign = dav2d_msac_decode_bool_adapt(&ts->msac, \ ts->cdf.coef.dc_sign[chroma][0][0]); \ DEBUG_CF_printf("%*sPost-dc_sign[pos=%d,ctx=0,%d]: r=%d\n", \ depth, "", i, sign, ts->msac.rng); \ } \ const int tcq = (tcq_state & 2) >> 1; \ tcq_state = tcq_next_state(tcq_state, tok); \ /* residual */ \ const int max_br = i < hi_to_low_tx ? (chroma ? 
5 : 8) : 6; \ int ac_val; \ if (tok >= max_br - tcq_enabled) { \ const int hr = decode_hr(&ts->msac, hr_avg); \ tok += hr << tcq_enabled; \ DEBUG_CF_printf("%*sPost-residual[pos=%d,%d->%d]: r=%d\n", \ depth, "", i, hr, tok, ts->msac.rng); \ hr_avg = (hr_avg + hr) >> 1; \ tok &= 0xfffff; \ ac_val = (tok << tcq_enabled) - tcq; \ ac_val = (((ac_val * ac_dq) & 0xffffff) + 4) >> dq_shift; \ ac_val = umin(ac_val, cf_max + sign); \ } else { \ ac_val = (tok << tcq_enabled) - tcq; \ ac_val = (ac_val * ac_dq + 4) >> dq_shift; \ } \ cul_level += tok; \ cf[rc] = sign ? -ac_val : ac_val; \ } \ break const uint16_t *scan; switch (tx_class) { case TX_CLASS_2D: { scan = dav2d_scans[tx]; const ptrdiff_t stride = 4 << slh; const unsigned shift = slh + 2, shift2 = 0; const unsigned mask = (4 << slh) - 1; memset(levels, 0, stride * ((4 << slw) + 2)); const int hi_to_low_tx = chroma ? 1 : 10; if (stx_type) { DECODE_COEFS_CLASS(TX_CLASS_2D, x + y, 1); } else { DECODE_COEFS_CLASS(TX_CLASS_2D, x + y, 0); } } case TX_CLASS_H: { const ptrdiff_t stride = 32; const unsigned shift = slh + 2, shift2 = 0; const unsigned mask = (4 << slh) - 1; memset(levels, 0, stride * ((4 << slh) + 2)); const int hi_to_low_tx = (8 << slh) >> chroma; DECODE_COEFS_CLASS(TX_CLASS_H, y, 0); } case TX_CLASS_V: { const ptrdiff_t stride = 32; const unsigned shift = slw + 2, shift2 = slh + 2; const unsigned mask = (4 << slw) - 1; memset(levels, 0, stride * ((4 << slw) + 2)); const int hi_to_low_tx = (8 << slw) >> chroma; DECODE_COEFS_CLASS(TX_CLASS_V, y, 0); } #undef DECODE_COEFS_CLASS default: assert(0); } } else if (chroma) { // dc-only dc_tok = 1 + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.coef.eob_base_uv_tok_lf[0], 4); DEBUG_CF_printf("%*sPost-eob_tok[pos=%d,ctx=%d|0|-1,freq=lo,plane=uv,%d]: r=%d\n", depth, "", eob, t_dim->ctx, dc_tok, ts->msac.rng); } else { dc_tok = 1 + dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.coef.eob_base_y_tok_lf[t_dim->ctx][0], 4); if (dc_tok == 5) { dc_tok += 
dav2d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.coef.br_y_tok_lf[tx_class == TX_CLASS_2D ? 0 : 7], 3); } DEBUG_CF_printf("%*sPost-eob_tok[pos=%d,ctx=%d|0|%d,freq=lo,plane=y,%d]: r=%d\n", depth, "", eob, t_dim->ctx, dc_tok < 5 ? -1 : tx_class == TX_CLASS_2D ? 0 : 7, dc_tok, ts->msac.rng); } if (!dc_tok) goto end; // dc sign & residual int dc_sign; if (chroma) { dc_sign = dav2d_msac_decode_bool_bypass(&ts->msac); DEBUG_CF_printf("%*sPost-dc_sign[pos=0,%d]: r=%d\n", depth, "", dc_sign, ts->msac.rng); } else { const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l); uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][0][dc_sign_ctx]; dc_sign = dav2d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); DEBUG_CF_printf("%*sPost-dc_sign[pos=0,ctx=%d,%d]: r=%d\n", depth, "", dc_sign_ctx, dc_sign, ts->msac.rng); } int dc_dq = dq_tbl[0]; dc_sign_level = (dc_sign - 1) & (2 << 6); if (qm_tbl) { // FIXME dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5; if (dc_tok == 15) { dc_tok = 0; //read_golomb(&ts->msac) + 15; DEBUG_CF_printf("%*sPost-dc_residual[%d->%d]: r=%d\n", depth, "", dc_tok - 15, dc_tok, ts->msac.rng); dc_tok &= 0xfffff; dc_dq = (dc_dq * dc_tok) & 0xffffff; } else { dc_dq *= dc_tok; assert(dc_dq <= 0xffffff); } cul_level = dc_tok; dc_dq >>= dq_shift; dc_dq = umin(dc_dq, cf_max + dc_sign); cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq); } else { // non-qmatrix is the common case and allows for additional optimizations const int max_br = chroma ? 
5 : 8; int dc_val; int tcq = (tcq_state & 2) >> 1; if (dc_tok >= max_br - tcq_enabled) { const int hr = decode_hr(&ts->msac, hr_avg); dc_tok += hr << tcq_enabled; DEBUG_CF_printf("%*sPost-residual[pos=0,%d->%d]: r=%d\n", depth, "", hr, dc_tok, ts->msac.rng); dc_tok &= 0xfffff; dc_val = (dc_tok << tcq_enabled) - tcq; dc_val = (((dc_val * dc_dq) & 0xffffff) + 4) >> dq_shift; dc_val = umin(dc_val, cf_max + dc_sign); } else { dc_val = (dc_tok << tcq_enabled) - tcq; dc_val = (dc_val * dc_dq + 4) >> dq_shift; } cul_level += dc_tok; cf[0] = dc_sign ? -dc_val : dc_val; } end: // context *res_ctx = umin(cul_level, 63) | dc_sign_level; return eob; } static enum IntraPredMode wide_angle_remap(const TxfmInfo *const t_dim, enum IntraPredMode mode, int *const angle, const int mrl_idx) { if ((unsigned) mode - 1 > VERT_LEFT_PRED - 1) return mode; // map directional modes const int mrl_adj = (mrl_idx == 1) - (mrl_idx == 2); *angle = dav2d_mode_to_angle_map[mode - 1] + *angle * 3 + mrl_adj; static const uint8_t thresh[] = { 61, 73, 82, 86 }; const int rect = t_dim->lw - t_dim->lh; // FIXME below, we should return 180 +/- angle after mode remapping, // otherwise the actual intra prediction won't work correctly if (rect > 0) { assert(rect <= 4); if (*angle > 270 - thresh[rect - 1]){ *angle -= 180; return DIAG_DOWN_LEFT_PRED; } } else if (rect < 0) { assert(rect >= -4); if (*angle < thresh[-1 - rect]) { *angle += 180; return HOR_UP_PRED; } } return mode; } static int read_luma_tx_cf(Dav2dTaskContext *const t, DB_ONLY(const int depth) const enum RectTxfmSize tx, Av2Block *const b) { const Dav2dFrameContext *const f = t->f; Dav2dTileState *const ts = t->ts; const int bx4 = t->bx & 63, by4 = t->by & 63; const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw = t_dim->w * 4, th = t_dim->h * 4; const enum IntraPredMode orig_y_mode = b->y_mode; int angle = b->y_angle; if (b->intra && !b->intrabc) b->y_mode = wide_angle_remap(t_dim, b->y_mode, &angle, b->mrl_index); // decode 
// coefficients
    uint8_t cf_ctx;
    enum TxfmType txtp;
    // Claim this block's slice of the tile's coefficient buffer; the slice is
    // sized with both transform dimensions capped at 32.
    coef *cf = ts->frame_thread[0].cf;
    ts->frame_thread[0].cf += imin(tw, 32) * imin(th, 32);
    struct CodedBlockInfo *const cbi =
        &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
    const int eob = decode_coefs(t, DB_ONLY(depth + 1) &t->a->lcoef[bx4],
                                 &t->l.lcoef[by4], tx, b->bs, 0, b, 0, cf,
                                 &txtp, &cf_ctx);
    // decode_coefs() reports an invalid end-of-block code with INT_MIN
    if (eob == INT_MIN) return -1;
    // Store the full transform type (including the bits decode_coefs() sets
    // at bit 8 and above) and the eob for this block position.
    cbi->txtp[0] = txtp;
    cbi->eob[0] = eob;
    // from here on only the low 8 bits are used (debug print, txtp_map bytes)
    txtp &= 0xff;
    DEBUG_BLOCK_printf("%*sPost-y_cf_blk[tx=%dx%d,txtp=%s/%s,eob=%d]: r=%d\n",
                       depth + 1, "", tw, th,
                       dav2d_tx1d_names[txtp & 7], dav2d_tx1d_names[txtp >> 5],
                       eob, ts->msac.rng);
    // Propagate the coefficient context to the above/left context arrays,
    // clipped to the part of the transform that lies inside the frame.
    dav2d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx,
                             imin(t_dim->w, f->bw - t->bx));
    dav2d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx,
                             imin(t_dim->h, f->bh - t->by));
    // Record the transform type in txtp_map (rows are 16 entries apart), one
    // row per 4-pixel unit of transform height.
    // NOTE(review): case_set()/rep_macro are defined elsewhere; presumably
    // they pick a fill width from t_dim->lw - confirm against their definition.
    uint8_t *txtp_map = &t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)];
#define set_ctx(rep_macro) \
    for (int y = 0; y < t_dim->h; y++) { \
        rep_macro(txtp_map, 0, txtp); \
        txtp_map += 16; \
    }
    case_set(t_dim->lw);
#undef set_ctx
    // undo the temporary wide-angle remap applied at the top of the function
    b->y_mode = orig_y_mode;

    return 0;
}

int bytefn(dav2d_read_coef_blocks)(Dav2dTaskContext *const t,
                                   DB_ONLY(const int depth)
                                   const enum BlockSize lbs,
                                   const enum BlockSize cbs,
                                   Av2Block *const b)
{
    const enum BlockSize bs = lbs == BS_INVALID ?
cbs : lbs; assert(bs != BS_INVALID); const int has_luma = lbs != BS_INVALID, has_chroma = cbs != BS_INVALID; const Dav2dFrameContext *const f = t->f; const int bx4 = t->bx & 63, by4 = t->by & 63; const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int bw4 = b_dim[0], bh4 = b_dim[1]; const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); if (has_luma && b->skip_txfm) { BlockContext *const a = t->a; dav2d_memset_pow2[b_dim[2]](&a->lcoef[bx4], 0x40); dav2d_memset_pow2[b_dim[3]](&t->l.lcoef[by4], 0x40); if (has_chroma) { const int ss_ver = f->ss_ver, ss_hor = f->ss_hor; const uint8_t *const cb_dim = dav2d_block_dimensions[cbs]; const int cbx4ss = (t->cbx & 63) >> ss_hor, cby4ss = (t->cby & 63) >> ss_ver; dav2d_memset_pow2_fn memset_cw = dav2d_memset_pow2[cb_dim[2] - ss_hor]; dav2d_memset_pow2_fn memset_ch = dav2d_memset_pow2[cb_dim[3] - ss_ver]; memset_cw(&a->ccoef[0][cbx4ss], 0x40); memset_cw(&a->ccoef[1][cbx4ss], 0x40); memset_ch(&t->l.ccoef[0][cby4ss], 0x40); memset_ch(&t->l.ccoef[1][cby4ss], 0x40); } return 0; } const uint8_t csplit[3][3] = { [BS_128x128 - BS_128x128] = { BS_64x64, BS_128x64, BS_128x128 }, [BS_128x64 - BS_128x128] = { BS_64x64, BS_128x64, BS_128x64 }, [BS_64x128 - BS_128x128] = { BS_64x64, BS_64x64, BS_64x128 }, }; if (imax(bw4, bh4) > 16) { const int ss_ver = f->ss_ver, ss_hor = f->ss_hor; assert(bw4 * 2 >= bh4 && bh4 * 2 >= bw4); // 1:2, 1:1 or 2:1 ratios only assert(t->cbx == t->bx && t->cby == t->by); const int y_start = t->by, y_end = imin(y_start + bh4, f->bh); const int x_start = t->bx, x_end = imin(x_start + bw4, f->bw); int step; enum BlockSize lbs2, cbs2i; if (imax(bw4, bh4) == 64) { step = 32; lbs2 = lbs == BS_INVALID ? BS_INVALID : BS_128x128; cbs2i = cbs == BS_INVALID ? BS_INVALID : BS_128x128; } else { step = 16; lbs2 = lbs == BS_INVALID ? BS_INVALID : BS_64x64; cbs2i = cbs == BS_INVALID ? 
BS_INVALID : csplit[cbs - BS_128x128][ss_hor + ss_ver]; } for (int y = 0; t->by < y_end; t->by += step, y++) { for (int x = 0; t->bx < x_end; t->bx += step, x++) { enum BlockSize cbs2; if (step == 32) { cbs2 = cbs2i; } else { // coef reading is done with the first luma 64x64 cbs2 = !((x & ss_hor) | (y & ss_ver)) ? cbs2i : BS_INVALID; } const int res = bytefn(dav2d_read_coef_blocks)(t, DB_ONLY(depth) lbs2, cbs2, b); if (step == 32) { t->cbx += step; } else if ((x & ss_hor) == ss_hor) { t->cbx += step << ss_hor; } if (res < 0) { t->cbx = t->bx = x_start; t->cby = t->by = y_start; return res; } } t->cbx = t->bx = x_start; if (step == 32) { t->cby += step; } else if ((y & ss_ver) == ss_ver) { t->cby += step << ss_ver; } } t->cby = t->by = y_start; return 0; } if (lbs == BS_INVALID) goto chroma; const int8_t *const tp = dav2d_tx_part_tbl[bs]; if (tp[b->tx_part] == -1) return -1; // luma enum RectTxfmSize tx = tp[b->tx_part]; t->pb.col_start = t->bx; t->pb.row_start = t->by; if (f->frame_hdr->segmentation.lossless[b->seg_id]) { int res = 0, y, x; tx = b->tx_size_ll ? 
dav2d_max_txfm_size_for_bs[bs][3] : (int) TX_4X4; const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w, th4 = t_dim->h; for (y = 0; y < h4 && !res; y += th4, t->by += th4) { for (x = 0; x < w4 && !res; x += tw4, t->bx += tw4) { res = read_luma_tx_cf(t, DB_ONLY(depth) (int) tx, b); } t->bx -= x; } t->by -= y; if (res < 0) return res; } else switch (b->tx_part) { case TX_PARTITION_NONE: { const int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; break; } case TX_PARTITION_SPLIT: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w, th4 = t_dim->h; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_v_split = t->bx + tw4 < f->bw; if (have_v_split) { t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= tw4; if (res < 0) return res; } if (t->by + th4 >= f->bh) break; t->by += th4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (!res && have_v_split) { t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= tw4; } t->by -= th4; if (res < 0) return res; break; } case TX_PARTITION_H: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int th4 = t_dim->h; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->by + th4 >= f->bh) break; t->by += th4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->by -= th4; if (res < 0) return res; break; } case TX_PARTITION_V: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->bx + tw4 >= f->bw) break; t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= tw4; if (res < 0) return res; break; } case TX_PARTITION_H4: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int th4 = t_dim->h; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->by + th4 >= f->bh) 
break; t->by += th4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0 || t->by + th4 >= f->bh) { t->by -= th4; } else { t->by += th4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0 || t->by + th4 >= f->bh) { t->by -= 2 * th4; } else { t->by += th4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->by -= 3 * th4; } } if (res < 0) return res; break; } case TX_PARTITION_V4: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->bx + tw4 >= f->bw) break; t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0 || t->bx + tw4 >= f->bw) { t->bx -= tw4; } else { t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0 || t->bx + tw4 >= f->bw) { t->bx -= 2 * tw4; } else { t->bx += tw4; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= 3 * tw4; } } if (res < 0) return res; break; } case TX_PARTITION_H5: { const enum RectTxfmSize tx_big = tp[TX_PARTITION_H]; const TxfmInfo *const t_dim_small = &dav2d_txfm_dimensions[tx], *const t_dim_big = &dav2d_txfm_dimensions[tx_big]; const int tw4_small = t_dim_small->w, th4_small = t_dim_small->h; const int th4_big = t_dim_big->h; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_v_split = t->bx + tw4_small < f->bw; if (have_v_split) { t->bx += tw4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= tw4_small; if (res < 0) return res; } if (t->by + th4_small >= f->bh) break; t->by += th4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx_big, b); if (res < 0 || t->by + th4_big >= f->bh) { t->by -= th4_small; } else { t->by += th4_big; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (!res && have_v_split) { t->bx += tw4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->bx -= tw4_small; } t->by -= th4_small + th4_big; } if (res < 0) return res; break; } case TX_PARTITION_V5: { const enum 
RectTxfmSize tx_big = tp[TX_PARTITION_V]; const TxfmInfo *const t_dim_small = &dav2d_txfm_dimensions[tx], *const t_dim_big = &dav2d_txfm_dimensions[tx_big]; const int tw4_small = t_dim_small->w, th4_small = t_dim_small->h; const int tw4_big = t_dim_big->w; int res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_h_split = t->by + th4_small < f->bh; if (have_h_split) { t->by += th4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->by -= th4_small; if (res < 0) return res; } if (t->bx + tw4_small >= f->bw) break; t->bx += tw4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx_big, b); if (res < 0 || t->bx + tw4_big >= f->bw) { t->bx -= tw4_small; } else { t->bx += tw4_big; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); if (!res && have_h_split) { t->by += th4_small; res = read_luma_tx_cf(t, DB_ONLY(depth) tx, b); t->by -= th4_small; } t->bx -= tw4_small + tw4_big; } if (res < 0) return res; break; } default: assert(0); } if (cbs == BS_INVALID) return 0; // chroma chroma: {} const int ss_ver = f->ss_ver, ss_hor = f->ss_hor; const uint8_t *const cb_dim = dav2d_block_dimensions[cbs]; const int cbw4 = cb_dim[0], cw4 = imin(f->bw - t->cbx, cbw4); const int cbh4 = cb_dim[1], ch4 = imin(f->bh - t->cby, cbh4); const int cbw4ss = (cbw4 + ss_hor) >> ss_hor, cbh4ss = (cbh4 + ss_ver) >> ss_ver; const int cw4ss = (cw4 + ss_hor) >> ss_hor, ch4ss = (ch4 + ss_ver) >> ss_ver; const enum RectTxfmSize uvtx = f->frame_hdr->segmentation.lossless[b->seg_id] ? 
(int) TX_4X4 : dav2d_max_txfm_size_for_bs[cbs][DAV2D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout]; const TxfmInfo *const uv_t_dim = &dav2d_txfm_dimensions[uvtx]; const int sdp_active = lbs == BS_INVALID; const int intra = b->intra && (sdp_active || !b->intrabc); const int cbx4 = t->cbx & 63, cby4 = t->cby & 63; const int cbx4ss = cbx4 >> ss_hor, cby4ss = cby4 >> ss_ver; const int ctw4 = imin(uv_t_dim->w, (f->bw - t->cbx + ss_hor) >> ss_hor); const int cth4 = imin(uv_t_dim->h, (f->bh - t->cby + ss_ver) >> ss_ver); const enum IntraPredMode orig_uv_mode = b->uv_mode; int angle = b->uv_angle; if (intra) b->uv_mode = wide_angle_remap(uv_t_dim, b->uv_mode, &angle, 0); Dav2dTileState *const ts = t->ts; enum TxfmType y_txtp = t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)]; coef *cf[2] = { ts->frame_thread[0].cf }; ts->frame_thread[0].cf += cbw4ss * cbh4ss * 16 * 2; cf[1] = &cf[0][cbw4ss * cbh4ss * 16]; // decode coefficients for (int pl = 0; pl < 2; pl++) { int y; for (y = 0; y < ch4ss; y += uv_t_dim->h) { int x; for (x = 0; x < cw4ss; x += uv_t_dim->w) { const ptrdiff_t i = y * cbw4ss + x; if (b->bs == b->cbs) y_txtp = t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)]; enum TxfmType uv_txtp = y_txtp; uint8_t cf_ctx; const int eob = decode_coefs(t, DB_ONLY(depth + 1) &t->a->ccoef[pl][cbx4ss + x], &t->l.ccoef[pl][cby4ss + y], uvtx, b->cbs, sdp_active, b, pl + 1, &cf[pl][i * 16], &uv_txtp, &cf_ctx); if (!pl) t->u_has_cf = eob >= 0; struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[(t->cby + (y << ss_ver)) * f->b4_stride + (t->cbx + (x << ss_hor))]; cbi->txtp[pl + 1] = uv_txtp; if (eob == INT_MIN) return -1; DEBUG_BLOCK_printf("%*sPost-%c_cf_blk[tx=%dx%d,txtp=%s/%s," "eob=%d]: r=%d\n", depth + 1, "", "uv"[pl], uv_t_dim->w * 4, uv_t_dim->h * 4, dav2d_tx1d_names[uv_txtp & 7], dav2d_tx1d_names[(uv_txtp >> 5) & 7], eob, t->ts->msac.rng); cbi->eob[pl + 1] = eob; dav2d_memset_likely_pow2(&t->a->ccoef[pl][cbx4ss + x], cf_ctx, ctw4); dav2d_memset_likely_pow2(&t->l.ccoef[pl][cby4ss + y], 
cf_ctx, cth4);
                t->bx += uv_t_dim->w << ss_hor;
            }
            t->bx -= x << ss_hor;
            t->by += uv_t_dim->h << ss_ver;
        }
        t->by -= y << ss_ver;
    }

    b->uv_mode = orig_uv_mode;
    return 0;
}

// Motion-compensated prediction of one block of plane `pl` from `refp`.
//
// - exactly one of dst8 (pixel output, mc.mc/mc.mc_scaled) and dst16
//   (int16_t intermediate output, mc.mct/mc.mct_scaled) must be non-NULL
// - bw4/bh4 and bx/by are multiplied by h_mul/v_mul (4 >> subsampling) to
//   obtain the block size/position in pixels of the plane
// - mv has 3 fractional bits for luma (one more per subsampled chroma axis)
// - left/right/top/bottom bound the reference area that may be read directly
//   in the unscaled path; anything outside is produced by emu_edge()
// - if the reference has different dimensions than the current picture, the
//   scaled path is taken, using f->svc[refidx] scale/step factors
static void mc(Dav2dTaskContext *const t,
               pixel *dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
               int bw4, const int bh4,
               const int bx, const int by, const int pl,
               const mv mv, const Dav2dThreadPicture *const refp,
               const int refidx, const enum Dav2dFilterMode filter,
               const int left, const int right,
               const int top, const int bottom)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav2dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
    const int mvx = mv.x, mvy = mv.y;
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    assert(left >= 0 && top >= 0 && left < right && top < bottom &&
           right <= f->bw * 4 && bottom <= f->bh * 4);
    if (refp->p.p.w == f->cur.p.p.w && refp->p.p.h == f->cur.p.p.h) {
        // unscaled: split the mv into a subpel fraction (mx/my) and an
        // integer sample position (dx/dy)
        const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));

        // a fractional mv needs 3 extra samples before and 4 after the block;
        // if that footprint leaves the allowed area, build it in the scratch
        // buffer (row stride of 192 pixels) via emu_edge()
        if (dx - !!mx * 3 < left || dy - !!my * 3 < top ||
            dx + bw4 * h_mul + !!mx * 4 > right ||
            dy + bh4 * v_mul + !!my * 4 > bottom)
        {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            ref = refp->p.data[pl];
            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,
                                right - left, bottom - top,
                                dx - !!mx * 3 - left, dy - !!my * 3 - top,
                                emu_edge_buf, 192 * sizeof(pixel),
                                &ref[left + top * PXSTRIDE(ref_stride)],
                                ref_stride);
            ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
            ref_stride = 192 * sizeof(pixel);
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
        }

        if (dst8 != NULL) {
            if (bw4 & (bw4 - 1)) {
                // bw4 is not a power of two (asserted: luma, bilinear, and
                // bw4 - 2 is a power of two): do the first (bw4 - 2) * 4
                // columns here, and the remaining 8 columns in the call below
                assert(!ss_hor && !ss_ver && v_mul == 4 && h_mul == 4 &&
                       filter == DAV2D_FILTER_BILINEAR);
                assert(!((bw4 - 2) & (bw4 - 3)));
                f->dsp->mc.mc[filter](dst8, dst_stride, ref, ref_stride,
                                      bw4 * 4 - 8, bh4 * 4,
                                      mx << 1, my << 1 HIGHBD_CALL_SUFFIX);
                dst8 += 4 * (bw4 - 2);
                ref += 4 * (bw4 - 2);
                bw4 = 2;
            }
            f->dsp->mc.mc[filter](dst8, dst_stride, ref, ref_stride,
                                  bw4 * h_mul, bh4 * v_mul,
                                  mx << !ss_hor, my << !ss_ver HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct[filter](dst16, dst_stride, ref, ref_stride,
                                   bw4 * h_mul, bh4 * v_mul,
                                   mx << !ss_hor, my << !ss_ver HIGHBD_CALL_SUFFIX);
        }
    } else {
        assert(refp != &f->cur);
        // scaled reference: block origin plus mv with 4 fractional bits
        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
        // function-local macro; #undef'd right after use
#define scale_mv(res, val, scale) do { \
    const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
    res = apply_sign64((llabs(tmp) + 128) >> 8, tmp) + 32; \
} while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv
        // pos_x/pos_y carry 10 fractional bits. NOTE: these locals shadow the
        // left/top/right/bottom parameters, so the caller-provided limits are
        // not used in the scaled path.
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        // reference plane dimensions (rounded up for subsampled planes)
        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {
            // footprint (3 samples before, 4 after) crosses the plane edge;
            // the scratch buffer is used with a row stride of 320 pixels here
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc_scaled[filter](dst8, dst_stride, ref, ref_stride,
                                         bw4 * h_mul, bh4 * v_mul,
                                         pos_x & 0x3ff, pos_y & 0x3ff,
                                         f->svc[refidx][0].step,
                                         f->svc[refidx][1].step
                                         HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct_scaled[filter](dst16, dst_stride, ref, ref_stride,
                                          bw4 * h_mul, bh4 * v_mul,
                                          pos_x & 0x3ff, pos_y & 0x3ff,
                                          f->svc[refidx][0].step,
                                          f->svc[refidx][1].step
                                          HIGHBD_CALL_SUFFIX);
        }
    }
}

// like mc(), but:
// - mv subpel precision is 4 instead of 3 bits
// - we support custom edge limits, since the opfl reference area cannot exceed
//   the original mv's bounding box (the remaining pixels are emulated)
// - no scaled support, and no dst8 support
static void mc_opfl(Dav2dTaskContext *const t,
                    int16_t *const dst16, const ptrdiff_t dst_stride,
                    const int bw4, const int bh4,
                    const int bx4, const int by4, const int pl,
                    const mv mv, const Dav2dThreadPicture *const refp,
                    const enum Dav2dFilterMode filter,
                    const int left, const int right,
                    const int top, const int bottom)
{
    const Dav2dFrameContext *const f = t->f;
    assert(refp->p.p.w == f->cur.p.p.w && refp->p.p.h == f->cur.p.p.h);
    const int mvx = mv.x, mvy = mv.y;
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    // 4 fractional bits; block position/size are always scaled by 4 here
    const int mx = mvx & 15, my = mvy & 15;
    const int dx = bx4 * 4 + (mvx >> 4);
    const int dy = by4 * 4 + (mvy >> 4);

    assert(top >= 0 && left >= 0 && left < right && top < bottom &&
           right <= (f->bw * 4) >> !!pl * f->ss_hor &&
           bottom <= (f->bh * 4) >> !!pl * f->ss_ver);
    // same footprint test and 192-pixel scratch stride as the unscaled
    // path of mc()
    if (dx - !!mx * 3 < left || dy - !!my * 3 < top ||
        dx + bw4 * 4 + !!mx * 4 > right ||
        dy + bh4 * 4 + !!my * 4 > bottom)
    {
        pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
        ref = refp->p.data[pl];
        f->dsp->mc.emu_edge(bw4 * 4 + !!mx * 7, bh4 * 4 + !!my * 7,
                            right - left, bottom - top,
                            dx - !!mx * 3 - left, dy - !!my * 3 - top,
                            emu_edge_buf, 192 * sizeof(pixel),
                            &ref[left + top * PXSTRIDE(ref_stride)],
                            ref_stride);
        ref = &emu_edge_buf[192 * !!my * 3 + !!mx * 3];
        ref_stride = 192 * sizeof(pixel);
    } else {
        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
    }
f->dsp->mc.mct[filter](dst16, dst_stride, ref, ref_stride, bw4 * 4, bh4 * 4, mx, my HIGHBD_CALL_SUFFIX); } static void ext_warp(Dav2dTaskContext *const t, pixel *dst8, int16_t *dst16, const ptrdiff_t dstride, const uint8_t *const b_dim, const int pl, const Dav2dThreadPicture *const refp, const Dav2dWarpedMotionParams *const wmp) { assert((dst8 != NULL) ^ (dst16 != NULL)); const Dav2dFrameContext *const f = t->f; const Dav2dDSPContext *const dsp = f->dsp; const int ss_ver = !!pl && f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = !!pl && f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444; const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; assert(!((b_dim[0] * h_mul) & 3) && !((b_dim[1] * v_mul) & 3)); const int32_t *const mat = wmp->matrix; const int w = f->bw * 4 >> ss_hor; const int h = f->bh * 4 >> ss_ver; const int sw = imin(b_dim[0] * h_mul, 8), hsw = sw >> 1; const int sh = imin(b_dim[1] * v_mul, 8), hsh = sh >> 1; const int bx = pl ? t->cbx : t->bx, by = pl ? t->cby : t->by; for (int y = 0; y < b_dim[1] * v_mul; y += sh) { const int src_y = by * 4 + ((y + hsh) << ss_ver); const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; for (int x = 0; x < b_dim[0] * h_mul; x += sw) { // calculate transformation relative to center of 8x8 block in // luma pixel units const int src_x = bx * 4 + ((x + hsw) << ss_hor); const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int left_window = (int) (mvx >> 16) - hsw - 3; const int top_window = (int) (mvy >> 16) - hsh - 3; const int left = iclip(left_window, 0, w - 1); const int right = iclip(left_window + sw + 7, 1, w); const int top = iclip(top_window, 0, h - 1); const int bottom = iclip(top_window + sh + 7, 1, h); for (int yy = y; yy < y + sh; yy += 4) { const int src_y = by * 4 + ((yy + 2) << ss_ver); const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; const 
int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; for (int xx = x; xx < x + sw; xx += 4) { const int src_x = bx * 4 + ((xx + 2) << ss_hor); const int64_t mvx = (((int64_t) mat[2] * src_x + mat3_y) >> ss_hor) + 0x200; const int64_t mvy = (((int64_t) mat[4] * src_x + mat5_y) >> ss_ver) + 0x200; const int dx = (int) (mvx >> 16) - 2; const int mx = (int) ((mvx >> 10) & 63); const int dy = (int) (mvy >> 16) - 2; const int my = (int) ((mvy >> 10) & 63); const pixel *ref_ptr = refp->p.data[pl]; ptrdiff_t ref_stride = refp->p.stride[!!pl]; if (dx - 3 < left || dx + 4 + 4 > right || dy - 3 < top || dy + 4 + 4 > bottom) { pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); f->dsp->mc.emu_edge(11, 11, right - left, bottom - top, dx - 3 - left, dy - 3 - top, emu_edge_buf, 32 * sizeof(pixel), &ref_ptr[left + top * PXSTRIDE(ref_stride)], ref_stride); ref_ptr = &emu_edge_buf[32 * 3 + 3]; ref_stride = 32 * sizeof(pixel); } else { ref_ptr = &ref_ptr[PXSTRIDE(ref_stride) * dy + dx]; } if (dst16 != NULL) dsp->mc.ext_warp4x4t(&dst16[yy * dstride + xx], dstride, ref_ptr, ref_stride, mx, my HIGHBD_CALL_SUFFIX); else dsp->mc.ext_warp4x4(&dst8[yy * PXSTRIDE(dstride) + xx], dstride, ref_ptr, ref_stride, mx, my HIGHBD_CALL_SUFFIX); } } } } } static void warp_affine(Dav2dTaskContext *const t, pixel *dst8, int16_t *dst16, const ptrdiff_t dstride, const uint8_t *const b_dim, const int pl, const Dav2dThreadPicture *const refp, const Dav2dWarpedMotionParams *const wmp) { assert((dst8 != NULL) ^ (dst16 != NULL)); const Dav2dFrameContext *const f = t->f; const int ss_ver = !!pl && f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = !!pl && f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I444; const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; if (!wmp->affine || imin(b_dim[0] * h_mul, b_dim[1] * v_mul) < 8) { ext_warp(t, dst8, dst16, dstride, b_dim, pl, refp, wmp); return; } const Dav2dDSPContext *const dsp = f->dsp; assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); const 
int32_t *const mat = wmp->matrix;
    // plane dimensions in pixels
    const int width = f->bw * 4 >> ss_hor;
    const int height = f->bh * 4 >> ss_ver;
    const int bx = pl ? t->cbx : t->bx, by = pl ? t->cby : t->by;

    // the block is processed in 8x8 units; dst8/dst16 advance by 8 rows after
    // each row of units
    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
        const int src_y = by * 4 + ((y + 4) << ss_ver);
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            // integer position (upper bits) and 16-bit fraction; the fraction
            // is offset by the alpha/beta (resp. gamma/delta) terms and has
            // its low 6 bits cleared
            const int dx = (int) (mvx >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                                                   wmp->u.p.beta  * 7) & ~0x3f;
            const int dy = (int) (mvy >> 16) - 4;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                                                   wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            // the 15x15 footprint (3 samples before, 8 + 4 after) must lie
            // inside the plane, otherwise it is built via emu_edge() in the
            // scratch buffer (row stride of 32 pixels)
            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
                                    emu_edge_buf, 32 * sizeof(pixel),
                                    refp->p.data[pl], ref_stride);
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
                ref_stride = 32 * sizeof(pixel);
            } else {
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
            }
            if (dst16 != NULL)
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            else
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
        }
        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
        else      dst16 += 8 * dstride;
    }
}

// Fill a bw x bh mask from the in-frame status of two displaced blocks.
// For every pixel, p0/p1 tell whether the position displaced to (x0, y0) resp.
// (x1, y1) lies inside the fw x fh frame; the value written is 64 if only the
// first is inside, 0 if only the second is, and 32 if both or neither are.
// The casts to unsigned make negative coordinates fail the `< fw`/`< fh` test.
static void gen_mask(uint8_t *mask, const ptrdiff_t stride,
                     const int bw, const int bh,
                     const int x0, const int y0,
                     const int x1, const int y1,
                     const unsigned fw, const unsigned fh)
{
    for (int y = 0; y < bh; y++) {
        for (int x = 0; x < bw; x++) {
            int p0 = (unsigned) (x0 + x) < fw &&
(unsigned) (y0 + y) < fh; int p1 = (unsigned) (x1 + x) < fw && (unsigned) (y1 + y) < fh; mask[x] = 32 * (p0 - p1 + 1); } mask += stride; } } static ALWAYS_INLINE int get_mask(uint8_t *const mask, const ptrdiff_t stride, const int bx4, const int x4, const int by4, const int y4, const union mv mv[2], const int h_subpel_bits, const int v_subpel_bits, const int bw4, const int bh4, const int iw, const int ih) { const int x0 = (bx4 + x4) * 4 + (mv[0].x >> h_subpel_bits); const int y0 = (by4 + y4) * 4 + (mv[0].y >> v_subpel_bits); const int x1 = (bx4 + x4) * 4 + (mv[1].x >> h_subpel_bits); const int y1 = (by4 + y4) * 4 + (mv[1].y >> v_subpel_bits); if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0 || x0 + bw4 * 4 >= iw || x1 + bw4 * 4 >= iw || y0 + bh4 * 4 >= ih || y1 + bh4 * 4 >= ih) { gen_mask(&mask[(y4 * stride + x4) * 4], stride, bw4 * 4, bh4 * 4, x0, y0, x1, y1, iw, ih); return 1; } return 0; } static void update_temporal(const Dav2dFrameContext *const f, refmvs_temporal_block *t_dst, const ptrdiff_t t_stride, const int w8, const int h8, const union refpair ref, const union mv mv[2], const int swap) { if (!f->seq_hdr->ref_frame_mvs) return; refmvs_temporal_block t_src; t_src.ref.ref[0] = ref.ref[swap]; t_src.ref.ref[1] = ref.ref[!swap]; t_src.mv.mv[0] = quantize_mv(mv[swap]); t_src.mv.mv[1] = quantize_mv(mv[!swap]); if (t_src.mv.mv[0].n == INVALID_TRAJ) { if (t_src.mv.mv[1].n == INVALID_TRAJ) { t_src.ref.pair = -1; } else { t_src.mv.mv[0] = t_src.mv.mv[1]; t_src.ref.ref[0] = t_src.ref.ref[1]; } } else if (t_src.mv.mv[1].n == INVALID_TRAJ) { t_src.mv.mv[1] = t_src.mv.mv[0]; t_src.ref.ref[1] = t_src.ref.ref[0]; } for (int y = 0; y < h8; y++) { for (int x = 0; x < w8; x++) t_dst[x] = t_src; t_dst += t_stride; } } union OpflMvDeltaBlock { struct OpflMvDelta { int8_t x, y; } d[2]; uint32_t n; }; static void opfl_mv_adj(const struct OpflRegressionData *const r, union OpflMvDeltaBlock *const dd, const union aliasi16 d) { int su2 = r->su2, suv = r->suv, sv2 = r->sv2, suw = r->suw, 
svw = r->svw; const int nbits_su2 = 1 + ulog2(su2 + !su2); const int nbits_sv2 = 1 + ulog2(sv2 + !sv2); const int nbits_suv = 1 + ulog2(abs(suv) + !suv); const int nbits_suw = 1 + ulog2(abs(suw) + !suw); const int nbits_svw = 1 + ulog2(abs(svw) + !svw); const int nbits_max = imax(nbits_su2 + nbits_sv2, imax(imax(nbits_sv2 + nbits_suw, nbits_suv + nbits_svw), imax(nbits_su2 + nbits_svw, nbits_suv + nbits_suw))); const int rbits = imax(0, nbits_max - 23) >> 1; if (rbits) { const int rnd = (1 << rbits) >> 1; su2 = (su2 + rnd) >> rbits; sv2 = (sv2 + rnd) >> rbits; suv = (suv + rnd - (suv < 0)) >> rbits; suw = (suw + rnd - (suw < 0)) >> rbits; svw = (svw + rnd - (svw < 0)) >> rbits; } const int det = su2 * sv2 - suv * suv; if (det > 0) { int s[2] = { sv2 * suw - suv * svw, su2 * svw - suv * suw }, shift; const int idet = dav2d_resolve_divisor_32(det, &shift), idet_bits = ulog2(idet); for (int i = 0; i < 2; i++) { if (!s[i]) continue; int abss = abs(s[i]); const int rbits = imax(0, ulog2(abss) + idet_bits - 22); if (rbits > 0) abss = (abss + ((1 << rbits) >> 1)) >> rbits; const int ibits = 3 + rbits - shift; if (ibits >= 0) abss = abss * idet * (1 << ibits); else abss = (abss * idet + ((1 << -ibits) >> 1)) >> -ibits; s[i] = apply_sign(abss, s[i]); } dd->d[0].x = -iclip(d.i8[0] * s[0], -16, 16); dd->d[0].y = -iclip(d.i8[0] * s[1], -16, 16); dd->d[1].x = +iclip(d.i8[1] * s[0], -16, 16); dd->d[1].y = +iclip(d.i8[1] * s[1], -16, 16); } else dd->n = 0; } static ALWAYS_INLINE void scaledown_16pel_mv_for_chroma(union mv *const mv, const enum Dav2dPixelLayout layout) { switch (layout) { case DAV2D_PIXEL_LAYOUT_I420: for (int i = 0; i < 2; i++) mv[i].y = (mv[i].y + (mv[i].y > 0)) >> 1; // fall-through case DAV2D_PIXEL_LAYOUT_I422: for (int i = 0; i < 2; i++) mv[i].x = (mv[i].x + (mv[i].x > 0)) >> 1; break; default: break; } } static ALWAYS_INLINE void scaleup_8pel_mv_for_chroma(union mv *const mv, const enum Dav2dPixelLayout layout) { switch (layout) { case 
DAV2D_PIXEL_LAYOUT_I444: for (int i = 0; i < 2; i++) mv[i].x <<= 1; // fall-through case DAV2D_PIXEL_LAYOUT_I422: for (int i = 0; i < 2; i++) mv[i].y <<= 1; break; default: break; } } static int tip_pred(Dav2dTaskContext *const t, int16_t (*const tmp)[64 * 64], const Av2Block *const b, const int bw4, const int bh4, const int w4, const int h4) { const Dav2dFrameContext *const f = t->f; int opfl = f->seq_hdr->tip_refine_mv && (f->frame_hdr->tip.frame_mode == 1 || f->frame_hdr->tip.subpel_filter == DAV2D_FILTER_8TAP_SHARP); const union refpair ref = f->rf.tip.ref; const int refine = opfl && f->frame_hdr->tip.frame_mode == 1 && f->refdist[ref.ref[0]] == -f->refdist[ref.ref[1]]; const int step = 2 << (f->frame_hdr->tip.frame_mode == 2 /* frame */ ? !opfl : ((!opfl && imin(bw4, bh4) >= 4) || b->bs == BS_256x256)); opfl &= !!f->seq_hdr->opfl_refine && f->frame_hdr->has_bothside_refs; ptrdiff_t off_y = 0; uint8_t *const mask = t->scratch.seg_mask; const int bacp = f->seq_hdr->imp_msk_bld && b->cwp_idx == 8 && !f->svc[ref.ref[0]][0].scale && !f->svc[ref.ref[1]][0].scale; const int w = f->bw * 4, h = f->bh * 4; if (bacp) memset(mask, 0x20, bw4 * bh4 * 16); int have_bacp = 0; const Dav2dThreadPicture *const refp[2] = { &f->refp[ref.ref[0]], &f->refp[ref.ref[1]] }; pixel *p[2]; ptrdiff_t p_stride; union aliasi16 d; if (opfl) { p[0] = bitfn(t->scratch.p)[0]; p[1] = bitfn(t->scratch.p)[1]; p_stride = ((step + 2) * 4 * sizeof(pixel) + 63) & ~63; const int d0 = f->absrefdist[ref.ref[0]], d1 = f->absrefdist[ref.ref[1]]; d.i8[0] = apply_sign(1 + (d0 > d1), -f->refdist[ref.ref[0]]); d.i8[1] = apply_sign(1 + (d1 > d0), +f->refdist[ref.ref[1]]); } union mv (*rmv_line)[2][2] = &t->rmv[((t->by & 31) >> 1) * 16 + ((t->bx & 31) >> 1)]; const unsigned sad8x8_thr = f->frame_hdr->tip.frame_mode == 1 /* reference */ ? 
6 : 15;
    // destination in the temporal MV buffer, indexed in 8x8 units
    const ptrdiff_t t_stride = f->rf.rp_stride;
    refmvs_temporal_block *t_dst =
        &f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)];
    const int t_swap =
        !!(f->rf.ref_flip & (1ULL << (ref.ref[0] * 8 + ref.ref[1])));

    // NOTE(review): yy is incremented but never read in this loop
    for (int y = 0, yy = 0; y < h4;
         y += step, yy++, rmv_line += 16 * step >> 1)
    {
        const ptrdiff_t off_y8 =
            (((t->by + y) & (f->sb_step - 1)) >> 1) * t_stride;
        for (int x = 0; x < w4; x += step) {
            // projected temporal MV of this sub-block (zero if unavailable)
            const ptrdiff_t off_8x8 = off_y8 + ((t->bx + x) >> 1);
            mv tmv = t->rt.rp_proj[off_8x8].mv;
            if (tmv.y == INVALID_MV) tmv.n = 0;

            // rmv[0] (cmv) and rmv[1] both receive the unrefined MV pair;
            // rmv[1] is rescaled for chroma right away, cmv may be refined
            union mv (*const rmv)[2] = rmv_line[x >> 1], *const cmv = rmv[0];
            int left[2], top[2];
            for (int i = 0; i < 2; i++) {
                const mv tipmv = scale_mv(tmv, f->rf.tip.sf[i]);
                rmv[1][i].y = cmv[i].y =
                    iclip(tipmv.y + b->mv[0].y, -0xffff, 0xffff);
                rmv[1][i].x = cmv[i].x =
                    iclip(tipmv.x + b->mv[0].x, -0xffff, 0xffff);
                // top/left of the reference area of the unrefined MV,
                // including 3 pixels of filter margin
                top[i]  = t->by * 4 + y * 4 + (cmv[i].y >> 3) - 3;
                left[i] = t->bx * 4 + x * 4 + (cmv[i].x >> 3) - 3;
            }
            scaleup_8pel_mv_for_chroma(rmv[1], f->cur.p.p.layout);

            if (opfl) {
                // refinement
                // bilinear prediction of the sub-block plus a 4-pixel border
                // on each side (mv offset by -32, size step + 2)
                for (int i = 0; i < 2; i++)
                    mc(t, p[i], NULL, p_stride, step + 2, step + 2,
                       t->bx + x, t->by + y, 0,
                       (union mv) { .y = cmv[i].y - 32, .x = cmv[i].x - 32 },
                       refp[i], ref.ref[i], DAV2D_FILTER_BILINEAR,
                       iclip(left[i], 0, w - 1),
                       iclip(left[i] + 7 + step * 4, 1, w),
                       iclip(top[i], 0, h - 1),
                       iclip(top[i] + 7 + step * 4, 1, h));

                // optional SAD-based integer offset, applied with opposite
                // sign to the two references
                int dy, dx;
                if (refine) {
                    struct OpflOffset o;
                    f->dsp->mc.sad_refine_mv(p[0], p_stride, p[1], p_stride,
                                             step * 4, step * 4, 1, &o
                                             HIGHBD_CALL_SUFFIX);
                    dy = o.y;
                    cmv[0].y += 8 * dy;
                    cmv[1].y -= 8 * dy;
                    dx = o.x;
                    cmv[0].x += 8 * dx;
                    cmv[1].x -= 8 * dx;
                } else
                    dy = dx = 0;

                // the optical-flow delta is only derived if the two
                // predictions differ by at least sad8x8_thr
                union OpflMvDeltaBlock dd;
                const unsigned sad =
                    b->bs == BS_256x256 && f->frame_hdr->tip.frame_mode == 1 ?
                    0 : f->dsp->mc.sad8x8(&p[0][(4 + dy) * PXSTRIDE(p_stride) + (4 + dx)],
                                          p_stride,
                                          &p[1][(4 - dy) * PXSTRIDE(p_stride) + (4 - dx)],
                                          p_stride HIGHBD_CALL_SUFFIX);
                if (sad >= sad8x8_thr) {
                    struct OpflRegressionData res[4];
                    f->dsp->mc.opfl_derive_mv(res,
                                              &p[0][(4 + dy) * PXSTRIDE(p_stride) + (4 + dx)],
                                              p_stride,
                                              &p[1][(4 - dy) * PXSTRIDE(p_stride) + (4 - dx)],
                                              p_stride,
                                              step * 4, step * 4, 8, d
                                              HIGHBD_CALL_SUFFIX);
                    opfl_mv_adj(res, &dd, d);
                } else {
                    dd.n = 0;
                }

                // switch to 4 fractional bits and add the delta
                cmv[0].x = cmv[0].x * 2 + dd.d[0].x;
                cmv[0].y = cmv[0].y * 2 + dd.d[0].y;
                cmv[1].x = cmv[1].x * 2 + dd.d[1].x;
                cmv[1].y = cmv[1].y * 2 + dd.d[1].y;

                for (int i = 0; i < 2; i++)
                    mc_opfl(t, &tmp[i][y * bw4 * 16 + x * 4], bw4 * 4,
                            step, step, t->bx + x, t->by + y, 0, cmv[i],
                            refp[i], b->filter,
                            iclip(left[i], 0, w - 1),
                            iclip(left[i] + 7 + step * 4, 1, w),
                            iclip(top[i], 0, h - 1),
                            iclip(top[i] + 7 + step * 4, 1, h));

                // the temporal buffer gets the refined MV, halved again
                const union mv dmv[2] = {
                    [0] = { .y = (cmv[0].y + (dd.d[0].y > 0)) >> 1,
                            .x = (cmv[0].x + (dd.d[0].x > 0)) >> 1 },
                    [1] = { .y = (cmv[1].y + (dd.d[1].y > 0)) >> 1,
                            .x = (cmv[1].x + (dd.d[1].x > 0)) >> 1 },
                };
                update_temporal(f, &t_dst[x >> 1], t_stride,
                                step >> 1, step >> 1, ref, dmv, t_swap);

                // mask first (cmv still in luma units, 4 fractional bits),
                // then rescale cmv for chroma
                if (bacp)
                    have_bacp |= get_mask(mask, bw4 * 4, t->bx, x, t->by, y,
                                          cmv, 4, 4, step, step, w, h);
                scaledown_16pel_mv_for_chroma(cmv, f->cur.p.p.layout);
            } else {
                for (int i = 0; i < 2; i++)
                    mc(t, NULL, &tmp[i][off_y + x * 4], bw4 * 4, step, step,
                       t->bx + x, t->by + y, 0, cmv[i], refp[i], ref.ref[i],
                       b->filter, 0, f->bw * 4, 0, f->bh * 4);

                // when refinement is disabled, each sub-block in the temporal
                // MV buffer gets its own 8x8 tip MV even if the tip blocksize
                // is 16x16 (see #945)
                update_temporal(f, &t_dst[x >> 1], t_stride,
                                step >> 1, step >> 1, ref, cmv, t_swap);
                if (step == 4 && f->frame_hdr->tip.frame_mode == 1 /* reference */) {
                    // overwrite the other three 8x8 entries (p = 1..3:
                    // bit 0 = column, bit 1 = row) with their own MVs
                    union mv dmv[2];
                    for (int p = 1; p < 4; p++) {
                        mv tmv = t->rt.rp_proj[off_8x8 + (p & 1) +
                                               ((p & 2) >> 1) * t_stride].mv;
                        if (tmv.y == INVALID_MV) tmv.n = 0;
                        for (int i = 0; i < 2; i++) {
                            const mv tipmv = scale_mv(tmv, f->rf.tip.sf[i]);
                            dmv[i].y = iclip(tipmv.y + b->mv[0].y, -0xffff, 0xffff);
                            dmv[i].x = iclip(tipmv.x + b->mv[0].x, -0xffff, 0xffff);
                        }
                        update_temporal(f, &t_dst[((p & 2) >> 1) * t_stride +
                                                  (x >> 1) + (p & 1)],
                                        t_stride, 1, 1, ref, dmv, t_swap);
                    }
                }

                // mask first (cmv still in luma units, 3 fractional bits),
                // then rescale cmv for chroma
                if (bacp)
                    have_bacp |= get_mask(mask, bw4 * 4, t->bx, x, t->by, y,
                                          cmv, 3, 3, step, step, w, h);
                scaleup_8pel_mv_for_chroma(cmv, f->cur.p.p.layout);
            }
        }
        off_y += bw4 * 4 * 4 * step;
        t_dst += (step >> 1) * t_stride;
    }

    return bacp && have_bacp;
}

// Two-reference prediction with SAD-based MV refinement (refine) and/or
// optical-flow MV refinement (opfl); at least one of the two is asserted to
// be active, and neither reference may be scaled. Fills tmp[0]/tmp[1] and
// returns non-zero iff boundary masking (bacp) is active and a mask was
// generated for at least one sub-block.
static int opfl_pred(Dav2dTaskContext *const t, int16_t (*const tmp)[64 * 64],
                     const Av2Block *const b, const int bw4, const int bh4,
                     const int w4, const int h4)
{
    const Dav2dFrameContext *const f = t->f;
    assert(!f->svc[b->ref.ref[0]][0].scale && !f->svc[b->ref.ref[1]][0].scale);
    const int refine = b->comp_type == COMP_INTER_AVG && b->refine_mv;
    const int opfl = b->inter_mode >= OPFL_NEARMV_NEARMV;
    assert(opfl || refine);
    assert(bw4 >= 2 && bh4 >= 2);
    const int w = f->bw * 4, h = f->bh * 4;
    pixel *p[2] = { bitfn(t->scratch.p)[0], bitfn(t->scratch.p)[1] };
    const ptrdiff_t p_stride =
        ((bw4 + refine * 2) * 4 * sizeof(pixel) + 63) & ~63;
    const Dav2dThreadPicture *refp[2] = {
        &f->refp[b->ref.ref[0]], &f->refp[b->ref.ref[1]]
    };
    // top edge of the reference area of the unrefined MVs, including 3 pixels
    // of filter margin; advanced per row of sub-blocks
    int top[2] = {
        t->by * 4 + (b->mv[0].y >> 3) - 3, t->by * 4 + (b->mv[1].y >> 3) - 3
    };

    // FIXME namespace bacp symbols
    uint8_t *const mask = t->scratch.seg_mask;
    const int bacp = f->seq_hdr->imp_msk_bld && b->cwp_idx == 8;
    if (bacp) memset(mask, 0x20, bw4 * bh4 * 16);
    int have_bacp = 0;

    // FIXME namespace opfl symbols
    // find reduced distance as inverse weights
    const int d0 = f->absrefdist[b->ref.ref[0]],
              d1 = f->absrefdist[b->ref.ref[1]];
    const union aliasi16 d = { .i8 = {
        apply_sign(1 + (d0 > d1), -f->refdist[b->ref.ref[0]]),
        apply_sign(1 + (d1 > d0), +f->refdist[b->ref.ref[1]]),
    }};
    // opfl unit size in 4-pixel units: 1 for 8x8 blocks, 2 otherwise
    const int bs = 2 - (b->bs == BS_8x8 /* FIXME not tip */);
    union OpflMvDeltaBlock dd[2 * 2];

    union mv (*rmv_line)[2][2] = &t->rmv[((t->by & 31)
>> 1) * 16 + ((t->bx & 31) >> 1)]; const ptrdiff_t t_stride = f->rf.rp_stride; refmvs_temporal_block *t_dst = &f->rf.rp[(t->by >> 1) * t_stride + (t->bx >> 1)]; const int t_swap = !!(f->rf.ref_flip & (1ULL << (b->ref.ref[0] * 8 + b->ref.ref[1]))); const int sh4 = imin(4, bh4), sw4 = imin(4, bw4); for (int y = 0; y < h4; y += sh4, rmv_line += 16 * sh4 >> 1) { int left[2] = { t->bx * 4 + (b->mv[0].x >> 3) - 3, t->bx * 4 + (b->mv[1].x >> 3) - 3 }; if (refine) { for (int x = 0; x < w4; x += sw4) { for (int n = 0; n < 2; n++) mc(t, p[n], NULL, p_stride, sw4 + 2, sh4 + 2, t->bx + x, t->by + y, 0, (union mv) { .y = b->mv[n].y - 32, .x = b->mv[n].x - 32 }, refp[n], b->ref.ref[n], DAV2D_FILTER_BILINEAR, iclip(left[n], 0, w - 1), iclip(left[n] + 4 * sw4 + 7, 1, w), iclip(top[n], 0, h - 1), iclip(top[n] + 4 * sh4 + 7, 1, h)); struct OpflOffset o; f->dsp->mc.sad_refine_mv(p[0], p_stride, p[1], p_stride, sw4 * 4, sh4 * 4, b->refine_mv == 2, &o HIGHBD_CALL_SUFFIX); const int dy = o.y, dx = o.x; if (opfl) { struct OpflRegressionData res[2 * 2]; // subpel-gradient based mv refinement (optical flow = opfl) f->dsp->mc.opfl_derive_mv(res, &p[0][(4 + dy) * PXSTRIDE(p_stride) + (4 + dx)], p_stride, &p[1][(4 - dy) * PXSTRIDE(p_stride) + (4 - dx)], p_stride, sw4 * 4, sh4 * 4, bs * 4, d HIGHBD_CALL_SUFFIX); const struct OpflRegressionData *r = res; for (int by = 0; by < sh4; by += 2) { for (int bx = 0; bx < sw4; bx += 2, r++) { opfl_mv_adj(r, dd, d); union mv *const mv = rmv_line[!!by * 16 + ((x + bx) >> 1)][0]; mv[0].y = b->mv[0].y * 2 + dd[0].d[0].y + dy * 16; mv[0].x = b->mv[0].x * 2 + dd[0].d[0].x + dx * 16; mv[1].y = b->mv[1].y * 2 + dd[0].d[1].y - dy * 16; mv[1].x = b->mv[1].x * 2 + dd[0].d[1].x - dx * 16; for (int i = 0; i < 2; i++) mc_opfl(t, &tmp[i][((y + by) * bw4 * 4 + x + bx) * 4], bw4 * 4, bs, bs, t->bx + x + bx, t->by + y + by, 0, mv[i], refp[i], b->filter, iclip(left[i], 0, w - 1), iclip(left[i] + sw4 * 4 + 7, 1, w), iclip(top[i], 0, h - 1), iclip(top[i] + sh4 * 4 + 7, 1, 
h)); const union mv dmv[2] = { [0] = { .y = (mv[0].y + (dd[0].d[0].y > 0)) >> 1, .x = (mv[0].x + (dd[0].d[0].x > 0)) >> 1 }, [1] = { .y = (mv[1].y + (dd[0].d[1].y > 0)) >> 1, .x = (mv[1].x + (dd[0].d[1].x > 0)) >> 1 }, }; update_temporal(f, &t_dst[((x + bx) >> 1) + !!by * t_stride], t_stride, 1, 1, b->ref, dmv, t_swap); if (bacp) have_bacp |= get_mask(mask, bw4 * 4, t->bx, x + bx, t->by, y + by, mv, 4, 4, 2, 2, w, h); scaledown_16pel_mv_for_chroma(mv, f->cur.p.p.layout); } } } else { union mv *const mv = rmv_line[x >> 1][0]; mv[0].y = b->mv[0].y + dy * 8; mv[0].x = b->mv[0].x + dx * 8; mv[1].y = b->mv[1].y - dy * 8; mv[1].x = b->mv[1].x - dx * 8; for (int i = 0; i < 2; i++) mc(t, NULL, &tmp[i][(y * 4 * bw4 + x) * 4], bw4 * 4, sw4, sh4, t->bx + x, t->by + y, 0, mv[i], refp[i], b->ref.ref[i], b->filter, iclip(left[i], 0, w - 1), iclip(left[i] + sw4 * 4 + 7, 1, w), iclip(top[i], 0, h - 1), iclip(top[i] + sh4 * 4 + 7, 1, h)); update_temporal(f, &t_dst[x >> 1], t_stride, sw4 >> 1, sh4 >> 1, b->ref, mv, t_swap); scaleup_8pel_mv_for_chroma(mv, f->cur.p.p.layout); if (bacp) have_bacp |= get_mask(mask, bw4 * 4, t->bx, x, t->by, y, mv, 3, 3, sw4, sh4, w, h); } for (int n = 0; n < 2; n++) left[n] += 16; } } else { assert(opfl); for (int n = 0; n < 2; n++) mc(t, p[n], NULL, p_stride, bw4, sh4, t->bx, t->by + y, 0, b->mv[n], refp[n], b->ref.ref[n], DAV2D_FILTER_BILINEAR, 0, w, 0, h); struct OpflRegressionData res[2 * 8]; f->dsp->mc.opfl_derive_mv(res, p[0], p_stride, p[1], p_stride, bw4 * 4, sh4 * 4, bs * 4, d HIGHBD_CALL_SUFFIX); const struct OpflRegressionData *r_line = res; union OpflMvDeltaBlock *ddl = dd; for (int by = 0; by < sh4; by += bs) { const struct OpflRegressionData *r = r_line; for (int bx = 0, xx = 0; bx < w4; bx += bs, r++, xx++) { opfl_mv_adj(r, ddl, d); union mv mv_8x8[2]; // for 8x8, the opfl blocksize is 4x4; for inter, we // only need to store the first (top/left) one, and // the rest can be discarded. 
(Not sure if this is // true for 422/444, but it's definitely true for 420.) union mv *const mv = bs == 1 && (bx || by) ? mv_8x8 : rmv_line[!!by * 16 + xx][0]; mv[0].y = b->mv[0].y * 2 + ddl->d[0].y; mv[0].x = b->mv[0].x * 2 + ddl->d[0].x; mv[1].y = b->mv[1].y * 2 + ddl->d[1].y; mv[1].x = b->mv[1].x * 2 + ddl->d[1].x; for (int i = 0; i < 2; i++) mc_opfl(t, &tmp[i][((y + by) * bw4 * 4 + bx) * 4], bw4 * 4, bs, bs, t->bx + bx, t->by + y + by, 0, mv[i], refp[i], b->filter, iclip(left[i] + bx * 4, 0, w - 1), iclip(left[i] + bx * 4 + 7 + 8, 1, w), iclip(top[i] + by * 4, 0, h - 1), iclip(top[i] + by * 4 + 7 + 8, 1, h)); if (bs > 1) { const union mv dmv[2] = { [0] = { .y = (mv[0].y + (ddl->d[0].y > 0)) >> 1, .x = (mv[0].x + (ddl->d[0].x > 0)) >> 1 }, [1] = { .y = (mv[1].y + (ddl->d[1].y > 0)) >> 1, .x = (mv[1].x + (ddl->d[1].x > 0)) >> 1 }, }; update_temporal(f, &t_dst[(bx >> 1) + !!by * t_stride], t_stride, bs >> 1, bs >> 1, b->ref, dmv, t_swap); } else { assert(b->bs == BS_8x8); ddl++; } if (bacp) have_bacp |= get_mask(mask, bw4 * 4, t->bx, bx, t->by, y + by, mv, 4, 4, bs, bs, w, h); scaledown_16pel_mv_for_chroma(mv, f->cur.p.p.layout); } r_line += bw4 >> (bs == 2); } if (bs == 1) { union mv dmv[2]; int tmp = dd[0].d[0].x + dd[1].d[0].x + dd[2].d[0].x + dd[3].d[0].x; dmv[0].x = (b->mv[0].x * 8 + tmp + 3 + (tmp > 0)) >> 3; tmp = dd[0].d[0].y + dd[1].d[0].y + dd[2].d[0].y + dd[3].d[0].y; dmv[0].y = (b->mv[0].y * 8 + tmp + 3 + (tmp > 0)) >> 3; tmp = dd[0].d[1].x + dd[1].d[1].x + dd[2].d[1].x + dd[3].d[1].x; dmv[1].x = (b->mv[1].x * 8 + tmp + 3 + (tmp > 0)) >> 3; tmp = dd[0].d[1].y + dd[1].d[1].y + dd[2].d[1].y + dd[3].d[1].y; dmv[1].y = (b->mv[1].y * 8 + tmp + 3 + (tmp > 0)) >> 3; update_temporal(f, t_dst, t_stride, 1, 1, b->ref, dmv, t_swap); } } for (int n = 0; n < 2; n++) top[n] += 4 * sh4; t_dst += t_stride * (sh4 >> 1); } return bacp && have_bacp; } static int rmv_uvpred(Dav2dTaskContext *const t, const Av2Block *const b, const int plane, const int r_step, const int 
o_step, const int bw4, const int bh4) { assert(r_step >= o_step); const Dav2dFrameContext *const f = t->f; const int ss_hor = f->ss_hor, ss_ver = f->ss_ver; const int tip = b->ref.ref[0] == TIP_FRAME; const union refpair ref = tip ? f->rf.tip.ref : b->ref; int16_t (*const tmp)[64 * 64] = t->scratch.compinter; union mv (*rmv_line)[2][2] = &t->rmv[((t->cby & 31) >> 1) * 16 + ((t->cbx & 31) >> 1)]; const ptrdiff_t stride = bw4 * 4 >> ss_hor; ptrdiff_t uvoff = 0; uint8_t *const mask = t->scratch.seg_mask; const int bacp = !plane && f->seq_hdr->imp_msk_bld && b->cwp_idx == 8; if (bacp) memset(mask, 0x20, bw4 * bh4 * 16); int have_bacp = 0; const int w = f->bw * 4 >> ss_hor, h = f->bh * 4 >> ss_hor; const int rw4 = imin(bw4, r_step), rh4 = imin(bh4, r_step); const int ow4 = imin(bw4, o_step), oh4 = imin(bh4, o_step); const int hhtaps = 2 + 2 * (rw4 > 1 + ss_hor); const int hvtaps = 2 + 2 * (rh4 > 1 + ss_ver); const int h4 = imin(bh4, f->bh - t->cby); const int w4 = imin(bw4, f->bw - t->cbx); for (int y = 0; y < h4; y += rh4, rmv_line += 16 * r_step >> 1) { for (int x = 0; x < w4; x += rw4) { union mv (*const rmv)[2] = rmv_line[x >> 1]; int top[2], left[2], bottom[2], right[2]; for (int i = 0; i < 2; i++) { top[i] = ((t->cby + y) * 4 >> ss_ver) + ((tip ? rmv[1][i].y : b->mv[i].y) >> 4); left[i] = ((t->cbx + x) * 4 >> ss_hor) + ((tip ? 
rmv[1][i].x : b->mv[i].x) >> 4); bottom[i] = iclip(top[i] + (4 * rh4 >> ss_ver) + hvtaps, 1, h); right[i] = iclip(left[i] + (4 * rw4 >> ss_hor) + hhtaps, 1, w); top[i] = iclip(top[i] + 1 - hvtaps, 0, h - 1); left[i] = iclip(left[i] + 1 - hhtaps, 0, w - 1); } ptrdiff_t uvoffi = uvoff; for (int by = 0; by < rh4; by += oh4) { for (int bx = 0; bx < rw4; bx += ow4) { union mv (*const rmv)[2] = rmv_line[!!by * 16 + ((x + bx) >> 1)]; for (int i = 0; i < 2; i++) mc_opfl(t, &tmp[i][uvoffi + ((x + bx) * 4 >> ss_hor)], stride, ow4 >> ss_hor, oh4 >> ss_ver, (t->cbx + x + bx) >> ss_hor, (t->cby + y + by) >> ss_ver, 1 + plane, rmv[0][i], &f->refp[ref.ref[i]], b->filter, left[i], right[i], top[i], bottom[i]); if (bacp) have_bacp |= get_mask(mask, bw4 * 4 >> ss_hor, t->cbx >> ss_hor, (x + bx) >> ss_hor, t->cby >> ss_ver, (y + by) >> ss_ver, rmv[0], 4, 4, ow4 >> ss_hor, oh4 >> ss_ver, f->bw * 4 >> ss_hor, f->bh * 4 >> ss_ver); } uvoffi += oh4 * 4 * stride >> ss_ver; } } uvoff += rh4 * 4 * stride >> ss_ver; } return bacp && have_bacp; } static int recon_b_luma_tx(Dav2dTaskContext *const t, DB_ONLY(const int depth) const enum RectTxfmSize tx, Av2Block *const b) { const Dav2dFrameContext *const f = t->f; const Dav2dDSPContext *const dsp = f->dsp; Dav2dTileState *const ts = t->ts; const int bx4 = t->bx & 63, by4 = t->by & 63; const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw = t_dim->w * 4, th = t_dim->h * 4; const enum IntraPredMode orig_y_mode = b->y_mode; int angle = b->y_angle; if (b->intra && !b->intrabc) b->y_mode = wide_angle_remap(t_dim, b->y_mode, &angle, b->mrl_index); // decode coefficients uint8_t cf_ctx; enum TxfmType txtp; coef *cf; int eob; int stx; if (b->skip_txfm) { cf_ctx = 0x40; txtp = DCT_DCT; eob = -1; stx = 0; if (t->task_thread.pass & PASS_ENTROPY) { dav2d_memset_pow2[t_dim->lw](&t->a->lcoef[bx4], cf_ctx); dav2d_memset_pow2[t_dim->lh](&t->l.lcoef[by4], cf_ctx); } } else if (!(t->task_thread.pass & PASS_ENTROPY)) { cf = 
ts->frame_thread[1].cf; ts->frame_thread[1].cf += imin(tw, 32) * imin(th, 32); const struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; txtp = cbi->txtp[0]; stx = txtp >> 8; txtp &= 0xff; eob = cbi->eob[0]; } else { cf = bitfn(t->cf_y); eob = decode_coefs(t, DB_ONLY(depth + 1) &t->a->lcoef[bx4], &t->l.lcoef[by4], tx, b->bs, 0, b, 0, cf, &txtp, &cf_ctx); if (eob == INT_MIN) return -1; stx = txtp >> 8; txtp = txtp & 0xff; DEBUG_BLOCK_printf("%*sPost-y_cf_blk[tx=%dx%d,txtp=%s/%s,eob=%d]: r=%d\n", depth + 1, "", tw, th, dav2d_tx1d_names[txtp & 7], dav2d_tx1d_names[txtp >> 5], eob, ts->msac.rng); dav2d_memset_likely_pow2(&t->a->lcoef[bx4], cf_ctx, imin(t_dim->w, f->bw - t->bx)); dav2d_memset_likely_pow2(&t->l.lcoef[by4], cf_ctx, imin(t_dim->h, f->bh - t->by)); uint8_t *txtp_map = &t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)]; #define set_ctx(rep_macro) \ for (int y = 0; y < t_dim->h; y++) { \ rep_macro(txtp_map, 0, txtp); \ txtp_map += 16; \ } case_set(t_dim->lw); #undef set_ctx } pixel *dst = ((pixel *) f->cur.p.data[0]) + 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx); if (b->intra && !b->intrabc && !b->pal_sz) { const int sbsz = f->sb_step; const int mrl_idx = b->mrl_index; const int mrl_mul = b->multi_mrl && tx != (int) TX_4X4; pixel *const edge = bitfn(t->scratch.edge) + 128 + !!mrl_idx * 9; const int is_hv5 = (t->by > t->pb.row_start || t->bx > t->pb.col_start) && (b->tx_part == TX_PARTITION_H5 || b->tx_part == TX_PARTITION_V5); int n_tr = 0, n_bl = 0; if (t->by > ts->tiling.row_start) { int w = imin(t_dim->w, ts->tiling.col_end - t->bx - t_dim->w); if (is_hv5) { n_tr = 0; } else if (!(t->by & (sbsz - 1))) { // top sb boundary n_tr = w; } else { const int end = imin((t->bx + sbsz) & ~(sbsz - 1), ts->tiling.col_end); w = imin(w, end - t->bx - t_dim->w); if (!w) { // right sb or tile/frame boundary n_tr = 0; } else { const int xpos = (bx4 + t_dim->w) & 63; const unsigned bits = (unsigned) (t->is_coded[0][by4 - 1] >> xpos); 
n_tr = imin(ctz(0x10000 | ~bits), w); } } } if (t->bx > ts->tiling.col_start) { const int end = imin((t->by + sbsz) & ~(sbsz - 1), ts->tiling.row_end); const int h = imin(t_dim->h, end - t->by - t_dim->h); if (is_hv5) { n_bl = 0; } else if (!h) { // bottom sb or tile/frame boundary n_bl = 0; } else if (!(t->bx & (sbsz - 1))) { // left sb boundary n_bl = h; } else { const uint64_t mask = 1ULL << ((bx4 - 1) & 63); int y; for (y = 0; y < h; y++) { if (!(t->is_coded[0][by4 + y + t_dim->h] & mask)) break; } n_bl = y; } } const pixel *const top_sb_edge = t->by & (f->sb_step - 1) ? NULL : &f->prefilter_data[0][(f->prefilter_data_full_frame ? t->by * 4 - 1 : ts->tiling.row) * PXSTRIDE(f->cur.p.stride[0])]; int apply_ibp = f->seq_hdr->ibp && tx != (enum RectTxfmSize) TX_4X4 && !mrl_idx; const int dip = b->dip - 1; const int sm_top = b->is_sm[0].a; const int sm_left = b->is_sm[0].l; const int is_sm_flag = apply_ibp ? (sm_top * ANGLE_SMOOTH_TOP_EDGE_FLAG) | (sm_left * ANGLE_SMOOTH_LEFT_EDGE_FLAG) : (sm_top | sm_left) * (ANGLE_SMOOTH_TOP_EDGE_FLAG | ANGLE_SMOOTH_LEFT_EDGE_FLAG); if (b->y_angle & 1) apply_ibp = 0; const int intra_flags = ANGLE_IS_LUMA | is_sm_flag | (f->seq_hdr->intra_edge_filter ? ANGLE_USE_EDGE_FILTER_FLAG : 0) | (apply_ibp ? ANGLE_IBP_FLAG : 0) | (mrl_idx << ANGLE_MRL_IDX_SHIFT) | (mrl_mul ? ANGLE_MULTI_MRL_FLAG : 0) | ((t->bx > ts->tiling.col_start) ? ANGLE_HAS_LEFT_FLAG : 0) | ((t->by > ts->tiling.row_start) ? ANGLE_HAS_TOP_FLAG : 0) | (dip >= 0 ? ANGLE_DIP_FLAG : 0); angle = dip >= 0 ? 
dip : angle; const enum IntraPredMode m = bytefn(dav2d_prepare_intra_edges)( DB_ONLY(BLOCK_TO_DEBUG && DEBUG_B_PIXELS) t->bx, t->by, ts->tiling.col_end, ts->tiling.row_end, n_tr, n_bl, dst, f->cur.p.stride[0], top_sb_edge, b->y_mode, t_dim->w, t_dim->h, angle | intra_flags, edge HIGHBD_CALL_SUFFIX); dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge, tw, th, angle | intra_flags, 4 * f->bw - 4 * t->bx, 4 * f->bh - 4 * t->by HIGHBD_CALL_SUFFIX); if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(dst, f->cur.p.stride[0], tw, th, "y-intra-pred"); } } if (eob != -1) { const int mask_idx = bx4 >> 4; const int mask = ((1 << t_dim->w) - 1) << (bx4 & 0xf); for (int y = 0; y < t_dim->h; y++) t->lf_mask->lr_noskip_mask[by4 + y][mask_idx] |= mask; if (stx) { if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { coef_dump(cf, 8, 8, 3, "dq"); } const int mask = (1 << HOR_PRED) | (1 << HOR_DOWN_PRED) | (1 << VERT_LEFT_PRED) | (1 << SMOOTH_H_PRED); const int transpose = b->intrabc || !b->intra || !((mask >> b->y_mode) & 1); const int type = (stx & 3) - 1; const int set = (stx >> 2) & 15; if (tw >= 8 && th >= 8) { const int8_t *kernel = &dav2d_stx_8x8_kernel[set][type][0][0]; coef sums[48]; dsp->stx.stxfm(sums, cf, kernel, 48, eob HIGHBD_CALL_SUFFIX); memset(cf, 0, 32 * sizeof(coef)); // Subtract 1 to map {8,16,32} to idx {0,1,2} const int idx = imin(t_dim->lh, 3) - 1; const uint8_t *scan_out = dav2d_stx_scan_orders_8x8[idx][transpose]; const uint8_t *mapping = dav2d_coeff8x8_mapping[set * 3 + type]; for (int x = 0; x < 48; x++) { cf[scan_out[mapping[x]]] = sums[x]; } eob = (uint8_t[]){ 63, 119, 231 }[idx]; } else { const int8_t *kernel = &dav2d_stx_4x4_kernel[set][type][0][0]; coef sums[16]; dsp->stx.stxfm(sums, cf, kernel, 16, eob HIGHBD_CALL_SUFFIX); const int idx = imin(t_dim->lh, 3); const uint8_t *scan_out = dav2d_stx_scan_orders_4x4[idx][transpose]; memset(&cf[4], 0, 4 * sizeof(coef)); for (int x = 0; x < 16; x++) { cf[scan_out[x]] = sums[x]; } eob = (uint8_t[]){ 15, 15, 51, 99 }[idx]; 
} if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "stx"); } } else { if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq"); } } if (f->frame_hdr->segmentation.lossless[b->seg_id] && b->intra && !b->intrabc && b->dpcm[0]) { txtp += (1 + (b->y_mode == VERT_PRED)) << 8; } else if (f->seq_hdr->inter_ddt && !b->intra) txtp += txtp & dav2d_tx_ddt_mask[tx]; // (flip)adst -> (f)ddt dsp->itx.itxfm_add[tx](dst, f->cur.p.stride[0], cf, txtp, eob HIGHBD_CALL_SUFFIX); if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon"); } } const uint64_t mask = ((1ULL << t_dim->w) - 1) << bx4; for (int y = 0; y < t_dim->h; y++) { t->is_coded[0][by4 + y] |= mask; } b->y_mode = orig_y_mode; return 0; } static void bawp(Dav2dTaskContext *const t, const int bawp_idx, const union mv mv, pixel *const dst, const ptrdiff_t stride, const Dav2dThreadPicture *const refp, const int refidx, const int bw4, const int bh4, const int w4, const int h4, const int plane, const enum BlockSize sb_bs) { const Dav2dFrameContext *const f = t->f; const int chroma = !!plane; const int ss_hor = f->ss_hor * chroma, ss_ver = f->ss_ver * chroma; const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; const Dav2dDSPContext *const dsp = f->dsp; const uint8_t *const sb_dim = dav2d_block_dimensions[sb_bs]; const int bx = chroma ? t->cbx : t->bx, by = chroma ? 
t->cby : t->by; if ((sb_dim[0] > (16 << ss_hor) && bx & (sb_dim[0] - 1)) || (sb_dim[1] > (16 << ss_ver) && by & (sb_dim[1] - 1))) { const int alpha = t->pb.bawp[plane].alpha, beta = t->pb.bawp[plane].beta; if (alpha != 256 || beta) dsp->mc.morph(dst, stride, alpha, beta, bw4 * h_mul, bh4 * v_mul HIGHBD_CALL_SUFFIX); return; } // defaults t->pb.bawp[plane].alpha = 256; t->pb.bawp[plane].beta = 0; Dav2dTileState *const ts = t->ts; int tile_top_edge, tile_left_edge, tile_bottom_edge, tile_right_edge; if (refp == &f->cur) { tile_top_edge = ts->tiling.row_start * v_mul; tile_left_edge = ts->tiling.col_start * h_mul; tile_bottom_edge = ts->tiling.row_end * v_mul; tile_right_edge = ts->tiling.col_end * h_mul; } else { tile_top_edge = tile_left_edge = 0; tile_bottom_edge = f->bh * v_mul; tile_right_edge = f->bw * h_mul; } const int mvx = (mv.x + 3 + (mv.x >= 0)) >> (3 + ss_hor); const int mvy = (mv.y + 3 + (mv.y >= 0)) >> (3 + ss_ver); const int ref_y = (by * v_mul + mvy); const int ref_x = (bx * h_mul + mvx); const int ref_tmplt_x = ref_x - 1; const int ref_tmplt_y = ref_y - 1; const int sb_w4 = imin(sb_dim[0], f->bw - bx); const int sb_h4 = imin(sb_dim[1], f->bh - by); const int ref_bottom_edge = ref_y + sb_h4 * v_mul; const int ref_right_edge = ref_x + sb_w4 * h_mul; const int can_morph = ref_bottom_edge <= tile_bottom_edge && ref_right_edge <= tile_right_edge && ref_tmplt_y >= tile_top_edge && ref_tmplt_x >= tile_left_edge; if (!can_morph) return; // TODO (optimization): Consider moving this code (and associated // size lookup tables) to a DSP function. SIMD could specialize on // edge sizes (4/8/16/32/64) and step values. 
static const uint8_t n_edge_samples[2 /* have edges */][3 /* h */] [3 /* w */][2 /* above, left */] = { { // !have_above || !have_left { { 2, 2 }, { 3, 2 }, { 4, 2 } }, { { 2, 3 }, { 3, 3 }, { 4, 3 } }, { { 2, 4 }, { 3, 4 }, { 4, 4 } }, }, { // have_above && have_left { { 2, 2 }, { 2, 2 }, { 4, 0 } }, { { 2, 2 }, { 3, 3 }, { 3, 3 } }, { { 0, 4 }, { 3, 3 }, { 4, 4 } }, } }; const int have_left = bx > ts->tiling.col_start; const int have_above = by > ts->tiling.row_start; const int lw4 = imin(ulog2(w4), 2) - ss_hor, lh4 = imin(ulog2(h4), 2) - ss_ver; const int idx = have_above && have_left; const int n_above_l2 = have_above * n_edge_samples[idx][lh4][lw4][0]; const int n_left_l2 = have_left * n_edge_samples[idx][lh4][lw4][1]; const pixel *const ref = &((const pixel *) refp->p.data[plane])[ref_y * PXSTRIDE(refp->p.stride[chroma]) + ref_x]; assert(n_above_l2 == 0 || n_left_l2 == 0 || n_above_l2 == n_left_l2); const int count_l2 = n_above_l2 + (n_above_l2 == n_left_l2 ? !!n_above_l2 : n_left_l2); int sum_x = 0, sum_y = 0, sum_xy = 0, sum_x2 = 0; if (n_above_l2) { const int bw = 4 << lw4; const int step = bw >> n_above_l2; assert(step > 0); const int start = step >> 1; const pixel *const top = by & (f->sb_step - 1) ? &dst[-PXSTRIDE(stride)] : &f->prefilter_data[plane][((f->prefilter_data_full_frame ? by * 4 - 1 : ts->tiling.row) >> ss_ver) * PXSTRIDE(stride) + (bx * 4 >> ss_hor)]; for (int i = start; i < bw; i += step) { const int x = ref[i - PXSTRIDE(refp->p.stride[chroma])]; const int y = top[i]; sum_x += x; sum_y += y; sum_xy += x * y; sum_x2 += x * x; } } if (n_left_l2) { const int bh = 4 << lh4; const int step = bh >> n_left_l2; assert(step > 0); const int start = step >> 1; for (int i = start; i < bh; i += step) { const int x = ref[(i * PXSTRIDE(refp->p.stride[chroma])) - 1]; const int y = dst[(i * PXSTRIDE(stride)) - 1]; sum_x += x; sum_y += y; sum_xy += x * y; sum_x2 += x * x; } } int alpha, beta; if (plane) { alpha = have_left || have_above ? 
t->pb.bawp[0].alpha : 256; } else { if (bawp_idx != 1) { assert(bawp_idx & 2); const int idx = (1 + (bawp_idx >> 2) + (f->absrefdist[refidx] > 4)) * (bawp_idx & 1 ? 1 : -1); alpha = 256 + 16 * idx; } else if (count_l2) { const int num = sum_xy - (int)(((int64_t)sum_x * sum_y) >> count_l2); const int den = sum_x2 - (int)(((int64_t)sum_x * sum_x) >> count_l2); alpha = derive_alpha(num, den, 256); } else { alpha = 256; } } t->pb.bawp[plane].alpha = alpha; if (count_l2) { const int diff = (sum_y << 8) - sum_x * alpha; const int abs_diff = abs(diff); beta = apply_sign(abs_diff >> count_l2, diff); } else { beta = -128; } t->pb.bawp[plane].beta = beta; dsp->mc.morph(dst, stride, alpha, beta, bw4 * h_mul, bh4 * v_mul HIGHBD_CALL_SUFFIX); } static void iiblend(Dav2dTaskContext *const t, const Av2Block *const b, pixel *const dst, const ptrdiff_t stride, const int plane, const int bw4, const int bh4, const int by, const int bx, const enum BlockSize ss_bs) { const Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; const Dav2dDSPContext *const dsp = f->dsp; pixel *const tl_edge = bitfn(t->scratch.edge) + 128; enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ? 
SMOOTH_PRED : b->interintra_mode; pixel *const tmp = bitfn(t->scratch.interintra); int angle = (const uint8_t[4]) { 0, 90, 180, 0 }[b->interintra_mode]; int n_tr = 0, n_bl = 0; const int chroma = !!plane; const int ss_hor = chroma * f->ss_hor, ss_ver = chroma * f->ss_ver; if (m == SMOOTH_PRED) { const int bx4 = bx & 63, by4 = by & 63, sbsz = f->sb_step; if (by > ts->tiling.row_start) { int w = imin(bw4, ts->tiling.col_end - bx - bw4); if (!(by & (sbsz - 1))) { // top sb boundary n_tr = w; } else { const int end = imin((bx + sbsz) & ~(sbsz - 1), ts->tiling.col_end); w = imin(w, end - t->bx - bw4); if (w <= 0) { // right sb or tile/frame boundary n_tr = 0; } else { // smooth pred uses 1px max n_tr = (t->is_coded[chroma][(by4 >> ss_ver) - 1] >> ((bx4 + bw4) >> ss_hor)) & 1; } } } if (bx > ts->tiling.col_start) { const int end = imin((by + sbsz) & ~(sbsz - 1), ts->tiling.row_end); const int h = imin(bh4, end - by - bh4); if (h <= 0) { // bottom sb or tile/frame boundary n_bl = 0; } else if (!(bx & (sbsz - 1))) { // left sb boundary n_bl = h; } else { // smooth pred uses 1px max n_bl = (t->is_coded[chroma][(by4 + bh4) >> ss_ver] >> ((bx4 - 1) >> ss_hor)) & 1; } } } const pixel *const top_sb_edge = by & (f->sb_step - 1) ? NULL : &f->prefilter_data[plane][(f->prefilter_data_full_frame ? (by * 4 >> ss_ver) - 1 : ts->tiling.row) * PXSTRIDE(f->cur.p.stride[!!plane])]; const int ssbw4 = bw4 >> ss_hor; const int ssbh4 = bh4 >> ss_ver; const int apply_ibp = f->seq_hdr->ibp && imax(ssbw4, ssbh4) > 1; const int intra_flags = (apply_ibp ? ANGLE_IBP_FLAG /* for dc */ : 0) | ((bx > ts->tiling.col_start) ? ANGLE_HAS_LEFT_FLAG : 0) | ((by > ts->tiling.row_start) ? 
ANGLE_HAS_TOP_FLAG : 0); m = bytefn(dav2d_prepare_intra_edges)( DB_ONLY(!plane && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) bx >> ss_hor, by >> ss_ver, ts->tiling.col_end >> ss_hor, ts->tiling.row_end >> ss_ver, n_tr, n_bl, dst, stride, top_sb_edge, m, ssbw4, ssbh4, angle | intra_flags, tl_edge HIGHBD_CALL_SUFFIX); dsp->ipred.intra_pred[m](tmp, 4 * ssbw4 * sizeof(pixel), tl_edge, ssbw4 * 4, ssbh4 * 4, intra_flags, 0, 0 HIGHBD_CALL_SUFFIX); if (!plane && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(tmp, ssbw4 * 4 * sizeof(pixel), ssbw4 * 4, ssbh4 * 4, "y-intra-pred"); if (0) hex_dump(dst, stride, ssbw4 * 4, ssbh4 * 4, "inter-pred"); } const uint8_t *const mask = b->wedge_idx == -1 ? II_MASK(ss_bs, ssbw4, ssbh4, b->interintra_mode) : WEDGE_MASK(ss_bs, bw4, bh4, b->wedge_idx, ss_hor + ss_ver); dsp->mc.blend(dst, stride, tmp, ssbw4 * 4, ssbh4 * 4, mask); } static inline void cfl(Dav2dTaskContext *const t, const Av2Block *const b, const enum BlockSize bs, const TxfmInfo *const t_dim) { const Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; const Dav2dDSPContext *const dsp = f->dsp; const enum Dav2dPixelLayout layout = f->cur.p.p.layout - 1; const ptrdiff_t ystride = f->cur.p.stride[0]; const ptrdiff_t cstride = f->cur.p.stride[1]; const int sbsz = f->sb_step; const int ss_hor = f->ss_hor, ss_ver = f->ss_ver; const int ssbx = t->cbx >> ss_hor, ssby = t->cby >> ss_ver; const int has_top = t->cby > ts->tiling.row_start; const int has_left = t->cbx > ts->tiling.col_start; const int is_top_sb_edge = !(t->cby & (sbsz - 1)); const int ctw4 = imin(t_dim->w, (f->bw - t->cbx + ss_hor) >> ss_hor); const int cth4 = imin(t_dim->h, (f->bh - t->cby + ss_ver) >> ss_ver); const int ctw = t_dim->w * 4, cth = t_dim->h * 4; const int filter_type = f->seq_hdr->cfl_ds_filter_index; pixel *const ysrc = ((pixel *) f->cur.p.data[0]) + (t->cby * PXSTRIDE(ystride) + t->cbx) * 4; pixel *const ytop_sb_edge = !is_top_sb_edge ? 
NULL : &f->prefilter_data[0][(f->prefilter_data_full_frame ? t->cby * 4 - 1 : ts->tiling.row) * PXSTRIDE(ystride) + t->cbx * 4]; if (b->cfl_type < CFL_MHCCP) { // CFL EXPLICIT / IMPLICIT const ptrdiff_t off = (ssby * PXSTRIDE(cstride) + ssbx) * 4; pixel *const usrc = ((pixel *) f->cur.p.data[1]) + off; pixel *const vsrc = ((pixel *) f->cur.p.data[2]) + off; pixel *const ytop = is_top_sb_edge ? ytop_sb_edge : ysrc - (1 + ss_ver) * PXSTRIDE(ystride); const ptrdiff_t top_off = f->prefilter_data_full_frame ? off - PXSTRIDE(cstride) : ts->tiling.row * PXSTRIDE(cstride) + ssbx * 4; pixel *const utop = is_top_sb_edge ? &f->prefilter_data[1][top_off] : usrc - PXSTRIDE(cstride); pixel *const vtop = is_top_sb_edge ? &f->prefilter_data[2][top_off] : vsrc - PXSTRIDE(cstride); pixel *const ptrs[6] = { ytop, utop, vtop, ysrc, usrc, vsrc }; const int cbw4 = (dav2d_block_dimensions[bs][0] + ss_hor) >> ss_hor; const int cbh4 = (dav2d_block_dimensions[bs][1] + ss_ver) >> ss_ver; const int wpad = cbw4 - ctw4; const int hpad = cbh4 - cth4; const unsigned flags = filter_type | (t->cby > ts->tiling.row_start ? CFL_HAS_TOP : 0) | (t->cbx > ts->tiling.col_start ? CFL_HAS_LEFT : 0) | (is_top_sb_edge ? 
CFL_IS_TOP_SB_EDGE : 0) | (((unsigned)b->cfl_alpha[0] << CFL_ALPHA_U_SHIFT) & CFL_ALPHA_U_MASK) | (((unsigned)b->cfl_alpha[1] << CFL_ALPHA_V_SHIFT) & CFL_ALPHA_V_MASK); dsp->ipred.cfl_pred[b->cfl_type][layout](ptrs, f->cur.p.stride, wpad, hpad, ctw, cth, flags HIGHBD_CALL_SUFFIX); if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(ptrs[1], cstride, ctw, cth, "u-intra-pred"); hex_dump(ptrs[2], cstride, ctw, cth, "v-intra-pred"); } } else { // CFL MHCCP const int cbx4 = (t->cbx & 63) >> ss_hor, cby4 = (t->cby & 63) >> ss_ver; ALIGN(pixel luma[CFL_MHCCP_MAX_LUMA_SIZE], 64); int refw = ctw4 * 4, refh = cth4 * 4, luma_top_stride; uint16_t imat[2][CFL_MHCCP_MAX_EDGE_SAMPLES]; int32_t mat[3][3] = { 0 }; int n_tr = 0, n_bl = 0; if (has_top) { const int csbsz = sbsz >> ss_hor; const int tile_end = ts->tiling.col_end >> ss_hor; int w = imax(0, imin(ctw4, tile_end - ssbx - ctw4)); if (is_top_sb_edge) { n_tr = w; } else { const int end = imin((ssbx + csbsz) & ~(csbsz - 1), tile_end); w = imin(ctw4, end - ssbx - ctw4); if (!w) { // right sb boundary n_tr = 0; } else { const unsigned bits = (unsigned) (t->is_coded[1][cby4 - 1] >> (cbx4 + ctw4)); n_tr = imin(ctz(0x10000 | ~bits), w); } } refw += n_tr * 4; } int subleft = 0; if (has_left) { const int csbsz = sbsz >> ss_ver; const int end = imax(0, imin((ssby + csbsz) & ~(csbsz - 1), ts->tiling.row_end >> ss_ver)); const int h = imin(cth4, end - ssby - cth4); if (!(t->cbx & (sbsz - 1)) || !h) { // left or bottom sb boundary n_bl = h; } else { const uint64_t mask = 1ULL << (cbx4 - 1); for (; n_bl < h; n_bl++) if (!(t->is_coded[1][cby4 + n_bl + cth4] & mask)) break; } refh += n_bl * 4; refw += 2; subleft = b->cfl_mh_dir != CFL_DIR_LEFT; } if (refw > (128 >> ss_hor)) { refw = 128 >> ss_hor; subleft = 0; } refh = imin(refh, (128 >> ss_ver) - 2 * has_top); luma_top_stride = (refw * sizeof(pixel) + 63) & ~63; const int edge_flags = (has_top ? CFL_HAS_TOP : 0) | (has_left ? CFL_HAS_LEFT : 0) | (is_top_sb_edge ? 
CFL_IS_TOP_SB_EDGE : 0); dsp->ipred.cfl_gen_y[layout][filter_type](luma, luma_top_stride, ysrc, ytop_sb_edge, ystride, refw - subleft, refh, ctw, cth, edge_flags | b->cfl_mh_dir); refh += has_top; if (has_top || has_left) dsp->ipred.cfl_gen_mat[b->cfl_mh_dir]( mat, imat, luma, luma_top_stride, refw, refh, edge_flags HIGHBD_CALL_SUFFIX); for (int pl = 1; pl <= 2; pl++) { int alpha[3] = { 0 }; pixel *chroma = ((pixel *) f->cur.p.data[pl]) + 4 * (ssby * PXSTRIDE(cstride) + ssbx); const pixel *const ctop_sb_edge = is_top_sb_edge ? &f->prefilter_data[pl][(f->prefilter_data_full_frame ? 4 * ssby - 1 : ts->tiling.row) * PXSTRIDE(cstride) + ssbx * 4] : NULL; if (has_top || has_left) { dsp->ipred.cfl_calc_alphas(alpha, chroma, ctop_sb_edge, cstride, refw, refh, mat, imat, edge_flags HIGHBD_CALL_SUFFIX); } else { // XXX optimize for no edge case? (single const alpha) alpha[2] = 0x10000; } const int n_top = has_top ? has_top + (b->cfl_mh_dir == CFL_DIR_TOP) : 0; const pixel *const src = luma + n_top * PXSTRIDE(luma_top_stride); dsp->ipred.cfl_mhccp_pred[b->cfl_mh_dir](chroma, cstride, src, luma_top_stride, ctw, cth, alpha, edge_flags HIGHBD_CALL_SUFFIX); if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(chroma, cstride, ctw, cth, pl == 1 ? "u-intra-pred" : "v-intra-pred"); } } } } int bytefn(dav2d_recon_b)(Dav2dTaskContext *const t, DB_ONLY(const int depth) const enum BlockSize lbs, // [0] = coef reading, [1] = reconstruction const enum BlockSize cbs_stage[2], Av2Block *const b) { Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; const Dav2dDSPContext *const dsp = f->dsp; const enum BlockSize cbs = cbs_stage[cbs_stage[0] == BS_INVALID]; const enum BlockSize bs = lbs == BS_INVALID ? 
cbs : lbs; assert(cbs_stage[0] == cbs_stage[1] || ((cbs_stage[0] == BS_INVALID || cbs_stage[1] == BS_INVALID) && (lbs == BS_64x64 && (f->ss_ver || f->ss_hor)))); assert(bs != BS_INVALID); const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int bw4 = b_dim[0], bh4 = b_dim[1]; const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); const int ss_hor = f->ss_hor, ss_ver = f->ss_ver; const uint8_t csplit[3][3] = { [BS_128x128 - BS_128x128] = { BS_64x64, BS_128x64, BS_128x128 }, [BS_128x64 - BS_128x128] = { BS_64x64, BS_128x64, BS_128x64 }, [BS_64x128 - BS_128x128] = { BS_64x64, BS_64x64, BS_64x128 }, }; if (imax(bw4, bh4) > 16) { assert(bw4 * 2 >= bh4 && bh4 * 2 >= bw4); // 1:2, 1:1 or 2:1 ratios only assert(t->cbx == t->bx && t->cby == t->by); const int y_start = t->by, y_end = imin(y_start + bh4, f->bh); const int x_start = t->bx, x_end = imin(x_start + bw4, f->bw); int step; enum BlockSize lbs2, cbs2i; if (imax(bw4, bh4) == 64) { step = 32; lbs2 = lbs == BS_INVALID ? BS_INVALID : BS_128x128; cbs2i = cbs == BS_INVALID ? BS_INVALID : BS_128x128; } else { step = 16; lbs2 = lbs == BS_INVALID ? BS_INVALID : BS_64x64; cbs2i = cbs == BS_INVALID ? BS_INVALID : csplit[cbs - BS_128x128][ss_hor + ss_ver]; } for (int y = 0; t->by < y_end; t->by += step, y++) { for (int x = 0; t->bx < x_end; t->bx += step, x++) { enum BlockSize cbs2[2]; if (step == 32) { cbs2[0] = cbs2[1] = cbs2i; } else { // coef reading is done with the first luma 64x64 cbs2[0] = !((x & ss_hor) | (y & ss_ver)) ? cbs2i : BS_INVALID; // reconstruction should be done with the last luma 64x64, // so that COMP_INTER_SEG or refine-mv work correctly cbs2[1] = (!ss_hor || t->bx + step >= x_end) && (!ss_ver || t->by + step >= y_end) ? 
cbs2i : BS_INVALID; } const int res = bytefn(dav2d_recon_b)(t, DB_ONLY(depth) lbs2, cbs2, b); if (step == 32) { t->cbx += step; } else if ((x & ss_hor) == ss_hor) { t->cbx += step << ss_hor; } if (res < 0) { t->cbx = t->bx = x_start; t->cby = t->by = y_start; return res; } } t->cbx = t->bx = x_start; if (step == 32) { t->cby += step; } else if ((y & ss_ver) == ss_ver) { t->cby += step << ss_ver; } } t->cby = t->by = y_start; return 0; } if (lbs == BS_INVALID) goto chroma; const int8_t *const tp = dav2d_tx_part_tbl[bs]; if (tp[b->tx_part] == -1) return -1; pixel *const dst = ((pixel *) f->cur.p.data[0]) + 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx); if (b->intrabc) { mc(t, dst, NULL, f->cur.p.stride[0], bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, 0 /* unused */, DAV2D_FILTER_BILINEAR, 0, f->bw * 4, 0, f->bh * 4); if (b->morph_pred) bawp(t, 1, b->mv[0], dst, f->cur.p.stride[0], &f->cur, 0 /* unused */, bw4, bh4, w4, h4, 0, b->bs); if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(dst, f->cur.p.stride[0], imin(f->bw - t->bx, bw4) * 4, imin(f->bh - t->by, bh4) * 4, "y-pred"); } } else if (!b->intra) { if (b->ref.ref[1] == -1 && b->ref.ref[0] != TIP_FRAME) { const Dav2dThreadPicture *const refp = &f->refp[b->ref.ref[0]]; if (!f->frame_hdr->force_integer_mv && ((b->inter_mode == GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[0]]) || (b->motion_mode >= MM_WARP_CAUSAL && t->warpmv[0].type > DAV2D_WM_TYPE_INVALID))) { warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp, b->motion_mode >= MM_WARP_CAUSAL ? 
&t->warpmv[0] : &f->frame_hdr->gmv.m[b->ref.ref[0]]); } else { mc(t, dst, NULL, f->cur.p.stride[0], bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref.ref[0], b->filter, 0, f->bw * 4, 0, f->bh * 4); } if (b->bawp[0]) { bawp(t, b->bawp[0], b->mv[0], dst, f->cur.p.stride[0], refp, b->ref.ref[0], bw4, bh4, w4, h4, 0, b->bs); } else if (b->motion_mode == MM_INTERINTRA || b->warp_ii) { iiblend(t, b, dst, f->cur.p.stride[0], 0, bw4, bh4, t->by, t->bx, bs); } } else { int16_t (*const tmp)[64 * 64] = t->scratch.compinter; int bacp; if (b->ref.ref[0] == TIP_FRAME) { bacp = tip_pred(t, tmp, b, bw4, bh4, w4, h4); } else if (b->inter_mode >= OPFL_NEARMV_NEARMV || (b->refine_mv && b->comp_type == COMP_INTER_AVG)) { bacp = opfl_pred(t, tmp, b, bw4, bh4, w4, h4); } else { bacp = 2 * (f->seq_hdr->imp_msk_bld && b->motion_mode != MM_WARP_CAUSAL && b->inter_mode != GLOBALMV_GLOBALMV && !f->svc[b->ref.ref[0]][0].scale && !f->svc[b->ref.ref[1]][0].scale); for (int i = 0; i < 2; i++) { const Dav2dThreadPicture *const refp = &f->refp[b->ref.ref[i]]; if ((b->inter_mode == GLOBALMV_GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[i]]) || (b->motion_mode == MM_WARP_CAUSAL && t->warpmv[i].type > DAV2D_WM_TYPE_INVALID)) { warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp, b->motion_mode >= MM_WARP_CAUSAL ? &t->warpmv[i] : &f->frame_hdr->gmv.m[b->ref.ref[i]]); } else { mc(t, NULL, tmp[i], bw4 * 4, bw4, bh4, t->bx, t->by, 0, b->mv[i], refp, b->ref.ref[i], b->filter, 0, f->bw * 4, 0, f->bh * 4); } if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) ac_dump(tmp[i], bw4 * 4, bh4 * 4, "y-single-pred"); } } switch (b->comp_type) { case COMP_INTER_WEDGE: { const uint8_t *const mask = WEDGE_MASK(bs, bw4, bh4, b->wedge_idx, 0); dsp->mc.mask(dst, f->cur.p.stride[0], tmp[b->wedge_sign], tmp[!b->wedge_sign], bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); break; } case COMP_INTER_SEG: { const int chr_layout_idx = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I400 ? 
0 : DAV2D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout; const ptrdiff_t mask_stride = imin(dav2d_block_dimensions[b->bs][0] * 4 >> f->ss_hor, 64); uint8_t *const seg_mask = imin(bw4, bh4) < 16 ? t->scratch.seg_mask : &t->scratch.seg_mask[((t->by >> f->ss_ver) & 15) * 4 * mask_stride + ((t->bx >> f->ss_hor) & 15) * 4]; dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0], tmp[b->mask_sign], tmp[!b->mask_sign], bw4 * 4, bh4 * 4, seg_mask, mask_stride, b->mask_sign HIGHBD_CALL_SUFFIX); break; } default: assert(0); case COMP_INTER_NONE: assert(b->ref.ref[0] == TIP_FRAME); // fall-through case COMP_INTER_AVG: { const int wt = b->cwp_idx; if (wt == 8) { if (bacp == 2) bacp = get_mask(t->scratch.seg_mask, bw4 * 4, t->bx, 0, t->by, 0, b->mv, 3, 3, bw4, bh4, f->bw * 4, f->bh * 4); if (bacp) { dsp->mc.mask(dst, f->cur.p.stride[0], tmp[0], tmp[1], bw4 * 4, bh4 * 4, t->scratch.seg_mask HIGHBD_CALL_SUFFIX); } else { dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1], bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX); } } else { dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1], bw4 * 4, bh4 * 4, wt HIGHBD_CALL_SUFFIX); } break; }} } if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) hex_dump(dst, f->cur.p.stride[0], imin(f->bw - t->bx, bw4) * 4, imin(f->bh - t->by, bh4) * 4, "y-pred"); } else if (b->pal_sz) { const uint8_t *pal_idx; const pixel *pal; if (t->task_thread.pass != PASS_ALL) { const int p = !!(t->task_thread.pass & PASS_ENTROPY); assert(ts->frame_thread[p].pal_idx); pal_idx = ts->frame_thread[p].pal_idx; ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; pal = *ts->frame_thread[0].pal++; } else { pal_idx = t->scratch.pal_idx_y; pal = bytefn(t->scratch.pal); } f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal, pal_idx, bw4 * 4, bh4 * 4); if (BLOCK_TO_DEBUG && DEBUG_B_PIXELS) hex_dump(dst, f->cur.p.stride[0], imin(f->bw - t->bx, bw4) * 4, imin(f->bh - t->by, bh4) * 4, "y-pal-pred"); } // luma enum RectTxfmSize tx = tp[b->tx_part]; t->pb.col_start = t->bx; t->pb.row_start = t->by; if 
(f->frame_hdr->segmentation.lossless[b->seg_id]) { int res = 0, y, x; tx = b->tx_size_ll ? dav2d_max_txfm_size_for_bs[bs][3] : (int) TX_4X4; const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w, th4 = t_dim->h; for (y = 0; y < h4 && !res; y += th4, t->by += th4) { for (x = 0; x < w4 && !res; x += tw4, t->bx += tw4) { res = recon_b_luma_tx(t, DB_ONLY(depth) (int) tx, b); } t->bx -= x; } t->by -= y; if (res < 0) return res; } else switch (b->tx_part) { case TX_PARTITION_NONE: { const int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; break; } case TX_PARTITION_SPLIT: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w, th4 = t_dim->h; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_v_split = t->bx + tw4 < f->bw; if (have_v_split) { t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= tw4; if (res < 0) return res; } if (t->by + th4 >= f->bh) break; t->by += th4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (!res && have_v_split) { t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= tw4; } t->by -= th4; if (res < 0) return res; break; } case TX_PARTITION_H: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int th4 = t_dim->h; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->by + th4 >= f->bh) break; t->by += th4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->by -= th4; if (res < 0) return res; break; } case TX_PARTITION_V: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->bx + tw4 >= f->bw) break; t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= tw4; if (res < 0) return res; break; } case TX_PARTITION_H4: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int th4 = t_dim->h; int res = 
recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->by + th4 >= f->bh) break; t->by += th4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0 || t->by + th4 >= f->bh) { t->by -= th4; } else { t->by += th4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0 || t->by + th4 >= f->bh) { t->by -= 2 * th4; } else { t->by += th4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->by -= 3 * th4; } } if (res < 0) return res; break; } case TX_PARTITION_V4: { const TxfmInfo *const t_dim = &dav2d_txfm_dimensions[tx]; const int tw4 = t_dim->w; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; if (t->bx + tw4 >= f->bw) break; t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0 || t->bx + tw4 >= f->bw) { t->bx -= tw4; } else { t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0 || t->bx + tw4 >= f->bw) { t->bx -= 2 * tw4; } else { t->bx += tw4; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= 3 * tw4; } } if (res < 0) return res; break; } case TX_PARTITION_H5: { const enum RectTxfmSize tx_big = tp[TX_PARTITION_H]; const TxfmInfo *const t_dim_small = &dav2d_txfm_dimensions[tx], *const t_dim_big = &dav2d_txfm_dimensions[tx_big]; const int tw4_small = t_dim_small->w, th4_small = t_dim_small->h; const int th4_big = t_dim_big->h; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_v_split = t->bx + tw4_small < f->bw; if (have_v_split) { t->bx += tw4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= tw4_small; if (res < 0) return res; } if (t->by + th4_small >= f->bh) break; t->by += th4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx_big, b); if (res < 0 || t->by + th4_big >= f->bh) { t->by -= th4_small; } else { t->by += th4_big; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (!res && have_v_split) { t->bx += tw4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->bx -= tw4_small; } t->by -= th4_small 
+ th4_big; } if (res < 0) return res; break; } case TX_PARTITION_V5: { const enum RectTxfmSize tx_big = tp[TX_PARTITION_V]; const TxfmInfo *const t_dim_small = &dav2d_txfm_dimensions[tx], *const t_dim_big = &dav2d_txfm_dimensions[tx_big]; const int tw4_small = t_dim_small->w, th4_small = t_dim_small->h; const int tw4_big = t_dim_big->w; int res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (res < 0) return res; const int have_h_split = t->by + th4_small < f->bh; if (have_h_split) { t->by += th4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->by -= th4_small; if (res < 0) return res; } if (t->bx + tw4_small >= f->bw) break; t->bx += tw4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx_big, b); if (res < 0 || t->bx + tw4_big >= f->bw) { t->bx -= tw4_small; } else { t->bx += tw4_big; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); if (!res && have_h_split) { t->by += th4_small; res = recon_b_luma_tx(t, DB_ONLY(depth) tx, b); t->by -= th4_small; } t->bx -= tw4_small + tw4_big; } if (res < 0) return res; break; } default: assert(0); } if (cbs == BS_INVALID) return 0; // chroma chroma: {} const uint8_t *const cb_dim = dav2d_block_dimensions[cbs]; const int cbw4 = cb_dim[0], cw4 = imin(f->bw - t->cbx, cbw4); const int cbh4 = cb_dim[1], ch4 = imin(f->bh - t->cby, cbh4); const int cbw4ss = (cbw4 + ss_hor) >> ss_hor, cbh4ss = (cbh4 + ss_ver) >> ss_ver; const int cw4ss = (cw4 + ss_hor) >> ss_hor, ch4ss = (ch4 + ss_ver) >> ss_ver; const enum RectTxfmSize uvtx = f->frame_hdr->segmentation.lossless[b->seg_id] ? 
(int) TX_4X4 : dav2d_max_txfm_size_for_bs[cbs][DAV2D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout]; const TxfmInfo *const uv_t_dim = &dav2d_txfm_dimensions[uvtx]; const int ctw4 = imin(uv_t_dim->w, (f->bw - t->cbx + ss_hor) >> ss_hor); const int cth4 = imin(uv_t_dim->h, (f->bh - t->cby + ss_ver) >> ss_ver); const int ctw = uv_t_dim->w * 4, cth = uv_t_dim->h * 4; const int bx4 = t->cbx & 63, by4 = t->cby & 63; const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; const int ssbx = t->cbx >> ss_hor, ssby = t->cby >> ss_ver; const ptrdiff_t stride = f->cur.p.stride[1]; const ptrdiff_t uvdstoff = 4 * (ssby * PXSTRIDE(stride) + ssbx); const int sbsz = f->sb_step; const int sdp_active = lbs == BS_INVALID; const int intra = b->intra && (sdp_active || !b->intrabc); const int skip_txfm = !sdp_active && b->skip_txfm; const enum IntraPredMode orig_uv_mode = b->uv_mode; int angle = b->uv_angle; if (intra) b->uv_mode = wide_angle_remap(uv_t_dim, b->uv_mode, &angle, 0); if (cbs_stage[0] != BS_INVALID) { if (!(t->task_thread.pass & PASS_ENTROPY)) { if (!skip_txfm) { uint16_t /*enum TxfmType*/ (*const txtp)[2] = t->chroma_txtp; int16_t (*const uv_eob)[2] = t->chroma_eob; for (int y = 0; y < ch4ss; y += uv_t_dim->h) { for (int x = 0; x < cw4ss; x += uv_t_dim->w) { const ptrdiff_t i = y * cbw4ss + x; const struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[(t->cby + (y << ss_ver)) * f->b4_stride + (t->cbx + (x << ss_hor))]; for (int pl = 0; pl < 2; pl++) { txtp[i][pl] = cbi->txtp[1 + pl]; uv_eob[i][pl] = cbi->eob[1 + pl]; } } } t->cf_uv = ts->frame_thread[1].cf; ts->frame_thread[1].cf += cbw4ss * cbh4ss * 16 * 2; } } else if (skip_txfm) { for (int pl = 0; pl < 2; pl++) { dav2d_memset_likely_pow2(&t->a->ccoef[pl][cbx4], 0x40, cw4ss); dav2d_memset_likely_pow2(&t->l.ccoef[pl][cby4], 0x40, ch4ss); } } else { enum TxfmType y_txtp = t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)]; uint16_t /*enum TxfmType*/ (*const txtp)[2] = t->chroma_txtp; int16_t (*const uv_eob)[2] = t->chroma_eob; uint8_t 
cf_ctx[2]; coef (*const cf)[64 * 64] = bitfn(t->cf_uv); // decode coefficients for (int pl = 0; pl < 2; pl++) { int y; for (y = 0; y < ch4ss; y += uv_t_dim->h) { int x; for (x = 0; x < cw4ss; x += uv_t_dim->w) { const ptrdiff_t i = y * cbw4ss + x; if (b->bs == b->cbs) y_txtp = t->txtp_map[(t->by & 15) * 16 + (t->bx & 15)]; enum TxfmType uv_txtp = y_txtp; const int eob = decode_coefs(t, DB_ONLY(depth + 1) &t->a->ccoef[pl][cbx4 + x], &t->l.ccoef[pl][cby4 + y], uvtx, b->cbs, sdp_active, b, pl + 1, &cf[pl][i * 16], &uv_txtp, &cf_ctx[pl]); if (!pl) t->u_has_cf = eob >= 0; txtp[i][pl] = uv_txtp; if (eob == INT_MIN) return -1; DEBUG_BLOCK_printf("%*sPost-%c_cf_blk[tx=%dx%d,txtp=%s/%s," "eob=%d]: r=%d\n", depth + 1, "", "uv"[pl], ctw, cth, dav2d_tx1d_names[uv_txtp & 7], dav2d_tx1d_names[(uv_txtp >> 5) & 7], eob, t->ts->msac.rng); uv_eob[i][pl] = eob; dav2d_memset_likely_pow2(&t->a->ccoef[pl][cbx4 + x], cf_ctx[pl], ctw4); dav2d_memset_likely_pow2(&t->l.ccoef[pl][cby4 + y], cf_ctx[pl], cth4); t->bx += uv_t_dim->w << ss_hor; } t->bx -= x << ss_hor; t->by += uv_t_dim->h << ss_ver; } t->by -= y << ss_ver; } } if (cbs_stage[1] == BS_INVALID) { b->uv_mode = orig_uv_mode; return 0; } } if (intra) { if (b->uv_mode == CFL_PRED) cfl(t, b, cbs, uv_t_dim); } else if (!sdp_active && b->intrabc) { for (int pl = 0; pl < 2; pl++) { mc(t, ((pixel *)f->cur.p.data[1 + pl]) + uvdstoff, NULL, stride, cbw4, cbh4, t->cbx, t->cby, 1 + pl, b->mv[0], &f->cur, 0 /* unused */, DAV2D_FILTER_BILINEAR, 0, f->bw * 4 >> ss_hor, 0, f->bh * 4 >> ss_ver); // FIXME morph_pred? if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) hex_dump(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, pl ? 
"v-pred" : "u-pred"); } } else if (cbs != lbs && imin(bw4, bh4) < 16) { // sub8x8 coding if (f->c->task_thread.n_passes == 3) { uint16_t mask[16] = { 0 }; ptrdiff_t uvoff = uvdstoff; for (int y = 0; y < ch4; y++, uvoff += 4 * PXSTRIDE(stride) >> ss_ver) { for (int x = 0, m = 1; x < cw4; x++, m <<= 1) { if (mask[y] & m) continue; // for 3-pass coding, refmvs is active for the mv-resolution // (2nd) pass and cannot be reused in the reconstruction (3rd) // pass, so instead we use the block coding array. // for 2-pass coding, this codepath and the spatial refmvs // codepath a few lines down should both be valid. const Av2Block *const b2 = &f->frame_thread.b[(t->cby + y) * f->b4_stride + t->cbx + x]; const int ref = b2->ref.ref[0]; const union mv mv = b2->mv[0]; const Dav2dThreadPicture *const refp = &f->refp[ref]; const uint8_t *const sdim = dav2d_block_dimensions[b2->bs]; for (int pl = 0; pl < 2; pl++) { mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvoff + (x * 4 >> ss_hor), NULL, stride, sdim[0], sdim[1], t->cbx + x, t->cby + y, 1 + pl, mv, refp, ref, b2->filter, 0, f->bw * 4 >> ss_hor, 0, f->bh * 4 >> ss_ver); } const unsigned m2 = ((1 << sdim[0]) - 1) << x; for (int yy = y; yy < y + sdim[1]; yy++) mask[yy] |= m2; } } } else { const refmvs_block *r = &t->rt.r[(t->cby & 63) * 128 + (t->cbx & 127)]; ptrdiff_t uvoff = uvdstoff; for (int y = 0; y < ch4; y++, r += 128, uvoff += 4 * PXSTRIDE(stride) >> ss_ver) { for (int x = 0; x < cw4; x++) { // grab ref/MV/filter from spatial refmvs const refmvs_block *const r2 = &r[x]; if (r2->ox4 || r2->oy4) continue; const int ref = r2->ref.ref[0]; const union mv mv = r2->mf & 2 ? 
r2->lmv[0] : r2->mv[0]; const Dav2dThreadPicture *const refp = &f->refp[ref]; const uint8_t *const sdim = dav2d_block_dimensions[r2->bs]; for (int pl = 0; pl < 2; pl++) { mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvoff + (x * 4 >> ss_hor), NULL, stride, sdim[0], sdim[1], t->cbx + x, t->cby + y, 1 + pl, mv, refp, ref, r2->subpel_filter, 0, f->bw * 4 >> ss_hor, 0, f->bh * 4 >> ss_ver); } } } } if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) for (int pl = 0; pl < 2; pl++) hex_dump(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, cw4 * 4 >> ss_hor, ch4 * 4 >> ss_ver, pl ? "v-pred" : "u-pred"); } else if (b->ref.ref[1] == -1 && b->ref.ref[0] != TIP_FRAME) { const Dav2dThreadPicture *const refp = &f->refp[b->ref.ref[0]]; for (int pl = 0; pl < 2; pl++) { pixel *const dst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff; if (!f->frame_hdr->force_integer_mv && ((b->inter_mode == GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[0]]) || (b->motion_mode >= MM_WARP_CAUSAL && t->warpmv[0].type > DAV2D_WM_TYPE_INVALID))) { warp_affine(t, dst, NULL, stride, cb_dim, 1 + pl, refp, b->motion_mode >= MM_WARP_CAUSAL ? &t->warpmv[0] : &f->frame_hdr->gmv.m[b->ref.ref[0]]); } else { mc(t, dst, NULL, stride, cbw4, cbh4, t->cbx, t->cby, 1 + pl, b->mv[0], refp, b->ref.ref[0], b->filter, 0, f->bw * 4 >> ss_hor, 0, f->bh * 4 >> ss_ver); } if (b->bawp[1]) { bawp(t, 1, b->mv[0], dst, f->cur.p.stride[1], refp, b->ref.ref[0], cbw4, cbh4, cw4, ch4, pl + 1, b->bs); } else if (b->motion_mode == MM_INTERINTRA || b->warp_ii) { iiblend(t, b, dst, stride, 1 + pl, cbw4, cbh4, t->cby, t->cbx, b->wedge_idx == -1 ? dav2d_ss_bs[cbs][f->cur.p.p.layout - 1] : cbs); } if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) hex_dump(dst, stride, cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, pl ? 
"v-pred" : "u-pred"); } } else /* compound-inter */ { int16_t (*tmp)[64 * 64] = t->scratch.compinter; for (int pl = 0, bacp, bacpu; pl < 2; pl++, bacpu = bacp, bacp = 0) { if (b->ref.ref[0] == TIP_FRAME) { const int opfl = f->seq_hdr->tip_refine_mv && (f->frame_hdr->tip.frame_mode == 1 || f->frame_hdr->tip.subpel_filter == DAV2D_FILTER_8TAP_SHARP); const int step = 2 << (f->frame_hdr->tip.frame_mode == 2 /* frame */ ? !opfl : ((!opfl && imin(bw4, bh4) >= 4) || b->bs == BS_256x256)); bacp = rmv_uvpred(t, b, pl, step, step, cbw4, cbh4); } else if (b->inter_mode >= OPFL_NEARMV_NEARMV || (b->refine_mv && b->comp_type == COMP_INTER_AVG)) { const int refine = b->comp_type == COMP_INTER_AVG && b->refine_mv; const int opfl = b->inter_mode >= OPFL_NEARMV_NEARMV; bacp = rmv_uvpred(t, b, pl, 2 << refine, 4 >> opfl, cbw4, cbh4); } else { if (!pl) bacp = 2 * (f->seq_hdr->imp_msk_bld && b->motion_mode != MM_WARP_CAUSAL && b->inter_mode != GLOBALMV_GLOBALMV && !f->svc[b->ref.ref[0]][0].scale && !f->svc[b->ref.ref[1]][0].scale); for (int i = 0; i < 2; i++) { const Dav2dThreadPicture *const refp = &f->refp[b->ref.ref[i]]; if ((b->inter_mode == GLOBALMV_GLOBALMV && imin(bw4, bh4) > 1 && f->gmv_warp_allowed[b->ref.ref[i]]) || (b->motion_mode == MM_WARP_CAUSAL && t->warpmv[i].type > DAV2D_WM_TYPE_INVALID)) { warp_affine(t, NULL, tmp[i], cbw4 * 4 >> ss_hor, cb_dim, 1 + pl, refp, b->motion_mode >= MM_WARP_CAUSAL ? &t->warpmv[i] : &f->frame_hdr->gmv.m[b->ref.ref[i]]); } else { mc(t, NULL, tmp[i], cbw4 * 4 >> ss_hor, cbw4, cbh4, t->cbx, t->cby, 1 + pl, b->mv[i], refp, b->ref.ref[i], b->filter, 0, f->bw * 4 >> ss_hor, 0, f->bh * 4 >> ss_ver); } } } switch (b->comp_type) { case COMP_INTER_SEG: { const ptrdiff_t mask_stride = cbw4 * 4 >> ss_hor; assert(mask_stride <= 64); uint8_t *const seg_mask = imin(cbw4, cbh4) < 16 ? 
t->scratch.seg_mask : &t->scratch.seg_mask[(ssby & 15) * 4 * mask_stride + (ssbx & 15) * 4]; dsp->mc.mask(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, tmp[b->mask_sign], tmp[!b->mask_sign], cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, seg_mask HIGHBD_CALL_SUFFIX); break; } case COMP_INTER_WEDGE: { const uint8_t *const mask = WEDGE_MASK(cbs, cbw4, cbh4, b->wedge_idx, ss_hor + ss_ver); dsp->mc.mask(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, tmp[b->wedge_sign], tmp[!b->wedge_sign], cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, mask HIGHBD_CALL_SUFFIX); break; } default: assert(0); case COMP_INTER_NONE: assert(b->ref.ref[0] == TIP_FRAME); // fall-through case COMP_INTER_AVG: { const int wt = b->cwp_idx; if (wt == 8) { if (bacp == 2) bacp = get_mask(t->scratch.seg_mask, cbw4 * 4 >> ss_hor, t->cbx >> ss_hor, 0, t->cby >> ss_ver, 0, b->mv, 3 + ss_hor, 3 + ss_ver, cbw4 >> ss_hor, cbh4 >> ss_ver, f->bw * 4 >> ss_hor, f->bh * 4 >> ss_ver); if (pl) bacp = bacpu; if (bacp) { dsp->mc.mask(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, tmp[0], tmp[1], cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, t->scratch.seg_mask HIGHBD_CALL_SUFFIX); } else { dsp->mc.avg(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, tmp[0], tmp[1], cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver HIGHBD_CALL_SUFFIX); } } else { dsp->mc.w_avg(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, tmp[0], tmp[1], cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, wt HIGHBD_CALL_SUFFIX); } break; }} if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) hex_dump(((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, stride, cbw4 * 4 >> ss_hor, cbh4 * 4 >> ss_ver, pl ? 
"v-pred" : "u-pred"); } } // x/y recon loop for (int y = 0; y < ch4ss; y += uv_t_dim->h) { for (int x = 0; x < cw4ss; x += uv_t_dim->w) { const ptrdiff_t i = y * cbw4ss + x; for (int pl = 0; pl < 2; pl++) { pixel *const dst = ((pixel *) f->cur.p.data[1 + pl]) + 4 * ((ssby + y) * PXSTRIDE(stride) + ssbx + x); if (intra && b->uv_mode != CFL_PRED) { // intra prediction pixel *const edge = bitfn(t->scratch.edge) + 128; // We're skipping upsampling y here as this condition is // only true when y is 0. const pixel *const top_sb_edge = (t->cby + y) & (sbsz - 1) ? NULL : &f->prefilter_data[1 + pl][(f->prefilter_data_full_frame ? (t->cby * 4 - 1) >> ss_ver : ts->tiling.row) * PXSTRIDE(f->cur.p.stride[1])]; int n_tr = 0, n_bl = 0; if (t->cby + (y << ss_ver) > ts->tiling.row_start && ctw < 64) { const int csbsz = sbsz >> ss_hor; const int tile_end = ts->tiling.col_end >> ss_hor; int w = imin(ctw4, tile_end - (ssbx + x) - ctw4); if (!((t->cby + y) & (sbsz - 1))) { n_tr = w; // top sb boundary } else { const int end = imin((ssbx + x + csbsz) & ~(csbsz - 1), tile_end); int w = imin(ctw4, end - (ssbx + x) - ctw4); if (!w) { // right sb boundary n_tr = w; } else { const unsigned bits = (unsigned) (t->is_coded[1][cby4 + y - 1] >> (cbx4 + x + ctw4)); n_tr = imin(ctz(0x10000 | ~bits), w); } } } if (t->cbx + (x << ss_hor) > ts->tiling.col_start && cth < 64) { const int csbsz = sbsz >> ss_ver; const int end = imin((ssby + y + csbsz) & ~(csbsz - 1), ts->tiling.row_end >> ss_ver); const int h = imin(cth4, end - (ssby + y) - cth4); if (!((t->cbx + x) & (sbsz - 1)) || !h) { // left or bottom sb boundary n_bl = h; } else { const uint64_t mask = 1ULL << (cbx4 + x - 1); for (; n_bl < h; n_bl++) if (!(t->is_coded[1][cby4 + y + n_bl + cth4] & mask)) break; } } int apply_ibp = f->seq_hdr->ibp && uvtx != (enum RectTxfmSize) TX_4X4; const int sm_top = b->is_sm[1].a; const int sm_left = b->is_sm[1].l; const int is_sm_flag = apply_ibp ? 
(sm_top * ANGLE_SMOOTH_TOP_EDGE_FLAG) | (sm_left * ANGLE_SMOOTH_LEFT_EDGE_FLAG) : (sm_top | sm_left) * (ANGLE_SMOOTH_TOP_EDGE_FLAG | ANGLE_SMOOTH_LEFT_EDGE_FLAG); apply_ibp &= b->uv_mode == DC_PRED; int intra_flags = is_sm_flag | (apply_ibp ? ANGLE_IBP_FLAG : 0) | (f->seq_hdr->intra_edge_filter ? ANGLE_USE_EDGE_FILTER_FLAG : 0) | ((t->cbx + (x << ss_hor) > ts->tiling.col_start) ? ANGLE_HAS_LEFT_FLAG : 0) | ((t->cby + (y << ss_ver) > ts->tiling.row_start) ? ANGLE_HAS_TOP_FLAG : 0); const enum IntraPredMode uv_mode = b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode; const enum IntraPredMode m = bytefn(dav2d_prepare_intra_edges)( // don't print chroma as avm does things in a different order // (decode coefs of both planes first then pred + itx) DB_ONLY(0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) ssbx + x, ssby + y, ts->tiling.col_end >> ss_hor, ts->tiling.row_end >> ss_ver, n_tr, n_bl, dst, stride, top_sb_edge, uv_mode, uv_t_dim->w, uv_t_dim->h, angle | intra_flags, edge HIGHBD_CALL_SUFFIX); dsp->ipred.intra_pred[m](dst, stride, edge, ctw, cth, angle | intra_flags, 4 * f->bw - 4 * (t->cbx + x), 4 * f->bh - 4 * (t->cby + y) HIGHBD_CALL_SUFFIX); if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { hex_dump(dst, stride, ctw, cth, pl ? "v-intra-pred" : "u-intra-pred"); } } } if (!skip_txfm) { const int cctx = f->seq_hdr->cctx && (f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420 || uv_t_dim->max < 8); uint16_t /*enum TxfmType*/ (*const txtp)[2] = t->chroma_txtp; int16_t (*const eob)[2] = t->chroma_eob; coef *cf[2]; if (!(t->task_thread.pass & PASS_ENTROPY)) { cf[0] = t->cf_uv; cf[1] = &t->cf_uv[cbw4ss * cbh4ss * 16]; } else { cf[0] = bitfn(t->cf_uv)[0]; cf[1] = bitfn(t->cf_uv)[1]; } // FIXME I'm not convinced the above is correct for pass==2 if we // do block coding vs. reconstruction in different orders for // e.g. 128x128 chroma blocks. int cctx_type = cctx && eob[i][0] >= intra ? 
(txtp[i][0] >> 8) : 0; if (cctx_type) { dsp->itx.cctx(&cf[0][i * 16], &cf[1][i * 16], dav2d_cctx_angle[cctx_type - 1], umin(ctw, 32) * umin(cth, 32) HIGHBD_CALL_SUFFIX); const int gt = eob[i][1] > eob[i][0]; eob[i][!gt] = eob[i][gt]; txtp[i][1] = txtp[i][0] &= 0xff; } // inverse transform for (int pl = 0; pl < 2; pl++) { if (eob[i][pl] != -1) { // don't print chroma as avm does things in a different order // (decode coefs of both planes first then pred + itx) if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { coef_dump(&cf[pl][i * 16], imin(cth, 32), imin(ctw, 32), 3, "dq"); } pixel *const dst = ((pixel *) f->cur.p.data[1 + pl]) + 4 * ((ssby + y) * PXSTRIDE(stride) + ssbx + x); if (f->frame_hdr->segmentation.lossless[b->seg_id] && b->intra && (sdp_active || !b->intrabc) && b->dpcm[1]) { txtp[i][pl] += (1 + (b->uv_mode == VERT_PRED)) << 8; } else if (f->seq_hdr->inter_ddt && !b->intra) // (flip)adst -> (f)ddt txtp[i][pl] += txtp[i][pl] & dav2d_tx_ddt_mask[uvtx]; dsp->itx.itxfm_add[uvtx](dst, stride, &cf[pl][i * 16], txtp[i][pl], eob[i][pl] HIGHBD_CALL_SUFFIX); } } } for (int pl = 1; pl <= 2; pl++) { if (0 && BLOCK_TO_DEBUG && DEBUG_B_PIXELS) { const pixel *const dst = ((pixel *) f->cur.p.data[pl]) + 4 * ((ssby + y) * PXSTRIDE(stride) + ssbx + x); hex_dump(dst, stride, ctw, cth, "recon"); } } const uint64_t mask = ((1ULL << ctw4) - 1) << (cbx4 + x); for (int yy = 0; yy < cth4; yy++) t->is_coded[1][cby4 + y + yy] |= mask; } } b->uv_mode = orig_uv_mode; return 0; } void bytefn(dav2d_filter_sbrow_deblock_cols)(Dav2dFrameContext *const f, const int sby) { if (!(f->c->inloop_filters & DAV2D_INLOOPFILTER_DEBLOCK) || (!f->frame_hdr->deblock.level_y[0] && !f->frame_hdr->deblock.level_y[1])) { return; } const int y = sby * f->sb_step * 4; const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; pixel *const p[3] = { f->lf.p[0] + y * PXSTRIDE(f->cur.p.stride[0]), f->lf.p[1] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), f->lf.p[2] + (y * PXSTRIDE(f->cur.p.stride[1]) >> 
ss_ver) }; Av2Filter *mask = f->lf.mask + (sby >> (2 - f->frame_hdr->sb128)) * f->sb256w; const int start_of_tile_row = f->lf.start_of_tile_row[sby]; bytefn(dav2d_deblock_sbrow_cols)(f, p, mask, sby, start_of_tile_row & 1 ? start_of_tile_row >> 1 : 0); } void bytefn(dav2d_filter_sbrow_deblock_rows)(Dav2dFrameContext *const f, const int sby) { const int y = sby * f->sb_step * 4; const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; pixel *const p[3] = { f->lf.p[0] + y * PXSTRIDE(f->cur.p.stride[0]), f->lf.p[1] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), f->lf.p[2] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver) }; Av2Filter *mask = f->lf.mask + (sby >> (2 - f->frame_hdr->sb128)) * f->sb256w; if (f->c->inloop_filters & DAV2D_INLOOPFILTER_DEBLOCK && (f->frame_hdr->deblock.level_y[0] || f->frame_hdr->deblock.level_y[1])) { bytefn(dav2d_deblock_sbrow_rows)(f, p, mask, sby); } if ((f->seq_hdr->cdef && f->c->inloop_filters & DAV2D_INLOOPFILTER_CDEF) || (f->lf.restore_planes && f->c->inloop_filters & (DAV2D_INLOOPFILTER_WIENER | DAV2D_INLOOPFILTER_GDF))) { // Store deblocked pixels required by CDEF / LR bytefn(dav2d_copy_db)(f, p, sby); } } void bytefn(dav2d_filter_sbrow_cdef)(Dav2dTaskContext *const tc, const int sby) { const Dav2dFrameContext *const f = tc->f; if (!(f->c->inloop_filters & (DAV2D_INLOOPFILTER_CDEF | DAV2D_INLOOPFILTER_CCSO))) return; const int sbsz = f->sb_step; const int y = sby * sbsz * 4; const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; pixel *const p[3] = { f->lf.p[0] + y * PXSTRIDE(f->cur.p.stride[0]), f->lf.p[1] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), f->lf.p[2] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver) }; Av2Filter *prev_mask = f->lf.mask + ((sby - 1) >> (2 - f->frame_hdr->sb128)) * f->sb256w; Av2Filter *mask = f->lf.mask + (sby >> (2 - f->frame_hdr->sb128)) * f->sb256w; const int start = sby * sbsz; if (sby) { const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; pixel *p_up[3] = { p[0] - 8 * 
PXSTRIDE(f->cur.p.stride[0]), p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), }; bytefn(dav2d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby); } const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); const int end = imin(start + n_blks, f->bh); bytefn(dav2d_cdef_brow)(tc, p, mask, start, end, 0, sby); } void bytefn(dav2d_filter_sbrow_lr)(Dav2dFrameContext *const f, const int sby) { if (!(f->c->inloop_filters & (DAV2D_INLOOPFILTER_WIENER | DAV2D_INLOOPFILTER_GDF))) return; const int y = sby * f->sb_step * 4; const int ss_ver = f->cur.p.p.layout == DAV2D_PIXEL_LAYOUT_I420; pixel *const p[3] = { f->lf.p[0] + y * PXSTRIDE(f->cur.p.stride[0]), f->lf.p[1] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver), f->lf.p[2] + (y * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver) }; bytefn(dav2d_lr_sbrow)(f, p, sby); } void bytefn(dav2d_filter_sbrow)(Dav2dFrameContext *const f, const int sby) { bytefn(dav2d_filter_sbrow_deblock_cols)(f, sby); bytefn(dav2d_filter_sbrow_deblock_rows)(f, sby); if (f->seq_hdr->cdef) bytefn(dav2d_filter_sbrow_cdef)(f->c->tc, sby); if (f->lf.restore_planes) bytefn(dav2d_filter_sbrow_lr)(f, sby); } void bytefn(dav2d_backup_prefilter_data)(Dav2dTaskContext *const t) { const Dav2dFrameContext *const f = t->f; assert (f->c->n_tc > 1); Dav2dTileState *const ts = t->ts; const int y_end = imin(ts->tiling.row_end, t->by + f->sb_step) * 4; const int x_off = ts->tiling.col_start * 4, sz = ts->tiling.col_end * 4 - x_off; const int ss_ver = f->ss_ver, ss_hor = f->ss_hor; const int uv_y_end = y_end >> ss_ver, uv_x_off = x_off >> ss_hor, uv_sz = sz >> ss_hor; if (f->frame_hdr->allow_intrabc) { // make a complete copy of block reconstruction. The original data will // be used as prefilter-data for intrabc and intra prediction, and the // copy will be used for postfilter and output to the application/user. 
for (int y = t->by * 4; y < y_end; y++) { const ptrdiff_t off = y * PXSTRIDE(f->cur.p.stride[0]) + x_off; pixel_copy(&f->lf.p[0][off], &((const pixel *)f->cur.p.data[0])[off], sz); } if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400) { for (int y = t->by * 4 >> ss_ver; y < uv_y_end; y++) { const ptrdiff_t off = y * PXSTRIDE(f->cur.p.stride[1]) + uv_x_off; for (int pl = 1; pl <= 2; pl++) pixel_copy(&f->lf.p[pl][off], &((const pixel *)f->cur.p.data[pl])[off], uv_sz); } } } else { if (t->by + f->sb_step >= ts->tiling.row_end) return; // make a copy of the final line in the tile-sbrow, which can be used for // intra prediction. The original buffer will be for postfilter and output // to the application/user. const ptrdiff_t off1 = ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]) + x_off; const ptrdiff_t off2 = ts->tiling.row * PXSTRIDE(f->cur.p.stride[0]) + x_off; pixel_copy(&f->prefilter_data[0][off2], &((const pixel *)f->cur.p.data[0])[off1], sz); if (f->cur.p.p.layout != DAV2D_PIXEL_LAYOUT_I400) { const ptrdiff_t uv_off1 = (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]) + uv_x_off; const ptrdiff_t uv_off2 = ts->tiling.row * PXSTRIDE(f->cur.p.stride[1]) + uv_x_off; for (int pl = 1; pl <= 2; pl++) pixel_copy(&f->prefilter_data[pl][uv_off2], &((const pixel *)f->cur.p.data[pl])[uv_off1], uv_sz); } } } void bytefn(dav2d_copy_pal_block_y)(Dav2dTaskContext *const t, const int bx4, const int by4, const int bw4, const int bh4) { pixel *const pal = t->task_thread.pass != PASS_ALL ? 
*t->ts->frame_thread[1].pal++ : bytefn(t->scratch.pal); for (int x = 0; x < bw4; x++) memcpy(bytefn(t->al_pal)[0][bx4 + x], pal, 8 * sizeof(pixel)); for (int y = 0; y < bh4; y++) memcpy(bytefn(t->al_pal)[1][by4 + y], pal, 8 * sizeof(pixel)); } void bytefn(dav2d_read_pal_plane)(DB_ONLY(const int depth) Dav2dTaskContext *const t, Av2Block *const b, const int bx4, const int by4) { Dav2dTileState *const ts = t->ts; const Dav2dFrameContext *const f = t->f; const int pal_sz = b->pal_sz = dav2d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.pal_sz, 6) + 2; // don't reuse above palette outside SB64 boundaries const int a_cache = by4 & 15 ? t->a->pal_sz[bx4] : 0; const int l_cache = t->l.pal_sz[by4]; const pixel *l = bytefn(t->al_pal)[1][by4], *a = bytefn(t->al_pal)[0][bx4]; // find cached entries (but don't load them yet) const int n_cache = l_cache + a_cache; int n_used_cache = 0; unsigned cache_reuse_mask = 0; int off = 0; for (int n = imin(n_cache, pal_sz); n; off += n, n = imin(n_cache - off, pal_sz - n_used_cache)) { const unsigned m = dav2d_msac_decode_bools_bypass(&ts->msac, n); cache_reuse_mask <<= n; cache_reuse_mask |= m; n_used_cache += popcnt(m); } pixel cache[8]; if (n_used_cache) { if (!l_cache) { #define select(dir) \ /* directly copy the selected cache entries into cache[] */ \ assert(!(cache_reuse_mask & ~0xff)); \ unsigned mask = cache_reuse_mask << (32 - off); \ int i = 0, n = 0; \ do { \ int n_zero = clz(mask); \ cache[i++] = dir[n + n_zero++]; \ n += n_zero; \ mask <<= n_zero; \ } while (mask) select(a); } else if (!a_cache) { select(l); #undef select } else { // sort selected cache entries from a & l into cache[] const int min_n = imin(a_cache, l_cache); unsigned mask = cache_reuse_mask << (32 - off); const unsigned rem_mask = mask << (min_n * 2) >> min_n; unsigned shared_mask = mask - (rem_mask >> min_n); shared_mask = (shared_mask & 0xaaaa0000) | ((shared_mask & 0x55550000) >> 15); shared_mask |= shared_mask << 1; shared_mask &= 0xcccccccc; 
shared_mask |= shared_mask << 2; shared_mask &= 0xf0f0f0f0; shared_mask |= shared_mask << 4; const int a_gt_l = a_cache > l_cache; unsigned a_mask = (shared_mask & 0xff000000) + a_gt_l * rem_mask; unsigned l_mask = ((shared_mask & 0xff00) << 16) + !a_gt_l * rem_mask; int i = 0, a_n = 0, l_n = 0; if (a_mask && l_mask) { #define cnt_zero(dir) do { \ const int n_zero = clz(dir##_mask); \ dir##_n += n_zero; \ dir##_mask <<= n_zero; \ } while (0) cnt_zero(a); cnt_zero(l); for (;;) { assert((a_mask & l_mask) & 0x80000000); if (a[a_n] < l[l_n]) { #define consume(dir) \ cache[i++] = dir[dir##_n]; \ dir##_mask <<= 1; \ if (!dir##_mask) break; \ const int n_zero = clz(dir##_mask); \ dir##_n += 1 + n_zero; \ dir##_mask <<= n_zero consume(a); } else { consume(l); } } } assert(i < pal_sz); if (a_mask) { cnt_zero(a); for (;;) { consume(a); } } else { cnt_zero(l); for (;;) { consume(l); #undef cnt_zero #undef consume } } } } // parse new entries pixel *const pal = t->task_thread.pass != PASS_ALL ? *ts->frame_thread[1].pal : bytefn(t->scratch.pal); if (n_used_cache < pal_sz) { int i = n_used_cache; const int bpc = BITDEPTH == 8 ? 
8 : f->cur.p.p.bpc; int prev = pal[i++] = dav2d_msac_decode_bools_bypass(&ts->msac, bpc); if (i < pal_sz) { int bits = bpc - 3 + dav2d_msac_decode_bools_bypass(&ts->msac, 2); const int max = (1 << bpc) - 1; do { const int delta = dav2d_msac_decode_bools_bypass(&ts->msac, bits); prev = pal[i++] = imin(prev + delta + 1, max); if (prev + 1 >= max) { for (; i < pal_sz; i++) pal[i] = max; break; } bits = imin(bits, 1 + ulog2(max - prev - 1)); } while (i < pal_sz); } // merge selected cache & new entries into pal while sorting cache if (n_used_cache) { int n = 0, m = n_used_cache; for (i = 0; i < pal_sz; i++) { if (n < n_used_cache && (m >= pal_sz || cache[n] <= pal[m])) { pal[i] = cache[n++]; } else { assert(m < pal_sz); pal[i] = pal[m++]; } } } } else { pixel_copy(pal, cache, pal_sz); } #if DEBUG_BLOCK_INFO #define bitmask(x) /* emulate %b (up to 16 bits) */ \ ((uint64_t) ((x) & 0x8000) << 45) | \ ((uint64_t) ((x) & 0x4000) << 42) | \ ((uint64_t) ((x) & 0x2000) << 39) | \ ((uint64_t) ((x) & 0x1000) << 36) | \ ((uint64_t) ((x) & 0x800) << 33) | \ ((uint64_t) ((x) & 0x400) << 30) | \ ((uint64_t) ((x) & 0x200) << 27) | \ ((uint64_t) ((x) & 0x100) << 24) | \ ((uint64_t) ((x) & 0x80) << 21) | \ ((uint64_t) ((x) & 0x40) << 18) | \ ((uint64_t) ((x) & 0x20) << 15) | \ ((uint64_t) ((x) & 0x10) << 12) | \ ((uint64_t) ((x) & 0x8) << 9) | \ ((uint64_t) ((x) & 0x4) << 6) | \ ((uint64_t) ((x) & 0x2) << 3) | \ ((uint64_t) ((x) & 0x1) << 0) if (BLOCK_TO_DEBUG) { printf("%*sPost-ypal[sz=%d,cache_sz=%d,mask=%0*"PRIx64"|%d]: r=%d, cache=", depth, "", pal_sz, n_cache, off, bitmask(cache_reuse_mask), n_used_cache, ts->msac.rng); const int min_n = imin(a_cache, l_cache), max_n = n_cache - min_n; for (int n = 0; n < min_n; n++) printf("%c"PIX_HEX_FMT","PIX_HEX_FMT, n ? ',' : '[', a[n], l[n]); const pixel *dir = a_cache > l_cache ? a : l; for (int n = min_n; n < max_n; n++) printf("%c"PIX_HEX_FMT, n ? ',' : '[', dir[n]); printf("%s, pal=", n_cache ? 
"]" : "[]"); for (int n = 0; n < pal_sz; n++) printf("%c"PIX_HEX_FMT, n ? ',' : '[', pal[n]); printf("]\n"); } #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ref.c000066400000000000000000000061201517466257200215520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include "src/ref.h" static void default_free_callback(const uint8_t *const data, void *const user_data) { assert(data == user_data); dav2d_free_aligned(user_data); } Dav2dRef *dav2d_ref_create(const enum AllocationType type, size_t size) { size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); uint8_t *const data = dav2d_alloc_aligned(type, size + sizeof(Dav2dRef), 64); if (!data) return NULL; Dav2dRef *const res = (Dav2dRef*)(data + size); res->const_data = res->user_data = res->data = data; atomic_init(&res->ref_cnt, 1); res->free_ref = 0; res->free_callback = default_free_callback; return res; } static void pool_free_callback(const uint8_t *const data, void *const user_data) { dav2d_mem_pool_push((Dav2dMemPool*)data, user_data); } Dav2dRef *dav2d_ref_create_using_pool(Dav2dMemPool *const pool, size_t size) { void *const buf = dav2d_mem_pool_pop(pool, size); if (!buf) return NULL; /* Store Dav2dRef inside the Dav2dMemPoolBuffer alignment padding */ assert(sizeof(Dav2dMemPoolBuffer) + sizeof(Dav2dRef) <= 64); Dav2dRef *const res = &((Dav2dRef*)buf)[-1]; res->data = buf; res->const_data = pool; atomic_init(&res->ref_cnt, 1); res->free_ref = 0; res->free_callback = pool_free_callback; res->user_data = buf; return res; } void dav2d_ref_dec(Dav2dRef **const pref) { assert(pref != NULL); Dav2dRef *const ref = *pref; if (!ref) return; *pref = NULL; if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { const int free_ref = ref->free_ref; ref->free_callback(ref->const_data, ref->user_data); if (free_ref) dav2d_free(ref); } } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/ref.h000066400000000000000000000054111517466257200215610ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_REF_H #define DAV2D_SRC_REF_H #include "dav2d/dav2d.h" #include "src/mem.h" #include "src/thread.h" #include #include struct Dav2dRef { void *data; const void *const_data; atomic_int ref_cnt; int free_ref; void (*free_callback)(const uint8_t *data, void *user_data); void *user_data; }; #if !TRACK_HEAP_ALLOCATIONS #define dav2d_ref_create(type, size) dav2d_ref_create(size) #endif Dav2dRef *dav2d_ref_create(enum AllocationType type, size_t size); Dav2dRef *dav2d_ref_create_using_pool(Dav2dMemPool *pool, size_t size); void dav2d_ref_dec(Dav2dRef **ref); static inline Dav2dRef *dav2d_ref_init(Dav2dRef *const ref, const void *const ptr, void (*const free_callback)(const uint8_t *data, void *user_data), void *const user_data, const int free_ref) { ref->data = NULL; ref->const_data = ptr; atomic_init(&ref->ref_cnt, 1); ref->free_ref = free_ref; ref->free_callback = free_callback; ref->user_data = user_data; return ref; } static inline void dav2d_ref_inc(Dav2dRef *const ref) { atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed); } static inline int dav2d_ref_is_writable(Dav2dRef *const ref) { return atomic_load(&ref->ref_cnt) == 1 && ref->data; } #endif /* DAV2D_SRC_REF_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/refmvs.c000066400000000000000000003314531517466257200223120ustar00rootroot00000000000000/* * Copyright © 2020-2026, VideoLAN and dav2d authors * Copyright © 2020-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

/* NOTE(review): the three system header names were missing here; restored
 * from the printf/abs/memcpy uses in this file - confirm against upstream. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dav2d/common.h"

#include "common/frame.h"
#include "common/intops.h"

#include "src/env.h"
#include "src/mem.h"
#include "src/refmvs.h"

#define DEBUG_REFMV 0
#if DEBUG_BLOCK_INFO && DEBUG_REFMV
#define RDB_ONLY(...) __VA_ARGS__
#define DB_ARGS(...) __VA_ARGS__,
#define DEBUG_REFMV_printf(...) \
    if (DEBUG_REFMV && BLOCK_TO_DEBUG_S(rf->frm_hdr->frame_offset, by4, bx4)) { \
        printf(__VA_ARGS__); \
    }
#else
#define RDB_ONLY(...)
#define DB_ARGS(...)
#define DEBUG_REFMV_printf(...) \
do {} while (0) #endif static int add_candidate_sngl(DB_ARGS(const refmvs_frame *const rf, const int by4, const int bx4, const int y_off, const int x_off, const char *const tag, const int refidx) refmvs_candidate *const mvstack, int *const cnt, const int max_cnt, const int weight, const mv cand_mv, const int y_off_s, const int x_off_s, int *const iter_cntr, const int max_iter) { const int last = *cnt; RDB_ONLY(int did_check = 0); if (iter_cntr[0] < max_iter) { for (int m = 0; m < last; m++) if (mvstack[m].mv[0].n == cand_mv.n) { iter_cntr[0] += m + 1; mvstack[m].weight += weight; DEBUG_REFMV_printf("%s[%d:%d]: increasing[%d] y=%d,x=%d,w+=%d " "at offset y=%d,x=%d\n", tag, refidx, iter_cntr[0], m, cand_mv.y, cand_mv.x, weight, y_off, x_off); return 0; } RDB_ONLY(did_check = 1); iter_cntr[0] += last; } if (last >= max_cnt) return 0; mvstack[last].mv[0] = cand_mv; mvstack[last].weight = weight; mvstack[last].y_off = y_off_s; mvstack[last].x_off = x_off_s; DEBUG_REFMV_printf("%s[%d:%d]: %s[%d] y=%d,x=%d,w=%d,y_off=%d,x_off=%d at " "offset y=%d,x=%d\n", tag, refidx, iter_cntr[0], did_check ? 
"adding" : "tailing", *cnt, cand_mv.y, cand_mv.x, weight, y_off_s, x_off_s, y_off, x_off); *cnt = last + 1; return 1; } static void add_candidate_c2s(DB_ARGS(const refmvs_frame *const rf, const int by4, const int bx4, const int y_off, const int x_off, const char *const tag, const int refidx) refmvs_sngl_mv_block *const mvstack, int *const cnt, const int max_cnt, const int ref, const mv cand_mv, int *const iter_cntr, const int max_iter) { const int last = *cnt; RDB_ONLY(int did_check = 0); if (*iter_cntr < max_iter) { for (int m = 0; m < last; m++) if (mvstack[m].mv.n == cand_mv.n && mvstack[m].ref == ref) { *iter_cntr += m + 1; DEBUG_REFMV_printf("%s[%d:%d]: skipping[%d] y=%d,x=%d,r=%d " "at offset y=%d,x=%d\n", tag, refidx, *iter_cntr, m, cand_mv.y, cand_mv.x, ref, y_off, x_off); return; } RDB_ONLY(did_check = 1); *iter_cntr += last; } if (last >= max_cnt) return; mvstack[last].mv = cand_mv; mvstack[last].ref = ref; DEBUG_REFMV_printf("%s[%d:%d]: %s[%d] y=%d,x=%d,r=%d at offset y=%d,x=%d\n", tag, refidx, *iter_cntr, did_check ? 
"adding" : "tailing", *cnt, cand_mv.y, cand_mv.x, ref, y_off, x_off); *cnt = last + 1; } static int add_candidate_comp(DB_ARGS(const refmvs_frame *const rf, const int by4, const int bx4, const int y_off, const int x_off, const char *const tag) refmvs_candidate *const mvstack, int *const cnt, const int max_cnt, const int weight, const int cwp_idx, const mv cand_mv[2], int *const iter_cntr, const int max_iter) { const int last = *cnt; RDB_ONLY(int did_check = 0); if (iter_cntr[0] < max_iter) { for (int n = 0; n < last; n++) if (!CMP2MV(mvstack[n].mv, cand_mv)) { iter_cntr[0] += n + 1; mvstack[n].weight += weight; DEBUG_REFMV_printf("%s-c[%d]: increasing[%d] y=%d,x=%d," "y2=%d,x2=%d,w+=%d at offset y=%d,x=%d\n", tag, iter_cntr[0], n, cand_mv[0].y, cand_mv[0].x, cand_mv[1].y, cand_mv[1].x, weight, y_off, x_off); return 0; } RDB_ONLY(did_check = 1); iter_cntr[0] += last; } if (last >= max_cnt) return 0; COPY2MV(mvstack[last].mv, cand_mv); mvstack[last].weight = weight; mvstack[last].cwp_idx = cwp_idx; DEBUG_REFMV_printf("%s-c[%d]: %s[%d] y=%d,x=%d,y2=%d,x2=%d,w=%d at offset " "y=%d,x=%d\n", tag, iter_cntr[0], did_check ? "adding" : "tailing", *cnt, cand_mv[0].y, cand_mv[0].x, cand_mv[1].y, cand_mv[1].x, weight, y_off, x_off); *cnt = last + 1; return 1; } struct refmvs_state { refmvs_candidate dr[6], *mv; refmvs_sngl_mv_block sngl[4]; int drvd_cnt, sngl_cnt, *cnt; int drvd_iter_cntr, sngl_iter_cntr, iter_cntr; ptrdiff_t b8x8; #if DEBUG_BLOCK_INFO && DEBUG_REFMV int bx4, by4; #endif }; static void add_spatial_candidate(const int y_off, const int x_off, const refmvs_tile *const rt, struct refmvs_state *const st, const int weight, const refmvs_block *const b, ptrdiff_t off_y_8x8, ptrdiff_t off_x_8x8, const union refpair ref, const mv gmv[2]) { if (*st->cnt >= 6) return; if (b->mv[0].y == INVALID_MV) return; // intra block, no intrabc const refmvs_frame *const rf = rt->rf; if (b->ref.ref[0] == TIP_FRAME) { const int tip16 = rf->frm_hdr->tip.frame_mode == 2 ? 
!rf->seq_hdr->tip_refine_mv || rf->frm_hdr->tip.subpel_filter != DAV2D_FILTER_8TAP_SHARP : (!rf->seq_hdr->tip_refine_mv && imin(dav2d_block_dimensions[b->bs][0], dav2d_block_dimensions[b->bs][1]) >= 4) || b->bs == BS_256x256; const int tip16m = ~tip16; // FIXME this should be relative to the block's top/left position off_y_8x8 &= tip16m; off_x_8x8 &= tip16m; } const ptrdiff_t off_8x8 = rf->rp_stride * off_y_8x8 + off_x_8x8; if (ref.ref[1] == -1) { const int num = 1 + (ref.ref[0] >= 0); for (int n = 0; n < num; n++) { if (b->ref.ref[n] == ref.ref[0]) { const mv cand_mv = ((b->mf & 1) && gmv[0].y != INVALID_MV) ? gmv[0] : b->mv[n]; add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "spc", n) st->mv, st->cnt, 6, weight, cand_mv, y_off, x_off, &st->iter_cntr, 16); } else if (b->ref.ref[0] == TIP_FRAME && rf->tip.ref.ref[n] == ref.ref[0]) { union mv tmv = rt->rp_proj[off_8x8].mv; if (tmv.y == INVALID_MV) tmv.n = 0; const mv tipmv = scale_mv(tmv, rf->tip.sf[n]); const mv cand_mv = (mv) { .y = iclip(tipmv.y + b->mv[0].y, -0xffff, 0xffff), .x = iclip(tipmv.x + b->mv[0].x, -0xffff, 0xffff), }; add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "tip-spc", n) st->mv, st->cnt, 6, weight, cand_mv, y_off, x_off, &st->iter_cntr, 16); } else if (ref.ref[0] == TIP_FRAME && b->ref.pair == rf->tip.ref.pair) { const mv in_delta = (mv) { .y = b->mv[0].y - b->mv[1].y, .x = b->mv[0].x - b->mv[1].x, }; const mv out_delta = scale_mv(in_delta, rf->tip.sf[0]); const mv cand_mv = (mv) { .y = iclip(b->mv[0].y - out_delta.y, -0xffff, 0xffff), .x = iclip(b->mv[0].x - out_delta.x, -0xffff, 0xffff), }; add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "tip2-spc", n) st->dr, &st->drvd_cnt, 4, weight, cand_mv, 0, 0, &st->drvd_iter_cntr, 2); break; } else if (rf->seq_hdr->mv_traj && rf->frm_hdr->use_ref_frame_mvs && (unsigned) ref.ref[0] < TIP_FRAME && (b->ref.ref[0] == TIP_FRAME || (unsigned) b->ref.ref[n] < TIP_FRAME) && rt->rp_traj[ref.ref[0]][st->b8x8].y != 
INVALID_MV && rt->rp_traj[b->ref.ref[0] == TIP_FRAME ? rf->tip.ref.ref[n] : b->ref.ref[n]][st->b8x8].y != INVALID_MV) { mv a_mv, b_mv; if (b->ref.ref[0] == TIP_FRAME) { a_mv = rt->rp_traj[rf->tip.ref.ref[n]][st->b8x8]; union mv tmv = rt->rp_proj[off_8x8].mv; if (tmv.y == INVALID_MV) tmv.n = 0; const mv tipmv = scale_mv(tmv, rf->tip.sf[n]); b_mv = (mv) { .y = iclip(tipmv.y + b->mv[0].y, -0xffff, 0xffff), .x = iclip(tipmv.x + b->mv[0].x, -0xffff, 0xffff), }; } else { a_mv = rt->rp_traj[b->ref.ref[n]][st->b8x8]; b_mv = b->mv[n]; } const mv c_mv = rt->rp_traj[ref.ref[0]][st->b8x8]; const mv cand_mv = (mv) { .y = iclip(b_mv.y + c_mv.y - a_mv.y, -0xffff, 0xffff), .x = iclip(b_mv.x + c_mv.x - a_mv.x, -0xffff, 0xffff), }; add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "mvtj-spc", n) st->dr, &st->drvd_cnt, 4, weight, cand_mv, 0, 0, &st->drvd_iter_cntr, 2); } else if ((unsigned) ref.ref[0] < TIP_FRAME && b->ref.ref[0] >= 0 && rf->ref_sign[ref.ref[0]] == rf->ref_sign[b->ref.ref[0] == TIP_FRAME ? 
rf->tip.ref.ref[n] : b->ref.ref[n]]) { mv cand_mv; int den; if ((unsigned) b->ref.ref[0] == TIP_FRAME) { union mv tmv = rt->rp_proj[off_8x8].mv; if (tmv.y == INVALID_MV) tmv.n = 0; const mv tipmv = scale_mv(tmv, rf->tip.sf[n]); cand_mv = (mv) { .y = iclip(tipmv.y + b->mv[0].y, -0xffff, 0xffff), .x = iclip(tipmv.x + b->mv[0].x, -0xffff, 0xffff), }; den = rf->abspocdiff[rf->tip.ref.ref[n]]; } else { cand_mv = b->mv[n]; den = rf->abspocdiff[b->ref.ref[n]]; } cand_mv = dav2d_mv_projection(cand_mv, rf->abspocdiff[ref.ref[0]], den, -0xffff, 0xffff); add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "lnr-spc", n) st->dr, &st->drvd_cnt, 4, weight, cand_mv, 0, 0, &st->drvd_iter_cntr, 2); } if (b->ref.ref[1] < 0 && b->ref.ref[0] != TIP_FRAME) break; } } else if (b->ref.ref[0] == TIP_FRAME && ref.pair == rf->tip.ref.pair) { mv tmv = rt->rp_proj[off_8x8].mv; if (tmv.y == INVALID_MV) tmv.n = 0; const mv tip0mv = scale_mv(tmv, rf->tip.sf[0]); const mv tip1mv = scale_mv(tmv, rf->tip.sf[1]); const union mv cand_mv[2] = { [0] = { .y = iclip(tip0mv.y + b->mv[0].y, -0xffff, 0xffff), .x = iclip(tip0mv.x + b->mv[0].x, -0xffff, 0xffff), }, [1] = { .y = iclip(tip1mv.y + b->mv[0].y, -0xffff, 0xffff), .x = iclip(tip1mv.x + b->mv[0].x, -0xffff, 0xffff), } }; add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "tip-spc") st->mv, st->cnt, 6, weight, 8, cand_mv, &st->iter_cntr, 16); } else if (b->ref.pair == ref.pair) { const union mv cand_mv[2] = { [0] = ((b->mf & 1) && gmv[0].y != INVALID_MV) ? gmv[0] : b->mv[0], [1] = ((b->mf & 1) && gmv[1].y != INVALID_MV) ? 
gmv[1] : b->mv[1], }; add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "spc") st->mv, st->cnt, 6, weight, b->mf >> 2, cand_mv, &st->iter_cntr, 16); } else { if (rf->seq_hdr->mv_traj && rf->frm_hdr->use_ref_frame_mvs && b->ref.ref[0] != TIP_FRAME && ref.ref[0] != ref.ref[1] && rt->rp_traj[ref.ref[0]][st->b8x8].y != INVALID_MV && rt->rp_traj[ref.ref[1]][st->b8x8].y != INVALID_MV) { const mv b1_mv = rt->rp_traj[ref.ref[0]][st->b8x8]; const mv b2_mv = rt->rp_traj[ref.ref[1]][st->b8x8]; for (int n = 0; n < 2 && b->ref.ref[n] >= 0; n++) { const mv a_mv = rt->rp_traj[b->ref.ref[n]][st->b8x8]; if (a_mv.y == INVALID_MV) continue; const union mv cand_mv[2] = { [0] = { .y = iclip(b->mv[n].y + b1_mv.y - a_mv.y, -0xffff, 0xffff), .x = iclip(b->mv[n].x + b1_mv.x - a_mv.x, -0xffff, 0xffff), }, [1] = { .y = iclip(b->mv[n].y + b2_mv.y - a_mv.y, -0xffff, 0xffff), .x = iclip(b->mv[n].x + b2_mv.x - a_mv.x, -0xffff, 0xffff), }, }; add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "mvtj-spc") st->dr, &st->drvd_cnt, 4, weight, 8, cand_mv, &st->drvd_iter_cntr, 2); } } int ns = 1; if (ref.ref[0] == b->ref.ref[0] || ref.ref[0] == b->ref.ref[1]) { ns = 0; } else if (ref.ref[1] != b->ref.ref[0] && ref.ref[1] != b->ref.ref[1]) return; const int nc = ref.ref[ns] != b->ref.ref[0]; int oidx; for (oidx = 0; oidx < st->sngl_cnt; oidx++) if (ref.ref[!ns] == st->sngl[oidx].ref) break; if (oidx < st->sngl_cnt) { union mv cand_mv[2]; cand_mv[ns] = b->mv[nc]; cand_mv[!ns] = st->sngl[oidx].mv; add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "mvxp-spc") st->dr, &st->drvd_cnt, 4, weight, 8, cand_mv, &st->drvd_iter_cntr, 2); } const mv cand_mv = ((b->mf & 1) && gmv[nc].y != INVALID_MV) ? 
gmv[ns] : b->mv[nc]; add_candidate_c2s(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "sngl-c", ns) st->sngl, &st->sngl_cnt, 4, b->ref.ref[nc], cand_mv, &st->sngl_iter_cntr, 2); } } static void add_derived(DB_ARGS(const refmvs_frame *const rf, const char *const tag) struct refmvs_state *const st, const int lim, const int comp) { for (int n = 0; n < st->drvd_cnt && *st->cnt < 6; n++) if (comp) { add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, st->dr[n].y_off, st->dr[n].x_off, tag) st->mv, st->cnt, lim, 0, 8, st->dr[n].mv, &st->iter_cntr, 16); } else { add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, st->dr[n].y_off, st->dr[n].x_off, tag, n) st->mv, st->cnt, lim, 0, st->dr[n].mv[0], 0, 0, &st->iter_cntr, 16); } } static const uint16_t div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, 744, 712, 682, 655, 630, 606, 585, 564, 546, 528 }; mv dav2d_mv_projection(const union mv mv, const int num, const int den, const int min, const int max) { assert(den > 0 && den < 32); assert(num > -32 && num < 32); const int frac = num * div_mult[den]; const int y = mv.y * frac, x = mv.x * frac; return (union mv) { .y = iclip((y + 8192 + (y >> 31)) >> 14, min, max), .x = iclip((x + 8192 + (x >> 31)) >> 14, min, max) }; } static int add_temporal_candidate(const refmvs_tile *const rt, struct refmvs_state *const st, const ptrdiff_t off_8x8, DB_ARGS(const int x_off, const int y_off) const union refpair ref) { const refmvs_frame *const rf = rt->rf; if ((unsigned) ref.ref[0] >= TIP_FRAME) return 0; union mv mv = rt->rp_traj[ref.ref[0]][off_8x8]; if (!rf->seq_hdr->mv_traj || mv.y == INVALID_MV) { mv = rt->rp_proj[off_8x8].mv; if (mv.y == INVALID_MV) return 0; mv = dav2d_mv_projection(mv, rf->pocdiff[ref.ref[0]], rt->rp_proj[off_8x8].ref, -0xffff, 0xffff); } if (ref.ref[1] == -1) { const int weight = 1 + (rf->abspocdiff[ref.ref[0]] <= 2); return add_candidate_sngl(DB_ARGS(rf, st->by4, st->bx4, y_off, 
x_off, "tpl", 0) st->mv, st->cnt, 6, weight, mv, 0, 0, &st->iter_cntr, 16); } union mv mv2 = rt->rp_traj[ref.ref[1]][off_8x8]; if (!rf->seq_hdr->mv_traj || mv2.y == INVALID_MV) { mv2 = rt->rp_proj[off_8x8].mv; if (mv2.y == INVALID_MV) return 0; mv2 = dav2d_mv_projection(mv2, rf->pocdiff[ref.ref[1]], rt->rp_proj[off_8x8].ref, -0xffff, 0xffff); } const union mv cand_mv[2] = { [0] = mv, [1] = mv2 }; return add_candidate_comp(DB_ARGS(rf, st->by4, st->bx4, y_off, x_off, "tpl") st->mv, st->cnt, 6, 1, 8, cand_mv, &st->iter_cntr, 16); } static int model_from_corners(DB_ARGS(const refmvs_frame *const rf, const int by4, const int bx4, const int idx) int32_t *const mat, const mv topleft_mv, const mv topright_mv, const mv bottomleft_mv, const int xpos, const int ypos, const uint8_t *const b_dim) { if (topright_mv.n == topleft_mv.n && bottomleft_mv.n == topleft_mv.n) return 0; if (imin(imin(topleft_mv.x, bottomleft_mv.x), topright_mv.x + b_dim[0] * 32) < -xpos * 8) return 0; if (imin(imin(topleft_mv.y, topright_mv.y), bottomleft_mv.y + b_dim[1] * 32) < -ypos * 8) return 0; mat[2] = iclip64to32(((topright_mv.x - topleft_mv.x) * (1LL << 11)) >> b_dim[2], INT32_MIN, INT32_MAX); mat[4] = iclip64to32(((topright_mv.y - topleft_mv.y) * (1LL << 11)) >> b_dim[2], INT32_MIN, INT32_MAX); mat[3] = iclip64to32(((bottomleft_mv.x - topleft_mv.x) * (1LL << 11)) >> b_dim[3], INT32_MIN, INT32_MAX); mat[5] = iclip64to32(((bottomleft_mv.y - topleft_mv.y) * (1LL << 11)) >> b_dim[3], INT32_MIN, INT32_MAX); mat[0] = iclip64to32(topleft_mv.x * (1LL << 13) - (int64_t) xpos * mat[2] - (int64_t) ypos * mat[3], -0x8000000, 0x7ffffc0); mat[1] = iclip64to32(topleft_mv.y * (1LL << 13) - (int64_t) xpos * mat[4] - (int64_t) ypos * mat[5], -0x8000000, 0x7ffffc0); #define reduce(i) \ mat[i] = iclip(mat[i], -0x7fc0, 0x7fc0); \ mat[i] += 0x20 - (mat[i] < 0); \ mat[i] &= ~0x3f reduce(2); reduce(3); reduce(4); reduce(5); #undef reduce mat[2] += 0x10000; mat[5] += 0x10000; mat[6] = DAV2D_WM_TYPE_AFFINE; 
DEBUG_REFMV_printf("MFC[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d " "from tl=y:%d,x:%d,bl=y:%d,x:%d,tr=y:%d,x:%d\n", idx, mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], mat[6], topleft_mv.y, topleft_mv.x, bottomleft_mv.y, bottomleft_mv.x, topright_mv.y, topright_mv.x); return 1; } static ALWAYS_INLINE mv get_warpmv_proj(const refmvs_block *const r, const int x, const int y, const int minx, const int maxx, const int miny, const int maxy) { if (r->warp_type <= 0) return (mv) { .n = 0 }; // see #834 const int xc = (r->m[2] - (1 << 16)) * x + r->m[3] * y + r->m[0]; const int yc = (r->m[5] - (1 << 16)) * y + r->m[4] * x + r->m[1]; union mv res = (mv) { .y = iclip((yc + 0x1000 - (yc < 0)) >> 13, -0xffff, +0xffff), .x = iclip((xc + 0x1000 - (xc < 0)) >> 13, -0xffff, +0xffff), }; res.y = iclip(res.y, miny, maxy); res.x = iclip(res.x, minx, maxx); return res; } /* * refmvs_frame allocates memory for one sbrow (32 blocks high, whole frame * wide) of 4x4-resolution refmvs_block entries for spatial MV referencing. * mvrefs_tile[] keeps a list of 35 (32 + 3 above) pointers into this memory, * and each sbrow, the bottom entries (y=27/29/31) are exchanged with the top * (-5/-3/-1) pointers by calling dav2d_refmvs_tile_sbrow_init() at the start * of each tile/sbrow. * * For temporal MV referencing, we call dav2d_refmvs_save_tmvs() at the end of * each tile/sbrow (when tile column threading is enabled), or at the start of * each interleaved sbrow (i.e. once for all tile columns together, when tile * column threading is disabled). This will copy the 4x4-resolution spatial MVs * into 8x8-resolution refmvs_temporal_block structures. Then, for subsequent * frames, at the start of each tile/sbrow (when tile column threading is * enabled) or at the start of each interleaved sbrow (when tile column * threading is disabled), we call load_tmvs(), which will project the MVs to * their respective position in the current frame. 
*/ void dav2d_refmvs_find(const refmvs_tile *const rt, refmvs_candidate mvstack[6], int32_t (*const warp)[7], int *const cnt, const union refpair ref, const enum BlockSize bs, const int skip_mode, const int by4, const int bx4) { const refmvs_frame *const rf = rt->rf; const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int bw4 = b_dim[0], w4 = imin(bw4, rt->tile_col.end - bx4); const int bh4 = b_dim[1], h4 = imin(bh4, rt->tile_row.end - by4); mv gmv[2]; const int comp = ref.ref[1] >= 0; DEBUG_REFMV_printf("setup_ref_mv_list(%d,%d) for y=%d,x=%d\n", ref.ref[0], ref.ref[1], by4, bx4); *cnt = 0; if (warp) cnt[1] = 0; assert(ref.ref[0] >= -1 && ref.ref[0] <= TIP_FRAME && ref.ref[1] >= -1 && ref.ref[1] < TIP_FRAME); gmv[0] = (unsigned) ref.ref[0] >= TIP_FRAME ? (mv) { .n = 0 } : get_gmv_2d(&rf->frm_hdr->gmv.m[ref.ref[0]], bx4, by4, bw4, bh4, rf->iw4, rf->ih4, rf->frm_hdr); if (comp) { gmv[1] = get_gmv_2d(&rf->frm_hdr->gmv.m[ref.ref[1]], bx4, by4, bw4, bh4, rf->iw4, rf->ih4, rf->frm_hdr); DEBUG_REFMV_printf("Gmv2d: y=%d,x=%d, y2=%d,x2=%d\n", gmv[0].y, gmv[0].x, gmv[1].y, gmv[1].x); } else { gmv[1].n = 0; if (ref.ref[0] >= 0) DEBUG_REFMV_printf("Gmv2d: y=%d,x=%d\n", gmv[0].y, gmv[0].x); } const int minx = -(bx4 + bw4 + 4) * 32; const int miny = -(by4 + bh4 + 4) * 32; const int maxx = (rf->iw4 - bx4 + 4) * 32; const int maxy = (rf->ih4 - by4 + 4) * 32; const int is_sb_boundary = !(by4 & (rf->sbsz - 1)); const int have_left = bx4 > rt->tile_col.start; const refmvs_block *const bml = have_left && bh4 == h4 ? &rt->r[((by4 + bh4 - 1) & 63) * 128 + ((bx4 - 1) & 127)] : NULL; const int have_top = by4 > rt->tile_row.start; int x_off, abw4; const refmvs_block *tl = NULL, *lmt = NULL, *rmt = NULL, *tr = NULL; if (have_top) { if (is_sb_boundary) { x_off = bx4 & 1; abw4 = (bw4 + 1) & ~1; if (bx4 - x_off - 2 >= rt->tile_col.start) tl = bx4 & (rf->sbsz - 2) ? 
&rt->ra[(bx4 >> 1) - 1] : &rt->ra_tl; if (bw4 > 2) lmt = &rt->ra[bx4 >> 1]; if (bw4 == w4) rmt = &rt->ra[(bx4 >> 1) + (abw4 >> 1) - 1]; if (bx4 - x_off + abw4 < rt->tile_col.end && bw4 <= 16) tr = &rt->ra[(bx4 >> 1) + (abw4 >> 1)]; } else { x_off = 0; abw4 = bw4; if (have_left) tl = &rt->r[((by4 - 1) & 63) * 128 + ((bx4 - 1) & 127)]; if (bw4 > 1) lmt = &rt->r[((by4 - 1) & 63) * 128 + (bx4 & 127)]; if (bw4 == w4) rmt = &rt->r[((by4 - 1) & 63) * 128 + ((bx4 + bw4 - 1) & 127)]; if ((bx4 + bw4) & (rf->sbsz - 1) && bx4 + bw4 < rt->tile_col.end && bw4 <= 16) { tr = &rt->r[((by4 - 1) & 63) * 128 + ((bx4 + bw4) & 127)]; if (tr->mv[0].y == INVALID_MV) tr = NULL; // not yet coded } } } if (warp) { DEBUG_REFMV_printf("Warp corners [%d|%d]\n", *cnt, cnt[1]); int bl_ref_idx; if (bml && (!(bl_ref_idx = (bml->ref.ref[0] != ref.ref[0])) || (bml->ref.ref[1] == ref.ref[0] && !(bml->mf & 2)))) { int tl_ref_idx, tr_ref_idx; const mv bl_mv = !(bml->mf & 2) ? bml->mv[bl_ref_idx] : get_warpmv_proj(bml, bx4 * 4, (by4 + bh4) * 4, minx, maxx, miny, maxy); if (tl && (!(tl_ref_idx = (tl->ref.ref[0] != ref.ref[0])) || (tl->ref.ref[1] == ref.ref[0] && !(tl->mf & 2))) && rmt && (!(tr_ref_idx = (rmt->ref.ref[0] != ref.ref[0])) || (rmt->ref.ref[1] == ref.ref[0] && !(rmt->mf & 2)))) { const mv tl_mv = !(tl->mf & 2) ? tl->mv[tl_ref_idx] : get_warpmv_proj(tl, bx4 * 4, by4 * 4, minx, maxx, miny, maxy); const mv tr_mv = !(rmt->mf & 2) ? rmt->mv[tr_ref_idx] : get_warpmv_proj(rmt, (bx4 + bw4) * 4, by4 * 4, minx, maxx, miny, maxy); cnt[1] = model_from_corners(DB_ARGS(rf, by4, bx4, 0) warp[0], tl_mv, tr_mv, bl_mv, bx4 * 4, by4 * 4, b_dim); } if (!cnt[1] && lmt && (!(tl_ref_idx = (lmt->ref.ref[0] != ref.ref[0])) || (lmt->ref.ref[1] == ref.ref[0] && !(lmt->mf & 2))) && tr && (!(tr_ref_idx = (tr->ref.ref[0] != ref.ref[0])) || (tr->ref.ref[1] == ref.ref[0] && !(tr->mf & 2)))) { const mv tl_mv = !(lmt->mf & 2) ? 
lmt->mv[tl_ref_idx] : get_warpmv_proj(lmt, bx4 * 4, by4 * 4, minx, maxx, miny, maxy); const mv tr_mv = !(tr->mf & 2) ? tr->mv[tr_ref_idx] : get_warpmv_proj(tr, (bx4 + bw4) * 4, by4 * 4, minx, maxx, miny, maxy); cnt[1] = model_from_corners(DB_ARGS(rf, by4, bx4, 1) warp[0], tl_mv, tr_mv, bl_mv, bx4 * 4, by4 * 4, b_dim); } } } const ptrdiff_t stride = rf->rp_stride; const ptrdiff_t tms_8x8y = (by4 & (rf->sbsz - 1)) >> 1; const ptrdiff_t lms_8x8x = bx4 >> 1; struct refmvs_state st = { .mv = mvstack, .cnt = cnt, .drvd_cnt = 0, .sngl_cnt = 0, .iter_cntr = 0, .drvd_iter_cntr = 0, .sngl_iter_cntr = 0, .b8x8 = lms_8x8x + tms_8x8y * stride, #if DEBUG_BLOCK_INFO && DEBUG_REFMV .by4 = by4, .bx4 = bx4, #endif }; // FIXME high-priority TMVP DEBUG_REFMV_printf("Spatial MVP [%d|%d]\n", *cnt, warp ? cnt[1] : 0); // bottom-most left const ptrdiff_t bms_8x8y = ((by4 + bh4 - 1) & (rf->sbsz - 1)) >> 1; const ptrdiff_t left_8x8x = (bx4 - 1) >> 1; if (bml) { if (warp && bml->mf & 2 && bml->ref.ref[0] == ref.ref[0] && bml->warp_type != DAV2D_WM_TYPE_INVALID) { #define add_matrix(var) do { \ DEBUG_REFMV_printf("Spatial[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d from %s\n", \ cnt[1], var->m[0], var->m[1], var->m[2], \ var->m[3], var->m[4], var->m[5], var->warp_type, #var); \ warp[cnt[1]][6] = var->warp_type; \ memcpy(warp[cnt[1]++], var->m, sizeof(int32_t) * 6); \ } while (0) add_matrix(bml); } add_spatial_candidate(bh4 - 1, -1, rt, &st, 1, bml, bms_8x8y, left_8x8x, ref, gmv); } // right-most top const ptrdiff_t top_8x8y = by4 & (rf->sbsz - 1) ? 
((by4 - 1) & (rf->sbsz - 1)) >> 1 : -1; if (rmt) { if (warp && rmt->mf & 2 && rmt->ref.ref[0] == ref.ref[0] && rmt->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(rmt); } const int xpos = abw4 - (1 << is_sb_boundary) - x_off; add_spatial_candidate(-1, xpos, rt, &st, xpos >= 0, rmt, top_8x8y, (bx4 + xpos) >> 1, ref, gmv); } // top-most left const refmvs_block *tml = NULL; if (have_left && bh4 > 1) { tml = &rt->r[(by4 & 63) * 128 + ((bx4 - 1) & 127)]; if (warp && tml->mf & 2 && tml->ref.ref[0] == ref.ref[0] && tml->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(tml); } add_spatial_candidate(0, -1, rt, &st, 1, tml, tms_8x8y, left_8x8x, ref, gmv); } // left-most top if (lmt) { if (warp && cnt[1] < 4 && lmt->mf & 2 && lmt->ref.ref[0] == ref.ref[0] && lmt->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(lmt); } const int xpos = -x_off; add_spatial_candidate(-1, xpos, rt, &st, !x_off, lmt, top_8x8y, (bx4 + xpos) >> 1, ref, gmv); } // bottom-left if (have_left && bh4 <= 16 && (by4 + bh4) & (rf->sbsz - 1) && by4 + bh4 < rt->tile_row.end) { const refmvs_block *const bl = &rt->r[((by4 + bh4) & 63) * 128 + ((bx4 - 1) & 127)]; if (warp && cnt[1] < 4 && bl->mf & 2 && bl->ref.ref[0] == ref.ref[0] && bl->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(bl); } add_spatial_candidate(bh4, -1, rt, &st, 1, bl, ((by4 + bh4) & (rf->sbsz - 1)) >> 1, left_8x8x, ref, gmv); } // top-right if (tr) { if (warp && cnt[1] < 4 && tr->mf & 2 && tr->ref.ref[0] == ref.ref[0] && tr->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(tr); } const int xpos = abw4 - x_off; add_spatial_candidate(-1, xpos, rt, &st, 1, tr, top_8x8y, (bx4 + xpos) >> 1, ref, gmv); } // normal priority TMVP DEBUG_REFMV_printf("Low-priority TMVP [%d|%d]\n", *cnt, warp ? cnt[1] : 0); if (rf->use_ref_frame_mvs && (ref.ref[0] != ref.ref[1] || skip_mode) && *cnt < 6) { const int bw8 = imin(bw4 >> 1, 8), bh8 = imin(bh4 >> 1, 8); const int step_h = bw4 >= 16 ? 2 : 1, step_v = bh4 >= 16 ? 
2 : 1; const int x_off = 2 * bw8 - 2 * step_h, y_off = 2 * bh8 - 2 * step_v; const int first = (unsigned) x_off < (unsigned) w4 && (unsigned) y_off < (unsigned) h4 && add_temporal_candidate(rt, &st, (((by4 + y_off) & (rf->sbsz - 1)) >> 1) * stride + ((bx4 + x_off) >> 1), DB_ARGS((bx4 + x_off) >> 1, (by4 + y_off) >> 1) ref); if (!first && (bw4 > 4 || bh4 > 4)) { add_temporal_candidate(rt, &st, (((by4 + bh8) & (rf->sbsz - 1)) >> 1) * stride + ((bx4 + bw8) >> 1), DB_ARGS((bx4 + bw8) >> 1, (by4 + bh8) >> 1) ref); } } // top-left DEBUG_REFMV_printf("Extra Spatial MVP [%d|%d]\n", *cnt, warp ? cnt[1] : 0); if (tl) { if (warp && cnt[1] < 4 && tl->mf & 2 && tl->ref.ref[0] == ref.ref[0] && tl->warp_type != DAV2D_WM_TYPE_INVALID) { add_matrix(tl); } const int xpos = -(1 << is_sb_boundary) - x_off; add_spatial_candidate(-1, xpos, rt, &st, 0, tl, top_8x8y, (bx4 + xpos) >> 1, ref, gmv); } const int nearest_refmv_count = *cnt; if (have_left) { DEBUG_REFMV_printf("Spatial Ext [left] MVP [%d|%d]\n", *cnt, warp ? 
cnt[1] : 0); const int adj = 3 - (bx4 & (bw4 == 1)); if (bx4 - adj >= rt->tile_col.start) { if (bh4 == h4) { const int pos = ((by4 + bh4 - 1) & 63) * 128 + ((bx4 - adj) & 127); const refmvs_block *const ext_bml = &rt->r[pos]; assert(bml); if (dav2d_block_dimensions[ext_bml->bs][0] < adj || ext_bml->bs != bml->bs) { if (warp && cnt[1] < 4 && ext_bml->mf & 2 && ext_bml->ref.ref[0] == ref.ref[0]) { add_matrix(ext_bml); } add_spatial_candidate(bh4 - 1, -adj, rt, &st, 0, ext_bml, bms_8x8y, (bx4 - adj) >> 1, ref, gmv); } } if (bh4 > 1) { const int pos = (by4 & 63) * 128 + ((bx4 - adj) & 127); const refmvs_block *const ext_tml = &rt->r[pos]; assert(tml); if (dav2d_block_dimensions[ext_tml->bs][0] < adj || ext_tml->bs != tml->bs) { if (warp && cnt[1] < 4 && ext_tml->mf & 2 && ext_tml->ref.ref[0] == ref.ref[0]) { add_matrix(ext_tml); } add_spatial_candidate(0, -adj, rt, &st, 0, ext_tml, tms_8x8y, (bx4 - adj) >> 1, ref, gmv); } } } } // sort if ((rf->seq_hdr->drl_reorder == 2 /* always */ && nearest_refmv_count >= 2) || (rf->seq_hdr->drl_reorder == 1 /* constraint */ && (/*!is_tmvp_high_priority &&*/ nearest_refmv_count >= 4))) { int maxwidx = 0, maxw = mvstack[0].weight; for (int n = 1; n < nearest_refmv_count; n++) { const int w = mvstack[n].weight; if (w > maxw) { maxw = w; maxwidx = n; } } if (maxwidx) { refmvs_candidate tmp = mvstack[maxwidx]; mvstack[maxwidx] = mvstack[0]; mvstack[0] = tmp; } } DEBUG_REFMV_printf("Derived Spatial MVP & refbank [%d|%d]\n", *cnt, warp ? cnt[1] : 0); const int lim = 1 + (ref.ref[0] >= 0 ? rf->frm_hdr->max_drl_bits : rf->frm_hdr->max_bvp_drl_bits); if (ref.ref[1] != -1 && *cnt < lim) add_derived(DB_ARGS(rf, "derived") &st, lim, 1); if (rf->seq_hdr->refmv_bank) { const int c = ref.ref[1] == -1 ? ((unsigned) ref.ref[0] <= 5U ? ref.ref[0] : 8) : (!ref.ref[0] && ref.ref[1] < 2) ? 
6 + ref.ref[1] : 8; const int sz = rt->bank.size[c], idx = rt->bank.idx[c]; const int start = sz + idx - 1; for (int n = 0; n < sz && *cnt < lim; n++) { const int bank_idx = (start - n) & 3; if (c == 8 && rt->bank.ref[bank_idx].pair != ref.pair) continue; const union mv *const mv = rt->bank.mv[c][bank_idx]; const int last = *cnt; RDB_ONLY(int did_check = 0); if (st.iter_cntr < 16) { for (int m = 0; m < last; m++) if (mvstack[m].mv[0].n == mv[0].n && mvstack[m].mv[comp].n == mv[comp].n) { st.iter_cntr += m + 1; DEBUG_REFMV_printf("insert_bank[%d/%d:%d]: skipping[%d] y=%d,x=%d,y2=%d,x2=%d\n", n, sz, st.iter_cntr, m, mv[0].y, mv[0].x, comp ? mv[1].y : 0, comp ? mv[1].x : 0); goto end; } RDB_ONLY(did_check = 1); st.iter_cntr += last; } int i; for (i = 0; i <= comp; i++) { const int rx = bx4 * 4 + apply_sign(abs(mv[i].x) >> 3, mv[i].x); const int ry = by4 * 4 + apply_sign(abs(mv[i].y) >> 3, mv[i].y); if (rx <= -bw4 * 4 || ry <= -bh4 * 4 || rx >= rf->iw8 * 8 || ry >= rf->ih8 * 8) { break; } } if (i <= comp) continue; DEBUG_REFMV_printf("insert_bank[%d/%d:%d]: %s[%d] y=%d,x=%d,y2=%d,x2=%d,w=%d\n", n, sz, st.iter_cntr, did_check ? "adding" : "tailing", last, mv[0].y, mv[0].x, comp ? mv[1].y : 0, comp ? mv[1].x : 0, 0); COPY2MV(mvstack[last].mv, mv); mvstack[last].weight = 0; if (ref.ref[1] >= 0) mvstack[last].cwp_idx = rt->bank.cwp_idx[c - 6][bank_idx]; mvstack[last].y_off = mvstack[last].x_off = 0; *cnt = last + 1; end: {} } } if (ref.ref[1] == -1 && *cnt < lim) add_derived(DB_ARGS(rf, "derived") &st, lim, 0); for (int n = 0; n < cnt[0]; n++) { union mv *const mv = mvstack[n].mv; mv[0].y = iclip(mv[0].y, miny, maxy); mv[0].x = iclip(mv[0].x, minx, maxx); if (ref.ref[1] >= 0) { mv[1].y = iclip(mv[1].y, miny, maxy); mv[1].x = iclip(mv[1].x, minx, maxx); } } DEBUG_REFMV_printf("GMVs [%d|%d]\n", *cnt, warp ? 
cnt[1] : 0); if (*cnt < 6 && ref.ref[0] >= 0) { int last = *cnt; RDB_ONLY(int did_check = 0;) if (st.iter_cntr < 16) { for (int n = 0; n < last; n++) if (mvstack[n].mv[0].n == gmv[0].n && mvstack[n].mv[comp].n == gmv[comp].n) { st.iter_cntr += n + 1; DEBUG_REFMV_printf("gmv_add[%d]: skipping[%d] y=%d,x=%d," "y2=%d,x2=%d from GMV\n", st.iter_cntr, n, gmv[0].y, gmv[0].x, gmv[1].y, gmv[1].x); goto end_gmv; } RDB_ONLY(did_check = 1); st.iter_cntr += last; } COPY2MV(mvstack[last].mv, gmv); mvstack[last].weight = 0; mvstack[last].cwp_idx = 8; mvstack[last].y_off = mvstack[last].x_off = 0; DEBUG_REFMV_printf("gmv_add[%d]: %s[%d] y=%d,x=%d,y2=%d,x2=%d,w=%d from GMV\n", st.iter_cntr, did_check ? "adding" : "tailing", last, gmv[0].y, gmv[0].x, gmv[1].y, gmv[1].x, 0); *cnt = last + 1; end_gmv: {} if (imin(bw4, bh4) > 8) { DEBUG_REFMV_printf("Ext MVP candidates [%d|%d]\n", *cnt, warp ? cnt[1] : 0); if (*cnt >= 2 && *cnt < 6) { static const struct { uint8_t y, x; } ext_mvp[] = { { .y = 0, .x = 1 }, { .y = 1, .x = 0 }, { .y = 0, .x = 2 }, { .y = 2, .x = 0 }, { .y = 1, .x = 2 }, { .y = 2, .x = 1 }, }; for (int c = 0, n; c < 2; c++) { for (n = c * 2; n < c * 4 + 2; n++) { const int yidx = ext_mvp[n].y, xidx = ext_mvp[n].x; st.dr[n].mv[0].y = mvstack[yidx].mv[0].y; st.dr[n].mv[0].x = mvstack[xidx].mv[0].x; if (ref.ref[1] >= 0) { st.dr[n].mv[1].y = mvstack[yidx].mv[1].y; st.dr[n].mv[1].x = mvstack[xidx].mv[1].x; } RDB_ONLY(st.dr[n].x_off = xidx; st.dr[n].y_off = yidx); } st.drvd_cnt = n; if (*cnt == 2) break; } add_derived(DB_ARGS(rf, "insert_cand") &st, 6, ref.ref[1] >= 0); } } } if (warp && cnt[1] < 4) { assert((unsigned) ref.ref[0] < TIP_FRAME && ref.ref[1] == -1); DEBUG_REFMV_printf("Warp bank [%d|%d]\n", *cnt, cnt[1]); const int sz = rt->warp.size[ref.ref[0]]; const int idx = rt->warp.idx[ref.ref[0]]; const int start = sz + idx - 1; for (int n = 0; n < sz && cnt[1] < 4; n++) { const int32_t *const mat = rt->warp.mat[ref.ref[0]][(start - n) & 3]; warp[cnt[1]][6] = 
rt->warp.type[ref.ref[0]][(start - n) & 3]; DEBUG_REFMV_printf("Bank[%d/%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", n, cnt[1], mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], warp[cnt[1]][6]); memcpy(warp[cnt[1]++], mat, sizeof(int32_t) * 6); } DEBUG_REFMV_printf("Warp gmv [%d|%d]\n", *cnt, cnt[1]); if (cnt[1] < 4) { const int32_t *const mat = rf->frm_hdr->gmv.m[ref.ref[0]].matrix; warp[cnt[1]][6] = rf->frm_hdr->gmv.m[ref.ref[0]].type; DEBUG_REFMV_printf("GMV[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", cnt[1], mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], warp[cnt[1]][6]); memcpy(warp[cnt[1]++], mat, sizeof(int32_t) * 6); } DEBUG_REFMV_printf("Warp defaults [%d|%d]\n", *cnt, cnt[1]); for (int n = 0; n < 2; n++) { if (cnt[1] >= 4) break; warp[cnt[1]][6] = dav2d_default_wm_params.type; const int32_t *const mat = dav2d_default_wm_params.matrix; DEBUG_REFMV_printf("Defaults[%d]: [ %d, %d | %d, %d, %d, %d ],t=%d\n", cnt[1], mat[0], mat[1], mat[2], mat[3], mat[4], mat[5], warp[cnt[1]][6]); memcpy(warp[cnt[1]++], mat, sizeof(int32_t) * 6); #undef add_matrix } } assert(*cnt <= 6); // default intrabc refs int n_refmvs = *cnt; if (ref.ref[0] == -1) { DEBUG_REFMV_printf("Intrabc defaults [%d|%d]\n", *cnt, warp ? 
cnt[1] : 0); if (n_refmvs < rt->rf->frm_hdr->max_bvp_drl_bits + 1) { const int sbsz = 64 << rt->rf->frm_hdr->sb128; mvstack[n_refmvs].mv[0].x = 0; mvstack[n_refmvs].mv[0].y = -(sbsz * 8); mvstack[n_refmvs].weight = 0; *cnt = ++n_refmvs; if (n_refmvs < rt->rf->frm_hdr->max_bvp_drl_bits + 1) { mvstack[n_refmvs].mv[0].x = -(8 * (sbsz + 256)); mvstack[n_refmvs].mv[0].y = 0; mvstack[n_refmvs].weight = 0; *cnt = ++n_refmvs; if (n_refmvs < rt->rf->frm_hdr->max_bvp_drl_bits + 1) { mvstack[n_refmvs].mv[0].x = 0; mvstack[n_refmvs].mv[0].y = -(bh4 * 32); mvstack[n_refmvs].weight = 0; *cnt = ++n_refmvs; if (n_refmvs < rt->rf->frm_hdr->max_bvp_drl_bits + 1) { mvstack[n_refmvs].mv[0].x = -(bw4 * 32); mvstack[n_refmvs].mv[0].y = 0; mvstack[n_refmvs].weight = 0; *cnt = ++n_refmvs; } } } } } for (int n = *cnt; n < 6; n++) { ZERO2MV(&mvstack[n].mv); mvstack[n].weight = 0; mvstack[n].cwp_idx = 8; mvstack[n].x_off = 0; mvstack[n].y_off = 0; } DEBUG_REFMV_printf("Final [%d|%d]\n", *cnt, warp ? cnt[1] : 0); } void dav2d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf, const int tile_col_start4, const int tile_col_end4, const int tile_row_start4, const int tile_row_end4, const int sby, int tile_row_idx) { if (!rf->have_threading) tile_row_idx = 0; const ptrdiff_t off1 = rf->rp_stride * tile_row_idx; const int sbsz8 = rf->sbsz >> 1; const ptrdiff_t off2 = sbsz8 * off1; const ptrdiff_t off3 = rf->have_frame_threading ? 
(sby * sbsz8) * rf->rp_stride : (sbsz8 + 2) * off1 + 2 * rf->rp_stride; rt->rp_proj = &rf->rp_proj[off3]; for (int n = 0; n < 7; n++) rt->rp_traj[n] = &rf->rp_traj[n][off2]; rt->ra = &rf->ra[off1]; rt->rf = rf; rt->tile_row.start = tile_row_start4; rt->tile_row.end = imin(tile_row_end4, rf->ih4); rt->tile_col.start = tile_col_start4; rt->tile_col.end = imin(tile_col_end4, rf->iw4); memset(rt->bank.size, 0, sizeof(rt->bank.size)); memset(rt->bank.idx, 0, sizeof(rt->bank.idx)); memset(rt->warp.size, 0, sizeof(rt->warp.size)); memset(rt->warp.idx, 0, sizeof(rt->warp.idx)); } void dav2d_refmvs_bank_update(refmvs_tile *const rt, const enum BlockSize bs, const int by4, const int bx4) { const refmvs_frame *const rf = rt->rf; const int bsh = 1 + rf->frm_hdr->sb128, bsz = 1 << bsh; if (!((by4 | bx4) & (rf->sbsz - 1))) { const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int w = imax(1, b_dim[0] >> bsh) * imax(1, b_dim[1] >> bsh); rt->bank.hits[1] = 0; rt->bank.avail = imax(w, 4); DEBUG_REFMV_printf("Resetting refbank: remain=%d|hits=%d|%d\n", rt->bank.avail, rt->bank.hits[1], rt->bank.hits[0]); } else if (!((by4 | bx4) & (bsz - 1))) { const uint8_t *const b_dim = dav2d_block_dimensions[bs]; const int w = imax(1, b_dim[0] >> bsh) * imax(1, b_dim[1] >> bsh); rt->bank.hits[1] = 0; rt->bank.avail += w; DEBUG_REFMV_printf("Updating refbank availability: remain=%d|hits=%d|%d\n", rt->bank.avail, rt->bank.hits[1], rt->bank.hits[0]); } } #if DEBUG_BLOCK_INFO && DEBUG_REFMV static void debug_warpbank(const refmvs_tile *const rt, const int ref, const int by4, const int bx4) { const refmvs_frame *const rf = rt->rf; const int sz = rt->warp.size[ref], idx = rt->warp.idx[ref]; const int start = idx + sz - 1; for (int n = 0; n < sz; n++) { const int32_t *const m = rt->warp.mat[ref][(start - n) & 3]; DEBUG_REFMV_printf("refbank[%d/%d,r=%d]: %d,%d,%d,%d,%d,%d,t=%d\n", n, sz, ref, m[0], m[1], m[2], m[3], m[4], m[5], rt->warp.type[ref][(start - n) & 3]); } } #else #define 
debug_warpbank(...) #endif int dav2d_refmvs_warp_add(refmvs_tile *const rt, const Dav2dWarpedMotionParams *const mat, DB_ONLY(const int by4, const int bx4) const int ref) { RDB_ONLY(const refmvs_frame *const rf = rt->rf); if (rt->warp.hits >= 64) { DEBUG_REFMV_printf("warprefbank: ignoring further action, hits=%d\n", rt->warp.hits); return -1; } rt->warp.hits++; const int sz = rt->warp.size[ref], idx = rt->warp.idx[ref]; int n; for (n = 0; n < sz; n++) { const int32_t *const m = &rt->warp.mat[ref][(idx + n) & 3][2]; if (!memcmp(m, &mat->matrix[2], sizeof(int32_t) * 4)) break; } if (n < sz) { const int to = sz == 4 ? (idx - 1) & 3 : sz - 1, from = (idx + n) & 3; DEBUG_REFMV_printf("warprefbank: reordering %d to %d [%d,%d,%d,%d,%d,%d]\n", from, to, mat->matrix[0], mat->matrix[1], mat->matrix[2], mat->matrix[3], mat->matrix[4], mat->matrix[5]); if (from != to) { int32_t bak[6]; memcpy(bak, rt->warp.mat[ref][from], sizeof(int32_t) * 6); const int bak_type = rt->warp.type[ref][from]; for (int n1 = from, n2 = (n1 + 1) & 3; n1 != to; n1 = n2, n2 = (n2 + 1) & 3) { memcpy(rt->warp.mat[ref][n1], rt->warp.mat[ref][n2], sizeof(int32_t) * 6); rt->warp.type[ref][n1] = rt->warp.type[ref][n2]; } memcpy(rt->warp.mat[ref][to], bak, sizeof(int32_t) * 6); rt->warp.type[ref][to] = bak_type; } debug_warpbank(rt, ref, by4, bx4); return 0; } const int tgt = sz == 4 ? 
rt->warp.idx[ref]++ & 3 : rt->warp.size[ref]++; memcpy(rt->warp.mat[ref][tgt], mat->matrix, sizeof(int32_t) * 6); rt->warp.type[ref][tgt] = mat->type; DEBUG_REFMV_printf("warprefbank: adding at %d|%d [%d,%d,%d,%d,%d,%d]\n", rt->warp.size[ref], rt->warp.idx[ref], mat->matrix[0], mat->matrix[1], mat->matrix[2], mat->matrix[3], mat->matrix[4], mat->matrix[5]); debug_warpbank(rt, ref, by4, bx4); return 0; } #if DEBUG_BLOCK_INFO && DEBUG_REFMV static void debug_refbank(const refmvs_tile *const rt, const int c, const int by4, const int bx4) { const refmvs_frame *const rf = rt->rf; const int sz = rt->bank.size[c], idx = rt->bank.idx[c]; const int start = idx + sz - 1; for (int n = 0; n < sz; n++) { const int idx = (start - n) & 3; const union refpair ref = rt->bank.ref[idx]; const int comp = c - 6U < 2U || (c == 8 && ref.ref[1] != -1); DEBUG_REFMV_printf("refbank[%d/%d,c=%d]: mv=y:%d,x:%d,y2=%d,x2=%d,r=%d,%d\n", n, sz, c, rt->bank.mv[c][idx][0].y, rt->bank.mv[c][idx][0].x, comp ? rt->bank.mv[c][idx][1].y : 0, comp ? rt->bank.mv[c][idx][1].x : 0, c < 6 ? c : c < 8 ? 0 : ref.ref[0], c < 6 ? -1 : c < 8 ? c - 6 : ref.ref[1]); } } #else #define debug_refbank(...) #endif static void refmvs_bank_add(refmvs_tile *const rt, DB_ONLY(const int by4, const int bx4) const union refpair ref, const union mv mv[2], const int cwp_idx) { RDB_ONLY(const refmvs_frame *const rf = rt->rf); rt->bank.hits[0]++; const int c = ref.ref[1] == -1 ? ((unsigned) ref.ref[0] <= 5U ? ref.ref[0] : 8) : (!ref.ref[0] && ref.ref[1] <= 1) ? 6 + ref.ref[1] : 8; const int sz = rt->bank.size[c], idx = rt->bank.idx[c]; const int comp = ref.ref[1] != -1; int n; for (n = 0; n < sz; n++) { const int i = (idx + n) & 3; if (mv[0].n == rt->bank.mv[c][i][0].n && mv[comp].n == rt->bank.mv[c][i][comp].n && (c < 8 || ref.pair == rt->bank.ref[i].pair)) { break; } } if (n < sz) { const int to = sz == 4 ? 
(idx - 1) & 3 : sz - 1, from = (idx + n) & 3; DEBUG_REFMV_printf("Moving refbank entry %d to tail %d | remain=%d|hits=%d|%d\n", from, to, rt->bank.avail, rt->bank.hits[1], rt->bank.hits[0]); if (from != to) { union mv mv_bak[2]; COPY2MV(mv_bak, rt->bank.mv[c][from]); union refpair ref_bak = rt->bank.ref[from]; const int cwp_idx = rt->bank.cwp_idx[imax(0, c - 6)][from]; for (int n1 = from, n2 = (n1 + 1) & 3; n1 != to; n1 = n2, n2 = (n2 + 1) & 3) { COPY2MV(rt->bank.mv[c][n1], rt->bank.mv[c][n2]); if (c == 8) rt->bank.ref[n1] = rt->bank.ref[n2]; if (c >= 6) rt->bank.cwp_idx[c - 6][n1] = rt->bank.cwp_idx[c - 6][n2]; } COPY2MV(rt->bank.mv[c][to], mv_bak); if (c == 8) rt->bank.ref[to] = ref_bak; if (c >= 6) rt->bank.cwp_idx[c - 6][to] = cwp_idx; } debug_refbank(rt, c, by4, bx4); return; } const int tgt = sz == 4 ? rt->bank.idx[c]++ & 3 : rt->bank.size[c]++; COPY2MV(rt->bank.mv[c][tgt], mv); if (c == 8) rt->bank.ref[tgt] = ref; if (ref.ref[1] != -1) rt->bank.cwp_idx[c - 6][tgt] = cwp_idx; DEBUG_REFMV_printf("Adding new refbank entry in %d | remain=%d|hits=%d|%d\n", tgt, rt->bank.avail, rt->bank.hits[1], rt->bank.hits[0]); debug_refbank(rt, c, by4, bx4); } void dav2d_refmvs_bank_add(refmvs_tile *const rt, const enum BlockSize bs, const int by4, const int bx4, const Av2Block *const b) { RDB_ONLY(const refmvs_frame *const rf = rt->rf); assert(rt->rf->seq_hdr->refmv_bank); assert(!b->intra || b->intrabc); dav2d_refmvs_bank_update(rt, bs, by4, bx4); if (rt->bank.hits[0] >= 64 || rt->bank.hits[1] >= 16 || !rt->bank.avail) { DEBUG_REFMV_printf("Refbank is full: remain=%d|hits=%d|%d\n", rt->bank.avail, rt->bank.hits[1], rt->bank.hits[0]); return; } rt->bank.hits[1]++; rt->bank.avail--; refmvs_bank_add(rt, DB_ONLY(by4, bx4) b->ref, b->mv, b->ref.ref[1] == -1 ? 
0 : b->cwp_idx); } void dav2d_refmvs_reset_sb(refmvs_tile *const rt, const int by, const int bx) { // FIXME should (eventually) be able to re-use is_coded for (int y = by & 63; y < (by & 63) + rt->rf->sbsz; y++) { for (int x = bx & 127; x < (bx & 127) + rt->rf->sbsz; x++) { rt->r[y * 128 + x].mv[0].y = INVALID_MV; rt->r[y * 128 + x].ref.pair = -1; } } const refmvs_frame *const rf = rt->rf; if (rf->seq_hdr->refmv_bank) { rt->bank.hits[0] = 0; rt->bank.hits[1] = 0; rt->bank.avail = 0; } rt->warp.hits = 0; if (by == rt->tile_row.start || IS_KEY_OR_INTRA(rf->frm_hdr) || rf->frm_hdr->tip.frame_mode == 2) { return; } const int end_x4 = imin(bx + rf->sbsz, rt->tile_col.end); for (int x = bx, sz4, hits = 0; x < end_x4; x += sz4) { const refmvs_block *const r = &rt->ra[x >> 1]; sz4 = dav2d_block_dimensions[r->bs][0]; if (r->mv[0].y == INVALID_MV) continue; if (rf->seq_hdr->refmv_bank) { union refpair ref = r->ref; refmvs_bank_add(rt, DB_ONLY(by, x) ref, r->mf & 2 ? r->lmv : r->mv, r->mf >> 2); } if (r->mf & 2) { Dav2dWarpedMotionParams wmp; wmp.type = r->warp_type; if (wmp.type != DAV2D_WM_TYPE_INVALID) { memcpy(wmp.matrix, r->m, sizeof(int32_t) * 6); dav2d_refmvs_warp_add(rt, &wmp, DB_ONLY(by, x) r->ref.ref[0]); } } if (++hits == 4) break; } } static inline int dequantize_mv_comp(const int v) { const unsigned absv = abs(v); assert(v < 0x80); const int nbits = (absv >> 4) - (absv >= 16); int res = (absv - (nbits + !!nbits) * 16) << nbits; res += 16 * !!nbits << nbits; return v < 0 ? 
-res : res; } static inline mv dequantize_mv(const union qmv mv) { if (mv.n == INVALID_TRAJ) return (union mv) { .y = INVALID_MV }; return (union mv) { .y = dequantize_mv_comp(mv.y), .x = dequantize_mv_comp(mv.x), }; } static void tip_projection(const refmvs_frame *const rf, refmvs_sngl_mv_block *const rp_proj, const ptrdiff_t stride, const int col_start8, const int col_end8, const int row_start8, int row_end8, const int mfmv_sbsz8, const int sbsz8, const int tmvp_sample_step) { for (int sx = col_start8; sx < col_end8; sx += mfmv_sbsz8) { const int xend = imin(col_end8, sx + mfmv_sbsz8); for (int y = row_start8; y < row_end8; y += tmvp_sample_step) { const ptrdiff_t pos_base = (y & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += tmvp_sample_step) { const ptrdiff_t pos = pos_base + x; const union mv mv = rp_proj[pos].mv; if (mv.y == INVALID_MV) continue; rp_proj[pos].mv = dav2d_mv_projection(mv, rf->tip.delta, rp_proj[pos].ref, -2047, 2047); rp_proj[pos].ref = rf->tip.delta; } } } } static void fill_holes(const refmvs_frame *const rf, refmvs_sngl_mv_block *const rp_proj, const ptrdiff_t stride, const int col_start8, const int col_end8, const int row_start8, int row_end8, const int mfmv_sbsz8, const int sbsz8, const int tmvp_sample_step) { for (int sx = col_start8; sx < col_end8; sx += mfmv_sbsz8) { const int xend = imin(col_end8, sx + mfmv_sbsz8); for (int y = row_start8; y < row_end8; y += tmvp_sample_step) { const int ystart = y & ~(mfmv_sbsz8 - 1); const int yend = imin(ystart + mfmv_sbsz8, row_end8); const ptrdiff_t pos_base = (y & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += tmvp_sample_step) { const ptrdiff_t pos = pos_base + x; const union mv mv = rp_proj[pos].mv; if (mv.y == INVALID_MV) continue; #define copy(off) do { \ if (rp_proj[pos + off].mv.y == INVALID_MV) { \ rp_proj[pos + off].mv = mv; \ rp_proj[pos + off].ref = rf->tip.delta; \ } \ } while (0) if (x - tmvp_sample_step >= sx) copy(-tmvp_sample_step); if (x + tmvp_sample_step < xend) 
copy(+tmvp_sample_step); if (y - tmvp_sample_step >= ystart) copy(-tmvp_sample_step * stride); if (y + tmvp_sample_step < yend) copy(+tmvp_sample_step * stride); #undef copy } } } } static void smoothen(const refmvs_frame *const rf, refmvs_sngl_mv_block *const rp_proj, const ptrdiff_t stride, const int col_start8, const int col_end8, const int row_start8, int row_end8, const int mfmv_sbsz8, const int sbsz8, const int tmvp_sample_step) { static const unsigned idiv[] = { 65536, 32768, 21845, 16384, 13107 }; union mv mv_line[32]; for (int sx = col_start8; sx < col_end8; sx += mfmv_sbsz8) { const int xend = imin(col_end8, sx + mfmv_sbsz8); int first_line = 1, y; for (y = row_start8; y < row_end8; y += tmvp_sample_step, first_line = 0) { const int ystart = y & ~(mfmv_sbsz8 - 1); const int yend = imin(ystart + mfmv_sbsz8, row_end8); const ptrdiff_t pos_base = (y & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += tmvp_sample_step) { const ptrdiff_t pos = pos_base + x; int sum_x = 0, sum_y = 0, sum_n = 0; #define add(p) do { \ if (rp_proj[p].mv.y != INVALID_MV) { \ sum_x += rp_proj[p].mv.x; \ sum_y += rp_proj[p].mv.y; \ sum_n++; \ } \ } while (0) add(pos); if (x - tmvp_sample_step >= sx) add(pos - tmvp_sample_step); if (x + tmvp_sample_step < xend) add(pos + tmvp_sample_step); if (y - tmvp_sample_step >= ystart) add(pos - tmvp_sample_step * stride); if (y + tmvp_sample_step < yend) add(pos + tmvp_sample_step * stride); #undef add if (!first_line) { rp_proj[pos - tmvp_sample_step * stride].mv = mv_line[x - sx]; rp_proj[pos - tmvp_sample_step * stride].ref = rf->tip.delta; } if (sum_n) { mv_line[x - sx].y = (int)((int64_t) sum_y * idiv[sum_n - 1] + 0x8000 - (sum_y < 0)) >> 16; mv_line[x - sx].x = (int)((int64_t) sum_x * idiv[sum_n - 1] + 0x8000 - (sum_x < 0)) >> 16; } else { mv_line[x - sx].y = INVALID_MV; } } } if (!first_line) { const ptrdiff_t pos_base = ((y - tmvp_sample_step) & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += tmvp_sample_step) { 
rp_proj[pos_base + x].mv = mv_line[x - sx]; rp_proj[pos_base + x].ref = rf->tip.delta; } } } } static void fill_gap_proj(refmvs_sngl_mv_block *const rp_proj, const ptrdiff_t stride, const int col_start8, const int col_end8, const int row_start8, int row_end8, const int mfmv_sbsz8, const int sbsz8) { for (int sx = col_start8; sx < col_end8; sx += mfmv_sbsz8) { const int xend = imin(col_end8, sx + mfmv_sbsz8); for (int y = row_start8; y < row_end8; y += 2) { const int ystart = y & ~(mfmv_sbsz8 - 1); const int yend = imin(ystart + mfmv_sbsz8, row_end8); const ptrdiff_t pos_base = (y & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += 2) { const ptrdiff_t pos = pos_base + x; const union mv mv = rp_proj[pos].mv; if (mv.y == INVALID_MV) continue; int mvy = mv.y, mvx = mv.x, sum_y = mvy, sum_x = mvx, sum_n = 1; int ref_off = rp_proj[pos].ref; // right const int have_right = x + 2 < xend; if (have_right && rp_proj[pos + 2].mv.y != INVALID_MV) { const int right_ref_off = rp_proj[pos + 2].ref; const union mv right_mv = dav2d_mv_projection(rp_proj[pos + 2].mv, ref_off, right_ref_off, -2047, 2047); sum_x += right_mv.x; sum_y += right_mv.y; rp_proj[pos + 1].mv.y = (sum_y + (sum_y > 0)) >> 1; rp_proj[pos + 1].mv.x = (sum_x + (sum_x > 0)) >> 1; rp_proj[pos + 1].ref = ref_off; sum_n++; } else { rp_proj[pos + 1] = rp_proj[pos]; } // bottom const int have_bottom = y + 2 < yend; if (have_bottom && rp_proj[pos + 2 * stride].mv.y != INVALID_MV) { const int bottom_ref_off = rp_proj[pos + 2 * stride].ref; const union mv bottom_mv = dav2d_mv_projection(rp_proj[pos + 2 * stride].mv, ref_off, bottom_ref_off, -2047, 2047); sum_x += bottom_mv.x; const int mx = mvx + bottom_mv.x; sum_y += bottom_mv.y; const int my = mvy + bottom_mv.y; rp_proj[pos + stride].mv.y = (my + (my > 0)) >> 1; rp_proj[pos + stride].mv.x = (mx + (mx > 0)) >> 1; rp_proj[pos + stride].ref = ref_off; sum_n++; } else { rp_proj[pos + stride] = rp_proj[pos]; } // bottom/right if (have_right && have_bottom) { union mv 
bottom_right_mv = rp_proj[pos + 2 * (1 + stride)].mv; if (bottom_right_mv.y != INVALID_MV) { const int bottom_right_ref_off = rp_proj[pos + 2 * (1 + stride)].ref; bottom_right_mv = dav2d_mv_projection(bottom_right_mv, ref_off, bottom_right_ref_off, -2047, 2047); sum_x += bottom_right_mv.x; sum_y += bottom_right_mv.y; sum_n++; } } switch (sum_n) { default: assert(0); case 1: rp_proj[pos + 1 + stride].mv = mv; break; case 2: rp_proj[pos + 1 + stride].mv.y = (sum_y + (sum_y > 0)) >> 1; rp_proj[pos + 1 + stride].mv.x = (sum_x + (sum_x > 0)) >> 1; break; case 3: rp_proj[pos + 1 + stride].mv.y = (sum_y * 85 + 128 - (sum_y < 0)) >> 8; rp_proj[pos + 1 + stride].mv.x = (sum_x * 85 + 128 - (sum_x < 0)) >> 8; break; case 4: rp_proj[pos + 1 + stride].mv.y = (sum_y + 1 + (sum_y > 0)) >> 2; rp_proj[pos + 1 + stride].mv.x = (sum_x + 1 + (sum_x > 0)) >> 2; break; } rp_proj[pos + 1 + stride].ref = ref_off; } } } } static void fill_gap_traj(union mv *const rp_traj, const ptrdiff_t stride, const int col_start8, const int col_end8, const int row_start8, int row_end8, const int mfmv_sbsz8, const int sbsz8) { for (int sx = col_start8; sx < col_end8; sx += mfmv_sbsz8) { const int xend = imin(col_end8, sx + mfmv_sbsz8); for (int y = row_start8; y < row_end8; y += 2) { const int ystart = y & ~(mfmv_sbsz8 - 1); const int yend = imin(ystart + mfmv_sbsz8, row_end8); const ptrdiff_t pos_base = (y & (sbsz8 - 1)) * stride; for (int x = sx; x < xend; x += 2) { const ptrdiff_t pos = pos_base + x; const union mv mv = rp_traj[pos]; if (mv.y == INVALID_MV) continue; int mvy = mv.y, mvx = mv.x, sum_y = mvy, sum_x = mvx, sum_n = 1; // bottom const int have_bottom = y + 2 < yend; if (have_bottom && rp_traj[pos + 2 * stride].y != INVALID_MV) { const union mv bottom_mv = rp_traj[pos + 2 * stride]; sum_x += bottom_mv.x; sum_y += bottom_mv.y; rp_traj[pos + stride].y = (sum_y + (sum_y > 0)) >> 1; rp_traj[pos + stride].x = (sum_x + (sum_x > 0)) >> 1; sum_n++; } else { rp_traj[pos + stride] = mv; } // right 
const int have_right = x + 2 < xend; if (have_right && rp_traj[pos + 2].y != INVALID_MV) { const union mv right_mv = rp_traj[pos + 2]; sum_x += right_mv.x; const int mx = mvx + right_mv.x; sum_y += right_mv.y; const int my = mvy + right_mv.y; rp_traj[pos + 1].y = (my + (my > 0)) >> 1; rp_traj[pos + 1].x = (mx + (mx > 0)) >> 1; sum_n++; } else { rp_traj[pos + 1] = mv; } // bottom/right if (have_right && have_bottom) { const union mv bottom_right_mv = rp_traj[pos + 2 * (1 + stride)]; if (bottom_right_mv.y != INVALID_MV) { sum_x += bottom_right_mv.x; sum_y += bottom_right_mv.y; sum_n++; } } switch (sum_n) { default: assert(0); case 1: rp_traj[pos + 1 + stride] = mv; break; case 2: rp_traj[pos + 1 + stride].y = (sum_y + (sum_y > 0)) >> 1; rp_traj[pos + 1 + stride].x = (sum_x + (sum_x > 0)) >> 1; break; case 3: rp_traj[pos + 1 + stride].y = (sum_y * 85 + 128 - (sum_y < 0)) >> 8; rp_traj[pos + 1 + stride].x = (sum_x * 85 + 128 - (sum_x < 0)) >> 8; break; case 4: rp_traj[pos + 1 + stride].y = (sum_y + 1 + (sum_y > 0)) >> 2; rp_traj[pos + 1 + stride].x = (sum_x + 1 + (sum_x > 0)) >> 2; break; } } } } } static void check_traj_intersect(const refmvs_frame *const rf, mv *rp_traj[7], refmvs_traj_map *map[3][7], const int ref1 /* src */, const int ref2 /* dst */, const int y, const int x, const union mv mv_in, const int col_start8_shifted, const int col_end8_shifted) { assert(ref2 != -1); const unsigned sbsz8 = rf->sbsz >> 1; const int mfmv_sbsz8 = rf->mfmv_sbsz8; const int mfmv_edge = rf->mfmv_edge; const int shift = rf->mfmv_k_shift, mask = ~(rf->frm_hdr->tmvp_sample_step - 1); const ptrdiff_t stride = rf->rp_stride; const ptrdiff_t pos = (y & (sbsz8 - 1)) * stride + x; const int min_k = imax(-1, col_start8_shifted - (x >> shift)); const int max_k = imin(+1, col_end8_shifted - (x >> shift)); for (int k = min_k + 1; k <= max_k + 1; k++) { refmvs_traj_map *const map1 = &map[k][ref1][pos]; if (map1->n == INVALID_TRAJ) continue; const int x1 = x + map1->x; const int k1 = (x1 >> 
shift) - (x >> shift); if (k1 + 1 != k) continue; const int x_sb_align = x1 & ~(mfmv_sbsz8 - 1); const int x_proj_start = imax(x_sb_align - mfmv_edge, 0); const int x_proj_end = imin(x_sb_align + mfmv_sbsz8 + rf->mfmv_edge, rf->iw8); if (x < x_proj_start || x >= x_proj_end) continue; const int y1 = y + map1->y; const int y_proj_start = y1 & ~(mfmv_sbsz8 - 1); const int y_proj_end = imin(y_proj_start + mfmv_sbsz8, rf->ih8); if (y < y_proj_start || y >= y_proj_end) continue; const ptrdiff_t pos1 = (y1 & (sbsz8 - 1)) * stride + x1; mv *const mv_dst = &rp_traj[ref2][pos1]; if (mv_dst->y != INVALID_MV) continue; const mv *const mv_src = &rp_traj[ref1][pos1]; const int py = mv_dst->y = iclip(mv_src->y + mv_in.y, -2047, 2047); const int px = mv_dst->x = iclip(mv_src->x + mv_in.x, -2047, 2047); int y2 = y1 + apply_sign(abs(py) >> 6, py); int x2 = x1 + apply_sign(abs(px) >> 6, px); if (x2 < x_proj_start || x2 >= x_proj_end) continue; if (y2 < y_proj_start || y2 >= y_proj_end) continue; y2 &= mask; x2 &= mask; const ptrdiff_t pos2 = (y2 & (sbsz8 - 1)) * stride + x2; const int k2 = (x1 >> shift) - (x2 >> shift); assert(k2 >= -1 && k2 <= +1); refmvs_traj_map *const map2 = &map[k2 + 1][ref2][pos2]; map2->y = y1 - y2; map2->x = x1 - x2; } int y1 = y + apply_sign(abs(mv_in.y) >> 6, mv_in.y); int x1 = x + apply_sign(abs(mv_in.x) >> 6, mv_in.x); if (imin(y1, x1) < 0 || y1 >= rf->ih8 || x1 >= rf->iw8) return; y1 &= mask; x1 &= mask; const int min_k1 = imax(-1, col_start8_shifted - (x1 >> shift)); const int max_k1 = imin(+1, col_end8_shifted - (x1 >> shift)); for (int k = min_k1 + 1; k <= max_k1 + 1; k++) { const ptrdiff_t pos1 = (y1 & (sbsz8 - 1)) * stride + x1; refmvs_traj_map *const map1 = &map[k][ref2][pos1]; if (map1->n == INVALID_TRAJ) continue; const int x2 = x1 + map1->x; const int k2 = (x2 >> shift) - (x1 >> shift); if (k2 + 1 != k) continue; const int x_sb_align = x2 & ~(mfmv_sbsz8 - 1); const int x_proj_start = imax(x_sb_align - mfmv_edge, 0); const int x_proj_end = 
imin(x_sb_align + mfmv_sbsz8 + rf->mfmv_edge, rf->iw8); if (x < x_proj_start || x >= x_proj_end) continue; if (x1 < x_proj_start || x1 >= x_proj_end) continue; const int y2 = y1 + map1->y; const int y_proj_start = y2 & ~(mfmv_sbsz8 - 1); const int y_proj_end = imin(y_proj_start + mfmv_sbsz8, rf->ih8); if (y < y_proj_start || y >= y_proj_end || y1 < y_proj_start || y1 >= y_proj_end) { continue; } const ptrdiff_t pos2 = (y2 & (sbsz8 - 1)) * stride + x2; mv *const mv_dst = &rp_traj[ref1][pos2]; if (mv_dst->y != INVALID_MV) continue; const mv *const mv_src = &rp_traj[ref2][pos2]; const int py = mv_dst->y = iclip(mv_src->y - mv_in.y, -0xffff, 0xffff); const int px = mv_dst->x = iclip(mv_src->x - mv_in.x, -0xffff, 0xffff); int y3 = y2 + apply_sign(abs(py) >> 6, py); int x3 = x2 + apply_sign(abs(px) >> 6, px); if (x3 < x_proj_start || x3 >= x_proj_end) continue; if (y3 < y_proj_start || y3 >= y_proj_end) continue; y3 &= mask; x3 &= mask; const ptrdiff_t pos3 = (y3 & (sbsz8 - 1)) * stride + x3; const int k3 = (x2 >> shift) - (x3 >> shift); assert(k3 >= -1 && k3 <= +1); refmvs_traj_map *const map2 = &map[k3 + 1][ref1][pos3]; map2->y = y2 - y3; map2->x = x2 - x3; } } // FIXME split this up in smaller DSP'able functions void dav2d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx, const int col_start8, const int col_end8, const int row_start8, int row_end8) { if (!rf->have_threading) tile_row_idx = 0; assert(row_start8 >= 0); const unsigned sbsz8 = rf->sbsz >> 1; const int mfmv_sbsz8 = rf->mfmv_sbsz8; const int mfmv_edge = rf->mfmv_edge; assert((unsigned) (row_end8 - row_start8) <= sbsz8); assert(!(row_start8 & (sbsz8 - 1))); row_end8 = imin(row_end8, rf->ih8); const int col_start8i = imax(col_start8 - mfmv_edge, 0); const int col_end8i = imin(col_end8 + mfmv_edge, rf->iw8); const int sample_step = rf->frm_hdr->tmvp_sample_step; const ptrdiff_t stride = rf->rp_stride; const ptrdiff_t offset = sbsz8 * stride * tile_row_idx; const ptrdiff_t poffset = 
rf->have_frame_threading ? row_start8 * stride : (sbsz8 + 2) * stride * tile_row_idx + 2 * stride; refmvs_sngl_mv_block *rp_proj = &rf->rp_proj[poffset]; if (!rf->have_frame_threading) { memcpy(&rp_proj[col_start8 - 2 * rf->rp_stride], &rp_proj[col_start8 + (sbsz8 - 2) * rf->rp_stride], (col_end8 - col_start8) * sizeof(*rp_proj)); memcpy(&rp_proj[col_start8 - 1 * rf->rp_stride], &rp_proj[col_start8 + (sbsz8 - 1) * rf->rp_stride], (col_end8 - col_start8) * sizeof(*rp_proj)); } for (int y = row_start8; y < row_end8; y++) { for (int x = col_start8; x < col_end8; x++) rp_proj[x].mv.y = INVALID_MV; rp_proj += stride; } mv *rp_traj[7]; refmvs_traj_map *rp_map[3][7]; if (rf->seq_hdr->mv_traj) { const int mask = mfmv_sbsz8 - 1; for (int n = 0; n < 7 /*rf->frm_hdr->n_ref_frames*/; n++) { mv *tj = rp_traj[n] = &rf->rp_traj[n][offset]; for (int y = row_start8; y < row_end8; y++) { for (int x = col_start8; x < col_end8; x++) tj[x].y = INVALID_MV; tj += stride; } for (int k = -1; k <= +1; k++) { const int x_start = imax(0, col_start8 - k * mfmv_sbsz8); const int x_end = imin(rf->iw8, ((col_end8 + mask) & ~mask) - k * mfmv_sbsz8); refmvs_traj_map *map = rp_map[k + 1][n] = &rf->rp_map[k + 1][n][offset]; for (int y = row_start8; y < row_end8; y++) { for (int x = x_start; x < x_end; x++) map[x].n = INVALID_TRAJ; map += stride; } } } } rp_proj = &rf->rp_proj[poffset]; const int shift = rf->mfmv_k_shift, mask = ~(sample_step - 1); const int col_start8_shifted = col_start8 >> shift; const int col_end8_shifted = (col_end8 - 1) >> shift; for (int n = 0; n < rf->n_mfmvs; n++) { const int ref2cur = rf->mfmv_ref2cur[n]; if (ref2cur == INVALID_REF2CUR) continue; const int ref = rf->mfmv[n].ref; const int tgt = rf->mfmv[n].tgt; const int ref_sign = rf->mfmv[n].dir; const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride]; for (int y = row_start8; y < row_end8; y += sample_step) { for (int x = col_start8i; x < col_end8i; x += sample_step) { const ptrdiff_t pos = (y & (sbsz8 - 
1)) * stride + x; const refmvs_temporal_block *rb = &r[pos]; const int b_ref = rb->ref.ref[ref_sign]; if (b_ref == -1) continue; const int ref2idx = rf->mfmv_ref2idx[n][b_ref]; mv b_mv = dequantize_mv(rb->mv.mv[ref_sign]); if (b_mv.y == INVALID_MV) continue; if (rf->seq_hdr->mv_traj && ref2idx != -1) check_traj_intersect(rf, rp_traj, rp_map, ref, ref2idx, y, x, b_mv, col_start8_shifted, col_end8_shifted); int ref2ref = rf->mfmv_ref2ref[n][b_ref]; if (!ref2ref || (ref2ref < 0) != ref_sign) continue; const mv mv1 = scale_mv(b_mv, -rf->mfmv_ref2sf[n][b_ref][0]); int y1 = y - apply_sign(abs(mv1.y) >> 6, mv1.y); if (y1 < 0 || y1 >= rf->ih8) continue; y1 &= mask; int x1 = x - apply_sign(abs(mv1.x) >> 6, mv1.x); if (x1 < col_start8 || x1 >= col_end8) continue; x1 &= mask; const int y_proj_start = y1 & ~(mfmv_sbsz8 - 1); const int y_proj_end = imin(y_proj_start + mfmv_sbsz8, row_end8); if (y < y_proj_start || y >= y_proj_end) continue; const int x_sb_align = x1 & ~(mfmv_sbsz8 - 1); const int x_proj_start = imax(x_sb_align - mfmv_edge, 0); const int x_proj_end = imin(x_sb_align + mfmv_sbsz8 + rf->mfmv_edge, rf->iw8); if (x < x_proj_start || x >= x_proj_end) continue; const ptrdiff_t pos1 = (y1 & (sbsz8 - 1)) * stride + x1; if (rp_proj[pos1].mv.y != INVALID_MV && (tgt == -1 || ref2idx != tgt || rp_proj[pos1].ref == abs(ref2ref))) { continue; } if (rf->seq_hdr->mv_traj) { const int k1 = (x1 >> shift) - (x >> shift); assert(k1 >= -1 && k1 <= +1); rp_traj[ref][pos1].y = iclip(mv1.y, -2047, 2047); rp_traj[ref][pos1].x = iclip(mv1.x, -2047, 2047); rp_map[k1 + 1][ref][pos].y = y1 - y; rp_map[k1 + 1][ref][pos].x = x1 - x; assert((x >> shift) + k1 >= col_start8_shifted && (x >> shift) + k1 <= col_end8_shifted); do /* so we can "break" out of it, saves indentation */ { if (ref2idx < 0) break; const mv mv2 = scale_mv(b_mv, rf->mfmv_ref2sf[n][b_ref][1]); rp_traj[ref2idx][pos1].y = iclip(mv2.y, -2047, 2047); rp_traj[ref2idx][pos1].x = iclip(mv2.x, -2047, 2047); int y2 = y + 
apply_sign(abs(b_mv.y) >> 6, b_mv.y); if (y2 < y_proj_start || y2 >= y_proj_end) break; y2 &= mask; int x2 = x + apply_sign(abs(b_mv.x) >> 6, b_mv.x); if (x2 < x_proj_start || x2 >= x_proj_end) break; x2 &= mask; const ptrdiff_t pos2 = (y2 & (sbsz8 - 1)) * stride + x2; const int k2 = (x1 >> shift) - (x2 >> shift); assert(k2 >= -1 && k2 <= +1); rp_map[k2 + 1][ref2idx][pos2].y = y1 - y2; rp_map[k2 + 1][ref2idx][pos2].x = x1 - x2; assert((x2 >> shift) + k2 >= col_start8_shifted && (x2 >> shift) + k2 <= col_end8_shifted); } while (0); } if (ref2ref < 0) { b_mv.y = -b_mv.y; b_mv.x = -b_mv.x; } rp_proj[pos1].mv = b_mv; rp_proj[pos1].ref = abs(ref2ref); } } } if (rf->frm_hdr->tip.frame_mode) { tip_projection(rf, rp_proj, stride, col_start8, col_end8, row_start8, row_end8, mfmv_sbsz8, sbsz8, rf->frm_hdr->tmvp_sample_step); if (rf->seq_hdr->tip_hole_fill) { fill_holes(rf, rp_proj, stride, col_start8, col_end8, row_start8, row_end8, mfmv_sbsz8, sbsz8, rf->frm_hdr->tmvp_sample_step); smoothen(rf, rp_proj, stride, col_start8, col_end8, row_start8, row_end8, mfmv_sbsz8, sbsz8, rf->frm_hdr->tmvp_sample_step); } } if (sample_step > 1) { for (int n = 0; n < rf->frm_hdr->n_ref_frames; n++) fill_gap_traj(rp_traj[n], stride, col_start8, col_end8, row_start8, row_end8, mfmv_sbsz8, sbsz8); fill_gap_proj(rp_proj, stride, col_start8, col_end8, row_start8, row_end8, mfmv_sbsz8, sbsz8); } } // cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors // into buffers for use in future frame's temporal MV prediction void dav2d_refmvs_save_tmvs(const Dav2dRefmvsDSPContext *const dsp, refmvs_tile *const rt, const int col_start8, int col_end8, const int row_start8, int row_end8) { const refmvs_frame *const rf = rt->rf; assert(row_start8 >= 0); assert((unsigned) (row_end8 - row_start8) <= 32U && (unsigned) (col_end8 - col_start8) <= 32U); row_end8 = imin(row_end8, rf->ih8); col_end8 = imin(col_end8, rf->iw8); // keep a backup of top (at 8x8 resolution) for next sbrow const 
refmvs_block *const b = &rt->r[(((row_end8 - 1) & 31) * 2 + 1) * 128]; rt->ra_tl = rt->ra[col_end8 - 1]; for (int x = col_start8; x < col_end8; x++) { const refmvs_block *const cand_b = &b[((x * 2) & 127) + 0]; rt->ra[x] = *cand_b; } } static unsigned abs_closest_ref(const int8_t *const ref2ref, const int8_t *const cur2ref, const int dir) { int b = 0xff; for (int n = 0; n < 7; n++) { const int a = abs(ref2ref[n]); if (((cur2ref[n] > 0 && ref2ref[n] > 0 && dir) || (cur2ref[n] < 0 && ref2ref[n] < 0 && !dir)) && a < b) { b = a; } } return b; } static int topo_insert(int cnt, const int idx, int8_t *const order, int8_t *const rev_order, const int8_t (*const cnv)[7], const uint8_t refcnt[7]) { if (rev_order[idx] != -1) return cnt; rev_order[idx] = 0; // dummy for (int n = 0; n < 7 * !!refcnt[idx]; n++) { const int r_idx = cnv[idx][n]; if (r_idx == -1) continue; cnt = topo_insert(cnt, r_idx, order, rev_order, cnv, refcnt); } order[cnt] = idx; rev_order[idx] = cnt; return cnt + 1; } int dav2d_refmvs_init_frame(refmvs_frame *const rf, const Dav2dSequenceHeader *const seq_hdr, const Dav2dFrameHeader *const frm_hdr, const uint8_t ref_poc[7], refmvs_temporal_block *const rp, const uint8_t ref_ref_poc[7][7], const uint8_t refcnt[7], /*const*/ refmvs_temporal_block *const rp_ref[7], const int have_threading, const int have_frame_threading) { const int rp_stride = ((frm_hdr->width + 255) & ~255) >> 3; const int n_tile_rows = have_threading ? 
frm_hdr->tiling.t.rows : 1; const int n_blocks = rp_stride * n_tile_rows; rf->sbsz = 16 << frm_hdr->sb128; const int mfmv_sb128 = frm_hdr->sb128 && frm_hdr->tmvp_sample_step > 1; rf->mfmv_k_shift = 3 + mfmv_sb128; rf->mfmv_sbsz8 = 8 << mfmv_sb128; rf->mfmv_edge = rf->mfmv_sbsz8 >> (frm_hdr->tmvp_sample_step == 1); rf->seq_hdr = seq_hdr; rf->frm_hdr = frm_hdr; rf->iw8 = (frm_hdr->width + 7) >> 3; rf->ih8 = (frm_hdr->height + 7) >> 3; rf->iw4 = rf->iw8 << 1; rf->ih4 = rf->ih8 << 1; rf->rp = rp; rf->rp_stride = rp_stride; rf->have_threading = have_threading; rf->have_frame_threading = have_frame_threading; if (n_blocks * rf->sbsz > rf->n_blocks) { const int sbsz8 = rf->sbsz >> 1; const size_t rp_proj_sz = have_frame_threading ? sizeof(*rf->rp_proj) * ((rf->ih8 + 31) & ~31) * rp_stride: sizeof(*rf->rp_proj) * (2 + sbsz8) * n_blocks; const size_t rp_traj_sz = sizeof(mv) * sbsz8 * n_blocks; const size_t rp_map_sz = sizeof(**rf->rp_map) * sbsz8 * n_blocks; const size_t r_above_sz = sizeof(*rf->ra) * n_blocks; dav2d_free_aligned(rf->rp_proj); uint8_t *mem = dav2d_alloc_aligned(ALLOC_REFMVS, 7 * 3 * rp_map_sz + rp_proj_sz + 7 * rp_traj_sz + r_above_sz, 64); if (!mem) { rf->rp_proj = NULL; rf->n_blocks = 0; return DAV2D_ERR(ENOMEM); } rf->rp_proj = (refmvs_sngl_mv_block *) mem; mem += rp_proj_sz; for (int n = 0; n < 7; n++) { rf->rp_traj[n] = (mv *) mem; mem += rp_traj_sz; } for (int n = 0; n < 3; n++) for (int m = 0; m < 7; m++) { rf->rp_map[n][m] = (refmvs_traj_map *) mem; mem += rp_map_sz; } rf->ra = (refmvs_block *) mem; rf->n_blocks = n_blocks * rf->sbsz; } const int poc = frm_hdr->frame_offset; int8_t ref2ref[7][7], ref2cur[7][7], refref2curref_idx[7][7]; uint8_t have_ref_sign[7][2] = { { 0 } }; for (int i = 0; i < frm_hdr->n_ref_frames; i++) { const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[i], poc); rf->ref_sign[i] = poc_diff < 0; rf->pocdiff[i] = iclip(get_poc_diff(seq_hdr->order_hint_n_bits, poc, ref_poc[i]), -31, 31); rf->abspocdiff[i] = 
abs(rf->pocdiff[i]); for (int n = 0; n < 7 * !!refcnt[i]; n++) { ref2ref[i][n] = get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[i], ref_ref_poc[i][n]); if (ref2ref[i][n] > 0) have_ref_sign[i][0] = 1; if (ref2ref[i][n] < 0) have_ref_sign[i][1] = 1; ref2cur[i][n] = get_poc_diff(seq_hdr->order_hint_n_bits, poc, ref_ref_poc[i][n]); int m; for (m = 0; m < frm_hdr->n_ref_frames; m++) if (ref_ref_poc[i][n] == ref_poc[m]) break; refref2curref_idx[i][n] = m == frm_hdr->n_ref_frames ? -1 : m; } } uint64_t flipmask = 0; for (int i = 0; i < frm_hdr->n_ref_frames; i++) { for (int n = 0; n < frm_hdr->n_ref_frames; n++) { const int flip = rf->ref_sign[i] == rf->ref_sign[n] ? get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[i], ref_poc[n]) < 0 : rf->ref_sign[n]; flipmask |= ((uint64_t) flip) << (i * 8 + n); } } rf->ref_flip = flipmask; // tip setup memcpy(rf->tip.ref.ref, frm_hdr->tip.ref, 2); if (rf->frm_hdr->tip.frame_mode) { const unsigned tip0poc = ref_poc[rf->tip.ref.ref[0]]; const unsigned tip1poc = ref_poc[rf->tip.ref.ref[1]]; const int d2 = get_poc_diff(seq_hdr->order_hint_n_bits, tip1poc, tip0poc); rf->tip.delta = abs(d2); const int d1 = rf->pocdiff[rf->tip.ref.ref[0]]; const int dv = div_mult[imin(abs(d2), 31)]; rf->tip.sf[0] = imin(abs(d1), 31) * dv; if ((d1 < 0) ^ (d2 < 0)) rf->tip.sf[0] *= -1; const int d3 = rf->pocdiff[rf->tip.ref.ref[1]]; rf->tip.sf[1] = imin(abs(d3), 31) * dv; if ((d3 < 0) ^ (d2 < 0)) rf->tip.sf[1] *= -1; } // temporal MV setup rf->n_mfmvs = 0; rf->rp_ref = rp_ref; rf->mfmv_mask = 0; if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) { // sort refs uint8_t order[7]; for (int n = 0; n < frm_hdr->n_ref_frames; n++) { const int pocdiff = rf->pocdiff[n]; int m; for (m = n; m > 0 && pocdiff > rf->pocdiff[order[m - 1]]; m--) order[m] = order[m - 1]; order[m] = n; } // find bwd/fwd ref split point int first_fut; for (first_fut = 0; first_fut < frm_hdr->n_ref_frames && rf->ref_sign[order[first_fut]]; first_fut++) { /* empty */ } // dependency 
ordering int8_t topo_order[7], rev_topo_order[7]; memset(rev_topo_order, -1, sizeof(rev_topo_order)); int topo_cnt = 0; for (int n = 0; n < frm_hdr->n_ref_frames; n++) topo_cnt = topo_insert(topo_cnt, n, topo_order, rev_topo_order, refref2curref_idx, refcnt); if (topo_cnt <= 1) goto end; uint8_t ref_done[7][2] = {{ 0 }}; // mark keyframes as done since they don't have MVs for (int n = 0; n < frm_hdr->n_ref_frames; n++) if (!rp_ref[n]) ref_done[n][0] = ref_done[n][1] = 1; if (seq_hdr->tip && (rf->ref_sign[rf->tip.ref.ref[0]] || rf->ref_sign[rf->tip.ref.ref[1]])) { const int o = rev_topo_order[rf->tip.ref.ref[0]] > rev_topo_order[rf->tip.ref.ref[1]]; const int dir = get_poc_diff(seq_hdr->order_hint_n_bits, ref_poc[rf->tip.ref.ref[!o]], ref_poc[rf->tip.ref.ref[o]]) < 0; rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = rf->tip.ref.ref[!o], .tgt = rf->tip.ref.ref[o], .dir = dir, }; ref_done[rf->tip.ref.ref[!o]][dir] = 1; } // adjacent refs for (int n = 0; n < 2; n++) { int ref1 = -1, ref2 = -1; if (first_fut - n > 0) { ref1 = order[first_fut - n - 1]; if (!have_ref_sign[ref1][1]) ref1 = -1; } if (first_fut + n < frm_hdr->n_ref_frames) { ref2 = order[first_fut + n]; if (!have_ref_sign[ref2][0]) ref2 = -1; } int order = 0; if (ref1 >= 0 && ref2 >= 0) { uint8_t acr1 = abs_closest_ref(ref2ref[ref1], ref2cur[ref1], 0); uint8_t acr2 = abs_closest_ref(ref2ref[ref2], ref2cur[ref2], 1); order = acr1 < acr2; } if (order && !ref_done[ref1][1]) { assert(ref1 >= 0); rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref1, .tgt = -1, .dir = 1, }; ref_done[ref1][1] = 1; if (rf->n_mfmvs == 3) break; } if (ref2 >= 0 && !ref_done[ref2][0]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref2, .tgt = -1, .dir = 0, }; ref_done[ref2][0] = 1; if (rf->n_mfmvs == 3) break; } if (!order && ref1 >= 0 && !ref_done[ref1][1]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref1, .tgt = -1, .dir = 1, }; ref_done[ref1][1] = 1; if (rf->n_mfmvs == 3) break; } } // bwd adjacent refs, 
opposite direction if (rf->n_mfmvs < 3 && first_fut > 0) { const int ref = order[first_fut - 1]; if (!ref_done[ref][0]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref, .tgt = -1, .dir = 0, }; ref_done[ref][0] = 1; } if (rf->n_mfmvs < 3 && first_fut > 1) { const int ref2 = order[first_fut - 2]; if (!ref_done[ref2][0]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref2, .tgt = -1, .dir = 0, }; ref_done[ref2][0] = 1; } } } for (int n = topo_cnt - 1; n >= 0; n--) { const int ref = topo_order[n]; const int dir = rf->pocdiff[ref] >= 0; if (!ref_done[ref][dir]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref, .tgt = -1, .dir = dir, }; ref_done[ref][dir] = 1; if (rf->n_mfmvs == 4) break; } if (!ref_done[ref][!dir]) { rf->mfmv[rf->n_mfmvs++] = (struct MfmvRef) { .ref = ref, .tgt = -1, .dir = !dir, }; ref_done[ref][!dir] = 1; if (rf->n_mfmvs == 4) break; } } for (int n = 0; n < 7; n++) if (ref_done[n][0] || ref_done[n][1]) rf->mfmv_mask |= 1 << n; for (int n = 0; n < rf->n_mfmvs; n++) { const int rpoc = ref_poc[rf->mfmv[n].ref]; const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits, rpoc, frm_hdr->frame_offset); if (abs(diff1) > 31) { rf->mfmv_ref2cur[n] = INVALID_REF2CUR; } else { rf->mfmv_ref2cur[n] = diff1; for (int m = 0; m < 7; m++) { const int rrpoc = ref_ref_poc[rf->mfmv[n].ref][m]; const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits, rpoc, rrpoc); rf->mfmv_ref2ref[n][m] = diff2 + 31U < 63U ? diff2 : 0; int l; for (l = 0; l < 7; l++) if (rrpoc == ref_poc[l]) break; rf->mfmv_ref2idx[n][m] = l == 7 ? 
-1 : l; const int d1 = rf->mfmv_ref2cur[n]; const int d2 = rf->mfmv_ref2ref[n][m]; const int dv = div_mult[imin(abs(d2), 31)]; rf->mfmv_ref2sf[n][m][0] = imin(abs(d1), 31) * dv; if ((d1 < 0) ^ (d2 < 0)) rf->mfmv_ref2sf[n][m][0] *= -1; const int d3 = d1 - d2; rf->mfmv_ref2sf[n][m][1] = imin(abs(d3), 31) * dv; if ((d3 < 0) ^ (d2 > 0)) rf->mfmv_ref2sf[n][m][1] *= -1; } } } } end: rf->use_ref_frame_mvs = rf->n_mfmvs > 0; return 0; } static void splat_mv_c(refmvs_block *s_dst, refmvs_block *const s_src, refmvs_temporal_block *t_dst, const ptrdiff_t t_stride, refmvs_temporal_block *const t_src, const int bw4, int bh4) { s_src->oy4 = 0; do { s_src->ox4 = 0; for (int x = 0; x < bw4; x += 2, s_src->ox4 += 2) { memcpy(&s_dst[x], s_src, offsetof(refmvs_block, lmv)); if (bw4 > 1) { s_src->ox4++; memcpy(&s_dst[x + 1], s_src, offsetof(refmvs_block, lmv)); s_src->ox4--; } if (bh4 > 1) { s_src->oy4++; memcpy(&s_dst[x + 128], s_src, offsetof(refmvs_block, lmv)); if (bw4 > 1) { s_src->ox4++; memcpy(&s_dst[x + 129], s_src, offsetof(refmvs_block, lmv)); s_src->ox4--; } s_src->oy4--; } if (t_dst) t_dst[x >> 1] = *t_src; } s_dst += 128 * 2; if (t_dst) t_dst += t_stride; s_src->oy4 += 2; bh4 -= 2; } while (bh4 > 0); } static void splat_warpmv_c(refmvs_block *s_dst, refmvs_block *const s_src, refmvs_temporal_block *t_dst, const ptrdiff_t t_stride, refmvs_temporal_block *const t_src, int64_t mvy, int64_t mvx, const Dav2dWarpedMotionParams *const mat, const int bw4, int bh4) { assert(bw4 > 1 && bh4 > 1); s_src->oy4 = 0; do { int64_t mvxi = mvx, mvyi = mvy; s_src->ox4 = 0; for (int x = 0; x < bw4; x += 2, s_src->ox4 += 2) { const union mv warpmv = (union mv) { .y = iclip(apply_sign64((llabs(mvyi) + 4096) >> 13, mvyi), -0xffff, 0xffff), .x = iclip(apply_sign64((llabs(mvxi) + 4096) >> 13, mvxi), -0xffff, 0xffff), }; if (s_src->mf & 2) s_src->mv[0] = warpmv; t_src->mv.mv[0] = t_src->mv.mv[1] = quantize_mv(warpmv); s_dst[x] = *s_src; s_src->ox4++; s_dst[x + 1] = *s_src; s_src->oy4++; s_dst[x + 
129] = *s_src; s_src->ox4--; s_dst[x + 128] = *s_src; s_src->oy4--; if (t_dst) { t_dst[x >> 1].mv.n = t_src->mv.n; t_dst[x >> 1].ref.pair = t_src->mv.n == INVALID_TRAJ * 0x10001U ? -1 : t_src->ref.pair; } mvxi += (mat->matrix[2] - 0x10000) * 8; mvyi += mat->matrix[4] * 8; } mvx += mat->matrix[3] * 8; mvy += (mat->matrix[5] - 0x10000) * 8; s_dst += 2 * 128; if (t_dst) t_dst += t_stride; s_src->oy4 += 2; bh4 -= 2; } while (bh4); } static void splat_comp_warpmv_c(refmvs_block *s_dst, refmvs_block *const s_src, refmvs_temporal_block *t_dst, const ptrdiff_t t_stride, refmvs_temporal_block *t_src, int64_t mvy1, int64_t mvx1, int64_t mvy2, int64_t mvx2, const Dav2dWarpedMotionParams *const wm1, const Dav2dWarpedMotionParams *const wm2, const int bw4, int bh4, const int t_swap, const uint8_t *mask, const int w_swap) { assert(bw4 > 1 && bh4 > 1); s_src->oy4 = 0; do { int64_t mvxi1 = mvx1, mvyi1 = mvy1, mvxi2 = mvx2, mvyi2 = mvy2; s_src->ox4 = 0; for (int x = 0; x < bw4; x += 2, s_src->ox4 += 2) { const union mv warpmv1 = (union mv) { .y = iclip(apply_sign64((llabs(mvyi1) + 4096) >> 13, mvyi1), -0xffff, 0xffff), .x = iclip(apply_sign64((llabs(mvxi1) + 4096) >> 13, mvxi1), -0xffff, 0xffff), }; if (s_src->mf & 2) s_src->mv[0] = warpmv1; t_src->mv.mv[t_swap] = quantize_mv(warpmv1); const union mv warpmv2 = (union mv) { .y = iclip(apply_sign64((llabs(mvyi2) + 4096) >> 13, mvyi2), -0xffff, 0xffff), .x = iclip(apply_sign64((llabs(mvxi2) + 4096) >> 13, mvxi2), -0xffff, 0xffff), }; if (s_src->mf & 2) s_src->mv[1] = warpmv2; t_src->mv.mv[!t_swap] = quantize_mv(warpmv2); if (mask) { const int d = mask[x >> 1]; if (d != 2) t_src->mv.mv[d ^ w_swap].n = INVALID_TRAJ; } s_dst[x] = *s_src; s_src->ox4++; s_dst[x + 1] = *s_src; s_src->oy4++; s_dst[x + 129] = *s_src; s_src->ox4--; s_dst[x + 128] = *s_src; s_src->oy4--; if (t_dst) { if (t_src->mv.mv[0].n == INVALID_TRAJ) { if (t_src->mv.mv[1].n == INVALID_TRAJ) { t_dst[x >> 1].ref.pair = -1; } else { t_dst[x >> 1].mv.n = t_src->mv.mv[1].n * 
0x10001U; t_dst[x >> 1].ref.pair = (uint8_t) t_src->ref.ref[1] * 0x101U; } } else { if (t_src->mv.mv[1].n == INVALID_TRAJ) { t_dst[x >> 1].mv.n = t_src->mv.mv[0].n * 0x10001U; t_dst[x >> 1].ref.pair = (uint8_t) t_src->ref.ref[0] * 0x101U; } else { t_dst[x >> 1] = *t_src; } } } mvxi1 += (wm1->matrix[2] - 0x10000) * 8; mvyi1 += wm1->matrix[4] * 8; mvxi2 += (wm2->matrix[2] - 0x10000) * 8; mvyi2 += wm2->matrix[4] * 8; } mvx1 += wm1->matrix[3] * 8; mvy1 += (wm1->matrix[5] - 0x10000) * 8; mvx2 += wm2->matrix[3] * 8; mvy2 += (wm2->matrix[5] - 0x10000) * 8; if (mask) mask += bw4 >> 1; s_dst += 2 * 128; if (t_dst) t_dst += t_stride; s_src->oy4 += 2; bh4 -= 2; } while (bh4); } static void splat_comp_wedgemv_c(refmvs_block *s_dst, refmvs_block *const s_src, refmvs_temporal_block *t_dst, const ptrdiff_t t_stride, refmvs_temporal_block *const t_src, const int bw4, int bh4, const uint8_t *mask, const int w_swap) { assert(bw4 > 1 && bh4 > 1 && mask); s_src->oy4 = 0; do { s_src->ox4 = 0; for (int x = 0; x < bw4; x += 2, s_src->ox4 += 2) { memcpy(&s_dst[x], s_src, offsetof(refmvs_block, lmv)); s_src->ox4++; memcpy(&s_dst[x + 1], s_src, offsetof(refmvs_block, lmv)); s_src->oy4++; memcpy(&s_dst[x + 129], s_src, offsetof(refmvs_block, lmv)); s_src->ox4--; memcpy(&s_dst[x + 128], s_src, offsetof(refmvs_block, lmv)); s_src->oy4--; const int d = mask[x >> 1]; if (t_dst) { if (d != 2) { const int idx = !(d ^ w_swap); const int m = t_src->mv.mv[idx].n; t_dst[x >> 1].mv.n = m * 0x10001U; t_dst[x >> 1].ref.pair = (m == INVALID_TRAJ) ? 
-1 : (uint8_t) t_src->ref.ref[idx] * 0x101; } else if (t_src->mv.mv[0].n == INVALID_TRAJ) { if (t_src->mv.mv[1].n == INVALID_TRAJ) { t_dst[x >> 1].mv.n = INVALID_TRAJ * 0x10001U; t_dst[x >> 1].ref.pair = -1; } else { t_dst[x >> 1].mv.n = t_src->mv.mv[1].n * 0x10001U; t_dst[x >> 1].ref.pair = (uint8_t) t_src->ref.ref[1] * 0x101; } } else if (t_src->mv.mv[1].n == INVALID_TRAJ) { t_dst[x >> 1].mv.n = t_src->mv.mv[0].n * 0x10001U; t_dst[x >> 1].ref.pair = (uint8_t) t_src->ref.ref[0] * 0x101; } else { t_dst[x >> 1] = *t_src; } } } s_dst += 128 * 2; if (t_dst) t_dst += t_stride; mask += bw4 >> 1; s_src->oy4 += 2; bh4 -= 2; } while (bh4 > 0); } #if HAVE_ASM && 0 #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/refmvs.h" #elif ARCH_LOONGARCH64 #include "src/loongarch/refmvs.h" #elif ARCH_X86 #include "src/x86/refmvs.h" #endif #endif COLD void dav2d_refmvs_dsp_init(Dav2dRefmvsDSPContext *const c) { c->splat_mv = splat_mv_c; c->splat_warpmv = splat_warpmv_c; c->splat_comp_warpmv = splat_comp_warpmv_c; c->splat_comp_wedgemv = splat_comp_wedgemv_c; #if HAVE_ASM && 0 #if ARCH_AARCH64 || ARCH_ARM refmvs_dsp_init_arm(c); #elif ARCH_LOONGARCH64 refmvs_dsp_init_loongarch(c); #elif ARCH_X86 refmvs_dsp_init_x86(c); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/refmvs.h000066400000000000000000000232541517466257200223140ustar00rootroot00000000000000/* * Copyright © 2020-2026, VideoLAN and dav2d authors * Copyright © 2020-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_REF_MVS_H #define DAV2D_SRC_REF_MVS_H #include #include #include "dav2d/headers.h" #include "common/intops.h" #include "src/debug.h" #include "src/intra_edge.h" #include "src/tables.h" #define INVALID_REF2CUR (-32) #define INVALID_TRAJ 0x8080 PACKED(typedef union refmvs_traj_map { struct { int8_t y, x; }; uint16_t n; }) ALIGN(refmvs_traj_map, 2); CHECK_SIZE(refmvs_traj_map, 2); PACKED(typedef struct refmvs_sngl_mv_block { mv mv; uint8_t ref; }) refmvs_sngl_mv_block; CHECK_SIZE(refmvs_sngl_mv_block, 9); PACKED(typedef struct refmvs_temporal_block { union { union qmv { struct { int8_t y, x; }; uint16_t n; } mv[2]; uint32_t n; } mv; union refpair ref; }) ALIGN(refmvs_temporal_block, 2); CHECK_SIZE(refmvs_temporal_block, 6); // FIXME the size of this array can be reduced if we generate mv on-the-fly // from the (separately stored) warp matrix. 
typedef struct refmvs_block { union mv mv[2]; union refpair ref; uint8_t bs; int8_t mf; // bits: 0: globalmv, 1: warp[not gmv], 2-7: cwp_idx uint8_t ox4, oy4; // distance to top/left coordinates (in 4px units) of this block uint8_t subpel_filter; int8_t warp_type; union mv lmv[2]; // 2dmv for warp blocks (see #1146; mf & 2) int32_t m[6]; // warp matrix } ALIGN(refmvs_block, 64); CHECK_SIZE(refmvs_block, 64); typedef struct refmvs_frame { const Dav2dSequenceHeader *seq_hdr; const Dav2dFrameHeader *frm_hdr; int iw4, ih4, iw8, ih8; int sbsz /* in 4px units */; int mfmv_sbsz8, mfmv_edge, mfmv_k_shift; int use_ref_frame_mvs; struct { int32_t sf[2]; refpair ref; int8_t delta; } tip; uint8_t ref_sign[7]; int8_t pocdiff[7]; uint64_t ref_flip; uint8_t abspocdiff[7]; uint8_t mfmv_mask; struct MfmvRef { uint8_t ref; int8_t tgt; uint8_t dir; } mfmv[4]; int8_t mfmv_ref2cur[4]; int8_t mfmv_ref2ref[4][7]; int8_t mfmv_ref2idx[4][7]; int32_t mfmv_ref2sf[4][7][2]; int n_mfmvs; int n_blocks; refmvs_temporal_block *rp; ptrdiff_t rp_stride; /*const*/ refmvs_temporal_block *const *rp_ref; refmvs_sngl_mv_block *rp_proj; mv *rp_traj[7]; // FIXME we may not need 7? refmvs_traj_map *rp_map[3][7]; refmvs_block *ra; int have_threading, have_frame_threading; } refmvs_frame; typedef struct refmvs_tile { const refmvs_frame *rf; refmvs_sngl_mv_block *rp_proj; mv *rp_traj[7]; refmvs_block *ra, ra_tl; refmvs_block r[64 * 64 * 2]; // one sb may be enough? smaller sb sizes than 256x256? 
struct { int start, end; } tile_col, tile_row; struct { union mv mv[9][4][2]; int8_t cwp_idx[3 /* class-6 */][4]; union refpair ref[4]; uint8_t size[9], idx[9]; uint8_t hits[2 /* sb, b */], avail; } bank; struct { int32_t mat[7][4][6]; int8_t type[7][4]; // see #834 uint8_t hits, size[7], idx[7]; } warp; } refmvs_tile; typedef struct refmvs_candidate { union mv mv[2]; uint16_t weight; int8_t cwp_idx; int8_t y_off, x_off; } refmvs_candidate; #define decl_splat_mv_fn(name) \ void (name)(refmvs_block *s_dst, refmvs_block *s_src, \ refmvs_temporal_block *t_dst, ptrdiff_t t_stride, \ refmvs_temporal_block *t_src, int bw4, int bh4) typedef decl_splat_mv_fn(*splat_mv_fn); #define decl_splat_warpmv_fn(name) \ void (name)(refmvs_block *s_dst, refmvs_block *s_src, \ refmvs_temporal_block *t_dst, ptrdiff_t t_stride, \ refmvs_temporal_block *t_src, int64_t mvy, int64_t mvx, \ const Dav2dWarpedMotionParams *const matrix, int bw4, int bh4) typedef decl_splat_warpmv_fn(*splat_warpmv_fn); #define decl_splat_comp_warpmv_fn(name) \ void (name)(refmvs_block *s_dst, refmvs_block *s_src, \ refmvs_temporal_block *t_dst, ptrdiff_t t_stride, \ refmvs_temporal_block *t_src, \ int64_t mvy1, int64_t mvx1, int64_t mvy2, int64_t mvx2, \ const Dav2dWarpedMotionParams *const wm1, \ const Dav2dWarpedMotionParams *const wm2, \ int bw4, int bh4, int t_swap, const uint8_t *wedge, int w_mask) typedef decl_splat_comp_warpmv_fn(*splat_comp_warpmv_fn); #define decl_splat_comp_wedgemv_fn(name) \ void (name)(refmvs_block *s_dst, refmvs_block *s_src, \ refmvs_temporal_block *t_dst, ptrdiff_t t_stride, \ refmvs_temporal_block *t_src, int bw4, int bh4, \ const uint8_t *wedge, int w_mask) typedef decl_splat_comp_wedgemv_fn(*splat_comp_wedgemv_fn); typedef struct Dav2dRefmvsDSPContext { splat_mv_fn splat_mv; splat_warpmv_fn splat_warpmv; splat_comp_warpmv_fn splat_comp_warpmv; splat_comp_wedgemv_fn splat_comp_wedgemv; } Dav2dRefmvsDSPContext; // call once per frame int dav2d_refmvs_init_frame(refmvs_frame *rf, 
const Dav2dSequenceHeader *seq_hdr, const Dav2dFrameHeader *frm_hdr, const uint8_t ref_poc[7], refmvs_temporal_block *rp, const uint8_t ref_ref_poc[7][7], const uint8_t refcnt[7], /*const*/ refmvs_temporal_block *const rp_ref[7], int have_threading, int have_frame_threading); // cache the current superblock's bottom spatial values into into a "top" // buffer to act as "top" across superblock boundaries for the next sbrow void dav2d_refmvs_save_tmvs(const Dav2dRefmvsDSPContext *dsp, refmvs_tile *rt, int col_start8, int col_end8, int row_start8, int row_end8); // load temporal MVs for current tile or frame's superblock-row void dav2d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx, const int col_start8, const int col_end8, const int row_start8, int row_end8); mv dav2d_mv_projection(mv in, int num, int den, int min, int max); static ALWAYS_INLINE mv scale_mv(const mv in, const int sf) { const int64_t y = in.y * (int64_t) sf, x = in.x * (int64_t) sf; return (mv) { .y = iclip((int)((y + 0x2000 - (y < 0)) >> 14), -0xffff, 0xffff), .x = iclip((int)((x + 0x2000 - (x < 0)) >> 14), -0xffff, 0xffff), }; } static ALWAYS_INLINE unsigned quantize_mv_comp(const unsigned absv) { assert(absv < 2048); if (!absv) return 0; const int nbits = iclip(ulog2(absv) - 4, 0, 6); int res = (absv - (16 * !!nbits << nbits)) >> nbits; res += (nbits + !!nbits) * 16; return res; } static ALWAYS_INLINE union qmv quantize_mv(const union mv mv) { const int absy = abs(mv.y), absx = abs(mv.x); if (imax(absx, absy) >= 2048) return (union qmv) { .n = INVALID_TRAJ }; return (union qmv) { .y = apply_sign(quantize_mv_comp(absy), mv.y), .x = apply_sign(quantize_mv_comp(absx), mv.x), }; } // initialize tile boundaries and refmvs_block pointers for one tile/sbrow void dav2d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf, int tile_col_start4, int tile_col_end4, int tile_row_start4, int tile_row_end4, int sby, int tile_row_idx); void dav2d_refmvs_reset_sb(refmvs_tile *rt, int by, 
int bx); void dav2d_refmvs_bank_update(refmvs_tile *rt, enum BlockSize bs, int by, int bx); void dav2d_refmvs_bank_add(refmvs_tile *rt, enum BlockSize bs, int by, int bx, const Av2Block *b); int dav2d_refmvs_warp_add(refmvs_tile *rt, const Dav2dWarpedMotionParams *const m, DB_ONLY(int by4, int bx4) int ref); // call for each block void dav2d_refmvs_find(const refmvs_tile *rt, refmvs_candidate mvstack[6], int32_t (*warp)[7], int *cnt, const union refpair ref, enum BlockSize bs, int skip_mode, int by4, int bx4); void dav2d_refmvs_dsp_init(Dav2dRefmvsDSPContext *dsp); void dav2d_refmvs_dsp_init_arm(Dav2dRefmvsDSPContext *dsp); void dav2d_refmvs_dsp_init_loongarch(Dav2dRefmvsDSPContext *dsp); void dav2d_refmvs_dsp_init_x86(Dav2dRefmvsDSPContext *dsp); #endif /* DAV2D_SRC_REF_MVS_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/000077500000000000000000000000001517466257200217615ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/000077500000000000000000000000001517466257200222125ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/cdef.S000066400000000000000000000317201517466257200232420ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2024, Bogdan Gligorijevic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/riscv/asm.S" .macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2 vmslt.vx v0, \vec_tmp1, zero vneg.v \vec_tmp1, \vec_tmp1, v0.t vmmv.m v1, v0 vmslt.vx v0, \vec_tmp2, zero vneg.v \vec_tmp2, \vec_tmp2, v0.t vsra.vx \vec1, \vec_tmp1, \shift vsra.vx \vec2, \vec_tmp2, \shift vrsub.vx \vec1, \vec1, \strength vrsub.vx \vec2, \vec2, \strength vmax.vx \vec1, \vec1, zero vmax.vx \vec2, \vec2, zero vmin.vv \vec_tmp1, \vec1, \vec_tmp1 vmin.vv \vec_tmp2, \vec2, \vec_tmp2 vneg.v \vec_tmp2, \vec_tmp2, v0.t vmmv.m v0, v1 vneg.v \vec_tmp1, \vec_tmp1, v0.t .endm .macro padding_fn w, h li t5, -32768 # INT16_MIN andi t4, a7, 4 li t2, -2 # y_start .if \w == 4 vsetivli zero, \w + 4, e16, m1, ta, ma .else vsetivli zero, \w + 4, e16, m2, ta, ma .endif vmv.v.x v0, t5 bnez t4, L(top_done_\w\()x\h) slli t5, a1, 1 addi t5, t5, 2 slli t5, t5, 1 sub t5, a0, t5 sh1add t4, a1, t5 vse16.v v0, (t5) vse16.v v0, (t4) li t2, 0 L(top_done_\w\()x\h): andi t4, a7, 8 li t3, 2 + \h # y_end bnez t4, L(bottom_done_\w\()x\h) li t5, \h mul t5, a1, t5 addi t5, t5, -2 sh1add t5, t5, a0 sh1add t4, a1, t5 vse16.v v0, (t5) vse16.v 
v0, (t4) addi t3, t3, -2 L(bottom_done_\w\()x\h): andi t4, a7, 1 li t0, -2 # x_start .if \w == 4 vsetivli zero, 2, e16, m1, ta, ma .else vsetivli zero, 2, e16, m2, ta, ma .endif bnez t4, L(left_done_\w\()x\h) mul t5, a1, t2 addi t5, t5, -2 sh1add t5, t5, a0 sub t0, t3, t2 3: vse16.v v0, (t5) sh1add t5, a1, t5 addi t0, t0, -1 bnez t0, 3b L(left_done_\w\()x\h): andi t4, a7, 2 li t1, 2 + \w # x_end bnez t4, L(right_done_\w\()x\h) mul t5, t2, a1 addi t5, t5, \w sh1add t5, t5, a0 sub t1, t3, t2 4: vse16.v v0, (t5) sh1add t5, a1, t5 addi t1, t1, -1 bnez t1, 4b li t1, \w L(right_done_\w\()x\h): beqz t2, L(top_skip_\w\()x\h) mul t5, a1, t2 add t5, t0, t5 sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start add a5, a5, t0 sub t5, t1, t0 # x_end - x_start slli t6, t0, 1 .if \w == 4 vsetvli zero, t5, e16, m1, ta, ma .else vsetvli zero, t5, e16, m2, ta, ma .endif 5: vle8.v v0, (a5) addi t2, t2, 1 vzext.vf2 v2, v0 add a5, a3, a5 vse16.v v2, (a0) sh1add a0, a1, a0 bnez t2, 5b sub a0, a0, t6 # tmp -= x_start L(top_skip_\w\()x\h): li a5, \h beqz t0, L(left_skip_\w\()x\h) sh1add a0, t0, a0 # tmp += x_start 7: .if \w == 4 vsetivli zero, 2, e16, m1, ta, ma .else vsetivli zero, 2, e16, m2, ta, ma .endif vle8.v v0, (a4) addi a5, a5, -1 vzext.vf2 v2, v0 addi a4, a4, 2 vse16.v v2, (a0) sh1add a0, a1, a0 bnez a5, 7b li a5, \h mul t5, a1, a5 add t5, t5, t0 slli t5, t5, 1 sub a0, a0, t5 # tmp -= h * tmp_stride + x_start L(left_skip_\w\()x\h): 8: .if \w == 4 vsetvli zero, t1, e16, m1, ta, ma .else vsetvli zero, t1, e16, m2, ta, ma .endif vle8.v v0, (a2) vzext.vf2 v2, v0 vse16.v v2, (a0) add a2, a3, a2 sh1add a0, a1, a0 addi a5, a5, -1 bnez a5, 8b li a5, \h sh1add a0, t0, a0 # tmp += x_start add a6, a6, t0 # bottom += x_start beq a5, t3, L(bottom_skip_\w\()x\h) sub t5, t1, t0 .if \w == 4 vsetvli zero, t5, e16, m1, ta, ma .else vsetvli zero, t5, e16, m2, ta, ma .endif 9: vle8.v v0, (a6) add a6, a3, a6 vzext.vf2 v2, v0 addi a5, a5, 1 vse16.v v2, (a0) sh1add a0, a1, a0 bne a5, t3, 9b 
L(bottom_skip_\w\()x\h): li t6, \h mul t6, a3, t6 sub a2, a2, t6 # src -= h * src_stride mul t5, a1, t3 add t5, t5, t0 slli t5, t5, 1 sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start .endm .macro cdef_fn w, h function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb" csrw vxrm, zero addi sp, sp, -32 - 144*2 sd a5, 24(sp) # pri_strength sd a6, 16(sp) # sec_strength sd a7, 8(sp) # dir ld a7, 8 + 32 + 144*2(sp) # edges mv a6, a4 # bottom mv a5, a3 # top mv a4, a2 # left mv a3, a1 # dst_stride mv a2, a0 # dst li a1, 12 # tmp_stride addi a0, sp, 32 + 2*(2*12+2) padding_fn \w, \h ld a4, 32 + 2*144(sp) # damping ld a5, 24(sp) # pri_strength ld a6, 16(sp) # sec_strength ld a7, 8(sp) # dir beqz a5, cdef_filter_sec_only_\w\()x\h bnez a6, cdef_filter_pri_sec_\w\()x\h andi t0, a5, 1 li t1, 4 sub t4, t1, t0 li t1, 63 clz t2, a5 sub t1, t1, t2 sub t1, a4, t1 li t0, \h la t2, dav2d_cdef_directions addi t3, a7, 2 sh1add t2, t3, t2 blt zero, t1, 1f mv t1, zero 1: vsetivli zero, \w, e16, m1, ta, mu lb t3, 0(t2) vle8.v v0, (a2) vzext.vf2 v2, v0 sh1add t6, t3, a0 slli t3, t3, 1 sub t3, a0, t3 vle16.v v4, (t6) vle16.v v6, (t3) vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a5, t1, v8, v16 vmul.vx v28, v16, t4 vmacc.vx v28, t4, v8 lb t3, 1(t2) andi t5, t4, 3 ori t5, t5, 2 sh1add t6, t3, a0 slli t3, t3, 1 sub t3, a0, t3 vsetvli zero, zero, e16, m1, ta, mu vle16.v v4, (t6) vle16.v v6, (t3) vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a5, t1, v8, v16 vmacc.vx v28, t5, v16 vmacc.vx v28, t5, v8 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, ma vnclip.wi v24, v28, 4 vadd.vv v28, v2, v24 vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v24, v28, 0 vse8.v v24, (a2) addi t0, t0, -1 add a2, a2, a3 sh1add a0, a1, a0 bnez t0, 1b addi sp, sp, 32 + 144*2 ret cdef_filter_sec_only_\w\()x\h: li t1, 63 clz t2, a6 sub t1, t1, t2 
sub t1, a4, t1 li t0, \h la t2, dav2d_cdef_directions addi t3, a7, 4 sh1add t3, t3, t2 sh1add t2, a7, t2 2: vsetivli zero, \w, e16, m1, ta, mu lb t4, 0(t3) lb t5, 0(t2) vle8.v v0, (a2) vzext.vf2 v2, v0 sh1add t6, t4, a0 slli t4, t4, 1 sub t4, a0, t4 vle16.v v4, (t6) vle16.v v6, (t4) sh1add t4, t5, a0 slli t5, t5, 1 sub t5, a0, t5 vle16.v v8, (t4) vle16.v v10, (t5) vwsub.vv v12, v4, v2 vwsub.vv v14, v6, v2 vwsub.vv v16, v8, v2 vwsub.vv v18, v10, v2 vsetvli zero, zero, e32, m2, ta, mu li t4, 2 constrain_vectors v4, v6, v12, a6, t1, v12, v14 constrain_vectors v8, v10, v14, a6, t1, v16, v18 vmul.vx v28, v18, t4 vmacc.vx v28, t4, v16 vmacc.vx v28, t4, v14 vmacc.vx v28, t4, v12 lb t4, 1(t3) lb t5, 1(t2) sh1add t6, t4, a0 slli t4, t4, 1 sub t4, a0, t4 vsetvli zero, zero, e16, m1, ta, mu vle16.v v4, (t6) vle16.v v6, (t4) sh1add t4, t5, a0 slli t5, t5, 1 sub t5, a0, t5 vle16.v v8, (t4) vle16.v v10, (t5) vwsub.vv v12, v4, v2 vwsub.vv v14, v6, v2 vwsub.vv v16, v8, v2 vwsub.vv v18, v10, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a6, t1, v12, v14 constrain_vectors v8, v10, v14, a6, t1, v16, v18 vadd.vv v4, v28, v12 vadd.vv v28, v4, v14 vadd.vv v4, v28, v16 vadd.vv v28, v4, v18 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, ma vnclip.wi v24, v28, 4 vadd.vv v28, v2, v24 vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v24, v28, 0 vse8.v v24, (a2) addi t0, t0, -1 add a2, a2, a3 sh1add a0, a1, a0 bnez t0, 2b addi sp, sp, 32 + 144*2 ret cdef_filter_pri_sec_\w\()x\h: li t1, 63 clz t2, a5 clz t3, a6 sub t2, t1, t2 sub t3, t1, t3 sub t1, a4, t2 sub t2, a4, t3 li t0, \h la t3, dav2d_cdef_directions blt zero, t1, 3f mv t1, zero 3: vsetivli zero, \w, e16, m1, ta, ma li t4, 4 andi t6, a5, 1 addi t5, a7, 2 sub t4, t4, t6 sh1add t5, t5, t3 vle8.v v0, (a2) lb t6, 0(t5) vzext.vf2 v2, v0 sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v2 vmax.vv v24, v4, v2 vminu.vv v20, v6, v20 
vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a5, t1, v8, v16 vmul.vx v28, v16, t4 vmacc.vx v28, t4, v8 lb t6, 1(t5) andi t4, t4, 3 ori t4, t4, 2 sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a5, t1, v8, v16 addi t5, a7, 4 vmacc.vx v28, t4, v16 vmacc.vx v28, t4, v8 sh1add t5, t5, t3 lb t6, 0(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu li t6, 2 constrain_vectors v4, v6, v12, a6, t2, v8, v16 vmacc.vx v28, t6, v16 vmacc.vx v28, t6, v8 lb t6, 1(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a6, t2, v8, v16 sh1add t5, a7, t3 vadd.vv v4, v28, v8 vadd.vv v28, v4, v16 vsetvli zero, zero, e16, m1, ta, ma lb t6, 0(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu li t6, 2 constrain_vectors v4, v6, v12, a6, t2, v8, v16 vmacc.vx v28, t6, v16 vmacc.vx v28, t6, v8 lb t6, 1(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 
vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v12, a6, t2, v8, v16 vadd.vv v4, v28, v8 vadd.vv v28, v4, v16 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, mu vnclip.wi v16, v28, 4 vadd.vv v28, v2, v16 vmslt.vv v0, v20, v28 vmerge.vvm v4, v20, v28, v0 vmslt.vv v0, v4, v24 vmerge.vvm v28, v24, v4, v0 vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v24, v28, 0 vse8.v v24, (a2) addi t0, t0, -1 add a2, a2, a3 sh1add a0, a1, a0 bnez t0, 3b addi sp, sp, 32 + 144*2 ret endfunc .endm cdef_fn 4, 4 cdef_fn 4, 8 cdef_fn 8, 8 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/cdef16.S000066400000000000000000000315351517466257200234150ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2024, Bogdan Gligorijevic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/riscv/asm.S" .macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2 vmslt.vx v0, \vec_tmp1, zero vneg.v \vec_tmp1, \vec_tmp1, v0.t vmmv.m v1, v0 vmslt.vx v0, \vec_tmp2, zero vneg.v \vec_tmp2, \vec_tmp2, v0.t vsra.vx \vec1, \vec_tmp1, \shift vsra.vx \vec2, \vec_tmp2, \shift vrsub.vx \vec1, \vec1, \strength vrsub.vx \vec2, \vec2, \strength vmax.vx \vec1, \vec1, zero vmax.vx \vec2, \vec2, zero vmin.vv \vec_tmp1, \vec1, \vec_tmp1 vmin.vv \vec_tmp2, \vec2, \vec_tmp2 vneg.v \vec_tmp2, \vec_tmp2, v0.t vmmv.m v0, v1 vneg.v \vec_tmp1, \vec_tmp1, v0.t .endm .macro padding_fn w, h li t5, -32768 # INT16_MIN andi t4, a7, 4 li t2, -2 # y_start .if \w == 4 vsetivli zero, \w + 4, e16, m1, ta, ma .else vsetivli zero, \w + 4, e16, m2, ta, ma .endif vmv.v.x v0, t5 bnez t4, L(top_done_\w\()x\h) slli t5, a1, 1 addi t5, t5, 2 slli t5, t5, 1 sub t5, a0, t5 sh1add t4, a1, t5 vse16.v v0, (t5) vse16.v v0, (t4) li t2, 0 L(top_done_\w\()x\h): andi t4, a7, 8 li t3, 2 + \h # y_end bnez t4, L(bottom_done_\w\()x\h) li t5, \h mul t5, a1, t5 addi t5, t5, -2 sh1add t5, t5, a0 sh1add t4, a1, t5 vse16.v v0, (t5) vse16.v v0, (t4) addi t3, t3, -2 L(bottom_done_\w\()x\h): andi t4, a7, 1 li t0, -2 # x_start .if \w == 4 vsetivli zero, 2, e16, m1, ta, ma .else vsetivli zero, 2, e16, m2, ta, ma .endif bnez t4, L(left_done_\w\()x\h) mul t5, a1, t2 addi t5, t5, -2 sh1add t5, 
t5, a0 sub t0, t3, t2 3: vse16.v v0, (t5) sh1add t5, a1, t5 addi t0, t0, -1 bnez t0, 3b L(left_done_\w\()x\h): andi t4, a7, 2 li t1, 2 + \w # x_end bnez t4, L(right_done_\w\()x\h) mul t5, t2, a1 addi t5, t5, \w sh1add t5, t5, a0 sub t1, t3, t2 4: vse16.v v0, (t5) sh1add t5, a1, t5 addi t1, t1, -1 bnez t1, 4b li t1, \w L(right_done_\w\()x\h): beqz t2, L(top_skip_\w\()x\h) mul t5, a1, t2 add t5, t0, t5 sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start sh1add a5, t0, a5 # top += x_start sub t5, t1, t0 slli t6, t0, 1 .if \w == 4 vsetvli zero, t5, e16, m1, ta, ma .else vsetvli zero, t5, e16, m2, ta, ma .endif 5: vle16.v v2, (a5) addi t2, t2, 1 add a5, a3, a5 vse16.v v2, (a0) sh1add a0, a1, a0 bnez t2, 5b sub a0, a0, t6 # tmp -= x_start L(top_skip_\w\()x\h): li a5, \h beqz t0, L(left_skip_\w\()x\h) sh1add a0, t0, a0 # tmp += x_start 7: .if \w == 4 vsetivli zero, 2, e16, m1, ta, ma .else vsetivli zero, 2, e16, m2, ta, ma .endif vle16.v v2, (a4) addi a5, a5, -1 addi a4, a4, 4 vse16.v v2, (a0) sh1add a0, a1, a0 bnez a5, 7b li a5, \h mul t5, a1, a5 add t5, t5, t0 slli t5, t5, 1 sub a0, a0, t5 # tmp -= h * tmp_stride + x_start L(left_skip_\w\()x\h): 8: .if \w == 4 vsetvli zero, t1, e16, m1, ta, ma .else vsetvli zero, t1, e16, m2, ta, ma .endif vle16.v v2, (a2) add a2, a3, a2 vse16.v v2, (a0) sh1add a0, a1, a0 addi a5, a5, -1 bnez a5, 8b li a5, \h sh1add a0, t0, a0 # tmp += x_start sh1add a6, t0, a6 # bottom += x_start beq a5, t3, L(bottom_skip_\w\()x\h) sub t5, t1, t0 .if \w == 4 vsetvli zero, t5, e16, m1, ta, ma .else vsetvli zero, t5, e16, m2, ta, ma .endif 9: vle16.v v2, (a6) add a6, a3, a6 addi a5, a5, 1 vse16.v v2, (a0) sh1add a0, a1, a0 bne a5, t3, 9b L(bottom_skip_\w\()x\h): li t6, \h mul t6, a3, t6 sub a2, a2, t6 # src -= h * PXSTRIDE(src_stride) mul t5, a1, t3 add t5, t5, t0 slli t5, t5, 1 sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start .endm .macro cdef_fn w, h function cdef_filter_block_\w\()x\h\()_16bpc_rvv, export=1, ext="v,zba,zbb" csrw vxrm, zero 
addi sp, sp, -32 - 144*2 sd a5, 24(sp) # pri_strength sd a6, 16(sp) # sec_strength sd a7, 8(sp) # dir ld a7, 8 + 32 + 144*2(sp) # edges mv a6, a4 # bottom mv a5, a3 # top mv a4, a2 # left mv a3, a1 # dst_stride mv a2, a0 # dst li a1, 12 # tmp_stride addi a0, sp, 32 + 2*(2*12+2) padding_fn \w, \h ld a4, 32 + 2*144(sp) # damping ld a5, 24(sp) # pri_strength ld a6, 16(sp) # sec_strength ld a7, 8(sp) # dir beqz a5, cdef_filter_sec_only_\w\()x\h bnez a6, cdef_filter_pri_sec_\w\()x\h li t1, 64-8 ld t4, 32 + 2*144 + 16(sp) # bitdepth_max clz t4, t4 sub t4, t1, t4 sra t4, a5, t4 andi t0, t4, 1 li t1, 4 sub t4, t1, t0 li t1, 63 clz t2, a5 sub t1, t1, t2 sub t1, a4, t1 li t0, \h la t2, dav2d_cdef_directions addi t3, a7, 2 sh1add t2, t3, t2 vsetivli zero, \w, e16, m1, ta, ma blt zero, t1, 1f mv t1, zero 1: lb t3, 0(t2) vle16.v v2, (a2) sh1add t6, t3, a0 slli t3, t3, 1 sub t3, a0, t3 vle16.v v4, (t6) vle16.v v6, (t3) vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a5, t1, v8, v16 vmul.vx v28, v16, t4 vmacc.vx v28, t4, v8 lb t3, 1(t2) andi t5, t4, 3 ori t5, t5, 2 sh1add t6, t3, a0 slli t3, t3, 1 sub t3, a0, t3 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (t6) vle16.v v6, (t3) vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a5, t1, v8, v16 vmacc.vx v28, t5, v16 vmacc.vx v28, t5, v8 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, ma vnclip.wi v24, v28, 4 vadd.vv v28, v2, v24 vse16.v v28, (a2) add a2, a2, a3 sh1add a0, a1, a0 addi t0, t0, -1 bnez t0, 1b addi sp, sp, 32 + 144*2 ret cdef_filter_sec_only_\w\()x\h: li t1, 63 clz t2, a6 sub t1, t1, t2 sub t1, a4, t1 li t0, \h la t2, dav2d_cdef_directions addi t3, a7, 4 sh1add t3, t3, t2 sh1add t2, a7, t2 vsetivli zero, \w, e16, m1, ta, ma 2: lb t4, 0(t3) lb t5, 0(t2) vle16.v v2, (a2) sh1add t6, t4, a0 slli t4, t4, 1 sub t4, a0, t4 vle16.v v4, (t6) vle16.v v6, (t4) sh1add t4, t5, a0 
slli t5, t5, 1 sub t5, a0, t5 vle16.v v8, (t4) vle16.v v10, (t5) vwsub.vv v12, v4, v2 vwsub.vv v14, v6, v2 vwsub.vv v16, v8, v2 vwsub.vv v18, v10, v2 vsetvli zero, zero, e32, m2, ta, mu li t4, 2 constrain_vectors v4, v6, v2, a6, t1, v12, v14 constrain_vectors v8, v10, v2, a6, t1, v16, v18 vmul.vx v28, v18, t4 vmacc.vx v28, t4, v16 vmacc.vx v28, t4, v14 vmacc.vx v28, t4, v12 lb t4, 1(t3) lb t5, 1(t2) sh1add t6, t4, a0 slli t4, t4, 1 sub t4, a0, t4 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (t6) vle16.v v6, (t4) sh1add t4, t5, a0 slli t5, t5, 1 sub t5, a0, t5 vle16.v v8, (t4) vle16.v v10, (t5) vwsub.vv v12, v4, v2 vwsub.vv v14, v6, v2 vwsub.vv v16, v8, v2 vwsub.vv v18, v10, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a6, t1, v12, v14 constrain_vectors v8, v10, v2, a6, t1, v16, v18 vadd.vv v4, v28, v12 vadd.vv v28, v4, v14 vadd.vv v4, v28, v16 vadd.vv v28, v4, v18 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, ma vnclip.wi v24, v28, 4 vadd.vv v28, v2, v24 vse16.v v28, (a2) add a2, a2, a3 sh1add a0, a1, a0 addi t0, t0, -1 bnez t0, 2b addi sp, sp, 32 + 144*2 ret cdef_filter_pri_sec_\w\()x\h: li t1, 63 clz t2, a5 clz t3, a6 sub t2, t1, t2 sub t3, t1, t3 sub t1, a4, t2 sub t2, a4, t3 li t0, \h la t3, dav2d_cdef_directions vsetivli zero, \w, e16, m1, ta, ma blt zero, t1, 3f mv t1, zero 3: li t5, 64-8 ld t4, 32 + 2*144 + 16(sp) # bitdepth_max clz t4, t4 sub t4, t5, t4 sra t4, a5, t4 li t6, 4 andi t5, t4, 1 sub t4, t6, t5 addi t5, a7, 2 sh1add t5, t5, t3 vle16.v v2, (a2) lb t6, 0(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v2 vmax.vv v24, v4, v2 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a5, t1, v8, v16 vmul.vx v28, v16, t4 vmacc.vx v28, t4, v8 andi t4, t4, 3 ori t4, t4, 2 lb t6, 1(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, 
zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a5, t1, v8, v16 addi t5, a7, 4 vmacc.vx v28, t4, v16 vmacc.vx v28, t4, v8 sh1add t5, t5, t3 lb t6, 0(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu li t6, 2 constrain_vectors v4, v6, v2, a6, t2, v8, v16 vmacc.vx v28, t6, v16 vmacc.vx v28, t6, v8 lb t6, 1(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a6, t2, v8, v16 sh1add t5, a7, t3 vadd.vv v4, v28, v8 vadd.vv v28, v4, v16 vsetvli zero, zero, e16, m1, ta, ma lb t6, 0(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu li t6, 2 constrain_vectors v4, v6, v2, a6, t2, v8, v16 vmacc.vx v28, t6, v16 vmacc.vx v28, t6, v8 lb t6, 1(t5) sh1add a4, t6, a0 slli t6, t6, 1 sub t6, a0, t6 vsetvli zero, zero, e16, m1, ta, ma vle16.v v4, (a4) vle16.v v6, (t6) vminu.vv v20, v4, v20 vmax.vv v24, v4, v24 vminu.vv v20, v6, v20 vmax.vv v24, v6, v24 vwsub.vv v8, v4, v2 vwsub.vv v16, v6, v2 vsetvli zero, zero, e32, m2, ta, mu constrain_vectors v4, v6, v2, a6, t2, v8, v16 vadd.vv v4, v28, v8 vadd.vv v28, v4, v16 vmslt.vx v0, v28, zero vadd.vi v28, v28, -1, v0.t vsetvli zero, zero, e16, m1, ta, ma 
vnclip.wi v16, v28, 4 vadd.vv v28, v2, v16 vmslt.vv v0, v20, v28 vmerge.vvm v4, v20, v28, v0 vmslt.vv v0, v4, v24 vmerge.vvm v28, v24, v4, v0 vse16.v v28, (a2) add a2, a2, a3 sh1add a0, a1, a0 addi t0, t0, -1 bnez t0, 3b addi sp, sp, 32 + 144*2 ret endfunc .endm cdef_fn 4, 4 cdef_fn 4, 8 cdef_fn 8, 8 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/cpu.S000066400000000000000000000046011517466257200231260ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2024, Nathan Egge * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

#include "src/riscv/asm.S"

// This function detects non-compliant RVV 0.7.1 hardware which reports support
// for the V extension through HWCAP, by intentionally setting tail and mask
// agnostic vector configurations that were only introduced in RVV 0.9 spec.
// Existing non-compliant (pre RVV 1.0) hardware will set the VILL bit in VTYPE
// (indicating an illegal vector configuration) which is stored in the XLEN-1
// bit position, thus a simple sign check is sufficient for detection.
// This workaround is inexpensive and harmless on compliant hardware, but we
// should still consider removing it once all non-compliant RVV 0.7.1 hardware
// is out of service.
//
// Returns (in a0) 1 when the resulting VTYPE is positive as a signed value,
// i.e. the configuration was accepted, and 0 otherwise.
function has_compliant_rvv, export=1, ext=v
        vsetvli         t0, zero, e8, m1, ta, ma  // request a tail/mask agnostic config
        csrr            a0, vtype                 // read back the resulting VTYPE
        sgtz            a0, a0                    // a0 = (vtype > 0); 0 if the sign (VILL) bit is set
        ret
endfunc

// Returns the value of the vlenb CSR in a0.
function get_vlenb, export=1
        csrr            a0, vlenb
        ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/ipred.S000066400000000000000000000233011517466257200234400ustar00rootroot00000000000000/******************************************************************************
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/riscv/asm.S" function dc_gen_8bpc_rvv, export=1, ext="v,zbb" .variant_cc dav2d_dc_gen_8bpc_rvv add t1, a1, a2 srli t5, t1, 1 mv t1, a1 addi t2, a0, 1 vsetvli zero, t1, e16, m4, ta, ma vmv.v.x v0, zero 1: vsetvli t3, t1, e8, m2, tu, ma vle8.v v4, (t2) vwaddu.wv v0, v0, v4 sub t1, t1, t3 add t2, t2, t3 bnez t1, 1b mv t1, a2 mv t2, a0 vsetvli zero, t1, e16, m4, ta, ma vmv.v.x v8, zero 2: vsetvli t3, t1, e8, m2, tu, ma sub t2, t2, t3 vle8.v v4, (t2) vwaddu.wv v8, v8, v4 sub t1, t1, t3 bnez t1, 2b vsetvli zero, zero, e32, m8, ta, ma vmv.s.x v16, t5 vmv.s.x v12, zero vsetvli zero, a1, e16, m4, ta, ma vwredsum.vs v24, v0, v16 vsetvli zero, a2, e16, m4, ta, ma vwredsum.vs v16, v8, v12 vsetvli zero, zero, e32, m8, ta, ma vmv.x.s t5, v24 vmv.x.s t1, v16 add t5, t5, t1 add t1, a1, a2 ctz t1, t1 srl a0, t5, t1 beq a1, a2, 5f slli t1, a1, 1 sltu t2, t1, a2 slli t3, a2, 1 sltu t1, t3, a1 or t1, t1, t2 bnez t1, 3f li t1, 0x5556 j 4f 3: li t1, 0x3334 4: mul a0, a0, t1 srli a0, a0, 16 5: jr t0 endfunc function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb" .variant_cc dav2d_dc_gen_top_8bpc_rvv mv t1, a1 srli t5, a1, 1 addi a0, a0, 1 
vsetvli zero, t1, e16, m4, ta, ma vmv.v.x v0, zero 1: vsetvli t3, t1, e8, m2, tu, ma vle8.v v4, (a0) vwaddu.wv v0, v0, v4 sub t1, t1, t3 add a0, a0, t3 bnez t1, 1b j dc_gen_sum_up_8bpc_rvv endfunc function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb" .variant_cc dav2d_dc_gen_left_8bpc_rvv mv t1, a1 srli t5, a1, 1 vsetvli t2, t1, e16, m4, ta, ma vmv.v.x v0, zero 1: vsetvli t3, t1, e8, m2, tu, ma sub a0, a0, t3 vle8.v v4, (a0) vwaddu.wv v0, v0, v4 sub t1, t1, t3 bnez t1, 1b j dc_gen_sum_up_8bpc_rvv endfunc function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb" .variant_cc dav2d_dc_gen_sum_up_8bpc_rvv vsetvli zero, a1, e32, m8, ta, ma vmv.s.x v4, t5 vsetvli zero, zero, e16, m4, ta, ma vwredsum.vs v8, v0, v4 vsetvli zero, zero, e32, m8, ta, ma vmv.x.s t5, v8 ctz t1, a1 srl a0, t5, t1 jr t0 endfunc function cfl_pred_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero 1: li t2, 0 mv t3, a2 2: vsetvli t0, t3, e16, m2, ta, ma add t4, a0, t2 vle16.v v0, (a5) sh1add a5, t0, a5 vwmul.vx v4, v0, a6 vsetvli zero, zero, e32, m4, ta, mu vneg.v v8, v4 vmslt.vx v0, v4, x0 vmax.vv v12, v8, v4 vssra.vi v16, v12, 6 vneg.v v16, v16, v0.t vadd.vx v20, v16, a4 vmax.vx v0, v20, zero vsetvli zero, zero, e16, m2, ta, ma vnclipu.wi v4, v0, 0 vsetvli zero, zero, e8, m1, ta, ma vnclipu.wi v0, v4, 0 vse8.v v0, (t4) add t2, t0, t2 sub t3, t3, t0 bnez t3, 2b addi a3, a3, -1 add a0, a0, a1 bnez a3, 1b ret endfunc function ipred_cfl_8bpc_rvv, export=1, ext=v mv t6, a0 # dst mv a0, a2 # topleft mv t4, a1 # stride mv a1, a3 # width mv a2, a4 # height jal t0, dc_gen_8bpc_rvv mv a2, a3 # width mv a3, a4 # height mv a4, a0 # dc_get_top mv a0, t6 # dst mv a1, t4 # stride j cfl_pred_8bpc_rvv endfunc function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba" # dc = 128, then just rearrange registers mv a2, a3 mv a3, a4 li a4, 128 j cfl_pred_8bpc_rvv endfunc function ipred_cfl_top_8bpc_rvv, export=1, ext=v mv t6, a0 # dst mv a0, a2 # topleft mv t4, a1 # stride mv a1, a3 # width jal t0, dc_gen_top_8bpc_rvv mv a3, a4 
# height mv a4, a0 # dc_get_top mv a0, t6 # dst mv a2, a1 # width mv a1, t4 # stride j cfl_pred_8bpc_rvv endfunc function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba" mv t6, a0 # dst mv a0, a2 # topleft mv t4, a1 # stride mv a1, a4 # height mv a2, a3 # width jal t0, dc_gen_left_8bpc_rvv mv a3, a4 # height mv a4, a0 # dc_get_left mv a1, t4 # stride mv a0, t6 # dst j cfl_pred_8bpc_rvv endfunc function ipred_paeth_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero li t0, 0 mv t3, a2 lbu t1, (a2) addi a6, a2, -1 addi a2, a2, 1 1: lbu t2, (a6) mv t3, a3 2: sub t5, a3, t3 add t5, a2, t5 vsetvli t6, t3, e8, m1, ta, ma vle8.v v2, (t5) vwaddu.vx v4, v2, t2 vsetvli zero, zero, e16, m2, ta, ma vwsub.vx v8, v4, t1 vsetvli zero, zero, e32, m4, ta, mu vzext.vf4 v24, v2 vsub.vx v12, v8, t1 vmslt.vx v0, v12, zero vneg.v v12, v12, v0.t vsub.vx v16, v8, t2 vmslt.vx v0, v16, zero vneg.v v16, v16, v0.t vsub.vv v20, v8, v24 vmslt.vx v0, v20, zero vneg.v v20, v20, v0.t sub t5, a3, t3 vmsleu.vv v4, v16, v20 vmsleu.vv v5, v16, v12 vmsgtu.vv v0, v20, v12 vmand.mm v6, v4, v5 vsetvli zero, zero, e8, m1, ta, ma vmerge.vxm v8, v2, t1, v0 vmmv.m v0, v6 add t5, a0, t5 sub t3, t3, t6 vmerge.vxm v4, v8, t2, v0 vse8.v v4, (t5) bnez t3, 2b addi a4, a4, -1 addi a6, a6, -1 add a0, a0, a1 bnez a4, 1b ret endfunc function ipred_smooth_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero la t0, dav2d_sm_weights add t1, t0, a3 add t2, a2, a3 add t0, t0, a4 lbu t2, (t2) sub t3, a2, a4 addi a6, a2, -1 addi a2, a2, 1 lbu t3, (t3) 1: mv t6, a3 lbu a7, (a6) lbu t4, (t0) 2: li a5, 256 vsetvli t5, t6, e8, m1, ta, ma vle8.v v2, (t1) add t1, t1, t5 vle8.v v4, (a2) add a2, a2, t5 sub a5, a5, t4 vwmulu.vx v8, v4, t4 vsetvli zero, zero, e16, m2, ta, ma mul a5, a5, t3 vadd.vx v4, v8, a5 vsetvli zero, zero, e8, m1, ta, ma vwmulu.vx v8, v2, a7 vneg.v v12, v2 vwmaccu.vx v8, t2, v12 vsetvli zero, zero, e16, m2, ta, ma vwaddu.vv v12, v4, v8 sub a5, a3, t6 sub t6, t6, t5 add a5, a5, a0 vnclipu.wi v2, v12, 9 vsetvli zero, zero, 
e8, m1, ta, ma vnclipu.wi v0, v2, 0 vse8.v v0, (a5) bnez t6, 2b sub t1, t1, a3 add a0, a0, a1 sub a2, a2, a3 addi a4, a4, -1 addi t0, t0, 1 addi a6, a6, -1 bnez a4, 1b ret endfunc function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero la t0, dav2d_sm_weights add t2, a2, a3 add t0, t0, a4 sub t3, a2, a4 addi a2, a2, 1 lbu t3, (t3) 1: mv t6, a3 lbu t4, (t0) 2: li a5, 256 vsetvli t5, t6, e8, m1, ta, ma vle8.v v4, (a2) add a2, a2, t5 sub a5, a5, t4 vwmulu.vx v8, v4, t4 vsetvli zero, zero, e16, m2, ta, ma mul a5, a5, t3 vwaddu.vx v4, v8, a5 sub a5, a3, t6 sub t6, t6, t5 add a5, a5, a0 vsetvli zero, zero, e16, m2, ta, ma vnclipu.wi v2, v4, 8 vsetvli zero, zero, e8, m1, ta, ma vnclipu.wi v0, v2, 0 vse8.v v0, (a5) bnez t6, 2b add a0, a0, a1 sub a2, a2, a3 addi a4, a4, -1 addi t0, t0, 1 bnez a4, 1b ret endfunc function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero la t0, dav2d_sm_weights add t1, t0, a3 add t2, a2, a3 lbu t2, (t2) addi a6, a2, -1 1: mv t6, a3 lbu a7, (a6) 2: vsetvli t5, t6, e8, m1, ta, ma vle8.v v2, (t1) add t1, t1, t5 vwmulu.vx v8, v2, a7 vneg.v v12, v2 vwmaccu.vx v8, t2, v12 sub a5, a3, t6 sub t6, t6, t5 add a5, a5, a0 vsetvli zero, zero, e8, m1, ta, ma vnclipu.wi v0, v8, 8 vse8.v v0, (a5) bnez t6, 2b sub t1, t1, a3 add a0, a0, a1 addi a4, a4, -1 addi a6, a6, -1 bnez a4, 1b ret endfunc function pal_pred_8bpc_rvv, export=1, ext="v,zba" csrw vxrm, zero vsetivli t5, 8, e8, m1, ta, ma vle8.v v30, (a2) li t0, 2 srli t1, a4, 1 1: mv t4, a4 2: vsetvli t5, t1, e8, m1, ta, ma vle8.v v0, (a3) add a3, a3, t5 vsrl.vi v2, v0, 4 sub t6, a4, t4 vand.vi v1, v0, 7 add t6, a0, t6 vrgather.vv v3, v30, v1 addi t2, t6, 1 vrgather.vv v4, v30, v2 slli t5, t5, 1 vsse8.v v3, (t6), t0 sub t4, t4, t5 vsse8.v v4, (t2), t0 bnez t4, 2b addi a5, a5, -1 add a0, a0, a1 bnez a5, 1b ret endfunc 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/ipred16.S000066400000000000000000000235211517466257200236130ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2024, Bogdan Gligorijevic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

#include "src/riscv/asm.S"

/*
 * dc_gen_16bpc_rvv: rounded average of two runs of 16-bit samples that sit
 * on either side of the pointer in a0.
 *
 * In:  a0 = edge pointer (ipred_cfl_16bpc_rvv passes its topleft argument)
 *      a1 = number of samples summed upwards in memory from a0 + 2
 *           (ipred_cfl_16bpc_rvv passes the width)
 *      a2 = number of samples summed downwards in memory, ending just
 *           below a0 (ipred_cfl_16bpc_rvv passes the height)
 *      t0 = return address; callers in this file enter with jal t0
 * Out: a0 = (sum + ((a1 + a2) >> 1)) / (a1 + a2), where the division is a
 *           right shift by ctz(a1 + a2) followed, when a1 != a2, by a
 *           fixed-point multiply that approximates the leftover /3 or /5
 * Writes t1, t2, t3, t5 and vector registers in v0-v25; a1 and a2 are
 * left untouched.
 */
function dc_gen_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav2d_dc_gen_16bpc_rvv
        add t1, a1, a2
        srli t5, t1, 1 # rounding bias: (a1 + a2) / 2
        mv t1, a1
        addi t2, a0, 2 # first sample above the edge pointer
        vsetvli zero, t1, e32, m8, ta, ma
        vmv.v.x v0, zero # 32-bit accumulators for the upward run
1:
        # tail-undisturbed so a short final strip keeps the other lanes
        vsetvli t3, t1, e16, m4, tu, ma
        vle16.v v8, (t2)
        vwaddu.wv v0, v0, v8 # widen 16-bit samples into 32-bit sums
        sub t1, t1, t3
        sh1add t2, t3, t2 # advance by vl halfwords
        bnez t1, 1b
        mv t1, a2
        mv t2, a0
        vsetvli zero, t1, e32, m8, ta, ma
        vmv.v.x v16, zero # 32-bit accumulators for the downward run
2:
        vsetvli t3, t1, e16, m4, tu, ma
        sub t1, t1, t3
        sll t3, t3, 1 # vl halfwords expressed in bytes
        sub t2, t2, t3 # step the pointer back before loading
        vle16.v v8, (t2)
        vwaddu.wv v16, v16, v8
        bnez t1, 2b
        vsetvli zero, a1, e32, m8, ta, ma
        vmv.s.x v24, t5 # seed the first reduction with the rounding bias
        vmv.s.x v25, zero # second reduction starts from zero
        vredsum.vs v8, v0, v24
        vsetvli zero, a2, e32, m8, ta, ma
        vredsum.vs v0, v16, v25
        vmv.x.s t5, v8
        vmv.x.s t1, v0
        add t5, t5, t1 # bias + both partial sums
        add t1, a1, a2
        ctz t1, t1
        srl a0, t5, t1 # divide by the power-of-two factor of a1 + a2
        beq a1, a2, 5f # equal counts: the shift was the whole division
        slli t1, a1, 1
        sltu t2, t1, a2 # 2 * a1 < a2
        slli t3, a2, 1
        sltu t1, t3, a1 # 2 * a2 < a1
        or t1, t1, t2
        bnez t1, 3f # one count is more than twice the other
        li t1, 0xAAAB # 0xAAAB / 2^17 is roughly 1/3
        j 4f
3:
        li t1, 0x6667 # 0x6667 / 2^17 is roughly 1/5
4:
        mul a0, a0, t1
        li t1, 17
        srl a0, a0, t1
5:
        jr t0
endfunc

/*
 * dc_gen_top_16bpc_rvv: accumulate a1 16-bit samples starting at a0 + 2,
 * then tail-jump to dc_gen_sum_up_16bpc_rvv to reduce and scale them.
 *
 * In:  a0 = edge pointer, a1 = sample count, t0 = return address
 * Leaves for the tail call: v0 = 32-bit partial sums, t5 = a1 >> 1
 * (rounding bias). a0 is advanced past the samples that were read.
 */
function dc_gen_top_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav2d_dc_gen_top_16bpc_rvv
        mv t1, a1
        srli t5, a1, 1 # rounding bias
        addi a0, a0, 2 # skip the halfword at the edge pointer itself
        vsetvli zero, t1, e32, m2, ta, ma
        vmv.v.x v0, zero
1:
        vsetvli t3, t1, e16, m1, tu, ma
        vle16.v v4, (a0)
        vwaddu.wv v0, v0, v4
        sh1add a0, t3, a0
        sub t1, t1, t3
        bnez t1, 1b
        j dc_gen_sum_up_16bpc_rvv
endfunc

/*
 * dc_gen_left_16bpc_rvv: accumulate a1 16-bit samples that end just below
 * a0 (the pointer is walked downwards in memory), then tail-jump to
 * dc_gen_sum_up_16bpc_rvv.
 *
 * In:  a0 = edge pointer, a1 = sample count, t0 = return address
 * Leaves for the tail call: v0 = 32-bit partial sums, t5 = a1 >> 1.
 */
function dc_gen_left_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav2d_dc_gen_left_16bpc_rvv
        mv t1, a1
        srli t5, a1, 1 # rounding bias
        vsetvli zero, t1, e32, m2, ta, ma
        vmv.v.x v0, zero
1:
        vsetvli t3, t1, e16, m1, tu, ma
        sub t1, t1, t3
        slli t3, t3, 1 # vl halfwords expressed in bytes
        sub a0, a0, t3 # step back before loading
        vle16.v v4, (a0)
        vwaddu.wv v0, v0, v4
        bnez t1, 1b
        j dc_gen_sum_up_16bpc_rvv
endfunc

/*
 * dc_gen_sum_up_16bpc_rvv: shared tail of dc_gen_top/left_16bpc_rvv.
 *
 * In:  v0 = 32-bit partial sums, t5 = rounding bias, a1 = sample count,
 *      t0 = return address
 * Out: a0 = (bias + sum of the first min(a1, VLMAX) lanes) >> ctz(a1)
 */
function dc_gen_sum_up_16bpc_rvv, export=1, ext="v,zba,zbb"
        .variant_cc dav2d_dc_gen_sum_up_16bpc_rvv
        vsetvli zero, a1, e32, m2, ta, ma
        vmv.s.x v4, t5 # reduction starts from the rounding bias
        vredsum.vs v8, v0, v4
        vmv.x.s t5, v8
        ctz t1, a1
        srl a0, t5, t1 # exact division when a1 is a power of two
        jr t0
endfunc

/*
 * cfl_pred_16bpc_rvv: for every output sample compute
 *     clamp(a4 + sign(p) * ((abs(p) + 32) >> 6), 0, a7),  p = ac * a6
 * and store it as a 16-bit value.
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = samples per row,
 *      a3 = row count, a4 = value added to every sample (the callers in
 *      this file pass the DC here), a5 = pointer to signed 16-bit inputs
 *      read sequentially across rows, a6 = scalar multiplier,
 *      a7 = upper clamp (ipred_cfl_128_16bpc_rvv treats it as
 *      bitdepth_max)
 * Writes t0, t2, t3, t4 and vector registers in v0-v23.
 */
function cfl_pred_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero # vxrm = 0: round-to-nearest-up for vssra/vnclipu
1:
        li t2, 0 # column index within the row
        mv t3, a2 # samples left in the row
2:
        vsetvli t0, t3, e16, m2, ta, ma
        sh1add t4, t2, a0 # store address: a0 + 2 * column
        vle16.v v0, (a5)
        sh1add a5, t0, a5
        vwmul.vx v4, v0, a6 # signed 32-bit products
        vsetvli zero, zero, e32, m4, ta, mu
        vneg.v v8, v4
        vmslt.vx v0, v4, x0 # mask of negative products
        vmax.vv v12, v8, v4 # absolute value
        vssra.vi v16, v12, 6 # rounding shift on the magnitude
        vneg.v v16, v16, v0.t # restore the sign on masked lanes only
        vadd.vx v20, v16, a4
        vmax.vx v0, v20, zero # clamp below at 0
        vmin.vx v0, v0, a7 # clamp above at a7
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v4, v0, 0 # narrow 32 -> 16 bits without shifting
        vse16.v v4, (t4)
        add t2, t0, t2
        sub t3, t3, t0
        bnez t3, 2b
        addi a3, a3, -1
        add a0, a0, a1 # next row
        bnez a3, 1b
        ret
endfunc

/*
 * ipred_cfl_16bpc_rvv: obtain the DC from dc_gen_16bpc_rvv, then rearrange
 * the argument registers into the layout cfl_pred_16bpc_rvv reads and
 * tail-jump to it. a5, a6 and a7 are passed through untouched.
 */
function ipred_cfl_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a3 # width
        mv a2, a4 # height
        jal t0, dc_gen_16bpc_rvv
        mv a2, a3 # width
        mv a3, a4 # height
        mv a4, a0 # dc returned by dc_gen_16bpc_rvv
        mv a0, t6 # dst
        mv a1, t4 # stride
        j cfl_pred_16bpc_rvv
endfunc

/*
 * ipred_cfl_128_16bpc_rvv: same as ipred_cfl_16bpc_rvv but the DC is the
 * constant (a7 + 1) >> 1 instead of an average of edge samples.
 */
function ipred_cfl_128_16bpc_rvv, export=1, ext="v,zba"
        # dc = (bitdepth_max + 1) >> 1, then just rearrange registers
        mv a2, a3
        mv a3, a4
        addi a4, a7, 1
        srli a4, a4, 1
        j cfl_pred_16bpc_rvv
endfunc

/*
 * ipred_cfl_top_16bpc_rvv: DC from dc_gen_top_16bpc_rvv (samples above the
 * edge pointer only), then tail-jump to cfl_pred_16bpc_rvv.
 */
function ipred_cfl_top_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a3 # width
        jal t0, dc_gen_top_16bpc_rvv
        mv a3, a4 # height
        mv a4, a0 # dc returned by dc_gen_top_16bpc_rvv
        mv a0, t6 # dst
        mv a2, a1 # width (a1 is not modified by the dc_gen_top path)
        mv a1, t4 # stride
        j cfl_pred_16bpc_rvv
endfunc

/*
 * ipred_cfl_left_16bpc_rvv: DC from dc_gen_left_16bpc_rvv (samples below
 * the edge pointer only), then tail-jump to cfl_pred_16bpc_rvv.
 */
function ipred_cfl_left_16bpc_rvv, export=1, ext=v
        mv t6, a0 # dst
        mv a0, a2 # topleft
        mv t4, a1 # stride
        mv a1, a4 # height
        mv a2, a3 # width
        jal t0, dc_gen_left_16bpc_rvv
        mv a3, a4 # height
        mv a4, a0 # dc returned by dc_gen_left_16bpc_rvv
        mv a1, t4 # stride
        mv a0, t6 # dst
        j cfl_pred_16bpc_rvv
endfunc

/*
 * ipred_paeth_16bpc_rvv: per output sample, pick whichever of three
 * neighbours is closest to base = T + L - TL and store it, where
 *     TL = halfword at a2 (read once),
 *     L  = halfword at a2 - 2 * (row + 1) (one per row),
 *     T  = halfword at a2 + 2 * (column + 1).
 * Priority on ties is L, then T, then TL.
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = edge pointer
 *      (presumably the same topleft argument as in ipred_cfl_* - confirm
 *      against the C prototype), a3 = samples per row, a4 = row count
 */
function ipred_paeth_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        li t0, 0 # t0 is not read again in this function
        mv t3, a2 # overwritten at label 1 before any read
        lhu t1, (a2) # TL
        addi a6, a2, -2 # walks downwards in memory, one halfword per row
        addi a2, a2, 2 # first T sample
1:
        lhu t2, (a6) # L for this row
        mv t3, a3 # samples left in the row
2:
        sub t5, a3, t3 # column index
        sh1add t5, t5, a2
        vsetvli t6, t3, e16, m2, ta, ma
        vle16.v v2, (t5) # T
        vwaddu.vx v4, v2, t2 # T + L, widened to 32 bits
        vsetvli zero, zero, e32, m4, ta, mu
        vsub.vx v8, v4, t1 # base = T + L - TL
        vzext.vf2 v24, v2 # T widened for the 32-bit subtraction below
        vsub.vx v12, v8, t1
        vmslt.vx v0, v12, zero
        vneg.v v12, v12, v0.t # v12 = abs(base - TL)
        vsub.vx v16, v8, t2
        vmslt.vx v0, v16, zero
        vneg.v v16, v16, v0.t # v16 = abs(base - L)
        vsub.vv v20, v8, v24
        vmslt.vx v0, v20, zero
        vneg.v v20, v20, v0.t # v20 = abs(base - T)
        sub t5, a3, t3 # column index again, now for the store address
        vmsleu.vv v4, v16, v20 # L at least as close as T
        vmsleu.vv v5, v16, v12 # L at least as close as TL
        vmsgtu.vv v0, v20, v12 # TL strictly closer than T
        vmand.mm v6, v4, v5 # lanes where L wins
        vsetvli zero, zero, e16, m2, ta, ma
        vmerge.vxm v8, v2, t1, v0 # TL where it beats T, otherwise T
        vmmv.m v0, v6
        sh1add t5, t5, a0
        sub t3, t3, t6
        vmerge.vxm v4, v8, t2, v0 # L overrides where it wins
        vse16.v v4, (t5)
        bnez t3, 2b
        addi a4, a4, -1
        addi a6, a6, -2 # next L sample
        add a0, a0, a1 # next row
        bnez a4, 1b
        ret
endfunc

/*
 * ipred_smooth_16bpc_rvv: two-axis weighted blend. With W = the byte table
 * dav2d_sm_weights (defined outside this file), per output sample:
 *     wv = W[a4 + row], wh = W[a3 + column]
 *     out = (wv * T + (256 - wv) * B + wh * L + (256 - wh) * R + 256) >> 9
 * where T = halfword at a2 + 2 * (column + 1), L = halfword at
 * a2 - 2 * (row + 1), R = halfword at a2 + 2 * a3 and B = halfword at
 * a2 - 2 * a4 (R and B are read once, before the loops).
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = edge pointer,
 *      a3 = samples per row, a4 = row count
 * Writes a5, a6, a7, t0-t6 and vector registers in v2-v15.
 */
function ipred_smooth_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero # round-to-nearest-up for the final vnclipu
        la t0, dav2d_sm_weights
        add t1, t0, a3 # per-column weights start at table + a3
        sh1add t2, a3, a2
        slli t3, a4, 1
        add t0, t0, a4 # per-row weights start at table + a4
        lhu t2, (t2) # R
        sub t3, a2, t3
        addi a6, a2, -2 # L pointer, walks downwards in memory
        addi a2, a2, 2 # T pointer
        lhu t3, (t3) # B
1:
        mv t6, a3 # samples left in the row
        lhu a7, (a6) # L for this row
        lbu t4, (t0) # wv for this row
2:
        li a5, 256
        vsetvli t5, t6, e16, m2, ta, ma
        vle8.v v2, (t1) # wh bytes, same element count as the samples
        add t1, t1, t5
        vle16.v v4, (a2) # T
        sh1add a2, t5, a2
        sub a5, a5, t4
        vwmul.vx v8, v4, t4 # wv * T in 32 bits
        mul a5, a5, t3 # (256 - wv) * B
        vsetvli zero, zero, e32, m4, ta, ma
        vadd.vx v4, v8, a5 # vertical term
        li a5, 256
        vzext.vf4 v12, v2 # wh widened from 8 to 32 bits
        vmul.vx v8, v12, a7 # wh * L
        vrsub.vx v12, v12, a5 # 256 - wh
        vmacc.vx v8, t2, v12 # + (256 - wh) * R
        vadd.vv v12, v4, v8 # vertical + horizontal
        vsetvli zero, zero, e32, m4, ta, ma # repeats the current setting
        sub a5, a3, t6 # column index
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v12, 9 # rounding shift by 9, narrow to 16 bits
        vse16.v v2, (a5)
        bnez t6, 2b
        sub t1, t1, a3 # rewind the column weights
        slli t6, a3, 1
        add a0, a0, a1 # next row
        sub a2, a2, t6 # rewind the T pointer
        addi a4, a4, -1
        addi t0, t0, 1 # next row weight
        addi a6, a6, -2 # next L sample
        bnez a4, 1b
        ret
endfunc

/*
 * ipred_smooth_v_16bpc_rvv: vertical-only variant of the blend above:
 *     out = (wv * T + (256 - wv) * B + 128) >> 8,  wv = W[a4 + row]
 * with T = halfword at a2 + 2 * (column + 1) and B = halfword at
 * a2 - 2 * a4.
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = edge pointer,
 *      a3 = samples per row, a4 = row count
 */
function ipred_smooth_v_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        la t0, dav2d_sm_weights
        slli t3, a4, 1
        add t0, t0, a4 # per-row weights start at table + a4
        sub t3, a2, t3
        addi a2, a2, 2 # T pointer
        lhu t3, (t3) # B
1:
        mv t6, a3 # samples left in the row
        lbu t4, (t0) # wv for this row
2:
        li a5, 256
        vsetvli t5, t6, e16, m2, ta, ma
        vle16.v v4, (a2) # T
        sh1add a2, t5, a2
        sub a5, a5, t4
        vwmul.vx v8, v4, t4 # wv * T in 32 bits
        mul a5, a5, t3 # (256 - wv) * B
        vsetvli zero, zero, e32, m4, ta, ma
        vadd.vx v4, v8, a5
        vsetvli zero, zero, e32, m4, ta, ma # repeats the current setting
        sub a5, a3, t6 # column index
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v4, 8 # rounding shift by 8, narrow to 16 bits
        vse16.v v2, (a5)
        bnez t6, 2b
        slli t6, a3, 1
        add a0, a0, a1 # next row
        sub a2, a2, t6 # rewind the T pointer
        addi a4, a4, -1
        addi t0, t0, 1 # next row weight
        bnez a4, 1b
        ret
endfunc

/*
 * ipred_smooth_h_16bpc_rvv: horizontal-only variant of the blend:
 *     out = (wh * L + (256 - wh) * R + 128) >> 8,  wh = W[a3 + column]
 * with L = halfword at a2 - 2 * (row + 1) and R = halfword at
 * a2 + 2 * a3.
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = edge pointer,
 *      a3 = samples per row, a4 = row count
 */
function ipred_smooth_h_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        la t0, dav2d_sm_weights
        add t1, t0, a3 # per-column weights start at table + a3
        sh1add t2, a3, a2
        lhu t2, (t2) # R
        addi a6, a2, -2 # L pointer, walks downwards in memory
1:
        mv t6, a3 # samples left in the row
        lhu a7, (a6) # L for this row
2:
        vsetvli t5, t6, e16, m2, ta, ma
        vle8.v v2, (t1) # wh bytes
        add t1, t1, t5
        li a5, 256
        vsetvli zero, zero, e32, m4, ta, ma
        vzext.vf4 v12, v2 # wh widened from 8 to 32 bits
        vmul.vx v8, v12, a7 # wh * L
        vrsub.vx v12, v12, a5 # 256 - wh
        vmacc.vx v8, t2, v12 # + (256 - wh) * R
        sub a5, a3, t6 # column index
        sub t6, t6, t5
        sh1add a5, a5, a0
        vsetvli zero, zero, e16, m2, ta, ma
        vnclipu.wi v2, v8, 8 # rounding shift by 8, narrow to 16 bits
        vse16.v v2, (a5)
        bnez t6, 2b
        sub t1, t1, a3 # rewind the column weights
        add a0, a0, a1 # next row
        addi a4, a4, -1
        addi a6, a6, -2 # next L sample
        bnez a4, 1b
        ret
endfunc

/*
 * pal_pred_16bpc_rvv: expand packed palette indices into 16-bit samples.
 * Each input byte yields two outputs: palette[byte & 7] followed by
 * palette[byte >> 4].
 *
 * In:  a0 = destination, a1 = row stride in bytes, a2 = palette (eight
 *      halfwords are loaded), a3 = packed index bytes read sequentially
 *      across rows, a4 = samples per row, a5 = row count
 * Writes t0-t6 and vector registers in v0-v10 plus v30.
 */
function pal_pred_16bpc_rvv, export=1, ext="v,zba"
        csrw vxrm, zero
        vsetivli t5, 8, e16, m1, ta, ma
        vle16.v v30, (a2) # the eight palette entries
        li t0, 4 # store stride: every other 16-bit sample
        srli t1, a4, 1 # index bytes per row
        li t2, 1 # multiplier used only to widen 8 -> 16 bits
1:
        mv t4, a4 # samples left in the row
2:
        # NOTE(review): the requested length stays at a4 / 2 on every strip;
        # this looks like it relies on the bytes left in the row being a
        # multiple of vl - confirm for all supported widths and VLEN.
        vsetvli t5, t1, e8, mf2, ta, ma
        vle8.v v0, (a3)
        add a3, a3, t5
        vand.vi v1, v0, 7 # first index of each pair
        sub t6, a4, t4 # column index
        vsrl.vi v2, v0, 4 # second index of each pair
        vwmul.vx v4, v1, t2 # widen, vrgather below runs at e16
        vwmul.vx v6, v2, t2
        vsetvli zero, zero, e16, m1, ta, ma
        sh1add t6, t6, a0 # address of the first sample of the strip
        vrgather.vv v8, v30, v4
        addi t3, t6, 2 # address of the second sample
        vrgather.vv v10, v30, v6
        slli t5, t5, 1 # samples produced = 2 * bytes consumed
        vsse16.v v8, (t6), t0 # even sample positions
        vsse16.v v10, (t3), t0 # odd sample positions
        sub t4, t4, t5
        bnez t4, 2b
        add a0, a0, a1 # next row
        addi a5, a5, -1
        bnez a5, 1b
        ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/itx.S000066400000000000000000001177651517466257200231630ustar00rootroot00000000000000/******************************************************************************
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/riscv/asm.S" function inv_txfm_add_4x4_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 4, e16, mf2, ta, ma vle16.v v0, (a2) addi t0, a2, 8 vle16.v v1, (t0) addi t0, t0, 8 vle16.v v2, (t0) addi t0, t0, 8 vle16.v v3, (t0) jalr t0, a4 vmv.v.x v4, zero vsseg4e16.v v0, (a2) vle16.v v0, (a2) vse16.v v4, (a2) addi t0, a2, 8 vle16.v v1, (t0) vse16.v v4, (t0) addi t0, t0, 8 vle16.v v2, (t0) vse16.v v4, (t0) addi t0, t0, 8 vle16.v v3, (t0) vse16.v v4, (t0) jalr t0, a5 vssra.vi v0, v0, 4 vssra.vi v1, v1, 4 vssra.vi v2, v2, 4 vssra.vi v3, v3, 4 itx_4x4_end: vsetvli zero, zero, e8, mf4, ta, ma vle8.v v4, (a0) add t0, a0, a1 vle8.v v5, (t0) add t0, t0, a1 vle8.v v6, (t0) add t0, t0, a1 vle8.v v7, (t0) vwaddu.wv v0, v0, v4 vwaddu.wv v1, v1, v5 vwaddu.wv v2, v2, v6 vwaddu.wv v3, v3, v7 vsetvli zero, zero, e16, mf2, ta, ma vmax.vx v0, v0, zero vmax.vx v1, v1, zero vmax.vx v2, v2, zero vmax.vx v3, v3, zero vsetvli zero, zero, e8, mf4, ta, ma vnclipu.wi v4, v0, 0 vnclipu.wi v5, v1, 0 vnclipu.wi v6, v2, 0 vnclipu.wi v7, v3, 0 vse8.v v4, (a0) add a0, a0, a1 vse8.v v5, (a0) add a0, a0, a1 vse8.v v6, (a0) add a0, a0, a1 vse8.v v7, 
(a0) ret endfunc function inv_identity_e16_x4_rvv, export=1, ext=v li t1, (5793-4096)*8 vsmul.vx v4, v0, t1 vsmul.vx v5, v1, t1 vsmul.vx v6, v2, t1 vsmul.vx v7, v3, t1 vsadd.vv v0, v0, v4 vsadd.vv v1, v1, v5 vsadd.vv v2, v2, v6 vsadd.vv v3, v3, v7 jr t0 endfunc .macro iwht_4 vadd.vv v0, v0, v1 vsub.vv v5, v2, v3 vsub.vv v4, v0, v5 vsra.vi v4, v4, 1 vsub.vv v2, v4, v1 vsub.vv v1, v4, v3 vadd.vv v3, v5, v2 vsub.vv v0, v0, v1 .endm .macro idct_4 o0, o1, o2, o3 li t1, 2896 li t2, 1567 li t3, 3784 vwmul.vx v16, \o0, t1 vwmul.vx v18, \o0, t1 vwmacc.vx v16, t1, \o2 neg t1, t1 vwmacc.vx v18, t1, \o2 vwmul.vx v20, \o1, t3 neg t3, t3 vwmul.vx v22, \o1, t2 vwmacc.vx v20, t2, \o3 vwmacc.vx v22, t3, \o3 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vsadd.vv \o0, v16, v20 vsadd.vv \o1, v18, v22 vssub.vv \o2, v18, v22 vssub.vv \o3, v16, v20 .endm .macro iadst_4 o0, o1, o2, o3, lm2, lm li t1, 1321 li t2, 3803 li t3, 2482 vwmul.vx v16, v0, t1 vwmul.vx v18, v0, t3 neg t1, t1 vwmacc.vx v16, t2, v2 vwmacc.vx v18, t1, v2 neg t2, t2 vwmacc.vx v16, t3, v3 vwmacc.vx v18, t2, v3 vwsub.vv v20, v0, v2 vwadd.wv v20, v20, v3 li t1, 3344 vwmul.vx v22, v1, t1 vsetvli zero, zero, e32, \lm2, ta, ma vmul.vx v20, v20, t1 vadd.vv v24, v16, v18 vadd.vv v16, v16, v22 vadd.vv v18, v18, v22 vsub.vv v22, v24, v22 vsetvli zero, zero, e16, \lm, ta, ma vnclip.wi \o0, v16, 12 vnclip.wi \o1, v18, 12 vnclip.wi \o2, v20, 12 vnclip.wi \o3, v22, 12 .endm function inv_dct_e16_x4_rvv, export=1, ext=v idct_4 v0, v1, v2, v3 jr t0 endfunc function inv_adst_e16_x4_rvv, export=1, ext=v iadst_4 v0, v1, v2, v3, m1, mf2 jr t0 endfunc function inv_flipadst_e16_x4_rvv, export=1, ext=v iadst_4 v3, v2, v1, v0, m1, mf2 jr t0 endfunc function inv_adst_e16_x4w_rvv, export=1, ext=v iadst_4 v0, v1, v2, v3, m2, m1 jr t0 endfunc function inv_flipadst_e16_x4w_rvv, export=1, ext=v iadst_4 v3, v2, v1, v0, m2, m1 jr t0 endfunc function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v csrw 
vxrm, zero vsetivli zero, 4, e16, mf2, ta, ma vle16.v v0, (a2) addi t0, a2, 8 vle16.v v1, (t0) addi t0, t0, 8 vle16.v v2, (t0) addi t0, t0, 8 vle16.v v3, (t0) vsra.vi v0, v0, 2 vsra.vi v1, v1, 2 vsra.vi v2, v2, 2 vsra.vi v3, v3, 2 iwht_4 vmv.v.x v4, zero vsseg4e16.v v0, (a2) vle16.v v0, (a2) vse16.v v4, (a2) addi t0, a2, 8 vle16.v v1, (t0) vse16.v v4, (t0) addi t0, t0, 8 vle16.v v2, (t0) vse16.v v4, (t0) addi t0, t0, 8 vle16.v v3, (t0) vse16.v v4, (t0) iwht_4 j itx_4x4_end endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct beqz a3, 1f .endif la a4, inv_\txfm1\()_e16_x4_rvv la a5, inv_\txfm2\()_e16_x4_rvv j inv_txfm_add_4x4_rvv .ifc \txfm1\()_\txfm2, dct_dct 1: csrw vxrm, zero vsetivli zero, 4, e16, mf2, ta, ma ld t2, (a2) li t1, 2896*8 vmv.v.x v0, t2 vsmul.vx v0, v0, t1 sd x0, (a2) vsmul.vx v0, v0, t1 vssra.vi v0, v0, 4 vmv.v.v v1, v0 vmv.v.v v2, v0 vmv.v.v v3, v0 j itx_4x4_end .endif endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma vle16.v v0, (a2) addi t0, a2, 16 vle16.v v1, (t0) addi t0, t0, 16 vle16.v v2, (t0) addi t0, t0, 16 vle16.v v3, (t0) addi t0, t0, 16 vle16.v v4, (t0) addi t0, t0, 16 vle16.v v5, (t0) addi t0, t0, 16 vle16.v v6, (t0) addi t0, t0, 16 vle16.v v7, (t0) .ifc \variant, identity_ // The identity vsadd.vv and downshift vssra.vi 1 cancel out j L(itx_8x8_epilog) .else jalr t0, a4 vssra.vi v0, v0, 1 vssra.vi v1, v1, 1 vssra.vi v2, v2, 1 vssra.vi v3, v3, 1 
vssra.vi v4, v4, 1 vssra.vi v5, v5, 1 vssra.vi v6, v6, 1 vssra.vi v7, v7, 1 L(itx_8x8_epilog): vsseg8e16.v v0, (a2) vle16.v v0, (a2) addi t0, a2, 16 vle16.v v1, (t0) addi t0, t0, 16 vle16.v v2, (t0) addi t0, t0, 16 vle16.v v3, (t0) addi t0, t0, 16 vle16.v v4, (t0) addi t0, t0, 16 vle16.v v5, (t0) addi t0, t0, 16 vle16.v v6, (t0) addi t0, t0, 16 vle16.v v7, (t0) jalr t0, a5 vssra.vi v0, v0, 4 vssra.vi v1, v1, 4 vssra.vi v2, v2, 4 vssra.vi v3, v3, 4 vssra.vi v4, v4, 4 vssra.vi v5, v5, 4 vssra.vi v6, v6, 4 vssra.vi v7, v7, 4 li t1, 64 vsetvli zero, t1, e16, m8, ta, ma vmv.v.x v8, zero vse16.v v8, (a2) itx_8x8_end: vsetivli zero, 8, e8, mf2, ta, ma vle8.v v8, (a0) add t0, a0, a1 vle8.v v9, (t0) add t0, t0, a1 vle8.v v10, (t0) add t0, t0, a1 vle8.v v11, (t0) add t0, t0, a1 vle8.v v12, (t0) add t0, t0, a1 vle8.v v13, (t0) add t0, t0, a1 vle8.v v14, (t0) add t0, t0, a1 vle8.v v15, (t0) vwaddu.wv v0, v0, v8 vwaddu.wv v1, v1, v9 vwaddu.wv v2, v2, v10 vwaddu.wv v3, v3, v11 vwaddu.wv v4, v4, v12 vwaddu.wv v5, v5, v13 vwaddu.wv v6, v6, v14 vwaddu.wv v7, v7, v15 vsetvli zero, zero, e16, m1, ta, ma vmax.vx v0, v0, zero vmax.vx v1, v1, zero vmax.vx v2, v2, zero vmax.vx v3, v3, zero vmax.vx v4, v4, zero vmax.vx v5, v5, zero vmax.vx v6, v6, zero vmax.vx v7, v7, zero vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v8, v0, 0 vnclipu.wi v9, v1, 0 vnclipu.wi v10, v2, 0 vnclipu.wi v11, v3, 0 vnclipu.wi v12, v4, 0 vnclipu.wi v13, v5, 0 vnclipu.wi v14, v6, 0 vnclipu.wi v15, v7, 0 vse8.v v8, (a0) add a0, a0, a1 vse8.v v9, (a0) add a0, a0, a1 vse8.v v10, (a0) add a0, a0, a1 vse8.v v11, (a0) add a0, a0, a1 vse8.v v12, (a0) add a0, a0, a1 vse8.v v13, (a0) add a0, a0, a1 vse8.v v14, (a0) add a0, a0, a1 vse8.v v15, (a0) ret .endif endfunc .endm def_fn_8x8_base identity_ def_fn_8x8_base function inv_identity_e16_x8_rvv, export=1, ext=v vsadd.vv v0, v0, v0 vsadd.vv v1, v1, v1 vsadd.vv v2, v2, v2 vsadd.vv v3, v3, v3 vsadd.vv v4, v4, v4 vsadd.vv v5, v5, v5 vsadd.vv v6, v6, v6 vsadd.vv v7, v7, v7 jr 
t0 endfunc .macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7 idct_4 \o0, \o2, \o4, \o6 li t1, 799 li t2, 4017 li t3, 3406 li t4, 2276 vwmul.vx v22, \o1, t2 neg t2, t2 vwmul.vx v16, \o1, t1 vwmacc.vx v22, t1, \o7 vwmacc.vx v16, t2, \o7 vwmul.vx v20, \o5, t4 neg t4, t4 vwmul.vx v18, \o5, t3 vwmacc.vx v20, t3, \o3 vwmacc.vx v18, t4, \o3 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vssub.vv \o7, v22, v20 vsadd.vv v22, v22, v20 vssub.vv \o1, v16, v18 vsadd.vv v16, v16, v18 li t2, 2896 vwmul.vx v18, \o7, t2 vwmul.vx v20, \o7, t2 vwmacc.vx v20, t2, \o1 neg t2, t2 vwmacc.vx v18, t2, \o1 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vssub.vv \o7, \o0, v22 vsadd.vv \o0, \o0, v22 vssub.vv v17, \o2, v20 vsadd.vv \o1, \o2, v20 vssub.vv \o5, \o4, v18 vsadd.vv \o2, \o4, v18 vssub.vv \o4, \o6, v16 vsadd.vv \o3, \o6, v16 vmv.v.v \o6, v17 .endm .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 li t1, 4076 li t2, 401 li t3, 3612 li t4, 1931 li t5, 2598 li t6, 3166 vwmul.vx v16, v7, t1 neg t1, t1 vwmul.vx v18, v7, t2 vwmacc.vx v16, t2, v0 vwmacc.vx v18, t1, v0 vwmul.vx v20, v5, t3 neg t3, t3 vwmul.vx v22, v5, t4 vwmacc.vx v20, t4, v2 vwmacc.vx v22, t3, v2 vwmul.vx v24, v3, t5 neg t5, t5 vwmul.vx v26, v3, t6 vwmacc.vx v24, t6, v4 vwmacc.vx v26, t5, v4 li t2, 1189 li t3, 3920 li t4, 1567 li t5, 3784 li t6, 2896 vwmul.vx v28, v1, t2 neg t2, t2 vwmul.vx v30, v1, t3 vwmacc.vx v28, t3, v6 vwmacc.vx v30, t2, v6 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v24, v24, 12 vnclip.wi v26, v26, 12 vnclip.wi v28, v28, 12 vnclip.wi v30, v30, 12 vssub.vv v4, v16, v24 vsadd.vv v16, v16, v24 vsadd.vv v1, v18, v26 vsadd.vv v2, v20, v28 vsadd.vv v3, v22, v30 vssub.vv v5, v18, v26 vssub.vv v6, v20, v28 vssub.vv v30, v22, v30 vsadd.vv \o0, v16, v2 vsadd.vv \o7, v1, v3 vssub.vv v2, v16, v2 vssub.vv v3, v1, v3 vwmul.vx v16, v4, t5 vwmul.vx v18, v4, t4 vwmul.vx v20, v30, t5 vwmul.vx v22, v30, t4 vwmacc.vx v16, t4, v5 
neg t4, t4 vwmacc.vx v22, t5, v6 neg t5, t5 vwmacc.vx v20, t4, v6 vwmacc.vx v18, t5, v5 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vsadd.vv \o1, v16, v20 vsadd.vv \o6, v18, v22 vssub.vv v16, v16, v20 vssub.vv v17, v18, v22 vwmul.vx v18, v2, t6 vwmul.vx v20, v2, t6 vwmul.vx v22, v16, t6 vwmul.vx v24, v16, t6 vwmacc.vx v18, t6, v3 vwmacc.vx v22, t6, v17 neg t6, t6 vwmacc.vx v20, t6, v3 vwmacc.vx v24, t6, v17 vnclip.wi \o3, v18, 12 vnclip.wi \o4, v20, 12 vnclip.wi \o2, v22, 12 vnclip.wi \o5, v24, 12 vmv.v.x v16, zero vssub.vv \o1, v16, \o1 vssub.vv \o3, v16, \o3 vssub.vv \o5, v16, \o5 vssub.vv \o7, v16, \o7 .endm function inv_dct_e16_x8_rvv, export=1, ext=v idct_8 v0, v1, v2, v3, v4, v5, v6, v7 jr t0 endfunc function inv_adst_e16_x8_rvv, export=1, ext=v iadst_8 v0, v1, v2, v3, v4, v5, v6, v7 jr t0 endfunc function inv_flipadst_e16_x8_rvv, export=1, ext=v iadst_8 v7, v6, v5, v4, v3, v2, v1, v0 jr t0 endfunc .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct beqz a3, 1f .endif la a5, inv_\txfm2\()_e16_x8_rvv .ifc \txfm1, identity j inv_txfm_identity_add_8x8_rvv .else la a4, inv_\txfm1\()_e16_x8_rvv j inv_txfm_add_8x8_rvv .endif .ifc \txfm1\()_\txfm2, dct_dct 1: csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma ld t2, (a2) li t1, 2896*8 vmv.v.x v0, t2 vsmul.vx v0, v0, t1 sd x0, (a2) vssra.vi v0, v0, 1 vsmul.vx v0, v0, t1 vssra.vi v0, v0, 4 vmv.v.v v1, v0 vmv.v.v v2, v0 vmv.v.v v3, v0 vmv.v.v v4, v0 vmv.v.v v5, v0 vmv.v.v v6, v0 vmv.v.v v7, v0 j itx_8x8_end .endif endfunc .endm def_fn_8x8 dct, dct def_fn_8x8 identity, identity def_fn_8x8 dct, adst def_fn_8x8 dct, flipadst def_fn_8x8 dct, identity def_fn_8x8 adst, dct def_fn_8x8 adst, adst def_fn_8x8 adst, flipadst def_fn_8x8 flipadst, dct def_fn_8x8 flipadst, adst def_fn_8x8 flipadst, flipadst def_fn_8x8 identity, dct def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 
identity, adst def_fn_8x8 identity, flipadst function inv_txfm_add_4x8_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma vle16.v v0, (a2) addi t0, a2, 16 vle16.v v1, (t0) addi t0, t0, 16 vle16.v v2, (t0) addi t0, t0, 16 vle16.v v3, (t0) li t1, 2896*8 .irp i, 0, 1, 2, 3 vsmul.vx v\i, v\i, t1 .endr jalr t0, a4 vsseg4e16.v v0, (a2) vsetivli zero, 4, e16, mf2, ta, ma vmv.v.x v8, zero vle16.v v0, (a2) vse16.v v8, (a2) .irp i, 1, 2, 3, 4, 5, 6, 7 addi a2, a2, 8 vle16.v v\i, (a2) vse16.v v8, (a2) .endr jalr t0, a5 .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vssra.vi v\i, v\i, 4 .endr vsetvli zero, zero, e8, mf4, ta, ma vle8.v v8, (a0) add t0, a0, a1 vle8.v v9, (t0) .irp i, 10, 11, 12, 13, 14, 15 add t0, t0, a1 vle8.v v\i, (t0) .endr vwaddu.wv v0, v0, v8 vwaddu.wv v1, v1, v9 vwaddu.wv v2, v2, v10 vwaddu.wv v3, v3, v11 vwaddu.wv v4, v4, v12 vwaddu.wv v5, v5, v13 vwaddu.wv v6, v6, v14 vwaddu.wv v7, v7, v15 vsetvli zero, zero, e16, mf2, ta, ma .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vmax.vx v\i, v\i, zero .endr vsetvli zero, zero, e8, mf4, ta, ma vnclipu.wi v8, v0, 0 vnclipu.wi v9, v1, 0 vnclipu.wi v10, v2, 0 vnclipu.wi v11, v3, 0 vnclipu.wi v12, v4, 0 vnclipu.wi v13, v5, 0 vnclipu.wi v14, v6, 0 vnclipu.wi v15, v7, 0 vse8.v v8, (a0) .irp i, 9, 10, 11, 12, 13, 14, 15 add a0, a0, a1 vse8.v v\i, (a0) .endr ret endfunc function inv_txfm_add_8x4_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 4, e16, mf2, ta, ma vle16.v v0, (a2) addi t0, a2, 8 vle16.v v1, (t0) .irp i, 2, 3, 4, 5, 6, 7 addi t0, t0, 8 vle16.v v\i, (t0) .endr li t1, 2896*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vsmul.vx v\i, v\i, t1 .endr jalr t0, a4 vsseg8e16.v v0, (a2) vsetivli zero, 8, e16, m1, ta, ma vmv.v.x v4, zero vle16.v v0, (a2) vse16.v v4, (a2) .irp i, 1, 2, 3 addi a2, a2, 16 vle16.v v\i, (a2) vse16.v v4, (a2) .endr jalr t0, a5 vssra.vi v0, v0, 4 vssra.vi v1, v1, 4 vssra.vi v2, v2, 4 vssra.vi v3, v3, 4 vsetvli zero, zero, e8, mf2, ta, ma vle8.v v4, (a0) add t0, a0, a1 vle8.v v5, (t0) add t0, t0, a1 vle8.v v6, 
(t0) add t0, t0, a1 vle8.v v7, (t0) vwaddu.wv v0, v0, v4 vwaddu.wv v1, v1, v5 vwaddu.wv v2, v2, v6 vwaddu.wv v3, v3, v7 vsetvli zero, zero, e16, m1, ta, ma vmax.vx v0, v0, zero vmax.vx v1, v1, zero vmax.vx v2, v2, zero vmax.vx v3, v3, zero vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v4, v0, 0 vnclipu.wi v5, v1, 0 vnclipu.wi v6, v2, 0 vnclipu.wi v7, v3, 0 vse8.v v4, (a0) add a0, a0, a1 vse8.v v5, (a0) add a0, a0, a1 vse8.v v6, (a0) add a0, a0, a1 vse8.v v7, (a0) ret endfunc /* Define symbols added in .if statement */ .equ dct, 1 .equ identity, 2 .equ adst, 3 .equ flipadst, 4 .macro def_fn_48 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 .if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst) la a4, inv_\txfm1\()_e16_x\w\()w_rvv .else la a4, inv_\txfm1\()_e16_x\w\()_rvv .endif .if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst) la a5, inv_\txfm2\()_e16_x\h\()w_rvv .else la a5, inv_\txfm2\()_e16_x\h\()_rvv .endif j inv_txfm_add_\w\()x\h\()_rvv endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct def_fn_48 \w, \h, identity, identity def_fn_48 \w, \h, dct, adst def_fn_48 \w, \h, dct, flipadst def_fn_48 \w, \h, dct, identity def_fn_48 \w, \h, adst, dct def_fn_48 \w, \h, adst, adst def_fn_48 \w, \h, adst, flipadst def_fn_48 \w, \h, flipadst, dct def_fn_48 \w, \h, flipadst, adst def_fn_48 \w, \h, flipadst, flipadst def_fn_48 \w, \h, identity, dct def_fn_48 \w, \h, adst, identity def_fn_48 \w, \h, flipadst, identity def_fn_48 \w, \h, identity, adst def_fn_48 \w, \h, identity, flipadst .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_identity_e16_x16_rvv, export=1, ext=v li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v16, v\i, t1 vsadd.vv v\i, v\i, v\i vsadd.vv v\i, v\i, v16 .endr jr t0 endfunc function inv_dct_e16_x16_rvv, export=1, ext=v idct_8 v0, v2, v4, v6, v8, v10, v12, v14 li t1, 401 li t2, 4076 li t3, 3166 li t4, 2598 vwmul.vx v30, v1, t2 neg t2, t2 vwmul.vx 
v16, v1, t1 vwmacc.vx v30, t1, v15 vwmacc.vx v16, t2, v15 vwmul.vx v28, v9, t4 neg t4, t4 vwmul.vx v18, v9, t3 vwmacc.vx v28, t3, v7 vwmacc.vx v18, t4, v7 li t1, 1931 li t2, 3612 li t3, 3920 li t4, 1189 vwmul.vx v26, v5, t2 neg t2, t2 vwmul.vx v20, v5, t1 vwmacc.vx v26, t1, v11 vwmacc.vx v20, t2, v11 vwmul.vx v24, v13, t4 neg t4, t4 vwmul.vx v22, v13, t3 vwmacc.vx v24, t3, v3 vwmacc.vx v22, t4, v3 li t2, 2896 li t3, 1567 li t4, 3784 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v24, v24, 12 vnclip.wi v26, v26, 12 vnclip.wi v28, v28, 12 vnclip.wi v30, v30, 12 vssub.vv v3, v16, v18 vsadd.vv v16, v16, v18 vssub.vv v5, v22, v20 vsadd.vv v22, v22, v20 vssub.vv v11, v24, v26 vsadd.vv v24, v24, v26 vssub.vv v13, v30, v28 vsadd.vv v30, v30, v28 vwmul.vx v28, v13, t4 neg t4, t4 vwmul.vx v18, v13, t3 vwmul.vx v26, v11, t3 vwmacc.vx v28, t3, v3 neg t3, t3 vwmul.vx v20, v11, t4 vwmacc.vx v18, t4, v3 vwmacc.vx v20, t3, v5 vwmacc.vx v26, t4, v5 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v26, v26, 12 vnclip.wi v28, v28, 12 vssub.vv v5, v18, v20 vsadd.vv v18, v18, v20 vssub.vv v11, v28, v26 vsadd.vv v28, v28, v26 vssub.vv v7, v16, v22 vsadd.vv v16, v16, v22 vssub.vv v9, v30, v24 vsadd.vv v30, v30, v24 vwmul.vx v20, v11, t2 vwmul.vx v22, v9, t2 vwmul.vx v24, v9, t2 vwmul.vx v26, v11, t2 vwmacc.vx v24, t2, v7 vwmacc.vx v26, t2, v5 neg t2, t2 vwmacc.vx v20, t2, v5 vwmacc.vx v22, t2, v7 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v24, v24, 12 vnclip.wi v26, v26, 12 vssub.vv v15, v0, v30 vsadd.vv v0, v0, v30 vssub.vv v17, v2, v28 vsadd.vv v1, v2, v28 vssub.vv v13, v4, v26 vsadd.vv v2, v4, v26 vssub.vv v19, v6, v24 vsadd.vv v3, v6, v24 vssub.vv v11, v8, v22 vsadd.vv v4, v8, v22 vsadd.vv v5, v10, v20 vssub.vv v10, v10, v20 vssub.vv v9, v12, v18 vsadd.vv v6, v12, v18 vssub.vv v8, v14, v16 vsadd.vv v7, v14, v16 vmv.v.v v14, v17 vmv.v.v v12, v19 jr t0 endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, 
o9, o10, o11, o12, o13, o14, o15 li t1, 4091 li t2, 201 li t3, 3973 li t4, 995 vwmul.vx v16, v15, t1 neg t1, t1 vwmul.vx v18, v15, t2 vwmacc.vx v16, t2, v0 vwmacc.vx v18, t1, v0 vwmul.vx v20, v13, t3 neg t3, t3 vwmul.vx v22, v13, t4 vwmacc.vx v20, t4, v2 vwmacc.vx v22, t3, v2 li t1, 3703 li t2, 1751 li t3, 3290 li t4, 2440 vwmul.vx v24, v11, t1 neg t1, t1 vwmul.vx v26, v11, t2 vwmacc.vx v24, t2, v4 vwmacc.vx v26, t1, v4 vwmul.vx v28, v9, t3 neg t3, t3 vwmul.vx v30, v9, t4 vwmacc.vx v28, t4, v6 vwmacc.vx v30, t3, v6 vnclip.wi v0, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v2, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v4, v24, 12 vnclip.wi v26, v26, 12 vnclip.wi v6, v28, 12 vnclip.wi v30, v30, 12 li t1, 2751 li t2, 3035 li t3, 2106 li t4, 3513 vwmul.vx v16, v7, t1 neg t1, t1 vwmul.vx v20, v7, t2 vwmacc.vx v16, t2, v8 vwmacc.vx v20, t1, v8 vwmul.vx v24, v5, t3 neg t3, t3 vwmul.vx v28, v5, t4 vwmacc.vx v24, t4, v10 vwmacc.vx v28, t3, v10 vnclip.wi v16, v16, 12 vnclip.wi v9, v20, 12 vnclip.wi v24, v24, 12 vnclip.wi v11, v28, 12 vssub.vv v8, v0, v16 vsadd.vv v0, v0, v16 vssub.vv v10, v2, v24 vsadd.vv v2, v2, v24 li t1, 1380 li t2, 3857 li t3, 601 li t4, 4052 vwmul.vx v16, v3, t1 neg t1, t1 vwmul.vx v20, v3, t2 vwmacc.vx v16, t2, v12 vwmacc.vx v20, t1, v12 vwmul.vx v24, v1, t3 neg t3, t3 vwmul.vx v28, v1, t4 vwmacc.vx v24, t4, v14 vwmacc.vx v28, t3, v14 vnclip.wi v16, v16, 12 vnclip.wi v13, v20, 12 vnclip.wi v24, v24, 12 vnclip.wi v15, v28, 12 vssub.vv v12, v4, v16 vsadd.vv v16, v4, v16 vssub.vv v14, v6, v24 vsadd.vv v20, v6, v24 vsadd.vv v1, v18, v9 vssub.vv v9, v18, v9 vsadd.vv v3, v22, v11 vssub.vv v11, v22, v11 vsadd.vv v18, v26, v13 vssub.vv v13, v26, v13 vsadd.vv v22, v30, v15 vssub.vv v15, v30, v15 vssub.vv v4, v0, v16 vsadd.vv v0, v0, v16 vssub.vv v5, v1, v18 vsadd.vv v1, v1, v18 vssub.vv v6, v2, v20 vsadd.vv v2, v2, v20 vssub.vv v7, v3, v22 vsadd.vv v3, v3, v22 li t1, 799 li t2, 4017 li t3, 3406 li t4, 2276 vwmul.vx v16, v8, t2 vwmul.vx v18, v8, t1 vwmul.vx v20, 
v10, t4 vwmul.vx v22, v10, t3 vwmul.vx v24, v13, t2 vwmul.vx v26, v13, t1 vwmul.vx v28, v15, t4 vwmul.vx v30, v15, t3 vwmacc.vx v16, t1, v9 neg t1, t1 vwmacc.vx v20, t3, v11 neg t3, t3 vwmacc.vx v26, t2, v12 neg t2, t2 vwmacc.vx v30, t4, v14 neg t4, t4 vwmacc.vx v18, t2, v9 vwmacc.vx v22, t4, v11 vwmacc.vx v24, t1, v12 vwmacc.vx v28, t3, v14 li t2, 2896 li t3, 1567 li t4, 3784 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v24, v24, 12 vnclip.wi v26, v26, 12 vnclip.wi v28, v28, 12 vnclip.wi v30, v30, 12 vsadd.vv v8, v16, v24 vsadd.vv v9, v18, v26 vsadd.vv v10, v20, v28 vsadd.vv v11, v22, v30 vssub.vv v12, v16, v24 vssub.vv v13, v18, v26 vssub.vv v14, v20, v28 vssub.vv v15, v22, v30 vwmul.vx v16, v4, t4 vwmul.vx v18, v4, t3 vwmul.vx v20, v7, t4 vwmul.vx v22, v7, t3 vwmul.vx v24, v12, t4 vwmul.vx v26, v12, t3 vwmul.vx v28, v15, t4 vwmul.vx v30, v15, t3 vwmacc.vx v16, t3, v5 vwmacc.vx v22, t4, v6 vwmacc.vx v24, t3, v13 neg t3, t3 vwmacc.vx v30, t4, v14 neg t4, t4 vwmacc.vx v20, t3, v6 vwmacc.vx v28, t3, v14 vwmacc.vx v18, t4, v5 vwmacc.vx v26, t4, v13 vnclip.wi v16, v16, 12 vnclip.wi v18, v18, 12 vnclip.wi v20, v20, 12 vnclip.wi v22, v22, 12 vnclip.wi v24, v24, 12 vnclip.wi v26, v26, 12 vnclip.wi v28, v28, 12 vnclip.wi v30, v30, 12 .ifc \o0, v0 vsadd.vv \o14, v9, v11 vssub.vv v11, v9, v11 vssub.vv v9, v1, v3 vsadd.vv \o15, v1, v3 vsadd.vv \o1, v8, v10 vssub.vv v10, v8, v10 vssub.vv v8, v0, v2 vsadd.vv \o0, v0, v2 .else vsadd.vv \o1, v8, v10 vssub.vv v10, v8, v10 vssub.vv v8, v0, v2 vsadd.vv \o0, v0, v2 vsadd.vv v2, v9, v11 vssub.vv v11, v9, v11 vssub.vv v9, v1, v3 vsadd.vv \o15, v1, v3 vmv.v.v \o14, v2 .endif vsadd.vv \o3, v16, v20 vssub.vv v6, v16, v20 vsadd.vv \o12, v18, v22 vssub.vv v7, v18, v22 vsadd.vv \o2, v24, v28 vssub.vv v24, v24, v28 vsadd.vv \o13, v26, v30 vssub.vv v26, v26, v30 neg t3, t2 vwmul.vx v28, v24, t2 vwmul.vx v30, v24, t2 vwmacc.vx v28, t2, v26 vwmacc.vx v30, t3, v26 vwmul.vx v24, v10, t2 
vwmul.vx v26, v10, t2 vwmacc.vx v24, t2, v11 vwmacc.vx v26, t3, v11 vwmul.vx v20, v6, t2 vwmul.vx v22, v6, t2 vwmacc.vx v20, t2, v7 vwmacc.vx v22, t3, v7 vwmul.vx v16, v8, t2 vwmul.vx v18, v8, t2 vwmacc.vx v16, t2, v9 vwmacc.vx v18, t3, v9 vnclip.wi \o7, v16, 12 vnclip.wi \o8, v18, 12 vnclip.wi \o4, v20, 12 vnclip.wi \o11, v22, 12 vnclip.wi \o6, v24, 12 vnclip.wi \o9, v26, 12 vnclip.wi \o5, v28, 12 vnclip.wi \o10, v30, 12 vmv.v.x v16, zero vssub.vv \o1, v16, \o1 vssub.vv \o3, v16, \o3 vssub.vv \o5, v16, \o5 vssub.vv \o7, v16, \o7 vssub.vv \o9, v16, \o9 vssub.vv \o11, v16, \o11 vssub.vv \o13, v16, \o13 vssub.vv \o15, v16, \o15 .endm function inv_adst_e16_x16_rvv, export=1, ext=v iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 jr t0 endfunc function inv_flipadst_e16_x16_rvv, export=1, ext=v iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 jr t0 endfunc .macro def_horz_16 variant function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v vmv.v.x v16, zero vle16.v v0, (t4) vse16.v v16, (t4) .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add t4, t4, t6 vle16.v v\i, (t4) vse16.v v16, (t4) .endr .ifc \variant, _identity li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v16, v\i, t1 vsra.vi v16, v16, 1 vaadd.vv v\i, v\i, v16 .endr j L(horz_16x8_epilog) .else jalr t0, a4 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 2 .endr L(horz_16x8_epilog): vsse16.v v0, (t5), t6 .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 addi t5, t5, 2 vsse16.v v\i, (t5), t6 .endr jr a7 .endif endfunc .endm def_horz_16 _identity def_horz_16 function inv_txfm_add_vert_8x16_rvv, export=1, ext=v vsetivli zero, 8, e16, m1, ta, ma vle16.v v0, (t4) .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add t4, t4, t6 vle16.v v\i, (t4) .endr jalr t0, a5 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 4 .endr 
vsetivli zero, 8, e8, mf2, ta, ma vle8.v v16, (t5) add t0, t5, a1 vle8.v v17, (t0) .irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add t0, t0, a1 vle8.v v\i, (t0) .endr vwaddu.wv v0, v0, v16 vwaddu.wv v1, v1, v17 vwaddu.wv v2, v2, v18 vwaddu.wv v3, v3, v19 vwaddu.wv v4, v4, v20 vwaddu.wv v5, v5, v21 vwaddu.wv v6, v6, v22 vwaddu.wv v7, v7, v23 vwaddu.wv v8, v8, v24 vwaddu.wv v9, v9, v25 vwaddu.wv v10, v10, v26 vwaddu.wv v11, v11, v27 vwaddu.wv v12, v12, v28 vwaddu.wv v13, v13, v29 vwaddu.wv v14, v14, v30 vwaddu.wv v15, v15, v31 vsetvli zero, zero, e16, m1, ta, ma .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vmax.vx v\i, v\i, zero .endr vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v16, v0, 0 vnclipu.wi v17, v1, 0 vnclipu.wi v18, v2, 0 vnclipu.wi v19, v3, 0 vnclipu.wi v20, v4, 0 vnclipu.wi v21, v5, 0 vnclipu.wi v22, v6, 0 vnclipu.wi v23, v7, 0 vnclipu.wi v24, v8, 0 vnclipu.wi v25, v9, 0 vnclipu.wi v26, v10, 0 vnclipu.wi v27, v11, 0 vnclipu.wi v28, v12, 0 vnclipu.wi v29, v13, 0 vnclipu.wi v30, v14, 0 vnclipu.wi v31, v15, 0 vse8.v v16, (t5) .irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add t5, t5, a1 vse8.v v\i, (t5) .endr jr a7 endfunc function inv_txfm_add_16x16_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma addi sp, sp, -16*32 .irp i, 8, 0 addi t4, a2, \i*2 addi t5, sp, \i*16*2 .if \i == 8 blt a3, a7, 1f .endif li t6, 16*2 jalr a7, a6 .if \i == 8 j 2f 1: li t1, 64 vsetvli zero, t1, e16, m8, ta, ma vmv.v.x v0, zero vse16.v v0, (t5) addi t5, t5, 128 vse16.v v0, (t5) vsetivli zero, 8, e16, m1, ta, ma 2: .endif .endr .irp i, 0, 8 addi t4, sp, \i*2 addi t5, a0, \i li t6, 16*2 jal a7, inv_txfm_add_vert_8x16_rvv .endr addi sp, sp, 16*32 ret endfunc .macro def_fn_16x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v .ifc \txfm1\()_\txfm2, dct_dct beqz a3, 1f .endif .ifc \txfm1, identity la a6, inv_txfm_horz_identity_16x8_rvv .else la a6, 
inv_txfm_horz_16x8_rvv la a4, inv_\txfm1\()_e16_x16_rvv .endif la a5, inv_\txfm2\()_e16_x16_rvv li a7, \eob_half j inv_txfm_add_16x16_rvv .ifc \txfm1\()_\txfm2, dct_dct 1: csrw vxrm, zero vsetivli zero, 16, e16, m2, ta, ma lh t2, (a2) li t3, 2896*8 li t4, 1<<14 li t5, 0xFFFF li t6, -0x10000 sh x0, (a2) mul t2, t2, t3 add t2, t2, t4 srai t2, t2, 15 ble t2, t5, 3f mv t2, t5 3: ble t6, t2, 4f mv t2, t6 4: addi t2, t2, 2 srai t2, t2, 2 mul t2, t2, t3 add t2, t2, t4 srai t2, t2, 15 ble t2, t5, 5f mv t2, t5 5: ble t6, t2, 6f mv t2, t6 6: addi t2, t2, 8 srai t2, t2, 4 vmv.v.x v24, t2 vsetvli zero, zero, e8, m1, ta, ma add t2, a1, a1 li t3, 16 2: add t0, a0, a1 vle8.v v16, (a0) vle8.v v17, (t0) vwaddu.wv v0, v24, v16 vwaddu.wv v2, v24, v17 addi t3, t3, -2 # loop counter vsetvli zero, zero, e16, m2, ta, ma .irp i, 0, 2 vmax.vx v\i, v\i, zero .endr vsetvli zero, zero, e8, m1, ta, ma vnclipu.wi v16, v0, 0 vnclipu.wi v17, v2, 0 add t0, a0, a1 vse8.v v16, (a0) add a0, a0, t2 vse8.v v17, (t0) bnez t3, 2b ret .endif endfunc .endm def_fn_16x16 dct, dct, 36 def_fn_16x16 identity, identity, 36 def_fn_16x16 dct, adst, 36 def_fn_16x16 dct, flipadst, 36 def_fn_16x16 dct, identity, 8 def_fn_16x16 adst, dct, 36 def_fn_16x16 adst, adst, 36 def_fn_16x16 adst, flipadst, 36 def_fn_16x16 flipadst, dct, 36 def_fn_16x16 flipadst, adst, 36 def_fn_16x16 flipadst, flipadst, 36 def_fn_16x16 identity, dct, 8 .macro def_fn_416_base variant function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma blt a3, a6, 1f addi t0, a2, 16 vle16.v v0, (t0) addi t0, t0, 32 vle16.v v1, (t0) addi t0, t0, 32 vle16.v v2, (t0) addi t0, t0, 32 vle16.v v3, (t0) .ifc \variant, identity_ li t1, (5793-4096)*8 vsmul.vx v8, v0, t1 vaadd.vv v4, v0, v8 vsmul.vx v8, v1, t1 vaadd.vv v5, v1, v8 vsmul.vx v8, v2, t1 vaadd.vv v6, v2, v8 vsmul.vx v8, v3, t1 vaadd.vv v7, v3, v8 .else jalr t0, a4 vssra.vi v4, v0, 1 vssra.vi v5, v1, 1 vssra.vi v6, v2, 1 vssra.vi v7, v3, 1 .endif j 2f 1: 
.irp i, 4, 5, 6, 7 vmv.v.x v\i, zero .endr 2: vle16.v v0, (a2) addi t0, a2, 32 vle16.v v1, (t0) addi t0, t0, 32 vle16.v v2, (t0) addi t0, t0, 32 vle16.v v3, (t0) .ifc \variant, identity_ li t1, (5793-4096)*8 .irp i, 0, 1, 2, 3 vsmul.vx v8, v\i, t1 vaadd.vv v\i, v\i, v8 .endr j L(itx_4x16_epilog) .else jalr t0, a4 vssra.vi v0, v0, 1 vssra.vi v1, v1, 1 vssra.vi v2, v2, 1 vssra.vi v3, v3, 1 L(itx_4x16_epilog): vsseg4e16.v v0, (a2) addi t0, a2, 64 vsseg4e16.v v4, (t0) vsetivli zero, 4, e16, mf2, ta, ma vmv.v.x v16, zero vle16.v v0, (a2) vse16.v v16, (a2) addi t0, a2, 8 vle16.v v1, (t0) vse16.v v16, (t0) .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 addi t0, t0, 8 vle16.v v\i, (t0) vse16.v v16, (t0) .endr jalr t0, a5 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 4 .endr vsetvli zero, zero, e8, mf4, ta, ma vle8.v v16, (a0) add t0, a0, a1 vle8.v v17, (t0) .irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add t0, t0, a1 vle8.v v\i, (t0) .endr vwaddu.wv v0, v0, v16 vwaddu.wv v1, v1, v17 vwaddu.wv v2, v2, v18 vwaddu.wv v3, v3, v19 vwaddu.wv v4, v4, v20 vwaddu.wv v5, v5, v21 vwaddu.wv v6, v6, v22 vwaddu.wv v7, v7, v23 vwaddu.wv v8, v8, v24 vwaddu.wv v9, v9, v25 vwaddu.wv v10, v10, v26 vwaddu.wv v11, v11, v27 vwaddu.wv v12, v12, v28 vwaddu.wv v13, v13, v29 vwaddu.wv v14, v14, v30 vwaddu.wv v15, v15, v31 vsetvli zero, zero, e16, mf2, ta, ma .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vmax.vx v\i, v\i, zero .endr vsetvli zero, zero, e8, mf4, ta, ma vnclipu.wi v16, v0, 0 vnclipu.wi v17, v1, 0 vnclipu.wi v18, v2, 0 vnclipu.wi v19, v3, 0 vnclipu.wi v20, v4, 0 vnclipu.wi v21, v5, 0 vnclipu.wi v22, v6, 0 vnclipu.wi v23, v7, 0 vnclipu.wi v24, v8, 0 vnclipu.wi v25, v9, 0 vnclipu.wi v26, v10, 0 vnclipu.wi v27, v11, 0 vnclipu.wi v28, v12, 0 vnclipu.wi v29, v13, 0 vnclipu.wi v30, v14, 0 vnclipu.wi v31, v15, 0 vse8.v v16, (a0) .irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 add a0, a0, a1 vse8.v 
v\i, (a0) .endr ret .endif endfunc function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 4, e16, mf2, ta, ma vle16.v v0, (a2) addi t0, a2, 8 vle16.v v1, (t0) .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 addi t0, t0, 8 vle16.v v\i, (t0) .endr .ifc \variant, identity_ li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v16, v\i, t1 vssra.vi v16, v16, 1 vsadd.vv v\i, v\i, v16 .endr j L(itx_16x4_epilog) .else jalr t0, a4 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 1 .endr L(itx_16x4_epilog): li t0, 32 vssseg8e16.v v0, (a2), t0 addi t1, a2, 16 vssseg8e16.v v8, (t1), t0 .irp j, 0, 8 vsetivli zero, 8, e16, m1, ta, ma vmv.v.x v4, zero addi t0, a2, \j*2 vle16.v v0, (t0) vse16.v v4, (t0) .irp i, 1, 2, 3 addi t0, t0, 32 vle16.v v\i, (t0) vse16.v v4, (t0) .endr jalr t0, a5 vssra.vi v0, v0, 4 vssra.vi v1, v1, 4 vssra.vi v2, v2, 4 vssra.vi v3, v3, 4 vsetvli zero, zero, e8, mf2, ta, ma addi t0, a0, \j vle8.v v4, (t0) add t0, t0, a1 vle8.v v5, (t0) add t0, t0, a1 vle8.v v6, (t0) add t0, t0, a1 vle8.v v7, (t0) vwaddu.wv v0, v0, v4 vwaddu.wv v1, v1, v5 vwaddu.wv v2, v2, v6 vwaddu.wv v3, v3, v7 vsetvli zero, zero, e16, m1, ta, ma vmax.vx v0, v0, zero vmax.vx v1, v1, zero vmax.vx v2, v2, zero vmax.vx v3, v3, zero vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v4, v0, 0 vnclipu.wi v5, v1, 0 vnclipu.wi v6, v2, 0 vnclipu.wi v7, v3, 0 addi t0, a0, \j vse8.v v4, (t0) add t0, t0, a1 vse8.v v5, (t0) add t0, t0, a1 vse8.v v6, (t0) add t0, t0, a1 vse8.v v7, (t0) .endr ret .endif endfunc .endm def_fn_416_base identity_ def_fn_416_base .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 .if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst) la a4, inv_\txfm1\()_e16_x\w\()w_rvv .elseif \txfm1 != identity la a4, inv_\txfm1\()_e16_x\w\()_rvv .endif .if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst) la a5, 
inv_\txfm2\()_e16_x\h\()w_rvv .else la a5, inv_\txfm2\()_e16_x\h\()_rvv .endif .if \w == 4 li a6, \eob_half .endif .ifc \txfm1, identity j inv_txfm_identity_add_\w\()x\h\()_rvv .else j inv_txfm_add_\w\()x\h\()_rvv .endif endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct, 29 def_fn_416 \w, \h, identity, identity, 29 def_fn_416 \w, \h, dct, adst, 29 def_fn_416 \w, \h, dct, flipadst, 29 def_fn_416 \w, \h, dct, identity, 8 def_fn_416 \w, \h, adst, dct, 29 def_fn_416 \w, \h, adst, adst, 29 def_fn_416 \w, \h, adst, flipadst, 29 def_fn_416 \w, \h, flipadst, dct, 29 def_fn_416 \w, \h, flipadst, adst, 29 def_fn_416 \w, \h, flipadst, flipadst, 29 def_fn_416 \w, \h, identity, dct, 32 def_fn_416 \w, \h, adst, identity, 8 def_fn_416 \w, \h, flipadst, identity, 8 def_fn_416 \w, \h, identity, adst, 32 def_fn_416 \w, \h, identity, flipadst, 32 .endm def_fns_416 4, 16 def_fns_416 16, 4 .macro def_fn_816_base variant function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma blt a3, a6, 1f vmv.v.x v16, zero addi t0, a2, 16 vle16.v v0, (t0) vse16.v v16, (t0) .irp i, 1, 2, 3, 4, 5, 6, 7 addi t0, t0, 32 vle16.v v\i, (t0) vse16.v v16, (t0) .endr li t1, 2896*8 .ifc \variant, identity_ vsmul.vx v8, v0, t1 vsmul.vx v9, v1, t1 vsmul.vx v10, v2, t1 vsmul.vx v11, v3, t1 vsmul.vx v12, v4, t1 vsmul.vx v13, v5, t1 vsmul.vx v14, v6, t1 vsmul.vx v15, v7, t1 .else .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vsmul.vx v\i, v\i, t1 .endr jalr t0, a4 vssra.vi v8, v0, 1 vssra.vi v9, v1, 1 vssra.vi v10, v2, 1 vssra.vi v11, v3, 1 vssra.vi v12, v4, 1 vssra.vi v13, v5, 1 vssra.vi v14, v6, 1 vssra.vi v15, v7, 1 .endif j 2f 1: .irp i, 8, 9, 10, 11, 12, 13, 14, 15 vmv.v.x v\i, zero .endr 2: vmv.v.x v16, zero vle16.v v0, (a2) vse16.v v16, (a2) addi t0, a2, 32 vle16.v v1, (t0) vse16.v v16, (t0) .irp i, 2, 3, 4, 5, 6, 7 addi t0, t0, 32 vle16.v v\i, (t0) vse16.v v16, (t0) .endr li t1, 2896*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vsmul.vx v\i, v\i, t1 .endr .ifc 
\variant, identity_ j L(itx_8x16_epilog) .else jalr t0, a4 .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vssra.vi v\i, v\i, 1 .endr L(itx_8x16_epilog): addi t4, sp, -8*32 vsseg8e16.v v0, (t4) addi t0, t4, 8*16 vsseg8e16.v v8, (t0) mv t5, a0 li t6, 16 jal a7, inv_txfm_add_vert_8x16_rvv ret .endif endfunc function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v csrw vxrm, zero vsetivli zero, 8, e16, m1, ta, ma vle16.v v0, (a2) addi t0, a2, 16 vle16.v v1, (t0) .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 addi t0, t0, 16 vle16.v v\i, (t0) .endr li t1, 2896*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v\i, v\i, t1 .endr .ifc \variant, identity_ li t1, 2*(5793-4096)*8 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vsmul.vx v16, v\i, t1 vssra.vi v16, v16, 1 vsadd.vv v\i, v\i, v16 .endr j L(itx_16x8_epilog) .else jalr t0, a4 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 vssra.vi v\i, v\i, 1 .endr L(itx_16x8_epilog): li t0, 32 vssseg8e16.v v0, (a2), t0 addi t1, a2, 16 vssseg8e16.v v8, (t1), t0 .irp j, 0, 8 vsetivli zero, 8, e16, m1, ta, ma vmv.v.x v8, zero addi t0, a2, \j*2 vle16.v v0, (t0) vse16.v v8, (t0) .irp i, 1, 2, 3, 4, 5, 6, 7 addi t0, t0, 32 vle16.v v\i, (t0) vse16.v v8, (t0) .endr jalr t0, a5 .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vssra.vi v\i, v\i, 4 .endr vsetvli zero, zero, e8, mf2, ta, ma addi t0, a0, \j vle8.v v8, (t0) .irp i, 9, 10, 11, 12, 13, 14, 15 add t0, t0, a1 vle8.v v\i, (t0) .endr vwaddu.wv v0, v0, v8 vwaddu.wv v1, v1, v9 vwaddu.wv v2, v2, v10 vwaddu.wv v3, v3, v11 vwaddu.wv v4, v4, v12 vwaddu.wv v5, v5, v13 vwaddu.wv v6, v6, v14 vwaddu.wv v7, v7, v15 vsetvli zero, zero, e16, m1, ta, ma .irp i, 0, 1, 2, 3, 4, 5, 6, 7 vmax.vx v\i, v\i, zero .endr vsetvli zero, zero, e8, mf2, ta, ma vnclipu.wi v8, v0, 0 vnclipu.wi v9, v1, 0 vnclipu.wi v10, v2, 0 vnclipu.wi v11, v3, 0 vnclipu.wi v12, v4, 0 vnclipu.wi v13, v5, 0 vnclipu.wi v14, v6, 0 vnclipu.wi v15, v7, 0 addi t0, a0, \j vse8.v v8, (t0) .irp i, 9, 10, 11, 12, 
13, 14, 15 add t0, t0, a1 vse8.v v\i, (t0) .endr .endr ret .endif endfunc .endm def_fn_816_base identity_ def_fn_816_base .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1 .ifnc \txfm1, identity la a4, inv_\txfm1\()_e16_x\w\()_rvv .endif la a5, inv_\txfm2\()_e16_x\h\()_rvv .if \w == 8 li a6, \eob_half .endif .ifc \txfm1, identity j inv_txfm_identity_add_\w\()x\h\()_rvv .else j inv_txfm_add_\w\()x\h\()_rvv .endif endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct, 43 def_fn_816 \w, \h, identity, identity, 43 def_fn_816 \w, \h, dct, adst, 43 def_fn_816 \w, \h, dct, flipadst, 43 def_fn_816 \w, \h, dct, identity, 8 def_fn_816 \w, \h, adst, dct, 43 def_fn_816 \w, \h, adst, adst, 43 def_fn_816 \w, \h, adst, flipadst, 43 def_fn_816 \w, \h, flipadst, dct, 43 def_fn_816 \w, \h, flipadst, adst, 43 def_fn_816 \w, \h, flipadst, flipadst, 43 def_fn_816 \w, \h, identity, dct, 64 def_fn_816 \w, \h, adst, identity, 8 def_fn_816 \w, \h, flipadst, identity, 8 def_fn_816 \w, \h, identity, adst, 64 def_fn_816 \w, \h, identity, flipadst, 64 .endm def_fns_816 8, 16 def_fns_816 16, 8 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/mc.S000066400000000000000000000246621517466257200227470ustar00rootroot00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// 8 bpc blend with a per-pixel mask, two rows per loop iteration:
//   dst = (src2 * m + dst * (64 - m)) >> 6, rounded (vxrm = 0).
// Register use as read from the code (the C prototype is not visible in this
// file -- TODO confirm the parameter names):
//   a0  dst pointer, read and written
//   a1  dst stride in bytes
//   a2  second source, rows a3 bytes apart
//   a3  width; used as AVL and as the row pitch of a2 and a5
//   a4  row count; the loop subtracts 2 and exits on zero, so an even count
//       is assumed
//   a5  mask bytes, rows a3 bytes apart
// vtype is built arithmetically: 0xc0 is TA|MA, the SEW bits stay 0 (e8) and
// the low three bits (the LMUL code) are derived from ctz(width).
// The _vl256 entry starts one LMUL step lower -- presumably for VLEN >= 256,
// going by its name; confirm against the dispatch code.
function blend_vl256_8bpc_rvv, export=1, ext=zbb
    ctz t0, a3
    addi t0, t0, 0xc3
    j L(blend_epilog)
endfunc

function blend_8bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_epilog):
    csrw vxrm, zero
    // drop any carry out of the 3-bit LMUL field, keep TA|MA
    andi t0, t0, 0xc7
    vsetvl zero, a3, t0
    li t1, 64
1:
    addi a4, a4, -2
    vle8.v v4, (a2)
    add a2, a2, a3
    vle8.v v6, (a2)
    add a2, a2, a3
    vle8.v v8, (a5)
    add a5, a5, a3
    vle8.v v10, (a5)
    add a5, a5, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v2, (t0)
    // 16-bit accumulators: src2 * m ...
    vwmulu.vv v16, v4, v8
    vwmulu.vv v20, v6, v10
    // ... m becomes 64 - m ...
    vrsub.vx v8, v8, t1
    vrsub.vx v10, v10, t1
    // ... plus dst * (64 - m)
    vwmaccu.vv v16, v0, v8
    vwmaccu.vv v20, v2, v10
    vnclipu.wi v0, v16, 6
    vnclipu.wi v2, v20, 6
    vse8.v v0, (a0)
    vse8.v v2, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc

// 8 bpc blend with one weight per row. The weights are read as scalars from
// dav2d_obmc_masks + a4 (a4 = row count on entry); only a4 - a4 / 4 rows are
// processed. Other registers are used as in blend_8bpc_rvv (a0 dst, a1 dst
// stride, a2 second source with row pitch a3, a3 width); a5-a7 are scratch in
// the wide path.
// The _vl256 entry skips the width > 64 test and jumps straight to the common
// code with its own LMUL selection.
function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
    srai t0, a3, 2
    li t2, 64
    ctz t0, t0
    addi t0, t0, 0xc5
    j L(blend_h_epilog)
endfunc

function blend_h_8bpc_rvv, export=1, ext="v,zbb"
    li t2, 64
    // widths above 64 are processed as two 64-column passes, see 128: below
    bgt a3, t2, 128f
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_h_epilog):
    csrw vxrm, zero
    andi t0, t0, 0xc7
    vsetvl zero, a3, t0
    la t1, dav2d_obmc_masks
    srai t0, a4, 2
    add t1, t1, a4
    sub a4, a4, t0
0:
    // the row loop returns through t5 so the wide path can call it with jal
    mv t5, ra
1:
    addi a4, a4, -2
    lbu t3, (t1)
    addi t1, t1, 1
    lbu t4, (t1)
    addi t1, t1, 1
    vle8.v v8, (a2)
    add a2, a2, a3
    vle8.v v12, (a2)
    add a2, a2, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v4, (t0)
    vwmulu.vx v16, v8, t3
    vwmulu.vx v24, v12, t4
    // t2 holds 64: turn each weight m into 64 - m for the dst term
    sub t3, t2, t3
    sub t4, t2, t4
    vwmaccu.vx v16, t3, v0
    vwmaccu.vx v24, t4, v4
    vnclipu.wi v0, v16, 6
    vnclipu.wi v4, v24, 6
    vse8.v v0, (a0)
    vse8.v v4, (t0)
    add a0, t0, a1
    bgtz a4, 1b
    jr t5
128:
    csrw vxrm, zero
    // fixed vl of 64 (t2) bytes per pass
    vsetvli zero, t2, e8, m4, ta, ma
    la t1, dav2d_obmc_masks
    srai t0, a4, 2
    add t1, t1, a4
    sub a4, a4, t0
    // save dst, source and row count for the second pass
    mv a5, a0
    mv a6, a2
    mv a7, a4
    // first pass: columns 0..63; returns here via jr t5
    jal t5, 1b
    // rewind the weight pointer (a4 holds the loop residue here, then the
    // saved count) and move both pointers 64 columns to the right
    add t1, t1, a4
    add a0, a5, t2
    add a2, a6, t2
    mv a4, a7
    sub t1, t1, a4
    // second pass: returns to the caller because 0: reloads t5 from ra
    j 0b
endfunc

// 8 bpc blend with one weight per column. The weight vector is loaded once
// from dav2d_obmc_masks + a3 (a3 = width) and reused for every row. vl is set
// to a3 - a3 / 4, so only the leftmost three quarters of each row are
// written. Other registers are used as in blend_8bpc_rvv; the row loop
// again steps by two and exits on a4 == 0.
function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
    srai t0, a3, 2
    ctz t0, t0
    addi t0, t0, 0xc5
    j L(blend_v_epilog)
endfunc

function blend_v_8bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc4
L(blend_v_epilog):
    andi t0, t0, 0xc7
    srai t1, a3, 2
    sub t1, a3, t1
    vsetvl zero, t1, t0
    csrw vxrm, zero
    la t1, dav2d_obmc_masks
    add t1, t1, a3
    vle8.v v8, (t1)
    li t0, 64
    // v8 = m, v10 = 64 - m
    vrsub.vx v10, v8, t0
1:
    addi a4, a4, -2
    vle8.v v4, (a2)
    add a2, a2, a3
    vle8.v v6, (a2)
    add a2, a2, a3
    vle8.v v0, (a0)
    add t0, a0, a1
    vle8.v v2, (t0)
    vwmulu.vv v12, v4, v8
    vwmulu.vv v16, v6, v8
    vwmaccu.vv v12, v0, v10
    vwmaccu.vv v16, v2, v10
    vnclipu.wi v0, v12, 6
    vnclipu.wi v2, v16, 6
    vse8.v v0, (a0)
    vse8.v v2, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc

// Per-type combine steps used by bidir_fn. Each takes two vectors of 16-bit
// values (\va, \vb) plus a mask vector (\vm, only used by mask) and leaves a
// 16-bit result in \va; bidir_fn does the final clamp and narrowing.
//   avg:   \va + \vb
//   w_avg: (\va * a6 + \vb * a7) >> 8, a7 = 16 - a6 is set up by bidir_fn
//   mask:  (\va * \vm + \vb * (64 - \vm)) >> 10, a7 = 64 is set up by
//          bidir_fn; \vm is overwritten with 64 - \vm
// The narrowing shifts round according to vxrm. v24 and the rest of its
// widened register group are scratch.
.macro avg va, vb, vm
    vadd.vv \va, \va, \vb
.endm

.macro w_avg va, vb, vm
    vwmul.vx v24, \va, a6
    vwmacc.vx v24, a7, \vb
    vnclip.wi \va, v24, 8
.endm

.macro mask va, vb, vm
    vwmul.vv v24, \va, \vm
    vrsub.vx \vm, \vm, a7
    vwmacc.vv v24, \vb, \vm
    vnclip.wi \va, v24, 10
.endm

// bidir_fn type, shift: emits \type\()_8bpc_rvv, which combines two buffers
// of 16-bit values into 8-bit pixels using the \type macro above, then clamps
// negatives to zero and narrows with a rounding right shift by \shift.
// Register use as read from the code (C prototype not visible here -- TODO
// confirm the parameter names):
//   a0  dst pointer
//   a1  dst stride in bytes
//   a2, a3  the two 16-bit sources, read contiguously
//   a4  width
//   a5  height
//   a6  weight (w_avg) or pointer to mask bytes (mask)
// Three code paths:
//   4:  width == 4. Several rows are processed per pass and written with a
//       strided 32-bit store. NOTE(review): the dst advance uses
//       a1 << ctz(rows), so the per-pass row count is assumed to be a power
//       of two.
//   1:  log2(width / VLENB) <= 1. One register group holds a whole row;
//       four rows per iteration (two per .rept body).
//   2:  wider rows, stripmined in chunks of 2 * vl pixels.
.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
    li a7, 16
    sub a7, a7, a6
.endif
.ifc \type, mask
    li a7, 64
.endif
    li t0, 4
    csrw vxrm, zero
    beq t0, a4, 4f
    csrr t0, vlenb
    ctz t1, a4
    ctz t0, t0
    li t2, 1
    // t0 = log2(width) - log2(VLENB)
    sub t0, t1, t0
    li t4, -3
    bgt t0, t2, 2f
    // clamp to the smallest fractional LMUL, then build two vtype values:
    // t1 for the e8 result and t0 (one LMUL step larger) for the e16 inputs
    max t0, t0, t4
    andi t1, t0, 0x7
    addi t0, t1, 1 # may overflow into E16 bit
    ori t0, t0, MA | TA | E16
    ori t1, t1, MA | TA | E8
1:
    addi a5, a5, -4
.rept 2
    vsetvl zero, a4, t0
    sh1add t3, a4, a2
    vle16.v v0, (a2)
    sh1add a2, a4, t3
    vle16.v v4, (t3)
    sh1add t3, a4, a3
    vle16.v v8, (a3)
    sh1add a3, a4, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, a4, a6
    vle8.v v24, (a6)
    add a6, a4, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvl zero, zero, t1
    vnclipu.wi v0, v8, \shift
    vnclipu.wi v2, v12, \shift
    add t3, a1, a0
    vse8.v v0, (a0)
    add a0, a1, t3
    vse8.v v2, (t3)
.endr
    bnez a5, 1b
    ret
2:
    // per row: t0 = dst cursor for this row, t4 = -width (counts up to zero),
    // a0 already points at the next row
    mv t0, a0
    neg t4, a4
    add a0, a1, a0
    addi a5, a5, -1
20:
    vsetvli t2, a4, e16, m4, ta, ma
    // two groups of t2 pixels are handled per inner iteration
    sh1add t4, t2, t4
    sh1add t3, t2, a2
    vle16.v v0, (a2)
    sh1add a2, t2, t3
    vle16.v v4, (t3)
    sh1add t3, t2, a3
    vle16.v v8, (a3)
    sh1add a3, t2, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, t2, a6
    vle8.v v24, (a6)
    add a6, t2, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8, \shift
    vnclipu.wi v2, v12, \shift
    add t3, t2, t0
    vse8.v v0, (t0)
    add t0, t2, t3
    vse8.v v2, (t3)
    bnez t4, 20b
    bnez a5, 2b
    ret
4:
    // AVL = 4 * remaining rows
    slli t0, a5, 2
    vsetvli t1, t0, e16, m4, ta, ma
    vle16.v v0, (a2)
    sh1add a2, t1, a2
    vle16.v v4, (a3)
    sh1add a3, t1, a3
.ifc \type, mask
    vle8.v v16, (a6)
    add a6, t1, a6
    vzext.vf2 v8, v16
.endif
    \type v0, v4, v8
    vmax.vx v8, v0, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8, \shift
    // reinterpret the packed bytes as one 32-bit element per row; t1 = rows
    // written by this pass
    vsetvli t1, a5, e32, m2, ta, ma
    vsse32.v v0, (a0), a1
    ctz t0, t1
    sub a5, a5, t1
    sll t0, a1, t0
    add a0, t0, a0
    bnez a5, 4b
    ret
endfunc
.endm

bidir_fn avg,   5
bidir_fn w_avg, 0
bidir_fn mask,  0

// 8x8 warp filter, 8 bpc output, in two separable 8-tap passes through a
// 15 x 8 buffer of 16-bit intermediates on the stack (2*15*8 bytes).
// Register use as read from the code (C prototype not visible here -- TODO
// confirm the parameter names):
//   a0  dst pointer, a1 dst stride in bytes
//   a2  src pointer, a3 src stride in bytes
//   a4  pointer to four 16-bit step values, read at offsets 0, 2, 4, 6
//   a5  horizontal filter position, advanced by the step at 2(a4) per row
//   a6  vertical filter position, advanced by the step at 6(a4) per row
// Filter selection per lane i: index = ((i * step + position) >> 10, rounded)
// + 64, scaled by 8 to a byte offset into dav2d_mc_warp_filter; the 8-field
// segment gather then puts tap k of every lane into v(2 + k).
function warp_8x8_8bpc_rvv, export=1, ext="v"
    csrw vxrm, zero
    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    // start 3 rows above and 3 columns left of src
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3
    li t0, 64
    // each row advances a2 by 8 through the tap loop; compensate in the stride
    addi a3, a3, -8
    li t1, 15
    la t2, dav2d_mc_warp_filter
    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    // v28 = lane index * horizontal per-column step (32-bit)
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1
    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma
    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma
    // 8 taps: signed coefficient (v14) times unsigned pixel (v16), 32-bit sum
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1
    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10
.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3
    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16
    bnez t1, 1b
    // vertical pass over the intermediate rows
    mv t5, sp
    li t1, 8
    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1
    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6
    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma
    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16
    vsext.vf2 v14, v\i
.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif
.endr
    // the tap loop moved t5 forward 8 rows; net advance is one row
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 11
    // clamp negatives, then saturate to 8 bits
    vmax.vx v10, v10, zero
    vsetvli zero, zero, e8, mf2, ta, ma
    vnclipu.wi v12, v10, 0
    vse8.v v12, (a0)
    add a0, a0, a1
    bnez t1, 2b
    addi sp, sp, 2*15*8
    ret
endfunc

// Same two-pass filter as warp_8x8_8bpc_rvv, but the vertical pass shifts by
// 7 instead of 11 and stores the 16-bit result directly, without clamping.
// dst advances by 2 * a1 bytes per row (sh1add), so a1 is presumably a stride
// in 16-bit elements here -- TODO confirm against the caller.
function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3
    li t0, 64
    addi a3, a3, -8
    li t1, 15
    la t2, dav2d_mc_warp_filter
    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1
    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma
    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1
    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10
.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3
    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16
    bnez t1, 1b
    mv t5, sp
    li t1, 8
    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1
    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6
    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma
    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16
    vsext.vf2 v14, v\i
.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif
.endr
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 7
    vse16.v v10, (a0)
    sh1add a0, a1, a0
    bnez t1, 2b
    addi sp, sp, 2*15*8
    ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/mc16.S000066400000000000000000000063671517466257200231200ustar00rootroot00000000000000/******************************************************************************
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2024, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

// 16 bpc blend with a per-pixel 8-bit mask, two rows per loop iteration:
//   dst = (src2 * m + dst * (64 - m)) >> 6, rounded (vxrm = 0).
// Register use as read from the code (the C prototype is not visible in this
// file -- TODO confirm the parameter names):
//   a0  dst pointer (16-bit pixels), read and written
//   a1  dst stride in bytes
//   a2  second source (16-bit), rows 2 * a3 bytes apart
//   a3  width in pixels; used as AVL and as the row pitch of the mask
//   a4  row count; the loop subtracts 2 and exits on zero, so an even count
//       is assumed
//   a5  mask bytes, rows a3 bytes apart
// vtype is built arithmetically: 0xc0 is TA|MA, the low three bits (the LMUL
// code) are derived from ctz(width), and the "ori 8" sets SEW to e16.
// The _vl256 entry starts one LMUL step lower -- presumably for VLEN >= 256,
// going by its name; confirm against the dispatch code.
function blend_vl256_16bpc_rvv, export=1, ext=zbb
    ctz t0, a3
    addi t0, t0, 0xc4
    j L(blend_epilog)
endfunc

function blend_16bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc5
L(blend_epilog):
    csrw vxrm, zero
    // drop any carry out of the 3-bit LMUL field, keep TA|MA
    andi t0, t0, 0xc7
    li t1, 64
    ori t0, t0, 8
    // a6 = source row pitch in bytes
    add a6, a3, a3
    vsetvl zero, a3, t0
1:
    addi a4, a4, -2
    vle8.v v24, (a5)
    add a5, a5, a3
    vle8.v v28, (a5)
    add a5, a5, a3
    vle16.v v8, (a2)
    add a2, a2, a6
    vle16.v v12, (a2)
    add a2, a2, a6
    // widen the mask bytes to 16 bits
    vzext.vf2 v16, v24
    vzext.vf2 v20, v28
    vle16.v v0, (a0)
    add t0, a0, a1
    vle16.v v4, (t0)
    // 32-bit accumulators: src2 * m; the second product reuses v8 as its
    // destination group
    vwmulu.vv v24, v8, v16
    vwmulu.vv v8, v12, v20
    // m becomes 64 - m
    vrsub.vx v16, v16, t1
    vrsub.vx v20, v20, t1
    vwmaccu.vv v24, v0, v16
    vwmaccu.vv v8, v4, v20
    vnclipu.wi v0, v24, 6
    vnclipu.wi v4, v8, 6
    vse16.v v0, (a0)
    vse16.v v4, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc

// 16 bpc blend with one weight per column. The weight bytes are loaded once
// from dav2d_obmc_masks + a3 (a3 = width in pixels) and widened to 16 bits.
// vl is set to a3 - a3 / 4, so only the leftmost three quarters of each row
// are written. a3 is then doubled and used as the source row pitch in bytes.
// Other registers are used as in blend_16bpc_rvv.
function blend_v_vl256_16bpc_rvv, export=1, ext=zbb
    srai t0, a3, 2
    ctz t0, t0
    addi t0, t0, 0xc6
    j L(blend_v_epilog)
endfunc

function blend_v_16bpc_rvv, export=1, ext="v,zbb"
    ctz t0, a3
    addi t0, t0, 0xc5
L(blend_v_epilog):
    andi t0, t0, 0xc7
    ori t0, t0, 8
    srai t1, a3, 2
    sub t1, a3, t1
    vsetvl zero, t1, t0
    csrw vxrm, zero
    la t1, dav2d_obmc_masks
    add t1, t1, a3
    vle8.v v20, (t1)
    li t0, 64
    // v16 = m, v20 = 64 - m
    vzext.vf2 v16, v20
    add a3, a3, a3
    vrsub.vx v20, v16, t0
1:
    addi a4, a4, -2
    vle16.v v8, (a2)
    add a2, a2, a3
    vle16.v v12, (a2)
    add a2, a2, a3
    vle16.v v0, (a0)
    add t0, a0, a1
    vle16.v v4, (t0)
    vwmulu.vv v24, v8, v16
    vwmulu.vv v8, v12, v16
    vwmaccu.vv v24, v0, v20
    vwmaccu.vv v8, v4, v20
    vnclipu.wi v0, v24, 6
    vnclipu.wi v4, v8, 6
    vse16.v v0, (a0)
    vse16.v v4, (t0)
    add a0, t0, a1
    bnez a4, 1b
    ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/64/pal.S000066400000000000000000000052731517466257200231210ustar00rootroot00000000000000/******************************************************************************
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "src/riscv/asm.S"

// pal_idx_finish_rvv: packs pairs of source bytes into single destination
// bytes (dst[i] = src[2i] + 16 * src[2i + 1]) and pads the result out to a
// larger rectangle.
//
// Register use as read from the code below (the C prototype is not visible in
// this file -- TODO confirm the parameter names):
//   a0  destination pointer, written sequentially
//   a1  source pointer, rows a2 bytes apart
//   a2  full width in source bytes; a2 / 2 bytes are written per output row
//   a3  full height in rows
//   a4  number of source bytes actually read from each row
//   a5  number of rows actually read
//
// Per row, the a4 / 2 packed bytes are followed by (a2 - a4) / 2 padding
// bytes, each the last source byte of the row multiplied by 17. After a5 rows
// the last written row is repeated for the remaining a3 - a5 rows.
// NOTE(review): the row-repeat loop stores two rows per iteration and exits
// on an exact zero, so a3 - a5 is assumed to be even.
function pal_idx_finish_rvv, export=1, ext="v,zba,zbb"
    csrw vxrm, zero
    // t0 = output bytes per row, a2 = source bytes to skip after each row,
    // t1 = packed bytes per row, t2 = row counter
    srl t0, a2, 1
    sub a2, a2, a4
    srl t1, a4, 1
    mv t2, a5
    csrr t6, vlenb
    li t4, -3
    // a6 = log2(output row bytes) - log2(VLENB): the LMUL exponent
    ctz a6, t0
    ctz t6, t6
    li a7, 16
    sub a6, a6, t6
    // GNU as gives << higher precedence than +, so this is (1 << 4) + 1 = 17;
    // multiplying a value below 16 by it copies the value into both nibbles
    li t6, 1<<4+1
    // a6 is never > 3 for VLEN >=128
    // that would've required stripmining with a6 set to 3
    max a6, a6, t4
    // t5 = load stride in bytes
    li t5, 2
    andi a6, a6, 7
    // t4 walks the odd source bytes
    addi t4, a1, 1
    // vtype: e8, LMUL code from a6, TA|MA
    ori a6, a6, 0xc0
1:
    // t3 = padding bytes for this row
    sub t3, t0, t1
    vsetvl zero, t1, a6
    vlse8.v v0, (a1), t5
    sh1add a1, t1, a1
    vlse8.v v8, (t4), t5
    sh1add t4, t1, t4
    // v0 = even byte + 16 * odd byte
    vmacc.vx v0, a7, v8
    vse8.v v0, (a0)
    add a0, a0, t1
    ble t3, zero, 4f
    // pad with the last source byte of the row
    lbu a4, -1(a1)
    mul a4, a4, t6
    vsetvl zero, t3, a6
    vmv.v.x v0, a4
    vse8.v v0, (a0)
    add a0, a0, t3
4:
    addi t2, t2, -1
    add a1, a1, a2
    add t4, t4, a2
    bnez t2, 1b
    // t1 = rows still to fill, t2 = start of the last written row
    sub t1, a3, a5
    sub t2, a0, t0
    ble t1, zero, 7f
    vsetvl zero, t0, a6
    vle8.v v0, (t2)
    add t2, a0, t0
5:
    addi t1, t1, -2
    vse8.v v0, (a0)
    vse8.v v0, (t2)
    sh1add a0, t0, a0
    sh1add t2, t0, t2
    bnez t1, 5b
7:
    ret
endfunc
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/asm.S000066400000000000000000000070611517466257200226710ustar00rootroot00000000000000/*
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_RISCV_ASM_S
#define DAV2D_SRC_RISCV_ASM_S

#include "config.h"

// Make PIC available whenever the compiler defines __PIC__ or __pic__.
#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif

// Default symbol prefix for everything defined through the macros below.
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav2d_
#endif

// Two-level paste so that macro arguments are expanded before ## is applied.
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

// EXTERN is the prefix put in front of exported symbol names; when PREFIX is
// defined an extra leading underscore is added.
#ifdef PREFIX
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#else
#define EXTERN PRIVATE_PREFIX
#endif

// arch ext[, more...]: emits ".option arch, +ext" for the first argument and
// recurses on the remaining ones until the list is empty.
.macro arch ext:req, more:vararg
    .option arch, +\ext
    .ifnb \more
        arch \more
    .endif
.endm

// function name[, export[, ext]]: opens a function in .text.
//   name    label to define
//   export  non-zero also defines the global, hidden EXTERN-prefixed label
//   ext     optional extension list enabled via "arch" for this function only
// It also defines a one-shot "endfunc" macro which records the symbol size
// (ELF only), restores the options saved by ".option push" and purges itself.
.macro function name, export=0, ext=
    .macro endfunc
#ifdef __ELF__
        .size   \name, . - \name
#endif
        .option pop
        .purgem endfunc
    .endm
        .text
        .option push
    .ifnb \ext
        arch \ext
    .endif
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .type   EXTERN\name, %function
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .else
#ifdef __ELF__
        .type \name, %function
#endif
    .endif
\name:
.endm

// const name[, export[, align]]: opens a read-only data object in the
// platform's constant section (.rdata on _WIN32, .const_data on __MACH__,
// .rodata otherwise) aligned with ".align align". With export non-zero the
// EXTERN-prefixed label is defined as well. The matching one-shot "endconst"
// macro records the symbol size (ELF only) and purges itself.
.macro const name, export=0, align=2
    .macro endconst
#ifdef __ELF__
        .size   \name, . - \name
#endif
        .purgem endconst
    .endm
#if defined(_WIN32)
        .section        .rdata
#elif !defined(__MACH__)
        .section        .rodata
#else
        .const_data
#endif
        .align          \align
    .if \export
        .global EXTERN\name
#ifdef __ELF__
        .hidden EXTERN\name
#elif defined(__MACH__)
        .private_extern EXTERN\name
#endif
EXTERN\name:
    .endif
\name:
.endm

// thread_local name[, align[, quads]]: reserves "quads" zeroed 64-bit words
// in .tbss under a hidden label. The helper macro end_thread_local is invoked
// right away, so the object is complete once this macro returns.
// NOTE(review): unlike endfunc/endconst, the .size here is not guarded by
// __ELF__ - confirm this macro is only used on ELF targets.
.macro thread_local name, align=3, quads=1
    .macro end_thread_local
        .size  \name, . - \name
        .purgem end_thread_local
    .endm
        .section        .tbss, "waT"
        .align          \align
        .hidden         \name
\name:
    .rept \quads
        .quad           0
    .endr
        end_thread_local
.endm

// Local label helper: L(x) expands to .Lx.
#define L(x) .L ## x

// Building blocks for a vtype value as passed to vsetvl/vsetvli. The shifts
// match the RVV vtype layout: vma is bit 7, vta is bit 6, vsew occupies
// bits 5:3 and vlmul bits 2:0 (MF2/MF4/MF8 are the fractional LMUL encodings).
#define MA (1 << 7)
#define TA (1 << 6)
#define E8 (0 << 3)
#define E16 (1 << 3)
#define E32 (2 << 3)
#define E64 (3 << 3)
#define M1 0
#define M2 1
#define M4 2
#define M8 3
#define MF2 7
#define MF4 6
#define MF8 5

#endif /* DAV2D_SRC_RISCV_ASM_S */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/cdef.h000066400000000000000000000033341517466257200230360ustar00rootroot00000000000000
#include "src/cpu.h"
#include "src/cdef.h"

/* RVV CDEF block filters, one per block size; defined outside this header. */
extern void BF(dav2d_cdef_filter_block_4x4, rvv)(pixel *dst, const ptrdiff_t dst_stride,
                                                 const pixel (*left)[2],
                                                 const pixel *const top,
                                                 const pixel *const bottom,
                                                 const int pri_strength,
                                                 const int sec_strength,
                                                 const int dir,
                                                 const int damping,
                                                 const enum CdefEdgeFlags edges
                                                 HIGHBD_DECL_SUFFIX);

extern void BF(dav2d_cdef_filter_block_4x8, rvv)(pixel *dst, const ptrdiff_t dst_stride,
                                                 const pixel (*left)[2],
                                                 const pixel *const top,
                                                 const pixel *const bottom,
                                                 const int pri_strength,
                                                 const int sec_strength,
                                                 const int dir,
                                                 const int damping,
                                                 const enum CdefEdgeFlags edges
                                                 HIGHBD_DECL_SUFFIX);

extern void BF(dav2d_cdef_filter_block_8x8, rvv)(pixel *dst, const ptrdiff_t dst_stride,
                                                 const pixel (*left)[2],
                                                 const pixel *const top,
                                                 const pixel *const bottom,
                                                 const int pri_strength,
                                                 const int sec_strength,
                                                 const int dir,
                                                 const int damping,
                                                 const enum CdefEdgeFlags edges
                                                 HIGHBD_DECL_SUFFIX);

/*
 * Installs the RVV CDEF filters into c->fb[] when the CPU flags report
 * DAV2D_RISCV_CPU_FLAG_V; otherwise c is left untouched.
 * fb[0], fb[1] and fb[2] receive the 8x8, 4x8 and 4x4 filters respectively.
 */
static ALWAYS_INLINE void cdef_dsp_init_riscv(Dav2dCdefDSPContext *const c) {
    const unsigned flags = dav2d_get_cpu_flags();

    if (!(flags & DAV2D_RISCV_CPU_FLAG_V)) return;

    // c->dir = BF(dav2d_cdef_dir, rvv);
    c->fb[0] = BF(dav2d_cdef_filter_block_8x8, rvv);
    c->fb[1] = BF(dav2d_cdef_filter_block_4x8, rvv);
    c->fb[2] = BF(dav2d_cdef_filter_block_4x4, rvv);
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/cpu.c000066400000000000000000000036731517466257200227250ustar00rootroot00000000000000
/*
 * Copyright © 2022, VideoLAN and dav2d authors
 * Copyright © 2022, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "config.h" #include "common/attributes.h" #include "src/cpu.h" #include "src/riscv/cpu.h" #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #include #define HWCAP_RVV (1 << ('v' - 'a')) #endif int dav2d_has_compliant_rvv(void); COLD unsigned dav2d_get_cpu_flags_riscv(void) { unsigned flags = dav2d_get_default_cpu_flags(); #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO unsigned long hw_cap = dav2d_getauxval(AT_HWCAP); flags |= (hw_cap & HWCAP_RVV) && dav2d_has_compliant_rvv() ? DAV2D_RISCV_CPU_FLAG_V : 0; #endif return flags; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/cpu.h000066400000000000000000000032331517466257200227220ustar00rootroot00000000000000/* * Copyright © 2022, VideoLAN and dav2d authors * Copyright © 2022, Nathan Egge * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_RISCV_CPU_H
#define DAV2D_SRC_RISCV_CPU_H

/* RISC-V CPU feature bits; the values are OR-ed into the CPU flags word. */
enum CpuFlags {
    /* Tested by the *_dsp_init_riscv() helpers before installing RVV code. */
    DAV2D_RISCV_CPU_FLAG_V = 1 << 0,
};

/* Returns the CPU flags word for RISC-V (a combination of enum CpuFlags). */
unsigned dav2d_get_cpu_flags_riscv(void);

/* NOTE(review): presumably returns the vlenb CSR, i.e. the vector register
 * length in bytes - confirm against the assembly implementation. */
int dav2d_get_vlenb(void);

/* dav2d_get_vlenb() multiplied by 8 (bytes -> bits under the assumption above). */
#define dav2d_get_vlen() (dav2d_get_vlenb()*8)

#endif /* DAV2D_SRC_RISCV_CPU_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/ipred.h000066400000000000000000000062761517466257200232500ustar00rootroot00000000000000
/*
 * Copyright © 2018, VideoLAN and dav2d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/ipred.h" decl_cfl_pred_fn(BF(dav2d_ipred_cfl, rvv)); decl_cfl_pred_fn(BF(dav2d_ipred_cfl_128, rvv)); decl_cfl_pred_fn(BF(dav2d_ipred_cfl_top, rvv)); decl_cfl_pred_fn(BF(dav2d_ipred_cfl_left, rvv)); decl_angular_ipred_fn(BF(dav2d_ipred_paeth, rvv)); decl_angular_ipred_fn(BF(dav2d_ipred_smooth, rvv)); decl_angular_ipred_fn(BF(dav2d_ipred_smooth_v, rvv)); decl_angular_ipred_fn(BF(dav2d_ipred_smooth_h, rvv)); decl_pal_pred_fn(BF(dav2d_pal_pred, rvv)); static ALWAYS_INLINE void intra_pred_dsp_init_riscv(Dav2dIntraPredDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 c->cfl_pred[DC_PRED ] = dav2d_ipred_cfl_8bpc_rvv; c->cfl_pred[DC_128_PRED ] = dav2d_ipred_cfl_128_8bpc_rvv; c->cfl_pred[TOP_DC_PRED ] = dav2d_ipred_cfl_top_8bpc_rvv; c->cfl_pred[LEFT_DC_PRED] = dav2d_ipred_cfl_left_8bpc_rvv; c->intra_pred[PAETH_PRED ] = dav2d_ipred_paeth_8bpc_rvv; c->intra_pred[SMOOTH_PRED ] = dav2d_ipred_smooth_8bpc_rvv; c->intra_pred[SMOOTH_V_PRED] = dav2d_ipred_smooth_v_8bpc_rvv; c->intra_pred[SMOOTH_H_PRED] = dav2d_ipred_smooth_h_8bpc_rvv; c->pal_pred = dav2d_pal_pred_8bpc_rvv; #elif BITDEPTH == 16 c->cfl_pred[DC_PRED ] = dav2d_ipred_cfl_16bpc_rvv; c->cfl_pred[DC_128_PRED ] = dav2d_ipred_cfl_128_16bpc_rvv; c->cfl_pred[TOP_DC_PRED ] = dav2d_ipred_cfl_top_16bpc_rvv; c->cfl_pred[LEFT_DC_PRED] = dav2d_ipred_cfl_left_16bpc_rvv; c->intra_pred[PAETH_PRED ] = 
dav2d_ipred_paeth_16bpc_rvv; c->intra_pred[SMOOTH_PRED ] = dav2d_ipred_smooth_16bpc_rvv; c->intra_pred[SMOOTH_V_PRED] = dav2d_ipred_smooth_v_16bpc_rvv; c->intra_pred[SMOOTH_H_PRED] = dav2d_ipred_smooth_h_16bpc_rvv; c->pal_pred = dav2d_pal_pred_16bpc_rvv; #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/itx.h000066400000000000000000000044231517466257200227410ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2023, Nathan Egge * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/itx.h" #define decl_itx_fns(ext) \ decl_itx17_fns( 4, 4, ext); \ decl_itx16_fns( 4, 8, ext); \ decl_itx16_fns( 4, 16, ext); \ decl_itx16_fns( 8, 4, ext); \ decl_itx16_fns( 8, 8, ext); \ decl_itx16_fns( 8, 16, ext); \ decl_itx16_fns(16, 4, ext); \ decl_itx16_fns(16, 8, ext); \ decl_itx16_fns(16, 16, ext) decl_itx_fns(rvv); static ALWAYS_INLINE void itx_dsp_init_riscv(Dav2dInvTxfmDSPContext *const c, int const bpc) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_RISCV_CPU_FLAG_V)) return; #if BITDEPTH == 8 assign_itx17_fn( , 4, 4, rvv); assign_itx16_fn(R, 4, 8, rvv); assign_itx16_fn(R, 4, 16, rvv); assign_itx16_fn(R, 8, 4, rvv); assign_itx16_fn( , 8, 8, rvv); assign_itx16_fn(R, 8, 16, rvv); assign_itx16_fn(R, 16, 4, rvv); assign_itx16_fn(R, 16, 8, rvv); assign_itx12_fn( , 16, 16, rvv); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/mc.h000066400000000000000000000051171517466257200225350ustar00rootroot00000000000000/* * Copyright © 2024, VideoLAN and dav2d authors * Copyright © 2024, Nathan Egge * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/mc.h" decl_blend_fn(BF(dav2d_blend, rvv)); decl_blend_dir_fn(BF(dav2d_blend_h, rvv)); decl_blend_dir_fn(BF(dav2d_blend_v, rvv)); decl_blend_fn(BF(dav2d_blend_vl256, rvv)); decl_blend_dir_fn(BF(dav2d_blend_h_vl256, rvv)); decl_blend_dir_fn(BF(dav2d_blend_v_vl256, rvv)); decl_avg_fn(BF(dav2d_avg, rvv)); decl_w_avg_fn(BF(dav2d_w_avg, rvv)); decl_mask_fn(BF(dav2d_mask, rvv)); decl_warp8x8_fn(BF(dav2d_warp_8x8, rvv)); decl_warp8x8t_fn(BF(dav2d_warp_8x8t, rvv)); static ALWAYS_INLINE void mc_dsp_init_riscv(Dav2dMCDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_RISCV_CPU_FLAG_V)) return; c->blend = BF(dav2d_blend, rvv); c->blend_v = BF(dav2d_blend_v, rvv); if (dav2d_get_vlen() >= 256) { c->blend = BF(dav2d_blend_vl256, rvv); c->blend_v = BF(dav2d_blend_v_vl256, rvv); } #if BITDEPTH == 8 c->blend_h = BF(dav2d_blend_h, rvv); if (dav2d_get_vlen() >= 256) { c->blend_h = BF(dav2d_blend_h_vl256, rvv); } c->avg = BF(dav2d_avg, rvv); c->w_avg = BF(dav2d_w_avg, rvv); c->mask = BF(dav2d_mask, rvv); c->warp8x8 = BF(dav2d_warp_8x8, rvv); c->warp8x8t = BF(dav2d_warp_8x8t, rvv); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/riscv/pal.h000066400000000000000000000033261517466257200227120ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2024, Bogdan Gligorijevic * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/pal.h" decl_pal_idx_finish_fn(dav2d_pal_idx_finish_rvv); static ALWAYS_INLINE void pal_dsp_init_riscv(Dav2dPalDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_RISCV_CPU_FLAG_V)) return; c->pal_idx_finish = dav2d_pal_idx_finish_rvv; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/scan.c000066400000000000000000000572671517466257200217440ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include "common/attributes.h" #include "common/intops.h" #include "src/scan.h" #include "src/thread.h" static const uint16_t ALIGN(scan_4x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 11, 14, 15, }; static const uint16_t ALIGN(scan_4x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 5, 12, 19, 26, 6, 13, 20, 27, 7, 14, 21, 28, 15, 22, 29, 23, 30, 31, }; static const uint16_t ALIGN(scan_4x16[], 32) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 5, 20, 35, 50, 6, 21, 36, 51, 7, 22, 37, 52, 8, 23, 38, 53, 9, 24, 39, 54, 10, 25, 40, 55, 11, 26, 41, 56, 12, 27, 42, 57, 13, 28, 43, 58, 14, 29, 44, 59, 15, 30, 45, 60, 31, 46, 61, 47, 62, 63, }; static const uint16_t ALIGN(scan_4x32[], 32) = { 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 5, 36, 67, 98, 6, 37, 68, 99, 7, 38, 69, 100, 8, 39, 70, 101, 9, 40, 71, 102, 10, 41, 72, 103, 11, 42, 73, 104, 12, 43, 74, 105, 13, 44, 75, 106, 14, 45, 76, 107, 15, 46, 77, 108, 16, 47, 78, 109, 17, 48, 79, 110, 18, 49, 80, 111, 19, 50, 81, 112, 20, 51, 82, 113, 21, 52, 83, 114, 22, 53, 84, 115, 23, 54, 85, 116, 24, 55, 86, 117, 25, 56, 87, 118, 26, 57, 88, 119, 27, 58, 89, 120, 28, 59, 90, 121, 29, 60, 91, 122, 30, 61, 92, 123, 31, 62, 93, 124, 63, 94, 125, 95, 126, 127, }; static const uint16_t ALIGN(scan_8x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, }; static const uint16_t ALIGN(scan_8x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 23, 30, 37, 44, 51, 58, 31, 38, 45, 52, 59, 39, 46, 53, 60, 47, 54, 61, 55, 62, 63, }; static const uint16_t ALIGN(scan_8x16[], 32) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 9, 24, 39, 54, 69, 84, 99, 114, 
10, 25, 40, 55, 70, 85, 100, 115, 11, 26, 41, 56, 71, 86, 101, 116, 12, 27, 42, 57, 72, 87, 102, 117, 13, 28, 43, 58, 73, 88, 103, 118, 14, 29, 44, 59, 74, 89, 104, 119, 15, 30, 45, 60, 75, 90, 105, 120, 31, 46, 61, 76, 91, 106, 121, 47, 62, 77, 92, 107, 122, 63, 78, 93, 108, 123, 79, 94, 109, 124, 95, 110, 125, 111, 126, 127, }; static const uint16_t ALIGN(scan_8x32[], 32) = { 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128, 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38, 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 9, 40, 71, 102, 133, 164, 195, 226, 10, 41, 72, 103, 134, 165, 196, 227, 11, 42, 73, 104, 135, 166, 197, 228, 12, 43, 74, 105, 136, 167, 198, 229, 13, 44, 75, 106, 137, 168, 199, 230, 14, 45, 76, 107, 138, 169, 200, 231, 15, 46, 77, 108, 139, 170, 201, 232, 16, 47, 78, 109, 140, 171, 202, 233, 17, 48, 79, 110, 141, 172, 203, 234, 18, 49, 80, 111, 142, 173, 204, 235, 19, 50, 81, 112, 143, 174, 205, 236, 20, 51, 82, 113, 144, 175, 206, 237, 21, 52, 83, 114, 145, 176, 207, 238, 22, 53, 84, 115, 146, 177, 208, 239, 23, 54, 85, 116, 147, 178, 209, 240, 24, 55, 86, 117, 148, 179, 210, 241, 25, 56, 87, 118, 149, 180, 211, 242, 26, 57, 88, 119, 150, 181, 212, 243, 27, 58, 89, 120, 151, 182, 213, 244, 28, 59, 90, 121, 152, 183, 214, 245, 29, 60, 91, 122, 153, 184, 215, 246, 30, 61, 92, 123, 154, 185, 216, 247, 31, 62, 93, 124, 155, 186, 217, 248, 63, 94, 125, 156, 187, 218, 249, 95, 126, 157, 188, 219, 250, 127, 158, 189, 220, 251, 159, 190, 221, 252, 191, 222, 253, 223, 254, 255, }; static const uint16_t ALIGN(scan_16x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, }; static const uint16_t ALIGN(scan_16x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 
48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127, }; static const uint16_t ALIGN(scan_16x16[], 32) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 127, 142, 157, 172, 187, 202, 217, 232, 247, 143, 158, 173, 188, 203, 218, 233, 248, 159, 174, 189, 204, 219, 234, 249, 175, 190, 205, 220, 235, 250, 191, 206, 221, 236, 251, 207, 222, 237, 252, 223, 238, 253, 239, 254, 255, }; static const uint16_t ALIGN(scan_16x32[], 32) = { 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128, 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38, 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 256, 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 11, 
42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449, 480, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 17, 48, 79, 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 19, 50, 81, 112, 143, 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423, 454, 485, 21, 52, 83, 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, 22, 53, 84, 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, 333, 364, 395, 426, 457, 488, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 29, 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, 465, 496, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, 498, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 223, 254, 285, 316, 347, 378, 409, 440, 471, 502, 255, 286, 317, 348, 379, 410, 441, 472, 503, 287, 318, 349, 380, 
411, 442, 473, 504, 319, 350, 381, 412, 443, 474, 505, 351, 382, 413, 444, 475, 506, 383, 414, 445, 476, 507, 415, 446, 477, 508, 447, 478, 509, 479, 510, 511, }; static const uint16_t ALIGN(scan_32x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 64, 59, 62, 65, 68, 63, 66, 69, 72, 67, 70, 73, 76, 71, 74, 77, 80, 75, 78, 81, 84, 79, 82, 85, 88, 83, 86, 89, 92, 87, 90, 93, 96, 91, 94, 97, 100, 95, 98, 101, 104, 99, 102, 105, 108, 103, 106, 109, 112, 107, 110, 113, 116, 111, 114, 117, 120, 115, 118, 121, 124, 119, 122, 125, 123, 126, 127, }; static const uint16_t ALIGN(scan_32x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255, }; static const uint16_t 
ALIGN(scan_32x16[], 32) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 
438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511, }; static const uint16_t ALIGN(scan_32x32[], 32) = { 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128, 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38, 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 256, 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449, 480, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 17, 48, 79, 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, 513, 544, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 514, 545, 576, 19, 50, 81, 112, 143, 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, 577, 608, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423, 454, 485, 516, 547, 578, 609, 640, 21, 52, 83, 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, 517, 548, 579, 610, 641, 672, 22, 53, 84, 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, 736, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 
334, 365, 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768, 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, 802, 833, 864, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 29, 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, 775, 806, 837, 868, 899, 930, 961, 992, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, 901, 932, 963, 994, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 159, 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, 748, 779, 810, 841, 872, 903, 934, 965, 996, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966, 997, 223, 254, 285, 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 255, 286, 317, 348, 379, 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 
720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 287, 318, 349, 380, 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, 814, 845, 876, 907, 938, 969, 1000, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908, 939, 970, 1001, 351, 382, 413, 444, 475, 506, 537, 568, 599, 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 415, 446, 477, 508, 539, 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, 973, 1004, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 479, 510, 541, 572, 603, 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 543, 574, 605, 636, 667, 698, 729, 760, 791, 822, 853, 884, 915, 946, 977, 1008, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 607, 638, 669, 700, 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 671, 702, 733, 764, 795, 826, 857, 888, 919, 950, 981, 1012, 703, 734, 765, 796, 827, 858, 889, 920, 951, 982, 1013, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 767, 798, 829, 860, 891, 922, 953, 984, 1015, 799, 830, 861, 892, 923, 954, 985, 1016, 831, 862, 893, 924, 955, 986, 1017, 863, 894, 925, 956, 987, 1018, 895, 926, 957, 988, 1019, 927, 958, 989, 1020, 959, 990, 1021, 991, 1022, 1023, }; const uint16_t *const dav2d_scans[N_RECT_TX_SIZES] = { [ TX_4X4 ] = scan_4x4, [ TX_8X8 ] = scan_8x8, [ TX_16X16] = scan_16x16, [ TX_32X32] = scan_32x32, [ TX_64X64] = scan_32x32, [RTX_4X8 ] = scan_4x8, [RTX_8X4 ] = scan_8x4, [RTX_8X16 ] = scan_8x16, [RTX_16X8 ] = scan_16x8, [RTX_16X32] = scan_16x32, [RTX_32X16] = scan_32x16, [RTX_32X64] = scan_32x32, [RTX_64X32] = scan_32x32, [RTX_4X16 ] = scan_4x16, [RTX_16X4 ] = 
scan_16x4, [RTX_8X32 ] = scan_8x32, [RTX_32X8 ] = scan_32x8, [RTX_16X64] = scan_16x32, [RTX_64X16] = scan_32x16, [RTX_4X32 ] = scan_4x32, [RTX_32X4 ] = scan_32x4, [RTX_8X64 ] = scan_8x32, [RTX_64X8 ] = scan_32x8, [RTX_4X64 ] = scan_4x32, [RTX_64X4 ] = scan_32x4, }; /* Index (eob) of the last coefficient per 4xN column. */ const struct Dav2dLastEob dav2d_last_eob_per_col = { .offset = { /* Offset into the table below. */ [ TX_4X4 ] = 1, [RTX_8X4 ] = 1, [RTX_16X4 ] = 1, [RTX_32X4 ] = 1, [RTX_64X4 ] = 1, [RTX_4X8 ] = 0, [ TX_8X8 ] = 0, [RTX_16X8 ] = 0, [RTX_32X8 ] = 0, [RTX_64X8 ] = 0, [RTX_4X16 ] = 2, [RTX_8X16 ] = 6, [ TX_16X16] = 10, [RTX_32X16] = 10, [RTX_64X16] = 10, [RTX_4X32 ] = 14, [RTX_4X64 ] = 14, [RTX_8X32 ] = 22, [RTX_8X64 ] = 22, [RTX_16X32] = 30, [RTX_16X64] = 30, [ TX_32X32] = 38, [RTX_32X64] = 38, [RTX_64X32] = 38, [ TX_64X64] = 38, }, .table = { 9, 0xffff, // 4x8, 8x8, 16x8, 32x8 9, 25, 41, 0xffff, // 4x16 9, 35, 67, 0xffff, // 8x16 9, 35, 77, 0xffff, // 16x16, 32x16 9, 25, 41, 57, 73, 89, 105, 0xffff, // 4x32 9, 35, 67, 99, 131, 163, 195, 0xffff, // 8x32 9, 35, 77, 135, 199, 263, 327, 0xffff, // 16x32 9, 35, 77, 135, 209, 299, 405, 0xffff, // 32x32 } }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/scan.h000066400000000000000000000033201517466257200217260ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_SCAN_H #define DAV2D_SRC_SCAN_H #include #include "src/levels.h" EXTERN const uint16_t *const dav2d_scans[N_RECT_TX_SIZES]; EXTERN const struct Dav2dLastEob { uint8_t offset[N_RECT_TX_SIZES]; uint16_t table[46]; } dav2d_last_eob_per_col; #endif /* DAV2D_SRC_SCAN_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/stx.h000066400000000000000000000035031517466257200216230ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_STX_H #define DAV2D_SRC_STX_H #include "common/bitdepth.h" #define decl_stx_fn(name) \ void (name)(coef *cf_out, const coef *cf, const int8_t *kernel, int stride, \ int eob HIGHBD_DECL_SUFFIX) typedef decl_stx_fn(*stx_fn); typedef struct Dav2dStxDSPContext { stx_fn stxfm; } Dav2dStxDSPContext; bitfn_decls(void dav2d_stx_dsp_init, Dav2dStxDSPContext *c); #endif /* DAV2D_SRC_STX_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/stx_tables.c000066400000000000000000013021471517466257200231570ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "src/stx_tables.h" // Scan order of primary transform coefficients for 4x4 IST const uint8_t dav2d_stx_scan_orders_4x4[TX_64X64][2][16] = { [TX_4X4] = { { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }, { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 }, }, [TX_8X8] = { { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 25, 18, 11, 19, 26, 27 }, { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 11, 18, 25, 26, 19, 27 }, }, [TX_16X16] = { { 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 49, 34, 19, 35, 50, 51 }, { 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 19, 34, 49, 50, 35, 51 }, }, [TX_32X32] = { { 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 97, 66, 35, 67, 98, 99 }, { 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 35, 66, 97, 98, 67, 99 }, } }; const uint8_t dav2d_stx_scan_orders_8x8[TX_32X32][2][64] = { [TX_8X8 - 1] = { { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }, { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 
50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 } }, [TX_16X16 - 1] = { { 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 113, 98, 83, 68, 53, 38, 23, 39, 54, 69, 84, 99, 114, 115, 100, 85, 70, 55, 71, 86, 101, 116, 117, 102, 87, 103, 118, 119 }, { 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67, 52, 37, 22, 7, 23, 38, 53, 68, 83, 98, 113, 114, 99, 84, 69, 54, 39, 55, 70, 85, 100, 115, 116, 101, 86, 71, 87, 102, 117, 118, 103, 119 } }, [TX_32X32 - 1] = { { 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 225, 194, 163, 132, 101, 70, 39, 71, 102, 133, 164, 195, 226, 227, 196, 165, 134, 103, 135, 166, 197, 228, 229, 198, 167, 199, 230, 231, }, { 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 39, 70, 101, 132, 163, 194, 225, 226, 195, 164, 133, 102, 71, 103, 134, 165, 196, 227, 228, 197, 166, 135, 167, 198, 229, 230, 199, 231 } } }; const uint8_t dav2d_coeff8x8_mapping[33][48] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 
43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 38, 39, 40, 41, 42, 43, 44, 45, 46, 51, 52, 53, 54, 55, 60, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 51, } }; const int8_t dav2d_stx_4x4_kernel[14][3][8][16] = { { { { 102, -45, -53, -5, -13, -3, -3, 19, 22, -2, 3, 8, 2, -5, -8, -4 }, { 34, 39, 55, -33, -81, -18, 0, -10, -14, 0, 19, 42, 8, 2, 3, -13 }, { -53, -8, -21, -8, -43, -3, 2, 56, 72, 8, 9, 21, 4, -29, -37, -8 }, { -35, -49, -40, 20, -21, 23, 5, -24, -36, 9, 29, 61, 13, 25, 30, -40 }, { 0, -80, 74, 39, -9, -16, 8, 33, -11, -11, -29, -3, 13, -11, 5, 4 }, { -22, -24, -30, -28, -49, -54, 11, -8, -6, 25, -8, -48, 6, 33, 38, 50 }, { 5, 20, -7, 69, -3, -58, -3, -54, 40, 3, -27, 
15, 30, 21, -26, -14 }, { -9, -29, -6, 5, -45, 33, -30, -63, -18, -12, -6, -29, -30, -39, -58, 16 } }, { { 115, -36, -36, -10, -16, -11, -4, 4, 4, -3, 4, 7, 2, 1, 1, -1 }, { 51, 67, 73, -2, 28, -6, -9, -30, -30, -9, -16, -26, -12, 2, 1, 10 }, { -17, 0, -17, -50, -68, -23, 2, -47, -66, -6, 1, -8, 4, 21, 26, 19 }, { 6, 52, -47, -58, 21, 68, 13, 46, -24, -15, 3, 0, 1, -8, 4, -1 }, { -12, -45, -19, -16, 31, -12, -13, -3, -27, -30, -48, -68, -22, -37, -44, -2 }, { 2, -50, 62, -7, -5, 17, 31, 53, -37, -24, -34, 23, 44, 17, 9, 3 }, { -10, 2, -9, -46, 55, -60, -30, 3, 24, -51, -7, 41, 7, 31, 21, 15 }, { 1, -28, -15, 12, 62, 6, 25, -10, -20, 40, 34, -43, 19, 30, 41, 52 } }, { { 114, 13, -42, -18, 23, -22, -4, -3, -3, -3, -4, 9, -2, -3, 4, 1 }, { -39, 75, -64, 13, 17, -14, -5, 36, -43, 9, -9, 28, -15, 3, -6, 11 }, { 19, -13, -37, 7, -98, 29, -4, 37, -22, -8, 17, -27, 12, 24, -17, 4 }, { 33, 66, 74, 40, 11, 22, -4, 38, -4, -4, 16, -27, 0, 9, -25, -3 }, { -4, 35, -5, 38, -39, -70, -6, -68, 2, 3, 18, -40, -8, -17, 1, -15 }, { 12, -49, 34, 36, -6, -45, -12, 7, -61, 8, -3, 44, -29, 11, -31, 37 }, { -9, 2, 16, -70, 17, -16, 16, -8, -74, -17, -15, -56, 17, 8, -25, -15 }, { 11, 33, 24, -24, -41, 51, -7, -46, -27, -7, -42, 29, -41, -39, 23, 21 } } }, { { { 70, 64, -63, 12, -52, 6, -12, 0, 4, -1, 2, -10, 13, -7, 4, 1 }, { -47, -26, -29, 64, -45, 9, 11, -29, 66, -17, -13, 18, -7, -9, 5, 10 }, { 32, -48, -23, 27, 22, -78, -30, 55, 21, -13, -10, 1, 20, 4, -4, -4 }, { 77, -28, 21, -8, 21, 15, 18, -45, 27, -19, -25, 51, -32, 22, -18, 1 }, { -23, 77, 42, 15, 17, -6, -30, 22, 23, -38, -48, 29, 15, 17, -25, -2 }, { 22, -10, 1, 35, 40, 59, 40, 37, 2, -38, -27, -53, 13, -35, 5, 8 }, { -1, -32, -15, 2, 7, 68, -98, 1, -3, 0, 3, -3, 8, 26, -4, -6 }, { 22, 5, 55, 8, -5, -23, -42, -36, 12, -15, 13, -25, -1, -44, 62, 47 } }, { { 80, -82, -39, -1, 26, 22, 1, 13, 10, -3, 1, -11, -11, 0, -1, 2 }, { 53, 21, -6, -6, -47, -86, 24, 42, 10, -2, 0, 16, 18, -20, -3, -1 }, { 61, 55, -25, -1, -5, 9, -57, -63, 
-12, -1, -1, 12, 24, 27, 4, -6 }, { -39, -27, -52, 28, 54, -61, -1, -24, -6, 2, -6, 10, 56, 10, 7, -7 }, { 13, 63, -39, 8, 59, -10, 33, 20, -5, 1, -10, -51, -52, 6, 6, 14 }, { -27, 8, -52, 15, -26, 13, -76, 55, 42, 1, -8, -21, 5, -17, -15, 14 }, { 21, 25, 29, 7, 50, 32, -4, 32, -26, -10, -4, 7, 60, -72, -15, -4 }, { -8, 19, -64, 7, -39, 55, 72, -10, 17, 4, 1, 16, 34, -8, -1, -11 } }, { { -39, -94, 30, -3, 1, 29, 21, -55, 24, 1, 1, 0, 4, -18, 5, 1 }, { -85, 42, 20, 8, -55, 45, -9, 16, 0, 1, 1, -28, 25, 8, -1, -6 }, { -80, 6, 5, 13, 35, -63, 4, 7, -11, 4, -2, 40, -52, 0, -2, 11 }, { -5, 51, -46, 6, 7, 26, 55, -62, -18, 6, -5, 20, 3, -55, 5, 9 }, { 9, 38, 69, -6, 63, 50, -25, -4, 24, 0, -13, 41, 5, 0, 8, 18 }, { -25, -31, -54, 0, 62, 44, 5, 44, -42, 9, -5, 6, 39, 19, -18, -3 }, { 17, -13, -10, 20, -42, 56, 13, 15, -13, -7, 7, 40, -72, 31, -26, 38 }, { -7, -11, -39, 0, -8, 14,-103, -13, -3, 7, -1, 5, -17, -53, 14, 16 } } }, { { { 103, -54, -44, -5, -9, -3, -3, 20, 14, -3, 3, 9, 2, -6, -2, -5 }, { 55, 73, 25, -23, -47, -25, -7, -44, -28, 0, 5, 23, 8, 24, 13, -3 }, { 44, 18, 35, -9, 74, 24, -7, -10, -30, -9, -14, -61, -33, -10, 0, 23 }, { -13, -75, 36, -22, 1, -61, -6, -25, -42, -2, 4, -9, 14, 28, 31, 25 }, { 16, 3, 78, 6, 11, -33, -3, 56, 10, -14, -15, 26, 32, -42, -25, -32 }, { -7, -10, 9, -56, -45, 52, 30, 42, -58, 11, 22, -9, -5, -37, 17, -1 }, { 15, -22, 44, 41, 17, 66, 40, -13, -1, 9, 6, 44, 12, 50, 29, -15 }, { -4, 31, -38, -40, 65, -13, 2, 41, -13, 11, 26, 21, 57, 33, 24, 0 } }, { { 87, -43, -77, 12, 22, -3, -3, 13, 15, 2, -6, -10, 1, -1, -2, 2 }, { 50, 29, 19, -60, -73, -20, -3, 16, 40, 16, 12, 22, 5, -9, -22, 0 }, { 3, -76, 50, -69, 48, -3, 6, 17, -5, 20, 11, -6, -9, 1, -5, 1 }, { -51, -13, -16, 8, -2, -13, 3, 47, 79, 14, -42, -41, 1, -17, -18, 21 }, { 2, 15, 5, 13, 10, -83, -15, 67, -49, -6, 16, -11, 33, -20, 5, -2 }, { -16, 52, -47, -47, 32, -1, -1, -11, -21, 83, -2, -19, 0, 15, 3, 2 }, { 48, 42, 45, -6, 16, 6, 7, -3, -4, -22, -42, -72, -23, 3, 33, 28 }, { 
-17, -11, -27, -12, -42, 44, 19, 37, -23, -10, 69, -48, -25, -17, 26, 19 } }, { { 109, -45, -42, -7, -12, -14, -1, 13, 13, -2, 2, 6, 5, -1, -1, -2 }, { 31, 73, 37, -21, -67, -53, 3, -14, -15, 0, 2, 21, 13, 4, 5, 0 }, { -46, -30, -15, 6, -45, -43, 17, 66, 52, 7, 5, 25, 20, -23, -14, -9 }, { -22, 27, -91, 32, 0, -13, -7, -44, -6, 5, 10, 40, 28, 30, 10, -14 }, { 15, 77, -41, 17, 28, -4, 5, 41, 27, -2, -19, -43, -27, -34, -24, 13 }, { 0, 19, -20, -34, -58, 94, -8, 10, 21, 11, 23, 21, -24, -2, -1, -5 }, { 23, 20, 35, 41, 31, 37, 1, 17, 1, -17, -3, 54, 57, -30, -27, -42 }, { -7, 9, -25, -64, 34, -5, 33, 46, -66, -3, 31, 21, 15, -22, 26, 0 } } }, { { { -86, 25, 7, 20, -64, 11, 2, 25, -16, -4, 14, -42, 7, 17, -16, -18 }, { 16, -37, 39, -24, -29, 4, -3, -41, 62, -2, 5, -58, 17, -13, 28, -42 }, { 66, -28, 20, 2, -27, 3, 9, 33, -19, -21, 30, -28, 7, 55, -56, 16 }, { 22, 28, -54, -18, 28, -38, 6, 15, -24, 4, -6, -40, -7, 11, -5, -84 }, { -35, 26, -17, -29, 50, -19, 23, -39, 44, -23, 25, -21, 26, 40, -36, 36 }, { 28, 80, 35, -3, -3, -14, -10, 53, 41, -2, 7, -4, -4, 18, 50, 16 }, { -30, -17, 69, -15, 35, 0, 13, 14, 8, -4, -31, 37, -39, 45, -19, -48 }, { 6, 21, 38, 51, 6, -53, -3, -35, -8, -17, 56, 0, -41, -43, -26, -11 } }, { { -95, -25, 57, 2, -18, 22, 2, -35, 34, 0, -6, 5, 0, -12, 10, 5 }, { 14, -45, 27, -21, 79, -6, 10, -15, -11, -1, -18, 64, -22, 16, -26, 20 }, { 47, 11, -18, 14, -15, 25, -8, -53, 49, 0, -11, 37, 8, -44, 33, 52 }, { -16, 37, -28, 34, -39, 27, 22, -8, 6, -26, -24, 45, -28, 51, -58, 0 }, { 44, -27, 15, -22, -12, 3, 14, -44, 44, -10, -25, -23, -13, 13, -8, -84 }, { -19, -71, -68, 48, 0, -12, 15, -32, -16, 7, 36, -8, -25, -14, 0, -9 }, { 12, -14, 24, 46, -3, -54, 2, 44, 60, -18, 11, 4, -32, 41, 39, 16 }, { -41, 53, -43, -2, 44, -30, -5, -4, 14, 11, -35, 26, -24, -24, 41, -46 } }, { { 110, -43, 10, -22, 35, -13, 0, -8, 10, -6, -3, 15, -5, 2, 0, 3 }, { 15, 60, -76, -16, 42, -27, -3, 40, -43, 2, -11, 12, -5, 11, -9, -6 }, { -47, -10, -5, -22, 72, -4, 9, -46, 29, 
-1, -24, 59, -17, -18, 15, 20 }, { -15, -70, -86, -41, -25, 15, 5, -14, 5, -2, 5, -24, 13, -12, 3, -10 }, { 21, 61, -14, -18, 1, -4, -14, -30, 62, -4, 12, -41, 31, -37, 50, -8 }, { -14, 10, 36, -90, 7, 34, -6, 40, 16, -35, -21, -8, 18, 34, -15, -11 }, { 25, 20, -14, -2, -48, 65, -14, 6, -15, 6, -36, 57, 12, -29, 19, 48 }, { 0, -11, -27, 44, 24, 41, 2, 59, 67, -8, 37, 13, -11, 36, 14, 14 } } }, { { {-103, -31, 38, 7, -26, 28, 8, -36, 19, 3, 0, -3, 1, -12, 4, 0 }, { -16, 75, -16, 3, -60, 33, -16, 31, -10, 0, 4, -48, 44, -4, 4, -12 }, { -51, 31, 28, 6, 27, -34, -8, 70, -17, -3, -1, 22, -25, 59, -16, -3 }, { 21, -54, 49, -30, 20, 18, -10, 26, 7, 0, -2, -41, 64, 35, -8, -33 }, { -29, -49, -55, 9, -11, -68, -8, 20, -30, 4, 15, -50, 1, -24, 4, -23 }, { 19, -40, -28, 32, -67, 20, 16, 13, 32, -7, 12, 1, -30, 71, -6, 7 }, { -25, -8, -67, 6, 45, 58, 42, -1, -42, 2, -8, 17, 28, 21, -26, -9 }, { 13, -30, 30, -19, -29, 42, -5, 39, -70, 1, 9, -2, -44, -38, -31, 26 } }, { { -88, 68, 2, 20, -29, -2, -5, 40, -24, 1, 2, 1, -10, 14, -9, 5 }, { 20, 37, -38, 12, 65, -47, 5, -1, -29, 5, -8, 59, -35, -22, 3, 19 }, { 56, -13, 16, 10, -18, 8, -9, 72, -24, -14, 18, 14, -24, 54, -33, 36 }, { -28, -54, 21, 27, -26, 43, -13, -11, -21, 1, 2, 50, -38, -58, -2, 32 }, { 31, 28, 63, -4, -36, -49, -24, 12, 21, 2, 0, -7, -49, -38, 27, -34 }, { -34, -12, 59, -8, 25, -37, 27, -37, 46, -10, 15, 16, -16, 43, -15, 49 }, { 7, -13, -10, 34, -34, -51, 44, -32, -47, -17, 38, -33, 6, -22, -51, -5 }, { -39, -66, 1, -4, 34, -26, 7, 32, -19, 3, -14, -1, -34, 24, -5, -72 } }, { { 104, -59, 35, -4, 11, -7, 1, -22, 14, 1, 1, 3, -1, -5, 3, 1 }, { 33, 48, -45, -10, 80, -44, 2, 11, -14, -1, -6, 39, -23, 2, -3, 9 }, { 45, 30, 11, 12, -37, 11, -15, 86, -38, -3, 3, -11, 0, 45, -21, -4 }, { 30, 30, -53, -27, -8, -11, 9, -20, 19, -9, 9, -74, 53, 3, 2, -40 }, { -6, -76, -81, -2, -3, 18, 16, 7, -47, -1, 0, 7, 0, 11, -29, 9 }, { 34, 32, -40, 39, -52, 19, -18, -16, -16, 7, 7, 15, -14, -72, 29, 26 }, { -10, -13, 5, 25, -37, -98, 
15, -10, -19, -5, 23, -25, -46, -3, -15, -16 }, { 8, 13, -23, 13, -22, 8, 19, -15, 66, -40, 37, 10, -17, 49, -23, 61 } } }, { { { -94, 10, -63, 11, 30, 16, 2, 18, 37, 16, 7, -2, 0, -5, -14, -3 }, { 50, -53, -81, -38, -26, 7, 1, 25, 20, -5, 22, 26, 5, 1, 8, -5 }, { 64, 8, -23, 49, 64, -4, -8, -5, 42, 16, -23, -36, -10, -10, -34, 1 }, { -17, -75, 2, 77, -43, -4, 11, 4, -12, 29, -35, -4, 14, 7, -1, 4 }, { 7, -5, 54, 23, -8, 16, -3, 30, 60, 33, 65, 42, -6, -16, -5, -27 }, { -8, 4, -5, -26, -51, -57, -7, -60, 37, 33, 20, -31, -1, 6, -46, 6 }, { 12, 33, -33, 38, 12, -21, 3, -37, -24, 47, 42, 20, 9, 28, 69, 9 }, { -19, -76, 21, -11, 58, -20, -6, -35, 7, -36, 44, -24, -8, 6, 25, 9 } }, { { 84, -42, -75, 12, 17, -1, -3, 15, 27, -1, -12, -12, 0, -3, -7, 5 }, { 44, 8, 28, -57, -75, -7, -2, 17, 39, 3, 27, 34, 2, -11, -28, -6 }, { -52, 35, -29, -4, 2, -9, -1, 21, 81, 25, -31, -32, -2, -9, -39, 17 }, { 37, 12, 45, 12, -35, -31, 4, 8, -9, 8, -79, -46, 1, 9, 34, 27 }, { -28, -44, -8, -54, 3, -2, 9, 56, 10, 17, 24, -27, -11, 8, 75, 4 }, { 17, -35, 62, -30, 56, 6, -5, -3, 8, -19, 18, -49, -12, -13, -50, 30 }, { -40, -77, 5, 32, -42, 13, -2, 33, -5, -40, -33, 20, 11, -26, -27, 9 }, { 20, 8, 51, 64, 27, 15, 2, 50, 44, 16, 12, 20, -15, -15, 21, -46 } }, { {-114, 32, -5, 31, 26, 14, 2, 5, 5, 11, -10, -10, -3, -4, -8, 0 }, { 14, 17,-104, -24, 50, -14, 2, -5, 40, 6, 7, -1, 5, 0, -3, -3 }, { 9, 69, 40, -26, 38, -52, 10, -60, -15, -21, -7, -20, 10, 9, 9, 5 }, { -34, 24, 4, -95, -32, 13, -6, 23, 0, -21, 40, 40, -6, -4, 15, -7 }, { 12, -7, 42, -20, 43, 39, -24, 14, 60, -29, 18, -49, -27, -14, -41, 4 }, { -24, -42, 16, 1, -25, -40, 41, -32, 59, 12, 48, -5, 47, 16, -17, -11 }, { 26, 67, 17, 23, -24, -13, -6, 21, 36, 64, 10, 32, -13, -31, -33, -23 }, { 2, -15, 20, 33, 51, -22, 12, 23, 17, -49, 8, 64, 0, -36, 37, -37 } } }, { { {-104, 25, 12, 32, -49, 21, 1, 6, -1, 0, 11, -24, 8, 0, 1, -5 }, { 7, 55, -83, 18, 5, -16, -4, 45, -55, 11, 5, 1, -2, 14, -18, -1 }, { 11, 94, 70, -23, 6, -24, -15, 17, 1, 
-15, -12, 14, -8, 7, 4, 5 }, { -56, -6, -2, 19, 79, -7, 5, -16, -10, -1, -23, 65, -23, -6, -5, 29 }, { 17, 8, -3, 83, -2, -67, -4, -18, 35, -6, 33, -8, -26, -18, 29, -2 }, { -27, -13, -29, -62, -54, -75, 4, -9, 11, 3, -17, 35, -13, -7, 13, 13 }, { 1, -45, 51, 17, -17, -37, 7, 20, -61, -3, 23, 3, -26, 37, -55, 0 }, { 5, 23, 3, -9, -3, -3, 3, -92, -61, 23, 1, -22, -4, -47, -12, -6 } }, { { 104, -39, -59, -9, 3, -15, -2, 8, 13, 3, 0, 6, 6, 0, -2, -1 }, { 53, 28, 82, -47, -13, -34, -3, -29, -19, -26, 8, 15, 3, 8, 16, -1 }, { -11, -90, 43, 1, -57, 12, 27, 7, 35, -10, 19, -6, 16, -9, -6, -8 }, { -26, 22, -44, -13, -69, -54, 21, -13, -15, 27, 22, 48, 29, 5, 3, -15 }, { 7, -7, 11, 94, 11, -62, 12, -48, 4, -16, -12, -21, 11, -1, 8, -2 }, { -37, -41, -6, -50, 63, -54, -4, -21, 42, 4, -19, 31, -8, -10, 13, 2 }, { 11, 42, 20, 26, 3, -15, 10, 68, 64, -23, -10, 46, 8, -27, -26, -13 }, { -9, -30, -3, 5, 24, -4, 34, 45, -76, -51, -27, 33, 34, 6, 17, 1 } }, { { 115, -20, -32, -22, 20, -27, -1, -2, 0, 0, -1, 8, 0, -1, 1, 0 }, { 12, 3, 99, -22, -7, -49, 3, -43, 17, -27, 2, 17, -6, -7, 16, 0 }, { -4,-102, -1, 25, -31, 10, 28, -15, 46, -3, 13, -21, 16, -11, 5, -7 }, { -26, 2, -36, 62, 58, -65, 3, -32, 22, 6, -6, 19, -20, -20, 15, 4 }, { 18, -16, 36, 32, 58, 56, -58, 1, 18, -15, -15, -33, -36, 16, 4, 12 }, { -33, -43, -7, -74, 41, 4, -6, 27, 24, 12, -29, 59, -18, -4, 3, 12 }, { 8, 26, 13, 0, 55, 38, 81, 6, 17, -25, -22, -10, 45, -29, 7, -8 }, { -11, 16, -25, -53, 9, 18, -16, -84, 19, 41, 0, -45, 7, -9, 19, -16 } } }, { { { 105, -52, -47, 6, 12, 10, 0, 8, 8, 0, -4, -11, -5, 3, 2, 0 }, { 42, 33, 42, -38, -65, -44, 14, 45, 37, 9, -4, -9, -7, -11, -9, 10 }, { -4, -64, 66, -59, -3, 61, -14, -14, 15, 11, 3, -1, -3, 6, -4, 0 }, { -48, -29, -42, -26, 15, -16, 20, 38, 43, 14, -22, -62, -31, 27, 20, -1 }, { -11, -52, 33, 43, -17, -18, 33, 45, -41, -49, 33, 1, -39, 8, -6, -1 }, { 16, 3, 28, -32, 65, -43, 65, -18, -31, 45, -2, 18, -17, 6, 1, -3 }, { -27, -48, -48, -45, -17, -41, -2, -3, -1, -5, 18, 
48, 21, -44, -39, 20 }, { 8, 43, -11, -29, 46, 35, 16, 16, 26, -49, 60, -12, -16, -3, -45, 22 } }, { { 121, -17, -14, -4, -31, -6, -2, 10, 9, -1, 0, 10, 1, -5, -4, -2 }, { 15, 71, 69, 5, -30, 3, 0, -43, -42, 1, 3, 32, 5, 16, 15, -17 }, { -6, -73, 71, 24, -6, -29, 2, 42, -39, -1, -9, 6, 14, -23, 21, -3 }, { -36, -6, -22, -27, -80, -27, -7, 15, 22, -10, 17, 61, 20, -20, -23, -22 }, { 11, 36, 35, -54, 44, -57, -14, 23, 22, -12, 31, -19, 31, -28, -29, 17 }, { -1, -32, -37, -46, 6, -54, -20, -51, -53, -16, 5, -8, 13, 37, 36, -3 }, { 1, -15, 12, -71, 1, 63, 27, 23, -30, -30, 45, 5, -41, -9, 9, -5 }, { -9, 15, 4, -13, -67, -11, -1, -2, -20, -7, -31, -65, -30, -25, -12, 64 } }, { { 122, -2, -28, -12, -12, -14, -2, 0, 7, 1, 1, 6, 4, -1, -2, -1 }, { 20, 53, 97, -18, -24, 2, -7, -31, -25, -27, 11, 14, 2, 10, 8, -6 }, { -9, 102, -60, 10, 9, 5, -9, -27, -19, 24, -6, 14, -7, 9, 0, -1 }, { -13, 8, -16, -16, -97, 50, -15, 22, 41, 1, 24, 12, -4, 4, -23, -4 }, { 24, -2, 14, 40, 25, 96, 5, -6, -2, 5, -12, -45, -37, 19, 9, 8 }, { -6, -10, -10,-112, 36, 32, 1, -11, -7, 17, 16, 3, -20, 9, 6, -2 }, { 4, 12, 4, 0, 7, -10, -56, 86, -57, 5, 37, -24, -13, 7, -9, 5 }, { -2, -42, -21, 18, -36, 8, 30, -30, -86, 9, 24, 39, -23, 20, 12, -13 } } }, { { { 72, 100, -15, -1, -4, 20, -8, 15, -8, -1, 0, -6, 4, 3, 0, -1 }, { -52, 19, -30, 4, -66, 76, 29, -22, -1, -2, -1, -21, 17, -14, 2, -2 }, { 40, -25, -59, 21, -75, -59, -13, -24, 16, -2, 2, -7, -9, -5, 2, -2 }, { 74, -44, 31, -2, 5, 43, 56, -51, 1, 4, -2, 3, -14, -15, -1, 4 }, { 25, -38, -35, 20, 7, 14, 42, 87, 3, -4, -3, -12, 53, 9, -1, -3 }, { -5, 13, 9, 50, 4, 15, -2, -1, 98, -10, -6, 55, 3, 6, 18, 17 }, { -24, 28, -46, 5, 33, -31, 83, 0, 0, -10, 1, 5, -62, -4, 1, 5 }, { 4, -13, -32, -42, -15, 23, -4, 0, -24, 3, -16, 78, -10, 72, -19, 10 } }, { { 109, -60, -19, 0, 2, 16, 0, 9, 4, -1, 0, -4, -6, 1, 1, 0 }, { 34, 27, 76, -23, -63, -50, 23, 30, 9, 3, 1, 9, -2, -10, -4, 2 }, { -32, -58, 65, -20, -25, 75, -32, 0, 4, -1, 2, 3, -1, 0, -1, -1 }, { 32, 27, 
45, 35, 25, 8, -14, -68, -50, -5, 9, 41, 39, -7, 0, -6 }, { 13, 18, 7, -90, 38, 1, 0, -50, 51, 20, -3, -15, 23, 3, -7, 5 }, { -32, -84, 6, -9, 11, -62, 48, -31, -17, -1, -1, 18, 15, -10, -4, 0 }, { 4, 11, 39, 26, 45, 37, 89, 11, 5, 18, -13, -29, -17, 23, 20, 0 }, { -1, 14, -22, -43, 11, 37, 43, 31, -23, -17, 31, 72, -11, -44, -23, 8 } }, { { 122, -14, 13, 2, 12, -26, 6, -13, 3, 2, 2, 0, -1, -1, -2, 0 }, { 8, 113, -24, -3, 28, -35, -20, 15, -1, -2, 3, 10, -15, -2, 3, -1 }, { -34, -32, -7, -3, 26,-105, 25, -37, 1, 1, 1, 6, -18, -11, -4, -3 }, { -13, 32, 115, 7, -14, -9, -6, -34, 13, 6, 5, -14, 11, -1, -4, -1 }, { 2, 33, -22, -2, -15, 13, 98, -37, 1, -1, -2, -10, 58, -3, -3, 6 }, { -5, -4, 3, -15, 93, 53, 7, -48, 25, 2, 4, 15, -25, -27, 6, -6 }, { 1, 2, -39, -10, -18, -4, -65, -64, 39, -8, 8, -53, 39, 1, -6, -11 }, { 1, -3, 11, -94, -4, -6, -21, -10, -26, -17, -27, 54, 41, 6, 16, 13 } } }, { { { 119, -16, -11, -7, -34, -7, -1, 16, 13, -1, 0, 7, 0, -6, -5, 0 }, { 0, 71, 66, -7, -59, -10, -2, -22, -19, 0, 7, 47, 11, -2, -2, -15 }, { -30, 6, 7, -39, -35, -43, 0, 62, 57, -2, 4, -20, 4, -37, -31, 31 }, { -10, -32, -26, 18, 4, 8, -35, -8, -13, -27, 33, 64, 46, -54, -42, 4 }, { -8, -76, 68, 19, -11, -23, 10, 41, -41, 2, -8, 5, -1, -11, 30, -14 }, { 26, 21, 59, 14, 49, 36, 4, -1, -16, -7, 9, -35, -11, -39, -36, 57 }, { -10, -37, 18, -54, -19, 63, 21, -5, 16, -22, 41, 19, -48, 27, -29, -15 }, { -8, -30, -11, -22, -47, -25, -26, -55, -37, -15, -8, -19, -9, 19, 2, 77 } }, { { 110, -35, -49, 5, -1, 0, 0, 13, 16, 1, -6, -11, -4, 1, 1, 2 }, { 34, 58, 43, -38, -75, -24, 1, 16, 32, 4, 4, 19, 4, -14, -20, 7 }, { 37, 22, 17, 26, 0, 39, -9, -60, -55, -19, 27, 62, 13, 2, -11, -16 }, { 29, 31, 49, 6, 11, 5, 32, -5, -21, 23, -26, -40, -32, 50, 57, -30 }, { -8, 81, -67, 55, -3, -29, -3, -9, 5, 5, -25, -4, 15, -6, 14, 0 }, { 24, 6, 54, 44, 40, -36, -4, 36, -33, -11, 4, -23, 23, -54, -9, 37 }, { -11, -40, -10, 19, -51, -72, 8, 21, -47, -29, 13, 24, -4, 17, 31, -30 }, { 1, -26, 8, -20, -28, 
-13, -56, -53, -22, 33, -56, -14, 35, -14, 33, 31 } }, { { 119, -26, -28, -6, -16, -7, 1, 13, 13, -2, 0, 1, 0, -4, -4, 1 }, { 23, 96, 34, -12, -49, -34, -4, -21, -7, 0, 5, 30, 16, -3, -4, -6 }, { 9, -55, 102, -37, -17, 2, 10, 10, -19, -8, 16, 14, -3, -5, -1, -2 }, { 28, 18, 17, 8, 23, 86, -15, -65, -25, 2, -2, -2, -20, 36, 11, -9 }, { -21, -1, -15, -23, -77, 68, -23, 24, 54, -7, 7, 7, -10, -9, -19, 5 }, { 0, -20, 17, 106, -45, 2, -9, 3, -16, -30, -18, 9, 16, -6, 22, -9 }, { 14, 40, 36, 26, 10, 15, 53, 41, 23, 4, -16, -61, -51, -10, -3, 25 }, { 10, 14, 29, 2, 26, -2, -97, 25, 11, 0, -1, -52, 36, -8, 2, 15 } } }, { { { -93, 41, -73, -8, 12, -9, 2, 10, -15, -5, -2, 9, -4, 0, -1, 1 }, { 57, 16, -49, -49, 76, -16, 5, -24, 25, -10, -4, 19, 0, -11, 15, -1 }, { 51, 31, -26, 40, 1, -2, 2, 51, -69, 23, -16, 40, -12, 10, -16, 19 }, { -16, 53, 47, 67, 49, -43, 10, -38, 10, 19, 15, 3, -3, -13, 0, -2 }, { 23, 64, 2, -14, -13, -9, 4, 35, 19, -42, 48, -55, 8, 38, -34, 0 }, { -17, 32, 57, -33, 3, 1, -9, 53, 18, -36, -16, 56, -25, -7, 47, 7 }, { 24, 53, -19, 5, -53, 10, 8, -15, -1, 15, -22, -25, 10, -44, 59, -51 }, { 0, 5, 21, -35, -25, -56, 27, -47, -61, -50, -34, 7, 15, 4, -20, 1 } }, { { 109, 29, -54, 11, 10, -10, 1, 14, -17, 0, 4, 0, 0, 3, -4, 0 }, { -36, 63, -47, 40, -73, 7, -3, 9, 12, -14, 12, -34, 3, -6, 12, -5 }, { 0, -93, -55, 53, -12, -3, 6, -26, 22, -16, 2, 0, -10, -6, 8, 3 }, { -42, -15, -39, 0, 28, -35, 1, 44, -80, 6, 25, -19, -4, 17, -29, -20 }, { 2, 6, 26, 26, 0, -99, -14, 39, 44, -20, -8, 12, -20, 24, 8, 5 }, { -32, 35, -48, 4, 28, 7, 10, -20, 4, -14, -34, 83, -25, 17, -23, 27 }, { 5, -12, -20, -65, -64, -47, -2, -39, -9, 48, 5, 16, -27, -9, -13, 2 }, { -5, -20, -35, -43, -11, 11, -24, 51, 34, 3, -70, -9, 51, 12, -17, -19 } }, { { 118, -25, 4, -16, 34, -4, 2, -12, 9, 4, -5, 12, -1, -3, 4, 3 }, { -14, -87, 52, -7, -34, 29, -5, -42, 32, 1, 2, -25, 11, -11, 6, -10 }, { -12, 46, 97, -19, 30, -37, 5, -7, 33, -6, -2, 23, -9, -4, 14, 7 }, { 43, 35, 36, 44, -63, -3, 0, 30, 
-4, -12, 35, -54, 8, 17, -15, -17 }, { -6, -18, -13, 72, 18, -65, 27, -53, -5, -10, 32, 3, -22, -29, -5, -9 }, { 6, 25, -30, -70, -50, -51, 20, -35, 11, 10, -14, -31, -2, -31, 12, -25 }, { -1, -44, 27, -23, -26, -50, 17, 27, -60, 10, -17, 22, -24, 29, -51, 20 }, { 7, 35, 23, 5, -7, 40, 8, -60, -67, 4, -18, 16, 44, -34, -32, 4 } } }, { { { -80, -85, 37, -12, 25, -18, -4, -11, 2, 3, -3, 13, -7, 3, -3, 1 }, { -60, 22, -11, 11, -69, 57, 17, -47, 30, -5, -6, -2, -5, -19, 10, 7 }, { -55, 55, 27, -31, 30, 36, -6, 46, -45, 7, 9, -30, 35, -3, -2, -9 }, { 40, -2, 53, -24, 45, 66, 43, -39, 0, 9, -5, 34, -12, -2, -7, 11 }, { -32, 52, -2, 8, 17, -17, -5, 15, 29, -10, -22, 69, -34, 62, -22, 2 }, { 17, 5, 65, 11, -13, 1, -56, 15, 66, -3, -22, 6, 55, -12, 11, 8 }, { 0, 38, 36, -43, -14, -44, -45, -51, -19, 27, 31, -1, -45, -15, 24, 12 }, { 17, -20, 29, -14, -57, 16, -19, -7, -32, 10, -12, -11, 0, 56, -57, -54 } }, { { 102, -58, 44, -7, -13, 17, -1, -9, 7, 1, -1, -3, 5, -1, 1, -1 }, { 47, 15, -69, -27, 74, -30, 14, -38, 11, 3, -2, 13, 3, -12, 4, -1 }, { 9, 58, 77, 20, 10, -67, 24, -37, 6, -2, 7, -6, 10, -2, -1, -2 }, { 37, 21, -18, 71, 0, -12, 1, 39, -68, 20, -5, 41, -23, -1, -12, 13 }, { -23, -44, -13, 73, 18, 13, 9, -64, -12, 20, 3, -36, 38, 12, -15, 1 }, { 33, 54, -12, 23, 15, 34, -25, 30, 32, -15, 36, -62, 4, 42, -25, 5 }, { -10, 11, 37, 19, 57, 40, -51, -4, 27, 48, -27, 13, -36, -24, 30, -11 }, { 12, 41, -5, -27, -25, 25, 16, -2, -17, 62, -57, -7, 60, 28, 19, 9 } }, { { 113, -32, -35, 3, 29, -14, 8, -6, -7, 1, 0, 9, -6, -2, -1, 1 }, { 28, 12, 96, -56, 10, -25, 1, -24, 31, 19, -11, 16, -4, -12, 5, 3 }, { -14,-102, 17, 1, -10, 51, -4, -45, 14, 0, -5, -13, 16, -13, 5, -6 }, { -9, 23, 2, -2, 86, 64, -37, -14, -21, -3, -1, 46, 5, 3, 2, 11 }, { 43, 29, 12, -17, -45, 75, -20, 43, 12, 6, -9, -30, 40, 24, 6, -1 }, { -19, -22, -45, -89, -14, -2, 9, 20, -26, 41, -29, 33, 3, 4, -22, 16 }, { 1, 44, -52, -21, -7, 10, 7, -62, 44, 0, -18, -12, 20, -56, 31, -17 }, { -13, -24, -16, -9, 36, -3, 
-4, 58, 77, -6, -25, -7, -33, 16, 51, 4 } } }, { { { 122, -4, -16, -7, -30, 0, 0, 2, 10, -1, 2, 11, -1, 1, -4, -3 }, { 11, 31, 107, -5, -25, -2, 2, -17, -47, -8, 8, 19, 2, 2, 15, -5 }, { -28, 29, -17, -82, -59, -4, -1, 2, 30, -8, 39, 35, 0, -3, -22, -7 }, { 12, 104, -20, 0, 42, 3, 5, -48, 11, 13, -3, -17, 0, 18, -7, 3 }, { -10, 21, 24, 63, -39, -15, -3, 19, 58, 52, -23, 24, 10, -11, -43, -7 }, { 15, -13, 54, -50, 49, 2, 4, 29, 53, 10, 3, -50, -3, -15, -33, 23 }, { -6, 7, -9, -33, -50, -24, 1, 0, -18, 46, -65, -49, -11, 4, 34, 41 }, { -2, -4, 0, 3, -10, 94, 22, 11, -8, 59, 31, -3, -43, 19, 9, 2 } }, { { 106, -24, -62, 17, 2, 2, -1, 6, 14, 0, -9, -7, -2, 1, 1, 0 }, { 42, 50, 42, -56, -68, -9, 3, 8, 38, 20, -3, 13, -3, -4, -18, 4 }, { 50, -9, 65, 5, 10, 24, 2, -29, -68, -29, 42, 30, 0, 2, 1, -7 }, { -12, 72, -47, 52, -32, 8, -3, -34, -7, -31, 10, 47, 7, -1, -22, -8 }, { 20, 47, 46, 67, 5, -26, 8, 22, -10, 1, -50, -43, 1, -1, 38, 3 }, { -7, -13, -18, -12, -46, -60, 1, 46, -26, -68, 38, -19, -4, -12, 24, 8 }, { 6, -7, 28, 38, 34, -19, -15, 43, 41, -8, 32, 15, 32, -43, -66, 18 }, { -9, -63, 17, 23, -60, 3, -26, -20, -10, -16, -70, 18, 9, -21, -26, 9 } }, { { 124, -6, -14, -10, -25, -5, -1, 3, 7, -2, 2, 7, 0, 0, -2, -2 }, { 7, 79, 83, -10, -27, -12, 6, -27, -31, -8, 5, 16, 3, 5, 9, -3 }, { -5, 86, -84, -9, 12, -23, 5, -25, 16, 13, 0, 5, 5, 5, -5, -1 }, { -26, -8, -4, -19, -81, -52, -2, 43, 35, 7, 14, 40, 16, -20, -21, -7 }, { 9, 12, 7, 105, -5, -34, 2, 20, -6, 36, -44, -8, 9, -5, -2, 3 }, { -9, 16, -20, 27, -61, 95, -15, 2, -6, 18, 4, 22, -27, 14, -2, -8 }, { 6, 37, 29, -10, 14, 27, 4, 48, 64, 6, 2, -57, -21, -21, -44, 24 }, { -4, 17, -28, 3, -25, -6, 13, 51, -50, -75, -14, -33, -24, -23, 31, 18 } } }, { { { 120, -24, -33, -10, -1, -12, 1, 8, 6, 3, 0, -1, -1, -1, -1, 0 }, { 31, 81, 71, -36, -9, -36, -4, -12, -18, 1, 10, 21, 9, -2, -4, -2 }, { -5, 84, -83, 38, -9, -17, -13, 14, -9, 8, -3, -3, 5, 1, -1, 0 }, { -18, -20, -9, -20,-100, -50, 10, 25, 37, 2, 5, 9, 9, -16, -13, 3 
}, { 23, 7, 47, 98, -30, 10, -3, -1, 24, -41, -1, -23, 7, 0, 13, -2 }, { -11, -27, -2, 34, 46,-102, 27, -19, -11, -8, -10, 21, -7, -4, 3, 0 }, { 1, -16, -21, 2, -43, 6, -27,-101, -43, -15, 3, 19, 8, 8, 13, -13 }, { -1, -19, 5, 8, -21, -4, -4, 59, -98, -28, 31, -3, -18, 16, -4, -7 } }, { { 126, -4, 8, -4, -17, -10, -2, -1, -2, -2, 0, 4, 1, 0, 1, -1 }, { 7, 97, 55, 2, 44, 19, -6, -27, -21, 0, -2, -15, -9, 5, 3, 2 }, { -1, -64, 82, 34, 53, -34, 1, 6, 2, 6, -10, -15, 1, -3, -3, 3 }, { -16, 4, 28, 14, -68, -57, -17, -71, -23, 9, -1, 19, 11, 25, 9, -2 }, { 5, 18, -10, 90, -18, 20, 7, -15, 74, 30, -2, -11, -4, -5, -19, 0 }, { -4, -15, 44, 20, -56, 73, 43, 24, -50, 13, -6, -4, 10, -6, 12, 1 }, { 6, -30, -31, -4, 9, 19, 12, -64, -26, 0, -15, -77, -50, -3, -4, 19 }, { -1, -21, 33, -42, 1, 35, 22, -42, 52, -4, 61, 37, -39, 5, 15, -6 } }, { { 122, 10, 17, -14, 24, -16, -1, -3, -5, -1, -2, 4, -2, 0, -1, 0 }, { 0, -94, 74, 7, -4, 2, 0, -32, 33, 2, 1, 2, 0, -3, 3, 1 }, { 31, -67, -85, 8, -56, 13, 8, 5, -4, 5, 6, -8, 8, 0, 0, -1 }, { 17, 29, 43, 20, -52, 79, 8, 25, -3, 2, 10, -45, 33, -4, -1, -9 }, { -1, 26, 20, -29, -82, -53, -1, -61, -31, -3, 2, -12, -7, -6, 3, 1 }, { 9, 25, -9, 96, -6, -32, -8, -21, 48, 16, 40, -14, -20, -6, 11, -1 }, { -3, -23, 26, 38, -7, -43, -14, 59, -69, 22, 5, -11, -5, 34, -41, -5 }, { 2, 11, 15, -19, -45, -33, 6, 62, 58, -11, 10, 58, 29, 26, 14, 5 } } } }; const int8_t dav2d_stx_8x8_kernel[11][3][32][48] = { { { { 103, -44, -48, -9, -12, -6, -4, 19, 24, -2, -2, 4, 7, 2, -2, -1, 3, -7, -10, 2, 0, -1, 1, -2, -4, -1, 1, -2, 2, 3, -1, 1, 0, 0, 0, 1, 2, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0 }, { 28, 31, 52, -35, -70, -18, 0, -14, -23, -3, -7, 28, 50, 12, -5, 0, -1, 7, 14, 3, -2, -2, 5, -12, -24, -1, 4, 1, -5, -9, -2, 1, -1, 1, -2, 3, 8, -2, 2, 5, 1, 0, 0, -1, -2, -2, 0, 0 }, { -47, -3, -12, -10, -37, 3, 1, 43, 62, 8, -1, 18, 26, 4, 0, 2, 3, -35, -52, -9, 5, 1, 1, -14, -18, -2, 1, -5, 14, 26, 2, -3, 2, 0, 0, 9, 10, 0, -2, -8, 0, 0, 0, -5, -5, 1, -1, 2 }, { -29, 
-52, -33, 18, 12, 26, 6, 2, -19, 5, -3, 25, 38, 3, 2, 2, -2, 16, 35, 8, -2, 2, 3, -34, -49, -10, 3, 3, -15, -31, -8, 4, 0, 0, -4, 18, 27, 1, 8, 19, 5, -2, 2, -6, -9, -8, -2, 0 }, { -25, -25, -13, -16, -43, -17, 11, 32, 20, 16, 9, -7, -5, 14, -1, 4, -2, 10, 29, 5, -1, 0, -3, 32, 34, 2, 1, 0, -26, -47, -13, 5, 0, 2, 0, -33, -34, -2, 11, 28, 5, -4, -1, 19, 19, -9, 0, -7 }, { 4, -78, 85, -4, 5, 18, 5, 25, -6, -17, -6, -7, -14, -12, 4, 5, -6, -24, -2, 10, -5, -3, 2, 2, 6, 3, -4, 2, 17, 3, -5, 4, -2, 1, -1, 1, 1, 1, -8, 0, 2, -2, 2, -1, -2, -2, -1, 0 }, { -18, -40, -19, 15, -18, 1, 1, -32, -29, -16, 10, 15, 33, 30, 1, 0, 8, 12, -4, 12, 2, 2, -1, 17, 7, -9, 1, -2, 16, 43, 12, -3, 1, 1, -2, -33, -33, -5, -19, -44, -15, 4, -2, 19, 19, 23, 7, -6 }, { 7, 4, 14, 80, -16, -36, 7, 8, 6, -3, -10, -56, 15, 30, -4, 2, -4, -8, -4, -6, -2, -4, 9, 26, -26, -29, 5, -1, -3, -2, 4, 1, -2, 3, -3, -9, 24, 17, 7, 6, -2, -1, 1, 4, -11, -4, 0, -2 }, { -16, -26, -8, -21, -31, -38, 21, -8, 11, 15, -9, -26, -30, -9, 9, 1, 10, 40, 24, 12, 4, -1, 12, 1, 9, 13, 0, -6, -11, 14, 9, -2, 2, -1, 4, 37, 31, 2, -13, -38, -19, 2, -2, -32, -28, 19, 7, 13 }, { 7, -25, 33, 4, 25, -12, 9, -42, 43, 26, -4, 19, 7, 12, 3, -1, 1, 37, -27, -46, 6, 0, -8, -12, -1, -9, -3, -9, -43, 2, 28, -4, -1, -2, 7, 0, -15, -6, 30, 8, -11, 4, -4, 1, 14, -4, 3, 0 }, { -12, -25, -19, 1, -26, -6, -26, -44, -27, -6, 6, 0, -1, 10, 9, 5, 7, -18, -26, -26, 1, 0, 15, 22, 30, 21, 4, 9, 19, 6, 12, 8, -3, 2, 1, 15, 8, -7, 14, 44, 23, 0, -5, -30, -27, -37, -16, 11 }, { 11, 8, 1, 37, -53, 88, 4, -12, 8, 26, -13, -14, -10, -39, -4, -2, 2, 17, 1, -12, 2, -7, 12, 8, 3, 16, 2, 3, -9, 2, 9, 0, -1, 4, -8, -9, -1, -8, -1, -7, -8, 1, 2, 4, 0, 4, 3, -1 }, { -7, -3, -1, -59, 19, 28, -22, -20, 24, -33, 2, -37, 21, -6, -4, -4, 11, 18, -7, 11, 11, -4, 18, 45, -22, -18, 3, -6, -12, 2, -1, -4, 3, 4, -18, -26, 27, 23, 14, 3, 2, 1, 8, 13, -18, -2, 1, -5 }, { 1, -6, -9, -39, 4, 6, 28, 15, -44, 64, 4, -14, -10, 7, -13, 6, -26, -18, -2, -39, 5, 0, -3, 9, -14, 
-25, 3, 18, 16, 12, 33, 3, -3, 0, 4, -11, 16, 27, -3, 0, -16, 0, -1, 16, 1, -8, 5, -11 }, { -8, -13, -3, 0, -18, -33, -23, -28, 11, 9, -12, -16, -21, -47, -10, 5, 27, 9, 4, 12, 4, 1, -18, -28, -26, 0, 9, 5, 25, 13, 2, 5, -1, 1, 10, -2, 5, 8, -11, 13, 19, 0, 12, 46, 35, -24, -18, -33 }, { 6, -1, 16, 2, 16, 7, -43, -12, 19, 78, 10, 5, 10, 26, 3, -1, 28, 5, 11, 13, -14, -8, -3, 15, -1, -12, -4, -5, 23, -8, -35, 0, 0, -2, -5, -9, -1, -1, -36, -11, 22, -1, -4, -15, -16, 4, -12, 14 }, { -4, 1, 1, 3, 20, -21, -56, 53, -24, 10, -34, -8, 34, -27, 0, 4, 24, 18, 9, -27, 2, -3, 21, 6, 1, 29, -2, -7, -8, 9, 30, 9, -4, 2, -15, -7, -11, -24, 1, -4, -22, -9, 3, -1, 1, -3, 8, 1 }, { 3, 4, -1, 4, 26, -20, 61, -25, 31, 13, -17, 8, 39, -30, 8, 5, -16, -18, 21, 10, 11, -7, 25, 19, 2, 32, 2, 18, 28, -1, 5, -1, -2, 2, -24, -20, -3, -22, -18, 5, -3, -6, 11, 2, -9, -17, -7, 3 }, { 3, 12, 0, -14, 3, 28, 30, 18, 6, -11, -38, -24, -3, 43, 4, -1, 33, 24, 15, 12, 15, 4, 6, -16, -1, -23, -16, -4, 18, 23, 18, 14, -2, -3, 8, 11, -25, -13, -7, 12, 6, -8, -13, -17, 19, -44, -32, 11 }, { 7, 5, 10, 13, 2, -2, 22, 29, -9, 1, 89, 0, 15, -21, -3, 1, 24, 33, -6, 10, 13, -1, -31, 1, -1, 13, 4, -8, -5, 20, 14, -3, 0, -1, -4, -10, 4, -1, 2, 3, 1, 0, 2, -3, -25, -32, -19, 2 }, { 0, 1, 4, 0, -2, 11, -37, -10, 42, -22, 27, -15, 17, 5, -20, 16, -30, -5, 53, -22, 5, 9, -26, -8, 11, -7, 11, 32, 28, -7, 32, 2, -4, -5, 19, 16, -7, 4, -18, -1, -27, -6, -13, -11, 1, -6, 13, 7 }, { -2, -8, -3, -16, 7, 4, 12, -5, -15, 12, 5, -71, 45, -6, -25, -2, -22, 3, -29, 1, -5, 0, -20, -11, -3, 29, 8, 1, -8, -6, -26, -6, 6, 4, 24, 24, -21, -27, -1, -2, 23, 9, -11, -11, 23, 20, -2, 5 }, { -5, -5, 0, 6, 0, -11, -3, 6, 0, -2, -20, 24, -16, -13, -94, 5, -22, 28, -15, 22, -7, -8, 4, 5, -3, -13, 12, 22, 2, 12, -2, 3, 3, -2, -13, -18, -11, 6, 9, 5, 9, 3, -4, -19, -9, -5, -8, 37 }, { 1, -1, 0, 0, -8, -6, -26, -8, -6, 25, 10, -16, -5, -9, 33, 5, -23, -12, -11, 58, 43, -12, 4, -19, -10, -21, -24, 24, -4, 6, 8, -16, 6, -6, -20, -3, -22, -12, 
36, 10, -24, -1, 5, -13, 9, -10, 23, 24 }, { 3, 3, 7, 9, 12, 11, -13, 11, 2, 17, -17, 17, 6, 35, -7, -5, -23, 18, -9, 41, 31, 9, 9, 15, 24, 37, 39, 12, -5, 11, -4, -18, 5, 1, 27, 21, 26, 9, 8, -1, -14, -1, -7, 18, 0, -23, -2, -48 }, { 1, 0, -2, 9, 0, -4, -1, 7, -6, 2, -20, 12, 36, -55, 24, 2, -18, 8, 6, -7, 15, 16, -6, 17, 40, -53, -1, -8, -9, 14, -19, -16, 0, -8, 19, 15, -6, 41, 0, -11, 19, 3, -19, -11, 4, -9, -24, 5 }, { 0, -3, 1, 1, 4, 3, 9, -16, -5, 9, 2, 3, 1, -8, -41, 11, 52, -62, 25, 7, 37, 0, -2, 11, -2, -1, 29, -46, -21, 0, 2, -17, -4, -4, 11, 10, -5, 1, 16, -4, -17, -4, -16, -9, 11, -2, 12, 12 }, { 0, -3, 3, 2, -2, -4, -8, -2, 0, -2, 40, -8, -5, 4, -20, -34, -18, 2, -3, -18, 21, 18, 78, -23, 7, -8, 1, -14, -5, -12, -11, -11, -14, -8, -30, 18, -6, 7, -28, -10, -2, -13, 12, 2, 32, -11, -19, -3 }, { 0, -4, 1, 2, -4, -3, -2, 11, -5, -14, -14, 4, -16, 13, 2, 2, 2, 13, 9, -47, 78, -7, -20, -3, -15, 11, -1, 16, 12, -5, -26, -47, 12, 4, -10, -15, -4, -9, -13, -3, 25, 27, 9, 2, -16, 12, -2, 7 }, { -5, -5, -5, -5, -5, -2, -15, -6, -1, 0, -17, -4, -8, 3, 10, -64, -42, -18, 14, 0, -9, -20, -41, -2, -8, 3, 22, -35, -20, 10, 6, -8, -6, 16, -4, -15, -1, -16, -8, -16, -11, -16, 18, 3, -15, -31, -34, 20 }, { -1, -2, -2, 3, -3, -6, -2, -4, -5, -6, -1, 16, -9, -17, -6, -26, -5, 7, -29, 7, 23, 17, -7, 43, -40, 6, -47, -9, 9, -41, 11, 6, -1, -4, 37, 0, -3, -9, -34, -2, -22, -16, -40, -11, 5, -9, 2, -6 }, { -1, -3, 3, 5, -1, -2, -2, 1, 8, 10, 5, 0, 9, 2, -28, -57, 2, -18, 37, -1, 6, 16, -2, 1, 10, 11, -66, 10, -1, 34, 1, 15, 10, -9, 0, 3, 9, -3, 40, 4, 23, 14, -3, 1, -9, 21, -4, -29 } }, { { 109, 7, -40, -24, 31, -26, -4, -1, -3, -1, -2, -11, 17, -5, -7, -1, 2, -6, 8, -4, 0, -1, -1, -1, 3, 2, -1, -2, -1, 1, 0, -2, 4, -1, 0, 0, 0, 0, 1, -1, 1, 1, 0, 0, 0, 1, 0, 0 }, { 38, -61, 73, -14, -13, 5, 4, -36, 46, -18, -5, 14, -22, 9, 2, 1, -4, -8, 15, -3, -4, -2, -1, 12, -16, 5, 1, 0, 1, -2, -3, 3, -1, 3, 0, -1, 0, 0, 4, -5, 0, 1, 1, 0, 2, -2, 1, 0 }, { -35, 44, 31, -21, 64, -35, 3, 
-34, 25, -4, -1, -20, 28, -18, 2, 1, 4, -30, 36, -14, -1, 0, -7, 8, -4, 4, -4, 1, 0, 0, -2, -10, 15, -2, -3, 0, -2, -1, 8, -9, 5, 0, -1, -2, 1, 1, 2, -3 }, { -31, -70, -63, -36, 27, 32, 4, -10, 26, 18, 19, -25, 5, 14, 4, 3, -1, -10, 18, -6, 7, 3, 5, -9, -7, 10, 2, 2, 1, 0, -4, -1, 9, -9, 4, 2, 0, -2, 2, -7, 3, 2, 0, -2, 4, 1, -3, -3 }, { -9, -9, -44, 41, -11, -61, 24, -50, -3, 9, -1, 27, -28, 11, 7, 2, 22, -24, -2, 15, 1, 0, 1, 18, -31, 17, -1, 1, 1, 3, 1, 1, -19, 16, -1, 0, 0, 6, 0, -11, 6, -1, 1, -3, 6, -12, 5, 0 }, { -3, 56, -24, -58, -32, 15, 4, 7, 29, -28, 9, 9, -38, 23, -4, -3, -1, 12, 13, -27, 6, 2, -5, 24, -31, 14, 1, -3, -1, 0, -6, 22, -14, -2, -1, 3, 0, -4, 9, -7, -3, 3, 1, 0, 12, -13, 3, 5 }, { 12, 19, -12, 61, 56, 52, -39, -1, 36, 15, -4, 18, -36, -4, 2, -4, -17, 3, 1, -1, -8, -5, -10, 14, -15, -8, 3, 0, -1, -2, -6, 7, -6, 10, -6, -3, -4, 0, 8, -3, -6, 2, 0, 0, 2, -4, 7, 1 }, { -2, -10, -27, 18, -47, -6, -29, 27, 30, -22, -2, 28, 24, -41, 4, 4, 4, -26, 41, -11, 2, 2, -7, 23, 6, -25, 9, 2, 1, 3, 10, -33, 27, -4, -2, 2, 0, -9, 23, -16, 1, -3, 3, 3, -11, 3, 8, -11 }, { 18, 32, 9, 17, -45, 32, -7, -34, -12, 46, 16, -42, -17, 10, 7, -5, 7, -28, 23, 0, -8, -3, 23, -25, -10, 11, 5, -4, -2, 0, 1, -18, 30, -16, 3, -2, 1, 5, 2, -24, 15, 0, 2, -5, 3, 9, -11, -13 }, { -6, -4, 7, -37, -6, -39, -60, -7, 13, 52, 33, -2, -28, -43, 16, 3, -3, 22, -24, 7, 3, -4, 15, 1, -1, -21, 13, 6, 1, 0, 11, 1, -13, 5, 5, 0, 0, 11, -3, 2, -13, 5, 3, 8, -5, -6, 2, 1 }, { 4, 9, 5, 22, -8, -30, 44, 41, 66, 46, 31, 12, 12, 17, -34, 3, -14, 5, 0, -7, -4, -8, 10, -20, 3, -2, -12, -1, 1, 3, -16, 6, 4, -16, -1, -5, -3, 3, -5, 9, -10, -2, -1, -4, 0, 10, -10, 2 }, { -5, 13, -25, 11, -18, -12, -17, -58, 33, -31, -17, -12, -4, 21, 2, -1, -18, 30, 9, 1, -2, 5, 0, -22, 44, -20, 9, 2, 1, 0, -8, 5, 14, 0, -3, 2, 1, 5, -24, 36, -20, 3, 3, 6, -19, 28, -15, 4 }, { 3, 22, -5, -36, -3, 42, 21, -20, 19, 9, 1, 46, 12, -18, -14, 5, 5, -45, -26, 47, 8, 1, 0, -17, 12, -3, 1, -2, -1, 4, 4, -25, -14, 31, -9, 0, 
-3, 13, -24, 8, 2, 0, 0, 2, -19, 6, 7, -2 }, { -7, -1, 3, -18, -8, -10, -29, -1, -21, 35, -34, 19, 30, 40, -31, 3, -49, 18, 35, 42, -12, 2, 0, 20, -18, 5, 7, 4, 3, 2, -21, 4, 8, 17, -11, 2, -3, 6, 8, -15, -9, 10, 1, -7, 0, 1, 10, -9 }, { 1, -4, 0, 8, 1, 7, 10, -35, -38, -10, 49, 14, -8, -21, -58, 22, -49, -14, -5, -29, 29, -1, -1, 14, 5, -7, 9, -2, 2, 10, -10, 6, 6, -14, 18, -1, 4, -14, 15, 6, -13, 12, 0, 4, -8, -1, 4, 1 }, { -8, -7, -5, -8, 9, -20, 2, 32, 4, -22, -44, -31, -63, -22, -42, 24, -17, -25, -16, 12, -21, 11, 15, -11, -9, 8, 4, -1, 3, 8, -3, -11, 23, 4, -5, 7, 1, 16, -4, -7, 7, 11, -2, 4, -8, 18, -11, -17 }, { 0, -2, -3, 18, -29, 3, -30, -6, 26, -8, -15, -46, 42, -28, -27, 12, -8, -21, -15, -2, 8, 3, -10, -16, -7, 28, 2, -1, 2, 6, -14, 35, -36, 18, 0, 2, 4, 1, -10, 7, 11, 3, -2, -7, 28, -30, 14, 19 }, { 3, 0, -5, -3, -12, 16, 56, -12, 2, 37, -30, -41, -7, -28, -1, 6, 0, 18, 11, -11, -22, -7, -8, 49, 20, -37, -1, -1, 1, 3, 2, -3, -14, 30, -4, 0, 0, -3, 16, 10, -14, 1, -1, 8, -10, -13, 18, 8 }, { -3, -6, -1, -15, -8, -1, -19, -8, -8, 39, -30, 30, -15, -11, -12, -8, 21, -9, -12, -42, -7, -11, -44, 5, 32, 46, -29, 1, 2, 5, -17, 6, 30, -3, -9, -3, 0, -13, 5, 22, 8, -6, -3, -18, 5, 21, 1, 11 }, { 0, 2, 3, -1, -3, -11, -10, 3, 8, 10, -9, -16, 12, 57, 11, 22, -14, -25, -52, -40, 4, -10, -39, -5, -13, -18, 34, 3, -2, 0, 15, -36, -4, 11, -5, -2, 3, -12, 9, -4, 3, 9, -4, 14, -18, 0, 20, -12 }, { 1, 5, 4, 9, 4, 1, -9, -5, 21, -4, 16, -18, -3, 12, -33, 20, 32, 50, -12, 28, 38, 13, -6, 27, 33, 34, -8, -18, 5, -2, 11, -31, 7, 13, 6, -2, 5, -13, 12, -29, 27, -4, -5, -3, -14, -9, 5, -18 }, { 4, 7, -1, -1, 3, 6, 24, -12, 6, 19, -40, 26, 7, -28, 20, 55, 5, 29, 1, 0, 22, 18, -10, -32, -10, 6, 48, -12, -2, -1, 2, 21, 3, -36, -6, 1, 4, -14, 12, -19, -6, 11, -2, 2, 11, -4, -14, -10 }, { 2, 0, 6, 0, -1, -8, 0, 12, -1, 19, -27, -13, -26, 7, 1, -31, -6, -19, 21, 15, 84, 35, -18, -12, -8, -30, -23, 8, -3, -2, -3, -6, -8, -9, 10, 6, 19, -24, -7, 17, -4, -4, 1, -4, 0, -4, 10, 8 }, { 
2, 0, 10, 4, 6, -3, -10, 29, -12, 7, 15, 0, -19, 24, 11, 48, 25, -30, 49, 2, 8, -4, 8, 12, 33, 19, 41, -17, 2, -4, -5, 8, -4, 22, 9, 1, -1, 4, -24, 35, -16, 14, -9, 0, -6, -12, 8, 21 }, { -5, -4, 1, -2, 7, 5, -17, -10, -4, 13, -32, 22, 4, 13, -53, -8, 29, 8, 19, -41, -16, 5, 37, -13, -1, -18, 0, -7, 2, 6, 41, -25, -41, -21, 14, 8, -1, -2, -20, 3, 9, -7, -3, 8, 1, -22, -8, 2 }, { -1, -5, 0, -1, -2, -5, 7, 2, -11, -5, 29, -5, -21, -11, -20, -2, -4, 20, 43, 13, -20, -13, -70, -45, -7, -5, 4, 6, -3, 4, 36, -2, -14, 4, -39, -16, -5, -3, -14, -4, 19, -2, -1, 5, -1, -2, 18, 3 }, { 0, -1, -3, -2, 2, -7, 11, 3, -3, -2, -8, 17, -13, -4, 27, 0, -45, 10, 11, -38, 16, -10, 15, -27, 39, -1, -10, -6, 7, -7, -20, -17, -4, 37, 10, -8, 3, 26, -25, -41, 28, -3, 3, -16, 23, -27, 15, -16 }, { -3, 0, -1, -7, 4, -6, -4, -6, 2, -7, 0, 6, -15, 22, -17, -11, 22, -19, -12, 22, -12, -4, 11, -30, 46, -31, 7, 0, 1, -5, -5, 22, 19, -10, -14, 4, -9, -5, 63, -6, -10, -4, 4, -4, 30, -42, 34, 0 }, { -2, 1, -4, -1, -1, -6, 10, 1, 9, 9, -5, 3, -1, -3, 3, -32, -45, -13, -6, 9, -3, 9, 6, 20, 18, 24, 31, -30, -11, -48, 63, 23, 13, -4, 9, 7, 0, -12, 2, 15, 33, -19, 0, 13, 10, 4, -2, 9 }, { 0, 6, -4, -7, -1, -3, -5, -2, 5, -2, 1, -2, -11, 6, 18, 36, -26, -11, -3, 33, -33, -13, -14, 17, 20, -13, -22, 9, 10, 4, -15, -28, -28, -51, 25, 9, 6, -42, -11, 0, 31, -12, -4, -17, 20, -9, -11, 22 }, { 3, 4, 2, 3, 3, 2, 8, 5, 7, 2, -3, 2, -2, 4, 8, -8, -9, -2, 3, 2, 3, 1, 11, 6, 24, 26, 18, 90, -23, 56, 31, 20, 1, 0, 13, 4, 3, -9, -1, -6, 9, 6, -4, -2, -13, -9, 3, -14 }, { 4, -1, 3, 4, -1, -1, -2, 0, -4, 6, 21, 2, -1, 1, 10, -1, -9, -10, -1, -16, -34, 96, -3, 1, 20, 16, -9, 1, -1, -10, -13, -2, -24, -3, -35, 30, 2, -11, -4, -15, -18, -3, -1, 5, -16, -4, 6, -14 } }, { { 113, -35, -40, -16, 3, -16, -3, 8, 6, -1, -4, 1, 3, 2, -4, -1, 2, 0, 0, 2, -1, -1, 0, 1, 0, 1, 0, -2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 25, 36, 75, -67, -52, -28, 0, -6, 5, 10, -9, 8, 21, 4, -5, 0, -1, 0, -5, -3, 0, -2, 4, 
3, -1, 4, 1, -2, 0, -1, 0, 0, 1, 0, -1, 0, 1, 1, -1, 1, 0, 1, 0, 0, 0, 0, 0, 0 }, { -15,-100, 43, -34, 9, 42, 11, -6, 32, 6, -1, -1, -2, -5, 8, 4, -3, -3, -3, 5, -1, 0, 1, -2, 0, 0, -2, 2, 1, -1, -1, 0, -1, 0, 2, -1, 0, -1, 0, -1, 0, 1, -1, 0, 0, 0, 0, 0 }, { -30, -3, -52, -18, -49, -18, 13, 56, 62, 40, -5, -6, 1, 4, 1, 3, 5, -15, -20, 0, 9, 0, 2, 6, 3, 0, 1, 1, 1, 1, -5, -4, -3, -7, -2, 3, 1, 1, -1, 0, 1, -1, 0, -1, 0, 0, 1, -1 }, { 1, -33, 29, 60, -48, -48, 27, 27, 3, -62, 15, 14, 2, 7, 2, 7, -3, -11, 9, 7, -10, 0, 1, -3, -2, -4, 2, 0, 2, -2, -3, 2, -2, 2, 4, -1, 0, 1, 0, 0, -1, -1, 1, 2, 0, 1, -2, 0 }, { -4, -8, -34, -2, -62, 52, -27, -31, -17, -12, 20, 47, 40, 36, 4, -6, 7, 4, 0, -7, -2, 3, 8, -13, -18, -8, 6, 1, -1, 3, 1, 2, 7, 6, 1, 0, 1, -6, -6, -1, -4, -2, 2, 0, 0, -1, 1, 0 }, { -27, -29, -21, -25, 36, -67, 27, -13, -38, 20, 4, 26, 62, 12, 9, 8, -3, 1, -5, -2, 3, 3, -1, -3, -13, -1, -1, 4, 2, -1, 1, 0, 2, 3, 2, 1, -1, -2, -2, -1, -3, -1, -1, 0, 0, 0, -1, 0 }, { 1, 14, -2, -37, 21, 29, 19, 70, -26, -30, 37, 49, -8, -34, -9, -3, -11, -11, -17, -15, -11, 7, -1, -18, -3, 11, -3, -3, -1, -3, 1, 3, 4, 1, 1, -1, -4, -5, 2, -2, -3, 4, -1, 2, 3, 2, 0, 0 }, { -9, -23, -8, -10, -40, 1, 6, 25, -69, 26, -28, 3, -37, -14, 1, 1, 7, 45, 36, 32, 16, -5, 2, 8, 5, 10, 2, 0, 1, 2, 8, -12, -16, -8, 5, 1, 3, 5, 5, 2, -1, 2, 1, -4, -3, -2, -5, 0 }, { 15, -6, 32, 50, 5, 18, -17, 37, -28, 44, -48, 20, 20, 13, 3, -2, -14, -18, -53, -17, 10, -12, 5, 12, -10, -7, -2, 1, -1, -4, 2, 5, 13, -9, -3, -1, 2, 1, -4, 2, -2, -2, -1, 2, 3, 2, 4, 0 }, { -13, -17, 7, -10, 12, -38,-102, 26, 8, -10, 19, 11, -17, 6, 26, 0, 4, 32, -5, -12, -8, 5, 0, 4, 1, 1, -1, 10, 4, 0, 2, -2, 7, 0, -1, -2, -2, 2, 0, -4, -2, -2, -2, -1, 1, 1, 4, 0 }, { 1, 13, 6, -11, 20, 20, -14, 47, 6, -27, -17, -35, 48, 35, 5, -7, -9, 5, 30, 49, 16, -2, -16, -24, -30, -25, -10, 1, -3, -4, 2, 1, -18, -15, 4, 4, -1, 0, 7, 10, 7, -4, -3, 2, -2, -1, -4, 2 }, { -9, -8, -13, -26, -8, 5, -8, 4, -41, -30, 8, -48, -1, 34, 7, -2, -20, -45, 
-27, 13, -3, 7, 15, 43, 48, 18, 6, 1, -1, -5, 6, 18, 3, -9, 1, 0, 4, 3, -12, -16, -1, 4, 0, 3, 0, 6, 6, 0 }, { 8, 6, 12, -1, 12, -1, 27, 12, -2, 34, 29, 8, -47, 83, 23, -7, -21, -3, 29, -21, -27, 8, 8, -4, -19, -9, -2, 2, -2, -6, 3, -7, -5, 3, -3, -7, -2, 0, -9, -4, -3, 1, -3, -2, -1, 2, -2, -1 }, { -7, 5, -9, -17, 14, -10, -3, -18, 19, -34, -52, 61, -37, 22, 11, 0, -13, -32, 12, 11, 52, -12, -13, 7, -4, 15, 0, 1, -1, -1, 0, 6, 3, -18, -3, 8, -2, 9, 2, -1, -7, 5, 0, 2, 3, 1, 2, 1 }, { 3, 10, -4, -7, 7, 12, 31, 6, 10, -34, -21, 2, 13, -3, 53, -16, 20, 50, -9, -36, 9, 2, 33, 49, 5, -31, -5, 3, -5, -1, -7, -17, -13, -14, -9, 2, 2, -4, -16, -8, 3, -5, -5, 1, 3, -1, 1, -2 }, { -9, -8, -9, -22, -6, -7, 9, 3, -18, -25, -22, -33, -31, 16, -19, 16, 30, 8, -26, -29, 11, -9, -1, -16, -42, -28, -11, -5, 1, 0, 6, 26, 52, 35, 9, 9, 0, 2, 0, 17, 11, 2, -3, 4, -5, -16, -2, 0 }, { -6, -5, 4, 2, 16, 3, -5, 10, 10, -12, -3, 8, 9, 35, -93, 17, 22, 27, 4, -14, 1, 3, 16, 35, -4, 29, 0, -8, 5, -2, -12, -22, -16, -14, -5, -1, 1, -6, -15, -2, -2, 1, -1, -2, 2, 4, 2, -1 }, { 7, 7, 8, 5, 4, 3, 18, -9, 3, 3, 39, 12, -13, 26, 7, 4, 24, 39, -66, 50, 18, -15, -50, 8, 8, 0, -4, -1, -1, -2, -8, 2, -7, -4, 13, 7, -6, 13, 7, -15, -6, -1, -2, 4, -6, -6, -6, 4 }, { -4, -2, -10, -1, -9, -3, 7, -10, 10, -14, -4, -17, 0, -3, -11, -18, -94, 50, -25, -14, 6, -7, -3, -5, -25, 27, 24, 2, -3, 1, 8, 6, -3, 3, 1, 3, 4, 6, 8, 11, -5, -5, 5, 5, 6, 8, -5, -4 }, { -2, -1, -3, -4, -2, -8, -6, -15, 0, 5, 16, 17, -20, -21, -20, 8, -19, -10, -16, 37, -16, 23, 24, 40, -41, -57, -13, -3, 1, 5, 21, 28, -18, -32, -18, -11, 4, -1, 3, 17, 18, 5, 2, -3, -13, 7, 11, -3 }, { -5, 1, -9, -10, -2, -1, -1, -10, -6, -17, -45, 4, -12, 13, 10, 13, 10, -5, -37, 7, -55, 39, -8, -22, 2, 8, -2, 5, 0, -7, -22, -35, -41, 11, 2, -12, 1, 3, 33, 32, 11, 4, -2, 5, 14, 14, 2, -6 }, { -1, -10, 1, -4, -8, -5, -7, -8, -18, -1, 34, -18, -6, -1, -6, -7, 5, -22, -7, -45, 38, -41, -9, -17, -3, -22, -7, -3, -2, 2, -3, -35, -38, -37, -3, 11, 5, 25, 20, 
26, 20, 8, 2, 2, 22, 22, 12, -3 }, { -3, -7, 5, 1, 5, -9, 4, -1, -2, -2, 5, -8, -7, 0, -5, -94, 39, -14, -13, 18, 3, 15, 17, -2, -32, 26, 32, 9, -6, 2, 18, -17, 14, -11, -13, -8, 2, -1, 7, 11, -12, -5, 1, 3, 2, -9, 0, -7 }, { 6, 5, 2, 3, 0, 5, 4, -3, -4, 0, 15, -17, -2, -5, 32, 56, 12, 0, -7, 13, 10, 1, 34, -18, -43, 52, 10, 0, 0, -11, -29, -2, 16, -38, -40, -5, 11, 7, -3, -3, -3, -10, 0, 5, -7, -1, 15, -4 }, { 1, 2, -2, -2, -2, 6, -8, 0, -3, 1, 8, -5, 2, -16, 16, 8, -4, -26, 14, 3, -15, -17, -28, 63, -56, 16, 14, -1, -1, -2, -24, -34, 8, 16, 52, 14, -16, -11, -1, 1, 10, 4, 5, 8, 8, 4, -12, -3 }, { -2, -2, -4, -3, -5, 5, 5, 2, -1, -4, -17, -8, 8, -3, 1, 4, 21, 2, 12, -36, -37, 6, -57, 11, -19, 12, 9, 5, 0, 8, 32, 36, -8, -44, -1, -6, 7, 38, 18, -30, -24, -14, -4, -10, -12, 14, 8, 14 }, { -2, -4, -5, 2, -11, -3, 1, -7, -2, -2, -6, -4, 0, -6, -2, -30, -15, 3, -2, 5, -12, 7, -14, -5, -13, 21, -94, -9, -1, -1, -6, -25, 27, -11, -14, -8, 2, 4, -20, -31, 23, 19, -3, 4, 8, 11, 11, 23 }, { -3, -2, -2, -7, -2, -8, -3, -5, -6, -4, -14, -9, -16, -10, -8, 1, 0, -4, -13, 10, -9, -11, -8, -30, -15, -31, 30, 2, 0, -3, -9, -14, -18, -3, 1, 8, -4, -37, -62, -41, -35, -6, 1, 12, 20, 34, 32, 19 }, { -1, 3, -1, 2, 4, -3, -1, -2, 8, -3, -4, 1, -1, 4, 14, 3, 16, 0, -13, 7, -19, -44, 44, -11, -13, 36, -31, -28, -6, 16, 52, 21, -40, 12, 32, 30, -12, -14, 2, 1, -9, 3, -5, -12, -6, 16, 1, -6 }, { -6, 1, -7, -8, 6, 0, 0, 1, 2, -5, -11, 14, 2, 1, -7, -9, -6, 2, 4, 22, -61, -74, 10, 0, 15, -15, 17, 3, -1, -8, -13, -16, 26, -6, -26, 22, 18, 41, -11, 9, 11, 1, 0, 2, -12, -17, -1, 2 }, { 3, 4, 0, -2, -1, 5, 3, 1, 0, -5, 4, -1, -1, -2, -6, 19, -3, -12, -4, 6, 10, -14, 0, 12, -9, 3, -19, 86, 7, 14, 40, -31, -12, 41, -27, -9, 8, 4, 5, -11, -7, -40, -11, -7, 3, -2, 2, 2 } } }, { { { 98, 51, -42, -8, -12, -21, -30, 18, -11, -2, -2, 0, -3, 12, -14, -5, 2, 4, 0, -2, -1, 0, 0, -1, 5, 0, -2, -1, 1, 2, -1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -52, 92, 1, 8, -42, 46, -11, -3, -8, 3, 2, 0, 
-14, 18, -15, -9, 10, -5, 1, -2, 1, 0, -2, -1, 3, 2, -2, -2, 0, 4, -2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 35, -35, -42, 10, -27, 73, 50, -34, 13, 1, 1, 6, -15, 8, -3, -13, 25, -23, 5, 2, 0, 1, -3, 2, -9, 13, -8, -3, 2, 3, -4, -3, 1, 0, 0, -4, 4, 0, 1, -1, 1, -1, -1, 0, 0, 0, 0, 0 }, { 33, 19, 67, -48, 54, 50, 16, -14, -30, -1, -6, -6, 2, -15, -18, -17, -3, 9, -8, -3, -1, -3, 1, -2, 9, -3, -8, -3, -5, 8, 2, -1, -2, 0, 0, 2, 4, -3, 1, 2, 1, 0, 0, 0, 1, 0, 0, 0 }, { -11, -44, -12, 5, 20, 36, -49, 44, -7, 1, 2, 3, -27, 54, -46, -1, 3, 6, -4, 2, 0, 1, -2, -11, 29, -24, 9, 5, -16, 15, -7, 2, -2, -3, 3, 3, -1, -4, -3, 5, -1, -1, 1, -1, 2, 0, 1, 0 }, { 12, -18, 34, -8, -39, -3, -61, -69, 34, 1, -5, -1, 30, -2, -43, 2, 13, 4, 6, 2, 0, 1, 2, 10, -2, 2, 12, 6, 0, -2, 2, 1, 3, 2, 2, -3, -3, 3, -4, -1, 1, -1, 0, 0, -1, 0, 0, 0 }, { 7, 13, -17, 59, 38, 43, -21, 4, 36, -30, 3, -23, 14, -40, -6, 7, -43, 17, 0, -5, -4, -3, -8, 17, -6, -10, 2, -2, 8, -23, 21, -3, 1, 3, 0, 11, -14, 5, -2, -1, 2, 3, 1, 3, -3, 2, -1, 0 }, { -6, -14, -31, -43, -22, 33, -28, -15, -42, 28, 3, 20, -5, -26, 30, 44, -38, 33, -6, 4, 5, 4, 5, -10, 10, -2, 14, -1, 20, -19, 1, 4, -2, 1, -4, 4, -13, 11, 0, -3, -5, 3, 1, -1, -4, 0, -1, -1 }, { 29, 1, 56, 2, -29, 20, -2, 21, 34, -14, -6, -9, -14, 48, 58, 46, -18, -20, 1, 1, -3, -1, -5, -3, -7, -18, 4, -7, -1, -11, -2, -2, 0, -2, -2, 4, -5, 4, 0, 3, -3, 0, 1, 1, 0, 1, 0, 0 }, { -1, -2, -6, -21, -36, 0, 45, 19, 26, 2, 1, -18, 57, 16, -16, -15, -14, 49, -31, 4, 3, 0, -2, -13, 35, -22, -5, 0, -10, 0, 16, -13, -3, 0, -11, 23, -16, 0, -11, 11, -2, 2, 2, 4, 1, 3, 0, 0 }, { 11, -11, 28, 17, -46, -15, 17, 6, -5, -8, 2, 21, -56, -39, -23, -31, -53, -3, 27, 4, -3, 3, 2, 2, 32, -21, -9, 3, -1, 15, -3, 7, 1, 0, 3, 6, -5, 6, -4, 3, -2, 4, 4, -1, -2, 0, 1, -1 }, { 0, -19, 14, -7, -23, 18, -39, 43, 14, 9, -5, 5, 8, -12, 26, -53, -1, 23, -8, 0, 1, 0, -2, 2, -19, 50, -53, -13, 14, -4, 4, 1, -3, 1, -2, -8, 21, -10, 15, -10, 4, 0, -3, 6, -1, 0, -2, 1 }, { 0, 4, 5, 
-22, 1, 4, 21, 41, 28, 50, -22, 38, 13, -1, -44, 8, -17, -23, 17, -17, 5, -1, -5, 16, -35, -3, 19, 8, 13, -36, 10, 5, -6, -3, 17, -12, -11, 3, 1, -19, 14, -1, -1, 6, -11, -1, 0, 2 }, { -6, -13, 1, -37, -27, -2, 6, 29, -19, -53, 21, -55, -19, -13, -32, 16, 4, -16, -13, 17, 1, 4, -6, 26, -19, 27, 24, 13, 18, -15, 23, -13, 4, 3, -3, 0, 4, 1, 6, -8, 2, -2, -6, -1, -1, -2, -3, 0 }, { 11, -1, 29, 48, -2, -5, 20, 5, -16, -4, -7, 18, -10, 8, -27, 33, 29, 41, -9, -8, -5, 2, 3, -32, 24, 51, 6, -6, 41, -26, -11, 1, 0, -2, -5, -1, -13, 8, 9, -8, -5, 1, -2, 3, -7, 2, -4, -1 }, { -11, 0, -13, -15, 19, -18, 6, -34, -5, -7, 6, -10, -7, 53, -17, -2, -34, -12, 17, 5, 2, 0, -2, 11, 12, -7, -60, -38, 43, -25, 18, -5, 4, -4, 2, 7, -2, 10, 16, -11, 4, 0, 0, 9, -12, 5, -3, -1 }, { 3, -9, 10, 14, -11, 3, 0, 0, -49, -24, 6, 28, 15, 20, 20, -21, 24, 34, 46, -10, -4, 6, -23, 41, -12, -15, 9, 4, -25, -23, 30, -13, 2, -7, 4, 8, -24, -4, -10, -8, 20, -7, -2, 14, -12, 4, 2, 2 }, { 7, -8, 13, 35, -12, -4, -4, -12, -44, 16, 0, -6, 2, 13, -6, -29, -18, -21, -58, 16, -2, 0, 29, -30, -52, -23, 3, 6, -11, -31, 16, 6, -2, 6, -5, 16, -9, 1, 6, -1, 3, 8, 2, 2, -3, -2, -1, -1 }, { -4, 5, -7, -22, 20, -7, -8, -16, 31, 6, -11, -14, -41, 16, 28, -53, 6, 1, -2, -8, 2, -2, -2, -16, 17, 21, 35, 41, 21, -26, 9, 2, 1, -1, -5, 22, -42, 10, -19, 0, 1, 0, -3, 10, -11, 5, -4, -1 }, { -4, -8, -6, 2, -12, -5, -4, -5, -16, -5, -76, -23, -13, -12, -1, 10, 7, -9, -25, -71, 24, 18, -23, 4, 3, -8, -12, -12, -1, 9, 13, 1, -1, -8, 11, 18, 6, 1, 0, 14, 10, -11, -6, 11, 5, 9, -4, -1 }, { 0, 4, -6, 3, 3, -3, 13, -25, -1, -10, -11, 0, -5, 32, -3, 4, -53, 26, 4, -12, 6, 6, -8, -8, -4, 32, 10, 47, -29, -11, 0, 2, 3, -4, 11, -23, 42, -53, 12, -10, 7, 2, -4, -1, 17, -11, 2, 6 }, { 2, 5, 3, -2, 5, -8, 0, -5, 22, 41, 12, -2, -51, -16, -10, 33, 27, 18, -8, 18, -1, -4, 0, 12, -3, 0, -33, -2, -41, -20, 39, -22, 7, 7, -22, 30, 4, -33, -2, 11, 0, -3, -7, 14, 1, 7, -3, 2 }, { 8, -2, 10, 30, 1, 1, -1, -2, -17, 53, 2, -10, 10, 16, 10, -2, -17, 
-14, -16, 8, -4, -4, -13, 45, 24, 17, 22, 20, 29, 49, 34, -36, 2, 4, -16, -2, 18, 16, -2, -12, 2, -6, -9, 1, -4, 0, -4, -1 }, { -1, -1, 1, 0, 1, 4, -8, 8, -17, -15, 11, 30, 33, -2, -9, 15, -16, -50, 9, -5, 0, 2, -19, -16, 3, 33, -36, 41, 2, 3, -5, -12, 3, 6, -16, 33, -24, -18, -33, 37, -17, 1, -6, 8, 6, 17, -10, -4 }, { -4, 4, -5, -14, 6, -2, -3, -4, 17, -34, -3, 58, -3, 5, -1, -12, -9, -5, -8, -12, -8, -1, 4, -17, -10, 8, 46, -43, 4, 15, 37, -24, 2, 7, -32, 27, 29, -3, 28, 13, -13, 7, -8, 0, 3, 6, -8, 0 }, { -2, 0, -4, 1, -1, 4, 3, 4, -4, -9, -33, 3, 7, 9, 4, 8, 0, -8, 5, 12, -12, -23, 71, 39, 22, 28, 2, -5, -9, 6, 31, 52, -31, -9, 11, 18, -8, -15, -10, 9, -10, 10, 9, -7, 1, -1, 4, 3 }, { -1, 3, -3, -4, -2, 3, -6, 2, 7, -28, -5, 30, 8, -17, 10, 9, 27, -20, -23, 1, -6, -4, 11, -8, 36, -41, -22, 52, 22, -4, 19, 0, -4, 1, -2, -3, 14, -19, 32, -47, 19, -6, -1, 15, -22, -13, 3, 9 }, { -3, 4, -3, -8, 7, -3, 7, -5, 12, -24, -9, 26, -22, 14, -8, 14, 0, 40, -16, -2, 1, -4, 14, 7, -49, -7, -32, 41, 7, 34, 14, -2, -7, 2, -2, -15, 10, 57, -35, 8, -7, 7, 1, -6, -7, 0, 1, -4 }, { 4, 2, 6, 8, 4, 5, 2, 7, 2, 8, 8, -7, 9, 2, 1, -3, 10, 5, 33, 9, 70, 66, 29, 2, -6, -12, 2, 20, 15, -3, 6, 18, 8, -10, -21, 24, 19, 5, 20, 5, -29, -7, -7, -13, 3, -6, -11, -6 }, { 1, 5, 1, -1, 7, -1, -2, -4, 10, -16, 19, 29, -3, 0, 3, 3, -12, -15, -46, 10, 42, 44, 13, 12, 21, 28, 1, -28, -33, 6, 4, -5, 15, -7, 21, -29, -28, 6, -26, -16, 25, -20, -12, 11, -17, 0, 7, 4 }, { 4, 0, 5, 0, -2, -1, 2, -4, 6, 12, 63, 0, 0, 0, 2, 6, 2, 3, -6, -26, -10, 0, -20, -17, -6, 7, 4, 4, 9, 18, 26, 47, -13, -27, 53, 40, 21, 9, 9, 16, 24, -19, -2, 4, -5, 2, 6, 4 }, { 1, 1, 3, 0, 1, -2, 1, -1, 4, -3, 8, 25, -11, 2, 5, -10, 12, 12, -34, 2, 1, 8, -4, 39, -5, -26, 2, -2, 54, -9, -5, -4, 16, 1, 29, -11, -13, -50, -12, 36, -10, -5, 10, -30, 45, -7, -2, -5 } }, { { 89, -76, -38, -6, 17, 2, 13, 18, 12, -4, -1, 3, -4, -8, -1, 0, -3, -6, -2, 3, -1, 1, -1, 1, 3, 1, -1, 0, 1, 1, 1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 
{ 40, 44, -4, -6, -37, -93, 15, 22, 1, -2, -2, 0, 20, 28, 14, -5, -16, -13, 0, 0, 0, 0, 3, -4, -8, -2, -1, 0, 2, 7, 4, -2, 0, -1, 0, 1, 0, 1, -1, -2, 0, 1, 0, 0, 0, 0, 0, 0 }, { -55, -41, 15, 5, -4, -18, 79, 46, 11, 2, 1, 1, -4, -19, -10, -7, -31, -26, -3, 3, 0, 0, -1, 2, 14, 13, 2, 2, 3, 8, 5, -3, -1, 1, 1, -5, -5, -2, -1, -2, 0, 1, -1, 1, 2, 0, 0, 0 }, { 37, 50, 7, -11, -22, 17, 18, 15, -1, -2, -3, 0, -15, -54, -67, 7, 13, 0, -1, 0, -1, 0, -1, 9, 37, 33, 4, -2, -13, -16, -4, 0, 0, 2, 2, -9, -9, -1, 5, 7, 0, -1, -2, 1, 2, -1, 1, 0 }, { -22, 35, -90, 32, 50, -17, -4, 26, -3, 9, 9, -13, -26, -18, 11, -5, -1, 3, 0, -5, 3, -4, 1, 12, 10, 2, 1, -1, 4, 0, -4, 0, 1, 1, -4, -5, -4, -2, 0, 1, 3, 0, 0, 1, 2, -1, -1, 0 }, { 28, 28, 16, -3, 48, 15, 42, -51, -28, -2, -2, -3, 4, 0, 7, -51, -45, 0, 10, -4, -1, -1, 2, -6, 0, 11, 4, 0, 24, 30, 4, -3, 3, -1, 1, -3, -12, -8, -5, -7, 3, 2, -1, -1, 4, 1, -2, 1 }, { -8, -19, 10, -1, 41, -57, 37, -46, -15, -2, 1, -2, 5, -13, -19, 18, 48, 36, 16, -2, 0, -1, 4, -5, -7, -2, 4, -1, -20, -32, -21, -3, 4, -1, 3, 10, 9, 2, 8, 11, 4, -3, 1, -5, -5, -2, 0, 1 }, { 14, 10, 53, -23, 29, -13, -14, 31, 4, -3, -4, -2, -34, -54, 65, -6, 4, 5, -2, 0, -1, -3, -1, 19, 14, -24, -22, 5, 3, -9, -9, 2, 0, 4, -7, -3, 13, 7, -3, 4, 4, -1, 1, 3, -3, -2, -1, -1 }, { -9, -31, 17, 20, 0, -35, -52, -12, -12, -8, -1, 1, 3, -6, 14, -9, -17, 7, 4, 2, -2, 2, 2, 10, 38, 62, 27, 1, -3, -3, -1, 2, 1, 1, -5, -28, -43, -20, 12, 10, 3, 0, -4, 4, 13, -5, 0, 0 }, { 8, 9, 29, -3, 65, -6, -18, 22, -17, -3, -3, -6, 4, 25, -31, 66, -29, -34, -8, -4, -2, -3, 0, -5, -1, -8, 2, -14, -25, 10, 25, 3, -2, -1, 1, -3, -4, 1, 8, -7, -11, 3, 0, 2, 3, 4, 2, -1 }, { -18, 4, -21, -83, 24, -12, -10, -24, 58, 16, 4, 8, 31, -11, -2, -8, 13, -25, -26, -5, 6, -3, -1, -1, 11, 9, 8, 2, 6, 4, 13, 11, -1, 1, -2, -7, -10, -6, -2, -3, -5, -4, 1, 3, 3, 2, 1, -1 }, { 1, 9, -14, -9, 4, 27, 15, 25, -49, -2, 0, 19, 66, -15, 43, -8, 12, -15, -6, 0, 0, 4, -7, -24, -9, 22, -1, -7, -39, -25, 2, 5, -4, 1, 4, -1, 
-4, 3, 27, 21, -2, -2, -2, -2, -6, -5, 4, 0 }, { -11, -15, -7, -14, -3, -19, -31, 19, -41, 10, -2, 8, 37, -54, -28, 6, -11, 34, 11, 0, 1, 5, -2, -22, -16, -4, -13, 12, 37, 35, 8, 3, -1, 1, 12, 15, 6, 4, -26, -27, -9, 1, -2, -2, 3, 8, -1, 0 }, { 1, 4, 19, -18, 24, 4, -14, 55, 9, 5, -3, -7, -10, 21, -22, -46, 6, 26, -1, -1, -1, -2, 0, -11, -35, 3, 63, 0, -3, -12, -17, -3, -3, -4, 8, 29, -3, -29, 5, 5, 7, 0, -1, -15, -4, 0, 0, 5 }, { -7, 0, -21, -65, -14, 10, 12, 4, -20, 19, -1, 5, -27, 27, 14, 38, -47, 50, 23, 0, 5, 2, 1, 1, 9, 12, -13, -10, -3, -11, -27, -15, 2, -2, 3, -3, -9, -4, 10, 9, 8, 5, -4, -1, 3, -4, -3, 2 }, { 12, 15, 13, 20, 8, 17, 26, 16, 37, 8, -1, -23, 16, 4, 28, 39, 34, 27, 5, -3, -1, -3, -3, -21, -27, 31, -18, 9, 20, 10, -2, -5, 1, -3, 3, -12, -44, -32, -14, -20, -7, 1, -4, 7, 30, 3, 0, 2 }, { -2, 10, -1, 17, 16, -1, -13, -1, 55, -58, 4, 33, 10, -7, -5, -1, -43, 16, 15, 3, 1, -3, -13, -35, -6, 24, -32, 10, -4, -24, -7, -2, -2, 3, 18, 18, 18, 10, 0, 11, 5, 2, -7, -12, -16, -1, 1, 4 }, { -4, -2, 6, -10, 21, 2, -5, 27, -20, -8, 4, 2, 18, 47, -30, -28, 22, -1, -1, 2, -1, 0, -1, 2, 32, -12, -54, 53, 25, -18, -16, -2, -2, -4, -9, -25, -8, -1, -10, 18, 13, -2, 3, 10, 11, -14, -4, -1 }, { -6, 5, -11, -21, -3, 0, 13, 4, -21, -74, 5, 53, -27, 5, 8, 10, 31, -2, 12, 11, 1, -1, -19, -14, 9, -25, 31, -9, 15, 17, 16, 5, -1, 7, -1, -16, -17, -19, -6, -9, -9, -3, 1, 10, 17, 2, 0, -3 }, { 6, 4, 6, 22, 4, -1, 13, -2, -3, 21, -12, 49, 3, 7, 0, 12, -17, 42, -86, -6, 2, 2, -4, 4, 7, -6, 10, -8, 25, -27, 8, 26, 0, -2, -11, -8, 6, -5, -12, 4, -2, -7, 5, 3, -3, 2, 0, -1 }, { -4, -6, -4, -13, -6, -12, -2, -8, -15, 4, -10, 1, -55, -1, -7, -23, 4, 4, -28, 10, 2, -2, -1, -16, -43, 26, -41, 33, -38, 9, 43, 19, 1, 1, 9, 5, -6, -9, 23, 1, -21, -7, 0, 8, 14, -3, 11, -3 }, { -2, 0, -2, -16, 5, 4, 5, 5, -19, -42, 14, -4, 2, 13, 7, 10, 12, -5, -24, -1, -1, 0, 18, 58, -25, 61, -1, 10, -2, 9, -13, -8, 1, -5, -14, 5, 32, 8, -42, -20, 4, 3, 2, -13, -18, 15, -3, 2 }, { -2, -2, 7, 1, 11, -1, 
0, 20, 16, 3, 5, 7, 5, 23, -6, -34, 19, 43, 7, -3, -3, 2, -4, -5, 19, 11, -17, -59, -33, 32, 6, 4, 0, -2, -9, -23, 2, 54, -3, -24, -15, -5, 2, 18, 2, 13, 5, -9 }, { -3, -2, -4, -4, -7, 1, 1, -1, 10, -45, 13, -34, 31, -21, -17, -5, -27, 31, -16, 0, -2, -5, 6, 44, -36, -37, -7, -11, -13, 5, 5, 4, 4, -7, -20, -20, -19, -10, 28, 22, 12, 2, 12, 20, 22, -14, -5, -5 }, { 3, -6, 8, 3, -5, -5, -6, 0, -8, 12, 76, 23, -10, -1, -8, -6, 3, -21, -3, -52, 0, -7, -19, 4, -10, 2, -31, -34, 2, 7, -30, 23, 15, 3, 1, 20, -10, -31, 7, 0, 9, -6, -1, -10, 11, 6, -2, 8 }, { 3, 5, 3, 12, 8, -1, -3, 0, 15, 34, -14, 75, 2, -9, -9, 6, 2, -9, 20, 11, -1, 0, 3, 28, -37, -1, 5, 25, -23, 39, -30, -21, -5, -4, -13, -24, -7, 7, 6, 12, 19, 11, 3, 7, 3, -11, -8, 0 }, { 6, 3, 2, 2, -5, 6, 9, -1, 7, 5, 50, -4, 5, 1, 11, 9, -14, 24, 13, -41, -4, -4, -10, -4, 15, -11, 41, 73, -32, 2, 17, 16, 11, -1, -10, -7, 2, 26, -12, -10, -16, -5, 7, 6, -3, 3, 8, -8 }, { -2, 0, -5, 5, -2, 1, 0, -5, 3, -11, -18, 10, 14, 0, 4, -9, -5, 9, -24, -4, -4, 6, 27, 13, 41, -29, -14, 13, -53, 14, -22, -30, -3, 0, 21, 37, -15, -38, -8, -35, -8, 9, -13, -13, 21, 26, 12, 5 }, { -2, -2, -2, -5, -3, -3, -1, -4, -8, -19, -21, -22, -13, -3, 1, 7, 2, -14, -42, -16, 0, -4, -19, -24, -9, -4, 11, 8, -5, 26, -55, 11, 8, 7, 25, 10, -45, 59, -17, 20, 18, 0, -10, -2, 4, -7, -5, 3 }, { -2, -3, -2, -2, -3, 2, -2, 6, 1, -15, -29, 11, -10, -5, 0, 0, 5, -6, 7, -70, 14, 2, 81, -29, -4, 4, -2, -4, 10, -2, 9, 4, 4, -25, -29, 1, -4, 8, 10, 0, 15, 7, 8, 3, 2, -4, -7, -2 }, { -1, -2, -3, -3, -2, -2, -7, -7, -6, 8, -4, 5, -5, -8, -5, -15, -21, -23, 14, -4, 1, 2, 0, -4, -30, -7, 4, -14, -15, -65, -9, -11, -6, 2, 5, -20, -8, 10, -61, -17, -6, -2, 10, 33, 49, 19, 5, -13 }, { -2, -1, -4, 2, 0, 1, 3, 2, 3, -5, -66, -2, 10, 7, -2, 0, -5, -1, 33, -39, 11, 11, -39, 37, -1, 2, -6, 3, -4, 2, -16, 61, 2, 6, -3, 7, 4, -20, 0, -4, -23, -24, 5, 5, -10, 8, 11, -2 } }, { { 105, -48, -38, -10, 18, -31, 11, 7, 4, -4, -2, 2, 4, -3, 0, 0, 1, -1, 1, 1, -1, -1, 0, 1, 0, 0, 0, 
-2, 0, 1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 24, 93, -5, -3, -23, -74, -20, 20, -12, -1, -1, -3, 10, 0, 9, -4, 0, 4, 2, -3, 0, 0, -1, 2, 1, -2, 1, 1, -2, 0, 1, 0, 1, 1, -1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 }, { -43, -34, 17, 1, 10, -76, 76, 9, 4, 1, 1, 0, 14, -19, 23, -8, 2, -11, 1, 1, 1, 0, 0, 3, -1, -5, 0, 1, 2, -1, 2, -1, -3, 0, 1, 0, 0, 1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0 }, { -31, -32, -41, 25, 24, -22, -69, 22, -4, 2, 6, 2, -1, -1, 71, 9, -14, 16, -2, 0, 1, 2, 0, -1, 2, -13, 4, -3, 3, -1, -2, 0, 4, -1, 0, 0, 0, 0, 1, -3, -1, 1, -1, 0, -1, 0, 2, 0 }, { 21, -25, 87, -42, -24, -16, -38, -36, 16, -5, -9, 2, 14, 16, 38, 7, -6, 4, 4, 3, -2, -3, 1, 4, -1, -6, 3, -5, 1, -3, -3, 0, 1, 1, 1, 0, 0, 1, 0, -2, 0, 3, 0, 0, -1, 0, 1, 0 }, { -20, -28, 4, 2, -1, -43, -34, 12, 7, 1, 1, 0, 3, 13, -75, 72, -2, 6, 1, 1, 0, 1, 0, 2, -1, 18, -16, 17, -1, 4, -13, 1, 1, 0, 1, 0, 0, 1, -1, 3, 0, -4, 0, -1, -3, 0, 0, 0 }, { -1, 19, 5, 6, 83, -11, -13, -73, -30, -3, 1, -8, 24, -21, -8, 5, 31, 0, 7, -6, -1, 0, -3, 4, 4, -3, -10, 0, -1, 0, 4, -4, 2, 3, -2, 0, -1, 1, 2, -1, 0, -1, -3, 0, 0, -2, 1, 0 }, { 10, 7, 45, -23, 47, 14, -7, 63, -14, 0, -4, -5, -9, -70, -11, 2, -32, 18, -11, -3, 0, -1, -2, -2, 7, 1, 12, 2, 0, -3, -1, 5, 3, -3, -1, 0, -1, -2, 3, 1, -2, 2, 0, 1, -1, 2, 1, -1 }, { -9, -22, 5, 1, -5, -18, -35, 5, 4, -1, 1, 1, 4, -3, -32, -86, 16, 3, 0, 1, 0, 0, 0, 1, 1, 6, 0, 67, 10, -13, 22, -2, 1, -1, 0, 0, 0, 0, 0, 1, 0, -14, 6, 1, 4, -1, 0, 1 }, { 4, 9, 19, 9, 48, -4, 12, 12, -17, -6, -1, 0, -12, 78, 2, -20, -66, -7, -1, -3, -2, 0, -1, 0, -13, 20, -30, 7, -1, 12, 0, 8, -3, -1, -1, -1, -1, 0, -4, 1, 6, -3, -2, -3, 2, 3, -1, 2 }, { -20, 7, -37, -88, 16, 9, -5, 11, 27, 34, 2, -14, 54, 13, -2, -5, -7, -25, -11, -1, 10, 1, -3, -3, 3, -4, -8, -3, 6, 7, 2, -1, 0, -4, -1, 3, -2, -1, 1, 2, 3, 4, -2, -2, 1, 1, 0, 0 }, { 9, 7, 9, -9, 5, 19, 16, 26, -14, 5, -3, 4, 0, 11, 38, 37, 42, -7, -6, 0, 1, -1, 1, -1, 0, -9, -43, 70, -45, -14, 2, -9, -2, -1, 0, 0, 0, -1, 0, -3, 
5, -20, 13, 1, 0, -3, -1, 1 }, { -1, 0, 18, -1, 22, -5, -12, 38, 26, -2, 0, -11, -32, 18, -2, -15, 66, -11, 1, 0, 0, -1, -2, -2, 0, 12, -47, -55, 28, -18, 11, -12, 0, 0, 0, 0, -1, 0, -1, -2, 8, 9, 0, 3, 1, -4, 0, 3 }, { 0, 17, -9, -5, 19, -9, -2, -26, 77, 6, 0, -19, -62, -22, 15, 6, -16, -27, 16, -3, 1, 0, -5, 3, 3, 34, 11, 28, -14, 7, -12, -2, 3, 6, -2, 1, -2, 2, 2, -1, 0, -9, 2, -2, -2, -1, 1, -1 }, { -10, -7, -19, -54, -13, -10, 8, -21, -45, 24, -2, 9, -55, -6, 7, -8, 7, 64, 6, 3, 7, 0, 2, 4, -12, 32, -21, -2, 6, 14, -12, 2, -3, 1, 1, 3, 1, 2, -2, -2, 4, 2, 1, -4, 2, -1, -2, 1 }, { 5, -3, 0, 21, -26, 3, -8, -13, 7, 1, -2, -4, 20, -46, 4, -9, -28, -10, -4, 4, 0, -1, 0, 0, 9, 3, -86, -9, 6, 54, 7, 3, 1, 0, 1, 0, -1, 1, 3, 0, 16, -13, 16, -8, 2, 1, 1, 4 }, { -1, -10, -1, -12, -15, -1, -12, 1, -59, 0, -2, 7, -14, -11, 12, 13, -3, -82, 6, 4, 0, 1, 3, 1, 12, 51, 16, -1, 22, -2, 30, -14, 5, 0, 2, 0, 2, 0, 4, -1, 1, -12, -6, 2, -4, -1, 2, -1 }, { -12, 4, -16, -35, 3, 4, 0, 6, 6,-102, 16, 33, 4, -7, 0, 1, -1, -1, 46, -6, -6, 11, 2, 0, -20, -3, -8, 1, -2, 7, 2, -2, 3, 3, -3, 1, -1, -2, -2, -3, 1, 3, 3, 4, 0, 1, 1, 1 }, { 7, 12, -2, 4, 2, 10, 16, -4, 9, -2, -5, -2, 5, 0, 14, 26, -7, 12, 5, 2, 2, -2, 0, 1, -4, -12, -7, 39, 111, -18, -12, 1, -3, 0, -1, 1, 0, -1, 0, 0, 1, -9, 4, -1, 0, 1, -1, 0 }, { 3, 1, 5, 17, 0, 10, 3, 13, 9, -12, -5, -5, 52, 1, 16, -9, 21, 23, -6, -1, -2, -1, 0, 1, -4, 85, 17, -3, 0, 13, -51, -6, -4, -2, -1, -1, 0, -1, -1, -7, 9, -27, -12, -7, 2, -4, -1, 1 }, { 2, -2, 11, 0, 6, 2, -1, 19, -7, 3, -6, -22, -11, 14, 0, -1, 30, -5, 25, 0, 0, -1, -1, -1, 3, -30, 24, 18, 12, 93, -4, -11, 0, 2, 1, 1, 0, -1, -2, 2, -3, 4, -48, -17, -1, -2, -1, -2 }, { -3, -7, -6, -2, -12, 3, 5, 2, -16, -20, -10, -97, 7, 6, 0, 1, -14, 18, 48, 17, 2, -1, -4, -8, 36, 2, -4, -2, -8, -23, 8, -17, -4, 3, 7, 1, 0, -4, 2, 3, 6, -1, 11, 7, -3, -1, -2, 0 }, { -4, 0, -7, -12, -2, -4, 3, -10, 1, -53, 4, -27, -15, 7, 4, 5, 9, 7, -89, 8, -3, 3, -3, 8, 41, 5, 1, 6, 5, 13, 16, 34, -13, 0, 1, 0, 
-1, 4, 0, 3, -15, -3, -13, 3, -3, 0, -3, 0 }, { -4, 5, -7, 2, 7, 2, 4, 2, 20, 0, -78, 25, 8, 5, 1, 11, -5, 27, 9, 14, 8, -7, -6, 23, 2, 9, 7, -11, -3, 6, 68, -14, -10, 0, -1, 5, -3, 2, 0, 2, -8, -39, -10, 9, -9, 0, -2, 0 }, { 6, 2, 9, 1, 1, 1, 8, -1, 12, 12, 76, -7, 7, 7, 4, 14, 0, 29, -1, -26, -5, 4, 5, -29, -6, 15, 15, -6, 6, 17, 62, -14, 21, -2, -4, -6, 1, -3, -4, -3, -2, -39, 15, 5, -9, 1, 0, -1 }, { 3, 2, -2, 3, -8, 2, 3, -6, 8, 6, 20, 17, 5, -8, 2, 0, -15, 11, 10, -8, 0, 3, -1, -13, 29, 8, -31, 7, -6, -33, 7, -14, 2, -1, -4, -2, -1, -1, -1, 4, -6, 12,-105, 13, -1, 0, -1, 2 }, { 1, -1, -2, 4, -7, 1, -4, -3, -2, -4, -1, -41, 4, -14, 2, 7, 1, -5, -15, 23, 0, 1, -5, 1, -96, 4, 0, 4, -4, -12, 26, 30, -2, 1, 4, 2, -1, 0, 0, -4, 34, 6, -35, -7, 2, 0, 2, 1 }, { 0, -5, 0, -1, -6, 3, 0, 5, -5, 8, -8, -11, 2, 1, 1, 1, 8, -6, 32, -63, 4, -1, 0, 26, 10, 0, -6, -7, -1, -11, 4, 86, 13, -4, -3, 2, 4, 0, 0, -16, -24, -27, -9, -23, 13, -1, 2, -3 }, { -2, -5, -4, -3, -8, 0, 1, -3, -5, -11, -16, -21, -6, -1, -5, -2, -10, -1, -30, -74, -4, 0, 5, 49, -15, -14, 0, -1, 0, -3, -12, -59, 33, -4, -4, 0, 5, 5, -2, -15, 24, -7, -7, 17, 2, 4, 1, -1 }, { -2, -1, -1, -11, 2, -5, -4, -3, -8, -3, 7, 2, -17, 0, -9, -9, -4, -13, -5, 24, 3, 2, -3, -17, -1, -38, 2, -22, -3, -12, -39, 0, -4, 0, 1, 2, -2, -1, 2, 9, 19,-101, -17, 7, 1, 1, 1, 2 }, { -3, -2, -4, -2, -2, -5, 2, -6, -1, -9, -51, 0, -2, 0, 1, 0, 2, 0, -12, -38, 1, -4, 4, -96, -3, 3, 1, 2, 1, 0, -3, 8, 37, 13, -1, 5, 2, -3, -7, 26, 12, 7, 2, -4, -6, -5, 4, -1 }, { 1, -1, 0, 1, -3, 1, 0, 0, -4, -1, -2, -17, 3, -4, 1, -5, -1, -6, -4, 9, 17, -6, -13, -4, -37, 3, -15, -2, 1, 6, -15, -12, 27, 3, 4, 2, -1, 2, -2, 8,-103, -8, 0, 41, 1, -3, 1, 12 } } }, { { { 83, -72, -48, -1, 23, 9, 2, 24, 16, -4, 1, 0, -16, -12, 2, 0, 1, -5, -3, 2, -1, 0, -1, 7, 6, -1, 0, 0, 1, -1, 0, 0, -1, 1, -2, -2, 0, -1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0 }, { 43, 24, 7, -13, -70, -55, 5, 40, 18, -2, -3, 5, 32, 27, -2, 3, -10, -36, -16, 3, 0, 1, 2, -4, -2, 2, -1, 1, -2, 4, 17, 
4, -4, -1, -2, -4, 0, 2, 0, -1, -5, 0, 1, 3, 0, 0, 1, -2 }, { 49, 21, -25, 6, -1, -5, -23, -62, -24, 0, 0, 1, 37, 37, -5, -4, 17, 38, 15, -3, 0, 1, 0, -34, -37, 0, 1, -1, 2, -1, -9, 0, 3, -4, 14, 18, 0, 0, 0, -2, 0, -1, -3, -6, 0, 1, 0, 1 }, { -33, -33, -14, 26, 28, 14, -7, 8, -12, -2, 2, -1, 26, 40, 5, 8, -18, -46, -9, 2, -1, 1, -2, -26, -34, -7, 3, 0, -6, 28, 54, 13, -3, -2, 4, 6, -1, -1, 3, -15, -29, -2, 2, 4, 2, 4, 8, -3 }, { 37, 19, 25, -5, 10, 19, -22, -37, -43, -9, -5, 13, 8, -8, 11, 1, -16, -20, 1, 1, -2, 3, -4, 23, 43, 0, -4, -1, 0, 20, 31, 11, 1, 2, -31, -51, -9, 4, 3, -2, -7, 0, 13, 27, 4, -3, -1, -8 }, { -18, 50, -86, 33, 24, -29, -12, 4, 11, 3, 7, -8, -11, 17, 12, -4, -5, -9, -13, -4, 2, -4, 6, 25, 13, 0, 1, -2, 5, -1, -7, 3, 1, -3, -21, -19, -5, 0, -1, 9, 16, 3, 7, 7, 1, -5, -9, -1 }, { 27, 50, 2, -13, 24, -16, 0, 7, -1, 9, 1, -26, -53, -29, -15, 8, 5, -2, 14, -3, -1, -5, 8, 9, -15, -7, 0, 0, -6, 24, 34, 2, 0, 2, 14, 30, 11, -3, 1, -34, -43, -7, -3, -10, 0, 14, 19, 0 }, { -17, 2, -35, -10, -48, 46, -28, 2, 39, 15, -3, 3, 9, -21, -11, 8, 47, 22, -4, 3, 3, 4, -9, -13, 5, -5, -2, 0, -1, -2, 17, 11, -3, 3, -5, -23, -1, 4, -3, -19, -33, -13, 13, 29, 7, 6, 12, -14 }, { 15, 31, 4, -6, 2, 56, -39, 0, 12, -13, 2, 8, 0, -9, -9, -10, 4, -40, -50, -3, 0, -2, 2, 3, -2, 11, 1, 0, 1, -21, 7, 25, -3, -2, 13, 31, 7, -4, 3, 28, 17, -1, -16, -36, -13, -9, -6, 15 }, { -10, -27, 24, -30, 6, -28, -45, -18, 21, -3, 3, 6, -20, 43, 27, 6, 32, 8, -15, -2, 0, -2, 16, 46, -15, -27, 11, 0, -3, -9, 2, 9, 0, -8, -35, 9, 22, -5, -2, -6, -6, -4, 15, -8, -13, 7, 4, 2 }, { 18, 21, 33, 1, 35, 5, 4, 19, 26, -1, -10, -31, -13, 6, -3, 3, 11, -1, -15, 1, 0, -1, -10, -33, -41, -34, -5, -1, 5, -9, -6, 12, -1, 4, -6, -19, -4, 1, -5, 20, 24, -1, 27, 49, 20, -21, -24, -19 }, { -12, -23, -6, 17, -24, -35, -14, -8, -32, -9, -2, -20, -20, -25, -14, 10, 32, 3, 21, 29, -2, 1, 3, -2, 0, 5, -2, 5, 11, -3, 40, 35, 1, 1, 6, 6, -2, 1, 8, 42, 6, -15, -1, -6, 0, -46, -27, 8 }, { 9, 1, 24, 62, 20, -19, -6, 
18, -14, -23, -2, -4, 13, -2, -3, -6, 39, -3, -10, -4, -7, 1, -1, -2, 16, 15, 2, 5, -26, -58, -11, -1, -4, 3, 2, -7, -2, 1, 8, 3, -33, -15, 1, 6, -3, 18, 39, -9 }, { -11, 4, -14, -66, 52, -38, 23, -4, 4, 10, -2, 36, 35, -21, -3, -4, 17, 0, -18, -2, 5, -2, -23, -21, 12, 9, 0, 2, -4, -16, 11, 27, 1, 5, 2, -6, -6, -3, 3, 10, -13, -19, 0, -4, -3, -6, 5, 5 }, { 1, -19, 15, 23, -3, -21, 25, -64, 54, 6, -4, -27, -5, -2, -19, -7, -10, -3, -40, -18, 0, 0, 22, 5, 14, 29, -3, -2, 9, 12, 8, 24, 9, -2, 11, -4, -24, 4, -5, -8, -2, -7, -10, 3, 16, 6, 1, -3 }, { -13, -13, -20, -30, -13, -16, -22, -16, -30, -16, 12, -7, -11, -17, 6, -8, -19, -22, -19, -8, 1, 12, 19, -2, 2, 0, 13, 0, -8, -15, -13, -4, 2, 17, 41, 22, 26, 11, 5, 7, 9, 10, 6, 45, 19, 14, 13, -55 }, { 3, 5, -14, 13, -20, 18, 74, -21, -17, -20, -1, 22, -4, -3, -27, -4, 0, -4, -15, 2, 2, 3, -2, 18, -21, -22, 8, 0, -6, -12, 4, 22, 6, -9, -33, 17, 42, 0, -1, 1, -1, -11, 35, 5, -20, 6, -1, -9 }, { 0, 3, -8, -42, 2, 30, 29, 6, -16, 21, 11, -36, 0, 43, -19, -10, 9, -10, 16, 4, -1, 3, 46, 23, -2, 24, -5, 2, -14, -28, 4, -13, -10, -19, -8, -6, -27, -2, 13, 23, -19, -9, -3, 5, 21, -20, 10, -4 }, { 10, 12, 8, 19, 1, 13, 23, -3, 23, 14, 10, 22, 2, 12, 68, -26, -8, 10, 14, 4, -10, 8, 5, 11, 15, -11, 7, -9, 13, -4, 11, 4, -14, 15, 29, 15, 18, 5, -9, 8, -27, -35, -17, 4, -1, -42, -11, -26 }, { -3, -1, -12, -15, -26, 5, 16, -11, -11, -20, -14, -25, -33, 1, 56, -25, -14, 2, -10, 5, 5, -2, -20, -21, -16, -24, -14, 6, 3, -13, 1, 20, 14, 9, -1, -20, -27, -12, 22, 18, -4, -7, -8, -10, 2, 27, 40, 30 }, { -4, 7, -2, -13, 3, 6, 7, 17, -2, -90, 17, 24, -17, 27, -8, 2, 12, 21, -5, 18, 1, -2, 3, 4, 3, 15, -7, -9, 13, 10, -1, 14, -1, -1, 11, -2, -24, 2, -28, -27, -2, -11, -15, 3, 32, 2, -15, -2 }, { 4, -1, 1, 4, -1, 7, 14, 32, -44, 25, 1, -23, 27, -11, 32, 20, 26, 25, -38, -25, 7, 6, 34, 9, 4, -14, -11, -1, 2, 14, -3, 36, 31, -17, 6, 1, -1, 5, -16, -21, 14, 0, -22, -4, 3, 14, -17, -6 }, { 3, 8, -5, -3, 4, 6, 29, -26, 22, -16, 4, 4, -4, 12, 17, 68, 
35, -25, 17, 6, 5, -2, 0, -5, 17, -23, -15, 10, -32, -6, 14, -15, 3, 7, 20, -10, 8, 6, 20, 8, 17, 33, -34, -5, -8, 6, -5, -12 }, { 3, -4, 1, 11, -15, -1, -3, 2, -22, 40, 10, 53, -53, 28, -12, 25, -8, 7, -23, -40, -8, -3, -23, -5, -7, 8, -8, 1, -30, -11, -4, 7, 11, 18, 10, -3, -16, 4, 7, -2, -6, -10, 0, 7, 28, -16, -11, 15 }, { -5, -7, -11, -10, -7, -4, 5, -20, -16, -13, -12, -34, 7, -11, 8, 3, 12, -31, -16, -19, -20, 11, -7, -6, 6, -19, -11, -21, -11, -17, -16, -33, -28, 18, 4, -3, 7, 10, -42, -32, -19, -10, 12, -3, 2, -27, -28, 48 }, { 0, 2, -1, -12, 3, 10, 3, 12, -3, 0, -39, -36, -15, 46, -6, 9, -6, 19, -2, 7, 2, 19, -21, -11, 34, 37, 29, -6, -13, 3, 2, 21, 10, 38, 16, -5, 20, -10, 1, -8, 7, -10, 15, -2, -45, 10, -17, -8 }, { 0, 4, -2, -2, 2, 3, -14, 4, 20, -24, 2, -9, 22, -24, 18, 50, -58, 34, 1, 6, -1, 13, 22, 6, -12, 10, -1, 0, -49, -11, -2, 12, 0, -10, -8, 9, 2, 15, 24, 6, -11, -25, 21, 5, 7, -10, -7, 23 }, { -4, -3, -1, -4, 6, -1, -11, 10, 6, -16, 8, -9, 7, 21, -48, 0, -24, 18, 7, -38, -7, 10, 9, -2, 28, -55, -26, 10, 26, 1, 1, 6, 20, 8, 13, -16, 28, 19, 0, 22, -12, -15, -25, 0, -14, -16, 29, 13 }, { 1, -1, 2, 6, 6, -1, -5, -6, -6, 38, -2, 3, 0, 13, -19, 2, -15, -5, -7, 84, 17, 5, -4, 11, 12, -26, -27, -6, -5, -14, -15, 21, -20, 8, 12, 2, 10, 25, -15, -11, 8, -19, -13, 2, 22, 32, 1, 13 }, { 3, -3, 5, -5, -6, 2, 0, -4, -3, 6, 94, -37, 3, 3, 5, 14, -11, 0, 0, 0, -5, -46, -39, -9, 6, 12, -4, 5, 2, -11, -2, 11, -1, 2, -12, -3, 5, 1, -15, -4, 0, -12, 4, -6, -20, 6, -5, -14 }, { 4, 4, 7, 5, 7, 6, 9, 6, 9, 1, 39, -2, 18, 5, 9, -14, 26, -4, 17, -6, 11, 28, 1, 18, 8, 11, 10, 10, 10, 13, 12, 6, 21, 45, 13, 20, 9, 21, 16, 6, 15, 21, 40, 17, 31, 14, 8, 56 }, { 2, 2, -1, 0, 2, 4, 8, 0, -12, 3, -20, -6, 10, 2, 7, 59, -5, 16, -34, 11, -6, -6, -22, 28, -11, 19, -8, 23, 61, 1, -1, -21, -20, 22, 2, 15, -5, -6, -11, 17, -12, 9, 13, 2, 9, -16, 39, -7 } }, { { 83, -61, -61, 3, 25, 4, -1, 21, 21, -1, 1, -7, -20, -7, 1, -1, 2, -2, -3, 2, -1, -1, 2, 8, 2, -1, 0, 0, 1, -1, -2, -1, 
-1, 0, -2, 0, 0, -1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0 }, { 33, 18, 17, -29, -75, -31, -2, 42, 41, -2, -3, 10, 21, 10, -3, 1, 0, -36, -35, 1, 1, 0, 5, 10, 6, 0, -1, 0, 0, -3, 13, 13, -3, -4, -11, -4, 2, 1, -1, 2, -2, -2, 5, 1, -1, -1, -1, 0 }, { -39, 5, 8, -14, 13, -12, 12, 37, 36, 11, 5, -21, -66, -24, 6, 2, -6, 1, 1, -6, 2, -5, 19, 54, 21, -5, 0, 0, 1, -2, -26, -25, -2, -1, -16, -2, 4, -1, -1, -1, 17, 15, -1, -2, 1, 2, -4, 1 }, { -29, -12, -13, 18, 36, 18, -19, -7, -6, -18, 6, 19, 4, 20, 7, 1, -3, -44, -44, -2, 1, -5, 5, 33, 5, -6, 3, -1, -2, 5, 35, 35, 5, -18, -51, -20, 3, -1, 0, 5, 0, 0, 24, 7, -4, -2, -7, 1 }, { -45, -16, -13, 9, 7, 9, 9, 38, 38, 8, -7, -21, -15, -21, -7, -1, 19, -6, -8, 17, 0, 1, -16, -35, -18, 0, 0, 1, 0, -7, 32, 32, -6, 14, 27, 15, 0, -1, 0, -3, -41, -40, 7, 6, 3, -3, 14, -4 }, { 9, 78, -61, 34, 4, -42, -11, -10, 3, 8, 3, -36, -12, 24, -7, -2, 5, 2, 2, 1, 0, -4, 18, 5, -19, 1, 0, 0, 0, 4, 10, 2, -1, -8, -3, 9, -1, 0, 2, -3, -8, -2, 4, -1, 2, 0, 3, 0 }, { 39, 15, 38, -29, 2, -12, 6, -19, -21, 3, -12, -9, -37, -22, -9, 2, 18, 27, 29, 19, 0, -6, -11, 16, -3, -11, 1, 1, 0, 5, 26, 26, 5, -4, -37, -13, 7, -2, 2, -5, -27, -26, 40, 18, -2, -4, -1, -2 }, { -33, -29, -34, 9, -29, -15, 18, 20, 11, 17, -9, 8, 25, 20, -6, 11, 8, 26, 39, 14, 6, -7, -21, -15, -21, -11, 2, 2, 1, 13, -7, -18, 4, -13, -42, -26, -1, 0, 0, -6, 15, 18, 36, 22, -1, -3, -20, 1 }, { 5, -11, 21, 41, 9, -43, 14, 46, -50, -19, -5, -8, -3, 5, 9, 5, -29, -43, 31, 19, -9, 5, -9, 2, 18, 2, 0, 1, -6, 9, 21, -22, -14, 21, 9, -13, -1, 1, 3, 0, -12, 3, -4, 8, 0, -1, 8, -4 }, { -20, -34, -17, -11, -29, -9, -8, -19, -10, 7, 17, 12, 8, 33, 13, -8, 8, 15, 21, 0, -10, 8, 36, 42, 15, 12, 4, 1, -3, -15, 1, -15, -19, -7, -5, 10, 3, 0, 2, -9, -52, -31, -6, -12, -4, 6, 41, -5 }, { 10, -23, 46, -8, 16, 10, -18, -3, 18, 19, 3, -42, -3, 40, -9, 10, -1, -20, 7, -9, -2, -1, 31, -11, -53, -9, 2, 1, -5, 21, 24, -26, -6, -20, 0, 20, 0, -2, 4, -15, -10, 33, 5, 2, 5, 5, 4, -3 }, { -15, -36, -4, -2, -29, -32, 7, -18, 
-19, -7, 4, -23, -15, -4, -2, 15, 12, 9, 6, 16, 13, -8, 20, 7, 4, 2, -7, 2, 9, 24, 33, 43, 28, -16, 13, 4, -11, 2, 1, 10, 23, 10, -46, -28, 3, -11, -28, 18 }, { 5, -13, 30, 26, 29, -50, 41, 17, 2, -21, 11, 17, 13, 4, -19, -8, 8, 21, -19, -17, -1, -14, 7, -5, -27, 10, -1, 2, 0, -29, -31, 14, 16, -33, -15, 28, -1, -4, -1, 7, 2, -14, 12, -22, -3, 7, 14, 9 }, { -9, 1, -17, -71, 50, -51, 25, -10, 1, 43, 3, 15, 21, 7, 9, 4, -25, -11, -5, -20, 1, 6, -6, -12, 10, 2, -1, 1, 0, 17, 14, 5, 6, 10, 10, -8, -4, 0, 3, -5, -5, -3, -4, 6, 2, -1, -1, -6 }, { 13, 29, 5, 2, 19, 34, 16, 32, 11, 23, 2, 15, 18, -3, 1, 2, 5, 5, 24, 11, 2, -21, -5, -1, 6, -13, -15, 1, 5, 8, 17, -5, -3, -36, -25, -32, -23, -2, 2, 0, -18, 0, -54, -34, -2, 1, 25, 27 }, { -11, 12, -28, -42, -5, 4, -21, 31, -35, -37, 21, 44, -24, -12, -15, 7, 17, -6, 10, -3, 5, -14, -17, -3, -29, -4, 3, 3, -6, 9, 17, -11, 10, -5, 5, 37, 14, -4, 7, -12, -5, 33, 3, -17, -5, 17, 14, -2 }, { 1, 0, -19, 8, -25, 30, 53, -1, -35, 29, 3, -21, 13, -50, -36, 4, -12, -19, -13, -30, -3, 12, 31, -1, 12, 16, -4, 4, -6, 2, 13, -4, 7, -26, -5, 7, 5, 2, 3, -4, -9, 17, 21, 10, 3, 8, 11, -8 }, { 9, 19, 2, -1, 9, 35, 40, 28, 2, -10, 25, 21, 7, 29, 16, 7, -11, 14, 16, 14, 1, 13, 30, 17, -3, 7, 7, -9, -16, 14, 4, 13, 2, 0, 16, 25, 11, 1, -11, -26, -3, -24, 3, 5, 4, -19, -64, -13 }, { -6, 3, -14, -46, 2, 23, 10, 10, -24, -9, -22, -35, 3, 21, -5, -20, -29, -20, 8, 44, 3, -1, 4, -3, -15, 17, -3, 0, 11, -24, -18, 3, -19, -8, -11, 16, -9, -4, 4, 44, 25, -18, 19, -22, -3, -13, 5, 20 }, { -2, 1, 5, -3, 28, -17, -33, 12, -6, 23, -2, 1, 40, -27, -33, 9, 61, -4, 4, 28, -9, 16, 17, 13, 19, 13, 2, 1, -6, -25, 6, -12, -34, 1, 0, 11, 11, 3, -6, 6, 4, 15, 2, -8, -6, -17, -33, 3 }, { 2, -2, 2, 18, -5, -3, 20, -49, 43, 6, 7, 23, -9, -5, -33, -1, -26, -28, 18, 22, 3, -27, -33, 10, 12, 34, 3, 0, -7, 14, 14, -15, -15, 10, -3, 28, 2, -2, 3, -10, -7, 7, 14, -33, -14, -1, -12, 17 }, { 5, -3, 9, 23, -9, 5, -39, 29, -25, 78, -19, 13, -8, 5, 27, -10, -18, 12, -6, -7, 12, -7, 
-14, 7, 1, 20, 9, 2, -6, 2, -1, 17, 26, 0, 0, 28, 9, -4, 9, 1, 6, -4, 17, -28, -21, 7, -7, 1 }, { -5, -8, -3, -3, -10, -13, -43, 5, -11, -1, 21, 6, -11, -30, -10, -19, -40, 4, -7, -24, -27, -10, 5, -21, -11, -2, -10, -11, -2, 12, -8, -14, -27, -30, -7, -7, -17, 11, -14, -15, -14, -31, 3, 2, 25, -29, -34, 38 }, { -2, 4, 6, 0, 13, -5, -34, 13, 30, -26, 15, -1, 25, -24, -1, -29, -31, 24, 26, 21, 14, 7, 28, -9, 23, 40, 3, 2, 16, 27, 12, 12, 19, -25, 4, -1, -12, -5, 12, 9, 7, 11, 17, 13, -1, 11, 19, -36 }, { -4, -1, -7, -2, 3, -11, 9, -12, 1, -21, -57, -10, 28, -41, 50, -7, -12, 8, -24, 28, 11, -5, 10, 9, -14, -22, 8, -7, -10, 18, -4, -13, -18, -13, -8, 18, 12, -3, -6, -39, -15, 6, -4, -16, -8, 23, -7, 5 }, { -4, -2, -2, -19, 6, 7, -16, 12, -5, -25, -21, -43, 10, 24, -27, 0, 5, -1, 5, -20, 9, -21, -17, -3, 33, 32, -2, -4, -43, -17, -15, 3, 34, 1, -3, -17, -11, 4, 0, -44, -19, -4, -3, -7, -5, 28, -22, 11 }, { 3, 2, 1, -5, -6, -3, 10, -12, 4, 1, 53, -22, 2, -20, 74, -15, 22, -20, 17, -4, -6, -20, -14, -20, -1, 18, 15, -3, -26, -41, 10, -2, 2, -18, -5, -4, -2, 3, 11, 13, -5, 21, 8, 3, -3, -2, -14, 3 }, { 1, 5, 1, -1, 2, 5, 16, 11, 1, -18, -27, 3, -5, 10, 25, -9, 25, 29, -14, -31, -22, 22, -1, 5, 2, 48, -5, 0, 22, 18, 29, -9, -9, 26, -1, 1, -42, -10, -3, 3, -8, 29, 27, -15, 11, -18, 1, 37 }, { 2, -4, 5, -2, -2, 1, 13, 5, -26, 20, 32, -9, -2, 11, -10, -39, 17, 14, -50, 43, 10, -44, -18, 14, -7, 21, 3, -3, 18, 18, -9, -8, -16, 8, 9, -11, -21, -3, -13, -23, -2, 10, -9, 22, 24, 10, 3, -21 }, { 1, 1, 2, -3, 2, 7, -7, 10, 0, -6, 47, -47, 45, -11, 7, 35, -7, 13, -1, -6, -33, -13, -27, 26, 9, -22, -13, -1, 22, 30, -6, 7, 9, 26, -5, 21, 12, 4, -11, -1, 10, -12, 22, -18, -9, 5, 18, 15 }, { -3, -3, -2, -4, -6, -1, 6, 2, -3, -6, 0, -13, -11, 5, -10, -18, -6, 15, -11, -15, -56, 7, -17, -11, -3, -2, 5, 5, -3, 6, 9, 1, -26, -8, -15, -12, -3, -38, 8, 2, 4, -2, -20, -27, -57, 1, -22, -60 }, { -3, -2, -2, 0, 0, 2, 6, 2, 7, -2, -17, -7, 23, -5, -15, -82, 6, -8, 23, -24, -18, -12, -9, 33, -15, 
-22, 23, -14, 4, 2, 16, -3, 34, 18, 2, 9, 27, 11, -3, 15, -2, 5, -19, 11, 7, -22, -4, 8 } }, { { 106, -44, -50, -9, 0, -14, -3, 14, 14, -2, -2, 2, 4, 3, -3, -1, 3, -2, -2, 3, 0, -1, 0, -1, -2, -1, 1, -1, 0, 1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 38, 69, 38, -34, -65, -47, -8, -11, -7, -1, -5, 7, 24, 10, -7, -1, 4, 8, 4, -1, 0, -2, 2, 4, -1, 4, 4, -2, -1, 1, 0, -3, -2, 0, 0, 0, 1, 1, -1, 0, -1, 0, 2, 0, 1, 0, 0, 0 }, { -39, -34, -9, -5, -57, -24, 24, 60, 54, 12, 1, 9, 25, 20, 3, 6, 0, -24, -21, 2, 3, 0, 2, -3, -13, -11, 0, 1, 2, -2, -7, -1, -2, -5, 1, 1, 0, -1, 0, 3, 2, -2, 0, 2, 1, 1, 0, 0 }, { -24, 18, -78, 39, -12, -20, -12, -33, -20, 8, 6, 15, 39, 36, 8, 0, 16, 30, 18, 0, 2, 2, 1, -5, -19, -11, 6, 2, -1, 3, -5, -15, -7, 2, 0, 1, 0, -1, -3, -2, -6, -6, 2, -1, 3, 0, -1, 1 }, { 4, 80, -50, 26, 12, -2, -3, 43, 20, 7, 0, -26, -34, -17, -10, -4, -9, -29, -25, -8, 2, 0, -4, 4, 11, 2, -1, -3, -2, -1, 1, 10, 9, -1, -3, 1, -2, 1, 4, 5, 6, 2, 0, 2, 0, -1, 1, -2 }, { -21, -16, -24, -19, -21, -41, 10, -3, 11, 18, 4, -11, -50, -36, 4, 6, 23, 39, 31, 10, 3, 0, -2, 16, 48, 37, 5, 1, 2, 3, -2, -13, -10, 1, 3, 2, 0, 4, 0, -17, -10, 3, 1, -7, -8, -7, -4, 2 }, { -1, 15, -30, -46, -39, 90, -5, -7, 12, 25, 4, 30, 4, -25, -12, -6, 5, 12, -2, -3, 4, 1, 2, -14, -7, 13, 4, -5, -3, 5, 5, -7, -4, -3, -2, 2, 0, -4, 2, 2, -6, 1, 2, -5, 1, 1, 1, -1 }, { 14, 21, 17, 3, 25, 9, 50, 37, 10, 12, -6, -12, -4, 1, -7, -6, 1, 43, 37, 0, -2, -2, -7, -15, -27, -25, -16, -7, -1, -4, -20, -52, -40, -6, 0, 0, -2, -3, 0, 3, 2, -2, -4, 2, 16, 9, -5, 7 }, { -3, 0, -21, -11, -13, -1, 54, 32, -77, -17, 14, 25, 1, -2, -4, 0, -35, -29, 29, 22, -1, 2, 3, -3, 8, 9, 0, -2, 0, -3, 20, 21, -9, -4, 4, 0, 1, -3, -4, -7, 0, 7, 1, -1, -8, 2, -2, 0 }, { 15, -13, 18, 86, -65, 25, 9, -10, -2, -28, -14, -13, -23, -25, -16, -1, 2, 4, 12, 7, -6, -2, 4, -2, -6, 3, 10, -4, 0, 4, 2, -2, -2, 4, 4, -2, 1, 0, 6, 11, 2, -1, 4, 1, 4, 4, -1, -2 }, { -15, -11, -12, -25, 2, -32, -15, -3, -13, 8, -8, -17, -7, -47, 
-37, 7, 4, 11, 12, 11, 6, 1, 0, -21, -52, -27, 6, 2, 5, 11, 18, 19, 12, 5, 3, 2, 0, -1, 22, 52, 34, 5, 0, 1, 0, 2, 3, -9 }, { 3, -2, -10, -22, -26, 20, 33, -34, -5, 19, -5, -48, -38, 48, 25, -3, 0, -4, 1, 11, 7, -2, 9, 32, 1, -46, -19, 0, -2, -2, 2, 11, 16, 8, 0, 1, 3, 3, -7, 9, 27, 9, -4, 9, 0, -9, -4, -1 }, { -8, -3, 0, -15, -11, 21, -73, 56, -24, -43, 8, -22, -10, 26, -1, 1, 25, 14, 21, 32, 4, 2, 1, 13, 6, -11, -4, 1, 1, -4, -15, -5, -8, -12, 4, 2, 2, -2, -8, -2, 7, -1, -3, 8, 0, 0, 1, 0 }, { 8, 12, 13, 12, 14, 14, 24, 18, 13, 12, -9, -11, 23, 9, -10, 16, 44, 22, 20, 26, 5, -4, -4, -5, -10, 4, 5, -5, -3, -3, 16, 47, 33, 1, -3, 1, -1, -7, -9, -10, -19, -17, -5, -25, -52, -37, -6, -5 }, { -3, 4, -9, -15, 2, -2, 41, -3, -18, -56, 18, 14, -5, -17, 0, 6, 55, 8, -44, -9, 4, 3, 6, 1, -7, 1, 0, -7, 0, -9, -44, -18, 36, 17, -2, 1, 2, -2, 3, 7, 0, -2, -2, 19, 12, -15, -3, -2 }, { 9, 6, 10, 13, -8, -9, -18, 6, -11, 27, 1, 53, -49, -2, 64, 16, 26, -14, 5, 3, -7, -1, -16, -43, -18, -19, -22, 3, 0, -14, -9, 10, -1, -5, 1, -1, -3, 3, 18, 0, 0, 4, -7, 1, -13, -1, 6, 6 }, { -6, 15, -15, -15, 3, -3, 8, -28, 58, -76, 9, 0, 7, -16, 33, -11, -21, -5, 32, 25, 4, 6, -9, -18, -5, -11, -21, -5, -2, 5, 15, 16, -9, -7, 0, 2, -3, -2, 5, -7, -3, 7, 1, -4, -6, 0, 3, 5 }, { -2, -2, 4, 9, 1, -9, -5, -14, 16, 2, 31, 45, -25, 11, -77, -8, 8, -3, 0, 7, -9, 0, -17, 4, 10, -37, -31, -6, 4, 6, 0, 7, 2, -5, -3, -2, -3, -3, -27, -30, 13, 21, 3, 2, -6, -6, 2, 19 }, { 6, 9, 8, 7, 15, 4, 11, -11, 23, 2, 44, 28, -9, 23, -1, 2, 15, -16, 15, 24, -11, 3, 7, 25, 8, 17, 52, 10, -2, -2, -10, -6, -22, -26, -9, -4, -1, 0, 11, 49, 34, 0, 4, -3, -10, -3, 5, -38 }, { 1, 2, -3, -3, 2, 0, 2, -28, -3, 20, -5, -26, 17, -31, -10, 35, 23, -58, 9, 48, 12, -3, -4, 6, -11, -8, 4, -2, 6, -17, -35, -5, -19, -27, 1, 4, -2, -4, -16, -20, -23, -15, 2, 16, 12, 17, 12, 15 }, { 5, 2, 2, 17, -1, 4, -9, 13, -2, 16, 8, 13, 48, -57, 34, 7, -5, 9, 11, -4, -8, -4, 15, 42, 28, -30, -29, 1, -1, -5, -9, -4, 11, -2, -7, -1, 3, -11, -21, 5, 
39, 25, 5, 7, -8, -18, -9, -9 }, { -4, -5, 1, 4, 2, -6, -5, -4, -1, 19, -27, 5, 5, 4, -5, -80, -16, -9, 9, 40, 6, -2, -5, -14, 2, 22, -12, -15, -3, -9, -40, -7, 23, -7, -7, 0, -1, 1, 17, 10, -4, 11, 10, 30, -6, -37, -7, -8 }, { -1, 10, -2, -2, 14, 4, 3, 7, 17, -8, -72, 47, -10, 11, -7, 25, 2, -6, 35, 7, 21, 4, 15, 19, 14, -8, 22, 25, 10, 18, 3, 4, 24, 23, 3, 4, 0, -2, -4, 1, 4, -3, -4, 28, 35, 5, -8, 2 }, { -7, -3, -12, -9, 0, 3, 3, -9, -2, -18, -53, -2, 7, 10, -23, 5, 24, -25, 19, -51, -21, 8, 10, 2, 14, 7, -26, -3, -2, -38, -20, 5, -31, -17, 6, -1, 3, 14, 21, 12, 17, 16, 2, -19, -39, -2, 9, -2 }, { 1, 1, 5, 7, 8, 0, 13, 5, -5, 6, -10, -5, 16, 3, 11, -37, 67, -11, -12, 12, 9, -2, 7, -2, 9, 16, -19, -9, 0, 41, 47, 6, -13, -15, 7, 1, 1, 1, 4, -2, 14, 36, 11, 3, 25, 33, 22, 3 }, { 4, 2, 2, 6, -2, 11, 4, 0, 7, 10, 24, -20, 24, 19, -11, 43, -6, 14, 14, 5, -1, 0, -33, -42, 32, 32, -19, 4, 12, -17, -15, 23, 17, 18, 12, 1, -3, 15, 23, 7, 27, 30, 2, 33, 25, 12, 0, 0 }, { 1, 3, -1, 8, 4, -4, 0, -5, 1, -4, -17, 18, -10, 10, -7, 35, -23, 34, -46, 39, 41, -7, 37, 12, -12, 29, -45, -4, 0, -22, 5, 3, -20, -21, -14, 0, 0, -14, 3, 15, -1, 16, 1, -11, -3, 3, 2, -9 }, { 1, 1, -2, 0, -2, 3, 8, -2, -9, -9, -25, -7, 5, -5, 6, 1, -8, 15, -35, 15, 14, -2, -30, -24, 9, -11, 44, 57, -2, 10, -5, -14, -10, -30, -7, 5, 3, 10, -5, -21, 35, 31, 3, -9, -25, -19, 3, 36 }, { -4, -2, -7, -4, -2, 0, 7, -1, -8, -6, -35, 10, 4, -2, 1, 11, -10, 26, -29, 17, -38, -9, -56, 7, 28, -23, -1, -38, -7, 18, -1, 12, -4, -32, 0, 9, 1, 9, 1, 9, -2, -33, -12, 8, 7, 11, 23, -39 }, { -3, -1, -1, -3, 4, 0, -9, -1, 1, -5, -16, 2, -5, 7, 9, 26, -1, -21, 0, 9, -2, -15, -40, 12, -27, 20, 18, -60, -8, 3, 10, -33, -2, 34, 3, 5, 3, 3, -23, -2, 12, 45, 30, -24, -4, -17, -26, -3 }, { -2, -1, 0, 0, 1, -4, 2, 2, 8, 0, -6, 5, -7, -5, 9, -25, 2, 25, 0, -16, 11, 4, -2, 4, -19, 10, 39, -29, -16, -65, 6, 41, -7, -6, 4, -1, 0, -14, -45, -15, 25, 5, -11, 33, 8, 26, 14, 11 }, { -2, 5, -5, -5, 3, 1, 2, 3, 6, 2, -8, -3, -7, 3, 6, 11, -8, 
14, -6, 17, -94, 8, 27, 18, -40, 28, -4, 18, 11, 19, -12, 16, 4, -4, 11, -4, 3, -1, -9, -10, -6, 23, 9, 21, 0, -1, 9, 21 } } }, { { { 102, -31, -14, -32, 47, -12, -2, -13, 11, 0, -5, -16, 24, -2, -3, -1, -2, -7, 10, -1, -1, -1, -3, -5, 9, 2, -1, -1, -2, 5, 0, 0, 0, -1, 0, -2, 3, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0 }, { -4, -22, 69, -15, -19, 7, -4, -33, 70, -11, 2, 3, -21, 5, 2, -1, -6, -20, 47, -10, -5, 0, 2, 9, -18, 2, 2, -4, -6, 22, -3, -2, -2, 0, 1, 7, -10, -1, 0, 7, 0, 0, 0, 4, -4, 2, 0, 1 }, { -47, 14, 13, -14, 32, 15, -1, -12, 6, 12, -5, -50, 57, -3, 1, 0, -2, -17, 20, 3, 1, 1, -4, -43, 45, 0, -2, -3, -11, 20, -3, 0, 0, 0, -3, -21, 20, 4, -4, 11, -2, 0, -1, -7, 6, 4, -1, -2 }, { 1, -39, 48, -9, -11, 14, 4, -23, 2, -16, -9, 3, 4, 7, 2, 2, 2, 12, -51, 15, 5, -2, -8, -15, 23, -1, 1, 4, 18, -58, 23, 7, 1, 0, -1, -22, 26, -1, 6, -32, 12, 5, 0, -15, 13, -11, 4, -6 }, { -33, 2, 0, -24, 50, -4, 2, -22, 2, 3, 14, -17, 27, -18, -3, 3, 3, -9, -8, -8, 5, 0, 2, 37, -25, -2, -3, 1, 11, -33, 6, 5, 3, -4, -4, 49, -37, 1, 11, -34, 12, 3, 0, 26, -17, -17, 7, 7 }, { 0, -38, 50, -6, 3, 11, 11, 16, -34, 7, -3, 1, 18, -13, -2, -1, -4, 35, -35, 24, -9, 0, 7, 5, 5, -10, -3, 1, -2, 28, -19, -3, -2, -2, 2, 26, -20, 0, -12, 44, -28, -3, -2, 28, -20, 18, -10, 14 }, { 25, 82, 58, 1, 15, -29, -6, 28, 0, -31, -5, 15, 17, -15, -7, -3, -1, 20, 0, -21, -4, -4, -9, 14, 13, -7, -5, 1, 14, -6, -5, -5, -1, -2, -4, 3, 14, -4, 6, -4, -1, -2, -1, -2, 9, 0, -1, -1 }, { -27, -6, 4, -44, 47, -24, 12, -3, -6, 11, 1, 7, -24, -6, 9, 0, -2, 18, -10, 2, 4, 2, -17, 20, -45, 23, 2, -2, 9, 6, -2, -4, -1, 2, 9, -37, 10, 2, -5, 23, -11, -4, 10, -46, 25, 21, -12, -18 }, { 8, 17, 1, -65, -40, 10, -4, 32, -3, -1, 16, -54, -16, 12, 2, -2, 0, 24, -3, -15, -3, 7, 22, -35, -18, 8, 5, 3, 19, -7, -16, 2, 1, 5, 7, -8, -25, 7, 17, -14, -5, 0, 3, 0, -15, -11, -1, -1 }, { -6, 19, -34, -5, 2, 3, 15, -30, 42, -25, 5, -2, -6, 18, -5, -1, 15, 14, -12, -2, 4, 5, 1, -4, 15, -5, -3, -6, 42, -29, 11, -17, -1, -3, 5, 4, 12, -11, 1, 35, 
-36, 0, -5, 27, -13, 39, -33, 21 }, { 14, 44, 16, -10, -10, -29, -7, -29, -11, 26, -8, -11, -13, 4, -3, -2, -3, -26, -16, 58, 8, 4, 8, -16, -14, 5, 2, -3, -25, -7, 51, -4, -2, 2, 11, -7, -21, 4, -24, 16, 18, 2, 3, 8, -25, 12, 4, 6 }, { -12, 3, 0, -49, 23, 4, -5, 13, -8, 16, -5, 32, -32, 41, -1, -1, -14, 5, 4, -3, 6, 1, -2, 3, 21, 8, -10, 0, -16, 11, 0, 3, 5, -6, 3, -2, 44, -26, -8, -9, 13, 2, -21, 47, -6, -20, 19, 36 }, { 10, 19, -11, -28, -17, 69, -19, 18, 15, 7, 13, -4, 13, 11, 0, -8, -9, 8, -6, 22, 5, -11, -28, 37, 2, -2, 4, -7, -6, 1, 25, 1, 0, -2, -34, 31, 4, 1, -18, 18, 14, -2, -1, -18, 29, 17, 8, -20 }, { -5, -5, -12, -47, -60, -53, 16, -32, -22, -9, 12, 5, 28, -12, 2, 8, 14, -24, -5, -6, 0, -4, -9, 24, 16, -4, 1, 5, -18, 4, -18, 7, 4, -2, -18, 16, 18, -4, -8, 4, -16, 7, -8, -1, 24, -3, -5, -2 }, { 17, 19, 25, 9, 3, 31, -8, -28, -40, 55, -10, 1, -11, 28, 1, 5, 23, -30, 9, -7, -10, -8, -3, 5, -6, 19, 0, 9, 13, -8, -22, -1, -3, -1, -11, 8, 6, 3, 37, -15, -29, -1, -8, 1, 13, 14, -38, 0 }, { -8, -32, 9, -13, -12, -18, -20, 55, -6, 18, 3, 8, 13, -19, 3, 2, 18, -22, 31, -13, 20, 2, 3, 5, 8, -8, -6, 7, 8, -19, 38, -11, -6, 5, 6, -1, 8, -10, 15, -5, 27, -19, 9, 6, -4, 53, -29, 15 }, { 1, 5, -7, -1, -12, -22, 8, 3, 46, 85, 8, 13, 6, -26, -8, -2, -14, 29, -6, 22, -19, -6, 9, 3, 16, -30, 1, -11, 25, 1, -8, -2, -2, -1, 5, 2, 8, -15, 19, -10, -4, 1, 5, -5, 5, -15, 7, -9 }, { 0, 15, -11, -35, -2, 46, -24, -17, -5, -9, -23, 39, -1, -45, -6, 1, 1, -19, -7, -9, -1, 14, -1, 3, 8, -41, 4, 11, -18, 1, -11, 0, 3, 6, 36, -22, -10, -13, -1, -7, -16, 4, 32, -13, -27, -7, -16, 1 }, { -12, -6, -3, -19, 15, -20, -1, -1, -6, -3, -33, 23, -22, 27, -20, 4, 0, 5, 6, -13, 9, -18, 3, -25, 27, -32, 13, 5, 2, -1, 4, -2, 1, -1, -33, 21, -28, 5, 3, 2, 3, -2, -19, -8, -24, 13, -4, -72 }, { -3, 1, -11, -9, -22, -2, 19, 10, 16, 8, -79, 14, 31, 4, 15, -1, 5, 9, 5, 5, 3, 5, -54, 6, -11, 31, 2, -4, 14, 3, -3, -5, -1, 0, -8, -11, -17, 28, 14, -4, 4, -7, -4, 7, -26, -9, 8, 19 }, { -10, -6, -7, -2, 13, 
-17, -38, 40, 37, -18, -1, 1, -3, 3, 11, 11, 2, -22, -24, 41, -34, -6, -2, -3, 1, 13, 1, 26, -32, -3, -4, -25, 1, -7, -9, 5, -2, 12, 11, -17, -36, 9, -15, 4, 10, -20, -37, -4 }, { 5, 10, 5, -12, 7, 22, 88, 28, 16, 1, 17, 34, 18, 16, -11, 4, 35, -28, -8, 7, -4, -5, 30, -7, 1, 11, -3, 13, -14, 2, 4, -7, 0, -7, 18, 0, -9, 13, 1, -3, 6, -4, 8, -5, 6, -2, 5, -11 }, { -6, -2, -8, -12, 0, 3, 6, -21, -31, -38, 2, 8, 1, 3, -3, 5, -4, 18, 38, 56, -36, 7, 7, 1, 3, -11, 7, 1, 26, 24, 15, -29, -9, -1, 3, 7, 3, -5, 49, -10, 19, -27, 10, -2, 15, -6, 4, 3 }, { -1, 7, -6, 2, -2, -9, -7, -19, 16, 0, 12, 18, 24, 39, -10, 20, -47, 27, -20, -7, 14, -3, 9, -1, 5, 5, 16, 24, -28, 12, -18, 26, -16, 9, 9, 3, -13, 21, 12, -21, 25, -24, 25, -8, 6, 31, -32, 22 }, { 0, 7, -7, -12, 3, 12, -15, -18, 3, 1, -7, 31, -5, -50, 42, -4, 2, 11, -4, -10, 4, -25, 43, -28, 2, 21, 22, 1, 7, 3, 4, 4, 3, -16, -5, 5, -4, 42, 3, 6, 14, -2, -41, 15, 30, 15, 10, 11 }, { -5, -9, 3, -6, -6, -8, -40, 4, -11, 2, 12, 41, 44, 40, -12, -16, -12, -3, 19, -1, 9, -9, 28, -3, -3, 20, -3, -36, 25, -15, 9, -15, 13, -2, 22, -2, -14, 20, -12, 15, -27, 22, 22, -10, 4, -25, 8, -4 }, { -3, -3, 0, -2, 4, -4, 20, 11, -4, 3, -60, -28, -26, 0, -20, 2, -22, -8, 5, -12, -6, -4, 31, 9, 10, -16, 25, -8, -3, -15, 10, -18, 8, 13, 2, 30, 0, 10, -21, 3, -9, 4, 30, -11, 50, -10, -9, 35 }, { 5, -1, 4, 1, -4, 7, 29, -5, -16, 11, 33, 7, 3, -9, 22, 7, -53, 0, 6, -33, -15, 2, -28, -12, -1, -5, 22, -13, -12, -13, 24, -61, -1, -3, -13, -9, -6, 5, -17, -4, -6, -18, -16, 3, -16, -21, -29, 0 }, { 1, 1, -5, 2, -15, 11, 11, 0, -1, -8, -35, -15, 18, -15, -19, 6, -49, 2, 5, 5, 9, -5, 33, 15, -24, 24, -37, -9, -6, -3, 1, 0, 3, -34, 27, -3, 21, -29, 2, -11, 1, -1, -26, 17, 1, 9, -25, -44 }, { 1, 1, 2, -3, 6, 3, 24, 18, -4, -14, 17, 13, -14, -23, -18, -4, -43, -31, 13, 39, 50, 2, -18, -19, 0, -5, 7, -26, 20, 0, -13, 37, 7, 13, -27, -2, -7, 7, 22, -7, -18, 25, -13, -3, 3, 0, -17, 20 }, { 4, 5, 4, 6, 3, -2, 13, 6, -1, 0, -10, -6, 16, 27, 81, -5, 7, 20, 26, 21, 
39, 0, 10, 8, -15, -40, 30, 13, -11, -10, -3, 10, 7, -11, 13, 1, 2, -25, -4, -6, -14, 16, 2, 2, 5, -9, -13, -9 }, { 2, 0, 2, 4, -2, 3, -9, -13, 7, 7, 3, -1, -3, -14, -29, 5, 27, 28, 2, 7, 82, 9, 3, -7, -3, 10, -16, 26, -13, 10, -3, -50, 6, 12, -9, 6, 2, 2, 5, 0, -8, -29, -3, -1, 13, -34, -5, 0 } }, { { -89, 1, 28, 8, -54, 31, -2, -20, 20, 3, 3, 3, -35, 23, 6, 1, 0, -18, 13, 3, 1, 2, -1, -17, 10, 3, 2, 0, 1, 1, -11, 6, 3, -1, -6, 3, 1, 1, 0, 1, -5, 2, -2, 0, 1, 0, -2, 0 }, { 7, -45, 15, -3, 26, 9, 1, -63, 28, 5, -3, -7, 35, -6, -5, 5, 8, -55, 24, 6, 1, -3, -5, 33, -14, -4, -1, 1, 3, 5, -33, 12, 5, -1, 20, -11, -2, -1, 1, 1, -15, 3, 8, -5, -1, 0, -5, -1 }, { -55, 5, 20, 14, 7, 6, 13, 6, -2, -3, 2, 3, 40, -32, -7, -1, 5, 30, -18, -4, -1, -1, -1, 46, -41, -5, 1, 0, -3, -5, 38, -21, -4, 3, 28, -26, -3, 0, -2, -5, 26, -11, 10, -10, -2, -2, 11, -2 }, { 6, -49, 36, -16, 18, 1, 16, -30, 33, -1, -4, -15, 4, -6, 8, -4, -3, 23, 0, -1, -2, -1, -1, -27, 16, 10, 3, 0, -5, -18, 47, -17, -2, 3, -39, 32, 2, 0, -3, -10, 31, -9, -25, 23, 0, -1, 11, 8 }, { 48, -11, 0, -7, -37, 11, -2, -11, 27, 0, 4, 16, -43, 31, -9, -1, 10, -2, 18, -4, -4, 4, 7, 2, -12, -6, -1, -2, -6, -2, 30, -8, -2, -1, 32, -40, 2, 2, -4, -11, 41, -17, 23, -28, 0, -7, 24, -10 }, { 3, -52, 43, -18, 7, -2, 6, 11, -2, -11, 4, 5, -17, 0, 5, 3, -24, 40, -34, 2, 1, 3, 15, -18, 6, -9, -2, 2, 4, 6, -11, -7, 0, 3, 16, -22, -4, 1, 2, 25, -44, 14, 27, -33, 3, 12, -27, -18 }, { -2, 53, 63, -24, 37, -14, -22, 20, 24, 6, -2, -18, 10, 5, 2, -4, -23, 3, 35, 1, -2, -5, 0, -7, 22, -11, -1, -2, -8, -8, -5, 35, -3, -6, 12, -4, -1, -1, -4, -13, 13, 11, 18, -15, 2, -12, 18, -6 }, { 37, 13, 35, 8, -33, 38, -15, 13, 21, -13, -9, 7, 7, 22, -5, -9, -19, 21, -4, -1, 0, -7, -24, 50, -10, 11, -1, 1, 1, -12, 3, -3, 3, -14, 17, 23, -3, -2, 4, 5, -24, 12, -14, 42, -9, 9, -24, 22 }, { -11, 9, -12, -14, 31, 32, 1, -7, -17, -2, -4, -22, 15, 56, 4, 7, 15, -6, -19, -11, -1, -7, -13, 8, 54, -16, -6, 1, 6, 14, 12, -36, -3, -14, 26, 18, -7, -4, 3, 16, 9, 
-32, 26, -3, 0, 15, -3, -1 }, { 8, 55, -11, 2, -7, -7, 8, -33, 39, -8, 4, -17, 9, -15, -4, -10, 3, -4, 13, -20, 0, 5, 2, -13, -8, 1, -5, 2, 10, -22, 30, -30, 5, 15, -21, -5, -5, 3, 2, 19, -27, -1, 6, -27, 4, 29, -47, -26 }, { -22, -25, -54, 24, 14, -19, 4, 2, 24, 10, 5, 13, -3, 9, 15, -12, -32, 18, 27, -1, 2, -5, 6, 5, 24, -2, -3, 2, -1, -47, 22, 17, 3, -15, 33, 12, -8, -4, -3, -19, -8, 25, 31, 8, -9, -2, -17, 11 }, { 13, -18, -16, 21, -33, 45, -12, 13, -2, 1, -6, -32, 36, -11, 9, -3, -12, 11, -1, 14, 2, 10, -23, 7, 12, -2, -10, -2, -1, -7, -5, 19, 0, 17, -28, 24, -22, 3, -4, -9, 7, 11, 21, -40, 10, -14, 22, -45 }, { -10, 11, -33, -57, 23, 49, 8, -16, -8, 9, 0, -26, -6, 23, -18, -9, -15, 15, -14, 16, 9, -5, -3, -2, -23, 11, 6, -2, -7, -21, 5, 8, 9, 2, -6, -34, 25, 4, -4, -16, -13, 32, -24, -3, 6, -20, 1, 5 }, { 11, 37, -3, 18, -5, -1, 33, -48, 21, -26, 11, 13, 3, -7, 4, 9, -3, 19, -39, 19, 5, -4, 23, -8, 14, 0, -5, -7, 2, 15, -11, -3, 12, 1, 12, 15, -8, -1, -12, 11, -12, 26, 22, 9, -5, -36, 48, 15 }, { -15, -11, -31, 8, 19, -15, -1, 22, 59, -21, -9, -13, -10, 14, -7, 21, -13, 23, 14, -16, -2, -5, -29, 12, -10, -1, 4, -3, 5, 36, -9, 3, -2, -16, -6, -10, 7, 3, -11, 46, -7, -5, -32, -2, 9, 3, 32, -18 }, { 6, 2, 5, 59, 52, 68, 8, 15, 8, -2, 6, 30, -10, 3, -18, -5, 11, 4, 17, -10, -8, 1, 33, -18, -4, -10, -4, -1, -3, 8, 1, 15, -11, 24, -13, -7, 0, -1, 2, 5, 3, 6, -6, 0, 1, 10, -8, 8 }, { -6, 7, 14, 19, 30, -24, 0, -22, -18, 1, -22, 41, -29, 34, -24, -2, -6, -3, -18, 9, 4, 3, -23, 22, -9, 2, -12, 2, -3, -16, 2, -8, 11, -5, -15, 16, -19, -5, 3, -21, -1, 9, -19, -7, -12, -15, -1, -65 }, { -2, -3, -5, 30, 4, -7, -84, -17, 6, -12, 10, -7, 7, 3, -12, 7, -44, -3, -23, 11, 3, 2, 11, -11, 4, -20, 7, 3, -4, -12, -7, -33, 18, 15, -6, -17, 7, 4, -3, -11, 0, -27, -14, -4, 9, -15, 1, 23 }, { 7, -5, 19, 55, -4, -21, 32, 11, -5, -25, -2, -38, 7, 21, 2, -12, 29, -10, -8, 1, 2, -4, -28, -8, 16, -8, 15, 0, -14, -8, 4, -4, 9, -21, 2, -26, 35, 3, 1, -24, -7, 21, -12, -16, 32, -20, -17, 14 
}, { -1, -5, 2, -2, -28, -26, 8, -2, 3, 47, 3, 10, 35, 32, -53, 5, 11, 13, 1, 23, -1, 2, 16, 8, 21, -49, 1, -3, -2, 4, 14, 13, 0, 24, -8, -4, -9, 1, -8, 21, 1, 26, -23, -2, 6, 21, 2, 12 }, { -12, -16, -4, -28, 5, -10, -20, 39, 26, -36, 17, -1, 4, 15, -20, -14, 46, -17, 3, -17, 13, 18, 13, 9, -7, 0, -7, -8, -18, 11, 6, -21, 25, 35, -9, 19, -15, -8, -6, -14, -13, 24, 21, 7, -14, -34, -7, 6 }, { 5, 0, 9, 15, 11, 8, -22, 18, 12, 75, -15, 3, -10, -13, 29, -13, 26, -13, -7, 15, 16, -14, 1, -2, -3, 20, 5, -12, 7, 1, 17, -32, 41, -13, 5, 0, 15, -2, -15, 27, -16, 18, 7, 0, 7, -7, 19, -7 }, { 1, -9, 11, 2, -2, 6, -19, -20, -57, -28, 5, -5, -13, -16, -14, -3, 4, 0, 41, -40, 18, 7, -13, -1, 6, -20, 2, -10, 31, -19, 28, -2, 8, -12, 4, -3, -8, 4, -17, 33, -15, 30, -7, 1, -2, -4, 33, 8 }, { 3, -1, -7, -17, -15, 22, 10, 17, 16, -10, -10, 14, 9, -37, 5, -8, 4, -12, -2, -7, 8, -34, 20, -13, 25, -57, 3, -5, -7, -4, -6, -13, 9, -37, 17, -12, -19, -7, -3, -17, -4, -8, -44, 8, -14, -22, -11, -32 }, { 2, 1, 6, -11, -3, 1, 46, 26, 3, 10, -24, 31, 12, 16, 15, 10, -41, -29, 6, -31, 11, 11, 2, 13, 0, -6, -4, -6, 18, -28, -10, -24, 3, 27, -15, 2, -1, 18, -8, 0, -14, -23, -4, -19, 32, -27, 14, 27 }, { 3, -2, 8, 14, 8, 1, 35, 28, 4, 6, 28, -54, -33, -10, -29, 7, -21, -28, -6, 33, 1, 9, -6, 0, -12, -14, -15, 4, 25, -25, -4, -10, 22, 6, 6, -1, -23, -14, 10, 5, -1, -16, 1, 13, -38, 13, 16, 4 }, { 2, -5, 7, 11, -14, -9, -3, 9, -8, -10, 11, -8, 45, 46, 24, 10, -3, -8, 3, -17, -4, -26, 43, -17, -12, 28, -2, 7, 28, -18, 2, 1, -6, 13, 4, -18, 23, -27, 13, 3, -7, 6, -15, 17, -32, 1, 22, -37 }, { -3, 3, -3, -15, 10, 2, -6, 10, 2, -48, 1, 28, 1, 6, 52, -4, 13, -13, 1, 60, -5, 3, -3, 13, 3, -16, 3, 6, 21, -14, 13, 6, 29, 12, -3, -6, -5, 10, 16, 8, 14, 16, -18, -14, 17, 44, 11, 7 }, { -5, -4, -4, -5, -2, 0, -17, 12, 19, -13, -60, -1, -5, -16, -50, 10, 22, -14, -29, -3, -9, -28, 12, -10, 3, 12, 5, 13, 42, -26, 4, 11, -17, -13, 2, 12, 6, 9, 33, -7, -2, 9, 21, -9, 14, 9, 17, 12 }, { -2, -4, 1, -7, -7, 6, 14, 
11, -10, -23, 17, 30, 23, -13, -55, -22, -24, -8, 27, 31, 5, -9, 1, -12, 18, 37, 5, -1, -16, 3, -1, -13, 30, -16, 5, 10, 39, -12, -14, 13, 2, -22, 16, 4, 22, 12, 4, -21 }, { 2, 1, 2, -7, 4, 8, -8, 4, 18, 20, 69, 37, 23, 2, -8, 13, 26, 14, -10, -10, -3, 21, -45, -28, -5, -1, -5, 4, 35, -22, -15, -1, -5, -31, -2, -1, -1, 7, 27, -14, -8, -6, -2, -7, 6, -8, 9, 4 }, { 3, 3, 2, -1, -2, 9, 4, 2, -8, 0, -9, 0, -2, -8, -2, 108, 7, 16, 31, 14, 1, -3, 2, 2, 3, -1, 9, 7, 3, 6, 11, -9, 31, 4, -1, 4, 10, -5, 3, -16, 1, 8, 11, 6, 2, -15, -27, -5 } }, { { 110, -27, -39, -16, 32, -23, -3, 8, -7, -1, -3, -1, 9, -3, -3, -1, -1, 4, -3, 0, 0, -1, 0, 1, 0, 2, -1, -1, 0, 0, 0, 1, 0, -1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 }, { 4, -66, 71, -34, 27, 17, 11, -47, 37, -5, -4, -14, 24, -9, 3, 2, -4, -8, 4, 5, -1, -2, -3, -2, 9, -3, -1, 1, 1, -1, -3, 3, -3, 2, 2, -1, -1, -1, 1, 1, 1, -1, 0, 0, 1, -1, -1, 0 }, { -29, 30, 9, -24, 67, -28, 8, 16, -46, 13, 0, -21, 52, -32, 1, 1, -5, 24, -29, 3, 3, 0, -5, 2, 10, -1, -6, 0, 1, -1, 1, 8, -8, -3, 1, 1, -2, 0, 4, -2, 4, -1, -2, 1, 0, 0, -2, 0 }, { -41, -77, -66, 44, 32, 16, 21, -7, -11, 8, 5, 8, 1, -10, 8, 4, 3, -3, -9, 3, 3, 1, 1, 3, 0, -5, 1, 2, 2, 2, -1, -1, -2, -2, 1, 1, 0, 1, 2, -2, 0, -1, 0, 0, -1, 0, -1, 0 }, { 0, -40, 33, -26, -15, 9, -1, 38, -39, 9, -2, 30, -42, 7, 9, 1, -16, 45, -39, 11, 0, -1, 2, 22, -40, 17, 0, 2, 0, -4, 5, 3, 0, -6, 3, 0, 1, 5, 2, -13, 4, 2, 0, 3, -6, 7, -3, -1 }, { -12, 13, -35, -69, 15, 30, -4, -36, -18, 49, 1, -6, -12, 33, -6, 0, 11, -35, 8, 8, 7, 2, -6, 10, -24, 23, -1, -1, 0, 1, 7, -24, 10, 2, 2, 3, -2, 0, 2, -4, 0, 7, 0, -2, -5, 1, 2, 1 }, { -13, -1, 17, 14, 41, -70, 8, 0, 14, -5, -1, 23, -28, -7, 8, 7, -9, -21, 45, -17, -2, 0, -2, 23, -37, 19, -7, 2, 2, -1, 1, -25, 37, -8, -4, -1, 0, 5, -2, -4, 1, 2, -3, -1, -8, 11, 2, 4 }, { 7, -3, 6, 3, -50, -49, 32, -42, -26, 32, -15, 41, 19, -25, -5, 2, 25, -23, -23, 19, 2, -3, -4, 25, 6, -26, 6, -1, 1, 6, 3, -5, -12, 4, 3, 0, 0, 1, 11, 1, -11, 0, 1, -4, 0, -2, -3, -1 }, { 
-16, 2, -35, -31, -20, -40, 22, -12, 54, -10, 20, -28, -2, -1, 7, 0, -6, 26, -12, 2, 3, 5, 1, 10, -34, 11, 7, 0, 1, 3, -18, 39, -30, 12, -3, 0, -1, 2, 13, -34, 15, 0, 1, 5, 4, -1, -4, -11 }, { 18, 38, 12, 39, 42, 37, 18, -55, -14, -35, 1, 21, -22, 7, -14, -6, 13, 5, -17, -4, -3, 0, -5, 27, -29, 7, 2, -5, -2, 2, -1, 20, -25, 6, -2, -1, -2, 2, 13, -18, 5, 1, 0, 3, 4, -4, -5, -6 }, { -13, -2, -3, -21, 25, -25, 15, 0, 9, 7, -18, 34, -35, 39, -14, 4, 1, 12, -2, -24, 7, -1, 2, -16, 27, -9, -6, 0, 2, -4, -1, 33, -35, 6, -1, 2, -2, 9, -32, 48, -26, 5, -2, -2, 25, -27, 4, 5 }, { 1, 17, -17, -22, 6, 39, -3, 7, 34, 27, -17, 45, 4, -49, 7, -9, -14, 26, 23, -28, 4, -2, -10, 29, 1, -36, 13, -2, -3, 1, -17, 20, 23, -23, -3, 1, -2, 1, 10, -1, -11, -2, 2, -6, 10, 13, -9, -1 }, { -2, -2, -7, -43, -11, 13, 62, 15, -28, -61, 27, -5, -17, -32, -1, -3, 30, -13, 19, -21, 0, 8, 14, -13, 0, -17, 8, -2, -1, 3, 15, -9, 8, -9, 0, 0, 3, 6, -8, 2, -13, 7, 1, 2, -3, 1, 1, -1 }, { 1, -13, 17, 5, 3, 3, -24, 30, -13, 24, -13, -10, -6, -17, 5, -7, 39, -34, 23, -32, 9, -1, 13, -3, -16, -1, 0, 5, 1, -4, 5, 20, -29, 2, 4, 0, 1, 1, 21, -38, 11, 3, -2, -13, 41, -44, 16, -27 }, { -13, -4, -8, -19, 17, -6, -55, 21, 27, -37, 22, 18, -5, -4, -13, 6, 19, -20, -24, 22, 0, 3, -16, 37, 4, -33, -2, 8, 3, -5, 26, -21, -20, 20, -2, 1, -3, -8, 29, 5, -32, 8, 0, 8, -11, -6, 4, 2 }, { 8, 9, 12, 16, 13, 14, 65, 54, 46, 30, -24, -16, 0, 21, -26, -3, 29, -17, -23, 14, -4, -5, -14, 14, -1, 3, -7, -4, 1, -3, 15, -8, 2, -10, -2, -1, -2, -7, 11, -3, -4, 6, -4, 5, -6, 13, -10, 1 }, { -9, -13, -9, -8, -20, 1, -8, 3, -1, -33, -8, 35, 67, 36, -29, 5, 21, 9, 5, -40, 2, 0, -9, 17, -9, 34, -30, 1, 1, -5, 11, 5, 9, -30, 3, 3, -1, 0, 5, -1, 15, -9, -4, 5, 9, -1, -11, 6 }, { -1, 12, -10, -7, 1, 9, 0, 1, 25, -15, -56, 22, 4, -29, 44, 8, 1, -10, -31, -16, 27, 0, 5, -19, -7, 22, -1, 1, -4, 6, 23, -30, -18, 18, 2, 2, -4, 14, -27, -2, 25, -11, 3, 23, -28, -15, 16, 2 }, { -7, -5, -1, -4, 8, -14, -12, -13, -2, -1, -30, 1, -9, 22, -47, 7, 1, 
18, -4, -22, 20, -6, 33, -32, 12, -36, 23, 1, 3, -3, 11, -12, 12, -3, -2, 3, 3, 7, 14, -41, 1, 11, 2, 14, -30, 29, -9, -46 }, { 1, 3, 4, -5, 6, 12, 17, 16, 9, 20, 44, 56, 13, -16, -39, 2, -21, -2, 21, 16, -21, -2, 17, -30, 7, 18, -18, -7, 1, 5, -5, -8, -18, 43, -19, -1, 5, -3, 8, -12, 22, -16, 1, 10, -16, -9, 22, -12 }, { 5, 2, 2, 15, -4, 0, -18, -22, 21, 34, 37, -19, -6, -37, -30, 2, 37, 33, -20, -18, -1, 3, 27, -8, -27, 5, -24, 8, 1, -9, 33, -11, 11, -15, 3, -5, 3, 22, -33, 17, -19, 2, 0, 14, 2, -9, 2, 11 }, { -6, 2, -1, -6, 15, -6, -17, 4, 10, -7, 2, 32, 10, 9, 23, -34, 56, 3, -3, 43, -33, 12, -10, -35, -9, -4, 21, 7, -1, -7, -4, 15, 18, -1, -5, -7, -3, 16, -38, -11, 18, 7, -5, -19, 11, 24, -15, -16 }, { -9, -7, -10, -9, -7, 0, -14, -1, -5, -22, -48, -17, -28, -46, -63, -10, 8, -1, 10, 38, 16, 4, -13, 8, 12, 29, -29, -2, 4, 1, -24, 12, 6, 8, 2, 4, -5, 7, -9, 8, 14, -8, -4, -17, 8, 10, 0, 2 }, { 1, 7, -4, 2, -2, 9, 11, 6, 9, -16, -8, 4, 26, 6, -19, 19, -49, -19, -19, 28, -6, 11, 18, 4, -32, -16, 9, 1, 9, -8, -5, -10, 21, -16, 17, -3, -8, 38, -26, -6, -20, 16, 1, -26, 30, -34, 29, -28 }, { 1, -5, 9, 6, 0, 5, -7, 17, -4, 0, 7, -1, 25, 8, -5, -38, 3, -31, -11, -17, 41, -12, 39, 7, -32, -4, 4, 2, -2, 14, -47, 19, -7, 28, -18, 6, 3, 18, -14, 15, -24, 13, -7, -10, -26, 32, 3, 24 }, { 0, 7, 1, -9, 9, -4, -7, -1, 13, -3, 28, 13, -27, -2, 11, 12, -4, -39, -51, -5, 8, -12, 38, -4, 36, 3, -27, -4, 2, 10, -17, 6, 15, -41, 3, 5, -3, 12, 13, -12, 29, -33, 4, -20, 16, 9, -22, 5 }, { 0, -6, 5, 2, -2, -1, -15, 8, -4, 9, 12, -3, -7, -15, -27, 13, -8, -34, -21, -39, -38, 32, -49, -6, -2, 6, 28, -7, 8, 29, -31, 7, -18, -21, 23, -3, 3, 9, -26, -1, 6, 10, -3, 18, -33, 1, -8, -8 }, { 3, 3, -2, -2, -3, 9, -2, 1, -8, 0, -24, -4, 3, -4, 25, 56, 3, -12, 7, 3, -38, 8, 11, -13, -14, -3, -62, 27, 0, 16, -5, 27, -4, 14, -20, -2, 4, 5, 3, -5, -40, 8, 4, 8, -6, 27, -22, -20 }, { 5, 3, 5, 6, 0, 3, 3, 6, -4, 7, 28, 7, 2, 5, 12, -3, -7, -5, 36, 34, 58, -9, -24, 7, 10, -5, -24, 14, -4, 0, 10, 15, 
-28, -30, 32, 2, -13, 36, -21, -11, -3, -1, -1, 30, -20, 6, -18, -28 }, { -3, -2, -6, -1, -7, -5, 4, -8, 0, 1, -6, -5, -4, -5, 10, -81, -32, -11, -11, -20, -35, 16, 10, 18, 29, 24, -28, 12, -8, -23, 22, -4, -7, -2, -2, 0, 0, 15, 9, -1, -18, 19, -7, 5, -5, 8, 0, -36 }, { -4, -4, -1, -5, 2, 1, -5, 6, -8, 2, -15, -19, 1, 18, -1, 0, 9, -1, 22, 9, -37, 10, 36, 56, 11, -32, 4, -22, -6, 10, 10, 17, -3, 1, -18, -5, -2, 35, -27, -1, 30, -42, 9, 29, -19, -8, 15, 11 }, { -3, -1, -3, 0, -1, 1, -10, 3, 0, -3, -8, 6, 5, -20, -9, 13, -17, -20, 11, 21, -17, -39, 31, -3, 0, 38, 49, -13, 0, -4, 30, 19, -22, -29, -27, 18, -2, 1, -1, 3, -11, 31, -8, 24, 9, -1, -40, 18 } } }, { { { -99, 31, 29, 8, -44, 39, -10, 7, 4, 2, 3, 0, -21, 19, 3, -1, -7, 6, 0, 1, 1, 0, -2, -6, 6, 1, 1, 0, -1, -3, 2, 1, 0, -1, -1, 2, 0, 1, 0, -1, 1, 1, 0, 0, 0, 0, 0, 0 }, { 11, 70, -13, 5, -2, -36, -25, 69, -21, -3, 1, 3, 2, -25, 5, -5, -24, 44, -9, -5, -1, 1, 1, 3, -15, 6, 1, -1, -2, -10, 18, 1, -3, 1, 1, -7, 2, 0, 0, -3, 5, 2, 0, -2, 1, 0, 1, 0 }, { -47, -1, 6, 12, 12, -28, 24, -14, -8, 0, 2, 3, 36, -66, 5, 3, 29, -16, -6, -1, 0, 0, 4, 26, -54, 11, 5, 0, -1, 22, -13, -3, -1, 6, 7, -26, 6, 2, -1, 11, -6, -2, -1, -8, 2, 3, -1, -2 }, { 2, 59, -28, 9, 2, -10, 0, 4, -25, -1, 0, 0, 4, 20, -7, 6, 42, -42, 0, -3, 0, -1, -2, -1, 32, -21, -6, -3, 1, 44, -42, 2, 1, -1, -2, 25, -18, -3, 0, 23, -18, -5, 2, 10, -7, 7, -3, 2 }, { -38, -3, 12, 8, 49, -37, 12, -10, -7, 4, -1, -7, 36, -14, 10, -2, -17, 1, 2, 2, 2, -3, 0, -6, 43, -21, -6, 4, 6, -34, 11, 4, 1, -1, -12, 49, -25, -6, 6, -29, 10, 4, 0, 23, -13, -13, 4, 6 }, { 0, 43, -49, 7, -7, -1, 17, -35, -5, 9, 1, -7, -4, 10, -10, -14, 15, -36, 19, 1, 2, -1, -2, -5, -7, 10, 5, 3, 8, -43, 18, 2, 4, 0, -1, -22, 25, 2, 11, -49, 27, -1, -1, -18, 19, -21, 7, -7 }, { -6, -36, -68, 26, -23, 48, 14, 28, -28, -2, 4, 15, 7, -13, 0, -5, 11, 13, -21, 2, 1, 1, -3, 23, -14, 2, -7, 2, 2, -4, 5, -5, -3, 1, 1, 25, -26, -1, 9, -16, 7, 0, -2, 26, -23, -12, 4, 8 }, { 37, 25, 48, -1, -55, -5, -14, 
-21, 23, -6, -3, 12, -1, -25, 16, -10, 7, -20, 11, 4, -2, 0, -2, 19, -21, 19, -7, 2, -2, -8, -10, 7, 4, 1, -6, 30, -23, -1, 10, -24, 5, 2, -8, 36, -27, -17, 5, 12 }, { -7, -20, -11, 4, -39, -57, 56, 9, 5, -3, 4, 6, -43, -4, 26, -10, 29, 14, 0, -2, -1, 4, 5, -41, 15, 14, 1, -3, -13, 14, 19, -5, -1, -3, -21, 11, 4, 0, -10, 8, 12, -3, -5, 4, 1, 5, 6, 1 }, { 2, 35, -28, 3, -1, 7, 28, -43, 38, 1, 5, -4, 8, 1, -12, 5, -19, 21, 10, -8, -1, -2, 5, 10, -9, 2, -10, -6, 14, -16, 40, -17, 1, 1, 8, 6, -18, -3, -21, 47, -13, 1, 2, 17, -27, 41, -16, 11 }, { -16, -17, -46, 6, -4, -20, -45, -9, 7, 6, 2, -4, -14, -12, 40, 18, -42, -23, 16, 2, 2, -1, -1, -15, -3, 42, 0, 3, 18, -11, -33, 14, 5, -2, -15, 9, 19, -4, 2, 12, -31, 5, -9, 13, 3, 10, -13, 7 }, { 17, 1, 0, 28, -37, 26, -5, -3, 8, -1, -6, -20, 42, -37, 26, -5, 1, 2, 4, 5, -4, 1, -15, -6, 27, -18, 8, -5, 4, -4, 7, 5, -2, 5, -29, 24, -4, 14, -11, 14, 3, -1, 7, -41, 48, 20, -2, -27 }, { -1, -22, 1, 28, -44, -47, -3, 0, -6, -6, 5, 21, -11, 6, -28, 17, -20, -11, -1, 2, -1, 6, 5, 13, 2, -51, 4, 7, 34, -25, -11, -1, 0, 1, 25, -20, -28, 8, 21, -6, -16, 0, 15, -24, -7, 9, -16, -10 }, { 15, 11, 32, 1, -8, 13, 54, -1, -49, -5, -7, 1, 10, 8, -5, 8, -11, 5, -34, 4, 1, -1, -5, 1, 6, 29, -3, 2, 41, -23, -14, -7, -2, -4, -8, 8, 35, -13, 22, 3, -32, 7, -9, 13, 20, 11, -21, 11 }, { -10, -13, -12, -89, -33, -12, 8, 29, -6, 19, -4, -32, 35, 15, 16, -12, 10, -12, 16, 2, 3, -2, -14, 22, 3, -3, 0, -2, 8, -13, 1, 8, -5, -5, 15, -3, -5, -2, 3, -1, 3, 0, 8, 0, -8, 23, -11, 3 }, { 4, -11, 24, 21, 21, 14, -27, 21, -30, 22, -1, 12, -21, -13, 5, -13, 35, -29, 30, -6, -5, 4, 7, -16, -5, 2, 1, -2, 25, -21, 28, -1, -6, 5, -7, -3, 0, -8, 8, 6, 26, -17, -1, 6, -15, 61, -19, 9 }, { -2, -15, 5, 43, -23, -33, -27, -12, -19, 1, -5, -2, 36, 47, 2, 7, 6, 4, -2, 8, -1, -3, -17, 39, 6, 14, 3, -4, -23, 5, 20, -7, 1, -10, 18, 2, 34, -19, -28, 7, 24, -7, -6, 21, 13, 11, 12, 17 }, { 0, -7, 2, 28, 2, 4, 36, 54, 47, 0, 0, -5, 12, 0, -49, -16, -13, -20, 46, 2, -2, -1, -8, 4, 
-2, -2, 11, -3, -8, -5, -19, 33, 1, 1, -7, 4, 21, -10, -7, 1, -8, 14, -15, 16, 10, 4, -3, 16 }, { -4, -5, -6, -33, -23, -7, -21, -16, -23, 21, -3, -4, 0, -29, -75, 39, -8, 0, -9, 9, 3, 2, 2, -22, -3, -6, 16, -7, 4, 6, 8, -1, 0, 7, -36, 15, 6, 4, -16, 7, 19, -6, -27, 15, 5, 1, 21, 6 }, { 6, -1, -3, 7, -20, 12, -13, -3, 9, -14, 21, -33, 23, -24, 10, -1, 12, 6, -11, -3, -4, 1, 28, -38, 25, -28, 17, -1, 6, 3, -1, -8, -2, 8, 14, -32, 29, -33, 8, -5, -5, -2, 15, 10, -24, -16, -4, 61 }, { -9, -2, -8, -10, 9, -8, -18, 20, 58, -37, -1, 6, 2, 5, -6, 32, 35, 1, -16, -15, -5, -5, -7, 15, 3, 8, 5, 3, 40, 6, 16, -34, -1, -3, -4, 20, 12, -6, 45, -6, 10, -21, -8, 9, 29, 1, -3, -5 }, { -7, -6, -8, -4, 0, -15, -29, -17, 5, 0, 0, 0, -1, 7, -24, -82, 16, 11, -28, 8, 4, -1, -2, -2, -9, -10, 27, 12, -20, -5, 1, -18, 9, 1, -19, 9, 2, 14, 13, -3, -33, 13, -19, 10, 13, 10, -38, 4 }, { 5, 1, 6, 33, 5, -7, 4, 6, 19, 59, -9, -76, -15, 5, -6, 10, 5, 7, -27, 25, -1, 0, -33, 1, -7, 13, -4, 9, 8, 11, -4, -7, 5, -7, -5, -7, -11, 7, 28, -4, 4, -6, 0, -7, -29, 0, 0, -7 }, { 4, 7, -1, -8, 4, 6, 9, 10, 23, 68, -10, 41, -21, -24, 21, -4, -10, -21, -28, 16, 10, -7, 5, 20, 19, -22, 25, -12, 4, -4, 1, -22, 11, -15, 30, 3, 12, 0, -17, 6, 0, -8, 2, 12, 29, -8, 8, 20 }, { 1, -9, 10, 7, -9, -3, -6, 29, 1, 18, 2, -1, 27, 14, -11, -3, 1, -39, -2, -25, 8, 3, 18, -17, -4, 23, -24, -6, -1, -17, 22, -52, 13, 19, -15, 0, -5, -2, -29, 18, -27, -11, 8, -12, -15, -50, -14, -20 }, { 0, -5, 5, 3, 6, 1, -7, -5, -22, -4, -8, -28, -28, -7, 11, 9, 30, 19, 40, -30, 4, 0, -33, 17, -6, -22, 32, -25, 27, -13, 16, 1, 4, -18, 6, 2, -1, 15, -16, 19, -29, 23, -13, 15, 9, -45, 1, 7 }, { -3, -6, -2, -6, -4, -2, -14, -8, -3, -2, 5, -8, -30, -45, -40, -10, 14, 9, 4, -8, -3, 5, -16, 21, 42, 32, -35, 6, -15, -4, 3, 5, -4, -14, 33, 15, 12, -39, -9, -4, -11, 3, 42, -10, 7, -4, -19, -15 }, { -1, 0, -4, 3, -6, -3, -10, -12, 12, 60, -14, 32, 23, 17, 1, 13, 37, 51, 23, -5, -1, 3, 24, -8, -9, -6, -33, 1, 8, 3, -6, 27, -8, 16, -13, 2, 5, -24, 23, 
-3, -15, 23, -1, -7, 12, -15, -11, 0 }, { -2, -1, 0, -2, 7, -2, -13, 10, 18, -23, -3, 2, -3, 4, 2, 2, 37, -5, -39, 49, 11, -10, 4, -5, -2, -1, -14, -40, 22, -36, -6, 21, 18, -2, -5, -8, 2, -17, -38, 0, -9, 42, -5, -7, -19, 13, 25, -18 }, { 0, -4, 1, 3, -2, -5, -5, -8, -2, 14, 43, 6, 15, 7, -11, -52, -18, 4, 0, -12, -5, 5, 4, -8, 4, 23, 20, -37, 55, 29, -2, 11, -14, -6, 12, 5, -2, -9, 17, 17, 15, -1, 14, 5, -5, -11, 43, -28 }, { -5, -3, -7, 1, -5, -1, 1, -3, -7, -17, -95, -9, -8, -8, -2, -33, -18, 4, 19, 6, 14, -19, 18, 8, 5, -10, -13, 0, 23, 18, -2, -17, 12, 10, 0, -4, 9, -26, 10, 9, 5, -15, -10, -8, -12, -4, 26, -4 }, { 2, -1, 1, 10, -6, 2, 0, 3, -2, 3, -16, -17, 8, -1, -9, 17, 15, 8, 4, -5, 2, -44, 68, -1, 23, 34, 39, -3, -14, -11, -3, 11, -1, -3, 33, -1, -6, 42, 8, -2, -3, 6, 3, 23, -1, 8, -8, -32 } }, { { 96, -64, -6, -21, 31, -6, 6, -30, 19, -2, -4, -3, 9, 2, -2, 1, 2, -14, 8, 2, -1, -1, 0, 1, 3, 0, -1, 0, 0, 0, -5, 2, 1, 0, 0, 1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0 }, { 32, 49, -54, 9, 45, -43, -3, 34, -38, 3, 1, -5, 39, -31, -4, -1, 4, 8, -10, -4, 1, -2, -1, 19, -14, -3, -1, 0, 1, 3, -1, 0, -2, 2, 5, -4, -1, -1, 1, 1, -1, 0, 1, -1, 0, 0, 0, 0 }, { -56, -6, 3, -1, 45, -11, 15, -56, 11, 11, 0, -18, 41, -20, 2, 5, 17, -53, 23, 2, 2, -4, -3, 13, -1, -3, 0, 1, 4, 7, -28, 11, 3, 2, 0, 4, -1, -1, 0, 2, -10, 2, -1, 2, 0, 1, -3, 1 }, { -6, 13, -81, 1, 10, 11, 6, -13, -4, 12, 6, -10, -32, 45, -1, -1, 6, -20, 16, -4, 3, 0, 2, -42, 46, -3, -3, 1, 2, -7, 3, 1, 2, -1, -22, 22, 1, -1, -1, -6, 8, -3, -6, 7, 1, -2, 3, 1 }, { -29, -69, -49, 26, 10, 58, 3, 13, -30, 7, 1, 7, 12, -2, -4, 3, -1, 15, -30, 8, 3, 1, -8, 28, -19, 0, 1, 1, 2, 5, -1, -8, -3, -1, 15, -10, -2, 0, 1, 4, -5, 0, 4, -2, -1, 1, -2, 0 }, { 28, 11, -11, 29, -61, 10, -16, 9, -16, -4, -4, 13, -6, -2, 0, 4, 22, -48, 9, 8, -1, 4, -10, 15, -8, 4, -1, 0, 4, 29, -61, 23, 1, -1, -2, 9, -5, 1, 5, 11, -31, 8, -7, 10, -2, 2, -9, 4 }, { -8, -28, -3, 37, -19, -61, 53, -21, -20, -13, 2, 35, -21, -17, 6, 4, 33, -1, -27, 0, 0, 
3, 22, -22, -2, -2, 0, 1, 1, 13, 10, -26, 2, 4, -7, 0, -4, 0, 1, 6, 5, -12, 1, -1, -2, 2, 1, 0 }, { 4, 28, -39, 1, -14, 1, 5, -33, 46, -9, 8, 16, -22, 14, -16, 1, 0, -8, 15, -10, 1, -2, 10, 22, -41, 5, 1, 0, 4, -5, 12, -14, 6, -4, 43, -54, 11, 3, -2, 9, -5, -3, 18, -24, 1, 7, -8, -5 }, { -18, -33, -35, -26, -19, -19, -14, 34, 37, 7, 10, -7, -10, -56, 28, -1, -4, 1, 31, -9, 0, 5, 9, -23, -31, 26, 7, -1, -4, 1, -1, 20, -6, 13, -31, -6, 9, 4, -2, 0, 6, 2, -16, -3, 4, 1, 4, -3 }, { 17, 10, 8, 84, 10, 25, -40, -27, 6, -5, 0, 12, 2, -28, 10, -5, -4, -9, 21, -3, -5, 2, 7, -5, -10, 4, 0, 0, -1, -22, 24, 1, 1, 13, -18, 7, -5, 3, 1, -26, 34, -11, -3, -4, 4, -10, 15, -5 }, { -8, 3, 10, 10, 59, 1, -39, -6, -1, -20, 1, 36, -39, -1, 3, 13, -29, 14, -13, -4, 3, -5, 28, -27, -5, 10, 1, 1, 3, 15, -28, 1, 5, 3, -7, -6, 7, -2, -2, 28, -44, 11, -11, 11, -4, 11, -19, 9 }, { 8, 22, 6, -36, 12, 57, -2, 9, 3, -24, 7, 8, 5, -23, 0, -3, 55, -16, -10, -22, 1, -2, 25, -12, -1, -4, -2, -5, -5, 39, 5, -33, -3, 18, -11, 5, -9, 0, 0, 16, 19, -33, 5, -3, -5, 8, 7, -2 }, { -5, -1, -10, 20, 4, 7, 42, 29, 52, -54, -1, 26, 24, 3, -34, 1, -11, 12, 22, -18, -2, -4, 3, 16, 8, -17, -5, -1, 0, -5, 2, 10, -4, 2, -5, 31, -21, -3, -3, 3, -9, 14, -13, 31, -14, -4, 0, 12 }, { 0, 15, -23, -14, -18, -5, -23, -39, 29, -2, -11, -2, 8, -10, 37, -1, -3, 17, -39, 21, 5, -8, -21, 25, -7, 24, -1, -1, -2, 3, 12, -34, 18, -17, -3, 38, -8, -1, -4, 7, -5, -3, -18, 45, -13, -5, 3, 18 }, { -16, -25, -9, -8, -3, -31, -56, 21, 8, -29, -1, 28, 15, 10, 37, -5, 4, -16, 9, -7, 4, -8, 10, 7, 37, -28, -1, 3, -4, -3, -5, -9, 6, -13, 39, 6, -20, -1, 5, -13, 8, -14, 40, -12, -3, -5, -3, -5 }, { -2, 1, -10, -37, -34, 7, -6, -32, -47, -42, -1, 26, 45, 3, -10, 10, -34, -10, 1, -10, 7, -3, 26, -12, -10, 12, 1, 5, 20, -36, 6, -4, -1, 19, -20, -3, 12, 1, 6, -15, -5, 1, -11, -2, 8, -2, -10, -1 }, { 8, -1, -3, 31, -23, 4, -1, -5, 17, -2, 19, -55, 31, -12, 14, 2, -11, 16, -5, -18, -2, 3, 5, -18, 26, -12, 1, 0, 14, -4, 10, -23, -2, 13, -8, 4, -3, 
1, -2, 37, -38, -3, 22, -22, 10, 34, -41, -2 }, { -3, -4, 2, 11, 9, -7, 3, -13, -26, -60, 0, -26, 0, 39, 45, -11, 20, 25, 26, -29, -6, 2, -25, 12, -12, 32, -3, -3, -18, 25, 6, 20, -18, -6, -2, -8, 20, -1, -8, 14, 8, 6, -7, 0, 7, 5, 8, -2 }, { -8, -18, 4, 10, -7, -31, -37, 11, 3, 14, 12, -14, 14, 35, -50, 40, -18, -14, 4, -21, -4, 10, -3, 9, -11, 16, -6, 3, 1, 26, 3, -25, -5, 11, -8, 1, 10, -10, -7, 21, 24, -43, -14, 17, -11, 12, 18, 5 }, { 7, -3, 12, 0, -1, 6, 11, 6, -25, 50, 19, 15, -2, 7, 26, -21, 5, -6, 36, -45, 8, -6, 27, 12, -9, 11, -3, -1, 2, -19, 18, -5, -6, 0, 23, 6, -5, -9, -4, 2, -6, -1, 2, 43, -29, 14, -25, 39 }, { 3, 2, 11, 18, 14, 9, 25, 47, 18, -3, -26, 14, 25, 20, 25, -7, -8, -45, -5, 22, 7, -15, -5, -15, 4, 43, 0, -4, 3, -15, 2, -30, 24, -14, -1, -12, 45, -9, -3, 1, -1, -13, -4, -2, 22, 10, -9, 3 }, { -5, -10, 3, -6, 5, -2, -16, 6, -18, -33, -12, -23, -40, -40, -33, 14, 28, -17, 12, -13, -3, 5, -35, 7, 14, -4, 14, 1, 33, -27, 23, -13, 10, -35, 12, -2, 19, -4, 14, -8, -3, 3, -14, 15, 7, 15, -23, 19 }, { -7, -10, -4, 3, -7, -12, -29, -1, -3, 6, -45, -1, 18, 22, -23, -47, 35, 18, 24, 32, 8, -21, 16, -6, -14, -35, 22, -5, -13, 6, 19, -4, 19, 13, -18, -18, 2, 4, -9, 16, -5, 10, -30, 5, 0, 18, -12, 10 }, { -2, -4, 6, 1, 10, -10, -13, 9, 14, -7, 67, 0, 5, 26, -1, -51, 12, -24, -40, -17, -1, 31, -19, -1, -21, -16, 13, 4, 1, -11, -4, -13, 3, 6, -19, -8, 2, 8, 17, -24, -9, 14, -26, 0, 4, -12, -12, 3 }, { 3, -3, -1, 12, -11, 4, 0, -1, -2, -21, 29, -47, 7, -13, -13, -5, -7, 3, -8, 15, 8, -34, 39, -28, 7, 1, 16, 1, -16, 15, -16, 1, 12, -11, 31, -21, 24, -6, -1, -16, 5, 7, 7, 24, -11, -39, 25, 41 }, { 3, 6, -4, -1, -7, 9, -5, -13, 7, 14, 20, 43, 32, -12, 9, 5, -2, 4, -7, 7, -23, 22, -20, -25, 22, -25, 26, -7, 14, 14, 12, 25, -34, -32, 14, -17, 29, -7, -17, 29, 17, 9, -13, 7, 13, 22, 21, 27 }, { 1, -1, 1, -5, 0, 4, 8, -3, -27, -12, 23, 7, -1, -18, -18, -33, -42, -21, 29, 20, 1, 9, -35, -27, -14, 8, 3, 2, -33, 23, 18, -15, 29, -29, 5, 11, -27, 20, -24, 21, 3, -4, 
18, -4, -22, 6, -6, -14 }, { 0, -4, 2, 3, -9, -2, -15, -7, 5, 19, 5, 21, 12, -12, -29, 4, 21, 1, -18, -51, 46, -32, -10, 3, 28, 27, -6, -4, -36, 5, 17, 9, 10, -16, -4, 4, 13, 5, -25, 5, -13, 36, -4, -14, 21, -19, 0, -22 }, { 1, 1, -3, 1, -3, 0, 16, -6, -2, 4, -33, -7, -19, -10, 12, -33, -49, -11, -12, -51, 12, -27, -5, 17, 4, -30, 48, 0, 8, 14, -20, -6, -6, -3, -1, 1, 2, 18, 5, 6, 6, -35, -19, -2, 21, -2, 19, -3 }, { -6, -1, -7, -3, -1, -6, -7, -9, 8, -6, -25, 0, -7, -14, -33, -47, -9, -8, -25, 6, -28, 22, 14, 14, 25, 41, 0, 8, -39, 2, -3, 16, -42, 18, 7, 11, 15, -2, 2, -12, 10, -11, 26, 2, 5, 18, -29, 20 }, { 0, 1, -2, 3, -1, 3, 0, -9, 8, 19, -15, 11, 26, 7, -14, 1, 10, 41, 18, -22, -18, 1, -8, -39, -25, 9, 1, 16, 2, 27, -26, -14, 5, -46, 3, 15, 1, 6, 46, -37, 1, -13, 10, -3, 19, -11, -31, 2 }, { -2, -2, -4, -3, -5, 4, 1, 0, -7, 6, -18, -4, 2, -7, 6, 1, -3, 2, 6, -29, -68, 47, 26, 16, 17, -11, -12, 0, -24, -6, -7, -20, 51, -11, -4, -10, 11, -9, -27, 5, -14, 13, -17, -5, 6, -29, 13, -3 } }, { { 110, -29, -41, -11, 27, -27, 0, 6, -2, -3, -2, 0, 6, -3, -3, 0, 1, 0, 0, 0, -1, 0, 1, 0, 1, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 8, 100, -24, 8, -13, -40, -23, 47, -27, 0, 2, -1, 1, -5, -2, -5, -2, 14, -4, -5, 0, 0, -1, 1, 0, -1, -1, -2, 0, 2, 1, 1, -1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -32, -11, 13, 1, 46, -79, 38, -9, -10, 3, 0, -6, 42, -53, 15, 4, 7, -4, -1, -1, 1, -2, 5, 9, -14, -1, 2, 2, 1, 0, -1, -1, 0, 2, -1, -1, -2, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -26, 0, -92, 43, 0, 13, 42, -16, -15, 8, 7, 3, -6, 8, -10, -4, 35, -33, 6, -1, 2, 1, -1, 0, 0, -1, -2, 0, 5, 7, -10, -1, 1, 0, 0, 0, 0, 0, 2, -1, 0, -2, 0, 0, 0, 0, 0, 0 }, { -28, -48, -40, 26, 21, 15, -39, 54, -18, 3, 4, 7, -6, -22, 25, 6, -38, 45, -18, 1, 1, 2, -1, 2, -13, 8, 4, 1, -1, -10, 12, 2, -4, 0, 0, -4, 0, 1, -1, 0, 0, 3, 0, -1, 0, 0, 0, 0 }, { 12, -22, 1, 20, -91, -33, 13, 11, 28, -9, 0, 18, -17, -27, 31, -5, 8, -4, 5, 4, -2, 5, 0, 7, -37, 26, 2, -1, 0, 1, 
1, 0, 2, 1, 0, -11, 2, 5, 0, 0, 0, 0, -2, -1, -1, 0, 0, -1 }, { -17, -26, 0, 1, -14, -58, 30, 28, 4, -2, 3, 10, -37, 50, -19, 16, -23, 8, 2, -1, 0, 2, 6, -36, 52, -25, 2, 3, 2, -10, 6, 0, 0, -5, -5, 14, -3, -2, -1, 1, 0, 1, 2, 0, 1, 1, 0, 0 }, { 11, -11, 30, -11, -3, 30, 58, 54, -45, -4, -4, 14, -9, -15, -17, -24, 38, 18, -33, 3, -1, 2, 1, -2, -3, -6, -2, -2, -15, 34, -5, -6, -5, 0, 0, -1, -1, -1, 2, 9, -1, -1, 0, 0, -1, 0, 1, 0 }, { -11, -22, 5, -10, 8, -20, -57, 17, -9, 5, 0, 2, -10, 12, 40, -13, 41, -39, 13, 2, 1, 1, 0, -8, 14, -13, 10, 1, -18, 50, -48, 14, 0, 0, -4, 7, -5, 1, 1, 10, -8, -5, 0, 0, 0, 0, 1, 0 }, { 15, -7, 32, 89, -1, -2, -23, -3, -6, -35, -5, 16, 14, -8, -41, 13, 3, -5, 2, -1, -8, 1, -4, 22, 5, -35, 6, 3, 3, 5, -8, 2, 1, 2, 7, 13, -24, 1, 0, 3, -4, 1, 3, 4, -6, 0, -1, 0 }, { -13, -18, -25, -44, -41, 3, -11, 32, 22, 21, 1, -22, 56, -12, -47, 20, 2, -9, -5, 8, 3, 1, -13, 30, 7, -33, 7, 2, 4, 0, -9, 4, -3, 3, 6, 11, -20, 4, 0, 1, -2, -1, 1, 3, -4, 0, 1, 0 }, { 6, 16, -5, 15, 12, 22, 23, 23, 63, -16, 2, -27, 6, -24, 42, -37, 3, 7, 12, -8, -2, -2, -6, -13, 31, -21, 3, -7, -9, -1, 13, -2, 0, 2, -21, 40, -30, 6, -4, 2, 5, 1, 0, 9, -3, 1, 1, -1 }, { -10, 3, 2, 5, 37, -5, 5, 42, 62, -15, 3, -12, -26, 6, -45, -2, 3, -12, 26, -12, -2, -3, -6, 13, -30, 18, -3, 3, -20, 17, -4, 6, -2, -2, 17, -37, 29, -10, -9, 11, -1, 1, -1, -7, 6, 2, 2, 1 }, { 7, 4, 13, -10, 9, 19, 5, 28, -11, 16, -3, 16, -39, -53, -7, 24, -9, -44, 35, -7, 0, 2, 3, -10, 8, -3, 0, -3, 42, -33, -21, 25, -4, 2, -4, 6, -5, 2, 25, -30, 4, 1, 0, 2, -3, -5, -2, -1 }, { -10, 1, -18, -27, 3, -9, -29, -34, 16, -5, 7, 31, -40, -44, -37, -14, 34, 35, -3, -6, 3, 1, 27, -23, -4, -29, 26, 1, -8, 5, 21, -12, 2, 6, -3, -5, -19, 13, -5, 1, 11, -4, 2, -5, -9, 2, 2, -1 }, { 4, 14, -10, -4, -3, 12, 13, -17, 12, -12, 6, -2, -11, -32, 0, 64, -25, 12, -16, 5, 1, -3, 11, -20, 11, 14, -21, 4, -5, 38, -36, 6, 0, 4, -12, 11, -3, 1, -19, 47, -40, 9, -2, 4, -2, 8, -4, 0 }, { -6, -11, -2, -3, -9, 2, -22, 11, -5, -60, 17, 
-23, 23, -10, -11, 24, 46, 0, -4, -6, -1, 1, 4, -30, 21, 27, -21, 1, 15, -13, 17, -13, -1, 9, -31, 7, 38, -25, 15, -14, 12, -8, -7, -2, 20, -1, 0, 2 }, { -9, 6, -12, -36, 11, 4, 14, 6, -1, -76, 10, 22, -6, 25, 34, 24, 3, 0, 0, -14, 2, -1, -5, 32, -20, -26, 4, -8, 28, -3, -4, -12, 3, -9, 27, -9, -24, 13, 15, 2, -6, -3, 9, 2, -18, 0, -3, 0 }, { 4, 8, 10, 15, 19, -3, -7, 14, 42, 49, -17, 21, -7, 19, 17, 44, 43, -1, -51, 6, 6, -7, 14, 10, -10, -3, -21, -9, 23, -6, 12, -26, 5, 3, 1, 8, -5, -8, 16, -5, 6, -12, -3, 12, -8, 1, -1, 2 }, { 8, 2, 5, 13, -5, -1, 4, -8, -3, 5, 11, -75, -32, -12, 14, 12, 7, 7, -28, 24, -5, 0, -13, 2, 5, -34, 41, -3, 22, 0, -7, -10, 6, -4, 15, -31, 16, 7, 12, -3, -8, -2, 11, -35, 22, -2, -3, -5 }, { 4, 8, -3, 5, -4, 16, 5, 10, 16, -2, -8, 50, 44, -3, 17, -5, -17, -23, -10, -5, 5, -5, 41, -28, 14, -32, 21, -9, 2, -4, 0, 3, -3, 10, 8, -37, 29, -14, -10, 10, -3, 0, 18, -39, 31, 4, -2, -4 }, { -7, 2, -8, -4, 6, -10, -5, -5, 23, -23, -1, 8, 1, 4, -23, -50, -24, -9, -54, 36, 2, -6, 14, -7, -3, 20, -1, -1, 21, 2, -38, 7, 10, 7, -10, 6, -1, 1, 40, -27, -27, 20, -2, 6, -9, -24, -4, 1 }, { -2, 2, -7, -8, -3, -1, -8, -5, -3, -15, -5, 9, -25, -33, 6, -6, -22, -39, -29, 23, 2, -4, -9, 36, 43, -5, -35, 27, -31, 3, 17, -26, 16, -12, 26, -2, 10, -19, -21, -7, 26, -18, 13, 3, 2, -10, 12, 6 }, { 4, 6, 3, 3, 4, 7, 7, 11, -5, -9, 13, -14, 8, 12, 0, 35, -20, -34, -12, 11, -7, -4, 25, -27, -30, 7, 29, 17, -43, 12, 10, -10, 2, 19, -24, -4, -23, 40, 1, -27, 33, -14, -7, 0, -15, -26, 24, -4 }, { 2, 6, -5, 3, -4, 4, 11, -11, 12, 2, -5, 6, 7, 3, 15, 19, 17, 53, 14, 4, -4, -1, 2, 3, 2, -25, -28, 32, -29, 2, -34, 42, -10, -8, 14, -8, 10, -15, 16, -47, 16, 11, 8, -5, 8, -37, 22, -2 }, { 1, -7, 7, 5, -4, -1, -7, 6, -8, 9, 44, -28, 2, 4, 0, -21, 0, -6, 10, 8, -3, -4, 39, -19, -20, -25, -78, 7, 12, -8, 5, -2, 1, 20, 11, -17, -20, 6, -6, 10, -5, 0, 24, -18, -30, 10, -7, -6 }, { 6, 1, 6, 4, -3, 0, 2, -9, 8, 21, 90, 15, 4, -4, 1, -3, -15, -3, -26, -55, -23, 16, -16, 8, 14, 4, 14, 
-2, 5, 22, -2, 4, -9, -5, 4, -6, 3, -20, 19, -7, 9, -7, -6, 11, -4, -5, 4, 10 }, { -1, 1, -1, -7, 3, -3, -2, -3, 0, 2, -7, 12, -12, 4, -1, -8, -13, -12, -14, -26, 8, 29, -37, -1, -41, -41, -25, 46, 3, 0, 0, -13, 8, 11, -42, 24, 13, 8, -1, 1, -11, 1, -19, -23, 40, -2, -10, -26 }, { -3, -6, -5, 4, -10, -3, 0, -8, -2, 1, -52, -25, 0, -4, -6, -6, -26, 6, 5, -58, 20, -1, 23, 1, 11, 9, -16, -6, 20, 53, 4, -17, -13, 3, 8, -9, -12, 0, 43, -8, 18, -26, 6, -11, 1, -6, 5, -9 }, { -1, 0, 3, 3, 7, -2, -5, 3, 8, 2, 4, 16, 10, -1, -6, 4, 20, 3, -14, -13, -5, 11, -30, 0, 35, 46, -15, 1, -7, -15, -3, 9, 3, -13, 7, -16, -30, 62, 0, -1, -5, 5, 26, -49, 10, 0, 2, -38 }, { -1, 3, -1, 0, 6, 1, 1, -1, 4, 7, 16, 25, 11, -3, -4, 3, -16, 15, 38, 57, -15, 10, -39, -21, -7, -5, -17, -41, 21, 42, 1, -33, 21, -6, -1, -1, -5, -3, 18, -4, 21, -19, -4, -10, 17, -6, 18, -9 }, { -4, -3, -4, -4, -1, -7, -5, -9, -2, -12, -16, -10, -12, -2, -3, 0, -4, -17, -36, -11, -13, 18, -10, -2, -14, -20, -26, -79, -4, -1, 10, 49, -22, -5, -11, 1, 3, 4, -24, 4, 25, 17, -6, 0, 6, -8, 30, -1 } } }, { { { 108, -42, -38, -25, -12, -7, -4, 9, 9, -7, -2, 12, 10, 2, -2, -1, 2, 0, 3, 7, 0, -1, 2, -3, -3, 1, 1, 0, 0, -3, -3, 1, 0, 0, -1, 0, 0, -1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0 }, { 32, 13, 92, -14, -32, -14, 1, -20, -48, -28, -12, -8, 7, 1, -3, 0, -3, 8, 26, 14, -5, -3, 10, 15, 5, 3, 0, 2, 1, -6, -1, 3, -1, 2, -3, -8, -2, 0, -1, 0, -2, -1, 1, 2, 0, 0, 1, 0 }, { -41, -9, -6, -63, -46, 12, 4, 21, 5, -5, 12, 54, 41, 3, 2, 1, 2, -1, 17, 29, 6, 2, 1, -22, -17, 2, 1, 0, -3, -19, -23, -3, 3, -1, -8, -1, 0, -3, -2, 6, 8, -1, 4, 4, 1, 0, -1, -1 }, { 26, 73, 21, -46, 27, -16, -12, -19, 26, 6, 8, 27, -11, -13, -4, -4, 0, -4, -31, -16, 4, 3, -24, -35, -2, 6, -2, 0, 3, 12, -4, -11, 2, -4, 13, 24, 5, -2, 1, 1, 12, 8, 0, -7, 1, -1, -7, 0 }, { 4, -47, 59, 38, -31, 9, 5, 17, 37, 33, -6, 15, 11, 0, 2, 2, -1, -16, -33, -29, -11, -5, -15, -32, -21, -5, 0, -3, -1, 4, 4, 2, -2, 3, 13, 21, 11, 0, 4, 8, 8, 4, -3, -4, 0, -4, -5, -1 }, { -12, 1, 
-17, -38, -62, -28, 11, -15, 1, 27, 3, -14, 1, 19, 2, 2, 7, 19, -14, -38, 6, 3, 0, 23, 23, 4, 3, 1, -4, 31, 48, 10, 0, 4, 20, 5, -7, 0, 2, -16, -23, -1, -18, -13, -2, 1, 2, 4 }, { 16, 32, -19, 45, -24, -25, 6, -22, -29, 61, 32, 16, 15, 17, -3, 0, 0, 19, 29, -5, -6, -9, -26, -11, -7, -4, 3, 1, -16, -31, -18, -10, -3, 2, 4, -11, -11, -4, 1, 11, 10, 8, 4, 13, 8, 2, 1, -4 }, { -17, -63, 1, -6, 23, -15, 5, -49, -31, -17, 28, 20, -29, 8, 12, 2, 7, 21, 6, 13, 6, 5, -29, -24, 20, 8, -1, 1, 4, 13, -7, -12, 2, -1, 25, 27, -6, -3, -4, -5, 18, 17, -6, -12, 2, 0, -12, 1 }, { 4, -22, 37, -22, 27, 6, 3, -3, 45, 40, 40, 12, -18, -2, 1, 0, -3, 8, 3, 11, 16, -6, -4, 28, 24, -2, -3, -2, -11, -4, -5, -18, -4, -6, -22, -43, -28, -1, -4, -20, -24, -6, 5, 13, 5, 14, 20, 5 }, { -12, -4, -2, 20, 4, -74, 6, -38, 61, -28, -42, 18, 16, 19, 7, 1, 10, 16, 8, 10, -8, 3, 18, 4, -4, -4, 0, -8, -10, -9, -7, 3, 3, -4, -12, -6, -3, -5, 2, 2, 1, -4, 7, 3, 2, 1, 0, 0 }, { 1, -6, 9, -11, 56, 17, 15, 7, -8, -15, -7, 18, 46, 22, -6, -2, -15, 20, 29, -41, -9, 10, -17, -7, -17, -17, -4, -2, -24, -19, 35, 18, -1, 6, 24, -4, -11, -2, 10, 3, -25, -9, -26, -2, 11, 1, 6, 2 }, { -13, -25, -1, -51, 10, -11, -8, -16, 1, 23, -22, -45, -16, -16, 1, 4, -2, -3, 6, -37, -25, -9, -13, 7, -8, 1, 3, 7, -3, -33, -12, 9, 2, 13, 0, -11, 9, 8, 8, 37, 29, 8, 25, 35, 11, -6, -1, -18 }, { 0, 3, 8, -6, -1, -32, 31, 42, 10, -29, 33, -56, 10, 40, -8, 1, -10, -1, -11, 5, 40, 10, -22, 2, -12, -25, -1, -4, -11, -4, -18, -29, 5, -5, 11, 4, 3, 9, 9, 2, 15, 20, 0, 1, 4, 1, -6, -1 }, { -6, -18, 4, -1, 21, -47, -33, -12, -24, 11, 44, -2, 32, -23, 0, 7, 5, -22, -8, -12, 36, 13, 25, -3, -34, 5, 10, 6, 14, 5, 18, 6, -9, -11, -23, 3, 30, 6, 5, 12, -18, -22, 5, 0, -14, -13, 11, -2 }, { 2, 0, 13, -16, 31, -20, 23, 30, 4, 60, -12, -10, -4, 10, 3, -2, 6, 7, 31, 39, -5, 5, 27, 11, 7, 6, -7, -1, 9, -3, -2, 24, 3, 1, 13, 29, 15, -3, -12, 4, 13, -7, -18, -32, -18, -21, -34, -1 }, { -6, -2, 2, 11, -4, -28, -57, 39, 19, -14, 7, -14, 26, -49, -4, 4, 13, 
0, 37, 1, -10, -8, -41, 9, 27, 19, 7, -1, 9, 3, -2, -4, 0, 11, 31, 2, -16, -6, -12, -11, 0, 9, -16, -3, 3, 5, -4, 1 }, { -5, -1, -5, -7, 7, -46, 28, 25, -21, -6, 7, 25, -31, 4, -5, 2, -11, -46, -14, 25, -50, -35, -24, 5, -16, -12, -2, 2, 12, 3, 12, 9, -10, 16, 14, -12, 5, 10, 3, -5, -24, -8, -8, 10, 2, 8, 22, 3 }, { 3, 5, -5, 6, -26, 18, 12, -37, 38, -9, 10, -3, -35, -30, -11, 3, -25, -26, 42, 20, 31, 14, -12, 10, -17, -19, 5, 12, -2, -24, 18, 5, -1, -1, 27, 10, 18, 17, 12, 20, -14, -10, -29, -2, 3, -13, -1, -5 }, { -4, -7, 0, -4, -7, -25, -6, 31, -23, 14, -43, -8, -43, -20, -7, 1, -2, 18, 2, 0, 30, 27, -4, -51, -3, -1, -4, 3, -22, -18, -9, 4, 6, -25, -11, 6, -17, -6, 5, -11, -28, -23, -7, -1, 19, 18, 22, 17 }, { -5, -11, -6, -10, 8, 2, 0, -26, -9, 26, -43, -10, 33, -14, -32, 4, -41, -9, 11, 6, -2, -19, -12, 8, -14, -17, 5, 11, 7, 21, -13, -51, -2, -13, -9, 8, 15, 11, -1, -28, -6, 27, -1, -23, -11, 19, 9, 26 }, { 3, 6, 0, -8, -1, 18, 16, -17, 12, -2, 2, -42, 7, -2, 48, 0, 52, 9, 15, 15, -4, -44, -17, -25, -34, 17, -9, -16, -6, -8, 7, -14, -18, 3, -4, 13, 16, -16, -6, -13, -28, -9, 1, -8, -11, 4, 20, 11 }, { 5, 3, 0, 6, 2, -7, 60, 12, -5, -3, -16, 21, 2, -16, 28, 6, 15, -36, 39, -45, 20, 28, -24, 7, 12, 20, -6, 3, 33, 14, 0, -13, 9, -15, -16, -7, 2, -2, -17, -8, 5, 4, 24, 6, -10, -3, 0, -7 }, { 1, -1, 0, -6, 2, 2, -28, -13, -17, 14, -18, 4, 5, 13, 48, -6, 29, -58, -19, 1, 2, 33, 4, 34, -12, -15, -10, -16, -21, -18, -20, -18, 10, -6, 24, -8, -19, -2, 7, -3, 5, 6, -34, -13, 13, 12, -4, 10 }, { 0, -2, -4, -4, -7, 1, 17, -28, 9, -7, 26, -36, 20, 3, -14, -6, -25, -24, -10, -13, -30, 22, -3, -27, 18, 13, -2, -1, 23, -21, -43, 32, 9, 6, -11, -6, -15, -10, -26, -15, -8, -35, -22, -23, -8, 12, -1, 22 }, { -3, 0, 2, -2, -5, 4, -34, 17, 7, -12, 3, 15, -33, 53, -7, 0, -3, 6, 16, -32, -21, 3, -6, 20, 7, 27, 14, 9, 9, -33, -11, -37, -20, -25, -9, 16, 32, 4, -8, 15, -14, -14, -5, -22, -26, -9, 13, 12 }, { -1, -1, -5, -2, -8, -10, 19, 8, -3, -16, 28, 3, -7, -46, 10, -1, -10, -3, 9, 
-27, -30, -22, 24, 6, 10, -25, -15, -18, -46, -4, 3, -14, -26, -34, -27, 16, -17, -7, 15, 8, 16, -1, 1, -32, 3, -9, -28, 16 }, { 0, 1, 2, -4, -7, 5, -35, -1, 6, 4, 10, -4, -13, 20, 12, -14, 6, 2, 34, 2, -24, 21, -36, -1, -14, -56, -14, -12, -5, 41, -1, 26, 34, -4, -34, 8, 15, 17, 4, -17, 3, -20, 26, -2, -8, 2, -7, 3 }, { -1, -3, 2, 0, -2, 0, 9, 1, 3, -6, 14, -1, -20, -15, -39, -7, 31, 21, -6, 1, -26, 28, -2, 15, -68, 25, 0, 5, 8, -11, 24, -1, 28, -17, -7, -27, -8, -11, -13, -15, 14, 31, -11, -13, -4, 14, -20, 15 }, { -5, -6, -4, 0, -5, -7, -11, -2, -7, -2, -13, 8, -7, -2, -14, -63, -13, 6, -6, -13, 32, -25, -16, 7, -9, 2, -73, -29, 20, -6, -7, -3, -12, 14, -3, -24, 4, -13, -9, 7, -4, -7, -11, -5, -10, -21, -18, -5 }, { 1, 2, 0, 3, -2, -8, -2, 9, 4, -3, 20, 9, -15, -13, 23, -15, -23, 8, 17, -23, 0, 12, 47, -3, -5, 1, -19, -8, -12, -16, -12, 5, 19, 55, 19, 15, 28, 7, -4, -27, -5, 40, 19, -4, -14, 34, 30, 12 }, { 1, 3, 1, -2, 1, -3, -4, 6, -4, -7, -8, 13, -18, -4, 21, 41, 11, 19, -2, -28, 25, -36, 1, -9, -11, -37, 27, 18, 15, 1, -30, 17, 3, 25, -12, -42, 1, 11, -10, -3, 4, 10, -39, -19, -24, -11, -22, 16 }, { 2, 2, -1, 4, -2, -1, 19, 6, -2, 1, 3, 0, 18, -34, 32, -16, 2, 49, -26, 13, -31, 30, -10, 5, 19, -6, -15, 10, 17, 1, -15, -19, 15, -18, 0, -17, 35, 38, 22, 24, -17, 1, -32, 5, -6, -8, 16, -4 } }, { { 76, -45, -77, 23, 32, 0, -3, 11, 13, 0, 1, -18, -18, 0, 0, -1, 2, 0, 5, 6, -1, 0, -2, 5, 5, -2, 0, 0, 0, -4, -4, 1, 0, -1, 1, -1, -1, 1, -1, 1, 2, 0, 0, 0, 0, 0, -1, 0 }, { 37, 11, 2, -51, -59, -14, -3, 23, 65, 22, -1, -2, 8, 3, -3, 0, 5, -14, -41, -19, 2, -3, 7, 23, 12, 1, 1, -3, -1, 8, -1, -2, 1, 2, -2, -12, -5, 2, 1, 1, 4, 0, 0, 3, 0, -1, -2, 0 }, { -52, 25, -12, 26, 11, -9, 1, 15, 31, 23, -5, -65, -44, 1, 0, 1, 0, -2, 8, 7, -2, -1, 8, 42, 28, -2, -1, 0, -4, -24, -23, -1, 1, 0, 3, -7, -3, 5, -1, 11, 11, 0, -4, -3, -2, -1, -2, 1 }, { 38, 0, 33, -22, -34, -19, 2, 19, -12, -1, -17, -14, -10, -3, -1, 1, -1, 6, 55, 45, -3, 2, -2, -17, -2, 3, -1, 2, -5, -43, -35, 0, 0, 
-2, 17, 35, 13, -1, -5, 9, 5, -4, -7, -18, -3, 1, 3, 4 }, { -38, 2, -33, -12, 8, -3, 6, 28, 45, 20, -10, 11, -18, -13, 3, 1, 2, 2, -4, 6, 13, 6, -21, -57, -17, 3, -2, -1, -4, 13, 15, -4, 0, -2, 21, 49, 19, -2, 0, -22, -29, -6, -1, -13, 0, 8, 14, 0 }, { 31, 7, 39, 3, 0, -19, 13, -4, -9, 7, -12, -43, -36, -15, -3, 1, -2, 24, 16, -4, 15, -8, -11, 7, 2, -1, -1, 2, -3, 31, 50, 3, -3, 4, -5, -14, 2, 4, -1, -34, -46, -7, 18, 26, 6, 7, 11, -10 }, { 30, 11, 38, 13, 30, -5, -12, -27, -4, -4, 1, -8, -21, 9, 2, -1, -5, -16, -41, -36, -9, 11, 4, 2, 28, 5, -2, 0, 1, 8, 1, -10, -2, -5, 32, 48, 5, 0, 5, 3, 0, 4, -29, -47, -10, 6, 13, 11 }, { -15, -72, 35, -56, 40, 27, 6, 6, 12, 25, 10, 9, -24, -18, 7, 2, -3, 11, 11, -21, -7, -1, 9, 7, 12, 6, -5, 3, -13, -22, 3, 4, -2, -1, -13, -6, -1, 1, 3, 13, 2, 1, 6, -1, -4, -3, 0, 2 }, { 23, 29, 26, 10, 32, -2, 1, -8, 23, 18, -14, -6, -19, -28, -6, -2, 9, 5, -13, -3, -5, -10, -18, -40, -36, -8, -3, -1, 12, 11, -13, 8, 4, -4, -17, -4, 9, 1, -3, 23, 52, 12, 16, 17, 7, -22, -42, 4 }, { -9, -31, 29, 24, -7, -14, 4, 48, 12, -58, 22, 13, -26, -2, 1, 3, 2, -31, -16, 23, 4, -1, -38, -10, 30, 3, -3, -4, 15, 12, 5, 12, -1, -2, 13, -9, -27, -1, 1, 10, 5, -6, -1, 9, 13, -14, -11, 1 }, { 6, 53, -33, -43, 17, -36, 3, 3, -26, 3, 34, 27, -23, 1, -4, 1, -1, 12, 22, -21, -2, 1, -24, -5, 31, 5, -2, 4, -8, -17, 25, 21, -1, -6, -1, -6, -23, -2, 8, 25, -1, -6, 0, -6, 3, -16, -6, 11 }, { -19, -29, -18, -21, -30, -2, 15, -17, -15, -6, -14, -24, -6, 1, -4, 2, 19, 33, 7, 10, 9, 2, 6, 8, 3, -7, -1, 2, 21, 47, 27, 19, 9, -9, 10, 10, -5, 0, -6, 0, 31, 16, -32, -35, -4, -20, -37, 25 }, { 4, 22, -6, -52, 30, 32, -12, -36, 9, -23, 7, -3, -9, -3, -1, -5, 5, 8, -6, 49, 15, 8, -18, 8, 17, -1, 0, -2, 8, 20, -34, -26, 3, 7, 25, -10, -17, 3, -11, -28, 4, 6, -9, 19, 16, 13, 2, -16 }, { -13, -14, -20, -17, -20, -27, -4, -18, -25, -27, -4, -7, -23, -5, 18, -1, -6, -5, -19, -16, -14, 4, 13, 0, 18, 33, 6, -3, -4, 4, -9, -3, -1, 17, 5, 34, 40, 6, -1, -1, 18, 7, 49, 32, -6, 0, -22, -38 
}, { 4, 24, -2, -17, 28, -14, 16, 42, -7, -38, -11, 11, -10, -28, -1, 1, -4, -8, -8, 18, 2, -4, 52, 10, -1, 11, -7, -1, 4, 18, -7, -9, 9, -16, -51, -5, 11, -6, -1, -21, 2, 17, 5, -27, -22, 18, 7, 19 }, { -9, -19, 2, -23, 35, -60, 0, 18, -33, 45, -36, -9, 25, 29, -1, 5, -5, -38, -13, 12, 3, 2, -1, -2, -2, -6, -1, 5, 32, 15, -9, -9, -1, 3, 8, -4, -18, -6, -9, -1, 2, 1, 1, 10, 17, -2, 7, -5 }, { 3, 17, -7, -24, 5, 54, -37, 6, -10, -17, -37, -26, 1, 8, 3, -2, 0, -38, -4, 10, 14, -7, 10, -1, 0, 1, -4, 4, 23, -4, 24, 31, -9, 6, -7, 11, 6, -5, 6, 29, -21, -38, 21, -6, -5, -30, -1, 6 }, { 10, 14, 14, 11, 29, 8, 17, 25, 15, 17, 15, 16, 30, 28, 11, -10, -15, 7, 14, 14, 3, 13, 19, 21, 16, 13, 5, -16, -16, 7, 8, 13, -8, 10, 16, 17, 12, 1, -21, -31, -8, -20, 3, -1, -6, -25, -68, -11 }, { -8, -9, 8, -20, 22, -40, -20, -23, 36, -37, 50, -32, 20, 37, -3, 2, 6, -3, 14, 2, -18, -7, -3, 1, -34, -27, 1, 2, 12, -2, 5, 20, -1, -6, -14, 4, 27, 11, -9, -3, -5, -10, 8, -7, -14, 5, 7, 4 }, { 1, 12, -5, -20, 9, 20, 25, 49, -26, -19, 11, -31, 18, -30, -16, 0, -2, -6, -4, -33, -31, 19, -3, 13, -33, -14, 6, -3, 2, -1, 3, 6, -3, 7, 38, -1, 19, 19, -3, 4, 5, 4, -36, 19, 16, 2, 5, -19 }, { 5, 7, 2, 7, -21, 31, -33, 19, -18, 38, 41, 9, -35, 16, 4, 0, 14, -15, 18, 0, -46, 12, 16, -12, 7, -10, -7, 6, 40, 23, -2, 9, 1, -14, -14, 7, -5, -1, -29, -24, 6, 13, 9, -1, 5, 13, 5, -14 }, { -7, -11, -9, -4, -14, -14, -14, -23, -15, 4, 4, -2, -15, -35, -7, -14, -14, -23, -16, -3, -22, 3, 2, -5, -9, 0, 19, -19, -11, -11, -21, -7, -19, -14, -9, -9, 0, 22, -24, -30, -26, -37, -11, 2, 27, -15, -26, 58 }, { -4, 0, 4, 7, 9, -22, -32, -1, 28, -30, -50, 35, -16, 6, -8, -1, 32, 30, 19, -18, -20, 34, 43, 5, -1, -18, -1, -5, 3, -4, 5, 0, -9, -2, 10, -1, -16, 6, -10, 6, -7, -8, -11, 14, 32, -16, 6, -17 }, { -2, 3, -2, -13, -3, 18, -18, 23, -5, -23, -30, -18, -10, 58, 11, -8, -36, 5, 5, -29, -12, -7, -17, -21, -3, -18, -2, 1, -35, -8, 2, -15, 10, -6, -8, -4, -15, -3, 10, -22, 7, 35, 8, 6, 16, 22, -28, 28 }, { -1, -3, 4, 1, 
-1, -7, -53, 33, -28, 11, 23, 17, -23, 8, -1, 0, 18, 20, -15, -1, 61, -6, -2, 21, -23, -27, 2, -3, -6, 0, -6, -33, -5, 16, 8, -2, 31, 25, 4, 6, 0, 6, -3, 1, -14, -16, -7, 10 }, { -4, -4, 1, 2, 4, -18, -70, 4, 0, 9, -16, 6, 10, -35, 16, -1, -21, 2, 0, 14, -15, -5, -22, 13, -4, 21, -2, 5, -19, 5, 8, 43, 15, -12, 6, -16, 5, -10, 14, -13, 8, -1, -36, -3, -22, 32, -16, -24 }, { 1, 0, -1, 1, -7, -6, -11, -16, 18, -3, 25, -9, -5, -14, 19, 6, -49, -36, 34, -4, 31, 30, 33, -11, -14, 15, 7, 25, 14, 2, 22, -34, 0, -10, 1, -4, -3, -10, 17, 14, -3, 26, -27, 14, 16, -20, -14, -7 }, { 0, 1, 5, 3, 7, -2, -36, 22, -1, 4, 20, -39, 50, -20, 19, 0, 27, 37, -9, 12, -22, 20, 1, -19, 23, 33, 8, -8, -10, 8, 9, -26, -12, 0, -6, 9, -15, -6, 9, 13, -4, 19, 22, 2, 16, -18, 13, 27 }, { 0, -4, 1, -3, -6, 0, 10, -5, -17, 11, 11, -11, -12, 16, -21, 7, -7, 9, -46, 42, -6, 54, 14, -15, -6, -28, -16, -4, -36, -8, 8, 8, -5, -42, -18, 0, -4, 5, 34, 14, -10, -10, 0, 0, 0, -1, -12, -30 }, { 3, 2, 5, 4, 7, 8, 4, -3, 5, 10, -6, 13, 0, 20, -11, -11, 11, -16, 5, 17, 15, 15, 0, 8, 14, 24, 52, -17, -8, 0, 18, 34, 34, -24, -3, 6, 33, 46, 2, 8, 13, 28, -4, 26, 24, 10, 27, 26 }, { 4, 7, 1, -1, -1, 0, 20, 5, -2, 2, -2, 11, -19, 17, 91, -3, -25, 34, -24, 16, -15, 8, -1, 12, -25, 2, 2, 15, 17, 6, -8, 14, -7, -1, 5, -9, 9, 3, 1, 12, 4, -20, -10, 10, 6, -9, 26, 20 }, { -2, 1, -2, 1, 2, 1, -1, -3, 11, 7, -15, 25, -12, 6, -37, 10, -25, -4, 6, 35, -59, 7, -19, 25, -4, 2, 6, 19, 2, 17, 25, -29, -26, 34, 11, -9, 23, 7, 21, 15, -3, 18, 10, -1, -27, -13, 1, 25 } }, { { 82, -51, -74, 14, 33, 2, -3, 8, 7, 1, 1, -7, -9, 1, 0, -1, 1, 0, 2, 2, 0, 0, -1, 1, 1, -1, 0, 0, 0, -1, 0, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 64, -10, 41, -78, -50, -14, -1, 11, 27, 8, -2, 17, 17, 0, -3, 0, 1, -3, -12, -4, 1, -1, 3, 0, -1, 3, 0, -1, -1, 1, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -41, 15, -52, -26, -8, -2, 2, 26, 65, 64, -3, -20, -13, -1, 1, 1, 4, -6, -27, -20, 4, 0, 4, 12, 7, -1, 0, -2, -4, 2, 1, -2, 2, 
0, 0, -2, 0, 1, 1, 0, 0, -1, 0, 0, 0, 0, 0, 0 }, { -39, -44, -19, -44, 27, 39, -1, -11, -4, -9, 41, 64, 25, -2, 6, 0, 0, 0, -12, -16, -2, 6, -11, -29, -12, 2, 0, 0, 0, 9, 11, 1, -1, -2, -3, 3, -2, -3, 1, -1, -2, 1, 1, 0, 1, 0, 0, 0 }, { 12, 72, -58, 3, -26, -44, -2, 13, -26, -16, 20, 38, 34, 13, -5, -1, 1, -7, -5, 1, 4, 1, -13, -19, -10, 1, 3, -1, 3, 5, 3, 0, 2, -1, 3, 4, -1, -3, 0, -2, -1, -1, -1, 0, 1, 0, 0, 0 }, { -31, -37, -21, 3, -51, 5, 11, 49, 22, -46, -4, 3, -7, -2, 3, 3, 4, -6, 25, 59, 20, 1, -4, -7, 2, 1, 0, -1, 1, -17, -28, -4, 3, 1, 5, 6, 1, 0, -3, 0, 3, -3, -1, -2, 0, 0, -1, 0 }, { -7, 23, -21, -71, 36, -1, -2, -25, -30, 17, -12, -28, 0, -3, -1, -1, 2, 25, 50, 42, 16, 3, 2, 2, -3, 1, -1, 5, -4, -22, -21, -6, 4, 1, 1, 5, 3, 0, -4, 1, 4, 1, 0, -2, 0, 0, -1, 0 }, { 12, 30, 24, -9, 46, -18, 2, 18, 42, -30, 42, -2, -53, -25, -2, 0, -2, -2, 2, 3, 5, -6, -47, -31, 11, 3, -5, -2, -2, -2, 1, 0, -1, -3, 18, 19, -1, 2, 0, -1, -4, -3, 0, -1, 4, 0, 2, 0 }, { -6, 14, -17, -30, 8, 3, -8, -8, 6, -72, 24, -16, -11, 5, 1, -1, 0, -8, -16, -19, -3, 10, 36, 59, 36, 6, 0, -1, 5, 5, 0, -3, 2, -1, -18, -31, -14, 4, 1, 3, 7, 3, 1, 2, -3, 0, -1, 0 }, { 18, 36, 3, 15, 14, 57, -37, -25, 39, -9, -27, 20, 15, 7, -2, -6, 2, -14, -27, 24, 27, 3, 4, -8, -13, -7, 0, -2, 6, 1, -41, -35, -1, 0, -1, 1, 2, -1, -2, 1, 21, 11, -2, -1, 0, 3, -3, 0 }, { -11, -17, 19, -3, 59, -60, 35, 28, 1, 2, -27, 14, 36, -6, 0, 5, -13, -35, -27, 14, 13, 7, 15, 7, 0, 4, -2, -3, 8, 2, -23, -20, -2, -3, -10, -6, -3, -3, -1, 1, 11, 7, 2, 1, 1, 2, -1, 0 }, { 7, -5, 12, 20, -7, -7, -3, -2, -4, 38, 73, 12, 3, 16, 3, 1, -5, -4, 11, 4, -28, 2, 5, 17, 16, 1, 1, -1, -12, -47, -51, -21, -5, -1, 0, -8, -9, 0, 0, 21, 28, 11, -6, -5, -1, 1, -3, 1 }, { -14, -29, -6, -1, -12, -39, -4, -66, 20, -13, 14, -36, 23, 44, 8, 2, 0, -11, -14, 6, 16, -11, -35, 0, 5, -8, 5, 1, 13, 15, 0, -7, 0, 14, 39, 14, -3, 1, -2, -3, 3, 4, -16, -5, 3, 0, 0, -1 }, { 1, -5, -4, -13, -12, 18, -6, 20, -66, 18, -8, 4, -33, -15, 2, -1, -3, -14, -48, 3, 
28, -10, -24, 13, 42, 19, 0, 3, 4, 12, -10, -19, -3, 17, 37, 12, -11, -1, 3, 0, 7, 9, -15, -6, 3, 0, 0, -1 }, { -6, -11, -10, 9, -16, -40, 14, -47, 13, -2, -28, 56, -46, -19, 2, 4, 12, 41, 8, -19, 27, 12, 12, 5, 12, 8, 4, -2, -21, -10, -5, -24, -3, -9, -5, 3, 3, -1, 6, 3, 10, 17, 3, -3, -2, 1, -2, 0 }, { 8, 0, 10, 14, -1, 4, 13, -4, 2, 23, 35, 5, 7, -2, -6, 1, 5, 15, 3, 27, 70, -34, -17, 18, 3, -6, -2, 0, 2, 19, 17, 1, 3, -3, -31, -51, -21, -2, -3, -14, -16, -6, 18, 26, 4, -3, -2, -2 }, { -1, -7, -5, -5, -21, -7, 12, -12, -18, 0, 32, -40, -18, -34, -9, 0, 14, 0, -24, 8, -9, 14, 19, -35, -31, -4, 2, 2, 23, 38, -10, -29, -1, -18, -29, 5, 19, 7, -8, -12, 28, 35, 18, 7, -1, 4, -13, 1 }, { -9, -12, 6, -6, 5, -34, -82, 7, -9, 14, -5, 17, -38, 34, 32, 0, -6, -10, -8, 30, -10, 11, 6, -9, 1, -5, -6, 8, 17, 12, 8, 14, 2, -20, -25, -8, -8, -3, -3, 2, -2, -6, 18, 4, 1, -3, 0, 2 }, { 4, 9, -10, 1, -9, 11, 37, -46, 10, 13, -12, 18, -12, -17, -19, 3, 1, -40, -9, 53, -48, 10, -12, -4, 31, 24, 7, -2, 13, 4, 10, 27, 9, -13, -10, -11, -23, -9, 0, 4, -1, -13, 5, 5, 7, -4, -2, 0 }, { 8, 6, 7, 8, 1, 8, 15, 0, 4, 14, 26, 2, -5, 21, 3, -4, -12, -5, 7, 21, 18, 63, 39, 0, 13, 7, 3, -1, -1, 17, 17, -28, -23, -5, 10, 22, 4, 0, -5, -34, -49, -14, -16, -21, -6, 12, 23, -1 }, { -5, -7, 1, 12, 2, -19, -48, -4, 9, 2, 12, -6, 46, -74, 6, -1, 34, 33, -19, 25, -14, 4, 8, 11, 22, 7, 1, -11, -14, 2, 0, 6, 1, 6, 13, 6, -1, 2, -2, -11, -13, -17, -10, -5, -2, 3, 6, -1 }, { 0, -4, -7, 0, -11, -3, -11, -27, -6, -3, 10, -14, 0, -28, 16, -4, -25, -53, -1, -21, 36, -26, 21, -17, 1, 28, 10, -1, -12, -35, -13, 12, -3, -7, -18, 22, 27, 2, 9, 10, -20, -31, 33, 1, -11, 3, 19, -1 }, { 0, 3, -1, -4, 6, -6, 16, -9, -2, 1, 13, 28, -32, 2, -4, -2, 2, 6, -25, 26, -19, -49, 32, 35, -44, -25, -1, 4, 14, 5, -17, 14, 10, 28, 20, 11, 33, 14, -14, -17, -15, -26, -10, -2, -10, 5, 5, -1 }, { -5, -2, -7, -6, -4, -3, -7, 0, 1, -6, -15, 1, -2, -3, 13, -5, -10, -6, 17, -6, -46, -28, -19, -2, 2, 8, 5, -3, -8, 3, -5, -63, -34, 27, 0, 
-22, -9, 0, -5, -29, -30, 6, 29, 37, 12, 21, 26, -13 }, { 0, -1, 0, 4, 0, -5, -20, 0, 11, 12, 5, 17, -1, -27, -5, 5, 4, -39, 68, -22, 10, -20, 27, 3, 3, 18, 1, 13, 41, 40, -2, -1, 8, 22, 20, -2, -16, -16, -13, -9, 16, 13, -24, -11, 1, -4, -11, 2 }, { 5, 11, 1, 3, 5, 8, 22, 5, 9, -1, 3, -2, 12, 17, 63, -13, -19, 28, -4, 19, -11, -51, 33, -25, 38, 21, -1, -2, -13, 15, 17, 0, 17, -3, -5, 18, 1, -10, 4, -9, 11, 28, 3, -18, -13, 4, -16, 3 }, { 5, 6, -4, 2, -11, 2, 14, -7, 3, 2, 0, 2, 2, -41, 69, -29, -56, 3, 2, -2, 2, 38, -22, 21, -19, -32, 7, 19, 19, -2, -4, 7, 6, 7, 9, -13, -6, 0, -3, 3, 5, -3, -6, 11, 7, -5, -11, -5 }, { -5, -5, -5, -1, -4, -5, -23, 0, 6, -1, 3, 0, 2, -17, -45, 4, -72, 7, -4, 24, -5, -21, -5, 15, -19, 7, 11, 8, -9, -6, 41, -16, -19, -11, -8, -4, 8, -5, 19, 19, 2, 32, -29, -24, -13, 2, 8, 15 }, { -1, -2, 4, 6, 5, -3, -9, 7, 8, 1, 7, -1, 0, 15, -22, -8, -31, 30, -6, 5, 3, 27, 8, 0, -11, 67, 33, -12, 1, 11, -4, 11, 17, 46, 14, 1, 15, -2, -3, 3, 14, -10, 28, 33, 5, -14, -25, -24 }, { 0, 2, -2, -2, -2, -6, 5, -5, 0, -6, 1, -13, -12, -4, 0, 4, -1, 13, -21, 3, 6, -3, 48, -57, -9, -12, -11, 11, 5, -17, 3, 1, -31, 39, 16, -35, -41, -20, 16, 33, -1, -23, -22, 5, 18, -15, 8, 5 }, { -2, -6, -1, 0, -4, -3, -3, -1, -3, -4, -4, -8, -1, 2, -23, -6, -36, 28, -8, -23, -6, -9, 2, -18, -1, 8, 3, 4, 9, 6, -46, -3, 62, -41, 3, -12, -32, -17, -11, -26, -20, -23, -15, -9, 13, 22, 21, 8 }, { -1, 1, -2, 1, 0, 3, 8, 2, 6, -2, -5, 2, 9, 3, -5, 1, -8, 38, -6, -10, -1, -12, -15, 8, 15, 15, 6, 21, 54, 6, -31, 22, -76, -18, -17, 18, 1, -17, -25, 1, 1, -16, 15, -18, -7, -10, 1, 9 } } }, { { { 109, -39, -29, -27, 27, -20, 1, -3, 6, -2, -2, -4, 12, -4, -3, 0, 1, -2, 2, 0, -1, -1, 0, 0, 3, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 6, 68, -81, 21, 11, -18, -15, 40, -40, 20, 2, -4, 1, -5, -3, -3, 1, 14, -15, 1, 3, 1, 0, 0, 2, -2, 0, 0, -1, 1, 1, 2, -2, -1, -1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -24, -73, -65, 35, 13, 58, 7, 3, 8, 19, 3, 4, -26, 9, 
1, 2, 4, -1, -7, 5, 2, 1, 1, -1, -4, 1, 0, 1, 1, 1, -1, -1, -2, 0, 1, 1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0 }, { -40, -5, -3, -23, 91, -7, 4, -23, -4, 14, 7, -39, 46, -27, 4, 2, 1, -7, 3, -3, 1, 2, -3, -11, 21, -8, -5, 1, 1, 0, 1, -4, 3, 0, 0, 1, -2, 0, -2, 5, 0, 0, -2, 0, -1, 0, 0, 1 }, { 22, 26, 24, -31, 23, 86, -59, 6, -13, -6, 0, -15, -20, 7, -11, -3, -16, 29, -6, -3, -1, 0, 0, -5, 3, 7, -3, -2, -2, -1, -3, 7, 0, -2, 0, 0, -1, -2, -1, 1, 3, 2, -1, 0, 2, 0, 0, 0 }, { -20, -8, -21, -92, -44, 7, 16, 30, 17, 40, 26, -10, 19, 11, 0, 0, 1, -2, -19, 4, 3, 2, 6, -14, 1, 6, 2, 0, 0, 0, 0, 0, -3, 0, 1, 2, 1, 0, -4, -1, 0, 2, 1, -1, 0, -1, 0, 0 }, { -3, -44, 36, 10, 0, -10, 7, 53, -52, -6, -7, 13, 22, -26, 4, 3, -13, 41, -49, 22, -2, -1, 1, -5, 16, -15, 2, 1, 2, -3, 0, 17, -19, 3, 3, -1, 1, 0, -1, 7, -6, -1, 0, 1, 2, -4, 0, 1 }, { 5, 3, 1, -14, -5, 9, 30, -66, -80, 18, -5, 6, 3, 40, -7, -3, 19, -1, -10, 28, 0, 0, 1, 2, -15, 17, -5, -3, -1, 1, 10, -4, -14, 12, 2, 0, 1, 1, -2, -3, 5, 0, 0, 1, -3, -4, 0, -1 }, { -13, -20, -28, -8, -30, -6, -62, -29, -19, -40, 17, 19, 46, 9, 45, 2, 1, 16, 27, -1, 12, 3, 3, -13, 5, -29, 11, 4, 1, 3, 1, -3, 6, -5, 2, 2, 1, -1, -1, 4, -7, 1, 1, -1, -2, 0, 0, 1 }, { 3, -7, 13, 9, -9, -27, -53, -30, 19, 85, -24, -9, -20, -12, 28, 7, -2, 16, -17, 20, -15, -3, -3, 15, 3, -12, 8, 4, 2, 2, -5, -4, -4, 14, -3, -2, 0, 2, 3, -2, -3, 2, 0, -3, -2, -1, 4, -1 }, { -22, -13, -8, -38, 36, -40, -6, 1, -4, -23, 19, 19, -55, 4, 18, 3, -7, 40, 6, -10, 8, 4, -6, 35, -44, 20, -2, 3, 2, -1, -7, 17, -11, -3, 0, 0, -1, 2, 11, -21, 5, 4, -1, -2, 7, -5, -1, -4 }, { 1, 6, -3, -27, -3, 25, 3, -5, -19, 3, -5, 54, -16, -80, 3, -4, 34, -28, 1, -1, 5, 1, -12, 27, 5, -16, 11, -1, -2, 2, 19, -25, 11, -3, 0, 0, -1, -3, 17, -5, -11, 5, 2, 4, -8, 3, 1, -2 }, { 0, 10, 4, 3, 28, 11, -8, 4, 21, 22, -5, 79, 51, 34, -8, -3, -28, -18, -24, -11, -6, -3, -25, 8, -34, -2, -10, -1, 0, 1, -7, 12, 3, 5, -6, 0, -4, -5, 12, 0, -3, -4, -2, -4, 4, 0, 3, 3 }, { 12, 9, 18, 5, 18, 13, 19, 29, -4, 
5, 10, -12, -11, 27, 88, -45, 15, -21, -7, -3, -9, -4, 1, -8, -19, -24, 2, -5, -4, -12, 28, -14, 6, -1, 0, -1, 0, 2, -3, -7, 3, 3, -3, 6, -3, 1, 0, -1 }, { -10, -22, 6, -11, 0, -8, -21, 44, -32, 11, -58, -15, 11, 17, -4, 10, -16, -27, 47, 17, 11, 9, -8, 19, -17, 22, -8, 0, 4, -1, 4, -28, 30, -21, 2, 1, 2, -9, 9, -11, 13, -1, -2, -3, -10, 13, -1, -4 }, { 4, -15, 19, 18, 5, -15, -29, 9, -29, 21, 79, 10, -11, -7, -18, 13, -8, -29, 2, -12, -22, -17, 21, -29, -12, 26, -1, 3, 2, 1, 4, -19, 33, -1, 3, -4, -2, 11, -4, -13, 6, -2, 0, 0, -8, 10, 0, -4 }, { -7, -2, -6, -2, 3, -10, -41, 6, 17, -41, -8, -11, 0, 3, 4, 11, 32, -49, -57, 32, 11, 6, -8, -1, 2, 39, -12, -1, 0, -2, 12, -6, -21, 29, -5, 1, 0, 3, 0, -12, 18, -6, -2, 8, -4, -17, 12, -1 }, { -4, 3, -10, -16, 6, -7, 4, -9, -12, -16, -2, 0, -35, 5, 3, 5, -78, -50, -10, 28, 5, 4, 2, -3, 13, -26, 37, 0, 3, 4, -1, 17, 6, 15, -2, 1, -1, 4, -10, 20, -30, 16, 1, 2, -11, 6, 6, 4 }, { 4, -5, -4, 10, -24, 13, 3, -18, -15, -1, -7, -39, 28, -27, 14, -8, -25, -19, -31, -49, 1, 1, 12, 30, -25, 19, 26, -1, 0, 3, -8, 38, 15, -5, 15, 0, 3, -7, 24, -37, 9, 2, 1, -5, 15, -5, -3, -12 }, { 6, 9, -1, 11, -5, 15, 9, 4, 14, -1, 24, -14, 22, -35, -4, -11, -17, 5, 38, 73, -8, -1, 0, -10, -52, -7, 6, 2, 0, 3, -3, 9, -30, 13, -15, -2, -1, -3, 21, -23, 14, -3, 2, -5, 1, -7, 13, -9 }, { -5, -6, -7, -4, -6, -7, -5, -9, -3, 4, -13, 21, -7, -23, 9, -80, -24, -2, 15, 6, -3, 3, -1, -21, 31, 48, -30, 26, -1, 4, -1, 25, 9, 8, -1, 2, -1, 5, -16, 6, 16, -21, 7, -14, 15, 0, -1, 7 }, { -6, -10, -1, 1, -4, -12, -17, -5, 7, -5, 19, -21, 2, 16, -51, -54, -8, 7, -18, 3, 2, 5, 2, 42, -4, -40, -15, 25, 3, -8, 53, -16, 4, -4, 8, -1, 2, -11, 21, 1, -3, -9, 7, 14, -15, 8, -3, -3 }, { -6, -2, -10, -7, -2, 6, 4, 9, -5, -18, 3, -1, 10, 10, 6, 6, 0, 4, 13, -2, -99, 10, 2, 44, 28, 6, -1, -2, 0, 2, -9, -7, 4, 43, -18, 7, 0, -8, -1, -9, -4, 0, 0, -3, -2, 5, 7, 1 }, { 8, 2, 11, 12, -1, 7, 0, 9, -5, 15, 48, -2, 6, 16, 16, 0, -5, -18, 18, 15, 40, -9, -22, 59, 54, 13, -11, -7, -2, 
0, -28, 1, -28, -13, 11, -8, -1, -21, 10, -9, -10, 1, -2, -8, 14, -12, 4, 7 }, { 3, 11, -6, 5, 0, 5, 22, -17, 20, -13, 6, 6, 8, 0, 18, 4, -26, 42, -31, 33, 12, -5, -17, 3, 22, 28, 15, -8, 0, 1, -3, -39, 46, -8, 2, -1, -1, 0, 7, -29, 14, -5, 0, 10, -33, 36, -19, -12 }, { 3, 2, -4, -6, -12, 2, 7, -9, -8, -9, 6, -13, -15, -12, 14, 18, -16, -12, -18, -1, -5, -1, -17, -1, -9, -43, -84, -11, 9, 33, -26, -2, 15, -1, 4, -3, 4, 2, 16, 13, 38, -5, 1, -20, 1, 11, -2, 5 }, { 1, 7, 1, -1, 10, 6, 2, 0, 9, 0, -10, 20, 9, -2, 6, 2, -12, -2, -5, 5, 18, 5, 105, 29, 2, -8, -20, 0, 0, 11, -9, -20, -15, 1, -2, 3, -10, 11, -31, -16, 9, -2, 1, -2, -2, 4, 1, -14 }, { -1, 2, 2, -1, 7, -2, -7, 1, 5, -5, 3, 0, -1, 10, -8, -13, 46, -10, -13, 48, -14, 11, 19, 9, -3, -9, -3, -2, -3, -2, -29, 53, 52, -38, -2, 2, -4, 1, 4, -3, -32, 13, -1, -15, 10, 12, -26, 1 }, { 0, -4, 5, 0, 3, -1, 0, 7, -8, -6, -3, -9, -2, 7, -13, -32, 15, 1, -10, -10, 20, 4, -8, -9, -15, -16, 16, 42, -13, 10, -74, -47, 11, 43, 0, 0, 7, 4, 3, -12, -19, 16, 2, -12, 14, 5, 12, -4 }, { -7, -6, -6, -2, -3, -4, -8, 2, 1, -4, -7, -3, 7, -5, -14, -31, 1, 7, 8, 3, 9, -57, 2, 12, -7, 1, -7, -77, 21, -26, -7, -3, 16, 36, 16, -1, -8, 25, -3, 9, 0, 36, -6, 4, 6, -8, -3, 8 }, { -2, -4, -1, 3, -2, -9, -10, -10, -2, -1, 5, 3, -4, -6, -8, -29, -9, -6, -14, -6, -20, 61, -1, -16, -3, 6, 4, -50, 14, -18, -24, -32, -35, -44, -8, 0, 10, -25, -1, 5, 9, 30, 0, 6, 14, 21, 5, -2 }, { -4, 1, -7, -5, -2, 1, -5, -4, 1, -13, -11, -7, 4, -4, 4, -6, -4, -11, -12, 5, -34, -69, -7, 9, -12, 12, 6, 28, -2, 12, -15, -23, -44, -56, 14, 10, -9, 27, -4, 15, -10, 2, 4, 1, 6, 24, -16, 8 } }, { { 109, -42, -42, -19, 1, -18, -3, 7, 9, -2, -2, 5, 6, 5, -2, -1, 2, 0, 0, 2, -1, -1, 0, 0, -1, 0, 1, -1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 44, 56, 76, -28, -14, -33, -16, -34, -29, -24, -9, -3, 6, -3, -5, -3, 1, 13, 15, 2, -4, -2, 1, 6, 5, 5, 1, -1, -1, 0, 2, 0, -1, 3, 0, -1, 0, 1, -1, -1, -1, 1, 0, 0, 0, 0, 0, 0 }, { -5, -91, 76, -2, -4, 13, 26, 
-10, 20, -21, -11, 5, -2, 0, 10, 5, -8, -8, 3, 8, -4, -1, 3, 2, -2, -5, -3, 2, 1, -2, -1, 2, -1, 0, 2, -1, 0, 0, -1, 1, 1, 0, -1, 1, 0, 0, -1, 0 }, { -23, -9, -13, -26, -91, -35, 18, 0, 1, 17, 8, 34, 38, 34, 7, 4, 11, 10, 11, 11, 3, 1, 3, -5, -14, -4, 4, 1, 1, 2, -3, -9, -10, -3, 1, 1, 2, -1, -3, -2, -3, -1, 2, -1, 1, 0, -1, 1 }, { 1, 4, 6, -83, -7, 80, -8, 29, -19, 3, 17, 13, 3, -11, -16, -7, -9, 4, -6, 9, 7, 3, 0, -15, 0, 11, -1, -3, -2, -1, 7, 3, -3, -8, -1, 1, 0, -3, 1, 0, 0, 4, 0, 0, -1, 1, 1, 0 }, { -31, -17, -10, -65, 66, -56, 4, -13, 3, 5, 21, -9, 37, -13, 20, 8, 2, 7, 12, -2, 10, 4, -7, -1, 1, 1, -5, 4, 3, -1, -4, -4, -2, -3, -2, 2, -2, 0, 1, -4, -1, 0, -1, -1, 0, 0, 0, 1 }, { -8, -32, -26, 6, -3, 20, -10, -69, -56, 9, 9, 6, -33, -6, 6, 3, 20, 37, 33, 18, 0, -1, 6, 15, 21, 19, 8, 1, 1, 4, 4, -8, -8, 4, 3, 0, 1, 0, -7, -13, -9, 0, 2, -4, -4, -3, -3, 1 }, { 1, -2, -5, 10, 12, -11, 57, 47, -77, -30, 2, -28, 3, 33, 1, -6, -17, 1, 7, 20, 4, 2, 12, 9, -2, -9, -14, -3, -2, -5, 5, 3, -10, -5, 5, 0, 1, -4, -9, -1, 9, 4, -2, 4, 1, 0, -4, 0 }, { -14, -23, -6, 15, -21, -10, -76, 44, -10, -53, -4, -20, 27, -26, 2, 18, 3, 26, 8, 10, 10, 6, 12, 1, 21, 2, 11, 9, 4, 0, -5, 3, 5, -8, 4, 2, 2, -5, -3, -2, -3, -7, 0, 1, -2, -2, 1, -1 }, { 7, -19, 31, 20, 13, -15, -29, 29, -45, 82, 6, 14, 12, -6, -1, 9, -7, 26, -35, -1, -24, -14, -5, -18, -9, -1, 0, 4, 3, -3, 0, -6, -1, 3, 3, -4, 2, 5, 6, 3, -2, -2, -1, -1, 1, 4, 2, 0 }, { -10, -8, -2, -31, -32, -42, 6, 19, -6, 11, 8, -17, -67, -27, 4, 7, -1, -28, -36, 3, 3, -1, 5, 22, 28, 27, 7, 2, 1, 0, 13, 28, 27, 12, 3, 1, 3, 6, 2, -3, 2, 4, 1, -3, -11, -10, -2, -4 }, { 4, 3, -7, 21, -6, -5, 28, 1, -20, 5, -5, 40, 31, -82, -11, 1, -28, -42, 37, 11, -4, -3, -4, -7, 9, 2, 13, -2, 0, 10, 17, 10, 0, -10, -2, -1, 0, -3, 8, 3, -3, 2, 5, -9, -6, 5, 4, -1 }, { -2, 7, 6, 5, 24, -17, -21, 27, 0, -30, 23, 81, -48, 14, 22, 0, 14, -9, 9, 32, -2, 0, -17, -5, -23, 4, -10, 1, 0, -8, -1, -6, -14, -13, -14, 0, -4, 1, 2, -7, -5, 4, -4, 3, -1, 1, 8, 2 }, { 
12, 9, 1, 0, -15, 21, 0, -3, -18, 2, 12, -18, 9, -24, 95, 2, 40, -29, -10, -1, -3, -5, 4, -9, -1, -38, -8, -12, -5, -9, -9, -13, 12, 1, 1, -2, 1, 0, 9, -1, 7, 3, 0, 7, 4, -3, 1, -2 }, { -3, 7, -1, -19, 9, 5, 10, 38, 9, 21, -85, 19, -9, -7, 25, 1, 14, 20, 32, -17, -14, 5, 5, 43, 14, 6, -4, -2, 0, -3, -6, -20, -2, 5, 8, 7, 0, -6, -12, -13, -12, -3, -1, -3, 0, -1, -8, 5 }, { 15, 14, 19, 15, -3, 3, 25, 29, 42, 24, 55, -23, -3, -9, 8, -3, 5, 28, 28, 26, 3, -12, -2, 2, 48, 9, 2, -6, -2, -2, -7, -12, -22, -16, -9, -7, -1, -4, -20, -32, -14, -4, 0, -6, -8, -4, -4, 6 }, { -2, -7, 7, -8, 0, 2, -44, 5, -3, 34, 13, -27, -9, 31, -18, 8, -8, -64, 60, 12, -7, -4, 3, 34, -12, -27, -1, 6, 4, -1, 7, -13, 9, -12, 3, -4, 1, -8, -8, 1, 10, 5, 0, -1, 0, 2, 3, 0 }, { -2, -6, 8, 8, 8, 0, 6, 10, -5, -3, -1, -14, 26, -6, -30, -30, 84, -30, -8, 16, -7, -4, 3, 5, -26, 57, 8, -11, 3, -5, 6, -15, 3, 4, 1, 0, 0, -2, 5, -10, -11, -9, -3, -3, -3, -9, 2, -2 }, { 0, -12, 7, 4, -7, -5, 4, 18, -17, -7, 24, 5, -14, 12, -3, -15, 14, -3, 42, -91, 2, 8, -27, -36, 21, 14, -6, -3, 2, 0, -1, -7, 22, -4, 10, -5, 6, 12, 15, 1, -7, 2, -2, -6, 9, 0, 2, 3 }, { 6, 10, 2, -2, 11, 13, 21, 2, -8, -6, 3, 6, 5, 20, -10, 96, 19, -13, -1, -4, -11, -1, -8, -4, 7, 0, 51, 6, -2, -9, -25, 8, 12, 5, 0, 0, 1, 1, 3, 2, -26, -15, -10, -3, -16, -6, -1, -6 }, { 1, 1, 2, 10, 14, 13, -11, -7, -3, -1, 11, 40, 44, 37, 10, -23, -4, -12, -15, 1, -6, -2, 2, 30, 52, 1, -14, -5, -2, -1, 10, 38, 47, 17, -3, -3, -1, -4, -13, -13, -3, 2, -2, -5, -20, -29, -13, -4 }, { -6, 1, -13, -9, -7, 4, 4, 4, 6, -26, 23, -8, 4, -10, -3, -5, -12, 10, 5, 4, -97, -22, -27, 20, -8, -2, -11, 15, 5, 5, -9, -11, 1, 42, 7, 4, 10, 15, 15, -6, 4, -3, 4, 5, 3, 7, -2, 4 }, { 0, -3, 4, -2, 0, -4, 12, 5, -7, -6, 23, 21, -6, -27, -44, -12, 20, 12, -20, -8, 27, 7, 1, 39, 7, -59, 10, 2, 2, 4, -27, -51, 30, 16, -1, -3, -4, -7, -3, 4, -2, 4, 0, 18, 17, -3, -11, 1 }, { -8, -1, -6, -13, 6, -11, -1, -5, -1, -1, -31, 2, -12, 9, -25, -15, 27, -21, -3, 30, -31, -9, 19, -63, 62, 
-33, -11, 9, 7, 2, -20, -3, -6, -12, 7, 2, -3, 9, -10, 18, -7, 1, 0, 4, 8, 7, 3, 3 }, { 5, 3, -2, 5, -8, 5, 1, -7, -5, 6, 5, 6, 4, -14, -9, 28, -6, -16, 1, -6, 14, -2, 18, 5, -3, 32, -79, 28, 11, -18, -65, 8, -2, 1, 1, -2, -1, -4, -3, -13, 8, -16, 2, 13, 15, -9, -3, 13 }, { 1, 5, 4, -1, 11, -1, -1, 9, 7, -1, 24, 18, -7, -4, 6, -1, 4, 0, 15, -24, -8, -9, 93, -13, -18, 0, 17, 13, -1, 12, 9, 7, -18, 44, 21, -8, -3, -1, -25, 10, 5, 0, -1, -9, 6, -12, -18, -12 }, { 1, 3, 2, 1, 1, 1, 12, 11, 12, 13, 1, -4, -8, -6, -2, -4, 17, 41, 43, 31, 3, -6, 4, -5, -17, -3, -10, -15, -1, -9, -18, 36, 52, 2, -1, -1, -4, 1, 33, 57, 18, -4, -1, -1, -5, -6, -3, -29 }, { -4, -3, -4, 1, -1, -6, -8, -4, 5, -2, 0, 4, 1, -5, -16, 24, -10, -1, 3, 0, -10, 12, 13, -11, 6, 4, -15, -92, -13, -54, 9, -13, -4, 19, 13, 1, -1, 6, -3, -14, 11, 32, 0, 26, -1, -2, -2, 0 }, { -1, -4, 2, -2, -2, 0, -2, 2, -4, 4, 9, 5, 1, -13, -14, -1, 40, 4, -3, -25, -13, 11, -25, 30, 2, -28, 4, -10, 3, -11, -8, 68, -60, -13, 3, 1, 6, -7, -30, 19, 28, 1, -1, 2, 11, -6, 4, 1 }, { 0, 0, 0, 0, 2, -4, -3, -7, -3, 1, 15, 0, -3, 1, 14, -29, -24, -7, -10, 0, -31, 44, 16, 12, -10, 17, 26, -26, -10, 1, -52, 4, 13, -43, 30, 8, 5, -24, -14, 19, -38, -10, 5, -6, 9, 20, -10, 8 }, { -7, -4, -7, -7, 0, -2, -2, -4, -5, -12, -6, 3, 0, 0, 0, -7, -9, 1, -6, -22, -1, -97, 16, 15, -8, 6, 11, -27, 1, -5, -17, 2, 9, -39, -33, -23, 8, 12, -20, 9, -12, 2, 6, 4, 8, 3, 13, 1 }, { 2, 5, -1, 2, 1, 2, 12, -7, 7, -4, 10, 10, 3, -3, -1, 29, 16, 14, -13, -11, -32, 9, 20, 8, 9, 16, -14, 25, 5, -9, 47, -24, 28, -59, 5, 3, 2, -29, -16, 12, 47, 13, -5, 11, 10, 21, -3, -7 } }, { { 104, -43, -51, -12, 22, -15, -2, 8, 4, 1, -3, -1, 2, 0, -2, 0, 2, 1, -1, 1, 0, -1, 0, 1, 0, 1, 0, -1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 43, 30, 77, -68, -32, -37, 0, -6, 9, 1, -7, 0, 22, 1, -6, 0, -2, 0, -2, 1, -1, -2, 0, 2, -1, 4, 0, -2, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, { -11, -92, 52, -20, 24, 45, 12, -31, 27, -8, 0, -5, 
-1, -6, 6, 3, -3, -3, 3, 5, -2, 0, -1, -3, 3, 0, -1, 1, 1, -1, -1, 0, -1, 1, 2, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0 }, { -18, -30, -28, -17, -81, -1, 16, 42, 61, 32, -4, 4, -13, 9, 1, 4, 5, -14, -12, 3, 5, 0, 3, 2, 0, 0, 4, 0, 1, 1, -3, -2, 0, -2, 0, 2, 1, 1, -1, 0, -1, 0, 2, -1, 0, 0, 0, 0 }, { 37, -1, 30, 66, -52, 10, -10, -8, 10, -66, 2, 22, -31, 16, -4, -2, 1, -3, 22, -3, -7, -1, 5, -3, -4, -3, 3, -1, -1, -1, -1, 2, 2, 1, -1, -2, 2, 0, -1, 1, -1, -1, 1, 1, 0, 0, 1, 0 }, { -19, -31, 14, 36, 18, -92, 52, 20, 6, -20, 2, -6, 18, -19, 6, 10, -6, -13, 1, 3, -3, -1, 0, 4, 4, -2, -2, 2, 3, -1, -3, 3, -3, 0, 1, -1, 0, 2, 1, -1, 0, -1, -1, 1, 0, 1, -1, 0 }, { 12, 27, 25, 10, 37, 12, -8, 51, 45, 1, -18, -52, -59, -38, -6, -2, -5, 7, 1, 2, 4, -4, -4, 8, 19, 5, -6, -2, -1, -1, -1, 0, -2, -5, -1, 0, -1, 2, 2, 1, 4, 2, -2, 0, 0, 0, 0, 0 }, { 0, 28, -25, 5, 9, -11, 4, -71, 68, -3, 7, -43, 18, 17, -6, -1, 12, -31, 29, -10, 1, 1, 0, -8, 12, -6, 2, -1, -1, 4, -5, 0, 1, 7, -3, 1, 0, -1, -2, 2, 2, -1, 1, -1, 1, -2, 1, -1 }, { 17, 13, 8, 16, -6, 18, 79, -31, -25, 44, -39, 12, -31, -6, -39, 0, 12, -19, 5, 2, 6, -7, 0, 14, -4, 12, 3, -9, -1, 2, 2, -8, -3, -1, 2, 1, -1, 2, 1, 4, -4, 3, 1, -2, 0, 0, -2, 0 }, { -5, 15, -22, -45, 3, 27, 52, 24, -4, -59, 57, 7, -3, -23, -28, -3, 5, -16, 17, -9, -6, 9, 2, -22, 5, 12, 1, -6, -1, 0, 1, 1, -1, 2, -2, 0, 1, -2, 1, -1, -1, 4, 1, 1, 0, 0, 0, 0 }, { 18, 14, 13, 34, -1, 42, 14, 25, 8, -3, -11, -12, 73, -7, -7, -7, -10, -37, -59, -1, -6, -3, 3, -8, 22, -14, 3, -5, -4, -1, 0, 15, 0, 3, -1, -1, 1, -6, -1, 0, 0, -4, 0, 2, 1, 0, 2, -1 }, { -2, 12, 9, -5, 46, -4, 0, 16, 34, 2, 1, 57, -18, 49, 1, -2, -23, -38, -19, -48, -1, 1, -1, 16, -30, 14, -6, 0, 0, -5, 0, 3, 21, -3, -6, -1, -2, 2, -6, -4, -4, 3, -3, 2, 2, 3, 1, 0 }, { 0, 7, -7, 2, 0, 1, -9, -17, 27, 0, -19, 62, 14, -84, 15, -5, 22, 17, 5, -41, 15, -4, 2, 3, 10, -17, 5, 2, -2, 6, -6, -5, 9, -4, -3, 2, -2, 1, 5, 3, -5, -3, 2, -1, 6, -3, 1, -1 }, { -13, 0, -15, -21, 6, 9, 4, 19, 0, -48, -75, -3, 
27, 21, -6, 3, -2, 0, 32, 13, 60, -7, -10, 9, -13, -3, -5, 0, 1, -1, 0, 2, 0, -23, 3, 9, -1, 2, 2, 6, 2, -2, -2, 2, -2, -3, -2, 0 }, { 8, 11, -3, -13, -6, 8, 28, -3, -17, -3, -5, -5, -20, 0, 99, -35, -22, -35, 13, 4, 0, 1, 5, 6, 16, -27, 10, 1, -8, 2, 13, 4, -2, -4, -2, 0, 2, 0, -2, -4, -4, 2, 2, -3, 3, 2, 1, 0 }, { -11, -8, -17, -18, -13, -13, -18, -33, -15, -38, -8, -10, -26, -14, -8, 8, 0, -25, -59, -2, 12, 4, 7, 61, 37, 32, 6, 2, 1, 1, -2, 9, -1, -4, -1, 3, 2, 2, -4, -17, -1, 3, 1, 4, 2, 3, 1, -2 }, { 3, 17, -10, -1, 3, 5, 40, -21, 31, -18, 4, 5, -4, 12, 14, -7, -19, 76, -46, 16, -1, 1, -12, 15, -33, -17, 5, -4, -4, 3, -9, 27, -28, 7, -1, 1, -2, 0, 8, -6, 4, -3, 1, 5, -1, 0, -8, 0 }, { -1, 9, 2, -1, 17, -6, -13, -1, 16, -5, 5, 40, -10, -4, 1, -8, 43, -34, -14, 82, -4, 4, -52, -3, -3, -10, -10, 3, 1, -1, 11, -9, -12, 9, 2, -2, -8, -2, 16, 2, -3, 1, -2, 2, -7, -5, -1, 5 }, { 0, -4, 8, -5, 4, -2, 13, 9, -10, -13, 1, -21, -10, 26, 12, 0, 91, 11, -29, -34, 5, -2, 17, 3, -4, -43, -28, 0, -2, 0, 12, -14, 15, 5, 2, 2, 1, 1, -12, 7, 8, 3, -6, 1, 0, -9, -2, 2 }, { -3, 1, 2, 2, 11, 3, 7, 15, 12, 0, -10, 21, 20, 31, -3, -37, 18, 28, 22, -3, -34, 22, 11, 32, 58, 23, 2, 7, -1, -5, -4, -34, -48, -11, -9, -7, 0, -8, -6, 0, -6, -2, 0, -3, 11, 6, 3, -2 }, { -3, 0, -8, -4, -6, 8, -2, 5, 1, -12, -31, -19, 16, -25, 7, -7, 12, -10, 14, 4, -67, 39, 5, 38, -62, 15, -1, 2, 0, 3, 2, -6, 25, 12, -1, -10, 2, -17, 9, -13, 13, -1, 1, 1, -6, 6, -1, 1 }, { 10, 1, 12, 19, -4, 11, 4, 4, 4, 15, 53, -13, 24, -7, 10, -25, 17, 1, 11, 5, 42, -26, -10, 48, -31, 26, -1, 3, -3, 0, -1, -22, 9, -50, 4, -1, -1, 10, 10, -25, 8, -9, 0, 1, -3, 10, -5, 1 }, { 7, 11, 1, -1, 4, 15, 6, 6, 1, 6, 14, 12, 7, 4, 16, 93, -3, -4, 21, 11, -15, 3, 3, 39, 16, -34, -17, -24, 5, 8, -28, 3, -11, -25, -3, -4, 2, -3, -2, -7, 3, 9, -2, -6, -5, 6, 2, 0 }, { -6, -8, 0, -7, 0, -6, -10, 3, -2, 5, 13, 0, -3, -2, -49, -33, -18, -5, 17, 1, -9, 1, 2, 45, 3, -78, 18, 6, 2, -6, 36, 31, 6, -17, 0, 0, 1, 0, -1, 10, -18, 2, 0, -3, -2, 1, 
-1, 2 }, { 0, 4, -2, 0, 5, -2, -3, -7, 9, 2, 1, 14, -1, -12, -4, -7, -6, -7, -3, 37, 8, -4, 81, -8, -16, 4, -49, 8, -1, 7, 23, 9, -17, -13, -49, 3, 6, 8, -23, 0, 16, -6, 0, 0, -9, 5, 17, -2 }, { -1, 2, 4, -2, 13, -1, -6, 5, 1, -5, 0, -3, -4, 1, 1, 11, 20, -10, -11, 17, 3, -10, 41, -5, -26, -9, 93, 0, -1, -20, -32, -24, -14, -6, -22, 1, 1, 7, 0, 16, -18, 14, 4, 6, 2, 0, 8, -13 }, { 0, -1, -6, -2, -7, -3, -2, -14, -2, -11, 0, -8, -5, -8, -7, 2, -36, -7, -26, -12, -6, 13, -12, -8, -11, -25, -21, 6, -4, -2, 6, -79, -15, -29, 4, -3, 2, 14, 22, 52, 22, 6, 5, 6, -9, 21, -3, -2 }, { -1, -6, 3, 0, -4, -2, -15, 3, -11, 5, 1, -8, -4, -5, -2, -5, 13, -30, 5, -33, 4, 4, -3, -2, -29, -1, -12, -3, 2, 4, -12, 37, -85, -3, 4, -1, 2, 1, 51, 1, 13, 5, -2, -7, 23, -13, 9, -10 }, { 8, 5, 5, 3, -1, 4, 9, 0, -3, 8, 7, -1, -5, 2, 0, 21, 2, -1, 1, -2, 25, 51, -3, -1, 4, -4, 4, 94, -16, -19, -13, 13, 5, -7, -8, -25, 4, -20, 4, 5, 2, -34, 4, 8, -3, -9, -1, 0 }, { 0, 0, 1, 3, 0, -4, 1, -9, 5, 2, -7, -4, -3, 2, 8, 9, 18, 8, -12, -4, -12, 47, -9, -35, 3, 21, 15, -25, 6, 2, 39, 31, 7, -79, -4, -16, 4, 24, 4, 3, -27, 5, -1, -1, -6, 9, -1, 1 }, { 4, -3, 7, 10, -1, -3, -3, 2, 1, 9, 15, -1, 2, -2, 0, -3, -9, 3, 0, 0, 56, 66, 5, 11, -5, -3, 4, -54, 7, -1, 17, -20, -2, 34, -15, -22, 3, -42, -1, 2, -6, 19, -2, -2, 2, -15, 5, 0 }, { 2, 4, -1, 1, 1, 6, 2, 3, 8, -4, -1, -3, 7, -2, 10, 42, -1, 3, 4, -15, -1, -27, 0, 9, -7, 14, 17, 16, 2, -7, 91, -20, -31, 27, 4, 7, 1, -7, 10, -9, -29, -22, 1, 10, -18, -1, 2, -1 } } }, { { { 108, -34, -51, 5, -2, -1, -1, 14, 18, 2, 1, -11, -12, -4, 1, -1, 1, 1, 1, 3, -1, 0, -1, 4, 5, 1, -1, 0, -2, -4, -2, 1, 0, -1, 1, 0, 0, 1, 0, 1, 0, 0, 0, -1, -1, 0, 0, 0 }, { 27, 47, 40, -47, -65, -25, 0, 19, 37, 14, 1, 3, 12, 6, -1, 2, -1, -21, -32, -11, 2, -2, 4, 20, 18, 6, 0, -2, 0, 0, -4, -1, 1, 1, 1, -9, -9, -1, 4, 7, 5, -1, -2, -1, 0, -3, -1, 2 }, { 42, 14, 22, 15, 1, 31, -11, -41, -39, -26, 12, 40, 51, 15, 0, 0, 1, -7, -23, -11, 2, 1, -3, -11, -20, -8, 1, 2, 20, 34, 16, 0, -1, 1, 
-6, -20, -16, -6, -1, -3, 3, 2, 0, 10, 9, -6, -5, 0 }, { 13, -72, 71, -51, 16, 24, 12, 12, -7, 8, 10, 20, -15, -23, 2, -1, 2, 12, 4, -15, -2, 0, 2, -6, 2, 7, 0, -4, -12, -2, 9, 1, 1, -1, -6, 1, 6, 3, 2, -2, -3, 2, 4, 2, -4, 2, -1, -1 }, { -36, -51, -43, -12, 4, 5, -25, 1, 12, -15, 19, 19, 13, 18, 2, -2, -3, -29, -41, -27, -9, -4, 7, 30, 31, 5, -2, 2, 2, -3, 0, 3, -1, 0, -8, -22, -25, -8, 15, 29, 17, 2, -5, -11, -8, -7, -1, 8 }, { 27, 31, 40, 33, 47, 3, -6, -5, -23, 5, 17, -2, -12, 23, 18, 5, -16, -19, 5, -11, -8, 5, 13, 24, 34, 13, 0, 4, -15, -41, -19, 0, 3, -1, 1, 12, 2, -5, 15, 25, 10, 2, -8, -27, -22, 10, 11, 3 }, { 0, -18, 14, 31, -15, -65, 16, 47, -27, -50, 9, 25, -7, 0, 5, 1, -14, -26, 5, 12, 9, -2, -23, -28, 14, 18, 1, -1, 0, 5, 18, 6, -2, -1, 2, 5, -12, -11, 10, 0, -14, -5, 8, 5, 2, -4, 1, -1 }, { -5, -19, 10, 24, -69, 47, -36, -14, 8, -49, 14, -12, 4, -4, 8, -5, -2, 3, 9, 14, -6, 3, -9, -1, 4, 7, -3, -4, -19, -22, -15, -3, -2, -1, 14, 24, 21, 5, 2, -7, -10, -4, -4, -12, -13, 18, 13, -9 }, { 0, -10, 41, 34, 10, 24, -27, 17, 37, -2, -2, -33, -26, 14, -11, 8, 26, -7, 5, 30, 19, -2, -22, -1, 8, -27, -8, 8, 30, 11, 1, 8, 2, 1, 4, -20, -36, -6, -1, 18, 7, -7, 0, 6, 20, -27, -10, 15 }, { -2, 42, -13, 14, 9, 33, 36, 16, 25, -36, 43, 19, -28, -39, -7, 1, 14, 30, -9, -16, -1, -10, -11, 6, 7, 6, 3, -10, -23, 3, 24, 18, 2, -3, -19, -22, 4, 12, 7, 10, 9, 3, 6, 5, -10, -4, -12, 7 }, { 1, 32, -20, -58, 30, 23, -43, 16, 15, -22, 13, 22, -24, 32, 35, 5, -12, -14, 19, 11, -5, -1, -9, -29, 4, 0, -6, 10, 8, 1, 16, 5, -5, 4, 6, 17, -1, -7, 0, -11, -24, -9, 13, 8, 7, 0, 5, -7 }, { -3, -6, 11, 32, 36, 6, 2, 31, 51, 2, 7, 21, 31, 14, -1, -16, 6, -9, -35, 7, 0, 17, 16, 11, 0, 15, 26, -18, -9, 12, -16, -13, -2, 10, 20, 13, 20, 17, -17, -36, -17, -5, 0, 20, 15, 11, 3, -22 }, { 3, -6, 13, 7, -1, -56, -48, -15, 10, -6, 55, -4, -18, 15, -14, -11, 7, 21, 12, -31, -35, 4, 12, 6, -23, -27, 2, 3, 7, -2, 4, 12, 3, -9, -18, -4, 18, 19, -23, -11, 6, 5, 2, 6, 7, 3, -6, -1 }, { -5, -12, 5, 30, 
-14, -4, -4, -10, 43, 48, 18, 8, 22, 10, 43, 2, -39, 22, 19, 1, 14, -3, -2, -17, -13, 27, -6, 23, -9, 13, 26, 15, 8, -1, -18, -11, 1, -27, 24, 4, -1, 1, 11, 2, -20, 4, -7, 1 }, { 1, -2, 4, 1, 4, -7, -1, -61, -3, 5, 4, -36, -41, -10, 10, -20, -34, -25, -34, -10, 11, 1, -27, -3, 20, 16, 14, -22, -20, 5, 8, -5, -8, 5, 6, -8, -1, 7, 1, -13, -19, -16, 19, 34, 14, -14, -16, -6 }, { -4, 6, -4, 26, -24, 24, -27, 55, -43, 26, -18, 0, -32, -8, 49, -7, -5, 6, -14, -43, 2, 4, 16, 12, -5, -9, -2, -2, 2, 15, 4, -19, 0, -1, -4, -10, 0, 5, -10, -12, 4, 9, 1, 16, 13, -4, -8, -4 }, { -4, -8, -2, -9, -14, 23, 61, 13, -11, -6, 35, -47, 16, 45, 19, 3, -10, -15, 29, -3, -44, 11, 3, 13, 6, 6, 20, -2, 18, 9, 5, 7, -1, -9, -2, -2, -6, -2, -14, -8, 5, 4, 4, 11, 18, -8, -10, 0 }, { 2, 1, -5, -6, -4, 2, -26, 14, -23, 14, -6, 7, 37, 23, -20, -22, 11, 19, 42, 5, 27, -16, -15, 28, 29, 12, 22, -15, -25, -23, 25, 13, -6, 14, 5, -15, 12, 31, -6, 9, -9, -16, 20, 18, 0, -22, -23, 14 }, { -4, 0, -11, 15, -17, 21, 8, 6, -11, 53, 31, 11, -11, 24, -53, 25, 23, -33, -7, -28, 9, 6, -18, -38, 14, -14, -8, -2, -6, -7, 19, 1, 5, -6, -3, 18, 2, 1, 3, -8, -23, -4, 9, -3, -8, 15, 18, -12 }, { -2, -4, -6, 12, -22, -4, 20, -35, 7, 12, 13, 58, -22, -10, 14, 10, 7, 19, 25, 6, 2, 24, 28, 11, 44, 3, -13, 13, 25, -10, -14, -14, -5, 12, 16, 13, -20, 8, -19, 1, -12, -6, 3, 9, 33, -25, 1, 2 }, { 0, -10, 4, -1, 0, -14, 27, -15, 2, -20, -16, -6, -14, 51, 27, 17, 25, 42, -26, -23, 37, -28, -20, 26, 16, -21, -2, 22, 9, 17, 14, 6, 3, 5, 16, 17, 23, 5, 0, -7, -6, -7, 4, -8, -14, 17, 12, -11 }, { 1, -6, 1, 6, -17, 8, 17, -13, 17, -7, -29, 37, -43, 48, -3, -17, -5, -26, 16, 3, 16, -20, 27, -23, -24, -5, 38, -29, -7, -5, -15, 0, 9, 3, -27, -12, 10, 9, -7, 4, 26, 9, -3, -8, -9, 0, -13, 15 }, { 1, 3, 2, -1, -1, -3, -22, -10, -7, 0, -14, -10, -19, 3, -16, 6, 23, 0, 13, 27, -13, 23, 27, 8, 45, 37, 5, -11, 6, 63, 33, 1, 3, -8, -13, 0, 11, 11, 23, -15, 14, 29, -8, -5, -18, 17, 3, -13 }, { 2, 2, 7, 7, 4, 5, -3, 3, 21, -32, -37, -12, -2, 
21, -48, 28, -29, 23, 17, -51, -15, -5, 24, -6, 0, 22, -13, 16, -20, -5, 10, -33, -13, 12, 3, -7, -14, -23, 9, -4, -14, 6, 3, 17, 13, -10, -2, -2 }, { 0, -5, 3, 10, 3, -9, 0, -13, 13, -6, -27, -6, 18, -8, 40, 49, 53, -33, -6, -10, -24, 6, 9, -19, -5, 9, -22, -6, -10, -22, 16, 6, -14, 3, -19, 4, 36, 18, 0, 14, 1, -2, 18, 18, 3, -19, -19, 14 }, { -1, -4, 1, 2, 2, 4, 4, -1, 0, -8, 11, 5, -5, -18, -11, 33, -28, -51, 28, 10, 30, -26, 22, 54, -13, -16, -37, 6, 10, 5, 5, 12, 15, 9, 0, -26, 25, 2, -7, -25, -10, 0, 2, 16, -5, 14, -5, -18 }, { -1, -3, -6, -13, -5, -3, -5, 11, -23, 2, 40, -6, -12, 25, -10, 51, -12, 23, -24, 32, 29, 31, 10, -1, -30, 17, -4, -4, -21, 14, -29, -25, -1, 15, 9, -1, 22, 9, 17, 18, 18, -9, 1, 10, 0, -26, -13, 11 }, { 0, 2, -3, -7, 1, -12, 0, -12, 6, -5, 19, -9, -4, -9, 21, -3, 50, -17, 37, -24, 27, 14, -2, 11, -34, 16, 35, -9, 1, -6, 7, -40, -4, 12, 22, -30, -20, -11, 27, 11, -16, 0, -20, 5, -4, 12, 27, -18 }, { 0, 2, -1, 3, 2, 6, -8, 3, 8, 0, 11, -9, 3, -25, -11, -3, -12, -17, 10, -36, 13, -33, 5, -7, -8, 33, 22, 4, 49, 27, 2, 2, 12, 10, 27, 51, 32, 13, 0, 35, 14, -4, 2, -13, 9, -22, -3, 16 }, { 1, 0, 2, 4, -6, 3, 12, 2, -1, 0, -5, 6, -14, 3, -11, -40, 1, -10, -11, 13, -19, 30, 29, 26, -22, -42, 1, 10, 0, 2, 38, 5, -26, 23, 15, 22, 23, -29, 41, 19, -24, -13, 17, -1, -16, -19, -8, 13 }, { 1, 2, -1, -2, -3, 4, -2, -1, -18, 2, 9, 0, -15, 8, -14, -31, 46, -1, -16, 13, -2, -29, 13, 7, -8, 51, -7, 43, 5, -17, -28, 15, 18, -2, -16, 1, 11, -50, 4, -14, 0, -1, 15, 32, 11, 0, -11, -7 }, { 0, -2, 1, 6, -7, 3, -10, 0, -7, 11, -21, 17, -18, -1, -8, 45, -14, 19, -17, 16, -32, -7, -7, 28, -28, 21, 43, -19, 28, -21, 11, 43, -6, -9, 4, -2, -15, 5, 10, 24, -18, -9, 9, 14, 12, 19, 19, -23 } }, { { 121, -2, -4, -6, -35, -3, -1, 1, 2, -3, -1, 2, 17, 1, -1, 0, 1, 0, -1, 1, 0, 0, 0, -1, -8, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, -2 }, { 6, 61, 65, 12, 0, 12, 7, -47, -51, 5, 1, -5, 2, -5, 2, 1, -5, 31, 33, -3, 1, 1, 1, 2, -4, 3, 0, 1, 1, -1, 3, 
-19, -20, 1, -1, 0, 0, 0, 0, 4, -1, 0, 0, -1, 9, 10, -1, -3 }, { -36, -17, -13, -19, -70, -15, -4, -9, -13, -6, -9, 14, 62, 8, -5, -2, -2, 9, 13, 1, -5, -4, 4, -14, -49, -8, 2, -3, -2, 0, 1, -8, -9, 0, 0, -2, 0, -2, 10, 32, 5, -1, 0, -1, 4, 5, 0, -18 }, { 1, -61, 62, 50, -7, -32, -7, 29, -21, 15, 5, -32, 4, 25, -5, 0, 1, -19, 12, -4, 0, 1, -2, 21, -3, -17, 4, -1, 0, -1, 0, 14, -8, 1, 2, 1, 0, 1, -13, 2, 10, -2, 1, 0, -8, 6, 0, -1 }, { -1, -48, -42, -17, -26, -21, -9, -30, -31, -5, 0, -13, -28, -8, -1, -2, -5, 29, 34, -5, -2, -1, -9, 15, 39, 8, -6, -1, -1, -3, 4, -25, -29, 3, -4, -1, -3, 5, -12, -34, -8, 4, -3, -2, 17, 20, -2, 22 }, { 2, 12, -11, 21, 15, -46, -20, -40, 40, 11, 10, -19, 5, 35, -17, -11, 19, 42, -29, -15, 9, 5, -6, 13, -9, -22, 11, -7, -4, 7, -14, -35, 20, 13, -4, 3, -2, 3, -10, 6, 13, -6, 3, 7, 23, -12, -9, -4 }, { -7, 2, -24, 58, -38, 52, 32, -2, 17, 39, 16, -29, 11, -27, 14, 6, -18, 5, -11, -30, 7, 1, -6, 18, -2, 15, -5, 2, 0, -3, 12, -6, 6, 22, -4, 0, 0, 3, -11, 1, -7, 2, -1, -7, 6, -2, -14, 0 }, { 12, -14, -21, 17, 57, 20, 8, 33, 21, 5, 6, 9, 18, 6, 7, 3, 3, 19, 35, 8, 1, 0, 7, -3, -25, -1, 4, 2, 0, 4, 4, -33, -50, -6, 8, 0, 3, -2, 2, 22, 2, 1, 2, -3, 29, 44, 4, -13 }, { 0, -48, 53, 8, -2, -2, -4, -20, 38, 4, -3, 34, 3, -33, 2, -1, -3, 25, -21, 5, 6, 6, 4, -39, 4, 40, -6, -1, -1, -3, 5, -19, 7, -2, 0, 4, 0, -2, 29, -9, -29, 6, -1, -2, 14, -1, 0, 8 }, { 1, 0, -11, 25, 7, 14, -52, -4, -13, -29, -23, -37, 4, -51, -29, -26, 25, -3, -4, 13, -17, -9, 14, 20, -14, 34, 12, -15, -8, 9, -17, 0, 5, -14, 5, -4, 3, -12, -10, 16, -22, -10, 5, 8, -1, -5, 9, -13 }, { -1, 11, 0, -17, -2, 8, -49, 37, -24, 58, 42, 29, 10, -9, -26, -15, 29, -4, 12, -22, 21, 8, -24, -12, 5, 14, 15, -6, -2, 8, -12, -3, 3, 17, -7, 2, -3, 18, 3, -9, -5, -5, 2, 8, 4, -1, -8, 11 }, { 11, -40, -17, -6, 30, 15, 2, -34, -45, 14, 15, -6, -40, -12, 1, -1, -14, -3, -12, -17, 5, 2, -14, -28, -23, -17, -7, -2, -2, -4, -3, 4, 16, 6, -8, -2, -4, 1, 29, 45, 17, -7, -3, 3, -3, -12, -2, -38 }, { 0, 
28, -27, 55, -8, -49, -23, -27, 14, 25, 9, 22, -15, -18, -3, 2, -17, -30, 9, 4, -1, 3, 3, -30, -4, -4, -12, 1, 0, -11, 3, 26, -29, -25, 4, 3, -1, -8, 13, -3, 2, 2, -5, -4, -19, 23, 16, 3 }, { 1, -18, -20, 45, -1, 38, 7, -18, -12, 4, -4, 44, 9, 40, -6, -9, 23, 33, 30, 31, -9, 1, 13, -3, 26, -3, 18, -3, -2, 13, -8, 20, 24, -11, 10, 1, 4, -1, 10, -6, 9, -1, 6, 12, -24, -25, 14, 0 }, { -6, 6, 10, 9, -45, 35, 2, 16, 0, 3, -9, 0, -48, 12, -10, -3, 14, -14, -33, 19, -3, 1, -4, -21, -1, -35, 6, 0, 2, 4, -36, -36, -14, -33, -5, 2, -4, -7, 3, -4, 8, -13, 1, 19, 33, 18, 19, 9 }, { 1, 12, 1, -2, -12, -9, 8, 27, 0, 19, -43, 12, -34, 22, -53, -27, -36, 16, -5, -24, -21, -13, 20, -1, 16, 14, 18, -17, -9, 11, 35, 1, -1, 18, 13, -7, 4, -6, 17, 17, -4, -3, 6, -21, 4, 10, -6, -14 }, { -2, 14, -6, 0, -18, -36, 13, 23, -6, 20, -8, -12, -22, 6, 48, 24, 48, 12, 7, -5, -7, -9, 13, 8, 22, 37, -5, 17, 7, 5, -20, 5, 12, 6, 10, -4, 7, -3, 33, 29, -12, 12, 2, 19, 13, 8, 9, -32 }, { -4, -5, 18, -32, -20, 4, 6, -17, 42, 20, 34, 5, -19, -21, -13, -11, -9, -1, 20, 41, 25, 11, 0, 46, 19, 3, 9, -8, -5, 6, 10, 16, -3, -18, 1, 8, 4, -2, -18, 28, 24, -8, 3, 0, 1, 7, 13, -40 }, { 1, -13, 13, -12, 1, 22, -22, -31, 27, 14, -42, 31, -7, -13, 9, 5, 24, -23, 16, -41, -16, -9, 28, 12, 1, -21, -8, 6, 5, -7, -25, 13, -15, 39, 16, -5, 8, -8, -21, 0, 27, -3, -7, 20, -1, 13, -21, -2 }, { 2, -4, -4, -12, 11, -14, -1, 13, -18, 58, -2, 15, 1, -12, 7, 10, -22, 14, -10, 29, -23, -25, 26, 14, -17, -13, -27, 4, 2, -14, -14, -31, 7, -19, 15, -5, 11, -32, -35, -12, -20, 0, -10, -3, -10, -38, -6, -8 }, { 0, 9, -4, 27, -2, -15, -11, 27, -16, -29, -1, 29, 6, -31, 1, -4, -37, -7, 15, 21, 11, 13, 15, 17, 19, -18, -26, -4, -4, -10, -5, -14, 36, 28, 3, 9, -3, 13, 16, 10, 27, 8, -8, 25, 50, -10, -13, 4 }, { -1, -2, 1, -4, 1, 25, -52, -11, -17, 6, 20, -1, 14, 25, 6, 20, -9, 13, -53, 16, -14, -17, 13, 15, 10, 9, -13, 18, 11, -9, 35, 19, 8, 4, 9, -6, 12, -4, 2, 4, 20, 27, -9, -2, 15, 45, 19, 8 }, { -1, 5, 5, -10, -4, 11, -23, 28, 23, 
11, -23, -46, 8, -22, -14, 6, -4, 52, 18, -5, 4, 6, -1, -26, 15, -22, -27, 12, 8, -18, -1, 6, -9, -12, -1, 2, -7, 8, 27, -8, 43, 12, -15, 7, -30, -18, 15, -3 }, { 2, -5, 1, -18, 17, -8, 43, -18, -3, 29, 10, -24, 41, -12, -11, -39, -6, -19, -6, -4, -25, -21, 6, -4, 20, -3, -10, -23, -12, 15, -22, 7, 7, -5, 22, -7, 9, 8, 24, -4, 11, 3, 5, 32, 13, 19, 33, 28 }, { 1, -5, -8, -7, 6, -11, 19, -6, -15, 23, -17, -17, -1, -25, -21, 37, 19, 1, 2, 19, 40, 17, 45, -10, -20, -9, 43, 15, 17, 10, 20, 14, 17, 10, 6, 11, -6, -24, 3, -1, 10, -12, 4, -3, 16, 6, 4, 43 }, { 0, -1, 2, 8, 2, -17, 11, 11, -6, -17, 16, 20, 12, -36, -9, 42, -22, 24, -23, -16, -35, -16, -26, -6, 22, -3, 53, 25, 12, 13, -12, 4, -14, -6, 15, -11, 7, 14, -21, 3, 21, -28, 9, 20, -5, -3, 2, -13 }, { 0, 2, 3, -2, -2, 3, -13, -10, 21, 20, -14, -12, -18, -7, 2, 21, 9, -12, 29, 45, -48, -21, -45, 6, -26, -15, 12, 0, 2, 1, 19, -5, 20, 16, 9, -7, 6, 46, 23, 6, -8, 2, 5, -15, 9, -3, -13, 27 }, { 1, -3, 0, -3, 9, 1, -17, -3, -3, 21, -47, 14, 17, -6, 32, 6, -8, -10, 10, -38, -6, 23, -28, 15, 10, -5, 10, -7, -1, 11, 23, -7, 28, -48, -15, 13, -18, -5, -15, 6, 6, 4, 14, -14, 28, -8, 46, 0 }, { 0, 1, 1, 7, -5, -13, 21, 13, -10, -4, 0, 17, -21, -25, 16, -33, 37, 23, -15, 6, -9, -8, -16, -13, -28, 11, -17, -14, -12, 29, 34, 14, -11, -5, -6, -6, 3, -7, -35, -1, 51, 36, 11, 6, 3, -6, -16, 17 }, { 0, 2, 3, -2, -6, 5, -26, -3, 7, -3, 27, -22, 11, -3, 38, -14, -1, -26, 13, 0, -11, -23, 33, -31, 33, -18, 33, -2, -10, 24, 28, -25, -4, -10, -3, -10, 3, -25, 19, -12, 18, -1, 17, -19, 10, -23, -34, -15 }, { 0, -2, 2, -2, -5, 2, 1, -4, 11, -17, 34, 4, -21, 12, -24, -7, -5, -20, 26, -36, -15, 6, 15, 19, -30, 32, -3, 51, 20, 1, 6, -30, 9, -23, 24, -14, 5, -5, 14, 3, 22, 18, -4, 8, -1, -30, 20, 26 }, { -1, 1, 4, -1, -9, -2, -4, 15, 0, -20, 28, 13, -18, -19, 28, 1, 17, 24, -1, -26, -1, 1, 15, 16, -16, -30, 6, -41, -28, -29, 9, -8, 16, 9, 38, -4, 17, -2, 14, -3, -3, -30, 3, -29, -23, 13, 37, 18 } }, { { 122, -24, 0, -15, -15, -18, 1, 6, 0, -2, 
0, 4, 8, 3, 3, 1, -1, -3, -1, 1, 1, -1, 0, -1, -3, -1, -1, 0, 0, 0, 0, 1, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, { 4, 52, 95, 8, -26, -21, -10, -26, -35, -14, -5, 1, 21, 16, 2, 3, 2, 8, 14, 6, -1, 1, 1, -1, -10, -7, -1, 0, 0, -2, -1, -1, -5, -2, -1, 0, 0, 0, 0, 4, 2, 0, 0, 1, 0, 2, 0, -1 }, { 16, 98, -61, -12, 10, -27, -32, -8, 1, 14, 9, 4, 0, 9, 4, 7, 7, 7, -5, -6, 3, -1, -1, -1, -1, -4, -3, 2, 0, -1, -2, -3, 4, 1, -1, 0, 0, 1, -1, -1, 2, 1, 0, 0, 1, -1, 0, 0 }, { 25, 22, -1, 90, -22, 57, -16, 21, -4, 15, -16, -17, -33, -6, -15, 4, 1, 6, 7, -5, -5, 2, 6, 12, 8, 9, 3, 6, 1, -1, 0, -7, -4, 2, 1, 1, 0, -3, -6, -1, -2, -3, 0, 1, 1, 3, 0, 1 }, { -17, 5, -9, -24, -94, 15, -24, 51, 12, -7, -3, 28, 21, 8, -7, 4, -8, -26, -19, 1, 0, 0, -1, -13, -11, 3, -2, 1, 0, -1, 3, 12, 11, 1, -3, 0, 0, 2, 3, 3, -2, 0, 1, -1, -5, -4, 0, 0 }, { 15, 23, 24, -61, 27, 86, -11, -7, 7, -10, 13, 18, -9, -26, -25, 4, 14, 6, -3, 5, 6, 0, -4, -10, 6, 14, 7, 9, 3, -1, -8, -6, 1, 0, 0, 0, 0, 1, 4, -1, -4, -1, -2, 4, 3, -2, 0, 1 }, { 5, 22, 9, 7, -10, -4, 38, -29, 83, -53, -7, 3, -34, 37, -8, -6, -11, -18, -3, -1, 9, 5, 0, 2, 10, 8, -1, 2, 3, 2, 6, 4, 6, -1, -2, -1, 1, -1, 1, -6, -5, -1, -1, 0, 0, -4, 1, 3 }, { -5, -32, -38, 0, -26, 12, -45, -58, -15, -49, -15, -4, 2, 21, -24, 10, 21, 27, 32, 14, 0, 1, 1, -4, -18, -14, -4, 7, -3, -9, -7, -13, -12, -6, -3, -1, -1, -3, 2, 11, 7, 1, -2, 4, 3, 2, 5, -2 }, { 2, 3, -13, -7, -32, 31, 50, -55, -1, 62, 18, -21, 33, 26, -22, -12, 2, -14, 3, -24, -7, -8, -4, 5, -9, -16, -2, 5, 5, -1, 9, 0, 1, 10, 4, 0, 1, 1, -2, 5, 6, 0, -1, -5, -1, 1, -5, -3 }, { -5, -23, 30, -19, -13, -20, -61, -15, 41, 48, 5, -13, -35, 13, -3, 14, 14, 21, -25, -31, -3, -2, -2, 24, 22, 0, -6, 1, -6, -6, -9, -15, 5, 11, 3, 1, 0, -1, -13, -12, 5, 3, 1, 4, 6, 2, -3, 3 }, { 4, -11, 10, 29, 48, 21, -26, 5, 19, 17, -2, 43, 26, 58, 4, 16, 10, -8, -20, 8, 0, 6, -2, -19, -36, -25, -12, -4, -3, 2, 9, 20, 20, 1, 3, 3, 2, 2, 11, 12, 13, 11, 5, -7, -14, -11, 1, -3 }, { -2, 2, -7, -17, -2, -7, 
29, -6, -29, 21, -70, 58, -36, 17, -28, -4, -26, 33, -21, -4, -11, 8, 8, 19, -12, 8, 9, 3, 2, 3, -2, -2, -4, -3, 1, 4, -2, -7, -1, -1, 0, -6, -2, 3, 2, 3, 0, 1 }, { 2, 2, -4, 20, -20, 9, 8, -42, 16, -2, -28, 28, 20, -54, 53, -15, 34, 19, -46, 9, -5, 5, 0, -14, 18, -21, -8, -16, 3, -2, -11, 4, 12, 5, -2, 3, -1, 3, 3, -5, -2, 9, 1, 2, 0, -8, -4, 2 }, { 0, 10, 1, -30, 7, 21, 1, 22, 24, 7, -60, -38, 21, 14, 38, 10, 12, -11, 22, 17, -21, -2, 35, 34, 2, -21, -10, -2, 1, 2, -1, 0, -27, -15, 12, 4, -1, -16, -11, 7, 11, 7, 4, -3, -1, 17, 10, -6 }, { 0, 0, -5, -13, -17, 30, 14, 6, -19, -8, 32, 10, -39, 28, 75, 11, -27, 36, 19, -20, 14, -2, -19, 4, -8, -19, -10, -22, -2, 11, 2, -4, -5, 9, -6, -2, 2, 5, 0, -1, 17, 5, 6, -6, 1, 4, -3, 1 }, { -2, 4, 4, 10, -7, -13, 25, 36, 43, 12, 25, 22, 22, -3, -20, -11, 23, 65, 21, 16, 5, -1, 1, -6, -7, -15, 19, 6, 6, -4, -20, -30, -29, -2, 5, -1, 5, 5, 9, 20, 16, 7, -2, 9, 17, 13, 3, -14 }, { 0, 0, 1, -9, -10, -2, 6, -2, -13, 21, 9, -30, -38, 20, -8, 6, -10, 3, -33, 71, -12, 5, 3, -51, 35, -25, -3, -2, 2, -2, 10, -4, -10, -23, -10, 6, -10, 1, 17, 13, 3, 4, 2, -5, -4, 0, 13, -5 }, { 0, 3, -4, -2, 7, 6, 19, 30, -30, -42, 2, -17, 15, 36, -18, -2, 35, 17, -43, -14, -16, -8, -15, 22, 38, -36, -3, 18, 3, -7, 1, -4, 10, 31, 3, -5, 1, 6, -19, -22, 0, 8, 1, -1, 10, -7, -11, 12 }, { -2, -1, 2, -5, -4, -10, 18, 11, -17, 12, -7, 23, -41, 14, 14, -15, 77, -39, 34, -10, -11, -6, 0, -25, 1, 17, -30, 13, 4, -8, -6, -29, 13, -2, 5, 4, -3, 3, 8, 8, 3, 11, 6, 14, 4, -1, -1, -1 }, { -2, -1, -5, 7, -4, -1, 27, -6, 1, 7, -4, 8, 23, 4, 6, 101, -1, 3, 9, 7, -3, 6, -6, -13, 24, 34, -6, 13, -21, -10, -26, -1, 1, 6, -2, 2, 1, 3, 3, -19, 0, -4, 7, 20, 7, -2, -1, 12 }, { 1, 0, -1, 3, 1, 1, -9, -8, 7, -8, 23, 23, 4, -1, 5, -10, -27, 9, 12, 14, -92, -29, 19, -12, 11, 19, -16, -3, -3, 7, 16, -8, 3, 30, 36, 1, 12, -7, -19, 3, -3, -3, 0, 0, 1, 5, -5, 4 }, { 1, 6, 5, -3, -1, -9, 17, 5, 5, -15, 1, -20, -35, -41, -18, 40, 1, -6, -17, -32, -28, -26, 5, -13, -50, -43, -2, 5, -10, 
-16, -18, 15, -1, 1, 9, -5, 4, -3, 7, 24, 22, 12, -3, -9, -17, -6, -7, -15 }, { 0, 2, 3, -8, 4, 1, -4, 13, 11, 4, -46, -30, 14, 0, -7, -8, -21, 32, 34, -25, 5, -17, 0, -51, 14, -7, -2, -6, 6, 11, 7, -24, 53, 6, -3, -7, 2, 23, 21, -7, 0, -3, 3, -4, -16, -28, -16, 11 }, { 0, 0, 1, 5, 5, -2, -4, -2, 0, -1, 3, 43, 4, -24, -18, 14, -29, -31, 24, -13, 24, 11, 20, 4, 47, -59, -23, 1, -9, -18, 29, -36, -5, 0, -3, 5, -2, 0, -20, 4, 18, 5, 5, -9, 2, 10, 2, -2 }, { 0, 4, 1, -6, -1, -6, -5, 3, 10, 12, -20, 2, -10, -13, -14, 6, 7, -11, 34, 28, -13, 26, -65, 15, 6, -12, 1, -29, 2, -1, 9, 23, -28, 57, 1, -1, 0, 14, 14, 10, -8, 15, 7, -17, -2, -23, -19, 4 }, { 0, -3, -1, 0, 4, 1, -8, -4, 0, 0, -5, 6, -3, 13, 23, 9, 1, -29, 4, -17, -21, 14, -5, -21, 6, -26, 96, -3, 14, -3, -14, -22, 4, 14, -2, 5, -6, -8, -11, 16, -19, -29, -11, 16, 8, 7, 3, -2 }, { -1, -1, -1, -1, -8, -3, 3, -2, -4, 11, 13, 19, -16, 3, -7, 5, 19, 8, 35, 21, 13, -42, 38, 20, 24, -17, 37, -21, 4, -16, -8, 60, 27, -15, 9, -8, 15, 12, -7, -16, -12, 18, 2, -3, -12, -13, -3, 13 }, { -1, -2, 1, -3, -2, -3, -2, -1, 5, 5, 0, 2, -6, -9, 24, -1, 0, 22, 13, -3, 3, 16, -5, 2, 12, -4, 2, 95, 9, -21, 41, 39, 5, 10, 7, 3, 2, -4, 11, 24, -3, -20, -13, 2, -17, -5, 6, 1 }, { 0, 1, 0, -7, -7, -3, 12, 5, 0, -1, 22, -11, -6, -9, -9, 6, 13, 16, 11, 15, -14, 74, 36, 16, -20, -3, -1, -13, 2, 8, -2, -6, 50, 16, -7, 30, -21, -25, -18, -8, 14, -7, 3, -9, -19, -24, 6, 9 }, { 0, 0, 1, 2, -1, 1, -6, -2, 2, 7, 2, 6, -5, -3, 5, -21, -25, -24, 13, 31, -10, 0, -18, 20, -9, -33, -10, 43, -4, 22, -69, -7, 19, -8, 10, -3, 7, 15, 16, -41, 21, -16, -19, 4, 14, -8, -9, 3 }, { 1, 0, 0, 2, 3, 2, -5, 3, -4, -4, 5, 5, 8, 10, -9, -17, -11, 6, 14, -51, -27, 33, 6, -20, 39, -7, -17, -8, 1, 1, -42, 44, -20, -14, -8, 7, -14, -27, 35, -1, -18, 25, -5, 3, 6, -3, 27, -15 }, { 0, 0, 0, -2, 5, 3, 0, 2, -3, 3, -5, -11, -3, 10, 7, -20, -26, -3, -6, 12, 29, -1, 34, -14, -10, 3, -20, 8, -6, -40, -45, 3, -4, 55, -12, 12, 8, 13, -34, 38, -27, 17, 1, 32, 7, -11, -8, 4 } } }, 
{ { { 90, 67, 6, -3, 13, -52, -22, -3, -5, 1, 0, 2, 4, -9, 15, 2, 1, 4, -2, 0, 1, 0, 1, 0, -1, -2, 3, -2, 0, -1, -1, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 1, 0, -1, 0, 0, 0, 0 }, { -52, 75, 39, -9, 49, 42, -39, -10, -1, -1, -1, 3, 4, -16, -12, 10, 1, 3, 2, 1, 0, 0, 2, 0, -1, 1, 4, 3, 0, 1, -2, 0, 0, 1, 1, 0, 1, 0, -1, 1, 0, 0, 0, 0, 0, -1, 0, 0 }, { 12, -53, 80, -3, 48, -23, 14, -56, 4, 4, 2, -3, 5, -17, 7, -2, 19, -9, -2, 1, 2, 2, -1, -2, 2, -2, 4, -2, -2, -4, 4, -1, -1, -1, 1, 1, 0, -1, 1, 1, -1, -1, -1, 1, 0, -1, 0, 0 }, { 69, -6, 20, 16, 8, 87, 25, 14, 8, 1, 2, -3, -5, 9, -44, -1, -7, -2, 7, -1, 2, 1, -1, 1, 3, 6, -6, 14, 3, 3, 1, -3, 2, 2, 0, 1, 0, 1, 1, 1, 0, -2, 3, 1, 0, -1, 0, 0 }, { -9, 15, -8, 79, 8, -13, 7, -5, 72, -11, 1, 1, -47, -13, 6, -7, 6, -34, 8, 1, 1, 2, 3, 0, -4, 15, 4, -1, 1, -3, 9, -3, 0, -2, 2, 0, 2, -1, -2, 0, 2, -4, -1, 0, 0, 1, -1, 0 }, { 3, -33, -18, 4, 65, -17, -9, 69, 4, -1, -2, -1, 9, -56, -10, 6, -41, 7, -1, 2, -1, -1, 0, 5, 1, -9, 17, 2, -3, 11, -6, 4, -3, 0, 1, 0, 0, 2, 1, -4, -1, 2, -2, -2, 2, 0, -2, -1 }, { -14, 36, 4, -14, 30, -6, 95, 24, 6, 1, -1, 4, 23, 16, 28, -44, 3, -3, 0, 2, -1, -1, 2, 4, 8, -9, -10, 1, 11, -1, 7, 3, -1, 2, 1, 0, 1, 1, 3, 0, -5, 3, 4, 2, 2, 1, 0, -1 }, { 7, 5, -54, -12, 8, 24, 23, -37, -20, -35, 4, -23, -2, -64, 5, 5, 48, -1, 20, 4, -6, -2, -4, -8, 10, -5, 31, 2, -1, -17, 7, -8, -5, 2, 0, -1, 0, -1, 0, 6, -4, 3, -5, 5, 3, -1, -3, -1 }, { -12, 9, -10, 2, 16, -39, 20, -17, -37, -7, -2, -2, -47, 4, -75, -24, -3, 29, -7, -3, -1, 0, -2, 1, 0, 35, -15, 33, 0, 4, -7, 4, 1, -5, -1, -2, -1, -1, 3, -1, 0, -14, 2, 0, 1, 0, 1, -3 }, { 3, -2, 4, 19, 7, 23, 14, 14, -39, 41, 0, 34, -51, -11, 48, 13, 21, 30, -32, -1, 4, 2, 1, 3, -25, 28, 20, -19, 1, -8, -8, 23, -6, -4, -1, 2, 1, -2, 0, -4, 10, -2, -4, -8, 7, 4, -1, -1 }, { -1, -2, 31, 22, -1, 2, -3, 22, -27, -60, 6, -59, -26, 22, 38, 4, -15, 35, 32, -6, -4, 0, -7, -1, 25, 12, 0, -18, 2, 5, -13, -5, 8, 2, -4, -1, -2, 2, 2, 0, -5, 2, 0, -2, -1, 2, 4, -1 }, { 8, -18, -36, -6, 61, 1, 
-24, 11, 13, -17, -4, 16, 9, 52, 7, 12, 56, 13, 17, 3, -4, -1, 3, 0, -12, 17, -41, -8, -4, -16, -2, -7, 6, 1, 2, -2, 1, 1, 1, 2, 11, -8, 15, 5, -4, 2, 1, 0 }, { -6, 7, 2, 44, -6, -11, 25, -11, 11, 4, 34, 24, 34, 9, -13, 44, 5, 62, 16, 22, -2, 5, 0, -15, -4, -32, 16, 19, -20, 7, -31, -3, -4, 6, 1, 1, 1, 1, 6, 6, 2, 13, -6, 2, 3, -3, 2, -2 }, { 1, 2, -21, 11, 14, 6, 8, -27, 22, 13, -52, -30, 13, 7, 5, 25, -2, 20, -42, -45, 5, -4, -1, 29, 41, -6, -3, 4, -13, 11, -19, 33, 23, -3, -3, -2, -3, -1, 6, -6, -15, 4, -4, -12, -6, 2, 5, -3 }, { -4, -2, 25, 33, -23, -8, -19, 33, -10, 11, -28, 10, 23, -16, -24, -44, 48, 23, 20, -31, 0, 0, -4, 31, 4, -15, 16, -9, 11, -41, 6, -8, 16, 6, -2, 1, -1, 4, 4, -6, -9, 11, 1, 6, 1, -2, 0, -2 }, { 5, -7, -21, 25, 8, 18, -23, -37, -4, -5, 17, 4, 5, -19, 12, -62, -27, 27, -15, 16, 2, 2, 2, -4, -6, -30, -49, -26, 29, -2, -26, 16, -10, -1, 0, 2, 0, -1, -1, -3, -7, 11, 11, -10, 2, -4, 0, -4 }, { 2, 0, -5, -4, 0, 3, 15, -28, -10, -17, -38, 46, -9, -11, 14, 11, -37, 3, 52, -33, -4, -3, 0, 33, -37, -7, -12, -2, -1, 21, -5, -28, 23, 5, -2, -3, 0, 8, 2, -23, 19, 1, 1, 3, -14, -2, -2, -3 }, { -5, 5, 1, 37, 10, -4, 10, 10, -51, -4, 4, -21, -9, 5, -5, 30, 8, -41, -29, -24, -4, 2, -9, -12, -27, -63, -33, 11, -9, -10, 18, -11, 1, -5, -6, -2, -5, -2, -6, -4, 5, 12, 19, 2, -4, -6, -5, 1 }, { 1, -4, -23, 14, 30, 1, -13, -16, -29, 32, 12, 13, -20, 41, -3, -18, -6, -15, 23, 5, 16, 7, 7, 3, 40, -25, 41, -17, 6, 30, 22, -21, 18, -5, 5, 5, 3, -1, 1, 6, -26, 23, -27, 9, -6, 1, 1, 2 }, { 2, -3, -4, 13, 8, 2, -9, -10, -2, -37, 0, -2, 25, 27, -6, -23, -1, 3, -21, -8, -31, 1, -17, 0, -45, 4, 48, 9, 15, 28, 36, 43, 7, 13, 1, -7, -2, 0, 4, 0, 33, 14, -15, -21, 6, -2, -3, 0 }, { 0, 6, 1, -3, 1, -5, 14, -4, -4, -20, -9, 0, -6, 12, -33, 17, -7, -12, -3, 37, 5, -2, 13, 50, 0, -12, 4, -70, -31, -25, 2, 18, -33, 4, 6, -4, 3, 3, 5, -24, 8, 6, -13, -14, 23, -7, 0, -2 }, { -1, -4, 0, 15, -2, 2, -13, 0, -15, 6, -12, -12, 6, 1, 24, 1, 16, -4, 11, 44, 18, 0, 11, 58, 11, -5, -4, 
65, 19, 29, 10, 7, -35, -1, 5, -2, 0, 8, -6, -28, 6, -12, 18, 3, 20, -4, -5, -1 }, { -4, 4, -10, 40, 2, -1, 4, -13, -30, 27, 21, -4, 50, -12, 3, 12, -22, -8, 19, -6, -6, 4, -14, 4, 21, 46, -23, -21, -6, -7, 44, 9, 5, 16, -8, 2, -4, 2, 12, 3, 9, -46, 1, -3, 1, 2, 3, 1 }, { 0, 0, -1, 11, -1, 1, 3, -4, -3, -19, -10, -6, 24, -2, -4, -2, -6, 20, -8, -15, 67, -1, 60, -16, -17, 14, 2, -7, -5, 3, 30, -3, -6, -47, -4, 6, 5, -2, -31, 6, 15, 8, -4, 17, 6, 21, -5, 13 }, { 0, 2, -12, 7, 14, 2, 1, -12, 7, 29, -13, -45, 14, 20, 1, -8, -14, 10, -15, -8, 2, -5, -24, -1, -35, 9, 24, -6, 17, -18, -25, -65, -36, -1, -4, 1, -3, -1, -2, -21, 10, -23, -27, 13, 10, -7, -5, 1 }, { 1, -2, 1, 13, 3, 3, -3, 2, -21, -32, 10, 28, 19, 13, 6, -8, 0, -36, -4, -11, 3, 3, 21, -1, -6, 0, 20, 18, -6, -17, -49, 13, 0, -15, -3, 1, 2, -1, -12, -12, -45, -58, -28, -28, -10, 9, 2, -5 }, { 0, -2, -2, -11, 1, 3, -1, -5, 12, -19, 43, 2, -6, 0, 8, 8, -17, 21, -45, 12, -3, 11, 17, 35, -2, 2, 6, 16, 11, -41, 34, -27, 50, 9, 8, 5, 6, 10, 10, -35, -15, -2, -2, 19, -20, -1, 5, -6 }, { 1, -3, 1, 2, 4, -1, -1, -6, 1, -19, -37, 37, -7, 14, 3, 24, -29, 14, 1, -2, -15, -16, -11, -28, 27, -8, 6, 13, 47, -47, 35, 9, -41, -12, -9, -7, -2, -5, 6, 3, -25, 1, 2, 11, 18, -8, 4, 8 }, { 2, 0, 5, 8, -6, 6, -6, 5, 0, -20, -15, 20, 8, -9, 9, -19, 12, 15, -27, 4, -20, 1, -21, -8, -2, 16, -17, -3, -60, 42, 30, -29, -31, -10, -2, -4, -3, -10, 8, -22, -52, 9, -11, 14, -8, -4, -1, -6 }, { 2, -3, -3, -2, 6, 7, -10, -5, -1, 11, -11, -9, -13, 6, 22, -28, -22, 4, 15, 6, 0, -15, 11, -4, -4, -14, -12, 42, -67, -45, 11, 16, 3, 21, -2, -3, 1, 4, 0, 23, 22, 4, -48, 8, 11, -6, 8, 5 }, { 0, 1, 0, -7, 0, 2, -1, 1, 7, 3, 7, -3, -2, -1, 1, -2, -1, 1, 15, 19, 20, 0, -61, 10, -5, -10, -4, 7, -5, -15, 8, 10, 16, -81, -11, 1, -10, -8, 34, -7, 11, 0, -12, -18, -9, 39, 2, -17 }, { 0, 0, -3, -20, 3, 1, -3, 0, 19, 13, 40, -1, -16, 5, -1, -9, 4, 14, 25, -52, 7, 16, -3, -17, 13, -17, 7, 4, -18, -2, 16, 21, -26, 3, -17, 7, -6, -4, -17, -72, 8, -10, 16, -18, 14, 
-9, 2, 6 } }, { { 94, -70, -33, 5, 12, 33, -15, 0, 1, -3, 2, -1, -5, -6, 5, -2, 5, 4, 2, 0, -1, 0, -1, -1, -2, -2, -3, 0, 0, 2, 0, 0, 1, 1, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0 }, { 59, 23, 62, -31, -54, -26, 24, 38, 26, 12, -1, -6, -12, -20, -12, 8, 7, 0, -1, 0, 3, 0, 0, 3, 6, 7, 0, -2, 2, -2, -7, -6, -4, -2, 0, 2, 0, 1, 2, 3, 4, 5, 1, -2, -1, -2, -1, 0 }, { 49, 59, -52, 34, 28, -53, 44, -12, -16, -7, 2, 3, 7, 6, -20, 11, -6, -2, 1, 2, 0, 1, 0, -2, -1, 1, 4, -2, 1, -3, 1, 0, 1, 0, 1, 0, 0, -1, 1, 0, -1, -1, 1, 1, 0, -1, 0, 0 }, { 40, 57, 49, 9, 15, 39, -39, -33, -16, 1, 5, 11, 20, 35, 40, -16, -18, -10, -6, -3, 2, 2, 3, 1, -1, -1, 6, 11, -1, 0, 7, 7, 3, 0, 0, 1, 1, 0, -2, -5, -7, -6, -1, 5, 4, 3, 1, -2 }, { -3, -16, 16, 72, -45, -7, -5, 34, -51, -42, 13, 23, 29, -24, 6, -8, 10, -18, -10, -6, -5, 4, -1, 0, 0, 8, -5, 4, -3, 0, -3, 4, 2, 2, -2, 0, -2, 1, -3, -3, -5, 0, 1, 5, 3, 2, 0, -1 }, { 1, -51, 52, -2, 23, -21, 32, -40, -23, 9, -2, 7, 30, 34, -43, 27, -21, -25, -11, -3, 4, 1, 0, 1, 6, 16, 15, -17, 3, -8, -7, -1, 2, 0, -1, 1, 0, -1, -1, -4, -2, 2, 5, 3, 3, 2, 1, -1 }, { 2, -14, -41, -32, -53, -43, -20, -13, 11, -13, 4, 2, 11, 34, 27, -29, -46, -26, -18, -9, -7, -2, 0, 8, 14, 21, 24, 10, -7, -12, -7, -5, -3, -4, -2, -3, -1, 1, 0, -4, -4, -1, 1, 6, 7, 2, 1, -4 }, { -1, 13, -5, -32, 26, 30, 16, 9, 23, -60, 33, 57, -5, -7, -11, 14, 8, -5, -40, -24, -14, 2, 12, 16, 28, 13, 1, -3, 3, -1, -12, -15, -3, -3, 2, -2, 2, -3, -2, -1, 7, 9, 3, -1, 6, 4, 3, -4 }, { 2, -25, 37, 26, 31, -62, -15, -10, 28, -23, 29, 15, -44, 5, 16, -30, -4, 39, -2, -10, -6, 7, 5, 0, -6, -27, 5, 11, -10, 0, 17, 5, 1, -2, 1, 0, 1, -2, 0, 0, -2, -7, -1, 0, 1, 0, 1, -1 }, { -4, -7, 7, 14, 60, -10, 5, 63, 17, 33, -10, -3, 18, 0, 28, -30, 0, -31, 1, 12, 6, 1, 5, 9, 20, 36, 22, 20, -8, -2, -19, -13, -12, -3, 2, 2, 3, 5, 6, 9, 6, 7, 6, 4, 0, -1, -3, -4 }, { 7, -1, 1, -40, 18, -44, -34, -27, -32, 14, -14, 9, 27, -40, 5, -16, 45, -16, -26, -8, 3, -5, 1, 11, 12, -3, -45, 3, -8, 22, 3, -7, -11, -6, -4, 
1, -1, 1, -1, 5, 7, -2, -14, -5, -3, 1, -1, 0 }, { -3, 4, -4, 48, -18, 9, -15, -32, 20, 44, 45, -22, -10, -8, -8, 11, -1, -2, -9, -33, -10, 12, 12, 31, 24, 13, -9, -10, 5, 0, -13, -33, -28, -14, -5, 3, -1, 2, 7, 20, 24, 12, -3, -15, -6, 1, 1, -6 }, { 1, 5, -2, 34, -4, -15, -49, -5, 58, -12, -31, 20, 6, 11, -15, 22, 6, -10, -19, 44, 23, -5, -12, -23, 11, 19, -16, -36, 14, 13, -5, -8, 14, 6, 11, 3, 2, -3, -4, -8, 1, 9, -7, -2, 2, -1, 1, 1 }, { -2, -2, 8, 14, -8, 17, 33, -42, -15, -10, -47, 7, -44, -32, 14, -22, -15, 2, -29, 25, 19, -7, -13, -21, 24, 4, 8, 29, -10, -17, -17, -31, -7, -5, 1, 2, -1, -3, 9, 11, 15, 6, 0, -4, -6, -9, -6, 3 }, { 3, 12, 7, -13, 22, -8, -45, -9, -16, -23, 15, -23, -29, -45, -23, -1, -18, -27, 30, -4, -16, 2, -4, -17, -34, 29, 18, -14, 0, -28, -41, 1, 6, 6, 0, -6, -2, -6, -11, -3, 2, 22, 10, 1, 1, 1, -4, -2 }, { -3, -5, 1, 10, -7, 7, 42, -42, 54, 8, 23, 3, 26, -39, 20, -14, 8, -26, 10, 5, -6, 7, 5, -9, -26, 22, -20, 27, -7, 6, -3, 29, 14, 6, 3, -1, 3, -4, -12, -15, -27, 0, -10, 16, 11, 10, 0, -2 }, { 0, -5, 10, 1, 3, -6, 22, -8, 1, -53, -13, -46, -20, 41, 39, 13, 32, -28, 18, 3, -28, 3, 2, 0, -8, 16, -22, -7, 17, 35, -15, -9, -15, -12, -2, -6, -4, 3, 9, 16, 6, 2, -14, -8, -13, -3, -1, 0 }, { 2, 1, 4, -2, -6, 1, -6, -18, 20, -30, -19, 5, 62, -1, -19, -18, 9, 46, 39, 17, -15, 2, 16, 18, -6, 8, 13, 18, -16, -16, -6, -6, -13, -3, 1, -5, 5, 11, 15, 27, 24, 22, 14, -16, -23, -16, -5, 8 }, { -1, -2, -1, -11, -5, -7, 10, -7, -20, 5, 37, 3, 5, 18, 23, -4, 23, 37, 26, -3, 32, -9, -37, -15, 21, 33, -3, 4, 3, 11, -21, -13, 32, 42, 18, 7, -7, -12, -17, -11, 29, 23, -1, -14, 9, 15, 10, -1 }, { -1, -8, -1, -9, 1, -15, 7, -7, -14, 16, 2, 41, 4, -32, 66, 43, -26, 11, 26, 22, -11, 7, 28, 2, 5, -1, 9, -35, 41, -4, 0, 6, 0, -9, 0, -2, 8, 7, 12, 6, 10, 6, 0, 0, -7, -5, -4, 2 }, { 1, 1, -6, 0, -15, 1, -5, -2, -18, 34, 1, 33, -29, 33, -18, -13, 37, 10, -4, 18, -36, 25, 41, -30, 2, 25, 0, 12, -12, 8, -27, 23, 26, -27, -14, -6, 9, 2, 16, -5, -5, 19, 6, 6, 1, -2, 4, 3 
}, { 0, 0, 6, 1, 11, 0, 3, 11, 3, -4, -8, -45, 29, -16, 6, 4, -28, 21, -26, 0, -55, 19, 13, -31, 25, -10, -13, 1, 0, -1, 8, -28, 45, 4, -15, -14, 2, 0, -5, -32, 17, 0, -10, -16, 19, 6, 12, 1 }, { 0, 1, 3, -4, -1, 7, -1, 3, -10, -8, 14, 13, -18, 0, -32, -21, -40, -22, 55, 38, -5, 5, 16, 22, 42, -3, -41, 4, -15, 23, 6, -5, 9, 13, 11, 1, 4, 6, 4, 3, 10, -26, -31, 5, 9, 2, -4, -3 }, { 0, 2, 3, 8, 3, -2, -7, -11, -6, -12, -33, -30, -13, -15, -1, 6, 11, 34, 1, -9, -4, -14, 9, 50, 47, 36, 13, -8, 8, -3, 5, 37, 8, 3, 4, -2, 3, 8, 5, -4, -33, 6, 6, 35, 28, 21, 9, -17 }, { 0, 1, 2, 0, 7, 5, 9, 16, -6, 4, -14, 10, 5, 5, -5, -22, -51, 46, -13, -32, 7, -7, -10, -12, -30, 35, -49, -17, 10, 18, -27, 6, -21, -11, -6, -1, 1, 6, 10, 5, -22, 14, -35, 3, -9, -3, -5, -4 }, { -1, 1, -1, -13, 6, 3, -4, 0, -6, -14, 52, -31, 13, -11, -3, -1, 1, 10, -16, 29, 26, 23, -28, -29, 30, 17, 7, -6, 5, -2, 16, 22, -1, -29, -15, 2, -4, 11, 38, 18, -17, -12, -2, 18, -10, -37, -11, 22 }, { 0, -1, -1, -1, 1, 1, 10, 3, -3, -6, 13, -23, 4, 5, 4, -24, -6, -4, -34, 28, 33, 4, 29, 48, -16, -42, -11, -14, 17, -5, -50, 17, 45, 3, 3, 9, 3, 0, 1, 10, -1, 25, 1, -13, -3, -11, 1, 1 }, { -1, -1, -1, 4, 3, -6, -8, 1, 10, -6, -21, 10, -9, 4, 7, 34, -6, -13, 17, -36, 33, 41, -17, 14, 5, -6, -17, 29, -16, -6, -8, 12, 22, 15, -56, -11, 8, 46, 15, -13, 13, 16, 0, -1, 4, -7, -19, 14 }, { -1, -3, 1, -2, 3, -7, -12, 11, -9, -1, 10, -10, -10, 6, 9, 60, -21, 7, -25, 23, 3, -15, 13, 16, -24, 31, -33, 51, -37, -18, 22, 15, 8, -12, 27, 10, -5, -16, -4, 18, 15, 8, 2, -8, -2, 0, 11, 2 }, { 0, -1, 1, 7, 3, -4, -6, 2, 11, 4, -2, 4, 9, 2, 5, 17, -7, -12, 19, -29, -20, -49, -19, -21, 44, -41, -19, 16, -7, -10, -45, 30, 5, -13, 22, 7, -11, -17, 27, -8, -15, 10, 12, -1, -24, -14, 10, 11 }, { 0, 0, 1, 4, -1, 2, 3, -4, 2, -3, -5, 1, -7, -6, -2, -16, 6, -12, 10, -35, 14, -44, 25, 8, -22, 27, 11, -15, 1, 6, 32, -19, 45, -15, 15, 20, -1, 4, 50, -26, 18, -8, -6, -8, 14, -34, -4, 28 }, { 1, 1, 0, 3, 1, 4, 7, 1, 7, 2, -14, 4, -5, -4, -2, -23, 
-6, -16, -14, -23, -10, 7, -5, -14, 8, 15, -4, -26, 4, -15, 19, 58, 20, 27, -4, -4, -2, -8, 5, 59, 42, -20, 1, -22, -20, 26, 20, 15 } }, { { 113, -33, -41, 19, 6, -14, 8, 5, 0, -3, 0, -4, -4, -3, -1, 1, -1, 0, 2, 3, 1, 0, -2, -1, 1, 2, 0, -1, 0, 0, 0, -1, -1, 0, 1, 0, -1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, { 53, 52, 89, -32, -9, -24, 5, -21, 8, 18, 0, 5, 9, 11, 4, 1, -1, -6, -8, -3, 1, 2, 5, 3, 1, 1, 0, 1, 1, 0, -1, 2, 1, -3, -1, 1, 1, 0, -1, -1, 0, 1, 0, 0, 1, 2, 1, 0 }, { -3, -97, 55, -11, -53, 19, 7, 8, 9, 4, 6, -9, -9, 6, -12, 1, -7, 4, 1, -3, -2, 2, -2, -1, 0, 1, 3, -1, -1, -1, -2, -1, 1, 0, -2, 0, -1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, { 24, 37, 17, 32, -16, 100, -25, 32, -4, -18, 9, 2, -10, 1, 2, 9, -4, 16, 13, 0, -2, 1, 0, -3, -6, -7, 4, 2, 3, -1, 0, 2, 4, 4, 1, 1, 0, 0, 0, -1, -1, 1, 1, 1, -1, -1, 0, 1 }, { -8, 17, 10, 78, -58, -51, -2, -10, -13, -44, 22, 7, -16, 20, 12, -5, -6, 6, -2, -5, -1, 0, -5, -2, -1, -2, -2, -1, -2, 2, 4, 4, 1, -2, 2, 3, -3, -1, 1, -1, -4, -1, 0, 0, 2, 0, -1, 0 }, { -2, -31, 37, 40, 76, 14, -5, -64, -16, -16, 12, 14, -9, 3, 1, -5, 15, 14, 8, 6, -1, 1, -2, -15, -17, -5, -5, 2, -1, 0, -2, 4, 8, 6, 5, 1, -1, -6, -2, 2, 3, 1, 0, -1, -2, -2, -1, 1 }, { -3, 14, -14, 2, -6, 31, 101, -23, 4, -5, 0, -5, 0, 55, -24, -4, -6, -6, 1, 3, -1, -1, -1, -4, 11, 11, -6, 10, 2, 4, -4, -2, 3, 2, 1, 0, 0, 0, 4, 1, 0, -1, 0, 0, -1, 0, 2, -2 }, { 6, 6, -35, -38, -42, 19, -45, -81, 21, -16, 7, -23, -6, 23, 4, 5, 8, -7, 9, 1, -13, 1, -2, -12, -14, 5, -12, -1, -2, -3, -3, 4, 8, 2, -2, -2, -1, -2, -7, -2, 1, 2, -3, -3, -1, 3, 0, 0 }, { -6, 7, 8, 21, 17, -8, 0, 11, 97, 4, 4, -15, -60, -10, -10, 2, 5, -15, 12, 21, -3, 3, 4, -4, 1, 24, -2, -1, 0, 1, -7, -5, -11, -7, 3, -1, 4, 3, 4, 4, 1, 1, -1, 0, 1, 1, -5, -2 }, { 3, -4, 5, -36, 19, -9, 3, 27, 19, -79, 50, 26, 31, 0, -14, 1, -10, -26, 20, -13, -31, 1, 5, -1, -11, 10, 7, -2, -1, 0, -1, 2, 8, 12, 6, -2, -3, 0, -7, -8, -1, 0, 1, 1, 2, 2, -1, -1 }, { 0, 9, -1, 7, 2, -10, -25, -10, -14, 6, 1, 0, -5, -5,-107, 19, 
-50, 10, -11, 2, 3, 2, -2, -4, 2, 2, 7, 1, -7, -2, -11, -12, 1, -1, 0, 1, 0, -3, 0, 0, 3, 3, -7, 0, -1, -4, 1, -1 }, { 1, 10, 2, 6, -28, 11, 37, -30, -10, -11, 11, -5, 6, -99, -16, 9, 36, -21, 4, -2, 0, -2, -3, -5, 2, 5, 5, 2, 3, -12, -2, -13, -12, 3, 0, 2, -3, -1, 1, 8, 10, -3, -1, 0, -3, -3, -6, 0 }, { -2, -9, -4, 43, -4, 18, -16, -12, 36, 36, 20, 24, 68, 7, -5, -2, 3, -51, -41, -1, 3, 9, 7, 13, 11, 7, -2, 3, -3, 1, 13, 5, 1, 0, 1, 2, 0, -3, -12, -10, -6, -2, -1, 1, 5, 8, 6, 1 }, { 2, -3, -14, -19, -8, 10, 7, -9, 0, 1, 8, 79, -53, -8, 5, 5, 2, 14, -60, -29, -6, -2, 25, 8, -6, 7, 4, -1, 2, -3, -11, 19, -2, -13, -1, 0, 4, -8, -5, 1, -9, 2, 0, 3, 3, 5, 4, 0 }, { 0, -5, 1, 4, -8, 1, 9, -19, 39, -36, -69, 33, 36, -10, 6, 19, -25, 26, 9, 21, 24, -21, -3, 27, -20, -18, 17, -5, 3, -2, 1, 9, -14, 0, 6, -2, 3, 8, 5, 0, -1, 9, -2, 0, 0, -6, -5, 3 }, { 3, -1, 5, 1, -4, -4, -19, 18, -5, -21, -31, 8, 10, 27, -44, -7, 90, 7, -4, -6, 8, -10, 0, 20, 2, 34, -27, -2, -3, 1, -6, 3, -10, -3, -1, 1, -1, 5, 5, 16, 4, 4, 9, -6, -5, 0, 4, 2 }, { 2, -3, 9, 0, 3, 10, -10, 1, -33, -18, -45, 3, -27, -4, 6, -27, -33, -81, 2, 3, 18, -16, -8, 1, -16, 33, -19, 18, -1, 11, -2, 8, -12, -1, -1, 2, 1, 0, -8, -6, -6, -15, -3, -1, 7, 6, -6, -2 }, { -2, -4, 7, 3, 17, 3, -3, -4, -11, -29, -13, -39, -11, 13, 17, 77, 7, -24, -38, -14, -2, -6, -10, 15, 37, 12, 45, -16, -1, -9, -4, -5, 1, -11, -2, -3, -5, -4, 0, -4, 3, 4, 7, -3, 5, 10, 10, 0 }, { 3, 0, 6, -6, 11, 10, -4, -7, 15, -40, -10, -36, 2, -17, -4, -59, -13, 20, -53, -6, -16, -1, -8, 16, 47, -28, -28, 14, 2, -3, 14, -3, -5, -15, -2, -2, -4, -1, 12, -2, -20, -3, 0, 5, 4, 2, 6, -5 }, { 0, 2, 1, -11, -6, -4, 1, 19, -2, -11, -3, 5, 2, 5, -2, 15, 16, -11, -46, 71, 1, -1, -24, -67, -15, -28, -7, 2, 0, 3, -3, 11, 3, 5, 26, 1, -1, -20, -4, 13, -10, 0, 6, 1, -13, -1, -2, 4 }, { 1, 0, -3, -12, -4, 3, -12, -10, -5, -1, 22, 15, 7, -2, 10, -25, -3, 26, 4, 30, 33, -13, -20, -6, 43, 61, 52, 31, -15, 13, 3, 0, -3, 10, 22, 10, -3, -1, 15, 12, -6, -2, -7, 16, 11, 2, 
-3, -7 }, { -2, 1, 3, 8, 2, -2, 10, 8, 5, 1, 1, -36, 24, -21, 14, 16, -18, 36, -29, -11, -8, -1, -5, -2, -41, 59, -32, 18, -4, 18, -39, 35, 23, 0, -4, -3, -2, -3, -8, -3, -5, 9, -10, -12, -3, 9, 3, 0 }, { 0, -4, 4, 2, -3, 0, -5, -5, -10, -1, -11, 29, 6, -8, 15, 34, -21, 4, 24, 28, -27, 12, 23, -7, 65, 22, -64, -17, 3, 11, -12, -11, -5, 8, 1, -11, 9, -1, 8, 15, 1, 1, 2, 4, -10, 7, 9, -5 }, { 0, -3, -2, -12, 4, 1, -1, 0, 1, 0, 46, 1, -7, 1, 3, 35, 2, -7, 4, 1, 49, -33, -16, 33, 6, -34, -44, 30, -9, 35, -8, 5, -14, -26, 13, 24, -9, 10, 14, -9, 0, 5, -5, -9, -5, -12, -14, 2 }, { 0, -1, 0, 2, -3, -2, -1, -2, -16, 0, -3, -8, -3, 1, -4, 8, 13, -4, 3, 38, -49, 32, 36, 33, -13, -16, 27, 60, -7, 40, -3, 13, -32, -10, -5, -15, 10, 9, -5, 3, -2, 8, -8, 5, 12, 6, 1, -6 }, { 0, 2, -1, -4, 1, 4, 4, -4, -19, 5, 29, -12, -5, 1, -4, -16, -6, -3, -1, 51, -1, 2, 0, 49, -8, 8, 0, -62, 7, -41, -18, 46, -22, -7, 15, 3, 2, 16, 0, 0, -15, 1, 3, -7, 12, 5, -6, -3 }, { 0, 0, 0, 2, -3, -6, 3, 2, 3, 2, -11, -3, -13, -13, -15, 5, 11, -11, 21, 8, 6, -19, 23, 6, 27, -19, 2, 21, -9, -11, 18, 66, 79, 18, -1, 6, 9, 3, -1, -9, -8, 1, 1, 10, 1, 12, 13, 11 }, { 0, 2, 0, 4, 1, -1, -2, -2, 8, 14, -8, 4, -1, 1, -5, -2, 2, -8, 17, -40, -24, -2, -48, -14, 25, -20, 9, -4, -8, 16, -29, 56, -46, 40, 1, -3, -18, -11, -2, 16, -22, 18, -9, -1, -3, -5, -3, 5 }, { 0, -1, -1, 5, -2, -1, -6, 0, 1, -2, -2, 4, 7, 5, 7, -20, 6, -17, 4, 2, 12, -2, 12, -2, 11, -24, 15, 16, 6, -22,-105, -26, 25, -6, -4, -3, 8, 9, 5, -1, -5, 19, 8, 9, -5, -1, 5, -19 }, { 0, 1, 1, 0, -2, -3, 2, 2, 0, 11, -2, 6, -17, 2, 7, 6, 1, 4, -15, 14, -28, -12, -36, 53, -9, -2, -15, 21, -12, -20, -4, -40, 17, 69, 29, -1, -10, -6, -26, -8, 13, -13, -2, 10, 2, -13, 6, 11 }, { 1, 2, 1, 2, -3, 5, 3, -3, -2, 3, -3, -3, -6, -7, -2, -20, 5, -9, -6, 6, -15, -30, 9, 13, -2, -6, 18, -59, -11, 81, -3, -23, 33, 3, 13, 6, -1, 1, -10, 10, -9, 29, -25, -8, -7, 4, 3, -4 }, { 0, 0, -2, -6, -1, 0, -2, -2, 2, -12, 4, 2, -7, -4, -3, -9, -4, -5, -11, 5, 37, 58, -16, 16, 6, 
-3, 0, -6, 59, 36, -3, 13, 14, 31, -12, -10, 1, -9, 9, 4, 46, 17, 27, -15, 17, -7, 11, 10 } } }, { { { 65, 93, -25, 4, -15, 31, -1, 22, -7, -2, 1, 3, -16, 23, -2, 1, 8, -4, 5, -2, -1, 1, -2, -3, 7, 2, 1, 1, 2, 4, -4, 2, 1, -1, 1, 1, 1, 1, 1, 1, -1, 0, 1, 0, 0, 1, 0, 0 }, { -67, 19, -22, 5, -59, 56, 23, -41, 12, -4, 0, -4, -12, 3, 3, 1, 16, -30, 9, 0, -2, -2, -5, 9, -17, 6, 3, 1, 2, -1, -5, -3, 2, 0, 4, -9, 1, 1, 0, -3, 1, -2, 0, -2, 0, -2, 1, 0 }, { -80, 29, -5, -12, 21, -4, -29, 59, -22, -2, -3, 0, -14, 36, -9, -1, 2, 19, -5, -3, -2, 0, 2, -16, 31, -6, -1, -2, -1, 15, -7, 4, -1, -2, -1, 7, 3, 0, 1, 7, -4, 1, 2, 1, 2, 3, -1, 0 }, { 11, -20, 58, -20, 47, 59, 34, 4, -18, 11, -1, 1, -5, 34, -8, -6, 43, -18, -8, 3, 3, -1, 3, -1, -5, 16, 1, 0, 3, 17, -10, -2, -1, 2, 3, -13, 14, 2, 4, -2, 2, -1, 0, -4, 4, -3, 2, 0 }, { -21, 40, 14, -6, 33, 32, 40, 0, 14, -2, -1, -9, 51, -34, 24, 4, -20, 45, -14, 3, 0, -1, 2, 13, 1, 2, 2, 2, 4, -19, 37, -11, 0, 6, -12, 24, -10, 0, -2, 3, 5, 3, -3, 11, -2, 5, -2, 2 }, { -16, 23, -23, -17, 32, -35, 20, 15, -41, 4, -5, 20, -22, -29, 13, -9, 13, -26, 7, -4, 0, 3, 6, 6, -55, 31, -3, 2, 15, -33, 5, 4, -2, 0, 4, -22, 1, 5, 7, -30, 12, -1, -5, 4, -12, -7, 1, 3 }, { -11, 30, 17, 11, 23, -15, -44, -10, 43, -1, 0, -1, 40, 8, -52, -22, 21, -15, 21, 2, 0, 2, -8, 43, -22, -4, -4, -3, -11, 5, 13, -6, 6, 2, 18, -17, 9, -4, 2, -11, 22, -10, -3, 2, 1, -5, 7, 4 }, { 2, -16, -67, 22, 15, -23, 45, 15, 27, -21, 3, -11, 30, 14, 10, -1, 48, 12, -5, 1, -6, 1, 1, 0, 10, 23, 3, 2, 3, 34, -4, -3, 1, 1, -2, -9, 29, -2, 10, 6, -2, 0, 0, -7, 13, -4, 2, -1 }, { -6, 32, -9, -28, 24, -12, 1, -51, -21, 3, -3, -7, 12, -39, -29, -4, -5, -9, -34, 10, -1, 0, 13, -32, 5, -24, 6, 2, -21, 29, -33, -5, -2, 4, -13, -19, 15, -9, -2, 10, -22, 4, 2, -24, 16, -10, -1, -6 }, { 12, -26, -31, -23, -27, 23, -18, 1, -39, -2, -7, -17, 7, 0, -50, -44, 16, 29, -32, -7, -2, -5, -1, -3, -6, 15, -14, -3, -6, -12, 33, -22, -3, 8, -21, 13, 6, -6, 5, -12, 12, -1, -16, 17, -7, 1, -7, 6 }, { -5, 18, 
54, 33, -39, -18, -11, 19, 18, -4, 12, -31, 2, -11, 16, -13, 5, 19, -35, 8, 2, -3, 2, -31, -15, 36, -1, 4, 9, -2, -14, -5, 1, 5, -16, -25, 29, 0, 22, -25, -6, 4, -4, -8, -2, -15, -2, 3 }, { 3, -17, -8, 10, -20, 34, 1, 39, -23, 9, 3, 19, 18, -32, 0, -1, -19, 30, 18, -4, 3, -5, 20, 14, -9, -23, 19, 6, -32, 17, -5, 28, -6, 0, 28, -38, 21, -11, -1, -17, 20, 3, 13, -16, 10, -24, 22, 9 }, { -7, 14, 26, 25, -17, -22, 29, -12, -6, 30, 2, 60, -19, -21, -14, -44, 19, 18, 28, -14, 8, 7, 1, 0, 19, 13, -18, -3, -11, 21, 14, 10, -8, -3, -11, 24, 13, -12, 4, 23, -9, 7, -2, 6, 12, 11, -7, -3 }, { 1, 9, 21, -17, -32, -27, -8, 5, -43, 14, -11, -13, 18, -5, 23, 40, 28, -4, -3, -22, 4, -7, -25, 33, -12, -1, 30, 5, -4, 31, 19, -34, 0, -2, -3, 4, 9, 9, -9, 26, 11, -21, -11, -3, 25, 5, 2, -3 }, { 0, 11, 4, -40, -7, -27, 14, -46, 12, 10, -11, 16, -18, 40, 6, 20, -5, 34, -12, 7, -3, -3, 4, -4, 30, 6, 14, 13, -5, -2, 20, 11, -15, -6, 23, -10, 22, -7, 14, -32, 36, -2, 1, 16, -10, -15, 19, 22 }, { 4, -6, -12, -44, -7, 20, -46, -8, 13, 21, -26, 38, 40, -4, 22, 7, 1, 12, 19, -15, 5, 3, 4, -5, 4, 40, -15, -9, 39, -1, -18, 6, 2, 0, -8, -6, 15, 14, 19, 2, -24, 8, 6, -16, 6, -4, -6, -9 }, { 1, 8, 11, -13, -28, -18, 3, -1, -21, 4, 9, -19, 34, -4, 0, -2, 36, -7, -7, 35, -3, -16, 52, 6, 5, 0, -17, -14, 19, 8, -11, 40, -6, 8, 28, 17, -30, 7, -7, 12, 10, 16, 25, 15, -19, 23, 11, 2 }, { -5, -3, -14, 61, 13, 11, -22, -13, -11, 40, 7, 39, 15, 7, -18, 42, 23, 1, -27, 6, 12, 6, 21, -23, -4, -5, 37, 16, 14, -10, 7, -5, -5, 12, -15, 4, -15, 14, 0, -12, 6, 5, -9, 10, -9, 0, -4, 6 }, { -1, 2, -6, 34, 18, 12, -10, -16, -28, 13, -3, -23, 14, -27, 16, -8, -2, -15, 4, -31, 0, -16, -31, -6, 42, -2, -25, -10, 19, 14, -24, -4, -14, -32, 22, 3, 6, -7, 27, -22, 21, -26, -1, 23, -24, 10, 5, 15 }, { 2, -5, -3, -20, 8, 16, -34, 11, 22, -17, 10, 6, -13, -35, 28, -1, 14, -32, 16, 33, -9, -11, 27, -21, 3, 8, 26, 19, -24, 16, 14, -2, -2, -9, -14, 25, 29, -19, 12, 5, 14, -9, -22, 38, 4, 17, -9, 19 }, { 2, 1, 6, -24, -20, -15, 
25, 25, 16, 28, -24, -5, 39, 11, 4, -15, 21, -28, 15, -33, 12, 6, -2, -27, 11, -33, 17, 24, -27, -23, -18, -4, 11, 0, -17, 5, -19, -24, 0, -28, -10, 8, -7, 8, -25, -9, -11, 4 }, { -2, -1, -10, 9, 1, -4, 0, -4, -24, 40, 21, -9, 10, 25, 27, -3, -23, -29, 6, 38, 13, 10, 18, 28, 16, -15, -34, -15, -3, -2, 18, -21, 29, 31, -16, 7, 28, -24, 26, -15, -7, 8, -2, -8, 14, -19, -1, 6 }, { 0, 2, 11, 11, 8, -2, -8, -25, -34, -42, 9, -29, -2, -8, -13, 4, 18, 10, 59, -15, -1, 9, 3, -4, 31, 11, 35, 17, 14, -20, 10, 13, 31, 19, -8, 12, 6, 6, 9, -15, -8, 29, 15, -10, 11, -18, 1, 2 }, { -1, 2, -4, 9, 6, -3, 5, -12, -17, 17, 8, -24, 15, 32, -14, 10, -31, 13, 13, -5, -5, -27, 7, 27, -27, 25, 11, 33, -15, -10, -28, 38, -26, -13, -6, 9, 22, -25, 8, 9, -27, -3, -17, 18, 4, 23, -34, -9 }, { 1, -1, 12, -14, -17, -5, 22, 11, -12, -41, 27, 40, 13, -3, -27, 20, -8, 12, 14, 27, -15, -2, -4, 20, 7, -20, 1, 0, 22, 18, -25, -25, 8, 1, -16, -13, -8, 6, 36, -19, -14, -22, -23, 11, -35, 6, -19, -2 }, { 0, 0, 0, -6, -4, -3, 15, 15, 14, 26, 6, -17, -5, -9, -26, 6, 3, -14, 26, 12, -16, -53, 22, -14, 23, -14, 3, -9, 28, -25, 33, -14, -41, -29, -5, -8, 6, 20, 4, -6, -4, -15, -10, -33, 24, -14, -9, -16 }, { 0, 2, 3, 13, 8, 2, -27, -24, -18, -31, 11, 25, 8, 20, 54, -23, 36, 14, -9, 2, -16, -30, -1, 7, 2, -23, -6, -4, -27, -16, 17, -7, -22, -11, 8, -21, -14, -20, -9, -10, -22, 9, -3, -21, -11, -4, -25, -21 }, { 0, -2, -14, -20, 5, 1, -6, -3, 6, 48, 62, -12, -7, -11, 8, -23, 10, 26, 16, 30, 5, -5, -32, 1, 17, 27, 21, 14, -9, -1, -21, -14, 23, -2, 11, -15, -21, 14, -36, -1, -3, -11, -3, -3, -15, 0, -2, -3 }, { 1, -3, -5, -4, 4, 4, -7, 12, 17, 17, -24, -9, -34, -29, -5, -2, 16, -3, -31, 15, 2, -4, -20, 53, 11, -3, 22, 25, 5, 17, 4, 10, -15, 3, 8, 27, -14, -5, 41, -16, -29, 25, 25, -15, -19, -19, -9, -10 }, { 0, 1, -3, 2, 9, 2, -2, -7, 15, 23, -30, -32, -34, -11, 7, 11, 14, 32, 21, -16, 7, 0, 31, 8, -2, -23, -2, -32, -4, 17, 7, -2, 18, 24, -4, -39, 12, 17, 1, -5, -4, 10, -25, 13, -31, 35, -31, -13 }, { 0, 2, 0, 
6, 7, -2, -2, -6, 1, 1, -13, 0, 0, 8, 12, -10, -9, -2, 15, -3, -26, -40, -3, 15, -6, 8, -1, -21, -17, 31, -23, 8, -24, 28, -55, 18, -17, 37, -24, -21, 5, 12, -12, 11, -6, -36, 24, 27 }, { -1, 0, -1, -17, 2, 2, -4, 6, 14, 4, 54, 1, -9, -13, -18, 59, 23, 6, 3, -34, 5, -25, -20, -12, -17, 0, -35, -27, -20, -4, 8, 4, -5, 5, -11, 11, 9, -34, 7, -3, -8, 29, 23, 4, -12, -2, 11, 8 } }, { { 116, -32, 27, 7, 12, -16, 12, -21, 5, 5, 2, 3, -3, 1, -2, 0, 5, -4, -4, 2, 2, 1, -2, 0, 0, 1, -1, 1, -1, 1, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 30, 102, -23, 2, 34, -40, -9, 28, 0, -3, 0, 6, 12, -24, 9, -1, 1, 2, 4, 1, -1, 2, 4, -1, -6, 2, 1, 0, 1, 0, 0, 1, 2, -1, -1, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -32, -33, -20, -4, 32, -81, 37, -43, 3, -3, -1, 4, 9, -38, 6, -7, 26, -21, -5, 1, 0, 2, 2, -4, -5, -3, 0, -1, -3, 7, -2, -6, -2, -2, -1, 0, -2, -1, -1, 1, 1, -1, 0, 0, 0, 0, 0, 0 }, { 4, 33, -30, -7, 18, 35, 56, -37, 7, -3, -1, 0, -7, 51, -38, 3, 38, -22, -1, 1, -2, 0, -2, 0, 33, -29, 5, 1, 4, 7, 0, -5, 0, 0, 6, 7, -9, 2, 1, 1, 1, 1, 2, 0, -2, 0, 1, 0 }, { 0, 18, 31, 15, -38, 30, 41, 11, -13, 4, 5, -4, 5, -33, 30, -25, 61, -15, -3, -4, 1, -2, -1, 5, -27, 28, 7, 0, -20, 40, -8, -3, 0, 1, -4, -1, 3, 5, -7, 11, 5, -3, 0, 1, -1, 1, 3, 0 }, { -23, 16, 90, 27, 8, -32, -19, 8, 6, 17, 7, 6, -4, 12, -27, -6, 14, -1, -3, 2, 5, 2, -1, -3, 29, -42, 7, -5, -7, 13, -3, 0, -1, -1, 3, 13, -24, 6, 1, 1, 1, -1, 3, -1, -5, 0, 0, -1 }, { -18, 32, 50, 3, 14, 5, 26, -50, 34, 8, 1, 10, -26, 16, 29, 11, -25, 3, -5, 8, 1, 4, -5, -5, -15, 44, -10, 4, 18, -31, 10, -2, -3, -2, -2, -14, 33, -8, 4, -7, -2, 2, -4, 0, 10, 0, -1, 2 }, { 6, 13, -3, 29, -55, -4, 30, -13, -24, 4, 5, -9, 7, -31, 19, 32, -16, -9, -7, -5, 0, -3, -3, 2, 5, -32, 8, -4, 44, -42, 3, 0, -2, 0, -2, 14, -33, 10, 26, -24, -2, 2, 2, -1, -7, -3, -5, -1 }, { -5, -14, 19, 1, 67, 56, -10, 5, 14, 3, -3, 4, 53, -40, 8, 15, 4, -36, 20, 4, 2, 1, 12, 2, -1, -6, 2, 0, 22, -10, -9, -4, 9, -1, -4, 8, -9, 3, 13, -11, 0, -3, 1, 0, 0, 
-1, -3, 1 }, { 6, 13, -11, -23, -21, 12, -52, -47, 35, -11, -6, 6, -32, -17, 41, 5, -1, -32, 4, 8, -4, -1, -2, -26, 20, -7, 5, 2, -21, 17, -6, -10, 3, -9, -5, 25, -32, 14, -15, 6, 4, -3, 7, 3, -16, -2, 2, -2 }, { -2, -1, 8, -18, 11, 11, 41, 3, 31, -19, -7, 21, 0, -17, -8, 17, -37, 47, -10, 9, -7, 8, 2, 22, -36, -20, 27, 2, -22, 14, -1, 8, -3, 6, 0, -2, -40, 32, -20, 20, -7, 2, 0, 0, -22, -2, 3, -5 }, { 0, 15, -9, 31, -18, 7, -28, -43, -5, 18, 9, -3, 2, -22, -69, 18, -20, -29, -5, -1, 7, -2, -2, 10, -47, -1, 1, 3, -4, 16, -30, -2, -1, 3, -6, -22, 14, -7, -11, 15, -9, -10, -5, -3, 3, 3, 2, 0 }, { -5, -9, -30, 72, 43, 17, -13, -7, -19, 29, 22, 13, -29, 4, 28, 4, 5, 21, -39, 7, 11, 7, -21, 15, 2, 11, -5, 3, -7, 12, 5, 6, -15, 1, 4, 16, -19, 9, 0, -2, 4, 3, 4, 4, -8, -4, 2, -2 }, { 2, -3, -19, 50, -19, 6, 20, 6, 48, 16, 19, 5, 22, -21, 5, -17, -21, 9, 26, 12, 7, 4, 13, -25, 35, -21, 19, 10, -31, -10, 22, -5, 5, -6, 1, -8, 27, -19, -27, 9, 3, 4, 0, -8, 22, 3, 2, 3 }, { -1, 13, 14, -20, 12, 4, -1, -42, -50, 19, 0, -54, 40, 4, 26, 10, -21, 19, 0, -23, 6, -22, 0, 13, 17, -10, 4, 14, -32, 3, 10, 14, -5, 9, 4, 6, 4, -3, -34, 14, 2, 4, 3, -4, 7, 3, 3, 2 }, { 2, 4, -1, -10, 8, 16, -22, -20, -3, 2, -3, -6, -15, -17, -9, -70, 18, 5, -10, -7, -1, -4, -9, 9, -26, -27, 23, -6, -13, -45, 36, -3, -5, 2, 13, -35, -7, 17, 6, -33, 6, 8, -6, -18, 4, -5, -6, -3 }, { -1, -3, 7, -25, 23, 12, 23, 31, -30, 19, 11, -14, -51, -33, -21, 18, -16, -16, -23, -12, 6, -5, -8, -58, 11, 4, 22, 1, -4, -9, 7, -25, -10, -17, -17, -2, 7, 8, -20, 7, 3, -6, -2, -4, 7, 7, 2, 2 }, { 1, -1, 1, -10, -5, 3, -24, -6, 3, -11, -13, 18, 7, -10, 19, 48, 36, 19, -37, 8, -6, 4, -9, 8, 14, -26, 39, -6, 21, 17, 12, -11, -13, -4, 24, -38, 32, -7, 10, 23, -14, 8, -5, -15, 31, 10, 1, 3 }, { 1, 7, 7, -19, -7, 16, 5, -14, -2, -19, -12, 19, 8, -43, -28, -14, -11, 28, -46, 11, -7, 8, -11, 4, 25, 0, -31, 17, -20, 11, -4, 3, -8, -1, -6, 35, 8, -45, 25, -30, 13, 1, 4, 7, 21, -18, 3, 4 }, { -1, 0, 12, 39, 10, 8, -2, -11, -45, -67, 
-29, 23, -15, -9, -6, 2, 0, -2, 15, -2, -26, 1, 25, -6, 2, -1, -23, -3, -2, -14, 30, -17, 18, -6, 13, -5, -1, -14, -33, 22, -4, 6, 3, 9, -12, 8, 0, 1 }, { 0, -2, 2, 28, 13, -2, 8, 5, -7, -47, -25, -18, -9, 19, 23, -24, -25, -18, -8, -3, -19, -6, -17, -7, 0, -18, 42, 12, -8, 7, -59, 18, -3, -3, -28, -2, 17, 4, 11, -16, -16, -8, -12, -8, 24, -7, -3, 0 }, { 0, 1, 1, -27, -6, 1, 4, -3, -50, 31, 5, 70, 3, 13, 22, -13, -9, -19, 11, 43, 20, 31, 8, 4, -8, -24, 4, 9, -12, -16, -20, 8, 6, 3, -9, 1, 5, -8, -14, 5, -15, 0, -5, 1, 13, 1, 1, 0 }, { -1, 0, 0, 9, -7, -9, -8, -8, -18, -2, -8, 24, 19, 15, -20, 34, 10, 3, 14, 5, -15, -2, 18, -17, -1, 42, 34, 35, -30, 4, 22, -2, 5, -1, -5, 8, -4, 44, 14, -42, 30, -4, 5, -25, 20, -14, -5, -8 }, { -1, -2, 4, 0, -6, -9, 5, 32, 19, -7, -14, -8, 9, 11, 2, 28, 2, -54, -22, 12, -4, 4, -49, 32, -7, 2, -21, 17, -31, -27, 9, -4, -18, -11, 22, -4, -11, -11, -44, -17, 2, 14, 3, -13, 6, -10, -3, -6 }, { 0, 2, -1, 6, -11, -4, -5, -4, 2, 6, -19, 10, 62, 25, -4, -31, -12, -5, -56, 12, -10, -4, -12, -29, -4, 16, 16, -16, 17, -8, 9, -37, -5, -13, -11, 15, 5, 20, -15, 17, 3, -8, -9, 27, -25, 4, 10, 3 }, { 0, -1, -1, -2, -2, -3, -4, -1, -1, 9, -23, 4, -18, -19, -21, -11, -1, -13, 17, 6, -4, 0, -15, 42, 21, 29, 58, -21, 27, -8, 15, 40, 2, 5, 3, 43, 15, -1, -22, 12, 15, 19, 0, 27, 8, 4, 15, 2 }, { 0, 2, 4, -6, 2, -1, -2, -6, -12, -45, 57, -13, 7, 9, 3, -7, -18, -3, 23, 24, 15, 21, -34, 11, -19, -9, 6, -19, 16, 21, 22, -30, -18, -10, -3, 20, 2, 1, 4, 6, 39, -21, 15, -11, 29, 12, 5, -1 }, { 0, -2, -4, 4, 0, 0, -1, 1, 14, 16, -24, 36, 1, 10, 10, 8, 10, 6, 1, -66, -11, -8, -10, -2, -24, -22, -16, -24, 7, -4, 10, -1, -3, 22, -46, 22, -4, -11, -23, 0, 23, -22, 3, -19, 38, 12, 11, -5 }, { 0, -1, -2, 6, 4, -1, 0, 5, 3, 16, -34, -49, -12, 8, 5, 17, 12, 2, -16, 66, -13, 18, 35, -14, -30, -20, -5, -27, -8, -2, 13, 14, 16, 15, -22, 3, -1, -15, -1, -6, 30, -6, 6, 1, 10, 0, 18, 0 }, { 0, 1, 1, 2, -4, 4, -6, -11, -3, -22, 8, -3, 19, -10, -14, 7, 25, 25, 12, 14, 9, 34, 
-32, -39, 21, 21, -16, -34, 4, -17, -19, 42, -30, 21, -28, -18, -8, 16, -30, -9, -13, 17, -6, -19, -10, 8, -8, -6 }, { -1, -2, 2, -2, 3, -1, -11, 0, 4, -19, 30, 3, 1, 10, 2, 22, 36, 17, 11, -3, 15, -19, -5, -14, -29, -16, 31, 30, -2, -23, 2, 4, -12, -1, -22, 9, 8, -44, -15, -13, 7, -8, -20, 45, -34, -37, 13, 8 }, { 0, 0, -2, 6, 2, 2, 2, -5, -6, 26, -44, -5, -3, -8, -4, 2, 10, 27, 53, 25, -17, -19, -62, -14, -16, -16, -21, 17, -2, 5, 8, -16, -8, -32, -4, 12, 7, 17, 22, 15, -15, 7, -11, 16, 9, 1, -5, 7 } }, { { 120, 1, 2, -4, 27, -30, 7, -5, -2, 2, 0, 1, 10, -11, 1, 0, 3, -1, -1, 0, 1, 0, 1, 2, -3, 1, -1, 0, 0, 1, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 6,-106, 42, -4, -13, 15, 12, -46, 16, 4, -1, -4, -7, 11, -5, 1, 5, -16, 1, 2, 2, -2, -3, -1, 4, 0, -1, 0, -1, 2, -5, -1, 0, 0, 0, 1, 0, 0, -1, 0, -2, -1, 0, 0, 0, 0, 0, 0 }, { -41, 11, 42, -9, 56, -73, 11, -32, 16, 3, -2, 2, 24, -39, 7, -3, 10, -16, 0, 4, 1, 2, 2, 6, -14, 4, -1, -1, -1, 5, -4, -3, 0, 0, 1, -4, 1, -1, 0, 1, -1, -1, 0, -1, 0, 0, 0, 0 }, { 13, 54, 96, -4, 1, 42, -18, -5, 29, 7, 1, 0, -11, 25, -3, 2, -8, -4, 11, 3, 2, 1, -2, -3, 11, -1, 1, 1, 0, -3, -2, 5, 2, 0, -1, 4, 0, 1, 0, -1, -1, 1, 0, 1, 0, 0, 0, 0 }, { 3, 40, -25, 5, -29, -14, 56, -58, 9, -3, 2, 4, -33, 28, -6, 0, 40, -40, 5, 1, 0, 1, -5, -13, 23, -9, 2, 1, -2, 16, -13, -2, -2, -3, -1, 8, -5, -1, -2, 5, -3, -1, 0, 2, -1, 1, 0, 0 }, { -5, -1, -31, -14, 79, 66, 15, -8, 8, 2, -5, -5, 39, 14, -27, 4, 18, -21, 12, 4, 1, -2, 1, 17, 6, -13, 0, 1, 2, 11, -11, 2, 4, 2, 7, 2, -3, 1, 1, 4, -3, -1, 1, 0, 0, 1, 0, -1 }, { -5, 0, 27, -63, -17, -15, 61, 32, -38, 11, -5, -21, 28, 35, -14, -9, 12, 27, -15, -4, 0, -4, -5, 22, 6, -5, 2, 0, -2, 1, 12, 0, -2, 4, 9, -2, -1, 3, 0, 0, 4, 1, 2, -2, 1, 0, 1, 0 }, { 8, 14, 4, -20, -46, 37, -1, -12, -11, 1, -2, -15, 30, -52, 27, -7, 26, -27, 7, 0, 0, -3, -1, 23, -52, 26, 4, 1, -10, 23, -18, -3, 4, 3, 6, -22, 12, 1, -4, 11, -5, -4, 1, -7, 5, 3, 0, -2 }, { 0, -7, 21, 80, 4, 12, 65, 41, 15, -4, 11, 24, -1, 
-15, -13, -13, 21, 18, 2, -2, 2, 4, 11, 8, -20, 4, 5, -1, -6, 11, 5, 2, -1, 7, -1, -10, 3, 3, -1, 5, 4, 1, -1, -3, 2, 2, 2, 0 }, { 2, 8, -20, -16, -24, -6, 33, -14, 59, -8, 2, 0, 15, -3, -32, 11, -62, 4, 29, 2, -5, 2, 0, 18, -27, -6, 2, 2, 11, -42, 2, 14, 4, 4, 6, -20, 6, -1, 3, -12, -4, 4, -2, -7, 3, -3, -2, -2 }, { -1, 14, 1, 8, 21, 30, 19, -59, -37, 0, 0, 6, -22, -11, 8, 14, -16, 34, -47, 1, 3, -1, -7, 2, -21, 13, 1, 4, 7, -28, 42, -24, -7, 0, 5, -17, 11, -3, 1, -15, 16, -5, 1, -8, 4, -7, 5, -2 }, { 1, 6, 14, 29, -20, -10, -24, -12, -47, 10, 5, -1, 12, -13, -81, 21, -11, -17, -19, 1, 5, -1, -3, 16, -8, -42, 6, -6, 8, 7, -22, -6, 0, -1, 11, -7, -18, 4, 3, 6, -11, -7, 3, -6, -6, 3, -4, -3 }, { 0, 4, 0, 42, -3, -14, -21, -33, -12, -48, 5, -33, 33, 44, 24, -14, 8, 31, 23, -16, -5, -5, -17, 42, 0, -4, -9, -1, -6, 4, 1, 21, -1, 2, 25, -10, 2, -8, 0, 1, -6, 10, 7, -4, 1, 1, -6, -1 }, { 1, -4, 2, -34, 17, -3, -14, 8, 4, -68, -2, 2, -53, -2, -48, -23, 17, 10, 8, -17, -14, 3, -2, -13, -23, 8, 8, -6, -18, 20, 3, 2, -5, -1, 5, -31, 18, 3, -9, 12, 2, -1, -2, -13, 4, 4, 3, -3 }, { -3, -2, -5, 6, 15, -17, -9, 10, -10, 40, -3, 23, -21, 60, 2, 14, -7, -16, -7, 15, 7, 3, -2, 15, -36, 43, -16, -3, 19, 8, -30, 4, 2, 4, 10, -33, 37, -5, 5, 11, -20, -3, -1, -9, 12, 6, -5, -1 }, { 2, 2, -15, -6, -13, -6, -30, -11, 45, 43, 2, 28, 11, 12, -11, -46, 23, 21, -23, 26, 4, 5, -1, 20, -1, -27, 2, -5, -33, 14, 30, -20, 4, 1, 25, -17, -5, 1, -11, 2, 17, -5, 9, -15, 1, 1, 5, -5 }, { 0, 0, 5, -6, 0, 3, -1, -16, -46, -3, -5, 55, 16, 19, 13, -52, -10, -13, 34, -4, -2, 1, 41, -22, -23, -27, 3, 1, -10, -23, -4, 15, 11, 16, -23, -7, -8, 1, 3, -16, 0, 0, -13, 0, 2, -7, 1, -1 }, { 0, -2, -1, 12, 18, 0, 3, 15, -13, 37, 1, -57, -47, -11, 4, -48, 4, -30, 17, -3, 4, -6, -28, 14, -7, -11, 17, -1, -4, -36, -1, 16, 2, -9, 15, -12, -8, 11, 4, -33, 4, 9, -1, -8, 0, -12, -2, 0 }, { 0, -3, 5, 2, -13, -7, -17, 15, 1, -15, -2, 15, 16, 3, -17, 25, 50, -31, 20, -3, -5, -2, 15, 4, 13, 19, -14, -5, 32, -27, 36, -12, 6, 
0, 20, -10, 13, -8, 31, -45, 34, -7, 10, -12, 11, -21, 9, -5 }, { 0, 5, -3, 18, -6, -3, -3, -10, -3, 13, 15, -42, 36, 8, -39, -33, -16, -4, 0, 0, 5, -1, -4, -43, 9, 32, -18, 7, -22, 11, 5, -13, -1, -1, -35, 8, 48, -19, -6, -9, 18, -11, -14, 5, 22, -9, 5, 3 }, { 1, -2, 5, 2, 1, 1, 8, 23, 6, -48, 36, 4, 18, 19, 25, -26, -25, -50, -47, 22, -5, 15, -24, -10, -9, -19, 0, 7, 10, -8, -4, -38, 3, -26, 10, -8, -6, 2, 9, -1, -8, -11, 6, 0, 1, -3, -4, 0 }, { -1, -3, -2, -11, 3, -1, -3, -7, -16, 13, 90, 8, 3, -2, 2, 18, 7, 16, 38, 25, 0, 24, -21, -15, -1, -2, 40, -4, 15, 11, 14, 21, -2, -8, -1, -1, 7, 16, -1, 11, 12, 5, 2, -9, 11, 9, 5, -2 }, { 0, 0, 1, -12, 1, 6, 4, 2, -14, -2, 31, 30, -16, -15, -14, -3, 0, -4, 14, 4, 6, 22, -35, 31, -5, 15, -76, 5, -35, -11, -1, 10, -2, -9, 4, 30, -12, -28, -8, -8, 2, 5, 2, 23, -17, -10, -3, 6 }, { 1, 2, -1, -7, -6, 6, -3, -10, 6, -12, -5, -1, -1, -19, -12, -32, 28, 40, -15, 55, 8, 13, -3, -3, 12, -1, -2, -14, 41, -12, -32, 10, 4, 3, -7, 0, 17, -12, 43, -14, -32, 20, -17, 23, -4, -7, -13, 14 }, { 0, -1, 0, 7, 0, -7, -1, 0, -8, -13, -31, -11, -9, 16, -8, -1, -1, 7, 44, 48, 20, -6, -15, 12, -35, 13, 15, -8, 9, -2, 16, -47, 27, -11, -14, 33, -19, 14, -6, 17, 16, -32, -1, 25, -10, 6, 8, 6 }, { 0, 0, 2, 4, -3, 1, 4, -4, -20, -10, -16, 25, -5, -19, -8, -27, -40, -28, 1, 24, -9, -1, 10, 38, 51, 38, 36, 5, 0, 18, 18, 18, 9, 3, 33, 18, 23, -6, -5, 13, 4, 15, 6, 6, 14, 6, 7, 6 }, { 0, 1, -5, -2, -3, -1, -12, -6, 16, 7, 3, 24, 12, 14, -13, -23, 5, -6, -24, -55, -7, 2, -32, 20, -25, 26, 36, -8, 29, 9, 3, 12, -30, 0, -23, 31, -10, 20, 16, -10, 13, 2, -15, 32, -17, -10, 8, 14 }, { 0, -1, 2, -1, -1, 5, 6, 2, -6, 14, -26, 25, 6, -16, 5, -17, -3, 4, 35, -34, 13, -9, -52, -20, 10, -29, -13, -8, 36, 0, 20, -22, -18, -25, 16, -4, 29, -22, 6, 30, -19, -1, 7, -7, 20, 22, -14, -3 }, { 0, 1, -1, -5, 2, 1, -2, -2, 6, 6, 42, -28, -21, -3, 3, -23, -5, -10, -3, -23, -1, -4, 53, 30, 0, -12, -25, -20, 44, 1, 22, -17, 9, 30, -6, 4, 1, -27, 2, 34, 4, -17, -7, 15, -7, 27, 
2, 10 }, { 0, -1, 1, 5, -4, -3, -11, 6, 1, -6, -6, -1, 12, 6, -8, 14, 35, -19, -15, 3, 8, 22, 0, 12, 4, 4, 19, 1, -17, -63, 29, 6, -2, -8, -24, -7, 11, -3, -45, 30, -28, 24, -35, 14, 13, 23, -10, 11 }, { 0, -1, -2, -5, 0, -1, 0, 5, 2, -18, -2, -1, -7, 1, 10, 8, -10, -10, 0, 1, 101, 18, 7, 19, 5, -16, 12, -16, -14, 12, -1, 4, -44, 18, -12, -1, 8, -11, 13, -9, 9, -6, -8, -9, 17, -14, 11, -6 }, { 0, -1, 1, -3, 1, -2, -2, 0, 1, 0, 9, 14, -13, -8, 8, 4, 6, 17, 8, -18, -23, 7, -7, 52, 29, -18, 13, 31, -12, -16, -43, -34, 14, -3, -37, 1, 28, 9, 10, -6, 4, -42, -13, -12, 35, -8, 6, -3 } } }, { { { 114, -37, -35, -3, -15, -2, 0, 15, 14, 0, 1, -4, -3, -5, 0, 0, 1, -4, -3, 1, -1, 0, -1, 2, 3, 3, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { 49, 82, 68, -15, -25, -17, 2, -16, -14, 1, 0, 14, 23, 15, 0, 2, -4, -7, -7, -2, 2, 0, 1, -2, -6, -3, 1, 0, 1, -1, 1, 6, 5, 0, -1, 1, 1, 0, -1, -1, -1, 0, 1, 0, -2, -2, 0, 1 }, { -31, -11, -31, -37, -94, -36, -2, 15, 16, -1, -3, 9, 33, 9, -4, 0, -7, -22, -21, -6, 0, -2, 0, 4, 0, 4, 0, -2, -1, -2, 0, 8, 8, 0, -2, -1, 0, 1, -4, -5, -4, 0, -1, 1, 0, -1, 0, 2 }, { 1, 78, -90, 7, 14, -7, -13, -25, 15, 15, -1, -12, -3, 9, 4, -2, 4, 12, -1, -3, -1, -2, 2, 7, -1, -10, -4, 2, 0, 1, 3, -4, -1, -2, 0, 0, 0, 0, -2, 2, 4, 0, -1, -3, 0, 1, 2, 0 }, { -4, 11, 18, 24, 33, -66, 12, 50, 55, 12, 2, -17, -21, 19, 5, -1, -9, -29, -21, 4, 3, 1, 2, 18, 28, 10, 5, -1, 0, 3, 1, 2, -3, -7, 2, 1, 1, 2, -4, -13, -6, -2, 2, 2, 5, 7, 3, 1 }, { -2, 27, -2, -71, 18, 64, 10, 49, 27, 3, 2, 20, -12, -27, 3, 5, 8, -13, -18, -8, -1, -3, 2, 2, 19, 18, 0, 3, 2, 2, -10, -5, 1, 5, 2, -1, 2, -1, -3, -8, -3, 3, 0, 4, 6, 3, -1, 0 }, { 2, -23, -8, -83, 53, -60, 4, -29, -20, -1, 8, 21, 0, 16, 1, -4, -5, 5, 4, -6, -3, -2, -2, -10, -5, -10, -3, -1, -1, 2, 3, 3, 3, 6, 2, -2, 1, -2, 2, 1, 4, -1, -1, -3, -3, -2, -2, 1 }, { 0, 20, -19, -6, -14, -17, 41, 39, -77, -21, -9, -24, -33, -11, 11, 2, -22, -18, 36, 13, -3, -2, -6, 4, 16, 12, 5, 2, 2, -1, 0, -5, -21, -6, 
-2, -2, -3, 4, 8, 0, -6, -6, 0, 5, 6, 7, -1, -4 }, { 0, -2, 20, -20, -35, -14, -3, -52, 19, 21, 3, -27, -64, -39, -4, -3, 1, 26, -7, -13, 2, 0, 1, 16, 31, 11, -6, -2, -1, -3, -11, -31, -21, -7, -1, 1, -1, -3, -1, 2, 5, 2, -4, 0, 13, 11, 3, -9 }, { 0, -8, -3, 11, -5, 11, 70, -3, -2, 89, 11, 25, 14, 14, 3, 5, -23, 12, 10, -25, 0, 5, 9, -5, 2, 0, 11, 5, 3, 0, 7, -5, -5, 8, 2, 3, -1, -6, 2, 4, 2, -6, 0, 0, 0, -1, -2, -2 }, { -1, 4, -9, 12, -1, 2, 67, -43, 45, -61, -5, 33, -4, -11, 9, 2, -28, -6, 2, 25, 4, -1, -2, -22, -1, 19, -1, 3, 2, -1, 1, 2, 3, -1, 2, -1, 1, 1, 11, -1, -9, 0, -5, 6, 4, -3, -6, 0 }, { 1, -10, -2, -4, -2, 34, 17, -31, -19, -9, 5, -31, -38, 52, 10, 10, -13, -40, -49, -22, -1, -1, -9, 10, 10, -28, -2, 8, 1, 1, 8, 26, 23, 0, -5, 0, -4, -5, -13, -22, -8, -5, -1, -5, -3, 0, 6, 12 }, { -1, -2, 10, -29, -5, 16, 35, 10, 31, -24, -2, -60, 23, 48, -14, 6, 11, 38, 27, 10, -7, -3, 0, 33, -12, -18, 14, 0, 1, 5, 2, -17, -14, -5, -1, -1, 0, 4, 3, 23, 16, 2, 2, -7, -2, 1, 5, -10 }, { 2, -1, 0, -2, 23, -7, 21, -17, -6, 6, -16, -62, 49, -64, -6, -4, -10, -27, -30, -20, -6, -5, 4, 10, -31, 19, 6, -3, 2, -8, -5, 2, 9, 3, -6, -2, -3, -5, -9, 3, -12, -6, 0, 3, -2, -5, 0, 0 }, { 0, -9, 6, -8, 2, 10, -13, -21, 11, 10, -6, -5, 31, -7, 78, 6, -3, -21, 41, 14, -1, 4, 13, 36, 24, 0, -36, 7, 0, 6, 22, 22, -20, -6, 1, 1, 2, 3, -9, -17, 0, 4, -2, -5, 1, 20, 8, 5 }, { -1, 2, 6, -10, -7, -3, 4, 8, 22, 21, -46, 2, -39, -15, 10, 7, 4, -35, 10, 17, -4, 1, 18, -21, -53, -48, -8, 3, 3, -3, -7, 16, -17, -22, -5, 2, -1, -3, 8, 28, 17, -4, -2, -14, -30, -13, -2, -1 }, { 2, -1, -1, -8, 7, 17, -20, -21, 9, 13, -16, 7, 5, 6, -79, -6, -40, -34, 38, 20, -3, -1, 5, 23, 17, 6, 15, -14, -7, -17, 3, 24, -19, -15, 4, -2, 2, 2, -13, -14, -10, -9, -2, 9, -2, 11, 8, 4 }, { -2, 4, 0, -4, -5, 3, 0, -4, 7, 9, 105, -16, -1, -16, -1, 2, 1, -34, 11, 31, 14, 4, -27, -10, -18, -14, 2, 0, -1, -9, -6, 5, -8, -9, 13, 2, 2, 5, 5, 12, 11, -5, 0, -4, -5, 0, -6, -6 }, { 0, 3, 4, 3, 0, 5, -24, 15, 19, -24, 10, -2, -2, 
3, 9, -14, -62, -4, 20, -72, -4, -4, -13, -18, -21, -18, -13, -6, -1, -7, 18, -22, -32, 23, 3, -3, -5, -6, -1, -2, -2, -10, -4, -12, 6, 10, -14, -11 }, { 0, 1, -2, 11, -2, -9, 15, -15, 7, -14, 1, 12, -5, 0, -7, -1, 64, -45, 45, -51, -9, -8, 2, 13, 7, -1, 18, 5, 2, 4, -36, 9, 2, 40, 4, -7, -2, 5, -12, 3, 16, 0, 8, 12, 4, 1, -10, -1 }, { -1, 6, 4, -5, -6, -7, 7, 11, 13, 3, -1, -28, -10, -23, -17, 9, -3, 28, 33, -15, 21, 1, -18, -19, 27, -3, -24, -3, -1, -9, 28, 50, 41, 22, -6, -1, -4, 8, 19, -14, 10, 16, -4, -13, -31, -28, -11, 29 }, { 0, 0, 2, -9, -2, 10, -8, -11, 9, 19, -9, -30, -1, 34, 6, -36, 26, -16, 10, -3, -18, 0, -17, -61, -4, 56, -6, -5, -9, 17, 7, 12, -19, -15, -3, -4, -4, 7, 34, 6, -30, -2, 1, 3, 3, 9, -2, 0 }, { 0, -2, -2, -3, 4, -4, -17, -6, 5, 0, -4, -10, 7, 8, 6, 103, 1, 1, 5, -11, 17, 6, 1, -31, -1, 16, 30, 11, 14, -23, -9, -3, -21, -9, -9, 2, -2, 0, 12, -14, -8, 3, 16, 7, 2, 13, 0, 4 }, { 0, 1, 0, -5, -5, 0, -16, 0, 2, 0, 11, 5, -35, 8, 17, 8, -16, 9, 4, -1, 2, 4, 24, 42, -45, 56, 30, -6, 3, 0, 25, 37, 20, 30, 12, 1, 4, -5, 2, 40, -8, -9, 7, 1, 3, -2, -6, -3 }, { 0, 3, -1, 9, 0, -6, 18, 7, -8, -10, 17, 12, -3, -1, -27, 26, 19, 12, -19, -11, -60, -5, 3, 5, -12, 21, -53, 0, -2, -15, 15, 24, -33, -9, 21, -6, -4, -17, -23, 3, 7, 31, -3, -20, -19, 23, 7, 2 }, { 0, -2, -3, 1, 5, 2, 9, 1, -8, 2, -6, 3, -1, 20, -21, 5, 8, -18, -1, -9, 76, 8, 7, 16, -26, 25, -67, -7, -6, -11, -11, -16, 0, -3, -21, 8, 13, 12, 7, 11, 7, 23, -10, -2, 24, 8, -5, -3 }, { 0, -1, -2, 4, 5, 3, -7, -2, -6, -11, 7, -2, 16, 0, -1, -2, -19, -13, -16, -20, -7, 7, 37, -10, 51, -5, -1, -3, -2, -10, -7, 20, 11, -12, -5, 0, 2, -4, 48, 61, 39, 9, 0, 10, -5, 13, 22, -33 }, { 0, 1, -3, -1, -2, -2, 12, 4, -4, -19, 17, 1, 1, -10, -5, -27, 26, 11, -5, -21, 50, 16, 50, -16, 1, -3, 35, -8, -3, 13, 32, 21, -34, -28, -12, 11, -3, -29, -22, -18, 2, -3, 2, -14, -14, 17, 12, 14 }, { 0, -1, 0, 4, 5, -1, 3, 0, 0, -1, -13, 24, 2, -10, 7, -5, 4, 11, -11, -25, 15, -10, -83, 27, -5, 7, 23, 13, -5, 3, 8, 31, 
-19, -42, -29, -7, -6, 24, -4, 20, 17, 0, 0, 1, -5, 5, 17, -10 }, { 0, 0, 0, 1, 0, 1, 2, -6, -4, 5, -19, 1, 5, -11, -17, 6, 18, -21, -25, 23, 7, -2, -14, -4, 10, -15, 17, 7, -1, 11, 67, -13, -23, 51, 15, -5, 7, 21, 22, 10, 25, 6, 3, -26, 27, 30, -20, 7 }, { -1, 1, 0, -2, 0, -2, -1, 0, 3, -6, 1, 5, -2, -8, -14, 10, 6, -18, 19, -18, -21, 81, -5, 9, -7, 5, -2, 28, -4, 21, 29, -32, 36, -34, 11, 4, -23, -2, 7, 2, 2, -4, -6, -15, 19, -11, 21, 11 }, { 0, 0, 0, -2, 0, -1, -8, 3, 0, 0, -3, -10, 3, 6, -4, -19, -16, 14, -12, 11, 5, 35, 3, -6, -3, 11, -1, 86, 7, -5, -32, 24, -30, 28, 5, 5, -3, 0, -7, -5, 27, -3, -32, 23, -5, 5, -13, 17 } }, { { 124, 1, 12, 2, -25, -11, -4, -5, -6, 2, 0, -3, 7, 3, -2, 0, 1, 3, 2, -2, 0, 0, -1, 1, -1, 0, 1, 0, 0, 0, 0, -1, -1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -8, -79, -63, -17, -60, -23, 2, 19, 11, -5, -1, 5, 27, 15, 2, 0, 0, -4, 0, 2, -1, -1, 0, -1, -10, -9, -2, -1, -1, 0, -1, 0, -1, 0, 0, -1, 0, 1, 0, 3, 3, 1, 0, 0, 0, 1, 0, 0 }, { 24, -18, -14, 6, 63, 34, 15, 79, 44, -3, -1, 7, 4, 3, 3, 1, -8, -33, -11, 7, -1, 0, 3, -1, -7, -6, -1, 2, 1, 2, 4, 12, 1, -1, 3, 0, 2, 0, -1, 3, 4, 2, 1, -1, -4, 0, -1, -1 }, { 6, 66, -64, -46, -27, 58, 20, 6, -20, -12, -5, 14, 9, -11, -2, -3, -10, -2, 10, 3, -3, -2, 3, -2, -5, 3, -2, 0, 0, 0, 3, -2, -5, 0, 0, -1, 1, -1, 0, 1, -2, 0, -1, -2, 1, 2, -1, 0 }, { -7, -30, 52, -18, -19, 33, 35, 38, -73, -10, -1, -21, -8, 21, 9, 2, -16, -13, 25, -2, -1, -1, -4, 15, 6, -17, -7, 2, 1, -1, 2, 1, -6, 4, -3, -1, -2, 3, -5, -5, 7, 2, 1, 1, 0, 0, -2, 2 }, { 13, -37, -35, -23, 25, 0, -15, -21, -30, -15, -2, -19, -75, -46, -2, 0, -2, -23, -12, 1, -2, -2, -2, 2, 29, 20, -1, 0, 0, -2, 3, 19, 10, -3, -1, -1, -1, -2, 1, -7, -7, -3, -2, -2, -8, -3, 2, 1 }, { 2, 15, -6, -58, 45, -54, -39, 23, -24, -43, -18, -6, 19, 27, -10, 0, 21, 14, 19, 9, -10, -6, 3, 13, -3, -6, 7, 2, 0, 2, -9, -11, -6, -1, 2, -3, 2, 0, -6, -2, 0, -1, 0, 5, 5, 1, -1, 1 }, { -7, -13, 42, -24, -17, 5, -41, 9, 6, -6, -7, 28, 34, -69, -33, -9, -29, -22, 
17, 9, -3, -2, 6, -7, -19, 22, 7, -4, -2, 1, 20, 13, -15, -2, 2, -1, 1, -2, -3, 8, -2, 1, 0, -8, -6, 6, -2, -3 }, { 7, -37, -3, -3, 28, 31, 14, -14, 6, 9, 4, 41, 11, -7, 17, 5, 22, 55, 55, 15, 1, 0, 3, 1, 31, 33, 7, 3, 1, 3, -2, -22, -20, 0, 3, 1, 1, 0, -2, -20, -18, 0, 1, 0, 5, 5, 0, 9 }, { -2, -1, -26, 46, 1, 23, -37, 26, -16, 2, 6, -57, 19, -8, -48, -23, -14, 38, 7, -17, 1, 1, -8, 24, 14, 12, 9, -6, -1, 9, 10, -17, 8, 13, -4, 0, -2, 5, -4, -9, -6, 1, 2, -2, 5, -8, -2, 2 }, { 1, -10, 12, -2, 2, 15, 30, -31, 37, -65, -47, -48, 25, -25, 29, 11, -6, 7, -4, -21, -21, -10, 1, 14, -5, 2, -9, 0, 0, -3, -2, -6, 1, 13, 2, -3, 1, 1, -3, -1, -3, 2, -2, 1, 3, -1, -5, 0 }, { -3, -8, 23, -19, -22, 37, -19, 20, -2, -3, -2, -13, -1, 8, 6, 7, 71, 9, -45, -5, 2, 0, -3, -15, -18, 41, 35, 4, 4, 6, -13, 2, 18, -4, -4, 0, -1, -4, 12, 14, -24, -16, 3, -1, -4, 1, 3, -7 }, { 1, -7, -16, 37, 6, 23, -31, -17, -17, -4, 5, -4, -7, -19, 12, 17, 32, -35, 23, 18, -3, 1, 8, 17, -47, -37, 12, 10, 3, -6, -29, -28, -33, -2, 7, -2, 4, -1, -20, 12, 9, -15, -6, 9, 19, 11, -7, 0 }, { -6, 26, -4, 33, -30, -18, -7, 23, 9, -9, -14, 9, 1, 1, 19, 19, 30, -32, 41, 8, -10, -5, 4, 43, 42, 22, 18, 8, 4, 1, 7, 46, 14, 11, 6, -3, 2, 8, -12, -8, -1, -5, 0, -3, -19, -5, -3, 0 }, { 0, -8, 11, 9, -14, 21, 7, -19, 21, -36, -19, 21, -35, 41, -49, -52, 5, -2, -4, 46, -13, -6, 24, 20, -4, 0, -2, -21, -6, 10, -2, 3, 2, -4, 15, -4, 7, -5, -14, 4, -1, 2, 7, 2, 1, -2, 1, -2 }, { -1, -3, 4, -6, -5, 11, -17, 5, 11, 24, -50, 29, -21, 9, -21, -16, 14, -17, 17, -72, -32, -17, -27, -18, 12, -20, -5, -7, -2, -4, -20, -14, -4, 19, -1, -8, -1, 14, 7, -11, 0, -3, -2, 11, 5, -4, -3, 7 }, { -3, 8, 18, -25, -24, 4, -14, 19, 31, 2, 22, -5, -5, -21, -1, 6, -9, -3, -16, 23, 11, 5, 4, 21, 32, -26, -22, 2, 1, -17, -53, -32, 12, -2, -3, 2, -2, 1, -14, -45, -29, -12, -7, 16, -4, -23, -3, 14 }, { 2, -12, 4, -7, 7, 39, -44, -19, -1, -8, 10, 10, 21, 6, 8, 7, 5, 21, 5, 7, -1, 2, -2, 0, 24, -34, -29, 5, -1, -13, -13, 38, 44, 2, 3, 0, 1, 2, 20, 32, 
43, 14, -8, 21, -4, -11, 9, -19 }, { -2, 9, -4, 37, -10, -4, -21, 25, -18, -29, -10, 1, -2, 0, 25, -20, 8, 8, -1, 6, -6, 0, -7, -42, -2, 21, -65, -13, -7, -25, -16, 12, -30, -33, 0, -4, -2, -8, 4, -10, -18, 10, 0, 3, -21, 16, 12, 2 }, { -1, 2, 2, 18, -13, 7, -9, 6, 7, -36, -8, 5, -15, 11, -22, 43, -27, 15, -1, 7, -13, -1, -11, -47, 26, -26, 46, 41, 14, 18, 4, 0, -12, -40, 1, -2, -2, -14, 5, -13, 7, -10, -6, 1, -2, 4, 11, 7 }, { 0, -1, 1, 3, -8, 6, -32, 12, -9, 22, -33, 18, -22, 12, 21, 33, -27, 28, -35, 12, -22, -19, 32, 22, -19, 15, -26, 39, 12, 3, 22, -12, -1, 28, 16, -9, 11, 4, -10, -5, -3, 21, -3, -6, 1, -1, -19, 9 }, { -1, 3, 1, -2, -8, -6, 7, 14, 4, 6, -26, -18, -7, -8, -16, 4, 10, 4, 7, -16, 45, 26, 77, -24, 25, -2, -1, -2, 1, -10, -13, 5, -19, 21, 31, 16, 13, -8, 20, 9, 17, -5, -2, 14, 15, 23, -2, 3 }, { 1, -4, -9, 33, 15, -13, 23, 2, -38, -20, -14, 40, 17, -11, -14, 10, -21, 1, -6, 1, -4, 3, 17, -4, -12, -3, 7, -1, 0, -9, -34, -9, 42, 24, 4, -1, -1, 4, 30, 14, -42, -35, -6, -5, -17, -20, 0, -20 }, { 1, -7, -2, 1, 10, 19, -28, -25, -9, -20, 8, 11, 21, 24, 18, -11, 6, -25, -8, -26, 31, 20, 24, -17, 0, -22, 6, -2, -5, 19, 28, 3, -6, 15, 5, 9, 0, -5, -14, -47, -16, 10, 7, -31, -38, -35, -11, 6 }, { 1, -7, -5, -11, 17, 7, 15, -24, -6, 38, -11, -24, 28, 15, -33, 8, -3, 0, -8, 1, -12, -15, 5, -2, 2, 10, -12, 22, 19, -11, -35, 45, -31, -13, 3, -5, 7, -13, -38, -3, -4, -30, -1, -2, -37, -7, -18, -16 }, { 0, -1, 3, -8, -3, 9, -10, -6, 20, -27, 29, 4, -21, 33, -22, 25, -13, 1, 15, -21, 14, 23, -17, 11, -22, 27, -11, 8, 1, -63, 14, 6, -9, 21, -1, 10, -4, 17, 4, -1, 1, -30, -41, -19, 1, 11, -1, 3 }, { 0, 2, 2, -7, -3, -1, -1, -6, 15, -11, 29, -12, -9, 5, -9, -11, -2, -10, 24, -3, 5, 7, -8, -1, -12, 4, -28, 50, 34, 54, -27, 6, -10, 30, 3, -3, -7, 25, 34, 11, -24, 15, 27, -4, -15, 23, 10, -3 }, { 0, 2, 2, -14, -3, 5, -13, 1, 18, 34, -22, -35, -15, 5, 13, 4, -3, 7, 31, 27, -9, -14, 8, -3, -5, -31, 4, -19, -17, -9, 17, -15, 2, -10, 10, -4, 7, 6, 36, 8, -21, -19, -7, -50, 
-40, 9, 27, -24 }, { 0, 0, 1, -4, -4, 2, -15, 5, 13, 2, -26, 5, -19, 12, 16, -4, -23, 1, 27, -27, 38, 22, -4, 19, -4, 11, -11, 13, 3, 17, -10, -13, 20, -41, -21, 21, -4, -45, -28, 42, -22, -5, 6, -6, 9, -10, -16, -26 }, { -1, 5, 3, 0, -12, -14, 16, 27, 13, -15, 23, 2, -38, -29, -2, 1, 17, 49, 19, -10, -1, -12, 5, -20, -29, -18, 2, -8, -4, 8, -3, 11, -12, 24, -4, -7, -5, -2, -22, 4, 25, 2, 1, -1, -30, -43, -32, -33 }, { 0, 1, 2, -4, -3, 3, -21, 3, 3, -7, 33, 2, -3, 11, 42, -17, -38, 11, -11, -17, -29, 0, 5, -6, 21, -1, 22, -27, -20, 16, -23, 22, -19, 11, 33, -2, 15, 18, -20, 24, -15, -38, 15, -14, 25, 16, -15, 0 }, { 0, 0, 0, 3, 0, 2, -10, 0, 5, -6, 10, 5, 1, 6, 16, -25, -7, -2, -6, -2, 7, -47, 32, 5, 15, -7, 9, 7, 12, -9, 3, 5, -18, 23, -76, -30, -30, -35, 20, 0, 4, -24, -9, -5, 9, 18, -3, -1 } }, { { 122, 25, 5, -20, 18, -7, -1, 2, -10, 1, 1, -2, 1, 1, -1, 1, 0, -1, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -14, 22, 113, 8, 47, -4, -2, -6, 11, -17, -1, -8, 6, 1, -1, -1, -1, 3, 1, -2, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, -1, 0, 0 }, { 30,-102, 26, 27, -25, -21, 5, -20, 45, -2, -2, 2, 4, 0, 0, 1, -2, -2, 7, -4, 1, 1, -1, 0, 1, 0, 1, -2, -1, -1, 0, 0, 0, -2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 16, 30, -8, 99, 1, 45, 2, 24, 36, 8, -17, 16, -27, 7, 0, 0, 3, 0, 0, -7, 1, 2, -1, 4, -1, -1, 1, 2, 1, 1, 1, 0, 0, 0, 0, 2, 2, -1, 0, 2, -1, -1, 1, 0, 0, -1, 0, 0 }, { -7, -23, -44, 8, 97, -22, -7, 33, 30, -3, -4, -25, 31, -9, -1, -1, 1, 9, -6, -1, -1, 0, 1, -4, 3, 3, -1, -1, -1, 2, 1, 1, -3, 1, 2, -1, 0, 2, 0, 0, 1, 2, 0, -1, 0, 0, 0, 0 }, { 4, -48, 14, -33, 15, 83, 17, 52, -24, -6, 7, -17, -30, 23, -3, 2, 1, -3, -14, 7, 0, -1, 0, 2, -7, 3, 3, 2, 1, 1, -1, -2, -2, 3, 0, 0, 0, 0, 1, 0, -2, 0, 2, 0, 0, 0, 0, 0 }, { 1, -21, 15, 40, 1, -47, -24, 33, -72, 38, 1, 7, -17, -1, -4, 0, -3, 26, -42, 24, -2, 3, -4, 5, -10, 2, 0, -2, -2, 0, 3, -2, -6, 4, -1, 1, -2, 0, 1, -1, 0, -1, 0, -1, 
-4, 1, 1, 1 }, { -2, 12, 16, -27, -24, -6, 9, 58, 44, 55, 10, 46, 52, 29, 0, 0, 2, 11, -10, 1, -7, -1, -3, -3, 1, 7, 3, 0, 0, 0, 3, 3, -3, 2, -1, 1, 2, 2, 1, -2, 1, 3, 2, 2, -1, -2, 1, 0 }, { 4, 9, 2, 7, -46, -16, -31, 51, 4, -75, -12, -42, 24, 13, -5, -2, -4, 29, -10, -25, 8, 0, 3, -10, -2, 8, 2, -1, -1, -1, 4, 5, -6, -3, 0, -2, -1, 1, -1, -2, -1, 3, 2, 1, -1, 0, 0, 0 }, { -3, 9, -2, 8, -2, -44, 84, 11, -3, -10, -12, -25, -13, 47, 23, 5, 26, -40, -24, -11, 1, 1, 3, 5, -12, -9, 7, 2, 2, 6, -3, -9, 1, 0, 0, 2, 0, -1, 2, -1, -4, -3, 3, -3, 1, 1, 2, 1 }, { 5, 4, 5, 24, -16, 31, 31, -20, -16, 30, 21, -57, 56, -25, 16, 4, 1, 15, -21, 9, 1, 1, 9, -39, 36, -12, 2, 3, 2, 1, -1, 8, -13, 7, -4, 1, -4, 0, -2, -2, 4, -1, -1, 0, -2, -1, 0, 0 }, { -3, 13, 3, -22, -12, -8, -24, -4, 47, 40, -1, -50, -46, -19, -23, -1, -26, -21, -60, -21, -7, 1, 10, -3, -12, 7, -7, -1, 0, -3, -5, -1, 0, 4, -1, 1, -2, -1, 1, -4, -1, 0, -2, -4, -1, 2, 0, -1 }, { 1, 8, -9, 9, 4, 0, -16, -29, 21, -10, 71, -12, -9, 57, -11, -4, 22, 21, -7, 51, 8, -7, 19, -9, -26, 25, -1, -4, -1, 0, 12, -5, -9, 4, -2, -2, 0, 1, 0, -8, 2, 6, 1, 0, -2, -2, 1, -2 }, { -1, -6, -4, -3, 11, 17, -11, -44, -16, 25, -57, -8, 17, 55, -21, -1, 20, 34, -6, -48, -11, 4, -19, -16, -20, 27, -1, -1, 0, -1, 8, -4, -8, -4, 1, 0, -1, -5, 1, -6, 5, 4, -2, 1, -3, -3, -1, -1 }, { 0, 3, -3, 7, 0, -3, 54, -3, -6, -19, -11, 11, 11, -5, -81, -16, -52, 5, -6, 24, -4, 1, 0, 10, 10, 35, -22, -5, -2, -6, 6, 11, -9, 7, -2, -1, 0, -2, 2, -3, 7, -1, -6, 5, -4, 1, 1, -1 }, { -2, -2, 5, 12, -2, -12, -18, 22, -18, 31, 24, -31, -1, 24, -48, -14, 8, -42, 59, -24, 7, 1, 8, -16, 31, 3, 4, -7, -4, 0, 4, -19, 31, -19, -3, 1, -5, 6, -9, 11, -1, 0, 1, 0, 4, 4, -2, 2 }, { 2, 0, 2, 6, -11, 16, -28, -4, -2, -1, -33, -21, 49, 0, -12, -14, 21, -52, -7, 47, -15, 9, -5, 46, -30, -14, -5, -5, -1, -3, 6, -25, 6, 30, -12, 1, 1, -5, 7, 2, -7, 2, -1, -6, -6, 0, 0, 1 }, { 0, 5, 2, -7, -8, -4, 11, 5, 9, 27, -19, -41, -13, 2, 30, 4, -5, 44, 37, 19, 18, 14, 15, 71, 20, 35, -2, 7, 
4, -2, 7, 8, 18, -5, 0, 4, -4, -5, 11, -5, 12, 4, 0, 3, -1, 10, -1, 1 }, { -1, 2, 0, -12, -1, -9, -14, -3, 14, -7, -39, -2, -25, 29, -12, 14, 13, 11, -4, 48, -40, -3, 3, -17, 66, -21, 21, 1, 4, 8, -15, 21, -7, 8, -13, -6, -2, -2, -21, 31, -9, -3, 4, -3, 9, -2, 6, 3 }, { -1, -2, -3, 3, 7, 3, -14, -11, -6, 5, -5, -1, 14, 51, 10, 8, -65, -2, 1, 0, 39, 15, 23, 5, -9, -53, -24, 1, 1, -11, -35, 26, 15, 7, 9, 1, 5, 4, 8, 13, 0, -13, -4, -5, 7, -6, 4, 1 }, { 1, 3, -2, 8, -1, -2, 4, 0, 0, -2, 35, -20, 0, 13, 11, 5, -31, 9, 12, -6, -64, -8, -80, 14, -12, -10, -4, 1, 1, -2, -10, 15, 23, -5, -25, 8, 3, -6, 16, 3, -1, 0, -4, -3, 8, 6, -1, 2 }, { 1, -1, -3, -4, 3, 6, 1, -11, 4, -11, 15, 13, 1, 5, -3, -55, 25, 14, -48, -15, 25, 8, -20, 24, 41, -21, -13, -22, -7, -13, 27, 3, 41, -14, 7, -3, -1, -18, 14, 27, -15, 12, -5, 4, 5, 7, -2, 10 }, { 1, -3, 1, 2, 0, -1, -6, 4, -6, -3, -11, 4, -5, 9, 48, -42, -15, -21, 3, 4, -41, -2, 27, -29, 1, 29, -53, 0, -5, -16, 31, 19, 20, 14, 4, -1, 1, 16, -26, 8, 26, 5, -5, 12, -10, 7, -6, -3 }, { 0, 4, 2, -8, -4, -6, 14, 10, 12, 13, -17, -17, -19, -5, 3, -56, -5, 28, 33, 24, -3, 13, -14, -22, -21, -44, -15, -26, -7, -16, 0, -32, -48, -27, 13, 4, 4, -1, 1, -6, -16, 6, -6, -2, -5, -2, -1, -7 }, { 0, -2, 1, 0, 1, -4, -8, 4, -4, 6, 12, -3, -9, 4, -2, -31, 24, -16, 13, -24, 13, -16, -12, 24, 17, 0, -9, -20, -14, -2, -13, 55, -56, 48, -18, 3, -3, 1, 10, -18, 19, -8, 2, 3, 14, -30, 19, -5 }, { -1, 1, 0, -1, -2, 1, -12, 3, 2, 0, -8, -5, 4, 2, 9, 20, 6, -35, -8, 28, 39, 23, -48, -10, 4, 24, -24, 8, 3, -11, 3, 26, -22, -62, -1, -5, 4, -29, -23, 5, 24, -10, 2, 7, 3, -13, -14, -8 }, { 0, -1, 1, 1, 3, 3, -2, -5, -2, 5, -15, 1, 5, 8, 15, -22, -33, -6, -3, 5, 21, -74, 9, 5, -4, -5, 30, -1, -2, 10, 25, 3, 0, -28, -59, -17, -21, -16, -19, -18, -16, 9, 3, -2, -8, -5, -6, 3 }, { 0, 0, 2, -1, -1, -1, 4, 4, 4, -1, -1, 1, -2, -4, -7, 22, 30, 20, -3, -4, 6, -48, 1, 6, 10, -6, -75, 11, -1, -27, -40, -42, 7, 1, -31, -5, -6, -1, -5, 3, 5, -30, -3, 8, -2, 2, 3, -4 }, { 0, -1, 
-3, 1, 4, -1, -14, -4, -1, -8, 7, 7, -3, 22, 11, 10, -25, -13, -10, -6, -12, 15, -8, 11, 54, -12, -7, 17, 7, -1, 26, -42, -14, 4, 15, 0, 5, -10, 10, -70, -3, -8, 7, 19, -33, -18, 14, -16 }, { 1, -2, -1, 6, 1, 6, 0, -4, -8, -4, 17, 4, 14, 2, 0, -11, 4, 1, -11, -17, -42, 11, 34, 41, 2, 6, 1, -17, -13, -1, -41, 7, -10, -38, 18, -2, -5, -18, -59, -24, -23, -9, -7, -23, 14, -10, -21, -4 }, { 0, 0, 0, 1, 1, 1, -7, -4, -1, 0, 3, 2, 3, 7, 20, -32, -17, -8, -2, -2, 19, 8, -22, -2, 14, 44, -8, 46, 0, 27, -18, -19, -26, 12, -3, 13, 6, 23, -6, 22, -64, -21, 0, -22, 8, 14, 10, 22 }, { 1, -1, 1, -1, -3, -1, 0, 0, 2, 2, -3, -5, 1, -4, -21, -36, 19, 7, 0, 7, -13, -18, 7, 5, -11, -27, -9, 68, 22, 39, 4, 29, 12, -24, 27, -7, -9, 12, 21, -16, 18, -32, 28, -2, 7, -8, -18, -15 } } } }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/stx_tables.h000066400000000000000000000035251517466257200231610ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_STX_TABLES_H #define DAV2D_SRC_STX_TABLES_H #include #include "src/levels.h" EXTERN const uint8_t dav2d_stx_scan_orders_4x4[TX_64X64][2][16]; EXTERN const uint8_t dav2d_stx_scan_orders_8x8[TX_32X32][2][64]; EXTERN const uint8_t dav2d_coeff8x8_mapping[33][48]; EXTERN const int8_t dav2d_stx_4x4_kernel[14][3][8][16]; EXTERN const int8_t dav2d_stx_8x8_kernel[11][3][32][48]; #endif /* DAV2D_SRC_STX_TABLES_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/stx_tmpl.c000066400000000000000000000044711517466257200226570ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/intops.h" #include "src/levels.h" #include "src/stx.h" #include "src/stx_tables.h" #include "src/tables.h" static void stxfm_c(coef *const cf_out, const coef *const cf, const int8_t *kernel, const int sz, const int eob HIGHBD_DECL_SUFFIX) { assert(sz == 16 || sz == 48); assert(eob >= 0 && eob < (sz == 16 ? 8 : 32)); const int min = -128 * (1 + BITDEPTH_MAX); const int max = 128 * (1 + BITDEPTH_MAX) - 1; const int h = eob + 1; for (int x = 0; x < sz; x++) { int sum = 0; for (int y = 0; y < h; y++) sum += cf[y] * kernel[y * sz + x]; sum = apply_sign((abs(sum) + 64) >> 7, sum); cf_out[x] = iclip(sum, min, max); } } COLD void bitfn(dav2d_stx_dsp_init)(Dav2dStxDSPContext *const c) { c->stxfm = stxfm_c; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/tables.c000066400000000000000000004455661517466257200222750ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/attributes.h" #include "src/debug.h" #include "src/levels.h" #include "src/tables.h" const uint8_t dav2d_block_dimensions[N_BS_SIZES][4] = { [BS_256x256] = { 64, 64, 6, 6 }, [BS_256x128] = { 64, 32, 6, 5 }, [BS_128x256] = { 32, 64, 5, 6 }, [BS_128x128] = { 32, 32, 5, 5 }, [BS_128x64] = { 32, 16, 5, 4 }, [BS_64x128] = { 16, 32, 4, 5 }, [BS_64x64] = { 16, 16, 4, 4 }, [BS_64x32] = { 16, 8, 4, 3 }, [BS_64x16] = { 16, 4, 4, 2 }, [BS_64x8] = { 16, 2, 4, 1 }, [BS_64x4] = { 16, 1, 4, 0 }, [BS_32x64] = { 8, 16, 3, 4 }, [BS_32x32] = { 8, 8, 3, 3 }, [BS_32x16] = { 8, 4, 3, 2 }, [BS_32x8] = { 8, 2, 3, 1 }, [BS_32x4] = { 8, 1, 3, 0 }, [BS_16x64] = { 4, 16, 2, 4 }, [BS_16x32] = { 4, 8, 2, 3 }, [BS_16x16] = { 4, 4, 2, 2 }, [BS_16x8] = { 4, 2, 2, 1 }, [BS_16x4] = { 4, 1, 2, 0 }, [BS_8x64] = { 2, 16, 1, 4 }, [BS_8x32] = { 2, 8, 1, 3 }, [BS_8x16] = { 2, 4, 1, 2 }, [BS_8x8] = { 2, 2, 1, 1 }, [BS_8x4] = { 2, 1, 1, 0 }, [BS_4x64] = { 1, 16, 0, 4 }, [BS_4x32] = { 1, 8, 0, 3 
}, [BS_4x16] = { 1, 4, 0, 2 }, [BS_4x8] = { 1, 2, 0, 1 }, [BS_4x4] = { 1, 1, 0, 0 }, }; const TxfmInfo dav2d_txfm_dimensions[N_RECT_TX_SIZES] = { [ TX_4X4] = { .w = 1, .h = 1, .lw = 0, .lh = 0, .min = 0, .max = 0, .ctx = 0 }, [ TX_8X8] = { .w = 2, .h = 2, .lw = 1, .lh = 1, .min = 1, .max = 1, .sub = TX_4X4, .ctx = 1 }, [ TX_16X16] = { .w = 4, .h = 4, .lw = 2, .lh = 2, .min = 2, .max = 2, .sub = TX_8X8, .ctx = 2 }, [ TX_32X32] = { .w = 8, .h = 8, .lw = 3, .lh = 3, .min = 3, .max = 3, .sub = TX_16X16, .ctx = 3 }, [ TX_64X64] = { .w = 16, .h = 16, .lw = 4, .lh = 4, .min = 4, .max = 4, .sub = TX_32X32, .ctx = 4 }, [RTX_4X8] = { .w = 1, .h = 2, .lw = 0, .lh = 1, .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 }, [RTX_8X4] = { .w = 2, .h = 1, .lw = 1, .lh = 0, .min = 0, .max = 1, .sub = TX_4X4, .ctx = 1 }, [RTX_8X16] = { .w = 2, .h = 4, .lw = 1, .lh = 2, .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 }, [RTX_16X8] = { .w = 4, .h = 2, .lw = 2, .lh = 1, .min = 1, .max = 2, .sub = TX_8X8, .ctx = 2 }, [RTX_16X32] = { .w = 4, .h = 8, .lw = 2, .lh = 3, .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 }, [RTX_32X16] = { .w = 8, .h = 4, .lw = 3, .lh = 2, .min = 2, .max = 3, .sub = TX_16X16, .ctx = 3 }, [RTX_32X64] = { .w = 8, .h = 16, .lw = 3, .lh = 4, .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 }, [RTX_64X32] = { .w = 16, .h = 8, .lw = 4, .lh = 3, .min = 3, .max = 4, .sub = TX_32X32, .ctx = 4 }, [RTX_4X16] = { .w = 1, .h = 4, .lw = 0, .lh = 2, .min = 0, .max = 2, .sub = RTX_4X8, .ctx = 1 }, [RTX_16X4] = { .w = 4, .h = 1, .lw = 2, .lh = 0, .min = 0, .max = 2, .sub = RTX_8X4, .ctx = 1 }, [RTX_8X32] = { .w = 2, .h = 8, .lw = 1, .lh = 3, .min = 1, .max = 3, .sub = RTX_8X16, .ctx = 2 }, [RTX_32X8] = { .w = 8, .h = 2, .lw = 3, .lh = 1, .min = 1, .max = 3, .sub = RTX_16X8, .ctx = 2 }, [RTX_16X64] = { .w = 4, .h = 16, .lw = 2, .lh = 4, .min = 2, .max = 4, .sub = RTX_16X32, .ctx = 3 }, [RTX_64X16] = { .w = 16, .h = 4, .lw = 4, .lh = 2, .min = 2, .max = 4, .sub = RTX_32X16, .ctx = 3 }, 
[RTX_4X32] = { .w = 1, .h = 8, .lw = 0, .lh = 3, .min = 0, .max = 3, .sub = RTX_4X16, .ctx = 2 }, [RTX_32X4] = { .w = 8, .h = 1, .lw = 3, .lh = 0, .min = 0, .max = 3, .sub = RTX_16X4, .ctx = 2 }, [RTX_8X64] = { .w = 2, .h = 16, .lw = 1, .lh = 4, .min = 1, .max = 4, .sub = RTX_8X32, .ctx = 3 }, [RTX_64X8] = { .w = 16, .h = 2, .lw = 4, .lh = 1, .min = 1, .max = 4, .sub = RTX_32X8, .ctx = 3 }, [RTX_4X64] = { .w = 1, .h = 16, .lw = 0, .lh = 4, .min = 0, .max = 4, .sub = RTX_4X32, .ctx = 2 }, [RTX_64X4] = { .w = 16, .h = 1, .lw = 4, .lh = 0, .min = 0, .max = 4, .sub = RTX_32X4, .ctx = 2 }, }; const uint8_t dav2d_tx_shift[N_RECT_TX_SIZES][2] = { [ TX_4X4] = { 7, 10 }, [ TX_8X8] = { 7, 11 }, [ TX_16X16] = { 6, 13 }, [ TX_32X32] = { 6, 13 }, [ TX_64X64] = { 6, 13 }, [RTX_4X8] = { 7, 10 }, [RTX_8X4] = { 7, 10 }, [RTX_8X16] = { 7, 11 }, [RTX_16X8] = { 7, 11 }, [RTX_16X32] = { 6, 12 }, [RTX_32X16] = { 6, 12 }, [RTX_32X64] = { 6, 12 }, [RTX_64X32] = { 6, 12 }, [RTX_4X16] = { 6, 12 }, [RTX_16X4] = { 6, 12 }, [RTX_8X32] = { 6, 13 }, [RTX_32X8] = { 6, 13 }, [RTX_16X64] = { 6, 13 }, [RTX_64X16] = { 6, 13 }, [RTX_4X32] = { 7, 11 }, [RTX_32X4] = { 7, 11 }, [RTX_8X64] = { 6, 12 }, [RTX_64X8] = { 6, 12 }, [RTX_4X64] = { 6, 13 }, [RTX_64X4] = { 6, 13 }, }; const uint8_t dav2d_tx_ddt_mask[N_RECT_TX_SIZES] = { [ TX_4X4] = 0x00, [ TX_8X8] = 0x42, [ TX_16X16] = 0x42, [ TX_32X32] = 0x00, [ TX_64X64] = 0x00, [RTX_4X8] = 0x40, [RTX_8X4] = 0x02, [RTX_8X16] = 0x42, [RTX_16X8] = 0x42, [RTX_16X32] = 0x02, [RTX_32X16] = 0x40, [RTX_32X64] = 0x00, [RTX_64X32] = 0x00, [RTX_4X16] = 0x40, [RTX_16X4] = 0x02, [RTX_8X32] = 0x02, [RTX_32X8] = 0x40, [RTX_16X64] = 0x02, [RTX_64X16] = 0x40, [RTX_4X32] = 0x00, [RTX_32X4] = 0x00, [RTX_8X64] = 0x02, [RTX_64X8] = 0x40, [RTX_4X64] = 0x00, [RTX_64X4] = 0x00, }; const uint8_t /* enum (Rect)TxfmSize */ dav2d_max_txfm_size_for_bs[N_BS_SIZES][4 /* 444, 422, 420, lossless */] = { [BS_256x256] = { TX_64X64, TX_64X64, TX_64X64, TX_32X32 }, [BS_256x128] = { TX_64X64, 
TX_64X64, TX_64X64, TX_32X32 }, [BS_128x256] = { TX_64X64, TX_64X64, TX_64X64, TX_32X32 }, [BS_128x128] = { TX_64X64, TX_64X64, TX_64X64, TX_32X32 }, [BS_128x64] = { TX_64X64, TX_64X64, RTX_64X32, TX_32X32 }, [BS_64x128] = { TX_64X64, RTX_32X64, RTX_32X64, TX_32X32 }, [BS_64x64] = { TX_64X64, RTX_32X64, TX_32X32, TX_32X32 }, [BS_64x32] = { RTX_64X32, TX_32X32, RTX_32X16, TX_32X32 }, [BS_64x16] = { RTX_64X16, RTX_32X16, RTX_32X8, RTX_32X16 }, [BS_64x8] = { RTX_64X8, RTX_32X8, RTX_32X4, RTX_32X8 }, [BS_64x4] = { RTX_64X4, RTX_32X4, RTX_32X4, RTX_32X4 }, [BS_32x64] = { RTX_32X64, RTX_16X64, RTX_16X32, TX_32X32 }, [BS_32x32] = { TX_32X32, RTX_16X32, TX_16X16, TX_32X32 }, [BS_32x16] = { RTX_32X16, TX_16X16, RTX_16X8, RTX_32X16 }, [BS_32x8] = { RTX_32X8, RTX_16X8, RTX_16X4, RTX_32X8 }, [BS_32x4] = { RTX_32X4, RTX_16X4, RTX_16X4, RTX_32X4 }, [BS_16x64] = { RTX_16X64, RTX_8X64, RTX_8X32, RTX_16X32 }, [BS_16x32] = { RTX_16X32, RTX_8X32, RTX_8X16, RTX_16X32 }, [BS_16x16] = { TX_16X16, RTX_8X16, TX_8X8, TX_16X16 }, [BS_16x8] = { RTX_16X8, TX_8X8, RTX_8X4, RTX_16X8 }, [BS_16x4] = { RTX_16X4, RTX_8X4, RTX_8X4, RTX_16X4 }, [BS_8x64] = { RTX_8X64, RTX_4X64, RTX_4X32, RTX_8X32 }, [BS_8x32] = { RTX_8X32, RTX_4X32, RTX_4X16, RTX_8X32 }, [BS_8x16] = { RTX_8X16, RTX_4X16, RTX_4X8, RTX_8X16 }, [BS_8x8] = { TX_8X8, RTX_4X8, TX_4X4, TX_8X8 }, [BS_8x4] = { RTX_8X4, TX_4X4, TX_4X4, RTX_8X4 }, [BS_4x64] = { RTX_4X64, RTX_4X64, RTX_4X32, RTX_4X32 }, [BS_4x32] = { RTX_4X32, RTX_4X32, RTX_4X16, RTX_4X32 }, [BS_4x16] = { RTX_4X16, RTX_4X16, RTX_4X8, RTX_4X16 }, [BS_4x8] = { RTX_4X8, RTX_4X8, TX_4X4, RTX_4X8 }, [BS_4x4] = { TX_4X4, TX_4X4, TX_4X4, TX_4X4 }, }; const uint8_t /* enum BlockSize */ dav2d_ss_bs[N_BS_SIZES][3 /* 420, 422, 444 */] = { [BS_256x256] = { BS_128x128, BS_128x256, BS_256x256 }, [BS_256x128] = { BS_128x64, BS_128x128, BS_256x128 }, [BS_128x256] = { BS_64x128, BS_INVALID, BS_128x256 }, [BS_128x128] = { BS_64x64, BS_64x128, BS_128x128 }, [BS_128x64] = { BS_64x32, BS_64x64, 
BS_128x64 }, [BS_64x128] = { BS_32x64, BS_INVALID, BS_64x128 }, [BS_64x64] = { BS_32x32, BS_32x64, BS_64x64 }, [BS_64x32] = { BS_32x16, BS_32x32, BS_64x32 }, [BS_64x16] = { BS_32x8, BS_32x16, BS_64x16 }, [BS_64x8] = { BS_32x4, BS_32x8, BS_64x8 }, [BS_64x4] = { BS_INVALID, BS_32x4, BS_64x4 }, [BS_32x64] = { BS_16x32, BS_16x64, BS_32x64 }, [BS_32x32] = { BS_16x16, BS_16x32, BS_32x32 }, [BS_32x16] = { BS_16x8, BS_16x16, BS_32x16 }, [BS_32x8] = { BS_16x4, BS_16x8, BS_32x8 }, [BS_32x4] = { BS_INVALID, BS_16x4, BS_32x4 }, [BS_16x64] = { BS_8x32, BS_8x64, BS_16x64 }, [BS_16x32] = { BS_8x16, BS_8x32, BS_16x32 }, [BS_16x16] = { BS_8x8, BS_8x16, BS_16x16 }, [BS_16x8] = { BS_8x4, BS_8x8, BS_16x8 }, [BS_16x4] = { BS_INVALID, BS_8x4, BS_16x4 }, [BS_8x64] = { BS_4x32, BS_4x64, BS_8x64 }, [BS_8x32] = { BS_4x16, BS_4x32, BS_8x32 }, [BS_8x16] = { BS_4x8, BS_4x16, BS_8x16 }, [BS_8x8] = { BS_4x4, BS_4x8, BS_8x8 }, [BS_8x4] = { BS_INVALID, BS_4x4, BS_8x4 }, [BS_4x64] = { BS_INVALID, BS_INVALID, BS_4x64 }, [BS_4x32] = { BS_INVALID, BS_INVALID, BS_4x32 }, [BS_4x16] = { BS_INVALID, BS_INVALID, BS_4x16 }, [BS_4x8] = { BS_INVALID, BS_INVALID, BS_4x8 }, [BS_4x4] = { BS_INVALID, BS_INVALID, BS_4x4 }, }; #if DEBUG_BLOCK_INFO const char *const dav2d_tx1d_names[N_TX_1D_TYPES] = { [DCT] = "dct", [IDENTITY] = "identity", [ADST] = "adst", [FLIPADST] = "flipadst", [DDT] = "ddt", [FLIPDDT] = "flipddt", [WHT] = "wht", }; #endif const int8_t dav2d_tx_part_tbl[N_BS_SIZES][8] = { [BS_4x4] = { TX_4X4, -1, -1, -1, -1, -1, -1, -1 }, [BS_4x8] = { RTX_4X8, -1, TX_4X4, -1, -1, -1, -1, -1 }, [BS_4x16] = { RTX_4X16, -1, RTX_4X8, -1, TX_4X4, -1, -1, -1 }, [BS_4x32] = { RTX_4X32, -1, RTX_4X16, -1, RTX_4X8, -1, -1, -1 }, [BS_4x64] = { RTX_4X64, -1, RTX_4X32, -1, RTX_4X16, -1, -1, -1 }, [BS_8x4] = { RTX_8X4, -1, -1, TX_4X4, -1, -1, -1, -1 }, [BS_8x8] = { TX_8X8, TX_4X4, RTX_8X4, RTX_4X8, -1, -1, -1, -1 }, [BS_8x16] = { RTX_8X16, RTX_4X8, TX_8X8, RTX_4X16, RTX_8X4, -1, TX_4X4, -1 }, [BS_8x32] = { RTX_8X32, RTX_4X16, 
RTX_8X16, RTX_4X32, TX_8X8, -1, RTX_4X8, -1 }, [BS_8x64] = { RTX_8X64, RTX_4X32, RTX_8X32, RTX_4X64, RTX_8X16, -1, RTX_4X16, -1 }, [BS_16x4] = { RTX_16X4, -1, -1, RTX_8X4, -1, TX_4X4, -1, -1 }, [BS_16x8] = { RTX_16X8, RTX_8X4, RTX_16X4, TX_8X8, -1, RTX_4X8, -1, TX_4X4 }, [BS_16x16] = { TX_16X16, TX_8X8, RTX_16X8, RTX_8X16, RTX_16X4, RTX_4X16, RTX_8X4, RTX_4X8 }, [BS_16x32] = { RTX_16X32, RTX_8X16, TX_16X16, RTX_8X32, RTX_16X8, RTX_4X32, TX_8X8, RTX_4X16 }, [BS_16x64] = { RTX_16X64, RTX_8X32, RTX_16X32, RTX_8X64, TX_16X16, RTX_4X64, RTX_8X16, RTX_4X32 }, [BS_32x4] = { RTX_32X4, -1, -1, RTX_16X4, -1, RTX_8X4, -1, -1 }, [BS_32x8] = { RTX_32X8, RTX_16X4, RTX_32X4, RTX_16X8, -1, TX_8X8, -1, RTX_8X4 }, [BS_32x16] = { RTX_32X16, RTX_16X8, RTX_32X8, TX_16X16, RTX_32X4, RTX_8X16, RTX_16X4, TX_8X8 }, [BS_32x32] = { TX_32X32, TX_16X16, RTX_32X16, RTX_16X32, RTX_32X8, RTX_8X32, RTX_16X8, RTX_8X16 }, [BS_32x64] = { RTX_32X64, RTX_16X32, TX_32X32, RTX_16X64, RTX_32X16, RTX_8X64, TX_16X16, RTX_8X32 }, [BS_64x4] = { RTX_64X4, -1, -1, RTX_32X4, -1, RTX_16X4, -1, -1 }, [BS_64x8] = { RTX_64X8, RTX_32X4, RTX_64X4, RTX_32X8, -1, RTX_16X8, -1, RTX_16X4 }, [BS_64x16] = { RTX_64X16, RTX_32X8, RTX_64X8, RTX_32X16, RTX_64X4, TX_16X16, RTX_32X4, RTX_16X8 }, [BS_64x32] = { RTX_64X32, RTX_32X16, RTX_64X16, TX_32X32, RTX_64X8, RTX_16X32, RTX_32X8, TX_16X16 }, [BS_64x64] = { TX_64X64, TX_32X32, RTX_64X32, RTX_32X64, RTX_64X16, RTX_16X64, RTX_32X16, RTX_16X32 }, [BS_64x128] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, [BS_128x64] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, [BS_128x128] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, [BS_128x256] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, [BS_256x128] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, [BS_256x256] = { TX_64X64, -1, -1, -1, -1, -1, -1, -1 }, }; const uint8_t /* enum TxfmType */ dav2d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES] = { [DC_PRED] = DCT_DCT, [VERT_PRED] = ADST_DCT, [HOR_PRED] = DCT_ADST, [DIAG_DOWN_LEFT_PRED] = DCT_DCT, 
[DIAG_DOWN_RIGHT_PRED] = ADST_ADST, [VERT_RIGHT_PRED] = ADST_DCT, [HOR_DOWN_PRED] = DCT_ADST, [HOR_UP_PRED] = DCT_ADST, [VERT_LEFT_PRED] = ADST_DCT, [SMOOTH_PRED] = ADST_ADST, [SMOOTH_V_PRED] = ADST_DCT, [SMOOTH_H_PRED] = DCT_ADST, [PAETH_PRED] = ADST_ADST, }; /* sin(t), cos(t), -sin(t) */ const int16_t dav2d_cctx_angle[6][3] = { { 181, 181, -181 }, // 45 degrees { 128, 222, -128 }, // 30 degrees { 222, 128, -222 }, // 60 degrees { -181, 181, 181 }, // -45 degrees { -128, 222, 128 }, // -30 degrees { -222, 128, 222 }, // -60 degrees }; const uint8_t dav2d_mode_to_angle_map[8] = { 90, 180, 45, 135, 113, 157, 203, 67 }; const uint8_t /* enum InterPredMode */ dav2d_comp_inter_pred_modes[][2] = { [NEARMV_NEARMV - NEARMV_NEARMV] = { NEARMV, NEARMV }, [NEWMV_NEARMV - NEARMV_NEARMV] = { NEWMV, NEARMV }, [NEARMV_NEWMV - NEARMV_NEARMV] = { NEARMV, NEWMV }, [GLOBALMV_GLOBALMV - NEARMV_NEARMV] = { GLOBALMV, GLOBALMV }, [NEWMV_NEWMV - NEARMV_NEARMV] = { NEWMV, NEWMV }, [JOINT_NEWMV - NEARMV_NEARMV] = { NEWMV, NEWMV }, [OPFL_NEARMV_NEARMV - NEARMV_NEARMV] = { NEARMV, NEARMV }, [OPFL_NEWMV_NEARMV - NEARMV_NEARMV] = { NEWMV, NEARMV }, [OPFL_NEARMV_NEWMV - NEARMV_NEARMV] = { NEARMV, NEWMV }, [OPFL_NEWMV_NEWMV - NEARMV_NEARMV] = { NEWMV, NEWMV }, [OPFL_JOINT_NEWMV - NEARMV_NEARMV] = { NEWMV, NEWMV }, }; const Dav2dWarpedMotionParams dav2d_default_wm_params = { .type = DAV2D_WM_TYPE_IDENTITY, .matrix = { 0, 0, 1 << 16, 0, 0, 1 << 16, }, .u.p.alpha = 0, .u.p.beta = 0, .u.p.gamma = 0, .u.p.delta = 0, }; const int8_t dav2d_tip_wts[] = { 8, 12, 16, 18, 20, 4, 6, -4 }; const int8_t dav2d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = { { 1 * 12 + 0, 2 * 12 + 0 }, // 6 { 1 * 12 + 0, 2 * 12 - 1 }, // 7 { -1 * 12 + 1, -2 * 12 + 2 }, // 0 { 0 * 12 + 1, -1 * 12 + 2 }, // 1 { 0 * 12 + 1, 0 * 12 + 2 }, // 2 { 0 * 12 + 1, 1 * 12 + 2 }, // 3 { 1 * 12 + 1, 2 * 12 + 2 }, // 4 { 1 * 12 + 0, 2 * 12 + 1 }, // 5 { 1 * 12 + 0, 2 * 12 + 0 }, // 6 { 1 * 12 + 0, 2 * 12 - 1 }, // 7 { -1 * 12 + 1, -2 * 
12 + 2 }, // 0 { 0 * 12 + 1, -1 * 12 + 2 }, // 1 }; const uint16_t dav2d_ccso_quant_sz[4 /* scale */][4 /* quant_idx */] = { { 16, 8, 32, 0 }, { 56, 40, 64, 128 }, { 48, 24, 96, 192 }, { 80, 112, 160, 256 } }; const int8_t dav2d_ccso_offset[4][8] = { { 0, 1, -1, 3, -3, 7, -7, -10 }, { 0, 2, -2, 6, -6, 14, -14, -20 }, { 0, 3, -3, 9, -9, 21, -21, -30 }, { 0, 4, -4, 12, -12, 28, -28, -40 }, }; const unsigned dav2d_subset_masks_y[4] = { 0x3f, 0xfc3, 0xfff, 0xffff }; const unsigned dav2d_subset_masks_uv[3] = { 0x3f, 0x3ff, 0x3ffff }; const int8_t dav2d_wiener_ns_filters[64][16] = { { 39, 39, -14, -14, -16, -16, 7, 7, -1, -3, 1, 7 }, { -1, 3, 1, -1, -2, -1, 0, 0, 1, 0, 0, 0 }, { 12, 14, -5, -6, -6, 2, 1, -1, 2, -1, 1, 1 }, { 39, 23, -7, -6, -12, -7, -2, -3, 3, 2, 2, 1 }, { 5, 12, -1, -2, -4, 15, 1, -5, -1, -7, 1, 2 }, { 8, 8, -3, -3, -2, -4, 0, 2, 0, 0, 0, 1 }, { 5, 7, -2, -3, -2, -3, 1, 1, 0, 0, 0, 1 }, { 16, 16, -7, -6, 11, -6, -3, 1, -3, 2, 2, 1 }, { 18, 39, -3, -14, 3, -16, -2, 5, -4, 1, 0, 4 }, { 16, 4, -6, -1, -1, -6, 0, 1, 0, 2, 1, 0 }, { -7, 11, 7, -3, -2, 11, 0, -5, 0, -3, -1, 1 }, { 21, 26, -6, -8, -10, 7, 1, -4, 1, -3, 2, 2 }, { 5, 7, -2, -4, -3, -1, 1, 0, 1, 0, 0, 1 }, { 14, 15, -6, -5, -4, -4, 1, 1, 0, 0, 1, 1 }, { 9, 10, -4, -4, -6, 0, 1, 0, 2, -1, 1, 1 }, { 25, 17, -6, -4, -12, -9, 1, 0, 3, 2, 1, 0 }, { 13, 13, -4, -5, 3, -7, -2, 1, -1, 2, 1, 1 }, { 4, 7, -1, -3, -5, -4, 1, 1, 2, 1, 0, 0 }, { 8, 33, 0, -13, -10, 1, 2, -2, 2, -1, 0, 3 }, { -2, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 11, 11, -5, -5, -4, -5, 1, 1, 1, 1, 1, 1 }, { -2, 17, 4, -3, 7, -6, -3, 2, -3, 0, -1, 1 }, { 30, 15, -7, 0, -5, -13, 0, 1, -1, 2, 1, 0 }, { 33, 26, -8, -4, 7, -12, -4, 0, -5, 0, 2, 2 }, { 8, 10, -4, -5, -4, -4, 1, 1, 1, 1, 1, 1 }, { 23, 24, -8, -8, -1, -9, -1, 1, -1, 1, 2, 2 }, { 18, 39, -5, -10, -11, -9, 5, 3, -2, -1, -1, 3 }, { 7, 15, -3, -6, -5, -5, 2, 2, 0, 0, 0, 2 }, { 12, 9, -5, -3, -8, -5, 2, 1, 2, 1, 1, 0 }, { 11, 8, -4, -1, -5, -6, 1, 1, 0, 1, 1, 0 }, { 3, 3, 0, 1, -2, 
13, 0, -4, -1, -4, 1, 1 }, { 4, 3, 2, 2, -4, 15, 0, -6, 1, -6, 0, 0 }, { 13, 16, -7, -5, 15, -3, -3, 2, -6, 1, 1, 1 }, { 32, 20, 1, 9, 15, 15, -7, -7, -7, -7, -2, -2 }, { 16, 18, -6, -7, -9, -9, 2, 2, 2, 2, 1, 1 }, { 10, 11, -3, -3, -7, 8, 1, -3, 2, -4, 1, 1 }, { 36, -1, -12, 4, 15, 15, -8, -8, -5, -6, 7, 1 }, { 36, 12, -12, -3, -11, 5, 1, -1, 3, -3, 2, 1 }, { 1, 1, 0, -1, -2, -3, 0, 1, 1, 1, 0, 0 }, { 11, 11, -5, -5, -1, -4, 0, 1, 0, 1, 1, 1 }, { 5, 7, -3, -4, -3, -1, 1, 0, 1, 0, 1, 1 }, { 8, 14, -3, -4, -4, -8, 1, 3, 0, 1, 0, 1 }, { 28, 39, -9, -13, -13, -16, 3, 6, 1, 2, 0, 3 }, { 1, 1, 0, -1, -3, -2, 1, 0, 1, 1, 0, 0 }, { 17, 17, -5, -1, 2, 5, -1, -2, -3, -4, 1, 1 }, { 8, 12, -1, -4, 3, -5, -2, -1, -2, 2, 1, 1 }, { 4, 8, -2, -2, -5, -3, 2, 1, 1, 0, 0, 0 }, { -3, 6, 2, -2, 1, -2, 0, 1, -1, 0, -1, 1 }, { 5, 6, -1, -2, -4, 4, 0, -2, 1, -2, 1, 1 }, { 14, 12, -4, -5, -8, -7, 1, 0, 3, 3, 1, 0 }, { 17, 18, -7, -8, -8, -3, 1, 0, 2, 0, 2, 2 }, { 39, 18, -14, -3, -5, -14, 1, 1, -1, 4, 3, 0 }, { 17, 0, -4, 2, 15, -5, -5, 0, -5, 2, 1, 0 }, { 16, 14, -6, -5, -2, -8, 0, 2, -1, 1, 1, 2 }, { 39, 2, -10, 8, 15, 15, -8, -8, -7, -8, 6, 1 }, { 18, 7, -4, 1, -10, 2, 1, -2, 2, -2, 1, 0 }, { 39, 39, -13, -14, -16, -16, 6, 5, 6, 4, 1, 3 }, { 24, 22, -7, -5, 15, -6, -5, -2, -6, 0, 2, 2 }, { 28, 0, -9, 9, -10, 14, 2, -1, -1, -8, 2, 0 }, { 2, 2, -1, -1, -2, -1, 1, 0, 1, 0, 0, 0 }, { -5, 11, 6, -3, 15, -3, -6, 1, -3, 0, -1, 1 }, { 39, 32, -9, 3, -6, 12, -1, -5, -6, -8, 2, 1 }, { 5, 4, -2, -1, -4, -4, 1, 1, 1, 1, 0, 0 }, { 25, 25, -8, -8, -7, -5, 1, 0, 0, 0, 1, 2 }, }; const int16_t dav2d_pc_wiener_filters[4][64][13] = { { { 73, 127, -20, -30, -38, -29, 10, 7, -1, -3, 1, 7, -80 }, { -1, 3, 1, -1, -2, -1, 0, 0, 1, 0, 0, 0, 128 }, { 12, 14, -5, -6, -6, 2, 1, -1, 2, -1, 1, 1, 100 }, { 43, 23, -7, -6, -12, -7, -2, -3, 3, 2, 2, 1, 54 }, { 5, 12, -1, -2, -4, 19, 1, -5, -1, -7, 1, 2, 88 }, { 8, 8, -3, -3, -2, -4, 0, 2, 0, 0, 0, 1, 114 }, { 5, 7, -2, -3, -2, -3, 1, 1, 0, 0, 0, 1, 118 }, { 16, 16, 
-7, -6, 11, -6, -3, 1, -3, 2, 2, 1, 80 }, { 18, 50, -3, -14, 3, -18, -2, 5, -4, 1, 0, 4, 48 }, { 16, 4, -6, -1, -1, -6, 0, 1, 0, 2, 1, 0, 108 }, { -7, 11, 7, -3, -2, 11, 0, -5, 0, -3, -1, 1, 110 }, { 21, 26, -6, -8, -10, 7, 1, -4, 1, -3, 2, 2, 70 }, { 5, 7, -2, -4, -3, -1, 1, 0, 1, 0, 0, 1, 118 }, { 14, 15, -6, -5, -4, -4, 1, 1, 0, 0, 1, 1, 100 }, { 9, 10, -4, -4, -6, 0, 1, 0, 2, -1, 1, 1, 110 }, { 25, 17, -6, -4, -12, -9, 1, 0, 3, 2, 1, 0, 92 }, { 13, 13, -4, -5, 3, -7, -2, 1, -1, 2, 1, 1, 98 }, { 4, 7, -1, -3, -5, -4, 1, 1, 2, 1, 0, 0, 122 }, { 8, 33, 0, -13, -10, 1, 2, -2, 2, -1, 0, 3, 82 }, { -2, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 }, { 11, 11, -5, -5, -4, -5, 1, 1, 1, 1, 1, 1, 110 }, { -2, 17, 4, -3, 7, -6, -3, 2, -3, 0, -1, 1, 102 }, { 30, 15, -7, 0, -5, -13, 0, 1, -1, 2, 1, 0, 82 }, { 33, 26, -8, -4, 7, -12, -4, 0, -5, 0, 2, 2, 54 }, { 8, 10, -4, -5, -4, -4, 1, 1, 1, 1, 1, 1, 114 }, { 23, 24, -8, -8, -1, -9, -1, 1, -1, 1, 2, 2, 78 }, { 18, 39, -5, -10, -11, -9, 5, 3, -2, -1, -1, 3, 70 }, { 7, 15, -3, -6, -5, -5, 2, 2, 0, 0, 0, 2, 110 }, { 12, 9, -5, -3, -8, -5, 2, 1, 2, 1, 1, 0, 114 }, { 11, 8, -4, -1, -5, -6, 1, 1, 0, 1, 1, 0, 114 }, { 3, 3, 0, 1, -2, 13, 0, -4, -1, -4, 1, 1, 106 }, { 4, 3, 2, 2, -4, 26, 0, -6, 1, -6, 0, 0, 84 }, { 13, 16, -7, -5, 15, -3, -3, 2, -6, 1, 1, 1, 78 }, { 32, 20, 1, 9, 16, 15, -7, -7, -7, -7, -2, -2, 6 }, { 16, 18, -6, -7, -9, -9, 2, 2, 2, 2, 1, 1, 102 }, { 10, 11, -3, -3, -7, 8, 1, -3, 2, -4, 1, 1, 100 }, { 36, -1, -12, 4, 17, 19, -10, -11, -5, -6, 8, 1, 48 }, { 36, 12, -12, -3, -11, 5, 1, -1, 3, -3, 2, 1, 68 }, { 1, 1, 0, -1, -2, -3, 0, 1, 1, 1, 0, 0, 130 }, { 11, 11, -5, -5, -1, -4, 0, 1, 0, 1, 1, 1, 106 }, { 5, 7, -3, -4, -3, -1, 1, 0, 1, 0, 1, 1, 118 }, { 8, 14, -3, -4, -4, -8, 1, 3, 0, 1, 0, 1, 110 }, { 28, 42, -9, -13, -13, -19, 3, 6, 1, 2, 0, 3, 66 }, { 1, 1, 0, -1, -3, -2, 1, 0, 1, 1, 0, 0, 130 }, { 17, 17, -5, -1, 2, 5, -1, -2, -3, -4, 1, 1, 74 }, { 8, 12, -1, -4, 3, -5, -2, -1, -2, 2, 1, 1, 104 }, { 4, 8, -2, -2, -5, 
-3, 2, 1, 1, 0, 0, 0, 120 }, { -3, 6, 2, -2, 1, -2, 0, 1, -1, 0, -1, 1, 124 }, { 5, 6, -1, -2, -4, 4, 0, -2, 1, -2, 1, 1, 114 }, { 14, 12, -4, -5, -8, -7, 1, 0, 3, 3, 1, 0, 108 }, { 17, 18, -7, -8, -8, -3, 1, 0, 2, 0, 2, 2, 96 }, { 41, 18, -14, -3, -5, -14, 1, 1, -1, 4, 3, 0, 66 }, { 17, 0, -4, 2, 15, -5, -5, 0, -5, 2, 1, 0, 92 }, { 16, 14, -6, -5, -2, -8, 0, 2, -1, 1, 1, 2, 100 }, { 39, 2, -10, 8, 17, 20, -10, -9, -7, -9, 6, 1, 32 }, { 18, 7, -4, 1, -10, 2, 1, -2, 2, -2, 1, 0, 100 }, { 49, 64, -13, -20, -31, -27, 6, 5, 6, 4, 1, 3, 34 }, { 24, 22, -7, -5, 24, -6, -5, -2, -6, 0, 2, 2, 42 }, { 28, 0, -9, 9, -10, 14, 2, -1, -1, -8, 2, 0, 76 }, { 2, 2, -1, -1, -2, -1, 1, 0, 1, 0, 0, 0, 126 }, { -5, 11, 6, -3, 16, -3, -6, 1, -3, 0, -1, 1, 100 }, { 41, 32, -9, 3, -6, 12, -1, -5, -6, -9, 2, 1, 18 }, { 5, 4, -2, -1, -4, -4, 1, 1, 1, 1, 0, 0, 124 }, { 25, 25, -8, -8, -7, -5, 1, 0, 0, 0, 1, 2, 76 }, }, { { 21, 23, -6, -7, -7, 6, 0, -4, 0, -3, 2, 2, 74 }, { 22, 23, -5, -5, -12, 15, 2, -6, 1, -5, 1, 1, 64 }, { 7, 6, -3, -2, -5, -4, 2, 1, 1, 1, 0, 0, 120 }, { 4, 7, -1, -3, -4, -5, 1, 1, 1, 2, 0, 0, 122 }, { 16, 5, -6, 0, -8, 0, 3, -1, 1, -1, 1, 0, 108 }, { 34, 5, -8, 8, 13, 17, -8, -8, -7, -9, 5, 1, 42 }, { 24, 20, -3, -4, -11, -9, -1, -1, 3, 2, 1, 0, 86 }, { 24, 20, -8, -5, 19, -6, -4, -1, -5, 1, 2, 1, 52 }, { 37, 5, -12, 3, 12, -7, 2, 1, -8, 1, 1, 1, 56 }, { 32, 17, -3, -3, -9, -1, -4, -4, 3, -1, 2, 1, 68 }, { 3, 6, 0, -2, -4, -5, 1, 2, 1, 1, -1, 0, 124 }, { 4, 11, -1, -4, -6, -2, 2, 0, 1, 0, 0, 1, 116 }, { 33, 26, -8, -7, 5, -13, -4, 0, -3, 2, 2, 2, 58 }, { 2, 3, -1, -1, -3, -2, 1, 1, 1, 0, 0, 0, 126 }, { 14, 14, -6, -5, -7, -4, 2, 1, 1, 0, 1, 1, 104 }, { 30, 2, -9, 9, -11, 14, 1, 1, 0, -9, 2, 0, 68 }, { 5, 28, 1, -5, 7, -10, -4, 5, -3, -1, -1, 2, 80 }, { 17, 17, -7, -6, -7, 4, 1, -2, 1, -3, 2, 2, 90 }, { 13, 12, -6, -4, 11, -4, -3, 2, -4, 1, 1, 1, 88 }, { 41, 0, -10, 5, 18, 23, -9, -10, -7, -8, 5, 2, 28 }, { 10, 12, -3, -4, -7, -6, 2, 1, 1, 1, 0, 1, 112 }, { 31, 41, -11, 
-10, -9, -18, 3, 5, -1, 1, 1, 2, 58 }, { 28, 6, -9, 2, -10, 6, 1, -2, 2, -4, 2, 0, 84 }, { 24, 22, -10, -7, 24, -2, -2, -4, -2, -1, 4, 2, 32 }, { 11, 8, -2, -1, -7, 15, 1, -5, 1, -5, 1, 1, 92 }, { -8, 16, 7, -3, -4, 15, 0, -5, 1, -5, -1, 1, 100 }, { 16, 13, -5, -4, -8, -8, 1, 1, 2, 2, 1, 0, 106 }, { 29, 2, -10, 4, 5, -10, -2, 3, -3, 0, 2, 0, 88 }, { 21, 13, -5, 10, 3, 9, -1, -4, -9, -10, 3, 1, 66 }, { 30, 26, -5, -3, 13, -6, -5, -3, -5, -2, 1, 1, 44 }, { 16, 11, -6, -2, -8, -6, 2, 1, 1, 1, 1, 0, 106 }, { 0, 11, 2, -3, 2, -5, -1, 2, -1, 0, -1, 1, 114 }, { 19, 39, -7, -11, -12, -8, 5, 3, -1, -1, 0, 3, 70 }, { 21, 23, -7, -8, -9, -5, 2, 0, 1, 0, 1, 2, 86 }, { 10, 16, -4, -4, -8, -7, 4, 3, 0, 0, -1, 1, 108 }, { 29, 33, -11, -10, -15, -12, 5, 3, 1, 1, 1, 2, 74 }, { 37, 16, -14, -4, -10, 11, 1, -2, 2, -6, 3, 2, 56 }, { 0, 18, 4, -4, 8, -8, -3, 2, -3, 0, -1, 2, 98 }, { 32, 18, 4, 11, 18, 18, -8, -8, -8, -8, -3, -2, 0 }, { 5, 6, -3, -3, -2, -3, 1, 2, 0, 0, 0, 1, 120 }, { 49, 25, -5, -9, -8, -8, -5, -5, 4, 4, 2, 1, 38 }, { 43, 16, -7, 13, 13, 18, -7, -8, -11, -13, 3, 1, 6 }, { -2, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132 }, { -2, 13, 5, -4, 19, -4, -6, 1, -3, 0, -1, 2, 88 }, { 12, 1, -5, 4, -2, 25, 0, -5, 0, -8, 2, 0, 80 }, { 19, 17, -8, -4, 15, -7, -4, 2, -5, 1, 2, 1, 70 }, { 16, 15, -7, -6, 2, -6, -1, 1, -2, 1, 2, 2, 94 }, { 14, 21, -5, -5, -7, -10, 2, 4, 0, 0, 0, 1, 98 }, { 35, 22, -9, -7, -10, -10, -1, 0, 3, 2, 2, 1, 72 }, { 26, 26, -1, -3, -1, 10, -3, -7, -3, -5, 0, 1, 48 }, { 10, 9, -5, -4, -4, -3, 1, 1, 1, 0, 1, 1, 112 }, { 25, 10, -4, 7, 7, 11, -5, -6, -7, -8, 3, 1, 60 }, { -3, 7, 3, -2, -3, 4, 1, -1, 0, -2, -1, 1, 120 }, { 3, 12, 4, -2, -4, 31, 0, -10, 1, -7, 0, 1, 70 }, { 2, 2, 0, -1, -2, -2, 0, 0, 1, 1, 0, 0, 126 }, { 16, 19, -6, -9, -7, -4, 1, 0, 2, 1, 1, 2, 96 }, { 22, 25, -7, -8, -7, -5, 1, 0, 0, 0, 1, 2, 80 }, { 12, 14, -5, -6, -2, -6, 0, 1, 0, 2, 1, 1, 104 }, { 12, 13, -5, -5, -7, -3, 1, 1, 2, 0, 1, 1, 106 }, { 25, 22, -8, -7, 1, -11, -2, 1, -2, 2, 2, 2, 78 }, 
{ 33, 15, -7, -1, -2, -12, -2, -1, -1, 3, 2, 0, 74 }, { 19, 8, -9, -1, -4, -7, 1, 2, 0, 1, 2, 0, 104 }, { -1, -2, 1, 1, -2, -1, 1, 0, 1, 1, -1, -1, 134 }, { 10, 32, 0, -12, -11, 3, 2, -3, 2, -2, 0, 3, 80 }, }, { { 38, 18, -10, -5, -10, 20, -1, 2, 4, -9, 1, 2, 28 }, { 11, 15, -4, -5, -8, -7, 3, 2, 1, 1, 0, 1, 108 }, { 20, 9, -10, -3, 18, -4, -2, 1, -4, 1, 3, 1, 68 }, { 18, 12, -7, -4, -7, -8, 1, 2, 2, 2, 1, 0, 104 }, { 10, 15, -4, -6, -6, -9, 2, 3, 1, 2, 0, 1, 110 }, { 61, 30, -8, -10, 5, -13, -8, -6, 0, 5, 4, 2, 4 }, { 26, 27, -5, -5, 22, -4, -3, -4, -2, -3, 1, 1, 26 }, { 8, 19, -3, -8, -8, -3, 3, 1, 1, 0, 0, 2, 104 }, { 38, 16, 3, -3, -3, 10, -9, -7, 3, -5, 1, 2, 36 }, { 8, 7, -3, -2, -5, -5, 1, 1, 1, 1, 1, 0, 118 }, { 22, 25, -8, -8, -9, -5, 2, 1, 1, -1, 1, 2, 82 }, { 28, 24, -8, -6, 6, -10, -3, 0, -4, 1, 2, 2, 64 }, { 5, 7, 2, -1, 24, -7, -3, 3, 1, -4, 0, 4, 66 }, { 30, 20, -11, -8, -11, -9, 1, 0, 3, 3, 3, 1, 84 }, { 2, 13, 1, -5, 0, -5, -1, 1, 0, 1, 0, 1, 112 }, { 25, 21, -8, -5, -1, -1, -1, -1, -2, -2, 2, 2, 70 }, { 8, 8, -3, -5, -3, -4, 0, 0, 1, 2, 1, 1, 116 }, { 15, 16, -6, -6, -8, -5, 2, 1, 1, 0, 1, 2, 102 }, { 33, 14, -7, 0, -1, -12, -2, 0, -2, 2, 2, 0, 74 }, { -2, 16, 3, -3, -4, 13, 1, -4, -1, -5, 0, 2, 96 }, { 31, 21, -6, -9, -5, -6, -3, -2, 2, 2, 2, 2, 70 }, { 24, 22, 5, 4, 4, 5, -5, -7, -4, -7, -1, 0, 48 }, { 23, 22, 1, 1, -5, 19, -3, -7, -4, -6, 0, 1, 44 }, { 76, 103, -23, -32, -40, -48, 9, 12, 5, 7, 1, 6, -24 }, { 29, 2, -2, 8, 15, 18, -8, -7, -5, -7, 1, -1, 42 }, { 9, 10, -4, -5, -8, -7, 3, 2, 2, 2, 0, 1, 118 }, { 33, 19, -5, -3, -9, -9, -2, -2, 2, 2, 2, 0, 72 }, { 19, 19, -7, -7, -7, -5, 1, 1, 1, 0, 1, 2, 92 }, { 8, 33, 4, -8, -10, 21, 1, -11, 0, -2, 1, 2, 50 }, { 27, 14, 3, 10, 16, 15, -6, -5, -5, -6, -4, -3, 16 }, { 37, 33, 2, 3, -15, -18, -1, 1, -3, 0, 1, 0, 48 }, { 18, 20, -4, -4, -10, 16, 0, -6, 2, -6, 2, 1, 70 }, { 21, -1, -5, 5, -8, 12, 0, -5, 0, -4, 3, 0, 92 }, { -3, 8, 3, -3, -2, 4, 0, -2, 0, -1, 0, 1, 118 }, { 21, 38, -7, -2, -5, -3, 4, 
1, -8, -5, 1, 4, 50 }, { 2, 13, 6, -6, -4, 32, -1, -10, 2, -2, 0, 1, 62 }, { 31, 5, -10, 3, 9, -9, -1, 2, -5, 1, 1, 0, 74 }, { 7, 17, 3, -2, 19, -6, -10, 0, 3, -1, 2, 0, 64 }, { 53, 80, -16, -4, -13, -2, 4, 0, -10, -11, 3, 5, -50 }, { 25, 11, -9, -2, -9, -1, 1, 0, 2, -2, 2, 1, 90 }, { 40, 9, 3, 7, 7, -10, -4, -3, -7, 1, -1, 0, 44 }, { 22, 20, -8, -7, 0, -9, -1, 1, -2, 2, 2, 2, 84 }, { 16, 32, -4, -1, 3, -10, -3, 5, -3, -5, 1, 3, 60 }, { 36, 7, -8, 10, 12, 17, -6, -8, -9, -10, 4, 1, 36 }, { 38, 23, 5, -6, -7, 1, -10, -7, 6, -1, 2, 1, 38 }, { 15, 21, -6, -5, -8, -10, 3, 4, 0, 0, 0, 1, 98 }, { 1, 18, 2, -4, 6, -8, -2, 3, -3, 0, -1, 2, 100 }, { -2, -2, 1, 1, 1, 0, 0, 0, -1, 0, 0, 0, 132 }, { -1, 3, 0, -2, -1, -1, 0, 1, 1, 0, 0, 0, 128 }, { 0, 3, 0, -2, 0, -2, 0, 1, 0, 0, 0, 1, 126 }, { 24, 7, -10, 0, -1, -9, 0, 3, -1, 1, 2, 0, 96 }, { 35, 7, -13, 2, -9, 13, 1, -2, 1, -7, 3, 1, 64 }, { 8, 5, -2, -5, -3, 39, 0, -5, 3, -4, 1, 1, 52 }, { 16, 12, -8, -2, 0, -7, 0, 4, -1, -1, 1, 1, 98 }, { 31, 31, -10, -4, 8, -9, -3, 2, -5, -3, 2, 2, 44 }, { 18, 31, -7, -8, -10, -8, 5, 3, -2, -1, 0, 3, 80 }, { 16, 13, -5, -4, -8, -9, 1, 1, 2, 3, 1, 0, 106 }, { 15, 15, -6, -6, -6, 7, 1, -3, 1, -3, 2, 2, 90 }, { 22, 21, -9, -7, 17, -6, -4, 0, -3, 1, 3, 1, 56 }, { 15, 13, -7, -3, 12, -5, -3, 1, -5, 1, 2, 1, 84 }, { 24, 10, -13, -3, 22, -5, 5, 3, -5, 0, 2, 2, 44 }, { 36, 10, -5, 0, 15, -5, 6, 0, -10, 1, -2, 2, 32 }, { 5, 14, 2, -2, 11, -7, -5, 2, -2, -1, 0, 2, 90 }, { 14, 14, -7, -6, -4, -6, 1, 1, 0, 2, 2, 1, 104 }, }, { { 21, 15, -11, 0, -6, -5, 5, 3, -5, -3, 2, 3, 90 }, { 18, 12, -7, -4, 0, -10, 0, 3, -1, 2, 1, 1, 98 }, { 16, 18, -7, -5, -10, -7, 4, 3, 0, -1, 1, 2, 100 }, { 23, 29, 2, -5, 5, 10, -4, -7, -3, -5, -2, 9, 24 }, { 10, 17, -6, -7, -9, -4, 4, 2, 1, 0, 1, 2, 106 }, { -3, 18, 2, -6, -3, 13, 0, -3, 0, -4, 1, 3, 92 }, { 13, 23, -2, -5, -9, 6, 2, -4, 0, -1, 1, 1, 78 }, { 18, 17, -8, -8, -6, 5, 2, -2, 1, -1, 2, 2, 84 }, { 30, 13, 6, 4, 6, 14, -10, -14, -4, -8, 11, 2, 28 }, { 23, 18, -9, 
-4, -5, 1, 1, 0, 0, -2, 2, 1, 76 }, { 15, 18, -6, -4, -8, -2, 3, 1, -1, -2, 1, 2, 94 }, { 20, 14, -10, -8, 20, -7, 0, 2, 2, 2, 3, 1, 50 }, { 21, 16, -6, -2, 3, -1, -1, 0, 2, -4, 1, 2, 66 }, { 19, 17, -6, -7, -8, -9, 1, 1, 2, 3, 1, 1, 98 }, { 2, 21, 5, -6, -4, 12, 1, -6, -6, 10, 0, 5, 60 }, { 10, 13, -5, -3, -3, -7, 2, 4, -2, -1, 0, 2, 108 }, { 36, 8, -6, 10, 13, 17, -7, -8, -9, -10, 3, 1, 32 }, { 13, 9, -9, -2, -3, 24, 1, -3, 1, -7, 3, 1, 72 }, { 21, 23, -1, -2, 22, -1, -1, -4, 2, -2, -1, -2, 20 }, { 11, 17, -3, -4, 17, -9, -2, 3, 0, -5, 1, 7, 62 }, { 39, 6, 1, 13, 9, -11, 3, -3, -12, -2, 1, 1, 38 }, { 28, 17, -5, -4, -5, -9, -2, -1, 1, 3, 2, 0, 78 }, { 60, 1, -47, 11, -7, 38, 16, 18, -16, -22, 9, 0, 6 }, { 23, 6, -3, -5, 2, 11, -4, 23, 3, -7, -3, 3, 30 }, { 10, 26, 3, -5, 5, 11, -7, -8, -12, 5, 4, 18, 28 }, { 11, 13, -3, -2, 1, -8, -1, 2, -1, 0, 1, 1, 100 }, { 19, 21, 14, 13, 19, 14, -10, -9, -10, -11, -3, -2, 18 }, { 14, 6, -10, -9, -2, 29, 0, 8, 5, 2, 1, 0, 40 }, { 20, 17, 4, -3, -6, 5, -6, -8, 0, 0, 3, 1, 74 }, { 18, 27, -2, -3, 4, -3, -4, -1, -6, -8, 3, 10, 58 }, { 6, 5, 1, -3, 24, -6, -1, 4, 3, -8, 0, 8, 62 }, { 31, 9, -12, 1, -9, 12, 0, -1, 2, -7, 3, 1, 68 }, { 23, 18, -8, -1, 16, -6, -3, 1, -4, -3, 1, 2, 56 }, { -1, 8, -8, -9, -2, 12, 23, -3, -9, 28, -13, 16, 44 }, { 38, 12, -3, 6, 8, -7, -2, -8, -10, 5, 7, -2, 40 }, { 9, 4, 7, 6, 14, 10, 5, 6, 6, 3, -13, -9, 32 }, { 20, 17, -7, -2, 1, -9, -1, 2, -3, 0, 2, 1, 86 }, { 25, 22, -2, -7, -8, -3, -4, -3, 4, 0, 2, 1, 74 }, { 34, 11, 5, 7, 15, 6, -6, -7, -7, -13, 6, 3, 20 }, { 29, 9, -9, -1, 11, -12, -3, 3, -3, 1, 2, 1, 72 }, { 14, 0, 9, 6, 9, 10, 2, 0, 1, -1, -10, -6, 60 }, { -26, 24, -15, 12, -20, 25, -12, 38, 28, 4, 0, -15, 42 }, { 30, 6, -2, 4, 1, 15, -7, 7, -2, -4, 1, 0, 30 }, { 20, 19, 5, 5, 5, 10, -6, -8, -5, -8, 1, 0, 52 }, { 20, 21, -7, -8, -8, -3, 1, 0, 2, -1, 1, 2, 88 }, { 16, 25, 2, -4, 7, 1, -7, -7, 3, -9, 4, 9, 48 }, { 19, 26, -7, -8, -1, 1, -1, -1, -6, -1, 3, 10, 60 }, { 5, 3, -1, -1, -3, -4, 0, 1, 
1, 1, 0, 0, 124 }, { 27, 12, -11, -4, -8, -2, 1, 1, 2, -1, 3, 1, 86 }, { 2, 0, 0, -3, 0, 35, -4, -7, 3, 0, 3, 1, 68 }, { 10, 16, -4, -5, 0, -8, 1, 3, -1, 0, 0, 2, 100 }, { 19, 17, 0, 1, 0, 13, -4, -4, -6, -1, 1, 1, 54 }, { 18, 7, -10, -9, 0, 24, 0, -1, 1, 10, 3, -1, 44 }, { 23, 16, -4, 2, -12, -3, -1, 4, 3, -8, 1, 1, 84 }, { 12, 23, -1, 5, 13, 24, -19, 9, -8, -24, -1, 14, 34 }, { 14, 15, -7, -6, -6, -6, 1, 2, 1, 0, 2, 2, 104 }, { 15, 22, -5, -2, -1, -1, 0, 2, -7, -6, 2, 7, 76 }, { 19, 14, -8, -5, -6, -9, 1, 3, 1, 2, 1, 1, 100 }, { 14, 19, -4, -1, 1, -6, -2, 4, -3, -5, 1, 3, 86 }, { 17, 17, -1, 2, -3, 4, 0, -4, -5, -4, 2, 0, 78 }, { 30, 5, -6, -5, 17, -5, 10, 0, -4, 1, 2, 3, 32 }, { 15, 12, -7, -5, -5, -6, 1, 1, 0, 1, 2, 2, 106 }, { 34, 11, 5, 2, -3, 12, -11, -9, 3, -7, 6, 1, 40 }, { 8, 6, -4, 0, -5, -3, 3, 2, -1, -2, 0, 1, 118 }, } }; const int8_t dav2d_ns_wiener_coef_range_y[16][2] = { { 6, -24 }, { 6, -24 }, { 5, -14 }, { 5, -14 }, { 5, -16 }, { 5, -16 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, }; const int8_t dav2d_ns_wiener_coef_range_uv[18][2] = { { 6, -24 }, { 6, -24 }, { 5, -14 }, { 5, -14 }, { 5, -16 }, { 5, -16 }, { 5, -16 }, { 5, -16 }, { 5, -16 }, { 5, -16 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, { 4, -8 }, }; const uint8_t dav2d_pc_wiener_sub_classify[4][256] = { { 7, 54, 38, 45, 57, 51, 11, 57, 52, 51, 16, 52, 60, 6, 45, 32, 63, 27, 29, 23, 22, 23, 55, 57, 58, 32, 25, 23, 22, 31, 35, 57, 9, 22, 16, 23, 41, 53, 44, 7, 53, 9, 9, 45, 23, 13, 52, 52, 8, 7, 31, 23, 50, 32, 41, 23, 10, 16, 21, 41, 42, 21, 29, 20, 35, 23, 34, 23, 43, 30, 18, 63, 27, 53, 36, 33, 32, 25, 15, 17, 4, 29, 49, 44, 7, 18, 2, 14, 1, 10, 61, 33, 37, 12, 61, 11, 6, 39, 39, 51, 24, 20, 20, 50, 17, 27, 5, 41, 11, 7, 10, 3, 8, 42, 56, 25, 15, 63, 63, 25, 41, 42, 54, 36, 4, 4, 11, 18, 23, 15, 3, 22, 11, 22, 63, 46, 3, 3, 3, 54, 15, 9, 42, 15, 33, 53, 62, 37, 28, 60, 29, 33, 3, 8, 
11, 63, 47, 18, 10, 4, 57, 15, 26, 22, 12, 51, 18, 4, 21, 52, 47, 21, 57, 51, 3, 16, 21, 0, 14, 19, 50, 39, 37, 25, 59, 8, 58, 8, 33, 60, 49, 33, 15, 62, 55, 20, 14, 28, 55, 52, 13, 26, 40, 8, 56, 27, 55, 2, 47, 13, 37, 54, 37, 37, 43, 35, 58, 0, 30, 0, 58, 62, 11, 11, 43, 34, 49, 34, 28, 14, 28, 48, 43, 27, 38, 58, 46, 10, 1, 30, 42, 56, 27, 36, 58, 11, 50, 34, 6, 26, 35, 23, 58, 31, 4, 30 }, { 21, 62, 54, 46, 23, 8, 0, 7, 27, 60, 46, 18, 31, 60, 18, 18, 56, 34, 3, 29, 60, 8, 4, 23, 44, 45, 59, 29, 60, 44, 58, 29, 61, 27, 46, 12, 34, 27, 28, 45, 61, 27, 61, 46, 12, 12, 27, 27, 16, 45, 44, 12, 33, 45, 3, 12, 52, 46, 37, 31, 21, 37, 61, 57, 24, 12, 20, 12, 62, 44, 63, 56, 34, 57, 51, 49, 18, 21, 48, 10, 24, 30, 2, 5, 45, 63, 17, 58, 13, 25, 41, 38, 36, 22, 49, 1, 39, 57, 57, 59, 20, 50, 57, 55, 10, 10, 39, 31, 1, 43, 25, 40, 16, 47, 56, 59, 6, 33, 56, 59, 47, 21, 19, 49, 53, 24, 49, 63, 29, 61, 9, 60, 1, 60, 17, 11, 9, 40, 40, 9, 30, 6, 16, 4, 23, 46, 54, 22, 26, 43, 39, 41, 9, 21, 0, 51, 3, 63, 25, 24, 7, 48, 32, 61, 2, 8, 63, 63, 37, 27, 31, 37, 7, 27, 48, 45, 43, 28, 11, 42, 58, 45, 36, 59, 54, 16, 15, 16, 38, 43, 6, 34, 26, 2, 9, 35, 4, 20, 4, 18, 14, 32, 50, 54, 27, 34, 22, 58, 58, 35, 22, 5, 58, 36, 58, 24, 15, 28, 7, 28, 22, 35, 1, 63, 13, 14, 30, 26, 2, 50, 4, 11, 54, 34, 42, 15, 11, 52, 52, 52, 47, 35, 34, 28, 22, 0, 58, 14, 15, 32, 17, 49, 15, 53, 24, 44 }, { 59, 6, 47, 15, 6, 36, 31, 58, 36, 40, 59, 59, 49, 40, 59, 59, 10, 1, 63, 11, 18, 61, 9, 58, 51, 2, 11, 54, 40, 32, 57, 6, 50, 50, 41, 11, 45, 36, 28, 59, 63, 50, 9, 41, 11, 36, 36, 50, 42, 58, 52, 11, 27, 2, 14, 11, 19, 41, 46, 46, 23, 46, 3, 53, 57, 11, 16, 5, 25, 52, 28, 10, 1, 63, 43, 22, 41, 53, 13, 4, 31, 3, 56, 43, 58, 19, 10, 17, 48, 35, 43, 29, 0, 32, 22, 31, 9, 27, 63, 41, 16, 16, 63, 27, 49, 48, 4, 14, 28, 58, 19, 8, 42, 45, 10, 11, 3, 15, 10, 41, 45, 23, 24, 21, 35, 57, 22, 57, 54, 26, 26, 26, 31, 18, 31, 7, 44, 5, 44, 21, 56, 30, 42, 17, 37, 41, 9, 51, 56, 49, 16, 6, 8, 54, 57, 15, 49, 7, 
33, 31, 11, 20, 55, 50, 9, 60, 28, 31, 62, 36, 49, 46, 58, 60, 20, 2, 12, 23, 7, 47, 17, 60, 51, 41, 48, 62, 0, 37, 29, 14, 56, 24, 17, 9, 32, 10, 17, 17, 17, 59, 27, 55, 16, 34, 24, 55, 39, 57, 57, 15, 39, 43, 39, 51, 57, 57, 51, 34, 5, 38, 51, 54, 31, 31, 48, 17, 56, 13, 9, 17, 9, 33, 48, 4, 47, 52, 48, 33, 33, 33, 63, 10, 1, 24, 32, 57, 57, 17, 51, 34, 57, 21, 51, 33, 31, 32 }, { 33, 38, 54, 36, 18, 39, 6, 59, 39, 34, 25, 50, 50, 20, 1, 39, 44, 15, 1, 58, 39, 60, 22, 32, 17, 11, 59, 32, 21, 49, 7, 32, 1, 1, 33, 59, 15, 39, 24, 59, 1, 1, 1, 50, 59, 39, 0, 1, 45, 12, 27, 59, 44, 39, 50, 59, 5, 33, 25, 25, 36, 25, 1, 25, 7, 59, 61, 38, 63, 49, 14, 44, 2, 33, 16, 40, 33, 33, 48, 15, 17, 57, 22, 16, 25, 6, 44, 41, 55, 5, 16, 16, 42, 53, 51, 7, 47, 13, 41, 59, 22, 41, 1, 44, 55, 55, 47, 50, 51, 11, 5, 8, 29, 25, 0, 59, 57, 37, 44, 33, 22, 33, 40, 26, 52, 7, 51, 7, 59, 48, 28, 57, 28, 21, 9, 4, 62, 8, 62, 46, 22, 37, 58, 48, 35, 33, 47, 31, 61, 50, 47, 18, 62, 12, 7, 59, 47, 41, 5, 17, 59, 21, 10, 1, 22, 60, 14, 6, 15, 39, 50, 50, 32, 32, 48, 19, 30, 33, 10, 54, 44, 11, 7, 33, 47, 25, 23, 19, 35, 33, 57, 51, 57, 22, 48, 10, 22, 61, 22, 50, 44, 10, 41, 35, 12, 2, 48, 10, 53, 9, 48, 16, 48, 31, 31, 7, 31, 46, 18, 3, 31, 3, 7, 5, 55, 41, 22, 13, 22, 41, 22, 22, 54, 55, 54, 27, 55, 55, 54, 55, 33, 57, 55, 16, 48, 28, 7, 41, 31, 56, 7, 43, 48, 54, 17, 7 } }; const uint8_t dav2d_pc_wiener_sub_classify_ns[ 4 ][ 7 ][ 256 ] = { { { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 
1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 }, { 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 1, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 1, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 1, 1, 2, 0, 1, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 0, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 0, 1, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 2, 2, 1, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0 }, { 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 3, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 3, 1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 3, 1, 0, 1, 1, 3, 1, 1, 0, 0, 0, 1, 0, 1, 3, 0, 1, 1, 3, 1, 2, 2, 1, 1, 0, 3, 0, 0, 0, 2, 1, 1, 1, 0, 3, 0, 2, 2, 1, 3, 2, 1, 0, 1, 1, 1, 0, 0, 0, 1, 3, 3, 0, 3, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 3, 1, 2, 2, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 2, 1, 3, 1, 0, 0, 0, 2, 0, 1, 1, 1, 3, 1, 0, 0, 1, 0, 3, 0, 3, 1, 1, 0, 1, 1, 3, 1, 1, 1, 0, 1, 1, 1, 0, 3, 1, 1, 1, 1, 3, 1, 0, 1, 2, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 3, 1, 1, 2, 1, 1, 3, 0, 0, 1, 0, 1, 0, 3, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 3, 0, 1, 1, 3, 2, 0, 1, 1, 0, 0, 3, 0, 1, 0, 0, 0, 0 }, { 4, 2, 3, 1, 4, 4, 1, 4, 1, 4, 1, 1, 5, 5, 1, 1, 1, 3, 5, 4, 5, 4, 0, 4, 0, 1, 4, 4, 5, 0, 0, 4, 5, 5, 1, 4, 3, 1, 2, 4, 1, 5, 5, 1, 4, 1, 1, 1, 4, 4, 0, 4, 1, 1, 3, 4, 0, 1, 1, 3, 4, 1, 5, 5, 0, 4, 5, 4, 3, 0, 1, 1, 3, 1, 2, 2, 1, 4, 0, 
3, 0, 5, 5, 2, 4, 1, 1, 0, 3, 0, 2, 2, 1, 3, 2, 1, 5, 1, 1, 4, 5, 5, 5, 1, 3, 3, 5, 3, 1, 4, 0, 0, 4, 4, 4, 4, 0, 1, 1, 4, 3, 4, 2, 2, 0, 0, 1, 1, 4, 0, 0, 5, 1, 5, 1, 0, 0, 0, 0, 2, 0, 5, 4, 0, 2, 1, 3, 1, 5, 5, 5, 2, 0, 4, 1, 1, 3, 1, 0, 0, 4, 0, 3, 5, 3, 4, 1, 0, 1, 1, 3, 1, 4, 4, 0, 1, 1, 1, 0, 3, 1, 1, 1, 4, 3, 4, 0, 4, 2, 5, 5, 2, 0, 3, 0, 5, 0, 5, 0, 1, 1, 3, 5, 4, 4, 3, 0, 1, 3, 1, 1, 2, 1, 1, 3, 0, 0, 1, 0, 1, 0, 3, 1, 1, 3, 5, 5, 5, 5, 0, 5, 0, 3, 3, 3, 0, 0, 0, 3, 0, 4, 4, 3, 2, 0, 1, 1, 5, 5, 3, 0, 4, 0, 0, 0, 0 }, { 0, 2, 6, 5, 0, 0, 7, 0, 5, 0, 5, 5, 1, 1, 5, 5, 7, 4, 1, 0, 1, 0, 3, 0, 3, 5, 0, 0, 1, 3, 3, 0, 1, 1, 5, 0, 4, 5, 2, 0, 5, 1, 1, 5, 0, 7, 5, 5, 0, 0, 3, 0, 7, 5, 4, 0, 3, 5, 5, 4, 0, 5, 1, 1, 3, 0, 1, 0, 6, 3, 7, 7, 4, 5, 2, 2, 5, 0, 3, 6, 3, 1, 1, 2, 0, 7, 7, 3, 6, 3, 2, 2, 7, 4, 2, 7, 1, 5, 5, 0, 1, 1, 1, 7, 6, 4, 1, 4, 7, 0, 3, 3, 0, 0, 0, 0, 3, 7, 7, 0, 4, 0, 2, 2, 3, 3, 7, 7, 0, 3, 3, 1, 7, 1, 7, 3, 3, 3, 3, 2, 3, 1, 0, 3, 2, 5, 4, 7, 1, 1, 1, 2, 3, 0, 7, 7, 6, 7, 3, 3, 0, 3, 4, 1, 4, 0, 7, 3, 5, 5, 6, 5, 0, 0, 3, 5, 5, 7, 3, 6, 7, 5, 7, 0, 4, 0, 3, 0, 2, 1, 1, 2, 3, 4, 3, 1, 3, 1, 3, 5, 7, 4, 1, 0, 0, 4, 3, 7, 6, 7, 7, 2, 7, 7, 6, 3, 3, 7, 3, 7, 3, 4, 7, 7, 6, 1, 1, 1, 1, 3, 1, 3, 6, 4, 6, 3, 3, 3, 6, 3, 0, 0, 4, 2, 3, 7, 7, 1, 1, 4, 3, 0, 3, 3, 3, 3 }, { 9, 2, 6, 5, 9, 9, 11, 9, 5, 9, 5, 5, 10, 1, 5, 5, 7, 4, 1, 9, 10, 9, 8, 9, 3, 5, 9, 9, 10, 3, 3, 9, 10, 10, 5, 9, 4, 5, 2, 9, 5, 10, 10, 5, 9, 7, 5, 5, 0, 9, 3, 9, 7, 5, 4, 9, 8, 5, 5, 4, 0, 5, 1, 1, 3, 9, 1, 9, 6, 3, 7, 7, 4, 5, 2, 2, 5, 9, 8, 6, 3, 1, 10, 2, 9, 7, 7, 8, 6, 8, 2, 2, 11, 4, 2, 11, 1, 5, 5, 9, 1, 1, 1, 7, 6, 4, 10, 4, 11, 9, 8, 8, 0, 0, 0, 9, 8, 7, 7, 9, 4, 0, 2, 2, 3, 3, 11, 7, 9, 8, 8, 10, 11, 10, 7, 8, 8, 8, 8, 2, 8, 10, 0, 8, 2, 5, 4, 11, 1, 10, 1, 2, 8, 0, 11, 7, 6, 7, 8, 3, 9, 8, 4, 10, 4, 9, 7, 3, 5, 5, 6, 5, 9, 9, 8, 5, 5, 11, 8, 6, 7, 5, 11, 9, 4, 0, 3, 0, 2, 10, 10, 2, 8, 4, 8, 1, 8, 1, 8, 5, 7, 4, 1, 0, 0, 4, 8, 7, 6, 7, 11, 2, 11, 11, 6, 3, 3, 11, 3, 11, 
3, 4, 11, 11, 6, 1, 10, 1, 1, 8, 1, 8, 6, 4, 6, 3, 8, 8, 6, 3, 0, 0, 4, 2, 3, 11, 7, 1, 1, 4, 3, 9, 3, 3, 3, 3 }, { 14, 1, 11, 12, 14, 14, 0, 14, 12, 14, 12, 12, 10, 3, 12, 12, 7, 13, 3, 14, 10, 14, 5, 14, 2, 12, 14, 14, 10, 2, 2, 14, 10, 10, 12, 14, 13, 12, 1, 14, 12, 10, 10, 12, 14, 7, 12, 12, 15, 14, 2, 14, 7, 12, 13, 14, 5, 12, 12, 13, 15, 12, 3, 4, 2, 14, 4, 14, 11, 2, 7, 7, 13, 12, 1, 6, 12, 14, 5, 11, 2, 3, 10, 1, 14, 7, 7, 5, 9, 5, 6, 6, 0, 8, 6, 0, 3, 12, 12, 14, 3, 4, 4, 7, 11, 13, 10, 13, 0, 14, 5, 5, 15, 15, 15, 14, 5, 7, 7, 14, 13, 15, 1, 1, 2, 2, 0, 7, 14, 5, 5, 10, 0, 10, 7, 5, 5, 5, 5, 1, 5, 10, 15, 5, 6, 12, 8, 0, 3, 10, 3, 6, 5, 15, 0, 7, 11, 7, 5, 2, 14, 5, 13, 10, 8, 14, 7, 2, 12, 12, 11, 12, 14, 14, 5, 12, 12, 0, 5, 9, 7, 12, 0, 14, 8, 15, 2, 15, 6, 10, 10, 6, 5, 8, 5, 4, 5, 3, 5, 12, 7, 13, 3, 15, 15, 13, 5, 7, 11, 7, 0, 1, 0, 0, 11, 2, 2, 0, 2, 0, 2, 8, 0, 0, 11, 4, 10, 4, 3, 5, 3, 5, 11, 13, 11, 2, 5, 5, 9, 2, 15, 15, 13, 1, 2, 0, 7, 4, 3, 13, 2, 14, 2, 2, 2, 2 }, }, { { 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0 }, { 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 0, 0, 1, 
0, 1, 1, 1, 1, 1, 0, 2, 0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 1, 0, 1, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0 }, { 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 0, 1, 0, 1, 0, 1, 2, 0, 0, 1, 0, 1, 3, 1, 1, 1, 0, 2, 0, 0, 0, 3, 1, 0, 1, 0, 2, 0, 3, 3, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 3, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 0, 3, 0, 1, 1, 3, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 1, 0, 1, 2, 1, 0, 1, 3, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0 }, { 5, 2, 2, 5, 1, 5, 1, 1, 5, 5, 5, 5, 2, 5, 5, 5, 1, 4, 2, 1, 5, 5, 0, 1, 0, 5, 5, 1, 5, 0, 4, 1, 0, 5, 5, 1, 4, 5, 1, 5, 0, 5, 0, 5, 1, 1, 5, 5, 5, 5, 0, 1, 1, 5, 2, 1, 4, 5, 5, 2, 5, 5, 0, 5, 0, 1, 4, 1, 2, 0, 4, 1, 4, 5, 3, 1, 5, 5, 4, 2, 0, 0, 0, 3, 5, 4, 1, 4, 2, 4, 3, 3, 0, 0, 1, 1, 0, 5, 5, 5, 4, 0, 5, 4, 2, 2, 0, 2, 1, 5, 4, 0, 5, 4, 1, 5, 0, 1, 1, 5, 4, 5, 3, 1, 4, 0, 1, 4, 1, 0, 0, 5, 1, 5, 1, 4, 0, 0, 0, 0, 0, 0, 5, 0, 1, 5, 2, 0, 0, 5, 0, 3, 0, 5, 1, 3, 2, 4, 4, 0, 1, 4, 4, 0, 0, 5, 4, 4, 5, 5, 2, 5, 1, 5, 4, 5, 5, 1, 4, 2, 4, 5, 0, 5, 2, 5, 0, 5, 3, 5, 0, 4, 0, 0, 0, 1, 0, 4, 0, 5, 4, 4, 0, 2, 5, 4, 0, 4, 4, 1, 0, 3, 4, 0, 4, 0, 0, 1, 1, 1, 0, 1, 1, 4, 
2, 4, 0, 0, 0, 0, 0, 4, 2, 4, 2, 0, 4, 4, 4, 4, 4, 1, 4, 1, 0, 1, 4, 4, 0, 4, 1, 1, 0, 4, 0, 0 }, { 0, 2, 2, 0, 1, 0, 5, 1, 0, 0, 0, 0, 2, 0, 0, 0, 5, 3, 2, 1, 0, 0, 6, 1, 7, 0, 0, 1, 0, 7, 3, 1, 6, 0, 0, 1, 3, 0, 5, 0, 6, 0, 6, 0, 1, 1, 0, 0, 0, 0, 7, 1, 5, 0, 2, 1, 3, 0, 0, 2, 0, 0, 6, 0, 7, 1, 3, 1, 2, 7, 3, 5, 3, 0, 4, 5, 0, 0, 3, 2, 7, 6, 6, 4, 0, 3, 5, 3, 2, 3, 4, 4, 7, 7, 5, 5, 6, 0, 0, 0, 3, 6, 0, 3, 2, 2, 6, 2, 5, 0, 3, 7, 0, 3, 5, 0, 6, 5, 5, 0, 3, 0, 4, 5, 3, 7, 5, 3, 1, 6, 7, 0, 5, 0, 5, 3, 7, 7, 7, 7, 6, 6, 0, 6, 1, 0, 2, 7, 6, 0, 6, 4, 7, 0, 5, 4, 2, 3, 3, 7, 1, 3, 3, 6, 6, 0, 3, 3, 0, 0, 2, 0, 1, 0, 3, 0, 0, 5, 3, 2, 3, 0, 7, 0, 2, 0, 7, 0, 4, 0, 6, 3, 6, 6, 7, 5, 6, 3, 6, 0, 3, 3, 6, 2, 0, 3, 7, 3, 3, 5, 7, 4, 3, 7, 3, 7, 7, 5, 1, 5, 7, 5, 5, 3, 2, 3, 6, 6, 6, 6, 6, 3, 2, 3, 2, 7, 3, 3, 3, 3, 3, 5, 3, 5, 7, 5, 3, 3, 7, 3, 5, 5, 7, 3, 7, 7 }, { 8, 2, 2, 9, 1, 11, 5, 1, 11, 11, 9, 9, 2, 11, 9, 9, 5, 10, 2, 1, 11, 11, 6, 1, 7, 11, 8, 1, 11, 7, 3, 1, 6, 11, 9, 1, 10, 11, 5, 11, 6, 11, 6, 9, 1, 1, 11, 11, 0, 11, 7, 1, 5, 11, 2, 1, 10, 9, 0, 2, 8, 0, 6, 9, 7, 1, 10, 1, 2, 7, 3, 5, 10, 9, 4, 5, 9, 8, 3, 2, 7, 6, 6, 4, 11, 3, 5, 3, 2, 3, 4, 4, 7, 7, 5, 5, 6, 9, 9, 8, 10, 6, 9, 3, 2, 2, 6, 2, 5, 0, 3, 7, 0, 3, 5, 8, 6, 5, 5, 8, 3, 8, 4, 5, 3, 7, 5, 3, 1, 6, 7, 11, 5, 11, 5, 10, 7, 7, 7, 7, 6, 6, 0, 6, 1, 9, 2, 7, 6, 0, 6, 4, 7, 8, 5, 4, 2, 3, 3, 7, 1, 3, 3, 6, 6, 11, 3, 3, 0, 11, 2, 0, 1, 11, 3, 11, 0, 5, 10, 2, 3, 11, 7, 8, 2, 0, 7, 0, 4, 0, 6, 10, 6, 6, 7, 5, 6, 10, 6, 9, 3, 3, 6, 2, 11, 10, 7, 3, 3, 5, 7, 4, 3, 7, 3, 7, 7, 5, 1, 5, 7, 5, 5, 3, 2, 3, 6, 6, 6, 6, 6, 10, 2, 10, 2, 7, 10, 10, 10, 10, 3, 5, 10, 5, 7, 5, 3, 3, 7, 3, 5, 5, 7, 3, 7, 7 }, { 7, 4, 4, 13, 15, 2, 3, 15, 2, 2, 13, 13, 12, 2, 13, 13, 5, 14, 12, 15, 2, 2, 10, 15, 0, 2, 7, 15, 2, 0, 9, 15, 10, 2, 13, 15, 14, 2, 5, 2, 10, 2, 10, 13, 15, 15, 2, 2, 6, 2, 0, 15, 5, 2, 12, 15, 14, 13, 6, 12, 7, 6, 10, 13, 0, 15, 14, 15, 4, 0, 1, 5, 14, 13, 8, 3, 13, 7, 9, 12, 0, 10, 10, 8, 2, 1, 5, 9, 4, 1, 
8, 8, 0, 0, 3, 3, 10, 13, 13, 7, 14, 10, 13, 9, 12, 12, 10, 12, 3, 6, 1, 0, 6, 11, 5, 7, 10, 5, 5, 7, 11, 7, 8, 3, 1, 0, 3, 1, 15, 10, 0, 2, 3, 2, 5, 14, 0, 0, 0, 0, 10, 10, 6, 10, 15, 13, 4, 0, 10, 6, 10, 8, 0, 7, 3, 8, 12, 1, 1, 0, 15, 9, 11, 10, 10, 2, 1, 1, 6, 2, 12, 6, 15, 2, 9, 2, 6, 5, 14, 4, 9, 2, 0, 7, 4, 6, 0, 6, 8, 6, 10, 14, 10, 10, 0, 5, 10, 14, 10, 13, 9, 11, 10, 4, 2, 14, 0, 9, 9, 5, 0, 8, 9, 0, 9, 0, 0, 5, 15, 5, 0, 5, 3, 1, 4, 9, 10, 10, 10, 10, 10, 14, 4, 14, 4, 0, 14, 14, 14, 14, 11, 5, 14, 5, 0, 3, 9, 9, 0, 11, 5, 3, 0, 1, 0, 0 }, }, { { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, { 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, { 2, 2, 3, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 3, 3, 1, 2, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 2, 3, 3, 1, 1, 3, 2, 1, 2, 3, 3, 3, 1, 1, 2, 2, 3, 1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 2, 2, 1, 2, 3, 1, 1, 1, 3, 2, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 3, 1, 3, 3, 1, 2, 1, 1, 3, 3, 1, 1, 0, 3, 3, 1, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 1, 2, 1, 3, 1, 3, 1, 1, 3, 1, 1, 1, 3, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 1, 3, 3, 3, 3, 3, 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 3, 2, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2, 2, 1, 1, 3, 3, 2, 3, 1, 3, 2, 3, 2, 0, 3, 3, 0, 3, 3, 3, 1, 3, 3, 3, 2, 1, 1, 3, 1, 0, 1, 3, 1, 1, 1, 3, 1, 3, 3, 1, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 0, 3, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 3 }, { 2, 2, 4, 5, 2, 2, 1, 2, 2, 2, 2, 2, 4, 2, 2, 2, 1, 4, 4, 5, 2, 2, 4, 2, 3, 2, 5, 5, 2, 4, 1, 2, 4, 4, 1, 5, 4, 2, 1, 2, 4, 4, 4, 1, 5, 2, 2, 4, 1, 2, 3, 5, 1, 2, 4, 5, 1, 1, 2, 2, 5, 2, 4, 5, 1, 5, 4, 2, 4, 3, 1, 1, 4, 4, 5, 5, 1, 5, 1, 4, 1, 4, 4, 5, 2, 1, 1, 4, 4, 1, 5, 0, 3, 4, 5, 1, 4, 1, 4, 1, 4, 4, 4, 1, 4, 4, 4, 4, 1, 2, 1, 3, 1, 4, 1, 5, 4, 5, 1, 1, 4, 5, 0, 5, 1, 1, 5, 1, 5, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 5, 4, 2, 1, 4, 2, 1, 4, 3, 4, 4, 4, 2, 3, 5, 1, 5, 4, 1, 4, 1, 5, 1, 1, 4, 4, 2, 1, 1, 2, 2, 4, 2, 2, 2, 1, 2, 2, 5, 1, 4, 4, 2, 3, 1, 4, 2, 3, 2, 0, 4, 4, 0, 4, 4, 4, 1, 4, 4, 4, 2, 1, 1, 4, 1, 0, 1, 4, 1, 1, 5, 4, 5, 4, 3, 1, 1, 3, 1, 2, 1, 3, 5, 1, 1, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 1, 4, 0, 4, 1, 1, 4, 3, 1, 1, 5, 3, 4, 1, 4 }, { 3, 3, 6, 7, 3, 3, 2, 3, 3, 3, 3, 3, 6, 3, 3, 3, 1, 6, 6, 7, 3, 3, 6, 3, 4, 3, 7, 7, 3, 6, 2, 3, 6, 6, 1, 7, 6, 3, 2, 3, 6, 6, 6, 1, 7, 3, 3, 6, 0, 3, 4, 7, 1, 3, 6, 7, 2, 1, 3, 3, 7, 3, 6, 7, 2, 7, 6, 3, 6, 4, 2, 1, 6, 6, 7, 7, 1, 7, 1, 6, 2, 6, 6, 7, 3, 2, 1, 6, 6, 2, 7, 5, 4, 6, 7, 2, 6, 1, 6, 1, 6, 6, 6, 1, 6, 6, 6, 6, 2, 3, 2, 4, 0, 6, 1, 
7, 6, 7, 1, 1, 6, 7, 5, 7, 2, 2, 7, 2, 7, 3, 3, 3, 2, 3, 2, 2, 4, 3, 4, 7, 6, 3, 0, 6, 3, 1, 6, 4, 6, 6, 6, 3, 4, 7, 2, 7, 6, 2, 6, 2, 7, 1, 2, 6, 6, 3, 2, 2, 3, 3, 6, 3, 3, 3, 1, 3, 3, 7, 2, 6, 6, 3, 4, 1, 6, 3, 4, 3, 5, 6, 6, 5, 6, 6, 6, 1, 6, 6, 6, 3, 1, 2, 6, 0, 5, 2, 6, 2, 2, 7, 6, 7, 6, 4, 2, 2, 4, 0, 3, 0, 4, 7, 2, 2, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 1, 6, 5, 6, 2, 2, 6, 4, 0, 2, 7, 4, 6, 2, 6 }, { 10, 3, 8, 7, 3, 3, 11, 3, 3, 3, 10, 10, 8, 3, 10, 10, 1, 9, 9, 7, 3, 3, 8, 3, 4, 3, 7, 7, 3, 6, 11, 3, 6, 6, 1, 7, 9, 3, 2, 10, 9, 6, 8, 1, 7, 3, 3, 6, 0, 3, 4, 7, 1, 3, 9, 7, 11, 1, 10, 10, 7, 10, 6, 7, 11, 7, 6, 3, 8, 4, 2, 1, 9, 9, 7, 7, 1, 7, 1, 9, 11, 6, 6, 7, 3, 11, 1, 6, 8, 2, 7, 5, 4, 6, 7, 11, 8, 1, 9, 1, 6, 6, 9, 1, 8, 8, 9, 9, 2, 3, 11, 4, 0, 9, 1, 7, 6, 7, 1, 1, 9, 7, 5, 7, 2, 11, 7, 11, 7, 10, 10, 10, 11, 3, 11, 11, 4, 3, 4, 7, 6, 10, 0, 6, 10, 1, 8, 4, 6, 8, 6, 3, 4, 7, 11, 7, 8, 11, 8, 11, 7, 1, 11, 6, 8, 3, 2, 11, 10, 3, 8, 10, 3, 3, 1, 3, 10, 7, 11, 8, 6, 3, 4, 1, 8, 10, 4, 10, 5, 9, 6, 5, 6, 8, 6, 1, 6, 6, 6, 10, 1, 11, 6, 0, 5, 11, 6, 11, 11, 7, 6, 7, 6, 4, 11, 11, 4, 0, 3, 0, 4, 7, 11, 11, 8, 6, 6, 1, 8, 6, 8, 8, 8, 9, 8, 4, 8, 8, 8, 8, 9, 1, 9, 5, 6, 11, 11, 6, 4, 0, 11, 7, 4, 8, 11, 6 }, { 6, 0, 9, 1, 0, 7, 10, 0, 7, 7, 6, 6, 9, 7, 6, 6, 12, 14, 14, 1, 7, 7, 9, 0, 15, 0, 1, 1, 7, 13, 10, 0, 13, 13, 12, 1, 14, 7, 8, 6, 14, 13, 9, 12, 1, 7, 7, 13, 5, 0, 15, 1, 12, 0, 14, 1, 10, 12, 6, 6, 1, 6, 13, 1, 10, 1, 13, 7, 9, 15, 8, 12, 14, 14, 11, 11, 12, 1, 12, 14, 10, 13, 13, 11, 0, 10, 12, 13, 9, 8, 11, 4, 15, 13, 11, 10, 9, 12, 14, 12, 13, 13, 14, 12, 9, 9, 14, 14, 8, 0, 10, 2, 5, 14, 12, 1, 13, 1, 12, 12, 14, 1, 3, 11, 8, 10, 11, 10, 1, 6, 6, 6, 10, 7, 10, 10, 2, 7, 2, 11, 13, 6, 5, 13, 6, 12, 9, 15, 13, 9, 13, 0, 2, 1, 10, 1, 9, 10, 9, 10, 1, 12, 10, 13, 9, 0, 8, 10, 6, 7, 9, 6, 0, 0, 12, 0, 6, 1, 10, 9, 13, 0, 15, 12, 9, 6, 15, 6, 4, 14, 13, 3, 13, 9, 13, 12, 13, 13, 13, 6, 12, 10, 13, 5, 3, 10, 13, 10, 10, 1, 13, 11, 13, 15, 
10, 10, 15, 5, 7, 5, 15, 1, 10, 10, 9, 13, 13, 12, 9, 13, 9, 9, 9, 14, 9, 15, 9, 9, 9, 9, 14, 12, 14, 3, 13, 10, 10, 13, 15, 5, 10, 11, 15, 9, 10, 13 }, }, { { 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }, { 1, 2, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }, { 1, 2, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3, 1, 0, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 2, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 0, 3, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1 }, { 4, 5, 1, 1, 3, 1, 4, 4, 1, 2, 1, 1, 1, 5, 1, 1, 4, 1, 1, 4, 1, 5, 1, 4, 1, 5, 4, 4, 1, 1, 1, 4, 1, 1, 4, 4, 1, 1, 0, 4, 1, 1, 1, 1, 4, 1, 4, 1, 5, 5, 0, 4, 4, 1, 1, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 5, 1, 1, 0, 4, 1, 4, 3, 3, 4, 4, 1, 1, 1, 1, 1, 3, 1, 4, 4, 1, 1, 4, 3, 3, 3, 1, 0, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 1, 1, 0, 5, 4, 2, 4, 1, 4, 4, 1, 1, 4, 4, 1, 4, 3, 3, 0, 1, 0, 1, 4, 1, 4, 1, 4, 1, 3, 1, 2, 2, 2, 0, 1, 1, 4, 1, 0, 4, 1, 1, 1, 1, 1, 3, 2, 5, 1, 4, 1, 1, 4, 1, 4, 1, 1, 1, 1, 5, 0, 4, 1, 1, 1, 1, 4, 4, 1, 4, 1, 4, 1, 1, 4, 5, 1, 4, 1, 1, 3, 4, 0, 4, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 0, 5, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 0, 3, 0, 1, 0, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 4, 1, 1, 3, 1, 4, 1, 1, 1, 4, 1, 3, 1, 1, 1, 1 }, { 1, 5, 7, 7, 6, 7, 1, 1, 7, 2, 7, 7, 7, 5, 7, 7, 1, 7, 7, 1, 7, 5, 7, 1, 7, 5, 1, 1, 7, 7, 7, 1, 7, 7, 1, 1, 7, 7, 3, 1, 7, 7, 7, 7, 1, 7, 1, 7, 5, 5, 3, 1, 1, 7, 7, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 1, 7, 5, 7, 7, 3, 1, 7, 1, 6, 4, 1, 1, 7, 7, 7, 7, 7, 6, 7, 1, 1, 7, 7, 1, 6, 6, 6, 7, 3, 7, 7, 7, 7, 1, 7, 7, 7, 1, 7, 7, 7, 7, 3, 5, 1, 2, 1, 7, 1, 1, 7, 7, 1, 1, 7, 1, 4, 6, 3, 7, 3, 7, 1, 7, 1, 7, 1, 7, 6, 7, 2, 2, 2, 3, 7, 7, 1, 7, 0, 1, 7, 7, 7, 7, 7, 6, 2, 5, 7, 1, 7, 7, 1, 7, 1, 7, 7, 7, 7, 5, 3, 1, 7, 7, 7, 7, 1, 1, 7, 1, 7, 1, 7, 7, 1, 5, 7, 1, 7, 7, 6, 1, 0, 1, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 1, 7, 7, 0, 5, 7, 7, 7, 7, 6, 7, 6, 7, 7, 7, 7, 7, 3, 6, 0, 7, 0, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 7, 7, 7, 7, 1, 7, 7, 6, 7, 1, 7, 7, 7, 1, 
7, 6, 7, 7, 7, 7 }, { 10, 5, 8, 9, 6, 9, 1, 10, 9, 2, 9, 9, 9, 5, 9, 9, 10, 11, 9, 1, 9, 5, 8, 1, 7, 5, 10, 1, 9, 7, 7, 1, 9, 9, 10, 10, 11, 9, 3, 10, 9, 9, 9, 9, 10, 9, 10, 9, 5, 5, 3, 10, 10, 9, 9, 10, 1, 10, 9, 9, 9, 9, 9, 9, 7, 10, 11, 5, 11, 7, 3, 10, 11, 10, 6, 4, 10, 10, 7, 11, 7, 9, 8, 6, 9, 1, 10, 8, 8, 1, 6, 6, 6, 8, 3, 7, 8, 11, 8, 10, 8, 8, 9, 10, 8, 8, 8, 9, 3, 5, 1, 2, 1, 9, 10, 10, 9, 7, 10, 10, 8, 10, 4, 6, 3, 7, 3, 7, 10, 7, 10, 9, 10, 9, 6, 11, 2, 2, 2, 3, 8, 7, 1, 7, 0, 10, 8, 7, 11, 9, 8, 6, 2, 5, 7, 10, 8, 8, 1, 7, 10, 9, 11, 9, 8, 5, 3, 1, 11, 9, 9, 9, 1, 1, 7, 1, 8, 10, 11, 8, 10, 5, 7, 10, 8, 9, 6, 1, 0, 10, 9, 3, 9, 8, 7, 11, 8, 11, 8, 9, 10, 11, 8, 0, 5, 11, 7, 11, 8, 6, 7, 6, 7, 7, 7, 7, 7, 3, 6, 0, 7, 0, 7, 1, 8, 8, 8, 11, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 10, 9, 8, 6, 7, 10, 7, 8, 7, 1, 7, 6, 7, 8, 7, 7 }, { 9, 2, 13, 8, 1, 8, 4, 9, 8, 6, 8, 8, 8, 2, 8, 8, 9, 14, 8, 10, 8, 2, 13, 10, 7, 12, 9, 10, 8, 7, 7, 10, 8, 8, 9, 9, 14, 8, 11, 9, 8, 8, 8, 8, 9, 8, 9, 8, 12, 12, 5, 9, 9, 8, 8, 9, 4, 9, 8, 8, 8, 8, 8, 8, 7, 9, 14, 2, 14, 7, 5, 9, 14, 9, 1, 3, 9, 9, 7, 14, 7, 8, 13, 1, 8, 4, 9, 13, 13, 4, 1, 1, 0, 13, 11, 7, 13, 14, 13, 9, 13, 13, 8, 9, 13, 13, 13, 8, 11, 12, 4, 6, 10, 8, 9, 9, 8, 7, 9, 9, 13, 9, 3, 1, 5, 7, 11, 7, 9, 7, 9, 8, 9, 8, 1, 14, 6, 6, 6, 11, 13, 7, 10, 7, 15, 9, 13, 7, 14, 8, 13, 1, 6, 12, 7, 9, 13, 13, 4, 7, 9, 8, 14, 8, 13, 2, 5, 4, 14, 8, 8, 8, 10, 10, 7, 10, 13, 9, 14, 13, 9, 12, 7, 9, 13, 8, 0, 10, 15, 9, 8, 11, 8, 13, 7, 14, 13, 14, 13, 8, 9, 14, 13, 15, 12, 14, 7, 14, 13, 1, 7, 1, 7, 7, 7, 7, 7, 11, 1, 15, 7, 15, 7, 4, 13, 13, 13, 14, 13, 13, 13, 13, 13, 13, 13, 5, 13, 13, 13, 13, 9, 8, 13, 1, 7, 9, 7, 13, 7, 4, 7, 1, 7, 13, 7, 7 }, } };

/* Flat lookup table of 4096 unsigned 8-bit entries.
 * NOTE(review): judging only by its name, this maps a LUT index to a class
 * index for the pixel-classified Wiener filter ("weiner" is the spelling used
 * by the identifier and must stay as-is for existing references) -- confirm
 * how the index is formed and the valid class range against the code that
 * reads this table. */
const uint8_t dav2d_pc_weiner_lut_to_class[4096] = { 83, 154, 254, 125, 125, 125, 253, 253, 77, 200, 207, 30, 30, 239, 239, 239, 0, 98, 101, 229, 229, 231, 231, 231, 0, 34, 101, 100, 100, 229, 229, 231, 15, 34, 98, 100, 100, 100, 164, 164, 15, 34, 98, 106, 100, 96, 164, 164, 15,
14, 43, 106, 96, 96, 96, 164, 15, 14, 43, 106, 106, 96, 96, 184, 83, 154, 64, 110, 125, 239, 253, 253, 119, 225, 87, 231, 64, 64, 239, 239, 120, 101, 100, 229, 236, 231, 202, 231, 15, 98, 228, 100, 150, 202, 236, 231, 15, 40, 18, 100, 228, 96, 228, 88, 15, 43, 106, 68, 96, 193, 224, 88, 199, 43, 34, 106, 104, 184, 224, 184, 11, 14, 43, 96, 96, 184, 96, 184, 243, 212, 244, 22, 239, 239, 253, 253, 99, 192, 198, 230, 22, 22, 231, 239, 8, 163, 140, 230, 230, 230, 230, 231, 47, 32, 226, 82, 226, 230, 179, 164, 41, 32, 226, 226, 82, 193, 179, 224, 106, 42, 42, 226, 193, 82, 193, 184, 15, 42, 42, 193, 193, 193, 193, 184, 11, 43, 42, 106, 193, 193, 193, 184, 241, 157, 64, 255, 255, 255, 255, 255, 240, 66, 228, 228, 22, 22, 22, 22, 10, 150, 228, 228, 230, 146, 230, 232, 199, 150, 150, 228, 146, 146, 146, 184, 199, 42, 62, 146, 146, 193, 193, 184, 47, 42, 62, 193, 146, 193, 193, 184, 199, 42, 42, 146, 146, 146, 193, 179, 199, 42, 42, 146, 193, 146, 193, 184, 138, 237, 158, 237, 239, 253, 255, 255, 156, 17, 236, 88, 88, 231, 231, 22, 171, 105, 224, 224, 232, 146, 232, 22, 12, 104, 146, 146, 146, 146, 224, 232, 12, 146, 146, 146, 146, 146, 146, 232, 199, 106, 234, 146, 146, 146, 146, 146, 199, 42, 234, 146, 146, 146, 146, 146, 199, 42, 146, 146, 146, 146, 146, 184, 145, 179, 236, 158, 158, 110, 253, 29, 11, 179, 238, 237, 238, 238, 238, 88, 41, 156, 242, 88, 88, 88, 232, 2, 12, 107, 104, 232, 232, 232, 232, 232, 12, 170, 234, 234, 2, 232, 2, 179, 12, 156, 234, 234, 2, 2, 234, 179, 12, 156, 234, 234, 234, 2, 2, 179, 199, 156, 234, 234, 234, 234, 146, 184, 133, 211, 179, 179, 110, 110, 110, 253, 160, 166, 179, 238, 238, 238, 238, 238, 129, 35, 179, 236, 88, 88, 88, 179, 137, 170, 104, 242, 88, 88, 2, 2, 171, 170, 156, 104, 232, 2, 2, 232, 12, 170, 156, 234, 2, 2, 2, 2, 189, 170, 156, 234, 2, 2, 2, 2, 12, 156, 234, 179, 234, 2, 2, 179, 186, 219, 179, 179, 237, 110, 110, 253, 155, 179, 60, 231, 238, 238, 238, 238, 28, 58, 188, 179, 88, 238, 238, 238, 226, 238, 234, 179, 104, 88, 
2, 88, 14, 179, 170, 242, 104, 232, 2, 2, 39, 2, 170, 156, 232, 232, 232, 2, 179, 170, 170, 234, 232, 232, 2, 2, 170, 170, 156, 234, 234, 234, 2, 179, 90, 132, 95, 254, 159, 159, 159, 253, 44, 71, 154, 222, 254, 254, 125, 159, 55, 183, 52, 86, 250, 250, 254, 125, 39, 26, 145, 103, 180, 246, 250, 215, 39, 26, 183, 73, 103, 180, 180, 30, 39, 39, 183, 145, 73, 247, 247, 87, 76, 39, 26, 145, 73, 102, 247, 87, 25, 76, 76, 57, 145, 73, 102, 202, 83, 132, 254, 125, 159, 159, 254, 159, 44, 209, 154, 64, 125, 125, 125, 159, 26, 121, 241, 86, 64, 207, 254, 125, 84, 0, 103, 103, 207, 246, 215, 215, 39, 0, 57, 227, 52, 52, 196, 30, 39, 15, 57, 145, 52, 102, 207, 87, 76, 53, 10, 77, 103, 97, 197, 87, 25, 15, 15, 0, 34, 73, 18, 100, 74, 182, 252, 252, 215, 125, 71, 69, 99, 78, 206, 244, 244, 244, 125, 159, 8, 133, 116, 198, 198, 244, 244, 215, 46, 33, 163, 190, 116, 198, 198, 215, 47, 33, 163, 116, 190, 140, 190, 230, 47, 46, 33, 163, 190, 190, 190, 230, 156, 47, 33, 33, 163, 190, 190, 82, 25, 40, 41, 41, 163, 62, 226, 193, 177, 127, 64, 255, 255, 255, 255, 255, 121, 225, 87, 196, 22, 22, 22, 22, 10, 102, 148, 230, 198, 198, 22, 22, 199, 40, 62, 226, 230, 198, 198, 22, 47, 40, 62, 62, 226, 226, 198, 230, 47, 41, 32, 62, 226, 226, 140, 230, 47, 47, 41, 62, 62, 226, 226, 230, 47, 47, 41, 42, 32, 32, 226, 193, 49, 125, 158, 158, 110, 29, 29, 255, 58, 162, 178, 231, 231, 22, 22, 22, 58, 120, 66, 228, 228, 196, 196, 22, 171, 18, 150, 228, 228, 228, 230, 22, 149, 18, 150, 150, 228, 228, 62, 230, 11, 42, 150, 62, 228, 146, 226, 230, 11, 42, 42, 62, 62, 193, 193, 230, 169, 47, 42, 42, 42, 193, 62, 193, 85, 238, 179, 158, 158, 110, 253, 29, 179, 179, 158, 158, 237, 237, 238, 22, 226, 59, 17, 236, 236, 164, 164, 22, 189, 59, 105, 224, 224, 228, 228, 230, 189, 107, 104, 146, 146, 224, 146, 228, 189, 107, 150, 146, 146, 146, 146, 228, 149, 43, 146, 146, 146, 146, 146, 228, 11, 199, 42, 146, 96, 146, 146, 193, 62, 92, 88, 214, 238, 89, 89, 253, 42, 228, 110, 158, 158, 158, 158, 239, 85, 137, 
179, 237, 237, 237, 238, 231, 156, 170, 233, 17, 236, 88, 88, 228, 109, 171, 107, 105, 224, 232, 224, 232, 171, 171, 170, 232, 232, 232, 232, 232, 189, 12, 156, 104, 146, 232, 2, 232, 149, 14, 156, 146, 146, 146, 146, 146, 158, 205, 179, 179, 179, 179, 110, 253, 179, 254, 122, 110, 110, 158, 158, 110, 81, 122, 179, 238, 2, 158, 158, 238, 10, 179, 42, 179, 237, 238, 238, 238, 238, 58, 179, 233, 242, 88, 88, 88, 204, 179, 107, 233, 104, 232, 88, 88, 179, 12, 107, 156, 104, 232, 232, 2, 149, 12, 170, 156, 156, 234, 234, 179, 90, 132, 95, 254, 159, 159, 159, 159, 19, 155, 222, 222, 254, 254, 125, 159, 55, 115, 71, 245, 222, 222, 250, 254, 39, 55, 115, 71, 86, 250, 250, 215, 39, 55, 115, 119, 52, 86, 246, 250, 25, 39, 26, 183, 119, 103, 180, 246, 39, 39, 26, 26, 119, 103, 103, 180, 76, 39, 39, 26, 183, 145, 73, 247, 108, 132, 95, 254, 89, 50, 69, 239, 128, 209, 245, 154, 254, 134, 102, 80, 39, 60, 241, 161, 127, 157, 250, 125, 0, 177, 243, 87, 116, 127, 223, 254, 10, 240, 115, 71, 86, 71, 245, 127, 23, 169, 115, 115, 115, 209, 118, 86, 114, 23, 199, 26, 99, 52, 118, 246, 25, 35, 145, 199, 180, 119, 103, 197, 211, 182, 252, 252, 244, 213, 95, 17, 65, 78, 210, 244, 244, 244, 206, 80, 5, 133, 116, 194, 244, 244, 244, 215, 5, 20, 131, 116, 194, 143, 244, 215, 5, 46, 131, 131, 116, 194, 194, 194, 8, 46, 46, 131, 190, 194, 190, 215, 45, 46, 37, 33, 190, 190, 116, 198, 175, 47, 33, 33, 33, 163, 163, 82, 177, 223, 80, 255, 24, 24, 24, 24, 121, 241, 180, 212, 244, 244, 252, 244, 84, 73, 192, 192, 198, 206, 244, 244, 53, 145, 192, 140, 198, 143, 206, 244, 169, 41, 32, 140, 140, 198, 198, 198, 169, 8, 32, 32, 140, 140, 198, 230, 47, 47, 41, 32, 32, 226, 140, 198, 47, 47, 33, 33, 163, 163, 163, 82, 203, 223, 158, 158, 253, 253, 29, 24, 105, 162, 157, 157, 64, 239, 255, 255, 185, 120, 66, 197, 87, 196, 22, 244, 189, 240, 148, 148, 100, 198, 198, 22, 149, 43, 62, 148, 148, 148, 198, 198, 11, 199, 40, 62, 62, 226, 62, 230, 11, 41, 40, 32, 62, 193, 140, 198, 181, 47, 41, 32, 32, 42, 
32, 226, 155, 210, 85, 238, 158, 89, 253, 50, 1, 179, 85, 158, 158, 158, 239, 255, 0, 58, 162, 135, 229, 231, 196, 22, 171, 58, 233, 224, 228, 236, 196, 196, 189, 58, 105, 224, 228, 164, 228, 230, 189, 12, 18, 150, 224, 146, 228, 228, 189, 199, 40, 150, 104, 146, 150, 62, 53, 199, 42, 42, 62, 62, 193, 226, 166, 168, 144, 222, 179, 239, 89, 50, 117, 179, 179, 88, 158, 158, 158, 239, 78, 233, 189, 56, 237, 237, 237, 239, 28, 10, 59, 17, 236, 236, 164, 231, 64, 171, 59, 233, 242, 224, 236, 224, 171, 189, 107, 104, 224, 104, 148, 228, 189, 171, 18, 105, 150, 224, 179, 228, 149, 199, 43, 150, 96, 232, 146, 193, 219, 233, 220, 238, 158, 29, 17, 253, 70, 110, 179, 158, 110, 158, 110, 110, 137, 50, 55, 2, 158, 158, 158, 237, 166, 178, 209, 224, 237, 237, 237, 238, 29, 179, 97, 233, 17, 236, 236, 238, 221, 171, 58, 107, 242, 242, 88, 88, 61, 189, 171, 107, 233, 104, 232, 88, 149, 12, 12, 170, 156, 104, 232, 2, 91, 94, 95, 95, 254, 254, 125, 125, 19, 155, 222, 222, 222, 222, 254, 125, 23, 44, 155, 245, 222, 222, 222, 254, 23, 55, 115, 16, 245, 86, 222, 254, 160, 7, 44, 115, 71, 86, 86, 250, 160, 7, 55, 115, 119, 71, 86, 86, 160, 7, 55, 35, 115, 119, 52, 86, 160, 160, 160, 35, 35, 115, 119, 103, 166, 124, 254, 167, 17, 50, 235, 97, 49, 211, 83, 239, 95, 80, 69, 253, 23, 0, 209, 155, 134, 125, 238, 110, 114, 177, 211, 209, 167, 222, 147, 213, 153, 172, 44, 60, 243, 245, 215, 250, 47, 7, 55, 35, 243, 99, 118, 86, 5, 25, 55, 83, 83, 243, 51, 127, 23, 160, 7, 7, 76, 51, 119, 52, 211, 182, 252, 252, 252, 254, 71, 69, 65, 174, 210, 216, 244, 244, 252, 213, 5, 133, 130, 194, 244, 216, 244, 22, 5, 20, 133, 130, 130, 194, 244, 244, 46, 46, 20, 131, 130, 194, 194, 194, 46, 46, 33, 131, 131, 116, 194, 194, 233, 46, 37, 131, 131, 131, 194, 194, 25, 46, 46, 33, 33, 33, 131, 190, 177, 223, 80, 255, 24, 24, 24, 24, 121, 241, 246, 212, 244, 244, 252, 244, 84, 73, 192, 192, 143, 206, 244, 244, 53, 145, 192, 140, 198, 143, 206, 244, 169, 41, 163, 140, 140, 140, 143, 244, 169, 8, 41, 163, 140, 
140, 140, 143, 169, 47, 8, 32, 140, 140, 140, 143, 181, 47, 8, 33, 33, 163, 163, 140, 2, 205, 158, 158, 253, 29, 29, 24, 185, 162, 157, 127, 64, 22, 255, 255, 185, 120, 225, 197, 87, 212, 22, 244, 189, 240, 66, 148, 148, 198, 212, 143, 149, 10, 102, 148, 148, 192, 192, 143, 11, 10, 40, 62, 148, 226, 226, 198, 11, 41, 40, 40, 81, 226, 230, 198, 181, 47, 41, 41, 32, 32, 32, 226, 232, 209, 241, 89, 89, 89, 253, 29, 116, 9, 178, 158, 158, 64, 64, 255, 149, 58, 162, 178, 197, 231, 196, 22, 171, 58, 120, 66, 197, 197, 196, 196, 189, 58, 18, 66, 228, 228, 100, 230, 189, 10, 18, 150, 150, 193, 228, 230, 189, 10, 18, 18, 148, 146, 228, 230, 11, 199, 42, 40, 62, 62, 62, 226, 28, 177, 230, 22, 110, 56, 89, 253, 43, 179, 108, 238, 158, 158, 110, 239, 181, 89, 146, 56, 237, 237, 237, 239, 230, 170, 59, 17, 17, 197, 231, 196, 73, 58, 59, 233, 242, 236, 164, 236, 58, 58, 107, 105, 66, 224, 224, 228, 189, 171, 107, 18, 150, 224, 193, 228, 149, 199, 43, 18, 96, 146, 193, 193, 145, 88, 142, 238, 238, 179, 253, 253, 194, 87, 241, 110, 158, 158, 238, 110, 147, 90, 179, 237, 238, 158, 158, 237, 50, 29, 162, 238, 237, 237, 237, 237, 45, 91, 96, 233, 17, 242, 236, 238, 126, 40, 58, 107, 233, 242, 242, 236, 149, 171, 58, 107, 233, 242, 224, 88, 149, 12, 12, 107, 156, 104, 104, 232, 91, 94, 95, 95, 159, 254, 254, 69, 128, 74, 132, 132, 132, 95, 95, 159, 172, 19, 74, 6, 132, 132, 95, 95, 172, 128, 51, 155, 6, 132, 132, 95, 23, 55, 44, 51, 155, 245, 6, 222, 39, 172, 55, 44, 16, 16, 245, 222, 160, 172, 55, 44, 51, 16, 71, 245, 160, 7, 55, 55, 44, 115, 115, 52, 108, 75, 245, 69, 162, 244, 69, 125, 31, 152, 219, 167, 108, 57, 235, 92, 128, 209, 177, 6, 95, 82, 143, 159, 23, 172, 51, 132, 245, 129, 223, 254, 23, 232, 128, 83, 210, 132, 86, 147, 169, 4, 133, 245, 51, 132, 28, 246, 19, 149, 31, 60, 245, 251, 60, 6, 4, 53, 51, 53, 7, 7, 16, 52, 90, 213, 186, 216, 244, 126, 186, 69, 65, 174, 152, 216, 210, 147, 147, 235, 5, 20, 130, 130, 194, 136, 216, 92, 5, 13, 131, 130, 194, 194, 148, 208, 5, 46, 
131, 131, 194, 194, 194, 194, 5, 46, 131, 131, 131, 130, 194, 194, 3, 46, 33, 20, 37, 131, 194, 143, 84, 172, 41, 33, 33, 33, 131, 190, 217, 223, 124, 50, 235, 235, 186, 213, 60, 177, 134, 182, 220, 252, 216, 244, 84, 241, 52, 206, 206, 147, 252, 252, 25, 45, 161, 78, 206, 147, 93, 244, 181, 45, 145, 78, 81, 194, 246, 206, 181, 47, 8, 161, 161, 116, 190, 194, 181, 46, 46, 33, 161, 230, 227, 194, 181, 46, 46, 8, 8, 33, 33, 32, 11, 245, 166, 89, 124, 50, 50, 235, 153, 249, 85, 167, 125, 255, 252, 252, 185, 113, 195, 246, 246, 212, 182, 252, 185, 121, 227, 227, 196, 86, 206, 147, 109, 84, 97, 192, 140, 135, 78, 101, 149, 47, 14, 140, 81, 140, 197, 206, 181, 169, 41, 32, 52, 227, 226, 206, 181, 169, 169, 41, 33, 32, 163, 81, 175, 119, 248, 167, 89, 124, 124, 69, 74, 212, 166, 85, 167, 167, 80, 29, 218, 112, 201, 201, 157, 30, 215, 255, 177, 185, 36, 72, 135, 180, 143, 244, 189, 185, 36, 72, 148, 82, 214, 215, 189, 171, 233, 17, 100, 32, 241, 246, 109, 149, 10, 40, 227, 227, 82, 79, 175, 53, 47, 163, 163, 32, 81, 81, 253, 36, 201, 189, 49, 190, 70, 69, 88, 140, 38, 179, 166, 166, 167, 80, 178, 206, 112, 201, 85, 56, 215, 80, 3, 148, 142, 201, 205, 207, 127, 215, 179, 185, 142, 36, 17, 197, 247, 250, 105, 185, 58, 107, 79, 82, 197, 192, 176, 185, 107, 73, 197, 79, 197, 207, 176, 149, 199, 18, 40, 79, 148, 81, 192, 214, 111, 191, 1, 158, 50, 89, 227, 86, 158, 158, 238, 158, 70, 89, 250, 167, 232, 4, 170, 166, 166, 110, 157, 156, 90, 166, 201, 85, 56, 56, 70, 90, 53, 142, 162, 162, 56, 56, 216, 189, 185, 142, 17, 17, 162, 178, 120, 112, 185, 58, 233, 17, 242, 178, 149, 149, 171, 59, 107, 54, 54, 105, 91, 94, 94, 95, 95, 92, 95, 215, 31, 211, 94, 95, 95, 95, 95, 95, 23, 19, 74, 94, 132, 132, 95, 95, 23, 128, 19, 74, 6, 132, 222, 95, 23, 172, 19, 51, 155, 245, 222, 222, 23, 172, 55, 44, 51, 245, 245, 245, 23, 172, 44, 19, 51, 118, 118, 245, 160, 7, 7, 35, 44, 115, 118, 118, 109, 75, 75, 210, 95, 38, 86, 208, 31, 161, 201, 143, 208, 6, 126, 95, 151, 251, 155, 79, 251, 69, 78, 
110, 172, 44, 39, 114, 95, 76, 205, 154, 4, 128, 16, 84, 132, 136, 94, 159, 175, 4, 67, 35, 118, 74, 212, 132, 23, 55, 181, 205, 19, 6, 174, 245, 7, 4, 4, 160, 49, 44, 76, 118, 90, 213, 252, 216, 248, 186, 117, 158, 65, 174, 152, 216, 216, 206, 216, 254, 5, 20, 130, 194, 194, 216, 244, 206, 5, 20, 130, 130, 194, 194, 216, 194, 46, 46, 20, 131, 130, 130, 194, 194, 20, 46, 131, 131, 131, 194, 194, 194, 12, 46, 37, 13, 131, 130, 194, 194, 23, 224, 46, 33, 131, 131, 131, 190, 217, 223, 80, 29, 24, 24, 216, 235, 60, 241, 214, 147, 244, 252, 252, 216, 84, 97, 52, 212, 147, 210, 206, 216, 53, 57, 52, 78, 174, 78, 93, 244, 181, 45, 163, 161, 78, 206, 206, 206, 165, 5, 8, 133, 78, 81, 143, 206, 165, 46, 8, 20, 161, 116, 116, 206, 5, 46, 37, 33, 8, 33, 133, 116, 239, 217, 166, 89, 124, 29, 29, 24, 58, 249, 162, 127, 64, 125, 22, 252, 185, 113, 195, 225, 246, 212, 206, 244, 185, 63, 66, 227, 52, 212, 212, 206, 11, 199, 97, 227, 192, 78, 140, 206, 25, 199, 40, 227, 192, 140, 180, 206, 25, 45, 41, 41, 163, 81, 78, 143, 181, 169, 8, 41, 163, 133, 140, 140, 143, 104, 223, 166, 89, 167, 89, 50, 178, 131, 249, 166, 158, 167, 80, 255, 171, 142, 201, 205, 178, 64, 214, 213, 189, 142, 36, 66, 17, 197, 180, 212, 187, 58, 54, 66, 66, 100, 246, 198, 189, 171, 107, 148, 105, 193, 197, 78, 109, 199, 57, 150, 148, 148, 192, 78, 25, 53, 57, 32, 62, 81, 81, 140, 162, 196, 28, 131, 69, 70, 223, 253, 198, 135, 221, 76, 166, 166, 167, 80, 118, 179, 61, 85, 56, 157, 167, 64, 232, 171, 142, 162, 17, 157, 157, 215, 130, 185, 142, 72, 66, 135, 82, 196, 12, 187, 58, 54, 66, 197, 82, 197, 189, 189, 107, 105, 225, 148, 224, 197, 149, 15, 57, 98, 79, 148, 148, 148, 113, 82, 172, 50, 174, 2, 205, 89, 171, 130, 16, 179, 158, 158, 70, 110, 133, 234, 14, 100, 120, 158, 158, 158, 203, 158, 179, 201, 56, 157, 157, 237, 174, 112, 63, 36, 17, 178, 178, 178, 221, 187, 142, 59, 17, 17, 135, 178, 179, 189, 58, 59, 54, 242, 242, 236, 176, 171, 171, 107, 54, 54, 79, 224, 91, 126, 75, 95, 95, 208, 252, 95, 128, 251, 
94, 126, 95, 254, 159, 254, 23, 19, 155, 155, 132, 95, 95, 95, 23, 128, 19, 155, 245, 52, 125, 95, 23, 128, 19, 51, 51, 245, 16, 132, 23, 7, 128, 19, 51, 51, 76, 222, 23, 7, 128, 19, 44, 51, 114, 64, 160, 7, 7, 7, 35, 44, 44, 16, 219, 218, 235, 126, 247, 28, 108, 179, 138, 245, 173, 99, 220, 176, 204, 239, 203, 144, 111, 208, 188, 5, 152, 167, 144, 188, 38, 244, 195, 52, 171, 29, 151, 6, 100, 152, 131, 44, 141, 157, 186, 203, 90, 144, 20, 144, 235, 238, 124, 13, 126, 29, 175, 251, 86, 14, 48, 31, 169, 179, 46, 126, 55, 223, 74, 182, 93, 216, 65, 137, 111, 37, 51, 174, 78, 93, 143, 65, 235, 50, 45, 161, 129, 174, 93, 224, 104, 75, 23, 99, 129, 78, 78, 82, 155, 87, 181, 5, 20, 131, 136, 148, 206, 116, 165, 21, 225, 65, 81, 130, 226, 208, 21, 21, 119, 9, 137, 136, 141, 136, 31, 21, 9, 9, 9, 163, 78, 116, 249, 223, 127, 80, 235, 24, 235, 248, 241, 52, 118, 250, 246, 220, 135, 186, 7, 145, 192, 202, 117, 174, 66, 162, 160, 57, 78, 192, 81, 179, 206, 227, 160, 8, 73, 103, 78, 140, 130, 198, 181, 73, 66, 247, 140, 225, 230, 132, 165, 99, 79, 202, 32, 161, 20, 250, 21, 21, 20, 20, 115, 131, 99, 78, 171, 115, 167, 223, 69, 69, 69, 235, 90, 201, 205, 85, 118, 114, 208, 24, 185, 121, 225, 103, 87, 202, 246, 252, 187, 240, 192, 87, 118, 202, 135, 87, 61, 84, 79, 66, 192, 135, 86, 135, 160, 99, 73, 192, 150, 190, 236, 147, 25, 65, 163, 102, 42, 148, 62, 205, 181, 46, 240, 41, 115, 81, 226, 116, 178, 148, 10, 190, 89, 208, 69, 208, 175, 64, 83, 85, 223, 85, 127, 29, 232, 60, 201, 162, 114, 157, 208, 135, 176, 61, 59, 54, 157, 205, 88, 206, 185, 58, 107, 105, 102, 140, 247, 232, 187, 42, 240, 120, 150, 135, 104, 224, 176, 0, 73, 236, 79, 200, 117, 163, 39, 57, 65, 17, 16, 225, 193, 62, 195, 50, 190, 232, 202, 141, 166, 69, 249, 110, 167, 176, 166, 205, 167, 80, 222, 179, 117, 156, 63, 224, 167, 157, 92, 179, 142, 59, 54, 17, 116, 135, 118, 189, 58, 59, 225, 202, 97, 208, 185, 189, 58, 54, 236, 2, 157, 208, 187, 171, 105, 10, 79, 183, 59, 105, 160, 0, 200, 240, 225, 150, 119, 102, 
210, 131, 40, 11, 179, 222, 222, 208, 127, 161, 239, 233, 131, 218, 238, 208, 86, 28, 39, 238, 179, 217, 114, 64, 52, 138, 219, 36, 59, 60, 162, 106, 128, 0, 204, 249, 59, 162, 56, 118, 128, 103, 142, 61, 59, 233, 36, 195, 78, 170, 48, 170, 241, 34, 54, 54, 55, 55, 35, 121, 121, 107, 54, 54, 188, 75, 75, 75, 126, 245, 89, 214, 151, 123, 94, 126, 126, 95, 95, 159, 4, 123, 251, 94, 126, 126, 126, 159, 4, 31, 251, 251, 132, 126, 126, 254, 4, 31, 27, 251, 251, 6, 6, 222, 4, 31, 31, 27, 251, 251, 245, 114, 23, 31, 27, 27, 27, 251, 51, 245, 23, 23, 31, 27, 27, 27, 19, 114, 203, 218, 204, 204, 126, 222, 179, 162, 1, 139, 218, 218, 38, 95, 98, 179, 1, 139, 221, 38, 38, 95, 69, 158, 123, 144, 221, 204, 191, 191, 69, 198, 208, 144, 144, 218, 218, 191, 221, 95, 109, 208, 144, 168, 6, 218, 218, 89, 130, 46, 172, 112, 185, 152, 218, 244, 4, 173, 172, 181, 187, 185, 37, 208, 122, 92, 186, 186, 248, 248, 216, 69, 67, 111, 152, 216, 248, 248, 208, 208, 21, 137, 138, 152, 152, 248, 248, 248, 21, 9, 137, 138, 138, 136, 136, 248, 21, 13, 9, 138, 136, 136, 136, 136, 165, 13, 13, 9, 130, 136, 136, 136, 165, 13, 13, 13, 28, 136, 136, 136, 46, 46, 46, 28, 28, 28, 28, 141, 219, 108, 124, 50, 235, 235, 235, 235, 153, 209, 134, 213, 220, 220, 220, 216, 49, 3, 117, 117, 147, 147, 220, 216, 25, 3, 161, 161, 174, 174, 210, 216, 181, 45, 133, 161, 161, 174, 210, 93, 165, 5, 20, 133, 161, 161, 174, 93, 165, 5, 20, 133, 161, 161, 116, 93, 165, 37, 37, 37, 20, 133, 133, 129, 38, 38, 70, 70, 124, 50, 50, 235, 48, 217, 85, 167, 80, 213, 213, 24, 48, 60, 195, 195, 214, 214, 147, 252, 187, 63, 97, 227, 241, 246, 212, 210, 109, 199, 97, 227, 227, 78, 143, 206, 109, 15, 43, 81, 227, 227, 192, 206, 175, 169, 41, 81, 81, 81, 81, 78, 173, 169, 8, 33, 163, 81, 81, 81, 37, 1, 191, 70, 70, 124, 124, 69, 188, 38, 217, 166, 167, 167, 80, 29, 48, 112, 249, 201, 127, 64, 214, 255, 48, 142, 113, 72, 17, 207, 246, 212, 187, 185, 63, 66, 227, 197, 192, 212, 187, 84, 63, 72, 227, 227, 227, 212, 109, 15, 10, 102, 148, 
227, 227, 78, 175, 53, 41, 32, 32, 81, 62, 81, 110, 104, 70, 191, 189, 70, 70, 69, 29, 159, 158, 89, 70, 166, 167, 80, 213, 38, 217, 249, 85, 85, 223, 80, 151, 48, 112, 201, 162, 178, 246, 212, 93, 112, 142, 36, 72, 17, 197, 207, 48, 187, 58, 72, 66, 66, 242, 197, 187, 187, 10, 105, 79, 66, 148, 197, 176, 11, 14, 18, 79, 79, 148, 148, 50, 226, 254, 110, 110, 50, 170, 89, 3, 61, 165, 89, 110, 166, 70, 89, 228, 37, 208, 89, 166, 166, 166, 110, 73, 179, 56, 249, 249, 85, 56, 56, 5, 179, 112, 249, 201, 205, 205, 178, 20, 187, 112, 142, 36, 205, 205, 135, 110, 187, 185, 113, 36, 72, 72, 135, 176, 168, 61, 59, 54, 79, 79, 68 };

/* Sub-pixel interpolation filter coefficients, laid out as
 * [filter set][row][tap]: 6 sets of 15 rows, each row holding 8 signed
 * 8-bit taps. The taps of every row sum to 64.
 *
 * The first three sets are selected directly by the
 * DAV2D_FILTER_8TAP_{REGULAR,SMOOTH,SHARP} constants. The sets at
 * 3 + REGULAR and 3 + SMOOTH are the "width <= 4" variants; in those rows
 * the two outermost taps on each side are zero, leaving four centre taps.
 * Set 5 holds two-tap bilinear weights (64 - 4k, 4k) for k = 1..15.
 *
 * NOTE(review): the 15 rows presumably correspond to the non-zero
 * fractional positions 1/16..15/16 -- confirm against the callers. */
ATTR_MCMODEL_SMALL const int8_t ALIGN(dav2d_mc_subpel_filters[6][15][8], 8) = {
    [DAV2D_FILTER_8TAP_REGULAR] = {
        { 0, 1, -3, 63, 4, -1, 0, 0 },
        { 0, 1, -5, 61, 9, -2, 0, 0 },
        { 0, 1, -6, 58, 14, -4, 1, 0 },
        { 0, 1, -7, 55, 19, -5, 1, 0 },
        { 0, 1, -7, 51, 24, -6, 1, 0 },
        { 0, 1, -8, 47, 29, -6, 1, 0 },
        { 0, 1, -7, 42, 33, -6, 1, 0 },
        { 0, 1, -7, 38, 38, -7, 1, 0 },
        { 0, 1, -6, 33, 42, -7, 1, 0 },
        { 0, 1, -6, 29, 47, -8, 1, 0 },
        { 0, 1, -6, 24, 51, -7, 1, 0 },
        { 0, 1, -5, 19, 55, -7, 1, 0 },
        { 0, 1, -4, 14, 58, -6, 1, 0 },
        { 0, 0, -2, 9, 61, -5, 1, 0 },
        { 0, 0, -1, 4, 63, -3, 1, 0 }
    }, [DAV2D_FILTER_8TAP_SMOOTH] = {
        { 0, 1, 14, 31, 17, 1, 0, 0 },
        { 0, 0, 13, 31, 18, 2, 0, 0 },
        { 0, 0, 11, 31, 20, 2, 0, 0 },
        { 0, 0, 10, 30, 21, 3, 0, 0 },
        { 0, 0, 9, 29, 22, 4, 0, 0 },
        { 0, 0, 8, 28, 23, 5, 0, 0 },
        { 0, -1, 8, 27, 24, 6, 0, 0 },
        { 0, -1, 7, 26, 26, 7, -1, 0 },
        { 0, 0, 6, 24, 27, 8, -1, 0 },
        { 0, 0, 5, 23, 28, 8, 0, 0 },
        { 0, 0, 4, 22, 29, 9, 0, 0 },
        { 0, 0, 3, 21, 30, 10, 0, 0 },
        { 0, 0, 2, 20, 31, 11, 0, 0 },
        { 0, 0, 2, 18, 31, 13, 0, 0 },
        { 0, 0, 1, 17, 31, 14, 1, 0 }
    }, [DAV2D_FILTER_8TAP_SHARP] = {
        { -1, 1, -3, 63, 4, -1, 1, 0 },
        { -1, 3, -6, 62, 8, -3, 2, -1 },
        { -1, 4, -9, 60, 13, -5, 3, -1 },
        { -2, 5, -11, 58, 19, -7, 3, -1 },
        { -2, 5, -11, 54, 24, -9, 4, -1 },
        { -2, 5, -12, 50, 30, -10, 4, -1 },
        { -2, 5, -12, 45, 35, -11, 5, -1 },
        { -2, 6, -12, 40, 40, -12, 6, -2 },
        { -1, 5, -11, 35, 45, -12, 5, -2 },
        { -1, 4, -10, 30, 50, -12, 5, -2 },
        { -1, 4, -9, 24, 54, -11, 5, -2 },
        { -1, 3, -7, 19, 58, -11, 5, -2 },
        { -1, 3, -5, 13, 60, -9, 4, -1 },
        { -1, 2, -3, 8, 62, -6, 3, -1 },
        { 0, 1, -1, 4, 63, -3, 1, -1 }
    /* The next two sets are the variants used when width <= 4. */
    }, [3 + DAV2D_FILTER_8TAP_REGULAR] = {
        { 0, 0, -2, 63, 4, -1, 0, 0 },
        { 0, 0, -4, 61, 9, -2, 0, 0 },
        { 0, 0, -5, 58, 14, -3, 0, 0 },
        { 0, 0, -6, 55, 19, -4, 0, 0 },
        { 0, 0, -6, 51, 24, -5, 0, 0 },
        { 0, 0, -7, 47, 29, -5, 0, 0 },
        { 0, 0, -6, 42, 33, -5, 0, 0 },
        { 0, 0, -6, 38, 38, -6, 0, 0 },
        { 0, 0, -5, 33, 42, -6, 0, 0 },
        { 0, 0, -5, 29, 47, -7, 0, 0 },
        { 0, 0, -5, 24, 51, -6, 0, 0 },
        { 0, 0, -4, 19, 55, -6, 0, 0 },
        { 0, 0, -3, 14, 58, -5, 0, 0 },
        { 0, 0, -2, 9, 61, -4, 0, 0 },
        { 0, 0, -1, 4, 63, -2, 0, 0 }
    }, [3 + DAV2D_FILTER_8TAP_SMOOTH] = {
        { 0, 0, 15, 31, 17, 1, 0, 0 },
        { 0, 0, 13, 31, 18, 2, 0, 0 },
        { 0, 0, 11, 31, 20, 2, 0, 0 },
        { 0, 0, 10, 30, 21, 3, 0, 0 },
        { 0, 0, 9, 29, 22, 4, 0, 0 },
        { 0, 0, 8, 28, 23, 5, 0, 0 },
        { 0, 0, 7, 27, 24, 6, 0, 0 },
        { 0, 0, 6, 26, 26, 6, 0, 0 },
        { 0, 0, 6, 24, 27, 7, 0, 0 },
        { 0, 0, 5, 23, 28, 8, 0, 0 },
        { 0, 0, 4, 22, 29, 9, 0, 0 },
        { 0, 0, 3, 21, 30, 10, 0, 0 },
        { 0, 0, 2, 20, 31, 11, 0, 0 },
        { 0, 0, 2, 18, 31, 13, 0, 0 },
        { 0, 0, 1, 17, 31, 15, 0, 0 }
    /* Scaled bilinear prediction is used very rarely, so rather than having
     * dedicated code it gets its own table entry below and goes through the
     * put/prep_8tap_scaled code, which then acts as a scaled bilinear
     * filter. */
    }, [5] = {
        { 0, 0, 0, 60, 4, 0, 0, 0 },
        { 0, 0, 0, 56, 8, 0, 0, 0 },
        { 0, 0, 0, 52, 12, 0, 0, 0 },
        { 0, 0, 0, 48, 16, 0, 0, 0 },
        { 0, 0, 0, 44, 20, 0, 0, 0 },
        { 0, 0, 0, 40, 24, 0, 0, 0 },
        { 0, 0, 0, 36, 28, 0, 0, 0 },
        { 0, 0, 0, 32, 32, 0, 0, 0 },
        { 0, 0, 0, 28, 36, 0, 0, 0 },
        { 0, 0, 0, 24, 40, 0, 0, 0 },
        { 0, 0, 0, 20, 44, 0, 0, 0 },
        { 0, 0, 0, 16, 48, 0, 0, 0 },
        { 0, 0, 0, 12, 52, 0, 0, 0 },
        { 0, 0, 0, 8, 56, 0, 0, 0 },
        { 0, 0, 0, 4, 60, 0, 0, 0 }
    }
};

/* Warp filter table with 63 rows of 8 signed 8-bit taps. The taps of every
 * row sum to 128, and the first and last tap of every row are zero.
 * NOTE(review): the row count suggests fractional positions 1/64..63/64 --
 * confirm against the code that indexes this table. */
ATTR_MCMODEL_SMALL const int8_t ALIGN(dav2d_ext_warp_filter[63][8], 8) = {
    { 0, 0, -1, 127, 2, 0, 0, 0 },
    { 0, 0, -2, 127, 4, -1, 0, 0 },
    { 0, 0, -3, 126, 6, -1, 0, 0 },
    { 0, 1, -4, 125, 8, -2, 0, 0 },
    { 0, 1, -5, 124, 11, -3, 0, 0 },
    { 0, 1, -6, 123, 13, -3, 0, 0 },
    { 0, 1, -6, 122, 15, -4, 0, 0 },
    { 0, 1, -7, 120, 17, -4, 1, 0 },
    { 0, 1, -8, 119, 20, -5, 1, 0 },
    { 0, 1, -9, 118, 22, -5, 1, 0 },
    { 0, 1, -9, 117, 24, -6, 1, 0 },
    { 0, 1, -10, 115, 27, -6, 1, 0 },
    { 0, 1, -10, 114, 29, -7, 1, 0 },
    { 0, 1, -11, 112, 32, -7, 1, 0 },
    { 0, 1, -11, 111, 34, -8, 1, 0 },
    { 0, 1, -11, 109, 36, -8, 1, 0 },
    { 0, 2, -12, 107, 39, -9, 1, 0 },
    { 0, 2, -12, 105, 41, -9, 1, 0 },
    { 0, 2, -12, 103, 44, -10, 1, 0 },
    { 0, 2, -13, 102, 46, -10, 1, 0 },
    { 0, 2, -13, 99, 49, -10, 1, 0 },
    { 0, 2, -13, 98, 51, -11, 1, 0 },
    { 0, 2, -13, 95, 54, -11, 1, 0 },
    { 0, 2, -14, 93, 56, -11, 2, 0 },
    { 0, 2, -14, 91, 59, -12, 2, 0 },
    { 0, 2, -14, 89, 61, -12, 2, 0 },
    { 0, 2, -14, 87, 63, -12, 2, 0 },
    { 0, 2, -14, 85, 66, -13, 2, 0 },
    { 0, 2, -14, 83, 68, -13, 2, 0 },
    { 0, 2, -14, 80, 71, -13, 2, 0 },
    { 0, 2, -14, 78, 73, -13, 2, 0 },
    { 0, 2, -13, 75, 75, -13, 2, 0 },
    { 0, 2, -13, 73, 78, -14, 2, 0 },
    { 0, 2, -13, 71, 80, -14, 2, 0 },
    { 0, 2, -13, 68, 83, -14, 2, 0 },
    { 0, 2, -13, 66, 85, -14, 2, 0 },
    { 0, 2, -12, 63, 87, -14, 2, 0 },
    { 0, 2, -12, 61, 89, -14, 2, 0 },
    { 0, 2, -12, 59, 91, -14, 2, 0 },
    { 0, 2, -11, 56, 93, -14, 2, 0 },
    { 0, 1, -11, 54, 95, -13, 2, 0 },
    { 0, 1, -11, 51, 98, -13, 2, 0 },
    { 0, 1, -10, 49, 99, -13, 2, 0 },
    { 0, 1, -10,
46, 102, -13, 2, 0 }, { 0, 1, -10, 44, 103, -12, 2, 0 }, { 0, 1, -9, 41, 105, -12, 2, 0 }, { 0, 1, -9, 39, 107, -12, 2, 0 }, { 0, 1, -8, 36, 109, -11, 1, 0 }, { 0, 1, -8, 34, 111, -11, 1, 0 }, { 0, 1, -7, 32, 112, -11, 1, 0 }, { 0, 1, -7, 29, 114, -10, 1, 0 }, { 0, 1, -6, 27, 115, -10, 1, 0 }, { 0, 1, -6, 24, 117, -9, 1, 0 }, { 0, 1, -5, 22, 118, -9, 1, 0 }, { 0, 1, -5, 20, 119, -8, 1, 0 }, { 0, 1, -4, 17, 120, -7, 1, 0 }, { 0, 0, -4, 15, 122, -6, 1, 0 }, { 0, 0, -3, 13, 123, -6, 1, 0 }, { 0, 0, -3, 11, 124, -5, 1, 0 }, { 0, 0, -2, 8, 125, -4, 1, 0 }, { 0, 0, -1, 6, 126, -3, 0, 0 }, { 0, 0, -1, 4, 127, -2, 0, 0 }, { 0, 0, 0, 2, 127, -1, 0, 0 }, }; ATTR_MCMODEL_SMALL const int8_t ALIGN(dav2d_mc_warp_filter[7*64+1][8], 8) = { // [-3, -2) { 127, 1, 0, 0, 0, 0, 0, 0 }, { 126, 2, 0, 0, 0, 0, 0, 0 }, { 124, 4, 0, 0, 0, 0, 0, 0 }, { 122, 6, 0, 0, 0, 0, 0, 0 }, { 120, 8, 0, 0, 0, 0, 0, 0 }, { 118, 10, 0, 0, 0, 0, 0, 0 }, { 116, 12, 0, 0, 0, 0, 0, 0 }, { 114, 14, 0, 0, 0, 0, 0, 0 }, { 112, 16, 0, 0, 0, 0, 0, 0 }, { 110, 18, 0, 0, 0, 0, 0, 0 }, { 108, 20, 0, 0, 0, 0, 0, 0 }, { 106, 22, 0, 0, 0, 0, 0, 0 }, { 104, 24, 0, 0, 0, 0, 0, 0 }, { 102, 26, 0, 0, 0, 0, 0, 0 }, { 100, 28, 0, 0, 0, 0, 0, 0 }, { 98, 30, 0, 0, 0, 0, 0, 0 }, { 96, 32, 0, 0, 0, 0, 0, 0 }, { 94, 34, 0, 0, 0, 0, 0, 0 }, { 92, 36, 0, 0, 0, 0, 0, 0 }, { 90, 38, 0, 0, 0, 0, 0, 0 }, { 88, 40, 0, 0, 0, 0, 0, 0 }, { 86, 42, 0, 0, 0, 0, 0, 0 }, { 84, 44, 0, 0, 0, 0, 0, 0 }, { 82, 46, 0, 0, 0, 0, 0, 0 }, { 80, 48, 0, 0, 0, 0, 0, 0 }, { 78, 50, 0, 0, 0, 0, 0, 0 }, { 76, 52, 0, 0, 0, 0, 0, 0 }, { 74, 54, 0, 0, 0, 0, 0, 0 }, { 72, 56, 0, 0, 0, 0, 0, 0 }, { 70, 58, 0, 0, 0, 0, 0, 0 }, { 68, 60, 0, 0, 0, 0, 0, 0 }, { 66, 62, 0, 0, 0, 0, 0, 0 }, { 64, 64, 0, 0, 0, 0, 0, 0 }, { 62, 66, 0, 0, 0, 0, 0, 0 }, { 60, 68, 0, 0, 0, 0, 0, 0 }, { 58, 70, 0, 0, 0, 0, 0, 0 }, { 56, 72, 0, 0, 0, 0, 0, 0 }, { 54, 74, 0, 0, 0, 0, 0, 0 }, { 52, 76, 0, 0, 0, 0, 0, 0 }, { 50, 78, 0, 0, 0, 0, 0, 0 }, { 48, 80, 0, 0, 0, 0, 0, 0 }, { 46, 82, 0, 
0, 0, 0, 0, 0 }, { 44, 84, 0, 0, 0, 0, 0, 0 }, { 42, 86, 0, 0, 0, 0, 0, 0 }, { 40, 88, 0, 0, 0, 0, 0, 0 }, { 38, 90, 0, 0, 0, 0, 0, 0 }, { 36, 92, 0, 0, 0, 0, 0, 0 }, { 34, 94, 0, 0, 0, 0, 0, 0 }, { 32, 96, 0, 0, 0, 0, 0, 0 }, { 30, 98, 0, 0, 0, 0, 0, 0 }, { 28, 100, 0, 0, 0, 0, 0, 0 }, { 26, 102, 0, 0, 0, 0, 0, 0 }, { 24, 104, 0, 0, 0, 0, 0, 0 }, { 22, 106, 0, 0, 0, 0, 0, 0 }, { 20, 108, 0, 0, 0, 0, 0, 0 }, { 18, 110, 0, 0, 0, 0, 0, 0 }, { 16, 112, 0, 0, 0, 0, 0, 0 }, { 14, 114, 0, 0, 0, 0, 0, 0 }, { 12, 116, 0, 0, 0, 0, 0, 0 }, { 10, 118, 0, 0, 0, 0, 0, 0 }, { 8, 120, 0, 0, 0, 0, 0, 0 }, { 6, 122, 0, 0, 0, 0, 0, 0 }, { 4, 124, 0, 0, 0, 0, 0, 0 }, { 2, 126, 0, 0, 0, 0, 0, 0 }, // [-2, -1) { 0, 127, 1, 0, 0, 0, 0, 0 }, { -1, 127, 2, 0, 0, 0, 0, 0 }, { -2, 127, 4, -1, 0, 0, 0, 0 }, { -3, 126, 6, -1, 0, 0, 0, 0 }, { -3, 125, 8, -2, 0, 0, 0, 0 }, { -4, 124, 11, -3, 0, 0, 0, 0 }, { -5, 123, 13, -3, 0, 0, 0, 0 }, { -5, 121, 15, -3, 0, 0, 0, 0 }, { -6, 120, 18, -4, 0, 0, 0, 0 }, { -7, 119, 20, -4, 0, 0, 0, 0 }, { -7, 118, 22, -5, 0, 0, 0, 0 }, { -8, 116, 25, -5, 0, 0, 0, 0 }, { -8, 115, 27, -6, 0, 0, 0, 0 }, { -9, 113, 30, -6, 0, 0, 0, 0 }, { -9, 112, 32, -7, 0, 0, 0, 0 }, { -9, 110, 34, -7, 0, 0, 0, 0 }, { -10, 108, 37, -7, 0, 0, 0, 0 }, { -10, 107, 39, -8, 0, 0, 0, 0 }, { -10, 105, 41, -8, 0, 0, 0, 0 }, { -11, 103, 44, -8, 0, 0, 0, 0 }, { -11, 101, 47, -9, 0, 0, 0, 0 }, { -11, 99, 49, -9, 0, 0, 0, 0 }, { -11, 97, 51, -9, 0, 0, 0, 0 }, { -11, 95, 54, -10, 0, 0, 0, 0 }, { -11, 93, 56, -10, 0, 0, 0, 0 }, { -12, 91, 59, -10, 0, 0, 0, 0 }, { -12, 89, 61, -10, 0, 0, 0, 0 }, { -12, 87, 64, -11, 0, 0, 0, 0 }, { -12, 85, 66, -11, 0, 0, 0, 0 }, { -12, 82, 69, -11, 0, 0, 0, 0 }, { -12, 80, 71, -11, 0, 0, 0, 0 }, { -12, 78, 73, -11, 0, 0, 0, 0 }, { -11, 75, 75, -11, 0, 0, 0, 0 }, { -11, 73, 78, -12, 0, 0, 0, 0 }, { -11, 71, 80, -12, 0, 0, 0, 0 }, { -11, 69, 82, -12, 0, 0, 0, 0 }, { -11, 66, 85, -12, 0, 0, 0, 0 }, { -11, 64, 87, -12, 0, 0, 0, 0 }, { -10, 61, 89, -12, 0, 0, 0, 0 }, 
{ -10, 59, 91, -12, 0, 0, 0, 0 }, { -10, 56, 93, -11, 0, 0, 0, 0 }, { -10, 54, 95, -11, 0, 0, 0, 0 }, { -9, 51, 97, -11, 0, 0, 0, 0 }, { -9, 49, 99, -11, 0, 0, 0, 0 }, { -9, 47, 101, -11, 0, 0, 0, 0 }, { -8, 44, 103, -11, 0, 0, 0, 0 }, { -8, 41, 105, -10, 0, 0, 0, 0 }, { -8, 39, 107, -10, 0, 0, 0, 0 }, { -7, 37, 108, -10, 0, 0, 0, 0 }, { -7, 34, 110, -9, 0, 0, 0, 0 }, { -7, 32, 112, -9, 0, 0, 0, 0 }, { -6, 30, 113, -9, 0, 0, 0, 0 }, { -6, 27, 115, -8, 0, 0, 0, 0 }, { -5, 25, 116, -8, 0, 0, 0, 0 }, { -5, 22, 118, -7, 0, 0, 0, 0 }, { -4, 20, 119, -7, 0, 0, 0, 0 }, { -4, 18, 120, -6, 0, 0, 0, 0 }, { -3, 15, 121, -5, 0, 0, 0, 0 }, { -3, 13, 123, -5, 0, 0, 0, 0 }, { -3, 11, 124, -4, 0, 0, 0, 0 }, { -2, 8, 125, -3, 0, 0, 0, 0 }, { -1, 6, 126, -3, 0, 0, 0, 0 }, { -1, 4, 127, -2, 0, 0, 0, 0 }, { 0, 2, 127, -1, 0, 0, 0, 0 }, // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, -1, 127, 2, 0, 0, 0, 0 }, { 1, -3, 127, 4, -1, 0, 0, 0 }, { 1, -4, 126, 6, -2, 1, 0, 0 }, { 1, -5, 126, 8, -3, 1, 0, 0 }, { 1, -6, 125, 11, -4, 1, 0, 0 }, { 1, -7, 124, 13, -4, 1, 0, 0 }, { 2, -8, 123, 15, -5, 1, 0, 0 }, { 2, -9, 122, 18, -6, 1, 0, 0 }, { 2, -10, 121, 20, -6, 1, 0, 0 }, { 2, -11, 120, 22, -7, 2, 0, 0 }, { 2, -12, 119, 25, -8, 2, 0, 0 }, { 3, -13, 117, 27, -8, 2, 0, 0 }, { 3, -13, 116, 29, -9, 2, 0, 0 }, { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, { 4, -17, 
73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, -9, 29, 116, -13, 3, 0, 0 }, { 2, -8, 27, 117, -13, 3, 0, 0 }, { 2, -8, 25, 119, -12, 2, 0, 0 }, { 2, -7, 22, 120, -11, 2, 0, 0 }, { 1, -6, 20, 121, -10, 2, 0, 0 }, { 1, -6, 18, 122, -9, 2, 0, 0 }, { 1, -5, 15, 123, -8, 2, 0, 0 }, { 1, -4, 13, 124, -7, 1, 0, 0 }, { 1, -4, 11, 125, -6, 1, 0, 0 }, { 1, -3, 8, 126, -5, 1, 0, 0 }, { 1, -2, 6, 126, -4, 1, 0, 0 }, { 0, -1, 4, 127, -3, 1, 0, 0 }, { 0, 0, 2, 127, -1, 0, 0, 0 }, // [0, 1) { 0, 0, 0, 127, 1, 0, 0, 0 }, { 0, 0, -1, 127, 2, 0, 0, 0 }, { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -5, 127, 6, -2, 1, 0 }, { 0, 2, -6, 126, 8, -3, 1, 0 }, { -1, 2, -7, 126, 11, -4, 2, -1 }, { -1, 3, -8, 125, 13, -5, 2, -1 }, { -1, 3, -10, 124, 16, -6, 3, -1 }, { -1, 4, -11, 123, 18, -7, 3, -1 }, { -1, 4, -12, 122, 20, -7, 3, -1 }, { -1, 4, -13, 121, 23, -8, 3, -1 }, { -2, 5, -14, 120, 25, -9, 4, -1 }, { -1, 5, -15, 119, 27, -10, 4, -1 }, { -1, 5, -16, 118, 30, -11, 4, -1 }, { -2, 6, -17, 116, 33, -12, 5, -1 }, { -2, 6, -17, 114, 35, -12, 5, -1 }, { -2, 6, -18, 113, 38, -13, 5, -1 }, { -2, 7, -19, 111, 41, -14, 6, -2 }, { -2, 7, -19, 110, 43, -15, 6, -2 }, { -2, 7, -20, 108, 46, -15, 6, -2 }, { -2, 7, -20, 106, 49, -16, 6, -2 }, { -2, 7, -21, 104, 51, -16, 7, -2 }, { -2, 7, -21, 102, 54, -17, 7, -2 }, { -2, 8, -21, 100, 56, -18, 7, -2 }, { -2, 8, -22, 98, 59, -18, 7, -2 }, { -2, 8, -22, 96, 62, -19, 7, -2 }, { -2, 8, -22, 94, 64, -19, 7, -2 }, { -2, 8, -22, 91, 67, 
-20, 8, -2 }, { -2, 8, -22, 89, 69, -20, 8, -2 }, { -2, 8, -22, 87, 72, -21, 8, -2 }, { -2, 8, -21, 84, 74, -21, 8, -2 }, { -2, 8, -22, 82, 77, -21, 8, -2 }, { -2, 8, -21, 79, 79, -21, 8, -2 }, { -2, 8, -21, 77, 82, -22, 8, -2 }, { -2, 8, -21, 74, 84, -21, 8, -2 }, { -2, 8, -21, 72, 87, -22, 8, -2 }, { -2, 8, -20, 69, 89, -22, 8, -2 }, { -2, 8, -20, 67, 91, -22, 8, -2 }, { -2, 7, -19, 64, 94, -22, 8, -2 }, { -2, 7, -19, 62, 96, -22, 8, -2 }, { -2, 7, -18, 59, 98, -22, 8, -2 }, { -2, 7, -18, 56, 100, -21, 8, -2 }, { -2, 7, -17, 54, 102, -21, 7, -2 }, { -2, 7, -16, 51, 104, -21, 7, -2 }, { -2, 6, -16, 49, 106, -20, 7, -2 }, { -2, 6, -15, 46, 108, -20, 7, -2 }, { -2, 6, -15, 43, 110, -19, 7, -2 }, { -2, 6, -14, 41, 111, -19, 7, -2 }, { -1, 5, -13, 38, 113, -18, 6, -2 }, { -1, 5, -12, 35, 114, -17, 6, -2 }, { -1, 5, -12, 33, 116, -17, 6, -2 }, { -1, 4, -11, 30, 118, -16, 5, -1 }, { -1, 4, -10, 27, 119, -15, 5, -1 }, { -1, 4, -9, 25, 120, -14, 5, -2 }, { -1, 3, -8, 23, 121, -13, 4, -1 }, { -1, 3, -7, 20, 122, -12, 4, -1 }, { -1, 3, -7, 18, 123, -11, 4, -1 }, { -1, 3, -6, 16, 124, -10, 3, -1 }, { -1, 2, -5, 13, 125, -8, 3, -1 }, { -1, 2, -4, 11, 126, -7, 2, -1 }, { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -2, 6, 127, -5, 1, 0 }, { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, 0, 2, 127, -1, 0, 0 }, // [1, 2) { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, -1, 127, 2, 0, 0 }, { 0, 0, 1, -3, 127, 4, -1, 0 }, { 0, 0, 1, -4, 126, 6, -2, 1 }, { 0, 0, 1, -5, 126, 8, -3, 1 }, { 0, 0, 1, -6, 125, 11, -4, 1 }, { 0, 0, 1, -7, 124, 13, -4, 1 }, { 0, 0, 2, -8, 123, 15, -5, 1 }, { 0, 0, 2, -9, 122, 18, -6, 1 }, { 0, 0, 2, -10, 121, 20, -6, 1 }, { 0, 0, 2, -11, 120, 22, -7, 2 }, { 0, 0, 2, -12, 119, 25, -8, 2 }, { 0, 0, 3, -13, 117, 27, -8, 2 }, { 0, 0, 3, -13, 116, 29, -9, 2 }, { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, { 0, 0, 4, -17, 
104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, -9, 29, 116, -13, 3 }, { 0, 0, 2, -8, 27, 117, -13, 3 }, { 0, 0, 2, -8, 25, 119, -12, 2 }, { 0, 0, 2, -7, 22, 120, -11, 2 }, { 0, 0, 1, -6, 20, 121, -10, 2 }, { 0, 0, 1, -6, 18, 122, -9, 2 }, { 0, 0, 1, -5, 15, 123, -8, 2 }, { 0, 0, 1, -4, 13, 124, -7, 1 }, { 0, 0, 1, -4, 11, 125, -6, 1 }, { 0, 0, 1, -3, 8, 126, -5, 1 }, { 0, 0, 1, -2, 6, 126, -4, 1 }, { 0, 0, 0, -1, 4, 127, -3, 1 }, { 0, 0, 0, 0, 2, 127, -1, 0 }, // [2, 3) { 0, 0, 0, 0, 0, 127, 1, 0 }, { 0, 0, 0, 0, -1, 127, 2, 0 }, { 0, 0, 0, 0, -2, 127, 4, -1 }, { 0, 0, 0, 0, -3, 126, 6, -1 }, { 0, 0, 0, 0, -3, 125, 8, -2 }, { 0, 0, 0, 0, -4, 124, 11, -3 }, { 0, 0, 0, 0, -5, 123, 13, -3 }, { 0, 0, 0, 0, -5, 121, 15, -3 }, { 0, 0, 0, 0, -6, 120, 18, -4 }, { 0, 0, 0, 0, -7, 119, 20, -4 }, { 0, 0, 0, 0, -7, 118, 22, -5 }, { 0, 0, 0, 0, -8, 116, 25, -5 }, { 0, 0, 0, 0, -8, 115, 27, -6 }, { 0, 0, 0, 0, -9, 113, 30, -6 }, { 0, 0, 0, 0, -9, 112, 32, -7 }, { 0, 0, 0, 0, -9, 110, 
34, -7 }, { 0, 0, 0, 0, -10, 108, 37, -7 }, { 0, 0, 0, 0, -10, 107, 39, -8 }, { 0, 0, 0, 0, -10, 105, 41, -8 }, { 0, 0, 0, 0, -11, 103, 44, -8 }, { 0, 0, 0, 0, -11, 101, 47, -9 }, { 0, 0, 0, 0, -11, 99, 49, -9 }, { 0, 0, 0, 0, -11, 97, 51, -9 }, { 0, 0, 0, 0, -11, 95, 54, -10 }, { 0, 0, 0, 0, -11, 93, 56, -10 }, { 0, 0, 0, 0, -12, 91, 59, -10 }, { 0, 0, 0, 0, -12, 89, 61, -10 }, { 0, 0, 0, 0, -12, 87, 64, -11 }, { 0, 0, 0, 0, -12, 85, 66, -11 }, { 0, 0, 0, 0, -12, 82, 69, -11 }, { 0, 0, 0, 0, -12, 80, 71, -11 }, { 0, 0, 0, 0, -12, 78, 73, -11 }, { 0, 0, 0, 0, -11, 75, 75, -11 }, { 0, 0, 0, 0, -11, 73, 78, -12 }, { 0, 0, 0, 0, -11, 71, 80, -12 }, { 0, 0, 0, 0, -11, 69, 82, -12 }, { 0, 0, 0, 0, -11, 66, 85, -12 }, { 0, 0, 0, 0, -11, 64, 87, -12 }, { 0, 0, 0, 0, -10, 61, 89, -12 }, { 0, 0, 0, 0, -10, 59, 91, -12 }, { 0, 0, 0, 0, -10, 56, 93, -11 }, { 0, 0, 0, 0, -10, 54, 95, -11 }, { 0, 0, 0, 0, -9, 51, 97, -11 }, { 0, 0, 0, 0, -9, 49, 99, -11 }, { 0, 0, 0, 0, -9, 47, 101, -11 }, { 0, 0, 0, 0, -8, 44, 103, -11 }, { 0, 0, 0, 0, -8, 41, 105, -10 }, { 0, 0, 0, 0, -8, 39, 107, -10 }, { 0, 0, 0, 0, -7, 37, 108, -10 }, { 0, 0, 0, 0, -7, 34, 110, -9 }, { 0, 0, 0, 0, -7, 32, 112, -9 }, { 0, 0, 0, 0, -6, 30, 113, -9 }, { 0, 0, 0, 0, -6, 27, 115, -8 }, { 0, 0, 0, 0, -5, 25, 116, -8 }, { 0, 0, 0, 0, -5, 22, 118, -7 }, { 0, 0, 0, 0, -4, 20, 119, -7 }, { 0, 0, 0, 0, -4, 18, 120, -6 }, { 0, 0, 0, 0, -3, 15, 121, -5 }, { 0, 0, 0, 0, -3, 13, 123, -5 }, { 0, 0, 0, 0, -3, 11, 124, -4 }, { 0, 0, 0, 0, -2, 8, 125, -3 }, { 0, 0, 0, 0, -1, 6, 126, -3 }, { 0, 0, 0, 0, -1, 4, 127, -2 }, { 0, 0, 0, 0, 0, 2, 127, -1 }, // [3, 4) { 0, 0, 0, 0, 0, 0, 127, 1 }, { 0, 0, 0, 0, 0, 0, 126, 2 }, { 0, 0, 0, 0, 0, 0, 124, 4 }, { 0, 0, 0, 0, 0, 0, 122, 6 }, { 0, 0, 0, 0, 0, 0, 120, 8 }, { 0, 0, 0, 0, 0, 0, 118, 10 }, { 0, 0, 0, 0, 0, 0, 116, 12 }, { 0, 0, 0, 0, 0, 0, 114, 14 }, { 0, 0, 0, 0, 0, 0, 112, 16 }, { 0, 0, 0, 0, 0, 0, 110, 18 }, { 0, 0, 0, 0, 0, 0, 108, 20 }, { 0, 0, 0, 0, 0, 0, 106, 22 }, { 0, 
0, 0, 0, 0, 0, 104, 24 }, { 0, 0, 0, 0, 0, 0, 102, 26 }, { 0, 0, 0, 0, 0, 0, 100, 28 }, { 0, 0, 0, 0, 0, 0, 98, 30 }, { 0, 0, 0, 0, 0, 0, 96, 32 }, { 0, 0, 0, 0, 0, 0, 94, 34 }, { 0, 0, 0, 0, 0, 0, 92, 36 }, { 0, 0, 0, 0, 0, 0, 90, 38 }, { 0, 0, 0, 0, 0, 0, 88, 40 }, { 0, 0, 0, 0, 0, 0, 86, 42 }, { 0, 0, 0, 0, 0, 0, 84, 44 }, { 0, 0, 0, 0, 0, 0, 82, 46 }, { 0, 0, 0, 0, 0, 0, 80, 48 }, { 0, 0, 0, 0, 0, 0, 78, 50 }, { 0, 0, 0, 0, 0, 0, 76, 52 }, { 0, 0, 0, 0, 0, 0, 74, 54 }, { 0, 0, 0, 0, 0, 0, 72, 56 }, { 0, 0, 0, 0, 0, 0, 70, 58 }, { 0, 0, 0, 0, 0, 0, 68, 60 }, { 0, 0, 0, 0, 0, 0, 66, 62 }, { 0, 0, 0, 0, 0, 0, 64, 64 }, { 0, 0, 0, 0, 0, 0, 62, 66 }, { 0, 0, 0, 0, 0, 0, 60, 68 }, { 0, 0, 0, 0, 0, 0, 58, 70 }, { 0, 0, 0, 0, 0, 0, 56, 72 }, { 0, 0, 0, 0, 0, 0, 54, 74 }, { 0, 0, 0, 0, 0, 0, 52, 76 }, { 0, 0, 0, 0, 0, 0, 50, 78 }, { 0, 0, 0, 0, 0, 0, 48, 80 }, { 0, 0, 0, 0, 0, 0, 46, 82 }, { 0, 0, 0, 0, 0, 0, 44, 84 }, { 0, 0, 0, 0, 0, 0, 42, 86 }, { 0, 0, 0, 0, 0, 0, 40, 88 }, { 0, 0, 0, 0, 0, 0, 38, 90 }, { 0, 0, 0, 0, 0, 0, 36, 92 }, { 0, 0, 0, 0, 0, 0, 34, 94 }, { 0, 0, 0, 0, 0, 0, 32, 96 }, { 0, 0, 0, 0, 0, 0, 30, 98 }, { 0, 0, 0, 0, 0, 0, 28, 100 }, { 0, 0, 0, 0, 0, 0, 26, 102 }, { 0, 0, 0, 0, 0, 0, 24, 104 }, { 0, 0, 0, 0, 0, 0, 22, 106 }, { 0, 0, 0, 0, 0, 0, 20, 108 }, { 0, 0, 0, 0, 0, 0, 18, 110 }, { 0, 0, 0, 0, 0, 0, 16, 112 }, { 0, 0, 0, 0, 0, 0, 14, 114 }, { 0, 0, 0, 0, 0, 0, 12, 116 }, { 0, 0, 0, 0, 0, 0, 10, 118 }, { 0, 0, 0, 0, 0, 0, 8, 120 }, { 0, 0, 0, 0, 0, 0, 6, 122 }, { 0, 0, 0, 0, 0, 0, 4, 124 }, { 0, 0, 0, 0, 0, 0, 2, 126 }, // dummy (replicate row index 447) { 0, 0, 0, 0, 0, 0, 2, 126 }, }; // FIXME we might not need this table anymore (I guess it depends on the SIMD) const uint8_t ALIGN(dav2d_sm_weights[3 /* scale */][64], 16) = { // The ith element is computed as 32 >> min(6, (i << 2) >> scale) // This table merges the AVM scales 0 and 2 into 0 (since they are complementary) [0] = { 32, 8, 2, 0, 0, 0, 0, 0, }, [1] = { 32, 16, 8, 4, 2, 1, 0, 0, 
}, [2] = { 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1, 0, 0, }, }; // Intra derivative for directional predictions. // second_dr_intra_derivative[x] = 64*64/dr_intra_derivative[x] const uint16_t dav2d_dr_intra_derivative[90] = { // Angle in degrees. // Starred (*) values are unused. 0, 4096, 2048, // *, 0.9, 1.8, 1365, 1024, 819, // 2.7, 3.6, 4.5, 682, 585, 512, // 5.4, 6.2, 7.1, 455, 409, 409, 409, 372, // 8.0, 8.9, *, *, 9.8, 341, 292, 273, // 10.6, 12.4, 13.2, 256, 227, 215, // 14.0, 15.7, 16.6, 204, 186, 178, // 17.4, 19.0, 19.8, 170, 157, 151, // 20.6, 22.2, 23.0, 146, 136, 132, // 23.7, 25.2, 25.9, 128, 117, 110, // 26.6, 28.7, 30.2, 107, 99, 97, 97, // 30.9, 32.9, *, 33.4, 93, 87, 83, // 34.5, 36.3, 37.6, 81, 77, 74, // 38.3, 39.7, 40.9, 73, 69, 66, // 41.2, 42.8, 44.1, 64, 62, 59, // 45.0, 45.9, 47.3, 56, 55, 53, // 48.8, 49.3, 50.4, 50, 49, 47, // 52.0, 52.6, 53.7, 44, 42, 42, 41, // 55.5, 56.7, *, 57.4, 38, 37, 35, // 59.3, 60.0, 61.3, 32, 31, 30, // 63.4, 64.2, 64.9, 28, 27, 26, // 66.4, 67.1, 67.9, 24, 23, 22, // 69.4, 70.2, 71.0, 20, 19, 18, // 72.6, 73.5, 74.3, 16, 15, 14, // 76.0, 76.8, 77.7, 12, 11, 10, 10, 10, // 79.4, 80.2, *, *, 81.1, 9, 8, 7, // 82.0, 82.9, 83.8, 6, 5, 4, // 84.6, 85.5, 86.4, 3, 2, 1, // 87.3, 88.2, 89.1, }; const uint8_t dav2d_dc_ibp_weights[32] = { /* Unused */ 0, /* len 1 */ 96, /* len 2 */ 86, 107, /* len 4 */ 77, 90, 102, 115, /* len 8 */ 71, 78, 86, 92, 100, 107, 114, 121, /* len 16 */ 68, 72, 76, 79, 83, 87, 90, 94, 98, 102, 106, 109, 113, 117, 121, 124 }; const uint16_t dav2d_div_recip[128 + 1] = { 512, 508, 504, 500, 496, 493, 489, 485, 482, 478, 475, 471, 468, 465, 462, 458, 455, 452, 449, 446, 443, 440, 437, 434, 431, 428, 426, 423, 420, 417, 415, 412, 410, 407, 405, 402, 400, 397, 395, 392, 390, 388, 386, 383, 381, 379, 377, 374, 372, 370, 368, 366, 364, 362, 360, 358, 356, 354, 352, 350, 349, 347, 345, 343, 341, 340, 338, 336, 334, 333, 331, 329, 328, 326, 324, 323, 321, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306, 
305, 303, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 287, 286, 285, 284, 282, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256 }; // Offset values used to adjust the normalized denominator. const uint16_t dav2d_div_scale_sh_offset[8] = { 4822, 5952, 6624, 6792, 6408, 5424, 3792, 1466 }; // Bias for each region's polynomial. const uint16_t dav2d_div_scale_sh_bias[8] = { 12784, 12054, 11670, 11583, 11764, 12195, 12870, 13782 }; // Coefficients for the quadratic (squared) term in the polynomial. const uint8_t dav2d_div_scale_sh_coefw[8] = { 214, 153, 113, 86, 67, 53, 43, 35 }; #if ARCH_X86 #define F(idx, f0, f1, f2, f3, f4, f5, f6) \ [2*idx+0] = f0, [2*idx+1] = f1, \ [2*idx+16] = f2, [2*idx+17] = f3, \ [2*idx+32] = f4, [2*idx+33] = f5, \ [2*idx+48] = f6 #else #define F(idx, f0, f1, f2, f3, f4, f5, f6) \ [1*idx+0] = f0, [1*idx+8] = f1, \ [1*idx+16] = f2, [1*idx+24] = f3, \ [1*idx+32] = f4, [1*idx+40] = f5, \ [1*idx+48] = f6 #endif ATTR_MCMODEL_SMALL const int8_t ALIGN(dav2d_filter_intra_taps[5][64], 64) = { { F( 0, -6, 10, 0, 0, 0, 12, 0 ), F( 1, -5, 2, 10, 0, 0, 9, 0 ), F( 2, -3, 1, 1, 10, 0, 7, 0 ), F( 3, -3, 1, 1, 2, 10, 5, 0 ), F( 4, -4, 6, 0, 0, 0, 2, 12 ), F( 5, -3, 2, 6, 0, 0, 2, 9 ), F( 6, -3, 2, 2, 6, 0, 2, 7 ), F( 7, -3, 1, 2, 2, 6, 3, 5 ), }, { F( 0, -10, 16, 0, 0, 0, 10, 0 ), F( 1, -6, 0, 16, 0, 0, 6, 0 ), F( 2, -4, 0, 0, 16, 0, 4, 0 ), F( 3, -2, 0, 0, 0, 16, 2, 0 ), F( 4, -10, 16, 0, 0, 0, 0, 10 ), F( 5, -6, 0, 16, 0, 0, 0, 6 ), F( 6, -4, 0, 0, 16, 0, 0, 4 ), F( 7, -2, 0, 0, 0, 16, 0, 2 ), }, { F( 0, -8, 8, 0, 0, 0, 16, 0 ), F( 1, -8, 0, 8, 0, 0, 16, 0 ), F( 2, -8, 0, 0, 8, 0, 16, 0 ), F( 3, -8, 0, 0, 0, 8, 16, 0 ), F( 4, -4, 4, 0, 0, 0, 0, 16 ), F( 5, -4, 0, 4, 0, 0, 0, 16 ), F( 6, -4, 0, 0, 4, 0, 0, 16 ), F( 7, -4, 0, 0, 0, 4, 0, 16 ), }, { F( 0, -2, 8, 0, 0, 0, 10, 0 ), F( 1, -1, 3, 8, 0, 0, 6, 0 ), F( 2, -1, 2, 3, 8, 0, 4, 0 ), F( 3, 0, 1, 2, 3, 8, 2, 0 ), F( 4, -1, 4, 0, 
0, 0, 3, 10 ), F( 5, -1, 3, 4, 0, 0, 4, 6 ), F( 6, -1, 2, 3, 4, 0, 4, 4 ), F( 7, -1, 2, 2, 3, 4, 3, 3 ), }, { F( 0, -12, 14, 0, 0, 0, 14, 0 ), F( 1, -10, 0, 14, 0, 0, 12, 0 ), F( 2, -9, 0, 0, 14, 0, 11, 0 ), F( 3, -8, 0, 0, 0, 14, 10, 0 ), F( 4, -10, 12, 0, 0, 0, 0, 14 ), F( 5, -9, 1, 12, 0, 0, 0, 12 ), F( 6, -8, 0, 0, 12, 0, 1, 11 ), F( 7, -7, 0, 0, 1, 12, 1, 9 ), } }; // Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512 ATTR_MCMODEL_SMALL const int16_t dav2d_gaussian_sequence[2048] = { 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, 204, -508, 
648, -136, 372, -272, -120, -1004, -552, -548, -384, 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, -212, 52, 12, 200, 268, -488, -404, 
-880, 824, -672, -40, 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, 412, -200, 392, 440, -200, 
264, -152, -260, 320, 1032, 216, 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, -364, 96, 544, 232, -1032, 956, 236, 
356, 20, -40, 300, 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, -4, -28, -368, -52, -524, -572, -556, -200, 768, 
1124, -208, -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, 428, -484 }; const int16_t dav2d_deblock_side_thresholds[296] = { -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -14, -13, -11, -10, -9, -7, -6, -4, -3, -2, 0, 0, 2, 3, 5, 6, 7, 9, 10, 12, 13, 15, 16, 
17, 19, 20, 22, 23, 24, 26, 27, 29, 30, 32, 33, 34, 36, 37, 39, 40, 42, 43, 44, 46, 47, 49, 50, 51, 53, 54, 56, 57, 59, 60, 61, 63, 64, 66, 67, 69, 70, 71, 73, 74, 76, 77, 78, 80, 81, 83, 84, 86, 87, 88, 90, 91, 93, 94, 96, 101, 111, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 249, 259, 269, 279, 289, 299, 309, 319, 329, 339, 349, 359, 368, 378, 388, 398, 408, 418, 428, 438, 448, 458, 468, 478, 488, 497, 507, 517, 527, 537, 547, 557, 567, 577, 587, 597, 607, 616, 626, 636, 646, 656, 666, 676, 686, 696, 706, 716, 726, 736, 745, 755, 765, 775, 785, 795, 805, 815, 825, 835, 845, 855, 864, 874, 884, 894, 904, 914, 924, 934, 944, 954, 964, 974, 984, 993, 1003, 1013, 1023, 1033, 1043, 1053, 1063, 1073, 1083, 1093, 1103, 1112, 1122, 1132, 1142, 1152, 1162, 1172, 1182, 1192, 1202, 1212, 1222, 1232, 1241, 1251, 1261, 1271, 1281, 1291, 1301, 1311, 1321, 1331, 1341, 1351, 1360, 1370, 1380, 1390, 1400, 1410, 1420, 1430, 1440, 1450, 1460, 1470, 1480, 1489, 1499, 1509, 1519, 1529, 1539, 1549, 1559, 1569, 1579, 1589, 1599, 1608, 1618, 1628, 1638, 1648, 1658, 1668, 1678 }; const uint8_t /*enum Dav2dPixelLayout*/ dav2d_layouts[] = { DAV2D_PIXEL_LAYOUT_I420, DAV2D_PIXEL_LAYOUT_I400, DAV2D_PIXEL_LAYOUT_I444, DAV2D_PIXEL_LAYOUT_I422, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/tables.h000066400000000000000000000104101517466257200222520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_TABLES_H #define DAV2D_SRC_TABLES_H #include #include "common/intops.h" #include "src/levels.h" // width, height (in 4px blocks), log2 versions of these two EXTERN const uint8_t dav2d_block_dimensions[N_BS_SIZES][4]; typedef struct TxfmInfo { // width, height (in 4px blocks), log2 of them, min/max of log2, sub, pad uint8_t w, h, lw, lh, min, max, sub, ctx; } TxfmInfo; EXTERN const TxfmInfo dav2d_txfm_dimensions[N_RECT_TX_SIZES]; EXTERN const uint8_t dav2d_tx_shift[N_RECT_TX_SIZES][2]; EXTERN const uint8_t dav2d_tx_ddt_mask[N_RECT_TX_SIZES]; EXTERN const uint8_t /* enum (Rect)TxfmSize */ dav2d_max_txfm_size_for_bs[N_BS_SIZES][4 /* 444, 422, 420, ll */]; EXTERN const uint8_t /* enum BlockSize */ dav2d_ss_bs[N_BS_SIZES][3 /* 420, 422, 444 */]; EXTERN const char *const dav2d_tx1d_names[N_TX_1D_TYPES]; // order: split, horz, vert, horz4, vert4, horz5[small], ver5[small] // the big transforms in horz5 and vert5 are identical to horz or vert EXTERN const int8_t 
dav2d_tx_part_tbl[N_BS_SIZES][8]; EXTERN const uint8_t /* enum TxfmType */ dav2d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES]; EXTERN const int16_t dav2d_cctx_angle[6][3]; EXTERN const uint8_t dav2d_mode_to_angle_map[8]; EXTERN const uint8_t /* enum InterPredMode */ dav2d_comp_inter_pred_modes[][2]; EXTERN const Dav2dWarpedMotionParams dav2d_default_wm_params; EXTERN const int8_t dav2d_tip_wts[8]; EXTERN const int16_t dav2d_deblock_side_thresholds[296]; EXTERN const int8_t dav2d_cdef_directions[12][2]; EXTERN const uint16_t dav2d_ccso_quant_sz[4][4]; EXTERN const int8_t dav2d_ccso_offset[4][8]; EXTERN const unsigned dav2d_subset_masks_y[4]; EXTERN const unsigned dav2d_subset_masks_uv[3]; EXTERN const int8_t dav2d_wiener_ns_filters[64][16]; EXTERN const int16_t dav2d_pc_wiener_filters[4][64][13]; EXTERN const int8_t dav2d_ns_wiener_coef_range_y[16][2]; EXTERN const int8_t dav2d_ns_wiener_coef_range_uv[18][2]; EXTERN const uint8_t dav2d_pc_weiner_lut_to_class[4096]; EXTERN const uint8_t dav2d_pc_wiener_sub_classify[4][256]; EXTERN const uint8_t dav2d_pc_wiener_sub_classify_ns[4][7][256]; EXTERN const int8_t dav2d_mc_subpel_filters[6][15][8]; EXTERN const int8_t dav2d_ext_warp_filter[63][8]; EXTERN const int8_t dav2d_mc_warp_filter[7*64+1][8]; EXTERN const uint8_t dav2d_sm_weights[3][64]; EXTERN const uint16_t dav2d_dr_intra_derivative[90]; EXTERN const uint8_t dav2d_dc_ibp_weights[32]; EXTERN const uint16_t dav2d_div_recip[128 + 1]; EXTERN const uint16_t dav2d_div_scale_sh_offset[8]; EXTERN const uint16_t dav2d_div_scale_sh_bias[8]; EXTERN const uint8_t dav2d_div_scale_sh_coefw[8]; EXTERN const int8_t dav2d_filter_intra_taps[5][64]; EXTERN const int16_t dav2d_gaussian_sequence[2048]; // for fgs EXTERN const uint8_t /*enum Dav2dPixelLayout*/ dav2d_layouts[4]; #endif /* DAV2D_SRC_TABLES_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/thread.h000066400000000000000000000126271517466257200222630ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and 
dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_SRC_THREAD_H #define DAV2D_SRC_THREAD_H #if defined(_WIN32) #include #include #define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT typedef struct { HANDLE h; void *(*func)(void*); void *arg; } pthread_t; typedef struct { unsigned stack_size; } pthread_attr_t; typedef SRWLOCK pthread_mutex_t; typedef CONDITION_VARIABLE pthread_cond_t; typedef INIT_ONCE pthread_once_t; void dav2d_init_thread(void); void dav2d_set_thread_name(const wchar_t *name); #define dav2d_set_thread_name(name) dav2d_set_thread_name(L##name) int dav2d_pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*func)(void*), void *arg); int dav2d_pthread_join(pthread_t *thread, void **res); int dav2d_pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); #define pthread_create dav2d_pthread_create #define pthread_join(thread, res) dav2d_pthread_join(&(thread), res) #define pthread_once dav2d_pthread_once static inline int pthread_attr_init(pthread_attr_t *const attr) { attr->stack_size = 0; return 0; } static inline int pthread_attr_destroy(pthread_attr_t *const attr) { return 0; } static inline int pthread_attr_setstacksize(pthread_attr_t *const attr, const size_t stack_size) { if (stack_size > UINT_MAX) return 1; attr->stack_size = (unsigned) stack_size; return 0; } static inline int pthread_mutex_init(pthread_mutex_t *const mutex, const void *const attr) { InitializeSRWLock(mutex); return 0; } static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) { return 0; } static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) { AcquireSRWLockExclusive(mutex); return 0; } static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) { ReleaseSRWLockExclusive(mutex); return 0; } static inline int pthread_cond_init(pthread_cond_t *const cond, const void *const attr) { InitializeConditionVariable(cond); return 0; } static inline int pthread_cond_destroy(pthread_cond_t *const cond) { return 
0; } static inline int pthread_cond_wait(pthread_cond_t *const cond, pthread_mutex_t *const mutex) { return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0); } static inline int pthread_cond_signal(pthread_cond_t *const cond) { WakeConditionVariable(cond); return 0; } static inline int pthread_cond_broadcast(pthread_cond_t *const cond) { WakeAllConditionVariable(cond); return 0; } #else #include #if defined(__FreeBSD__) /* ALIGN from conflicts with ALIGN from "common/attributes.h" */ #define _SYS_PARAM_H_ #include #endif #if HAVE_PTHREAD_NP_H #include #endif #define dav2d_init_thread() do {} while (0) /* Thread naming support */ #ifdef __linux__ #include static inline void dav2d_set_thread_name(const char *const name) { prctl(PR_SET_NAME, name); } #elif HAVE_PTHREAD_SETNAME_NP && defined(__APPLE__) static inline void dav2d_set_thread_name(const char *const name) { pthread_setname_np(name); } #elif HAVE_PTHREAD_SETNAME_NP && defined(__NetBSD__) static inline void dav2d_set_thread_name(const char *const name) { pthread_setname_np(pthread_self(), "%s", (void*)name); } #elif HAVE_PTHREAD_SETNAME_NP static inline void dav2d_set_thread_name(const char *const name) { pthread_setname_np(pthread_self(), name); } #elif HAVE_PTHREAD_SET_NAME_NP static inline void dav2d_set_thread_name(const char *const name) { pthread_set_name_np(pthread_self(), name); } #elif defined(__HAIKU__) #include static inline void dav2d_set_thread_name(const char *const name) { rename_thread(find_thread(NULL), name); } #else #define dav2d_set_thread_name(name) do {} while (0) #endif #endif #endif /* DAV2D_SRC_THREAD_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/thread_data.h000066400000000000000000000032131517466257200232430ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_THREAD_DATA_H
#define DAV2D_SRC_THREAD_DATA_H

#include "src/thread.h"

/*
 * Per-thread bookkeeping: a thread handle bundled with the condition
 * variable and mutex associated with it.
 */
struct thread_data {
    pthread_t thread;      /* handle of the thread */
    pthread_cond_t cond;   /* condition variable paired with this thread */
    pthread_mutex_t lock;  /* mutex paired with this thread */
    int inited;            /* NOTE(review): presumably records how far the
                            * fields above were initialized - confirm at the
                            * creation/teardown sites */
};

#endif /* DAV2D_SRC_THREAD_DATA_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/thread_task.c000066400000000000000000001263531517466257200233020ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1.
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "common/frame.h" #include "src/thread_task.h" #include "src/fg_apply.h" // This function resets the cur pointer to the first frame theoretically // executable after a task completed (ie. each time we update some progress or // insert some tasks in the queue). // When frame_idx is set, it can be either from a completed task, or from tasks // inserted in the queue, in which case we have to make sure the cur pointer // isn't past this insert. // The special case where frame_idx is UINT_MAX is to handle the reset after // completing a task and locklessly signaling progress. In this case we don't // enter a critical section, which is needed for this function, so we set an // atomic for a delayed handling, happening here. Meaning we can call this // function without any actual update other than what's in the atomic, hence // this special case. 
static inline int reset_task_cur(const Dav2dContext *const c,
                                 struct TaskThreadData *const ttd,
                                 unsigned frame_idx)
{
    // Returns 1 when ttd->cur was repositioned (the task_cur_prev cursors of
    // every frame from the new position onwards are cleared), 0 when nothing
    // had to change.
    const unsigned first = atomic_load(&ttd->first);
    // consume any reset request posted by reset_task_cur_async()
    unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
    if (reset_frame_idx < first) {
        // the posted index lies before the current first frame: discard it
        if (frame_idx == UINT_MAX) return 0;
        reset_frame_idx = UINT_MAX;
    }
    // cursor already at the very start: nothing to rewind
    if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
        return 0;
    if (reset_frame_idx != UINT_MAX) {
        if (frame_idx == UINT_MAX) {
            // only the asynchronous request applies
            if (reset_frame_idx > first + ttd->cur)
                return 0;
            ttd->cur = reset_frame_idx - first;
            goto cur_found;
        }
    } else if (frame_idx == UINT_MAX)
        return 0;
    // express frame_idx relative to the ring starting at first
    if (frame_idx < first) frame_idx += c->n_fc;
    const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx);
    const unsigned cur_frame_idx = first + ttd->cur;
    if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx)
        return 0;
    // move to the first frame at or after min_frame_idx that has queued tasks
    for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++)
        if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head)
            break;
cur_found:
    for (unsigned i = ttd->cur; i < c->n_fc; i++)
        c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL;
    return 1;
}

// Post a cursor reset for frame_idx without taking any lock: the index is
// stored in ttd->reset_task_cur and consumed later by reset_task_cur().
// n_frames is added to frame_idx when it is below ttd->first.
static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
                                        unsigned frame_idx, unsigned n_frames)
{
    const unsigned first = atomic_load(&ttd->first);
    if (frame_idx < first) frame_idx += n_frames;
    unsigned last_idx = frame_idx;
    // keep the smallest index: if the exchange displaced a smaller value,
    // write that smaller value back
    do {
        frame_idx = last_idx;
        last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
    } while (last_idx < frame_idx);
    // ttd->first changed while we were posting an index equal to the old
    // first: withdraw the request, unless it has been replaced meanwhile
    if (frame_idx == first && atomic_load(&ttd->first) != first) {
        unsigned expected = frame_idx;
        atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
    }
}

// Splice the chain first..last into f's task list between a and b.
// a == NULL makes first the new head, b == NULL makes last the new tail.
// Nothing is inserted while the context's flush flag is set. After the
// splice the scheduling cursor is re-evaluated, and if cond_signal is set
// and no wake-up is already flagged, one waiter on ttd->cond is signalled.
static void insert_tasks_between(Dav2dFrameContext *const f,
                                 Dav2dTask *const first, Dav2dTask *const last,
                                 Dav2dTask *const a, Dav2dTask *const b,
                                 const int cond_signal)
{
    struct TaskThreadData *const ttd = f->task_thread.ttd;
    if (atomic_load(f->c->flush)) return;
    assert(!a || a->next == b);
    if (!a) f->task_thread.task_head = first;
    else a->next = first;
    if (!b) f->task_thread.task_tail = last;
    last->next = b;
    reset_task_cur(f->c, ttd, first->frame_idx);
    if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1))
        pthread_cond_signal(&ttd->cond);
}

// Insert the chain first..last into f's task list at its ordered position.
// The position is chosen from the properties of `first` only; the walk below
// compares by tile-entropy vs. other types, then sby, then type, then tile
// index.
static void insert_tasks(Dav2dFrameContext *const f,
                         Dav2dTask *const first, Dav2dTask *const last,
                         const int cond_signal)
{
    // insert task back into task queue
    Dav2dTask *t_ptr, *prev_t = NULL;
    for (t_ptr = f->task_thread.task_head;
         t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next)
    {
        // entropy coding precedes other steps
        if (t_ptr->type == DAV2D_TASK_TYPE_TILE_ENTROPY) {
            if (first->type > DAV2D_TASK_TYPE_TILE_ENTROPY)
                continue;
            // both are entropy
            if (first->sby > t_ptr->sby) continue;
            if (first->sby < t_ptr->sby) {
                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
                return;
            }
            // same sby
        } else {
            if (first->type == DAV2D_TASK_TYPE_TILE_ENTROPY) {
                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
                return;
            }
            if (first->sby > t_ptr->sby) continue;
            if (first->sby < t_ptr->sby) {
                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
                return;
            }
            // same sby
            if (first->type > t_ptr->type) continue;
            if (first->type < t_ptr->type) {
                insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
                return;
            }
            // same task type
        }

        // sort by tile-id
        assert(first->type == DAV2D_TASK_TYPE_TILE_RECONSTRUCTION ||
               first->type == DAV2D_TASK_TYPE_TILE_MV_RESOLUTION ||
               first->type == DAV2D_TASK_TYPE_TILE_ENTROPY);
        assert(first->type == t_ptr->type);
        assert(t_ptr->sby == first->sby);
        // reconstruction uses the last pass, entropy pass 0, anything else 1
        const int pass = first->type == DAV2D_TASK_TYPE_TILE_RECONSTRUCTION ?
            f->c->task_thread.n_passes - 1 : first->type != DAV2D_TASK_TYPE_TILE_ENTROPY;
        // tile index == offset of the task within its pass' tile task array
        const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[pass]);
        const int p_tile_idx = (int) (t_ptr - f->task_thread.tile_tasks[pass]);
        assert(t_tile_idx != p_tile_idx);
        if (t_tile_idx > p_tile_idx) continue;
        insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal);
        return;
    }
    // append at the end
    insert_tasks_between(f, first, last, prev_t, NULL, cond_signal);
}

// Single-task convenience wrapper around insert_tasks().
static inline void insert_task(Dav2dFrameContext *const f,
                               Dav2dTask *const t, const int cond_signal)
{
    insert_tasks(f, t, t, cond_signal);
}

// Append t to f's pending list under pending_tasks.lock and raise the merge
// flag so that a later merge_pending_frame() moves it into the task list.
static inline void add_pending(Dav2dFrameContext *const f, Dav2dTask *const t) {
    pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
    t->next = NULL;
    if (!f->task_thread.pending_tasks.head)
        f->task_thread.pending_tasks.head = t;
    else
        f->task_thread.pending_tasks.tail->next = t;
    f->task_thread.pending_tasks.tail = t;
    atomic_store(&f->task_thread.pending_tasks.merge, 1);
    pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
}

// Move every task on f's pending list into its ordered task list.
// Returns the value of the merge flag read on entry (non-zero when a merge
// was performed).
static inline int merge_pending_frame(Dav2dFrameContext *const f) {
    int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
    if (merge) {
        // detach the whole pending list while holding its lock ...
        pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
        Dav2dTask *t = f->task_thread.pending_tasks.head;
        f->task_thread.pending_tasks.head = NULL;
        f->task_thread.pending_tasks.tail = NULL;
        atomic_store(&f->task_thread.pending_tasks.merge, 0);
        pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
        // ... then insert task by task with the pending lock released
        while (t) {
            // insertion rewrites t->next, so remember the successor first
            Dav2dTask *const tmp = t->next;
            insert_task(f, t, 0);
            t = tmp;
        }
    }
    return merge;
}

// Run merge_pending_frame() on every frame context.
// Returns the OR of the per-frame results.
static inline int merge_pending(const Dav2dContext *const c) {
    int res = 0;
    for (unsigned i = 0; i < c->n_fc; i++)
        res |= merge_pending_frame(&c->fc[i]);
    return res;
}

// Set up the sbrow-0 progress/post-filter task of one pass and hand it back
// through *res_t. Grows the per-frame task array when needed and, for the
// non-entropy pass, (re)initializes the progress bitmaps.
// Returns 0 on success, -1 when an allocation fails.
static int create_filter_sbrow(Dav2dFrameContext *const f,
                               const int is_entropy_pass, Dav2dTask **res_t)
{
    const int has_deblock = f->frame_hdr->deblock.level_y[0] ||
                            f->frame_hdr->deblock.level_y[1];
    const int has_cdef = f->seq_hdr->cdef;
    const int has_lr = f->lf.restore_planes;

    Dav2dTask *tasks = f->task_thread.tasks;
    const int n_passes = f->c->task_thread.n_passes;
    // one slot per sbrow, twice that when there is more than one pass
    int num_tasks = f->sbh * (1 + (n_passes > 1));
    if (num_tasks > f->task_thread.num_tasks) {
        const size_t size = sizeof(Dav2dTask) * num_tasks;
        tasks = dav2d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size);
        if (!tasks) return -1;
        memset(tasks, 0, size);
        f->task_thread.tasks = tasks;
        f->task_thread.num_tasks = num_tasks;
    }
    // entropy-pass tasks occupy the second group of sbh slots
    tasks += f->sbh * is_entropy_pass;

    if (is_entropy_pass) {
        f->frame_thread.entropy_progress = 0;
    } else {
        // number of 32-bit words needed for one bit per sbrow
        const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
        if (prog_sz > f->frame_thread.prog_sz) {
            // one allocation backs both bitmaps
            atomic_uint *const prog =
                dav2d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress,
                              2 * prog_sz * sizeof(*prog));
            if (!prog) return -1;
            f->frame_thread.frame_progress = prog;
            f->frame_thread.copy_db_progress = prog + prog_sz;
        }
        f->frame_thread.prog_sz = prog_sz;
        memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
        memset(f->frame_thread.copy_db_progress, 0, prog_sz * sizeof(atomic_uint));
        atomic_store(&f->frame_thread.deblock_progress, 0);
    }
    f->frame_thread.next_tile_row[is_entropy_pass] = 0;

    Dav2dTask *t = &tasks[0];
    t->sby = 0;
    t->recon_progress = 1;
    t->deblock_progress = 0;
    // first stage that applies to this frame
    t->type = is_entropy_pass ? DAV2D_TASK_TYPE_ENTROPY_PROGRESS :
              has_deblock ? DAV2D_TASK_TYPE_DEBLOCK_COLS :
              has_cdef || has_lr /* i.e. LR backup */
                          ? DAV2D_TASK_TYPE_DEBLOCK_ROWS :
                            DAV2D_TASK_TYPE_RECONSTRUCTION_PROGRESS;
    t->frame_idx = (int)(f - f->c->fc);
    *res_t = t;
    return 0;
}

// Create the tile tasks of one pass for frame f (one per tile, starting at
// each tile's first sbrow), chain them together with the pass' filter or
// progress task where one is created, and append the chain to f's pending
// list.
// pass 0 also (re)allocates the tile task array shared by all passes.
// Returns 0 on success, -1 when an allocation fails.
// Note: cond_signal is not referenced in the body.
int dav2d_task_create_tile_sbrow(Dav2dFrameContext *const f, const int pass,
                                 const int cond_signal)
{
    Dav2dTask *tasks = f->task_thread.tile_tasks[0];
    const int n_passes = f->c->task_thread.n_passes;
    const int num_tasks = f->frame_hdr->tiling.t.cols * f->frame_hdr->tiling.t.rows;
    if (!pass) {
        int alloc_num_tasks = num_tasks * n_passes;
        if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
            const size_t size = sizeof(Dav2dTask) * alloc_num_tasks;
            tasks = dav2d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size);
            if (!tasks) return -1;
            memset(tasks, 0, size);
            f->task_thread.tile_tasks[0] = tasks;
            f->task_thread.num_tile_tasks = alloc_num_tasks;
        }
        // per-pass views into the single allocation
        f->task_thread.tile_tasks[1] = tasks + num_tasks;
        f->task_thread.tile_tasks[2] = tasks + num_tasks * 2;
    }
    tasks += num_tasks * pass;

    // no filter/progress task is created for the middle pass of three
    Dav2dTask *pf_t = NULL;
    if (!(n_passes == 3 && pass == 1) &&
        create_filter_sbrow(f, pass + 1 != n_passes, &pf_t))
    {
        return -1;
    }

    Dav2dTask *prev_t = NULL;
    for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) {
        Dav2dTileState *const ts = &f->ts[tile_idx];
        Dav2dTask *t = &tasks[tile_idx];
        t->sby = ts->tiling.row_start >> f->sb_shift;
        // link the sbrow-0 filter task in before the first tile task that
        // starts on a later sbrow
        if (pf_t && t->sby) {
            prev_t->next = pf_t;
            prev_t = pf_t;
            pf_t = NULL;
        }
        t->recon_progress = 0;
        t->deblock_progress = 0;
        t->deps_skip = 0;
        t->type = pass + 1 == n_passes ? DAV2D_TASK_TYPE_TILE_RECONSTRUCTION :
                  !pass ? DAV2D_TASK_TYPE_TILE_ENTROPY :
                          DAV2D_TASK_TYPE_TILE_MV_RESOLUTION;
        t->frame_idx = (int)(f - f->c->fc);
        if (prev_t) prev_t->next = t;
        prev_t = t;
    }
    // every tile started on sbrow 0: the filter task goes last
    if (pf_t) {
        prev_t->next = pf_t;
        prev_t = pf_t;
    }
    prev_t->next = NULL;

    // clear the done flag of this pass: done[1] for pass 0 when it is not
    // the last pass, done[0] for the last pass
    if (!pass || pass + 1 == n_passes)
        atomic_store(&f->task_thread.done[pass + 1 != n_passes], 0);

    // XXX in theory this could be done locklessly, at this point there are no
    // tasks in the frameQ, so no other runner should be using this lock, but
    // we must add both passes at once
    pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
    assert(f->task_thread.pending_tasks.head == NULL || pass > 0);
    if (!f->task_thread.pending_tasks.head)
        f->task_thread.pending_tasks.head = &tasks[0];
    else
        f->task_thread.pending_tasks.tail->next = &tasks[0];
    f->task_thread.pending_tasks.tail = prev_t;
    atomic_store(&f->task_thread.pending_tasks.merge, 1);
    atomic_store(&f->task_thread.init_done, 1);
    pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);

    return 0;
}

// Queue the INIT task of frame f and wake a worker. init_done is cleared
// first; it is set again once the frame's tile tasks have been created.
void dav2d_task_frame_init(Dav2dFrameContext *const f) {
    const Dav2dContext *const c = f->c;

    atomic_store(&f->task_thread.init_done, 0);
    // schedule init task, which will schedule the remaining tasks
    Dav2dTask *const t = &f->task_thread.init_task;
    t->type = DAV2D_TASK_TYPE_INIT;
    t->frame_idx = (int)(f - c->fc);
    t->sby = 0;
    t->recon_progress = t->deblock_progress = 0;
    insert_task(f, t, 1);
}

// Publish a film grain job (in -> out) for the worker threads and block the
// caller until delayed_fg.finished has been set.
void dav2d_task_delayed_fg(Dav2dContext *const c, Dav2dPicture *const out,
                           const Dav2dPicture *const in)
{
    struct TaskThreadData *const ttd = &c->task_thread;
    ttd->delayed_fg.in = in;
    ttd->delayed_fg.out = out;
    ttd->delayed_fg.type = DAV2D_TASK_TYPE_FG_PREP;
    atomic_init(&ttd->delayed_fg.progress[0], 0);
    atomic_init(&ttd->delayed_fg.progress[1], 0);
    pthread_mutex_lock(&ttd->lock);
    ttd->delayed_fg.exec = 1;
    ttd->delayed_fg.finished = 0;
    pthread_cond_signal(&ttd->cond);
    // re-check the flag after every wake-up
    do {
        pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock);
    } while (!ttd->delayed_fg.finished);
    pthread_mutex_unlock(&ttd->lock);
}

static inline int
ensure_progress(struct TaskThreadData *const ttd,
                Dav2dFrameContext *const f,
                Dav2dTask *const t, const enum TaskType type,
                atomic_int *const state, int *const target)
{
    // Returns 0 when *state has already reached t->sby. Otherwise t is
    // re-typed to `type`, *target is set to t->sby, t is put on the pending
    // list, ttd->lock is taken (and left held for the caller) and 1 is
    // returned.

    // deblock_rows (non-LR portion) depends on deblock of previous sbrow,
    // so ensure that completed. if not, re-add to task-queue; else, fall-through
    int p1 = atomic_load(state);
    if (p1 < t->sby) {
        t->type = type;
        t->recon_progress = t->deblock_progress = 0;
        *target = t->sby;
        add_pending(f, t);
        pthread_mutex_lock(&ttd->lock);
        return 1;
    }
    return 0;
}

// Dependency check for a tile task.
// Returns 1 when t has to wait (its own pass, the previous pass or a
// reference frame has not progressed far enough), 0 when it can run.
// Errors seen in the inspected progress values are folded into
// f->task_thread.error. t->deps_skip is advanced past reference frames that
// were already checked.
static inline int check_tile(Dav2dTask *const t, Dav2dFrameContext *const f) {
    const int n_passes = f->c->task_thread.n_passes;
    // reconstruction uses the last pass, entropy pass 0, anything else 1
    const int pass = t->type == DAV2D_TASK_TYPE_TILE_RECONSTRUCTION ? n_passes - 1 :
                     t->type != DAV2D_TASK_TYPE_TILE_ENTROPY;
    const int tile_idx = (int)(t - f->task_thread.tile_tasks[pass]);
    Dav2dTileState *const ts = &f->ts[tile_idx];
    // this pass must have reached the sbrow this task wants to process
    const int p1 = atomic_load(&ts->progress[pass]);
    if (p1 < t->sby) return 1;
    int error = p1 == TILE_ERROR;
    error |= atomic_fetch_or(&f->task_thread.error, error);
    if (!error && pass > 0) {
        // the previous pass must already be past this sbrow
        const int p2 = atomic_load(&ts->progress[pass - 1]);
        if (p2 <= t->sby) return 1;
        error = p2 == TILE_ERROR;
        error |= atomic_fetch_or(&f->task_thread.error, error);
    }
    if (!error && f->c->n_fc > 1 && !IS_KEY_OR_INTRA(f->frame_hdr)) {
        // check reference state
        const Dav2dThreadPicture *p = &f->cur;
        const int ss_ver = p->p.p.layout == DAV2D_PIXEL_LAYOUT_I420;
        const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2);
        const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift);
        const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby];
        for (int n = t->deps_skip; n < f->frame_hdr->n_ref_frames; n++, t->deps_skip++) {
            unsigned lowest;
            if (t->type != DAV2D_TASK_TYPE_TILE_RECONSTRUCTION) {
                // if temporal mv refs are disabled, we only need this
                // for the primary ref; if segmentation is disabled, we
                // don't even need that
                lowest = p_b;
            } else {
                // +8 is postfilter-induced delay
                const int y = lowest_px[n][0] == INT_MIN ? INT_MIN :
                              lowest_px[n][0] + 8;
                const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN :
                               lowest_px[n][1] * (1 << ss_ver) + 8;
                int max = imax(y, uv);
                if (f->rf.mfmv_mask & (1 << n)) max = imax(max, p_b);
                // INT_MIN in every term: nothing to wait for on this ref
                if (max == INT_MIN) continue;
                lowest = iclip(max, 1, f->refp[n].p.p.h);
            }
            // progress slot 0 for pass 0, slot 2 for any later pass
            const unsigned p3 = atomic_load(&f->refp[n].progress[!!pass * 2]);
            if (p3 < lowest) return 1;
            atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR);
        }
    }
    return 0;
}

// Scan the frame_progress bitmap for the first clear bit, starting at the
// word derived from f->cur.progress[2] (word 0 when c->n_fc is 1), and
// return that bit's index minus one. Returns f->sbh - 1 when the frame
// progress is at or above FRAME_ERROR.
static inline int get_frame_progress(const Dav2dContext *const c,
                                     const Dav2dFrameContext *const f)
{
    unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->cur.progress[2]) : 0;
    if (frame_prog >= FRAME_ERROR)
        return f->sbh - 1;
    int idx = frame_prog >> (f->sb_shift + 7);
    int prog;
    do {
        atomic_uint *state = &f->frame_thread.frame_progress[idx];
        // invert so that the first clear bit becomes the lowest set bit
        const unsigned val = ~atomic_load(state);
        prog = val ? ctz(val) : 32;
        if (prog != 32) break;
        // word completely set: continue with the next one
        prog = 0;
    } while (++idx < f->frame_thread.prog_sz);
    return ((idx << 5) | prog) - 1;
}

// Mark frame f as failed: the error flag becomes 1 for DAV2D_ERR(EINVAL) and
// -1 for any other error, the task counter is zeroed, both done flags are
// set and all three progress values become FRAME_ERROR. The frame is then
// torn down through dav2d_decode_frame_exit() and a waiter on
// f->task_thread.cond is signalled.
static inline void abort_frame(Dav2dFrameContext *const f, const int error) {
    atomic_store(&f->task_thread.error, error == DAV2D_ERR(EINVAL) ? 1 : -1);
    atomic_store(&f->task_thread.task_counter, 0);
    atomic_store(&f->task_thread.done[0], 1);
    atomic_store(&f->task_thread.done[1], 1);
    atomic_store(&f->cur.progress[0], FRAME_ERROR);
    atomic_store(&f->cur.progress[1], FRAME_ERROR);
    atomic_store(&f->cur.progress[2], FRAME_ERROR);
    dav2d_decode_frame_exit(f, error);
    f->n_tile_data = 0;
    pthread_cond_signal(&f->task_thread.cond);
}

// Run one worker's share of the delayed film grain job described by
// ttd->delayed_fg. The function releases ttd->lock around the actual work
// and re-acquires it before returning, so it expects to be entered with the
// lock held.
static inline void delayed_fg_task(const Dav2dContext *const c,
                                   struct TaskThreadData *const ttd)
{
    const Dav2dPicture *const in = ttd->delayed_fg.in;
    Dav2dPicture *const out = ttd->delayed_fg.out;
#if CONFIG_16BPC
    // dsp table index for high bit depth: 1 for 10 bpc, 2 for 12 bpc.
    // Left unset for 8 bpc, where it is not read.
    int off;
    if (out->p.bpc != 8)
        off = (out->p.bpc >> 1) - 4;
#endif
    switch (ttd->delayed_fg.type) {
    case DAV2D_TASK_TYPE_FG_PREP:
        // preparation is done by a single thread: clear exec so that no
        // other worker picks the job up while it runs
        ttd->delayed_fg.exec = 0;
        if (atomic_load(&ttd->cond_signaled))
            pthread_cond_signal(&ttd->cond);
        pthread_mutex_unlock(&ttd->lock);
        switch (out->p.bpc) {
#if CONFIG_8BPC
        case 8:
            dav2d_prep_grain_8bpc(&c->dsp[0].fg, out, in,
                                  ttd->delayed_fg.scaling_8bpc,
                                  ttd->delayed_fg.grain_lut_8bpc);
            break;
#endif
#if CONFIG_16BPC
        case 10:
        case 12:
            dav2d_prep_grain_16bpc(&c->dsp[off].fg, out, in,
                                   ttd->delayed_fg.scaling_16bpc,
                                   ttd->delayed_fg.grain_lut_16bpc);
            break;
#endif
        default: abort();
        }
        // switch the job to row application and open it up to all workers
        ttd->delayed_fg.type = DAV2D_TASK_TYPE_FG_APPLY;
        pthread_mutex_lock(&ttd->lock);
        ttd->delayed_fg.exec = 1;
        // fall-through
    case DAV2D_TASK_TYPE_FG_APPLY:;
        // progress[0] hands out row indices, progress[1] counts completions
        int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
        pthread_mutex_unlock(&ttd->lock);
        const int bs = 16 << out->fgm->block_size;
        int progmax = (out->p.h + bs - 1) / bs;
        while (row < progmax) {
            if (row + 1 < progmax)
                // more rows remain: wake another worker to help
                pthread_cond_signal(&ttd->cond);
            else {
                // last row claimed: stop advertising the job
                pthread_mutex_lock(&ttd->lock);
                ttd->delayed_fg.exec = 0;
                pthread_mutex_unlock(&ttd->lock);
            }
            switch (out->p.bpc) {
#if CONFIG_8BPC
            case 8:
                dav2d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in,
                                           ttd->delayed_fg.scaling_8bpc,
                                           ttd->delayed_fg.grain_lut_8bpc, row);
                break;
#endif
#if CONFIG_16BPC
            case 10:
            case 12:
                dav2d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in,
                                            ttd->delayed_fg.scaling_16bpc,
                                            ttd->delayed_fg.grain_lut_16bpc, row);
                break;
#endif
            default: abort();
            }
            row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
            atomic_fetch_add(&ttd->delayed_fg.progress[1], 1);
        }
        pthread_mutex_lock(&ttd->lock);
        ttd->delayed_fg.exec = 0;
        // account for this runner's final (out-of-range) claim as well, so
        // that progress[1] can catch up with progress[0]
        int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1;
        progmax = atomic_load(&ttd->delayed_fg.progress[0]);
        // signal for completion only once the last runner reaches this
        if (done >= progmax) {
            ttd->delayed_fg.finished = 1;
            pthread_cond_signal(&ttd->delayed_fg.cond);
        }
        break;
    default: abort();
    }
}

void *dav2d_worker_task(void *data) { Dav2dTaskContext *const tc = data; const Dav2dContext *const c = tc->c; struct TaskThreadData *const ttd = tc->task_thread.ttd; dav2d_set_thread_name("dav2d-worker"); pthread_mutex_lock(&ttd->lock); for (;;) { if (tc->task_thread.die) break; if (atomic_load(c->flush)) goto park; merge_pending(c); if (ttd->delayed_fg.exec) { // run delayed film grain first delayed_fg_task(c, ttd); continue; } Dav2dFrameContext *f; Dav2dTask *t, *prev_t = NULL; if (c->n_fc > 1) { // run init tasks second for (unsigned i = 0; i < c->n_fc; i++) { const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + i) % c->n_fc]; if (atomic_load(&f->task_thread.init_done)) continue; t = f->task_thread.task_head; if (!t) continue; if (t->type == DAV2D_TASK_TYPE_INIT) goto found; if (t->type == DAV2D_TASK_TYPE_INIT_CDF) { // XXX This can be a simple else, if adding tasks of both // passes at once (in dav2d_task_create_tile_sbrow). // Adding the tasks to the pending Q can result in a // thread merging them before setting init_done. // We will need to set init_done before adding to the // pending Q, so maybe return the tasks, set init_done, // and add to pending Q only then. int p1; if (!f->use_pri_sec_cdf) { p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1; } else { p1 = f->src_cdf[0].progress ?
atomic_load(f->src_cdf[0].progress) : 1; if (p1 == 1) p1 = f->src_cdf[1].progress ? atomic_load(f->src_cdf[1].progress) : 1; } if (p1) { atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); goto found; } } } } while (ttd->cur < c->n_fc) { // run decoding tasks last const unsigned first = atomic_load(&ttd->first); f = &c->fc[(first + ttd->cur) % c->n_fc]; merge_pending_frame(f); prev_t = f->task_thread.task_cur_prev; t = prev_t ? prev_t->next : f->task_thread.task_head; while (t) { if (t->type == DAV2D_TASK_TYPE_INIT_CDF) goto next; else if (t->type == DAV2D_TASK_TYPE_TILE_ENTROPY || t->type == DAV2D_TASK_TYPE_TILE_MV_RESOLUTION || t->type == DAV2D_TASK_TYPE_TILE_RECONSTRUCTION) { // if not bottom sbrow of tile, this task will be re-added // after it's finished const int res = check_tile(t, f); if (!res) goto found; } else if (t->recon_progress) { const int p = t->type == DAV2D_TASK_TYPE_ENTROPY_PROGRESS; int error = atomic_load(&f->task_thread.error); assert(!atomic_load(&f->task_thread.done[p]) || error); const int tile_row_base = f->frame_hdr->tiling.t.cols * f->frame_thread.next_tile_row[p]; if (t->type == DAV2D_TASK_TYPE_ENTROPY_PROGRESS) { atomic_int *const prog = &f->frame_thread.entropy_progress; const int p1 = atomic_load(prog); if (p1 < t->sby) goto next; atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); } const int pass = t->type == DAV2D_TASK_TYPE_ENTROPY_PROGRESS ? 0 : f->c->task_thread.n_passes - 1; for (int tc = 0; tc < f->frame_hdr->tiling.t.cols; tc++) { Dav2dTileState *const ts = &f->ts[tile_row_base + tc]; const int p2 = atomic_load(&ts->progress[pass]); if (p2 < t->recon_progress) goto next; atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR); } if (t->type != DAV2D_TASK_TYPE_ENTROPY_PROGRESS && c->n_fc > 1) { atomic_store(&f->cur.progress[1], atomic_load(&f->task_thread.error) ? 
FRAME_ERROR : t->recon_progress); } if (t->sby + 1 < f->sbh) { // add sby+1 to list to replace this one Dav2dTask *next_t = &t[1]; *next_t = *t; next_t->sby++; const int ntr = f->frame_thread.next_tile_row[p] + 1; const int start = f->frame_hdr->tiling.t.row_start_sb[ntr]; if (next_t->sby == start) f->frame_thread.next_tile_row[p] = ntr; next_t->recon_progress = next_t->sby + 1; insert_task(f, next_t, 0); } goto found; } else if (t->type == DAV2D_TASK_TYPE_CDEF) { atomic_uint *prog = f->frame_thread.copy_db_progress; const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]); if (p1 & (1U << ((t->sby - 1) & 31))) goto found; } else { assert(t->deblock_progress); const int p1 = atomic_load(&f->frame_thread.deblock_progress); if (p1 >= t->deblock_progress) { atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); goto found; } } next: prev_t = t; t = t->next; f->task_thread.task_cur_prev = prev_t; } ttd->cur++; } if (reset_task_cur(c, ttd, UINT_MAX)) continue; if (merge_pending(c)) continue; park: tc->task_thread.flushed = 1; pthread_cond_signal(&tc->task_thread.td.cond); // we want to be woken up next time progress is signaled atomic_store(&ttd->cond_signaled, 0); pthread_cond_wait(&ttd->cond, &ttd->lock); tc->task_thread.flushed = 0; reset_task_cur(c, ttd, UINT_MAX); continue; found: // remove t from list if (prev_t) prev_t->next = t->next; else f->task_thread.task_head = t->next; if (!t->next) f->task_thread.task_tail = prev_t; if (t->type > DAV2D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head) ttd->cur++; t->next = NULL; // we don't need to check cond_signaled here, since we found a task // after the last signal so we want to re-signal the next waiting thread // and again won't need to signal after that atomic_store(&ttd->cond_signaled, 1); pthread_cond_signal(&ttd->cond); pthread_mutex_unlock(&ttd->lock); found_unlocked:; const int flush = atomic_load(c->flush); int error = atomic_fetch_or(&f->task_thread.error, flush) | flush; // run it tc->f = f; int sby = 
t->sby; switch (t->type) { case DAV2D_TASK_TYPE_INIT: { assert(c->n_fc > 1); int res = dav2d_decode_frame_init(f); int p1; if (!f->use_pri_sec_cdf) { p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1; } else { p1 = f->src_cdf[0].progress ? atomic_load(f->src_cdf[0].progress) : 1; if (p1 == 1) p1 = f->src_cdf[1].progress ? atomic_load(f->src_cdf[1].progress) : 1; } if (res || p1 == TILE_ERROR) { pthread_mutex_lock(&ttd->lock); abort_frame(f, res ? res : DAV2D_ERR(EINVAL)); reset_task_cur(c, ttd, t->frame_idx); } else { t->type = DAV2D_TASK_TYPE_INIT_CDF; if (p1) goto found_unlocked; add_pending(f, t); pthread_mutex_lock(&ttd->lock); } continue; } case DAV2D_TASK_TYPE_INIT_CDF: { assert(c->n_fc > 1); int res = DAV2D_ERR(EINVAL); if (!atomic_load(&f->task_thread.error)) { if (f->frame_hdr->tip.frame_mode != 2) { res = dav2d_decode_frame_init_cdf(f); } else { dav2d_decode_tip_frame_init(f); res = 0; } } if (!f->frame_hdr->disable_cdf_update && !f->task_thread.update_set && !f->seq_hdr->avg_cdf_type) { atomic_store(f->out_cdf.progress, res < 0 ? 
TILE_ERROR : 1); } if (!res) { assert(c->n_fc > 1); const int n_passes = c->task_thread.n_passes; for (int p = 0; p < n_passes; p++) { const int res = dav2d_task_create_tile_sbrow(f, p, 0); if (res) { pthread_mutex_lock(&ttd->lock); // memory allocation failed atomic_store(&f->task_thread.done[p + 1 != n_passes], 1); atomic_store(&f->task_thread.error, -1); atomic_fetch_sub(&f->task_thread.task_counter, f->frame_hdr->tiling.t.cols * f->frame_hdr->tiling.t.rows + f->sbh); atomic_store(&f->cur.progress[p], FRAME_ERROR); if (p == 2 && atomic_load(&f->task_thread.done[1])) { assert(!atomic_load(&f->task_thread.task_counter)); dav2d_decode_frame_exit(f, DAV2D_ERR(ENOMEM)); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } else { pthread_mutex_unlock(&ttd->lock); } } } pthread_mutex_lock(&ttd->lock); } else { pthread_mutex_lock(&ttd->lock); abort_frame(f, res); reset_task_cur(c, ttd, t->frame_idx); atomic_store(&f->task_thread.init_done, 1); } continue; } case DAV2D_TASK_TYPE_TILE_ENTROPY: case DAV2D_TASK_TYPE_TILE_MV_RESOLUTION: case DAV2D_TASK_TYPE_TILE_RECONSTRUCTION: { const int n_passes = c->task_thread.n_passes; const int pass = t->type == DAV2D_TASK_TYPE_TILE_RECONSTRUCTION ? n_passes - 1 : t->type != DAV2D_TASK_TYPE_TILE_ENTROPY; const int tile_idx = (int)(t - f->task_thread.tile_tasks[pass]); Dav2dTileState *const ts = &f->ts[tile_idx]; tc->ts = ts; tc->by = sby << f->sb_shift; switch (n_passes) { case 1: // this should only happen in single-threaded mode, not here default: abort(); case 2: assert(t->type != DAV2D_TASK_TYPE_TILE_MV_RESOLUTION); tc->task_thread.pass = t->type == DAV2D_TASK_TYPE_TILE_ENTROPY ? PASS_ENTROPY : PASS_MVRES | PASS_RECON; break; case 3: tc->task_thread.pass = t->type == DAV2D_TASK_TYPE_TILE_ENTROPY ? PASS_ENTROPY : t->type == DAV2D_TASK_TYPE_TILE_MV_RESOLUTION ? PASS_MVRES : PASS_RECON; break; } if (!error) error = dav2d_decode_tile_sbrow(tc); const int progress = error ? 
TILE_ERROR : 1 + sby; // signal progress atomic_fetch_or(&f->task_thread.error, error); if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) { t->sby++; t->deps_skip = 0; if (!check_tile(t, f)) { atomic_store(&ts->progress[pass], progress); reset_task_cur_async(ttd, t->frame_idx, c->n_fc); if (!atomic_fetch_or(&ttd->cond_signaled, 1)) pthread_cond_signal(&ttd->cond); goto found_unlocked; } atomic_store(&ts->progress[pass], progress); add_pending(f, t); pthread_mutex_lock(&ttd->lock); } else { pthread_mutex_lock(&ttd->lock); atomic_store(&ts->progress[pass], progress); reset_task_cur(c, ttd, t->frame_idx); error = atomic_load(&f->task_thread.error); if (!f->frame_hdr->disable_cdf_update && tc->task_thread.pass & PASS_ENTROPY && ((f->task_thread.update_set && f->frame_hdr->tiling.update == tile_idx) || (f->seq_hdr->avg_cdf_type && atomic_fetch_add(&f->task_thread.entropy_task_counter, -1) == 1))) { if (!error) { const int shift = f->frame_hdr->tiling.t.log2_cols + f->frame_hdr->tiling.t.log2_rows; if (shift && f->seq_hdr->avg_cdf_type) { const int n_tiles = 1 << shift; dav2d_cdf_shift(f->out_cdf.data.cdf, &f->ts[0].cdf, shift); for (int n = 1; n < n_tiles; n++) dav2d_cdf_shift_accumulate(f->out_cdf.data.cdf, &f->ts[n].cdf, shift); } else { memcpy(f->out_cdf.data.cdf, &f->ts[f->frame_hdr->tiling.update].cdf, sizeof(CdfContext)); } dav2d_cdf_reset_count(f->frame_hdr, f->out_cdf.data.cdf); } if (c->n_fc > 1) atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1); } if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 && atomic_load(&f->task_thread.done[0]) && (n_passes == 1 || atomic_load(&f->task_thread.done[1]))) { error = atomic_load(&f->task_thread.error); dav2d_decode_frame_exit(f, error == 1 ? DAV2D_ERR(EINVAL) : error ? 
DAV2D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } assert(atomic_load(&f->task_thread.task_counter) >= 0); if (!atomic_fetch_or(&ttd->cond_signaled, 1)) pthread_cond_signal(&ttd->cond); } continue; } case DAV2D_TASK_TYPE_DEBLOCK_COLS: if (!atomic_load(&f->task_thread.error)) f->bd_fn.filter_sbrow_deblock_cols(f, sby); if (ensure_progress(ttd, f, t, DAV2D_TASK_TYPE_DEBLOCK_ROWS, &f->frame_thread.deblock_progress, &t->deblock_progress)) continue; // fall-through case DAV2D_TASK_TYPE_DEBLOCK_ROWS: if (!atomic_load(&f->task_thread.error)) f->bd_fn.filter_sbrow_deblock_rows(f, sby); // signal deblock progress if (f->frame_hdr->deblock.level_y[0] || f->frame_hdr->deblock.level_y[1]) { error = atomic_load(&f->task_thread.error); atomic_store(&f->frame_thread.deblock_progress, error ? TILE_ERROR : sby + 1); reset_task_cur_async(ttd, t->frame_idx, c->n_fc); if (!atomic_fetch_or(&ttd->cond_signaled, 1)) pthread_cond_signal(&ttd->cond); } else if (f->seq_hdr->cdef || f->lf.restore_planes) { atomic_fetch_or(&f->frame_thread.copy_db_progress[sby >> 5], 1U << (sby & 31)); // CDEF needs the top buffer to be saved by lr_copy_lpf of the // previous sbrow if (sby) { int prog = atomic_load(&f->frame_thread.copy_db_progress[(sby - 1) >> 5]); if (~prog & (1U << ((sby - 1) & 31))) { t->type = DAV2D_TASK_TYPE_CDEF; t->recon_progress = t->deblock_progress = 0; add_pending(f, t); pthread_mutex_lock(&ttd->lock); continue; } } } // fall-through case DAV2D_TASK_TYPE_CDEF: if (f->seq_hdr->cdef) { if (!atomic_load(&f->task_thread.error)) f->bd_fn.filter_sbrow_cdef(tc, sby); reset_task_cur_async(ttd, t->frame_idx, c->n_fc); if (!atomic_fetch_or(&ttd->cond_signaled, 1)) pthread_cond_signal(&ttd->cond); } // fall-through case DAV2D_TASK_TYPE_LOOP_RESTORATION: if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes) f->bd_fn.filter_sbrow_lr(f, sby); // fall-through case DAV2D_TASK_TYPE_RECONSTRUCTION_PROGRESS: // dummy to cover for no post-filters case 
DAV2D_TASK_TYPE_ENTROPY_PROGRESS: // dummy to convert tile progress to frame break; default: abort(); } // if task completed [typically LR], signal picture progress as per below const int n_passes = c->task_thread.n_passes; const int sbh = f->sbh; const int sbsz = f->sb_step * 4; if (t->type == DAV2D_TASK_TYPE_ENTROPY_PROGRESS) { error = atomic_load(&f->task_thread.error); const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz; assert(c->task_thread.n_passes > 1); if (c->n_fc > 1 && f->cur.p.data[0] /* upon flush, this can be free'ed already */) { atomic_store(&f->cur.progress[0], error ? FRAME_ERROR : y); } atomic_store(&f->frame_thread.entropy_progress, error ? TILE_ERROR : sby + 1); if (sby + 1 == sbh) atomic_store(&f->task_thread.done[1], 1); pthread_mutex_lock(&ttd->lock); const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1; if (sby + 1 < sbh && num_tasks) { reset_task_cur(c, ttd, t->frame_idx); continue; } if (!num_tasks && atomic_load(&f->task_thread.done[0]) && atomic_load(&f->task_thread.done[1])) { error = atomic_load(&f->task_thread.error); dav2d_decode_frame_exit(f, error == 1 ? DAV2D_ERR(EINVAL) : error ? DAV2D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } reset_task_cur(c, ttd, t->frame_idx); continue; } // t->type != DAV2D_TASK_TYPE_ENTROPY_PROGRESS atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5], 1U << (sby & 31)); pthread_mutex_lock(&f->task_thread.lock); sby = get_frame_progress(c, f); error = atomic_load(&f->task_thread.error); const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz; if (c->n_fc > 1 && f->cur.p.data[0] /* upon flush, this can be free'ed already */) atomic_store(&f->cur.progress[2], error ? 
FRAME_ERROR : y); pthread_mutex_unlock(&f->task_thread.lock); if (sby + 1 == sbh) atomic_store(&f->task_thread.done[0], 1); pthread_mutex_lock(&ttd->lock); const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1; if (sby + 1 < sbh && num_tasks) { reset_task_cur(c, ttd, t->frame_idx); continue; } if (!num_tasks && atomic_load(&f->task_thread.done[0]) && (n_passes == 1 || atomic_load(&f->task_thread.done[1]))) { error = atomic_load(&f->task_thread.error); dav2d_decode_frame_exit(f, error == 1 ? DAV2D_ERR(EINVAL) : error ? DAV2D_ERR(ENOMEM) : 0); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); } reset_task_cur(c, ttd, t->frame_idx); } pthread_mutex_unlock(&ttd->lock); return NULL; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/thread_task.h000066400000000000000000000044641517466257200233050ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_THREAD_TASK_H #define DAV2D_SRC_THREAD_TASK_H #include #include "src/internal.h" #define FRAME_ERROR (UINT_MAX - 1) #define TILE_ERROR (INT_MAX - 1) // these functions assume the task scheduling lock is already taken int dav2d_task_create_tile_sbrow(Dav2dFrameContext *f, int pass, int cond_signal); void dav2d_task_frame_init(Dav2dFrameContext *f); void dav2d_task_delayed_fg(Dav2dContext *c, Dav2dPicture *out, const Dav2dPicture *in); void *dav2d_worker_task(void *data); int dav2d_decode_frame_init(Dav2dFrameContext *f); int dav2d_decode_frame_init_cdf(Dav2dFrameContext *f); void dav2d_decode_tip_frame_init(Dav2dFrameContext *f); int dav2d_decode_frame_main(Dav2dFrameContext *f); void dav2d_decode_frame_exit(Dav2dFrameContext *f, int retval); int dav2d_decode_frame(Dav2dFrameContext *f); int dav2d_decode_tile_sbrow(Dav2dTaskContext *t); #endif /* DAV2D_SRC_THREAD_TASK_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/warpmv.c000066400000000000000000000156311517466257200223210ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include "common/intops.h" #include "src/tables.h" #include "src/warpmv.h" static inline int iclip_wmp(const int v) { return iclip((v + 0x20 - (v < 0)) & ~0x3f, -0x8000, 0x7fc0); } int dav2d_resolve_divisor_32(const unsigned d, int *const shift) { *shift = ulog2(d); const int e = d - (1 << *shift); const int f = *shift > 7 ? 
(e + (1 << (*shift - 8))) >> (*shift - 7) : e << (7 - *shift); assert(f <= 128); *shift += 9; // Use f as lookup into the precomputed table of multipliers return dav2d_div_recip[f]; } int dav2d_get_shear_params(Dav2dWarpedMotionParams *const wm) { const int32_t *const mat = wm->matrix; if (mat[2] <= 0) return 1; wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000); wm->u.p.beta = iclip_wmp(mat[3]); int shift; const int y = apply_sign(dav2d_resolve_divisor_32(abs(mat[2]), &shift), mat[2]); const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y; const int rnd = (1 << shift) >> 1; wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y; wm->u.p.delta = iclip_wmp(mat[5] - apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) - 0x10000); wm->affine = (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) < 0x30000) && (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) < 0x30000); return 0; } static int resolve_divisor_64(const uint64_t d, int *const shift) { *shift = u64log2(d); const int64_t e = d - (1LL << *shift); const int64_t f = *shift > 7 ? 
(e + (1LL << (*shift - 8))) >> (*shift - 7) : e << (7 - *shift); assert(f <= 128); *shift += 9; // Use f as lookup into the precomputed table of multipliers return dav2d_div_recip[f]; } static int get_mult_shift_ndiag(const int64_t px, const int idet, const int64_t rnd, const int sh) { const int64_t v1 = px * idet; const int v2 = (int) ((v1 + rnd - (v1 < 0)) >> sh); const int v3 = (v2 + 0x20 - (v2 < 0)) & ~0x3f; return iclip(v3, -0x7fc0, 0x7fc0); } static int get_mult_shift_diag(const int64_t px, const int idet, const int64_t rnd, const int sh) { const int64_t v1 = px * idet; const int v2 = (int) ((v1 + rnd - (v1 < 0)) >> sh); const int v3 = (v2 + 0x20 - (v2 < 0x10000)) & ~0x3f; return iclip(v3, 0x8040, 0x17fc0); } void dav2d_set_affine_mv2d(const int bw4, const int bh4, const mv mv, Dav2dWarpedMotionParams *const wm, const int bx4, const int by4) { int32_t *const mat = wm->matrix; const int rsuy = 2 * bh4 - 1; const int rsux = 2 * bw4 - 1; const int isuy = by4 * 4 + rsuy; const int isux = bx4 * 4 + rsux; mat[0] = iclip64to32(mv.x * 0x2000LL - (int64_t) isux * (mat[2] - 0x10000) - (int64_t) isuy * mat[3], -0x8000000, 0x7ffffc0); mat[1] = iclip64to32(mv.y * 0x2000LL - (int64_t) isux * mat[4] - (int64_t) isuy * (mat[5] - 0x10000), -0x8000000, 0x7ffffc0); } int dav2d_find_affine_int(const int (*pts)[2][2], const int np, const int bw4, const int bh4, const mv mv, Dav2dWarpedMotionParams *const wm, const int bx4, const int by4) { int32_t *const mat = wm->matrix; int a[2][2] = { { 0, 0 }, { 0, 0 } }; int bx[2] = { 0, 0 }; int by[2] = { 0, 0 }; const int rsuy = 2 * bh4 - 1; const int rsux = 2 * bw4 - 1; const int suy = rsuy * 8; const int sux = rsux * 8; const int duy = suy + mv.y; const int dux = sux + mv.x; for (int i = 0; i < np; i++) { const int dx = pts[i][1][0] - dux; const int dy = pts[i][1][1] - duy; const int sx = pts[i][0][0] - sux; const int sy = pts[i][0][1] - suy; if (abs(sx - dx) < 256 && abs(sy - dy) < 256) { a[0][0] += ((sx * sx) >> 2) + sx * 2 + 8; 
a[0][1] += ((sx * sy) >> 2) + sx + sy + 4; a[1][1] += ((sy * sy) >> 2) + sy * 2 + 8; bx[0] += ((sx * dx) >> 2) + sx + dx + 8; bx[1] += ((sy * dx) >> 2) + sy + dx + 4; by[0] += ((sx * dy) >> 2) + sx + dy + 4; by[1] += ((sy * dy) >> 2) + sy + dy + 8; } } // compute determinant of a const int64_t det = (int64_t) a[0][0] * a[1][1] - (int64_t) a[0][1] * a[0][1]; if (det == 0) { mat[2] = mat[5] = 0x10000; mat[3] = mat[4] = 0; dav2d_set_affine_mv2d(bw4, bh4, mv, wm, bx4, by4); return 0; } int shift, idet = apply_sign64(resolve_divisor_64(llabs(det), &shift), det); shift -= 16; if (shift < 0) { idet <<= -shift; shift = 0; } // solve the least-squares const int64_t r = (1LL << shift) >> 1; mat[2] = get_mult_shift_diag((int64_t) a[1][1] * bx[0] - (int64_t) a[0][1] * bx[1], idet, r, shift); mat[3] = get_mult_shift_ndiag((int64_t) a[0][0] * bx[1] - (int64_t) a[0][1] * bx[0], idet, r, shift); mat[4] = get_mult_shift_ndiag((int64_t) a[1][1] * by[0] - (int64_t) a[0][1] * by[1], idet, r, shift); mat[5] = get_mult_shift_diag((int64_t) a[0][0] * by[1] - (int64_t) a[0][1] * by[0], idet, r, shift); dav2d_set_affine_mv2d(bw4, bh4, mv, wm, bx4, by4); return 0; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/warpmv.h000066400000000000000000000036221517466257200223230ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
*
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_WARPMV_H
#define DAV2D_SRC_WARPMV_H

#include "src/levels.h"

/*
 * Reciprocal lookup for the divisor d: returns a multiplier from the
 * reciprocal table and stores the matching right-shift in *shift.
 * d must be non-zero.
 */
int dav2d_resolve_divisor_32(unsigned d, int *shift);

/*
 * Derive the shear parameters and the `affine` flag of wm from wm->matrix.
 * Returns 1 (wm untouched) when matrix[2] <= 0, 0 otherwise.
 */
int dav2d_get_shear_params(Dav2dWarpedMotionParams *wm);

/*
 * Least-squares fit of wm->matrix from np point pairs in pts; bw4/bh4 and
 * bx/by are the block size and position in 4-pixel units. Falls back to the
 * identity for matrix[2..5] when the system is singular. Always returns 0.
 */
int dav2d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4,
                          mv mv, Dav2dWarpedMotionParams *wm, int bx, int by);

/*
 * Compute wm->matrix[0..1] from mv, the block geometry and the existing
 * matrix[2..5].
 */
void dav2d_set_affine_mv2d(int bw4, int bh4,
                           mv mv, Dav2dWarpedMotionParams *wm, int bx, int by);

#endif /* DAV2D_SRC_WARPMV_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/wedge.c000066400000000000000000000243121517466257200220740ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include "common/intops.h" #include "src/tables.h" #include "src/wedge.h" Dav2dMasks dav2d_masks; enum WedgeDirectionType { WEDGE_0, WEDGE_14, WEDGE_27, WEDGE_45, WEDGE_63, WEDGE_90, WEDGE_117, WEDGE_135, WEDGE_153, WEDGE_166, WEDGE_180, WEDGE_194, WEDGE_207, WEDGE_225, WEDGE_243, WEDGE_270, WEDGE_297, WEDGE_315, WEDGE_333, WEDGE_346, N_WEDGE_DIRECTIONS, }; typedef struct { uint8_t /* enum WedgeDirectionType */ direction; uint8_t x_offset; uint8_t y_offset; } wedge_code_type; static const wedge_code_type wedge_codebook_16[68] = { { WEDGE_0, 5, 4 }, { WEDGE_0, 6, 4 }, { WEDGE_0, 7, 4 }, { WEDGE_14, 4, 4 }, { WEDGE_14, 5, 4 }, { WEDGE_14, 6, 4 }, { WEDGE_14, 7, 4 }, { WEDGE_27, 4, 4 }, { WEDGE_27, 5, 4 }, { WEDGE_27, 6, 4 }, { WEDGE_27, 7, 4 }, { WEDGE_45, 4, 4 }, { WEDGE_45, 5, 4 }, { WEDGE_45, 6, 4 }, { WEDGE_45, 7, 4 }, { WEDGE_63, 4, 4 }, { WEDGE_63, 4, 3 }, { WEDGE_63, 4, 2 }, { WEDGE_63, 4, 1 }, { WEDGE_90, 4, 3 }, { WEDGE_90, 4, 2 }, { WEDGE_90, 4, 1 }, { WEDGE_117, 4, 4 }, { WEDGE_117, 4, 3 }, { WEDGE_117, 4, 2 }, { WEDGE_117, 4, 1 }, { WEDGE_135, 4, 4 }, { WEDGE_135, 3, 4 }, { WEDGE_135, 2, 4 }, { WEDGE_135, 1, 4 }, { WEDGE_153, 4, 4 }, { WEDGE_153, 3, 4 
}, { WEDGE_153, 2, 4 }, { WEDGE_153, 1, 4 }, { WEDGE_166, 4, 4 }, { WEDGE_166, 3, 4 }, { WEDGE_166, 2, 4 }, { WEDGE_166, 1, 4 }, { WEDGE_180, 3, 4 }, { WEDGE_180, 2, 4 }, { WEDGE_180, 1, 4 }, { WEDGE_194, 3, 4 }, { WEDGE_194, 2, 4 }, { WEDGE_194, 1, 4 }, { WEDGE_207, 3, 4 }, { WEDGE_207, 2, 4 }, { WEDGE_207, 1, 4 }, { WEDGE_225, 3, 4 }, { WEDGE_225, 2, 4 }, { WEDGE_225, 1, 4 }, { WEDGE_243, 4, 5 }, { WEDGE_243, 4, 6 }, { WEDGE_243, 4, 7 }, { WEDGE_270, 4, 5 }, { WEDGE_270, 4, 6 }, { WEDGE_270, 4, 7 }, { WEDGE_297, 4, 5 }, { WEDGE_297, 4, 6 }, { WEDGE_297, 4, 7 }, { WEDGE_315, 5, 4 }, { WEDGE_315, 6, 4 }, { WEDGE_315, 7, 4 }, { WEDGE_333, 5, 4 }, { WEDGE_333, 6, 4 }, { WEDGE_333, 7, 4 }, { WEDGE_346, 5, 4 }, { WEDGE_346, 6, 4 }, { WEDGE_346, 7, 4 }, }; static void copy2d(uint8_t *dst, const uint8_t *src, const int w8, const int h8, const int x_off, const int y_off) { src += (64 - y_off * h8) * 128 + (64 - x_off * w8); for (int y = 0; y < h8 * 8; y++) { memcpy(dst, src, w8 * 8); src += 128; dst += w8 * 8; } } static void subsample_420(uint8_t *dst, const uint8_t *src, const int w8, const int h8) { for (int y = 0; y < h8 * 4; y++) { for (int x = 0; x < w8 * 4; x++) dst[x] = (src[x * 2 + 0] + src[x * 2 + 1] + src[x * 2 + 0 + w8 * 8] + src[x * 2 + 1 + w8 * 8] + 2) >> 2; dst += w8 * 4; src += w8 * 8 * 2; } } static void subsample_422(uint8_t *dst, const uint8_t *src, const int w8, const int h8) { for (int y = 0; y < h8 * 8; y++) { for (int x = 0; x < w8 * 4; x++) dst[x] = (src[x * 2 + 0] + src[x * 2 + 1] + 1) >> 1; dst += w8 * 4; src += w8 * 8; } } static void fill_tmvp(uint8_t *dst, const uint8_t *src, const int w8, const int h8) { for (int y = 0; y < h8; y++) { for (int x = 0; x < w8; x++) { int score[2] = { 0 }; const uint8_t *sptr = src; for (int yy = y * 8; yy < y * 8 + 8; yy++) { for (int xx = x * 8; xx < x * 8 + 8; xx++) { score[0] += sptr[xx] < 4; score[1] += sptr[xx] > 60; } sptr += w8 * 8; } dst[x] = score[0] >= 60 ? 0 : score[1] >= 60 ? 
1 : 2; } dst += w8; src += w8 * 8 * 8; } } static void gen_master(uint8_t *master, const int mul, const enum WedgeDirectionType wd) { static const int8_t cos_lut[N_WEDGE_DIRECTIONS] = { 4, 4, 4, 2, 2, 0, -2, -2, -4, -4, -4, -4, -4, -2, -2, 0, 2, 2, 4, 4 }, sin_lut[N_WEDGE_DIRECTIONS] = { 0, -1, -2, -2, -4, -4, -4, -2, -2, -1, 0, 1, 2, 2, 4, 4, 4, 2, 2, 1 }, weight[29] = { 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; const int s = sin_lut[wd] * mul, c = cos_lut[wd] * mul; for (int y = 0; y < 128; y++) { const int dy = (2 * y - 127) * s; for (int x = 0; x < 128; x++) { const int d = iclip((2 * x - 127) * c + dy, -28, 28); master[x] = 4 * (d >= 0 ? 16 - weight[d] : weight[-d]); } master += 128; } } static COLD void init_wedge_masks(void) { int o = 0; for (enum BlockSize bs = BS_64x64; bs < N_BS_SIZES; bs++) { const uint8_t *const b_dim = dav2d_block_dimensions[bs]; if (b_dim[0] == 1 || b_dim[1] == 1) continue; dav2d_masks.offsets.wedge[bs - BS_64x64] = o; o += b_dim[0] * b_dim[1] >> 2; } assert(o * 0x1100 == sizeof(dav2d_masks.wedge_444) && o < 256); dav2d_masks.wedge[0] = dav2d_masks.wedge_444; dav2d_masks.wedge[1] = dav2d_masks.wedge_422; dav2d_masks.wedge[2] = dav2d_masks.wedge_420; uint8_t master[128 * 128]; enum WedgeDirectionType wd = N_WEDGE_DIRECTIONS; for (int widx = 0; widx < 68; widx++) { const wedge_code_type *const cb = &wedge_codebook_16[widx]; if (cb->direction != wd) { gen_master(master, 2 /* sharp edge */, cb->direction); wd = cb->direction; } #define fill(w8, h8, sz) do { \ uint8_t *const wm = WEDGE_MASK(BS_##sz, w8 * 2, h8 * 2, widx, 0); \ copy2d(wm, master, w8, h8, cb->x_offset, cb->y_offset); \ subsample_422(WEDGE_MASK(BS_##sz, w8 * 2, h8 * 2, widx, 1), wm, w8, h8); \ subsample_420(WEDGE_MASK(BS_##sz, w8 * 2, h8 * 2, widx, 2), wm, w8, h8); \ fill_tmvp(WEDGE_TMVP(BS_##sz, w8 * 2, h8 * 2, widx), wm, w8, h8); \ } while (0) fill(1, 1, 8x8); fill(1, 2, 8x16); fill(2, 1, 16x8); fill(2, 2, 16x16); } for (int 
widx = 0; widx < 68; widx++) { const wedge_code_type *const cb = &wedge_codebook_16[widx]; if (cb->direction != wd) { gen_master(master, 1 /* soft edge */, cb->direction); wd = cb->direction; } fill(1, 4, 8x32); fill(1, 8, 8x64); fill(2, 4, 16x32); fill(2, 8, 16x64); fill(4, 1, 32x8); fill(4, 2, 32x16); fill(4, 4, 32x32); fill(4, 8, 32x64); fill(8, 1, 64x8); fill(8, 2, 64x16); fill(8, 4, 64x32); fill(8, 8, 64x64); #undef fill } } static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w, const int h, const int step) { static const uint8_t ii_weights_1d[64] = { 60, 56, 52, 48, 45, 42, 39, 37, 34, 32, 30, 28, 26, 24, 22, 21, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }; uint8_t *const mask_h = &mask_v[w * h]; uint8_t *const mask_sm = &mask_h[w * h]; for (int y = 0, off = 0; y < h; y++, off += w) { memset(&mask_v[off], ii_weights_1d[y * step], w); for (int x = 0; x < w; x++) { mask_sm[off + x] = ii_weights_1d[imin(x, y) * step]; mask_h[off + x] = ii_weights_1d[x * step]; } } } static COLD void init_ii_masks(void) { memset(dav2d_masks.ii_dc, 32, 64 * 64); int o = 0; for (enum BlockSize bs = BS_64x64; bs < N_BS_SIZES; bs++) { const uint8_t *const b_dim = dav2d_block_dimensions[bs]; dav2d_masks.offsets.ii_nondc[bs - BS_64x64] = o; // we rely on 4x4 being the last entry here o += b_dim[0] * b_dim[1] >> 1; } // but sadly, o will still go one bit too far to fit in uint8_t... 
assert(o * 0x60 + 0x30 == sizeof(dav2d_masks.ii_nondc) && o < UINT16_MAX); #define fill(w, h, s) \ build_nondc_ii_masks(II_MASK(BS_##w##x##h, 0, 0, 1), w, h, s) fill( 4, 4, 16); fill( 4, 8, 8); fill( 4, 16, 4); fill( 4, 32, 2); fill( 4, 64, 1); fill( 8, 4, 8); fill( 8, 8, 8); fill( 8, 16, 4); fill( 8, 32, 2); fill( 8, 64, 1); fill(16, 4, 4); fill(16, 8, 4); fill(16, 16, 4); fill(16, 32, 2); fill(16, 64, 1); fill(32, 4, 2); fill(32, 8, 2); fill(32, 16, 2); fill(32, 32, 2); fill(32, 64, 1); fill(64, 4, 1); fill(64, 8, 1); fill(64, 16, 1); fill(64, 32, 1); fill(64, 64, 1); #undef fill } COLD void dav2d_init_ii_wedge_masks(void) { // This function is guaranteed to be called only once init_wedge_masks(); init_ii_masks(); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/wedge.h000066400000000000000000000055571517466257200221130ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_SRC_WEDGE_H #define DAV2D_SRC_WEDGE_H #include "src/levels.h" typedef struct { /* Offsets, in units of 8 bytes, relative to the start of the struct. */ struct { uint8_t wedge[N_BS_SIZES - BS_64x64 - 6]; uint16_t ii_nondc[N_BS_SIZES - BS_64x64]; } offsets; uint8_t *wedge[3]; uint8_t ALIGN(wedge_444[68 * (64 + 32 + 16 + 8) * (64 + 32 + 16 + 8)], 64); uint8_t ALIGN(wedge_422[68 * (32 + 16 + 8 + 4) * (64 + 32 + 16 + 8)], 64); uint8_t ALIGN(wedge_420[68 * (32 + 16 + 8 + 4) * (32 + 16 + 8 + 4)], 64); uint8_t ALIGN(wedge_tmvp[68 * (8 + 4 + 2 + 1) * (8 + 4 + 2 + 1)], 64); uint8_t ALIGN(ii_dc[64 * 64], 64); uint8_t ALIGN(ii_nondc[(64 + 32 + 16 + 8 + 4) * (64 + 32 + 16 + 8 + 4) * 3], 64); } Dav2dMasks; #define II_MASK(bs, bw4, bh4, ii_mode) \ (ii_mode == II_DC_PRED ? 
dav2d_masks.ii_dc : \ &dav2d_masks.ii_nondc[dav2d_masks.offsets.ii_nondc[bs - BS_64x64] * 0x60 + \ 16 * (bw4) * (bh4) * (ii_mode - 1)]) #define WEDGE_MASK(bs, bw4, bh4, widx, ssidx) \ &dav2d_masks.wedge[ssidx][(dav2d_masks.offsets.wedge[bs - BS_64x64] * 0x1100 + \ 16 * bw4 * bh4 * widx) >> (ssidx)] #define WEDGE_TMVP(bs, bw4, bh4, widx) \ &dav2d_masks.wedge_tmvp[dav2d_masks.offsets.wedge[bs - BS_64x64] * 68 + \ ((bw4) * (bh4) >> 2) * widx] EXTERN Dav2dMasks dav2d_masks; void dav2d_init_ii_wedge_masks(void); #endif /* DAV2D_SRC_WEDGE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/win32/000077500000000000000000000000001517466257200215755ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/win32/thread.c000066400000000000000000000064731517466257200232220ustar00rootroot00000000000000/* * Copyright © 2018-2021, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #if defined(_WIN32) #include #include #include #include "common/attributes.h" #include "src/thread.h" static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR); COLD void dav2d_init_thread(void) { #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) HANDLE kernel32 = GetModuleHandleW(L"kernel32.dll"); if (kernel32) set_thread_description = (void*)GetProcAddress(kernel32, "SetThreadDescription"); #endif } #undef dav2d_set_thread_name COLD void dav2d_set_thread_name(const wchar_t *const name) { if (set_thread_description) /* Only available since Windows 10 1607 */ set_thread_description(GetCurrentThread(), name); } static COLD unsigned __stdcall thread_entrypoint(void *const data) { pthread_t *const t = data; t->arg = t->func(t->arg); return 0; } COLD int dav2d_pthread_create(pthread_t *const thread, const pthread_attr_t *const attr, void *(*const func)(void*), void *const arg) { const unsigned stack_size = attr ? 
attr->stack_size : 0; thread->func = func; thread->arg = arg; thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread, STACK_SIZE_PARAM_IS_A_RESERVATION, NULL); return !thread->h; } COLD int dav2d_pthread_join(pthread_t *const thread, void **const res) { if (WaitForSingleObject(thread->h, INFINITE)) return 1; if (res) *res = thread->arg; return !CloseHandle(thread->h); } COLD int dav2d_pthread_once(pthread_once_t *const once_control, void (*const init_routine)(void)) { BOOL pending = FALSE; if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE) return 1; if (pending == TRUE) init_routine(); return !InitOnceComplete(once_control, 0, NULL); } #endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/000077500000000000000000000000001517466257200212605ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef.h000066400000000000000000000062371517466257200223420ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/cdef.h" #define decl_cdef_fns(ext) \ decl_cdef_fn(BF(dav2d_cdef_filter_4x4, ext)); \ decl_cdef_fn(BF(dav2d_cdef_filter_4x8, ext)); \ decl_cdef_fn(BF(dav2d_cdef_filter_8x8, ext)) decl_cdef_fns(avx512icl); decl_cdef_fns(avx2); decl_cdef_fns(sse4); decl_cdef_fns(ssse3); decl_cdef_fns(sse2); decl_cdef_dir_fn(BF(dav2d_cdef_dir, avx2)); decl_cdef_dir_fn(BF(dav2d_cdef_dir, sse4)); decl_cdef_dir_fn(BF(dav2d_cdef_dir, ssse3)); static ALWAYS_INLINE void cdef_dsp_init_x86(Dav2dCdefDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); #if BITDEPTH == 8 if (!(flags & DAV2D_X86_CPU_FLAG_SSE2)) return; c->fb[0] = BF(dav2d_cdef_filter_8x8, sse2); c->fb[1] = BF(dav2d_cdef_filter_4x8, sse2); c->fb[2] = BF(dav2d_cdef_filter_4x4, sse2); #endif if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return; c->dir = BF(dav2d_cdef_dir, ssse3); c->fb[0] = BF(dav2d_cdef_filter_8x8, ssse3); c->fb[1] = BF(dav2d_cdef_filter_4x8, ssse3); c->fb[2] = BF(dav2d_cdef_filter_4x4, ssse3); if (!(flags & DAV2D_X86_CPU_FLAG_SSE41)) return; c->dir = BF(dav2d_cdef_dir, sse4); #if BITDEPTH == 8 c->fb[0] = BF(dav2d_cdef_filter_8x8, sse4); c->fb[1] = BF(dav2d_cdef_filter_4x8, sse4); c->fb[2] = BF(dav2d_cdef_filter_4x4, sse4); #endif #if ARCH_X86_64 if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->dir = BF(dav2d_cdef_dir, avx2); c->fb[0] = BF(dav2d_cdef_filter_8x8, avx2); c->fb[1] = BF(dav2d_cdef_filter_4x8, avx2); c->fb[2] = BF(dav2d_cdef_filter_4x4, 
avx2); if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; c->fb[0] = BF(dav2d_cdef_filter_8x8, avx512icl); c->fb[1] = BF(dav2d_cdef_filter_4x8, avx512icl); c->fb[2] = BF(dav2d_cdef_filter_4x4, avx512icl); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef16_avx2.asm000066400000000000000000000717471517466257200240120ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA %macro DIR_TABLE 1 ; stride db 1 * %1 + 0, 2 * %1 + 0 db 1 * %1 + 0, 2 * %1 - 2 db -1 * %1 + 2, -2 * %1 + 4 db 0 * %1 + 2, -1 * %1 + 4 db 0 * %1 + 2, 0 * %1 + 4 db 0 * %1 + 2, 1 * %1 + 4 db 1 * %1 + 2, 2 * %1 + 4 db 1 * %1 + 0, 2 * %1 + 2 db 1 * %1 + 0, 2 * %1 + 0 db 1 * %1 + 0, 2 * %1 - 2 db -1 * %1 + 2, -2 * %1 + 4 db 0 * %1 + 2, -1 * %1 + 4 %endmacro dir_table4: DIR_TABLE 16 dir_table8: DIR_TABLE 32 pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3 dir_shift: times 2 dw 0x4000 times 2 dw 0x1000 pw_2048: times 2 dw 2048 pw_m16384: times 2 dw -16384 cextern cdef_dir_8bpc_avx2.main SECTION .text %macro CDEF_FILTER 2 ; w, h DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp movifnidn prid, r5m movifnidn secd, r6m mov dird, r7m vpbroadcastd m8, [base+pw_2048] lea dirq, [base+dir_table%1+dirq*2] test prid, prid jz .sec_only %if WIN64 vpbroadcastw m6, prim movaps [rsp+16*0], xmm9 movaps [rsp+16*1], xmm10 %else movd xm6, prid vpbroadcastw m6, xm6 %endif lzcnt pridmpd, prid rorx tmpd, prid, 2 cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, tmpd ; pri >>= 2 mov tmpd, r8m ; damping and prid, 4 sub tmpd, 31 vpbroadcastd m9, [base+pri_taps+priq+8*0] vpbroadcastd m10, [base+pri_taps+priq+8*1] test secd, secd jz .pri_only %if WIN64 movaps r8m, xmm13 vpbroadcastw m13, secm movaps r4m, xmm11 movaps r6m, xmm12 %else movd xm0, secd vpbroadcastw m13, xm0 %endif lzcnt secd, secd xor prid, prid add pridmpd, tmpd cmovs pridmpd, prid add secd, tmpd lea tmpq, [px] mov [pri_shift], pridmpq mov [sec_shift], secq %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec %endrep %if WIN64 movaps xmm11, r4m movaps xmm12, r6m movaps xmm13, r8m %endif jmp .pri_end .pri_only: add pridmpd, tmpd cmovs pridmpd, secd lea tmpq, [px] mov [pri_shift], pridmpq %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri %endrep .pri_end: %if WIN64 movaps xmm9, [rsp+16*0] 
movaps xmm10, [rsp+16*1] %endif .end: RET .sec_only: mov tmpd, r8m ; damping %if WIN64 vpbroadcastw m6, secm %else movd xm6, secd vpbroadcastw m6, xm6 %endif tzcnt secd, secd sub tmpd, secd mov [sec_shift], tmpq lea tmpq, [px] %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec %endrep jmp .end %if %1 == %2 ALIGN function_align .pri: movsx offq, byte [dirq+4] ; off_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 %else mova xm1, [tmpq+32*0] vinserti128 m1, [tmpq+32*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+5] ; off_k1 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m0, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m0, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 ; adiff_k1p0 paddw m0, m2 ; constrain(diff_k0) psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m0, m9 ; pri_tap_k0 pmullw m7, m10 ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 paddw m0, m1 %if %1 == 4 
vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret ALIGN function_align .sec: movsx offq, byte [dirq+8] ; off1_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 %else mova xm1, [tmpq+32*0] vinserti128 m1, [tmpq+32*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+0] ; off2_k0 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+9] ; off1_k1 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 psignw m2, m3 ; constrain(diff_k0s1) pabsw m3, m4 ; adiff_k0s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx 
offq, byte [dirq+1] ; off2_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif paddw m0, m7 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 psignw m4, m5 ; constrain(diff_k1s3) paddw m0, m4 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 paddw m0, m1 %if %1 == 4 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret ALIGN function_align .pri_sec: movsx offq, byte [dirq+8] ; off2_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 %else mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+0] ; off3_k0 pmaxsw m11, m2, m3 
pminuw m12, m2, m3 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m13, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m13, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+9] ; off2_k1 psignw m2, m3 ; constrain(diff_k0s1) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 paddw m0, m2 pabsw m3, m4 ; adiff_k0s2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m13, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m13, m3 pminsw m4, m2 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+1] ; off3_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) pmaxsw m11, m2 pminuw m12, m2 pmaxsw m11, m3 pminuw m12, m3 paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m13, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m13, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, 
[tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+4] ; off1_k0 paddw m0, m7 psignw m2, m3 ; constrain(diff_k1s1) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m13, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m13, m3 pminsw m4, m2 paddw m0, m7 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+5] ; off1_k1 psignw m4, m5 ; constrain(diff_k1s3) pmaxsw m11, m2 pminuw m12, m2 pmaxsw m11, m3 pminuw m12, m3 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 paddw m0, m4 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m7, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m7, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif psignw m2, m3 ; constrain(diff_k0p1) paddw m7, m2 ; constrain(diff_k0) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 pabsw m3, m4 ; adiff_k1p0 pmullw m7, m9 ; pri_tap_k0 paddw m0, m7 psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw 
m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m7, m10 ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 pmaxsw m11, m1 pminuw m12, m1 paddw m0, m1 pminsw m0, m11 pmaxsw m0, m12 %if %1 == 4 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret %endif %endmacro INIT_YMM avx2 cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \ pri, sec, edge %if WIN64 %define px rsp+16*6 %define offq r8 %define pri_shift rsp+16*2 %define sec_shift rsp+16*3 %else %define px rsp+16*4 %define offq r4 %define pri_shift rsp+16*0 %define sec_shift rsp+16*1 %endif %define base r8-dir_table4 mov edged, r9m lea r8, [dir_table4] movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] lea r9, [strideq*3] movu xm2, [dstq+strideq*2] movu xm3, [dstq+r9 ] vpbroadcastd m7, [base+pw_m16384] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 mova [px+16*2+0], xm2 mova [px+16*3+0], xm3 test edgeb, 4 ; HAVE_TOP jz .no_top movu xm0, [topq+strideq*0] movu xm1, [topq+strideq*1] mova [px-16*2+0], xm0 mova [px-16*1+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-16*2-4], xm0 movd [px-16*1-4], xm1 jmp .top_done .no_top: mova [px-16*2+0], m7 .top_no_left: movd [px-16*2-4], xm7 movd [px-16*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu xm0, [botq+strideq*0] movu xm1, [botq+strideq*1] mova [px+16*4+0], xm0 mova [px+16*5+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+16*4-4], xm0 movd [px+16*5-4], xm1 jmp .bottom_done .no_bottom: mova [px+16*4+0], m7 .bottom_no_left: movd [px+16*4-4], xm7 movd [px+16*5-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT 
jz .no_left movd xm0, [leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+16*0-4], xm0 movd [px+16*1-4], xm1 movd [px+16*2-4], xm2 movd [px+16*3-4], xm3 jmp .left_done .no_left: REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5 .padding_done: CDEF_FILTER 4, 4 cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \ pri, sec, edge mov edged, r9m movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] lea r9, [strideq*3] movu xm2, [dstq+strideq*2] movu xm3, [dstq+r9 ] lea r6, [dstq+strideq*4] movu xm4, [r6 +strideq*0] movu xm5, [r6 +strideq*1] movu xm6, [r6 +strideq*2] movu xm7, [r6 +r9 ] lea r8, [dir_table4] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 mova [px+16*2+0], xm2 mova [px+16*3+0], xm3 mova [px+16*4+0], xm4 mova [px+16*5+0], xm5 mova [px+16*6+0], xm6 mova [px+16*7+0], xm7 vpbroadcastd m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movu xm0, [topq+strideq*0] movu xm1, [topq+strideq*1] mova [px-16*2+0], xm0 mova [px-16*1+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-16*2-4], xm0 movd [px-16*1-4], xm1 jmp .top_done .no_top: mova [px-16*2+0], m7 .top_no_left: movd [px-16*2-4], xm7 movd [px-16*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu xm0, [botq+strideq*0] movu xm1, [botq+strideq*1] mova [px+16*8+0], xm0 mova [px+16*9+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+16*8-4], xm0 movd [px+16*9-4], xm1 jmp .bottom_done .no_bottom: mova [px+16*8+0], m7 .bottom_no_left: movd [px+16*8-4], xm7 movd [px+16*9-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movd xm0, [leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+16*0-4], xm0 movd [px+16*1-4], xm1 movd [px+16*2-4], xm2 movd [px+16*3-4], xm3 
movd xm0, [leftq+4*4] movd xm1, [leftq+4*5] movd xm2, [leftq+4*6] movd xm3, [leftq+4*7] movd [px+16*4-4], xm0 movd [px+16*5-4], xm1 movd [px+16*6-4], xm2 movd [px+16*7-4], xm3 jmp .left_done .no_left: REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 4, 8 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \ pri, sec, edge %if WIN64 %define px rsp+32*4 %else %define px rsp+32*3 %endif %define base r8-dir_table8 mov edged, r9m movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea r6, [dstq+strideq*2] movu m2, [r6 +strideq*0] movu m3, [r6 +strideq*1] lea r6, [r6 +strideq*2] movu m4, [r6 +strideq*0] movu m5, [r6 +strideq*1] lea r6, [r6 +strideq*2] movu m6, [r6 +strideq*0] movu m7, [r6 +strideq*1] lea r8, [dir_table8] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 mova [px+32*4+0], m4 mova [px+32*5+0], m5 mova [px+32*6+0], m6 mova [px+32*7+0], m7 vpbroadcastd m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-32*2-4], xm0 movd [px-32*1-4], xm1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], xm7 movd [px-32*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+32*8-4], xm0 movd [px+32*9-4], xm1 jmp .bottom_done .no_bottom: mova [px+32*8+0], m7 mova [px+32*9+0], m7 .bottom_no_left: movd [px+32*8-4], xm7 movd [px+32*9-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movd xm0, 
[leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+32*0-4], xm0 movd [px+32*1-4], xm1 movd [px+32*2-4], xm2 movd [px+32*3-4], xm3 movd xm0, [leftq+4*4] movd xm1, [leftq+4*5] movd xm2, [leftq+4*6] movd xm3, [leftq+4*7] movd [px+32*4-4], xm0 movd [px+32*5-4], xm1 movd [px+32*6-4], xm2 movd [px+32*7-4], xm3 jmp .left_done .no_left: REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 8, 8 cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax lea r6, [dir_shift] shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc vpbroadcastd m4, [r6+bdmaxq*4] lea r6, [strideq*3] mova xm0, [srcq+strideq*0] mova xm1, [srcq+strideq*1] mova xm2, [srcq+strideq*2] mova xm3, [srcq+r6 ] lea srcq, [srcq+strideq*4] vinserti128 m0, [srcq+r6 ], 1 vinserti128 m1, [srcq+strideq*2], 1 vinserti128 m2, [srcq+strideq*1], 1 vinserti128 m3, [srcq+strideq*0], 1 REPX {pmulhuw x, m4}, m0, m1, m2, m3 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef16_avx512.asm000066400000000000000000000522461517466257200241510ustar00rootroot00000000000000; Copyright © 2022, VideoLAN and dav2d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 dw 1, 2, 1, 10, 9, 18, 8, 17 dw 8, 16, 8, 15, -7,-14, 1, -6 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 db 2, 4, 2, 36, 34, 68, 32, 66 db 32, 64, 32, 62,-30,-60, 2,-28 pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 sec_taps4: dw 32, 16 pw_m16384: times 2 dw -16384 pw_2048: times 2 dw 2048 pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) edge_mask8: dw 0x2121, 0x2020, 0x0101 SECTION .text 
%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp psubw %1, %2, %3 pabsw %1, %1 vpcmpgtw k1, %3, %2 vpsrlvw %7, %1, %6 psubusw %7, %5, %7 pminsw %1, %7 vpsubw %1{k1}, %4, %1 %endmacro ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 ; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 ; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 INIT_ZMM avx512icl cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r6-cdef_dirs4 lea r6, [cdef_dirs4] movu xm3, [dstq+strideq*0] vinserti32x4 ym3, [dstq+strideq*1], 1 mova xm2, [leftq] lea r2, [dstq+strideq*2] vinserti32x4 m3, [r2+strideq*0], 2 mova m5, [base+cdef_perm] vinserti32x4 m3, [r2+strideq*1], 3 vpermt2d m2, m5, m3 vinserti32x4 m1, m2, [topq+strideq*0-4], 0 vinserti32x4 m1, [topq+strideq*1-4], 1 mov r3d, edgem movifnidn prid, prim punpcklwd m3, m3 ; px psrlw m5, 8 vpbroadcastd m0, [base+pd_268435568] pxor m12, m12 cmp r3d, 0x0f jne .mask_edges vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 .main: test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 vpbroadcastd m15, [base+pri_taps4+priq] xor prid, prid add r4d, r3d cmovns prid, r4d ; pri_shift mov r4d, dirm vpbroadcastw m14, prid mov r5d, secm vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] call .constrain test r5d, r5d jz .end_no_clip lzcnt r5d, r5d vpbroadcastw m13, secm add r3d, r5d pminuw m6, m3, m8 pmaxsw m7, m3, m8 pminuw m6, m9 pmaxsw m7, m9 call .constrain_sec pminuw m6, m8 pmaxsw m7, m8 pminuw m6, m9 pmaxsw m7, m9 vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain pminuw m6, m8 pmaxsw m7, m8 pminuw m6, m9 pmaxsw m7, m9 psrldq m8, m6, 2 vpshldd m3, m0, 8 psrldq m9, m7, 2 paddd m0, m3 pminuw m6, m8 psrldq m0, 1 pmaxsw m7, m9 pmaxsw m0, m6 pminsw m0, m7 vpmovdw ym0, m0 jmp .end 
.sec_only: tzcnt r5d, secm mov r3d, dampingm vpbroadcastw m13, secm mov r4d, dirm sub r3d, r5d ; sec_shift call .constrain_sec vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain .end_no_clip: mova ym1, [base+end_perm4] vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) vpermb m0, m1, m0 .end: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm0, ym0, 1 movq [r2+strideq*0], xm0 movhps [r2+strideq*1], xm0 RET .mask_edges: vpbroadcastd m6, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer overread vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 kmovw k1, [base+edge_mask4-8+r3*2] jmp .mask_edges_main .mask_edges_no_bottom: kmovw k1, [base+edge_mask4+8+r3*2] .mask_edges_main: or r3d, 0x04 vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 kmovw k1, [base+edge_mask4-8+r3*2] vmovdqa32 m2{k1}, m6 jmp .main .constrain_sec: vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] vpbroadcastw m14, r3d vpbroadcastd m15, [base+sec_taps4] .constrain: paddw m8, m5, m9 vpermi2w m8, m1, m2 ; k0p0 k1p0 psubw m9, m5, m9 vpermi2w m9, m1, m2 ; k0p1 k1p1 CONSTRAIN m10, m8, m3, m12, m13, m14, m11 vpdpwssd m0, m10, m15 CONSTRAIN m10, m9, m3, m12, m13, m14, m11 vpdpwssd m0, m10, m15 ret ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 ; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 ; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge lea r6, [cdef_dirs4] movu xm18, [dstq+strideq*0] vinserti128 ym18, [dstq+strideq*1], 1 mova xm1, [leftq+16*0] mova xm2, [leftq+16*1] lea r2, [strideq*3] vinserti32x4 m18, [dstq+strideq*2], 2 mova m5, [base+cdef_perm] vinserti32x4 m18, [dstq+r2 ], 3 vpermt2d m1, m5, m18 vinserti32x4 m0, m1, 
[topq+strideq*0-4], 0 vinserti32x4 m0, [topq+strideq*1-4], 1 lea r3, [dstq+strideq*4] movu xm19, [r3+strideq*0] vinserti128 ym19, [r3+strideq*1], 1 vinserti32x4 m19, [r3+strideq*2], 2 vinserti32x4 m19, [r3+r2 ], 3 mov r3d, edgem movifnidn prid, prim vpermt2d m2, m5, m19 vpbroadcastd m16, [base+pd_268435568] pxor m12, m12 punpcklwd m18, m18 ; px (top) psrlw m5, 8 punpcklwd m19, m19 ; px (bottom) mova m17, m16 vshufi32x4 m1, m2, q3210 cmp r3d, 0x0f jne .mask_edges vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 .main: test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 vpbroadcastd m15, [base+pri_taps4+priq] xor prid, prid add r4d, r3d cmovns prid, r4d ; pri_shift mov r4d, dirm vpbroadcastw m14, prid mov r5d, secm vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] call .constrain test r5d, r5d jz .end_no_clip lzcnt r5d, r5d vpbroadcastw m13, secm add r3d, r5d pminuw m3, m18, m6 pmaxsw m4, m18, m6 pminuw m20, m19, m7 pmaxsw m21, m19, m7 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 call .constrain_sec pminuw m3, m6 pmaxsw m4, m6 pminuw m20, m7 pmaxsw m21, m7 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain pminuw m3, m6 pmaxsw m4, m6 mov r3, 0xcccccccccccccccc pminuw m20, m7 pmaxsw m21, m7 kmovq k1, r3 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 vbroadcasti32x4 m0, [base+deint_shuf] vpshldd m6, m20, m3, 16 vmovdqu8 m3{k1}, m20 vpshldd m18, m16, 8 vpshldd m7, m21, m4, 16 vmovdqu8 m4{k1}, m21 vpshldd m19, m17, 8 pminuw m3, m6 paddd m16, m18 pmaxsw m4, m7 paddd m17, m19 psrldq m16, 1 palignr m16{k1}, m17, m17, 15 lea r6, [dstq+strideq*4] pmaxsw m16, m3 pminsw m16, m4 pshufb m16, m0 movq [dstq+strideq*0], xm16 movhps [r6 +strideq*0], xm16 vextracti128 xm17, ym16, 1 movq [dstq+strideq*1], xm17 movhps [r6 +strideq*1], 
xm17 vextracti32x4 xm17, m16, 2 movq [dstq+strideq*2], xm17 movhps [r6 +strideq*2], xm17 vextracti32x4 xm16, m16, 3 movq [dstq+r2 ], xm16 movhps [r6 +r2 ], xm16 RET .sec_only: mov r4d, dirm tzcnt r5d, secm mov r3d, dampingm vpbroadcastw m13, secm sub r3d, r5d ; sec_shift call .constrain_sec vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain .end_no_clip: mova ym20, [base+end_perm4] vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m19, m17, 8 paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddd m17, m19 vpermb m16, m20, m16 vpermb m17, m20, m17 movq [dstq+strideq*0], xm16 movhps [dstq+strideq*1], xm16 vextracti128 xm16, ym16, 1 movq [dstq+strideq*2], xm16 movhps [dstq+r2 ], xm16 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm17 movhps [dstq+strideq*1], xm17 vextracti128 xm17, ym17, 1 movq [dstq+strideq*2], xm17 movhps [dstq+r2 ], xm17 RET .mask_edges: vpbroadcastd m6, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer overread vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 kmovw k1, [base+edge_mask4-8+r3*2] jmp .mask_edges_main .mask_edges_no_bottom: kmovw k1, [base+edge_mask4+8+r3*2] .mask_edges_main: mov r4d, r3d or r3d, 0x0c vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 kmovw k1, [base+edge_mask4-8+r3*2] or r4d, 0x04 vmovdqa32 m1{k1}, m6 kmovw k1, [base+edge_mask4-8+r4*2] vmovdqa32 m2{k1}, m6 jmp .main .constrain_sec: vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] vpbroadcastw m14, r3d vpbroadcastd m15, [base+sec_taps4] .constrain: paddw m7, m5, m9 mova m6, m0 vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) psubw m9, m5, m9 mova m8, m0 vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) CONSTRAIN m10, m6, m18, m12, m13, m14, m11 vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) vpdpwssd m16, m10, m15 CONSTRAIN m10, m7, m19, m12, m13, m14, m11 vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) vpdpwssd m17, m10, m15 CONSTRAIN m10, m8, m18, m12, m13, m14, m11 vpdpwssd m16, m10, m15 CONSTRAIN m10, m9, m19, m12, m13, m14, m11 
vpdpwssd m17, m10, m15 ret cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r6-cdef_dirs8 lea r6, [cdef_dirs8] movu ym17, [dstq+strideq*0] vinserti32x8 m17, [dstq+strideq*1], 1 movq xm4, [leftq+8*0] movq xm5, [leftq+8*1] psrld m2, [base+cdef_perm], 16 movq xm6, [leftq+8*2] movq xm7, [leftq+8*3] lea r2, [strideq*3] movu ym16, [topq+strideq*0-4] vinserti32x8 m16, [topq+strideq*1-4], 1 lea r3, [dstq+strideq*4] movu ym18, [dstq+strideq*2] vinserti32x8 m18, [dstq+r2 ], 1 movu ym19, [r3+strideq*0] vinserti32x8 m19, [r3+strideq*1], 1 movu ym20, [r3+strideq*2] vinserti32x8 m20, [r3+r2 ], 1 vshufi32x4 m0, m17, m18, q2020 ; px (top) mov r3d, edgem vshufi32x4 m1, m19, m20, q2020 ; px (bottom) movifnidn prid, prim vpermt2d m17, m2, m4 vpermt2d m18, m2, m5 pxor m12, m12 vpermt2d m19, m2, m6 vpermt2d m20, m2, m7 cmp r3d, 0x0f jne .mask_edges movu ym21, [botq+strideq*0-4] vinserti32x8 m21, [botq+strideq*1-4], 1 .main: mova [rsp+64*0], m16 ; top mova [rsp+64*1], m17 ; 0 1 mova [rsp+64*2], m18 ; 2 3 mova [rsp+64*3], m19 ; 4 5 mova [rsp+64*4], m20 ; 6 7 mova [rsp+64*5], m21 ; bottom test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 add r4d, r3d ; pri_shift vpbroadcastw m14, r4d mov r4d, dirm vpbroadcastd m2, [base+pri_taps8+priq*2+0] vpbroadcastd m3, [base+pri_taps8+priq*2+4] movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 pmaxsw m14, m12 call .constrain mov r5d, secm pmullw m16, m8, m2 pmullw m17, m9, m2 test r5d, r5d jnz .pri_sec movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 call .constrain pmullw m8, m3 pmullw m9, m3 jmp .end_no_clip .pri_sec: lzcnt r5d, r5d add r3d, r5d ; sec_shift movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 pminuw m18, m0, m4 pmaxsw m19, m0, m4 pminuw m20, m1, m5 pmaxsw m21, m1, m5 call .min_max_constrain2 movsx r5, byte 
[base+cdef_dirs8+(r4+0)*2+0] ; k0off2 pmullw m8, m3 pmullw m9, m3 vpbroadcastw m13, secm vpbroadcastw m14, r3d paddw m16, m8 paddw m17, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 mova m2, m8 mova m3, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 paddw m2, m8 paddw m3, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 paddw m2, m2 paddw m3, m3 paddw m16, m8 paddw m17, m9 call .min_max_constrain vpbroadcastd m10, [base+pw_2048] paddw m16, m2 paddw m17, m3 paddw m16, m8 paddw m17, m9 psraw m8, m16, 15 psraw m9, m17, 15 paddw m16, m8 paddw m17, m9 pmulhrsw m16, m10 pmulhrsw m17, m10 pminuw m18, m4 pmaxsw m19, m4 pminuw m20, m5 pmaxsw m21, m5 pminuw m18, m6 pmaxsw m19, m6 pminuw m20, m7 pmaxsw m21, m7 paddw m16, m0 paddw m17, m1 pmaxsw m16, m18 pmaxsw m17, m20 pminsw m16, m19 pminsw m17, m21 jmp .end .sec_only: tzcnt r5d, secm mov r4d, dirm mov r3d, dampingm vpbroadcastw m13, secm sub r3d, r5d movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] vpbroadcastw m14, r3d call .constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] mova m16, m8 mova m17, m9 call .constrain movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] paddw m16, m8 paddw m17, m9 call .constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] paddw m16, m16 paddw m17, m17 paddw m16, m8 paddw m17, m9 call .constrain .end_no_clip: vpbroadcastd m10, [base+pw_2048] paddw m16, m8 paddw m17, m9 psraw m8, m16, 15 psraw m9, m17, 15 paddw m16, m8 paddw m17, m9 pmulhrsw m16, m10 pmulhrsw m17, m10 paddw m16, m0 paddw m17, m1 .end: mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm17 vextracti128 [dstq+strideq*1], ym17, 1 vextracti32x4 [dstq+strideq*2], m17, 2 vextracti32x4 [dstq+r2 ], m17, 3 RET .mask_edges: vpbroadcastd m2, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer 
overread movu ym21, [botq+strideq*0-4] vinserti32x8 m21, [botq+strideq*1-4], 1 jmp .mask_edges_top .mask_edges_no_bottom: mova m21, m2 .mask_edges_top: test r3b, 0x04 jnz .mask_edges_main mova m16, m2 .mask_edges_main: and r3d, 0x03 cmp r3d, 0x03 je .main kmovw k1, [base+edge_mask8+r3*2] vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 vmovdqa32 m17{k1}, m2 vmovdqa32 m18{k1}, m2 vmovdqa32 m19{k1}, m2 vmovdqa32 m20{k1}, m2 vmovdqa32 m21{k1}, m2 jmp .main ALIGN function_align .min_max_constrain: pminuw m18, m4 pmaxsw m19, m4 pminuw m20, m5 pmaxsw m21, m5 .min_max_constrain2: pminuw m18, m6 pmaxsw m19, m6 pminuw m20, m7 pmaxsw m21, m7 .constrain: %define tmp rsp+gprsize+68 movu m4, [tmp+r5+64*0] vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) movu m5, [tmp+r5+64*2] vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) neg r5 movu m6, [tmp+r5+64*0] vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) movu m7, [tmp+r5+64*2] vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) CONSTRAIN m8, m4, m0, m12, m13, m14, m15 CONSTRAIN m9, m5, m1, m12, m13, m14, m15 CONSTRAIN m10, m6, m0, m12, m13, m14, m15 CONSTRAIN m11, m7, m1, m12, m13, m14, m15 paddw m8, m10 paddw m9, m11 ret %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef16_sse.asm000066400000000000000000001011331517466257200237030ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; Copyright (c) 2017-2021, The rav1e contributors ; Copyright (c) 2021, Nathan Egge ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA %macro DUP8 1-* %rep %0 times 8 dw %1 %rotate 1 %endrep %endmacro pri_taps: DUP8 4, 2, 3, 3 dir_table: db 1 * 32 + 0, 2 * 32 + 0 db 1 * 32 + 0, 2 * 32 - 2 db -1 * 32 + 2, -2 * 32 + 4 db 0 * 32 + 2, -1 * 32 + 4 db 0 * 32 + 2, 0 * 32 + 4 db 0 * 32 + 2, 1 * 32 + 4 db 1 * 32 + 2, 2 * 32 + 4 db 1 * 32 + 0, 2 * 32 + 2 db 1 * 32 + 0, 2 * 32 + 0 db 1 * 32 + 0, 2 * 32 - 2 db -1 * 32 + 2, -2 * 32 + 4 db 0 * 32 + 2, -1 * 32 + 4 dir_shift: times 4 dw 0x4000 times 4 dw 0x1000 pw_128: times 4 dw 128 pw_2048: times 8 dw 2048 pw_m16384: times 8 dw -16384 cextern cdef_dir_8bpc_ssse3.main cextern cdef_dir_8bpc_sse4.main cextern shufw_6543210x SECTION .text %if ARCH_X86_32 DECLARE_REG_TMP 5, 3 %elif WIN64 DECLARE_REG_TMP 8, 4 %else DECLARE_REG_TMP 8, 6 %endif %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir mova m8, [base+pw_2048] %else DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir %define m8 
[base+pw_2048] %define m9 [rsp+16*1+gprsize] %define m10 [rsp+16*2+gprsize] %endif movifnidn prid, r5m movifnidn secd, r6m test prid, prid jz .sec_only movd m6, r5m %if ARCH_X86_32 mov [rsp+24], pridmpd %endif bsr pridmpd, prid lea tmpd, [priq*4] cmp dword r10m, 0x3ff ; if (bpc == 10) cmove prid, tmpd ; pri <<= 2 mov tmpd, r8m ; damping mov dird, r7m and prid, 16 pshufb m6, m7 ; splat lea dirq, [base+dir_table+dirq*2] lea priq, [base+pri_taps+priq*2] test secd, secd jz .pri_only mova [rsp], m6 movd m6, secd tzcnt secd, secd sub pridmpd, tmpd sub tmpd, secd pshufb m6, m7 xor secd, secd neg pridmpd cmovs pridmpd, secd %if ARCH_X86_32 mov [pri_shift+4], secd mov [sec_shift+4], secd %endif mov [pri_shift+0], pridmpq mov [sec_shift+0], tmpq lea tmpq, [px] %if WIN64 movaps r4m, m9 movaps r6m, m10 %elif ARCH_X86_32 mov pridmpd, [rsp+24] %endif %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec %endrep %if WIN64 movaps m9, r4m movaps m10, r6m %endif jmp .end .pri_only: sub tmpd, pridmpd cmovs tmpd, secd %if ARCH_X86_32 mov pridmpd, [rsp+24] mov [pri_shift+4], secd %endif mov [pri_shift+0], tmpq lea tmpq, [px] %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri %endrep .end: RET .sec_only: mov tmpd, r8m ; damping movd m6, r6m tzcnt secd, secd mov dird, r7m pshufb m6, m7 sub tmpd, secd lea dirq, [base+dir_table+dirq*2] %if ARCH_X86_32 mov [sec_shift+4], prid %endif mov [sec_shift+0], tmpq lea tmpq, [px] %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec %endrep jmp .end %if %1 == %2 %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir %else DEFINE_ARGS dst, stride, tmp, off, pri, _, dir %endif ALIGN function_align .pri: movsx offq, byte [dirq+4] ; off_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0p0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0p1 movhps m3, [tmpq+offq+32*1] %else mova m1, 
[dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+5] ; off_k1 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m0, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m0, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1p0 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1p1 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 ; adiff_k1p0 paddw m0, m2 ; constrain(diff_k0) psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m0, [priq+16*0] ; pri_tap_k0 pmullw m7, [priq+16*1] ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, strideq %endif ret ALIGN function_align .sec: movsx offq, byte [dirq+8] ; off1_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0s1 movhps m3, [tmpq+offq+32*1] %else mova m1, [dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+0] ; off2_k0 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, 
[tmpq+offq+32*0] ; k0s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k0s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+9] ; off1_k1 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 psignw m2, m3 ; constrain(diff_k0s1) pabsw m3, m4 ; adiff_k0s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k1s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k1s1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+1] ; off2_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif paddw m0, m7 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 psignw m4, m5 ; constrain(diff_k1s3) paddw m0, m4 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, 
strideq %endif ret ALIGN function_align .pri_sec: movsx offq, byte [dirq+8] ; off2_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0s1 movhps m3, [tmpq+offq+32*1] %else mova m1, [dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+0] ; off3_k0 pabsw m4, m2 %if ARCH_X86_64 pabsw m10, m3 pmaxsw m9, m2, m3 pminsw m10, m4 %else pabsw m7, m3 pmaxsw m5, m2, m3 pminsw m4, m7 mova m9, m5 mova m10, m4 %endif psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k0s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k0s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+9] ; off2_k1 pabsw m7, m4 psignw m2, m3 pabsw m3, m5 ; constrain(diff_k0s1) %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m7 pmaxsw m9, m5 pminsw m10, m3 %else pminsw m7, m10 pminsw m7, m3 pmaxsw m3, m9, m4 pmaxsw m3, m5 mova m10, m7 mova m9, m3 %endif psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 paddw m0, m2 pabsw m3, m4 ; adiff_k0s2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k1s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k1s1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+1] ; off3_k1 paddw m0, m7 pabsw m7, m2 psignw m4, m5 ; constrain(diff_k0s3) pabsw m5, m3 %if ARCH_X86_64 pmaxsw m9, m2 pminsw m10, m7 pmaxsw m9, m3 pminsw m10, m5 %else 
pminsw m7, m10 pminsw m7, m5 pmaxsw m5, m9, m2 pmaxsw m5, m3 mova m10, m7 mova m9, m5 %endif paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+4] ; off1_k0 paddw m0, m7 pabsw m7, m4 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m5 %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m7 pmaxsw m9, m5 pminsw m10, m3 %else pminsw m7, m10 pminsw m7, m3 pmaxsw m3, m9, m4 pmaxsw m3, m5 mova m10, m7 mova m9, m3 %endif psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k0p0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0p1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+5] ; off1_k1 pabsw m7, m2 psignw m4, m5 ; constrain(diff_k1s3) pabsw m5, m3 %if ARCH_X86_64 pmaxsw m9, m2 pminsw m10, m7 pmaxsw m9, m3 pminsw m10, m5 %else pminsw m7, m10 pminsw m7, m5 pmaxsw m5, m9, m2 pmaxsw m5, m3 mova m10, m7 mova m9, m5 %endif psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 paddw m0, m4 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m7, [rsp+gprsize], m5 pabsw m5, m3 ; adiff_k0p1 pminsw m7, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m7, m2 ; constrain(diff_k0p0) psubusw m2, [rsp+gprsize], m4 pminsw 
m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1p0 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1p1 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 paddw m7, m2 ; constrain(diff_k0) pabsw m2, m5 %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m3 pmaxsw m9, m5 pminsw m10, m2 %else pminsw m3, m10 pminsw m3, m2 pmaxsw m2, m9, m4 pmaxsw m2, m5 mova m10, m3 mova m9, m2 %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 pabsw m3, m4 ; adiff_k1p0 pmullw m7, [priq+16*0] ; pri_tap_k0 paddw m0, m7 psrlw m2, m3, [pri_shift+gprsize] psubusw m7, [rsp+16*0+gprsize], m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, [rsp+16*0+gprsize], m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m7, [priq+16*1] ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if ARCH_X86_64 pmaxsw m9, m1 pminsw m0, m9 %else pmaxsw m2, m9, m1 pminsw m0, m2 %endif pminsw m1, m10 pmaxsw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, strideq %endif ret %endif %endmacro INIT_XMM ssse3 %if ARCH_X86_64 cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \ pri, sec, edge %define px rsp+32*4 %else cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left %define botq topq %define px rsp+32*5 %endif %define base t0-dir_table %define pri_shift px-16*6 %define sec_shift px-16*5 mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea t1, [dstq+strideq*2] movu m2, [t1 +strideq*0] movu m3, [t1 +strideq*1] movddup m7, [base+pw_m16384] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 test edgeb, 4 ; HAVE_TOP jz .no_top movifnidn 
topq, topmp movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], m7 movd [px-32*1-4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*4+0], m0 mova [px+32*5+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*4-4], m0 movd [px+32*5-4], m1 jmp .bottom_done .no_bottom: mova [px+32*4+0], m7 mova [px+32*5+0], m7 .bottom_no_left: movd [px+32*4-4], m7 movd [px+32*5-4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0-4], m0 movd [px+32*1-4], m1 movd [px+32*2-4], m2 movd [px+32*3-4], m3 jmp .left_done .no_left: REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5 .padding_done: CDEF_FILTER 4, 4 %if ARCH_X86_64 cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ pri, sec, edge %else cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea t1, [dstq+strideq*2] movu m2, [t1 +strideq*0] movu m3, [t1 +strideq*1] lea t1, [t1 +strideq*2] movu m4, [t1 +strideq*0] movu m5, [t1 +strideq*1] lea t1, [t1 +strideq*2] movu m6, [t1 +strideq*0] movu m7, [t1 +strideq*1] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 mova [px+32*4+0], m4 mova [px+32*5+0], m5 mova [px+32*6+0], m6 mova [px+32*7+0], m7 movddup m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top 
movifnidn topq, topmp movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], m7 movd [px-32*1-4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*8-4], m0 movd [px+32*9-4], m1 jmp .bottom_done .no_bottom: mova [px+32*8+0], m7 mova [px+32*9+0], m7 .bottom_no_left: movd [px+32*8-4], m7 movd [px+32*9-4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0-4], m0 movd [px+32*1-4], m1 movd [px+32*2-4], m2 movd [px+32*3-4], m3 movd m0, [leftq+4*4] movd m1, [leftq+4*5] movd m2, [leftq+4*6] movd m3, [leftq+4*7] movd [px+32*4-4], m0 movd [px+32*5-4], m1 movd [px+32*6-4], m2 movd [px+32*7-4], m3 jmp .left_done .no_left: REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 4, 8 %if ARCH_X86_64 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ pri, sec, edge %else cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif mov edged, r9m LEA t0, dir_table mova m0, [dstq+strideq*0+ 0] movd m1, [dstq+strideq*0+16] mova m2, [dstq+strideq*1+ 0] movd m3, [dstq+strideq*1+16] lea t1, [dstq+strideq*2] mova m4, [t1 +strideq*0+ 0] movd m5, [t1 +strideq*0+16] mova m6, [t1 +strideq*1+ 0] movd m7, [t1 +strideq*1+16] lea t1, [t1 +strideq*2] mova [px+32*0+ 0], m0 movd 
[px+32*0+16], m1 mova [px+32*1+ 0], m2 movd [px+32*1+16], m3 mova [px+32*2+ 0], m4 movd [px+32*2+16], m5 mova [px+32*3+ 0], m6 movd [px+32*3+16], m7 mova m0, [t1 +strideq*0+ 0] movd m1, [t1 +strideq*0+16] mova m2, [t1 +strideq*1+ 0] movd m3, [t1 +strideq*1+16] lea t1, [t1 +strideq*2] mova m4, [t1 +strideq*0+ 0] movd m5, [t1 +strideq*0+16] mova m6, [t1 +strideq*1+ 0] movd m7, [t1 +strideq*1+16] mova [px+32*4+ 0], m0 movd [px+32*4+16], m1 mova [px+32*5+ 0], m2 movd [px+32*5+16], m3 mova [px+32*6+ 0], m4 movd [px+32*6+16], m5 mova [px+32*7+ 0], m6 movd [px+32*7+16], m7 movddup m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movifnidn topq, topmp mova m0, [topq+strideq*0+ 0] mova m1, [topq+strideq*0+16] mova m2, [topq+strideq*1+ 0] mova m3, [topq+strideq*1+16] mova [px-32*2+ 0], m0 movd [px-32*2+16], m1 mova [px-32*1+ 0], m2 movd [px-32*1+16], m3 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+ 0], m7 movd [px-32*2+16], m7 mova [px-32*1+ 0], m7 movd [px-32*1+16], m7 .top_no_left: movd [px-32*2- 4], m7 movd [px-32*1- 4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp mova m0, [botq+strideq*0+ 0] movd m1, [botq+strideq*0+16] mova m2, [botq+strideq*1+ 0] movd m3, [botq+strideq*1+16] mova [px+32*8+ 0], m0 movd [px+32*8+16], m1 mova [px+32*9+ 0], m2 movd [px+32*9+16], m3 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*8- 4], m0 movd [px+32*9- 4], m1 jmp .bottom_done .no_bottom: mova [px+32*8+ 0], m7 movd [px+32*8+16], m7 mova [px+32*9+ 0], m7 movd [px+32*9+16], m7 .bottom_no_left: movd [px+32*8- 4], m7 movd [px+32*9- 4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0- 4], m0 movd [px+32*1- 4], m1 movd [px+32*2- 4], m2 
movd [px+32*3- 4], m3 movd m0, [leftq+4*4] movd m1, [leftq+4*5] movd m2, [leftq+4*6] movd m3, [leftq+4*7] movd [px+32*4- 4], m0 movd [px+32*5- 4], m1 movd [px+32*6- 4], m2 movd [px+32*7- 4], m3 jmp .left_done .no_left: REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 8, 8 %macro CDEF_DIR 0 %if ARCH_X86_64 cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax lea r6, [dir_shift] shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc movddup m7, [r6+bdmaxq*8] lea r6, [strideq*3] mova m0, [srcq+strideq*0] mova m1, [srcq+strideq*1] mova m2, [srcq+strideq*2] mova m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] mova m4, [srcq+strideq*0] mova m5, [srcq+strideq*1] mova m6, [srcq+strideq*2] REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhuw m7, [srcq+r6 ] pxor m8, m8 packuswb m9, m0, m1 packuswb m10, m2, m3 packuswb m11, m4, m5 packuswb m12, m6, m7 REPX {psadbw x, m8}, m9, m10, m11, m12 packssdw m9, m10 packssdw m11, m12 packssdw m9, m11 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %else cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax mov bdmaxd, bdmaxm LEA r2, dir_shift shr bdmaxd, 11 movddup m7, [r2+bdmaxq*8] lea r3, [strideq*3] pmulhuw m3, m7, [srcq+strideq*0] pmulhuw m4, m7, [srcq+strideq*1] pmulhuw m5, m7, [srcq+strideq*2] pmulhuw m6, m7, [srcq+r3 ] movddup m1, [r2-dir_shift+pw_128] lea srcq, [srcq+strideq*4] pxor m0, m0 packuswb m2, m3, m4 psubw m3, m1 psubw m4, m1 mova [esp+0x00], m3 mova [esp+0x10], m4 packuswb m3, m5, m6 psadbw m2, m0 psadbw m3, m0 psubw m5, m1 psubw m6, m1 packssdw m2, m3 mova [esp+0x20], m5 mova [esp+0x50], m6 pmulhuw m4, m7, [srcq+strideq*0] pmulhuw m5, m7, [srcq+strideq*1] pmulhuw m6, m7, [srcq+strideq*2] pmulhuw m7, [srcq+r3 ] packuswb m3, m4, m5 packuswb m1, m6, m7 psadbw m3, m0 psadbw m1, m0 packssdw m3, m1 movddup m1, [r2-dir_shift+pw_128] LEA r2, shufw_6543210x jmp 
mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif %endmacro INIT_XMM ssse3 CDEF_DIR INIT_XMM sse4 CDEF_DIR dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef_avx2.asm000066400000000000000000001622611517466257200236330ustar00rootroot00000000000000; Copyright © 2018, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; JMP_TABLE name, label0, label1, ...
; Emits one 32-bit entry per label argument: the distance from the start of
; the table to local label .labelN of the symbol <private_prefix>_<name>_avx2.
; <name>_jmptable is defined as an alias for the table.
%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table
  %rotate 1
 %endrep
%endmacro

; CDEF_FILTER_JMP_TABLE size
; Jump table for cdef_filter_<size>_8bpc with two entries (k0, k1) per
; direction. Directions 6-7 are listed before 0, and 0-1 again after 7:
; PREP_REGS points dirq at entry dir*2 and ACCUMULATE_TAP_BYTE adds
; tap_offset*2 + k entries, so tap_offset 0/2/4 selects dir-2/dir/dir+2
; without wrapping the index (assuming dir is in 0..7).
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro

SECTION_RODATA 32

pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
; blend_*: byte masks, loaded at various offsets and passed to vpblendvb by
; the direction loaders in CDEF_FILTER
blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
           dd 0x80, 0x00, 0x00
blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
             dd 0x00, 0x00
blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
             dd 0x0000
blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
             dd 0x0000, 0x0000
blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128
pw_2048: times 2 dw 2048

; tap_table layout (byte offsets from the label):
;   +0  .. +7   shift masks, indexed by the shift amount
;   +8  .. +13  tap weights: pri_taps (two pairs, at +8 and +10), sec_taps (+12)
;   +14 ..      per-direction tap offsets, two bytes (k = 0, 1) per direction
tap_table: ; masks for 8 bit shifts
    db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
    ; weights
    db 4, 2, 3, 3, 2, 1
    ; offsets are written as row * 16 + column; ACCUMULATE_TAP_WORD scales
    ; them by 2 to address 16-bit pixels in rows that are 32 bytes apart
    db -1 * 16 + 1, -2 * 16 + 2
    db 0 * 16 + 1, -1 * 16 + 2
    db 0 * 16 + 1, 0 * 16 + 2
    db 0 * 16 + 1, 1 * 16 + 2
    db 1 * 16 + 1, 2 * 16 + 2
    db 1 * 16 + 0, 2 * 16 + 1
    db 1 * 16 + 0, 2 * 16 + 0
    db 1 * 16 + 0, 2 * 16 - 1
    ; the last 6 are repeats of the first 6 so we don't need to & 7
    db -1 * 16 + 1, -2 * 16 + 2
    db 0 * 16 + 1, -1 * 16 + 2
    db 0 * 16 + 1, 0 * 16 + 2
    db 0 * 16 + 1, 1 * 16 + 2
    db 1 * 16 + 1, 2 * 16 + 2
    db 1 * 16 + 0, 2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8

SECTION .text

; PREP_REGS w, h
; Register setup for the paths that use the direction jump table: reads dir
; from r7m, loads the jump table for this block size into tableq, advances
; dirq to this direction's pair of 4-byte entries and re-declares the
; argument names for the block size.
%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov             dird, r7m
    lea             tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea             dirq, [tableq+dirq*2*4]
%if %1 == 4
 %if %2 == 4
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, stride3, k
 %else
    DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
                table, dir, dirjmp, dst4, stride3, k
    lea             dst4q, [dstq+strideq*4]
 %endif
%else
    ; w == 8: the third argument register is reused as the row counter h
    ; (starts at -8) and the top pointer is split into top1/top2
    DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
                table, dir, dirjmp, top2, stride3, k
    mov             hq, -8
    lea             top1q, [top1q+strideq*0]
    lea             top2q, [top1q+strideq*1]
%endif
%if %1 == 4
    lea             stride3q, [strideq*3]
%endif
%endmacro

; LOAD_BLOCK w, h, init_min_max
; Sets k = 1, clears the sum accumulator m15 (and m12 when h == 8) and loads
; the pixels at dstq into m4: four rows for h == 4 and for w == 8, eight rows
; (via dst4q) for 4x8. With init_min_max == 1, m7/m8 are seeded with the
; loaded pixels for the running max/min kept by ACCUMULATE_TAP_BYTE.
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
    pxor            m15, m15 ; sum
%if %2 == 8
    pxor            m12, m12
 %if %1 == 4
    movd            xm4, [dstq +strideq*0]
    movd            xm6, [dstq +strideq*1]
    movd            xm5, [dstq +strideq*2]
    movd            xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    movq            xm4, [dstq+strideq*0]
    movq            xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    movd            xm4, [dstq+strideq*0]
    movd            xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    mova            m7, m4 ; max (updated with pmaxub in ACCUMULATE_TAP_BYTE)
    mova            m8, m4 ; min (updated with pminub in ACCUMULATE_TAP_BYTE)
%endif
%endmacro

; ACCUMULATE_TAP_BYTE tap_offset, shift, mask, strength, mul_tap, w, h, clip
; Calls the direction loader selected through the jump table (entry
; dir*2 + tap_offset*2 + k); the loader leaves the two tap pixel vectors in
; m5 (p0) and m6 (p1). For each tap the macro computes
;   min(abs(p - px), max(strength - (abs(p - px) >> shift), 0))
; on bytes, multiplies it by the tap weight carrying the sign of (p - px) and
; adds the products to the word accumulators (m15, plus m12 when h == 8).
; With clip == 1 the running max/min in m7/m8 are updated first.
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd          dirjmpq, [dirq+kq*4+%1*2*4]
    add             dirjmpq, tableq
    call            dirjmpq

%if %8 == 1
    pmaxub          m7, m5
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    ; h == 4: p0/p1 are interleaved so one pass handles both taps
    punpcklbw       m5, m6
    punpcklbw       m6, m4, m4
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5 ; abs_diff_p01(p01 - px)
    ; m5 == m9 exactly where px >= p; OR-ing in the tap then gives a byte
    ; whose sign psignb transfers onto the tap weight
    pcmpeqb         m5, m9
    por             m5, %5
    psignb          m6, %5, m5
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m6
    paddw           m15, m5
%else
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb         m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5 ; abs_diff_p0(p0 - px)
    por             m11, m6 ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9
    pcmpeqb         m6, m11
    punpckhbw       m10, m9, m11
    punpcklbw       m9, m11
    por             m5, %5
    por             m11, m6, %5
    punpckhbw       m6, m5, m11
    punpcklbw       m5, m11
    ; high halves accumulate into m12
    psignb          m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw           m12, m6
    ; low halves accumulate into m15
    psignb          m11, %5, m5
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw           m15, m5
%endif
%endmacro

; ADJUST_PIXEL w, h, zero, pw_2048, clip
; Rounds the accumulated sums, adds them to the pixels in m4 and stores the
; rows to dst. The rounding subtracts 1 from negative sums (pcmpgtw against
; zero yields -1 there) and then applies pmulhrsw with %4; with %4 = 2048
; that is (sum - (sum < 0) + 8) >> 4. With clip == 1 the result is clamped
; to the max/min kept in m7/m8.
; Note: the h == 4 path overwrites the zero register %3 (two-operand pcmpgtw).
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3
 %endif
    pcmpgtw         %3, m15
    paddw           m15, %3
    pmulhrsw        m15, %4
 %if %5 == 0
    packsswb        m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128    xm5, m4, 1
    movd [dstq+strideq*0], xm4
    movd [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    pcmpgtw         m6, %3, m12
    pcmpgtw         m5, %3, m15
    paddw           m12, m6
    paddw           m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3
    punpcklbw       m4, %3
 %endif
    pmulhrsw        m12, %4
    pmulhrsw        m15, %4
 %if %5 == 0
    packsswb        m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128    xm5, m4, 1
 %if %1 == 4
    movd [dstq +strideq*0], xm4
    movd [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro

; BORDER_PREP_REGS w, h
; Register setup for the border path: reads dir from r7m and points dirq at
; tableq + 14 + dir*2, i.e. this direction's offset pair when tableq holds
; tap_table (8 mask bytes + 6 weight bytes precede the offsets; the callers
; in CDEF_FILTER load tap_table into tableq first). stkq is set to the px
; buffer, m11 is cleared, and h holds the pass count w*h*2/mmsize when the
; block needs more than one pass.
%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov             dird, r7m
    lea             dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea             stkq, [px]
    pxor            m11, m11
%endmacro

; BORDER_LOAD_BLOCK w, h, init_min_max
; Sets k = 1, clears the sum accumulator m15 and loads one pass worth of
; 16-bit pixels from the stack buffer at stkq (rows are 32 bytes apart) into
; m4: four rows when w == 4, two rows otherwise. With init_min_max == 1,
; m7/m8 are seeded with the pixels as the initial max/min.
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1
%if %1 == 4
    movq            xm4, [stkq+32*0]
    movhps          xm4, [stkq+32*1]
    movq            xm5, [stkq+32*2]
    movhps          xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    mova            xm4, [stkq+32*0] ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor            m15, m15 ; sum
%if %3 == 1
    mova            m7, m4 ; max
    mova            m8, m4 ; min
%endif
%endmacro

; ACCUMULATE_TAP_WORD tap_offset, shift, mask, strength, mul_tap, w, clip
; Border-path tap accumulation: reads the signed byte offset for this
; direction and k from the table at dirq, then loads the tap pixels p0/p1
; from the 16-bit stack buffer at +offset and -offset around stkq.
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx           offq, byte [dirq+kq+%1] ; off1
%if %6 == 4
    movq            xm5, [stkq+offq*2+32*0] ; p0
    movq            xm6, [stkq+offq*2+32*2]
    movhps          xm5, [stkq+offq*2+32*1]
    movhps          xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu            xm5, [stkq+offq*2+32*0] ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg             offq ; -off1
%if %6 == 4
    movq            xm6, [stkq+offq*2+32*0] ; p1
    movq            xm9, [stkq+offq*2+32*2]
    movhps          xm6, [stkq+offq*2+32*1]
    movhps          xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu            xm6, [stkq+offq*2+32*0] ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is both a large unsigned
    ; value and a negative signed value.
; use signed max and unsigned min to remove them pmaxsw m7, m5 ; max after p0 pminuw m8, m5 ; min after p0 pmaxsw m7, m6 ; max after p1 pminuw m8, m6 ; min after p1 %endif ; accumulate sum[m15] over p0/p1 ; calculate difference before converting psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) ; convert to 8-bits with signed saturation ; saturating to large diffs has no impact on the results packsswb m5, m6 ; group into pairs so we can accumulate using maddubsw pshufb m5, m12 pabsb m9, m5 psignb m10, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 psubusb m5, %4, m5 ; use unsigned min since abs diff can equal 0x80 pminub m5, m9 pmaddubsw m5, m10 paddw m15, m5 %endmacro %macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip pcmpgtw m9, m11, m15 paddw m15, m9 pmulhrsw m15, %2 paddw m4, m15 %if %3 == 1 pminsw m4, m7 pmaxsw m4, m8 %endif packuswb m4, m4 vextracti128 xm5, m4, 1 %if %1 == 4 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 movd [dstq+strideq*2], xm5 pextrd [dstq+stride3q ], xm5, 1 %else movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 %endif %endmacro %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge mov edged, edgem cmp edged, 0xf jne .border_block PUSH r11 PUSH r12 %if %2 == 4 %assign regs_used 13 ALLOC_STACK 0x60, 16 pmovzxbw xm0, [leftq+1] vpermq m0, m0, q0110 psrldq m1, m0, 4 vpalignr m2, m0, m0, 12 movu [rsp+0x10], m0 movu [rsp+0x28], m1 movu [rsp+0x40], m2 %elif %1 == 4 %assign regs_used 14 PUSH r13 ALLOC_STACK 8*2+%1*%2*1, 16 pmovzxwd m0, [leftq] mova [rsp+0x10], m0 %else %assign regs_used 15 PUSH r13 PUSH r14 ALLOC_STACK 8*4+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] pmovzxwq m0, [leftq+0] pmovzxwq m1, [leftq+8] vinserti128 m4, [dstq+r11], 1 pmovzxbd m2, [leftq+1] pmovzxbd m3, [leftq+9] mov [rsp+16], botq mova [rsp+0x20], m0 mova [rsp+0x40], m1 mova [rsp+0x60], m2 mova [rsp+0x80], m3 mova 
[rsp+0xa0], m4 lea botq, [dstq+strideq*4] %endif DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm test prid, prid jz .sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift test secdmpd, secdmpd jz .pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps PREP_REGS %1, %2 %if %1*%2 > mmsize .v_loop: %endif LOAD_BLOCK %1, %2, 1 .k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 dec kq jge .k_loop vpbroadcastd m10, [pw_2048] pxor m9, m9 ADJUST_PIXEL %1, %2, m9, m10, 1 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .v_loop %endif RET .pri_only: DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps PREP_REGS %1, %2 vpbroadcastd m3, [pw_2048] pxor m1, m1 %if %1*%2 > mmsize .pri_v_loop: %endif LOAD_BLOCK %1, %2 .pri_k_loop: vpbroadcastb m2, 
[priq+kq] ; pri_taps ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 dec kq jge .pri_k_loop ADJUST_PIXEL %1, %2, m1, m3 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .pri_v_loop %endif RET .sec_only: DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps PREP_REGS %1, %2 vpbroadcastd m2, [pw_2048] pxor m0, m0 %if %1*%2 > mmsize .sec_v_loop: %endif LOAD_BLOCK %1, %2 .sec_k_loop: vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 dec kq jge .sec_k_loop ADJUST_PIXEL %1, %2, m0, m2 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .sec_v_loop %endif RET .d0k0: %if %1 == 4 %if %2 == 4 vpbroadcastq m6, [dstq+strideq*1-1] vpbroadcastq m10, [dstq+strideq*2-1] movd xm5, [topq+strideq*1+1] movd xm9, [dstq+strideq*0+1] psrldq m11, m6, 2 psrldq m12, m10, 2 vinserti128 m6, [dstq+stride3q -1], 1 vinserti128 m10, [botq -1], 1 vpblendd m5, m11, 0x10 vpblendd m9, m12, 0x10 movu m11, [blend_4x4+16] punpckldq m6, m10 punpckldq m5, m9 vpblendvb m6, [rsp+gprsize+0x28], m11 %else movd xm5, [topq +strideq*1+1] movq xm6, [dstq +strideq*1-1] movq xm10, [dstq +stride3q -1] movq xm11, [dst4q+strideq*1-1] pinsrd xm5, [dstq +strideq*0+1], 1 movhps xm6, [dstq +strideq*2-1] movhps xm10, [dst4q+strideq*0-1] movhps xm11, [dst4q+strideq*2-1] psrldq xm9, xm6, 2 shufps xm5, xm9, q2010 ; -1 +0 +1 +2 shufps xm6, xm10, q2020 ; +1 +2 +3 +4 
psrldq xm9, xm11, 2 psrldq xm10, 2 shufps xm10, xm9, q2020 ; +3 +4 +5 +6 movd xm9, [dst4q+stride3q -1] pinsrd xm9, [botq -1], 1 shufps xm11, xm9, q1020 ; +5 +6 +7 +8 pmovzxbw m9, [leftq+3] vinserti128 m6, xm11, 1 movu m11, [blend_4x8_0+4] vinserti128 m5, xm10, 1 vpblendvb m6, m9, m11 %endif %else lea r13, [blend_8x8_0+16] movq xm5, [top2q +1] vbroadcasti128 m10, [dstq+strideq*1-1] vbroadcasti128 m11, [dstq+strideq*2-1] movhps xm5, [dstq+strideq*0+1] vinserti128 m6, m10, [dstq+stride3q-1], 1 vinserti128 m9, m11, [botq -1], 1 psrldq m10, 2 psrldq m11, 2 punpcklqdq m6, m9 movu m9, [r13+hq*2*1+16*1] punpcklqdq m10, m11 vpblendd m5, m10, 0xF0 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9 %endif ret .d1k0: .d2k0: .d3k0: %if %1 == 4 %if %2 == 4 movq xm6, [dstq+strideq*0-1] movq xm9, [dstq+strideq*1-1] vinserti128 m6, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 movu m11, [rsp+gprsize+0x10] pcmpeqd m12, m12 psrldq m5, m6, 2 psrldq m10, m9, 2 psrld m12, 24 punpckldq m6, m9 punpckldq m5, m10 vpblendvb m6, m11, m12 %else movq xm6, [dstq +strideq*0-1] movq xm9, [dstq +strideq*2-1] movhps xm6, [dstq +strideq*1-1] movhps xm9, [dstq +stride3q -1] movq xm10, [dst4q+strideq*0-1] movhps xm10, [dst4q+strideq*1-1] psrldq xm5, xm6, 2 psrldq xm11, xm9, 2 shufps xm5, xm11, q2020 movq xm11, [dst4q+strideq*2-1] movhps xm11, [dst4q+stride3q -1] shufps xm6, xm9, q2020 shufps xm9, xm10, xm11, q2020 vinserti128 m6, xm9, 1 pmovzxbw m9, [leftq+1] psrldq xm10, 2 psrldq xm11, 2 shufps xm10, xm11, q2020 vpbroadcastd m11, [blend_4x8_0+4] vinserti128 m5, xm10, 1 vpblendvb m6, m9, m11 %endif %else movu xm5, [dstq+strideq*0-1] movu xm9, [dstq+strideq*1-1] vinserti128 m5, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 movu m10, [blend_8x8_0+16] punpcklqdq m6, m5, m9 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10 psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 %endif ret .d4k0: %if %1 == 4 %if %2 == 4 vpbroadcastq m10, [dstq+strideq*1-1] vpbroadcastq m11, [dstq+strideq*2-1] movd 
xm6, [topq+strideq*1-1] movd xm9, [dstq+strideq*0-1] psrldq m5, m10, 2 psrldq m12, m11, 2 vpblendd m6, m10, 0x10 vpblendd m9, m11, 0x10 movu m10, [blend_4x4] vinserti128 m5, [dstq+stride3q +1], 1 vinserti128 m12, [botq +1], 1 punpckldq m6, m9 punpckldq m5, m12 vpblendvb m6, [rsp+gprsize+0x40], m10 %else movd xm6, [topq +strideq*1-1] movq xm9, [dstq +strideq*1-1] movq xm10, [dstq +stride3q -1] movq xm11, [dst4q+strideq*1-1] pinsrd xm6, [dstq +strideq*0-1], 1 movhps xm9, [dstq +strideq*2-1] movhps xm10, [dst4q+strideq*0-1] movhps xm11, [dst4q+strideq*2-1] psrldq xm5, xm9, 2 shufps xm6, xm9, q2010 psrldq xm9, xm10, 2 shufps xm5, xm9, q2020 shufps xm10, xm11, q2020 movd xm9, [dst4q+stride3q +1] vinserti128 m6, xm10, 1 pinsrd xm9, [botq +1], 1 psrldq xm11, 2 pmovzxbw m10, [leftq-1] shufps xm11, xm9, q1020 movu m9, [blend_4x8_0] vinserti128 m5, xm11, 1 vpblendvb m6, m10, m9 %endif %else lea r13, [blend_8x8_0+8] movq xm6, [top2q -1] vbroadcasti128 m5, [dstq+strideq*1-1] vbroadcasti128 m9, [dstq+strideq*2-1] movhps xm6, [dstq+strideq*0-1] movu m11, [r13+hq*2*1+16*1] punpcklqdq m10, m5, m9 vinserti128 m5, [dstq+stride3q -1], 1 vinserti128 m9, [botq -1], 1 vpblendd m6, m10, 0xF0 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11 psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 %endif ret .d5k0: .d6k0: .d7k0: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*1 ] vpbroadcastd m5, [dstq+strideq*1 ] vpbroadcastd m9, [dstq+strideq*2 ] vpblendd xm6, [dstq+strideq*0-4], 0x2 vpblendd m5, m9, 0x22 vpblendd m6, m5, 0x30 vinserti128 m5, [dstq+stride3q ], 1 vpblendd m5, [botq -20], 0x20 %else movd xm6, [topq +strideq*1] movd xm5, [dstq +strideq*1] movd xm9, [dstq +stride3q ] movd xm10, [dst4q+strideq*1] movd xm11, [dst4q+stride3q ] pinsrd xm6, [dstq +strideq*0], 1 pinsrd xm5, [dstq +strideq*2], 1 pinsrd xm9, [dst4q+strideq*0], 1 pinsrd xm10, [dst4q+strideq*2], 1 pinsrd xm11, [botq ], 1 punpcklqdq xm6, xm5 punpcklqdq xm5, xm9 punpcklqdq xm9, xm10 punpcklqdq xm10, xm11 vinserti128 m6, xm9, 1 
vinserti128 m5, xm10, 1 %endif %else movq xm6, [top2q ] movq xm5, [dstq+strideq*1] movq xm9, [dstq+stride3q ] movhps xm6, [dstq+strideq*0] movhps xm5, [dstq+strideq*2] movhps xm9, [botq ] vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif ret .d0k1: %if %1 == 4 %if %2 == 4 movd xm6, [dstq+strideq*2-2] movd xm9, [dstq+stride3q -2] movd xm5, [topq+strideq*0+2] movd xm10, [topq+strideq*1+2] pinsrw xm6, [leftq+4], 0 pinsrw xm9, [leftq+6], 0 vinserti128 m5, [dstq+strideq*0+2], 1 vinserti128 m10, [dstq+strideq*1+2], 1 vinserti128 m6, [botq+strideq*0-2], 1 vinserti128 m9, [botq+strideq*1-2], 1 punpckldq m5, m10 punpckldq m6, m9 %else movq xm6, [dstq +strideq*2-2] movd xm10, [dst4q+strideq*2-2] movd xm5, [topq +strideq*0+2] movq xm9, [dst4q+strideq*0-2] movhps xm6, [dstq +stride3q -2] pinsrw xm10, [dst4q+stride3q ], 3 pinsrd xm5, [topq +strideq*1+2], 1 movhps xm9, [dst4q+strideq*1-2] pinsrd xm10, [botq +strideq*0-2], 2 pinsrd xm5, [dstq +strideq*0+2], 2 pinsrd xm10, [botq +strideq*1-2], 3 pinsrd xm5, [dstq +strideq*1+2], 3 shufps xm11, xm6, xm9, q3131 shufps xm6, xm9, q2020 movu m9, [blend_4x8_3+8] vinserti128 m6, xm10, 1 vinserti128 m5, xm11, 1 vpblendvb m6, [rsp+gprsize+0x10+8], m9 %endif %else lea r13, [blend_8x8_1+16] movq xm6, [dstq+strideq*2-2] movq xm9, [dstq+stride3q -2] movq xm5, [top1q +2] movq xm10, [top2q +2] movu m11, [r13+hq*2*2+16*2] vinserti128 m6, [botq+strideq*0-2], 1 vinserti128 m9, [botq+strideq*1-2], 1 vinserti128 m5, [dstq+strideq*0+2], 1 vinserti128 m10, [dstq+strideq*1+2], 1 punpcklqdq m6, m9 punpcklqdq m5, m10 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11 %endif ret .d1k1: %if %1 == 4 %if %2 == 4 vpbroadcastq m6, [dstq+strideq*1-2] vpbroadcastq m9, [dstq+strideq*2-2] movd xm5, [topq+strideq*1+2] movd xm10, [dstq+strideq*0+2] psrldq m11, m6, 4 psrldq m12, m9, 4 vpblendd m5, m11, 0x10 movq xm11, [leftq+2] vinserti128 m6, [dstq+stride3q-2], 1 punpckldq xm11, xm11 vpblendd m10, m12, 0x10 pcmpeqd m12, m12 pmovzxwd m11, xm11 psrld m12, 16 punpckldq 
m6, m9 vpbroadcastd m9, [botq-2] vpblendvb m6, m11, m12 punpckldq m5, m10 vpblendd m6, m9, 0x20 %else movd xm5, [topq +strideq*1+2] movq xm6, [dstq +strideq*1-2] movq xm9, [dstq +stride3q -2] movq xm10, [dst4q+strideq*1-2] movd xm11, [dst4q+stride3q -2] pinsrd xm5, [dstq +strideq*0+2], 1 movhps xm6, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] pinsrd xm11, [botq -2], 1 shufps xm5, xm6, q3110 shufps xm6, xm9, q2020 shufps xm9, xm10, q3131 shufps xm10, xm11, q1020 movu m11, [blend_4x8_2+4] vinserti128 m6, xm10, 1 vinserti128 m5, xm9, 1 vpblendvb m6, [rsp+gprsize+0x10+4], m11 %endif %else lea r13, [blend_8x8_1+16] movq xm5, [top2q +2] vbroadcasti128 m6, [dstq+strideq*1-2] vbroadcasti128 m9, [dstq+strideq*2-2] movhps xm5, [dstq+strideq*0+2] shufps m10, m6, m9, q2121 vinserti128 m6, [dstq+stride3q -2], 1 vinserti128 m9, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m5, m10, 0xF0 punpcklqdq m6, m9 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11 %endif ret .d2k1: %if %1 == 4 %if %2 == 4 movq xm11, [leftq] movq xm6, [dstq+strideq*0-2] movq xm9, [dstq+strideq*1-2] vinserti128 m6, [dstq+strideq*2-2], 1 vinserti128 m9, [dstq+stride3q -2], 1 punpckldq xm11, xm11 psrldq m5, m6, 4 psrldq m10, m9, 4 pmovzxwd m11, xm11 punpckldq m6, m9 punpckldq m5, m10 pblendw m6, m11, 0x05 %else movq xm5, [dstq +strideq*0-2] movq xm9, [dstq +strideq*2-2] movq xm10, [dst4q+strideq*0-2] movq xm11, [dst4q+strideq*2-2] movhps xm5, [dstq +strideq*1-2] movhps xm9, [dstq +stride3q -2] movhps xm10, [dst4q+strideq*1-2] movhps xm11, [dst4q+stride3q -2] shufps xm6, xm5, xm9, q2020 shufps xm5, xm9, q3131 shufps xm9, xm10, xm11, q2020 shufps xm10, xm11, q3131 pmovzxwd m11, [leftq] vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 pblendw m6, m11, 0x55 %endif %else mova m11, [rsp+gprsize+0x20+hq*8+64] movu xm5, [dstq+strideq*0-2] movu xm9, [dstq+strideq*1-2] vinserti128 m5, [dstq+strideq*2-2], 1 vinserti128 m9, [dstq+stride3q -2], 1 shufps m6, m5, m9, q1010 shufps 
m5, m9, q2121 pblendw m6, m11, 0x11 %endif ret .d3k1: %if %1 == 4 %if %2 == 4 vpbroadcastq m11, [dstq+strideq*1-2] vpbroadcastq m12, [dstq+strideq*2-2] movd xm6, [topq+strideq*1-2] movd xm9, [dstq+strideq*0-2] pblendw m11, [leftq-16+2], 0x01 pblendw m12, [leftq-16+4], 0x01 pinsrw xm9, [leftq- 0+0], 0 psrldq m5, m11, 4 psrldq m10, m12, 4 vinserti128 m5, [dstq+stride3q +2], 1 vinserti128 m10, [botq +2], 1 vpblendd m6, m11, 0x10 vpblendd m9, m12, 0x10 punpckldq m6, m9 punpckldq m5, m10 %else movd xm6, [topq +strideq*1-2] movq xm5, [dstq +strideq*1-2] movq xm9, [dstq +stride3q -2] movq xm10, [dst4q+strideq*1-2] movd xm11, [dst4q+stride3q +2] pinsrw xm6, [dstq +strideq*0 ], 3 movhps xm5, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] pinsrd xm11, [botq +2], 1 shufps xm6, xm5, q2010 shufps xm5, xm9, q3131 shufps xm9, xm10, q2020 shufps xm10, xm11, q1031 movu m11, [blend_4x8_2] vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 vpblendvb m6, [rsp+gprsize+0x10-4], m11 %endif %else lea r13, [blend_8x8_1+8] movq xm6, [top2q -2] vbroadcasti128 m5, [dstq+strideq*1-2] vbroadcasti128 m10, [dstq+strideq*2-2] movhps xm6, [dstq+strideq*0-2] punpcklqdq m9, m5, m10 vinserti128 m5, [dstq+stride3q -2], 1 vinserti128 m10, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m6, m9, 0xF0 shufps m5, m10, q2121 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11 %endif ret .d4k1: %if %1 == 4 %if %2 == 4 vinserti128 m6, [dstq+strideq*0-2], 1 vinserti128 m9, [dstq+strideq*1-2], 1 movd xm5, [dstq+strideq*2+2] movd xm10, [dstq+stride3q +2] pblendw m6, [leftq-16+0], 0x01 pblendw m9, [leftq-16+2], 0x01 vinserti128 m5, [botq+strideq*0+2], 1 vinserti128 m10, [botq+strideq*1+2], 1 vpblendd m6, [topq+strideq*0-2], 0x01 vpblendd m9, [topq+strideq*1-2], 0x01 punpckldq m5, m10 punpckldq m6, m9 %else movd xm6, [topq +strideq*0-2] movq xm5, [dstq +strideq*2-2] movq xm9, [dst4q+strideq*0-2] movd xm10, [dst4q+strideq*2+2] pinsrd xm6, [topq +strideq*1-2], 1 movhps xm5, [dstq 
+stride3q -2] movhps xm9, [dst4q+strideq*1-2] pinsrd xm10, [dst4q+stride3q +2], 1 pinsrd xm6, [dstq +strideq*0-2], 2 pinsrd xm10, [botq +strideq*0+2], 2 pinsrd xm6, [dstq +strideq*1-2], 3 pinsrd xm10, [botq +strideq*1+2], 3 shufps xm11, xm5, xm9, q2020 shufps xm5, xm9, q3131 movu m9, [blend_4x8_3] vinserti128 m6, xm11, 1 vinserti128 m5, xm10, 1 vpblendvb m6, [rsp+gprsize+0x10-8], m9 %endif %else lea r13, [blend_8x8_1] movu m11, [r13+hq*2*2+16*2] movq xm6, [top1q -2] movq xm9, [top2q -2] movq xm5, [dstq+strideq*2+2] movq xm10, [dstq+stride3q +2] vinserti128 m6, [dstq+strideq*0-2], 1 vinserti128 m9, [dstq+strideq*1-2], 1 vinserti128 m5, [botq+strideq*0+2], 1 vinserti128 m10, [botq+strideq*1+2], 1 punpcklqdq m6, m9 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11 punpcklqdq m5, m10 %endif ret .d5k1: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*0-1] movd xm9, [topq+strideq*1-1] movd xm5, [dstq+strideq*2+1] movd xm10, [dstq+stride3q +1] pcmpeqd m12, m12 pmovzxbw m11, [leftq-8+1] psrld m12, 24 vinserti128 m6, [dstq+strideq*0-1], 1 vinserti128 m9, [dstq+strideq*1-1], 1 vinserti128 m5, [botq+strideq*0+1], 1 vinserti128 m10, [botq+strideq*1+1], 1 punpckldq m6, m9 pxor m9, m9 vpblendd m12, m9, 0x0F punpckldq m5, m10 vpblendvb m6, m11, m12 %else movd xm6, [topq +strideq*0-1] movq xm5, [dstq +strideq*2-1] movq xm9, [dst4q+strideq*0-1] movd xm10, [dst4q+strideq*2+1] pinsrd xm6, [topq +strideq*1-1], 1 movhps xm5, [dstq +stride3q -1] movhps xm9, [dst4q+strideq*1-1] pinsrd xm10, [dst4q+stride3q +1], 1 pinsrd xm6, [dstq +strideq*0-1], 2 pinsrd xm10, [botq +strideq*0+1], 2 pinsrd xm6, [dstq +strideq*1-1], 3 pinsrd xm10, [botq +strideq*1+1], 3 shufps xm11, xm5, xm9, q2020 vinserti128 m6, xm11, 1 pmovzxbw m11, [leftq-3] psrldq xm5, 2 psrldq xm9, 2 shufps xm5, xm9, q2020 movu m9, [blend_4x8_1] vinserti128 m5, xm10, 1 vpblendvb m6, m11, m9 %endif %else lea r13, [blend_8x8_0] movu m11, [r13+hq*2*2+16*2] movq xm6, [top1q -1] movq xm9, [top2q -1] movq xm5, [dstq+strideq*2+1] movq xm10, 
[dstq+stride3q +1] vinserti128 m6, [dstq+strideq*0-1], 1 vinserti128 m9, [dstq+strideq*1-1], 1 vinserti128 m5, [botq+strideq*0+1], 1 vinserti128 m10, [botq+strideq*1+1], 1 punpcklqdq m6, m9 punpcklqdq m5, m10 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11 %endif ret .d6k1: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*0] movd xm9, [topq+strideq*1] movd xm5, [dstq+strideq*2] movd xm10, [dstq+stride3q ] vinserti128 m6, [dstq+strideq*0], 1 vinserti128 m9, [dstq+strideq*1], 1 vinserti128 m5, [botq+strideq*0], 1 vinserti128 m10, [botq+strideq*1], 1 punpckldq m6, m9 punpckldq m5, m10 %else movd xm5, [dstq +strideq*2] movd xm6, [topq +strideq*0] movd xm9, [dst4q+strideq*2] pinsrd xm5, [dstq +stride3q ], 1 pinsrd xm6, [topq +strideq*1], 1 pinsrd xm9, [dst4q+stride3q ], 1 pinsrd xm5, [dst4q+strideq*0], 2 pinsrd xm6, [dstq +strideq*0], 2 pinsrd xm9, [botq +strideq*0], 2 pinsrd xm5, [dst4q+strideq*1], 3 pinsrd xm6, [dstq +strideq*1], 3 pinsrd xm9, [botq +strideq*1], 3 vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif %else movq xm5, [dstq+strideq*2] movq xm9, [botq+strideq*0] movq xm6, [top1q ] movq xm10, [dstq+strideq*0] movhps xm5, [dstq+stride3q ] movhps xm9, [botq+strideq*1] movhps xm6, [top2q ] movhps xm10, [dstq+strideq*1] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 %endif ret .d7k1: %if %1 == 4 %if %2 == 4 movd xm5, [dstq+strideq*2-1] movd xm9, [dstq+stride3q -1] movd xm6, [topq+strideq*0+1] movd xm10, [topq+strideq*1+1] pinsrb xm5, [leftq+ 5], 0 pinsrb xm9, [leftq+ 7], 0 vinserti128 m6, [dstq+strideq*0+1], 1 vinserti128 m10, [dstq+strideq*1+1], 1 vinserti128 m5, [botq+strideq*0-1], 1 vinserti128 m9, [botq+strideq*1-1], 1 punpckldq m6, m10 punpckldq m5, m9 %else movd xm6, [topq +strideq*0+1] movq xm9, [dstq +strideq*2-1] movq xm10, [dst4q+strideq*0-1] movd xm11, [dst4q+strideq*2-1] pinsrd xm6, [topq +strideq*1+1], 1 movhps xm9, [dstq +stride3q -1] movhps xm10, [dst4q+strideq*1-1] pinsrd xm11, [dst4q+stride3q -1], 1 pinsrd xm6, [dstq +strideq*0+1], 2 pinsrd 
xm11, [botq +strideq*0-1], 2 pinsrd xm6, [dstq +strideq*1+1], 3 pinsrd xm11, [botq +strideq*1-1], 3 shufps xm5, xm9, xm10, q2020 vinserti128 m5, xm11, 1 pmovzxbw m11, [leftq+5] psrldq xm9, 2 psrldq xm10, 2 shufps xm9, xm10, q2020 movu m10, [blend_4x8_1+8] vinserti128 m6, xm9, 1 vpblendvb m5, m11, m10 %endif %else lea r13, [blend_8x8_0+16] movq xm5, [dstq+strideq*2-1] movq xm9, [botq+strideq*0-1] movq xm6, [top1q +1] movq xm10, [dstq+strideq*0+1] movhps xm5, [dstq+stride3q -1] movhps xm9, [botq+strideq*1-1] movhps xm6, [top2q +1] movhps xm10, [dstq+strideq*1+1] movu m11, [r13+hq*2*2+16*2] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11 %endif ret .border_block: DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge RESET_STACK_STATE %assign stack_offset stack_offset - (regs_used - 11) * gprsize %assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 pcmpeqw m14, m14 psllw m14, 15 ; 0x8000 ; prepare pixel buffers - body/right %if %1 == 4 INIT_XMM avx2 %endif %if %2 == 8 lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] test edgeb, 2 ; have_right jz .no_right pmovzxbw m1, [dstq+strideq*0] pmovzxbw m2, [dstq+strideq*1] pmovzxbw m3, [dstq+strideq*2] pmovzxbw m4, [dstq+stride3q] mova [px+0*32], m1 mova [px+1*32], m2 mova [px+2*32], m3 mova [px+3*32], m4 %if %2 == 8 pmovzxbw m1, [dst4q+strideq*0] pmovzxbw m2, [dst4q+strideq*1] pmovzxbw m3, [dst4q+strideq*2] pmovzxbw m4, [dst4q+stride3q] mova [px+4*32], m1 mova [px+5*32], m2 mova [px+6*32], m3 mova [px+7*32], m4 %endif jmp .body_done .no_right: %if %1 == 4 movd xm1, [dstq+strideq*0] movd xm2, [dstq+strideq*1] movd xm3, [dstq+strideq*2] movd xm4, [dstq+stride3q] pmovzxbw xm1, xm1 pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 movq [px+0*32], xm1 movq [px+1*32], xm2 movq [px+2*32], xm3 movq [px+3*32], xm4 %else pmovzxbw xm1, [dstq+strideq*0] pmovzxbw xm2, [dstq+strideq*1] pmovzxbw xm3, [dstq+strideq*2] pmovzxbw xm4, 
[dstq+stride3q] mova [px+0*32], xm1 mova [px+1*32], xm2 mova [px+2*32], xm3 mova [px+3*32], xm4 %endif movd [px+0*32+%1*2], xm14 movd [px+1*32+%1*2], xm14 movd [px+2*32+%1*2], xm14 movd [px+3*32+%1*2], xm14 %if %2 == 8 %if %1 == 4 movd xm1, [dst4q+strideq*0] movd xm2, [dst4q+strideq*1] movd xm3, [dst4q+strideq*2] movd xm4, [dst4q+stride3q] pmovzxbw xm1, xm1 pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 movq [px+4*32], xm1 movq [px+5*32], xm2 movq [px+6*32], xm3 movq [px+7*32], xm4 %else pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] pmovzxbw xm4, [dst4q+stride3q] mova [px+4*32], xm1 mova [px+5*32], xm2 mova [px+6*32], xm3 mova [px+7*32], xm4 %endif movd [px+4*32+%1*2], xm14 movd [px+5*32+%1*2], xm14 movd [px+6*32+%1*2], xm14 movd [px+7*32+%1*2], xm14 %endif .body_done: ; top test edgeb, 4 ; have_top jz .no_top test edgeb, 1 ; have_left jz .top_no_left test edgeb, 2 ; have_right jz .top_no_right pmovzxbw m1, [topq+strideq*0-(%1/2)] pmovzxbw m2, [topq+strideq*1-(%1/2)] movu [px-2*32-%1], m1 movu [px-1*32-%1], m2 jmp .top_done .top_no_right: pmovzxbw m1, [topq+strideq*0-%1] pmovzxbw m2, [topq+strideq*1-%1] movu [px-2*32-%1*2], m1 movu [px-1*32-%1*2], m2 movd [px-2*32+%1*2], xm14 movd [px-1*32+%1*2], xm14 jmp .top_done .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right pmovzxbw m1, [topq+strideq*0] pmovzxbw m2, [topq+strideq*1] mova [px-2*32+0], m1 mova [px-1*32+0], m2 movd [px-2*32-4], xm14 movd [px-1*32-4], xm14 jmp .top_done .top_no_left_right: %if %1 == 4 movd xm1, [topq+strideq*0] pinsrd xm1, [topq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px-2*32+0], xm1 movhps [px-1*32+0], xm1 %else pmovzxbw xm1, [topq+strideq*0] pmovzxbw xm2, [topq+strideq*1] mova [px-2*32+0], xm1 mova [px-1*32+0], xm2 %endif movd [px-2*32-4], xm14 movd [px-1*32-4], xm14 movd [px-2*32+%1*2], xm14 movd [px-1*32+%1*2], xm14 jmp .top_done .no_top: movu [px-2*32-%1], m14 movu [px-1*32-%1], m14 .top_done: ; left test edgeb, 1 ; 
have_left jz .no_left pmovzxbw xm1, [leftq+ 0] %if %2 == 8 pmovzxbw xm2, [leftq+ 8] %endif movd [px+0*32-4], xm1 pextrd [px+1*32-4], xm1, 1 pextrd [px+2*32-4], xm1, 2 pextrd [px+3*32-4], xm1, 3 %if %2 == 8 movd [px+4*32-4], xm2 pextrd [px+5*32-4], xm2, 1 pextrd [px+6*32-4], xm2, 2 pextrd [px+7*32-4], xm2, 3 %endif jmp .left_done .no_left: movd [px+0*32-4], xm14 movd [px+1*32-4], xm14 movd [px+2*32-4], xm14 movd [px+3*32-4], xm14 %if %2 == 8 movd [px+4*32-4], xm14 movd [px+5*32-4], xm14 movd [px+6*32-4], xm14 movd [px+7*32-4], xm14 %endif .left_done: ; bottom DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge test edgeb, 8 ; have_bottom jz .no_bottom test edgeb, 1 ; have_left jz .bottom_no_left test edgeb, 2 ; have_right jz .bottom_no_right pmovzxbw m1, [botq+strideq*0-(%1/2)] pmovzxbw m2, [botq+strideq*1-(%1/2)] movu [px+(%2+0)*32-%1], m1 movu [px+(%2+1)*32-%1], m2 jmp .bottom_done .bottom_no_right: pmovzxbw m1, [botq+strideq*0-%1] pmovzxbw m2, [botq+strideq*1-%1] movu [px+(%2+0)*32-%1*2], m1 movu [px+(%2+1)*32-%1*2], m2 %if %1 == 8 movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu %endif movd [px+(%2+0)*32+%1*2], xm14 movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .bottom_no_left: test edgeb, 2 ; have_right jz .bottom_no_left_right pmovzxbw m1, [botq+strideq*0] pmovzxbw m2, [botq+strideq*1] mova [px+(%2+0)*32+0], m1 mova [px+(%2+1)*32+0], m2 movd [px+(%2+0)*32-4], xm14 movd [px+(%2+1)*32-4], xm14 jmp .bottom_done .bottom_no_left_right: %if %1 == 4 movd xm1, [botq+strideq*0] pinsrd xm1, [botq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px+(%2+0)*32+0], xm1 movhps [px+(%2+1)*32+0], xm1 %else pmovzxbw xm1, [botq+strideq*0] pmovzxbw xm2, [botq+strideq*1] mova [px+(%2+0)*32+0], xm1 mova [px+(%2+1)*32+0], xm2 %endif movd [px+(%2+0)*32-4], xm14 movd [px+(%2+1)*32-4], xm14 movd [px+(%2+0)*32+%1*2], xm14 movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .no_bottom: movu [px+(%2+0)*32-%1], m14 movu [px+(%2+1)*32-%1], m14 .bottom_done: ; actual filter 
INIT_YMM avx2 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero %undef edged ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm test prid, prid jz .border_sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift test secdmpd, secdmpd jz .border_pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps BORDER_PREP_REGS %1, %2 %if %1*%2*2/mmsize > 1 .border_v_loop: %endif BORDER_LOAD_BLOCK %1, %2, 1 .border_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1 ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 dec kq jge .border_k_loop vpbroadcastd m10, [pw_2048] BORDER_ADJUST_PIXEL %1, m10, 1 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd jg .border_v_loop %endif RET .border_pri_only: DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3 vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps BORDER_PREP_REGS %1, %2 vpbroadcastd m1, [pw_2048] %if %1*%2*2/mmsize > 1 .border_pri_v_loop: %endif 
BORDER_LOAD_BLOCK %1, %2 .border_pri_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 dec kq jge .border_pri_k_loop BORDER_ADJUST_PIXEL %1, m1 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd jg .border_pri_v_loop %endif RET .border_sec_only: DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3 movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3 vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps BORDER_PREP_REGS %1, %2 vpbroadcastd m0, [pw_2048] %if %1*%2*2/mmsize > 1 .border_sec_v_loop: %endif BORDER_LOAD_BLOCK %1, %2 .border_sec_k_loop: vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 dec kq jge .border_sec_k_loop BORDER_ADJUST_PIXEL %1, m0 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd jg .border_sec_v_loop %endif RET %endmacro CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] movq xm2, [srcq+strideq*2] movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m4, [srcq+stride3q ] vpbroadcastq m5, [srcq+strideq*2] vpblendd m0, m4, 0xf0 vpblendd m1, m5, 0xf0 vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m5, [srcq+strideq*0] vpblendd m2, m4, 0xf0 vpblendd m3, m5, 0xf0 pxor m4, m4 punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 punpcklbw m3, m4 cglobal_label .main vpbroadcastd m4, [pw_128] PROLOGUE 3, 4, 15 psubw m0, m4 psubw m1, m4 psubw m2, m4 psubw m3, m4 ; 
shuffle registers to generate partial_sum_diag[0-1] together vperm2i128 m7, m0, m0, 0x01 vperm2i128 m6, m1, m1, 0x01 vperm2i128 m5, m2, m2, 0x01 vperm2i128 m4, m3, m3, 0x01 ; start with partial_sum_hv[0-1] paddw m8, m0, m1 paddw m9, m2, m3 phaddw m10, m0, m1 phaddw m11, m2, m3 paddw m8, m9 phaddw m10, m11 vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 paddw xm8, xm9 ; partial_sum_hv[1] phaddw xm10, xm11 ; partial_sum_hv[0] vinserti128 m8, xm10, 1 vpbroadcastd m9, [div_table+44] pmaddwd m8, m8 pmulld m8, m9 ; cost6[2a-d] | cost2[a-d] ; create aggregates [lower half]: ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0 ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x ; and [upper half]: ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567 ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd pslldq m9, m1, 2 psrldq m10, m1, 14 pslldq m11, m2, 4 psrldq m12, m2, 12 pslldq m13, m3, 6 psrldq m14, m3, 10 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m4, 8 psrldq m12, m4, 8 pslldq m13, m5, 10 psrldq m14, m5, 6 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m6, 12 psrldq m12, m6, 4 pslldq m13, m7, 14 psrldq m14, m7, 2 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero] vbroadcasti128 m14, [shufw_6543210x] vbroadcasti128 m13, [div_table+16] vbroadcasti128 m12, [div_table+0] paddw m9, m0 ; partial_sum_diag[0/1][0-7] pshufb m10, m14 punpckhwd m11, m9, m10 punpcklwd m9, m10 pmaddwd m11, m11 pmaddwd m9, m9 pmulld m11, m13 pmulld m9, m12 paddd m9, m11 ; cost0[a-d] | cost4[a-d] ; merge horizontally and vertically for partial_sum_alt[0-3] paddw m10, m0, m1 paddw m11, m2, m3 paddw m12, m4, m5 paddw m13, m6, m7 phaddw m0, m4 phaddw 
m1, m5 phaddw m2, m6 phaddw m3, m7 ; create aggregates [lower half]: ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234 ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx ; and [upper half]: ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m4, m11, 2 psrldq m11, 14 pslldq m5, m12, 4 psrldq m12, 12 pslldq m6, m13, 6 psrldq m13, 10 paddw m4, m10 paddw m11, m12 vpbroadcastd m12, [div_table+44] paddw m5, m6 paddw m11, m13 ; partial_sum_alt[3/2] right vbroadcasti128 m13, [div_table+32] paddw m4, m5 ; partial_sum_alt[3/2] left pshuflw m5, m11, q3012 punpckhwd m6, m11, m4 punpcklwd m4, m5 pmaddwd m6, m6 pmaddwd m4, m4 pmulld m6, m12 pmulld m4, m13 paddd m4, m6 ; cost7[a-d] | cost5[a-d] ; create aggregates [lower half]: ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234 ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx ; and [upper half]: ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m5, m1, 2 psrldq m1, 14 pslldq m6, m2, 4 psrldq m2, 12 pslldq m7, m3, 6 psrldq m3, 10 paddw m5, m0 paddw m1, m2 paddw m6, m7 paddw m1, m3 ; partial_sum_alt[0/1] right paddw m5, m6 ; partial_sum_alt[0/1] left pshuflw m0, m1, q3012 punpckhwd m1, m5 punpcklwd m5, m0 pmaddwd m1, m1 pmaddwd m5, m5 pmulld m1, m12 pmulld m5, m13 paddd m5, m1 ; cost1[a-d] | cost3[a-d] mova xm0, [pd_47130256+ 16] mova m1, [pd_47130256] phaddd m9, m8 phaddd m5, m4 phaddd m9, m5 vpermd m0, m9 ; cost[0-3] vpermd m1, m9 ; cost[4-7] | cost[0-3] ; now find the best cost pmaxsd xm2, xm0, xm1 pshufd xm3, xm2, q1032 pmaxsd xm2, xm3 pshufd xm3, xm2, q2301 pmaxsd xm2, xm3 ; best cost ; find the idx using minpos ; make everything other than the best cost negative via subtraction ; find the min of unsigned 16-bit ints to sort out the negative values psubd xm4, xm1, xm2 psubd xm3, xm0, xm2 
packssdw xm3, xm4 phminposuw xm3, xm3 ; convert idx to 32-bits psrld xm3, 16 movd eax, xm3 ; get idx^4 complement vpermd m3, m1 psubd xm2, xm3 psrld xm2, 10 movd [varq], xm2 RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef_avx512.asm000066400000000000000000000755561517466257200240130ustar00rootroot00000000000000; Copyright © 2020, VideoLAN and dav2d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 %macro DUP4 1-* %rep %0 times 4 db %1 %rotate 1 %endrep %endmacro %macro DIRS 16 ; cdef_directions[] %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 ; masking away unused bits allows us to use a single vpaddd {1to16} ; instruction instead of having to do vpbroadcastd + paddb db %13 & 0x3f, -%13 & 0x3f %rotate 1 %endrep %endmacro SECTION_RODATA 64 lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55 db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21 db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25 db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53 db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57 end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 edge_mask: dq 0x00003c3c3c3c0000, 
0x00003f3f3f3f0000 ; 0000, 0001 dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 sec_tap: db 32, 32, 16, 16 pd_268435568: dd 268435568 SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif ; lut: ; t0 t1 t2 t3 t4 t5 t6 t7 ; T0 T1 T2 T3 T4 T5 T6 T7 ; L0 L1 00 01 02 03 04 05 ; L2 L3 10 11 12 13 14 15 ; L4 L5 20 21 22 23 24 25 ; L6 L7 30 31 32 33 34 35 ; b0 b1 b2 b3 b4 b5 b6 b7 ; B0 B1 B2 B3 B4 B5 B6 B7 INIT_ZMM avx512icl cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r7-edge_mask movq xmm0, [dstq+strideq*0] movhps xmm0, [dstq+strideq*1] lea r7, [edge_mask] movq xmm1, [topq+strideq*0-2] movhps xmm1, [topq+strideq*1-2] mov r6d, edgem vinserti32x4 ym0, ymm0, [leftq], 1 lea r2, [strideq*3] vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 mova m5, [base+lut_perm_4x4] vinserti32x4 m0, [dstq+r2], 2 test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 m1, [botq+strideq*0-4], 2 vinserti32x4 m0, [botq+strideq*1-4], 3 .main: movifnidn prid, prim mov t0d, dirm mova m3, [base+px_idx] mov r3d, dampingm vpermi2b m5, m0, m1 ; lut vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m7, m7 lea r3, [r7+r3*8] ; gf_shr + (damping 
- 30) * 8 vpermb m6, m3, m5 ; px cmp r6d, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 %macro CDEF_FILTER_4x4_PRI 0 vpcmpub k1, m6, m1, 6 ; px > pN psubb m2, m1, m6 lzcnt r6d, prid vpsubb m2{k1}, m6, m1 ; abs(diff) vpbroadcastb m4, prid and prid, 1 vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift movifnidn secd, secm vpbroadcastd m10, [base+pri_tap+priq*4] vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) pminub m2, m4 vpdpbusd m0, m2, m10 ; sum %endmacro CDEF_FILTER_4x4_PRI test secd, secd jz .end_no_clip call .sec .end_clip: pminub m4, m6, m1 pmaxub m1, m6 pminub m5, m2, m3 pmaxub m2, m3 pminub m4, m5 pmaxub m2, m1 psrldq m1, m4, 2 psrldq m3, m2, 2 pminub m1, m4 vpcmpw k1, m0, m7, 1 vpshldd m6, m0, 8 pmaxub m2, m3 pslldq m3, m1, 1 psubw m7, m0 paddusw m0, m6 ; clip >0xff vpsubusw m0{k1}, m6, m7 ; clip <0x00 pslldq m4, m2, 1 pminub m1, m3 pmaxub m2, m4 pmaxub m0, m1 pminub m0, m2 jmp .end .sec_only: movifnidn secd, secm call .sec .end_no_clip: vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) .end: mova xm1, [base+end_perm] vpermb m0, m1, m0 ; output in bits 8-15 of each dword movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 RET .mask_edges_sec_only: movifnidn secd, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: vpbroadcastq m8, [base+edge_mask+r6*8] test prid, prid jz .mask_edges_sec_only vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m8, m2 ; index in-range mova m1, m6 vpermb m1{k1}, m2, m5 CDEF_FILTER_4x4_PRI test secd, secd jz .end_no_clip call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} 
vpshufbitqmb k1, m8, m4 mova m2, m6 vpermb m2{k1}, m4, m5 vpshufbitqmb k1, m8, m9 mova m3, m6 vpermb m3{k1}, m9, m5 jmp .sec_main ALIGN function_align .sec: vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 .sec_main: vpbroadcastd m8, [base+sec_tap] vpcmpub k1, m6, m2, 6 psubb m4, m2, m6 vpbroadcastb m12, secd lzcnt secd, secd vpsubb m4{k1}, m6, m2 vpcmpub k2, m6, m3, 6 vpbroadcastq m11, [r3+secq*8] gf2p8affineqb m10, m4, m11, 0 psubb m5, m3, m6 mova m9, m8 vpsubb m8{k1}, m7, m8 psubusb m10, m12, m10 vpsubb m5{k2}, m6, m3 pminub m4, m10 vpdpbusd m0, m4, m8 gf2p8affineqb m11, m5, m11, 0 vpsubb m9{k2}, m7, m9 psubusb m12, m11 pminub m5, m12 vpdpbusd m0, m5, m9 ret DECLARE_REG_TMP 2, 7 ; lut top lut bottom ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 ; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 ; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 ; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 ; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 ; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 ; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r8-edge_mask vpbroadcastd ym21, strided mov r6d, edgem lea r8, [edge_mask] movq xm1, [topq+strideq*0-2] pmulld ym21, [base+pd_01234567] kxnorb k1, k1, k1 movq xm2, [topq+strideq*1-2] vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 mova m14, [base+lut_perm_4x8a] movu m15, [base+lut_perm_4x8b] test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 ym1, [botq+strideq*0-2], 1 vinserti32x4 ym2, [botq+strideq*1-2], 1 .main: punpcklqdq ym1, ym2 vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ movifnidn prid, prim mov t0d, dirm mova m16, [base+px_idx] mov r3d, dampingm vpermi2b m14, m0, m1 ; lut top vpermi2b m15, m0, 
m1 ; lut bottom vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m20, m20 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m2, m16, m14 ; pxt vpermb m3, m16, m15 ; pxb mova m1, m0 cmp r6b, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 vpermb m5, m6, m15 ; pNb %macro CDEF_FILTER_4x8_PRI 0 vpcmpub k1, m2, m4, 6 ; pxt > pNt vpcmpub k2, m3, m5, 6 ; pxb > pNb psubb m6, m4, m2 psubb m7, m5, m3 lzcnt r6d, prid vpsubb m6{k1}, m2, m4 ; abs(diff_top) vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) vpbroadcastb m13, prid vpbroadcastq m9, [r3+r6*8] and prid, 1 vpbroadcastd m11, [base+pri_tap+priq*4] vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift mova m10, m11 movifnidn t1d, secm vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) pminub m6, m12 pminub m7, m13 vpdpbusd m0, m6, m10 ; sum top vpdpbusd m1, m7, m11 ; sum bottom %endmacro CDEF_FILTER_4x8_PRI test t1d, t1d ; sec jz .end_no_clip call .sec .end_clip: pminub m10, m4, m2 pminub m12, m6, m8 pminub m11, m5, m3 pminub m13, m7, m9 pmaxub m4, m2 pmaxub m6, m8 pmaxub m5, m3 pmaxub m7, m9 pminub m10, m12 pminub m11, m13 pmaxub m4, m6 pmaxub m5, m7 mov r2d, 0xAAAAAAAA kmovd k1, r2d kxnorb k2, k2, k2 ; hw lw vpshrdd m12, m0, m1, 16 ; m1lw m0hw vpshrdd m6, m10, m11, 16 ; m11lw m10hw vpshrdd m8, m4, m5, 16 ; m5lw m4hw vpblendmw m7{k1}, m10, m11 ; m11hw m10lw vpblendmw m9{k1}, m4, m5 ; m5hw m4lw vpblendmw m4{k1}, m0, m12 ; m1lw m0lw vpblendmw m5{k1}, m12, m1 ; m1hw m0hw vpshrdd m2, m3, 16 pminub m6, m7 pmaxub m8, m9 mova ym14, [base+end_perm] vpcmpw k1, m4, m20, 1 vpshldw m2, m5, 8 pslldq m7, m6, 1 pslldq m9, m8, 1 psubw m5, m20, m4 paddusw m0, m4, m2 ; 
clip >0xff pminub m6, m7 pmaxub m8, m9 psubusw m0{k1}, m2, m5 ; clip <0x00 pmaxub m0, m6 pminub m0, m8 vpermb m0, m14, m0 vpscatterdd [dstq+ym21]{k2}, ym0 RET .sec_only: movifnidn t1d, secm call .sec .end_no_clip: mova ym4, [base+end_perm] kxnorb k1, k1, k1 vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m3, m1, 8 paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddw m1, m3 pslld m0, 16 vpshrdd m0, m1, 16 vpermb m0, m4, m0 ; output in bits 8-15 of each word vpscatterdd [dstq+ym21]{k1}, ym0 RET .mask_edges_sec_only: movifnidn t1d, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: mov t1d, r6d or r6d, 8 ; top 4x4 has bottom or t1d, 4 ; bottom 4x4 has top vpbroadcastq m17, [base+edge_mask+r6*8] vpbroadcastq m18, [base+edge_mask+t1*8] test prid, prid jz .mask_edges_sec_only vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m17, m6 ; index in-range vpshufbitqmb k2, m18, m6 mova m4, m2 mova m5, m3 vpermb m4{k1}, m6, m14 vpermb m5{k2}, m6, m15 CDEF_FILTER_4x8_PRI test t1d, t1d jz .end_no_clip call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} vpshufbitqmb k1, m17, m10 vpshufbitqmb k2, m18, m10 vpshufbitqmb k3, m17, m11 vpshufbitqmb k4, m18, m11 mova m6, m2 mova m7, m3 mova m8, m2 mova m9, m3 vpermb m6{k1}, m10, m14 vpermb m7{k2}, m10, m15 vpermb m8{k3}, m11, m14 vpermb m9{k4}, m11, m15 jmp .sec_main ALIGN function_align .sec: vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 vpermb m7, m8, m15 ; pNb vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 vpermb m9, m9, m15 ; pNb .sec_main: vpbroadcastb m18, t1d lzcnt t1d, t1d vpcmpub k1, m2, m6, 6 vpcmpub k2, m3, m7, 6 vpcmpub k3, m2, m8, 6 vpcmpub k4, m3, m9, 6 vpbroadcastq m17, [r3+t1*8] psubb m10, m6, m2 psubb m11, m7, m3 psubb m12, m8, m2 psubb m13, m9, 
m3 vpsubb m10{k1}, m2, m6 ; abs(dt0) vpsubb m11{k2}, m3, m7 ; abs(db0) vpsubb m12{k3}, m2, m8 ; abs(dt1) vpsubb m13{k4}, m3, m9 ; abs(db1) vpbroadcastd m19, [base+sec_tap] gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) pminub m10, m14 pminub m11, m15 pminub m12, m16 pminub m13, m17 mova m14, m19 mova m15, m19 mova m16, m19 vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) vpdpbusd m0, m10, m14 vpdpbusd m1, m11, m15 vpdpbusd m0, m12, m16 vpdpbusd m1, m13, m19 ret ; lut tl lut tr ; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb ; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb ; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09 ; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19 ; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 ; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 ; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 ; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 ; lut bl lut br ; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 ; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 ; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 ; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 ; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69 ; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79 ; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb ; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \ pri, sec, dir, 
damping, edge %define base r8-edge_mask movu xm16, [dstq+strideq*0] pinsrd xm16, [leftq+4*0], 3 mov r6d, edgem vinserti128 ym16, [dstq+strideq*1], 1 lea r10, [dstq+strideq*4] movu xm17, [dstq+strideq*2] vinserti32x4 m16, [topq+strideq*0-2], 2 lea r9, [strideq*3] pinsrd xm17, [leftq+4*1], 3 vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T lea r8, [edge_mask] vinserti128 ym17, [dstq+r9 ], 1 vpbroadcastd ym18, [leftq+4*2] vpblendd ym17, ym18, 0x80 movu xm18, [r10 +strideq*2] vinserti32x4 m17, [r10 +strideq*0], 2 pinsrd xm18, [leftq+4*3], 3 vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5 vinserti128 ym18, [r10 +r9 ], 1 test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 m18, [botq+strideq*0-2], 2 vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B .main: mova m0, [base+lut_perm_8x8a] movu m1, [base+lut_perm_8x8b] mova m30, [base+px_idx] vpermb m16, m0, m16 movifnidn prid, prim vpermb m17, m1, m17 mov t0d, dirm vpermb m18, m0, m18 mov r3d, dampingm vshufi32x4 m12, m16, m17, q2020 ; lut tl vshufi32x4 m13, m16, m17, q3131 ; lut tr vshufi32x4 m14, m17, m18, q0220 ; lut bl vshufi32x4 m15, m17, m18, q1331 ; lut br vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m31, m31 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m4, m30, m12 ; pxtl mova m1, m0 vpermb m5, m30, m13 ; pxtr mova m2, m0 vpermb m6, m30, m14 ; pxbl mova m3, m0 vpermb m7, m30, m15 ; pxbr cmp r6b, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 vpermb m9, m11, m13 ; pNtr vpermb m10, m11, m14 ; pNbl vpermb m11, m11, m15 ; pNbr %macro CDEF_FILTER_8x8_PRI 0 vpcmpub k1, m4, m8, 6 ; pxtl > pNtl vpcmpub k2, m5, m9, 6 ; pxtr > pNtr vpcmpub k3, m6, m10, 6 ; pxbl > pNbl vpcmpub k4, m7, m11, 6 ; pxbr > pNbr psubb m16, m8, m4 psubb m17, m9, m5 psubb m18, m10, m6 psubb m19, m11, m7 lzcnt r6d, prid vpsubb m16{k1}, m4, m8 ; abs(diff_tl) vpsubb m17{k2}, m5, m9 ; 
abs(diff_tr) vpsubb m18{k3}, m6, m10 ; abs(diff_bl) vpsubb m19{k4}, m7, m11 ; abs(diff_br) vpbroadcastq m28, [r3+r6*8] vpbroadcastb m29, prid and prid, 1 vpbroadcastd m27, [base+pri_tap+priq*4] vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift mova m24, m27 mova m25, m27 mova m26, m27 movifnidn t1d, secm vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) pminub m16, m20 pminub m17, m21 pminub m18, m22 pminub m19, m23 vpdpbusd m0, m16, m24 ; sum tl vpdpbusd m1, m17, m25 ; sum tr vpdpbusd m2, m18, m26 ; sum bl vpdpbusd m3, m19, m27 ; sum br %endmacro CDEF_FILTER_8x8_PRI test t1d, t1d ; sec jz .end_no_clip call .sec .end_clip: pminub m20, m8, m4 pminub m24, m12, m16 pminub m21, m9, m5 pminub m25, m13, m17 pminub m22, m10, m6 pminub m26, m14, m18 pminub m23, m11, m7 pminub m27, m15, m19 pmaxub m8, m4 pmaxub m12, m16 pmaxub m9, m5 pmaxub m13, m17 pmaxub m10, m6 pmaxub m14, m18 pmaxub m11, m7 pmaxub m15, m19 pminub m20, m24 pminub m21, m25 pminub m22, m26 pminub m23, m27 pmaxub m8, m12 pmaxub m9, m13 pmaxub m10, m14 pmaxub m11, m15 mov r2d, 0xAAAAAAAA kmovd k1, r2d vpshrdd m24, m0, m1, 16 vpshrdd m25, m2, m3, 16 vpshrdd m12, m20, m21, 16 vpshrdd m14, m22, m23, 16 vpshrdd m16, m8, m9, 16 vpshrdd m18, m10, m11, 16 vpblendmw m13{k1}, m20, m21 vpblendmw m15{k1}, m22, m23 vpblendmw m17{k1}, m8, m9 vpblendmw m19{k1}, m10, m11 vpblendmw m20{k1}, m0, m24 vpblendmw m21{k1}, m24, m1 vpblendmw m22{k1}, m2, m25 
vpblendmw m23{k1}, m25, m3 vpshrdd m4, m5, 16 vpshrdd m6, m7, 16 pminub m12, m13 pminub m14, m15 pmaxub m16, m17 pmaxub m18, m19 mova m8, [base+end_perm_clip] vpcmpw k2, m20, m31, 1 vpcmpw k3, m22, m31, 1 vpshldw m4, m21, 8 vpshldw m6, m23, 8 kunpckdq k1, k1, k1 kxnorb k4, k4, k4 vpshrdw m11, m12, m14, 8 vpshrdw m15, m16, m18, 8 vpblendmb m13{k1}, m12, m14 vpblendmb m17{k1}, m16, m18 psubw m21, m31, m20 psubw m23, m31, m22 paddusw m0, m20, m4 ; clip >0xff paddusw m1, m22, m6 pminub m11, m13 pmaxub m15, m17 psubusw m0{k2}, m4, m21 ; clip <0x00 psubusw m1{k3}, m6, m23 psrlw m0, 8 vmovdqu8 m0{k1}, m1 pmaxub m0, m11 pminub m0, m15 vpermb m0, m8, m0 vextracti32x4 xm1, m0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*2], xm1 movq [r10 +strideq*0], xm2 movq [r10 +strideq*2], xm3 movhps [dstq+strideq*1], xm0 movhps [dstq+r9 ], xm1 movhps [r10 +strideq*1], xm2 movhps [r10 +r9 ], xm3 RET .sec_only: movifnidn t1d, secm call .sec .end_no_clip: mova xm8, [base+end_perm] kxnorb k1, k1, k1 vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m5, m1, 8 vpshldd m6, m2, 8 vpshldd m7, m3, 8 paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddw m1, m5 paddw m2, m6 paddw m3, m7 vpermb m0, m8, m0 vpermb m1, m8, m1 vpermb m2, m8, m2 vpermb m3, m8, m3 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm0 movq [r10 +strideq*0], xm5 movq [r10 +strideq*2], xm2 movhps [dstq+strideq*1], xm4 movhps [dstq+r9 ], xm0 movhps [r10 +strideq*1], xm5 movhps [r10 +r9 ], xm2 RET .mask_edges_sec_only: movifnidn t1d, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: mov t0d, r6d mov t1d, r6d or t0d, 0xA ; top-left 4x4 has bottom and right or t1d, 0x9 ; top-right 4x4 has bottom and left vpbroadcastq m26, [base+edge_mask+t0*8] vpbroadcastq m27, [base+edge_mask+t1*8] mov t1d, r6d or r6d, 0x6 ; bottom-left 4x4 has top and right or t1d, 0x5 ; 
bottom-right 4x4 has top and left vpbroadcastq m28, [base+edge_mask+r6*8] vpbroadcastq m29, [base+edge_mask+t1*8] mov t0d, dirm test prid, prid jz .mask_edges_sec_only vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m26, m20 ; index in-range vpshufbitqmb k2, m27, m20 vpshufbitqmb k3, m28, m20 vpshufbitqmb k4, m29, m20 mova m8, m4 mova m9, m5 mova m10, m6 mova m11, m7 vpermb m8{k1}, m20, m12 vpermb m9{k2}, m20, m13 vpermb m10{k3}, m20, m14 vpermb m11{k4}, m20, m15 mova [rsp+0x00], m26 mova [rsp+0x40], m27 mova [rsp+0x80], m28 mova [rsp+0xC0], m29 CDEF_FILTER_8x8_PRI test t1d, t1d jz .end_no_clip mova m26, [rsp+0x00] mova m27, [rsp+0x40] mova m28, [rsp+0x80] mova m29, [rsp+0xC0] call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} vpshufbitqmb k1, m26, m20 vpshufbitqmb k2, m27, m20 vpshufbitqmb k3, m28, m20 vpshufbitqmb k4, m29, m20 mova m16, m4 mova m17, m5 mova m18, m6 mova m19, m7 vpermb m16{k1}, m20, m12 vpermb m17{k2}, m20, m13 vpermb m18{k3}, m20, m14 vpermb m19{k4}, m20, m15 vpshufbitqmb k1, m26, m21 vpshufbitqmb k2, m27, m21 vpshufbitqmb k3, m28, m21 vpshufbitqmb k4, m29, m21 vpermb m12, m21, m12 vpermb m13, m21, m13 vpermb m14, m21, m14 vpermb m15, m21, m15 vpblendmb m12{k1}, m4, m12 vpblendmb m13{k2}, m5, m13 vpblendmb m14{k3}, m6, m14 vpblendmb m15{k4}, m7, m15 jmp .sec_main ALIGN function_align .sec: vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 vpermb m17, m20, m13 ; pNtr vpermb m18, m20, m14 ; pNbl vpermb m19, m20, m15 ; pNbr vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 vpermb m13, m21, m13 ; pNtr vpermb m14, m21, m14 ; pNbl vpermb m15, m21, m15 ; pNbr .sec_main: %macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants vpcmpub k1, m4, %1, 6 vpcmpub k2, m5, %2, 6 vpcmpub k3, m6, %3, 6 vpcmpub k4, m7, %4, 6 psubb m20, 
%1, m4 psubb m21, %2, m5 psubb m22, %3, m6 psubb m23, %4, m7 %if %5 vpbroadcastb m28, t1d lzcnt t1d, t1d vpbroadcastq m29, [r3+t1*8] %endif vpsubb m20{k1}, m4, %1 vpsubb m21{k2}, m5, %2 vpsubb m22{k3}, m6, %3 vpsubb m23{k4}, m7, %4 gf2p8affineqb m24, m20, m29, 0 gf2p8affineqb m25, m21, m29, 0 gf2p8affineqb m26, m22, m29, 0 gf2p8affineqb m27, m23, m29, 0 %if %5 vpbroadcastd m30, [base+sec_tap] %endif psubusb m24, m28, m24 psubusb m25, m28, m25 psubusb m26, m28, m26 psubusb m27, m28, m27 pminub m20, m24 pminub m21, m25 pminub m22, m26 pminub m23, m27 mova m24, m30 mova m25, m30 mova m26, m30 mova m27, m30 vpsubb m24{k1}, m31, m30 vpsubb m25{k2}, m31, m30 vpsubb m26{k3}, m31, m30 vpsubb m27{k4}, m31, m30 vpdpbusd m0, m20, m24 vpdpbusd m1, m21, m25 vpdpbusd m2, m22, m26 vpdpbusd m3, m23, m27 %endmacro CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 CDEF_FILTER_8x8_SEC m12, m13, m14, m15 ret %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cdef_sse.asm000066400000000000000000001174221517466257200235440ustar00rootroot00000000000000; Copyright © 2018, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2019, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 %macro DUP8 1-* %rep %0 times 8 db %1 %rotate 1 %endrep %endmacro div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105, 105, 105, 105, 105 div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 dw 168, 168, 140, 140, 120, 120, 105, 105 dw 420, 420, 210, 210, 140, 140, 105, 105 dw 105, 105, 105, 105, 105, 105, 105, 105 const shufw_6543210x, \ db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_8: times 8 dw 8 pw_128: times 8 dw 128 pw_256: times 8 dw 256 pw_2048: times 8 dw 2048 pw_0x7FFF: times 8 dw 0x7FFF pw_0x8000: times 8 dw 0x8000 tap_table: ; masks for 8-bit shift emulation DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 ; weights DUP8 4, 2, 3, 3, 2, 1 ; taps indices db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 db 1 * 16 + 0, 2 * 16 + 0 db 1 * 16 + 0, 2 * 16 - 1 ; the last 6 are repeats of the first 6 so we don't need to & 7 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 SECTION .text %macro movif32 2 %if ARCH_X86_32 mov %1, %2 %endif %endmacro %macro PMOVZXBW 2-3 0 ; %3 = half %if cpuflag(sse4) && %3 == 0 pmovzxbw %1, %2 %else %if %3 == 1 movd 
%1, %2 %else movq %1, %2 %endif punpcklbw %1, m7 %endif %endmacro %macro PSHUFB_0 2 %if cpuflag(ssse3) pshufb %1, %2 %else punpcklbw %1, %1 pshuflw %1, %1, q0000 punpcklqdq %1, %1 %endif %endmacro %macro MOVDDUP 2 %if cpuflag(ssse3) movddup %1, %2 %else movq %1, %2 punpcklqdq %1, %1 %endif %endmacro %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax ; load p0/p1 movsx offq, byte [dirq+kq+%1+14*8] ; off1 %if %6 == 4 movq m5, [stkq+offq*2+32*0] ; p0 movhps m5, [stkq+offq*2+32*1] %else movu m5, [stkq+offq*2+32*0] ; p0 %endif neg offq ; -off1 %if %6 == 4 movq m6, [stkq+offq*2+32*0] ; p1 movhps m6, [stkq+offq*2+32*1] %else movu m6, [stkq+offq*2+32*0] ; p1 %endif %if %7 %if cpuflag(sse4) ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. ; use signed max and unsigned min to remove them pmaxsw m7, m5 pminuw m8, m5 pmaxsw m7, m6 pminuw m8, m6 %else pcmpeqw m3, m14, m5 pminsw m8, m5 ; min after p0 pandn m3, m5 pmaxsw m7, m3 ; max after p0 pcmpeqw m3, m14, m6 pminsw m8, m6 ; min after p1 pandn m3, m6 pmaxsw m7, m3 ; max after p1 %endif %endif ; accumulate sum[m13] over p0/p1 psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) packsswb m5, m6 ; convert pixel diff to 8-bit %if cpuflag(ssse3) pshufb m5, m13 ; group diffs p0 and p1 into pairs pabsb m6, m5 psignb m3, %5, m5 %else movlhps m6, m5 punpckhbw m6, m5 pxor m5, m5 pcmpgtb m5, m6 paddb m6, m5 pxor m6, m5 paddb m3, %5, m5 pxor m3, m5 %endif pand m9, %3, m6 ; emulate 8-bit shift psrlw m9, %2 psubusb m5, %4, m9 pminub m5, m6 ; constrain(diff_p) %if cpuflag(ssse3) pmaddubsw m5, m3 ; constrain(diff_p) * taps %else psrlw m9, m5, 8 psraw m6, m3, 8 psllw m5, 8 psllw m3, 8 pmullw m9, m6 pmulhw m5, m3 paddw m5, m9 %endif paddw m0, m5 %endmacro %macro LOAD_BODY 3 ; dst, src, block_width %if %3 == 4 PMOVZXBW m0, [%2+strideq*0] PMOVZXBW m1, [%2+strideq*1] PMOVZXBW m2, [%2+strideq*2] PMOVZXBW m3, [%2+stride3q] mova [%1+32*0], m0 
mova [%1+32*1], m1 mova [%1+32*2], m2 mova [%1+32*3], m3 %else movu m0, [%2+strideq*0] movu m1, [%2+strideq*1] movu m2, [%2+strideq*2] movu m3, [%2+stride3q] punpcklbw m4, m0, m7 punpckhbw m0, m7 mova [%1+32*0+ 0], m4 mova [%1+32*0+16], m0 punpcklbw m4, m1, m7 punpckhbw m1, m7 mova [%1+32*1+ 0], m4 mova [%1+32*1+16], m1 punpcklbw m4, m2, m7 punpckhbw m2, m7 mova [%1+32*2+ 0], m4 mova [%1+32*2+16], m2 punpcklbw m4, m3, m7 punpckhbw m3, m7 mova [%1+32*3+ 0], m4 mova [%1+32*3+16], m3 %endif %endmacro %macro CDEF_FILTER_END 2 ; w, minmax pxor m6, m6 pcmpgtw m6, m0 paddw m0, m6 %if cpuflag(ssse3) pmulhrsw m0, m15 %else paddw m0, m15 psraw m0, 4 %endif paddw m4, m0 %if %2 pminsw m4, m7 pmaxsw m4, m8 %endif packuswb m4, m4 %if %1 == 4 movd [dstq+strideq*0], m4 psrlq m4, 32 movd [dstq+strideq*1], m4 add stkq, 32*2 lea dstq, [dstq+strideq*2] %else movq [dstq], m4 add stkq, 32 add dstq, strideq %endif %endmacro %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ dst, stride, left, top, bot, pri, dst4, edge, \ stride3 %define px rsp+3*16+2*32 %define base 0 %else cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ dst, stride, left, edge, stride3 %define topq r2 %define botq r2 %define dst4q r2 LEA r5, tap_table %define px esp+7*16+2*32 %define base r5-tap_table %endif mov edged, r9m %if cpuflag(sse4) %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] %else %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] %endif mova m6, OUT_OF_BOUNDS_MEM pxor m7, m7 ; prepare pixel buffers - body/right %if %2 == 8 lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] test edgeb, 2 ; have_right jz .no_right LOAD_BODY px, dstq, %1 %if %2 == 8 LOAD_BODY px+4*32, dst4q, %1 %endif jmp .body_done .no_right: PMOVZXBW m0, [dstq+strideq*0], %1 == 4 PMOVZXBW m1, [dstq+strideq*1], %1 == 4 PMOVZXBW m2, [dstq+strideq*2], %1 == 4 PMOVZXBW m3, [dstq+stride3q ], %1 == 4 mova [px+32*0], m0 mova [px+32*1], m1 mova [px+32*2], m2 mova [px+32*3], m3 
movd [px+32*0+%1*2], m6 movd [px+32*1+%1*2], m6 movd [px+32*2+%1*2], m6 movd [px+32*3+%1*2], m6 %if %2 == 8 PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 mova [px+32*4], m0 mova [px+32*5], m1 mova [px+32*6], m2 mova [px+32*7], m3 movd [px+32*4+%1*2], m6 movd [px+32*5+%1*2], m6 movd [px+32*6+%1*2], m6 movd [px+32*7+%1*2], m6 %endif .body_done: ; top movifnidn topq, r3mp test edgeb, 4 ; have_top jz .no_top test edgeb, 1 ; have_left jz .top_no_left test edgeb, 2 ; have_right jz .top_no_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-2] PMOVZXBW m1, [topq+strideq*1-2] %else movu m0, [topq+strideq*0-4] movu m1, [topq+strideq*1-4] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movu [px-32*2+8], m2 movu [px-32*1+8], m3 %endif movu [px-32*2-%1], m0 movu [px-32*1-%1], m1 jmp .top_done .top_no_right: %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-%1] PMOVZXBW m1, [topq+strideq*1-%1] movu [px-32*2-8], m0 movu [px-32*1-8], m1 %else movu m0, [topq+strideq*0-%1] movu m1, [topq+strideq*1-%2] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px-32*2-16], m0 mova [px-32*2+ 0], m2 mova [px-32*1-16], m1 mova [px-32*1+ 0], m3 %endif movd [px-32*2+%1*2], m6 movd [px-32*1+%1*2], m6 jmp .top_done .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0] PMOVZXBW m1, [topq+strideq*1] %else movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movd [px-32*2+16], m2 movd [px-32*1+16], m3 %endif movd [px-32*2- 4], m6 movd [px-32*1- 4], m6 mova [px-32*2+ 0], m0 mova [px-32*1+ 0], m1 jmp .top_done .top_no_left_right: PMOVZXBW m0, [topq+strideq*0], %1 == 4 PMOVZXBW m1, [topq+strideq*1], %1 == 4 movd [px-32*2-4], m6 movd [px-32*1-4], m6 mova [px-32*2+0], m0 mova [px-32*1+0], m1 movd [px-32*2+%1*2], m6 movd 
[px-32*1+%1*2], m6 jmp .top_done .no_top: movu [px-32*2- 4], m6 movu [px-32*1- 4], m6 %if %1 == 8 movq [px-32*2+12], m6 movq [px-32*1+12], m6 %endif .top_done: ; left test edgeb, 1 ; have_left jz .no_left movifnidn leftq, leftmp %if %2 == 4 movq m0, [leftq] %else movu m0, [leftq] %endif %if %2 == 4 punpcklbw m0, m7 %else punpckhbw m1, m0, m7 punpcklbw m0, m7 movhlps m3, m1 movd [px+32*4-4], m1 movd [px+32*6-4], m3 psrlq m1, 32 psrlq m3, 32 movd [px+32*5-4], m1 movd [px+32*7-4], m3 %endif movhlps m2, m0 movd [px+32*0-4], m0 movd [px+32*2-4], m2 psrlq m0, 32 psrlq m2, 32 movd [px+32*1-4], m0 movd [px+32*3-4], m2 jmp .left_done .no_left: movd [px+32*0-4], m6 movd [px+32*1-4], m6 movd [px+32*2-4], m6 movd [px+32*3-4], m6 %if %2 == 8 movd [px+32*4-4], m6 movd [px+32*5-4], m6 movd [px+32*6-4], m6 movd [px+32*7-4], m6 %endif .left_done: ; bottom movifnidn botq, r4mp test edgeb, 8 ; have_bottom jz .no_bottom test edgeb, 1 ; have_left jz .bottom_no_left test edgeb, 2 ; have_right jz .bottom_no_right %if %1 == 4 PMOVZXBW m0, [botq+strideq*0-(%1/2)] PMOVZXBW m1, [botq+strideq*1-(%1/2)] %else movu m0, [botq+strideq*0-4] movu m1, [botq+strideq*1-4] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movu [px+32*(%2+0)+8], m2 movu [px+32*(%2+1)+8], m3 %endif movu [px+32*(%2+0)-%1], m0 movu [px+32*(%2+1)-%1], m1 jmp .bottom_done .bottom_no_right: %if %1 == 4 PMOVZXBW m0, [botq+strideq*0-4] PMOVZXBW m1, [botq+strideq*1-4] movu [px+32*(%2+0)-8], m0 movu [px+32*(%2+1)-8], m1 %else movu m0, [botq+strideq*0-8] movu m1, [botq+strideq*1-8] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px+32*(%2+0)-16], m0 mova [px+32*(%2+0)+ 0], m2 mova [px+32*(%2+1)-16], m1 mova [px+32*(%2+1)+ 0], m3 movd [px+32*(%2-1)+16], m6 ; overwritten by first mova %endif movd [px+32*(%2+0)+%1*2], m6 movd [px+32*(%2+1)+%1*2], m6 jmp .bottom_done .bottom_no_left: test edgeb, 2 ; have_right jz .bottom_no_left_right %if %1 == 4 PMOVZXBW m0, [botq+strideq*0] 
PMOVZXBW m1, [botq+strideq*1] %else movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px+32*(%2+0)+16], m2 mova [px+32*(%2+1)+16], m3 %endif mova [px+32*(%2+0)+ 0], m0 mova [px+32*(%2+1)+ 0], m1 movd [px+32*(%2+0)- 4], m6 movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .bottom_no_left_right: PMOVZXBW m0, [botq+strideq*0], %1 == 4 PMOVZXBW m1, [botq+strideq*1], %1 == 4 mova [px+32*(%2+0)+ 0], m0 mova [px+32*(%2+1)+ 0], m1 movd [px+32*(%2+0)+%1*2], m6 movd [px+32*(%2+1)+%1*2], m6 movd [px+32*(%2+0)- 4], m6 movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .no_bottom: movu [px+32*(%2+0)- 4], m6 movu [px+32*(%2+1)- 4], m6 %if %1 == 8 movq [px+32*(%2+0)+12], m6 movq [px+32*(%2+1)+12], m6 %endif .bottom_done: ; actual filter %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec mova m13, [shufb_lohi] %if cpuflag(ssse3) mova m15, [pw_2048] %else mova m15, [pw_8] %endif mova m14, m6 %else DEFINE_ARGS dst, pridmp, sec, damping, pri, tap %xdefine m8 m1 %xdefine m9 m2 %xdefine m10 m0 %xdefine m13 [base+shufb_lohi] %xdefine m14 OUT_OF_BOUNDS_MEM %if cpuflag(ssse3) %xdefine m15 [base+pw_2048] %else %xdefine m15 [base+pw_8] %endif %endif movifnidn prid, r5m movifnidn secd, r6m mov dampingd, r8m movif32 [esp+0x3C], r1d test prid, prid jz .sec_only movd m1, r5m bsr pridmpd, prid test secd, secd jz .pri_only movd m10, r6m tzcnt secd, secd and prid, 1 sub pridmpd, dampingd sub secd, dampingd xor dampingd, dampingd add prid, prid neg pridmpd cmovs pridmpd, dampingd neg secd PSHUFB_0 m1, m7 PSHUFB_0 m10, m7 %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec lea tapq, [tap_table] MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask mov [rsp+0x00], pridmpq ; pri_shift mov [rsp+0x10], secq ; sec_shift DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off %else MOVDDUP m2, [tapq+pridmpq*8] MOVDDUP m3, [tapq+secq*8] mov [esp+0x04], dampingd ; zero 
upper 32 bits of psrlw mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP mov [esp+0x00], pridmpd mov [esp+0x30], secd DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %define offq dstq %define kd strided %define kq strideq mova [esp+0x10], m2 mova [esp+0x40], m3 mova [esp+0x20], m1 mova [esp+0x50], m10 %endif mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] ; pri_taps mov hd, %1*%2/8 lea dirq, [tapq+dirq*2] .v_loop: movif32 [esp+0x38], dstd mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] ; px %endif pxor m0, m0 ; sum mova m7, m4 ; max mova m8, m4 ; min .k_loop: MOVDDUP m2, [priq+kq*8] %if ARCH_X86_64 ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 %else ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 %endif dec kd jge .k_loop movif32 dstq, [esp+0x38] movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 1 dec hd jg .v_loop RET .pri_only: %if ARCH_X86_64 DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap lea tapq, [tap_table] %else DEFINE_ARGS dst, pridmp, zero, damping, pri, tap %endif and prid, 1 xor zerod, zerod sub dampingd, pridmpd cmovs dampingd, zerod add prid, prid PSHUFB_0 m1, m7 MOVDDUP m7, [tapq+dampingq*8] mov [rsp+0x00], dampingq %if ARCH_X86_64 DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off %else mov [rsp+0x04], zerod DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %endif mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] mov hd, %1*%2/8 lea dirq, [tapq+dirq*2] .pri_v_loop: movif32 [esp+0x38], dstd mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] %endif pxor m0, m0 .pri_k_loop: MOVDDUP m2, 
[priq+kq*8] ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 dec kd jge .pri_k_loop movif32 dstq, [esp+0x38] movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 0 dec hd jg .pri_v_loop RET .sec_only: %if ARCH_X86_64 DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec %else DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero %endif movd m1, r6m tzcnt secd, secd mov dird, r7m xor zerod, zerod sub dampingd, secd cmovs dampingd, zerod PSHUFB_0 m1, m7 %if ARCH_X86_64 lea tapq, [tap_table] %else mov [rsp+0x04], zerod %endif mov [rsp+0x00], dampingq MOVDDUP m7, [tapq+dampingq*8] lea dirq, [tapq+dirq*2] %if ARCH_X86_64 DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k %else DEFINE_ARGS dst, stride, off, stk, dir, tap, h %endif lea stkq, [px] mov hd, %1*%2/8 .sec_v_loop: mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] %endif pxor m0, m0 .sec_k_loop: MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 %if ARCH_X86_32 MOVDDUP m2, [tapq+12*8+kq*8] %endif ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 dec kd jge .sec_k_loop movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 0 dec hd jg .sec_v_loop RET %endmacro %macro MULLD 2 %if cpuflag(sse4) pmulld %1, %2 %else %if ARCH_X86_32 %define m15 m1 %endif pmulhuw m15, %1, %2 pmullw %1, %2 pslld m15, 16 paddd %1, m15 %endif %endmacro %macro CDEF_DIR 0 %if ARCH_X86_64 cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var lea r6, [strideq*3] movq m1, [srcq+strideq*0] movhps m1, [srcq+strideq*1] movq m3, [srcq+strideq*2] movhps m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+r6 ] pxor m8, m8 psadbw m9, m1, m8 psadbw m2, m3, m8 psadbw m4, m5, m8 psadbw m6, m7, m8 packssdw m9, m2 packssdw m4, m6 packssdw m9, m4 punpcklbw m0, m1, m8 punpckhbw m1, m8 punpcklbw m2, m3, m8 punpckhbw m3, m8 punpcklbw m4, m5, m8 punpckhbw m5, m8 punpcklbw m6, m7, m8 punpckhbw m7, m8 cglobal_label .main mova m8, 
[pw_128] psubw m0, m8 psubw m1, m8 psubw m2, m8 psubw m3, m8 psubw m4, m8 psubw m5, m8 psubw m6, m8 psubw m7, m8 psllw m8, 3 psubw m9, m8 ; partial_sum_hv[0] paddw m8, m0, m1 paddw m10, m2, m3 paddw m8, m4 paddw m10, m5 paddw m8, m6 paddw m10, m7 paddw m8, m10 ; partial_sum_hv[1] pmaddwd m8, m8 pmaddwd m9, m9 phaddd m9, m8 SWAP m8, m9 MULLD m8, [div_table%+SUFFIX+48] pslldq m9, m1, 2 psrldq m10, m1, 14 pslldq m11, m2, 4 psrldq m12, m2, 12 pslldq m13, m3, 6 psrldq m14, m3, 10 paddw m9, m0 paddw m10, m12 paddw m11, m13 paddw m10, m14 ; partial_sum_diag[0] top/right half paddw m9, m11 ; partial_sum_diag[0] top/left half pslldq m11, m4, 8 psrldq m12, m4, 8 pslldq m13, m5, 10 psrldq m14, m5, 6 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m6, 12 psrldq m12, m6, 4 pslldq m13, m7, 14 psrldq m14, m7, 2 paddw m9, m11 paddw m10, m12 paddw m9, m13 ; partial_sum_diag[0][0-7] paddw m10, m14 ; partial_sum_diag[0][8-14,zero] pshufb m10, [shufw_6543210x] punpckhwd m11, m9, m10 punpcklwd m9, m10 pmaddwd m11, m11 pmaddwd m9, m9 MULLD m11, [div_table%+SUFFIX+16] MULLD m9, [div_table%+SUFFIX+0] paddd m9, m11 ; cost[0a-d] pslldq m10, m0, 14 psrldq m11, m0, 2 pslldq m12, m1, 12 psrldq m13, m1, 4 pslldq m14, m2, 10 psrldq m15, m2, 6 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 pslldq m12, m3, 8 psrldq m13, m3, 8 pslldq m14, m4, 6 psrldq m15, m4, 10 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 pslldq m12, m5, 4 psrldq m13, m5, 12 pslldq m14, m6, 2 psrldq m15, m6, 14 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 ; partial_sum_diag[1][8-14,zero] paddw m10, m7 ; partial_sum_diag[1][0-7] pshufb m11, [shufw_6543210x] punpckhwd m12, m10, m11 punpcklwd m10, m11 pmaddwd m12, m12 pmaddwd m10, m10 MULLD m12, [div_table%+SUFFIX+16] MULLD m10, [div_table%+SUFFIX+0] paddd m10, m12 ; cost[4a-d] phaddd m9, m10 ; cost[0a/b,4a/b] paddw m10, m0, m1 paddw m11, m2, m3 paddw m12, m4, m5 paddw m13, m6, m7 phaddw m0, m4 phaddw m1, m5 phaddw m2, 
m6 phaddw m3, m7 ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) pslldq m4, m11, 2 psrldq m5, m11, 14 pslldq m6, m12, 4 psrldq m7, m12, 12 pslldq m14, m13, 6 psrldq m15, m13, 10 paddw m4, m10 paddw m5, m7 paddw m4, m6 paddw m5, m15 ; partial_sum_alt[3] right paddw m4, m14 ; partial_sum_alt[3] left pshuflw m6, m5, q3012 punpckhwd m5, m4 punpcklwd m4, m6 pmaddwd m5, m5 pmaddwd m4, m4 MULLD m5, [div_table%+SUFFIX+48] MULLD m4, [div_table%+SUFFIX+32] paddd m4, m5 ; cost[7a-d] pslldq m5, m10, 6 psrldq m6, m10, 10 pslldq m7, m11, 4 psrldq m10, m11, 12 pslldq m11, m12, 2 psrldq m12, 14 paddw m5, m7 paddw m6, m10 paddw m5, m11 paddw m6, m12 paddw m5, m13 pshuflw m7, m6, q3012 punpckhwd m6, m5 punpcklwd m5, m7 pmaddwd m6, m6 pmaddwd m5, m5 MULLD m6, [div_table%+SUFFIX+48] MULLD m5, [div_table%+SUFFIX+32] paddd m5, m6 ; cost[5a-d] pslldq m6, m1, 2 psrldq m7, m1, 14 pslldq m10, m2, 4 psrldq m11, m2, 12 pslldq m12, m3, 6 psrldq m13, m3, 10 paddw m6, m0 paddw m7, m11 paddw m6, m10 paddw m7, m13 ; partial_sum_alt[3] right paddw m6, m12 ; partial_sum_alt[3] left pshuflw m10, m7, q3012 punpckhwd m7, m6 punpcklwd m6, m10 pmaddwd m7, m7 pmaddwd m6, m6 MULLD m7, [div_table%+SUFFIX+48] MULLD m6, [div_table%+SUFFIX+32] paddd m6, m7 ; cost[1a-d] pshufd m0, m0, q1032 pshufd m1, m1, q1032 pshufd m2, m2, q1032 pshufd m3, m3, q1032 pslldq m10, m0, 6 psrldq m11, m0, 10 pslldq m12, m1, 4 psrldq m13, m1, 12 pslldq m14, m2, 2 psrldq m2, 14 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m2 paddw m10, m3 pshuflw m12, m11, q3012 punpckhwd m11, m10 punpcklwd m10, m12 pmaddwd m11, m11 pmaddwd m10, m10 MULLD m11, [div_table%+SUFFIX+48] MULLD m10, [div_table%+SUFFIX+32] paddd m10, m11 ; cost[3a-d] phaddd m9, m8 ; cost[0,4,2,6] phaddd m6, m10 phaddd m5, m4 phaddd m6, m5 ; cost[1,3,5,7] pshufd m4, m9, q3120 ; now find the best cost %if cpuflag(sse4) pmaxsd m9, m6 pshufd m0, m9, q1032 pmaxsd m0, m9 pshufd m1, m0, q2301 pmaxsd m0, m1 ; best cost %else pcmpgtd m0, m9, m6 
pand m9, m0 pandn m0, m6 por m9, m0 pshufd m1, m9, q1032 pcmpgtd m0, m9, m1 pand m9, m0 pandn m0, m1 por m9, m0 pshufd m1, m9, q2301 pcmpgtd m0, m9, m1 pand m9, m0 pandn m0, m1 por m0, m9 %endif ; get direction and variance punpckhdq m1, m4, m6 punpckldq m4, m6 psubd m2, m0, m1 psubd m3, m0, m4 %if WIN64 WIN64_RESTORE_XMM %define tmp rsp+stack_offset+8 %else %define tmp rsp-40 %endif mova [tmp+0x00], m2 ; emulate ymm in stack mova [tmp+0x10], m3 pcmpeqd m1, m0 ; compute best cost mask pcmpeqd m4, m0 packssdw m4, m1 pmovmskb eax, m4 ; get byte-idx from mask tzcnt eax, eax mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [varq], r1d %else cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 %define base r2-shufw_6543210x LEA r2, shufw_6543210x pxor m0, m0 lea stride3q, [strideq*3] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] mova m1, [base+pw_128] psadbw m2, m5, m0 psadbw m3, m7, m0 packssdw m2, m3 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 mova [esp+0x00], m4 mova [esp+0x10], m5 mova [esp+0x20], m6 mova [esp+0x50], m7 lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] psadbw m3, m5, m0 psadbw m0, m7 packssdw m3, m0 pxor m0, m0 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 cglobal_label .main psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 packssdw m2, m3 psllw m1, 3 psubw m2, m1 ; partial_sum_hv[0] pmaddwd m2, m2 mova m3, [esp+0x50] mova m0, [esp+0x00] paddw m0, [esp+0x10] paddw m1, m3, [esp+0x20] paddw m0, m4 paddw m1, m5 paddw m0, m6 paddw m1, m7 paddw m0, m1 ; partial_sum_hv[1] pmaddwd m0, m0 phaddd m2, m0 MULLD m2, [base+div_table%+SUFFIX+48] mova [esp+0x30], m2 mova m1, [esp+0x10] pslldq m0, m1, 2 psrldq 
m1, 14 paddw m0, [esp+0x00] pslldq m2, m3, 6 psrldq m3, 10 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x20] pslldq m2, m3, 4 psrldq m3, 12 paddw m0, m2 ; partial_sum_diag[0] top/left half paddw m1, m3 ; partial_sum_diag[0] top/right half pslldq m2, m4, 8 psrldq m3, m4, 8 paddw m0, m2 paddw m1, m3 pslldq m2, m5, 10 psrldq m3, m5, 6 paddw m0, m2 paddw m1, m3 pslldq m2, m6, 12 psrldq m3, m6, 4 paddw m0, m2 paddw m1, m3 pslldq m2, m7, 14 psrldq m3, m7, 2 paddw m0, m2 ; partial_sum_diag[0][0-7] paddw m1, m3 ; partial_sum_diag[0][8-14,zero] mova m3, [esp+0x50] pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+16] MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; cost[0a-d] mova [esp+0x40], m0 mova m1, [esp+0x00] pslldq m0, m1, 14 psrldq m1, 2 paddw m0, m7 pslldq m2, m3, 8 psrldq m3, 8 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x20] pslldq m2, m3, 10 psrldq m3, 6 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x10] pslldq m2, m3, 12 psrldq m3, 4 paddw m0, m2 paddw m1, m3 pslldq m2, m4, 6 psrldq m3, m4, 10 paddw m0, m2 paddw m1, m3 pslldq m2, m5, 4 psrldq m3, m5, 12 paddw m0, m2 paddw m1, m3 pslldq m2, m6, 2 psrldq m3, m6, 14 paddw m0, m2 ; partial_sum_diag[1][0-7] paddw m1, m3 ; partial_sum_diag[1][8-14,zero] mova m3, [esp+0x50] pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+16] MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; cost[4a-d] phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] phaddd m1, [esp+0x30] ; cost[0,4,2,6] mova [esp+0x30], m1 phaddw m0, [esp+0x00], m4 phaddw m1, [esp+0x10], m5 paddw m4, m5 mova m2, [esp+0x20] paddw m5, m2, m3 phaddw m2, m6 paddw m6, m7 phaddw m3, m7 mova m7, [esp+0x00] paddw m7, [esp+0x10] mova [esp+0x00], m0 mova [esp+0x10], m1 mova [esp+0x20], m2 pslldq m1, m4, 4 pslldq m2, m6, 6 pslldq m0, m5, 2 paddw m1, m2 paddw m0, m7 psrldq m2, m5, 14 paddw m0, m1 ; partial_sum_alt[3] 
left psrldq m1, m4, 12 paddw m1, m2 psrldq m2, m6, 10 paddw m1, m2 ; partial_sum_alt[3] right pshuflw m1, m1, q3012 punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m2 ; cost[7a-d] mova [esp+0x40], m0 pslldq m0, m7, 6 psrldq m7, 10 pslldq m1, m5, 4 psrldq m5, 12 pslldq m2, m4, 2 psrldq m4, 14 paddw m0, m6 paddw m7, m5 paddw m0, m1 paddw m7, m4 paddw m0, m2 pshuflw m2, m7, q3012 punpckhwd m7, m0 punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 MULLD m7, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[5a-d] mova [esp+0x50], m0 mova m7, [esp+0x10] mova m2, [esp+0x20] pslldq m0, m7, 2 psrldq m7, 14 pslldq m4, m2, 4 psrldq m2, 12 pslldq m5, m3, 6 psrldq m6, m3, 10 paddw m0, [esp+0x00] paddw m7, m2 paddw m4, m5 paddw m7, m6 ; partial_sum_alt[3] right paddw m0, m4 ; partial_sum_alt[3] left pshuflw m2, m7, q3012 punpckhwd m7, m0 punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 MULLD m7, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[1a-d] SWAP m0, m4 pshufd m0, [esp+0x00], q1032 pshufd m1, [esp+0x10], q1032 pshufd m2, [esp+0x20], q1032 pshufd m3, m3, q1032 mova [esp+0x00], m4 pslldq m4, m0, 6 psrldq m0, 10 pslldq m5, m1, 4 psrldq m1, 12 pslldq m6, m2, 2 psrldq m2, 14 paddw m4, m3 paddw m0, m1 paddw m5, m6 paddw m0, m2 paddw m4, m5 pshuflw m2, m0, q3012 punpckhwd m0, m4 punpcklwd m4, m2 pmaddwd m0, m0 pmaddwd m4, m4 MULLD m0, [base+div_table%+SUFFIX+48] MULLD m4, [base+div_table%+SUFFIX+32] paddd m4, m0 ; cost[3a-d] mova m1, [esp+0x00] mova m2, [esp+0x50] mova m0, [esp+0x30] ; cost[0,4,2,6] phaddd m1, m4 phaddd m2, [esp+0x40] ; cost[1,3,5,7] phaddd m1, m2 pshufd m2, m0, q3120 ; now find the best cost %if cpuflag(sse4) pmaxsd m0, m1 pshufd m3, m0, q1032 pmaxsd m3, m0 pshufd m0, m3, q2301 pmaxsd m0, m3 %else pcmpgtd m3, m0, m1 pand m0, m3 pandn m3, m1 por m0, m3 pshufd m4, m0, q1032 pcmpgtd m3, m0, m4 
pand m0, m3 pandn m3, m4 por m0, m3 pshufd m4, m0, q2301 pcmpgtd m3, m0, m4 pand m0, m3 pandn m3, m4 por m0, m3 %endif ; get direction and variance mov vard, varm punpckhdq m3, m2, m1 punpckldq m2, m1 psubd m1, m0, m3 psubd m4, m0, m2 mova [esp+0x00], m1 ; emulate ymm in stack mova [esp+0x10], m4 pcmpeqd m3, m0 ; compute best cost mask pcmpeqd m2, m0 packssdw m2, m3 pmovmskb eax, m2 ; get byte-idx from mask tzcnt eax, eax mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [vard], r1d %endif RET %endmacro INIT_XMM sse4 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM ssse3 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM sse2 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cpu.c000066400000000000000000000071321517466257200222160ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include "common/attributes.h" #include "src/cpu.h" #include "src/x86/cpu.h" typedef struct { uint32_t eax, ebx, edx, ecx; } CpuidRegisters; void dav2d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf); uint64_t dav2d_cpu_xgetbv(unsigned xcr); #define X(reg, mask) (((reg) & (mask)) == (mask)) COLD unsigned dav2d_get_cpu_flags_x86(void) { union { CpuidRegisters r; struct { uint32_t max_leaf; char vendor[12]; }; } cpu; dav2d_cpu_cpuid(&cpu.r, 0, 0); unsigned flags = dav2d_get_default_cpu_flags(); if (cpu.max_leaf >= 1) { CpuidRegisters r; dav2d_cpu_cpuid(&r, 1, 0); const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff); if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ { flags |= DAV2D_X86_CPU_FLAG_SSE2; if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ { flags |= DAV2D_X86_CPU_FLAG_SSSE3; if (X(r.ecx, 0x00080000)) /* SSE4.1 */ flags |= DAV2D_X86_CPU_FLAG_SSE41; } } #if ARCH_X86_64 /* We only support >128-bit SIMD on x86-64. 
*/ if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ { const uint64_t xcr0 = dav2d_cpu_xgetbv(0); if (X(xcr0, 0x00000006)) /* XMM/YMM */ { if (cpu.max_leaf >= 7) { dav2d_cpu_cpuid(&r, 7, 0); if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ { flags |= DAV2D_X86_CPU_FLAG_AVX2; if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ { if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42)) flags |= DAV2D_X86_CPU_FLAG_AVX512ICL; } } } } } #endif if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) { if ((flags & DAV2D_X86_CPU_FLAG_AVX2) && family <= 0x19) { /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+, Zen 4 */ flags |= DAV2D_X86_CPU_FLAG_SLOW_GATHER; } } } return flags; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cpu.h000066400000000000000000000041011517466257200222140ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef DAV2D_SRC_X86_CPU_H
#define DAV2D_SRC_X86_CPU_H

/* x86 CPU capability bits. Each enumerator is a distinct single bit so the
 * values can be OR'ed together into one unsigned mask. */
enum CpuFlags {
    DAV2D_X86_CPU_FLAG_SSE2        = 1 << 0,
    DAV2D_X86_CPU_FLAG_SSSE3       = 1 << 1,
    DAV2D_X86_CPU_FLAG_SSE41       = 1 << 2,
    DAV2D_X86_CPU_FLAG_AVX2        = 1 << 3,
    DAV2D_X86_CPU_FLAG_AVX512ICL   = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/
                                              * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */
    DAV2D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough
                                              * to cause performance regressions. */
};

/* Returns a mask of the enum CpuFlags bits above for the running CPU. */
unsigned dav2d_get_cpu_flags_x86(void);

#endif /* DAV2D_SRC_X86_CPU_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/cpuid.asm000066400000000000000000000036161517466257200230740ustar00rootroot00000000000000; Copyright © 2018, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution.
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION .text

; cpu_cpuid(regs, leaf, subleaf)
; Executes CPUID with eax=leaf, ecx=subleaf and stores the four result
; registers as consecutive dwords at [regs] in the order eax, ebx, edx, ecx.
; NOTE(review): the cglobal operands presumably follow x86inc's
; "name, args, regs, xmm regs" order - confirm against ext/x86/x86inc.asm.
cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf
    mov        r4, regsmp       ; keep the output pointer in r4 across cpuid
    mov       eax, leafm
    mov       ecx, subleafm
%if ARCH_X86_64
    mov        r5, rbx          ; cpuid overwrites ebx; park rbx in r5
%endif
    cpuid
    mov  [r4+4*0], eax
    mov  [r4+4*1], ebx
    mov  [r4+4*2], edx          ; edx is stored before ecx
    mov  [r4+4*3], ecx
%if ARCH_X86_64
    mov       rbx, r5           ; put the caller's rbx back
%endif
    RET

; cpu_xgetbv(xcr)
; Executes XGETBV with ecx=xcr. On x86-64 the edx:eax result pair is merged
; into rax; on x86-32 the pair is left in edx:eax as produced by xgetbv.
cglobal cpu_xgetbv, 0, 0, 0, xcr
    movifnidn ecx, xcrm         ; load xcr into ecx unless it is already there
    xgetbv
%if ARCH_X86_64
    shl       rdx, 32
    or        rax, rdx          ; rax = (rdx << 32) | rax
%endif
    RET
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/deblock.h000066400000000000000000000042551517466257200230420ustar00rootroot00000000000000/* * Copyright © 2018-2021, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" #include "src/deblock.h" #define decl_deblock_sb_fns(ext) \ decl_deblock_sb_fn(BF(dav2d_deblock_h_sb_y, ext)); \ decl_deblock_sb_fn(BF(dav2d_deblock_v_sb_y, ext)); \ decl_deblock_sb_fn(BF(dav2d_deblock_h_sb_uv, ext)); \ decl_deblock_sb_fn(BF(dav2d_deblock_v_sb_uv, ext)) decl_deblock_sb_fns(avx2); static ALWAYS_INLINE void deblock_dsp_init_x86(Dav2dDeblockDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); #if ARCH_X86_64 #if BITDEPTH == 8 if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->deblock_sb[0][0] = BF(dav2d_deblock_h_sb_y, avx2); c->deblock_sb[0][1] = BF(dav2d_deblock_v_sb_y, avx2); c->deblock_sb[1][0] = BF(dav2d_deblock_h_sb_uv, avx2); c->deblock_sb[1][1] = BF(dav2d_deblock_v_sb_uv, avx2); #endif #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/deblock_avx2.asm000066400000000000000000001322051517466257200243300ustar00rootroot00000000000000; Copyright © 2018-2026, VideoLAN and dav2d authors ; Copyright © 2018-2026, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. 
Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 32

; One distinct bit per dword lane; ANDed with a broadcast mask byte and
; compared for equality to turn mask bits into per-lane all-ones/all-zeros.
pd_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
; Swaps the two words inside every dword.
rev_shuf: times 2 db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
; Picks bytes 0,4,8,12 then 3,7,11,15 of each 128-bit lane, zero-extended to words.
side_shuf_zxbw: db 0, -1, 4, -1, 8, -1, 12, -1, 3, -1, 7, -1, 11, -1, 15, -1
; Word lookup tables indexed through pshufb by the per-segment filter-size
; index computed in FILTER (entry 0 = no filtering; the last entry is unused
; padding). NOTE(review): the exact index -> filter width mapping is inferred
; from the number of non-zero mulN entries per column - confirm against the
; C reference.
qthr_mul_lut: dw 0, 32, 25, 19, 19, 18, 17, 0
base_mul_lut: dw 0, 1360, 816, 592, 448, 320, 240, 0
base_mul_lut_y_edge: dw 0, 1360, 816, 592, 448, 320, 320, 0
; mulN_lut holds the multiplier for the N-th pixel away from the edge; each
; table equals the previous one minus base_mul_lut (saturating at 0).
mul1_lut: dw 0, 1360, 816*2, 592*3, 448*4, 320*6, 240*8, 0
mul2_lut: dw 0, 0, 816*1, 592*2, 448*3, 320*5, 240*7, 0
mul3_lut: dw 0, 0, 0, 592*1, 448*2, 320*4, 240*6, 0
mul4_lut: dw 0, 0, 0, 0, 448*1, 320*3, 240*5, 0
mul5_lut: dw 0, 0, 0, 0, 0, 320*2, 240*4, 0
mul6_lut: dw 0, 0, 0, 0, 0, 320*1, 240*3, 0
mul7_lut: dw 0, 0, 0, 0, 0, 0, 240*2, 0
mul8_lut: dw 0, 0, 0, 0, 0, 0, 240*1, 0
mul1_lut_uv_edge: dw 0, 1360, 816*2, 816*2, 816*2, 816*2, 816*2, 0
mul2_lut_uv_edge: dw 0, 0, 816*1, 816*1, 816*1, 816*1, 816*1, 0
mul1_lut_y_edge: dw 0, 1360, 816*2, 592*3, 448*4, 320*6, 320*6, 0
; Byte version of side_shuf_zxbw (no zero extension).
side_shuf: db 0, 4, 8, 12, 3, 7, 11, 15
mul_lut_offset: times 2 db 0, 1
; Packed byte-pair coefficients for pmaddubsw.
pb_12_m4: times 2 db 12, -4
pb_1_m2: times 2 db 1, -2
pb_m2_1: times 2 db -2, 1
pb_m2_3: times 2 db -2, 3
pb_m3_4: times 2 db -3, 4
pb_m5_6: times 2 db -5, 6
pb_m6_7: times 2 db -6, 7
pb_128: times 4 db 0x80
pw_12288: times 2 dw 12288
pw_24576: times 2 dw 24576
pw_1360: times 2 dw 1360
pw_4_3: dw 4, 3
pw_45: dw 45, 45
pw_45_40: dw 45, 40

SECTION .text

; Interleaves the bytes of m%1 and m%2 and writes the resulting 2-byte pairs
; to 32 consecutive rows starting at [dstq-1].
; %1, %2: source register numbers (m%1 is clobbered), %3: scratch register.
; On exit dstq has advanced by 32 rows (the -1/+1 column offsets cancel).
%macro TRANSPOSE_16x2_AND_WRITE_2x32 3
    ; transpose 16x2
    punpcklbw m%3, m%1, m%2
    punpckhbw m%1, m%2
    ; write out
    pextrw [dstq+strideq*0-1], xm%3, 0
    pextrw [dstq+strideq*1-1], xm%3, 1
    pextrw [dstq+strideq*2-1], xm%3, 2
    pextrw [dstq+stride3q-1], xm%3, 3
    lea dstq, [dstq+strideq*4-1]
    pextrw [dstq+strideq*0], xm%3, 4
    pextrw [dstq+strideq*1], xm%3, 5
    pextrw [dstq+strideq*2], xm%3, 6
    pextrw [dstq+stride3q], xm%3, 7
    lea dstq, [dstq+strideq*4]
    pextrw [dstq+strideq*0], xm%1, 0
    pextrw [dstq+strideq*1], xm%1, 1
    pextrw [dstq+strideq*2], xm%1, 2
    pextrw [dstq+stride3q], xm%1, 3
    lea dstq, [dstq+strideq*4]
    pextrw [dstq+strideq*0], xm%1, 4
    pextrw [dstq+strideq*1], xm%1, 5
    pextrw [dstq+strideq*2], xm%1, 6
    pextrw [dstq+stride3q], xm%1, 7
    lea dstq, [dstq+strideq*4]
    ; upper 128-bit halves hold rows 16-31
    vextracti128 xm%3, m%3, 1
    vextracti128 xm%1, m%1, 1
    pextrw [dstq+strideq*0], xm%3, 0
    pextrw [dstq+strideq*1], xm%3, 1
    pextrw [dstq+strideq*2], xm%3, 2
    pextrw [dstq+stride3q], xm%3, 3
    lea dstq, [dstq+strideq*4]
    pextrw [dstq+strideq*0], xm%3, 4
    pextrw [dstq+strideq*1], xm%3, 5
    pextrw [dstq+strideq*2], xm%3, 6
    pextrw [dstq+stride3q], xm%3, 7
    lea dstq, [dstq+strideq*4]
    pextrw [dstq+strideq*0], xm%1, 0
    pextrw [dstq+strideq*1], xm%1, 1
    pextrw [dstq+strideq*2], xm%1, 2
    pextrw [dstq+stride3q], xm%1, 3
    lea dstq, [dstq+strideq*4]
    pextrw [dstq+strideq*0], xm%1, 4
    pextrw [dstq+strideq*1], xm%1, 5
    pextrw [dstq+strideq*2], xm%1, 6
    pextrw [dstq+stride3q], xm%1, 7
    lea dstq, [dstq+strideq*4+1]
%endmacro

; Transposes eight 32-byte column registers (m%1..m%8) and writes 8 bytes per
; row to 32 consecutive rows starting at [dstq-4]. %9 is a scratch register.
; All nine registers are clobbered; dstq advances by 32 rows.
%macro TRANSPOSE_16x8_AND_WRITE_8x32 9
    ; 16x8 transpose
    punpcklbw m%9, m%1, m%2
    punpckhbw m%1, m%2
    punpcklbw m%2, m%3, m%4
    punpckhbw m%3, m%4
    punpcklbw m%4, m%5, m%6
    punpckhbw m%5, m%6
    punpcklbw m%6, m%7, m%8
    punpckhbw m%7, m%8
    punpcklwd m%8, m%9, m%2
    punpckhwd m%9, m%2
    punpcklwd m%2, m%1, m%3
    punpckhwd m%1, m%3
    punpcklwd m%3, m%4, m%6
    punpckhwd m%4, m%6
    punpcklwd m%6, m%5, m%7
    punpckhwd m%5, m%7
    punpckldq m%7, m%8, m%3
    punpckhdq m%8, m%3
    punpckldq m%3, m%9, m%4
    punpckhdq m%9, m%4
    punpckldq m%4, m%2, m%6
    punpckhdq m%2, m%6
    punpckldq m%6, m%1, m%5
    punpckhdq m%1, m%5
    ; write 8x32
    movq [dstq+strideq*0-4], xm%7
    movhps [dstq+strideq*1-4], xm%7
    movq [dstq+strideq*2-4], xm%8
    movhps [dstq+stride3q -4], xm%8
    lea dstq, [dstq+strideq*4-4]
    movq [dstq+strideq*0], xm%3
    movhps [dstq+strideq*1], xm%3
    movq [dstq+strideq*2], xm%9
    movhps [dstq+stride3q ], xm%9
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm%4
    movhps [dstq+strideq*1], xm%4
    movq [dstq+strideq*2], xm%2
    movhps [dstq+stride3q ], xm%2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm%6
    movhps [dstq+strideq*1], xm%6
    movq [dstq+strideq*2], xm%1
    movhps [dstq+stride3q ], xm%1
    lea dstq, [dstq+strideq*4]
    ; upper 128-bit halves hold rows 16-31
    vextracti128 xm%7, m%7, 1
    vextracti128 xm%8, m%8, 1
    vextracti128 xm%3, m%3, 1
    vextracti128 xm%9, m%9, 1
    vextracti128 xm%4, m%4, 1
    vextracti128 xm%2, m%2, 1
    vextracti128 xm%6, m%6, 1
    vextracti128 xm%1, m%1, 1
    movq [dstq+strideq*0], xm%7
    movhps [dstq+strideq*1], xm%7
    movq [dstq+strideq*2], xm%8
    movhps [dstq+stride3q ], xm%8
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm%3
    movhps [dstq+strideq*1], xm%3
    movq [dstq+strideq*2], xm%9
    movhps [dstq+stride3q ], xm%9
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm%4
    movhps [dstq+strideq*1], xm%4
    movq [dstq+strideq*2], xm%2
    movhps [dstq+stride3q ], xm%2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm%6
    movhps [dstq+strideq*1], xm%6
    movq [dstq+strideq*2], xm%1
    movhps [dstq+stride3q ], xm%1
    lea dstq, [dstq+strideq*4+4]
%endmacro

; 16x16 byte transpose (per 128-bit lane) of m0-m15, using one memory slot
; (%3) as the 17th register.
; %1 == 0: m15 is live on entry and is spilled to %3 first;
; %1 != 0: the caller has already placed input row 15 in %3.
; %2 == 0: output row 0 is reloaded into a register on exit;
; %2 != 0: output row 0 is left in %3.
; The trailing SWAPs rename registers so the outputs are numbered in order.
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    mova %3, m15
%endif
    ; input in m0-15
    punpcklbw m15, m0, m1
    punpckhbw m0, m1
    punpcklbw m1, m2, m3
    punpckhbw m2, m3
    punpcklbw m3, m4, m5
    punpckhbw m4, m5
    punpcklbw m5, m6, m7
    punpckhbw m6, m7
    punpcklbw m7, m8, m9
    punpckhbw m8, m9
    punpcklbw m9, m10, m11
    punpckhbw m10, m11
    punpcklbw m11, m12, m13
    punpckhbw m12, m13
    mova m13, %3
    mova %3, m12
    punpcklbw m12, m14, m13
    punpckhbw m13, m14, m13
    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
    punpcklwd m14, m15, m1
    punpckhwd m15, m1
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    punpcklwd m2, m3, m5
    punpckhwd m3, m5
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    punpcklwd m6, m7, m9
    punpckhwd m7, m9
    punpcklwd m9, m8, m10
    punpckhwd m8, m10
    punpcklwd m10, m11, m12
    punpckhwd m11, m12
    mova m12, %3
    mova %3, m11
    punpcklwd m11, m12, m13
    punpckhwd m12, m13
    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
    punpckldq m13, m14, m2
    punpckhdq m14, m2
    punpckldq m2, m15, m3
    punpckhdq m15, m3
    punpckldq m3, m1, m5
    punpckhdq m1, m5
    punpckldq m5, m0, m4
    punpckhdq m0, m4
    punpckldq m4, m6, m10
    punpckhdq m6, m10
    punpckldq m10, m9, m11
    punpckhdq m9, m11
    punpckldq m11, m8, m12
    punpckhdq m8, m12
    mova m12, %3
    mova %3, m8
    punpckldq m8, m7, m12
    punpckhdq m7, m12
    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
    punpcklqdq m12, m13, m4
    punpckhqdq m13, m4
    punpcklqdq m4, m14, m6
    punpckhqdq m14, m6
    punpcklqdq m6, m2, m8
    punpckhqdq m2, m8
    punpcklqdq m8, m15, m7
    punpckhqdq m15, m7
    punpcklqdq m7, m3, m10
    punpckhqdq m3, m10
    punpcklqdq m10, m1, m9
    punpckhqdq m1, m9
    punpcklqdq m9, m5, m11
    punpckhqdq m5, m11
    mova m11, %3
    mova %3, m12
    punpcklqdq m12, m0, m11
    punpckhqdq m0, m11
%if %2 == 0
    mova m11, %3
%endif
    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
    SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
    SWAP 3, 14, 12, 9
%endmacro

; Writes m0-m15 as 32 rows of 16 bytes starting at [dstq-8]: the low 128-bit
; halves go to rows 0-15 and the high halves to rows 16-31.
; dstq advances by 32 rows (the -8/+8 column offsets cancel).
%macro STORE_16X16 0
    movu [dstq+strideq*0-8], xm0
    movu [dstq+strideq*1-8], xm1
    movu [dstq+strideq*2-8], xm2
    movu [dstq+stride3q -8], xm3
    lea dstq, [dstq+strideq*4-8]
    movu [dstq+strideq*0], xm4
    movu [dstq+strideq*1], xm5
    movu [dstq+strideq*2], xm6
    movu [dstq+stride3q ], xm7
    lea dstq, [dstq+strideq*4]
    movu [dstq+strideq*0], xm8
    movu [dstq+strideq*1], xm9
    movu [dstq+strideq*2], xm10
    movu [dstq+stride3q ], xm11
    lea dstq, [dstq+strideq*4]
    movu [dstq+strideq*0], xm12
    movu [dstq+strideq*1], xm13
    movu [dstq+strideq*2], xm14
    movu [dstq+stride3q ], xm15
    lea dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m0, 1
    vextracti128 [dstq+strideq*1], m1, 1
    vextracti128 [dstq+strideq*2], m2, 1
    vextracti128 [dstq+stride3q ], m3, 1
    lea dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m4, 1
    vextracti128 [dstq+strideq*1], m5, 1
    vextracti128 [dstq+strideq*2], m6, 1
    vextracti128 [dstq+stride3q ], m7, 1
    lea dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m8, 1
    vextracti128 [dstq+strideq*1], m9, 1
    vextracti128 [dstq+strideq*2], m10, 1
    vextracti128 [dstq+stride3q ], m11, 1
    lea dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m12, 1
    vextracti128 [dstq+strideq*1], m13, 1
    vextracti128 [dstq+strideq*2], m14, 1
    vextracti128 [dstq+stride3q ], m15, 1
    lea dstq, [dstq+strideq*4+8]
%endmacro

;
; Compute:
;   pos_pix -= diff
;   neg_pix += diff
; except when masked out by the associated lossless mask
; diff_lo/hi should be within [-128, 127]. This can be used for everything,
; but the first line of pixels. This can be verified by plugging in the max
; values of q_thr into (=>) the clamp for delta_m2 => delta_m2 => diff.
; For context, the max q_thr is 120 with bitdepth of 8.
; Expects m15 to hold 0x80 in every byte (pixels are biased to signed range
; around the saturating add/sub). Clobbers %3 and %4.
%macro SUB_ADD_DIFF 6 ; pos_pix, neg_pix, diff_lo, diff_hi,
                      ; pos_ll_mask, neg_ll_mask
    packsswb %3, %4
    pxor %1, m15
    pxor %2, m15
    pandn %4, %5, %3
    pandn %3, %6, %3
    psubsb %1, %4
    paddsb %2, %3
    pxor %1, m15
    pxor %2, m15
%endmacro

; Perform the same operation as above, but with support for larger diffs.
; Also, no masking and doesn't modify pixels in place.
; Expects m15 to be zero (used to widen bytes to words).
%macro SUB_ADD_DIFF_LARGE 7 ; pos_dst, neg_dst, pos_src, neg_src,
                            ; diff_lo, diff_hi, tmp
    punpcklbw %1, %3, m15
    punpckhbw %2, %3, m15
    psubw %1, %5
    psubw %2, %6
    packuswb %1, %2
    punpcklbw %2, %4, m15
    punpckhbw %7, %4, m15
    paddw %2, %5
    paddw %7, %6
    packuswb %2, %7
%endmacro

; Filters one 32-pixel-long edge.
; %1: maximum filter width handled by this instantiation, %2: direction,
; %3: non-zero selects the edge variant (positive and negative sides use
; different multiplier tables).
; Reads dstq, strideq, stride3q, maskq, ll_maskq, q_thrq and side_thrq and
; uses tmpq (and nd in the looped paths) as scratch. The v direction also
; needs mstrideq and leaves dstq unchanged; the h direction uses the stack
; slots [rsp+N*32] (except for width 1) and leaves dstq advanced by 32 rows.
; All of m0-m15 are clobbered.
%macro FILTER 3 ; width [1/3/4/6/8], dir [h/v], is_edge [0,1]
%assign is_edge %3
%assign is_chroma_edge is_edge && (%1 == 3 || %1 == 4)
    ; load data
%ifidn %2, v
    ; load 6-8 pixels, remainder will be read inline
    lea tmpq, [dstq+mstrideq*4]
    mova m0, [dstq+strideq*0] ; 0
    mova m1, [tmpq+stride3q] ; -1
    mova m2, [dstq+strideq*1] ; 1
    mova m3, [tmpq+strideq*2] ; -2
    mova m4, [dstq+strideq*2] ; 2
    mova m5, [tmpq+strideq*1] ; -3
%if %1 >= 3
    mova m6, [dstq+stride3q] ; 3
%if !is_chroma_edge
    mova m7, [tmpq+strideq*0] ; -4
%endif
%endif
%else
    ; load lines
%if %1 <= 3
    ; TODO: for w == 1, we need 6 cols for rows 0,3, 4,7, 8,11 but, we only
    ;       need the 4 cols of rows 1,2, 5,6, 9,10... Optimize the transpose
    ;       for this
%assign off %1 == 1 ? 3 : %1 + 1
    movq xm3, [dstq+strideq*0-off]
    movq xm4, [dstq+strideq*1-off]
    movq xm5, [dstq+strideq*2-off]
    movq xm6, [dstq+stride3q -off]
    lea tmpq, [dstq+strideq*8-off]
    movhps xm3, [tmpq+strideq*0]
    movhps xm4, [tmpq+strideq*1]
    movhps xm5, [tmpq+strideq*2]
    movhps xm6, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*8]
    movq xm7, [tmpq+strideq*0]
    movq xm8, [tmpq+strideq*1]
    movq xm9, [tmpq+strideq*2]
    movq xm11, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*8]
    movhps xm7, [tmpq+strideq*0]
    movhps xm8, [tmpq+strideq*1]
    movhps xm9, [tmpq+strideq*2]
    movhps xm11, [tmpq+stride3q ]
    vinserti128 m3, xm7, 1
    vinserti128 m4, xm8, 1
    vinserti128 m5, xm9, 1
    vinserti128 m6, xm11, 1
    lea tmpq, [dstq+strideq*4-off]
    movq xm12, [tmpq+strideq*0]
    movq xm13, [tmpq+strideq*1]
    movq xm14, [tmpq+strideq*2]
    movq xm15, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*8]
    movhps xm12, [tmpq+strideq*0]
    movhps xm13, [tmpq+strideq*1]
    movhps xm14, [tmpq+strideq*2]
    movhps xm15, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*8]
    movq xm7, [tmpq+strideq*0]
    movq xm8, [tmpq+strideq*1]
    movq xm9, [tmpq+strideq*2]
    movq xm11, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*8]
    movhps xm7, [tmpq+strideq*0]
    movhps xm8, [tmpq+strideq*1]
    movhps xm9, [tmpq+strideq*2]
    movhps xm11, [tmpq+stride3q ]
    vinserti128 m12, xm7, 1
    vinserti128 m13, xm8, 1
    vinserti128 m14, xm9, 1
    vinserti128 m15, xm11, 1
    ; transpose 8x16
    ; xm3: A-H0,A-H8
    ; xm4: A-H1,A-H9
    ; xm5: A-H2,A-H10
    ; xm6: A-H3,A-H11
    ; xm12: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklbw m7, m3, m4
    punpckhbw m3, m4
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    punpcklbw m6, m12, m13
    punpckhbw m12, m13
    punpcklbw m13, m14, m15
    punpckhbw m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd m15, m7, m4
    punpckhwd m7, m4
    punpcklwd m4, m3, m5
    punpckhwd m3, m5
    punpcklwd m5, m6, m13
    punpckhwd m6, m13
    punpcklwd m13, m12, m14
    punpckhwd m12, m14
    ; xm15: A0-3,B0-3,C0-3,D0-3
    ; xm7: E0-3,F0-3,G0-3,H0-3
    ; xm4: A8-11,B8-11,C8-11,D8-11
    ; xm3: E8-11,F8-11,G8-11,H8-11
    ; xm5: A4-7,B4-7,C4-7,D4-7
    ; xm6: E4-7,F4-7,G4-7,H4-7
    ; xm13: A12-15,B12-15,C12-15,D12-15
    ; xm12: E12-15,F12-15,G12-15,H12-15
    punpckldq m14, m15, m5
    punpckhdq m15, m5
    punpckldq m5, m7, m6
%if %1 == 3
    punpckhdq m7, m6
%endif
    punpckldq m6, m4, m13
    punpckhdq m4, m13
    punpckldq m13, m3, m12
%if %1 == 3
    punpckhdq m12, m3, m12
%endif
    ; xm14: A0-7,B0-7
    ; xm15: C0-7,D0-7
    ; xm5: E0-7,F0-7
    ; xm7: G0-7,H0-7
    ; xm6: A8-15,B8-15
    ; xm4: C8-15,D8-15
    ; xm13: E8-15,F8-15
    ; xm12: G8-15,H8-15
    punpcklqdq m3, m14, m6
    punpckhqdq m14, m6
    punpckhqdq m6, m15, m4
    punpcklqdq m15, m4
    punpcklqdq m4, m5, m13
    punpckhqdq m13, m5, m13
%if %1 == 3
    punpcklqdq m5, m7, m12
    punpckhqdq m12, m7, m12
    ; xm3: A0-15
    ; xm14: B0-15
    ; xm15: C0-15
    ; xm6: D0-15
    ; xm4: E0-15
    ; xm13: F0-15
    ; xm5: G0-15
    ; xm12: H0-15
    SWAP 2, 13
    SWAP 6, 12, 1
    SWAP 3, 15, 7
    SWAP 4, 5, 14, 0
    ; 3,14,15,6,4,13,5,12 -> 7,5,3,1,0,2,4,6
    mova [rsp+7*32], m7
    mova [rsp+6*32], m6
%else
    SWAP 0, 6
    SWAP 1, 15
    SWAP 4, 13, 2
    SWAP 3, 14, 5
    ; 3,14,15,6,4,13 -> 5,3,1,0,2,4
%endif
%else
    ; load and 16x16 transpose. Don't always use all pixels but we'll need the
    ; remainder at the end for the second transpose
    ; TODO: w == 4 only requires 8 pixels for output and could optimize this
    ;       transpose
    movu xm0, [dstq+strideq*0-8]
    movu xm1, [dstq+strideq*1-8]
    movu xm2, [dstq+strideq*2-8]
    movu xm3, [dstq+stride3q -8]
    lea tmpq, [dstq+strideq*4-8]
    movu xm4, [tmpq+strideq*0]
    movu xm5, [tmpq+strideq*1]
    movu xm6, [tmpq+strideq*2]
    movu xm7, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*4]
    movu xm8, [tmpq+strideq*0]
    movu xm9, [tmpq+strideq*1]
    movu xm10, [tmpq+strideq*2]
    movu xm11, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*4]
    movu xm12, [tmpq+strideq*0]
    movu xm13, [tmpq+strideq*1]
    movu xm14, [tmpq+strideq*2]
    movu xm15, [tmpq+stride3q ]
    lea tmpq, [tmpq+strideq*4]
    vinserti128 m0, [tmpq+strideq*0], 1
    vinserti128 m1, [tmpq+strideq*1], 1
    vinserti128 m2, [tmpq+strideq*2], 1
    vinserti128 m3, [tmpq+stride3q ], 1
    lea tmpq, [tmpq+strideq*4]
    vinserti128 m4, [tmpq+strideq*0], 1
    vinserti128 m5, [tmpq+strideq*1], 1
    vinserti128 m6, [tmpq+strideq*2], 1
    vinserti128 m7, [tmpq+stride3q ], 1
    lea tmpq, [tmpq+strideq*4]
    vinserti128 m8, [tmpq+strideq*0], 1
    vinserti128 m9, [tmpq+strideq*1], 1
    vinserti128 m10, [tmpq+strideq*2], 1
    vinserti128 m11, [tmpq+stride3q ], 1
    lea tmpq, [tmpq+strideq*4]
    vinserti128 m12, [tmpq+strideq*0], 1
    vinserti128 m13, [tmpq+strideq*1], 1
    vinserti128 m14, [tmpq+strideq*2], 1
    vinserti128 m15, [tmpq+stride3q ], 1
%if %1 >= 6
    TRANSPOSE_16X16B 0, 1, [rsp+15*32]
    mova [rsp+13*32], m1
    mova [rsp+11*32], m2
    mova [rsp+9*32], m3
    mova [rsp+7*32], m4
    mova [rsp+6*32], m11
    mova [rsp+8*32], m12
    mova [rsp+10*32], m13
    mova [rsp+12*32], m14
    mova [rsp+14*32], m15
%else
    TRANSPOSE_16X16B 0, 0, [rsp+0*32]
    mova [rsp+9*32], m3
    mova [rsp+7*32], m4
    mova [rsp+6*32], m11
    mova [rsp+8*32], m12
%endif
    ; 4,5,6,7,8,9,10,11 -> 7,5,3,1,0,2,4,6
    SWAP 0, 8
    SWAP 2, 9
    SWAP 6, 11, 3
    SWAP 7, 4, 10, 1
%endif
%endif
    ; From here on, even registers/stack slots hold the positive side of the
    ; edge (m0 = 0, m2 = +1, m4 = +2, ...) and odd ones the negative side
    ; (m1 = -1, m3 = -2, m5 = -3, ...).

    ; Compute derivatives and transition
    ; To select filter length, only the sides of each x4 segment are needed.
    ; i.e. 0 _ _ 3 4 _ _ 7 8 _ _ 11 12 _ _ 15 | 16 _ _ ...
    vpbroadcastq m14, [side_shuf]
    vbroadcasti128 m15, [side_shuf_zxbw]
    ; s[0-3][0], t[0-3][0]
    ; where s and t are the respective sides
    pshufb m12, m0, m14
    ; s[0-3][1], t[0-3][1]
    pshufb m11, m2, m14
    ; lo 64-bit: s[0], s[1]
    ; hi 64-bit: t[0], t[1]
    punpcklbw m12, m11
    ; s[0-3][-1], t[0-3][-1]
    pshufb m13, m1, m14
    ; s[0-3][-2], t[0-3][-2]
    pshufb m11, m3, m14
    ; lo 64-bit: s[-1], s[-2]
    ; hi 64-bit: t[-1], t[-2]
    punpcklbw m13, m11
    ; Compute the second derivative and average the results for each side
    vpbroadcastd m14, [pb_1_m2]
    pmaddubsw m10, m12, m14
    pmaddubsw m14, m13, m14
    pshufb m8, m4, m15
    pshufb m9, m5, m15
    paddw m8, m10
    paddw m9, m14
    ; lo: abs(s[0] - 2 * s[1] + s[2])
    ; hi: abs(t[0] - 2 * t[1] + t[2])
    pabsw m8, m8
    ; lo: abs(s[-1] - 2 * s[-2] + s[-3])
    ; hi: abs(t[-1] - 2 * t[-2] + t[-3])
    pabsw m9, m9
    punpcklwd m14, m8, m9
    punpckhwd m8, m9
    ; second_deriv[1], second_deriv[-2]
    pavgw m8, m14
%if %1 >= 3
    ; compute transition
    vpbroadcastd m14, [pb_m2_1]
    pmaddubsw m9, m12, m14
    pmaddubsw m14, m13, m14
    pshufb m11, m1, m15
    pshufb m10, m0, m15
    paddw m9, m11
    paddw m10, m14
    ; lo: abs(s[-1] - 2 * s[0] + s[1])
    ; hi: abs(t[-1] - 2 * t[0] + t[1])
    pabsw m9, m9
    ; lo: abs(s[0] - 2 * s[-1] + s[-2])
    ; hi: abs(t[0] - 2 * t[-1] + t[-2])
    pabsw m10, m10
    punpcklwd m14, m9, m10
    punpckhwd m9, m10
    ; second_deriv[0], second_deriv[-1]
    pavgw m9, m14
    ; transition = second_deriv[-1] + second_deriv[0]
    ; duplicate the results across pairs of words
    pshufb m14, m9, [rev_shuf]
    paddw m9, m14
    vpbroadcastd m14, [pb_m2_3]
    pmaddubsw m10, m12, m14
    pshufb m11, m6, m15
    psubw m10, m11
    pabsw m10, m10
%if !is_chroma_edge
    pmaddubsw m14, m13, m14
    pshufb m11, m7, m15
    psubw m14, m11
    pabsw m14, m14
    punpcklwd m11, m10, m14
    punpckhwd m10, m14
%else
    ; Skip the negative test by duplicating the positive derivatives
    punpcklwd m11, m10, m10
    punpckhwd m10, m10
%endif
    pavgw m10, m11
%if %1 >= 4
    ; load 4, -5
%ifidn %2, v
    mova m6, [dstq+strideq*4]
    mova m7, [tmpq+mstrideq*1]
%else
    mova m6, [rsp+8*32]
    mova m7, [rsp+9*32]
%endif
    vpbroadcastd m14, [pb_m3_4]
    pmaddubsw m11, m12, m14
    pshufb m6, m15
    psubw m11, m6
    pabsw m11, m11
%if !is_chroma_edge
    pmaddubsw m14, m13, m14
    pshufb m7, m15
    psubw m14, m7
    pabsw m14, m14
    punpcklwd m6, m11, m14
    punpckhwd m11, m14
%else
    punpcklwd m6, m11, m11
    punpckhwd m11, m11
%endif
    pavgw m6, m11
%if %1 >= 6
    ; load 6
%ifidn %2, v
    mova m7, [dstq+stride3q*2]
%else
    mova m7, [rsp+12*32]
%endif
    vpbroadcastd m14, [pb_m5_6]
    pmaddubsw m11, m12, m14
    pmaddubsw m14, m13, m14
    pshufb m7, m15
    psubw m11, m7
    ; load -7
%ifidn %2, v
    sub tmpq, strideq
    mova m7, [tmpq+mstrideq*2]
    add tmpq, strideq
%else
    mova m7, [rsp+13*32]
%endif
    pshufb m7, m15
    psubw m14, m7
    pabsw m11, m11
    pabsw m14, m14
    punpcklwd m7, m11, m14
    punpckhwd m11, m14
    pavgw m7, m11
%if %1 == 8
    vpbroadcastd m14, [pb_m6_7]
    pmaddubsw m12, m14
%if !is_edge
    pmaddubsw m13, m14
%endif
    ; load 7
%ifidn %2, v
    add dstq, stride3q
    mova m14, [dstq+strideq*4]
    sub dstq, stride3q
%else
    mova m14, [rsp+14*32]
%endif
    pshufb m14, m15
    psubw m12, m14
    pabsw m12, m12
%if !is_edge
    ; load -8
%ifidn %2, v
    mova m14, [tmpq+mstrideq*4] ; -8
%else
    mova m14, [rsp+15*32]
%endif
    pshufb m14, m15
    psubw m13, m14
    pabsw m13, m13
    punpcklwd m11, m12, m13
    punpckhwd m12, m13
%else
    punpcklwd m11, m12, m12
    punpckhwd m12, m12
%endif
    pavgw m11, m12
%endif
%endif
%endif
%endif
    movq xm15, [side_thrq]
    punpcklbw xm15, xm15
    pmovzxbw m15, xm15
    ; side thr comparisons
    ; duplicate side_thr to compare against both s[] and t[] at the same time
    ; successes have pairs of words [0,0]
    ; failures are [-1,-1], [-1,0], or [0,-1]
%if %1 >= 3
%if %1 >= 6
    vpbroadcastd m12, [pw_24576]
    pmulhuw m12, m15 ; (side_thr * 6) >> 4
    pcmpgtw m7, m12
    psrlw m12, 1 ; (side_thr * 3) >> 4
%else
    vpbroadcastd m12, [pw_12288]
    pmulhuw m12, m15 ; (side_thr * 3) >> 4
%endif
    pcmpgtw m10, m12
    psrlw m14, m15, 2 ; side_thr >> 2
    psrlw m13, m15, 3 ; side_thr >> 3
    pcmpgtw m12, m8, m14
%if %1 >= 4
    pcmpgtw m6, m14
%endif
    pcmpgtw m13, m8, m13
    ; Or together earlier conditions
    ; The second_deriv[1] and second_deriv[-2] compares are subsets of later
    ; tests, so ors are skipped for those.
    por m13, m10
%if %1 >= 4
    por m6, m13
%if %1 >= 6
    por m7, m6
%if %1 == 8
    psrlw m14, m15, 1 ; (side_thr * 8) >> 4
    pcmpgtw m11, m14
    por m11, m7
%endif
%endif
%endif
%endif
    pcmpgtw m8, m15
    movq xm14, [q_thrq]
    punpcklbw xm14, xm14
    pmovzxbw m14, xm14
    pxor m15, m15
    ; q_thr comparisons then combine with side_thresh compares
%if %1 >= 3
    vpbroadcastd m10, [pw_4_3]
    pmullw m10, m14 ; q_thr * 4, q_thr * 3
    pcmpgtw m10, m9, m10
    por m13, m10 ; combine side_thr and q_thr conditions
    pblendw m10, m15, 0xaa
    por m12, m10
%if %1 == 8
    ; q_thr * 32 > transition << 4
    paddw m10, m14, m14
    pcmpgtw m10, m9, m10
    por m11, m10
%endif
%if %1 >= 4
    psllw m9, 4 ; transition <<= 4
%if %1 == 4
    vpbroadcastd m10, [pw_45]
    pmullw m10, m14
    pcmpgtw m10, m9, m10
    por m6, m10
%else ; %1 >= 6
    vpbroadcastd m10, [pw_45_40]
    pmullw m10, m14
    pcmpgtw m10, m9, m10
    por m7, m10
    pblendw m10, m15, 0xaa
    por m6, m10
%endif
%endif
%endif
    ; For each width, check sets of comparisons worked.
    ; Successes are dword -1's now.
    pcmpeqd m8, m15
%if %1 >= 3
    pcmpeqd m12, m15
    pcmpeqd m13, m15
%if %1 >= 4
    pcmpeqd m6, m15
%if %1 >= 6
    pcmpeqd m7, m15
%if %1 == 8
    pcmpeqd m11, m15
%endif
%endif
%endif
%endif
    ; Constrain width selection by the masks
    mova m15, [pd_mask]
%if %1 >= 3
%if %1 >= 4
%if %1 == 8
    vpbroadcastb m10, [maskq+6]
    pand m11, m15
    pand m11, m10
    pcmpeqd m11, m15
%endif
    vpbroadcastb m9, [maskq+4]
%if %1 == 8
    por m9, m10
%endif
    pand m10, m9, m15
    pand m6, m10
    pcmpeqd m6, m15
%if %1 >= 6
    pand m7, m10
    pcmpeqd m7, m15
%endif
%endif
    vpbroadcastb m10, [maskq+2]
%if %1 >= 4
    por m10, m9
%endif
    pand m9, m10, m15
    pand m12, m9
    pand m13, m9
    pcmpeqd m12, m15
    pcmpeqd m13, m15
%endif
    vpbroadcastb m9, [maskq+0]
%if %1 >= 3
    por m9, m10
%endif
    pand m8, m9
    pand m8, m15
    pcmpeqd m8, m15
    ; Create a lookup table index based on the filter size.
    ; Each passing test contributes -1 per byte; the sum is doubled and
    ; subtracted from {0, 1} to form the byte pair that addresses one word
    ; of the *_lut tables via pshufb.
%if %1 >= 3
%if %1 == 8
    paddb m8, m11
%endif
%if %1 >= 6
    paddb m12, m7
%endif
%if %1 >= 4
    paddb m13, m6
%endif
    vpbroadcastd m11, [mul_lut_offset]
    paddb m8, m12
    paddb m8, m13
    paddb m8, m8
    psubb m8, m11, m8
%endif
    ; delta_m2 = 4 * (3 * (dst[0] - dst[-1]) - (dst[1] - dst[-2]))
    vpbroadcastd m13, [pb_12_m4]
    punpcklbw m9, m1, m3
    punpckhbw m10, m1, m3
    punpcklbw m11, m0, m2
    punpckhbw m12, m0, m2
    pmaddubsw m9, m13
    pmaddubsw m10, m13
    pmaddubsw m11, m13
    pmaddubsw m12, m13
    psubw m9, m11, m9
    psubw m10, m12, m10
    ; q_thr_clamp = q_thr * q_thresh_mults[]
%if %1 == 1
    psllw m11, m14, 5 ; q_thr * q_thresh_mult[0]
%else
    vbroadcasti128 m11, [qthr_mul_lut]
    pshufb m11, m8
    pmullw m11, m14
%endif
    punpckhdq m12, m11, m11
    punpckldq m11, m11
    ; iclip(delta_m2, -q_thr_clamp, q_thr_clamp)
    pminsw m9, m11
    pminsw m10, m12
    pxor m15, m15
    psubw m11, m15, m11
    psubw m12, m15, m12
    pmaxsw m9, m11
    pmaxsw m10, m12
    ; m9/m10 now hold the clamped delta_m2 (low/high words); every tap below
    ; scales it with pmulhrsw by the multiplier for its distance.
%if !is_edge
    ; +0, -1
%if %1 == 1
    vpbroadcastd m11, [pw_1360]
    pmulhrsw m6, m9, m11
    pmulhrsw m7, m10, m11
%else
    vbroadcasti128 m11, [mul1_lut]
    pshufd m14, m8, q3322
    pshufd m8, m8, q1100
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m6, m9, m11
    pmulhrsw m7, m10, m12
%endif
    SUB_ADD_DIFF_LARGE m11, m12, m0, m1, m6, m7, m13
    mova m13, [pd_mask]
    vpbroadcastb m6, [ll_maskq+2] ; pos
    vpbroadcastb m7, [ll_maskq] ; neg
    pand m6, m13
    pand m7, m13
    pcmpeqd m6, m13
    pcmpeqd m7, m13
%if %1 == 1
    ; width 1 has no LUT index, so the filter-enable mask in m8 is applied
    ; here together with the inverted lossless masks
    pandn m6, m8
    pandn m7, m8
    vpblendvb m0, m11, m6
    vpblendvb m1, m12, m7
%else
    vpblendvb m0, m11, m0, m6
    vpblendvb m1, m12, m1, m7
%endif
%ifidn %2, v
    mova [dstq+strideq*0], m0
    mova [tmpq+stride3q], m1
%elif %1 == 1
    TRANSPOSE_16x2_AND_WRITE_2x32 1, 0, 11
%endif
%if %1 >= 3
    ; +1, -2
    vbroadcasti128 m11, [mul2_lut]
    vpbroadcastd m15, [pb_128]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m2, m3, m11, m12, m6, m7
%ifidn %2, v
    mova [dstq+strideq*1], m2
    mova [tmpq+strideq*2], m3
%elif %1 == 4
    mova [rsp+0*32], m0
    mova [rsp+1*32], m1
%elif %1 >= 6
    mova [rsp+0*32], m0
    mova [rsp+1*32], m1
    mova [rsp+2*32], m2
    mova [rsp+3*32], m3
%endif
    ; +2, -3
    vbroadcasti128 m11, [mul3_lut]
%ifidn %2, h
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
%else
    ; keep the shuffled multipliers in m0/m1: the v loop below derives the
    ; following taps from them by repeated subtraction
    pshufb m0, m11, m8
    pshufb m1, m11, m14
    pmulhrsw m11, m9, m0
    pmulhrsw m12, m10, m1
%endif
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
%ifidn %2, v
    mova [dstq+strideq*2], m4
    mova [tmpq+strideq*1], m5
%elif %1 == 3
    mova m12, [rsp+6*32]
    mova m13, [rsp+7*32]
    TRANSPOSE_16x8_AND_WRITE_8x32 13, 5, 3, 1, 0, 2, 4, 12, 11
%elif %1 == 4
    SWAP 0, 4
    SWAP 1, 5
%else
    mova [rsp+4*32], m4
    mova [rsp+5*32], m5
%endif
%if %1 >= 6 && %isidn(%2, v)
    ; start at +3, -4
    lea dstq, [dstq+stride3q]
    vbroadcasti128 m11, [base_mul_lut]
    pshufb m8, m11, m8
    pshufb m14, m11, m14
    mov nd, %1-3
.loop_w%1:
    mova m4, [dstq]
    mova m5, [tmpq]
    ; next multiplier = previous multiplier - base (saturating at 0)
    psubusw m0, m8
    psubusw m1, m14
    pmulhrsw m11, m9, m0
    pmulhrsw m12, m10, m1
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
    mova [dstq], m4
    mova [tmpq], m5
    add dstq, strideq
    sub tmpq, strideq
    dec nd
    jg .loop_w%1
    ; restore dstq to the edge row
%if %1 == 6
    sub dstq, stride3q
    sub dstq, stride3q
%else
    lea dstq, [dstq+8*mstrideq]
%endif
%elif %1 >= 4
    ; +3, -4
%ifidn %2, v
    mova m4, [dstq+stride3q]
    mova m5, [tmpq+strideq*0]
%else
    mova m4, [rsp+6*32]
    mova m5, [rsp+7*32]
%endif
    vbroadcasti128 m11, [mul4_lut]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
%ifidn %2, v
    mova [dstq+stride3q], m4
    mova [tmpq+strideq*0], m5
%elif %1 == 4
    mova m12, [rsp+0*32]
    mova m13, [rsp+1*32]
    TRANSPOSE_16x8_AND_WRITE_8x32 5, 1, 3, 13, 12, 2, 0, 4, 11
%elif %1 == 6
    SWAP 0, 4
    SWAP 1, 5
%else
    mova [rsp+6*32], m4
    mova [rsp+7*32], m5
%endif
    ; Only horizontal deblock from here on out
%if %1 >= 6
    ; +4, -5
    mova m4, [rsp+8*32]
    mova m5, [rsp+9*32]
    vbroadcasti128 m11, [mul5_lut]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
%if %1 == 6
    SWAP 2, 4
    SWAP 3, 5
%else
    mova [rsp+8*32], m4
    mova [rsp+9*32], m5
%endif
    ; +5, -6
    mova m4, [rsp+10*32]
    mova m5, [rsp+11*32]
    vbroadcasti128 m11, [mul6_lut]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
%if %1 == 6
    SWAP 11, 0
    SWAP 2, 5, 12
    SWAP 4, 1, 13
    ; 5,3,1,0,2,4 -> 2,3,4,11,12,13
    mova m0, [rsp+15*32]
    mova m1, [rsp+13*32]
    mova m5, [rsp+5*32]
    mova m6, [rsp+3*32]
    mova m7, [rsp+1*32]
    mova m8, [rsp+0*32]
    mova m9, [rsp+2*32]
    mova m10, [rsp+4*32]
    mova m14, [rsp+12*32]
    TRANSPOSE_16X16B 1, 0, [rsp+14*32]
    STORE_16X16
%else
    SWAP 0, 4
    SWAP 1, 5
%endif
%if %1 == 8
    ; +6, -7
    mova m4, [rsp+12*32]
    mova m5, [rsp+13*32]
    vbroadcasti128 m11, [mul7_lut]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
    SWAP 2, 4
    SWAP 3, 5
    ; +7, -8
    mova m4, [rsp+14*32]
    mova m5, [rsp+15*32]
    vbroadcasti128 m11, [mul8_lut]
    pshufb m12, m11, m14
    pshufb m11, m8
    pmulhrsw m11, m9, m11
    pmulhrsw m12, m10, m12
    SUB_ADD_DIFF m4, m5, m11, m12, m6, m7
    SWAP 15, 4
    SWAP 0, 5, 13
    SWAP 2, 1, 3, 14
    ; 5,3,1,0,2,4 -> 0,1,2,13,14,15
    ; TODO: optimize this store away
    mova [rsp+14*32], m15
    mova m3, [rsp+9*32]
    mova m4, [rsp+7*32]
    mova m5, [rsp+5*32]
    mova m6, [rsp+3*32]
    mova m7, [rsp+1*32]
    mova m8, [rsp+0*32]
    mova m9, [rsp+2*32]
    mova m10, [rsp+4*32]
    mova m11, [rsp+6*32]
    mova m12, [rsp+8*32]
    TRANSPOSE_16X16B 1, 0, [rsp+14*32]
    STORE_16X16
%endif
%endif
%endif
%endif
%else ; edge variant
    ; compute pos and neg sides of the filter separately
%ifidn %2, h
    ; Move to stack so we can access in a loop
    mova [rsp+2*32], m2
%if !is_chroma_edge
    mova [rsp+3*32], m3
%endif
    mova [rsp+4*32], m4
    mova [rsp+5*32], m5
%endif
    mova m13, [pd_mask]
    vpbroadcastb m6, [ll_maskq+2] ; pos
    vpbroadcastb m7, [ll_maskq] ; neg
    pand m6, m13
    pand m7, m13
    pcmpeqd m6, m13
    pcmpeqd m7, m13
    pshufd m14, m8, q3322
    pshufd m8, m8, q1100
%if is_chroma_edge
    vbroadcasti128 m13, [mul1_lut_uv_edge]
%else
    vbroadcasti128 m13, [mul1_lut_y_edge]
%endif
    ; -1
    pshufb m4, m13, m8
    pshufb m5, m13, m14
    pmulhrsw m12, m4, m9
    punpcklbw m11, m1, m15
    paddw m11, m12
    pmulhrsw m13, m5, m10
    punpckhbw m12, m1, m15
    paddw m12, m13
    packuswb m11, m12
    vpblendvb m11, m1, m7
%ifidn %2, v
    mova [tmpq+stride3q], m11
%else
    mova [rsp+1*32], m11
%endif
    ; 0
    vbroadcasti128 m13, [mul1_lut]
    pshufb m11, m13, m8
    pshufb m12, m13, m14
    pmulhrsw m13, m11, m9
    punpcklbw m1, m0, m15
    psubw m1, m13
    pmulhrsw m13, m12, m10
    punpckhbw m15, m0, m15
    psubw m15, m13
    packuswb m1, m15
    vpblendvb m1, m0, m6
%ifidn %2, v
    mova [dstq], m1
%else
    mova [rsp+0*32], m1
%endif
    vpbroadcastd m15, [pb_128]
%if is_chroma_edge
    ; -2
    vbroadcasti128 m13, [mul2_lut_uv_edge]
    pshufb m4, m13, m8
    pshufb m5, m13, m14
    pmulhrsw m4, m9
    pmulhrsw m5, m10
    packsswb m4, m5
    pxor m3, m15
    pandn m4, m7, m4
    paddsb m3, m4
    pxor m3, m15
%ifidn %2, v
    mova [tmpq+strideq*2], m3
%else
    mova [rsp+3*32], m3
%endif
%else
    ; start at -2
%ifidn %2, v
    lea tmpq, [tmpq+strideq*2]
%else
    lea tmpq, [rsp+3*32]
%endif
    vbroadcasti128 m13, [base_mul_lut_y_edge]
    pshufb m2, m13, m8
    pshufb m3, m13, m14
    mov nd, 5
.neg_loop_w%1:
    mova m13, [tmpq]
    ; m4/m5 start as the -1 multipliers; step down by the base each row
    psubusw m4, m2
    psubusw m5, m3
    pmulhrsw m0, m4, m9
    pmulhrsw m1, m5, m10
    packsswb m0, m1
    pxor m13, m15
    pandn m0, m7, m0
    paddsb m13, m0
    pxor m13, m15
    mova [tmpq], m13
%ifidn %2, v
    sub tmpq, strideq
%else
    add tmpq, 32*2
%endif
    dec nd
    jg .neg_loop_w%1
%endif
    ; Start at 1
%ifidn %2, v
    lea tmpq, [dstq+strideq]
%else
    lea tmpq, [rsp+32*2]
%endif
    vbroadcasti128 m13, [base_mul_lut]
    pshufb m4, m13, m8
    pshufb m5, m13, m14
    vbroadcasti128 m13, [mul1_lut]
    pshufb m2, m13, m8
    pshufb m3, m13, m14
    mov nd, %1-1
.pos_loop_w%1:
    mova m13, [tmpq]
    psubusw m2, m4
    psubusw m3, m5
    pmulhrsw m0, m2, m9
    pmulhrsw m1, m3, m10
    packsswb m0, m1
    pxor m13, m15
    pandn m0, m6, m0
    psubsb m13, m0
    pxor m13, m15
    mova [tmpq], m13
%ifidn %2, v
    add tmpq, strideq
%else
    add tmpq, 32*2
%endif
    dec nd
    jg .pos_loop_w%1
%ifidn %2, h
    ; reload the filtered columns from the stack and transpose them back
%if %1 != 8
    mova m7, [rsp+7*32]
    mova m5, [rsp+5*32]
    mova m3, [rsp+3*32]
    mova m1, [rsp+1*32]
    mova m0, [rsp+0*32]
    mova m2, [rsp+2*32]
    mova m4, [rsp+4*32]
    mova m6, [rsp+6*32]
    TRANSPOSE_16x8_AND_WRITE_8x32 7, 5, 3, 1, 0, 2, 4, 6, 11
%else
    mova m0, [rsp+15*32]
    mova m1, [rsp+13*32]
    mova m2, [rsp+11*32]
    mova m3, [rsp+9*32]
    mova m4, [rsp+7*32]
    mova m5, [rsp+5*32]
    mova m6, [rsp+3*32]
    mova m7, [rsp+1*32]
    mova m8, [rsp+0*32]
    mova m9, [rsp+2*32]
    mova m10, [rsp+4*32]
    mova m11, [rsp+6*32]
    mova m12, [rsp+8*32]
    mova m13, [rsp+10*32]
    mova m14, [rsp+12*32]
    TRANSPOSE_16X16B 1, 0, [rsp+14*32]
    STORE_16X16
%endif
%endif
%endif
%endmacro

; Luma, vertical-direction entry point. Each loop iteration handles 32 bytes
; of dst, picks the widest filter whose mask byte (maskq+6/+4/+2/+0) is
; non-zero, then advances dst by 32, mask and ll_mask by 1, q_thr and
; side_thr by 8, and decrements w by 8. A non-zero edge argument selects the
; edge variant of the width-8 filter. .v3 and .v1 are callable subroutines
; (also used by the chroma function).
INIT_YMM avx2
cglobal deblock_v_sb_y_8bpc, 6, 12, 16, \
                             dst, stride, mask, ll_mask, q_thr, side_thr, \
                             edge, w, stride3, mstride, tmp, n
    movifnidn edged, edgem
    movifnidn wd, wm
    mov mstrideq, strideq
    neg mstrideq
    lea stride3q, [strideq*3]
.loop:
    cmp byte [maskq+6], 0 ; vmask[3]
    je .v6
    test edged, edged
    jnz .v8_edge
    FILTER 8, v, 0
    jmp .end
.v8_edge:
    FILTER 8, v, 1
    jmp .end
.v6:
    cmp byte [maskq+4], 0 ; vmask[2]
    je .v3_test
    FILTER 6, v, 0
    jmp .end
.v3_test:
    cmp byte [maskq+2], 0 ; vmask[1]
    je .v1_test
    call .v3
    jmp .end
.v1_test:
    cmp byte [maskq+0], 0 ; vmask[0]
    je .end
    call .v1
.end:
    add dstq, 32
    add maskq, 1
    add ll_maskq, 1
    add q_thrq, 8
    add side_thrq, 8
    sub wd, 8
    jg .loop
    RET
ALIGN function_align
.v3:
    FILTER 3, v, 0
    ret
ALIGN function_align
.v1:
    FILTER 1, v, 0
    ret

; Luma, horizontal-direction entry point. Same dispatch as the vertical
; version, but dst advances by 32 rows per iteration: the FILTER h paths do
; that through their write-back macros, and .no_filter does it explicitly.
; Reserves 32*16 bytes of stack for the transposed columns.
INIT_YMM avx2
cglobal deblock_h_sb_y_8bpc, 6, 11, 16, 32 * 16, \
                             dst, stride, mask, ll_mask, q_thr, side_thr, \
                             edge, h, stride3, tmp, n
    movifnidn edged, edgem
    movifnidn hd, hm
    lea stride3q, [strideq*3]
.loop:
    cmp byte [maskq+6], 0 ; hmask[3]
    je .h6
    test edged, edged
    jnz .h8_edge
    ; TODO: test edges inside horz filters to reduce binary size. Edges are
    ;       less common for the horz case, so branches should be almost free.
    FILTER 8, h, 0
    jmp .end
.h8_edge:
    FILTER 8, h, 1
    jmp .end
.h6:
    cmp byte [maskq+4], 0 ; hmask[2]
    je .h3
    FILTER 6, h, 0
    jmp .end
.h3:
    cmp byte [maskq+2], 0 ; hmask[1]
    je .h1_test
    FILTER 3, h, 0
    jmp .end
.h1_test:
    cmp byte [maskq+0], 0 ; hmask[0]
    je .no_filter
    call .h1
    jmp .end
.no_filter:
    ; skip 32 rows (24 + 8)
    lea dstq, [dstq+stride3q*8]
    lea dstq, [dstq+strideq*8]
.end:
    add maskq, 1
    add ll_maskq, 1
    add q_thrq, 8
    add side_thrq, 8
    sub hd, 8
    jg .loop
    RET
ALIGN function_align
.h1:
    FILTER 1, h, 0
    ret

; Chroma, vertical-direction entry point. Widths 4, 3 and 1 are dispatched
; from mask bytes +4/+2/+0; widths 4 and 3 have edge variants. The non-edge
; width-3 and width-1 filters are shared with the luma function via call.
INIT_YMM avx2
cglobal deblock_v_sb_uv_8bpc, 6, 12, 16, \
                              dst, stride, mask, ll_mask, q_thr, side_thr, \
                              edge, w, stride3, mstride, tmp, n
    movifnidn edged, edgem
    movifnidn wd, wm
    mov mstrideq, strideq
    neg mstrideq
    lea stride3q, [strideq*3]
.loop:
    cmp byte [maskq+4], 0 ; vmask[2]
    je .v3
    test edged, edged
    jnz .v4_edge
    FILTER 4, v, 0
    jmp .end
.v4_edge:
    FILTER 4, v, 1
    jmp .end
.v3:
    cmp byte [maskq+2], 0 ; vmask[1]
    je .v1_test
    test edged, edged
    jnz .v3_edge
    call mangle(private_prefix %+ _deblock_v_sb_y_8bpc_avx2).v3
    jmp .end
.v3_edge:
    FILTER 3, v, 1
    jmp .end
.v1_test:
    cmp byte [maskq+0], 0 ; vmask[0]
    je .end
    call mangle(private_prefix %+ _deblock_v_sb_y_8bpc_avx2).v1
.end:
    add dstq, 32
    add maskq, 1
    add ll_maskq, 1
    add q_thrq, 8
    add side_thrq, 8
    sub wd, 8
    jg .loop
    RET

; Chroma, horizontal-direction entry point. Reserves 32*10 bytes of stack.
; Only the width-1 filter is shared with the luma function; the width-3
; filter is instantiated inline here (it uses stack slots).
INIT_YMM avx2
cglobal deblock_h_sb_uv_8bpc, 6, 11, 16, 32 * 10, \
                              dst, stride, mask, ll_mask, q_thr, side_thr, \
                              edge, h, stride3, tmp, n
    movifnidn edged, edgem
    movifnidn hd, hm
    lea stride3q, [strideq*3]
.loop:
    cmp byte [maskq+4], 0 ; hmask[2]
    je .h3
    test edged, edged
    jnz .h4_edge
    FILTER 4, h, 0
    jmp .end
.h4_edge:
    FILTER 4, h, 1
    jmp .end
.h3:
    cmp byte [maskq+2], 0 ; hmask[1]
    je .h1_test
    test edged, edged
    jnz .h3_edge
    ; TODO: call the luma version instead of duplicating
    FILTER 3, h, 0
    jmp .end
.h3_edge:
    FILTER 3, h, 1
    jmp .end
.h1_test:
    cmp byte [maskq+0], 0 ; hmask[0]
    je .no_filter
    call mangle(private_prefix %+ _deblock_h_sb_y_8bpc_avx2).h1
    jmp .end
.no_filter:
    ; skip 32 rows (24 + 8)
    lea dstq, [dstq+stride3q*8]
    lea dstq, [dstq+strideq*8]
.end:
    add maskq, 1
    add ll_maskq, 1
    add q_thrq, 8
    add side_thrq, 8
    sub hd, 8
    jg .loop
    RET
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain.h000066400000000000000000000102671517466257200234070ustar00rootroot00000000000000/* * Copyright © 2018-2022, VideoLAN and dav2d authors * Copyright © 2018-2022, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "src/cpu.h" #include "src/filmgrain.h" #define decl_fg_fns(ext) \ decl_generate_grain_y_fn(BF(dav2d_generate_grain_y, ext)); \ decl_generate_grain_uv_fn(BF(dav2d_generate_grain_uv_420, ext)); \ decl_generate_grain_uv_fn(BF(dav2d_generate_grain_uv_422, ext)); \ decl_generate_grain_uv_fn(BF(dav2d_generate_grain_uv_444, ext)); \ decl_fgy_32x32xn_fn(BF(dav2d_fgy_32x32xn, ext)); \ decl_fguv_32x32xn_fn(BF(dav2d_fguv_32x32xn_i420, ext)); \ decl_fguv_32x32xn_fn(BF(dav2d_fguv_32x32xn_i422, ext)); \ decl_fguv_32x32xn_fn(BF(dav2d_fguv_32x32xn_i444, ext)) decl_fg_fns(ssse3); decl_fg_fns(avx2); decl_fg_fns(avx512icl); static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav2dFilmGrainDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return; c->generate_grain_y = BF(dav2d_generate_grain_y, ssse3); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_generate_grain_uv_420, ssse3); c->fgy_32x32xn = BF(dav2d_fgy_32x32xn, ssse3); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_fguv_32x32xn_i420, ssse3); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_generate_grain_uv_422, ssse3); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_generate_grain_uv_444, ssse3); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_fguv_32x32xn_i422, ssse3); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_fguv_32x32xn_i444, ssse3); #if ARCH_X86_64 if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->generate_grain_y = BF(dav2d_generate_grain_y, avx2); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_generate_grain_uv_420, avx2); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_generate_grain_uv_422, avx2); c->generate_grain_uv[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_generate_grain_uv_444, avx2); if (!(flags & DAV2D_X86_CPU_FLAG_SLOW_GATHER)) { c->fgy_32x32xn = BF(dav2d_fgy_32x32xn, avx2); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_fguv_32x32xn_i420, 
avx2); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_fguv_32x32xn_i422, avx2); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_fguv_32x32xn_i444, avx2); } if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; if (BITDEPTH == 8 || !(flags & DAV2D_X86_CPU_FLAG_SLOW_GATHER)) { c->fgy_32x32xn = BF(dav2d_fgy_32x32xn, avx512icl); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I420 - 1] = BF(dav2d_fguv_32x32xn_i420, avx512icl); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I422 - 1] = BF(dav2d_fguv_32x32xn_i422, avx512icl); c->fguv_32x32xn[DAV2D_PIXEL_LAYOUT_I444 - 1] = BF(dav2d_fguv_32x32xn_i444, avx512icl); } #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain16_avx2.asm000066400000000000000000002205511517466257200250460ustar00rootroot00000000000000; Copyright © 2021-2022, VideoLAN and dav2d authors ; Copyright © 2021-2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

; Read-only constants shared by the 16 bpc AVX2 film grain functions.
; Several tables are indexed at run time, so their element order and the
; relative placement of entries matter:
;   grain_max/grain_min : 2 dword entries, indexed by (bdmax >> 11)
;   fg_max/fg_min       : dword entries indexed from the clip flag and the
;                         12 bpc flag (see the fgy_32x32xn_16bpc prologue)
;   round/round_vals/mul_bits/hmul_bits/gen_ar0_shift : indexed by shift values
SECTION_RODATA 16
pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_27_17_17_27: dw 27, 17, 17, 27
pw_23_22: dw 23, 22, 0, 32
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
gen_ar0_shift: times 4 db 128
               times 4 db 64
               times 4 db 32
               times 4 db 16
pd_16: dd 16
pd_m65536: dd -65536
pb_1: times 4 db 1
grain_max: times 2 dw 511
           times 2 dw 2047
grain_min: times 2 dw -512
           times 2 dw -2048
fg_max: times 2 dw 1023
        times 2 dw 4095
        times 2 dw 960
        times 2 dw 3840
        times 2 dw 940
        times 2 dw 3760
fg_min: times 2 dw 0
        times 2 dw 64
        times 2 dw 256
uv_offset_mul: dd 256
               dd 1024
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16, 8
round_vals: dw 32, 64, 128, 256, 512, 1024
pb_8_9_0_1: db 8, 9, 0, 1

; JMP_TABLE name, idx0, idx1, ...
; Emits a table called <name>_table holding one dword per index argument.
; Each dword is the offset of the local label .ar<idx> inside the mangled
; function <name>, relative to the start of the table itself, so a caller can
; load an entry, add the table address and jump to the result.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

; one table per grain generator, each covering the labels .ar0 to .ar3
JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3

SECTION .text
; m(x): mangled name of function x with the current instruction-set suffix
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; generate_grain_y_16bpc(buf, fg_data, bdmax)
; 1. Fills buf with 73*82 words: each .loop iteration advances the seed state
;    in xm0, looks up 8 words in gaussian_sequence and stores them after
;    scaling by a rounding multiplier chosen from grain_scale_shift and bdmax.
; 2. Jumps through generate_grain_y_16bpc_avx2_table, indexed by
;    FGData.ar_coeff_lag, to one of .ar0/.ar1/.ar2/.ar3. .ar0 returns at once;
;    the others filter 70 rows of 76 values in place, clamping every result to
;    [~(bdmax >> 1), bdmax >> 1].
INIT_YMM avx2
cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax
%define base r4-generate_grain_y_16bpc_avx2_table
    lea r4, [generate_grain_y_16bpc_avx2_table]
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    mov r6d, [fg_dataq+FGData.grain_scale_shift]
    movq xm1, [base+next_upperbit_mask]
    mov r3, -73*82*2            ; negative byte offset, counts up to 0
    movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
    lea r7d, [bdmaxq+1]
    movq xm4, [base+mul_bits]
    shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc
    movq xm5, [base+hmul_bits]
    sub r6, r7
    mova xm6, [base+pb_mask]
    sub bufq, r3                ; buf now points at the end of the block
    vpbroadcastw xm7, [base+round+r6*2-2]
    lea r6, [gaussian_sequence]
    movsxd r5, [r4+r5*4]        ; jump-table entry for this ar_coeff_lag
.loop:
    pand xm2, xm0, xm1
    psrlw xm3, xm2, 10
    por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw xm2, xm4 ; bits 0x0f00 are set
    pmulhuw xm0, xm5
    pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm2, xm0 ; aggregate each bit into next seed's high bit
    por xm3, xm2 ; 4 next output seeds
    pshuflw xm0, xm3, q3333
    psrlw xm3, 5
    pand xm2, xm0, xm1
    movq r7, xm3
    psrlw xm3, xm2, 10
    por xm2, xm3
    pmullw xm2, xm4
    pmulhuw xm0, xm5
    movzx r8d, r7w
    pshufb xm3, xm6, xm2
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm0, xm2
    movd xm2, [r6+r8*2]
    rorx r8, r7, 32
    por xm3, xm0
    shr r7d, 16
    pinsrw xm2, [r6+r7*2], 1
    pshuflw xm0, xm3, q3333
    movzx r7d, r8w
    psrlw xm3, 5
    pinsrw xm2, [r6+r7*2], 2
    shr r8d, 16
    movq r7, xm3
    pinsrw xm2, [r6+r8*2], 3
    movzx r8d, r7w
    pinsrw xm2, [r6+r8*2], 4
    rorx r8, r7, 32
    shr r7d, 16
    pinsrw xm2, [r6+r7*2], 5
    movzx r7d, r8w
    pinsrw xm2, [r6+r7*2], 6
    shr r8d, 16
    pinsrw xm2, [r6+r8*2], 7
    paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
    mova [bufq+r3], xm2
    add r3, 8*2
    jl .loop

    ; auto-regression code
    add r5, r4
    jmp r5

.ar1:
    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd xm4, [fg_dataq+FGData.ar_coeffs_y]
    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
    pinsrb xm4, [base+pb_1], 3
    pmovsxbw xm4, xm4
    pshufd xm5, xm4, q1111
    pshufd xm4, xm4, q0000
    vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
    sub bufq, 2*(82*73-(82*3+79))
    mov hd, 70
    sar maxd, 1
    mov mind, maxd
    xor mind, -1                ; min = ~max
.y_loop_ar1:
    mov xq, -76
    movsx val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu xm0, [bufq+xq*2-82*2-2] ; top/left
    psrldq xm2, xm0, 2 ; top
    psrldq xm1, xm0, 4 ; top/right
    punpcklwd xm0, xm2
    punpcklwd xm1, xm3
    pmaddwd xm0, xm4
    pmaddwd xm1, xm5
    paddd xm0, xm1
.x_loop_ar1_inner:
    movd val0d, xm0
    psrldq xm0, 4
    imul val3d, cf3d
    add val3d, val0d
    sarx val3d, val3d, shiftd
    movsx val0d, word [bufq+xq*2]
    add val3d, val0d
    cmp val3d, maxd
    cmovg val3d, maxd
    cmp val3d, mind
    cmovl val3d, mind
    mov word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar1
.ar0:
    RET

.ar2:
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11
    vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4
    vpbroadcastw xm10, [base+round_vals-12+shiftq*2]
    pxor m1, m1
    punpcklwd xm10, xm1
    pcmpgtb m1, m0
    punpcklbw m0, m1 ; cf5-11,0-4
    vpermq m1, m0, q3333 ; cf4
    vbroadcasti128 m11, [base+gen_shufA]
    pshufd m6, m0, q0000 ; cf[5,6], cf[0-1]
    vbroadcasti128 m12, [base+gen_shufB]
    pshufd m7, m0, q1111 ; cf[7,8], cf[2-3]
    punpckhwd xm1, xm0
    pshufhw xm9, xm0, q2121
    pshufd xm8, xm1, q0000 ; cf[4,9]
    sar bdmaxd, 1
    punpckhqdq xm9, xm9 ; cf[10,11]
    movd xm4, bdmaxd ; max_grain
    pcmpeqd xm5, xm5
    sub bufq, 2*(82*73-(82*3+79))
    pxor xm5, xm4 ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    mov hd, 70
.y_loop_ar2:
    mov xq, -76
.x_loop_ar2:
    vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
    vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5]
    pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd m0, m6
    punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5]
    pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd m1, m7
    pmaddwd xm2, xm8
    paddd m0, m1
    vextracti128 xm1, m0, 1
    paddd xm0, xm10
    paddd xm2, xm0
    movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
    paddd xm2, xm1
    pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
    pmaddwd xm3, xm9, xm0
    psrldq xm0, 2
    paddd xm3, xm2
    psrldq xm2, 4 ; shift top to next pixel
    psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    paddd xm3, xm1
    pminsd xm3, xm4
    psrldq xm1, 4
    pmaxsd xm3, xm5
    pextrw [bufq+xq*2], xm3, 0
    punpcklwd xm3, xm3
    pblendw xm0, xm3, 0010b
    inc xq
    jz .x_loop_ar2_end
    test xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar2
    RET

.ar3:
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    sar bdmaxd, 1
    movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6
    movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16
    pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13
    pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1
    movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23
    vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13
    vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20
    vpbroadcastw xm11, [base+round_vals+shiftq*2-12]
    movd xm12, bdmaxd ; max_grain
    punpcklbw m7, m7 ; sign-extension
    punpcklbw m0, m0 ; sign-extension
    punpcklbw xm1, xm1
    REPX {psraw x, 8}, m7, m0, xm1
    pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8]
    pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10]
    pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12]
    pshufd xm7, xm7, q3333 ; cf[6,13]
    pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18]
    pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20]
    paddw xm0, xm11, xm11
    pcmpeqd xm13, xm13
    pblendw xm10, xm1, xm0, 00001000b
    pxor xm13, xm12 ; min_grain
    DEFINE_ARGS buf, fg_data, h, x
    sub bufq, 2*(82*73-(82*3+79))
    mov hd, 70
.y_loop_ar3:
    mov xq, -76
.x_loop_ar3:
    movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
    vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
    movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
    vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
    palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5]
    palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6]
    punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd m0, m4
    pmaddwd m2, m6
    pmaddwd m3, m5
    paddd m0, m2
    movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
    vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
    paddd m0, m3
    psrldq m3, m2, 2
    punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd m0, m3
    psrldq m3, m2, 4
    psrldq m2, 6
    vpblendd m2, m11, 0x0f ; rounding constant
    punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    vextracti128 xm2, m1, 1
    punpcklwd xm1, xm2
    pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6]
    paddd m0, m3
    vextracti128 xm2, m0, 1
    paddd xm0, xm1
    movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
    paddd xm0, xm2
.x_loop_ar3_inner:
    pmaddwd xm2, xm1, xm10
    pshuflw xm3, xm2, q1032
    paddd xm2, xm0 ; add top
    paddd xm2, xm3 ; left+cur
    psrldq xm0, 4
    psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; skip packssdw because we only care about one value
    pminsd xm2, xm12
    pmaxsd xm2, xm13
    pextrw [bufq+xq*2], xm2, 0
    pslldq xm2, 4
    psrldq xm1, 2
    pblendw xm1, xm2, 0100b
    inc xq
    jz .x_loop_ar3_end
    test xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar3
    RET

; GEN_GRAIN_UV_FN ss_name, ss_x, ss_y
; Emits generate_grain_uv_<ss_name>_16bpc(buf, bufy, fg_data, uv, bdmax).
; %2 (ss_x) and %3 (ss_y) select the code paths at assembly time:
;   - with %2 set, the fill runs 73-35*%3 rows of 44 words; otherwise it is
;     one pass over 82*73 words;
;   - the .ar* filters run 70-35*%3 rows of 76>>%2 values and, with %2 set,
;     combine horizontally adjacent bufy words (plus the next bufy row when %3
;     is set) before use.
; The seed is XORed with the pw_seed_xor entry selected by uv, and the chroma
; coefficient set is found at ar_coeffs_uv + uv*28.
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax
%define base r8-generate_grain_uv_%1_16bpc_avx2_table
    lea r8, [generate_grain_uv_%1_16bpc_avx2_table]
    movifnidn bdmaxd, bdmaxm
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    mov r5d, [fg_dataq+FGData.grain_scale_shift]
    movq xm1, [base+next_upperbit_mask]
    lea r6d, [bdmaxq+1]
    movq xm4, [base+mul_bits]
    shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc
    movq xm5, [base+hmul_bits]
    sub r5, r6
    mova xm6, [base+pb_mask]
    vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
    vpbroadcastw xm7, [base+round+r5*2-2]
    pxor xm0, xm2
    lea r6, [gaussian_sequence]
%if %2
    mov r7d, 73-35*%3
    add bufq, 44*2
.loop_y:
    mov r5, -44*2
%else
    mov r5, -82*73*2
    sub bufq, r5
%endif
.loop_x:
    pand xm2, xm0, xm1
    psrlw xm3, xm2, 10
    por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw xm2, xm4 ; bits 0x0f00 are set
    pmulhuw xm0, xm5
    pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm2, xm0 ; aggregate each bit into next seed's high bit
    por xm2, xm3 ; 4 next output seeds
    pshuflw xm0, xm2, q3333
    psrlw xm2, 5
    movq r10, xm2
    movzx r9d, r10w
    movd xm2, [r6+r9*2]
    rorx r9, r10, 32
    shr r10d, 16
    pinsrw xm2, [r6+r10*2], 1
    movzx r10d, r9w
    pinsrw xm2, [r6+r10*2], 2
    shr r9d, 16
    pinsrw xm2, [r6+r9*2], 3
    paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0
    pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support
    movq [bufq+r5], xm2
    add r5, 8
    jl .loop_x
%if %2
    add bufq, 82*2
    dec r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r6, [r8+r6*4]
    add r6, r8
    jmp r6

INIT_YMM avx2
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    imul uvd, 28
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    sar bdmaxd, 1
    vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4]
    movd xm6, bdmaxd
    pcmpeqw m7, m7
    pmaddubsw m4, m0 ; ar_coeff << (14 - shift)
    vpbroadcastw m6, xm6 ; max_grain
    pxor m7, m6 ; min_grain
    DEFINE_ARGS buf, bufy, h, x
%if %2
    vpbroadcastw m5, [base+hmul_bits+2+%3*2]
    sub bufq, 2*(82*(73-35*%3)+82-(82*3+41))
%else
    sub bufq, 2*(82*70-3)
%endif
    add bufyq, 2*(3+82*3)
    mov hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu xm0, [bufyq+16*0]
    vinserti128 m0, [bufyq+16*2], 1
    movu xm1, [bufyq+16*1]
    vinserti128 m1, [bufyq+16*3], 1
%if %3
    movu xm2, [bufyq+82*2+16*0]
    vinserti128 m2, [bufyq+82*2+16*2], 1
    movu xm3, [bufyq+82*2+16*1]
    vinserti128 m3, [bufyq+82*2+16*3], 1
    paddw m0, m2
    paddw m1, m3
%endif
    phaddw m0, m1
    movu xm1, [bufyq+16*4]
    vinserti128 m1, [bufyq+16*6], 1
    movu xm2, [bufyq+16*5]
    vinserti128 m2, [bufyq+16*7], 1
%if %3
    movu xm3, [bufyq+82*2+16*4]
    vinserti128 m3, [bufyq+82*2+16*6], 1
    paddw m1, m3
    movu xm3, [bufyq+82*2+16*5]
    vinserti128 m3, [bufyq+82*2+16*7], 1
    paddw m2, m3
%endif
    phaddw m1, m2
    pmulhrsw m0, m5
    pmulhrsw m1, m5
%else
    xor xd, xd
.x_loop_ar0:
    movu m0, [bufyq+xq*2]
    movu m1, [bufyq+xq*2+32]
%endif
    paddw m0, m0
    paddw m1, m1
    pmulhrsw m0, m4
    pmulhrsw m1, m4
%if %2
    paddw m0, [bufq+ 0]
    paddw m1, [bufq+32]
%else
    paddw m0, [bufq+xq*2+ 0]
    paddw m1, [bufq+xq*2+32]
%endif
    pminsw m0, m6
    pminsw m1, m6
    pmaxsw m0, m7
    pmaxsw m1, m7
%if %2
    movu [bufq+ 0], m0
    movu [bufq+32], m1

    ; last 6 pixels
    movu xm0, [bufyq+32*4]
    movu xm1, [bufyq+32*4+16]
%if %3
    paddw xm0, [bufyq+32*4+82*2]
    paddw xm1, [bufyq+32*4+82*2+16]
%endif
    phaddw xm0, xm1
    movu xm1, [bufq+32*2]
    pmulhrsw xm0, xm5
    paddw xm0, xm0
    pmulhrsw xm0, xm4
    paddw xm0, xm1
    pminsw xm0, xm6
    pmaxsw xm0, xm7
    vpblendd xm0, xm1, 0x08     ; top dword keeps the value loaded from buf
    movu [bufq+32*2], xm0
%else
    movu [bufq+xq*2+ 0], m0
    movu [bufq+xq*2+32], m1
    add xd, 32
    cmp xd, 64
    jl .x_loop_ar0

    ; last 12 pixels
    movu m0, [bufyq+64*2]
    movu m1, [bufq+64*2]
    paddw m0, m0
    pmulhrsw m0, m4
    paddw m0, m1
    pminsw m0, m6
    pmaxsw m0, m7
    vpblendd m0, m1, 0xc0       ; top two dwords keep the values loaded from buf
    movu [bufq+64*2], m0
%endif
    add bufq, 82*2
    add bufyq, 82*2<<%3
    dec hd
    jg .y_loop_ar0
    RET

INIT_XMM avx2
.ar1:
    DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift
    imul uvd, 28
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift
    pmovsxbw xm4, xm4
    pshufd xm5, xm4, q1111
    pshufd xm4, xm4, q0000
    pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
    vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
    vpbroadcastd xm3, xm3
%if %2
    sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub bufq, 2*(82*69+3)
%endif
    add bufyq, 2*(79+82*3)
    mov hd, 70-35*%3
    sar maxd, 1
    mov mind, maxd
    xor mind, -1                ; min = ~max
.y_loop_ar1:
    mov xq, -(76>>%2)
    movsx val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu xm0, [bufq+xq*2-82*2-2] ; top/left
%if %2
    movu xm2, [bufyq+xq*4]
%else
    movq xm2, [bufyq+xq*2]
%endif
%if %2
%if %3
    phaddw xm2, [bufyq+xq*4+82*2]
    punpckhqdq xm1, xm2, xm2
    paddw xm2, xm1
%else
    phaddw xm2, xm2
%endif
    pmulhrsw xm2, xm6
%endif
    psrldq xm1, xm0, 4 ; top/right
    punpcklwd xm1, xm2
    psrldq xm2, xm0, 2 ; top
    punpcklwd xm0, xm2
    pmaddwd xm1, xm5
    pmaddwd xm0, xm4
    paddd xm1, xm3
    paddd xm0, xm1
.x_loop_ar1_inner:
    movd val0d, xm0
    psrldq xm0, 4
    imul val3d, cf3d
    add val3d, val0d
    sarx val3d, val3d, shiftd
    movsx val0d, word [bufq+xq*2]
    add val3d, val0d
    cmp val3d, maxd
    cmovg val3d, maxd
    cmp val3d, mind
    cmovl val3d, mind
    mov word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add bufq, 82*2
    add bufyq, 82*2<<%3
    dec hd
    jg .y_loop_ar1
    RET

INIT_YMM avx2
.ar2:
%if WIN64
    %assign stack_size_padded 136
    SUB rsp, stack_size_padded
    WIN64_PUSH_XMM 13 + %2, 8
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    vbroadcasti128 m10, [base+gen_shufA]
    sar bdmaxd, 1
    vbroadcasti128 m11, [base+gen_shufB]
    movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5]
    pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4
    pinsrb xm7, [base+pb_1], 5
    pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3
    movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13
    pmovsxbw m7, xm7
    movd xm8, bdmaxd ; max_grain
    pshufd m4, m7, q0000
    vpbroadcastw xm12, [base+round_vals-12+shiftq*2]
    pshufd m5, m7, q1111
    pcmpeqd xm9, xm9
    pshufd m6, m7, q2222
    pxor xm9, xm8 ; min_grain
    pshufd xm7, xm7, q3333
    DEFINE_ARGS buf, bufy, fg_data, h, x
%if %2
    vpbroadcastw xm13, [base+hmul_bits+2+%3*2]
    sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub bufq, 2*(82*69+3)
%endif
    add bufyq, 2*(79+82*3)
    mov hd, 70-35*%3
.y_loop_ar2:
    mov xq, -(76>>%2)
.x_loop_ar2:
    vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
    vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5]
    pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    pmaddwd m0, m4
    pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]
    pmaddwd m1, m5
    punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5]
%if %2
    movu xm3, [bufyq+xq*4]
%if %3
    paddw xm3, [bufyq+xq*4+82*2]
%endif
    phaddw xm3, xm3
    pmulhrsw xm3, xm13
%else
    movq xm3, [bufyq+xq*2]
%endif
    punpcklwd xm3, xm12 ; luma, round interleaved
    vpblendd m2, m3, 0x0f
    pmaddwd m2, m6
    paddd m1, m0
    movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5]
    paddd m2, m1
    vextracti128 xm1, m2, 1
    paddd xm2, xm1
    pshufd xm1, xm0, q3321
    pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword
.x_loop_ar2_inner:
    pmaddwd xm3, xm7, xm0
    paddd xm3, xm2
    psrldq xm2, 4 ; shift top to next pixel
    psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; we do not need to packssdw since we only care about one value
    paddd xm3, xm1
    psrldq xm1, 4
    pminsd xm3, xm8
    pmaxsd xm3, xm9
    pextrw [bufq+xq*2], xm3, 0
    psrldq xm0, 2
    pslldq xm3, 2
    pblendw xm0, xm3, 00000010b
    inc xq
    jz .x_loop_ar2_end
    test xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add bufq, 82*2
    add bufyq, 82*2<<%3
    dec hd
    jg .y_loop_ar2
    RET

.ar3:
%if WIN64
    %assign stack_offset 32
    %assign stack_size_padded 152
    SUB rsp, stack_size_padded
    WIN64_PUSH_XMM 14 + %2, 8
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    vpbroadcastw xm11, [base+round_vals-12+shiftq*2]
    sar bdmaxd, 1
    movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]
    pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma
    movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7]
    pmovsxbw m7, xm7
%if %2
    vpbroadcastw xm14, [base+hmul_bits+2+%3*2]
%endif
    pshufd m4, m7, q0000
    pshufd m5, m7, q1111
    pshufd m6, m7, q2222
    pshufd m7, m7, q3333
    movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14]
    pinsrb xm0, [base+pb_1], 3
    pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1
    pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2
    pmovsxbw m0, xm0
    movd xm12, bdmaxd ; max_grain
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pcmpeqd xm13, xm13
    punpckhqdq xm10, xm0, xm0
    pxor xm13, xm12 ; min_grain
    pinsrw xm10, [base+round_vals-10+shiftq*2], 3
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub bufq, 2*(82*(73-35*%3)+44-(82*3+41))
%else
    sub bufq, 2*(82*69+3)
%endif
    add bufyq, 2*(79+82*3)
    mov hd, 70-35*%3
.y_loop_ar3:
    mov xq, -(76>>%2)
.x_loop_ar3:
    movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
    vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4]
    movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8]
    vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12]
    palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5]
    palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6]
    punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    pmaddwd m0, m4
    pmaddwd m2, m6
    pmaddwd m3, m5
    paddd m0, m2
    paddd m0, m3
    movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
    vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8]
%if %2
    movu xm3, [bufyq+xq*4]
%if %3
    paddw xm3, [bufyq+xq*4+82*2]
%endif
    phaddw xm3, xm3
    pmulhrsw xm3, xm14
%else
    movq xm3, [bufyq+xq*2]
%endif
    punpcklwd m1, m3
    pmaddwd m1, m7
    paddd m0, m1
    psrldq m1, m2, 4
    psrldq m3, m2, 6
    vpblendd m3, m11, 0x0f ; rounding constant
    punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd]
    pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6]
    psrldq m3, m2, 2
    punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4]
    paddd m0, m1
    movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
    paddd m0, m2
    vextracti128 xm2, m0, 1
    paddd xm0, xm2
.x_loop_ar3_inner:
    pmaddwd xm2, xm1, xm10
    pshuflw xm3, xm2, q1032
    paddd xm2, xm0 ; add top
    paddd xm2, xm3 ; left+cur
    psrldq xm0, 4
    psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq xm1, 2
    ; no need to packssdw since we only care about one value
    pminsd xm2, xm12
    pmaxsd xm2, xm13
    pextrw [bufq+xq*2], xm2, 0
    pslldq xm2, 4
    pblendw xm1, xm2, 00000100b
    inc xq
    jz .x_loop_ar3_end
    test xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add bufq, 82*2
    add bufyq, 82*2<<%3
    dec hd
    jg .y_loop_ar3
    RET
%endmacro

cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, unused, sby, see
%define base r11-grain_min
    lea r11, [grain_min]
    mov r6d, r9m ; bdmax
    mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov r7d, [fg_dataq+FGData.scaling_shift]
    mov sbyd, sbym
    vpbroadcastd m8, r9m
    shr r6d, 11 ; is_12bpc
    vpbroadcastd m9, [base+grain_min+r6*4]
    shlx r10d, r9d, r6d
    vpbroadcastd m10, [base+grain_max+r6*4]
    lea r9d, [r6+r9*4]
    vpbroadcastw m11, [base+mul_bits+r7*2-12]
    vpbroadcastd m12, [base+fg_min+r10*4]
    vpbroadcastd m13, [base+fg_max+r9*4]
    test sbyd, sbyd
    setnz r7b
    vpbroadcastd m14, [base+pd_16]
    test r7b, [fg_dataq+FGData.overlap_flag]
    jnz .vertical_overlap

    imul seed, sbyd, (173 << 24) | 37
    add seed, (105 << 24) | 178
    rorx seed, seed, 24
    movzx seed, seew
    xor seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    lea src_bakq, [srcq+wq*2]
    neg wq
    sub dstq, srcq

.loop_x:
    rorx r6, seeq, 1
    or seed, 0xEFF4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed
    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164
    lea offyd, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak

    mov grain_lutq, grain_lutmp
    mov hd, hm
.loop_y:
    ; scaling[src]
    mova m0, [srcq+ 0]
    mova m1, [srcq+32]
    pand m4, m8, m0
    psrld m3, m0, 16
    mova m6, m9
    vpgatherdd m2, [scalingq+m4-0], m9
    pand m3, m8
    mova m9, m6
    vpgatherdd m4, [scalingq+m3-2], m6
    pand m5, m8, m1
    mova m6, m9
    vpgatherdd m3, [scalingq+m5-0], m9
    pblendw m4, m2, 0x55
    psrld m2, m1, 16
    mova m9, m6
    pand
m2, m8 vpgatherdd m5, [scalingq+m2-2], m6 pblendw m5, m3, 0x55 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m4, [grain_lutq+offxyq*2] pmulhrsw m5, [grain_lutq+offxyq*2+32] ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hd jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp byte [fg_dataq+FGData.overlap_flag], 0 je .loop_x movq xm7, [pw_27_17_17_27] cmp dword r8m, 0 ; sby jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m4, m2, 0x55 psrld m2, m1, 16 mova m9, m6 pand m2, m8 vpgatherdd m5, [scalingq+m2-2], m6 pblendw m5, m3, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] movd xm6, [grain_lutq+left_offxyq*2] punpcklwd xm6, xm3 pmaddwd xm6, xm7 paddd xm6, xm14 psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm9 pminsw xm6, xm10 vpblendd m3, m6, 0x01 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 
pmulhrsw m4, m3 pmulhrsw m5, [grain_lutq+offxyq*2+32] ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hd jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp dword r8m, 0 ; sby jne .loop_x_hv_overlap jmp .loop_x_h_overlap .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see, src_bak movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq .loop_x_v_overlap: vpbroadcastd m15, [pw_27_17_17_27] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, unused, top_offxy rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, unused, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_v_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m2, m4, 0xaa psrld m4, 
m1, 16 mova m9, m6 pand m4, m8 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq*2] movu m5, [grain_lutq+top_offxyq*2] punpcklwd m4, m5, m6 punpckhwd m5, m6 pmaddwd m4, m15 pmaddwd m5, m15 movu m7, [grain_lutq+offxyq*2+32] movu m6, [grain_lutq+top_offxyq*2+32] paddd m4, m14 paddd m5, m14 psrad m4, 5 psrad m5, 5 packssdw m4, m5 punpcklwd m5, m6, m7 punpckhwd m6, m7 pmaddwd m5, m15 pmaddwd m6, m15 paddd m5, m14 paddd m6, m14 psrad m5, 5 psrad m6, 5 packssdw m5, m6 pmaxsw m4, m9 pmaxsw m5, m9 pminsw m4, m10 pminsw m5, m10 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m11 pmaddubsw m3, m11 paddw m2, m2 paddw m3, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hb jz .end_y_v_overlap vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: add wq, 32 jge .end lea srcq, [src_bakq+wq*2] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap .loop_x_hv_overlap: vpbroadcastd m15, [pw_27_17_17_27] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+32] lea left_offxyd, [offyq+32] rorx offyd, seed, 8 rorx offxd, 
seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_hv_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m9, m6 pand m4, m8 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m7, [grain_lutq+offxyq*2] movd xm6, [grain_lutq+left_offxyq*2] movu m5, [grain_lutq+top_offxyq*2] movd xm4, [grain_lutq+topleft_offxyq*2] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklwd xm6, xm7 punpcklwd xm4, xm5 punpcklqdq xm6, xm4 movddup xm4, [pw_27_17_17_27] pmaddwd xm6, xm4 paddd xm6, xm14 psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm9 pminsw xm6, xm10 pshuflw xm4, xm6, q1032 vpblendd m6, m7, 0xfe vpblendd m4, m5, 0xfe ; followed by v interpolation (top | cur -> cur) punpckhwd m5, m7 pmaddwd m5, m15 punpcklwd m4, m6 pmaddwd m4, m15 movu m7, [grain_lutq+offxyq*2+32] movu m6, [grain_lutq+top_offxyq*2+32] paddd m5, m14 paddd m4, m14 psrad m5, 5 psrad m4, 5 packssdw m4, m5 punpcklwd m5, m6, m7 punpckhwd m6, m7 pmaddwd m5, m15 pmaddwd m6, m15 paddd m5, m14 paddd m6, m14 psrad m5, 5 psrad m6, 5 packssdw m5, m6 pmaxsw m4, m9 pmaxsw m5, m9 pminsw m4, m10 pminsw m5, m10 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m11 pmaddubsw m3, m11 paddw m2, m2 paddw m3, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 
0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hb jz .end_y_hv_overlap vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_hv_overlap movq xm7, [pw_27_17_17_27] jmp .loop_y_h_overlap .end_y_hv_overlap: add wq, 32 lea srcq, [src_bakq+wq*2] jl .loop_x_hv_overlap .end: RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r12-grain_min lea r12, [grain_min] mov r9d, r13m ; bdmax mov r7d, [fg_dataq+FGData.scaling_shift] mov r11d, is_idm mov sbyd, sbym vpbroadcastw m11, [base+mul_bits+r7*2-12] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] shr r9d, 11 ; is_12bpc vpbroadcastd m8, [base+grain_min+r9*4] shlx r10d, r6d, r9d vpbroadcastd m9, [base+grain_max+r9*4] vpbroadcastw m10, r13m shlx r6d, r6d, r11d vpbroadcastd m12, [base+fg_min+r10*4] lea r6d, [r9+r6*2] vpbroadcastd m13, [base+fg_max+r6*4] test sbyd, sbyd setnz r7b cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused, sby, see, overlap %if %1 mov r6d, r11m vpbroadcastd m0, [base+pb_8_9_0_1] vpbroadcastd m1, [base+uv_offset_mul+r9*4] vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] pshufb m14, m0 ; { uv_luma_mult, uv_mult } pmaddwd m15, m1 %else %if %2 vpbroadcastq m15, [base+pw_23_22] %else vpbroadcastq m15, [base+pw_27_17_17_27] %endif vpbroadcastd m14, [base+pd_16] %endif test r7b, [fg_dataq+FGData.overlap_flag] jnz %%vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, fg_data, w, 
scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride mov lumaq, r9mp mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r9mp, r10 mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, unused2, unused3, luma, lstride rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq+ 0] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 pminuw m3, m10 ; clip_pixel() %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 
vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m4, [grain_lutq+offxyq*2] %if %2 pmulhrsw m5, [grain_lutq+offxyq*2+82*2] %else pmulhrsw m5, [grain_lutq+offxyq*2+32] %endif ; dst = clip_pixel(src, noise) %if %1 paddw m0, m4 paddw m1, m5 %else paddw m0, m4, [srcq] %if %2 paddw m1, m5, [srcq+strideq] %else paddw m1, m5, [srcq+32] %endif %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 %else dec hb %endif jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp byte [fg_dataq+FGData.overlap_flag], 0 je %%loop_x cmp dword r8m, 0 ; sby jne %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] 
vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m2, [grain_lutq+offxyq*2] %if %2 movu m3, [grain_lutq+offxyq*2+82*2] %else movu m3, [grain_lutq+offxyq*2+32] %endif movd xm6, [grain_lutq+left_offxyq*2] %if %2 pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} punpckldq xm7, xm2, xm3 ; {cur0, cur1} punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} %else punpcklwd xm6, xm2 %endif %if %1 %if %2 vpbroadcastq xm7, [pw_23_22] %else movq xm7, [pw_27_17_17_27] %endif pmaddwd xm6, xm7 vpbroadcastd xm7, [pd_16] paddd xm6, xm7 %else pmaddwd xm6, xm15 paddd xm6, xm14 %endif psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm8 pminsw xm6, xm9 vpblendd m2, m6, 0x01 %if %2 pshuflw xm6, xm6, q1032 vpblendd m3, m6, 0x01 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) %if %1 paddw m0, m2 paddw m1, m3 %else paddw m0, m2, 
[srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, r10mp %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 %else dec hb %endif jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp dword r8m, 0 ; sby jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %%vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, unused1, unused2, unused3, lstride movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, top_offxy, unused2, luma, lstride mov lumaq, r9mp mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r9mp, r10 mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x_v_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 
h, offxy, see, unused1, top_offxy, unused2, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 == 0 lea r10, [pw_27_17_17_27] %endif %%loop_y_v_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq*2] movu m3, [grain_lutq+top_offxyq*2] punpcklwd m2, m3, m6 punpckhwd m3, m6 ; { top, cur } %if %3 vpbroadcastd m0, [pw_23_22] %elif %2 vpbroadcastd m0, [pw_27_17_17_27] %else vpbroadcastd m0, [r10] %endif REPX {pmaddwd x, m0}, m2, m3 %if %1 vpbroadcastd m1, [pd_16] REPX {paddd x, m1}, m2, m3 %else REPX {paddd x, m14}, m2, m3 %endif REPX {psrad x, 5}, m2, m3 packssdw m2, m3 %if %2 movu m3, [grain_lutq+offxyq*2+82*2] %else movu m3, [grain_lutq+offxyq*2+32] %endif %if %3 pmaxsw m2, m8 
pminsw m2, m9 %else %if %2 movu m7, [grain_lutq+top_offxyq*2+82*2] punpckhwd m6, m3, m7 ; { cur, top } punpcklwd m3, m7 %else movu m7, [grain_lutq+top_offxyq*2+32] punpckhwd m6, m7, m3 punpcklwd m3, m7, m3 ; { top, cur } %endif pmaddwd m6, m0 pmaddwd m3, m0 %if %1 paddd m6, m1 paddd m3, m1 %else paddd m6, m14 paddd m3, m14 %endif psrad m6, 5 psrad m3, 5 packssdw m3, m6 pmaxsw m2, m8 pmaxsw m3, m8 pminsw m2, m9 pminsw m3, m9 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m2, [srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 sub hb, 2 %else mova [dstq+32], m1 dec hb %endif jle %%end_y_v_overlap %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82*(2<<%2) %if %2 jmp %%loop_y %else add hd, 0x80000000 jc %%loop_y add r10, 4 jmp %%loop_y_v_overlap %endif %%end_y_v_overlap: add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %%loop_x_hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride %if %2 == 0 lea r14, [pw_27_17_17_27] %endif lea 
topleft_offxyq, [top_offxyq+(32>>%2)] lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %%loop_y_hv_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m0, [grain_lutq+offxyq*2] movd xm2, [grain_lutq+left_offxyq*2] movu m6, [grain_lutq+top_offxyq*2] %if %2 pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 movu m3, 
[grain_lutq+offxyq*2+82*2] punpckldq xm1, xm0, xm3 ; { cur0, cur1 } %if %3 vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } %else vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] vpblendd m2, m7, 0x20 movd xm7, [grain_lutq+top_offxyq*2+82*2] punpckldq xm7, xm6 vinserti128 m1, xm7, 1 movu m7, [grain_lutq+top_offxyq*2+82*2] %endif punpcklwd m2, m1 ; { cur, left } %if %1 vpbroadcastq m1, [pw_23_22] pmaddwd m2, m1 vpbroadcastd m1, [pd_16] paddd m2, m1 psrad m2, 5 packssdw m2, m2 vpermq m2, m2, q3120 %else pmaddwd m2, m15 paddd m2, m14 psrad m2, 5 vextracti128 xm1, m2, 1 packssdw xm2, xm1 %endif %else pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 movu m3, [grain_lutq+offxyq*2+32] movu m7, [grain_lutq+top_offxyq*2+32] punpckldq xm1, xm0, xm6 punpcklwd xm2, xm1 ; { cur, left } %if %1 movddup xm1, [pw_27_17_17_27] pmaddwd xm2, xm1 vpbroadcastd m1, [pd_16] paddd xm2, xm1 %else pmaddwd xm2, xm15 paddd xm2, xm14 %endif psrad xm2, 5 packssdw xm2, xm2 %endif pmaxsw xm2, xm8 pminsw xm2, xm9 vpblendd m0, m2, 0x01 %if %2 pshufd xm2, xm2, q0321 vpblendd m3, m2, 0x01 %if %3 == 0 pshufd xm2, xm2, q0321 vpblendd m7, m2, 0x01 %endif %endif pshuflw xm2, xm2, q1032 vpblendd m2, m6, 0xfe punpckhwd m6, m0 ; { top, cur } punpcklwd m2, m0 %if %3 vpbroadcastd m0, [pw_23_22] %elif %2 vpbroadcastd m0, [pw_27_17_17_27] %else vpbroadcastd m0, [r14] %endif pmaddwd m6, m0 pmaddwd m2, m0 %if %1 paddd m6, m1 paddd m2, m1 %else paddd m6, m14 paddd m2, m14 %endif psrad m6, 5 psrad m2, 5 packssdw m2, m6 %if %3 pmaxsw m2, m8 pminsw m2, m9 %else %if %2 punpckhwd m6, m3, m7 punpcklwd m3, m7 ; { cur, top } %else punpckhwd m6, m7, m3 punpcklwd m3, m7, m3 ; { top, cur } %endif REPX {pmaddwd x, m0}, m6, m3 %if %1 REPX {paddd x, m1}, m6, m3 %else REPX {paddd x, m14}, m6, m3 %endif REPX {psrad x, 5}, m6, m3 packssdw m3, m6 pmaxsw m2, m8 pmaxsw m3, m8 pminsw m2, m9 
pminsw m3, m9 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m2, [srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, r10mp %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 jg %%loop_y_h_overlap %else dec hb jle %%end_y_hv_overlap add hd, 0x80000000 jc %%loop_y_h_overlap add r14, 4 jmp %%loop_y_hv_overlap %endif %%end_y_hv_overlap: add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] jmp %%loop_x_hv_overlap %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro GEN_GRAIN_UV_FN 420, 1, 1 FGUV_FN 420, 1, 1 GEN_GRAIN_UV_FN 422, 1, 0 FGUV_FN 422, 1, 0 GEN_GRAIN_UV_FN 444, 0, 0 FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain16_avx512.asm000066400000000000000000000763211517466257200252200ustar00rootroot00000000000000; Copyright © 2022, VideoLAN and dav2d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" %if ARCH_X86_64 SECTION_RODATA 16 scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32 fg_min: times 2 dw 0 times 2 dw 64 times 2 dw 256 fg_max: times 2 dw 1023 times 2 dw 4095 times 2 dw 960 times 2 dw 3840 times 2 dw 940 times 2 dw 3760 scale_rnd: dd 64 dd 16 uv_offset_mul: dd 256 dd 1024 pb_8_9_0_1: db 8, 9, 0, 1 cextern pb_0to63 SECTION .text INIT_ZMM avx512icl cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \ grain_lut, offx, sby, see, offy, src_bak %define base r11-fg_min lea r11, [fg_min] mov r6d, r9m ; bdmax mov r9d, [fg_dataq+FGData.clip_to_restricted_range] mov r7d, [fg_dataq+FGData.scaling_shift] mov sbyd, sbym vpbroadcastd m6, r9m shr r6d, 11 ; is_12bpc vbroadcasti32x4 m7, [base+scale_mask] shlx r10d, r9d, r6d vpbroadcastd m10, [base+scale_shift+r7*4-32] lea r9d, [r6+r9*4] vpbroadcastd m8, [base+fg_min+r10*4] kxnorw k1, k1, k1 ; 0xffff vpbroadcastd m9, [base+fg_max+r9*4] mov r12, 0xeeeeeeeeeeeeeeee vpbroadcastd m19, 
[base+scale_rnd+r6*4] kshiftrb k2, k1, 4 ; 0xf vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8] kmovq k3, r12 vpbroadcastd m11, [base+scale_shift+r6*8+4] test sbyd, sbyd setnz r7b vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0] vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4] test r7b, [fg_dataq+FGData.overlap_flag] jnz .v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak mov grain_lutq, grain_lutmp mov hd, hm .loop_y: movu m4, [grain_lutq+offxyq*2+82*0] movu m5, [grain_lutq+offxyq*2+82*2] call .add_noise sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp byte [fg_dataq+FGData.overlap_flag], 0 je .loop_x test sbyd, sbyd jnz .hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, left_offxy lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: movu m4, [grain_lutq+offxyq*2+82*0] movu m5, [grain_lutq+offxyq*2+82*2] movd xm17, [grain_lutq+left_offxyq*2-82*1] pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1 punpckldq xm16, xm4, xm5 punpcklwd xm17, xm16 mova xm16, xm19 vpdpwssd xm16, xm20, xm17 
psrad xm16, 1 packssdw xm16, xm16 vpsravw xm16, xm11 vmovdqu8 m4{k2}, m16 vpalignr m5{k2}, m16, m16, 4 call .add_noise sub hb, 2 jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq*2] test sbyd, sbyd jnz .hv_overlap jmp .loop_x_h_overlap .v_overlap: movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, _, top_offxy rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, _, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu m16, [grain_lutq+offxyq*2+82*0] movu m0, [grain_lutq+top_offxyq*2+82*0] movu m17, [grain_lutq+offxyq*2+82*2] movu m1, [grain_lutq+top_offxyq*2+82*2] punpckhwd m4, m0, m16 punpcklwd m0, m16 punpckhwd m5, m1, m17 punpcklwd m1, m17 call .add_noise_v sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] ; since fg_dataq.overlap is guaranteed to be set, we never jump back ; to .v_overlap, and instead always fall-through to .hv_overlap .hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of 
top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+73] lea left_offxyd, [offyq+73] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu m5, [grain_lutq+offxyq*2+82*0] movu m0, [grain_lutq+top_offxyq*2+82*0] movd xm17, [grain_lutq+left_offxyq*2-82*1] pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1 movu m2, [grain_lutq+offxyq*2+82*2] movu m1, [grain_lutq+top_offxyq*2+82*2] movd xm18, [grain_lutq+left_offxyq*2+82*1] pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1 punpckldq xm16, xm5, xm0 punpcklwd xm17, xm16 mova xm16, xm19 vpdpwssd xm16, xm20, xm17 punpckldq xm17, xm2, xm1 punpcklwd xm18, xm17 mova xm17, xm19 vpdpwssd xm17, xm20, xm18 punpckhwd m4, m0, m5 punpcklwd m0, m5 punpckhwd m5, m1, m2 punpcklwd m1, m2 psrad xm16, 1 psrad xm17, 1 packssdw xm16, xm17 vpsravw xm16, xm11 vpshuflw m0{k2}, m16, q1302 punpckhqdq xm16, xm16 vpshuflw m1{k2}, m16, q1302 call .add_noise_v sub hb, 2 jg .loop_y_h_overlap add wq, 32 lea srcq, [src_bakq+wq*2] jl .hv_overlap .end: RET ALIGN function_align .add_noise_v: mova m2, m19 vpdpwssd m2, m12, m4 mova m3, m19 vpdpwssd m3, m13, m5 mova m4, m19 vpdpwssd m4, m12, m0 mova m5, m19 vpdpwssd m5, m13, m1 REPX {psrad x, 1}, m2, m3, m4, m5 packssdw m4, m2 packssdw m5, m3 vpsravw m4, m11 vpsravw m5, m11 .add_noise: mova m0, [srcq+strideq*0] mova m1, [srcq+strideq*1] kmovw k4, k1 pand m16, m6, m0 psrld m3, m0, 16 
; NOTE(review): the instructions up to the first `ret` are the tail of a routine
; whose beginning lies before this excerpt; they are reproduced unchanged.
    vpgatherdd m2{k4}, [scalingq+m16]
    vpcmpud k4, m3, m6, 2 ; px <= bdmax
    vpgatherdd m16{k4}, [scalingq+m3]
    kmovw k4, k1
    pand m17, m6, m1
    vpgatherdd m3{k4}, [scalingq+m17]
    vpshufb m2{k3}, m16, m7
    psrld m16, m1, 16
    vpcmpud k4, m16, m6, 2
    vpgatherdd m17{k4}, [scalingq+m16]
    vpshufb m3{k3}, m17, m7
    vpsllvw m2, m10
    vpsllvw m3, m10
    pmulhrsw m4, m2
    pmulhrsw m5, m3
    add grain_lutq, 82*4
    paddw m0, m4
    paddw m1, m5
    pmaxsw m0, m8
    pmaxsw m1, m8
    pminsw m0, m9
    pminsw m1, m9
    mova [dstq+srcq], m0
    add srcq, strideq
    mova [dstq+srcq], m1
    add srcq, strideq
    ret

; FGUV_FN name, ss_hor, ss_ver
; Emits fguv_32x32xn_i<name>_16bpc. ss_hor / ss_ver are 0/1 flags selecting the
; horizontally / vertically subsampled code paths. The body of the function is
; the nested loop macro below, expanded twice: once for the path taken when
; FGData.chroma_scaling_from_luma is zero and once (at .csfl) when it is not.
; Stack arguments read by index: r8m = sby, r9m = luma, r10m = lstride,
; r11m = uv_pl, r13m = the argument following is_id (commented as bdmax).
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-fg_min
    lea r12, [fg_min]
    mov r9d, r13m ; bdmax
    mov r7d, [fg_dataq+FGData.scaling_shift]
    mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
    mov r11d, is_idm
    kxnorw k1, k1, k1 ; 0xffff
    vpbroadcastd m5, r13m
    mov r13, 0xeeeeeeeeeeeeeeee
    vbroadcasti32x4 m6, [base+scale_mask]
    shr r9d, 11 ; is_12bpc
    vpbroadcastd m7, [base+scale_shift+r7*4-32]
    shlx r10d, r6d, r9d
    mov sbyd, sbym
    shlx r6d, r6d, r11d
    ; m8 / m9 are the clamp bounds applied with pmaxsw / pminsw before the store
    vpbroadcastd m8, [base+fg_min+r10*4]
    lea r6d, [r9+r6*2]
    vpbroadcastd m9, [base+fg_max+r6*4]
    kmovq k2, r13
    ; m20 is the accumulator seed and m21 the final shift of the overlap blends
    vpbroadcastd m20, [base+scale_rnd+r9*4]
    packssdw m4, m5, m5
    vpbroadcastd m21, [base+scale_shift+r9*8+4]
    ; m10 holds the weights used when blending with the left neighbour block,
    ; m11 the weights used when blending with the top neighbour block
%if %2
    mova m12, [pb_0to63] ; pw_even
    mov r13d, 0x0101
    vpbroadcastq m10, [base+pw_23_22+r9*8]
    kmovw k3, r13d
%if %3
    pshufd m11, m10, q0000
%else
    vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0]
    vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4]
    vmovdqu16 m11{k1}, m16
%endif
    psrlw m13, m12, 8 ; pw_odd
%else
    vpbroadcastq m10, [base+pw_27_17_17_27+r9*8]
    kshiftrb k3, k1, 7 ; 0x01
    kshiftrb k4, k1, 4 ; 0x0f
    pshufd m11, m10, q0000
%endif
    mov lstrideq, r10mp
    ; r7b = (sby != 0); tested against overlap_flag inside the loop macro
    test sbyd, sbyd
    setnz r7b
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

; Loop macro, parameters: not-csfl, ss_hor, ss_ver.
; Each call to the add_noise helper handles 2<<ss_hor rows (see the `sub hb`
; after every call); columns advance by 32>>ss_hor per outer iteration.
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

%if %1
    ; not-csfl only: per-plane uv_offset / uv_mult, indexed by uv_pl (r11m)
    mov r6d, r11m
    vpbroadcastd m0, [base+uv_offset_mul+r9*4]
    vpbroadcastd m1, [base+pb_8_9_0_1]
    vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4]
    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
    pmaddwd m14, m0
    pshufb m15, m1 ; { uv_luma_mult, uv_mult }
%endif
    ; vertical overlap path only when sby != 0 and overlap_flag has bit 0 set
    test r7b, [fg_dataq+FGData.overlap_flag]
    jnz %%v_overlap

    ; derive the row seed from sby and FGData.seed
    imul seed, sbyd, (173 << 24) | 37
    add seed, (105 << 24) | 178
    rorx seed, seed, 24
    movzx seed, seew
    xor seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma

    ; the r9m/r10m/r11m stack slots are reused to hold the end-of-row src, dst
    ; and luma pointers; w is negated so it counts up towards zero
    mov lumaq, r9mp
    lea r12, [srcq+wq*2]
    lea r13, [dstq+wq*2]
    lea r14, [lumaq+wq*(2<<%2)]
    mov r9mp, r12
    mov r10mp, r13
    mov r11mp, r14
    neg wq

    ; no overlap: one iteration per column block
%%loop_x:
    rorx r6, seeq, 1
    or seed, 0xeff4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed
    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164>>%3
    lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma

    mov grain_lutq, grain_lutmp
    mov hd, hm
%%loop_y:
    ; load the grain rows for this iteration into m18 / m19
%if %2
    movu ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
    movu ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
%else
    movu m18, [grain_lutq+offxyq*2+82*0]
    movu m19, [grain_lutq+offxyq*2+82*2]
%endif
    call %%add_noise
    sub hb, 2<<%2
    jg %%loop_y
    add wq, 32>>%2
    jge .end
    ; rewind to the top of the next column block
    mov srcq, r9mp
    mov dstq, r10mp
    mov lumaq, r11mp
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    lea lumaq, [lumaq+wq*(2<<%2)]
    cmp byte [fg_dataq+FGData.overlap_flag], 0
    je %%loop_x
    cmp dword r8m, 0 ; sby
    jne %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx r6, seeq, 1
    or seed, 0xEFF4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy

    lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164>>%3
    lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
%%loop_y_h_overlap:
    ; blend the leftmost grain samples with the previous column's grain
    ; (weights in m10, accumulator seeded from m20, final shift by m21)
%if %2
    movu ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
    movu ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
    movd xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
    punpckldq m16, m17
    punpckldq m17, m18, m19
    punpcklwd m16, m17
    mova m17, m20
    vpdpwssd m17, m16, m10
    psrad m17, 1
    packssdw m17, m17
    vpsravw m17, m21
%else
    movu m18, [grain_lutq+offxyq*2+82*0]
    movu m19, [grain_lutq+offxyq*2+82*2]
    movd xm16, [grain_lutq+left_offxyq*2+82*0]
    pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1
    punpckldq xm17, xm18, xm19
    punpcklwd xm16, xm17
    mova xm17, xm20
    vpdpwssd xm17, xm16, xm10
    psrad xm17, 1
    packssdw xm17, xm17
    vpsravw xm17, xm21
%endif
    ; merge the blended samples back into the grain rows under mask k3
    vmovdqa32 m18{k3}, m17
    vpshufd m19{k3}, m17, q0321
    call %%add_noise
    sub hb, 2<<%2
    jg %%loop_y_h_overlap
    add wq, 32>>%2
    jge .end
    mov srcq, r9mp
    mov dstq, r10mp
    mov lumaq, r11mp
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    lea lumaq, [lumaq+wq*(2<<%2)]
    cmp dword r8m, 0 ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

    ; vertical overlap for the first column block
%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

    ; compute two seeds at once, packed into one 32-bit register
    movzx sbyd, sbyb
    imul seed, [fg_dataq+FGData.seed], 0x00010001
    imul r7d, sbyd, 173 * 0x00010001
    imul sbyd, 37 * 0x01000100
    add r7d, (105 << 16) | 188
    add sbyd, (178 << 24) | (141 << 8)
    and r7d, 0x00ff00ff
    and sbyd, 0xff00ff00
    xor seed, r7d
    xor seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, _, top_offxy

    mov lumaq, r9mp
    lea r12, [srcq+wq*2]
    lea r13, [dstq+wq*2]
    lea r14, [lumaq+wq*(2<<%2)]
    mov r9mp, r12
    mov r10mp, r13
    mov r11mp, r14
    neg wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    rorx offyd, seed, 8
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, _, top_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
    ; split the packed offsets: low half is the top block, high half the current
    movzx top_offxyd, offxyw
    shr offxyd, 16

    ; interleave top-block and current-block grain for the blend in add_noise_v
%if %3
    movu ym16, [grain_lutq+offxyq*2+82*0]
    movu ym1, [grain_lutq+top_offxyq*2+82*0]
    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
    movu ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd ym17, ym1, ym16
    punpckhwd ym1, ym16
%elif %2
    movu ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
    movu ym17, [grain_lutq+top_offxyq*2+82*0]
    vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1
    movu ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd m16, m17, m18
    punpckhwd m17, m18
%else
    movu m18, [grain_lutq+offxyq*2+82*0]
    movu m19, [grain_lutq+top_offxyq*2+82*0]
    movu m2, [grain_lutq+offxyq*2+82*2]
    movu m16, [grain_lutq+top_offxyq*2+82*2]
    punpckhwd m1, m19, m18
    punpcklwd m19, m18
    punpckhwd m18, m2, m16
    punpcklwd m2, m16
%endif
    call %%add_noise_v
    ; only the first call blends with the top block; remaining rows of this
    ; column use the plain loop
    sub hb, 2<<%2
    jg %%loop_y
    add wq, 32>>%2
    jge .end
    mov srcq, r9mp
    mov dstq, r10mp
    mov lumaq, r11mp
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    lea lumaq, [lumaq+wq*(2<<%2)]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    ; offsets of the previous column's blocks, taken before they are overwritten
    lea topleft_offxyq, [top_offxyq+(32>>%2)]
    lea left_offxyq, [offyq+(32>>%2)]
    rorx offyd, seed, 8
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
    movzx top_offxyd, offxyw
    shr offxyd, 16

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movd xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
    movu ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
    movu ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
    punpckldq m16, m17
    punpckldq m17, m18, m19
    punpcklwd m16, m17
    movu ym1, [grain_lutq+top_offxyq*2+82*0]
    movd xm17, [grain_lutq+topleft_offxyq*2+82*0]
    mova m0, m20
    vpdpwssd m0, m16, m10
%if %3
    punpcklwd xm17, xm1
    mova xm16, xm20
    vpdpwssd xm16, xm17, xm10
    psrad xm16, 1
%else
    vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1
    vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2
    punpcklwd m17, m1
    mova m16, m20
    vpdpwssd m16, m17, m10
    psrad m16, 1
%endif
    psrad m0, 1
    packssdw m0, m16
    vpsravw m0, m21
    vmovdqa32 m18{k3}, m0
    vpshufd m19{k3}, m0, q0321
%if %3
    vpunpckhdq ym1{k3}, ym0, ym0
    punpcklwd ym17, ym1, ym18
    punpckhwd ym1, ym18
%else
    vpunpckhdq m1{k3}, m0, m0
    punpcklwd m16, m1, m18
    punpckhwd m17, m1, m18
%endif
%else
    movu m18, [grain_lutq+offxyq*2+82*0]
    movu m19, [grain_lutq+top_offxyq*2+82*0]
    movd xm17, [grain_lutq+left_offxyq*2+82*0]
    pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
    punpckldq xm16, xm18, xm19
    punpcklwd xm17, xm16
    movu m2, [grain_lutq+offxyq*2+82*2]
    movu m0, [grain_lutq+top_offxyq*2+82*2]
    movd xm16, [grain_lutq+left_offxyq*2+82*2]
    pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
    punpckldq xm1, xm2, xm0
    punpcklwd xm1, xm16, xm1
    mova xm16, xm20
    vpdpwssd xm16, xm17, xm10
    mova xm17, xm20
    vpdpwssd xm17, xm1, xm10
    punpckhwd m1, m19, m18
    punpcklwd m19, m18
    punpckhwd m18, m2, m0
    punpcklwd m2, m0
    psrad xm16, 1
    psrad xm17, 1
    packssdw xm16, xm17
    vpsravw xm16, xm21
    vpshuflw m19{k4}, m16, q1302
    punpckhqdq xm16, xm16
    vpshuflw m2{k4}, m16, q3120
%endif
    call %%add_noise_v
    sub hb, 2<<%2
    jg %%loop_y_h_overlap
    add wq, 32>>%2
    jge .end
    mov srcq, r9mp
    mov dstq, r10mp
    mov lumaq, r11mp
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    lea lumaq, [lumaq+wq*(2<<%2)]
    jmp %%hv_overlap

ALIGN function_align
    ; add_noise_v: blend the interleaved top/current grain (weights m11,
    ; accumulator seeded from m20, final shift by m21), leaving the result in
    ; m18 / m19, then fall through into add_noise
%%add_noise_v:
%if %3
    mova ym16, ym20
    vpdpwssd ym16, ym17, ym11
    mova ym17, ym20
    vpdpwssd ym17, ym1, ym11
    psrad ym16, 1
    psrad ym17, 1
    packssdw ym16, ym17
    vpsravw m18{k1}, m16, m21
%elif %2
    mova m18, m20
    vpdpwssd m18, m16, m11
    mova m16, m20
    vpdpwssd m16, m17, m11
    psrad m18, 1
    psrad m16, 1
    packssdw m18, m16
    vpsravw m18, m21
%else
    mova m16, m20
    vpdpwssd m16, m1, m11
    mova m17, m20
    vpdpwssd m17, m18, m11
    mova m18, m20
    vpdpwssd m18, m19, m11
    mova m19, m20
    vpdpwssd m19, m2, m11
    REPX {psrad x, 1}, m16, m17, m18, m19
    packssdw m18, m16
    packssdw m19, m17
    vpsravw m18, m21
    vpsravw m19, m21
%endif
    ; add_noise: expects grain in m18 / m19; loads luma and src rows, looks up
    ; the scaling table, applies the noise and stores to dst
%%add_noise:
%if %2
    ; horizontally subsampled: average even/odd luma words (m12 / m13 permutes)
    mova m2, [lumaq+lstrideq*(0<<%3)]
    mova m0, [lumaq+lstrideq*(1<<%3)]
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
    mova m3, [lumaq+lstrideq*(0<<%3)]
    mova m1, [lumaq+lstrideq*(1<<%3)]
    mova m16, m12
    vpermi2w m16, m2, m0
    vpermt2w m2, m13, m0
    mova m17, m12
    vpermi2w m17, m3, m1
    vpermt2w m3, m13, m1
    pavgw m2, m16
    pavgw m3, m17
%elif %1
    mova m2, [lumaq+lstrideq*0]
    mova m3, [lumaq+lstrideq*1]
%endif
%if %2
    mova ym16, [srcq+strideq*0]
    vinserti32x8 m16, [srcq+strideq*1], 1
    lea srcq, [srcq+strideq*2]
%else
    mova m16, [srcq+strideq*0]
%endif
%if %1
    ; not-csfl: combine luma and chroma with { uv_luma_mult, uv_mult } (m15),
    ; starting from the offset term in m14, then shift right by 6
    punpckhwd m17, m2, m16
    mova m0, m14
    vpdpwssd m0, m17, m15
    punpcklwd m17, m2, m16
    mova m2, m14
    vpdpwssd m2, m17, m15
%endif
%if %2
    mova ym17, [srcq+strideq*0]
    vinserti32x8 m17, [srcq+strideq*1], 1
%else
    mova m17, [srcq+strideq*1]
%endif
%if %1
    psrad m0, 6
    psrad m2, 6
    packusdw m2, m0
    punpckhwd m0, m3, m17
    mova m1, m14
    vpdpwssd m1, m15, m0
    punpcklwd m0, m3, m17
    mova m3, m14
    vpdpwssd m3, m15, m0
    psrad m1, 6
    psrad m3, 6
    packusdw m3, m1
    pminuw m2, m4
    pminuw m3, m4

    ; shared tail; the csfl expansion jumps here from its own prologue
.add_noise_main:
    ; scaling[luma_src]
    kmovw k5, k1
    pand m1, m5, m2
    vpgatherdd m0{k5}, [scalingq+m1]
    kmovw k5, k1
    psrld m2, 16
    vpgatherdd m1{k5}, [scalingq+m2]
    vpshufb m0{k2}, m1, m6
    kmovw k5, k1
    psrld m1, m3, 16
    vpgatherdd m2{k5}, [scalingq+m1]
    kmovw k5, k1
    pand m3, m5
    vpgatherdd m1{k5}, [scalingq+m3]
    vpshufb m1{k2}, m2, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    vpsllvw m0, m7
    vpsllvw m1, m7
    pmulhrsw m18, m0
    pmulhrsw m19, m1
    add grain_lutq, 82*(4<<%2)
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
    lea srcq, [srcq+strideq*2]
    ; dst = clamp(src + noise, m8, m9)
    paddw m16, m18
    paddw m17, m19
    pmaxsw m16, m8
    pmaxsw m17, m8
    pminsw m16, m9
    pminsw m17, m9
%if %2
    mova [dstq+strideq*0], ym16
    vextracti32x8 [dstq+strideq*1], m16, 1
    lea dstq, [dstq+strideq*2]
    mova [dstq+strideq*0], ym17
    vextracti32x8 [dstq+strideq*1], m17, 1
%else
    mova [dstq+strideq*0], m16
    mova [dstq+strideq*1], m17
%endif
    lea dstq, [dstq+strideq*2]
    ret
%else
    ; csfl: the scaling index is the luma value itself, masked with m4
%if %2
    pand m2, m4
    pand m3, m4
%else
    pand m2, m4, [lumaq+lstrideq*0]
    pand m3, m4, [lumaq+lstrideq*1]
%endif
    jmp .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain16_sse.asm000066400000000000000000002734561517466257200247740ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

; Read-only constants referenced (via `base+`) by the routines in this file.
SECTION_RODATA 16
pd_16: times 4 dd 16
pw_1: times 8 dw 1
pw_16384: times 8 dw 16384
pw_8192: times 8 dw 8192
pw_23_22: dw 23, 22
          times 3 dw 0, 32
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
pw_27_17_17_27: dw 27, 17, 17, 27
                times 2 dw 0, 32
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512, 1024
max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
min: dw 0, 16*4, 16*16
; these two should be next to each other
pw_4: times 2 dw 4
pw_16: times 2 dw 16

; JMP_TABLE name, n0, n1, ...
; Defines <name>_table and emits one dword per trailing argument N holding the
; offset of the mangled function's local label .arN relative to the table.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3

SECTION .text

%if ARCH_X86_32
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; vpgatherdw: emulated word gather.
; Treats src as packed 16-bit indices and, two per iteration, loads the word at
; [base + index*stride] into the matching word lane of dst. cnt (default 8) is
; the number of lanes gathered, stride defaults to 1. The two GPRs are scratch.
; If an 8th argument is given it is used as the xmm scratch; otherwise src
; itself is overwritten while the indices are shifted down.
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 8
%define %%tmp %8
%endif
%rep (%6/2)
%if %%idx == 0
    movd %5 %+ d, %2
    pshuflw %%tmp, %2, q3232
%else
    movd %5 %+ d, %%tmp
%if %6 == 8
%if %%idx == 2
    punpckhqdq %%tmp, %%tmp
%elif %%idx == 4
    psrlq %%tmp, 32
%endif
%endif
%endif
    ; split the dword into its two 16-bit indices
    movzx %4 %+ d, %5 %+ w
    shr %5 %+ d, 16

%if %%idx == 0
    movd %1, [%3+%4*%7]
%else
    pinsrw %1, [%3+%4*%7], %%idx + 0
%endif
    pinsrw %1, [%3+%5*%7], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

; SPLATD: broadcast the low dword of src to all four dword lanes of dst.
%macro SPLATD 2 ; dst, src
%ifnidn %1, %2
    movd %1, %2
%endif
    pshufd %1, %1, q0000
%endmacro

; SPLATW: broadcast the low word of src to all eight word lanes of dst.
%macro SPLATW 2 ; dst, src
%ifnidn %1, %2
    movd %1, %2
%endif
    pshuflw %1, %1, q0000
    punpcklqdq %1, %1
%endmacro

; generate_grain_y_16bpc(buf, fg_data, bdmax)
; 1. Fills 73 rows of 82 words at buf with entries gathered from
;    gaussian_sequence, four per iteration, driven by FGData.seed and scaled
;    according to FGData.grain_scale_shift and the bit depth.
; 2. Jumps through generate_grain_y_16bpc_ssse3_table on FGData.ar_coeff_lag to
;    one of .ar0 .. .ar3, which filter the buffer in place over 70 rows and 76
;    columns starting at row 3, column 3 (.ar0 returns without filtering).
INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax
    lea r4, [pb_mask]
%define base r4-pb_mask
%else
cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax
    LEA r4, $$
%define base r4-$$
%endif
    movq m1, [base+rnd_next_upperbit_mask]
    movq m4, [base+mul_bits]
    movq m7, [base+hmul_bits]
    mov r3d, [fg_dataq+FGData.grain_scale_shift]
    lea r5d, [bdmaxq+1]
    shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc
    sub r3, r5
    SPLATW m6, [base+round+r3*2-2]
    mova m5, [base+pb_mask]
    SPLATW m0, [fg_dataq+FGData.seed]
    ; r3 runs from -73*82*2 up to 0 as a byte offset from the end of the buffer
    mov r3, -73*82*2
    sub bufq, r3
%if ARCH_X86_64
    lea r6, [gaussian_sequence]
%endif
.loop:
    pand m2, m0, m1
    psrlw m3, m2, 10
    por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw m2, m4 ; bits 0x0f00 are set
    pshufb m3, m5, m2 ; set 15th bit for next 4 seeds
    psllq m2, m3, 30
    por m2, m3
    psllq m3, m2, 15
    por m2, m3 ; aggregate each bit into next seed's high bit
    pmulhuw m3, m0, m7
    por m2, m3 ; 4 next output seeds
    pshuflw m0, m2, q3333
    psrlw m2, 5
%if ARCH_X86_64
    vpgatherdw m3, m2, r6, r5, r7, 4, 2
%else
    vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2
%endif
    paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0
                 ; shifts by 0, which pmulhrsw does not support
    pmulhrsw m3, m6
    movq [bufq+r3], m3
    add r3, 4*2
    jl .loop

    ; auto-regression code
    movsxd r3, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4]
    lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table]
    jmp r3

    ; lag 1: SIMD computes the top-row contribution (plus rounding) for four
    ; pixels; the left-neighbour term is applied per pixel in scalar code
.ar1:
%if WIN64
    DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0
    lea bufq, [r0-2*(82*73-(82*3+79))]
    PUSH r8
%else
%if ARCH_X86_64
    DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0
%else ; x86-32
    DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0
    PUSH r6
%define shiftd r1d
%endif
    ; bufq was left at the end of the buffer; rewind to row 3, column 79
    sub bufq, 2*(82*73-(82*3+79))
%endif
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd m4, [fg_dataq+FGData.ar_coeffs_y]
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
%if WIN64
    DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0
%elif ARCH_X86_64
    DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0
%else ; x86-32
%undef shiftd
    DEFINE_ARGS buf, shift, min, val3, x, cf3, val0
%define hd dword r0m
%define maxd dword minm
%endif
    ; sign-extend the byte coefficients to words
%if cpuflag(sse4)
    pmovsxbw m4, m4
%else
    pxor m3, m3
    pcmpgtb m3, m4
    punpcklbw m4, m3
%endif
    ; word 3 becomes 1 so that the rounding constant paired with it is added as-is
    pinsrw m4, [base+pw_1], 3
    pshufd m5, m4, q1111
    pshufd m4, m4, q0000
    SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd
    mov hd, 70
    ; clamp range: max = bdmax >> 1, min = ~max
    sar maxd, 1
    mov mind, maxd
    xor mind, -1
.y_loop_ar1:
    mov xq, -76
    movsx val3d, word [bufq+xq*2-2]
.x_loop_ar1:
    movu m0, [bufq+xq*2-82*2-2] ; top/left
    psrldq m2, m0, 2 ; top
    psrldq m1, m0, 4 ; top/right
    punpcklwd m0, m2
    punpcklwd m1, m3
    pmaddwd m0, m4
    pmaddwd m1, m5
    paddd m0, m1
.x_loop_ar1_inner:
    movd val0d, m0
    psrldq m0, 4
    imul val3d, cf3d
    add val3d, val0d
    sar val3d, shiftb
    movsx val0d, word [bufq+xq*2]
    add val3d, val0d
    cmp val3d, maxd
    cmovg val3d, maxd
    cmp val3d, mind
    cmovl val3d, mind
    mov word [bufq+xq*2], val3w
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar1
%if WIN64
    POP r8
%elif ARCH_X86_32
    POP r6
%undef maxd
%undef hd
%endif
    ; lag 0: nothing to filter
.ar0:
    RET

    ; lag 2: coefficient pairs are kept in m6-m11 (stack slots on x86-32),
    ; the rounding constant in m12, max/min grain in m13/m14
.ar2:
%if ARCH_X86_32
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, bdmax, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd m0, [base+round_vals-12+shiftq*2]
    pshuflw m0, m0, q0000
    movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11
    pxor m2, m2
    punpcklwd m0, m2
    pcmpgtb m2, m6
    punpckhbw m3, m6, m2
    punpcklbw m6, m2
    pshufd m2, m6, q3333
    pshufd m1, m6, q2222
    pshufd m7, m6, q1111
    pshufd m6, m6, q0000
    pshufd m4, m3, q1111
    pshufd m3, m3, q0000
%if ARCH_X86_64
    SWAP 0, 12
    SWAP 1, 8
    SWAP 2, 9
    SWAP 3, 10
    SWAP 4, 11
%else
%define m12 [rsp+0*16]
%define m8 [rsp+1*16]
%define m9 [rsp+2*16]
%define m10 [rsp+3*16]
%define m11 [rsp+4*16]
    mova m12, m0
    mova m8, m1
    mova m9, m2
    mova m10, m3
    mova m11, m4
    mov bdmaxd, bdmaxm
%endif
    sar bdmaxd, 1
    SPLATW m0, bdmaxd ; max_grain
    pcmpeqw m1, m1
%if !cpuflag(sse4)
    ; without pblendw, build an all-ones mask with word 1 cleared for the
    ; pand/pandn/por merge in the inner loop
    pcmpeqw m2, m2
    psrldq m2, 14
    pslldq m2, 2
    pxor m2, m1
%endif
    pxor m1, m0 ; min_grain
%if ARCH_X86_64
    SWAP 0, 13
    SWAP 1, 14
    SWAP 2, 15
%else
%define m13 [rsp+5*16]
%define m14 [rsp+6*16]
    mova m13, m0
    mova m14, m1
%if !cpuflag(sse4)
%define m15 [rsp+7*16]
    mova m15, m2
%endif
%endif
    sub bufq, 2*(82*73-(82*3+79))
    DEFINE_ARGS buf, fg_data, h, x
    mov hd, 70
.y_loop_ar2:
    mov xq, -76

.x_loop_ar2:
    movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5]
    movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5]
    psrldq m2, m0, 2
    psrldq m3, m0, 4
    psrldq m4, m0, 6
    psrldq m5, m0, 8
    punpcklwd m0, m2
    punpcklwd m3, m4
    punpcklwd m5, m1
    psrldq m2, m1, 2
    psrldq m4, m1, 4
    punpcklwd m2, m4
    psrldq m4, m1, 6
    psrldq m1, 8
    punpcklwd m4, m1
    pmaddwd m0, m6
    pmaddwd m3, m7
    pmaddwd m5, m8
    pmaddwd m2, m9
    pmaddwd m4, m10
    paddd m0, m3
    paddd m5, m2
    paddd m0, m4
    paddd m0, m5 ; accumulated top 2 rows
    paddd m0, m12

    movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5]
    pshufd m4, m1, q3321
    pxor m2, m2
    pcmpgtw m2, m4
    punpcklwd m4, m2 ; in dwords, y=0,x=[0,3]
.x_loop_ar2_inner:
    pmaddwd m2, m1, m11
    paddd m2, m0
    psrldq m0, 4 ; shift top to next pixel
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    paddd m2, m4
    packssdw m2, m2
    pminsw m2, m13
    pmaxsw m2, m14
    psrldq m4, 4
    pslldq m2, 2
    psrldq m1, 2
    ; insert the freshly computed pixel as word 1 of the current-row window
%if cpuflag(sse4)
    pblendw m1, m2, 00000010b
%else
    pand m1, m15
    pandn m3, m15, m2
    por m1, m3
%endif
    ; overwrite previous pixel, this should be ok
    movd [bufq+xq*2-2], m1
    inc xq
    jz .x_loop_ar2_end
    test xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar2
%if ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET

    ; lag 3: coefficients are spread over m5-m13 plus scratch memory at `tmp`;
    ; max/min grain are held in m15/m14
.ar3:
    DEFINE_ARGS buf, fg_data, bdmax, shift
%if WIN64
    ; r6 saves the original rsp; restored before RET
    mov r6, rsp
    and rsp, ~15
    sub rsp, 64
%define tmp rsp
%elif ARCH_X86_64
%define tmp rsp+stack_offset-72
%else
    ALLOC_STACK -16*12
%define tmp rsp
    mov bdmaxd, bdmaxm
%endif
    sar bdmaxd, 1
    SPLATW m7, bdmaxd ; max_grain
    pcmpeqw m6, m6
%if !cpuflag(sse4)
    ; all-ones mask with word 2 cleared, used instead of pblendw below
    pcmpeqw m4, m4
    psrldq m4, 14
    pslldq m4, 4
    pxor m4, m6
%endif
    pxor m6, m7 ; min_grain
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]

%if ARCH_X86_64
    SWAP 6, 14
    SWAP 7, 15
%else
%define m14 [rsp+10*16]
%define m15 [esp+11*16]
    mova m14, m6
    mova m15, m7
%endif

    ; build cf0-1 until 18-19 in m5-12 and r0/1
    pxor m1, m1
    movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
    pcmpgtb m1, m0
    punpckhbw m2, m0, m1
    punpcklbw m0, m1

%if cpuflag(sse4)
    pshufd m4, m2, q3333
%else
    ; m4 holds the blend mask on this path, so this pair goes to memory
    pshufd m5, m2, q3333
    mova [tmp+48], m5
%endif
    pshufd m3, m2, q2222
    pshufd m1, m2, q0000
    pshufd m2, m2, q1111
    pshufd m7, m0, q2222
    pshufd m6, m0, q1111
    pshufd m5, m0, q0000
    pshufd m0, m0, q3333

%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
%else
%define m8 [rsp+4*16]
%define m9 [esp+5*16]
%define m10 [rsp+6*16]
%define m11 [esp+7*16]
%define m12 [rsp+8*16]
    mova m8, m0
    mova m9, m1
    mova m10, m2
    mova m11, m3
    mova m12, m4
%endif

    ; build cf20,round in r2
    ; build cf21-23,round*2 in m13
    pxor m1, m1
    movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pcmpgtb m1, m0
    punpcklbw m0, m1
    pshufd m1, m0, q0000
    pshufd m2, m0, q1111
    mova [tmp+ 0], m1
    mova [tmp+16], m2
    psrldq m3, m0, 10
    pinsrw m3, [base+round_vals+shiftq*2-10], 3

%if ARCH_X86_64
    SWAP 3, 13
%else
%define m13 [esp+9*16]
    mova m13, m3
%endif

    pinsrw m0, [base+round_vals+shiftq*2-12], 5
    pshufd m3, m0, q2222
    mova [tmp+32], m3

    DEFINE_ARGS buf, fg_data, h, x
    sub bufq, 2*(82*73-(82*3+79))
    mov hd, 70
.y_loop_ar3:
    mov xq, -76

.x_loop_ar3:
    movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4]
    movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6]
    palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5]
    palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6]
    punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3]

    pmaddwd m0, m5
    pmaddwd m2, m6
    pmaddwd m3, m7
    paddd m0, m2
    paddd m0, m3
    ; m0 = top line first 6 multiplied by cf, m1 = top line last entry

    movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4]
    movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6]
    punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0]
    palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5]
    palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6]
    punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6]
    punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2]
    shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4]

    pmaddwd m1, m8
    pmaddwd m4, m9
    pmaddwd m3, m10
    pmaddwd m2, m11
    paddd m1, m4
    paddd m3, m2
    paddd m0, m1
    paddd m0, m3
    ; m0 = top 2 lines multiplied by cf

    movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4]
    movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6]
    palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5]
    palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6]
    punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5]
    punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1]
    shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3]
    punpcklwd m2, [base+pw_1]

%if cpuflag(sse4)
    pmaddwd m1, m12
%else
    pmaddwd m1, [tmp+48]
%endif
    pmaddwd m3, [tmp+ 0]
    pmaddwd m4, [tmp+16]
    pmaddwd m2, [tmp+32]
    paddd m1, m3
    paddd m4, m2
    paddd m0, m1
    paddd m0, m4
    ; m0 = top 3 lines multiplied by cf plus rounding for downshift

    movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pmaddwd m2, m1, m13
    pshufd m3, m2, q1111
    paddd m2, m3 ; left+cur
    paddd m2, m0 ; add top
    psrldq m0, 4
    psrad m2, [fg_dataq+FGData.ar_coeff_shift]
    packssdw m2, m2
    pminsw m2, m15
    pmaxsw m2, m14
    pslldq m2, 4
    psrldq m1, 2
    ; insert the freshly computed pixel as word 2 of the current-row window
%if cpuflag(sse4)
    pblendw m1, m2, 00000100b
%else
    pand m1, m12
    pandn m3, m12, m2
    por m1, m3
%endif
    ; overwrite a couple of pixels, should be ok
    movq [bufq+xq*2-4], m1
    inc xq
    jz .x_loop_ar3_end
    test xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82*2
    dec hd
    jg .y_loop_ar3
%if WIN64
    mov rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET

%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
%if ARCH_X86_64
cglobal generate_grain_uv_%1_16bpc,
4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm lea r6d, [bdmaxq+1] %else cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %define base r2-$$ LEA r2, $$ mov fg_dataq, r2m mov r6d, r4m inc r6d %endif movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r5d, [fg_dataq+FGData.grain_scale_shift] shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc sub r5, r6 SPLATW m6, [base+round+r5*2-2] mova m5, [base+pb_mask] SPLATW m0, [fg_dataq+FGData.seed] %if ARCH_X86_64 SPLATW m2, [base+pw_seed_xor+uvq*4] %else mov r5d, r3m SPLATW m2, [base+pw_seed_xor+r5*4] %endif pxor m0, m2 %if ARCH_X86_64 lea r6, [gaussian_sequence] %endif %if %2 mov hd, 73-35*%3 add bufq, 44*2 .loop_y: mov xq, -44 %else mov xq, -82*73 add bufq, 82*73*2 %endif .loop_x: pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set pshufb m3, m5, m2 ; set 15th bit for next 4 seeds psllq m2, m3, 30 por m2, m3 psllq m3, m2, 15 por m2, m3 ; aggregate each bit into next seed's high bit pmulhuw m3, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 %if ARCH_X86_64 vpgatherdw m3, m2, r6, r9, r10, 4, 2 %else vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 %endif paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 ; shifts by 0, which pmulhrsw does not support pmulhrsw m3, m6 movq [bufq+xq*2], m3 add xq, 4 jl .loop_x %if %2 add bufq, 82*2 dec hd jg .loop_y %endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] jmp r5 .ar0: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift ALLOC_STACK -16*2 mov bufyq, r1m mov uvd, r3m %endif imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd 
m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] SPLATW m3, [base+hmul_bits+shiftq*2-10] %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m1, bdmaxd ; max_gain %else SPLATW m1, r4m psraw m1, 1 %endif pcmpeqw m7, m7 pxor m7, m1 ; min_grain %if ARCH_X86_64 SWAP 1, 14 DEFINE_ARGS buf, bufy, h, x %else %define m14 [rsp+0*16] mova m14, m1 DEFINE_ARGS buf, bufy, pic_reg, h, x %endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 %if %2 SPLATW m6, [base+hmul_bits+2+%3*2] %endif SPLATW m4, m4 pxor m5, m5 %if %2 %if !cpuflag(sse4) pcmpeqw m2, m2 pslldq m2, 12 %if ARCH_X86_64 SWAP 2, 12 %else %define m12 [rsp+1*16] mova m12, m2 %endif %endif %endif %if %2 sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) %else sub bufq, 2*(82*70-3) %endif add bufyq, 2*(3+82*3) mov hd, 70-35*%3 .y_loop_ar0: ; first 32 pixels xor xd, xd .x_loop_ar0: movu m0, [bufyq+xq*(2<<%2)] %if %2 %if %3 movu m2, [bufyq+xq*4+82*2] paddw m0, m2 %endif movu m1, [bufyq+xq*4 +16] %if %3 movu m2, [bufyq+xq*4+82*2+16] paddw m1, m2 %endif phaddw m0, m1 pmulhrsw m0, m6 %endif punpckhwd m1, m0, m5 punpcklwd m0, m5 REPX {pmaddwd x, m4}, m0, m1 REPX {psrad x, 5}, m0, m1 packssdw m0, m1 pmulhrsw m0, m3 movu m1, [bufq+xq*2] paddw m0, m1 pminsw m0, m14 pmaxsw m0, m7 cmp xd, 72-40*%2 je .end movu [bufq+xq*2], m0 add xd, 8 jmp .x_loop_ar0 ; last 6/4 pixels .end: %if %2 %if cpuflag(sse4) pblendw m0, m1, 11000000b %else pand m1, m12 pandn m2, m12, m0 por m0, m1, m2 %endif movu [bufq+xq*2], m0 %else movq [bufq+xq*2], m0 %endif add bufq, 82*2 add bufyq, 82*(2<<%3) dec hd jg .y_loop_ar0 %if ARCH_X86_32 %undef m12 %undef m14 %endif RET .ar1: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x %else RESET_STACK_STATE DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 mov bufyq, r1m mov uvd, r3m %endif imul uvd, 28 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] %if WIN64 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 %if %2 lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] 
%else lea bufq, [r0-2*(82*69+3)] %endif %else %if ARCH_X86_64 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 %else DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 %define hd dword r1m %define mind dword r3m %define maxd dword r4m %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif %endif %if ARCH_X86_64 mov shiftd, [r2+FGData.ar_coeff_shift] %else mov shiftd, [r3+FGData.ar_coeff_shift] %endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 ; cf0-4 in words pshuflw m4, m4, q2100 psrldq m4, 2 ; cf0-3,4 in words pshufd m5, m4, q1111 pshufd m4, m4, q0000 movd m3, [base+round_vals+shiftq*2-12] ; rnd pxor m6, m6 punpcklwd m3, m6 %if %2 SPLATW m6, [base+hmul_bits+2+%3*2] %endif SPLATD m3, m3 add bufyq, 2*(79+82*3) mov hd, 70-35*%3 sar maxd, 1 %if ARCH_X86_64 mov mind, maxd xor mind, -1 %else DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 mov r2, maxd xor r2, -1 mov mind, r2 %endif .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu m0, [bufq+xq*2-82*2-2] ; top/left %if %2 movu m7, [bufyq+xq*4] %if %3 movu m1, [bufyq+xq*4+82*2] phaddw m7, m1 %else phaddw m7, m7 %endif %else movq m7, [bufyq+xq*2] %endif psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 %if %2 %if %3 pshufd m2, m7, q3232 paddw m7, m2 %endif pmulhrsw m7, m6 %endif punpcklwd m1, m7 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 paddd m0, m3 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 imul val3d, cf3d add val3d, val0d sar val3d, shiftb movsx val0d, word [bufq+xq*2] add val3d, val0d cmp val3d, maxd cmovg val3d, maxd cmp val3d, mind cmovl val3d, mind mov word [bufq+xq*2], val3w ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 %if ARCH_X86_32 %undef maxd %undef mind %undef hd %endif RET .ar2: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift 
%else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift ALLOC_STACK -16*8 mov bufyq, r1m mov uvd, r3m %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m5, bdmaxd ; max_grain %else SPLATW m5, r4m psraw m5, 1 %endif pcmpeqw m6, m6 %if !cpuflag(sse4) pcmpeqw m7, m7 psrldq m7, 14 pslldq m7, 2 pxor m7, m6 %endif pxor m6, m5 ; min_grain %if %2 && cpuflag(sse4) SPLATW m7, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 SWAP 5, 13 SWAP 6, 14 SWAP 7, 15 %else %define m13 [rsp+5*16] %define m14 [rsp+6*16] %define m15 [rsp+7*16] mova m13, m5 mova m14, m6 mova m15, m7 %endif ; coef values movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 pinsrw m2, [base+round_vals-12+shiftq*2], 5 pshufd m6, m0, q0000 pshufd m7, m0, q1111 pshufd m1, m0, q3333 pshufd m0, m0, q2222 pshufd m3, m2, q1111 pshufd m4, m2, q2222 pshufd m2, m2, q0000 %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 2, 10 SWAP 3, 11 SWAP 4, 12 %else %define m8 [rsp+0*16] %define m9 [rsp+1*16] %define m10 [rsp+2*16] %define m11 [rsp+3*16] %define m12 [rsp+4*16] mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 mova m12, m4 %endif %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, h, x %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] psrldq m4, m0, 2 ; y=-2,x=[-1,+5] psrldq m1, m0, 4 ; y=-2,x=[-0,+5] psrldq m3, m0, 6 ; y=-2,x=[+1,+5] psrldq m2, m0, 8 ; y=-2,x=[+2,+5] punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] pmaddwd m0, m6 pmaddwd m1, m7 pmaddwd m2, m8 paddd m0, m1 paddd m0, m2 psrldq m3, m5, 2 ; y=-1,x=[-1,+5] psrldq m1, m5, 4 ; y=-1,x=[-0,+5] 
psrldq m4, m5, 6 ; y=-1,x=[+1,+5] psrldq m2, m5, 8 ; y=-1,x=[+2,+5] punpcklwd m3, m1 punpcklwd m4, m2 pmaddwd m3, m9 pmaddwd m4, m10 paddd m3, m4 paddd m0, m3 ; luma component & rounding %if %2 movu m1, [bufyq+xq*4] %if %3 movu m2, [bufyq+xq*4+82*2] phaddw m1, m2 pshufd m2, m1, q3232 paddw m1, m2 %else phaddw m1, m1 %endif %if cpuflag(sse4) pmulhrsw m1, m15 %elif %3 pmulhrsw m1, [base+pw_8192] %else pmulhrsw m1, [base+pw_16384] %endif %else movq m1, [bufyq+xq*2] %endif punpcklwd m1, [base+pw_1] pmaddwd m1, m12 paddd m0, m1 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] pshufd m2, m1, q3321 pxor m3, m3 pcmpgtw m3, m2 punpcklwd m2, m3 ; y=0,x=[0,3] in dword .x_loop_ar2_inner: pmaddwd m3, m1, m11 paddd m3, m0 psrldq m0, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; we do not need to packssdw since we only care about one value paddd m3, m2 packssdw m3, m3 pminsw m3, m13 pmaxsw m3, m14 psrldq m1, 2 pslldq m3, 2 psrldq m2, 4 %if cpuflag(sse4) pblendw m1, m3, 00000010b %else pand m1, m15 pandn m4, m15, m3 por m1, m4 %endif ; overwrite previous pixel, should be ok movd [bufq+xq*2-2], m1 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 %if ARCH_X86_32 %undef m13 %undef m14 %undef m15 %endif RET .ar3: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %if WIN64 mov r6, rsp and rsp, ~15 sub rsp, 96 %define tmp rsp %else %define tmp rsp+stack_offset-120 %endif %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift ALLOC_STACK -16*14 mov bufyq, r1m mov uvd, r3m %define tmp rsp %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 SPLATW m4, [base+round_vals-12+shiftq*2] pxor m5, m5 pcmpgtw m5, m4 punpcklwd m4, m5 %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m6, bdmaxd ; max_grain %else SPLATW m6, r4m psraw m6, 1 %endif pcmpeqw m7, m7 %if !cpuflag(sse4) pcmpeqw m3, m3 psrldq m3, 14 pslldq m3, 4 pxor m3, m7 %endif pxor m7, m6 ; min_grain %if %2 
&& cpuflag(sse4) SPLATW m3, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 SWAP 3, 11 SWAP 4, 12 SWAP 6, 14 SWAP 7, 15 %else %define m11 [rsp+ 9*16] %define m12 [rsp+10*16] %define m14 [rsp+12*16] %define m15 [rsp+13*16] mova m11, m3 mova m12, m4 mova m14, m6 mova m15, m7 %endif ; cf from y=-3,x=-3 until y=-3,x=-2 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 pshufd m1, m0, q0000 pshufd m3, m0, q1111 pshufd m4, m0, q2222 pshufd m0, m0, q3333 pshufd m5, m2, q0000 pshufd m6, m2, q1111 mova [tmp+16*0], m1 mova [tmp+16*1], m3 mova [tmp+16*2], m4 mova [tmp+16*3], m0 mova [tmp+16*4], m5 mova [tmp+16*5], m6 pshufd m6, m2, q2222 pshufd m7, m2, q3333 ; cf from y=-1,x=-1 to y=0,x=-1 + luma component movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 ; luma punpcklbw m0, m1 pshufd m3, m0, q3232 psrldq m5, m0, 10 ; y=0,x=[-3 to -1] + "1.0" for current pixel pinsrw m5, [base+round_vals-10+shiftq*2], 3 ; y=-1,x=[-1 to +2] pshufd m1, m0, q0000 pshufd m0, m0, q1111 ; y=-1,x=+3 + luma punpcklwd m3, m2 pshufd m3, m3, q0000 %if ARCH_X86_64 SWAP 1, 8 SWAP 0, 9 SWAP 3, 10 SWAP 5, 13 DEFINE_ARGS buf, bufy, fg_data, h, x %else %define m8 [rsp+ 6*16] %define m9 [rsp+ 7*16] %define m10 [rsp+ 8*16] %define m13 [rsp+11*16] mova m8, m1 mova m9, m0 mova m10, m3 mova m13, m5 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: ; first line movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] pmaddwd m0, [tmp+0*16] pmaddwd m2, [tmp+1*16] 
pmaddwd m3, [tmp+2*16] paddd m0, m2 paddd m0, m3 ; first 6 x of top y ; second line [m0/1 are busy] movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] pmaddwd m1, [tmp+3*16] pmaddwd m4, [tmp+4*16] pmaddwd m3, [tmp+5*16] pmaddwd m5, m6 paddd m1, m4 paddd m3, m5 paddd m0, m1 paddd m0, m3 ; top 2 lines ; third line [m0 is busy] & luma + round movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] %if %2 movu m5, [bufyq+xq*4] %if %3 movu m4, [bufyq+xq*4+82*2] phaddw m5, m4 %else phaddw m5, m5 %endif %else movq m5, [bufyq+xq*2] %endif palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] %if %3 pshufd m4, m5, q3232 paddw m5, m4 %endif %if %2 %if cpuflag(sse4) pmulhrsw m5, m11 %elif %3 pmulhrsw m5, [base+pw_8192] %else pmulhrsw m5, [base+pw_16384] %endif %endif punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] punpcklwd m2, m5 pmaddwd m1, m7 pmaddwd m3, m8 pmaddwd m4, m9 pmaddwd m2, m10 paddd m1, m3 paddd m4, m2 paddd m0, m12 ; += round paddd m1, m4 paddd m0, m1 movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] .x_loop_ar3_inner: pmaddwd m2, m1, m13 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] packssdw m2, m2 pminsw m2, m14 pmaxsw m2, m15 pslldq m2, 4 psrldq m1, 2 %if cpuflag(sse4) pblendw m1, m2, 00000100b %else pand m1, m11 pandn m3, m11, m2 por m1, m3 %endif ; overwrite previous pixels, should be ok movq [bufq+xq*2-4], m1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner 
jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82*2
    add bufyq, 82*2<<%3
    dec hd
    jg .y_loop_ar3
%if WIN64
    mov rsp, r6
%elif ARCH_X86_32
%undef m8
%undef m9
%undef m10
%undef m11
%undef m12
%undef m13
%undef m14
%undef m15
%endif
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; SCRATCH src, dst, slot
; Makes the value currently held in m<src> reachable under the name m<dst>.
;   x86-32: stores m<src> to the stack at [rsp+slot*mmsize] and %defines
;           m<dst> as that memory operand, so later uses of m<dst> read it
;           back from the stack.
;   x86-64: no store; SWAP only renames the two registers.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP %1, %2
%endif
%endmacro

; fgy_32x32xn_16bpc (SSSE3)
; NOTE(review): presumably the luma ("y") film-grain application kernel for
; 16-bit pixels -- inferred from the name only, confirm against the C caller.
;
; Per pixel, as spelled out by the inline comments below:
;   grain = grain_lut[offy+y][offx+x]
;   noise = round2(scaling[src] * grain, scaling_shift)
;   dst   = clip_pixel(src, noise)
; Every pass of an inner .loop_y* loop handles one row of 16 pixels (two
; 16-byte loads/stores); the column counter w then advances by 16.
;
; Constants set up once before the loops (names after the SCRATCH calls):
;   m6  = pw_27_17_17_27, weights for the horizontal (left | cur) blend
;   m7  = weights for the vertical (top | cur) blend, reloaded from
;         pw_27_17_17_27 (+0 for the first overlap row, +4 for the second)
;   m9  = min_grain (bitwise NOT of max_grain)
;   m10 = r9m (bdmax) splatted to words; AND-ed onto every src load
;   m11 = mul_bits entry selected by FGData.scaling_shift
;   m12 = entry of the `max` table, upper bound of the final clip
;   m13 = entry of the `min` table, lower bound of the final clip
;   m14 = pd_16, rounding term added before the >>5 of the overlap blends
;   m15 = max_grain (bdmax >> 1)
;
; r8m is first read as sby and afterwards reused as a flag word:
;   low byte = FGData.overlap_flag, value 2 is OR-ed in when the vertical
;   overlap path is entered, and bit 2 (value 4) is toggled/set while
;   stepping through the 16-pixel columns.
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov r0, r0m
    mov r1, r2m
    mov r2, r4m
    mov r3, r6m
    mov r4, r7m
    mov r5, r8m

    ; from here on rNm names refer to the private copies inside our own frame
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]

    mov r0m, r0
    mov r2m, r1
    mov r4m, r2
    mov r6m, r3
    mov r7m, r4
    mov r8m, r5
%else
cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    mov srcq, srcm
    mov scalingq, r5m
    mov fg_dataq, r3m
%if STACK_ALIGNMENT < mmsize
    ; r9m is fetched from the caller's frame before it is redirected as well
    mov r6, r9m

%define r9m [rsp+8*mmsize+ 4*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]

    mov r9m, r6
%endif
    LEA r5, $$
%define base r5-$$
    ; keep the PIC base in r5m so it can be reloaded after r5 is clobbered
    mov r5m, picptrq
%else
cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea r8, [pb_mask]
%define base r8-pb_mask
%endif
    mov r6d, [fg_dataq+FGData.scaling_shift]
    SPLATW m3, [base+mul_bits+r6*2-14]
    mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
%if ARCH_X86_32
    DECLARE_REG_TMP 0, 3
%else
    DECLARE_REG_TMP 9, 10
%endif
    mov t0d, r9m ; bdmax
    sar t0d, 11 ; is_12bpc
    inc t0d
    mov t1d, r6d
    imul t1d, t0d
    dec t0d
    ; index into `min` is clip_to_restricted_range * (is_12bpc + 1)
    SPLATW m5, [base+min+t1*2]
    lea t0d, [t0d*3]
    lea t0d, [r6d*2+t0d]
    ; index into `max` is is_12bpc * 3 + clip_to_restricted_range * 2
    SPLATW m4, [base+max+t0*2]
    SPLATW m2, r9m

    pcmpeqw m1, m1
    psraw m7, m2, 1 ; max_grain
    pxor m1, m7 ; min_grain
    SPLATD m6, [base+pd_16]

    SCRATCH 1, 9, 0
    SCRATCH 2, 10, 1
    SCRATCH 3, 11, 2
    SCRATCH 4, 12, 3
    SCRATCH 5, 13, 4
    SCRATCH 6, 14, 5
    SCRATCH 7, 15, 6

    mova m6, [base+pw_27_17_17_27] ; for horizontal filter

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
    DECLARE_REG_TMP 0
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
    DECLARE_REG_TMP 7
%endif

    mov sbyd, r8m
    movzx t0d, byte [fg_dataq+FGData.overlap_flag]
    ; vertical overlap is only used when overlap_flag != 0 and sby != 0
    test t0d, t0d
    jz .no_vertical_overlap
    test sbyd, sbyd
    jnz .vertical_overlap
.no_vertical_overlap:
    ; r8m now carries the flag word instead of sby
    mov dword r8m, t0d

    ; derive the 16-bit starting seed from sby and FGData.seed
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul seed, (173 << 24) | 37
%else
    imul seed, sbyd, (173 << 24) | 37
%endif
    add seed, (105 << 24) | 178
    rol seed, 8
    movzx seed, seew
    xor seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov r3m, seed
    mov wq, r4m
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    ; src_bak = src + w*2 (end of row, saved in r9m); w becomes a negative
    ; counter running up to 0, and dstm is turned into the dst-src delta
    lea src_bakq, [srcq+wq*2]
    mov r9mp, src_bakq
    neg wq
    sub dstmp, srcq
%if ARCH_X86_32
    mov r4m, wq
%endif

.loop_x:
%if ARCH_X86_32
    mov seed, r3m
%endif
    ; NOTE(review): looks like one step of a 16-bit LFSR whose feedback bit is
    ; the parity flag of (low byte & high byte) of seed|0xEFF4 -- confirm
    ; against the C reference
    mov r6d, seed
    or seed, 0xEFF4
    shr r6d, 1
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

%if ARCH_X86_32
    mov r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak

    mov offyd, seed
    mov offxd, seed
%endif
    ; offy = seed bits 8-11, offx = seed bits 12-15
    ror offyd, 8
    shr offxd, 12
    and offyd, 0xf
    ; 164 = 2*82 and 747 = 9*82+9; 82 is the grain_lut row length in entries
    ; (rows are advanced with `add grain_lutq, 82*2` below)
    imul offyd, 164
    lea offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak
%endif

.loop_x_odd:
    movzx hd, word r7m
    mov grain_lutq, grain_lutmp
.loop_y:
    ; src
    pand m0, m10, [srcq+ 0]
    pand m1, m10, [srcq+16] ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4
    vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4
    vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4
%endif
    REPX {psrlw x, 8}, m2, m3

    ; grain = grain_lut[offy+y][offx+x]
    movu m4, [grain_lutq+offxyq*2]
    movu m5, [grain_lutq+offxyq*2+16]

    ; noise = round2(scaling[src] * grain, scaling_shift)
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw m4, m2
    pmulhrsw m5, m3

    ; dst = clip_pixel(src, noise)
    paddw m0, m4
    paddw m1, m5
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    ; dstq holds (dst - src), so dstq+srcq addresses the output row
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add srcq, r2mp ; src += stride
    add grain_lutq, 82*2
    dec hd
    jg .loop_y

    ; advance to the next 16-pixel column; w is negative until the row is done
%if ARCH_X86_32
    add r4mp, 16
%else
    add wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov srcq, r9mp
    add srcq, r4mp
    add srcq, r4mp
%else
    mov src_bakq, r9mp
    lea srcq, [src_bakq+wq*2]
%endif
    ; toggle bit 2 of the flag word; CF receives its previous value, so every
    ; second column goes to .next_blk, the other one reuses offxy+16
    btc dword r8m, 2
    jc .next_blk
    add offxyd, 16
    test dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add r12d, 16 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.next_blk:
    ; bit 0 clear (overlap_flag byte was 0): plain path with a fresh seed step
    test dword r8m, 1
    jz .loop_x

    ; r8m = sbym
    test dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; stack slot 0 keeps left_offxy (previous column's offxy + 16)
    add offxyd, 16
    mov [rsp+8*mmsize+0*gprsize], offxyd
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
    mov seed, r3m
%endif

    ; same seed update as in .loop_x
    mov r6d, seed
    or seed, 0xEFF4
    shr r6d, 1
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

%if ARCH_X86_32
    mov r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy

    lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx

    mov offyd, seed
    mov offxd, seed
%endif
    ror offyd, 8
    shr offxd, 12
    and offyd, 0xf
    imul offyd, 164
    lea offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy
%endif

    mov hd, dword r7m
    mov grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov r5, [rsp+8*mmsize+0*gprsize]
    movd m4, [grain_lutq+r5*2]
%else
    movd m4, [grain_lutq+left_offxyq*2]
%endif
    ; blend the first two grain values with the previous column's grain:
    ; interleave {left, cur}, weight with m6, round (+16, >>5), clamp to
    ; [min_grain, max_grain], then merge back into the low lanes of the row
    punpcklwd m4, m5
    pmaddwd m4, m6
    paddd m4, m14
    psrad m4, 5
    packssdw m4, m4
    pminsw m4, m15
    pmaxsw m4, m9
    shufps m4, m5, q3210

    ; src
    pand m0, m10, [srcq+ 0]
    pand m1, m10, [srcq+16] ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5
    vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5
    vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5
%endif
    REPX {psrlw x, 8}, m2, m3

    ; noise = round2(scaling[src] * grain, scaling_shift)
    ; (m5 was used as gather scratch above, so the second half is loaded here)
    movu m5, [grain_lutq+offxyq*2+16]
    REPX {pmullw x, m11}, m2, m3
    pmulhrsw m4, m2
    pmulhrsw m5, m3

    ; dst = clip_pixel(src, noise)
    paddw m0, m4
    paddw m1, m5
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add srcq, r2mp
    add grain_lutq, 82*2
    dec hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add r4mp, 16
%else
    add wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov srcq, r9mp
    add srcq, r4mp
    add srcq, r4mp
%else
    mov src_bakq, r9mp
    lea srcq, [src_bakq+wq*2]
%endif
    ; set bit 2 so that the next column end (btc) leads to .next_blk
    or dword r8m, 4
    add offxyd, 16

    ; r8m = sbym
    test dword r8m, 2
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add r12d, 16 ; top_offxy += 16
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

.vertical_overlap:
    ; record "vertical overlap active" (value 2) in the flag word
    or t0d, 2
    mov r8m, t0d

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
                sby, see
%endif

    ; build two 16-bit seeds at once, packed into one 32-bit register
    movzx sbyd, sbyb
%if ARCH_X86_32
    imul r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused
%else
    imul seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul t0d, sbyd, 173 * 0x00010001
    imul sbyd, 37 * 0x01000100
    add t0d, (105 << 16) | 188
    add sbyd, (178 << 24) | (141 << 8)
    and t0d, 0x00ff00ff
    and sbyd, 0xff00ff00
    xor seed, t0d
%if ARCH_X86_32
    xor sbyd, seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov r3m, seed
    mov wq, r4m
%else
    xor seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                unused1, unused2, see, src_bak
%endif

    lea src_bakq, [srcq+wq*2]
    mov r9mp, src_bakq
    neg wq
    sub dstmp, srcq
%if ARCH_X86_32
    mov r4m, wq
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov r5, r5m
    SPLATD m7, [base+pw_27_17_17_27]
    mov seed, r3m
%else
    SPLATD m7, [pw_27_17_17_27]
%endif

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    ; both packed seeds are stepped together: the parity of each half is
    ; collected in t0d, combined with the old value, then rotated right by 1
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp t0b ; parity of top_seed
    shr seed, 16
    shl t0d, 16
    test seeb, seeh
    setp t0b ; parity of cur_seed
    or r6d, 0x00010001
    xor t0d, r6d
    mov seed, t0d
    ror seed, 1 ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, unused, top_offxy

    mov offyd, seed
    mov offxd, seed
%endif
    ; same offset extraction as in .loop_x, applied to both 16-bit halves
    ror offyd, 8
    ror offxd, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, unused, top_offxy
%endif

    ; split the packed value: low half -> top_offxy, high half -> offxy
    movzx top_offxyd, offxyw
%if ARCH_X86_32
    ; stack slot 1 keeps top_offxy
    mov [rsp+8*mmsize+1*gprsize], top_offxyd
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr offxyd, 16

.loop_x_odd_v_overlap:
%if ARCH_X86_32
    mov r5, r5m
%endif
    ; weights for the first overlapped row
    SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
    mov hd, dword r7m
    mov grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu m3, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov r5, [rsp+8*mmsize+1*gprsize]
    movu m2, [grain_lutq+r5*2]
%else
    movu m2, [grain_lutq+top_offxyq*2]
%endif
    ; blend {top, cur} grain for pixels 0-7: weight with m7, round (+16, >>5),
    ; clamp to [min_grain, max_grain]
    punpckhwd m4, m2, m3
    punpcklwd m2, m3
    REPX {pmaddwd x, m7}, m4, m2
    REPX {paddd x, m14}, m4, m2
    REPX {psrad x, 5}, m4, m2
    packssdw m2, m4
    pminsw m2, m15
    pmaxsw m2, m9
    ; same blend for pixels 8-15
    movu m4, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu m3, [grain_lutq+r5*2+16]
%else
    movu m3, [grain_lutq+top_offxyq*2+16]
%endif
    punpckhwd m5, m3, m4
    punpcklwd m3, m4
    REPX {pmaddwd x, m7}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw m3, m5
    pminsw m3, m15
    pmaxsw m3, m9

    ; src
    pand m0, m10, [srcq+ 0] ; m0-1: src as word
    pand m1, m10, [srcq+16] ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5
%endif
    psrlw m4, 8
    pmullw m4, m11
    pmulhrsw m4, m2
%if ARCH_X86_32
    vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2
%else
    vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2
%endif
    psrlw m5, 8
    pmullw m5, m11
    pmulhrsw m5, m3

    ; dst = clip_pixel(src, noise)
    paddw m0, m4
    paddw m1, m5
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add srcq, r2mp
    add grain_lutq, 82*2
    ; only the low word of h is the row counter; bit 16 is used as a toggle
    dec hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov r5, r5m
%endif
    ; weights for the second overlapped row
    SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 is set after the first row (loop once more) and cleared again
    ; after the second row (continue in .loop_y with a clean counter)
    xor hd, 0x10000
    test hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add r4mp, 16
%else
    add wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov srcq, r9mp
    add srcq, r4mp
    add srcq, r4mp
%else
    mov src_bakq, r9mp
    lea srcq, [src_bakq+wq*2]
%endif
    btc dword r8m, 2
    jc .next_blk_v
%if ARCH_X86_32
    add dword [rsp+8*mmsize+1*gprsize], 16
%else
    add top_offxyd, 16
%endif
    add offxyd, 16
    jmp .loop_x_odd_v_overlap

.next_blk_v:
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    ; stack slots: 0 = left_offxy, 1 = top_offxy, 2 = topleft_offxy
    mov r0, [rsp+8*mmsize+1*gprsize]
    add r3, 16
    add r0, 16
    mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
    mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy

    mov seed, r3m
    xor r0, r0
%else
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
    ; same packed two-seed update as in .loop_x_v_overlap
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp t0b ; parity of top_seed
    shr seed, 16
    shl t0d, 16
    test seeb, seeh
    setp t0b ; parity of cur_seed
    or r6d, 0x00010001
    xor t0d, r6d
    mov seed, t0d
    ror seed, 1 ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy

    ; derive the neighbours' offsets from the previous column (+16 entries)
    ; before offy/top_offxy are overwritten
    lea topleft_offxyq, [top_offxyq+16]
    lea left_offxyq, [offyq+16]

    mov offyd, seed
    mov offxd, seed
%endif
    ror offyd, 8
    ror offxd, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy
%endif

    movzx top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr offxyd, 16

%if ARCH_X86_32
    mov r5, r5m
%endif
    SPLATD m7, [PIC_ptr(pw_27_17_17_27)]

    movzx hd, word r7m
    mov grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
    mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
    mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
    movu m4, [grain_lutq+r0*2]
    movd m5, [grain_lutq+r5*2]
    mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    movd m3, [grain_lutq+r5*2]
%else
    movu m4, [grain_lutq+top_offxyq*2]
    movd m5, [grain_lutq+left_offxyq*2]
    movd m3, [grain_lutq+topleft_offxyq*2]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklwd m5, m2
    punpcklwd m3, m4
    REPX {pmaddwd x, m6}, m5, m3
    REPX {paddd x, m14}, m5, m3
    REPX {psrad x, 5}, m5, m3
    packssdw m5, m3
    pminsw m5, m15
    pmaxsw m5, m9
    shufps m3, m5, m2, q3210
    shufps m5, m4, q3232
    ; followed by v interpolation (top | cur -> cur)
    movu m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
    movu m1, [grain_lutq+r0*2+16]
%else
    movu m1, [grain_lutq+top_offxyq*2+16]
%endif
    punpcklwd m2, m5, m3
    punpckhwd m5, m3
    punpcklwd m3, m1, m0
    punpckhwd m1, m0
    REPX {pmaddwd x, m7}, m2, m5, m3, m1
    REPX {paddd x, m14}, m2, m5, m3, m1
    REPX {psrad x, 5}, m2, m5, m3, m1
    packssdw m2, m5
    packssdw m3, m1
    REPX {pminsw x, m15}, m2, m3
    REPX {pmaxsw x, m9}, m2, m3

    ; src
    pand m0, m10, [srcq+ 0]
    pand m1, m10, [srcq+16] ; m0-1: src as word

    ; scaling[src]
    ; noise = round2(scaling[src] * grain, scaling_shift)
%if ARCH_X86_32
    vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5
%else
    vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5
%endif
    psrlw m4, 8
    pmullw m4, m11
    pmulhrsw m2, m4
%if ARCH_X86_32
    vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4
%else
    vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4
%endif
    psrlw m5, 8
    pmullw m5, m11
    pmulhrsw m3, m5

    ; dst = clip_pixel(src, noise)
    paddw m0, m2
    paddw m1, m3
    pmaxsw m0, m13
    pmaxsw m1, m13
    pminsw m0, m12
    pminsw m1, m12
    movifnidn dstq, dstmp
    mova [dstq+srcq+ 0], m0
    mova [dstq+srcq+16], m1

    add srcq, r2mp
    add grain_lutq, 82*2
    dec hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
%if ARCH_X86_32
    mov r5, r5m
%endif
    SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
    ; bit 16 of h toggles between the two overlapped rows, see .loop_y_v_overlap
    xor hd, 0x10000
    test hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    or dword r8m, 4
%if ARCH_X86_32
    add r4mp, 16
%else
    add wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov r5, r5m
    add offxyd, 16
    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
    mov srcq, r9mp
    add srcq, r4mp
    add srcq, r4mp
%else
    add offxyd, 16
    add top_offxyd, 16
    mov src_bakq, r9mp
    lea srcq, [src_bakq+wq*2]
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET
%if ARCH_X86_32
    DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
%endif

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov r0, r0m
    mov r1, r1m
    mov r2, r2m
    mov r4, r3m
    mov r3, r4m
    mov r5, r5m
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
    mov r0m, r0
    mov r2m, r2
    mov r4m, r3
    mov r5m, r5

    mov r0, r6m
    mov r2, r7m
    mov r3, r8m
    mov r5, r9m
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
    mov r6m, r0
    mov r7m, r2
    mov r8m, r3
    mov r9m, r5

    mov r2, r10m
    mov r3, r11m
    mov r5, r12m
    mov r0, r13m
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
    mov r10m, r2
    mov r11m, r3
    mov r12m, r5

    SPLATW m2, r13m
%else
cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov srcq, srcm
    mov fg_dataq, r3m
%endif
    LEA r5, $$
%define base r5-$$

    DECLARE_REG_TMP 0, 2, 3
%else
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
        grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r8-pb_mask
    lea r8, [pb_mask]

    DECLARE_REG_TMP
9, 10, 11 %endif mov r6d, [fg_dataq+FGData.scaling_shift] SPLATW m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] %if STACK_ALIGNMENT >= mmsize mov t0d, r13m ; bdmax %endif sar t0d, 11 ; is_12bpc inc t0d mov t1d, r6d imul t1d, t0d dec t0d SPLATW m5, [base+min+t1*2] lea t1d, [t0d*3] mov t2d, r12m inc t2d imul r6d, t2d add t1d, r6d SPLATW m4, [base+max+t1*2] %if STACK_ALIGNMENT >= mmsize SPLATW m2, r13m %endif SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 %define mzero m7 %if %3 SPLATD m2, [base+pw_23_22] %endif %if ARCH_X86_32 mov scalingq, r5m mov r5m, r5 %else mov r13mp, strideq %endif pcmpeqw m0, m0 psraw m1, m10, 1 pxor m0, m1 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap DECLARE_REG_TMP 0 %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap DECLARE_REG_TMP 9 %endif %if %1 mov r6d, r11m SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] punpcklwd m6, m1, m0 SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] SPLATD m7, [base+pw_4+t0*4] pmullw m5, m7 %else SPLATD m6, [base+pd_16] %if %2 mova m5, [base+pw_23_22] %else mova m5, [base+pw_27_17_17_27] %endif %endif SCRATCH 6, 14, 6 SCRATCH 5, 15, 7 %if ARCH_X86_32 DECLARE_REG_TMP 0 %else DECLARE_REG_TMP 7 %endif mov sbyd, r8m mov t0d, [fg_dataq+FGData.overlap_flag] test t0d, t0d jz %%no_vertical_overlap test sbyd, sbyd jnz %%vertical_overlap %%no_vertical_overlap: mov r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, see, w, picptr, luma mov dstq, r0mp mov 
lumaq, r9mp mov wq, r4m lea r3, [srcq+wq*2] mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 %if %3 shl r10mp, 1 %endif %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride mov lstrideq, r10mp %if %3 add lstrideq, lstrideq %endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 %endif neg wq %if ARCH_X86_32 mov r4mp, wq %endif %%loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, unused2, unused3, luma, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride %endif %if %2 == 0 %%loop_x_odd: %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: ; src mova m0, [srcq] mova m1, [srcq+16] ; m0-1: src as word ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9m %endif mova m4, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif %if %2 pavgw m4, mzero pavgw m6, mzero %endif %if %1 punpckhwd m3, m4, m0 punpcklwd m4, m0 punpckhwd m5, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m3, m4, m5, m6 REPX {psrad x, 6}, m3, m4, m5, m6 packssdw m4, m3 packssdw m6, m5 REPX {paddw x, 
m15}, m4, m6 REPX {pmaxsw x, mzero}, m4, m6 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() %else REPX {pand x, m10}, m4, m6 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 %endif REPX {psrlw x, 8}, m3, m5 ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2] movu m6, [grain_lutq+offxyq*2+16] ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m3, m5 pmulhrsw m4, m3 pmulhrsw m6, m5 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd jg %%loop_y %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma mov wq, r4mp %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0m, dstq mov r9m, lumaq mov r4m, wq %endif %if %2 == 0 btc dword r8m, 2 jc %%next_blk add offxyd, 16 test dword r8m, 2 jz %%loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 %endif jmp %%loop_x_odd_v_overlap %%next_blk: %endif test dword r8m, 1 je %%loop_x ; r8m = sbym test dword r8m, 2 jnz %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: %if ARCH_X86_32 add offxyd, 16 mov [rsp+8*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, 
offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx mov offxd, seed mov offyd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_h_overlap: mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9m %endif mova m4, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif %if %2 pavgw m4, mzero pavgw m6, mzero %endif %if %1 punpckhwd m3, m4, m0 punpcklwd m4, m0 punpckhwd m5, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m3, m4, m5, m6 REPX {psrad x, 6}, m3, m4, m5, m6 packssdw m4, m3 packssdw m6, m5 REPX {paddw x, m15}, m4, m6 REPX {pmaxsw x, mzero}, m4, m6 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() %else REPX {pand x, m10}, m4, m6 %endif ; grain = grain_lut[offy+y][offx+x] movu m7, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] movd m5, [grain_lutq+r5*2] %else movd m5, [grain_lutq+left_offxyq*2+ 0] %endif punpcklwd m5, m7 ; {left0, cur0} %if %1 %if ARCH_X86_32 mov r5, r5m %endif %if %2 pmaddwd m5, [PIC_ptr(pw_23_22)] %else pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] %endif paddd m5, [PIC_ptr(pd_16)] %else pmaddwd m5, m15 paddd m5, m14 %endif psrad m5, 5 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 shufps m5, m7, q3210 movu m3, [grain_lutq+offxyq*2+16] ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 
vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 %endif REPX {psrlw x, 8}, m7, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m4 pmulhrsw m5, m7 pmulhrsw m3, m4 ; dst = clip_pixel(src, noise) paddw m0, m5 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd jg %%loop_y_h_overlap %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4mp %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 ; r8m = sbym test dword r8m, 2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %else or dword r8m, 4 add offxyd, 16 ; r8m = sbym test dword r8m, 2 jz %%loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxy += 16 %endif jmp %%loop_x_odd_v_overlap %endif %%end: RET %%vertical_overlap: or t0d, 2 mov r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, unused1, unused2, unused3, lstride %endif movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul t0d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add t0d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and t0d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, t0d %if ARCH_X86_32 xor sbyd, seed DEFINE_ARGS dst, src, scaling, see, w, 
picptr, luma mov r3m, seed mov dstq, r0mp mov lumaq, r9mp mov wq, r4m lea r3, [srcq+wq*2] mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 %if %3 shl r10mp, 1 %endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, unused3, unused4, unused5, luma, lstride mov lstrideq, r10mp %if %3 add lstrideq, lstrideq %endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 %endif neg wq %if ARCH_X86_32 mov r4m, wq %endif %%loop_x_v_overlap: %if ARCH_X86_32 mov seed, r3m xor t0d, t0d %else ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, top_offxy, unused2, luma, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 %if %2 == 0 %%loop_x_odd_v_overlap: %endif %if %3 == 0 %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, 
[PIC_ptr(pw_27_17_17_27)] %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy movu m5, [grain_lutq+r0*2] %else movu m5, [grain_lutq+top_offxyq*2] %endif punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 %if ARCH_X86_32 mov r5, r5m %endif REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 packssdw m3, m5, m7 pmaxsw m3, m8 pminsw m3, m9 ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 movu m5, [grain_lutq+r0*2+16] %else movu m5, [grain_lutq+top_offxyq*2+16] %endif punpckhwd m7, m5, m4 punpcklwd m5, m4 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 packssdw m4, m5, m7 pmaxsw m4, m8 pminsw m4, m9 ; src mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9mp %endif mova m5, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m5, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif %if %2 pavgw m5, mzero pavgw m6, mzero %endif %if %1 punpckhwd m7, m5, m0 punpcklwd m5, m0 REPX {pmaddwd x, m14}, m7, m5 REPX {psrad x, 6}, m7, m5 packssdw m5, m7 punpckhwd m7, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m7, m6 REPX {psrad x, 6}, m7, m6 packssdw m6, m7 pxor mzero, mzero REPX {paddw x, m15}, m5, m6 REPX {pmaxsw x, mzero}, m5, m6 REPX {pminsw x, m10}, m5, m6 ; clip_pixel() %else REPX {pand x, m10}, m5, m6 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, 
r12, 8, 1 %endif REPX {psrlw x, 8}, m7, m5 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m5 pmulhrsw m3, m7 pmulhrsw m4, m5 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 dec hw jle %%end_y_v_overlap %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 %if %3 jmp %%loop_y %else btc hd, 16 jc %%loop_y %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] jmp %%loop_y_v_overlap %endif %%end_y_v_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %else btc dword r8m, 2 jc %%loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 %endif jmp %%loop_x_odd_v_overlap %endif %%loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy add offxyd, 16 add t0d, 16 mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut mov seed, r3m xor t0d, t0d %else ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov seed, t0d ror 
seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 %if %3 == 0 %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)] %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy movd m5, [grain_lutq+r5*2] %else movd m5, [grain_lutq+left_offxyq*2] %endif movu m7, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+2*gprsize] movu m4, [grain_lutq+r0*2] %if %2 pinsrw m5, [grain_lutq+r5*2], 2 %else movd m3, [grain_lutq+r5*2] %endif %else movu m4, [grain_lutq+top_offxyq*2] %if %2 pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } %else movd m3, [grain_lutq+topleft_offxyq*2] %endif %endif %if %2 == 0 punpckldq m5, m3 %endif punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } %if %1 %if ARCH_X86_32 mov r5, r5m %endif %if %2 movddup m0, [PIC_ptr(pw_23_22)] %else movddup m0, [PIC_ptr(pw_27_17_17_27)] 
%endif %else pshufd m0, m15, q1010 %endif pmaddwd m5, m0 %if %1 paddd m5, [PIC_ptr(pd_16)] %else paddd m5, m14 %endif psrad m5, 5 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter shufps m5, m4, q3231 ; top0-7 post-h_filter punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 %else REPX {paddd x, m14}, m5, m7 %endif REPX {psrad x, 5}, m5, m7 packssdw m3, m5, m7 pmaxsw m3, m8 pminsw m3, m9 ; right half movu m4, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 movu m0, [grain_lutq+r0*2+16] %else movu m0, [grain_lutq+top_offxyq*2+16] %endif punpckhwd m1, m0, m4 punpcklwd m0, m4 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m1, m0 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 %else REPX {paddd x, m14}, m1, m0 %endif REPX {psrad x, 5}, m1, m0 packssdw m4, m0, m1 pmaxsw m4, m8 pminsw m4, m9 ; src mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9mp %endif mova m6, [lumaq+ 0] mova m5, [lumaq+(16<<%2)] %if %2 phaddw m6, [lumaq+16] phaddw m5, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif %if %2 pavgw m6, mzero pavgw m5, mzero %endif %if %1 punpckhwd m7, m6, m0 punpcklwd m6, m0 REPX {pmaddwd x, m14}, m7, m6 REPX {psrad x, 6}, m7, m6 packssdw m6, m7 punpckhwd m7, m5, m1 punpcklwd m5, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m7, m5 REPX {psrad x, 6}, m7, m5 packssdw m5, m7 pxor mzero, mzero REPX {paddw x, m15}, m6, m5 REPX {pmaxsw x, mzero}, m6, m5 REPX {pminsw x, m10}, m6, m5 ; clip_pixel() %else REPX {pand x, m10}, m6, m5 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 %else %if %3 == 0 ; register shortage :) push r12 %endif vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 vpgatherdw m6, m5, 
scalingq-1, r2, r12, 8, 1 %if %3 == 0 pop r12 %endif %endif REPX {psrlw x, 8}, m7, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m6 pmulhrsw m3, m7 pmulhrsw m4, m6 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hw %if %3 jg %%loop_y_h_overlap %else jle %%end_y_hv_overlap btc hd, 16 jc %%loop_y_h_overlap %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] jmp %%loop_y_hv_overlap %%end_y_hv_overlap: %endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov dstmp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 jmp %%loop_x_hv_overlap %else or dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxy += 16 %endif jmp %%loop_x_odd_v_overlap %endif %%end_hv: RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif %endmacro FGUV_FN 420, 1, 1 FGUV_FN 422, 1, 0 FGUV_FN 444, 0, 0 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain_avx2.asm000066400000000000000000001774201517466257200247050ustar00rootroot00000000000000; Copyright © 2019-2022, VideoLAN and dav2d authors ; Copyright © 2019-2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. 
Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
; x86-64 AVX2 film grain (8 bpc): shared constant tables, the per-lag jump
; tables, the luma grain generator and the head of the chroma grain generator
; macro.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 32
; pshufb lookup table used by the seed update in the grain generators
; (every entry is either 0 or 128)
pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
; pshufb masks operating on 8 words w0..w7:
;   gen_shufE interleaves the low and high halves: (w0,w4),(w1,w5),(w2,w6),(w3,w7)
;   gen_shufA..D build sliding word pairs (wN,wN+1),(wN+1,wN+2),... with
;   N = 0 (A), 1 (B), 2 (C) and 3 (D)
gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
; note: the order of (some of) the following constants matters
; - the 4-byte load at byte_blend+1 also picks up the first byte of
;   pb_27_17_17_27 (27, sign bit clear)
; - hmul_bits, round, mul_bits and round_vals are addressed as label+index*2
;   plus a constant byte offset, which (depending on the index) lands in the
;   neighbouring table
pb_27_17: times 2 db 27, 17
byte_blend: db 0, 0, 0, -1
pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
pb_17_27: times 2 db 17, 27
pb_1: times 4 db 1
pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32
next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor: times 2 dw 0xb524
 times 2 dw 0x49d8
fg_min: times 4 db 0
 times 4 db 16
fg_max: times 4 db 255
 times 4 db 240
 times 4 db 235
pd_m65536: dd -65536
pw_8: times 2 dw 8
pw_1024: times 2 dw 1024
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
pw_1: dw 1

; JMP_TABLE func, isa, lag...
; Emits the label <func>_8bpc_<isa>_table followed by one dword per remaining
; argument N, holding the offset of the function's local label .arN relative
; to the start of the table.
%macro JMP_TABLE 2-*
    %1_8bpc_%2_table:
    %xdefine %%base %1_8bpc_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    %rep %0 - 2
        dd %%prefix %+ .ar%3 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3

SECTION .text

INIT_YMM avx2
; generate_grain_y_8bpc(buf, fg_data)
;
; Step 1 fills 73 rows of 82 signed bytes at buf: a vector of 16-bit seeds is
; advanced four at a time, each seed >> 5 indexes a word in gaussian_sequence,
; and the result is scaled with pmulhrsw by round[grain_scale_shift] and
; packed to int8 with saturation.
; Step 2 jumps to .ar<ar_coeff_lag> through the jump table to run the
; auto-regression filter over rows 3..72, columns 3..78 of that buffer.
cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data
%define base r4-generate_grain_y_8bpc_avx2_table
    lea r4, [generate_grain_y_8bpc_avx2_table]
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    mov r6d, [fg_dataq+FGData.grain_scale_shift]
    movq xm1, [base+next_upperbit_mask]
    movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
    movq xm4, [base+mul_bits]
    movq xm5, [base+hmul_bits]
    ; r7 counts up from -73*82 to 0 while bufq is parked at the end of the buffer
    mov r7, -73*82
    mova xm6, [base+pb_mask]
    sub bufq, r7
    vpbroadcastw xm7, [base+round+r6*2]
    lea r6, [gaussian_sequence]
    ; r5 = jump table entry for the requested lag (offset relative to r4)
    movsxd r5, [r4+r5*4]
    ; each iteration runs two seed updates (4 seeds each) and stores 8 bytes;
    ; the table lookups for the first group are interleaved with the second
    ; seed update
    ; NOTE(review): 73*82 is not a multiple of 8, so the last iteration
    ; (r7 = -2) stores 6 bytes beyond buf+73*82 - presumably the buffer is
    ; padded; confirm against the caller
.loop:
    pand xm2, xm0, xm1
    psrlw xm3, xm2, 10
    por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw xm2, xm4 ; bits 0x0f00 are set
    pmulhuw xm0, xm5
    pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm2, xm0 ; aggregate each bit into next seed's high bit
    por xm3, xm2 ; 4 next output seeds
    pshuflw xm0, xm3, q3333
    psrlw xm3, 5
    pand xm2, xm0, xm1
    movq r2, xm3
    psrlw xm3, xm2, 10
    por xm2, xm3
    pmullw xm2, xm4
    pmulhuw xm0, xm5
    movzx r3d, r2w
    pshufb xm3, xm6, xm2
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm0, xm2
    movd xm2, [r6+r3*2]
    rorx r3, r2, 32
    por xm3, xm0
    shr r2d, 16
    pinsrw xm2, [r6+r2*2], 1
    pshuflw xm0, xm3, q3333
    movzx r2d, r3w
    psrlw xm3, 5
    pinsrw xm2, [r6+r2*2], 2
    shr r3d, 16
    movq r2, xm3
    pinsrw xm2, [r6+r3*2], 3
    movzx r3d, r2w
    pinsrw xm2, [r6+r3*2], 4
    rorx r3, r2, 32
    shr r2d, 16
    pinsrw xm2, [r6+r2*2], 5
    movzx r2d, r3w
    pinsrw xm2, [r6+r2*2], 6
    shr r3d, 16
    pinsrw xm2, [r6+r3*2], 7
    pmulhrsw xm2, xm7
    packsswb xm2, xm2
    movq [bufq+r7], xm2
    add r7, 8
    jl .loop

    ; auto-regression code
    add r5, r4
    jmp r5

; lag 1: buf[y][x] += (c0*top_left + c1*top + c2*top_right + c3*left + rnd) >> shift,
; clamped to [-128, 127]. The three top taps and rnd are computed for four
; pixels at once; the left tap is applied serially since it depends on the
; previous output.
.ar1:
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd xm5, [fg_dataq+FGData.ar_coeffs_y]
    mova xm2, [base+gen_shufC]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    ; replace c3 in the vector by 1 so that rnd can be added through pmaddwd
    pinsrb xm5, [base+pb_1], 3
    vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
    pmovsxbw xm5, xm5
    pshufd xm4, xm5, q0000 ; (c0, c1) pairs
    pshufd xm5, xm5, q1111 ; (c2, 1) pairs
    ; bufq was left at the end of the buffer; move it to row 3, column 79
    sub bufq, 82*73-(82*3+79)
    mov hd, 70
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -76
    movsx val3d, byte [bufq+xq-1] ; left neighbour of the first pixel
.x_loop_ar1:
    pmovsxbw xm1, [bufq+xq-82-3] ; y=-1,x=[-3,+4]
    pshufb xm0, xm1, xm2 ; (top_left, top) pairs
    punpckhwd xm1, xm3 ; (top_right, rnd) pairs
    pmaddwd xm0, xm4
    pmaddwd xm1, xm5
    paddd xm0, xm1
.x_loop_ar1_inner:
    movd val0d, xm0
    psrldq xm0, 4
    imul val3d, cf3d
    add val3d, val0d
    movsx val0d, byte [bufq+xq]
    sarx val3d, val3d, shiftd
    add val3d, val0d
    ; clamp to [mind, maxd]
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xb, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar1
; lag 0: nothing to filter
.ar0:
    RET

; lag 2: 12 coefficients, 5 taps on each of the two rows above plus the two
; pixels to the left. Coefficient pairs are broadcast so that one pmaddwd
; handles two taps for four pixels.
.ar2:
%if WIN64
    %assign stack_size_padded 168
    SUB rsp, stack_size_padded
    WIN64_PUSH_XMM 16, 8
%endif
    DEFINE_ARGS buf, fg_data, h, x
    mov r6d, [fg_dataq+FGData.ar_coeff_shift]
    pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
    movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
    ; the rounding word ends up in the low half of each dword after the
    ; psrld by 16 below
    vpbroadcastd xm10, [base+round_vals-14+r6*2]
    ; vpblendvb mask selecting byte 2 only (bytes 0, 0, -1, 27)
    movd xm11, [base+byte_blend+1]
    pmovsxbw xm9, xm9
    pshufd xm4, xm7, q0000 ; (cf0, cf1)
    mova xm12, [base+gen_shufA]
    pshufd xm5, xm7, q3333 ; (cf6, cf7)
    mova xm13, [base+gen_shufB]
    pshufd xm6, xm7, q1111 ; (cf2, cf3)
    mova xm14, [base+gen_shufC]
    pshufd xm7, xm7, q2222 ; (cf4, cf5)
    mova xm15, [base+gen_shufD]
    pshufd xm8, xm9, q0000 ; (cf8, cf9)
    psrld xm10, 16
    pshufd xm9, xm9, q1111 ; (cf10, cf11)
    sub bufq, 82*73-(82*3+79)
    mov hd, 70
.y_loop_ar2:
    mov xq, -76

.x_loop_ar2:
    pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
    pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
    pshufb xm2, xm0, xm12 ; y=-2: (x-2, x-1) pairs
    pmaddwd xm2, xm4
    pshufb xm3, xm1, xm13 ; y=-1: (x-1, x) pairs
    pmaddwd xm3, xm5
    paddd xm2, xm3
    pshufb xm3, xm0, xm14 ; y=-2: (x, x+1) pairs
    pmaddwd xm3, xm6
    punpckhqdq xm0, xm0
    punpcklwd xm0, xm1 ; (y=-2 x+2, y=-1 x-2) pairs
    pmaddwd xm0, xm7
    pshufb xm1, xm15 ; y=-1: (x+1, x+2) pairs
    pmaddwd xm1, xm8
    paddd xm2, xm10
    paddd xm2, xm3
    paddd xm0, xm1
    paddd xm2, xm0
    movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw xm1, xm0
    pmaddwd xm3, xm9, xm1
    psrldq xm1, 4 ; y=0,x=0
    paddd xm3, xm2
    psrldq xm2, 4 ; shift top to next pixel
    ; the shift count is the low 64 bits of the memory operand
    ; NOTE(review): presumably ar_coeff_shift is stored as a 64-bit field in
    ; FGData - confirm
    psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw xm3, xm1
    packsswb xm3, xm3
    pextrb [bufq+xq], xm3, 0
    ; feed the new pixel back into the current-row vector (byte 2 = x),
    ; then advance that vector by one pixel
    pslldq xm3, 2
    vpblendvb xm0, xm3, xm11
    psrldq xm0, 1
    inc xq
    jz .x_loop_ar2_end
    test xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
; lag 3: 24 coefficients, 7 taps on each of the three rows above plus the
; three pixels to the left. The broadcast coefficient pairs do not fit in
; registers and are kept on the stack.
.ar3:
%if WIN64
    ALLOC_STACK 16*14
    %assign stack_size stack_size - 16*4
    WIN64_PUSH_XMM 12, 8
%else
    ALLOC_STACK 16*12
%endif
    mov r6d, [fg_dataq+FGData.ar_coeff_shift]
    ; vpblendvb mask selecting byte 3 only
    movd xm11, [base+byte_blend]
    pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
    pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pshufd m0, m1, q0000
    mova [rsp+16* 0], m0
    pshufd m0, m1, q1111
    mova [rsp+16* 2], m0
    pshufd m0, m1, q2222
    mova [rsp+16* 4], m0
    pshufd m1, m1, q3333
    mova [rsp+16* 6], m1
    pshufd xm0, xm2, q0000
    mova [rsp+16* 8], xm0
    pshufd xm0, xm2, q1111
    mova [rsp+16* 9], xm0
    psrldq xm7, xm2, 10 ; cf21-23 in words 0-2 (current-row taps)
    mova m8, [base+gen_shufA]
    pinsrw xm2, [base+pw_1], 5 ; word 5 = 1, multiplies the rounding word
    mova m9, [base+gen_shufC]
    pshufd xm2, xm2, q2222 ; (cf20, 1) pairs
    movu m10, [base+gen_shufE]
    vpbroadcastw xm6, [base+round_vals-12+r6*2]
    ; word 3 of xm7 multiplies the current pixel itself
    ; NOTE(review): presumably equals 1 << ar_coeff_shift so the pixel
    ; survives the psrad below - confirm
    pinsrw xm7, [base+round_vals+r6*2-10], 3
    mova [rsp+16*10], xm2
    DEFINE_ARGS buf, fg_data, h, x
    sub bufq, 82*73-(82*3+79)
    mov hd, 70
.y_loop_ar3:
    mov xq, -76

.x_loop_ar3:
    movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
    vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12]
    movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
    ; sign-extend bytes to words (duplicate each byte, then psraw by 8)
    punpcklbw m3, m5, m5
    punpckhwd m5, m4
    psraw m3, 8
    punpcklbw m5, m5
    psraw m5, 8
    punpcklbw xm4, xm4
    psraw xm4, 8
    pshufb m0, m3, m8
    pmaddwd m0, [rsp+16*0]
    pshufb m1, m3, m9
    pmaddwd m1, [rsp+16*2]
    shufps m2, m3, m5, q1032
    paddd m0, m1
    pshufb m1, m2, m8
    vperm2i128 m3, m4, 0x21
    pmaddwd m1, [rsp+16*4]
    shufps xm2, xm3, q1021
    vpblendd m2, m3, 0xf0
    pshufb m2, m10
    paddd m0, m1
    pmaddwd m2, [rsp+16*6]
    pshufb xm1, xm4, xm9
    pmaddwd xm1, [rsp+16*8]
    shufps xm4, xm5, q1132
    paddd m0, m2
    pshufb xm2, xm4, xm8
    pshufd xm4, xm4, q2121
    pmaddwd xm2, [rsp+16*9]
    punpcklwd xm4, xm6
    pmaddwd xm4, [rsp+16*10]
    vextracti128 xm3, m0, 1
    paddd xm0, xm1
    movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
    paddd xm2, xm4
    paddd xm0, xm2
    paddd xm0, xm3
.x_loop_ar3_inner:
    pmovsxbw xm2, xm1
    pmaddwd xm2, xm7
    pshufd xm3, xm2, q1111
    paddd xm2, xm0 ; add top
    paddd xm2, xm3 ; left+cur
    psrldq xm0, 4
    ; shift count = low 64 bits of the memory operand
    psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb xm2, xm2
    pextrb [bufq+xq], xm2, 0
    ; feed the new pixel back into the current-row vector (byte 3 = x),
    ; then advance that vector by one pixel
    pslldq xm2, 3
    vpblendvb xm1, xm2, xm11
    psrldq xm1, 1
    inc xq
    jz .x_loop_ar3_end
    test xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    dec hd
    jg .y_loop_ar3
    RET

; GEN_GRAIN_UV_FN ss_name, ss_x, ss_y
; Defines generate_grain_uv_<ss_name>_8bpc(buf, bufy, fg_data, uv).
; The seed is XORed with pw_seed_xor[uv] before the grain is generated.
; With ss_x set, 73-35*ss_y rows of 44 bytes are written (row pitch 82);
; otherwise 73*82 bytes are written back to back, 4 per iteration.
; NOTE(review): in the non-subsampled case 73*82 is not a multiple of 4, so
; the last store writes 2 bytes beyond buf+73*82 - presumably the buffer is
; padded; confirm against the caller
%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y
INIT_XMM avx2
cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv
%define base r4-generate_grain_uv_%1_8bpc_avx2_table
    lea r4, [generate_grain_uv_%1_8bpc_avx2_table]
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    mov r6d, [fg_dataq+FGData.grain_scale_shift]
    movq xm1, [base+next_upperbit_mask]
    movq xm4, [base+mul_bits]
    movq xm5, [base+hmul_bits]
    mova xm6, [base+pb_mask]
    vpbroadcastw xm7, [base+round+r6*2]
    vpbroadcastd xm2, [base+pw_seed_xor+uvq*4]
    pxor xm0, xm2
    lea r6, [gaussian_sequence]
%if %2
    mov r7d, 73-35*%3
    add bufq, 44
.loop_y:
    mov r5, -44
%else
    mov r5, -73*82
    sub bufq, r5
%endif
.loop:
    pand xm2, xm0, xm1
    psrlw xm3, xm2, 10
    por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw xm2, xm4 ; bits 0x0f00 are set
    pmulhuw xm0, xm5
    pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds
    psllq xm2, xm3, 30
    por xm2, xm3
    psllq xm3, xm2, 15
    por xm2, xm0 ; aggregate each bit into next seed's high bit
    por xm2, xm3 ; 4 next output seeds
    pshuflw xm0, xm2, q3333
    psrlw xm2, 5
    movq r8, xm2
    movzx r9d, r8w
    movd xm2, [r6+r9*2]
    rorx r9, r8, 32
    shr r8d, 16
    pinsrw xm2, [r6+r8*2], 1
    movzx r8d, r9w
    pinsrw xm2, [r6+r8*2], 2
    shr r9d, 16
    pinsrw xm2, [r6+r9*2], 3
    pmulhrsw xm2, xm7
    packsswb xm2, xm2
    movd [bufq+r5], xm2
    add r5, 4
    jl .loop
%if %2
    add bufq, 82
    dec r7d
    jg .loop_y
%endif

    ; auto-regression code
    movsxd r6, [fg_dataq+FGData.ar_coeff_lag]
    movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4]
    add r6, r4
    jmp r6

INIT_YMM avx2
; lag 0: only the luma-derived term is added to the chroma grain
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    ; the coefficients of plane uv start at ar_coeffs_uv + uv*28
    imul uvd, 28
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw xm2, xm2
%if %2
    ; pb_1 sums horizontally adjacent luma bytes via pmaddubsw; the pmulhrsw
    ; factor hmul_bits[1+ss_y] turns the 2- or 4-sample sum into a rounded average
    vpbroadcastd m7, [base+pb_1]
    vpbroadcastw m6, [base+hmul_bits+2+%3*2]
%endif
; GEN_GRAIN_UV_FN, continued: rest of the .ar0 setup, then .ar0-.ar3 bodies.
; %2 = ss_x, %3 = ss_y. In every lag the co-located luma grain (read from bufy,
; averaged over 2 or 4 samples when subsampled) contributes one extra tap.
    vpbroadcastw m2, xm2 ; luma coefficient, one per word
    vpbroadcastw m3, xm3 ; pmulhrsw factor selected by ar_coeff_shift
    pxor m12, m12 ; zero, used to build sign masks with pcmpgtb
%if %2
    sub bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub bufq, 82*70-3
%endif
    add bufyq, 3+82*3
    mov hd, 70-35*%3
.y_loop_ar0:
%if %2
    ; first 32 pixels
    movu xm4, [bufyq]
    vinserti128 m4, [bufyq+32], 1
%if %3
    movu xm0, [bufyq+82]
    vinserti128 m0, [bufyq+82+32], 1
%endif
    movu xm5, [bufyq+16]
    vinserti128 m5, [bufyq+48], 1
%if %3
    movu xm1, [bufyq+82+16]
    vinserti128 m1, [bufyq+82+48], 1
%endif
    ; m7 = pb_1: sum of each horizontal luma pair (plus the row below if ss_y)
    pmaddubsw m4, m7, m4
%if %3
    pmaddubsw m0, m7, m0
%endif
    pmaddubsw m5, m7, m5
%if %3
    pmaddubsw m1, m7, m1
    paddw m4, m0
    paddw m5, m1
%endif
    ; m6 = hmul_bits[1+ss_y]: rounded average of the summed luma samples
    pmulhrsw m4, m6
    pmulhrsw m5, m6
%else
    xor r3d, r3d
    ; first 32x2 pixels
.x_loop_ar0:
    movu m4, [bufyq+r3]
    ; sign-extend luma bytes to words
    pcmpgtb m0, m12, m4
    punpckhbw m5, m4, m0
    punpcklbw m4, m0
%endif
    ; luma * coefficient, then rounding shift via pmulhrsw
    pmullw m4, m2
    pmullw m5, m2
    pmulhrsw m4, m3
    pmulhrsw m5, m3
%if %2
    movu m1, [bufq]
%else
    movu m1, [bufq+r3]
%endif
    ; sign-extend the chroma grain, add the luma term, repack with saturation
    pcmpgtb m8, m12, m1
    punpcklbw m0, m1, m8
    punpckhbw m1, m8
    paddw m0, m4
    paddw m1, m5
    packsswb m0, m1
%if %2
    movu [bufq], m0
%else
    movu [bufq+r3], m0
    add r3d, 32
    cmp r3d, 64
    jl .x_loop_ar0
%endif

    ; last 6/12 pixels
    movu xm4, [bufyq+32*2]
%if %2
%if %3
    movu xm5, [bufyq+32*2+82]
%endif
    pmaddubsw xm4, xm7, xm4
%if %3
    pmaddubsw xm5, xm7, xm5
    paddw xm4, xm5
%endif
    movq xm0, [bufq+32]
    pmulhrsw xm4, xm6
    pmullw xm4, xm2
    pmulhrsw xm4, xm3
    pcmpgtb xm5, xm12, xm0
    punpcklbw xm5, xm0, xm5
    paddw xm4, xm5
    packsswb xm4, xm4
    ; keep the original word 3 so that only 6 bytes are modified
    pblendw xm0, xm4, xm0, 1000b
    movq [bufq+32], xm0
%else
    movu xm0, [bufq+64]
    pcmpgtb xm1, xm12, xm4
    punpckhbw xm5, xm4, xm1
    punpcklbw xm4, xm1
    pmullw xm5, xm2
    pmullw xm4, xm2
    ; zero the multiplier for the last 4 words so that only 12 pixels
    ; receive a luma contribution
    vpblendd xm1, xm3, xm12, 0x0c
    pmulhrsw xm5, xm1
    pmulhrsw xm4, xm3
    pcmpgtb xm1, xm12, xm0
    punpckhbw xm8, xm0, xm1
    punpcklbw xm0, xm1
    paddw xm5, xm8
    paddw xm0, xm4
    packsswb xm0, xm5
    movu [bufq+64], xm0
%endif

    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar0
    RET

INIT_XMM avx2
; lag 1: buf[y][x] += (c0*top_left + c1*top + c2*top_right + c3*left
;                      + c4*luma + rnd) >> shift, clamped to [-128, 127]
.ar1:
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul uvd, 28
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    ; overwrite c3 (handled as the scalar cf3) with c4, the luma coefficient
    pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw xm4, xm4
    pshufd xm5, xm4, q1111 ; (c2, c4) pairs
    pshufd xm4, xm4, q0000 ; (c0, c1) pairs
    pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
    vpbroadcastd xm7, [base+pb_1]
    vpbroadcastw xm6, [base+hmul_bits+2+%3*2]
%endif
    vpbroadcastd xm3, xm3
%if %2
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*70-(82-3)
%endif
    add bufyq, 79+82*3
    mov hd, 70-35*%3
    mov mind, -128
    mov maxd, 127
.y_loop_ar1:
    mov xq, -(76>>%2)
    movsx val3d, byte [bufq+xq-1] ; left neighbour of the first pixel
.x_loop_ar1:
    pmovsxbw xm0, [bufq+xq-82-1] ; top/left
%if %2
    movq xm8, [bufyq+xq*2]
%if %3
    movq xm9, [bufyq+xq*2+82]
%endif
%endif
    psrldq xm2, xm0, 2 ; top
    psrldq xm1, xm0, 4 ; top/right
%if %2
    pmaddubsw xm8, xm7, xm8
%if %3
    pmaddubsw xm9, xm7, xm9
    paddw xm8, xm9
%endif
    pmulhrsw xm8, xm6
%else
    pmovsxbw xm8, [bufyq+xq]
%endif
    punpcklwd xm0, xm2 ; (top/left, top) pairs
    punpcklwd xm1, xm8 ; (top/right, luma) pairs
    pmaddwd xm0, xm4
    pmaddwd xm1, xm5
    paddd xm0, xm1
    paddd xm0, xm3
.x_loop_ar1_inner:
    movd val0d, xm0
    psrldq xm0, 4
    imul val3d, cf3d
    add val3d, val0d
    sarx val3d, val3d, shiftd
    movsx val0d, byte [bufq+xq]
    add val3d, val0d
    ; clamp to [mind, maxd]
    cmp val3d, maxd
    cmovns val3d, maxd
    cmp val3d, mind
    cmovs val3d, mind
    mov byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc xq
    jz .x_loop_ar1_end
    test xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar1
    RET

; lag 2: cf0-11 are the spatial taps (two rows above, two pixels to the left),
; cf12 is the luma tap
.ar2:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    vpbroadcastw xm13, [base+round_vals-12+shiftq*2] ; rnd
    pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
    pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
    pinsrw xm0, [base+pw_1], 5 ; word 5 = 1, multiplies rnd
%if %2
    vpbroadcastw xm12, [base+hmul_bits+2+%3*2]
    vpbroadcastd xm11, [base+pb_1]
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd xm4, xm7, q0000 ; (cf0, cf1)
    pshufd xm5, xm7, q3333 ; (cf6, cf7)
    pshufd xm6, xm7, q1111 ; (cf2, cf3)
    pshufd xm7, xm7, q2222 ; (cf4, cf5)
    pshufd xm8, xm0, q0000 ; (cf8, cf9)
    pshufd xm9, xm0, q1111 ; (cf10, cf11)
    pshufd xm10, xm0, q2222 ; (cf12, 1)
%if %2
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*70-(82-3)
%endif
    add bufyq, 79+82*3
    mov hd, 70-35*%3
.y_loop_ar2:
    mov xq, -(76>>%2)

.x_loop_ar2:
    pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
    pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
    pshufb xm2, xm0, [base+gen_shufA]
    pmaddwd xm2, xm4
    pshufb xm3, xm1, [base+gen_shufB]
    pmaddwd xm3, xm5
    paddd xm2, xm3
    pshufb xm3, xm0, [base+gen_shufC]
    pmaddwd xm3, xm6
    punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5]
    punpcklwd xm0, xm1
    pmaddwd xm0, xm7
    ; NOTE(review): the only table reference in this macro that is not
    ; addressed via base+
    pshufb xm1, [gen_shufD]
    pmaddwd xm1, xm8
    paddd xm2, xm3
    paddd xm0, xm1
    paddd xm2, xm0
%if %2
    movq xm0, [bufyq+xq*2]
%if %3
    movq xm3, [bufyq+xq*2+82]
%endif
    pmaddubsw xm0, xm11, xm0
%if %3
    pmaddubsw xm3, xm11, xm3
    paddw xm0, xm3
%endif
    pmulhrsw xm0, xm12
%else
    pmovsxbw xm0, [bufyq+xq]
%endif
    punpcklwd xm0, xm13 ; (luma, rnd) pairs
    pmaddwd xm0, xm10
    paddd xm2, xm0
    movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw xm0, xm0
    pmaddwd xm3, xm0, xm9
    psrldq xm0, 2
    paddd xm3, xm2
    psrldq xm2, 4 ; shift top to next pixel
    ; shift count = low 64 bits of the memory operand
    ; NOTE(review): presumably ar_coeff_shift is a 64-bit field - confirm
    psrad xm3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq xm3, 2
    paddw xm3, xm0
    ; write the new pixel into word 1 of the current-row vector, which is
    ; the byte stored below
    pblendw xm0, xm3, 00000010b
    packsswb xm0, xm0
    pextrb [bufq+xq], xm0, 1
    inc xq
    jz .x_loop_ar2_end
    test xb, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar2
    RET

INIT_YMM avx2
; lag 3: cf0-23 are the spatial taps (three rows above, three pixels to the
; left), cf24 is the luma tap
.ar3:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul uvd, 28
    pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
    vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
    movd xm13, [base+round_vals-10+shiftq*2]
    vpbroadcastd xm14, [base+round_vals-14+shiftq*2]
    pshufd m6, m0, q0000
    pshufd m7, m0, q1111
    pshufd m8, m0, q2222
    pshufd m9, m0, q3333
    pshufd xm10, xm1, q0000
    pshufd xm11, xm1, q1111
    pshufhw xm12, xm1, q0000
    psraw xm2, 8 ; sign-extend the broadcast luma coefficient to words
    palignr xm13, xm1, 10
    punpckhwd xm12, xm2 ; interleave luma cf
    psrld xm14, 16 ; rounding word in the low half of each dword
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    vpbroadcastw xm15, [base+hmul_bits+2+%3*2]
    sub bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub bufq, 82*70-(82-3)
%endif
    add bufyq, 79+82*3
    mov hd, 70-35*%3
.y_loop_ar3:
    mov xq, -(76>>%2)

.x_loop_ar3:
    vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
    palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12]
    vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
    vpblendd m3, m1, 0x0f
    ; sign-extend bytes to words using pcmpgtb masks
    pxor m0, m0
    pcmpgtb m2, m0, m3
    pcmpgtb m0, m4
    punpcklbw m1, m3, m2
    punpckhbw m3, m2
    punpcklbw m2, m4, m0
    punpckhbw xm4, xm0
    pshufb m0, m1, [base+gen_shufA]
    pmaddwd m0, m6
    pshufb m5, m1, [base+gen_shufC]
    pmaddwd m5, m7
    shufps m1, m3, q1032
    paddd m0, m5
    pshufb m5, m1, [base+gen_shufA]
    pmaddwd m5, m8
    shufps xm1, xm3, q2121
    vpblendd m1, m2, 0xf0
    pshufb m1, [base+gen_shufE]
    pmaddwd m1, m9
    paddd m0, m5
    pshufb xm3, xm2, [base+gen_shufC]
    paddd m0, m1
    pmaddwd xm3, xm10
    palignr xm1, xm4, xm2, 2
    punpckhwd xm1, xm2, xm1
    pmaddwd xm1, xm11
    palignr xm4, xm2, 12
    paddd xm3, xm1
%if %2
    vpbroadcastd xm5, [base+pb_1]
    movq xm1, [bufyq+xq*2]
    pmaddubsw xm1, xm5, xm1
%if %3
    movq xm2, [bufyq+xq*2+82]
    pmaddubsw xm5, xm2
    paddw xm1, xm5
%endif
    pmulhrsw xm1, xm15
%else
    pmovsxbw xm1, [bufyq+xq]
%endif
    punpcklwd xm4, xm1
    pmaddwd xm4, xm12
    movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
    vextracti128 xm2, m0, 1
    paddd xm0, xm14
    paddd xm3, xm4
    paddd xm0, xm3
    paddd xm0, xm2
.x_loop_ar3_inner:
    pmovsxbw xm1, xm1
    pmaddwd xm2, xm13, xm1
    pshuflw xm3, xm2, q1032
    paddd xm2, xm0 ; add top
    paddd xm2, xm3 ; left+cur
    psrldq xm0, 4
    ; shift count = low 64 bits of the memory operand
    psrad xm2, [fg_dataq+FGData.ar_coeff_shift]
    psrldq xm1, 2
    ; don't packssdw, we only care about one value
    punpckldq xm2, xm2
    pblendw xm1, xm2, 0100b
    packsswb xm1, xm1
    pextrb [bufq+xq], xm1, 2
    inc xq
    jz .x_loop_ar3_end
    test xb, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add bufq, 82
    add bufyq, 82<<%3
    dec hd
    jg .y_loop_ar3
    RET
%endmacro

INIT_YMM avx2
cglobal fgy_32x32xn_8bpc, 6, 13, 15,
dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, see, overlap %define base r9-pd_m65536 lea r9, [pd_m65536] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] vpbroadcastd m8, [base+pd_m65536] vpbroadcastw m9, [base+mul_bits+r6*2-14] vpbroadcastd m10, [base+fg_min+r7*4] vpbroadcastd m11, [base+fg_max+r7*8] vpbroadcastd m12, [base+pw_1024] movq xm13, [base+pb_27_17_17_27] test sbyd, sbyd setnz r7b pxor m7, m7 test r7b, overlapb jnz .vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, overlap mov hd, hm mov grain_lutq, grain_lutmp .loop_y: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m5, [grain_lutq+offxyq] punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hd jg .loop_y add wq, 32 jge .end 
lea srcq, [src_bakq+wq] test overlapd, overlapd jz .loop_x ; r8m = sbym cmp dword r8m, 0 jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m5, [grain_lutq+offxyq] movd xm4, [grain_lutq+left_offxyq] punpcklbw xm4, xm5 pmaddubsw xm4, xm13, xm4 pmulhrsw xm4, xm12 packsswb xm4, xm4 vpblendd m4, m5, 0xfe punpckhbw m5, m7 punpcklbw m4, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hd jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq] ; r8m = sbym cmp dword r8m, 0 jne .loop_x_hv_overlap jmp .loop_x_h_overlap .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused, sby, see, overlap movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 
16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq .loop_x_v_overlap: vpbroadcastd m14, [pb_27_17] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_v_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq] movu m4, [grain_lutq+top_offxyq] punpcklbw m5, m4, m6 punpckhbw m4, m6 pmaddubsw m5, m14, m5 pmaddubsw m4, m14, m4 pmulhrsw m5, m12 pmulhrsw m4, m12 packsswb m5, m4 punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hb jz .end_y_v_overlap vpbroadcastd m14, 
[pb_17_27] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: add wq, 32 jge .end lea srcq, [src_bakq+wq] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap .loop_x_hv_overlap: vpbroadcastd m14, [pb_27_17] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+32] lea left_offxyd, [offyq+32] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_hv_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq] movd xm7, [grain_lutq+left_offxyq] movu m4, [grain_lutq+top_offxyq] movd xm5, [grain_lutq+topleft_offxyq] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw xm7, xm6 
punpcklbw xm5, xm4 pmaddubsw xm7, xm13, xm7 pmaddubsw xm5, xm13, xm5 pmulhrsw xm7, xm12 pmulhrsw xm5, xm12 packsswb xm7, xm7 packsswb xm5, xm5 vpblendd m7, m6, 0xfe vpblendd m5, m4, 0xfe ; followed by v interpolation (top | cur -> cur) punpckhbw m4, m6 punpcklbw m5, m7 pmaddubsw m4, m14, m4 pmaddubsw m5, m14, m5 pmulhrsw m4, m12 pmulhrsw m5, m12 pxor m7, m7 packsswb m5, m4 punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hb jz .end_y_hv_overlap vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: add wq, 32 lea srcq, [src_bakq+wq] jl .loop_x_hv_overlap .end: RET

; FGUV_FN name, ss_hor, ss_ver
;
; Emits the function fguv_32x32xn_i<name>_8bpc. The function body is generated
; twice from the local macro FGUV_32x32xN_LOOP: once with not-csfl=1 (the
; fall-through path) and once with not-csfl=0 at .csfl, which is entered when
; FGData.chroma_scaling_from_luma is non-zero.
; ss_hor / ss_ver only act as 0/1 selectors for the conditional code below;
; presumably they are the horizontal/vertical chroma subsampling flags (the
; macro is instantiated as 420: 1,1 / 422: 1,0 / 444: 0,0) - confirm against
; the C caller.
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, h, sby, luma, overlap, uv_pl, is_id
%define base r11-pd_m65536
    lea r11, [pd_m65536]
    mov r6d, [fg_dataq+FGData.scaling_shift]
    mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov r9d, is_idm
    mov sbyd, sbym
    mov overlapd, [fg_dataq+FGData.overlap_flag]
    vpbroadcastd m8, [base+pd_m65536]
    vpbroadcastw m9, [base+mul_bits+r6*2-14]
    vpbroadcastd m10, [base+fg_min+r7*4]
    ; fg_max is indexed by clip_to_restricted_range << is_id
    ; (assumes both values are 0 or 1 - TODO confirm)
    shlx r7d, r7d, r9d
    vpbroadcastd m11, [base+fg_max+r7*4]
    vpbroadcastd m12, [base+pw_1024]
    ; m7 = 0, used as the zero operand of the byte unpacks below
    pxor m7, m7
    ; r7b = (sby != 0)
    test sbyd, sbyd
    setnz r7b
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, sby, see, overlap, uv_pl
%if %1
    mov r6d, uv_plm
    vpbroadcastd m0, [base+pw_8]
    vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb m14, m0 ; uv_luma_mult, uv_mult
%elif %2
    vpbroadcastq m15, [base+pb_23_22]
%else
    vpbroadcastq xm15, [base+pb_27_17_17_27]
%endif
%if %3
    vpbroadcastw m13, [base+pb_23_22]
%elif %2
    pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27
%endif
    ; taken only when sby != 0 (r7b) and the low byte of overlap_flag share a
    ; set bit
    test r7b, overlapb
    jnz %%vertical_overlap

    imul seed, sbyd, (173 << 24) | 37
    add seed, (105 << 24) | 178
    rorx seed, seed, 24
    movzx seed, seew
    xor seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    mov lumaq, r9mp
    lea r12, [srcq+wq]
    lea r13, [dstq+wq]
    lea r14, [lumaq+wq*(1+%2)]
    ; the src/dst end pointers are parked in stack argument slots and
    ; reloaded at the start of every new column
    mov r11mp, r12
    mov r12mp, r13
    mov lstrideq, r10mp
    ; w counts up from -w; a column loop ends once it reaches 0 (jge .end)
    neg wq

%%loop_x:
    rorx r6, seeq, 1
    or seed, 0xEFF4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164>>%3
    lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov grain_lutq, grain_lutmp
    mov hd, hm
%%loop_y:
    ; src
%if %2
    mova xm3, [lumaq+lstrideq*0+ 0]
    vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1
    vpbroadcastd m2, [pb_1]
    mova xm0, [lumaq+lstrideq*0+16]
    vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova xm1, [srcq]
    vinserti128 m1, [srcq+strideq], 1
    pmaddubsw m3, m2
    pmaddubsw m0, m2
    pavgw m3, m7
    pavgw m0, m7
%else
    mova m2, [lumaq]
    mova m1, [srcq]
%endif
%if %1
%if %2
    packuswb m2, m3, m0 ; luma
%endif
    punpckhbw m3, m2, m1
    punpcklbw m2, m1 ; { luma, chroma }
    pmaddubsw m3, m14
    pmaddubsw m2, m14
    psraw m3, 6
    psraw m2, 6
    paddw m3, m15
    paddw m2, m15
    packuswb m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw m3, m2, m7
    punpckhbw m0, m2, m7
%endif

    ; scaling[luma_src]
    ; vpgatherdd clears its mask operand, so the mask is shuttled between m8
    ; and m6 around every gather
    pandn m4, m8, m3
    mova m6, m8
    vpgatherdd m2, [scalingq+m4-0], m8
    psrld m3, 16
    mova m8, m6
    vpgatherdd m4, [scalingq+m3-2], m6
    pandn m5, m8, m0
    mova m6, m8
    vpgatherdd m3, [scalingq+m5-0], m8
    psrld m0, 16
    mova m8, m6
    vpgatherdd m5, [scalingq+m0-2], m6
    pblendw m2, m4, 0xaa
    pblendw m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu xm5, [grain_lutq+offxyq+ 0]
    vinserti128 m5, [grain_lutq+offxyq+82], 1
%else
    movu m5, [grain_lutq+offxyq]
%endif
    punpcklbw m4, m5, m7
    punpckhbw m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw m2, m4
    pmaddubsw m3, m5
    pmulhrsw m2, m9
    pmulhrsw m3, m9

    ; unpack chroma_source
    punpcklbw m0, m1, m7
    punpckhbw m1, m7

    ; dst = clip_pixel(src, noise)
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    pmaxub m0, m10
    pminub m0, m11
%if %2
    mova [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova [dstq], m0
%endif

%if %2
    lea srcq, [srcq+strideq*2]
    lea dstq, [dstq+strideq*2]
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add srcq, strideq
    add dstq, strideq
    add lumaq, lstrideq
%endif
    add grain_lutq, 82<<%2
    sub hb, 1+%2
    jg %%loop_y

    add wq, 32>>%2
    jge .end
    mov srcq, r11mp
    mov dstq, r12mp
    lea lumaq, [r14+wq*(1+%2)]
    add srcq, wq
    add dstq, wq
    test overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx r6, seeq, 1
    or seed, 0xEFF4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164>>%3
    lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov grain_lutq, grain_lutmp
    mov hd, hm
%%loop_y_h_overlap:
    ; src
%if %2
    mova xm3, [lumaq+lstrideq*0+ 0]
    vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd m2, [pb_1]
    mova xm0, [lumaq+lstrideq*0+16]
    vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova xm1, [srcq]
    vinserti128 m1, [srcq+strideq], 1
    pmaddubsw m3, m2
    pmaddubsw m0, m2
    pavgw m3, m7
    pavgw m0, m7
%else
    mova m2, [lumaq]
    mova m1, [srcq]
%endif
%if %1
%if %2
    packuswb m2, m3, m0 ; luma
%endif
    punpckhbw m3, m2, m1
    punpcklbw m2, m1 ; { luma, chroma }
    pmaddubsw m3, m14
    pmaddubsw m2, m14
    psraw m3, 6
    psraw m2, 6
    paddw m3, m15
    paddw m2, m15
    packuswb m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw m3, m2, m7
    punpckhbw m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn m4, m8, m3
    mova m6, m8
    vpgatherdd m2, [scalingq+m4-0], m8
    psrld m3, 16
    mova m8, m6
    vpgatherdd m4, [scalingq+m3-2], m6
    pandn m5, m8, m0
    mova m6, m8
    vpgatherdd m3, [scalingq+m5-0], m8
    psrld m0, 16
    mova m8, m6
    vpgatherdd m5, [scalingq+m0-2], m6
    pblendw m2, m4, 0xaa
    pblendw m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu xm5, [grain_lutq+offxyq+ 0]
    vinserti128 m5, [grain_lutq+offxyq+82], 1
    movd xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128 m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw m4, m5
%if %1
    vpbroadcastq m0, [pb_23_22]
    pmaddubsw m4, m0, m4
%else
    pmaddubsw m4, m15, m4
%endif
    pmulhrsw m4, m12
    packsswb m4, m4
    vpblendd m4, m5, 0xee
%else
    movu m5, [grain_lutq+offxyq]
    movd xm4, [grain_lutq+left_offxyq]
    punpcklbw xm4, xm5
%if %1
    movq xm0, [pb_27_17_17_27]
    pmaddubsw xm4, xm0, xm4
%else
    pmaddubsw xm4, xm15, xm4
%endif
    pmulhrsw xm4, xm12
    packsswb xm4, xm4
    vpblendd m4, m5, 0xfe
%endif
    punpckhbw m5, m7
    punpcklbw m4, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw m2, m4
    pmaddubsw m3, m5
    pmulhrsw m2, m9
    pmulhrsw m3, m9

    ; unpack chroma_source
    punpcklbw m0, m1, m7
    punpckhbw m1, m7

    ; dst = clip_pixel(src, noise)
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    pmaxub m0, m10
    pminub m0, m11
%if %2
    mova [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova [dstq], m0
%endif

%if %2
    lea srcq, [srcq+strideq*2]
    lea dstq, [dstq+strideq*2]
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add srcq, strideq
    add dstq, strideq
    add lumaq, lstrideq
%endif
    add grain_lutq, 82*(1+%2)
    sub hb, 1+%2
    jg %%loop_y_h_overlap

    add wq, 32>>%2
    jge .end
    mov srcq, r11mp
    mov dstq, r12mp
    lea lumaq, [r14+wq*(1+%2)]
    add srcq, wq
    add dstq, wq

    ; r8m = sbym
    cmp dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    movzx sbyd, sbyb
    imul seed, [fg_dataq+FGData.seed], 0x00010001
    imul r7d, sbyd, 173 * 0x00010001
    imul sbyd, 37 * 0x01000100
    add r7d, (105 << 16) | 188
    add sbyd, (178 << 24) | (141 << 8)
    and r7d, 0x00ff00ff
    and sbyd, 0xff00ff00
    xor seed, r7d
    xor seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov lumaq, r9mp
    lea r12, [srcq+wq]
    lea r13, [dstq+wq]
    lea r14, [lumaq+wq*(1+%2)]
    mov r11mp, r12
    mov r12mp, r13
    mov lstrideq, r10mp
    neg wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    rorx offyd, seed, 8
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride

    mov grain_lutq, grain_lutmp
    mov hd, hm
    ; split the packed pair: low word -> top_offxy, high word -> offxy
    movzx top_offxyd, offxyw
    shr offxyd, 16
%if %2 == 0
    vpbroadcastd m13, [pb_27_17]
%endif
%%loop_y_v_overlap:
    ; src
%if %2
    mova xm3, [lumaq+lstrideq*0+ 0]
    vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd m2, [pb_1]
    mova xm0, [lumaq+lstrideq*0+16]
    vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova xm1, [srcq]
    vinserti128 m1, [srcq+strideq], 1
    pmaddubsw m3, m2
    pmaddubsw m0, m2
    pavgw m3, m7
    pavgw m0, m7
%else
    mova m2, [lumaq]
    mova m1, [srcq]
%endif
%if %1
%if %2
    packuswb m2, m3, m0 ; luma
%endif
    punpckhbw m3, m2, m1
    punpcklbw m2, m1 ; { luma, chroma }
    pmaddubsw m3, m14
    pmaddubsw m2, m14
    psraw m3, 6
    psraw m2, 6
    paddw m3, m15
    paddw m2, m15
    packuswb m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw m3, m2, m7
    punpckhbw m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn m4, m8, m3
    mova m6, m8
    vpgatherdd m2, [scalingq+m4-0], m8
    psrld m3, 16
    mova m8, m6
    vpgatherdd m4, [scalingq+m3-2], m6
    pandn m5, m8, m0
    mova m6, m8
    vpgatherdd m3, [scalingq+m5-0], m8
    psrld m0, 16
    mova m8, m6
    vpgatherdd m5, [scalingq+m0-2], m6
    pblendw m2, m4, 0xaa
    pblendw m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
    movu xm0, [grain_lutq+offxyq]
    vinserti128 m0, [grain_lutq+offxyq+82], 1
    movu xm4, [grain_lutq+top_offxyq]
    vinserti128 m4, [grain_lutq+top_offxyq+82], 1
%else
    movu m0, [grain_lutq+offxyq]
    movu m4, [grain_lutq+top_offxyq]
%endif
    punpcklbw m5, m4, m0
    punpckhbw m4, m0
    pmaddubsw m5, m13, m5
    pmaddubsw m4, m13, m4
    pmulhrsw m5, m12
    pmulhrsw m4, m12
    packsswb m5, m4
%else
    movq xm4, [grain_lutq+offxyq]
    vinserti128 m4, [grain_lutq+offxyq+8], 1
    movq xm5, [grain_lutq+top_offxyq]
    vinserti128 m5, [grain_lutq+top_offxyq+8], 1
    punpcklbw m5, m4
    pmaddubsw m5, m13, m5
    pmulhrsw m5, m12
    vextracti128 xm4, m5, 1
    packsswb xm5, xm4
    ; only interpolate first line, insert second line unmodified
    vinserti128 m5, [grain_lutq+offxyq+82], 1
%endif
    punpcklbw m4, m5, m7
    punpckhbw m5, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmaddubsw m2, m4
    pmaddubsw m3, m5
    pmulhrsw m2, m9
    pmulhrsw m3, m9

    ; unpack chroma_source
    punpcklbw m0, m1, m7
    punpckhbw m1, m7

    ; dst = clip_pixel(src, noise)
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    pmaxub m0, m10
    pminub m0, m11
%if %2
    mova [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova [dstq], m0
%endif

    sub hb, 1+%2
    jle %%end_y_v_overlap
%if %2
    lea srcq, [srcq+strideq*2]
    lea dstq, [dstq+strideq*2]
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add srcq, strideq
    add dstq, strideq
    add lumaq, lstrideq
%endif
    add grain_lutq, 82<<%2
%if %2 == 0
    vpbroadcastd m13, [pb_17_27]
    ; bit 31 of hd acts as a one-shot toggle: the first add sets it without a
    ; carry (loop once more), the second add carries out and falls through
    ; (assumes bit 31 of h is clear on entry - TODO confirm)
    add hd, 0x80000000
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
    add wq, 32>>%2
    jge .end
    mov srcq, r11mp
    mov dstq, r12mp
    lea lumaq, [r14+wq*(1+%2)]
    add srcq, wq
    add dstq, wq

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

%%loop_x_hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    lea topleft_offxyd, [top_offxyq+(32>>%2)]
    lea left_offxyd, [offyq+(32>>%2)]
    rorx offyd, seed, 8
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov grain_lutq, grain_lutmp
    mov hd, hm
    movzx top_offxyd, offxyw
    shr offxyd, 16
%if %2 == 0
    vpbroadcastd m13, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src
%if %2
    mova xm3, [lumaq+lstrideq*0+ 0]
    vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1
    vpbroadcastd m2, [pb_1]
    mova xm0, [lumaq+lstrideq*0+16]
    vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1
    mova xm1, [srcq]
    vinserti128 m1, [srcq+strideq], 1
    pmaddubsw m3, m2
    pmaddubsw m0, m2
    pavgw m3, m7
    pavgw m0, m7
%else
    mova m2, [lumaq]
    mova m1, [srcq]
%endif
%if %1
%if %2
    packuswb m2, m3, m0 ; luma
%endif
    punpckhbw m3, m2, m1
    punpcklbw m2, m1 ; { luma, chroma }
    pmaddubsw m3, m14
    pmaddubsw m2, m14
    psraw m3, 6
    psraw m2, 6
    paddw m3, m15
    paddw m2, m15
    packuswb m2, m3 ; pack+unpack = clip
%endif
%if %1 || %2 == 0
    punpcklbw m3, m2, m7
    punpckhbw m0, m2, m7
%endif

    ; scaling[luma_src]
    pandn m4, m8, m3
    mova m6, m8
    vpgatherdd m2, [scalingq+m4-0], m8
    psrld m3, 16
    mova m8, m6
    vpgatherdd m4, [scalingq+m3-2], m6
    pandn m5, m8, m0
    mova m6, m8
    vpgatherdd m3, [scalingq+m5-0], m8
    psrld m0, 16
    mova m8, m6
    vpgatherdd m5, [scalingq+m0-2], m6
    pblendw m2, m4, 0xaa
    pblendw m3, m5, 0xaa

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movu xm4, [grain_lutq+offxyq]
    vinserti128 m4, [grain_lutq+offxyq+82], 1
    movd xm0, [grain_lutq+left_offxyq]
    vinserti128 m0, [grain_lutq+left_offxyq+82], 1
    movd xm6, [grain_lutq+topleft_offxyq]
%if %3
    movq xm5, [grain_lutq+top_offxyq]
    vinserti128 m5, [grain_lutq+top_offxyq+8], 1
%else
    vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1
    movu xm5, [grain_lutq+top_offxyq]
    vinserti128 m5, [grain_lutq+top_offxyq+82], 1
%endif

    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw m0, m4
%if %3
    punpcklbw xm6, xm5
%else
    punpcklbw m6, m5
%endif
    punpcklqdq m0, m6
%if %1
    vpbroadcastq m6, [pb_23_22]
    pmaddubsw m0, m6, m0
%else
    pmaddubsw m0, m15, m0
%endif
    pmulhrsw m0, m12
    packsswb m0, m0
    vpblendd m4, m0, 0x11
%if %3
    pshuflw xm0, xm0, q1032
    vpblendd m5, m0, 0x01
%else
    pshuflw m0, m0, q1032
    vpblendd m5, m0, 0x11
%endif
%else
    movu m4, [grain_lutq+offxyq]
    movd xm0, [grain_lutq+left_offxyq]
    movu m5, [grain_lutq+top_offxyq]
    movd xm6, [grain_lutq+topleft_offxyq]
    punpcklbw xm0, xm4
    punpcklbw xm6, xm5
    punpcklqdq xm0, xm6
%if %1
    vpbroadcastq xm6, [pb_27_17_17_27]
    pmaddubsw xm0, xm6, xm0
%else
    pmaddubsw xm0, xm15, xm0
%endif
    pmulhrsw xm0, xm12
    packsswb xm0, xm0
    vpblendd m4, m0, 0x01
    pshuflw xm0, xm0, q1032
    vpblendd m5, m0, 0x01
%endif

    ; followed by v interpolation (top | cur -> cur)
%if %3
    vpermq m0, m4, q3120
    punpcklbw m5, m0
    pmaddubsw m5, m13, m5
    pmulhrsw m5, m12
    vextracti128 xm0, m5, 1
    packsswb xm5, xm0
    vpblendd m5, m4, 0xf0
%else
    punpckhbw m0, m5, m4
    punpcklbw m5, m4
    pmaddubsw m4, m13, m0
    pmaddubsw m5, m13, m5
    pmulhrsw m4, m12
    pmulhrsw m5, m12
    packsswb m5, m4
%endif
    punpcklbw m4, m5, m7
    punpckhbw m5, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmaddubsw m2, m4
    pmaddubsw m3, m5
    pmulhrsw m2, m9
    pmulhrsw m3, m9

    ; unpack chroma source
    punpcklbw m0, m1, m7
    punpckhbw m1, m7

    ; dst = clip_pixel(src, noise)
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    pmaxub m0, m10
    pminub m0, m11
%if %2
    mova [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1
%else
    mova [dstq], m0
%endif

%if %2
    lea srcq, [srcq+strideq*2]
    lea dstq, [dstq+strideq*2]
    lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
    add srcq, strideq
    add dstq, strideq
    add lumaq, lstrideq
%endif
    add grain_lutq, 82<<%2
    sub hb, 1+%2
%if %2
    jg %%loop_y_h_overlap
%else
    je %%end_y_hv_overlap
    vpbroadcastd m13, [pb_17_27]
    add hd, 0x80000000
    jnc %%loop_y_hv_overlap
    jmp %%loop_y_h_overlap
%endif

%%end_y_hv_overlap:
    add wq, 32>>%2
    jge .end
    mov srcq, r11mp
    mov dstq, r12mp
    lea lumaq, [r14+wq*(1+%2)]
    add srcq, wq
    add dstq, wq
    jmp %%loop_x_hv_overlap
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

GEN_GRAIN_UV_FN 420, 1, 1
FGUV_FN 420, 1, 1
GEN_GRAIN_UV_FN 422, 1, 0
FGUV_FN 422, 1, 0
GEN_GRAIN_UV_FN 444, 0, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain_avx512.asm000066400000000000000000000700401517466257200250410ustar00rootroot00000000000000; Copyright © 2022, VideoLAN and dav2d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. 
Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Byte index tables 0,2,...,126 and 1,3,...,127 (one 64-byte vector each).
pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
         db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
         db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
         db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
        db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
        db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95
        db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127
interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7
pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32
pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32
pb_27_17: times 2 db 27, 17
pb_23_22: times 2 db 23, 22
pw_8: times 2 dw 8
pw_1024: times 2 dw 1024
pb_17_27: times 2 db 17, 27
; fg_max / fg_min are read 4 bytes at a time; the luma function below indexes
; fg_min with r7*4 and fg_max with r7*8, where r7 is clip_to_restricted_range.
fg_max: times 4 db 255
        times 4 db 240
        times 4 db 235
fg_min: times 4 db 0
        times 4 db 16
; read as noise_rnd+scaling_shift*4-32, i.e. the first entry is the one used
; for scaling_shift == 8
noise_rnd: times 2 dw 128
           times 2 dw 64
           times 2 dw 32
           times 2 dw 16

SECTION .text

INIT_ZMM avx512icl
; fgy_32x32xn_8bpc
;
; Named arguments (from the cglobal line): dst, src, stride, fg_data, w,
; scaling, grain_lut, h, sby; "see" and "overlap" name further registers.
; The function walks the block in 32-pixel-wide columns (w advances by 32) and,
; inside a column, two rows per iteration (h decreases by 2). Four column
; variants exist: no overlap (.loop_x), horizontal overlap (.loop_x_h_overlap),
; vertical overlap (.v_overlap) and both (.hv_overlap). The actual
; "dst = clip(src + noise)" arithmetic lives in the shared .add_noise* tail.
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r11-fg_min
    lea r11, [fg_min]
    mov r6d, [fg_dataq+FGData.scaling_shift]
    mov r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov sbyd, sbym
    mov overlapd, [fg_dataq+FGData.overlap_flag]
    mov r12, 0x0000000f0000000f ; h_overlap mask
    ; m0-m3 hold 4*64 = 256 bytes loaded from scaling, so lookups can be done
    ; with byte permutes instead of memory gathers
    mova m0, [scalingq+64*0]
    mova m1, [scalingq+64*1]
    mova m2, [scalingq+64*2]
    mova m3, [scalingq+64*3]
    kmovq k1, r12
    vbroadcasti32x4 m4, [base+interleave_hl]
    vpbroadcastd ym16, [base+pb_27_17]
    vpbroadcastd m12, [base+pb_17_27]
    vpbroadcastd m6, [base+noise_rnd+r6*4-32]
    ; r6b = (sby != 0)
    test sbyd, sbyd
    setnz r6b
    vpbroadcastd m7, [base+fg_min+r7*4]
    vpbroadcastd m8, [base+fg_max+r7*8]
    ; m5 = 0, used as the zero operand of the byte unpacks
    pxor m5, m5
    vpbroadcastd m9, [base+pw_1024]
    vpbroadcastq m10, [base+pb_27_17_17_27]
    vmovdqa64 m12{k1}, m16
    ; taken only when sby != 0 (r6b) and the low byte of overlap_flag share a
    ; set bit
    test r6b, overlapb
    jnz .v_overlap

    imul seed, sbyd, (173 << 24) | 37
    add seed, (105 << 24) | 178
    rorx seed, seed, 24
    movzx seed, seew
    xor seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea src_bakq, [srcq+wq]
    ; w counts up from -w; dst is kept as (dst - src) so that a single
    ; pointer (src) can be advanced and [dstq+srcq] addresses the output
    neg wq
    sub dstq, srcq
.loop_x:
    rorx r6, seeq, 1
    or seed, 0xeff4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed
    rorx offyd, seed, 8
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164
    lea offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov grain_lutq, grain_lutmp
    mov hd, hm
.loop_y:
    ; two grain rows (82 bytes apart) are packed into one zmm register
    movu ym21, [grain_lutq+offxyq-82]
    vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1
    call .add_noise
    sub hb, 2
    jg .loop_y
    add wq, 32
    jge .end
    lea srcq, [src_bakq+wq]
    test overlapd, overlapd
    jz .loop_x
    test sbyd, sbyd
    jnz .hv_overlap

.loop_x_h_overlap:
    rorx r6, seeq, 1
    or seed, 0xeff4
    test seeb, seeh
    lea seed, [r6+0x8000]
    cmovp seed, r6d ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx offyd, seed, 8
    mov left_offxyd, offxd ; previous column's offy*stride
    rorx offxq, seeq, 12
    and offyd, 0xf
    imul offyd, 164
    lea offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
.loop_y_h_overlap:
    movu ym20, [grain_lutq+offxyq-82]
    vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1
    movd xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw m19, m20
    pmaddubsw m19, m10, m19
    pmulhrsw m19, m9
    punpckhbw m21, m20, m5
    ; only the bytes selected by k1 (h_overlap mask) are replaced by the
    ; blended values
    packsswb m20{k1}, m19, m19
    punpcklbw m20, m5, m20
    call .add_noise_h
    sub hb, 2
    jg .loop_y_h_overlap
    add wq, 32
    jge .end
    lea srcq, [src_bakq+wq]
    test sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    movzx r6d, sbyb
    imul seed, [fg_dataq+FGData.seed], 0x00010001
    imul r7d, r6d, 173 * 0x00010001
    imul r6d, 37 * 0x01000100
    add r7d, (105 << 16) | 188
    add r6d, (178 << 24) | (141 << 8)
    and r7d, 0x00ff00ff
    and r6d, 0xff00ff00
    xor seed, r7d
    xor seed, r6d ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea src_bakq, [srcq+wq]
    neg wq
    sub dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
    rorx offyd, seed, 8
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
    ; split the packed pair: low word -> top_offxy, high word -> offxy
    movzx top_offxyd, offxyw
    shr offxyd, 16
    movu ym19, [grain_lutq+offxyq-82]
    vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
    movu ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
    punpckhbw m20, m21, m19
    punpcklbw m21, m19
    call .add_noise_v
    ; only this first row pair goes through .add_noise_v; the remaining rows
    ; of the column are handled by the plain .loop_y
    sub hb, 2
    jg .loop_y
    add wq, 32
    jge .end
    lea srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov r6d, seed
    or seed, 0xeff4eff4
    test seeb, seeh
    setp r7b ; parity of top_seed
    shr seed, 16
    shl r7d, 16
    test seeb, seeh
    setp r7b ; parity of cur_seed
    or r6d, 0x00010001
    xor r7d, r6d
    rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov topleft_offxyd, top_offxyd
    rorx offyd, seed, 8
    mov left_offxyd, offxd
    rorx offxd, seed, 12
    and offyd, 0xf000f
    and offxd, 0xf000f
    imul offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov grain_lutq, grain_lutmp
    mov hd, hm
    movzx top_offxyd, offxyw
    shr offxyd, 16
    movu ym19, [grain_lutq+offxyq-82]
    vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1
    movd xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2
    movu ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1
    movd xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw m16, m19
    pmaddubsw m16, m10, m16
    punpcklbw m17, m21
    pmaddubsw m17, m10, m17
    punpckhbw m20, m21, m19
    pmulhrsw m16, m9
    pmulhrsw m17, m9
    packsswb m19{k1}, m16, m16
    packsswb m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw m21, m19
    call .add_noise_v
    ; remaining rows of this column only need horizontal overlap
    sub hb, 2
    jg .loop_y_h_overlap
    add wq, 32
    lea srcq, [src_bakq+wq]
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; Shared tail with three entry points that fall through into each other:
;   .add_noise_v - blends the byte pairs in m20/m21 with the weights in m12,
;                  then continues at .add_noise
;   .add_noise   - expects packed grain bytes in m21 and unpacks them
;   .add_noise_h - expects the grain already unpacked in m20/m21
; Each call loads two source rows, writes two destination rows and advances
; src by two strides and grain_lut by two rows (82*2).
.add_noise_v:
    pmaddubsw m20, m12, m20
    pmaddubsw m21, m12, m21
    pmulhrsw m20, m9
    pmulhrsw m21, m9
    packsswb m21, m20
.add_noise:
    punpcklbw m20, m5, m21
    punpckhbw m21, m5
.add_noise_h:
    mova ym18, [srcq+strideq*0]
    vinserti32x8 m18, [srcq+strideq*1], 1
    mova m19, m0
    punpcklbw m16, m18, m5
    vpermt2b m19, m18, m1 ; scaling[ 0..127]
    ; k2 = top bit of every source byte, i.e. set where src >= 128
    vpmovb2m k2, m18
    punpckhbw m17, m18, m5
    vpermi2b m18, m2, m3 ; scaling[128..255]
    vmovdqu8 m19{k2}, m18 ; scaling[src]
    pshufb m19, m4
    pmaddubsw m18, m19, m20
    pmaddubsw m19, m21
    add grain_lutq, 82*2
    pmulhrsw m18, m6 ; noise
    pmulhrsw m19, m6
    paddw m16, m18
    paddw m17, m19
    packuswb m16, m17
    pmaxub m16, m7
    pminub m16, m8
    mova [dstq+srcq], ym16
    add srcq, strideq
    vextracti32x8 [dstq+srcq], m16, 1
    add srcq, strideq
    ret

%macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \ scaling, grain_lut, h, sby, luma, \
overlap, uv_pl, is_id, _, stride3 lea r11, [fg_min] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, is_idm mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] %if %2 mov r12, 0x000f000f000f000f ; h_overlap mask vpbroadcastq m10, [base+pb_23_22_0_32] lea stride3q, [strideq*3] %else mov r12, 0x0000000f0000000f vpbroadcastq m10, [base+pb_27_17_17_27] %endif mova m0, [scalingq+64*0] mova m1, [scalingq+64*1] mova m2, [scalingq+64*2] mova m3, [scalingq+64*3] kmovq k1, r12 vbroadcasti32x4 m4, [base+interleave_hl] vpbroadcastd m6, [base+noise_rnd+r6*4-32] vpbroadcastd m7, [base+fg_min+r7*4] shlx r7d, r7d, r9d vpbroadcastd m8, [base+fg_max+r7*4] test sbyd, sbyd setnz r7b vpbroadcastd m9, [base+pw_1024] mova m11, [base+pb_even] mova m12, [base+pb_odd] pxor m5, m5 mov r5, r10mp ; lstride cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ h, sby, see, overlap, uv_pl, _, _, stride3 %if %1 mov r6d, uv_plm vpbroadcastd m16, [base+pw_8] vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] pshufb m14, m16 ; uv_luma_mult, uv_mult %endif test r7b, overlapb jnz %%v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, overlap, _, _, _, stride3 mov lumaq, r9mp lea r11, [srcq+wq] lea r12, [dstq+wq] lea r13, [lumaq+wq*(1+%2)] mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, 
offxy, see, overlap, _, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: %if %2 movu xm21, [grain_lutq+offxyq+82*0] vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 %endif call %%add_noise sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq test overlapd, overlapd jz %%loop_x cmp dword r8m, 0 ; sby jne %%hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, left_offxy, _, _, _, stride3 lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, left_offxy, _, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: %if %2 movu xm20, [grain_lutq+offxyq +82*0] movd xm19, [grain_lutq+left_offxyq+82*0] vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1 vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1 vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2 vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2 vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3 vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3 %else movu ym20, [grain_lutq+offxyq + 0] movd xm19, [grain_lutq+left_offxyq+ 0] vinserti32x8 m20, [grain_lutq+offxyq +82], 1 vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2 %endif punpcklbw m19, m20 pmaddubsw m19, m10, m19 punpckhbw m21, m20, m5 pmulhrsw m19, m9 vpacksswb m20{k1}, m19, m19 punpcklbw m20, m5, m20 call %%add_noise_h sub hb, 2<<%2 jg 
%%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq cmp dword r8m, 0 ; sby jne %%hv_overlap jmp %%loop_x_h_overlap %%v_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ _, sby, see, overlap, _, _, _, stride3 movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed %if %3 vpbroadcastd m13, [base+pb_23_22] kxnorw k3, k3, k3 ; v_overlap mask %elif %2 vbroadcasti32x8 m13, [base+pb_27_17] kxnord k3, k3, k3 pshufd m13, m13, q0000 ; 8x27_17, 8x17_27 %else vpbroadcastd ym16, [base+pb_27_17] vpbroadcastd m13, [base+pb_17_27] vmovdqa64 m13{k1}, m16 %endif DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, overlap, top_offxy, _, _, stride3 mov lumaq, r9mp lea r11, [srcq+wq] lea r12, [dstq+wq] lea r13, [lumaq+wq*(1<<%2)] mov r11mp, r11 mov r12mp, r12 neg wq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0x000f000f and offxd, 0x000f000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, overlap, top_offxy, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %3 movu xm18, [grain_lutq+offxyq+82*0] movu xm20, [grain_lutq+top_offxyq+82*0] ; only interpolate first line, insert remaining line unmodified 
vbroadcasti128 ym21, [grain_lutq+offxyq+82*1] vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 punpcklbw xm19, xm20, xm18 punpckhbw xm20, xm18 %elif %2 movu xm18, [grain_lutq+offxyq+82*0] vinserti128 ym18, [grain_lutq+offxyq+82*1], 1 movu xm20, [grain_lutq+top_offxyq+82*0] vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1 vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2] vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 punpcklbw ym19, ym20, ym18 punpckhbw ym20, ym18 %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 movu ym20, [grain_lutq+top_offxyq+82*0] vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq %%hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 lea topleft_offxyd, [top_offxyq+(32>>%2)] lea left_offxyd, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0x000f000f and offxd, 0x000f000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 movu xm21, [grain_lutq+offxyq+82*0] movd xm16, [grain_lutq+left_offxyq+82*0] vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1 
vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3 movd xm18, [grain_lutq+topleft_offxyq+82*0] movu xm20, [grain_lutq+top_offxyq] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m16, m21 %if %3 punpcklbw xm18, xm20 %else vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1 vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1 punpcklbw ym18, ym20 %endif punpcklqdq m16, m18 pmaddubsw m16, m10, m16 pmulhrsw m16, m9 packsswb m16, m16 vmovdqu8 m21{k1}, m16 %if %3 vpalignr xm20{k1}, xm16, xm16, 4 punpcklbw xm19, xm20, xm21 punpckhbw xm20, xm21 %else vpalignr ym20{k1}, ym16, ym16, 4 punpcklbw ym19, ym20, ym21 punpckhbw ym20, ym21 %endif %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 movd xm16, [grain_lutq+left_offxyq+82*0] vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2 movu ym20, [grain_lutq+top_offxyq+82*0] vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 movd xm18, [grain_lutq+topleft_offxyq+82*0] vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2 punpcklbw m16, m21 punpcklbw m18, m20 punpcklqdq m16, m18 pmaddubsw m16, m10, m16 pmulhrsw m16, m9 packsswb m16, m16 vpalignr m20{k1}, m16, m16, 4 vmovdqu8 m21{k1}, m16 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq jmp %%hv_overlap ALIGN function_align %%add_noise_v: %if %3 pmaddubsw xm19, xm13, xm19 pmaddubsw xm20, xm13, xm20 pmulhrsw xm19, xm9 pmulhrsw xm20, xm9 vpacksswb m21{k3}, m19, m20 %elif %2 pmaddubsw ym19, ym13, ym19 pmaddubsw ym20, ym13, ym20 pmulhrsw ym19, ym9 pmulhrsw ym20, ym9 vpacksswb m21{k3}, m19, m20 %else punpcklbw m19, m20, m21 punpckhbw m20, m21 pmaddubsw m19, m13, m19 pmaddubsw m20, m13, m20 pmulhrsw m19, m9 pmulhrsw m20, m9 packsswb m21, m19, m20 %endif 
%%add_noise: punpcklbw m20, m5, m21 punpckhbw m21, m5 %%add_noise_h: mova ym18, [lumaq+lstrideq*(0<<%3)] vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 %if %2 lea lumaq, [lumaq+lstrideq*(2<<%3)] mova ym16, [lumaq+lstrideq*(0<<%3)] vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1 mova xm17, [srcq+strideq*0] mova m19, m11 vpermi2b m19, m18, m16 vinserti128 ym17, [srcq+strideq*1], 1 vpermt2b m18, m12, m16 vinserti32x4 m17, [srcq+strideq*2], 2 pavgb m18, m19 vinserti32x4 m17, [srcq+stride3q ], 3 %else mova ym17, [srcq+strideq*0] vinserti32x8 m17, [srcq+strideq*1], 1 %endif %if %1 punpckhbw m19, m18, m17 punpcklbw m18, m17 ; { luma, chroma } pmaddubsw m19, m14 pmaddubsw m18, m14 psraw m19, 6 psraw m18, 6 paddw m19, m15 paddw m18, m15 packuswb m18, m19 .add_noise_main: mova m19, m0 vpermt2b m19, m18, m1 ; scaling[ 0..127] vpmovb2m k2, m18 vpermi2b m18, m2, m3 ; scaling[128..255] vmovdqu8 m19{k2}, m18 ; scaling[src] pshufb m19, m4 pmaddubsw m18, m19, m20 pmaddubsw m19, m21 add grain_lutq, 82*2<<%2 lea lumaq, [lumaq+lstrideq*(2<<%3)] lea srcq, [srcq+strideq*(2<<%2)] pmulhrsw m18, m6 ; noise pmulhrsw m19, m6 punpcklbw m16, m17, m5 ; chroma punpckhbw m17, m5 paddw m16, m18 paddw m17, m19 packuswb m16, m17 pmaxub m16, m7 pminub m16, m8 %if %2 mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+stride3q ], m16, 3 %else mova [dstq+strideq*0], ym16 vextracti32x8 [dstq+strideq*1], m16, 1 %endif lea dstq, [dstq+strideq*(2<<%2)] ret %else jmp .add_noise_main %endif %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro FGUV_FN 420, 1, 1 FGUV_FN 422, 1, 0 FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain_common.asm000066400000000000000000000041521517466257200253040ustar00rootroot00000000000000; Copyright © 2019-2022, VideoLAN and dav2d authors ; Copyright © 2019-2022, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
; FGData: field offsets of the film grain parameter block that the grain
; kernels address as [fg_dataq+FGData.<field>]. NASM strucs insert no implicit
; padding, so every offset below follows directly from the reservations that
; precede it; the total (FGData_size) is 220 bytes.
; NOTE(review): presumably this has to stay in sync with a C-side struct
; definition - confirm against the C headers before resizing or reordering.
struc FGData
    .seed:                      resd 1          ; +0:   loaded as the starting PRNG state by the generate_grain_* loops
    .num_y_points:              resd 1          ; +4:   not referenced by the code visible in this part of the archive
    .y_points:                  resb 14 * 2     ; +8:   28 bytes; not referenced by the code visible here
    .chroma_scaling_from_luma:  resd 1          ; +36:  fguv kernels compare its low byte with 0 to pick the .csfl path
    .num_uv_points:             resd 2          ; +40:  two dwords; not referenced by the code visible here
    .uv_points:                 resb 2 * 10 * 2 ; +48:  40 bytes; not referenced by the code visible here
    .scaling_shift:             resd 1          ; +88:  used as an index into the noise_rnd table
    .ar_coeff_lag:              resd 1          ; +92:  used as the index into the .ar0-.ar3 jump tables
    .ar_coeffs_y:               resb 24         ; +96:  read as signed bytes (cf0-23) by the luma AR filters
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
                                                ; +120: consumers address one 28-byte entry as uv*28
    .ar_coeff_shift:            resq 1          ; +176: 8 bytes. Read as a dword, and the SSSE3 AR filters also
                                                ;       hand it to psrad as a memory operand (count = low 64 bits)
                                                ;       - presumably why it is a qword; confirm before narrowing.
    .grain_scale_shift:         resd 1          ; +184: used as an index into the `round` table
    .uv_mult:                   resd 2          ; +188: addressed as uv_mult + uv_pl*4
    .uv_luma_mult:              resd 2          ; +196: picked up by the 16-byte load that starts at uv_mult + uv_pl*4
    .uv_offset:                 resd 2          ; +204: addressed as uv_offset + uv_pl*4
    .overlap_flag:              resd 1          ; +212: tested to choose between the plain and the overlap loops
    .clip_to_restricted_range:  resd 1          ; +216: used as an index into the fg_min / fg_max tables
endstruc

; Table defined outside this file; the grain generators read it as 16-bit
; entries ([table + index*2]).
cextern gaussian_sequence
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/filmgrain_sse.asm000066400000000000000002602251517466257200246130ustar00rootroot00000000000000
; Copyright © 2019-2021, VideoLAN and dav2d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" SECTION_RODATA pw_1024: times 8 dw 1024 pb_27_17_17_27: db 27, 17, 17, 27 times 6 db 0, 32 pb_23_22_h: db 23, 22 times 7 db 0, 32 pb_27_17: times 8 db 27, 17 pb_17_27: times 8 db 17, 27 pb_23_22: times 8 db 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 mul_bits: dw 256, 128, 64, 32, 16 round_vals: dw 32, 64, 128, 256, 512 max: dw 255, 240, 235 min: dw 0, 16 pw_1: dw 1 %macro JMP_TABLE 2-* %xdefine %1_8bpc_%2_table %%table %xdefine %%base %1_8bpc_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 SECTION .text %if ARCH_X86_32 %define PIC_ptr(a) base+a %else %define PIC_ptr(a) a %endif %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro INIT_XMM ssse3 cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data LEA r4, $$ 
%define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r2d, [fg_dataq+FGData.grain_scale_shift] movd m2, [base+round+r2*2] movd m0, [fg_dataq+FGData.seed] mova m5, [base+pb_mask] pshuflw m2, m2, q0000 pshuflw m0, m0, q0000 mov r2, -73*82 sub bufq, r2 lea r3, [base+gaussian_sequence] .loop: pand m6, m0, m1 psrlw m3, m6, 10 por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m6, m4 ; bits 0x0f00 are set pshufb m3, m5, m6 ; set 15th bit for next 4 seeds psllq m6, m3, 30 por m3, m6 psllq m6, m3, 15 por m3, m6 ; aggregate each bit into next seed's high bit pmulhuw m6, m0, m7 por m3, m6 ; 4 next output seeds pshuflw m0, m3, q3333 psrlw m3, 5 %if ARCH_X86_64 movq r6, m3 mov r8, r6 movzx r5d, r6w shr r6d, 16 shr r8, 32 movzx r7, r8w shr r8, 16 movd m6, [r3+r5*2] pinsrw m6, [r3+r6*2], 1 pinsrw m6, [r3+r7*2], 2 pinsrw m6, [r3+r8*2], 3 %else movd r6, m3 pshuflw m3, m3, q3232 movzx r5, r6w shr r6, 16 movd m6, [r3+r5*2] pinsrw m6, [r3+r6*2], 1 movd r6, m3 movzx r5, r6w shr r6, 16 pinsrw m6, [r3+r5*2], 2 pinsrw m6, [r3+r6*2], 3 %endif pmulhrsw m6, m2 packsswb m6, m6 movd [bufq+r2], m6 add r2, 4 jl .loop ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] jmp r2 .ar1: %if ARCH_X86_32 DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max %elif WIN64 DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 mov bufq, r0 %else DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 %endif movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd m4, [fg_dataq+FGData.ar_coeffs_y] mov ecx, [fg_dataq+FGData.ar_coeff_shift] %if ARCH_X86_32 mov r1m, cf3d DEFINE_ARGS buf, shift, val3, min, max, x, val0 %define hd r0mp %define cf3d r1mp %elif WIN64 DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 %else DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 %endif pxor m6, m6 pcmpgtb m7, m6, m4 
punpcklbw m4, m7 pinsrw m4, [base+pw_1], 3 pshufd m5, m4, q1111 pshufd m4, m4, q0000 movd m3, [base+round_vals+shiftq*2-12] ; rnd pshuflw m3, m3, q0000 sub bufq, 82*73-(82*3+79) mov hd, 70 mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -76 movsx val3d, byte [bufq+xq-1] .x_loop_ar1: movq m0, [bufq+xq-82-1] ; top/left pcmpgtb m7, m6, m0 punpcklbw m0, m7 psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 punpcklwd m1, m3 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 imul val3d, cf3d add val3d, val0d sar val3d, shiftb movsx val0d, byte [bufq+xq] add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov byte [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 dec hd jg .y_loop_ar1 .ar0: RET .ar2: %if ARCH_X86_32 ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m6, [base+round_vals-12+shiftq*2] movd m7, [base+byte_blend+1] SCRATCH 7, 15, 7 movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 pxor m7, m7 pshuflw m6, m6, q0000 punpcklwd m6, m7 pcmpgtb m4, m7, m0 pcmpgtb m5, m7, m1 punpcklbw m0, m4 punpcklbw m1, m5 DEFINE_ARGS buf, fg_data, h, x pshufd m4, m1, q0000 pshufd m5, m1, q1111 pshufd m3, m0, q3333 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 SCRATCH 6, 14, 6 sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar2: mov xq, -76 .x_loop_ar2: movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pcmpgtb m2, m7, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] psrldq m3, m1, 2 ; y=-1,x=[-1,+5] psrldq m4, m1, 4 ; y=-1,x=[+0,+5] punpcklwd m2, m0, m5 punpcklwd m3, m4 pmaddwd m2, m8 
pmaddwd m3, m11 paddd m2, m3 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] psrldq m5, m0, 6 ; y=-2,x=[+1,+5] psrldq m6, m0, 8 ; y=-2,x=[+2,+5] punpcklwd m4, m5 punpcklwd m6, m1 psrldq m5, m1, 6 ; y=-1,x=[+1,+5] psrldq m1, m1, 8 ; y=-1,x=[+2,+5] punpcklwd m5, m1 pmaddwd m4, m9 pmaddwd m6, m10 pmaddwd m5, m12 paddd m4, m6 paddd m2, m5 paddd m2, m4 paddd m2, m14 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] .x_loop_ar2_inner: pcmpgtb m4, m7, m0 punpcklbw m1, m0, m4 pmaddwd m3, m1, m13 paddd m3, m2 psrldq m1, 4 ; y=0,x=0 psrldq m2, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value paddw m3, m1 packsswb m3, m3 pslldq m3, 2 pand m3, m15 pandn m1, m15, m0 por m0, m1, m3 psrldq m0, 1 ; overwrite 2 pixels, but that's ok movd [bufq+xq-1], m0 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 dec hd jg .y_loop_ar2 RET .ar3: DEFINE_ARGS buf, fg_data, shift %if ARCH_X86_32 ALLOC_STACK -16*14 %elif WIN64 SUB rsp, 16*6 %assign stack_size_padded (stack_size_padded+16*6) %assign stack_size (stack_size+16*6) %else ALLOC_STACK -16*6 %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m6, [base+round_vals-12+shiftq*2] movd m7, [base+byte_blend] movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 pxor m3, m3 pcmpgtb m4, m3, m0 pcmpgtb m3, m2 pshuflw m6, m6, q0000 SCRATCH 6, 14, 12 SCRATCH 7, 15, 13 punpckhbw m1, m0, m4 punpcklbw m0, m4 punpcklbw m2, m3 pshufd m3, m0, q1111 pshufd m4, m0, q2222 pshufd m5, m0, q3333 pshufd m0, m0, q0000 mova [rsp+ 0*16], m0 mova [rsp+ 1*16], m3 mova [rsp+ 2*16], m4 mova [rsp+ 3*16], m5 pshufd m6, m1, q1111 pshufd m7, m1, q2222 pshufd m5, m1, q3333 pshufd m1, m1, q0000 pshufd m3, m2, q1111 psrldq m0, m2, 10 pinsrw m2, [base+pw_1], 5 pshufd m4, m2, q2222 pshufd m2, m2, q0000 pinsrw m0, [base+round_vals+shiftq*2-10], 3 mova [rsp+ 4*16], m1 mova [rsp+ 5*16], m6 SCRATCH 7, 8, 6 SCRATCH 5, 9, 7 
SCRATCH 2, 10, 8 SCRATCH 3, 11, 9 SCRATCH 4, 12, 10 SCRATCH 0, 13, 11 DEFINE_ARGS buf, fg_data, h, x sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar3: mov xq, -76 .x_loop_ar3: movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] pxor m3, m3 pcmpgtb m3, m0 punpckhbw m2, m0, m3 punpcklbw m0, m3 psrldq m5, m0, 2 psrldq m6, m0, 4 psrldq m7, m0, 6 punpcklwd m4, m0, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 0*16] pmaddwd m6, [rsp+ 1*16] paddd m4, m6 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] pxor m5, m5 pcmpgtb m5, m1 punpckhbw m3, m1, m5 punpcklbw m1, m5 palignr m6, m2, m0, 10 palignr m7, m2, m0, 12 psrldq m0, 8 punpcklwd m0, m6 punpcklwd m7, m1 pmaddwd m0, [rsp+ 2*16] pmaddwd m7, [rsp+ 3*16] paddd m0, m7 paddd m0, m4 psrldq m4, m1, 2 psrldq m5, m1, 4 psrldq m6, m1, 6 psrldq m7, m1, 8 punpcklwd m4, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 4*16] pmaddwd m6, [rsp+ 5*16] paddd m4, m6 paddd m0, m4 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] pxor m7, m7 pcmpgtb m7, m2 punpckhbw m5, m2, m7 punpcklbw m2, m7 palignr m7, m3, m1, 10 palignr m3, m1, 12 psrldq m1, m2, 2 punpcklwd m7, m3 punpcklwd m3, m2, m1 pmaddwd m7, m8 pmaddwd m3, m9 paddd m7, m3 paddd m0, m7 psrldq m6, m2, 4 psrldq m1, m2, 6 psrldq m3, m2, 8 palignr m4, m5, m2, 10 palignr m5, m5, m2, 12 punpcklwd m6, m1 punpcklwd m3, m4 punpcklwd m5, m14 pmaddwd m6, m10 pmaddwd m3, m11 pmaddwd m5, m12 paddd m0, m6 paddd m3, m5 paddd m0, m3 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] .x_loop_ar3_inner: pxor m5, m5 pcmpgtb m5, m1 punpcklbw m2, m1, m5 pmaddwd m2, m13 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value packsswb m2, m2 pslldq m2, 3 pand m2, m15 pandn m3, m15, m1 por m1, m2, m3 movd [bufq+xq-3], m1 psrldq m1, 1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82 dec hd jg .y_loop_ar3 RET %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 cglobal 
generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv movifnidn r2, r2mp movifnidn r3, r3mp LEA r4, $$ %define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r5d, [fg_dataq+FGData.grain_scale_shift] movd m6, [base+round+r5*2] mova m5, [base+pb_mask] movd m0, [fg_dataq+FGData.seed] movd m2, [base+pw_seed_xor+uvq*4] pxor m0, m2 pshuflw m6, m6, q0000 pshuflw m0, m0, q0000 lea r6, [base+gaussian_sequence] %if %2 %if ARCH_X86_64 mov r7d, 73-35*%3 %else mov r3mp, 73-35*%3 %endif add bufq, 44 .loop_y: mov r5, -44 .loop_x: %else mov r5, -82*73 sub bufq, r5 .loop: %endif pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set pshufb m3, m5, m2 ; set 15th bit for next 4 seeds psllq m2, m3, 30 por m3, m2 psllq m2, m3, 15 por m3, m2 ; aggregate each bit into next seed's high bit pmulhuw m2, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 %if ARCH_X86_64 movd r9d, m2 pshuflw m2, m2, q3232 movzx r8, r9w shr r9, 16 movd m3, [r6+r8*2] pinsrw m3, [r6+r9*2], 1 movd r9d, m2 movzx r8, r9w shr r9, 16 pinsrw m3, [r6+r8*2], 2 pinsrw m3, [r6+r9*2], 3 %else movd r2, m2 pshuflw m2, m2, q3232 movzx r1, r2w shr r2, 16 movd m3, [r6+r1*2] pinsrw m3, [r6+r2*2], 1 movd r2, m2 movzx r1, r2w shr r2, 16 pinsrw m3, [r6+r1*2], 2 pinsrw m3, [r6+r2*2], 3 %endif pmulhrsw m3, m6 packsswb m3, m3 movd [bufq+r5], m3 add r5, 4 %if %2 jl .loop_x add bufq, 82 %if ARCH_X86_64 dec r7d %else dec r3mp %endif jg .loop_y %else jl .loop %endif %if ARCH_X86_32 mov r2, r2mp %endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] jmp r5 .ar0: DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 ALLOC_STACK -2*16 %endif imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m5, 
[fg_dataq+FGData.ar_coeffs_uv+uvq] movd m4, [base+hmul_bits+shiftq*2] DEFINE_ARGS buf, bufy, h, x pxor m0, m0 pcmpgtb m0, m5 punpcklbw m5, m0 movd m7, [base+pb_1] %if %2 movd m6, [base+hmul_bits+2+%3*2] %endif pshuflw m5, m5, q0000 pshuflw m4, m4, q0000 pshufd m7, m7, q0000 %if %2 pshuflw m6, m6, q0000 %endif punpcklqdq m5, m5 punpcklqdq m4, m4 %if %2 punpcklqdq m6, m6 %endif pcmpeqw m1, m1 pslldq m1, 12>>%2 SCRATCH 1, 8, 0 SCRATCH 4, 9, 1 %if %2 sub bufq, 82*(73-35*%3)+82-(82*3+41) %else sub bufq, 82*70-3 %endif add bufyq, 3+82*3 mov hd, 70-35*%3 .y_loop_ar0: xor xd, xd .x_loop_ar0: ; first 32 pixels %if %2 movu m1, [bufyq+xq*2] %if %3 movu m2, [bufyq+xq*2+82] %endif movu m3, [bufyq+xq*2+16] %if %3 movu m4, [bufyq+xq*2+82+16] %endif pmaddubsw m0, m7, m1 %if %3 pmaddubsw m1, m7, m2 %endif pmaddubsw m2, m7, m3 %if %3 pmaddubsw m3, m7, m4 paddw m0, m1 paddw m2, m3 %endif pmulhrsw m0, m6 pmulhrsw m2, m6 %else movu m0, [bufyq+xq] pxor m6, m6 pcmpgtb m6, m0 punpckhbw m2, m0, m6 punpcklbw m0, m6 %endif pmullw m0, m5 pmullw m2, m5 pmulhrsw m0, m9 pmulhrsw m2, m9 movu m1, [bufq+xq] pxor m4, m4 pcmpgtb m4, m1 punpckhbw m3, m1, m4 %if %2 punpcklbw m1, m4 paddw m2, m3 paddw m0, m1 %else punpcklbw m6, m1, m4 paddw m2, m3 paddw m0, m6 %endif packsswb m0, m2 %if %2 movu [bufq+xq], m0 add xd, 16 cmp xd, 32 jl .x_loop_ar0 ; last 6/12 pixels movu m1, [bufyq+xq*(1+%2)] %if %3 movu m2, [bufyq+xq*2+82] %endif pmaddubsw m0, m7, m1 %if %3 pmaddubsw m1, m7, m2 paddw m0, m1 %endif pmulhrsw m0, m6 pmullw m0, m5 pmulhrsw m0, m9 movq m1, [bufq+xq] pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 paddw m0, m2 packsswb m0, m0 pandn m2, m8, m0 pand m1, m8 por m2, m1 movq [bufq+xq], m2 %else add xd, 16 cmp xd, 80 je .y_loop_final_ar0 movu [bufq+xq-16], m0 jmp .x_loop_ar0 .y_loop_final_ar0: pandn m2, m8, m0 pand m1, m8 por m2, m1 movu [bufq+xq-16], m2 %endif add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar0 RET .ar1: %if ARCH_X86_32 RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, 
val3, cf3, min, max, x imul uvd, 28 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 %if ARCH_X86_32 mov r3mp, cf3d DEFINE_ARGS buf, shift, fg_data, val3, min, max, x %elif WIN64 DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x mov bufq, r0 %else DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m3, [base+round_vals+shiftq*2-12] ; rnd %if %2 movd m7, [base+pb_1] movd m6, [base+hmul_bits+2+%3*2] %endif psrldq m4, 1 %if ARCH_X86_32 DEFINE_ARGS buf, shift, val0, val3, min, max, x %elif WIN64 DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 %else DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 %endif pxor m5, m5 punpcklwd m3, m5 %if %2 punpcklwd m6, m6 %endif pcmpgtb m5, m4 punpcklbw m4, m5 pshufd m5, m4, q1111 pshufd m4, m4, q0000 pshufd m3, m3, q0000 %if %2 pshufd m7, m7, q0000 pshufd m6, m6, q0000 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif %if ARCH_X86_32 add r1mp, 79+82*3 mov r0mp, 70-35*%3 %else add bufyq, 79+82*3 mov hd, 70-35*%3 %endif mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, byte [bufq+xq-1] .x_loop_ar1: %if %2 %if ARCH_X86_32 mov r2, r1mp movq m0, [r2+xq*2] %if %3 movq m1, [r2+xq*2+82] %endif %else movq m0, [bufyq+xq*2] %if %3 movq m1, [bufyq+xq*2+82] %endif %endif pmaddubsw m2, m7, m0 %if %3 pmaddubsw m0, m7, m1 paddw m2, m0 %endif pmulhrsw m2, m6 %else %if ARCH_X86_32 mov r2, r1mp movd m2, [r2+xq] %else movd m2, [bufyq+xq] %endif pxor m0, m0 pcmpgtb m0, m2 punpcklbw m2, m0 %endif movq m0, [bufq+xq-82-1] ; top/left pxor m1, m1 pcmpgtb m1, m0 punpcklbw m0, m1 psrldq m1, m0, 4 ; top/right punpcklwd m1, m2 psrldq m2, m0, 2 ; top punpcklwd m0, m2 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 paddd m0, m3 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 %if ARCH_X86_32 imul val3d, r3mp %else imul val3d, 
cf3d %endif add val3d, val0d sar val3d, shiftb movsx val0d, byte [bufq+xq] add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov byte [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 %if ARCH_X86_32 add r1mp, 82<<%3 dec r0mp %else add bufyq, 82<<%3 dec hd %endif jg .y_loop_ar1 RET .ar2: %if ARCH_X86_32 ALLOC_STACK -8*16 %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 movd m7, [base+round_vals-12+shiftq*2] movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 pxor m2, m2 pcmpgtb m2, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 pinsrw m1, [base+pw_1], 5 punpcklwd m7, m7 pshufd m7, m7, q0000 DEFINE_ARGS buf, bufy, fg_data, h, unused, x pshufd m4, m1, q0000 pshufd m5, m1, q1111 pshufd m6, m1, q2222 pshufd m3, m0, q3333 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 SCRATCH 6, 14, 6 SCRATCH 7, 15, 7 %if %2 movd m7, [base+hmul_bits+2+%3*2] movd m6, [base+pb_1] punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: pxor m2, m2 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pcmpgtb m2, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] psrldq m3, m1, 2 ; y=-1,x=[-1,+5] psrldq m4, m1, 4 ; y=-1,x=[+0,+5] punpcklwd m2, m0, m5 punpcklwd m3, m4 pmaddwd m2, m8 pmaddwd m3, m11 paddd m2, m3 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] psrldq m5, m0, 6 ; y=-2,x=[+1,+5] psrldq m0, 8 ; y=-2,x=[+2,+5] punpcklwd m4, m5 punpcklwd m0, m1 psrldq m3, m1, 6 ; y=-1,x=[+1,+5] psrldq m1, m1, 8 ; y=-1,x=[+2,+5] punpcklwd m3, m1 pmaddwd 
m4, m9 pmaddwd m0, m10 pmaddwd m3, m12 paddd m4, m0 paddd m2, m3 paddd m2, m4 %if %2 movq m1, [bufyq+xq*2] %if %3 movq m3, [bufyq+xq*2+82] %endif pmaddubsw m0, m6, m1 %if %3 pmaddubsw m1, m6, m3 paddw m0, m1 %endif pmulhrsw m0, m7 %else movd m0, [bufyq+xq] pxor m1, m1 pcmpgtb m1, m0 punpcklbw m0, m1 %endif punpcklwd m0, m15 pmaddwd m0, m14 paddd m2, m0 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] pxor m4, m4 movd m5, [base+byte_blend+1] punpcklbw m5, m5 .x_loop_ar2_inner: pcmpgtb m1, m4, m0 punpcklbw m0, m1 pmaddwd m3, m0, m13 paddd m3, m2 psrldq m2, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] pslldq m3, 4 pand m3, m5 paddw m0, m3 packsswb m0, m0 movd [bufq+xq-2], m0 psrldq m0, 1 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar2 RET .ar3: %if ARCH_X86_32 RESET_STACK_STATE %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 ALLOC_STACK -15*16 %else SUB rsp, 16*7 %assign stack_size_padded (stack_size_padded+16*7) %assign stack_size (stack_size+16*7) %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 pxor m3, m3 pcmpgtb m3, m0 punpckhbw m1, m0, m3 punpcklbw m0, m3 pshufd m2, m0, q1111 pshufd m3, m0, q2222 pshufd m4, m0, q3333 pshufd m0, m0, q0000 pshufd m5, m1, q1111 pshufd m6, m1, q2222 pshufd m7, m1, q3333 pshufd m1, m1, q0000 mova [rsp+ 0*16], m0 mova [rsp+ 1*16], m2 mova [rsp+ 2*16], m3 mova [rsp+ 3*16], m4 mova [rsp+ 4*16], m1 mova [rsp+ 5*16], m5 mova [rsp+ 6*16], m6 SCRATCH 7, 8, 7 movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] pxor m4, m4 pcmpgtb m4, m2 punpckhbw m5, m2, m4 punpcklbw m2, m4 pshufd m4, m2, q3232 punpcklwd m3, m4, m5 pshuflw m5, m4, q3321 pshufd m4, m3, q0000 pshufd m3, m2, q1111 pshufd m2, m2, q0000 pinsrw m5, [base+round_vals+shiftq*2-10], 3 SCRATCH 2, 9, 8 SCRATCH 3, 10, 9 SCRATCH 4, 11, 10 SCRATCH 
5, 12, 11 movd m2, [base+round_vals-12+shiftq*2] %if %2 movd m1, [base+pb_1] movd m3, [base+hmul_bits+2+%3*2] %endif pxor m0, m0 punpcklwd m2, m0 %if %2 punpcklwd m3, m3 %endif pshufd m2, m2, q0000 %if %2 pshufd m1, m1, q0000 pshufd m3, m3, q0000 SCRATCH 1, 13, 12 %endif SCRATCH 2, 14, 13 %if %2 SCRATCH 3, 15, 14 %endif DEFINE_ARGS buf, bufy, fg_data, h, unused, x %if %2 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] pxor m4, m4 pcmpgtb m4, m0 punpckhbw m3, m0, m4 punpcklbw m0, m4 psrldq m5, m0, 2 psrldq m6, m0, 4 psrldq m7, m0, 6 punpcklwd m4, m0, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 0*16] pmaddwd m6, [rsp+ 1*16] paddd m4, m6 palignr m2, m3, m0, 10 palignr m3, m0, 12 psrldq m0, 8 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] pxor m6, m6 pcmpgtb m6, m1 punpckhbw m5, m1, m6 punpcklbw m1, m6 punpcklwd m0, m2 punpcklwd m3, m1 pmaddwd m0, [rsp+ 2*16] pmaddwd m3, [rsp+ 3*16] paddd m0, m3 paddd m0, m4 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] pxor m7, m7 pcmpgtb m7, m2 punpckhbw m6, m2, m7 punpcklbw m2, m7 palignr m3, m5, m1, 10 palignr m5, m1, 12 psrldq m4, m2, 2 punpcklwd m3, m5 punpcklwd m5, m2, m4 pmaddwd m3, [rsp+ 6*16] pmaddwd m5, m8 paddd m3, m5 paddd m0, m3 psrldq m3, m1, 2 psrldq m4, m1, 4 psrldq m5, m1, 6 psrldq m1, 8 punpcklwd m3, m4 punpcklwd m5, m1 pmaddwd m3, [rsp+ 4*16] pmaddwd m5, [rsp+ 5*16] paddd m3, m5 paddd m0, m3 %if %2 movq m1, [bufyq+xq*2] %if %3 movq m3, [bufyq+xq*2+82] %endif pmaddubsw m7, m13, m1 %if %3 pmaddubsw m5, m13, m3 paddw m7, m5 %endif pmulhrsw m7, m15 %else movd m7, [bufyq+xq] pxor m1, m1 pcmpgtb m1, m7 punpcklbw m7, m1 %endif psrldq m1, m2, 4 psrldq m3, m2, 6 palignr m4, m6, m2, 10 palignr m6, m2, 12 psrldq m2, 8 punpcklwd m1, m3 punpcklwd m2, m4 punpcklwd m6, m7 pmaddwd m1, m9 pmaddwd m2, m10 pmaddwd m6, m11 paddd m1, m2 paddd m0, m6 paddd m0, m1 paddd m0, m14 movq m1, [bufq+xq-3] ; 
y=0,x=[-3,+4]
    pxor             m4, m4
    movd             m5, [base+byte_blend]
.x_loop_ar3_inner:
    pcmpgtb          m2, m4, m1
    punpcklbw        m3, m1, m2
    pmaddwd          m2, m3, m12
    pshufd           m3, m2, q1111
    paddd            m2, m3                 ; left+cur
    paddd            m2, m0                 ; add top
    psrldq           m0, 4
    psrad            m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb         m2, m2
    pandn            m3, m5, m1
    pslld            m2, 24
    pand             m2, m5
    por              m1, m2, m3
    movd    [bufq+xq-3], m1
    psrldq           m1, 1
    inc              xq
    jz .x_loop_ar3_end
    test             xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add            bufq, 82
    add           bufyq, 82<<%3
    dec              hd
    jg .y_loop_ar3
    RET
%endmacro

; Instantiate the chroma grain generator once per layout.
; Arguments are: name suffix, %2, %3 (used above in expressions such as
; '76>>%2' and '82<<%3').
generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; vpgatherdw: emulated gather of 8 words.
;   %1 = dst xmm, %2 = src xmm holding 8 word indices, %3 = base address
;   expression, %4/%5 = two scratch GPRs, %6 = optional scratch xmm.
; For every word lane i of src, a 16-bit value is loaded from
; [base + src.w[i]] (the index is used as a byte offset, unscaled) and placed
; in word lane i of dst.
; Indices are extracted two at a time (one dword per %rep iteration): the dword
; goes to %5, its low word is zero-extended into %4 and its high word is left
; in %5 by the shift.
; Clobbers %4 and %5. When only 5 arguments are given, %%tmp aliases src, so
; src is overwritten by the shuffles; with 6 arguments, %6 is overwritten
; instead.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd        %5 %+ d, %2
    ; low qword of tmp now starts with words 2,3 of src (for iteration 2)
    pshuflw       %%tmp, %2, q3232
%else
    movd        %5 %+ d, %%tmp
%if %%idx == 2
    ; bring the upper qword (words 4-7) down for iterations 3 and 4
    punpckhqdq    %%tmp, %%tmp
%elif %%idx == 4
    ; bring words 6,7 into the low dword for iteration 4
    psrlq         %%tmp, 32
%endif
%endif
    movzx       %4 %+ d, %5 %+ w
    shr         %5 %+ d, 16

%if %%idx == 0
    ; first lane uses movd, which also overwrites the rest of dst; the
    ; remaining lanes are filled in by the pinsrw instructions
    movd             %1, [%3+%4]
%else
    pinsrw           %1, [%3+%4], %%idx + 0
%endif
    pinsrw           %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
;
; Applies luma film grain: per 16-pixel column and per row it computes
;   noise = round2(scaling[src] * grain, scaling_shift)
;   dst   = clip(src + noise)
; as spelled out by the inline comments in the loops below.
;
; r8m is reused as a flag word once sby has been read from it:
;   bit 0 = left (horizontal) overlap, bit 1 = top (vertical) overlap,
;   bit 2 = toggled once per 16-pixel column (btc / xor 4) to alternate
;           between "pick new random offsets" and "reuse offsets + 16".
; Constants are parked via SCRATCH (defined elsewhere in this file) and are
; subsequently referenced as m11-m15; presumably m11 = mul_bits entry,
; m12 = max, m13 = min, m14 = pw_1024, m15 = pb_27_17_17_27 - confirm against
; the SCRATCH definition.
; On x86-32 the first gprsize-sized slots at [rsp+5*mmsize] hold
; left_offxy (slot 0), top_offxy (slot 1) and topleft_offxy (slot 2); byte
; offset 12 holds a pointer to the current vertical blend weights.
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov              r5, r8m

    ; only r0m/r2m/r4m/r6m/r7m/r8m are copied; the r1m/r3m/r5m slots are
    ; written later in this function (src_bak, see, picptr)
    mov [rsp+5*mmsize+ 4*gprsize], r0
    mov [rsp+5*mmsize+ 6*gprsize], r1
    mov [rsp+5*mmsize+ 8*gprsize], r2
    mov [rsp+5*mmsize+10*gprsize], r3
    mov [rsp+5*mmsize+11*gprsize], r4
    mov [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
    ; these three are read before rNm is redirected to the copies below
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
    LEA              r5, pb_mask
%define base r5-pb_mask
    ; keep the PIC base in memory so it can be reloaded after r5 is reused
    mov             r5m, picptrq
%else
cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    lea              r7, [pb_mask]
%define base r7-pb_mask
%endif
    ; broadcast the per-call constants selected by scaling_shift and
    ; clip_to_restricted_range to all word lanes
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    movd             m3, [base+mul_bits+r6*2-14]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    movd             m4, [base+max+r6*4]
    movd             m5, [base+min+r6*2]
    punpcklwd        m3, m3
    punpcklwd        m4, m4
    punpcklwd        m5, m5
    pshufd           m3, m3, q0000
    pshufd           m4, m4, q0000
    pshufd           m5, m5, q0000
    SCRATCH           3, 11, 0
    SCRATCH           4, 12, 1
    SCRATCH           5, 13, 2

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

    mov            sbyd, r8m
    mov        overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test       overlapd, overlapd
    jz .no_vertical_overlap
    ; overlap blending constants are only loaded when overlap is enabled
    mova             m6, [base+pw_1024]
    mova             m7, [base+pb_27_17_17_27]
    SCRATCH           6, 14, 3
    SCRATCH           7, 15, 4
    ; vertical overlap is only used when sby != 0
    test           sbyd, sbyd
    jnz .vertical_overlap
    ; fall-through

.no_vertical_overlap:
    ; from here on r8m holds the overlap flag word instead of sby
    mov             r8m, overlapd
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused
    imul           seed, (173 << 24) | 37
%else
    imul           seed, sbyd, (173 << 24) | 37
%endif
    add            seed, (105 << 24) | 178
    rol            seed, 8
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, unused3
%endif

    ; src_bak points at the end of the row and w becomes a negative column
    ; offset that counts up to zero; dst is kept as (dst - src) so that
    ; [dstq+srcq] addresses the destination pixel
    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
%endif

.loop_x:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; advance the seed: the parity flag of (seed | 0xEFF4), low byte AND
    ; high byte, selects whether bit 15 of the shifted seed is set
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; offy = (seed >> 8) & 0xf, offx = seed >> 12;
    ; 164 = 2*82 and 747 = 9*82+9, with 82 being the grain_lut row pitch
    ; used by 'add grain_lutq, 82' below
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused
%endif

.loop_x_odd:
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
    ; words are gathered from scaling-1, so after the right shift by 8 each
    ; lane holds the single byte scaling[src], zero-extended
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
%endif
    REPX {psrlw x, 8}, m4, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
    ; sign-extend the 16 grain bytes to words (m2 is zero here)
    pcmpgtb          m7, m2, m3
    punpcklbw        m2, m3, m7
    punpckhbw        m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m3, m5
    pmulhrsw         m2, m11
    pmulhrsw         m3, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m3
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hd
    jg .loop_y

    ; advance to the next 16-pixel column; w is negative and counts up
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    ; toggle bit 2; if it was already set, fresh offsets are needed
    btc       dword r8m, 2
    jc .next_blk

    add          offxyd, 16
    test      dword r8m, 2              ; r8m & 2 = have_top_overlap
    jz .loop_x_odd

%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add            r11d, 16                 ; top_offxyd
%endif
    ; branches on the flags of the 'add' just above
    jnz .loop_x_odd_v_overlap

.next_blk:
    test      dword r8m, 1
    jz .loop_x

    test      dword r8m, 2
    jnz .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
%if ARCH_X86_32
    ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr,
    ; r6m=grain_lut, r7m=h, r8m=overlap_v|h
    DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3

    add          offxyd, 16                 ; left_offxyd
    mov [rsp+5*mmsize+0*gprsize], offxyd

    DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3

    mov            seed, r3m
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
%endif

    ; same seed update and offset derivation as in .loop_x
    mov             r6d, seed
    or             seed, 0xEFF4
    shr             r6d, 1
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    shr           offxd, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offyq, [offyq+offxq*2+747] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy
%endif

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
%endif
    REPX {psrlw x, 8}, m4, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+0*gprsize]
    movd             m7, [grain_lutq+r5]
%else
    movd             m7, [grain_lutq+left_offxyq]
%endif
    ; blend the first bytes of the current grain with the left block's
    ; grain using the m15 weights and m14 rounding; shufps then keeps the
    ; blended low dword and takes the remaining dwords from m3 unchanged
    punpcklbw        m7, m3
    pmaddubsw        m6, m15, m7
    pmulhrsw         m6, m14
    packsswb         m6, m6
    shufps           m6, m3, q3210
    pcmpgtb          m2, m6
    punpcklbw        m7, m6, m2
    punpckhbw        m6, m2

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m7, m4
    pmullw           m6, m5
    pmulhrsw         m7, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m7
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hd
    jg .loop_y_h_overlap

%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4
    add          offxyd, 16

    ; since this half-block had left-overlap, the next does not
    test      dword r8m, 2              ; have_top_overlap
    jz .loop_x_odd
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add            r11d, 16                 ; top_offxyd
%endif
    jmp .loop_x_odd_v_overlap

.end:
    RET

.vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or         overlapd, 2                  ; top_overlap: overlap & 2
    mov             r8m, overlapd
    movzx          sbyd, sbyb
    ; build two 16-bit seeds at once, packed into one 32-bit register
    ; (see the "(cur_seed << 16) | top_seed" notes below)
%if ARCH_X86_32
    imul             r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul           tmpd, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add            tmpd, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and            tmpd, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, tmpd
%if ARCH_X86_32
    xor            sbyd, seed               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak

    mov             r3m, seed
    mov              wq, r4m
%else
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3
%endif

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub           dstmp, srcq
%if ARCH_X86_32
    mov             r1m, src_bakq
    mov             r4m, wq
    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%endif

.loop_x_v_overlap:
%if ARCH_X86_32
    mov            seed, r3m
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed,
    ; because of the 'and tmpd, 0x00ff00ff' above
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, unused, top_offxy

    mov           offyd, seed
    mov           offxd, seed
%endif
    ; both packed halves are processed at once (masks 0xf000f); the added
    ; 32*82 only affects the low (top) half
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, unused, top_offxy
%endif

    ; split the packed pair: low word -> top_offxy, high word -> offxy
    movzx    top_offxyd, offxyw
%if ARCH_X86_32
    mov [rsp+5*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
    shr          offxyd, 16

.loop_x_odd_v_overlap:
    ; select the blend weights for the first overlapped row: kept in m8 on
    ; x86-64, or as a pointer at [rsp+5*mmsize+12] on x86-32
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+12], r5
%else
    mova             m8, [pb_27_17]
%endif
    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; src
    mova             m0, [srcq]
    pxor             m2, m2
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m4, m0, scalingq-1, r0, r5, m3
    vpgatherdw       m5, m1, scalingq-1, r0, r5, m3
%else
    vpgatherdw       m4, m0, scalingq-1, r12, r13, m3
    vpgatherdw       m5, m1, scalingq-1, r12, r13, m3
%endif
    REPX {psrlw x, 8}, m4, m5

    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+1*gprsize]
    movu             m7, [grain_lutq+r5]
%else
    movu             m7, [grain_lutq+top_offxyq]
%endif
    ; interleave top and current grain bytes and blend all 16 columns
    punpckhbw        m6, m7, m3
    punpcklbw        m7, m3
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+12]
    pmaddubsw        m3, [r5], m6
    pmaddubsw        m6, [r5], m7
%else
    pmaddubsw        m3, m8, m6
    pmaddubsw        m6, m8, m7
%endif
    pmulhrsw         m3, m14
    pmulhrsw         m6, m14
    packsswb         m6, m3
    pcmpgtb          m7, m2, m6
    punpcklbw        m2, m6, m7
    punpckhbw        m6, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m2, m4
    pmullw           m6, m5
    pmulhrsw         m2, m11
    pmulhrsw         m6, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m2
    paddw            m1, m6
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

    ; switch to the weights for the second overlapped row
%if ARCH_X86_32
    add dword [rsp+5*mmsize+12], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    ; only the low 16 bits of h are decremented; bit 16 of hd is used as a
    ; one-shot flag by the btc below
    dec              hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1mp
    add            srcq, r4mp
%else
    lea            srcq, [src_bakq+wq]
%endif
    btc       dword r8m, 2
    jc .loop_x_hv_overlap
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.loop_x_hv_overlap:
%if ARCH_X86_32
    mov              r5, r5m
    lea              r5, [base+pb_27_17]
    mov [rsp+5*mmsize+12], r5

    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak

    ; remember the previous column's offsets (+16) as left / top-left
    mov              r5, [rsp+5*mmsize+1*gprsize]
    mov              r4, offxyd
    add              r5, 16
    add              r4, 16
    mov [rsp+5*mmsize+2*gprsize], r5        ; topleft_offxy
    mov [rsp+5*mmsize+0*gprsize], r4        ; left_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak

    xor            tmpd, tmpd
    mov            seed, r3m
%else
    mova             m8, [pb_27_17]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                tmp, unused2, see, unused3

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp           tmpb                     ; parity of top_seed
    shr            seed, 16
    shl            tmpd, 16
    test           seeb, seeh
    setp           tmpb                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor            tmpd, r6d
    mov            seed, tmpd
    ror            seed, 1                  ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov             r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx

    mov           offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea  topleft_offxyq, [top_offxyq+16]
    lea     left_offxyq, [offyq+16]
    mov           offyd, seed
    mov           offxd, seed
%endif
    ror           offyd, 8
    ror           offxd, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyq, [offyq+offxq*2+0x10001*747+32*82]

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut

    movzx            r5, offxyw             ; top_offxy
    mov [rsp+5*mmsize+1*gprsize], r5
%else
    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx    top_offxyd, offxyw
%endif
    shr          offxyd, 16

    mov              hd, r7m
    mov      grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; grain = grain_lut[offy+y][offx+x]
    movu             m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
    mov              r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
    movu             m6, [grain_lutq+r5]
    mov              r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
    movd             m4, [grain_lutq+r0]
    movd             m7, [grain_lutq+r5]
%else
    movu             m6, [grain_lutq+top_offxyq]
    movd             m4, [grain_lutq+left_offxyq]
    movd             m7, [grain_lutq+topleft_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw        m4, m3
    punpcklbw        m7, m6
    pmaddubsw        m2, m15, m4
    pmaddubsw        m4, m15, m7
    pmulhrsw         m2, m14
    pmulhrsw         m4, m14
    packsswb         m2, m2
    packsswb         m4, m4
    shufps           m2, m3, q3210
    shufps           m4, m6, q3210
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw        m3, m4, m2
    punpckhbw        m4, m2
%if ARCH_X86_32
    mov              r5, [rsp+5*mmsize+12]
    pmaddubsw        m7, [r5], m4
    pmaddubsw        m4, [r5], m3
%else
    pmaddubsw        m7, m8, m4
    pmaddubsw        m4, m8, m3
%endif
    pmulhrsw         m7, m14
    pmulhrsw         m4, m14
    packsswb         m4, m7
    pxor             m2, m2
    pcmpgtb          m7, m2, m4
    punpcklbw        m3, m4, m7
    punpckhbw        m4, m7

    ; src
    mova             m0, [srcq]
    punpckhbw        m1, m0, m2
    punpcklbw        m0, m2                 ; m0-1: src as word

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw       m5, m0, scalingq-1, r0, r5, m7
    vpgatherdw       m6, m1, scalingq-1, r0, r5, m7
%else
    vpgatherdw       m5, m0, scalingq-1, r13, r14, m7
    vpgatherdw       m6, m1, scalingq-1, r13, r14, m7
%endif
    REPX {psrlw x, 8}, m5, m6

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw           m3, m5
    pmullw           m4, m6
    pmulhrsw         m3, m11
    pmulhrsw         m4, m11

    ; dst = clip_pixel(src, noise)
    paddw            m0, m3
    paddw            m1, m4
    pmaxsw           m0, m13
    pmaxsw           m1, m13
    pminsw           m0, m12
    pminsw           m1, m12
    packuswb         m0, m1
    movifnidn      dstq, dstmp
    mova    [dstq+srcq], m0

%if ARCH_X86_32
    add dword [rsp+5*mmsize+12], mmsize
%else
    mova             m8, [pb_17_27]
%endif
    add            srcq, r2mp
    add      grain_lutq, 82
    dec              hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    btc              hd, 16
    jnc .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
%if ARCH_X86_32
    add            r4mp, 16
%else
    add              wq, 16
%endif
    jge .end_hv
%if ARCH_X86_32
    mov            srcq, r1m
    add            srcq, r4m
%else
    lea            srcq, [src_bakq+wq]
%endif
    xor       dword r8m, 4
    add          offxyd, 16
%if ARCH_X86_32
    add dword [rsp+5*mmsize+1*gprsize], 16
%else
    add      top_offxyd, 16
%endif
    jmp .loop_x_odd_v_overlap

.end_hv:
    RET

%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                         sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
        tmp, src, scaling, h, fg_data, picptr, unused
    mov              r0, r0m
    mov              r1, r2m
    mov              r2, r4m
    mov              r3, r6m
    mov              r4, r7m
    mov [rsp+7*mmsize+3*gprsize], r0
    mov [rsp+7*mmsize+5*gprsize], r1
    mov [rsp+7*mmsize+7*gprsize], r2
    mov [rsp+7*mmsize+9*gprsize], r3
    mov [rsp+7*mmsize+10*gprsize], r4

    mov              r0, r8m
    mov              r1, r9m
    mov              r2, r10m
    mov              r4, r11m
    mov              r3, r12m
    mov [rsp+7*mmsize+11*gprsize], r0
    mov [rsp+7*mmsize+12*gprsize], r1
    mov [rsp+7*mmsize+13*gprsize], r2
    mov [rsp+7*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
        tmp, src, scaling, h, fg_data, picptr, unused
%endif
    mov            srcq, srcm
    mov        fg_dataq, r3m
    mov        scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define
r1m [rsp+7*mmsize+ 4*gprsize] %define r2m [rsp+7*mmsize+ 5*gprsize] %define r3m [rsp+7*mmsize+ 6*gprsize] %define r4m [rsp+7*mmsize+ 7*gprsize] %define r5m [rsp+7*mmsize+ 8*gprsize] %define r6m [rsp+7*mmsize+ 9*gprsize] %define r7m [rsp+7*mmsize+10*gprsize] %define r8m [rsp+7*mmsize+11*gprsize] %define r9m [rsp+7*mmsize+12*gprsize] %define r10m [rsp+7*mmsize+13*gprsize] %define r11m [rsp+7*mmsize+14*gprsize] %define r12m [rsp+7*mmsize+15*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, r5 %else cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id lea r8, [pb_mask] %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] lea tmpd, [r6d*2] %if ARCH_X86_32 && STACK_ALIGNMENT < mmsize test r3, r3 %else cmp dword r12m, 0 ; is_idm %endif movd m5, [base+min+r6*2] cmovne r6d, tmpd movd m4, [base+max+r6*2] punpcklwd m3, m3 punpcklwd m5, m5 punpcklwd m4, m4 pshufd m3, m3, q0000 pshufd m5, m5, q0000 pshufd m4, m4, q0000 SCRATCH 3, 11, 0 SCRATCH 4, 12, 1 SCRATCH 5, 13, 2 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %endif %if %1 mov r6d, dword r11m movd m0, [fg_dataq+FGData.uv_mult+r6*4] movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] punpcklbw m6, m1, m0 movd m7, [fg_dataq+FGData.uv_offset+r6*4] punpcklwd m6, m6 punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 SCRATCH 6, 14, 3 SCRATCH 7, 15, 4 %endif mov sbyd, r8m mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 test overlapd, overlapd jz %%no_vertical_overlap %if ARCH_X86_32 %if %2 mova m1, [base+pb_23_22_h] %else mova m1, [base+pb_27_17_17_27] %endif mova m0, 
[base+pw_1024] %else %if %2 mova m1, [pb_23_22_h] %else mova m1, [pb_27_17_17_27] %endif mova m0, [pw_1024] %endif SCRATCH 0, 8, 5 SCRATCH 1, 9, 6 test sbyd, sbyd jnz %%vertical_overlap ; fall-through %%no_vertical_overlap: mov r8m, overlapd %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak %define luma_bakq lumaq mov wq, r4m %if %3 shl r10mp, 1 %endif %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak mov lstrideq, r10mp %endif mov lumaq, r9mp lea src_bakq, [srcq+wq] lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub r0mp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r11m, luma_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq mov r12mp, strideq %endif %%loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, unused1, unused2, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak %endif %%loop_x_odd: mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: ; src %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, 
[srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq+ 0] pcmpgtb m6, m2, m3 punpcklbw m2, m3, m6 punpckhbw m3, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; we already incremented lumaq above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 dec hw jg %%loop_y %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 == 0 ; adjust top_offxy %if 
ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif add offxyd, 16 btc dword r8m, 2 jc %%loop_x_even test dword r8m, 2 jz %%loop_x_odd jmp %%loop_x_odd_v_overlap %%loop_x_even: %endif test dword r8m, 1 jz %%loop_x ; r8m = sbym test dword r8m, 2 jne %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: %if ARCH_X86_32 %if %2 lea r6, [offxyd+16] mov [rsp+7*mmsize+0*gprsize], r6 %else mov [rsp+7*mmsize+0*gprsize], offxyd %endif DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut mov seed, r3m %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride %if %2 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx %else mov left_offxyd, offyd %endif %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_h_overlap: ; src %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 
%endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq+ 0] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+0*gprsize] movd m2, [grain_lutq+r0+ 0] %else movd m2, [grain_lutq+left_offxyq+ 0] %endif punpcklbw m2, m4 pmaddubsw m3, m9, m2 pmulhrsw m3, m8 packsswb m3, m3 shufps m3, m4, q3210 pxor m4, m4 pcmpgtb m4, m3 punpcklbw m2, m3, m4 punpckhbw m3, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; lumaq has already been incremented above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 dec hw jg %%loop_y_h_overlap %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 == 0 xor dword r8m, 4 ; adjust top_offxyd %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 
16 %endif add offxyd, 16 %endif ; r8m = sbym test dword r8m, 2 %if %2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %else jne %%loop_x_odd_v_overlap jmp %%loop_x_odd %endif %%end: RET %%vertical_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap %endif or overlapd, 2 ; top_overlap: overlap & 2 mov r8m, overlapd movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul tmpd, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add tmpd, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and tmpd, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, tmpd %if ARCH_X86_32 xor sbyd, seed ; (cur_seed << 16) | top_seed DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %if %3 shl r10mp, 1 %endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak mov lstrideq, r10mp %endif mov lumaq, r9mp lea src_bakq, [srcq+wq] lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub r0mp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r11m, luma_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq mov r12mp, strideq %endif %%loop_x_v_overlap: %if ARCH_X86_32 mov seed, r3m xor tmpd, tmpd %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, 
scaling, grain_lut, \ offx, offy, see, overlap, top_offxy, unused, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak %endif movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 mov [rsp+7*mmsize+1*gprsize], top_offxyd DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %endif %%loop_x_odd_v_overlap: mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m %endif %if %3 mova m1, [PIC_ptr(pb_23_22)] %else mova m1, [PIC_ptr(pb_27_17)] %endif %%loop_y_v_overlap: %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+1*gprsize] movu m4, 
[grain_lutq+r0] %else movu m4, [grain_lutq+top_offxyq] %endif punpckhbw m6, m4, m3 punpcklbw m4, m3 pmaddubsw m2, m1, m6 pmaddubsw m3, m1, m4 pmulhrsw m2, m8 pmulhrsw m3, m8 packsswb m3, m2 pxor m6, m6 pcmpgtb m6, m3 punpcklbw m2, m3, m6 punpckhbw m3, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; unpack chroma_source pxor m4, m4 punpckhbw m6, m0, m4 punpcklbw m0, m4 ; m0-1: src as word %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m6, m3 pmaxsw m0, m13 pmaxsw m6, m13 pminsw m0, m12 pminsw m6, m12 packuswb m0, m6 movifnidn dstq, dstmp mova [dstq+srcq], m0 dec hw je %%end_y_v_overlap %if ARCH_X86_32 add srcq, r2mp ; lumaq has already been incremented above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 %if %3 == 0 btc hd, 16 %if ARCH_X86_32 mov r5, r5m %endif mova m1, [PIC_ptr(pb_17_27)] jnc %%loop_y_v_overlap %endif jmp %%loop_y %%end_y_v_overlap: %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %else %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif add offxyd, 16 btc dword r8m, 2 jnc %%loop_x_odd_v_overlap %endif %%loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused mov r6, [rsp+7*mmsize+1*gprsize] %if %2 lea r0, [r3d+16] add r6, 16 mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy %else mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy %endif mov [rsp+7*mmsize+2*gprsize], r6 ; 
topleft_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused mov seed, r3m xor tmpd, tmpd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride %if %2 lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offxyq+16] %else mov topleft_offxyq, top_offxyq mov left_offxyq, offxyq %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak %endif movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 mov [rsp+7*mmsize+1*gprsize], top_offxyd %endif mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m %endif %if %3 mova m3, [PIC_ptr(pb_23_22)] %else mova m3, [PIC_ptr(pb_27_17)] %endif %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy movd m1, [grain_lutq+r0] mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy %else movd m1, [grain_lutq+topleft_offxyq] %endif movu m2, [grain_lutq+offxyq] %if ARCH_X86_32 
movu m6, [grain_lutq+r5] movd m4, [grain_lutq+r0] %else movu m6, [grain_lutq+top_offxyq] movd m4, [grain_lutq+left_offxyq] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m1, m6 punpcklbw m4, m2 pmaddubsw m0, m9, m1 pmaddubsw m1, m9, m4 REPX {pmulhrsw x, m8}, m0, m1 packsswb m0, m1 shufps m4, m0, m2, q3232 shufps m0, m6, q3210 ; followed by v interpolation (top | cur -> cur) punpcklbw m2, m0, m4 punpckhbw m0, m4 pmaddubsw m4, m3, m0 pmaddubsw m1, m3, m2 pmulhrsw m4, m8 pmulhrsw m1, m8 packsswb m1, m4 ; src %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else %if %3 vpgatherdw m7, m4, scalingq-1, r2, r12 vpgatherdw m5, m6, scalingq-1, r2, r12 %else vpgatherdw m7, m4, scalingq-1, r2, r13 vpgatherdw m5, m6, scalingq-1, r2, r13 %endif %endif REPX {psrlw x, 8}, m7, m5 ; unpack grain pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 punpckhbw m1, m4 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m7 pmullw m1, m5 pmulhrsw m2, m11 pmulhrsw m1, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; unpack chroma source pxor m4, m4 
punpckhbw m5, m0, m4 punpcklbw m0, m4 ; m0-1: src as word ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m5, m1 pmaxsw m0, m13 pmaxsw m5, m13 pminsw m0, m12 pminsw m5, m12 packuswb m0, m5 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; lumaq has been adjusted above already %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*(1+%2)] %else add lumaq, r10mp %endif %endif add grain_lutq, 82 dec hw %if %3 jg %%loop_y_h_overlap %else jle %%end_y_hv_overlap %if ARCH_X86_32 mov r5, r5m %endif mova m3, [PIC_ptr(pb_17_27)] btc hd, 16 jnc %%loop_y_hv_overlap %if ARCH_X86_64 mov lstrideq, r10mp %endif jmp %%loop_y_h_overlap %%end_y_hv_overlap: %if ARCH_X86_64 mov lstrideq, r10mp %endif %endif %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 jmp %%loop_x_hv_overlap %else %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif add offxyd, 16 xor dword r8m, 4 jmp %%loop_x_odd_v_overlap %endif %%end_hv: RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 %endmacro FGUV_FN 420, 1, 1 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif FGUV_FN 422, 1, 0 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif FGUV_FN 444, 0, 0 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred.h000066400000000000000000000132631517466257200225410ustar00rootroot00000000000000/* * Copyright © 2018-2021, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * x86 glue for the intra-prediction DSP context: declares the assembly entry
 * points and provides intra_pred_dsp_init_x86(), which installs them into a
 * Dav2dIntraPredDSPContext according to the CPU flags reported at run time.
 */

#include "src/cpu.h"
#include "src/ipred.h"

/*
 * Declare the prototypes of one assembly function for all three instruction
 * set variants handled here (ssse3, avx2, avx512icl). `type` selects which
 * decl_<type>_fn() prototype macro is used; `name` is the function name
 * without the dav2d_ prefix. BF() is defined elsewhere -- presumably it
 * appends the bitdepth/ISA suffix to the symbol name; confirm against its
 * definition.
 */
#define decl_fn(type, name) \
    decl_##type##_fn(BF(dav2d_##name, ssse3)); \
    decl_##type##_fn(BF(dav2d_##name, avx2)); \
    decl_##type##_fn(BF(dav2d_##name, avx512icl))

/*
 * Store the `suffix` variant of dav2d_<name> into slot `type1` of the
 * function-pointer array `type0`. Expands to an assignment through a variable
 * named `c`, so it can only be used where such a context pointer is in scope.
 */
#define init_fn(type0, type1, name, suffix) \
    c->type0[type1] = BF(dav2d_##name, suffix)

/* Shorthand for init_fn() targeting the c->intra_pred[] table. */
#define init_angular_ipred_fn(type, name, suffix) \
    init_fn(intra_pred, type, name, suffix)

/*
 * Prototypes for every predictor, in all three ISA variants. Note that
 * ipred_filter is declared here but is never installed by the init function
 * below.
 */
decl_fn(angular_ipred, ipred_dc);
decl_fn(angular_ipred, ipred_dc_128);
decl_fn(angular_ipred, ipred_dc_top);
decl_fn(angular_ipred, ipred_dc_left);
decl_fn(angular_ipred, ipred_h);
decl_fn(angular_ipred, ipred_v);
decl_fn(angular_ipred, ipred_paeth);
decl_fn(angular_ipred, ipred_smooth);
decl_fn(angular_ipred, ipred_smooth_h);
decl_fn(angular_ipred, ipred_smooth_v);
decl_fn(angular_ipred, ipred_z1);
decl_fn(angular_ipred, ipred_z2);
decl_fn(angular_ipred, ipred_z3);
decl_fn(angular_ipred, ipred_filter);

decl_fn(pal_pred, pal_pred);

/*
 * Install the x86 assembly intra predictors into `c`.
 *
 * The function works in tiers: SSSE3 first, then (x86-64 builds only) AVX2,
 * then AVX-512. Each tier returns early when its CPU flag is missing, and a
 * later tier simply overwrites the pointers written by an earlier one, so the
 * order of the sections matters. Entries that are not assigned here keep
 * whatever value `c` already held on entry.
 *
 * Many assignments are compiled out with `#if 0`; the reason is not visible
 * in this file. What is actually installed:
 *   - SSSE3:   DC_128_PRED, PAETH_PRED, pal_pred
 *   - AVX2:    DC_128_PRED, PAETH_PRED, pal_pred for every bitdepth, plus
 *              DC/TOP_DC/LEFT_DC/HOR/VERT/SMOOTH/SMOOTH_H/SMOOTH_V when
 *              BITDEPTH == 8
 *   - AVX-512: PAETH_PRED and pal_pred for every bitdepth, DC_128_PRED only
 *              when BITDEPTH == 8
 */
static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav2dIntraPredDSPContext *const c) {
    const unsigned flags = dav2d_get_cpu_flags();

    /* Nothing below SSSE3 is provided: leave `c` untouched. */
    if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return;

    /* ---- SSSE3 tier ---- */
#if 0
    init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
#endif
    init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
#if 0
    init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
    init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
    init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
    init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
#endif
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
#if 0
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
    init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
    init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
    //init_angular_ipred_fn(DIP_PRED, ipred_dip, ssse3);
#endif

    c->pal_pred = BF(dav2d_pal_pred, ssse3);

    /* The AVX2 and AVX-512 tiers are only compiled for x86-64 builds. */
#if ARCH_X86_64
    if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return;

    /* ---- AVX2 tier: several predictors are limited to 8-bit builds ---- */
#if BITDEPTH == 8
    init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
#endif
    init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
#if BITDEPTH == 8
    init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
    init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
    init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
    init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
#endif
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
#if BITDEPTH == 8
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
#endif
#if 0
    init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
    init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
    //init_angular_ipred_fn(DIP_PRED, ipred_dip, avx2);
#endif

    c->pal_pred = BF(dav2d_pal_pred, avx2);

    if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return;

    /*
     * ---- AVX-512 tier ----
     * The outer BITDEPTH == 8 guard covers DC_128_PRED as well as the
     * disabled DC/TOP_DC/LEFT_DC/HOR/VERT entries; PAETH_PRED and pal_pred
     * sit outside it and are installed for every bitdepth.
     */
#if BITDEPTH == 8
#if 0
    init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
#endif
    init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
#if 0
    init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
    init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
    init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
    init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
#endif
#endif
    init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
#if 0
    init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
    init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
    init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
    init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
    //init_angular_ipred_fn(DIP_PRED, ipred_dip, avx512icl);
#endif

    c->pal_pred = BF(dav2d_pal_pred, avx512icl);
#endif
}
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred16_avx2.asm000066400000000000000000005377731517466257200242220ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 %macro SMOOTH_WEIGHTS 1-* const smooth_weights_1d_16bpc ; sm_weights[] << 7 %rep %0 dw %1*128 %rotate 1 %endrep const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] %rep %0 dw %1, 256-%1 %rotate 1 %endrep %endmacro SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 %if ARCH_X86_64 ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 pw_m1024: times 2 dw -1024 pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 pb_90: times 4 db 90 
z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 z_filter_k: dw 4, 4, 5, 5, 4, 4 dw 8, 8, 6, 6, 4, 4 dw 0, 0, 0, 0, 2, 2 %define pw_2 (z_filter_k+32) %define pw_4 (z_filter_k+ 0) %define pw_16 (z2_ymul8 +20) pw_1: times 2 dw 1 pw_3: times 2 dw 3 pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) %define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 
7, 6, dst, stride, tl, w, h movifnidn hd, hm add tlq, 2 movd xm4, wd pxor xm3, xm3 pavgw xm4, xm3 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm sub tlq, hq movd xm4, hd sub tlq, hq pxor xm3, xm3 pavgw xm4, xm3 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: paddw m0, [tlq+96] paddw m0, [tlq+64] .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm3 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 lea stride3q, [strideq*3] vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd xm4, r5d tzcnt r5d, r5d movd xm5, r5d lea r5, [ipred_dc_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw xm4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw xm0, xm0 .s4: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET 
ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm3, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw xm0, xm0 .s8: movu [dstq+strideq*0], xm0 movu [dstq+strideq*1], xm0 movu [dstq+strideq*2], xm0 movu [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m0 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 mova m1, m0 .s32: movu [dstq+strideq*0+32*0], m0 movu [dstq+strideq*0+32*1], m1 movu [dstq+strideq*1+32*0], m0 movu [dstq+strideq*1+32*1], m1 movu [dstq+strideq*2+32*0], m0 movu [dstq+strideq*2+32*1], m1 movu [dstq+stride3q +32*0], m0 movu [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-128] 
mova m1, [tlq- 96] paddw m0, [tlq- 64] paddw m1, [tlq- 32] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] paddw m0, [tlq+34] paddw m1, [tlq+66] paddw m0, [tlq+98] paddw m0, m1 vextracti128 xm1, m0, 1 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm1, xm4 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 64 je .w64_end mov r6d, 0x6667AAAB shrx r6d, r6d, hd movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w64_end: vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m shr r6d, 11 lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+34] movu m2, [tlq+66] movu m3, [tlq+98] lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq %macro IPRED_H 2 ; w, store_type vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mov%2 [dstq+strideq*0], m0 mov%2 [dstq+strideq*1], m1 mov%2 [dstq+strideq*2], m2 mov%2 [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET ALIGN function_align %endmacro cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 movifnidn hd, hm lea r5, [ipred_h_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq INIT_XMM avx2 .w4: IPRED_H 
4, q .w8: IPRED_H 8, a INIT_YMM avx2 .w16: IPRED_H 16, a .w32: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m2 mova [dstq+strideq*2+32*1], m2 mova [dstq+stride3q +32*0], m3 mova [dstq+stride3q +32*1], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] sub tlq, 4 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*0+32*2], m0 mova [dstq+strideq*0+32*3], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m1 mova [dstq+strideq*1+32*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m1 psubw m7, m3, m0 ; tldiff psubw m0, m%1 ; tdiff pabsw m7, m7 pabsw m0, m0 pminsw m7, m0 pcmpeqw m0, m7 pcmpgtw m7, m%3, m7 vpblendvb m0, m3, m%1, m0 vpblendvb m0, m1, m0, m7 %endmacro cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h %define base r5-ipred_paeth_16bpc_avx2_table movifnidn hd, hm lea r5, [ipred_paeth_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r5 jmp wq .w4: vpbroadcastq m2, [tlq+2] ; top movsldup m6, [base+ipred_hv_shuf] lea r3, [strideq*3] psubw m4, m2, m3 pabsw m5, m4 .w4_loop: sub tlq, 8 vpbroadcastq m1, [tlq] pshufb m1, m6 ; left PAETH 2, 4, 5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: vbroadcasti128 m2, [tlq+2] movsldup m6, [base+ipred_hv_shuf] psubw m4, m2, m3 pabsw m5, m4 .w8_loop: sub tlq, 4 vpbroadcastd m1, [tlq] pshufb m1, m6 PAETH 2, 4, 5 movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg 
.w8_loop RET ALIGN function_align .w16: movu m2, [tlq+2] psubw m4, m2, m3 pabsw m5, m4 .w16_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 movu [dstq], m0 add dstq, strideq dec hd jg .w16_loop RET ALIGN function_align .w32: movu m2, [tlq+2] movu m6, [tlq+34] %if WIN64 movaps r4m, xmm8 movaps r6m, xmm9 %endif psubw m4, m2, m3 psubw m8, m6, m3 pabsw m5, m4 pabsw m9, m8 .w32_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 movu [dstq+32*0], m0 PAETH 6, 8, 9 movu [dstq+32*1], m0 add dstq, strideq dec hd jg .w32_loop %if WIN64 movaps xmm8, r4m movaps xmm9, r6m %endif RET ALIGN function_align .w64: WIN64_SPILL_XMM 16 movu m2, [tlq+ 2] movu m6, [tlq+34] movu m10, [tlq+66] movu m13, [tlq+98] psubw m4, m2, m3 psubw m8, m6, m3 psubw m11, m10, m3 psubw m14, m13, m3 pabsw m5, m4 pabsw m9, m8 pabsw m12, m11 pabsw m15, m14 .w64_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq+32*0], m0 PAETH 6, 8, 9 mova [dstq+32*1], m0 PAETH 10, 11, 12 mova [dstq+32*2], m0 PAETH 13, 14, 15 mova [dstq+32*3], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_16bpc_avx2_table lea r6, [ipred_smooth_v_16bpc_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m5, [tlq+hq*2] ; bottom add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m3 pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movhps [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movq [dstq+r6 ], xm0 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop .ret: RET .w8: vbroadcasti128 m4, [tlq+2] movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 .w8_loop: vpbroadcastd m0, [weightsq+hq*2+0] vpbroadcastd m1, [weightsq+hq*2+4] pshufb m0, m3 pshufb m1, m3 pmulhrsw m0, m4 
pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 vextracti128 [dstq+strideq*0], m0, 1 mova [dstq+strideq*1], xm0 vextracti128 [dstq+strideq*2], m1, 1 mova [dstq+r6 ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: movu m4, [tlq+2] lea r6, [strideq*3] psubw m4, m5 .w16_loop: vpbroadcastw m0, [weightsq+hq*2+0] vpbroadcastw m1, [weightsq+hq*2+2] vpbroadcastw m2, [weightsq+hq*2+4] vpbroadcastw m3, [weightsq+hq*2+6] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r6 ], m3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [tlq+ 2] movu m6, [tlq+34] psubw m4, m5 psubw m6, m5 .w32_loop: vpbroadcastw m1, [weightsq+hq*2+0] vpbroadcastw m3, [weightsq+hq*2+2] pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [tlq+ 2] movu m4, [tlq+34] movu m6, [tlq+66] movu m7, [tlq+98] REPX {psubw x, m5}, m3, m4, m6, m7 .w64_loop: vpbroadcastw m2, [weightsq+hq*2] pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw m0, m6, m2 mova [dstq+32*1], m1 pmulhrsw m1, m7, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 %define base r6-ipred_smooth_h_16bpc_avx2_table lea r6, [ipred_smooth_h_16bpc_avx2_table] mov wd, wm movifnidn hd, hm vpbroadcastw m5, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [r6+wq*4] sub tlq, hq lea stride3q, [strideq*3] add wq, r6 jmp wq .w4: vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] movsldup m3, [base+ipred_hv_shuf] .w4_loop: vpbroadcastq m0, [tlq+hq-8] ; left 
pshufb m0, m3 psubw m0, m5 ; left - right pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] movsldup m3, [base+ipred_hv_shuf] .w8_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastd m1, [tlq+hq-8] pshufb m0, m3 pshufb m1, m3 psubw m0, m5 psubw m1, m5 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w8_loop RET .w16: movu m4, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: vpbroadcastq m3, [tlq+hq-8] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [base+smooth_weights_1d_16bpc+32*2] movu m6, [base+smooth_weights_1d_16bpc+32*3] .w32_loop: vpbroadcastw m1, [tlq+hq-2] vpbroadcastw m3, [tlq+hq-4] psubw m1, m5 psubw m3, m5 pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [base+smooth_weights_1d_16bpc+32*4] movu m4, [base+smooth_weights_1d_16bpc+32*5] movu m6, [base+smooth_weights_1d_16bpc+32*6] movu m7, [base+smooth_weights_1d_16bpc+32*7] .w64_loop: vpbroadcastw m2, [tlq+hq-2] psubw m2, m5 pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw 
m0, m6, m2
; NOTE(review): everything down to the first RET below is the tail of a
; .w64_loop whose beginning lies before this span; it is reproduced unchanged.
    mova [dstq+32*1], m1
    pmulhrsw m1, m7, m2
    paddw m0, m5
    paddw m1, m5
    mova [dstq+32*2], m0
    mova [dstq+32*3], m1
    add dstq, strideq
    sub hq, 1*2
    jg .w64_loop
    RET

; SMOOTH_2D_END src1, src2, mul1, mul2, add1, add2
;   m0 = (pmaddwd(m<src1>, m<mul1>) + m<add1>) >> 8   (dwords)
;   m1 = (pmaddwd(m<src2>, m<mul2>) + m<add2>) >> 8   (dwords)
;   m0 = pavgw(packssdw(m0, m1), m5)
; Overwrites m0 and m1. m5 is expected to hold zero (ipred_smooth_16bpc clears
; it with pxor before dispatching), so the final pavgw is a rounding halving.
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    pmaddwd m0, m%1, m%3
    pmaddwd m1, m%2, m%4
    paddd m0, m%5
    paddd m1, m%6
    psrld m0, 8
    psrld m1, 8
    packssdw m0, m1
    pavgw m0, m5
%endmacro

; ipred_smooth_16bpc(dst, stride, tl, w, h, ...)
; Two-dimensional "smooth" intra predictor for 16-bit pixels (AVX2).
; Every output pixel combines a (top, bottom) pair weighted by a per-row entry
; of smooth_weights_2d_16bpc with a (left, right) pair weighted by a per-column
; entry of the same table; see SMOOTH_2D_END for the final scaling.
; Common setup, valid in every width branch until .w64 renames registers:
;   m4         = broadcast of the word at tl[w]            ("right")
;   m5         = 0
;   tlq        = tl - 2*h bytes, so [tlq] is "bottom" and [tlq+hq*2+2] is the
;                first pixel of the top row
;   v_weightsq = smooth_weights_2d_16bpc + h*4 (4 bytes per row)
;   r6         = ipred_smooth_16bpc_avx2_table, also the anchor for `base`
; The width branch is selected through the jump table indexed by tzcnt(w).
; NOTE(review): table entries look like word pairs matching the interleaved
; pixel pairs fed to pmaddwd -- confirm against the table definition.
cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_16bpc_avx2_table
    lea r6, [ipred_smooth_16bpc_avx2_table]
    mov wd, wm
    vpbroadcastw m4, [tlq+wq*2] ; right
    tzcnt wd, wd
    mov hd, hm
    sub tlq, hq
    sub tlq, hq
    movsxd wq, [r6+wq*4]
    pxor m5, m5
    add wq, r6
    lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4]
    jmp wq
; ---- w == 4: four rows per iteration --------------------------------------
.w4:
    WIN64_SPILL_XMM 11
    vpbroadcastw m0, [tlq] ; bottom
    vpbroadcastq m6, [tlq+hq*2+2]
    movsldup m7, [base+ipred_hv_shuf]
    movshdup m9, [base+ipred_hv_shuf]
    vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4]
    punpcklwd m6, m0 ; top, bottom
    punpcklqdq m8, m9, m9
    punpckhqdq m9, m9
    lea r3, [strideq*3]
.w4_loop:
    ; four left pixels and four rows of vertical weights (16 bytes) per pass
    vpbroadcastq m3, [tlq+hq*2-8]
    vbroadcasti128 m1, [v_weightsq]
    pshufb m3, m7
    punpcklwd m2, m3, m4 ; left, right
    punpckhwd m3, m4
    pmaddwd m2, m10
    pmaddwd m3, m10
    pshufb m0, m1, m8
    pshufb m1, m9
    SMOOTH_2D_END 0, 1, 6, 6, 2, 3
    vextracti128 xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3 ], xm1
    lea dstq, [dstq+strideq*4]
    add v_weightsq, 16
    sub hd, 4
    jg .w4_loop
    RET
; ---- w == 8: two rows per iteration ---------------------------------------
.w8:
    WIN64_SPILL_XMM 12
    vpbroadcastw m0, [tlq] ; bottom
    vbroadcasti128 m7, [tlq+hq*2+2]
    movsldup m8, [base+ipred_hv_shuf]
    movshdup m9, [base+ipred_hv_shuf]
    vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
    vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
    punpcklwd m6, m7, m0 ; top, bottom
    punpckhwd m7, m0
.w8_loop:
    vpbroadcastd m3, [tlq+hq*2-4]
    vpbroadcastq m1, [v_weightsq]
    pshufb m3, m8
    punpcklwd m2, m3, m4 ; left, right
    punpckhwd m3, m4
    pmaddwd m2, m10
    pmaddwd m3, m11
    pshufb m1, m9
    SMOOTH_2D_END 1, 1, 6, 7, 2, 3
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    add v_weightsq, 8
    sub hd, 2
    jg .w8_loop
    RET
; ---- w == 16: two rows per iteration, one full ymm per row ----------------
.w16:
    WIN64_SPILL_XMM 11
    vpbroadcastw m0, [tlq] ; bottom
    movu m7, [tlq+hq*2+2]
    ; m8/m9 hold the 16 column weights arranged to match the lane order that
    ; punpcklwd/punpckhwd give to m6/m7 below
    mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
    mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
    vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
    vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
    punpcklwd m6, m7, m0 ; top, bottom
    punpckhwd m7, m0
.w16_loop:
    vpbroadcastd m3, [tlq+hq*2-4]
    vpbroadcastd m1, [v_weightsq+0]
    punpcklwd m3, m4 ; left, right
    pshufd m2, m3, q1111
    pmaddwd m10, m8, m2
    pmaddwd m2, m9
    pshufd m3, m3, q0000
    SMOOTH_2D_END 1, 1, 6, 7, 10, 2
    vpbroadcastd m1, [v_weightsq+4]
    pmaddwd m2, m8, m3
    pmaddwd m3, m9
    mova [dstq+strideq*0], m0
    SMOOTH_2D_END 1, 1, 6, 7, 2, 3
    mova [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    add v_weightsq, 8
    sub hq, 2
    jg .w16_loop
    RET
; ---- w == 32: one row per iteration, two ymm stores -----------------------
.w32:
    WIN64_SPILL_XMM 15
    vpbroadcastw m0, [tlq] ; bottom
    movu m7, [tlq+hq*2+ 2]
    movu m9, [tlq+hq*2+34]
    mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
    mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
    vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
    vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
    mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
    mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
    vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
    vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
    punpcklwd m6, m7, m0
    punpckhwd m7, m0
    punpcklwd m8, m9, m0
    punpckhwd m9, m0
.w32_loop:
    vpbroadcastw m3, [tlq+hq*2-2]
    vpbroadcastd m14, [v_weightsq]
    punpcklwd m3, m4
    pmaddwd m1, m10, m3
    pmaddwd m2, m11, m3
    pmaddwd m0, m6, m14
    paddd m0, m1
    pmaddwd m1, m7, m14
    paddd m1, m2
    pmaddwd m2, m12, m3
    pmaddwd m3, m13
    ; first 16 pixels: same tail as SMOOTH_2D_END, written out inline
    psrld m0, 8
    psrld m1, 8
    packssdw m0, m1
    pavgw m0, m5
    mova [dstq+32*0], m0
    SMOOTH_2D_END 14, 14, 8, 9, 2, 3
    mova [dstq+32*1], m0
    add dstq, strideq
    add v_weightsq, 4
    dec hd
    jg .w32_loop
    RET
; ---- w == 64: processed as two 32-pixel-wide column strips ----------------
.w64:
    ; Re-declare the register names: r3 becomes tl_base (was w), r6 is named
    ; "dummy" but is still the table pointer that `base` is defined against,
    ; and r7..r10 become v_weights_base, x, y and dst_base.
    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
    mov dst_baseq, dstq
    mov tl_baseq, tlq
    mov v_weights_baseq, v_weightsq
    xor xq, xq
.w64_loop_x:
    mov yq, hq
    lea tlq, [tl_baseq+hq*2]
    vpbroadcastw m0, [tl_baseq] ; bottom
    movu m7, [tlq+xq*2+ 2]
    movu m9, [tlq+xq*2+34]
    mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
    mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
    vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
    vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
    mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
    mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
    vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
    vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
    punpcklwd m6, m7, m0
    punpckhwd m7, m0
    punpcklwd m8, m9, m0
    punpckhwd m9, m0
    lea tlq, [tl_baseq-2]
.w64_loop_y:
    vpbroadcastw m3, [tlq+yq*2]
    vpbroadcastd m1, [v_weightsq]
    punpcklwd m3, m4
    pmaddwd m14, m10, m3
    pmaddwd m15, m11, m3
    pmaddwd m2, m12, m3
    pmaddwd m3, m13
    pmaddwd m0, m6, m1
    paddd m0, m14
    pmaddwd m14, m7, m1
    paddd m14, m15
    psrld m0, 8
    psrld m14, 8
    packssdw m0, m14
    pavgw m0, m5
    mova [dstq+32*0], m0
    SMOOTH_2D_END 8, 9, 1, 1, 2, 3
    mova [dstq+32*1], m0
    add dstq, strideq
    add v_weightsq, 4
    dec yq
    jg .w64_loop_y
    ; Next strip: dst moves 32 pixels (64 bytes) to the right, and the table
    ; pointer moves by 128 bytes (32 entries * 4 bytes) so the unchanged
    ; `base+...+64*4+16*k` operands above fetch the next 32 column weights.
    lea dstq, [dst_baseq+32*2]
    add r6, 16*8
    mov v_weightsq, v_weights_baseq
    add xq, 32
    ; x takes the values 32 and then 64 here, so the strip loop runs twice
    test xb, 64
    jz .w64_loop_x
    RET

; ipred_z1_16bpc(dst, stride, tl, w, h, angle, ...)
; Directional intra predictor for 16-bit pixels (AVX2) that reads only from
; the top edge. Setup, as established by the instructions below:
;   tlq    += 2 bytes (one pixel past the pointer passed in)
;   dxd     = word from dr_intra_derivative indexed by (angle & 0x7e)
;   angled  = (angle + 165) ^ 0x4ff; per the existing comments the low byte is
;             then d = 90 - angle, bit 0x400 is tested as
;             "!enable_intra_edge_filter" and bits 8+ carry "is_sm << 1"
;   m5      = broadcast of pw_62 (used as the mask for the fractional position)
; The width branch is selected through the jump table indexed by tzcnt(w).
; NOTE(review): pixel_max is read from r8m in the upsample paths.
cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    lea r6, [ipred_z1_16bpc_avx2_table]
    tzcnt wd, wm
    movifnidn angled, anglem
    movifnidn hd, hm
    lea r7, [dr_intra_derivative]
    movsxd wq, [r6+wq*4]
    add tlq, 2
    add wq, r6
    mov dxd, angled
    and dxd, 0x7e
    add angled, 165 ; ~90
    movzx dxd, word [r7+dxq]
    xor angled, 0x4ff ; d = 90 - angle
    vpbroadcastd m5, [pw_62]
    jmp wq
; ---- w == 4 ---------------------------------------------------------------
.w4:
    ALLOC_STACK -64, 7
    cmp angleb, 40
    jae .w4_no_upsample
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    ; upsample path: xm3 = last edge pixel broadcast, xm1 = edge pixels 1..8
    vpbroadcastw xm3, [tlq+14]
    movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
; (ipred_z1_16bpc, .w4 upsample path, continued: xm1 holds edge pixels 1..8
;  and xm3 a broadcast of the word at [tlq+14], both loaded just above.)
; The edge is upsampled 2x: new in-between samples come from the 4-tap kernel
; described in the comments below, are clamped to [0, pixel_max], interleaved
; with the original pixels and stored at [rsp]; dx is doubled to match.
    palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
    paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
    add dxd, dxd
    palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
    paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d
    psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
    psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
    pxor xm4, xm4
    paddw xm2, xm0
    vpbroadcastw xm0, r8m ; pixel_max
    mova [rsp+32], xm3
    movd xm3, dxd
    pmaxsw xm2, xm4
    mov r3d, dxd
    pavgw xm2, xm4
    vpbroadcastw m3, xm3
    pminsw xm2, xm0
    punpcklwd xm0, xm1, xm2
    punpckhwd xm1, xm2
    lea r5, [strideq*3]
    pslldq m2, m3, 8
    mova [rsp+ 0], xm0
    mova [rsp+16], xm1
    paddw m6, m3, m3
    paddw m3, m2
    vpblendd m4, m6, 0xf0
    paddw m6, m6
    paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3
    vbroadcasti128 m4, [z_upsample]
.w4_upsample_loop:
    ; r3/r2 alternate as the running xpos; xpos >> 6 is the integer base index
    lea r2d, [r3+dxq]
    shr r3d, 6 ; base0
    movu xm1, [rsp+r3*2]
    lea r3d, [r2+dxq]
    shr r2d, 6 ; base1
    movu xm2, [rsp+r2*2]
    lea r2d, [r3+dxq]
    shr r3d, 6 ; base2
    vinserti128 m1, [rsp+r3*2], 1 ; 0 2
    lea r3d, [r2+dxq]
    shr r2d, 6 ; base3
    vinserti128 m2, [rsp+r2*2], 1 ; 1 3
    pshufb m1, m4
    pshufb m2, m4
    punpcklqdq m0, m1, m2
    punpckhqdq m1, m2
    pand m2, m5, m3 ; frac
    psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6
    psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6)
    pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
    paddw m3, m6 ; xpos += dx
    paddw m0, m1
    vextracti128 xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r5 ], xm1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w4_upsample_loop
    RET
; Local subroutine (entered with `call`, leaves with plain `ret`).
; In:  maxbased = size key, compared bytewise against the z_filter_wh table
;      angled   = low byte is the angle delta; bits 8+ select a z_filter_t0 row
; Out: r5d = byte mask; callers popcnt it to obtain the filter strength
; Clobbers m0-m2 and r3 (left pointing at z_filter_t0, which `base` relies on
; from here on) and shifts angled right by 8.
ALIGN function_align
.filter_strength: ; w4/w8/w16
%define base r3-z_filter_t0
    movd xm0, maxbased
    lea r3, [z_filter_t0]
    movd xm1, angled
    shr angled, 8 ; is_sm << 1
    vpbroadcastb m0, xm0
    vpbroadcastb m1, xm1
    pcmpeqb m0, [base+z_filter_wh]
    mova xm2, [r3+angleq*8]
    pand m0, m1
    pcmpgtb m0, m2
    pmovmskb r5d, m0
    ret
.w4_no_upsample:
    mov maxbased, 7
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    lea maxbased, [hq+3]
    call .filter_strength
    mov maxbased, 7
    test r5d, r5d
    jz .w4_main ; filter_strength == 0
    popcnt r5d, r5d
    ; z_filter_k is indexed by strength (r5 = 1..3); the +12*n term selects
    ; the n-th coefficient row
    vpbroadcastw xm3, [tlq+14]
    mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7
    vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
    palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8
    pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8
    paddw xm2, xm0
    pmullw xm2, xm4
    movd [rsp+16], xm3
    cmp r5d, 3
    jne .w4_3tap
    ; strength 3: add the outer taps and compute one extra edge pixel
    paddw xm1, xm2
    palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8
    pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
    movzx r3d, word [tlq+14]
    movzx r2d, word [tlq+12]
    inc maxbased
    paddw xm2, xm0
    sub r2d, r3d
    paddw xm2, xm2
    lea r2d, [r2+r3*8+4]
    shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
    mov [rsp+16], r2w
.w4_3tap:
    pxor xm0, xm0
    paddw xm1, xm2
    mov tlq, rsp ; from here on the edge is read from the filtered stack copy
    psrlw xm1, 3
    cmp hd, 8
    sbb maxbased, -1 ; maxbase += 1 - CF, i.e. +1 unless h < 8
    pavgw xm0, xm1
    mova [tlq], xm0
.w4_main:
    ; m3 tracks (xpos - max_base_x*64) for four rows, so its sign bit marks
    ; lanes that are still inside the edge; m6 is the replacement pixel used
    ; for lanes that are not.
    movd xm3, dxd
    vpbroadcastq m1, [z_base_inc]
    vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x]
    shl maxbased, 6
    vpbroadcastw m3, xm3
    movd xm0, maxbased
    mov r3d, dxd ; xpos
    vpbroadcastw m0, xm0
    paddw m4, m3, m3
    psubw m1, m0 ; -max_base_x
    vpblendd m3, m4, 0xcc
    paddw m0, m4, m3
    vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
    paddw m4, m4
    paddw m3, m1
.w4_loop:
    lea r5d, [r3+dxq]
    shr r3d, 6 ; base0
    movu xm1, [tlq+r3*2]
    lea r3d, [r5+dxq]
    shr r5d, 6 ; base1
    movu xm2, [tlq+r5*2]
    lea r5d, [r3+dxq]
    shr r3d, 6 ; base2
    vinserti128 m1, [tlq+r3*2], 1 ; 0 2
    lea r3d, [r5+dxq]
    shr r5d, 6 ; base3
    vinserti128 m2, [tlq+r5*2], 1 ; 1 3
    punpcklqdq m0, m1, m2
    psrldq m1, 2
    pslldq m2, 6
    vpblendd m1, m2, 0xcc
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    psraw m2, m3, 15 ; xpos < max_base_x
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    vextracti128 xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    sub hd, 4
    jz .w4_end
    lea dstq, [dstq+strideq*2]
    cmp r3d, maxbased
    jb .w4_loop
    ; xpos has reached max_base_x: all remaining rows are filled with m6
    lea r6, [strideq*3]
.w4_end_loop:
    movq [dstq+strideq*0], xm6
    movq [dstq+strideq*1], xm6
    movq [dstq+strideq*2], xm6
    movq [dstq+r6 ], xm6
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w4_end_loop
.w4_end:
    RET
; ---- w == 8 ---------------------------------------------------------------
.w8:
    ALLOC_STACK -64, 7
    lea r3d, [angleq+216]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _
    movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _
    movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    cmp hd, 4
    jne .w8_upsample_h8 ; awkward single-pixel edge case
    vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _
.w8_upsample_h8:
    ; same 2x edge upsampling as in .w4, on a full ymm; result goes to [rsp]
    paddw m2, m1
    paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    add dxd, dxd
    psubw m0, m2, m0
    psraw m0, 3
    pxor m4, m4
    paddw m2, m0
    vpbroadcastw m0, r8m
    movd xm3, dxd
    pmaxsw m2, m4
    mov r3d, dxd
    pavgw m2, m4
    vpbroadcastw m3, xm3
    pminsw m2, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    vbroadcasti128 m4, [z_upsample]
    mova [rsp+ 0], xm0
    mova [rsp+16], xm1
    paddw m6, m3, m3
    vextracti128 [rsp+32], m0, 1
    vextracti128 [rsp+48], m1, 1
    vpblendd m3, m6, 0xf0 ; xpos0 xpos1
.w8_upsample_loop:
    lea r2d, [r3+dxq]
    shr r3d, 6 ; base0
    movu xm1, [rsp+r3*2]
    movu xm2, [rsp+r3*2+16]
    lea r3d, [r2+dxq]
    shr r2d, 6 ; base1
    vinserti128 m1, [rsp+r2*2], 1
    vinserti128 m2, [rsp+r2*2+16], 1
    pshufb m1, m4
    pshufb m2, m4
    punpcklqdq m0, m1, m2
    punpckhqdq m1, m2
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m3, m6
    paddw m0, m1
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8_upsample_loop
    RET
.w8_no_intra_edge_filter:
    and maxbased, 7
    or maxbased, 8 ; imin(h+7, 15)
    jmp .w8_main
.w8_no_upsample:
    lea maxbased, [hq+7]
    test angled, 0x400
    jnz .w8_no_intra_edge_filter
    call .filter_strength
    test r5d, r5d
    jz .w8_main
    popcnt r5d, r5d
    vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
    mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    pmullw m1, m2
    cmp hd, 8
    jl .w8_filter_h4
    punpckhwd m2, m2
    vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
    ; flags are still those of `cmp hd, 8` (the SIMD ops above leave them alone)
    je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
    movzx r3d, word [tlq+30]
    mov maxbased, 16
    mov [rsp+32], r3d
    cmp r5d, 3
    jne .w8_filter_end
    punpcklwd xm6, xm0, xm0
    vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g
    vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    movzx r5d, word [tlq+28]
    mov [rsp+34], r3w
    paddw m2, m6
    sub r5d, r3d
    inc maxbased
    paddw m2, m2
    lea r3d, [r5+r3*8+4]
    paddw m1, m2
    shr r3d, 3
    mov [rsp+32], r3w
    jmp .w8_filter_end
.w8_filter_h4:
    pshuflw m3, m2, q3321
    vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _
.w8_filter_end:
    paddw m0, m3
    pmullw m0, m4
    mov tlq, rsp ; from here on the edge is read from the filtered stack copy
    pxor m2, m2
    paddw m0, m1
    psrlw m0, 3
    pavgw m0, m2
    mova [tlq], m0
.w8_main:
    ; same scheme as .w4_main, two rows per iteration
    movd xm3, dxd
    vbroadcasti128 m1, [z_base_inc]
    vpbroadcastw m6, [tlq+maxbaseq*2]
    shl maxbased, 6
    vpbroadcastw m3, xm3
    movd xm0, maxbased
    mov r3d, dxd
    vpbroadcastw m0, xm0
    paddw m4, m3, m3
    psubw m1, m0
    vpblendd m3, m4, 0xf0 ; xpos0 xpos1
    paddw m3, m1
.w8_loop:
    lea r5d, [r3+dxq]
    shr r3d, 6
    movu xm0, [tlq+r3*2]
    movu xm1, [tlq+r3*2+2]
    lea r3d, [r5+dxq]
    shr r5d, 6
    vinserti128 m0, [tlq+r5*2], 1
    vinserti128 m1, [tlq+r5*2+2], 1
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    psraw m2, m3, 15
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub hd, 2
    jz .w8_end
    lea dstq, [dstq+strideq*2]
    cmp r3d, maxbased
    jb .w8_loop
.w8_end_loop:
    mova [dstq+strideq*0], xm6
    mova [dstq+strideq*1], xm6
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8_end_loop
.w8_end:
    RET
.w16_no_intra_edge_filter:
    and maxbased, 15
    or maxbased, 16 ; imin(h+15, 31)
    jmp .w16_main
; ---- w == 16 (no upsample path) -------------------------------------------
.w16:
    ALLOC_STACK -96, 7
    lea maxbased, [hq+15]
    test angled, 0x400
    jnz .w16_no_intra_edge_filter
    call .filter_strength
    test r5d, r5d
    jz .w16_main
    popcnt r5d, r5d
    mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    cmp r5d, 3
    jne .w16_filter_3tap
    ; strength 3 (5-tap): first 16 output pixels into m0
    vpbroadcastd m2, [base+pw_3]
    punpcklwd xm0, xm0
    vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    paddw m0, m2
    pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
    paddw m0, m1
    psrlw m0, 2
    ; second half of the edge; how much of it exists depends on h
    movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    cmp hd, 8
    jl .w16_filter_5tap_h4
    punpckhwd m3, m3
    je .w16_filter_5tap_h8
    vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
    vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
    movzx r3d, word [tlq+62]
    movzx r2d, word [tlq+60]
    pavgw m2, m4
    sub r2d, r3d
    paddw m1, m3
    lea r2d, [r2+r3*8+4]
    paddw m1, m2
    shr r2d, 3
    psrlw m1, 2
    mov [rsp+66], r3w
    mov [rsp+64], r2w
    mov tlq, rsp
    mov r3d, 33
    cmp hd, 16
    cmovg maxbased, r3d
    jmp .w16_filter_end2
.w16_filter_5tap_h8:
    vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
    vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
    pavgw xm2, xm4
    paddw xm1, xm3
    paddw xm1, xm2
    psrlw xm1, 2
    jmp .w16_filter_end2
.w16_filter_5tap_h4:
    pshuflw xm4, xm3, q3332 ; 4 5 5 5
    pshuflw xm3, xm3, q3321 ; 3 4 5 5
    pavgw xm2, xm4
    paddw xm1, xm3
    paddw xm1, xm2
    psrlw xm1, 2
    jmp .w16_filter_end2
.w16_filter_3tap:
    vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
    pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    pmullw m1, m4
    pmullw m3, m2
    paddw m0, m1
    cmp hd, 8
    je .w16_filter_3tap_h8
    jl .w16_filter_3tap_h4
    punpckhwd m2, m2
    vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
    jmp .w16_filter_end
.w16_filter_3tap_h4:
    pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _
    jmp .w16_filter_end
.w16_filter_3tap_h8:
    psrldq xm2, 2
    pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8
.w16_filter_end:
    paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    pmullw m2, m4
    psrlw m0, 3
    pxor m1, m1
    paddw m2, m3
    psrlw m2, 3
    pavgw m0, m1
    pavgw m1, m2
.w16_filter_end2:
    ; store both filtered halves and read the edge from the stack from now on
    mov tlq, rsp
    mova [tlq+ 0], m0
    mova [tlq+32], m1
.w16_main:
    movd xm4, dxd
    vpbroadcastw m6, [tlq+maxbaseq*2]
    shl maxbased, 6
    vpbroadcastw m4, xm4
    movd xm0, maxbased
    mov r3d, dxd
    vpbroadcastw m0, xm0
    paddw m3, m4, [z_base_inc]
    psubw m3, m0
.w16_loop:
    ; two rows per iteration, software-pipelined: the loads for the second row
    ; are issued while the first row is being blended and stored
    lea r5d, [r3+dxq]
    shr r3d, 6
    movu m0, [tlq+r3*2]
    movu m1, [tlq+r3*2+2]
    lea r3d, [r5+dxq]
    shr r5d, 6
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    psraw m2, m3, 15
    paddw m3, m4
    paddw m1, m0
    movu m0, [tlq+r5*2]
    vpblendvb m2, m6, m1, m2
    movu m1, [tlq+r5*2+2]
    mova [dstq+strideq*0], m2
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    psraw m2, m3, 15
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    mova [dstq+strideq*1], m0
    sub hd, 2
    jz .w16_end
    lea dstq, [dstq+strideq*2]
    cmp r3d, maxbased
    jb .w16_loop
.w16_end_loop:
    mova [dstq+strideq*0], m6
    mova [dstq+strideq*1], m6
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_end_loop
.w16_end:
    RET
; ---- w == 32: the edge filter, when enabled, is always the 5-tap form -----
.w32:
    ALLOC_STACK -160, 8
    lea maxbased, [hq+31]
    mov r3d, 63
    cmp hd, 32
    cmova maxbased, r3d ; maxbase = h > 32 ? 63 : h + 31
    test angled, 0x400
    jnz .w32_main
    vpbroadcastd m2, [pw_3]
    mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    punpcklwd xm1, xm0, xm0
    vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    paddw m1, m2
    paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
    mov r3, rsp
    paddw m0, m1
    lea r5d, [maxbaseq-31]
    psrlw m0, 2
    mova [r3], m0
.w32_filter_loop:
    ; filter 16 more edge pixels per pass into the stack buffer at r3
    mova m0, [tlq+30]
    paddw m1, m2, [tlq+28]
    add tlq, 32
    paddw m0, [tlq+0]
    pavgw m1, [tlq+4]
    paddw m0, [tlq+2]
    add r3, 32
    paddw m0, m1
    psrlw m0, 2
    mova [r3], m0
    sub r5d, 16
    jg .w32_filter_loop
    movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    punpckhwd m1, m0, m0
    paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    ; flags are still those of `sub r5d, 16` (the SIMD ops leave them alone)
    jl .w32_filter_h8
    vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
    vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
    movzx r5d, word [tlq+62]
    movzx r2d, word [tlq+60]
    pavgw m2, m3
    sub r2d, r5d
    paddw m0, m1
    lea r2d, [r2+r5*8+4]
    paddw m0, m2
    shr r2d, 3
    psrlw m0, 2
    mova [r3+32], m0
    mov [r3+66], r5w
    mov [r3+64], r2w
    mov tlq, rsp
    mov r3d, 65
    cmp hd, 64
    cmove maxbased, r3d
    jmp .w32_main
.w32_filter_h8:
    vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
    vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9
    pavgw xm2, xm3
    paddw xm0, xm1
    mov tlq, rsp
    paddw xm0, xm2
    psrlw xm0, 2
    mova [r3+32], xm0
.w32_main:
    movd xm4, dxd
    vpbroadcastw m6, [tlq+maxbaseq*2]
    shl maxbased, 6
    vpbroadcastw m4, xm4
    movd xm0, maxbased
    mov r5d, dxd
    vpbroadcastd m7, [pw_m1024] ; -16 * 64
    vpbroadcastw m0, xm0
    paddw m3, m4, [z_base_inc]
    psubw m3, m0
.w32_loop:
    ; one row per iteration; m3 describes pixels 0..15, and pixels 16..31 are
    ; inside the edge when m3 < -16*64 (compare against m7)
    mov r3d, r5d
    shr r3d, 6
    movu m0, [tlq+r3*2]
    movu m1, [tlq+r3*2+2]
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    psraw m1, m3, 15
    vpblendvb m0, m6, m0, m1
    mova [dstq+32*0], m0
    movu m0, [tlq+r3*2+32]
    movu m1, [tlq+r3*2+34]
    add r5d, dxd
    psubw m1, m0
    pmulhrsw m1, m2
    pcmpgtw m2, m7, m3
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    mova [dstq+32*1], m0
    dec hd
    jz .w32_end
    add dstq, strideq
    cmp r5d, maxbased
    jb .w32_loop
.w32_end_loop:
    mova [dstq+32*0], m6
    mova [dstq+32*1], m6
    add dstq, strideq
    dec hd
    jg .w32_end_loop
.w32_end:
    RET
; ---- w == 64 --------------------------------------------------------------
.w64:
    ALLOC_STACK -256, 10
    lea maxbased, [hq+63]
    test angled, 0x400
    jnz .w64_main
    vpbroadcastd m2, [pw_3]
    mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    punpcklwd xm1, xm0, xm0
    vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    paddw m1, m2
    paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i
    mov r3, rsp
    paddw m0, m1
    lea r5d, [hq+32]
    psrlw m0, 2
    mova [r3], m0
.w64_filter_loop:
    mova m0, [tlq+30]
    paddw m1, m2, [tlq+28]
    add tlq, 32
    paddw m0, [tlq+0]
    pavgw m1, [tlq+4]
    paddw m0, [tlq+2]
    add r3, 32
    paddw m0, m1
    psrlw m0, 2
    mova [r3], m0
    sub r5d, 16
    jg .w64_filter_loop
    movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    punpckhwd m1, m0, m0
    paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h
    vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h
    pavgw m2, m3
    paddw m0, m1
    paddw m0, m2
    mov tlq, rsp
    psrlw m0, 2
    mova [r3+32], m0
.w64_main:
    movd xm4, dxd
    vpbroadcastw m6, [tlq+maxbaseq*2]
    shl maxbased, 6
    vpbroadcastw m4, xm4
    movd xm0, maxbased
    mov r5d, dxd
    vpbroadcastd m7, [pw_m1024] ; -16 * 64
    vpbroadcastw m0, xm0
    paddw m3, m4, [z_base_inc]
    paddw m8, m7, m7 ; -32 * 64
    psubw m3, m0
    paddw m9, m8, m7 ; -48 * 64
.w64_loop:
    ; one row per iteration in four 16-pixel groups; groups 1..3 reuse m3 and
    ; compare it against m7/m8/m9 instead of keeping separate position vectors
    mov r3d, r5d
    shr r3d, 6
    movu m0, [tlq+r3*2]
    movu m1, [tlq+r3*2+2]
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    psraw m1, m3, 15
    vpblendvb m0, m6, m0, m1
    mova [dstq+32*0], m0
    movu m0, [tlq+r3*2+32]
    movu m1, [tlq+r3*2+34]
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    pcmpgtw m1, m7, m3
    vpblendvb m0, m6, m0, m1
    mova [dstq+32*1], m0
    movu m0, [tlq+r3*2+64]
    movu m1, [tlq+r3*2+66]
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    pcmpgtw m1, m8, m3
    vpblendvb m0, m6, m0, m1
    mova [dstq+32*2], m0
    movu m0, [tlq+r3*2+96]
    movu m1, [tlq+r3*2+98]
    add r5d, dxd
    psubw m1, m0
    pmulhrsw m1, m2
    pcmpgtw m2, m9, m3
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    mova [dstq+32*3], m0
    dec hd
    jz .w64_end
    add dstq, strideq
    cmp r5d, maxbased
    jb .w64_loop
.w64_end_loop:
    mova [dstq+32*0], m6
    mova [dstq+32*1], m6
    mova [dstq+32*2], m6
    mova [dstq+32*3], m6
    add dstq, strideq
    dec hd
    jg .w64_end_loop
.w64_end:
    RET

; NOTE(review): ipred_z2_16bpc continues beyond this span; only its opening
; instructions appear here and they are reproduced unchanged.
cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
    lea r9, [ipred_z2_16bpc_avx2_table]
    tzcnt wd, wm
    movifnidn angled, anglem
    movifnidn hd, hm
    lea dxq, [dr_intra_derivative-90]
    movsxd wq, [r9+wq*4]
    mova m1, [tlq- 0]
    movzx dyd, angleb
    xor angled, 0x400
    mova m2, [tlq- 32]
    mov r8, dxq
    sub dxq, dyq
    mova m3, [tlq- 64]
    add wq, r9
    add r9, z_filter_t0-ipred_z2_16bpc_avx2_table
    mova m4, [tlq- 96]
    and dyd, ~1
    mova m5, [tlq-128]
    and dxq, ~1
    movzx dyd, word [r8+dyq] ; angle - 90
    movzx dxd, word [dxq+270] ; 180 - angle
vpbroadcastd m11, [base+pw_62] mova [rsp+128], m1 mova [rsp+ 96], m2 mova [rsp+ 64], m3 neg dxd mova [rsp+ 32], m4 neg dyq mova [rsp+ 0], m5 jmp wq .w4: vbroadcasti128 m10, [base+z2_x_shuf] vpbroadcastq m6, [base+z_base_inc+2] lea r8d, [dxq+(65<<6)] ; xpos mov r10d, (63-4)<<6 test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) movq xm0, [tlq+2] ; 1 2 3 4 movq xm1, [tlq+0] ; 0 1 2 3 pshuflw xm2, xm0, q3321 ; 2 3 4 4 pshuflw xm3, xm1, q2100 ; 0 0 1 2 vpbroadcastw xm4, r8m ; pixel_max vbroadcasti128 m10, [base+z_upsample] paddw xm1, xm0 paddw xm2, xm3 lea r8d, [r8+dxq+(1<<6)] psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 pxor xm3, xm3 sub r10d, 3<<6 paddw xm1, xm2 paddw m6, m6 pmaxsw xm1, xm3 sub angled, 1075 ; angle - 53 pavgw xm1, xm3 lea r3d, [hq+3] pminsw xm1, xm4 xor angled, 0x7f ; 180 - angle punpcklwd xm1, xm0 movu [rsp+130], xm1 call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_left: ; h4/h8 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 vpbroadcastw xm4, r8m ; pixel_max cmp hd, 8 je .upsample_left_h8 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 jmp .upsample_left_end .upsample_left_h8: pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 .upsample_left_end: paddw xm1, xm0 paddw xm2, xm3 psubw xm2, xm1, xm2 add dyq, dyq psraw xm2, 3 pxor xm3, xm3 paddw xm1, xm2 pmaxsw xm1, xm3 pavgw xm1, xm3 pminsw xm1, xm4 punpcklwd xm2, xm0, xm1 punpckhwd xm0, xm1 mova [rsp+ 96+gprsize], xm2 mova [rsp+112+gprsize], xm0 ret .w4_no_upsample_above: lea 
r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] psrldq xm0, xm1, 2 ; 1 2 3 4 pshuflw xm2, xm1, q2100 ; 0 0 1 2 pmullw xm4, xm0 pshuflw xm3, xm0, q3321 ; 2 3 4 4 paddw xm1, xm3 pshuflw xm3, xm0, q3332 ; 3 4 4 4 pmullw xm1, xm5 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm2, xm5 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 ; clip to byte range since there's no variable word blend pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movq [rsp+130], xm1 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height cmp r3d, 3 je .w4_filter_left_s3 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w4_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w4_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w4_filter_left_end .w4_upsample_left: call .upsample_left mov r11, -16 vbroadcasti128 m9, [base+z_upsample] jmp .w4_main_upsample_left .w4_filter_left_s3: ; can only be h16 movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m4, [base+pw_3] paddw m1, m0, m2 punpckhwd m2, m2 vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g punpcklwd xm3, xm0, xm0 paddw m2, m4 vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c 
d e vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 jmp .w4_filter_left_end2 .w4_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w4_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, m2 .w4_filter_left_end2: packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 .w4_main: vbroadcasti128 m9, [base+z2_x_shuf] mov r11, -8 .w4_main_upsample_left: movd xm5, dyd mova m4, [base+z2_y_shuf_h4] mov r2d, r8d movd xm0, dxd vpbroadcastw m5, xm5 rorx r5, dyq, 5 lea r8d, [dyq*3] pmullw m5, [base+z2_ymul] rorx r9, dyq, 4 sar dyd, 6 vpbroadcastw m0, xm0 sar r8d, 6 pand m5, m11 ; frac_y neg dyd psllw m5, 9 add r5d, dyd add r8d, dyd add r9d, dyd paddw m7, m0, m0 lea dyq, [rsp+dyq*2+126] vpblendd m0, m7, 0xcc add dyq, r11 neg r5d paddw m1, m0, m7 neg r8d vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 neg r9d paddw m7, m7 paddw m6, m0 .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm1, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm3, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 vinserti128 m1, [rsp+r2*2], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m3, [rsp+r3*2], 1 pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 pand m2, m11, m6 punpcklqdq m0, m1, m3 punpckhqdq m1, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w4_toponly movu xm2, [dyq] vinserti128 m2, [dyq+r8*2], 1 movu xm3, [dyq+r5*2] vinserti128 m3, [dyq+r9*2], 1 pshufb m2, m9 pshufb m3, m9 punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m2, m3 psubw m2, m1 pmulhrsw m2, m5 psraw m3, m6, 15 ; base_x < topleft paddw m1, m2 vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 vpblendvb m0, m1, m3 .w4_toponly: paddw m6, m7 ; xpos += dx lea r3, [strideq*3] add dyq, r11 vextracti128 xm1, m0, 1 movq 
[dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r10d jge .w4_loop .w4_leftonly_loop: movu xm1, [dyq] vinserti128 m1, [dyq+r8*2], 1 movu xm2, [dyq+r5*2] vinserti128 m2, [dyq+r9*2], 1 add dyq, r11 pshufb m1, m9 pshufb m2, m9 punpckhwd m0, m1, m2 punpcklwd m1, m2 psubw m1, m0 pmulhrsw m1, m5 paddw m0, m1 vpermd m0, m4, m0 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: mov r10d, hd test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] xor r8d, r8d mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastw xm4, r8m ; pixel_max paddw xm1, xm0 paddw xm2, xm3 not r8d psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 sub angled, 53 ; angle - 53 pxor xm3, xm3 paddw xm2, xm1 lea r3d, [hq+7] pmaxsw xm2, xm3 xor angled, 0x7f ; 180 - angle pavgw xm2, xm3 pminsw xm2, xm4 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 movu [rsp+130], xm1 movu [rsp+146], xm2 call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x pmullw xm4, xm0 pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x paddw xm1, xm3 vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm1, xm5 pmullw xm2, xm6 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 
psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movu [rsp+130], xm1 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d cmp r3d, 3 jne .w8_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 16 ; flags needed for later jmp .filter_left_s3b .w8_upsample_left: call .upsample_left vbroadcasti128 m7, [base+z2_y_shuf_us] lea r11, [rsp+118] mov r8, -8 jmp .w8_main_upsample_left .w16_filter_left_s12: xor r8d, r8d .w8_filter_left_s12: mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w8_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w8_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w8_filter_left_end .w8_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w8_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, m2 packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 test r8d, r8d jz .w8_main ; upsample_main vbroadcasti128 m10, [base+z_upsample] vbroadcasti128 m7, [base+z2_y_shuf] lea r5, [rsp+120] movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq paddw m4, m4 pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 lea r2d, [dxq+(66<<6)] ; xpos paddw m4, m2 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 pand m6, m11 punpckhwd xm9, xm8, xm1 psllw m6, 9 punpcklwd xm8, 
xm1 .w8_upsample_above_loop: lea r3d, [r2+dxq] shr r2d, 6 movu xm1, [rsp+r2*2] movu xm2, [rsp+r2*2+16] lea r2d, [r3+dxq] shr r3d, 6 vinserti128 m1, [rsp+r3*2], 1 vinserti128 m2, [rsp+r3*2+16], 1 pshufb m1, m10 pshufb m2, m10 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 punpckhqdq m1, m2 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_upsample_above_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 pshufb m2, m7 punpckldq m1, m2, m3 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 vpblendvb m0, m1, m2 .w8_upsample_above_toponly: paddw m4, m5 sub r5, 4 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_ret lea dstq, [dstq+strideq*2] jmp .w8_upsample_above_loop .w8_main: vbroadcasti128 m7, [base+z2_y_shuf] lea r11, [rsp+120] mov r8, -4 .w8_main_upsample_left: movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 ; xpos0 xpos1 lea r9d, [dxq+(65<<6)] ; xpos paddw m4, m2 movd [rsp+284], xm1 .w8_loop0: mov r2d, r9d mova [rsp+288], m0 mov r5, r11 mova [rsp+320], m4 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 ; base_y pand m6, m11 ; frac_y punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 psllw m6, 9 punpcklwd xm8, xm1 ; base_y 0 1 4 5 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2*2] movu xm1, [rsp+r2*2+2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3*2], 1 vinserti128 m1, [rsp+r3*2+2], 1 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 
f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w8_toponly: paddw m4, m5 ; xpos += dx add r5, r8 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r2d, (63-8)<<6 jge .w8_loop .w8_leftonly_loop: mova m0, m5 vpgatherdq m4, [r5+xm9*2], m5 mova m5, m0 vpgatherdq m3, [r5+xm8*2], m0 add r5, r8 pshufb m2, m4, m7 pshufb m1, m3, m7 punpckldq m0, m1, m2 punpckhdq m1, m2 psubw m1, m0 pmulhrsw m1, m6 paddw m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_leftonly_loop .w8_end: sub r10d, 1<<8 jl .w8_ret vpbroadcastd m0, [rsp+284] add r7, 16 paddw m0, [rsp+288] ; base_y += 8*dy add r9d, 8<<6 vpbroadcastd m4, [pw_512] movzx hd, r10b paddw m4, [rsp+320] ; base_x += 8*64 mov dstq, r7 jmp .w8_loop0 .w8_ret: RET .w16: movd xm0, [tlq+32] lea r10d, [hq+(1<<8)] movd [rsp+160], xm0 test angled, 0x400 jnz .w8_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g punpcklwd xm2, xm1, xm1 vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e punpckhwd m3, m0, m0 pmullw m4, m0 vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g paddw m1, m3 vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g paddw m2, m3 vpbroadcastd m3, r6m ; max_width pmullw m1, m5 pmullw m2, m6 packssdw m3, m3 paddw m1, m4 paddw m1, m2 psubw m3, [base+pw_1to16] pxor m4, m4 psrlw m1, 3 pminsw m3, m11 pavgw m1, m4 vpblendvb m1, m0, m3 movu [rsp+130], m1 .w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w8_main popcnt 
r3d, r3d cmp r3d, 3 jne .w16_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 4 jne .filter_left_s3 movq xm0, [tlq-8] ; 0 1 2 3 movq xm1, [tlq-6] ; 1 2 3 4 vpbroadcastd xm5, r7m ; max_height movq xm4, [base+pw_16to1+24] ; 4to1 pshuflw xm2, xm0, q2100 ; 0 0 1 2 pshuflw xm3, xm1, q3321 ; 2 3 4 4 paddw xm1, xm0 paddw xm1, xm2 pshuflw xm2, xm0, q1000 ; 0 0 0 1 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, xm4 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 movq [rsp+120], xm1 jmp .w8_main .w32: mova m2, [tlq+32] movd xm0, [tlq+64] lea r10d, [hq+(3<<8)] mova [rsp+160], m2 movd [rsp+192], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] vpbroadcastd m0, r6m ; max_width vpbroadcastd m7, [base+pw_16] mov r3d, 32 packssdw m0, m0 psubw m0, [base+pw_1to16] pminsw m8, m0, m11 psubw m9, m8, m7 .w32_filter_above: movu m0, [tlq+2] punpcklwd xm4, xm1, xm1 paddw m2, m6, [tlq+6] paddw m1, m0 vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m1, [tlq+4] movu m3, [tlq+r3+2] paddw m5, m6, [tlq+r3-2] pavgw m2, m4 punpckhwd m4, m3, m3 paddw m1, m2 vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h pavgw m2, m5 paddw m5, m3, [tlq+r3] paddw m4, m5 psrlw m1, 2 paddw m2, m4 vpblendvb m1, m0, m8 psrlw m2, 2 vpblendvb m2, m3, m9 movu [rsp+130], m1 movu [rsp+r3+130], m2 .filter_left_s3: cmp hd, 16 jl .filter_left_s3_h8 ; h8 .filter_left_s3b: mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i vpbroadcastd m5, r7m ; max_height paddw m1, m0, m2 punpckhwd m2, m2 mov r3d, hd vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i packssdw m5, m5 not r3 psubw m5, [base+pw_16to1] paddw m2, m6 pminsw m8, m11, m5 je .filter_left_s3_end ; h16 paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m2 psrlw 
m1, 2 vpblendvb m3, m1, m0, m8 mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j psubw m8, m7 mova [rsp+96], m3 jnp .filter_left_s3_end ; h32 mova m5, [tlq-96] paddw m1, [tlq-66] pavgw m2, [tlq-68] paddw m1, m2 paddw m4, m5, [tlq-94] paddw m2, m6, [tlq-92] psrlw m1, 2 paddw m4, [tlq- 98] pavgw m2, [tlq-100] vpblendvb m3, m1, m0, m8 mova m0, [tlq-128] psubw m8, m7 paddw m4, m2 paddw m1, m0, [tlq-126] paddw m2, m6, [tlq-124] psrlw m4, 2 mova [rsp+64], m3 vpblendvb m4, m5, m8 psubw m8, m7 mova [rsp+32], m4 .filter_left_s3_end: punpcklwd xm3, xm0, xm0 vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 vpblendvb m1, m0, m8 mova [rsp+r3*2+130], m1 jmp .w8_main .filter_left_s3_h8: mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastd xm5, r7m ; max_height paddw xm1, xm0, xm3 pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 paddw xm1, xm2 vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, [base+pw_16to1+16] ; 8to1 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 mova [rsp+112], xm1 jmp .w8_main .w64: mova m2, [tlq+ 32] mova m3, [tlq+ 64] mova m4, [tlq+ 96] movd xm0, [tlq+128] lea r10d, [hq+(7<<8)] mova [rsp+160], m2 mova [rsp+192], m3 mova [rsp+224], m4 movd [rsp+256], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h movu m4, [tlq+66] paddw m3, m6, [tlq+62] paddw m7, m4, [tlq+64] pavgw m3, 
[tlq+70]
    paddw m7, [tlq+68]
    paddw m2, m5
    vpbroadcastd m5, r6m ; max_width
    mov r3d, 96
    packssdw m5, m5
    paddw m3, m7
    psubw m5, [base+pw_1to16]
    psrlw m2, 2
    vpbroadcastd m7, [base+pw_16]
    psrlw m3, 2
    pminsw m8, m11, m5
    psubw m9, m8, m7
    vpblendvb m2, m0, m9
    psubw m9, m7
    vpblendvb m3, m4, m9
    psubw m9, m7
    movu [rsp+162], m2
    movu [rsp+194], m3
    jmp .w32_filter_above
; (Everything above is the tail of the preceding function's .w64 path; it is
; reproduced unchanged.)

;------------------------------------------------------------------------------
; ipred_z3_16bpc (AVX2): angular intra prediction for 16-bit pixels that reads
; its reference pixels at negative offsets from tl and advances through them by
; a per-column step dy taken from dr_intra_derivative.
;
; Register names (x86inc cglobal: 4 arguments preloaded, 9 GPRs, named in
; order): dst=r0, stride=r1, tl=r2, w=r3, h=r4, angle=r5, dy=r6, org_w=r7,
; maxbase=r8. h and angle are fetched explicitly from hm/anglem. Raw rN
; registers used below alias these names (e.g. r7 is reused as stride*3 in the
; h4/h8/h16 paths, r2 is reused once tl is no longer needed).
; r8m is read as pixel_max in the upsampling paths.
;
; angle carries flag bits besides the angle itself: bit 0x400 is inverted in the
; prologue so that "test angled, 0x400" is non-zero when the intra edge filter
; is disabled, and "shr angled, 8" in .filter_strength yields is_sm << 1.
;
; Flow:
;   prologue  -> jump through ipred_z3_16bpc_avx2_table indexed by tzcnt(h)
;                (table is defined elsewhere; presumably it targets the
;                .h4/.h8/.h16/.h32/.h64 labels below)
;   .hN       -> optionally upsample (h4/h8) or smooth the edge into a stack
;                buffer and repoint tl at it
;   .hN_main  -> set up ypos vectors, the out-of-range fill pixel, and maxbase
;   .hN_loop  -> interpolate between edge[base] and edge[base+1] with the
;                fraction (ypos & 62); columns whose position passed maxbase
;                are replaced by the fill pixel
;   h32/h64 push each column onto the stack and transpose into dst afterwards.
;------------------------------------------------------------------------------
cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
    lea r6, [ipred_z3_16bpc_avx2_table] ; jump table base (r6 is later reused as dy)
    tzcnt hd, hm ; table index from the height argument
    movifnidn angled, anglem
    lea r7, [dr_intra_derivative+45*2-1]
    sub tlq, 2 ; step tl back by one pixel (2 bytes)
    movsxd hq, [r6+hq*4] ; 32-bit table entry, relative to the table base
    sub angled, 180
    add hq, r6 ; absolute jump target
    mov dyd, angled
    neg dyd
    xor angled, 0x400 ; invert the edge-filter flag bit (see header)
    or dyq, ~0x7e ; keep only bits 1-6 of the negated angle, sign-extend the rest
    movzx dyd, word [r7+dyq] ; dy = 16-bit dr_intra_derivative entry
    vpbroadcastd m5, [pw_62] ; mask that extracts the interpolation fraction from ypos
    mov org_wd, wd ; original width; read again only by the h32/h64 transposes
    jmp hq
; ---- h == 4 -----------------------------------------------------------------
.h4:
    ALLOC_STACK -64, 7
    lea r7, [strideq*3]
    cmp angleb, 40
    jae .h4_no_upsample
    lea r4d, [angleq-1024]
    sar r4d, 7
    add r4d, wd
    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
    ; Upsample the edge 2x: interpolated pixels are clamped to [0, pixel_max]
    ; and interleaved with the original pixels into [rsp+16..47].
    mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7
    pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
    vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
    pshufd xm3, xm1, q0000
    paddw xm1, xm2
    paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8
    vpbroadcastw xm4, r8m ; pixel_max
    add dyd, dyd ; the upsampled edge has twice the resolution, so double the step
    psubw xm0, xm1, xm0
    mova [rsp+ 0], xm3
    movd xm3, dyd
    psraw xm0, 3
    neg dyd
    paddw xm1, xm0
    pxor xm0, xm0
    lea r2d, [dyq+(16<<6)+63] ; ypos
    pmaxsw xm1, xm0
    pavgw xm1, xm0
    vpbroadcastw m3, xm3
    pminsw xm1, xm4
    punpckhwd xm0, xm1, xm2
    punpcklwd xm1, xm2
    paddw m2, m3, m3
    mova [rsp+32], xm0
    punpcklwd m3, m2
    mova [rsp+16], xm1
    paddw m4, m2, m2
    paddw m2, m3
    vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3
.h4_upsample_loop: ; 4 output columns per iteration, read from the stack buffer
    lea r4d, [r2+dyq]
    shr r2d, 6
    movu xm1, [rsp+r2*2]
    lea r2d, [r4+dyq]
    shr r4d, 6
    movu xm2, [rsp+r4*2]
    lea r4d, [r2+dyq]
    shr r2d, 6
    vinserti128 m1, [rsp+r2*2], 1
    lea r2d, [r4+dyq]
    shr r4d, 6
    vinserti128 m2, [rsp+r4*2], 1
    psrld m0, m1, 16
    pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
    pslld m2, 16
    pblendw m1, m2, 0xaa
    pand m2, m5, m3 ; frac = ypos & 62
    psllw m2, 9 ; frac << 9 so that pmulhrsw computes (diff*frac + 32) >> 6
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m3, m4
    paddw m1, m0
    vextracti128 xm2, m1, 1
    punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
    punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
    movhps [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm0
    movhps [dstq+strideq*2], xm1
    movq [dstq+r7 ], xm1
    add dstq, 8
    sub wd, 4
    jg .h4_upsample_loop
    RET
ALIGN function_align
; Local helper shared by the h4/h8/h16 paths.
; In:  maxbased and angled (both broadcast as bytes and matched against the
;      z_filter_wh / z_filter_t0 tables).
; Out: r5d = byte mask; every caller applies popcnt to it and uses the count as
;      the filter strength (0 = no filtering).
; Side effects: r4 is left pointing at z_filter_t0 (the "base" used for the
;      z_filter_k lookups that follow), angled is shifted right by 8, m0/m1 are
;      overwritten.
.filter_strength: ; h4/h8/h16
%define base r4-z_filter_t0
    lea r4, [z_filter_t0]
    movd xm0, maxbased
    movd xm1, angled
    shr angled, 8 ; is_sm << 1
    vpbroadcastb m0, xm0
    vpbroadcastb m1, xm1
    pcmpeqb m0, [base+z_filter_wh]
    pand m0, m1
    mova xm1, [r4+angleq*8]
    pcmpgtb m0, m1
    pmovmskb r5d, m0
    ret
.h4_no_upsample:
    mov maxbased, 7
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .h4_main
    lea maxbased, [wq+3]
    call .filter_strength
    mov maxbased, 7
    test r5d, r5d
    jz .h4_main ; filter_strength == 0
    popcnt r5d, r5d ; r5d = filter strength, selects the z_filter_k coefficients
    mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7
    movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8
    vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0]
    pmullw xm2, xm0
    pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
    paddw xm1, xm0, xm3
    movd [rsp+12], xm0
    pmullw xm1, xm4
    cmp r5d, 3
    jne .h4_filter_3tap
    ; strength 3: add the outer taps and compute one extra edge pixel in scalar code
    pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
    vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
    movzx r4d, word [tlq-14]
    movzx r2d, word [tlq-12]
    inc maxbased
    paddw xm1, xm2
    paddw xm0, xm3
    sub r2d, r4d
    paddw xm2, xm0, xm0
    lea r2d, [r2+r4*8+4]
    shr r2d, 3
    mov [rsp+14], r2w
.h4_filter_3tap:
    pxor xm0, xm0
    paddw xm1, xm2
    lea tlq, [rsp+30] ; from here on the edge is read from the filtered stack copy
    psrlw xm1, 3
    cmp wd, 8
    sbb maxbased, -1 ; maxbase += (w >= 8)
    pavgw xm0, xm1
    mova [rsp+16], xm0
.h4_main:
    movd xm3, dyd
    neg maxbaseq
    vbroadcasti128 m1, [z_base_inc]
    vpbroadcastw m6, [tlq+maxbaseq*2] ; m6 = edge pixel at -maxbase, used as the fill value
    shl maxbased, 6
    vpbroadcastw m3, xm3
    lea r4d, [maxbaseq+3*64]
    neg dyq
    movd xm2, r4d
    sub tlq, 8
    lea r4, [dyq+63] ; ypos
    punpcklwd m1, m1
    paddw m0, m3, m3
    vpbroadcastw m2, xm2
    punpcklwd m3, m0
    paddw m4, m0, m0
    paddw m0, m3
    psubw m2, m1
    vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3
    or maxbased, 63
    paddw m3, m2
.h4_loop: ; 4 output columns per iteration
    lea r5, [r4+dyq]
    sar r4, 6 ; base0
    movu xm1, [tlq+r4*2]
    lea r4, [r5+dyq]
    sar r5, 6 ; base1
    movu xm2, [tlq+r5*2]
    lea r5, [r4+dyq]
    sar r4, 6 ; base2
    vinserti128 m1, [tlq+r4*2], 1
    lea r4, [r5+dyq]
    sar r5, 6 ; base3
    vinserti128 m2, [tlq+r5*2], 1
    punpckhwd m0, m1, m2
    punpcklwd m1, m2
    pand m2, m5, m3
    palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    psraw m2, m3, 15 ; ypos < max_base_y
    paddw m3, m4
    paddw m1, m0
    vpblendvb m1, m6, m1, m2 ; keep the interpolated value where the mask is set, else the fill pixel
    vextracti128 xm2, m1, 1
    punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
    punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2
    movhps [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm0
    movhps [dstq+strideq*2], xm1
    movq [dstq+r7 ], xm1
    sub wd, 4
    jz .h4_end
    add dstq, 8
    cmp r4d, maxbased
    jg .h4_loop
.h4_end_loop: ; remaining columns lie entirely past maxbase: store the fill pixel
    movq [dstq+strideq*0], xm6
    movq [dstq+strideq*1], xm6
    movq [dstq+strideq*2], xm6
    movq [dstq+r7 ], xm6
    add dstq, 8
    sub wd, 4
    jg .h4_end_loop
.h4_end:
    RET
; ---- h == 8 -----------------------------------------------------------------
.h8:
    lea r4d, [angleq+216]
    ALLOC_STACK -64, 8
    mov r4b, wb ; merge w into the low byte so one compare covers all conditions below
    lea r7, [strideq*3]
    cmp r4d, 8
    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e
    movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d
    cmp wd, 8
    je .h8_upsample_w8
    pshufhw xm3, xm2, q1000
    vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d
.h8_upsample_w8:
    paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpbroadcastw m4, r8m ; pixel_max
    add dyd, dyd
    psubw m0, m1, m0
    movd xm6, dyd
    psraw m0, 3
    neg dyd
    paddw m1, m0
    pxor m0, m0
    pmaxsw m1, m0
    lea r4d, [dyq+(16<<6)+63] ; ypos
    pavgw m1, m0
    vpbroadcastw m6, xm6
    pminsw m1, m4
    punpckhwd m0, m1, m2
    punpcklwd m1, m2
    vextracti128 [rsp+48], m0, 1
    vextracti128 [rsp+32], m1, 1
    paddw m7, m6, m6
    mova [rsp+16], xm0
    mova [rsp+ 0], xm1
    punpcklwd m6, m7 ; ypos0 ypos1
.h8_upsample_loop: ; 4 output columns per iteration, read from the upsampled stack copy
    lea r2d, [r4+dyq]
    shr r4d, 6 ; base0
    movu m1, [rsp+r4*2]
    lea r4d, [r2+dyq]
    shr r2d, 6 ; base1
    movu m2, [rsp+r2*2]
    lea r2d, [r4+dyq]
    shr r4d, 6 ; base2
    movu m3, [rsp+r4*2]
    lea r4d, [r2+dyq]
    shr r2d, 6 ; base3
    movu m4, [rsp+r2*2]
    psrld m0, m1, 16
    pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
    pslld m2, 16
    pblendw m1, m2, 0xaa
    psrld m2, m3, 16
    pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0
    pslld m4, 16
    pblendw m3, m4, 0xaa
    pand m4, m5, m6
    paddw m6, m7
    psllw m4, 9
    psubw m1, m0
    pmulhrsw m1, m4
    pand m4, m5, m6
    psllw m4, 9
    psubw m3, m2
    pmulhrsw m3, m4
    paddw m6, m7
    lea r2, [dstq+strideq*4]
    paddw m1, m0
    paddw m3, m2
    punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0
    punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2
    vextracti128 xm2, m0, 1
    vextracti128 xm3, m1, 1
    movhps [r2 +strideq*0], xm0
    movq [r2 +strideq*1], xm0
    movhps [r2 +strideq*2], xm1
    movq [r2 +r7 ], xm1
    movhps [dstq+strideq*0], xm2
    movq [dstq+strideq*1], xm2
    movhps [dstq+strideq*2], xm3
    movq [dstq+r7 ], xm3
    add dstq, 8
    sub wd, 4
    jg .h8_upsample_loop
    RET
.h8_no_intra_edge_filter:
    and maxbased, 7
    or maxbased, 8 ; imin(w+7, 15)
    jmp .h8_main
.h8_no_upsample:
    lea maxbased, [wq+7]
    test angled, 0x400
    jnz .h8_no_intra_edge_filter
    call .filter_strength
    test r5d, r5d
    jz .h8_main
    popcnt r5d, r5d ; r5d = filter strength
    mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0]
    pmullw m2, m0
    cmp wd, 8
    jl .h8_filter_w4
    punpcklwd xm0, xm0
    vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    movd [rsp+28], xm0
    paddw m1, m3
    mov r4d, 16
    pmullw m1, m4
    cmovg maxbased, r4d ; flags still come from "cmp wd, 8": w > 8 -> maxbase = 16
    cmp r5d, 3
    jne .h8_filter_3tap
    punpckhwd m3, m3
    vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
    vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g
    movzx r4d, word [tlq-30]
    movzx r2d, word [tlq-28]
    inc maxbased
    paddw m1, m2
    paddw m0, m3
    sub r2d, r4d
    paddw m2, m0, m0
    lea r2d, [r2+r4*8+4]
    shr r2d, 3
    mov [rsp+30], r2w
    jmp .h8_filter_3tap
.h8_filter_w4:
    pshufhw xm1, xm0, q2100
    vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e
    paddw m1, m3
    pmullw m1, m4
.h8_filter_3tap:
    pxor m0, m0
    paddw m1, m2
    lea tlq, [rsp+62] ; continue with the filtered stack copy of the edge
    psrlw m1, 3
    pavgw m0, m1
    mova [rsp+32], m0
.h8_main:
    movd xm4, dyd
    neg maxbaseq
    vbroadcasti128 m1, [z_base_inc]
    vpbroadcastw m7, [tlq+maxbaseq*2] ; m7 = fill pixel for positions past maxbase
    shl maxbased, 6
    vpbroadcastw m4, xm4
    lea r4d, [maxbaseq+7*64]
    neg dyq
    movd xm2, r4d
    sub tlq, 16
    lea r4, [dyq+63]
    paddw m6, m4, m4
    vpbroadcastw m2, xm2
    vpblendd m4, m6, 0xf0 ; ypos0 ypos1
    psubw m2, m1
    or maxbased, 63
    paddw m4, m2
.h8_loop: ; 4 output columns per iteration
    lea r5, [r4+dyq]
    sar r4, 6 ; base0
    movu xm0, [tlq+r4*2+2]
    movu xm1, [tlq+r4*2]
    lea r4, [r5+dyq]
    sar r5, 6 ; base1
    vinserti128 m0, [tlq+r5*2+2], 1
    vinserti128 m1, [tlq+r5*2], 1
    lea r5, [r4+dyq]
    sar r4, 6 ; base2
    pand m3, m5, m4
    psllw m3, 9
    psubw m1, m0
    pmulhrsw m1, m3
    psraw m3, m4, 15
    paddw m4, m6
    paddw m0, m1
    movu xm1, [tlq+r4*2+2]
    movu xm2, [tlq+r4*2]
    lea r4, [r5+dyq]
    sar r5, 6 ; base3
    vpblendvb m0, m7, m0, m3
    vinserti128 m1, [tlq+r5*2+2], 1
    vinserti128 m2, [tlq+r5*2], 1
    pand m3, m5, m4
    psllw m3, 9
    psubw m2, m1
    pmulhrsw m2, m3
    psraw m3, m4, 15
    paddw m4, m6
    lea r5, [dstq+strideq*4]
    paddw m1, m2
    vpblendvb m1, m7, m1, m3
    punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0
    vextracti128 xm3, m2, 1
    punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4
    punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
    punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2
    vextracti128 xm3, m0, 1
    movhps [dstq+strideq*0], xm1
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movq [dstq+r7 ], xm2
    punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
    punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6
    movhps [r5 +strideq*0], xm1
    movq [r5 +strideq*1], xm1
    movhps [r5 +strideq*2], xm0
    movq [r5 +r7 ], xm0
    sub wd, 4
    jz .h8_end
    add dstq, 8
    cmp r4d, maxbased
    jg .h8_loop
    ; Fill the remaining columns with the fill pixel: first a 4-column strip if
    ; bit 2 of the remaining width is set, then 8 columns at a time.
    lea r6, [strideq*5]
    lea r2, [strideq+r7*2] ; stride*7
    test wd, 4
    jz .h8_end_loop
    movq [dstq+strideq*0], xm7
    movq [dstq+strideq*1], xm7
    movq [dstq+strideq*2], xm7
    movq [dstq+r7 ], xm7
    movq [dstq+strideq*4], xm7
    movq [dstq+r6 ], xm7
    movq [dstq+r7*2 ], xm7
    movq [dstq+r2 ], xm7
    add dstq, 8
    sub wd, 4
    jz .h8_end
.h8_end_loop:
    mova [dstq+strideq*0], xm7
    mova [dstq+strideq*1], xm7
    mova [dstq+strideq*2], xm7
    mova [dstq+r7 ], xm7
    mova [dstq+strideq*4], xm7
    mova [dstq+r6 ], xm7
    mova [dstq+r7*2 ], xm7
    mova [dstq+r2 ], xm7
    add dstq, 16
    sub wd, 8
    jg .h8_end_loop
.h8_end:
    RET
; ---- h == 16 ----------------------------------------------------------------
.h16_no_intra_edge_filter:
    and maxbased, 15
    or maxbased, 16 ; imin(w+15, 31)
    jmp .h16_main
ALIGN function_align
.h16:
    ALLOC_STACK -96, 10
    lea maxbased, [wq+15]
    lea r7, [strideq*3]
    test angled, 0x400
    jnz .h16_no_intra_edge_filter
    call .filter_strength
    test r5d, r5d
    jz .h16_main ; filter_strength == 0
    popcnt r5d, r5d ; r5d = filter strength
    movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
    paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1]
    vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
    pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    pmullw m1, m7
    paddw m1, m2
    cmp wd, 8
    jg .h16_filter_w16
    mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7
    pmullw xm6, xm3
    jl .h16_filter_w4 ; flags still come from "cmp wd, 8"
    pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
    cmp r5d, 3
    jne .h16_filter_w8_3tap
    vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
.h16_filter_w8_5tap:
    punpckhwd m0, m0
    vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
    paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9
    paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    paddw xm4, xm4
    paddw m0, m0
    paddw xm6, xm4
    paddw m1, m0
.h16_filter_w8_3tap:
    paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8
    pmullw xm3, xm7
    pxor m0, m0
    paddw xm3, xm6
    psrlw xm3, 3
    pavgw xm3, xm0
    mova [rsp+48], xm3
    jmp .h16_filter_end
.h16_filter_w4:
    pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6
    cmp r5d, 3
    jne .h16_filter_w8_3tap
    pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5
    jmp .h16_filter_w8_5tap
.h16_filter_w16:
    mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    pmullw m6, m3
    punpcklwd xm3, xm3
    vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    mov r4d, 32
    cmp wd, 16
    cmovg maxbased, r4d ; w > 16 -> maxbase = 32
    movd [rsp+28], xm3
    pmullw m4, m7
    cmp r5d, 3
    jne .h16_filter_w16_3tap
    punpckhwd m0, m0
    vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
    vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
    paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    movzx r4d, word [tlq-62]
    movzx r2d, word [tlq-60]
    or maxbased, 1
    paddw m3, m3
    sub r2d, r4d
    paddw m0, m0
    lea r2d, [r2+r4*8+4]
    paddw m4, m3
    shr r2d, 3
    paddw m1, m0
    mov [rsp+30], r2w
.h16_filter_w16_3tap:
    pxor m0, m0
    paddw m4, m6
    psrlw m4, 3
    pavgw m4, m0
    mova [rsp+32], m4
.h16_filter_end:
    psrlw m1, 3
    lea tlq, [rsp+94] ; continue with the filtered stack copy of the edge
    pavgw m1, m0
    mova [rsp+64], m1
.h16_main:
    movd xm8, dyd
    neg maxbaseq
    vpbroadcastw m9, [tlq+maxbaseq*2] ; m9 = fill pixel for positions past maxbase
    shl maxbased, 6
    vpbroadcastw m8, xm8
    lea r4d, [maxbaseq+dyq+15*64]
    neg dyq
    movd xm7, r4d
    sub tlq, 32
    lea r4, [dyq+63]
    vpbroadcastw m7, xm7
    or maxbased, 63
    psubw m7, [z_base_inc]
.h16_loop: ; 4 output columns per iteration, one full ymm (16 pixels) per column
    lea r5, [r4+dyq]
    sar r4, 6 ; base0
    movu m0, [tlq+r4*2+2]
    movu m2, [tlq+r4*2]
    lea r4, [r5+dyq]
    sar r5, 6 ; base1
    movu m1, [tlq+r5*2+2]
    movu m3, [tlq+r5*2]
    lea r5, [r4+dyq]
    sar r4, 6 ; base2
    pand m6, m5, m7
    psllw m6, 9
    psubw m2, m0
    pmulhrsw m2, m6
    psraw m6, m7, 15
    paddw m7, m8
    paddw m0, m2
    movu m2, [tlq+r4*2+2]
    movu m4, [tlq+r4*2]
    lea r4, [r5+dyq]
    sar r5, 6 ; base3
    vpblendvb m0, m9, m0, m6
    pand m6, m5, m7
    psllw m6, 9
    psubw m3, m1
    pmulhrsw m3, m6
    psraw m6, m7, 15
    paddw m7, m8
    paddw m1, m3
    vpblendvb m1, m9, m1, m6
    pand m6, m5, m7
    psllw m6, 9
    psubw m4, m2
    pmulhrsw m4, m6
    psraw m6, m7, 15
    paddw m7, m8
    paddw m2, m4
    movu m3, [tlq+r5*2+2]
    movu m4, [tlq+r5*2]
    vpblendvb m2, m9, m2, m6
    pand m6, m5, m7
    psllw m6, 9
    psubw m4, m3
    pmulhrsw m4, m6
    psraw m6, m7, 15
    paddw m7, m8
    lea r5, [dstq+strideq*4]
    paddw m3, m4
    vpblendvb m3, m9, m3, m6
    punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4
    punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0
    punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4
    punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0
    vextracti128 xm6, m3, 1
    punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2
    punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4
    punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6
    vextracti128 xm2, m4, 1
    movhps [dstq+strideq*0], xm6
    movq [dstq+strideq*1], xm6
    vextracti128 xm6, m1, 1
    movhps [dstq+strideq*2], xm2
    movq [dstq+r7 ], xm2
    vextracti128 xm2, m0, 1
    movhps [r5 +strideq*0], xm6
    movq [r5 +strideq*1], xm6
    movhps [r5 +strideq*2], xm2
    movq [r5 +r7 ], xm2
    lea r5, [dstq+strideq*8]
    movhps [r5 +strideq*0], xm3
    movq [r5 +strideq*1], xm3
    movhps [r5 +strideq*2], xm4
    movq [r5 +r7 ], xm4
    lea r5, [r5+strideq*4]
    movhps [r5 +strideq*0], xm1
    movq [r5 +strideq*1], xm1
    movhps [r5 +strideq*2], xm0
    movq [r5 +r7 ], xm0
    sub wd, 4
    jz .h16_end
    add dstq, 8
    cmp r4d, maxbased
    jg .h16_loop
    mov hd, 4 ; fill the rest in 4 passes of 4 rows each
.h16_end_loop0:
    mov r6d, wd
    mov r2, dstq
    test wb, 4
    jz .h16_end_loop
    movq [dstq+strideq*0], xm9
    movq [dstq+strideq*1], xm9
    movq [dstq+strideq*2], xm9
    movq [dstq+r7 ], xm9
    and r6d, 120
    jz .h16_end_w4
    add dstq, 8
.h16_end_loop:
    mova [dstq+strideq*0], xm9
    mova [dstq+strideq*1], xm9
    mova [dstq+strideq*2], xm9
    mova [dstq+r7 ], xm9
    add dstq, 16
    sub r6d, 8
    jg .h16_end_loop
.h16_end_w4:
    lea dstq, [r2+strideq*4]
    dec hd
    jg .h16_end_loop0
.h16_end:
    RET
; ---- h == 32 ----------------------------------------------------------------
; This path does not call .filter_strength: when the edge filter is enabled the
; edge is always smoothed with the fixed pw_3-based kernel below. Each predicted
; column (32 pixels = 64 bytes) is pushed onto the stack and the result is
; transposed into dst afterwards.
.h32:
    ALLOC_STACK -160, 9
    lea maxbased, [wq+31]
    and maxbased, 31
    or maxbased, 32 ; imin(w+31, 63)
    test angled, 0x400
    jnz .h32_main
    vpbroadcastd m2, [pw_3]
    movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
    punpckhwd m1, m0, m0
    vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
    paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m1, m2
    paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    lea r4, [rsp+128] ; r4 = write pointer into the filtered-edge buffer (moves downwards)
    paddw m0, m1
    lea r5d, [maxbaseq-31]
    psrlw m0, 2
    mova [r4], m0
.h32_filter_loop: ; 16 edge pixels per iteration
    mova m0, [tlq-62]
    paddw m1, m2, [tlq-66]
    paddw m0, [tlq-64]
    pavgw m1, [tlq-58]
    paddw m0, [tlq-60]
    sub tlq, 32
    sub r4, 32
    paddw m0, m1
    psrlw m0, 2
    mova [r4], m0
    sub r5d, 16
    jg .h32_filter_loop
    jl .h32_filter_h8 ; counter went negative: only an 8-pixel tail is left
    mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    punpcklwd xm1, xm0, xm0
    paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
    vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    movzx r5d, word [tlq-62]
    movzx r2d, word [tlq-60]
    pavgw m2, m3
    sub r2d, r5d
    paddw m0, m1
    lea r2d, [r2+r5*8+4]
    paddw m0, m2
    shr r2d, 3
    psrlw m0, 2
    mova [r4-32], m0
    mov [r4-36], r5w
    mov [r4-34], r2w
    lea tlq, [rsp+158]
    mov r4d, 65
    cmp wd, 64
    cmove maxbased, r4d ; w == 64 -> maxbase = 65
    jmp .h32_main
.h32_filter_h8:
    mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7
    pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
    paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9
    paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8
    vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
    lea tlq, [rsp+158]
    pavgw xm2, xm3
    paddw xm0, xm1
    paddw xm0, xm2
    psrlw xm0, 2
    mova [r4-16], xm0
.h32_main:
    movd xm6, dyd
    neg maxbaseq
    vpbroadcastw m7, [tlq+maxbaseq*2] ; m7 = fill pixel for positions past maxbase
    shl maxbased, 6
    vpbroadcastw m6, xm6
    lea r4d, [maxbaseq+dyq+15*64]
    neg dyq
    movd xm4, r4d
    vpbroadcastd m8, [pw_m1024] ; threshold for the first 16 pixels of each column
    lea r4, [dyq+63]
    vpbroadcastw m4, xm4
    or maxbased, 63
    psubw m4, [z_base_inc]
.h32_loop: ; one column per iteration, pushed onto the stack (rsp -= 64)
    mov r5, r4
    sar r5, 6
    movu m1, [tlq+r5*2-64]
    movu m0, [tlq+r5*2-62]
    pand m3, m5, m4
    psllw m3, 9
    psubw m1, m0
    pmulhrsw m1, m3
    pcmpgtw m2, m8, m4
    paddw m0, m1
    vpblendvb m0, m7, m0, m2
    movu m2, [tlq+r5*2-32]
    movu m1, [tlq+r5*2-30]
    add r4, dyq
    sub rsp, 64
    psubw m2, m1
    pmulhrsw m2, m3
    psraw m3, m4, 15
    paddw m4, m6
    mova [rsp+32*0], m0
    paddw m1, m2
    vpblendvb m1, m7, m1, m3
    mova [rsp+32*1], m1
    dec wd
    jz .h32_transpose
    cmp r4d, maxbased
    jg .h32_loop
.h32_end_loop: ; remaining columns: push the fill pixel
    sub rsp, 64
    mova [rsp+32*0], m7
    mova [rsp+32*1], m7
    dec wd
    jg .h32_end_loop
.h32_transpose:
    ; Columns were pushed last-first, so the stack top holds the last column.
    ; Each outer iteration pops 8 columns (64*8 bytes) and writes them as an
    ; 8-pixel-wide strip, starting from the right edge (org_w*2-16 bytes).
    lea r3, [strideq*3]
    lea r4, [strideq*5]
    mov r8, dstq
    lea r5, [strideq+r3*2]
.h32_transpose_loop0:
    lea r6, [rsp+32]
    lea r2, [r8+org_wq*2-16]
.h32_transpose_loop:
    mova m0, [r6+64*7]
    mova m1, [r6+64*6]
    mova m2, [r6+64*5]
    mova m3, [r6+64*4]
    mova m4, [r6+64*3]
    mova m5, [r6+64*2]
    mova m6, [r6+64*1]
    mova m7, [r6+64*0]
    punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4
    punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
    punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4
    punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
    punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4
    punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
    punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4
    lea dstq, [r2+strideq*8]
    sub r6, 32
    punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
    punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2
    punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
    punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2
    punpckhqdq m5, m7, m1 ; 8 0
    vextracti128 [r2 +strideq*0], m5, 1
    punpcklqdq m7, m1 ; 9 1
    mova [dstq+strideq*0], xm5
    punpckhqdq m1, m8, m3 ; 10 2
    vextracti128 [r2 +strideq*1], m7, 1
    punpcklqdq m8, m3 ; 11 3
    mova [dstq+strideq*1], xm7
    punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
    vextracti128 [r2 +strideq*2], m1, 1
    punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6
    mova [dstq+strideq*2], xm1
    punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
    vextracti128 [r2 +r3 ], m8, 1
    punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6
    mova [dstq+r3 ], xm8
    punpckhqdq m6, m3, m2 ; 12 4
    vextracti128 [r2 +strideq*4], m6, 1
    punpcklqdq m3, m2 ; 13 5
    mova [dstq+strideq*4], xm6
    punpckhqdq m2, m0, m4 ; 14 6
    vextracti128 [r2 +r4 ], m3, 1
    punpcklqdq m0, m4 ; 15 7
    mova [dstq+r4 ], xm3
    vextracti128 [r2 +r3*2 ], m2, 1
    mova [dstq+r3*2 ], xm2
    vextracti128 [r2 +r5 ], m0, 1
    mova [dstq+r5 ], xm0
    lea r2, [dstq+strideq*8]
    cmp r6, rsp
    jae .h32_transpose_loop
    add rsp, 64*8 ; pop the 8 columns just written
    sub org_wd, 8
    jg .h32_transpose_loop0
.h32_end:
    RET
; ---- h == 64 ----------------------------------------------------------------
; Same scheme as .h32 with 128-byte columns and 16-column transpose strips.
.h64:
    ALLOC_STACK -256, 10
    lea maxbased, [wq+63]
    test angled, 0x400
    jnz .h64_main
    vpbroadcastd m2, [pw_3]
    movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i
    punpckhwd m1, m0, m0
    vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i
    paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m1, m2
    paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    lea r4, [rsp+224] ; write pointer into the filtered-edge buffer (moves downwards)
    paddw m0, m1
    lea r5d, [wq+32]
    psrlw m0, 2
    mova [r4], m0
.h64_filter_loop: ; 16 edge pixels per iteration
    mova m0, [tlq-62]
    paddw m1, m2, [tlq-66]
    paddw m0, [tlq-64]
    pavgw m1, [tlq-58]
    paddw m0, [tlq-60]
    sub tlq, 32
    sub r4, 32
    paddw m0, m1
    psrlw m0, 2
    mova [r4], m0
    sub r5d, 16
    jg .h64_filter_loop
    mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f
    punpcklwd xm1, xm0, xm0
    paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h
    paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g
    vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d
    vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e
    lea tlq, [rsp+254]
    pavgw m2, m3
    paddw m0, m1
    paddw m0, m2
    psrlw m0, 2
    mova [r4-32], m0
.h64_main:
    neg maxbaseq
    movd xm4, dyd
    vpbroadcastw m6, [tlq+maxbaseq*2] ; m6 = fill pixel for positions past maxbase
    shl maxbased, 6
    vpbroadcastw m4, xm4
    lea r4d, [maxbaseq+dyq+15*64]
    neg dyq
    vpbroadcastd m7, [pw_m1024]
    movd xm3, r4d
    lea r4, [dyq+63]
    paddw m8, m7, m7 ; 2 * pw_m1024
    vpbroadcastw m3, xm3
    or maxbased, 63
    paddw m9, m8, m7 ; 3 * pw_m1024
    psubw m3, [z_base_inc]
.h64_loop: ; one column per iteration, pushed onto the stack (rsp -= 128)
    mov r5, r4
    sar r5, 6
    movu m1, [tlq+r5*2-128]
    movu m0, [tlq+r5*2-126]
    pand m2, m5, m3
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    sub rsp, 128
    paddw m0, m1
    pcmpgtw m1, m9, m3 ; each 16-pixel quarter of the column uses its own threshold
    vpblendvb m0, m6, m0, m1
    mova [rsp+32*0], m0
    movu m1, [tlq+r5*2-96]
    movu m0, [tlq+r5*2-94]
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    pcmpgtw m1, m8, m3
    vpblendvb m0, m6, m0, m1
    mova [rsp+32*1], m0
    movu m1, [tlq+r5*2-64]
    movu m0, [tlq+r5*2-62]
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    pcmpgtw m1, m7, m3
    vpblendvb m0, m6, m0, m1
    mova [rsp+32*2], m0
    movu m1, [tlq+r5*2-32]
    movu m0, [tlq+r5*2-30]
    psubw m1, m0
    pmulhrsw m1, m2
    add r4, dyq
    psraw m2, m3, 15
    paddw m3, m4
    paddw m0, m1
    vpblendvb m0, m6, m0, m2
    mova [rsp+32*3], m0
    dec wd
    jz .h64_transpose
    cmp r4d, maxbased
    jg .h64_loop
.h64_end_loop: ; remaining columns: push the fill pixel
    sub rsp, 128
    mova [rsp+32*0], m6
    mova [rsp+32*1], m6
    mova [rsp+32*2], m6
    mova [rsp+32*3], m6
    dec wd
    jg .h64_end_loop
.h64_transpose:
    ; Each outer iteration pops 16 columns (128*16 bytes) and writes them as a
    ; 16-pixel-wide strip, starting from the right edge (org_w*2-32 bytes).
    lea r2, [strideq*3]
    lea r3, [strideq*5]
    mov r5, dstq
    lea r4, [strideq+r2*2]
.h64_transpose_loop0:
    lea r6, [rsp+112]
    lea dstq, [r5+org_wq*2-32]
.h64_transpose_loop:
    mova xm0, [r6+128*15]
    vinserti128 m0, [r6+128* 7], 1
    mova xm1, [r6+128*14]
    vinserti128 m1, [r6+128* 6], 1
    mova xm2, [r6+128*13]
    vinserti128 m2, [r6+128* 5], 1
    mova xm3, [r6+128*12]
    vinserti128 m3, [r6+128* 4], 1
    mova xm4, [r6+128*11]
    vinserti128 m4, [r6+128* 3], 1
    mova xm5, [r6+128*10]
    vinserti128 m5, [r6+128* 2], 1
    mova xm6, [r6+128* 9]
    vinserti128 m6, [r6+128* 1], 1
    mova xm7, [r6+128* 8]
    vinserti128 m7, [r6+128* 0], 1
    punpckhwd m8, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m4, m5
    punpcklwd m4, m5
    punpckhwd m5, m6, m7
    punpcklwd m6, m7
    sub r6, 16
    punpckhdq m7, m8, m1
    punpckldq m8, m1
    punpckhdq m1, m3, m5
    punpckldq m3, m5
    punpckhqdq m5, m7, m1
    punpcklqdq m7, m1
    punpckhqdq m1, m8, m3
    punpcklqdq m8, m3
    punpckhdq m3, m0, m2
    mova [dstq+strideq*0], m5
    punpckldq m0, m2
    mova [dstq+strideq*1], m7
    punpckhdq m2, m4, m6
    mova [dstq+strideq*2], m1
    punpckldq m4, m6
    mova [dstq+r2 ], m8
    punpckhqdq m6, m3, m2
    mova [dstq+strideq*4], m6
    punpcklqdq m3, m2
    mova [dstq+r3 ], m3
    punpckhqdq m2, m0, m4
    mova [dstq+r2*2 ], m2
    punpcklqdq m0, m4
    mova [dstq+r4 ], m0
    lea dstq, [dstq+strideq*8]
    cmp r6, rsp
    jae .h64_transpose_loop
    add rsp, 128*16 ; pop the 16 columns just written
    sub org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

; FILTER_1BLK dst, src, tmp, shuf, bdmax  (all register numbers; shuf may also
; be a memory operand, hence the %ifnum).
; Shuffles the low 128 bits of src with shuf, copies them to both lanes, then
; accumulates four pmaddwd products (dword groups q0000..q3333 of src against
; m2, m3, m4, m5) on top of m1, shifts right by 4, packs to unsigned words and
; clamps the result to bdmax. m1..m5 are not parameters; they must be loaded by
; the code that invokes the macro. src is overwritten.
%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
%ifnum %4
    pshufb xm%2, xm%4
%else
    pshufb xm%2, %4
%endif
    vinserti128 m%2, xm%2, 1
    pshufd m%1, m%2, q0000
    pmaddwd m%1, m2
    pshufd m%3, m%2, q1111
    pmaddwd m%3, m3
    paddd m%1, m1
    paddd m%1, m%3
    pshufd m%3, m%2, q2222
    pmaddwd m%3, m4
    paddd m%1, m%3
    pshufd m%3, m%2, q3333
    pmaddwd m%3, m5
    paddd m%1, m%3
    psrad m%1, 4
    packusdw m%1, m%1
    pminsw m%1, m%5
%endmacro
%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf,
bdmax
    ; FILTER_2BLK body (the %macro header precedes this span).
    ; Two-block variant of FILTER_1BLK: after shuffling src with shuf, the low
    ; 128-bit lane is duplicated into src and the high lane into tmp_src
    ; (vpermq q3232). Each copy gets the same four-term pmaddwd accumulation
    ; against m2..m5 on top of m1, with the second block accumulated in
    ; tmp_dst. Both sums are shifted right by 4, packed together into dst as
    ; unsigned words and clamped to bdmax. m1..m5 must be loaded by the caller.
    pshufb m%2, m%6
    vpermq m%4, m%2, q3232
    vinserti128 m%2, xm%2, 1
    pshufd m%1, m%2, q0000
    pshufd m%3, m%4, q0000
    pmaddwd m%1, m2
    pmaddwd m%3, m2
    paddd m%1, m1
    paddd m%3, m1
    pshufd m%5, m%2, q1111
    pmaddwd m%5, m3
    paddd m%1, m%5
    pshufd m%5, m%4, q1111
    pmaddwd m%5, m3
    paddd m%3, m%5
    pshufd m%5, m%2, q2222
    pmaddwd m%5, m4
    paddd m%1, m%5
    pshufd m%5, m%4, q2222
    pmaddwd m%5, m4
    paddd m%3, m%5
    pshufd m%5, m%2, q3333
    pmaddwd m%5, m5
    paddd m%1, m%5
    pshufd m%5, m%4, q3333
    pmaddwd m%5, m5
    paddd m%3, m%5
    psrad m%1, 4
    psrad m%3, 4
    packusdw m%1, m%3
    pminsw m%1, m%7
%endmacro

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row. One redundant
; block is calculated for w8 and w16, two for w32.
;     w4     w8       w16             w32
;     1     1 2     1 2 3 5     1 2 3 5 b c d f
;     2     2 3     2 4 5 7     2 4 5 7 c e f h
;     3     3 4     4 6 7 9     4 6 7 9 e g h j
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
;           5       8           8       i

; (Start of ipred_filter_16bpc. The function continues past this span, so its
; instructions are reproduced unchanged.)
cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_16bpc_avx2_table
    lea r6, [filter_intra_taps]
    tzcnt wd, wm
%ifidn filterd, filterm
    movzx filterd, filterb
%else
    movzx filterd, byte filterm
%endif
    shl filterd, 6
    add filterq, r6
    lea r6, [ipred_filter_16bpc_avx2_table]
    vbroadcasti128 m0, [tlq-6]
    movsxd wq, [r6+wq*4]
    vpbroadcastd m1, [base+pd_8]
    pmovsxbw m2, [filterq+16*0]
    pmovsxbw m3, [filterq+16*1]
    pmovsxbw m4, [filterq+16*2]
    pmovsxbw m5, [filterq+16*3]
    add wq, r6
    mov hd, hm
    jmp wq
.w4:
    WIN64_SPILL_XMM 10
    mova xm8, [base+filter_shuf2]
    vpbroadcastw m9, r8m ; bitdepth_max
    lea r7, [6+hq*2]
    sub tlq, r7
    jmp .w4_loop_start
.w4_loop:
    pinsrq xm0, [tlq+hq*2], 0
    lea dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER_1BLK 6, 0, 7, 8, 9
    vextracti128 xm0, m6, 1
    movq [dstq+strideq*0], xm6
    movq [dstq+strideq*1], xm0
    sub hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    WIN64_SPILL_XMM 16
    vbroadcasti128 m14, [base+filter_shuf3]
    vpbroadcastw m15, r8m ; bitdepth_max
    FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15
    vpermq
m6, m10, q1302 ; ____ ____ | ____ 4321 pslldq m8, m0, 4 psrldq m7, m6, 2 psrldq m0, m6, 10 punpcklwd m7, m0 vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 lea r7, [16+hq*2] sub tlq, r7 jmp .w8_loop_start .w8_loop: vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 vpermq m6, m9, q2031 psrldq m0, m6, 2 psrldq m6, 10 punpcklwd m6, m0 vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 mova m10, m9 .w8_loop_start: vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 call .main vpblendd m10, m9, 0xCC mova [dstq+strideq*0], xm10 vextracti128 [dstq+strideq*1], m10, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: ALLOC_STACK 32, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 TAIL_CALL .w16_main, 0 .w16_main: mova xm10, [base+filter_shuf2] FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 vpbroadcastq m0, [tlq+10] vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ psrldq m6, m12, 8 vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 punpcklwd m6, m12 vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm8, xm12, 12 vpblendd xm6, xm8, 0x01 pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 8, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 movu m8, [tlq+6] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 lea r7, [20+hq*2] sub tlq, r7 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w16_loop_start .w16_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd 
m9, m7, 0x40 mova m0, [rsp+8] mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w16_loop_start: mova m13, m12 vpblendd m0, [tlq+hq*2], 0x0C psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+8], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 ret ALIGN function_align .w32: ALLOC_STACK 64, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 lea r3, [dstq+32] lea r5d, [hd*2+20] call .w16_main mov dstq, r3 lea tlq, [tlq+r5+32] sub r5d, 20 shr r5d, 1 sub r5d, 2 lea r4, [dstq+strideq*2-2] DEFINE_ARGS dst, stride, tl, stride3, left, h lea stride3q, [strideq*3] movu m8, [tlq-6] ; 4321 0___ mova xm10, [base+filter_shuf2] pinsrw xm0, xm8, [dstq+strideq*0-2], 2 pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ pinsrw xm9, [leftq+strideq*0], 5 pinsrw xm9, [leftq+strideq*1], 4 FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 psrldq m6, m12, 8 punpcklwd m7, m6, m12 vpblendd m0, m6, 0x03 ; ___0 
____ | _0__ 4321 vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC pinsrw xm9, [leftq+strideq*2], 3 pinsrw xm9, [leftq+stride3q ], 2 lea leftq, [leftq+strideq*4] pinsrw xm9, [leftq+strideq*0], 1 pinsrw xm9, [leftq+strideq*1], 0 movq [rsp+32], xm9 mov r7d, 1 pslldq m8, m9, 4 vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm7, xm12, 12 vpblendd xm6, xm7, 0x01 ; ____ _56_ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 7, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w32_loop_start .w32_loop_last: mova m0, [rsp+0] jmp .w32_loop .w32_loop_left: mova m0, [rsp+0] vpblendd m0, [rsp+32+r7*4-12], 0x0C dec r7d jg .w32_loop cmp hd, 2 je .w32_loop pinsrw xm6, [rsp+32], 6 pinsrw xm6, [leftq+strideq*2], 5 pinsrw xm6, [leftq+stride3q ], 4 lea leftq, [leftq+strideq*4] pinsrw xm6, [leftq+strideq*0], 3 pinsrw xm6, [leftq+strideq*1], 2 pinsrw xm6, [leftq+strideq*2], 1 pinsrw xm6, [leftq+stride3q ], 0 lea leftq, [leftq+strideq*4] movu [rsp+36], xm6 pinsrw xm6, [leftq+strideq*0], 1 pinsrw xm6, [leftq+strideq*1], 0 movd [rsp+32], xm6 mov r7d, 4 .w32_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w32_loop_start: mova m13, m12 psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 
7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+0], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop_left jz .w32_loop_last vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 RET .main: FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 ret %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm add tlq, 2 movd xm4, wd pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+wq*4] add r6, t0 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half sub tlq, hq movd xm4, hd sub tlq, hq pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+r6*4] add r6, t0 add t0, 
ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm6 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 vpbroadcastw m0, xm0 jmp wq cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea t0d, [wq+hq] movd xm4, t0d tzcnt t0d, t0d movd xm5, t0d lea t0, [ipred_cfl_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw xm4, 1 pxor m6, m6 vpbroadcastw m7, r7m add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw m0, xm0 .s4: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] IPRED_CFL 4 pmaxsw m4, m6 pminsw m4, m7 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm5 movhps [dstq+strideq*1], xm4 movhps [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm6, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw m0, xm0 .s8: vpbroadcastw m1, alpham lea r6, 
[strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], xm4 mova [dstq+strideq*2], xm5 vextracti128 [dstq+strideq*1], m4, 1 vextracti128 [dstq+r6 ], m5, 1 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], m4 mova [dstq+strideq*1], m5 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm6 punpckhwd xm0, xm6 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 .s32: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+32*0], m4 mova [dstq+32*1], m5 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha mov r6d, r7m shr r6d, 11 lea t0, [ipred_cfl_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, 
[t0+wq*4] vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] pxor m6, m6 vpbroadcastw m7, r7m add wq, t0 movifnidn acq, acmp jmp wq cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_2] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*2] mova xm1, [ypxq+r3 ] vinserti128 m0, [ypxq+strideq*0], 1 vinserti128 m1, [ypxq+strideq*1], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0 mova [acq], xm1 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc vpermq m1, m1, q1111 pslld xm0, 2 .w4_hpad_loop: mova [acq], m1 paddd m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0, xm1 mova [acq], xm1 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc vinserti128 m1, xm1, 1 pslld m0, 2 jmp .hpad .w8_wpad1: pmaddwd xm0, xm5, [ypxq+strideq*0] pmaddwd xm3, xm5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd xm0, xm3 pshufd xm3, xm0, q3333 packssdw xm1, xm0, xm3 paddd xm0, xm3 paddd xm4, xm0 mova [acq], xm1 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad: mova m0, [ypxq+strideq*0+ 0] mova m1, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m2, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m0, m2, 0xf0 vpblendd m1, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m2, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m2, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m2, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, 
[ypxq+strideq*2] REPX {pmaddwd x, m5}, m0, m1, m2, m3 paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_wpad jmp .w16_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+ 0] pmaddwd m2, m5, [ypxq+strideq*0+32] pmaddwd m1, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_loop .w16_hpad: add hpadd, hpadd jz .dc paddd m0, m0 .hpad: mova [acq+32*0], m1 paddd m4, m0 mova [acq+32*1], m1 add acq, 32*2 sub hpadd, 4 jg .hpad .dc: vextracti128 xm1, m4, 1 sub r5, acq ; -w*h*2 tzcnt r1d, r5d paddd xm4, xm1 sub r1d, 2 punpckhqdq xm1, xm4, xm4 movd xm0, r1d paddd xm1, xm4 pshuflw xm4, xm1, q1032 paddd xm1, xm4 psrld xm1, xm0 pxor xm0, xm0 pavgw xm1, xm0 vpbroadcastw m1, xm1 .dc_loop: mova m0, [acq+r5] psubw m0, m1 mova [acq+r5], m0 add r5, 32 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_4] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*0] mova xm1, [ypxq+strideq*1] vinserti128 m0, [ypxq+strideq*2], 1 vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m4, m0 packssdw m0, m1 paddd m4, m1 mova [acq], m0 add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vextracti128 xm1, m1, 1 vpermq m0, m0, q3333 pslld xm1, 2 .w4_hpad_loop: mova [acq], m0 paddd m4, m1 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m1, m5, [ypxq+strideq*0] pmaddwd m0, m5, [ypxq+strideq*1] lea ypxq, 
[ypxq+strideq*2] paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m1, m1, q3131 pslld m0, 2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w8_wpad1: vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m0, [ypxq+strideq*1+12] vinserti128 m1, [ypxq+strideq*0+ 0], 0 vinserti128 m0, [ypxq+strideq*1+ 0], 0 lea ypxq, [ypxq+strideq*2] pmaddwd m1, m5 pmaddwd m0, m5 paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m2, m5, [ypxq+strideq*0+ 0] pmaddwd m1, m5, [ypxq+strideq*0+32] pmaddwd m0, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad .w16_wpad: mova m2, [ypxq+strideq*0+ 0] mova m0, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m2, m1, 0xf0 vpblendd m0, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m1, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m1, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m1, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] REPX {pmaddwd x, m5}, m2, m0, m1, m3 paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_wpad jmp mangle(private_prefix %+ 
_ipred_cfl_ac_420_16bpc_avx2).w16_hpad cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h lea r6, [ipred_cfl_ac_444_16bpc_avx2_table] tzcnt wd, wm movifnidn hpadd, hpadm vpbroadcastd m5, [pw_1] movsxd wq, [r6+wq*4] shl hpadd, 2 add wq, r6 mov hd, hm pxor m4, m4 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq xm0, [ypxq+strideq*0] movhps xm0, [ypxq+strideq*1] vpbroadcastq m1, [ypxq+strideq*2] vpbroadcastq m2, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psllw m0, 3 pmaddwd m1, m0, m5 mova [acq], m0 add acq, 32 paddd m4, m1 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m0, m0, q3333 paddd m1, m1 mova [acq+32*0], m0 vpermq m1, m1, q3333 mova [acq+32*1], m0 add acq, 32*2 paddd m4, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: lea r3, [strideq*3] mov r5, acq .w8_loop: mova xm2, [ypxq+strideq*0] vinserti128 m2, [ypxq+strideq*1], 1 mova xm1, [ypxq+strideq*2] vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 4 jg .w8_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vperm2i128 m1, m1, 0x11 pslld m0, 2 pxor m2, m2 vpblendd m0, m2, 0x0f jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w16_wpad2: vpbroadcastw m3, [ypxq+strideq*0+14] vpbroadcastw m0, [ypxq+strideq*1+14] vpblendd m2, m3, 0xf0 vpblendd m1, m0, 0xf0 jmp .w16_wpad_end .w16: mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] test wpadd, wpadd jnz .w16_wpad2 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ 
_ipred_cfl_ac_420_16bpc_avx2).dc paddd m0, m0 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w32: mov r5, acq test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [ypxq+ 0] mova m1, [ypxq+32] add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_loop .w32_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc paddd m2, m2 .w32_hpad_loop: mova [acq+32*0], m0 mova [acq+32*1], m1 paddd m4, m2 mova [acq+32*2], m0 mova [acq+32*3], m1 add acq, 32*4 sub hpadd, 2 jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w32_wpad: mova m0, [ypxq+ 0] cmp wpadd, 4 jl .w32_wpad2 je .w32_wpad4 vpbroadcastw m1, [ypxq+14] vpblendd m0, m1, 0xf0 jmp .w32_wpad_end .w32_wpad4: vpbroadcastw m1, [ypxq+30] jmp .w32_wpad_end .w32_wpad2: vpbroadcastw m1, [ypxq+46] vinserti128 m1, [ypxq+32], 0 .w32_wpad_end: add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_wpad jmp .w32_hpad cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_16bpc_avx2_table] tzcnt wd, wm vbroadcasti128 m5, [pal_pred_shuf] movifnidn hd, hm movsxd wq, [r2+wq*4] pshufb m4, m5 punpckhqdq m5, m4, m4 add wq, r2 DEFINE_ARGS dst, stride, stride3, idx, w, h lea stride3q, [strideq*3] jmp wq .w4: movq xm0, [idxq] add idxq, 8 psrlw xm1, xm0, 4 punpcklbw xm0, xm1 pshufb xm1, xm4, xm0 pshufb xm2, xm5, xm0 punpcklbw xm0, xm1, xm2 punpckhbw xm1, xm2 movq [dstq+strideq*0], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+strideq*1], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m2, [idxq] add idxq, 16 psllw m1, m2, 4 por m2, m1 pshufb m1, m4, m2 pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+strideq*0], xm0 movu 
[dstq+strideq*1], xm1 vextracti128 [dstq+strideq*2], m0, 1 vextracti128 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: pshufd m3, [idxq], q3120 add idxq, 32 vpermq m3, m3, q3120 psrlw m1, m3, 4 punpcklbw m2, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m2 pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 pshufb m1, m4, m3 pshufb m3, m5, m3 punpcklbw m0, m1, m3 punpckhbw m1, m3 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: pshufd m3, [idxq], q3120 add idxq, 32 vpermq m3, m3, q3120 psrlw m1, m3, 4 punpcklbw m2, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m2 pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+ 0], m0 movu [dstq+32], m1 pshufb m1, m4, m3 pshufb m3, m5, m3 punpcklbw m0, m1, m3 punpckhbw m1, m3 movu [dstq+strideq+ 0], m0 movu [dstq+strideq+32], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: pshufd m3, [idxq], q3120 add idxq, 32 vpermq m3, m3, q3120 psrlw m1, m3, 4 punpcklbw m2, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m2 pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+32*0], m0 mova [dstq+32*1], m1 pshufb m1, m4, m3 pshufb m3, m5, m3 punpcklbw m0, m1, m3 punpckhbw m1, m3 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq dec hd jg .w64 RET %endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred16_avx512.asm000066400000000000000000002636761517466257200243660ustar00rootroot00000000000000; Copyright © 2022-2024, VideoLAN and dav2d authors ; Copyright © 2022-2024, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51 db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 pw_1to32: dw 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6 dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14 z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2 dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4 z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1 dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168 z_xpos_off1b: dw 30720, 30848, 30976, 31104, 31232, 31360, 31488, 31616 filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 times 4 db 10, 11, 12, 13, 2, 3, -1, -1 filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 times 4 db 26, 27, 28, 29, 14, 15, -1, -1 filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 pw_1: times 2 dw 1 dd 10 filter_rnd: dd 32 dd 1 dd 8 dd 11 filter_shift: times 2 dw 6 dd 0 times 2 dw 4 dd 9 pd_65536: dd 65536 pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44 db 16, 24, 20, 28, 48, 56, 52, 60 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 79, 79, 79 z_filter_k: dw 8, 8, 6, 6, 4, 4 dw 4, 4, 5, 5, 4, 4 dw 0, 0, 0, 0, 2, 2 pb_90: times 4 db 90 pw_15: times 2 dw 15 pw_16: times 2 dw 16 pw_17: times 2 dw 17 pw_24: times 2 dw 24 pw_31: times 2 dw 31 pw_32: times 2 dw 32 pw_63: times 2 dw 63 pw_64: times 2 dw 64 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pw_31806: times 2 dw 31806 pw_32640: times 2 dw 32640 pw_32672: times 2 dw 32672 pw_32704: times 2 dw 32704 pw_32735: times 2 dw 32735 pw_32736: times 2 dw 32736 %define pw_2 (z_xpos_mul+4* 2) %define pw_3 (z_xpos_mul+4* 4) %define pw_7 (z_xpos_mul+4*12) %define pw_0to31 (pw_1to32-2) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) 
%%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern dr_intra_derivative cextern filter_intra_taps SECTION .text %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m2 psubw m1, m0, m3 ; tldiff psubw m0, m%1 ; tdiff pabsw m1, m1 pabsw m0, m0 pcmpgtw k1, m0, m1 pminsw m0, m1 pcmpgtw k2, m%3, m0 vpblendmw m0{k1}, m%1, m3 vpblendmw m0{k2}, m2, m0 %endmacro INIT_ZMM avx512icl cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h %define base r6-ipred_paeth_16bpc_avx512icl_table lea r6, [ipred_paeth_16bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w4_loop: sub tlq, 16 vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 vextracti32x4 xm1, m0, 2 vextracti32x4 xm8, ym0, 1 vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm8 movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm8 movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: RET .w8: vbroadcasti32x4 m4, [tlq+2] movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w8_loop: sub tlq, 8 vpbroadcastq m2, [tlq] pshufb m2, m7 PAETH 4, 
5, 6 movu [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x8 m4, [tlq+2] movsldup m7, [base+ipred_shuf] psubw m5, m4, m3 pabsw m6, m5 .w16_loop: sub tlq, 4 vpbroadcastd m2, [tlq] pshufb m2, m7 PAETH 4, 5, 6 movu [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET .w32: movu m4, [tlq+2] psubw m5, m4, m3 pabsw m6, m5 .w32_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 movu [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET .w64: movu m4, [tlq+ 2] movu m7, [tlq+66] psubw m5, m4, m3 psubw m8, m7, m3 pabsw m6, m5 pabsw m9, m8 .w64_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 mova [dstq+64*0], m0 PAETH 7, 8, 9 mova [dstq+64*1], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m6, [tlq+hq*2] ; bottom lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastq m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w4_loop: vbroadcasti32x4 m3, [weightsq+hq*2] pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 vextracti32x4 xm0, m3, 3 vextracti32x4 xm1, ym3, 1 vextracti32x4 xm2, m3, 2 movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop .end: RET .w8: vbroadcasti32x4 m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w8_loop: vpbroadcastq m0, [weightsq+hq*2] 
; NOTE: this span opens inside the .w8 loop of the routine that precedes
; ipred_smooth_h_16bpc. Its entry point lies above this point, so its tail is
; reproduced unchanged.
    pshufb m0, m4
    pmulhrsw m0, m5
    paddw m0, m6
    vextracti32x4 [dstq+strideq*0], m0, 3
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    mova [dstq+stride3q ], xm0
    lea dstq, [dstq+strideq*4]
    add hq, 4
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x8 m5, [tlq+2] ; top
    movsldup m4, [ipred_shuf]
    psubw m5, m6 ; top - bottom
.w16_loop:
    vpbroadcastd m0, [weightsq+hq*2+0]
    vpbroadcastd m1, [weightsq+hq*2+4]
    pshufb m0, m4
    pshufb m1, m4
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    paddw m0, m6
    paddw m1, m6
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+strideq*1], ym0
    vextracti32x8 [dstq+strideq*2], m1, 1
    mova [dstq+stride3q ], ym1
    lea dstq, [dstq+strideq*4]
    add hq, 4
    jl .w16_loop
    RET
.w32:
    movu m5, [tlq+2]
    psubw m5, m6
.w32_loop:
    vpbroadcastw m0, [weightsq+hq*2+0]
    vpbroadcastw m1, [weightsq+hq*2+2]
    vpbroadcastw m2, [weightsq+hq*2+4]
    vpbroadcastw m3, [weightsq+hq*2+6]
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+stride3q ], m3
    lea dstq, [dstq+strideq*4]
    add hq, 4
    jl .w32_loop
    RET
.w64:
    movu m4, [tlq+ 2]
    movu m5, [tlq+66]
    psubw m4, m6
    psubw m5, m6
.w64_loop:
    vpbroadcastw m1, [weightsq+hq*2+0]
    vpbroadcastw m3, [weightsq+hq*2+2]
    pmulhrsw m0, m4, m1
    pmulhrsw m1, m5
    pmulhrsw m2, m4, m3
    pmulhrsw m3, m5
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    mova [dstq+strideq*1+64*0], m2
    mova [dstq+strideq*1+64*1], m3
    lea dstq, [dstq+strideq*2]
    add hq, 2
    jl .w64_loop
    RET

;------------------------------------------------------------------------------
; ipred_smooth_h_16bpc(dst, stride, tl, w, h)
;
; Horizontally blended intra predictor working on 16-bit (word) samples.
; Every output row is computed as
;     right + pmulhrsw(left[y] - right, weight[x])
; where "right" is the word at tl[w], the "left" samples are read at addresses
; below tl (tlq is lowered by h*2 bytes and indexed with the remaining byte
; count in hq), and the per-column weights are loaded from
; smooth_weights_1d_16bpc at byte offset w*2.
;
; State after the prologue:
;   m6        right sample broadcast to every word
;   hd/hq     remaining height in bytes (h*2), counted down
;   stride3q  stride*3
;   wq        address of the width-specific path; the table holds 32-bit
;             offsets relative to the table itself, indexed by tzcnt(w)
;------------------------------------------------------------------------------
cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
    lea r6, [$$]
    mov wd, wm
    movifnidn hd, hm
    vpbroadcastw m6, [tlq+wq*2] ; right
    tzcnt wd, wd
    add hd, hd
    movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4]
    sub tlq, hq
    lea stride3q, [strideq*3]
    lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
    jmp wq
; Width 4: one iteration consumes 8 left samples (16 bytes) and can emit up
; to 8 rows. The lane-to-row mapping follows from ipred_shuf, which is defined
; outside this span.
.w4:
    movsldup m4, [base+ipred_shuf]
    vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
.w4_loop:
    vbroadcasti32x4 m0, [tlq+hq-16] ; left
    pshufb m0, m4
    psubw m0, m6 ; left - right
    pmulhrsw m0, m5
    paddw m0, m6
    vextracti32x4 xm1, m0, 2
    vextracti32x4 xm2, ym0, 1
    vextracti32x4 xm3, m0, 3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm2
    movq [dstq+stride3q ], xm3
    sub hd, 8*2
    ; negative: fewer than 8 rows were left, so the upper halves are not stored
    jl .end
    lea dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    lea dstq, [dstq+strideq*4]
    ; flags still come from "sub hd, 8*2": lea and movhps do not modify them
    jg .w4_loop
.end:
    RET
; Width 8: 4 left samples -> 4 rows of 16 bytes per iteration.
.w8:
    movsldup m4, [base+ipred_shuf]
    vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
.w8_loop:
    vpbroadcastq m0, [tlq+hq-8] ; left
    pshufb m0, m4
    psubw m0, m6 ; left - right
    pmulhrsw m0, m5
    paddw m0, m6
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4*2
    jg .w8_loop
    RET
; Width 16: two registers, each holding two 32-byte rows; 4 rows per iteration.
.w16:
    movsldup m4, [base+ipred_shuf]
    vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastd m0, [tlq+hq-4]
    vpbroadcastd m1, [tlq+hq-8]
    pshufb m0, m4
    pshufb m1, m4
    psubw m0, m6
    psubw m1, m6
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    paddw m0, m6
    paddw m1, m6
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea dstq, [dstq+strideq*4]
    sub hq, 4*2
    jg .w16_loop
    RET
; Width 32: one full 64-byte register per row; the four left samples are
; duplicated with punpcklwd and then splatted per row with pshufd.
.w32:
    movu m5, [base+smooth_weights_1d_16bpc+32*2]
.w32_loop:
    vpbroadcastq m3, [tlq+hq-8]
    punpcklwd m3, m3
    psubw m3, m6
    pshufd m0, m3, q3333
    pshufd m1, m3, q2222
    pshufd m2, m3, q1111
    pshufd m3, m3, q0000
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+stride3q ], m3
    lea dstq, [dstq+strideq*4]
    sub hq, 4*2
    jg .w32_loop
    RET
; Width 64: two 64-byte registers per row (weights in m4/m5), 2 rows per
; iteration.
.w64:
    movu m4, [base+smooth_weights_1d_16bpc+64*2]
    movu m5, [base+smooth_weights_1d_16bpc+64*3]
.w64_loop:
    vpbroadcastw m1, [tlq+hq-2]
    vpbroadcastw m3, [tlq+hq-4]
    psubw m1, m6
    psubw m3, m6
    pmulhrsw m0, m4, m1
    pmulhrsw m1, m5
    pmulhrsw m2, m4, m3
    pmulhrsw m3, m5
    REPX {paddw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    mova [dstq+strideq*1+64*0], m2
    mova [dstq+strideq*1+64*1], m3
    lea dstq, [dstq+strideq*2]
    sub hq, 2*2
    jg .w64_loop
    RET

;------------------------------------------------------------------------------
; ipred_smooth_16bpc(dst, stride, tl, w, h)
;
; Two-dimensional variant. Every output sample adds a vertical term (pmaddwd
; of a row weight pair with interleaved {top, bottom} words) and a horizontal
; term (vpdpwssd of interleaved {left, right} words with a column weight pair)
; in 32-bit accumulators. The accumulators are narrowed back to words through
; the smooth_perm byte permute followed by pavgw against zero.
; NOTE(review): the exact scaling depends on the layout of
; smooth_weights_2d_16bpc and smooth_perm, both defined outside this span.
;
;   m13  right (word at tl[w])        m0   bottom (word at tl - h*2 bytes)
;   m14  smooth_perm                  m15  zero
;   k1   0x55555555 (even words)      k2   0x3333333333333333 (byte mask)
;   v_weightsq  smooth_weights_2d_16bpc + h*4 bytes, advanced 4 bytes per row
;------------------------------------------------------------------------------
cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
    lea r6, [$$]
    mov wd, wm
    movifnidn hd, hm
    vpbroadcastw m13, [tlq+wq*2] ; right
    tzcnt wd, wd
    add hd, hd
    movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4]
    mov r5d, 0x55555555
    sub tlq, hq
    mova m14, [base+smooth_perm]
    kmovd k1, r5d
    vpbroadcastw m0, [tlq] ; bottom
    mov r5, 0x3333333333333333
    pxor m15, m15
    lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq]
    kmovq k2, r5
    lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
    jmp wq
.w4:
    vpbroadcastq m5, [tlq+hq+2]
    movshdup m3, [base+ipred_shuf]
    movsldup m4, [base+ipred_shuf]
    vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4]
    lea stride3q, [strideq*3]
    punpcklwd m5, m0 ; top, bottom
.w4_loop:
    vbroadcasti32x4 m0, [v_weightsq]
    vpbroadcastq m2, [tlq+hq-8]
    mova m1, m13
    pshufb m0, m3
    pmaddwd m0, m5
    pshufb m1{k2}, m2, m4 ; left, right
    vpdpwssd m0, m1, m6
    vpermb m0, m14, m0
    pavgw ym0, ym15
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea dstq, [dstq+strideq*4]
    add v_weightsq, 4*4
    sub hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym5, [tlq+hq+2]
    movshdup m6, [base+ipred_shuf]
    movsldup m7, [base+ipred_shuf]
    pmovzxwd m5, ym5
    vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4]
    lea stride3q, [strideq*3]
    vpblendmw m5{k1}, m0, m5 ; top, bottom
.w8_loop:
    vpbroadcastq m0, [v_weightsq+0]
    vpbroadcastq m1, [v_weightsq+8]
    vpbroadcastd m3, [tlq+hq-4]
    vpbroadcastd m4, [tlq+hq-8]
    pshufb m0, m6
    pmaddwd m0, m5
    pshufb m1, m6
    pmaddwd m1, m5
    mova m2, m13
    pshufb m2{k2}, m3, m7 ; left, right
    mova m3, m13
    pshufb m3{k2}, m4, m7
    vpdpwssd m0, m2, m8
    vpdpwssd m1, m3, m8
    add v_weightsq, 4*4
    vpermt2b m0, m14, m1
    pavgw m0, m15
    mova
[dstq+strideq*0], xm0
; ^ operands of the store whose mnemonic ends the previous source line. This
;   span starts inside the .w8 loop of ipred_smooth_16bpc (see "jg .w8_loop").
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4*2
    jg .w8_loop
    RET
; Width 16 and wider share one scheme:
;   m5.. hold zero-extended top words with the bottom sample (m0) blended into
;        the words selected by k1, so pmaddwd with a broadcast row-weight pair
;        forms the vertical term;
;   a copy of m13 (right) with one left sample broadcast into the k1 words is
;        fed to vpdpwssd together with the column weights for the horizontal
;        term;
;   vpermt2b with m14 followed by pavgw against m15 produces the output words.
; hq counts the remaining rows in bytes (2 per row).
.w16:
    pmovzxwd m5, [tlq+hq+2]
    mova m6, [base+smooth_weights_2d_16bpc+16*4]
    vpblendmw m5{k1}, m0, m5 ; top, bottom
.w16_loop:
    vpbroadcastd m0, [v_weightsq+0]
    vpbroadcastd m1, [v_weightsq+4]
    pmaddwd m0, m5
    pmaddwd m1, m5
    mova m2, m13
    vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right
    mova m3, m13
    vpbroadcastw m3{k1}, [tlq+hq-4]
    vpdpwssd m0, m2, m6
    vpdpwssd m1, m3, m6
    add v_weightsq, 2*4
    vpermt2b m0, m14, m1
    pavgw m0, m15
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hq, 2*2
    jg .w16_loop
    RET
; Width 32: two top/bottom registers (m5, m6) and two column-weight registers
; (m7, m8); 2 rows per iteration.
.w32:
    pmovzxwd m5, [tlq+hq+ 2]
    pmovzxwd m6, [tlq+hq+34]
    mova m7, [base+smooth_weights_2d_16bpc+32*4]
    mova m8, [base+smooth_weights_2d_16bpc+32*6]
    vpblendmw m5{k1}, m0, m5 ; top, bottom
    vpblendmw m6{k1}, m0, m6
.w32_loop:
    vpbroadcastd m2, [v_weightsq+0]
    vpbroadcastd m3, [v_weightsq+4]
    pmaddwd m0, m5, m2
    pmaddwd m2, m6
    pmaddwd m1, m5, m3
    pmaddwd m3, m6
    mova m4, m13
    vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
    vpdpwssd m0, m4, m7
    vpdpwssd m2, m4, m8
    mova m4, m13
    vpbroadcastw m4{k1}, [tlq+hq-4]
    vpdpwssd m1, m4, m7
    vpdpwssd m3, m4, m8
    add v_weightsq, 2*4
    vpermt2b m0, m14, m2
    vpermt2b m1, m14, m3
    pavgw m0, m15
    pavgw m1, m15
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hq, 2*2
    jg .w32_loop
    RET
; Width 64: four top/bottom registers (m5-m8) and four column-weight registers
; (m9-m12); one row (two 64-byte stores) per iteration.
.w64:
    pmovzxwd m5, [tlq+hq+ 2]
    pmovzxwd m6, [tlq+hq+34]
    pmovzxwd m7, [tlq+hq+66]
    pmovzxwd m8, [tlq+hq+98]
    mova m9, [base+smooth_weights_2d_16bpc+64*4]
    vpblendmw m5{k1}, m0, m5 ; top, bottom
    mova m10, [base+smooth_weights_2d_16bpc+64*5]
    vpblendmw m6{k1}, m0, m6
    mova m11, [base+smooth_weights_2d_16bpc+64*6]
    vpblendmw m7{k1}, m0, m7
    mova m12, [base+smooth_weights_2d_16bpc+64*7]
    vpblendmw m8{k1}, m0, m8
.w64_loop:
    vpbroadcastd m3, [v_weightsq]
    mova m4, m13
    vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right
    pmaddwd m0, m5, m3
    pmaddwd m2, m6, m3
    pmaddwd m1, m7, m3
    pmaddwd m3, m8
    vpdpwssd m0, m4, m9
    vpdpwssd m2, m4, m10
    vpdpwssd m1, m4, m11
    vpdpwssd m3, m4, m12
    add v_weightsq, 1*4
    vpermt2b m0, m14, m2
    vpermt2b m1, m14, m3
    pavgw m0, m15
    pavgw m1, m15
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, strideq
    sub hd, 1*2
    jg .w64_loop
    RET

; Scratch register alias used by the directional predictors below:
; DECLARE_REG_TMP maps t0 to r4 on WIN64 and to r8 otherwise.
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

;------------------------------------------------------------------------------
; ipred_z1_16bpc(dst, stride, tl, w, h, angle)
;
; Directional predictor that reads the top edge only (tlq is advanced by one
; word on entry). dx is the word found in dr_intra_derivative at byte offset
; (angle & 0x7e). For every output sample:
;     base_x = xpos >> 6
;     out    = top[base_x] + pmulhrsw(top[base_x+1] - top[base_x], frac << 9)
; with xpos advancing by dx per row.
;
; m15 holds pw_31806. Assuming that constant is the word 31806 (0x7C3E), it
; keeps bits 1-5 and 10-14, so one mask serves both "frac, then shift left by
; 9" and values that are already shifted by 9.
;
; After the add/xor on angled, the existing branch comments treat a set 0x400
; bit as "edge filter disabled", and angled >> 8 as the is_sm-derived row
; index into z_filter_t0.
; base is redefined here as r7-z_filter_t0.
;------------------------------------------------------------------------------
cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
%define base r7-z_filter_t0
    lea r7, [z_filter_t0]
    tzcnt wd, wm
    movifnidn angled, anglem
    lea t0, [dr_intra_derivative]
    movsxd wq, [base+ipred_z1_16bpc_avx512icl_table+wq*4]
    add tlq, 2
    mov dxd, angled
    and dxd, 0x7e
    add angled, 165 ; ~90
    movzx dxd, word [t0+dxq]
    lea wq, [base+ipred_z1_16bpc_avx512icl_table+wq]
    movifnidn hd, hm
    xor angled, 0x4ff ; d = 90 - angle
    vpbroadcastd m15, [base+pw_31806]
    jmp wq
.w4:
    vpbroadcastw m5, [tlq+14]
    vinserti32x4 m5, [tlq], 0
    cmp angleb, 40
    jae .w4_no_upsample
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    call .upsample_top
    vpbroadcastq m0, [base+z_xpos_off1b]
    jmp .w4_main2
.w4_no_upsample:
    test angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea r3d, [hq+3]
    vpbroadcastb xm0, r3d
    vpbroadcastb xm1, angled
    shr angled, 8 ; is_sm << 1
    vpcmpeqb k1, xm0, [base+z_filter_wh]
    vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw r5d, k1
    ; r5d == 0: no filter strength selected, use the unfiltered edge
    test r5d, r5d
    jz .w4_main
    call .w16_filter
    mov r2d, 9
    cmp hd, 4
    cmovne r3d, r2d
    vpbroadcastw m6, r3d
    pminuw m6, [base+pw_0to31]
    vpermw m5, m6, m5
.w4_main:
    vpbroadcastq m0, [base+z_xpos_off1a]
.w4_main2:
    movsldup m3, [base+z_xpos_mul]
    vpbroadcastw m4, dxd
    lea r2, [strideq*3]
    pmullw m3, m4
    vshufi32x4 m6, m5, m5, q3321
    psllw m4, 3 ; dx*8
    paddsw m3, m0 ; xpos
    palignr m6, m5, 2 ; top+1
.w4_loop:
    psrlw m1, m3, 6 ; base_x
    pand m2, m15, m3 ; frac
    vpermw m0, m1, m5 ; top[base_x]
    vpermw m1, m1, m6 ; top[base_x+1]
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0],
xm0
; ^ last operand of the store begun on the previous source line. This span
;   starts inside the .w4 loop of ipred_z1_16bpc (see "jg .w4_loop" below).
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r2 ], xm1
    sub hd, 8
    ; negative: fewer than 8 rows were requested, skip the second group of four
    jl .w4_end
    vextracti32x4 xm1, m0, 2
    paddsw m3, m4 ; xpos += dx
    lea dstq, [dstq+strideq*4]
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm0
    movhps [dstq+r2 ], xm0
    lea dstq, [dstq+strideq*4]
    ; flags still come from "sub hd, 8": the SIMD ops and lea leave them alone
    jg .w4_loop
.w4_end:
    RET
; Helper (reached via call): doubles the resolution of the top edge in m5 and
; doubles dx. With the tap sums labelled below it evaluates
;     t   = (b+c) + (((b+c) - (a+d)) >> 3)
;     out = min(pixel_max, (max(t, 0) + 1) >> 1)
; The gather pattern for a..d comes from z_upsample (defined elsewhere).
.upsample_top:
    vinserti32x4 m5, [tlq-16], 3
    mova m3, [base+z_upsample]
    vpbroadcastd m4, [base+pd_65536]
    add dxd, dxd
    vpermw m0, m3, m5
    paddw m3, m4
    vpermw m1, m3, m5
    paddw m3, m4
    vpermw m2, m3, m5
    paddw m3, m4
    vpermw m3, m3, m5
    vpbroadcastw m5, r9m ; pixel_max
    paddw m1, m2 ; b+c
    paddw m0, m3 ; a+d
    psubw m0, m1, m0
    psraw m0, 3
    pxor m2, m2
    paddw m0, m1
    pmaxsw m0, m2
    pavgw m0, m2
    pminsw m5, m0
    ret
.w8:
    lea r3d, [angleq+216]
    movu ym5, [tlq]
    mov r3b, hb
    movu m10, [base+pw_0to31]
    cmp r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea r3d, [hq+7]
    vpbroadcastw m6, r3d
    add r3d, r3d
    pminuw m6, m10
    vpermw m5, m6, m5
    call .upsample_top
    vbroadcasti32x4 m0, [base+z_xpos_off1b]
    jmp .w8_main2
.w8_no_upsample:
    lea r3d, [hq+7]
    vpbroadcastb ym0, r3d
    and r3d, 7
    or r3d, 8 ; imin(h+7, 15)
    vpbroadcastw m6, r3d
    pminuw m6, m10
    vpermw m5, m6, m5
    test angled, 0x400
    jnz .w8_main
    vpbroadcastb ym1, angled
    shr angled, 8
    vpcmpeqb k1, ym0, [base+z_filter_wh]
    mova xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb k1{k1}, ym1, ym0
    kmovd r5d, k1
    test r5d, r5d
    jz .w8_main
    call .w16_filter
    cmp hd, r3d
    jl .w8_filter_end
    pminud m6, m10, [base+pw_17] {1to16}
    add r3d, 2
.w8_filter_end:
    vpermw m5, m6, m5
.w8_main:
    vbroadcasti32x4 m0, [base+z_xpos_off1a]
.w8_main2:
    movshdup m3, [base+z_xpos_mul]
    vpbroadcastw m4, dxd
    ; r3d becomes a countdown in xpos units: (edge limit << 6) - dx, reduced by
    ; 4*dx per iteration; once it is <= 0 the rest is filled by .w8_end_loop
    shl r3d, 6
    lea r2, [strideq*3]
    pmullw m3, m4
    vshufi32x4 m6, m5, m5, q3321
    sub r3d, dxd
    psllw m4, 2 ; dx*4
    shl dxd, 2
    paddsw m3, m0 ; xpos
    palignr m6, m5, 2 ; top+1
.w8_loop:
    psrlw m1, m3, 6 ; base_x
    pand m2, m15, m3 ; frac
    vpermw m0, m1, m5 ; top[base_x]
    vpermw m1, m1, m6 ; top[base_x+1]
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2 ], m0, 3
    sub hd, 4
    jz .w8_end
    paddsw m3, m4 ; xpos += dx
    lea dstq, [dstq+strideq*4]
    sub r3d, dxd
    jg .w8_loop
    ; remaining rows are filled with the top 128-bit lane of the edge register
    vextracti32x4 xm5, m5, 3
.w8_end_loop:
    mova [dstq+strideq*0], xm5
    mova [dstq+strideq*1], xm5
    mova [dstq+strideq*2], xm5
    mova [dstq+r2 ], xm5
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8_end_loop
.w8_end:
    RET
; Helper (reached via call): smooths the edge held in m5 (continued in m6).
; Input r5d is the comparison mask from the caller; popcnt(r5d)-1 selects a
; column of z_filter_k, whose three rows (12 bytes apart) give the coefficient
; pairs m7, m8 and m9. m7 scales the sample itself; m8 and m9 scale sums of
; shifted copies of the edge (presumably the +-1 and +-2 neighbours). The
; total is shifted right by 3 and then pavgw'ed with zero.
.w16_filter:
    vpbroadcastw m1, [tlq-2]
    popcnt r5d, r5d
    valignq m3, m6, m5, 2
    vpbroadcastd m7, [base+z_filter_k+(r5-1)*4+12*0]
    valignq m1, m5, m1, 6
    vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
    palignr m2, m3, m5, 2
    vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
    palignr m0, m5, m1, 14
    pmullw m7, m5
    palignr m3, m5, 4
    paddw m0, m2
    palignr m5, m1, 12
    pmullw m0, m8
    paddw m5, m3
    pmullw m5, m9
    pxor m1, m1
    paddw m0, m7
    paddw m5, m0
    psrlw m5, 3
    pavgw m5, m1
    ret
.w16:
    lea r3d, [hq+15]
    vpbroadcastb ym0, r3d
    and r3d, 15
    or r3d, 16 ; imin(h+15, 31)
    vpbroadcastw m11, r3d
    pminuw m10, m11, [base+pw_0to31]
    vpbroadcastw m6, [tlq+r3*2]
    vpermw m5, m10, [tlq]
    test angled, 0x400
    jnz .w16_main
    vpbroadcastb ym1, angled
    shr angled, 8
    vpcmpeqb k1, ym0, [base+z_filter_wh]
    mova xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb k1{k1}, ym1, ym0
    kmovd r5d, k1
    test r5d, r5d
    jz .w16_main
    call .w16_filter
    cmp hd, 16
    jg .w16_filter_h32
    vpermw m6, m11, m5
    vpermw m5, m10, m5
    jmp .w16_main
.w16_filter_h32:
    ; word 0 of m6 = (tl[30] + 7*tl[31] + 4) >> 3, computed in scalar registers
    movzx r3d, word [tlq+62]
    movzx r2d, word [tlq+60]
    lea r2d, [r2+r3*8+4]
    sub r2d, r3d
    mov r3d, 1
    shr r2d, 3
    kmovb k1, r3d
    movd xm0, r2d
    or r3d, 32
    vmovdqu16 m6{k1}, m0
; From here on r7 no longer holds the table base: it saves rsp while the edge
; (m5, m6) is spilled to a 64-byte aligned scratch area, so that each row can
; be fetched with unaligned loads at [rsp+base_x*2]. No base-relative operand
; is used after this point in the w16 path.
.w16_main:
    ; rotate right by 23 == rotate left by 9: low word is dx << 9 (mod 2^16)
    rorx r2d, dxd, 23
    mov r7, rsp
    and rsp, ~63
    vpbroadcastw m3, r2d
    sub rsp, 64*2
    mov r2d, dxd
    paddw m4, m3, m3
    mova [rsp+64*0], m5
    vinserti32x8 m3, ym4, 1
    mova [rsp+64*1], m6
    shl r3d, 6
.w16_loop:
    lea r5d, [r2+dxq]
    shr r2d, 6
    movu ym0, [rsp+r2*2]
    movu ym1, [rsp+r2*2+2]
    lea r2d, [r5+dxq]
    shr r5d, 6
    vinserti32x8 m0, [rsp+r5*2], 1
    vinserti32x8 m1, [rsp+r5*2+2], 1
    pand m2, m15, m3 ; frac << 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub hd, 2
    jz .w16_end
    paddw m3, m4
    lea dstq, [dstq+strideq*2]
    ; r3d = edge limit << 6: keep interpolating while xpos is below it
    cmp r2d, r3d
    jl .w16_loop
    punpckhqdq ym6, ym6
.w16_end_loop:
    mova [dstq+strideq*0], ym6
    mova [dstq+strideq*1], ym6
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_end_loop
.w16_end:
    mov rsp, r7
    RET
.w32:
    lea r3d, [hq+31]
    movu m7, [tlq+64*0]
    and r3d, 31
    vpbroadcastw m11, r3d
    or r3d, 32 ; imin(h+31, 63)
    pminuw m10, m11, [base+pw_0to31]
    vpbroadcastw m9, [tlq+r3*2]
    vpermw m8, m10, [tlq+64*1]
    test angled, 0x400
    jnz .w32_main
    ; inline edge filter with fixed arithmetic (pw_3 bias, pavgw, >> 2); unlike
    ; .w16_filter no z_filter_k lookup is involved
    vpbroadcastd m5, [base+pw_3]
    mov r5d, ~1
    movu m3, [tlq-2]
    kmovd k1, r5d
    valignq m2, m8, m7, 6
    paddw m7, m3
    vmovdqu16 m3{k1}, [tlq-4]
    valignq m4, m9, m8, 2
    paddw m3, m5
    paddw m7, [tlq+2]
    palignr m1, m8, m2, 14
    pavgw m3, [tlq+4]
    palignr m2, m8, m2, 12
    paddw m7, m3
    palignr m3, m4, m8, 2
    psrlw m7, 2
    palignr m4, m8, 4
    paddw m8, m1
    paddw m2, m5
    paddw m8, m3
    pavgw m2, m4
    paddw m8, m2
    psrlw m8, 2
    cmp hd, 64
    je .w32_filter_h64
    vpermw m9, m11, m8
    vpermw m8, m10, m8
    jmp .w32_main
.w32_filter_h64:
    ; (tl[62] + 7*tl[63] + 4) >> 3, blended into m9 through k1
    movzx r3d, word [tlq+126]
    movzx r2d, word [tlq+124]
    lea r2d, [r2+r3*8+4]
    sub r2d, r3d
    mov r3d, 65
    shr r2d, 3
    movd xm0, r2d
    vpblendmw m9{k1}, m0, m9
; Same stack-spill scheme as the w16 path, with r7 again reused to save rsp.
.w32_main:
    rorx r2d, dxd, 23
    mov r7, rsp
    and rsp, ~63
    vpbroadcastw m5, r2d
    sub rsp, 64*4
    mov r2d, dxd
    mova [rsp+64*0], m7
    shl r3d, 6
    mova [rsp+64*1], m8
    mova m6, m5
    mova [rsp+64*2], m9
    punpckhqdq m9, m9
    mova [rsp+64*3], ym9
.w32_loop:
    lea r5d, [r2+dxq]
    shr r2d, 6
    movu m0, [rsp+r2*2]
    movu m2, [rsp+r2*2+2]
    lea r2d, [r5+dxq]
    shr r5d, 6
    movu m1, [rsp+r5*2]
    movu m3, [rsp+r5*2+2]
    pand m4, m15, m5
    paddw m5, m6
    psubw m2, m0
    pmulhrsw m2, m4
    pand m4, m15, m5
    psubw m3, m1
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jz .w32_end
    paddw m5, m6
    lea dstq, [dstq+strideq*2]
    cmp r2d, r3d
    jl .w32_loop
.w32_end_loop:
    mova [dstq+strideq*0], m9
    mova [dstq+strideq*1], m9
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_end_loop
.w32_end:
    mov rsp, r7
    RET
; Helper (reached via call) for the w64 path: filters the first two 64-byte
; edge registers in place (m7, m8, final >> 2 included) and leaves the
; not-yet-shifted sum for the third one in m0; m4 keeps pw_3 for the caller.
.w64_filter96:
    vpbroadcastd m4, [base+pw_3]
    mov r5d, ~1
    movu m0, [tlq-2]
    kmovd k1, r5d
    paddw m7, m0
    vmovdqu16 m0{k1}, [tlq-4]
    paddw m0, m4
    paddw m7, [tlq+2]
    pavgw m0, [tlq+4]
    valignq m1, m9, m8, 6
    paddw m8, [tlq+62]
    paddw m2, m4, [tlq+60]
    valignq m3, m10, m9, 2
    paddw m8, [tlq+66]
    pavgw m2, [tlq+68]
    paddw m7, m0
    palignr m0, m9, m1, 14
    paddw m8, m2
    palignr m1, m9, m1, 12
    psrlw m7, 2
    palignr m2, m3, m9, 2
    psrlw m8, 2
    palignr m3, m9, 4
    paddw m0, m9
    paddw m1, m4
    paddw m0, m2
    pavgw m1, m3
    paddw m0, m1
    ret
.w64:
    movu m7, [tlq+64*0]
    lea r3d, [hq-1]
    movu m8, [tlq+64*1]
    vpbroadcastw m11, [tlq+r3*2+128]
    movu m9, [tlq+64*2]
    cmp hd, 64
    je .w64_h64
    vpbroadcastw m13, r3d
    or r3d, 64
    pminuw m12, m13, [base+pw_0to31]
    mova m10, m11
    vpermw m9, m12, m9
    test angled, 0x400
    jnz .w64_main
    call .w64_filter96
    psrlw m0, 2
    vpermw m9, m12, m0
    vpermw m10, m13, m0
    mova m11, m10
    jmp .w64_main
.w64_h64:
    movu m10, [tlq+64*3]
    or r3d, 64
    test angled, 0x400
    jnz .w64_main
    call .w64_filter96
    valignq m1, m10, m9, 6
    valignq m3, m11, m10, 2
    vpbroadcastd m11, [base+pw_63]
    psrlw m9, m0, 2
    palignr m0, m10, m1, 14
    palignr m1, m10, m1, 12
    palignr m2, m3, m10, 2
    palignr m3, m10, 4
    paddw m10, m0
    paddw m1, m4
    paddw m10, m2
    pavgw m1, m3
    paddw m10, m1
    psrlw m10, 2
    vpermw m11, m11, m10
; w64 keeps xpos relative to the edge limit: r5 points at the limit inside the
; spilled edge and r2 = dx - (limit << 6) stays negative while samples can
; still be interpolated ("add r2, dxq / jl").
.w64_main:
    rorx r2d, dxd, 23
    mov r7, rsp
    and rsp, ~63
    vpbroadcastw m5, r2d
    sub rsp, 64*6
    mova [rsp+64*0], m7
    mov r2d, dxd
    mova [rsp+64*1], m8
    lea r5, [rsp+r3*2]
    mova [rsp+64*2], m9
    shl r3d, 6
    mova [rsp+64*3], m10
    sub r2, r3
    mova [rsp+64*4], m11
    mova m6, m5
    mova [rsp+64*5], m11
.w64_loop:
    mov r3, r2
    sar r3, 6
    movu m0, [r5+r3*2+64*0]
    movu m2, [r5+r3*2+64*0+2]
    movu m1, [r5+r3*2+64*1]
    movu m3, [r5+r3*2+64*1+2]
    pand m4, m15, m5
    psubw m2, m0
    pmulhrsw m2, m4
    psubw m3, m1
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jz .w64_end
    paddw m5, m6
    add dstq, strideq
    add r2, dxq
    jl .w64_loop
.w64_end_loop:
    mova [dstq+64*0], m11
    mova [dstq+64*1], m11
    add dstq, strideq
    dec hd
    jg .w64_end_loop
.w64_end:
    mov rsp, r7
    RET

;------------------------------------------------------------------------------
; ipred_z2_16bpc(dst, stride, tl, w, h, angle, ...)
;
; Directional predictor that can read both the top and the left edge. Two step
; sizes are looked up in dr_intra_derivative: dy (commented "angle - 90") and
; dx (commented "180 - angle"); dy is negated afterwards.
;
; State on dispatch:
;   m12  word at tl broadcast            m4   top row starting at tl+2
;   m7   64 bytes below tl, permuted with pw_31to0 (presumably reversing the
;        word order so indices grow away from the corner - TODO confirm)
;   m14  pw_31806 mask                   m15  pw_1
;   r7   z_filter_t0 (base register); it serves as a scratch pointer first
; The 0x400 bit of angled is inverted on entry; the branch comments in the
; width paths read a set bit as "edge filter disabled".
;------------------------------------------------------------------------------
cglobal ipred_z2_16bpc, 3, 9, 16, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt wd, wm
    movifnidn angled, anglem
    lea dxq, [dr_intra_derivative-90]
    movzx dyd, angleb
    xor angled, 0x400
    mov r7, dxq
    sub dxq, dyq
    movifnidn hd, hm
    and dyd, ~1
    vpbroadcastw m12, [tlq]
    and dxq, ~1
    movzx dyd, word [r7+dyq] ; angle - 90
    lea r7, [z_filter_t0]
    movzx dxd, word [dxq+270] ; 180 - angle
    mova m0, [base+pw_31to0]
    movsxd wq, [base+ipred_z2_16bpc_avx512icl_table+wq*4]
    movu m4, [tlq+2]
    neg dyd
    vpermw m7, m0, [tlq-64*1]
    lea wq, [base+ipred_z2_16bpc_avx512icl_table+wq]
    vpbroadcastd m14, [base+pw_31806]
    vpbroadcastd m15, [base+pw_1]
    jmp wq
.w4:
    movq xm3, [tlq]
    vpbroadcastq m8, [base+pw_1to32]
    test angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea r3d, [hq+2]
    add angled, 1022
    shl r3d, 6
    test r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    pshuflw xm0, xm4, q3321
    sub angled, 1075 ; angle - 53
    lea r3d, [hq+3]
    call .upsample_above
    punpcklwd xm4, xm3, xm4
    palignr xm3, xm4, xm12, 14
    jmp .w4_main
.w4_upsample_left:
    call .upsample_left
    movsldup m1, [base+z_xpos_mul]
    paddw m1, m1
    jmp .w4_main2
.w4_no_upsample_above:
    lea r3d, [hq+3]
    vpbroadcastd ym0, [base+pw_3]
    sub angled, 1112 ; angle - 90
    call .filter_above2
    lea r3d, [hq+2]
    add angled, 973 ; angle + 883
    palignr xm3, xm4, xm12, 14
    shl r3d, 6
    test r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    call .filter_left16
.w4_main:
    movsldup m1, [base+z_xpos_mul]
    psllw m15, 3
.w4_main2:
    vpbroadcastq m0, [base+pw_1to32]
    vpbroadcastw m11, dxd
    movsldup m2, [base+z_xpos_mul]
    vpbroadcastw m13, dyd
    vpbroadcastd m5, [tlq-2]
    psllw m10, m8, 6
    valignq m5, m7, m5, 6
    pmullw m2, m11
    psubw m10, m2 ; xpos
    pmullw m13, m0 ; ypos
    palignr m5, m7, m5, 14
    psrlw m12, m13, 6
    psllw m13, 9
    paddw m12, m1 ; base_y
    pand m13, m14 ; frac_y << 9
    psllw m11, 3
    lea r5, [strideq*3]
.w4_loop:
    psrlw m1, m10, 6 ; base_x
    pand m2, m14, m10 ; frac
    vpermw m0, m1, m3 ;
top[base_x]
; ^ remainder of the comment opened at the end of the previous source line.
;   This span starts inside the .w4 loop of ipred_z2_16bpc (see "jg .w4_loop").
;
; Per-sample selection used by all width paths: k1 = sign bits of xpos
; (vpmovw2m), i.e. base_x < 0. Those words take the interpolation of the left
; edge (left[base_y], left[base_y+1], frac_y) in place of the top-edge one.
;
; NOTE(review): stack arguments are addressed as rNm. Helpers reached through
; "call" use an index one higher than code running at entry depth (max_w is
; r7m in .filter_above2 but r6m in .w32/.w64). Presumably this compensates for
; the pushed return address - confirm against the x86inc stack-offset rules.
    vpermw m1, m1, m4 ; top[base_x+1]
    vpmovw2m k1, m10 ; base_x < 0
    psllw m2, 9
    vpermw m0{k1}, m12, m5 ; left[base_y]
    vpermw m1{k1}, m12, m7 ; left[base_y+1]
    vmovdqu16 m2{k1}, m13
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r5 ], xm1
    sub hd, 8
    ; negative: fewer than 8 rows were requested
    jl .w4_end
    vextracti32x8 ym0, m0, 1
    psubw m10, m11 ; base_x -= dx
    lea dstq, [dstq+strideq*4]
    paddw m12, m15 ; base_y++
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r5 ], xm1
    lea dstq, [dstq+strideq*4]
    ; flags still come from "sub hd, 8"
    jg .w4_loop
.w4_end:
    RET
; Helper: upsamples the top edge (result in xm3, clamped to [0, pixel_max]),
; doubles dx and the x index vector m8, prepares the left-filter mask in k2
; and tail-jumps into .filter_left16b, whose "ret" returns to the caller.
.upsample_above: ; w4/w8
    mova ym9, [base+pw_1to32]
    palignr xm1, xm4, xm12, 12
    paddw xm3, xm4 ; b+c
    xor angled, 0x7f ; 180 - angle
    paddw xm0, xm1 ; a+d
    vpbroadcastw xm1, r9m ; pixel_max
    vpbroadcastb xm11, r3d
    psubw xm0, xm3, xm0
    vpbroadcastb xm2, angled
    psraw xm0, 3
    shr angled, 8
    paddw xm3, xm0
    pxor xm0, xm0
    vpcmpeqb k2, xm11, [base+z_filter_wh]
    pmaxsw xm3, xm0
    add dxd, dxd
    pavgw xm3, xm0
    vpcmpgtb k2{k2}, xm2, [base+z_filter_t0+angleq*8]
    pminsw xm3, xm1
    paddw m8, m8
    jmp .filter_left16b
; Helper: upsamples the left edge in xm7/ym7 with the same arithmetic
; ((b+c) + (((b+c)-(a+d)) >> 3), clamp at 0, rounding halve, clamp at
; pixel_max), interleaves the result with the original samples and doubles dy.
.upsample_left: ; h4/h8
    lea r3d, [hq-1]
    palignr xm2, xm7, xm12, 14
    vpbroadcastw xm0, r3d
    palignr xm1, xm7, xm12, 12
    pminuw xm0, xm9
    paddw xm2, xm7 ; b+c
    vpermw xm0, xm0, xm7
    add dyd, dyd
    paddw xm0, xm1 ; a+d
    vpbroadcastw xm1, r9m ; pixel_max
    psubw xm0, xm2, xm0
    psraw xm0, 3
    paddw xm2, xm0
    pxor xm0, xm0
    pmaxsw xm2, xm0
    pavgw xm2, xm0
    pminsw xm2, xm1
    punpckhwd xm0, xm2, xm7
    punpcklwd xm7, xm2, xm7
    vinserti32x4 ym7, xm0, 1
    ret
; Helper: conditionally smooths the top edge in ym4. The filter strength is
; popcnt of the z_filter_wh / z_filter_t0 comparison mask; zero means no
; filtering. Only words whose index is below max_w are replaced (k1 from
; vpcmpgtw). Leaves k2, ym10 and xm11 set up for .filter_left16.
.filter_above:
    sub angled, 90
.filter_above2:
    vpbroadcastb ym1, r3d
    vpbroadcastb ym10, angled
    mov r3d, angled
    shr r3d, 8
    vpcmpeqb k2, ym1, [base+z_filter_wh]
    mova xm11, [base+z_filter_t0+r3*8]
    vpcmpgtb k1{k2}, ym10, ym11
    mova m9, [base+pw_1to32]
    kmovd r3d, k1
    test r3d, r3d
    jz .filter_end
    pminuw ym0, ym9
    popcnt r3d, r3d
    vpbroadcastd ym6, r7m ; max_w
    kxnorw k1, k1, k1
    vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0]
    kaddw k1, k1, k1 ; ~1
    vpbroadcastd ym13, [base+z_filter_k+(r3-1)*4+12*1]
    vpermw ym2, ym0, ym4 ; +1
    pmullw ym5, ym4
    paddw ym1, ym2, ym3
    vmovdqu16 m3{k1}, [tlq-2] ; -2
    vpermw ym2, ym0, ym2 ; +2
    vpbroadcastd ym0, [base+z_filter_k+(r3-1)*4+12*2]
    pmullw ym1, ym13
    movu m13, [base+pw_0to31]
    paddw ym2, ym3
    packssdw ym6, ym6
    pmullw ym2, ym0
    paddw ym1, ym5
    vpcmpgtw k1, ym6, ym13
    paddw ym1, ym2
    pxor ym2, ym2
    psrlw ym1, 3
    pavgw ym4{k1}, ym1, ym2
.filter_end:
    ret
; Helper: conditionally smooths up to 16 left-edge samples in ym7, using the
; same z_filter_k scheme. Words with index >= max_h are left untouched.
; .filter_left16 derives its mask from state left by .filter_above2 (k2, ym10,
; xm11); .filter_left16b expects the mask already in k2.
.filter_left16:
    vpbroadcastd ym1, [base+pb_90]
    psubb ym1, ym10
    vpcmpgtb k2{k2}, ym1, ym11
.filter_left16b:
    kmovd r3d, k2
    test r3d, r3d
    jz .filter_end
    lea r5d, [hq-1]
    vinserti32x4 ym0, ym12, xm7, 1
    vpbroadcastw ym1, r5d
    popcnt r3d, r3d
    vpbroadcastd ym6, r8m ; max_h
    pminuw ym9, ym1
    vpbroadcastd ym5, [base+z_filter_k+(r3-1)*4+12*0]
    vpermw ym2, ym9, ym7 ; +1
    vpbroadcastd ym10, [base+z_filter_k+(r3-1)*4+12*1]
    palignr ym1, ym7, ym0, 14 ; -1
    pmullw ym5, ym7
    palignr ym0, ym7, ym0, 12 ; -2
    paddw ym1, ym2
    vpermw ym2, ym9, ym2 ; +2
    vpbroadcastd ym9, [base+z_filter_k+(r3-1)*4+12*2]
    pmullw ym1, ym10
    paddw ym2, ym0
    packssdw ym6, ym6
    pmullw ym2, ym9
    paddw ym1, ym5
    vpcmpgtw k1, ym6, [base+pw_0to31]
    paddw ym1, ym2
    pxor ym2, ym2
    psrlw ym1, 3
    pavgw ym7{k1}, ym1, ym2
    ret
; Helper: left-edge filter for h >= 32, using fixed arithmetic (pw_3 bias,
; pavgw, >> 2) on m7 and, for h == 64, also on m8. Shorter heights are
; forwarded to .filter_left16. .filter_left32 is a second entry point that
; expects m0, m5 and m13 to be set up by the caller.
.filter_left:
    cmp hd, 32
    jl .filter_left16
    vpbroadcastd m5, [base+pw_3]
    pminud m0, m9, [base+pw_31] {1to16}
.filter_left32:
    vpbroadcastd m6, r8m ; max_h
    valignq m2, m7, m12, 6
    packssdw m6, m6
    palignr m1, m7, m2, 14 ; -1
    paddw m1, m7
    palignr m2, m7, m2, 12 ; -2
    vpcmpgtw k1, m6, m13
    paddw m2, m5
    cmp hd, 64
    je .filter_left64
    lea r3d, [hq-1]
    vpbroadcastw m10, r3d
    pminuw m0, m10
    vpermw m10, m0, m7 ; +1
    paddw m1, m10
    vpermw m10, m0, m10 ; +2
    pavgw m2, m10
    paddw m1, m2
    vpsrlw m7{k1}, m1, 2
    ret
.filter_left64:
    valignq m10, m8, m7, 2
    vpaddd m13, [base+pw_32] {1to16}
    palignr m11, m10, m7, 2 ; +1
    paddw m1, m11
    palignr m11, m10, m7, 4 ; +2
    valignq m10, m8, m7, 6
    pavgw m11, m2
    vpermw m2, m0, m8 ; 32+1
    paddw m1, m11
    vpsrlw m7{k1}, m1, 2
    palignr m1, m8, m10, 14 ; 32-1
    paddw m1, m8
    palignr m10, m8, m10, 12 ; 32-2
    paddw m1, m2
    vpermw m2, m0, m2 ; 32+2
    paddw m10, m5
    vpcmpgtw k1, m6, m13
    pavgw m2, m10
    paddw m1, m2
    vpsrlw m8{k1}, m1, 2
    ret
.w8:
    mova xm3, [tlq]
    vbroadcasti32x4 m8, [base+pw_1to32]
    test angled, 0x400
    jnz .w8_main
    lea r3d, [angleq+126]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    psrldq xm0, xm4, 2
    sub angled, 53
    pshufhw xm0, xm0, q2210
    lea r3d, [hq+7]
    call .upsample_above
    punpcklwd xm0, xm3, xm4
    punpckhwd xm4, xm3, xm4
    vinserti32x4 ym3, ym12, xm0, 1
    vinserti32x4 ym4, ym0, xm4, 1
    palignr ym3, ym4, ym3, 14
    jmp .w8_main
.w8_upsample_left:
    call .upsample_left
    movshdup m1, [base+z_xpos_mul]
    psllw m15, 3
    paddw m1, m1
    jmp .w8_main2
.w8_no_upsample_above:
    lea r3d, [hq+7]
    vpbroadcastd ym0, [base+pw_7]
    call .filter_above
    lea r3d, [angleq-51]
    mov r3b, hb
    palignr xm3, xm4, xm12, 14
    cmp r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    call .filter_left
.w8_main:
    movshdup m1, [base+z_xpos_mul]
    psllw m15, 2
.w8_main2:
    vbroadcasti32x4 m0, [base+pw_1to32]
    vpbroadcastw m11, dxd
    movshdup m2, [base+z_xpos_mul]
    vpbroadcastw m13, dyd
    psllw m10, m8, 6
    valignq m5, m7, m12, 6
    pmullw m2, m11
    psubw m10, m2 ; xpos
    pmullw m13, m0 ; ypos
    palignr m5, m7, m5, 14
    psrlw m12, m13, 6
    psllw m13, 9
    ; r2d is a scalar x position reduced by 4*dx per iteration:
    ;   r2d >= 0   -> the left-edge blend is skipped (.w8_toponly)
    ;   r2d <  r3d -> only the left edge is needed (.w8_leftonly_loop)
    mov r2d, 1<<6
    paddw m12, m1 ; base_y
    lea r3d, [dxq-(8<<6)] ; left-only threshold
    pand m13, m14 ; frac_y << 9
    shl dxd, 2
    psllw m11, 2
    lea r5, [strideq*3]
.w8_loop:
    psrlw m1, m10, 6
    pand m2, m14, m10
    vpermw m0, m1, m3
    vpermw m1, m1, m4
    psllw m2, 9
    sub r2d, dxd
    jge .w8_toponly
    vpmovw2m k1, m10
    vpermw m0{k1}, m12, m5
    vpermw m1{k1}, m12, m7
    vmovdqu16 m2{k1}, m13
.w8_toponly:
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r5 ], m0, 3
    sub hd, 4
    jz .w8_end
    psubw m10, m11 ; base_x -= dx
    lea dstq, [dstq+strideq*4]
    paddw m12, m15 ; base_y++
    cmp r2d, r3d
    jge .w8_loop
.w8_leftonly_loop:
    vpermw m0, m12, m5
    vpermw m1, m12, m7
    psubw m1, m0
    pmulhrsw m1, m13
    paddw m12, m15
    paddw m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r5 ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8_leftonly_loop
.w8_end:
    RET
; Width 16: two rows per register. The left edge spans two registers (m7:m8,
; with their shifted-by-one copies in m5:m6), so the lookups use vpermt2w.
.w16:
    mova ym3, [tlq]
    vpermw m8, m0, [tlq-64*2]
    test angled, 0x400
    jnz .w16_main
    lea r3d, [hq+15]
    vpbroadcastd ym0, [base+pw_15]
    call .filter_above
    call .filter_left
    vinserti32x4 ym3, ym12, xm4, 1
    palignr ym3, ym4, ym3, 14
.w16_main:
    vbroadcasti32x8 m0, [base+pw_1to32]
    vpbroadcastw m11, dxd
    vpbroadcastw m13, dyd
    kxnorw k2, k2, k2
    psllw m10, m0, 6
    valignq m5, m7, m12, 6
    psubw m10, m11 ; xpos
    valignq m6, m8, m7, 6
    pmullw m13, m0 ; ypos
    knotd k1, k2
    palignr m5, m7, m5, 14
    palignr m6, m8, m6, 14
    vpsubw m10{k1}, m11
    psrlw m12, m13, 6
    psllw m13, 9
    mov r2d, 1<<6
    vpsubw m12{k2}, m15 ; base_y
    pand m13, m14 ; frac_y << 9
    lea r3d, [dxq-(16<<6)]
    paddw m11, m11
    add dxd, dxd
    paddw m15, m15
.w16_loop:
    psrlw m1, m10, 6
    pand m2, m14, m10
    vpermw m0, m1, m3
    vpermw m1, m1, m4
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m12, m15 ; base_y++
    paddw m0, m1
    sub r2d, dxd
    jge .w16_toponly
    mova m1, m5
    vpermt2w m1, m12, m6
    mova m2, m7
    vpermt2w m2, m12, m8
    vpmovw2m k1, m10
    psubw m2, m1
    pmulhrsw m2, m13
    vpaddw m0{k1}, m1, m2
.w16_toponly:
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    sub hd, 2
    jz .w16_end
    psubw m10, m11 ; base_x -= dx
    lea dstq, [dstq+strideq*2]
    cmp r2d, r3d
    jge .w16_loop
    paddw m12, m15
    vpermt2w m5, m12, m6
    mova m1, m7
    vpermt2w m1, m12, m8
    jmp .w16_leftonly_loop_start
; In the left-only loop each iteration performs one new lookup (m1); the
; matching "previous" operand m5 is rebuilt from the last lookup with a lane
; swap (vshufi32x4 q1032).
.w16_leftonly_loop:
    mova m1, m7
    vpermt2w m1, m12, m8
    vshufi32x4 m5, m1, q1032
.w16_leftonly_loop_start:
    psubw m0, m1, m5
    pmulhrsw m0, m13
    paddw m12, m15
    paddw m0, m5
    mova m5, m1
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_leftonly_loop
.w16_end:
    RET
.w32:
    mova m3, [tlq]
    vpermw m8, m0, [tlq-64*2]
    mova m9, [base+pw_1to32]
    test angled, 0x400
    jnz .w32_main
    ; inline top-edge filter (pw_3 bias, pavgw, >> 2), limited to words below
    ; max_w through k1
    pminud m0, m9, [base+pw_31] {1to16}
    mov r3d, ~1
    kmovd k1, r3d
    vpbroadcastd m5, [base+pw_3]
    vpbroadcastd m6, r6m ; max_w
    vpermw m2, m0, m4 ; +1
    movu m13, [base+pw_0to31]
    paddw m1, m4, m3
    vmovdqu16 m3{k1}, [tlq-2] ; -2
    packssdw m6, m6
    paddw m1, m2
    vpermw m2, m0, m2 ; +2
    paddw m3, m5
    vpcmpgtw k1, m6, m13
    pavgw m2, m3
    paddw m1, m2
    psrlw m4{k1}, m1, 2
    call .filter_left32
.w32_main:
    sub rsp, 64*2
    call .w32_main1
    add rsp, 64*2
    RET
; Shared row loop for w32, also used twice by w64. The caller reserves stack
; space; the top edge (m4) and the tl sample (xm12) are stored there so rows
; can be fetched with unaligned loads at [rsp+base_x*2]. .w32_main2 is a second
; entry point that expects r2d, m9 and m10 to be set by the caller. hd is
; preserved (the row counter is r8d), which is what allows the second pass
; for w64.
.w32_main1:
    vpbroadcastw m11, dxd
    movu [rsp+64], m4
    vpbroadcastw m4, dyd
    movd [rsp+60], xm12
    valignq m5, m7, m12, 6
    psllw m3, m9, 6 ; xpos
    valignq m6, m8, m7, 6
    pmullw m9, m4 ; ypos
    palignr m5, m7, m5, 14
    mov r2d, 33<<6
    palignr m6, m8, m6, 14
    mova m10, m3
.w32_main2:
    psllw m13, m9, 9
    sub r2d, dxd
    psrlw m12, m9, 6 ; base_y
    mov r8d, hd
    pand m13, m14 ; frac_y << 9
.w32_loop:
    mov r3d, r2d
    shr r3d, 6
    psubw m10, m11 ; base_x -= dx
    movu m0, [rsp+r3*2-2]
    pand m2, m10, m14 ; frac_x
    movu m1, [rsp+r3*2]
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m12, m15 ; base_y++
    paddw m0, m1
    cmp r2d, 32<<6
    jge .w32_toponly
    mova m1, m5
    vpermt2w m1, m12, m6
    mova m2, m7
    vpermt2w m2, m12, m8
    vpmovw2m k1, m10
    psubw m2, m1
    pmulhrsw m2, m13
    vpaddw m0{k1}, m1, m2
.w32_toponly:
    mova [dstq], m0
    dec r8d
    jz .w32_end
    add dstq, strideq
    sub r2d, dxd
    jge .w32_loop
    paddw m12, m15
    mova m2, m5
    vpermt2w m2, m12, m6
.w32_leftonly_loop:
    mova m1, m7
    vpermt2w m1, m12, m8
    psubw m0, m1, m2
    pmulhrsw m0, m13
    paddw m12, m15
    paddw m0, m2
    mova m2, m1
    mova [dstq], m0
    add dstq, strideq
    dec r8d
    jg .w32_leftonly_loop
.w32_end:
    ret
.w64:
    movu m3, [tlq+66]
    vpermw m8, m0, [tlq-64*2]
    mova m9, [base+pw_1to32]
    test angled, 0x400
    jnz .w64_main
    mova m2, [tlq] ; -1
    mov r3d, ~1
    vpbroadcastd m5, [base+pw_3]
    kmovd k1, r3d
    movu m13, [base+pw_0to31]
    vpbroadcastd m6, r6m ; max_w
    pminud m0, m9, [base+pw_31] {1to16}
    paddw m1, m4, m2
    vmovdqu16 m2{k1}, [tlq-2] ; -2
    packssdw m6, m6
    paddw m1, [tlq+4] ; +1
    paddw m2, m5
    vpcmpgtw k1, m6, m13
    pavgw m2, [tlq+6] ; +2
    paddw m1, m2
    vpermw m2, m0, m3 ; 32+1
    psrlw m4{k1}, m1, 2
    paddw m1, m3, [tlq+64] ; 32-1
    vpaddd m11, m13, [base+pw_32] {1to16}
    paddw m1, m2
    vpermw m2, m0, m2 ; 32+2
    paddw m10, m5, [tlq+62] ; 32-2
    vpcmpgtw k1, m6, m11
    pavgw m2, m10
    paddw m1, m2
    psrlw m3{k1}, m1, 2
    call .filter_left32
; w64 runs the 32-wide row loop twice: first for the left 32 columns, then
; for the right 32 columns (dst + 64 bytes), with xpos offset by pw_2048, ypos
; advanced by dy << 5 and the scalar x position restarted at 65<<6.
.w64_main:
    sub rsp, 64*3
    ; the "-gprsize" places the store where [rsp+64*2] will point once the
    ; call below has pushed its return address
    movu [rsp+64*2-gprsize], m3
    mov r5, dstq
    call .w32_main1
    psllw m4, 5
    mov r2d, 65<<6
    vpaddd m10, m3, [base+pw_2048] {1to16} ; xpos
    lea dstq, [r5+64]
    paddw m9, m4 ; ypos
    call .w32_main2
    add rsp, 64*3
    RET

; ipred_z3_16bpc: this definition continues past the end of this span, so its
; opening part is reproduced unchanged.
cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea r7, [z_filter_t0]
    tzcnt wd, wm
    movifnidn angled, anglem
    lea t0, [dr_intra_derivative+45*2-1]
    movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
    sub angled, 180
    mov dyd, angled
    neg dyd
    xor angled, 0x400
    or dyq, ~0x7e
    mova m0, [base+pw_31to0]
    movzx dyd, word [t0+dyq]
    lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
    movifnidn hd, hm
    vpbroadcastd m14, [base+pw_31806]
    vpbroadcastd m15, [base+pw_1]
    jmp wq
.w4:
    lea r3d, [hq+3]
    xor r3d, 31 ; 32 - (h + imin(w, h))
    vpbroadcastw m7, r3d
    pmaxuw m7, m0
    vpermw m6, m7, [tlq-64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    cmp angleb, 40
    jae .w4_filter
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_filter ; h > 8 || (h == 8 && is_sm)
    call .upsample
    movsldup m1, [base+z_ypos_mul]
    paddw m1, m1
    jmp .w4_main2
.w4_filter:
    lea r3d, [hq+3]
    call .filter32
.w4_main:
    movsldup m1, [base+z_ypos_mul]
.w4_main2:
    vpbroadcastq m0, [base+pw_1to32]
    vpbroadcastw m4, dyd
    lea r2d, [hq+4]
    shr r2d, 3
    pmullw m4, m0 ; ypos
    vpbroadcastw m0, r2d
    imul r2, strideq ; stride * imax(height / 8, 1)
    pmullw m1, m0
    lea r3, [r2*3]
    paddd m1, [base+pw_32736] {1to16}
    psrlw m2, m4, 6
    psllw m4, 9
    paddsw m2, m1 ; base+0
    vpandd m4, m14 ; frac << 9
    vpermw m3, m2, m6 ; left[base+0]
.w4_loop:
    paddsw m2, m15 ; base+1
    vpermw m1, m2, m6 ; left[base+1]
    psubw m0, m1, m3
    pmulhrsw m0, m4
    paddw m0, m3
    movq [dstq+r2*0], xm0
    movhps [dstq+r2*1], xm0
    vextracti32x4 xm3, ym0, 1
    movq [dstq+r2*2], xm3
    movhps [dstq+r3 ], xm3
    sub hd, 8
    jl
.w4_end lea r5, [dstq+r2*4] vextracti32x8 ym0, m0, 1 mova m3, m1 movq [r5+r2*0], xm0 movhps [r5+r2*1], xm0 vextracti32x4 xm1, ym0, 1 movq [r5+r2*2], xm1 movhps [r5+r3 ], xm1 add dstq, strideq test hd, hd jnz .w4_loop .w4_end: RET .upsample: vinserti32x4 m6, [tlq-14], 3 mova m3, [base+z_upsample] vpbroadcastd m4, [base+pd_65536] add dyd, dyd vpermw m0, m3, m6 paddw m3, m4 vpermw m1, m3, m6 paddw m3, m4 vpermw m2, m3, m6 paddw m3, m4 vpermw m3, m3, m6 vpbroadcastw m6, r9m ; pixel_max paddw m1, m2 ; b+c paddw m0, m3 ; a+d psubw m0, m1, m0 psraw m0, 3 pxor m2, m2 paddw m0, m1 pmaxsw m0, m2 pavgw m0, m2 pminsw m6, m0 ret .w8: mova m6, [tlq-64*1] cmp hd, 32 je .w8_h32 mov r3d, 8 cmp hd, 4 cmove r3d, hd lea r3d, [r3+hq-1] xor r3d, 31 ; 32 - (h + imin(w, h)) vpbroadcastw m1, r3d vpermw m7, m1, m6 pmaxuw m1, m0 vpermw m6, m1, m6 test angled, 0x400 jnz .w8_main lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 ja .w8_filter ; is_sm || d >= 40 || h > 8 call .upsample movshdup m1, [base+z_ypos_mul] paddw m1, m1 call .w8_main_setup .w8_upsample_loop: vpermw m3, m2, m6 ; left[base+0] paddw m2, m15 ; base+1 vpermw m1, m2, m6 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddw m2, m15 ; base+2 paddw m0, m3 mova m3, m1 mova [dstq+r2*0], xm0 vextracti32x4 [dstq+r2*1], ym0, 1 vextracti32x4 [dstq+r2*2], m0, 2 vextracti32x4 [dstq+r3 ], m0, 3 add dstq, strideq sub hd, 4 jg .w8_upsample_loop RET .w8_main_setup: vbroadcasti32x4 m0, [base+pw_1to32] vpbroadcastw m4, dyd rorx r2d, hd, 2 pmullw m4, m0 ; ypos vpbroadcastw m0, r2d imul r2, strideq ; stride * height / 4 lea r3, [r2*3] pmullw m1, m0 ; 0 1 2 3 paddd m1, [base+pw_32704] {1to16} psrlw m2, m4, 6 psllw m4, 9 paddsw m2, m1 ; base+0 vpandd m4, m14 ; frac << 9 ret .w8_h32: pmaxud m7, m0, [base+pw_24] {1to16} vpermw m6, m0, m6 vpermw m7, m7, [tlq-64*2] test angled, 0x400 jnz .w8_main call .filter64 vpbroadcastd m0, [base+pw_7] pminuw m0, [base+pw_0to31] vpermw m7, m0, m7 jmp .w8_main .w8_filter: lea r3d, [hq+7] call .filter32 .w8_main: 
movshdup m1, [base+z_ypos_mul] call .w8_main_setup mova m3, m6 vpermt2w m3, m2, m7 ; left[base+0] .w8_loop: paddsw m2, m15 ; base+1 mova m1, m6 vpermt2w m1, m2, m7 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddw m0, m3 mova m3, m1 mova [dstq+r2*0], xm0 vextracti32x4 [dstq+r2*1], ym0, 1 vextracti32x4 [dstq+r2*2], m0, 2 vextracti32x4 [dstq+r3 ], m0, 3 add dstq, strideq sub hd, 4 jg .w8_loop RET .filter32: vpbroadcastb ym10, r3d vpbroadcastb ym1, angled shr angled, 8 vpcmpeqb k1, ym10, [base+z_filter_wh] mova xm2, [base+z_filter_t0+angleq*8] vpcmpgtb k1{k1}, ym1, ym2 kmovd r5d, k1 test r5d, r5d jz .filter32_end vpbroadcastw m2, [tlq] popcnt r5d, r5d vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0] valignq m2, m6, m2, 6 vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1] valignq m4, m7, m6, 2 vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2] palignr m1, m6, m2, 14 pmullw m5, m6 palignr m3, m4, m6, 2 paddw m1, m3 palignr m2, m6, m2, 12 pmullw m1, m8 palignr m4, m6, 4 paddw m2, m4 pmullw m2, m9 pmovzxbw m10, ym10 pxor m6, m6 paddw m5, m1 pminuw m1, m10, [base+pw_0to31] paddw m5, m2 psrlw m5, 3 pavgw m6, m5 vpermw m7, m10, m6 vpermw m6, m1, m6 .filter32_end: ret .w16: mova m6, [tlq-64*1] cmp hd, 32 jl .w16_h16 pmaxud m8, m0, [base+pw_16] {1to16} mova m7, [tlq-64*2] vpermw m6, m0, m6 jg .w16_h64 vpermw m7, m8, m7 test angled, 0x400 jnz .w16_main call .filter64 vpbroadcastd m0, [base+pw_15] vinserti32x8 m0, [base+pw_0to31], 0 vpermw m7, m0, m7 jmp .w16_main .w16_h16: lea r3d, [hq*2-1] xor r3d, 31 ; 32 - (h + imin(w, h)) vpbroadcastw m1, r3d vpermw m7, m1, m6 pmaxuw m1, m0 vpermw m6, m1, m6 test angled, 0x400 jnz .w16_main lea r3d, [hq+15] call .filter32 .w16_main: vbroadcasti32x8 m0, [base+pw_1to32] vpbroadcastw m4, dyd rorx r2d, hd, 1 pmullw m4, m0 ; ypos vpbroadcastw ym1, r2d imul r2, strideq ; stride * height / 2 paddd m1, [base+pw_32704] {1to16} lea r3, [r2+strideq] psrlw m2, m4, 6 psllw m4, 9 paddsw m2, m1 ; base+0 vpandd m4, m14 ; frac << 9 mova m3, m6 vpermt2w m3, 
m2, m7 ; left[base+0] .w16_loop: paddsw m1, m2, m15 ; base+1 paddsw m2, m1, m15 ; base+2 vpermi2w m1, m6, m7 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddw m0, m3 mova m3, m6 vpermt2w m3, m2, m7 ; left[base+2] vextracti32x8 [dstq+strideq*0], m0, 1 mova [dstq+r2 ], ym0 psubw m0, m3, m1 pmulhrsw m0, m4 paddw m0, m1 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+r3 ], ym0 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w16_loop RET .w16_h64: vpermw m7, m0, m7 vpermw m8, m8, [tlq-64*3] test angled, 0x400 jnz .w16_h64_main valignq m11, m8, m7, 6 call .filter64 vshufi32x4 m2, m8, m8, q3321 vpbroadcastd m0, [base+pw_15] palignr ym3, ym8, ym11, 12 vinserti32x8 m0, [base+pw_0to31], 0 palignr ym4, ym8, ym11, 14 palignr ym1, ym2, ym8, 4 paddw ym3, ym5 palignr ym2, ym8, 2 paddw ym8, ym4 pavgw ym3, ym1 paddw ym8, ym2 paddw ym8, ym3 psrlw ym8, 2 vpermw m8, m0, m8 .w16_h64_main: vbroadcasti32x8 m0, [base+pw_1to32] vpbroadcastw m4, dyd pmullw m4, m0 ; ypos vpbroadcastd ym1, [base+pw_32] paddd m1, [base+pw_32672] {1to16} mov r2, strideq shl r2, 5 ; stride*32 vpbroadcastd m9, [base+pw_32735] lea r3, [r2+strideq] psrlw m2, m4, 6 psllw m4, 9 paddsw m2, m1 ; base+0 vpandd m4, m14 ; frac << 9 mova m3, m7 vpermt2w m3, m2, m6 vpcmpgtw k1, m2, m9 vpermw m3{k1}, m2, m8 ; left[base+0] .w16_h64_loop: paddsw m2, m15 ; base+1 mova m1, m7 vpermt2w m1, m2, m6 vpcmpgtw k1, m2, m9 vpermw m1{k1}, m2, m8 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddsw m2, m15 ; base+2 paddw m0, m3 mova m3, m7 vpermt2w m3, m2, m6 vpcmpgtw k1, m2, m9 vpermw m3{k1}, m2, m8 ; left[base+2] vextracti32x8 [dstq+strideq*0], m0, 1 mova [dstq+r2 ], ym0 psubw m0, m3, m1 pmulhrsw m0, m4 paddw m0, m1 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+r3 ], ym0 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w16_h64_loop RET .filter64: vpbroadcastw m2, [tlq] vpbroadcastd m5, [base+pw_3] valignq m2, m6, m2, 6 valignq m4, m7, m6, 2 valignq m10, m7, m6, 6 palignr m1, m6, m2, 12 palignr m2, m6, m2, 14 palignr m3, m4, m6, 4 paddw m1, 
m5 palignr m4, m6, 2 paddw m6, m2 valignq m2, m8, m7, 2 pavgw m1, m3 palignr m3, m7, m10, 12 paddw m6, m4 palignr m4, m7, m10, 14 paddw m6, m1 palignr m1, m2, m7, 4 psrlw m6, 2 palignr m2, m7, 2 paddw m3, m5 paddw m7, m4 pavgw m3, m1 paddw m7, m2 paddw m7, m3 psrlw m7, 2 ret .w32: mova m6, [tlq-64*1] cmp hd, 32 jl .w32_h16 mova m8, [tlq-64*2] vpermw m6, m0, m6 vpermw m7, m0, m8 jg .w32_h64 test angled, 0x400 jnz .w32_main vpbroadcastw xm8, xm8 jmp .w32_filter .w32_h16: lea r3d, [hq*2-1] xor r3d, 31 ; 32 - (h + imin(w, h)) vpbroadcastw m1, r3d vpermw m7, m1, m6 pmaxuw m1, m0 vpermw m6, m1, m6 test angled, 0x400 jnz .w32_main vextracti32x4 xm8, m7, 3 .w32_filter: call .filter64 .w32_main: vpbroadcastw m4, dyd vpbroadcastd m1, [base+pw_32704] pmullw m4, [base+pw_1to32] ; ypos psrlw m2, m4, 6 psllw m4, 9 paddsw m2, m1 ; base+0 vpandd m4, m14 ; frac << 9 mova m3, m6 vpermt2w m3, m2, m7 ; left[base+0] .w32_loop: paddsw m1, m2, m15 ; base+1 paddsw m2, m1, m15 ; base+2 vpermi2w m1, m6, m7 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddw m0, m3 mova m3, m6 vpermt2w m3, m2, m7 ; left[base+2] mova [dstq+strideq*0], m0 psubw m0, m3, m1 pmulhrsw m0, m4 paddw m0, m1 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w32_h64: mova m9, [tlq-64*3] vpermw m8, m0, m9 test angled, 0x400 jnz .w32_h64_main vpbroadcastw xm9, xm9 call .filter96 .w32_h64_main: vpbroadcastw m4, dyd vpbroadcastd m1, [base+pw_32672] pmullw m4, [base+pw_1to32] ; ypos vpbroadcastd m9, [base+pw_32735] psrlw m2, m4, 6 psllw m4, 9 paddsw m2, m1 ; base+0 vpandd m4, m14 ; frac << 9 mova m3, m7 vpermt2w m3, m2, m6 vpcmpgtw k1, m2, m9 vpermw m3{k1}, m2, m8 ; left[base+0] .w32_h64_loop: paddsw m2, m15 ; base+1 mova m1, m7 vpermt2w m1, m2, m6 vpcmpgtw k1, m2, m9 vpermw m1{k1}, m2, m8 ; left[base+1] psubw m0, m1, m3 pmulhrsw m0, m4 paddsw m2, m15 ; base+2 paddw m0, m3 mova m3, m7 vpermt2w m3, m2, m6 vpcmpgtw k1, m2, m9 vpermw m3{k1}, m2, m8 ; left[base+2] mova [dstq+strideq*0], m0 psubw 
m0, m3, m1 pmulhrsw m0, m4 paddw m0, m1 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_h64_loop RET .filter96: valignq m11, m8, m7, 6 call .filter64 valignq m2, m9, m8, 2 palignr m3, m8, m11, 12 palignr m4, m8, m11, 14 palignr m1, m2, m8, 4 paddw m3, m5 palignr m2, m8, 2 paddw m8, m4 pavgw m3, m1 paddw m8, m2 paddw m8, m3 psrlw m8, 2 ret .w64: mova m7, [tlq-64*1] vpermw m6, m0, m7 cmp hd, 32 jl .w64_h16 mova m8, [tlq-64*2] vpermw m7, m0, m8 jg .w64_h64 test angled, 0x400 jnz .w64_main vpbroadcastw m8, xm8 mova m9, m8 call .filter96 vshufi32x4 m9, m8, m8, q3333 jmp .w64_h64_main .w64_h16: vpbroadcastw m7, xm7 test angled, 0x400 jnz .w64_main mova m8, m7 call .filter64 .w64_main: vpbroadcastw m11, dyd vpbroadcastd m1, [base+pw_32704] pmullw m10, m11, [base+pw_1to32] ; ypos psllw m11, 5 psrlw m8, m10, 6 paddw m11, m10 psllw m10, 9 psrlw m9, m11, 6 psllw m11, 9 psubw m9, m8 paddsw m8, m1 ; base+0 vpandd m10, m14 ; frac << 9 vpandd m11, m14 ; frac << 9 mova m4, m6 vpermt2w m4, m8, m7 ; left[base+0] ( 0..31) paddsw m5, m8, m9 vpermi2w m5, m6, m7 ; left[base+0] (32..63) .w64_loop: paddsw m8, m15 ; base+1 ( 0..31) mova m2, m6 vpermt2w m2, m8, m7 ; left[base+1] ( 0..31) paddsw m3, m8, m9 ; base+1 (32..63) vpermi2w m3, m6, m7 ; left[base+1] (32..63) psubw m0, m2, m4 psubw m1, m3, m5 pmulhrsw m0, m10 pmulhrsw m1, m11 paddw m0, m4 paddw m1, m5 mova m4, m2 mova [dstq+64*0], m0 mova m5, m3 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w64_loop RET .w64_h64: vpermw m8, m0, [tlq-64*3] mova m13, [tlq-64*4] vpermw m9, m0, m13 test angled, 0x400 jnz .w64_h64_main valignq m12, m9, m8, 6 call .filter96 vpbroadcastw xm2, xm13 valignq m2, m9, 2 palignr m3, m9, m12, 12 palignr m4, m9, m12, 14 palignr m1, m2, m9, 4 paddw m3, m5 palignr m2, m9, 2 paddw m9, m4 pavgw m3, m1 paddw m9, m2 paddw m9, m3 psrlw m9, 2 .w64_h64_main: vpbroadcastw m11, dyd vpbroadcastd m1, [base+pw_32640] pmullw m10, m11, [base+pw_1to32] ; ypos psllw m11, 5 psrlw m12, m10, 6 paddw m11, m10 
psllw m10, 9 psrlw m13, m11, 6 psllw m11, 9 psubw m13, m12 paddsw m12, m1 ; base+0 vpandd m10, m14 ; frac << 9 vpandd m11, m14 ; frac << 9 vpbroadcastd m14, [base+pw_64] mova m4, m6 vpermt2w m4, m12, m7 vptestmw k1, m12, m14 mova m0, m8 vpermt2w m0, m12, m9 paddsw m1, m12, m13 mova m5, m6 vpermt2w m5, m1, m7 vptestmw k2, m1, m14 vpermi2w m1, m8, m9 vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31) vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63) .w64_h64_loop: paddsw m12, m15 ; base+1 mova m2, m6 vpermt2w m2, m12, m7 vptestmw k1, m12, m14 mova m0, m8 vpermt2w m0, m12, m9 paddsw m1, m12, m13 mova m3, m6 vpermt2w m3, m1, m7 vptestmw k2, m1, m14 vpermi2w m1, m8, m9 vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31) vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63) psubw m0, m2, m4 psubw m1, m3, m5 pmulhrsw m0, m10 pmulhrsw m1, m11 paddw m0, m4 paddw m1, m5 mova m4, m2 mova [dstq+64*0], m0 mova m5, m3 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w64_h64_loop RET cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm mova m3, [pal_pred_perm] movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastq m4, [pal_unpack+0] vpbroadcastq m5, [pal_unpack+8] add wq, r6 vbroadcasti32x4 m6, [palq] lea stride3q, [strideq*3] jmp wq .w4: pmovzxbd ym0, [idxq] add idxq, 8 vpmultishiftqb ym0, ym4, ym0 vpermw ym0, ym0, ym6 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbd m0, [idxq] add idxq, 16 vpmultishiftqb m0, m4, m0 vpermw m0, m0, m6 movu [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: movu ym1, [idxq] add idxq, 32 vpermb m1, m3, m1 vpmultishiftqb m1, m4, m1 vpermw m0, m1, m6 psrlw m1, 8 vpermw m1, m1, m6 movu [dstq+strideq*0], ym0 vextracti32x8 
[dstq+strideq*1], m0, 1 movu [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpermb m2, m3, [idxq] add idxq, 64 vpmultishiftqb m1, m4, m2 vpmultishiftqb m2, m5, m2 vpermw m0, m1, m6 psrlw m1, 8 vpermw m1, m1, m6 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 vpermw m0, m2, m6 psrlw m2, 8 vpermw m1, m2, m6 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: vpermb m2, m3, [idxq] add idxq, 64 vpmultishiftqb m1, m4, m2 vpmultishiftqb m2, m5, m2 vpermw m0, m1, m6 psrlw m1, 8 vpermw m1, m1, m6 mova [dstq+ 0], m0 mova [dstq+64], m1 vpermw m0, m2, m6 psrlw m2, 8 vpermw m1, m2, m6 mova [dstq+strideq+ 0], m0 mova [dstq+strideq+64], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. ; w4 w8 w16 w32 ; 1 1 2 1 2 5 6 1 2 5 6 9 a d e ; 2 2 3 2 3 6 7 2 3 6 7 a b e f ; 3 3 4 3 4 7 8 3 4 7 8 b c f g ; 4 4 5 4 5 8 9 4 5 8 9 c d g h cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top %define base r6-$$ lea r6, [$$] %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 movifnidn hd, hm movu xm0, [tlq-6] pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] mov r5d, r8m ; bitdepth_max movsldup m9, [base+filter_permA] movshdup m10, [base+filter_permA] shr r5d, 11 ; is_12bpc jnz .12bpc psllw m7, 2 ; upshift multipliers so that packusdw psllw m8, 2 ; will perform clipping for free .12bpc: vpbroadcastd m5, [base+filter_rnd+r5*8] vpbroadcastd m6, [base+filter_shift+r5*8] sub wd, 8 jl .w4 .w8: call .main4 movsldup m11, [filter_permB] lea r5d, [hq*2+2] movshdup m12, [filter_permB] lea topq, [tlq+2] mova m13, [filter_permC] sub hd, 4 vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 sub tlq, r5 %if WIN64 push r7 push r8 %endif 
mov r7, dstq mov r8d, hd .w8_loop: movlps xm4, xm0, [tlq+hq*2] call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w8_loop test wd, wd jz .end mov r2d, 0x0d kmovb k1, r2d lea r2, [strideq*3] .w16: movd xmm0, [r7+strideq*1+12] vpblendd xmm0, [topq+8], 0x0e ; t1 t2 pinsrw xm4, xmm0, [r7+strideq*0+14], 2 call .main8 add r7, 16 vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 mov hd, r8d mov dstq, r7 add topq, 16 .w16_loop: movd xmm1, [dstq+strideq*2-4] punpcklwd xm4, xmm1, xmm0 movd xmm0, [dstq+r2-4] shufps xm4{k1}, xmm0, xm0, q3210 call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w16_loop sub wd, 8 jg .w16 .end: vpermb m2, m11, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m2, m12, m0 vpdpwssd m1, m2, m8 %if WIN64 pop r8 pop r7 %endif vextracti32x8 ym2, m1, 1 paddd ym1, ym2 packusdw ym1, ym1 vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 RET .w4_loop: movlps xm0, [tlq-10] lea dstq, [dstq+strideq*2] sub tlq, 4 .w4: call .main4 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jg .w4_loop RET ALIGN function_align .main4: vpermb m2, m9, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m0, m10, m0 vpdpwssd m1, m0, m8 vextracti32x8 ym0, m1, 1 paddd ym0, ym1 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 ; clip vpsrlvw xm0, xm6 ret ALIGN function_align .main8: vpermb m3, m11, m0 mova ym2, ym5 vpdpwssd m2, m3, m7 vpermb m3, m9, m4 mova ym1, ym5 vpdpwssd m1, m3, m7 vpermb m3, m12, m0 vpdpwssd m2, m3, m8 vpermb m3, m10, m4 vpdpwssd m1, m3, m8 vextracti32x8 ym4, m2, 1 vextracti32x8 ym3, m1, 1 paddd ym2, ym4 paddd ym1, ym3 packusdw ym1, ym2 ; clip vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 ret %endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred16_sse.asm000066400000000000000000004114671517466257200241230ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright 
© 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13 db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7 z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1 z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filt_wh4: db 7, 7, 19, 7, z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 ALIGN 8 pb_2_3: times 4 db 2, 3 z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64 z_filt_k: times 4 dw 8 times 4 dw 6 times 4 dw 4 times 4 dw 5 pw_m3584: times 4 dw -3584 pw_m3072: times 4 dw -3072 pw_m2560: times 4 dw -2560 pw_m2048: times 4 dw -2048 pw_m1536: times 4 dw -1536 pw_m1024: times 4 dw -1024 pw_m512: times 4 dw -512 pw_1: times 4 dw 1 pw_2: times 4 dw 2 pw_3: times 4 dw 3 pw_62: times 4 dw 62 pw_256: times 4 dw 256 pw_512: times 4 dw 512 pw_2048: times 4 dw 2048 %define pw_4 (z_filt_k+8*2) %define pw_8 (z_filt_k+8*0) %define pw_m1to4 z2_upsample_l %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_ssse3_table 
(ipred_dc_16bpc_ssse3_table + 10*4) %define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) %define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_16bpc_ssse3_table movd m4, wm tzcnt wd, wm add tlq, 2 movifnidn hd, hm pxor m3, m3 pavgw m4, m3 movd m5, wd movu m0, [tlq] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_16bpc_ssse3_table mov hd, hm movd m4, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm pxor m3, m3 sub tlq, hq pavgw m4, m3 movd m5, r6d movu m0, [tlq] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m2, [tlq+112] movu m1, [tlq+ 96] paddw m0, m2 movu m2, [tlq+ 80] paddw m1, m2 movu m2, [tlq+ 64] paddw m0, m2 paddw m0, m1 .h32: movu m1, [tlq+ 48] movu m2, [tlq+ 32] paddw m1, m2 paddw m0, m1 
.h16: movu m1, [tlq+ 16] paddw m0, m1 .h8: movhlps m1, m0 paddw m0, m1 .h4: punpcklwd m0, m3 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 lea stride3q, [strideq*3] pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, ipred_dc_16bpc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw m4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq m1, [tlq+2] paddw m1, m0 punpckhwd m0, m3 punpcklwd m1, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 cmp hd, 4 jg .w4_mul psrlw m0, 3 jmp .w4_end .w4_mul: mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d psrld m0, 2 movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 .s4: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 8 je .w8_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 32 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m0 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 16 je .w16_end mov r2d, 0xAAAB mov r3d, 0x6667 test hd, 8|32 cmovz r2d, 
r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16c: mova m1, m0 .s16: movu [dstq+strideq*0+16*0], m0 movu [dstq+strideq*0+16*1], m1 movu [dstq+strideq*1+16*0], m0 movu [dstq+strideq*1+16*1], m1 movu [dstq+strideq*2+16*0], m0 movu [dstq+strideq*2+16*1], m1 movu [dstq+stride3q +16*0], m0 movu [dstq+stride3q +16*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m0, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 32 je .w32_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 8 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32c: mova m1, m0 mova m2, m0 mova m3, m0 .s32: movu [dstq+strideq*0+16*0], m0 movu [dstq+strideq*0+16*1], m1 movu [dstq+strideq*0+16*2], m2 movu [dstq+strideq*0+16*3], m3 movu [dstq+strideq*1+16*0], m0 movu [dstq+strideq*1+16*1], m1 movu [dstq+strideq*1+16*2], m2 movu [dstq+strideq*1+16*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s32 RET .h64: mova m0, [tlq-128] mova m1, [tlq-112] paddw m0, [tlq- 96] paddw m1, [tlq- 80] paddw m0, [tlq- 64] paddw m1, [tlq- 48] paddw m0, [tlq- 32] paddw m1, [tlq- 16] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] movu m2, [tlq+ 18] paddw m1, m2 movu m2, [tlq+ 34] paddw m0, m2 movu m2, [tlq+ 50] paddw m1, m2 movu m2, [tlq+ 66] paddw m0, m2 movu m2, [tlq+ 82] paddw m1, m2 movu m2, [tlq+ 98] paddw m0, m2 movu m2, [tlq+114] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 64 je .w64_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 
1 .w64_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s64: mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m LEA r5, ipred_dc_128_16bpc_ssse3_table tzcnt wd, wm shr r6d, 11 movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_16bpc_ssse3_table movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+ 18] movu m2, [tlq+ 34] movu m3, [tlq+ 50] cmp wd, 64 je .w64 tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w64: WIN64_SPILL_XMM 8 movu m4, [tlq+ 66] movu m5, [tlq+ 82] movu m6, [tlq+ 98] movu m7, [tlq+114] .w64_loop: mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 mova [dstq+16*4], m4 mova [dstq+16*5], m5 mova [dstq+16*6], m6 mova [dstq+16*7], m7 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 %define base r5-ipred_h_16bpc_ssse3_table tzcnt wd, wm LEA r5, ipred_h_16bpc_ssse3_table movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m2, [base+pw_256] movddup m3, [base+pb_2_3] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: sub tlq, 8 movq m3, [tlq] pshuflw m0, m3, q3333 pshuflw m1, m3, q2222 pshuflw m2, m3, q1111 pshuflw m3, m3, q0000 movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m1 movq [dstq+strideq*2], m2 movq [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: sub tlq, 8 movq m3, [tlq] punpcklwd m3, m3 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg 
.w8 RET .w16: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16 RET .w32: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*0+16*3], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m1 mova [dstq+strideq*1+16*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: sub tlq, 2 movd m0, [tlq] pshufb m0, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .w64 RET cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %define base r5-ipred_paeth_16bpc_ssse3_table movifnidn hd, hm movd m4, [tlq] pshuflw m4, m4, q0000 mov leftq, tlq add hd, hd punpcklqdq m4, m4 ; topleft sub leftq, hq and wd, ~7 jnz .w8 movddup m5, [tlq+2] ; top psubw m6, m5, m4 pabsw m7, m6 .w4_loop: movd m1, [leftq+hq-4] punpcklwd m1, m1 punpckldq m1, m1 ; left %macro PAETH 0 paddw m0, m6, m1 psubw m2, m4, m0 ; tldiff psubw m0, m5 ; tdiff pabsw m2, m2 pabsw m0, m0 pminsw m2, m0 pcmpeqw m0, m2 pand m3, m5, m0 pandn m0, m4 por m0, m3 pcmpgtw m3, m7, m2 pand m0, m3 pandn m3, m1 por m0, m3 %endmacro PAETH movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %define r7d hm %assign regs_used 7 %elif WIN64 movaps r4m, m8 PUSH r7 %assign regs_used 8 %endif %if ARCH_X86_64 movddup m8, [pw_256] %endif lea tlq, [tlq+wq*2+2] neg wq mov r7d, hd .w8_loop0: movu m5, [tlq+wq*2] mov r6, dstq add dstq, 16 psubw m6, m5, m4 pabsw m7, m6 .w8_loop: movd m1, [leftq+hq-2] %if ARCH_X86_64 pshufb m1, m8 %else pshuflw m1, m1, q0000 punpcklqdq m1, m1 
%endif PAETH movu [r6], m0 add r6, strideq sub hd, 1*2 jg .w8_loop mov hd, r7d add wq, 8 jl .w8_loop0 %if WIN64 movaps m8, r4m %endif RET %if ARCH_X86_64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 4 %endif cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov hd, hm lea weightsq, [weightsq+hq*4] neg hq movd m5, [tlq+hq*2] ; bottom pshuflw m5, m5, q0000 punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [tlq+2] ; top lea r3, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: movq m1, [weightsq+hq*2] punpcklwd m1, m1 pshufd m0, m1, q1100 punpckhdq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %assign regs_used 7 mov hm, hq %define hq hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0, hq movu m4, [tlq+2] add tlq, 16 mov r6, dstq add dstq, 16 psubw m4, m5 .w8_loop: movq m3, [weightsq+t0*2] punpcklwd m3, m3 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] add t0, 4 jl .w8_loop sub wd, 8 jg .w8_loop0 RET cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov wd, wm movifnidn hd, hm movd m5, [tlq+wq*2] ; right sub tlq, 8 add hd, hd pshuflw m5, m5, q0000 sub tlq, hq punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [weightsq+4*2] lea r3, [strideq*3] .w4_loop: movq m1, [tlq+hq] ; left punpcklwd m1, m1 psubw m1, m5 ; left - right pshufd m0, m1, q3322 punpckldq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movhps [dstq+strideq*2], m1 movq 
[dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: lea weightsq, [weightsq+wq*4] neg wq %if ARCH_X86_32 PUSH r6 %assign regs_used 7 %define hd hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0d, hd mova m4, [weightsq+wq*2] mov r6, dstq add dstq, 16 .w8_loop: movq m3, [tlq+t0*(1+ARCH_X86_32)] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] sub t0d, 4*(1+ARCH_X86_64) jg .w8_loop add wq, 8 jl .w8_loop0 RET %if ARCH_X86_64 DECLARE_REG_TMP 10 %else DECLARE_REG_TMP 3 %endif cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ h_weights, v_weights, top LEA h_weightsq, smooth_weights_2d_16bpc mov wd, wm mov hd, hm movd m7, [tlq+wq*2] ; right lea v_weightsq, [h_weightsq+hq*8] neg hq movd m6, [tlq+hq*2] ; bottom pshuflw m7, m7, q0000 pshuflw m6, m6, q0000 cmp wd, 4 jne .w8 movq m4, [tlq+2] ; top mova m5, [h_weightsq+4*4] punpcklwd m4, m6 ; top, bottom pxor m6, m6 .w4_loop: movq m1, [v_weightsq+hq*4] sub tlq, 4 movd m3, [tlq] ; left pshufd m0, m1, q0000 pshufd m1, m1, q1111 pmaddwd m0, m4 punpcklwd m3, m7 ; left, right pmaddwd m1, m4 pshufd m2, m3, q1111 pshufd m3, m3, q0000 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add hq, 2 jl .w4_loop RET .w8: %if ARCH_X86_32 lea h_weightsq, [h_weightsq+wq*4] mov t0, tlq mov r1m, tlq mov r2m, hq %define m8 [h_weightsq+16*0] %define m9 [h_weightsq+16*1] %else %if WIN64 movaps r4m, m8 movaps r6m, m9 PUSH r7 PUSH r8 %endif PUSH r9 PUSH r10 %assign regs_used 11 lea h_weightsq, [h_weightsq+wq*8] lea topq, [tlq+wq*2] neg wq mov r8, tlq mov r9, hq %endif punpcklqdq m6, m6 
.w8_loop0: %if ARCH_X86_32 movu m5, [t0+2] add t0, 16 mov r0m, t0 %else movu m5, [topq+wq*2+2] mova m8, [h_weightsq+wq*4+16*0] mova m9, [h_weightsq+wq*4+16*1] %endif mov t0, dstq add dstq, 16 punpcklwd m4, m5, m6 punpckhwd m5, m6 .w8_loop: movd m1, [v_weightsq+hq*4] sub tlq, 2 movd m3, [tlq] ; left pshufd m1, m1, q0000 pmaddwd m0, m4, m1 pshuflw m3, m3, q0000 pmaddwd m1, m5 punpcklwd m3, m7 ; left, right pmaddwd m2, m8, m3 pmaddwd m3, m9 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pxor m1, m1 pavgw m0, m1 mova [t0], m0 add t0, strideq inc hq jl .w8_loop %if ARCH_X86_32 mov t0, r0m mov tlq, r1m add h_weightsq, 16*2 mov hq, r2m sub dword wm, 8 jg .w8_loop0 %else mov tlq, r8 mov hq, r9 add wq, 8 jl .w8_loop0 %endif %if WIN64 movaps m8, r4m movaps m9, r6m %endif RET %if ARCH_X86_64 cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx %define base r7-$$ %define bdmaxm r8m lea r7, [$$] %else cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx %define base r1-$$ %define stridemp [rsp+4*0] %define bdmaxm [rsp+4*1] mov r3, r8m mov stridemp, r1 mov bdmaxm, r3 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm add tlq, 2 movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] mov dxd, angled movddup m0, [base+pw_256] and dxd, 0x7e movddup m7, [base+pw_62] add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) movd m3, [tlq+14] movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movd m1, bdmaxm pshufb m3, m0 palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd mova [rsp+32], m3 palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 pshufb m1, m0 paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d psubw m5, m3, m4 ; = 
(b + c - a - d + (b + c) << 3 + 8) >> 4 movd m4, dxd psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 paddw m3, m5 pxor m5, m5 pmaxsw m3, m5 mov r3d, dxd pavgw m3, m5 pshufb m4, m0 pminsw m3, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 mova m3, [base+z_upsample] movifnidn strideq, stridemp mova [rsp+ 0], m1 paddw m5, m4, m4 mova [rsp+16], m2 punpcklqdq m4, m5 ; xpos0 xpos1 .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m4, m5 ; xpos += dx paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m1, r3d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movd m3, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 pshuflw m3, m3, q0000 mova [rsp+16*1], m2 lea r2d, [r3+2] movq [rsp+r3*2+18], m3 cmp hd, 8 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w4_main: lea tlq, [tlq+r3*2] movd m4, dxd movddup m1, [base+z_base_inc] ; base_inc << 6 movd m6, [tlq] ; top[max_base_x] shl r3d, 6 movd m3, r3d pshufb m4, m0 mov r5d, dxd ; xpos pshufb m6, m0 sub r5, r3 pshufb m3, m0 paddw m5, m4, m4 psubw m3, m1 ; max_base_x punpcklqdq m4, m5 ; xpos0 xpos1 movifnidn strideq, stridemp .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5*2+0] movq m1, [tlq+r5*2+2] lea r5, 
[r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3*2+0] movhps m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop .w4_end_loop: movq [dstq+strideq*0], m6 movq [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a paddw m5, m1 paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 psubw m2, m5, m3 movu m6, [tlq+18] ; a b c d e f g _ psraw m2, 3 movu m3, [tlq+20] ; b c d e f g _ _ paddw m5, m2 movu m2, [tlq+16] ; 9 a b c d e f g paddw m6, m2 add dxd, dxd cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ .w8_upsample_h8: paddw m3, [tlq+14] ; 8 9 a b c d e f psubw m4, m6, m3 movd m3, bdmaxm psraw m4, 3 mov r3d, dxd paddw m6, m4 pxor m4, m4 pmaxsw m5, m4 pmaxsw m6, m4 pshufb m3, m0 pavgw m5, m4 pavgw m6, m4 movd m4, dxd pminsw m5, m3 pminsw m6, m3 mova m3, [base+z_upsample] pshufb m4, m0 movifnidn strideq, stridemp punpcklwd m0, m1, m5 mova [rsp+ 0], m0 punpckhwd m1, m5 mova [rsp+16], m1 punpcklwd m0, m2, m6 mova [rsp+32], m0 punpckhwd m2, m6 mova [rsp+48], m2 mova m5, m4 .w8_upsample_loop: mov r2d, r3d shr r2d, 6 movu m1, [rsp+r2*2+ 0] movu m2, [rsp+r2*2+16] add r3d, dxd pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m1, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m3, 
angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .w8_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movu m3, [tlq+16*1] movd m4, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 pshuflw m4, m4, q0000 mova [rsp+16*2], m3 lea r2d, [r3+2] movq [rsp+r3*2+18], m4 cmp hd, 16 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w8_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 movifnidn strideq, stridemp .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+0] movu m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m4, 15 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [dstq], m0 dec hd jz .w8_end add dstq, strideq add r5, dxq jl .w8_loop .w8_end_loop: mova [dstq], m6 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: %if ARCH_X86_32 %define strideq r3 %endif lea r3d, [hq+15] movd m1, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 31) test angled, 0x400 jnz .w16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x24924924 movu m3, [tlq+16*1] movu m4, [tlq+16*2] shr r5d, 30 movu m5, [tlq+16*3] movd m6, [tlq+r3*2] adc r5d, -1 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 pshuflw m6, m6, q0000 mova [rsp+16*3], m4 mova [rsp+16*4], m5 lea r2d, [r3+2] movq [rsp+r3*2+18], m6 cmp hd, 
32 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w16_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*0], m0 por m1, m3 mova [dstq+16*1], m1 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq+16*0], m6 mova [dstq+16*1], m6 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main call .filter_copy lea r5d, [r3+2] cmp hd, 64 cmove r3d, r5d call .filter_edge_s3 .w32_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] movddup m3, 
[base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*2], m0 por m1, m3 mova [dstq+16*3], m1 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main call .filter_copy call .filter_edge_s3 .w64_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m1536] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*2], m0 por m1, m2 mova [dstq+16*3], m1 movu m0, [tlq+r3*2+64] movu m2, [tlq+r3*2+66] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+80] paddw m0, m2 movu m2, [tlq+r3*2+82] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m2048] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m2560] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*4], m0 por m1, m2 mova [dstq+16*5], m1 movu m0, [tlq+r3*2+96] movu m2, [tlq+r3*2+98] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+112] 
; ---------------------------------------------------------------------------
; Tail of ipred_z1_16bpc: remainder of one .w64_loop iteration (the loop body
; starts on the preceding lines), followed by the z1-local edge helpers.
; ---------------------------------------------------------------------------
    paddw m0, m2
    movu m2, [tlq+r3*2+114]
    psubw m2, m1
    pmulhrsw m2, m3
    paddw m1, m2
    movddup m2, [base+pw_m3072]
    movddup m3, [base+pw_m3584]
    pcmpgtw m2, m4
    pcmpgtw m3, m4
    paddw m4, m5
    pand m0, m2
    pandn m2, m6
    pand m1, m3
    pandn m3, m6
    por m0, m2
    mova [dstq+16*6], m0
    por m1, m3
    mova [dstq+16*7], m1
    dec hd
    jz .w64_end
    movifnidn strideq, stridemp
    add dstq, strideq
    add r5, dxq
    jl .w64_loop
; Remaining rows are filled entirely with m6.
.w64_end_loop:
    REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    add dstq, strideq
    dec hd
    jg .w64_end_loop
.w64_end:
    RET
ALIGN function_align
; .filter_copy: copies the edge at tlq into the stack buffer in 32-byte steps
; until r5 exceeds r3, stores a replicated copy of [tlq-2] in front of it and
; a replicated copy of [tlq+r3*2] behind it, then repoints tlq at the copy.
.filter_copy:
    pshuflw m2, [tlq-2], q0000
    pshuflw m3, [tlq+r3*2], q0000
    xor r5d, r5d
    movd [rsp+gprsize+12], m2
.filter_copy_loop:
    movu m1, [tlq+r5*2+16*0]
    movu m2, [tlq+r5*2+16*1]
    add r5d, 16
    mova [rsp+r5*2+gprsize-16*1], m1
    mova [rsp+r5*2+gprsize-16*0], m2
    cmp r5d, r3d
    jle .filter_copy_loop
    lea tlq, [rsp+gprsize+16*1]
    movq [tlq+r3*2+2], m3
    ret
; .filter_edge: r5d = filter strength on entry (3 is redirected to
; .filter_edge_s3), r3d = pixel count. In-place 3-tap filter with weights from
; z_filt_k: (k0*p[0] + k1*(p[-1] + p[+1]) + 8) >> 4, 8 pixels per iteration.
; The store of each result is delayed by one iteration so the unfiltered
; left neighbour is still readable.
.filter_edge:
    cmp r5d, 3
    je .filter_edge_s3
    movddup m4, [base+z_filt_k+r5*8-8]
    movddup m5, [base+z_filt_k+r5*8+8]
    xor r5d, r5d
    movddup m6, [base+pw_8]
    movu m2, [tlq-2]
    jmp .filter_edge_start
.filter_edge_loop:
    movu m2, [tlq+r5*2-2]
    mova [tlq+r5*2-16], m1
.filter_edge_start:
    pmullw m1, m4, [tlq+r5*2]
    movu m3, [tlq+r5*2+2]
    paddw m2, m3
    pmullw m2, m5
    add r5d, 8
    paddw m1, m6
    paddw m1, m2
    psrlw m1, 4
    cmp r5d, r3d
    jl .filter_edge_loop
    mova [tlq+r5*2-16], m1
    ret
; .filter_edge_s3: in-place 5-tap variant reading p[-2]..p[+2]; r3d = pixel
; count. Same delayed-store structure as .filter_edge.
.filter_edge_s3:
    movddup m5, [base+pw_3]
    xor r5d, r5d
    movu m2, [tlq-2]
    movu m3, [tlq-4]
    jmp .filter_edge_s3_start
.filter_edge_s3_loop:
    movu m2, [tlq+r5*2-2]
    movu m3, [tlq+r5*2-4]
    mova [tlq+r5*2-16], m1
.filter_edge_s3_start:
    paddw m2, [tlq+r5*2+0]
    paddw m3, m5
    movu m1, [tlq+r5*2+2]
    movu m4, [tlq+r5*2+4]
    add r5d, 8
    paddw m1, m2
    pavgw m3, m4
    paddw m1, m3
    psrlw m1, 2
    cmp r5d, r3d
    jl .filter_edge_s3_loop
    mova [tlq+r5*2-16], m1
    ret

; ---------------------------------------------------------------------------
; ipred_z2_16bpc(dst, stride, tl, w, h, angle, max_w, max_h, bitdepth_max)
;
; Prologue, as visible below:
;  * r9d packs ((w-4) << 6) | h; its low byte reloads h for every 4-pixel-wide
;    column, and .w4_end subtracts 1<<8 from it to count the columns.
;  * The 16*8 bytes below tl are copied to [rsp+16*5 .. 16*12], the replicated
;    [tl] pixel to [rsp+16*13], and the pixels from tl+2 to [rsp+16*14..].
;    The main loop addresses both edges relative to rsp.
;  * dx/dy are looked up in dr_intra_derivative. angle is XORed with 0x400 so
;    that "test angled, 0x400" is non-zero when edge filtering is disabled.
;  * The per-width entry point is taken from ipred_z2_16bpc_ssse3_table,
;    indexed by tzcnt(w).
; On x86-32, r9/r10/r11, max_w/max_h/bitdepth_max and stride live in stack
; slots (see the %defines), and the two shuffle masks that x86-64 keeps in
; m9/m10 are stored at [rsp+16*24] and [rsp+16*25].
; All widths share the .w4_main loop; .w8-.w64 only do edge filtering or
; upsampling before jumping there.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy
%define base r7-$$
%define maxwm r6m
%define maxhm r7m
%define bdmaxm r8m
    lea r7, [$$]
    mov hd, hm
    movddup m8, [base+pw_62]
    lea r9d, [wq-4]
    shl r9d, 6
    mova m9, [base+z2_top_shufA]
    or r9d, hd
    mova m10, [base+z2_left_shufA]
%else
cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx
%define base r1-$$
%define r9b byte [rsp+16*26+4*0]
%define r9d dword [rsp+16*26+4*0]
%define r10d dword [rsp+16*26+4*1]
%define r11d dword [rsp+16*26+4*2]
%define maxwm [rsp+16*2+4*0]
%define maxhm [rsp+16*2+4*1]
%define bdmaxm [rsp+16*2+4*2]
%define stridemp [rsp+16*26+4*3]
%define strideq r3
%define dyd r4
%define dyq r4
    mov stridemp, r1
    mov r1d, r6m
    mov r4d, r7m
    mov r5d, r8m
    mov maxwm, r1d
    mov maxhm, r4d
    mov bdmaxm, r5d
    LEA r1, $$
    lea hd, [wq-4]
    mova m0, [base+z2_top_shufA]
    shl hd, 6
    mova m1, [base+z2_left_shufA]
    or hd, hm
    mova [rsp+16*24], m0
    mov r9d, hd
    mova [rsp+16*25], m1
%endif
    tzcnt wd, wd
    movifnidn angled, anglem
    mova m0, [tlq-16*8]
    mova m1, [tlq-16*7]
    mova m2, [tlq-16*6]
    mova m3, [tlq-16*5]
    movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4]
%if ARCH_X86_64
    movzx dxd, angleb
%else
    movzx dxd, byte anglem
%endif
    mova m4, [tlq-16*4]
    mova m5, [tlq-16*3]
    mova m6, [tlq-16*2]
    mova m7, [tlq-16*1]
    mova [rsp+16* 5], m0
    xor angled, 0x400
    mova [rsp+16* 6], m1
    mov dyd, dxd
    mova [rsp+16* 7], m2
    neg dxq
    mova [rsp+16* 8], m3
    and dyd, ~1
    mova [rsp+16* 9], m4
    and dxq, ~1
    mova [rsp+16*10], m5
    lea wq, [base+ipred_z2_16bpc_ssse3_table+wq]
    mova [rsp+16*11], m6
    pxor m3, m3
    mova [rsp+16*12], m7
    movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
    movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
    movddup m0, [base+pw_256] ; 4<<6
    movd m4, [tlq]
    movu m5, [tlq+16*0+2]
    movu m6, [tlq+16*1+2]
    movsldup m1, [base+z2_dy_offset]
    pshufb m4, m0
    movq m7, [base+z_base_inc+2]
    mov r11d, (112-4)<<6
    mova [rsp+16*13], m4
    neg dxd
    mova [rsp+16*14], m5
    or dyd, 4<<16
    mova [rsp+16*15], m6
%if ARCH_X86_64
    lea r10d, [dxq+(112<<6)] ; xpos
%else
    mov [rsp+8*3], dyd
    lea r4d, [dxq+(112<<6)]
    mov r10d, r4d
    movzx hd, r9b
%endif
    movq [rsp+8*0], m1
    movq [rsp+8*1], m0
    movq [rsp+8*2], m7
    jmp wq
.w4:
    test angled, 0x400
    jnz .w4_main
    lea r3d, [hq+2]
    add angled, 1022
    pshuflw m1, m5, q3333
    shl r3d, 6
    movq [rsp+16*14+8], m1
    test r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    call .upsample_above
    sub angled, 1075 ; angle - 53
    lea r3d, [hq+3]
    xor angled, 0x7f ; 180 - angle
    movd m2, r3d
    movd m7, angled
    shr angled, 8 ; is_sm << 1
    pshufb m2, m3
    pshufb m7, m3
    pcmpeqb m2, [base+z_filt_wh4]
    pand m7, m2
    pcmpgtb m7, [base+z_filt_t_w48+angleq*8]
    jmp .w8_filter_left
; Subroutine: rsp-relative offsets carry an extra gprsize because it is
; reached via call. It doubles dx and switches the top shuffle mask to
; z2_top_shufB.
.upsample_above: ; w4/w8
    paddw m2, m5, [tlq]
    movu m1, [rsp+gprsize+16*14+2]
    movu m4, [rsp+gprsize+16*14-4]
%if ARCH_X86_64
    movd m6, r9m ; bdmax, offset due to call
%else
    movd m6, [rsp+gprsize+16*2+4*2]
%endif
    paddw m4, m1
    psubw m1, m2, m4
    pshufb m6, m0
    psraw m1, 3
    paddw m2, m1
    add dxd, dxd
    pmaxsw m2, m3
    paddw m7, m7
    pavgw m2, m3
    pminsw m2, m6
%if ARCH_X86_64
    mova m9, [base+z2_top_shufB]
    lea r10d, [dxq+(113<<6)]
    mov r11d, (112-7)<<6
%else
    mova m1, [base+z2_top_shufB]
    lea r3d, [dxq+(113<<6)]
    mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6
    mov [rsp+gprsize+16*26+4*1], r3d
    mova [rsp+gprsize+16*24], m1
%endif
    punpcklwd m1, m2, m5
    punpckhwd m2, m5
    movq [rsp+gprsize+8*2], m7
    mova [rsp+gprsize+16*14], m1
    mova [rsp+gprsize+16*15], m2
    ret
.w4_no_upsample_above:
    lea r3d, [hq+3]
    mov [rsp+16*4], angled
    sub angled, 1112 ; angle - 90
    movd m2, r3d
    mov r3d, 90
    movd m1, angled
    sub r3d, angled ; 180 - angle
    shr angled, 8 ; is_sm << 1
    mova m4, [base+z_filt_wh4]
    movd m7, r3d
    mova m5, [base+z_filt_t_w48+angleq*8]
    mov r3d, 4
    call .w8_filter_top
    mov angled, [rsp+16*4]
    lea r3d, [hq+2]
    sub angled, 139
    shl r3d, 6
    test r3d, angled
    jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; Falls through into .w4_main. It doubles dy and switches the left shuffle
; mask to z2_left_shufB.
.upsample_left: ; w4/w8
    mova m2, [tlq-16]
    lea r3d, [hq-4]
    movu m3, [tlq-14]
    movu m4, [rsp+16*12+4]
    pshufb m1, m2, [base+z2_upsample_l+r3*4]
    movd m6, bdmaxm
    pxor m5, m5
    paddw m3, m2
    paddw m4, m1
    psubw m1, m3, m4
    movshdup m4, [base+z2_dy_offset]
    psraw m1, 3
    pshufb m6, m0
    paddw m3, m1
    pmaxsw m3, m5
    pavgw m3, m5
    pminsw m3, m6
%if ARCH_X86_64
    mova m10, [base+z2_left_shufB]
    add dyd, dyd
%else
    mova m1, [base+z2_left_shufB]
    shl dword [rsp+8*3], 1
    mova [rsp+16*25], m1
%endif
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    movq [rsp+8*0], m4
    mova [rsp+16*12], m1
    mova [rsp+16*11], m2
; Main loop shared by every width: emits 4x4 blocks, walking down one
; 4-pixel-wide column (.w4_loop / .w4_leftonly_loop), then advancing dst by
; 8 bytes to the next column (.w4_end -> .w4_loop0).
.w4_main:
    movd m6, dxd
%if ARCH_X86_64
    movd m3, dyd
%else
    movd m3, [rsp+8*3]
%endif
    pshufb m6, m0
    movddup m0, [rsp+8*2]
    paddw m7, m6, m6
    movq m5, [base+pw_m1to4]
    pshuflw m4, m3, q0000
    punpcklqdq m6, m7
    pmullw m4, m5
    pshuflw m3, m3, q1111
    paddw m6, m0
    mov r2d, r10d
    pshuflw m0, m4, q3333
    psubw m4, [rsp+8*0]
    movq [rsp+8*3], m3
    movq [rsp+8*5], m0 ; dy*4
    mov r5, dstq
.w4_loop0:
    mova [rsp+16*4], m6
    movq [rsp+8*4], m4
%if ARCH_X86_64
    pand m0, m8, m4
%else
    movq m0, [base+pw_62]
    pand m0, m4
%endif
    psraw m4, 6
    psllw m0, 9 ; frac_y << 9
    movq [rsp+8*7], m0
    pabsw m4, m4
    movq [rsp+8*6], m4
    movzx hd, r9b
.w4_loop:
    lea r3d, [r2+dxq]
    shr r2d, 6 ; base_x0
    movu m2, [rsp+r2*2]
    lea r2d, [r3+dxq]
    shr r3d, 6 ; base_x1
    movu m1, [rsp+r3*2]
    lea r3d, [r2+dxq]
    shr r2d, 6 ; base_x2
    movu m3, [rsp+r2*2]
    lea r2d, [r3+dxq]
    shr r3d, 6 ; base_x3
    movu m4, [rsp+r3*2]
%if ARCH_X86_64
    REPX {pshufb x, m9}, m2, m1, m3, m4
%else
    mova m0, [rsp+16*24]
    REPX {pshufb x, m0}, m2, m1, m3, m4
%endif
    punpcklqdq m0, m2, m1
    punpckhqdq m2, m1
    punpcklqdq m1, m3, m4
    punpckhqdq m3, m4
%if ARCH_X86_64
    pand m5, m8, m6
%else
    movddup m5, [base+pw_62]
    pand m5, m6
%endif
    psllw m5, 9
    psubw m2, m0
    pmulhrsw m2, m5
    paddw m5, m6, m7
    psubw m3, m1
    paddw m0, m2
%if ARCH_X86_64
    pand m2, m8, m5
%else
    movddup m2, [base+pw_62]
    pand m2, m5
%endif
    psllw m2, 9
    pmulhrsw m3, m2
    paddw m1, m3
    cmp r3d, 111 ; topleft
    jge .w4_toponly
    ; At least one pixel needs the left edge: park the top-edge results, then
    ; gather and interpolate from the left-edge copy.
    mova [rsp+16*22], m0
    mova [rsp+16*23], m1
    movzx r3d, byte [rsp+8*6+0] ; base_y0
    movu m3, [rsp+r3*2]
    movzx r3d, byte [rsp+8*6+2] ; base_y1
    movu m2, [rsp+r3*2]
    movzx r3d, byte [rsp+8*6+4] ; base_y2
    movu m4, [rsp+r3*2]
    movzx r3d, byte [rsp+8*6+6] ; base_y3
    movu m0, [rsp+r3*2]
%if ARCH_X86_64
    REPX {pshufb x, m10}, m3, m2, m4, m0
%else
    mova m1, [rsp+16*25]
    REPX {pshufb x, m1}, m3, m2, m4, m0
%endif
    punpcklwd m1, m3, m2
    punpckhwd m3, m2 ; 01
    punpcklwd m2, m4, m0
    punpckhwd m4, m0 ; 23
    punpckldq m0, m1, m2 ; y0 y1
    punpckhdq m1, m2 ; y2 y3
    punpckldq m2, m3, m4
    punpckhdq m3, m4
    movddup m4, [rsp+8*7]
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    psraw m6, 15 ; base_x < topleft
    psraw m4, m5, 15
    paddw m0, m2
    paddw m1, m3
    pand m0, m6
    pandn m6, [rsp+16*22]
    pand m1, m4
    pandn m4, [rsp+16*23]
    por m0, m6
    por m1, m4
.w4_toponly:
    movifnidn strideq, stridemp
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jz .w4_end
    movq m4, [rsp+8*6]
    paddsw m6, m5, m7 ; xpos += dx
    movq m5, [rsp+8*3]
    psubw m4, m5
    lea dstq, [dstq+strideq*2]
    movq [rsp+8*6], m4
    cmp r2d, r11d
    jge .w4_loop
; From here down the column only the left edge is read.
.w4_leftonly_loop:
    movzx r2d, byte [rsp+8*6+0] ; base_y0
    movu m3, [rsp+r2*2]
    movzx r2d, byte [rsp+8*6+2] ; base_y1
    movu m2, [rsp+r2*2]
    movzx r2d, byte [rsp+8*6+4] ; base_y2
    movu m6, [rsp+r2*2]
    movzx r2d, byte [rsp+8*6+6] ; base_y3
    movu m0, [rsp+r2*2]
    psubw m4, m5
%if ARCH_X86_64
    REPX {pshufb x, m10}, m3, m2, m6, m0
%else
    mova m1, [rsp+16*25]
    REPX {pshufb x, m1}, m3, m2, m6, m0
%endif
    movq [rsp+8*6], m4
    punpcklwd m1, m3, m2
    punpckhwd m3, m2
    punpcklwd m2, m6, m0
    punpckhwd m6, m0
    punpckldq m0, m1, m2
    punpckhdq m1, m2
    punpckldq m2, m3, m6
    punpckhdq m3, m6
    movddup m6, [rsp+8*7]
    psubw m2, m0
    psubw m3, m1
    pmulhrsw m2, m6
    pmulhrsw m3, m6
    paddw m0, m2
    paddw m1, m3
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w4_leftonly_loop
.w4_end:
    sub r9d, 1<<8
    jl .w4_ret
    movq m4, [rsp+8*5]
    add r5, 8
    mov dstq, r5
    paddw m4, [rsp+8*4] ; base_y += 4*dy
    movzx r2d, word [rsp+8*1]
    movddup m6, [rsp+8*1]
    paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above)
    add r2d, r10d
    mov r10d, r2d
    jmp .w4_loop0
.w4_ret:
    RET
.w8:
    test angled, 0x400
    jnz .w4_main
    lea r3d, [angleq+126]
    pshufhw m1, m5, q3333
%if ARCH_X86_64
    mov r3b, hb
%else
    xor r3b, r3b
    or r3d, hd
%endif
    movhps [rsp+16*15], m1
    cmp r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    call .upsample_above
    sub angled, 53
    lea r3d, [hq+7]
    xor angled, 0x7f ; 180 - angle
    movu m1, [base+z_filt_wh8]
    movd m2, r3d
    movd m7, angled
    shr angled, 8 ; is_sm << 1
    psrldq m4, [base+z_filt_t_w48+angleq*8], 4
    pshufb m2, m3
    pshufb m7, m3
    pcmpeqb m2, m1
    movq m1, [base+pw_512]
    pand m7, m2
    pcmpgtb m7, m4
    movq [rsp+8*1], m1 ; 8<<6
    jmp .w8_filter_left
.w8_no_upsample_above:
    lea r3d, [hq+7]
    mov [rsp+16*4], angled
    sub angled, 90
    movd m2, r3d
    mov r3d, 90
    movd m1, angled
    sub r3d, angled ; 180 - angle
    shr angled, 8 ; is_sm << 1
    movu m4, [base+z_filt_wh8]
    movd m7, r3d
    psrldq m5, [base+z_filt_t_w48+angleq*8], 4
    mov r3d, 8
    call .w8_filter_top
    mov r3d, [rsp+16*4]
    sub r3d, 141
%if ARCH_X86_64
    mov r3b, hb
%else
    xor r3b, r3b
    or r3d, hd
%endif
    cmp r3d, 8
    jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
; Left-edge filtering reuses ipred_z3's .filter_edge. h is negated before the
; call and r3 keeps the caller's tl pointer for .filter_left_end.
.w8_filter_left:
    pmovmskb r5d, m7
    test r5d, r5d
    jz .w4_main
    imul r5d, 0x55555555
    neg hq
    mov r3, tlq
    movd m1, [tlq+hq*2]
    shr r5d, 30 ; filter_strength
    lea tlq, [rsp+16*13-2]
    pshuflw m1, m1, q0000
    movq [tlq+hq*2-6], m1
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
; Subroutine (w4/w8): top-edge filtering through ipred_z1's .filter_edge.
; tlq is parked in [dstq] across the call. When max_w < 8, the pixels from
; tl+max_w*2+2 onwards are copied back unfiltered over the stack copy.
.w8_filter_top:
    REPX {pshufb x, m3}, m2, m1, m7
    pcmpeqb m2, m4
    pand m1, m2
    pand m7, m2
    pcmpgtb m1, m5
    pcmpgtb m7, m5
    pmovmskb r5d, m1
    test r5d, r5d
    jz .w8_filter_top_end ; filter_strength == 0
    imul r5d, 0x55555555
    mov [dstq], tlq
    lea tlq, [rsp+16*14+gprsize]
    shr r5d, 30 ; filter_strength
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
%if ARCH_X86_64
    mov r3d, r7m ; maxw, offset due to call
%else
    mov r3d, [rsp+16*2+4*1]
%endif
    mov tlq, [dstq]
    cmp r3d, 8
    jge .w8_filter_top_end
    movu m1, [tlq+r3*2+16*0+2]
    movu m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14+gprsize], m1
    movu [rsp+r3*2+16*15+gprsize], m2
.w8_filter_top_end:
    ret
.w16:
    test angled, 0x400
    jnz .w4_main
    lea r3d, [hq+15]
    sub angled, 90
    movd m2, r3d
    mov r3d, 90
    movd m1, angled
    sub r3d, angled ; 180 - angle
    shr angled, 8 ; is_sm << 1
    movd m7, r3d
    REPX {pshufb x, m3}, m2, m1, m7
    movq m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb m2, [base+z_filt_wh16]
    pand m1, m2
    pand m7, m2
    pcmpgtb m1, m4
    pcmpgtb m7, m4
    pmovmskb r5d, m1
    test r5d, r5d
    jz .w16_filter_left ; filter_strength == 0
    imul r5d, 0x24924924
    pshufhw m6, m6, q3333
    mov [dstq], tlq
    lea tlq, [rsp+16*14]
    shr r5d, 30
    movhps [tlq+16*2], m6
    adc r5d, -1 ; filter_strength
    mov r3d, 16
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge
    mov r3d, maxwm
    mov tlq, [dstq]
    cmp r3d, 16
    jge .w16_filter_left
    movu m1, [tlq+r3*2+16*0+2]
    movu m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
.w16_filter_left:
    pmovmskb r5d, m7
    test r5d, r5d
    jz .w4_main
    imul r5d, 0x24924924
    neg hq
    mov r3, tlq
    movd m1, [tlq+hq*2]
    shr r5d, 30
    lea tlq, [rsp+16*13-2]
    pshuflw m1, m1, q0000
    adc r5d, -1 ; filter_strength
    movq [tlq+hq*2-6], m1
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
    jmp .filter_left_end
.w32:
    movu m1, [tlq+16*2+2]
    movu m2, [tlq+16*3+2]
    mova [rsp+16*16], m1
    mova [rsp+16*17], m2
    test angled, 0x400
    jnz .w4_main
    mov [dstq], tlq
    lea tlq, [rsp+16*14]
    pshufhw m2, m2, q3333
    mov r3d, 32
    movhps [tlq+16*4], m2
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov r3d, maxwm
    mov tlq, [dstq]
    cmp r3d, 32
    jge .filter_left
    movu m1, [tlq+r3*2+16*0+2]
    movu m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
    cmp r3d, 16
    jge .filter_left
    movu m1, [tlq+r3*2+16*2+2]
    movu m2, [tlq+r3*2+16*3+2]
    movu [rsp+r3*2+16*16], m1
    movu [rsp+r3*2+16*17], m2
.filter_left:
    neg hq
    mov r3, tlq
    pshuflw m1, [tlq+hq*2], q0000
    lea tlq, [rsp+16*13-2]
    movq [tlq+hq*2-6], m1
    call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3
; When max_h < h, copy the left-edge pixels beyond max_h back from the
; caller's buffer (r3) over the stack copy, 32 bytes per step.
.filter_left_end:
    mov r2d, maxhm
    cmp r2d, hd
    jge .w4_main
    neg r2
    movu m1, [r3+r2*2-16*1]
    movu m2, [r3+r2*2-16*2]
    movu [rsp+r2*2+16*12], m1
    movu [rsp+r2*2+16*11], m2
    cmp r2d, -48
    jle .w4_main
    movu m1, [r3+r2*2-16*3]
    movu m2, [r3+r2*2-16*4]
; ---------------------------------------------------------------------------
; End of ipred_z2_16bpc: the rest of .filter_left_end (which begins on the
; preceding line) and the .w64 entry point.
; ---------------------------------------------------------------------------
    movu [rsp+r2*2+16*10], m1
    movu [rsp+r2*2+16* 9], m2
    cmp r2d, -32
    jle .w4_main
    movu m1, [r3+r2*2-16*5]
    movu m2, [r3+r2*2-16*6]
    movu [rsp+r2*2+16* 8], m1
    movu [rsp+r2*2+16* 7], m2
    cmp r2d, -16
    jle .w4_main
    movu m1, [r3+r2*2-16*7]
    movu m2, [r3+r2*2-16*8]
    movu [rsp+r2*2+16* 6], m1
    movu [rsp+r2*2+16* 5], m2
    jmp .w4_main
; z2 .w64: extends the stack copy of the top edge to 64 pixels, then (unless
; edge filtering is disabled) filters it with ipred_z1's .filter_edge_s3 and
; copies pixels beyond max_w back unfiltered, 16 pixels at a time.
.w64:
    movu m1, [tlq+16*2+2]
    movu m2, [tlq+16*3+2]
    movu m3, [tlq+16*4+2]
    movu m4, [tlq+16*5+2]
    movu m5, [tlq+16*6+2]
    movu m6, [tlq+16*7+2]
    mov [dstq], tlq
    lea tlq, [rsp+16*14]
    mova [tlq+16*2], m1
    mova [tlq+16*3], m2
    mova [tlq+16*4], m3
    mova [tlq+16*5], m4
    mova [tlq+16*6], m5
    mova [tlq+16*7], m6
    test angled, 0x400
    jnz .w4_main
    pshufhw m6, m6, q3333
    mov r3d, 64
    movhps [tlq+16*8], m6
    call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3
    mov r3d, maxwm
    mov tlq, [dstq]
    cmp r3d, 64
    jge .filter_left
    movu m1, [tlq+r3*2+16*0+2]
    movu m2, [tlq+r3*2+16*1+2]
    movu [rsp+r3*2+16*14], m1
    movu [rsp+r3*2+16*15], m2
    cmp r3d, 48
    jge .filter_left
    movu m1, [tlq+r3*2+16*2+2]
    movu m2, [tlq+r3*2+16*3+2]
    movu [rsp+r3*2+16*16], m1
    movu [rsp+r3*2+16*17], m2
    cmp r3d, 32
    jge .filter_left
    movu m1, [tlq+r3*2+16*4+2]
    movu m2, [tlq+r3*2+16*5+2]
    movu [rsp+r3*2+16*18], m1
    movu [rsp+r3*2+16*19], m2
    cmp r3d, 16
    jge .filter_left
    movu m1, [tlq+r3*2+16*6+2]
    movu m2, [tlq+r3*2+16*7+2]
    movu [rsp+r3*2+16*20], m1
    movu [rsp+r3*2+16*21], m2
    jmp .filter_left

; ---------------------------------------------------------------------------
; ipred_z3_16bpc(dst, stride, tl, w, h, angle, ..., pixel_max in r8m)
;
; Dispatches on tzcnt(h) through ipred_z3_16bpc_ssse3_table. Every .hN path
; walks the edge below tl (tlq is decremented, loads use negative offsets) and
; writes its output to a stack buffer that grows downwards: .h4-.h16 reserve
; it once with "sub rsp, wq", .h32/.h64 reserve 16*4 / 16*8 bytes per
; iteration. .end_transpose then writes that buffer to dst, transposing 4x4
; words at a time, with r3 = h*2 as the byte pitch of the buffer.
; On x86-32, stride and w are parked in the first 8 bytes of dst and reloaded
; in .end_transpose; org_w aliases r5.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w
%define base r7-$$
    lea r7, [$$]
    mov org_wd, wd
%else
cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy
%define base r1-$$
%define org_wd r5
%define org_wq r5
    movd m6, r8m ; pixel_max
    mov [dstq+4*0], strideq
    LEA r1, $$
    mov [dstq+4*1], wd
%endif
    tzcnt hd, hm
    movifnidn angled, anglem
    sub tlq, 2
    movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4]
    sub angled, 180
    movddup m0, [base+pw_256]
    mov dyd, angled
    neg dyd
    xor angled, 0x400
    movddup m7, [base+pw_62]
    or dyq, ~0x7e
    lea hq, [base+ipred_z3_16bpc_ssse3_table+hq]
    movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
    jmp hq
.h4:
    lea r4d, [angleq+88]
    test r4d, 0x480
    jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
    sar r4d, 9
    add r4d, wd
    cmp r4d, 8
    jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm)
    mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0
    movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1
%if ARCH_X86_64
    movd m6, r8m
%endif
    pshufb m4, m2, m0
    mov tlq, rsp
    palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2
    add dyd, dyd
    palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3
    paddw m1, m2
    paddw m3, m5
    psubw m5, m1, m3
    mova m3, [base+z_upsample]
    mova [tlq+ 0], m4
    movd m4, dyd
    psraw m5, 3
    neg dyd
    paddw m1, m5
    pxor m5, m5
    lea r5d, [dyq+(16<<6)+63] ; ypos
    pmaxsw m1, m5
    pshufb m6, m0
    shl wd, 3
    pavgw m1, m5
    pshufb m4, m0
    pminsw m1, m6
    sub rsp, wq
    punpckhwd m0, m1, m2
    paddw m5, m4, m4
    punpcklwd m1, m2
    mova [tlq+32], m0
    movsd m4, m5
    mova [tlq+16], m1
; Two output columns (2 x 4 pixels) per iteration.
.h4_upsample_loop:
    lea r4d, [r5+dyq]
    sar r5d, 6
    movu m2, [tlq+r5*2]
    lea r5d, [r4+dyq]
    sar r4d, 6
    movu m1, [tlq+r4*2]
    pshufb m2, m3
    pshufb m1, m3
    punpckhqdq m0, m1, m2
    punpcklqdq m1, m2
    pand m2, m7, m4
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m4, m5
    paddw m0, m1
    mova [rsp+wq-16], m0
    sub wd, 16
    jg .h4_upsample_loop
    or r3d, 4*2
    jmp .end_transpose
.h4_no_upsample:
    mov r4d, 7
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .h4_main
    lea r4d, [wq+3]
    movd m1, r4d
    movd m3, angled
    shr angled, 8 ; is_sm << 1
    pxor m2, m2
    pshufb m1, m2
    pshufb m3, m2
    pcmpeqb m1, [base+z_filt_wh4]
    pand m1, m3
    pcmpgtb m1, [base+z_filt_t_w48+angleq*8]
    pmovmskb r5d, m1
    mov r4d, 7
    test r5d, r5d
    jz .h4_main ; filter_strength == 0
    pshuflw m1, [tlq+2], q0000
    imul r5d, 0x55555555
    mova m2, [tlq-14]
    neg r4
    movd m3, [tlq+r4*2]
    shr r5d, 30
    movd [rsp+16*17], m1
    pshuflw m3, m3, q0000
    mova [rsp+16*16], m2
    lea r2, [r4-2]
    movq [rsp+16*17+r4*2-10], m3
    cmp wd, 8
    cmovae r4, r2
    lea tlq, [rsp+16*17-2]
    call .filter_edge
.h4_main:
    movd m4, dyd
    sub tlq, r4
    movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6
    sub tlq, r4
    shl r4d, 6
    movd m6, [tlq]
    movd m3, r4d
    pshufb m4, m0
    neg dyq
    pshufb m6, m0
    lea r5, [dyq+r4+63] ; ypos
    pshufb m3, m0
    shl wd, 3
    paddw m5, m4, m4
    sub rsp, wq
    psubw m3, m1 ; max_base_y
    movsd m4, m5 ; ypos1 ypos0
.h4_loop:
    lea r4, [r5+dyq]
    sar r5, 6
    movddup m0, [tlq+r5*2-6]
    movddup m1, [tlq+r5*2-8]
    lea r5, [r4+dyq]
    sar r4, 6
    movlps m0, [tlq+r4*2-6]
    movlps m1, [tlq+r4*2-8]
    pand m2, m7, m4
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    pcmpgtw m2, m3, m4
    paddw m4, m5
    paddw m0, m1
    pand m0, m2
    pandn m2, m6
    por m0, m2
    mova [rsp+wq-16], m0
    sub wd, 16
    jz .h4_transpose
    test r5d, r5d
    jg .h4_loop
; Remaining columns are filled entirely with m6.
.h4_end_loop:
    mova [rsp+wq-16], m6
    sub wd, 16
    jg .h4_end_loop
.h4_transpose:
    or r3d, 4*2
    jmp .end_transpose
.h8:
    lea r4d, [angleq+88]
    and r4d, ~0x7f
    or r4d, wd
    cmp r4d, 8
    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
    mova m2, [tlq-30] ; g f e d c b a 9
    movu m1, [tlq-32] ; _ g f e d c b a
    movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2
    paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1
    pshufd m4, m2, q2100 ; _ _ g f e d c b
    paddw m1, m2
    movu m5, [tlq-28] ; f e d c b a 9 8
    add dyd, dyd
    cmp wd, 8
    je .h8_upsample_w8
    pshufhw m4, m2, q1000 ; _ _ _ _ c c c b
.h8_upsample_w8:
    paddw m4, m5
    psubw m5, m1, m4
    movu m4, [tlq-18] ; a 9 8 7 6 5 4 3
    psraw m5, 3
    paddw m1, m5
    movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0
%if ARCH_X86_64
    movd m6, r8m ; pixel_max
%endif
    paddw m4, m5
    shl wd, 4
    psubw m5, m3, m4
    movd m4, dyd
    psraw m5, 3
    neg dyd
    paddw m3, m5
    pshufb m6, m0
    mova m5, [tlq-14]
    pshufb m4, m0
    pxor m0, m0
    pmaxsw m1, m0
    pmaxsw m3, m0
    mov tlq, rsp
    pavgw m1, m0
    pavgw m3, m0
    sub rsp, wq
    pminsw m1, m6
    pminsw m6, m3
    mova m3, [base+z_upsample]
    lea r5d, [dyq+(16<<6)+63] ; ypos
    punpcklwd m0, m1, m2
    mova [tlq+16*0], m0
    punpckhwd m1, m2
    mova [tlq+16*1], m1
    punpcklwd m0, m6, m5
    mova [tlq+16*2], m0
    punpckhwd m6, m5
    mova [tlq+16*3], m6
    mova m5, m4
.h8_upsample_loop:
    mov r4d, r5d
    sar r4d, 6
    movu m1, [tlq+r4*2+16*0]
    movu m2, [tlq+r4*2+16*1]
    add r5d, dyd
    pshufb m2, m3
    pshufb m1, m3
    punpckhqdq m0, m1, m2
    punpcklqdq m1, m2
    pand m2, m7, m4
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    paddw m4, m5
    paddw m0, m1
    mova [rsp+wq-16], m0
    sub wd, 16
    jg .h8_upsample_loop
    or r3d, 8*2
    jmp .end_transpose
.h8_no_upsample:
    lea r4d, [wq+7]
    movd m1, r4d
    and r4d, 7
    or r4d, 8 ; imin(w+7, 15)
    test angled, 0x400
    jnz .h8_main
    movd m3, angled
    shr angled, 8 ; is_sm << 1
    pxor m2, m2
    pshufb m1, m2
    pshufb m3, m2
    movu m2, [base+z_filt_wh8]
    psrldq m4, [base+z_filt_t_w48+angleq*8], 4
    pcmpeqb m2, m1
    pand m2, m3
    pcmpgtb m2, m4
    pmovmskb r5d, m2
    test r5d, r5d
    jz .h8_main ; filter_strength == 0
    pshuflw m1, [tlq+2], q0000
    imul r5d, 0x55555555
    mova m2, [tlq-16*1+2]
    neg r4
    mova m3, [tlq-16*2+2]
    shr r5d, 30
    movd m4, [tlq+r4*2]
    movd [rsp+16*17], m1
    mova [rsp+16*16], m2
    pshuflw m4, m4, q0000
    mova [rsp+16*15], m3
    lea r2, [r4-2]
    movq [rsp+16*17+r4*2-10], m4
    cmp wd, 16
    cmovae r4, r2
    lea tlq, [rsp+16*17-2]
    call .filter_edge
.h8_main:
    sub tlq, r4
    movd m4, dyd
    sub tlq, r4
    shl r4d, 6
    movd m6, [tlq]
    movd m3, r4d
    pshufb m4, m0
    neg dyq
    pshufb m6, m0
    lea r5, [dyq+r4+63]
    pshufb m3, m0
    shl wd, 4
    mova m5, m4
    sub rsp, wq
    psubw m3, [base+z_base_inc_z2]
.h8_loop:
    mov r4, r5
    sar r4, 6
    movu m0, [tlq+r4*2-14]
    movu m1, [tlq+r4*2-16]
    pand m2, m7, m4
    psllw m2, 9
    psubw m1, m0
    pmulhrsw m1, m2
    pcmpgtw m2, m3, m4
    paddw m4, m5
    paddw m0, m1
    pand m0, m2
    pandn m2, m6
    por m0, m2
    mova [rsp+wq-16], m0
    sub wd, 8*2
    jz .h8_transpose
    add r5, dyq
    jg .h8_loop
.h8_end_loop:
    mova [rsp+wq-16], m6
    sub wd, 8*2
    jg .h8_end_loop
.h8_transpose:
    or r3d, 8*2
    jmp .end_transpose
.h16:
    lea r4d, [wq+15]
    movd m1, r4d
    and r4d, 15
    or r4d, 16 ; imin(w+15, 31)
    test angled, 0x400
    jnz .h16_main
    movd m3, angled
    shr angled, 8 ; is_sm << 1
    pxor m2, m2
    pshufb m1, m2
    pshufb m3, m2
    movq m4, [base+z_filt_t_w16+angleq*4]
    pcmpeqb m1, [base+z_filt_wh16]
    pand m1, m3
    pcmpgtb m1, m4
    pmovmskb r5d, m1
    test r5d, r5d
    jz .h16_main ; filter_strength == 0
    pshuflw m1, [tlq+2], q0000
    mova m2, [tlq-16*1+2]
    imul r5d, 0x24924924
    mova m3, [tlq-16*2+2]
    neg r4
    mova m4, [tlq-16*3+2]
    shr r5d, 30
    mova m5, [tlq-16*4+2]
    movd m6, [tlq+r4*2]
    adc r5d, -1 ; filter_strength
    movd [rsp+16*17], m1
    mova [rsp+16*16], m2
    mova [rsp+16*15], m3
    pshuflw m6, m6, q0000
    mova [rsp+16*14], m4
    mova [rsp+16*13], m5
    lea r2, [r4-2]
    movq [rsp+16*17+r4*2-10], m6
    cmp wd, 32
    cmovae r4, r2
    lea tlq, [rsp+16*17-2]
    call .filter_edge
.h16_main:
    sub tlq, r4
    movd m5, dyd
    sub tlq, r4
    shl r4d, 6
    movd m6, [tlq]
    movd m3, r4d
    pshufb m5, m0
    neg dyq
    pshufb m6, m0
    lea r5, [dyq+r4+63]
    pshufb m3, m0
    shl wd, 5
    paddw m4, m5, [base+z_base_inc_z2]
    sub rsp, wq
    psubw m4, m3
.h16_loop:
    mov r4, r5
    sar r4, 6
    movu m0, [tlq+r4*2-14]
    movu m2, [tlq+r4*2-16]
    pand m3, m7, m4
    psllw m3, 9
    psubw m2, m0
    pmulhrsw m2, m3
    movu m1, [tlq+r4*2-30]
    paddw m0, m2
    movu m2, [tlq+r4*2-32]
    psubw m2, m1
    pmulhrsw m2, m3
    movddup m3, [base+pw_m512]
    paddw m1, m2
    psraw m2, m4, 15
    pcmpgtw m3, m4
    paddw m4, m5
    pand m0, m2
    pandn m2, m6
    pand m1, m3
    pandn m3, m6
    por m0, m2
    mova [rsp+wq-16*1], m0
    por m1, m3
    mova [rsp+wq-16*2], m1
    sub wd, 16*2
    jz .h16_transpose
    add r5, dyq
    jg .h16_loop
.h16_end_loop:
    mova [rsp+wq-16*1], m6
    mova [rsp+wq-16*2], m6
    sub wd, 16*2
    jg .h16_end_loop
.h16_transpose:
    or r3d, 16*2
    jmp .end_transpose
.h32:
    lea r4d, [wq+31]
    and r4d, 31
    or r4d, 32 ; imin(w+31, 63)
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .h32_main
    call .filter_copy
    lea r5, [r4-2]
    cmp wd, 64
    cmove r4, r5
    call .filter_edge_s3
.h32_main:
    sub tlq, r4
    movd m5, dyd
    sub tlq, r4
    shl r4d, 6
    movd m6, [tlq]
    movd m3, r4d
    pshufb m5, m0
    neg dyq
    pshufb m6, m0
    lea r5, [dyq+r4+63]
    pshufb m3, m0
    paddw m4, m5, [base+z_base_inc_z2]
    psubw m4, m3
; One 32-pixel column per iteration; rsp drops by 16*4 each time and wd counts
; columns (dec wd).
.h32_loop:
    mov r4, r5
    sar r4, 6
    movu m0, [tlq+r4*2-14]
    movu m3, [tlq+r4*2-16]
    pand m2, m7, m4
    psllw m2, 9
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2-30]
    paddw m0, m3
    movu m3, [tlq+r4*2-32]
    psubw m3, m1
    pmulhrsw m3, m2
    sub rsp, 16*4
    paddw m1, m3
    psraw m3, m4, 15
    pand m0, m3
    pandn m3, m6
    por m0, m3
    movddup m3, [base+pw_m512]
    pcmpgtw m3, m4
    pand m1, m3
    pandn m3, m6
    mova [rsp+16*3], m0
    por m1, m3
    mova [rsp+16*2], m1
    movu m0, [tlq+r4*2-46]
    movu m3, [tlq+r4*2-48]
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2-62]
    paddw m0, m3
    movu m3, [tlq+r4*2-64]
    psubw m3, m1
    pmulhrsw m3, m2
    movddup m2, [base+pw_m1024]
    paddw m1, m3
    movddup m3, [base+pw_m1536]
    pcmpgtw m2, m4
    pcmpgtw m3, m4
    paddw m4, m5
    pand m0, m2
    pandn m2, m6
    pand m1, m3
    pandn m3, m6
    por m0, m2
    mova [rsp+16*1], m0
    por m1, m3
    mova [rsp+16*0], m1
    dec wd
    jz .h32_transpose
    add r5, dyq
    jg .h32_loop
.h32_end_loop:
    sub rsp, 16*4
    REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0
    dec wd
    jg .h32_end_loop
.h32_transpose:
    or r3d, 32*2
    jmp .end_transpose
.h64:
    lea r4d, [wq+63]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .h64_main
    call .filter_copy
    call .filter_edge_s3
.h64_main:
    sub tlq, r4
    movd m5, dyd
    sub tlq, r4
    shl r4d, 6
    movd m6, [tlq]
    movd m3, r4d
    pshufb m5, m0
    neg dyq
    pshufb m6, m0
    lea r5, [dyq+r4+63]
    pshufb m3, m0
    paddw m4, m5, [base+z_base_inc_z2]
    psubw m4, m3
; One 64-pixel column per iteration; rsp drops by 16*8 each time.
.h64_loop:
    mov r4, r5
    sar r4, 6
    movu m0, [tlq+r4*2- 14]
    movu m3, [tlq+r4*2- 16]
    pand m2, m7, m4
    psllw m2, 9
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2- 30]
    paddw m0, m3
    movu m3, [tlq+r4*2- 32]
    psubw m3, m1
    pmulhrsw m3, m2
    sub rsp, 16*8
    paddw m1, m3
    psraw m3, m4, 15
    pand m0, m3
    pandn m3, m6
    por m0, m3
    movddup m3, [base+pw_m512]
    pcmpgtw m3, m4
    pand m1, m3
    pandn m3, m6
    mova [rsp+16*7], m0
    por m1, m3
    mova [rsp+16*6], m1
    movu m0, [tlq+r4*2- 46]
    movu m3, [tlq+r4*2- 48]
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2- 62]
    paddw m0, m3
    movu m3, [tlq+r4*2- 64]
    psubw m3, m1
    pmulhrsw m3, m2
    paddw m1, m3
    movddup m3, [base+pw_m1024]
    pcmpgtw m3, m4
    pand m0, m3
    pandn m3, m6
    por m0, m3
    movddup m3, [base+pw_m1536]
    pcmpgtw m3, m4
    pand m1, m3
    pandn m3, m6
    mova [rsp+16*5], m0
    por m1, m3
    mova [rsp+16*4], m1
    movu m0, [tlq+r4*2- 78]
    movu m3, [tlq+r4*2- 80]
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2- 94]
    paddw m0, m3
    movu m3, [tlq+r4*2- 96]
    psubw m3, m1
    pmulhrsw m3, m2
    paddw m1, m3
    movddup m3, [base+pw_m2048]
    pcmpgtw m3, m4
    pand m0, m3
    pandn m3, m6
    por m0, m3
    movddup m3, [base+pw_m2560]
    pcmpgtw m3, m4
    pand m1, m3
    pandn m3, m6
    mova [rsp+16*3], m0
    por m1, m3
    mova [rsp+16*2], m1
    movu m0, [tlq+r4*2-110]
    movu m3, [tlq+r4*2-112]
    psubw m3, m0
    pmulhrsw m3, m2
    movu m1, [tlq+r4*2-126]
    paddw m0, m3
    movu m3, [tlq+r4*2-128]
    psubw m3, m1
    pmulhrsw m3, m2
    movddup m2, [base+pw_m3072]
    paddw m1, m3
    movddup m3, [base+pw_m3584]
    pcmpgtw m2, m4
    pcmpgtw m3, m4
    paddw m4, m5
    pand m0, m2
    pandn m2, m6
    pand m1, m3
    pandn m3, m6
    por m0, m2
    mova [rsp+16*1], m0
    por m1, m3
    mova [rsp+16*0], m1
    dec wd
    jz .h64_transpose
    add r5, dyq
    jg .h64_loop
.h64_end_loop:
    sub rsp, 16*8
    REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0
    dec wd
    jg .h64_end_loop
.h64_transpose:
    add r3d, 64*2
; r3 = h*2 (byte pitch of one buffered column), r4 = 3*r3. Each outer pass
; consumes four buffered columns (rsp += r3*4) and fills four dst columns,
; starting from the rightmost (dst + org_w*2 - 8); the inner loop walks the
; buffer downwards in 8-byte steps until r2 drops below rsp.
.end_transpose:
%if ARCH_X86_64
    lea r7, [strideq*3]
%else
    mov strideq, [dstq+4*0]
    mov org_wd, [dstq+4*1]
%endif
    lea r4d, [r3*3]
.end_transpose_loop:
    lea r2, [rsp+r3-8]
    lea r6, [dstq+org_wq*2-8]
.end_transpose_loop_y:
    movq m0, [r2+r4 ]
    movq m1, [r2+r3*2]
    movq m2, [r2+r3*1]
    movq m3, [r2+r3*0]
    sub r2, 8
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpckhdq m1, m0, m2
    punpckldq m0, m2
    movhps [r6+strideq*0], m1
    movq [r6+strideq*1], m1
%if ARCH_X86_64
    movhps [r6+strideq*2], m0
    movq [r6+r7 ], m0
    lea r6, [r6+strideq*4]
%else
    lea r6, [r6+strideq*2]
    movhps [r6+strideq*0], m0
    movq [r6+strideq*1], m0
    lea r6, [r6+strideq*2]
%endif
    cmp r2, rsp
    jae .end_transpose_loop_y
    lea rsp, [rsp+r3*4]
    sub org_wd, 4
    jg .end_transpose_loop
    RET
; .filter_copy (z3 variant): negates r4, then copies the edge downwards from
; tlq into the stack buffer in 32-byte steps while r5 > r4. A replicated copy
; of [tlq+2] goes above the copy and a replicated copy of the last edge pixel
; below it; tlq is repointed at the copy.
.filter_copy:
    neg r4
    pshuflw m2, [tlq+2], q0000
    xor r5d, r5d
    pshuflw m3, [tlq+r4*2], q0000
    movq [rsp+gprsize+16*17], m2
.filter_copy_loop:
    mova m1, [tlq+r5*2-16*1+2]
    mova m2, [tlq+r5*2-16*2+2]
    sub r5, 16
    mova [rsp+r5*2+gprsize+16*18], m1
    mova [rsp+r5*2+gprsize+16*17], m2
    cmp r5d, r4d
    jg .filter_copy_loop
    lea tlq, [rsp+gprsize+16*17-2]
    movq [tlq+r4*2-8], m3
    ret
; .filter_edge (z3 variant): r5d = filter strength on entry (3 is redirected
; to .filter_edge_s3), r4 = negative pixel count. It walks downwards from tlq,
; 8 pixels per iteration. This label is also called from ipred_z2_16bpc.
.filter_edge:
    cmp r5d, 3
    je .filter_edge_s3
    movddup m4, [base+z_filt_k+r5*8-8]
    movddup m5, [base+z_filt_k+r5*8+8]
    xor r5d, r5d
    movddup m6, [base+pw_8]
    movu m2, [tlq-12]
    jmp .filter_edge_start
.filter_edge_loop:
    movu m2, [tlq+r5*2-12]
    mova [tlq+r5*2+2], m1
.filter_edge_start:
    pmullw m1, m4, [tlq+r5*2-14]
    movu m3, [tlq+r5*2-16]
    sub r5, 8
    paddw m2, m3
pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r4d jg .filter_edge_loop mova [tlq+r5*2+2], m1 neg r4d ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-12] movu m3, [tlq-10] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-12] movu m3, [tlq+r5*2-10] mova [tlq+r5*2+2], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2-14] paddw m3, m5 movu m1, [tlq+r5*2-16] movu m4, [tlq+r5*2-18] sub r5, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r4d jg .filter_edge_s3_loop mova [tlq+r5*2+2], m1 neg r4d ret %if ARCH_X86_64 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter %else cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif %define base r6-$$ movifnidn hd, hm movd m6, r8m ; bitdepth_max %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif LEA r6, $$ shl filterd, 6 movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 mova m1, [base+filter_intra_taps+filterq+16*0] mova m2, [base+filter_intra_taps+filterq+16*1] mova m3, [base+filter_intra_taps+filterq+16*2] mova m4, [base+filter_intra_taps+filterq+16*3] pxor m5, m5 %if ARCH_X86_64 punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid punpcklbw m10, m5, m2 ; having to perform sign-extension. 
punpckhbw m11, m5, m2 punpcklbw m12, m5, m3 punpckhbw m13, m5, m3 punpcklbw m14, m5, m4 punpckhbw m15, m5, m4 %else punpcklbw m7, m5, m1 mova m8, m7 punpckhbw m7, m5, m1 mova m9, m7 punpcklbw m7, m5, m2 mova m10, m7 punpckhbw m7, m5, m2 mova m11, m7 punpcklbw m7, m5, m3 mova m12, m7 punpckhbw m7, m5, m3 mova m13, m7 punpcklbw m7, m5, m4 mova m14, m7 punpckhbw m7, m5, m4 mova m15, m7 %endif mova m7, [base+filter_shuf] add hd, hd mov r5, dstq pshuflw m6, m6, q0000 mov r6, tlq punpcklqdq m6, m6 sub tlq, hq .left_loop: pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ pshufd m1, m0, q0000 pmaddwd m2, m8, m1 pmaddwd m1, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 paddd m2, m3 paddd m1, m4 pshufd m4, m0, q2222 pmaddwd m3, m12, m4 pmaddwd m4, m13 paddd m2, m3 paddd m1, m4 pshufd m3, m0, q3333 pmaddwd m0, m14, m3 pmaddwd m3, m15 paddd m0, m2 paddd m1, m3 psrad m0, 11 ; x >> 3 psrad m1, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 ; (x + 8) >> 4 pminsw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movlps m0, [tlq+hq-10] lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .left_loop sub wd, 4 jz .end sub tld, r6d ; -h*2 sub r6, r5 ; tl-dst .right_loop0: add r5, 8 mov hd, tld movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ mov dstq, r5 .right_loop: pshufd m2, m0, q0000 pmaddwd m1, m8, m2 pmaddwd m2, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 pinsrw m0, [dstq+strideq*0-2], 5 paddd m1, m3 paddd m2, m4 pshufd m0, m0, q2222 movddup m4, [dstq+strideq*1-8] pmaddwd m3, m12, m0 pmaddwd m0, m13 paddd m1, m3 paddd m0, m2 pshuflw m2, m4, q3333 punpcklwd m2, m5 pmaddwd m3, m14, m2 pmaddwd m2, m15 paddd m1, m3 paddd m0, m2 psrad m1, 11 psrad m0, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 pminsw m0, m6 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 palignr m0, m4, 14 lea dstq, [dstq+strideq*2] add hd, 2*2 jl .right_loop sub wd, 4 jg .right_loop0 .end: RET %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, 
stride, tl, w, h, ac LEA t0, ipred_cfl_left_16bpc_ssse3_table movd m4, wd tzcnt wd, wd movifnidn hd, hm add tlq, 2 movsxd r6, [t0+wq*4] movd m5, wd jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm LEA t0, ipred_cfl_left_16bpc_ssse3_table tzcnt wd, wm lea r6d, [hq*2] movd m4, hd sub tlq, r6 tzcnt r6d, hd movd m5, r6d movsxd r6, [t0+r6*4] .start: movd m7, r7m movu m0, [tlq] add r6, t0 add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table movsxd wq, [t0+wq*4] pxor m6, m6 pshuflw m7, m7, q0000 pcmpeqw m3, m3 add wq, t0 movifnidn acq, acmp pavgw m4, m6 punpcklqdq m7, m7 jmp r6 .h32: movu m1, [tlq+48] movu m2, [tlq+32] paddw m0, m1 paddw m0, m2 .h16: movu m1, [tlq+16] paddw m0, m1 .h8: pshufd m1, m0, q1032 paddw m0, m1 .h4: pmaddwd m0, m3 psubd m4, m0 pshuflw m0, m4, q1032 paddd m0, m4 psrld m0, m5 pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq %macro IPRED_CFL 2 ; dst, src pabsw m%1, m%2 pmulhrsw m%1, m2 psignw m%2, m1 psignw m%1, m%2 paddw m%1, m0 pmaxsw m%1, m6 pminsw m%1, m7 %endmacro cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm tzcnt r6d, hd lea t0d, [wq+hq] movd m4, t0d tzcnt t0d, t0d movd m5, t0d LEA t0, ipred_cfl_16bpc_ssse3_table tzcnt wd, wd movd m7, r7m movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw m4, 1 pxor m6, m6 pshuflw m7, m7, q0000 add r6, t0 add wq, t0 movifnidn acq, acmp pcmpeqw m3, m3 punpcklqdq m7, m7 jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 cmp hd, 4 jg .w4_mul psrld m0, 3 jmp .w4_end .w4_mul: mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 16 cmove r6d, r2d movd m1, r6d psrld m0, 2 pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s4: movd m1, alpham lea r6, [strideq*3] pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 
.s4_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 movq [dstq+strideq*0], m3 movhps [dstq+strideq*1], m3 movq [dstq+strideq*2], m4 movhps [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4_loop RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+strideq*0], m3 mova [dstq+strideq*1], m4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s8_loop RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 add dstq, strideq dec hd jg .s16_loop RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m1, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 32 je .w32_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 8 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 
.w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 mova m4, [acq+16*2] mova m5, [acq+16*3] add acq, 16*4 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, strideq dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac tzcnt wd, wm LEA t0, ipred_cfl_splat_16bpc_ssse3_table mov r6d, r7m movifnidn hd, hm shr r6d, 11 movd m7, r7m movsxd wq, [t0+wq*4] movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] pshuflw m7, m7, q0000 pxor m6, m6 add wq, t0 movifnidn acq, acmp punpcklqdq m7, m7 jmp wq cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 paddw m5, m5 %else movddup m5, [pw_2] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] pmaddwd m2, m5, [ypxq+strideq*2] pmaddwd m3, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m0, m1 paddd m2, m3 paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq], m0 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc punpckhqdq m0, m0 pslld m2, 2 .w4_hpad: mova [acq+16*0], m0 paddd m4, m2 mova [acq+16*1], m0 add acq, 16*2 sub hpadd, 4 jg .w4_hpad jmp .dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*1+16*0] pmaddwd m1, m5, [ypxq+strideq*0+16*1] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m0, m2 paddd m1, m3 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc pslld m2, 2 mova m1, m0 jmp .hpad .w8_wpad1: 
pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 pshufd m1, m0, q3333 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m6, m5, [ypxq+strideq*1+16*0] paddd m0, m6 cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+strideq*0+16*1] pmaddwd m6, m5, [ypxq+strideq*1+16*1] paddd m3, m6 je .w16_wpad2 pmaddwd m1, m5, [ypxq+strideq*0+16*2] pmaddwd m6, m5, [ypxq+strideq*1+16*2] paddd m1, m6 jp .w16_wpad1 pmaddwd m2, m5, [ypxq+strideq*0+16*3] pmaddwd m6, m5, [ypxq+strideq*1+16*3] paddd m2, m6 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] paddd m6, m0, m3 packssdw m0, m3 paddd m6, m1 mova [acq+16*0], m0 packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz .dc paddd m2, m2 .hpad: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m0 mova [acq+16*3], m1 add acq, 16*4 sub hpadd, 4 jg .hpad .dc: sub r5, acq ; -w*h*2 pshufd m2, m4, q1032 tzcnt r1d, r5d paddd m2, m4 sub r1d, 2 pshufd m4, m2, q2301 movd m0, r1d paddd m2, m4 psrld m2, m0 pxor m0, m0 pavgw m2, m0 packssdw m2, m2 .dc_loop: mova m0, [acq+r5+16*0] mova m1, [acq+r5+16*1] psubw m0, m2 psubw m1, m2 mova [acq+r5+16*0], m0 mova [acq+r5+16*1], m1 add r5, 16*2 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 psllw m5, 2 %else movddup m5, [pw_4] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m3, m5, 
[ypxq+strideq*1] pmaddwd m1, m5, [ypxq+strideq*2] pmaddwd m2, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m4, m0 packssdw m0, m3 paddd m3, m1 packssdw m1, m2 paddd m4, m2 paddd m4, m3 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 pslld m2, 3 mova [acq+16*0], m1 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m1 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*0+16*1] pmaddwd m1, m5, [ypxq+strideq*1+16*0] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq+16*0], m0 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w8_wpad1: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] pshufd m2, m0, q3333 pshufd m3, m1, q3333 paddd m4, m0 packssdw m0, m2 paddd m4, m2 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+16*0] cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+16*1] je .w16_wpad2 pmaddwd m1, m5, [ypxq+16*2] jp .w16_wpad1 pmaddwd m2, m5, [ypxq+16*3] .w16_wpad_end: add ypxq, strideq paddd m6, m0, m3 packssdw m0, m3 mova [acq+16*0], m0 paddd m6, m1 
packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h %define base r6-ipred_cfl_ac_444_16bpc_ssse3_table LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table tzcnt wd, wm movifnidn hpadd, hpadm pxor m4, m4 movsxd wq, [r6+wq*4] movddup m5, [base+pw_1] add wq, r6 mov hd, hm shl hpadd, 2 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq m0, [ypxq+strideq*0] movhps m0, [ypxq+strideq*1] movq m1, [ypxq+strideq*2] movhps m1, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 mova [acq+16*0], m1 pslld m2, 2 mova [acq+16*1], m1 punpckhqdq m2, m2 mova [acq+16*2], m1 paddd m4, m2 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: mov r5, acq .w8_loop: mova m0, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w16_wpad2: pshufhw m3, m2, q3333 pshufhw m1, m0, q3333 punpckhqdq m3, m3 punpckhqdq m1, m1 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0+16*0] mova m0, [ypxq+strideq*1+16*0] psllw m2, 3 psllw m0, 3 test wpadd, wpadd jnz .w16_wpad2 mova m3, [ypxq+strideq*0+16*1] mova m1, 
[ypxq+strideq*1+16*1] psllw m3, 3 psllw m1, 3 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] mova [acq+16*0], m2 pmaddwd m2, m5 mova [acq+16*1], m3 pmaddwd m3, m5 paddd m4, m2 pmaddwd m2, m5, m0 mova [acq+16*2], m0 paddd m4, m3 pmaddwd m3, m5, m1 mova [acq+16*3], m1 add acq, 16*4 paddd m2, m3 paddd m4, m2 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w32_wpad6: pshufhw m1, m0, q3333 punpckhqdq m1, m1 mova m2, m1 mova m3, m1 jmp .w32_wpad_end .w32_wpad4: pshufhw m2, m1, q3333 punpckhqdq m2, m2 mova m3, m2 jmp .w32_wpad_end .w32_wpad2: pshufhw m3, m2, q3333 punpckhqdq m3, m3 jmp .w32_wpad_end .w32: movifnidn wpadd, wpadm mov r5, acq WIN64_SPILL_XMM 8 .w32_loop: mova m0, [ypxq+16*0] psllw m0, 3 cmp wpadd, 4 jg .w32_wpad6 mova m1, [ypxq+16*1] psllw m1, 3 je .w32_wpad4 mova m2, [ypxq+16*2] psllw m2, 3 jnp .w32_wpad2 mova m3, [ypxq+16*3] psllw m3, 3 .w32_wpad_end: add ypxq, strideq pmaddwd m6, m5, m0 mova [acq+16*0], m0 pmaddwd m7, m5, m1 mova [acq+16*1], m1 paddd m6, m7 pmaddwd m7, m5, m2 mova [acq+16*2], m2 paddd m6, m7 pmaddwd m7, m5, m3 mova [acq+16*3], m3 add acq, 16*4 paddd m6, m7 paddd m4, m6 dec hd jg .w32_loop %if WIN64 mova m5, m6 WIN64_RESTORE_XMM SWAP 5, 6 %endif test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w32_hpad_loop: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m6 mova [acq+16*2], m2 mova [acq+16*3], m3 add acq, 16*4 dec hpadd jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 %define hd r2d %endif mova m4, [palq] LEA r2, pal_pred_16bpc_ssse3_table tzcnt wd, wm pshufb m4, [base+pal_pred_shuf] movsxd wq, [r2+wq*4] pshufd m5, m4, q1032 add wq, r2 movifnidn hd, hm jmp wq .w4: movq m0, [idxq] add idxq, 8 psrlw m1, m0, 4 punpcklbw m0, m1 pshufb 
m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4 RET .w8: movu m3, [idxq] add idxq, 16 psrlw m1, m3, 4 punpcklbw m0, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] pshufb m1, m4, m3 pshufb m2, m5, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w8 RET .w16: movu m3, [idxq] add idxq, 16 psrlw m1, m3, 4 punpcklbw m0, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+ 0], m0 movu [dstq+16], m1 pshufb m1, m4, m3 pshufb m2, m5, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+strideq+ 0], m0 movu [dstq+strideq+16], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16 RET .w32: movu m3, [idxq] add idxq, 16 psrlw m1, m3, 4 punpcklbw m0, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+16*0], m0 movu [dstq+16*1], m1 pshufb m1, m4, m3 pshufb m2, m5, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movu [dstq+16*2], m0 movu [dstq+16*3], m1 add dstq, strideq dec hd jg .w32 RET .w64: movu m3, [idxq+16*0] psrlw m1, m3, 4 punpcklbw m0, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 pshufb m1, m4, m3 pshufb m2, m5, m3 movu m3, [idxq+16*1] add idxq, 32 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*2], m0 mova [dstq+16*3], m1 psrlw m1, m3, 4 punpcklbw m0, m3, m1 punpckhbw m3, m1 pshufb m1, m4, m0 pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*4], m0 mova [dstq+16*5], m1 pshufb m1, m4, m3 pshufb m2, m5, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova 
[dstq+16*6], m0 mova [dstq+16*7], m1 add dstq, strideq dec hd jg .w64 RET dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred_avx2.asm000066400000000000000000006770761517466257200240540ustar00rootroot00000000000000; Copyright © 2018-2021, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 pb_4x_1_m1_4x_3_m3_4x_2_m2_4x_4_m4: times 4 db 1, -1 times 4 db 3, -3 times 4 db 2, -2 times 4 db 4, -4 pb_8x_1_m1_8x_2_m2: times 8 db 1, -1 times 8 db 2, -2 pb_8x32_4x56_8_4x48_16_4x60_4: times 4 db 32, 32 times 4 db 56, 8 times 4 db 48, 16 times 4 db 60, 4 pb_4x62_2_4x64_0_4x63_1_4x64_0: times 4 db 62, 2 times 4 db 64, 0 times 4 db 63, 1 times 4 db 64, 0 pb_8x32_4x62_2_4x56_8_4x64_0: times 4 db 32, 32 times 4 db 62, 2 times 4 db 56, 8 times 4 db 64, 0 pb_4x32_4x3_4x32_4x1_4x16_4x2_4x16_4x0: times 4 db 32 times 4 db 3 times 4 db 32 times 4 db 1 times 4 db 16 times 4 db 2 times 4 db 16 times 4 db 0 pb_4x7_4x5_4x3_4x1_4x6_4x4_4x2_4x0: times 4 db 7 times 4 db 5 times 4 db 3 times 4 db 1 times 4 db 6 times 4 db 4 times 4 db 2 times 4 db 0 pb_63_1_to_56_8_and_47_17_to_40_24: db 63, 1, 62, 2, 61, 3, 60, 4, 59, 5, 58, 6, 57, 7, 56, 8 db 47, 17, 46, 18, 45, 19, 44, 20, 43, 21, 42, 22, 41, 23, 40, 24 pb_31_1_to_24_8_and_15_17_to_8_24: db 31, 1, 30, 2, 29, 3, 28, 4, 27, 5, 26, 6, 25, 7, 24, 8 db 15, 17, 14, 18, 13, 19, 12, 20, 11, 21, 10, 22, 9, 23, 8, 24 pb_4x32_2x48_16_2x56_2x8_2x60_4_8x64_0: pb_4x32_2x48_16_2x56_2x8_2x60_4: times 4 db 32 times 2 db 48, 16 times 2 db 56, 8 times 2 db 60, 4 times 8 db 64, 0 pb_2x62_2_2x63_1_12x64_0: pb_2x62_2_2x63_1_4x64_0: times 2 db 62, 2 times 2 db 63, 1 times 12 db 64, 0 pb_32_32_48_16_56_8_60_4_62_2_63_1_10x64_0: pb_32_32_48_16_56_8_60_4_62_2_63_1_2x64_0: pb_32_32_48_16_56_8_60_4: db 32, 32, 48, 16, 56, 8, 60, 4, 62, 2, 63, 1 times 10 db 64, 0 pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 z_filter_k: 
db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 const \ z_filter_s, db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line pb_128: times 4 db 128 ; those are just placed here for alignment. pb_36_m4: times 2 db 36, -4 z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 pb_127_m127: times 2 db 127, 
-127 ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 pw_64: times 2 dw 64 cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 times 9 db 7, -1 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; w=8, w_pad=1 as well as second half of previous one cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 times 5 db 6, 7 ; w=16,w_pad=2 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 times 8 db 14, 15 ; w=16,w_pad=3 db 0, 1, 2, 3, 4, 5 times 13 db 6, 7 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 dc_mul: dw 0x7880, 0x71c0, 0x6680, 0x5540, 0x4000, 0x5540, 0x6680, 0x71c0, 0x7880 pb_32_32_56_8_62_2_5x64_0: pb_32_32_56_8_62_2_64_0: db 32, 32, 56, 8, 62, 2 times 5 db 64, 0 pb_15_1_to_8_8: db 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8, 8 pb_7_1_to_0_8: db 7, 1, 6, 2, 5, 3, 4, 4, 3, 5, 2, 6, 1, 7, 0, 8 pb_3_1_to_0_4: db 3, 1, 2, 2, 1, 3, 0, 4 %define pb_0to15 cfl_ac_w16_pad_shuffle %define pb_1 (ipred_h_shuf+12) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+ 4) %define pb_4 (ipred_h_shuf+24) %define pb_5 (ipred_h_shuf+ 8) %define pb_7 (ipred_h_shuf+ 0) %define pb_8 (z_upsample2 +12) %define pb_12 (z2_y_shuf_h4+20) %define pb_14 (z2_y_shuf_h4+ 4) %define pb_15 (z_filter_s +32) %define pb_27 (z2_y_shuf_h4+ 8) %define pb_31 (z2_y_shuf_h4+12) %define pb_32 (z2_y_shuf_h4+16) %define pb_90 (z2_y_shuf_h4+ 0) %define pw_1 (z2_y_shuf_h4+24) %define pw_8 (z_filter_k +32) pb_64: times 4 db 64 pb_252: times 4 db 252 pb_254: times 4 db 254 pb_1_m1: times 2 db 1, -1 pb_2_m2: times 2 db 2, -2 pb_4_m4: times 2 db 4, -4 pb_8_m8: times 2 db 8, -8 pw_62: times 2 dw 62 pw_128: times 2 dw 128 pw_255: times 2 dw 255 pw_256: times 2 dw 256 pw_512: times 2 dw 512 pw_1024: times 2 dw 1024 pw_2048: times 2 
dw 2048 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pd_1: dd 1 pd_2: dd 2 %macro IBP_WEIGHT_TABLE 1-* %rep %0 db %1, 128-%1 %rotate 1 %endrep %endmacro ibp_weights: IBP_WEIGHT_TABLE \ 96, \ 86, 107, \ 77, 90, 102, 115, \ 71, 78, 86, 92, 100, 107, 114, 121, \ 68, 72, 76, 79, 83, 87, 90, 94, \ 98, 102, 106, 109, 113, 117, 121, 124 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4, s8, s16, s32, s64, s4i, s8i, s16i, s32i, s64i JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64, w4i, w8i, w16i, w32i, w64i JMP_TABLE ipred_dc_top, avx2, w4i, w8i, w16i, w32i, w64i JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64, w4m, w8m, w16m, w32m, w64m JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 8 %endif INIT_YMM avx2 cglobal ipred_dc_top_8bpc, 3, 8, 6, dst, stride, tl, w, h, stride3 tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm 
mov r6d, 0x8000 shrx r6d, r6d, wd movd xm3, r6d lea t0, [ipred_dc_left_avx2_table] movsxd r6, [t0+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 %if UNIX64 test r5w, 0x1000 ; ibp %else test word r5m, 0x1000 ; ibp %endif jnz .ibp movsxd wq, [t0+ipred_dc_avx2_table-ipred_dc_left_avx2_table+wq*4+10*4] lea wq, [wq+t0+ipred_dc_avx2_table-ipred_dc_left_avx2_table] jmp r6 .ibp: vpbroadcastd m4, [pw_256] movsxd wq, [t0+ipred_dc_top_avx2_table-ipred_dc_left_avx2_table+wq*4] lea wq, [wq+t0+ipred_dc_top_avx2_table-ipred_dc_left_avx2_table] jmp r6 .w4i: vpbroadcastd xm1, [tlq] punpcklbw xm1, xm0, xm1 cmp hd, 8 ; note that 4x4 w/ ibp is not allowed... jg .w4x16i ; fall-through for .w4x8i movd xm2, [ibp_weights+2] punpcklwd xm2, xm2 punpckldq xm2, xm2 pmaddubsw xm1, xm2 pmulhrsw xm1, xm4 packuswb xm1, xm1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s4 .w4x16i: mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .w4x16i_loop: movq xm2, [r3+r6*2] punpcklwd xm2, xm2 punpckhdq xm3, xm2, xm2 punpckldq xm2, xm2 REPX {pmaddubsw x, xm1, x}, xm2, xm3 REPX {pmulhrsw x, xm4}, xm2, xm3 packuswb xm2, xm3 movd [dstq+strideq*0], xm2 pextrd [dstq+strideq*1], xm2, 1 pextrd [dstq+strideq*2], xm2, 2 pextrd [dstq+stride3q ], xm2, 3 lea dstq, [dstq+strideq*4] add r6, 4 jl .w4x16i_loop jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s4 .w8i: movq xm1, [tlq] punpcklbw xm1, xm0, xm1 cmp hd, 8 jg .w8x16i je .w8x8i ; fall-through for .w8x4i vpbroadcastw xm2, [ibp_weights] pmaddubsw xm1, xm2 pmulhrsw xm1, xm4 packuswb xm1, xm1 movq [dstq+strideq*0], xm1 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 RET .w8x8i: vpbroadcastw xm2, [ibp_weights+2] vpbroadcastw xm3, [ibp_weights+4] REPX {pmaddubsw x, xm1, x}, xm2, xm3 REPX {pmulhrsw x, xm4}, xm2, xm3 packuswb xm2, xm3 movq 
[dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s8 .w8x16i: WIN64_SPILL_XMM 7 mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .w8x16i_loop: vpbroadcastw xm2, [r3+r6*2+0] vpbroadcastw xm3, [r3+r6*2+2] vpbroadcastw xm5, [r3+r6*2+4] vpbroadcastw xm6, [r3+r6*2+6] REPX {pmaddubsw x, xm1, x}, xm2, xm3, xm5, xm6 REPX {pmulhrsw x, xm4}, xm2, xm3, xm5, xm6 packuswb xm2, xm3 packuswb xm5, xm6 movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 movq [dstq+strideq*2], xm5 movhps [dstq+stride3q ], xm5 lea dstq, [dstq+strideq*4] add r6, 4 jl .w8x16i_loop WIN64_RESTORE_XMM jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s8 .w16i: vbroadcasti128 m1, [tlq] punpckhbw m2, m0, m1 punpcklbw m1, m0, m1 cmp hd, 8 jg .w16x16i je .w16x8i ; fall-through for .w16x4i vpbroadcastw xm3, [ibp_weights] REPX {pmaddubsw x, xm3}, xm1, xm2 REPX {pmulhrsw x, xm4}, xm1, xm2 packuswb xm1, xm2 movu [dstq+strideq*0], xm1 movu [dstq+strideq*1], xm0 movu [dstq+strideq*2], xm0 movu [dstq+stride3q ], xm0 RET .w16x8i: vpbroadcastw xm3, [ibp_weights+2] vpbroadcastw m5, [ibp_weights+4] vpblendd m3, m5, 11110000b REPX {pmaddubsw x, m3}, m1, m2 REPX {pmulhrsw x, m4}, m1, m2 packuswb m1, m2 movu [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 movu [dstq+strideq*2], xm0 movu [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s16 .w16x16i: WIN64_SPILL_XMM 8 mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .w16x16i_loop: vpbroadcastw xm3, [r3+r6*2+0] vpbroadcastw m5, [r3+r6*2+2] vpbroadcastw xm6, [r3+r6*2+4] vpbroadcastw m7, [r3+r6*2+6] vpblendd m3, m5, 11110000b vpblendd m6, m7, 11110000b pmaddubsw m5, m2, m3 pmaddubsw m3, m1, m3 pmaddubsw m7, m2, m6 pmaddubsw m6, m1, m6 REPX {pmulhrsw x, m4}, m5, m3, m7, m6 packuswb m3, m5 packuswb 
m6, m7 movu [dstq+strideq*0], xm3 vextracti128 [dstq+strideq*1], m3, 1 movu [dstq+strideq*2], xm6 vextracti128 [dstq+stride3q ], m6, 1 lea dstq, [dstq+strideq*4] add r6, 4 jl .w16x16i_loop WIN64_RESTORE_XMM jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s16 .w32i: movu m1, [tlq] punpckhbw m2, m0, m1 punpcklbw m1, m0, m1 cmp hd, 4 jg .w32x8i ; fall-through for .w32x4i vpbroadcastw m3, [ibp_weights] REPX {pmaddubsw x, m3}, m1, m2 REPX {pmulhrsw x, m4}, m1, m2 packuswb m1, m2 movu [dstq+strideq*0], m1 movu [dstq+strideq*1], m0 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m0 RET .w32x8i: WIN64_SPILL_XMM 8 mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .w32x8i_loop: vpbroadcastw m3, [r3+r6*2+0] vpbroadcastw m6, [r3+r6*2+2] pmaddubsw m5, m2, m3 pmaddubsw m3, m1, m3 pmaddubsw m7, m2, m6 pmaddubsw m6, m1, m6 REPX {pmulhrsw x, m4}, m5, m3, m7, m6 packuswb m3, m5 packuswb m6, m7 movu [dstq+strideq*0], m3 movu [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] add r6, 2 jl .w32x8i_loop test hd, 2 WIN64_RESTORE_XMM jz mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s32 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s32 .w64i: WIN64_SPILL_XMM 11 movu m2, [tlq] movu m5, [tlq+32] punpckhbw m3, m0, m2 punpcklbw m2, m0, m2 punpckhbw m6, m1, m5 punpcklbw m5, m1, m5 mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .w64i_loop: vpbroadcastw m7, [r3+r6*2] pmaddubsw m8, m2, m7 pmaddubsw m9, m3, m7 pmaddubsw m10, m5, m7 pmaddubsw m7, m6, m7 REPX {pmulhrsw x, m4}, m8, m9, m10, m7 packuswb m8, m9 packuswb m10, m7 mova [dstq+ 0], m8 mova [dstq+32], m10 add dstq, strideq inc r6 jl .w64i_loop WIN64_RESTORE_XMM test hd, 3 jz mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s64 .w64i_dcloop: mova [dstq+ 0], m0 mova [dstq+32], m1 add dstq, strideq dec hd jz .w64i_ret test hd, 3 jnz .w64i_dcloop jmp mangle(private_prefix %+ 
_ipred_dc_8bpc_avx2).s64
.w64i_ret:
    RET

; ipred_dc_left_8bpc: DC prediction from the left edge only.
; Sums the h bytes stored directly below tl (tl[-h] .. tl[-1]), divides by h
; with rounding, broadcasts the result to every byte of m0/m1 and then jumps
; to a width-specific routine taken from a jump table.
; Bit 0x1000 of the sixth argument ("ibp") selects a different set of
; width-specific entry points (table slots 5+log2(w), presumably the .w*i
; labels below) instead of the store routines listed in ipred_dc_avx2_table.
cglobal ipred_dc_left_8bpc, 3, 8, 6, dst, stride, tl, w, h, stride3
    mov hd, hm ; zero upper half
    tzcnt r6d, hd                       ; r6 = log2(h)
    sub tlq, hq                         ; tl now points at the first left pixel
    tzcnt wd, wm                        ; w = log2(w)
    movu m0, [tlq]                      ; 32 edge bytes; the .h* entry decides how many are summed
    mov t0d, 0x8000
    shrx t0d, t0d, r6d                  ; 0x8000 >> log2(h)
    movd xm3, t0d                       ; pmulhrsw by this = rounded division by h
    lea t0, [ipred_dc_left_avx2_table]
    movsxd r6, [t0+r6*4]                ; table slot log2(h): reduction entry point
    pcmpeqd m2, m2                      ; all ones, i.e. -1 in every byte/word
    pmaddubsw m0, m2                    ; negated sums of adjacent byte pairs
    add r6, t0
    ; r5 is named stride3 but still holds the sixth incoming argument here;
    ; it is only overwritten by the lea stride3q at .h4.
%if UNIX64
    test r5w, 0x1000 ; ibp
%else
    test word r5m, 0x1000 ; ibp
%endif
    jnz .ibp
    ; non-ibp: continuation comes from ipred_dc_avx2_table, slot 10+log2(w)
    movsxd wq, [t0+ipred_dc_avx2_table-ipred_dc_left_avx2_table+wq*4+10*4]
    lea wq, [wq+t0+ipred_dc_avx2_table-ipred_dc_left_avx2_table]
    jmp r6
.ibp:
    ; ibp: continuation comes from this function's own table, slot 5+log2(w)
    movsxd wq, [t0+wq*4+5*4]
    add wq, t0
    vpbroadcastd m5, [t0+pw_256-ipred_dc_left_avx2_table] ; pmulhrsw operand used by the shared *_leftonly loops
    jmp r6
    ; Horizontal reduction ladder: each label folds the upper half of the
    ; partial sums into the lower half and falls through to the next one.
.h64:
    movu m1, [tlq+32] ; unaligned when jumping here from dc_top
    pmaddubsw m1, m2
    paddw m0, m1
.h32:
    vextracti128 xm1, m0, 1
    paddw xm0, xm1
.h16:
    punpckhqdq xm1, xm0, xm0
    paddw xm0, xm1
.h8:
    psrlq xm1, xm0, 32
    paddw xm0, xm1
.h4:
    pmaddwd xm0, xm2                    ; adds the last word pair; the second *-1 restores the sign
    pmulhrsw xm0, xm3                   ; (sum + h/2) / h
    lea stride3q, [strideq*3]
    vpbroadcastb m0, xm0                ; dc value in every byte
    mova m1, m0                         ; second register for 64-pixel-wide stores
    jmp wq
    ; ibp entry points: load the per-width constants and continue in the
    ; left-only loops that live inside ipred_dc_8bpc.
.w4i:
    pslld xm3, xm2, 8                   ; xm2 is all ones -> 0xFFFFFF00 per dword (blend mask)
    vpbroadcastw xm2, [ibp_weights]
    jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s4i_leftonly_loop
.w8i:
    vpbroadcastd xm2, [ibp_weights+2]
    jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s8i_leftonly_loop
.w16i:
    vpbroadcastq xm3, [ibp_weights+6]
    jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s16i_leftonly_loop
.w32i:
    WIN64_SPILL_XMM 9                   ; the 32-wide left-only loop uses registers above m5
    vbroadcasti128 m3, [ibp_weights+14]
    jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s32i_leftonly_loop
    RESET_STACK_STATE                   ; NOTE(review): presumably undoes the spill bookkeeping for .w64i - confirm in x86inc
.w64i:
    movu m3, [ibp_weights+30]
    jmp mangle(private_prefix %+ _ipred_dc_8bpc_avx2).s64i_leftonly_loop

; ipred_dc_8bpc: DC prediction; the setup below derives log2(w), log2(h) and a
; multiplier from dc_mul indexed by log2(w) - log2(h), then dispatches through
; ipred_dc_avx2_table.
cglobal ipred_dc_8bpc, 3, 8, 6, dst, stride, tl, w, h, stride3
    mov hd, hm ; zero upper half
    tzcnt wd, wm ; log2(w)
    tzcnt r6d, hd ; log2(h)
    mov t0d, wd
    mov r7d, wd
    cmp wd, r6d
    cmovb t0d, r6d ; max(log2(w), log2(h))
    sub r7, r6 ; log2(w) - log2(h)
    movd xm3, t0d
    lea t0, [ipred_dc_avx2_table]
    movd xm4, [t0+dc_mul-ipred_dc_avx2_table+r7*2+4*2]
    psrlw xm4, xm3                      ; multiplier >> max(log2(w), log2(h))
    movsxd r6, [t0+r6*4]                ; table slot log2(h)
    lea r7, [t0+5*4]                    ; table base advanced by 5 slots, used when ibp is set
%if UNIX64
    test
r5w, 0x1000 ; ibp %else test word r5m, 0x1000 ; ibp %endif cmovz r7, t0 movsxd r7, [r7+wq*4+10*4] movsxd wq, [t0+wq*4+5*4] pcmpeqd m3, m3 vpbroadcastd m5, [pw_256] add r6, t0 add wq, t0 add r7, t0 jmp r6 .h4: movd xm0, [tlq-4] pmaddubsw xm0, xm3 jmp wq .w4: movd xm1, [tlq+1] pmaddubsw xm1, xm3 ; fall-through .dcgen: paddw m0, m1 vextracti128 xm1, m0, 1 paddw xm0, xm1 pmaddwd xm0, xm3 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmulhrsw xm0, xm4 vpbroadcastb m0, xm0 mova m1, m0 ; in case we jump to .s64 lea stride3q, [strideq*3] jmp r7 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .s4i: vpbroadcastd xm1, [tlq+1] punpcklbw xm1, xm0, xm1 pslld xm3, 8 sub tlq, hq vpbroadcastw xm2, [ibp_weights] cmp hd, 8 ; note that 4x4 w/ ibp is not allowed... jg .s4x16i ; fall-through for .s4x8i - top filter movd xm4, [ibp_weights+2] punpcklwd xm4, xm4 punpckldq xm4, xm4 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 vpblendd xm1, xm0, 1100b ; left filter movd xm4, [tlq+hq-4] punpcklbw xm4, xm0, xm4 ; dc, left[y] [4x] pmaddubsw xm4, xm2 pmulhrsw xm4, xm5 ; blend [4x], _ [4x] punpcklwd xm4, xm4 pshufd xm4, xm4, q0123 vpblendvb xm4, xm1, xm3 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 pextrd [dstq+strideq*2], xm4, 2 pextrd [dstq+stride3q ], xm4, 3 sub hq, 4 lea dstq, [dstq+strideq*4] .s4i_leftonly_loop: movd xm4, [tlq+hq-4] punpcklbw xm4, xm0, xm4 ; dc, left[y] [4x] pmaddubsw xm4, xm2 pmulhrsw xm4, xm5 ; blend [4x], _ [4x] punpcklwd xm4, xm4 vpblendvb xm4, xm0, xm3 pextrd [dstq+strideq*0], xm4, 3 pextrd [dstq+strideq*1], xm4, 2 pextrd [dstq+strideq*2], xm4, 1 movd [dstq+stride3q ], xm4 lea dstq, [dstq+strideq*4] sub hq, 4 jg .s4i_leftonly_loop RET .s4x16i: WIN64_SPILL_XMM 7 lea r7, [tlq+hq-4] mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .s4x16i_loop: ; top filter movq xm4, [r3+r6*2] 
punpcklwd xm4, xm4 punpckhdq xm6, xm4, xm4 punpckldq xm4, xm4 REPX {pmaddubsw x, xm1, x}, xm4, xm6 REPX {pmulhrsw x, xm5}, xm4, xm6 packuswb xm4, xm6 ; left filter movd xm6, [r7] punpcklbw xm6, xm0, xm6 ; dc, left[y] [4x] pmaddubsw xm6, xm2 pmulhrsw xm6, xm5 ; blend [4x], _ [4x] punpcklwd xm6, xm6 pshufd xm6, xm6, q0123 vpblendvb xm6, xm4, xm3 movd [dstq+strideq*0], xm6 pextrd [dstq+strideq*1], xm6, 1 pextrd [dstq+strideq*2], xm6, 2 pextrd [dstq+stride3q ], xm6, 3 lea dstq, [dstq+strideq*4] sub r7, 4 add r6, 4 jl .s4x16i_loop WIN64_RESTORE_XMM jmp .s4i_leftonly_loop ALIGN function_align .h8: movq xm0, [tlq-8] pmaddubsw xm0, xm3 jmp wq .w8: movq xm1, [tlq+1] pmaddubsw xm1, xm3 jmp .dcgen .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .s8i: vpbroadcastq xm1, [tlq+1] punpcklbw xm1, xm0, xm1 sub tlq, hq vpbroadcastd xm2, [ibp_weights+2] cmp hd, 8 je .s8x8i jg .s8x16i ; fall-through for .s8x4i ; top filter only for first line vpbroadcastw xm3, [ibp_weights] pmaddubsw xm1, xm3 pmulhrsw xm1, xm5 packuswb xm1, xm1 movq [dstq+strideq*0], xm1 ; left filter only for remaining lines movd xm1, [tlq+hq-4] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 pmaddubsw xm1, xm2 pmulhrsw xm1, xm5 ; blendA, blendB [4x] packuswb xm1, xm1 punpcklwd xm1, xm0 punpckhdq xm2, xm1, xm0 punpckldq xm1, xm0 movq [dstq+strideq*1], xm2 movhps [dstq+strideq*2], xm1 movq [dstq+stride3q ], xm1 RET .s8x8i: ; top filter only for first two lines vpbroadcastw xm3, [ibp_weights+2] vpbroadcastw xm4, [ibp_weights+4] REPX {pmaddubsw x, xm1, x}, xm3, xm4 REPX {pmulhrsw x, xm5}, xm3, xm4 packuswb xm3, xm4 movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 ; left filter for other two lines movd xm1, [tlq+hq-4] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 pmaddubsw xm1, xm2 pmulhrsw xm1, xm5 ; blendA, blendB [4x] packuswb xm1, xm1 punpcklwd xm1, xm0 punpckldq xm1, 
xm0 movhps [dstq+strideq*2], xm1 movq [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hq, 4 .s8i_leftonly_loop: movd xm1, [tlq+hq-4] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 pmaddubsw xm1, xm2 pmulhrsw xm1, xm5 ; blendA, blendB [4x] packuswb xm1, xm1 punpcklwd xm1, xm0 punpckhdq xm3, xm1, xm0 punpckldq xm1, xm0 movhps [dstq+strideq*0], xm3 movq [dstq+strideq*1], xm3 movhps [dstq+strideq*2], xm1 movq [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hq, 4 jg .s8i_leftonly_loop RET .s8x16i: WIN64_SPILL_XMM 8 lea r7, [tlq+hq-4] mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .s8x16i_loop: ; top filter vpbroadcastw xm3, [r3+r6*2+0] vpbroadcastw xm4, [r3+r6*2+2] vpbroadcastw xm6, [r3+r6*2+4] vpbroadcastw xm7, [r3+r6*2+6] REPX {pmaddubsw x, xm1, x}, xm3, xm4, xm6, xm7 REPX {pmulhrsw x, xm5}, xm3, xm4, xm6, xm7 packuswb xm3, xm4 packuswb xm6, xm7 ; left filter movd xm4, [r7] punpcklbw xm4, xm0, xm4 ; dc, left[y] [4x] punpcklwd xm4, xm4 pmaddubsw xm4, xm2 pmulhrsw xm4, xm5 ; blendA, blendB [4x] packuswb xm4, xm4 pshuflw xm4, xm4, q3321 vpblendw xm6, xm4, 00010001b psrlq xm4, 32 vpblendw xm3, xm4, 00010001b movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 movq [dstq+strideq*2], xm6 movhps [dstq+stride3q ], xm6 lea dstq, [dstq+strideq*4] sub r7, 4 add r6, 4 jl .s8x16i_loop WIN64_RESTORE_XMM jmp .s8i_leftonly_loop ALIGN function_align .h16: movu xm0, [tlq-16] pmaddubsw xm0, xm3 jmp wq .w16: movu xm1, [tlq+1] pmaddubsw xm1, xm3 jmp .dcgen .s16: movu [dstq+strideq*0], xm0 movu [dstq+strideq*1], xm0 movu [dstq+strideq*2], xm0 movu [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .s16i: vbroadcasti128 m1, [tlq+1] punpckhbw m2, m0, m1 punpcklbw m1, m0, m1 vpbroadcastq xm3, [ibp_weights+6] sub tlq, hq cmp hd, 4 jg .s16x8i ; fall-through for s16x4i ; top filter vpbroadcastw xm4, [ibp_weights] REPX {pmaddubsw x, xm4}, xm1, xm2 REPX {pmulhrsw x, xm5}, xm1, xm2 packuswb xm1, xm2 movu 
[dstq+strideq*0], xm1 ; left filter movd xm1, [tlq+hq-4] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 punpckhdq xm2, xm1, xm1 punpckldq xm1, xm1 REPX {pmaddubsw x, xm3}, xm2, xm1 REPX {pmulhrsw x, xm5}, xm2, xm1 packuswb xm1, xm2 vpblendd m1, m0, 11110000b vpermq m1, m1, q3120 pshufd m2, m1, q2221 pshufd m1, m1, q2220 vextracti128 [dstq+strideq*1], m1, 1 movu [dstq+strideq*2], xm2 movu [dstq+stride3q ], xm1 RET .s16x8i: lea r7, [tlq+hq-4] mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 cmp hd, 24 jge .s16x32i WIN64_SPILL_XMM 7 .s16x8i_toponly_loop: ; top filter vpbroadcastw xm4, [r3+r6*2+2] vpbroadcastw m6, [r3+r6*2+0] vpblendd m4, m6, 11110000b pmaddubsw m6, m2, m4 pmaddubsw m4, m1, m4 REPX {pmulhrsw x, m5}, m6, m4 packuswb m4, m6 vextracti128 [dstq+strideq*0], m4, 1 movu [dstq+strideq*1], xm4 lea dstq, [dstq+strideq*2] add r6, 2 jl .s16x8i_toponly_loop WIN64_RESTORE_XMM test hd, 2 jz .s16i_leftonly_loop ; left filter for 2px movd xm1, [tlq+hq-2] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 punpckldq xm1, xm1 pmaddubsw xm1, xm3 pmulhrsw xm1, xm5 packuswb xm1, xm1 vpblendd xm1, xm0, 1100b pshufd xm2, xm1, q2221 pshufd xm1, xm1, q2220 movu [dstq+strideq*0], xm2 movu [dstq+strideq*1], xm1 lea dstq, [dstq+strideq*2] sub hq, 2 .s16i_leftonly_loop: movd xm1, [tlq+hq-4] punpcklbw xm1, xm0, xm1 ; dc, left[y] [4x] punpcklwd xm1, xm1 punpckhdq xm2, xm1, xm1 punpckldq xm1, xm1 REPX {pmaddubsw x, xm3}, xm2, xm1 REPX {pmulhrsw x, xm5}, xm2, xm1 packuswb xm1, xm2 vpblendd m1, m0, 11110000b vpermq m1, m1, q3120 pshufd m2, m1, q2221 pshufd m1, m1, q2220 vextracti128 [dstq+strideq*0], m2, 1 vextracti128 [dstq+strideq*1], m1, 1 movu [dstq+strideq*2], xm2 movu [dstq+stride3q ], xm1 sub hq, 4 lea dstq, [dstq+strideq*4] jg .s16i_leftonly_loop RET .s16x32i: WIN64_SPILL_XMM 9 .s16x32i_topleft_loop: ; top filter vpbroadcastw xm4, [r3+r6*2+4] vpbroadcastw m7, [r3+r6*2+0] vpbroadcastw xm6, [r3+r6*2+6] vpbroadcastw m8, 
[r3+r6*2+2] vpblendd m4, m7, 11110000b vpblendd m6, m8, 11110000b pmaddubsw m7, m2, m4 pmaddubsw m4, m1, m4 pmaddubsw m8, m2, m6 pmaddubsw m6, m1, m6 REPX {pmulhrsw x, m5}, m7, m4, m8, m6 packuswb m4, m7 packuswb m6, m8 ; left filter movd xm7, [r7] punpcklbw xm7, xm0, xm7 ; dc, left[y] [4x] punpcklwd xm7, xm7 punpckhdq xm8, xm7, xm7 punpckldq xm7, xm7 REPX {pmaddubsw x, xm3}, xm8, xm7 REPX {pmulhrsw x, xm5}, xm8, xm7 packuswb xm7, xm8 vpermq m7, m7, q1100 psrlq m8, m7, 32 vpblendd m4, m8, 00010001b vpblendd m6, m7, 00010001b vextracti128 [dstq+strideq*0], m4, 1 vextracti128 [dstq+strideq*1], m6, 1 movu [dstq+strideq*2], xm4 movu [dstq+stride3q ], xm6 lea dstq, [dstq+strideq*4] sub r7, 4 add r6, 4 jl .s16x32i_topleft_loop WIN64_RESTORE_XMM jmp .s16i_leftonly_loop ALIGN function_align .h32: movu m0, [tlq-32] pmaddubsw m0, m3 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 jmp .dcgen .s32: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m0 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET .s32i: movu m1, [tlq+1] sub tlq, hq punpckhbw m2, m0, m1 punpcklbw m1, m0, m1 vbroadcasti128 m3, [ibp_weights+14] cmp hd, 4 jg .s32x8i WIN64_SPILL_XMM 8 ; top filter vpbroadcastw m4, [ibp_weights] REPX {pmaddubsw x, m4}, m1, m2 REPX {pmulhrsw x, m5}, m1, m2 packuswb m1, m2 movu [dstq+strideq*0], m1 ; left filter vpbroadcastb m4, [tlq+hq-2] vpbroadcastb xm6, [tlq+hq-3] vpbroadcastb m7, [tlq+hq-4] vpblendd m6, m7, 11110000b REPX {punpcklbw x, m0, x}, m4, m6 ; dc, left[y] [4x] REPX {pmaddubsw x, m3 }, m4, m6 REPX {pmulhrsw x, m5 }, m4, m6 packuswb m4, m6 punpckhqdq m6, m4, m0 punpcklqdq m4, m0 vpblendd m4, m0, 11111100b movu [dstq+strideq*1], m4 vpblendd m7, m6, m0, 11111100b vperm2i128 m6, m0, q0301 movu [dstq+strideq*2], m7 movu [dstq+stride3q ], m6 RET .s32x8i: WIN64_SPILL_XMM 9 lea r7, [tlq+hq-2] mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 cmp hd, 48 je .s32x64i_topleft_loop 
.s32x8i_toponly_loop: ; top filter vpbroadcastw m4, [r3+r6*2+0] vpbroadcastw m6, [r3+r6*2+2] pmaddubsw m7, m2, m4 pmaddubsw m4, m1, m4 pmaddubsw m8, m2, m6 pmaddubsw m6, m1, m6 REPX {pmulhrsw x, m5}, m7, m4, m8, m6 packuswb m4, m7 packuswb m6, m8 movu [dstq+strideq*0], m4 movu [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] add r6, 2 jl .s32x8i_toponly_loop test hd, 2 jz .s32i_leftonly_loop ; left filter vpbroadcastb xm4, [tlq+hq-1] vpbroadcastb m6, [tlq+hq-2] vpblendd m4, m6, 11110000b punpcklbw m4, m0, m4 ; dc, left[y] [4x] pmaddubsw m4, m3 pmulhrsw m4, m5 packuswb m4, m4 vextracti128 xm6, m4, 1 REPX {vpblendd x, m0, 11111100b}, m4, m6 movu [dstq+strideq*0], m4 movu [dstq+strideq*1], m6 sub hq, 2 lea dstq, [dstq+strideq*2] .s32i_leftonly_loop: vpbroadcastb xm4, [tlq+hq-1] vpbroadcastb m6, [tlq+hq-2] vpblendd m4, m6, 11110000b vpbroadcastb xm6, [tlq+hq-3] vpbroadcastb m7, [tlq+hq-4] vpblendd m6, m7, 11110000b REPX {punpcklbw x, m0, x}, m4, m6 ; dc, left[y] [4x] REPX {pmaddubsw x, m3 }, m4, m6 REPX {pmulhrsw x, m5 }, m4, m6 packuswb m4, m6 punpckhqdq m6, m4, m0 punpcklqdq m4, m0 vpblendd m7, m4, m0, 11111100b vperm2i128 m4, m0, q0301 movu [dstq+strideq*0], m7 movu [dstq+strideq*1], m4 vpblendd m7, m6, m0, 11111100b vperm2i128 m6, m0, q0301 movu [dstq+strideq*2], m7 movu [dstq+stride3q ], m6 sub hq, 4 lea dstq, [dstq+strideq*4] jg .s32i_leftonly_loop RET .s32x64i_topleft_loop: ; top filter vpbroadcastw m4, [r3+r6*2+0] vpbroadcastw m6, [r3+r6*2+2] pmaddubsw m7, m2, m4 pmaddubsw m4, m1, m4 pmaddubsw m8, m2, m6 pmaddubsw m6, m1, m6 REPX {pmulhrsw x, m5}, m7, m4, m8, m6 packuswb m4, m7 packuswb m6, m8 ; left filter vpbroadcastb xm7, [r7+1] vpbroadcastb m8, [r7] vpblendd m7, m8, 11110000b punpcklbw m7, m0, m7 ; dc, left[y] [4x] pmaddubsw m7, m3 pmulhrsw m7, m5 packuswb m7, m7 vextracti128 xm8, m7, 1 vpblendd m4, m7, 00000011b vpblendd m6, m8, 00000011b movu [dstq+strideq*0], m4 movu [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub r7, 2 add r6, 2 jl 
.s32x64i_topleft_loop jmp .s32i_leftonly_loop RESET_STACK_STATE ALIGN function_align .h64: movu m0, [tlq-64] movu m1, [tlq-32] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+33] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m1, m2 jmp .dcgen .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m0 mova [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET .s64i: WIN64_SPILL_XMM 10 movu m2, [tlq+1] movu m4, [tlq+33] sub tlq, hq punpcklbw m1, m0, m2 punpckhbw m2, m0, m2 punpcklbw m3, m0, m4 punpckhbw m4, m0, m4 mov r6d, hd lea r3, [ibp_weights] shr r6d, 2 lea r3, [r3+r6*4-2] sub hd, r6d neg r6 .s64i_toponly_loop: vpbroadcastw m9, [r3+r6*2] pmaddubsw m6, m1, m9 pmaddubsw m7, m2, m9 pmaddubsw m8, m3, m9 pmaddubsw m9, m4, m9 REPX {pmulhrsw x, m5}, m6, m7, m8, m9 packuswb m6, m7 packuswb m8, m9 mova [dstq+ 0], m6 mova [dstq+32], m8 add dstq, strideq inc r6 jl .s64i_toponly_loop WIN64_RESTORE_XMM movu m3, [ibp_weights+30] test hd, 1 jz .s64i_leftonly_loop vpbroadcastb m1, [tlq+hq-1] punpcklbw m1, m0, m1 ; dc, left[y] [4x] pmaddubsw m1, m3 pmulhrsw m1, m5 packuswb m1, m1 vpermq m1, m1, q3120 vpblendd m1, m0, 11110000b mova [dstq+strideq*0+ 0], m1 mova [dstq+strideq*0+32], m0 dec hq add dstq, strideq .s64i_leftonly_loop: vpbroadcastb m1, [tlq+hq-1] vpbroadcastb m2, [tlq+hq-2] REPX {punpcklbw x, m0, x}, m1, m2 ; dc, left[y] [4x] REPX {pmaddubsw x, m3 }, m1, m2 REPX {pmulhrsw x, m5 }, m1, m2 packuswb m1, m2 vpermq m1, m1, q3120 vpblendd m2, m1, m0, 11110000b vperm2i128 m1, m0, q0301 mova [dstq+strideq*0+ 0], m2 mova [dstq+strideq*0+32], m0 mova [dstq+strideq*1+ 0], m1 mova [dstq+strideq*1+32], m0 sub hq, 2 lea dstq, [dstq+strideq*2] jg .s64i_leftonly_loop RET cglobal ipred_dc_128_8bpc, 3, 8, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_avx2_table] 
    tzcnt wd, wm                        ; w = log2(w)
    movifnidn hd, hm
    movsxd wq, [r5+wq*4+10*4]           ; table slot 10+log2(w)
    vpbroadcastd m0, [r5-ipred_dc_avx2_table+pb_128] ; fill value (pb_128: presumably 128 in every byte)
    mova m1, m0                         ; second register for 64-pixel-wide stores
    add wq, r5
    lea stride3q, [strideq*3]
    jmp wq

; ipred_v_8bpc: vertical prediction.
; Loads 64 bytes starting at tl+1 into m0/m1 and jumps to the routine in
; ipred_dc_avx2_table slot 10+log2(w) (the same slot ipred_dc_128 uses).
; When bit 0x8000 of the sixth argument ("multi-mrl") is set, the loaded bytes
; are first averaged (pavgb) with the 64 bytes starting at tl + 2*(w+h) + 2.
cglobal ipred_v_8bpc, 3, 8, 6, dst, stride, tl, w, h, stride3
    lea r6, [ipred_dc_avx2_table]
    movu m0, [tlq+ 1]
    movu m1, [tlq+33]
    movifnidn wd, wm
    movifnidn hd, hm
    ; r5 is named stride3 but still carries the sixth incoming argument here
%if UNIX64
    test r5w, 0x8000 ; multi-mrl
%else
    test word r5m, 0x8000 ; multi-mrl
%endif
    jz .no_multi_mrl
    lea r5d, [wd+hd]                    ; flags no longer needed: r5 = w + h
    pavgb m0, [tlq+r5*2+2]
    pavgb m1, [tlq+r5*2+34]
.no_multi_mrl:
    tzcnt wd, wd                        ; w = log2(w)
    movsxd wq, [r6+wq*4+10*4]
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq

; IPRED_H w, store_type[, suffix]
; Emits the loop .w<w><suffix> used by ipred_h_8bpc. Each iteration consumes
; four bytes below tl (tl[-1] .. tl[-4]), broadcasts each one across a whole
; register and stores four rows with mov<store_type>; for w == 64 every row
; gets a second store at +32. With suffix "m" the four bytes are first
; averaged (pavgb) with the four bytes at tl + r5*2 - 3, where r5 is set up
; at .multi_mrl in ipred_h_8bpc.
%macro IPRED_H 2-3 ; w, store_type, multi-mrl-suffix
.w%1%3:
%ifidn %3, m
    movd xm0, [tlq-4]
    movd xm1, [tlq+r5*2-3]
    sub tlq, 4
    pavgb xm0, xm1
    ; byte 0 of xm0 is the farthest pixel (row 3); shift the others down in turn
    vpbroadcastb m3, xm0
    psrld xm0, 8
    vpbroadcastb m2, xm0
    psrld xm0, 8
    vpbroadcastb m1, xm0
    psrld xm0, 8
    vpbroadcastb m0, xm0
%else
    vpbroadcastb m0, [tlq-1]
    vpbroadcastb m1, [tlq-2]
    vpbroadcastb m2, [tlq-3]
    sub tlq, 4
    vpbroadcastb m3, [tlq+0]
%endif
    mov%2 [dstq+strideq*0], m0
%if %1 == 64
    mov%2 [dstq+strideq*0+32], m0
%endif
    mov%2 [dstq+strideq*1], m1
%if %1 == 64
    mov%2 [dstq+strideq*1+32], m1
%endif
    mov%2 [dstq+strideq*2], m2
%if %1 == 64
    mov%2 [dstq+strideq*2+32], m2
%endif
    mov%2 [dstq+stride3q ], m3
%if %1 == 64
    mov%2 [dstq+stride3q +32], m3
%endif
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w%1%3
    RET
ALIGN function_align
%endmacro

; ipred_h_8bpc: horizontal prediction.
; Dispatches on log2(w) through ipred_h_avx2_table: slots 0.. for the plain
; loops, slots 5.. when bit 0x8000 of the sixth argument ("multi-mrl") is set.
; Widths 4/8/16 are assembled under INIT_XMM, widths 32/64 under INIT_YMM.
INIT_XMM avx2
cglobal ipred_h_8bpc, 3, 7, 4, dst, stride, tl, w, h, _, stride3
    lea r6, [ipred_h_avx2_table]        ; r6 doubles as stride3, so the table base is consumed first
    movifnidn wd, wm
    movifnidn hd, hm
%if UNIX64
    test r5w, 0x8000 ; multi-mrl
%else
    test word r5m, 0x8000 ; multi-mrl
%endif
    jnz .multi_mrl
    tzcnt wd, wd                        ; w = log2(w)
    movsxd wq, [r6+wq*4]
    add wq, r6
    lea stride3q, [strideq*3]           ; overwrites the table base in r6
    jmp wq
    IPRED_H 4, d
    IPRED_H 8, q
    IPRED_H 16, u
INIT_YMM avx2
    IPRED_H 32, u
    IPRED_H 64, u
.multi_mrl:
    lea r5d, [hd+wd]                    ; r5 = w + h, read by the "m" loops
    tzcnt wd, wd
    movsxd wq, [r6+wq*4+5*4]
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq
INIT_XMM avx2
    IPRED_H 4, d, m
    IPRED_H 8, q, m
    IPRED_H 16, u, m
INIT_YMM avx2
    IPRED_H 32, u, m
    IPRED_H 64, u, m

%macro
PAETH 2 ; top, ldiff
    ; Inputs:  m%1 = top, m%2 = ldiff, m3 = left, m4 = pb_1, m5 = topleft
    ; Output:  m0 = per byte, left if ldiff <= tdiff else top, replaced by
    ;          topleft where neither ldiff nor tdiff is <= tldiff
    ; Clobbers m1, m2
    pavgb m1, m%1, m3 ; Calculating tldiff normally requires
    pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
    pand m0, m4 ; in 8-bit with some tricks which avoids
    psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
    psubb m1, m0
    psubusb m1, m5
    por m1, m2
    paddusb m1, m1
    por m1, m0 ; min(tldiff, 255)
    psubusb m2, m5, m3
    psubusb m0, m3, m5
    por m2, m0 ; tdiff
    pminub m2, m%2                      ; min(tdiff, ldiff)
    pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
    vpblendvb m0, m%1, m3, m0           ; left where the mask is set, top elsewhere
    pminub m1, m2
    pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
    vpblendvb m0, m5, m0, m1            ; keep the left/top pick where the mask is set, else topleft
%endmacro

; ipred_paeth_8bpc: Paeth prediction.
; m5 = topleft (byte at tl) and m4 = pb_1 are set up once, then the code
; dispatches on log2(w) through ipred_paeth_avx2_table. Each width keeps the
; top row (bytes from tl+1) and the precomputed ldiff = |topleft - top| in
; registers and walks down the left edge (bytes below tl).
cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
    lea r5, [ipred_paeth_avx2_table]
    tzcnt wd, wm                        ; w = log2(w)
    vpbroadcastb m5, [tlq] ; topleft
    movifnidn hd, hm
    movsxd wq, [r5+wq*4]
    vpbroadcastd m4, [base+pb_1]
    add wq, r5
    jmp wq
.w4:
    vpbroadcastd m6, [tlq+1] ; top
    mova m8, [base+ipred_h_shuf]
    lea r3, [strideq*3]
    psubusb m7, m5, m6
    psubusb m0, m6, m5
    por m7, m0 ; ldiff
.w4_loop:
    ; 8 left pixels per iteration: 4 rows are always stored, the other 4 only
    ; when h != 4
    sub tlq, 8
    vpbroadcastq m3, [tlq]
    pshufb m3, m8 ; left
    PAETH 6, 7
    vextracti128 xm1, m0, 1
    movd [dstq+strideq*0], xm0
    movd [dstq+strideq*1], xm1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r3 ], xm1, 2
    cmp hd, 4
    je .ret
    lea dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 1
    pextrd [dstq+strideq*1], xm1, 1
    pextrd [dstq+strideq*2], xm0, 3
    pextrd [dstq+r3 ], xm1, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 8
    jg .w4_loop
.ret:
    RET
ALIGN function_align
.w8:
    vpbroadcastq m6, [tlq+1]            ; top
    mova m8, [base+ipred_h_shuf]
    lea r3, [strideq*3]
    psubusb m7, m5, m6
    psubusb m0, m6, m5
    por m7, m0                          ; ldiff
.w8_loop:
    ; 4 rows per iteration
    sub tlq, 4
    vpbroadcastd m3, [tlq]
    pshufb m3, m8                       ; left
    PAETH 6, 7
    vextracti128 xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3 ], xm1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128 m6, [tlq+1]          ; top
    mova xm8, xm4 ; lower half = 1, upper half = 0
    psubusb m7, m5, m6
    psubusb m0, m6, m5
    por m7, m0                          ; ldiff
.w16_loop:
    ; 2 rows per iteration: the low lane takes byte 1 of the loaded pair,
    ; the high lane takes byte 0
    sub tlq, 2
    vpbroadcastd m3, [tlq]
    pshufb m3, m8
    PAETH 6, 7
    movu [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu m6, [tlq+1]                    ; top
    psubusb m7, m5, m6
    psubusb m0, m6, m5
    por m7, m0                          ; ldiff
.w32_loop:
    ; 1 row per iteration
    dec tlq
    vpbroadcastb m3, [tlq]              ; left
    PAETH 6, 7
    movu [dstq], m0
    add dstq, strideq
    dec hd
    jg .w32_loop
    RET
ALIGN function_align
.w64:
    movu m6, [tlq+ 1]                   ; top, first 32 bytes
    movu m7, [tlq+33]                   ; top, second 32 bytes
    ; cglobal declares 9 vector registers (m0-m8) but this path also needs
    ; m9, so on WIN64 xmm9 is saved by hand in the r4m stack slot (h's home;
    ; h already lives in a register) and restored before RET.
%if WIN64
    movaps r4m, xmm9
%endif
    psubusb m8, m5, m6
    psubusb m0, m6, m5
    psubusb m9, m5, m7
    psubusb m1, m7, m5
    por m8, m0                          ; ldiff for the first half
    por m9, m1                          ; ldiff for the second half
.w64_loop:
    ; 1 row per iteration, two 32-byte halves
    dec tlq
    vpbroadcastb m3, [tlq]              ; left
    PAETH 6, 8
    mova [dstq+32*0], m0
    PAETH 7, 9
    mova [dstq+32*1], m0
    add dstq, strideq
    dec hd
    jg .w64_loop
%if WIN64
    movaps xmm9, r4m
%endif
    RET

; ipred_smooth_v_8bpc: common setup shared by all widths, then dispatch on
; log2(w) through ipred_smooth_v_avx2_table.
;   m1 = 32768 >> log2(h) in every word (pmulhrsw by it = rounded division by h)
;   m3 = pw_512
;   m4 = h in every word
;   m5 = bottom pixel (byte at tl - h - 1) in every byte
; h is negated so the loops count it up towards zero.
cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_v_avx2_table
    tzcnt wd, wm                        ; w = log2(w)
    mov hd, hm
    tzcnt r6d, hd                       ; log2(h)
    mov r5d, 32768
    shrx r5d, r5d, r6d
    movd xm1, r5d
    lea r6, [ipred_smooth_v_avx2_table]
    movsxd wq, [r6+wq*4]
    movd xm4, hd
    neg hq
    vpbroadcastb m5, [tlq+hq-1] ; bottom
    vpbroadcastw m4, xm4
    vpbroadcastd m3, [base+pw_512]
    vpbroadcastw m1, xm1 ; 32768 >> log2(h)
    add wq, r6
    jmp wq
DEFINE_ARGS dst, stride, tl, stride3, h
.w4:
    WIN64_SPILL_XMM 10
    vpbroadcastd m2, [base+pb_4_m4]     ; per-step delta applied to the row weights
    psubb m4, [base+pb_4x_1_m1_4x_3_m3_4x_2_m2_4x_4_m4]
    vpbroadcastd m0, [tlq+1]            ; top
    punpcklbw m5, m0, m5 ; top, bottom
    lea stride3q, [strideq*3]
    psubb m6, m4, m2
    pmaddubsw m7, m5, m4                ; weighted top/bottom sums
    pmaddubsw m8, m5, m6
    psubb m4, m6, m2
    REPX {pmulhrsw x, m1}, m7, m8
    packuswb m7, m8 ; pred
    ; second stage: pair each pred byte with the top pixel and blend with a
    ; weight table that is picked by block height
    punpckhbw m8, m7, m0
    punpcklbw m7, m0
    cmp hd, -16                         ; hd holds -h: branch taken when h < 16
    jg .w4x48
    mova m9, [base+pb_8x32_4x56_8_4x48_16_4x60_4]
    mova m0, [base+pb_4x62_2_4x64_0_4x63_1_4x64_0]
    jmp .w4cont
.w4x48:
    mova m9, [base+pb_8x32_4x62_2_4x56_8_4x64_0]
    vpbroadcastd m0, [base+pw_64]
.w4cont:
    pmaddubsw m7, m9
    pmaddubsw m8, m0
    REPX {pmulhrsw x, m3}, m7, m8
.w4_loop:
    packuswb m7, m8 ; dst
    vextracti128 xm8, m7, 1
    movd [dstq+strideq*0], xm7
    movd [dstq+strideq*1], xm8
    pextrd [dstq+strideq*2], xm7, 1
    pextrd [dstq+stride3q ], xm8, 1
cmp hd, -4 je .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm7, 2 pextrd [dstq+strideq*1], xm8, 2 pextrd [dstq+strideq*2], xm7, 3 pextrd [dstq+stride3q ], xm8, 3 lea dstq, [dstq+strideq*4] add hd, 8 jge .w4_ret ; non-first lines don't need to do pred adjustment psubb m6, m4, m2 pmaddubsw m7, m5, m4 pmaddubsw m8, m5, m6 psubb m4, m6, m2 REPX {pmulhrsw x, m1}, m7, m8 jmp .w4_loop .w4_ret: RET ALIGN function_align .w8: WIN64_SPILL_XMM 12 vpbroadcastd m2, [base+pb_2_m2] psubb m4, [base+pb_8x_1_m1_8x_2_m2] vpbroadcastq m0, [tlq+1] punpcklbw m5, m0, m5 lea stride3q, [strideq*3] cmp hd, -4 je .w8x4 movddup m9, [pb_8x32_4x56_8_4x48_16_4x60_4] movddup m10, [pb_8x32_4x56_8_4x48_16_4x60_4+8] movddup m11, [pb_4x62_2_4x64_0_4x63_1_4x64_0] jmp .w8_loop .w8x4: movddup m9, [pb_8x32_4x62_2_4x56_8_4x64_0] movddup m10, [pb_8x32_4x62_2_4x56_8_4x64_0+8] pxor m11, m11 .w8_loop: psubb m6, m4, m2 pmaddubsw m7, m5, m4 pmaddubsw m8, m5, m6 psubb m4, m6, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred punpckhbw m8, m7, m0 punpcklbw m7, m0 pmaddubsw m8, m10 pmaddubsw m7, m9 REPX {pmulhrsw x, m3}, m8, m7 packuswb m7, m8 vextracti128 xm8, m7, 1 movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm8 movhps [dstq+strideq*2], xm7 movhps [dstq+r3 ], xm8 lea dstq, [dstq+strideq*4] add hq, 4 jz .w8_ret ptest m11, m11 jz .w8_loop_noadj mova m9, m11 vpbroadcastd m10, [pw_64] pxor m11, m11 jmp .w8_loop .w8_loop_noadj: psubb m6, m4, m2 pmaddubsw m7, m5, m4 pmaddubsw m8, m5, m6 psubb m4, m6, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred vextracti128 xm8, m7, 1 movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm8 movhps [dstq+strideq*2], xm7 movhps [dstq+r3 ], xm8 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop_noadj .w8_ret: RET ALIGN function_align .w16: WIN64_SPILL_XMM 14 vpbroadcastd m2, [base+pb_2_m2] psubb m4, [base+pb_8x_1_m1_8x_2_m2] vbroadcasti128 m0, [tlq+1] punpcklbw m6, m0, m5 punpckhbw m5, m0, m5 vpbroadcastd m10, [pb_64] cmp hd, -64 je .w16x64 movsldup m9, 
[pb_4x32_4x3_4x32_4x1_4x16_4x2_4x16_4x0] vpbroadcastd m11, [pb_252] movd xm12, [pd_2] jmp .w16_loop .w16x64: vpbroadcastd m9, [pb_32] vpbroadcastd m11, [pb_254] movd xm12, [pd_1] .w16_loop: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred psubb m13, m10, m9 punpcklbw m13, m9 punpckhbw m8, m7, m0 punpcklbw m7, m0 REPX {pmaddubsw x, m13}, m8, m7 REPX {pmulhrsw x, m3 }, m8, m7 packuswb m7, m8 movu [dstq+strideq*0], xm7 vextracti128 [dstq+strideq*1], m7, 1 lea dstq, [dstq+strideq*2] add hq, 2 jz .w16_ret pand m9, m11 ptest m9, m9 jz .w16_loop_noadj psrlw m9, xm12 jmp .w16_loop .w16_loop_noadj: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred movu [dstq+strideq*0], xm7 vextracti128 [dstq+strideq*1], m7, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w16_loop_noadj .w16_ret: RET ALIGN function_align .w32: WIN64_SPILL_XMM 14 vpbroadcastd m2, [base+pb_1_m1] psubb m4, m2 movu m0, [tlq+1] punpcklbw m6, m0, m5 punpckhbw m5, m0, m5 vpbroadcastd m9, [pb_32] vpbroadcastd m10, [pb_64] vpbroadcastd m11, [pb_254] xor r2d, r2d xor r5d, r5d cmp hd, -32 jg .w32_loop mov r2d, 1 .w32_loop: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred psubb m12, m10, m9 punpcklbw m12, m9 punpckhbw m8, m7, m0 punpcklbw m7, m0 REPX {pmaddubsw x, m12}, m8, m7 REPX {pmulhrsw x, m3 }, m8, m7 packuswb m7, m8 movu [dstq], m7 add dstq, strideq inc hq jz .w32_ret xor r5d, r2d jnz .w32_loop pand m9, m11 ptest m9, m9 jz .w32_loop_noadj psrlw m9, 1 jmp .w32_loop .w32_loop_noadj: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred movu [dstq], m7 add dstq, strideq inc hq jl .w32_loop_noadj .w32_ret: RET ALIGN function_align .w64: WIN64_SPILL_XMM 16 vpbroadcastd m2, [base+pb_1_m1] psubb m4, m2 movu m0, [tlq+ 1] movu m12, [tlq+33] punpcklbw m13, m12, m5 punpckhbw m14, m12, m5 punpcklbw m6, 
m0, m5 punpckhbw m5, m0, m5 vpbroadcastd m9, [pb_32] vpbroadcastd m10, [pb_64] vpbroadcastd m11, [pb_254] xor r2d, r2d xor r5d, r5d cmp hd, -16 jg .w64_loop mov r2d, 1 .w64_loop: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred psubb m15, m10, m9 punpcklbw m15, m9 punpckhbw m8, m7, m0 punpcklbw m7, m0 REPX {pmaddubsw x, m15}, m8, m7 REPX {pmulhrsw x, m3 }, m8, m7 packuswb m7, m8 mova [dstq+ 0], m7 pmaddubsw m7, m13, m4 pmaddubsw m8, m14, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8 packuswb m7, m8 ; pred punpckhbw m8, m7, m12 punpcklbw m7, m12 REPX {pmaddubsw x, m15}, m8, m7 REPX {pmulhrsw x, m3 }, m8, m7 packuswb m7, m8 mova [dstq+32], m7 add dstq, strideq inc hq jz .w64_ret xor r5d, r2d jnz .w64_loop pand m9, m11 ptest m9, m9 jz .w64_loop_noadj psrlw m9, 1 jmp .w64_loop .w64_loop_noadj: pmaddubsw m7, m6, m4 pmaddubsw m8, m5, m4 pmaddubsw m9, m13, m4 pmaddubsw m10, m14, m4 psubb m4, m2 REPX {pmulhrsw x, m1}, m7, m8, m9, m10 packuswb m7, m8 ; pred packuswb m9, m10 mova [dstq+ 0], m7 mova [dstq+32], m9 add dstq, strideq inc hq jl .w64_loop_noadj .w64_ret: RET cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h %define base r5-ipred_smooth_h_avx2_table lea r5, [ipred_smooth_h_avx2_table] mov wd, wm vpbroadcastb m3, [tlq+wq+1] ; right tzcnt wd, wd mov hd, hm movsxd wq, [r5+wq*4] add wq, r5 vpbroadcastd m4, [base+pw_512] sub tlq, hq jmp wq DEFINE_ARGS dst, stride, tl, stride3, h .w4: WIN64_SPILL_XMM 9 vpbroadcastq m0, [base+pb_3_1_to_0_4] vpbroadcastd m2, [base+pw_8192] mova m6, [base+pb_4x7_4x5_4x3_4x1_4x6_4x4_4x2_4x0] sub tlq, 8 lea stride3q, [strideq*3] cmp hd, 16 jl .w4x48 vpbroadcastq m1, [base+pb_32_32_48_16_56_8_60_4] jmp .w4_loop .w4x48: vpbroadcastq m1, [base+pb_32_32_56_8_62_2_64_0] .w4_loop: vpbroadcastq m5, [tlq+hq] pshufb m5, m6 punpcklbw m7, m5, m3 punpckhbw m8, m5, m3 REPX {pmaddubsw x, m0}, m7, m8 REPX {pmulhrsw x, m2}, m7, m8 packuswb m7, m8 punpckhbw m8, m7, m5 punpcklbw m7, m5 REPX {pmaddubsw x, 
m1}, m8, m7 REPX {pmulhrsw x, m4}, m8, m7 packuswb m7, m8 vextracti128 xm8, m7, 1 movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm8 pextrd [dstq+strideq*2], xm7, 1 pextrd [dstq+stride3q ], xm8, 1 cmp hd, 4 je .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm7, 2 pextrd [dstq+strideq*1], xm8, 2 pextrd [dstq+strideq*2], xm7, 3 pextrd [dstq+stride3q ], xm8, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .w4_ret: RET ALIGN function_align .w8: WIN64_SPILL_XMM 9 vbroadcasti128 m0, [base+pb_7_1_to_0_8] vpbroadcastd m2, [base+pw_4096] movshdup m6, [base+pb_4x32_4x3_4x32_4x1_4x16_4x2_4x16_4x0] sub tlq, 4 lea stride3q, [strideq*3] cmp hd, 8 jl .w8x4 vbroadcasti128 m1, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_2x64_0] jmp .w8_loop .w8x4: vbroadcasti128 m1, [base+pb_32_32_56_8_62_2_5x64_0] .w8_loop: vpbroadcastd m5, [tlq+hq] pshufb m5, m6 punpcklbw m7, m5, m3 punpckhbw m8, m5, m3 REPX {pmaddubsw x, m0}, m7, m8 REPX {pmulhrsw x, m2}, m7, m8 packuswb m7, m8 punpckhbw m8, m7, m5 punpcklbw m7, m5 REPX {pmaddubsw x, m1}, m8, m7 REPX {pmulhrsw x, m4}, m8, m7 packuswb m7, m8 vextracti128 xm8, m7, 1 movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm8 movhps [dstq+strideq*2], xm7 movhps [dstq+r3 ], xm8 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: WIN64_SPILL_XMM 11 vpbroadcastd m10, [base+pb_8_m8] vbroadcasti128 m0, [base+pb_15_1_to_8_8] psubb m10, m0, m10 vpbroadcastd m2, [base+pw_2048] vpbroadcastd xm6, [base+pb_1] sub tlq, 2 cmp hd, 64 je .w16x64 vbroadcasti128 m1, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_2x64_0] vpbroadcastd m9, [base+pw_64] jmp .w16_loop .w16x64: vbroadcasti128 m1, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4] vbroadcasti128 m9, [base+pb_2x62_2_2x63_1_4x64_0] .w16_loop: vpbroadcastw m5, [tlq+hq] pshufb m5, m6 punpcklbw m7, m5, m3 pmaddubsw m8, m7, m10 pmaddubsw m7, m0 REPX {pmulhrsw x, m2}, m8, m7 packuswb m7, m8 punpckhbw m8, m7, m5 punpcklbw m7, m5 pmaddubsw m8, m9 pmaddubsw m7, m1 REPX {pmulhrsw x, m4}, m7, m8 
packuswb m7, m8 movu [dstq+strideq*0], xm7 vextracti128 [dstq+strideq*1], m7, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: WIN64_SPILL_XMM 10 vpbroadcastd m6, [base+pb_8_m8] mova m0, [base+pb_31_1_to_24_8_and_15_17_to_8_24] psubb m6, m0, m6 vpbroadcastd m2, [base+pw_1024] dec tlq cmp hd, 16 jg .w32x3264 mova m1, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_10x64_0] vpbroadcastd m9, [base+pw_64] jmp .w32_loop .w32x3264: mova m1, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4_8x64_0] mova m9, [base+pb_2x62_2_2x63_1_12x64_0] .w32_loop: vpbroadcastb m5, [tlq+hq] pshufb m5, m6 punpcklbw m7, m5, m3 pmaddubsw m8, m7, m6 pmaddubsw m7, m0 REPX {pmulhrsw x, m2}, m8, m7 packuswb m7, m8 punpckhbw m8, m7, m5 punpcklbw m7, m5 pmaddubsw m8, m9 pmaddubsw m7, m1 REPX {pmulhrsw x, m4}, m7, m8 packuswb m7, m8 movu [dstq], m7 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: WIN64_SPILL_XMM 13 vpbroadcastd m6, [base+pb_8_m8] mova m0, [base+pb_63_1_to_56_8_and_47_17_to_40_24] paddb m2, m6, m6 psubb m6, m0, m6 paddb m2, m2 psubb m10, m0, m2 psubb m2, m6, m2 dec tlq cmp hd, 8 jg .w64x163264 mova m1, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_10x64_0] vpbroadcastd m9, [base+pw_64] jmp .w64_loop .w64x163264: mova m1, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4_8x64_0] mova m9, [base+pb_2x62_2_2x63_1_12x64_0] .w64_loop: vpbroadcastb m5, [tlq+hq] pshufb m5, m6 punpcklbw m7, m5, m3 pmaddubsw m11, m7, m10 pmaddubsw m12, m7, m2 pmaddubsw m8, m7, m6 pmaddubsw m7, m0 REPX {pmulhrsw x, m4}, m11, m12, m8, m7 packuswb m7, m8 packuswb m11, m12 punpckhbw m8, m7, m5 punpcklbw m7, m5 pmaddubsw m8, m9 pmaddubsw m7, m1 REPX {pmulhrsw x, m4}, m7, m8 packuswb m7, m8 mova [dstq+32*0], m7 mova [dstq+32*1], m11 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_8bpc, 3, 7, 16, 0 - 11 * mmsize, dst, stride, tl, w, h %define base r6-ipred_smooth_avx2_table mov wd, wm mov hd, hm vpbroadcastb m4, [tlq+wq+1] ; right movd xm2, hd mov r5d, 32768 tzcnt r6d, hd shrx r5d, 
r5d, r6d ; 32768 >> log2(h) movd xm3, r5d lea r6, [ipred_smooth_avx2_table] tzcnt wd, wd mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] vpbroadcastb m0, [r5-1] ; bottom vpbroadcastd m1, [base+pw_512] vpbroadcastw m2, xm2 vpbroadcastw m3, xm3 add wq, r6 jmp wq ; m0=bottom ; m1=pw_512 ; m2=height ; m3=32768>>log2(h) ; m4=right DEFINE_ARGS dst, stride, tl, stride3, h .w4: vpbroadcastd m5, [base+pb_4_m4] psubb m2, [base+pb_4x_1_m1_4x_3_m3_4x_2_m2_4x_4_m4] vpbroadcastq m7, [base+pb_3_1_to_0_4] vpbroadcastd m8, [base+pw_8192] mova m9, [base+pb_4x7_4x5_4x3_4x1_4x6_4x4_4x2_4x0] vpbroadcastd m6, [tlq+1] sub tlq, hq sub tlq, 8 punpcklbw m0, m6, m0 ; top, bottom lea stride3q, [strideq*3] cmp hd, 16 jl .w4x48 mova m10, [base+pb_8x32_4x56_8_4x48_16_4x60_4] mova m11, [base+pb_4x62_2_4x64_0_4x63_1_4x64_0] vpbroadcastq m12, [base+pb_32_32_48_16_56_8_60_4] jmp .w4cont .w4x48: mova m10, [base+pb_8x32_4x62_2_4x56_8_4x64_0] vpbroadcastd m11, [base+pw_64] vpbroadcastq m12, [base+pb_32_32_56_8_62_2_64_0] .w4cont: ; vertical filter psubb m13, m2, m5 pmaddubsw m14, m0, m2 pmaddubsw m15, m0, m13 psubb m2, m13, m5 REPX {pmulhrsw x, m3}, m14, m15 packuswb m14, m15 ; pred punpckhbw m15, m14, m6 punpcklbw m14, m6 pmaddubsw m15, m11 pmaddubsw m14, m10 REPX {pmulhrsw x, m1}, m15, m14 .w4_loop: packuswb m14, m15 ; smooth_v output ; horizontal filter vpbroadcastq m13, [tlq+hq] pshufb m13, m9 punpcklbw m15, m13, m4 punpckhbw m10, m13, m4 REPX {pmaddubsw x, m7}, m15, m10 REPX {pmulhrsw x, m8}, m15, m10 packuswb m15, m10 punpckhbw m10, m15, m13 punpcklbw m15, m13 REPX {pmaddubsw x, m12}, m10, m15 REPX {pmulhrsw x, m1 }, m10, m15 packuswb m15, m10 ; smooth_h output ; write-out pavgb m14, m15 vextracti128 xm15, m14, 1 movd [dstq+strideq*0], xm14 movd [dstq+strideq*1], xm15 pextrd [dstq+strideq*2], xm14, 1 pextrd [dstq+r3 ], xm15, 1 cmp hd, 4 je .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm14, 2 pextrd [dstq+strideq*1], xm15, 2 pextrd [dstq+strideq*2], xm14, 3 pextrd [dstq+r3 ], xm15, 3 
lea dstq, [dstq+strideq*4] sub hd, 8 jz .w4_ret ; vertical filter psubb m13, m2, m5 pmaddubsw m14, m0, m2 pmaddubsw m15, m0, m13 psubb m2, m13, m5 REPX {pmulhrsw x, m3}, m14, m15 jmp .w4_loop .w4_ret: RET ALIGN function_align .w8: vpbroadcastd m5, [base+pb_2_m2] psubb m2, [base+pb_8x_1_m1_8x_2_m2] vbroadcasti128 m7, [base+pb_7_1_to_0_8] vpbroadcastd m8, [base+pw_4096] movshdup m9, [base+pb_4x32_4x3_4x32_4x1_4x16_4x2_4x16_4x0] vpbroadcastq m6, [tlq+1] sub tlq, hq sub tlq, 4 punpcklbw m0, m6, m0 ; top, bottom lea stride3q, [strideq*3] cmp hd, 8 jl .w8x4 movddup m10, [base+pb_8x32_4x56_8_4x48_16_4x60_4] movddup m11, [base+pb_8x32_4x56_8_4x48_16_4x60_4+8] vbroadcasti128 m12, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_2x64_0] mov r5d, 1 jmp .w8_loop .w8x4: movddup m10, [base+pb_8x32_4x62_2_4x56_8_4x64_0] movddup m11, [base+pb_8x32_4x62_2_4x56_8_4x64_0+8] vbroadcasti128 m12, [base+pb_32_32_56_8_62_2_5x64_0] .w8_loop: ; vertical filter psubb m13, m2, m5 pmaddubsw m14, m0, m2 pmaddubsw m15, m0, m13 psubb m2, m13, m5 REPX {pmulhrsw x, m3}, m14, m15 packuswb m14, m15 ; pred punpckhbw m15, m14, m6 punpcklbw m14, m6 pmaddubsw m15, m11 pmaddubsw m14, m10 REPX {pmulhrsw x, m1}, m15, m14 .w8_loop_noadj_cont: packuswb m14, m15 ; smooth_v output ; horizontal filter vpbroadcastd m13, [tlq+hq] pshufb m13, m9 punpcklbw m15, m13, m4 punpckhbw m10, m13, m4 REPX {pmaddubsw x, m7}, m15, m10 REPX {pmulhrsw x, m8}, m15, m10 packuswb m15, m10 punpckhbw m10, m15, m13 punpcklbw m15, m13 REPX {pmaddubsw x, m12}, m10, m15 REPX {pmulhrsw x, m1 }, m10, m15 packuswb m15, m10 ; smooth_h output ; write-out pavgb m14, m15 vextracti128 xm15, m14, 1 movq [dstq+strideq*0], xm14 movq [dstq+strideq*1], xm15 movhps [dstq+strideq*2], xm14 movhps [dstq+r3 ], xm15 lea dstq, [dstq+strideq*4] sub hd, 4 jz .w8_ret dec r5d jl .w8_loop_noadj movddup m10, [base+pb_4x62_2_4x64_0_4x63_1_4x64_0] vpbroadcastd m11, [pw_64] jmp .w8_loop .w8_loop_noadj: ; vertical filter psubb m13, m2, m5 pmaddubsw m14, m0, m2 pmaddubsw m15, 
m0, m13 psubb m2, m13, m5 REPX {pmulhrsw x, m3}, m14, m15 jmp .w8_loop_noadj_cont .w8_ret: RET ALIGN function_align .w16: vpbroadcastd m5, [base+pb_2_m2] psubb m2, [base+pb_8x_1_m1_8x_2_m2] paddb m15, m5, m5 paddb m15, m15 ; pb_8_m8 vbroadcasti128 m7, [base+pb_15_1_to_8_8] psubb m15, m7, m15 ; pb_7_9_to_0_16 vpbroadcastd m8, [base+pw_2048] vpbroadcastd xm9, [base+pb_1] vbroadcasti128 m6, [tlq+1] sub tlq, hq sub tlq, 2 punpckhbw m13, m6, m0 punpcklbw m0, m6, m0 ; top, bottom mova [rsp+0*mmsize], m9 mova [rsp+1*mmsize], m5 mova [rsp+4*mmsize], m15 cmp hd, 64 je .w16x64 mov r5d, 2 movsldup m10, [base+pb_4x32_4x3_4x32_4x1_4x16_4x2_4x16_4x0] vbroadcasti128 m11, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_2x64_0] vpbroadcastd m12, [base+pw_64] vpbroadcastd m9, [base+pb_252] mov qword [rsp+5*mmsize], 2 jmp .w16_cont .w16x64: mov r5d, 5 vpbroadcastd m10, [base+pb_32] vbroadcasti128 m11, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4] vbroadcasti128 m12, [base+pb_2x62_2_2x63_1_4x64_0] vpbroadcastd m9, [base+pb_254] mov qword [rsp+5*mmsize], 1 .w16_cont: mova [rsp+2*mmsize], m11 mova [rsp+3*mmsize], m12 vpbroadcastd m5, [pb_64] .w16_loop: ; vertical filter pmaddubsw m14, m0, m2 pmaddubsw m15, m13, m2 psubb m2, [rsp+1*mmsize] REPX {pmulhrsw x, m3}, m14, m15 psubb m11, m5, m10 packuswb m14, m15 ; pred punpckhbw m15, m14, m6 punpcklbw m11, m10 punpcklbw m14, m6 pmaddubsw m15, m11 pmaddubsw m14, m11 REPX {pmulhrsw x, m1}, m15, m14 .w16_loop_noadj_cont: packuswb m14, m15 ; smooth_v output ; horizontal filter vpbroadcastw m11, [tlq+hq] pshufb m11, [rsp+0*mmsize] punpcklbw m15, m11, m4 pmaddubsw m12, m15, [rsp+4*mmsize] pmaddubsw m15, m7 REPX {pmulhrsw x, m8}, m15, m12 packuswb m15, m12 punpckhbw m12, m15, m11 punpcklbw m15, m11 pmaddubsw m12, [rsp+3*mmsize] pmaddubsw m15, [rsp+2*mmsize] REPX {pmulhrsw x, m1}, m12, m15 packuswb m15, m12 ; smooth_h output ; write-out pavgb m14, m15 movu [dstq+strideq*0], xm14 vextracti128 [dstq+strideq*1], m14, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jz .w16_ret 
dec r5d jl .w16_loop_noadj pand m10, m9 psrlw m10, [rsp+5*mmsize] jmp .w16_loop .w16_loop_noadj: ; vertical filter pmaddubsw m14, m0, m2 pmaddubsw m15, m13, m2 psubb m2, [rsp+1*mmsize] REPX {pmulhrsw x, m3}, m14, m15 jmp .w16_loop_noadj_cont .w16_ret: RET ALIGN function_align .w32: vpbroadcastd m5, [base+pb_1_m1] psubb m2, m5 mova m7, [base+pb_31_1_to_24_8_and_15_17_to_8_24] vpbroadcastd m15, [base+pb_8_m8] psubb m15, m7, m15 vpbroadcastd m8, [base+pw_1024] movu m6, [tlq+1] sub tlq, hq dec tlq punpckhbw m13, m6, m0 punpcklbw m0, m6, m0 ; top, bottom ; stack0 is unused mova [rsp+1*mmsize], m5 mova [rsp+4*mmsize], m15 vpbroadcastd m10, [base+pb_32] vpbroadcastd m5, [base+pb_64] vpbroadcastd m9, [base+pb_254] cmp hd, 16 jg .w32x3264 mov r5d, 5 mova m11, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_10x64_0] vpbroadcastd m12, [base+pw_64] xor r6d, r6d jmp .w32_cont .w32x3264: mov r5d, 11 mova m11, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4_8x64_0] mova m12, [base+pb_2x62_2_2x63_1_12x64_0] mov r6d, 0x8000 .w32_cont: xor r3d, r3d mova [rsp+2*mmsize], m11 mova [rsp+3*mmsize], m12 .w32_loop: ; vertical filter pmaddubsw m14, m0, m2 pmaddubsw m15, m13, m2 psubb m2, [rsp+1*mmsize] REPX {pmulhrsw x, m3}, m14, m15 psubb m11, m5, m10 packuswb m14, m15 ; pred punpckhbw m15, m14, m6 punpcklbw m11, m10 punpcklbw m14, m6 pmaddubsw m15, m11 pmaddubsw m14, m11 REPX {pmulhrsw x, m1}, m15, m14 .w32_loop_noadj_cont: packuswb m14, m15 ; smooth_v output ; horizontal filter vpbroadcastb m11, [tlq+hq] punpcklbw m15, m11, m4 pmaddubsw m12, m15, [rsp+4*mmsize] pmaddubsw m15, m7 REPX {pmulhrsw x, m8}, m15, m12 packuswb m15, m12 punpckhbw m12, m15, m11 punpcklbw m15, m11 pmaddubsw m12, [rsp+3*mmsize] pmaddubsw m15, [rsp+2*mmsize] REPX {pmulhrsw x, m1}, m12, m15 packuswb m15, m12 ; smooth_h output ; write-out pavgb m14, m15 movu [dstq], m14 add dstq, strideq dec hd jz .w32_ret dec r5d jl .w32_loop_noadj xor r3d, r6d jnz .w32_loop pand m10, m9 psrlw m10, 1 jmp .w32_loop .w32_loop_noadj: ; vertical filter 
; (continuation of .w32_loop_noadj) vertical filter without the secondary blend
    pmaddubsw           m14, m0, m2
    pmaddubsw           m15, m13, m2
    psubb                m2, [rsp+1*mmsize]
    REPX   {pmulhrsw x, m3}, m14, m15
    jmp .w32_loop_noadj_cont
.w32_ret:
    RET
ALIGN function_align
; ---------------------------------------------------------------------------
; ipred_smooth_8bpc, 64-pixel-wide path. One output row per iteration.
;
; Per row:
;   1. vertical filter   -> m14 (pixels 0-31) / m11 (pixels 32-63), parked in
;                           rsp9/rsp10 while the horizontal filter runs
;   2. horizontal filter -> m14 / m11
;   3. pavgb of the two results, stored to dst
;
; Loop bookkeeping:
;   r5d = number of rows that still receive the secondary vertical blend
;         (5 when h <= 8, 11 otherwise); once it goes negative the loop uses
;         .w64_loop_noadj, which skips that blend.
;   r6d = 0 or 0x8000, xor-ed into r3d each row: with 0 the weight in m10 is
;         halved after every row, with 0x8000 only after every second row.
;         NOTE: r6 doubles as `base`, so no [base+...] load may follow the
;         point where r6d is overwritten below.
; ---------------------------------------------------------------------------
.w64:
    vpbroadcastd         m5, [base+pb_1_m1]
    vpbroadcastd        m13, [base+pb_8_m8]
    psubb                m2, m5
    paddb               m15, m13, m13
    mova                 m7, [base+pb_63_1_to_56_8_and_47_17_to_40_24]
    paddb               m15, m15 ; pb_32_m32
    psubb               m13, m7, m13
    psubb               m14, m7, m15
    psubb               m15, m13, m15
    movu                 m6, [tlq+1]
    movu                 m8, [tlq+33]
    sub                 tlq, hq
    dec                 tlq
    mova     [rsp+1*mmsize], m5
    mova     [rsp+4*mmsize], m7
    mova     [rsp+5*mmsize], m13
    mova     [rsp+6*mmsize], m14
    mova     [rsp+7*mmsize], m15
    punpckhbw            m9, m8, m0
    punpcklbw            m7, m8, m0
    punpckhbw           m13, m6, m0
    punpcklbw            m0, m6, m0 ; top, bottom
    vpbroadcastd        m10, [base+pb_32]
    vpbroadcastd         m5, [base+pb_64]
    vpbroadcastd        m15, [base+pb_254]
    cmp                  hd, 8
    jg .w64x163264
    mov                 r5d, 5
    mova                m11, [base+pb_32_32_48_16_56_8_60_4_62_2_63_1_10x64_0]
    vpbroadcastd        m12, [base+pw_64]
    xor                 r6d, r6d ; halve m10 after every row (base is dead from here)
    jmp .w64_cont
.w64x163264:
    mov                 r5d, 11
    mova                m11, [base+pb_4x32_2x48_16_2x56_2x8_2x60_4_8x64_0]
    mova                m12, [base+pb_2x62_2_2x63_1_12x64_0]
    mov                 r6d, 0x8000 ; halve m10 after every second row (base is dead from here)
.w64_cont:
    xor                 r3d, r3d
    mova     [rsp+0*mmsize], m15
    mova     [rsp+2*mmsize], m11
    mova     [rsp+3*mmsize], m12
    mova     [rsp+8*mmsize], m5
    ; m0/13/7/9=top,bottom interleaved
    ; m1=pw_512
    ; m2=pb_hmin1_1
    ; m3=pw_32768>>log2(h)
    ; m4=right
    ; m6/8=top
    ; m10=vertical secondary weights [e.g. pb_32]
    ; m11/m12/m14/m15=free
    ; rsp0=pb_254
    ; rsp1=pb_1_m1
    ; rsp2/3=horizontal secondary weights already interleaved
    ; rsp4-7=horizontal primary weights already interleaved
    ; rsp8=pb_64
    ; rsp9/10=temp storage for smooth_v while we run smooth_h
.w64_loop:
    ; vertical filter
    pmaddubsw           m14, m0, m2
    pmaddubsw           m15, m13, m2
    pmaddubsw           m11, m7, m2
    pmaddubsw           m12, m9, m2
    psubb                m2, [rsp+1*mmsize]
    mova                 m5, [rsp+8*mmsize]
    REPX   {pmulhrsw x, m3}, m14, m15, m11, m12
    psubb                m5, m10 ; 64 - secondary weight
    packuswb            m14, m15 ; pred
    packuswb            m11, m12
    punpcklbw            m5, m10 ; (64 - weight, weight) byte pairs
    punpckhbw           m15, m14, m6
    punpcklbw           m14, m6
    punpckhbw           m12, m11, m8
    punpcklbw           m11, m8
    ; blend pred with the top row using the pairs built in m5
    REPX  {pmaddubsw x, m5}, m15, m14, m12, m11
    REPX   {pmulhrsw x, m1}, m15, m14, m12, m11
.w64_loop_noadj_cont:
    packuswb            m14, m15 ; smooth_v output
    packuswb            m11, m12
    mova    [rsp+ 9*mmsize], m14
    mova    [rsp+10*mmsize], m11
    ; horizontal filter
    vpbroadcastb         m5, [tlq+hq]
    punpcklbw           m12, m5, m4
    pmaddubsw           m14, m12, [rsp+4*mmsize]
    pmaddubsw           m15, m12, [rsp+5*mmsize]
    pmaddubsw           m11, m12, [rsp+6*mmsize]
    pmaddubsw           m12, [rsp+7*mmsize]
    REPX   {pmulhrsw x, m1}, m14, m15, m11, m12
    packuswb            m14, m15
    packuswb            m11, m12
    ; only the first 32 pixels (m14) go through the secondary horizontal blend
    punpckhbw           m15, m14, m5
    punpcklbw           m14, m5
    pmaddubsw           m15, [rsp+3*mmsize]
    pmaddubsw           m14, [rsp+2*mmsize]
    REPX   {pmulhrsw x, m1}, m15, m14
    packuswb            m14, m15 ; smooth_h output
    ; write-out
    pavgb               m14, [rsp+ 9*mmsize]
    pavgb               m11, [rsp+10*mmsize]
    mova          [dstq+ 0], m14
    mova          [dstq+32], m11
    add                dstq, strideq
    dec                  hd
    jz .w64_ret                  ; all rows written
    dec                 r5d
    jl .w64_loop_noadj           ; secondary vertical blend exhausted
    xor                 r3d, r6d
    jnz .w64_loop                ; skip the weight update on this row
    pand                m10, [rsp+0*mmsize] ; clear bit 0 so the word shift stays per-byte
    psrlw               m10, 1
    jmp .w64_loop
.w64_loop_noadj:
    ; vertical filter
    pmaddubsw           m14, m0, m2
    pmaddubsw           m15, m13, m2
    pmaddubsw           m11, m7, m2
    pmaddubsw           m12, m9, m2
    psubb                m2, [rsp+1*mmsize]
    REPX   {pmulhrsw x, m3}, m14, m15, m11, m12
    jmp .w64_loop_noadj_cont
.w64_ret:
    RET

cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
    lea                  r6, [ipred_z1_avx2_table]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    lea                  r7, [dr_intra_derivative]
    inc                 tlq
    movsxd               wq, [r6+wq*4]
    add                  wq, r6
    mov                 dxd, angled
and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [r7+dxq] xor angled, 0x4ff ; d = 90 - angle vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] jmp wq .w4: cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) ALLOC_STACK -32, 8 mova xm1, [tlq-1] pshufb xm0, xm1, [z_upsample1] pshufb xm1, [z_upsample2] vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse add dxd, dxd ; pw_512 (which is already in m3) pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 pextrd [rsp+16], xm1, 3 ; top[max_base_x] pmaddubsw xm1, xm2 movd xm7, dxd mov r3d, dxd ; xpos vpbroadcastw m7, xm7 paddw xm1, xm0 movq xm0, [tlq] pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 lea r2, [strideq*3] paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 punpcklbw xm0, xm1 psllw m7, 2 mova [rsp], xm0 .w4_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [rsp+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [rsp+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos += dx pmulhrsw m0, m3 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r2 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_upsample_loop RET ALIGN function_align .filter_strength: ; w4/w8/w16 ; The C version uses a lot of branches, but we can do all the comparisons ; in parallel and use popcnt to get the final filter strength value. 
%define base r3-z_filter_t0 lea r3, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases pcmpgtb m1, m2 pmovmskb r5d, m1 ret .w4_no_upsample: ALLOC_STACK -16, 11 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_8] vbroadcasti128 m2, [tlq-1] pminub m1, m7, [base+z_filter_s] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pminub m7, [base+z_filter_s+8] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r3d, 9 mov tlq, rsp cmp hd, 4 cmovne maxbased, r3d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq], xm0 .w4_main: movd xm6, dxd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 mov r3d, dxd ; xpos movd xm9, maxbased vpbroadcastw m9, xm9 vbroadcasti128 m8, [z1_shuf_w4] psrlw m7, 8 ; top[max_base_x] paddw m10, m6, m6 psubw m9, m0 ; max_base_x vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 paddw m10, m10 .w4_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_x pmulhrsw m0, m3 paddw m6, m10 ; xpos += dx lea r5, [dstq+strideq*2] 
vpblendvb m0, m7, m0, m1 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [r5 +strideq*0], xm0 pextrd [r5 +strideq*1], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r3d, maxbased jb .w4_loop packuswb xm7, xm7 lea r6, [strideq*3] .w4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r6 ], xm7 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_end_loop .w4_end: RET ALIGN function_align .w8: lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 ALLOC_STACK -32, 8 movu xm2, [z_filter_s+6] mova xm0, [tlq-1] movd xm6, hd vinserti128 m0, [tlq+7], 1 vpbroadcastb xm6, xm6 vbroadcasti128 m1, [z_upsample1] pminub xm6, xm2 vpbroadcastd m7, [pb_36_m4] vinserti128 m2, xm6, 1 add dxd, dxd pshufb m1, m0, m1 pshufb m2, m0, m2 movd xm6, dxd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r3d, dxd psrldq m0, 1 lea r2, [strideq*3] paddw m7, m6, m6 paddw m1, m2 vpblendd m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 packuswb m1, m1 punpcklbw m0, m1 mova [rsp], m0 .w8_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 movu xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [rsp+r5], 1 lea r5d, [r3+dxq] shr r3d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 vinserti128 m1, [rsp+r5], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r2 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_upsample_loop RET .w8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: ALLOC_STACK -32, 10 lea maxbased, [hq+7] test 
angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w8_main ; filter_strength == 0 popcnt r5d, r5d movu xm2, [tlq] pminub xm1, xm0, [base+z_filter_s+14] vinserti128 m2, [tlq-1], 1 vinserti128 m1, [base+z_filter_s+ 0], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pminub xm0, [base+z_filter_s+22] vinserti128 m0, [base+z_filter_s+ 8], 1 pshufb m6, m2, m1 pmaddubsw m6, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+15] shufps m1, m0, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m1, m6 sub r5d, 3 jnz .w8_3tap ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, ; which also results in an awkward edge case where out[w*2] is ; slightly different from out[max_base_x] when h > w. vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq+14] pshufb m2, m0 pmaddubsw m2, m7 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 mov [rsp+16], r2b paddw m1, m2 .w8_3tap: pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 17 ; w*2 + (filter_strength == 3) cmp hd, 16 cmovns maxbased, r5d mov [tlq+r5], r3b vextracti128 xm0, m1, 1 packuswb xm0, xm1 mova [tlq], xm0 .w8_main: movd xm2, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastw m2, xm2 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 mov r3d, dxd paddw m6, m2, m2 vpblendd m2, m6, 0xf0 .w8_loop: lea r5d, [r3+dxq] shr r3d, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 movu xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [tlq+r5], 1 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w8_loop packuswb xm7, xm7 .w8_end_loop: movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 lea dstq, 
[dstq+strideq*2] sub hd, 2 jg .w8_end_loop .w8_end: RET .w16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(h+15, 31) jmp .w16_main ALIGN function_align .w16: ALLOC_STACK -64, 12 lea maxbased, [hq+15] test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m1, [base+pb_12] vbroadcasti128 m6, [base+z_filter_s+8] vinserti128 m2, m6, [base+z_filter_s], 0 vinserti128 m6, [base+z_filter_s+16], 1 mova xm10, [tlq-1] vinserti128 m10, [tlq+3], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+14] vinserti128 m8, m7, [base+z_filter_s+6], 0 vinserti128 m7, [base+z_filter_s+22], 1 psubw m0, m1 movu xm11, [tlq+12] vinserti128 m11, [tlq+16], 1 pminub m8, m0 pminub m7, m0 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .w16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq+30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 mov [rsp+32], r2b paddw m0, m10 paddw m1, m11 .w16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 33 cmp hd, 32 cmovns maxbased, r5d mov [tlq+r5], r3b packuswb m0, m1 vpermq m0, m0, q3120 mova [tlq], m0 .w16_main: movd xm6, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r3d, dxd psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .w16_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r3+0] movu xm1, [tlq+r3+8] lea r3d, [r5+dxq] shr r5d, 6 ; base1 
vinserti128 m0, [tlq+r5+0], 1 vinserti128 m1, [tlq+r5+8], 1 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w16_loop .w16_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_end_loop .w16_end: RET ALIGN function_align .w32: ALLOC_STACK -96, 15 lea r3d, [hq+31] mov maxbased, 63 cmp hd, 32 cmovs maxbased, r3d test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main vbroadcasti128 m0, [pb_0to15] sub r3d, 29 ; h+2 movu xm13, [tlq+29] ; 32-39 movd xm1, r3d movu xm14, [tlq+37] ; 40-47 sub r3d, 8 ; h-6 vinserti128 m14, [tlq+51], 1 ; 56-63 vpbroadcastb xm1, xm1 mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movd xm2, r3d movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 pminub xm1, xm0 ; clip 32x8 mova m7, [z_filter_s+0] pshufb xm13, xm1 vpbroadcastd m1, [pb_12] vpbroadcastb xm2, xm2 vinserti128 m13, [tlq+43], 1 ; 48-55 vinserti128 m8, m7, [z_filter_s+4], 1 vpblendd m2, m1, 0xf0 vinserti128 m7, [z_filter_s+12], 0 pminub m2, m0 ; clip 32x16 and 32x(32|64) vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m14, m2 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m7 pmaddubsw m12, m9 movzx r3d, byte [tlq+63] movzx r2d, byte [tlq+62] paddw m0, m11 
paddw m2, m12 pshufb m13, m7 pmaddubsw m13, m9 pshufb m14, m7 pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r3d lea r2d, [r2+r3*8+4] ; edge case for 32x64 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+64], r2b mov tlq, rsp mov [tlq+65], r3b mov r3d, 65 cmp hd, 64 cmove maxbased, r3d packuswb m0, m2 packuswb m1, m6 mova [tlq+ 0], m0 mova [tlq+32], m1 .w32_main: movd xm6, dxd vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r5d, dxd psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 64*8 .w32_loop: mov r3d, r5d shr r3d, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu m0, [tlq+r3+0] movu m1, [tlq+r3+8] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq], m0 dec hd jz .w32_end add dstq, strideq cmp r5d, maxbased jb .w32_loop test hb, 1 jz .w32_end_loop mova [dstq], m7 add dstq, strideq dec hd jz .w32_end .w32_end_loop: mova [dstq+strideq*0], m7 mova [dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_end_loop .w32_end: RET ALIGN function_align .w64: ALLOC_STACK -128, 16 lea maxbased, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 mova m7, [z_filter_s+0] vinserti128 m8, m7, [z_filter_s+4], 1 vinserti128 m7, [z_filter_s+12], 0 vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm13, [tlq+29] ; 32-39 vinserti128 m13, [tlq+43], 1 ; 48-55 movu xm14, [tlq+37] ; 40-47 vinserti128 m14, [tlq+51], 1 ; 56-63 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, 
[z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m15, m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m10, [z_filter_k+4*2+12*2] pshufb m11, m15 pmaddubsw m11, m10 pshufb m12, m7 pmaddubsw m12, m10 pshufb m13, m7 pmaddubsw m13, m10 pshufb m14, m7 pmaddubsw m14, m10 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq+ 61] ; 64- 71 vinserti128 m11, [tlq+ 75], 1 ; 80- 87 movu xm12, [tlq+ 69] ; 72- 79 vinserti128 m12, [tlq+ 83], 1 ; 88- 95 movu xm13, [tlq+ 93] ; 96-103 vinserti128 m13, [tlq+107], 1 ; 112-119 movu xm14, [tlq+101] ; 104-111 vinserti128 m14, [tlq+115], 1 ; 120-127 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea r3d, [hq-20] mov tlq, rsp packuswb m0, m2 packuswb m1, m6 vpbroadcastd xm2, [pb_14] vbroadcasti128 m6, [pb_0to15] mova [tlq+32*0], m0 mova [tlq+32*1], m1 movd xm0, r3d vpbroadcastd m1, [pb_12] vpbroadcastb m0, xm0 paddb m0, m2 pminub m0, m6 ; clip 64x16 and 64x32 pshufb m12, m0 pminub m1, m6 ; clip 64x64 pshufb m14, m1 pshufb m0, m11, m7 pmaddubsw m0, m10 pshufb m2, m12, m7 pmaddubsw m2, m10 pshufb m1, m13, m7 pmaddubsw m1, m10 pshufb m6, m14, m7 pmaddubsw m6, m10 pshufb m7, m11, m15 pmaddubsw m7, m9 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m0, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m2, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m1, m7 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m8 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq+32*2], m0 mova [tlq+32*3], m1 .w64_main: movd xm12, dxd vpbroadcastb m7, [tlq+maxbaseq] lea r3d, [dxq-64] shl maxbased, 6 vpbroadcastw m12, xm12 sub 
r3d, maxbased vbroadcasti128 m8, [z_filter_s+2] movd xm6, r3d mov r5d, dxd mova m10, [pb_1to32] vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .w64_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3+ 0] movu m1, [tlq+r3+ 8] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+ 0], m0 movu m0, [tlq+r3+32] movu m1, [tlq+r3+40] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+32], m0 dec hd jz .w64_end add dstq, strideq cmp r5d, maxbased jb .w64_loop .w64_end_loop: mova [dstq+ 0], m7 mova [dstq+32], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea dxq, [dr_intra_derivative-90] movsxd wq, [r9+wq*4] movzx dyd, angleb xor angled, 0x400 mov r8, dxq sub dxq, dyq add wq, r9 add r9, z_filter_t0-ipred_z2_avx2_table mova m2, [tlq-64] mova m0, [tlq-32] mova m1, [tlq] and dyd, ~1 and dxq, ~1 movzx dyd, word [r8+dyq] ; angle - 90 movzx dxd, word [dxq+270] ; 180 - angle vpbroadcastd m13, [base+pw_512] vpbroadcastd m14, [base+pw_62] vpbroadcastd m15, [base+pw_64] mova [rsp+ 0], m2 mova [rsp+32], m0 mova [rsp+64], m1 neg dxd neg dyd jmp wq .w4: vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 vbroadcasti128 m10, [base+z1_shuf_w4] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos movd xm5, dyd mov r8d, (63-4)<<6 mov dyq, -4 pshuflw xm5, xm5, q0000 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 
|| (is_sm && h == 8) vpbroadcastd xm3, [base+pb_4] call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_above: ; w4/w8 pshufb xm2, xm1, [base+z_upsample1-2] pminub xm3, [base+z_filter_s+4] vpbroadcastd xm4, [base+pb_36_m4] vbroadcasti128 m10, [base+pb_0to15] pshufb xm3, xm1, xm3 pmaddubsw xm2, xm4 pmaddubsw xm3, xm4 lea r2d, [r2+dxq+(1<<6)] add dxd, dxd paddw xm2, xm3 pmulhrsw xm2, xm13 sub r8d, 3<<6 paddw m6, m6 packuswb xm2, xm2 punpcklbw xm1, xm2 mova [rsp+gprsize+64], xm1 ret ALIGN function_align .upsample_left: ; h4/h8 mov r3d, hd and r3d, 4 movd xm2, [rsp+gprsize+64] movddup xm0, [rsp+gprsize+56] movd xm1, r3d palignr xm2, xm0, 1 vpbroadcastb xm1, xm1 pshufb xm2, [base+z_filter_s+18] vpbroadcastd xm3, [base+pb_36_m4] pmaxub xm1, [base+z_upsample1-2] pshufb xm1, xm0, xm1 pmaddubsw xm2, xm3 pmaddubsw xm1, xm3 paddw xm5, xm5 add dyq, dyq paddw xm1, xm2 pmulhrsw xm1, xm13 vbroadcasti128 m11, [base+z2_upsample] paddw xm5, xm15 packuswb xm1, xm1 punpcklbw xm0, xm1 mova [rsp+gprsize+48], xm0 ret .w4_no_upsample_above: lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm2, [base+pb_4] pminub xm2, [base+z_filter_s] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm3, xm1, xm2 ; 00 01 12 23 pshufd xm2, xm2, q0321 pmaddubsw xm0, xm3, xm0 pshufb xm2, xm1, xm2 ; 12 23 34 44 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] punpckhqdq xm3, xm3 ; 34 44 44 44 pmaddubsw xm3, xm4 vpbroadcastd xm4, r6m ; max_width packssdw xm4, xm4 paddw xm0, xm2 paddw xm0, 
xm3 pmulhrsw xm0, xm13 packsswb xm4, xm4 psrlq xm1, 8 psubb xm4, [base+pb_1to32] packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movd [rsp+65], xm0 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mov r5d, 10 cmp hd, 16 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 5 : 15 - h movd xm0, r5d vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef vpbroadcastb m0, xm0 pmaxub m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pshufb m0, m2, m0 pmaddubsw m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] pshufb m1, m2, m1 pmaddubsw m1, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m4 pmaddubsw m2, m3 vpbroadcastd xm4, r7m ; max_height packssdw xm4, xm4 paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 packsswb xm4, xm4 vextracti128 xm0, m1, 1 psubb xm4, [base+pb_16to1] packuswb xm0, xm1 vpblendvb xm0, [rsp+48], xm4 mova [rsp+48], xm0 jmp .w4_main .w4_upsample_left: call .upsample_left .w4_main: movd xm0, dxd mova m12, [base+z2_y_shuf_h4] lea r5, [rsp+56] ; left-7 vpbroadcastw m0, xm0 lea r9, [strideq*3] psraw xm1, xm5, 6 pand xm5, xm14 ; frac_y pxor xm2, xm2 paddw m7, m0, m0 psubw xm4, xm2, xm1 ; base_y vpblendd m0, m7, 0xcc mova xm1, xm7 punpcklwd xm4, xm2 paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 psubw xm1, xm15, xm5 ; 64-frac_y psllw xm5, 8 paddw m7, m7 paddw m6, m0 por xm5, xm1 ; 64-frac_y, frac_y vpbroadcastq m5, xm5 .w4_loop: lea r3d, 
[r2+dxq] shr r2d, 6 ; base_x0 vpbroadcastq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vpbroadcastq m2, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps xm0, [rsp+r3] vpblendd m1, m2, 0xc0 pand m2, m14, m6 ; frac_x vpblendd m0, m1, 0xf0 psubw m1, m15, m2 ; 64-frac_x psllw m2, 8 pshufb m0, m10 por m1, m2 ; 64-frac_x, frac_x pmaddubsw m0, m1 cmp r3d, 64 jge .w4_toponly mova m1, m7 ; arbitrary negative value vpgatherdq m3, [r5+xm4], m1 pshufb m1, m3, m11 vpermd m1, m12, m1 pmaddubsw m1, m5 psraw m2, m6, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w4_toponly: pmulhrsw m0, m13 paddw m6, m7 ; xpos += dx add r5, dyq packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w4_loop .w4_leftonly_loop: mova m1, m7 vpgatherdq m2, [r5+xm4], m1 add r5, dyq pshufb m0, m2, m11 vpermd m0, m12, m0 pmaddubsw m0, m5 pmulhrsw m0, m13 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 movd xm5, dyd vbroadcasti128 m10, [base+z_filter_s+2] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos vpbroadcastw xm5, xm5 mov r8d, (63-8)<<6 mov dyq, -4 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm vpbroadcastd xm3, [base+pb_8] movhps [rsp+80], xm1 call .upsample_above sub angled, 53 ; angle - 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt 
r3d, r3d vpbroadcastd xm3, [base+pb_8] pminub xm3, [base+z_filter_s+8] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 pmaddubsw xm0, xm2, xm0 pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] pmaddubsw xm3, xm4 vpbroadcastd xm4, r6m ; max_width packssdw xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 packsswb xm4, xm4 psrldq xm1, 1 psubb xm4, [base+pb_1to32] packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movq [rsp+65], xm0 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] cmp hd, 32 jne .w8_filter_left_h16 movu xm2, [rsp+27] vinserti128 m2, [rsp+35], 1 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m3, [base+z_filter_s+ 8] vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] pmaxub m3, m0 pshufb m3, m2, m3 pmaddubsw m3, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w8_filter_left_top16 .w8_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w8_filter_left_top16: vbroadcasti128 m1, [base+z_filter_s+12] vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vbroadcasti128 m4, [base+z_filter_s+16] vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m2 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 pshufb m0, m2, m0 pmaddubsw m0, m7 vpbroadcastd m7, r7m ; max_height pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 packssdw m7, m7 paddw m1, m0 packsswb m7, m7 paddw m1, m2 pmulhrsw m1, m13 psubb m7, [base+pb_32to1] packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [rsp+32], m7 mova [rsp+32], m3 jmp .w8_main .w8_upsample_left: call .upsample_left .w8_main: movd xm3, dxd lea r5, [rsp+56] ; left-7 pshufd xm1, xm5, q3120 pand xm5, xm14 vpbroadcastw m3, xm3 pxor xm0, xm0 psubw xm2, xm15, xm5 psraw xm1, 6 lea r9, [strideq*3] paddw m7, m3, m3 psubw xm9, xm0, xm1 ; base_y psllw xm5, 8 punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 vpblendd m3, m7, 0xf0 ; xpos0 xpos1 por xm5, xm2 ; 64-frac_y, frac_y punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 paddw m6, m3 vinserti128 m12, m5, xm5, 1 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3], 1 lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu xm1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m1, [rsp+r3], 1 pand m2, m14, m6 paddsw m4, m6, m7 psubw m5, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m5 pmaddubsw m0, m2 pand m2, m14, m4 psubw m5, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m5 pmaddubsw m1, m2 cmp r3d, 64 jge .w8_toponly mova m5, m7 vpgatherdq m3, [r5+xm9], m7 mova m7, m5 vpgatherdq m2, [r5+xm8], m5 pshufb m3, m11 pshufb m2, m11 punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 
vpermq m5, m5, q3120 ; y0 y1 vpermq m2, m2, q3120 ; y2 y3 pmaddubsw m5, m12 pmaddubsw m2, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m5, m6 psraw m3, m4, 15 vpblendvb m1, m2, m3 .w8_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m4, m7 ; xpos += dx add r5, dyq packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 sub hd, 4 jz .w8_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w8_loop .w8_leftonly_loop: mova m0, m7 vpgatherdq m5, [r5+xm9], m7 mova m7, m0 vpgatherdq m3, [r5+xm8], m0 add r5, dyq pshufb m2, m5, m11 pshufb m1, m3, m11 punpckldq m0, m1, m2 punpckhdq m1, m2 vpermq m0, m0, q3120 vpermq m1, m1, q3120 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_leftonly_loop .w8_end: RET .w16: mov r8d, hd test angled, 0x400 jnz .w16_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vbroadcasti128 m6, [tlq+1] mova xm2, [base+z_filter_s] vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de movu xm3, [base+z_filter_s+8] vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff vpblendd m1, m6, 0xf0 vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m1, m2 pshufb m1, m3 pmaddubsw m0, m2, m0 shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff pmaddubsw m2, m4 pmaddubsw m1, m5 vpbroadcastd xm4, r6m ; max_width packssdw xm4, xm4 paddw m0, m2 paddw m0, m1 pmulhrsw m0, m13 packsswb xm4, xm4 vextracti128 xm2, m0, 1 psubb xm4, [base+pb_1to32] packuswb xm0, xm2 vpblendvb xm0, xm6, xm4 movu [rsp+65], xm0 
.w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w16_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] .w16_filter_left: vpbroadcastd m6, r7m ; max_height packssdw m6, m6 packsswb m6, m6 cmp hd, 32 jl .w16_filter_left_h16 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m10, [base+z_filter_s+ 8] vbroadcasti128 m11, [base+z_filter_s+12] vbroadcasti128 m12, [base+z_filter_s+16] je .w16_filter_left_h32 movu m3, [tlq-69] movu m5, [tlq-61] pmaxub m1, m10, m0 pshufb m1, m3, m1 pmaddubsw m1, m7 pshufb m2, m3, m11 pmaddubsw m2, m8 pshufb m3, m12 pmaddubsw m3, m9 paddw m1, m2 pshufb m2, m5, m10 pmaddubsw m2, m7 pshufb m4, m5, m11 pmaddubsw m4, m8 pshufb m5, m12 pmaddubsw m5, m9 paddw m1, m3 vpbroadcastd m3, [base+pb_32] paddb m3, [base+pb_32to1] paddw m2, m4 paddw m2, m5 pmulhrsw m1, m13 pmulhrsw m2, m13 psubb m3, m6, m3 packuswb m1, m2 vpblendvb m1, [tlq-64], m3 mova [rsp], m1 jmp .w16_filter_left_top32 .w16_filter_left_h32: pmaxub m10, m0 .w16_filter_left_top32: movu xm2, [tlq-37] vinserti128 m2, [tlq-29], 1 pshufb m3, m2, m10 pshufb m1, m2, m11 pshufb m2, m12 pmaddubsw m3, m7 pmaddubsw m1, m8 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w16_filter_left_top16 .w16_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w16_filter_left_top16: movu xm2, [tlq-15] vinserti128 m2, [tlq-21], 1 vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m5 pshufb m0, m2, m0 pmaddubsw m0, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 psubb m6, [base+pb_32to1] paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [tlq-32], m6 mova [rsp+32], m3 .w16_main: movd xm1, dyd vbroadcasti128 m10, [base+z_filter_s+2] movd xm7, dxd vbroadcasti128 m11, [base+z2_shuf_h2] vpbroadcastw m1, xm1 vpbroadcastw m7, xm7 mov r7, dstq pmullw m0, m1, [base+z2_ymul] psllw xm1, 4 paddw m6, m7, [base+z2_base_inc] lea r9d, [dxq+(65<<6)] ; xpos movd [rsp+156], xm1 .w16_loop0: mov r2d, r9d mova [rsp+160], m0 lea r5, [rsp+60] ; left-3 mova [rsp+192], m6 pxor m1, m1 psraw m2, m0, 6 pand m0, m14 psubw m9, m1, m2 ; base_y psubw m12, m15, m0 punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 psllw m0, 8 punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 por m12, m0 ; 64-frac_y, frac_y .w16_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] vinserti128 m0, [rsp+r2+8], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm1, [rsp+r3] vinserti128 m1, [rsp+r3+8], 1 pand m2, m14, m6 paddsw m5, m6, m7 psubw m3, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m3 pmaddubsw m0, m2 pand m2, m14, m5 psubw m3, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m3 pmaddubsw m1, m2 cmp r3d, 64 jge .w16_toponly punpckhwd m2, m5, m5 ; mask out unnecessary loads vpgatherdd m4, [r5+m9], m2 punpcklwd m2, m5, m5 vpgatherdd m3, [r5+m8], m2 pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 
i1 j1 k1 l1 punpcklqdq m2, m3, m4 ; y0 punpckhqdq m3, m4 ; y1 pmaddubsw m2, m12 pmaddubsw m3, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m2, m6 psraw m6, m5, 15 vpblendvb m1, m3, m6 .w16_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m5, m7 ; xpos += dx sub r5, 2 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r2d, (63-16)<<6 jge .w16_loop .w16_leftonly_loop: mova m0, m7 vpgatherdd m4, [r5+m9], m7 mova m7, m0 vpgatherdd m3, [r5+m8], m0 sub r5, 2 pshufb m2, m4, m11 pshufb m1, m3, m11 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_leftonly_loop .w16_end: sub r8d, 1<<8 jl .w16_ret vpbroadcastd m0, [rsp+156] paddw m0, [rsp+160] ; base_y += 16*dy paddw m6, m13, [rsp+192] add r7, 16 add r9d, 16<<6 movzx hd, r8b mov dstq, r7 paddw m6, m13 ; base_x += 16*64 jmp .w16_loop0 .w16_ret: RET .w32: mova m2, [tlq+32] lea r8d, [hq+(1<<8)] mova [rsp+96], m2 test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc vinserti128 m1, [tlq+11], 1 movu xm6, [base+z_filter_s+12] vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff movu xm3, [tlq+ 6] vinserti128 m3, [tlq+17], 1 vpbroadcastd m10, r6m ; max_width packssdw m10, m10 packsswb m10, m10 .w32_filter_above: pshufb m0, m1, m5 shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m1, m4 shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m1, m5 pmaddubsw m1, m9 
paddw m0, m2 paddw m0, m1 pshufb m1, m3, m4 pmaddubsw m1, m7 pshufb m2, m3, m5 pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m1, m2 paddw m1, m3 pmulhrsw m0, m13 pmulhrsw m1, m13 psubb m10, [base+pb_1to32] packuswb m0, m1 vpblendvb m0, [tlq+1], m10 movu [rsp+65], m0 jmp .w16_filter_left .w64: mova m2, [tlq+32] mov r3d, [tlq+64] lea r8d, [hq+(3<<8)] mova [rsp+ 96], m2 mov [rsp+128], r3d test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] movu xm6, [base+z_filter_s+ 4] vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc movu xm3, [tlq+30] vinserti128 m3, [tlq+43], 1 movu xm5, [base+z_filter_s+16] vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff pshufb m0, m3, m6 shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m3, m4 shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m0, m2 paddw m0, m3 movu xm2, [tlq+36] vinserti128 m2, [tlq+49], 1 vpbroadcastd m10, r6m ; max_width pshufb m4, m2, m4 pmaddubsw m4, m7 pshufb m3, m2, m6 pmaddubsw m3, m8 pshufb m2, m5 pmaddubsw m2, m9 packssdw m10, m10 paddw m3, m4 paddw m2, m3 vpbroadcastd m3, [base+pb_32] pmulhrsw m0, m13 pmulhrsw m2, m13 packsswb m10, m10 mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+6], 1 psubb m3, m10, m3 psubb m3, [base+pb_1to32] vinserti128 m1, [tlq+13], 1 packuswb m0, m2 vpblendvb m0, [tlq+33], m3 movu xm3, [tlq+ 6] vinserti128 m3, [tlq+19], 1 movu [rsp+97], m0 jmp .w32_filter_above cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase lea r6, [ipred_z3_avx2_table] tzcnt hd, hm movifnidn angled, anglem lea r7, [dr_intra_derivative+45*2-1] dec tlq movsxd hq, [r6+hq*4] sub angled, 180 add hq, r6 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e 
movzx dyd, word [r7+dyq] vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] mov org_wd, wd jmp hq .h4: lea r7, [strideq*3] cmp angleb, 40 jae .h4_no_upsample lea r4d, [angleq-1024] sar r4d, 7 add r4d, wd jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) ALLOC_STACK -32, 9 movu xm8, [tlq-7] pshufb xm0, xm8, [z_upsample1-4] vpbroadcastb xm2, xm8 pshufb xm1, xm8, [z_filter_s+2] mova [rsp+16], xm2 ; top[max_base_y] vpbroadcastd xm2, [pb_36_m4] add dyd, dyd pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 movd xm7, dyd mov r2d, dyd vpbroadcastw m7, xm7 paddw xm1, xm0 pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 punpcklbw xm1, xm8 mova xm8, [z_transpose4] psllw m7, 2 pshufb xm1, [pb_15to0] mova [rsp], xm1 .h4_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 vpbroadcastq m1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 vpbroadcastq m2, [rsp+r4] lea r4d, [r2+dyq] shr r2d, 6 movq xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 movhps xm0, [rsp+r4] vpblendd m1, m2, 0xc0 pand m2, m4, m6 vpblendd m0, m1, 0xf0 psubw m1, m5, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m6, m7 pmulhrsw m0, m3 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm8 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 add dstq, 4 sub wd, 4 jg .h4_upsample_loop RET ALIGN function_align .filter_strength: ; h4/h8/h16 %define base r4-z_filter_t0 lea r4, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r4+angleq*8] pcmpgtb m1, m2 pmovmskb r5d, m1 ret .h4_no_upsample: ALLOC_STACK -16, 12 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_7] vbroadcasti128 m2, 
[tlq-14] pmaxub m1, m7, [base+z_filter_s-4] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pmaxub m7, [base+z_filter_s+4] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r4d, 9 lea tlq, [rsp+15] cmp wd, 4 cmovne maxbased, r4d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [rsp], xm0 .h4_main: movd xm6, dyd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 mov r4, tlq sub tlq, 4 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] ; ypos movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf_w4] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 ; top[max_base_y] paddw m10, m6, m6 psubw m9, m0 ; max_base_y vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 paddw m10, m10 mova xm11, [z_transpose4] .h4_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 vpbroadcastq m1, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 ; base2 movq xm0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_y pmulhrsw m0, m3 paddw m6, m10 ; ypos += dy vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm11 ; transpose movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 sub wd, 4 jz .h4_end add dstq, 4 cmp r4d, maxbased jg .h4_loop packuswb xm7, xm7 .h4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r7 ], xm7 add dstq, 4 sub wd, 4 jg .h4_end_loop .h4_end: RET ALIGN function_align .h8: lea r4d, [angleq+216] mov 
r4b, wb cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 ALLOC_STACK -32, 8 and r4d, 4 mova xm0, [tlq-15] vinserti128 m0, [tlq- 9], 1 movd xm1, r4d movu xm2, [z_filter_s+2] vinserti128 m2, [z_filter_s+6], 1 vpbroadcastb xm1, xm1 ; w & 4 vpbroadcastd m7, [pb_36_m4] pmaxub xm1, [z_upsample1-4] ; clip 4x8 vinserti128 m1, [z_upsample1], 1 add dyd, dyd pshufb m1, m0, m1 pshufb m2, m0, m2 vinserti128 m0, [tlq-7], 1 movd xm6, dyd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r2d, dyd lea r5, [strideq*3] paddw m7, m6, m6 paddw m1, m2 vpblendd m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 vbroadcasti128 m2, [pb_15to0] packuswb m1, m1 punpcklbw m1, m0 pshufb m1, m2 vextracti128 [rsp+ 0], m1, 1 mova [rsp+16], xm1 .h8_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 ; base0 movu xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base1 vinserti128 m0, [rsp+r4], 1 lea r4d, [r2+dyq] shr r2d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base3 vinserti128 m1, [rsp+r4], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 lea r4, [dstq+strideq*4] psllw m1, 8 por m0, m1 vextracti128 xm1, m0, 1 punpcklbw xm2, xm0, xm1 punpckhbw xm0, xm1 movd [dstq+strideq*0], xm2 pextrd [dstq+strideq*1], xm2, 1 pextrd [dstq+strideq*2], xm2, 2 pextrd [dstq+r5 ], xm2, 3 movd [r4 +strideq*0], xm0 pextrd [r4 +strideq*1], xm0, 1 pextrd [r4 +strideq*2], xm0, 2 pextrd [r4 +r5 ], xm0, 3 add dstq, 4 sub wd, 4 jg .h8_upsample_loop RET .h8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: ALLOC_STACK -32, 10 lea maxbased, [wq+7] test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h8_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd xm6, [base+pb_15] pcmpeqb xm1, xm1 psubusb xm6, xm0 psubb xm6, 
xm1 ; w == 4 ? 5 : 1 movu xm2, [tlq-16] pmaxub xm1, xm6, [base+z_filter_s] vinserti128 m2, [tlq-14], 1 vinserti128 m1, [base+z_filter_s+12], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pmaxub xm6, [base+z_filter_s+ 8] vinserti128 m6, [base+z_filter_s+20], 1 pshufb m0, m2, m1 pmaddubsw m0, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-15] shufps m1, m6, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m0, m1 sub r5d, 3 jnz .h8_3tap vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq-14] pshufb m2, m6 pmaddubsw m2, m7 sub r2d, r4d lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+15], r2b paddw m0, m2 .h8_3tap: pmulhrsw m0, m3 sar r5d, 1 lea tlq, [rsp+31] add r5d, 17 cmp wd, 16 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq-15], xm0 .h8_main: movd xm2, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m2, xm2 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 paddw m6, m2, m2 vpblendd m2, m6, 0x0f .h8_loop: lea r5, [r4+dyq] sar r4, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 vbroadcasti128 m0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5], 0 sub rsp, 8*2 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 psllw xm0, 8 por xm0, xm1 ; interleave rows (partial transpose) mova [rsp], xm0 sub wd, 2 jz .h8_transpose cmp r4d, maxbased jg .h8_loop packuswb xm0, xm7, xm7 .h8_end_loop: sub rsp, 8*2 mova [rsp], xm0 sub wd, 2 jg .h8_end_loop .h8_transpose: mova xm2, [rsp+16*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 lea r6, [dstq+strideq*4] jge .h8_w8 add rsp, 16*2 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 
pextrd [dstq+r2 ], xm1, 3 movd [r6 +strideq*0], xm2 pextrd [r6 +strideq*1], xm2, 1 pextrd [r6 +strideq*2], xm2, 2 pextrd [r6 +r2 ], xm2, 3 jmp .h8_end .h8_w8_loop: mova xm0, [rsp+16*0] mova xm2, [rsp+16*1] punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 .h8_w8: ; w8/w16/w32 mova xm0, [rsp+16*2] mova xm4, [rsp+16*3] add rsp, 16*4 punpcklwd xm3, xm4, xm0 punpckhwd xm4, xm0 punpckldq xm0, xm3, xm1 punpckhdq xm3, xm1 punpckldq xm1, xm4, xm2 punpckhdq xm4, xm2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm3 movhps [dstq+r2 ], xm3 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 movq [r6 +strideq*2], xm4 movhps [r6 +r2 ], xm4 sub dstq, 8 sub r6, 8 sub org_wd, 8 jge .h8_w8_loop .h8_end: RET .h16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(w+15, 31) jmp .h16_main ALIGN function_align .h16: ALLOC_STACK -64, 12 lea maxbased, [wq+15] test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m11, [base+pb_27] vpbroadcastd m1, [base+pb_1] vbroadcasti128 m6, [base+z_filter_s+12] vinserti128 m2, m6, [base+z_filter_s+4], 0 vinserti128 m6, [base+z_filter_s+20], 1 movu xm10, [tlq-18] vinserti128 m10, [tlq-14], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+8] vinserti128 m8, m7, [base+z_filter_s+0], 0 vinserti128 m7, [base+z_filter_s+16], 1 psubusb m11, m0 por m1, m11 movu xm11, [tlq-32] vinserti128 m11, [tlq-28], 1 pmaxub m8, m1 pmaxub m7, m1 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .h16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq-30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r4d lea r2d, [r2+r4*8+4] 
shr r2d, 3 mov [rsp+31], r2b paddw m0, m10 paddw m1, m11 .h16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 lea tlq, [rsp+63] add r5d, 33 cmp wd, 32 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b packuswb m0, m1 vpermq m0, m0, q2031 mova [tlq-31], m0 .h16_main: movd xm6, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .h16_loop: lea r5, [r4+dyq] sar r4, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r4-0] movu xm1, [tlq+r4-8] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5-0], 1 vinserti128 m1, [tlq+r5-8], 1 sub rsp, 32 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 vpermq m0, m0, q3120 mova [rsp], m0 sub wd, 2 jz .h16_transpose cmp r4d, maxbased jg .h16_loop mova m0, m7 .h16_end_loop: sub rsp, 32 mova [rsp], m7 sub wd, 2 jg .h16_end_loop .h16_transpose: mova m2, [rsp+32*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklbw m1, m2, m0 punpckhbw m2, m0 lea r3, [strideq*5] punpcklbw m0, m1, m2 punpckhbw m1, m2 lea r4, [strideq+r2*2] ; stride*7 jge .h16_w8 add rsp, 32*2 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 vextracti128 xm0, m0, 1 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd [dstq+r4 ], xm1, 3 lea dstq, [dstq+strideq*8] vextracti128 xm1, m1, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd 
[dstq+r4 ], xm1, 3 jmp .h16_end .h16_w8_loop: mova m0, [rsp+32*0] mova m2, [rsp+32*1] punpcklbw m1, m2, m0 punpckhbw m2, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 .h16_w8: mova m2, [rsp+32*2] mova m4, [rsp+32*3] lea r6, [dstq+strideq*8] add rsp, 32*4 punpcklbw m3, m4, m2 punpckhbw m4, m2 punpcklbw m2, m3, m4 punpckhbw m3, m4 punpckldq m4, m2, m0 punpckhdq m2, m0 punpckldq m0, m3, m1 punpckhdq m3, m1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 vextracti128 xm4, m4, 1 movq [dstq+strideq*2], xm2 movhps [dstq+r2 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*4], xm0 movhps [dstq+r3 ], xm0 vextracti128 xm0, m0, 1 movq [dstq+r2*2 ], xm3 movhps [dstq+r4 ], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*0], xm4 movhps [r6+strideq*1], xm4 movq [r6+strideq*2], xm2 movhps [r6+r2 ], xm2 movq [r6+strideq*4], xm0 movhps [r6+r3 ], xm0 movq [r6+r2*2 ], xm3 movhps [r6+r4 ], xm3 sub dstq, 8 sub org_wd, 8 jge .h16_w8_loop .h16_end: RET ALIGN function_align .h32: ALLOC_STACK -96, 15 lea maxbased, [wq+31] and maxbased, 31 or maxbased, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main vbroadcasti128 m0, [pb_0to15] mov r4d, 21 mov r5d, 3 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 sub r4d, wd ; 21-w cmovns r5d, r4d movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 sub r4d, 8 ; 13-w movd xm1, r5d movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movd xm2, r4d vpbroadcastb m1, xm1 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 vpbroadcastb m2, xm2 pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 movu m7, [z_filter_s+4] pshufb m11, m1 vinserti128 m8, m7, [z_filter_s+8], 1 vinserti128 m7, [z_filter_s+16], 0 pmaxsb m2, m0 ; clip 8x32 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m12, m2 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 shufps m8, m7, q1021 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] 
pshufb m10, m11, m8 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 shufps m8, m7, q2121 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 movzx r4d, byte [tlq-63] movzx r2d, byte [tlq-62] paddw m0, m11 paddw m2, m12 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m7 pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r4d lea r2d, [r2+r4*8+4] ; edge case for 64x32 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+31], r2b lea tlq, [rsp+95] mov [tlq-65], r4b mov r4d, 65 cmp wd, 64 cmove maxbased, r4d packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h32_main: movd xm6, dyd mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 64*8 .h32_loop: mov r5, r4 sar r5, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r5- 0] vinserti128 m0, [tlq+r5-16], 1 movu xm1, [tlq+r5- 8] vinserti128 m1, [tlq+r5-24], 1 sub rsp, 32 add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [rsp], m0 dec wd jz .h32_transpose cmp r4d, maxbased jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp], m7 dec wd jg .h32_end_loop .h32_transpose: lea dstq, [dstq+org_wq-8] lea r2, [strideq*3] lea r3, [strideq*5] lea r4, [strideq+r2*2] ; stride*7 .h32_w8_loop: mova m7, [rsp+32*0] mova m6, [rsp+32*1] mova m5, [rsp+32*2] mova m4, [rsp+32*3] mova m3, [rsp+32*4] mova m2, [rsp+32*5] mova m1, [rsp+32*6] mova m0, [rsp+32*7] lea r6, [dstq+strideq*8] add rsp, 32*8 
punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpckldq m6, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m3 punpckhdq m8, m3 punpckldq m3, m1, m5 punpckhdq m1, m5 punpckldq m5, m0, m4 punpckhdq m0, m4 movq [dstq+strideq*0], xm6 movhps [dstq+strideq*1], xm6 vextracti128 xm6, m6, 1 movq [dstq+strideq*2], xm7 movhps [dstq+r2 ], xm7 vextracti128 xm7, m7, 1 movq [dstq+strideq*4], xm2 movhps [dstq+r3 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+r2*2 ], xm8 movhps [dstq+r4 ], xm8 vextracti128 xm8, m8, 1 movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 vextracti128 xm1, m1, 1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 vextracti128 xm5, m5, 1 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 lea r6, [r6+strideq*8] vextracti128 xm0, m0, 1 movq [r6+strideq*0], xm6 movhps [r6+strideq*1], xm6 movq [r6+strideq*2], xm7 movhps [r6+r2 ], xm7 movq [r6+strideq*4], xm2 movhps [r6+r3 ], xm2 movq [r6+r2*2 ], xm8 movhps [r6+r4 ], xm8 lea r6, [r6+strideq*8] movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 sub dstq, 8 sub org_wd, 8 jg .h32_w8_loop RET ALIGN function_align .h64: ALLOC_STACK -128, 16 lea maxbased, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mov r4d, 21 vpbroadcastb xm11, [tlq-127] vpblendd xm11, [tlq-130], 0x0e ; 120-127 sub r4d, wd ; 21-w mov r5d, 3 vinserti128 m11, [tlq-116], 1 ; 104-111 movu m7, [z_filter_s+4] cmp wd, 32 cmove r4d, r5d vinserti128 m8, m7, [z_filter_s+8], 1 vbroadcasti128 m6, [pb_0to15] movd xm1, r4d vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm12, [tlq-122] ; 112-119 vinserti128 
m12, [tlq-108], 1 ; 96-103 vpbroadcastb m1, xm1 movu xm13, [tlq- 98] ; 88- 95 vinserti128 m13, [tlq- 84], 1 ; 72- 79 movu xm14, [tlq- 90] ; 80- 87 vinserti128 m14, [tlq- 76], 1 ; 64- 71 vinserti128 m7, [z_filter_s+16], 0 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pmaxsb m1, m6 ; clip (16|32)x64 pshufb m13, m1 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] shufps m15, m8, m7, q1021 pshufb m10, m11, m15 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] shufps m10, m8, m7, q2132 pshufb m11, m10 pmaddubsw m11, m9 pshufb m12, m10 pmaddubsw m12, m9 pshufb m13, m10 pmaddubsw m13, m9 pshufb m14, m10 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea tlq, [rsp+127] packuswb m0, m2 packuswb m1, m6 mova [tlq-127], m0 mova [tlq- 95], m1 pshufb m0, m11, m10 pmaddubsw m0, m9 pshufb m2, m12, m10 pmaddubsw m2, m9 pshufb m1, m13, m10 pmaddubsw m1, m9 pshufb m6, m14, m7 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m7, m11, m15 pmaddubsw m7, m9 paddw m0, m7 pshufb m7, m12, m15 pmaddubsw m7, m9 paddw m2, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m1, m7 pshufb m7, m14, m10 pmaddubsw m7, m9 paddw m6, m7 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m15 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, 
m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h64_main: movd xm12, dyd neg maxbaseq vbroadcasti128 m8, [z3_shuf] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m12, xm12 lea r5d, [dyq+maxbaseq-64] neg dyq or maxbased, 63 lea r4, [dyq+63] movd xm6, r5d mova xm10, [pb_1to32+16] vinserti128 m10, [pb_1to32], 1 vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .h64_loop: mov r5, r4 sar r5, 6 movu m0, [tlq+r5-24] movu m1, [tlq+r5-32] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 sub rsp, 64 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp+32], m0 movu m0, [tlq+r5-56] movu m1, [tlq+r5-64] add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp], m0 dec wd jz .h64_transpose cmp r4d, maxbased jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+32], m7 mova [rsp+ 0], m7 dec wd jg .h64_end_loop .h64_transpose: lea r2, [strideq*3] lea r3, [strideq*5] imul r5, strideq, -8 lea dstq, [dstq+org_wq-16] lea r4, [strideq+r2*2] ; stride*7 .h64_transpose_loop0: lea r6, [rsp+16*3] .h64_transpose_loop: mova xm0, [r6+64*15] vinserti128 m0, [r6+64* 7], 1 mova xm1, [r6+64*14] vinserti128 m1, [r6+64* 6], 1 mova xm2, [r6+64*13] vinserti128 m2, [r6+64* 5], 1 mova xm3, [r6+64*12] vinserti128 m3, [r6+64* 4], 1 mova xm4, [r6+64*11] vinserti128 m4, [r6+64* 3], 1 mova xm5, [r6+64*10] vinserti128 m5, [r6+64* 2], 1 mova xm6, [r6+64* 9] vinserti128 m6, [r6+64* 1], 1 mova xm7, [r6+64* 8] vinserti128 m7, [r6+64* 0], 1 sub r6, 16 punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd 
m2, m3, m5 ; operands of the punpcklwd that ends the preceding line
    punpckhwd m3, m5
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    punpckldq m6, m7, m2
    punpckhdq m7, m2
    punpckldq m2, m8, m3
    punpckhdq m8, m3
    punpckldq m3, m1, m5
    punpckhdq m1, m5
    punpckldq m5, m0, m4
    punpckhdq m0, m4
    ; q3120 swaps the two middle qwords of each register before the stores
    vpermq m6, m6, q3120
    vpermq m7, m7, q3120
    vpermq m2, m2, q3120
    vpermq m8, m8, q3120
    vpermq m3, m3, q3120
    vpermq m1, m1, q3120
    vpermq m5, m5, q3120
    vpermq m0, m0, q3120
    ; each register supplies two 16-byte rows: low lane, then high lane
    mova [dstq+strideq*0], xm6
    vextracti128 [dstq+strideq*1], m6, 1
    mova [dstq+strideq*2], xm7
    vextracti128 [dstq+r2 ], m7, 1
    mova [dstq+strideq*4], xm2
    vextracti128 [dstq+r3 ], m2, 1
    mova [dstq+r2*2 ], xm8
    vextracti128 [dstq+r4 ], m8, 1
    sub dstq, r5
    mova [dstq+strideq*0], xm3
    vextracti128 [dstq+strideq*1], m3, 1
    mova [dstq+strideq*2], xm1
    vextracti128 [dstq+r2 ], m1, 1
    mova [dstq+strideq*4], xm5
    vextracti128 [dstq+r3 ], m5, 1
    mova [dstq+r2*2 ], xm0
    vextracti128 [dstq+r4 ], m0, 1
    sub dstq, r5
    cmp r6, rsp
    jae .h64_transpose_loop
    add rsp, 64*16 ; release the 16 64-byte stack rows just consumed
    lea dstq, [dstq+r5*8-16]
    sub org_wd, 16
    jg .h64_transpose_loop0
.h64_end:
    RET

; FILTER_XMM dst, src, tmp, shuf
; 128-bit filter step. Register arguments are given as register numbers;
; shuf may be a register number or a memory operand (selected by %ifnum).
;   1. xm<src> is shuffled in place by <shuf>.
;   2. Each of its four dwords is broadcast in turn (pshufd q0000..q3333) and
;      multiplied/pair-summed (pmaddubsw) against xm2, xm3, xm4 and xm5.
;   3. The four products and xm1 are summed, shifted right arithmetically by 4
;      and packed to unsigned bytes with saturation.
; Reads xm1-xm5, which the macro itself never writes (the caller must have
; loaded them). Overwrites xm<dst>, xm<src> and xm<tmp>.
%macro FILTER_XMM 4 ; dst, src, tmp, shuf
%ifnum %4
    pshufb xm%2, xm%4
%else
    pshufb xm%2, %4
%endif
    pshufd xm%1, xm%2, q0000 ; p0 p1
    pmaddubsw xm%1, xm2
    pshufd xm%3, xm%2, q1111 ; p2 p3
    pmaddubsw xm%3, xm3
    paddw xm%1, xm1
    paddw xm%1, xm%3
    pshufd xm%3, xm%2, q2222 ; p4 p5
    pmaddubsw xm%3, xm4
    paddw xm%1, xm%3
    pshufd xm%3, xm%2, q3333 ; p6 __
    pmaddubsw xm%3, xm5
    paddw xm%1, xm%3
    psraw xm%1, 4
    packuswb xm%1, xm%1
%endmacro

; FILTER_YMM dst, src, tmp, shuf
; 256-bit counterpart of FILTER_XMM; all four arguments are register numbers.
; The arithmetic is the same per 128-bit lane. Before packing, tmp receives a
; lane-swapped copy of the result (vperm2i128 0x01), so after packuswb each
; lane of m<dst> holds its own packed bytes in the low half and the other
; lane's packed bytes in the high half.
; Reads m1-m5; overwrites m<dst>, m<src> and m<tmp>.
%macro FILTER_YMM 4 ; dst, src, tmp, shuf
    pshufb m%2, m%4
    pshufd m%1, m%2, q0000
    pmaddubsw m%1, m2
    pshufd m%3, m%2, q1111
    pmaddubsw m%3, m3
    paddw m%1, m1
    paddw m%1, m%3
    pshufd m%3, m%2, q2222
    pmaddubsw m%3, m4
    paddw m%1, m%3
    pshufd m%3, m%2, q3333
    pmaddubsw m%3, m5
    paddw m%1, m%3
    psraw m%1, 4
    vperm2i128 m%3, m%1, m%1, 0x01
    packuswb m%1, m%3
%endmacro

; The ipred_filter SIMD processes 4x2 blocks in the following order which
; increases parallelism compared to doing things row by row. One redundant
; block is calculated for w8 and w16, two for w32.
;     w4     w8       w16             w32
;     1     1 2     1 2 3 5     1 2 3 5 b c d f
;     2     2 3     2 4 5 7     2 4 5 7 c e f h
;     3     3 4     4 6 7 9     4 6 7 9 e g h j
; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
;           5       8           8       i

; ipred_filter_8bpc(dst, stride, tl, w, h, filter)
; Setup: selects a 64-byte tap set at filter_intra_taps + filter*64 and
; broadcasts its four 16-byte rows into m2-m5, loads pw_8 into m1 (the term
; FILTER_XMM/FILTER_YMM add before their >>4), then jumps to a width-specific
; label through ipred_filter_avx2_table indexed by tzcnt(w).
; Every width path emits two output rows per loop iteration.
cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter
%define base r6-ipred_filter_avx2_table
    lea r6, [filter_intra_taps]
    tzcnt wd, wm
%ifidn filterd, filterm
    movzx filterd, filterb
%else
    movzx filterd, byte filterm
%endif
    shl filterd, 6 ; 64 bytes of taps per filter index
    WIN64_SPILL_XMM 9, 15
    add filterq, r6
    lea r6, [ipred_filter_avx2_table]
    movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4
    movsxd wq, [r6+wq*4]
    vpbroadcastd m1, [base+pw_8]
    vbroadcasti128 m2, [filterq+16*0]
    vbroadcasti128 m3, [filterq+16*1]
    vbroadcasti128 m4, [filterq+16*2]
    vbroadcasti128 m5, [filterq+16*3]
    add wq, r6
    mov hd, hm
    jmp wq
; w4: one 4x2 block per iteration, stored as two 4-byte rows.
.w4:
    mova xm8, [base+filter_shuf2]
    sub tlq, 3
    sub tlq, hq
    jmp .w4_loop_start
.w4_loop:
    ; next input = previous output (xm6) with dword 0 replaced from the edge
    pinsrd xm0, xm6, [tlq+hq], 0
    lea dstq, [dstq+strideq*2]
.w4_loop_start:
    FILTER_XMM 6, 0, 7, 8
    movd [dstq+strideq*0], xm6
    pextrd [dstq+strideq*1], xm6, 1
    sub hd, 2
    jg .w4_loop
    RET
ALIGN function_align
; w8: the first block is filtered with FILTER_XMM before the loop; each
; iteration then runs .main (FILTER_YMM) and stores two 8-byte rows.
.w8:
    WIN64_PUSH_XMM 10
    mova m8, [base+filter_shuf1]
    FILTER_XMM 7, 0, 6, [base+filter_shuf2]
    vpbroadcastd m0, [tlq+4]
    vpbroadcastd m6, [tlq+5]
    sub tlq, 4
    sub tlq, hq
    vpbroadcastq m7, xm7
    vpblendd m7, m6, 0x20
.w8_loop:
    vpbroadcastd xm6, [tlq+hq]
    palignr m6, m0, 12
    vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm6, xm7
    call .main
    vpblendd xm6, xm7, 0x0c
    pshufd xm6, xm6, q3120
    movq [dstq+strideq*0], xm6
    movhps [dstq+strideq*1], xm6
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8_loop
    RET
ALIGN function_align
; w16: thin wrapper around .w16_main. hd is reduced by 2 up front because
; .w16_main stores the final two rows after its loop exits.
.w16:
    sub hd, 2
    call .w16_main
%if WIN64
    jmp .end
%else
    RET
%endif
; .w16_main: 16-pixel-wide worker, entered with `call` from .w16 and .w32
; and left with a plain `ret`.
.w16_main:
    ; The spills are into the callers stack frame
    %assign stack_size stack_size + gprsize
    WIN64_PUSH_XMM 15, 9
    %assign stack_size stack_size - gprsize
    FILTER_XMM 12, 0, 7, [base+filter_shuf2]
    vpbroadcastd m0, [tlq+5]
    vpblendd m0, [tlq-12], 0x14
    mova m8, [base+filter_shuf1]
    vpbroadcastq m7, xm12
    vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                          ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    call .main ; c0 d0 a1 b1 a1 b1 c0 d0
    movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    vinserti128 m14, m8, [base+filter_shuf3], 0
    vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
    FILTER_XMM 6, 9, 10, 14
    vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
    vpbroadcastd m9, [tlq+13]
    vpbroadcastd m10, [tlq+12]
    psrld m11, m8, 4 ; m11 is the vpermd index vector used in the loop
    vpblendd m6, m9, 0x20 ; top
    sub tlq, 6
    sub tlq, hq
.w16_loop:
    vpbroadcastd xm9, [tlq+hq]
    palignr m9, m0, 12
    vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _
                              ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm13, xm7
    call .main ; e0 f0 c1 d1 c1 d1 e0 f0
    vpblendd m9, m12, m10, 0xf0
    vpblendd m12, m6, 0xc0
    pshufd m9, m9, q3333
    vpblendd m9, m6, 0xee
    vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                               ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
    vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
    vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
    vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
    mova [dstq+strideq*0], xm9
    vextracti128 [dstq+strideq*1], m9, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16_loop
    ; tail: one more 128-bit filter step, then store the last two rows
    vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
    pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
    vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
    shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
    shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm6
    ret
ALIGN function_align
; w32: the left 16 columns are produced by .w16_main; dst then moves to the
; saved dst+16 and the right 16 columns are produced below. Their left
; neighbours are loaded from dst at negative column offsets, i.e. from the
; output .w16_main has just written.
.w32:
    sub hd, 2
    lea r3, [dstq+16] ; start of the right half
    lea r5d, [hq-2] ; row counter for the right-half loop
    call .w16_main
    add tlq, r5
    mov dstq, r3
    lea r3, [strideq-4]
    lea r4, [r3+strideq*2]
    movq xm0, [tlq+21]
    pinsrd xm0, [dstq-4], 2
    pinsrd xm0, [dstq+r3*1], 3
    FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0
    movq xm7, [dstq+r3*2]
    pinsrd xm7, [dstq+r4], 2
    palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6
    vpbroadcastd m0, [tlq+28]
    vpbroadcastd m9, [tlq+29]
    vbroadcasti128 m8, [base+filter_shuf1+16]
    vpblendd m0, m9, 0x20
    vpblendd m0, m7, 0x0f
    vpbroadcastq m7, xm12
    vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    call .main ; c0 d0 a1 b1 a1 b1 c0 d0
    add r3, 2
    lea r4, [r4+strideq*2]
    movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
    vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1
    FILTER_XMM 6, 9, 10, 14
    vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2
    vpbroadcastd m9, [tlq+37]
    vpbroadcastd m10, [tlq+36]
    vpblendd m6, m9, 0x20 ; top
.w32_loop:
    movq xm9, [dstq+r3*4]
    pinsrd xm9, [dstq+r4], 2
; entered directly (skipping the two loads above) when r5d reaches exactly 0
.w32_loop_last:
    palignr m9, m0, 12
    vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    mova xm13, xm7 ; c0 d0
    call .main ; e0 f0 c1 d1 c1 d1 e0 f0
    vpblendd m9, m12, m10, 0xf0
    vpblendd m12, m6, 0xc0
    pshufd m9, m9, q3333
    vpblendd m9, m6, 0xee
    vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
                               ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2
    vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2
    vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3
    vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1
    mova [dstq+strideq*0], xm9
    vextracti128 [dstq+strideq*1], m9, 1
    lea dstq, [dstq+strideq*2]
    sub r5d, 2
    jg .w32_loop
    jz .w32_loop_last
    vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4
    pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6
    FILTER_XMM 0, 7, 9, [base+filter_shuf1+16]
    vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3
    shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3
    shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm6
.end:
    RET
ALIGN function_align
; .main: shared 256-bit filter step, m7 = filter(m0) using shuffle m8, tmp m9
.main:
    FILTER_YMM 7, 0, 9, 8
    ret

; Temporary register t0 used by the CfL functions below: register 5 on WIN64,
; register 7 otherwise.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; IPRED_CFL n
; Converts the 16-bit AC values in m<n> to unpacked prediction values:
;   m<n> = m0 + sign(ac * m1) * pmulhrsw(|ac|, m2)
; Callers in this file set m0 = DC, m1 = alpha and m2 = |alpha| << 9, which
; makes the pmulhrsw term equal to (|ac| * |alpha| + 32) >> 6.
; Overwrites m3.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw m3, m%1, m1
    pabsw m%1, m%1
    pmulhrsw m%1, m2
    psignw m%1, m3
    paddw m%1, m0
%endmacro

; ipred_cfl_top_8bpc(dst, stride, tl, w, h, ac, alpha)
; DC is computed from the pixels starting at tl+1 only. Loads 32 bytes of
; them, forms negated pair sums (pmaddubsw with all-ones bytes, i.e. -1) and
; sets xm3 = 0x8000 >> tzcnt(w) as the pmulhrsw multiplier that performs the
; rounded division by w. It then jumps through ipred_cfl_left_avx2_table
; (indexed by tzcnt(w)) with wq holding the ipred_cfl_splat_avx2_table target.
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    lea t0, [ipred_cfl_left_avx2_table]
    tzcnt wd, wm
    inc tlq
    movu m0, [tlq]
    movifnidn hd, hm
    mov r6d, 0x8000
    shrx r6d, r6d, wd
    movd xm3, r6d
    movsxd r6, [t0+wq*4]
    pcmpeqd m2, m2
    pmaddubsw m0, m2
    add r6, t0
    add t0,
ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table ; operand of the `add t0,` on the preceding line
    movsxd wq, [t0+wq*4]
    add wq, t0
    movifnidn acq, acmp
    jmp r6

; ipred_cfl_left_8bpc(dst, stride, tl, w, h, ac, alpha)
; DC is computed from the h pixels ending just before tl (tl-h .. tl-1).
; xm3 = 0x8000 >> tzcnt(h) is the pmulhrsw multiplier for the rounded division
; by h. r6 comes from ipred_cfl_left_avx2_table[tzcnt(h)] and wq from
; ipred_cfl_splat_avx2_table[tzcnt(w)].
; NOTE(review): both tables are defined outside this chunk; presumably they
; point at the .h4-.h32 labels below and the .s4-.s32 labels of ipred_cfl_8bpc.
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    mov hd, hm ; zero upper half
    tzcnt r6d, hd
    sub tlq, hq
    tzcnt wd, wm
    movu m0, [tlq]
    mov t0d, 0x8000
    shrx t0d, t0d, r6d
    movd xm3, t0d
    lea t0, [ipred_cfl_left_avx2_table]
    movsxd r6, [t0+r6*4]
    pcmpeqd m2, m2 ; all-ones: -1 in every byte and word
    pmaddubsw m0, m2 ; negated sums of adjacent pixel pairs
    add r6, t0
    add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table
    movsxd wq, [t0+wq*4]
    add wq, t0
    movifnidn acq, acmp
    jmp r6
; Horizontal reduction ladder: each label folds the upper half of the
; remaining partial sums into the lower half and falls through to the next.
.h32:
    vextracti128 xm1, m0, 1
    paddw xm0, xm1
.h16:
    punpckhqdq xm1, xm0, xm0
    paddw xm0, xm1
.h8:
    psrlq xm1, xm0, 32
    paddw xm0, xm1
.h4:
    pmaddwd xm0, xm2 ; multiply by -1 and add the last pair: positive total
    pmulhrsw xm0, xm3 ; rounded division by the edge length
    vpbroadcastw m0, xm0
    jmp wq

; ipred_cfl_8bpc(dst, stride, tl, w, h, ac, alpha)
; DC is computed from both edges: h pixels before tl and w pixels from tl+1.
;   xm4 = (w+h) >> 1       rounding term
;   xm5 = tzcnt(w+h)       power-of-two part of the division
;   m3  = all-ones         -1 multiplier for pmaddubsw/pmaddwd
; Control flow: jmp r6 -> .hN (load/sum left edge) -> jmp wq -> .wN (add top
; edge, finish the DC) -> falls into .sN (apply alpha*ac and store).
; When w != h, w+h is 3 or 5 times a power of two; the remaining factor is
; removed with pmulhuw by 0x5556 (~2^16/3) or 0x3334 (~2^16/5).
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn hd, hm
    movifnidn wd, wm
    tzcnt r6d, hd
    lea t0d, [wq+hq]
    movd xm4, t0d
    tzcnt t0d, t0d
    movd xm5, t0d
    lea t0, [ipred_cfl_avx2_table]
    tzcnt wd, wd
    movsxd r6, [t0+r6*4]
    movsxd wq, [t0+wq*4+4*4]
    pcmpeqd m3, m3
    psrlw xm4, 1
    add r6, t0
    add wq, t0
    movifnidn acq, acmp
    jmp r6
.h4:
    movd xm0, [tlq-4]
    pmaddubsw xm0, xm3
    jmp wq
.w4:
    movd xm1, [tlq+1]
    pmaddubsw xm1, xm3
    psubw xm0, xm4 ; sums are negated here, so subtracting adds the rounding term
    paddw xm0, xm1
    pmaddwd xm0, xm3
    cmp hd, 4
    jg .w4_mul
    psrlw xm0, 3 ; 4x4: divide by 8
    jmp .w4_end
.w4_mul:
    punpckhqdq xm1, xm0, xm0
    lea r2d, [hq*2]
    mov r6d, 0x55563334
    paddw xm0, xm1
    shrx r6d, r6d, r2d ; shift count 2*h selects which 16-bit half lands in the low word
    psrlq xm1, xm0, 32
    paddw xm0, xm1
    movd xm1, r6d
    psrlw xm0, 2
    pmulhuw xm0, xm1
.w4_end:
    vpbroadcastw m0, xm0
; .s4: m0 = DC; four rows of four pixels per iteration (32 bytes of ac)
.s4:
    vpbroadcastw m1, alpham
    lea r6, [strideq*3]
    pabsw m2, m1
    psllw m2, 9
.s4_loop:
    mova m4, [acq]
    IPRED_CFL 4
    packuswb m4, m4
    vextracti128 xm5, m4, 1
    movd [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd [dstq+strideq*2], xm5
    pextrd [dstq+r6 ], xm5, 1
    lea dstq, [dstq+strideq*4]
    add acq, 32
    sub hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    movq xm0, [tlq-8]
    pmaddubsw xm0, xm3
    jmp wq
.w8:
    movq xm1, [tlq+1]
    vextracti128 xm2, m0, 1
    pmaddubsw xm1, xm3
    psubw xm0, xm4
    paddw xm0, xm2
    punpckhqdq xm2, xm0, xm0
    paddw xm0,
xm2
    paddw xm0, xm1
    psrlq xm1, xm0, 32
    paddw xm0, xm1
    pmaddwd xm0, xm3
    psrlw xm0, xm5
    cmp hd, 8
    je .w8_end
    mov r6d, 0x5556
    mov r2d, 0x3334
    cmp hd, 32
    cmove r6d, r2d ; h == 32 uses 0x3334, other non-square heights 0x5556
    movd xm1, r6d
    pmulhuw xm0, xm1
.w8_end:
    vpbroadcastw m0, xm0
; .s8: four rows of eight pixels per iteration (64 bytes of ac)
.s8:
    vpbroadcastw m1, alpham
    lea r6, [strideq*3]
    pabsw m2, m1
    psllw m2, 9
.s8_loop:
    mova m4, [acq]
    mova m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    packuswb m4, m5
    vextracti128 xm5, m4, 1
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
    movhps [dstq+strideq*2], xm4
    movhps [dstq+r6 ], xm5
    lea dstq, [dstq+strideq*4]
    add acq, 64
    sub hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova xm0, [tlq-16]
    pmaddubsw xm0, xm3
    jmp wq
.w16:
    movu xm1, [tlq+1]
    vextracti128 xm2, m0, 1
    pmaddubsw xm1, xm3
    psubw xm0, xm4
    paddw xm0, xm2
    paddw xm0, xm1
    punpckhqdq xm1, xm0, xm0
    paddw xm0, xm1
    psrlq xm1, xm0, 32
    paddw xm0, xm1
    pmaddwd xm0, xm3
    psrlw xm0, xm5
    cmp hd, 16
    je .w16_end
    mov r6d, 0x5556
    mov r2d, 0x3334
    test hb, 8|32
    cmovz r6d, r2d ; h with neither bit 8 nor bit 32 set uses 0x3334
    movd xm1, r6d
    pmulhuw xm0, xm1
.w16_end:
    vpbroadcastw m0, xm0
; .s16: two rows of sixteen pixels per iteration (64 bytes of ac)
.s16:
    vpbroadcastw m1, alpham
    pabsw m2, m1
    psllw m2, 9
.s16_loop:
    mova m4, [acq]
    mova m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    packuswb m4, m5
    vpermq m4, m4, q3120 ; undo the per-lane interleave left by packuswb
    mova [dstq+strideq*0], xm4
    vextracti128 [dstq+strideq*1], m4, 1
    lea dstq, [dstq+strideq*2]
    add acq, 64
    sub hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova m0, [tlq-32]
    pmaddubsw m0, m3
    jmp wq
.w32:
    movu m1, [tlq+1]
    pmaddubsw m1, m3
    paddw m0, m1
    vextracti128 xm1, m0, 1
    psubw xm0, xm4
    paddw xm0, xm1
    punpckhqdq xm1, xm0, xm0
    paddw xm0, xm1
    psrlq xm1, xm0, 32
    paddw xm0, xm1
    pmaddwd xm0, xm3
    psrlw xm0, xm5
    cmp hd, 32
    je .w32_end
    lea r2d, [hq*2]
    mov r6d, 0x33345556
    shrx r6d, r6d, r2d ; shift count 2*h selects which 16-bit half lands in the low word
    movd xm1, r6d
    pmulhuw xm0, xm1
.w32_end:
    vpbroadcastw m0, xm0
; .s32: one row of thirty-two pixels per iteration (64 bytes of ac)
.s32:
    vpbroadcastw m1, alpham
    pabsw m2, m1
    psllw m2, 9
.s32_loop:
    mova m4, [acq]
    mova m5, [acq+32]
    IPRED_CFL 4
    IPRED_CFL 5
    packuswb m4, m5
    vpermq m4, m4, q3120
    mova [dstq], m4
    add dstq, strideq
    add acq, 64
    dec hd
    jg .s32_loop
    RET

cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst,
stride, tl, w, h, ac, alpha ; rest of the ipred_cfl_128_8bpc cglobal line begun on the preceding line
    ; ipred_cfl_128_8bpc: no edge pixels are read. m0 is filled with the
    ; words at pw_128 and control jumps straight to the width-specific entry
    ; from ipred_cfl_splat_avx2_table (indexed by tzcnt(w)).
    lea t0, [ipred_cfl_splat_avx2_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [t0+wq*4]
    vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128]
    add wq, t0
    movifnidn acq, acmp
    jmp wq

; ipred_cfl_ac_420_8bpc(ac, y, stride, wpad, hpad, w, h)
; Fills ac[] with 16-bit values derived from the 8-bit source y, then
; subtracts their rounded average so the buffer ends up zero-mean.
;   * Each output value is pmaddubsw(row0, pb_2) + pmaddubsw(row1, pb_2):
;     a horizontal pair sum over two source rows, i.e. one value per 2x2
;     source block, scaled by the byte held in pb_2.
;   * sz = w*h is the output count. hpad is multiplied by 4 and subtracted
;     from h; those rows are not read but replicated from the last computed
;     row. wpad != 0 selects paths that replicate the last valid column(s)
;     with a pshufb.
;   * m4 accumulates the running sum as 16-bit words; .calc_avg widens it,
;     reduces it and computes (sum + sz/2) >> tzcnt(sz).
cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn hpadd, hpadm
    movifnidn wd, wm
    mov hd, hm
    mov szd, wd
    mov ac_bakq, acq ; keep the buffer start for .sub_loop
    imul szd, hd
    shl hpadd, 2
    sub hd, hpadd
    vpbroadcastd m2, [pb_2]
    pxor m4, m4
    cmp wd, 8
    jg .w16
    je .w8
    ; fall-through

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
; w4: four source rows -> two output rows (8 words) per iteration
.w4:
    lea stride3q, [strideq*3]
.w4_loop:
    movq xm0, [yq]
    movq xm1, [yq+strideq]
    movhps xm0, [yq+strideq*2]
    movhps xm1, [yq+stride3q]
    pmaddubsw xm0, xm2
    pmaddubsw xm1, xm2
    paddw xm0, xm1
    mova [acq], xm0
    paddw xm4, xm0
    lea yq, [yq+strideq*4]
    add acq, 16
    sub hd, 2
    jg .w4_loop
    test hpadd, hpadd
    jz .calc_avg
    vpermq m0, m0, q1111 ; replicate the last output row across the register
.w4_hpad_loop:
    mova [acq], m0
    paddw m4, m0
    add acq, 32
    sub hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg
; w8: four source rows -> two output rows (16 words) per iteration
.w8:
    lea stride3q, [strideq*3]
    test wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    mova xm0, [yq]
    mova xm1, [yq+strideq]
    vinserti128 m0, [yq+strideq*2], 1
    vinserti128 m1, [yq+stride3q], 1
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    paddw m0, m1
    mova [acq], m0
    paddw m4, m0
    lea yq, [yq+strideq*4]
    add acq, 32
    sub hd, 2
    jg .w8_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w8_hpad
.w8_wpad:
    vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
.w8_wpad_loop:
    ; only 8 source bytes per row are loaded here
    movq xm0, [yq]
    movq xm1, [yq+strideq]
    vinserti128 m0, [yq+strideq*2], 1
    vinserti128 m1, [yq+stride3q], 1
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    paddw m0, m1
    pshufb m0, m3
    mova [acq], m0
    paddw m4, m0
    lea yq, [yq+strideq*4]
    add acq, 32
    sub hd, 2
    jg .w8_wpad_loop
    test hpadd, hpadd
    jz .calc_avg
.w8_hpad:
    vpermq m0, m0, q3232 ; copy the high lane (last output row) into both lanes
.w8_hpad_loop:
    mova [acq], m0
    paddw m4, m0
    add acq, 32
    sub hpadd, 2
    jg .w8_hpad_loop
    jmp .calc_avg
; w16: two source rows -> one output row (16 words) per iteration
.w16:
    test wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    mova m0, [yq]
    mova m1, [yq+strideq]
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    paddw m0, m1
    mova [acq], m0
    paddw m4, m0
    lea yq, [yq+strideq*2]
    add acq, 32
    dec hd
    jg .w16_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad_loop
; w16 with horizontal padding: the shuffle mask and the load routine
; (.w16_pad1/2/3) are both selected by wpad; iptrq keeps the chosen load
; routine's address so every row re-enters it via `jmp iptrq`.
.w16_wpad:
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea iptrq, [ipred_cfl_ac_420_avx2_table]
    shl wpadd, 2
    mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
              ipred_cfl_ac_420_avx2_table+wpadq*8-32]
    movsxd wpadq, [iptrq+wpadq+4]
    add iptrq, wpadq
    jmp iptrq
.w16_pad3:
    vpbroadcastq m0, [yq]
    vpbroadcastq m1, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad2:
    vbroadcasti128 m0, [yq]
    vbroadcasti128 m1, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad1:
    mova m0, [yq]
    mova m1, [yq+strideq]
    ; fall-through
.w16_wpad_end:
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    paddw m0, m1
    pshufb m0, m3
    mova [acq], m0
    paddw m4, m0
    lea yq, [yq+strideq*2]
    add acq, 32
    dec hd
    jz .w16_wpad_done
    jmp iptrq
.w16_wpad_done:
    test hpadd, hpadd
    jz .calc_avg
.w16_hpad_loop:
    mova [acq], m0
    paddw m4, m0
    add acq, 32
    dec hpadd
    jg .w16_hpad_loop
    ; fall-through
; .calc_avg: widen the word sums to dwords, reduce to a single total, add
; sz/2 and shift right by tzcnt(sz); the result is broadcast as a word.
.calc_avg:
    vpbroadcastd m2, [pw_1]
    pmaddwd m0, m4, m2
    vextracti128 xm1, m0, 1
    tzcnt r1d, szd
    paddd xm0, xm1
    movd xm2, r1d
    movd xm3, szd
    punpckhqdq xm1, xm0, xm0
    paddd xm0, xm1
    psrad xm3, 1
    psrlq xm1, xm0, 32
    paddd xm0, xm3
    paddd xm0, xm1
    psrad xm0, xm2
    vpbroadcastw m0, xm0
; .sub_loop: subtract the average from the whole buffer, 16 words at a time
.sub_loop:
    mova m1, [ac_bakq]
    psubw m1, m0
    mova [ac_bakq], m1
    add ac_bakq, 32
    sub szd, 16
    jg .sub_loop
    RET

; ipred_cfl_ac_422_8bpc(ac, y, stride, wpad, hpad, w, h)
; Same structure as ipred_cfl_ac_420_8bpc, except that each source row yields
; its own output row: values are horizontal pair sums only
; (pmaddubsw with pb_4), with no vertical add. Two word accumulators (m4 and
; m5) are used and merged in .calc_avg.
cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn hpadd, hpadm
    movifnidn wd, wm
    mov hd, hm
    mov szd, wd
    mov ac_bakq, acq
    imul szd, hd
    shl hpadd, 2
    sub hd, hpadd
    vpbroadcastd m2, [pb_4]
    pxor m4, m4
    pxor m5, m5
    cmp wd, 8
    jg .w16
    je .w8
    ; fall-through

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
; w4: four source rows -> four output rows (16 words) per iteration
.w4:
    lea stride3q, [strideq*3]
.w4_loop:
    movq xm1, [yq]
    movhps xm1, [yq+strideq]
    movq xm0, [yq+strideq*2]
    movhps xm0, [yq+stride3q]
    pmaddubsw xm0, xm2
    pmaddubsw xm1, xm2
    mova [acq], xm1
    mova [acq+16], xm0
    paddw xm4, xm0
    paddw xm5, xm1
    lea yq, [yq+strideq*4]
    add acq, 32
    sub hd, 4
    jg .w4_loop
    test hpadd, hpadd
    jz .calc_avg
    vpermq m0, m0, q1111 ; replicate the last output row across the register
.w4_hpad_loop:
    mova [acq],
m0 ; source operand of the `mova [acq],` on the preceding line
    paddw m4, m0
    add acq, 32
    sub hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg
; w8: four source rows -> four output rows (2 x 16 words) per iteration
.w8:
    lea stride3q, [strideq*3]
    test wpadd, wpadd
    jnz .w8_wpad
.w8_loop:
    mova xm1, [yq]
    vinserti128 m1, [yq+strideq], 1
    mova xm0, [yq+strideq*2]
    vinserti128 m0, [yq+stride3q], 1
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    mova [acq], m1
    mova [acq+32], m0
    paddw m4, m0
    paddw m5, m1
    lea yq, [yq+strideq*4]
    add acq, 64
    sub hd, 4
    jg .w8_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w8_hpad
.w8_wpad:
    vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
.w8_wpad_loop:
    ; only 8 source bytes are loaded for rows 0 and 2; pshufb with m3 then
    ; rearranges the pair sums (mask defined outside this chunk)
    movq xm1, [yq]
    vinserti128 m1, [yq+strideq], 1
    movq xm0, [yq+strideq*2]
    vinserti128 m0, [yq+stride3q], 1
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    pshufb m0, m3
    pshufb m1, m3
    mova [acq], m1
    mova [acq+32], m0
    paddw m4, m0
    paddw m5, m1
    lea yq, [yq+strideq*4]
    add acq, 64
    sub hd, 4
    jg .w8_wpad_loop
    test hpadd, hpadd
    jz .calc_avg
.w8_hpad:
    vpermq m0, m0, q3232 ; copy the high lane (last output row) into both lanes
.w8_hpad_loop:
    mova [acq], m0
    paddw m4, m0
    add acq, 32
    sub hpadd, 2
    jg .w8_hpad_loop
    jmp .calc_avg
; w16: two source rows -> two output rows per iteration
.w16:
    test wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    mova m1, [yq]
    mova m0, [yq+strideq]
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    mova [acq], m1
    mova [acq+32], m0
    paddw m4, m0
    paddw m5, m1
    lea yq, [yq+strideq*2]
    add acq, 64
    sub hd, 2
    jg .w16_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad_loop
; w16 with horizontal padding: the shuffle mask and the load routine
; (.w16_pad1/2/3) are both selected by wpad; iptrq keeps the chosen load
; routine's address so each row pair re-enters it via `jmp iptrq`.
.w16_wpad:
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea iptrq, [ipred_cfl_ac_422_avx2_table]
    shl wpadd, 2
    mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
              ipred_cfl_ac_422_avx2_table+wpadq*8-32]
    movsxd wpadq, [iptrq+wpadq+4]
    add iptrq, wpadq
    jmp iptrq
.w16_pad3:
    vpbroadcastq m1, [yq]
    vpbroadcastq m0, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad2:
    vbroadcasti128 m1, [yq]
    vbroadcasti128 m0, [yq+strideq]
    jmp .w16_wpad_end
.w16_pad1:
    mova m1, [yq]
    mova m0, [yq+strideq]
    ; fall-through
.w16_wpad_end:
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    pshufb m0, m3
    pshufb m1, m3
    mova [acq], m1
    mova [acq+32], m0
    paddw m4, m0
    paddw m5, m1
    lea yq, [yq+strideq*2]
    add acq, 64
    sub hd, 2
    jz .w16_wpad_done
    jmp iptrq
.w16_wpad_done:
    test hpadd, hpadd
    jz .calc_avg
; vertical padding: m0 (the last output row) is stored twice per iteration
.w16_hpad_loop:
    mova [acq], m0
    mova [acq+32], m0
    paddw m4, m0
    paddw m5, m0
    add acq, 64
    sub hpadd, 2
    jg .w16_hpad_loop
    ; fall-through
; .calc_avg: widen both word accumulators to dwords, add them, reduce to a
; single total, add sz/2 and shift right by tzcnt(sz); broadcast as a word.
.calc_avg:
    vpbroadcastd m2, [pw_1]
    pmaddwd m5, m5, m2
    pmaddwd m0, m4, m2
    paddd m0, m5
    vextracti128 xm1, m0, 1
    tzcnt r1d, szd
    paddd xm0, xm1
    movd xm2, r1d
    movd xm3, szd
    punpckhqdq xm1, xm0, xm0
    paddd xm0, xm1
    psrad xm3, 1
    psrlq xm1, xm0, 32
    paddd xm0, xm3
    paddd xm0, xm1
    psrad xm0, xm2
    vpbroadcastw m0, xm0
; .sub_loop: subtract the average from the whole buffer, 16 words at a time
.sub_loop:
    mova m1, [ac_bakq]
    psubw m1, m0
    mova [ac_bakq], m1
    add ac_bakq, 32
    sub szd, 16
    jg .sub_loop
    RET

; ipred_cfl_ac_444_8bpc(ac, y, stride, wpad, hpad, w, h)
; No subsampling: every source byte is zero-extended to a word and shifted
; left by 3 (x8) before being stored to ac[]. As in the 420/422 variants,
; sz = w*h, hpad is multiplied by 4 and subtracted from h, padded rows repeat
; the last computed row, and the rounded average is subtracted at the end.
; Sum handling differs by width:
;   w4/w8   accumulate 16-bit words in m4 and finish through .calc_avg_mul,
;           which widens them with pmaddwd by m5 (loaded from pw_1);
;   w16/w32 widen each row sum with pmaddwd straight away and accumulate
;           dwords in m4, finishing through .calc_avg.
; Dispatch is via ipred_cfl_ac_444_avx2_table at byte offset tzcnt(w)*4+12.
cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
    movifnidn hpadd, hpadm
    movifnidn wd, wm
    mov hd, hm
    mov szd, wd
    imul szd, hd
    shl hpadd, 2
    sub hd, hpadd
    pxor m4, m4
    vpbroadcastd m5, [pw_1]
    tzcnt r8d, wd
    lea r5, [ipred_cfl_ac_444_avx2_table]
    movsxd r8, [r5+r8*4+12]
    add r5, r8

    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
    mov ac_bakq, acq
    jmp r5
; w4: four rows of four pixels per iteration
.w4:
    lea stride3q, [strideq*3]
    pxor xm2, xm2
.w4_loop:
    movd xm1, [yq]
    movd xm0, [yq+strideq*2]
    pinsrd xm1, [yq+strideq], 1
    pinsrd xm0, [yq+stride3q], 1
    punpcklbw xm1, xm2
    punpcklbw xm0, xm2
    psllw xm1, 3
    psllw xm0, 3
    mova [acq], xm1
    mova [acq+16], xm0
    paddw xm1, xm0
    paddw xm4, xm1
    lea yq, [yq+strideq*4]
    add acq, 32
    sub hd, 4
    jg .w4_loop
    test hpadd, hpadd
    jz .calc_avg_mul
    pshufd xm0, xm0, q3232 ; last row duplicated into both halves
    paddw xm1, xm0, xm0 ; sum contribution of the two stores below
.w4_hpad_loop:
    mova [acq], xm0
    mova [acq+16], xm0
    paddw xm4, xm1
    add acq, 32
    sub hpadd, 4
    jg .w4_hpad_loop
    jmp .calc_avg_mul
; w8: four rows of eight pixels per iteration
.w8:
    lea stride3q, [strideq*3]
    pxor m2, m2
.w8_loop:
    movq xm1, [yq]
    movq xm0, [yq+strideq*2]
    vinserti128 m1, [yq+strideq], 1
    vinserti128 m0, [yq+stride3q], 1
    punpcklbw m1, m2
    punpcklbw m0, m2
    psllw m1, 3
    psllw m0, 3
    mova [acq], m1
    mova [acq+32], m0
    paddw m1, m0
    paddw m4, m1
    lea yq, [yq+strideq*4]
    add acq, 64
    sub hd, 4
    jg .w8_loop
    test hpadd, hpadd
    jz .calc_avg_mul
    vpermq m0, m0, q3232 ; copy the high lane (last row) into both lanes
    paddw m1, m0, m0
.w8_hpad_loop:
    mova [acq], m0
    mova [acq+32], m0
    paddw m4, m1
    add acq, 64
    sub hpadd, 4
    jg .w8_hpad_loop
    jmp
.calc_avg_mul ; target of the `jmp` that ends the preceding line
; (ipred_cfl_ac_444_8bpc, continued) w16: two rows of sixteen pixels per
; iteration; row sums are widened to dwords before being added to m4.
.w16:
    test wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmovzxbw m1, [yq]
    pmovzxbw m0, [yq+strideq]
    psllw m1, 3
    psllw m0, 3
    mova [acq], m1
    mova [acq+32], m0
    paddw m1, m0
    pmaddwd m1, m5
    paddd m4, m1
    lea yq, [yq+strideq*2]
    add acq, 64
    sub hd, 2
    jg .w16_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w16_hpad
.w16_wpad:
    mova m3, [cfl_ac_444_w16_pad1_shuffle]
.w16_wpad_loop:
    ; only 8 source bytes per row are read; pshufb with m3 expands them
    ; (mask defined outside this chunk)
    vpbroadcastq m1, [yq]
    vpbroadcastq m0, [yq+strideq]
    pshufb m1, m3
    pshufb m0, m3
    psllw m1, 3
    psllw m0, 3
    mova [acq], m1
    mova [acq+32], m0
    paddw m1, m0
    pmaddwd m1, m5
    paddd m4, m1
    lea yq, [yq+strideq*2]
    add acq, 64
    sub hd, 2
    jg .w16_wpad_loop
    test hpadd, hpadd
    jz .calc_avg
.w16_hpad:
    paddw m1, m0, m0 ; sum contribution of two copies of the last row
    pmaddwd m1, m5
.w16_hpad_loop:
    mova [acq], m0
    mova [acq+32], m0
    paddd m4, m1
    add acq, 64
    sub hpadd, 2
    jg .w16_hpad_loop
    jmp .calc_avg
; w32: one row of thirty-two pixels per iteration (m1 = left 16, m0 = right 16)
.w32:
    test wpadd, wpadd
    jnz .w32_wpad
.w32_loop:
    pmovzxbw m1, [yq]
    pmovzxbw m0, [yq+16]
    psllw m1, 3
    psllw m0, 3
    mova [acq], m1
    mova [acq+32], m0
    paddw m2, m1, m0
    pmaddwd m2, m5
    paddd m4, m2
    add yq, strideq
    add acq, 64
    dec hd
    jg .w32_loop
    test hpadd, hpadd
    jz .calc_avg
    jmp .w32_hpad_loop
; w32 with horizontal padding: wpad selects one of .w32_pad1/2/3 through the
; jump table; iptrq keeps that routine's address so every row re-enters it.
.w32_wpad:
    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
    lea iptrq, [ipred_cfl_ac_444_avx2_table]
    add wpadd, wpadd
    mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table]
    movsxd wpadq, [iptrq+wpadq+4]
    add iptrq, wpadq
    jmp iptrq
.w32_pad3:
    ; reads 8 source bytes; the right half is the top qword of m1 replicated
    vpbroadcastq m1, [yq]
    pshufb m1, m3
    vpermq m0, m1, q3232
    jmp .w32_wpad_end
.w32_pad2:
    ; reads 16 source bytes; the right half is filled with the last word of m1
    pmovzxbw m1, [yq]
    pshufhw m0, m1, q3333
    vpermq m0, m0, q3333
    jmp .w32_wpad_end
.w32_pad1:
    ; reads 24 source bytes; the right half is built from [yq+16] via m3
    pmovzxbw m1, [yq]
    vpbroadcastq m0, [yq+16]
    pshufb m0, m3
    ; fall-through
.w32_wpad_end:
    psllw m1, 3
    psllw m0, 3
    mova [acq], m1
    mova [acq+32], m0
    paddw m2, m1, m0
    pmaddwd m2, m5
    paddd m4, m2
    add yq, strideq
    add acq, 64
    dec hd
    jz .w32_wpad_done
    jmp iptrq
.w32_wpad_done:
    test hpadd, hpadd
    jz .calc_avg
; vertical padding: repeat the last row (m1/m0) and its dword sum (m2)
.w32_hpad_loop:
    mova [acq], m1
    mova [acq+32], m0
    paddd m4, m2
    add acq, 64
    dec hpadd
    jg .w32_hpad_loop
    jmp .calc_avg
; w4/w8 arrive here with word sums in m4; widen them to dwords first
.calc_avg_mul:
    pmaddwd m4, m5
; .calc_avg: reduce the dword sums in m4 to one total, add sz/2 and shift
; right by tzcnt(sz); the result is broadcast as a word.
.calc_avg:
    vextracti128 xm1, m4, 1
    tzcnt r1d, szd
    paddd xm0, xm4, xm1
    movd xm2, r1d
    movd xm3, szd
    punpckhqdq xm1, xm0, xm0
    paddd xm0, xm1
    psrad xm3, 1
    psrlq xm1, xm0, 32
    paddd xm0, xm3
    paddd xm0, xm1
    psrad xm0, xm2
    vpbroadcastw m0, xm0
; .sub_loop: subtract the average from the whole buffer, 16 words at a time
.sub_loop:
    mova m1, [ac_bakq]
    psubw m1, m0
    mova [ac_bakq], m1
    add ac_bakq, 32
    sub szd, 16
    jg .sub_loop
    RET

; pal_pred_8bpc(dst, stride, pal, idx, w, h)
; Palette prediction. The 8 bytes at pal are broadcast to every qword of m4
; and used as a pshufb lookup table. Each idx byte is used twice: once as-is
; (pshufb only looks at its low four bits) and once after a right shift by 4,
; so every index byte produces two output pixels; punpckl/hbw interleaves the
; two results so the low-nibble pixel comes first.
; Dispatch is via pal_pred_avx2_table indexed by tzcnt(w); r2 = stride*3.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vpbroadcastq m4, [palq]
    lea r2, [pal_pred_avx2_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r2+wq*4]
    add wq, r2
    lea r2, [strideq*3]
    jmp wq
; w4: 8 index bytes -> four rows of four pixels
.w4:
    movq xm0, [idxq]
    add idxq, 8
    psrlw xm1, xm0, 4
    punpcklbw xm0, xm1
    pshufb xm0, xm4, xm0
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2 ], xm0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w4
    RET
; w8: 16 index bytes -> four rows of eight pixels
.w8:
    movu xm2, [idxq]
    add idxq, 16
    pshufb xm1, xm4, xm2
    psrlw xm2, 4
    pshufb xm2, xm4, xm2
    punpcklbw xm0, xm1, xm2
    punpckhbw xm1, xm2
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r2 ], xm1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8
    RET
; w16: 32 index bytes -> four rows of sixteen pixels
.w16:
    movu m2, [idxq]
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    movu [dstq+strideq*0], xm0
    movu [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+r2 ], m1, 1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16
    RET
; w32: 32 index bytes -> two rows of thirty-two pixels. The q3120 qword
; permute on load pre-arranges the indices so that the per-lane unpacks
; below yield pixels in row order.
.w32:
    vpermq m2, [idxq], q3120
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    movu [dstq+strideq*0], m0
    movu [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32
    RET
; w64: 32 index bytes -> one row of sixty-four pixels (aligned stores)
.w64:
    vpermq m2, [idxq], q3120
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+32*0], m0
    mova [dstq+32*1], m1
    add dstq, strideq
    dec hd
    jg .w64
    RET

%endif
; NOTE(review): the next line is tar-archive header residue (member path,
; mode/size fields, "ustar" magic) followed by the first licence comment line
; of the next archive member. It is not assembly and is kept verbatim.
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred_avx512.asm000066400000000000000000003417531517466257200242100ustar00rootroot00000000000000; Copyright © 2020,
VideoLAN and dav2d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ; dav2d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 
4, 5, 6, 7, 28, 29, 30, 31 db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32 db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14 db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30 db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46 db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62 z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6 db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22 db 30, 31, 31, 32, 
32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38 db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54 z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16 db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32 db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48 db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64 z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8 z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9 z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72 z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80 z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56 z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64 z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8 dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16 z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67 db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71 db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75 db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79 z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0 db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 
1, 1, 1, 1 db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2 db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3 z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1 db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3 db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5 db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7 z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24 dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56 z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32 dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64 z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512 dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512 dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512 dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512 z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512 dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512 dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512 dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 79, 79, 79 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0 db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16 pb_8_56_0_0: db 8, 56, 0, 0 pb_m4_36: times 2 db -4, 36 pb_127_m127: times 2 db 127, -127 pb_8: times 4 db 8 pb_15: times 4 db 15 pb_16: times 4 db 16 pb_31: times 4 db 31 pb_63: times 4 db 63 pb_90: times 4 db 90 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 
2 dw 255 pw_512: times 2 dw 512 %define pb_1 (ipred_h_shuf+24) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+16) %define pb_4 (smooth_shuf +48) %define pb_7 (ipred_h_shuf+ 0) %define pb_9 (z_xpos_bc + 8) %define pb_17 (z_xpos_bc + 0) %define pb_33 (z_xpos_bc + 4) %define pd_8 (filter_taps+128) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 cextern dr_intra_derivative cextern pb_0to63 SECTION .text INIT_ZMM avx512icl cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h lea r5, [ipred_dc_left_8bpc_avx512icl_table] movd xm0, wm tzcnt wd, wm inc tlq movifnidn hd, hm movu ym1, [tlq] movd xmm3, wd movsxd r6, [r5+wq*4] vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_left_8bpc_avx512icl_table] mov hd, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movd xm0, hm movu ym1, [tlq] movd xmm3, r6d movsxd r6, [r5+r6*4] 
vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu ym1, [tlq+32] ; unaligned when jumping here from dc_top vpdpbusd ym0, ym1, ym2 .h32: vextracti32x4 xm1, ym0, 1 paddd xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 .h8: psrlq xm1, xm0, 32 paddd xm0, xm1 .h4: vpsrlvd xm0, xmm3 lea stride3q, [strideq*3] vpbroadcastb m0, xm0 jmp wq cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd xm0, r5d tzcnt r5d, r5d movd xmm4, r5d lea r5, [ipred_dc_8bpc_avx512icl_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] psrld xm0, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd xmm1, [tlq-4] vpdpbusd xm0, xmm1, xm3 jmp wq .w4: movd xmm1, [tlq+1] vpdpbusd xm0, xmm1, xm3 cmp hd, 4 jg .w4_mul psrlw xmm0, xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xmm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddd xmm1, xm0 shrx r6d, r6d, r2d psrlq xmm0, xmm1, 32 paddd xmm0, xmm1 movd xmm1, r6d psrld xmm0, 2 pmulhuw xmm0, xmm1 .w4_end: vpbroadcastb xm0, xmm0 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: movq xmm1, [tlq-8] vpdpbusd xm0, xmm1, xm3 jmp wq .w8: movq xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w8_end: vpbroadcastb xm0, xmm0 .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, 
[dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova xmm1, [tlq-16] vpdpbusd xm0, xmm1, xm3 jmp wq .w16: movu xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w16_end: vpbroadcastb xm0, xmm0 .s16: movu [dstq+strideq*0], xm0 movu [dstq+strideq*1], xm0 movu [dstq+strideq*2], xm0 movu [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova ym1, [tlq-32] vpdpbusd ym0, ym1, ym3 jmp wq .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w32_end: vpbroadcastb ym0, xmm0 .s32: movu [dstq+strideq*0], ym0 movu [dstq+strideq*1], ym0 movu [dstq+strideq*2], ym0 movu [dstq+stride3q ], ym0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET .h64: mova ym1, [tlq-64] mova ym2, [tlq-32] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 jmp wq .w64: movu ym1, [tlq+ 1] movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 64 je .w64_end mov r6d, 0x33345556 shrx r6d, r6d, hd movd xmm1, r6d pmulhuw xmm0, xmm1 .w64_end: vpbroadcastb m0, xmm0 .s64: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, 
[r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128]
    ; ^ memory operand of the "vpbroadcastd m0," that ends the preceding line
    ;   (tail of ipred_dc_128_8bpc): m0 = every byte set to 128.
    add                  wq, r5                 ; table-relative offset -> absolute address
    lea            stride3q, [strideq*3]
    jmp                  wq                     ; tail-jump into the shared .s4/.s8/.../.s64 store loops

; ipred_v_8bpc: vertical prediction.
; Loads 64 bytes starting at tl+1 into m0 and reuses the splat store loops of
; ipred_dc_8bpc, selected by tzcnt(w); each store loop writes as many of the
; low bytes of m0 as the row is wide.
cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
    lea                  r5, [ipred_dc_splat_8bpc_avx512icl_table]
    tzcnt                wd, wm                 ; table index derived from the width
    movu                 m0, [tlq+1]            ; row of pixels above the block
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq

; ipred_h_8bpc: horizontal prediction.
; After "sub tlq, hq", [tlq+hq-4] addresses the four bytes just below the
; original tl pointer; hq decreases by 4 per iteration, so the read position
; moves 4 bytes lower each time. In every iteration byte 3 of that dword
; fills the first output row, byte 2 the second, byte 1 the third and
; byte 0 the fourth.
cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3
%define base r6-ipred_h_8bpc_avx512icl_table
    lea                  r6, [ipred_h_8bpc_avx512icl_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    lea            stride3q, [strideq*3]
    sub                 tlq, hq
    add                  wq, r6
    jmp                  wq                     ; dispatch on width: .w4 ... .w64
.w4:
    mova               xmm1, [base+ipred_h_shuf+16] ; 3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0
.w4_loop:
    movd               xmm0, [tlq+hq-4]
    pshufb             xmm0, xmm1               ; one dword per output row
    movd   [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2
    pextrd [dstq+stride3q ], xmm0, 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_loop
    RET
.w8:
    movsldup           xmm2, [base+ipred_h_shuf+16] ; even dwords doubled: 3 x8, 1 x8
    movshdup           xmm3, [base+ipred_h_shuf+16] ; odd dwords doubled:  2 x8, 0 x8
.w8_loop:
    movd               xmm1, [tlq+hq-4]
    pshufb             xmm0, xmm1, xmm2         ; low qword = row 0, high qword = row 2
    pshufb             xmm1, xmm3               ; low qword = row 1, high qword = row 3
    movq   [dstq+strideq*0], xmm0
    movq   [dstq+strideq*1], xmm1
    movhps [dstq+strideq*2], xmm0
    movhps [dstq+stride3q ], xmm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8_loop
    RET
.w16:
    ; Per-lane byte selectors taken from smooth_shuf:
    ; lane 0 -> byte 3, lane 1 -> byte 1, lane 2 -> byte 2, lane 3 -> byte 0.
    movsldup             m1, [base+smooth_shuf]
.w16_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    pshufb               m0, m1
    mova          [dstq+strideq*0], xm0         ; lane 0
    vextracti32x4 [dstq+strideq*1], m0, 2       ; lane 2
    vextracti32x4 [dstq+strideq*2], ym0, 1      ; lane 1
    vextracti32x4 [dstq+stride3q ], m0, 3       ; lane 3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    ; Branch to the loop label like every other width does. The previous
    ; target (.w16) re-executed the loop-invariant movsldup above on every
    ; iteration; m1 is never modified inside the loop, so the result is the same.
    jg .w16_loop
    RET
.w32:
    vpbroadcastd        ym3, [base+pb_1]        ; low 256 bits = 1, upper 256 bits zeroed
    vpord                m2, m3, [base+pb_2] {1to16} ; low half = 3, high half = 2
.w32_loop:
    vpbroadcastd         m1, [tlq+hq-4]
    pshufb               m0, m1, m2             ; rows 0 (low half) and 1 (high half)
    pshufb               m1, m3                 ; rows 2 (low half) and 3 (high half)
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32_loop
    RET
.w64:
    vpbroadcastd         m4, [base+pb_3]        ; selector for row 0
    vpbroadcastd         m5, [base+pb_2]        ; selector for row 1
    vpbroadcastd         m6, [base+pb_1]        ; selector for row 2
    pxor                 m7, m7                 ; selector for row 3 (byte 0)
.w64_loop:
    vpbroadcastd         m3, [tlq+hq-4]
    pshufb               m0, m3, m4
    pshufb               m1, m3, m5
    pshufb               m2, m3, m6
    pshufb               m3, m7
    mova   [dstq+strideq*0], m0
mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64_loop RET %macro PAETH 0 psubusb m1, m5, m4 psubusb m0, m4, m5 por m1, m0 ; tdiff pavgb m2, m6, m4 vpcmpub k1, m1, m7, 1 ; tdiff < ldiff vpblendmb m0{k1}, m4, m6 vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 psubusb m3, m5, m2 psubb m2, m4 psubusb m2, m5 por m2, m3 pminub m1, m7 paddusb m2, m2 por m2, m4 ; min(tldiff, 255) vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff vmovdqu8 m0{k1}, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 lea r6, [ipred_paeth_8bpc_avx512icl_table] tzcnt wd, wm vpbroadcastb m5, [tlq] ; topleft mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] lea topq, [tlq+1] sub tlq, hq add wq, r6 lea stride3q, [strideq*3] jmp wq INIT_YMM avx512icl .w4: vpbroadcastd m6, [topq] mova m9, [ipred_h_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 ; left PAETH movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: RET INIT_ZMM avx512icl .w8: vpbroadcastq m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH vextracti32x4 xm1, m0, 2 vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop 
.w8_ret: RET .w16: vbroadcasti32x4 m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: vpbroadcastd m4, [tlq+hq-4] pshufb m4, m9 PAETH movu [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m6, [topq] mova ym9, ym8 psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w32_loop: vpbroadcastd m4, [tlq+hq-2] pshufb m4, m9 PAETH movu [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m6, [topq] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w64_loop: vpbroadcastb m4, [tlq+hq-1] PAETH mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-ipred_smooth_v_8bpc_avx512icl_table lea r6, [ipred_smooth_v_8bpc_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m0, [base+pb_127_m127] vpbroadcastd m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq vpbroadcastb m4, [tlq+hq] ; bottom add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m2, [tlq+1] movshdup m5, [smooth_shuf] mova ym6, [smooth_endA] punpcklbw m2, m4 ; top, bottom pmaddubsw m3, m2, m0 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; 128 * top + 129 * bottom + 128 .w4_loop: vbroadcasti32x4 m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: RET .w8: vpbroadcastq m2, [tlq+1] movshdup m5, 
[smooth_shuf] mova ym6, [smooth_endA] punpcklbw m2, m4 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 .w8_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w16_loop: vpbroadcastq m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: vbroadcasti32x8 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w32_loop: vpbroadcastd m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: movu m3, [tlq+1] mova m6, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w64_loop: vpbroadcastw m1, [weightsq+hq*2] pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m6, m1 mova [dstq], m0 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 %define base r5-ipred_smooth_h_8bpc_avx512icl_table lea r5, [ipred_smooth_h_8bpc_avx512icl_table] mov r6d, 
wd tzcnt wd, wd vpbroadcastb m4, [tlq+r6] ; right mov hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m5, [base+pb_127_m127] vpbroadcastd m6, [base+pw_128] sub tlq, hq add wq, r5 vpmovb2m k1, m6 lea stride3q, [strideq*3] jmp wq .w4: movsldup m3, [smooth_shuf] vpbroadcastq m7, [smooth_weights+4*2] mova ym8, [smooth_endA] .w4_loop: vpbroadcastq m0, [tlq+hq-8] mova m2, m4 vpshufb m2{k1}, m0, m3 ; left, right pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: movsldup m3, [smooth_shuf] vbroadcasti32x4 m7, [smooth_weights+8*2] mova ym8, [smooth_endA] .w8_loop: vpbroadcastd m0, [tlq+hq-4] mova m2, m4 vpshufb m2{k1}, m0, m3 pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: movsldup m7, [smooth_shuf] vbroadcasti32x4 m8, [smooth_weights+16*2] vbroadcasti32x4 m9, [smooth_weights+16*3] mova m10, [smooth_endB] .w16_loop: vpbroadcastd m0, [tlq+hq-4] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: mova m10, [smooth_endA] vpbroadcastd ym7, [pb_1] vbroadcasti32x8 m8, [smooth_weights+32*2] vbroadcasti32x8 m9, 
[smooth_weights+32*3] vshufi32x4 m10, m10, q3120 .w32_loop: vpbroadcastd m0, [tlq+hq-2] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: mova m7, [smooth_weights+64*2] mova m8, [smooth_weights+64*3] mova m9, [smooth_endA] .w64_loop: mova m3, m4 vpbroadcastb m3{k1}, [tlq+hq-1] pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m7 pmaddubsw m1, m3, m8 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m9, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 %define base r5-ipred_smooth_8bpc_avx512icl_table lea r5, [ipred_smooth_8bpc_avx512icl_table] mov r6d, wd tzcnt wd, wd mov hd, hm vpbroadcastb m6, [tlq+r6] ; right sub tlq, hq movsxd wq, [r5+wq*4] vpbroadcastd m7, [base+pb_127_m127] vpbroadcastb m0, [tlq] ; bottom vpbroadcastd m1, [base+pw_255] add wq, r5 lea v_weightsq, [base+smooth_weights+hq*2] vpmovb2m k1, m1 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vpbroadcastq m9, [smooth_weights+4*2] mova ym11, [smooth_endA] punpcklbw m8, m0 ; top, bottom pmaddubsw m10, m8, m7 paddw m1, m8 ; 1 * top + 256 * bottom + 255 paddw m10, m1 ; 128 * top + 129 * bottom + 255 .w4_loop: vpbroadcastq m1, [tlq+hq-8] vbroadcasti32x4 m0, [v_weightsq] add v_weightsq, 16 mova m2, m6 vpshufb m2{k1}, m1, m4 ; left, right pmaddubsw m1, m2, m7 ; 127 * left - 127 * right pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 ; 128 * left + 129 * right pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, 
[dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: vpbroadcastq m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vbroadcasti32x4 m9, [smooth_weights+8*2] mova ym11, [smooth_endA] punpcklbw m8, m0 pmaddubsw m10, m8, m7 paddw m1, m8 paddw m10, m1 .w8_loop: vpbroadcastd m1, [tlq+hq-4] vpbroadcastq m0, [v_weightsq] add v_weightsq, 8 mova m2, m6 vpshufb m2{k1}, m1, m4 pmaddubsw m1, m2, m7 pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x4 m9, [tlq+hq+1] movsldup m5, [smooth_shuf] movshdup m10, [smooth_shuf] vbroadcasti32x4 m11, [smooth_weights+16*2] vbroadcasti32x4 m12, [smooth_weights+16*3] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w16_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastq m1, [v_weightsq] add v_weightsq, 8 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m9, [tlq+hq+1] movshdup m10, [smooth_shuf] mova m12, [smooth_weights+32*2] vpbroadcastd ym5, [pb_1] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 
vshufi32x4 m11, m12, m12, q2020 vshufi32x4 m12, m12, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w32_loop: vpbroadcastd m0, [tlq+hq-2] vpbroadcastd m1, [v_weightsq] add v_weightsq, 4 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m9, [tlq+hq+1] mova m11, [smooth_weights+64*2] mova m2, [smooth_weights+64*3] mova m14, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m12, m8, m7 pmaddubsw m13, m9, m7 vshufi32x4 m10, m11, m2, q2020 vshufi32x4 m11, m2, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m12, m0 paddw m13, m1 .w64_loop: mova m4, m6 vpbroadcastb m4{k1}, [tlq+hq-1] vpbroadcastw m1, [v_weightsq] add v_weightsq, 2 pmaddubsw m2, m4, m7 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m10 pmaddubsw m4, m11 paddw m0, m12 paddw m1, m13 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m14, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3 movifnidn wd, wm movifnidn hd, hm lea stride3q, [strideq*3] cmp wd, 8 jg .w32 movq xmm3, [palq] je .w8 .w4: movq xmm0, [idxq] add idxq, 8 psrlw xmm1, xmm0, 4 punpcklbw xmm0, xmm1 pshufb xmm0, xmm3, xmm0 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 pextrd [dstq+stride3q ], xmm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: movu xmm2, [idxq] add idxq, 16 pshufb xmm1, xmm3, xmm2 psrlw xmm2, 4 pshufb xmm2, xmm3, xmm2 punpcklbw xmm0, xmm1, xmm2 punpckhbw xmm1, xmm2 movq [dstq+strideq*0], xmm0 movhps [dstq+strideq*1], xmm0 movq [dstq+strideq*2], xmm1 movhps [dstq+stride3q ], xmm1 lea dstq, 
[dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: pmovzxdq m0, [idxq] add idxq, 32 vpmultishiftqb m0, m3, m0 pshufb m0, m5, m0 movu [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpbroadcastq m3, [pal_unpack+0] vpbroadcastq m5, [palq] cmp wd, 32 jl .w16 pmovzxbd m2, [pal_perm] vpbroadcastq m4, [pal_unpack+8] jg .w64 .w32_loop: vpermd m1, m2, [idxq] add idxq, 64 vpmultishiftqb m0, m3, m1 vpmultishiftqb m1, m4, m1 pshufb m0, m5, m0 pshufb m1, m5, m1 movu [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 movu [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32_loop RET .w64: vpermd m1, m2, [idxq] add idxq, 64 vpmultishiftqb m0, m3, m1 vpmultishiftqb m1, m4, m1 pshufb m0, m5, m0 pshufb m1, m5, m1 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx %define base r7-z_filter_t0 lea r7, [z_filter_t0] tzcnt wd, wm movifnidn angled, anglem lea t0, [dr_intra_derivative] movsxd wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4] inc tlq mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [t0+dxq] lea wq, [base+ipred_z1_8bpc_avx512icl_table+wq] movifnidn hd, hm xor angled, 0x4ff ; d = 90 - angle mova m14, [base+z_frac_table] vpbroadcastd m15, [base+pw_512] jmp wq .w4: mova m9, [pb_0to63] pminud m8, m9, [base+pb_7] {1to16} vpbroadcastq m7, [tlq] pshufb m7, m8 cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) pshufb xmm0, xm7, [base+z_filter_s4] mova xmm1, [tlq-1] pshufb xmm1, [base+z_xpos_off2a] vpbroadcastd xmm2, [base+pb_m4_36] vpbroadcastq m4, [pb_0to63] pmaddubsw xmm0, xmm2 pmaddubsw xmm1, 
xmm2 add dxd, dxd kxnorw k1, k1, k1 paddw xmm0, xmm1 pmulhrsw xm0, xmm0, xm15 packuswb xm0, xm0 punpcklbw ym7{k1}, ym0 jmp .w4_main2 .w4_no_upsample: test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+3] vpbroadcastb xm0, r3d vpbroadcastb xm1, angled shr angled, 8 ; is_sm << 1 vpcmpeqb k1, xm0, [base+z_filter_wh] vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8] kmovw r5d, k1 test r5d, r5d jz .w4_main vbroadcasti32x4 ym0, [tlq-1] pshufb ym0, [base+z_filter4_s1] popcnt r5d, r5d ; filter_strength pshufb ym1, ym7, [z_filter_s4] pshufb ym7, [base+z_filter_s3] vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] pmaddubsw ym0, ym11 pmaddubsw ym1, ym11 pmaddubsw ym7, ym12 paddw ym0, ym1 paddw ym7, ym0 pmulhrsw ym7, ym15 cmp hd, 4 je .w4_filter_end vpbroadcastd m8, [base+pb_9] pminub m8, m9 .w4_filter_end: paddb m8, m8 vpermb m7, m8, m7 .w4_main: vpbroadcastq m4, [base+z_xpos_off1a] .w4_main2: movsldup m2, [base+z_xpos_mul] vpbroadcastw m5, dxd vbroadcasti32x4 m3, [base+z_xpos_bc] lea r2, [strideq*3] pmullw m2, m5 ; xpos psllw m5, 5 ; dx*8 .w4_loop: psrlw m1, m2, 3 pshufb m0, m2, m3 vpermw m1, m1, m14 ; 64-frac, frac paddsb m0, m4 ; base, base+1 vpermb m0, m0, m7 ; top[base], top[base+1] paddsw m2, m5 ; xpos += dx pmaddubsw m0, m1 ; v pmulhrsw m0, m15 packuswb m0, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+r2 ], xm1, 1 sub hd, 8 jl .w4_end vextracti32x4 xm1, m0, 2 ; top[max_base_x] lea dstq, [dstq+strideq*4] vextracti32x4 xm0, m0, 3 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r2 ], xm0, 1 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: RET .w8_filter: mova ym0, [base+z_filter_s1] popcnt r5d, r5d vbroadcasti32x4 ym1, [base+z_filter_s2] vbroadcasti32x4 ym3, [base+z_filter_s3] vbroadcasti32x4 ym4, [base+z_filter_s4] vpermi2b ym0, ym7, ym2 ; al bl mova ym5, 
[base+z_filter_s5] pshufb ym1, ym7, ym1 ; ah bh vpbroadcastd ym11, [base+z_filter_k+(r5-1)*4+12*0] pshufb ym3, ym7, ym3 ; cl ch vpbroadcastd ym12, [base+z_filter_k+(r5-1)*4+12*1] pshufb ym4, ym7, ym4 ; el dl vpbroadcastd ym13, [base+z_filter_k+(r5-1)*4+12*2] vpermb ym5, ym5, ym7 ; eh dh pmaddubsw ym0, ym11 pmaddubsw ym1, ym11 pmaddubsw ym2, ym3, ym12 pmaddubsw ym3, ym13 pmaddubsw ym4, ym11 pmaddubsw ym5, ym11 paddw ym0, ym2 paddw ym1, ym3 paddw ym0, ym4 paddw ym1, ym5 pmulhrsw ym0, ym15 pmulhrsw ym1, ym15 packuswb ym0, ym1 ret .w8: lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 lea r3d, [hq-1] mova xm1, [base+z_filter_s4] vpbroadcastb xm2, r3d mova xm7, [tlq-1] vinserti32x4 ym7, [tlq+7], 1 vbroadcasti32x4 ym0, [base+z_xpos_off1a] vpbroadcastd ym3, [base+pb_m4_36] pminub xm2, xm1 pshufb ym0, ym7, ym0 vinserti32x4 ym1, xm2, 1 psrldq ym7, 1 pshufb ym1, ym7, ym1 pmaddubsw ym0, ym3 pmaddubsw ym1, ym3 vbroadcasti32x4 m8, [pb_0to63] add dxd, dxd paddw ym0, ym1 pmulhrsw ym0, ym15 packuswb ym0, ym0 punpcklbw ym7, ym0 jmp .w8_main2 .w8_no_upsample: lea r3d, [hq+7] mova m9, [pb_0to63] vpbroadcastb ym0, r3d and r3d, 7 vbroadcasti32x4 m7, [tlq] or r3d, 8 ; imin(h+7, 15) vpbroadcastb m8, r3d pminub m8, m9 pshufb m7, m8 test angled, 0x400 jnz .w8_main vpbroadcastb ym1, angled shr angled, 8 vpcmpeqb k1, ym0, [base+z_filter_wh] mova xm0, [base+z_filter_t0+angleq*8] vpcmpgtb k1{k1}, ym1, ym0 kmovd r5d, k1 test r5d, r5d jz .w8_main vpbroadcastd ym2, [tlq-4] call .w8_filter cmp hd, 8 jle .w8_filter_end vpbroadcastd m8, [base+pb_17] add r3d, 2 pminub m8, m9 .w8_filter_end: vpermb m7, m8, m0 .w8_main: vbroadcasti32x4 m8, [base+z_xpos_off1a] .w8_main2: movsldup m4, [base+z_xpos_mul] vpbroadcastw m9, dxd shl r3d, 6 vpbroadcastd m5, [base+z_xpos_bc+8*0] pmullw m4, m9 ; xpos vpbroadcastd m6, [base+z_xpos_bc+8*1] sub r3d, dxd shl dxd, 3 psllw m9, 5 ; dx*8 lea r2, [strideq*3] .w8_loop: psrlw m3, m4, 3 pshufb m0, m4, m5 
pshufb m1, m4, m6 vpermw m3, m3, m14 paddsb m0, m8 paddsb m1, m8 vpermb m0, m0, m7 vpermb m1, m1, m7 paddsw m4, m9 punpcklqdq m2, m3, m3 pmaddubsw m0, m2 punpckhqdq m3, m3 pmaddubsw m1, m3 pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m0, m1 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r2 ], xm1 sub hd, 8 jl .w8_end vextracti32x8 ym0, m0, 1 lea dstq, [dstq+strideq*4] vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r2 ], xm1 jz .w8_end lea dstq, [dstq+strideq*4] sub r3d, dxd jg .w8_loop vextracti32x4 xm7, m7, 3 .w8_end_loop: movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 movq [dstq+strideq*2], xm7 movq [dstq+r2 ], xm7 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_end_loop .w8_end: RET .w16_filter: mova m0, [base+z_filter_s1] popcnt r5d, r5d vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] vbroadcasti32x4 m4, [base+z_filter_s4] vpermi2b m0, m7, m2 ; al bl mova m5, [base+z_filter_s5] pshufb m1, m7, m1 ; ah bh vpbroadcastd m11, [base+z_filter_k+(r5-1)*4+12*0] pshufb m3, m7, m3 ; cl ch vpbroadcastd m12, [base+z_filter_k+(r5-1)*4+12*1] pshufb m4, m7, m4 ; el dl vpbroadcastd m13, [base+z_filter_k+(r5-1)*4+12*2] vpermb m5, m5, m7 ; eh dh pmaddubsw m0, m11 pmaddubsw m1, m11 pmaddubsw m2, m3, m12 pmaddubsw m3, m13 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m0, m2 paddw m1, m3 paddw m0, m4 paddw m1, m5 pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m0, m1 ret .w16: lea r3d, [hq+15] mova m9, [pb_0to63] vpbroadcastb ym0, r3d and r3d, 15 movu ym7, [tlq] or r3d, 16 ; imin(h+15, 31) vpbroadcastb m8, r3d pminub m8, m9 vpermb m7, m8, m7 test angled, 0x400 jnz .w16_main vpbroadcastb ym1, angled shr angled, 8 vpcmpeqb k1, ym0, [base+z_filter_wh] mova xm0, [base+z_filter_t0+angleq*8] vpcmpgtb k1{k1}, ym1, ym0 kmovd r5d, k1 test r5d, r5d jz .w16_main vpbroadcastd m2, [tlq-4] call .w16_filter cmp hd, 16 jle 
.w16_filter_end vpbroadcastd m8, [base+pb_33] add r3d, 2 pminub m8, m9 .w16_filter_end: vpermb m7, m8, m0 .w16_main: movshdup m3, [base+z_xpos_mul] vpbroadcastw m8, dxd shl r3d, 6 vpbroadcastd m4, [base+z_xpos_bc] pmullw m3, m8 ; xpos vbroadcasti32x4 m5, [base+z_xpos_off1a] sub r3d, dxd shl dxd, 2 vbroadcasti32x4 m6, [base+z_xpos_off1b] psllw m8, 4 ; dx*4 lea r2, [strideq*3] .w16_loop: pshufb m1, m3, m4 psrlw m2, m3, 3 paddsb m0, m1, m5 vpermw m2, m2, m14 paddsb m1, m6 vpermb m0, m0, m7 vpermb m1, m1, m7 paddsw m3, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m0, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+r2 ], m0, 3 sub hd, 4 jz .w16_end lea dstq, [dstq+strideq*4] sub r3d, dxd jg .w16_loop vextracti32x4 xm7, m7, 3 .w16_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 mova [dstq+strideq*2], xm7 mova [dstq+r2 ], xm7 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_end_loop .w16_end: RET .w32_filter: mova m0, [base+z_filter_s1] vbroadcasti32x4 m1, [base+z_filter_s2] vbroadcasti32x4 m3, [base+z_filter_s3] vbroadcasti32x4 m4, [base+z_filter_s4] vpermi2b m0, m7, m2 ; al bl mova m5, [base+z_filter_s5] pshufb m1, m7, m1 ; ah bh vpbroadcastd m11, [base+z_filter_k+4*2+12*0] pshufb m3, m7, m3 ; cl ch vpbroadcastd m12, [base+z_filter_k+4*2+12*1] pshufb m4, m7, m4 ; el dl vpbroadcastd m13, [base+z_filter_k+4*2+12*2] vpermi2b m5, m7, m8 ; eh dh pmaddubsw m0, m11 pmaddubsw m1, m11 pmaddubsw m2, m3, m12 pmaddubsw m3, m13 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m0, m2 paddw m1, m3 paddw m0, m4 paddw m1, m5 pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m7, m0, m1 ret .w32: lea r3d, [hq+31] vpbroadcastb m9, r3d and r3d, 31 pminub m10, m9, [pb_0to63] or r3d, 32 ; imin(h+31, 63) vpermb m7, m10, [tlq] vpbroadcastb m8, [tlq+r3] test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main vpbroadcastd m2, [tlq-4] call .w32_filter cmp hd, 64 je 
.w32_h64_filter_end vpermb m8, m9, m7 vpermb m7, m10, m7 jmp .w32_main .w32_h64_filter_end: ; edge case for 32x64 movd xmm0, [tlq+r3-1] movd xmm1, [base+pb_8_56_0_0] add r3d, 2 pmaddubsw xmm0, xmm1 vptestmw k1, xmm1, xmm1 ; 0x01 pmulhrsw xm0, xmm0, xm15 vmovdqu8 m8{k1}, m0 .w32_main: rorx r2d, dxd, 30 vpbroadcastd m4, [base+z_xpos_bc] vpbroadcastw m3, r2d vbroadcasti32x8 m5, [base+z_xpos_off2a] shl r3d, 6 vbroadcasti32x8 m6, [base+z_xpos_off2b] sub r3d, dxd paddw m9, m3, m3 add dxd, dxd vinserti32x8 m3, ym9, 1 .w32_loop: pshufb m1, m3, m4 psrlw m2, m3, 3 paddsb m0, m1, m5 vpermw m2, m2, m14 paddsb m1, m6 vpermi2b m0, m7, m8 vpermi2b m1, m7, m8 paddsw m3, m9 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m15 pmulhrsw m1, m15 packuswb m0, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w32_end lea dstq, [dstq+strideq*2] sub r3d, dxd jg .w32_loop punpckhqdq ym8, ym8 .w32_end_loop: mova [dstq+strideq*0], ym8 mova [dstq+strideq*1], ym8 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_end_loop .w32_end: RET .w64_filter: vbroadcasti32x4 m3, [base+z_filter_s2] mova m1, [base+z_filter_s1] pshufb m0, m3 ; al bl vpermi2b m1, m7, m2 vbroadcasti32x4 m4, [base+z_filter_s4] pshufb m6, m8, m4 ; el dl pshufb m9, m7, m4 pminub m10, m13, [base+z_filter_s5] pshufb m2, m8, m3 ; ah bh pshufb m3, m7, m3 vbroadcasti32x4 m5, [base+z_filter_s3] vpermb m10, m10, m8 ; eh dh pshufb m11, m4 vpbroadcastd m4, [base+z_filter_k+4*2+12*0] pshufb m8, m5 ; cl ch pshufb m7, m5 vpbroadcastd m5, [base+z_filter_k+4*2+12*1] REPX {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11 pmaddubsw m4, m8, m5 pmaddubsw m5, m7, m5 paddw m0, m6 vpbroadcastd m6, [base+z_filter_k+4*2+12*2] paddw m1, m9 pmaddubsw m7, m6 pmaddubsw m8, m6 paddw m2, m10 paddw m3, m11 paddw m0, m4 paddw m1, m5 paddw m2, m8 paddw m3, m7 REPX {pmulhrsw x, m15}, m0, m2, m1, m3 packuswb m0, m2 packuswb m7, m1, m3 vpermb m8, m12, m0 ret .w64: lea r3d, [hq-1] movu m7, [tlq+64*0] vpbroadcastb m13, r3d pminub m12, m13, 
[pb_0to63] ; last operand of the pminub that starts at the end of the previous line
    ; --- tail of ipred_z1_8bpc (.w64 path); code unchanged ---
    or r3d, 64
    vpermb m8, m12, [tlq+64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    movu m0, [tlq+56]
    vpbroadcastd m2, [tlq-4]
    movu m11, [tlq+8]
    call .w64_filter
.w64_main:
    rorx r2d, dxd, 30
    vpbroadcastd m4, [base+z_xpos_bc]
    vpbroadcastw m3, r2d
    mova m5, [base+z_xpos_off2a]
    shl r3d, 6
    mova m6, [base+z_xpos_off2b]
    sub r3d, dxd
    mova m9, m3
.w64_loop:
    pshufb m1, m3, m4
    psrlw m2, m3, 3
    paddsb m0, m1, m5
    vpermw m2, m2, m14
    paddsb m1, m6
    vpermi2b m0, m7, m8
    vpermi2b m1, m7, m8
    paddsw m3, m9
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    pmulhrsw m0, m15
    pmulhrsw m1, m15
    packuswb m0, m1
    mova [dstq], m0
    dec hd
    jz .w64_end
    add dstq, strideq
    sub r3d, dxd
    jg .w64_loop
    vpermb m8, m13, m8
.w64_end_loop:
    mova [dstq], m8
    add dstq, strideq
    dec hd
    jg .w64_end_loop
.w64_end:
    RET

;------------------------------------------------------------------------------
; ipred_z2_8bpc: 8 bpc directional intra prediction that mixes two edges.
;
; Register roles established by the prologue below:
;   m7  = top edge, loaded from [tlq] after tlq has been incremented
;   m8  = left edge, loaded from [tlq-64] through the pb_63to0 permute
;   m14 = z_frac_table (maps positions to "64-frac, frac" weight pairs)
;   m15 = pw_512 (multiplier for the final pmulhrsw rounding step)
;   dxd/dyd = negated entries of dr_intra_derivative (180-angle / angle-90)
;   r7  = z_filter_t0, the anchor of every [base+...] address
; angled has bit 0x400 flipped here; the width paths treat a set bit as
; "edge filter disabled" (see the !enable_intra_edge_filter comments).
; Control is dispatched on tzcnt(w) through ipred_z2_8bpc_avx512icl_table.
;
; Each width path optionally upsamples or smooths the edges, then runs a loop
; that interpolates from the top edge and overrides the lanes whose x position
; went negative (mask from vpmovw2m) with left-edge interpolation. w8 and
; wider fall into a .wN_leftonly_loop once no lane uses the top edge any more.
;
; The helpers read the limits from the stack slots r7m (max_width) and r8m
; (max_height). NOTE(review): the C prototype is not visible here; confirm the
; slot numbers against the caller.
;------------------------------------------------------------------------------
cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
    tzcnt wd, wm
    movifnidn angled, anglem
    lea dxq, [dr_intra_derivative-90]
    movzx dyd, angleb
    xor angled, 0x400
    mov r7, dxq
    sub dxq, dyq
    movifnidn hd, hm
    and dyd, ~1
    and dxq, ~1
    movzx dyd, word [r7+dyq] ; angle - 90
    lea r7, [z_filter_t0]
    movzx dxd, word [dxq+270] ; 180 - angle
    movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
    mova m8, [base+pb_63to0]
    neg dyd
    vpermb m8, m8, [tlq-64] ; left
    lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
    mova m14, [base+z_frac_table]
    inc tlq
    vpbroadcastd m15, [base+pw_512]
    neg dxd
    jmp wq
; ---- width 4: 8 rows are produced per loop iteration ----
.w4:
    movd xm7, [tlq]
    vpbroadcastq m10, [base+z_xpos_off2a]
    test angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea r3d, [hq+2]
    add angled, 1022
    shl r3d, 6
    test r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    vpbroadcastd xm2, [base+pb_4]
    sub angled, 1075 ; angle - 53
    call .upsample_above
    lea r3d, [hq+3]
    vpbroadcastq m10, [pb_0to63+1]
    punpcklbw xm7, xm0, xm7
    call .filter_strength
    jmp .w4_filter_left
.w4_upsample_left:
    call .upsample_left
    movsldup m16, [base+z_ypos_off3]
    vpbroadcastd m9, [base+pb_16]
    punpcklbw xm8, xm0, xm8
    jmp .w4_main2
.w4_no_upsample_above:
    lea r3d, [hq+3]
    sub angled, 1112 ; angle - 90
    call .filter_strength
    test r3d, r3d
    jz .w4_no_filter_above
    vpbroadcastd xm5, [base+pb_3]
    call .filter_top_w16
.w4_no_filter_above:
    lea r3d, [hq+2]
    add angled, 973 ; angle + 883
    shl r3d, 6
    test r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    ; left-edge strength: reuses k2/ym16/ym17 left behind by .filter_strength
    vpbroadcastd ym0, [base+pb_90]
    psubb ym0, ym17
    vpcmpgtb k2{k2}, ym0, ym16
    kmovd r3d, k2
.w4_filter_left:
    test r3d, r3d
    jz .w4_main
    popcnt r3d, r3d ; mask bits -> kernel index consumed by .filter_left_h16
    call .filter_left_h16
.w4_main:
    movsldup m16, [base+z_ypos_off1]
    vpbroadcastd m9, [base+pb_8]
.w4_main2:
    vpbroadcastq m3, [base+z_ypos_mul1a]
    vpbroadcastw m0, dyd
    movsldup m1, [base+z_xpos_mul]
    vpbroadcastw m5, dxd
    vinserti32x4 m7, [tlq-16], 3
    vinserti32x4 m8, [tlq-16], 3
    pmullw m3, m0
    vbroadcasti32x4 m2, [base+z_xpos_bc]
    pmullw m1, m5 ; xpos0..3
    psllw m5, 5 ; dx*8
    psraw m4, m3, 6
    psrlw m3, 1
    packsswb m4, m4
    vpermw m3, m3, m14 ; 64-frac, frac
    punpcklbw m4, m4
    lea r2, [strideq*3]
    paddb m4, m16 ; base, base+1
.w4_loop:
    pshufb m16, m1, m2
    psrlw m0, m1, 3
    paddb m16, m10
    vpermw m0, m0, m14
    vpmovw2m k1, m16 ; base_x < 0
    vpermb m16, m16, m7
    pmaddubsw m16, m0
    vpermb m0, m4, m8
    pmaddubsw m16{k1}, m0, m3 ; masked lanes take the left-edge result instead
    pmulhrsw m16, m15
    vpmovwb ym16, m16
    movd [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2 ], xm16, 3
    sub hd, 8
    jl .w4_end
    ; the flags from "sub hd, 8" are still live at the jg below
    paddsw m1, m5
    vextracti128 xm16, ym16, 1
    lea dstq, [dstq+strideq*4]
    paddb m4, m9
    movd [dstq+strideq*0], xm16
    pextrd [dstq+strideq*1], xm16, 1
    pextrd [dstq+strideq*2], xm16, 2
    pextrd [dstq+r2 ], xm16, 3
    lea dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
; ---- edge upsampling helpers: filtered bytes are returned in xm0 and the
; callers interleave them with the original edge using punpcklbw ----
.upsample_above: ; w4/w8
    mova xm0, [tlq-1]
    xor angled, 0x7f ; 180 - angle
    add dxd, dxd
    jmp .upsample
.upsample_left: ; h4/h8
    palignr xm0, xm8, [tlq-16], 15
    vpbroadcastb xm2, hd
    add dyd, dyd
.upsample:
    pshufb xm1, xm0, [base+z_filter4_s1]
    pminub xm2, [base+z_filter_s4] ; clamp the shuffle indices to the limit in xm2
    vpbroadcastd xm3, [base+pb_m4_36]
    pshufb xm0, xm2
    pmaddubsw xm1, xm3
    pmaddubsw xm0, xm3
    paddw xm0, xm1
    pmulhrsw xm0, xm15
    packuswb xm0, xm0
    ret
; ---- .filter_strength: in r3d = size key, angled = adjusted angle.
; k2 = lanes of z_filter_wh equal to the key; k1 = those lanes where the
; broadcast angle byte is greater than the z_filter_t0 row chosen by
; angled >> 8. Returns the k1 bits in r3d. Also loads m2 = dword [tlq-4]
; broadcast and m9 = pb_0to63, and leaves ym16/ym17/k2 for the callers'
; left-edge test. ----
.filter_strength:
    vpbroadcastb ym16, r3d
    mov r3d, angled
    vpbroadcastd m2, [tlq-4]
    vpbroadcastb ym17, angled
    shr r3d, 8
    vpcmpeqb k2, ym16, [base+z_filter_wh]
    mova xm16, [base+z_filter_t0+r3*8]
    vpcmpgtb k1{k2}, ym17, ym16
    mova m9, [pb_0to63]
    kmovd r3d, k1
    ret
; ---- width 8: 4 rows per loop iteration ----
.w8:
    movq xm7, [tlq]
    vbroadcasti32x4 m10, [base+z_xpos_off2a]
    test angled, 0x400
    jnz .w8_main
    lea r3d, [angleq+126]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    vpbroadcastd xm2, [base+pb_8]
    sub angled, 53 ; angle - 53
    call .upsample_above
    lea r3d, [hq+7]
    vbroadcasti32x4 m10, [pb_0to63+1]
    punpcklbw xm7, xm0, xm7
    call .filter_strength
    jmp .w8_filter_left
.w8_upsample_left:
    call .upsample_left
    movshdup m16, [base+z_ypos_off3]
    vpbroadcastd m9, [base+pb_8]
    punpcklbw xm8, xm0, xm8
    jmp .w8_main2
.w8_no_upsample_above:
    lea r3d, [hq+7]
    sub angled, 90 ; angle - 90
    call .filter_strength
    test r3d, r3d
    jz .w8_no_filter_above
    vpbroadcastd xm5, [base+pb_7]
    call .filter_top_w16
.w8_no_filter_above:
    lea r3d, [angleq-51]
    mov r3b, hb
    cmp r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    vpbroadcastd ym0, [base+pb_90]
    psubb ym0, ym17
    vpcmpgtb k2{k2}, ym0, ym16
    kmovd r3d, k2
.w8_filter_left:
    test r3d, r3d
    jz .w8_main
    cmp hd, 32
    je .w8_filter_left_h32
    popcnt r3d, r3d
    call .filter_left_h16
    jmp .w8_main
.w8_filter_left_h32:
    call .filter_left_h64
.w8_main:
    movshdup m16, [base+z_ypos_off2]
    vpbroadcastd m9, [base+pb_4]
.w8_main2:
    vbroadcasti32x4 m3, [base+z_ypos_mul1a]
    vpbroadcastw m0, dyd
    movshdup m1, [base+z_xpos_mul]
    vpbroadcastw m5, dxd
    vinserti32x4 m7, [tlq-16], 3
    vinserti32x4 m8, [tlq-16], 3
    pmullw m3, m0
    vpbroadcastd m2, [base+pb_1]
    pmullw m1, m5 ; xpos0..3
    psllw m5, 4 ; dx*4
    psraw m4, m3, 6
    psrlw m3, 1
    packsswb m4, m4
    vpermw m3, m3, m14 ; 64-frac, frac
    lea r3d, [dxq+(8<<6)] ; counter tested by "add r3d, dxd / jge" in the loop
    paddsb m4, m16
    shl dxd, 2
    paddsb m0, m4, m2
    lea r2, [strideq*3]
    punpcklbw m4, m0 ; base, base+1
.w8_loop:
    pshufb m16, m1, m2
    psrlw m0, m1, 3
    paddb m16, m10
    vpermw m0, m0, m14
    vpmovw2m k1, m16 ; base_x < 0
    vpermb m16, m16, m7
    pmaddubsw m16, m0
    vpermb m0, m4, m8
    pmaddubsw m16{k1}, m0, m3
    pmulhrsw m16, m15
    vpmovwb ym16, m16
    vextracti128 xm17, ym16, 1
    movq [dstq+strideq*0], xm16
    movhps [dstq+strideq*1], xm16
    movq [dstq+strideq*2], xm17
    movhps [dstq+r2 ], xm17
    sub hd, 4
    jz .w8_end
    paddw m1, m5
    lea dstq, [dstq+strideq*4]
    paddb m4, m9
    add r3d, dxd
    jge .w8_loop
.w8_leftonly_loop:
    vpermb m16, m4, m8
    pmaddubsw m16, m3
    paddb m4, m9
    pmulhrsw m16, m15
    vpmovwb ym16, m16
    vextracti128 xm17, ym16, 1
    movq [dstq+strideq*0], xm16
    movhps [dstq+strideq*1], xm16
    movq [dstq+strideq*2], xm17
    movhps [dstq+r2 ], xm17
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8_leftonly_loop
.w8_end:
    RET
; ---- .filter_top_w16: smooths the top edge in xm7. In: r3d = strength mask
; bits (popcnt picks the z_filter_k row), xm5 = last valid index, xm2/xm9 as
; left by .filter_strength. Only bytes whose index is below max_width are
; replaced (k1). ----
.filter_top_w16:
    mova xm0, [base+z_filter_s1]
    popcnt r3d, r3d
    pminub xm4, xm5, [base+z_filter_s4]
    vpermi2b xm0, xm7, xm2
    pminub xm5, [base+z_filter_s5]
    pshufb xm1, xm7, [base+z_filter_s2]
    vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
    pshufb xm3, xm7, [base+z_filter_s3]
    vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
    pshufb xm4, xm7, xm4
    vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
    pshufb xm5, xm7, xm5
    pmaddubsw xm0, xm11
    pmaddubsw xm1, xm11
    pmaddubsw xm6, xm3, xm12
    vpbroadcastd xm12, r7m ; max_width
    pmaddubsw xm3, xm13
    pmaddubsw xm4, xm11
    pmaddubsw xm5, xm11
    packssdw xm12, xm12
    paddw xm0, xm6
    paddw xm1, xm3
    paddw xm0, xm4
    paddw xm1, xm5
    packsswb xm12, xm12
    pmulhrsw xm0, xm15
    pmulhrsw xm1, xm15
    vpcmpgtb k1, xm12, xm9 ; x < max_width
    packuswb xm7{k1}, xm0, xm1
    ret
; ---- .filter_left_h16: smooths the left edge in xm8. In: r3 = kernel index
; (already popcnt'ed by the caller, or the constant 3 from .w32). Indices are
; clamped to h-1; only bytes whose index is below max_height are replaced. ----
.filter_left_h16:
    lea r5d, [hq-1]
    mova xm0, [base+z_filter_s1]
    vpbroadcastb xm5, r5d
    vpermi2b xm0, xm8, xm2
    pminub xm4, xm5, [base+z_filter_s4]
    pshufb xm1, xm8, [base+z_filter_s2]
    pminub xm5, [base+z_filter_s5]
    pshufb xm3, xm8, [base+z_filter_s3]
    vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
    pshufb xm4, xm8, xm4
    vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
    pshufb xm5, xm8, xm5
    vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
    pmaddubsw xm0, xm11
    pmaddubsw xm1, xm11
    pmaddubsw xm6, xm3, xm12
    vpbroadcastd xm12, r8m ; max_height
    pmaddubsw xm3, xm13
    pmaddubsw xm4, xm11
    pmaddubsw xm5, xm11
    packssdw xm12, xm12
    paddw xm0, xm6
    paddw xm1, xm3
    paddw xm0, xm4
    paddw xm1, xm5
    packsswb xm12, xm12
    pmulhrsw xm0, xm15
    pmulhrsw xm1, xm15
    vpcmpgtb k1, xm12, xm9 ; y < max_height
    packuswb xm8{k1}, xm0, xm1
    ret
; ---- width 16: 4 rows per loop iteration ----
.w16:
    movu xm7, [tlq] ; top
    test angled, 0x400
    jnz .w16_main
    lea r3d, [hq+15]
    sub angled, 90
    call .filter_strength
    test r3d, r3d
    jz .w16_no_filter_above
    vpbroadcastd xm5, [base+pb_15]
    call .filter_top_w16
.w16_no_filter_above:
    cmp hd, 16
    jg .w16_filter_left_h64
    vpbroadcastd ym0, [base+pb_90]
    psubb ym0, ym17
    vpcmpgtb k2{k2}, ym0, ym16
    kmovd r3d, k2
    test r3d, r3d
    jz .w16_main
    popcnt r3d, r3d
    call .filter_left_h16
    jmp .w16_main
.w16_filter_left_h64:
    call .filter_left_h64
.w16_main:
    vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8
    vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..15
    vpbroadcastw m0, dyd
    vinserti32x4 m7, [tlq-16], 3
    vpbroadcastd m2, [base+pb_1]
    vpbroadcastw m12, dxd
    movshdup m1, [base+z_xpos_mul]
    pmullw m6, m0
    vbroadcasti32x4 m3, [base+z_xpos_off2a]
    pmullw m5, m0
    vbroadcasti32x4 m4, [base+z_xpos_off2b]
    pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3
    vpbroadcastd m9, [base+pb_4]
    psllw m12, 4 ; dx*4
    movshdup m16, [base+z_ypos_off2]
    psrlw m10, m6, 1
    psrlw m11, m5, 1
    vpermw m10, m10, m14 ; 64-frac, frac
    psraw m6, 6
    vpermw m11, m11, m14
    psraw m5, 6
    mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
    packsswb m6, m5
    mov r3d, 1<<6
    paddsb m6, m16
    sub r5d, dxd ; left-only threshold
    paddsb m0, m6, m2
    shl dxd, 2
    punpcklbw m5, m6, m0 ; base, base+1
    lea r2, [strideq*3]
    punpckhbw m6, m0
.w16_loop:
    pshufb m17, m1, m2
    psrlw m0, m1, 3
    paddb m16, m3, m17
    vpermw m0, m0, m14
    paddb m17, m4
    vpmovw2m k1, m16
    vpermb m16, m16, m7
    vpmovw2m k2, m17
    vpermb m17, m17, m7
    pmaddubsw m16, m0
    pmaddubsw m17, m0
    add r3d, dxd
    jge .w16_toponly ; counter still >= 0: skip the left-edge blend
    mova m0, m8
    vpermt2b m0, m5, m7
    pmaddubsw m16{k1}, m0, m10
    mova m0, m8
    vpermt2b m0, m6, m7
    pmaddubsw m17{k2}, m0, m11
.w16_toponly:
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    mova [dstq+strideq*0], xm16
    vextracti128 [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    sub hd, 4
    jz .w16_end
    paddw m1, m12
    lea dstq, [dstq+strideq*4]
    paddb m5, m9
    paddb m6, m9
    cmp r3d, r5d
    jge .w16_loop
.w16_leftonly_loop:
    vpermb m16, m5, m8
    vpermb m17, m6, m8
    pmaddubsw m16, m10
    pmaddubsw m17, m11
    paddb m5, m9
    paddb m6, m9
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    mova [dstq+strideq*0], xm16
    vextracti128 [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16_leftonly_loop
.w16_end:
    RET
; ---- width 32: 2 rows per loop iteration. The top edge is filtered inline
; with the fixed kernel row z_filter_k+4*2. ----
.w32:
    movu ym7, [tlq]
    test angled, 0x400
    jnz .w32_main
    vpbroadcastd m2, [tlq-4]
    mova ym0, [base+z_filter_s1]
    vbroadcasti32x4 ym1, [base+z_filter_s2]
    vbroadcasti32x4 ym3, [base+z_filter_s3]
    vbroadcasti32x4 ym4, [base+z_filter_s4]
    vpermi2b ym0, ym7, ym2 ; al bl
    vpbroadcastd ym5, [base+pb_31]
    pminub ym5, [base+z_filter_s5]
    pshufb ym1, ym7, ym1 ; ah bh
    vpbroadcastd ym11, [base+z_filter_k+4*2+12*0]
    pshufb ym3, ym7, ym3 ; cl ch
    vpbroadcastd ym12, [base+z_filter_k+4*2+12*1]
    pshufb ym4, ym7, ym4 ; el dl
    vpbroadcastd ym13, [base+z_filter_k+4*2+12*2]
    vpermb ym5, ym5, ym7 ; eh dh
    pmaddubsw ym0, ym11
    pmaddubsw ym1, ym11
    pmaddubsw ym6, ym3, ym12
    vpbroadcastd ym12, r7m ; max_width (the same argument slot .filter_top_w16 reads)
    pmaddubsw ym3, ym13
    pmaddubsw ym4, ym11
    pmaddubsw ym5, ym11
    mova m9, [pb_0to63]
    packssdw ym12, ym12
    paddw ym0, ym6
    paddw ym1, ym3
    paddw ym0, ym4
    paddw ym1, ym5
    packsswb ym12, ym12
    pmulhrsw ym0, ym15
    pmulhrsw ym1, ym15
    vpcmpgtb k1, ym12, ym9 ; x < max_width
    packuswb ym7{k1}, ym0, ym1
    cmp hd, 16
    jg .w32_filter_h64
    mov r3d, 3 ; kernel index 3 -> same z_filter_k+4*2 row as used above
    call .filter_left_h16
    jmp .w32_main
.w32_filter_h64:
    call .filter_left_h64
.w32_main:
    vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 8
    vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..15
    vpbroadcastw m0, dyd
    vinserti32x4 m7, [tlq-16], 3
    rorx r2q, dxq, 62 ; dx << 2
    vpbroadcastd m2, [base+pb_1]
    vpbroadcastw m1, r2d
    pmullw m6, m0
    vbroadcasti32x8 m3, [base+z_xpos_off2a]
    pmullw m5, m0
    vbroadcasti32x8 m4, [base+z_xpos_off2b]
    mova ym0, ym1
    paddw m12, m1, m1
    vpbroadcastd m9, [base+pb_2]
    paddw m1, m0 ; xpos1 xpos0
    mova ym0, ym2
    psrlw m10, m6, 1
    psrlw m11, m5, 1
    vpermw m10, m10, m14 ; 64-frac, frac
    psraw m6, 6
    vpermw m11, m11, m14
    psraw m5, 6
    mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
    packsswb m6, m5
    mov r3d, 1<<6
    paddsb m6, m0
    sub r5d, dxd ; left-only threshold
    paddsb m0, m6, m2
    add dxd, dxd
    punpcklbw m5, m6, m0 ; base, base+1
    punpckhbw m6, m0
.w32_loop:
    pshufb m17, m1, m2
    psrlw m0, m1, 3
    paddb m16, m3, m17
    vpermw m0, m0, m14
    paddb m17, m4
    vpmovw2m k1, m16
    vpermb m16, m16, m7
    vpmovw2m k2, m17
    vpermb m17, m17, m7
    pmaddubsw m16, m0
    pmaddubsw m17, m0
    add r3d, dxd
    jge .w32_toponly
    mova m0, m8
    vpermt2b m0, m5, m7
    pmaddubsw m16{k1}, m0, m10
    mova m0, m8
    vpermt2b m0, m6, m7
    pmaddubsw m17{k2}, m0, m11
.w32_toponly:
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    vextracti32x8 [dstq+strideq*0], m16, 1
    mova [dstq+strideq*1], ym16
    sub hd, 2
    jz .w32_end
    paddw m1, m12
    lea dstq, [dstq+strideq*2]
    paddb m5, m9
    paddb m6, m9
    cmp r3d, r5d
    jge .w32_loop
.w32_leftonly_loop:
    vpermb m16, m5, m8
    vpermb m17, m6, m8
    pmaddubsw m16, m10
    pmaddubsw m17, m11
    paddb m5, m9
    paddb m6, m9
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    vextracti32x8 [dstq+strideq*0], m16, 1
    mova [dstq+strideq*1], ym16
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_leftonly_loop
.w32_end:
    RET
; ---- .filter_left_h64: full-width (zmm) left-edge smoothing of m8 with the
; fixed kernel row z_filter_k+4*2; indices are clamped to h-1 and only bytes
; whose index is below max_height are replaced. Clobbers r3d. ----
.filter_left_h64:
    mova m0, [base+z_filter_s1]
    lea r3d, [hq-1]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpbroadcastb m5, r3d
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vpermi2b m0, m8, m2 ; al bl
    pminub m5, [base+z_filter_s5]
    pshufb m1, m8, m1 ; ah bh
    vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
    pshufb m3, m8, m3 ; cl ch
    vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
    pshufb m4, m8, m4 ; el dl
    vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
    vpermb m5, m5, m8 ; eh dh
    pmaddubsw m0, m11
    pmaddubsw m1, m11
    pmaddubsw m6, m3, m12
    vpbroadcastd m12, r8m ; max_height
    pmaddubsw m3, m13
    pmaddubsw m4, m11
    pmaddubsw m5, m11
    packssdw m12, m12
    paddw m0, m6
    paddw m1, m3
    paddw m0, m4
    paddw m1, m5
    packsswb m12, m12
    pmulhrsw m0, m15
    pmulhrsw m1, m15
    vpcmpgtb k1, m12, m9 ; y < max_height
    packuswb m8{k1}, m0, m1
    ret
; ---- width 64: 1 row per loop iteration; top edge filtered inline ----
.w64:
    movu m7, [tlq]
    test angled, 0x400
    jnz .w64_main
    vpbroadcastd m2, [tlq-4]
    mova m0, [base+z_filter_s1]
    vbroadcasti32x4 m1, [base+z_filter_s2]
    vbroadcasti32x4 m3, [base+z_filter_s3]
    vbroadcasti32x4 m4, [base+z_filter_s4]
    vpermi2b m0, m7, m2 ; al bl
    vpbroadcastd m5, [base+pb_63]
    pminub m5, [base+z_filter_s5]
    pshufb m1, m7, m1 ; ah bh
    vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
    pshufb m3, m7, m3 ; cl ch
    vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
    pshufb m4, m7, m4 ; el dl
    vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
    vpermb m5, m5, m7 ; eh dh
    pmaddubsw m0, m11
    pmaddubsw m1, m11
    pmaddubsw m6, m3, m12
    vpbroadcastd m12, r7m ; max_width (the same argument slot .filter_top_w16 reads)
    pmaddubsw m3, m13
    pmaddubsw m4, m11
    pmaddubsw m5, m11
    mova m9, [pb_0to63]
    packssdw m12, m12
    paddw m0, m6
    paddw m1, m3
    paddw m0, m4
    paddw m1, m5
    packsswb m12, m12
    pmulhrsw m0, m15
    pmulhrsw m1, m15
    vpcmpgtb k1, m12, m9 ; x < max_width
    packuswb m7{k1}, m0, m1
    call .filter_left_h64 ; always filter the full 64 pixels for simplicity
.w64_main:
    vpbroadcastw m5, dyd
    vpbroadcastd m9, [tlq-4]
    rorx r2q, dxq, 62 ; dx << 2
    pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
    pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge
    vpbroadcastw m1, r2d ; xpos
    mova m3, [base+z_xpos_off2a]
    mova m4, [base+z_xpos_off2b]
    mova m12, m1
    vpbroadcastd m2, [base+pb_1]
    psrlw m10, m6, 1
    psrlw m11, m5, 1
    vpermw m10, m10, m14 ; 64-frac, frac
    psraw m6, 6
    vpermw m11, m11, m14
    psraw m5, 6
    mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
    packsswb m6, m5
    mov r3d, 1<<6
    paddsb m0, m6, m2
    sub r5d, dxd ; left-only threshold
    punpcklbw m5, m6, m0 ; base, base+1
    punpckhbw m6, m0
.w64_loop:
    pshufb m17, m1, m2
    psrlw m0, m1, 3
    paddb m16, m3, m17
    vpermw m0, m0, m14
    paddb m17, m4
    vpmovw2m k1, m16 ; base_x < 0
    vpermi2b m16, m7, m9
    vpmovw2m k2, m17
    vpermi2b m17, m7, m9
    pmaddubsw m16, m0
    pmaddubsw m17, m0
    add r3d, dxd
    jge .w64_toponly
    mova m0, m8
    vpermt2b m0, m5, m9
    pmaddubsw m16{k1}, m0, m10
    mova m0, m8
    vpermt2b m0, m6, m9
    pmaddubsw m17{k2}, m0, m11
.w64_toponly:
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    mova [dstq], m16
    dec hd
    jz .w64_end
    paddw m1, m12
    add dstq, strideq
    paddb m5, m2
    paddb m6, m2
    cmp r3d, r5d
    jge .w64_loop
.w64_leftonly_loop:
    vpermb m16, m5, m8
    vpermb m17, m6, m8
    pmaddubsw m16, m10
    pmaddubsw m17, m11
    paddb m5, m2
    paddb m6, m2
    pmulhrsw m16, m15
    pmulhrsw m17, m15
    packuswb m16, m17
    mova [dstq], m16
    add dstq, strideq
    dec hd
    jg .w64_leftonly_loop
.w64_end:
    RET

;------------------------------------------------------------------------------
; ipred_z3_8bpc: 8 bpc directional intra prediction that reads a single edge
; stored below tlq (negative offsets), loaded through the pb_63to0 permute.
; The prologue sets r7 = z_filter_t0 (anchor of [base+...]), m0 = pb_63to0,
; m14 = z_frac_table, m15 = pw_512, and dyd = dr_intra_derivative entry << 6.
; angled has 180 subtracted and bit 0x400 flipped; a set bit is tested as
; "edge filter disabled". Dispatch is on tzcnt(w) through
; ipred_z3_8bpc_avx512icl_table. Code below is unchanged.
;------------------------------------------------------------------------------
cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
    lea r7, [z_filter_t0]
    tzcnt wd, wm
    movifnidn angled, anglem
    lea t0, [dr_intra_derivative+45*2-1]
    movsxd wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
    sub angled, 180
    mov dyd, angled
    neg dyd
    xor angled, 0x400
    or dyq, ~0x7e
    mova m0, [base+pb_63to0]
    movzx dyd, word [t0+dyq]
    lea wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
    movifnidn hd, hm
    mova m14, [base+z_frac_table]
    shl dyd, 6
    vpbroadcastd m15, [base+pw_512]
    jmp wq
.w4:
    cmp angleb, 40
    jae .w4_no_upsample
    lea r3d, [angleq-1024]
    sar r3d, 7
    add r3d, hd
    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
    lea r3d, [hq+4]
    call .upsample
    movshdup m1, [base+z_ypos_off1]
    vpbroadcastd m6, [base+pb_16]
    jmp .w4_main2
.w4_no_upsample:
    lea r3d, [hq+3]
    vpbroadcastb m9, r3d
    vpxord m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
    pmaxub m1, m0
    vpermb m7, m1, [tlq-64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w4_main
    vpbroadcastb xm1, angled
    shr angled, 8
    vpcmpeqb k1, xm9, [base+z_filter_wh]
    vpbroadcastd m2, [tlq-3]
; Continuation of ipred_z3_8bpc, inside .w4_no_upsample (the cglobal header and
; the first half of this path are on the preceding line of the file).
; State at this point, as set up by that preceding code: m0 = pb_63to0,
; m7 = edge pixels, m9 = broadcast(h+3), xm1 = broadcast angle byte,
; k1 = lanes of z_filter_wh equal to m9, m14 = z_frac_table, m15 = pw_512.
    vpcmpgtb k1{k1}, xm1, [base+z_filter_t0+angleq*8]
    kmovw r5d, k1
    test r5d, r5d
    jz .w4_main ; no strength bit set: edge stays unfiltered
    pminub m9, [pb_0to63]
    ; the edge filter kernels are shared with ipred_z1 (called by mangled name)
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
    vpermb m7, m9, m0
.w4_main:
    movsldup m1, [base+z_ypos_off1]
    vpbroadcastd m6, [base+pb_8]
.w4_main2:
    vpbroadcastw m0, dyd
    vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4
    pmulhuw m2, m0 ; ypos >> 1
    lea r2, [strideq*3]
    vpermw m3, m2, m14 ; 64-frac, frac
    psrlw m2, 5
    packsswb m2, m2
    punpcklbw m2, m2
    paddsb m2, m1 ; base, base+1
; 8 rows of 4 pixels per iteration: 4 dwords from xm0, then 4 from its upper lane
.w4_loop:
    vpermb m0, m2, m7
    pmaddubsw m0, m3
    paddsb m2, m6
    pmulhrsw m0, m15
    vpmovwb ym0, m0
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2 ], xm0, 3
    sub hd, 8
    jl .w4_end
    ; the flags from "sub hd, 8" are still live at the jg below
    vextracti32x4 xm0, ym0, 1
    lea dstq, [dstq+strideq*4]
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2 ], xm0, 3
    lea dstq, [dstq+strideq*4]
    jg .w4_loop
.w4_end:
    RET
; .upsample: in r3d = pixel count (see callers). Loads the edge from [tlq-31]
; with indices floored by z3_upsample, filters it with the pb_m4_36 taps,
; interleaves the result with the original pixels into ym7, and doubles dyd.
.upsample:
    xor r3d, 31 ; 31 - (h + imin(w, h))
    vbroadcasti32x4 ym0, [base+z_xpos_off2a]
    vpbroadcastb ym7, r3d
    pmaxub ym7, [base+z3_upsample]
    vbroadcasti32x4 ym1, [base+z_filter_s4]
    vpermb ym7, ym7, [tlq-31]
    vpbroadcastd ym2, [base+pb_m4_36]
    pshufb ym0, ym7, ym0
    psrldq ym7, 1
    pshufb ym1, ym7, ym1
    pmaddubsw ym0, ym2
    pmaddubsw ym1, ym2
    add dyd, dyd
    paddw ym0, ym1
    pmulhrsw ym0, ym15
    packuswb ym0, ym0
    punpcklbw ym7, ym0
    ret
; ---- width 8: 4 rows per loop iteration ----
.w8:
    lea r3d, [angleq+216]
    mov r3b, hb
    cmp r3d, 8
    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
    lea r3d, [hq*2]
    call .upsample
    pshufd m1, [base+z_ypos_off1], q0000
    vpbroadcastd m6, [base+pb_8]
    jmp .w8_main2
.w8_no_upsample:
    mov r3d, 8
    cmp hd, 4
    cmove r3d, hd
    lea r3d, [r3+hq-1]
    xor r3d, 63 ; 63 - (h + imin(w, h))
    vpbroadcastb m1, wd ; wd is r3d here (w is the 4th cglobal argument)
    pmaxub m1, m0
    vpermb m7, m1, [tlq-64*1]
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w8_main
    lea r3d, [hq+7]
    call .filter_strength
    test r5d, r5d
    jz .w8_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    vpermb m7, m10, m0
.w8_main:
    movsldup m1, [base+z_ypos_off2]
    vpbroadcastd m6, [base+pb_4]
.w8_main2:
    vpbroadcastw m0, dyd
    vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8
    pmulhuw m2, m0 ; ypos >> 1
    lea r2, [strideq*3]
    vpermw m3, m2, m14 ; 64-frac, frac
    psrlw m2, 5
    packsswb m2, m2
    punpcklbw m2, m2
    paddsb m2, m1 ; base, base+1
.w8_loop:
    vpermb m0, m2, m7
    pmaddubsw m0, m3
    paddsb m2, m6
    pmulhrsw m0, m15
    vpmovwb ym0, m0
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1
    movhps [dstq+r2 ], xm1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8_loop
    RET
; .filter_strength: in r3d = size key, angled = adjusted angle.
; Out: r5d = strength mask bits (lanes where z_filter_wh == key and the angle
; byte > the z_filter_t0 row chosen by angled >> 8), m9 = broadcast key,
; m10 = min(key, pb_0to63), m2 = dword [tlq-3] broadcast. Shifts angled right
; by 8 as a side effect. .filter_strength2 skips the m2 load.
.filter_strength:
    vpbroadcastd m2, [tlq-3]
.filter_strength2:
    vpbroadcastb m9, r3d
    vpbroadcastb ym1, angled
    shr angled, 8
    vpcmpeqb k1, ym9, [base+z_filter_wh]
    mova xm0, [base+z_filter_t0+angleq*8]
    vpcmpgtb k1{k1}, ym1, ym0
    pminub m10, m9, [pb_0to63]
    kmovd r5d, k1
    ret
; .w16_load: in r3d = w. Loads the edge into m7 (and m8 for the part beyond
; the first 64 bytes) with permute indices floored at -(h + imin(w, h)) & 63;
; for h == 64 the second vector is read from [tlq-64*2].
.w16_load:
    cmp r3d, hd
    cmovae r3d, hd
    add r3d, hd
    mova m7, [tlq-64*1]
    neg r3d ; -(h + imin(w, h))
    and r3d, 63
    vpbroadcastb m1, r3d
    pmaxub m2, m0, m1
    cmp hd, 64
    je .w16_load_h64
    vpermb m8, m1, m7
    vpermb m7, m2, m7
    ret
.w16_load_h64:
    vpermb m7, m0, m7
    vpermb m8, m2, [tlq-64*2]
    ret
; ---- width 16: 4 rows per loop iteration ----
.w16:
    mov r3d, 16
    call .w16_load
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w16_main
    vpbroadcastd m2, [tlq-3]
    cmp hd, 64
    je .w16_filter64
    lea r3d, [hq+15]
    call .filter_strength2
    test r5d, r5d
    jz .w16_main
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
    pminub m10, m9, [pb_0to63]
    vpermb m8, m9, m0
    vpermb m7, m10, m0
    jmp .w16_main
.w16_filter64:
    ; h == 64: set up the inputs read by the z1 .w64_filter routine
    vpbroadcastd m13, [base+pb_15]
    valignq m0, m8, m7, 7
    pminub m12, m13, [pb_0to63]
    valignq m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
    vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw m0, dyd
    vpbroadcastd m6, [base+pb_4]
    pmulhuw m3, m0 ; ypos >> 1
    pmulhuw m2, m0
    movshdup m0, [base+z_ypos_off2]
    lea r2, [strideq*3]
    vpbroadcastd m1, [base+pb_1]
    vpermw m4, m3, m14 ; 64-frac, frac
    psrlw m3, 5
    vpermw m5, m2, m14
    psrlw m2, 5
    packsswb m3, m2
    paddsb m3, m0
    paddsb m1, m3
    punpcklbw m2, m3, m1 ; base, base+1
    punpckhbw m3, m1
.w16_loop:
; Z3_PERM2: gathers pixel pairs from the two-vector edge (m7/m8) with the index
; vectors m2/m3, weights them with m4/m5, advances the indices by m6, and
; leaves the rounded, packed bytes in m0. Clobbers m1.
%macro Z3_PERM2 0
    mova m0, m7
    vpermt2b m0, m2, m8
    mova m1, m7
    vpermt2b m1, m3, m8
    pmaddubsw m0, m4
    pmaddubsw m1, m5
    paddsb m2, m6
    paddsb m3, m6
    pmulhrsw m0, m15
    pmulhrsw m1, m15
    packuswb m0, m1
%endmacro
    Z3_PERM2
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2 ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16_loop
    RET
; ---- width 32: 2 rows per loop iteration; no strength test on this path ----
.w32:
    mov r3d, 32
    call .w16_load
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w32_main
    vpbroadcastd m2, [tlq-3]
    cmp hd, 64
    je .w32_filter64
    lea r3d, [hq+31]
    vpbroadcastb m9, r3d
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
    vpermb m8, m9, m7
    jmp .w32_main
.w32_filter64:
    vpbroadcastd m13, [base+pb_31]
    valignq m0, m8, m7, 7
    pminub m12, m13, [pb_0to63]
    valignq m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
    vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8
    vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15
    vpbroadcastw m0, dyd
    vpbroadcastd m1, [base+pb_1]
    pmulhuw m3, m0 ; ypos >> 1
    pmulhuw m2, m0
    vpbroadcastd m6, [base+pb_2]
    mova ym0, ym1 ; m0 = pb_1 in the low 256 bits, zero above
    vpermw m4, m3, m14 ; 64-frac, frac
    psrlw m3, 5
    vpermw m5, m2, m14
    psrlw m2, 5
    packsswb m3, m2
    paddsb m3, m0
    paddsb m1, m3
    punpcklbw m2, m3, m1 ; base, base+1
    punpckhbw m3, m1
.w32_loop:
    Z3_PERM2
    vextracti32x8 [dstq+strideq*0], m0, 1
    mova [dstq+strideq*1], ym0
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
; ---- width 64: 1 row per loop iteration ----
.w64:
    mova m7, [tlq-64*1]
    cmp hd, 64
    je .w64_h64
    lea r3d, [hq*2-1]
    xor r3d, 63 ; -(h + imin(w, h)) & 63
    vpbroadcastb m1, r3d
    pmaxub m0, m1
    vpermb m8, m1, m7
    jmp .w64_filter
.w64_h64:
    vpermb m8, m0, [tlq-64*2]
.w64_filter:
    vpermb m7, m0, m7
    test angled, 0x400 ; !enable_intra_edge_filter
    jnz .w64_main
    lea r3d, [hq-1]
    vpbroadcastd m2, [tlq-3]
    vpbroadcastb m13, r3d
    valignq m0, m8, m7, 7
    pminub m12, m13, [pb_0to63]
    valignq m11, m8, m7, 1
    call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
    vpbroadcastw m2, dyd
    pmulhuw m3, m2, [base+z_ypos_mul2a]
    pmulhuw m2, [base+z_ypos_mul2b]
    vpbroadcastd m6, [base+pb_1]
    vpermw m4, m3, m14 ; 64-frac, frac
    psrlw m3, 5
    vpermw m5, m2, m14
    psrlw m2, 5
    packsswb m3, m2
    paddsb m1, m3, m6
    punpcklbw m2, m3, m1 ; base, base+1
    punpckhbw m3, m1
.w64_loop:
    Z3_PERM2
    mova [dstq], m0
    add dstq, strideq
    dec hd
    jg .w64_loop
    RET

; The ipred_filter code processes 4x2 blocks in the following order
; which increases parallelism compared to doing things row by row.
; Some redundant blocks are calculated for w > 4.
; w4 w8 w16 w32 ; 1 1 2 1 2 3 4 1 2 3 4 9 a b c ; 2 2 3 2 3 4 5 2 3 4 5 a b c d ; 3 3 4 3 4 5 6 3 4 5 6 b c d e ; 4 4 5 4 5 6 7 4 5 6 7 c d e f ; 5 5 6 5 6 7 8 5 6 7 8 d e f g ; 6 6 7 6 7 8 9 6 7 8 9 e f g h ; 7 7 8 7 8 9 a 7 8 9 a f g h i ; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ ; 9 9 a b h i j ; a b i j ; b j cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt %define base r6-filter_taps lea r6, [filter_taps] %ifidn fltd, fltm movzx fltd, fltb %else movzx fltd, byte fltm %endif vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 movifnidn hd, hm shl fltd, 6 vpbroadcastd m6, [base+pd_8] vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 vbroadcasti32x4 m8, [r6+fltq+16*1] vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ vbroadcasti32x4 m10, [r6+fltq+16*3] mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 cmp wd, 8 jb .w4 vpbroadcastd ym2, [tlq+5] mova m11, [base+filter_perm] mov r5, 0xffffffffffff000f psrldq xmm2, 1 ; __ t0 kmovq k1, r5 ; 0x000f psraw xm5, xmm0, 4 packuswb xmm2, xm5 ; __ t0 a0 b0 pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 je .w8 kxnorb k3, k3, k3 ; 0x00ff vpbroadcastd xm3, [tlq-4] kandnq k2, k3, k1 ; 0xffffffffffff0000 vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 vpbroadcastd m2, [tlq+9] vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ kunpckbw k4, k1, k3 ; 0x0fff packssdw ym0, ym1 psraw ym0, 4 ; a0 d0 a1 b1 packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 vpbroadcastd m2, [tlq+13] vpdpbusd m4, m3, m9 vpdpbusd m1, 
m3, m10 mova m12, [base+filter_end] lea r5d, [hq-6] mov r6, dstq cmovp hd, r5d ; w == 16 ? h : h - 6 packssdw m4, m1 psraw m4, 4 ; e0 f0 c1 d1 a2 b2 packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 .w16_loop: vpbroadcastd xm3, [tlq-8] vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ mova m1, m6 vpdpbusd m1, m2, m7 mova m0, m6 vpdpbusd m0, m2, m8 sub tlq, 2 vpdpbusd m1, m3, m9 vpdpbusd m0, m3, m10 packssdw m1, m0 mova m0, m4 psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop cmp wd, 16 je .ret mova xm13, [filter_perm+16] mova xmm3, [r6+strideq*0] punpckhdq xmm3, [r6+strideq*1] vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 pinsrb xm3, xmm3, [tlq+r5+16], 7 pshufb xm3, xm13 vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckbw k5, k3, k1 ; 0xff0f lea r3, [strideq*3] vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpbroadcastd ym2, [tlq+r5+21] pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 punpckhqdq xmm3, [r6+r3] pinsrb xmm3, [r6+strideq*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ mova m4, m6 vpdpbusd m4, 
m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kxnord k3, k3, k4 ; 0xfffff0ff lea r4, [strideq*5] vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpbroadcastd m2, [tlq+r5+25] pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 vextracti32x4 [dstq+strideq*2], m5, 2 vextracti32x4 [dstq+r3 ], m5, 3 punpckhqdq xmm3, [r6+r4] pinsrb xmm3, [r6+strideq*4+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckwd k1, k1, k2 ; 0x000f0000 vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 vpbroadcastd m2, [tlq+r5+29] pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 vextracti32x4 [dstq+strideq*4], m5, 2 vextracti32x4 [dstq+r4 ], m5, 3 lea r0, [strideq+r3*2] .w32_loop: punpckhqdq xmm3, [r6+r0] pinsrb xmm3, [r6+r3*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __ .w32_loop_tail: mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 mova m1, m0 psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 vextracti32x4 [r6+strideq*0+16], m5, 2 vextracti32x4 [r6+strideq*1+16], m5, 3 lea r6, [r6+strideq*2] sub r5d, 
2 jg .w32_loop vpermb m3, m11, m1 cmp r5d, -6 jg .w32_loop_tail .ret: RET .w8: vpermb ym3, ym11, ymm2 .w8_loop: vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 sub tlq, 2 vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 mova ym3, ym5 packssdw ym0, ym1 psraw ym5, ym0, 4 ; c0 d0 a1 b1 packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET .w4_loop: vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 sub tlq, 2 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 .w4: psraw xmm0, 4 ; a0 b0 packuswb xmm0, xmm0 movd [dstq+strideq*0], xmm0 pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 movd [dstq+strideq*1], xmm2 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_loop RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/ipred_sse.asm000066400000000000000000005572041517466257200237540ustar00rootroot00000000000000; Copyright © 2018-2021, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro ; sm_weights[], but modified to precalculate x and 256-x with offsets to ; enable efficient use of pmaddubsw (which requires signed values) smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 
8, 8, 8, 8 z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7 z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 z_filter_wh4: db 7, 7, 19, 7, z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 pd_32768: dd 32768 z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11 z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 pw_m1to4: dw -1, -2, -3, -4 z_filter_k: times 4 db 0, 16 times 4 db 0, 20 times 4 db 8, 16 times 4 db 32, 16 times 4 db 24, 20 times 4 db 16, 16 times 4 db 0, 0 times 4 db 0, 0 pw_8: times 8 db 8, 0 pb_3: times 16 db 3 pb_16: times 16 db 16 pw_62: times 8 dw 62 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_512: times 8 dw 512 pw_m256: times 8 dw -256 pb_2: times 8 db 2 pb_4: times 8 db 4 pb_8: times 8 db 8 pb_128: times 8 db 128 pb_m16: times 8 db -16 pw_128: times 4 dw 128 pw_255: times 4 dw 255 pb_36_m4: times 4 db 36, -4 pb_127_m127: 
times 4 db 127, -127 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) %define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 pshuflw m1, m0, %3 ; extend 8 byte for 2 pos punpcklqdq m1, m1 mova [dstq + %2], m1 %if %1 > 16 mova [dstq + 16 + %2], m1 %endif %if %1 > 32 mova [dstq + 32 + %2], m1 mova [dstq + 48 + %2], m1 %endif %endmacro %macro IPRED_H 1 ; width sub tlq, 4 movd m0, [tlq] ; get 4 bytes of topleft data punpcklbw m0, m0 ; extend 2 byte %if %1 == 4 pshuflw m1, m0, q2233 movd 
[dstq+strideq*0], m1 psrlq m1, 32 movd [dstq+strideq*1], m1 pshuflw m0, m0, q0011 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+stride3q ], m0 %elif %1 == 8 punpcklwd m0, m0 punpckhdq m1, m0, m0 punpckldq m0, m0 movq [dstq+strideq*1], m1 movhps [dstq+strideq*0], m1 movq [dstq+stride3q ], m0 movhps [dstq+strideq*2], m0 %else IPRED_SET %1, 0, q3333 IPRED_SET %1, strideq, q2222 IPRED_SET %1, strideq*2, q1111 IPRED_SET %1, stride3q, q0000 %endif lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET %endmacro INIT_XMM ssse3 cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: IPRED_H 4 .w8: IPRED_H 8 .w16: IPRED_H 16 .w32: IPRED_H 32 .w64: IPRED_H 64 ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movu m0, [tlq+ 1] movu m1, [tlq+17] movu m2, [tlq+33] movu m3, [tlq+49] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, ipred_dc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+20] pcmpeqd m3, m3 psrlw m4, 1 ; dc = (width + height) 
>> 1; add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd m0, [tlq-4] pmaddubsw m0, m3 jmp wq .w4: movd m1, [tlq+1] pmaddubsw m1, m3 psubw m0, m4 paddw m0, m1 pmaddwd m0, m3 cmp hd, 4 jg .w4_mul psrlw m0, 3 ; dc >>= ctz(width + height); jmp .w4_end .w4_mul: punpckhqdq m1, m0, m0 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrlw m0, 2 mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8 cmovz r6d, r2d movd m5, r6d pmulhuw m0, m5 .w4_end: pxor m1, m1 pshufb m0, m1 .s4: movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 movd [dstq+strideq*2], m0 movd [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: movq m0, [tlq-8] pmaddubsw m0, m3 jmp wq .w8: movq m1, [tlq+1] pmaddubsw m1, m3 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w8_end: pxor m1, m1 pshufb m0, m1 .s8: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-16] pmaddubsw m0, m3 jmp wq .w16: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w16_end: pxor m1, m1 pshufb m0, m1 .s16: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m0 movu [dstq+strideq*2], m0 movu [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 mova m2, [tlq-16] pmaddubsw m2, m3 paddw m0, m2 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 movu m2, [tlq+17] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, 
m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x5556 mov r2d, 0x3334 test hd, 64|16 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w32_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 .s32: movu [dstq], m0 movu [dstq+16], m1 movu [dstq+strideq], m0 movu [dstq+strideq+16], m1 movu [dstq+strideq*2], m0 movu [dstq+strideq*2+16], m1 movu [dstq+stride3q], m0 movu [dstq+stride3q+16], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-64] mova m1, [tlq-48] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-32] pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-16] pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+17] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+33] pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+49] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 64 je .w64_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w64_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq], m0 mova [dstq+16], m1 mova [dstq+32], m2 mova [dstq+48], m3 mova [dstq+strideq], m0 mova [dstq+strideq+16], m1 mova [dstq+strideq+32], m2 mova [dstq+strideq+48], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s64 RET ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_ssse3_table mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] movd m3, 
[r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, r6d psrld m3, m2 movsxd r6, [r5+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m1, [tlq+48] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 movu m1, [tlq+32] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h32: movu m1, [tlq+16] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h16: pshufd m1, m0, q3232 ; psrlq m1, m0, 16 paddw m0, m1 .h8: pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 .h4: pmaddwd m0, m2 pmulhrsw m0, m3 lea stride3q, [strideq*3] pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_ssse3_table tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, wd psrld m3, m2 movsxd r6, [r5+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, 
ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ; w * a = (w - 128) * a + 128 * a ; (256 - w) * b = (127 - w) * b + 129 * b ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] pmaddubsw m6, m%3, m%1 pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b paddw m6, m%5 paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] psrlw m6, 8 psrlw m0, 8 packuswb m6, m0 %endmacro cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_ssse3_table LEA r6, ipred_smooth_v_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq movd m5, [tlq+hq] pxor m2, m2 pshufb m5, m2 add wq, r6 jmp wq .w4: movd m2, [tlq+1] punpckldq m2, m2 punpcklbw m2, m5 ; top, bottom lea r3, [strideq*3] mova m4, [base+ipred_v_shuf] mova m5, m4 punpckldq m4, m4 punpckhdq m5, m5 pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 .w4_loop: movu m1, [weightsq+hq*2] pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movd [dstq+strideq*0], m6 pshuflw m1, m6, q1032 movd [dstq+strideq*1], m1 punpckhqdq m6, m6 movd [dstq+strideq*2], m6 psrlq m6, 32 movd [dstq+r3 ], m6 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET ALIGN function_align .w8: movq m2, [tlq+1] punpcklbw m2, m5 mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] pshufd m4, m5, q0000 
pshufd m5, m5, q1111 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 ; m3 is output for loop .w8_loop: movq m1, [weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movq [dstq+strideq*0], m6 movhps [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] add hq, 2 jl .w8_loop RET ALIGN function_align .w16: movu m3, [tlq+1] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 ; m4 and m5 is output for loop .w16_loop: movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add dstq, strideq add hq, 1 jl .w16_loop RET ALIGN function_align .w32: WIN64_PUSH_XMM 8, 7 mova m7, m5 .w32_loop_init: mov r3d, 2 .w32_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w32_loop lea dstq, [dstq-32+strideq] sub tlq, 32 add hq, 1 jl .w32_loop_init RET ALIGN function_align .w64: WIN64_PUSH_XMM 8, 7 mova m7, m5 .w64_loop_init: mov r3d, 4 .w64_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w64_loop lea dstq, [dstq-64+strideq] sub tlq, 64 add hq, 1 jl .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); 
;--------------------------------------------------------------------------------------- cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h %define base r6-ipred_smooth_h_ssse3_table LEA r6, ipred_smooth_h_ssse3_table mov wd, wm movd m3, [tlq+wq] pxor m1, m1 pshufb m3, m1 ; right tzcnt wd, wd mov hd, hm movsxd wq, [r6+wq*4] movddup m4, [base+pb_127_m127] movddup m5, [base+pw_128] add wq, r6 jmp wq .w4: movddup m6, [base+smooth_weights+4*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq lea r3, [strideq*3] .w4_loop: movd m2, [tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq punpckldq m7, m7 .w8_loop: movd m2, [tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m6, [base+smooth_weights+16*2] mova m7, [base+smooth_weights+16*3] sub tlq, 1 sub tlq, hq .w16_loop: pxor m1, m1 movd m2, [tlq+hq] ; left pshufb m2, m1 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw 
m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: sub tlq, 1 sub tlq, hq pxor m6, m6 .w32_loop_init: mov r5, 2 lea r3, [base+smooth_weights+16*4] .w32_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w32_loop lea dstq, [dstq-32+strideq] sub hd, 1 jg .w32_loop_init RET ALIGN function_align .w64: sub tlq, 1 sub tlq, hq pxor m6, m6 .w64_loop_init: mov r5, 4 lea r3, [base+smooth_weights+16*8] .w64_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w64_loop lea dstq, [dstq-64+strideq] sub hd, 1 jg .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav2d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 pmaddubsw m6, m%3, m%1 mova m0, m6 pmaddubsw m6, m%4, m%2 mova m1, m6 %ifnum %5 paddw m0, m%5 %else paddw m0, %5 %endif %ifnum %6 paddw 
m1, m%6 %else paddw m1, %6 %endif %ifnum %7 %else mova m3, %7 %endif pavgw m0, m2 pavgw m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro %macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] mova m1, [rsp+16*%1] ; top punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pmaddubsw m2, m1, m5 mova [rsp+16*%2], m1 paddw m1, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m1 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%3], m2 pmaddubsw m2, m6, m5 mova [rsp+16*%4], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%5], m2 movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m3, m2 pmaddubsw m0, m1, %6 ; weights_hor = &dav2d_sm_weights[width]; pmaddubsw m1, %7 paddw m2, m3, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav2d_sm_weights[height]; mova m7, [rsp+16*%9] pshufb m1, m7 mova [rsp+16*%8], m3 mova m4, [rsp+16*%2] mova m5, [rsp+16*%3] mova m3, [rsp+16*%4] mova m7, [rsp+16*%5] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] mova [dstq], m0 movddup m3, [base+pw_255] ; recovery mova m0, [rsp+16*%10] ; recovery mova m4, [rsp+16*%11] ; recovery mova m5, [rsp+16*%12] ; recovery %endmacro cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_ssse3_table mov wd, wm mov hd, hm LEA r6, ipred_smooth_ssse3_table movd m4, [tlq+wq] ; right pxor m2, m2 pshufb m4, m2 tzcnt wd, wd mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] movddup m5, [base+pb_127_m127] movd m0, [r5] pshufb m0, m2 ; bottom movddup m3, [base+pw_255] add wq, r6 lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav2d_sm_weights[height] jmp wq .w4: mova m7, [base+ipred_v_shuf] movd m1, [tlq+1] ; left pshufd m1, m1, q0000 sub tlq, 4 lea r3, [strideq*3] sub tlq, hq punpcklbw m1, m0 ; top, bottom 
pshufd m6, m7, q1100 pshufd m7, m7, q3322 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*0], m1 mova [rsp+16*1], m2 movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav2d_sm_weights[width]; punpcklqdq m1, m1 mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w4_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] punpcklbw m0, m1, m4 ; left, right punpckhbw m1, m4 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right pmaddubsw m3, m1, m5 paddw m2, m0 ; 128 * left + 129 * right paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movq m1, [v_weightsq] ; weights_ver = &dav2d_sm_weights[height]; add v_weightsq, 8 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m7, [base+ipred_v_shuf] movq m1, [tlq+1] ; left punpcklqdq m1, m1 sub tlq, 4 sub tlq, hq punpcklbw m1, m0 pshufd m6, m7, q0000 pshufd m7, m7, q1111 pmaddubsw m2, m1, m5 paddw m3, m1 paddw m2, m3 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav2d_sm_weights[width]; mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w8_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] pshufd m1, m1, q1100 punpcklbw m0, m1, m4 punpckhbw m1, m4 pmaddubsw m2, m0, m5 pmaddubsw m3, m1, m5 paddw m2, m0 paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav2d_sm_weights[height]; add v_weightsq, 4 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] 
SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m7, [base+ipred_v_shuf] movu m1, [tlq+1] ; left sub tlq, 4 sub tlq, hq punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pshufd m7, m7, q0000 mova [rsp+16*2], m7 pmaddubsw m2, m6, m5 mova [rsp+16*5], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*6], m2 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 mova [rsp+16*0], m1 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*1], m2 mova [rsp+16*3], m4 mova [rsp+16*4], m5 .w16_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m0, m1 mova m3, m2 pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav2d_sm_weights[width]; pmaddubsw m1, [base+smooth_weights+16*3] paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav2d_sm_weights[height]; add v_weightsq, 2 mova m7, [rsp+16*2] pshufb m1, m7 mova [rsp+16*7], m3 mova m4, [rsp+16*0] mova m5, [rsp+16*1] mova m3, [rsp+16*5] mova m7, [rsp+16*6] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] mova m4, [rsp+16*3] mova m5, [rsp+16*4] mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w32_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 
3, 4, 5 lea dstq, [dstq-16+strideq] add v_weightsq, 2 sub hd, 1 jg .w32_loop RET ALIGN function_align .w64: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 movu m1, [tlq+33] ; top movu m2, [tlq+49] ; top mova [rsp+16*11], m1 mova [rsp+16*12], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w64_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 lea dstq, [dstq-48+strideq] add v_weightsq, 2 sub hd, 1 jg .w64_loop RET %if ARCH_X86_64 cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] %else cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define strideq r3 %define stridemp dword [rsp+16*12] mov stridemp, r1 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm inc tlq movsxd wq, [base+ipred_z1_ssse3_table+wq*4] mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) mova m1, [tlq-1] pshufb m0, m1, [base+z_upsample1] pshufb m1, [base+z_upsample2] 
movddup m2, [base+pb_36_m4] add dxd, dxd pmaddubsw m0, m2 pshufd m7, m1, q3333 movd [rsp+16], m7 ; top[max_base_x] pmaddubsw m1, m2 movd m6, dxd mov r5d, dxd ; xpos pshufb m6, [base+pw_256] paddw m1, m0 movq m0, [tlq] pmulhrsw m1, m10 paddw m7, m6, m6 punpcklqdq m6, m7 ; xpos0 xpos1 packuswb m1, m1 punpcklbw m0, m1 movifnidn strideq, stridemp mova [rsp], m0 .w4_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movq m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movhps m0, [rsp+r2] pand m2, m8, m6 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos += dx pmulhrsw m0, m10 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m0, r3d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 mova m3, [tlq-1] imul r5d, 0x55555555 movu m7, [base+z_filter_s+8] shr r5d, 30 ; filter_strength movddup m0, [base+pb_8] pminub m7, m0 pshufb m0, m3, [base+z_filter_s] movddup m4, [base+z_filter_k-8+r5*8+24*0] pshufb m3, m7 movddup m5, [base+z_filter_k-8+r5*8+24*1] shufps m2, m0, m3, q2121 movddup m6, [base+z_filter_k-8+r5*8+24*2] pmaddubsw m0, m4 pmaddubsw m1, m2, m4 pmaddubsw m2, m5 paddd m5, m6 pmaddubsw m4, m3, m5 pmaddubsw m3, m6 paddw m0, m2 paddw m1, m4 paddw m0, m3 pshufd m1, m1, q3333 pmulhrsw m0, m10 pmulhrsw m1, m10 mov r5d, 9 mov tlq, rsp cmp hd, 4 cmovne r3d, r5d packuswb m0, m1 mova [tlq], m0 .w4_main: add tlq, r3 movd m5, dxd movddup m0, [base+z_base_inc] ; base_inc << 6 movd m7, [tlq] ; top[max_base_x] shl r3d, 6 movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd ; xpos pshufb m7, [base+pw_m256] sub 
r5, r3 pshufb m4, [base+pw_256] mova m3, [base+z1_shuf_w4] paddw m6, m5, m5 psubw m4, m0 ; max_base_x punpcklqdq m5, m6 ; xpos0 xpos1 .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3] pand m2, m8, m5 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 pshufb m0, m3 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 movifnidn strideq, stridemp pcmpgtw m1, m4, m5 ; base < max_base_x pmulhrsw m0, m10 paddw m5, m6 ; xpos += dx pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop packuswb m7, m7 .w4_end_loop: movd [dstq+strideq*0], m7 movd [dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 mova m5, [base+z_upsample1] movu m3, [base+z_filter_s+6] movd m4, hd mova m0, [tlq-1] movu m1, [tlq+7] pxor m7, m7 pshufb m4, m7 movddup m7, [base+pb_36_m4] pminub m4, m3 add dxd, dxd pshufb m2, m0, m5 pmaddubsw m2, m7 pshufb m0, m3 pmaddubsw m0, m7 movd m6, dxd pshufb m3, m1, m5 pmaddubsw m3, m7 pshufb m1, m4 pmaddubsw m1, m7 pshufb m6, [base+pw_256] mov r5d, dxd paddw m2, m0 paddw m7, m6, m6 paddw m3, m1 punpcklqdq m6, m7 ; xpos0 xpos1 movu m1, [tlq] pmulhrsw m2, m10 pmulhrsw m3, m10 packuswb m2, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movifnidn strideq, stridemp mova [rsp+16*0], m0 mova [rsp+16*1], m1 .w8_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movu m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movu m1, [rsp+r2] pand m2, m8, m6 psubw m3, m9, m2 psllw m2, 8 por m3, m2 punpcklqdq m2, m3, m3 ; frac0 pmaddubsw m0, m2 punpckhqdq m3, m3 ; frac1 pmaddubsw m1, m3 paddw m6, m7 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, 
[dstq+strideq*2] sub hd, 2 jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m0, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .w8_main ; filter_strength == 0 movd m3, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x55555555 movu m1, [tlq+16*1] shr r5d, 30 ; filter_strength movd m2, [tlq+r3] lea tlq, [rsp+16*4] sub r5, 3 mova [tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m3, m7 pshufb m2, m7 mova [tlq-16*2], m3 movq [tlq+r3-15], m2 call .filter_edge sar r5d, 1 add r5d, 17 cmp hd, 8 cmova r3d, r5d .w8_main: add tlq, r3 movd m5, dxd movd m7, [tlq] shl r3d, 6 movu m3, [base+z_filter_s+2] movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd pshufb m7, [base+pw_m256] sub r5, r3 pshufb m4, [base+pw_256] psubw m4, [base+z_base_inc] mova m6, m5 .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3] pand m1, m8, m5 psubw m2, m9, m1 psllw m1, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [dstq], m0 dec hd jz .w8_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w8_loop packuswb m7, m7 .w8_end_loop: movq [dstq], m7 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: lea r3d, [hq+15] movd m0, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 31) test angled, 0x400 jnz .w16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m0, m2 pcmpgtb m0, m3 pmovmskb r5d, m0 test r5d, r5d jz .w16_main ; filter_strength == 0 movd m4, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x24924924 movu m1, [tlq+16*1] shr r5d, 30 movd m2, [tlq+30] adc r5, -4 ; filter_strength-3 movd m3, [tlq+r3] lea tlq, [rsp+16*4] mova 
[tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m4, m7 movd [rsp], m2 pshufb m3, m7 mova [tlq-16*2], m4 movd [tlq+r3-16], m3 call .filter_edge cmp hd, 16 jle .w16_main pshuflw m0, [rsp], q0000 sar r5, 1 movd m1, [base+z_filter_k_tail+4+r5*4] lea r3d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+32], m0 .w16_main: add tlq, r3 movd m5, dxd movd m7, [tlq] movd m4, r3d shl r3d, 6 pshufb m5, [base+pw_256] pxor m6, m6 pshufb m7, m6 mov r5d, dxd pshufb m4, m6 sub r5, r3 psubb m4, [base+pb_0to15] mova m6, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+0] pand m0, m8, m5 movu m2, [tlq+r3+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m3, m5, 6 packsswb m3, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 paddw m5, m6 pcmpgtb m2, m4, m3 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq], m0 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq], m7 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main movd m6, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] movd m4, [tlq+62] movd m5, [tlq+r3] lea tlq, [rsp+16*6] mova [tlq-16*3], m0 pxor m7, m7 mova [tlq-16*2], m1 pshufb m6, m7 mova [tlq-16*1], m2 xor r5d, r5d ; filter_strength = 3 mova [tlq+16*0], m3 movd [rsp], m4 pshufb m5, m7 mova [tlq-16*4], m6 movd [tlq+r3-48], m5 call .filter_edge sub tlq, 16*2 call .filter_edge cmp hd, 32 jle .w32_main pshuflw m0, [rsp], q0000 movd m1, [base+z_filter_k_tail+4] add r3d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+64], m0 .w32_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, m6 mov r5d, dxd pshufb m0, m6 pshufb m5, 
[base+pw_256] sub r5, r3 pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, [base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 mova m6, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*1], m0 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main movd m4, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] mova [rsp+16*3], m0 pxor m7, m7 mova [rsp+16*4], m1 pshufb m4, m7 mova [rsp+16*5], m2 mova [rsp+16*6], m3 mova [rsp+16*2], m4 movu m0, [tlq+16*4] movu m1, [tlq+16*5] movu m2, [tlq+16*6] movu m3, [tlq+16*7] movd m4, [tlq+r3] lea tlq, [rsp+16*10] mova [tlq-16*3], m0 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*2], m1 pshufb m4, m7 mova [tlq-16*1], m2 mova [tlq+16*0], m3 movd [tlq+r3-16*7], m4 cmp hd, 64 jl .w64_filter96 ; skip one call if the last 32 bytes aren't used call .filter_edge .w64_filter96: sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge .w64_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, m6 mov r5d, dxd pshufb m0, m6 sub r5, r3 pshufb m5, [base+pw_256] pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, 
[base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 paddb m0, m1 mova [rsp+16*2], m0 paddb m0, m1 mova [rsp+16*3], m0 mova m6, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*2+0] movu m2, [tlq+r3+16*2+1] mova [dstq+16*1], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*2], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*3+0] movu m2, [tlq+r3+16*3+1] mova [dstq+16*2], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*3], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*3], m0 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 mova [dstq+16*2], m7 mova [dstq+16*3], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_edge: ; 32 pixels/iteration movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m2, [tlq-18] movu m1, [tlq-17] movu m3, [tlq- 2] movu m4, [tlq- 1] punpcklbw m0, m2, m1 pmaddubsw m0, m7 punpckhbw m2, m1 pmaddubsw m2, m7 punpcklbw m1, m3, m4 pmaddubsw m1, m7 punpckhbw m3, m4 pmaddubsw m3, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m5, [tlq-16] movu m6, [tlq-15] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 
pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 mova m5, [tlq+ 0] movu m6, [tlq+ 1] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 pmaddubsw m5, m7 paddw m1, m4 paddw m3, m5 test r5d, r5d jnz .filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m5, [tlq-14] movu m6, [tlq+ 2] punpcklbw m4, m5, m5 pmaddubsw m4, m7 punpckhbw m5, m5 pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 punpcklbw m5, m6, m6 pmaddubsw m5, m7 punpckhbw m6, m6 pmaddubsw m6, m7 paddw m1, m5 paddw m3, m6 .filter_end: %if ARCH_X86_64 REPX {pmulhrsw x, m10}, m0, m2, m1, m3 %else mova m4, m10 REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 %endif packuswb m0, m2 packuswb m1, m3 mova [tlq+16*0], m0 mova [tlq+16*1], m1 ret %if ARCH_X86_64 cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m lea r7, [$$] mov hd, hm mova m8, [base+pw_62] mova m9, [base+pw_64] lea r9d, [wq-4] mova m10, [base+pw_512] shl r9d, 6 mova m11, [base+z1_shuf_w4] or r9d, hd mova m12, [base+z2_h_shuf] %else cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define m11 [rsp+16*16] %define m12 [rsp+16*17] %define r9b byte [rsp+16*18+4*0] %define r9d dword [rsp+16*18+4*0] %define r10d dword [rsp+16*18+4*1] %define r11d dword [rsp+16*18+4*2] %define maxwm [rsp+16*18+4*3] %define maxhm [rsp+16*19+4*0] %define stridemp [rsp+16*19+4*1] %define strideq r3 %define dyd r4 %define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov maxwm, r1d mov maxhm, r4d LEA r1, $$ lea hd, [wq-4] mova m0, [base+z1_shuf_w4] shl hd, 6 mova m1, [base+z2_h_shuf] or hd, hm mova m11, m0 mov r9d, hd mova m12, m1 %endif tzcnt wd, wd movifnidn angled, anglem movsxd wq, [base+ipred_z2_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif xor angled, 0x400 mova m0, [tlq-16*4] mov dyd, dxd mova m1, [tlq-16*3] neg dxq mova m2, [tlq-16*2] and dyd, 
~1 mova m3, [tlq-16*1] and dxq, ~1 movd m4, [tlq] movu m5, [tlq+16*0+1] movu m6, [tlq+16*1+1] movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle mova [rsp+16*2], m0 pxor m7, m7 mova [rsp+16*3], m1 pshufb m4, m7 mova [rsp+16*4], m2 lea wq, [base+ipred_z2_ssse3_table+wq] mova [rsp+16*5], m3 neg dxd mova [rsp+16*6], m4 or dyd, 4<<16 mova [rsp+16*7], m4 mova [rsp+16*8], m5 mova [rsp+16*9], m6 movq m0, [base+z_base_inc+2] movsldup m1, [base+z2_dy_offset] movq m2, [base+pw_256] ; 4<<6 movq [rsp+16*14+8*0], m0 movq [rsp+16*15+8*0], m1 movq [rsp+16*15+8*1], m2 %if ARCH_X86_64 lea r10d, [dxq+(128<<6)] ; xpos %else mov [rsp+16*7+4*1], dyd lea r4d, [dxq+(128<<6)] mov r10d, r4d movzx hd, r9b %endif mov r11d, (128-4)<<6 jmp wq .w4: test angled, 0x400 jnz .w4_main movd m5, [tlq+4] lea r3d, [hq+2] add angled, 1022 pshufb m5, m7 shl r3d, 6 movd [rsp+16*8+4], m5 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, [base+z_filter_wh4] pand m6, m0 pcmpgtb m6, [base+z_filter_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 movq m3, [rsp+gprsize+16*8-2] movq m1, [rsp+gprsize+16*8-1] movq m0, [rsp+gprsize+16*8+0] movq m4, [rsp+gprsize+16*8+1] movddup m5, [base+pb_36_m4] punpcklbw m1, m3 punpcklbw m2, m0, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 %if ARCH_X86_64 mova m11, [base+pb_0to15] lea r10d, [r10+dxq+(1<<6)] mov r11d, (128-7)<<6 %else mova m3, [base+pb_0to15] mov r3d, [rsp+gprsize+16*18+4*1] mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 lea r3d, [r3+dxq+(1<<6)] mov [rsp+gprsize+16*18+4*1], r3d mova [rsp+gprsize+16*16], m3 %endif add dxd, dxd paddw m1, m2 pmulhrsw m1, m10 movq m2, [rsp+gprsize+16*14] paddw m2, m2 movq [rsp+gprsize+16*14], m2 packuswb m1, m1 punpcklbw m1, m0 
mova [rsp+gprsize+16*8], m1 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp], angled sub angled, 1112 ; angle - 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh4] mova m4, [base+z_filter_t_w48+angleq*8] call .w8_filter_top mov angled, [rsp] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 neg hq movd m0, [tlq+hq] pshufb m0, m7 movd [rsp+16*6+hq-4], m0 movq m3, [rsp+16*5+7] movq m0, [rsp+16*5+8] movq m2, [rsp+16*5+9] movq m4, [rsp+16*5+10] movddup m5, [base+pb_36_m4] punpcklbw m1, m0, m3 punpcklbw m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 movshdup m3, [base+z2_dy_offset] %if ARCH_X86_64 mova m12, [base+z2_upsample] add dyd, dyd %else mova m4, [base+z2_upsample] shl dword [rsp+16*7+4*1], 1 mova m12, m4 %endif paddw m1, m2 pmulhrsw m1, m10 movq [rsp+16*15], m3 packuswb m1, m1 punpcklbw m0, m1 mova [rsp+16*5], m0 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+16*7+4*1] %endif movddup m0, [rsp+16*14+8*0] pshufb m6, [base+pw_256] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+16*15] movq [rsp+16*6+8*1], m3 movq [rsp+8*1], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*12], m6 movq [rsp+8*0], m4 pand m0, m4, m8 psraw m4, 6 psubw m1, m9, m0 psllw m0, 8 por m0, m1 ; 64-frac_y, frac_y movq [rsp+8*3], m0 pabsw m4, m4 movq [rsp+8*2], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movq m0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movhps m0, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps m1, [rsp+r3] pand m2, m8, m6 paddsw m5, m6, m7 psubw m3, m9, m2 psllw m2, 8 pshufb m0, m11 por m2, m3 pmaddubsw m0, m2 pand m2, m8, m5 psubw m3, m9, m2 psllw m2, 8 
pshufb m1, m11 por m2, m3 pmaddubsw m1, m2 cmp r3d, 127 ; topleft jge .w4_toponly movzx r3d, byte [rsp+8*2+0] ; base_y0 movq m3, [rsp+r3] movzx r3d, byte [rsp+8*2+2] ; base_y1 movhps m3, [rsp+r3] movzx r3d, byte [rsp+8*2+4] ; base_y2 movq m4, [rsp+r3] movzx r3d, byte [rsp+8*2+6] ; base_y3 movhps m4, [rsp+r3] pshufb m3, m12 pshufb m4, m12 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*3] pmaddubsw m2, m4 pmaddubsw m3, m4 psraw m6, 15 ; base_x < topleft pand m2, m6 pandn m6, m0 por m0, m2, m6 psraw m6, m5, 15 pand m3, m6 pandn m6, m1 por m1, m3, m6 .w4_toponly: pmulhrsw m0, m10 pmulhrsw m1, m10 movifnidn strideq, stridemp packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 sub hd, 4 jz .w4_end movq m4, [rsp+8*2] movq m3, [rsp+16*6+8*1] paddw m6, m5, m7 ; xpos += dx psubw m4, m3 movq [rsp+8*2], m4 lea dstq, [dstq+strideq*2] cmp r2d, r11d jge .w4_loop movddup m5, [rsp+8*3] .w4_leftonly_loop: movzx r2d, byte [rsp+8*2+0] ; base_y0 movq m1, [rsp+r2] movzx r2d, byte [rsp+8*2+2] ; base_y1 movhps m1, [rsp+r2] movzx r2d, byte [rsp+8*2+4] ; base_y2 movq m2, [rsp+r2] movzx r2d, byte [rsp+8*2+6] ; base_y3 movhps m2, [rsp+r2] psubw m4, m3 pshufb m1, m12 pshufb m2, m12 movq [rsp+8*2], m4 punpckldq m0, m1, m2 punpckhdq m1, m2 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*1] add r5, 4 mov dstq, r5 paddw m4, [rsp+8*0] ; base_y += 4*dy movzx r2d, word [rsp+16*15+8*1] movddup m6, [rsp+16*15+8*1] paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET 
.w8: test angled, 0x400 jnz .w4_main movd m5, [tlq+8] lea r3d, [angleq+126] pshufb m5, m7 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movd [rsp+16*8+8], m5 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filter_wh8] movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 psrldq m2, [base+z_filter_t_w48+angleq*8], 4 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, m1 pand m6, m0 pcmpgtb m6, m2 %if ARCH_X86_64 movq [rsp+16*15+8*1], m10 ; 8<<6 %else movq m0, m10 movq [rsp+16*15+8*1], m0 %endif jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp], angled sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh8] psrldq m4, [base+z_filter_t_w48+angleq*8], 4 call .w8_filter_top mov r3d, [rsp] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x55555555 mov r3, tlq shr r5d, 30 sub r5, 3 ; filter_strength-3 jmp .filter_left .w8_filter_top: movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 pcmpeqb m0, m3 pand m1, m0 pand m6, m0 pcmpgtb m1, m4 pcmpgtb m6, m4 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 movq m0, [rsp+gprsize+16*8-2] shr r5d, 30 movq m1, [rsp+gprsize+16*8-1] sub r5, 3 ; filter_strength-3 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] punpcklbw m0, m1 pmaddubsw m0, m7 movq m1, [rsp+gprsize+16*8+0] movq m2, [rsp+gprsize+16*8+1] movddup m7, [base+z_filter_k+8*2+r5*8+24*1] punpcklbw m1, m2 pmaddubsw m1, m7 movq m2, [rsp+gprsize+16*8+2] movddup m7, [base+z_filter_k+8*2+r5*8+24*2] punpcklbw m2, m2 pmaddubsw m2, m7 paddw m0, m1 paddw m0, m2 %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+gprsize+16*18+4*3] 
%endif pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [rsp+gprsize+16*8], m0 cmp r3d, 8 jge .w8_filter_top_end movq m0, [tlq+r3+1] movq [rsp+gprsize+r3+16*8], m0 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m1, m0 pand m6, m0 pcmpgtb m1, m3 pcmpgtb m6, m3 pmovmskb r5d, m1 mov r3, tlq test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufb m5, [base+z_filter_t_w16] ; tlq[16] shr r5d, 30 adc r5, -4 ; filter_strength-3 movd [rsp+16*9], m5 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m1, [rsp+16*8-2] movu m2, [rsp+16*8-1] punpcklbw m0, m1, m2 pmaddubsw m0, m7 punpckhbw m1, m2 pmaddubsw m1, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m3, [rsp+16*8+0] movu m4, [rsp+16*8+1] punpcklbw m2, m3, m4 pmaddubsw m2, m7 punpckhbw m3, m4 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 test r5d, r5d jnz .w16_filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m3, [rsp+16*8+2] punpcklbw m2, m3, m3 pmaddubsw m2, m7 punpckhbw m3, m3 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 .w16_filter_end: mov r2d, maxwm pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*8], m0 cmp r2d, 16 jge .w16_filter_left movu m0, [r3+r2+1] movu [rsp+r2+16*8], m0 .w16_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x24924924 shr r5d, 30 adc r5, -4 ; filter_strength-3 jmp .filter_left .w32: test angled, 0x400 jnz .w4_main pshufb m6, [base+z_filter_t_w16] ; tlq[32] mov r3, tlq lea tlq, [rsp+16*9] movd [tlq+16*1], m6 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mov r2d, maxwm mova [rsp+16*8], m0 mova [rsp+16*9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, 
[r3+r2+16*1+1] movu [rsp+r2+16*8], m0 movu [rsp+r2+16*9], m1 jmp .filter_left .w64: movu m0, [tlq+16*2+1] movu m1, [tlq+16*3+1] mova [rsp+16*10], m0 mova [rsp+16*11], m1 test angled, 0x400 jnz .w4_main pshufb m1, [base+z_filter_t_w16] ; tlq[64] mov r3, tlq lea tlq, [rsp+16*11] movd [tlq+16*1], m1 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova m2, [tlq+16*2] mova m3, [tlq+16*3] mov r2d, maxwm mova [rsp+16* 8], m0 mova [rsp+16* 9], m1 mova [rsp+16*10], m2 mova [rsp+16*11], m3 cmp r2d, 64 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, [r3+r2+16*1+1] movu [rsp+r2+16* 8], m0 movu [rsp+r2+16* 9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*2+1] movu m1, [r3+r2+16*3+1] movu [rsp+r2+16*10], m0 movu [rsp+r2+16*11], m1 .filter_left: neg hq movd m0, [r3+hq] pxor m1, m1 pshufb m0, m1 movd [rsp+16*6+hq-4], m0 lea tlq, [rsp+16*5] call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge cmp hd, -32 jge .filter_left_end sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova [rsp+16*2], m0 mova [rsp+16*3], m1 .filter_left_end: mov r2d, maxhm mova m0, [rsp+16*5] mova m1, [rsp+16*6] mova m2, [rsp+16*7] neg r2 mova [rsp+16*4], m0 mova [rsp+16*5], m1 mova [rsp+16*6], m2 cmp r2d, hd jle .w4_main movu m0, [r3+r2-16*2] movu m1, [r3+r2-16*1] movu [rsp+r2+16*4], m0 movu [rsp+r2+16*5], m1 cmp r2d, -32 jle .w4_main movu m0, [r3+r2-16*4] movu m1, [r3+r2-16*3] movu [rsp+r2+16*2], m0 movu [rsp+r2+16*3], m1 jmp .w4_main %if ARCH_X86_64 cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] mov org_wd, wd %else cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define m8 
[base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define org_wd r5 %define org_wq r5 mov [dstq+strideq*0], strideq mov [dstq+strideq*1], wd LEA r1, $$ %endif tzcnt hd, hm movifnidn angled, anglem dec tlq movsxd hq, [base+ipred_z3_ssse3_table+hq*4] sub angled, 180 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e lea hq, [base+ipred_z3_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) movu m3, [tlq-7] movu m1, [base+z_upsample1-4] movu m4, [base+z_filter_s+2] pshufb m0, m3, m1 pxor m1, m1 pshufb m2, m3, m1 pshufb m1, m3, m4 mova [rsp+16], m2 ; top[max_base_y] movddup m2, [base+pb_36_m4] add dyd, dyd pmaddubsw m0, m2 pmaddubsw m1, m2 movd m5, dyd mov r5d, dyd pshufb m5, [base+pw_256] paddw m0, m1 pmulhrsw m0, m10 shl wd, 2 mov tlq, rsp sub rsp, wq packuswb m0, m0 punpcklbw m0, m3 paddw m6, m5, m5 punpcklqdq m5, m6 pshufb m0, [base+pb_15to0] mova [tlq], m0 .h4_upsample_loop: lea r4d, [r5+dyq] shr r5d, 6 movq m0, [tlq+r5] lea r5d, [r4+dyq] shr r4d, 6 movhps m0, [tlq+r4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m5, m6 pmulhrsw m0, m10 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jg .h4_upsample_loop jmp .h4_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m0, r4d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 movu m2, [tlq-7] imul r5d, 0x55555555 movu m3, [base+z_filter_s-2] shr r5d, 30 ; filter_strength mova m4, [base+z_upsample2] movddup m5, [base+z_filter_k-8+r5*8+24*0] movddup m6, [base+z_filter_k-8+r5*8+24*1] movddup m7, 
[base+z_filter_k-8+r5*8+24*2] pshufb m0, m2, m3 shufps m3, m4, q2121 pmaddubsw m1, m0, m5 pmaddubsw m0, m6 pshufb m5, m2, m3 pmaddubsw m3, m5, m6 pmaddubsw m5, m7 pshufb m2, m4 pmaddubsw m2, m7 paddw m0, m1 paddw m1, m3 paddw m0, m5 paddw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 lea r2d, [r4+2] cmp wd, 4 cmovne r4d, r2d pshufd m0, m0, q0000 lea tlq, [rsp+15] packuswb m0, m1 mova [rsp], m0 .h4_main: movd m5, dyd movddup m0, [base+z_base_inc] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf_h4] lea r5, [dyq+r4+63] ; ypos pshufb m4, [base+pw_256] psubw m4, m0 ; max_base_y shl wd, 2 paddw m6, m5, m5 sub rsp, wq punpcklqdq m5, m6 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movq m0, [tlq+r5-4] lea r5, [r4+dyq] sar r4, 6 movhps m0, [tlq+r4-4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h4_transpose test r5d, r5d jg .h4_loop packuswb m7, m7 .h4_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h4_end_loop .h4_transpose: mova m1, [base+z_transpose4] %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif lea r2, [strideq*3] lea dstq, [dstq+org_wq-4] .h4_transpose_loop: mova m0, [rsp] add rsp, 16 pshufb m0, m1 movd [dstq+strideq*0], m0 pshuflw m2, m0, q1032 movd [dstq+strideq*1], m2 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r2 ], m0 sub dstq, 4 sub org_wd, 4 jg .h4_transpose_loop RET .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m4, [tlq-15] and r4d, 4 movu m3, [tlq- 9] movd m1, r4d movu m2, [base+z_filter_s+2] pxor m0, m0 movu m5, [base+z_filter_s+6] movddup m7, [base+pb_36_m4] pshufb m1, m0 ; w & 4 movu m0, [base+z_upsample1-4] pmaxub m1, m0 ; clip 4x8 add dyd, dyd pshufb m0, 
m4, m1 pmaddubsw m0, m7 pshufb m1, m4, m2 pmaddubsw m1, m7 pshufb m2, m3, [base+z_upsample1] pmaddubsw m2, m7 pshufb m3, m5 pmaddubsw m3, m7 movd m5, dyd neg dyq paddw m1, m0 paddw m2, m3 pmulhrsw m1, m10 pmulhrsw m2, m10 shl wd, 3 lea tlq, [rsp+16] pshufb m5, [base+pw_256] sub rsp, wq packuswb m1, m2 lea r5, [dyq+63] punpcklbw m0, m1, m4 punpckhbw m1, m4 mova [tlq-16*1], m0 mova [tlq-16*0], m1 paddw m6, m5, m5 punpcklqdq m5, m6 .h8_upsample_loop: lea r4, [r5+dyq] sar r5, 6 movu m0, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 movu m1, [tlq+r4] pand m3, m8, m5 psubw m2, m9, m3 psllw m2, 8 por m3, m2 pshufd m2, m3, q1010 pmaddubsw m0, m2 punpckhqdq m3, m3 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m1, m0 mova [rsp+wq-16], m1 sub wd, 16 jg .h8_upsample_loop jmp .h8_transpose .h8_no_upsample: lea r4d, [wq+7] movd m0, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h8_main ; filter_strength == 0 mova m0, [tlq-15] imul r5d, 0x55555555 movd m1, [tlq+1] neg r4 movd m2, [tlq+r4] shr r5d, 30 pxor m7, m7 lea tlq, [rsp+16*2] sub r5, 3 ; filter_strength-3 mova [tlq+16*0], m0 pshufb m1, m7 mova [tlq+16*1], m1 pshufb m2, m7 movq [tlq+r4+8], m2 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sar r5d, 1 add tlq, 31 add r5d, 17 cmp wd, 8 cmova r4d, r5d .h8_main: movd m5, dyd sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, [base+pw_256] psubw m4, [base+z3_base_inc] shl wd, 3 mova m6, m5 sub rsp, wq .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4-8] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 
pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h8_transpose add r5, dyq jg .h8_loop packuswb m7, m7 .h8_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h8_end_loop .h8_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 8 cmp org_wd, 4 %if ARCH_X86_64 jne .end_transpose_main %else jne .end_transpose_loop %endif mova m1, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*2 punpcklbw m2, m1, m0 punpckhbw m1, m0 punpckhbw m0, m1, m2 punpcklbw m1, m2 .write_4x8_end: call .write_4x8 RET .write_4x8: movd [dstq+r2 ], m0 pshuflw m4, m0, q1032 movd [dstq+strideq*2], m4 punpckhqdq m0, m0 movd [dstq+strideq*1], m0 psrlq m0, 32 movd [dstq+strideq*0], m0 lea dstq, [dstq+strideq*4] movd [dstq+r2 ], m1 pshuflw m4, m1, q1032 movd [dstq+strideq*2], m4 punpckhqdq m1, m1 movd [dstq+strideq*1], m1 psrlq m1, 32 movd [dstq+strideq*0], m1 ret .h16: lea r4d, [wq+15] movd m0, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m1, m0, [base+z_filter_wh16] pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 mova m0, [tlq-16*2+1] imul r5d, 0x24924924 mova m1, [tlq-16*1+1] neg r4 movd m2, [tlq-16*0+1] shr r5d, 30 movd m3, [tlq+r4] adc r5, -4 ; filter_strength-3 pxor m7, m7 lea tlq, [rsp+16*2] mova [tlq-16*1], m0 pshufb m2, m7 mova [tlq+16*0], m1 pshufb m3, m7 mova [tlq+16*1], m2 movq [tlq+r4+8], m3 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 31 cmp wd, 16 jle .h16_main pshuflw m0, [tlq-47], q0000 sar r5, 1 movq m1, [base+z3_filter_k_tail+r5*4] lea r4d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq-35], m0 .h16_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb 
m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] shl wd, 4 mova m6, m5 sub rsp, wq .h16_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*2] por m2, m1 movu m1, [tlq+r4-8*1] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 psrlw m2, m5, 6 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 pand m0, m1 pandn m1, m7 por m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16], m7 sub wd, 16 jg .h16_end_loop .h16_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 16 cmp org_wd, 4 %if ARCH_X86_64 jne .end_transpose_main %else jne .end_transpose_loop %endif .h16_transpose_w4: mova m2, [rsp+16*3] mova m4, [rsp+16*2] mova m3, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*4 punpckhbw m1, m2, m4 punpcklbw m2, m4 punpckhbw m4, m3, m0 punpcklbw m3, m0 punpckhwd m0, m1, m4 punpcklwd m1, m4 call .write_4x8 lea dstq, [dstq+strideq*4] punpckhwd m0, m2, m3 punpcklwd m1, m2, m3 jmp .write_4x8_end .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*4] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 63 cmp wd, 32 jle .h32_main pshuflw m0, [tlq-79], q0000 movq m1, [base+z3_filter_k_tail] add r4d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif 
packuswb m0, m0 movd [tlq-67], m0 .h32_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] mova m6, m5 .h32_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*4] por m2, m1 movu m1, [tlq+r4-8*3] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 32 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32 jmp .end_transpose_main .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mova m0, [tlq-16*8+1] mova m1, [tlq-16*7+1] mova m2, [tlq-16*6+1] mova m3, [tlq-16*5+1] mova [rsp+16*1], m0 mova [rsp+16*2], m1 mova [rsp+16*3], m2 mova [rsp+16*4], m3 mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*8] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 cmp wd, 64 jl .h64_filter96 ; skip 
one call if the last 32 bytes aren't used call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge .h64_filter96: add tlq, 127 .h64_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] mova m6, m5 .h64_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*8] por m2, m1 movu m1, [tlq+r4-8*7] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 64 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*6] movu m1, [tlq+r4-8*5] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*1], m0 movu m0, [tlq+r4-8*4] movu m1, [tlq+r4-8*3] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*2], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*3], m0 pand m0, m1, [rsp+16*2] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*2], m0 pand m0, m1, [rsp+16*1] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+16*3], m7 mova [rsp+16*2], m7 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h64_end_loop .h64_transpose: or r3d, 64 .end_transpose_main: %if ARCH_X86_64 lea r5, [r3*3] lea r7, [strideq*3] %else mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif .end_transpose_loop: lea r4, [rsp+r3-8] lea r6, 
[dstq+org_wq-8]
; Inner loop of the transpose that closes the preceding function: loads eight
; 8-byte rows spaced r3 bytes apart starting at r4, transposes them as an 8x8
; byte tile and stores eight 8-byte rows to r6 with the destination stride.
; r4 walks downwards by 8 bytes until it drops below rsp.
.end_transpose_loop_y:
    movq        m0, [r4+r3*1]
    movq        m4, [r4+r3*0]
%if ARCH_X86_64
    movq        m1, [r4+r5 ]
    movq        m5, [r4+r3*2]
    lea         r2, [r4+r3*4]
%else
    lea         r2, [r4+r3*2]
    movq        m1, [r2+r3*1]
    movq        m5, [r2+r3*0]
    lea         r2, [r2+r3*2]
%endif
    movq        m2, [r2+r3*1]
    movq        m6, [r2+r3*0]
%if ARCH_X86_64
    movq        m3, [r2+r5 ]
    movq        m7, [r2+r3*2]
%else
    lea         r2, [r2+r3*2]
    movq        m3, [r2+r3*1]
    movq        m7, [r2+r3*0]
%endif
    sub         r4, 8
    punpcklbw   m0, m4
    punpcklbw   m1, m5
    punpcklbw   m2, m6
    punpcklbw   m3, m7
    punpckhwd   m4, m1, m0
    punpcklwd   m1, m0
    punpckhwd   m0, m3, m2
    punpcklwd   m3, m2
    punpckhdq   m2, m3, m1
    punpckldq   m3, m1
    punpckldq   m1, m0, m4
    punpckhdq   m0, m4
    movhps      [r6+strideq*0], m0
    movq        [r6+strideq*1], m0
%if ARCH_X86_64
    movhps      [r6+strideq*2], m1
    movq        [r6+r7 ], m1
    lea         r6, [r6+strideq*4]
%else
    lea         r6, [r6+strideq*2]
    movhps      [r6+strideq*0], m1
    movq        [r6+strideq*1], m1
    lea         r6, [r6+strideq*2]
%endif
    movhps      [r6+strideq*0], m2
    movq        [r6+strideq*1], m2
%if ARCH_X86_64
    movhps      [r6+strideq*2], m3
    movq        [r6+r7 ], m3
    lea         r6, [r6+strideq*4]
%else
    lea         r6, [r6+strideq*2]
    movhps      [r6+strideq*0], m3
    movq        [r6+strideq*1], m3
    lea         r6, [r6+strideq*2]
%endif
    cmp         r4, rsp
    jae .end_transpose_loop_y
    lea         rsp, [rsp+r3*8]
    sub         org_wd, 8
    jg .end_transpose_loop
    RET

;-------------------------------------------------------------------------------
;int dav2d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                         const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
; Palette prediction: expands packed palette indices into pixels.
;   pal - 8 palette bytes are loaded (movq) and kept in m4 as a pshufb table.
;   idx - packed indices, two 4-bit indices per byte; the low nibble produces
;         the even output pixel and the high nibble the following odd pixel.
;   w   - selects the loop through pal_pred_ssse3_table[ctz(w)].
;   h   - row count; each loop subtracts the number of rows it writes.
; NOTE(review): pshufb zeroes a lane whose selector has bit 7 set, and both the
; raw index byte and the word-shifted copy are used as selectors, so the loops
; rely on every nibble being < 8 - consistent with the 8-byte palette load, but
; confirm against the caller.
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    movq        m4, [palq]
    LEA         r2, pal_pred_ssse3_table
    tzcnt       wd, wm
    movifnidn   hd, hm
    movsxd      wq, [r2+wq*4]
    add         wq, r2
    lea         r2, [strideq*3]         ; r2 is reused as stride*3 once the table base is consumed
    jmp         wq
; 8 index bytes -> 4 rows of 4 pixels
.w4:
    movq        m1, [idxq]
    add         idxq, 8
    psrlw       m0, m1, 4
    punpcklbw   m1, m0                  ; interleave low-nibble / high-nibble selectors
    pshufb      m0, m4, m1
    movd        [dstq+strideq*0], m0
    pshuflw     m1, m0, q1032
    movd        [dstq+strideq*1], m1
    punpckhqdq  m0, m0
    movd        [dstq+strideq*2], m0
    psrlq       m0, 32
    movd        [dstq+r2 ], m0
    lea         dstq, [dstq+strideq*4]
    sub         hd, 4
    jg .w4
    RET
; 16 index bytes -> 4 rows of 8 pixels
.w8:
    movu        m0, [idxq]
    add         idxq, 16
    pshufb      m1, m4, m0
    psrlw       m0, 4
    pshufb      m2, m4, m0
    punpcklbw   m0, m1, m2
    punpckhbw   m1, m2
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    movq        [dstq+strideq*2], m1
    movhps      [dstq+r2 ], m1
    lea         dstq, [dstq+strideq*4]
    sub         hd, 4
    jg .w8
    RET
; 16 index bytes -> 2 rows of 16 pixels
.w16:
    movu        m0, [idxq]
    add         idxq, 16
    pshufb      m1, m4, m0
    psrlw       m0, 4
    pshufb      m2, m4, m0
    punpcklbw   m0, m1, m2
    punpckhbw   m1, m2
    movu        [dstq+strideq*0], m0
    movu        [dstq+strideq*1], m1
    lea         dstq, [dstq+strideq*2]
    sub         hd, 2
    jg .w16
    RET
; 16 index bytes -> 1 row of 32 pixels
.w32:
    movu        m0, [idxq]
    add         idxq, 16
    pshufb      m1, m4, m0
    psrlw       m0, 4
    pshufb      m2, m4, m0
    punpcklbw   m0, m1, m2
    punpckhbw   m1, m2
    movu        [dstq+16*0], m0
    movu        [dstq+16*1], m1
    add         dstq, strideq
    dec         hd
    jg .w32
    RET
; 32 index bytes -> 1 row of 64 pixels.
; NOTE(review): this path stores with mova, so dst rows are presumably 16-byte
; aligned for w == 64 - confirm against the caller.
.w64:
    movu        m0, [idxq+16*0]
    movu        m2, [idxq+16*1]
    add         idxq, 32
    pshufb      m1, m4, m0
    psrlw       m0, 4
    pshufb      m3, m4, m0
    punpcklbw   m0, m1, m3
    punpckhbw   m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    pshufb      m1, m4, m2
    psrlw       m2, 4
    pshufb      m3, m4, m2
    punpcklbw   m0, m1, m3
    punpckhbw   m1, m3
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    add         dstq, strideq
    sub         hd, 1
    jg .w64
    RET

;---------------------------------------------------------------------------------------
;void dav2d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; IPRED_CFL n: scales the eight int16 ac values in m<n> and adds m0.
;   m3  <- ac with the sign of m1 applied (clobbered)
;   m<n> <- sign(m3) * pmulhrsw(|ac|, m2) + m0
; The .s4/.s8/.s16/.s32 loops of ipred_cfl set m1 to the broadcast alpha,
; m2 to |alpha| << 9 and leave the broadcast dc in m0 before using this macro.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw      m3, m%1, m1
    pabsw       m%1, m%1
    pmulhrsw    m%1, m2
    psignw      m%1, m3
    paddw       m%1, m0
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

; Entry: dispatches twice through ipred_cfl_ssse3_table. r6 gets the .hN
; entry selected by ctz(h) (sums the left edge) and wq gets the .wN entry
; selected by ctz(w), read 16 bytes further into the same table (adds the top
; edge and finishes the average).
;   m3 = all ones (-1 per byte/word, used as pmaddubsw/pmaddwd multiplier)
;   m4 = (w + h) >> 1, the rounding bias
;   m5 = ctz(w + h), the shift count
cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    movifnidn   wd, wm
    movifnidn   hd, hm
    tzcnt       r6d, hd
    lea         t0d, [wq+hq]
    movd        m4, t0d
    tzcnt       t0d, t0d
    movd        m5, t0d
    LEA         t0, ipred_cfl_ssse3_table
    tzcnt       wd, wd
    movsxd      r6, [t0+r6*4]
    movsxd      wq, [t0+wq*4+16]
    pcmpeqd     m3, m3
    psrlw       m4, 1
    add         r6, t0
    add         wq, t0
    movifnidn   acq, acmp
    jmp         r6
.h4:
    movd        m0, [tlq-4]
    pmaddubsw   m0, m3                  ; negated pairwise sums of the left pixels
    jmp         wq
.w4:
    movd        m1, [tlq+1]
    pmaddubsw   m1, m3
    psubw       m0, m4
    paddw       m0, m1
pmaddwd m0, m3
; Continuation of ipred_cfl .w4: m0 now holds left + top + rounding bias.
; When w == h the sum is divided by a plain shift. Otherwise (w + h) is
; 2^k * 3 or 2^k * 5: the power of two is shifted out and pmulhuw by 0x5556
; (~2^16/3) or 0x3334 (~2^16/5) divides by the remaining odd factor.
    cmp         hd, 4
    jg .w4_mul
    psrlw       m0, 3                   ; dc >>= ctz(width + height);
    jmp .w4_end
.w4_mul:
    punpckhqdq  m1, m0, m0
    paddw       m0, m1
    pshuflw     m1, m0, q1032           ; psrlq m1, m0, 32
    paddw       m0, m1
    psrlw       m0, 2
    mov         r6d, 0x5556
    mov         r2d, 0x3334
    test        hd, 8
    cmovz       r6d, r2d
    movd        m5, r6d
    pmulhuw     m0, m5
.w4_end:
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0                  ; broadcast dc to all words
; Splat loops (.s4/.s8/.s16/.s32): m0 = dc, m1 = alpha, m2 = |alpha| << 9.
; Each iteration converts int16 ac values with IPRED_CFL and stores the
; unsigned-saturated bytes. These labels are also the jump targets of
; ipred_cfl_left/top/128 through ipred_cfl_splat_ssse3_table.
.s4:
    movd        m1, alpham
    pshuflw     m1, m1, q0000
    punpcklqdq  m1, m1
    lea         r6, [strideq*3]
    pabsw       m2, m1
    psllw       m2, 9
.s4_loop:
    mova        m4, [acq]
    mova        m5, [acq+16]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    movd        [dstq+strideq*0], m4
    pshuflw     m4, m4, q1032
    movd        [dstq+strideq*1], m4
    punpckhqdq  m4, m4
    movd        [dstq+strideq*2], m4
    psrlq       m4, 32
    movd        [dstq+r6 ], m4
    lea         dstq, [dstq+strideq*4]
    add         acq, 32
    sub         hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    movq        m0, [tlq-8]
    pmaddubsw   m0, m3
    jmp         wq
.w8:
    movq        m1, [tlq+1]
    pmaddubsw   m1, m3
    psubw       m4, m0
    punpckhqdq  m0, m0
    psubw       m0, m4
    paddw       m0, m1
    pshuflw     m1, m0, q1032           ; psrlq m1, m0, 32
    paddw       m0, m1
    pmaddwd     m0, m3
    psrlw       m0, m5
    cmp         hd, 8
    je .w8_end
    mov         r6d, 0x5556
    mov         r2d, 0x3334
    cmp         hd, 32                  ; 8 + 32 = 8 * 5, every other height gives 2^k * 3
    cmovz       r6d, r2d
    movd        m1, r6d
    pmulhuw     m0, m1
.w8_end:
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.s8:
    movd        m1, alpham
    pshuflw     m1, m1, q0000
    punpcklqdq  m1, m1
    lea         r6, [strideq*3]
    pabsw       m2, m1
    psllw       m2, 9
.s8_loop:
    mova        m4, [acq]
    mova        m5, [acq+16]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    movq        [dstq ], m4
    movhps      [dstq+strideq ], m4
    mova        m4, [acq+32]
    mova        m5, [acq+48]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    movq        [dstq+strideq*2], m4
    movhps      [dstq+r6 ], m4
    lea         dstq, [dstq+strideq*4]
    add         acq, 64
    sub         hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova        m0, [tlq-16]
    pmaddubsw   m0, m3
    jmp         wq
.w16:
    movu        m1, [tlq+1]
    pmaddubsw   m1, m3
    paddw       m0, m1
    psubw       m4, m0
    punpckhqdq  m0, m0
    psubw       m0, m4
    pshuflw     m1, m0, q1032           ; psrlq m1, m0, 32
    paddw       m0, m1
    pmaddwd     m0, m3
    psrlw       m0, m5
    cmp         hd, 16
    je .w16_end
    mov         r6d, 0x5556
    mov         r2d, 0x3334
    test        hd, 8|32                ; zero for h = 4 or 64, where w + h = 2^k * 5
    cmovz       r6d, r2d
    movd        m1, r6d
    pmulhuw     m0, m1
.w16_end:
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.s16:
    movd        m1, alpham
    pshuflw     m1, m1, q0000
    punpcklqdq  m1, m1
    pabsw       m2, m1
    psllw       m2, 9
.s16_loop:
    mova        m4, [acq]
    mova        m5, [acq+16]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    mova        [dstq], m4
    mova        m4, [acq+32]
    mova        m5, [acq+48]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    mova        [dstq+strideq], m4
    lea         dstq, [dstq+strideq*2]
    add         acq, 64
    sub         hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova        m0, [tlq-32]
    pmaddubsw   m0, m3
    mova        m2, [tlq-16]
    pmaddubsw   m2, m3
    paddw       m0, m2
    jmp         wq
.w32:
    movu        m1, [tlq+1]
    pmaddubsw   m1, m3
    movu        m2, [tlq+17]
    pmaddubsw   m2, m3
    paddw       m1, m2
    paddw       m0, m1
    psubw       m4, m0
    punpckhqdq  m0, m0
    psubw       m0, m4
    pshuflw     m1, m0, q1032           ; psrlq m1, m0, 32
    paddw       m0, m1
    pmaddwd     m0, m3
    psrlw       m0, m5
    cmp         hd, 32
    je .w32_end
    mov         r6d, 0x5556
    mov         r2d, 0x3334
    test        hd, 64|16               ; zero for h = 8, where w + h = 8 * 5
    cmovz       r6d, r2d
    movd        m1, r6d
    pmulhuw     m0, m1
.w32_end:
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
.s32:
    movd        m1, alpham
    pshuflw     m1, m1, q0000
    punpcklqdq  m1, m1
    pabsw       m2, m1
    psllw       m2, 9
.s32_loop:
    mova        m4, [acq]
    mova        m5, [acq+16]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    mova        [dstq], m4
    mova        m4, [acq+32]
    mova        m5, [acq+48]
    IPRED_CFL   4
    IPRED_CFL   5
    packuswb    m4, m5
    mova        [dstq+16], m4
    add         dstq, strideq
    add         acq, 64
    dec         hd
    jg .s32_loop
    RET

;---------------------------------------------------------------------------------------
;void dav2d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; CfL with the dc taken from the left edge only: averages the h pixels that
; start at tl - h, then jumps to the ipred_cfl splat loop selected by ctz(w).
;   m2 = all ones (multiplier -1 for pmaddubsw / pmaddwd)
;   m3 = 0x8000 >> ctz(h), so the final pmulhrsw is a rounded divide by h
; The .hN labels form a fall-through reduction chain entered at the label
; matching the number of edge pixels; ipred_cfl_top enters the same chain.
cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    mov         hd, hm                  ; zero upper half
    tzcnt       r6d, hd
    sub         tlq, hq
    tzcnt       wd, wm
    movu        m0, [tlq]
    mov         t0d, 0x8000
    movd        m3, t0d
    movd        m2, r6d
    psrld       m3, m2
    LEA         t0, ipred_cfl_left_ssse3_table
    movsxd      r6, [t0+r6*4]
    pcmpeqd     m2, m2
    pmaddubsw   m0, m2
    add         r6, t0
    add         t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd      wq, [t0+wq*4]
    add         wq, t0
    movifnidn   acq, acmp
    jmp         r6
.h32:
    movu        m1, [tlq+16]            ; unaligned when jumping here from dc_top
    pmaddubsw   m1, m2
    paddw       m0, m1
.h16:
    pshufd      m1, m0, q3232           ; psrldq m1, m0, 8
    paddw       m0, m1
.h8:
    pshuflw     m1, m0, q1032           ; psrlq m1, m0, 32
    paddw       m0, m1
.h4:
    pmaddwd     m0, m2                  ; final pair sum, negated back to positive
    pmulhrsw    m0, m3                  ; (sum + n/2) / n
    pshuflw     m0, m0, q0000
    punpcklqdq  m0, m0
    jmp         wq

;---------------------------------------------------------------------------------------
;void dav2d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; CfL with the dc taken from the top edge only: averages the w pixels that
; start at tl + 1. The reduction reuses the .hN chain of ipred_cfl_left,
; indexed by ctz(w) instead of ctz(h); m3 = 0x8000 >> ctz(w).
cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    LEA         t0, ipred_cfl_left_ssse3_table
    tzcnt       wd, wm
    inc         tlq
    movu        m0, [tlq]
    movifnidn   hd, hm
    mov         r6d, 0x8000
    movd        m3, r6d
    movd        m2, wd
    psrld       m3, m2
    movsxd      r6, [t0+wq*4]
    pcmpeqd     m2, m2
    pmaddubsw   m0, m2
    add         r6, t0
    add         t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table
    movsxd      wq, [t0+wq*4]
    add         wq, t0
    movifnidn   acq, acmp
    jmp         r6

;---------------------------------------------------------------------------------------
;void dav2d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
;                           const int width, const int height, const int16_t *ac, const int alpha);
;---------------------------------------------------------------------------------------
; CfL with a constant dc: loads pw_128 into m0 and jumps straight to the
; ipred_cfl splat loop selected by ctz(w). No edge pixels are read.
cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
    tzcnt       wd, wm
    movifnidn   hd, hm
    LEA         r6, ipred_cfl_splat_ssse3_table
    movsxd      wq, [r6+wq*4]
    movddup     m0, [r6-ipred_cfl_splat_ssse3_table+pw_128]
    add         wq, r6
    movifnidn   acq, acmp
    jmp         wq

; Reloads acq from ac_bakq. The macro argument is not referenced in the body.
%macro RELOAD_ACQ_32 1
    mov         acq, ac_bakq            ; restore acq
%endmacro

%if ARCH_X86_64
cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak
DECLARE_REG_TMP 7
    movddup     m2, [pb_2]
%else
cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h
DECLARE_REG_TMP 4
%define ac_bakq acmp
    mov         t0d, 0x02020202
    movd        m2, t0d
    pshufd      m2, m2, q0000
%endif
    movifnidn
wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m5, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m0, [yq] movq m1, [yq+strideq] movhps m0, [yq+strideq*2] movhps m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .calc_avg_4_8 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4_8 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m0, [yq+strideq*2] mova m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_loop test hpadd, hpadd jz .calc_avg_4_8 jmp .w8_hpad .w8_wpad: ; wpadd=1 movddup m0, [yq] movddup m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 16 sub hd, 1 jg .w8_wpad test hpadd, hpadd jz .calc_avg_4_8 .w8_hpad: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 1 jg .w8_hpad jmp .calc_avg_4_8 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m6, [yq+16] mova m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_loop test hpadd, hpadd jz .calc_avg16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m0, [yq] movddup m1, [yq+strideq] 
pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 mova m6, m0 punpckhqdq m6, m0, m0 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 pshufhw m6, m0, q3333 punpckhqdq m6, m6 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 movddup m6, [yq+16] movddup m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 pshufhw m6, m6, q3333 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg16 .w16_hpad_loop: mova [acq], m0 paddw m4, m0 mova [acq+16], m6 paddw m4, m6 add acq, 32 dec hpadd jg .w16_hpad_loop jmp .calc_avg16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4_8: psrlw m2, 9 pmaddwd m4, m2 jmp .calc_avg .calc_avg16: psrld m0, m4, 16 pslld m4, 16 psrld m4, 16 paddd m4, m0 .calc_avg: movd szd, m5 psrad m5, 1 tzcnt r1d, szd paddd m4, m5 movd m1, r1d pshufd m0, m4, q2301 paddd m0, m4 pshufd m4, m0, q1032 paddd m0, m4 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq .sub_loop: mova m1, [acq] psubw m1, m0 ; ac[x] -= sum; mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 
pxor m5, m5 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m1, [yq] movhps m1, [yq+strideq] movq m0, [yq+strideq*2] movhps m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m1, [yq] mova m0, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 mova m1, [yq+strideq*2] mova m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movddup m1, [yq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movddup m0, [yq+strideq] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m4, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m1, [yq] mova m0, [yq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 mova m1, [yq+strideq] mova m0, [yq+strideq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m1, [yq] pmaddubsw m1, m2 
pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movddup m1, [yq+strideq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movddup m0, [yq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m5, m0 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movddup m0, [yq+strideq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop jmp .calc_avg_8_16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 pmaddwd m0, m4, m2 jmp .calc_avg .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 .calc_avg: paddd m5, m0 movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 
mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h %define ac_bakq [rsp+16*4] mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm movifnidn hpadd, hpadm movd m0, hpadd mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movd hpadd, m0 mov ac_bakq, acq shl hpadd, 2 sub hd, hpadd pxor m5, m5 pxor m4, m4 cmp wd, 16 jg .w32 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movd m1, [yq] movd m3, [yq+strideq] punpckldq m1, m3 punpcklbw m1, m1 movd m0, [yq+strideq*2] movd m3, [yq+stride3q] punpckldq m0, m3 punpcklbw m0, m0 pmaddubsw m1, m2 pmaddubsw m0, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m5, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movq m0, [yq+strideq] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 movq m1, [yq+strideq*2] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movq m0, [yq+stride3q] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movd m0, [yq+strideq] punpcklbw m0, m0 punpcklqdq m0, m0 pmaddubsw m0, m2 pshufhw m0, 
m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m5, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movd m1, [yq+strideq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movq m1, [yq+strideq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, 
m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 paddd m5, m0 jmp .calc_avg .w32: pxor m0, m0 mova [rsp ], m0 mova [rsp+16], m0 mova [rsp+32], m0 mova [rsp+48], m0 test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_loop test hpadd, hpadd jz .calc_avg_32 jmp .w32_hpad_loop .w32_wpad: cmp wpadd, 2 jl .w32_pad1 je .w32_pad2 cmp wpadd, 4 jl .w32_pad3 je .w32_pad4 cmp wpadd, 6 jl .w32_pad5 je .w32_pad6 .w32_pad7: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 mova m0, m1 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad7 jmp .w32_wpad_done .w32_pad6: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 pshufhw m0, m1, q3333 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, 
m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad6 jmp .w32_wpad_done .w32_pad5: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 mova m5, [rsp] paddw m5, m1 mova [rsp ], m5 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad5 jmp .w32_wpad_done .w32_pad4: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 pshufhw m3, m3, q3333 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad4 jmp .w32_wpad_done .w32_pad3: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 movd m3, [yq+16] punpcklbw m3, m3 punpcklqdq m3, m3 pshufhw m3, m3, q3333 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad3 jmp .w32_wpad_done .w32_pad2: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, [yq+16] punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], 
m5 pshufhw m4, m3, q3333 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad2 jmp .w32_wpad_done .w32_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 punpcklqdq m4, m4 pshufhw m4, m4, q3333 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad1 .w32_wpad_done: test hpadd, hpadd jz .calc_avg_32 .w32_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m5, m1, [rsp] mova [rsp ], m5 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova [acq+32], m3 mova [acq+48], m4 paddw m5, m3, [rsp+32] mova [rsp+32], m5 paddw m5, m4, [rsp+48] mova [rsp+48], m5 add acq, 64 sub hpadd, 1 jg .w32_hpad_loop %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_32: mova m5, [rsp] mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, [rsp+16] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 paddd m5, m0 mova m0, [rsp+32] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 mova m1, [rsp+48] mova m3, m1 psrld m1, 16 pslld m3, 16 psrld m3, 16 paddd m1, m3 paddd m1, m0 paddd m5, m1 .calc_avg: movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET ; %1 simd register that hold the mask and will hold the result ; %2 simd register that holds 
the "true" values ; %3 location of the "false" values (simd register/memory) %macro BLEND 3 ; mask, true, false pand %2, %1 pandn %1, %3 por %1, %2 %endmacro %macro PAETH 2 ; top, ldiff pavgb m1, m%1, m3 pxor m0, m%1, m3 pand m0, m4 psubusb m2, m5, m1 psubb m1, m0 psubusb m1, m5 por m1, m2 paddusb m1, m1 por m1, m0 ; min(tldiff, 255) psubusb m2, m5, m3 psubusb m0, m3, m5 por m2, m0 ; tdiff %ifnum %2 pminub m2, m%2 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff %else mova m0, %2 pminub m2, m0 pcmpeqb m0, m2 %endif pminub m1, m2 pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff mova m2, m3 BLEND m0, m2, m%1 BLEND m1, m0, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h %define base r5-ipred_paeth_ssse3_table tzcnt wd, wm movifnidn hd, hm pxor m0, m0 movd m5, [tlq] pshufb m5, m0 LEA r5, ipred_paeth_ssse3_table movsxd wq, [r5+wq*4] movddup m4, [base+ipred_paeth_shuf] add wq, r5 jmp wq .w4: movd m6, [tlq+1] ; top pshufd m6, m6, q0000 lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: sub tlq, 4 movd m3, [tlq] mova m1, [base+ipred_h_shuf] pshufb m3, m1 ; left PAETH 6, 7 movd [dstq ], m1 pshuflw m0, m1, q1032 movd [dstq+strideq ], m0 punpckhqdq m1, m1 movd [dstq+strideq*2], m1 psrlq m1, 32 movd [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: movddup m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: sub tlq, 2 movd m3, [tlq] pshufb m3, [base+ipred_paeth_shuf] PAETH 6, 7 movq [dstq ], m1 movhps [dstq+strideq], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: sub tlq, 1 movd m3, [tlq] pxor m1, m1 pshufb m3, m1 PAETH 6, 7 movu [dstq], m1 add dstq, strideq sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 
psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 .w32_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] movu [dstq ], m1 mova m6, [rsp+32] PAETH 6, 7 movu [dstq+16], m1 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 mova [rsp+48], m7 movu m6, [tlq+33] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+64], m6 mova [rsp+80], m7 movu m6, [tlq+49] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+96], m6 .w64_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] mova [dstq ], m1 mova m6, [rsp+32] PAETH 6, [rsp+48] mova [dstq+16], m1 mova m6, [rsp+64] PAETH 6, [rsp+80] mova [dstq+32], m1 mova m6, [rsp+96] PAETH 6, 7 mova [dstq+48], m1 add dstq, strideq dec hd jg .w64_loop RET %macro FILTER 4 ;dst, src, tmp, shuf %ifnum %4 pshufb m%2, m%4 %else pshufb m%2, %4 %endif pshufd m%1, m%2, q0000 ;p0 p1 pmaddubsw m%1, m2 pshufd m%3, m%2, q1111 ;p2 p3 pmaddubsw m%3, m3 paddw m%1, [base+pw_8] paddw m%1, m%3 pshufd m%3, m%2, q2222 ;p4 p5 pmaddubsw m%3, m4 paddw m%1, m%3 pshufd m%3, m%2, q3333 ;p6 __ pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 packuswb m%1, m%1 %endmacro cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter %define base r6-$$ LEA r6, $$ tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 lea filterq, [base+filter_intra_taps+filterq] movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 movsxd wq, [base+ipred_filter_ssse3_table+wq*4] mova m2, [filterq+16*0] mova m3, [filterq+16*1] mova m4, [filterq+16*2] mova m5, [filterq+16*3] lea wq, [base+ipred_filter_ssse3_table+wq] mov hd, hm jmp wq .w4: mova m1, [base+filter_shuf1] sub tlq, 3 sub tlq, hq jmp .w4_loop_start .w4_loop: movd m0, [tlq+hq] punpckldq m0, m6 lea dstq, 
[dstq+strideq*2] .w4_loop_start: FILTER 6, 0, 7, 1 movd [dstq+strideq*0], m6 pshuflw m6, m6, q1032 movd [dstq+strideq*1], m6 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 sub tlq, 5 sub tlq, hq .w8_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER 0, 6, 1, [base+filter_shuf2] punpckldq m6, m7, m0 movq [dstq+strideq*0], m6 punpckhqdq m6, m6 movq [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] ;top row sub tlq, 5 sub tlq, hq .w16_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] ;top row lea filterq, [tlq+17] sub tlq, 5 sub tlq, hq .w32_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movu m1, [filterq] punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ 
_ _ _ _ _ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 mova m6, m1 FILTER 7, 0, 6, [base+filter_shuf2] punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+16+strideq*0], m7 psrlq m7, 32 palignr m7, m1, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+20+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+24+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+28+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+16+strideq*1], m6 mova m6, [dstq+strideq*1] movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea filterq, [dstq+16+strideq*1] lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx.h000066400000000000000000000042621517466257200222410ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/cpu.h" decl_cctx_fn(BF(dav2d_cctx, avx2)); decl_itx_fns(avx2); static ALWAYS_INLINE void itx_dsp_init_x86(Dav2dInvTxfmDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; #if BITDEPTH == 8 c->cctx = BF(dav2d_cctx, avx2); assign_itx_fn( , 4, 4, avx2); assign_itx_fn(R, 4, 8, avx2); assign_itx_fn(R, 4, 16, avx2); assign_itx_fn(R, 4, 32, avx2); assign_itx_fn(R, 8, 4, avx2); assign_itx_fn( , 8, 8, avx2); assign_itx_fn(R, 8, 16, avx2); assign_itx_fn(R, 8, 32, avx2); assign_itx_fn(R, 16, 4, avx2); assign_itx_fn(R, 16, 8, avx2); assign_itx_fn( , 16, 16, avx2); assign_itx_fn(R, 32, 4, avx2); assign_itx_fn(R, 32, 8, avx2); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx16_avx2.asm000066400000000000000000011564541517466257200237150ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 %macro COEF_PAIR 2-3 0 pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) %if %3 dd -%2, -%2 %define pd_%2_m%2 pd_%2 %endif %endmacro COEF_PAIR 201, 995 COEF_PAIR 401, 1931 COEF_PAIR 799, 3406 COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2751, 2106 COEF_PAIR 2896, 1567, 1 COEF_PAIR 2896, 3784, 1 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 
3920 COEF_PAIR 3703, 3290 COEF_PAIR 3857, 4052 COEF_PAIR 4017, 2276 COEF_PAIR 4076, 3612 COEF_PAIR 4091, 3973 pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 pd_m1380: dd -1380 pd_m2106: dd -2106 pd_m2598: dd -2598 pd_m2751: dd -2751 pd_m3344: dd -3344 pd_1024: dd 1024 pd_1321: dd 1321 pd_1448: dd 1448 pd_1697: dd 1697 pd_2482: dd 2482 pd_3072: dd 3072 ; 1024 + 2048 pd_3803: dd 3803 pd_5119: dd 5119 ; 1024 + 4096 - 1 pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 pd_6144: dd 6144 ; 2048 + 4096 pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff dconly_10bpc: times 2 dw 0x7c00 dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 clip_20b_max: dd 0x7ffff const idct64_mul_16bpc dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 cextern deint_shuf cextern idct64_mul cextern pw_1697x8 cextern pw_1697x16 cextern pw_1567_3784 cextern pw_m1567_m3784 cextern pw_m3784_1567 cextern pw_2896_2896 cextern pw_m2896_2896 cextern pw_5 cextern pw_2048 cextern pw_4096 cextern pw_8192 cextern pw_16384 cextern pw_2896x8 cextern pd_2048 cextern idct_4x8_internal_8bpc_avx2.main cextern idct_4x16_internal_8bpc_avx2.main cextern idct_8x8_internal_8bpc_avx2.main cextern idct_8x16_internal_8bpc_avx2.main cextern idct_16x4_internal_8bpc_avx2.main cextern idct_16x8_internal_8bpc_avx2.main cextern idct_16x16_internal_8bpc_avx2.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 cextern 
inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal cextern iadst_4x4_internal_8bpc_avx2.main cextern iadst_4x8_internal_8bpc_avx2.main_pass2 cextern iadst_4x16_internal_8bpc_avx2.main2 cextern iadst_8x4_internal_8bpc_avx2.main cextern iadst_8x8_internal_8bpc_avx2.main_pass2 cextern iadst_8x16_internal_8bpc_avx2.main cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end cextern iadst_16x4_internal_8bpc_avx2.main cextern iadst_16x8_internal_8bpc_avx2.main cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end cextern iadst_16x16_internal_8bpc_avx2.main cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %macro WRAP_XMM 1+ INIT_XMM cpuname %1 INIT_YMM cpuname %endmacro %macro IWHT4_1D_PACKED 0 ; m0 = in0 in2, m1 = in1 in3 psubd m2, m0, m1 ; t2 paddd xm0, xm1 ; t0 vpermq m2, m2, q3322 vpermq m0, m0, q1100 vpermq m1, m1, q3120 psubd m3, m0, m2 psrad m3, 1 psubd m3, m1 ; t1 t3 psubd m0, m3 ; ____ out0 paddd m2, m3 ; out3 ____ %endmacro INIT_YMM avx2 cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax mova xm0, [cq+16*0] vinserti128 m0, [cq+16*2], 1 mova xm1, [cq+16*1] vinserti128 m1, [cq+16*3], 1 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 lea r6, [dstq+strideq*2] psrad m0, 2 psrad m1, 2 IWHT4_1D_PACKED punpckhdq m0, m3 punpckldq m3, m2 punpckhqdq m1, m0, m3 punpcklqdq m0, m3 IWHT4_1D_PACKED vpblendd m0, m2, 0x33 packssdw m0, m3 vextracti128 xm2, m0, 1 punpckhdq xm1, xm0, xm2 ; out2 out1 punpckldq xm0, xm2 ; out3 out0 movq xm2, [r6 +strideq*1] movhps xm2, [dstq+strideq*0] movq xm3, [r6 +strideq*0] movhps xm3, [dstq+strideq*1] %ifidn bdmaxd, bdmaxm movd xm5, bdmaxd vpbroadcastw xm5, xm5 %else ; win64: load from stack vpbroadcastw xm5, bdmaxm %endif paddsw xm0, xm2 paddsw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movq [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm0 RET ; dst1 = (src1 
* coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 ; flags: 1 = packed, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else %if %9 & 1 vbroadcasti128 m%3, [pd_%8] %else vpbroadcastd m%3, [pd_%8] %endif pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif %if %7 < 32 pmulld m%1, m%7 pmulld m%2, m%7 %else %if %9 & 1 vbroadcasti128 m%5, [pd_%7] %else vpbroadcastd m%5, [pd_%7] %endif pmulld m%1, m%5 pmulld m%2, m%5 %endif %if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else %ifnum %6 paddd m%4, m%6 %endif paddd m%2, m%4 %endif %ifnum %6 paddd m%1, m%6 %endif psubd m%1, m%3 %ifnum %6 psrad m%2, 12 psrad m%1, 12 %endif %endmacro %macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_%5bpc) ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else %if %3 add eobd, %3 %endif ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 .dconly2: add r6d, 128 sar r6d, 8 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm2 vpbroadcastw xm0, xm0 .dconly_loop: movq xm1, [dstq+strideq*0] movhps xm1, [dstq+strideq*1] paddsw xm1, xm0 psubusw xm1, xm2 movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop WRAP_XMM RET %else jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly %endif %endif %endmacro %macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 punpckhqdq m%3, m%2, m%1 ; t3 t2 punpcklqdq m%2, m%1 ; t0 t1 paddd m%1, m%2, m%3 ; out0 out1 psubd m%2, m%3 ; out3 out2 %endmacro %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd vpbroadcastd m%5, [pw_m3784_1567] punpckhwd m%3, m%2, m%1 vpbroadcastd m%4, [pw_1567_3784] punpcklwd m%2, m%1 vpbroadcastd m%1, [pw_m2896_2896] pmaddwd m%5, m%3 pmaddwd m%3, m%4 vpbroadcastd m%4, [pw_2896_2896] pmaddwd m%1, m%2 pmaddwd m%2, m%4 REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 packssdw m%3, m%5 ; t3 t2 packssdw m%2, m%1 ; t0 t1 paddsw m%1, m%2, m%3 ; out0 out1 psubsw m%2, m%3 ; out3 out2 %endmacro INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, identity INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main vbroadcasti128 m2, [idct4_shuf] packssdw m0, m1 pshufb m0, m2 jmp tx2q .pass2: vextracti128 xm1, m0, 1 WRAP_XMM 
IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 packssdw xm5, xm5 ; pw_2048 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, [dstq+strideq*2] movq xm3, [r6 +strideq*1] movhps xm3, [r6 +strideq*0] vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movhps [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm1 RET ALIGN function_align .main: vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m5, [pd_2048] .main2: IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 ret INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity %macro IADST4_1D 0 vpbroadcastd m5, [pd_1321] vpbroadcastd m7, [pd_2482] pmulld m4, m0, m5 ; 1321*in0 pmulld m6, m3, m7 ; 2482*in3 paddd m4, m6 ; 1321*in0 + 2482*in3 pmulld m6, m0, m7 ; 2482*in0 paddd m0, m3 ; in0 + in3 paddd m7, m5 ; pd_3803 pmulld m5, m2 ; 1321*in2 pmulld m3, m7 ; 3803*in3 pmulld m7, m2 ; 3803*in2 psubd m2, m0 ; in2 - in0 - in3 vpbroadcastd m0, [pd_m3344] pmulld m1, m0 ; -t3 pmulld m2, m0 ; out2 (unrounded) psubd m6, m5 ; 2482*in0 - 1321*in2 paddd m4, m7 ; t0 psubd m6, m3 ; t1 paddd m3, m4, m6 psubd m4, m1 ; out0 (unrounded) psubd m6, m1 ; out1 (unrounded) paddd m3, m1 ; out3 (unrounded) %endmacro cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main vinserti128 m0, m4, xm6, 1 vinserti128 m1, m2, xm3, 1 .pass1_end: vpbroadcastd m5, [pd_2048] mova m2, [itx4_shuf] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 packssdw m0, m1 vpermd m0, m2, m0 psrld m2, 4 pshufb m0, m2 %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif jmp tx2q .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main .end: vpbroadcastd xm4, [pw_2048] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, 
[dstq+strideq*2] movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 RET ALIGN function_align .main: mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] mova xm3, [cq+16*3] %if WIN64 movaps [rsp+16], xmm6 movaps [rsp+32], xmm7 %endif .main2: WRAP_XMM IADST4_1D ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m0, m3, xm2, 1 vinserti128 m1, m6, xm4, 1 jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main vpbroadcastd xm4, [pw_2048] movq xm3, [dstq+strideq*1] movhps xm3, [dstq+strideq*0] lea r6, [dstq+strideq*2] movq xm2, [r6 +strideq*1] movhps xm2, [r6 +strideq*0] vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movhps [dstq+strideq*0], xm1 movq [dstq+strideq*1], xm1 movhps [r6 +strideq*0], xm0 movq [r6 +strideq*1], xm0 RET INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpbroadcastd m1, [pd_5793] pmulld m0, m1, [cq+32*0] pmulld m1, [cq+32*1] vpbroadcastd m5, [pd_2048] mova m3, [itx4_shuf] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 packssdw m0, m1 vpermd m0, m3, m0 psrld m3, 4 pshufb m0, m3 jmp tx2q .pass2: vpbroadcastd m1, 
[pw_1697x8] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, [dstq+strideq*2] pmulhrsw m1, m0 paddsw m0, m1 movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] vpbroadcastd xm4, [pixel_10bpc_max] packssdw m5, m5 ; pw_2048 pmulhrsw m0, m5 pxor m5, m5 mova [cq+32*0], m5 mova [cq+32*1], m5 vextracti128 xm1, m0, 1 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm5 pmaxsw xm1, xm5 pminsw xm0, xm4 pminsw xm1, xm4 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 RET INV_TXFM_4X4_FN dct, dct, 12 INV_TXFM_4X4_FN dct, identity, 12 INV_TXFM_4X4_FN dct, adst, 12 INV_TXFM_4X4_FN dct, flipadst, 12 cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(idct_4x4_internal_10bpc).main mova m3, [idct4_12_shuf] mova m4, [idct4_12_shuf2] vpermd m2, m4, m1 vpermd m1, m3, m0 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_4x4_internal_10bpc).main2 vpermq m0, m0, q3120 vpermq m1, m1, q2031 jmp m(iadst_4x4_internal_12bpc).end INV_TXFM_4X4_FN adst, dct, 12 INV_TXFM_4X4_FN adst, adst, 12 INV_TXFM_4X4_FN adst, flipadst, 12 INV_TXFM_4X4_FN adst, identity, 12 cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m1, m4, xm6, 1 vinserti128 m2, xm3, 1 .pass1_end: mova m3, [itx4_shuf] vpbroadcastd m5, [pd_1024] psrad m1, 1 psrad m2, 1 vpermd m1, m3, m1 vpermd m2, m3, m2 paddd m1, m5 paddd m2, m5 psrad m1, 11 psrad m2, 11 .pass1_end2: vpbroadcastd m3, [clip_18b_min] vpbroadcastd m4, [clip_18b_max] punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pmaxsd m0, m3 pmaxsd m1, m3 pminsd m0, m4 pminsd m1, m4 jmp tx2q .pass2: call .main_pass2 vinserti128 m0, m4, xm6, 1 vinserti128 m1, m2, xm3, 1 .pass2_end: vpbroadcastd m5, [pd_2048] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 .end: %if WIN64 WIN64_RESTORE_XMM_INTERNAL %assign xmm_regs_used 6 %endif .end2: vpbroadcastd m4, 
[pw_16384] movq xm2, [dstq+strideq*0] movq xm3, [dstq+strideq*1] lea r6, [dstq+strideq*2] movhps xm2, [r6 +strideq*0] ; dst0 dst2 movhps xm3, [r6 +strideq*1] ; dst1 dst3 vpbroadcastd m5, [pixel_12bpc_max] vinserti128 m2, xm3, 1 psrad m0, 3 psrad m1, 3 packssdw m0, m1 ; t0 t2 t1 t3 pmulhrsw m0, m4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw m0, m2 ; out0 out2 out1 out3 pmaxsw m0, m4 pminsw m0, m5 vextracti128 xm1, m0, 1 ; out1 out3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [r6 +strideq*0], xm0 movhps [r6 +strideq*1], xm1 RET .main_pass2: vextracti128 xm3, m1, 1 mova xm2, xm1 vextracti128 xm1, m0, 1 jmp m(iadst_4x4_internal_10bpc).main2 INV_TXFM_4X4_FN flipadst, dct, 12 INV_TXFM_4X4_FN flipadst, adst, 12 INV_TXFM_4X4_FN flipadst, flipadst, 12 INV_TXFM_4X4_FN flipadst, identity, 12 cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m1, m3, xm2, 1 vinserti128 m2, m6, xm4, 1 jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: call m(iadst_4x4_internal_12bpc).main_pass2 vinserti128 m0, m3, xm2, 1 vinserti128 m1, m6, xm4, 1 jmp m(iadst_4x4_internal_12bpc).pass2_end INV_TXFM_4X4_FN identity, dct, 12 INV_TXFM_4X4_FN identity, adst, 12 INV_TXFM_4X4_FN identity, flipadst, 12 INV_TXFM_4X4_FN identity, identity, 12 cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 mova m2, [itx4_shuf] vpbroadcastd m3, [pd_1697] vpermd m0, m2, [cq+32*0] vpermd m2, m2, [cq+32*1] vpbroadcastd m5, [pd_2048] pmulld m1, m3, m0 pmulld m3, m2 paddd m1, m5 paddd m3, m5 psrad m1, 12 psrad m3, 12 paddd m1, m0 paddd m2, m3 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] vpbroadcastd m5, [pd_2048] pmulld m0, m3 pmulld m1, m3 paddd m0, m5 ; 2048 paddd m1, m5 psrad m0, 12 psrad m1, 12 jmp m(iadst_4x4_internal_12bpc).end %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct 
vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 %else jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly %endif %endif %endmacro %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 vpbroadcastd m%5, [pd_2896] pmulld m%1, m%5 pmulld m%3, m%5 paddd m%1, m%8 paddd m%5, m%1, m%3 psubd m%1, m%3 psrad m%5, 12 ; t0 psrad m%1, 12 ; t1 psubd m%3, m%1, m%2 paddd m%2, m%1 paddd m%1, m%5, m%4 psubd m%4, m%5, m%4 %endmacro INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, identity INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] pmulld m2, m3, [cq+32*2] pmulld m3, m3, [cq+32*3] vpbroadcastd m7, [pd_2048] REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 jmp tx2q .pass2: packssdw m0, m2 packssdw m1, m3 lea r6, [deint_shuf+128] punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ; 2 3 punpckldq m0, m2 ; 0 1 vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 call m(idct_4x8_internal_8bpc).main vpbroadcastd xm4, [pw_2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+r3 ] movhps xm5, [dstq+strideq*2] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] movq xm7, [r6 +r3 ] movhps xm7, [r6 +strideq*2] paddw xm0, xm4 ; 0 1 paddw xm1, xm5 ; 3 2 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 7 6 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r3 ], xm1 movq [r6 
+strideq*0], xm2 movhps [r6 +strideq*1], xm2 movhps [r6 +strideq*2], xm3 movq [r6 +r3 ], xm3 RET INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main vpbroadcastd m5, [pd_2048] paddd m0, m5, m4 paddd m1, m5, m6 paddd m2, m5 paddd m3, m5 .pass1_end: REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: call .pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 .end: lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+strideq*2] movhps xm5, [dstq+r3 ] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] movq xm7, [r6 +strideq*2] movhps xm7, [r6 +r3 ] paddw xm0, xm4 ; 0 1 paddw xm1, xm5 ; 2 3 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 6 7 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 RET ALIGN function_align .pass2_main: packssdw m0, m2 packssdw m1, m3 lea r6, [deint_shuf+128] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpckhdq m5, m4, m0 punpckldq m4, m0 vextracti128 xm2, m4, 1 ; 4 5 vextracti128 xm3, m5, 1 ; 6 7 pshufd xm4, xm4, q1032 ; 1 0 pshufd xm5, xm5, q1032 ; 3 2 jmp m(iadst_4x8_internal_8bpc).main_pass2 ALIGN function_align .main: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .main2: vbroadcasti128 m0, [cq+16*0] vbroadcasti128 m2, [cq+16*2] vbroadcasti128 m3, [cq+16*5] vbroadcasti128 m1, [cq+16*7] vpbroadcastd m6, [pd_2896] shufpd m0, m2, 0x0c ; 0 2 shufpd m1, m3, 0x0c ; 7 5 vbroadcasti128 m2, [cq+16*4] vbroadcasti128 m4, [cq+16*6] vbroadcasti128 m5, [cq+16*1] 
vbroadcasti128 m3, [cq+16*3] vpbroadcastd m7, [pd_2048] shufpd m2, m4, 0x0c ; 4 6 shufpd m3, m5, 0x0c ; 3 1 REPX {pmulld x, m6}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 .main3: ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 psubd m4, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 REPX {pmaxsd x, m8}, m4, m2, m0, m1 REPX {pminsd x, m9}, m4, m2, m0, m1 pxor m5, m5 psubd m5, m4 vpblendd m4, m2, 0xcc ; t4 t7 vpblendd m2, m5, 0xcc ; t5 -t6 ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 vpbroadcastd m5, [pd_2896] vbroadcasti128 m6, [pw_2048_m2048] ; + + - - punpckhqdq m3, m0, m1 punpcklqdq m0, m1 psubd m1, m0, m3 ; t2 t3 paddd m0, m3 ; out0 -out7 punpckhqdq m3, m4, m2 ; t7a t6a punpcklqdq m4, m2 ; t5a t4a psubd m2, m4, m3 ; t7 t6 paddd m4, m3 ; out6 -out1 REPX {pmaxsd x, m8}, m1, m2 REPX {pminsd x, m9}, m1, m2 vpblendd m3, m1, m2, 0xcc shufpd m1, m2, 0x05 pmulld m3, m5 pmulld m5, m1 psignd m0, m6 ; out0 out7 psignd m4, m6 ; out6 out1 paddd m3, m7 psubd m2, m3, m5 paddd m5, m3 psrad m2, 12 ; out4 -out5 psrad m5, 12 ; -out3 out2 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main vpbroadcastd m5, [pd_2048] paddd m0, m5, m3 paddd m1, m5, m2 paddd m2, m5, m6 paddd m3, m5, m4 jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*1] movhps xm4, [dstq+strideq*0] movq xm5, [dstq+r3 ] movhps xm5, [dstq+strideq*2] movq xm6, [r6 +strideq*1] movhps xm6, [r6 +strideq*0] movq xm7, [r6 +r3 ] movhps xm7, [r6 +strideq*2] paddw xm3, xm4 ; 1 0 paddw xm2, xm5 ; 3 2 paddw xm1, xm6 ; 5 4 
paddw xm0, xm7 ; 7 6 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 movhps [dstq+strideq*0], xm3 movq [dstq+strideq*1], xm3 movhps [dstq+strideq*2], xm2 movq [dstq+r3 ], xm2 movhps [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm1 movhps [r6 +strideq*2], xm0 movq [r6 +r3 ], xm0 RET INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] pmulld m2, m3, [cq+32*2] pmulld m3, [cq+32*3] vpbroadcastd m5, [pd_2048] vpbroadcastd m4, [pd_5793] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: vpbroadcastd m6, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass2_end: vpbroadcastd m4, [pw_4096] packssdw m0, m2 packssdw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 pmulhrsw m2, m4 pmulhrsw m0, m4 punpckhdq m1, m0, m2 ; 2 3 6 7 punpckldq m0, m2 ; 0 1 4 5 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] vpbroadcastq m4, [r6 +strideq*0] vpbroadcastq m5, [r6 +strideq*1] movq xm3, [dstq+strideq*2] movhps xm3, [dstq+r3 ] vpblendd m2, m4, 0x30 vpblendd m2, m5, 0xc0 vpbroadcastq m4, [r6 +strideq*2] vpbroadcastq m5, [r6 +r3 ] vpblendd m3, m4, 0x30 vpblendd m3, m5, 0xc0 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 ; out0 out1 out4 out5 paddw m1, m3 ; out2 out3 out6 out7 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m6 pminsw m1, m6 vextracti128 xm2, m0, 1 ; out4 out5 vextracti128 xm3, m1, 1 ; out6 out7 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], 
xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 ret INV_TXFM_4X8_FN dct, dct, 12 INV_TXFM_4X8_FN dct, identity, 12 INV_TXFM_4X8_FN dct, adst, 12 INV_TXFM_4X8_FN dct, flipadst, 12 cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(idct_4x8_internal_10bpc).pass1 .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 ; transpose & interleave pshufd m0, m0, q1320 pshufd m1, m1, q1320 pshufd m2, m2, q1320 pshufd m3, m3, q1320 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 vpermq m0, m0, q3102 vpermq m2, m2, q3102 vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) vpbroadcastd m7, [pd_2048] call m(idct_8x4_internal_10bpc).main psubd m3, m0, m4 ; out7 out6 paddd m0, m4 ; out0 out1 paddd m1, m2, m5 ; out3 out2 psubd m2, m5 ; out4 out5 pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp m(iadst_4x8_internal_12bpc).end INV_TXFM_4X8_FN adst, dct, 12 INV_TXFM_4X8_FN adst, adst, 12 INV_TXFM_4X8_FN adst, flipadst, 12 INV_TXFM_4X8_FN adst, identity, 12 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main psrad m0, m4, 1 psrad m1, m6, 1 psrad m2, 1 psrad m3, 1 .pass1_end: vpbroadcastd m5, [pd_1024] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 11}, m0, m1, m2, m3 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 packssdw m0, m2 ; 0 1 4 5 (interleaved) packssdw m1, m3 ; 2 3 6 7 (interleaved) mova 
m2, [iadst8_12_shuf] vpermd m0, m2, m0 ; 0 1 4 5 vpermd m1, m2, m1 ; 2 3 6 7 pmulhrsw m0, m4 pmulhrsw m1, m4 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+strideq*2] movhps xm5, [dstq+r3 ] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] vinserti128 m4, xm6, 1 movq xm7, [r6 +strideq*2] movhps xm7, [r6 +r3 ] vinserti128 m5, xm7, 1 paddw m0, m4 ; 0 1 4 5 paddw m1, m5 ; 2 3 6 7 vpbroadcastd m5, [pixel_12bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, m4}, m0, m1 REPX {pminsw x, m5}, m0, m1 vextracti128 xm2, m0, 1 ; out4 out5 vextracti128 xm3, m1, 1 ; out6 out7 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 RET ALIGN function_align .pass2_main: ; transpose & interleave pshufd m0, m0, q1320 pshufd m1, m1, q1320 pshufd m2, m2, q1320 pshufd m3, m3, q1320 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) vpbroadcastd m7, [pd_2048] jmp m(iadst_4x8_internal_10bpc).main3 INV_TXFM_4X8_FN flipadst, dct, 12 INV_TXFM_4X8_FN flipadst, adst, 12 INV_TXFM_4X8_FN flipadst, flipadst, 12 INV_TXFM_4X8_FN flipadst, identity, 12 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main psrad m0, m3, 1 psrad m1, m2, 1 psrad m2, m6, 1 psrad m3, m4, 1 jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_4x8_internal_12bpc).pass2_main shufpd m3, m4, m0, 0x05 ; out1 out0 shufpd m0, m4, 0x05 ; out7 out6 psignd m2, m6 pshufd m6, 
m6, q1032 pshufd m1, m2, q1032 ; out5 out4 psignd m2, m5, m6 ; out3 out2 jmp m(iadst_4x8_internal_12bpc).end INV_TXFM_4X8_FN identity, dct, 12 INV_TXFM_4X8_FN identity, adst, 12 INV_TXFM_4X8_FN identity, flipadst, 12 INV_TXFM_4X8_FN identity, identity, 12 cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_4x8_internal_10bpc).pass1 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 ; m2 = in4 in5 ; m3 = in6 in7 vpbroadcastd m6, [pixel_12bpc_max] call m(iidentity_4x8_internal_10bpc).pass2_end RET %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, identity INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m10, [pd_3072] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] mova m7, [cq+32*7] call .pass1_main pmulld m0, m6, [cq+32*0] pmulld m2, m6, [cq+32*4] pmulld m4, m6, [cq+32*1] pmulld m6, [cq+32*5] call .pass1_main2 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m4 ; 2 3 punpckldq m0, m4 ; 0 1 punpckldq m4, m5, m2 ; 8 9 punpckhdq m5, m2 ; a b vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 vextracti128 xm6, m4, 1 ; c d vextracti128 xm7, m5, 1 ; e f call m(idct_4x16_internal_8bpc).main vpbroadcastd m9, [pw_2048] vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 vinserti128 m2, m4, xm5, 1 ; 8 9 b a vinserti128 m3, m6, xm7, 1 ; c d f e vpbroadcastd m8, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass1_main: vpbroadcastd 
m4, [pd_3784] vpbroadcastd m8, [pd_1567] vpbroadcastd m9, [pd_2048] vpbroadcastd m6, [pd_1448] ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h ret ALIGN function_align .pass1_main2: paddd m0, m10 paddd m4, m10 paddd m8, m0, m2 psubd m0, m2 paddd m9, m4, m6 psubd m4, m6 REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h psubd m2, m0, m1 paddd m1, m0 psubd m6, m4, m5 paddd m5, m4 paddd m0, m8, m3 psubd m3, m8, m3 paddd m4, m9, m7 psubd m7, m9, m7 ret ALIGN function_align .pass2_end: lea r6, [strideq*3] pxor m7, m7 pmulhrsw m0, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw m0, m2, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 ret ALIGN function_align .write_4x4: movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] vpbroadcastq m5, [dstq+strideq*2] vpbroadcastq m6, [dstq+r6 ] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 32*2 vpblendd m4, m5, 0xc0 vpblendd m4, m6, 0x30 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 movhps [dstq+strideq*2], xm5 movq [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m4, 13 psrad m1, m5, 13 psrad m2, 13 psrad m3, 13 psrad m4, m8, 13 psrad m5, m9, 13 psrad m6, 13 psrad m7, 13 jmp tx2q .pass2: call .pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, [pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 pxor m7, m7 psubw m9, m7, m5 vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 pmulhrsw m0, m4, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw 
m0, m2, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 RET ALIGN function_align .write_4x4: movq xm4, [dstq+r6 ] movhps xm4, [dstq+strideq*0] vpbroadcastq m5, [dstq+strideq*1] vpbroadcastq m6, [dstq+strideq*2] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 32*2 vpblendd m4, m5, 0xc0 vpblendd m4, m6, 0x30 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movhps [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm5 movq [dstq+strideq*2], xm5 movq [dstq+r6 ], xm4 lea dstq, [dstq+strideq*4] ret ALIGN function_align .pass2_main: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m4 punpckldq m0, m4 punpckldq m4, m5, m2 punpckhdq m5, m2 vpblendd m3, m0, m1, 0x33 vpblendd m0, m1, 0xcc shufpd m2, m5, m4, 0x05 shufpd m4, m5, 0x05 vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 vinserti128 m0, xm3, 1 ; 0 3 2 1 vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? vinserti128 m2, xm4, 1 ; b 8 9 a call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m5, [pw_2896x8] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 ret ALIGN function_align .main: vbroadcasti128 m0, [cq+16* 0] vbroadcasti128 m4, [cq+16* 2] vbroadcasti128 m1, [cq+16*15] vbroadcasti128 m5, [cq+16*13] vbroadcasti128 m2, [cq+16* 4] vbroadcasti128 m6, [cq+16* 6] vbroadcasti128 m3, [cq+16*11] vbroadcasti128 m7, [cq+16* 9] shufpd m0, m4, 0x0c ; 0 2 shufpd m1, m5, 0x0c ; 15 13 shufpd m2, m6, 0x0c ; 4 6 shufpd m3, m7, 0x0c ; 11 9 vbroadcasti128 m4, [cq+16* 8] vbroadcasti128 m6, [cq+16*10] vbroadcasti128 m5, [cq+16* 7] vbroadcasti128 m7, [cq+16* 5] shufpd m4, m6, 0x0c ; 8 10 shufpd m5, m7, 0x0c ; 7 5 vbroadcasti128 m6, [cq+16*12] vbroadcasti128 m7, [cq+16*14] shufpd m6, m7, 0x0c ; 12 14 vbroadcasti128 m7, [cq+16* 3] vbroadcasti128 m8, [cq+16* 1] shufpd m7, m8, 0x0c ; 3 1 .main2: ; expects: m12 = clip_min m13 = clip_max 
vpbroadcastd m11, [pd_2048] ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 psubd m8, m0, m4 ; t8a t10a paddd m0, m4 ; t0a t2a psubd m4, m1, m5 ; t9a t11a paddd m1, m5 ; t1a t3a psubd m5, m2, m6 ; t12a t14a paddd m2, m6 ; t4a t6a psubd m6, m3, m7 ; t13a t15a paddd m3, m7 ; t5a t7a REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 psubd m7, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 psubd m3, m4, m6 ; t12a t14a paddd m4, m6 ; t8a t10a psubd m6, m8, m5 ; t13a t15a paddd m8, m5 ; t9a t11a REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 punpcklqdq m5, m3, m7 ; t12a t4 punpckhqdq m3, m7 ; t14a t6 punpckhqdq m7, m6, m2 ; t15a t7 punpcklqdq m6, m2 ; t13a t5 ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 vpbroadcastd m10, [pd_2896] vbroadcasti128 m9, [pw_2048_m2048] ; + + - - punpckhqdq m2, m4, m0 ; t10a t2 punpcklqdq m4, m0 ; t8a t0 punpckhqdq m0, m8, m1 ; t11a t3 punpcklqdq m8, m1 ; t9a t1 paddd m1, m6, m7 ; out2 -out3 psubd m6, m7 ; t14a t6 paddd m7, m5, m3 ; -out13 out12 psubd m5, m3 ; t15a t7 psubd m3, m8, m0 ; t11 t3a paddd m8, m0 ; out14 -out15 paddd m0, m4, m2 ; -out1 out0 psubd m4, m2 ; t10 t2a REPX {pmaxsd x, m12}, m6, m5, m3, m4 REPX {pminsd x, m13}, m6, m5, m3, m4 REPX {pmulld x, m10}, m6, m5, m3, m4 paddd m6, m11 paddd m4, m11 paddd m2, m6, m5 ; -out5 out4 psubd m6, m5 ; out10 -out11 psubd m5, m4, m3 ; -out9 out8 paddd m3, m4 ; out6 -out7 REPX {psrad x, 12}, m2, m3, m5, m6 REPX {psignd x, m9}, m1, m8, m3, m6 pshufd m9, m9, q1032 REPX {psignd x, m9}, m0, m7, m2, m5 ret INV_TXFM_4X16_FN 
flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m3, 13 psrad m1, m2, 13 psrad m2, m5, 13 psrad m3, m4, 13 psrad m4, m7, 13 psrad m5, m6, 13 psrad m6, m9, 13 psrad m7, m8, 13 jmp tx2q .pass2: call m(iadst_4x16_internal_10bpc).pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, [pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 pxor m7, m7 psubw m9, m7, m5 vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 pmulhrsw m0, m4, m9 call .write_4x4 pmulhrsw m0, m2, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 RET ALIGN function_align .write_4x4: movq xm4, [dstq+strideq*0] movhps xm4, [dstq+r6 ] vpbroadcastq m5, [dstq+strideq*1] vpbroadcastq m6, [dstq+strideq*2] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 32*2 vpblendd m4, m5, 0x30 vpblendd m4, m6, 0xc0 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 movhps [dstq+strideq*2], xm5 movhps [dstq+r6 ], xm4 lea dstq, [dstq+strideq*4] ret INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] pmulld m1, m7, [cq+32*2] pmulld m5, m7, [cq+32*3] pmulld m2, m7, [cq+32*4] pmulld m6, m7, [cq+32*5] pmulld m3, m7, [cq+32*6] pmulld m7, [cq+32*7] vpbroadcastd m8, [pd_6144] REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 jmp tx2q .pass2: packssdw m0, 
m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m7, [pw_1697x16] vpbroadcastd m8, [pw_2048] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 vpbroadcastd m4, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass2_end: punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 lea r6, [strideq*5] pxor m3, m3 punpckhdq m5, m0, m2 ; 2 3 6 7 punpckldq m0, m2 ; 0 1 4 5 punpckldq m6, m7, m1 ; 8 9 c d punpckhdq m7, m1 ; a b e f pmulhrsw m0, m8 call .write_2x4x2 pmulhrsw m0, m5, m8 call .write_2x4x2 pmulhrsw m0, m6, m8 lea dstq, [dstq+strideq*4] call .write_2x4x2 pmulhrsw m0, m7, m8 call .write_2x4x2 ret ALIGN function_align .write_2x4x2: movq xm1, [dstq+strideq*0] movhps xm1, [dstq+strideq*1] vpbroadcastq m2, [dstq+strideq*4] vpblendd m1, m2, 0x30 vpbroadcastq m2, [dstq+r6 ] vpblendd m1, m2, 0xc0 mova [cq+32*0], m3 mova [cq+32*1], m3 add cq, 32*2 paddw m1, m0 pmaxsw m1, m3 pminsw m1, m4 vextracti128 xm2, m1, 1 movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 movq [dstq+strideq*4], xm2 movhps [dstq+r6 ], xm2 lea dstq, [dstq+strideq*2] ret INV_TXFM_4X16_FN dct, dct, 12 INV_TXFM_4X16_FN dct, identity, 12 INV_TXFM_4X16_FN dct, adst, 12 INV_TXFM_4X16_FN dct, flipadst, 12 cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_4x16_internal_10bpc).pass1 .pass2: punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m9, m2, m3 punpckhdq m2, m3 punpckldq m1, m4, m5 punpckhdq m4, m5 punpckldq m3, m6, m7 punpckhdq m6, m7 punpcklqdq m5, m0, m2 ; 2 6 punpckhqdq m12, m0, m2 ; 3 7 punpcklqdq m0, m8, m9 ; 0 4 punpckhqdq m10, m8, m9 ; 1 5 punpcklqdq m2, m1, m3 ; 8 12 punpckhqdq m13, m1, m3 ; 9 13 punpcklqdq m9, m4, m6 ; 10 14 punpckhqdq m4, m6 ; 11 15 vperm2i128 m1, m5, m9, 0x20 ; 2 10 vperm2i128 m3, m9, m5, 0x31 ; 14 6 vpermq m11, m4, q1302 ; 15 11 ; interleave REPX {vpermq x, x, q3120}, m0, m1, m2, m3, 
m10 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 call m(idct_16x4_internal_10bpc).pass1_main vpermq m6, m12, q1302 ; 7 3 vpermq m5, m13, q3120 ; 9 13 call m(idct_16x4_internal_10bpc).pass1_main2 call m(idct_16x4_internal_10bpc).pass1_main3 REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 mova m4, [idct16_12_shuf] REPX {vpermd x, m4, x}, m0, m1, m2, m3 vpbroadcastd m9, [pw_16384] vpbroadcastd m8, [pixel_12bpc_max] call m(idct_4x16_internal_10bpc).pass2_end RET INV_TXFM_4X16_FN adst, dct, 12 INV_TXFM_4X16_FN adst, adst, 12 INV_TXFM_4X16_FN adst, flipadst, 12 INV_TXFM_4X16_FN adst, identity, 12 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 call .main_pass1 psrad m0, m4, 12 psrad m1, m5, 12 psrad m2, 12 psrad m3, 12 psrad m4, m8, 12 psrad m5, m9, 12 psrad m6, 12 psrad m7, 12 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose_16x4 call m(iadst_4x16_internal_10bpc).main2 pshufd m4, m5, q1032 psrad m5, m6, 3 pshufd m6, m7, q1032 psrad m7, m8, 3 REPX {pshufd x, x, q1032}, m0, m2 REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 .pass2_end: packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 mova m4, [iadst16_12_shuf] REPX {vpermd x, m4, x}, m0, m1, m2, m3 vpbroadcastd m9, [pw_16384] vpbroadcastd m8, [pixel_12bpc_max] lea r6, [strideq*3] pxor m7, m7 pmulhrsw m0, m9 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m1 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m2 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m3 call m(iadst_4x16_internal_10bpc).write_4x4 RET ALIGN function_align .transpose_16x4: ; transpose & interleave punpckldq m8, m0, 
m1 punpckhdq m0, m1 punpckldq m9, m2, m3 punpckhdq m2, m3 punpckldq m1, m4, m5 punpckhdq m4, m5 punpckldq m3, m6, m7 punpckhdq m6, m7 punpcklqdq m10, m8, m0 punpckhqdq m0, m8 punpcklqdq m11, m9, m2 punpckhqdq m2, m9 punpcklqdq m8, m1, m4 punpckhqdq m4, m1 punpcklqdq m9, m3, m6 punpckhqdq m6, m3 vperm2i128 m5, m0, m2, 0x31 ; 7 5 vperm2i128 m7, m0, m2, 0x20 ; 3 1 vperm2i128 m0, m10, m11, 0x20 ; 0 2 vperm2i128 m2, m10, m11, 0x31 ; 4 6 vperm2i128 m1, m4, m6, 0x31 ; 15 13 vperm2i128 m3, m4, m6, 0x20 ; 11 9 vperm2i128 m4, m8, m9, 0x20 ; 8 10 vperm2i128 m6, m8, m9, 0x31 ; 12 14 ret ALIGN function_align .main_pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_3072] paddd m10, m4, m5 psubd m4, m3 psubd m5, m3 paddd m3, m10 psubd m8, m7, m1 paddd m7, m9 psubd m9, m1 paddd m7, m1 REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 paddd m6, m0 ret INV_TXFM_4X16_FN flipadst, dct, 12 INV_TXFM_4X16_FN flipadst, adst, 12 INV_TXFM_4X16_FN flipadst, flipadst, 12 INV_TXFM_4X16_FN flipadst, identity, 12 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 call m(iadst_4x16_internal_12bpc).main_pass1 psrad m0, m3, 12 psrad m1, m2, 12 psrad m2, m5, 12 psrad m3, m4, 12 psrad m4, m7, 12 psrad m5, m6, 12 psrad m6, m9, 12 psrad m7, m8, 12 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_4x16_internal_12bpc).transpose_16x4 call m(iadst_4x16_internal_10bpc).main2 pshufd m4, m3, q1032 psrad m3, m5, 3 psrad m5, m2, 3 pshufd m2, m6, q1032 pshufd m6, m1, q1032 psrad m1, m7, 3 psrad m7, m0, 3 pshufd m0, m8, q1032 REPX {psrad x, 3}, m0, m2, m4, m6 jmp m(iadst_4x16_internal_12bpc).pass2_end INV_TXFM_4X16_FN identity, dct, 12 INV_TXFM_4X16_FN identity, adst, 12 INV_TXFM_4X16_FN identity, flipadst, 12 INV_TXFM_4X16_FN identity, identity, 12 cglobal 
iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_1697] mova m0, [cq+32*0] mova m4, [cq+32*1] mova m1, [cq+32*2] mova m5, [cq+32*3] vpbroadcastd m9, [pd_6144] pmulld m2, m8, m0 pmulld m6, m8, m4 pmulld m3, m8, m1 pmulld m7, m8, m5 mova m10, [cq+32*4] mova m11, [cq+32*5] mova m12, [cq+32*6] mova m13, [cq+32*7] REPX {paddd x, m9}, m2, m6, m3, m7 REPX {psrad x, 12}, m2, m6, m3, m7 paddd m0, m2 pmulld m2, m8, m10 paddd m4, m6 pmulld m6, m8, m11 paddd m1, m3 pmulld m3, m8, m12 paddd m5, m7 pmulld m7, m8, m13 REPX {psrad x, 1 }, m0, m4, m1, m5 REPX {paddd x, m9}, m2, m6, m3, m7 REPX {psrad x, 12}, m2, m6, m3, m7 paddd m2, m10 paddd m6, m11 paddd m3, m12 paddd m7, m13 REPX {psrad x, 1 }, m2, m6, m3, m7 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m8, [pd_5793] vpbroadcastd m9, [pd_1024] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m8, [pw_16384] vpbroadcastd m4, [pixel_12bpc_max] call m(iidentity_4x16_internal_10bpc).pass2_end RET %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 %else jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly %endif %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, identity INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .pass1: vbroadcasti128 m1, [cq+16*1] 
vbroadcasti128 m0, [cq+16*5] vbroadcasti128 m2, [cq+16*3] vbroadcasti128 m3, [cq+16*7] vpbroadcastd m6, [pd_2896] shufpd m1, m0, 0x0c ; 1 5 shufpd m3, m2, 0x0c ; 7 3 vbroadcasti128 m0, [cq+16*0] vbroadcasti128 m4, [cq+16*2] vbroadcasti128 m2, [cq+16*4] vbroadcasti128 m5, [cq+16*6] vpbroadcastd m7, [pd_2048] shufpd m0, m4, 0x0c ; 0 2 shufpd m2, m5, 0x0c ; 4 6 REPX {pmulld x, m6}, m1, m3, m0, m2 REPX {paddd x, m7}, m1, m3, m0, m2 REPX {psrad x, 12}, m1, m3, m0, m2 call .main psubd m3, m0, m4 ; out7 out6 (interleaved) paddd m0, m4 ; out0 out1 (interleaved) paddd m1, m2, m5 ; out3 out2 (interleaved) psubd m2, m5 ; out4 out5 (interleaved) pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp tx2q .pass2: vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 vperm2i128 m1, m0, m2, 0x31 vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q2031 ; out2 out3 jmp m(iadst_8x4_internal_10bpc).end ALIGN function_align .main: ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 vpbroadcastd m6, [pd_2896] punpcklqdq m4, m1, m3 ; t4a t7a punpckhqdq m1, m3 ; t5a t6a psubd m3, m4, m1 ; t5a t6a paddd m4, m1 ; t4 t7 REPX {pmaxsd x, m8}, m3, m4, m0, m2 REPX {pminsd x, m9}, m3, m4, m0, m2 pmulld m3, m6 pshufd m1, m3, q1032 paddd m3, m7 psubd m5, m3, m1 paddd m1, m3 psrad m5, 12 psrad m1, 12 vpblendd m5, m4, 0x33 ; t4 t5 punpckhqdq m4, m1 ; t7 t6 ret INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_10bpc).main vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 jmp tx2q .pass2: call .pass2_main vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q3120 ; out2 out3 .end: vpbroadcastd m1, [pw_2048] pmulhrsw m0, m1 pmulhrsw m1, 
m2 vpbroadcastd m5, [pixel_10bpc_max] .end2: mova xm2, [dstq+strideq*0] vinserti128 m2, [dstq+strideq*1], 1 lea r6, [dstq+strideq*2] mova xm3, [r6 +strideq*0] vinserti128 m3, [r6 +strideq*1], 1 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [r6 +strideq*0], xm1 vextracti128 [r6 +strideq*1], m1, 1 RET ALIGN function_align .pass2_main: vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 lea r6, [deint_shuf+128] vperm2i128 m1, m0, m2, 0x31 vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 jmp m(iadst_8x4_internal_8bpc).main ALIGN function_align .main: vpbroadcastd m1, [pd_2896] pmulld m0, m1, [cq+32*0] pmulld m3, m1, [cq+32*3] pmulld m2, m1, [cq+32*2] pmulld m1, [cq+32*1] vpbroadcastd m4, [pd_2048] REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 .main2: IADST4_1D ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_10bpc).main shufpd m3, m4, m0, 0x05 shufpd m0, m4, 0x05 psignd m2, m6 pshufd m6, m6, q1032 pshufd m1, m2, q1032 psignd m2, m5, m6 jmp tx2q .pass2: call m(iadst_8x4_internal_10bpc).pass2_main vpermq m2, m0, q2031 vpermq m0, m1, q2031 jmp m(iadst_8x4_internal_10bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m4, [pd_2896] vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpermq m2, [cq+32*2], q3120 vpermq m3, [cq+32*3], q3120 vpbroadcastd m7, [pd_2048] REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {paddd x, x }, m0, m1, m2, m3 jmp tx2q .pass2: 
vpbroadcastd m5, [pixel_10bpc_max] vpbroadcastd m4, [pw_1697x8] packssdw m0, m1 packssdw m2, m3 pmulhrsw m1, m4, m0 pmulhrsw m4, m2 paddsw m0, m1 paddsw m2, m4 packssdw m7, m7 ; pw_2048 .pass2_end: punpckhwd m1, m0, m2 punpcklwd m0, m2 lea r6, [dstq+strideq*2] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmulhrsw m2, m7 pmulhrsw m0, m7 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova xm2, [dstq+strideq*0] vinserti128 m2, [r6 +strideq*0], 1 mova xm3, [dstq+strideq*1] vinserti128 m3, [r6 +strideq*1], 1 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 vextracti128 [r6 +strideq*0], m0, 1 vextracti128 [r6 +strideq*1], m1, 1 RET INV_TXFM_8X4_FN dct, dct, 12 INV_TXFM_8X4_FN dct, identity, 12 INV_TXFM_8X4_FN dct, adst, 12 INV_TXFM_8X4_FN dct, flipadst, 12 cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] jmp m(idct_8x4_internal_10bpc).pass1 .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).transpose_4x8 IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 jmp m(iadst_8x4_internal_12bpc).end INV_TXFM_8X4_FN adst, dct, 12 INV_TXFM_8X4_FN adst, adst, 12 INV_TXFM_8X4_FN adst, flipadst, 12 INV_TXFM_8X4_FN adst, identity, 12 cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] call m(iadst_4x8_internal_10bpc).main2 vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main vpbroadcastd m5, [pd_2048] paddd m0, m5, m4 paddd m1, m5, m6 paddd m2, 
m5 paddd m3, m5 .pass2_end: REPX {psrad x, 12}, m0, m1, m2, m3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 pmulhrsw m0, m4 pmulhrsw m1, m2, m4 vpermq m0, m0, q3120 ; out0 out1 vpermq m1, m1, q3120 ; out2 out3 vpbroadcastd m5, [pixel_12bpc_max] jmp m(iadst_8x4_internal_10bpc).end2 ALIGN function_align .pass2_main: call .transpose_4x8 jmp m(iadst_8x4_internal_10bpc).main2 ALIGN function_align .transpose_4x8: ; deinterleave pshufd m0, m0, q3120 pshufd m1, m1, q3120 pshufd m2, m2, q3120 pshufd m3, m3, q3120 ; transpose punpcklqdq m4, m0, m1 punpckhqdq m0, m1 punpcklqdq m5, m2, m3 punpckhqdq m2, m3 vperm2i128 m1, m0, m2, 0x20 ; out1 vperm2i128 m3, m0, m2, 0x31 ; out3 vperm2i128 m2, m4, m5, 0x31 ; out2 vperm2i128 m0, m4, m5, 0x20 ; out0 ret INV_TXFM_8X4_FN flipadst, dct, 12 INV_TXFM_8X4_FN flipadst, adst, 12 INV_TXFM_8X4_FN flipadst, flipadst, 12 INV_TXFM_8X4_FN flipadst, identity, 12 cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] call m(iadst_4x8_internal_10bpc).main2 shufpd m3, m4, m0, 0x05 shufpd m0, m4, 0x05 psignd m2, m6 pshufd m6, m6, q1032 pshufd m1, m2, q1032 psignd m2, m5, m6 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).pass2_main vpbroadcastd m5, [pd_2048] paddd m0, m5, m3 paddd m1, m5, m2 paddd m3, m5, m4 paddd m2, m5, m6 jmp m(iadst_8x4_internal_12bpc).pass2_end INV_TXFM_8X4_FN identity, dct, 12 INV_TXFM_8X4_FN identity, adst, 12 INV_TXFM_8X4_FN identity, flipadst, 12 INV_TXFM_8X4_FN identity, identity, 12 cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_8x4_internal_10bpc).pass1 .pass2: ; m0 = in0 in1 (interleaved) ; m1 = in2 in3 (interleaved) ; m2 = in4 in5 (interleaved) ; m3 = in6 in7 (interleaved) vpbroadcastd m8, [clip_18b_min] 
vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 vpbroadcastd m4, [pd_5793] REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 15}, m0, m1, m2, m3 vpbroadcastd m5, [pixel_12bpc_max] vpbroadcastd m7, [pw_16384] packssdw m0, m1 packssdw m2, m3 jmp m(iidentity_8x4_internal_10bpc).pass2_end %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 .dconly2: add r6d, 384 sar r6d, 9 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm2 vpbroadcastw m0, xm0 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 paddsw m1, m0 psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %else jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly %endif %endif %endmacro %macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a psubd m%9, m%3, m%7 ; t6 paddd m%3, m%7 ; t2 psubd m%7, m%1, m%5 ; t4 paddd m%1, m%5 ; t0 psubd m%5, m%6, m%2 ; t7 paddd m%6, m%2 ; t3 psubd m%2, m%8, m%4 ; t5 paddd m%8, m%4 ; t1 REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a psubd m%10, m%7, m%9 ; t7 paddd m%7, m%9 ; out6 vpbroadcastd m%9, [pd_1448] psubd m%4, m%8, m%6 ; t3 paddd m%8, m%6 ; -out7 psubd m%6, m%1, m%3 ; t2 paddd m%1, m%3 ; out0 psubd m%3, m%2, m%5 ; t6 paddd m%2, m%5 ; -out1 REPX {pmaxsd 
x, m%13}, m%6, m%4, m%3, m%10
    REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
    ; Multiply the four clamped terms by the constant broadcast into m%9
    ; (loaded from pd_1448 earlier in this macro). No rounding or shift is
    ; applied here; that is left to the code that expands the macro.
    REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
    psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
    paddd m%4, m%6 ; (t2 + t3) * 1448
    psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
    paddd m%3, m%10 ; (t6 + t7) * 1448
%endmacro

; Expand INV_TXFM_8X8_FN for type1 = dct with each type2; the optional third
; argument (bitdepth) is omitted, so the macro default of 10 applies.
INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

; idct_8x8_internal_10bpc
; Entry: loads the clip_18b_min / clip_18b_max broadcasts into m12 / m13,
; which .main uses as clamp bounds, then falls into .pass1.
cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
.pass1:
    ; Load eight coefficient blocks from cq at 32-byte spacing into m0-m7.
    mova m0, [cq+32*0]
    mova m1, [cq+32*1]
    mova m2, [cq+32*2]
    mova m3, [cq+32*3]
    mova m4, [cq+32*4]
    mova m5, [cq+32*5]
    mova m6, [cq+32*6]
    mova m7, [cq+32*7]
    ; m11 = rounding constant consumed by .main
    vpbroadcastd m11, [pd_2048]
    call .main
    call .round_shift1
    ; Continue at the address held in tx2q
    ; (presumably the second-pass entry chosen by INV_TXFM_FN - not visible here).
    jmp tx2q
.pass2:
    call .transpose_8x8_packed
    call m(idct_8x8_internal_8bpc).main
    ; m12 is reused as the pmulhrsw multiplier for the final scaling.
    vpbroadcastd m12, [pw_2048]
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    ; First four rows: .write_8x4_start also sets up m10, m11 and r6.
    call .write_8x4_start
    pmulhrsw m0, m2, m12
    pmulhrsw m1, m3, m12
    ; Next four rows: reuses m10, m11 and r6 from the call above.
    call .write_8x4
    RET
ALIGN function_align
; .write_8x4_start: initialise the store helper, then fall through into it.
;   m11 = broadcast of pixel_10bpc_max (upper clamp)
;   r6  = strideq*3 (offset of the fourth row)
;   m10 = 0 (lower clamp and the value used to clear coefficients)
.write_8x4_start:
    vpbroadcastd m11, [pixel_10bpc_max]
    lea r6, [strideq*3]
    pxor m10, m10
; .write_8x4: add m0/m1 to four destination rows and store them back.
;   In:  m0 = rows 0 (low lane) and 1 (high lane), m1 = rows 2 and 3,
;        m10 / m11 / r6 as set by .write_8x4_start.
;   Out: dstq advanced by four rows, cq advanced by 32*4 bytes.
;   Clobbers m8, m9 (and m0, m1).
.write_8x4:
    mova xm8, [dstq+strideq*0]
    vinserti128 m8, [dstq+strideq*1], 1
    mova xm9, [dstq+strideq*2]
    vinserti128 m9, [dstq+r6 ], 1
    ; Clear 4*32 bytes of the coefficient buffer with m10 and step past them.
    mova [cq+32*0], m10
    mova [cq+32*1], m10
    mova [cq+32*2], m10
    mova [cq+32*3], m10
    add cq, 32*4
    paddw m0, m8
    paddw m1, m9
    ; Clamp the sums to [m10, m11].
    pmaxsw m0, m10
    pmaxsw m1, m10
    pminsw m0, m11
    pminsw m1, m11
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], xm1
    vextracti128 [dstq+r6 ], m1, 1
    lea dstq, [dstq+strideq*4]
    ret
ALIGN function_align
; .transpose_8x8_packed: narrow m0-m7 from dwords to words with signed
; saturation (m0..m3 paired with m4..m7) and interleave the result into m0-m3.
; Also leaves r6 = deint_shuf+128 for the code called afterwards.
; m4 is used as scratch.
.transpose_8x8_packed:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    lea r6, [deint_shuf+128]
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckhdq m2, m4, m1
    punpckldq m4, m1
    vinserti128 m1, m3, xm2, 1
    vperm2i128 m3, m2, 0x31
    vperm2i128 m2, m0, m4, 0x31
    vinserti128 m0,
xm4, 1
    ret
ALIGN function_align
; .main_rect2: add m11 to m0-m7 and arithmetic-shift right by 12, then fall
; through into .main. Callers in this file multiply the inputs by pd_2896
; before calling, so this completes that scaling step.
.main_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
; .main: 8-point inverse DCT butterflies on dword lanes.
;   In:  m0-m7 = inputs, m11 = rounding constant (callers load pd_2048),
;        m12 / m13 = clamp min / max.
;   Out: m0, m6, m5, m3 = dct4 out0..out3 (clamped); m8, m1, m7, m9 hold
;        t4, t5, t6, t7. The final add/sub between the two groups is done by
;        the caller's follow-up routine (e.g. .round_shift1).
;   m10 is passed to ITX_MULSUB_2D as a temporary.
.main:
    ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
    ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
    paddd m8, m1, m5 ; t4
    psubd m1, m5 ; t5a
    paddd m9, m7, m3 ; t7
    psubd m7, m3 ; t6a
    vpbroadcastd m3, [pd_2896]
    ; Clamp the intermediates before they are multiplied.
    REPX {pmaxsd x, m12}, m1, m8, m7, m9
    REPX {pminsd x, m13}, m1, m8, m7, m9
    REPX {pmulld x, m3 }, m0, m4, m7, m1
    ; Add the rounding term once per pair; it carries through the add/sub below.
    paddd m0, m11
    paddd m7, m11
    psubd m5, m0, m4
    paddd m0, m4
    psubd m4, m7, m1
    paddd m7, m1
    REPX {psrad x, 12 }, m5, m0, m4, m7
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    paddd m6, m5, m2 ; dct4 out1
    psubd m5, m2 ; dct4 out2
    REPX {pmaxsd x, m12}, m0, m6, m5, m3
    REPX {pminsd x, m13}, m0, m6, m5, m3
    ret
ALIGN function_align
; .round_shift1: combine the two halves left by .main into out0..out7 in
; m0..m7 and shift right by 1 with rounding.
.round_shift1:
    ; pcmpeqd of a register with itself sets every dword to -1, so the psubd
    ; below adds 1 to each even-half term. Each output is one even-half term
    ; plus or minus one odd-half term, so every output receives the +1 bias.
    pcmpeqd m1, m1
    REPX {psubd x, m1}, m0, m6, m5, m3
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    psubd m7, m0, m9 ; out7
    paddd m0, m9 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    psubd m4, m3, m8 ; out4
    paddd m3, m8 ; out3
    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
    ret

; Expand INV_TXFM_8X8_FN for type1 = adst with each type2 (bitdepth defaults
; to 10).
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; iadst_8x8_internal_10bpc
; Entry: m12 / m13 = clip_18b_min / clip_18b_max broadcasts, which .main
; passes to IADST8_1D as the clamp bounds.
cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
.pass1:
    call .main
    call .main_end
    jmp tx2q
.pass2:
    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
    pshufd m4, m0, q1032
    pshufd m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    ; Build the pmulhrsw multiplier in m12. The xmm broadcast zeroes the upper
    ; 128 bits, so after the subtraction the low lane holds
    ; pw_4096 - pw_2048 and the high lane holds -pw_2048, i.e. the two lanes
    ; are scaled with opposite signs.
    vpbroadcastd m5, [pw_2048]
    vpbroadcastd xm12, [pw_4096]
    psubw m12, m5
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw m0, m2, m12
    pmulhrsw m1, m3, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    RET
ALIGN function_align
; .main: load the eight coefficient blocks from cq into m0-m7
; (the sequence continues below).
.main:
    mova m0, [cq+32*0]
    mova m7, [cq+32*7]
    mova m1, [cq+32*1]
    mova m6, [cq+32*6]
    mova m2,
[cq+32*2]
    mova m5, [cq+32*5]
    mova m3, [cq+32*3]
    mova m4, [cq+32*4]
    ; m11 = rounding constant handed to IADST8_1D
    vpbroadcastd m11, [pd_2048]
; .main2: entry for callers that have already loaded m0-m7 and m11.
;   Runs IADST8_1D with m8-m10 as temporaries and m12 / m13 as clamp bounds.
;   Out: m8 = pd_1 and m9 = pd_3072, which the .main_end routines consume.
.main2:
    IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
    ; IADST8_1D leaves the pd_1448 broadcast in m8; 1448 >> 10 == 1.
    psrld m8, 10 ; pd_1
    vpbroadcastd m9, [pd_3072]
    ret
ALIGN function_align
; .main_end (iadst): final rounding of the first-pass outputs.
;   In:  m0-m7 from .main2, m8 = pd_1, m9 = pd_3072.
;   m0 / m6 are rounded as (x + 1) >> 1, m1 / m7 are negated: (1 - x) >> 1.
;   m2 / m4 still carry the * 1448 scale from IADST8_1D and are rounded with
;   a single >> 12; m3 / m5 likewise but negated (see the identities below).
.main_end:
    paddd m0, m8
    psubd m1, m8, m1
    paddd m6, m8
    psubd m7, m8, m7
    REPX {psrad x, 1 }, m0, m1, m6, m7
    ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
    ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
    psubd m8, m9, m8 ; pd_3071
    paddd m2, m9
    psubd m3, m8, m3
    paddd m4, m9
    psubd m5, m8, m5
    REPX {psrad x, 12}, m2, m3, m4, m5
    ret

; Expand INV_TXFM_8X8_FN for type1 = flipadst with each type2 (bitdepth
; defaults to 10).
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; iflipadst_8x8_internal_10bpc
; Shares the ADST core with iadst_8x8_internal_10bpc; only the output
; ordering differs (its own .main_end in pass 1, the vpermq selection in
; pass 2).
cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
.pass1:
    call m(iadst_8x8_internal_10bpc).main
    call .main_end
    jmp tx2q
.pass2:
    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
    pshufd m4, m0, q1032
    pshufd m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    ; Multiplier for pmulhrsw: m12 = pw_2048 minus (pw_4096 in the low lane,
    ; zero in the high lane, because the xmm broadcast clears the upper half).
    ; The lane signs are therefore the opposite of the iadst pass 2.
    vpbroadcastd m12, [pw_2048]
    vpbroadcastd xm5, [pw_4096]
    psubw m12, m5
    ; Take the registers in reverse order (m3, m2, m1, m0) for the flip.
    vpermq m8, m3, q2031
    vpermq m9, m2, q2031
    vpermq m2, m1, q2031
    vpermq m3, m0, q2031
    pmulhrsw m0, m8, m12
    pmulhrsw m1, m9, m12
    call m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw m0, m2, m12
    pmulhrsw m1, m3, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    RET
ALIGN function_align
; .main_end (iflipadst): same rounding as the iadst .main_end, but each
; result is written to the mirrored register (m0<->m7, m1<->m6, m2<->m5,
; m3<->m4).
;   In:  m0-m7 from iadst .main2, m8 = pd_1, m9 = pd_3072.
;   m10 is used as scratch.
.main_end:
    paddd m10, m8, m0
    psubd m0, m8, m7
    psubd m7, m8, m1
    paddd m1, m8, m6
    psrad m0, 1
    psrad m1, 1
    psrad m6, m7, 1
    psrad m7, m10, 1
    ; m9 - m8 = 3072 - 1, matching the (3071 - x) >> 12 form used by iadst.
    psubd m8, m9, m8 ; pd_3071
    psubd m10, m8, m5
    paddd m5, m9, m2
    psubd m2, m8, m3
    paddd m3, m9, m4
    psrad m4, m2, 12
    psrad m2, m10, 12
    psrad m3, 12
    psrad m5, 12
    ret

; Expand INV_TXFM_8X8_FN for type1 = identity with each type2 (bitdepth
; defaults to 10).
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; iidentity_8x8_internal_10bpc
; Pass 1 only loads the eight coefficient blocks; no arithmetic is applied to
; them before jumping to the second pass.
cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass1:
    mova m0, [cq+32*0]
    mova m1, [cq+32*1]
    mova m2, [cq+32*2]
    mova m3, [cq+32*3]
    mova m4,
[cq+32*4]
    mova m5, [cq+32*5]
    mova m6, [cq+32*6]
    mova m7, [cq+32*7]
    jmp tx2q
.pass2:
    ; m3 / m7 are packed first so that m7 can be reused for the pixel maximum.
    packssdw m3, m7
    vpbroadcastd m7, [pixel_10bpc_max]
; .pass2_main: shared tail, also entered from iidentity_8x8_internal_12bpc
; with m7 = pixel_12bpc_max and m3 already packed.
.pass2_main:
    ; Narrow the remaining dword pairs to words with signed saturation.
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    ; m12 = pmulhrsw multiplier for the final scaling
    vpbroadcastd m12, [pw_4096]
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckldq m2, m4, m1
    punpckhdq m4, m1
    punpckhqdq m1, m0, m2 ; 1 5
    punpcklqdq m0, m2 ; 0 4
    punpcklqdq m2, m3, m4 ; 2 6
    punpckhqdq m3, m4 ; 3 7
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    ; Rows 0, 4 (m0) and 1, 5 (m1); also sets up r6 and m6.
    call .write_2x8x2_start
    pmulhrsw m0, m2, m12
    pmulhrsw m1, m3, m12
    ; Rows 2, 6 and 3, 7: dstq was advanced by two rows by the previous call.
    call .write_2x8x2_zero
    RET
; .write_2x8x2_start: r6 = strideq*5, m6 = 0, then fall through.
.write_2x8x2_start:
    lea r6, [strideq*5]
    pxor m6, m6
; .write_2x8x2_zero: clear 4*32 bytes at cq with m6, advance cq past them,
; then fall through into the store.
.write_2x8x2_zero:
    mova [cq+32*0], m6
    mova [cq+32*1], m6
    mova [cq+32*2], m6
    mova [cq+32*3], m6
    add cq, 32*4
; .write_2x8x2: add m0 / m1 to destination rows and store them back.
;   In:  m0 = rows +0 (low lane) and +4 (high lane),
;        m1 = rows +1 (low lane) and +5 (high lane, addressed via r6),
;        m6 = 0 (lower clamp), m7 = pixel maximum (upper clamp).
;   Out: dstq advanced by two rows. Clobbers m4, m5.
.write_2x8x2:
    mova xm4, [dstq+strideq*0]
    vinserti128 m4, [dstq+strideq*4], 1
    mova xm5, [dstq+strideq*1]
    vinserti128 m5, [dstq+r6 ], 1
    paddw m0, m4
    paddw m1, m5
    pmaxsw m0, m6
    pmaxsw m1, m6
    pminsw m0, m7
    pminsw m1, m7
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*4], m0, 1
    vextracti128 [dstq+r6 ], m1, 1
    lea dstq, [dstq+strideq*2]
    ret

; TRANSPOSE_8X8_DWORD: transpose an 8x8 matrix of dwords held in eight
; registers, in place.
;   %1-%8  = register numbers of the rows (inputs and outputs)
;   %9-%12 = register numbers of four temporaries (clobbered)
; Three stages: dword interleave, qword interleave, then 128-bit lane
; selection (0x20 = both low lanes, 0x31 = both high lanes).
; The letter strings in the trailing comments trace element positions.
%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
    punpckldq m%9, m%1, m%2 ; aibj emfn
    punpckhdq m%1, m%2 ; ckdl gohp
    punpckldq m%10, m%3, m%4 ; qyrz uCvD
    punpckhdq m%3, m%4 ; sAtB wExF
    punpckldq m%11, m%5, m%6 ; GOHP KSLT
    punpckhdq m%5, m%6 ; IQJR MUNV
    punpckldq m%12, m%7, m%8 ; WeXf aibj
    punpckhdq m%7, m%8 ; YgZh ckdl
    punpcklqdq m%2, m%9, m%10 ; aiqy emuC
    punpckhqdq m%9, m%10 ; bjrz fnvD
    punpcklqdq m%4, m%1, m%3 ; cksA gowE
    punpckhqdq m%10, m%1, m%3 ; dltB hpxF
    punpcklqdq m%6, m%11, m%12 ; GOWe KSai
    punpckhqdq m%11, m%12 ; HPXf LTbj
    punpcklqdq m%8, m%5, m%7 ; IQYg MUck
    punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
    vperm2i128 m%1, m%2, m%6, 0x20 ; out0
    vperm2i128 m%5, m%2, m%6, 0x31 ; out4
    vperm2i128 m%2, m%9, m%11, 0x20 ; out1
    vperm2i128 m%6, m%9, m%11, 0x31 ; out5
    vperm2i128 m%3, m%4, m%8, 0x20 ; out2
    vperm2i128 m%7, m%4, m%8, 0x31 ; out6
    vperm2i128
m%4, m%10, m%12, 0x20 ; out3 vperm2i128 m%8, m%10, m%12, 0x31 ; out7 %endmacro INV_TXFM_8X8_FN dct, dct, 12 INV_TXFM_8X8_FN dct, identity, 12 INV_TXFM_8X8_FN dct, adst, 12 INV_TXFM_8X8_FN dct, flipadst, 12 cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x8_internal_10bpc).pass1 .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose_8x8 vpbroadcastd m11, [pd_2048] call m(idct_8x8_internal_10bpc).main call .round_shift4 jmp m(iadst_8x8_internal_12bpc).pass2_end ALIGN function_align .write_8x4_start: vpbroadcastd m11, [pixel_12bpc_max] lea r6, [strideq*3] pxor m10, m10 ret ALIGN function_align .transpose_8x8: TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ret ALIGN function_align .round_shift4: vpbroadcastd m1, [pd_8] REPX {paddd x, m1}, m0, m6, m5, m3 paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 paddd m0, m9 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_8X8_FN adst, dct, 12 INV_TXFM_8X8_FN adst, adst, 12 INV_TXFM_8X8_FN adst, flipadst, 12 INV_TXFM_8X8_FN adst, identity, 12 cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x8_internal_10bpc).pass1 .pass2: call .pass2_main .pass2_end: packssdw m0, m1 packssdw m1, m2, m3 REPX {vpermq x, x, q3120}, m0, m1 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 packssdw m0, m4, m5 packssdw m1, m6, m7 REPX {vpermq x, x, q3120}, m0, m1 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .pass2_main: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, 
m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_12bpc).transpose_8x8 vpbroadcastd m11, [pd_2048] .pass2_main2: call m(iadst_8x8_internal_10bpc).main2 pslld m9, m8, 3 ; pd_8 paddd m0, m9 psubd m1, m9, m1 ; 8+x paddd m6, m9 psubd m7, m9, m7 REPX {psrad x, 4}, m0, m1, m6, m7 vpbroadcastd m9, [pd_17408] psubd m8, m9, m8 ; 17407 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 REPX {psrad x, 15}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct, 12 INV_TXFM_8X8_FN flipadst, adst, 12 INV_TXFM_8X8_FN flipadst, flipadst, 12 INV_TXFM_8X8_FN flipadst, identity, 12 cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x8_internal_10bpc).pass1 .pass2: call m(iadst_8x8_internal_12bpc).pass2_main packssdw m7, m7, m6 packssdw m6, m1, m0 packssdw m1, m5, m4 vpermq m0, m7, q3120 vpermq m1, m1, q3120 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 packssdw m0, m3, m2 vpermq m0, m0, q3120 vpermq m1, m6, q3120 call m(idct_8x8_internal_10bpc).write_8x4 RET INV_TXFM_8X8_FN identity, dct, 12 INV_TXFM_8X8_FN identity, adst, 12 INV_TXFM_8X8_FN identity, flipadst, 12 INV_TXFM_8X8_FN identity, identity, 12 cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(iidentity_8x8_internal_10bpc).pass1 .pass2: packssdw m3, m7 vpbroadcastd m7, [pixel_12bpc_max] jmp m(iidentity_8x8_internal_10bpc).pass2_main %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, identity, 35 INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, 
stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call .pass1_main sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call .pass1_main mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call .pass1_main pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call .transpose call m(idct_8x16_internal_8bpc).main vpbroadcastd m12, [pw_2048] REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 .end: pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m4, m12 pmulhrsw m1, m5, m12 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m6, m12 pmulhrsw m1, m7, m12 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .transpose: packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m7, m15 lea r6, [deint_shuf+128] punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpcklwd m3, m4, m5 punpckhwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhdq m7, m3, m6 punpckldq m3, m6 punpckhdq m6, m4, m5 punpckldq m4, m5 punpckhdq m5, m8, m1 punpckldq m8, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 vperm2i128 m2, m0, m3, 0x31 vinserti128 m0, xm3, 1 vperm2i128 m3, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m7, m5, m6, 0x31 vinserti128 m5, xm6, 1 vperm2i128 m6, m8, m4, 0x31 vinserti128 m4, m8, xm4, 1 ret ALIGN function_align .pass1_main: pmulld m0, m14, [cq+32* 0] pmulld m1, m14, [cq+32* 2] pmulld m2, m14, [cq+32* 4] pmulld m3, m14, 
[cq+32* 6] pmulld m4, m14, [cq+32* 8] pmulld m5, m14, [cq+32*10] pmulld m6, m14, [cq+32*12] pmulld m7, m14, [cq+32*14] call m(idct_8x8_internal_10bpc).main_rect2 jmp m(idct_8x8_internal_10bpc).round_shift1 ALIGN function_align .main_evenhalf: paddd m1, m6, m7 ; idct8 out1 psubd m6, m7 ; idct8 out6 psubd m7, m0, m9 ; idct8 out7 paddd m0, m9 ; idct8 out0 paddd m2, m5, m4 ; idct8 out2 psubd m5, m4 ; idct8 out5 psubd m4, m3, m8 ; idct8 out4 paddd m3, m8 ; idct8 out3 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 ret .main_oddhalf_fast_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_fast: ; lower half zero vpbroadcastd m7, [pd_4076] vpbroadcastd m8, [pd_401] vpbroadcastd m6, [pd_m1189] vpbroadcastd m9, [pd_3920] vpbroadcastd m5, [pd_3612] vpbroadcastd m10, [pd_1931] vpbroadcastd m4, [pd_m2598] vpbroadcastd m15, [pd_3166] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 pmulld m1, m9 pmulld m5, m2 pmulld m2, m10 pmulld m4, m3 pmulld m3, m15 jmp .main_oddhalf_fast2 .main_oddhalf_rect2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf: ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a .main_oddhalf_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 psubd m8, m0, m4 ; t9 paddd m0, m4 ; t8 psubd m4, m6, m2 ; t10 paddd m2, m6 ; t11 psubd m6, m1, m5 ; t13 paddd m5, m1 ; t12 psubd m1, m7, m3 ; t14 paddd m7, m3 ; t15 REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 psubd m3, m1, m4 ; t10 paddd 
m1, m4 ; t9 psubd m4, m0, m2 ; t11a paddd m0, m2 ; t8a psubd m2, m8, m6 ; t13 paddd m6, m8 ; t14 psubd m8, m7, m5 ; t12a paddd m7, m5 ; t15a REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pmulld x, m14}, m2, m8, m3, m4 paddd m2, m11 paddd m8, m11 paddd m5, m2, m3 ; t13a psubd m2, m3 ; t10a psubd m3, m8, m4 ; t11 paddd m4, m8 ; t12 REPX {psrad x, 12}, m5, m2, m3, m4 mova [r6-32*4], m7 mova [r6-32*3], m6 mova [r6-32*2], m5 mova [r6-32*1], m4 mova [r6+32*0], m3 mova [r6+32*1], m2 mova [r6+32*2], m1 mova [r6+32*3], m0 ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity, 35 cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call .pass1_main call m(iadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call .pass1_main call m(iadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call .pass1_main call m(iadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [pw_2048] vpbroadcastd xm12, [pw_4096] REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 psubw m12, m8 jmp m(idct_8x16_internal_10bpc).end ALIGN function_align .pass1_main: pmulld m0, m14, [cq+32* 0] pmulld m7, m14, [cq+32*14] pmulld m1, m14, [cq+32* 2] pmulld m6, m14, [cq+32*12] 
pmulld m2, m14, [cq+32* 4] pmulld m5, m14, [cq+32*10] pmulld m3, m14, [cq+32* 6] pmulld m4, m14, [cq+32* 8] REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 jmp m(iadst_8x8_internal_10bpc).main2 INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity, 35 cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m12, [pw_2048] vpbroadcastd xm13, [pw_4096] mova m11, m0 vpermq m0, m7, q2031 mova m10, m1 vpermq m1, m6, q2031 mova m9, m2 vpermq m2, m5, q2031 mova m8, m3 vpermq m3, m4, q2031 vpermq m4, m8, q3120 vpermq m5, m9, q3120 vpermq m6, m10, q3120 vpermq m7, m11, q3120 psubw m12, m13 jmp m(idct_8x16_internal_10bpc).end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] pmulhrsw m%2, m%3, m%1 %if %0 == 4 
; if downshifting by 1 %ifnum %4 pmulhrsw m%2, m%4 %else ; without rounding psraw m%2, 1 %endif %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m8, m15, [cq+32* 1] pmulld m1, m15, [cq+32* 2] pmulld m9, m15, [cq+32* 3] pmulld m2, m15, [cq+32* 4] pmulld m10, m15, [cq+32* 5] pmulld m3, m15, [cq+32* 6] pmulld m11, m15, [cq+32* 7] pmulld m4, m15, [cq+32* 8] pmulld m12, m15, [cq+32* 9] pmulld m5, m15, [cq+32*10] pmulld m13, m15, [cq+32*11] pmulld m6, m15, [cq+32*12] pmulld m14, m15, [cq+32*13] pmulld m7, m15, [cq+32*14] pmulld m15, [cq+32*15] mova [cq], m7 vpbroadcastd m7, [pd_2048] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m13, m7, m15 vpbroadcastd m8, [pw_1697x16] REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 vpbroadcastd m7, [pixel_10bpc_max] vpbroadcastd m12, [pw_2048] call .pass2_end RET ALIGN function_align .pass2_end: punpckhwd m9, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m13 punpcklwd m6, m13 punpckhwd m13, m4, m5 punpcklwd m4, m5 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckhdq m3, m0, m5 punpckldq m0, m5 punpckhdq m11, m9, m2 punpckldq m9, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckldq m6, m13, m1 punpckhdq m13, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m8, m9, m6 punpckhqdq m9, m6 punpcklqdq m10, m11, m13 punpckhqdq m11, m13 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(iidentity_8x8_internal_10bpc).write_2x8x2_start pmulhrsw m0, m12, m2 pmulhrsw m1, m12, m3 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m8 pmulhrsw m1, m12, m9 lea dstq, 
[dstq+strideq*4] call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m10 pmulhrsw m1, m12, m11 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero ret INV_TXFM_8X16_FN dct, dct, 0, 12 INV_TXFM_8X16_FN dct, identity, 35, 12 INV_TXFM_8X16_FN dct, adst, 0, 12 INV_TXFM_8X16_FN dct, flipadst, 0, 12 cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call .transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] mova [cq+32* 8], m0 mova [cq+32*10], m2 mova [cq+32*12], m4 mova [cq+32*14], m6 pmaxsd m0, m12, [cq+32* 1] pmaxsd m4, m12, m1 pmaxsd m1, m12, [cq+32* 3] pmaxsd m2, m12, [cq+32* 5] pmaxsd m6, m12, m5 pmaxsd m5, m12, m3 pmaxsd m3, m12, [cq+32* 7] pmaxsd m7, m12 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(idct_8x16_internal_10bpc).main_oddhalf pmaxsd m0, m12, [cq+32* 0] pmaxsd m1, m12, [cq+32* 2] pmaxsd m2, m12, [cq+32* 4] pmaxsd m3, m12, [cq+32* 6] pmaxsd m4, m12, [cq+32* 8] pmaxsd m5, m12, [cq+32*10] pmaxsd m6, m12, [cq+32*12] pmaxsd m7, m12, [cq+32*14] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf vpbroadcastd m11, [pd_8] REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_16x8_internal_10bpc).pass1_rotations REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 .end: packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 packssdw m4, m8, m9 packssdw m5, m10, m11 packssdw m6, m12, m13 packssdw m7, m14, m15 vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m2, q3120 vpermq m1, m3, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m4, q3120 
vpermq m1, m5, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m6, q3120 vpermq m1, m7, q3120 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .transpose: mova [cq+32* 8], m8 mova [cq+32* 9], m9 mova [cq+32*10], m10 mova [cq+32*11], m11 call m(idct_8x8_internal_12bpc).transpose_8x8 mova [cq+32* 0], m0 mova [cq+32* 1], m1 mova [cq+32* 2], m2 mova [cq+32* 3], m3 mova [cq+32* 4], m4 mova [cq+32* 5], m5 mova [cq+32* 6], m6 mova [cq+32* 7], m7 mova m0, [cq+32* 8] mova m1, [cq+32* 9] mova m2, [cq+32*10] mova m3, [cq+32*11] mova m4, m12 mova m5, m13 mova m6, m14 mova m7, m15 jmp m(idct_8x8_internal_12bpc).transpose_8x8 INV_TXFM_8X16_FN adst, dct, 0, 12 INV_TXFM_8X16_FN adst, adst, 0, 12 INV_TXFM_8X16_FN adst, flipadst, 0, 12 INV_TXFM_8X16_FN adst, identity, 35, 12 cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call .pass2_main call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_end: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 jmp m(idct_8x16_internal_12bpc).end ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] mova [cq+32* 8], m0 mova [cq+32*11], m3 mova [cq+32*12], m4 mova [cq+32*15], m7 pmaxsd m0, m13, [cq+32* 2] ; 2 pmaxsd m3, m13, m1 ; 9 pmaxsd m1, m13, m5 ; 13 pmaxsd m4, m13, m2 ; 10 pmaxsd m2, m13, [cq+32* 6] ; 6 pmaxsd m5, m13, [cq+32* 5] ; 5 pmaxsd m6, m13, m6 ; 14 pmaxsd m7, m13, [cq+32* 1] ; 1 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m12, [pd_2048] vpbroadcastd m15, [pd_2896] call m(iadst_16x8_internal_10bpc).main_part1 pmaxsd m0, m13, [cq+32* 0] ; 0 pmaxsd m1, m13, [cq+32*15] ; 15 pmaxsd m2, m13, [cq+32* 4] ; 4 pmaxsd m3, m13, [cq+32*11] ; 11 pmaxsd m4, m13, [cq+32* 8] ; 8 pmaxsd m5, m13, 
[cq+32* 7] ; 7 pmaxsd m6, m13, [cq+32*12] ; 12 pmaxsd m7, m13, [cq+32* 3] ; 3 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_16x8_internal_10bpc).main_part2 vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret INV_TXFM_8X16_FN flipadst, dct, 0, 12 INV_TXFM_8X16_FN flipadst, adst, 0, 12 INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 INV_TXFM_8X16_FN flipadst, identity, 35, 12 cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call m(iadst_8x16_internal_12bpc).pass2_main call m(iflipadst_16x8_internal_10bpc).pass1_rotations jmp m(iadst_8x16_internal_12bpc).pass2_end INV_TXFM_8X16_FN identity, dct, 0, 12 INV_TXFM_8X16_FN identity, adst, 0, 12 INV_TXFM_8X16_FN identity, flipadst, 0, 12 INV_TXFM_8X16_FN identity, identity, 0, 12 cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_8x16_internal_10bpc).pass1 .pass2: call .pass2_main packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m13, m7, m15 vpbroadcastd m7, [pixel_12bpc_max] vpbroadcastd m12, [pw_16384] call m(iidentity_8x16_internal_10bpc).pass2_end RET ALIGN function_align .pass2_main: mova [cq], m7 vpbroadcastd m7, [clip_18b_min] REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmaxsd m7, [cq] mova [cq], m15 vpbroadcastd m15, [clip_18b_max] REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pminsd m15, [cq] mova [cq], m7 vpbroadcastd m7, [pd_5793] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmulld m7, [cq] mova [cq], m15 vpbroadcastd m15, [pd_1024] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq] 
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd m3, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 .dconly2: add r6d, 384 sar r6d, 9 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm3 vpbroadcastw m0, xm0 .dconly_loop: paddsw m1, m0, [dstq+strideq*0] paddsw m2, m0, [dstq+strideq*1] psubusw m1, m3 psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %else jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly %endif %endif %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, identity INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .pass1: vbroadcasti128 m0, [cq+16* 0] vbroadcasti128 m4, [cq+16* 4] vbroadcasti128 m1, [cq+16* 2] vbroadcasti128 m7, [cq+16* 6] vbroadcasti128 m5, [cq+16*10] vbroadcasti128 m2, [cq+16* 8] vbroadcasti128 m6, [cq+16*12] vbroadcasti128 m3, [cq+16*14] shufpd m0, m4, 0x0c ; 0 4 shufpd m1, m5, 0x0c ; 2 10 shufpd m2, m6, 0x0c ; 8 12 shufpd m3, m7, 0x0c ; 14 6 call .pass1_main vbroadcasti128 m10, [cq+16* 1] vbroadcasti128 m4, [cq+16* 5] vbroadcasti128 m11, [cq+16*15] vbroadcasti128 m5, [cq+16*11] shufpd m10, m4, 0x0c ; 1 5 shufpd m11, m5, 0x0c ; 15 11 vbroadcasti128 m5, [cq+16* 9] vbroadcasti128 m4, [cq+16*13] shufpd m5, m4, 0x0c ; 9 13 vbroadcasti128 m6, [cq+16* 7] vbroadcasti128 m4, [cq+16* 3] shufpd m6, m4, 0x0c ; 7 3 call .pass1_main2 pcmpeqd m4, m4 REPX {psubd x, m4}, m0, m1, m2, m3 call .pass1_main3 REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call .transpose_4x16_packed lea r6, [deint_shuf+128] call m(idct_16x4_internal_8bpc).main .end: vpbroadcastd m4, [pw_2048] REPX {pmulhrsw 
x, m4}, m0, m1, m2, m3 vpbroadcastd m5, [pixel_10bpc_max] .end2: paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] .end3: lea r6, [dstq+strideq*2] paddw m2, [r6 +strideq*0] paddw m3, [r6 +strideq*1] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmaxsw x, m4}, m0, m1, m2, m3 REPX {pminsw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [r6 +strideq*0], m2 mova [r6 +strideq*1], m3 RET ALIGN function_align .pass1_main: vpbroadcastd m7, [pd_2048] call m(idct_8x4_internal_10bpc).main psubd m3, m0, m4 ; idct8 out7 out6 paddd m0, m4 ; idct8 out0 out1 paddd m1, m2, m5 ; idct8 out3 out2 psubd m2, m5 ; idct8 out4 out5 ret ALIGN function_align .pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 vbroadcasti128 m12, [pd_3784_m3784] psubd m4, m10, m5 paddd m10, m5 ; t8 t11 psignd m4, m12 ; t9 t10 psubd m5, m11, m6 paddd m11, m6 ; t15 t12 psignd m5, m12 ; t14 t13 vpbroadcastd m6, [pd_1567] vpbroadcastd m13, [pd_3784] REPX {pmaxsd x, m8}, m5, m4 REPX {pminsd x, m9}, m5, m4 pmulld m12, m5 pmulld m5, m6 vbroadcasti128 m6, [pd_1567_m1567] pmulld m13, m4 pmulld m4, m6 REPX {pmaxsd x, m8}, m10, m11, m0, m1 REPX {pminsd x, m9}, m10, m11, m0, m1 paddd m12, m7 paddd m5, m7 paddd m4, m12 psubd m5, m13 psrad m4, 12 ; t14a t10a psrad m5, 12 ; t9a t13a vpbroadcastd m12, [pd_2896] punpckhqdq m6, m11, m5 punpcklqdq m11, m4 punpckhqdq m4, m10, m4 punpcklqdq m10, m5 psubd m5, m11, m6 ; t12a t13 paddd m11, m6 ; t15a t14 psubd m6, m10, m4 ; t11a t10 paddd m10, m4 ; t8a t9 REPX {pmaxsd x, m8}, m5, m6 REPX {pminsd x, m9}, m5, m6 pmulld m5, m12 pmulld m6, m12 REPX {pmaxsd x, m8}, m2, m3, m11, m10 REPX {pminsd x, m9}, m2, m3, m11, m10 ret ALIGN function_align .pass1_main3: paddd m5, m7 psubd m4, m5, m6 paddd m5, m6 psrad m4, 12 ; t11 t10a psrad m5, 12 ; t12 t13a psubd m7, m0, m11 ; out15 out14 paddd m0, m11 ; out0 out1 psubd m6, m1, m5 ; out12 out13 paddd m1, m5 ; out3 
out2 psubd m5, m2, m4 ; out11 out10 paddd m2, m4 ; out4 out5 psubd m4, m3, m10 ; out8 out9 paddd m3, m10 ; out7 out6 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 ret ALIGN function_align .transpose_4x16_packed: vbroadcasti128 m8, [deint_shuf] packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 REPX {pshufb x, m8}, m0, m2, m4, m6 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpckhqdq m2, m4, m6 punpcklqdq m4, m6 vperm2i128 m3, m1, m2, 0x31 vinserti128 m1, xm2, 1 vperm2i128 m2, m0, m4, 0x31 vinserti128 m0, xm4, 1 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 paddd m4, m5, m11 paddd m5, m6, m11 paddd m6, m7, m11 paddd m7, m8, m11 .pass1_end: REPX {pshufd x, x, q1032}, m0, m2, m4, m6 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: vpbroadcastd m6, [pd_1321] mova m0, [cq+32*0] mova m1, [cq+32*1] vpbroadcastd m7, [pd_2482] mova m2, [cq+32*6] mova m3, [cq+32*7] pmulld m4, m0, m6 pmulld m5, m1, m6 ; 1321*in0 pmulld m9, m2, m7 pmulld m8, m3, m7 ; 2482*in3 paddd m4, m9 paddd m8, m5 ; 1321*in0 + 2482*in3 pmulld m5, m0, m7 pmulld m9, m1, m7 ; 2482*in0 paddd m0, m2 paddd m1, m3 ; in0 + in3 paddd m7, m6 ; pd_3803 pmulld m2, m7 pmulld m3, m7 ; 3803*in3 psubd m5, m2 psubd m9, m3 ; 2482*in0 - 3803*in3 mova m2, [cq+32*4] pmulld m10, m7, m2 pmulld m3, m6, m2 psubd m2, m0 mova m0, [cq+32*5] pmulld m7, m0 ; 3803*in2 pmulld m6, m0 ; 1321*in2 psubd m0, m1 ; in2 - in0 - in3 vpbroadcastd m1, [pd_m3344] paddd m4, m10 paddd m7, m8 ; t0 psubd m5, m3 psubd m9, m6 ; t1 pmulld m2, 
m1 pmulld m0, m1 ; t2 pmulld m3, m1, [cq+32*2] pmulld m1, [cq+32*3] ; -t3 ret ALIGN function_align .main_end: ; expects: m6 = rnd paddd m5, m6 paddd m9, m6 paddd m10, m4, m5 paddd m4, m6 paddd m8, m7, m6 paddd m7, m9 psubd m4, m3 ; out0 (unshifted) psubd m5, m3 ; out1 (unshifted) paddd m2, m6 ; out2 (unshifted) paddd m3, m10 ; out3 (unshifted) psubd m8, m1 ; out4 (unshifted) psubd m9, m1 ; out5 (unshifted) paddd m6, m0 ; out6 (unshifted) paddd m7, m1 ; out7 (unshifted) ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 paddd m4, m3, m11 paddd m3, m5, m11 paddd m5, m2, m11 paddd m2, m6, m11 paddd m6, m1, m11 paddd m1, m7, m11 paddd m7, m0, m11 paddd m0, m8, m11 jmp m(iadst_16x4_internal_10bpc).pass1_end .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [pw_2048] pmulhrsw m5, m3, m4 pmulhrsw m6, m2, m4 pmulhrsw m2, m1, m4 pmulhrsw m3, m0, m4 paddw m0, m5, [dstq+strideq*0] paddw m1, m6, [dstq+strideq*1] vpbroadcastd m5, [pixel_10bpc_max] jmp m(idct_16x4_internal_10bpc).end3 INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_5793] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m4, [cq+32*4], q3120 ; 8 9 vpermq m5, [cq+32*5], q3120 ; a b vpermq m6, [cq+32*6], q3120 ; c d vpermq m7, [cq+32*7], q3120 ; e f vpbroadcastd m9, [pd_3072] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, 
m4, m5, m6, m7 REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed vpbroadcastd m7, [pw_1697x8] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(idct_16x4_internal_10bpc).end INV_TXFM_16X4_FN dct, dct, 12 INV_TXFM_16X4_FN dct, identity, 12 INV_TXFM_16X4_FN dct, adst, 12 INV_TXFM_16X4_FN dct, flipadst, 12 cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] jmp m(idct_16x4_internal_10bpc).pass1 .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 ; deinterleave REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 ; transpose punpcklqdq m8, m0, m1 punpckhqdq m0, m1 punpcklqdq m9, m2, m3 punpckhqdq m2, m3 punpcklqdq m10, m4, m5 punpckhqdq m4, m5 punpcklqdq m11, m6, m7 punpckhqdq m6, m7 vperm2i128 m3, m0, m2, 0x31 ; out6 vperm2i128 m1, m0, m2, 0x20 ; out2 vperm2i128 m7, m4, m6, 0x31 ; out7 vperm2i128 m5, m4, m6, 0x20 ; out3 vperm2i128 m13, m10, m11, 0x31 ; out5 vperm2i128 m12, m10, m11, 0x20 ; out1 vperm2i128 m11, m8, m9, 0x31 ; out4 vperm2i128 m10, m8, m9, 0x20 ; out0 call m(idct_4x16_internal_10bpc).pass1_main pmulld m0, m6, m10 pmulld m2, m6, m11 pmulld m4, m6, m12 pmulld m6, m13 vpbroadcastd m10, [pd_17408] call m(idct_4x16_internal_10bpc).pass1_main2 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m5, [pixel_12bpc_max] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN adst, dct, 12 INV_TXFM_16X4_FN adst, adst, 12 INV_TXFM_16X4_FN adst, flipadst, 12 INV_TXFM_16X4_FN adst, identity, 12 cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, 
[clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_16x4_internal_10bpc).pass1 .pass2: call .pass2_main REPX {vpermq x, x, q3120}, m0, m1, m2, m3 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 ALIGN function_align .pass2_main: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 pmaxsd m8, m4, m12 pmaxsd m9, m5, m12 REPX {pminsd x, m13}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).transpose_4x8 mova [cq+32*0], m0 mova [cq+32*2], m1 mova [cq+32*4], m2 mova [cq+32*6], m3 pminsd m0, m8, m13 pminsd m1, m9, m13 pminsd m2, m6, m13 pminsd m3, m7, m13 call m(iadst_8x4_internal_12bpc).transpose_4x8 mova [cq+32*1], m0 mova [cq+32*3], m1 mova [cq+32*5], m2 mova [cq+32*7], m3 call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_2048] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m4, 15 psrad m1, m5, 15 psrad m2, 15 psrad m3, 15 psrad m4, m8, 15 psrad m5, m9, 15 psrad m6, 15 psrad m7, 15 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m4, [pw_16384] vpbroadcastd m5, [pixel_12bpc_max] ret INV_TXFM_16X4_FN flipadst, dct, 12 INV_TXFM_16X4_FN flipadst, adst, 12 INV_TXFM_16X4_FN flipadst, flipadst, 12 INV_TXFM_16X4_FN flipadst, identity, 12 cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_16x4_internal_10bpc).pass1 .pass2: call m(iadst_16x4_internal_12bpc).pass2_main vpermq m7, m0, q3120 vpermq m6, m1, q3120 vpermq m1, m2, q3120 vpermq m0, m3, q3120 pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m6, m4 pmulhrsw m3, m7, m4 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN identity, dct, 12 INV_TXFM_16X4_FN identity, adst, 12 INV_TXFM_16X4_FN identity, flipadst, 12 INV_TXFM_16X4_FN identity, identity, 12 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_1697] vpermq m0, [cq+32*0], q3120 ; 0 1 
; ---------------------------------------------------------------------------
; Continuation of iidentity_16x4_internal_12bpc pass 1 (its cglobal header,
; the pd_1697 broadcast into m8 and the load of m0 are on the preceding
; physical line). Each input x becomes x + ((x*pd_1697 + pd_3072) >> 12).
; ---------------------------------------------------------------------------
    vpermq m1, [cq+32*1], q3120 ; 2 3
    vpermq m2, [cq+32*2], q3120 ; 4 5
    vpermq m3, [cq+32*3], q3120 ; 6 7
    vpbroadcastd m9, [pd_3072]
    pmulld m4, m8, m0
    pmulld m5, m8, m1
    pmulld m6, m8, m2
    pmulld m7, m8, m3
    vpermq m10, [cq+32*4], q3120 ; 8 9
    vpermq m11, [cq+32*5], q3120 ; a b
    vpermq m12, [cq+32*6], q3120 ; c d
    vpermq m13, [cq+32*7], q3120 ; e f
    REPX {paddd x, m9}, m4, m5, m6, m7
    REPX {psrad x, 12}, m4, m5, m6, m7
    paddd m0, m4
    pmulld m4, m8, m10
    paddd m1, m5
    pmulld m5, m8, m11
    paddd m2, m6
    pmulld m6, m8, m12
    paddd m3, m7
    pmulld m7, m8, m13
    REPX {paddd x, m9}, m4, m5, m6, m7
    REPX {psrad x, 12}, m4, m5, m6, m7
    paddd m4, m10
    paddd m5, m11
    paddd m6, m12
    paddd m7, m13
    jmp tx2q
; Pass 2: clamp m0-m7 to [clip_18b_min, clip_18b_max], scale by pd_5793 with
; pd_2048 rounding and a 15-bit arithmetic shift, transpose/pack, scale the
; packed words by pw_16384 and finish in the shared 10bpc .end2 tail with
; m5 = pixel_12bpc_max.
.pass2:
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    vpbroadcastd m8, [pd_5793]
    vpbroadcastd m9, [pd_2048]
    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
    vpbroadcastd m4, [pw_16384]
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    vpbroadcastd m5, [pixel_12bpc_max]
    jmp m(idct_16x4_internal_10bpc).end2

; ---------------------------------------------------------------------------
; INV_TXFM_16X8_FN type1, type2[, bitdepth = 10]
; Forwards to INV_TXFM_FN for a 16x8 block. For the dct_dct pair it also
; emits a DC-only path: r6d = (([cq]*181 + 128) >> 8) * 181, [cq] is
; overwritten with eobd, r3d gets bit 3 set (or r3d, 8), m3 is loaded from
; dconly_<bitdepth>bpc and control jumps to the 16x4 .dconly2 tail.
; NOTE(review): the existing "; 0" comment suggests eobd is zero on this
; path - that is established by INV_TXFM_FN, which is not visible here.
; ---------------------------------------------------------------------------
%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
    INV_TXFM_FN %1, %2, 0, 16x8, %3
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_%3bpc]
    mov [cq], eobd ; 0
    or r3d, 8
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; ---------------------------------------------------------------------------
; idct_16x8_internal_10bpc
; Entry loads the 18-bit clip bounds into m12/m13; the 12bpc variant jumps
; to .pass1 with 20-bit bounds instead.
; ---------------------------------------------------------------------------
cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
; Pass 1: odd 32-byte coefficient rows are pre-multiplied by pd_2896 and fed
; to the 8x16 odd-half helper (r6 = rsp+32*4 is passed as scratch pointer),
; then the even rows go through the 8x8 rect2 / 8x16 even-half helpers.
.pass1:
    vpbroadcastd m14, [pd_2896]
    pmulld m0, m14, [cq+32* 1]
    pmulld m1, m14, [cq+32* 3]
    pmulld m2, m14, [cq+32* 5]
    pmulld m3, m14, [cq+32* 7]
    pmulld m4, m14, [cq+32* 9]
    pmulld m5, m14, [cq+32*11]
    pmulld m6, m14, [cq+32*13]
    pmulld m7, m14, [cq+32*15]
    vpbroadcastd m11, [pd_2048]
    lea r6, [rsp+32*4]
    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    pmulld m0, m14, [cq+32* 0]
    pmulld m1, m14, [cq+32* 2]
    pmulld m2, m14, [cq+32* 4]
    pmulld m3, m14, [cq+32* 6]
    pmulld m4, m14, [cq+32* 8]
    pmulld m5, m14, [cq+32*10]
    pmulld m6, m14, [cq+32*12]
    pmulld m7, m14, [cq+32*14]
    call m(idct_8x8_internal_10bpc).main_rect2
    call m(idct_8x16_internal_10bpc).main_evenhalf
    psrld m11, 11 ; pd_1
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    call .pass1_rotations
    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
                       m8, m9, m10, m11, m12, m13, m14, m15
    jmp tx2q
; Pass 2: pack/transpose, run the 8bpc 16x8 kernel, then scale by pw_2048
; and write two groups of four rows.
.pass2:
    call .transpose
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd m10, [pw_2048]
; .end expects the scale factor in m10 (iidentity jumps here with pw_4096).
.end:
    pmulhrsw m0, m10
    pmulhrsw m1, m10
    pmulhrsw m2, m10
    pmulhrsw m3, m10
    call .write_16x4_start
; .end2 scales m4-m7 into m0-m3 and writes the second group of four rows.
.end2:
    pmulhrsw m0, m4, m10
    pmulhrsw m1, m5, m10
    pmulhrsw m2, m6, m10
    pmulhrsw m3, m7, m10
    call .write_16x4_zero
    RET
ALIGN function_align
; Combines m0-m7 with the eight vectors stored at [r6-32*4 .. r6+32*3] into
; sums (out0-out7, left in m0-m7) and differences (out8-out15, in m8-m15).
.pass1_rotations:
    mova m14, [r6-32*4]
    mova m13, [r6-32*3]
    mova m12, [r6-32*2]
    mova m11, [r6-32*1]
    mova m10, [r6+32*0]
    mova m9, [r6+32*1]
    mova m8, [r6+32*2]
    psubd m15, m0, m14 ; out15
    paddd m0, m14 ; out0
    psubd m14, m1, m13 ; out14
    paddd m1, m13 ; out1
    psubd m13, m2, m12 ; out13
    paddd m2, m12 ; out2
    psubd m12, m3, m11 ; out12
    paddd m3, m11 ; out3
    psubd m11, m4, m10 ; out11
    paddd m4, m10 ; out4
    psubd m10, m5, m9 ; out10
    paddd m5, m9 ; out5
    psubd m9, m6, m8 ; out9
    paddd m6, m8 ; out6
    psubd m8, m7, [r6+32*3] ; out8
    paddd m7, [r6+32*3] ; out7
    ret
ALIGN function_align
; Three entry points:
;   .transpose  - also points r6 at deint_shuf+128 before falling through
;   .transpose2 - packs dwords m0-m7 with m8-m15 into signed-saturated words
;   .transpose3 - word/dword/qword interleaves plus 128-bit lane swaps
.transpose:
    lea r6, [deint_shuf+128]
.transpose2:
    packssdw m0, m8
    packssdw m1, m9
    packssdw m2, m10
    packssdw m3, m11
    packssdw m4, m12
    packssdw m5, m13
    packssdw m6, m14
    packssdw m7, m15
.transpose3:
    punpckhwd m8, m0, m1
    punpcklwd m0, m1
    punpcklwd m1, m2, m3
    punpckhwd m2, m3
    punpckhwd m3, m4, m5
    punpcklwd m4, m5
    punpckhwd m5, m6, m7
    punpcklwd m6, m7
    punpckhdq m7, m4, m6
    punpckldq m4, m6
    punpckldq m6, m8, m2
    punpckhdq m8, m2
    punpckhdq m2, m0, m1
    punpckldq m0, m1
    punpckhdq m1, m3, m5
    punpckldq m3, m5
    punpcklqdq m5, m6, m3
    punpckhqdq m6, m3
    punpckhqdq m3, m2, m7
    punpcklqdq m2, m7
    punpcklqdq m7, m8, m1
    punpckhqdq m8, m1
    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    vperm2i128 m4, m0, m5, 0x31
    vinserti128 m0, xm5, 1
    vperm2i128 m5, m1, m6, 0x31
    vinserti128 m1, xm6, 1
    vperm2i128 m6, m2, m7, 0x31
    vinserti128 m2, xm7, 1
    vperm2i128 m7, m3, m8, 0x31
    vinserti128 m3, xm8, 1
    ret
ALIGN function_align
; Output helpers, three fall-through entry points:
;   .write_16x4_start - m9 = pixel_10bpc_max, r3 = stride*3, m8 = 0
;   .write_16x4_zero  - stores m8 to [cq+32*0 .. cq+32*7], then cq += 32*8
;                       (callers entering here rely on m8 still being zero)
;   .write_16x4       - adds four dst rows to m0-m3, clamps to [m8, m9],
;                       stores them back and advances dstq by four rows
.write_16x4_start:
    vpbroadcastd m9, [pixel_10bpc_max]
    lea r3, [strideq*3]
    pxor m8, m8
.write_16x4_zero:
    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
    add cq, 32*8
.write_16x4:
    paddw m0, [dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r3 ]
    REPX {pmaxsw x, m8}, m0, m1, m2, m3
    REPX {pminsw x, m9}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r3 ], m3
    lea dstq, [dstq+strideq*4]
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; ---------------------------------------------------------------------------
; iadst_16x8_internal_10bpc
; Clip bounds live in m13/m14 here (not m12/m13 as in the idct variant);
; .main documents that expectation.
; ---------------------------------------------------------------------------
cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
.pass1:
    lea r6, [rsp+32*4]
    call .main
    vpbroadcastd m14, [pd_3072]
    psrld m15, 11 ; pd_1
    psubd m13, m14, m15 ; pd_3071
    call .pass1_rotations
; Shared with iflipadst: outputs 0-3 and 12-15 only need >>1, outputs 4-11
; are still unshifted products and get >>12.
.pass1_end:
    REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
    REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
    jmp tx2q
; Pass 2: the alternating +pw_2048 (m10) / -pw_2048 (m11) factors flip the
; sign of every second row while scaling.
.pass2:
    call m(idct_16x8_internal_10bpc).transpose
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass2_end
    vpbroadcastd m10, [pw_2048]
    pxor m11, m11
    psubw m11, m10
    pmulhrsw m0, m10
    pmulhrsw m1, m11
    pmulhrsw m2, m10
    pmulhrsw m3, m11
    call m(idct_16x8_internal_10bpc).write_16x4_start
    pmulhrsw m0, m4, m10
    pmulhrsw m1, m5, m11
    pmulhrsw m2, m6, m10
    pmulhrsw m3, m7, m11
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    RET
ALIGN function_align
; Adds the rounding constants (m15 = pd_1, m14 = pd_3072, m13 = pd_3071) and
; negates the outputs that .main leaves sign-inverted (rnd - x). The last
; four results are read from the [r6-32*4 .. r6-32*1] scratch slots.
.pass1_rotations:
    paddd m0, m15
    psubd m1, m15, m1
    paddd m2, m15
    psubd m3, m15, m3
    paddd m4, m14
    psubd m5, m13, m5
    paddd m6, m14
    psubd m7, m13, m7
    paddd m8, m14, m9
    psubd m9, m13, m10
    paddd m10, m14, m11
    psubd m11, m13, m12
    paddd m12, m15, [r6-32*1]
    psubd m13, m15, [r6-32*2]
    paddd m14, m15, [r6-32*3]
    psubd m15, [r6-32*4]
    ret
ALIGN function_align
; .main: scales two groups of eight coefficient rows by pd_2896 with
; pd_2048 rounding and >>12, running .main_part1 on the first group and
; falling through into .main_part2 with the second.
.main:
    ; expects: m13 = clip_min m14 = clip_max
    vpbroadcastd m15, [pd_2896]
    pmulld m0, m15, [cq+32* 2]
    pmulld m1, m15, [cq+32*13]
    pmulld m2, m15, [cq+32* 6]
    pmulld m3, m15, [cq+32* 9]
    pmulld m4, m15, [cq+32*10]
    pmulld m5, m15, [cq+32* 5]
    pmulld m6, m15, [cq+32*14]
    pmulld m7, m15, [cq+32* 1]
    vpbroadcastd m12, [pd_2048]
    REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
    call .main_part1
    pmulld m0, m15, [cq+32* 0]
    pmulld m1, m15, [cq+32*15]
    pmulld m2, m15, [cq+32* 4]
    pmulld m3, m15, [cq+32*11]
    pmulld m4, m15, [cq+32* 8]
    pmulld m5, m15, [cq+32* 7]
    pmulld m6, m15, [cq+32*12]
    pmulld m7, m15, [cq+32* 3]
    REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
; .main_part2: consumes the eight intermediates that .main_part1 stored at
; [r6-32*4 .. r6+32*3] and reuses some of those slots for finished outputs.
; Also entered directly by the 16x16 adst .main.
.main_part2:
    ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
    ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
    ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
    ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
    psubd m8, m0, m4 ; t8a
    paddd m0, m4 ; t0a
    psubd m4, m1, m5 ; t9a
    paddd m1, m5 ; t1a
    psubd m5, m2, m6 ; t12a
    paddd m2, m6 ; t4a
    psubd m6, m3, m7 ; t13a
    paddd m7, m3 ; t5a
    REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
    REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
    vpbroadcastd m11, [pd_4017]
    vpbroadcastd m10, [pd_799]
    ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
    ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
    psubd m3, m0, m2 ; t4
    paddd m0, m2 ; t0
    psubd m2, m1, m7 ; t5
    paddd m1, m7 ; t1
    psubd m7, m4, m6 ; t12a
    paddd m4, m6 ; t8a
    psubd m6, m8, m5 ; t13a
    paddd m5, m8 ; t9a
    REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
    REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
    vpbroadcastd m11, [pd_3784]
    vpbroadcastd m10, [pd_1567]
    ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
    ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
    ; the pminsd-from-memory loads below apply the upper clip that
    ; .main_part1 deferred ("clip the rest later") for t2/t3/t10a/t11a
    pminsd m10, m14, [r6-32*4] ; t2
    pminsd m8, m14, [r6-32*3] ; t3
    psubd m9, m0, m10 ; t2a
    paddd m0, m10 ; out0
    psubd m10, m1, m8 ; t3a
    paddd m1, m8 ; -out15
    pmaxsd m9, m13
    pmaxsd m10, m13
    pminsd m9, m14
    pminsd m10, m14
    mova [r6-32*4], m1
    mova m11, [r6-32*1] ; t7a
    mova m1, [r6-32*2] ; t6a
    psubd m8, m3, m11 ; t7
    paddd m11, m3 ; out12
    paddd m3, m2, m1 ; -out3
    psubd m2, m1 ; t6
    pmaxsd m8, m13
    pmaxsd m2, m13
    pminsd m8, m14
    pminsd m2, m14
    mova [r6-32*1], m11
    mova [r6-32*3], m2
    mova m1, [r6+32*3] ; t15
    mova m2, [r6+32*2] ; t14
    paddd m12, m7, m1 ; -out13
    psubd m7, m1 ; t15a
    psubd m11, m6, m2 ; t14a
    paddd m2, m6 ; out2
    pmaxsd m7, m13
    pmaxsd m11, m13
    pminsd m7, m14
    pminsd m11, m14
    mova [r6-32*2], m12
    pminsd m1, m14, [r6+32*0] ; t10a
    pminsd m12, m14, [r6+32*1] ; t11a
    psubd m6, m4, m1 ; t10
    paddd m1, m4 ; -out1
    psubd m4, m5, m12 ; t11
    paddd m5, m12 ; out14
    vpbroadcastd m12, [pd_1448]
    pmaxsd m6, m13
    pmaxsd m4, m13
    pminsd m6, m14
    pminsd m4, m14
    REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
    pmulld m12, [r6-32*3] ; t6
    mova [r6-32*3], m5
    paddd m5, m11, m7 ; -out5 (unshifted)
    psubd m11, m7 ; out10 (unshifted)
    paddd m7, m9, m10 ; -out7 (unshifted)
    psubd m9, m10 ; out8 (unshifted)
    psubd m10, m6, m4 ; -out9 (unshifted)
    paddd m6, m4 ; out6 (unshifted)
    paddd m4, m12, m8 ; out4 (unshifted)
    psubd m12, m8 ; -out11 (unshifted)
    ret
; .main_part1: first half of the butterfly network. Results are parked in
; the eight scratch slots around r6 for .main_part2. Expects m12 = pd_2048
; (it is passed to ITX_MULSUB_2D) and m13/m14 = clip bounds.
.main_part1:
    ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
    ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
    ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
    ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
    psubd m8, m0, m4 ; t10a
    paddd m0, m4 ; t2a
    psubd m4, m1, m5 ; t11a
    paddd m1, m5 ; t3a
    psubd m5, m2, m6 ; t14a
    paddd m2, m6 ; t6a
    psubd m6, m3, m7 ; t15a
    paddd m7, m3 ; t7a
    REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
    REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
    vpbroadcastd m11, [pd_2276]
    vpbroadcastd m10, [pd_3406]
    ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
    ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
    psubd m3, m0, m2 ; t6
    paddd m0, m2 ; t2
    psubd m2, m1, m7 ; t7
    paddd m1, m7 ; t3
    psubd m7, m4, m6 ; t14a
    paddd m4, m6 ; t10a
    psubd m6, m8, m5 ; t15a
    paddd m5, m8 ; t11a
    REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
    REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
    vpbroadcastd m11, [pd_1567]
    vpbroadcastd m10, [pd_3784]
    ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
    ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6-32*2], m2
    mova [r6-32*1], m3
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; ---------------------------------------------------------------------------
; iflipadst_16x8_internal_10bpc
; Reuses the adst .main kernel; only the rotation step and the pass-2 store
; order differ (registers are consumed in reverse order).
; ---------------------------------------------------------------------------
cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
.pass1:
    lea r6, [rsp+32*4]
    call m(iadst_16x8_internal_10bpc).main
    vpbroadcastd m14, [pd_3072]
    psrld m15, 11
    psubd m13, m14, m15
    call .pass1_rotations
    jmp m(iadst_16x8_internal_10bpc).pass1_end
.pass2:
    call m(idct_16x8_internal_10bpc).transpose
    call m(iadst_16x8_internal_8bpc).main
    call m(iadst_16x8_internal_8bpc).main_pass2_end
    vpbroadcastd m10, [pw_2048]
    pxor m11, m11
    psubw m11, m10
    ; m0-m3 are saved in m12/m7/m6/m5 while m7..m4 are scaled into m0..m3
    mova m12, m0
    pmulhrsw m0, m7, m11
    mova m7, m1
    pmulhrsw m1, m6, m10
    mova m6, m2
    pmulhrsw m2, m5, m11
    mova m5, m3
    pmulhrsw m3, m4, m10
    call m(idct_16x8_internal_10bpc).write_16x4_start
    pmulhrsw m0, m5, m11
    pmulhrsw m1, m6, m10
    pmulhrsw m2, m7, m11
    pmulhrsw m3, m12, m10
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    RET
ALIGN function_align
; Same rounding/negation as the adst rotations, but results are written to
; the mirrored register (m0 <-> m15, m1 <-> m14, ...).
.pass1_rotations:
    psubd m8, m13, m7
    paddd m7, m14, m9
    paddd m9, m14, m6
    psubd m6, m13, m10
    psubd m10, m13, m5
    paddd m5, m14, m11
    paddd m11, m14, m4
    psubd m4, m13, m12
    psubd m12, m15, m3
    paddd m3, m15, [r6-32*1]
    paddd m13, m15, m2
    psubd m2, m15, [r6-32*2]
    psubd m14, m15, m1
    mova m1, m15
    paddd m15, m0
    psubd m0, m1, [r6-32*4]
    paddd m1, [r6-32*3]
    ret

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; ---------------------------------------------------------------------------
; iidentity_16x8_internal_10bpc
; Pass 1 applies two scale stages to all sixteen rows: *pd_2896 with pd_2048
; rounding >>12, then *pd_5793 with pd_3072 rounding >>12. All sixteen
; vector registers hold data, so [rsp] is used to spill one register while
; the constant for each stage is broadcast.
; ---------------------------------------------------------------------------
cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
    vpbroadcastd m15, [pd_2896]
    pmulld m0, m15, [cq+32* 0]
    pmulld m1, m15, [cq+32* 1]
    pmulld m2, m15, [cq+32* 2]
    pmulld m3, m15, [cq+32* 3]
    pmulld m4, m15, [cq+32* 4]
    pmulld m5, m15, [cq+32* 5]
    pmulld m6, m15, [cq+32* 6]
    pmulld m7, m15, [cq+32* 7]
    pmulld m8, m15, [cq+32* 8]
    pmulld m9, m15, [cq+32* 9]
    pmulld m10, m15, [cq+32*10]
    pmulld m11, m15, [cq+32*11]
    pmulld m12, m15, [cq+32*12]
    pmulld m13, m15, [cq+32*13]
    pmulld m14, m15, [cq+32*14]
    pmulld m15, [cq+32*15]
    mova [rsp], m7
    vpbroadcastd m7, [pd_2048]
    REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
                         m8, m9, m10, m11, m12, m13, m14, m15
    paddd m7, [rsp]
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
                         m8, m9, m10, m11, m12, m13, m14, m15
    mova [rsp], m15
    vpbroadcastd m15, [pd_5793]
    REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
                          m8, m9, m10, m11, m12, m13, m14
    pmulld m15, [rsp]
    mova [rsp], m7
    vpbroadcastd m7, [pd_3072]
    REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
                         m8, m9, m10, m11, m12, m13, m14, m15
    paddd m7, [rsp]
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
                         m8, m9, m10, m11, m12, m13, m14, m15
    jmp tx2q
.pass2:
    call m(idct_16x8_internal_10bpc).transpose
    vpbroadcastd m10, [pw_4096]
    jmp m(idct_16x8_internal_10bpc).end

INV_TXFM_16X8_FN dct, dct, 12
INV_TXFM_16X8_FN dct, identity, 12
INV_TXFM_16X8_FN dct, adst, 12
INV_TXFM_16X8_FN dct, flipadst, 12

; ---------------------------------------------------------------------------
; idct_16x8_internal_12bpc
; Pass 1 is the 10bpc code entered with 20-bit clip bounds. Pass 2 stays in
; 32-bit precision: two 8x8 passes, with the first half's results parked in
; [cq+32*8 .. cq+32*15] until .end packs both halves together.
; ---------------------------------------------------------------------------
cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_20b_min]
    vpbroadcastd m13, [clip_20b_max]
    jmp m(idct_16x8_internal_10bpc).pass1
.pass2:
    call .pass2_main
    RET
ALIGN function_align
.pass2_main:
    call m(idct_8x16_internal_12bpc).transpose
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m11, [pd_2048]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x8_internal_12bpc).round_shift4
    mova [cq+32* 8], m0
    mova [cq+32* 9], m1
    mova [cq+32*10], m2
    mova [cq+32*11], m3
    mova [cq+32*12], m4
    mova [cq+32*13], m5
    mova [cq+32*14], m6
    mova [cq+32*15], m7
    ; NOTE(review): the second half is read back from [cq+32*0 .. cq+32*7],
    ; presumably stored there by the transpose helper above - confirm there.
    pmaxsd m0, m12, [cq+32*0]
    pmaxsd m1, m12, [cq+32*1]
    pmaxsd m2, m12, [cq+32*2]
    pmaxsd m3, m12, [cq+32*3]
    pmaxsd m4, m12, [cq+32*4]
    pmaxsd m5, m12, [cq+32*5]
    pmaxsd m6, m12, [cq+32*6]
    pmaxsd m7, m12, [cq+32*7]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x8_internal_12bpc).round_shift4
; .end: packs the register half with the half parked in cq, fixes the lane
; order with vpermq and writes two groups of four rows. Also called by the
; 12bpc adst pass 2.
.end:
    packssdw m0, [cq+32* 8]
    packssdw m1, [cq+32* 9]
    packssdw m2, [cq+32*10]
    packssdw m3, [cq+32*11]
    packssdw m4, [cq+32*12]
    packssdw m5, [cq+32*13]
    packssdw m6, [cq+32*14]
    packssdw m7, [cq+32*15]
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
    call .write_16x4_start
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    vpermq m0, m4, q3120
    vpermq m1, m5, q3120
    vpermq m2, m6, q3120
    vpermq m3, m7, q3120
    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
; 12bpc counterpart of the 10bpc .write_16x4_start: it only sets up
; m9 = pixel_12bpc_max, r3 = stride*3 and m8 = 0, then returns (no
; fall-through), so callers invoke the 10bpc .write_16x4_zero afterwards.
.write_16x4_start:
    vpbroadcastd m9, [pixel_12bpc_max]
    lea r3, [strideq*3]
    pxor m8, m8
    ret

INV_TXFM_16X8_FN adst, dct, 12
INV_TXFM_16X8_FN adst, adst, 12
INV_TXFM_16X8_FN adst, flipadst, 12
INV_TXFM_16X8_FN adst, identity, 12

; ---------------------------------------------------------------------------
; iadst_16x8_internal_12bpc
; Pass 1 is the 10bpc code entered with 20-bit clip bounds in m13/m14.
; .pass2_main mirrors the idct 12bpc structure with the 8x8 adst helper;
; it continues on the following physical line.
; ---------------------------------------------------------------------------
cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_20b_min]
    vpbroadcastd m14, [clip_20b_max]
    jmp m(iadst_16x8_internal_10bpc).pass1
.pass2:
    call .pass2_main
    call m(idct_16x8_internal_12bpc).end
    RET
ALIGN function_align
.pass2_main:
    call m(idct_8x16_internal_12bpc).transpose
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m11, [pd_2048]
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(iadst_8x8_internal_12bpc).pass2_main2
    mova [cq+32* 8], m0
    mova [cq+32* 9], m1
    mova [cq+32*10], m2
    mova [cq+32*11], m3
    mova [cq+32*12], m4
    mova [cq+32*13], m5
    mova [cq+32*14], m6
    mova [cq+32*15], m7
; ---------------------------------------------------------------------------
; Continuation of iadst_16x8_internal_12bpc.pass2_main (label and first half
; are on the preceding physical line): the second eight rows are reloaded
; from cq, clamped to [m12, m13] and run through the 8x8 adst helper.
; ---------------------------------------------------------------------------
    pmaxsd m0, m12, [cq+32*0]
    pmaxsd m1, m12, [cq+32*1]
    pmaxsd m2, m12, [cq+32*2]
    pmaxsd m3, m12, [cq+32*3]
    pmaxsd m4, m12, [cq+32*4]
    pmaxsd m5, m12, [cq+32*5]
    pmaxsd m6, m12, [cq+32*6]
    pmaxsd m7, m12, [cq+32*7]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(iadst_8x8_internal_12bpc).pass2_main2
    ret

INV_TXFM_16X8_FN flipadst, dct, 12
INV_TXFM_16X8_FN flipadst, adst, 12
INV_TXFM_16X8_FN flipadst, flipadst, 12
INV_TXFM_16X8_FN flipadst, identity, 12

; ---------------------------------------------------------------------------
; iflipadst_16x8_internal_12bpc
; Pass 1 is the 10bpc flipadst code with 20-bit clip bounds. Pass 2 reuses
; the adst .pass2_main and packs the results into reversed register order
; (m0..m7 -> m13, m12, m11, m10, m3, m2, m1, m0) before writing.
; ---------------------------------------------------------------------------
cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_20b_min]
    vpbroadcastd m14, [clip_20b_max]
    jmp m(iflipadst_16x8_internal_10bpc).pass1
.pass2:
    call m(iadst_16x8_internal_12bpc).pass2_main
    packssdw m13, m0, [cq+32* 8]
    packssdw m12, m1, [cq+32* 9]
    packssdw m11, m2, [cq+32*10]
    packssdw m10, m3, [cq+32*11]
    packssdw m3, m4, [cq+32*12]
    packssdw m2, m5, [cq+32*13]
    packssdw m1, m6, [cq+32*14]
    packssdw m0, m7, [cq+32*15]
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
    call m(idct_16x8_internal_12bpc).write_16x4_start
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    vpermq m0, m10, q3120
    vpermq m1, m11, q3120
    vpermq m2, m12, q3120
    vpermq m3, m13, q3120
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    RET

INV_TXFM_16X8_FN identity, dct, 12
INV_TXFM_16X8_FN identity, adst, 12
INV_TXFM_16X8_FN identity, flipadst, 12
INV_TXFM_16X8_FN identity, identity, 12

; ---------------------------------------------------------------------------
; iidentity_16x8_internal_12bpc
; Pass 1 is shared with the 10bpc variant. Pass 2 enters the transpose at
; .transpose2 (skipping the r6 setup done by .transpose), scales by pw_4096,
; writes the first four rows with the 12bpc pixel maximum and jumps to the
; 10bpc .end2 for the remaining four.
; ---------------------------------------------------------------------------
cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    jmp m(iidentity_16x8_internal_10bpc).pass1
.pass2:
    call m(idct_16x8_internal_10bpc).transpose2
    vpbroadcastd m10, [pw_4096]
    pmulhrsw m0, m10
    pmulhrsw m1, m10
    pmulhrsw m2, m10
    pmulhrsw m3, m10
    call m(idct_16x8_internal_12bpc).write_16x4_start
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    jmp m(idct_16x8_internal_10bpc).end2

; ---------------------------------------------------------------------------
; INV_TXFM_16X16_FN type1, type2[, eob_offset = 0[, bitdepth = 10]]
; Forwards to INV_TXFM_FN for a 16x16 block. For the dct_dct pair it also
; emits a DC-only path: r6d = ([cq]*181 + 640) >> 10, [cq] is overwritten
; with eobd, r3d gets bit 4 set (or r3d, 16), m3 is loaded from
; dconly_<bitdepth>bpc and control jumps to the 16x4 .dconly3 tail.
; NOTE(review): the existing "; 0" comment suggests eobd is zero on this
; path - that is established by INV_TXFM_FN, which is not visible here.
; ---------------------------------------------------------------------------
%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
    INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_%4bpc]
    mov [cq], eobd ; 0
    or r3d, 16
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, 28
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst

; ---------------------------------------------------------------------------
; idct_16x16_internal_10bpc
; Pass 1 handles the block as two halves. "sub eobd, 36 / jl .fast" skips
; the first .main call (the one made with cq advanced by 32 bytes) when eob
; is below 36; the now-negative eobd is what .transpose tests later to pick
; its fast path. When both halves are processed, the first half's sixteen
; results are shifted right by 2 and parked in the stack scratch area
; addressed through r6.
; ---------------------------------------------------------------------------
cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
.pass1:
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*4]
    sub eobd, 36
    jl .fast
    add cq, 32
    call .main
    sub cq, 32
    ; out0..out7 overwrite the slots the odd-half values are read from;
    ; out8..out15 go to the next eight slots after "add r6, 32*8"
    mova m10, [r6-32*4]
    mova m9, [r6-32*3]
    mova m8, [r6-32*2]
    psubd m15, m0, m10 ; out15
    paddd m0, m10 ; out0
    psubd m10, m1, m9 ; out14
    paddd m1, m9 ; out1
    psubd m9, m2, m8 ; out13
    paddd m2, m8 ; out2
    REPX {psrad x, 2}, m0, m1, m2
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6-32*2], m2
    mova m2, [r6-32*1]
    mova m1, [r6+32*0]
    mova m0, [r6+32*1]
    REPX {psrad x, 2}, m9, m10, m15
    psubd m8, m3, m2 ; out12
    paddd m3, m2 ; out3
    psubd m2, m4, m1 ; out11
    paddd m4, m1 ; out4
    psubd m1, m5, m0 ; out10
    paddd m5, m0 ; out5
    REPX {psrad x, 2}, m3, m4, m5
    mova [r6-32*1], m3
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova m4, [r6+32*2]
    mova m3, [r6+32*3]
    REPX {psrad x, 2}, m1, m2, m8
    psubd m5, m6, m4 ; out9
    paddd m6, m4 ; out6
    psubd m4, m7, m3 ; out8
    paddd m7, m3 ; out7
    REPX {psrad x, 2}, m6, m7, m4, m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    add r6, 32*8
    mova [r6-32*4], m4
    mova [r6-32*3], m5
    mova [r6-32*2], m1
    mova [r6-32*1], m2
    mova [r6+32*0], m8
    mova [r6+32*1], m9
    mova [r6+32*2], m10
    mova [r6+32*3], m15
; .fast: second (or only) half; its results stay in m0-m15. The remaining
; out7, the r6 rewind and the final shift follow on the next physical line.
.fast:
    add r6, 32*8
    call .main
    mova m14, [r6-32*4]
    mova m13, [r6-32*3]
    mova m12, [r6-32*2]
    mova m11, [r6-32*1]
    mova m10, [r6+32*0]
    mova m9, [r6+32*1]
    mova m8, [r6+32*2]
    psubd m15, m0, m14 ; out15
    paddd m0, m14 ; out0
    psubd m14, m1, m13 ; out14
    paddd m1, m13 ; out1
    psubd m13, m2, m12 ; out13
    paddd m2, m12 ; out2
    psubd m12, m3, m11 ; out12
    paddd m3, m11 ; out3
    psubd m11, m4, m10 ; out11
    paddd m4, m10 ; out4
    psubd m10, m5, m9 ; out10
    paddd m5, m9 ; out5
    psubd m9, m6, m8 ; out9
    paddd m6, m8 ; out6
    psubd m8, m7, [r6+32*3] ; out8
; ---------------------------------------------------------------------------
; Continuation of idct_16x16_internal_10bpc .fast (pass 1): last sum, rewind
; r6 by the 32*8 added at .fast, shift all sixteen outputs right by 2.
; ---------------------------------------------------------------------------
    paddd m7, [r6+32*3] ; out7
    sub r6, 32*8
    REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
                       m8, m9, m10, m11, m12, m13, m14, m15
    jmp tx2q
; Pass 2: transpose to packed words, spill m15 to [rsp], run the 8bpc 16x16
; kernel with r6 = pw_5+128, reload m1 from [rsp+32*1] and write the block.
; NOTE(review): the [rsp]/[rsp+32*1] spill convention is defined by the
; 8bpc .main, which is not visible here.
.pass2:
    call .transpose
    lea r6, [pw_5+128]
    mova [rsp], m15
    call m(idct_16x16_internal_8bpc).main
    mova m1, [rsp+32*1]
.end:
    call .write_16x16
    RET
ALIGN function_align
; .write_16x16: scales all sixteen rows by pw_2048 and writes them in four
; groups of four. m8, m9 and m12 are spilled first because the write helper
; uses m8/m9 (zero and pixel max) and m12 holds the scale factor. The spill
; slots are addressed as rsp+gprsize+... because this code runs inside a
; call (one return address on the stack).
.write_16x16:
    mova [rsp+gprsize+32*0], m8
    mova [rsp+gprsize+32*1], m9
    mova [rsp+gprsize+32*2], m12
    vpbroadcastd m12, [pw_2048]
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    pmulhrsw m2, m12
    pmulhrsw m3, m12
    call m(idct_16x8_internal_10bpc).write_16x4_start
; .write_16x16_2: rows 4-15; expects m12 = scale and m8/m9 already set up
; by a write_16x4_start call.
.write_16x16_2:
    pmulhrsw m0, m12, m4
    pmulhrsw m1, m12, m5
    pmulhrsw m2, m12, m6
    pmulhrsw m3, m12, m7
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    pmulhrsw m0, m12, [rsp+gprsize+32*0]
    pmulhrsw m1, m12, [rsp+gprsize+32*1]
    pmulhrsw m2, m12, m10
    pmulhrsw m3, m12, m11
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    pmulhrsw m0, m12, [rsp+gprsize+32*2]
    pmulhrsw m1, m12, m13
    pmulhrsw m2, m12, m14
    pmulhrsw m3, m12, m15
    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
; .transpose: 16x16 transpose of the pass-1 output into packed words.
; A negative eobd (left by "sub eobd, 36" in pass 1) selects
; .transpose_fast. Otherwise the register half (m0-m15) is packed with the
; half parked at [r6-32*4 .. r6+32*3] and the eight slots before it;
; [r6] doubles as a one-register spill slot during the shuffle.
.transpose:
    test eobd, eobd
    jl .transpose_fast
    packssdw m8, [r6-32*4]
    packssdw m9, [r6-32*3]
    packssdw m10, [r6-32*2]
    packssdw m11, [r6-32*1]
    packssdw m12, [r6+32*0]
    packssdw m13, [r6+32*1]
    packssdw m14, [r6+32*2]
    packssdw m15, [r6+32*3]
    sub r6, 32*8
    packssdw m0, [r6-32*4]
    packssdw m1, [r6-32*3]
    packssdw m2, [r6-32*2]
    packssdw m3, [r6-32*1]
    packssdw m4, [r6+32*0]
    packssdw m5, [r6+32*1]
    packssdw m6, [r6+32*2]
    packssdw m7, [r6+32*3]
    mova [r6], m8
    punpckhwd m8, m0, m1
    punpcklwd m0, m1
    punpcklwd m1, m2, m3
    punpckhwd m2, m3
    punpckhwd m3, m6, m7
    punpcklwd m6, m7
    punpcklwd m7, m4, m5
    punpckhwd m4, m5
    punpckldq m5, m8, m2
    punpckhdq m8, m2
    punpckhdq m2, m0, m1
    punpckldq m0, m1
    punpckhdq m1, m7, m6
    punpckldq m7, m6
    punpckhdq m6, m4, m3
    punpckldq m4, m3
    punpckhqdq m3, m2, m1
    punpcklqdq m2, m1
    punpckhqdq m1, m0, m7
    punpcklqdq m0, m7
    punpcklqdq m7, m8, m6
    punpckhqdq m8, m6
    punpckhqdq m6, m5, m4
    punpcklqdq m5, m4
    ; swap the spilled m8 back in and spill the freshly computed one
    mova m4, [r6]
    mova [r6], m8
    punpcklwd m8, m4, m9
; ---------------------------------------------------------------------------
; Continuation of idct_16x16_internal_10bpc.transpose: word/dword/qword
; interleaves for rows 8-15, followed by the 128-bit lane exchange that
; combines both halves. The vector spilled at [r6] is reloaded in two
; 16-byte pieces ([r6] and [r6+16]) at the end.
; ---------------------------------------------------------------------------
    punpckhwd m4, m9
    punpcklwd m9, m10, m11
    punpckhwd m10, m11
    punpckhwd m11, m14, m15
    punpcklwd m14, m15
    punpckhwd m15, m12, m13
    punpcklwd m12, m13
    punpckldq m13, m4, m10
    punpckhdq m4, m10
    punpckhdq m10, m8, m9
    punpckldq m8, m9
    punpckhdq m9, m12, m14
    punpckldq m12, m14
    punpckhdq m14, m15, m11
    punpckldq m15, m11
    punpckhqdq m11, m10, m9
    punpcklqdq m10, m9
    punpckhqdq m9, m8, m12
    punpcklqdq m8, m12
    punpcklqdq m12, m13, m15
    punpckhqdq m13, m15
    punpckhqdq m15, m4, m14
    punpcklqdq m14, m4, m14
    vperm2i128 m4, m0, m8, 0x31
    vinserti128 m0, xm8, 1
    vinserti128 m8, m5, xm12, 1
    vperm2i128 m12, m5, 0x13
    vperm2i128 m5, m1, m9, 0x31
    vinserti128 m1, xm9, 1
    vinserti128 m9, m6, xm13, 1
    vperm2i128 m13, m6, 0x13
    vperm2i128 m6, m2, m10, 0x31
    vinserti128 m2, xm10, 1
    vinserti128 m10, m7, xm14, 1
    vperm2i128 m14, m7, 0x13
    vperm2i128 m7, m3, m11, 0x31
    vinserti128 m3, xm11, 1
    mova xm11, [r6]
    vinserti128 m11, xm15, 1
    vinserti128 m15, [r6+16], 0
    ret
; Fast path (taken when eobd is negative): the 16x8 pack+transpose is
; applied to the registers and m8-m15 are zeroed.
.transpose_fast:
    call m(idct_16x8_internal_10bpc).transpose2
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
    ret
ALIGN function_align
; .main: one half of pass 1. Rows are read with a 64-byte stride (the
; caller selects the other half by offsetting cq by 32). Odd rows go
; through the 8x16 odd-half helper, even rows through the 8x8 / even-half
; helpers, then pd_2 (m11 = pd_2048 shifted right by 10) is added to
; m0-m7 as rounding for the caller's >>2.
.main:
    mova m0, [cq+64* 1]
    mova m1, [cq+64* 3]
    mova m2, [cq+64* 5]
    mova m3, [cq+64* 7]
    mova m4, [cq+64* 9]
    mova m5, [cq+64*11]
    mova m6, [cq+64*13]
    mova m7, [cq+64*15]
    call m(idct_8x16_internal_10bpc).main_oddhalf
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 2]
    mova m2, [cq+64* 4]
    mova m3, [cq+64* 6]
    mova m4, [cq+64* 8]
    mova m5, [cq+64*10]
    mova m6, [cq+64*12]
    mova m7, [cq+64*14]
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    psrld m10, m11, 10 ; pd_2
    REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    ret

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; ---------------------------------------------------------------------------
; iadst_16x16_internal_10bpc
; Same two-half pass-1 structure as the 16x16 idct (eob < 36 skips the
; first half). Outputs 0-3 and 12-15 are rounded with pd_2 and shifted by
; 2; outputs 4-11 are unshifted products rounded with pd_5120 / pd_5119 and
; shifted by 13. The "rnd - x" forms negate the outputs that .main leaves
; sign-inverted.
; ---------------------------------------------------------------------------
cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
.pass1:
    vpbroadcastd m15, [pd_2896]
    lea r6, [rsp+32*4]
    sub eobd, 36
    jl .fast
    add cq, 32
    call .main
    sub cq, 32
    vpbroadcastd m8, [pd_5120]
    paddd m4, m8
    paddd m6, m8
    paddd m9, m8
    paddd m11, m8
    vpbroadcastd m8, [pd_5119]
    psubd m5, m8, m5
    psubd m7, m8, m7
    psubd m10, m8, m10
    psubd m12, m8, m12
    REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    psrld m4, m15, 10 ; pd_2
    paddd m0, m4
    psubd m1, m4, m1
    paddd m2, m4
    psubd m3, m4, m3
    psubd m7, m4, [r6-32*4]
    paddd m6, m4, [r6-32*3]
    psubd m5, m4, [r6-32*2]
    paddd m4, [r6-32*1]
    REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6-32*2], m2
    mova [r6-32*1], m3
    add r6, 32*8
    mova [r6-32*4], m9
    mova [r6-32*3], m10
    mova [r6-32*2], m11
    mova [r6-32*1], m12
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
; .fast: second (or only) half; results stay in m0-m15.
.fast:
    add r6, 32*8
    call .main
    vpbroadcastd m14, [pd_5120]
    vpbroadcastd m13, [pd_5119]
    psrld m15, 10 ; pd_2
    paddd m0, m15
    psubd m1, m15, m1
    paddd m2, m15
    psubd m3, m15, m3
    paddd m4, m14
    psubd m5, m13, m5
    paddd m6, m14
    psubd m7, m13, m7
    paddd m8, m14, m9
    psubd m9, m13, m10
    paddd m10, m14, m11
    psubd m11, m13, m12
    paddd m12, m15, [r6-32*1]
    psubd m13, m15, [r6-32*2]
    paddd m14, m15, [r6-32*3]
    psubd m15, [r6-32*4]
; Shared with iflipadst_16x16: final shifts and r6 rewind.
.pass1_end:
    REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
    REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
    sub r6, 32*8
    jmp tx2q
; Pass 2: rows are scaled alternately by +pw_2048 (m12) and -pw_2048 (m13).
; m8, m9, m12 and m13 are spilled to the stack first because the write
; helper and the scale factors need those registers.
; NOTE(review): row 1 is read from [rsp+32*1] before m9 is spilled there;
; presumably the 8bpc kernel returns row 1 in that slot - confirm there.
.pass2:
    call m(idct_16x16_internal_10bpc).transpose
    lea r6, [pw_5+128]
    mova [rsp], m15
    call m(iadst_16x16_internal_8bpc).main
    call m(iadst_16x16_internal_8bpc).main_pass2_end
    mova [rsp+32*0], m8
    mova [rsp+32*2], m12
    mova [rsp+32*3], m13
    vpbroadcastd m12, [pw_2048]
    pxor m13, m13
    psubw m13, m12
    pmulhrsw m0, m12
    pmulhrsw m1, m13, [rsp+32*1]
    mova [rsp+32*1], m9
    pmulhrsw m2, m12
    pmulhrsw m3, m13
    call m(idct_16x8_internal_10bpc).write_16x4_start
    pmulhrsw m0, m12, m4
    pmulhrsw m1, m13, m5
    pmulhrsw m2, m12, m6
    pmulhrsw m3, m13, m7
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    pmulhrsw m0, m12, [rsp+32*0]
    pmulhrsw m1, m13, [rsp+32*1]
    pmulhrsw m2, m12, m10
    pmulhrsw m3, m13, m11
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    pmulhrsw m0, m12, [rsp+32*2]
    pmulhrsw m1, m13, [rsp+32*3]
    pmulhrsw m2, m12, m14
    pmulhrsw m3, m13, m15
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    RET
ALIGN function_align
; .main: loads the two row groups (64-byte stride, in the order the 16x8
; adst kernels expect), sets m12 = pd_2048 and runs main_part1 on the
; first group, then tail-jumps into main_part2 with the second. Unlike the
; 16x8 .main there is no pd_2896 pre-scaling here.
.main:
    mova m0, [cq+64* 2]
    mova m1, [cq+64*13]
    mova m2, [cq+64* 6]
    mova m3, [cq+64* 9]
    mova m4, [cq+64*10]
    mova m5, [cq+64* 5]
    mova m6, [cq+64*14]
    mova m7, [cq+64* 1]
    vpbroadcastd m12, [pd_2048]
    call m(iadst_16x8_internal_10bpc).main_part1
    mova m0, [cq+64* 0]
    mova m1, [cq+64*15]
    mova m2, [cq+64* 4]
    mova m3, [cq+64*11]
    mova m4, [cq+64* 8]
    mova m5, [cq+64* 7]
    mova m6, [cq+64*12]
    mova m7, [cq+64* 3]
    jmp m(iadst_16x8_internal_10bpc).main_part2

INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; ---------------------------------------------------------------------------
; iflipadst_16x16_internal_10bpc
; Uses the adst .main kernel; rounding, negation and shifts match the adst
; pass 1 but results are stored in reversed order. The .fast tail and
; pass 2 continue on the following physical line.
; ---------------------------------------------------------------------------
cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
.pass1:
    vpbroadcastd m15, [pd_2896]
    lea r6, [rsp+32*4]
    sub eobd, 36
    jl .fast
    add cq, 32
    call m(iadst_16x16_internal_10bpc).main
    sub cq, 32
    vpbroadcastd m8, [pd_5120]
    paddd m11, m8
    paddd m9, m8
    paddd m6, m8
    paddd m4, m8
    vpbroadcastd m8, [pd_5119]
    psubd m12, m8, m12
    psubd m10, m8, m10
    psubd m7, m8, m7
    psubd m5, m8, m5
    REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
    mova [r6+32*0], m12
    mova [r6+32*1], m11
    mova [r6+32*2], m10
    mova [r6+32*3], m9
    psrld m9, m15, 10 ; pd_2
    psubd m3, m9, m3
    paddd m2, m9
    psubd m1, m9, m1
    paddd m0, m9
    psubd m12, m9, [r6-32*4]
    paddd m11, m9, [r6-32*3]
    psubd m10, m9, [r6-32*2]
    paddd m9, [r6-32*1]
    REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
    mova [r6-32*4], m12
    mova [r6-32*3], m11
    mova [r6-32*2], m10
    mova [r6-32*1], m9
    add r6, 32*8
    mova [r6-32*4], m7
    mova [r6-32*3], m6
    mova [r6-32*2], m5
    mova [r6-32*1], m4
    mova [r6+32*0], m3
    mova [r6+32*1], m2
    mova [r6+32*2], m1
    mova [r6+32*3], m0
.fast:
    add r6, 32*8
    call m(iadst_16x16_internal_10bpc).main
    vpbroadcastd m14, [pd_5120]
    vpbroadcastd m13, [pd_5119]
    psrld m15, 10 ; pd_2
    psubd m8, m13, m7
paddd m7, m14, m9 paddd m9, m14, m6 psubd m6, m13, m10 psubd m10, m13, m5 paddd m5, m14, m11 paddd m11, m14, m4 psubd m4, m13, m12 psubd m12, m15, m3 paddd m3, m15, [r6-32*1] paddd m13, m15, m2 psubd m2, m15, [r6-32*2] psubd m14, m15, m1 mova m1, m15 paddd m15, m0 psubd m0, m1, [r6-32*4] paddd m1, [r6-32*3] jmp m(iadst_16x16_internal_10bpc).pass1_end .pass2: call m(idct_16x16_internal_10bpc).transpose lea r6, [pw_5+128] mova [rsp], m15 call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*3], m3 mova [rsp+32*2], m2 mova [rsp+32*0], m0 mova m2, m13 mova m3, m12 vpbroadcastd m12, [pw_2048] pxor m13, m13 psubw m13, m12 pmulhrsw m0, m13, m15 pmulhrsw m1, m12, m14 pmulhrsw m2, m13 pmulhrsw m3, m12 mova m14, m8 mova m15, m9 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m13, m11 pmulhrsw m1, m12, m10 pmulhrsw m2, m13, m15 pmulhrsw m3, m12, m14 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, m7 pmulhrsw m1, m12, m6 pmulhrsw m2, m13, m5 pmulhrsw m3, m12, m4 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, [rsp+32*3] pmulhrsw m1, m12, [rsp+32*2] pmulhrsw m2, m13, [rsp+32*1] pmulhrsw m3, m12, [rsp+32*0] call m(idct_16x8_internal_10bpc).write_16x4_zero RET INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m15, [pd_5793] vpbroadcastd m7, [pd_5120] lea r6, [rsp+32*4] sub eobd, 36 jl .fast mov r3, -32*8*4 .righthalf: pmulld m0, m15, [cq+r3+32*33] pmulld m1, m15, [cq+r3+32*35] pmulld m2, m15, [cq+r3+32*37] pmulld m3, m15, [cq+r3+32*39] add r6, 32*4 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 13}, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 add r3, 32*8 jl .righthalf .fast: pmulld m0, m15, [cq+64* 0] pmulld m1, m15, [cq+64* 1] pmulld m2, m15, [cq+64* 2] pmulld m3, m15, [cq+64* 3] pmulld m4, m15, [cq+64* 4] pmulld 
m5, m15, [cq+64* 5]
; ^ operands of the `pmulld` that ends the preceding source line (inside the
; .fast path of iidentity_16x16_internal_10bpc).
    pmulld m6, m15, [cq+64* 6]
    ; m7 still holds the rounding constant used by the REPX paddd below, so
    ; the product for row 7 is parked at [cq] and folded into m7 afterwards.
    pmulld m8, m15, [cq+64* 7]
    mova [cq], m8
    pmulld m8, m15, [cq+64* 8]
    pmulld m9, m15, [cq+64* 9]
    pmulld m10, m15, [cq+64*10]
    pmulld m11, m15, [cq+64*11]
    pmulld m12, m15, [cq+64*12]
    pmulld m13, m15, [cq+64*13]
    pmulld m14, m15, [cq+64*14]
    pmulld m15, [cq+64*15]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
                        m8, m9, m10, m11, m12, m13, m14, m15
    paddd m7, [cq] ; m7 = rounding constant + parked row-7 product
    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
                        m8, m9, m10, m11, m12, m13, m14, m15
    jmp tx2q
.pass2:
    call m(idct_16x16_internal_10bpc).transpose
    ; m15 and m0 are spilled so that they can serve as the 2nd/3rd IDTX16
    ; arguments (m0) and hold the pw_1697x16 constant (m15).
    mova [cq+32*0], m15
    mova [cq+32*1], m0
    vpbroadcastd m15, [pw_1697x16]
    REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
                            8, 9, 10, 11, 12, 13, 14
    mova m0, [cq+32*1]
    mova [cq+32*1], m1
    IDTX16 0, 1, 15
    ; The vector spilled from m15 is processed by hand: the result is left in
    ; m15, which also held the constant.
    mova m1, [cq+32*0]
    pmulhrsw m15, m1
    paddsw m1, m1
    paddsw m15, m1
    mova m1, [cq+32*1]
    jmp m(idct_16x16_internal_10bpc).end

INV_TXFM_16X16_FN dct, dct, 0, 12
INV_TXFM_16X16_FN dct, identity, 28, 12
INV_TXFM_16X16_FN dct, adst, 0, 12
INV_TXFM_16X16_FN dct, flipadst, 0, 12

; 16x16 DCT, 12 bpc. Pass 1 is the 10 bpc code run with 20-bit clip limits;
; pass 2 is implemented here on 32-bit lanes.
cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m12, [clip_20b_min]
    vpbroadcastd m13, [clip_20b_max]
    jmp m(idct_16x16_internal_10bpc).pass1
.pass2:
    ; Save m8-m15 in the coefficient buffer, run .pass2_main once, pack its
    ; 16 dword results to words and store them at r6, then reload the saved
    ; registers into m0-m7 and run .pass2_main a second time.
    mova [cq+32* 8], m8
    mova [cq+32* 9], m9
    mova [cq+32*10], m10
    mova [cq+32*11], m11
    mova [cq+32*12], m12
    mova [cq+32*13], m13
    mova [cq+32*14], m14
    mova [cq+32*15], m15
    call .pass2_main
    packssdw m0, m1
    packssdw m1, m2, m3
    packssdw m2, m4, m5
    packssdw m3, m6, m7
    packssdw m4, m8, m9
    packssdw m5, m10, m11
    packssdw m6, m12, m13
    packssdw m7, m14, m15
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6-32*2], m2
    mova [r6-32*1], m3
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    mova m0, [cq+32* 8]
    mova m1, [cq+32* 9]
    mova m2, [cq+32*10]
    mova m3, [cq+32*11]
    mova m4, [cq+32*12]
    mova m5, [cq+32*13]
    mova m6, [cq+32*14]
    mova m7, [cq+32*15]
    ; r5 remembers where the packed first-half results were stored; the shared
    ; .end code of the 12 bpc iadst reads them back through r5.
    mov r5, r6
    add r6, 32*16
    call .pass2_main
    jmp m(iadst_16x16_internal_12bpc).end
ALIGN function_align
.write_16x16:
    ; Spill m8/m9/m12 just above the return address (hence +gprsize), scale
    ; m0-m3 by pw_16384 with pmulhrsw, write them, and continue in the 10 bpc
    ; writer.
    mova [rsp+gprsize+32*0], m8
    mova [rsp+gprsize+32*1], m9
    mova [rsp+gprsize+32*2], m12
    vpbroadcastd m12, [pw_16384]
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    pmulhrsw m2, m12
    pmulhrsw m3, m12
    call m(idct_16x8_internal_12bpc).write_16x4_start
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    jmp m(idct_16x16_internal_10bpc).write_16x16_2
ALIGN function_align
.pass2_main:
    call m(idct_8x8_internal_12bpc).transpose_8x8
    ; Even-numbered registers are parked in the coefficient buffer; the
    ; odd-numbered ones are clipped to 18 bits and compacted into m0-m3.
    mova [cq+32* 0], m0
    mova [cq+32* 1], m2
    mova [cq+32* 2], m4
    mova [cq+32* 3], m6
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    pmaxsd m0, m12, m1
    pmaxsd m1, m12, m3
    pmaxsd m2, m12, m5
    pmaxsd m3, m12, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3
    ; Negative eobd selects the fast path, which feeds zeros in m4-m7.
    test eobd, eobd
    jge .pass2_slow
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    jmp .pass2_fast
.pass2_slow:
    ; Fetch eight more vectors from the scratch block 32*8 bytes below r6 and
    ; transpose them; again the even half is parked and the odd half clipped.
    sub r6, 32*8
    mova m8, [r6-32*4]
    mova m4, [r6-32*3]
    mova m10, [r6-32*2]
    mova m5, [r6-32*1]
    mova m12, [r6+32*0]
    mova m6, [r6+32*1]
    mova m14, [r6+32*2]
    mova m7, [r6+32*3]
    TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
    mova [cq+32* 4], m8
    mova [cq+32* 5], m10
    mova [cq+32* 6], m12
    mova [cq+32* 7], m14
    ; The clip constants are reloaded because m12/m13 were used by the
    ; transpose above.
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    REPX {pmaxsd x, m12}, m4, m5, m6, m7
    REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast:
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m14, [pd_2896]
    call m(idct_8x16_internal_10bpc).main_oddhalf
    ; Now reload and clip the parked even-numbered inputs.
    pmaxsd m0, m12, [cq+32* 0]
    pmaxsd m1, m12, [cq+32* 1]
    pmaxsd m2, m12, [cq+32* 2]
    pmaxsd m3, m12, [cq+32* 3]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    test eobd, eobd
    jge .pass2_slow2
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    jmp .pass2_fast2
.pass2_slow2:
    pmaxsd m4, m12, [cq+32* 4]
    pmaxsd m5, m12, [cq+32* 5]
    pmaxsd m6, m12, [cq+32* 6]
    pmaxsd m7, m12, [cq+32* 7]
    REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast2:
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    psrad m11, 8 ; pd_8
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_16x8_internal_10bpc).pass1_rotations
    ; The register list of this REPX continues on the following source line.
    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
; ^ continuation of the REPX register list that ends the preceding source
; line (end of idct_16x16_internal_12bpc.pass2_main).
    ret

INV_TXFM_16X16_FN adst, dct, 0, 12
INV_TXFM_16X16_FN adst, adst, 0, 12
INV_TXFM_16X16_FN adst, flipadst, 0, 12

; 16x16 ADST, 12 bpc. Pass 1 is the 10 bpc code run with 20-bit clip limits.
; Pass 2 processes the block in two halves (.pass2_part1 / .pass2_part2) and
; merges them at .end.
cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_20b_min]
    vpbroadcastd m14, [clip_20b_max]
    jmp m(iadst_16x16_internal_10bpc).pass1
.pass2:
    call .pass2_part1
    call m(iadst_16x8_internal_10bpc).pass1_rotations
    call .pass2_part2
    call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_part3:
    ; Also the jump target of iflipadst_16x16_internal_12bpc.
    REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
    REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
.end:
    ; Also the jump target of idct_16x16_internal_12bpc. Packs the second-half
    ; dword results to words (working downwards so no source is overwritten
    ; before it is read), then blends each packed register with the matching
    ; first-half vector stored at r5 and writes four rows per call.
    packssdw m15, m14
    packssdw m14, m13, m12
    packssdw m13, m11, m10
    packssdw m12, m9, m8
    packssdw m11, m7, m6
    packssdw m10, m5, m4
    packssdw m7, m3, m2
    packssdw m6, m1, m0
    ; Mask 0x33 takes dwords 0,1,4,5 from the memory operand and the rest from
    ; the register; 0xcc is the complementary selection.
    vpblendd m0, m6, [r5-32*4], 0x33
    vpblendd m1, m6, [r5-32*4], 0xcc
    vpblendd m2, m7, [r5-32*3], 0x33
    vpblendd m3, m7, [r5-32*3], 0xcc
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    call m(idct_16x8_internal_12bpc).write_16x4_start
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    vpblendd m0, m10, [r5-32*2], 0x33
    vpblendd m1, m10, [r5-32*2], 0xcc
    vpblendd m2, m11, [r5-32*1], 0x33
    vpblendd m3, m11, [r5-32*1], 0xcc
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    vpblendd m0, m12, [r5+32*0], 0x33
    vpblendd m1, m12, [r5+32*0], 0xcc
    vpblendd m2, m13, [r5+32*1], 0x33
    vpblendd m3, m13, [r5+32*1], 0xcc
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    vpblendd m0, m14, [r5+32*2], 0x33
    vpblendd m1, m14, [r5+32*2], 0xcc
    vpblendd m2, m15, [r5+32*3], 0x33
    vpblendd m3, m15, [r5+32*3], 0xcc
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    call m(idct_16x8_internal_10bpc).write_16x4_zero
    RET
ALIGN function_align
.pass2_part1:
    ; Save m8-m15 for the second half, then fall through into .pass2_main.
    mova [cq+32* 8], m8
    mova [cq+32* 9], m9
    mova [cq+32*10], m10
    mova [cq+32*11], m11
    mova [cq+32*12], m12
    mova [cq+32*13], m13
    mova [cq+32*14], m14
    mova [cq+32*15], m15
.pass2_main:
    call m(idct_8x8_internal_12bpc).transpose_8x8
    ; m0/m3/m4/m7 are parked for main_part2; the other four are clipped to
    ; 18 bits and placed where main_part1 expects them.
    mova [cq+32* 0], m0
    mova [cq+32* 1], m3
    mova [cq+32* 2], m4
    mova [cq+32* 3], m7
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
    pmaxsd m0, m13, m2
    pmaxsd m2, m13, m6
    pmaxsd m5, m13, m5
    pmaxsd m7, m13, m1
    REPX {pminsd x, m14}, m0, m2, m5, m7
    ; Negative eobd selects the fast path, which feeds zeros in m1/m3/m4/m6.
    test eobd, eobd
    jge .pass2_slow
    pxor m1, m1
    REPX {mova x, m1}, m3, m4, m6
    jmp .pass2_fast
.pass2_slow:
    sub r6, 32*8
    mova m8, [r6-32*4]
    mova m3, [r6-32*3]
    mova m4, [r6-32*2]
    mova m11, [r6-32*1]
    mova m12, [r6+32*0]
    mova m1, [r6+32*1]
    mova m6, [r6+32*2]
    mova m15, [r6+32*3]
    TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
    mova [cq+32* 4], m8
    mova [cq+32* 5], m11
    mova [cq+32* 6], m12
    mova [cq+32* 7], m15
    ; Clip constants are reloaded because m13/m14 were transpose temporaries.
    vpbroadcastd m13, [clip_18b_min]
    vpbroadcastd m14, [clip_18b_max]
    REPX {pmaxsd x, m13}, m1, m3, m4, m6
    REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast:
    vpbroadcastd m12, [pd_2048]
    vpbroadcastd m15, [pd_2896]
    call m(iadst_16x8_internal_10bpc).main_part1
    pmaxsd m0, m13, [cq+32* 0] ; 0
    pmaxsd m7, m13, [cq+32* 1] ; 3
    pmaxsd m2, m13, [cq+32* 2] ; 4
    pmaxsd m5, m13, [cq+32* 3] ; 7
    REPX {pminsd x, m14}, m0, m2, m5, m7
    test eobd, eobd
    jge .pass2_slow2
    pxor m1, m1
    REPX {mova x, m1}, m3, m4, m6
    jmp .pass2_fast2
.pass2_slow2:
    pmaxsd m4, m13, [cq+32* 4] ; 8
    pmaxsd m3, m13, [cq+32* 5] ; 11
    pmaxsd m6, m13, [cq+32* 6] ; 12
    pmaxsd m1, m13, [cq+32* 7] ; 15
    REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast2:
    call m(iadst_16x8_internal_10bpc).main_part2
    ; Set up the constants in m13/m14/m15 for the pass1_rotations call that
    ; every caller issues next.
    vpbroadcastd m14, [pd_17408]
    psrld m15, 11 ; pd_1
    psubd m13, m14, m15 ; pd_17407
    pslld m15, 3 ; pd_8
    ret
ALIGN function_align
.pass2_part2:
    ; Finish the first half (shift, pack to words, store at r6), reload the
    ; registers saved by .pass2_part1 and run .pass2_main on them.
    REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
    REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
    packssdw m0, m1
    packssdw m1, m2, m3
    packssdw m2, m4, m5
    packssdw m3, m6, m7
    packssdw m4, m8, m9
    packssdw m5, m10, m11
    packssdw m6, m12, m13
    packssdw m7, m14, m15
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6-32*2], m2
    mova [r6-32*1], m3
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    mova m0, [cq+32* 8]
    mova m1, [cq+32* 9]
    mova m2, [cq+32*10]
    mova m3, [cq+32*11]
    mova m4, [cq+32*12]
    mova m5, [cq+32*13]
    mova m6, [cq+32*14]
    mova m7, [cq+32*15]
    ; r5 keeps the address of the packed first half for the blends at .end.
    mov r5, r6
    add r6, 32*16
    jmp .pass2_main

INV_TXFM_16X16_FN flipadst, dct, 0, 12
INV_TXFM_16X16_FN flipadst, adst, 0, 12
INV_TXFM_16X16_FN flipadst, flipadst, 0, 12

; 16x16 flipped-ADST, 12 bpc. Identical call structure to the 12 bpc iadst
; pass 2, except that the flipadst variant of pass1_rotations is used.
cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m13, [clip_20b_min]
    vpbroadcastd m14, [clip_20b_max]
    jmp m(iflipadst_16x16_internal_10bpc).pass1
.pass2:
    call m(iadst_16x16_internal_12bpc).pass2_part1
    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
    call m(iadst_16x16_internal_12bpc).pass2_part2
    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
    jmp m(iadst_16x16_internal_12bpc).pass2_part3

INV_TXFM_16X16_FN identity, dct, -92, 12
INV_TXFM_16X16_FN identity, identity, 0, 12

; Scales register m<src> in place: m = (((m * m7 + m15) >> 12) + m) >> 1.
; Expects the multiplier in m7 and the rounding constant in m15; clobbers m6.
%macro IDTX16_12BPC 1 ; src
    pmulld m6, m7, m%1
    paddd m6, m15
    psrad m6, 12
    paddd m6, m%1
    psrad m%1, m6, 1
%endmacro

; 16x16 identity transform, 12 bpc.
cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
    vpbroadcastd m7, [pd_1697]
    vpbroadcastd m15, [pd_5120]
    lea r6, [rsp+32*4]
    sub eobd, 36
    jl .fast
    ; r3 counts up from -32*8*4 to 0 in steps of 32*8 (four iterations).
    mov r3, -32*8*4
.righthalf:
    mova m10, [cq+r3+32*33]
    mova m11, [cq+r3+32*35]
    mova m12, [cq+r3+32*37]
    mova m13, [cq+r3+32*39]
    add r6, 32*4
    ; Same arithmetic as IDTX16_12BPC, written out for four vectors at once.
    pmulld m0, m7, m10
    pmulld m1, m7, m11
    pmulld m2, m7, m12
    pmulld m3, m7, m13
    REPX {paddd x, m15}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
    paddd m0, m10
    paddd m1, m11
    paddd m2, m12
    paddd m3, m13
    REPX {psrad x, 1 }, m0, m1, m2, m3
    mova [r6+32*0], m0
    mova [r6+32*1], m1
    mova [r6+32*2], m2
    mova [r6+32*3], m3
    add r3, 32*8
    jl .righthalf
.fast:
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 1]
    mova m2, [cq+64* 2]
    mova m3, [cq+64* 3]
    mova m4, [cq+64* 4]
    mova m5, [cq+64* 5]
    ; Rows 6 and 7 are loaded into m8/m9 because m6 is the macro's temporary
    ; and m7 holds the multiplier.
    mova m8, [cq+64* 6]
    mova m9, [cq+64* 7]
    REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
    ; The operands of the next mova are on the following source line.
    mova
[cq+64*0], m8
; ^ operands of the `mova` that ends the preceding source line (inside the
; .fast path of iidentity_16x16_internal_12bpc). m8/m9 hold the scaled rows 6
; and 7 at this point; they are parked in the coefficient buffer so the
; registers can be reused for rows 8 and 9.
    mova [cq+64*1], m9
    mova m8, [cq+64* 8]
    mova m9, [cq+64* 9]
    mova m10, [cq+64*10]
    mova m11, [cq+64*11]
    mova m12, [cq+64*12]
    mova m13, [cq+64*13]
    mova m14, [cq+64*14]
    REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
    ; Row 15 is scaled by hand with the same arithmetic as the macro: the
    ; multiplier register m7 is consumed as the accumulator, the result lands
    ; in m15 (which held the rounding constant), and m6/m7 are then refilled
    ; with the parked rows 6 and 7.
    mova m6, [cq+64*15]
    pmulld m7, m6
    paddd m7, m15
    psrad m7, 12
    paddd m7, m6
    mova m6, [cq+64*0]
    psrad m15, m7, 1
    mova m7, [cq+64*1]
    jmp tx2q
.pass2:
    call m(iidentity_8x16_internal_12bpc).pass2_main
    call m(idct_16x16_internal_10bpc).transpose_fast
    ; Negative eobd: only the half already in registers is processed.
    test eobd, eobd
    jl .pass2_fast
    ; Park the finished half in the coefficient buffer, fetch the other
    ; sixteen vectors from the two scratch blocks around r6, process them, and
    ; finally arrange the halves as m0-m7 / m8-m15 for the writer.
    mova [cq+32* 8], m0
    mova [cq+32* 9], m1
    mova [cq+32*10], m2
    mova [cq+32*11], m3
    mova [cq+32*12], m4
    mova [cq+32*13], m5
    mova [cq+32*14], m6
    mova [cq+32*15], m7
    mova m8, [r6-32*4]
    mova m9, [r6-32*3]
    mova m10, [r6-32*2]
    mova m11, [r6-32*1]
    mova m12, [r6+32*0]
    mova m13, [r6+32*1]
    mova m14, [r6+32*2]
    mova m15, [r6+32*3]
    sub r6, 32*8
    mova m0, [r6-32*4]
    mova m1, [r6-32*3]
    mova m2, [r6-32*2]
    mova m3, [r6-32*1]
    mova m4, [r6+32*0]
    mova m5, [r6+32*1]
    mova m6, [r6+32*2]
    mova m7, [r6+32*3]
    call m(iidentity_8x16_internal_12bpc).pass2_main
    call m(idct_16x8_internal_10bpc).transpose2
    mova m8, m0
    mova m9, m1
    mova m10, m2
    mova m11, m3
    mova m12, m4
    mova m13, m5
    mova m14, m6
    mova m15, m7
    mova m0, [cq+32* 8]
    mova m1, [cq+32* 9]
    mova m2, [cq+32*10]
    mova m3, [cq+32*11]
    mova m4, [cq+32*12]
    mova m5, [cq+32*13]
    mova m6, [cq+32*14]
    mova m7, [cq+32*15]
.pass2_fast:
    call m(idct_16x16_internal_12bpc).write_16x16
    RET

; Final butterfly stage of a 32-point inverse DCT for output index n = %1.
; Reads three stored intermediates at r6+32*(n-4), r5+32*(3-n) and
; r4+32*(n-4), combines them with the value in m<%1>, clips with m12/m13
; (min/max), adds the rounding bias in m11 and shifts right by %6.
; Results: m<%1> = out(n), m<%3> = out(16+n), m<%2> = out(15-n),
; m<%4> = out(31-n). When bit 0 of %7 is set (the default), the four dword
; results are packed to words into m<%1> and m<%2>.
%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
    mova m%4, [r6+32*(%1-4)]
    mova m%2, [r5+32*(3-%1)]
    mova m%5, [r4+32*(%1-4)]
    psubd m%3, m%1, m%4 ; idct16 out15 - n
    paddd m%1, m%4 ; idct16 out0 + n
    pmaxsd m%1, m12
    pmaxsd m%3, m12
    pminsd m%1, m13
    pminsd m%3, m13
    paddd m%1, m11
    paddd m%3, m11
    psubd m%4, m%1, m%2 ; out31 - n
    paddd m%1, m%2 ; out0 + n
    paddd m%2, m%3, m%5 ; out15 - n
    psubd m%3, m%5 ; out16 + n
    REPX {psrad x, %6}, m%1, m%3, m%2, m%4
%if %7 & 1
    packssdw m%1, m%3 ; out0 + n, out16 + n
    packssdw m%2, m%4 ; out15 - n, out31 - n
%endif
%endmacro

; The name and arguments of this cglobal are on the following source line.
cglobal
inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
; ^ arguments of the `cglobal` that ends the preceding source line.
; 8x32 DCT_DCT inverse transform + add, 10 bpc. Pass 1 handles the
; coefficients in up to four 32-byte column groups (.pass1_main), with the
; number of groups chosen from eob (thresholds 43, 107, 171); pass 2 reuses
; the 8bpc 8x32 kernels on the packed 16-bit results.
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vbroadcasti128 m14, [idct32_shuf]
    mov r4, cq ; .pass1_main advances cq; r4 keeps the original pointer
    call .pass1_main
    mova [rsp+32*0], m2
    mova [rsp+32*1], m3
    cmp eobd, 43
    jge .eob43
    ; eob < 43: only the first group is present; zero the remaining inputs.
    pxor m4, m4
    REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
    jmp .pass1_end_fast
.eob43:
    lea r6, [rsp+32*8]
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    call .pass1_main
    mova [rsp+32*2], m2
    cmp eobd, 107
    jge .eob107
    ; eob < 107: two groups; shuffle them into the registers main_fast expects.
    mova m11, m3
    mova m2, m0
    mova m3, m1
    mova m0, [r6-32*4]
    mova m1, [r6-32*3]
    pxor m4, m4
.pass1_end_fast:
    vpbroadcastd m10, [pw_2048]
    lea r6, [deint_shuf+128]
    REPX {mova x, m4}, m5, m6, m7
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
    jmp .end
.eob107:
    mova [rsp+32*3], m3
    mova [r6-32*2], m0
    mova [r6-32*1], m1
    call .pass1_main
    cmp eobd, 171
    jge .eob171
    ; eob < 171: three groups; the fourth group's inputs are zeroed.
    pshufd m12, m2, q1032
    pshufd m13, m3, q1032
    mova m4, m0
    mova m5, m1
    pxor m6, m6
    REPX {mova x, m6}, m7, m14, m15
    jmp .pass1_end
.eob171:
    mova [r6+32*0], m0
    mova [r6+32*1], m1
    mova [r6+32*2], m2
    mova [r6+32*3], m3
    call .pass1_main
    pshufd m12, [r6+32*2], q1032 ; out19 out17
    pshufd m13, [r6+32*3], q1032 ; out23 out21
    mova m4, [r6+32*0] ; out16 out18
    mova m5, [r6+32*1] ; out20 out22
    pshufd m14, m2, q1032 ; out27 out25
    pshufd m15, m3, q1032 ; out31 out29
    mova m6, m0 ; out24 out26
    mova m7, m1 ; out28 out30
.pass1_end:
    mova m0, [r6-32*4] ; out0 out2
    mova m1, [r6-32*3] ; out4 out6
    mova m2, [r6-32*2] ; out8 out10
    mova m3, [r6-32*1] ; out12 out14
    lea r6, [deint_shuf+128]
    mova m11, [rsp+32*3] ; out13 out15
    vpbroadcastd m10, [pw_2048]
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
.end: ; [rsp+0*32] = m12
    ; Each step below permutes two result vectors, scales them by pw_2048
    ; with pmulhrsw and hands them to the 8x4 writer. m8-m11 are spilled first
    ; so they can be read back as memory operands of vpermq.
    vpbroadcastd m12, [pw_2048]
    mov cq, r4
    mova [rsp+32*1], m8
    mova [rsp+32*2], m9
    mova [rsp+32*3], m10
    mova [rsp+32*4], m11
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4_start
    vpermq m0, m2, q3120
    vpermq m1, m3, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m4, q3120
    vpermq m1, m5, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m6, q3120
    vpermq m1, m7, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [rsp+32*1], q3120
    vpermq m1, [rsp+32*2], q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [rsp+32*3], q3120
    vpermq m1, [rsp+32*4], q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [rsp+32*0], q3120
    vpermq m1, m13, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m14, q3120
    vpermq m1, m15, q2031
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    call m(idct_8x8_internal_10bpc).write_8x4
    RET
.dconly:
    ; DC-only path (eob == 0): r6d = (dc * 181 + 640) >> 10, then continue in
    ; the shared 8x8 DC code. eobd is zero here, so the store clears the DC
    ; coefficient.
    imul r6d, [cq], 181
    vpbroadcastd m2, [dconly_10bpc]
    mov [cq], eobd ; 0
    ; NOTE(review): r3d is presumably the same register as eobd (4th cglobal
    ; argument), making this "r3d = 32", the row count - confirm.
    or r3d, 32
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main_part1:
    ; 8-point inverse DCT on eight vectors loaded at a 128-byte pitch, followed
    ; by the output butterflies with a +2 bias and a 2-bit shift. Also called
    ; by the 12 bpc 8x32 function.
    mova m0, [cq+128*0]
    mova m1, [cq+128*1]
    mova m2, [cq+128*2]
    mova m3, [cq+128*3]
    mova m4, [cq+128*4]
    mova m5, [cq+128*5]
    mova m6, [cq+128*6]
    mova m7, [cq+128*7]
    call m(idct_8x8_internal_10bpc).main
    psrld m1, m11, 10 ; pd_2
    REPX {paddd x, m1}, m0, m6, m5, m3
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    psubd m7, m0, m9 ; out7
    paddd m0, m9 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    psubd m4, m3, m8 ; out4
    paddd m3, m8 ; out3
    REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
    ret
ALIGN function_align
.pass1_main:
    ; One column group: run part1, advance cq by 32 bytes, pack to 16 bits and
    ; shuffle/interleave so that each output register holds two rows.
    call .pass1_main_part1
    add cq, 32
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    pshufb m0, m14
    pshufb m2, m14
    pshufb m4, m14
    pshufb m6, m14
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckldq m2, m4, m6
    punpckhdq m4, m6
    vperm2i128 m1, m0, m2, 0x31 ; 4 6
    vinserti128 m0, xm2, 1 ; 0 2
    vinserti128 m2, m3, xm4, 1 ; 1 3
    vperm2i128 m3, m4, 0x31 ; 5 7
    ret
; ---------------------------------------------------------------------------
; Odd half of the 32-point inverse DCT on 32-bit lanes, split in two parts.
; Register contract visible in the code: m11 = rounding bias added before the
; 12-bit shifts, m12/m13 = clip min/max, m14 = multiplier used by part 2's
; final rotations. The *_rect2 entry points first rescale the inputs with
; (x + m11) >> 12; the *_fast entry points take only four inputs (m0-m3).
; ---------------------------------------------------------------------------
.main_oddhalf_part1_fast_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part1_fast: ; lower half zero
    ; With four inputs each rotation reduces to two plain multiplies.
    vpbroadcastd m7, [pd_4091]
    vpbroadcastd m8, [pd_201]
    vpbroadcastd m6, [pd_m1380]
    vpbroadcastd m9, [pd_3857]
    vpbroadcastd m5, [pd_3703]
    vpbroadcastd m10, [pd_1751]
    vpbroadcastd m4, [pd_m2751]
    vpbroadcastd m15, [pd_3035]
    pmulld m7, m0
    pmulld m0, m8
    pmulld m6, m1
    pmulld m1, m9
    pmulld m5, m2
    pmulld m2, m10
    pmulld m4, m3
    pmulld m3, m15
    jmp .main_oddhalf_part1_fast2
.main_oddhalf_part1_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
.main_oddhalf_part1_fast2:
    ; Round the eight products and run the butterfly/rotation stages, clipping
    ; after every add/sub stage.
    REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
    REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
    psubd m8, m0, m4 ; t17
    paddd m0, m4 ; t16
    psubd m4, m6, m2 ; t18
    paddd m6, m2 ; t19
    psubd m2, m1, m5 ; t29
    paddd m1, m5 ; t28
    psubd m5, m7, m3 ; t30
    paddd m7, m3 ; t31
    REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
    REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
    vpbroadcastd m15, [pd_4017]
    vpbroadcastd m10, [pd_799]
    ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
    ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
    psubd m3, m0, m6 ; t19a
    paddd m0, m6 ; t16a
    psubd m6, m7, m1 ; t28a
    paddd m7, m1 ; t31a
    psubd m1, m5, m4 ; t18
    paddd m5, m4 ; t17
    psubd m4, m8, m2 ; t29
    paddd m8, m2 ; t30
    REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
    REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
    vpbroadcastd m15, [pd_3784]
    vpbroadcastd m10, [pd_1567]
    ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
    ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
    ; Part 1 results are left in the scratch block at r6 for part 2.
    mova [r6-32*4], m0
    mova [r6-32*3], m5
    mova [r6-32*2], m4
    mova [r6-32*1], m6
    mova [r6+32*0], m3
    mova [r6+32*1], m1
    mova [r6+32*2], m8
    mova [r6+32*3], m7
    ret
.main_oddhalf_part2_fast_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part2_fast: ; lower half zero
    vpbroadcastd m7, [pd_m601]
    vpbroadcastd m8, [pd_4052]
    vpbroadcastd m6, [pd_3973]
    vpbroadcastd m9, [pd_995]
    vpbroadcastd m5, [pd_m2106]
    vpbroadcastd m10, [pd_3513]
    vpbroadcastd m4, [pd_3290]
    vpbroadcastd m15, [pd_2440]
    pmulld m7, m0
    pmulld m0, m8
    pmulld m6, m1
    pmulld m1, m9
    pmulld m5, m2
    pmulld m2, m10
    pmulld m4, m3
    pmulld m3, m15
    jmp .main_oddhalf_part2_fast2
.main_oddhalf_part2_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
    ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
    ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
    ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
    ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
.main_oddhalf_part2_fast2:
    REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
    REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
    psubd m8, m0, m4 ; t25
    paddd m0, m4 ; t24
    psubd m4, m6, m2 ; t26
    paddd m6, m2 ; t27
    psubd m2, m1, m5 ; t21
    paddd m1, m5 ; t20
    psubd m5, m7, m3 ; t22
    paddd m7, m3 ; t23
    REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
    REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
    vpbroadcastd m15, [pd_2276]
    vpbroadcastd m10, [pd_3406]
    ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
    ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
    psubd m3, m0, m6 ; t27a
    paddd m0, m6 ; t24a
    psubd m6, m7, m1 ; t20a
    paddd m7, m1 ; t23a
    psubd m1, m5, m4 ; t21
    paddd m5, m4 ; t22
    psubd m4, m8, m2 ; t26
    paddd m8, m2 ; t25
    REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
    REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
    vpbroadcastd m15, [pd_3784]
    vpbroadcastd m10, [pd_1567]
    ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
    ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
    ; Combine with the part 1 results stored at r6, two at a time; the
    ; finished sums are written back in place.
    mova m9, [r6-32*4] ; t16a
    mova m10, [r6-32*3] ; t17
    psubd m2, m9, m7 ; t23
    paddd m9, m7 ; t16
    psubd m7, m10, m5 ; t22a
    paddd m10, m5 ; t17a
    REPX {pmaxsd x, m12}, m9, m10, m2, m7
    REPX {pminsd x, m13}, m9, m10, m2, m7
    mova [r6-32*4], m9
    mova [r6-32*3], m10
    mova m9, [r6-32*2] ; t18a
    mova m10, [r6-32*1] ; t19
    psubd m5, m9, m1 ; t21
    paddd m9, m1 ; t18
    psubd m1, m10, m6 ; t20a
    paddd m10, m6 ; t19a
    REPX {pmaxsd x, m12}, m9, m10, m5, m1
    REPX {pminsd x, m13}, m9, m10, m5, m1
    mova [r6-32*2], m9
    mova [r6-32*1], m10
    mova m9, [r6+32*0] ; t28
    mova m10, [r6+32*1] ; t29a
    psubd m6, m9, m3 ; t27a
    paddd m9, m3 ; t28a
    psubd m3, m10, m4 ; t26
    paddd m10, m4 ; t29
    REPX {pmaxsd x, m12}, m9, m10, m6, m3
    REPX {pminsd x, m13}, m9, m10, m6, m3
    ; Rotations computed as (a*m14 +/- b*m14 + m11) >> 12.
    REPX {pmulld x, m14}, m6, m3, m1, m5
    paddd m6, m11
    paddd m3, m11
    psubd m4, m6, m1 ; t20
    paddd m6, m1 ; t27
    psubd m1, m3, m5 ; t21a
    paddd m3, m5 ; t26a
    REPX {psrad x, 12 }, m4, m1, m3, m6
    mova [r6+32*0], m4
    mova [r6+32*1], m1
    mova m4, [r6+32*2] ; t30
    mova m1, [r6+32*3] ; t31a
    psubd m5, m4, m8 ; t25a
    paddd m4, m8 ; t30a
    psubd m8, m1, m0 ; t24
    paddd m1, m0 ; t31
    REPX {pmaxsd x, m12}, m8, m5, m4, m1
    REPX {pminsd x, m13}, m8, m5, m4, m1
    REPX {pmulld x, m14}, m5, m8, m7, m2
    paddd m5, m11
    paddd m8, m11
    psubd m0, m5, m7 ; t22
    paddd m5, m7 ; t25
    psubd m7, m8, m2 ; t23a
    paddd m2, m8 ; t24a
    REPX {psrad x, 12 }, m0, m7, m2, m5
    mova [r6+32*2], m0
    mova [r6+32*3], m7
    ; On return: r4 -> block with the first eight results, r5 -> block with the
    ; last eight, r6 -> the next free block. IDCT32_END addresses all three.
    mov r4, r6
    add r6, 32*8
    mova [r6-32*4], m2
    mova [r6-32*3], m5
    mova [r6-32*2], m3
    mova [r6-32*1], m6
    mova [r6+32*0], m9
    mova [r6+32*1], m10
    mova [r6+32*2], m4
    mova [r6+32*3], m1
    mov r5, r6
    add r6, 32*8
    ret
ALIGN function_align
.main_end:
    ; Final 32-point stage with a 2-bit shift and packing to words. Outputs
    ; 0..15 stay in m0-m7 as interleaved pairs; outputs 16..31 are stored in
    ; the block at r5. Falls through into .transpose.
    psrld m11, 10 ; pd_2
    IDCT32_END 0, 15, 8, 9, 10, 2
    IDCT32_END 1, 14, 8, 9, 10, 2
    punpckhwd m8, m0, m1 ; 16 17
    punpcklwd m0, m1 ; 0 1
    punpcklwd m1, m14, m15 ; 14 15
    punpckhwd m14, m15 ; 30 31
    mova [r5+32*3], m8
    mova [r5+32*2], m14
    IDCT32_END 2, 15, 8, 9, 10, 2
    IDCT32_END 3, 14, 8, 9, 10, 2
    punpckhwd m8, m2, m3 ; 18 19
    punpcklwd m2, m3 ; 2 3
    punpcklwd m3, m14, m15 ; 12 13
    punpckhwd m14, m15 ; 28 29
    mova [r5+32*1], m8
    mova [r5+32*0], m14
    IDCT32_END 4, 15, 8, 9, 10, 2
    IDCT32_END 5, 14, 8, 9, 10, 2
    punpckhwd m8, m4, m5 ; 20 21
    punpcklwd m4, m5 ; 4 5
    punpcklwd m5, m14, m15 ; 10 11
    punpckhwd m14, m15 ; 26 27
    mova [r5-32*1], m8
    mova [r5-32*2], m14
    IDCT32_END 6, 15, 8, 9, 10, 2
    IDCT32_END 7, 14, 8, 9, 10, 2
    punpckhwd m8, m6, m7 ; 22 23
    punpcklwd m6, m7 ; 6 7
    punpcklwd m7, m14, m15 ; 8 9
    punpckhwd m14, m15 ; 24 25
    mova [r5-32*3], m8
    mova [r5-32*4], m14
.transpose:
    ; Dword/qword/128-bit-lane interleave of m0-m7; m15 is used as a
    ; temporary. Also called directly by the 32x8 function.
    punpckhdq m15, m3, m1
    punpckldq m3, m1
    punpckhdq m1, m4, m6
    punpckldq m4, m6
    punpckhdq m6, m0, m2
    punpckldq m0, m2
    punpckhdq m2, m7, m5
    punpckldq m7, m5
    punpcklqdq m5, m2, m15
    punpckhqdq m2, m15
    punpckhqdq m15, m7, m3
    punpcklqdq m7, m3
    punpckhqdq m3, m6, m1
    punpcklqdq m6, m1
    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    vperm2i128 m4, m0, m7, 0x31
    vinserti128 m0, xm7, 1
    vperm2i128 m7, m3, m2, 0x31
    vinserti128 m3, xm2, 1
    vinserti128 m2, m6, xm5, 1
    vperm2i128 m6, m5, 0x31
    vperm2i128 m5, m1, m15, 0x31
    vinserti128 m1, xm15, 1
    ret

; 8x32 identity/identity inverse transform + add, 10 bpc. The 12 bpc version
; loads a different pixel maximum into m7 and jumps to .pass1.
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m5, [pw_5]
    pxor m6, m6
    ; Round eob up to a multiple of 64 by adding 21 to its low byte; if that
    ; byte add carries, the original value is restored instead.
    mov r6d, eobd
    add eobb, 21
    cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
    lea r6, [strideq*3]
    lea r5, [strideq*5]
    lea r4, [strideq+r6*2] ; strideq*7
.loop:
    ; One iteration handles an 8x8 tile: pack eight dword vectors to words,
    ; apply (x + 5) >> 3, then transpose/add/clip/store in .main_zero.
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {paddsw x, m5}, m0, m1, m2, m3
    REPX {psraw x, 3 }, m0, m1, m2, m3
    call .main_zero
    add cq, 32
    lea dstq, [dstq+strideq*8]
    sub eobd, 64
    jge .loop
    RET
ALIGN function_align
.main_zero:
    ; Clear the coefficients that were just consumed (m6 is zero).
    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
    ; Word transpose of m0-m3, then add to eight destination rows (rows n and
    ; n+4 share one ymm register), clamp to [0, m7] and store.
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m4
    punpcklwd m0, m4
    punpckhwd m4, m2, m1
    punpcklwd m2, m1
    punpckhqdq m1, m0, m2
    punpcklqdq m0, m2
    punpcklqdq m2, m3, m4
    punpckhqdq m3, m4
    ; The memory operand of this load is on the following source line.
    mova xm4,
[dstq+strideq*0]
; ^ memory operand of the `mova xm4,` that ends the preceding source line
; (inside inv_txfm_add_identity_identity_8x32_10bpc.main). Rows n and n+4 of
; the destination are combined into one ymm register, added to the residual,
; clamped between m6 and m7, and written back.
    vinserti128 m4, [dstq+strideq*4], 1
    paddw m0, m4
    mova xm4, [dstq+strideq*1]
    vinserti128 m4, [dstq+r5 ], 1
    paddw m1, m4
    mova xm4, [dstq+strideq*2]
    vinserti128 m4, [dstq+r6*2 ], 1
    paddw m2, m4
    mova xm4, [dstq+r6 ]
    vinserti128 m4, [dstq+r4 ], 1
    paddw m3, m4
    REPX {pmaxsw x, m6}, m0, m1, m2, m3
    REPX {pminsw x, m7}, m0, m1, m2, m3
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*4], m0, 1
    mova [dstq+strideq*1], xm1
    vextracti128 [dstq+r5 ], m1, 1
    mova [dstq+strideq*2], xm2
    vextracti128 [dstq+r6*2 ], m2, 1
    mova [dstq+r6 ], xm3
    vextracti128 [dstq+r4 ], m3, 1
    ret

; 8x32 DCT_DCT inverse transform + add, 12 bpc. Pass 1 runs with 20-bit clip
; limits and writes its transposed 32-bit results back into the coefficient
; buffer; pass 2 reloads them with 18-bit clipping and uses the 32-bit
; odd/even-half kernels of the 10 bpc code.
cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_20b_min]
    vpbroadcastd m13, [clip_20b_max]
    mov r4, cq ; .pass1_main advances cq; r4 keeps the original pointer
    lea r6, [rsp+32*4]
    ; One .pass1_main call per 32-byte column group; eob thresholds 43, 107
    ; and 171 decide how many groups are processed (1, 2, 3 or 4).
    call .pass1_main
    cmp eobd, 43
    jge .eob43
    jmp .pass2_fast
.eob43:
    call .pass1_main
    cmp eobd, 107
    jge .eob107
.pass2_fast:
    ; At most two column groups were produced: only offsets +0 and +32 of
    ; each row are loaded and the *_fast kernels are used.
    mov cq, r4
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    pmaxsd m0, m12, [cq+128*1+ 0]
    pmaxsd m1, m12, [cq+128*7+ 0]
    pmaxsd m2, m12, [cq+128*1+32]
    pmaxsd m3, m12, [cq+128*7+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
    pmaxsd m0, m12, [cq+128*3+ 0]
    pmaxsd m1, m12, [cq+128*5+ 0]
    pmaxsd m2, m12, [cq+128*3+32]
    pmaxsd m3, m12, [cq+128*5+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
    pmaxsd m0, m12, [cq+128*2+ 0]
    pmaxsd m1, m12, [cq+128*6+ 0]
    pmaxsd m2, m12, [cq+128*2+32]
    pmaxsd m3, m12, [cq+128*6+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
    pmaxsd m0, m12, [cq+128*0+ 0]
    pmaxsd m1, m12, [cq+128*4+ 0]
    pmaxsd m2, m12, [cq+128*0+32]
    pmaxsd m3, m12, [cq+128*4+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    jmp .pass2_end
.eob107:
    call .pass1_main
    cmp eobd, 171
    jge .eob171
    jmp .pass2
.eob171:
    call .pass1_main
.pass2:
    ; Full path: all four 32-byte slices (+0, +32, +64, +96) of each row are
    ; loaded, clipped to 18 bits and fed to the full-size kernels.
    mov cq, r4
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    pmaxsd m0, m12, [cq+128*1+ 0]
    pmaxsd m1, m12, [cq+128*7+ 0]
    pmaxsd m2, m12, [cq+128*1+32]
    pmaxsd m3, m12, [cq+128*7+32]
    pmaxsd m4, m12, [cq+128*1+64]
    pmaxsd m5, m12, [cq+128*7+64]
    pmaxsd m6, m12, [cq+128*1+96]
    pmaxsd m7, m12, [cq+128*7+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
    pmaxsd m0, m12, [cq+128*3+ 0]
    pmaxsd m1, m12, [cq+128*5+ 0]
    pmaxsd m2, m12, [cq+128*3+32]
    pmaxsd m3, m12, [cq+128*5+32]
    pmaxsd m4, m12, [cq+128*3+64]
    pmaxsd m5, m12, [cq+128*5+64]
    pmaxsd m6, m12, [cq+128*3+96]
    pmaxsd m7, m12, [cq+128*5+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
    pmaxsd m0, m12, [cq+128*2+ 0]
    pmaxsd m1, m12, [cq+128*6+ 0]
    pmaxsd m2, m12, [cq+128*2+32]
    pmaxsd m3, m12, [cq+128*6+32]
    pmaxsd m4, m12, [cq+128*2+64]
    pmaxsd m5, m12, [cq+128*6+64]
    pmaxsd m6, m12, [cq+128*2+96]
    pmaxsd m7, m12, [cq+128*6+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x16_internal_10bpc).main_oddhalf
    pmaxsd m0, m12, [cq+128*0+ 0]
    pmaxsd m1, m12, [cq+128*4+ 0]
    pmaxsd m2, m12, [cq+128*0+32]
    pmaxsd m3, m12, [cq+128*4+32]
    pmaxsd m4, m12, [cq+128*0+64]
    pmaxsd m5, m12, [cq+128*4+64]
    pmaxsd m6, m12, [cq+128*0+96]
    pmaxsd m7, m12, [cq+128*4+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
.pass2_end:
    ; Final 32-point stage with a 4-bit shift and packing to words. Outputs
    ; 0..15 stay in registers, outputs 16..31 go to the block at r5.
    psrld m11, 8 ; pd_8
    IDCT32_END 0, 15, 8, 9, 10, 4
    IDCT32_END 1, 14, 8, 9, 10, 4
    punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
    punpcklqdq m0, m1 ; 0 1 (interleaved)
    punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
    punpckhqdq m14, m15 ; 30 31 (interleaved)
    mova [r5+32*3], m8
    mova [r5+32*2], m14
    IDCT32_END 2, 15, 8, 9, 10, 4
    IDCT32_END 3, 14, 8, 9, 10, 4
    punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
    punpcklqdq m2, m3 ; 2 3 (interleaved)
    punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
    punpckhqdq m14, m15 ; 28 29 (interleaved)
    mova [r5+32*1], m8
    mova [r5+32*0], m14
    IDCT32_END 4, 15, 8, 9, 10, 4
    IDCT32_END 5, 14, 8, 9, 10, 4
    punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
    punpcklqdq m4, m5 ; 4 5 (interleaved)
    punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
    punpckhqdq m14, m15 ; 26 27 (interleaved)
    mova [r5-32*1], m8
    mova [r5-32*2], m14
    IDCT32_END 6, 15, 8, 9, 10, 4
    IDCT32_END 7, 14, 8, 9, 10, 4
    punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
    punpcklqdq m6, m7 ; 6 7 (interleaved)
    punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
    punpckhqdq m14, m15 ; 24 25 (interleaved)
    mova [r5-32*3], m8
    mova [r5-32*4], m14
    mova m15, m1 ; rows 14/15 move to m15 because m1 is a writer input
.end:
    ; Write rows in ascending order, two vectors (four rows) per call:
    ; 0-3, 4-7, 8-11, 12-15 from registers, then 16-31 from the r5 block.
    vpermq m0, m0, q3120
    vpermq m1, m2, q3120
    call m(idct_8x8_internal_12bpc).write_8x4_start
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m4, q3120
    vpermq m1, m6, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m7, q3120
    vpermq m1, m5, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m3, q3120
    vpermq m1, m15, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5+32*3], q3120
    vpermq m1, [r5+32*1], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5-32*1], q3120
    vpermq m1, [r5-32*3], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5-32*4], q3120
    vpermq m1, [r5-32*2], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5+32*0], q3120
    vpermq m1, [r5+32*2], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    RET
.dconly:
    ; DC-only path (eob == 0), identical to the 10 bpc version apart from the
    ; dconly constant: r6d = (dc * 181 + 640) >> 10.
    imul r6d, [cq], 181
    vpbroadcastd m2, [dconly_12bpc]
    mov [cq], eobd ; 0
    ; NOTE(review): r3d is presumably the same register as eobd (4th cglobal
    ; argument), making this "r3d = 32", the row count - confirm.
    or r3d, 32
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main:
    ; One column group: 8-point DCT via the 10 bpc helper, 8x8 dword
    ; transpose, results written back over the inputs, cq advanced by 32.
    call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
    mova [cq+128*0], m0
    mova [cq+128*1], m1
    mova [cq+128*2], m2
    mova [cq+128*3], m3
    mova [cq+128*4], m4
    mova [cq+128*5], m5
    mova [cq+128*6], m6
    mova [cq+128*7], m7
    add cq, 32
    ret
ALIGN function_align
.main_end:
    ; Final 32-point stage without packing (7th IDCT32_END argument is 0),
    ; with a 2-bit shift. Outputs 16..31 are stored at [cq+32*16..31];
    ; outputs 0..15 are returned in m0-m15. Some low outputs are parked in the
    ; coefficient buffer along the way because their registers are needed as
    ; macro temporaries.
    psrld m11, 10 ; pd_2
    IDCT32_END 0, 15, 8, 9, 10, 2, 0
    mova [cq+32*16], m8
    mova [cq+32*31], m9
    IDCT32_END 1, 14, 8, 9, 10, 2, 0
    mova [cq+32*17], m8
    mova [cq+32*30], m9
    mova [cq+32*14], m14
    IDCT32_END 2, 14, 8, 9, 10, 2, 0
    mova [cq+32*18], m8
    mova [cq+32*29], m9
    mova [cq+32*13], m14
    IDCT32_END 3, 14, 8, 9, 10, 2, 0
    mova [cq+32*19], m8
    mova [cq+32*28], m9
    mova [cq+32*12], m14
    IDCT32_END 4, 14, 8, 9, 10, 2, 0
    mova [cq+32*20], m8
    mova [cq+32*27], m9
    ; m0-m2 become the temporaries for the last three invocations, which
    ; leave their second outputs directly in m10, m9 and m8.
    mova [cq+32* 0], m0
    mova [cq+32* 1], m1
    mova [cq+32* 2], m2
    IDCT32_END 5, 10, 0, 1, 2, 2, 0
    mova [cq+32*21], m0
    mova [cq+32*26], m1
    IDCT32_END 6, 9, 0, 1, 2, 2, 0
    mova [cq+32*22], m0
    mova [cq+32*25], m1
    IDCT32_END 7, 8, 0, 1, 2, 2, 0
    mova [cq+32*23], m0
    mova [cq+32*24], m1
    mova m0, [cq+32* 0]
    mova m1, [cq+32* 1]
    mova m2, [cq+32* 2]
    mova m11, m14 ; second output of the n = 4 invocation
    mova m12, [cq+32*12]
    mova m13, [cq+32*13]
    mova m14, [cq+32*14]
    ret

; 12 bpc wrapper: only the pixel maximum differs from the 10 bpc version.
cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1

; 32x8 DCT_DCT inverse transform + add, 10 bpc.
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jnz .full
    ; DC-only path. eobd is zero here, so the store clears the DC coefficient.
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    ; NOTE(review): r3d is presumably the same register as eobd; it is used
    ; as the row counter of .dconly_loop below - confirm.
    or r3d, 8
.dconly:
    ; Shared with the 12 bpc version, which enters here with its own m3.
    add r6d, 640
    sar r6d, 10
.dconly2:
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    movd xm0, r6d
    paddsw xm0, xm3
    vpbroadcastw m0, xm0
.dconly_loop:
    ; Per row: signed-saturating add of (dc + m3) to 2x16 pixels, followed by
    ; an unsigned-saturating subtract of m3.
    paddsw m1, m0, [dstq+32*0]
    paddsw m2, m0, [dstq+32*1]
    psubusw m1, m3
    psubusw m2, m3
    mova [dstq+32*0], m1
    mova [dstq+32*1], m2
    add dstq, strideq
    dec r3d
    jg .dconly_loop
    RET
.full:
    PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
    lea r6, [rsp+32*4]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    call .pass1
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
    lea r6, [deint_shuf+128]
    vpbroadcastd m11, [pw_2048]
    mov r4, dstq ; remember the left edge for the second .pass2 call
    call .pass2
    ; Reload outputs 16..31 (stored by main_end in the block at r5) for the
    ; second 16-pixel-wide half.
    mova m0, [r5+32*3] ; 16 17
    mova m1, [r5+32*2] ; 30 31
    mova m2, [r5+32*1] ; 18 19
mova m3, [r5+32*0] ; 28 29 mova m4, [r5-32*1] ; 20 21 mova m5, [r5-32*2] ; 26 27 mova m6, [r5-32*3] ; 22 23 mova m7, [r5-32*4] ; 24 25 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose lea dstq, [r4+32] call .pass2 RET ALIGN function_align .pass2: call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 pmulhrsw m2, m11, m6 pmulhrsw m3, m11, m7 jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .pass1: mova m0, [cq+32* 1] mova m1, [cq+32* 7] mova m2, [cq+32* 9] mova m3, [cq+32*15] mova m4, [cq+32*17] mova m5, [cq+32*23] mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] mova m2, [cq+32*11] mova m3, [cq+32*13] mova m4, [cq+32*19] mova m5, [cq+32*21] mova m6, [cq+32*27] mova m7, [cq+32*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+32* 2] mova m1, [cq+32* 6] mova m2, [cq+32*10] mova m3, [cq+32*14] mova m4, [cq+32*18] mova m5, [cq+32*22] mova m6, [cq+32*26] mova m7, [cq+32*30] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+32* 0] mova m1, [cq+32* 4] mova m2, [cq+32* 8] mova m3, [cq+32*12] mova m4, [cq+32*16] mova m5, [cq+32*20] mova m6, [cq+32*24] mova m7, [cq+32*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m5, [pw_4096] pxor m6, m6 mov r6d, eobd add eobb, 21 cmovc eobd, r6d lea r6, [strideq*3] lea r5, [strideq*5] lea r4, [strideq+r6*2] ; strideq*7 .loop: mova m0, [cq+32*0] packssdw m0, [cq+32*1] mova m1, [cq+32*2] packssdw m1, [cq+32*3] REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 add cq, 32*8 mova m2, [cq-32*4] packssdw m2, [cq-32*3] mova m3, [cq-32*2] packssdw m3, [cq-32*1] REPX 
{pmulhrsw x, m5}, m0, m1, m2, m3 REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 call m(inv_txfm_add_identity_identity_8x32_10bpc).main add dstq, 16 sub eobd, 64 jge .loop RET cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 181 vpbroadcastd m3, [dconly_12bpc] mov [cq], eobd ; 0 or r3d, 8 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob lea r6, [rsp+32*4] vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end mov r4, dstq call m(idct_16x8_internal_12bpc).pass2_main mova m0, [cq+32* 0] ; 16 mova m1, [cq+32* 1] ; 17 mova m2, [cq+32* 2] ; 18 mova m3, [cq+32* 3] ; 19 mova m4, [cq+32* 4] ; 20 mova m5, [cq+32* 5] ; 21 mova m6, [cq+32* 6] ; 22 mova m7, [cq+32* 7] ; 23 mova m8, [cq+32* 8] ; 24 mova m9, [cq+32* 9] ; 25 mova m10, [cq+32*10] ; 26 mova m11, [cq+32*11] ; 27 mova m12, [cq+32*12] ; 28 mova m13, [cq+32*13] ; 29 mova m14, [cq+32*14] ; 30 mova m15, [cq+32*15] ; 31 lea dstq, [r4+32] call m(idct_16x8_internal_12bpc).pass2_main RET cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 psubsw m%1, m%4 %if %1 == 0 pxor m6, m6 %endif pmulhrsw m%3, m15 pmulhrsw m%1, m15 paddw m%3, [dstq+%5] paddw m%1, [r2+%6] pmaxsw m%3, m6 pmaxsw m%1, m6 pminsw m%3, m7 pminsw m%1, m7 mova [dstq+%5], m%3 mova [r2+%6], m%1 %endmacro cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*16] lea r4, [r6+32*8] lea r5, [r6+32*16] call .main 
sub eobd, 44 jge .eob44 vperm2i128 m2, m0, m3, 0x31 ; 5 vinserti128 m0, xm3, 1 ; 1 vperm2i128 m3, m1, m4, 0x31 ; 7 vinserti128 m1, xm4, 1 ; 3 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 mova [r4+16*2], xm1 mova [r4+16*3], xm4 vextracti128 [r4+16*4], m0, 1 vextracti128 [r4+16*5], m3, 1 vextracti128 [r4+16*6], m1, 1 vextracti128 [r4+16*7], m4, 1 call .main sub eobd, 107 jge .eob151 vperm2i128 m7, m1, m4, 0x31 ; 15 vinserti128 m5, m1, xm4, 1 ; 11 vperm2i128 m6, m0, m3, 0x31 ; 13 vinserti128 m4, m0, xm3, 1 ; 9 mova m0, [r4+32*0] mova m1, [r4+32*1] mova m2, [r4+32*2] mova m3, [r4+32*3] .fast: lea r6, [pw_5+128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 .eob151: mova [r4-16*8], xm0 mova [r4-16*7], xm3 mova [r4-16*6], xm1 mova [r4-16*5], xm4 vextracti128 [r4-16*4], m0, 1 vextracti128 [r4-16*3], m3, 1 vextracti128 [r4-16*2], m1, 1 vextracti128 [r4-16*1], m4, 1 call .main sub eobd, 128 jge .eob279 vperm2i128 m10, m0, m3, 0x31 ; 21 vinserti128 m8, m0, xm3, 1 ; 17 vperm2i128 m11, m1, m4, 0x31 ; 23 vinserti128 m9, m1, xm4, 1 ; 19 pxor m12, m12 REPX {mova x, m12}, m13, m14, m15 REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 jmp .full .eob279: mova [r5+16*0], xm0 mova [r5+16*1], xm3 mova [r5+16*2], xm1 mova [r5+16*3], xm4 vextracti128 [r5+16*4], m0, 1 vextracti128 [r5+16*5], m3, 1 vextracti128 [r5+16*6], m1, 1 vextracti128 [r5+16*7], m4, 1 call .main vperm2i128 m14, m0, m3, 0x31 ; 29 vinserti128 m12, m0, xm3, 1 ; 25 vperm2i128 m15, m1, m4, 0x31 ; 31 vinserti128 m13, m1, xm4, 1 ; 27 mova m8, [r5+32*0] mova m9, [r5+32*1] mova m10, [r5+32*2] mova m11, [r5+32*3] .full: mova m0, [r4+32*0] mova m1, [r4+32*1] mova m2, [r4+32*2] mova m3, 
[r4+32*3] mova m4, [r4-32*4] mova m5, [r4-32*3] mova m6, [r4-32*2] mova m7, [r4-32*1] lea r6, [pw_5 + 128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea r3, [rsp+32*8] mova m8, [r3+32*0] mova m9, [r3+32*1] mova m10, [r3+32*2] mova m11, [r3+32*3] mova m12, [r3-32*4] mova m13, [r3-32*3] mova m14, [r3-32*2] mova m15, [r3-32*1] .idct16: lea r3, [rsp+32*16] mova m0, [r3+32*0] mova m1, [r3+32*1] mova m2, [r3+32*2] mova m3, [r3+32*3] mova m4, [r3-32*4] mova m5, [r3-32*3] mova m6, [r3-32*2] mova m7, [r3-32*1] mova [rsp], m15 call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq call .pass2_end RET ALIGN function_align .main: pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128* 3] pmulld m2, m14, [cq+128* 5] pmulld m3, m14, [cq+128* 7] pmulld m4, m14, [cq+128* 9] pmulld m5, m14, [cq+128*11] pmulld m6, m14, [cq+128*13] pmulld m7, m14, [cq+128*15] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 2] pmulld m2, m14, [cq+128* 4] pmulld m3, m14, [cq+128* 6] pmulld m4, m14, [cq+128* 8] pmulld m5, m14, [cq+128*10] pmulld m6, m14, [cq+128*12] pmulld m7, m14, [cq+128*14] call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf psrld m15, m11, 11 ; pd_1 mova m8, [r6-32*4] mova m9, [r6-32*3] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m10, m0, m8 ; out15 paddd m0, m8 ; out0 mova m8, [r6-32*2] paddd m15, m1, m9 ; out1 psubd m1, m9 ; out14 mova m9, [r6-32*1] REPX {psrad x, 1}, m0, m15, m10, m1 packssdw m0, m15 packssdw m1, m10 psubd m10, m2, m8 ; out13 paddd m2, m8 ; out2 mova m8, [r6+32*0] paddd m15, m3, m9 ; out3 psubd m3, m9 ; out12 mova m9, [r6+32*1] REPX {psrad x, 1}, m2, m15, m10, m3 packssdw m2, m15 packssdw m3, m10 psubd m10, m4, m8 ; out11 paddd m4, m8 ; out4 mova m8, [r6+32*2] paddd m15, m5, m9 ; out5 psubd m5, m9 ; out10 mova m9, [r6+32*3] REPX {psrad x, 1}, m4, m10, m15, m5 packssdw m4, m15 packssdw m5, m10 psubd m10, m6, m8 ; 
out9 paddd m6, m8 ; out6 paddd m15, m7, m9 ; out7 psubd m7, m9 ; out8 REPX {psrad x, 1}, m6, m10, m15, m7 packssdw m6, m15 packssdw m7, m10 punpckhwd m8, m0, m2 punpcklwd m0, m2 punpckhwd m2, m3, m1 punpcklwd m3, m1 punpckhwd m1, m4, m6 punpcklwd m4, m6 punpcklwd m6, m7, m5 punpckhwd m7, m5 pxor m5, m5 mov r7d, 128*13 .main_zero_loop: mova [cq+r7-128*1], m5 mova [cq+r7+128*0], m5 mova [cq+r7+128*1], m5 mova [cq+r7+128*2], m5 sub r7d, 128*4 jg .main_zero_loop add cq, 32 punpcklwd m5, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m1 punpckhwd m4, m1 punpckhwd m1, m0, m8 punpcklwd m0, m8 punpckhwd m8, m6, m7 punpcklwd m6, m7 punpcklqdq m7, m1, m4 punpckhqdq m1, m4 punpckhqdq m4, m8, m3 punpcklqdq m8, m3 punpckhqdq m3, m6, m5 punpcklqdq m6, m5 punpcklqdq m5, m0, m2 punpckhqdq m0, m2 mova [r6+16*0], xm5 mova [r6+16*1], xm6 mova [r6+16*2], xm7 mova [r6+16*3], xm8 vextracti128 [r6+16*4], m5, 1 vextracti128 [r6+16*5], m6, 1 vextracti128 [r6+16*6], m7, 1 vextracti128 [r6+16*7], m8, 1 sub r6, 32*4 ret ALIGN function_align .pass2_end: mova [rsp+gprsize+32*0], m6 mova [rsp+gprsize+32*2], m7 mova [rsp+gprsize+32*3], m15 vpbroadcastd m15, [pw_2048] vpbroadcastd m7, [pixel_10bpc_max] IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*1] IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*0] IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, 
[rsp+gprsize+32*2] ; wrapped operand of the `mova m1,` that ends the previous line
    ; last group of the 16x32 .pass2_end sequence: m2 is reloaded from the
    ; stack slot at rsp+gprsize+32*3 before the final four row pairs
    mova m2, [rsp+gprsize+32*3]
    IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
    IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
    IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
    IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
    ret

; 16x32 identity/identity inverse transform + add, 10 bpc.
; Arguments (cglobal): dst, stride, c (coefficients), eob.
; m7 = pixel maximum; .pass1 is the shared entry for the 12 bpc variant.
; Work is done by up to eight .main calls; after the 1st, 3rd, 5th and 7th
; call the running eob is reduced and the function returns early when it goes
; negative (thresholds 36, 143, 271, 399 per the existing comments).
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m8, [pw_2896x8]
    vpbroadcastd m9, [pw_1697x16]
    vpbroadcastd m11, [pw_8192]
    lea r6, [strideq*5]
    pxor m6, m6 ; zero, used to clear consumed coefficients
    paddw m10, m11, m11 ; pw_16384
    mov r5, dstq ; r5 tracks the left edge of the current 8-row band
    call .main
    sub eobd, 36
    jl .ret
    ; same band, 16 bytes to the right; coefficients 128*8 bytes further on
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    ; next band: 8 rows down, back to the left edge; cq rewinds to the next
    ; 32-byte column of the first coefficient group
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
.ret:
    RET

ALIGN function_align
; .main: loads eight 32-byte dword vectors at cq+128*0..7, packs them to four
; word vectors, applies pmulhrsw by m8, the IDTX16 macro, and pmulhrsw by m11,
; then clears the eight source vectors with m6.
.main:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
    REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
; .main2: word/qword interleave of m0-m3 followed by two write_2x8x2 calls
; (the second as a tail jump). Also entered directly from the 32x16 identity
; function's .main.
.main2:
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m4
    punpcklwd m0, m4
    punpcklwd m4, m2, m1
    punpckhwd m2, m1
    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    call m(iidentity_8x8_internal_10bpc).write_2x8x2
    punpcklqdq m0, m3, m2
    punpckhqdq m1, m3, m2
    jmp m(iidentity_8x8_internal_10bpc).write_2x8x2

; 12 bpc entry point for the 16x32 identity/identity transform: sets the pixel
; maximum and reuses the 10 bpc .pass1.
cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1

cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4,
7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*4] call .main cmp eobd, 36 jge .full call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 mova [r4+32*3], m0 mova [r4+32*2], m1 mova [r4+32*1], m2 mova [r4+32*0], m3 mova [r4-32*1], m4 mova [r4-32*2], m5 mova [r4-32*3], m6 mova [r4-32*4], m7 call .main sub r4, 32*16 ; topleft 16x8 call .transpose_16x16 lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] add r4, 32*8 ; bottomleft 16x8 call .transpose_16x16 .end: lea dstq, [r7+32] call m(idct_16x16_internal_8bpc).main call .write_16x16 RET ALIGN function_align .transpose_16x16: punpckhdq m8, m3, m1 punpckldq m3, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckhdq m2, m7, m5 punpckldq m7, m5 punpckhdq m5, m4, m6 punpckldq m4, m6 punpckhqdq m6, m0, m4 punpcklqdq m0, m4 punpckhqdq m4, m1, m5 punpcklqdq m1, m5 punpckhqdq m5, m7, m3 punpcklqdq m7, m3 punpckhqdq m3, m2, m8 punpcklqdq m2, m8 vinserti128 m8, m0, xm7, 1 vperm2i128 m12, m0, m7, 0x31 vinserti128 m9, m6, xm5, 1 vperm2i128 m13, m6, m5, 0x31 vinserti128 m10, 
m1, xm2, 1 vperm2i128 m14, m1, m2, 0x31 vinserti128 m11, m4, xm3, 1 vperm2i128 m15, m4, m3, 0x31 mova m0, [r4+32*3] mova m1, [r4+32*2] mova m2, [r4+32*1] mova m3, [r4+32*0] mova m4, [r4-32*1] mova m5, [r4-32*2] mova m6, [r4-32*3] mova m7, [r4-32*4] mova [rsp+gprsize], m15 jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose ALIGN function_align .main: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] pmulld m0, m14, [cq+64* 1] pmulld m1, m14, [cq+64* 7] pmulld m2, m14, [cq+64* 9] pmulld m3, m14, [cq+64*15] pmulld m4, m14, [cq+64*17] pmulld m5, m14, [cq+64*23] pmulld m6, m14, [cq+64*25] pmulld m7, m14, [cq+64*31] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+64* 3] pmulld m1, m14, [cq+64* 5] pmulld m2, m14, [cq+64*11] pmulld m3, m14, [cq+64*13] pmulld m4, m14, [cq+64*19] pmulld m5, m14, [cq+64*21] pmulld m6, m14, [cq+64*27] pmulld m7, m14, [cq+64*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+64* 2] pmulld m1, m14, [cq+64* 6] pmulld m2, m14, [cq+64*10] pmulld m3, m14, [cq+64*14] pmulld m4, m14, [cq+64*18] pmulld m5, m14, [cq+64*22] pmulld m6, m14, [cq+64*26] pmulld m7, m14, [cq+64*30] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+64* 0] pmulld m1, m14, [cq+64* 4] pmulld m2, m14, [cq+64* 8] pmulld m3, m14, [cq+64*12] pmulld m4, m14, [cq+64*16] pmulld m5, m14, [cq+64*20] pmulld m6, m14, [cq+64*24] pmulld m7, m14, [cq+64*28] call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf pxor m8, m8 mov r7d, 64*30 .main_zero_loop: mova [cq+r7-64*2], m8 mova [cq+r7-64*1], m8 mova [cq+r7+64*0], m8 mova [cq+r7+64*1], m8 sub r7d, 64*4 jg .main_zero_loop .main_end: psrld m11, 11 ; pd_1 IDCT32_END 0, 15, 8, 9, 10, 1 IDCT32_END 1, 14, 8, 9, 10, 1 punpckhwd m8, m0, m1 ; 16 17 punpcklwd m0, m1 ; 0 1 punpcklwd m1, m14, m15 ; 14 15 punpckhwd m14, m15 ; 30 31 mova [r5+32*3], m8 mova [r5+32*2], m14 IDCT32_END 2, 15, 8, 9, 10, 1 IDCT32_END 3, 14, 8, 
9, 10, 1 ; wrapped operand tail of the IDCT32_END that ends the previous line
    ; remainder of the 32x16 .main_end: each IDCT32_END pair is interleaved
    ; with punpck{l,h}wd; low halves stay in registers, the high halves are
    ; stored around r5 (row pairs per the existing comments)
    punpckhwd m8, m2, m3 ; 18 19
    punpcklwd m2, m3 ; 2 3
    punpcklwd m3, m14, m15 ; 12 13
    punpckhwd m14, m15 ; 28 29
    mova [r5+32*1], m8
    mova [r5+32*0], m14
    IDCT32_END 4, 15, 8, 9, 10, 1
    IDCT32_END 5, 14, 8, 9, 10, 1
    punpckhwd m8, m4, m5 ; 20 21
    punpcklwd m4, m5 ; 4 5
    punpcklwd m5, m14, m15 ; 10 11
    punpckhwd m14, m15 ; 26 27
    mova [r5-32*1], m8
    mova [r5-32*2], m14
    IDCT32_END 6, 15, 8, 9, 10, 1
    IDCT32_END 7, 14, 8, 9, 10, 1
    punpckhwd m8, m6, m7 ; 22 23
    punpcklwd m6, m7 ; 6 7
    punpcklwd m7, m14, m15 ; 8 9
    punpckhwd m14, m15 ; 24 25
    mova [r5-32*3], m8
    mova [r5-32*4], m14
    ret

ALIGN function_align
; .write_16x16: scales sixteen result vectors by pw_2048 (pmulhrsw) and writes
; them through four write_16x4 calls (the last one as a tail jump).
; m1 is first reloaded from rsp+gprsize+32*1; m8, m9 and m12 are spilled to
; the stack because they are reused here as zero, pixel maximum and
; multiplier respectively.
.write_16x16:
    mova m1, [rsp+gprsize+32*1]
    mova [rsp+gprsize+32*0], m8
    mova [rsp+gprsize+32*1], m9
    mova [rsp+gprsize+32*2], m12
    vpbroadcastd m12, [pw_2048]
    vpbroadcastd m9, [pixel_10bpc_max]
    lea r3, [strideq*3]
    pxor m8, m8
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    pmulhrsw m2, m12
    pmulhrsw m3, m12
    call m(idct_16x8_internal_10bpc).write_16x4
    pmulhrsw m0, m12, m4
    pmulhrsw m1, m12, m5
    pmulhrsw m2, m12, m6
    pmulhrsw m3, m12, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    ; the former m8/m9 come back from their stack slots
    pmulhrsw m0, m12, [rsp+gprsize+32*0]
    pmulhrsw m1, m12, [rsp+gprsize+32*1]
    pmulhrsw m2, m12, m10
    pmulhrsw m3, m12, m11
    call m(idct_16x8_internal_10bpc).write_16x4
    ; the former m12 comes back from its stack slot
    pmulhrsw m0, m12, [rsp+gprsize+32*2]
    pmulhrsw m1, m12, m13
    pmulhrsw m2, m12, m14
    pmulhrsw m3, m12, m15
    jmp m(idct_16x8_internal_10bpc).write_16x4

; 32x16 identity/identity inverse transform + add, 10 bpc.
; Arguments (cglobal): dst, stride, c (coefficients), eob.
; m7 = pixel maximum; .pass1 is the shared entry for the 12 bpc variant.
; Up to eight .main calls; the running eob is reduced after the 1st, 3rd, 5th
; and 7th call and the function returns early once it goes negative.
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m8, [pw_2896x8]
    vpbroadcastd m9, [pw_1697x16]
    vpbroadcastd m10, [pw_4096]
    lea r6, [strideq*5]
    pxor m6, m6 ; zero, used to clear consumed coefficients
    mov r5, dstq ; r5 keeps the original dst as the base for column steps
    call .main
    sub eobd, 36
    jl .ret
    ; next 32-byte coefficient column; output continues from the dstq left by
    ; the previous .main call, advanced by 4 rows
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    ; next coefficient group; output restarts 16 bytes right of the base
    add cq, 64*8-32
    lea dstq, [r5+16*1]
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    add cq, 64*8-32
    lea dstq, [r5+16*2]
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    add cq, 64*8-32
    lea dstq, [r5+16*3]
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
.ret:
    RET

ALIGN function_align
; .main: loads eight 32-byte dword vectors at cq+64*0..7, packs them to four
; word vectors, applies pmulhrsw by m8, doubles them (saturating), applies the
; IDTX16 macro and pmulhrsw by m10, clears the source vectors with m6, and
; finishes in the 16x32 identity function's .main2.
.main:
    mova m0, [cq+64*0]
    packssdw m0, [cq+64*1]
    mova m1, [cq+64*2]
    packssdw m1, [cq+64*3]
    mova m2, [cq+64*4]
    packssdw m2, [cq+64*5]
    mova m3, [cq+64*6]
    packssdw m3, [cq+64*7]
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
    REPX {paddsw x, x }, m0, m1, m2, m3
    REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
    REPX {pmulhrsw x, m10}, m0, m1, m2, m3
    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2

; 12 bpc entry point for the 32x16 identity/identity transform: sets the pixel
; maximum and reuses the 10 bpc .pass1.
cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1

; 32x32 DCT/DCT inverse transform + add, 10 bpc.
; Arguments (cglobal): dst, stride, c (coefficients), eob.
; eob == 0 -> .dconly. Otherwise .main runs one to four times; after each of
; the first three runs eob is compared against 36 / 136 / 300 and a smaller
; eob skips the remaining runs via .fast, which zero-fills the rest of the
; intermediate buffer instead.
cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    lea r6, [rsp+32*7]
    call .main
    cmp eobd, 36
    jl .fast
    call .main
    cmp eobd, 136
    jl .fast
    call .main
    cmp eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    ; r6d = c[0] * 181, coefficient cleared (eobd is zero here), r3d |= 32 as
    ; the row count, then continue in the 32x8 function's .dconly code
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 32
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
.fast:
    ; zero-fill from the current r6 position up to rsp+32*71,
    ; eight 32-byte vectors per iteration
    lea r4, [rsp+32*71]
    pxor m0, m0
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    ; first half: inputs read relative to rsp+32*3
    lea r3, [rsp+32*3]
    mov r4, r6
    lea r5, [r6+32*8]
    lea r6, [pw_5+128]
    call .pass2_oddhalf
    call .pass2_evenhalf
    ; r2 = dstq + stride*19, r3 = stride*3 as set up for the shared
    ; 16x32 .pass2_end
    imul r2, strideq, 19
    lea r3, [strideq*3]
    add r2, dstq
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    ; second half: undo the pointer movement applied during .pass2_end
    ; (stride*3) and step both pointers 32 bytes to the right; inputs are
    ; read relative to rsp+32*11
    sub dstq, r3
    lea r2, [r2+r3+32]
    add dstq, 32
    lea r3, [rsp+32*11]
    call .pass2_oddhalf
    call .pass2_evenhalf
    lea r3, [strideq*3]
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    RET

ALIGN function_align
; .main: first-pass routine; begins by loading coefficient vectors
; 1,7,9,15,17,23,25,31 at a 128-byte pitch (continues on the next line).
.main:
    mova m0, [cq+128* 1]
    mova m1, [cq+128* 7]
    mova m2, [cq+128* 9]
    mova m3, [cq+128*15]
    mova m4, [cq+128*17]
    mova m5, [cq+128*23]
    mova m6, [cq+128*25]
    mova m7, [cq+128*31]
    vpbroadcastd m11,
[pd_2048] vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+128* 3] mova m1, [cq+128* 5] mova m2, [cq+128*11] mova m3, [cq+128*13] mova m4, [cq+128*19] mova m5, [cq+128*21] mova m6, [cq+128*27] mova m7, [cq+128*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+128* 2] mova m1, [cq+128* 6] mova m2, [cq+128*10] mova m3, [cq+128*14] mova m4, [cq+128*18] mova m5, [cq+128*22] mova m6, [cq+128*26] mova m7, [cq+128*30] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 4] mova m2, [cq+128* 8] mova m3, [cq+128*12] mova m4, [cq+128*16] mova m5, [cq+128*20] mova m6, [cq+128*24] mova m7, [cq+128*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 ret ALIGN function_align .pass2_oddhalf: mova m0, [r3+32* 1] ; 1 mova m1, [r3+32* 3] ; 3 mova m2, [r3+32* 5] ; 5 mova m3, [r3+32* 7] ; 7 mova m4, [r3+32*17] ; 9 mova m5, [r3+32*19] ; 11 mova m6, [r3+32*21] ; 13 mova m7, [r3+32*23] ; 15 mova m8, [r3+32*33] ; 17 mova m9, [r3+32*35] ; 19 mova m10, [r3+32*37] ; 21 mova m11, [r3+32*39] ; 23 mova m12, [r3+32*49] ; 25 mova m13, [r3+32*51] ; 27 mova m14, [r3+32*53] ; 29 mova m15, [r3+32*55] ; 31 jmp 
m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf ; wrapped target of the `jmp` that ends the previous line

ALIGN function_align
; .pass2_evenhalf (32x32 DCT): loads the sixteen even-numbered inputs (labels
; per the existing comments) from r3 -- four vectors at every second 32-byte
; slot within each group starting at slot 0, 16, 32 and 48 -- spills m15 to
; [rsp+gprsize] and tail-jumps to the 8 bpc 16x16 IDCT main.
.pass2_evenhalf:
    mova m0, [r3+32* 0] ; 0
    mova m1, [r3+32* 2] ; 2
    mova m2, [r3+32* 4] ; 4
    mova m3, [r3+32* 6] ; 6
    mova m4, [r3+32*16] ; 8
    mova m5, [r3+32*18] ; 10
    mova m6, [r3+32*20] ; 12
    mova m7, [r3+32*22] ; 14
    mova m8, [r3+32*32] ; 16
    mova m9, [r3+32*34] ; 18
    mova m10, [r3+32*36] ; 20
    mova m11, [r3+32*38] ; 22
    mova m12, [r3+32*48] ; 24
    mova m13, [r3+32*50] ; 26
    mova m14, [r3+32*52] ; 28
    mova m15, [r3+32*54] ; 30
    mova [rsp+gprsize], m15
    jmp m(idct_16x16_internal_8bpc).main

; 32x32 identity/identity inverse transform + add, 10 bpc.
; Arguments (cglobal): dst, stride, c (coefficients), eob.
; m7 = pixel maximum; .pass1 is the shared entry for the 12 bpc variant.
; Blocks are processed in groups of one .main call followed by zero or more
; .main2 calls; between groups eob is compared against 36, 136, 300, 535, 755
; and 911 and the function returns as soon as eob is below the threshold.
; The digit columns in the trailing comments appear to be the original
; authors' diagram of this traversal (its column alignment is lost here).
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
%undef cmp
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m5, [pw_8192]
    pxor m6, m6
    lea r6, [strideq*3]
    lea r5, [strideq*5]
    lea r4, [strideq+r6*2] ; strideq*7
    call .main ; 0
    cmp eobd, 36
    jl .ret
    add cq, 128*8 ; 0 1
    mov r7, dstq ; 1
    add dstq, 16
    call .main
    call .main2
    cmp eobd, 136
    jl .ret
    add cq, 128*16-32 ; 0 1 2
    lea dstq, [r7+16*2] ; 1 2
    call .main ; 2
    call .main2
    call .main2
    cmp eobd, 300
    jl .ret
    add cq, 128*24-64 ; 0 1 2 3
    add r7, 16*3 ; 1 2 3
    mov dstq, r7 ; 2 3
    call .main ; 3
    call .main2
    call .main2
    call .main2
    cmp eobd, 535
    jl .ret
    add cq, 128*24-64 ; 0 1 2 3
    lea dstq, [r7+strideq*8] ; 1 2 3 4
    mov r7, dstq ; 2 3 4
    call .main ; 3 4
    call .main2
    call .main2
    cmp eobd, 755
    jl .ret
    add cq, 128*16-32 ; 0 1 2 3
    lea dstq, [r7+strideq*8] ; 1 2 3 4
    call .main ; 2 3 4 5
    call .main2 ; 3 4 5
    cmp eobd, 911
    jl .ret
    add cq, 128*8 ; 0 1 2 3
    add dstq, 16 ; 1 2 3 4
    call .main ; 2 3 4 5
.ret: ; 3 4 5 6
    RET

ALIGN function_align
; .main2: rewinds cq by 128*8-32 bytes and moves dstq 8 rows down and 16 bytes
; left, then falls through into .main.
.main2:
    sub cq, 128*8-32
    lea dstq, [dstq+strideq*8-16]
; .main: packs eight 32-byte dword vectors at cq+128*0..7 into four word
; vectors, scales them by m5 (pmulhrsw) and continues in the 8x32 identity
; function's .main_zero.
.main:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero

; 12 bpc entry point for the 32x32 identity/identity transform: sets the pixel
; maximum and reuses the 10 bpc .pass1.
cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1

; IDCT64_PART2_END: final butterflies of the 64-point pass 2 for one group of
; four output rows, followed by add-to-destination.
;   %1 = output index n, %2/%3 = source registers, %4-%6 = temporaries,
;   %7-%10 = destination offsets.
; Two intermediate values are loaded relative to r4/r5; which pointer and
; which displacement formula is used depends on whether %1 is odd or even.
; The four results are scaled by m14 (pmulhrsw), added to the destination
; rows, clamped to [0, pixel_10bpc_max] and stored. For odd %1 the first and
; third rows are addressed from r2 and the others from dstq; for even %1 the
; roles are swapped. m%2 is overwritten (zero, then the pixel maximum), and
; the maximum is hard-coded to the 10 bpc constant.
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
    mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
    mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
%else
    mova m%5, [r4-32*(45-%1)]
    mova m%4, [r5-32*(20+%1)]
%endif
    paddsw m%6, m%5, m%4 ; idct32 out 0+n
    psubsw m%5, m%4 ; idct32 out31-n
    paddsw m%4, m%5, m%3 ; out31-n
    psubsw m%5, m%3 ; out32+n
    paddsw m%3, m%6, m%2 ; out 0+n
    psubsw m%6, m%2 ; out63-n
    REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
%if %1 & 1
    %define %%d0 r2
    %define %%d1 dstq
%else
    %define %%d0 dstq
    %define %%d1 r2
%endif
    paddw m%3, [%%d0+%7 ]
    paddw m%4, [%%d1+%8 ]
    paddw m%5, [%%d0+%9 ]
    paddw m%6, [%%d1+%10]
    pxor m%2, m%2
    REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
    vpbroadcastd m%2, [pixel_10bpc_max]
    REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
    mova [%%d0+%7 ], m%3
    mova [%%d1+%8 ], m%4
    mova [%%d0+%9 ], m%5
    mova [%%d1+%10], m%6
%endmacro

; 16x64 DCT/DCT inverse transform + add, 10 bpc (the body continues on the
; following lines).
; eob == 0 -> .dconly. Otherwise .main runs one to four times; the running eob
; is reduced by 44 / 107 / 128 after the first three runs and a negative
; result skips the remaining runs via .fast, which zero-fills the rest of the
; intermediate buffer up to rsp+32*38.
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*6]
    call .main
    sub eobd, 44
    jl .fast
    call .main
    sub eobd, 107
    jl .fast
    call .main
    sub eobd, 128
    jl .fast
    call .main
    jmp .pass2
.dconly:
    ; r6d = (c[0] * 181 + 640) >> 10, coefficient cleared (eobd is zero here),
    ; r3d |= 64 as the row count, then continue in the 16x4 function's .dconly3
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 64
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
.fast:
    lea r4, [rsp+32*38]
    pxor m0, m0
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    lea r6, [pw_5+128]
    ; every fourth stack slot starting at rsp+32*2 (in0, in4, ..., in28 per
    ; the existing comments); m8-m14 and the [rsp] slot are zeroed before the
    ; call that follows
    mova m0, [rsp+32* 2] ; in0
    mova m1, [rsp+32* 6] ; in4
    mova m2, [rsp+32*10] ; in8
    mova m3, [rsp+32*14] ; in12
    mova m4, [rsp+32*18] ; in16
    mova m5, [rsp+32*22] ; in20
    mova m6, [rsp+32*26] ; in24
    mova m7, [rsp+32*30] ; in28
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova [rsp], m8
    call
m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*38] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [rsp+32* 4] ; in2 mova m1, [rsp+32* 8] ; in6 mova m2, [rsp+32*12] ; in10 mova m3, [rsp+32*16] ; in14 mova m4, [rsp+32*20] ; in18 mova m5, [rsp+32*24] ; in22 mova m6, [rsp+32*28] ; in26 mova m7, [rsp+32*32] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [rsp+32* 3] ; in1 mova m1, [rsp+32*33] ; in31 mova m2, [rsp+32*19] ; in17 mova m3, [rsp+32*17] ; in15 mova m4, [rsp+32*11] ; in9 mova m5, [rsp+32*25] ; in23 mova m6, [rsp+32*27] ; in25 mova m7, [rsp+32* 9] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [rsp+32* 7] ; in5 mova m1, [rsp+32*29] ; in27 mova m2, [rsp+32*23] ; in21 mova m3, [rsp+32*13] ; in11 mova m4, [rsp+32*15] ; in13 mova m5, [rsp+32*21] ; in19 mova m6, [rsp+32*31] ; in29 mova m7, [rsp+32* 5] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 call .main_part2_pass2 RET ALIGN function_align .main: mova m0, [cq+128* 1] mova m1, [cq+128* 3] mova m2, [cq+128* 5] mova m3, [cq+128* 7] mova m4, [cq+128* 9] mova m5, [cq+128*11] mova m6, [cq+128*13] mova m7, [cq+128*15] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 2] mova m2, [cq+128* 4] mova m3, [cq+128* 6] mova m4, [cq+128* 8] mova m5, [cq+128*10] mova m6, [cq+128*12] mova m7, [cq+128*14] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf pxor m15, m15 mov r7d, 
128*13 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 psrld m15, m11, 10 ; pd_2 mova m8, [r6-32*4] mova m9, [r6+32*3] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m10, m0, m8 ; out15 paddd m0, m8 ; out0 mova m8, [r6-32*3] psubd m15, m7, m9 ; out8 paddd m7, m9 ; out7 mova m9, [r6+32*2] REPX {psrad x, 2}, m0, m15, m10, m7 packssdw m0, m15 packssdw m7, m10 psubd m10, m1, m8 ; out14 paddd m1, m8 ; out1 mova m8, [r6-32*2] psubd m15, m6, m9 ; out9 paddd m6, m9 ; out6 mova m9, [r6+32*1] REPX {psrad x, 2}, m1, m15, m10, m6 packssdw m1, m15 packssdw m6, m10 psubd m10, m2, m8 ; out13 paddd m2, m8 ; out2 mova m8, [r6-32*1] psubd m15, m5, m9 ; out10 paddd m5, m9 ; out5 mova m9, [r6+32*0] REPX {psrad x, 2}, m2, m15, m10, m5 packssdw m2, m15 packssdw m5, m10 psubd m10, m3, m8 ; out12 paddd m3, m8 ; out3 psubd m15, m4, m9 ; out11 paddd m4, m9 ; out4 REPX {psrad x, 2}, m3, m15, m10, m4 packssdw m3, m15 packssdw m4, m10 call m(idct_16x8_internal_10bpc).transpose3 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova [r6-32*1], m3 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 add r6, 32*8 ret .main_part2_pass2: vpbroadcastd m11, [pw_1567_3784] vpbroadcastd m12, [pw_m3784_1567] vpbroadcastd m13, [pw_2896_2896] lea r6, [pw_5+128] lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [pw_m2896_2896] call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal vpbroadcastd m14, [pw_2048] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 add dstq, strideq sub r2, strideq cmp r4, r5 jne .main_part2_pass2_loop ret ALIGN function_align .main_part1_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 
.main_part1: ; idct64 steps 1-5 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vpbroadcastd m7, [r5+4*0] vpbroadcastd m8, [r5+4*1] vpbroadcastd m6, [r5+4*2] vpbroadcastd m9, [r5+4*3] vpbroadcastd m5, [r5+4*4] vpbroadcastd m10, [r5+4*5] vpbroadcastd m4, [r5+4*6] vpbroadcastd m15, [r5+4*7] pmulld m7, m0 ; t63a pmulld m0, m8 ; t32a pmulld m6, m1 ; t62a pmulld m1, m9 ; t33a pmulld m5, m2 ; t61a pmulld m2, m10 ; t34a pmulld m4, m3 ; t60a pmulld m3, m15 ; t35a vpbroadcastd m10, [r5+4*8] vpbroadcastd m15, [r5+4*9] REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 psubd m8, m0, m1 ; t33 paddd m0, m1 ; t32 psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m5 ; t61 paddd m4, m5 ; t60 REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 vpbroadcastd m10, [r5+4*10] vpbroadcastd m15, [r5+4*11] psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m8, m2 ; t61 paddd m8, m2 ; t62 REPX {pmaxsd x, m12}, m5, m3, m4, m6 REPX {pminsd x, m13}, m5, m3, m4, m6 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a REPX {pmaxsd x, m12}, m0, m7, m1, m8 REPX {pminsd x, m13}, m0, m7, m1, m8 add r5, 4*12 mova [r6-32*4], m0 mova [r6+32*3], m7 mova [r6-32*3], m1 mova [r6+32*2], m8 mova [r6-32*2], m6 mova [r6+32*1], m4 mova [r6-32*1], m3 mova [r6+32*0], m5 add r6, 32*8 ret .main_part2: ; idct64 steps 6-9 lea r5, [r6+32*3] sub r6, 32*4 vpbroadcastd m10, [pd_1567] vpbroadcastd m15, [pd_3784] .main_part2_loop: mova m0, 
[r6-32*32] ; t32a mova m1, [r5-32*24] ; t39a mova m2, [r5-32*32] ; t63a mova m3, [r6-32*24] ; t56a mova m4, [r6-32*16] ; t40a mova m5, [r5-32* 8] ; t47a mova m6, [r5-32*16] ; t55a mova m7, [r6-32* 8] ; t48a psubd m8, m0, m1 ; t39 paddd m0, m1 ; t32 psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a paddd m2, m7 ; t63a psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m8, m4 ; t55 paddd m8, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 REPX {pmaxsd x, m12}, m6, m7, m5, m4 REPX {pminsd x, m13}, m6, m7, m5, m4 REPX {pmulld x, m14}, m6, m7, m5, m4 REPX {pmaxsd x, m12}, m2, m0, m8, m1 REPX {pminsd x, m13}, m2, m0, m8, m1 paddd m6, m11 paddd m5, m11 psubd m3, m6, m7 ; t47 paddd m6, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m3, m6, m7, m5 mova [r5-32* 8], m2 mova [r6-32*32], m0 mova [r6-32* 8], m8 mova [r5-32*32], m1 mova [r5-32*24], m3 mova [r6-32*16], m6 mova [r6-32*24], m7 mova [r5-32*16], m5 add r6, 32 sub r5, 32 cmp r6, r5 jl .main_part2_loop ret cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*6] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] pxor m0, m0 .fast_loop: REPX {mova [r6+32*x], m0}, 
-4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r6, [pw_5 + 128] mov r10, rsp lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 .pass2_loop: mova m0, [r10+32* 2] ; in0 mova m1, [r10+32* 6] ; in4 mova m2, [r10+32*18] ; in8 mova m3, [r10+32*22] ; in12 mova m4, [r10+32*34] ; in16 mova m5, [r10+32*38] ; in20 mova m6, [r10+32*50] ; in24 mova m7, [r10+32*54] ; in28 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*70] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [r10+32* 4] ; in2 mova m1, [r10+32* 8] ; in6 mova m2, [r10+32*20] ; in10 mova m3, [r10+32*24] ; in14 mova m4, [r10+32*36] ; in18 mova m5, [r10+32*40] ; in22 mova m6, [r10+32*52] ; in26 mova m7, [r10+32*56] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10+32* 3] ; in1 mova m1, [r10+32*57] ; in31 mova m2, [r10+32*35] ; in17 mova m3, [r10+32*25] ; in15 mova m4, [r10+32*19] ; in9 mova m5, [r10+32*41] ; in23 mova m6, [r10+32*51] ; in25 mova m7, [r10+32* 9] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10+32* 7] ; in5 mova m1, [r10+32*53] ; in27 mova m2, [r10+32*39] ; in21 mova m3, [r10+32*21] ; in11 mova m4, [r10+32*23] ; in13 mova m5, [r10+32*37] ; in19 mova m6, [r10+32*55] ; in29 mova m7, [r10+32* 5] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub r4, 32*98 ; rsp+32*16 sub dstq, r8 add dstq, 32 cmp 
r10, r4 jl .pass2_loop RET ALIGN function_align .main: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128* 7] pmulld m2, m14, [cq+128* 9] pmulld m3, m14, [cq+128*15] pmulld m4, m14, [cq+128*17] pmulld m5, m14, [cq+128*23] pmulld m6, m14, [cq+128*25] pmulld m7, m14, [cq+128*31] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, m14, [cq+128* 5] pmulld m2, m14, [cq+128*11] pmulld m3, m14, [cq+128*13] pmulld m4, m14, [cq+128*19] pmulld m5, m14, [cq+128*21] pmulld m6, m14, [cq+128*27] pmulld m7, m14, [cq+128*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128* 6] pmulld m2, m14, [cq+128*10] pmulld m3, m14, [cq+128*14] pmulld m4, m14, [cq+128*18] pmulld m5, m14, [cq+128*22] pmulld m6, m14, [cq+128*26] pmulld m7, m14, [cq+128*30] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 4] pmulld m2, m14, [cq+128* 8] pmulld m3, m14, [cq+128*12] pmulld m4, m14, [cq+128*16] pmulld m5, m14, [cq+128*20] pmulld m6, m14, [cq+128*24] pmulld m7, m14, [cq+128*28] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 
mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 ret cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 16 .dconly: add r6d, 640 sar r6d, 10 .dconly2: vpbroadcastd m5, [dconly_10bpc] imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm5 vpbroadcastw m0, xm0 .dconly_loop: paddsw m1, m0, [dstq+32*0] paddsw m2, m0, [dstq+32*1] paddsw m3, m0, [dstq+32*2] paddsw m4, m0, [dstq+32*3] REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 mova [dstq+32*3], m4 add dstq, strideq dec r3d jg .dconly_loop RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] call .main call .shift_transpose cmp eobd, 36 jl .fast call .main call .shift_transpose jmp .pass2 .fast: pxor m0, m0 mov r3d, 4 .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 dec r3d jg .fast_loop .pass2: lea r7, [r6-32*64] lea r4, [r6-32*32] lea r6, [pw_5+128] mov r5, dstq .pass2_loop: mova m0, [r7-32*4] mova m1, [r7-32*3] mova m2, [r7-32*2] mova m3, [r7-32*1] mova m4, [r7+32*0] mova m5, [r7+32*1] mova m6, [r7+32*2] mova m7, [r7+32*3] add r7, 32*32 mova m8, [r7-32*4] mova m9, [r7-32*3] mova m10, [r7-32*2] mova m11, [r7-32*1] mova m12, [r7+32*0] mova m13, [r7+32*1] mova m14, [r7+32*2] mova m15, [r7+32*3] sub r7, 32*24 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 add r5, 32 mov dstq, r5 cmp r7, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] 
call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+64* 2] mova m1, [cq+64*14] mova m2, [cq+64*18] mova m3, [cq+64*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+64* 6] mova m1, [cq+64*10] mova m2, [cq+64*22] mova m3, [cq+64*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+64* 4] mova m1, [cq+64*12] mova m2, [cq+64*20] mova m3, [cq+64*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] mova m3, [cq+64*24] pxor m15, m15 mov r7d, 64*30 .main_zero_loop: mova [cq+r7-64*2], m15 mova [cq+r7-64*1], m15 mova [cq+r7+64*0], m15 mova [cq+r7+64*1], m15 sub r7d, 64*4 jg .main_zero_loop .main_end: psrld m15, m11, 10 ; pd_2 .main_end2: add cq, 32 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_10bpc).main add r6, 32*8 call m(idct_8x16_internal_10bpc).main_evenhalf mova [r6+32*2], m1 mova [r6+32*1], m2 mova [r6+32*0], m3 mova [r6-32*1], m4 mova [r6-32*2], m5 mova [r6-32*3], m6 mova [r6-32*4], m7 jmp .main_end_loop_start .main_end_loop: mova m0, [r6+32* 3] ; idct8 0 + n .main_end_loop_start: mova m1, [r5+32* 4] ; idct16 15 - n mova m2, [r5-32*12] ; idct32 16 + n mova m3, [r6-32*13] ; idct32 31 - n mova m4, [r6-32*29] ; idct64 63 - n mova m5, [r5-32*28] ; idct64 48 + n mova m6, [r6-32*45] ; idct64 47 - n mova m7, [r5-32*44] ; idct64 32 + n paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 REPX {pminsd x, m13}, m8, m0 paddd m1, m8, m3 ; idct32 out0 + n psubd m8, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, 
m12}, m1, m8, m3, m0 REPX {pminsd x, m13}, m1, m3, m8, m0 REPX {paddd x, m15}, m1, m3, m0, m8 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) psubd m1, m4 ; idct64 out63 - n (unshifted) paddd m4, m3, m5 ; idct64 out15 - n (unshifted) psubd m3, m5 ; idct64 out48 + n (unshifted) paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m8, m7 ; idct64 out31 - n (unshifted) psubd m8, m7 ; idct64 out32 + n (unshifted) mova [r5-32*44], m2 mova [r6+32* 3], m1 mova [r6-32*45], m4 mova [r5+32* 4], m3 mova [r5-32*28], m5 mova [r6-32*13], m0 mova [r6-32*29], m6 mova [r5-32*12], m8 add r5, 32 sub r6, 32 cmp r5, r6 jl .main_end_loop ret .shift_transpose: %macro IDCT64_SHIFT_TRANSPOSE 1 ; shift sub r6, 32*48 mov r5, r6 %%loop: mova m0, [r6-32* 4] mova m4, [r6+32* 4] mova m1, [r6-32* 3] mova m5, [r6+32* 5] mova m2, [r6-32* 2] mova m6, [r6+32* 6] mova m3, [r6-32* 1] mova m7, [r6+32* 7] REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 mova m4, [r6+32* 0] mova m6, [r6+32* 8] mova m5, [r6+32* 1] mova m7, [r6+32* 9] REPX {psrad x, %1}, m4, m6, m5, m7 packssdw m4, m6 packssdw m5, m7 mova m6, [r6+32* 2] mova m8, [r6+32*10] mova m7, [r6+32* 3] mova m9, [r6+32*11] REPX {psrad x, %1}, m6, m8, m7, m9 packssdw m6, m8 packssdw m7, m9 call m(idct_16x8_internal_10bpc).transpose3 mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 add r6, 32*16 add r5, 32*8 cmp r5, r4 jl %%loop mov r6, r4 %endmacro IDCT64_SHIFT_TRANSPOSE 2 ret cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call 
.main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 lea r4, [rsp+32*135] .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r7, [r6-32*32] lea r5, [r6+32*8] lea r6, [pw_5+128] imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq .pass2_loop: mova m0, [r7-32*99] mova m1, [r7-32*97] mova m2, [r7-32*95] mova m3, [r7-32*93] mova m4, [r7-32*67] mova m5, [r7-32*65] mova m6, [r7-32*63] mova m7, [r7-32*61] mova m8, [r7-32*35] mova m9, [r7-32*33] mova m10, [r7-32*31] mova m11, [r7-32*29] mova m12, [r7-32* 3] mova m13, [r7-32* 1] mova m14, [r7+32* 1] mova m15, [r7+32* 3] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf mova m0, [r7-32*100] mova m1, [r7-32*98] mova m2, [r7-32*96] mova m3, [r7-32*94] mova m4, [r7-32*68] mova m5, [r7-32*66] mova m6, [r7-32*64] mova m7, [r7-32*62] mova m8, [r7-32*36] mova m9, [r7-32*34] mova m10, [r7-32*32] mova m11, [r7-32*30] mova m12, [r7-32* 4] mova m13, [r7-32* 2] mova m14, [r7+32* 0] mova m15, [r7+32* 2] add r7, 32*8 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] add dstq, 32 cmp r7, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128*31] pmulld m2, m14, [cq+128*17] pmulld m3, m14, [cq+128*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 7] pmulld m1, m14, [cq+128*25] pmulld m2, m14, [cq+128*23] pmulld m3, m14, [cq+128* 9] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 5] pmulld m1, m14, [cq+128*27] pmulld m2, m14, [cq+128*21] pmulld m3, m14, [cq+128*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, 
m14, [cq+128*29] pmulld m2, m14, [cq+128*19] pmulld m3, m14, [cq+128*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128*14] pmulld m2, m14, [cq+128*18] pmulld m3, m14, [cq+128*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2 pmulld m0, m14, [cq+128* 6] pmulld m1, m14, [cq+128*10] pmulld m2, m14, [cq+128*22] pmulld m3, m14, [cq+128*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2 pmulld m0, m14, [cq+128* 4] pmulld m1, m14, [cq+128*12] pmulld m2, m14, [cq+128*20] pmulld m3, m14, [cq+128*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 8] pmulld m2, m14, [cq+128*16] pmulld m3, m14, [cq+128*24] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop psrld m15, m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2 IDCT64_SHIFT_TRANSPOSE 1 ret cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 lea r4, [rsp+32*135] .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r10, [r6-32*32] lea r6, [pw_5+128] lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; 
stride*7 .pass2_loop: mova m0, [r10-32*100] ; in0 mova m1, [r10-32*96] ; in4 mova m2, [r10-32*68] ; in8 mova m3, [r10-32*64] ; in12 mova m4, [r10-32*36] ; in16 mova m5, [r10-32*32] ; in20 mova m6, [r10-32* 4] ; in24 mova m7, [r10+32* 0] ; in28 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [r10-32*98] ; in2 mova m1, [r10-32*94] ; in6 mova m2, [r10-32*66] ; in10 mova m3, [r10-32*62] ; in14 mova m4, [r10-32*34] ; in18 mova m5, [r10-32*30] ; in22 mova m6, [r10-32* 2] ; in26 mova m7, [r10+32* 2] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10-32*99] ; in1 mova m1, [r10+32* 3] ; in31 mova m2, [r10-32*35] ; in17 mova m3, [r10-32*61] ; in15 mova m4, [r10-32*67] ; in9 mova m5, [r10-32*29] ; in23 mova m6, [r10-32* 3] ; in25 mova m7, [r10-32*93] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10-32*95] ; in5 mova m1, [r10-32* 1] ; in27 mova m2, [r10-32*31] ; in21 mova m3, [r10-32*65] ; in11 mova m4, [r10-32*63] ; in13 mova m5, [r10-32*33] ; in19 mova m6, [r10+32* 1] ; in29 mova m7, [r10-32*97] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub dstq, r8 sub r4, 32*44 add dstq, 32 cmp r10, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] mova m0, [cq+128* 1] mova m1, [cq+128*31] mova m2, [cq+128*17] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 7] 
mova m1, [cq+128*25] mova m2, [cq+128*23] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 5] mova m1, [cq+128*27] mova m2, [cq+128*21] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 3] mova m1, [cq+128*29] mova m2, [cq+128*19] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+128* 2] mova m1, [cq+128*14] mova m2, [cq+128*18] mova m3, [cq+128*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6] mova m1, [cq+128*10] mova m2, [cq+128*22] mova m3, [cq+128*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+128* 4] mova m1, [cq+128*12] mova m2, [cq+128*20] mova m3, [cq+128*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m2, [cq+128*16] mova m3, [cq+128*24] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx16_avx512.asm000066400000000000000000007075411517466257200240610ustar00rootroot00000000000000; Copyright © 2022-2023, VideoLAN and dav2d authors ; Copyright © 2022-2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31 db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55 db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63 idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51 db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59 db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17 db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25 iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23 db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 
31 db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19 db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27 permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13 db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29 db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15 db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31 permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2 db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6 db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7 db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3 permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15 idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 pw_2048_m2048: times 16 dw 2048 pw_m2048_2048: times 16 dw -2048 pw_2048: times 16 dw 2048 ; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4=-- %macro COEF_PAIR 2-3 0 ; a, b, flags %if %3 == 1 pd_%1_m%2: dd %1, %1, -%2, -%2 %define pd_%1 (pd_%1_m%2 + 4*0) %define pd_m%2 (pd_%1_m%2 + 4*2) %elif %3 == 2 pd_m%1_%2: dd -%1, -%1, %2, %2 %define pd_m%1 (pd_m%1_%2 + 4*0) %define pd_%2 
(pd_m%1_%2 + 4*2) %elif %3 == 4 pd_m%1_m%2: dd -%1, -%1, -%2, -%2 %define pd_m%1 (pd_m%1_m%2 + 4*0) %define pd_m%2 (pd_m%1_m%2 + 4*2) %else pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) %if %3 == 3 %define pd_%2_m%2 pd_%2 dd -%2, -%2 %endif %endif %endmacro COEF_PAIR 101, 501 COEF_PAIR 201, 601, 1 COEF_PAIR 201, 995 COEF_PAIR 401, 1189, 1 COEF_PAIR 401, 1931 COEF_PAIR 401, 3920 COEF_PAIR 401, 4076 COEF_PAIR 700, 301, 4 COEF_PAIR 799, 2276, 1 COEF_PAIR 799, 3406 COEF_PAIR 799, 4017 COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2598, 1931, 2 COEF_PAIR 2598, 3612 COEF_PAIR 2751, 2106 COEF_PAIR 2896, 1567, 3 COEF_PAIR 2896, 3784, 3 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 1931 COEF_PAIR 3166, 3612 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 COEF_PAIR 3857, 4052 COEF_PAIR 4017, 2276 COEF_PAIR 4017, 3406 COEF_PAIR 4036, 4085 COEF_PAIR 4076, 1189 COEF_PAIR 4076, 3612 COEF_PAIR 4076, 3920 COEF_PAIR 4091, 3973 COEF_PAIR 4091, 4052 COEF_PAIR 4095, 4065 pb_32: times 4 db 32 pw_5: times 2 dw 5 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pw_1697x16: times 2 dw 1697*16 pw_2896x8: times 2 dw 2896*8 pixel_10bpc_max: times 2 dw 0x03ff dconly_10bpc: times 2 dw 0x7c00 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff pd_1: dd 1 pd_2: dd 2 pd_1448: dd 1448 pd_2048: dd 2048 pd_3071: dd 3071 ; 1024 + 2048 - 1 pd_3072: dd 3072 ; 1024 + 2048 pd_5119: dd 5119 ; 1024 + 4096 - 1 pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 cextern dup16_perm cextern int8_permA cextern idct64_mul_16bpc cextern idct_8x8_internal_8bpc_avx512icl.main cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 cextern idct_8x16_internal_8bpc_avx512icl.main cextern idct_8x16_internal_8bpc_avx512icl.main2 cextern idct_8x16_internal_8bpc_avx512icl.main_fast cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_8x16_internal_8bpc_avx512icl.main2 cextern idct_16x8_internal_8bpc_avx512icl.main cextern 
iadst_16x8_internal_8bpc_avx512icl.main_pass2 cextern idct_16x16_internal_8bpc_avx512icl.main cextern idct_16x16_internal_8bpc_avx512icl.main2 cextern idct_16x16_internal_8bpc_avx512icl.main_fast cextern idct_16x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2 SECTION .text %define o_base (pw_2048+4*128) %define o_base_8bpc (int8_permA+64*18) %define o(x) (r5 - o_base + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) INIT_ZMM avx512icl ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + 
rnd) >> 12 ; flags: 1 = inv_dst1, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else %if %8 < 4096 vpbroadcastd m%3, [o(pd_%8)] %else vbroadcasti32x4 m%3, [o(pd_%8)] %endif pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif %if %7 < 32 pmulld m%1, m%7 pmulld m%2, m%7 %else %if %7 < 4096 vpbroadcastd m%5, [o(pd_%7)] %else vbroadcasti32x4 m%5, [o(pd_%7)] %endif pmulld m%1, m%5 pmulld m%2, m%5 %endif %if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else %ifnum %6 paddd m%4, m%6 %endif paddd m%2, m%4 %endif %ifnum %6 paddd m%1, m%6 %endif %if %9 & 1 psubd m%1, m%3, m%1 %else psubd m%1, m%3 %endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 %endif %endmacro %macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_10bpc) lea r5, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
lea tx2q, [m(i%2_%4_internal_10bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else %if %3 add eobd, %3 %endif ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 8x8 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 .dconly: add r6d, 384 sar r6d, 9 .dconly2: vpbroadcastd ym2, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw ym1, r6d paddsw ym1, ym2 .dconly_loop: mova xm0, [dstq+strideq*0] vinserti32x4 ym0, [dstq+strideq*1], 1 paddsw ym0, ym1 psubusw ym0, ym2 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call .load vpermi2q m1, m0, m2 ; 1 5 vpermi2q m3, m6, m4 ; 7 3 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call .main call .main_end mova m4, [o(idct8x8p)] packssdw m0, m2 ; 0 1 4 5 packssdw m1, m3 ; 3 2 7 6 vpermb m0, m4, m0 vprolq m1, 32 vpermb m2, m4, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 jmp tx2q .pass2: lea r5, [o_base_8bpc] vextracti32x8 ym2, m0, 1 vextracti32x8 ym3, m1, 1 call m(idct_8x8_internal_8bpc).main mova m10, [permC] vpbroadcastd m12, [pw_2048] .end: vpermt2q m0, m10, m1 vpermt2q m2, m10, m3 .end2: vpbroadcastd m11, [pixel_10bpc_max] lea r6, [strideq*3] pxor m10, m10 pmulhrsw m8, m12, m0 call .write_8x4_start pmulhrsw m8, m12, m2 .write_8x4: lea dstq, [dstq+strideq*4] add cq, 64*2 .write_8x4_start: mova xm9, [dstq+strideq*0] vinserti32x4 ym9, [dstq+strideq*1], 1 vinserti32x4 m9, [dstq+strideq*2], 2 vinserti32x4 m9, [dstq+r6 ], 3 mova [cq+64*0], m10 mova [cq+64*1], m10 paddw m9, m8 pmaxsw m9, m10 pminsw m9, m11 mova [dstq+strideq*0], 
xm9 vextracti32x4 [dstq+strideq*1], ym9, 1 vextracti32x4 [dstq+strideq*2], m9, 2 vextracti32x4 [dstq+r6 ], m9, 3 ret ALIGN function_align .load: mova m0, [cq+64*0] ; 0 1 mova m4, [cq+64*1] ; 2 3 mova m1, [o(permB)] mova m2, [cq+64*2] ; 4 5 mova m6, [cq+64*3] ; 6 7 vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] psrlq m5, m1, 32 vpbroadcastd m12, [o(pd_2896)] mova m3, m1 vpbroadcastd m11, [o(pd_1)] ret ALIGN function_align .main_fast: ; bottom half is zero vbroadcasti32x4 m3, [o(pd_4017_3406)] vbroadcasti32x4 m8, [o(pd_799_m2276)] vbroadcasti32x4 m2, [o(pd_2896_3784)] vbroadcasti32x4 m9, [o(pd_2896_1567)] pmulld m3, m1 ; t4a t5a pmulld m1, m8 ; t7a t6a pmulld m2, m0 ; t0 t3 pmulld m0, m9 ; t1 t2 jmp .main2 .main: ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276 ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784 .main2: REPX {paddd x, m13}, m1, m3, m0, m2 REPX {psrad x, 12 }, m1, m3, m0, m2 punpcklqdq m8, m1, m3 ; t4a t7a punpckhqdq m1, m3 ; t5a t6a psubd m3, m8, m1 ; t5a t6a paddd m8, m1 ; t4 t7 pmaxsd m3, m14 punpckhqdq m1, m2, m0 ; t3 t2 pminsd m3, m15 punpcklqdq m2, m0 ; t0 t1 pmulld m3, m12 paddd m0, m2, m1 ; dct4 out0 out1 psubd m2, m1 ; dct4 out3 out2 REPX {pmaxsd x, m14}, m8, m0, m2 REPX {pminsd x, m15}, m8, m0, m2 .main3: pshufd m1, m3, q1032 paddd m3, m13 psubd m9, m3, m1 paddd m3, m1 psrad m9, 12 psrad m3, 12 punpckhqdq m1, m8, m3 ; t7 t6 shufpd m8, m9, 0xaa ; t4 t5 ret .main_end: paddd m0, m11 paddd m2, m11 psubd m3, m0, m1 ; out7 out6 paddd m0, m1 ; out0 out1 paddd m1, m2, m8 ; out3 out2 psubd m2, m8 ; out4 out5 REPX {vpsravd x, m11}, m0, m2, m3, m1 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity INV_TXFM_8X8_FN adst, adst cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x8_internal_10bpc).load vpermi2q m1, m6, m2 ; 7 5 vpermi2q m3, m4, m0 ; 3 1 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call .main punpckldq 
m1, m2, m4 ; out4 out6 punpckhdq m2, m0 ; -out5 -out7 punpckldq m0, m3 ; out0 out2 punpckhdq m4, m3 ; -out1 -out3 paddd m1, m11 psubd m3, m11, m2 paddd m0, m11 psubd m4, m11, m4 .pass1_end: REPX {psrad x, 1}, m1, m0, m3, m4 packssdw m0, m1 ; 0 2 4 6 packssdw m4, m3 ; 1 3 5 7 psrlq m1, [o(permB)], 8 punpckhwd m3, m0, m4 punpcklwd m0, m4 psrlq m2, m1, 32 vpermi2q m1, m0, m3 vpermt2q m0, m2, m3 jmp tx2q .pass2: call .main_pass2 movu m10, [permC+2] vbroadcasti32x8 m12, [pw_2048_m2048+16] jmp m(idct_8x8_internal_10bpc).end .main_pass2: vextracti32x8 ym2, m0, 1 vextracti32x8 ym3, m1, 1 lea r5, [o_base_8bpc] pshufd ym4, ym0, q1032 pshufd ym5, ym1, q1032 jmp m(iadst_8x8_internal_8bpc).main_pass2 ALIGN function_align .main: ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612 ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189 psubd m4, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 REPX {pmaxsd x, m14}, m4, m2, m0, m1 REPX {pminsd x, m15}, m4, m2, m0, m1 pxor m5, m5 psubd m5, m4 shufpd m4, m2, 0xaa ; t4 t7 shufpd m2, m5, 0xaa ; t5 -t6 ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784 punpckhqdq m3, m0, m1 punpcklqdq m0, m1 psubd m1, m0, m3 ; t2 t3 paddd m0, m3 ; out0 -out7 punpckhqdq m3, m4, m2 ; t7a t6a punpcklqdq m4, m2 ; t5a t4a psubd m2, m4, m3 ; t7 t6 paddd m4, m3 ; out6 -out1 REPX {pmaxsd x, m14}, m1, m2 REPX {pminsd x, m15}, m1, m2 shufpd m3, m1, m2, 0xaa shufpd m1, m2, 0x55 pmulld m3, m12 pmulld m1, m12 paddd m3, m13 psubd m2, m3, m1 paddd m3, m1 psrad m2, 12 ; out4 -out5 pshufd m3, m3, q1032 psrad m3, 12 ; out2 -out3 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, identity INV_TXFM_8X8_FN flipadst, flipadst cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x8_internal_10bpc).load vpermi2q m1, m6, m2 ; 7 5 vpermi2q m3, m4, m0 ; 3 1 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call m(iadst_8x8_internal_10bpc).main punpckhdq m1, m3, m4 ; -out3 -out1 
    ; (continuation of iflipadst_8x8_internal_10bpc, first pass)
    punpckldq m3, m0 ; out2 out0
    punpckhdq m0, m2 ; -out7 -out5
    punpckldq m4, m2 ; out6 out4
    ; m11 - x negates the "-out" values and adds the rounding bias at once
    psubd m1, m11, m1
    paddd m3, m11
    psubd m0, m11, m0
    paddd m4, m11
    jmp m(iadst_8x8_internal_10bpc).pass1_end
; .pass2: reuses the ADST second pass; differs from iadst_8x8 in the permC
; offset and in using pw_m2048_2048 instead of pw_2048_m2048, and the two
; halves are written in swapped order.
.pass2:
    call m(iadst_8x8_internal_10bpc).main_pass2
    movu m10, [permC+1]
    vbroadcasti32x8 m12, [pw_m2048_2048+16]
    lea r6, [strideq*3]
    vpermt2q m0, m10, m1 ; 7 6 5 4
    vpbroadcastd m11, [pixel_10bpc_max]
    vpermt2q m2, m10, m3 ; 3 2 1 0
    pxor m10, m10
    pmulhrsw m8, m12, m2
    call m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw m8, m12, m0
    jmp m(idct_8x8_internal_10bpc).write_8x4

; Wrapper instantiations for first-pass type identity.
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; iidentity_8x8_internal_10bpc: the first pass does no arithmetic; it packs
; the 32-bit coefficients to 16 bit with saturation and reorders them with
; the idtx8x8p byte permutation.
cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    mova m1, [cq+64*0]
    packssdw m1, [cq+64*2] ; 0 4 1 5
    mova m2, [cq+64*1] ; 2 6 3 7
    packssdw m2, [cq+64*3]
    mova m0, [o(idtx8x8p)]
    vpermb m1, m0, m1
    vpermb m2, m0, m2
    punpckldq m0, m1, m2 ; 0 1 4 5
    punpckhdq m1, m2 ; 2 3 6 7
    jmp tx2q
; .pass2: sets the pmulhrsw factor (pw_4096) and row permutation, then
; reuses the idct_8x8 .end2 path (defined outside this view).
.pass2:
    movu m3, [o(permC+2)]
    vpbroadcastd m12, [o(pw_4096)]
    psrlq m2, m3, 32
    vpermi2q m2, m0, m1
    vpermt2q m0, m3, m1
    jmp m(idct_8x8_internal_10bpc).end2

; INV_TXFM_8X16_FN: instantiates INV_TXFM_FN for size 8x16. For dct_dct it
; also emits a DC-only path: the DC coefficient is scaled twice by 181 (the
; first time with +128 rounding and >>8), eobd is stored to [cq] (the
; existing comment notes it is 0 here), bit 4 of r3d is set, and control
; continues in the 8x8 .dconly code.
; NOTE(review): r3d is presumably the row count used by the dconly loop - confirm.
%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 16
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, 35
INV_TXFM_8X16_FN dct, flipadst
INV_TXFM_8X16_FN dct, adst

; idct_8x16_internal_10bpc: 8x16 inverse DCT, 10 bpc.
; The first pass takes the reduced .fast path when eob < 43, otherwise the
; full load/main/main_end sequence; results are packed to 16 bit.
cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp eobd, 43
    jl .fast
    call .load
    call .main
    call .main_end
; .pass1_end: shared exit of the first pass (also used by the adst, flipadst
; and identity 8x16 functions).
.pass1_end:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    jmp tx2q
; .pass2: reorder with idct8x16p, run the 8bpc 16-point DCT core, then scale
; with pmulhrsw by pw_2048 and add to the destination four rows at a time.
.pass2:
    mova m8, [o(idct8x16p)]
    REPX {vpermb x, m8, x}, m0, m1, m2, m3
    punpckhdq m5, m0, m1
    punpckldq m0, m1
    punpckhdq m4, m2, m3
    punpckldq m2, m3
    punpcklqdq m8, m0, m2 ; 15 1
    punpckhqdq m0, m2 ; 7 9
    punpckhqdq m1, m5, m4 ; 3 13
    punpcklqdq m5, m4 ; 11 5
    lea r5, [o_base_8bpc]
    vextracti32x8 ym7, m8, 1 ; 14 2
    vextracti32x8 ym3, m0, 1 ; 6 10
    vextracti32x8 ym6, m1, 1 ; 12 4
    vextracti32x8 ym9, m5, 1 ; 8 0
    call m(idct_8x16_internal_8bpc).main2
    mova m8, [permC]
    vpbroadcastd m12, [pw_2048]
    vpermt2q m0, m8, m1
    lea r6, [strideq*3]
    vpermt2q m2, m8, m3
    vpbroadcastd m11, [pixel_10bpc_max]
    vpermt2q m4, m8, m5
    pxor m10, m10
    vpermt2q m6, m8, m7
    pmulhrsw m8, m12, m0
    call m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw m8, m12, m2
    call m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw m8, m12, m4
    call m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw m8, m12, m6
    jmp m(idct_8x8_internal_10bpc).write_8x4
; .fast: low-eob first pass. Only the low 256 bits of each coefficient row
; are loaded and the 8-point DCT helpers of idct_8x8 do the work.
.fast:
    mova ym0, [cq+64*0]
    mova ym4, [cq+64*2]
    mova ym1, [cq+64*1]
    mova ym5, [cq+64*5]
    mova ym2, [cq+64*4]
    mova ym6, [cq+64*6]
    mova ym3, [cq+64*7]
    mova ym7, [cq+64*3]
    call .round_input_fast
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x8_internal_10bpc).main_end
    movu m6, [o(permC+3)]
    packssdw m3, m1, m3
    packssdw m1, m0, m2
    vprolq m3, 32
    vpermd m1, m6, m1
    vpermd m3, m6, m3
    mova ym0, ym1 ; 0 4
    vextracti32x8 ym1, m1, 1 ; 1 5
    mova ym2, ym3 ; 2 6
    vextracti32x8 ym3, m3, 1 ; 3 7
    jmp tx2q
ALIGN function_align
; .round_input_fast: merges the eight half-width rows into m0-m3 (vpermt2q
; with the odd dwords of permB), multiplies by m12 (pd_2896), rounds with
; m13 and shifts right by 12. Also loads m11/m14/m15 for the callers.
.round_input_fast:
    movshdup m8, [o(permB)]
    vpbroadcastd m12, [o(pd_2896)]
    vpermt2q m0, m8, m4
    vpermt2q m1, m8, m5
    vpermt2q m2, m8, m6
    vpermt2q m3, m8, m7
    vpbroadcastd m13, [o(pd_2048)]
    REPX {pmulld x, m12}, m0, m1, m2, m3
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    REPX {paddd x, m13}, m0, m1, m2, m3
    vpbroadcastd m11, [o(pd_1)]
    REPX {psrad x, 12 }, m0, m1, m2, m3
    ret
ALIGN function_align
; .load: sets the clip bounds, then falls through to .load2.
.load:
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
; .load2: loads all eight coefficient rows pre-multiplied by m12 (pd_2896)
; and falls through to .round.
.load2:
    vpbroadcastd m12, [o(pd_2896)]
    pmulld m0, m12, [cq+64*0]
    pmulld m1, m12, [cq+64*1]
    pmulld m2, m12, [cq+64*2]
    pmulld m3, m12, [cq+64*3]
    vpbroadcastd m13, [o(pd_2048)]
    pmulld m4, m12, [cq+64*4]
    pmulld m5, m12, [cq+64*5]
    pmulld m6, m12, [cq+64*6]
    pmulld m7, m12, [cq+64*7]
; .round: (x + m13) >> 12 on m0-m7.
.round:
    REPX {paddd x, m13}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
    REPX {paddd x, m13}, m4, m5, m6, m7
    REPX {psrad x, 12 }, m4, m5, m6, m7
    ret
ALIGN function_align
; .main_fast2_rect2: rounds m0/m1 first, then falls into .main_fast2.
.main_fast2_rect2:
    REPX {paddd x, m13}, m0, m1
    REPX {psrad x, 12 }, m0, m1
; .main_fast2: variant that reads only m0 and m1 as inputs; leaves the
; results in the same registers as .main (m0-m6, m8).
.main_fast2:
    pmulld m0, m12
    pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a
    pmulld m8, m1, [o(pd_799)] {1to16} ; t4a
    REPX {paddd x, m13}, m0, m6, m8
    REPX {psrad x, 12 }, m0, m6, m8
    pmulld m5, m6, m12
    pmulld m1, m8, m12
    paddd m5, m13
    psubd m4, m5, m1
    paddd m5, m1
    REPX {psrad x, 12 }, m4, m5
    REPX {mova x, m0 }, m1, m2, m3
    ret
; .main_fast_rect2: rounds m0-m3 first, then falls into .main_fast.
.main_fast_rect2:
    REPX {paddd x, m13}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
; .main_fast: variant that reads only m0-m3 as inputs; joins .main2.
.main_fast:
    pmulld m0, m12
    pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a
    pmulld m3, [o(pd_3406)] {1to16} ; t6a
    pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a
    pmulld m1, [o(pd_799)] {1to16} ; t4a
    pmulld m6, m2, [o(pd_3784)] {1to16} ; t3
    pmulld m2, [o(pd_1567)] {1to16} ; t2
    paddd m0, m13
    psubd m5, m13, m5
    psrad m0, 12 ; t0
    mova m9, m0 ; t1
    jmp .main2
; .main_rect2: applies .round to m0-m7 first, then falls into .main.
.main_rect2:
    call .round
; .main: 8-point DCT on m0-m7; expects m12-m15 loaded as in .load/.load2.
.main:
    pmulld m0, m12
    ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
    ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
    pmulld m4, m12
    paddd m0, m13
    paddd m5, m13
    psubd m9, m0, m4 ; t1
    paddd m0, m4 ; t0
    psrad m9, 12
    psrad m0, 12
; .main2: common tail of .main/.main_fast. m5 already carries its rounding
; bias on entry, so only m3/m1/m7 get m13 added before the shift.
.main2:
    REPX {paddd x, m13}, m3, m1, m7
    REPX {psrad x, 12 }, m5, m1, m3, m7
    paddd m8, m1, m5 ; t4
    psubd m1, m5 ; t5a
    psubd m5, m7, m3 ; t6a
    paddd m7, m3 ; t7
    pmaxsd m5, m14
    pmaxsd m1, m14
    paddd m2, m13
    paddd m6, m13
    pminsd m5, m15
    pminsd m1, m15
    pmulld m5, m12
    pmulld m1, m12
    pmaxsd m8, m14
    pmaxsd m7, m14
    pminsd m8, m15
    paddd m5, m13
    psubd m4, m5, m1
    paddd m5, m1
    REPX {psrad x, 12 }, m2, m6, m5, m4
    paddd m1, m9, m2 ; dct4 out1
    psubd m2, m9, m2 ; dct4 out2
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    pminsd m6, m15, m7
    REPX {pmaxsd x, m14}, m0, m1, m2, m3
    REPX {pminsd x, m15}, m0, m1, m2, m3
    ret
; .main_end: loads m11 = pd_1 and falls into .main_end2.
.main_end:
    vpbroadcastd m11, [o(pd_1)]
; .main_end2: final butterflies; m11 is the rounding bias and later the
; shift count (continues past this span).
.main_end2:
    REPX {paddd x, m11}, m0, m1, m2, m3
    psubd m7, m0, m6 ; out7
    paddd m0, m6 ;
; out0
    ; (continuation of idct_8x16_internal_10bpc.main_end2: remaining final
    ; butterflies, then a per-lane arithmetic right shift by m11)
    psubd m6, m1, m5 ; out6
    paddd m1, m5 ; out1
    psubd m5, m2, m4 ; out5
    paddd m2, m4 ; out2
    psubd m4, m3, m8 ; out4
    paddd m3, m8 ; out3
    REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, identity, 35
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, adst

; iadst_8x16_internal_10bpc: 8x16 inverse ADST, 10 bpc.
; The first pass takes .fast when eob < 43. The full path loads through the
; idct_8x16 helper, runs .main and then applies the final shifts: by 1 to
; m0, m1, m10, m11 and by 12 to m2, m3, m8, m9 (their rounding biases are
; added inside .main).
cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp eobd, 43
    jl .fast
    call m(idct_8x16_internal_10bpc).load
    call .main
    psrad m0, 1
    psrad m1, 1
    psrad m6, m10, 1
    psrad m7, m11, 1
    psrad m2, 12
    psrad m3, 12
    psrad m4, m8, 12
    psrad m5, m9, 12
    jmp m(idct_8x16_internal_10bpc).pass1_end
; .fast: low-eob path built on the 8-point ADST core (via .fast_main).
.fast:
    call .fast_main
    punpcklqdq m1, m2, m4 ; out4 out6
    punpckhqdq m2, m0 ; -out5 -out7
    punpcklqdq m0, m3 ; out0 out2
    punpckhqdq m4, m3 ; -out1 -out3
    ; m11 - x negates the "-out" values and adds the rounding bias at once
    paddd m1, m11
    psubd m3, m11, m2
    paddd m0, m11
    psubd m4, m11, m4
; .fast_end: shared with iflipadst_8x16; >>1, pack to 16 bit, permute with
; permC+3 and split into 256-bit halves for the second pass.
.fast_end:
    movu m5, [o(permC+3)]
    REPX {psrad x, 1}, m1, m0, m3, m4
    packssdw m2, m0, m1 ; 0 2 4 6
    packssdw m3, m4, m3 ; 1 3 5 7
    vpermd m2, m5, m2
    vpermd m3, m5, m3
    mova ym0, ym2
    vextracti32x8 ym2, m2, 1
    mova ym1, ym3
    vextracti32x8 ym3, m3, 1
    jmp tx2q
; .pass2: second pass; .pass2_main does the arithmetic, the code here builds
; the four row groups in m4-m7 using index vectors derived from permB+2.
.pass2:
    call .pass2_main
    movu m4, [permB+2]
    vbroadcasti32x8 m12, [pw_2048_m2048+16]
    psrlq m7, m4, 8
    vpermi2q m4, m0, m3 ; 0 1 2 3
    psrlq m5, m7, 24
    vpermi2q m7, m0, m3 ; 12 13 14 15
    psrlq m6, m5, 8
    vpermq m5, m5, m1 ; 4 5 6 7
    vpermq m6, m6, m2 ; 8 9 10 11
; .pass2_end: shared with iflipadst_8x16; scales each row group by m12 with
; pmulhrsw and hands it to the idct_8x8 write helpers with m11 =
; pixel_10bpc_max and m10 = 0.
.pass2_end:
    vpbroadcastd m11, [pixel_10bpc_max]
    pxor m10, m10
    lea r6, [strideq*3]
    pmulhrsw m8, m12, m4
    call m(idct_8x8_internal_10bpc).write_8x4_start
    pmulhrsw m8, m12, m5
    call m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw m8, m12, m6
    call m(idct_8x8_internal_10bpc).write_8x4
    pmulhrsw m8, m12, m7
    jmp m(idct_8x8_internal_10bpc).write_8x4
ALIGN function_align
; .main: 16-point ADST first-pass core on m0-m7 (continues past this span).
.main:
    ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
    psubd m8, m2, m6 ; t6
    paddd m2, m6 ; t2
    psubd m6, m0, m4 ;
; t4
    ; (continuation of iadst_8x16_internal_10bpc.main)
    paddd m0, m4 ; t0
    psubd m4, m5, m1 ; t7
    paddd m5, m1 ; t3
    psubd m1, m7, m3 ; t5
    paddd m7, m3 ; t1
    REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
    REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
    vpbroadcastd m10, [o(pd_1567)]
    vpbroadcastd m11, [o(pd_3784)]
    ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
    ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
    vpbroadcastd m12, [o(pd_1448)]
    psubd m9, m6, m8 ; t7
    paddd m6, m8 ; out6
    psubd m3, m7, m5 ; t3
    paddd m7, m5 ; -out7
    psubd m5, m0, m2 ; t2
    paddd m0, m2 ; out0
    psubd m2, m1, m4 ; t6
    paddd m1, m4 ; -out1
    REPX {pmaxsd x, m14}, m5, m3, m2, m9
    REPX {pminsd x, m15}, m5, m3, m2, m9
    REPX {pmulld x, m12}, m5, m3, m2, m9
    vpbroadcastd m4, [o(pd_1)]
    psubd m8, m5, m3 ; (t2 - t3) * 1448
    paddd m3, m5 ; (t2 + t3) * 1448
    psubd m5, m2, m9 ; (t6 - t7) * 1448
    paddd m2, m9 ; (t6 + t7) * 1448
    ; Rounding biases are folded in here; "bias - x" also negates the values
    ; that are wanted with the opposite sign. The callers apply the shifts.
    vpbroadcastd m9, [o(pd_3072)]
    paddd m0, m4
    psubd m1, m4, m1
    paddd m10, m6, m4
    psubd m11, m4, m7
    paddd m2, m9
    paddd m8, m9
    vpbroadcastd m9, [o(pd_3071)]
    psubd m3, m9, m3
    psubd m9, m5
    ret
ALIGN function_align
; .fast_main: low-eob loader; reads the low 256 bits of the eight rows in
; the order the 8-point ADST expects, rounds them with the idct_8x16 helper
; and tail-jumps into the 8x8 ADST core.
.fast_main:
    mova ym0, [cq+64*0]
    mova ym4, [cq+64*2]
    mova ym1, [cq+64*7]
    mova ym5, [cq+64*5]
    mova ym2, [cq+64*4]
    mova ym6, [cq+64*6]
    mova ym3, [cq+64*3]
    mova ym7, [cq+64*1]
    call m(idct_8x16_internal_10bpc).round_input_fast
    jmp m(iadst_8x8_internal_10bpc).main
ALIGN function_align
; .pass2_main: second-pass arithmetic on 16-bit data; reorders with
; iadst8x16p, calls the 8bpc routine, then scales the saturating sum and
; difference of m2/m4 by m10 (pw_2896x8) with pmulhrsw.
.pass2_main:
    mova m8, [o(iadst8x16p)]
    REPX {vpermb x, m8, x}, m0, m1, m2, m3
    vpbroadcastd m10, [o(pw_2896x8)]
    punpckhdq m5, m0, m1
    punpckldq m0, m1
    punpckhdq m1, m2, m3
    punpckldq m2, m3
    lea r5, [o_base_8bpc]
    punpckhqdq m4, m0, m2 ; 12 3 14 1
    punpcklqdq m0, m2 ; 0 15 2 13
    punpckhqdq m6, m5, m1 ; 8 7 10 5
    punpcklqdq m5, m1 ; 4 11 6 9
    call m(iadst_8x16_internal_8bpc).main2
    paddsw m1, m2, m4
    psubsw m2, m4
    pmulhrsw m1, m10 ; -out7 out4 out6 -out5
    pmulhrsw m2, m10 ; out8 -out11 -out9 out10
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, identity, 35
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst

; iflipadst_8x16_internal_10bpc: shares every helper with iadst_8x16; the
; code here only assigns the .main results to m0-m7 in reversed order.
cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp eobd, 43
    jl .fast
    call m(idct_8x16_internal_10bpc).load
    call m(iadst_8x16_internal_10bpc).main
    psrad m7, m0, 1
    psrad m0, m11, 1
    psrad m6, m1, 1
    psrad m1, m10, 1
    psrad m5, m2, 12
    psrad m2, m9, 12
    psrad m4, m3, 12
    psrad m3, m8, 12
    jmp m(idct_8x16_internal_10bpc).pass1_end
.fast:
    call m(iadst_8x16_internal_10bpc).fast_main
    punpckhqdq m1, m3, m4 ; -out3 -out1
    punpcklqdq m3, m0 ; out2 out0
    punpckhqdq m0, m2 ; -out7 -out5
    punpcklqdq m4, m2 ; out6 out4
    psubd m1, m11, m1
    paddd m3, m11
    psubd m0, m11, m0
    paddd m4, m11
    jmp m(iadst_8x16_internal_10bpc).fast_end
; .pass2: same as the iadst_8x16 second pass with mirrored row selection
; and pw_m2048_2048 instead of pw_2048_m2048.
.pass2:
    call m(iadst_8x16_internal_10bpc).pass2_main
    movu m7, [permB+2]
    vbroadcasti32x8 m12, [pw_m2048_2048+16]
    psrlq m4, m7, 8
    vpermi2q m7, m3, m0 ; 3 2 1 0
    psrlq m5, m4, 24
    vpermi2q m4, m3, m0 ; 15 14 13 12
    psrlq m6, m5, 8
    vpermq m5, m5, m2 ; 11 10 9 8
    vpermq m6, m6, m1 ; 7 6 5 4
    jmp m(iadst_8x16_internal_10bpc).pass2_end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; iidentity_8x16_internal_10bpc: the first pass is just the idct_8x16 .load2
; scaling (multiply by pd_2896, round, >>12) followed by the shared packing
; exit.
cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call m(idct_8x16_internal_10bpc).load2
    jmp m(idct_8x16_internal_10bpc).pass1_end
; .pass2: computes 2*x + pmulhrsw(x, pw_1697x16) with saturating adds,
; transposes with the unpack network below and writes four row groups, each
; scaled by m7 (pw_2048).
.pass2:
    vpbroadcastd m8, [o(pw_1697x16)]
    pmulhrsw m4, m8, m0
    pmulhrsw m5, m8, m1
    pmulhrsw m6, m8, m2
    pmulhrsw m7, m8, m3
    REPX {paddsw x, x}, m0, m1, m2, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    vpbroadcastd m7, [o(pw_2048)]
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    vpbroadcastd m6, [o(pixel_10bpc_max)]
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckldq m2, m4, m1
    punpckhdq m4, m1
    pxor m5, m5
    punpckhqdq m1, m0, m2 ; 1 5 9 13
    punpcklqdq m0, m2 ; 0 4 8 12
    punpcklqdq m2, m3, m4 ; 2 6 10 14
    punpckhqdq m3, m4 ; 3 7 11 15
    lea r6, [strideq*3]
    pmulhrsw m0, m7
    call .write_8x4_start
    pmulhrsw m0, m7, m1
    call .write_8x4
    pmulhrsw m0, m7, m2
    call .write_8x4
    pmulhrsw m0, m7, m3
; .write_8x4: advances dst by one row and cq by 128 bytes, then falls
; through. The last group above reaches it by fall-through, so its ret ends
; the function.
.write_8x4:
    add dstq, strideq
    add cq, 64*2
; .write_8x4_start: gathers four 16-byte rows spaced 4*stride apart, adds
; m0, clamps to [m5 = 0, m6 = pixel_10bpc_max], stores them back and zeroes
; 128 bytes of the coefficient buffer.
.write_8x4_start:
    mova xm4, [dstq+strideq*0]
    vinserti32x4 ym4, [dstq+strideq*4], 1
    vinserti32x4 m4, [dstq+strideq*8], 2
    vinserti32x4 m4, [dstq+r6*4 ], 3
    mova [cq+64*0], m5
    mova [cq+64*1], m5
    paddw m4, m0
    pmaxsw m4, m5
    pminsw m4, m6
    mova [dstq+strideq*0], xm4
    vextracti32x4 [dstq+strideq*4], ym4, 1
    vextracti32x4 [dstq+strideq*8], m4, 2
    vextracti32x4 [dstq+r6*4 ], m4, 3
    ret

; INV_TXFM_16X8_FN: instantiates INV_TXFM_FN for size 16x8. The dct_dct
; variant also carries the DC-only code that other sizes jump into:
;   .dconly      - rounds and rescales the DC value in r6d
;   .dconly2     - final scale; m1 = broadcast DC + dconly_10bpc bias
;   .dconly_loop - two rows per iteration (32 bytes each); r3d is the
;                  remaining row count.
; paddsw followed by psubusw with the same bias m2 presumably clamps the
; result to the valid pixel range - TODO confirm against the dconly_10bpc
; table.
%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, %3, 16x8
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 8
.dconly:
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    add r6d, 384
    sar r6d, 9
.dconly2:
    vpbroadcastd m2, [o(dconly_10bpc)]
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    vpbroadcastw m1, r6d
    paddsw m1, m2
.dconly_loop:
    mova ym0, [dstq+strideq*0]
    vinserti32x8 m0, [dstq+strideq*1], 1
    paddsw m0, m1
    psubusw m0, m2
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub r3d, 2
    jg .dconly_loop
    RET
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity, -21
INV_TXFM_16X8_FN dct, flipadst
INV_TXFM_16X8_FN dct, adst

; idct_16x8_internal_10bpc: 16x8 inverse DCT, 10 bpc.
; The first pass loads the coefficients pre-multiplied by m12 (pd_2896),
; zeroes the buffer as it goes, and rounds with (x + m13) >> 12. Rows 8-15
; are read only when eob >= 43. (Continues past this span.)
cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    pmulld m4, m12, [cq+64*0] ; 0 1
    pmulld m9, m12, [cq+64*1] ; 2 3
    pmulld m8, m12, [cq+64*2] ; 4 5
    pmulld m7, m12, [cq+64*3] ; 6 7
    vpbroadcastd m13, [o(pd_2048)]
    pxor m2, m2
    mova m15, [o(permB)]
    REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
    psrlq m0, m15, 32
    REPX {paddd x, m13}, m4, m9, m8, m7
    vpbroadcastd m14, [o(clip_18b_min)]
    REPX {psrad x, 12 }, m4, m8, m9, m7
    mova m1, m0
    vpermi2q m0, m4, m8 ; 0 4
    cmp eobd, 43
    jl .fast
    pmulld m5, m12, [cq+64*4] ; 8 9
    pmulld m10, m12, [cq+64*5] ; 10 11
    pmulld m11, m12, [cq+64*6] ; 12 13
    pmulld m6, m12, [cq+64*7] ; 14 15
    REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
    REPX {paddd x, m13}, m5, m10, m11, m6
    REPX {psrad x, 12 }, m10, m5, m11, m6
    mova m2, m1
    vpermi2q m1, m9, m10 ; 2 10
    mova m3, m2
    vpermi2q m2, m5, m11 ; 8 12
    vpermi2q m3, m6, m7 ; 14 6
    ; (continuation of idct_16x8_internal_10bpc, full first pass: pair up
    ; the odd rows; m15 is reused as the permute index until it is reloaded
    ; with the clip maximum below)
    vpermt2q m4, m15, m11 ; 1 13
    vpermt2q m6, m15, m9 ; 15 3
    vpermt2q m5, m15, m8 ; 9 5
    vpermt2q m7, m15, m10 ; 7 11
    vpbroadcastd m15, [o(clip_18b_max)]
    call m(idct_8x8_internal_10bpc).main
    call .main
    jmp .pass1_end
; .fast: eob < 43; only rows 0-7 were loaded.
.fast:
    vpermi2q m1, m9, m7 ; 2 6
    vpermt2q m4, m15, m9 ; 1 3
    vpermt2q m7, m15, m8 ; 7 5
    vpbroadcastd m15, [o(clip_18b_max)]
    call m(idct_8x8_internal_10bpc).main_fast
    call .main_fast
; .pass1_end: final butterflies and shift via the idct_8x16 helper, then
; pack and transpose for the second pass.
.pass1_end:
    call m(idct_8x16_internal_10bpc).main_end
    mova m8, [o(permA)]
    psrlq m9, m8, 8
; .pass1_end2: entry used by the ADST variants, which arrive with m8/m9
; already holding their permutation vectors.
.pass1_end2:
    mova m10, m9
    mova m11, m8
    call .transpose_16x8
    jmp tx2q
; .pass2: the arithmetic is done by the 8bpc routine; m4/m5 are the row
; permutations and m11 the pmulhrsw factor used by .end/.write_16x4.
.pass2:
    lea r5, [o_base_8bpc]
    call m(idct_16x8_internal_8bpc).main
    movshdup m4, [permC]
    vpbroadcastd m11, [pw_2048]
    psrlq m5, m4, 8
; .end: shared store path (also entered from iidentity_16x8); writes the 8
; rows as two groups of 4. The second group falls through into .write_16x4,
; whose ret ends the function.
.end:
    vpbroadcastd m13, [pixel_10bpc_max]
    pxor m12, m12
    vpermq m8, m4, m0
    vpermq m9, m5, m1
    lea r6, [strideq*3]
    call .write_16x4
    vpermq m8, m4, m2
    vpermq m9, m5, m3
; .write_16x4: scales m8/m9 by m11 with pmulhrsw, then falls through.
.write_16x4:
    pmulhrsw m8, m11
    pmulhrsw m9, m11
; .write_16x4_noround: adds m8/m9 to four 32-byte destination rows, clamps
; to [m12 = 0, m13 = pixel_10bpc_max], stores, and advances dst by 4 rows.
.write_16x4_noround:
    mova ym10, [dstq+strideq*0]
    vinserti32x8 m10, [dstq+strideq*1], 1
    paddw m8, m10
    mova ym10, [dstq+strideq*2]
    vinserti32x8 m10, [dstq+r6 ], 1
    paddw m9, m10
    pmaxsw m8, m12
    pmaxsw m9, m12
    pminsw m8, m13
    pminsw m9, m13
    mova [dstq+strideq*0], ym8
    vextracti32x8 [dstq+strideq*1], m8, 1
    mova [dstq+strideq*2], ym9
    vextracti32x8 [dstq+r6 ], m9, 1
    lea dstq, [dstq+strideq*4]
    ret
ALIGN function_align
; .main_fast: odd half of the 16-point DCT using plain multiplies by
; broadcast constant pairs; joins .main2.
.main_fast: ; bottom half is zero
    vbroadcasti32x4 m6, [o(pd_4076_3920)]
    vbroadcasti32x4 m3, [o(pd_401_m1189)]
    vbroadcasti32x4 m5, [o(pd_m2598_1931)]
    vbroadcasti32x4 m9, [o(pd_3166_3612)]
    pmulld m6, m4 ; t15a t12a
    pmulld m4, m3 ; t8a t11a
    pmulld m5, m7 ; t9a t10a
    pmulld m7, m9 ; t14a t13a
    jmp .main2
; .main: odd half of the 16-point DCT, full input.
.main:
    ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
    ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
.main2:
    REPX {paddd x, m13}, m4, m6, m5, m7
    REPX {psrad x, 12 }, m4, m5, m6, m7
    paddd m9, m4, m5 ; t8 t11
    psubd m4, m5 ; t9 t10
    psubd m5, m6, m7 ; t14 t13
    paddd m6, m7 ; t15 t12
    REPX {pmaxsd x, m14}, m5, m4, m9, m6
    REPX {pminsd x, m15}, m5, m4, m9, m6
; .main3: finishes the 8-point (even) outputs held in m0-m2/m8 and the odd
; half in parallel.
.main3:
    psubd m3, m0, m1 ; dct8 out7 out6
    paddd m0, m1 ; dct8 out0 out1
    vbroadcasti32x4 m7, [o(pd_3784_m3784)]
    pmulld m7, m5
    vpmulld m5, [o(pd_1567)] {1to16}
    paddd m1, m2, m8 ; dct8 out3 out2
    psubd m2, m8 ; dct8 out4 out5
    vbroadcasti32x4 m8, [o(pd_1567_m1567)]
    pmulld m8, m4
    vpmulld m4, [o(pd_3784)] {1to16}
    REPX {pmaxsd x, m14}, m0, m1
    REPX {pminsd x, m15}, m0, m1
    paddd m7, m13
    paddd m5, m13
    paddd m7, m8
    psubd m5, m4
    psrad m7, 12 ; t14a t10a
    psrad m5, 12 ; t9a t13a
    punpckhqdq m4, m9, m7
    punpcklqdq m8, m9, m5
    punpckhqdq m5, m6, m5
    punpcklqdq m6, m7
    psubd m7, m8, m4 ; t11a t10
    paddd m8, m4 ; t8a t9
    psubd m4, m6, m5 ; t12a t13
    paddd m6, m5 ; t15a t14
    REPX {pmaxsd x, m14}, m4, m7
    REPX {pminsd x, m15}, m4, m7
    pmulld m4, m12
    pmulld m7, m12
    REPX {pmaxsd x, m14}, m2, m3, m6, m8
    REPX {pminsd x, m15}, m2, m3, m6, m8
    paddd m4, m13
    paddd m5, m4, m7
    psubd m4, m7
    psrad m4, 12 ; t11 t10a
    psrad m5, 12 ; t12 t13a
    ret
ALIGN function_align
; .transpose_16x8: packs m0-m7 to 16 bit with saturation and transposes.
; m8-m11 are dword permutation index vectors supplied by the caller; on
; return the result is in m0-m3.
.transpose_16x8:
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    vpermi2d m8, m0, m2
    vpermt2d m0, m9, m2
    vpermi2d m10, m1, m3
    vpermi2d m11, m1, m3
    punpckhwd m3, m8, m0
    punpcklwd m1, m8, m0
    punpckhwd m4, m10, m11
    punpcklwd m2, m10, m11
    punpckldq m0, m1, m2
    punpckhdq m1, m2
    punpckldq m2, m3, m4
    punpckhdq m3, m4
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, identity, -21
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, adst

; iadst_16x8_internal_10bpc: 16x8 inverse ADST, 10 bpc.
; First pass: .main_pass1 leaves the eight result vectors in m0-m3 and
; m5-m8. Here m9 = pd_1 is added as rounding bias; "m9 - x" also negates,
; and the paddd/psubd forms move m5-m8 down into m4-m7.
cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    call .main_pass1
    vpbroadcastd m9, [o(pd_1)]
    paddd m0, m9
    psubd m1, m9, m1
    paddd m2, m9
    psubd m3, m9, m3
    paddd m4, m9, m5
    psubd m5, m9, m6
    paddd m6, m9, m7
    psubd m7, m9, m8
; .pass1_end: shared with iflipadst_16x8; >>1, then the idct_16x8
; pack/transpose exit with the permA-derived vectors in m9/m8.
.pass1_end:
    mova m9, [o(permA)]
    psrlq m8, m9, 8
    REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
    jmp m(idct_16x8_internal_10bpc).pass1_end2
; .pass2: .main_pass2 produces the scaled rows and sets up m11-m13 and r6;
; rows are stored without further rounding.
.pass2:
    call .main_pass2
    vpermq m8, m11, m0
    vpermq m9, m11, m1
    call m(idct_16x8_internal_10bpc).write_16x4_noround
    vpermq m8, m11, m2
    vpermq m9, m11, m3
    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
ALIGN function_align
.main_pass1:
    ; Body of iadst_16x8_internal_10bpc.main_pass1: loads rows pre-multiplied
    ; by m12 (pd_2896), zeroes the buffer, rounds with (x + m13) >> 12 and
    ; pairs the rows with permB-derived indices. Rows 8-15 are read only
    ; when eob >= 43.
    vpbroadcastd m12, [o(pd_2896)]
    pmulld m2, m12, [cq+64*0]
    pmulld m7, m12, [cq+64*1]
    pmulld m1, m12, [cq+64*2]
    pmulld m5, m12, [cq+64*3]
    vpbroadcastd m13, [o(pd_2048)]
    pxor m4, m4
    mova m10, [o(permB)]
    REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
    REPX {paddd x, m13}, m2, m7, m1, m5
    psrlq m6, m10, 32
    REPX {psrad x, 12 }, m2, m7, m1, m5
    mova m0, m6
    vpermi2q m0, m2, m7 ; 0 2
    vpermt2q m7, m10, m2 ; 3 1
    mova m2, m6
    vpermi2q m2, m1, m5 ; 4 6
    vpermt2q m5, m10, m1 ; 7 5
    cmp eobd, 43
    jl .main_fast
    pmulld m8, m12, [cq+64*4]
    pmulld m3, m12, [cq+64*5]
    pmulld m9, m12, [cq+64*6]
    pmulld m1, m12, [cq+64*7]
    REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
    REPX {paddd x, m13}, m8, m3, m9, m1
    REPX {psrad x, 12 }, m8, m3, m9, m1
    mova m4, m6
    vpermi2q m4, m8, m3 ; 8 10
    vpermt2q m3, m10, m8 ; 11 9
    vpermi2q m6, m9, m1 ; 12 14
    vpermt2q m1, m10, m9 ; 15 13
; .main: full-input first stage.
; NOTE(review): "_" is passed as the rounding argument, so the bias is
; presumably left for .main2 to add - confirm against ITX_MULSUB_2D.
.main:
    ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
    ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
    ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
    ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
    jmp .main2
; .main_fast: reached when eob < 43 (rows 8-15 not loaded); plain multiplies
; by broadcast constant pairs.
.main_fast:
    vbroadcasti32x4 m1, [o(pd_4091_3973)]
    vbroadcasti32x4 m8, [o(pd_201_995)]
    vbroadcasti32x4 m3, [o(pd_3703_3290)]
    vbroadcasti32x4 m9, [o(pd_1751_2440)]
    vbroadcasti32x4 m4, [o(pd_2751_2106)]
    vbroadcasti32x4 m10, [o(pd_3035_3513)]
    vbroadcasti32x4 m6, [o(pd_1380_601)]
    vbroadcasti32x4 m11, [o(pd_3857_4052)]
    pmulld m1, m0
    pmulld m0, m8
    pmulld m3, m2
    pmulld m2, m9
    pmulld m4, m5
    pmulld m5, m10
    pmulld m6, m7
    pmulld m7, m11
; .main2: common ADST-16 butterfly network. m1/m3 are negated while the
; rounding bias is applied (m13 - x). The k1 mask 0x3333 selects the dword
; lanes whose sign is flipped by the masked "0 - x" subtractions at the end.
.main2:
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    REPX {psubd x, m13, x}, m1, m3
    REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
    psubd m8, m0, m4 ; t8a t10a
    paddd m0, m4 ; t0a t2a
    psubd m4, m1, m5 ; t9a t11a
    paddd m1, m5 ; t1a t3a
    psubd m5, m2, m6 ; t12a t14a
    paddd m2, m6 ; t4a t6a
    psubd m6, m3, m7 ; t13a t15a
    paddd m3, m7 ; t5a t7a
    REPX {pmaxsd x, m14}, m8, m4, m5, m6
    REPX {pminsd x, m15}, m8, m4, m5, m6
    vbroadcasti32x4 m11, [o(pd_4017_2276)]
    vbroadcasti32x4 m10, [o(pd_799_3406)]
    ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
    ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
    REPX {pmaxsd x, m14}, m0, m2, m1, m3
    REPX {pminsd x, m15}, m0, m2, m1, m3
    psubd m7, m0, m2 ; t4 t6
    paddd m0, m2 ; t0 t2
    psubd m2, m1, m3 ; t5 t7
    paddd m1, m3 ; t1 t3
    psubd m3, m4, m6 ; t12a t14a
    paddd m4, m6 ; t8a t10a
    psubd m6, m8, m5 ; t13a t15a
    paddd m8, m5 ; t9a t11a
    REPX {pmaxsd x, m14}, m7, m3, m2, m6
    REPX {pminsd x, m15}, m7, m3, m2, m6
    punpcklqdq m5, m3, m7 ; t12a t4
    punpckhqdq m3, m7 ; t14a t6
    punpckhqdq m7, m6, m2 ; t15a t7
    punpcklqdq m6, m2 ; t13a t5
    vpbroadcastd m11, [o(pd_1567)]
    vpbroadcastd m10, [o(pd_3784)]
    ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
    ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
    REPX {pmaxsd x, m14}, m0, m4, m1, m8
    REPX {pminsd x, m15}, m0, m4, m1, m8
    punpckhqdq m2, m4, m0 ; t10a t2
    punpcklqdq m4, m0 ; t8a t0
    punpckhqdq m0, m8, m1 ; t11a t3
    punpcklqdq m8, m1 ; t9a t1
    paddd m1, m6, m7 ; out2 -out3
    psubd m6, m7 ; t14a t6
    paddd m7, m5, m3 ; -out13 out12
    psubd m5, m3 ; t15a t7
    psubd m3, m8, m0 ; t11 t3a
    paddd m8, m0 ; out14 -out15
    paddd m0, m4, m2 ; -out1 out0
    psubd m4, m2 ; t10 t2a
    REPX {pmaxsd x, m14}, m6, m5, m3, m4
    mov r6d, 0x3333
    REPX {pminsd x, m15}, m6, m5, m3, m4
    kmovw k1, r6d
    REPX {pmulld x, m12}, m6, m5, m3, m4
    pxor m9, m9
    REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
    paddd m6, m13
    paddd m4, m13
    paddd m2, m6, m5 ; -out5 out4
    psubd m6, m5 ; out10 -out11
    psubd m5, m4, m3 ; -out9 out8
    paddd m3, m4 ; out6 -out7
    REPX {psrad x, 12}, m2, m3, m5, m6
    REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
    ret
ALIGN function_align
; .main_pass2: second pass; calls the 8bpc routine with r5 pointing at
; o_base_8bpc, scales m0/m1 by m6 (set by that routine - not visible here)
; and prepares m11 (row permutation), m12 = 0, m13 = pixel_10bpc_max and
; r6 = 3*stride for the store helper.
.main_pass2:
    lea r5, [o_base_8bpc]
    pshufd m4, m0, q1032
    pshufd m5, m1, q1032
    call m(iadst_16x8_internal_8bpc).main_pass2
    movshdup m11, [permC]
    pmulhrsw m0, m6
    pmulhrsw m1, m6
    vpbroadcastd m13, [pixel_10bpc_max]
    pxor m12, m12
    lea r6, [strideq*3]
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, identity, -21
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst

; iflipadst_16x8_internal_10bpc: same core as iadst_16x8; the first pass
; assigns the results to m0-m7 in reversed order while applying the pd_1
; bias and sign fix-ups.
cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_10bpc).main_pass1
    vpbroadcastd m9, [o(pd_1)]
    psubd m4, m9, m3
    paddd m3, m9, m5
    paddd m5, m9, m2
    psubd m2, m9, m6
    psubd m6, m9, m1
    paddd m1, m9, m7
    paddd m7, m9, m0
    psubd m0, m9, m8
    jmp m(iadst_16x8_internal_10bpc).pass1_end
; .pass2: as iadst_16x8 .pass2, with the permutation shifted by one byte and
; the row registers consumed in the order m3, m2, m1, m0.
.pass2:
    call m(iadst_16x8_internal_10bpc).main_pass2
    psrlq m11, 8
    vpermq m8, m11, m3
    vpermq m9, m11, m2
    call m(idct_16x8_internal_10bpc).write_16x4_noround
    vpermq m8, m11, m1
    vpermq m9, m11, m0
    jmp m(idct_16x8_internal_10bpc).write_16x4_noround

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; iidentity_16x8_internal_10bpc: the first pass scales the coefficients in
; two steps. idct_8x16 .load2 applies (x*pd_2896 + pd_2048) >> 12; then the
; values are multiplied by pd_5793 and .round applies (x + m13) >> 12 with
; m13 replaced by pd_3072. The buffer is zeroed and the result transposed.
cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
    call m(idct_8x16_internal_10bpc).load2
    vpbroadcastd m8, [o(pd_5793)]
    vpbroadcastd m13, [o(pd_3072)]
    pxor m10, m10
    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7
    call m(idct_8x16_internal_10bpc).round
    psrlq m8, [o(permA)], 16
    psrlq m9, m8, 8
    mova m10, m8
    mova m11, m9
    call m(idct_16x8_internal_10bpc).transpose_16x8
    jmp tx2q
; .pass2: both permutation registers hold the same vector; the pmulhrsw
; factor is pw_4096.
.pass2:
    movshdup m4, [o(permC)]
    vpbroadcastd m11, [o(pw_4096)]
    mova m5, m4
    jmp m(idct_16x8_internal_10bpc).end

; INV_TXFM_16X16_FN: instantiates INV_TXFM_FN for size 16x16. For dct_dct
; the DC value is scaled by 181 with +640 rounding and >>10, bit 4 of r3d
; is set, and control enters the 16x8 .dconly2 tail.
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 16
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, 28
INV_TXFM_16X16_FN dct, flipadst
INV_TXFM_16X16_FN dct, adst

; idct_16x16_internal_10bpc: 16x16 inverse DCT, 10 bpc.
; The first pass takes .fast when eob < 36. The full path feeds the even
; rows to the idct_8x16 core and the odd rows (m16-m23) to .main below.
cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp eobd, 36
    jl .fast
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 2]
    mova m2, [cq+64* 4]
    mova m3, [cq+64* 6]
    mova m4, [cq+64* 8]
    mova m5, [cq+64*10]
    mova m6, [cq+64*12]
    mova m7, [cq+64*14]
%if WIN64
    ; xmm6/xmm7 are saved in the first 32 bytes of the coefficient buffer
    ; (row 0 is already in m0) and restored at .pass1_end.
    movaps [cq+16*0], xmm6
    movaps [cq+16*1], xmm7
%endif
    call m(idct_8x16_internal_10bpc).main
    mova m16, [cq+64* 1]
    mova m17, [cq+64* 3]
    mova m18, [cq+64* 5]
    mova m19, [cq+64* 7]
    mova m20, [cq+64* 9]
    mova m21, [cq+64*11]
    mova m22, [cq+64*13]
    mova m23, [cq+64*15]
    call .main
    call .main_end
.pass1_end:
%if WIN64
    movaps xmm6, [cq+16*0]
    movaps xmm7, [cq+16*1]
%endif
    vzeroupper
; .pass1_end2: transposes the packed results.
.pass1_end2:
    call .main_end3
; .pass1_end3: zeroes all 16 coefficient rows, 4 per iteration from the top
; down, then jumps through tx2 to the second pass.
.pass1_end3:
    mov r6d, 64*12
    pxor m8, m8
.zero_loop:
    mova [cq+r6+64*3], m8
    mova [cq+r6+64*2], m8
    mova [cq+r6+64*1], m8
    mova [cq+r6+64*0], m8
    sub r6d, 64*4
    jge .zero_loop
    jmp tx2q
; .pass2: the arithmetic is done by the 8bpc routine; rows are then
; permuted into the register order .pass2_end consumes.
.pass2:
    lea r5, [o_base_8bpc]
    call m(idct_16x16_internal_8bpc).main
    movshdup m12, [permC]
    vpbroadcastd m11, [pw_2048]
    psrlq m13, m12, 8
    vpermq m8, m12, m0
    vpermq m0, m13, m7
    vpermq m7, m13, m1
    vpermq m1, m12, m6
    vpermq m6, m12, m2
    vpermq m2, m13, m5
    vpermq m5, m13, m3
    vpermq m3, m12, m4
; .pass2_end: shared store path (also entered from iadst_16x16); scales
; each register by m11 with pmulhrsw and writes 16 rows in four groups.
.pass2_end:
    lea r6, [strideq*3]
    vpbroadcastd m13, [pixel_10bpc_max]
    pxor m12, m12
    pmulhrsw m8, m11, m8
    pmulhrsw m9, m11, m7
    call m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw m8, m11, m6
    pmulhrsw m9, m11, m5
    call m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw m8, m11, m3
    pmulhrsw m9, m11, m2
    call m(idct_16x8_internal_10bpc).write_16x4_noround
    pmulhrsw m8, m11, m1
    pmulhrsw m9, m11, m0
    jmp m(idct_16x8_internal_10bpc).write_16x4_noround
; .fast: eob < 36; only the low 256 bits of rows 0-7 are loaded and the
; 16x8 fast helpers do the work, with m11 = pd_2 for the final step.
.fast:
    mova ym0, [cq+64*0]
    mova ym2, [cq+64*4]
    movshdup m8, [o(permB)]
    mova ym1, [cq+64*2]
    mova ym3, [cq+64*6]
    mova ym4, [cq+64*1]
    mova ym5, [cq+64*3]
    mova ym6, [cq+64*5]
    mova ym7, [cq+64*7]
    vpermt2q m0, m8, m2 ; 0 4
    vpermt2q m1, m8, m3 ; 2 6
    vpermt2q m4, m8, m5 ; 1 3
    vpermt2q m7, m8, m6 ; 7 5
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd m11, [o(pd_2)]
    call m(idct_8x16_internal_10bpc).main_end2
    mova m8, [o(permA)]
    psrlq m9, m8, 8
    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
ALIGN function_align
; .main_fast2_rect2: rounds m16/m17 first, then falls into .main_fast2.
.main_fast2_rect2:
    REPX {paddd x, m13}, m16, m17
    REPX {psrad x, 12 }, m16, m17
; .main_fast2: odd-half variant that reads only m16 and m17 as inputs;
; skips straight to .main3.
.main_fast2:
    pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a
    pmulld m9, m16, [o(pd_401)] {1to16} ; t8a
    pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a
    pmulld m17, [o(pd_3920)] {1to16} ; t12a
    psubd m18, m13, m18
    REPX {paddd x, m13}, m22, m9, m17
    REPX {psrad x, 12 }, m18, m22, m9, m17
    mova m20, m9
    mova m16, m18
    mova m23, m22
    mova m19, m17
    jmp .main3
; .main_fast_rect2: rounds m16-m19 first, then falls into .main_fast.
.main_fast_rect2:
    REPX {paddd x, m13}, m16, m17, m18, m19
    REPX {psrad x, 12 }, m16, m17, m18, m19
; .main_fast: odd-half variant that reads only m16-m19 as inputs.
.main_fast:
    pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a
    pmulld m16, [o(pd_401)] {1to16} ; t8a
    pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a
    pmulld m19, [o(pd_3166)] {1to16} ; t14a
    pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a
    pmulld m17, [o(pd_3920)] {1to16} ; t12a
    pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a
    pmulld m18, [o(pd_1931)] {1to16} ; t10a
    ; m20/m22 are negated and biased here, so the shared rounding helper is
    ; entered at .round2, which skips their bias
    psubd m20, m13, m20
    psubd m22, m13, m22
    call .round2
    jmp .main2
; .main_rect2: applies .round to m16-m23 first, then falls into .main.
.main_rect2:
    call .round
; .main: odd half of the 16-point DCT on m16-m23, combined with the
; 8-point results already held in m0-m6 and m8.
.main:
    ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a
    ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a
    ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a
    ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a
    call .round
.main2:
    paddd m9, m20, m16 ; t8
    psubd m20, m16, m20 ; t9
    psubd m16, m22, m18 ; t10
    paddd m18, m22 ; t11
    paddd m22, m23, m19 ; t15
    psubd m23, m19 ; t14
    psubd m19, m17, m21 ; t13
    paddd m17, m21 ; t12
    REPX {pmaxsd x, m14}, m20, m23, m16, m19
    REPX {pminsd x, m15}, m20, m23, m16, m19
    REPX {pmaxsd x, m14}, m9, m18, m22, m17
    REPX {pminsd x, m15}, m9, m18, m22, m17
.main3:
    vpbroadcastd m11, [o(pd_3784)]
    vpbroadcastd m10, [o(pd_1567)]
    ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
    ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
    paddd m21, m20, m19 ; t14
    psubd m20, m19 ; t13
    psubd m19, m9, m18 ; t11a
    paddd m9, m18 ; t8a
    psubd m18, m23, m16 ; t10
    paddd m16, m23 ; t9
    psubd m23, m22, m17 ; t12a
    paddd m22, m17 ; t15a
    REPX {pmaxsd x, m14}, m20, m23, m18, m19
    REPX {pminsd x, m15}, m20, m23, m18, m19
    REPX {pmulld x, m12}, m20, m23, m18, m19
    psubd m7, m0, m6 ; dct8 out7
    ; (continuation of idct_16x16_internal_10bpc.main3: finish the 8-point
    ; outputs, clamp them, and complete the t10a/t11/t12/t13a terms with
    ; (x + m13) >> 12)
    paddd m0, m6 ; dct8 out0
    psubd m6, m1, m5 ; dct8 out6
    paddd m1, m5 ; dct8 out1
    REPX {pmaxsd x, m14}, m7, m0, m6, m1
    psubd m5, m2, m4 ; dct8 out5
    paddd m2, m4 ; dct8 out2
    REPX {pminsd x, m15}, m7, m0, m6, m1
    psubd m4, m3, m8 ; dct8 out4
    paddd m3, m8 ; dct8 out3
    REPX {pmaxsd x, m14}, m5, m2, m4, m3
    paddd m20, m13
    paddd m23, m13
    REPX {pminsd x, m15}, m5, m2, m4, m3
    psubd m17, m20, m18 ; t10a
    paddd m20, m18 ; t13a
    REPX {pmaxsd x, m14}, m22, m21, m16, m9
    psubd m18, m23, m19 ; t11
    paddd m19, m23 ; t12
    REPX {pminsd x, m15}, m22, m21, m16, m9
    REPX {psrad x, 12 }, m20, m19, m18, m17
    ret
; .main_end: loads m11 = pd_2 and falls into .main_end2.
.main_end:
    vpbroadcastd m11, [o(pd_2)]
; .main_end2: m11 is added to the even half as rounding bias before the
; final butterflies, then used as the per-lane shift count for vpsravd;
; each out[i] is then packed with out[i+8] into 16-bit lanes.
.main_end2:
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    psubd m23, m0, m22 ; out15
    paddd m0, m22 ; out0
    psubd m22, m1, m21 ; out14
    paddd m1, m21 ; out1
    psubd m21, m2, m20 ; out13
    paddd m2, m20 ; out2
    psubd m20, m3, m19 ; out12
    paddd m3, m19 ; out3
    psubd m19, m4, m18 ; out11
    paddd m4, m18 ; out4
    psubd m18, m5, m17 ; out10
    paddd m5, m17 ; out5
    psubd m17, m6, m16 ; out9
    paddd m6, m16 ; out6
    psubd m16, m7, m9 ; out8
    paddd m7, m9 ; out7
    REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
                           m4, m20, m5, m21, m6, m22, m7, m23
    packssdw m0, m16
    packssdw m1, m17
    packssdw m2, m18
    packssdw m3, m19
    packssdw m4, m20
    packssdw m5, m21
    packssdw m6, m22
    packssdw m7, m23
    ret
; .main_end3: 16x16 transpose of the packed 16-bit data in m0-m7 using
; word/dword unpacks followed by 128-bit lane shuffles (continues past this
; span).
.main_end3:
    punpckhwd m8, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m4, m5
    punpcklwd m4, m5
    punpcklwd m5, m6, m7
    punpckhwd m6, m7
    punpckhdq m7, m0, m2
    punpckldq m0, m2
    punpckhdq m2, m8, m1
    punpckldq m8, m1
    punpckhdq m1, m4, m5
    punpckldq m4, m5
    punpckhdq m5, m3, m6
    punpckldq m3, m6
    vshufi32x4 m6, m0, m4, q3232
    vinserti32x8 m0, ym4, 1
    vinserti32x8 m4, m8, ym3, 1
    vshufi32x4 m8, m3, q3232
    vinserti32x8 m3, m7, ym1, 1
    vshufi32x4 m7, m1, q3232
    vshufi32x4 m1, m2, m5, q3232
    vinserti32x8 m2, ym5, 1
    vshufi32x4 m5, m7, m1, q2020 ; 10 11
    vshufi32x4 m7, m1, q3131 ; 14 15
    vshufi32x4 m1, m3, m2, q2020 ; 2 3
    vshufi32x4 m3, m2, q3131 ; 6 7
    vshufi32x4 m2, m0, m4, q3131 ;
4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 ret ALIGN function_align .round: paddd m20, m13 paddd m22, m13 .round2: paddd m16, m13 paddd m18, m13 .round3: REPX {psrad x, 12 }, m16, m18, m20, m22 REPX {paddd x, m13}, m17, m19, m21, m23 REPX {psrad x, 12 }, m17, m19, m21, m23 ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, flipadst INV_TXFM_16X16_FN adst, adst cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 36 jl .fast call .main_pass1 packssdw m0, m16 packssdw m1, m17 packssdw m2, m18 packssdw m3, m19 packssdw m4, m5, m20 packssdw m5, m6, m21 packssdw m6, m7, m22 packssdw m7, m8, m23 jmp m(idct_16x16_internal_10bpc).pass1_end .fast: call .main_pass1_fast vpbroadcastd m9, [o(pd_2)] paddd m0, m9 psubd m1, m9, m1 paddd m2, m9 psubd m3, m9, m3 paddd m4, m9, m5 psubd m5, m9, m6 paddd m6, m9, m7 psubd m7, m9, m8 .pass1_fast_end: mova m9, [o(permA)] psrlq m8, m9, 8 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 .pass1_fast_end2: mova m10, m9 mova m11, m8 call m(idct_16x8_internal_10bpc).transpose_16x8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7 jmp tx2q .pass2: lea r5, [o_base_8bpc] call m(iadst_16x16_internal_8bpc).main_pass2b movshdup m12, [permC] mova m11, [pw_2048_m2048] psrlq m13, m12, 8 vpermq m8, m13, m0 vpermq m0, m12, m7 vpermq m7, m13, m1 vpermq m1, m12, m6 vpermq m6, m13, m2 vpermq m2, m12, m5 vpermq m5, m13, m3 vpermq m3, m12, m4 jmp m(idct_16x16_internal_10bpc).pass2_end ALIGN function_align .main_pass1: mova m0, [cq+64* 0] %if WIN64 movaps [cq+16*0], xmm6 movaps [cq+16*1], xmm7 %endif mova m23, [cq+64*15] vpbroadcastd m13, [o(pd_2048)] ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0 mova m7, [cq+64* 7] mova m16, [cq+64* 8] ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8 mova m2, [cq+64* 2] mova m21, [cq+64*13] ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2 mova m5, [cq+64* 5] mova 
m18, [cq+64*10] ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10 mova m4, [cq+64* 4] mova m19, [cq+64*11] ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4 mova m3, [cq+64* 3] mova m20, [cq+64*12] ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12 mova m6, [cq+64* 6] mova m17, [cq+64* 9] ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6 mova m1, [cq+64* 1] mova m22, [cq+64*14] ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] psubd m9, m23, m7 ; t9a paddd m23, m7 ; t1a psubd m7, m2, m18 ; t10a paddd m18, m2 ; t2a REPX {pmaxsd x, m14}, m9, m23, m7, m18 psubd m2, m17, m1 ; t15a paddd m17, m1 ; t7a REPX {pminsd x, m15}, m9, m23, m7, m18 psubd m1, m21, m5 ; t11a paddd m21, m5 ; t3a REPX {pmaxsd x, m14}, m2, m17, m1, m21 psubd m5, m4, m20 ; t12a paddd m4, m20 ; t4a REPX {pminsd x, m15}, m2, m17, m1, m21 psubd m20, m19, m3 ; t13a paddd m19, m3 ; t5a REPX {pmaxsd x, m14}, m5, m4, m20, m19 psubd m8, m6, m22 ; t14a paddd m6, m22 ; t6a REPX {pminsd x, m15}, m5, m4, m20, m19 psubd m22, m0, m16 ; t8a paddd m16, m0 ; t0a REPX {pmaxsd x, m14}, m8, m6, m22, m16 vpbroadcastd m11, [o(pd_4017)] vpbroadcastd m10, [o(pd_799)] REPX {pminsd x, m15}, m8, m6, m22, m16 ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 vpbroadcastd m11, [o(pd_2276)] vpbroadcastd m10, [o(pd_3406)] ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10 ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 paddd m0, m16, m4 ; t0 psubd m16, m4 ; t4 psubd m3, m23, m19 ; t5 paddd m23, m19 ; t1 REPX {pmaxsd x, m14}, m0, m16, m3, m23 psubd m19, m18, m6 ; t6 paddd m18, m6 ; t2 REPX {pminsd x, m15}, m0, m16, m3, m23 psubd m6, m21, m17 ; t7 paddd m21, m17 ; t3 REPX {pmaxsd x, m14}, m19, m18, m6, m21 paddd m17, m9, m20 ; t8a psubd m9, m20 ; t12a REPX {pminsd x, m15}, m19, m18, m6, m21 psubd m20, m22, m5 ; t13a paddd m22, m5 ; t9a REPX {pmaxsd x, m14}, m17, m9, m20, m22 psubd m5, m1, 
m2 ; t14a paddd m1, m2 ; t10a REPX {pminsd x, m15}, m17, m9, m20, m22 psubd m2, m7, m8 ; t15a paddd m7, m8 ; t11a REPX {pmaxsd x, m14}, m5, m1, m2, m7 vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] REPX {pminsd x, m15}, m5, m1, m2, m7 ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 psubd m8, m0, m18 ; t2a paddd m0, m18 ; out0 psubd m18, m23, m21 ; t3a paddd m23, m21 ; -out15 paddd m21, m9, m5 ; -out13 psubd m9, m5 ; t15a psubd m5, m3, m6 ; t6 paddd m3, m6 ; -out3 REPX {pmaxsd x, m14}, m8, m18, m9, m5 psubd m6, m20, m2 ; t14a paddd m2, m20 ; out2 paddd m20, m16, m19 ; out12 psubd m16, m19 ; t7 REPX {pminsd x, m15}, m8, m18, m9, m5 psubd m19, m22, m7 ; t11 paddd m22, m7 ; out14 psubd m7, m17, m1 ; t10 paddd m1, m17 ; -out1 REPX {pmaxsd x, m14}, m6, m16, m19, m7 vpbroadcastd m12, [o(pd_1448)] vpbroadcastd m4, [o(pd_2)] vpbroadcastd m10, [o(pd_5120)] vpbroadcastd m11, [o(pd_5119)] REPX {pminsd x, m15}, m6, m16, m19, m7 psubd m17, m7, m19 ; -out9 paddd m7, m19 ; out6 psubd m19, m5, m16 ; -out11 paddd m5, m16 ; out4 REPX {pmulld x, m12}, m17, m7, m19, m5 psubd m16, m8, m18 ; out8 paddd m8, m18 ; -out7 psubd m18, m6, m9 ; out10 paddd m6, m9 ; -out5 REPX {pmulld x, m12}, m16, m8, m18, m6 REPX {paddd x, m4 }, m0, m2, m20, m22 REPX {psubd x, m4, x}, m1, m3, m21, m23 REPX {paddd x, m10 }, m7, m5, m16, m18 REPX {psubd x, m11, x}, m17, m19, m8, m6 REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 ret ALIGN function_align .main_pass1_fast: mova ym0, [cq+64*0] mova ym1, [cq+64*2] movshdup m8, [o(permB)] mova ym6, [cq+64*1] mova ym7, [cq+64*3] mova ym2, [cq+64*4] mova ym3, [cq+64*6] mova ym4, [cq+64*5] mova ym5, [cq+64*7] vpermt2q m0, m8, m1 ; 0 2 vpermt2q m7, m8, m6 ; 3 1 vpermt2q m2, m8, m3 ; 4 6 vpermt2q m5, m8, m4 ; 7 5 vpbroadcastd m13, [o(pd_2048)] 
; End of iadst_16x16_internal_10bpc's .main_pass1_fast: load the pd_2896
; constant and tail-jump into the shared iadst_16x8 .main_fast code.
    vpbroadcastd    m12, [o(pd_2896)]
    jmp m(iadst_16x8_internal_10bpc).main_fast

; 16x16 entry points that pair flipadst with dct/adst/flipadst.
; INV_TXFM_16X16_FN is a macro defined outside this chunk.
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

; iflipadst_16x16_internal_10bpc
; Pass 1 selects the full or the reduced (.fast) path from eob (threshold 36)
; and reuses the iadst_16x16 pass-1 kernels; what is done here is the packing
; (full path) or the rounding/negation (.fast path) of their results.
; .pass2 runs the 8bpc iadst pass-2 kernel and permutes the registers before
; joining idct_16x16's .pass2_end.
cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    cmp             eobd, 36
    jl .fast
    call m(iadst_16x16_internal_10bpc).main_pass1
    ; Going by the trailing comments in .main_pass1, out0-out7 end up in
    ; m0,m1,m2,m3,m5,m6,m7,m8 and out8-out15 in m16-m23. They are packed to
    ; words in reversed order, so that m0 holds out15|out7 and m7 out8|out0.
    packssdw        m4, m19, m3     ; out11 out3
    packssdw        m3, m20, m5     ; out12 out4
    packssdw        m5, m18, m2     ; out10 out2
    packssdw        m2, m21, m6     ; out13 out5
    packssdw        m6, m17, m1     ; out9  out1
    packssdw        m1, m22, m7     ; out14 out6
    packssdw        m7, m16, m0     ; out8  out0
    packssdw        m0, m23, m8     ; out15 out7
    jmp m(idct_16x16_internal_10bpc).pass1_end
.fast:
    call m(iadst_16x16_internal_10bpc).main_pass1_fast
    vpbroadcastd    m9, [o(pd_2)]
    ; Every result becomes either pd_2 + x or pd_2 - x and is moved to a
    ; different register on the way (the chain below reads each source before
    ; it is overwritten, so the statement order matters).
    psubd           m4, m9, m3
    paddd           m3, m9, m5
    paddd           m5, m9, m2
    psubd           m2, m9, m6
    psubd           m6, m9, m1
    paddd           m1, m9, m7
    paddd           m7, m9, m0
    psubd           m0, m9, m8
    jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
.pass2:
    lea             r5, [o_base_8bpc]   ; r5 = o_base_8bpc before calling 8bpc code
    call m(iadst_16x16_internal_8bpc).main_pass2b
    movshdup        m12, [permC]
    movu            m11, [pw_m2048_2048]
    psrlq           m13, m12, 8         ; second vpermq index vector derived from m12
    ; m4-m7 are permuted with m13 and moved up by one register (into m5-m8);
    ; m0-m3 are permuted in place with m12.
    vpermq          m8, m13, m7
    vpermq          m7, m13, m6
    vpermq          m6, m13, m5
    vpermq          m5, m13, m4
    vpermq          m3, m12, m3
    vpermq          m2, m12, m2
    vpermq          m1, m12, m1
    vpermq          m0, m12, m0
    jmp m(idct_16x16_internal_10bpc).pass2_end

; 16x16 entry points using the identity transform (macro defined elsewhere;
; the meaning of the third argument is not visible in this chunk).
INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity

; iidentity_16x16_internal_10bpc
; Pass 1 scales the 32-bit coefficients as (x * pd_5793 + pd_5120) >> 13 and
; packs them to 16 bit. r6 is a working copy of cq that the load helpers
; advance. eob < 36 takes the reduced .fast path.
; Pass 2 computes 2*x + pmulhrsw(x, pw_1697x16) and hands the rows to
; idct_16x8's .write_16x4.
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
    vpbroadcastd    m10, [o(pd_5793)]
    vpbroadcastd    m11, [o(pd_5120)]
    mov             r6, cq
    cmp             eobd, 36
    jl .fast
    ; Four helper calls, each returning four scaled registers in m6-m9 that
    ; are packed pairwise into m0-m7.
    call .pass1_main
    packssdw        m0, m6, m8
    packssdw        m1, m7, m9
    call .pass1_main
    packssdw        m2, m6, m8
    packssdw        m3, m7, m9
    call .pass1_main
    packssdw        m4, m6, m8
    packssdw        m5, m7, m9
    call .pass1_main
    packssdw        m6, m8
    packssdw        m7, m9
    jmp m(idct_16x16_internal_10bpc).pass1_end2
.fast:
    ; Four reduced helper calls, each returning two registers (m6, m7).
    call .pass1_main_fast
    packssdw        m0, m6, m7
    call .pass1_main_fast
    packssdw        m1, m6, m7
    call .pass1_main_fast
    packssdw        m2, m6, m7
    call .pass1_main_fast
    packssdw        m3, m6, m7
    ; Word/dword interleaves followed by 128-bit lane shuffles rearrange
    ; m0-m3; m4-m7 are cleared before joining the common pass-1 tail.
    punpckhwd       m4, m0, m1
    punpcklwd       m0, m1
    punpckhwd       m1, m2, m3
    punpcklwd       m2, m3
    punpckldq       m3, m4, m1
    punpckhdq       m4, m1
    punpckhdq       m1, m0, m2
    punpckldq       m0, m2
    pxor            m7, m7
    vshufi32x4      m2, m0, m3, q3131
    vshufi32x4      m0, m3, q2020
    vshufi32x4      m3, m1, m4, q3131
    vshufi32x4      m1, m4, q2020
    REPX {mova x, m7}, m4, m5, m6
    jmp m(idct_16x16_internal_10bpc).pass1_end3
.pass2:
    movshdup        m14, [o(permC)]
    vpbroadcastd    m15, [o(pw_1697x16)]
    lea             r6, [strideq*3]
    vpbroadcastd    m11, [o(pw_2048)]
    pxor            m12, m12
    vpbroadcastd    m13, [pixel_10bpc_max]
    ; Register pairs are permuted into m8/m9 and processed by .pass2_main.
    ; The fourth pair falls through into .pass2_main, whose final jmp then
    ; leaves the function through .write_16x4 (presumably via its ret).
    vpermq          m8, m14, m0
    vpermq          m9, m14, m1
    call .pass2_main
    vpermq          m8, m14, m2
    vpermq          m9, m14, m3
    call .pass2_main
    vpermq          m8, m14, m4
    vpermq          m9, m14, m5
    call .pass2_main
    vpermq          m8, m14, m6
    vpermq          m9, m14, m7
.pass2_main:
    ; m8/m9 = 2*x + pmulhrsw(x, pw_1697x16), using saturating word adds.
    pmulhrsw        m0, m15, m8
    pmulhrsw        m1, m15, m9
    paddsw          m8, m8
    paddsw          m9, m9
    paddsw          m8, m0
    paddsw          m9, m1
    jmp m(idct_16x8_internal_10bpc).write_16x4
ALIGN function_align
.pass1_main:
    ; Load the 64-byte rows at r6+64*{0,1,8,9}, multiply by m10 (pd_5793),
    ; add m11 (pd_5120), shift right by 13, and advance r6 by two rows.
    pmulld          m6, m10, [r6+64*0]
    pmulld          m7, m10, [r6+64*1]
    pmulld          m8, m10, [r6+64*8]
    pmulld          m9, m10, [r6+64*9]
    add             r6, 64*2
    REPX {paddd x, m11}, m6, m7, m8, m9
    REPX {psrad x, 13 }, m6, m8, m7, m9
    ret
ALIGN function_align
.pass1_main_fast:
    ; Reduced variant: only the low 256 bits of rows 0/4 and 8/12 (relative
    ; to r6) are loaded and merged into m6/m7; r6 advances by one row.
    mova            ym6, [r6+64* 0]
    vinserti32x8    m6, [r6+64* 4], 1
    mova            ym7, [r6+64* 8]
    vinserti32x8    m7, [r6+64*12], 1
    add             r6, 64
    REPX {pmulld x, m10}, m6, m7
    REPX {paddd x, m11}, m6, m7
    REPX {psrad x, 13 }, m6, m7
    ret

; inv_txfm_add_dct_dct_8x32_10bpc(dst, stride, c, eob)
; eob == 0 takes the DC-only shortcut (.dconly). Otherwise eob selects how
; much of the coefficient buffer is processed: eob < 43 -> .fast,
; eob < 107 -> one .pass1_main call, eob >= 107 -> .full (two calls).
; Pass 1 uses the 10bpc idct kernels; the results are packed to 16 bit and
; pass 2 is delegated to the 8bpc kernels after r5 is set to o_base_8bpc.
cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    vpbroadcastd    m11, [o(pd_2)]
    mova            m20, [o(idct8x32p)]     ; byte-permutation table used with vpermb
    pxor            m21, m21                ; zero, used to clear consumed coefficients
    cmp             eobd, 43
    jl .fast
    call .pass1_main
    punpcklwd       m16, m0, m1
    punpcklwd       m17, m2, m3
    punpckhwd       m18, m0, m1
    punpckhwd       m19, m2, m3
    cmp             eobd, 107
    jge .full
    punpckldq       m0, m16, m17 ; 0 2
    punpckhdq       m1, m16, m17 ; 4 6
    punpckldq       m2, m18, m19 ; 8 10
    punpckhdq       m3, m18, m19 ; 12 14
    lea             r5, [o_base_8bpc]
    ; The upper 256 bits of m0-m3 are split off into ym14-ym17 for the
    ; 8bpc kernels called below.
    vextracti32x8   ym14, m0, 1
    vextracti32x8   ym15, m1, 1
    vextracti32x8   ym16, m2, 1
    vextracti32x8   ym17, m3, 1
    call m(idct_8x16_internal_8bpc).main_fast
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
; inv_txfm_add_dct_dct_8x32_10bpc, continued: the jump below ends the path
; that ran .pass1_main once and skips ahead to the shared .end code.
    jmp .end
.full:
    ; Second .pass1_main call on the coefficients 64 bytes further in. Its
    ; packed output is interleaved into m5-m8 and combined with m16-m19,
    ; which were set up before this point.
    add             cq, 64
    call .pass1_main
    punpcklwd       m5, m0, m1
    punpcklwd       m6, m2, m3
    punpckhwd       m7, m0, m1
    punpckhwd       m8, m2, m3
    punpckldq       m0, m16, m17 ; 0 2
    punpckhdq       m1, m16, m17 ; 4 6
    punpckldq       m2, m18, m19 ; 8 10
    punpckhdq       m3, m18, m19 ; 12 14
    punpckldq       m4, m5, m6 ; 16 18
    punpckhdq       m5, m6 ; 20 22
    punpckldq       m6, m7, m8 ; 24 26
    punpckhdq       m7, m8 ; 28 30
    lea             r5, [o_base_8bpc]   ; r5 = o_base_8bpc before calling 8bpc code
    ; Upper 256 bits of m0-m7 go to ym14-ym21.
    vextracti32x8   ym14, m0, 1
    vextracti32x8   ym15, m1, 1
    vextracti32x8   ym16, m2, 1
    vextracti32x8   ym17, m3, 1
    vextracti32x8   ym18, m4, 1
    vextracti32x8   ym19, m5, 1
    vextracti32x8   ym20, m6, 1
    vextracti32x8   ym21, m7, 1
    call m(idct_8x16_internal_8bpc).main
    ; Swap the 64-bit halves within each 128-bit lane of ym18-ym21.
    REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
    jmp .end
.fast:
    ; Reduced path: only the low 256 bits of the eight 128-byte-spaced rows
    ; are loaded; pairs are merged with vpermt2q using permB.
    movshdup        m8, [o(permB)]
    mova            ym1, [cq+128*1]
    mova            ym5, [cq+128*5]
    mova            ym7, [cq+128*3]
    mova            ym3, [cq+128*7]
    mova            ym0, [cq+128*0]
    mova            ym4, [cq+128*2]
    mova            ym2, [cq+128*4]
    mova            ym6, [cq+128*6]
    vpermt2q        m1, m8, m5 ; 1 5
    vpermt2q        m3, m8, m7 ; 7 3
    vpermt2q        m0, m8, m4 ; 0 2
    vpermt2q        m2, m8, m6 ; 4 6
    ; Clear the 256-bit halves that were just read (ym21 is zero).
    mova            [cq+128*0], ym21
    REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x8_internal_10bpc).main_end
    packssdw        m0, m2
    packssdw        m1, m3
    vpermb          m0, m20, m0
    vprold          m20, 16             ; rotate the permutation table for the second vpermb
    vpermb          m2, m20, m1
    punpckhdq       m1, m0, m2
    punpckldq       m0, m2
    lea             r5, [o_base_8bpc]
    vextracti32x8   ym14, m0, 1
    vextracti32x8   ym15, m1, 1
    call m(idct_8x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
.end:
    ; Common tail: finish the 8bpc second pass, then add the result to the
    ; destination. dstq walks downwards from the first row, r3 from
    ; dstq + 16*stride; pixels are clamped to [0, pixel_10bpc_max].
    ; m10 is the pmulhrsw scale factor; it is not set in this function, so it
    ; is presumably left in place by main_end -- confirm.
    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
    lea             r3, [strideq*2]
    vpbroadcastd    m12, [pixel_10bpc_max]
    lea             r6, [strideq*3]
    pxor            m11, m11
    lea             r3, [dstq+r3*8]     ; r3 = dstq + stride*16
    pmulhrsw        m0, m10
    pmulhrsw        m1, m10
    call .write_8x4x2
    pmulhrsw        m0, m10, m2
    pmulhrsw        m1, m10, m3
    call .write_8x4x2
    pmulhrsw        m0, m10, m4
    pmulhrsw        m1, m10, m5
    call .write_8x4x2
    pmulhrsw        m0, m10, m6
    pmulhrsw        m1, m10, m7
    ; The last pair falls through into .write_8x4x2, whose ret then returns
    ; from the function.
.write_8x4x2:
    ; Gather four 128-bit destination rows at dstq into the lanes of m8.
    mova            xm8, [dstq+strideq*0]
    vinserti32x4    ym8, [dstq+strideq*1], 1
    vinserti32x4    m8, [dstq+strideq*2], 2
; Remainder of inv_txfm_add_dct_dct_8x32_10bpc's .write_8x4x2 helper: m8
; collects four rows at dstq in lane order 0..3, m9 collects four rows at r3
; in reversed lane order (row 3 in lane 0). m0/m1 are added, the sums are
; clamped to [m11, m12] and stored back; both pointers advance by four rows.
    vinserti32x4    m8, [dstq+r6 ], 3
    mova            xm9, [r3 +r6 ]
    vinserti32x4    ym9, [r3 +strideq*2], 1
    vinserti32x4    m9, [r3 +strideq*1], 2
    vinserti32x4    m9, [r3 +strideq*0], 3
    paddw           m8, m0
    paddw           m9, m1
    pmaxsw          m8, m11
    pmaxsw          m9, m11
    pminsw          m8, m12
    pminsw          m9, m12
    mova            [dstq+strideq*0], xm8
    vextracti32x4   [dstq+strideq*1], ym8, 1
    vextracti32x4   [dstq+strideq*2], m8, 2
    vextracti32x4   [dstq+r6 ], m8, 3
    lea             dstq, [dstq+strideq*4]
    vextracti32x4   [r3 +strideq*0], m9, 3
    vextracti32x4   [r3 +strideq*1], m9, 2
    vextracti32x4   [r3 +strideq*2], ym9, 1
    mova            [r3 +r6 ], xm9
    lea             r3, [r3+strideq*4]
    ret
.dconly:
    ; DC-only shortcut, reached when eob == 0: the DC coefficient is scaled as
    ; (x*181 + 640) >> 10 and the rest is done by the 8x8 .dconly2 code.
    imul            r6d, [cq], 181
    mov             [cq], eobd          ; eobd is 0 on this path, so this clears the DC coefficient
    or              r3d, 32             ; presumably the row count for the shared loop -- confirm in .dconly2
    add             r6d, 640
    sar             r6d, 10
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
ALIGN function_align
.pass1_main:
    ; First-pass helper: load eight 64-byte rows spaced 128 bytes apart,
    ; clear them in memory (m21 is zero), run the 10bpc 8x16 idct, pack the
    ; results to 16 bit and byte-permute them with the table in m20.
    mova            m0, [cq+128*0]
    mova            m1, [cq+128*1]
    mova            m2, [cq+128*2]
    mova            m3, [cq+128*3]
    mova            m4, [cq+128*4]
    mova            m5, [cq+128*5]
    mova            m6, [cq+128*6]
    mova            m7, [cq+128*7]
    REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
    call m(idct_8x16_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_end2
    packssdw        m0, m4
    packssdw        m1, m5
    packssdw        m2, m6
    packssdw        m3, m7
    REPX {vpermb x, m20, x}, m0, m1, m2, m3
    ret

; inv_txfm_add_identity_identity_8x32_10bpc(dst, stride, c, eob)
; Each loop iteration packs eight coefficient rows to 16 bit, computes
; (x + pw_5) >> 3 with a saturating add, clears the coefficients, transposes
; with interleaves and adds the result to 16 destination rows, clamping to
; [0, pixel_10bpc_max]. r4/r5/r6 hold stride*3, stride*5 and stride*7.
; eob is biased by -107 up front so its sign bit can serve as the loop
; condition at the bottom of the loop.
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
    vpbroadcastd    m9, [pw_5]
    lea             r4, [strideq*3]
    pxor            m10, m10
    lea             r5, [strideq*5]
    vpbroadcastd    m11, [pixel_10bpc_max]
    sub             eobd, 107
    lea             r6, [strideq+r4*2]  ; stride*7
.loop:
    mova            m0, [cq+128*0]
    packssdw        m0, [cq+128*1]
    mova            m1, [cq+128*2]
    packssdw        m1, [cq+128*3]
    mova            m2, [cq+128*4]
    packssdw        m2, [cq+128*5]
    mova            m3, [cq+128*6]
    packssdw        m3, [cq+128*7]
    lea             r7, [dstq+strideq*8]    ; r7 = second group of eight destination rows
    REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
    REPX {paddsw x, m9}, m0, m1, m2, m3
    REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
    REPX {psraw x, 3 }, m0, m1, m2, m3
    add             cq, 64
    ; Destination loads are interleaved with the transpose shuffles below.
    mova            xm4, [dstq+strideq*0]
    mova            xm5, [dstq+strideq*1]
    mova            xm6, [dstq+strideq*2]
    mova            xm7, [dstq+r4 *1]
    punpckhwd       m8, m0, m1
    vinserti32x4    ym4, [dstq+strideq*4], 1
    punpcklwd       m0, m1
    vinserti32x4    ym5, [dstq+r5 *1], 1
    punpckhwd       m1, m2, m3
    vinserti32x4    ym6, [dstq+r4 *2], 1
; Second half of the .loop body of inv_txfm_add_identity_identity_8x32_10bpc:
; the transpose is finished while the remaining destination rows are
; gathered into m4-m7, then the sums are clamped to [m10, m11] and scattered
; back to the same sixteen rows.
    punpcklwd       m2, m3
    vinserti32x4    ym7, [dstq+r6 *1], 1
    punpckhwd       m3, m0, m8
    vinserti32x4    m4, [r7 +strideq*0], 2
    punpcklwd       m0, m8
    vinserti32x4    m5, [r7 +strideq*1], 2
    punpckhwd       m8, m2, m1
    vinserti32x4    m6, [r7 +strideq*2], 2
    punpcklwd       m2, m1
    vinserti32x4    m7, [r7 +r4 *1], 2
    punpckhqdq      m1, m0, m2
    vinserti32x4    m4, [r7 +strideq*4], 3
    punpcklqdq      m0, m2
    vinserti32x4    m5, [r7 +r5 *1], 3
    punpcklqdq      m2, m3, m8
    vinserti32x4    m6, [r7 +r4 *2], 3
    punpckhqdq      m3, m8
    vinserti32x4    m7, [r7 +r6 *1], 3
    paddw           m0, m4
    paddw           m1, m5
    paddw           m2, m6
    paddw           m3, m7
    REPX {pmaxsw x, m10}, m0, m1, m2, m3
    REPX {pminsw x, m11}, m0, m1, m2, m3
    mova            [dstq+strideq*0], xm0
    mova            [dstq+strideq*1], xm1
    mova            [dstq+strideq*2], xm2
    mova            [dstq+r4 *1], xm3
    vextracti32x4   [dstq+strideq*4], ym0, 1
    vextracti32x4   [dstq+r5 *1], ym1, 1
    vextracti32x4   [dstq+r4 *2], ym2, 1
    vextracti32x4   [dstq+r6 *1], ym3, 1
    lea             dstq, [r7+strideq*8]    ; advance dstq by sixteen rows
    vextracti32x4   [r7 +strideq*0], m0, 2
    vextracti32x4   [r7 +strideq*1], m1, 2
    vextracti32x4   [r7 +strideq*2], m2, 2
    vextracti32x4   [r7 +r4 *1], m3, 2
    vextracti32x4   [r7 +strideq*4], m0, 3
    vextracti32x4   [r7 +r5 *1], m1, 3
    vextracti32x4   [r7 +r4 *2], m2, 3
    vextracti32x4   [r7 +r6 *1], m3, 3
    ; Loop control: eobd was biased by -107 before the loop. Adding
    ; 0x80000000 carries exactly when bit 31 is already set, so a negative
    ; value (eob < 107) exits after one iteration, while a non-negative value
    ; gets bit 31 set here and exits after the second iteration.
    add             eobd, 0x80000000
    jnc .loop
    RET

; inv_txfm_add_dct_dct_32x8_10bpc(dst, stride, c, eob)
; eob == 0 takes the DC-only shortcut (.dconly). Otherwise the coefficient
; rows are loaded in stages and merged with vpermi2q/vpermt2q using permB
; (m11) and permB >> 32 (m10); eob < 43 selects .fast and eob >= 107 selects
; .full. The trailing numbers on the loads/permutes are the original
; annotations for which inputs each register holds.
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    mova            m11, [o(permB)]
    mova            m0, [cq+64* 0] ; 0 1
    mova            m4, [cq+64* 1] ; 2 3
    mova            m1, [cq+64* 2] ; 4 5
    mova            m8, [cq+64* 3] ; 6 7
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
    psrlq           m10, m11, 32
%if WIN64
    ; WIN64 only: xmm6/xmm7 are stashed in the first 32 bytes of the
    ; coefficient buffer (that row has already been loaded into m0) and are
    ; reloaded at .end.
    movaps          [cq+16*0], xmm6
    movaps          [cq+16*1], xmm7
%endif
    mova            m16, m11
    vpermi2q        m16, m0, m1 ; 1 5
    mova            m17, m11
    vpermi2q        m17, m8, m4 ; 7 3
    cmp             eobd, 43
    jl .fast
    mova            m18, [cq+64* 4] ; 8 9
    mova            m20, [cq+64* 5] ; 10 11
    mova            m6, [cq+64* 6] ; 12 13
    mova            m7, [cq+64* 7] ; 14 15
    vpermt2q        m0, m10, m18 ; 0 8
    vpermt2q        m18, m11, m6 ; 9 13
    mova            m19, m11
    vpermi2q        m19, m7, m20 ; 15 11
    cmp             eobd, 107
    jge .full
; inv_txfm_add_dct_dct_32x8_10bpc, continued. Path taken when neither the
; .fast nor the .full branch was selected: only coefficient rows 0-7 were
; loaded, and the *_fast kernel variants are used.
; On every path r6d records how much of the coefficient buffer was read so
; that .zero_loop can clear it afterwards (64*1 here, 64*3 in .full, 0 in
; .fast).
    vpermt2q        m1, m10, m6 ; 4 12
    vpermt2q        m4, m10, m8 ; 2 6
    vpermt2q        m7, m10, m20 ; 14 10
    mov             r6d, 64*1
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    call .main_fast
    call m(idct_16x16_internal_10bpc).main_end
    jmp .end
.full:
    ; All sixteen coefficient rows are in use: load rows 8-15 and finish the
    ; vpermt2q/vpermi2q merging.
    mova            m2, [cq+64* 8] ; 16 17
    mova            m5, [cq+64* 9] ; 18 19
    mova            m9, [cq+64*10] ; 20 21
    mova            m21, [cq+64*11] ; 22 23
    vpermt2q        m1, m10, m9 ; 4 20
    vpermt2q        m7, m10, m21 ; 14 22
    vpermt2q        m21, m11, m5 ; 23 19
    vpermt2q        m5, m10, m20 ; 18 10
    mova            m20, m11
    vpermi2q        m20, m2, m9 ; 17 21
    mova            m22, [cq+64*12] ; 24 25
    mova            m9, [cq+64*13] ; 26 27
    mova            m3, [cq+64*14] ; 28 29
    mova            m23, [cq+64*15] ; 30 31
    vpermt2q        m2, m10, m22 ; 16 24
    vpermt2q        m22, m11, m3 ; 25 29
    vpermt2q        m3, m10, m6 ; 28 12
    vpermt2q        m4, m10, m9 ; 2 26
    mova            m6, m10
    vpermi2q        m6, m23, m8 ; 30 6
    vpermt2q        m23, m11, m9 ; 31 27
    mov             r6d, 64*3
    call m(idct_8x8_internal_10bpc).main
    call m(idct_16x8_internal_10bpc).main
    call .main
    call m(idct_16x16_internal_10bpc).main_end
    jmp .end
.fast:
    ; Only coefficient rows 0-3 are used.
    vpermq          m0, m10, m0 ; 0 0
    vpermq          m1, m10, m1 ; 4 4
    vpermt2q        m4, m10, m8 ; 2 6
    xor             r6d, r6d
    call .main_fast2
    call m(idct_16x16_internal_10bpc).main_end
.end:
%if WIN64
    ; Reload xmm6/xmm7 from the coefficient buffer, where they were stashed
    ; at function entry.
    movaps          xmm6, [cq+16*0]
    movaps          xmm7, [cq+16*1]
%endif
    vzeroupper
    call .transpose_8x32
    pxor            m14, m14
.zero_loop:
    ; Clears four 64-byte rows per iteration at cq + r6*4, stepping r6 down
    ; by 64 until it goes negative: 4, 8 or 16 rows for r6d = 0, 64 or 64*3.
    mova            [cq+r6*4+64*3], m14
    mova            [cq+r6*4+64*2], m14
    mova            [cq+r6*4+64*1], m14
    mova            [cq+r6*4+64*0], m14
    sub             r6d, 64
    jge .zero_loop
    lea             r5, [o_base_8bpc]   ; r5 = o_base_8bpc before calling 8bpc code
    punpckhqdq      m1, m0, m2
    punpcklqdq      m0, m2
    punpcklqdq      m2, m3, m4
    punpckhqdq      m3, m4
    punpcklqdq      m4, m5, m7
    punpckhqdq      m5, m7
    punpckhqdq      m7, m6, m8
    punpcklqdq      m6, m8
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    pxor            m12, m12
.write_32x8_start:
    ; Output stage, also entered from other functions in this file:
    ; m11 = pw_2048 scale for pmulhrsw, m13 = pixel maximum, r3 = stride*3.
    ; The lower clamp m12 is not set here; callers zero it beforehand.
    vpbroadcastd    m11, [pw_2048]
    vpbroadcastd    m13, [pixel_10bpc_max]
    lea             r3, [strideq*3]
.write_32x8:
    pmulhrsw        m0, m11
    pmulhrsw        m1, m11
    pmulhrsw        m2, m11
    pmulhrsw        m3, m11
    call .write_32x4
    pmulhrsw        m0, m11, m4
    pmulhrsw        m1, m11, m5
    pmulhrsw        m2, m11, m6
    pmulhrsw        m3, m11, m7
    ; Falls through: the ret of .write_32x4 ends .write_32x8 as well.
.write_32x4:
    ; Add m0-m3 to four destination rows, clamp to [m12, m13], store, and
    ; advance dstq by four rows.
    paddw           m0, [dstq+strideq*0]
    paddw           m1, [dstq+strideq*1]
    paddw           m2, [dstq+strideq*2]
    paddw           m3, [dstq+r3 ]
    REPX {pmaxsw x, m12}, m0, m1, m2, m3
    REPX {pminsw x, m13}, m0, m1, m2, m3
    mova            [dstq+strideq*0], m0
    mova            [dstq+strideq*1], m1
    mova            [dstq+strideq*2], m2
    mova            [dstq+r3 ], m3
    lea             dstq, [dstq+strideq*4]
    ret
.dconly:
    ; DC-only shortcut, reached when eob == 0: (x*181 + 640) >> 10, then the
    ; shared 32x16 .dconly2 code does the remaining scaling and the store loop.
    imul            r6d, [cq], 181
    mov             [cq], eobd          ; eobd is 0 on this path, so this clears the DC coefficient
    or              r3d, 8              ; row count; the 32x16 .dconly_loop counts r3d down by 2
    add             r6d, 640
    sar             r6d, 10
    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
ALIGN function_align
; Reduced kernel variant; the input layout is stated on the label line. No
; caller is visible in this chunk. It computes the dct16 outputs and the first
; odd-half stages inline, then joins the common code at .main4.
.main_fast3: ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3
    vbroadcasti32x4 m5, [o(pd_401_4076)]
    pmulld          m3, m0, m12
    pmulld          m4, m5
    REPX {paddd x, m13}, m3, m4
    REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a
    ; t8a t15a -> t8/9 t14/15
    vbroadcasti32x4 m5, [o(pd_3784_m3784)]
    pshufd          m7, m4, q1032
    pmulld          m6, m4, [o(pd_1567)]{bcstd}
    pmulld          m5, m7
    paddd           m6, m13
    paddd           m5, m6
    psrad           m5, 12 ; m5=t9a t14a
    ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4]
    shufps          m6, m4, m5, q1032 ; t12 t13
    shufps          m8, m4, m5, q3210 ; t11a t10
    pmulld          m9, m6, m12
    pmulld          m7, m8, m12
    paddd           m9, m13
    paddd           m5, m9, m7 ; t12 t13a
    psubd           m4, m9, m7 ; t11 t10a
    REPX {psrad x, 12 }, m5, m4
    psubd           m7, m3, m6 ; dct16 out15 out14
    paddd           m0, m3, m6 ; dct16 out0 out1
    psubd           m6, m3, m5 ; dct16 out12 out13
    paddd           m1, m3, m5 ; dct16 out3 out2
    psubd           m5, m3, m4 ; dct16 out11 out10
    paddd           m2, m3, m4 ; dct16 out4 out5
    psubd           m4, m3, m8 ; dct16 out8 out9
    paddd           m3, m8 ; dct16 out7 out6
    ; Clamp the dct16 outputs to [m14, m15].
    REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    ; idct32_bottomhalf
    vbroadcasti32x4 m18, [o(pd_201_m601)]
    vbroadcasti32x4 m19, [o(pd_4091_4052)]
    pmulld          m17, m16, m19
    pmulld          m16, m18
    REPX {paddd x, m13}, m17, m16
    REPX {psrad x, 12 }, m17, m16
    ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2]
    vbroadcasti32x4 m10, [o(pd_799_m2276)]
    vbroadcasti32x4 m11, [o(pd_4017_3406)]
    pmulld          m18, m17, m10
    pmulld          m19, m17, m11
    pmulld          m8, m16, m11
    pmulld          m9, m16, m10
    REPX {paddd x, m13}, m18, m19
    psubd           m18, m8
    paddd           m19, m9
    REPX {psrad x, 12 }, m18, m19
    ; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a
    ; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26
    punpckhqdq      m23, m17, m19 ; t24a t25 [or t27a t26]
    punpcklqdq      m20, m16, m18 ; t16a t17 [or t19a t18]
    punpckhqdq      m22, m16, m18 ; t23a t22 [or t20a t21]
    punpcklqdq      m16, m17, m19 ; t28a t29 [or t31a t30]
    ; Duplicate the four registers into the partner registers that .main4
    ; reads.
    mova            m21, m23
    mova            m18, m20
    mova            m17, m22
    mova            m19, m16
    jmp .main4
; Reduced kernel used by the .fast path: the even part goes through the
; .main3 entry points of the 8x8 and 16x8 idct kernels, the odd part is a set
; of plain multiplies, and the common code is joined at .main3.
.main_fast2: ; bottom three-quarters are zero
    vbroadcasti32x4 m8, [o(pd_799_4017)]
    pmulld          m8, m1 ; t4 t7
    vpmulld         m0, [o(pd_2896)] {1to16} ; t0 t1
    REPX {paddd x, m13}, m8, m0
    REPX {psrad x, 12 }, m8, m0
    pmulld          m3, m8, m12
    mova            m2, m0 ; t3 t2
    call m(idct_8x8_internal_10bpc).main3
    vbroadcasti32x4 m6, [o(pd_4076_3920)]
    vbroadcasti32x4 m3, [o(pd_401_m1189)]
    pmulld          m6, m4 ; t15 t12
    pmulld          m4, m3 ; t9 t10
    REPX {paddd x, m13}, m6, m4
    REPX {psrad x, 12 }, m6, m4
    mova            m5, m6 ; t14 t13
    mova            m9, m4 ; t8 t11
    call m(idct_16x8_internal_10bpc).main3
    vbroadcasti32x4 m23, [o(pd_4091_3973)]
    vbroadcasti32x4 m7, [o(pd_201_995)]
    vbroadcasti32x4 m22, [o(pd_1380_601)]
    vbroadcasti32x4 m9, [o(pd_3857_4052)]
    pmulld          m23, m16 ; t16 t20
    pmulld          m16, m7 ; t31 t27
    pmulld          m22, m17 ; -t19 -t23
    pmulld          m17, m9 ; t28 t24
    REPX {paddd x, m13}, m23, m16, m17
    psubd           m22, m13, m22       ; negate m22 while adding the rounding constant
    REPX {psrad x, 12 }, m23, m16, m22, m17
    mova            m20, m23 ; t30 t26
    mova            m9, m16 ; t17 t21
    mova            m19, m22 ; t18 t22
    mova            m18, m17 ; t29 t25
    jmp .main3
; Reduced kernel used when only coefficient rows 0-7 are loaded: each
; two-input rotation of .main collapses to two plain multiplies; the common
; code is joined at .main2.
.main_fast: ; bottom half is zero
    vbroadcasti32x4 m23, [o(pd_4091_3973)]
    vbroadcasti32x4 m7, [o(pd_201_995)]
    vbroadcasti32x4 m20, [o(pd_2751_2106)]
    vbroadcasti32x4 m9, [o(pd_3035_3513)]
    vbroadcasti32x4 m21, [o(pd_3703_3290)]
    vbroadcasti32x4 m10, [o(pd_1751_2440)]
    vbroadcasti32x4 m22, [o(pd_1380_601)]
    vbroadcasti32x4 m11, [o(pd_3857_4052)]
    ; NOTE(review): some trailing t-labels below appear to be attached to the
    ; partner register of each pair. For example m23 is multiplied by
    ; pd_4091_3973 but labelled t16a/t20a, while .main2 consumes m23 as the
    ; t31/t27 input (and likewise m21/m18). Confirm before relying on them.
    pmulld          m23, m16 ; t16a t20a
    pmulld          m16, m7 ; t31a t27a
    pmulld          m20, m19 ; -t17a -t21a
    pmulld          m19, m9 ; t30a t26a
    pmulld          m21, m18 ; t18a t22a
    pmulld          m18, m10 ; t29a t25a
    pmulld          m22, m17 ; -t19a -t23a
    pmulld          m17, m11 ; t28a t24a
    ; Negate m20/m22 while adding the rounding constant; .main2 adds the
    ; rounding constant to the other six registers.
    psubd           m20, m13, m20
    psubd           m22, m13, m22
    jmp .main2
; Full kernel entry; its instructions follow immediately after this label.
.main:
; Body of inv_txfm_add_dct_dct_32x8_10bpc's .main kernel (the label itself
; directly precedes this point). Four ITX_MULSUB_2D rotations are applied to
; the register pairs (16,23), (20,19), (18,21), (22,17) without rounding
; (rounding argument "_"); the rounding constant m13 is added afterwards,
; here for m20/m22 and in .main2 for the rest.
    ITX_MULSUB_2D   16, 23, 7, 9, 10, _, 201_995, 4091_3973
    ITX_MULSUB_2D   20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
    ITX_MULSUB_2D   18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
    ITX_MULSUB_2D   22, 17, 7, 9, 10, _, 3857_4052, 1380_601
    paddd           m20, m13
    paddd           m22, m13
.main2:
    ; Entry point shared with .main_fast: finish rounding (>> 12), then do
    ; the first add/sub butterfly stage with clamping to [m14, m15].
    REPX {paddd x, m13}, m16, m23, m19
    REPX {psrad x, 12 }, m16, m20, m23, m19
    psubd           m9, m16, m20 ; t17 t21
    paddd           m16, m20 ; t16 t20
    psubd           m20, m23, m19 ; t30 t26
    paddd           m23, m19 ; t31 t27
    REPX {pmaxsd x, m14}, m9, m16, m20, m23
    REPX {paddd x, m13}, m21, m18, m17
    REPX {psrad x, 12 }, m18, m22, m21, m17
    psubd           m19, m22, m18 ; t18 t22
    paddd           m22, m18 ; t19 t23
    psubd           m18, m17, m21 ; t29 t25
    paddd           m17, m21 ; t28 t24
    REPX {pmaxsd x, m14}, m19, m22, m18, m17
    REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
.main3:
    ; Entry point shared with .main_fast2. The final dct16 add/sub stage
    ; (m0-m8) is interleaved with the next odd-half rotations and qword
    ; regrouping.
    vbroadcasti32x4 m11, [o(pd_4017_2276)]
    vbroadcasti32x4 m10, [o(pd_799_3406)]
    psubd           m7, m0, m6 ; dct16 out15 out14
    paddd           m0, m6 ; dct16 out0 out1
    psubd           m6, m1, m5 ; dct16 out12 out13
    paddd           m1, m5 ; dct16 out3 out2
    psubd           m5, m2, m4 ; dct16 out11 out10
    paddd           m2, m4 ; dct16 out4 out5
    psubd           m4, m3, m8 ; dct16 out8 out9
    paddd           m3, m8 ; dct16 out7 out6
    ITX_MULSUB_2D   20, 9, 8, 21, _, 13, 10, 11
    ITX_MULSUB_2D   18, 19, 8, 21, _, 13, 10, 11, 2
    REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
    punpckhqdq      m21, m16, m20 ; t20 t21a
    punpcklqdq      m16, m20 ; t16 t17a
    punpcklqdq      m20, m22, m19 ; t19 t18a
    punpckhqdq      m22, m19 ; t23 t22a
    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    punpcklqdq      m19, m23, m9 ; t31 t30a
    punpckhqdq      m23, m9 ; t27 t26a
    punpckhqdq      m9, m17, m18 ; t24 t25a
    punpcklqdq      m17, m18 ; t28 t29a
    psubd           m18, m16, m20 ; t19a t18
    paddd           m20, m16 ; t16a t17
    psubd           m16, m19, m17 ; t28a t29
    paddd           m19, m17 ; t31a t30
    psubd           m17, m22, m21 ; t20a t21
    paddd           m22, m21 ; t23a t22
    psubd           m21, m9, m23 ; t27a t26
    paddd           m23, m9 ; t24a t25
    REPX {pmaxsd x, m14}, m18, m16, m17, m21
    REPX {pminsd x, m15}, m16, m18, m21, m17
    REPX {pmaxsd x, m14}, m20, m22, m19, m23
    REPX {pminsd x, m15}, m20, m22, m19, m23
.main4:
    ; Entry point shared with .main_fast3: rotations by the pd_1567/pd_3784
    ; pair, another butterfly stage, then the multiply by m12 (pd_2896) with
    ; rounding for the four middle pairs.
    vpbroadcastd    m11, [o(pd_3784)]
    vpbroadcastd    m10, [o(pd_1567)]
    ITX_MULSUB_2D   16, 18, 8, 9, _, 13, 10, 11
    ITX_MULSUB_2D   21, 17, 8, 9, _, 13, 10, 11, 2
    paddd           m9, m20, m22 ; t16 t17a
    psubd           m20, m22 ; t23 t22a
    paddd           m22, m19, m23 ; t31 t30a
    psubd           m19, m23 ; t24 t25a
    psubd           m23, m16, m17 ; t20a t21
    paddd           m16, m17 ; t19a t18
    psubd           m17, m18, m21 ; t27a t26
    paddd           m21, m18 ; t28a t29
    REPX {pmaxsd x, m14}, m20, m19, m23, m17
    REPX {pminsd x, m15}, m19, m20, m17, m23
    REPX {pmulld x, m12}, m19, m20, m17, m23
    REPX {pmaxsd x, m14}, m22, m21, m16, m9
    ; The rounding constant is added to one operand of each pair only, so
    ; both the sum and the difference below include it exactly once.
    paddd           m19, m13
    paddd           m17, m13
    REPX {pminsd x, m15}, m22, m21, m16, m9
    psubd           m18, m19, m20 ; t23a t22
    paddd           m19, m20 ; t24a t25
    paddd           m20, m17, m23 ; t27 t26a
    psubd           m17, m23 ; t20 t21a
    REPX {psrad x, 12 }, m20, m19, m18, m17
    ret
.transpose_8x32:
    ; Word-level rearrangement of m0-m7 into m0-m8 using the idct32x8p index
    ; table: the table, the table shifted right by 8 bits per word, and
    ; 16-bit rotations of both serve as the four vpermi2w/vpermt2w index
    ; vectors, followed by dword interleaves.
    mova            m10, [o(idct32x8p)]
    psrlw           m8, m10, 8
    mova            m9, m8
    vpermi2w        m8, m1, m5
    vpermt2w        m1, m10, m5
    vprold          m5, m9, 16
    vpermi2w        m9, m3, m7
    vpermt2w        m3, m10, m7
    vprold          m10, 16
    mova            m7, m5
    vpermi2w        m5, m0, m4
    vpermt2w        m0, m10, m4
    vpermi2w        m7, m2, m6
    vpermt2w        m2, m10, m6
    punpckhdq       m6, m5, m8
    punpckldq       m5, m8
    punpckhdq       m8, m7, m9
    punpckldq       m7, m9
    punpckhdq       m4, m2, m3
    punpckldq       m2, m3
    punpckhdq       m3, m0, m1
    punpckldq       m0, m1
    ret

; inv_txfm_add_identity_identity_32x8_10bpc(dst, stride, c, eob)
; Each loop iteration packs eight 64-byte coefficient rows to 16 bit, scales
; them with pmulhrsw by pw_4096, clears the coefficients, permutes the words
; with the idtx32x8p table (m6, and m6 >> 8 in m7) and adds the result to
; eight destination rows, 32 bytes wide. r4/r5/r6 hold stride*3, stride*5 and
; stride*7. eob is biased by -107 up front so its sign bit can serve as the
; loop condition at the bottom of the loop.
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
    vpbroadcastd    m5, [pw_4096]
    lea             r4, [strideq*3]
    mova            m6, [idtx32x8p]
    lea             r5, [strideq*5]
    vpbroadcastd    m9, [pixel_10bpc_max]
    lea             r6, [strideq+r4*2]  ; stride*7
    pxor            m8, m8
    sub             eobd, 107
    psrlw           m7, m6, 8
.loop:
    mova            m0, [cq+64*0]
    packssdw        m0, [cq+64*1] ; 02 13
    mova            m1, [cq+64*2]
    packssdw        m1, [cq+64*3] ; 46 57
    mova            m2, [cq+64*4]
    packssdw        m2, [cq+64*5] ; 8a 9b
    mova            m3, [cq+64*6]
    packssdw        m3, [cq+64*7] ; ce df
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
    mova            m4, m6
    vpermi2w        m4, m1, m3
    vpermt2w        m1, m7, m3
    REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
    mova            m3, m7
    vpermi2w        m3, m0, m2
    vpermt2w        m0, m6, m2
    add             cq, 64*8
    punpcklqdq      m2, m3, m1 ; 4 5
    punpckhqdq      m3, m1 ; 6 7
    punpckhqdq      m1, m0, m4 ; 2 3
    punpcklqdq      m0, m4 ; 0 1
; Second half of the .loop body of inv_txfm_add_identity_identity_32x8_10bpc:
; two 256-bit destination rows are gathered per register, added to m0-m3,
; clamped to [m8, m9] and stored back to the same eight rows.
    mova            ym4, [dstq+strideq*0]
    vinserti32x8    m4, [dstq+strideq*1], 1
    paddw           m0, m4
    mova            ym4, [dstq+strideq*2]
    vinserti32x8    m4, [dstq+r4 *1], 1
    paddw           m1, m4
    mova            ym4, [dstq+strideq*4]
    vinserti32x8    m4, [dstq+r5 *1], 1
    paddw           m2, m4
    mova            ym4, [dstq+r4 *2]
    vinserti32x8    m4, [dstq+r6 *1], 1
    paddw           m3, m4
    REPX {pmaxsw x, m8}, m0, m1, m2, m3
    REPX {pminsw x, m9}, m0, m1, m2, m3
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*1], m0, 1
    mova            [dstq+strideq*2], ym1
    vextracti32x8   [dstq+r4 *1], m1, 1
    mova            [dstq+strideq*4], ym2
    vextracti32x8   [dstq+r5 *1], m2, 1
    mova            [dstq+r4 *2], ym3
    vextracti32x8   [dstq+r6 *1], m3, 1
    add             dstq, 32            ; next 32-byte column group of the same rows
    ; Loop control: eobd was biased by -107 before the loop. Adding
    ; 0x80000000 carries exactly when bit 31 is already set, so a negative
    ; value (eob < 107) exits after one iteration, while a non-negative value
    ; gets bit 31 set here and exits after the second iteration.
    add             eobd, 0x80000000
    jnc .loop
    RET

; inv_txfm_add_dct_dct_16x32_10bpc(dst, stride, c, eob)
; eob == 0 takes the DC-only shortcut (.dconly). Otherwise: eob < 36 -> .fast,
; eob < 151 -> one .pass1 call, eob >= 151 -> .full (two .pass1 calls).
; Pass 1 uses the 10bpc kernels; pass 2 is delegated to the 8bpc kernels after
; r5 is set to o_base_8bpc, and all paths meet at .pass2_end for the output.
cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    vpbroadcastd    m12, [o(pd_2896)]
    vpbroadcastd    m13, [o(pd_2048)]
    vpbroadcastd    m14, [o(clip_18b_min)]
    vpbroadcastd    m15, [o(clip_18b_max)]
%if WIN64
    ; WIN64 only: xmm6/xmm7 are saved on the stack and reloaded in .pass2_end.
    movaps          [rsp+ 8], xmm6
    movaps          [rsp+24], xmm7
%endif
    cmp             eobd, 36
    jl .fast
    call .pass1
    cmp             eobd, 151
    jge .full
    lea             r5, [o_base_8bpc]
    pxor            m9, m9
    ; Single-.pass1 path: most rows are duplicated into both words of each
    ; dword (punpck x, x); the rows labelled "__ n" are interleaved with the
    ; zero register m9 instead.
    punpcklwd       m8, m1, m1 ; 2
    punpckhwd       m14, m1, m1 ; 3
    punpcklwd       m1, m3, m3 ; 6
    punpckhwd       m15, m3, m3 ; 7
    punpcklwd       m3, m6, m6 ; 12
    punpckhwd       m19, m6, m6 ; 13
    punpcklwd       m6, m9, m4 ; __ 8
    punpckhwd       m20, m4, m4 ; 9
    punpckhwd       m16, m5, m5 ; 11
    punpcklwd       m5, m5 ; 10
    punpcklwd       m9, m0 ; __ 0
    punpckhwd       m21, m0, m0 ; 1
    punpcklwd       m0, m7, m7 ; 14
    punpckhwd       m17, m7, m7 ; 15
    punpcklwd       m7, m2, m2 ; 4
    punpckhwd       m18, m2, m2 ; 5
    call m(idct_16x16_internal_8bpc).main_fast
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mov             r6d, 64*3
    pxor            m8, m8
.zero_loop:
    ; Clears 64 bytes at cq + r6*8 + 128*{0..3} for r6 = 192, 128, 64, 0.
    REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0
    sub             r6d, 64
    jge .zero_loop
    jmp .pass2_end
.full:
    ; Spill the first .pass1 result into the coefficient buffer, run .pass1
    ; again on the coefficients 64 bytes further in, then reload the spilled
    ; rows (now at odd 64-byte offsets relative to the advanced cq).
    mova            [cq+128*0], m0
    mova            [cq+128*1], m1
    mova            [cq+128*2], m2
    mova            [cq+128*3], m3
    mova            [cq+128*4], m4
    mova            [cq+128*5], m5
    mova            [cq+128*6], m6
    mova            [cq+128*7], m7
    add             cq, 64
    call .pass1
    mova            m9, [cq-64* 1] ; 0 1
    mova            m14, [cq+64* 1] ; 2 3
    mova            m18, [cq+64* 3] ; 4 5
    mova            m15, [cq+64* 5] ; 6 7
    mova            m20, [cq+64* 7] ; 8 9
    mova            m16, [cq+64* 9] ; 10 11
    mova            m22, [cq+64*11] ; 12 13
    mova            m19, [cq+64*13] ; 14 15
    lea             r5, [o_base_8bpc]
    punpcklwd       m8, m7, m14 ; 30 2
    punpckhwd       m21, m7, m9 ; 31 1
    punpcklwd       m7, m6, m18 ; 28 4
    punpckhwd       m14, m6 ; 3 29
    punpcklwd       m9, m0, m9 ; 16 0
    punpckhwd       m17, m19, m0 ; 15 17
    punpcklwd       m0, m19, m1 ; 14 18
    punpckhwd       m19, m1, m22 ; 19 13
    punpcklwd       m1, m15, m5 ; 6 26
    punpckhwd       m18, m5, m18 ; 27 5
    punpcklwd       m6, m4, m20 ; 24 8
    punpckhwd       m15, m4 ; 7 25
    punpcklwd       m5, m3, m16 ; 22 10
    punpckhwd       m20, m3, m20 ; 23 9
    punpcklwd       m3, m22, m2 ; 12 20
    punpckhwd       m16, m2 ; 11 21
    call m(idct_16x16_internal_8bpc).main2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    mov             r6d, 32*7
    pxor            m8, m8
.full_zero_loop:
    ; cq was advanced by 64 above, hence the -1..2 row offsets: each
    ; iteration clears the 256 bytes starting at cq + r6*8 - 64.
    REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1
    sub             r6d, 32
    jge .full_zero_loop
    jmp .pass2_end
.fast:
    ; Reduced path: only the low 256 bits of eight 128-byte-spaced rows are
    ; loaded, merged pairwise with permB, scaled by pd_2896 with rounding
    ; (>> 12), and the loaded halves are cleared.
    mova            ym0, [cq+128*0]
    mova            ym2, [cq+128*4]
    movshdup        m8, [o(permB)]
    mova            ym1, [cq+128*2]
    mova            ym3, [cq+128*6]
    mova            ym4, [cq+128*1]
    mova            ym5, [cq+128*3]
    mova            ym6, [cq+128*5]
    mova            ym7, [cq+128*7]
    vpermt2q        m0, m8, m2 ; 0 4
    vpermt2q        m1, m8, m3 ; 2 6
    vpermt2q        m4, m8, m5 ; 1 3
    vpermt2q        m7, m8, m6 ; 7 5
    REPX {pmulld x, m12}, m0, m1, m4, m7
    pxor            ym16, ym16
    mova            [cq+128*0], ym16
    REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7
    REPX {paddd x, m13}, m0, m1, m4, m7
    REPX {psrad x, 12 }, m0, m1, m4, m7
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd    m11, [o(pd_1)]
    call m(idct_8x16_internal_10bpc).main_end2
    mova            m8, [o(idct8x32p)]
    packssdw        m0, m4
    packssdw        m1, m5
    packssdw        m2, m6
    packssdw        m3, m7
    mova            m6, [dup16_perm]
    vpermb          m0, m8, m0
    vpermb          m2, m8, m2
    vprold          m8, 16              ; rotate the permutation table for the other two registers
    vpermb          m1, m8, m1
    vpermb          m3, m8, m3
    punpckldq       m4, m0, m2
    punpckhdq       m0, m2
    punpckldq       m2, m1, m3
    punpckhdq       m1, m3
    punpckldq       m21, m4, m2
    punpckhdq       m14, m4, m2
    punpckldq       m18, m0, m1
    punpckhdq       m15, m0, m1
    vpermb          m8, m6, m14 ; 2
    vpermb          m1, m6, m15 ; 6
    vpermb          m7, m6, m18 ; 4
    pmovzxwd        m9, ym21 ; 0
    ; OR-ing pb_32 into every index byte makes the same table select the
    ; bytes 32 positions further into the source register.
    vpord           m6, [o(pb_32)] {1to16}
    lea             r5, [o_base_8bpc]
    vpermb          m21, m6, m21 ; 1
    vpermb          m15, m6, m15 ; 7
    vpermb          m18, m6, m18 ; 5
    vpermb          m14, m6, m14 ; 3
    pslld           m9, 16
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
.pass2_end:
    ; Output stage: register pairs (m0,m1) ... (m6,m7), then (m14,m15) ...
    ; (m20,m21), are permuted with permC (m22) and permC >> 8 (m23) into
    ; m8/m9 and handed to idct_16x8's .write_16x4. m11 = pw_2048,
    ; m12 = 0 and m13 = pixel maximum are set up for that helper.
    movshdup        m22, [permC]
    vpbroadcastd    m11, [pw_2048]
    vpbroadcastd    m13, [pixel_10bpc_max]
    lea             r6, [strideq*3]
    pxor            m12, m12
    psrlq           m23, m22, 8
    vpermq          m8, m22, m0
    vpermq          m9, m23, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m2
    vpermq          m9, m23, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m4
    vpermq          m9, m23, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m6
    vpermq          m9, m23, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m14
    vpermq          m9, m23, m15
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m16
    vpermq          m9, m23, m17
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m18
    vpermq          m9, m23, m19
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq          m8, m22, m20
    vpermq          m9, m23, m21
%if WIN64
    movaps          xmm6, [rsp+ 8]
    movaps          xmm7, [rsp+24]
%endif
    vzeroupper
    ; Tail call: the helper's return ends the function.
    jmp m(idct_16x8_internal_10bpc).write_16x4
.pass1:
    ; First-pass helper: rows at even multiples of 128 bytes go to m0-m7 and
    ; rows at odd multiples to m16-m23, each pre-multiplied by m12 (pd_2896)
    ; for the *_rect2 kernel entry points.
    pmulld          m0, m12, [cq+128* 0]
    pmulld          m1, m12, [cq+128* 2]
    pmulld          m2, m12, [cq+128* 4]
    pmulld          m3, m12, [cq+128* 6]
    pmulld          m4, m12, [cq+128* 8]
    pmulld          m5, m12, [cq+128*10]
    pmulld          m6, m12, [cq+128*12]
    pmulld          m7, m12, [cq+128*14]
    call m(idct_8x16_internal_10bpc).main_rect2
    pmulld          m16, m12, [cq+128* 1]
    pmulld          m17, m12, [cq+128* 3]
    pmulld          m18, m12, [cq+128* 5]
    pmulld          m19, m12, [cq+128* 7]
    pmulld          m20, m12, [cq+128* 9]
    pmulld          m21, m12, [cq+128*11]
    pmulld          m22, m12, [cq+128*13]
    pmulld          m23, m12, [cq+128*15]
    call m(idct_16x16_internal_10bpc).main_rect2
    vpbroadcastd    m11, [o(pd_1)]
    call m(idct_16x16_internal_10bpc).main_end2
    jmp m(idct_16x16_internal_10bpc).main_end3
.dconly:
    ; DC-only shortcut, reached when eob == 0; rounding and the store loop
    ; are left to the 16x8 .dconly code.
    imul            r6d, [cq], 181
    mov             [cq], eobd          ; eobd is 0 on this path, so this clears the DC coefficient
    or              r3d, 32             ; presumably the row count for the shared loop -- confirm in .dconly
    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly

; inv_txfm_add_identity_identity_16x32_10bpc(dst, stride, c, eob)
; .main processes sixteen destination rows from two .main_internal calls and
; four .write_16x2x2 calls. When eob >= 151 it is first invoked with call,
; then cq and dstq are adjusted and execution falls into .main a second time;
; the ret at the end of .write_16x2x2 finally returns from the function.
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd    m10, [pw_2896x8]
    vpbroadcastd    m11, [pw_1697x16]
    vpbroadcastd    m13, [pw_8192]
    vpbroadcastd    m15, [pixel_10bpc_max]
    lea             r6, [strideq*9]
    pxor            m14, m14
    paddw           m12, m13, m13 ; pw_16384
    cmp             eobd, 151
    jl .main
    call .main
    add             cq, 64-128*4        ; undo .main's cq advance and step 64 bytes further in
    lea             dstq, [dstq+strideq*8]
.main:
    call .main_internal
    add             cq, 128*4
    ; Scale the first helper's results by m13 (pw_8192) into the odd
    ; registers, freeing m2/m4/m6/m8 for the second helper call.
    pmulhrsw        m1, m13, m2
    pmulhrsw        m3, m13, m4
    pmulhrsw        m5, m13, m6
    pmulhrsw        m7, m13, m8
    call .main_internal
.main2:
    pmulhrsw        m2, m13
    pmulhrsw        m4, m13
    pmulhrsw        m6, m13
    pmulhrsw        m8, m13
    punpcklqdq      m0, m1, m2 ; 0 8
    punpckhqdq      m1, m2 ; 1 9
    call .write_16x2x2
    punpcklqdq      m0, m3, m4 ; 2 10
    punpckhqdq      m1, m3, m4 ; 3 11
    call .write_16x2x2
    punpcklqdq      m0, m5, m6 ; 4 12
    punpckhqdq      m1, m5, m6 ; 5 13
    call .write_16x2x2
    punpcklqdq      m0, m7, m8 ; 6 14
    punpckhqdq      m1, m7, m8 ; 7 15
.write_16x2x2:
    ; m0 covers destination rows 0 and 8, m1 rows 1 and 9 (r6 = stride*9);
    ; add, clamp to [m14, m15], store, then advance dstq by two rows.
    mova            ym2, [dstq+strideq*0]
    vinserti32x8    m2, [dstq+strideq*8], 1
    mova            ym9, [dstq+strideq*1]
    vinserti32x8    m9, [dstq+r6 ], 1
    paddw           m0, m2
    paddw           m1, m9
    pmaxsw          m0, m14
    pmaxsw          m1, m14
    pminsw          m0, m15
    pminsw          m1, m15
    mova            [dstq+strideq*0], ym0
    vextracti32x8   [dstq+strideq*8], m0, 1
    mova            [dstq+strideq*1], ym1
    vextracti32x8   [dstq+r6 ], m1, 1
    lea             dstq, [dstq+strideq*2]
    ret
.main_internal:
    ; Load coefficient rows 0-3 and pack them with rows 8-11 (128-byte
    ; spacing), scale by m10 (pw_2896x8), then compute
    ; x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384) with a saturating add.
    ; The rows that were read are cleared with m14 along the way, and the
    ; results are interleaved into m2, m4, m6, m8.
    mova            m8, [cq+128* 0]
    packssdw        m8, [cq+128* 8]
    mova            m6, [cq+128* 1]
    packssdw        m6, [cq+128* 9]
    mova            m0, [cq+128* 2]
    packssdw        m0, [cq+128*10]
    mova            m2, [cq+128* 3]
    packssdw        m2, [cq+128*11]
    REPX {pmulhrsw x, m10}, m8, m6, m0, m2
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    pmulhrsw        m4, m11, m8
    pmulhrsw        m9, m11, m6
    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3
    pmulhrsw        m4, m12
    pmulhrsw        m9, m12
    paddsw          m8, m4
    paddsw          m6, m9
    pmulhrsw        m4, m11, m0
    pmulhrsw        m9, m11, m2
    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11
    pmulhrsw        m4, m12
    pmulhrsw        m9, m12
    paddsw          m0, m4
    paddsw          m2, m9
    punpcklwd       m4, m8, m6
    punpckhwd       m8, m6
    punpcklwd       m6, m0, m2
    punpckhwd       m0, m2
    punpckldq       m2, m4, m6 ; 0 1
    punpckhdq       m4, m6 ; 2 3
    punpckldq       m6, m8, m0 ; 4 5
    punpckhdq       m8, m0 ; 6 7
    ret

; inv_txfm_add_dct_dct_32x16_10bpc(dst, stride, c, eob): only the first few
; instructions fall inside this span; eob == 0 branches to .dconly.
cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea             r5, [o_base]
    test            eobd, eobd
    jz .dconly
    ; The source operand of the instruction below sits on the following line
    ; of the file (the text was wrapped mid-instruction).
    vpbroadcastd    m12,
[o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif mov r6d, 8*12 cmp eobd, 36 jl .fast pmulld m0, m12, [cq+64* 0] pmulld m1, m12, [cq+64* 4] pmulld m2, m12, [cq+64* 8] pmulld m3, m12, [cq+64*12] pmulld m16, m12, [cq+64* 2] pmulld m17, m12, [cq+64* 6] pmulld m18, m12, [cq+64*10] pmulld m19, m12, [cq+64*14] cmp eobd, 151 jge .full call m(idct_8x16_internal_10bpc).main_fast_rect2 call m(idct_16x16_internal_10bpc).main_fast_rect2 call .idct16_sumsub call .pass1_load_spill call .main_fast_rect2 jmp .pass1_end .full: pmulld m4, m12, [cq+64*16] pmulld m5, m12, [cq+64*20] pmulld m6, m12, [cq+64*24] pmulld m7, m12, [cq+64*28] pmulld m20, m12, [cq+64*18] pmulld m21, m12, [cq+64*22] pmulld m22, m12, [cq+64*26] pmulld m23, m12, [cq+64*30] add r6d, 8*16 call m(idct_8x16_internal_10bpc).main_rect2 call m(idct_16x16_internal_10bpc).main_rect2 call .idct16_sumsub call .pass1_load_spill pmulld m16, m12, [cq+64*17] pmulld m17, m12, [cq+64*19] pmulld m18, m12, [cq+64*21] pmulld m19, m12, [cq+64*23] pmulld m20, m12, [cq+64*25] pmulld m21, m12, [cq+64*27] pmulld m22, m12, [cq+64*29] pmulld m23, m12, [cq+64*31] call .main_rect2 .pass1_end: vpbroadcastd m11, [o(pd_1)] lea r4, [cq+64] call .idct32_pass1_end lea r5, [o_base_8bpc] punpckhqdq m19, m5, m16 ; 11 punpcklqdq m5, m16 ; 10 punpckhqdq m16, m2, m1 ; 5 punpcklqdq m2, m1 ; 4 punpcklqdq m1, m15, m4 ; 2 punpckhqdq m15, m4 ; 3 punpcklqdq m4, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpckhqdq m14, m0, m20 ; 1 punpcklqdq m0, m20 ; 0 punpckhqdq m20, m6, m17 ; 13 punpcklqdq m6, m17 ; 12 punpckhqdq m17, m3, m21 ; 7 punpcklqdq m3, m21 ; 6 punpckhqdq m21, m7, m8 ; 15 punpcklqdq m7, m8 ; 14 call m(inv_txfm_add_dct_dct_32x8_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf jmp .end .fast: pmulld ym0, ym12, [cq+64*0] pmulld ym1, ym12, [cq+64*4] movshdup m7, [o(permB)] mova ym4, [cq+64*2] mova ym5, [cq+64*6] 
mova ym16, [cq+64*1] mova ym2, [cq+64*5] mova ym3, [cq+64*3] mova ym17, [cq+64*7] vpermt2q m4, m7, m5 ; 2 6 vpermt2q m16, m7, m2 ; 1 5 vpermt2q m17, m7, m3 ; 7 3 paddd ym0, ym13 paddd ym1, ym13 psrad ym0, 12 psrad ym1, 12 vpermq m0, m7, m0 ; 0 0 vpermq m1, m7, m1 ; 4 4 REPX {pmulld x, m12}, m4, m16, m17 REPX {paddd x, m13}, m4, m16, m17 REPX {psrad x, 12 }, m4, m16, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 vpbroadcastd m11, [o(pd_1)] call m(idct_16x16_internal_10bpc).main_end2 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 lea r5, [o_base_8bpc] punpckhqdq m14, m0, m2 ; 1 punpcklqdq m0, m2 ; 0 punpcklqdq m1, m3, m4 ; 2 punpckhqdq m15, m3, m4 ; 3 punpcklqdq m2, m5, m7 ; 4 punpckhqdq m16, m5, m7 ; 5 punpcklqdq m3, m6, m8 ; 6 punpckhqdq m17, m6, m8 ; 7 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast .end: %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif pxor m12, m12 .zero_loop: mova [cq+r6*8+64*3], m12 mova [cq+r6*8+64*2], m12 mova [cq+r6*8+64*1], m12 mova [cq+r6*8+64*0], m12 sub r6d, 8*4 jge .zero_loop call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start pmulhrsw m0, m11, m14 pmulhrsw m1, m11, m15 pmulhrsw m2, m11, m16 pmulhrsw m3, m11, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m18 pmulhrsw m1, m11, m19 pmulhrsw m2, m11, m20 pmulhrsw m3, m11, m21 vzeroupper jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 16 .dconly3: add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 .dconly2: vpbroadcastd m3, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw m2, r6d paddsw m2, m3 .dconly_loop: paddsw m0, m2, [dstq+strideq*0] paddsw m1, m2, [dstq+strideq*1] psubusw m0, m3 psubusw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET ALIGN function_align .idct16_sumsub: psubd m23, m0, m22 ; t15 paddd m0, m22 ; t0 psubd m22, m1, m21 ; t14 paddd m1, m21 ; t1 REPX 
{pmaxsd x, m14}, m23, m0, m22, m1 psubd m21, m2, m20 ; t13 paddd m2, m20 ; t2 REPX {pminsd x, m15}, m23, m0, m22, m1 psubd m20, m3, m19 ; t12 paddd m3, m19 ; t3 REPX {pmaxsd x, m14}, m21, m2, m20, m3 psubd m19, m4, m18 ; t11 paddd m4, m18 ; t4 REPX {pminsd x, m15}, m21, m2, m20, m3 psubd m18, m5, m17 ; t10 paddd m5, m17 ; t5 REPX {pmaxsd x, m14}, m19, m4, m18, m5 psubd m17, m6, m16 ; t9 paddd m6, m16 ; t6 REPX {pminsd x, m15}, m19, m4, m18, m5 psubd m16, m7, m9 ; t8 paddd m7, m9 ; t7 REPX {pmaxsd x, m14}, m17, m6, m16, m7 REPX {pminsd x, m15}, m17, m6, m16, m7 ret .idct32_pass1_end: psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 %macro IDCT32_PASS1_END 2 ; low, high paddd m8, m11, [r4+128*%1] paddd m9, m11, [cq+128*%1] psubd m10, m8, m%1 ; out 16+n paddd m8, m%1 ; out 15-n paddd m%1, m9, m%2 ; out 0+n psubd m9, m%2 ; out 31-n REPX {vpsravd x, m11}, m10, m%1, m8, m9 packssdw m%1, m10 ; 0+n 16+n packssdw m%2, m8, m9 ; 15-n 31-n %endmacro IDCT32_PASS1_END 0, 23 ; 0 16, 15 31 IDCT32_PASS1_END 7, 16 ; 7 23, 8 24 IDCT32_PASS1_END 1, 22 ; 1 17, 14 30 IDCT32_PASS1_END 6, 17 ; 6 22, 9 25 IDCT32_PASS1_END 2, 21 ; 2 18, 13 29 IDCT32_PASS1_END 5, 18 ; 5 21, 10 26 IDCT32_PASS1_END 3, 20 ; 3 19, 12 28 IDCT32_PASS1_END 4, 19 ; 4 20, 11 27 .transpose_16x32: mova m14, m13 vpermi2q m14, m0, m16 vpermt2q m0, m12, m16 mova m15, m13 vpermi2q m15, m1, m17 vpermt2q m1, m12, m17 mova m16, m13 vpermi2q m16, m2, m18 vpermt2q m2, m12, m18 mova m17, m13 vpermi2q m17, m3, m19 vpermt2q m3, m12, m19 mova m18, m13 vpermi2q m18, m4, m20 vpermt2q m4, m12, m20 mova m19, m13 vpermi2q m19, m5, m21 vpermt2q m5, m12, m21 mova m20, m13 vpermi2q m20, m6, m22 vpermt2q m6, m12, m22 mova m21, m13 vpermi2q m21, m7, m23 vpermt2q m7, m12, m23 punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07 punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03 punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07 punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03 punpckhwd m1, m4, m5 
; e04 f04 e05 f05 e06 f06 e07 f07 punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03 punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07 punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03 punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15 punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11 punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15 punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11 punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15 punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11 punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15 punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11 punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07 punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05 punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11 punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09 punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11 punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09 punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01 punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03 punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13 punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15 punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03 punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01 punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05 punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07 punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15 punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13 ret .pass1_load_spill: mova [cq+64* 0], m0 mova [cq+64* 2], m1 mova [cq+64* 4], m2 mova [cq+64* 6], m3 mova [cq+64* 8], m4 mova [cq+64*10], m5 mova [cq+64*12], m6 mova [cq+64*14], m7 pmulld m0, m12, [cq+64* 1] pmulld m1, m12, [cq+64* 3] pmulld m2, m12, [cq+64* 5] pmulld m3, m12, [cq+64* 7] pmulld m4, m12, [cq+64* 9] pmulld m5, m12, [cq+64*11] pmulld m6, m12, [cq+64*13] pmulld m7, m12, [cq+64*15] mova [cq+64* 1], m23 mova [cq+64* 3], m22 mova [cq+64* 5], m21 mova [cq+64* 7], m20 mova [cq+64* 9], 
m19 mova [cq+64*11], m18 mova [cq+64*13], m17 mova [cq+64*15], m16 ret .main_fast2_rect2: REPX {paddd x, m13}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_fast2: ; bottom 3/4 is zero pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a pmulld m0, [o(pd_201)] {1to16} ; t16a pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a pmulld m3, [o(pd_3857)] {1to16} ; t28a pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a pmulld m2, [o(pd_995)] {1to16} ; t20a pmulld m6, m1, [o(pd_601)] {1to16} ; t23a pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a REPX {psubd x, m13, x}, m20, m6 REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17 REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17 mova m8, m0 mova m16, m23 mova m7, m20 mova m4, m3 mova m19, m2 mova m18, m21 mova m5, m6 mova m22, m17 jmp .main3 .main_fast_rect2: call m(idct_8x16_internal_10bpc).round .main_fast: ; bottom half is zero pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a pmulld m0, [o(pd_201)] {1to16} ; t16a pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a pmulld m7, [o(pd_3035)] {1to16} ; t30a pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a pmulld m4, [o(pd_1751)] {1to16} ; t18a pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a pmulld m3, [o(pd_3857)] {1to16} ; t28a pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a pmulld m2, [o(pd_995)] {1to16} ; t20a pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a pmulld m5, [o(pd_3513)] {1to16} ; t26a pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a pmulld m6, [o(pd_2440)] {1to16} ; t22a pmulld m22, m1, [o(pd_601)] {1to16} ; t23a pmulld m1, [o(pd_4052)] {1to16} ; t24a REPX {psubd x, m13, x}, m16, m20, m18, m22 call m(idct_16x16_internal_10bpc).round3 jmp .main2 .main_rect2: call m(idct_8x16_internal_10bpc).round call m(idct_16x16_internal_10bpc).round .main: ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 
; t20a, t27a ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a call m(idct_16x16_internal_10bpc).round .main2: call m(idct_8x16_internal_10bpc).round psubd m8, m0, m16 ; t17 paddd m0, m16 ; t16 psubd m16, m23, m7 ; t30 paddd m23, m7 ; t31 REPX {pmaxsd x, m14}, m8, m0, m16, m23 paddd m7, m20, m4 ; t19 psubd m20, m4 ; t18 REPX {pminsd x, m15}, m8, m0, m16, m23 paddd m4, m3, m19 ; t28 psubd m3, m19 ; t29 REPX {pmaxsd x, m14}, m7, m20, m4, m3 psubd m19, m2, m18 ; t21 paddd m2, m18 ; t20 REPX {pminsd x, m15}, m7, m20, m4, m3 psubd m18, m21, m5 ; t26 paddd m21, m5 ; t27 REPX {pmaxsd x, m14}, m19, m2, m18, m21 psubd m5, m22, m6 ; t22 paddd m6, m22 ; t23 REPX {pminsd x, m15}, m19, m2, m18, m21 psubd m22, m1, m17 ; t25 paddd m17, m1 ; t24 REPX {pmaxsd x, m14}, m5, m6, m22, m17 REPX {pminsd x, m15}, m5, m6, m22, m17 .main3: vpbroadcastd m11, [o(pd_4017)] vpbroadcastd m10, [o(pd_799)] ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a vpbroadcastd m11, [o(pd_2276)] vpbroadcastd m10, [o(pd_3406)] ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a paddd m1, m6, m2 ; t23a psubd m6, m2 ; t20a psubd m2, m17, m21 ; t27a paddd m17, m21 ; t24a REPX {pmaxsd x, m14}, m1, m6, m2, m17 psubd m21, m23, m4 ; t28a paddd m23, m4 ; t31a REPX {pminsd x, m15}, m1, m6, m2, m17 psubd m4, m16, m20 ; t18 paddd m16, m20 ; t17 REPX {pmaxsd x, m14}, m21, m23, m4, m16 psubd m20, m0, m7 ; t19a paddd m0, m7 ; t16a REPX {pminsd x, m15}, m21, m23, m4, m16 psubd m7, m8, m3 ; t29 paddd m3, m8 ; t30 REPX {pmaxsd x, m14}, m20, m0, m7, m3 paddd m8, m5, m18 ; t22 psubd m5, m18 ; t21 REPX {pminsd x, m15}, m20, m0, m7, m3 psubd m18, m22, m19 ; t26 paddd m22, m19 ; t25 REPX {pmaxsd x, m14}, m8, m5, m18, m22 vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] REPX 
{pminsd x, m15}, m8, m5, m18, m22 ; operand list of the REPX that ends the previous line
    ; ---- The code down to the first `ret` is the tail of a routine that starts
    ; ---- above this span. It is reproduced unchanged.
    ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28
    ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20
    ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a
    ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a
    psubd m19, m0, m1 ; t23
    paddd m0, m1 ; t16
    paddd m1, m8, m16 ; t17a
    psubd m8, m16, m8 ; t22a
    REPX {pmaxsd x, m14}, m19, m0, m1, m8
    psubd m16, m23, m17 ; t24
    paddd m23, m17 ; t31
    REPX {pminsd x, m15}, m19, m0, m1, m8
    psubd m17, m3, m22 ; t25a
    paddd m22, m3 ; t30a
    REPX {pmaxsd x, m14}, m16, m23, m17, m22
    paddd m3, m6, m21 ; t19a
    psubd m6, m21, m6 ; t20a
    REPX {pminsd x, m15}, m16, m23, m17, m22
    paddd m21, m18, m4 ; t29
    psubd m18, m4, m18 ; t26
    REPX {pmaxsd x, m14}, m3, m6, m21, m18
    psubd m4, m20, m2 ; t27a
    paddd m20, m2 ; t28a
    REPX {pminsd x, m15}, m3, m6, m21, m18
    paddd m2, m7, m5 ; t18
    psubd m7, m5 ; t21
    REPX {pmaxsd x, m14}, m4, m20, m2, m7
    REPX {pminsd x, m15}, m4, m20, m2, m7
    REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8
    REPX {paddd x, m13}, m18, m16, m4, m17
    psubd m5, m18, m7 ; t21a
    paddd m18, m7 ; t26a
    psubd m7, m16, m19 ; t23a
    paddd m16, m19 ; t24a
    REPX {psrad x, 12 }, m5, m18, m7, m16
    paddd m19, m4, m6 ; t27
    psubd m4, m6 ; t20
    psubd m6, m17, m8 ; t22
    paddd m17, m8 ; t25
    REPX {psrad x, 12 }, m19, m4, m6, m17
    ret

; inv_txfm_add_identity_identity_32x16_10bpc(dst, stride, c, eob)
; Identity/identity inverse transform entry point. Declared through x86inc's
; cglobal with 4 arguments, 7 GPRs and 16 vector registers.
; Flow visible here:
;   * eob <  151: .main runs once.
;   * eob >= 151: .main runs twice; between the runs cq advances by 64*12 bytes
;     and dst is set to the original dst + 32 bytes.
; .main calls .main_internal twice and then tail-jumps into the 16x32 identity
; function's .main2, which is defined elsewhere and performs the rest.
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd m10, [pw_2896x8]
    vpbroadcastd m11, [pw_1697x16]
    vpbroadcastd m13, [pw_2048]
    vpbroadcastd m15, [pixel_10bpc_max]
    lea r6, [strideq*9]
    pxor m14, m14 ; m14 = 0, used below to clear consumed coefficients
    cmp eobd, 151
    jl .main ; small eob: a single .main pass is enough
    mov r4, dstq ; keep the original dst for the second pass
    call .main
    add cq, 64*12
    lea dstq, [r4+32]
.main:
    call .main_internal
    add cq, 64*4
    ; scale the first batch (m2/m4/m6/m8) by m13 into m1/m3/m5/m7 so the
    ; second .main_internal call can reuse m2/m4/m6/m8
    pmulhrsw m1, m13, m2
    pmulhrsw m3, m13, m4
    pmulhrsw m5, m13, m6
    pmulhrsw m7, m13, m8
    call .main_internal
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
.main_internal:
    ; Load rows 0-3 and pack them (signed saturation, dword -> word) with
    ; rows 8-11; rows are 64 bytes apart.
    mova m8, [cq+64* 0]
    packssdw m8, [cq+64* 8]
    mova m6, [cq+64* 1]
    packssdw m6, [cq+64* 9]
    mova m0, [cq+64* 2]
    packssdw m0, [cq+64*10]
    mova m2, [cq+64* 3]
    packssdw m2, [cq+64*11]
    REPX {pmulhrsw x, m10}, m8, m6, m0, m2 ; multiply by the pw_2896x8 constant
    REPX {paddsw x, x }, m8, m6, m0, m2 ; saturating doubling
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    ; each register then becomes 2*x + pmulhrsw(x, pw_1697x16)
    pmulhrsw m4, m11, m8
    pmulhrsw m9, m11, m6
    paddsw m8, m8
    paddsw m6, m6
    REPX {mova [cq+64*x], m14}, 0, 1, 2, 3 ; zero the rows just consumed
    paddsw m8, m4
    paddsw m6, m9
    pmulhrsw m4, m11, m0
    pmulhrsw m9, m11, m2
    paddsw m0, m0
    paddsw m2, m2
    REPX {mova [cq+64*x], m14}, 8, 9, 10, 11 ; zero the rows just consumed
    paddsw m0, m4
    paddsw m2, m9
    ; word then dword interleave; results are left in m2, m4, m6, m8
    punpcklwd m4, m8, m6
    punpckhwd m8, m6
    punpcklwd m6, m0, m2
    punpckhwd m0, m2
    punpckldq m2, m4, m6 ; 0 1
    punpckhdq m4, m6 ; 2 3
    punpckldq m6, m8, m0 ; 4 5
    punpckhdq m8, m0 ; 6 7
    ret

; inv_txfm_add_dct_dct_32x32_10bpc(dst, stride, c, eob)
; Entry point; the function continues past the end of this span.
; Dispatch on eob as written below:
;   eob == 0           -> .dconly
;   eob <  136         -> .fast (which itself branches to .fast2 for eob < 36)
;   136 <= eob < 543   -> .pass1_fast, then .lefthalf
;   eob >= 543         -> .full (.pass1), then .lefthalf
cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    WIN64_SPILL_XMM 30
    cmp eobd, 136
    jl .fast
    add cq, 64 ; first pass-1 call works on cq+64; undone in .lefthalf
    cmp eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    mov r6d, 16*12 ; start index for .right_zero_loop
    jmp .lefthalf
.full:
    call .pass1
    mov r6d, 16*28 ; start index for .right_zero_loop
.lefthalf:
    ; spill the 16 result registers of the first pass-1 call to cq+128*n
    ; (cq is still offset by +64 here)
    mova [cq+128* 0], m0
    mova [cq+128* 1], m1
    mova [cq+128* 2], m2
    mova [cq+128* 3], m3
    mova [cq+128* 4], m14
    mova [cq+128* 5], m15
    mova [cq+128* 6], m16
    mova [cq+128* 7], m17
    mova [cq+128* 8], m22
    mova [cq+128* 9], m23
    mova [cq+128*10], m24
    mova [cq+128*11], m25
    mova [cq+128*12], m26
    mova [cq+128*13], m27
    mova [cq+128*14], m28
    mova [cq+128*15], m29
    sub cq, 64
    ; m12-m15 are reloaded before the second pass-1 call (m14/m15 were just
    ; used as data registers above)
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    call .pass1
    lea r5, [o_base_8bpc]
    call .pass2_start
    pxor m12, m12
.right_zero_loop:
    ; store zeros to [cq + r6*8 + 64 + 128*{0..3}], stepping r6d down by 16*4
    ; until it goes negative
    mova [cq+r6*8+64+128*3], m12
    mova [cq+r6*8+64+128*2], m12
    mova [cq+r6*8+64+128*1], m12
    mova [cq+r6*8+64+128*0], m12
    sub r6d, 16*4
    jge .right_zero_loop
    mov r6d, 16*28
    jmp .end2
.pass2_start:
    ; load the values spilled by .lefthalf (offset +64 relative to current cq)
    mova m4, [cq+64+128* 0]
    mova m5, [cq+64+128* 1]
    mova m6, [cq+64+128* 2]
    mova m7, [cq+64+128* 3]
    mova m18, [cq+64+128* 4]
    mova m19, [cq+64+128* 5]
    mova m20, [cq+64+128* 6]
    ; NOTE: the memory operand of this instruction is on the next line of the file
    mova m21,
[cq+64+128* 7] ; memory operand of the `mova m21,` that ends the previous line
    ; ---- Interior of inv_txfm_add_dct_dct_32x32_10bpc: continuation of
    ; ---- .pass2_start, followed by the .fast/.fast2 paths, the shared ending,
    ; ---- .pass2_end, .dconly and the pass-1 helpers.
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; spill m14-m21 to cq+128*{0..7}; .pass2_end reloads them from there
    mova [cq+128*0], m14
    mova [cq+128*1], m15
    mova [cq+128*2], m16
    mova [cq+128*3], m17
    mova [cq+128*4], m18
    mova [cq+128*5], m19
    mova [cq+128*6], m20
    mova [cq+128*7], m21
    mova m14, [cq+64+128* 8]
    mova m15, [cq+64+128* 9]
    mova m16, [cq+64+128*10]
    mova m17, [cq+64+128*11]
    mova m18, [cq+64+128*12]
    mova m19, [cq+64+128*13]
    mova m20, [cq+64+128*14]
    mova m21, [cq+64+128*15]
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf ; tail call; returns to .pass2_start's caller
.fast: ; topleft 16x16 nonzero
    cmp eobd, 36
    jl .fast2
    call .pass1_fast
    lea r5, [o_base_8bpc]
    call .pass2_fast_start
    jmp .end
.pass2_fast_start:
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    mova [cq+128*0], m14
    mova [cq+128*1], m15
    mova [cq+128*2], m16
    mova [cq+128*3], m17
    mova [cq+128*4], m18
    mova [cq+128*5], m19
    mova [cq+128*6], m20
    mova [cq+128*7], m21
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
.fast2: ; topleft 8x8 nonzero
    movshdup m7, [o(permB)]
    ; only the low 256 bits (ym) of rows 0-7 are loaded on this path
    mova ym0, [cq+128*0]
    mova ym1, [cq+128*4]
    mova ym4, [cq+128*2]
    mova ym5, [cq+128*6]
    mova ym16, [cq+128*1]
    mova ym2, [cq+128*5]
    mova ym3, [cq+128*3]
    mova ym17, [cq+128*7]
    mov r6d, 16*4 ; start index for .zero_loop
    vpermq m0, m7, m0 ; 0 0
    vpermq m1, m7, m1 ; 4 4
    vpermt2q m4, m7, m5 ; 2 6
    vpermt2q m16, m7, m2 ; 1 5
    vpermt2q m17, m7, m3 ; 7 3
    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2
    call m(idct_16x16_internal_10bpc).main_end
    call .pass2_fast2_start
.end:
    pxor m12, m12
.end2:
    call .pass2_end
.zero_loop:
    ; store m12 (zeroed at .end, or before .right_zero_loop on the path that
    ; enters at .end2) to [cq + r6*8 + 128*{0..3}], stepping r6d down by 16*4
    ; until it goes negative
    mova [cq+r6*8+128*3], m12
    mova [cq+r6*8+128*2], m12
    mova [cq+r6*8+128*1], m12
    mova [cq+r6*8+128*0], m12
    sub r6d, 16*4
    jge .zero_loop
    WIN64_RESTORE_XMM
    vzeroupper
    ret
.pass2_fast2_start:
    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
    lea r5, [o_base_8bpc]
    punpckhqdq m22, m0, m2 ; 1
    punpcklqdq m0, m2 ; 0
    punpcklqdq m1, m5, m7 ; 4
    punpckhqdq m24, m5, m7 ; 5
    punpcklqdq m14, m3, m4 ; 2
    punpckhqdq m23, m3, m4 ; 3
    punpcklqdq m15, m6, m8 ; 6
    punpckhqdq m25, m6, m8 ; 7
    mova m10, m13
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
    mova [cq+128*0], m14
    mova [cq+128*1], m15
    mova [cq+128*2], m16
    mova [cq+128*3], m17
    mova [cq+128*4], m18
    mova [cq+128*5], m19
    mova [cq+128*6], m20
    mova [cq+128*7], m21
    jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
.pass2_end:
    ; Final butterflies with saturating word adds/subs, producing out0..out31
    ; as annotated, interleaved with calls to the 32x8 function's write helpers.
    psubsw m9, m0, m29 ; out31
    paddsw m0, m29 ; out0
    psubsw m29, m1, m28 ; out30
    paddsw m1, m28 ; out1
    psubsw m28, m2, m27 ; out29
    paddsw m2, m27 ; out2
    psubsw m27, m3, m26 ; out28
    paddsw m3, m26 ; out3
    psubsw m26, m4, m25 ; out27
    paddsw m4, m25 ; out4
    psubsw m25, m5, m24 ; out26
    paddsw m5, m24 ; out5
    psubsw m24, m6, m23 ; out25
    paddsw m6, m23 ; out6
    psubsw m23, m7, m22 ; out24
    paddsw m7, m22 ; out7
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start
    ; reload the eight vectors spilled to cq+128*{0..7} by the pass2 *_start helpers
    mova m0, [cq+128*0]
    mova m1, [cq+128*1]
    mova m2, [cq+128*2]
    mova m3, [cq+128*3]
    mova m4, [cq+128*4]
    mova m5, [cq+128*5]
    mova m6, [cq+128*6]
    mova m7, [cq+128*7]
    psubsw m22, m0, m21 ; out23
    paddsw m0, m21 ; out8
    psubsw m21, m1, m20 ; out22
    paddsw m1, m20 ; out9
    psubsw m20, m2, m19 ; out21
    paddsw m2, m19 ; out10
    psubsw m19, m3, m18 ; out20
    paddsw m3, m18 ; out11
    psubsw m18, m4, m17 ; out19
    paddsw m4, m17 ; out12
    psubsw m17, m5, m16 ; out18
    paddsw m5, m16 ; out13
    psubsw m16, m6, m15 ; out17
    paddsw m6, m15 ; out14
    psubsw m15, m7, m14 ; out16
    paddsw m7, m14 ; out15
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
    ; out16..out31 are scaled by m11 four at a time and handed to write_32x4.
    ; NOTE(review): m11 is not loaded in this routine; presumably the
    ; write_32x8_start helper sets it -- confirm against that helper.
    pmulhrsw m0, m11, m15
    pmulhrsw m1, m11, m16
    pmulhrsw m2, m11, m17
    pmulhrsw m3, m11, m18
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw m0, m11, m19
    pmulhrsw m1, m11, m20
    pmulhrsw m2, m11, m21
    pmulhrsw m3, m11, m22
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw m0, m11, m23
    pmulhrsw m1, m11, m24
    pmulhrsw m2, m11, m25
    pmulhrsw m3, m11, m26
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw m0, m11, m27
    pmulhrsw m1, m11, m28
    pmulhrsw m2, m11, m29
    pmulhrsw m3, m11, m9
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
.dconly:
    ; Reached only when eob == 0 (see the test/jz at function entry).
    imul r6d, [cq], 181
    mov [cq], eobd ; eobd is zero here, so this clears the first coefficient
    or r3d, 32 ; r3d was 0 on this path, so it becomes 32 for the target's loop
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2
.pass1_fast:
    ; Reduced pass 1: loads only rows 0,4,8,12 and 2,6,10,14 (128 bytes apart)
    ; before calling the *_fast helper variants.
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 4]
    mova m2, [cq+128* 8]
    mova m3, [cq+128*12]
    mov r6d, 16*12
    call m(idct_8x16_internal_10bpc).main_fast
    mova m16, [cq+128* 2]
    mova m17, [cq+128* 6]
    mova m18, [cq+128*10]
    mova m19, [cq+128*14]
    call m(idct_16x16_internal_10bpc).main_fast
    call .pass1_load_spill
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    jmp .pass1_end
.pass1:
    ; Full pass 1: rows 0,4,..,28 then 2,6,..,30; .pass1_load_spill and the
    ; loads after it fetch the odd rows.
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 4]
    mova m2, [cq+128* 8]
    mova m3, [cq+128*12]
    mova m4, [cq+128*16]
    mova m5, [cq+128*20]
    mova m6, [cq+128*24]
    mova m7, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main
    mova m16, [cq+128* 2]
    mova m17, [cq+128* 6]
    mova m18, [cq+128*10]
    mova m19, [cq+128*14]
    mova m20, [cq+128*18]
    mova m21, [cq+128*22]
    mova m22, [cq+128*26]
    mova m23, [cq+128*30]
    call m(idct_16x16_internal_10bpc).main
    call .pass1_load_spill
    mova m16, [cq+128*17]
    mova m17, [cq+128*19]
    mova m18, [cq+128*21]
    mova m19, [cq+128*23]
    mova m20, [cq+128*25]
    mova m21, [cq+128*27]
    mova m22, [cq+128*29]
    mova m23, [cq+128*31]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main
.pass1_end:
    vpbroadcastd m11, [o(pd_2)]
    lea r4, [cq+128*8] ; second base pointer passed to idct32_pass1_end
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end
    ; qword interleave of the helper's outputs; the existing trailing numbers
    ; label each result
    punpckhqdq m22, m0, m20 ; 1
    punpcklqdq m0, m20 ; 0
    punpckhqdq m24, m2, m1 ; 5
    punpcklqdq m1, m2, m1 ; 4
    punpcklqdq m2, m14, m18 ; 8
    punpckhqdq m26, m14, m18 ; 9
    punpcklqdq m14, m15, m4 ; 2
    punpckhqdq m23, m15, m4 ; 3
    punpckhqdq m25, m3, m21 ; 7
    punpcklqdq m15, m3, m21 ; 6
    punpckhqdq m28, m6, m17 ; 13
    punpcklqdq m3, m6, m17 ; 12
    punpckhqdq m27, m5, m16 ; 11
    punpcklqdq m16, m5, m16 ; 10
    punpckhqdq m29, m7, m8 ; 15
    punpcklqdq m17, m7, m8 ; 14
    ret
.pass1_load_spill:
    ; After idct16_sumsub, swap register contents with memory: m0-m7 are
    ; written to rows 0-7 while rows 1,3,..,15 are loaded into m0-m7.
    ; Each odd row is read before the store that overwrites it.
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova [cq+128* 0], m0
    mova m0, [cq+128* 1]
    mova [cq+128* 1], m1
    mova [cq+128* 2], m2
    mova m1, [cq+128* 3]
    mova m2, [cq+128* 5]
    mova [cq+128* 3], m3
    mova [cq+128* 4], m4
    mova m3, [cq+128* 7]
    mova m4, [cq+128* 9]
    mova [cq+128* 5], m5
    mova [cq+128* 6], m6
    ; NOTE: the operands of this instruction are on the next line of the file
    mova
[cq+128* 7], m7 ; operands of the `mova` that ends the previous line
    ; ---- remainder of the 32x32 dct function's .pass1_load_spill ----
    mova m5, [cq+128*11]
    mova m6, [cq+128*13]
    mova m7, [cq+128*15]
    ; m23..m16 are stored to rows 8..15, in descending register order
    mova [cq+128* 8], m23
    mova [cq+128* 9], m22
    mova [cq+128*10], m21
    mova [cq+128*11], m20
    mova [cq+128*12], m19
    mova [cq+128*13], m18
    mova [cq+128*14], m17
    mova [cq+128*15], m16
    ret

; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob)
; Identity/identity inverse transform entry point. Declared through x86inc's
; cglobal with 4 arguments, 7 GPRs and 16 vector registers.
; Number of .main passes, as coded below:
;   eob <  136        -> 1 pass
;   136 <= eob < 543  -> 3 passes
;   eob >= 543        -> 4 passes
; Between passes cq and dst are stepped as written; dst alternates between
; "+ stride*8" and "original dst + 32 bytes". Each .main ends by tail-jumping
; into the 16x32 identity function's .main2, defined elsewhere.
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob
%undef cmp
    vpbroadcastd m13, [pw_8192]
    vpbroadcastd m15, [pixel_10bpc_max]
    pxor m14, m14 ; m14 = 0, used to clear consumed coefficients
    lea r6, [strideq*9]
    cmp eobd, 136
    jl .main
    mov r4, dstq ; remember the original dst
    call .main
    add cq, 64-128*4
    lea dstq, [dstq+strideq*8]
    call .main
    add cq, 128*12-64
    lea dstq, [r4+32]
    cmp eobd, 543
    jl .main ; third and final pass
    call .main
    add cq, 64-128*4
    lea dstq, [dstq+strideq*8]
.main:
    call .main_internal
    add cq, 128*4
    ; scale the first batch (m2/m4/m6/m8) by m13 into m1/m3/m5/m7 so the
    ; second .main_internal call can reuse m2/m4/m6/m8
    pmulhrsw m1, m13, m2
    pmulhrsw m3, m13, m4
    pmulhrsw m5, m13, m6
    pmulhrsw m7, m13, m8
    call .main_internal
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
.main_internal:
    ; Load rows 0-3 and pack them (signed saturation, dword -> word) with
    ; rows 8-11; rows are 128 bytes apart. No multiply is applied here.
    mova m8, [cq+128* 0]
    packssdw m8, [cq+128* 8]
    mova m6, [cq+128* 1]
    packssdw m6, [cq+128* 9]
    mova m0, [cq+128* 2]
    packssdw m0, [cq+128*10]
    mova m2, [cq+128* 3]
    packssdw m2, [cq+128*11]
    REPX {vpermq x, x, q3120}, m8, m6, m0, m2
    REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 ; zero the rows just consumed
    punpcklwd m4, m8, m6
    punpckhwd m8, m6
    punpcklwd m6, m0, m2
    punpckhwd m0, m2
    REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 ; zero the rows just consumed
    ; results are left in m2, m4, m6, m8
    punpckldq m2, m4, m6 ; 0 1
    punpckhdq m4, m6 ; 2 3
    punpckldq m6, m8, m0 ; 4 5
    punpckhdq m8, m0 ; 6 7
    ret

; inv_txfm_add_dct_dct_16x64_10bpc(dst, stride, c, eob)
; Entry point; the function continues past the end of this span.
; The eob == 0 check happens before PROLOGUE, so the .dconly path runs without
; the 32-register / 8*mmsize-byte stack setup that PROLOGUE requests.
; Dispatch as coded: eob < 36 -> .fast; eob >= 151 -> .full (after one .pass1);
; otherwise the code following `jge .full`.
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp eobd, 36
    jl .fast
    call .pass1
    cmp eobd, 151
    jge .full
    lea r5, [o_base_8bpc]
    ; duplicate each word of the pass-1 results (register interleaved with
    ; itself) into the registers the following 8bpc helpers read
    punpckhwd m22, m0, m0
    punpckhwd m23, m1, m1
    punpckhwd m24, m2, m2
    punpckhwd m25, m3, m3
    punpckhwd m26, m4, m4
    punpckhwd m27, m5, m5
    punpckhwd m28, m6, m6
    punpckhwd m29, m7, m7
    punpcklwd m21, m1, m1
    punpcklwd m14, m3, m3
    ; NOTE: the last operand of this instruction is on the next line of the file
    punpcklwd m18, m5,
m5 ; last operand of the `punpcklwd m18, m5,` that ends the previous line
    ; ---- Interior of inv_txfm_add_dct_dct_16x64_10bpc: rest of the
    ; ---- 36 <= eob < 151 path, then .full, .fast, .pass2_end, .pass1, .dconly.
    punpcklwd m15, m7, m7
    pxor m9, m9
    punpcklwd m9, m9, m0 ; zero interleaved with m0
    punpcklwd m8, m2, m2
    punpcklwd m7, m4, m4
    punpcklwd m1, m6, m6
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    ; spill m14-m21 to the 8 stack slots reserved by PROLOGUE; .pass2_end reloads them
    mova [rsp+mmsize*0], m14
    mova [rsp+mmsize*1], m15
    mova [rsp+mmsize*2], m16
    mova [rsp+mmsize*3], m17
    mova [rsp+mmsize*4], m18
    mova [rsp+mmsize*5], m19
    mova [rsp+mmsize*6], m20
    mova [rsp+mmsize*7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
    pxor m12, m12
    mov r3d, 64*3
.zero_loop:
    ; store zeros to [cq + r3*8 + 128*{0..3}], stepping r3d down by 64 until negative
    REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
    sub r3d, 64
    jge .zero_loop
    jmp .pass2_end
.full:
    ; eob >= 151: save the first .pass1 result in cq+128*{0..7}, then run
    ; .pass1 again on cq+64
    mova [cq+128*0], m0
    mova [cq+128*1], m1
    mova [cq+128*2], m2
    mova [cq+128*3], m3
    mova [cq+128*4], m4
    mova [cq+128*5], m5
    mova [cq+128*6], m6
    mova [cq+128*7], m7
    add cq, 64
    call .pass1
    sub cq, 64
    mova m22, [cq+128*0] ; 0 1
    mova m23, [cq+128*1] ; 2 3
    mova m24, [cq+128*2] ; 4 5
    mova m25, [cq+128*3] ; 6 7
    mova m26, [cq+128*4] ; 8 9
    mova m27, [cq+128*5] ; 10 11
    mova m28, [cq+128*6] ; 12 13
    mova m29, [cq+128*7] ; 14 15
    ; keep the second .pass1 result at cq+64*{8..15}; it is reloaded further down
    mova [cq+64* 8], m0
    mova [cq+64* 9], m1
    mova [cq+64*10], m2
    mova [cq+64*11], m3
    mova [cq+64*12], m4
    mova [cq+64*13], m5
    mova [cq+64*14], m6
    mova [cq+64*15], m7
    lea r5, [o_base_8bpc]
    punpcklwd m20, m1, m1
    punpcklwd m16, m3, m3
    punpcklwd m19, m5, m5
    punpcklwd m17, m7, m7
    punpcklwd m8, m24, m24 ; 4
    punpcklwd m5, m2, m2 ; 20
    punpcklwd m1, m28, m28 ; 12
    punpcklwd m7, m26, m26 ; 8
    punpcklwd m3, m4, m4 ; 24
    punpcklwd m4, m6, m6 ; 28
    pxor m9, m9
    punpcklwd m6, m9, m0 ; __ 16
    mova m0, m4
    punpcklwd m9, m9, m22 ; __ 0
    call m(idct_16x16_internal_8bpc).main_fast
    punpcklwd m21, m23, m23 ; 2
    punpcklwd m15, m29, m29 ; 14
    punpcklwd m18, m27, m27 ; 10
    punpcklwd m14, m25, m25 ; 6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mova [rsp+mmsize*0], m14
    mova [rsp+mmsize*1], m15
    mova [rsp+mmsize*2], m16
    mova [rsp+mmsize*3], m17
    mova [rsp+mmsize*4], m18
    mova [rsp+mmsize*5], m19
    mova [rsp+mmsize*6], m20
    mova [rsp+mmsize*7], m21
    ; reload the second .pass1 result saved at cq+64*{8..15}
    mova m21, [cq+64*15]
    mova m14, [cq+64* 8]
    mova m17, [cq+64*11]
    mova m18, [cq+64*12]
    mova m19, [cq+64*13]
    mova m16, [cq+64*10]
    mova m15, [cq+64* 9]
    mova m20, [cq+64*14]
    REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
                           m24, m19, m16, m27, m28, m15, m20, m23
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
    pxor m12, m12
    mov r3d, 32*7
.full_zero_loop:
    ; store zeros to [cq + r3*8 + 64*{0..3}], stepping r3d down by 32 until negative
    REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
    sub r3d, 32
    jge .full_zero_loop
    jmp .pass2_end
.fast:
    ; eob < 36: only the low 256 bits (ym) of rows 0-7 are loaded
    mova ym0, [cq+128*0]
    mova ym2, [cq+128*4]
    movshdup m8, [o(permB)]
    mova ym1, [cq+128*2]
    mova ym3, [cq+128*6]
    mova ym4, [cq+128*1]
    mova ym5, [cq+128*3]
    mova ym6, [cq+128*5]
    mova ym7, [cq+128*7]
    vpermt2q m0, m8, m2 ; 0 4
    vpermt2q m1, m8, m3 ; 2 6
    vpermt2q m4, m8, m5 ; 1 3
    vpermt2q m7, m8, m6 ; 7 5
    call m(idct_8x8_internal_10bpc).main_fast
    call m(idct_16x8_internal_10bpc).main_fast
    vpbroadcastd m11, [o(pd_2)]
    call m(idct_8x16_internal_10bpc).main_end2
    mova m8, [o(idct8x32p)]
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    mova m6, [dup16_perm]
    vpermb m0, m8, m0
    vpermb m2, m8, m2
    vprold m8, 16 ; rotate each dword of the byte-permute control by 16 bits
    vpermb m1, m8, m1
    vpermb m3, m8, m3
    punpckldq m4, m0, m2
    punpckhdq m0, m2
    punpckldq m2, m1, m3
    punpckhdq m1, m3
    punpckldq m21, m4, m2
    punpckhdq m14, m4, m2
    punpckldq m18, m0, m1
    punpckhdq m15, m0, m1
    vpord m7, m6, [o(pb_32)] {1to16} ; m7 = m6 OR broadcast pb_32: second permute control
    vpermb m22, m7, m21 ; 1
    pmovzxwd m9, ym21 ; 0
    vpermb m8, m6, m18 ; 4
    vpermb m24, m7, m18 ; 5
    vpermb m21, m6, m14 ; 2
    vpermb m23, m7, m14 ; 3
    vpermb m14, m6, m15 ; 6
    vpermb m25, m7, m15 ; 7
    lea r5, [o_base_8bpc]
    pslld m9, 16 ; move the zero-extended words into the upper half of each dword
    pxor m7, m7
    REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29 ; these inputs are zero on this path
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    mova [rsp+mmsize*0], m14
    mova [rsp+mmsize*1], m15
    mova [rsp+mmsize*2], m16
    mova [rsp+mmsize*3], m17
    mova [rsp+mmsize*4], m18
    mova [rsp+mmsize*5], m19
    mova [rsp+mmsize*6], m20
    mova [rsp+mmsize*7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
    pxor m12, m12
    REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the 256-bit halves read above
.pass2_end:
    ; Shared output stage. Each write_16x4 call receives two vectors in m8/m9,
    ; produced by vpermq with controls m30 (movshdup of permC) and
    ; m31 (m30 shifted right by 8 bits per qword).
    movshdup m30, [permC]
    vpbroadcastd m11, [pw_2048]
    vpbroadcastd m13, [pixel_10bpc_max]
    lea r6, [strideq*3]
    psrlq m31, m30, 8
    vpermq m8, m30, m0
    vpermq m9, m31, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m2
    vpermq m9, m31, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m4
    vpermq m9, m31, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m6
    vpermq m9, m31, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    ; reload the eight spilled vectors and butterfly them against m21..m14
    mova m1, [rsp+mmsize*0]
    mova m2, [rsp+mmsize*1]
    mova m3, [rsp+mmsize*2]
    mova m4, [rsp+mmsize*3]
    mova m5, [rsp+mmsize*4]
    mova m6, [rsp+mmsize*5]
    mova m7, [rsp+mmsize*6]
    mova m8, [rsp+mmsize*7]
    paddsw m0, m1, m21
    psubsw m21, m1, m21
    paddsw m1, m2, m20
    psubsw m20, m2, m20
    paddsw m2, m3, m19
    psubsw m19, m3, m19
    paddsw m3, m4, m18
    psubsw m18, m4, m18
    paddsw m4, m5, m17
    psubsw m17, m5, m17
    paddsw m5, m6, m16
    psubsw m16, m6, m16
    paddsw m6, m7, m15
    psubsw m15, m7, m15
    paddsw m7, m8, m14
    psubsw m14, m8, m14
    ; written in register order: m0-m7, then m14-m21, then m22-m29
    vpermq m8, m30, m0
    vpermq m9, m31, m1
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m2
    vpermq m9, m31, m3
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m4
    vpermq m9, m31, m5
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m6
    vpermq m9, m31, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m14
    vpermq m9, m31, m15
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m16
    vpermq m9, m31, m17
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m18
    vpermq m9, m31, m19
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m20
    vpermq m9, m31, m21
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m22
    vpermq m9, m31, m23
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m24
    vpermq m9, m31, m25
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m26
    vpermq m9, m31, m27
    call m(idct_16x8_internal_10bpc).write_16x4
    vpermq m8, m30, m28
    vpermq m9, m31, m29
    call m(idct_16x8_internal_10bpc).write_16x4
    RET
.pass1:
    ; Pass 1: even rows 0,2,..,14 go to the 8x16 helper, odd rows 1,3,..,15 to
    ; the 16x16 helper (rows are 128 bytes apart); ends with a tail call to main_end3.
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 2]
    mova m2, [cq+128* 4]
    mova m3, [cq+128* 6]
    mova m4, [cq+128* 8]
    mova m5, [cq+128*10]
    mova m6, [cq+128*12]
    mova m7, [cq+128*14]
    call m(idct_8x16_internal_10bpc).main
    mova m16, [cq+128* 1]
    mova m17, [cq+128* 3]
    mova m18, [cq+128* 5]
    mova m19, [cq+128* 7]
    mova m20, [cq+128* 9]
    mova m21, [cq+128*11]
    mova m22, [cq+128*13]
    mova m23, [cq+128*15]
    call m(idct_16x16_internal_10bpc).main
    call m(idct_16x16_internal_10bpc).main_end
    jmp m(idct_16x16_internal_10bpc).main_end3
.dconly:
    ; Reached only when eob == 0 (see the test/jz at function entry).
    imul r6d, [cq], 181
    mov [cq], eobd ; eobd is zero here, so this clears the first coefficient
    or r3d, 64 ; r3d was 0 on this path, so it becomes 64 for the target's loop
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2

; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob)
; Only the beginning of this function lies inside this span; it is reproduced
; unchanged. Like the 16x64 function, it handles eob == 0 before PROLOGUE.
; PROLOGUE here requests 32 vector registers and 64*40 bytes of stack.
cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp eobd, 136
    jl .fast
    add cq, 64
    cmp eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    jmp .lefthalf
.full:
    call .pass1
    mov r3d, 16*28
.lefthalf:
    mova [cq+128* 0], m27
    mova [cq+128* 1], m14
    mova [cq+128* 2], m28
    mova [cq+128* 3], m15
    mova [cq+128* 4], m22
    mova [cq+128* 5], m23
    mova [cq+128* 6], m24
    mova [cq+128* 7], m25
    mova [cq+128* 8], m0
    mova [cq+128* 9], m26
    mova [cq+128*10], m20
    mova [cq+128*11], m21
    mova [cq+128*12], m18
    mova [cq+128*13], m16
    mova [cq+128*14], m17
    mova [cq+128*15], m3
    sub cq, 64
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    call .pass1
    call .pass2_start
    pxor m31, m31
.right_zero_loop:
    REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3
    sub r3d, 16*4
    jge .right_zero_loop
    mov r3d, 16*28
    jmp .left_zero_loop
.pass2_start:
    vpbroadcastd m10, [o(pd_2048)]
    lea r5, [o_base_8bpc]
    lea r4, [rsp+gprsize]
    mova m1, [cq+128*15+64]
    mova m2, [cq+128* 8+64]
    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
    mova m0, m21
    mova m1, [cq+128*12+64]
    ; NOTE: the memory operand of this instruction is on the next line of the file
    mova m2,
[cq+128*11+64] mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, m20 mova m1, [cq+128*13+64] mova m2, [cq+128*10+64] mova m3, m16 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, m26 mova m1, [cq+128*14+64] mova m2, [cq+128* 9+64] mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 mova m1, m28 mova m2, [cq+128* 0+64] mova m3, [cq+128* 2+64] mova m16, [cq+128* 1+64] mova m17, [cq+128* 3+64] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m26, [cq+128* 4+64] mova m27, [cq+128* 5+64] mova m28, [cq+128* 6+64] mova m29, [cq+128* 7+64] mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova [rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast vpbroadcastd m10, [o(pd_2048)] call .pass2_fast_start jmp .end .fast2: ; topleft 8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*0] mova ym1, [cq+128*4] mova ym4, [cq+128*2] mova ym5, [cq+128*6] mova ym16, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym17, [cq+128*7] mov r3d, 16*4 vpermq m0, m7, m0 ; 0 0 vpermq m1, m7, m1 ; 4 4 vpermt2q m4, m7, m5 ; 2 6 vpermt2q m16, m7, m2 ; 1 5 vpermt2q m17, m7, m3 ; 7 3 REPX {pmulld x, m12}, m0, m1, m4, m16, m17 REPX {paddd x, m13}, m0, m1, m4, m16, m17 REPX {psrad x, 12 }, m0, m1, m4, m16, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 vpbroadcastd m11, [o(pd_1)] call m(idct_16x16_internal_10bpc).main_end2 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 punpcklqdq m27, m0, m2 ; 0 punpckhqdq m0, m2 ; 1 punpcklqdq m22, m3, m4 ; 2 punpckhqdq m26, m3, m4 ; 3 punpcklqdq m14, m5, m7 ; 4 punpckhqdq m20, m5, m7 ; 5 punpcklqdq m23, m6, m8 ; 6 punpckhqdq m21, m6, m8 ; 7 mova m10, m13 call 
.pass2_fast2_start .end: pxor m31, m31 .left_zero_loop: REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3 sub r3d, 16*4 jge .left_zero_loop call .pass2_end RET .pass2_end: DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi vpbroadcastd m30, [pixel_10bpc_max] vpbroadcastd m13, [pw_2048] mov stride32q, strideq shl stride32q, 5 lea stkhiq, [rsp+31*mmsize+gprsize] lea dst2q, [dstq+stride32q] lea stkloq, [rsp+gprsize] sub dst2q, strideq ; dst31 paddsw m8, m0, m29 ; t0[idct32] psubsw m9, m0, m29 ; t31[idct32] call .end_sumsub_write paddsw m8, m1, m28 ; t1[idct32] psubsw m9, m1, m28 ; t30[idct32] call .end_sumsub_write paddsw m8, m2, m27 ; t2[idct32] psubsw m9, m2, m27 ; t29[idct32] call .end_sumsub_write paddsw m8, m3, m26 ; t3[idct32] psubsw m9, m3, m26 ; t28[idct32] call .end_sumsub_write paddsw m8, m4, m25 ; t4[idct32] psubsw m9, m4, m25 ; t27[idct32] call .end_sumsub_write paddsw m8, m5, m24 ; t5[idct32] psubsw m9, m5, m24 ; t26[idct32] call .end_sumsub_write paddsw m8, m6, m23 ; t6[idct32] psubsw m9, m6, m23 ; t25[idct32] call .end_sumsub_write paddsw m8, m7, m22 ; t7[idct32] psubsw m9, m7, m22 ; t24[idct32] call .end_sumsub_write mova m0, [rsp+64*32+gprsize] mova m1, [rsp+64*33+gprsize] mova m2, [rsp+64*34+gprsize] mova m3, [rsp+64*35+gprsize] mova m4, [rsp+64*36+gprsize] mova m5, [rsp+64*37+gprsize] mova m6, [rsp+64*38+gprsize] mova m7, [rsp+64*39+gprsize] paddsw m8, m0, m21 ; t8[idct32] psubsw m9, m0, m21 ; t23[idct32] call .end_sumsub_write paddsw m8, m1, m20 ; t9[idct32] psubsw m9, m1, m20 ; t22[idct32] call .end_sumsub_write paddsw m8, m2, m19 ; t10[idct32] psubsw m9, m2, m19 ; t21[idct32] call .end_sumsub_write paddsw m8, m3, m18 ; t11[idct32] psubsw m9, m3, m18 ; t20[idct32] call .end_sumsub_write paddsw m8, m4, m17 ; t12[idct32] psubsw m9, m4, m17 ; t19[idct32] call .end_sumsub_write paddsw m8, m5, m16 ; t13[idct32] psubsw m9, m5, m16 ; t18[idct32] call .end_sumsub_write paddsw m8, m6, m15 ; t14[idct32] psubsw m9, m6, m15 ; t17[idct32] call 
.end_sumsub_write paddsw m8, m7, m14 ; t15[idct32] psubsw m9, m7, m14 ; t16[idct32] ; fall-through .end_sumsub_write: mova m10, [stkhiq] ; t63-n mova m12, [stkloq] ; t32+n psubsw m11, m8, m10 ; out63-n paddsw m8, m10 ; out0 +n psubsw m10, m9, m12 ; out32+n paddsw m9, m12 ; out32-n REPX {pmulhrsw x, m13}, m11, m8, m10, m9 paddw m8, [dstq] paddw m9, [dst2q] paddw m10, [dstq+stride32q] paddw m11, [dst2q+stride32q] REPX {pminsw x, m30}, m11, m8, m10, m9 REPX {pmaxsw x, m31}, m11, m8, m10, m9 mova [dstq ], m8 mova [dst2q ], m9 mova [dstq +stride32q], m10 mova [dst2q+stride32q], m11 add stkloq, mmsize sub stkhiq, mmsize add dstq, strideq sub dst2q, strideq ret .pass2_fast_start: lea r5, [o_base_8bpc] lea r4, [rsp+gprsize] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m21 mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m20 mova m3, m16 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m26 mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 mova m1, m28 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova [rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 .pass2_fast2_start: lea r5, [o_base_8bpc] lea r4, [rsp+gprsize] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m21 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m20 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m26 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3 mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova 
[rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3 .dconly: DEFINE_ARGS dst, stride, c, eob imul r6d, [cq], 181 mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3 .pass1_fast: pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 4] pmulld m2, m12, [cq+128* 8] pmulld m3, m12, [cq+128*12] mov r3d, 16*12 call m(idct_8x16_internal_10bpc).main_fast_rect2 pmulld m16, m12, [cq+128* 2] pmulld m17, m12, [cq+128* 6] pmulld m18, m12, [cq+128*10] pmulld m19, m12, [cq+128*14] call m(idct_16x16_internal_10bpc).main_fast_rect2 call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 jmp .pass1_end .pass1: pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 4] pmulld m2, m12, [cq+128* 8] pmulld m3, m12, [cq+128*12] pmulld m4, m12, [cq+128*16] pmulld m5, m12, [cq+128*20] pmulld m6, m12, [cq+128*24] pmulld m7, m12, [cq+128*28] call m(idct_8x16_internal_10bpc).main_rect2 pmulld m16, m12, [cq+128* 2] pmulld m17, m12, [cq+128* 6] pmulld m18, m12, [cq+128*10] pmulld m19, m12, [cq+128*14] pmulld m20, m12, [cq+128*18] pmulld m21, m12, [cq+128*22] pmulld m22, m12, [cq+128*26] pmulld m23, m12, [cq+128*30] call m(idct_16x16_internal_10bpc).main_rect2 call .pass1_load_spill pmulld m16, m12, [cq+128*17] pmulld m17, m12, [cq+128*19] pmulld m18, m12, [cq+128*21] pmulld m19, m12, [cq+128*23] pmulld m20, m12, [cq+128*25] pmulld m21, m12, [cq+128*27] pmulld m22, m12, [cq+128*29] pmulld m23, m12, [cq+128*31] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2 .pass1_end: vpbroadcastd m11, [o(pd_1)] lea r4, [cq+128*8] call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end punpcklqdq m27, m0, m20 ; 0 punpckhqdq m0, m20 ; 1 punpcklqdq m24, m5, m16 ; 10 punpckhqdq m16, m5, m16 ; 11 punpcklqdq m23, m3, m21 ; 6 punpckhqdq m21, m3, m21 ; 7 punpcklqdq m25, m7, m8 ; 14 punpckhqdq 
m3, m7, m8 ; 15
    punpcklqdq m22, m15, m4 ; 2
    punpckhqdq m26, m15, m4 ; 3
    punpcklqdq m15, m6, m17 ; 12
    punpckhqdq m17, m6, m17 ; 13
    punpcklqdq m28, m14, m18 ; 8
    punpckhqdq m18, m14, m18 ; 9
    punpcklqdq m14, m2, m1 ; 4
    punpckhqdq m20, m2, m1 ; 5
    ret
; Stores m0-m7 and m23..m16 to cq rows 0-15 (128-byte stride) and reloads
; m0-m7 from the odd rows 1,3,...,15 multiplied by m12. Every odd row is
; read before the store that overwrites it.
.pass1_load_spill:
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova [cq+128* 0], m0
    pmulld m0, m12, [cq+128* 1]
    mova [cq+128* 1], m1
    mova [cq+128* 2], m2
    pmulld m1, m12, [cq+128* 3]
    pmulld m2, m12, [cq+128* 5]
    mova [cq+128* 3], m3
    mova [cq+128* 4], m4
    pmulld m3, m12, [cq+128* 7]
    pmulld m4, m12, [cq+128* 9]
    mova [cq+128* 5], m5
    mova [cq+128* 6], m6
    mova [cq+128* 7], m7
    pmulld m5, m12, [cq+128*11]
    pmulld m6, m12, [cq+128*13]
    pmulld m7, m12, [cq+128*15]
    mova [cq+128* 8], m23
    mova [cq+128* 9], m22
    mova [cq+128*10], m21
    mova [cq+128*11], m20
    mova [cq+128*12], m19
    mova [cq+128*13], m18
    mova [cq+128*14], m17
    mova [cq+128*15], m16
    ret

; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x16_10bpc(dst, stride, c, eob)
;
; Dispatch on eob:
;   eob == 0   -> .dconly
;   eob <  36  -> .fast   (8x8 packed)
;   eob >= 151 -> .full
;   otherwise  -> the 16x16 path that follows the compares
; The 16x16 and .full paths set r6d (12*8 or 28*8) as the start index for
; .zero_loop and join at .idct64_end. Every non-dconly path ends in .write,
; reached from .pass2 by fall-through or from .pass2_fast by jmp.
; Constants held for the whole first pass: m12 = pd_2896, m13 = pd_2048,
; m14/m15 = clip_18b_min/max.
; ---------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp eobd, 36
    jl .fast ; 8x8
    cmp eobd, 151
    jge .full
    ; 16x16
    ; r4 walks the idct64_mul_16bpc table, r6 walks the stack scratch area;
    ; both are advanced inside .main_part1b.
    lea r4, [idct64_mul_16bpc]
    lea r6, [rsp+4*64]
    mova m0, [cq+64* 1]
    mova m3, [cq+64*15]
    call .main_part1_fast
    mova m0, [cq+64* 7]
    mova m3, [cq+64* 9]
    call .main_part1_fast
    mova m0, [cq+64* 5]
    mova m3, [cq+64*11]
    call .main_part1_fast
    mova m0, [cq+64* 3]
    mova m3, [cq+64*13]
    call .main_part1_fast
    call .main_part2
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 8]
    mova m16, [cq+64* 4]
    mova m17, [cq+64*12]
    call m(idct_8x16_internal_10bpc).main_fast2
    call m(idct_16x16_internal_10bpc).main_fast2
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    call .pass1_load_spill
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2
    mov r6d, 12*8
    jmp .idct64_end
; Same sequence as the 16x16 path, but all four inputs of each .main_part1
; group are loaded (rows 1..31 odd), plus the extra even rows 16..30.
.full:
    lea r4, [idct64_mul_16bpc]
    lea r6, [rsp+4*64]
    mova m0, [cq+64* 1]
    mova m1, [cq+64*31]
    mova m2, [cq+64*17]
    mova m3, [cq+64*15]
    call .main_part1
    mova m0, [cq+64* 7]
    mova m1, [cq+64*25]
    mova m2, [cq+64*23]
    mova m3, [cq+64* 9]
    call .main_part1
    mova m0, [cq+64* 5]
    mova m1, [cq+64*27]
    mova m2, [cq+64*21]
    mova m3, [cq+64*11]
    call .main_part1
    mova m0, [cq+64* 3]
    mova m1, [cq+64*29]
    mova m2, [cq+64*19]
    mova m3, [cq+64*13]
    call .main_part1
    call .main_part2
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 8]
    mova m2, [cq+64*16]
    mova m3, [cq+64*24]
    mova m16, [cq+64* 4]
    mova m17, [cq+64*12]
    mova m18, [cq+64*20]
    mova m19, [cq+64*28]
    call m(idct_8x16_internal_10bpc).main_fast
    call m(idct_16x16_internal_10bpc).main_fast
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    call .pass1_load_spill
    mova m4, [cq+64*18]
    mova m5, [cq+64*22]
    mova m6, [cq+64*26]
    mova m7, [cq+64*30]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast
    mov r6d, 28*8
    jmp .idct64_end
; DC-only path, taken only when eobd == 0 (jz above), so the store of eobd
; clears the dword at [cq]. r6d = dc*181; r3d |= 16 becomes the row counter
; decremented in .dconly_loop.
; NOTE(review): r3 is presumably the register behind the 4th named argument
; (eob), which would make r3d exactly 16 here - confirm against x86inc.
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd
    or r3d, 16
; Entry point used by other functions' .dconly paths (r6d and r3d preset).
.dconly1:
    add r6d, 640
    sar r6d, 10
; Entry point for callers that have already applied their own first-stage
; rounding to r6d.
.dconly2:
    vpbroadcastd m3, [o(dconly_10bpc)]
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    vpbroadcastw m2, r6d
    paddsw m2, m3
; Per row: add the broadcast word in m2 (with signed saturation) to two
; 64-byte vectors at dstq, subtract m3 with unsigned saturation, store back,
; then step dstq by strideq. Runs until r3d reaches zero.
.dconly_loop:
    paddsw m0, m2, [dstq+64*0]
    paddsw m1, m2, [dstq+64*1]
    psubusw m0, m3
    psubusw m1, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, strideq
    dec r3d
    jg .dconly_loop
    ret
; Spills m0-m7 to the even cq rows 0..14 and m23..m16 to the odd rows 1..15
; (64-byte stride), reloading m0-m3 from rows 2, 6, 10, 14. Each of those
; rows is read before the store that overwrites it.
.pass1_load_spill:
    mova [cq+64* 0], m0
    mova m0, [cq+64* 2]
    mova [cq+64* 2], m1
    mova m1, [cq+64* 6]
    mova [cq+64* 4], m2
    mova [cq+64* 6], m3
    mova m2, [cq+64*10]
    mova m3, [cq+64*14]
    mova [cq+64* 8], m4
    mova [cq+64*10], m5
    mova [cq+64*12], m6
    mova [cq+64*14], m7
    mova [cq+64* 1], m23
    mova [cq+64* 3], m22
    mova [cq+64* 5], m21
    mova [cq+64* 7], m20
    mova [cq+64* 9], m19
    mova [cq+64*11], m18
    mova [cq+64*13], m17
    mova [cq+64*15], m16
    ret
ALIGN function_align
; rect2 entry: rounds m0 and m3 ((x + m13) >> 12) and falls through. The
; callers visible in this file multiply the inputs by m12 beforehand.
.main_part1_fast_rect2:
    REPX {paddd x, m13}, m0, m3
    REPX {psrad x, 12 }, m0, m3
; Two-input variant of .main_part1: only m0 and m3 are used. The register
; copies below stand in for the first add/sub stage of .main_part1 before
; joining the shared tail at .main_part1b.
.main_part1_fast:
    pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
    pmulld m0, [r4+4*1]{bcstd} ; t32a
    pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
    pmulld m3, [r4+4*7]{bcstd} ; t35a
    vpbroadcastd m10, [r4+4*8]
    vpbroadcastd m11, [r4+4*9]
    REPX {paddd x, m13}, m7, m0, m4, m3
    REPX {psrad x, 12 }, m7, m0, m4, m3
    mova m8, m0
    mova m1, m7
    mova m6, m3
    mova m2, m4
    jmp .main_part1b
; rect2 entry for the four-input variant: rounds m0-m3 and falls through.
.main_part1_rect2:
    REPX {paddd x, m13}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
; In: m0-m3 = one group of four coefficients, r4 -> 12 dword multipliers,
; r6 -> scratch position. Out: eight vectors stored at [r6-64*4 .. r6+64*3];
; r4 is advanced by 4*12 bytes and r6 by 64*8 bytes for the next group.
.main_part1: ; idct64 steps 1-5
    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    pmulld m7, m0, [r4+4*0]{bcstd} ; t63a
    pmulld m0, [r4+4*1]{bcstd} ; t32a
    pmulld m6, m1, [r4+4*2]{bcstd} ; t62a
    pmulld m1, [r4+4*3]{bcstd} ; t33a
    pmulld m5, m2, [r4+4*4]{bcstd} ; t61a
    pmulld m2, [r4+4*5]{bcstd} ; t34a
    pmulld m4, m3, [r4+4*6]{bcstd} ; t60a
    pmulld m3, [r4+4*7]{bcstd} ; t35a
    vpbroadcastd m10, [r4+4*8]
    vpbroadcastd m11, [r4+4*9]
    REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3
    REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
    psubd m8, m0, m1 ; t33
    paddd m0, m1 ; t32
    psubd m1, m7, m6 ; t62
    paddd m7, m6 ; t63
    psubd m6, m3, m2 ; t34
    paddd m3, m2 ; t35
    psubd m2, m4, m5 ; t61
    paddd m4, m5 ; t60
; Shared tail of .main_part1 and .main_part1_fast. Values are clamped to
; [m14, m15] before each rotation/butterfly and before the final stores.
.main_part1b:
    REPX {pmaxsd x, m14}, m8, m1, m6, m2
    REPX {pminsd x, m15}, m8, m1, m6, m2
    ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a
    ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a
    REPX {pmaxsd x, m14}, m0, m3, m7, m4
    REPX {pminsd x, m15}, m0, m3, m7, m4
    vpbroadcastd m10, [r4+4*10]
    vpbroadcastd m11, [r4+4*11]
    psubd m5, m0, m3 ; t35a
    paddd m0, m3 ; t32a
    psubd m3, m7, m4 ; t60a
    paddd m7, m4 ; t63a
    psubd m4, m1, m6 ; t34
    paddd m1, m6 ; t33
    psubd m6, m8, m2 ; t61
    paddd m8, m2 ; t62
    REPX {pmaxsd x, m14}, m5, m3, m4, m6
    REPX {pminsd x, m15}, m5, m3, m4, m6
    ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60
    ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a
    REPX {pmaxsd x, m14}, m0, m7, m1, m8
    REPX {pminsd x, m15}, m0, m7, m1, m8
    add r4, 4*12
    mova [r6-64*4], m0
    mova [r6+64*3], m7
    mova [r6-64*3], m1
    mova [r6+64*2], m8
    mova [r6-64*2], m6
    mova [r6+64*1], m4
    mova [r6-64*1], m3
    mova [r6+64*0], m5
    add r6, 64*8
    ret
; Operates in place on the scratch written by the four .main_part1 calls.
; r6 steps up and r4 steps down by 64 bytes per iteration; the loop stops
; once r6 >= r4.
.main_part2: ; idct64 steps 6-9
    lea r4, [r6+64*3]
    sub r6, 64*4
    vpbroadcastd m10, [pd_1567]
    vpbroadcastd m11, [pd_3784]
.main_part2_loop:
    mova m0, [r6-64*32] ; t32a
    mova m1, [r4-64*24] ; t39a
    mova m2, [r4-64*32] ; t63a
    mova m3, [r6-64*24] ; t56a
    mova m4, [r6-64*16] ; t40a
    mova m5, [r4-64* 8] ; t47a
    mova m6, [r4-64*16] ; t55a
    mova m7, [r6-64* 8] ; t48a
    psubd m8, m0, m1 ; t39
    paddd m0, m1 ; t32
    psubd m1, m2, m3 ; t56
    paddd m2, m3 ; t63
    psubd m3, m5, m4 ; t40
    paddd m5, m4 ; t47
    psubd m4, m7, m6 ; t55
    paddd m7, m6 ; t48
    REPX {pmaxsd x, m14}, m8, m1, m3, m4
    REPX {pminsd x, m15}, m8, m1, m3, m4
    ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a
    ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a
    REPX {pmaxsd x, m14}, m0, m2, m5, m7
    REPX {pminsd x, m15}, m0, m5, m2, m7
    psubd m6, m2, m7 ; t48a
    paddd m2, m7 ; t63a
    psubd m7, m0, m5 ; t47a
    paddd m0, m5 ; t32a
    psubd m5, m8, m4 ; t55
    paddd m8, m4 ; t56
    psubd m4, m1, m3 ; t40
    paddd m1, m3 ; t39
    REPX {pmaxsd x, m14}, m6, m7, m5, m4
    REPX {pminsd x, m15}, m6, m7, m5, m4
    REPX {pmulld x, m12}, m6, m7, m5, m4
    REPX {pmaxsd x, m14}, m2, m0, m8, m1
    REPX {pminsd x, m15}, m2, m0, m8, m1
    ; the rounding constant m13 is added once per sum/difference pair
    paddd m6, m13
    paddd m5, m13
    psubd m3, m6, m7 ; t47
    paddd m6, m7 ; t48
    psubd m7, m5, m4 ; t40a
    paddd m5, m4 ; t55a
    REPX {psrad x, 12}, m3, m6, m7, m5
    mova [r4-64* 8], m2
    mova [r6-64*32], m0
    mova [r6-64* 8], m8
    mova [r4-64*32], m1
    mova [r4-64*24], m3
    mova [r6-64*16], m6
    mova [r6-64*24], m7
    mova [r4-64*16], m5
    add r6, 64
    sub r4, 64
    cmp r6, r4
    jl .main_part2_loop
    ret
; Final idct64 butterflies of pass 1. Expects m11 = rounding addend, which
; is also used as the vpsravd shift count, r3 = base of the 64-byte scratch
; slots, cq and r4 = bases of the 128-byte-stride rows read via %9.
.idct64_main_end:
; IDCT64_PASS1_END args:
;   %1     row index into %9 (128-byte stride)
;   %2     register number holding the idct32 odd-half input (clobbered)
;   %3, %4 slot indices into r3 (64-byte stride)
;   %5-%8  register numbers receiving the four outputs
;   %9     base pointer for the row load
%macro IDCT64_PASS1_END 9
    mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding
    psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64]
    paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64]
    REPX {pmaxsd x, m14}, m%6, m%5
    REPX {pminsd x, m15}, m%6, m%5
    REPX {paddd x, m11}, m%6, m%5
    mova m%2, [r3+%3*64] ; t32+n [idct64]
    mova m%7, [r3+%4*64] ; t63-n [idct64]
    psubd m%8, m%5, m%7 ; out63-n
    paddd m%5, m%7 ; out0+n
    psubd m%7, m%6, m%2 ; out32+n
    paddd m%6, m%2 ; out31-n
    REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6
%endmacro
; Expands four IDCT64_PASS1_END butterflies for index n = %1, packs the
; dword results to words, keeps two packed results in registers
; (m%%r1..m%%r4) and stores the other two back to the r3 scratch slots.
%macro IDCT64_PASS1_ENDx4 1
%assign %%m1 %1 ; t32+n
%assign %%m2 (7-%1) ; t39-n
%assign %%m3 (8+%1) ; t40+n
%assign %%m4 (15-%1) ; t47-n
%assign %%m5 (16+%1) ; t48+n
%assign %%m6 (23-%1) ; t55-n
%assign %%m7 (24+%1) ; t56+n
%assign %%m8 (31-%1) ; t63-n
%assign %%r1 %1 ; t16+n
%assign %%r2 (7-%1) ; t23-n
%assign %%r3 (16+%1) ; t24-n
%assign %%r4 (23-%1) ; t31-n
%assign %%c1 (%1) ; t0/8+n
%assign %%c2 (7-%1) ; t7/15-n
    IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63
    IDCT64_PASS1_END %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48
    packssdw m %+ %%r1, m24, m29
    packssdw m %+ %%r4, m28, m25
    packssdw m26, m31
    packssdw m30, m27
    mova [r3+%%m5*mmsize], m26
    mova [r3+%%m8*mmsize], m30
    IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56
    IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55
    packssdw m %+ %%r2, m24, m29
    packssdw m %+ %%r3, m28, m25
    packssdw m26, m31
    packssdw m30, m27
    mova [r3+%%m6*mmsize], m26
    mova [r3+%%m7*mmsize], m30
%endmacro
    IDCT64_PASS1_ENDx4 0
    IDCT64_PASS1_ENDx4 1
    IDCT64_PASS1_ENDx4 2
    IDCT64_PASS1_ENDx4 3
    ret
; Common end of the 16x16 and .full paths. Finishes pass 1, zeroes the
; coefficient buffer from index r6d downwards, then runs .pass2 once on the
; registers left by .idct64_main_end and once on the 16 vectors reloaded
; from [rsp+16*mmsize .. rsp+31*mmsize], writing at dst and dst+64 bytes.
.idct64_end:
    vpbroadcastd m11, [o(pd_2)]
    lea r4, [cq+64]
    mov r3, rsp
    lea r5, [o_base_8bpc]
    call .idct64_main_end
    pxor m12, m12
.zero_loop:
    REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3
    sub r6d, 8*4
    jge .zero_loop
    lea r3, [strideq*3]
    mov r4, dstq ; saved so the second call can target dst+64
    call .pass2
    mova m0, [rsp+16*mmsize]
    mova m1, [rsp+17*mmsize]
    mova m2, [rsp+18*mmsize]
    mova m3, [rsp+19*mmsize]
    mova m4, [rsp+20*mmsize]
    mova m5, [rsp+21*mmsize]
    mova m6, [rsp+22*mmsize]
    mova m7, [rsp+23*mmsize]
    mova m16, [rsp+24*mmsize]
    mova m17, [rsp+25*mmsize]
    mova m18, [rsp+26*mmsize]
    mova m19, [rsp+27*mmsize]
    mova m20, [rsp+28*mmsize]
    mova m21, [rsp+29*mmsize]
    mova m22, [rsp+30*mmsize]
    mova m23, [rsp+31*mmsize]
    lea dstq, [r4+64]
    call .pass2
    RET
; Second pass for one 32-column half: transpose, regroup into row order
; (numbers in the trailing comments), run the 8bpc idct routines, then fall
; through to .write.
.pass2:
    psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
    psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
    punpckhqdq m19, m5, m16 ; 11
    punpcklqdq m5, m16 ; 10
    punpckhqdq m16, m2, m1 ; 5
    punpcklqdq m2, m1 ; 4
    punpcklqdq m1, m15, m4 ; 2
    punpckhqdq m15, m4 ; 3
    punpcklqdq m4, m14, m18 ; 8
    punpckhqdq m18, m14, m18 ; 9
    punpckhqdq m14, m0, m20 ; 1
    punpcklqdq m0, m20 ; 0
    punpckhqdq m20, m6, m17 ; 13
    punpcklqdq m6, m17 ; 12
    punpckhqdq m17, m3, m21 ; 7
    punpcklqdq m3, m21 ; 6
    punpckhqdq m21, m7, m8 ; 15
    punpcklqdq m7, m8 ; 14
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
; Sets m11 = pw_2048, m12 = 0, m13 = pixel_10bpc_max for the 32x8 write
; helpers. m14-m21 are scaled by m11 here (pmulhrsw) in two groups of four
; and handed to write_32x4; the last group is a tail jump, so the helper's
; ret returns to this routine's caller.
.write:
    vpbroadcastd m11, [pw_2048]
    pxor m12, m12
    vpbroadcastd m13, [pixel_10bpc_max]
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8
    pmulhrsw m0, m11, m14
    pmulhrsw m1, m11, m15
    pmulhrsw m2, m11, m16
    pmulhrsw m3, m11, m17
    call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
    pmulhrsw m0, m11, m18
    pmulhrsw m1, m11, m19
    pmulhrsw m2, m11, m20
    pmulhrsw m3, m11, m21
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4
; eob < 36: only 256-bit (ym) loads of rows 0-7 are made. Odd rows are
; packed two per register and run through .main_oddhalf_packed, whose 16
; results are parked in stack slots 0-15 for .main_end.
.fast: ; 8x8 packed
    movshdup m7, [o(permB)]
    mova ym0, [cq+64*1]
    mova ym2, [cq+64*5]
    mova ym3, [cq+64*3]
    mova ym1, [cq+64*7]
    vpermt2q m0, m7, m2 ; 1 5
    vpermt2q m1, m7, m3 ; 7 3
    call .main_oddhalf_packed
    mova [rsp+ 0*mmsize], m0
    mova [rsp+ 1*mmsize], m1
    mova [rsp+ 2*mmsize], m2
    mova [rsp+ 3*mmsize], m3
    mova [rsp+ 4*mmsize], m4
    mova [rsp+ 5*mmsize], m5
    mova [rsp+ 6*mmsize], m6
    mova [rsp+ 7*mmsize], m7
    mova [rsp+ 8*mmsize], m16
    mova [rsp+ 9*mmsize], m17
    mova [rsp+10*mmsize], m18
    mova [rsp+11*mmsize], m19
    mova [rsp+12*mmsize], m20
    mova [rsp+13*mmsize], m21
    mova [rsp+14*mmsize], m22
    mova [rsp+15*mmsize], m23
    movshdup m7, [o(permB)]
    mova ym0, [cq+64*0]
    mova ym4, [cq+64*4]
    mova ym16, [cq+64*2]
    mova ym5, [cq+64*6]
    vpermt2q m16, m7, m5 ; 2 6
    vpermq m0, m7, m0 ; 0 0
    vpermq m4, m7, m4 ; 4 4
    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
    ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data
    ; zero input coefs
    pxor m12, m12
    REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
    vpbroadcastd m11, [o(pd_2)]
    call .main_end
    lea r3, [strideq*3]
    mov r4, dstq
    call .pass2_fast
    ; second half: results left in m24-m31 by .main_end
    mova m0, m24
    mova m1, m25
    mova m2, m26
    mova m3, m27
    mova m4, m28
    mova m5, m29
    mova m6, m30
    mova m7, m31
    lea dstq, [r4+64]
    lea r5, [o_base] ; .pass2_fast left r5 at o_base_8bpc
    call .pass2_fast
    RET
; Second pass of the .fast path for one half; ends by jumping to .write.
.pass2_fast:
    call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32
    lea r5, [o_base_8bpc]
    punpckhqdq m14, m0, m2 ; 1
    punpcklqdq m0, m2 ; 0
    punpcklqdq m1, m3, m4 ; 2
    punpckhqdq m15, m3, m4 ; 3
    punpcklqdq m2, m5, m7 ; 4
    punpckhqdq m16, m5, m7 ; 5
    punpcklqdq m3, m6, m8 ; 6
    punpckhqdq m17, m6, m8 ; 7
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
    jmp .write
; Packed counterpart of .idct64_main_end for the 8x8 path. Reads the 16
; vectors parked at [rsp+n*64+gprsize]; m11 is the rounding addend and the
; vpsravd shift count.
; NOTE(review): the +gprsize presumably accounts for the return address
; pushed by the call that reaches this label - confirm.
.main_end:
; IDCT64_PASS1_PACKED_END args:
;   %1, %2  register numbers of the two idct32 inputs (both overwritten)
;   %3, %4  register numbers receiving out32+n / out63-n
;   %5      temporary register number
;   %6, %7  stack slot indices of t32+n / t63-n
%macro IDCT64_PASS1_PACKED_END 7
    psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64]
    paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64]
    REPX {pmaxsd x, m14}, m%5, m%1
    REPX {pminsd x, m15}, m%5, m%1
    REPX {paddd x, m11}, m%5, m%1
    mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64]
    mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64]
    psubd m%4, m%1, m%3 ; out63-n
    paddd m%1, m%3 ; out0+n
    psubd m%3, m%5, m%2 ; out32+n
    paddd m%2, m%5 ; out31-n
    REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2
%endmacro
    IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62
    IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49
    packssdw m0, m9
    packssdw m7, m22
    packssdw m24, m13
    packssdw m31, m10
    IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61
    IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50
    packssdw m1, m16
    packssdw m6, m21
    packssdw m25, m13
    packssdw m30, m10
    IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58
    IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53
    packssdw m2, m17
    packssdw m5, m20
    packssdw m26, m13
    packssdw m29, m10
    IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57
    IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54
    packssdw m3, m18
    packssdw m4, m19
    packssdw m27, m13
    packssdw m28, m10
    ret
; rect2 entry: rounds m0 and m1 ((x + m13) >> 12) and falls through. The
; visible caller multiplies them by m12 first.
.main_oddhalf_packed_rect2:
    REPX {paddd x, m13}, m0, m1
    REPX {psrad x, 12 }, m0, m1
; idct64 odd half for the 8x8 case, with two inputs packed per register.
; Results are returned in m0-m7 and m16-m23, which is the set the visible
; callers spill to stack slots 0-15.
.main_oddhalf_packed:
    ; m0=in1 in5, m1=in7 in3
    vbroadcasti32x4 m2, [o(pd_101_501)]
    vbroadcasti32x4 m3, [o(pd_m700_m301)]
    vbroadcasti32x4 m4, [o(pd_4095_4065)]
    vbroadcasti32x4 m5, [o(pd_4036_4085)]
    pmulld m2, m0
    pmulld m3, m1
    pmulld m0, m4
    pmulld m1, m5
    REPX {paddd x, m13}, m2, m3, m0, m1
    REPX {psrad x, 12 }, m2, m3, m0, m1
    ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47
    ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49
    ; end of step 1-2
    vbroadcasti32x4 m10, [o(pd_401_1931)]
    vbroadcasti32x4 m11, [o(pd_4076_3612)]
    mova m4, m0
    mova m5, m2
    ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11
    vbroadcasti32x4 m10, [o(pd_3166_3920)]
    vbroadcasti32x4 m11, [o(pd_2598_1189)]
    mova m6, m3
    mova m7, m1
    ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2
    ; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54
    ; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50
    ; and from earlier:
    ; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a
    ; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a
    ; end of step 3-4
    punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34
    punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38
    punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42
    punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46
    punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50
    punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54
    punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58
    punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62
    mova m0, m22
    mova m7, m21
    mova m3, m18
    mova m16, m17
    mova m5, m6
    mova m4, m19
    mova m2, m8
    mova m1, m23
    ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a]
    ; step5
    vpbroadcastd m10, [o(pd_799)]
    vpbroadcastd m11, [o(pd_4017)]
    ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a
    ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a
    vpbroadcastd m10, [o(pd_3406)]
    vpbroadcastd m11, [o(pd_2276)]
    ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a
    ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a
    ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a]
    ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a]
    ; step6
    ; (each group's clamps are interleaved with the next group's add/sub)
    psubd m20, m0, m21 ; t39/38a
    paddd m0, m21 ; t32/33a
    psubd m21, m1, m7 ; t36a/37
    paddd m1, m7 ; t35a/34
    REPX {pmaxsd x, m14}, m20, m0, m21, m1
    psubd m7, m16, m18 ; t40/41a
    paddd m16, m18 ; t47/46a
    REPX {pminsd x, m15}, m20, m0, m21, m1
    psubd m18, m17, m19 ; t43a/42
    paddd m17, m19 ; t44a/45
    REPX {pmaxsd x, m14}, m7, m16, m18, m17
    psubd m19, m6, m4 ; t55/54a
    paddd m6, m4 ; t48/49a
    REPX {pminsd x, m15}, m7, m16, m18, m17
    psubd m4, m5, m3 ; t52a/53
    paddd m5, m3 ; t51a/50
    REPX {pmaxsd x, m14}, m19, m6, m4, m5
    psubd m3, m23, m2 ; t56/57a
    paddd m23, m2 ; t63/62a
    REPX {pminsd x, m15}, m19, m6, m4, m5
    psubd m2, m22, m8 ; t59a/58
    paddd m22, m8 ; t60a/61
    REPX {pmaxsd x, m14}, m3, m23, m2, m22
    REPX {pminsd x, m15}, m3, m23, m2, m22
    ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a]
    ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a]
    ; step7
    vpbroadcastd m10, [o(pd_1567)]
    vpbroadcastd m11, [o(pd_3784)]
    ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a
    ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57
    ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41
    ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a
    ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a]
    ; step8
    psubd m8, m0, m16 ; t47a/46
    paddd m0, m16 ; t32a/33
    psubd m16, m1, m17 ; t44/45a
    paddd m1, m17 ; t35/34a
    REPX {pmaxsd x, m14}, m8, m0, m16, m1
    psubd m17, m2, m18 ; t43a/42
    paddd m2, m18 ; t36a/37
    REPX {pminsd x, m15}, m8, m0, m16, m1
    psubd m18, m3, m7 ; t40/41a
    paddd m3, m7 ; t39/38a
    REPX {pmaxsd x, m14}, m17, m2, m18, m3
    psubd m7, m23, m6 ; t48a/49
    paddd m23, m6 ; t63a/62
    REPX {pminsd x, m15}, m17, m2, m18, m3
    psubd m6, m22, m5 ; t51/50a
    paddd m22, m5 ; t60/61a
    REPX {pmaxsd x, m14}, m7, m23, m6, m22
    psubd m5, m21, m4 ; t52a/53
    paddd m21, m4 ; t59a/58
    REPX {pminsd x, m15}, m7, m23, m6, m22
    psubd m4, m20, m19 ; t55/54a
    paddd m20, m19 ; t56/57a
    REPX {pmaxsd x, m14}, m5, m21, m4, m20
    REPX {pminsd x, m15}, m5, m21, m4, m20
    ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a]
    ; step9
    REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8
    ; rounding is added to one operand of each pair only, so both the sum
    ; and the difference carry it exactly once
    REPX {paddd x, m13}, m4, m5, m6, m7
    paddd m19, m4, m18 ; t55a/54
    psubd m4, m18 ; t40a/41
    paddd m18, m5, m17 ; t52/53a
    psubd m5, m17 ; t43/42a
    paddd m17, m6, m16 ; t51a/50
    psubd m6, m16 ; t44a/45
    paddd m16, m7, m8 ; t48/49a
    psubd m7, m8 ; t47/46a
    REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7
    ; m4-7=t40-47[a], m16-19=t48-55[a]
    ret

; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x32_10bpc(dst, stride, c, eob)
;
; Dispatch on eob:
;   eob == 0   -> .dconly
;   eob <  136 -> .fast, which itself branches to .fast2 when eob < 36
;   eob >= 543 -> .full
;   otherwise  -> .pass1_fast followed by .lefthalf
; Coefficient rows use a 128-byte stride and are multiplied by m12 (pd_2896)
; on load, with the *_rect2 entry points doing the matching rounding.
; r7d holds the start index for the coefficient-zeroing loops. The second
; pass reuses the inv_txfm_add_dct_dct_32x32_10bpc pass2 routines, once per
; 64-byte-wide half of dst.
; ---------------------------------------------------------------------------
cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    cmp eobd, 136
    jl .fast
    add cq, 64 ; first pass1 run reads the coefficients at byte offset +64
    cmp eobd, 543
    jge .full
    call .pass1_fast ; bottomright 16x16 zero
    mov r7d, 16*12
    jmp .lefthalf
.full:
    call .pass1
    mov r7d, 16*28
; Parks the 16 transposed vectors of the first run in cq rows 0-15 (still
; offset by +64), then restores cq and the constants (m12-m15 were
; overwritten by .transpose / pass-1 data) and runs the full .pass1 on the
; coefficients at offset 0.
.lefthalf:
    mova [cq+128* 0], m0
    mova [cq+128* 1], m1
    mova [cq+128* 2], m2
    mova [cq+128* 3], m3
    mova [cq+128* 4], m14
    mova [cq+128* 5], m15
    mova [cq+128* 6], m16
    mova [cq+128* 7], m17
    mova [cq+128* 8], m22
    mova [cq+128* 9], m23
    mova [cq+128*10], m24
    mova [cq+128*11], m25
    mova [cq+128*12], m26
    mova [cq+128*13], m27
    mova [cq+128*14], m28
    mova [cq+128*15], m29
    sub cq, 64
    vpbroadcastd m12, [o(pd_2896)]
    vpbroadcastd m13, [o(pd_2048)]
    vpbroadcastd m14, [o(clip_18b_min)]
    vpbroadcastd m15, [o(clip_18b_max)]
    ; NOTE(review): rsp is lowered by 16 slots around this call, presumably
    ; so the second run's scratch does not overwrite what the first run
    ; left on the stack - confirm.
    sub rsp, 16*64
    call .pass1
    add rsp, 16*64
    lea r5, [o_base_8bpc]
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
    mov r4, dstq
    pxor m12, m12
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
    lea dstq, [r4+64]
    mova m0, [rsp+16*mmsize]
    mova m1, [rsp+17*mmsize]
    mova m2, [rsp+18*mmsize]
    mova m3, [rsp+19*mmsize]
    mova m4, [rsp+20*mmsize]
    mova m5, [rsp+21*mmsize]
    mova m6, [rsp+22*mmsize]
    mova m7, [rsp+23*mmsize]
    mova m16, [rsp+24*mmsize]
    mova m17, [rsp+25*mmsize]
    mova m18, [rsp+26*mmsize]
    mova m19, [rsp+27*mmsize]
    mova m20, [rsp+28*mmsize]
    mova m21, [rsp+29*mmsize]
    mova m22, [rsp+30*mmsize]
    mova m23, [rsp+31*mmsize]
    call .transpose
    mova [cq+128* 0+64], m0
    mova [cq+128* 1+64], m1
    mova [cq+128* 2+64], m2
    mova [cq+128* 3+64], m3
    mova [cq+128* 4+64], m14
    mova [cq+128* 5+64], m15
    mova [cq+128* 6+64], m16
    mova [cq+128* 7+64], m17
    mova [cq+128* 8+64], m22
    mova [cq+128* 9+64], m23
    mova [cq+128*10+64], m24
    mova [cq+128*11+64], m25
    mova [cq+128*12+64], m26
    mova [cq+128*13+64], m27
    mova [cq+128*14+64], m28
    mova [cq+128*15+64], m29
    mova m0, [rsp+ 0*mmsize]
    mova m1, [rsp+ 1*mmsize]
    mova m2, [rsp+ 2*mmsize]
    mova m3, [rsp+ 3*mmsize]
    mova m4, [rsp+ 4*mmsize]
    mova m5, [rsp+ 5*mmsize]
    mova m6, [rsp+ 6*mmsize]
    mova m7, [rsp+ 7*mmsize]
    mova m16, [rsp+ 8*mmsize]
    mova m17, [rsp+ 9*mmsize]
    mova m18, [rsp+10*mmsize]
    mova m19, [rsp+11*mmsize]
    mova m20, [rsp+12*mmsize]
    mova m21, [rsp+13*mmsize]
    mova m22, [rsp+14*mmsize]
    mova m23, [rsp+15*mmsize]
    call .transpose
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start
    pxor m12, m12
; Zeroes the +64 half of each coefficient row, from index r7d downwards.
.right_zero_loop:
    mova [cq+r7*8+64+128*3], m12
    mova [cq+r7*8+64+128*2], m12
    mova [cq+r7*8+64+128*1], m12
    mova [cq+r7*8+64+128*0], m12
    sub r7d, 16*4
    jge .right_zero_loop
    mov r7d, 16*28 ; the offset-0 half is then zeroed in full by .zero_loop
    jmp .end
.fast: ; topleft 16x16 nonzero
    cmp eobd, 36
    jl .fast2
    call .pass1_fast
    lea r5, [o_base_8bpc]
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
    mov r4, dstq
    pxor m12, m12
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
    lea dstq, [r4+64]
    mova m0, [rsp+16*mmsize]
    mova m1, [rsp+17*mmsize]
    mova m2, [rsp+18*mmsize]
    mova m3, [rsp+19*mmsize]
    mova m4, [rsp+20*mmsize]
    mova m5, [rsp+21*mmsize]
    mova m6, [rsp+22*mmsize]
    mova m7, [rsp+23*mmsize]
    mova m16, [rsp+24*mmsize]
    mova m17, [rsp+25*mmsize]
    mova m18, [rsp+26*mmsize]
    mova m19, [rsp+27*mmsize]
    mova m20, [rsp+28*mmsize]
    mova m21, [rsp+29*mmsize]
    mova m22, [rsp+30*mmsize]
    mova m23, [rsp+31*mmsize]
    call .transpose
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start
    mov r7d, 16*12
    pxor m12, m12
    jmp .end
; Packed 8x8 path. Same shape as the 64x16 .fast path, but each input is
; first multiplied by m12 and rounded ((x + m13) >> 12), and pass 1 ends
; with m11 = pd_1 rather than pd_2.
.fast2: ; topleft 8x8 nonzero
    movshdup m7, [o(permB)]
    mova ym0, [cq+128*1]
    mova ym2, [cq+128*5]
    mova ym3, [cq+128*3]
    mova ym1, [cq+128*7]
    vpermt2q m0, m7, m2 ; 1 5
    vpermt2q m1, m7, m3 ; 7 3
    REPX {pmulld x, m12}, m0, m1
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2
    mova [rsp+ 0*mmsize], m0
    mova [rsp+ 1*mmsize], m1
    mova [rsp+ 2*mmsize], m2
    mova [rsp+ 3*mmsize], m3
    mova [rsp+ 4*mmsize], m4
    mova [rsp+ 5*mmsize], m5
    mova [rsp+ 6*mmsize], m6
    mova [rsp+ 7*mmsize], m7
    mova [rsp+ 8*mmsize], m16
    mova [rsp+ 9*mmsize], m17
    mova [rsp+10*mmsize], m18
    mova [rsp+11*mmsize], m19
    mova [rsp+12*mmsize], m20
    mova [rsp+13*mmsize], m21
    mova [rsp+14*mmsize], m22
    mova [rsp+15*mmsize], m23
    movshdup m7, [o(permB)]
    pmulld ym0, ym12, [cq+128*0]
    pmulld ym4, ym12, [cq+128*4]
    mova ym16, [cq+128*2]
    mova ym5, [cq+128*6]
    REPX {paddd x, ym13}, ym0, ym4
    REPX {psrad x, 12 }, ym0, ym4
    vpermt2q m16, m7, m5 ; 2 6
    vpermq m0, m7, m0 ; 0 0
    vpermq m4, m7, m4 ; 4 4
    pmulld m16, m12
    paddd m16, m13
    psrad m16, 12
    call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3
    vpbroadcastd m11, [o(pd_1)]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    ; keep the m24-m31 results for the second half of dst
    mova [rsp+16*mmsize], m24
    mova [rsp+17*mmsize], m25
    mova [rsp+18*mmsize], m26
    mova [rsp+19*mmsize], m27
    mova [rsp+20*mmsize], m28
    mova [rsp+21*mmsize], m29
    mova [rsp+22*mmsize], m30
    mova [rsp+23*mmsize], m31
    vpbroadcastd m13, [o(pd_2048)]
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
    mov r7d, 16*4
    mov r4, dstq
    pxor m12, m12
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
    lea dstq, [r4+64]
    mova m0, [rsp+16*mmsize]
    mova m1, [rsp+17*mmsize]
    mova m2, [rsp+18*mmsize]
    mova m3, [rsp+19*mmsize]
    mova m4, [rsp+20*mmsize]
    mova m5, [rsp+21*mmsize]
    mova m6, [rsp+22*mmsize]
    mova m7, [rsp+23*mmsize]
    lea r5, [o_base]
    vpbroadcastd m13, [o(pd_2048)]
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start
    pxor m12, m12
; Common exit: finish the second half of dst, then zero the coefficient
; rows at offset 0 from index r7d downwards using m12.
; NOTE(review): relies on m12 still being zero after pass2_end - confirm
; against that routine.
.end:
    call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end
.zero_loop:
    mova [cq+r7*8+128*3], m12
    mova [cq+r7*8+128*2], m12
    mova [cq+r7*8+128*1], m12
    mova [cq+r7*8+128*0], m12
    sub r7d, 16*4
    jge .zero_loop
    RET
; DC-only path (eobd == 0, so the store clears [cq]). Applies its own first
; rounding stages and then shares the 64x16 .dconly2 tail, with r3d |= 32
; as the row count.
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd
    or r3d, 32
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    add r6d, 384
    sar r6d, 9
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
; Pass 1 using coefficient rows 0-15 only. Reached via call, so the scratch
; pointer includes +gprsize. Ends at .pass1_end.
.pass1_fast:
    lea r4, [idct64_mul_16bpc]
    lea r6, [rsp+4*64+gprsize]
    pmulld m0, m12, [cq+128* 1]
    pmulld m3, m12, [cq+128*15]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
    pmulld m0, m12, [cq+128* 7]
    pmulld m3, m12, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
    pmulld m0, m12, [cq+128* 5]
    pmulld m3, m12, [cq+128*11]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
    pmulld m0, m12, [cq+128* 3]
    pmulld m3, m12, [cq+128*13]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    pmulld m0, m12, [cq+128* 0]
    pmulld m1, m12, [cq+128* 8]
    pmulld m16, m12, [cq+128* 4]
    pmulld m17, m12, [cq+128*12]
    call m(idct_8x16_internal_10bpc).main_fast2_rect2
    call m(idct_16x16_internal_10bpc).main_fast2_rect2
    call .pass1_load_spill
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2
    jmp .pass1_end
; Pass 1 using coefficient rows 0-31. Falls through into .pass1_end.
.pass1:
    lea r4, [idct64_mul_16bpc]
    lea r6, [rsp+4*64+gprsize]
    pmulld m0, m12, [cq+128* 1]
    pmulld m1, m12, [cq+128*31]
    pmulld m2, m12, [cq+128*17]
    pmulld m3, m12, [cq+128*15]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
    pmulld m0, m12, [cq+128* 7]
    pmulld m1, m12, [cq+128*25]
    pmulld m2, m12, [cq+128*23]
    pmulld m3, m12, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
    pmulld m0, m12, [cq+128* 5]
    pmulld m1, m12, [cq+128*27]
    pmulld m2, m12, [cq+128*21]
    pmulld m3, m12, [cq+128*11]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
    pmulld m0, m12, [cq+128* 3]
    pmulld m1, m12, [cq+128*29]
    pmulld m2, m12, [cq+128*19]
    pmulld m3, m12, [cq+128*13]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2
    pmulld m0, m12, [cq+128* 0]
    pmulld m1, m12, [cq+128* 8]
    pmulld m2, m12, [cq+128*16]
    pmulld m3, m12, [cq+128*24]
    pmulld m16, m12, [cq+128* 4]
    pmulld m17, m12, [cq+128*12]
    pmulld m18, m12, [cq+128*20]
    pmulld m19, m12, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_fast_rect2
    call m(idct_16x16_internal_10bpc).main_fast_rect2
    call .pass1_load_spill
    pmulld m4, m12, [cq+128*18]
    pmulld m5, m12, [cq+128*22]
    pmulld m6, m12, [cq+128*26]
    pmulld m7, m12, [cq+128*30]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2
; Shared end of both pass-1 variants: m11 = pd_1, r3 -> stack scratch,
; r4 -> cq row 8, then the 64x16 .idct64_main_end butterflies.
.pass1_end:
    vpbroadcastd m11, [o(pd_1)]
    lea r3, [rsp+gprsize]
    lea r4, [cq+8*128]
    call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end
    ; transpose one half immediately, we can transpose lower half later
; Also called directly on vectors reloaded from the stack. Overwrites m12
; and m13, so callers reload their constants afterwards. Results are left
; in m0-m3, m14-m17 and m22-m29, the registers the callers store.
.transpose:
    ; transpose m0-7,16-23
    psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
    psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
    call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32
    punpckhqdq m22, m0, m20 ; 1
    punpcklqdq m0, m20 ; 0
    punpckhqdq m24, m2, m1 ; 5
    punpcklqdq m1, m2, m1 ; 4
    punpcklqdq m2, m14, m18 ; 8
    punpckhqdq m26, m14, m18 ; 9
    punpcklqdq m14, m15, m4 ; 2
    punpckhqdq m23, m15, m4 ; 3
    punpckhqdq m25, m3, m21 ; 7
    punpcklqdq m15, m3, m21 ; 6
    punpckhqdq m28, m6, m17 ; 13
    punpcklqdq m3, m6, m17 ; 12
    punpckhqdq m27, m5, m16 ; 11
    punpcklqdq m16, m5, m16 ; 10
    punpckhqdq m29, m7, m8 ; 15
    punpcklqdq m17, m7, m8 ; 14
    ret
; After idct16_sumsub: stores m0-m7 and m23..m16 to cq rows 0-15 and
; reloads m0-m3 from rows 2, 6, 10, 14 multiplied by m12. Each of those
; rows is read before the store that overwrites it.
.pass1_load_spill:
    call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub
    mova [cq+128* 0], m0
    mova [cq+128* 1], m1
    pmulld m0, m12, [cq+128* 2]
    pmulld m1, m12, [cq+128* 6]
    mova [cq+128* 2], m2
    mova [cq+128* 3], m3
    pmulld m2, m12, [cq+128*10]
    pmulld m3, m12, [cq+128*14]
    mova [cq+128* 4], m4
    mova [cq+128* 5], m5
    mova [cq+128* 6], m6
    mova [cq+128* 7], m7
    mova [cq+128* 8], m23
    mova [cq+128* 9], m22
    mova [cq+128*10], m21
    mova [cq+128*11], m20
    mova [cq+128*12], m19
    mova [cq+128*13], m18
    mova [cq+128*14], m17
    mova [cq+128*15], m16
    ret

cglobal
inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 136 jl .fast add cq, 64 cmp eobd, 543 jge .full call .pass1_fast ; bottomright 16x16 zero mov r7d, 16*12 jmp .lefthalf .full: call .pass1 mov r7d, 16*28 .lefthalf: mova [cq+128* 0], m27 mova [cq+128* 1], m14 mova [cq+128* 2], m28 mova [cq+128* 3], m15 mova [cq+128* 4], m22 mova [cq+128* 5], m23 mova [cq+128* 6], m24 mova [cq+128* 7], m25 mova [cq+128* 8], m0 mova [cq+128* 9], m26 mova [cq+128*10], m20 mova [cq+128*11], m21 mova [cq+128*12], m18 mova [cq+128*13], m16 mova [cq+128*14], m17 mova [cq+128*15], m3 sub cq, 64 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] sub rsp, 16*64 call .pass1 sub rsp, 24*64 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+56*mmsize] mova m1, [rsp+57*mmsize] mova m2, [rsp+58*mmsize] mova m3, [rsp+59*mmsize] mova m4, [rsp+60*mmsize] mova m5, [rsp+61*mmsize] mova m6, [rsp+62*mmsize] mova m7, [rsp+63*mmsize] mova m16, [rsp+64*mmsize] mova m17, [rsp+65*mmsize] mova m18, [rsp+66*mmsize] mova m19, [rsp+67*mmsize] mova m20, [rsp+68*mmsize] mova m21, [rsp+69*mmsize] mova m22, [rsp+70*mmsize] mova m23, [rsp+71*mmsize] call .transpose mova [cq+128* 0+64], m27 mova [cq+128* 1+64], m14 mova [cq+128* 2+64], m28 mova [cq+128* 3+64], m15 mova [cq+128* 4+64], m22 mova [cq+128* 5+64], m23 mova [cq+128* 6+64], m24 mova [cq+128* 7+64], m25 mova [cq+128* 8+64], m0 mova [cq+128* 9+64], m26 mova [cq+128*10+64], m20 mova [cq+128*11+64], m21 mova [cq+128*12+64], m18 mova [cq+128*13+64], m16 mova [cq+128*14+64], m17 mova [cq+128*15+64], m3 mova m0, 
[rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] mova m16, [rsp+48*mmsize] mova m17, [rsp+49*mmsize] mova m18, [rsp+50*mmsize] mova m19, [rsp+51*mmsize] mova m20, [rsp+52*mmsize] mova m21, [rsp+53*mmsize] mova m22, [rsp+54*mmsize] mova m23, [rsp+55*mmsize] add rsp, 32*64 call .transpose lea r5, [o_base] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start .right_zero_loop: REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3 sub r7d, 16*4 jge .right_zero_loop mov r7d, 16*28 jmp .end .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast sub rsp, 24*64 vpbroadcastd m10, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] mova m16, [rsp+48*mmsize] mova m17, [rsp+49*mmsize] mova m18, [rsp+50*mmsize] mova m19, [rsp+51*mmsize] mova m20, [rsp+52*mmsize] mova m21, [rsp+53*mmsize] mova m22, [rsp+54*mmsize] mova m23, [rsp+55*mmsize] add rsp, 16*64 call .transpose lea r5, [o_base] vpbroadcastd m10, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start mov r7d, 16*12 jmp .end .fast2: ; topleft 8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym1, [cq+128*7] vpermt2q m0, m7, m2 ; 1 5 vpermt2q m1, m7, m3 ; 7 3 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed mova [rsp+ 0*mmsize], m0 mova [rsp+ 1*mmsize], m1 mova [rsp+ 2*mmsize], m2 mova [rsp+ 3*mmsize], m3 mova [rsp+ 4*mmsize], m4 mova [rsp+ 5*mmsize], m5 mova [rsp+ 6*mmsize], m6 mova [rsp+ 7*mmsize], m7 mova [rsp+ 8*mmsize], m16 mova [rsp+ 9*mmsize], m17 mova [rsp+10*mmsize], m18 mova 
[rsp+11*mmsize], m19 mova [rsp+12*mmsize], m20 mova [rsp+13*mmsize], m21 mova [rsp+14*mmsize], m22 mova [rsp+15*mmsize], m23 movshdup m7, [o(permB)] mova ym0, [cq+128*0] mova ym4, [cq+128*4] mova ym16, [cq+128*2] mova ym5, [cq+128*6] vpermt2q m16, m7, m5 ; 2 6 vpermq m0, m7, m0 ; 0 0 vpermq m4, m7, m4 ; 4 4 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 vpbroadcastd m11, [o(pd_2)] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end sub rsp, 16*64 mova [rsp+40*mmsize], m24 mova [rsp+41*mmsize], m25 mova [rsp+42*mmsize], m26 mova [rsp+43*mmsize], m27 mova [rsp+44*mmsize], m28 mova [rsp+45*mmsize], m29 mova [rsp+46*mmsize], m30 mova [rsp+47*mmsize], m31 call .pass2_fast2_start mov r7d, 16*4 mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] add rsp, 8*64 lea r5, [o_base] call .pass2_fast2_start .end: pxor m31, m31 .zero_loop: REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3 sub r7d, 16*4 jge .zero_loop call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end add rsp, 8*64 ; FIXME adjust stack_size_padded instead? 
RET .pass2_fast2_start: call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 punpcklqdq m27, m0, m2 ; 0 punpckhqdq m0, m2 ; 1 punpcklqdq m22, m3, m4 ; 2 punpckhqdq m26, m3, m4 ; 3 punpcklqdq m14, m5, m7 ; 4 punpckhqdq m20, m5, m7 ; 5 punpcklqdq m23, m6, m8 ; 6 punpckhqdq m21, m6, m8 ; 7 vpbroadcastd m10, [o(pd_2048)] jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1 .pass1_fast: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] mova m0, [cq+128* 1] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 7] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 5] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 3] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m16, [cq+128* 4] mova m17, [cq+128*12] call m(idct_8x16_internal_10bpc).main_fast2 call m(idct_16x16_internal_10bpc).main_fast2 call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 jmp .pass1_end .pass1: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] mova m0, [cq+128* 1] mova m1, [cq+128*31] mova m2, [cq+128*17] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 7] mova m1, [cq+128*25] mova m2, [cq+128*23] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 5] mova m1, [cq+128*27] mova m2, [cq+128*21] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 3] mova m1, [cq+128*29] mova m2, [cq+128*19] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m2, [cq+128*16] mova m3, 
[cq+128*24] mova m16, [cq+128* 4] mova m17, [cq+128*12] mova m18, [cq+128*20] mova m19, [cq+128*28] call m(idct_8x16_internal_10bpc).main_fast call m(idct_16x16_internal_10bpc).main_fast call .pass1_load_spill mova m4, [cq+128*18] mova m5, [cq+128*22] mova m6, [cq+128*26] mova m7, [cq+128*30] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast .pass1_end: vpbroadcastd m11, [o(pd_2)] lea r3, [rsp+gprsize] lea r4, [cq+8*128] call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end ; transpose one half immediately, we can transpose lower half later .transpose: ; transpose m0-7,16-23 psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 punpcklqdq m27, m0, m20 ; 0 punpckhqdq m0, m20 ; 1 punpcklqdq m24, m5, m16 ; 10 punpckhqdq m16, m5, m16 ; 11 punpcklqdq m23, m3, m21 ; 6 punpckhqdq m21, m3, m21 ; 7 punpcklqdq m25, m7, m8 ; 14 punpckhqdq m3, m7, m8 ; 15 punpcklqdq m22, m15, m4 ; 2 punpckhqdq m26, m15, m4 ; 3 punpcklqdq m15, m6, m17 ; 12 punpckhqdq m17, m6, m17 ; 13 punpcklqdq m28, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpcklqdq m14, m2, m1 ; 4 punpckhqdq m20, m2, m1 ; 5 ret .pass1_load_spill: call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub mova [cq+128* 0], m0 mova [cq+128* 1], m1 mova m0, [cq+128* 2] mova m1, [cq+128* 6] mova [cq+128* 2], m2 mova [cq+128* 3], m3 mova m2, [cq+128*10] mova m3, [cq+128*14] mova [cq+128* 4], m4 mova [cq+128* 5], m5 mova [cq+128* 6], m6 mova [cq+128* 7], m7 mova [cq+128* 8], m23 mova [cq+128* 9], m22 mova [cq+128*10], m21 mova [cq+128*11], m20 mova [cq+128*12], m19 mova [cq+128*13], m18 mova [cq+128*14], m17 mova [cq+128*15], m16 ret %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx16_sse.asm000066400000000000000000010562551517466257200236250ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; Copyright © 2017-2021, The rav1e contributors ; 
Copyright © 2020, Nathan Egge ; Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Read-only constants used by the 16bpc inverse-transform code in this file.
; Naming used below: pd_N = four dwords of N, pd_mN = four dwords of -N,
; pw_N = eight words of N.
SECTION_RODATA

; COEF n[, any]
; Emits pd_<n> (4 dwords of n). When a second argument is supplied (%0 == 2)
; it also emits pd_m<n> (4 dwords of -n). The value of the second argument
; itself is never read, only its presence.
%macro COEF 1-2
pd_%1: times 4 dd %1
%if %0 == 2
pd_m%1: times 4 dd -%1
%endif
%endmacro

COEF  201
COEF  401
COEF  601, 1
COEF  799
COEF  995
COEF 1189, 1
COEF 1380, 1
COEF 1567
COEF 1751
COEF 1931
COEF 2106, 1
COEF 2276, 1
COEF 2440
COEF 2598, 1
COEF 2751, 1
COEF 2896
COEF 3035
COEF 3166
COEF 3290
COEF 3406
COEF 3513
COEF 3612
COEF 3703
COEF 3784
COEF 3857
COEF 3920
COEF 3973
COEF 4017
COEF 4052
COEF 4076
COEF 4091

; 16-entry byte index table: even word positions first, then odd ones.
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15

; pd_1 is only assembled for 32-bit builds.
%if ARCH_X86_32
pd_1: times 4 dd 1
%endif
pd_2: times 4 dd 2
pw_5: times 8 dw 5
; pd_1321/2482/3803/m3344 are the SINPI_1_9/2_9/4_9/-SINPI_3_9 multipliers
; loaded by iadst_4x4_internal_16bpc.main2.
pd_1321: times 4 dd 1321
pd_2482: times 4 dd 2482
pd_m3344: times 4 dd -3344
pd_2048: times 4 dd 2048
; Mixed-sign word constants: four words of one sign followed by four words
; of the opposite sign.
pw_4x2048_4xm2048: times 4 dw 2048
                   times 4 dw -2048
pw_4xm2048_4x2048: times 4 dw -2048
                   times 4 dw 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pd_3803: times 4 dd 3803
pw_4096: times 8 dw 4096
pd_5793: times 4 dd 5793
pd_6144: times 4 dd 6144
pw_8192: times 8 dw 8192
pd_10240: times 4 dd 10240
pd_11586: times 4 dd 11586
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_1697x16: times 8 dw 1697*16
pw_16384: times 8 dw 16384
; Upper pixel clamp (0x3ff) used by the fixed-10bpc output paths.
pixel_10bpc_max: times 8 dw 0x03ff
; Interleaved word pairs, consumed with pmaddwd in idct_4x4_internal_16bpc.pass2.
pw_1567_3784: times 4 dw 1567, 3784
pw_m3784_1567: times 4 dw -3784, 1567
pw_2896_2896: times 4 dw 2896, 2896
pw_m2896_2896: times 4 dw -2896, 2896

; Signed 18-bit range limits (-0x20000 .. 0x1ffff), one per dword lane.
clip_18b_min: times 4 dd -0x20000
clip_18b_max: times 4 dd 0x1ffff

; Dword multiplier table, 4 rows of 12 entries.
; NOTE(review): consumers are outside this part of the file - presumably the
; 64-point IDCT; confirm before editing.
idct64_mul_16bpc:
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406

; External 8bpc SSSE3 entry points. The second-pass code in this file calls
; them as m_suffix(<name>, _ssse3).<label> once the first pass has packed its
; results down to 16-bit words.
cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
cextern iadst_4x4_internal_8bpc_ssse3.main
cextern idct_4x8_internal_8bpc_ssse3.main
cextern iadst_4x8_internal_8bpc_ssse3.main
cextern idct_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x4_internal_8bpc_ssse3.main
cextern iadst_8x4_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
cextern iadst_8x8_internal_8bpc_ssse3.main
cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x32_internal_8bpc_ssse3.main
cextern idct_8x32_internal_8bpc_ssse3.main_fast
cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
cextern idct_16x64_internal_8bpc_ssse3.main
cextern idct_16x64_internal_8bpc_ssse3.main_fast

; eob threshold tables. INV_TXFM_4X16_FN loads the address of
; tbl_4x16_<suffix> into r5, and the 4x16 first-pass code walks the table from
; its last entry downwards, comparing each byte against eob to decide how many
; 4-column groups to process.
; NOTE(review): the 8x16 .. 32x32 tables are presumably consumed the same way
; by the larger transforms; those users are outside this part of the file.
tbl_4x16_2d: db 0, 13, 29, 45
tbl_4x16_h: db 0, 16, 32, 48
tbl_4x16_v: db 0, 4, 8, 12

tbl_8x16_2d: db 0, 14, 30, 46
tbl_8x16_v: db 0, 4, 8, 12
tbl_8x16_h: db 0, 32, 64, 96

tbl_16x16_2d: db 0, 10, 36, 78
tbl_16x16_v: db 0, 4, 8, 12
tbl_16x16_h: db 0, 64, 128, 192

; The wider tables hold word-sized entries (dw) instead of bytes.
tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203

tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343

tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406

; Byte offset tables; every entry is written as 2*<index>.
tbl_Nx32_odd_offset: db 2*16, 2*23
                     db 2*20, 2*19
                     db 2*18, 2*21
                     db 2*22, 2*17
                     db 2*30, 2*25
                     db 2*26, 2*29
                     db 2*28, 2*27
                     db 2*24, 2*31

tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
                 db 2* 8, 2*40, 2*23, 2*38
                 db 2* 1, 2*36, 2*20, 2*42
                 db 2* 9, 2*44, 2*19, 2*34
                 db 2* 2, 2*60, 2*18, 2*50
                 db 2*10, 2*52, 2*21, 2*58
                 db 2* 3, 2*56, 2*22, 2*54
                 db 2*11, 2*48, 2*17, 2*62

SECTION .text

; m_suffix(x, sfx): mangled symbol for <private_prefix>_<x><sfx>.
; m(x):             the same, using the currently defined SUFFIX.
%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
%define m(x) m_suffix(x, SUFFIX)

; This refers to the first function in itx_sse i.e. the start of the text section
; which is needed as a base pointer for constants.
%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)

; o(x): how a constant x is addressed.
;   x86-64: the symbol itself.
;   x86-32: relative to the section start ($$) through r6, which INV_TXFM_FN
;           initialises with "LEA r6, $$".
%if ARCH_X86_64
%define o(x) x
%else
%define o(x) r6-$$+x ; PIC
%endif

; One 1-D pass of the 4-point inverse WHT on dword lanes.
; Inputs are m0-m3; results land in m0/m5/m2/m4 (out0/out1/out2/out3).
; m1 and m3 still hold their inputs afterwards.
%macro IWHT4_1D 0
    ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
    paddd     m0, m1     ; in0 += in1
    psubd     m4, m2, m3 ; tmp0 = in2 - in3
    psubd     m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
    psrad     m5, 1
    psubd     m2, m5, m1 ; in2 = tmp1 - in1
    psubd     m5, m3     ; in1 = tmp1 - in3
    psubd     m0, m5     ; in0 -= in1
    paddd     m4, m2     ; in3 = tmp0 + in2
    ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
    ; m4 = out3, m5 = out1
%endmacro

; inv_txfm_add_wht_wht_4x4_16bpc(dst, stride, c, eob, bdmax)
; 4x4 inverse WHT in both directions, added to the destination block.
;   1. load four rows of dword coefficients and shift each right by 2
;   2. IWHT4_1D, 4x4 dword transpose, IWHT4_1D again
;   3. pack to words, zero the coefficient buffer
;   4. add to the four destination rows, clamp to [0, bdmax], store
; Only dst, stride and c are loaded into registers (cglobal "3, 3, 6"); eob is
; not read here and bdmax is fetched through its memory operand bdmaxm.
INIT_XMM sse2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
    mova      m0, [cq+16*0]
    mova      m1, [cq+16*1]
    mova      m2, [cq+16*2]
    mova      m3, [cq+16*3]
    REPX {psrad x, 2}, m0, m1, m2, m3
    IWHT4_1D
    ; 4x4 dword transpose of out0..out3 (m0, m5, m2, m4) back into m0-m3
    punpckldq m1, m0, m5
    punpckhdq m3, m0, m5
    punpckldq m5, m2, m4
    punpckhdq m2, m4
    punpcklqdq m0, m1, m5
    punpckhqdq m1, m5
    punpcklqdq m4, m3, m2
    punpckhqdq m3, m2
    mova      m2, m4
    IWHT4_1D
    packssdw  m0, m4 ; low: out3, high: out0
    packssdw  m2, m5 ; low: out2, high: out1
    pxor      m4, m4
    ; clear the coefficient buffer
    mova      [cq+16*0], m4
    mova      [cq+16*1], m4
    mova      [cq+16*2], m4
    mova      [cq+16*3], m4
    lea       r2, [dstq+strideq*2]
    ; destination rows are gathered in the same order as the packed outputs:
    ; m1 = row0 | row3, m3 = row2 | row1
    movq      m1, [dstq+strideq*0]
    movhps    m1, [r2 +strideq*1]
    movq      m3, [r2 +strideq*0]
    movhps    m3, [dstq+strideq*1]
    movd      m5, bdmaxm
    pshuflw   m5, m5, q0000 ; broadcast
    punpcklqdq m5, m5 ; broadcast
    paddsw    m0, m1
    paddsw    m2, m3
    ; clamp to [0, bdmax]
    pmaxsw    m0, m4
    pmaxsw    m2, m4
    pminsw    m0, m5
    pminsw    m2, m5
    movhps    [r2 +strideq*1], m0 ; write out0
    movhps    [dstq+strideq*1], m2 ; write out1
    movq      [r2 +strideq*0], m2 ; write out2
    movq      [dstq+strideq*0], m0 ; write out3
    RET

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 2 = inv_dst1, 4 = inv_dst2
; skip round/shift if rnd is not a number
;
; A coefficient argument below 32 is taken as a register number (m<coef>)
; already holding the multiplier; otherwise it names the constant pd_<coef>.
; When coef1 and coef2 are the same token, only one set of products is
; computed and tmp[2] (%4) is left unused.
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
; %1 dst/src[1]
; %2 dst/src[2]
; %3 tmp[1]
; %4 tmp[2]
; %5 tmp[3]
; %6 rnd
; %7 coef[1]
; %8 coef[2]
; %9 flags
%ifnidn %7,%8 ; optimize when coef1 == coef2
%if %8 < 32
    pmulld    m%4, m%1, m%8
    pmulld    m%3, m%2, m%8
%else
    mova      m%3, [o(pd_%8)]
    pmulld    m%4, m%1, m%3
    pmulld    m%3, m%2
%endif
%endif
%if %7 < 32
    pmulld    m%1, m%7
    pmulld    m%2, m%7
%else
    mova      m%5, [o(pd_%7)]
    pmulld    m%1, m%5
    pmulld    m%2, m%5
%endif
%if %9 & 4 ; invert dst2
    paddd     m%4, m%2
    psubd     m%2, m%6, m%4
%else
%ifnum %6
%ifnidn %7,%8
    paddd     m%4, m%6
%else
    paddd     m%1, m%6
%endif
%endif
%ifnidn %7,%8
    paddd     m%2, m%4
%else
    mova      m%3, m%2
    paddd     m%2, m%1
%endif
%endif
%if %9 & 2 ; invert dst1
    psubd     m%3, m%1
    paddd     m%1, m%3, m%6
%else
%ifnum %6
%ifnidn %7,%8
    paddd     m%1, m%6
%endif
%endif
    psubd     m%1, m%3
%endif
%ifnum %6
    psrad     m%2, 12
    psrad     m%1, 12
%endif
%endmacro

; INV_TXFM_FN type1, type2, eob_offset, size[, mmsize/stack...]
; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_16bpc.
; The entry point
;   - (x86-32 only) loads the constant base into r6,
;   - loads tx2q with the address of i<type2>_<size>_internal_16bpc.pass2,
;   - applies eob_offset: a non-zero number is added to eob, a zero is a
;     no-op, and anything that is not a number is treated as a symbol whose
;     address is placed in r5,
;   - transfers control to i<type1>_<size>_internal_16bpc (%%p1), by call+RET
;     when has_epilogue is set and by a jump otherwise.
; For dct_dct, eob == 0 skips all of the above except the tx2q/eob setup and
; continues at %%end / falls through, i.e. into whatever code the invoking
; macro places right after INV_TXFM_FN (the DC-only path).
%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
    %define %%p1 m(i%1_%4_internal_16bpc)
%if ARCH_X86_32
    LEA       r6, $$
%endif
%if has_epilogue
%ifidn %1_%2, dct_dct
    test      eobd, eobd
    jz %%end
%endif
    lea       tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
    add       eobd, %3
%endif
%else
    lea       r5, [o(%3)]
%endif
    call %%p1
    RET
%%end:
%else
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea       tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
%ifnum %3
%if %3
    add       eobd, %3
%endif
%else
    lea       r5, [o(%3)]
%endif
%ifidn %1_%2, dct_dct
    test      eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endif
%endmacro

; INV_TXFM_4X4_FN type1, type2
; 4x4 entry point. For dct_dct it also emits the DC-only path, whose labels
; .dconly and .dconly2 are jump targets for the 4x8 and 4x16 DC-only code:
;   .dconly  : r5d = (r5d + 128) >> 8
;   .dconly2 : r5d = r5d * 2896 + 34816; word 1 (the upper 16 bits) of the
;              result is broadcast to four words and replicated to both
;              halves of m0
;   .dconly_loop : adds m0 to two 4-pixel rows per iteration, clamps to
;              [0, pixel_10bpc_max]; r3d is the remaining row count
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
    imul      r5d, [cq], 181
    mov       [cq], eobd ; 0
    mov       r3d, 4
.dconly:
    add       r5d, 128
    sar       r5d, 8
.dconly2:
    imul      r5d, 2896
    mova      m2, [o(pixel_10bpc_max)]
    add       r5d, 34816
    movd      m0, r5d
    pshuflw   m0, m0, q1111
    pxor      m3, m3
    punpcklqdq m0, m0
.dconly_loop:
    movq      m1, [dstq+strideq*0]
    movhps    m1, [dstq+strideq*1]
    paddw     m1, m0
    pminsw    m1, m2
    pmaxsw    m1, m3
    movq      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    lea       dstq, [dstq+strideq*2]
    sub       r3d, 2
    jg .dconly_loop
    RET
%endif
%endmacro

; IDCT4_1D src1, src2, src3, src4, tmp1, tmp2, tmp3, rnd
; 4-point 1-D inverse DCT on dword lanes; all arguments are register numbers.
; Results are left in src1, src2, src3 and tmp1 (see the trailing comments).
%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
    ; butterfly rotation
    ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
    ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
    ; Hadamard rotation
    psubd     m%5, m%1, m%2
    paddd     m%2, m%1
    paddd     m%1, m%3, m%4
    psubd     m%3, m%4
    ; %1 (src1) = out0
    ; %2 (src2) = out1
    ; %3 (src3) = out3
    ; %5 (tmp1) = out2
%endmacro

INIT_XMM sse4

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst

; idct_4x4_internal_16bpc
; First pass: 4-point IDCT of the four coefficient rows, packed to words and
; transposed, then an indirect jump to the second-pass routine in tx2q.
; .pass1_main is also called by the 4x8 and 4x16 IDCT first passes; with the
; argument list below it expects m5 = pd_2048 and leaves out0/out1 in m0/m1,
; out2 in m4 and out3 in m2.
cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova      m0, [cq+16*0]
    mova      m1, [cq+16*1]
    mova      m2, [cq+16*2]
    mova      m3, [cq+16*3]
    mova      m5, [o(pd_2048)]
    call .pass1_main
    packssdw  m0, m1 ; out0 out1
    packssdw  m4, m2 ; out2 out3
    ; transpose
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass1_main:
    IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
    ret
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m5 = pd_2048
    ; second pass operates on packed words: interleave the inputs and use
    ; pmaddwd with the paired word constants
    punpckhwd m2, m1, m0
    punpcklwd m1, m0
    pmaddwd   m4, m2, [o(pw_m3784_1567)]
    pmaddwd   m2, [o(pw_1567_3784)]
    pmaddwd   m0, m1, [o(pw_m2896_2896)]
; (idct_4x4_internal_16bpc.pass2, continued)
    pmaddwd   m1, [o(pw_2896_2896)]
    REPX {paddd x, m5}, m4, m2, m0, m1
    packssdw  m5, m5 ; pw_2048
    REPX {psrad x, 12}, m4, m2, m0, m1
    packssdw  m2, m4 ; t3 t2
    packssdw  m1, m0 ; t0 t1
    paddsw    m0, m1, m2 ; out0 out1
    psubsw    m1, m2 ; out3 out2
    pmulhrsw  m0, m5
    pmulhrsw  m1, m5
    movq      m2, [dstq+strideq*0]
    movhps    m2, [dstq+strideq*1]
    lea       r5, [dstq+strideq*2]
    ; m1 holds out3 | out2, so rows 3 and 2 are loaded (and later stored)
    ; in that swapped order
    movq      m3, [r5 +strideq*1]
    movhps    m3, [r5 +strideq*0]
    mova      m5, [o(pixel_10bpc_max)]
    pxor      m4, m4
    ; clear the coefficient buffer
    mova      [cq+16*0], m4
    mova      [cq+16*1], m4
    mova      [cq+16*2], m4
    mova      [cq+16*3], m4
    paddw     m0, m2
    paddw     m1, m3
    ; clamp to [0, pixel_10bpc_max]
    pmaxsw    m0, m4
    pmaxsw    m1, m4
    pminsw    m0, m5
    pminsw    m1, m5
    movq      [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m0
    movhps    [r5 +strideq*0], m1
    movq      [r5 +strideq*1], m1
    RET

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; iadst_4x4_internal_16bpc
; First pass: .main on the four coefficient rows, pack to words, transpose,
; then jump to the second pass in tx2q.
; .pass2 hands the packed words to the 8bpc SSSE3 iadst_4x4 .main, scales the
; result with pmulhrsw by pw_2048, clears the coefficient buffer and adds the
; result to the four destination rows with a clamp to [0, pixel_10bpc_max].
; .main / .main2 are also called from other transforms in this file:
;   .main  loads T[0], T[2], T[3] from cq into m5, m1, m3 and points r3 at T[1]
;   .main2 expects exactly that register state; it returns the four results
;          (already rounded and shifted right by 12) in m0, m2, m1, m4 and
;          leaves pd_2048 in m5. m3, m6 and m7 are overwritten.
cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call .main
    packssdw  m0, m2 ; out0 out1
    packssdw  m1, m4 ; out2 out3
    ; transpose
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
%if ARCH_X86_32
    ; the 8bpc routine is given itx8_start in r5 on x86-32
    ; NOTE(review): presumably its base pointer for constants - see the
    ; comment at the itx8_start definition
    lea       r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
.end:
    mova      m4, [o(pw_2048)]
    movq      m2, [dstq+strideq*0]
    movhps    m2, [dstq+strideq*1]
    lea       r5, [dstq+strideq*2]
    movq      m3, [r5 +strideq*0]
    movhps    m3, [r5 +strideq*1]
    mova      m5, [o(pixel_10bpc_max)]
    pmulhrsw  m0, m4
    pmulhrsw  m1, m4
    pxor      m4, m4
    mova      [cq+16*0], m4
    mova      [cq+16*1], m4
    mova      [cq+16*2], m4
    mova      [cq+16*3], m4
    paddw     m0, m2
    paddw     m1, m3
    pmaxsw    m0, m4
    pmaxsw    m1, m4
    pminsw    m0, m5
    pminsw    m1, m5
    movq      [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m0
    movq      [r5 +strideq*0], m1
    movhps    [r5 +strideq*1], m1
    RET
ALIGN function_align
.main:
    mova      m1, [cq+16*2]
    mova      m3, [cq+16*3]
    mova      m5, [cq+16*0]
    lea       r3, [cq+16*1]
.main2:
    mova      m0, [o(pd_1321)] ; SINPI_1_9
    mova      m2, [o(pd_2482)] ; SINPI_2_9
    mova      m6, [o(pd_3803)] ; SINPI_4_9
    pmulld    m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
    pmulld    m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
    pmulld    m6, m1 ; s[3] = SINPI_4_9 * T[2]
    pmulld    m0, m5 ; s[0] = SINPI_1_9 * T[0]
    psubd     m1, m3 ; T[2] - T[3]
    pmulld    m3, m2 ; s[5] = SINPI_2_9 * T[3]
    pmulld    m2, m5 ; s[1] = SINPI_2_9 * T[0]
    paddd     m0, m6 ; s[0] += s[3]
    paddd     m0, m3 ; s[0] += s[5]
    mova      m3, [o(pd_m3344)] ; -SINPI_3_9
    psubd     m2, m4 ; s[1] -= s[4]
    psubd     m2, m7 ; s[1] -= s[6]
    psubd     m1, m5 ; -b7 = (T[2] -T[3]) - T[0]
    pmulld    m1, m3 ; s[2] = -SINPI_3_9 * -b7
    pmulld    m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
    mova      m5, [o(pd_2048)]
    REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
    paddd     m4, m0, m2 ; x[3] = s[0] + s[1]
    psubd     m2, m3 ; x[1] = s[1] + s[3]
    psubd     m0, m3 ; x[0] = s[0] + s[3]
    paddd     m4, m3 ; x[3] -= s[3]
    paddd     m2, m5 ; x[1] + 2048
    REPX {psrad x, 12}, m0, m2, m1, m4
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; iflipadst_4x4_internal_16bpc
; Same arithmetic as iadst_4x4_internal_16bpc (it calls that function's
; .main); the flip is obtained by interleaving in the opposite order in the
; first-pass transpose and by reading/writing the destination rows in
; reverse order in the second pass.
cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    call m(iadst_4x4_internal_16bpc).main
    packssdw  m0, m2 ; out0 out1
    packssdw  m1, m4 ; out2 out3
    ; transpose
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
%if ARCH_X86_32
    lea       r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
    mova      m4, [o(pw_2048)]
    ; m3 = row1 | row0, m2 = row3 | row2
    movq      m3, [dstq+strideq*1]
    movhps    m3, [dstq+strideq*0]
    lea       r5, [dstq+strideq*2]
    movq      m2, [r5 +strideq*1]
    movhps    m2, [r5 +strideq*0]
    mova      m5, [o(pixel_10bpc_max)]
    pmulhrsw  m0, m4
    pmulhrsw  m1, m4
    pxor      m4, m4
    mova      [cq+16*0], m4
    mova      [cq+16*1], m4
    mova      [cq+16*2], m4
    mova      [cq+16*3], m4
    paddw     m0, m2
    paddw     m1, m3
    pmaxsw    m0, m4
    pmaxsw    m1, m4
    pminsw    m0, m5
    pminsw    m1, m5
    movhps    [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m1
    movhps    [r5 +strideq*0], m0
    movq      [r5 +strideq*1], m0
    RET

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; iidentity_4x4_internal_16bpc
; First pass: every coefficient becomes (c * 5793 + 2048) >> 12, then the
; rows are packed to words and transposed.
; Second pass: x += pmulhrsw(x, pw_1697x8), then pmulhrsw by pw_2048 (built
; from the pd_2048 still held in m5), coefficient buffer cleared, result
; added to the destination and clamped to [0, pixel_10bpc_max].
cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova      m3, [o(pd_5793)]
    pmulld    m0, m3, [cq+16*0]
    pmulld    m1, m3, [cq+16*1]
    pmulld    m2, m3, [cq+16*2]
    pmulld    m3, [cq+16*3]
    mova      m5, [o(pd_2048)]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    packssdw  m0, m1
    packssdw  m2, m3
    ; transpose
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    ; m0 = out0 out1
    ; m1 = out2 out3
    ; m5 = pd_2048
    jmp tx2q
.pass2:
    ; m0 = in0 in1
    ; m1 = in2 in3
    ; m5 = pd_2048
    mova      m4, [o(pw_1697x8)]
    movq      m2, [dstq+strideq*0]
    movhps    m2, [dstq+strideq*1]
    lea       r5, [dstq+strideq*2]
    pmulhrsw  m3, m4, m0
    pmulhrsw  m4, m1
    paddsw    m0, m3
    paddsw    m1, m4
    movq      m3, [r5 +strideq*0]
    movhps    m3, [r5 +strideq*1]
    mova      m4, [o(pixel_10bpc_max)]
    packssdw  m5, m5 ; pw_2048
    pmulhrsw  m0, m5
    pmulhrsw  m1, m5
    pxor      m5, m5
    mova      [cq+16*0], m5
    mova      [cq+16*1], m5
    mova      [cq+16*2], m5
    mova      [cq+16*3], m5
    paddw     m0, m2
    paddw     m1, m3
    pmaxsw    m0, m5
    pmaxsw    m1, m5
    pminsw    m0, m4
    pminsw    m1, m4
    movq      [dstq+strideq*0], m0
    movhps    [dstq+strideq*1], m0
    movq      [r5 +strideq*0], m1
    movhps    [r5 +strideq*1], m1
    RET

; INV_TXFM_4X8_FN type1, type2[, eob_offset = 0]
; 4x8 entry point. The dct_dct DC-only path applies one extra
; "(x * 181 + 128) >> 8" step compared with 4x4 (the first multiply here, the
; rounding shift inline, the second multiply before the jump) and then
; reuses the 4x4 .dconly code with r3d = 8 rows.
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
    INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
    imul      r5d, [cq], 181
    mov       [cq], eobd ; 0
    mov       r3d, 8
    add       r5d, 128
    sar       r5d, 8
    imul      r5d, 181
    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro

INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity, 9
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

; idct_4x8_internal_16bpc
; The first pass handles the coefficients in 16-byte column groups selected
; by r5: r5 = 16 when eob >= 13, otherwise 0, so the second group is only
; processed when eob reaches that threshold. Both preprocessor branches
; compute the same value (x86-32 uses mov/cmp/sbb instead of setge).
; Each group is scaled by (c * 2896 + 2048) >> 12 before the 4-point IDCT.
cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%undef cmp
    mova      m5, [o(pd_2048)]
%if ARCH_X86_64
    xor       r5d, r5d
    cmp       eobd, 13
    setge     r5b
%else
    mov       r5d, 1
    cmp       eobd, 13
    sbb       r5d, 0
%endif
    shl       r5d, 4
.loop_pass1:
    mova      m3, [o(pd_2896)]
    pmulld    m0, m3, [cq+32*0+r5]
    pmulld    m1, m3, [cq+32*1+r5]
    pmulld    m2, m3, [cq+32*2+r5]
    pmulld    m3, [cq+32*3+r5]
    REPX {paddd x, m5}, m0, m1, m2, m3
    REPX {psrad x, 12}, m0, m1, m2, m3
    call m(idct_4x4_internal_16bpc).pass1_main
    packssdw  m0, m1 ; out0 out1
    packssdw  m4, m2 ; out2 out3
    test      r5d, r5d
jz .end_pass1 mova [cq+32*0+16], m0 mova [cq+32*1+16], m4 xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m2, m0, m4 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova m2, [cq+32*0+16] mova m6, [cq+32*1+16] punpckhwd m4, m2, m6 punpcklwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_4x8_internal_8bpc, _ssse3).main ; m0-3 is now out0/1,3/2,4/5,7/6 mova m4, [o(pw_2048)] shufps m1, m1, q1032 shufps m3, m3, q1032 .end: REPX {pmulhrsw x, m4}, m0, m1, m2, m3 pxor m4, m4 REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 mova m7, [o(pixel_10bpc_max)] lea r2, [strideq*3] movq m5, [dstq+strideq*0] movq m6, [dstq+strideq*2] movhps m5, [dstq+strideq*1] movhps m6, [dstq+r2] lea r4, [dstq+strideq*4] paddw m0, m5 paddw m1, m6 movq m5, [r4+strideq*0] movq m6, [r4+strideq*2] movhps m5, [r4+strideq*1] movhps m6, [r4+r2] paddw m2, m5 paddw m3, m6 REPX {pminsw x, m7}, m0, m1, m2, m3 REPX {pmaxsw x, m4}, m0, m1, m2, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r2 ], m1 movq [r4 +strideq*0], m2 movhps [r4 +strideq*1], m2 movq [r4 +strideq*2], m3 movhps [r4 +r2 ], m3 RET INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity, 9 cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .pass1_main punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova m2, [cq+32*2+16] mova m6, [cq+32*3+16] punpckhwd m4, m2, m6 punpcklwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass1_main: %undef cmp %if ARCH_X86_64 xor r5d, r5d cmp eobd, 13 setge r5b %else mov r5d, 1 cmp eobd, 13 sbb r5d, 0 %endif shl r5d, 4 lea r3, [cq+32*1+16] .loop_pass1: mova m0, [o(pd_2048)] mova m3, [o(pd_2896)] pmulld m5, m3, [cq+32*0+r5] pmulld m2, m3, [cq+32*1+r5] pmulld m1, m3, [cq+32*2+r5] pmulld m3, 
[cq+32*3+r5] REPX {paddd x, m0}, m5, m2, m1, m3 REPX {psrad x, 12}, m5, m2, m1, m3 mova [r3], m2 call m(iadst_4x4_internal_16bpc).main2 packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 test r5d, r5d jz .end_pass1 mova [cq+32*2+16], m0 mova [cq+32*3+16], m1 xor r5d, r5d jmp .loop_pass1 .end_pass1: ret .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main mova m4, [o(pw_4x2048_4xm2048)] jmp m(idct_4x8_internal_16bpc).end INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity, 9 cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_16bpc).pass1_main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 mova m6, [cq+32*2+16] mova m2, [cq+32*3+16] punpcklwd m4, m2, m6 punpckhwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main mova m4, m0 mova m5, m1 pshufd m0, m3, q1032 pshufd m1, m2, q1032 pshufd m2, m5, q1032 pshufd m3, m4, q1032 mova m4, [o(pw_4xm2048_4x2048)] jmp m(idct_4x8_internal_16bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity, 3 cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp mova m5, [o(pd_2048)] mova m4, [o(pd_2896)] mova m6, [o(pd_5793)] ; clear m7 in case we skip the bottom square pxor m7, m7 %if ARCH_X86_64 xor r5d, r5d cmp eobd, 16 setge r5b %else mov r5d, 1 cmp eobd, 16 sbb r5d, 0 %endif shl r5d, 4 .loop_pass1: pmulld m0, m4, [cq+32*0+r5] pmulld m1, m4, [cq+32*1+r5] pmulld m2, m4, [cq+32*2+r5] pmulld m3, m4, [cq+32*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {pmulld 
x, m6}, m0, m1, m2, m3 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 test r5d, r5d jz .end_pass1 mova [cq+32*0+16], m0 mova m7, m2 xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m4, m0, m2 punpcklwd m0, m2 punpckhwd m1, m0, m4 punpcklwd m0, m4 mova m2, [cq+32*0+16] punpckhwd m4, m2, m7 punpcklwd m2, m7 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: mova m4, [o(pw_4096)] jmp m(idct_4x8_internal_16bpc).end %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 add r5d, 384 sar r5d, 9 jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, identity, v INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif mova m5, [o(pd_2048)] .loop_pass1: mova m0, [cq+64*0+r5] mova m1, [cq+64*1+r5] mova m2, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(idct_4x4_internal_16bpc).pass1_main pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m1, m4, m2 REPX {psrad x, 1}, m0, m1, m4, m2 packssdw m0, m1 ; out0 out1 packssdw m4, m2 ; out2 out3 punpckhwd m2, m0, m4 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 test r5d, r5d jz .end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .end_pass1: mova m2, [cq+64*0+16] mova m3, [cq+64*1+16] mova m4, [cq+64*0+32] mova m5, [cq+64*1+32] mova m6, [cq+64*0+48] mova m7, [cq+64*1+48] ; m0-7 = packed & transposed output jmp tx2q .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_16x4_internal_8bpc, _ssse3).main ; m0-6 is out0-13 [with odd registers having inversed output] ; 
[coeffq+16*7] has out15/14 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [cq+16*7] REPX {shufps x, x, q1032}, m1, m3, m5, m7 mova [cq+16*0], m4 mova [cq+16*1], m5 mova [cq+16*2], m6 mova [cq+16*3], m7 .end: pxor m4, m4 REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 mova m7, [o(pixel_10bpc_max)] mov r5d, 2 lea r3, [strideq*3] .loop: movq m5, [dstq+strideq*0] movq m6, [dstq+strideq*2] movhps m5, [dstq+strideq*1] movhps m6, [dstq+r3] lea r4, [dstq+strideq*4] paddw m0, m5 paddw m1, m6 movq m5, [r4+strideq*0] movq m6, [r4+strideq*2] movhps m5, [r4+strideq*1] movhps m6, [r4+r3] paddw m2, m5 paddw m3, m6 REPX {pminsw x, m7}, m0, m1, m2, m3 REPX {pmaxsw x, m4}, m0, m1, m2, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 movq [r4 +strideq*0], m2 movhps [r4 +strideq*1], m2 movq [r4 +strideq*2], m3 movhps [r4 +r3 ], m3 dec r5d jz .end2 lea dstq, [dstq+strideq*8] mova m0, [cq+0*16] mova m1, [cq+1*16] mova m2, [cq+2*16] mova m3, [cq+3*16] REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 jmp .loop .end2: RET INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity, v cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r6+r5] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] mova m1, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(iadst_4x4_internal_16bpc).main2 pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m2, m1, m4 REPX {psrad x, 1}, m0, m2, m1, m4 packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: %if 
ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 mova m1, [o(pw_4x2048_4xm2048)] REPX {pmulhrsw x, m1}, m7, m2, m0 pshufd m6, m1, q1032 ; 4x-2048,4x2048 pmulhrsw m1, [cq+16*7] REPX {pmulhrsw x, m6}, m5, m4, m3 pmulhrsw m6, [cq+16*6] ; m7/5/2/4 = out4/11,5/10,6/9,7/8 ; m0/3/6/1 = out0/15,3/12,1/14,2/13 ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 movhps [cq+0*8], m4 movhps [cq+1*8], m2 movhps [cq+2*8], m5 movhps [cq+3*8], m7 movhps [cq+4*8], m3 movhps [cq+5*8], m1 movhps [cq+6*8], m6 movhps [cq+7*8], m0 punpcklqdq m0, m6 punpcklqdq m1, m3 punpcklqdq m3, m2, m4 punpcklqdq m2, m7, m5 jmp m(idct_4x16_internal_16bpc).end INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity, v cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] mova m1, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(iadst_4x4_internal_16bpc).main2 pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m2, m1, m4 REPX {psrad x, 1}, m0, m2, m1, m4 packssdw m0, m2 ; out3 out2 packssdw m1, m4 ; out1 out0 punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 mova m1, 
[o(pw_4x2048_4xm2048)] REPX {pmulhrsw x, m1}, m7, m2, m0 pshufd m6, m1, q1032 ; 4x-2048,4x2048 pmulhrsw m1, [cq+16*7] REPX {pmulhrsw x, m6}, m5, m4, m3 pmulhrsw m6, [cq+16*6] ; m7/5/2/4 = out11/4,10/5,9/6,8/7 ; m0/3/6/1 = out15/0,12/3,14/1,13/2 ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 movq [cq+0*8], m4 movq [cq+1*8], m2 movq [cq+2*8], m5 movq [cq+3*8], m7 movq [cq+4*8], m3 movq [cq+5*8], m1 movq [cq+6*8], m6 movq [cq+7*8], m0 punpckhqdq m0, m6 punpckhqdq m1, m3 punpckhqdq m3, m2, m4 punpckhqdq m2, m7, m5 jmp m(idct_4x16_internal_16bpc).end INV_TXFM_4X16_FN identity, dct, h INV_TXFM_4X16_FN identity, adst, h INV_TXFM_4X16_FN identity, flipadst, h INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif mova m5, [o(pd_6144)] mova m4, [o(pd_5793)] .loop_pass1: pmulld m0, m4, [cq+64*0+r5] pmulld m1, m4, [cq+64*1+r5] pmulld m2, m4, [cq+64*2+r5] pmulld m3, m4, [cq+64*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 13}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 punpckhwd m3, m0, m2 punpcklwd m0, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: mova [cq+16*4], m0 mova [cq+16*5], m1 mova [cq+16*6], m2 mova [cq+16*7], m7 mova m0, [o(pw_1697x16)] mova m7, [o(pw_2048)] pmulhrsw m1, m0, m4 pmulhrsw m2, m0, m5 REPX {paddsw x, x}, m4, m5 paddsw m4, m1 paddsw m5, m2 REPX {pmulhrsw x, m7}, m4, m5 mova [cq+16*0], m4 mova [cq+16*1], m5 mova m4, [cq+16*7] pmulhrsw m1, m0, m6 pmulhrsw m2, m0, m4 REPX {paddsw x, x}, m6, m4 paddsw m6, m1 paddsw m4, m2 REPX {pmulhrsw x, m7}, m6, m4 mova [cq+16*2], m6 mova [cq+16*3], m4 mova m4, [cq+16*4] mova m1, [cq+16*5] mova m2, [cq+16*6] pmulhrsw m5, 
m0, m2
    pmulhrsw m6, m0, m3
    REPX {paddsw x, x}, m2, m3
    paddsw m2, m5
    paddsw m3, m6
    pmulhrsw m6, m0, m1
    pmulhrsw m0, m4
    REPX {paddsw x, x}, m1, m4
    paddsw m1, m6
    paddsw m0, m4
    REPX {pmulhrsw x, m7}, m2, m3, m1, m0
    jmp m(idct_4x16_internal_16bpc).end

; Emits the 8x4 entry point for the (type1, type2) transform pair through
; INV_TXFM_FN. For dct_dct it additionally emits a DC-only routine (how it
; is reached is decided by INV_TXFM_FN, which is not visible here):
;   dc = (([cq] * 181 + 128) >> 8)
;   dc = ((dc * 181 + 128) >> 8)
;   dc = (dc * 2896 + 34816) >> 16     (bits 16-31 are broadcast by pshuflw)
; The value is added to four 16-byte rows of dst, clamped to
; [0, pixel_10bpc_max].
%macro INV_TXFM_8X4_FN 2 ; type1, type2
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, 0, 8x4, 15
%else
    INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    add r5d, 128
    sar r5d, 8
    imul r5d, 2896
    add r5d, 34816
    movd m0, r5d
    ; replicate word 1 (the upper half of the dword) into all eight lanes
    pshuflw m0, m0, q1111
    punpcklqdq m0, m0
    mova m6, [o(pixel_10bpc_max)]
    pxor m5, m5
    lea r2, [strideq*3]
    mova m1, [dstq+strideq*0]
    mova m2, [dstq+strideq*1]
    mova m3, [dstq+strideq*2]
    mova m4, [dstq+r2]
    REPX {paddw x, m0}, m1, m2, m3, m4
    REPX {pmaxsw x, m5}, m1, m2, m3, m4
    REPX {pminsw x, m6}, m1, m2, m3, m4
    mova [dstq+strideq*0], m1
    mova [dstq+strideq*1], m2
    mova [dstq+strideq*2], m3
    mova [dstq+r2 ], m4
    RET
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, identity
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst

; 8x4 DCT, 16 bpc. .pass1_entry is the shared pass-1 driver for all 8x4
; transforms: callers put the address of their own .main in r5 and jump
; here. It loads eight 16-byte coefficient vectors, applies .rect2_mul,
; calls the routine in r5, transposes and jumps to tx2q (second pass).
cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
.pass1_entry:
%if ARCH_X86_32
    ; r3 = scratch area on the stack used by the x86-32 code paths
    lea r3, [rsp+gprsize]
%else
    ; x86-64 keeps the shared constants resident in m11-m14
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+0*16]
    mova m1, [cq+1*16]
    mova m2, [cq+2*16]
    mova m3, [cq+3*16]
    mova m4, [cq+4*16]
    mova m5, [cq+5*16]
    mova m6, [cq+6*16]
    mova m7, [cq+7*16]
    call .rect2_mul
    call r5
    call .transpose4x8packed
    ; m0-3 = packed & transposed output
    jmp tx2q
; Word-interleaving transpose: reads m0/m2/m4/m6, writes m0-3 and clobbers
; m4/m6.
.transpose4x8packed:
    ; transpose
    punpcklwd m1, m2, m6
    punpckhwd m2, m6
    punpckhwd m6, m0, m4
    punpcklwd m0, m4
    punpckhwd m3, m0, m1
    punpcklwd m0, m1
    punpckhwd m4, m6, m2
    punpcklwd m6, m2
    punpcklwd m2, m3, m4
    punpckhwd m3, m4
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    ret
; Pass-1 body for the DCT: 8-point kernel, final butterflies, then pack the
; eight dword vectors into four word vectors (m0/m2/m4/m6).
.main:
    call .main_pass1
    call .round
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
; m0-7 = (m0-7 * 2896 + 2048) >> 12
.rect2_mul:
%if ARCH_X86_64
    REPX {pmulld x,
m14}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
%else
    ; x86-32 has no spare register for the constants: m7 is parked in [r3]
    ; while it temporarily holds pd_2896 and then pd_2048
    mova [r3], m7
    mova m7, [o(pd_2896)]
    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulld m7, [r3]
    mova [r3], m7
    mova m7, [o(pd_2048)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
%endif
    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
    ret
%if ARCH_X86_64
; x86-64 kernels. They rely on m11 = pd_2048, m12/m13 = clip_18b_min/max and
; m14 = pd_2896, as loaded at .pass1_entry.
;
; Reduced-input variant: reads only m0-m3, replaces the rotations by single
; multiplies, zeroes m4 and joins the full kernel at .main_pass1_fast2.
.main_pass1_fast:
    pmulld m5, m3, [o(pd_m2276)]
    pmulld m3, [o(pd_3406)]
    pmulld m7, m1, [o(pd_4017)]
    pmulld m1, [o(pd_799)]
    pmulld m6, m2, [o(pd_3784)]
    pmulld m2, [o(pd_1567)]
    pmulld m0, m14
    pxor m4, m4
    jmp .main_pass1_fast2
; Full kernel on m0-7. On return m0/m6/m5/m3 hold the dct4 outputs 0/1/2/3
; (clipped), m8/m9 hold t4/t7 and m4/m7 the two remaining odd-half terms;
; .round combines them.
.main_pass1:
    ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
    ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
    REPX {pmulld x, m14}, m0, m4
.main_pass1_fast2:
    REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
    REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
    paddd m8, m1, m5 ; t4
    psubd m1, m5 ; t5a
    paddd m9, m7, m3 ; t7
    psubd m7, m3 ; t6a
    REPX {pmaxsd x, m12}, m1, m8, m7, m9
    REPX {pminsd x, m13}, m1, m8, m7, m9
    REPX {pmulld x, m14}, m7, m1
    ; the rounding bias is added once to m0 and once to m7, so it is carried
    ; into both the sum and the difference formed below
    paddd m0, m11
    paddd m7, m11
    psubd m5, m0, m4
    paddd m0, m4
    psubd m4, m7, m1
    paddd m7, m1
    REPX {psrad x, 12 }, m5, m0, m4, m7
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    paddd m6, m5, m2 ; dct4 out1
    psubd m5, m2 ; dct4 out2
    REPX {pmaxsd x, m12}, m0, m6, m5, m3
    REPX {pminsd x, m13}, m0, m6, m5, m3
    ret
; Final butterflies: leaves out0-7 in m0-7.
.round:
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    psubd m7, m0, m9 ; out7
    paddd m0, m9 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    psubd m4, m3, m8 ; out4
    paddd m3, m8 ; out3
%else
; x86-32 kernels: same arithmetic as the x86-64 branch, but with only m0-7
; available; intermediates are spilled to the scratch area at r3.
.main_pass1_fast:
    pmulld m5, m3, [o(pd_m2276)]
    pmulld m3, [o(pd_3406)]
    pmulld m7, m1, [o(pd_4017)]
    pmulld m1, [o(pd_799)]
    pmulld m6, m2, [o(pd_3784)]
    pmulld m2, [o(pd_1567)]
    mova m4, [o(pd_2048)]
    mova [r3+0*16], m2
    REPX {paddd x, m4}, m5, m3, m7, m1
    REPX {psrad x, 12}, m5, m3, m7, m1
    paddd m2, m1, m5 ; t4
    psubd m1, m5 ; t5a
    pmulld m5, m0, [o(pd_2896)]
    mova m0, m4
    paddd m4, m7, m3 ; t7
    psubd m7, m3 ; t6a
    mova
m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3 }, m1, m2, m7, m4
    mova m3, [o(clip_18b_max)]
    REPX {pminsd x, m3 }, m1, m2, m7, m4
    ; t4 -> [r3+3*16], t7 -> [r3+1*16]; .round reads them back from there
    mova [r3+3*16], m2
    mova [r3+1*16], m4
    pxor m4, m4
    mova m2, [r3+0*16]
    mova m3, [o(pd_2896)]
    jmp .main_pass1_fast2
; Full x86-32 kernel on m0-7. The even inputs m0/m2/m4/m6 are parked in
; [r3+0..3*16] while the odd half is computed; m0 then holds pd_2048.
.main_pass1:
    mova [r3+0*16], m0
    mova [r3+1*16], m2
    mova [r3+2*16], m4
    mova [r3+3*16], m6
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
    ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
    paddd m2, m1, m5 ; t4
    psubd m1, m5 ; t5a
    paddd m4, m7, m3 ; t7
    psubd m7, m3 ; t6a
    mova m6, [o(clip_18b_min)]
    REPX {pmaxsd x, m6 }, m1, m2, m7, m4
    mova m6, [o(clip_18b_max)]
    REPX {pminsd x, m6 }, m1, m2, m7, m4
    ; swap: reload the parked m6/m2 inputs, park t4/t7 in their slots
    mova m6, [r3+3*16]
    mova [r3+3*16], m2
    mova m2, [r3+1*16]
    mova [r3+1*16], m4
    ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
    mova m3, [o(pd_2896)]
    mova m5, [r3+0*16]
    mova m4, [r3+2*16]
    REPX {pmulld x, m3 }, m5, m4
; Shared tail. Expects m0 = pd_2048, m3 = pd_2896, m5/m4 = the two
; pre-multiplied even inputs, m2/m6 = unrounded t2/t3, m7/m1 = t6a/t5a.
.main_pass1_fast2:
    REPX {paddd x, m0 }, m2, m6
    REPX {psrad x, 12 }, m2, m6
    REPX {pmulld x, m3 }, m7, m1
    paddd m7, m0
    paddd m0, m5
    psubd m5, m0, m4
    paddd m0, m4
    psubd m4, m7, m1
    paddd m7, m1
    REPX {psrad x, 12 }, m5, m0, m4, m7
    psubd m3, m0, m6 ; dct4 out3
    paddd m0, m6 ; dct4 out0
    paddd m6, m5, m2 ; dct4 out1
    psubd m5, m2 ; dct4 out2
    mova m1, [o(clip_18b_min)]
    REPX {pmaxsd x, m1 }, m0, m6, m5, m3
    mova m1, [o(clip_18b_max)]
    REPX {pminsd x, m1 }, m0, m6, m5, m3
    ret
; Final butterflies (x86-32): t7 and t4 come from [r3+1*16] / [r3+3*16];
; out6 is parked in [r3+0*16] while m6 serves as a temporary.
.round:
    paddd m1, m6, m7 ; out1
    psubd m6, m7 ; out6
    mova [r3+0*16], m6
    mova m6, [r3+1*16]
    psubd m7, m0, m6 ; out7
    paddd m0, m6 ; out0
    paddd m2, m5, m4 ; out2
    psubd m5, m4 ; out5
    mova m6, [r3+3*16]
    psubd m4, m3, m6 ; out4
    paddd m3, m6 ; out3
    mova m6, [r3+0*16]
%endif
    ret
; Pass 2: run the 8 bpc SSSE3 8x4 kernel on the packed words, then round,
; add to dst, clamp and clear the coefficient buffer.
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
; Shared epilogue (other 8x4 transforms jump here with m0-3 ready).
.end:
    lea r3, [strideq*3]
    call .round2_and_write_8x4
    ; m6 is zero after .round2_and_write_8x4: clear the 8*16 coefficient bytes
    REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    RET
; Three entry points into the same store routine:
;   .round2_* loads the constants itself (m6 = 0, m5 = pixel max,
;             m4 = pw_2048) and then falls through,
;   .round1_* expects m4/m5/m6 to be set up by the caller,
;   .write_*  skips the pmulhrsw and only adds/clamps/stores.
; r3 must hold stride*3. pmulhrsw by 2048 is (x + 8) >> 4.
.round2_and_write_8x4:
    pxor m6, m6
    mova m5, [o(pixel_10bpc_max)]
    mova m4, [o(pw_2048)]
.round1_and_write_8x4:
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
.write_8x4:
    paddw m0,
[dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r3]
    ; clamp to [m6, m5]
    REPX {pminsw x, m5}, m0, m1, m2, m3
    REPX {pmaxsw x, m6}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r3 ], m3
    ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; 8x4 ADST, 16 bpc. Pass 1 goes through the shared idct_8x4 .pass1_entry
; driver with r5 pointing at the local .main.
cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
; Pass-1 body: kernel, sign fix-up/rounding, then pack dwords to words.
.main:
    call .main_pass1
    call .round
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
; 8-point ADST kernel on m0-7. On return (per the annotations below)
; m0 = out0, m1 = -out1, m6 = out6, m7 = -out7, and m2-m5 hold the four
; products that still need the "+2048 >> 12" rounding done in .round.
; The x86-64 path expects m11 = pd_2048, m12/m13 = clip_18b_min/max and
; m14 = pd_2896.
.main_pass1:
%if ARCH_X86_64
    ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
    psubd m8, m2, m6 ; t6
    paddd m2, m6 ; t2
    psubd m6, m0, m4 ; t4
    paddd m0, m4 ; t0
    psubd m4, m5, m1 ; t7
    paddd m5, m1 ; t3
    psubd m1, m7, m3 ; t5
    paddd m7, m3 ; t1
    REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
    REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
    ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
    psubd m9, m6, m8 ; t7
    paddd m6, m8 ; out6
    ; NOTE(review): m8 is loaded here but is not read again before the ret
    ; below (the multiply uses m14) -- possibly a dead load; confirm that no
    ; caller relies on m8 = pd_2896 before removing it
    mova m8, [o(pd_2896)]
    psubd m3, m7, m5 ; t3
    paddd m7, m5 ; -out7
    psubd m5, m0, m2 ; t2
    paddd m0, m2 ; out0
    psubd m2, m1, m4 ; t6
    paddd m1, m4 ; -out1
    REPX {pmaxsd x, m12}, m5, m3, m2, m9
    REPX {pminsd x, m13}, m5, m3, m2, m9
    REPX {pmulld x, m14}, m5, m3, m2, m9
    psubd m4, m5, m3 ; (t2 - t3) * 2896
    paddd m3, m5 ; (t2 + t3) * 2896
    psubd m5, m2, m9 ; (t6 - t7) * 2896
    paddd m2, m9 ; (t6 + t7) * 2896
    ret
; Negates m1/m7 exactly (xor with -1, then subtract -1) and m3/m5
; approximately (xor only, i.e. -x - 1), then rounds m2-m5 with
; (x + 2048) >> 12.
.round:
    ; m0=out0,m1=-out1,m6=out6,m7=-out7
    pcmpeqd m8, m8
    REPX {pxor x, m8 }, m1, m7, m3, m5
    REPX {psubd x, m8 }, m1, m7
    REPX {paddd x, m11}, m2, m3, m4, m5
    REPX {psrad x, 12 }, m2, m3, m4, m5
%else
    ; x86-32 version of .main_pass1: m2-m5 are parked at [r3+0..3*16] so that
    ; they can serve as temporaries/rounding constant for ITX_MULSUB_2D
    mova [r3+0*16], m2
    mova [r3+1*16], m3
    mova [r3+2*16], m4
    mova [r3+3*16], m5
    mova m5, [o(pd_2048)]
    ITX_MULSUB_2D 7,
0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
    ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
    ; swap the first results out to the stack and bring the parked inputs in
    mova m2, [r3+0*16]
    mova m3, [r3+1*16]
    mova m4, [r3+2*16]
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova [r3+2*16], m6
    mova m1, [r3+3*16]
    mova [r3+3*16], m7
    ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
    ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
    mova m0, [r3+0*16]
    mova m6, [r3+2*16]
    psubd m7, m2, m6 ; t6
    paddd m2, m6 ; t2
    psubd m6, m0, m4 ; t4
    paddd m0, m4 ; t0
    mova [r3+0*16], m7
    mova m5, [r3+1*16]
    mova m7, [r3+3*16]
    psubd m4, m1, m5 ; t7
    paddd m5, m1 ; t3
    psubd m1, m7, m3 ; t5
    paddd m7, m3 ; t1
    ; clip all eight values; the one parked at [r3+0*16] (t6) is clipped by
    ; using it as the memory operand, so it ends up in m3
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
    mova [r3+1*16], m7
    mova m7, [o(clip_18b_max)]
    pmaxsd m3, [r3+0*16]
    REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
    pminsd m7, [r3+1*16]
    mova [r3+0*16], m0
    mova [r3+1*16], m2
    mova [r3+2*16], m5
    mova [r3+3*16], m7
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
    mova m5, [r3+2*16]
    mova m7, [r3+3*16]
    psubd m2, m6, m3 ; t7
    paddd m6, m3 ; out6
    ; out6 is parked at [r3+3*16]; .round reloads it into m6
    mova [r3+3*16], m6
    mova m0, [r3+0*16]
    mova m6, [r3+1*16]
    psubd m3, m7, m5 ; t3
    paddd m7, m5 ; -out7
    psubd m5, m0, m6 ; t2
    paddd m0, m6 ; out0
    psubd m6, m1, m4 ; t6
    paddd m1, m4 ; -out1
    mova m4, [o(clip_18b_min)]
    REPX {pmaxsd x, m4 }, m5, m3, m6, m2
    mova m4, [o(clip_18b_max)]
    REPX {pminsd x, m4 }, m5, m3, m6, m2
    mova m4, [o(pd_2896)]
    REPX {pmulld x, m4 }, m5, m3, m6, m2
    psubd m4, m5, m3 ; (t2 - t3) * 2896
    paddd m3, m5 ; (t2 + t3) * 2896
    psubd m5, m6, m2 ; (t6 - t7) * 2896
    paddd m2, m6 ; (t6 + t7) * 2896
    ret
; x86-32 .round: same operations as the x86-64 branch, with m0 saved to
; [r3+2*16] while it holds the all-ones mask and m6 used for pd_2048;
; afterwards out6 is reloaded from [r3+3*16] and m0 restored.
.round:
    mova [r3+2*16], m0
    pcmpeqd m0, m0
    mova m6, [o(pd_2048)]
    REPX {pxor x, m0 }, m1, m7, m3, m5
    REPX {psubd x, m0 }, m1, m7
    REPX {paddd x, m6 }, m2, m3, m4, m5
    REPX {psrad x, 12 }, m2, m3, m4, m5
    mova m6, [r3+3*16]
    mova m0, [r3+2*16]
%endif
    ret
; Pass 2: 8 bpc SSSE3 8x4 ADST kernel, then the shared idct_8x4 epilogue.
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
    jmp
m(idct_8x4_internal_16bpc).end

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; 8x4 flipped ADST, 16 bpc. Reuses the ADST kernel and reverses the result:
; pass 1 packs the registers in the opposite order, pass 2 writes the rows
; bottom-up.
cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call m(iadst_8x4_internal_16bpc).round
    ; pack in reversed register order, then move the results into the
    ; m0/m2/m4/m6 layout that the shared transpose expects
    packssdw m7, m6
    packssdw m5, m4
    packssdw m3, m2
    packssdw m1, m0
    mova m0, m7
    mova m2, m5
    mova m4, m3
    mova m6, m1
    ret
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
    ; point dst at the fourth row and negate the stride so that the shared
    ; epilogue stores the four rows in reverse vertical order
    lea r3, [strideq*3]
    add dstq, r3
    neg strideq
    jmp m(idct_8x4_internal_16bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; 8x4 identity transform, 16 bpc. Pass 1 doubles each coefficient (after
; the rect2 scaling done by the shared driver) and packs to words; pass 2
; computes x + pmulhrsw(x, pw_1697x8) with a saturating add.
cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r5, [o(.main)]
    jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
    REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    ret
.pass2:
    mova m7, [o(pw_1697x8)]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    jmp m(idct_8x4_internal_16bpc).end

; Emits the 8x8 entry point for the (type1, type2) pair through INV_TXFM_FN;
; the optional third argument is forwarded to INV_TXFM_FN unchanged.
; For dct_dct it also emits a DC-only routine whose local labels are reused
; by other block sizes:
;   .end  expects r5d = scaled dc and r3d = number of 4-row groups,
;         applies (r5d + 384) >> 9,
;   .end2 skips that step,
; then dc = (r5d * 2896 + 34816) >> 16 is broadcast and added to
; r3d * 4 rows of 8 pixels, clamped to [0, pixel_10bpc_max].
%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
%else
    INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 2
.end:
    add r5d, 384
    sar r5d, 9
.end2:
    imul r5d, 2896
    add r5d, 34816
    movd m0, r5d
    ; replicate word 1 (the upper half of the dword) into all eight lanes
    pshuflw m0, m0, q1111
    punpcklqdq m0, m0
    mova m6, [o(pixel_10bpc_max)]
    pxor m5, m5
    lea r2, [strideq*3]
.loop:
    mova m1, [dstq+strideq*0]
    mova m2, [dstq+strideq*1]
    mova m3, [dstq+strideq*2]
    mova m4, [dstq+r2]
    REPX {paddw x, m0}, m1, m2, m3, m4
    REPX {pmaxsw x, m5}, m1, m2, m3, m4
    REPX {pminsw x, m6}, m1, m2, m3, m4
    mova [dstq+strideq*0], m1
    mova [dstq+strideq*1], m2
    mova [dstq+strideq*2], m3
    mova [dstq+r2
], m4
    lea dstq, [dstq+strideq*4]
    dec r3d
    jg .loop
    RET
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity, 6
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

; 8x8 DCT, 16 bpc. .pass1_full is the shared pass-1 driver for the 8x8
; transforms: callers load t0 with their own pass-1 routine and jump here.
; t0 is r1 on x86-32 (the original r1 is saved on the stack and restored
; before pass 2) and r6 on x86-64.
cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    DECLARE_REG_TMP 1
    mov [rsp+4*16+1*gprsize], r1
%else
    DECLARE_REG_TMP 6
%endif
    lea t0, [o(.pass1_main)]
.pass1_full:
%if ARCH_X86_64
    ; x86-64 keeps the shared constants resident in m11-m14
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
%undef cmp
    ; r5 = 16 when eob >= 10, else 0: starting byte offset into each 32-byte
    ; coefficient row. With eob < 10 only the offset-0 half is transformed.
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 10
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 10
    sbb r5d, 0
%endif
    shl r5d, 4
%if ARCH_X86_32
    ; r3 = scratch area on the stack used by the x86-32 kernels
    lea r3, [rsp+gprsize]
%endif
.loop_pass1:
    mova m0, [cq+0*32+r5]
    mova m1, [cq+1*32+r5]
    mova m2, [cq+2*32+r5]
    mova m3, [cq+3*32+r5]
    mova m4, [cq+4*32+r5]
    mova m5, [cq+5*32+r5]
    mova m6, [cq+6*32+r5]
    mova m7, [cq+7*32+r5]
    call t0
    ; offset 0 is the final iteration: its results stay in m0-3
    test r5d, r5d
    jz .end_pass1
    ; results of the offset-16 half are stored back into cq
    mova [cq+0*32+16], m0
    mova [cq+1*32+16], m1
    mova [cq+2*32+16], m2
    mova [cq+3*32+16], m3
    sub r5d, 16
    jmp .loop_pass1
.end_pass1:
    ; m4-7 = the offset-16 half: either the values stored by the loop above
    ; or, when that iteration was skipped, the untouched contents of cq
    mova m4, [cq+0*32+16]
    mova m5, [cq+1*32+16]
    mova m6, [cq+2*32+16]
    mova m7, [cq+3*32+16]
%if ARCH_X86_32
    mov r1, [rsp+4*16+1*gprsize]
%endif
    jmp tx2q
; Pass-1 body for the DCT: 8-point kernel from idct_8x4, +1 on the four
; dct4 outputs (so the bias reaches every output of the final butterflies),
; >> 1, then pack and transpose.
.pass1_main:
    call m(idct_8x4_internal_16bpc).main_pass1
    pcmpeqd m1, m1
    REPX {psubd x, m1}, m0, m6, m5, m3
    call m(idct_8x4_internal_16bpc).round
    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
; Also used on its own as a pass-1 routine by other transforms.
.pack_and_transpose:
    packssdw m2, m3
    packssdw m6, m7
    packssdw m0, m1
    packssdw m4, m5
    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
; Pass 2: 8 bpc SSSE3 8x8 kernel, round/add/clamp/store, clear cq.
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
%endif
    call .round3_and_write_8x8
; Shared epilogue: zero the 16*16 coefficient bytes and return.
.zero:
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%undef mzero
    RET
    ; NOTE(review): pmulhrsw by 2048 is (x + 8) >> 4, i.e. a rounded shift by
    ; 4; the "by 5" below may refer to the combined pass-1 (>> 1) and pass-2
    ; shifts -- confirm
    ; round (rounded right-shift by 5) before writing
    ; data in m0-7
    ; on x86-64, pw_2048 is
in m8
    ; .round1 is for m0-7
    ; .round2 is for m0-6 & [rsp+gprsize*2]
    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
    ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
    ; [rsp+gprsize*2] is addressed from inside the callee, i.e. after the
    ; return address has been pushed.
%if ARCH_X86_32
.round1_and_write_8x8:
    mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
    mova m7, [o(pw_2048)]
%if ARCH_X86_32
.round4_and_write_8x8:
%endif
    REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    ; the eighth vector is multiplied straight from its stack slot
    pmulhrsw m7, [rsp+gprsize*2]
%if ARCH_X86_64
    jmp .write_8x8
.round2_and_write_8x8:
    mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
%endif
    ; m0-7 have to-be-written data [pre-rounded]
    ; on x86-64, m9-10 contain a zero/pixel_max
    ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
    ; r0,1,3 contain dstq/strideq/stride3q
    ; r5 is a scratch register
.write_8x8:
    ; r5 = address of row 4, so that rows 4-7 can reuse the same offsets
    lea r5, [dstq+strideq*4]
    paddw m0, [dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r3]
    paddw m4, [r5 +strideq*0]
    paddw m5, [r5 +strideq*1]
    paddw m6, [r5 +strideq*2]
    paddw m7, [r5 +r3]
%if ARCH_X86_64
    REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
%else
    ; m7 doubles as the clamp constant, so its data takes a round trip
    ; through the scratch slot for each of the two clamps
    mova [rsp+gprsize*2], m7
    pxor m7, m7
    REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmaxsw m7, [rsp+gprsize*2]
    mova [rsp+gprsize*2], m7
    mova m7, [o(pixel_10bpc_max)]
    REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pminsw m7, [rsp+gprsize*2]
%endif
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r3 ], m3
    mova [r5 +strideq*0], m4
    mova [r5 +strideq*1], m5
    mova [r5 +strideq*2], m6
    mova [r5 +r3 ], m7
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity, 6

; 8x8 ADST, 16 bpc. Pass 1 goes through the shared idct_8x8 .pass1_full
; driver with t0 pointing at the local .pass1_main.
cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    ; t0 aliases r1 on x86-32: save the original r1 (the driver restores it)
    mov [rsp+4*16+1*gprsize], r1
%endif
    lea t0, [o(.pass1_main)]
    jmp
m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call .round
    jmp m(idct_8x8_internal_16bpc).pack_and_transpose
; Sign fix-up and rounding of the ADST kernel results for the 8x8 case:
;   m0/m6 (out0/out6)     -> (x + 1) >> 1
;   m1/m7 (-out1/-out7)   -> ((~x) >> 1) + 1
;   m2/m4                 -> (x + 6144) >> 13
;   m3/m5                 -> ((~x) + 6144) >> 13
.round:
%if ARCH_X86_64
    pcmpeqd m8, m8 ; -1
    REPX {psubd x, m8 }, m0, m6
    REPX {pxor x, m8 }, m1, m7, m3, m5
    REPX {psrad x, 1 }, m0, m1, m6, m7
    REPX {psubd x, m8 }, m1, m7
    mova m8, [o(pd_6144)]
    REPX {paddd x, m8 }, m2, m3, m4, m5
    REPX {psrad x, 13 }, m2, m3, m4, m5
%else
    ; x86-32: m0 is parked at [r3+2*16] while it holds the -1 mask, and the
    ; out6 value is read from [r3+3*16]
    mova [r3+2*16], m0
    pcmpeqd m0, m0 ; -1
    mova m6, [o(pd_6144)]
    REPX {pxor x, m0 }, m1, m7, m3, m5
    REPX {psrad x, 1 }, m1, m7
    REPX {psubd x, m0 }, m1, m7
    REPX {paddd x, m6 }, m2, m3, m4, m5
    REPX {psrad x, 13 }, m2, m3, m4, m5
    mova m0, [r3+2*16]
    psrld m6, 12 ; +1
    paddd m0, m6
    paddd m6, [r3+3*16]
    REPX {psrad x, 1 }, m0, m6
%endif
    ret
; Pass 2: 8 bpc SSSE3 8x8 ADST kernel, then round with alternating signs,
; add to dst, clamp, store and clear cq via the shared idct_8x8 .zero.
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
%endif
    call .round3_and_write_8x8
    jmp m(idct_8x8_internal_16bpc).zero
    ; NOTE(review): pmulhrsw by +/-2048 is a rounded shift by 4; the "by 5"
    ; below may refer to the combined pass-1 and pass-2 shifts -- confirm
    ; round (rounded right-shift by 5) before writing; odd registers are negated
    ; data in m0-7
    ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
    ; .round1 is for m0-7
    ; .round2 is for m0-6 & [rsp+gprsize*2]
    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
%if ARCH_X86_64
.round2_and_write_8x8:
    mova m7, [rsp+gprsize*2]
.round1_and_write_8x8:
    REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
    REPX {pmulhrsw x, m11}, m1, m3, m5, m7
    jmp m(idct_8x8_internal_16bpc).write_8x8
%else
.round1_and_write_8x8:
    mova [rsp+gprsize*2], m7
.round2_and_write_8x8:
%endif
.round3_and_write_8x8:
    ; m7 carries pw_2048 for the even registers, then pw_m2048 for the odd
    ; ones; the eighth vector is multiplied straight from its stack slot
    mova m7, [o(pw_2048)]
    REPX {pmulhrsw x, m7}, m0, m2, m4, m6
    mova m7, [o(pw_m2048)]
    REPX {pmulhrsw x, m7}, m1, m3, m5
    pmulhrsw m7, [rsp+gprsize*2]
    jmp m(idct_8x8_internal_16bpc).write_8x8

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst,
identity, 6

; 8x8 flipped ADST, 16 bpc. Pass 1 reuses the ADST kernel and packs the
; registers in reversed order; pass 2 reuses the ADST pass 2 with dst
; pointing at the last row and a negated stride.
cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    ; t0 aliases r1 on x86-32: save the original r1 (the driver restores it)
    mov [rsp+4*16+1*gprsize], r1
%endif
    lea t0, [o(.pass1_main)]
    jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
    call m(iadst_8x4_internal_16bpc).main_pass1
    call m(iadst_8x8_internal_16bpc).round
    ; invert registers
    packssdw m7, m6
    packssdw m5, m4
    packssdw m3, m2
    packssdw m1, m0
    mova m0, m7
    mova m2, m5
    mova m4, m3
    mova m6, m1
    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
.pass2:
    ; dst += 7 * stride, stride = -stride
    lea dstq, [dstq+strideq*8]
    sub dstq, strideq
    neg strideq
    jmp m(iadst_8x8_internal_16bpc).pass2

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; 8x8 identity transform, 16 bpc. Pass 1 only narrows the dword
; coefficients to words (signed saturation) and hands over to the 8 bpc
; pass-1 epilogue; pass 2 rounds with pmulhrsw by pw_4096, i.e. (x + 4) >> 3.
cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    mova m0, [cq+0*32]
    mova m1, [cq+1*32]
    mova m2, [cq+2*32]
    mova m3, [cq+3*32]
    mova m4, [cq+4*32]
    mova m5, [cq+5*32]
    mova m6, [cq+6*32]
    mova m7, [cq+7*32]
    packssdw m0, [cq+0*32+16]
    packssdw m1, [cq+1*32+16]
    packssdw m2, [cq+2*32+16]
    packssdw m3, [cq+3*32+16]
    packssdw m4, [cq+4*32+16]
    packssdw m5, [cq+5*32+16]
    packssdw m6, [cq+6*32+16]
    packssdw m7, [cq+7*32+16]
    ; presumably the stack slot in which .pass1_end3 expects m6 -- that
    ; routine is not visible here, confirm
    mova [rsp+gprsize+16*1], m6
    jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    lea r3, [strideq*3]
%if ARCH_X86_64
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
    mova m8, [o(pw_4096)]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%else
    ; [rsp+gprsize] here is [rsp+gprsize*2] inside the callee, which is
    ; where .round4 reads the eighth vector; the constant goes in m7
    mova [rsp+gprsize], m7
    mova m7, [o(pw_4096)]
    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
%endif
    jmp m(idct_8x8_internal_16bpc).zero

; Emits the 8x16 entry point for the (type1, type2) pair through
; INV_TXFM_FN; the third argument selects the tbl_8x16_<suffix> table that
; is handed to INV_TXFM_FN. For dct_dct it also emits a DC-only routine:
; two 181/256 scalings (the second one is finished by the 8x8 .end code),
; r3d = 4 groups of four rows, then a jump into the 8x8 DC-only loop.
%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
%else
    INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    mov r3d, 4
%if stack_size_padded > 0
    ; adjust to caller's stack allocation
    add rsp, (12+ARCH_X86_64)*16
%endif
    jmp
m(inv_txfm_add_dct_dct_8x8_16bpc).end
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, v
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

%if ARCH_X86_64
DECLARE_REG_TMP 7
%endif

; 8x16 DCT, 16 bpc. .pass1_full is the shared pass-1 driver for the 8x16
; transforms: callers load t0 with an 8-point pass-1 routine and jump here.
; t0 is r7 on x86-64 (saved/restored with PUSH/POP on WIN64); on x86-32 r1
; and r6 are saved on the stack instead.
cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    PUSH r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
.pass1_full:
%if ARCH_X86_64
    ; x86-64 keeps the shared constants resident in m11-m14
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
%undef cmp
    ; Find the highest slice index r6 in 3..0 for which eobb >= byte [r5+r6]
    ; (signed compare). r5 is presumably the eob threshold table selected by
    ; INV_TXFM_8X16_FN -- not visible here, confirm.
    mov r6d, 4
.zero_loop:
    dec r6d
    cmp eobb, byte [r5+r6]
    jl .zero_loop
    ; r5 = slice index * 16 = byte offset of the first slice to process
    mov r5d, r6d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, [rsp+16*16+2*gprsize]
    ; setup stack pointer
    lea r3, [rsp+gprsize]
%endif
.loop_pass1:
    ; one 16-byte slice of each of the eight 64-byte coefficient rows
    mova m0, [cq+0*64+r5]
    mova m1, [cq+1*64+r5]
    mova m2, [cq+2*64+r5]
    mova m3, [cq+3*64+r5]
    mova m4, [cq+4*64+r5]
    mova m5, [cq+5*64+r5]
    mova m6, [cq+6*64+r5]
    mova m7, [cq+7*64+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call t0
    ; the four packed result vectors overwrite rows 0-3 of the same slice
    mova [cq+0*64+r5], m0
    mova [cq+1*64+r5], m1
    mova [cq+2*64+r5], m2
    mova [cq+3*64+r5], m3
    sub r5d, 16
    jge .loop_pass1
%if WIN64
    POP r7
%elif ARCH_X86_32
    mov r1, [rsp+16*16+1*gprsize]
%endif
    jmp tx2q
; Pass 2: the 16-point column transform is built from the 8 bpc SSSE3
; kernels (idct_8x8 .main on the first input group, idct_16x8 .main on the
; second), followed by two 8-row write-outs.
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
    ; some are still pre-loaded from the final loop iteration in pass=1
    mova m1, m2
    mova m2, [cq+ 1*16]
    mova m3, [cq+ 9*16]
    mova m4, [cq+ 2*16]
    mova m5, [cq+10*16]
    mova m6, [cq+ 3*16]
    mova m7, [cq+11*16]
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    mova [rsp+gprsize+3*16], m0
    mova [rsp+gprsize+4*16], m1
    mova [rsp+gprsize+5*16], m2
    mova [rsp+gprsize+6*16], m3
    mova [rsp+gprsize+7*16], m4
    mova [rsp+gprsize+8*16], m5
    mova [rsp+gprsize+9*16], m6
    ; m7 is already stored in [rsp+gprsize+0*16]
    mova m0, [cq+ 4*16]
    mova m1, [cq+12*16]
    mova m2, [cq+ 5*16]
    mova m3, [cq+13*16]
    mova m4, [cq+ 6*16]
    mova m5, [cq+14*16]
    mova m6, [cq+ 7*16]
    mova m7,
[cq+15*16]
    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
    ; out0-7 is in rsp+gprsize+3-10*mmsize
    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
%if ARCH_X86_64
    mova m8, [o(pw_2048)]
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
    ; keep the original dst in r6 for the second write-out
    mov r6, dstq
%else
    mov [rsp+16*16+gprsize*1], dstq
%endif
    lea r3, [strideq*3]
    ; rows 8-15 are written first, from the values still in registers
    lea dstq, [dstq+strideq*8]
    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    ; clear the 32*16 coefficient bytes
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
%undef mzero
    ; rows 0-7: reload out0-7 from the stack and write them at the original dst
    mova m0, [rsp+gprsize+ 3*16]
    mova m1, [rsp+gprsize+ 4*16]
    mova m2, [rsp+gprsize+ 5*16]
    mova m3, [rsp+gprsize+ 6*16]
    mova m4, [rsp+gprsize+ 7*16]
    mova m5, [rsp+gprsize+ 8*16]
    mova m6, [rsp+gprsize+ 9*16]
    mova m7, [rsp+gprsize+10*16]
%if ARCH_X86_64
    mov dstq, r6
%else
    mov dstq, [rsp+16*16+gprsize*1]
%endif
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    RET

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity, v

; 8x16 ADST, 16 bpc. Pass 1 goes through the shared idct_8x16 .pass1_full
; driver using the 8x8 ADST pass-1 routine; pass 2 uses the 8 bpc SSSE3
; 16x8 ADST kernel.
cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    PUSH r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
    jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    ; arrange the sixteen input vectors for the 8 bpc kernel: eight are left
    ; in m0-7 and eight are placed in stack slots 3-10 (the exact slot
    ; assignment expected by that kernel is not visible here)
    mova m4, [cq+ 9*16]
    mova m5, [cq+13*16]
    mova [rsp+gprsize+7*16], m0
    mova [rsp+gprsize+8*16], m1
    mova [rsp+gprsize+5*16], m4
    mova [rsp+gprsize+6*16], m5
    mova m0, m2
    mova m1, m3
    mova m2, [cq+ 1*16]
    mova m3, [cq+ 5*16]
    mova m4, [cq+ 2*16]
    mova m5, [cq+ 6*16]
    mova m6, [cq+11*16]
    mova m7, [cq+15*16]
    mova [rsp+gprsize+ 3*16], m4
    mova [rsp+gprsize+ 4*16], m5
    mova [rsp+gprsize+ 9*16], m6
    mova [rsp+gprsize+10*16], m7
    mova m4, [cq+10*16]
    mova m5, [cq+14*16]
    mova m6, [cq+ 3*16]
    mova m7, [cq+ 7*16]
    call m_suffix(iadst_16x8_internal_8bpc,
_ssse3).main
    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
%if ARCH_X86_64
    ; constants for the ADST round/write helpers: m8 = +2048, m11 = -2048,
    ; m9 = 0, m10 = pixel max; r6 keeps the original dst
    mova m11, [o(pw_m2048)]
    mova m8, [o(pw_2048)]
    mova m10, [o(pixel_10bpc_max)]
    pxor m9, m9
    mov r6, dstq
%else
    mov [rsp+16*16+gprsize*1], dstq
%endif
    lea r3, [strideq*3]
    ; rows 8-15 are written first, from the values still in registers
    lea dstq, [dstq+strideq*8]
    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    ; clear the 32*16 coefficient bytes
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
%undef mzero
    ; rows 0-7: reload from stack slots 3-10 and write at the original dst
    mova m0, [rsp+gprsize+ 3*16]
    mova m1, [rsp+gprsize+ 4*16]
    mova m2, [rsp+gprsize+ 5*16]
    mova m3, [rsp+gprsize+ 6*16]
    mova m4, [rsp+gprsize+ 7*16]
    mova m5, [rsp+gprsize+ 8*16]
    mova m6, [rsp+gprsize+ 9*16]
    mova m7, [rsp+gprsize+10*16]
%if ARCH_X86_64
    mov dstq, r6
%else
    mov dstq, [rsp+16*16+gprsize*1]
%endif
    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
    RET

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity, v

; 8x16 flipped ADST, 16 bpc. Pass 1 uses the 8x8 flipped-ADST pass-1
; routine through the shared driver; pass 2 reuses the ADST pass 2 with dst
; pointing at the last row and a negated stride.
cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    PUSH r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
    jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
    ; r3 = stride * 3 * 5 = stride * 15; dst += r3; stride = -stride
    lea r3, [strideq*3]
    lea r3, [r3*5]
    add dstq, r3
    neg strideq
    jmp m(iadst_8x16_internal_16bpc).pass2

INV_TXFM_8X16_FN identity, dct, h
INV_TXFM_8X16_FN identity, adst, h
INV_TXFM_8X16_FN identity, flipadst, h
INV_TXFM_8X16_FN identity, identity

; 8x16 identity transform, 16 bpc. Pass 1 runs the shared driver with the
; bare pack-and-transpose routine as the per-slice body (the rect2 scaling
; is applied by the driver itself). Pass 2 processes four groups of four
; rows.
cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    PUSH r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
    jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
%if ARCH_X86_64
    ; x86-64 keeps all loop constants resident: m4 = pw_2048,
    ; m5 = pixel max, m6 = 0, m7 = pw_1697x16
    mova m4, [o(pw_2048)]
    mova m5, [o(pixel_10bpc_max)]
    pxor m6, m6
    mova m7, [o(pw_1697x16)]
%endif
    ; r5d = number of 4-row groups left, r3 = stride * 3
    mov r5d, 4
    lea r3, [strideq*3]
.pass2_loop:
    call .main
    ; on x86-32 .main clobbers m4-m7, so the variant that reloads the
    ; constants is used; on x86-64 they are still live in m4-m6
%if ARCH_X86_64
    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
%else
    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
%endif
    ; m6 is zero on both paths here: clear the eight vectors of this group
    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
    dec r5d
    jle .end
    ; advance to the next group: next 16-byte column of cq, four rows down
    add cq, 16
    lea dstq, [dstq+strideq*4]
    mova m0, [cq+ 0*16]
    mova m1, [cq+ 4*16]
    mova m2, [cq+ 8*16]
    mova m3, [cq+12*16]
    jmp .pass2_loop
.end:
    RET
; Scales m0-3 in place. x86-64 expects pw_1697x16 in m7 and uses m8-11 as
; temporaries; x86-32 loads the constant itself and uses m4-7.
.main:
    ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
%if ARCH_X86_32
    mova m7, [o(pw_1697x16)]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
    pmulhrsw m7, m3
%else
    pmulhrsw m8, m7, m0
    pmulhrsw m9, m7, m1
    pmulhrsw m10, m7, m2
    pmulhrsw m11, m7, m3
%endif
    REPX {paddsw x, x}, m0, m1, m2, m3
%if ARCH_X86_64
    paddsw m0, m8
    paddsw m1, m9
    paddsw m2, m10
    paddsw m3, m11
%else
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
%endif
    ret

; Emits the 16x4 entry point for the (type1, type2) pair through
; INV_TXFM_FN. For dct_dct it also emits a DC-only routine whose local
; labels can be entered with a pre-scaled value:
;   .dconly  expects r5d = scaled dc and r3d = row count,
;            applies (r5d + 384) >> 9,
;   .dconly2 skips that step,
; then dc = (r5d * 2896 + 34816) >> 16 is broadcast and added to r3d rows
; of 16 pixels (two 16-byte vectors per row), clamped to
; [0, pixel_10bpc_max].
%macro INV_TXFM_16X4_FN 2 ; type1, type2
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
%else
    INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 4
.dconly:
    add r5d, 384
    sar r5d, 9
.dconly2:
    imul r5d, 2896
    add r5d, 34816
    movd m0, r5d
    ; replicate word 1 (the upper half of the dword) into all eight lanes
    pshuflw m0, m0, q1111
    punpcklqdq m0, m0
    mova m3, [o(pixel_10bpc_max)]
    pxor m4, m4
.loop:
    mova m1, [dstq+ 0]
    mova m2, [dstq+16]
    REPX {paddw x, m0}, m1, m2
    REPX {pminsw x, m3}, m1, m2
    REPX {pmaxsw x, m4}, m1, m2
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    add dstq, strideq
    dec r3d
    jg .loop
    RET
%endif
%endmacro

INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, identity
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst

; 16x4 DCT, 16 bpc. Pass 1 computes a 16-point DCT as an odd half
; (.main_oddhalf, on the odd-indexed inputs) plus the 8-point kernel from
; idct_8x4 (on the even-indexed inputs), combined in .round.
cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    ; setup stack pointer
    lea r3, [rsp+gprsize]
    ; odd-indexed inputs first
    mova m0, [cq+ 1*16]
    mova m1, [cq+ 3*16]
    mova m2, [cq+ 5*16]
    mova m3, [cq+ 7*16]
    mova m4, [cq+ 9*16]
    mova m5, [cq+11*16]
    mova m6, [cq+13*16]
    mova m7,
[cq+15*16]
    call .main_oddhalf
    ; even-indexed inputs through the 8-point kernel
    mova m0, [cq+ 0*16]
    mova m1, [cq+ 2*16]
    mova m2, [cq+ 4*16]
    mova m3, [cq+ 6*16]
    mova m4, [cq+ 8*16]
    mova m5, [cq+10*16]
    mova m6, [cq+12*16]
    mova m7, [cq+14*16]
    call m(idct_8x4_internal_16bpc).main_pass1
    call m(idct_8x4_internal_16bpc).round
    ; t0-7 is in m0-7
    call .round
%if ARCH_X86_64
; Entry point with sixteen dword vectors in m0-15.
.pack_transpose:
    ; transpose in two parts
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
; Entry point with eight packed word vectors in m0/2/4/6 and m8/10/12/14.
.transpose:
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    call .transpose4x8packed_hi
%else
    ; x86-32: .round left one half packed in m0/2/4/6 and the other half in
    ; [r3+8..11*16]. Transpose the first, park it in [r3+0..3*16], then
    ; transpose the second, which stays in m0-3.
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova [r3+2*16], m2
    mova [r3+3*16], m3
    mova m0, [r3+ 8*16]
    mova m2, [r3+ 9*16]
    mova m4, [r3+10*16]
    mova m6, [r3+11*16]
    call m(idct_8x4_internal_16bpc).transpose4x8packed
%endif
    jmp tx2q
%if ARCH_X86_64
; Same interleave pattern as idct_8x4 .transpose4x8packed, applied to
; m8/10/12/14 -> m8-11 (clobbers m12/m14).
.transpose4x8packed_hi:
    punpcklwd m9, m10, m14
    punpckhwd m10, m14
    punpckhwd m14, m8, m12
    punpcklwd m8, m12
    punpckhwd m11, m8, m9
    punpcklwd m8, m9
    punpckhwd m12, m14, m10
    punpcklwd m14, m10
    punpcklwd m10, m11, m12
    punpckhwd m11, m12
    punpckhwd m9, m8, m14
    punpcklwd m8, m14
    ret
%endif
; Reduced-input variant of the odd half: reads only m0-m3 and replaces each
; rotation by a pair of plain multiplies before joining
; .main_oddhalf_fast2. On x86-32 one product pair is rounded early and one
; value is parked at [r3+1*16] because of register pressure.
.main_oddhalf_fast: ; lower half zero
    pmulld m7, m0, [o(pd_4076)]
    pmulld m0, [o(pd_401)]
    pmulld m6, m1, [o(pd_m1189)]
    pmulld m1, [o(pd_3920)]
%if ARCH_X86_32
    mova m4, [o(pd_2048)]
    REPX {paddd x, m4}, m1, m6
    REPX {psrad x, 12}, m1, m6
    mova [r3+1*16], m1
%endif
    pmulld m5, m2, [o(pd_3612)]
    pmulld m2, [o(pd_1931)]
%if ARCH_X86_32
    pmulld m1, m3, [o(pd_m2598)]
%else
    pmulld m4, m3, [o(pd_m2598)]
%endif
    pmulld m3, [o(pd_3166)]
    jmp .main_oddhalf_fast2
; Odd half of the 16-point DCT on m0-7. Results are left in the scratch
; area at r3: x86-64 writes m0-7 to [r3+0..7*16], x86-32 writes them to
; [r3+11..4*16] in descending order. The x86-64 path expects m11-m14 as set
; up at function entry and additionally uses m8-10 and m15.
.main_oddhalf:
%if ARCH_X86_64
    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
.main_oddhalf_fast2:
    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
    psubd
m8, m0, m4 ; t9
    paddd m0, m4 ; t8
    psubd m4, m6, m2 ; t10
    paddd m2, m6 ; t11
    psubd m6, m1, m5 ; t13
    paddd m5, m1 ; t12
    psubd m1, m7, m3 ; t14
    paddd m7, m3 ; t15
    REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
    REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
    ; rotation coefficients are passed to ITX_MULSUB_2D in registers here
    ; (m10 = pd_1567, m15 = pd_3784)
    mova m15, [o(pd_3784)]
    mova m10, [o(pd_1567)]
    ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
    ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
    psubd m3, m1, m4 ; t10
    paddd m1, m4 ; t9
    psubd m4, m0, m2 ; t11a
    paddd m0, m2 ; t8a
    psubd m2, m8, m6 ; t13
    paddd m6, m8 ; t14
    psubd m8, m7, m5 ; t12a
    paddd m7, m5 ; t15a
    REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
    REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
    REPX {pmulld x, m14}, m2, m8, m3, m4
    ; the rounding bias is added once to m2 and once to m8, so it is carried
    ; into both the sum and the difference formed below
    paddd m2, m11
    paddd m8, m11
    paddd m5, m2, m3 ; t13a
    psubd m2, m3 ; t10a
    psubd m3, m8, m4 ; t11
    paddd m4, m8 ; t12
    REPX {psrad x, 12}, m5, m2, m3, m4
    ; hand the odd half over to .round through the scratch area
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova [r3+2*16], m2
    mova [r3+3*16], m3
    mova [r3+4*16], m4
    mova [r3+5*16], m5
    mova [r3+6*16], m6
    mova [r3+7*16], m7
%else
    ; x86-32: m2-m5 are parked at [r3+0..3*16] so that they can serve as
    ; temporaries; m4 then holds pd_2048
    mova [r3+0*16], m2
    mova [r3+1*16], m3
    mova [r3+2*16], m4
    mova [r3+3*16], m5
    mova m4, [o(pd_2048)]
    ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
    ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
    ; swap the first results out to the stack and bring the parked inputs in
    mova m2, [r3+0*16]
    mova m3, [r3+1*16]
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova m1, [r3+2*16]
    mova m5, [r3+3*16]
    mova [r3+2*16], m6
    mova [r3+3*16], m7
    ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
    mova m0, [r3+0*16]
    mova m6, [r3+2*16]
    mova m7, [r3+3*16]
; Shared x86-32 tail. Expects m4 = pd_2048, one already-rounded value at
; [r3+1*16] and another in m6, and the remaining six values unrounded in
; m0/m7/m2/m5/m1/m3.
.main_oddhalf_fast2:
    REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
    REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
    psubd m4, m0, m1 ; t9
    paddd m0, m1 ; t8
    mova m1, [r3+1*16]
    mova [r3+0*16], m4
    psubd m4, m6, m2 ; t10
    paddd m2, m6 ; t11
    psubd m6, m1, m5 ; t13
    paddd m5, m1 ; t12
    psubd m1, m7, m3 ; t14
    paddd m7, m3 ; t15
    ; clip all eight values; t9 (parked at [r3+0*16]) is clipped by using it
    ; as the memory operand, so it ends up in m3
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
    pmaxsd m3, [r3+0*16]
    mova [r3+0*16], m3
    mova m3, [o(clip_18b_max)]
REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
    pminsd m3, [r3+0*16]
    ; park t8/t11/t12/t15 while the two rotations use m0/m2/m5/m7
    mova [r3+0*16], m0
    mova [r3+1*16], m2
    mova [r3+2*16], m5
    mova [r3+3*16], m7
    mova m7, [o(pd_2048)]
    ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
    ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
    mova m0, [r3+0*16]
    mova m2, [r3+1*16]
    psubd m5, m1, m4 ; t10
    mova [r3+1*16], m5
    paddd m1, m4 ; t9
    psubd m4, m0, m2 ; t11a
    paddd m0, m2 ; t8a
    mova m5, [r3+2*16]
    mova m7, [r3+3*16]
    psubd m2, m3, m6 ; t13
    paddd m6, m3 ; t14
    paddd m3, m7, m5 ; t15a
    psubd m7, m5 ; t12a
    mova [r3+0*16], m3
    mova m3, [r3+1*16]
    ; clip all eight values; t15a (parked at [r3+0*16]) is clipped via the
    ; memory operand and written back to the same slot
    mova m5, [o(clip_18b_min)]
    REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
    pmaxsd m5, [r3+0*16]
    mova [r3+0*16], m5
    mova m5, [o(clip_18b_max)]
    REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
    pminsd m5, [r3+0*16]
    mova [r3+0*16], m5
    mova m5, [o(pd_2896)]
    REPX {pmulld x, m5}, m2, m7, m3, m4
    mova m5, [o(pd_2048)]
    REPX {paddd x, m5}, m2, m7
    paddd m5, m2, m3 ; t13a
    psubd m2, m3 ; t10a
    psubd m3, m7, m4 ; t11
    paddd m4, m7 ; t12
    REPX {psrad x, 12}, m5, m2, m3, m4
    mova m7, [r3+0*16]
    ; hand the odd half over to .round: m0..m7 go to [r3+11*16]..[r3+4*16]
    mova [r3+11*16], m0
    mova [r3+10*16], m1
    mova [r3+9*16], m2
    mova [r3+8*16], m3
    mova [r3+7*16], m4
    mova [r3+6*16], m5
    mova [r3+5*16], m6
    mova [r3+4*16], m7
%endif
    ret
; Final stage of the 16-point DCT: clip the even half in m0-7, add 1, then
; form outN = even +/- odd with the odd half read back from the scratch
; area, and shift right by 1.
.round:
%if ARCH_X86_64
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    ; m8 = -1: subtracting it adds the +1 rounding bias
    pcmpeqd m8, m8
    REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    ; note that m11-m14 stop holding the shared constants from here on
    mova m8, [r3+1*16]
    mova m9, [r3+2*16]
    mova m10, [r3+3*16]
    mova m11, [r3+4*16]
    mova m12, [r3+5*16]
    mova m13, [r3+6*16]
    mova m14, [r3+7*16]
    psubd m15, m0, m14 ; out15
    paddd m0, m14 ; out0
    psubd m14, m1, m13 ; out14
    paddd m1, m13 ; out1
    psubd m13, m2, m12 ; out13
    paddd m2, m12 ; out2
    psubd m12, m3, m11 ; out12
    paddd m3, m11 ; out3
    psubd m11, m4, m10 ; out11
    paddd m4, m10 ; out4
    psubd m10, m5, m9 ; out10
    paddd m5, m9 ; out5
    psubd m9, m6, m8 ; out9
    paddd m6, m8 ; out6
    psubd m8, m7, [r3+0*16] ; out8
    paddd m7, [r3+0*16] ; out7
    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
    m8, m9, m10, m11, m12,
m13, m14, m15 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 pmaxsd m0, [r3+ 0*16] mova [r3+ 0*16], m7 mova m7, [o(clip_18b_max)] REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 pminsd m7, [r3+ 0*16] mova [r3+ 0*16], m0 pcmpeqd m0, m0 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 mova [r3+ 1*16], m1 mova [r3+ 2*16], m2 mova m1, [r3+ 0*16] psubd m1, m0 mova [r3+ 0*16], m1 mova m1, [r3+11*16] mova m2, [r3+10*16] psubd m0, m7, m1 paddd m7, m1 psubd m1, m6, m2 paddd m6, m2 REPX {psrad x, 1}, m0, m1, m6, m7 packssdw m0, m1 ; out8-9 packssdw m6, m7 ; out6-7 mova [r3+11*16], m6 mova m1, [r3+9*16] mova m7, [r3+8*16] psubd m2, m5, m1 paddd m5, m1 psubd m1, m4, m7 paddd m4, m7 REPX {psrad x, 1}, m2, m1, m4, m5 packssdw m2, m1 ; out10-11 packssdw m4, m5 ; out4-5 mova m1, [r3+2*16] mova [r3+10*16], m4 mova m6, [r3+7*16] mova m7, [r3+6*16] psubd m4, m3, m6 paddd m3, m6 psubd m6, m1, m7 paddd m1, m7 REPX {psrad x, 1}, m4, m6, m1, m3 packssdw m4, m6 ; out12-13 packssdw m1, m3 ; out2-3 mova m3, [r3+1*16] mova [r3+9*16], m1 mova m1, [r3+0*16] mova m5, [r3+5*16] mova m7, [r3+4*16] psubd m6, m3, m5 paddd m3, m5 psubd m5, m1, m7 paddd m1, m7 REPX {psrad x, 1}, m6, m5, m1, m3 packssdw m6, m5 ; out14-15 packssdw m1, m3 ; out0-1 mova [r3+8*16], m1 %endif ret .pass2: lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)] .pass2_loop: lea r3, [strideq*3] %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call r4 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 %if ARCH_X86_64 mova m0, m8 mova m1, m9 mova m2, m10 mova m3, m11 %else mova m0, [rsp+gprsize+0*16] mova m1, [rsp+gprsize+1*16] mova m2, [rsp+gprsize+2*16] mova m3, [rsp+gprsize+3*16] %endif add dstq, 16 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call r4 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 RET INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst 
; Remaining 16x4 entry points whose first-pass type is adst.
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; 16x4 inverse ADST, 16 bpc, first pass.
; r3 is pointed at rsp+gprsize and used as the scratch base by .main and its
; helpers. After .main the results are transposed and control leaves either
; through idct_16x4's .pack_transpose (x86-64) or through tx2q (x86-32).
cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; setup stack pointer
    lea r3, [rsp+gprsize]
    call .main
%if ARCH_X86_64
    jmp m(idct_16x4_internal_16bpc).pack_transpose
%else
    ; x86-32: transpose m0-7, park m0-3 in stack slots 0-3, then reload
    ; slots 8-11 (written by .round) into m0/m2/m4/m6 and transpose those.
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [rsp+gprsize+0*16], m0
    mova [rsp+gprsize+1*16], m1
    mova [rsp+gprsize+2*16], m2
    mova [rsp+gprsize+3*16], m3
    mova m0, [rsp+gprsize+ 8*16]
    mova m2, [rsp+gprsize+ 9*16]
    mova m4, [rsp+gprsize+10*16]
    mova m6, [rsp+gprsize+11*16]
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    jmp tx2q
%endif
; .main: loads coefficient vectors 2,13,6,9,10,5,14,1 from cq for .main_part1,
; then 0,15,4,11,8,7,12,3 for .main_part2, and falls through into .round.
; On x86-64 it first loads the constants kept in m11-m14.
.main:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+ 2*16]
    mova m1, [cq+13*16]
    mova m2, [cq+ 6*16]
    mova m3, [cq+ 9*16]
    mova m4, [cq+10*16]
    mova m5, [cq+ 5*16]
    mova m6, [cq+14*16]
    mova m7, [cq+ 1*16]
    call .main_part1
    mova m0, [cq+ 0*16]
    mova m1, [cq+15*16]
    mova m2, [cq+ 4*16]
    mova m3, [cq+11*16]
    mova m4, [cq+ 8*16]
    mova m5, [cq+ 7*16]
    mova m6, [cq+12*16]
    mova m7, [cq+ 3*16]
    call .main_part2
; .round: final rounding and shifting of the values left by .main_part2.
; Two shift amounts are used below: 1 and 13. pxor with all-ones followed by
; psubd of all-ones (i.e. +1) negates a lane.
; x86-64: expects m14 to still hold pd_2896 (loaded by the caller's .main);
; results end up in m0-m15.
; x86-32: results are packed to words as they are produced, using r3 slots
; 8-11 as temporary and output storage.
.round:
%if ARCH_X86_64
    mova m15, [o(pd_6144)]
    psrld m14, 11 ; pd_1
    pcmpeqd m8, m8 ; -1
    psubd m13, m15, m14 ; pd_6143
    REPX {paddd x, m14}, m0, m2
    REPX {paddd x, m15}, m4, m6
    REPX {pxor x, m8 }, m1, m3, m5, m7
    REPX {psrad x, 1 }, m1, m3
    REPX {paddd x, m15}, m5, m7
    REPX {psubd x, m8 }, m1, m3
    paddd m8, m15, m9
    psubd m9, m13, m10
    paddd m10, m15, m11
    psubd m11, m13, m12
    paddd m12, m14, [r3+3*16]
    psubd m13, m14, [r3+2*16]
    psubd m15, m14, [r3+0*16]
    paddd m14, [r3+1*16]
    REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
    REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
%else
    mova [r3+8*16], m1
    mova [r3+9*16], m3
    mova m3, [o(pd_6144)]
    pcmpeqd m1, m1
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m4, m5, m6, m7
    REPX {psrad x, 13}, m4, m5, m6, m7
    packssdw m4, m5
    packssdw m6, m7
    mova [r3+10*16], m4
    mova [r3+11*16], m6
    mova m4, [r3+4*16]
    mova m5, [r3+5*16]
    mova m6, [r3+6*16]
    mova m7, [r3+7*16]
    REPX {pxor x, m1}, m5, m7
    REPX {psubd x, m1}, m4, m6
    REPX {psrad x, 1 }, m4, m5, m6, m7
    REPX {psubd x, m1}, m5, m7
    packssdw m4, m5
    packssdw m6, m7
    mova m5, [r3+8*16]
    mova m7, [r3+9*16]
    mova [r3+8*16], m4
    mova [r3+9*16], m6
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m0, m5, m2, m7
    REPX {psrad x, 13}, m0, m5, m2, m7
    packssdw m0, m5
    packssdw m2, m7
    mova m4, [r3+0*16]
    mova m5, [r3+1*16]
    mova m6, [r3+2*16]
    mova m7, [r3+3*16]
    REPX {psubd x, m1}, m4, m6
    REPX {pxor x, m1}, m5, m7
    REPX {psrad x, 1 }, m4, m5, m6, m7
    REPX {psubd x, m1}, m5, m7
    packssdw m4, m5
    packssdw m6, m7
%endif
    ret
; .main_part2: second half of the 16-point ADST. Takes eight vectors in m0-7
; and combines them with the eight intermediates that .main_part1 stored at
; [r3+0*16 .. r3+7*16]. The slot assignment differs between the x86-64 and
; x86-32 paths; the per-line tN/outN comments name what each slot holds.
; Intermediate sums are clamped to [clip_18b_min, clip_18b_max].
; Outputs marked "(unshifted)" still need the rounding done in .round.
.main_part2:
%if ARCH_X86_64
    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
    psubd m8, m0, m4 ; t8a
    paddd m0, m4 ; t0a
    psubd m4, m1, m5 ; t9a
    paddd m1, m5 ; t1a
    psubd m5, m2, m6 ; t12a
    paddd m2, m6 ; t4a
    psubd m6, m3, m7 ; t13a
    paddd m7, m3 ; t5a
    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
    mova m15, [o(pd_4017)]
    mova m10, [o(pd_799)]
    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
    psubd m3, m0, m2 ; t4
    paddd m0, m2 ; t0
    psubd m2, m1, m7 ; t5
    paddd m1, m7 ; t1
    psubd m7, m4, m6 ; t12a
    paddd m4, m6 ; t8a
    psubd m6, m8, m5 ; t13a
    paddd m5, m8 ; t9a
    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
    mova m15, [o(pd_3784)]
    mova m10, [o(pd_1567)]
    ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
    ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
    mova m10, [r3+0*16] ; t2
    mova m8, [r3+1*16] ; t3
    psubd m9, m0, m10 ; t2a
    paddd m0, m10 ; out0
    psubd m10, m1, m8 ; t3a
    paddd m1, m8 ; -out15
    mova [r3+0*16], m1
    mova m15, [r3+3*16] ; t7a
    mova m1, [r3+2*16] ; t6a
    psubd m8, m3, m15 ; t7
    paddd m15, m3 ; out12
    paddd m3, m2, m1 ; -out3
    psubd m2, m1 ; t6
    mova [r3+3*16], m15
    mova [r3+1*16], m2
    mova m1, [r3+7*16] ; t15
    mova m2, [r3+6*16] ; t14
    paddd m15, m7, m1 ; -out13
    psubd m7, m1 ; t15a
    psubd m11, m6, m2 ; t14a
    paddd m2, m6 ; out2
    mova [r3+2*16], m15
    mova m1, [r3+4*16] ; t10a
    mova m15, [r3+5*16] ; t11a
    psubd m6, m4, m1 ; t10
    paddd m1, m4 ; -out1
    psubd m4, m5, m15 ; t11
    paddd m5, m15 ; out14
    REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
    pmaxsd m12, [r3+1*16] ; t6
    mova [r3+1*16], m5
    REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
    REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
    paddd m5, m11, m7 ; -out5 (unshifted)
    psubd m11, m7 ; out10 (unshifted)
    paddd m7, m9, m10 ; -out7 (unshifted)
    psubd m9, m10 ; out8 (unshifted)
    psubd m10, m6, m4 ; -out9 (unshifted)
    paddd m6, m4 ; out6 (unshifted)
    paddd m4, m12, m8 ; out4 (unshifted)
    psubd m12, m8 ; -out11 (unshifted)
%else
    ; x86-32: r3 slots 8-11 serve as spill space for the duration of this
    ; routine.
    mova [r3+8*16], m0
    mova [r3+9*16], m1
    mova [r3+10*16], m2
    mova [r3+11*16], m3
    mova m3, [o(pd_2048)]
    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
    mova m0, [r3+8*16]
    mova m1, [r3+9*16]
    mova [r3+8*16], m4
    mova m4, [r3+10*16]
    mova [r3+9*16], m5
    mova [r3+10*16], m6
    mova m5, [r3+11*16]
    mova [r3+11*16], m7
    ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
    ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
    mova m2, [r3+8*16]
    mova m6, [r3+9*16]
    psubd m3, m0, m2 ; t8a
    paddd m0, m2 ; t0a
    mova [r3+8*16], m3
    psubd m2, m1, m6 ; t9a
    paddd m1, m6 ; t1a
    mova m3, [r3+10*16]
    psubd m6, m4, m3 ; t12a
    paddd m4, m3 ; t4a
    mova m3, [r3+11*16]
    psubd m7, m5, m3 ; t13a
    paddd m5, m3 ; t5a
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
    pmaxsd m3, [r3+8*16]
    mova [r3+8*16], m3
    mova m3, [o(clip_18b_max)]
    REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
    pminsd m3, [r3+8*16]
    mova [r3+8*16], m3
    psubd m3, m0, m4 ; t4
    paddd m0, m4 ; t0
    psubd m4, m1, m5 ; t5
    paddd m1, m5 ; t1
    mova m5, [o(pd_2048)]
    mova [r3+9*16], m1
    mova [r3+10*16], m4
    mova [r3+11*16], m3
    mova m3, [r3+8*16]
    mova [r3+8*16], m0
    ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
    ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
    psubd m5, m2, m7 ; t12a
    paddd m2, m7 ; t8a
    psubd m7, m3, m6 ; t13a
    paddd m6, m3 ; t9a
    mova m0, [r3+8*16]
    mova m1, [r3+9*16]
    mova m4, [r3+10*16]
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
    pmaxsd m3, [r3+11*16]
    mova [r3+8*16], m3
    mova m3, [o(clip_18b_max)]
    REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
    pminsd m3, [r3+8*16]
    mova [r3+8*16], m0
    mova [r3+9*16], m1
    mova [r3+10*16], m2
    mova [r3+11*16], m6
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
    ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
    mova m0, [r3+7*16] ; t7a
    mova m2, [r3+6*16] ; t6a
    psubd m1, m3, m0 ; t7
    paddd m0, m3 ; out12
    paddd m3, m4, m2 ; -out3
    psubd m4, m2 ; t6
    mova [r3+7*16], m3
    mova m3, [r3+3*16] ; t15
    mova m2, [r3+2*16] ; t14
    paddd m6, m5, m3 ; -out13
    psubd m5, m3 ; t15a
    psubd m3, m7, m2 ; t14a
    paddd m2, m7 ; out2
    mova [r3+6*16], m2
    mova m7, [r3+0*16] ; t10a
    mova m2, [r3+1*16] ; t11a
    mova [r3+0*16], m0
    mova [r3+1*16], m6
    mova m6, [r3+11*16]
    psubd m0, m6, m2 ; t11
    paddd m6, m2 ; out14
    mova [r3+2*16], m6
    mova m2, [r3+10*16]
    psubd m6, m2, m7 ; t10
    paddd m2, m7 ; -out1
    mova m7, [r3+5*16] ; t3
    mova [r3+5*16], m2
    mova [r3+10*16], m1
    mova m1, [r3+9*16]
    psubd m2, m1, m7 ; t3a
    paddd m1, m7 ; -out15
    mova [r3+3*16], m1
    mova m1, [r3+4*16] ; t2
    mova m7, [r3+8*16]
    psubd m7, m1 ; t2a
    paddd m1, [r3+8*16] ; out0
    mova [r3+4*16], m1
    mova m1, [o(clip_18b_min)]
    REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
    pmaxsd m1, [r3+10*16]
    mova [r3+10*16], m1
    mova m1, [o(clip_18b_max)]
    REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
    pminsd m1, [r3+10*16]
    mova [r3+10*16], m1
    mova m1, [o(pd_2896)]
    REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
    pmulld m1, [r3+10*16]
    mova [r3+11*16], m3
    psubd m3, m4, m1 ; -out11 (unshifted)
    paddd m4, m1 ; out4 (unshifted)
    psubd m1, m6, m0 ; -out9 (unshifted)
    paddd m6, m0 ; out6 (unshifted)
    psubd m0, m7, m2 ; out8 (unshifted)
    paddd m7, m2 ; -out7 (unshifted)
    mova m2, [r3+11*16]
    mova [r3+11*16], m5
    paddd m5, m2 ; -out5 (unshifted)
    psubd m2, [r3+11*16] ; out10 (unshifted)
    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
    ; r[-4,3] contain out0-3 and out12-15
%endif
    ret
; .main_part1: first half of the 16-point ADST. Takes eight vectors in m0-7
; and leaves eight intermediates at [r3+0*16 .. r3+7*16] for .main_part2
; (the slot order is not the same on x86-64 and x86-32).
.main_part1:
%if ARCH_X86_64
    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
    psubd m8, m0, m4 ; t10a
    paddd m0, m4 ; t2a
    psubd m4, m1, m5 ; t11a
    paddd m1, m5 ; t3a
    psubd m5, m2, m6 ; t14a
    paddd m2, m6 ; t6a
    psubd m6, m3, m7 ; t15a
    paddd m7, m3 ; t7a
    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
    mova m15, [o(pd_2276)]
    mova m10, [o(pd_3406)]
    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
    psubd m3, m0, m2 ; t6
    paddd m0, m2 ; t2
    psubd m2, m1, m7 ; t7
    paddd m1, m7 ; t3
    psubd m7, m4, m6 ; t14a
    paddd m4, m6 ; t10a
    psubd m6, m8, m5 ; t15a
    paddd m5, m8 ; t11a
    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
    mova m15, [o(pd_1567)]
    mova m10, [o(pd_3784)]
    ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
    ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
    mova [r3+0*16], m0
    mova [r3+1*16], m1
    mova [r3+4*16], m4
    mova [r3+5*16], m5
    mova [r3+2*16], m2
    mova [r3+3*16], m3
    mova [r3+6*16], m6
    mova [r3+7*16], m7
%else
    ; x86-32: slots 4-7 hold the spilled inputs first and are reused for
    ; intermediates afterwards.
    mova [r3+4*16], m0
    mova [r3+5*16], m1
    mova [r3+6*16], m2
    mova [r3+7*16], m3
    mova m3, [o(pd_2048)]
    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
    mova [r3+0*16], m4
    mova [r3+1*16], m5
    mova [r3+2*16], m6
    mova [r3+3*16], m7
    mova m0, [r3+4*16]
    mova m1, [r3+5*16]
    mova m2, [r3+6*16]
    mova m7, [r3+7*16]
    ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
    ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
    mova m4, [r3+0*16]
    mova m5, [r3+1*16]
    psubd m6, m0, m4 ; t10a
    paddd m0, m4 ; t2a
    mova [r3+4*16], m6
    mova m6, [r3+2*16]
    mova m3, [r3+3*16]
    psubd m4, m1, m5 ; t11a
    paddd m1, m5 ; t3a
    psubd m5, m2, m6 ; t14a
    paddd m2, m6 ; t6a
    psubd m6, m7, m3 ; t15a
    paddd m7, m3 ; t7a
    mova m3, [o(clip_18b_min)]
    REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
    pmaxsd m3, [r3+4*16]
    mova [r3+4*16], m3
    mova m3, [o(clip_18b_max)]
    REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
    pminsd m3, [r3+4*16]
    mova [r3+4*16], m3
    psubd m3, m0, m2 ; t6
    paddd m0, m2 ; t2
    psubd m2, m1, m7 ; t7
    paddd m1, m7 ; t3
    mova [r3+5*16], m1
    mova [r3+6*16], m3
    mova [r3+7*16], m2
    mova m1, [r3+4*16]
    mova [r3+4*16], m0
    mova m3, [o(pd_2048)]
    ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
    ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
    psubd m7, m4, m6 ; t14a
    paddd m4, m6 ; t10a
    psubd m6, m1, m5 ; t15a
    paddd m5, m1 ; t11a
    mova m1, [r3+5*16]
    mova m3, [r3+6*16]
    mova m2, [r3+7*16]
    mova m0, [o(clip_18b_min)]
    REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
    pmaxsd m0, [r3+4*16]
    mova [r3+4*16], m0
    mova m0, [o(clip_18b_max)]
    REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
    pminsd m0, [r3+4*16]
    mova [r3+4*16], m0
    mova [r3+5*16], m1
    mova [r3+0*16], m4
    mova [r3+1*16], m5
    mova m0, [o(pd_2048)]
    ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
    ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
    mova [r3+6*16], m2
    mova [r3+7*16], m3
    mova [r3+2*16], m6
    mova [r3+3*16], m7
%endif
    ret
; .pass2: second pass. r4 is set to the 8 bpc SSSE3 iadst_8x4 .main routine
; and the shared idct_16x4 .pass2_loop does the rest.
.pass2:
    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
    jmp m(idct_16x4_internal_16bpc).pass2_loop

INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; 16x4 inverse flipped ADST, 16 bpc, first pass.
; Shares iadst_16x4's .main and then reverses the output order.
; x86-64: packs register pairs with swapped operands and mirrors the register
; order (m0<-m15 ... m14<-m1) before jumping to idct_16x4's .transpose.
; x86-32: saves m0/m2/m4/m6 to stack slots 4-7 and reloads slots 11..8 through
; pshufd q1032 into m0/m2/m4/m6 ahead of the first transpose.
cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    lea r3, [rsp+gprsize]
    call m(iadst_16x4_internal_16bpc).main
%if ARCH_X86_64
    packssdw m1, m0
    packssdw m3, m2
    packssdw m5, m4
    packssdw m7, m6
    packssdw m9, m8
    packssdw m11, m10
    packssdw m13, m12
    packssdw m15, m14
    mova m0, m15
    mova m2, m13
    mova m4, m11
    mova m6, m9
    mova m8, m7
    mova m10, m5
    mova m12, m3
    mova m14, m1
    jmp m(idct_16x4_internal_16bpc).transpose
%else
    mova [rsp+gprsize+4*16], m0
    mova [rsp+gprsize+5*16], m2
    mova [rsp+gprsize+6*16], m4
    mova [rsp+gprsize+7*16], m6
    pshufd m6, [rsp+gprsize+ 8*16], q1032
    pshufd m4, [rsp+gprsize+ 9*16], q1032
    pshufd m2, [rsp+gprsize+10*16], q1032
    pshufd m0, [rsp+gprsize+11*16], q1032
; (continuation of iflipadst_16x4_internal_16bpc, x86-32 first-pass path)
; Transpose the first four packed vectors, park the result in stack slots
; 0-3, then reload slots 4-7 through pshufd q1032 into m6/m4/m2/m0 and
; transpose again before jumping to tx2q.
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [rsp+gprsize+0*16], m0
    mova [rsp+gprsize+1*16], m1
    mova [rsp+gprsize+2*16], m2
    mova [rsp+gprsize+3*16], m3
    pshufd m6, [rsp+gprsize+ 4*16], q1032
    pshufd m4, [rsp+gprsize+ 5*16], q1032
    pshufd m2, [rsp+gprsize+ 6*16], q1032
    pshufd m0, [rsp+gprsize+ 7*16], q1032
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    jmp tx2q
%endif
; .pass2 (flipadst): advance dstq by 3*stride and negate the stride, then
; reuse the ADST second pass (r4 = 8 bpc iadst_8x4 .main) through the shared
; idct_16x4 .pass2_loop.
; NOTE(review): the negative stride presumably makes the shared write-out
; emit rows in reverse order -- confirm against round2_and_write_8x4.
.pass2:
    lea r3, [strideq*3]
    lea dstq, [dstq+r3]
    neg strideq
    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
    jmp m(idct_16x4_internal_16bpc).pass2_loop

INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; 16x4 identity transform, 16 bpc, first pass.
; Every coefficient is multiplied by pd_11586, pd_6144 is added, and the sum
; is shifted right arithmetically by 13.
; x86-64: all 16 vectors are handled at once. m15 holds the constant, so its
; own product is parked in [cq+ 0*16] while the rounding constant is loaded.
; x86-32: two iterations of eight vectors, starting with the upper eight
; (cq += 8*16); the first transposed result is kept in stack slots 0-3.
cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
    mova m15, [o(pd_11586)]
    pmulld m0, m15, [cq+ 0*16]
    pmulld m1, m15, [cq+ 1*16]
    pmulld m2, m15, [cq+ 2*16]
    pmulld m3, m15, [cq+ 3*16]
    pmulld m4, m15, [cq+ 4*16]
    pmulld m5, m15, [cq+ 5*16]
    pmulld m6, m15, [cq+ 6*16]
    pmulld m7, m15, [cq+ 7*16]
    pmulld m8, m15, [cq+ 8*16]
    pmulld m9, m15, [cq+ 9*16]
    pmulld m10, m15, [cq+10*16]
    pmulld m11, m15, [cq+11*16]
    pmulld m12, m15, [cq+12*16]
    pmulld m13, m15, [cq+13*16]
    pmulld m14, m15, [cq+14*16]
    pmulld m15, [cq+15*16]
    mova [cq+ 0*16], m15
    mova m15, [o(pd_6144)]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14
    paddd m15, [cq+ 0*16]
    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14, m15
    jmp m(idct_16x4_internal_16bpc).pack_transpose
%else
    add cq, 8*16
    mov r5d, 2
.loop_pass1:
    mova m7, [o(pd_11586)]
    pmulld m0, m7, [cq+0*16]
    pmulld m1, m7, [cq+1*16]
    pmulld m2, m7, [cq+2*16]
    pmulld m3, m7, [cq+3*16]
    pmulld m4, m7, [cq+4*16]
    pmulld m5, m7, [cq+5*16]
    pmulld m6, m7, [cq+6*16]
    pmulld m7, [cq+7*16]
    mova [cq+7*16], m7
    mova m7, [o(pd_6144)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [cq+7*16]
    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    dec r5d
    jz .end_pass1
    mova [rsp+gprsize+0*16], m0
    mova [rsp+gprsize+1*16], m1
    mova [rsp+gprsize+2*16], m2
    mova [rsp+gprsize+3*16], m3
    sub cq, 8*16
    jmp .loop_pass1
.end_pass1:
    jmp tx2q
%endif
; .pass2: second pass; r4 points at the local .main below and the shared
; idct_16x4 .pass2_loop calls it. On x86-64 the multiplier is preloaded in
; m12; on x86-32 .main loads it itself.
.pass2:
%if ARCH_X86_64
    mova m12, [o(pw_1697x8)]
%endif
    lea r4, [o(.main)]
    jmp m(idct_16x4_internal_16bpc).pass2_loop
; .main: for each of m0-3 (packed words), x = paddsw(x, pmulhrsw(x, pw_1697x8)).
; Clobbers m4-7.
.main:
%if ARCH_X86_64
    pmulhrsw m4, m0, m12
    pmulhrsw m5, m1, m12
    pmulhrsw m6, m2, m12
    pmulhrsw m7, m3, m12
%else
    mova m7, [o(pw_1697x8)]
    pmulhrsw m4, m0, m7
    pmulhrsw m5, m1, m7
    pmulhrsw m6, m2, m7
    pmulhrsw m7, m3
%endif
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    ret

; INV_TXFM_16X8_FN type1, type2[, eob_offset = 0]
; Declares a 16x8 entry point through INV_TXFM_FN (16 vector registers and
; 8*16 bytes of stack on x86-64, 8 registers and 13*16 bytes on x86-32).
; For dct_dct it also emits the DC-only path: the first coefficient is
; scaled as ((dc*181 + 128) >> 8) * 181 into r5d, eobd is stored over it
; (zero, per the inline comment), r3d is set to 8 and control is handed to
; the 16x4 .dconly code. On x86-32, rsp is advanced by 16 bytes first.
%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
%else
    INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 8
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
%if ARCH_X86_32
    add rsp, 1*16
%endif
    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
%endif
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity, 6
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; 16x8 inverse DCT, 16 bpc, first pass.
; .loop_main is shared by the other 16x8 transforms: each of them loads its
; own .main into t0 and jumps here.
; r5 is a byte offset into the coefficient buffer: 16 when eob >= 10,
; otherwise 0. The pass-1 loop runs for that offset and, if it was non-zero,
; once more with r5 = 0.
; x86-32 keeps r1 in stack slot 12 so it can be restored for the second pass.
cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
    DECLARE_REG_TMP 6, 4, 6
%else
    mov [rsp+gprsize+12*16], r1
    DECLARE_REG_TMP 1, 4, 3
%endif
    lea t0, [o(.main)]
.loop_main:
%undef cmp
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 10
    setge r5b
%else
    mov r5d, 1
    cmp eobd, 10
    sbb r5d, 0
%endif
    shl r5d, 4
    lea r3, [rsp+gprsize]
.loop_pass1:
    call t0
    ; Transposed upper half goes back into coefficient rows 4-7 at offset r5.
%if ARCH_X86_64
    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
    mova [cq+4*32+r5], m8
    mova [cq+5*32+r5], m9
    mova [cq+6*32+r5], m10
    mova [cq+7*32+r5], m11
%else
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [cq+4*32+r5], m0
    mova [cq+5*32+r5], m1
    mova [cq+6*32+r5], m2
    mova [cq+7*32+r5], m3
    mova m0, [rsp+gprsize+ 8*16]
    mova m2, [rsp+gprsize+ 9*16]
    mova m4, [rsp+gprsize+10*16]
    mova m6, [rsp+gprsize+11*16]
%endif
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    pxor m7, m7
; (continuation of idct_16x8_internal_16bpc .loop_pass1; m7 is zero here)
; Clear coefficient rows 8-15 at offset r5. If r5 was non-zero, store the
; transposed m0-3 to rows 0-3 at that offset and repeat with r5 = 0;
; otherwise leave m0-3 in registers and continue at tx2q.
    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
    test r5d, r5d
    jz .end
    mova [cq+0*32+r5], m0
    mova [cq+1*32+r5], m1
    mova [cq+2*32+r5], m2
    mova [cq+3*32+r5], m3
    xor r5d, r5d
    jmp .loop_pass1
.end:
    jmp tx2q
; .main: one first-pass slice of the 16-point DCT. Odd rows are loaded,
; scaled by rect2_mul and fed to idct_16x4's .main_oddhalf; even rows are
; loaded, scaled and fed to idct_8x4's .main_pass1. The two halves are then
; rounded/combined by the idct_8x4 and idct_16x4 .round helpers.
; x86-64 additionally packs m0-15 pairwise to words.
.main:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+ 1*32+r5]
    mova m1, [cq+ 3*32+r5]
    mova m2, [cq+ 5*32+r5]
    mova m3, [cq+ 7*32+r5]
    mova m4, [cq+ 9*32+r5]
    mova m5, [cq+11*32+r5]
    mova m6, [cq+13*32+r5]
    mova m7, [cq+15*32+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call m(idct_16x4_internal_16bpc).main_oddhalf
    mova m0, [cq+ 0*32+r5]
    mova m1, [cq+ 2*32+r5]
    mova m2, [cq+ 4*32+r5]
    mova m3, [cq+ 6*32+r5]
    mova m4, [cq+ 8*32+r5]
    mova m5, [cq+10*32+r5]
    mova m6, [cq+12*32+r5]
    mova m7, [cq+14*32+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call m(idct_8x4_internal_16bpc).main_pass1
    call m(idct_8x4_internal_16bpc).round
    call m(idct_16x4_internal_16bpc).round
%if ARCH_X86_64
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%endif
    ret
; .pass2: second pass, two iterations (r4d = 2). Each iteration runs the
; 8 bpc SSSE3 idct_8x8 .main on m0-7, writes through round2_and_write_8x8,
; clears eight 16-byte coefficient slots, and advances dstq by 16 bytes and
; cq by 4*32 bytes. The first iteration enters at .loop_pass2_entry because
; m0-3 are still live from the first pass.
.pass2:
%if ARCH_X86_32
    mov strideq, [rsp+gprsize+12*16]
%endif
    mov r4d, 2
.pass2_main:
%if ARCH_X86_64
    mova m8, [o(pw_2048)]
    pxor m9, m9
    mova m10, [o(pixel_10bpc_max)]
%endif
    lea r3, [strideq*3]
    jmp .loop_pass2_entry
.loop_pass2:
    mova m0, [cq+0*32+ 0]
    mova m1, [cq+1*32+ 0]
    mova m2, [cq+2*32+ 0]
    mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
    mova m4, [cq+0*32+16]
    mova m5, [cq+1*32+16]
    mova m6, [cq+2*32+16]
    mova m7, [cq+3*32+16]
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    add dstq, 16
    add cq, 4*32
    dec r4d
    jg .loop_pass2
    RET

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity, 6

; 16x8 inverse ADST, 16 bpc. The first pass reuses idct_16x8's .loop_main
; with t0 pointing at the local .main.
cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    mov [rsp+gprsize+12*16], r1
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x8_internal_16bpc).loop_main
; .main: same row order as iadst_16x4's .main, but with a 32-byte row pitch,
; the r5 offset, and a rect2_mul scaling step before each half.
; On x86-32, r3 is moved up by 8*16 around the second rect2_mul call and
; restored afterwards.
; NOTE(review): presumably so rect2_mul's scratch use does not overwrite the
; slots .main_part1 just filled -- confirm against rect2_mul.
.main:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+ 2*32+r5]
    mova m1, [cq+13*32+r5]
    mova m2, [cq+ 6*32+r5]
    mova m3, [cq+ 9*32+r5]
    mova m4, [cq+10*32+r5]
    mova m5, [cq+ 5*32+r5]
    mova m6, [cq+14*32+r5]
    mova m7, [cq+ 1*32+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call m(iadst_16x4_internal_16bpc).main_part1
    mova m0, [cq+ 0*32+r5]
    mova m1, [cq+15*32+r5]
    mova m2, [cq+ 4*32+r5]
    mova m3, [cq+11*32+r5]
    mova m4, [cq+ 8*32+r5]
    mova m5, [cq+ 7*32+r5]
    mova m6, [cq+12*32+r5]
    mova m7, [cq+ 3*32+r5]
%if ARCH_X86_32
    add r3, 8*16
%endif
    call m(idct_8x4_internal_16bpc).rect2_mul
%if ARCH_X86_32
    sub r3, 8*16
%endif
    call m(iadst_16x4_internal_16bpc).main_part2
    call m(iadst_16x4_internal_16bpc).round
%if ARCH_X86_64
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%endif
    ret
; .pass2: same loop shape as idct_16x8's second pass, but using the 8 bpc
; SSSE3 iadst_8x8 .main/.main_pass2_end and the iadst write-out helper.
; x86-64 additionally preloads pw_m2048 in m11. Also entered from
; iflipadst_16x8's .pass2.
.pass2:
%if ARCH_X86_32
    mov strideq, [rsp+gprsize+12*16]
%endif
    mov r4d, 2
%if ARCH_X86_64
    mova m8, [o(pw_2048)]
    pxor m9, m9
    mova m10, [o(pixel_10bpc_max)]
    mova m11, [o(pw_m2048)]
%endif
    lea r3, [strideq*3]
    jmp .loop_pass2_entry
.loop_pass2:
    mova m0, [cq+0*32+ 0]
    mova m1, [cq+1*32+ 0]
    mova m2, [cq+2*32+ 0]
    mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
    mova m4, [cq+0*32+16]
    mova m5, [cq+1*32+16]
    mova m6, [cq+2*32+16]
    mova m7, [cq+3*32+16]
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    add dstq, 16
    add cq, 4*32
    dec r4d
    jg .loop_pass2
    RET

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity, 6

; 16x8 inverse flipped ADST, 16 bpc. First pass goes through idct_16x8's
; .loop_main with the local .main in t0.
cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    mov [rsp+gprsize+12*16], r1
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x8_internal_16bpc).loop_main
; .main: run iadst_16x8's .main, then reverse the output order. Each packed
; vector has its 64-bit halves swapped (pshufd q1032) and the vector order is
; mirrored: in registers on x86-64, via r3 slots 8-11 on x86-32.
.main:
    call m(iadst_16x8_internal_16bpc).main
%if ARCH_X86_64
    pshufd m1, m0, q1032
    pshufd m3, m2, q1032
    pshufd m5, m4, q1032
    pshufd m7, m6, q1032
    pshufd m0, m14, q1032
    pshufd m2, m12, q1032
    pshufd m4, m10, q1032
    pshufd m6, m8, q1032
    mova m14, m1
    mova m12, m3
    mova m10, m5
    mova m8, m7
%else
    pshufd m1, m0, q1032
    pshufd m3, m2, q1032
    pshufd m5, m4, q1032
    pshufd m7, m6, q1032
    pshufd m0, [r3+11*16], q1032
    pshufd m2, [r3+10*16], q1032
    pshufd m4, [r3+9*16], q1032
    pshufd m6, [r3+8*16], q1032
    mova [r3+8*16], m7
    mova [r3+9*16], m5
    mova [r3+10*16], m3
    mova [r3+11*16], m1
%endif
    ret
; .pass2: dstq = dst + 8*stride - stride (i.e. + 7*stride) with the stride
; negated, then continue in iadst_16x8's .pass2. x86-32 writes the negated
; stride back to stack slot 12 because that .pass2 reloads strideq from it.
.pass2:
%if ARCH_X86_32
    mov strideq, [rsp+gprsize+12*16]
%endif
    lea dstq, [dstq+strideq*8]
    neg strideq
    add dstq, strideq
%if ARCH_X86_32
    mov [rsp+gprsize+12*16], strideq
%endif
    jmp m(iadst_16x8_internal_16bpc).pass2

INV_TXFM_16X8_FN identity, dct, -54
INV_TXFM_16X8_FN identity, adst, -54
INV_TXFM_16X8_FN identity, flipadst, -54
INV_TXFM_16X8_FN identity, identity

; 16x8 identity transform, 16 bpc. First pass goes through idct_16x8's
; .loop_main with the local .main in t0.
cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_32
    mov [rsp+gprsize+12*16], r1
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x8_internal_16bpc).loop_main
; .main, x86-64: two scaling stages over all 16 vectors:
;   x = (x * pd_2896  + pd_2048) >> 12
;   x = (x * pd_11586 + pd_6144) >> 13
; then pack pairwise to words. m15 is both a data and a constant register,
; so its value is parked in [r3] whenever a constant is loaded.
; x86-32: the first stage is done by rect2_mul and the second inline, eight
; vectors at a time; the packed lower half is stored in r3 slots 8-11.
.main:
%if ARCH_X86_64
    mova m15, [o(pd_2896)]
    pmulld m0, m15, [cq+ 0*32+r5]
    pmulld m1, m15, [cq+ 1*32+r5]
    pmulld m2, m15, [cq+ 2*32+r5]
    pmulld m3, m15, [cq+ 3*32+r5]
    pmulld m4, m15, [cq+ 4*32+r5]
    pmulld m5, m15, [cq+ 5*32+r5]
    pmulld m6, m15, [cq+ 6*32+r5]
    pmulld m7, m15, [cq+ 7*32+r5]
    pmulld m8, m15, [cq+ 8*32+r5]
    pmulld m9, m15, [cq+ 9*32+r5]
    pmulld m10, m15, [cq+10*32+r5]
    pmulld m11, m15, [cq+11*32+r5]
    pmulld m12, m15, [cq+12*32+r5]
    pmulld m13, m15, [cq+13*32+r5]
    pmulld m14, m15, [cq+14*32+r5]
    pmulld m15, [cq+15*32+r5]
    mova [r3], m15
    mova m15, [o(pd_2048)]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14
    paddd m15, [r3]
    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14, m15
    mova [r3], m15
    mova m15, [o(pd_11586)]
    REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14
    pmulld m15, [r3]
    mova [r3], m15
    mova m15, [o(pd_6144)]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14
    paddd m15, [r3]
    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14, m15
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%else
    mova m0, [cq+ 0*32+r5]
    mova m1, [cq+ 1*32+r5]
    mova m2, [cq+ 2*32+r5]
    mova m3, [cq+ 3*32+r5]
    mova m4, [cq+ 4*32+r5]
    mova m5, [cq+ 5*32+r5]
    mova m6, [cq+ 6*32+r5]
    mova m7, [cq+ 7*32+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    mova [r3], m7
    mova m7, [o(pd_11586)]
    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulld m7, [r3]
    mova [r3], m7
    mova m7, [o(pd_6144)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    mova [r3+ 8*16], m0
    mova [r3+ 9*16], m2
    mova [r3+10*16], m4
    mova [r3+11*16], m6
    mova m0, [cq+ 8*32+r5]
    mova m1, [cq+ 9*32+r5]
    mova m2, [cq+10*32+r5]
    mova m3, [cq+11*32+r5]
    mova m4, [cq+12*32+r5]
    mova m5, [cq+13*32+r5]
    mova m6, [cq+14*32+r5]
    mova m7, [cq+15*32+r5]
    call m(idct_8x4_internal_16bpc).rect2_mul
    mova [r3], m7
    mova m7, [o(pd_11586)]
    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulld m7, [r3]
    mova [r3], m7
    mova m7, [o(pd_6144)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
%endif
    ret
; .pass2: two iterations with no transform call of its own. m0-7 go straight
; to a write-out helper: round1_and_write_8x8 on x86-64 (with pw_4096 in m8),
; round4_and_write_8x8 on x86-32 (with pw_4096 in m7 and the original m7
; saved at [rsp+gprsize]).
.pass2:
%if ARCH_X86_32
    mov strideq, [rsp+gprsize+12*16]
%endif
    mov r4d, 2
%if ARCH_X86_64
    mova m8, [o(pw_4096)]
    pxor m9, m9
    mova m10, [o(pixel_10bpc_max)]
%endif
    lea r3, [strideq*3]
    jmp .loop_pass2_entry
.loop_pass2:
    mova m0, [cq+0*32+ 0]
    mova m1, [cq+1*32+ 0]
    mova m2, [cq+2*32+ 0]
    mova m3, [cq+3*32+ 0]
.loop_pass2_entry:
    mova m4, [cq+0*32+16]
    mova m5, [cq+1*32+16]
    mova m6, [cq+2*32+16]
    mova m7, [cq+3*32+16]
%if ARCH_X86_64
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
%else
    mova [rsp+gprsize], m7
    mova m7, [o(pw_4096)]
    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
%endif
%if ARCH_X86_64
%define mzero m9
%else
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    add dstq, 16
    add cq, 4*32
    dec r4d
    jg .loop_pass2
    RET

; INV_TXFM_16X16_FN type1, type2[, eob_tbl_suffix = 2d]
; Declares a 16x16 entry point through INV_TXFM_FN, passing tbl_16x16_<suffix>
; as its third argument. For dct_dct it also emits the DC-only path:
; r5d = (dc*181 + 640) >> 10, eobd is stored over the first coefficient
; (zero, per the inline comment), r3d = 16, rsp is advanced by
; (5 + 3*ARCH_X86_64 + WIN64)*16 bytes, and control is handed to the 16x4
; .dconly2 code.
%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
%if ARCH_X86_64
    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
%else
    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 16
    add r5d, 640
    sar r5d, 10
    add rsp, (5+ARCH_X86_64*3+WIN64)*16
    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
%endmacro

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, v
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst

; 16x16 inverse DCT, 16 bpc, first pass.
; .pass1_full is shared: iadst_16x16 loads its own .main into t0 and jumps
; there. Registers that must survive are saved above the 16*16-byte scratch
; area (r7 on WIN64; r1 and r6 on x86-32).
; .zero_loop counts t1 down from 3 while the low byte of eob is below the
; byte at [r5+t1]; r5 then becomes t1*16, and pass 1 steps r5 down by 16
; until it goes negative.
; NOTE(review): r5 presumably points at the tbl_16x16_* eob table on entry --
; confirm in INV_TXFM_FN.
cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if ARCH_X86_64
    DECLARE_REG_TMP 6, 7
%if WIN64
    mov [rsp+16*16+gprsize], r7
%endif
%elif ARCH_X86_32
    DECLARE_REG_TMP 1, 6
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(.main)]
.pass1_full:
%undef cmp
    mov t1d, 4
.zero_loop:
    dec t1d
    cmp eobb, byte [r5+t1]
    jb .zero_loop
    mov r5d, t1d
    shl r5d, 4
%if ARCH_X86_32
    ; restore pic-ptr
    mov r6, [rsp+16*16+2*gprsize]
%endif
    ; setup stack pointer
    lea r3, [rsp+gprsize]
.loop_pass1:
    call t0
    ; Transposed results are written back over coefficient rows 0-7 at
    ; offset r5, and rows 8-15 at that offset are cleared.
%if ARCH_X86_64
    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
    mova [cq+4*64+r5], m8
    mova [cq+5*64+r5], m9
    mova [cq+6*64+r5], m10
    mova [cq+7*64+r5], m11
%else
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [cq+4*64+r5], m0
    mova [cq+5*64+r5], m1
    mova [cq+6*64+r5], m2
    mova [cq+7*64+r5], m3
    mova m0, [rsp+gprsize+ 8*16]
    mova m2, [rsp+gprsize+ 9*16]
    mova m4, [rsp+gprsize+10*16]
    mova m6, [rsp+gprsize+11*16]
%endif
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [cq+0*64+r5], m0
    mova [cq+1*64+r5], m1
    mova [cq+2*64+r5], m2
    mova [cq+3*64+r5], m3
    pxor m0, m0
    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
    sub r5d, 16
    jge .loop_pass1
%if ARCH_X86_32
    ; restore pic-ptr
    mov r1, [rsp+16*16+1*gprsize]
%endif
    jmp tx2q
; .main: one first-pass slice. Odd rows go to idct_16x4's .main_oddhalf and
; even rows to idct_8x4's .main_pass1 (row pitch 64 bytes, no rect2 scaling);
; the halves are combined by idct_8x4's .round and the local .round.
.main:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+ 1*64+r5]
    mova m1, [cq+ 3*64+r5]
    mova m2, [cq+ 5*64+r5]
    mova m3, [cq+ 7*64+r5]
    mova m4, [cq+ 9*64+r5]
    mova m5, [cq+11*64+r5]
    mova m6, [cq+13*64+r5]
    mova m7, [cq+15*64+r5]
    call m(idct_16x4_internal_16bpc).main_oddhalf
    mova m0, [cq+ 0*64+r5]
    mova m1, [cq+ 2*64+r5]
    mova m2, [cq+ 4*64+r5]
    mova m3, [cq+ 6*64+r5]
    mova m4, [cq+ 8*64+r5]
    mova m5, [cq+10*64+r5]
    mova m6, [cq+12*64+r5]
    mova m7, [cq+14*64+r5]
    call m(idct_8x4_internal_16bpc).main_pass1
    call m(idct_8x4_internal_16bpc).round
    call .round
%if ARCH_X86_64
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%endif
    ret
; .round: clamp m0-7 to the 18-bit range, add the rounding bias 2, form the
; butterfly sums/differences with the eight vectors stored at r3 slots 0-7
; (x86-64) or 4-11 (x86-32), and shift right by 2.
; x86-64: the bias is derived from m11 (pd_2048 >> 10); out0-15 end in m0-15.
; x86-32: outputs are packed to words in pairs; some stay in registers and
; some go to r3 slots 8-11, as the outN comments show.
.round:
%if ARCH_X86_64
    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    psrld m8, m11, 10 ; 2
    REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    mova m8, [r3+1*16]
    mova m9, [r3+2*16]
    mova m10, [r3+3*16]
    mova m11, [r3+4*16]
    mova m12, [r3+5*16]
    mova m13, [r3+6*16]
    mova m14, [r3+7*16]
    psubd m15, m0, m14 ; out15
    paddd m0, m14 ; out0
    psubd m14, m1, m13 ; out14
    paddd m1, m13 ; out1
    psubd m13, m2, m12 ; out13
    paddd m2, m12 ; out2
    psubd m12, m3, m11 ; out12
    paddd m3, m11 ; out3
    psubd m11, m4, m10 ; out11
    paddd m4, m10 ; out4
    psubd m10, m5, m9 ; out10
    paddd m5, m9 ; out5
    psubd m9, m6, m8 ; out9
    paddd m6, m8 ; out6
    psubd m8, m7, [r3+0*16] ; out8
    paddd m7, [r3+0*16] ; out7
    REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
        m8, m9, m10, m11, m12, m13, m14, m15
    ; and out0-15 is now in m0-15
%else
    mova [r3+ 0*16], m0
    mova m0, [o(clip_18b_min)]
    REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
    pmaxsd m0, [r3+ 0*16]
    mova [r3+ 0*16], m7
    mova m7, [o(clip_18b_max)]
    REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
    pminsd m7, [r3+ 0*16]
    mova [r3+ 0*16], m0
    mova m0, [o(pd_2)]
    REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
    paddd m0, [r3+ 0*16]
    mova [r3+ 0*16], m0
    mova [r3+ 1*16], m1
    mova [r3+ 2*16], m2
    mova m1, [r3+11*16]
    mova m2, [r3+10*16]
    psubd m0, m7, m1
    paddd m7, m1
    psubd m1, m6, m2
    paddd m6, m2
    REPX {psrad x, 2}, m0, m1, m6, m7
    packssdw m0, m1 ; out8-9
    packssdw m6, m7 ; out6-7
    mova [r3+11*16], m6
    mova m1, [r3+9*16]
    mova m7, [r3+8*16]
    psubd m2, m5, m1
    paddd m5, m1
    psubd m1, m4, m7
    paddd m4, m7
    REPX {psrad x, 2}, m2, m1, m4, m5
    packssdw m2, m1 ; out10-11
    packssdw m4, m5 ; out4-5
    mova m1, [r3+2*16]
    mova [r3+10*16], m4
    mova m6, [r3+7*16]
    mova m7, [r3+6*16]
    psubd m4, m3, m6
    paddd m3, m6
    psubd m6, m1, m7
    paddd m1, m7
    REPX {psrad x, 2}, m4, m6, m1, m3
    packssdw m4, m6 ; out12-13
    packssdw m1, m3 ; out2-3
    mova m3, [r3+1*16]
    mova [r3+9*16], m1
    mova m1, [r3+0*16]
    mova m5, [r3+5*16]
    mova m7, [r3+4*16]
    psubd m6, m3, m5
    paddd m3, m5
    psubd m5, m1, m7
    paddd m1, m7
    REPX {psrad x, 2}, m6, m5, m1, m3
    packssdw m6, m5 ; out14-15
    packssdw m1, m3 ; out0-1
    mova [r3+8*16], m1
%endif
    ret
; .pass2: second pass, two iterations (r4d = 2), each producing an 8-pixel
; wide, 16-row strip. The 8 bpc SSSE3 idct_8x8 .main handles coefficient
; rows 0 and 2, the 8 bpc idct_16x8 .main handles rows 1 and 3. Outputs 8-15
; are written eight rows below the strip's start, then outputs 0-7 at the
; strip's start. The start pointer lives in r7 (x86-64) or on the stack
; (x86-32) because dstq is reloaded from it before each write-out.
.pass2:
%if ARCH_X86_64
    mova m8, [o(pw_2048)]
    pxor m9, m9
    mova m10, [o(pixel_10bpc_max)]
    mov r7, dstq
%else
    mov [rsp+2*gprsize+16*16], dstq
%endif
    lea r3, [strideq*3]
    mov r4d, 2
.loop_pass2:
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    mova m0, [cq+0*64+ 0]
    mova m1, [cq+2*64+ 0]
    mova m2, [cq+0*64+16]
    mova m3, [cq+2*64+16]
    mova m4, [cq+0*64+32]
    mova m5, [cq+2*64+32]
    mova m6, [cq+0*64+48]
    mova m7, [cq+2*64+48]
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    mova [rsp+gprsize+3*16], m0
    mova [rsp+gprsize+4*16], m1
    mova [rsp+gprsize+5*16], m2
    mova [rsp+gprsize+6*16], m3
    mova [rsp+gprsize+7*16], m4
    mova [rsp+gprsize+8*16], m5
    mova [rsp+gprsize+9*16], m6
    ; m7 is already stored in [rsp+gprsize+0*16]
    mova m0, [cq+1*64+ 0]
    mova m1, [cq+3*64+ 0]
    mova m2, [cq+1*64+16]
    mova m3, [cq+3*64+16]
    mova m4, [cq+1*64+32]
    mova m5, [cq+3*64+32]
    mova m6, [cq+1*64+48]
    mova m7, [cq+3*64+48]
    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
    ; out0-7 is in rsp+gprsize+3-10*mmsize
    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
%if ARCH_X86_64
    lea dstq, [r7+strideq*8]
%else
    mov dstq, [rsp+2*gprsize+16*16]
    lea dstq, [dstq+strideq*8]
%endif
    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
%if ARCH_X86_64
    mov dstq, r7
%else
    mov dstq, [rsp+2*gprsize+16*16]
%endif
    mova m0, [rsp+gprsize+ 3*16]
    mova m1, [rsp+gprsize+ 4*16]
    mova m2, [rsp+gprsize+ 5*16]
    mova m3, [rsp+gprsize+ 6*16]
    mova m4, [rsp+gprsize+ 7*16]
    mova m5, [rsp+gprsize+ 8*16]
    mova m6, [rsp+gprsize+ 9*16]
    mova m7, [rsp+gprsize+10*16]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    ; Advance the saved destination by 16 bytes and clear the 16 coefficient
    ; vectors this iteration consumed (8 before and 8 after bumping cq).
%if ARCH_X86_64
    add r7, 16
%define mzero m9
%else
    add dword [rsp+2*gprsize+16*16], 16
%define mzero m7
    pxor m7, m7
%endif
    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
    add cq, 64*4
    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
%undef mzero
    dec r4d
    jg .loop_pass2
%if WIN64
    mov r7, [rsp+16*16+gprsize]
%endif
    RET

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

; 16x16 inverse ADST, 16 bpc. Saves the same registers as idct_16x16 and
; reuses its .pass1_full loop with t0 pointing at the local .main.
cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x16_internal_16bpc).pass1_full
; .main: same row order as iadst_16x4's .main with a 64-byte row pitch and
; the r5 offset; uses iadst_16x4's .main_part1/.main_part2 and the local
; .round.
.main:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    mova m0, [cq+ 2*64+r5]
    mova m1, [cq+13*64+r5]
    mova m2, [cq+ 6*64+r5]
    mova m3, [cq+ 9*64+r5]
    mova m4, [cq+10*64+r5]
    mova m5, [cq+ 5*64+r5]
    mova m6, [cq+14*64+r5]
    mova m7, [cq+ 1*64+r5]
    call m(iadst_16x4_internal_16bpc).main_part1
    mova m0, [cq+ 0*64+r5]
    mova m1, [cq+15*64+r5]
    mova m2, [cq+ 4*64+r5]
    mova m3, [cq+11*64+r5]
    mova m4, [cq+ 8*64+r5]
    mova m5, [cq+ 7*64+r5]
    mova m6, [cq+12*64+r5]
    mova m7, [cq+ 3*64+r5]
    call m(iadst_16x4_internal_16bpc).main_part2
    call .round
%if ARCH_X86_64
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%endif
    ret
; .round: same structure as iadst_16x4's .round, but with shifts of 2 and 14
; and rounding constants 2 and pd_10240 (3 and 10239 where a lane has been
; bit-inverted with pxor instead of negated).
; x86-64: expects m14 to still hold pd_2896 (2896 >> 10 == 2).
.round:
%if ARCH_X86_64
    pcmpeqd m8, m8 ; -1
    mova m15, [o(pd_10240)]
    psrld m14, 10 ; +2
    psubd m13, m14, m8 ; +3
    REPX {pxor x, m8 }, m1, m3, m5, m7
    REPX {paddd x, m14}, m0, m2
    REPX {paddd x, m13}, m1, m3
    REPX {paddd x, m15}, m4, m5, m6, m7
    paddd m13, m15, m8 ; +10239
    paddd m8, m15, m9
    psubd m9, m13, m10
    paddd m10, m15, m11
    psubd m11, m13, m12
    paddd m12, m14, [r3+3*16]
    psubd m13, m14, [r3+2*16]
    psubd m15, m14, [r3+0*16]
    paddd m14, [r3+1*16]
    REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
    REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
%else
    mova [r3+8*16], m1
    mova [r3+9*16], m3
    mova m3, [o(pd_10240)]
    pcmpeqd m1, m1
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m4, m5, m6, m7
    REPX {psrad x, 14}, m4, m5, m6, m7
    packssdw m4, m5
    packssdw m6, m7
    mova [r3+10*16], m4
    mova [r3+11*16], m6
    mova m4, [r3+4*16]
    mova m5, [r3+5*16]
    mova m6, [r3+6*16]
    mova m7, [r3+7*16]
    mova m3, [o(pd_2)]
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m4, m6
    psubd m3, m1
    REPX {paddd x, m3}, m5, m7
    REPX {psrad x, 2 }, m4, m5, m6, m7
    packssdw m4, m5
    packssdw m6, m7
    mova m5, [r3+8*16]
    mova m7, [r3+9*16]
    mova [r3+8*16], m4
    mova [r3+9*16], m6
    mova m3, [o(pd_10240)]
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m0, m5, m2, m7
    REPX {psrad x, 14}, m0, m5, m2, m7
    packssdw m0, m5
    packssdw m2, m7
    mova m4, [r3+0*16]
    mova m5, [r3+1*16]
    mova m6, [r3+2*16]
    mova m7, [r3+3*16]
    mova m3, [o(pd_2)]
    REPX {pxor x, m1}, m5, m7
    REPX {paddd x, m3}, m4, m6
    psubd m3, m1
    REPX {paddd x, m3}, m5, m7
    REPX {psrad x, 2 }, m4, m5, m6, m7
    packssdw m4, m5
    packssdw m6, m7
%endif
    ret
.pass2:
%if ARCH_X86_64 mova m8, [o(pw_2048)] mova m11, [o(pw_m2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 2 .loop_pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m0, [cq+0*64+32] mova m1, [cq+1*64+32] mova m2, [cq+2*64+16] mova m3, [cq+3*64+16] mova m4, [cq+0*64+ 0] mova m5, [cq+1*64+ 0] mova m6, [cq+2*64+48] mova m7, [cq+3*64+48] mova [rsp+gprsize+3*16], m0 mova [rsp+gprsize+4*16], m1 mova [rsp+gprsize+5*16], m2 mova [rsp+gprsize+6*16], m3 mova [rsp+gprsize+7*16], m4 mova [rsp+gprsize+8*16], m5 mova [rsp+gprsize+9*16], m6 mova [rsp+gprsize+10*16], m7 mova m0, [cq+2*64+ 0] mova m1, [cq+3*64+ 0] mova m2, [cq+0*64+16] mova m3, [cq+1*64+16] mova m4, [cq+2*64+32] mova m5, [cq+3*64+32] mova m6, [cq+0*64+48] mova m7, [cq+1*64+48] call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end ; out0-7 is in rsp+gprsize+3-10*mmsize ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize %if ARCH_X86_64 lea dstq, [r7+strideq*8] %else mov dstq, [rsp+2*gprsize+16*16] lea dstq, [dstq+strideq*8] %endif call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 mov dstq, r7 %else mov dstq, [rsp+2*gprsize+16*16] %endif mova m0, [rsp+gprsize+ 3*16] mova m1, [rsp+gprsize+ 4*16] mova m2, [rsp+gprsize+ 5*16] mova m3, [rsp+gprsize+ 6*16] mova m4, [rsp+gprsize+ 7*16] mova m5, [rsp+gprsize+ 8*16] mova m6, [rsp+gprsize+ 9*16] mova m7, [rsp+gprsize+10*16] call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 %if ARCH_X86_64 add r7, 16 %define mzero m9 %else add dword [rsp+2*gprsize+16*16], 16 %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add cq, 64*4 REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 %undef mzero dec r4d jg .loop_pass2 %if WIN64 mov r7, [rsp+16*16+gprsize] %endif RET INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst 
INV_TXFM_16X16_FN flipadst, flipadst

; iflipadst_16x16_internal_16bpc
;   entry  : saves r7 (WIN64) or r1/r6 (x86-32) in stack slots, puts the
;            address of .main in t0 and jumps to
;            idct_16x16_internal_16bpc.pass1_full.
;   .main  : runs the plain ADST .main, then reverses its packed outputs.
;   .pass2 : reuses the ADST .pass2 with dst moved to the last of 16 rows
;            and the stride negated.
cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
    call m(iadst_16x16_internal_16bpc).main
%if ARCH_X86_64
    ; reverse the order of the eight packed result registers
    ; (m0<->m14, m2<->m12, m4<->m10, m6<->m8); pshufd q1032 also swaps the
    ; two 64-bit halves inside each of them
    mova m1, m0
    mova m3, m2
    mova m5, m4
    mova m7, m6
    pshufd m0, m14, q1032
    pshufd m2, m12, q1032
    pshufd m4, m10, q1032
    pshufd m6, m8, q1032
    pshufd m8, m7, q1032
    pshufd m10, m5, q1032
    pshufd m12, m3, q1032
    pshufd m14, m1, q1032
%else
    ; same reversal, with four of the eight vectors living at [r3+8..11*16]
    pshufd m1, m0, q1032
    pshufd m3, m2, q1032
    pshufd m5, m4, q1032
    pshufd m7, m6, q1032
    pshufd m0, [r3+11*16], q1032
    pshufd m2, [r3+10*16], q1032
    pshufd m4, [r3+9*16], q1032
    pshufd m6, [r3+8*16], q1032
    mova [r3+11*16], m1
    mova [r3+10*16], m3
    mova [r3+ 9*16], m5
    mova [r3+ 8*16], m7
%endif
    ret
.pass2:
    ; r3 = 3*stride, then 15*stride: dst now addresses row 15, and the negated
    ; stride makes the shared ADST second pass walk the rows upwards
    lea r3, [strideq*3]
    lea r3, [r3*5]
    add dstq, r3
    neg strideq
    jmp m(iadst_16x16_internal_16bpc).pass2

INV_TXFM_16X16_FN identity, dct, h
INV_TXFM_16X16_FN identity, identity

; iidentity_16x16_internal_16bpc
;   entry  : saves r7 (WIN64) or r1/r6 (x86-32) in stack slots, puts the
;            address of .main in t0 and jumps to
;            idct_16x16_internal_16bpc.pass1_full.
;   .main  : every dword of the 16 groups [cq+n*64+r5] becomes
;            (x*11586 + 10240) >> 14, then pairs are packed to words.
;   .pass2 : two columns of four steps; each step handles four 16-byte
;            vectors and four destination rows.
cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%if WIN64
    mov [rsp+16*16+gprsize], r7
%elif ARCH_X86_32
    mov [rsp+16*16+gprsize*1], r1
    mov [rsp+16*16+gprsize*2], r6
%endif
    lea t0, [o(.main)]
    jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
%if ARCH_X86_64
    mova m15, [o(pd_11586)]
    pmulld m0, m15, [cq+ 0*64+r5]
    pmulld m1, m15, [cq+ 1*64+r5]
    pmulld m2, m15, [cq+ 2*64+r5]
    pmulld m3, m15, [cq+ 3*64+r5]
    pmulld m4, m15, [cq+ 4*64+r5]
    pmulld m5, m15, [cq+ 5*64+r5]
    pmulld m6, m15, [cq+ 6*64+r5]
    pmulld m7, m15, [cq+ 7*64+r5]
    pmulld m8, m15, [cq+ 8*64+r5]
    pmulld m9, m15, [cq+ 9*64+r5]
    pmulld m10, m15, [cq+10*64+r5]
    pmulld m11, m15, [cq+11*64+r5]
    pmulld m12, m15, [cq+12*64+r5]
    pmulld m13, m15, [cq+13*64+r5]
    pmulld m14, m15, [cq+14*64+r5]
    pmulld m15, [cq+15*64+r5]
    ; all 16 registers are busy: park the last product at [r3] so that m15
    ; can hold the rounding constant, then fold the parked value back in
    mova [r3], m15
    mova m15, [o(pd_10240)]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
                         m8, m9, m10, m11, m12, m13, m14
    paddd m15, [r3]
    REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
                         m8, m9, m10, m11, m12, m13, m14, m15
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%else
    ; x86-32: same computation in two halves of eight groups; the first
    ; half's packed results are stored at [r3+8*16]..[r3+11*16]
    mova m7, [o(pd_11586)]
    pmulld m0, m7, [cq+ 0*64+r5]
    pmulld m1, m7, [cq+ 1*64+r5]
    pmulld m2, m7, [cq+ 2*64+r5]
    pmulld m3, m7, [cq+ 3*64+r5]
    pmulld m4, m7, [cq+ 4*64+r5]
    pmulld m5, m7, [cq+ 5*64+r5]
    pmulld m6, m7, [cq+ 6*64+r5]
    pmulld m7, [cq+ 7*64+r5]
    mova [r3], m7
    mova m7, [o(pd_10240)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    mova [r3+8*16], m0
    mova [r3+9*16], m2
    mova [r3+10*16], m4
    mova [r3+11*16], m6
    mova m7, [o(pd_11586)]
    pmulld m0, m7, [cq+ 8*64+r5]
    pmulld m1, m7, [cq+ 9*64+r5]
    pmulld m2, m7, [cq+10*64+r5]
    pmulld m3, m7, [cq+11*64+r5]
    pmulld m4, m7, [cq+12*64+r5]
    pmulld m5, m7, [cq+13*64+r5]
    pmulld m6, m7, [cq+14*64+r5]
    pmulld m7, [cq+15*64+r5]
    mova [r3], m7
    mova m7, [o(pd_10240)]
    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
    paddd m7, [r3]
    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
%endif
    ret
.pass2:
%if ARCH_X86_64
    mova m4, [o(pw_2048)]
    mova m5, [o(pixel_10bpc_max)]
    pxor m6, m6
    mova m7, [o(pw_1697x16)]
    mov r7, dstq
%else
    mov [rsp+2*gprsize+16*16], dstq
%endif
    ; r5w counts the four steps of a column; bit 16 of r5d is toggled below
    ; to tell the first column from the second
    mov r5d, 4
    lea r3, [strideq*3]
.pass2_loop:
    mova m0, [cq+0*64+0]
    mova m1, [cq+1*64+0]
    mova m2, [cq+2*64+0]
    mova m3, [cq+3*64+0]
    call m(iidentity_8x16_internal_16bpc).main
%if ARCH_X86_64
    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
%else
    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
%endif
    ; clear the four vectors just read (m6 is zero from the setup above on
    ; x86-64; on x86-32 this relies on the helper leaving m6 zero -- confirm)
    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
    add cq, 16
    lea dstq, [dstq+strideq*4]
    dec r5w
    jg .pass2_loop
    add cq, 64*3
    ; btc returns the old bit 16 in CF: clear after column 1, set after column 2
    btc r5d, 16
    jc .end
    ; second column: 16 bytes to the right of the saved base
%if ARCH_X86_64
    lea dstq, [r7+16]
%else
    mov dstq, [rsp+2*gprsize+16*16]
    add dstq, 16
%endif
    add r5d, 4
    jmp .pass2_loop
.end:
%if WIN64
    mov r7, [rsp+16*16+gprsize]
%endif
    RET

; inv_txfm_add_identity_identity_8x32_16bpc(dst, stride, c, eob)
;   Each loop step packs eight 128-byte-pitch coefficient vectors to words,
;   applies (x + 5) >> 3 (saturating add), zeroes the coefficients and adds
;   the result to four destination rows via .main_zero. Steps come in pairs
;   (bit 16 of eobd is the toggle); after each pair eob is lowered by 64 and
;   the loop ends once it goes negative.
;   .main_zero / .main are also called by other functions in this file.
cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
%if ARCH_X86_32
    LEA r6, $$
%endif
    mova m5, [o(pw_5)]
    mova m7, [o(pixel_10bpc_max)]
    pxor m6, m6
    ; add 21 to the low byte of eob only; if that byte overflows, cmovc puts
    ; the original eob back
    mov r5d, eobd
    add eobb, 21
    cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
    lea r4, [strideq*3]
.loop:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {paddsw x, m5}, m0, m1, m2, m3
    REPX {psraw x, 3 }, m0, m1, m2, m3
    call .main_zero
    add cq, 16
    lea dstq, [dstq+strideq*4]
    btc eobd, 16
    jnc .loop
    sub eobd, 64
    jge .loop
    RET
ALIGN function_align
.main_zero:
    ; zero the eight coefficient vectors that the caller has just loaded
    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
    ; word/qword interleaves regroup m0..m3 (presumably a transpose into four
    ; rows of eight pixels -- confirm); inputs m0..m3, m4 is scratch
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m4
    punpcklwd m0, m4
    punpckhwd m4, m2, m1
    punpcklwd m2, m1
    punpckhqdq m1, m0, m2
    punpcklqdq m0, m2
    punpcklqdq m2, m3, m4
    punpckhqdq m3, m4
    ; add to four destination rows (r4 = 3*stride) and clamp to [m6, m7],
    ; i.e. [0, pixel_10bpc_max] as set up by the callers
    paddw m0, [dstq+strideq*0]
    paddw m1, [dstq+strideq*1]
    paddw m2, [dstq+strideq*2]
    paddw m3, [dstq+r4 ]
    REPX {pmaxsw x, m6}, m0, m1, m2, m3
    REPX {pminsw x, m7}, m0, m1, m2, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r4 ], m3
    ret

; inv_txfm_add_identity_identity_32x8_16bpc(dst, stride, c, eob)
;   Same structure with a 32-byte coefficient pitch: each pair of steps
;   covers eight rows of one 16-byte-wide destination column, then r5 (the
;   column base) moves 16 bytes to the right and eob is lowered by 64.
;   Scaling is pmulhrsw by pw_4096, i.e. (x + 4) >> 3.
cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
%if ARCH_X86_32
    LEA r6, $$
%endif
    mova m5, [o(pw_4096)]
    mova m7, [o(pixel_10bpc_max)]
    pxor m6, m6
    ; r4d is only a temporary for the eob fix-up; it is reloaded just below
    mov r4d, eobd
    add eobb, 21
    cmovc eobd, r4d
    lea r4, [strideq*3]
    mov r5, dstq
.loop:
    mova m0, [cq+32*0]
    packssdw m0, [cq+32*1]
    mova m1, [cq+32*2]
    packssdw m1, [cq+32*3]
    mova m2, [cq+32*4]
    packssdw m2, [cq+32*5]
    mova m3, [cq+32*6]
    packssdw m3, [cq+32*7]
    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
    lea dstq, [dstq+strideq*4]
    add cq, 16
    btc eobd, 16
    jnc .loop
    ; cq moved 32 bytes during the pair; skip to the next 32*8-byte block
    add cq, 32*8-32
    add r5, 16
    mov dstq, r5
    sub eobd, 64
    jge .loop
    RET

; NOTE: the operands of the `cglobal` below are on the following line of
; the file.
cglobal
inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
; ^ operands of the `cglobal` keyword that ends the previous line.
;
; inv_txfm_add_identity_identity_16x32_16bpc(dst, stride, c, eob)
;   Each `call .main` processes one 16-byte-wide, 8-row tile (two passes of
;   four rows). Tiles are visited left/right within a band of eight rows,
;   then the next band; after each eob test the function returns early when
;   the remaining eob is negative (thresholds 36, 143, 271, 399).
%if ARCH_X86_32
    LEA r6, $$
%else
    mova m8, [o(pw_2896x8)]
    mova m9, [o(pw_1697x16)]
    mova m11, [o(pw_8192)]
%endif
    mova m7, [o(pixel_10bpc_max)]
    lea r4, [strideq*3]
    pxor m6, m6
%if ARCH_X86_64
    paddw m10, m11, m11 ; pw_16384
%endif
    mov r5, dstq
    call .main
    sub eobd, 36
    jl .ret
    ; .main advanced cq by 32 bytes; jump to the next 128*8-byte block and
    ; write 16 bytes to the right of the band base kept in r5
    add cq, 128*8-32
    lea dstq, [r5+16]
    call .main
    ; back to the first block, next band of eight rows
    sub cq, 128*8
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    add cq, 128*8-32
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    add cq, 128*8-32
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    add cq, 128*8-32
    lea dstq, [r5+16]
    call .main
.ret:
    RET
ALIGN function_align
.main:
    ; pack eight dword vectors (128-byte pitch) into four word vectors
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    ; x = pmulhrsw(x, pw_2896x8), then
    ; x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384) for all four vectors;
    ; x86-32 has no m8..m11, so m6 is borrowed for the constants
%if ARCH_X86_64
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
    pmulhrsw m4, m9, m0
    pmulhrsw m5, m9, m1
    REPX {pmulhrsw x, m10}, m4, m5
%else
    mova m6, [o(pw_2896x8)]
    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
    mova m5, [o(pw_1697x16)]
    pmulhrsw m4, m5, m0
    pmulhrsw m5, m1
    mova m6, [o(pw_16384)]
    REPX {pmulhrsw x, m6 }, m4, m5
%endif
    paddsw m0, m4
    paddsw m1, m5
%if ARCH_X86_64
    pmulhrsw m4, m9, m2
    pmulhrsw m5, m9, m3
    REPX {pmulhrsw x, m10}, m4, m5
%else
    mova m5, [o(pw_1697x16)]
    pmulhrsw m4, m5, m2
    pmulhrsw m5, m3
    REPX {pmulhrsw x, m6 }, m4, m5
%endif
    paddsw m2, m4
    paddsw m3, m5
    ; final scaling by pw_8192; on x86-32 m6 must be zero again afterwards
    ; because .main_zero stores it over the coefficients
%if ARCH_X86_64
    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
%else
    psrlw m6, 1 ; pw_8192
    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
    pxor m6, m6
%endif
    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
    lea dstq, [dstq+strideq*4]
    add cq, 16
    ; bit 16 of eobd toggles so that the body runs twice per call
    btc eobd, 16
    jnc .main
    ret

; inv_txfm_add_identity_identity_32x16_16bpc(dst, stride, c, eob)
;   64-byte coefficient pitch. Two consecutive `call .main` cover the 16 rows
;   of one 16-byte-wide destination column; r5 keeps the original dst so the
;   columns at +16, +32 and +48 bytes can be addressed. Early exits use the
;   same eob thresholds (36, 143, 271, 399).
cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
%if ARCH_X86_32
    LEA r6, $$
%else
    mova m8, [o(pw_2896x8)]
    mova m9, [o(pw_1697x16)]
    mova m10, [o(pw_2048)]
%endif
    mova m7, [o(pixel_10bpc_max)]
    lea r4, [strideq*3]
    pxor m6, m6
    mov r5, dstq
    call .main
    sub eobd, 36
    jl .ret
    call .main
    ; two calls moved cq by 64 bytes; step to the next 64*8-byte block
    add cq, 64*8-64
    lea dstq, [r5+16*1]
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    call .main
    add cq, 64*8-64
    lea dstq, [r5+16*2]
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    call .main
    add cq, 64*8-64
    lea dstq, [r5+16*3]
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    call .main
.ret:
    RET
ALIGN function_align
.main:
    mova m0, [cq+64*0]
    packssdw m0, [cq+64*1]
    mova m1, [cq+64*2]
    packssdw m1, [cq+64*3]
    mova m2, [cq+64*4]
    packssdw m2, [cq+64*5]
    mova m3, [cq+64*6]
    packssdw m3, [cq+64*7]
%if ARCH_X86_64
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
%else
    mova m6, [o(pw_2896x8)]
    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
%endif
    ; x = 2*x (saturating); then x = 2*x + pmulhrsw(x, pw_1697x16), where the
    ; product is taken before the second doubling
    REPX {paddsw x, x }, m0, m1, m2, m3
%if ARCH_X86_64
    pmulhrsw m4, m9, m0
    pmulhrsw m5, m9, m1
%else
    mova m6, [o(pw_1697x16)]
    pmulhrsw m4, m6, m0
    pmulhrsw m5, m6, m1
%endif
    REPX {paddsw x, x }, m0, m1
    paddsw m0, m4
    paddsw m1, m5
%if ARCH_X86_64
    pmulhrsw m4, m9, m2
    pmulhrsw m5, m9, m3
%else
    ; m6 itself receives the last product on x86-32
    pmulhrsw m4, m6, m2
    pmulhrsw m6, m3
%endif
    REPX {paddsw x, x }, m2, m3
    paddsw m2, m4
%if ARCH_X86_64
    paddsw m3, m5
    REPX {pmulhrsw x, m10}, m0, m1, m2, m3
%else
    paddsw m3, m6
    mova m6, [o(pw_2048)]
    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
    pxor m6, m6
%endif
    ; m6 is zero on both paths here: clear the consumed coefficients
    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
    lea dstq, [dstq+strideq*4]
    add cq, 16
    btc eobd, 16
    jnc .main
    ret

; inv_txfm_add_identity_identity_32x32_16bpc(dst, stride, c, eob)
;   Each `call .main` handles one 16-byte-wide, 8-row tile; `.main2` first
;   steps back one 128*8-byte coefficient block and 16 destination bytes, so
;   that (dst having moved down eight rows) the tile below-left is next.
;   Tiles are thus visited along anti-diagonals, with an early return after
;   each diagonal when eob is below 36/136/300/535/755/911.
;   NOTE(review): the digit comments look like a per-stage table of
;   diagonal indices whose column alignment was lost; kept verbatim.
cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
%undef cmp
%if ARCH_X86_32
    LEA r6, $$
%endif
    mova m5, [o(pw_8192)]
    mova m7, [o(pixel_10bpc_max)]
    pxor m6, m6
    lea r4, [strideq*3]
    mov r5, dstq
    call .main ; 0
    cmp eobd, 36
    jl .ret
    add cq, 128*8-32 ; 0 1
    lea dstq, [r5+16] ; 1
    call .main
    call .main2
    cmp eobd, 136
    jl .ret
    add cq, 128*16-64 ; 0 1 2
    lea dstq, [r5+16*2] ; 1 2
    call .main ; 2
    call .main2
    call .main2
    cmp eobd, 300
    jl .ret
    add cq, 128*24-96 ; 0 1 2 3
    add r5, 16*3 ; 1 2 3
    mov dstq, r5 ; 2 3
    call .main ; 3
    call .main2
    call .main2
    call .main2
    cmp eobd, 535
    jl .ret
    ; from here on each diagonal starts in the rightmost column (r5 was moved
    ; there above) and one band of eight rows further down
    add cq, 128*24-96 ; 0 1 2 3
    lea dstq, [r5+strideq*8] ; 1 2 3 4
    mov r5, dstq ; 2 3 4
    call .main ; 3 4
    call .main2
    call .main2
    cmp eobd, 755
    jl .ret
    add cq, 128*16-64 ; 0 1 2 3
    lea dstq, [r5+strideq*8] ; 1 2 3 4
    mov r5, dstq ; 2 3 4 5
    call .main ; 3 4 5
    call .main2
    cmp eobd, 911
    jl .ret
    add cq, 128*8-32 ; 0 1 2 3
    lea dstq, [r5+strideq*8] ; 1 2 3 4
    call .main ; 2 3 4 5
.ret: ; 3 4 5 6
    RET
ALIGN function_align
.main2:
    sub cq, 128*8
    sub dstq, 16
.main:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    ; pmulhrsw by pw_8192 == (x + 2) >> 2
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
    lea dstq, [dstq+strideq*4]
    add cq, 16
    btc eobd, 16
    jnc .main
    ret

; inv_txfm_add_dct_dct_8x32_16bpc(dst, stride, c, eob)
;   Reserves 36*16 bytes of stack. eob == 0 goes to .dconly. Otherwise a
;   first pass runs over r5 = 14, 12, ..., 0 (skipping, and zero-filling the
;   outputs of, leading iterations whose tbl_8x32_2d threshold exceeds eob),
;   followed by a second pass that reuses 8-bit SSSE3 kernels.
;   r6 is the base for the o2() table addressing; t0/t1 are temporaries.
cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
                                         dst, stride, c, eob
%if ARCH_X86_32
    LEA r6, $$
%define base $$
    DECLARE_REG_TMP 0, 4
%else
    lea r6, [tbl_Nx32_odd_offset]
%define base tbl_Nx32_odd_offset
    DECLARE_REG_TMP 4, 7
%if WIN64
    mov [rsp+gprsize*1+35*16], r7
%endif
%endif
%define o2(x) r6-base+x
    test eobd, eobd
    jz .dconly
%if ARCH_X86_32
    ; r0 is saved here and reloaded before the second pass
    mov [rsp+gprsize*1+35*16], r0
%endif
%undef cmp
    ; remove entirely-zero iterations
    mov r5d, 7*2
    cmp eobw, word [o2(tbl_8x32_2d)+r5]
    jge .end_zero_loop
    pxor m0, m0
.zero_loop:
    ; each tbl_Nx32_odd_offset word packs two byte offsets: low byte -> t1,
    ; high byte -> t0; zero the four stack slots this iteration would fill
    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
    movzx t1d, t0b
    shr t0d, 8
    mova [rsp+ 3*16+r5*8], m0
    mova [rsp+11*16+r5*8], m0
    mova [rsp+ 3*16+t0*8], m0
    mova [rsp+ 3*16+t1*8], m0
    sub r5d, 2
    cmp eobw, word [o2(tbl_8x32_2d)+r5]
    jl .zero_loop
.end_zero_loop:
    ; actual first pass after skipping all-zero data
    mov [rsp+gprsize*0+35*16], eobd
    mov r3, rsp
.loop_pass1:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    ; eight coefficient vectors at 128-byte pitch, column offset r5*8
    mova m0, [cq+0*128+r5*8]
    mova m1, [cq+1*128+r5*8]
    mova m2, [cq+2*128+r5*8]
; (continuation of .loop_pass1 of inv_txfm_add_dct_dct_8x32_16bpc; the first
; three loads of this group are on the preceding line of the file)
    mova m3, [cq+3*128+r5*8]
    mova m4, [cq+4*128+r5*8]
    mova m5, [cq+5*128+r5*8]
    mova m6, [cq+6*128+r5*8]
    mova m7, [cq+7*128+r5*8]
    call m(idct_8x4_internal_16bpc).main_pass1
    ; add the rounding constant 2 ahead of the >> 2 below
    mova m1, [o(pd_2)]
    REPX {paddd x, m1}, m0, m6, m5, m3
    call m(idct_8x4_internal_16bpc).round
    REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    ; scatter the four packed vectors into the stack area read by the second
    ; pass; two of the slots come from the packed byte offsets in the table
    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
    movzx t1d, t0b
    shr t0d, 8
    mova [r3+ 3*16+r5*8], m0
    mova [r3+11*16+r5*8], m2
    mova [r3+ 3*16+t1*8], m1
    mova [r3+ 3*16+t0*8], m3
    ; clear the coefficients consumed by this iteration
    pxor m7, m7
    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
    sub r5d, 2
    jge .loop_pass1

    ; pass 2 code starts here
    ; m0 is already loaded from last iteration of first pass
%if ARCH_X86_32
    mov r0, [rsp+gprsize*1+35*16]
%endif
    ; pick the 8-bit 8x32 kernel variant from eob: <43 veryfast, <107 fast
    mov eobd, [rsp+gprsize*0+35*16]
    cmp eobd, 43
    jl .load_veryfast
    cmp eobd, 107
    jl .load_fast
    ; load normal
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
    jmp .run
.load_fast:
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
    jmp .run
.load_veryfast:
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
    ; fall-through
.run:
    call .pass2
%if WIN64
    mov r7, [rsp+gprsize*1+35*16]
%endif
    RET
.pass2:
    ; Second pass for one 8-wide column; also called from
    ; inv_txfm_add_dct_dct_16x32_16bpc. Expects m0 preloaded, the remaining
    ; inputs on the stack and the 8x32 kernel variant in r4. Offsets carry
    ; +gprsize because this is entered through `call`.
%if ARCH_X86_32
    lea r5, [o(itx8_start)]
%endif
    mova m1, [rsp+gprsize+16* 4]
    mova m2, [rsp+gprsize+16* 5]
    mova m3, [rsp+gprsize+16* 6]
    mova m4, [rsp+gprsize+16* 7]
    mova m5, [rsp+gprsize+16* 8]
    mova m6, [rsp+gprsize+16* 9]
    mova m7, [rsp+gprsize+16*10]
    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    mova [rsp+gprsize+ 3*16], m0
    mova [rsp+gprsize+ 4*16], m1
    mova [rsp+gprsize+ 5*16], m2
    mova [rsp+gprsize+ 6*16], m3
    mova [rsp+gprsize+ 7*16], m4
    mova [rsp+gprsize+ 8*16], m5
    mova [rsp+gprsize+ 9*16], m6
    mova m0, [rsp+gprsize+11*16]
    mova m1, [rsp+gprsize+12*16]
    mova m2, [rsp+gprsize+13*16]
    mova m3, [rsp+gprsize+14*16]
    mova m4, [rsp+gprsize+15*16]
    mova m5, [rsp+gprsize+16*16]
    mova m6, [rsp+gprsize+17*16]
    mova m7, [rsp+gprsize+18*16]
    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
    mova m7, [rsp+gprsize+ 0*16]
    mova [rsp+gprsize+11*16], m0
    mova [rsp+gprsize+12*16], m1
    mova [rsp+gprsize+13*16], m2
    mova [rsp+gprsize+14*16], m3
    mova [rsp+gprsize+15*16], m4
    mova [rsp+gprsize+16*16], m5
    mova [rsp+gprsize+17*16], m6
    mova [rsp+gprsize+18*16], m7
    ; 8x32 kernel variant chosen by the caller
    call r4
%if ARCH_X86_64
    mova m8, [o(pw_2048)]
    pxor m9, m9
    mova m10, [o(pixel_10bpc_max)]
%endif
    lea r3, [strideq*3]
    ; four 8-row write-outs, moving dst down eight rows between them; the
    ; last three reload their inputs from stack slots 11-18, 19-26 and 27-34
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea dstq, [dstq+strideq*8]
    mova m0, [rsp+gprsize+11*16]
    mova m1, [rsp+gprsize+12*16]
    mova m2, [rsp+gprsize+13*16]
    mova m3, [rsp+gprsize+14*16]
    mova m4, [rsp+gprsize+15*16]
    mova m5, [rsp+gprsize+16*16]
    mova m6, [rsp+gprsize+17*16]
    mova m7, [rsp+gprsize+18*16]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea dstq, [dstq+strideq*8]
    mova m0, [rsp+gprsize+19*16]
    mova m1, [rsp+gprsize+20*16]
    mova m2, [rsp+gprsize+21*16]
    mova m3, [rsp+gprsize+22*16]
    mova m4, [rsp+gprsize+23*16]
    mova m5, [rsp+gprsize+24*16]
    mova m6, [rsp+gprsize+25*16]
    mova m7, [rsp+gprsize+26*16]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    lea dstq, [dstq+strideq*8]
    mova m0, [rsp+gprsize+27*16]
    mova m1, [rsp+gprsize+28*16]
    mova m2, [rsp+gprsize+29*16]
    mova m3, [rsp+gprsize+30*16]
    mova m4, [rsp+gprsize+31*16]
    mova m5, [rsp+gprsize+32*16]
    mova m6, [rsp+gprsize+33*16]
    mova m7, [rsp+gprsize+34*16]
    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    ret
.dconly:
    ; eob == 0 path: r5d = (c[0]*181 + 640) >> 10, c[0] is cleared, and the
    ; shared tail of the 8x8 function finishes the job. The rsp adjustment
    ; presumably matches that function's stack frame -- confirm.
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 8
    add r5d, 640
    sar r5d, 10
    add rsp, (31+2*ARCH_X86_64)*16
    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2

; inv_txfm_add_dct_dct_16x32_16bpc(dst, stride, c, eob)
;   Reserves 77*16 bytes of stack. eob == 0 goes to .dconly. The first pass
;   runs over r5 = 14, 12, ..., 0 (zero-filling the outputs of leading
;   iterations whose tbl_16x32_2d threshold exceeds eob) and produces two
;   sets of packed vectors, stored from rsp+12*16 and rsp+44*16. The second
;   pass calls the 8x32 .pass2 routine twice, once per 8-wide column,
;   popping 32*16 bytes of stack after each call.
cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
                                          dst, stride, c, eob
    LEA r6, base
    test eobd, eobd
    jz .dconly
%if ARCH_X86_32
    mov [rsp+gprsize*1+76*16], r0
%elif WIN64
    mov [rsp+gprsize*1+76*16], r7
%endif
%undef cmp
    ; remove entirely-zero iterations
    mov r5d, 7*2
    cmp eobw, word [o2(tbl_16x32_2d)+r5]
    jge .end_zero_loop
    pxor m0, m0
.zero_loop:
    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
    movzx t1d, t0b
    shr t0d, 8
    mova [rsp+12*16+r5*8], m0
    mova [rsp+20*16+r5*8], m0
    mova [rsp+12*16+t0*8], m0
    mova [rsp+12*16+t1*8], m0
    mova [rsp+44*16+r5*8], m0
    mova [rsp+52*16+r5*8], m0
    mova [rsp+44*16+t0*8], m0
    mova [rsp+44*16+t1*8], m0
    sub r5d, 2
    cmp eobw, word [o2(tbl_16x32_2d)+r5]
    jl .zero_loop
.end_zero_loop:
    ; actual first pass after skipping all-zero data
    mov [rsp+gprsize*0+76*16], eobd
    mov r3, rsp
.loop_pass1:
%if ARCH_X86_64
    mova m11, [o(pd_2048)]
    mova m12, [o(clip_18b_min)]
    mova m13, [o(clip_18b_max)]
    mova m14, [o(pd_2896)]
%endif
    ; odd-numbered 128-byte groups feed the odd half of the 16-point DCT
    mova m0, [cq+ 1*128+r5*8]
    mova m1, [cq+ 3*128+r5*8]
    mova m2, [cq+ 5*128+r5*8]
    mova m3, [cq+ 7*128+r5*8]
    mova m4, [cq+ 9*128+r5*8]
    mova m5, [cq+11*128+r5*8]
    mova m6, [cq+13*128+r5*8]
    mova m7, [cq+15*128+r5*8]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call m(idct_16x4_internal_16bpc).main_oddhalf
    ; even-numbered groups go through the 8-point first-pass kernel
    mova m0, [cq+ 0*128+r5*8]
    mova m1, [cq+ 2*128+r5*8]
    mova m2, [cq+ 4*128+r5*8]
    mova m3, [cq+ 6*128+r5*8]
    mova m4, [cq+ 8*128+r5*8]
    mova m5, [cq+10*128+r5*8]
    mova m6, [cq+12*128+r5*8]
    mova m7, [cq+14*128+r5*8]
    call m(idct_8x4_internal_16bpc).rect2_mul
    call m(idct_8x4_internal_16bpc).main_pass1
    call m(idct_8x4_internal_16bpc).round
    call m(idct_16x4_internal_16bpc).round
%if ARCH_X86_64
    packssdw m0, m1
    packssdw m2, m3
    packssdw m4, m5
    packssdw m6, m7
    packssdw m8, m9
    packssdw m10, m11
    packssdw m12, m13
    packssdw m14, m15
%endif
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
    movzx t1d, t0b
    shr t0d, 8
%if ARCH_X86_64
    mova [rsp+12*16+r5*8], m0
    mova [rsp+20*16+r5*8], m2
    mova [rsp+12*16+t1*8], m1
    mova [rsp+12*16+t0*8], m3
    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
    mova [rsp+44*16+r5*8], m8
    mova [rsp+52*16+r5*8], m10
    mova [rsp+44*16+t1*8], m9
    mova [rsp+44*16+t0*8], m11
%else
    ; x86-32: registers hold the set stored from rsp+44*16; the other set is
    ; fetched from [r3+8*16]..[r3+11*16] and transposed separately
    mova [rsp+44*16+r5*8], m0
    mova [rsp+52*16+r5*8], m2
    mova [rsp+44*16+t1*8], m1
    mova [rsp+44*16+t0*8], m3
    mova m0, [r3+ 8*16]
    mova m2, [r3+ 9*16]
    mova m4, [r3+10*16]
    mova m6, [r3+11*16]
    call m(idct_8x4_internal_16bpc).transpose4x8packed
    mova [rsp+12*16+r5*8], m0
    mova [rsp+20*16+r5*8], m2
    mova [rsp+12*16+t1*8], m1
    mova [rsp+12*16+t0*8], m3
%endif
    ; clear all 16 coefficient groups consumed by this iteration
    pxor m7, m7
    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    sub r5d, 2
    jge .loop_pass1

    ; pass=2
    ; rsp moves up by 9*16, so the slots saved at 76*16 are now at 67*16 and
    ; the data stored from rsp+12*16 now starts at rsp+3*16
    add rsp, 9*16
%if ARCH_X86_64
    mov r6, dstq
%else
    mov dstq, [rsp+gprsize*1+67*16]
%endif
    ; kernel variant from eob: <44 veryfast, <151 fast, otherwise full
    mov eobd, [rsp+gprsize*0+67*16]
    cmp eobd, 44
    jl .load_veryfast
    cmp eobd, 151
    jl .load_fast
    ; load normal
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
    jmp .run
.load_fast:
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
    jmp .run
.load_veryfast:
    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
    ; fall-through
.run:
    ; column loop state: x86-64 keeps r2 = dst+32 and r7 = -4 (stepped by 2);
    ; x86-32 keeps a counter of 2 and the running dst in two stack slots
%if ARCH_X86_64
    lea r2, [dstq+32]
    mov r7, -4
%else
    lea r2, [rsp+67*16]
    mov dword [r2+0*gprsize], 2
%endif
    jmp .loop_pass2_entry
.loop_pass2:
    ; the first column enters with m0 still live; later ones reload it
    mova m0, [rsp+16* 3]
.loop_pass2_entry:
%if ARCH_X86_32
    mov dstq, [r2+1*gprsize]
%endif
    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
    ; drop the 32 slots just consumed so the next column sits at the same
    ; rsp-relative offsets
    add rsp, 32*16
%if ARCH_X86_64
    ; lea leaves the flags from `add r7, 2` intact: loop while r7 < 0;
    ; dst for the second column is r2-16, i.e. the original dst+16
    add r7, 2
    lea dstq, [r2+r7*8]
    jl .loop_pass2
%if WIN64
    mov r7, [rsp+gprsize*1+3*16]
%endif
%else
    add dword [r2+1*gprsize], 16
    dec dword [r2+0*gprsize]
    jg .loop_pass2
%endif
    ; rsp was moved by 9*16 + 2*32*16 = 73*16 in total: adjust the assembler's
    ; stack bookkeeping symbols to match before RET
%assign stack_size (stack_size-73*16)
%if STACK_ALIGNMENT >= 16
%assign stack_size_padded (stack_size_padded-73*16)
%assign stack_offset (stack_offset-73*16)
%else
%xdefine rstkm [rsp + stack_size]
%endif
    RET
.dconly:
    ; eob == 0 path: r5d = ((c[0]*181 + 128) >> 8) * 181, c[0] is cleared,
    ; r3d = 32, then the 16x4 function's .dconly tail takes over. The rsp
    ; adjustment presumably matches that function's stack frame -- confirm.
    imul r5d, [cq], 181
    mov [cq], eobd ; 0
    mov r3d, 32
    add r5d, 128
    sar r5d, 8
    imul r5d, 181
    add rsp, (65+4*ARCH_X86_64)*16
    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly

; inv_txfm_add_dct_dct_32x8_16bpc(dst, stride, c, eob)
;   Only the prologue is inside this span; the body continues on the
;   following lines of the file. Stack reservation is 24*16 bytes on x86-64
;   and 32*16 bytes on x86-32. r5d starts at 2 when eob >= 10, else 0.
cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
                                         dst, stride, c, eob
%if ARCH_X86_32
    LEA r6, $$
%endif
    test eobd, eobd
    jz .dconly

    ; remove entirely-zero iterations
%undef cmp
%if ARCH_X86_64
    xor r5d, r5d
    cmp eobd, 10
    setge r5b
%else
    ; 1 minus the borrow of (eob - 10): 1 if eob >= 10, else 0
    mov r5d, 1
    cmp eobd, 10
    sbb r5d, 0
%endif
    add r5d, r5d

    ; actual first pass after skipping all-zero data
.loop_pass1:
    mova m0,
[cq+32* 1+r5*8] mova m1, [cq+32* 7+r5*8] mova m2, [cq+32* 9+r5*8] mova m3, [cq+32*15+r5*8] mova m4, [cq+32*17+r5*8] mova m5, [cq+32*23+r5*8] mova m6, [cq+32*25+r5*8] mova m7, [cq+32*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp call .main_oddhalf_part1 mova m0, [cq+32* 3+r5*8] mova m1, [cq+32* 5+r5*8] mova m2, [cq+32*11+r5*8] mova m3, [cq+32*13+r5*8] mova m4, [cq+32*19+r5*8] mova m5, [cq+32*21+r5*8] mova m6, [cq+32*27+r5*8] mova m7, [cq+32*29+r5*8] call .main_oddhalf_part2 mova m0, [cq+32* 2+r5*8] mova m1, [cq+32* 6+r5*8] mova m2, [cq+32*10+r5*8] mova m3, [cq+32*14+r5*8] mova m4, [cq+32*18+r5*8] mova m5, [cq+32*22+r5*8] mova m6, [cq+32*26+r5*8] mova m7, [cq+32*30+r5*8] add r3, 16*(16+4*ARCH_X86_32) call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+32* 0+r5*8] mova m1, [cq+32* 4+r5*8] mova m2, [cq+32* 8+r5*8] mova m3, [cq+32*12+r5*8] mova m4, [cq+32*16+r5*8] mova m5, [cq+32*20+r5*8] mova m6, [cq+32*24+r5*8] mova m7, [cq+32*28+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call .round_dct32 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32* 8+r5*8], m8 mova [cq+32* 9+r5*8], m9 mova [cq+32*10+r5*8], m10 mova [cq+32*11+r5*8], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32* 4+r5*8], m8 mova [cq+32* 5+r5*8], m9 mova [cq+32* 6+r5*8], m10 mova [cq+32* 7+r5*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32*12+r5*8], m8 mova [cq+32*13+r5*8], m9 mova [cq+32*14+r5*8], m10 mova [cq+32*15+r5*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, 
[r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32* 4+r5*8], m0 mova [cq+32* 5+r5*8], m1 mova [cq+32* 6+r5*8], m2 mova [cq+32* 7+r5*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32* 8+r5*8], m0 mova [cq+32* 9+r5*8], m1 mova [cq+32*10+r5*8], m2 mova [cq+32*11+r5*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32*12+r5*8], m0 mova [cq+32*13+r5*8], m1 mova [cq+32*14+r5*8], m2 mova [cq+32*15+r5*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif pxor m7, m7 ; clear lower half of [cq] REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 test r5d, r5d jz .end_pass1 mova [cq+32* 0+r5*8], m0 mova [cq+32* 1+r5*8], m1 mova [cq+32* 2+r5*8], m2 mova [cq+32* 3+r5*8], m3 sub r5d, 2 jmp .loop_pass1 .end_pass1: ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code mov r4d, 4 call m(idct_16x8_internal_16bpc).pass2_main RET .main_oddhalf_part1_fast: ; lower half zero pmulld m7, m0, [o(pd_4091)] pmulld m0, [o(pd_201)] pmulld m4, m3, [o(pd_m2751)] %if ARCH_X86_32 pmulld m3, [o(pd_3035)] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m7 REPX {psrad x, 12}, m0, m7 mova [r3+3*16], m7 mova m7, m3 mova m3, m5 %else pmulld m3, [o(pd_3035)] %endif pmulld m6, m1, [o(pd_m1380)] 
pmulld m1, [o(pd_3857)] pmulld m5, m2, [o(pd_3703)] pmulld m2, [o(pd_1751)] jmp .main_oddhalf_part1_fast2 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 %if ARCH_X86_64 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a .main_oddhalf_part1_fast2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 psubd m8, m0, m4 ; t17 paddd m0, m4 ; t16 psubd m4, m6, m2 ; t18 paddd m6, m2 ; t19 psubd m2, m1, m5 ; t29 paddd m1, m5 ; t28 psubd m5, m7, m3 ; t30 paddd m7, m3 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 mova m15, [o(pd_4017)] mova m10, [o(pd_799)] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a paddd m7, m1 ; t31a psubd m1, m5, m4 ; t18 paddd m5, m4 ; t17 psubd m4, m8, m2 ; t29 paddd m8, m2 ; t30 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 mova [r3+16*0], m0 mova [r3+16*1], m5 mova [r3+16*2], m4 mova [r3+16*3], m6 mova [r3+16*4], m3 mova [r3+16*5], m1 mova [r3+16*6], m8 mova [r3+16*7], m7 %else mova [r3+0*16], m2 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m5 mova m3, [o(pd_2048)] ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a mova m4, [r3+2*16] mova m5, [r3+3*16] mova [r3+2*16], m6 mova [r3+3*16], m7 mova m2, [r3+0*16] mova m7, [r3+1*16] mova [r3+0*16], m0 mova [r3+1*16], m1 ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 4, 7, 
0, 1, 6, _, 3035, 2751 ; t17a, t30a mova m0, [r3+0*16] mova m1, [r3+1*16] mova m6, [r3+2*16] .main_oddhalf_part1_fast2: REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7 REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7 psubd m3, m0, m4 ; t17 mova [r3+0*16], m3 mova m3, [r3+3*16] paddd m0, m4 ; t16 psubd m4, m6, m2 ; t18 paddd m6, m2 ; t19 psubd m2, m1, m5 ; t29 paddd m1, m5 ; t28 psubd m5, m3, m7 ; t30 paddd m7, m3 ; t31 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+0*16] mova [r3+0*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+2*16], m6 mova [r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a psubd m1, m5, m4 ; t18 paddd m5, m4 ; t17 psubd m4, m3, m2 ; t29 paddd m3, m2 ; t30 mova m0, [r3+0*16] mova m2, [r3+1*16] mova m6, [r3+2*16] mova m7, [r3+3*16] mova [r3+0*16], m3 psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m2 ; t28a paddd m7, m2 ; t31a mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+0*16] mova [r3+0*16], m2 mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+0*16] mova [r3+16*0], m0 mova [r3+16*1], m5 mova [r3+16*6], m2 mova [r3+16*7], m7 mova m7, [o(pd_2048)] ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28 mova [r3+16*2], m4 mova [r3+16*3], m6 mova [r3+16*4], m3 mova [r3+16*5], m1 %endif ret .main_oddhalf_part2_fast: ; lower half zero pmulld m7, m0, [o(pd_m601)] pmulld m0, [o(pd_4052)] pmulld m4, m3, [o(pd_3290)] %if ARCH_X86_32 pmulld m3, [o(pd_2440)] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m7 REPX {psrad x, 12}, m0, m7 mova [r3+11*16], m7 mova m7, m3 mova m3, m5 %else pmulld m3, [o(pd_2440)] %endif pmulld m6, m1, [o(pd_3973)] pmulld m1, [o(pd_995)] pmulld m5, m2, 
[o(pd_m2106)] pmulld m2, [o(pd_3513)] jmp .main_oddhalf_part2_fast2 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 %if ARCH_X86_64 ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a .main_oddhalf_part2_fast2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 psubd m8, m0, m4 ; t25 paddd m0, m4 ; t24 psubd m4, m6, m2 ; t26 paddd m6, m2 ; t27 psubd m2, m1, m5 ; t21 paddd m1, m5 ; t20 psubd m5, m7, m3 ; t22 paddd m7, m3 ; t23 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 mova m15, [o(pd_2276)] mova m10, [o(pd_3406)] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m1 ; t20a paddd m7, m1 ; t23a psubd m1, m5, m4 ; t21 paddd m5, m4 ; t22 psubd m4, m8, m2 ; t26 paddd m8, m2 ; t25 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 mova m9, [r3+16*0] ; t16a mova m10, [r3+16*1] ; t17 psubd m2, m9, m7 ; t23 paddd m9, m7 ; t16 psubd m7, m10, m5 ; t22a paddd m10, m5 ; t17a REPX {pmaxsd x, m12}, m9, m10, m2, m7 REPX {pminsd x, m13}, m9, m10, m2, m7 mova [r3+16*0], m9 mova [r3+16*1], m10 mova m9, [r3+16*2] ; t18a mova m10, [r3+16*3] ; t19 psubd m5, m9, m1 ; t21 paddd m9, m1 ; t18 psubd m1, m10, m6 ; t20a paddd m10, m6 ; t19a REPX {pmaxsd x, m12}, m9, m10, m5, m1 REPX {pminsd x, m13}, m9, m10, m5, m1 mova [r3+16*2], m9 mova [r3+16*3], m10 mova m9, [r3+16*4] ; t28 mova m10, [r3+16*5] ; t29a psubd m6, m9, m3 ; t27a paddd m9, m3 ; t28a psubd m3, m10, m4 ; 
t26 paddd m10, m4 ; t29 REPX {pmaxsd x, m12}, m9, m10, m6, m3 REPX {pminsd x, m13}, m9, m10, m6, m3 REPX {pmulld x, m14}, m6, m3, m1, m5 paddd m6, m11 paddd m3, m11 psubd m4, m6, m1 ; t20 paddd m6, m1 ; t27 psubd m1, m3, m5 ; t21a paddd m3, m5 ; t26a REPX {psrad x, 12 }, m4, m1, m3, m6 mova [r3+16*4], m4 mova [r3+16*5], m1 mova m4, [r3+16*6] ; t30 mova m1, [r3+16*7] ; t31a psubd m5, m4, m8 ; t25a paddd m4, m8 ; t30a psubd m8, m1, m0 ; t24 paddd m1, m0 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m1 REPX {pminsd x, m13}, m8, m5, m4, m1 REPX {pmulld x, m14}, m5, m8, m7, m2 paddd m5, m11 paddd m8, m11 psubd m0, m5, m7 ; t22 paddd m5, m7 ; t25 psubd m7, m8, m2 ; t23a paddd m2, m8 ; t24a REPX {psrad x, 12 }, m0, m7, m2, m5 mova [r3+16*6], m0 mova [r3+16*7], m7 mova [r3+16*8], m2 mova [r3+16*9], m5 mova [r3+16*10], m3 mova [r3+16*11], m6 mova [r3+16*12], m9 mova [r3+16*13], m10 mova [r3+16*14], m4 mova [r3+16*15], m1 %else mova [r3+ 8*16], m2 mova [r3+ 9*16], m3 mova [r3+10*16], m4 mova [r3+11*16], m5 mova m3, [o(pd_2048)] ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a mova m2, [r3+ 8*16] mova m4, [r3+10*16] mova m5, [r3+11*16] mova [r3+ 8*16], m0 mova [r3+10*16], m6 mova [r3+11*16], m7 mova m7, [r3+ 9*16] mova [r3+ 9*16], m1 ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a mova m0, [r3+ 8*16] mova m1, [r3+ 9*16] mova m6, [r3+10*16] .main_oddhalf_part2_fast2: REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6 REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6 psubd m3, m0, m4 ; t25 mova [r3+ 8*16], m3 mova m3, [r3+11*16] paddd m0, m4 ; t24 psubd m4, m6, m2 ; t26 paddd m6, m2 ; t27 psubd m2, m1, m5 ; t21 paddd m1, m5 ; t20 psubd m5, m3, m7 ; t22 paddd m7, m3 ; t23 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+ 8*16] mova [r3+ 8*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+ 
8*16] mova [r3+ 8*16], m0 mova [r3+ 9*16], m1 mova [r3+10*16], m6 mova [r3+11*16], m7 mova m7, [o(pd_2048)] ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a psubd m1, m5, m4 ; t21 paddd m5, m4 ; t22 psubd m4, m3, m2 ; t26 paddd m3, m2 ; t25 mova m0, [r3+ 8*16] mova m2, [r3+ 9*16] mova m6, [r3+10*16] mova m7, [r3+11*16] mova [r3+ 8*16], m3 psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m2 ; t20a paddd m7, m2 ; t23a mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+ 8*16] mova [r3+ 8*16], m2 mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+ 8*16] mova [r3+ 8*16], m0 mova [r3+ 9*16], m2 mova [r3+14*16], m5 mova [r3+15*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 mova [r3+10*16], m3 mova m0, [o(clip_18b_min)] mova m2, [o(clip_18b_max)] mova m5, [r3+16*2] ; t18a mova m7, [r3+16*3] ; t19 psubd m3, m5, m1 ; t21 paddd m5, m1 ; t18 psubd m1, m7, m6 ; t20a paddd m7, m6 ; t19a REPX {pmaxsd x, m0}, m5, m7, m3, m1 REPX {pminsd x, m2}, m5, m7, m3, m1 mova [r3+16*2], m5 mova [r3+16*3], m7 mova [r3+11*16], m3 mova m3, [r3+10*16] mova m5, [r3+16*4] ; t28 mova m7, [r3+16*5] ; t29a psubd m6, m5, m3 ; t27a paddd m5, m3 ; t28a psubd m3, m7, m4 ; t26 paddd m7, m4 ; t29 REPX {pmaxsd x, m0}, m5, m7, m6, m3 REPX {pminsd x, m2}, m5, m7, m6, m3 mova [r3+16*12], m5 mova [r3+16*13], m7 mova m5, [o(pd_2048)] mova m7, [o(pd_2896)] mova m4, [r3+11*16] REPX {pmulld x, m7}, m6, m3, m1, m4 paddd m6, m5 paddd m3, m5 psubd m5, m6, m1 ; t20 paddd m6, m1 ; t27 psubd m1, m3, m4 ; t21a paddd m3, m4 ; t26a REPX {psrad x, 12}, m5, m1, m3, m6 mova [r3+16*4], m5 mova [r3+16*5], m1 mova [r3+16*10], m3 mova [r3+16*11], m6 mova m5, [r3+14*16] mova m6, [r3+15*16] mova m3, [r3+16*0] ; t16a mova m4, [r3+16*1] ; t17 psubd m1, m3, m6 ; t23 paddd m3, m6 ; t16 psubd m6, 
m4, m5 ; t22a paddd m4, m5 ; t17a REPX {pmaxsd x, m0}, m3, m4, m1, m6 REPX {pminsd x, m2}, m3, m4, m1, m6 mova [r3+16*0], m3 mova [r3+16*1], m4 mova m5, [r3+ 8*16] mova m3, [r3+ 9*16] mova [r3+ 8*16], m1 mova [r3+ 9*16], m6 mova m4, [r3+16*6] ; t30 mova m1, [r3+16*7] ; t31a psubd m6, m1, m5 ; t24 paddd m1, m5 ; t31 psubd m5, m4, m3 ; t25a paddd m4, m3 ; t30a REPX {pmaxsd x, m0}, m6, m5, m4, m1 REPX {pminsd x, m2}, m6, m5, m4, m1 mova [r3+16*14], m4 mova [r3+16*15], m1 mova m4, [o(pd_2048)] mova m1, [r3+ 9*16] mova m2, [r3+ 8*16] REPX {pmulld x, m7}, m5, m6, m1, m2 paddd m5, m4 paddd m6, m4 psubd m0, m5, m1 ; t22 paddd m5, m1 ; t25 psubd m1, m6, m2 ; t23a paddd m2, m6 ; t24a REPX {psrad x, 12}, m0, m1, m2, m5 mova [r3+16*6], m0 mova [r3+16*7], m1 mova [r3+16*8], m2 mova [r3+16*9], m5 %endif ret ; final sumsub for idct16 as well as idct32, plus final downshift %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx mova m%4, [r3+16*(23-%1)] pmaxsd m%1, m12 pminsd m%1, m13 psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 pmaxsd m%3, m12 pminsd m%1, m13 pminsd m%3, m13 paddd m%1, m11 paddd m%3, m11 mova m%5, [r3+16*( 0+%1)] mova m%2, [r3+16*(15-%1)] psubd m%4, m%1, m%2 ; out31 - n paddd m%1, m%2 ; out0 + n paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 %endmacro .round_dct32: %if ARCH_X86_64 psrld m11, 10 ; pd_2 IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31 mova [r3+ 0*16], m6 mova [r3+23*16], m7 IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30 packssdw m0, m1 ; 0 1 packssdw m14, m15 ; 14 15 packssdw m8, m6 ; 16 17 packssdw m7, m9 ; 30 31 mova [r3+16*15], m14 mova [r3+16*14], m7 IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29 IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28 packssdw m2, m3 ; 2 3 packssdw m14, m15 ; 12 13 packssdw m10, m1 ; 18 19 packssdw m9, m7 ; 28 29 mova [r3+16*13], m14 mova [r3+16*12], m9 IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27 IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26 packssdw 
m4, m5 ; 4 5 packssdw m14, m15 ; 10 11 packssdw m1, m3 ; 20 21 packssdw m9, m7 ; 26 27 mova [r3+16*11], m14 mova [r3+16*10], m9 mova m6, [r3+ 0*16] mova m7, [r3+23*16] IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25 IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24 packssdw m6, m7 ; 6 7 packssdw m11, m15 ; 8 9 packssdw m14, m3 ; 22 23 packssdw m9, m5 ; 24 25 mova [r3+16*9], m11 mova [r3+16*8], m9 mova m12, m1 ret %else mova [r3+16*16], m0 mova [r3+17*16], m1 mova [r3+18*16], m2 mova [r3+19*16], m3 mova [r3+20*16], m4 mova [r3+21*16], m5 mova [r3+22*16], m6 mova [r3+23*16], m7 mova m1, [o(pd_2)] mova m2, [o(clip_18b_min)] mova m3, [o(clip_18b_max)] mov r4, 15*16 .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] pmaxsd m0, m2 pminsd m0, m3 psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 pmaxsd m5, m2 pminsd m0, m3 pminsd m5, m3 paddd m0, m1 paddd m5, m1 mova m7, [r3] mova m4, [r3+r4] psubd m6, m0, m4 ; out31 - n paddd m0, m4 ; out0 + n paddd m4, m5, m7 ; out15 - n psubd m5, m7 ; out16 + n REPX {psrad x, 2}, m0, m5, m4, m6 mova [r3], m0 mova [r3+r4], m4 mova [r3+16*16], m5 mova [r3+24*16], m6 add r3, 16 sub r4, 32 jg .loop_dct32_end ret %endif .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 .dconly1: add r5d, 640 sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 .dconly_loop: mova m1, [dstq+16*0] mova m2, [dstq+16*1] mova m3, [dstq+16*2] mova m4, [dstq+16*3] REPX {paddw x, m0}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 REPX {pmaxsw x, m5}, m1, m2, m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m2 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, strideq dec r3d jg .dconly_loop RET cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %undef cmp mov r5d, 8 .zero_loop: sub r5d, 2 cmp eobw, word 
[o2(tbl_32x16_2d)+r5] jl .zero_loop ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+64* 1+r5*8] mova m1, [cq+64* 7+r5*8] mova m2, [cq+64* 9+r5*8] mova m3, [cq+64*15+r5*8] mova m4, [cq+64*17+r5*8] mova m5, [cq+64*23+r5*8] mova m6, [cq+64*25+r5*8] mova m7, [cq+64*31+r5*8] mov r3, rsp call m(idct_8x4_internal_16bpc).rect2_mul call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+64* 3+r5*8] mova m1, [cq+64* 5+r5*8] mova m2, [cq+64*11+r5*8] mova m3, [cq+64*13+r5*8] mova m4, [cq+64*19+r5*8] mova m5, [cq+64*21+r5*8] mova m6, [cq+64*27+r5*8] mova m7, [cq+64*29+r5*8] %if ARCH_X86_32 add r3, 16*8 %endif call m(idct_8x4_internal_16bpc).rect2_mul %if ARCH_X86_32 sub r3, 16*8 %endif call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 add r3, 16*(16+4*ARCH_X86_32) mova m0, [cq+64* 2+r5*8] mova m1, [cq+64* 6+r5*8] mova m2, [cq+64*10+r5*8] mova m3, [cq+64*14+r5*8] mova m4, [cq+64*18+r5*8] mova m5, [cq+64*22+r5*8] mova m6, [cq+64*26+r5*8] mova m7, [cq+64*30+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+64* 0+r5*8] mova m1, [cq+64* 4+r5*8] mova m2, [cq+64* 8+r5*8] mova m3, [cq+64*12+r5*8] mova m4, [cq+64*16+r5*8] mova m5, [cq+64*20+r5*8] mova m6, [cq+64*24+r5*8] mova m7, [cq+64*28+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call .round_dct32 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+64* 8+r5*8], m8 mova [cq+64* 9+r5*8], m9 mova [cq+64*10+r5*8], m10 mova [cq+64*11+r5*8], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova 
[cq+64* 4+r5*8], m8 mova [cq+64* 5+r5*8], m9 mova [cq+64* 6+r5*8], m10 mova [cq+64* 7+r5*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+64*12+r5*8], m8 mova [cq+64*13+r5*8], m9 mova [cq+64*14+r5*8], m10 mova [cq+64*15+r5*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64* 4+r5*8], m0 mova [cq+64* 5+r5*8], m1 mova [cq+64* 6+r5*8], m2 mova [cq+64* 7+r5*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64* 8+r5*8], m0 mova [cq+64* 9+r5*8], m1 mova [cq+64*10+r5*8], m2 mova [cq+64*11+r5*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64*12+r5*8], m0 mova [cq+64*13+r5*8], m1 mova [cq+64*14+r5*8], m2 mova [cq+64*15+r5*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [cq+64* 0+r5*8], m0 mova [cq+64* 1+r5*8], m1 mova [cq+64* 2+r5*8], m2 mova [cq+64* 3+r5*8], m3 pxor m0, m0 REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 sub r5d, 2 jge .loop_pass1 ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code call .pass2 RET .pass2: %if ARCH_X86_64 mova m8, 
[o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %if WIN64 mov [rsp+16*16+gprsize], r7 %endif mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 4 jmp m(idct_16x16_internal_16bpc).loop_pass2 .round_dct32: %if ARCH_X86_64 psrld m11, 11 ; pd_1 IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31 mova [r3+ 0*16], m6 mova [r3+23*16], m7 IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30 packssdw m0, m1 ; 0 1 packssdw m14, m15 ; 14 15 packssdw m8, m6 ; 16 17 packssdw m7, m9 ; 30 31 mova [r3+16*15], m14 mova [r3+16*14], m7 IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29 IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28 packssdw m2, m3 ; 2 3 packssdw m14, m15 ; 12 13 packssdw m10, m1 ; 18 19 packssdw m9, m7 ; 28 29 mova [r3+16*13], m14 mova [r3+16*12], m9 IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27 IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26 packssdw m4, m5 ; 4 5 packssdw m14, m15 ; 10 11 packssdw m1, m3 ; 20 21 packssdw m9, m7 ; 26 27 mova [r3+16*11], m14 mova [r3+16*10], m9 mova m6, [r3+ 0*16] mova m7, [r3+23*16] IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25 IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24 packssdw m6, m7 ; 6 7 packssdw m11, m15 ; 8 9 packssdw m14, m3 ; 22 23 packssdw m9, m5 ; 24 25 mova [r3+16*9], m11 mova [r3+16*8], m9 mova m12, m1 ret %else mova [r3+16*16], m0 mova [r3+17*16], m1 mova [r3+18*16], m2 mova [r3+19*16], m3 mova [r3+20*16], m4 mova [r3+21*16], m5 mova [r3+22*16], m6 mova [r3+23*16], m7 pcmpeqd m1, m1 ; -1 mova m2, [o(clip_18b_min)] mova m3, [o(clip_18b_max)] mov r4, 15*16 .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 pmaxsd m5, m2 pminsd m0, m3 pminsd m5, m3 psubd m0, m1 psubd m5, m1 mova m7, [r3] mova m4, [r3+r4] psubd m6, m0, m4 ; out31 - n paddd m0, m4 ; out0 + n paddd m4, m5, m7 ; out15 - n psubd m5, m7 ; out16 + n REPX {psrad x, 1}, m0, m5, m4, m6 mova [r3], m0 mova [r3+r4], m4 mova [r3+16*16], m5 mova [r3+24*16], m6 add r3, 16 sub r4, 32 jg 
.loop_dct32_end ret %endif .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %if ARCH_X86_32 mov [rsp+5*32*16+1*gprsize], dstq %elif WIN64 mov [rsp+5*32*16+1*gprsize], r7 %endif %undef cmp mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 mova [rsp+32*16+r5*8+0*32*16], m0 mova [rsp+40*16+r5*8+0*32*16], m0 mova [rsp+32*16+t0*8+0*32*16], m0 mova [rsp+32*16+t1*8+0*32*16], m0 mova [rsp+32*16+r5*8+1*32*16], m0 mova [rsp+40*16+r5*8+1*32*16], m0 mova [rsp+32*16+t0*8+1*32*16], m0 mova [rsp+32*16+t1*8+1*32*16], m0 mova [rsp+32*16+r5*8+2*32*16], m0 mova [rsp+40*16+r5*8+2*32*16], m0 mova [rsp+32*16+t0*8+2*32*16], m0 mova [rsp+32*16+t1*8+2*32*16], m0 mova [rsp+32*16+r5*8+3*32*16], m0 mova [rsp+40*16+r5*8+3*32*16], m0 mova [rsp+32*16+t0*8+3*32*16], m0 mova [rsp+32*16+t1*8+3*32*16], m0 sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+5*32*16], eobd .loop_pass1: mova m0, [cq+128* 1+r5*8] mova m1, [cq+128* 7+r5*8] mova m2, [cq+128* 9+r5*8] mova m3, [cq+128*15+r5*8] mova m4, [cq+128*17+r5*8] mova m5, [cq+128*23+r5*8] mova m6, [cq+128*25+r5*8] mova m7, [cq+128*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128* 5+r5*8] mova m2, [cq+128*11+r5*8] mova m3, [cq+128*13+r5*8] mova m4, [cq+128*19+r5*8] mova m5, [cq+128*21+r5*8] mova m6, [cq+128*27+r5*8] mova m7, [cq+128*29+r5*8] call 
m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128* 6+r5*8] mova m2, [cq+128*10+r5*8] mova m3, [cq+128*14+r5*8] mova m4, [cq+128*18+r5*8] mova m5, [cq+128*22+r5*8] mova m6, [cq+128*26+r5*8] mova m7, [cq+128*30+r5*8] add r3, 16*(16+4*ARCH_X86_32) call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 4+r5*8] mova m2, [cq+128* 8+r5*8] mova m3, [cq+128*12+r5*8] mova m4, [cq+128*16+r5*8] mova m5, [cq+128*20+r5*8] mova m6, [cq+128*24+r5*8] mova m7, [cq+128*28+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+2*32*16], m8 mova [rsp+40*16+r5*8+2*32*16], m10 mova [rsp+32*16+t1*8+2*32*16], m9 mova [rsp+32*16+t0*8+2*32*16], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+1*32*16], m8 mova [rsp+40*16+r5*8+1*32*16], m10 mova [rsp+32*16+t1*8+1*32*16], m9 mova [rsp+32*16+t0*8+1*32*16], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+3*32*16], m8 mova [rsp+40*16+r5*8+3*32*16], m10 mova [rsp+32*16+t1*8+3*32*16], m9 mova [rsp+32*16+t0*8+3*32*16], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+1*32*16], m0 mova [rsp+40*16+r5*8+1*32*16], m2 mova [rsp+32*16+t1*8+1*32*16], 
m1 mova [rsp+32*16+t0*8+1*32*16], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+2*32*16], m0 mova [rsp+40*16+r5*8+2*32*16], m2 mova [rsp+32*16+t1*8+2*32*16], m1 mova [rsp+32*16+t0*8+2*32*16], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+3*32*16], m0 mova [rsp+40*16+r5*8+3*32*16], m2 mova [rsp+32*16+t1*8+3*32*16], m1 mova [rsp+32*16+t0*8+3*32*16], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif pxor m7, m7 ; clear lower half of [cq] REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 mova [rsp+32*16+r5*8+0*32*16], m0 mova [rsp+40*16+r5*8+0*32*16], m2 mova [rsp+32*16+t1*8+0*32*16], m1 mova [rsp+32*16+t0*8+0*32*16], m3 sub r5d, 2 jge .loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+5*32*16] add rsp, 29*16 cmp eobd, 36 jl .load_veryfast cmp eobd, 136 jl .load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: %if ARCH_X86_64 lea r2, [dstq+64] mov r7, -8 %else lea r2, [rsp+(4*32+3)*16] mov dword [r2+0*gprsize], 4 %endif jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add 
rsp, (5*32+1-(24+8*ARCH_X86_32))*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1 cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0 mov [rsp+gprsize*1+(64*2+12)*16], r0 mov [rsp+gprsize*2+(64*2+12)*16], r1 mov [rsp+gprsize*3+(64*2+12)*16], r2 %else DECLARE_REG_TMP 8, 9, 4, 7 mov [rsp+gprsize*1+(64*2+12)*16], r9 %if WIN64 mov [rsp+gprsize*2+(64*2+12)*16], r7 mov [rsp+gprsize*3+(64*2+12)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 mova [rsp+12*16+t0*8], m0 mova [rsp+12*16+t1*8], m0 mova [rsp+12*16+t2*8], m0 mova [rsp+12*16+t3*8], m0 mova [rsp+76*16+t0*8], m0 mova [rsp+76*16+t1*8], m0 mova [rsp+76*16+t2*8], m0 mova [rsp+76*16+t3*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+(64*2+12)*16], eobd mov r3, rsp %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 6, 0 mov r2, [rsp+gprsize*3+(64*2+12)*16] mov [rsp+gprsize*3+(64*2+12)*16], r6 %endif .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*128+r5*8] mova m1, [cq+ 3*128+r5*8] mova m2, [cq+ 5*128+r5*8] mova m3, [cq+ 7*128+r5*8] mova m4, [cq+ 9*128+r5*8] mova m5, [cq+11*128+r5*8] mova m6, [cq+13*128+r5*8] mova m7, [cq+15*128+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+ 0*128+r5*8] mova m1, [cq+ 2*128+r5*8] mova m2, [cq+ 4*128+r5*8] mova m3, [cq+ 6*128+r5*8] mova m4, [cq+ 8*128+r5*8] mova m5, [cq+10*128+r5*8] mova m6, [cq+12*128+r5*8] mova m7, [cq+14*128+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 
call m(idct_8x4_internal_16bpc).round call m(idct_16x16_internal_16bpc).round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif call m(idct_8x4_internal_16bpc).transpose4x8packed movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 %if ARCH_X86_64 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+76*16+t0*8], m8 mova [rsp+76*16+t1*8], m9 mova [rsp+76*16+t2*8], m10 mova [rsp+76*16+t3*8], m11 %else mova [rsp+76*16+t0*8], m0 mova [rsp+76*16+t1*8], m1 mova [rsp+76*16+t2*8], m2 mova [rsp+76*16+t3*8], m3 mova m0, [rsp+ 8*16] mova m2, [rsp+ 9*16] mova m4, [rsp+10*16] mova m6, [rsp+11*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [rsp+12*16+t0*8], m0 mova [rsp+12*16+t1*8], m1 mova [rsp+12*16+t2*8], m2 mova [rsp+12*16+t3*8], m3 %if ARCH_X86_32 mov r6, [rsp+gprsize*3+(64*2+12)*16] %endif pxor m7, m7 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 sub r5d, 2 jge .loop_pass1 ; pass=2 mov eobd, [rsp+gprsize*0+(64*2+12)*16] cmp eobd, 151 jl .fast ; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: add rsp, 9*16 %if ARCH_X86_64 lea r2, [dstq+32] mov r7, -4 %else lea r2, [rsp+(64*2+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 2 %endif .loop_pass2: %if ARCH_X86_32 mov dstq, [r2+1*gprsize] %endif call .pass2 add rsp, 64*16 %if ARCH_X86_64 add r7, 2 lea dstq, [r2+r7*8] jl .loop_pass2 %else add dword [r2+1*gprsize], 16 dec dword [r2+0*gprsize] jg 
.loop_pass2 %endif %assign stack_size (stack_size-(64*2+9)*16) %if STACK_ALIGNMENT >= 16 %assign stack_size_padded (stack_size_padded-(64*2+9)*16) %assign stack_offset (stack_offset-(64*2+9)*16) %else %xdefine rstkm [rsp + stack_size] %endif %if ARCH_X86_64 mov r9, [rsp+gprsize*1+3*16] %if WIN64 mov r7, [rsp+gprsize*2+3*16] mov r8, [rsp+gprsize*3+3*16] %endif %endif RET .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m0, [rsp+gprsize+16* 3] mova m1, [rsp+gprsize+16* 4] mova m2, [rsp+gprsize+16* 5] mova m3, [rsp+gprsize+16* 6] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main mova [rsp+gprsize+ 3*16], m0 mova [rsp+gprsize+ 4*16], m1 mova [rsp+gprsize+ 5*16], m2 mova [rsp+gprsize+ 6*16], m3 mova [rsp+gprsize+ 7*16], m4 mova [rsp+gprsize+ 8*16], m5 mova [rsp+gprsize+ 9*16], m6 mova [rsp+gprsize+10*16], m7 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] mova m2, [rsp+gprsize+16*13] mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m_suffix(idct_16x8_internal_8bpc, _ssse3).main mova m7, [rsp+gprsize+ 0*16] mova [rsp+gprsize+11*16], m0 mova [rsp+gprsize+12*16], m1 mova [rsp+gprsize+13*16], m2 mova [rsp+gprsize+14*16], m3 mova [rsp+gprsize+15*16], m4 mova [rsp+gprsize+16*16], m5 mova [rsp+gprsize+17*16], m6 mova [rsp+gprsize+18*16], m7 %if ARCH_X86_64 call r8 %else call [r2+4*gprsize] %endif mova [rsp+gprsize+ 3*16], m0 mova [rsp+gprsize+ 5*16], m2 mova [rsp+gprsize+ 8*16], m5 mova [rsp+gprsize+10*16], m7 %if ARCH_X86_64 call r9 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %else call [r2+5*gprsize] %endif lea r3, [strideq*3] lea r4, [rsp+gprsize+ 3*16] %if ARCH_X86_64 mov r6d, 8 %else mov dword [r2+2*gprsize], 8 %endif .loop_write: mova m0, [r4+0*16] mova m1, [r4+1*16] mova m2, [r4+2*16] mova m3, [r4+3*16] mova m4, [r4+4*16] mova m5, [r4+5*16] mova m6, [r4+6*16] mova m7, [r4+7*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 lea dstq, [dstq+strideq*8] add 
r4, 8*16 %if ARCH_X86_64 dec r6d %else dec dword [r2+2*gprsize] %endif jg .loop_write ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add r5d, 640 sar r5d, 10 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0 mov [rsp+gprsize*1+(64*4+32)*16], r0 mov [rsp+gprsize*2+(64*4+32)*16], r1 mov [rsp+gprsize*3+(64*4+32)*16], r2 %else DECLARE_REG_TMP 8, 9, 4, 7 mov [rsp+gprsize*1+(64*4+32)*16], r9 %if WIN64 mov [rsp+gprsize*2+(64*4+32)*16], r7 mov [rsp+gprsize*3+(64*4+32)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 mova [rsp+ 32*16+t0*8], m0 mova [rsp+ 32*16+t1*8], m0 mova [rsp+ 32*16+t2*8], m0 mova [rsp+ 32*16+t3*8], m0 mova [rsp+ 96*16+t0*8], m0 mova [rsp+ 96*16+t1*8], m0 mova [rsp+ 96*16+t2*8], m0 mova [rsp+ 96*16+t3*8], m0 mova [rsp+160*16+t0*8], m0 mova [rsp+160*16+t1*8], m0 mova [rsp+160*16+t2*8], m0 mova [rsp+160*16+t3*8], m0 mova [rsp+224*16+t0*8], m0 mova [rsp+224*16+t1*8], m0 mova [rsp+224*16+t2*8], m0 mova [rsp+224*16+t3*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+(64*4+32)*16], eobd mov r3, rsp %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 6, 0 mov r2, [rsp+gprsize*3+(64*4+32)*16] mov [rsp+gprsize*3+(64*4+32)*16], r6 %endif .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+128* 1+r5*8] mova m1, [cq+128* 7+r5*8] mova m2, [cq+128* 
9+r5*8] mova m3, [cq+128*15+r5*8] mova m4, [cq+128*17+r5*8] mova m5, [cq+128*23+r5*8] mova m6, [cq+128*25+r5*8] mova m7, [cq+128*31+r5*8] mov r3, rsp call m(idct_8x4_internal_16bpc).rect2_mul call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128* 5+r5*8] mova m2, [cq+128*11+r5*8] mova m3, [cq+128*13+r5*8] mova m4, [cq+128*19+r5*8] mova m5, [cq+128*21+r5*8] mova m6, [cq+128*27+r5*8] mova m7, [cq+128*29+r5*8] %if ARCH_X86_32 add r3, 16*8 %endif call m(idct_8x4_internal_16bpc).rect2_mul %if ARCH_X86_32 sub r3, 16*8 %endif call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 add r3, 16*(16+4*ARCH_X86_32) mova m0, [cq+128* 2+r5*8] mova m1, [cq+128* 6+r5*8] mova m2, [cq+128*10+r5*8] mova m3, [cq+128*14+r5*8] mova m4, [cq+128*18+r5*8] mova m5, [cq+128*22+r5*8] mova m6, [cq+128*26+r5*8] mova m7, [cq+128*30+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 4+r5*8] mova m2, [cq+128* 8+r5*8] mova m3, [cq+128*12+r5*8] mova m4, [cq+128*16+r5*8] mova m5, [cq+128*20+r5*8] mova m6, [cq+128*24+r5*8] mova m7, [cq+128*28+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+160*16+t0*8], m8 mova [rsp+160*16+t1*8], m9 mova [rsp+160*16+t2*8], m10 mova [rsp+160*16+t3*8], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+ 96*16+t0*8], m8 mova [rsp+ 96*16+t1*8], m9 mova [rsp+ 96*16+t2*8], m10 
mova [rsp+ 96*16+t3*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+224*16+t0*8], m8 mova [rsp+224*16+t1*8], m9 mova [rsp+224*16+t2*8], m10 mova [rsp+224*16+t3*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+ 96*16+t0*8], m0 mova [rsp+ 96*16+t1*8], m1 mova [rsp+ 96*16+t2*8], m2 mova [rsp+ 96*16+t3*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+160*16+t0*8], m0 mova [rsp+160*16+t1*8], m1 mova [rsp+160*16+t2*8], m2 mova [rsp+160*16+t3*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+224*16+t0*8], m0 mova [rsp+224*16+t1*8], m1 mova [rsp+224*16+t2*8], m2 mova [rsp+224*16+t3*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [rsp+ 32*16+t0*8], m0 mova [rsp+ 32*16+t1*8], m1 mova [rsp+ 32*16+t2*8], m2 mova [rsp+ 32*16+t3*8], m3 pxor m0, m0 REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 %if ARCH_X86_32 mov r6, [rsp+gprsize*3+(64*4+32)*16] %endif sub r5d, 2 jge .loop_pass1 ; pass=2 mov eobd, [rsp+gprsize*0+(64*4+32)*16] cmp eobd, 136 jl .fast 
; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: add rsp, 29*16 %if ARCH_X86_64 lea r2, [dstq+64] mov r7, -8 %else lea r2, [rsp+(64*4+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 4 %endif jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %undef cmp mov r5d, 8 .zero_loop: sub r5d, 2 cmp eobw, word [o2(tbl_32x16_2d)+r5] jl .zero_loop ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+64* 1+r5*8] mova m1, [cq+64*31+r5*8] mova m2, [cq+64*17+r5*8] mova m3, [cq+64*15+r5*8] call .main_part1 mova m0, [cq+64* 7+r5*8] mova m1, [cq+64*25+r5*8] mova m2, [cq+64*23+r5*8] mova m3, [cq+64* 9+r5*8] call .main_part1 mova m0, [cq+64* 5+r5*8] mova m1, [cq+64*27+r5*8] mova m2, [cq+64*21+r5*8] mova m3, [cq+64*11+r5*8] call .main_part1 mova m0, [cq+64* 3+r5*8] mova m1, [cq+64*29+r5*8] mova m2, [cq+64*19+r5*8] mova m3, [cq+64*13+r5*8] call .main_part1 call .main_part2 mova m0, [cq+64* 2+r5*8] mova m1, [cq+64*14+r5*8] mova m2, [cq+64*18+r5*8] mova m3, [cq+64*30+r5*8] call 
m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+64* 6+r5*8] mova m1, [cq+64*10+r5*8] mova m2, [cq+64*22+r5*8] mova m3, [cq+64*26+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+64* 4+r5*8] mova m1, [cq+64*12+r5*8] mova m2, [cq+64*20+r5*8] mova m3, [cq+64*28+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+64* 0+r5*8] mova m1, [cq+64* 8+r5*8] mova m2, [cq+64*16+r5*8] mova m3, [cq+64*24+r5*8] call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova [r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 10 ; pd_2 %else mova m7, [o(pd_2)] %endif call .main_end_loop_start lea r3, [rsp+56*16] lea r4, [cq+r5*8+64*28] call .shift_transpose sub r5d, 2 jge .loop_pass1 ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code call .pass2 RET .pass2: %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %if WIN64 mov [rsp+16*16+gprsize], r7 %endif mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 8 jmp m(idct_16x16_internal_16bpc).loop_pass2 .main_part1: ; idct64 steps 1-5 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a %if ARCH_X86_64 movd m7, [r4+4*0] movd m8, [r4+4*1] movd m6, [r4+4*2] movd m9, [r4+4*3] movd m5, [r4+4*4] movd m10, [r4+4*5] movd m4, [r4+4*6] movd m15, [r4+4*7] REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 pmulld m7, m0 ; t63a pmulld m0, m8 ; t32a pmulld m6, m1 ; t62a pmulld m1, m9 ; t33a pmulld m5, m2 ; t61a pmulld 
m2, m10 ; t34a pmulld m4, m3 ; t60a pmulld m3, m15 ; t35a movd m10, [r4+4*8] movd m15, [r4+4*9] REPX {pshufd x, x, q0000}, m10, m15 REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 psubd m8, m0, m1 ; t33 paddd m0, m1 ; t32 psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m5 ; t61 paddd m4, m5 ; t60 REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 movd m10, [r4+4*10] movd m15, [r4+4*11] REPX {pshufd x, x, q0000}, m10, m15 psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m8, m2 ; t61 paddd m8, m2 ; t62 REPX {pmaxsd x, m12}, m5, m3, m4, m6 REPX {pminsd x, m13}, m5, m3, m4, m6 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a REPX {pmaxsd x, m12}, m0, m7, m1, m8 REPX {pminsd x, m13}, m0, m7, m1, m8 add r4, 4*12 mova [r3+16*0], m0 mova [r3+16*7], m7 mova [r3+16*1], m1 mova [r3+16*6], m8 mova [r3+16*2], m6 mova [r3+16*5], m4 mova [r3+16*3], m3 mova [r3+16*4], m5 %else movd m7, [r4+4*0] movd m6, [r4+4*2] movd m5, [r4+4*4] movd m4, [r4+4*6] REPX {pshufd x, x, q0000}, m7, m6, m5, m4 pmulld m7, m0 ; t63a pmulld m6, m1 ; t62a pmulld m5, m2 ; t61a pmulld m4, m3 ; t60a mova [r3+0*16], m6 mova [r3+1*16], m7 movd m6, [r4+4*1] movd m7, [r4+4*3] REPX {pshufd x, x, q0000}, m7, m6 pmulld m0, m6 ; t32a pmulld m1, m7 ; t33a movd m6, [r4+4*5] movd m7, [r4+4*7] REPX {pshufd x, x, q0000}, m7, m6 pmulld m2, m6 ; t34a pmulld m3, m7 ; t35a mova m6, [r3+0*16] mova m7, [o(pd_2048)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3+1*16] REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4 mova [r3+0*16], m5 psubd m5, m0, m1 ; t33 paddd 
m0, m1 ; t32 mova [r3+1*16], m0 mova m0, [r3+0*16] psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m0 ; t61 paddd m4, m0 ; t60 mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pmaxsd m0, [r3+1*16] mova [r3+0*16], m0 mova m0, [o(clip_18b_max)] REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pminsd m0, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m7 mova m0, [o(pd_2048)] movd m3, [r4+4*8] movd m4, [r4+4*9] REPX {pshufd x, x, q0000}, m3, m4 mova [r3+4*16], m2 ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a mova m2, [r3+4*16] mova [r3+4*16], m5 ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a mova m0, [r3+0*16] mova m3, [r3+1*16] mova m4, [r3+2*16] mova m7, [r3+3*16] psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a mova [r3+0*16], m5 mova m5, [r3+4*16] psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m5, m2 ; t61 paddd m2, m5 ; t62 mova m5, [o(clip_18b_min)] REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pmaxsd m5, [r3+0*16] mova [r3+0*16], m5 mova m5, [o(clip_18b_max)] REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pminsd m5, [r3+0*16] mova [r3+16*0], m0 mova [r3+16*7], m7 mova [r3+16*1], m1 mova [r3+16*6], m2 mova [r3+16*2], m4 mova m7, [o(pd_2048)] movd m0, [r4+4*10] movd m1, [r4+4*11] REPX {pshufd x, x, q0000}, m0, m1 ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60 mova [r3+16*3], m3 mova [r3+16*4], m5 mova m4, [r3+2*16] ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a add r4, 4*12 mova [r3+16*2], m6 mova [r3+16*5], m4 %endif add r3, 16*8 ret .main_part2: ; idct64 steps 6-9 lea r4, [r3+16*7] %if ARCH_X86_64 mova m10, [o(pd_1567)] mova m15, [o(pd_3784)] .main_part2_loop: mova m0, [r3-16*32] ; t32a mova m1, [r4-16*24] ; t39a mova m2, [r4-16*32] ; t63a mova m3, [r3-16*24] ; t56a mova m4, [r3-16*16] ; t40a mova m5, [r4-16* 8] ; t47a mova m6, [r4-16*16] ; t55a mova m7, [r3-16* 8] ; t48a psubd m8, m0, m1 ; 
t39 paddd m0, m1 ; t32 psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a paddd m2, m7 ; t63a psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m8, m4 ; t55 paddd m8, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 REPX {pmaxsd x, m12}, m6, m7, m5, m4 REPX {pminsd x, m13}, m6, m7, m5, m4 REPX {pmulld x, m14}, m6, m7, m5, m4 REPX {pmaxsd x, m12}, m2, m0, m8, m1 REPX {pminsd x, m13}, m2, m0, m8, m1 paddd m6, m11 paddd m5, m11 psubd m3, m6, m7 ; t47 paddd m6, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m3, m6, m7, m5 mova [r4-16* 8], m2 mova [r3-16*32], m0 mova [r3-16* 8], m8 mova [r4-16*32], m1 mova [r4-16*24], m3 mova [r3-16*16], m6 mova [r3-16*24], m7 mova [r4-16*16], m5 %else .main_part2_loop: mova m0, [r3-16*32] ; t32a mova m1, [r4-16*24] ; t39a mova m2, [r4-16*32] ; t63a mova m3, [r3-16*24] ; t56a mova m4, [r3-16*16] ; t40a mova m5, [r4-16* 8] ; t47a mova m6, [r4-16*16] ; t55a psubd m7, m0, m1 ; t39 paddd m0, m1 ; t32 mova [r3+0*16], m7 mova m7, [r3-16* 8] ; t48a psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 mova m6, [o(clip_18b_min)] REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pmaxsd m6, [r3+0*16] mova [r3+0*16], m6 mova m6, [o(clip_18b_max)] REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pminsd m6, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m2 mova [r3+2*16], m5 mova [r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a mova m2, [r3+1*16] mova m7, [r3+3*16] psubd m5, m2, m7 ; t48a paddd m2, m7 ; 
t63a mova [r3+1*16], m5 mova m0, [r3+0*16] mova m5, [r3+2*16] psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m6, m4 ; t55 paddd m6, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pmaxsd m3, [r3+1*16] mova [r3+0*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pminsd m3, [r3+0*16] mova [r4-16* 8], m2 mova [r3-16*32], m0 mova [r3-16* 8], m6 mova [r4-16*32], m1 mova m0, [o(pd_2896)] mova m1, [o(pd_2048)] REPX {pmulld x, m0}, m3, m7, m5, m4 REPX {paddd x, m1}, m3, m5 psubd m6, m3, m7 ; t47 paddd m3, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m6, m3, m7, m5 mova [r4-16*24], m6 mova [r3-16*16], m3 mova [r3-16*24], m7 mova [r4-16*16], m5 %endif add r3, 16 sub r4, 16 cmp r3, r4 jl .main_part2_loop sub r3, 4*16 ret .main_end_loop: mova m0, [r3+16*28] ; idct8 0 + n .main_end_loop_start: mova m2, [r3+16*12] ; idct32 16 + n mova m3, [r4+16*12] ; idct32 31 - n %if ARCH_X86_64 mova m1, [r4+16*28] ; idct16 15 - n mova m4, [r4-16* 4] ; idct64 63 - n mova m5, [r3-16* 4] ; idct64 48 + n mova m6, [r4-16*20] ; idct64 47 - n mova m7, [r3-16*20] ; idct64 32 + n pmaxsd m0, m12 pminsd m0, m13 paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 REPX {pminsd x, m13}, m8, m0 paddd m1, m8, m3 ; idct32 out0 + n psubd m8, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, m12}, m1, m8, m3, m0 REPX {pminsd x, m13}, m1, m3, m8, m0 REPX {paddd x, m15}, m1, m3, m0, m8 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) psubd m1, m4 ; idct64 out63 - n (unshifted) paddd m4, m3, m5 ; idct64 out15 - n (unshifted) psubd m3, m5 ; idct64 out48 + n (unshifted) paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m8, m7 ; idct64 out31 - n (unshifted) psubd m8, m7 ; idct64 out32 + n (unshifted) mova [r3-16*20], m2 mova 
[r4+16*28], m1 mova [r4-16*20], m4 mova [r3+16*28], m3 mova [r3-16* 4], m5 mova [r4+16*12], m0 mova [r4-16* 4], m6 mova [r3+16*12], m8 %else mova m5, [o(clip_18b_min)] mova m6, [o(clip_18b_max)] mova m1, [r3+16*44] ; idct16 15 - n pmaxsd m0, m5 pminsd m0, m6 paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n psubd m4, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, m5}, m1, m4, m3, m0 REPX {pminsd x, m6}, m1, m3, m4, m0 REPX {paddd x, m7}, m1, m3, m0, m4 mova m5, [r4-16* 4] ; idct64 63 - n mova m6, [r3-16* 4] ; idct64 48 + n paddd m2, m1, m5 ; idct64 out0 + n (unshifted) psubd m1, m5 ; idct64 out63 - n (unshifted) paddd m5, m3, m6 ; idct64 out15 - n (unshifted) psubd m3, m6 ; idct64 out48 + n (unshifted) mova [r4+16*28], m1 mova [r3+16*28], m3 mova m6, [r4-16*20] ; idct64 47 - n mova m1, [r3-16*20] ; idct64 32 + n mova [r3-16*20], m2 mova [r4-16*20], m5 paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m4, m1 ; idct64 out31 - n (unshifted) psubd m4, m1 ; idct64 out32 + n (unshifted) mova [r3-16* 4], m5 mova [r4+16*12], m0 mova [r4-16* 4], m6 mova [r3+16*12], m4 %endif sub r4, 16 add r3, 16 cmp r3, r4 jl .main_end_loop ret .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, [r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [r4+0*64], m0 mova [r4+1*64], m1 mova [r4+2*64], m2 mova [r4+3*64], m3 sub r4, 4*64 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 .dconly1: add r5d, 640 sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 
punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 .dconly_loop: paddw m1, m0, [dstq+16*0] paddw m2, m0, [dstq+16*1] paddw m3, m0, [dstq+16*2] paddw m4, m0, [dstq+16*3] REPX {pmaxsw x, m5}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m2 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, 64 btc r3d, 16 jnc .dconly_loop lea dstq, [dstq+strideq-128] dec r3d jg .dconly_loop RET cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 0, 4, 1 mov [rsp+(8*32+64+8)*16+1*gprsize], dstq mov [rsp+(8*32+64+8)*16+2*gprsize], strideq %else DECLARE_REG_TMP 4, 7, 8 %if WIN64 mov [rsp+(8*32+64+1)*16+1*gprsize], r7 mov [rsp+64*16+0*gprsize], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 lea t2, [rsp+7*32*16] .zero_loop_inner: mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0 sub t2, 32*16 cmp t2, rsp jge .zero_loop_inner sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+128* 1+r5*8] mova m1, [cq+128*31+r5*8] mova m2, [cq+128*17+r5*8] mova m3, [cq+128*15+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 7+r5*8] mova m1, [cq+128*25+r5*8] mova m2, [cq+128*23+r5*8] mova m3, [cq+128* 9+r5*8] call .rect2_mul_fast 
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 5+r5*8] mova m1, [cq+128*27+r5*8] mova m2, [cq+128*21+r5*8] mova m3, [cq+128*11+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128*29+r5*8] mova m2, [cq+128*19+r5*8] mova m3, [cq+128*13+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128*14+r5*8] mova m2, [cq+128*18+r5*8] mova m3, [cq+128*30+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6+r5*8] mova m1, [cq+128*10+r5*8] mova m2, [cq+128*22+r5*8] mova m3, [cq+128*26+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+128* 4+r5*8] mova m1, [cq+128*12+r5*8] mova m2, [cq+128*20+r5*8] mova m3, [cq+128*28+r5*8] call .rect2_mul_fast call m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 8+r5*8] mova m2, [cq+128*16+r5*8] mova m3, [cq+128*24+r5*8] call .rect2_mul_fast call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova [r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 11 ; pd_1 %else mova m7, [o(pd_1)] %endif call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start lea r3, [rsp+56*16] lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16] movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 call .shift_transpose ; zero cq pxor m7, m7 lea r4, [cq+30*128+r5*8] .zero_cq_loop: REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 sub r4, 4*128 cmp r4, cq jg .zero_cq_loop sub r5d, 2 jge 
.loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16] %if ARCH_X86_32 mov strideq, [rsp+gprsize*2+(8*32+64+8)*16] %elif WIN64 mov r8, [rsp+gprsize*0+64*16] %endif add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16 cmp eobd, 36 jl .load_veryfast cmp eobd, 136 jl .load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: %if ARCH_X86_64 lea r2, [dstq+128] mov r7, -16 %else lea r2, [rsp+(8*32+3)*16] mov dword [r2+0*gprsize], 8 %endif jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .rect2_mul_fast: %if ARCH_X86_64 REPX {pmulld x, m14}, m0, m1, m2, m3 REPX {paddd x, m11}, m0, m1, m2, m3 %else mova m4, [o(pd_2896)] mova m5, [o(pd_2048)] REPX {pmulld x, m4 }, m0, m1, m2, m3 REPX {paddd x, m5 }, m0, m1, m2, m3 %endif REPX {psrad x, 12 }, m0, m1, m2, m3 ret .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, [r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [t2+0*16+r5*8], m0 mova [t2+8*16+r5*8], m2 mova [t2+0*16+t0*8], m3 mova [t2+0*16+t1*8], m1 sub t2, 16*32 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 add rsp, (1+8*32+1*WIN64)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0, 6 mov [rsp+gprsize*1+(64*9+8)*16], r0 mov 
[rsp+gprsize*2+(64*9+8)*16], r1 mov [rsp+gprsize*3+(64*9+8)*16], r2 mov [rsp+gprsize*4+(64*9+8)*16], r6 %else DECLARE_REG_TMP 8, 9, 4, 7, 0 mov [rsp+gprsize*1+(64*9+1)*16], r9 mov [rsp+gprsize*0+64*16], r0 %if WIN64 mov [rsp+gprsize*2+(64*9+1)*16], r7 mov [rsp+gprsize*3+(64*9+1)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 lea t4, [rsp+7*64*16] .zero_loop_inner: mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0 sub t4, 64*16 cmp t4, rsp jge .zero_loop_inner %if ARCH_X86_32 mov r6, [rsp+gprsize*4+(64*9+8)*16] %endif sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd %if ARCH_X86_32 mov cq, [rsp+gprsize*3+(64*9+8)*16] %endif ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+128* 1+r5*8] mova m1, [cq+128*31+r5*8] mova m2, [cq+128*17+r5*8] mova m3, [cq+128*15+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 7+r5*8] mova m1, [cq+128*25+r5*8] mova m2, [cq+128*23+r5*8] mova m3, [cq+128* 9+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 5+r5*8] mova m1, [cq+128*27+r5*8] mova m2, [cq+128*21+r5*8] mova m3, [cq+128*11+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128*29+r5*8] mova m2, [cq+128*19+r5*8] mova m3, [cq+128*13+r5*8] call 
m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128*14+r5*8] mova m2, [cq+128*18+r5*8] mova m3, [cq+128*30+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6+r5*8] mova m1, [cq+128*10+r5*8] mova m2, [cq+128*22+r5*8] mova m3, [cq+128*26+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+128* 4+r5*8] mova m1, [cq+128*12+r5*8] mova m2, [cq+128*20+r5*8] mova m3, [cq+128*28+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 8+r5*8] mova m2, [cq+128*16+r5*8] mova m3, [cq+128*24+r5*8] call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova [r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 10 ; pd_2 %else mova m7, [o(pd_2)] %endif call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start lea r3, [rsp+56*16] movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16] call .shift_transpose ; zero cq pxor m7, m7 %if ARCH_X86_32 mov cq, [rsp+gprsize*3+(64*9+8)*16] %endif lea r4, [cq+30*128+r5*8] .zero_cq_loop: REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 sub r4, 4*128 cmp r4, cq jg .zero_cq_loop %if ARCH_X86_32 mov r6, [rsp+gprsize*4+(64*9+8)*16] %endif sub r5d, 2 jge .loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16] %if ARCH_X86_32 mov strideq, [rsp+gprsize*2+(9*64+8)*16] %else mov r0, [rsp+gprsize*0+64*16] %endif add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16 cmp 
eobd, 151 jl .fast ; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: %if ARCH_X86_64 lea r2, [dstq+128] mov r7, -16 %else lea r2, [rsp+(64*8+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 8 %endif jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 ; copy of pass=1 tmp-regs %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0, 6 %else DECLARE_REG_TMP 8, 9, 4, 7, 0 %endif .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, [r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [t4+t0*8], m0 mova [t4+t1*8], m1 mova [t4+t2*8], m2 mova [t4+t3*8], m3 sub t4, 16*64 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx_avx2.asm000066400000000000000000005772231517466257200235460ustar00rootroot00000000000000; Copyright © 2026, VideoLAN and dav2d authors ; Copyright © 2026, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA adst16_mat: dw 8, 41, 25, 55, 67, 84, 77, 88, 89, 81, 87, 73, 62, 33, 48, 17 dw 17, 73, 48, 87, 88, 55, 77, 25, -8, -67, -41, -84, -89, -62, -81, -33 dw 25, 88, 67, 81, 48, -48, 0, -81, -88, -25, -67, 25, 67, 81, 88, 48 dw 33, 84, 81, 41, -25, -87, -77, -48, 17, 88, 73, 55, -8, -89, -67, -62 dw 41, 62, 88, -17, -81, -8, -77, 67, 87, -48, 33, -89, -55, 84, 25, 73 dw 48, 25, 88, -67, -81, 81, 0, 67, -25, -48, -88, 48, 88, -67, 25, -81 dw 55, -17, 81, -89, -25, 62, 77, -48, -84, 88, 8, 33, -73, 41, -67, 87 dw 62, -55, 67, -73, 48, -41, 77, -81, 33, -25, 84, -87, 17, -8, 88, -89 dw 67, -81, 48, -25, 88, -88, 0, 25, 81, -67, -48, 67, 48, -25, -81, 88 dw 73, -89, 25, 33, 67, -17, -77, 88, -41, 81, -62, 8, -87, 55, 48, -84 dw 77, -77, 0, 77, 0, 77, -77, 0, -77, 0, 77, -77, 77, -77, 0, 77 dw 81, -48, -25, 88, -67, 67, 0, -88, 48, -81, 25, 81, -25, 88, -48, -67 dw 84, -8, -48, 62, -88, -33, 77, -25, 73, 67, -89, -17, -41, -87, 81, 55 dw 87, 
33, -67, 8, -48, -89, 77, 81, -55, 25, 17, -62, 84, 73, -88, -41 dw 88, 67, -81, -48, 25, -25, 0, 48, -67, -88, 81, 88, -81, -48, 67, 25 dw 89, 87, -88, -84, 81, 73, -77, -67, 62, 48, -55, -41, 33, 17, -25, -8 flipadst16_mat: dw 89, 87, 88, 84, 81, 73, 77, 67, 62, 48, 55, 41, 33, 17, 25, 8 dw 88, 67, 81, 48, 25, -25, 0, -48, -67, -88, -81, -88, -81, -48, -67, -25 dw 87, 33, 67, -8, -48, -89, -77, -81, -55, 25, -17, 62, 84, 73, 88, 41 dw 84, -8, 48, -62, -88, -33, -77, 25, 73, 67, 89, 17, -41, -87, -81, -55 dw 81, -48, 25, -88, -67, 67, 0, 88, 48, -81, -25, -81, -25, 88, 48, 67 dw 77, -77, 0, -77, 0, 77, 77, 0, -77, 0, -77, 77, 77, -77, 0, -77 dw 73, -89, -25, -33, 67, -17, 77, -88, -41, 81, 62, -8, -87, 55, -48, 84 dw 67, -81, -48, 25, 88, -88, 0, -25, 81, -67, 48, -67, 48, -25, 81, -88 dw 62, -55, -67, 73, 48, -41, -77, 81, 33, -25, -84, 87, 17, -8, -88, 89 dw 55, -17, -81, 89, -25, 62, -77, 48, -84, 88, -8, -33, -73, 41, 67, -87 dw 48, 25, -88, 67, -81, 81, 0, -67, -25, -48, 88, -48, 88, -67, -25, 81 dw 41, 62, -88, 17, -81, -8, 77, -67, 87, -48, -33, 89, -55, 84, -25, -73 dw 33, 84, -81, -41, -25, -87, 77, 48, 17, 88, -73, -55, -8, -89, 67, 62 dw 25, 88, -67, -81, 48, -48, 0, 81, -88, -25, 67, -25, 67, 81, -88, -48 dw 17, 73, -48, -87, 88, 55, -77, -25, -8, -67, 41, 84, -89, -62, 81, 33 dw 8, 41, -25, -55, 67, 84, -77, -88, 89, 81, -87, -73, 62, 33, -48, -17 ddt16_mat: dw 12, 37, 17, 45, 47, 64, 60, 82, 89, 92, 100, 84, 69, 51, 50, 44 dw 15, 49, 23, 60, 60, 70, 74, 73, 48, -35, 9, -71, -83, -89, -79, -95 dw 19, 60, 30, 69, 61, 40, 64, 3, -53, -91, -99, -46, 2, 73, 47, 124 dw 23, 69, 38, 73, 49, -19, 28, -80, -96, 42, -45, 88, 75, -17, 14,-126 dw 30, 75, 48, 66, 19, -79, -31, -91, -5, 71, 84, -16, -78, -45, -60, 108 dw 39, 75, 61, 40, -29, -78, -87, 10, 89, -69, 36, -67, 18, 89, 67, -81 dw 51, 61, 76, -8, -77, 11, -82, 94, 16, -22, -81, 79, 50,-103, -37, 54 dw 66, 29, 87, -65, -83, 92, 4, 18, -83, 85, 4, -22, -85, 97, -6, -30 dw 78, -18, 83, -91, -16, 28, 88, 
-84, 12, -60, 73, -46, 81, -83, 49, 16 dw 88, -67, 59, -57, 75, -85, 54, -5, 75, -17, -60, 84, -43, 71, -80, -6 dw 94, -96, 19, 21, 93, -41, -55, 80, -51, 77, -17, -68, -6, -56, 98, 1 dw 97, -83, -30, 86, 3, 82, -77, -17, -43, -70, 76, 15, 53, 44, -99, 3 dw 93, -28, -73, 81, -92, 39, 29, -70, 81, 11, -55, 46, -81, -31, 90, -4 dw 83, 40, -99, 8, -74, -83, 88, 47, -14, 56, -21, -83, 88, 22, -71, 5 dw 68, 84, -99, -69, 32, -37, 3, 55, -75, -83, 81, 82, -69, -11, 48, -3 flipddt16_mat: dw 50, 83, -76, -90, 97, 83, -86, -68, 67, 49, -56, -40, 32, 5, -19, 2 dw 68, 84, -99, -69, 32, -37, 3, 55, -75, -83, 81, 82, -69, -11, 48, -3 dw 83, 40, -99, 8, -74, -83, 88, 47, -14, 56, -21, -83, 88, 22, -71, 5 dw 93, -28, -73, 81, -92, 39, 29, -70, 81, 11, -55, 46, -81, -31, 90, -4 dw 97, -83, -30, 86, 3, 82, -77, -17, -43, -70, 76, 15, 53, 44, -99, 3 dw 94, -96, 19, 21, 93, -41, -55, 80, -51, 77, -17, -68, -6, -56, 98, 1 dw 88, -67, 59, -57, 75, -85, 54, -5, 75, -17, -60, 84, -43, 71, -80, -6 dw 78, -18, 83, -91, -16, 28, 88, -84, 12, -60, 73, -46, 81, -83, 49, 16 dw 66, 29, 87, -65, -83, 92, 4, 18, -83, 85, 4, -22, -85, 97, -6, -30 dw 51, 61, 76, -8, -77, 11, -82, 94, 16, -22, -81, 79, 50,-103, -37, 54 dw 39, 75, 61, 40, -29, -78, -87, 10, 89, -69, 36, -67, 18, 89, 67, -81 dw 30, 75, 48, 66, 19, -79, -31, -91, -5, 71, 84, -16, -78, -45, -60, 108 dw 23, 69, 38, 73, 49, -19, 28, -80, -96, 42, -45, 88, 75, -17, 14,-126 dw 19, 60, 30, 69, 61, 40, 64, 3, -53, -91, -99, -46, 2, 73, 47, 124 dw 15, 49, 23, 60, 60, 70, 74, 73, 48, -35, 9, -71, -83, -89, -79, -95 dw 12, 37, 17, 45, 47, 64, 60, 82, 89, 92, 100, 84, 69, 51, 50, 44 dct32_mat: dw 90, 88, 90, 85, 82, 73, 78, 67, 47, 61, 39, 54, 13, 30, 4, 22 dw 90, 67, 82, 47, 22, -30, -4, -54, -90, -73, -88, -85, -39, -78, -13, -61 dw 88, 30, 67, -13, -54, -90, -82, -78, 39, -47, 73, -4, 61, 90, 22, 85 dw 85, -13, 47, -67, -90, -22, -73, 39, 54, 82, -4, 88, -78, -61, -30, -90 dw 82, -54, 22, -90, -61, 78, 13, 85, -90, 30, -67, -47, 88, 4, 39, 73 
dw 78, -82, -4, -73, 13, 67, 85, -22, 30, -88, 90, -61, -90, 54, -47, -39 dw 73, -90, -30, -22, 78, -39, 67, -90, 61, -13, -47, 82, 85, -88, 54, -4 dw 67, -78, -54, 39, 85, -90, -22, 4, -88, 90, -30, 13, -73, 82, -61, 47 ; The dct32 and dct16 coefficients must be adjacent in memory dct16_mat: dw 90, 80, 87, 70, 26, 57, 9, 43 dw 87, 9, 57, -43, -70, -80, -26, -90 dw 80, -70, 9, -87, 90, -26, 43, 57 dw 70, -87, -43, 9, -80, 90, -57, 26 adst8_mat: dw 11, 54, 34, 71, 84, 79, 88, 50 dw 28, 89, 74, 68, 17, -83, -44, -69 dw 44, 48, 89, -41, -89, 50, -44, 81 dw 58, -34, 76, -86, 10, 6, 88, -84 dw 70, -87, 39, 1, 86, -59, -44, 78 dw 79, -66, -12, 87, -35, 86, -44, -62 dw 86, 12, -58, 38, -75, -74, 88, 40 flipadst8_mat: dw 89, 79, -86, -70, 58, 29, -44, -14 dw 86, 12, -58, 38, -75, -74, 88, 40 dw 79, -66, -12, 87, -35, 86, -44, -62 dw 70, -87, 39, 1, 86, -59, -44, 78 dw 58, -34, 76, -86, 10, 6, 88, -84 dw 44, 48, 89, -41, -89, 50, -44, 81 dw 28, 89, 74, 68, 17, -83, -44, -69 dw 11, 54, 34, 71, 84, 79, 88, 50 ddt8_mat: dw 4, 22, 6, 57, 96, 78, 103, 56 dw 7, 48, 14, 94, 73, -79, -17, -96 dw 15, 85, 36, 76, -43, 7, -80, 98 dw 33, 88, 77, -26, -69, 56, 56, -77 dw 65, 0, 100, -73, 55, -82, 15, 54 dw 98, -86, 45, 34, 20, 79, -66, -33 dw 106, -23, -57, 54, -71, -56, 75, 19 flipddt8_mat: dw 80, 82, -98, -66, 53, 26, -41, -6 dw 106, -23, -57, 54, -71, -56, 75, 19 dw 98, -86, 45, 34, 20, 79, -66, -33 dw 65, 0, 100, -73, 55, -82, 15, 54 dw 33, 88, 77, -26, -69, 56, 56, -77 dw 15, 85, 36, 76, -43, 7, -80, 98 dw 7, 48, 14, 94, 73, -79, -17, -96 dw 4, 22, 6, 57, 96, 78, 103, 56 adst4_mat: dw 18, 75, 50, 18, 75, -89; 89, 50 flipadst4_mat: dw 89, 50, 75, -89, 50, 18, 18, 75 pb_14_15: times 2 db 14, 15 pw_256: times 2 dw 256 pw_512: times 2 dw 512 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 pw_53x256: times 2 dw 53*256 pw_53x512: times 2 dw 53*512 pw_181x4: times 2 dw 181*4 pw_181x8: times 2 dw 181*8 pw_181x16: times 2 dw 181*16 pw_181x32: times 2 dw 181*32 
pw_181x128: times 2 dw 181*128 pd_32: dd 32 pd_64: dd 64 pd_128: dd 128 pd_512: dd 512 pd_1024: dd 1024 pd_2048: dd 2048 pd_4096: dd 4096 pw_64_64: dw 64, 64 pw_64_m64: dw 64, -64 pw_35_83: dw 35, 83 pw_m83_35: dw -83, 35 pw_0_83: dw 0, 83 pw_0_35: dw 0, 35 pw_89_75: dw 89, 75 pw_75_m18: dw 75, -18 pw_50_m89: dw 50, -89 pw_18_m50: dw 18, -50 %define pw_18_75 (adst4_mat+4*0) %define pw_50_18 (adst4_mat+4*1) %define pw_75_m89 (adst4_mat+4*2) %define pw_89_50 (adst4_mat+4*3) %define o_base (adst4_mat + 128) %define o(x) (r6 - o_base + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ _avx2) %macro ITX_JMP_TABLE 2 ; w, h %if %1 <= 16 itx_%1x%2_table_p1: dd m(inv_txfm_add_%1x%2_8bpc).pass1_dct - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass1_identity - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass1_adst - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass1_flipadst - o_base %if %1 >= 8 dd m(inv_txfm_add_%1x%2_8bpc).pass1_ddt - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass1_flipddt - o_base %elif %2 == 4 ; wht pw_17: times 2 dw 17 ; There are two unused entries, make use pw_33: times 2 dw 33 ; of the space to store some constants dd m(inv_txfm_add_%1x%2_8bpc).wht - o_base %endif %endif %if %2 <= 16 itx_%1x%2_table_p2: dd m(inv_txfm_add_%1x%2_8bpc).pass2_dct - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass2_identity - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass2_adst - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass2_flipadst - o_base %if %2 >= 8 dd m(inv_txfm_add_%1x%2_8bpc).pass2_ddt - o_base dd m(inv_txfm_add_%1x%2_8bpc).pass2_flipddt - o_base %endif %endif %endmacro ITX_JMP_TABLE 4, 4 ITX_JMP_TABLE 4, 8 ITX_JMP_TABLE 4, 16 ITX_JMP_TABLE 4, 32 ITX_JMP_TABLE 8, 4 ITX_JMP_TABLE 8, 8 ITX_JMP_TABLE 8, 16 ITX_JMP_TABLE 8, 32 ITX_JMP_TABLE 16, 4 ITX_JMP_TABLE 16, 8 ITX_JMP_TABLE 16, 16 ITX_JMP_TABLE 32, 4 ITX_JMP_TABLE 32, 8 SECTION .text %macro CCTX_ROUND 1 pcmpgtd m4, m7, %1 paddd %1, m7 paddd %1, m4 %endmacro INIT_YMM avx2 cglobal cctx_8bpc, 4, 4, 8, u, v, angle, sz vpbroadcastd m5, [angleq+2] ; cos, 
-sin lea uq, [uq+szq*2] vpbroadcastd m6, [angleq+0] ; sin, cos lea vq, [vq+szq*2] vpbroadcastd m7, [pd_128] neg szq .loop: mova m3, [uq+szq*2] mova m2, [vq+szq*2] punpcklwd m1, m3, m2 punpckhwd m3, m2 pmaddwd m0, m5, m1 ; a pmaddwd m2, m5, m3 pmaddwd m1, m6 ; b pmaddwd m3, m6 REPX {CCTX_ROUND x}, m0, m2, m1, m3 REPX {psrad x, 8}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 mova [uq+szq*2], m0 mova [vq+szq*2], m1 add szq, 16 jl .loop RET %macro INV_TXFM_FN 2 ; w, h cglobal inv_txfm_add_%1x%2_8bpc, 4, 7, 0, dst, ds, cf, tx1, eob, tx2 lea r6, [o_base] %if WIN64 rorx eobd, eobm, 16 %else shl eobd, 16 %endif add eobd, tx1d jz .dconly %if %1 < 32 && %2 < 32 movzx tx2d, tx1b and tx1d, 7 shr tx2d, 5 movsxd tx1q, [o(itx_%1x%2_table_p1)+tx1q*4] movsxd tx2q, [o(itx_%1x%2_table_p2)+tx2q*4] add tx1q, r6 add tx2q, r6 %elif %1 < 32 and tx1d, 7 movsxd tx1q, [o(itx_%1x%2_table_p1)+tx1q*4] add tx1q, r6 %elif %2 < 32 movzx tx2d, tx1b shr tx2d, 5 movsxd tx2q, [o(itx_%1x%2_table_p2)+tx2q*4] add tx2q, r6 %endif %endmacro %macro WRAP_XMM 1+ INIT_XMM cpuname %1 INIT_YMM cpuname %endmacro %macro IDCT4_1D_PACKED 1 ; rnd vpbroadcastd m3, [o(pw_64_64)] punpckhwd m4, m1, m0 ; 3 1 vpbroadcastd m2, [o(pw_64_m64)] punpcklwd m0, m1 ; 0 2 vpbroadcastd m1, [o(pw_35_83)] pmaddwd m3, m0 ; a0 pmaddwd m2, m0 ; a1 vpbroadcastd m0, [o(pw_m83_35)] pmaddwd m1, m4 ; b0 pmaddwd m4, m0 ; b1 paddd m3, %1 paddd m2, %1 paddd m0, m3, m1 ; out0 psubd m3, m1 ; out3 paddd m1, m2, m4 ; out1 psubd m2, m4 ; out2 ret %endmacro %macro IDCT8_1D_PACKED 0 punpckhwd m7, m0, m2 ; 1 5 punpcklwd m0, m2 ; 0 4 punpckhwd m2, m3, m1 ; 7 3 punpcklwd m1, m3, m1 ; 2 6 .dct8b: vpbroadcastd m6, [o(pw_18_75)] vpbroadcastd m4, [o(pw_75_m89)] pmaddwd m8, m6, m7 pmaddwd m6, m2 vpbroadcastd m9, [o(pw_89_50)] pmaddwd m5, m4, m7 pmaddwd m4, m2 pmaddwd m3, m9, m2 pmaddwd m9, m7 psubd m8, m3 ; b3 vpbroadcastd m3, [o(pw_50_18)] paddd m6, m9 ; b0 pmaddwd m2, m3 pmaddwd m7, m3 psubd m5, m2 ; b1 vpbroadcastd m3, [o(pw_64_64)] paddd m4, m7 ; b2 
vpbroadcastd m2, [o(pw_64_m64)] pmaddwd m3, m0 ; dct4 a0 vpbroadcastd m7, [o(pw_35_83)] pmaddwd m2, m0 ; dct4 a1 vpbroadcastd m0, [o(pw_m83_35)] pmaddwd m7, m1 ; dct4 b0 pmaddwd m0, m1 ; dct4 b1 %endmacro %macro IDCT16_1D_PACKED 1 ; cfq_offset lea r3, [o(dct16_mat)] .dct16b: punpcklwd m8, m0, m4 ; 0 8 punpckhwd m0, m2 ; 1 5 mova [cfq+mmsize*(%1+3)], m8 punpcklwd m9, m1, m5 ; 2 10 punpckhwd m1, m3 ; 3 7 mova [cfq+mmsize*(%1+2)], m9 punpcklwd m8, m6, m2 ; 12 4 punpckhwd m2, m6, m4 ; 13 9 mova [cfq+mmsize*(%1+5)], m8 punpcklwd m6, m7, m3 ; 14 6 punpckhwd m3, m7, m5 ; 15 11 mova [cfq+mmsize*(%1+4)], m6 .dct16c: call .dct16d add r3, 4*8 mova [cfq+mmsize*(%1+0)], m8 ; b0 mova [cfq+mmsize*(%1+1)], m9 ; b1 mova [cfq+mmsize*(%1+6)], m12 ; b7 mova [cfq+mmsize*(%1+7)], m11 ; b6 call .dct16d mova m7, [cfq+mmsize*(%1+2)] mova m0, [cfq+mmsize*(%1+3)] mova m2, [cfq+mmsize*(%1+4)] mova m1, [cfq+mmsize*(%1+5)] mova [cfq+mmsize*(%1+2)], m9 ; b3 mova [cfq+mmsize*(%1+3)], m8 ; b2 mova [cfq+mmsize*(%1+4)], m11 ; b4 mova [cfq+mmsize*(%1+5)], m12 ; b5 %if mmsize == 16 jmp m(inv_txfm_add_4x8_8bpc).dct8b %else jmp m(inv_txfm_add_8x8_8bpc).dct8b %endif .dct16d: vpbroadcastd m7, [r3+4*0] vpbroadcastd m12, [r3+4*1] vpbroadcastd m11, [r3+4*4] vpbroadcastd m6, [r3+4*5] pmaddwd m8, m7, m0 pmaddwd m4, m12, m1 pmaddwd m9, m11, m0 pmaddwd m5, m6, m1 pmaddwd m7, m3 pmaddwd m12, m2 pmaddwd m11, m3 pmaddwd m6, m2 paddd m8, m4 paddd m9, m5 psubd m12, m7 vpbroadcastd m7, [r3+4*2] psubd m11, m6 vpbroadcastd m6, [r3+4*6] pmaddwd m4, m7, m2 pmaddwd m5, m6, m2 pmaddwd m7, m1 pmaddwd m6, m1 paddd m8, m4 paddd m9, m5 psubd m12, m7 vpbroadcastd m7, [r3+4*3] paddd m11, m6 vpbroadcastd m6, [r3+4*7] pmaddwd m4, m7, m3 pmaddwd m5, m6, m3 pmaddwd m7, m0 pmaddwd m6, m0 paddd m8, m4 ; b0 / b2 paddd m9, m5 ; b1 / b3 paddd m12, m7 ; b7 / b5 psubd m11, m6 ; b6 / b4 ret %endmacro %macro IDCT16_1D_PACKED_FAST 1 ; cfq_offset lea r3, [o(dct16_mat)] .dct16_fast2: punpcklwd m9, m0, m2 ; 0 4 punpckhwd m0, m2 ; 1 5 punpcklwd 
m8, m1, m3 ; 2 6 punpckhwd m1, m3 ; 3 7 .dct16_fast3: call .dct16_fast4 add r3, 4*8 mova [cfq+mmsize*(%1+0)], m4 ; c0 mova [cfq+mmsize*(%1+1)], m5 ; c1 mova [cfq+mmsize*(%1+6)], m7 ; c7 mova [cfq+mmsize*(%1+7)], m6 ; c6 call .dct16_fast4 vpbroadcastd m2, [o(pd_64)] vpbroadcastd m1, [o(pw_0_83)] vpbroadcastd m11, [o(pw_0_35)] mova [cfq+mmsize*(%1+2)], m5 ; c3 mova [cfq+mmsize*(%1+3)], m4 ; c2 mova [cfq+mmsize*(%1+4)], m6 ; c4 mova [cfq+mmsize*(%1+5)], m7 ; c5 vpbroadcastd m6, [o(pw_89_75)] pmaddwd m2, m9 vpbroadcastd m5, [o(pw_75_m18)] pmaddwd m1, m9 vpbroadcastd m4, [o(pw_50_m89)] pmaddwd m11, m9 vpbroadcastd m7, [o(pw_18_m50)] pmaddwd m6, m8 ; b0 pmaddwd m5, m8 ; b1 pmaddwd m4, m8 ; b2 pmaddwd m8, m7 ; b3 paddd m2, m10 ; rnd %if mmsize == 16 INIT_YMM cpuname vinserti128 m2, xm2, 1 vinserti128 m1, xm11, 1 paddd m3, m2, m1 ; a0 a1 psubd m2, m1 ; a3 a2 vinserti128 m1, m6, xm5, 1 vinserti128 m4, m8, xm4, 1 jmp m(inv_txfm_add_4x8_8bpc).dct8c INIT_XMM cpuname %else paddd m0, m2, m1 ; a0 psubd m3, m2, m1 ; a3 paddd m1, m11, m2 ; a1 psubd m2, m11 ; a2 jmp m(inv_txfm_add_8x8_8bpc).dct8c %endif .dct16_fast4: vpbroadcastd m4, [r3+4*0] vpbroadcastd m6, [r3+4*1] vpbroadcastd m2, [r3+4*2] vpbroadcastd m7, [r3+4*3] pmaddwd m4, m0 pmaddwd m6, m1 pmaddwd m2, m1 pmaddwd m7, m0 vpbroadcastd m5, [r3+4*4] vpbroadcastd m3, [r3+4*5] paddd m4, m6 ; b0 / b2 vpbroadcastd m6, [r3+4*6] psubd m7, m2 ; b7 / b5 vpbroadcastd m2, [r3+4*7] pmaddwd m5, m0 pmaddwd m3, m1 pmaddwd m6, m1 pmaddwd m2, m0 paddd m5, m3 ; b1 / b3 psubd m6, m2 ; b6 / b4 ret %endmacro %macro IDCT32_1D_PACKED 0 lea r3, [o(dct32_mat)] call %%dct32b mova [rsp+gprsize+mmsize* 0], m0 ; b0 mova [rsp+gprsize+mmsize* 1], m1 ; b1 mova [rsp+gprsize+mmsize*14], m11 ; b15 mova [rsp+gprsize+mmsize*15], m10 ; b14 call %%dct32b mova [rsp+gprsize+mmsize* 2], m1 ; b3 mova [rsp+gprsize+mmsize* 3], m0 ; b2 mova [rsp+gprsize+mmsize*12], m10 ; b12 mova [rsp+gprsize+mmsize*13], m11 ; b13 call %%dct32b mova [rsp+gprsize+mmsize* 4], m0 ; b4 mova 
[rsp+gprsize+mmsize* 5], m1 ; b5 mova [rsp+gprsize+mmsize*10], m11 ; b11 mova [rsp+gprsize+mmsize*11], m10 ; b10 call %%dct32b mova [rsp+gprsize+mmsize* 6], m1 ; b7 mova [rsp+gprsize+mmsize* 7], m0 ; b6 mova [rsp+gprsize+mmsize* 8], m10 ; b8 mova [rsp+gprsize+mmsize* 9], m11 ; b9 ret %%dct32b: vpbroadcastd m12, [r3+4* 0] vpbroadcastd m11, [r3+4* 1] vpbroadcastd m10, [r3+4* 8] vpbroadcastd m13, [r3+4* 9] pmaddwd m0, m12, m8 pmaddwd m1, m11, m9 pmaddwd m12, m5 pmaddwd m11, m4 paddd m0, m1 pmaddwd m1, m10, m8 psubd m11, m12 pmaddwd m12, m13, m9 pmaddwd m10, m5 pmaddwd m13, m4 paddd m1, m12 vpbroadcastd m12, [r3+4* 2] psubd m10, m13 pmaddwd m13, m12, m2 pmaddwd m12, m7 paddd m0, m13 vpbroadcastd m13, [r3+4*10] psubd m11, m12 pmaddwd m12, m13, m2 pmaddwd m13, m7 paddd m1, m12 vpbroadcastd m12, [r3+4* 3] paddd m10, m13 pmaddwd m13, m12, m3 pmaddwd m12, m6 paddd m0, m13 vpbroadcastd m13, [r3+4*11] paddd m11, m12 pmaddwd m12, m13, m3 pmaddwd m13, m6 paddd m1, m12 vpbroadcastd m12, [r3+4* 4] psubd m10, m13 pmaddwd m13, m12, m6 pmaddwd m12, m3 paddd m0, m13 vpbroadcastd m13, [r3+4*12] psubd m11, m12 pmaddwd m12, m13, m6 pmaddwd m13, m3 paddd m1, m12 vpbroadcastd m12, [r3+4* 5] paddd m10, m13 pmaddwd m13, m12, m7 pmaddwd m12, m2 paddd m0, m13 vpbroadcastd m13, [r3+4*13] paddd m11, m12 pmaddwd m12, m13, m7 pmaddwd m13, m2 paddd m1, m12 vpbroadcastd m12, [r3+4* 6] psubd m10, m13 pmaddwd m13, m12, m4 pmaddwd m12, m9 paddd m0, m13 vpbroadcastd m13, [r3+4*14] psubd m11, m12 pmaddwd m12, m13, m4 pmaddwd m13, m9 paddd m1, m12 vpbroadcastd m12, [r3+4* 7] paddd m10, m13 pmaddwd m13, m12, m5 pmaddwd m12, m8 paddd m0, m13 ; b0 + 2n vpbroadcastd m13, [r3+4*15] add r3, 4*16 paddd m11, m12 ; b15 - 2n pmaddwd m12, m13, m5 pmaddwd m13, m8 paddd m1, m12 ; b1 + 2n psubd m10, m13 ; b14 - 2n ret %endmacro %macro IDCT32_1D_PACKED_FAST 0 lea r3, [o(dct32_mat)] call %%dct32_fast2 mova [rsp+gprsize+mmsize* 0], m0 ; b0 mova [rsp+gprsize+mmsize* 1], m1 ; b1 mova [rsp+gprsize+mmsize*14], m7 ; b15 mova 
[rsp+gprsize+mmsize*15], m6 ; b14 call %%dct32_fast2 mova [rsp+gprsize+mmsize* 2], m1 ; b3 mova [rsp+gprsize+mmsize* 3], m0 ; b2 mova [rsp+gprsize+mmsize*12], m6 ; b12 mova [rsp+gprsize+mmsize*13], m7 ; b13 call %%dct32_fast2 mova [rsp+gprsize+mmsize* 4], m0 ; b4 mova [rsp+gprsize+mmsize* 5], m1 ; b5 mova [rsp+gprsize+mmsize*10], m7 ; b11 mova [rsp+gprsize+mmsize*11], m6 ; b10 call %%dct32_fast2 mova [rsp+gprsize+mmsize* 6], m1 ; b7 mova [rsp+gprsize+mmsize* 7], m0 ; b6 mova [rsp+gprsize+mmsize* 8], m6 ; b8 mova [rsp+gprsize+mmsize* 9], m7 ; b9 ret %%dct32_fast2: vpbroadcastd m0, [r3+4* 0] vpbroadcastd m7, [r3+4* 1] vpbroadcastd m1, [r3+4* 8] vpbroadcastd m5, [r3+4* 9] vpbroadcastd m4, [r3+4* 2] vpbroadcastd m6, [r3+4*10] pmaddwd m0, m8 pmaddwd m7, m9 pmaddwd m1, m8 pmaddwd m5, m9 pmaddwd m4, m2 pmaddwd m6, m2 paddd m0, m7 vpbroadcastd m7, [r3+4* 3] paddd m1, m5 vpbroadcastd m5, [r3+4*11] pmaddwd m7, m3 paddd m0, m4 vpbroadcastd m4, [r3+4* 4] pmaddwd m5, m3 paddd m1, m6 vpbroadcastd m6, [r3+4*12] paddd m0, m7 ; b0 + 2n vpbroadcastd m7, [r3+4* 5] paddd m1, m5 ; b1 + 2n vpbroadcastd m5, [r3+4*13] pmaddwd m4, m3 pmaddwd m6, m3 pmaddwd m7, m2 pmaddwd m5, m2 psubd m7, m4 vpbroadcastd m4, [r3+4* 6] psubd m6, m5 vpbroadcastd m5, [r3+4*14] pmaddwd m4, m9 pmaddwd m5, m9 psubd m7, m4 vpbroadcastd m4, [r3+4* 7] paddd m6, m5 vpbroadcastd m5, [r3+4*15] add r3, 4*16 pmaddwd m4, m8 pmaddwd m5, m8 paddd m7, m4 ; b15 - 2n psubd m6, m5 ; b14 - 2n ret %endmacro %macro IDST4_1D_PACKED 0 %if WIN64 && xmm_regs_used <= 6 ; avoid xmm spills vpbroadcastd m2, [r3+4*0] punpcklwd m4, m0, m1 vpbroadcastd m3, [r3+4*3] punpckhwd m5, m1, m0 pmaddwd m0, m2, m4 pmaddwd m1, m3, m5 pmaddwd m3, m4 pmaddwd m2, m5 paddd m0, m1 vpbroadcastd m1, [r3+4*1] psubd m3, m2 vpbroadcastd m2, [r3+4*2] pmaddwd m1, m4 pmaddwd m2, m4 vpbroadcastd m4, [r3+4*2] pmaddwd m4, m5 psubd m1, m4 vpbroadcastd m4, [r3+4*1] pmaddwd m4, m5 paddd m2, m4 %else vpbroadcastd m4, [r3+4*0] punpcklwd m3, m0, m1 vpbroadcastd m5, [r3+4*1] 
punpckhwd m8, m1, m0 vpbroadcastd m6, [r3+4*2] pmaddwd m0, m4, m3 vpbroadcastd m7, [r3+4*3] pmaddwd m1, m5, m3 pmaddwd m2, m6, m3 pmaddwd m3, m7 REPX {pmaddwd x, m8}, m7, m6, m5, m4 paddd m0, m7 psubd m1, m6 paddd m2, m5 psubd m3, m4 %endif ret %endmacro %macro IDST8_1D_PACKED 0 SWAP 7, 8 ; reduces code size by avoiding C4 prefixes punpcklwd m8, m0, m1 ; 0 2 punpckhwd m9, m0, m1 ; 1 3 vpbroadcastd m0, [r3+16*0] vpbroadcastd m1, [r3+16*1] punpcklwd m10, m2, m3 ; 4 6 punpckhwd m11, m2, m3 ; 5 7 vpbroadcastd m2, [r3+16*2] vpbroadcastd m3, [r3+16*3] vpbroadcastd m4, [r3+16*4] vpbroadcastd m5, [r3+16*5] vpbroadcastd m6, [r3+16*6] vpbroadcastd m7, [r3+16*7] REPX {pmaddwd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 %assign %%i 1 %rep 3 %assign %%j 0 %assign %%k %%i+8 %rep 8 vpbroadcastd m8, [r3+%%i*4+%%j*16] pmaddwd m8, m%+%%k %if %%i == 3 && %%j == 7 SWAP 7, 8 ; swap back on the last instruction %endif paddd m%+%%j, m8 %assign %%j %%j+1 %endrep %assign %%i %%i+1 %endrep ret %endmacro %macro IDST16_1D_PACKED_1ROW 0 vpbroadcastd m0, [r3+4*0] vpbroadcastd m1, [r3+4*1] vpbroadcastd m8, [r3+4*2] vpbroadcastd m9, [r3+4*3] %if mmsize == 16 pmaddwd m0, m13 %else pmaddwd m0, m10 %endif pmaddwd m1, m11 pmaddwd m8, m12 pmaddwd m9, m2 paddd m0, m8 vpbroadcastd m8, [r3+4*4] paddd m1, m9 vpbroadcastd m9, [r3+4*5] pmaddwd m8, m3 pmaddwd m9, m4 paddd m0, m8 vpbroadcastd m8, [r3+4*6] paddd m1, m9 vpbroadcastd m9, [r3+4*7] pmaddwd m8, m5 pmaddwd m9, m6 paddd m0, m8 paddd m1, m9 paddd m0, m1 ret %endmacro %macro IDST16_1D_PACKED_2ROWS_FAST 1 ; row_offset vpbroadcastd m0, [r3+4*0] vpbroadcastd m3, [r3+4*1] vpbroadcastd m8, [r3+4*2] vpbroadcastd m9, [r3+4*3] %if mmsize == 16 pmaddwd m0, m13 %else pmaddwd m0, m10 %endif pmaddwd m3, m11 pmaddwd m8, m12 vpbroadcastd m1, [r3+4*0+32*%1] pmaddwd m9, m2 vpbroadcastd m4, [r3+4*1+32*%1] paddd m0, m8 vpbroadcastd m8, [r3+4*2+32*%1] paddd m3, m9 vpbroadcastd m9, [r3+4*3+32*%1] %if mmsize == 16 pmaddwd m1, m13 %else pmaddwd m1, m10 %endif pmaddwd m4, m11 
pmaddwd m8, m12 pmaddwd m9, m2 paddd m1, m8 paddd m4, m9 paddd m0, m3 paddd m1, m4 %endmacro %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ; in1 in3 punpcklqdq m0, m1 ; in0 in2 psubw m2, m0, m3 paddw m0, m3 punpckhqdq m2, m2 ; t2 t2 punpcklqdq m0, m0 ; t0 t0 psubw m1, m0, m2 psraw m1, 1 psubw m1, m3 ; t1 t3 psubw m0, m1 ; ____ out0 paddw m2, m1 ; out3 ____ %endmacro INIT_XMM avx2 INV_TXFM_FN 4, 4 mova m0, [cfq+16*0] mova m1, [cfq+16*1] jmp tx1q .dconly: vpbroadcastw m3, [cfq] .dconly2: vpbroadcastd m2, [o(pw_17)] paddsw m3, m2 psraw m3, 5 .dconly3: pxor m2, m2 lea r3, [dsq*3] mova [cfq], m2 .dconly_loop: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movd m1, [dstq+dsq*2] pinsrd m1, [dstq+r3 ], 1 punpcklbw m0, m2 punpcklbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 pextrd [dstq+dsq*2], m0, 2 pextrd [dstq+r3 ], m0, 3 lea dstq, [dstq+dsq*4] dec r4d jg .dconly_loop ret .pass1_dct: vpbroadcastd m5, [o(pd_64)] call .dct4 .pass1_end: REPX {psrad x, 7}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 .pass1_identity: punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2_dct: vpbroadcastd m5, [o(pd_512)] call .dct4 REPX {psrad x, 10}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 jmp .pass2_end ALIGN function_align .dct4: IDCT4_1D_PACKED m5 .wht: psraw m0, 3 psraw m1, 3 IWHT4_1D_PACKED punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED punpckhqdq m0, m1 punpcklqdq m1, m2 jmp .pass2_identity_end .pass2_identity: vpbroadcastd m2, [o(pw_4096)] pmulhrsw m0, m2 pmulhrsw m1, m2 .pass2_identity_end: test eobd, 0x300 jz .pass2_end test eobd, 0x200 jnz .vdpcm .hdpcm: psllq m2, m0, 16 psllq m3, m1, 16 paddw m0, m2 paddw m1, m3 psllq m2, m0, 32 psllq m3, m1, 32 jmp .dpcm_end .vdpcm: pslldq m2, m0, 8 paddw m1, m0 shufpd m3, m0, m1, 0x01 .dpcm_end: paddw m0, m2 paddw m1, m3 .pass2_end: lea r6, [dstq+dsq*2] movd m2, [dstq+dsq*0] pinsrd m2, 
[dstq+dsq*1], 1 movd m3, [r6 +dsq*0] pinsrd m3, [r6 +dsq*1], 1 pxor m4, m4 mova [cfq+16*0], m4 mova [cfq+16*1], m4 punpcklbw m2, m4 punpcklbw m3, m4 paddw m0, m2 paddw m1, m3 packuswb m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 pextrd [r6 +dsq*0], m0, 2 pextrd [r6 +dsq*1], m0, 3 RET .pass1_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst4_mat)] .pass1_dst: call .dst4 vpbroadcastd m4, [o(pd_64)] REPX {paddd x, m4}, m0, m1, m2, m3 jmp .pass1_end .pass2_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst4_mat)] .pass2_dst: call .dst4 vpbroadcastd m4, [o(pw_4096)] REPX {psrad x, 7}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 jmp .pass2_end ALIGN function_align .dst4: IDST4_1D_PACKED INIT_YMM avx2 INV_TXFM_FN 4, 8 vpbroadcastd m1, [o(pw_181x128)] pmulhrsw m0, m1, [cfq+32*0] pmulhrsw m1, [cfq+32*1] WIN64_SPILL_XMM 12 jmp tx1q .dconly: vpbroadcastw xm3, [cfq] vpbroadcastd xm2, [o(pw_181x128)] or r4d, 2 pmulhrsw xm3, xm2 jmp m(inv_txfm_add_4x4_8bpc).dconly2 .pass1_dct: vpbroadcastd m10, [o(pd_64)] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(inv_txfm_add_8x4_8bpc).dct4 .pass1_end: REPX {psrad x, 7}, m0, m1, m2, m3 .pass1_end2: packssdw m0, m1 packssdw m1, m2, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_512)] call .dct8 .pass2_end: psrad m0, 10 psrad m1, 10 call .write_4x4_dct_start psrad m0, m2, 10 psrad m1, m3, 10 call .write_4x4_dct jmp m(inv_txfm_add_8x4_8bpc).pass2_end3 ALIGN function_align .write_4x4_dct_start: lea r6, [dsq*3] pxor m10, m10 .write_4x4_dct: movd xm8, [dstq+dsq*0] vpbroadcastd m9, [dstq+dsq*1] packssdw m0, m1 movd xm1, [dstq+r6 ] vpblendd m8, m9, 0x10 vpbroadcastd m9, [dstq+dsq*2] vpblendd m1, m9, 0x10 punpckldq m8, m1 punpcklbw m8, m10 paddw m0, m8 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+dsq*0], xm0 
movd [dstq+dsq*1], xm1 pextrd [dstq+dsq*2], xm1, 1 pextrd [dstq+r6 ], xm0, 1 lea dstq, [dstq+dsq*4] ret ALIGN function_align .dct8: WRAP_XMM IDCT8_1D_PACKED vinserti128 m2, m3, xm2, 1 vinserti128 m0, m7, xm0, 1 vinserti128 m1, m6, xm5, 1 vinserti128 m4, m8, xm4, 1 paddd m2, m10 ; round paddd m3, m2, m0 ; a0 a1 psubd m2, m0 ; a3 a2 .dct8c: paddd m0, m3, m1 ; out0 out1 psubd m3, m1 ; out7 out6 paddd m1, m2, m4 ; out3 out2 psubd m2, m4 ; out4 out5 ret .pass1_identity: punpckhwd m2, m0, m1 punpcklwd m0, m1 vextracti128 xm3, m0, 1 vextracti128 xm4, m2, 1 punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 punpckhwd xm3, xm2, xm4 punpcklwd xm2, xm4 jmp tx2q .pass2_identity: vpbroadcastd m4, [o(pw_181x32)] lea r6, [dsq*3] pxor xm10, xm10 test eobd, 0x100 jnz .hdpcm pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 test eobd, 0x200 jnz .vdpcm call .write_4x4 pmulhrsw xm0, xm4, xm2 pmulhrsw xm1, xm4, xm3 call .write_4x4 jmp m(inv_txfm_add_8x4_8bpc).pass2_end3 .hdpcm: vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pmulhrsw m0, m4 pmulhrsw m1, m4 call .write_4x8_hdpcm jmp m(inv_txfm_add_8x4_8bpc).pass2_end3 .vdpcm: pmulhrsw xm2, xm4 pmulhrsw xm3, xm4 punpcklqdq xm4, xm10, xm0 paddw xm1, xm0 shufpd xm5, xm0, xm1, 0x01 paddw xm2, xm1 paddw xm0, xm4 shufpd xm4, xm1, xm2, 0x01 paddw xm3, xm2 paddw xm1, xm5 shufpd xm5, xm2, xm3, 0x01 call .write_4x4 paddw xm0, xm2, xm4 paddw xm1, xm3, xm5 call .write_4x4 jmp m(inv_txfm_add_8x4_8bpc).pass2_end3 ALIGN function_align .write_4x8_hdpcm: psllq m8, m0, 16 psllq m9, m1, 16 paddw m0, m8 paddw m1, m9 psllq m8, m0, 32 psllq m9, m1, 32 paddw m0, m8 paddw m1, m9 call .write_4x4 vextracti128 xm0, m0, 1 vextracti128 xm1, m1, 1 .write_4x4: movd xm8, [dstq+dsq*0] pinsrd xm8, [dstq+dsq*1], 1 movd xm9, [dstq+dsq*2] pinsrd xm9, [dstq+r6 ], 1 punpcklbw xm8, xm10 punpcklbw xm9, xm10 paddsw xm8, xm0 paddsw xm9, xm1 .write_4x4_end: packuswb xm8, xm9 movd [dstq+dsq*0], xm8 pextrd [dstq+dsq*1], xm8, 1 pextrd [dstq+dsq*2], xm8, 2 pextrd [dstq+r6 ], xm8, 3 lea dstq, [dstq+dsq*4] 
ret .pass1_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst4_mat)] .pass1_dst: vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(inv_txfm_add_8x4_8bpc).dst4 vpbroadcastd m4, [o(pd_64)] REPX {paddd x, m4}, m0, m1, m2, m3 jmp .pass1_end .pass2_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt8_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst8_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst8_mat)] .pass2_dst: call .dst8 vpbroadcastd m8, [o(pd_512)] vinserti128 m0, xm1, 1 vinserti128 m1, m3, xm2, 1 vinserti128 m2, m4, xm5, 1 vinserti128 m3, m7, xm6, 1 REPX {paddd x, m8}, m0, m1, m2, m3 jmp .pass2_end ALIGN function_align .dst8: WRAP_XMM IDST8_1D_PACKED INV_TXFM_FN 8, 4 WIN64_SPILL_XMM 12 jmp tx1q .dconly: movd xm5, [o(pw_181x128)] pmulhrsw xm5, [cfq] movd xm4, [o(pw_17)] paddw xm5, xm4 psraw xm5, 5 .dconly2: vpbroadcastw m5, xm5 lea r6, [dsq*3] pxor xm4, xm4 mova [cfq], xm4 .dconly_loop: movq xm0, [dstq+dsq*0] movq xm1, [dstq+dsq*1] vpbroadcastq m2, [dstq+dsq*2] vpbroadcastq m3, [dstq+r6 ] vpblendd m0, m2, 0x30 vpblendd m1, m3, 0x30 punpcklbw m0, m4 punpcklbw m1, m4 paddw m0, m5 paddw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] dec r4d jg .dconly_loop vzeroupper ret .pass1_dct: vpbroadcastd xm3, [o(pw_181x128)] pmulhrsw xm0, xm3, [cfq+16*0] pmulhrsw xm1, xm3, [cfq+16*1] pmulhrsw xm2, xm3, [cfq+16*2] pmulhrsw xm3, [cfq+16*3] vpbroadcastd m10, [o(pd_64)] call m(inv_txfm_add_4x8_8bpc).dct8 REPX {psrad x, 7}, m0, m1, m2, m3 packssdw m0, m2 ; 0 4 1 5 packssdw m1, m3 ; 3 7 2 6 vpermq m0, m0, q3120 ; 0 1 4 5 vpermq m1, m1, q1302 ; 2 3 6 7 .pass1_end: punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_512)] call .dct4 REPX {psrad x, 10}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 .pass2_end: call .write_8x4_start 
.pass2_end3: mova [cfq+32*0], m10 mova [cfq+32*1], m10 RET ALIGN function_align .write_8x4_hdpcm_start: lea r6, [dsq*3] pxor m10, m10 .write_8x4_hdpcm_vpermq: vpermq m0, m0, q3120 vpermq m1, m1, q3120 .write_8x4_hdpcm: pslldq m8, m0, 2 pslldq m9, m1, 2 paddw m0, m8 paddw m1, m9 pslldq m8, m0, 4 pslldq m9, m1, 4 paddw m0, m8 paddw m1, m9 punpcklqdq m8, m10, m0 punpcklqdq m9, m10, m1 paddw m0, m8 paddw m1, m9 jmp .write_8x4 .write_8x4_start: lea r6, [dsq*3] pxor m10, m10 .write_8x4_vpermq: vpermq m0, m0, q3120 vpermq m1, m1, q3120 .write_8x4: movq xm8, [dstq+dsq*0] vpbroadcastq m11, [dstq+dsq*1] movq xm9, [dstq+dsq*2] vpblendd m8, m11, 0x30 vpbroadcastq m11, [dstq+r6 ] vpblendd m9, m11, 0x30 punpcklbw m8, m10 punpcklbw m9, m10 paddw m0, m8 paddw m1, m9 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movq [dstq+dsq*1], xm1 movhps [dstq+dsq*2], xm0 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] ret ALIGN function_align .dct4: IDCT4_1D_PACKED m10 .pass1_identity: vpbroadcastd m2, [o(pw_181x128)] vpbroadcastd m3, [o(pw_53x256)] mova xm0, [cfq+16*0] vinserti128 m0, [cfq+16*2], 1 mova xm1, [cfq+16*1] vinserti128 m1, [cfq+16*3], 1 pmulhrsw m0, m2 pmulhrsw m1, m2 pmulhrsw m2, m3, m0 ; (x * 181 + 64) >> 7 pmulhrsw m3, m1 ; = x + ((x * 53 + 64) >> 7) paddw m0, m2 paddw m1, m3 jmp .pass1_end .pass2_identity: vpbroadcastd m2, [o(pw_4096)] pmulhrsw m0, m2 pmulhrsw m1, m2 test eobd, 0x300 jz .pass2_end test eobd, 0x200 jnz .vdpcm .hdpcm: call .write_8x4_hdpcm_start jmp .pass2_end3 .vdpcm: pslldq m2, m0, 8 paddw m1, m0 shufpd m3, m0, m1, 0x05 paddw m0, m2 paddw m1, m3 jmp .pass2_end .pass1_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt8_mat)] jmp .pass1_dst .pass1_flipadst: lea r3, [o(flipadst8_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst8_mat)] .pass1_dst: vpbroadcastd xm3, [o(pw_181x128)] pmulhrsw xm0, xm3, [cfq+16*0] pmulhrsw xm1, xm3, [cfq+16*1] pmulhrsw xm2, xm3, [cfq+16*2] pmulhrsw xm3, [cfq+16*3] call 
m(inv_txfm_add_4x8_8bpc).dst8 vpbroadcastd m8, [o(pd_64)] vinserti128 m0, xm4, 1 vinserti128 m1, xm5, 1 vinserti128 m2, xm6, 1 vinserti128 m3, xm7, 1 REPX {paddd x, m8}, m0, m1, m2, m3 REPX {psrad x, 7}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 jmp .pass1_end .pass2_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst4_mat)] .pass2_dst: call .dst4 vpbroadcastd m4, [o(pw_4096)] REPX {psrad x, 7}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 jmp .pass2_end ALIGN function_align .dst4: IDST4_1D_PACKED INV_TXFM_FN 8, 8 WIN64_SPILL_XMM 12 jmp tx1q .dconly: movd xm5, [o(pw_33)] paddsw xm5, [cfq] or r4d, 2 psraw xm5, 6 jmp m(inv_txfm_add_8x4_8bpc).dconly2 .pass1_dct: vpermq m0, [cfq+32*0], q3120 vpermq m1, [cfq+32*1], q3120 vpermq m2, [cfq+32*2], q3120 vpermq m3, [cfq+32*3], q3120 .pass1_dct2: vpbroadcastd m10, [o(pd_64)] call .dct8 .pass1_end: call .shift7_pack punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhdq m2, m4, m1 punpckldq m4, m1 vinserti128 m1, m3, xm2, 1 vperm2i128 m3, m2, 0x31 vperm2i128 m2, m0, m4, 0x31 vinserti128 m0, xm4, 1 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_1024)] call .dct8 REPX {psrad x, 11}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 REPX {psrad x, 11}, m4, m5, m6, m7 packssdw m2, m4, m5 packssdw m3, m6, m7 .pass2_end: lea r6, [dsq*3] pxor m10, m10 call .write_8x8 .pass2_end2: REPX {mova [cfq+32*x], m10}, 0, 1, 2, 3 RET ALIGN function_align .shift7_pack: REPX {psrad x, 7}, m0, m4, m1, m5 packssdw m0, m4 packssdw m1, m5 REPX {psrad x, 7}, m2, m6, m3, m7 packssdw m2, m6 packssdw m3, m7 ret ALIGN function_align .dct8: IDCT8_1D_PACKED paddd m3, m10 ; rnd paddd m2, m10 paddd m1, m2, m0 ; a1 psubd m2, m0 ; a2 paddd m0, m3, m7 ; a0 psubd m3, m7 ; a3 .dct8c: psubd m7, m0, m6 ; out7 paddd m0, m6 ; out0 psubd m6, m1, m5 ; out6 paddd m1, m5 ; out1 psubd m5, m2, m4 ; out5 paddd m2, m4 ; out2 psubd m4, 
m3, m8 ; out4 paddd m3, m8 ; out3 ret .pass1_identity: mova xm0, [cfq+16*0] vinserti128 m0, [cfq+16*4], 1 mova xm1, [cfq+16*1] vinserti128 m1, [cfq+16*5], 1 mova xm2, [cfq+16*2] vinserti128 m2, [cfq+16*6], 1 mova xm3, [cfq+16*3] vinserti128 m3, [cfq+16*7], 1 .pass1_identity2: vpbroadcastd m7, [o(pw_53x256)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(inv_txfm_add_4x16_8bpc).pass1_end2 .pass2_identity: vpbroadcastd m4, [o(pw_181x16)] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 test eobd, 0x300 jz .pass2_end lea r6, [dsq*3] pxor m10, m10 test eobd, 0x200 jnz .vdpcm .hdpcm: call m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm_vpermq vpermq m0, m2, q3120 vpermq m1, m3, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm jmp .pass2_end2 .vdpcm: call .write_8x8_vdpcm jmp .pass2_end2 ALIGN function_align .write_8x8_vdpcm: pslldq m8, m0, 8 paddw m1, m0 shufpd m9, m0, m1, 0x05 paddw m2, m1 paddw m0, m8 shufpd m8, m1, m2, 0x05 paddw m3, m2 paddw m1, m9 shufpd m9, m2, m3, 0x05 paddw m2, m8 paddw m3, m9 .write_8x8: call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq vpermq m0, m2, q3120 vpermq m1, m3, q3120 jmp m(inv_txfm_add_8x4_8bpc).write_8x4 .pass1_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt8_mat)] jmp .pass1_dst .pass1_flipadst: lea r3, [o(flipadst8_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst8_mat)] .pass1_dst: vpermq m0, [cfq+32*0], q3120 vpermq m1, [cfq+32*1], q3120 vpermq m2, [cfq+32*2], q3120 vpermq m3, [cfq+32*3], q3120 .pass1_dst2: call .dst8 vpbroadcastd m8, [o(pd_64)] REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 jmp .pass1_end .pass2_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt8_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst8_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst8_mat)] .pass2_dst: call .dst8 vpbroadcastd m8, [o(pw_4096)] REPX {psrad x, 8}, m0, m1, m2, m3 packssdw m0, m1 packssdw m1, m2, m3 
REPX {psrad x, 8}, m4, m5, m6, m7 packssdw m2, m4, m5 packssdw m3, m6, m7 REPX {pmulhrsw x, m8}, m0, m1, m2, m3 jmp .pass2_end ALIGN function_align .dst8: IDST8_1D_PACKED INV_TXFM_FN 4, 16 WIN64_SPILL_XMM 13 jmp tx1q .dconly: vpbroadcastw xm3, [cfq] vpbroadcastd xm2, [o(pw_512)] or r4d, 4 pmulhrsw xm3, xm2 jmp m(inv_txfm_add_4x4_8bpc).dconly3 .pass1_dct_fast: vbroadcasti128 m0, [cfq+32*0] vbroadcasti128 m2, [cfq+32*1] vbroadcasti128 m1, [cfq+32*2] vbroadcasti128 m3, [cfq+32*3] shufpd m0, m2, 0x0c shufpd m1, m3, 0x0c call m(inv_txfm_add_8x4_8bpc).dct4 .pass1_fast_end: REPX {psrad x, 6}, m0, m1, m2, m3 jmp m(inv_txfm_add_4x8_8bpc).pass1_end2 .pass1_dct: vpbroadcastd m10, [o(pd_32)] lea r3d, [eobq+(18<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r3d sub eobd, 26<<16 jl .pass1_dct_fast mova m0, [cfq+32*0] mova m1, [cfq+32*1] mova m2, [cfq+32*2] mova m3, [cfq+32*3] call m(inv_txfm_add_16x4_8bpc).dct4 .pass1_end: REPX {psrad x, 6}, m0, m4, m1, m5 packssdw m0, m4 packssdw m1, m5 REPX {psrad x, 6}, m2, m6, m3, m7 packssdw m2, m6 packssdw m3, m7 .pass1_end2: punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_2048)] test eobd, eobd jl .pass2_dct_fast vextracti128 xm4, m0, 1 vextracti128 xm5, m1, 1 vextracti128 xm6, m2, 1 vextracti128 xm7, m3, 1 call .dct16 jmp .pass2_dct2 .pass2_dct_fast: call .dct16_fast .pass2_dct2: mova m5, [cfq+32*0] mova m6, [cfq+32*1] psubd m4, m0, m5 ; out15 out14 paddd m0, m5 ; out0 out1 psubd m5, m1, m6 ; out12 out13 paddd m1, m6 ; out3 out2 psrad m0, 12 psrad m1, 12 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct_start mova m1, [cfq+32*2] mova m6, [cfq+32*3] paddd m0, m2, m1 ; out4 out5 psubd m2, m1 ; out11 out10 paddd m1, m3, m6 ; out7 out6 psubd m3, m6 ; out8 out9 psrad m0, 12 psrad m1, 12 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m3, 12 psrad m1, m2, 12 call 
m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m5, 12 psrad m1, m4, 12 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 ALIGN function_align .dct16_fast: WRAP_XMM IDCT16_1D_PACKED_FAST 0 ALIGN function_align .dct16: WRAP_XMM IDCT16_1D_PACKED 0 .pass1_identity: mova m0, [cfq+32*0] mova m1, [cfq+32*1] mova m2, [cfq+32*2] mova m3, [cfq+32*3] REPX {paddsw x, x}, m0, m1, m2, m3 lea r3d, [eobq-(32<<16)] test eobb, 0x10 ; TX_CLASS_V cmovnz eobd, r3d jmp .pass1_end2 .pass2_identity_fast: call m(inv_txfm_add_4x8_8bpc).write_4x4 pmulhrsw xm0, xm4, xm2 pmulhrsw xm1, xm4, xm3 call m(inv_txfm_add_4x8_8bpc).write_4x4 jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 .pass2_identity: vpbroadcastd m4, [o(pw_2048)] lea r6, [dsq*3] pxor m10, m10 pmulhrsw m0, m4 pmulhrsw m1, m4 test eobd, eobd jl .pass2_identity_fast pmulhrsw m2, m4 pmulhrsw m3, m4 test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm call .write_4x16 jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 .hdpcm: call .write_4x16_hdpcm jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 .vdpcm: call .write_4x16_vdpcm jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 ALIGN function_align .write_4x16_hdpcm: psllq m8, m0, 16 psllq m9, m1, 16 paddw m0, m8 paddw m1, m9 psllq m8, m0, 32 psllq m9, m1, 32 paddw m0, m8 paddw m1, m9 psllq m8, m2, 16 psllq m9, m3, 16 paddw m2, m8 paddw m3, m9 psllq m8, m2, 32 psllq m9, m3, 32 paddw m2, m8 paddw m3, m9 .write_4x16: call .write_4x8 vextracti128 xm0, m0, 1 vextracti128 xm1, m1, 1 vextracti128 xm2, m2, 1 vextracti128 xm3, m3, 1 .write_4x8: call m(inv_txfm_add_4x8_8bpc).write_4x4 movd xm8, [dstq+dsq*0] pinsrd xm8, [dstq+dsq*1], 1 movd xm9, [dstq+dsq*2] pinsrd xm9, [dstq+r6 ], 1 pmovzxbw xm8, xm8 pmovzxbw xm9, xm9 paddsw xm8, xm2 paddsw xm9, xm3 jmp m(inv_txfm_add_4x8_8bpc).write_4x4_end .write_4x16_vdpcm: pslldq m8, m0, 8 paddsw m1, m0 shufpd m9, m0, m1, 0x05 paddsw m2, m1 paddsw m0, m8 shufpd m8, m1, m2, 0x05 paddsw m3, m2 paddsw m1, m9 shufpd m9, m2, m3, 0x05 paddsw m2, m8 paddsw m3, 
m9 call .write_4x8 punpckhqdq xm8, xm3, xm3 vextracti128 xm0, m0, 1 vextracti128 xm1, m1, 1 vextracti128 xm2, m2, 1 vextracti128 xm3, m3, 1 REPX {paddsw x, xm8}, xm0, xm1, xm2, xm3 jmp .write_4x8 .pass1_dst_fast: vbroadcasti128 m0, [cfq+32*0] vbroadcasti128 m2, [cfq+32*1] vbroadcasti128 m1, [cfq+32*2] vbroadcasti128 m3, [cfq+32*3] shufpd m0, m2, 0x0c shufpd m1, m3, 0x0c call m(inv_txfm_add_8x4_8bpc).dst4 REPX {paddd x, m12}, m0, m1, m2, m3 jmp .pass1_fast_end .pass1_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst4_mat)] .pass1_dst: vpbroadcastd m12, [o(pd_32)] %if WIN64 push r8 %endif lea r8d, [eobq+(18<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r8d %if WIN64 pop r8 %endif sub eobd, 26<<16 jl .pass1_dst_fast mova m0, [cfq+32*0] mova m1, [cfq+32*1] mova m2, [cfq+32*2] mova m3, [cfq+32*3] call m(inv_txfm_add_16x4_8bpc).dst4 REPX {paddd x, m12}, m0, m4, m1, m5, m2, m6, m3, m7 jmp .pass1_end .pass2_flipddt: lea r3, [o(flipddt16_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt16_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst16_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst16_mat)] .pass2_dst: vpbroadcastd m7, [o(pd_2048)] lea r6, [dsq*3] pxor m10, m10 %if WIN64 movaps [cfq+16*3], xm13 %endif punpcklwd m13, m0, m1 ; 0 2 punpckhwd m11, m0, m1 ; 1 3 punpcklwd m12, m2, m3 ; 4 6 punpckhwd m2, m3 ; 5 7 test eobd, eobd jl .pass2_dst_fast vextracti128 xm3, m13, 1 ; 8 10 vextracti128 xm4, m11, 1 ; 9 11 vextracti128 xm5, m12, 1 ; 12 14 vextracti128 xm6, m2, 1 ; 13 15 call .pass2_dst_4x4 call .pass2_dst_4x4 call .pass2_dst_4x4 call .pass2_dst_4x4 %if WIN64 movaps xm13, [cfq+16*3] %endif jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 .pass2_dst_fast: call .pass2_dst_4x4_fast call .pass2_dst_4x4_fast call .pass2_dst_4x4_fast call .pass2_dst_4x4_fast %if WIN64 movaps xm13, [cfq+16*3] %endif jmp m(inv_txfm_add_16x4_8bpc).pass2_end2 ALIGN function_align .pass2_dst_4x4: call .dst16x1 add r3, 32 mova [cfq+16*0], xm0 call .dst16x1 add r3, 32 
mova [cfq+16*1], xm0 call .dst16x1 add r3, 32 mova [cfq+16*2], xm0 call .dst16x1 add r3, 32 vinserti128 m1, m0, [cfq+16*2], 1 paddd m0, m7, [cfq+32*0] jmp .pass2_dst_4x4b .pass2_dst_4x4_fast: call .dst16x2 vinserti128 m6, m0, xm1, 1 call .dst16x2 vinserti128 m1, xm0, 1 paddd m0, m7, m6 .pass2_dst_4x4b: paddd m1, m7 psrad m0, 12 psrad m1, 12 jmp m(inv_txfm_add_4x8_8bpc).write_4x4_dct ALIGN function_align .dst16x1: WRAP_XMM IDST16_1D_PACKED_1ROW ALIGN function_align .dst16x2: WRAP_XMM IDST16_1D_PACKED_2ROWS_FAST 1 add r3, 32*2 ret INV_TXFM_FN 16, 4 mova xm0, [cfq+16*0] mova xm1, [cfq+16*1] mova xm2, [cfq+16*2] mova xm3, [cfq+16*3] WIN64_SPILL_XMM 13 jmp tx1q .dconly: movd xm3, [o(pw_512)] or r4d, 2 .dconly2: pmulhrsw xm3, [cfq] .dconly3: vpbroadcastw m3, xm3 pxor xm2, xm2 mova [cfq], xm2 .dconly_loop: movu xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 movu [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] dec r4d jg .dconly_loop vzeroupper ret .pass1_dct: vpbroadcastd m10, [o(pd_32)] lea r3d, [eobq+(3<<16)] test eobb, 0x10 ; TX_CLASS_H cmovz eobd, r3d cmp eobd, 32<<16 jl .pass1_dct_fast mova xm4, [cfq+16*4] mova xm5, [cfq+16*5] mova xm6, [cfq+16*6] mova xm7, [cfq+16*7] call m(inv_txfm_add_4x16_8bpc).dct16 jmp .pass1_dct2 .pass1_dct_fast: call m(inv_txfm_add_4x16_8bpc).dct16_fast .pass1_dct2: mova m6, [cfq+32*0] mova m5, [cfq+32*1] mova m4, [cfq+32*2] mova m8, [cfq+32*3] psubd m7, m0, m6 ; out15 out14 paddd m0, m6 ; out0 out1 psubd m6, m1, m5 ; out12 out13 paddd m1, m5 ; out3 out2 psubd m5, m2, m4 ; out11 out10 paddd m2, m4 ; out4 out5 psubd m4, m3, m8 ; out8 out9 paddd m3, m8 ; out7 out6 REPX {psrad x, 6}, m0, m4, m1, m5 packssdw m0, m4 packssdw m1, m5 REPX {psrad x, 6}, m6, m2, m7, m3 packssdw m2, m6 packssdw m3, m7 vpermq m0, m0, q3120 ; 0 1 8 9 vpermq m1, m1, q1302 ; 2 3 10 11 vpermq m2, m2, q3120 ; 4 5 12 13 vpermq m3, m3, q1302 ; 6 7 14 15 .pass1_end: 
punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m4 punpcklwd m0, m4 punpckhwd m4, m2, m1 punpcklwd m2, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_2048)] call .dct4 REPX {psrad x, 12}, m0, m4, m1, m5 packssdw m0, m4 packssdw m1, m5 call .write_16x2 REPX {psrad x, 12}, m2, m6, m3, m7 packssdw m0, m2, m6 packssdw m1, m3, m7 .pass2_end: call .write_16x2 .pass2_end2: pxor m10, m10 .pass2_end3: REPX {mova [cfq+32*x], m10}, 0, 1, 2, 3 RET ALIGN function_align .write_16x2_hdpcm: pslldq m8, m0, 2 pslldq m9, m1, 2 paddw m0, m8 paddw m1, m9 pslldq m8, m0, 4 pslldq m9, m1, 4 paddw m0, m8 paddw m1, m9 pslldq m8, m0, 8 pslldq m9, m1, 8 paddw m0, m8 paddw m1, m9 pshufhw xm8, xm0, q3333 pshufhw xm9, xm1, q3333 vpermq m8, m8, q1133 vpermq m9, m9, q1133 paddw m0, m8 paddw m1, m9 .write_16x2: pmovzxbw m8, [dstq+dsq*0] pmovzxbw m9, [dstq+dsq*1] paddw m8, m0 paddw m9, m1 packuswb m8, m9 vpermq m8, m8, q3120 movu [dstq+dsq*0], xm8 vextracti128 [dstq+dsq*1], m8, 1 lea dstq, [dstq+dsq*2] ret ALIGN function_align .dct4: punpcklwd m7, m0, m2 ; 0 2 vpbroadcastd m6, [o(pw_64_64)] punpckhwd m0, m2 vpbroadcastd m8, [o(pw_64_m64)] punpckhwd m5, m3, m1 ; 3 1 vpbroadcastd m4, [o(pw_35_83)] punpcklwd m1, m3, m1 vpbroadcastd m3, [o(pw_m83_35)] pmaddwd m2, m6, m7 ; a0 pmaddwd m6, m0 pmaddwd m7, m8 ; a1 pmaddwd m8, m0 pmaddwd m0, m4, m1 ; b0 pmaddwd m4, m5 pmaddwd m1, m3 ; b1 pmaddwd m5, m3 REPX {paddd x, m10}, m2, m6, m7, m8 psubd m3, m2, m0 ; out3a paddd m0, m2 ; out0a psubd m2, m7, m1 ; out2a paddd m1, m7 ; out1a psubd m7, m6, m4 ; out3b paddd m4, m6 ; out0b psubd m6, m8, m5 ; out2b paddd m5, m8 ; out1b ret .pass1_identity: vinserti128 m0, [cfq+16*4], 1 ; 0 1 8 9 vinserti128 m1, [cfq+16*5], 1 ; 2 3 10 11 vinserti128 m2, [cfq+16*6], 1 ; 4 5 12 13 vinserti128 m3, [cfq+16*7], 1 ; 6 7 14 15 REPX {paddsw x, x}, m0, m1, m2, m3, m0, m1, m2, m3 jmp .pass1_end .pass2_identity: 
vpbroadcastd m4, [o(pw_1024)] pmulhrsw m0, m4 pmulhrsw m1, m4 test eobd, 0x300 jz .pass2_dst_end test eobd, 0x200 jnz .vdpcm .hdpcm: call .write_16x2_hdpcm pmulhrsw m0, m4, m2 pmulhrsw m1, m4, m3 call .write_16x2_hdpcm jmp .pass2_end2 .vdpcm: pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m1, m0 call .write_16x2 paddw m0, m2, m1 paddw m1, m3, m0 jmp .pass2_end .pass1_flipddt: lea r3, [o(flipddt16_mat+32*6)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt16_mat+32*6)] jmp .pass1_dst .pass1_flipadst: lea r3, [o(flipadst16_mat+32*6)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst16_mat+32*6)] .pass1_dst: vpbroadcastd m7, [o(pd_32)] %if WIN64 movaps xm10, xm13 push r8 %define tmp rsp+8 %else %define tmp rsp-24 %endif punpcklwd xm13, xm0, xm1 ; 0 2 punpckhwd xm11, xm0, xm1 ; 1 3 punpcklwd xm12, xm2, xm3 ; 4 6 punpckhwd xm2, xm3 ; 5 7 lea r8d, [eobq+(3<<16)] test eobb, 0x10 ; TX_CLASS_H cmovz eobd, r8d mov r8d, 32*3 cmp eobd, 32<<16 jl .pass1_dst_fast mova xm4, [cfq+16*4] mova xm5, [cfq+16*5] mova xm6, [cfq+16*6] mova xm0, [cfq+16*7] punpcklwd xm3, xm4, xm5 ; 8 10 punpckhwd xm4, xm5 ; 9 11 punpcklwd xm5, xm6, xm0 ; 12 14 punpckhwd xm6, xm0 ; 13 15 .pass1_dst_loop: call m(inv_txfm_add_4x16_8bpc).dst16x1 ; 6 add r3, 32*8 mova [cfq+16*0], xm0 call m(inv_txfm_add_4x16_8bpc).dst16x1 ; 14 add r3, 32*1 mova [cfq+16*1], xm0 call m(inv_txfm_add_4x16_8bpc).dst16x1 ; 15 sub r3, 32*8 mova [tmp], xm0 call m(inv_txfm_add_4x16_8bpc).dst16x1 ; 7 sub r3, 32*3 vinserti128 m1, m0, [tmp], 1 paddd m0, m7, [cfq] paddd m1, m7 psrad m0, 6 ; 6 14 psrad m1, 6 ; 7 15 packssdw m0, m1 mova [cfq+r8], m0 sub r8d, 32 jge .pass1_dst_loop jmp .pass1_dst_end .pass1_dst_fast: call .dst16x2_fast add r3, 32 vinserti128 m6, m0, xm1, 1 call .dst16x2_fast sub r3, 32*3 vinserti128 m1, m0, xm1, 1 paddd m0, m7, m6 paddd m1, m7 psrad m0, 6 ; 6 14 psrad m1, 6 ; 7 15 packssdw m0, m1 mova [cfq+r8], m0 sub r8d, 32 jge .pass1_dst_fast .pass1_dst_end: mova m1, [cfq+32*1] mova m2, [cfq+32*2] mova m3, [cfq+32*3] %if WIN64 movaps xm13, xm10 
pop r8 %endif jmp .pass1_end .pass2_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst4_mat)] .pass2_dst: call .dst4 call m(inv_txfm_add_8x8_8bpc).shift7_pack vpbroadcastd m4, [o(pw_1024)] pmulhrsw m0, m4 pmulhrsw m1, m4 .pass2_dst_end: call .write_16x2 pmulhrsw m0, m4, m2 pmulhrsw m1, m4, m3 jmp .pass2_end ALIGN function_align .dst16x2_fast: WRAP_XMM IDST16_1D_PACKED_2ROWS_FAST 8 ret ALIGN function_align .dst4: vpbroadcastd m5, [r3+4*0] punpcklwd m8, m0, m2 ; 0 2 vpbroadcastd m7, [r3+4*3] punpckhwd m9, m0, m2 vpbroadcastd m10, [r3+4*1] punpcklwd m2, m3, m1 ; 3 1 vpbroadcastd m11, [r3+4*2] punpckhwd m6, m3, m1 pmaddwd m0, m5, m8 pmaddwd m3, m7, m2 pmaddwd m4, m5, m9 pmaddwd m1, m7, m6 paddd m0, m3 ; out0a paddd m4, m1 ; out0b pmaddwd m3, m7, m8 pmaddwd m1, m5, m2 pmaddwd m7, m9 pmaddwd m5, m6 psubd m3, m1 ; out3a psubd m7, m5 ; out3b pmaddwd m1, m10, m8 pmaddwd m5, m11, m2 pmaddwd m8, m11 pmaddwd m2, m10 psubd m1, m5 ; out1a paddd m2, m8 ; out2a pmaddwd m5, m10, m9 pmaddwd m9, m11 pmaddwd m11, m6 pmaddwd m6, m10 psubd m5, m11 ; out1b paddd m6, m9 ; out2b ret INV_TXFM_FN 8, 16 add cfq, 32*4 WIN64_SPILL_XMM 13 vpbroadcastd m8, [o(pw_181x128)] jmp tx1q .dconly: movd xm5, [o(pw_181x128)] pmulhrsw xm5, [cfq] movd xm4, [o(pw_33)] or r4d, 4 paddw xm5, xm4 psraw xm5, 6 jmp m(inv_txfm_add_8x4_8bpc).dconly2 ALIGN function_align .pass1_fast_load: vbroadcasti128 m0, [cfq-32*4] vbroadcasti128 m4, [cfq-32*3] vbroadcasti128 m1, [cfq-32*2] vbroadcasti128 m5, [cfq-32*1] vbroadcasti128 m2, [cfq+32*0] vbroadcasti128 m6, [cfq+32*1] vbroadcasti128 m3, [cfq+32*2] vbroadcasti128 m7, [cfq+32*3] shufpd m0, m4, 0x0c shufpd m1, m5, 0x0c shufpd m2, m6, 0x0c shufpd m3, m7, 0x0c ret .pass1_dct_fast: call .pass1_fast_load REPX {pmulhrsw x, m8}, m0, m1, m2, m3 jmp m(inv_txfm_add_8x8_8bpc).pass1_dct2 .pass1_dct: lea r3d, [eobq+(28<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r3d sub eobd, 36<<16 jl .pass1_dct_fast pmulhrsw m1, m8, [cfq-32*3] pmulhrsw m3, m8, 
[cfq-32*1] pmulhrsw m5, m8, [cfq+32*1] pmulhrsw m7, m8, [cfq+32*3] call m(inv_txfm_add_16x8_8bpc).dct8 mova [cfq-32*3], m0 mova [cfq-32*1], m1 mova [cfq+32*1], m2 mova [cfq+32*3], m3 vpbroadcastd m3, [o(pw_181x128)] pmulhrsw m0, m3, [cfq-32*4] pmulhrsw m1, m3, [cfq-32*2] pmulhrsw m2, m3, [cfq+32*0] pmulhrsw m3, [cfq+32*2] mova [cfq-32*4], m4 mova [cfq-32*2], m5 mova [cfq+32*0], m6 mova [cfq+32*2], m7 vpbroadcastd m10, [o(pd_64)] call m(inv_txfm_add_16x4_8bpc).dct4 mova m8, [cfq-32*3] ; b0 mova m9, [cfq-32*4] psubd m10, m0, m8 ; out7a paddd m0, m8 ; out0a mova m8, [cfq+32*3] ; b3 psubd m11, m4, m9 ; out7b paddd m4, m9 ; out0b mova m9, [cfq+32*2] REPX {psrad x, 7}, m0, m4, m10, m11 packssdw m0, m4 psubd m4, m3, m8 ; out4a paddd m3, m8 ; out3a psubd m8, m7, m9 ; out4b paddd m7, m9 ; out3b mova m9, [cfq-32*1] ; b1 REPX {psrad x, 7}, m4, m8, m3, m7 packssdw m4, m8 mova m8, [cfq-32*2] packssdw m3, m7 packssdw m7, m10, m11 psubd m10, m1, m9 ; out6a paddd m1, m9 ; out1a mova m9, [cfq+32*1] ; b2 psubd m11, m5, m8 ; out6b paddd m5, m8 ; out1b mova m8, [cfq+32*0] REPX {psrad x, 7}, m1, m5, m10, m11 packssdw m1, m5 psubd m5, m2, m9 ; out5a paddd m2, m9 ; out2a psubd m9, m6, m8 ; out5b paddd m6, m8 ; out2b REPX {psrad x, 7}, m5, m9, m2, m6 packssdw m5, m9 packssdw m2, m6 packssdw m6, m10, m11 .pass1_end: call .transpose16x8 jmp tx2q .pass2_dct: vpbroadcastd m10, [o(pd_1024)] test eobd, eobd jl .pass2_dct_fast call .dct16 jmp .pass2_dct2 .pass2_dct_fast: call .dct16_fast .pass2_dct2: mova m12, [cfq-32*4] mova m9, [cfq-32*3] psubd m8, m0, m12 ; out15 paddd m0, m12 ; out0 psubd m12, m1, m9 ; out14 paddd m1, m9 ; out1 REPX {psrad x, 11}, m0, m1, m8, m12 packssdw m0, m1 mova m1, [cfq-32*1] packssdw m12, m8 mova m8, [cfq-32*2] psubd m9, m2, m1 ; out13 paddd m1, m2 ; out2 paddd m2, m3, m8 ; out3 psubd m3, m8 ; out12 REPX {psrad x, 11}, m1, m2, m9, m3 packssdw m1, m2 packssdw m3, m9 call m(inv_txfm_add_8x4_8bpc).write_8x4_start mova m1, [cfq+32*0] mova m2, [cfq+32*1] paddd m0, m4, m1 ; 
out4 psubd m4, m1 ; out11 paddd m1, m5, m2 ; out5 psubd m5, m2 ; out10 mova m2, [cfq+32*3] mova m8, [cfq+32*2] REPX {psrad x, 11}, m0, m1, m4, m5 packssdw m0, m1 paddd m1, m6, m2 ; out6 psubd m6, m2 ; out9 psubd m2, m7, m8 ; out8 paddd m7, m8 ; out7 REPX {psrad x, 11}, m1, m7, m6, m2 packssdw m1, m7 call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq packssdw m0, m2, m6 packssdw m1, m5, m4 call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq vpermq m0, m3, q3120 vpermq m1, m12, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 ALIGN function_align .dct16_fast: IDCT16_1D_PACKED_FAST -4 ALIGN function_align .dct16: IDCT16_1D_PACKED -4 ALIGN function_align .transpose16x8: vperm2i128 m9, m3, m7, 0x31 vinserti128 m3, xm7, 1 vperm2i128 m8, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m0, m4, 0x31 vinserti128 m0, xm4, 1 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m7, m5, m6 punpcklwd m5, m6 punpcklwd m6, m8, m9 punpckhwd m8, m9 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpckldq m6, m7, m8 punpckhdq m7, m8 ret .pass1_identity_fast: mova xm0, [cfq-32*4] vinserti128 m0, [cfq+32*0], 1 mova xm1, [cfq-32*3] vinserti128 m1, [cfq+32*1], 1 mova xm2, [cfq-32*2] vinserti128 m2, [cfq+32*2], 1 mova xm3, [cfq-32*1] vinserti128 m3, [cfq+32*3], 1 REPX {pmulhrsw x, m8}, m0, m1, m2, m3 jmp m(inv_txfm_add_8x8_8bpc).pass1_identity2 .pass1_identity: lea r3d, [eobq-(64<<16)] test eobb, 0x10 ; TX_CLASS_V cmovnz eobd, r3d test eobd, eobd jl .pass1_identity_fast pmulhrsw m0, m8, [cfq-32*4] pmulhrsw m1, m8, [cfq-32*3] pmulhrsw m2, m8, [cfq-32*2] pmulhrsw m3, m8, [cfq-32*1] vpbroadcastd m12, [o(pw_53x256)] pmulhrsw m4, m8, [cfq+32*0] pmulhrsw m5, m8, [cfq+32*1] pmulhrsw m6, m8, [cfq+32*2] pmulhrsw m7, m8, [cfq+32*3] pmulhrsw m8, m12, m0 pmulhrsw m9, m12, m1 pmulhrsw m10, m12, m2 pmulhrsw m11, m12, m3 
paddw m0, m8 pmulhrsw m8, m12, m4 paddw m1, m9 pmulhrsw m9, m12, m5 paddw m2, m10 pmulhrsw m10, m12, m6 paddw m3, m11 pmulhrsw m12, m7 paddw m4, m8 paddw m5, m9 paddw m6, m10 paddw m7, m12 jmp .pass1_end .pass2_identity: vpbroadcastd m8, [o(pw_4096)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3 lea r6, [dsq*3] pxor m10, m10 test eobd, eobd jl .pass2_identity_fast REPX {pmulhrsw x, m8}, m4, m5, m6, m7 test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm call .write_8x16 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .hdpcm: call .write_8x16_hdpcm jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .vdpcm: call .write_8x16_vdpcm jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .pass2_identity_fast: call m(inv_txfm_add_8x8_8bpc).write_8x8 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 ALIGN function_align .write_8x16_hdpcm: call m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm_vpermq vpermq m0, m2, q3120 vpermq m1, m3, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm vpermq m0, m4, q3120 vpermq m1, m5, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm vpermq m0, m6, q3120 vpermq m1, m7, q3120 jmp m(inv_txfm_add_8x4_8bpc).write_8x4_hdpcm .write_8x16_vdpcm: punpcklqdq m8, m10, m0 paddw m1, m0 shufpd m9, m0, m1, 0x05 paddw m2, m1 paddw m0, m8 shufpd m8, m1, m2, 0x05 paddw m3, m2 paddw m1, m9 shufpd m9, m2, m3, 0x05 paddw m4, m3 paddw m2, m8 shufpd m8, m3, m4, 0x05 paddw m5, m4 paddw m3, m9 shufpd m9, m4, m5, 0x05 paddw m6, m5 paddw m4, m8 shufpd m8, m5, m6, 0x05 paddw m7, m6 paddw m5, m9 shufpd m9, m6, m7, 0x05 paddw m6, m8 paddw m7, m9 .write_8x16: call m(inv_txfm_add_8x8_8bpc).write_8x8 vpermq m0, m4, q3120 vpermq m1, m5, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4 vpermq m0, m6, q3120 vpermq m1, m7, q3120 jmp m(inv_txfm_add_8x4_8bpc).write_8x4 .pass1_dst_fast: call .pass1_fast_load REPX {pmulhrsw x, m8}, m0, m1, m2, m3 %if WIN64 pop r8 %endif jmp m(inv_txfm_add_8x8_8bpc).pass1_dst2 .pass1_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt8_mat)] jmp .pass1_dst .pass1_flipadst: 
lea r3, [o(flipadst8_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst8_mat)] .pass1_dst: %if WIN64 push r8 %endif lea r8d, [eobq+(28<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r8d sub eobd, 36<<16 jl .pass1_dst_fast pmulhrsw m0, m8, [cfq-32*4] pmulhrsw m1, m8, [cfq-32*3] pmulhrsw m2, m8, [cfq-32*2] pmulhrsw m3, m8, [cfq-32*1] pmulhrsw m4, m8, [cfq+32*0] pmulhrsw m5, m8, [cfq+32*1] pmulhrsw m6, m8, [cfq+32*2] pmulhrsw m7, m8, [cfq+32*3] punpcklwd m10, m0, m2 ; 0 2 punpckhwd m0, m2 punpcklwd m2, m1, m3 ; 1 3 punpckhwd m1, m3 punpcklwd m3, m4, m6 ; 4 6 punpckhwd m4, m6 punpcklwd m11, m5, m7 ; 5 7 punpckhwd m5, m7 vpbroadcastd m12, [o(pd_64)] mov r8, -32*7 .pass1_dst_loop: call m(inv_txfm_add_16x8_8bpc).dst8 paddd m6, m12 paddd m7, m12 psrad m6, 7 psrad m7, 7 packssdw m7, m6, m7 mova [cfq+r8+32*3], m7 add r8, 32 jle .pass1_dst_loop %if WIN64 pop r8 %endif mova m0, [cfq-32*4] mova m1, [cfq-32*3] mova m2, [cfq-32*2] mova m3, [cfq-32*1] mova m4, [cfq+32*0] mova m5, [cfq+32*1] mova m6, [cfq+32*2] jmp .pass1_end .pass2_dst_fast: IDST16_1D_PACKED_2ROWS_FAST 1 add r3, 32*2 psrad m0, 8 psrad m1, 8 packssdw m0, m1 vpermq m0, m0, q3120 mova [cfq+r5+32*3], m0 add r5, 32 jle .pass2_dst_fast jmp .pass2_dst_end .pass2_flipddt: lea r3, [o(flipddt16_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt16_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst16_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst16_mat)] .pass2_dst: punpcklwd m10, m0, m1 ; 0 2 punpckhwd m11, m0, m1 ; 1 3 punpcklwd m12, m2, m3 ; 4 6 punpckhwd m2, m3 ; 5 7 mov r5, -32*7 test eobd, eobd jl .pass2_dst_fast punpcklwd m3, m4, m5 ; 8 10 punpckhwd m4, m5 ; 9 11 punpcklwd m5, m6, m7 ; 12 14 punpckhwd m6, m7 ; 13 15 .pass2_dst_loop: call .dst16x1 add r3, 32 psrad m7, m0, 8 call .dst16x1 add r3, 32 psrad m0, 8 packssdw m0, m7 vpermq m0, m0, q2031 mova [cfq+r5+32*3], m0 add r5, 32 jle .pass2_dst_loop .pass2_dst_end: vpbroadcastd m2, [o(pw_4096)] pmulhrsw m0, m2, [cfq-32*4] pmulhrsw m1, m2, [cfq-32*3] lea r6, 
[dsq*3] pxor m10, m10 call m(inv_txfm_add_8x4_8bpc).write_8x4 pmulhrsw m0, m2, [cfq-32*2] pmulhrsw m1, m2, [cfq-32*1] call m(inv_txfm_add_8x4_8bpc).write_8x4 pmulhrsw m0, m2, [cfq+32*0] pmulhrsw m1, m2, [cfq+32*1] call m(inv_txfm_add_8x4_8bpc).write_8x4 pmulhrsw m0, m2, [cfq+32*2] pmulhrsw m1, m2, [cfq+32*3] call m(inv_txfm_add_8x4_8bpc).write_8x4 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 ALIGN function_align .dst16x1: IDST16_1D_PACKED_1ROW ret INV_TXFM_FN 16, 8 add cfq, 32*4 WIN64_SPILL_XMM 13 vpbroadcastd m8, [o(pw_181x128)] jmp tx1q .dconly: movd xm3, [o(pw_181x128)] pmulhrsw xm3, [cfq] movd xm2, [o(pw_33)] or r4d, 4 paddw xm3, xm2 psraw xm3, 6 jmp m(inv_txfm_add_16x4_8bpc).dconly3 .pass1_dct: pmulhrsw m0, m8, [cfq-32*4] pmulhrsw m1, m8, [cfq-32*3] pmulhrsw m2, m8, [cfq-32*2] pmulhrsw m3, m8, [cfq-32*1] vpbroadcastd m10, [o(pd_64)] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 lea r3d, [eobq+(21<<16)] test eobb, 0x10 ; TX_CLASS_H cmovz eobd, r3d cmp eobd, 64<<16 jl .pass1_dct_fast pmulhrsw m4, m8, [cfq+32*0] pmulhrsw m5, m8, [cfq+32*1] pmulhrsw m6, m8, [cfq+32*2] pmulhrsw m7, m8, [cfq+32*3] REPX {vpermq x, x, q3120}, m4, m5, m6, m7 call m(inv_txfm_add_8x16_8bpc).dct16 jmp .pass1_dct2 .pass1_dct_fast: call m(inv_txfm_add_8x16_8bpc).dct16_fast .pass1_dct2: %macro IDCT_8X16_PASS1_END 5 ; a[1-2], b_mem[1-2], shift mova m9, [cfq+32*%3] mova m10, [cfq+32*%4] psubd m8, %1, m9 paddd %1, m9 psubd m9, %2, m10 paddd %2, m10 REPX {psrad x, %5}, %1, m9, m8, %2 packssdw %1, m9 packssdw %2, m8 %endmacro IDCT_8X16_PASS1_END m0, m7, -4, 2, 7 IDCT_8X16_PASS1_END m1, m6, -3, 3, 7 IDCT_8X16_PASS1_END m2, m5, -1, 1, 7 IDCT_8X16_PASS1_END m3, m4, -2, 0, 7 .pass1_end: call .transpose8x8_vpermq jmp tx2q .pass2_dct: mova [cfq+32*0], m0 mova [cfq+32*1], m2 mova [cfq+32*2], m4 mova [cfq+32*3], m6 call .dct8 mova [cfq-32*4], m0 mova [cfq-32*3], m1 mova [cfq-32*2], m2 mova [cfq-32*1], m3 mova m0, [cfq+32*0] mova m1, [cfq+32*1] mova m2, [cfq+32*2] mova m3, [cfq+32*3] mova [cfq+32*0], m4 mova 
[cfq+32*1], m5 mova [cfq+32*2], m6 mova [cfq+32*3], m7 vpbroadcastd m10, [o(pd_1024)] call m(inv_txfm_add_16x4_8bpc).dct4 mova m11, [cfq-32*4] mova m9, [cfq+32*0] psubd m10, m0, m11 paddd m0, m11 paddd m11, m4, m9 psubd m4, m9 mova m8, [cfq-32*3] mova m9, [cfq+32*1] psrad m0, 11 psrad m11, 11 packssdw m0, m11 psubd m11, m1, m8 paddd m1, m8 paddd m8, m5, m9 psubd m5, m9 psrad m1, 11 psrad m8, 11 packssdw m1, m8 call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m1, [cfq-32*2] mova m9, [cfq+32*2] paddd m0, m2, m1 psubd m2, m1 paddd m1, m6, m9 psubd m6, m9 mova m8, [cfq-32*1] mova m9, [cfq+32*3] psrad m0, 11 psrad m1, 11 packssdw m0, m1 paddd m1, m3, m8 psubd m3, m8 paddd m8, m7, m9 psubd m7, m9 psrad m1, 11 psrad m8, 11 packssdw m1, m8 call m(inv_txfm_add_16x4_8bpc).write_16x2 REPX {psrad x, 11}, m3, m7, m2, m6 packssdw m0, m3, m7 packssdw m1, m2, m6 call m(inv_txfm_add_16x4_8bpc).write_16x2 REPX {psrad x, 11}, m11, m5, m10, m4 packssdw m0, m11, m5 packssdw m1, m10, m4 .pass2_end: call m(inv_txfm_add_16x4_8bpc).write_16x2 .pass2_end2: pxor m10, m10 .pass2_end3: REPX {mova [cfq+32*x], m10}, -4, -3, -2, -1 jmp m(inv_txfm_add_16x4_8bpc).pass2_end3 ALIGN function_align .transpose8x8_vpermq: REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 .transpose8x8: punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m8, m1 punpckhdq m8, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 ret ALIGN function_align .dct8: vpbroadcastd m4, [o(pw_18_75)] vpbroadcastd m6, [o(pw_89_50)] punpcklwd m8, m1, m5 punpckhwd m9, m1, m5 punpcklwd m11, m7, m3 punpckhwd m12, m7, m3 pmaddwd m1, m8, m6 ; a0 pmaddwd m5, m9, m6 pmaddwd m3, m8, m4 ; d0 pmaddwd m7, m9, 
m4 pmaddwd m0, m11, m4 ; a1 pmaddwd m4, m12 pmaddwd m2, m11, m6 ; -d1 pmaddwd m6, m12 paddd m0, m1 ; out0a paddd m4, m5 ; out0b psubd m3, m2 ; out3a psubd m7, m6 ; out3b vpbroadcastd m6, [o(pw_75_m89)] vpbroadcastd m2, [o(pw_50_18)] pmaddwd m1, m8, m6 ; b0 pmaddwd m8, m2 ; c0 pmaddwd m5, m11, m2 ; -b1 psubd m1, m5 ; out1a pmaddwd m5, m9, m6 pmaddwd m9, m2 pmaddwd m2, m12 psubd m5, m2 ; out1b pmaddwd m2, m11, m6 ; c1 pmaddwd m6, m12 paddd m2, m8 ; out2a paddd m6, m9 ; out2b ret .pass1_identity: mova xm0, [cfq-16*8] vinserti128 m0, [cfq+16*0], 1 mova xm1, [cfq-16*7] vinserti128 m1, [cfq+16*1], 1 mova xm2, [cfq-16*6] vinserti128 m2, [cfq+16*2], 1 mova xm3, [cfq-16*5] vinserti128 m3, [cfq+16*3], 1 mova xm4, [cfq-16*4] vinserti128 m4, [cfq+16*4], 1 mova xm5, [cfq-16*3] vinserti128 m5, [cfq+16*5], 1 mova xm6, [cfq-16*2] vinserti128 m6, [cfq+16*6], 1 mova xm7, [cfq-16*1] vinserti128 m7, [cfq+16*7], 1 REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose8x8 jmp tx2q .pass2_identity: vpbroadcastd m11, [o(pw_181x16)] test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm call .write_16x8_rnd jmp .pass2_end2 .hdpcm: call .write_16x8_rnd_hdpcm jmp .pass2_end2 .vdpcm: call .write_16x8_rnd_vdpcm jmp .pass2_end2 .pass1_dst_fast: call .dst16x2_fast psrad m0, 7 psrad m1, 7 packssdw m0, m1 mova [cfq+r8+32*3], m0 add r8, 32 jle .pass1_dst_fast jmp .pass1_dst_end .pass1_flipddt: lea r3, [o(flipddt16_mat+32*7)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt16_mat+32*7)] jmp .pass1_dst .pass1_flipadst: lea r3, [o(flipadst16_mat+32*7)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst16_mat+32*7)] .pass1_dst: pmulhrsw m11, m8, [cfq-32*4] pmulhrsw m1, m8, [cfq-32*3] pmulhrsw m2, m8, [cfq-32*2] pmulhrsw m3, m8, [cfq-32*1] vpbroadcastd m7, [o(pd_64)] REPX {vpermq x, x, q3120}, m11, m1, m2, m3 punpcklwd m10, m11, m1 ; 0 2 punpckhwd m11, m1 ; 1 3 punpcklwd m12, m2, m3 ; 4 6 punpckhwd m2, m3 ; 5 7 %if WIN64 push r8 %endif lea r8d, 
[eobq+(21<<16)] test eobb, 0x10 ; TX_CLASS_H cmovz eobd, r8d mov r8, -32*7 cmp eobd, 64<<16 jl .pass1_dst_fast pmulhrsw m4, m8, [cfq+32*0] pmulhrsw m5, m8, [cfq+32*1] pmulhrsw m6, m8, [cfq+32*2] pmulhrsw m0, m8, [cfq+32*3] REPX {vpermq x, x, q3120}, m4, m5, m6, m0 punpcklwd m3, m4, m5 ; 8 10 punpckhwd m4, m5 ; 9 11 punpcklwd m5, m6, m0 ; 12 14 punpckhwd m6, m0 ; 13 15 .pass1_dst_loop: call m(inv_txfm_add_8x16_8bpc).dst16x1 add r3, 32*8 mova [cfq+r8+32*3], m0 call m(inv_txfm_add_8x16_8bpc).dst16x1 sub r3, 32*9 paddd m1, m7, [cfq+r8+32*3] paddd m0, m7 psrad m1, 7 psrad m0, 7 packssdw m0, m1, m0 mova [cfq+r8+32*3], m0 add r8, 32 jle .pass1_dst_loop .pass1_dst_end: mova m7, [cfq-32*4] mova m6, [cfq-32*3] mova m5, [cfq-32*2] mova m4, [cfq-32*1] mova m3, [cfq+32*0] mova m2, [cfq+32*1] mova m1, [cfq+32*2] %if WIN64 pop r8 %endif jmp .pass1_end .pass2_flipddt: lea r3, [o(flipddt8_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt8_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst8_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst8_mat)] .pass2_dst: punpcklwd m10, m0, m2 ; 0 2 punpckhwd m0, m2 punpcklwd m2, m1, m3 ; 1 3 punpckhwd m1, m3 punpcklwd m3, m4, m6 ; 4 6 punpckhwd m4, m6 punpcklwd m11, m5, m7 ; 5 7 punpckhwd m5, m7 mov r5, -32*7 .pass2_dst_loop: call .dst8 psrad m6, 8 psrad m7, 8 packssdw m6, m7 mova [cfq+r5+32*3], m6 add r5, 32 jle .pass2_dst_loop vpbroadcastd m11, [o(pw_4096)] call .write_16x8_rnd2 jmp .pass2_end2 ALIGN function_align .dst8: vpbroadcastd m7, [r3+4*0] vpbroadcastd m9, [r3+4*1] pmaddwd m6, m10, m7 pmaddwd m7, m0 pmaddwd m8, m2, m9 pmaddwd m9, m1 paddd m6, m8 vpbroadcastd m8, [r3+4*2] paddd m7, m9 pmaddwd m9, m3, m8 pmaddwd m8, m4 paddd m6, m9 vpbroadcastd m9, [r3+4*3] add r3, 4*4 paddd m7, m8 pmaddwd m8, m11, m9 pmaddwd m9, m5 paddd m6, m8 paddd m7, m9 ret ALIGN function_align .dst16x2_fast: IDST16_1D_PACKED_2ROWS_FAST 8 sub r3, 32 paddd m0, m7 paddd m1, m7 ret ALIGN function_align .write_16x8_rnd: pmulhrsw m0, m11 pmulhrsw m1, m11 call 
m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, m2 pmulhrsw m1, m11, m3 call m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 call m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, m6 pmulhrsw m1, m11, m7 jmp m(inv_txfm_add_16x4_8bpc).write_16x2 ALIGN function_align .write_16x8_rnd2: pmulhrsw m0, m11, [cfq-32*4] pmulhrsw m1, m11, [cfq-32*3] call m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, [cfq-32*2] pmulhrsw m1, m11, [cfq-32*1] call m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, [cfq+32*0] pmulhrsw m1, m11, [cfq+32*1] call m(inv_txfm_add_16x4_8bpc).write_16x2 pmulhrsw m0, m11, [cfq+32*2] pmulhrsw m1, m11, [cfq+32*3] jmp m(inv_txfm_add_16x4_8bpc).write_16x2 ALIGN function_align .write_16x8_rnd_hdpcm: pmulhrsw m0, m11 pmulhrsw m1, m11 call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, m2 pmulhrsw m1, m11, m3 call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, m6 pmulhrsw m1, m11, m7 jmp m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm ALIGN function_align .write_16x8_rnd_hdpcm2: pmulhrsw m0, m11, [cfq-32*4] pmulhrsw m1, m11, [cfq-32*3] call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, [cfq-32*2] pmulhrsw m1, m11, [cfq-32*1] call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, [cfq+32*0] pmulhrsw m1, m11, [cfq+32*1] call m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm pmulhrsw m0, m11, [cfq+32*2] pmulhrsw m1, m11, [cfq+32*3] jmp m(inv_txfm_add_16x4_8bpc).write_16x2_hdpcm ALIGN function_align .write_16x8_rnd_vdpcm: REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 .write_16x8_vdpcm: paddw m1, m0 call m(inv_txfm_add_16x4_8bpc).write_16x2 paddw m0, m2, m1 paddw m1, m3, m0 call m(inv_txfm_add_16x4_8bpc).write_16x2 paddw m0, m4, m1 paddw m1, m5, m0 call m(inv_txfm_add_16x4_8bpc).write_16x2 paddw m0, m6, m1 paddw m1, m7, m0 jmp m(inv_txfm_add_16x4_8bpc).write_16x2 ; At the 
start of pass2 rows 0-7 are stored in m0-m7 and rows 8-15 in cfq INV_TXFM_FN 16, 16 add cfq, 32*4 PROLOGUE 0, 7, 14, 32*32 jmp tx1q .dconly: movd xm3, [o(pw_256)] or r4d, 8 jmp m(inv_txfm_add_16x4_8bpc).dconly2 .pass1_dct_fast: call m(inv_txfm_add_8x16_8bpc).pass1_fast_load vpbroadcastd m10, [o(pd_32)] call m(inv_txfm_add_8x16_8bpc).dct16_fast .pass1_dct2: IDCT_8X16_PASS1_END m0, m7, -4, 2, 6 IDCT_8X16_PASS1_END m1, m6, -3, 3, 6 IDCT_8X16_PASS1_END m2, m5, -1, 1, 6 IDCT_8X16_PASS1_END m3, m4, -2, 0, 6 jmp m(inv_txfm_add_16x8_8bpc).pass1_end .pass1_dct: lea r3d, [eobq+(28<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r3d sub eobd, 36<<16 jl .pass1_dct_fast mova m1, [cfq-32* 3] mova m3, [cfq-32* 1] mova m5, [cfq+32* 1] mova m7, [cfq+32* 3] mova m8, [cfq+32* 5] mova m9, [cfq+32* 7] mova m10, [cfq+32* 9] mova m11, [cfq+32*11] call .dct16 lea r3, [cfq+32*8] mova m1, [cfq-32*2] ; 2 mova m3, [cfq+32*2] ; 6 mova m5, [r3 -32*2] ; 10 mova m7, [r3 +32*2] ; 14 call m(inv_txfm_add_16x8_8bpc).dct8 mova [cfq+32*3], m0 mova [cfq+32*2], m1 mova [cfq-32*2], m2 mova [cfq-32*1], m3 mova m0, [cfq-32*4] ; 0 mova m1, [cfq+32*0] ; 4 mova m2, [r3 -32*4] ; 8 mova m3, [r3 +32*0] ; 12 mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq+32*1], m6 mova [cfq+32*0], m7 vpbroadcastd m10, [o(pd_32)] call m(inv_txfm_add_16x4_8bpc).dct4 mova m8, [cfq+32*3] ; dct8 b0 mova m9, [cfq-32*4] mova m10, [rsp+32*0] ; dct16 c0 mova m11, [rsp+32*1] psubd m12, m0, m8 ; a7 paddd m0, m8 ; a0 psubd m8, m4, m9 paddd m4, m9 psubd m9, m0, m10 ; out15a paddd m10, m0 ; out0a psubd m0, m4, m11 ; out15b paddd m4, m11 ; out0b mova m11, [rsp+32*8] ; dct16 b7 mova m13, [rsp+32*9] REPX {psrad x, 6}, m10, m4, m9, m0 packssdw m10, m4 packssdw m9, m0 psubd m4, m12, m11 ; out7a paddd m12, m11 ; out8a psubd m0, m8, m13 ; out7b paddd m8, m13 ; out8b mova m11, [cfq+32*2] ; dct8 b1 mova m13, [cfq-32*3] REPX {psrad x, 6}, m4, m0, m12, m8 packssdw m4, m0 packssdw m12, m8 mova [r3 +16*6], xm10 vextracti128 [cfq+16*6], m10, 1 mova [r3 
-16*7], xm9 vextracti128 [cfq-16*7], m9, 1 mova [r3 -16*8], xm4 vextracti128 [cfq-16*8], m4, 1 mova [r3 +16*7], xm12 vextracti128 [cfq+16*7], m12, 1 mova m4, [rsp+32*2] ; dct16 b1 mova m0, [rsp+32*3] psubd m10, m1, m11 ; a6 paddd m1, m11 ; a1 psubd m9, m5, m13 paddd m5, m13 psubd m8, m1, m4 ; out14a paddd m1, m4 ; out1a psubd m4, m5, m0 ; out14b paddd m5, m0 ; out1b mova m11, [rsp+32*10] ; dct16 b6 mova m0, [rsp+32*11] REPX {psrad x, 6}, m1, m5, m8, m4 packssdw m1, m5 packssdw m8, m4 psubd m5, m10, m11 ; out9a paddd m10, m11 ; out6a psubd m11, m9, m0 ; out9b paddd m0, m9 ; out6b REPX {psrad x, 6}, m10, m0, m5, m11 packssdw m10, m0 packssdw m5, m11 mova [r3 +16*4], xm1 vextracti128 [cfq+16*4], m1, 1 mova [r3 -16*5], xm8 vextracti128 [cfq-16*5], m8, 1 mova [r3 -16*6], xm10 vextracti128 [cfq-16*6], m10, 1 mova [r3 +16*5], xm5 vextracti128 [cfq+16*5], m5, 1 mova m0, [cfq-32*2] ; dct8 b2 mova m4, [cfq+32*1] mova m1, [rsp+32*4] ; dct16 b2 mova m5, [rsp+32*5] psubd m8, m2, m0 ; a5 paddd m2, m0 ; a2 psubd m0, m6, m4 paddd m6, m4 psubd m4, m2, m1 ; out13a paddd m2, m1 ; out2a psubd m1, m6, m5 ; out13b paddd m6, m5 ; out2b mova m5, [rsp+32*12] ; dct16 b5 mova m9, [rsp+32*13] REPX {psrad x, 6}, m2, m6, m4, m1 packssdw m2, m6 packssdw m4, m1 psubd m6, m8, m5 ; out5a paddd m8, m5 ; out10a psubd m5, m0, m9 ; out5b paddd m0, m9 ; out10b REPX {psrad x, 6}, m6, m5, m8, m0 packssdw m6, m5 packssdw m8, m0 mova [r3 +16*2], xm2 vextracti128 [cfq+16*2], m2, 1 mova [r3 -16*3], xm4 vextracti128 [cfq-16*3], m4, 1 mova [r3 -16*4], xm6 vextracti128 [cfq-16*4], m6, 1 mova [r3 +16*3], xm8 vextracti128 [cfq+16*3], m8, 1 mova m5, [cfq-32*1] ; dct8 b3 mova m1, [cfq+32*0] mova m4, [rsp+32*6] ; dct16 b3 mova m0, [rsp+32*7] psubd m9, m3, m5 ; a4 paddd m3, m5 ; a3 psubd m5, m7, m1 paddd m7, m1 psubd m1, m3, m4 ; out12a paddd m3, m4 ; out3a psubd m4, m7, m0 ; out12b paddd m7, m0 ; out3b mova m10, [rsp+32*14] ; dct16 b4 mova m0, [rsp+32*15] REPX {psrad x, 6}, m3, m7, m1, m4 packssdw m3, m7 packssdw m1, 
m4 psubd m7, m9, m10 ; out11a paddd m9, m10 ; out4a psubd m4, m5, m0 ; out11b paddd m5, m0 ; out4b REPX {psrad x, 6}, m9, m5, m7, m4 packssdw m9, m5 packssdw m7, m4 mova [r3 +16*0], xm3 vextracti128 [cfq+16*0], m3, 1 mova [r3 -16*1], xm1 vextracti128 [cfq-16*1], m1, 1 mova [r3 -16*2], xm9 vextracti128 [cfq-16*2], m9, 1 mova [r3 +16*1], xm7 vextracti128 [cfq+16*1], m7, 1 .pass1_end: mova m0, [cfq+32*3] mova m1, [cfq+32*2] mova m2, [cfq+32*1] mova m3, [cfq+32*0] mova m4, [cfq-32*1] mova m5, [cfq-32*2] mova m6, [cfq-32*3] mova m7, [cfq-32*4] call m(inv_txfm_add_16x8_8bpc).transpose8x8 mova [cfq-32*4], m0 mova [cfq-32*3], m1 mova [cfq-32*2], m2 mova [cfq-32*1], m3 mova [cfq+32*0], m4 mova [cfq+32*1], m5 mova [cfq+32*2], m6 mova [cfq+32*3], m7 mova m0, [r3+32*3] mova m1, [r3+32*2] mova m2, [r3+32*1] mova m3, [r3+32*0] mova m4, [r3-32*1] mova m5, [r3-32*2] mova m6, [r3-32*3] mova m7, [r3-32*4] call m(inv_txfm_add_16x8_8bpc).transpose8x8 jmp tx2q .pass2_dct: test eobd, eobd jl .pass2_dct_fast mova m8, [cfq-32*3] ; 9 mova m9, [cfq-32*1] ; 11 mova m10, [cfq+32*1] ; 13 mova m11, [cfq+32*3] ; 15 mova [cfq-32*3], m0 mova [cfq-32*1], m2 mova [cfq+32*1], m4 mova [cfq+32*3], m6 call .dct16 mova m1, [cfq-32*1] ; 2 mova m3, [cfq+32*3] ; 6 mova m5, [cfq-32*2] ; 10 mova m7, [cfq+32*2] ; 14 call m(inv_txfm_add_16x8_8bpc).dct8 mova [cfq+32*3], m0 mova [cfq+32*2], m1 mova [cfq-32*2], m2 mova [cfq-32*1], m3 mova m0, [cfq-32*3] ; 0 mova m1, [cfq+32*1] ; 4 mova m2, [cfq-32*4] ; 8 mova m3, [cfq+32*0] ; 12 mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq+32*1], m6 mova [cfq+32*0], m7 vpbroadcastd m10, [o(pd_4096)] call m(inv_txfm_add_16x4_8bpc).dct4 jmp .pass2_dct_end .pass2_dct_fast: call .dct16_fast mova [cfq+32*3], m1 mova [cfq+32*2], m2 mova [cfq-32*2], m3 mova [cfq-32*1], m8 mova [cfq-32*4], m5 mova [cfq-32*3], m6 mova [cfq+32*1], m7 mova [cfq+32*0], m9 vpbroadcastd m10, [o(pd_4096)] call .dct4_fast .pass2_dct_end: lea r6, [rsp+32*4] mova m11, [cfq+32*3] mova m12, [cfq-32*4] mova m9, 
[r6 -32*4] mova m13, [r6 -32*3] psubd m10, m0, m11 paddd m0, m11 psubd m11, m4, m12 paddd m4, m12 psubd m12, m0, m9 ; out15a paddd m0, m9 ; out0a psubd m9, m4, m13 ; out15b paddd m4, m13 ; out0b mova m8, [cfq+32*2] mova m13, [cfq-32*3] REPX {psrad x, 13}, m0, m4, m12, m9 packssdw m0, m4 mova m4, [r6 -32*2] packssdw m12, m9 mova m9, [r6 -32*1] mova [cfq-32*4], m12 psubd m12, m1, m8 paddd m1, m8 paddd m8, m5, m13 psubd m5, m13 psubd m13, m1, m4 ; out14a paddd m1, m4 ; out1a psubd m4, m8, m9 ; out14b paddd m8, m9 ; out1b REPX {psrad x, 13}, m1, m8, m13, m4 packssdw m1, m8 packssdw m13, m4 call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m4, [cfq-32*2] mova m1, [cfq+32*1] mova m9, [r6 +32*0] mova m8, [r6 +32*1] mova [cfq-32*3], m13 paddd m0, m2, m4 psubd m2, m4 psubd m4, m6, m1 paddd m1, m6 psubd m6, m0, m9 ; out13a paddd m0, m9 ; out2a psubd m9, m1, m8 ; out13b paddd m1, m8 ; out2b mova m8, [cfq-32*1] mova m13, [cfq+32*0] REPX {psrad x, 13}, m0, m1, m6, m9 packssdw m0, m1 mova m1, [r6 +32*2] packssdw m6, m9 mova m9, [r6 +32*3] mova [cfq-32*2], m6 add r6, 32*8 psubd m6, m3, m8 paddd m8, m3 paddd m3, m7, m13 psubd m7, m13 psubd m13, m8, m1 ; out12a paddd m1, m8 ; out3a paddd m8, m3, m9 ; out3b psubd m3, m9 ; out12b REPX {psrad x, 13}, m1, m8, m13, m3 packssdw m1, m8 packssdw m13, m3 call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m0, [r6+32*2] mova m1, [r6+32*3] mova [cfq-32*1], m13 psubd m13, m6, m0 ; out11a paddd m0, m6 ; out4a psubd m6, m7, m1 ; out11b paddd m1, m7 ; out4b mova m7, [r6+32*0] mova m3, [r6+32*1] REPX {psrad x, 13}, m0, m1 packssdw m0, m1 psubd m1, m2, m7 ; out5a paddd m7, m2 ; out10a paddd m2, m4, m3 ; out10b psubd m4, m3 ; out5b REPX {psrad x, 13}, m1, m4 packssdw m1, m4 call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m1, [r6-32*2] mova m3, [r6-32*1] paddd m0, m12, m1 ; out6a psubd m12, m1 ; out9a paddd m1, m5, m3 ; out6b psubd m5, m3 ; out9b mova m3, [r6-32*4] mova m4, [r6-32*3] REPX {psrad x, 13}, m0, m1 packssdw m0, m1 psubd m1, m10, m3 ; out7a paddd 
m10, m3, m10 ; out8a psubd m3, m11, m4 ; out7b paddd m4, m11 ; out8b REPX {psrad x, 13}, m1, m3 packssdw m1, m3 call m(inv_txfm_add_16x4_8bpc).write_16x2 REPX {psrad x, 13}, m10, m4, m12, m5 packssdw m0, m10, m4 packssdw m1, m12, m5 call m(inv_txfm_add_16x4_8bpc).write_16x2 REPX {psrad x, 13}, m7, m2, m13, m6 packssdw m0, m7, m2 packssdw m1, m13, m6 call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m0, [cfq-32*1] mova m1, [cfq-32*2] call m(inv_txfm_add_16x4_8bpc).write_16x2 mova m0, [cfq-32*3] mova m1, [cfq-32*4] .pass2_end: call m(inv_txfm_add_16x4_8bpc).write_16x2 .pass2_end2: mov r6, -32*16 call .zero_cf RET ALIGN function_align .zero_cf: sub cfq, r6 pxor m0, m0 .zero_cf_loop: mova [cfq+r6-32*4], m0 mova [cfq+r6-32*3], m0 mova [cfq+r6-32*2], m0 mova [cfq+r6-32*1], m0 add r6, 32*4 jl .zero_cf_loop ret ALIGN function_align .dct16: punpcklwd m0, m1, m5 punpckhwd m1, m5 punpcklwd m2, m3, m7 punpckhwd m3, m7 mov r3d, 16*3 punpcklwd m12, m10, m8 punpckhwd m10, m8 punpcklwd m13, m11, m9 punpckhwd m11, m9 .dct16_loop: vpbroadcastd m7, [o(dct16_mat)+r3+4*0] vpbroadcastd m5, [o(dct16_mat)+r3+4*1] pmaddwd m8, m0, m7 pmaddwd m9, m1, m7 pmaddwd m6, m13, m7 pmaddwd m7, m11 pmaddwd m4, m2, m5 paddd m8, m4 pmaddwd m4, m3, m5 paddd m9, m4 pmaddwd m4, m12, m5 pmaddwd m5, m10 psubd m6, m4 vpbroadcastd m4, [o(dct16_mat)+r3+4*2] psubd m7, m5 pmaddwd m5, m12, m4 paddd m8, m5 pmaddwd m5, m10, m4 paddd m9, m5 pmaddwd m5, m2, m4 pmaddwd m4, m3 paddd m6, m5 vpbroadcastd m5, [o(dct16_mat)+r3+4*3] paddd m7, m4 pmaddwd m4, m13, m5 paddd m8, m4 pmaddwd m4, m11, m5 paddd m9, m4 pmaddwd m4, m0, m5 pmaddwd m5, m1 psubd m6, m4 psubd m7, m5 mova [rsp+gprsize+r3*4+32*0], m8 mova [rsp+gprsize+r3*4+32*1], m9 mova [rsp+gprsize+r3*4+32*8], m6 mova [rsp+gprsize+r3*4+32*9], m7 sub r3d, 16 jge .dct16_loop ret ALIGN function_align .dct16_fast: punpcklwd m8, m1, m5 punpckhwd m9, m1, m5 mov r3d, 16*3 punpcklwd m11, m3, m7 punpckhwd m12, m3, m7 .dct16_fast_loop: vpbroadcastd m3, [o(dct16_mat)+r3+4*0] 
vpbroadcastd m7, [o(dct16_mat)+r3+4*1] pmaddwd m1, m8, m3 pmaddwd m3, m9 pmaddwd m5, m11, m7 pmaddwd m7, m12 paddd m1, m5 vpbroadcastd m5, [o(dct16_mat)+r3+4*2] paddd m3, m7 vpbroadcastd m7, [o(dct16_mat)+r3+4*3] mova [rsp+gprsize+r3*4+32*0], m1 mova [rsp+gprsize+r3*4+32*1], m3 pmaddwd m1, m11, m5 pmaddwd m5, m12 pmaddwd m3, m8, m7 pmaddwd m7, m9 psubd m1, m3 psubd m5, m7 mova [rsp+gprsize+r3*4+32*8], m1 mova [rsp+gprsize+r3*4+32*9], m5 sub r3d, 16 jge .dct16_fast_loop ; dct8_fast: punpcklwd m8, m2, m6 punpckhwd m9, m2, m6 vpbroadcastd m5, [o(pw_89_75)] vpbroadcastd m6, [o(pw_75_m18)] vpbroadcastd m7, [o(pw_50_m89)] vpbroadcastd m11, [o(pw_18_m50)] pmaddwd m1, m8, m5 ; out0a pmaddwd m5, m9 ; out0b pmaddwd m2, m8, m6 ; out1a pmaddwd m6, m9 ; out1b pmaddwd m3, m8, m7 ; out2a pmaddwd m7, m9 ; out2b pmaddwd m8, m11 ; out3a pmaddwd m9, m11 ; out3b ret ALIGN function_align .dct4_fast: vpbroadcastd m8, [o(pd_64)] punpcklwd m2, m0, m4 punpckhwd m3, m0, m4 vpbroadcastd m4, [o(pw_0_83)] vpbroadcastd m5, [o(pw_0_35)] pmaddwd m6, m2, m8 ; a pmaddwd m8, m3 pmaddwd m0, m2, m4 ; b0 pmaddwd m4, m3 pmaddwd m1, m2, m5 ; b1 pmaddwd m5, m3 paddd m6, m10 paddd m8, m10 psubd m3, m6, m0 ; out3a paddd m0, m6 ; out0a psubd m7, m8, m4 ; out3b paddd m4, m8 ; out0b psubd m2, m6, m1 ; out2a paddd m1, m6 ; out1a psubd m6, m8, m5 ; out2b paddd m5, m8 ; out1b ret .pass1_identity_fast: mova xm0, [cfq-16*8] vinserti128 m0, [r3 -16*8], 1 mova xm1, [cfq-16*6] vinserti128 m1, [r3 -16*6], 1 mova xm2, [cfq-16*4] vinserti128 m2, [r3 -16*4], 1 mova xm3, [cfq-16*2] vinserti128 m3, [r3 -16*2], 1 mova xm4, [cfq+16*0] vinserti128 m4, [r3 +16*0], 1 mova xm5, [cfq+16*2] vinserti128 m5, [r3 +16*2], 1 mova xm6, [cfq+16*4] vinserti128 m6, [r3 +16*4], 1 mova xm7, [cfq+16*6] vinserti128 m7, [r3 +16*6], 1 jmp .pass1_identity_end .pass1_identity: lea r3d, [eobq-(128<<16)] test eobb, 0x10 ; TX_CLASS_V cmovnz eobd, r3d lea r3, [cfq+32*8] test eobd, eobd jl .pass1_identity_fast mova xm0, [cfq-16*7] vinserti128 m0, [r3 
-16*7], 1 mova xm1, [cfq-16*5] vinserti128 m1, [r3 -16*5], 1 mova xm2, [cfq-16*3] vinserti128 m2, [r3 -16*3], 1 mova xm3, [cfq-16*1] vinserti128 m3, [r3 -16*1], 1 mova xm4, [cfq+16*1] vinserti128 m4, [r3 +16*1], 1 mova xm5, [cfq+16*3] vinserti128 m5, [r3 +16*3], 1 mova xm6, [cfq+16*5] vinserti128 m6, [r3 +16*5], 1 mova xm7, [cfq+16*7] vinserti128 m7, [r3 +16*7], 1 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_16x8_8bpc).transpose8x8 paddsw m8, m0, m0 mova xm0, [cfq-16*8] vinserti128 m0, [r3 -16*8], 1 paddsw m9, m1, m1 mova xm1, [cfq-16*6] vinserti128 m1, [r3 -16*6], 1 paddsw m10, m2, m2 mova xm2, [cfq-16*4] vinserti128 m2, [r3 -16*4], 1 paddsw m11, m3, m3 mova xm3, [cfq-16*2] vinserti128 m3, [r3 -16*2], 1 mova [cfq-32*4], m8 mova [cfq-32*3], m9 mova [cfq-32*2], m10 mova [cfq-32*1], m11 paddsw m8, m4, m4 mova xm4, [cfq+16*0] vinserti128 m4, [r3 +16*0], 1 paddsw m9, m5, m5 mova xm5, [cfq+16*2] vinserti128 m5, [r3 +16*2], 1 paddsw m10, m6, m6 mova xm6, [cfq+16*4] vinserti128 m6, [r3 +16*4], 1 paddsw m11, m7, m7 mova xm7, [cfq+16*6] vinserti128 m7, [r3 +16*6], 1 mova [cfq+32*0], m8 mova [cfq+32*1], m9 mova [cfq+32*2], m10 mova [cfq+32*3], m11 .pass1_identity_end: REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_16x8_8bpc).transpose8x8 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2_identity: vpbroadcastd m11, [o(pw_1024)] test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm call m(inv_txfm_add_16x8_8bpc).write_16x8_rnd call m(inv_txfm_add_16x8_8bpc).write_16x8_rnd2 jmp .pass2_end2 .hdpcm: call m(inv_txfm_add_16x8_8bpc).write_16x8_rnd_hdpcm call m(inv_txfm_add_16x8_8bpc).write_16x8_rnd_hdpcm2 jmp .pass2_end2 .vdpcm: call m(inv_txfm_add_16x8_8bpc).write_16x8_rnd_vdpcm pmulhrsw m0, m11, [cfq-32*4] paddw m0, m1 pmulhrsw m1, m11, [cfq-32*3] pmulhrsw m2, m11, [cfq-32*2] pmulhrsw m3, m11, [cfq-32*1] pmulhrsw m4, m11, [cfq+32*0] pmulhrsw m5, m11, [cfq+32*1] pmulhrsw m6, m11, [cfq+32*2] pmulhrsw m7, m11, 
[cfq+32*3] call m(inv_txfm_add_16x8_8bpc).write_16x8_vdpcm jmp .pass2_end2 .pass1_dst_fast: call m(inv_txfm_add_8x16_8bpc).pass1_fast_load punpcklwd m10, m0, m1 ; 0 2 vpbroadcastd m7, [o(pd_32)] punpckhwd m11, m0, m1 ; 1 3 mov r8, -32*7 punpcklwd m12, m2, m3 ; 4 6 sub r3, r8 punpckhwd m2, m3 ; 5 7 .pass1_dst_fast_loop: call m(inv_txfm_add_16x8_8bpc).dst16x2_fast psrad m0, 6 psrad m1, 6 packssdw m0, m1 mova [cfq+r8+32*3], m0 add r8, 32 jle .pass1_dst_fast_loop jmp m(inv_txfm_add_16x8_8bpc).pass1_dst_end .pass1_flipddt: lea r3, [o(flipddt16_mat)] jmp .pass1_dst .pass1_ddt: lea r3, [o(ddt16_mat)] jmp .pass1_dst .pass1_flipadst: lea r3, [o(flipadst16_mat)] jmp .pass1_dst .pass1_adst: lea r3, [o(adst16_mat)] .pass1_dst: %if WIN64 push r8 %define tmp rsp+8 %else %define tmp rsp %endif lea r8d, [eobq+(28<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r8d sub eobd, 36<<16 jl .pass1_dst_fast mova m0, [cfq-32*4] mova m1, [cfq-32*3] mova m2, [cfq-32*2] mova m3, [cfq-32*1] mova m4, [cfq+32*0] mova m5, [cfq+32*1] mova m6, [cfq+32*2] mova m7, [cfq+32*3] call .dst16x1 mov r8d, 32*28 mova [tmp+r8+32*3], m1 mova [tmp+r8+32*2], m7 .pass1_dst_loop1: call .dst16x1b mova [tmp+r8+32*1], m1 mova [tmp+r8+32*0], m7 sub r8d, 32*2 jge .pass1_dst_loop1 lea r8, [cfq+32*8] mova m0, [r8-32*4] mova m1, [r8-32*3] mova m2, [r8-32*2] mova m3, [r8-32*1] mova m4, [r8+32*0] mova m5, [r8+32*1] mova m6, [r8+32*2] mova m7, [r8+32*3] add r3, 4*4-32*16 call .dst16x1 vpbroadcastd m13, [o(pd_32)] mov r8d, 32*7 jmp .pass1_dst_loop2a_start .pass1_dst_loop2a: ; rows 0-7 call .dst16x1b .pass1_dst_loop2a_start: paddd m8, m13, [tmp+r8*2+32*17] paddd m9, m13, [tmp+r8*2+32*16] paddd m1, m8 paddd m7, m9 psrad m1, 6 psrad m7, 6 packssdw m1, m7 mova [cfq+r8+16*8], xm1 ; storing xmm lanes separately avoids vextracti128 [cfq+r8-16*8], m1, 1 ; having to transpose them later sub r8d, 32 jge .pass1_dst_loop2a mov r8d, 32*7 .pass1_dst_loop2b: ; rows 8-15 call .dst16x1b paddd m8, m13, [tmp+r8*2+32*1] paddd m9, m13, 
[tmp+r8*2+32*0] paddd m1, m8 paddd m7, m9 psrad m1, 6 psrad m7, 6 packssdw m1, m7 mova [cfq+r8+16*9], xm1 vextracti128 [cfq+r8-16*7], m1, 1 sub r8d, 32 jge .pass1_dst_loop2b lea r3, [cfq+32*8] %if WIN64 pop r8 %endif jmp .pass1_end .pass2_dst_fast_loop: call .dst16x1b .pass2_dst_fast: psrad m1, 10 psrad m7, 10 packssdw m0, m1, m7 call .dst16x1b psrad m1, 10 psrad m7, 10 packssdw m1, m7 pmulhrsw m0, m13 pmulhrsw m1, m13 call m(inv_txfm_add_16x4_8bpc).write_16x2 sub r5d, 32*4 jge .pass2_dst_fast_loop jmp .pass2_end2 .pass2_flipddt: lea r3, [o(flipddt16_mat)] jmp .pass2_dst .pass2_ddt: lea r3, [o(ddt16_mat)] jmp .pass2_dst .pass2_flipadst: lea r3, [o(flipadst16_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst16_mat)] .pass2_dst: call .dst16x1 vpbroadcastd m13, [o(pw_4096)] mov r5d, 32*28 test eobd, eobd jl .pass2_dst_fast mova [rsp+r5+32*3], m1 mova [rsp+r5+32*2], m7 .pass2_loop1: call .dst16x1b mova [rsp+r5+32*1], m1 mova [rsp+r5+32*0], m7 sub r5d, 32*2 jge .pass2_loop1 mova m0, [cfq-32*4] mova m1, [cfq-32*3] mova m2, [cfq-32*2] mova m3, [cfq-32*1] mova m4, [cfq+32*0] mova m5, [cfq+32*1] mova m6, [cfq+32*2] mova m7, [cfq+32*3] add r3, 4*4-32*16 call .dst16x1 mov r5d, 32*28 jmp .pass2_dst_loop2_start .pass2_dst_loop2: call .dst16x1b .pass2_dst_loop2_start: paddd m1, [rsp+r5+32*3] paddd m7, [rsp+r5+32*2] psrad m1, 10 psrad m7, 10 packssdw m0, m1, m7 call .dst16x1b paddd m1, [rsp+r5+32*1] paddd m7, [rsp+r5+32*0] psrad m1, 10 psrad m7, 10 packssdw m1, m7 pmulhrsw m0, m13 pmulhrsw m1, m13 call m(inv_txfm_add_16x4_8bpc).write_16x2 sub r5d, 32*4 jge .pass2_dst_loop2 jmp .pass2_end2 ALIGN function_align .dst16x1: punpcklwd m10, m0, m2 punpckhwd m11, m0, m2 punpcklwd m2, m1, m3 punpckhwd m12, m1, m3 punpcklwd m3, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 .dst16x1b: vpbroadcastd m7, [r3+4*0] vpbroadcastd m8, [r3+4*1] pmaddwd m1, m10, m7 pmaddwd m7, m11 pmaddwd m9, m2, m8 pmaddwd m8, m12 paddd m1, m9 vpbroadcastd m9, [r3+4*2] paddd m7, m8 pmaddwd m8, m3, m9 
pmaddwd m9, m4 paddd m1, m8 vpbroadcastd m8, [r3+4*3] add r3, 32 paddd m7, m9 pmaddwd m9, m6, m8 pmaddwd m8, m5 paddd m1, m9 paddd m7, m8 ret ALIGN function_align .dst16x2_fast: vpbroadcastd m7, [r3+4*0] vpbroadcastd m8, [r3+4*1] pmaddwd m1, m10, m7 pmaddwd m7, m11 pmaddwd m9, m2, m8 pmaddwd m8, m12 paddd m1, m9 vpbroadcastd m9, [r3+4*2] paddd m7, m8 pmaddwd m8, m3, m9 pmaddwd m9, m4 paddd m1, m8 vpbroadcastd m8, [r3+4*3] add r3, 32 paddd m7, m9 pmaddwd m9, m6, m8 pmaddwd m8, m5 paddd m1, m9 paddd m7, m8 ret INV_TXFM_FN 4, 32 add cfq, 32*4 WIN64_SPILL_XMM 13 vpbroadcastd m12, [o(pw_181x128)] pmulhrsw m0, m12, [cfq-32*4] pmulhrsw m1, m12, [cfq-32*2] pmulhrsw m2, m12, [cfq+32*0] pmulhrsw m3, m12, [cfq+32*2] jmp tx1q .dconly: vpbroadcastw xm3, [cfq] vpbroadcastd xm2, [o(pw_181x128)] or r4d, 8 pmulhrsw xm3, xm2 vpbroadcastd xm2, [o(pw_33)] paddw xm3, xm2 psraw xm3, 6 jmp m(inv_txfm_add_4x4_8bpc).dconly3 .pass1_dct: vpbroadcastd m10, [o(pd_64)] call m(inv_txfm_add_16x4_8bpc).dct4 lea r3d, [eobq+(42<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r3d call m(inv_txfm_add_8x8_8bpc).shift7_pack sub eobd, 58<<16 jl .pass1_end3 mova [cfq-32*4], m0 mova [cfq-32*2], m1 mova [cfq+32*0], m2 mova [cfq+32*2], m3 pmulhrsw m0, m12, [cfq-32*3] pmulhrsw m1, m12, [cfq-32*1] pmulhrsw m2, m12, [cfq+32*1] pmulhrsw m3, m12, [cfq+32*3] call m(inv_txfm_add_16x4_8bpc).dct4 .pass1_end: REPX {psrad x, 7}, m0, m4, m1, m5 packssdw m4, m0, m4 mova m0, [cfq-32*4] packssdw m5, m1, m5 mova m1, [cfq-32*2] REPX {psrad x, 7}, m2, m6, m3, m7 packssdw m6, m2, m6 mova m2, [cfq+32*0] packssdw m7, m3, m7 mova m3, [cfq+32*2] .pass1_end2: punpckhwd m8, m6, m7 punpcklwd m6, m7 punpckhwd m7, m4, m5 punpcklwd m4, m5 punpckhdq m5, m4, m6 ; 18 19 26 27 punpckldq m4, m6 ; 16 17 24 25 punpckldq m6, m7, m8 ; 20 21 28 29 punpckhdq m7, m8 ; 22 23 30 31 .pass1_end3: punpckhwd m8, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ; 2 3 10 11 punpckldq m0, m2 ; 0 1 8 9 punpckldq m2, m3, m8 ; 
4 5 12 13 punpckhdq m3, m8 ; 6 7 14 15 test eobb, 0x20 jnz .pass2_identity .pass2_dct: mov r5, rsp and rsp, ~31 sub rsp, 32*8 punpckhwd m8, m0, m2 ; 1 5 9 13 punpckhwd m9, m1, m3 ; 3 7 11 15 mova [cfq-32*4], m0 mova [cfq+32*0], m1 mova [cfq-32*3], m2 mova [cfq+32*1], m3 test eobd, eobd jl .pass2_dct_fast mova [cfq-32*1], m4 mova [cfq+32*2], m5 mova [cfq-32*2], m6 mova [cfq+32*3], m7 punpckhwd m6, m4 ; 21 17 29 25 punpckhwd m7, m5 ; 23 19 31 27 vextracti128 xm2, m8, 1 vextracti128 xm3, m9, 1 vextracti128 xm4, m6, 1 vextracti128 xm5, m7, 1 %if WIN64 movaps [r5], xm13 %endif call .dct32 mova xm0, [cfq+16*0] punpcklwd xm0, [cfq+16*1] ; 2 10 mova xm1, [cfq+16*2] punpcklwd xm1, [cfq+16*3] ; 6 14 mova xm2, [cfq+16*5] punpcklwd xm2, [cfq+16*4] ; 26 18 mova xm3, [cfq+16*7] punpcklwd xm3, [cfq+16*6] ; 30 22 vpbroadcastd m10, [o(pd_1024)] call m(inv_txfm_add_4x16_8bpc).dct16d add r3, 4*8 mova [cfq+16*0], xm8 ; b0 mova [cfq+16*1], xm9 ; b1 mova [cfq+16*6], xm12 ; b7 mova [cfq+16*7], xm11 ; b6 call m(inv_txfm_add_4x16_8bpc).dct16d mova xm7, [cfq-16*6] punpcklwd xm7, [cfq-16*4] ; 4 20 mova xm2, [cfq-16*3] punpcklwd xm2, [cfq-16*5] ; 28 12 mova xm0, [cfq-16*8] punpcklwd xm0, [cfq-16*2] ; 0 16 mova xm1, [cfq-16*1] punpcklwd xm1, [cfq-16*7] ; 24 8 mova [cfq+16*2], xm9 ; b3 mova [cfq+16*3], xm8 ; b2 mova [cfq+16*4], xm11 ; b4 mova [cfq+16*5], xm12 ; b5 call m(inv_txfm_add_4x8_8bpc).dct8b %if WIN64 movaps xm13, [r5] %endif jmp .pass2_dct2 .pass2_dct_fast: vpbroadcastd m10, [o(pd_1024)] vextracti128 xm2, m8, 1 vextracti128 xm3, m9, 1 call .dct32_fast mova xm9, [cfq-16*8] punpcklwd xm9, [cfq-16*7] ; 0 8 mova xm0, [cfq+16*0] punpcklwd xm0, [cfq+16*1] ; 2 10 mova xm8, [cfq-16*6] punpcklwd xm8, [cfq-16*5] ; 4 12 mova xm1, [cfq+16*2] punpcklwd xm1, [cfq+16*3] ; 6 14 call m(inv_txfm_add_4x16_8bpc).dct16_fast3 .pass2_dct2: mova m5, [cfq+32*0] ; b0 b1 mova m7, [cfq+32*1] ; b3 b2 mova m6, [rsp+32*0] ; c0 c1 mova m8, [rsp+32*1] ; c3 c2 psubd m4, m0, m5 ; a15 a14 paddd m0, m5 ; a0 a1 psubd m5, 
m1, m7 ; a12 a13 paddd m1, m7 ; a3 a2 psubd m12, m0, m6 ; out31 out30 paddd m0, m6 ; out0 out1 psubd m11, m1, m8 ; out28 out29 paddd m1, m8 ; out3 out2 psrad m0, 11 psrad m1, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct_start mova m1, [cfq+32*2] ; b4 b5 mova m7, [cfq+32*3] ; b7 b6 mova m6, [rsp+32*2] ; c4 c5 mova m8, [rsp+32*3] ; c7 c6 paddd m0, m2, m1 ; a4 a5 psubd m2, m1 ; a11 a10 paddd m1, m3, m7 ; a7 a6 psubd m3, m7 ; a8 a9 psubd m7, m0, m6 ; out27 out26 paddd m0, m6 ; out4 out5 psubd m6, m1, m8 ; out24 out25 paddd m1, m8 ; out7 out6 psrad m0, 11 psrad m1, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct mova m1, [rsp+32*4] ; c8 c9 mova m8, [rsp+32*5] ; c11 c10 paddd m0, m3, m1 ; out8 out9 psubd m3, m1 ; out23 out22 paddd m1, m2, m8 ; out11 out10 psubd m2, m8 ; out20 out21 psrad m0, 11 psrad m1, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct mova m1, [rsp+32*6] ; c12 c13 mova m8, [rsp+32*7] ; c15 c14 paddd m0, m5, m1 ; out12 out13 psubd m5, m1 ; out19 out18 paddd m1, m4, m8 ; out15 out14 psubd m4, m8 ; out16 out17 psrad m0, 11 psrad m1, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m4, 11 psrad m1, m5, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m2, 11 psrad m1, m3, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m6, 11 psrad m1, m7, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct psrad m0, m11, 11 psrad m1, m12, 11 call m(inv_txfm_add_4x8_8bpc).write_4x4_dct mov rsp, r5 jmp m(inv_txfm_add_16x8_8bpc).pass2_end2 ALIGN function_align .dct32: WRAP_XMM IDCT32_1D_PACKED .dct32_fast: WRAP_XMM IDCT32_1D_PACKED_FAST .pass1_identity: lea r3d, [eobq-(64<<16)] test eobb, 0x10 ; TX_CLASS_V cmovnz eobd, r3d test eobd, eobd jl .pass1_end3 pmulhrsw m4, m12, [cfq-32*3] pmulhrsw m5, m12, [cfq-32*1] pmulhrsw m6, m12, [cfq+32*1] pmulhrsw m7, m12, [cfq+32*3] jmp .pass1_end2 .pass2_identity: vpbroadcastd m12, [o(pw_181x32)] REPX {pmulhrsw x, m12}, m0, m1, m2, m3 lea r6, [dsq*3] pxor m10, m10 test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm 
; Continuation of .pass2_identity (4x32), plain write-out path.
; m0-m3 were already scaled by m12 before the 0x100/0x200 flag tests that
; branch to .hdpcm/.vdpcm; this is the fall-through when neither bit is set.
    call             m(inv_txfm_add_4x16_8bpc).write_4x16
    ; A negative eobd is what the first pass leaves behind when it branched
    ; straight to .pass1_end3, i.e. m4-m7 were never produced; skip them.
    test             eobd, eobd
    jl               m(inv_txfm_add_16x8_8bpc).pass2_end2
    ; Scale the second group of registers and write it out the same way.
    pmulhrsw         m0, m12, m4
    pmulhrsw         m1, m12, m5
    pmulhrsw         m2, m12, m6
    pmulhrsw         m3, m12, m7
    call             m(inv_txfm_add_4x16_8bpc).write_4x16
    jmp              m(inv_txfm_add_16x8_8bpc).pass2_end2
; Entered when bit 0x100 of eobd is set.
; NOTE(review): unlike the plain path above, m4-m7 are consumed without
; testing the sign of eobd - confirm this path is never reached after the
; first pass took its .pass1_end3 shortcut.
.hdpcm:
    call             m(inv_txfm_add_4x16_8bpc).write_4x16_hdpcm
    pmulhrsw         m0, m12, m4
    pmulhrsw         m1, m12, m5
    pmulhrsw         m2, m12, m6
    pmulhrsw         m3, m12, m7
    call             m(inv_txfm_add_4x16_8bpc).write_4x16_hdpcm
    jmp              m(inv_txfm_add_16x8_8bpc).pass2_end2
; Entered when bit 0x200 of eobd is set (and 0x100 is not).
; NOTE(review): same unconditional use of m4-m7 as .hdpcm - confirm.
.vdpcm:
    call             m(inv_txfm_add_4x16_8bpc).write_4x16_vdpcm
    pmulhrsw         m4, m12
    pmulhrsw         m1, m12, m5
    ; Grab the upper 8 bytes of xm3 before m3 is overwritten below, then add
    ; them (saturating) to the scaled m4 to form the new m0.
    ; Presumably this carries the last row of the first group into the
    ; second one - confirm against write_4x16_vdpcm.
    ; NOTE(review): the 8x32 .vdpcm uses paddw for the equivalent carry;
    ; paddsw here may be intentional - confirm.
    psrldq           xm0, xm3, 8
    pmulhrsw         m2, m12, m6
    pmulhrsw         m3, m12, m7
    paddsw           m0, m4
    call             m(inv_txfm_add_4x16_8bpc).write_4x16_vdpcm
    jmp              m(inv_txfm_add_16x8_8bpc).pass2_end2
; First-pass entry points that differ only in the table address put in r3
; (presumably read by dst4 - its body is not in this part of the file).
.pass1_flipadst:
    lea              r3, [o(flipadst4_mat)]
    jmp .pass1_dst
.pass1_adst:
    lea              r3, [o(adst4_mat)]
.pass1_dst:
    call             m(inv_txfm_add_16x4_8bpc).dst4
    vpbroadcastd     m8, [o(pd_64)]
    ; r8 is only a scratch register for the conditional add below; it is
    ; saved/restored around that use on WIN64 builds.
%if WIN64
    push             r8
%endif
    ; If bit 0x10 of eobb is set, add 42<<16 to eobd. Combined with the
    ; "sub eobd, 58<<16" further down, the threshold for processing the
    ; second half becomes (58-42)<<16 instead of 58<<16 in that case.
    lea              r8d, [eobq+(42<<16)]
    test             eobb, 0x10 ; TX_CLASS_H
    cmovnz           eobd, r8d
%if WIN64
    pop              r8
%endif
    ; Add pd_64 to all eight dword accumulators (presumably the rounding
    ; bias for the shift done inside shift7_pack - confirm).
    REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
    call             m(inv_txfm_add_8x8_8bpc).shift7_pack
    sub              eobd, 58<<16
    jl .pass1_end3
    ; Second half: park the first results in the even 32-byte slots around
    ; cfq, then load the odd slots scaled by m12 and transform them as well.
    mova             [cfq-32*4], m0
    mova             [cfq-32*2], m1
    mova             [cfq+32*0], m2
    mova             [cfq+32*2], m3
    pmulhrsw         m0, m12, [cfq-32*3]
    pmulhrsw         m1, m12, [cfq-32*1]
    pmulhrsw         m2, m12, [cfq+32*1]
    pmulhrsw         m3, m12, [cfq+32*3]
    call             m(inv_txfm_add_16x4_8bpc).dst4
    vpbroadcastd     m8, [o(pd_64)]
    REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
    jmp .pass1_end

; 32x4 inverse transform. cfq is advanced by 32*4 bytes up front; the code
; below then addresses coefficients with displacements from -32*4 to +32*3.
INV_TXFM_FN 32, 4
    add              cfq, 32*4
    WIN64_SPILL_XMM 13
    vpbroadcastd     m11, [o(pw_181x128)]
    ; Bit 0 of tx1b selects the identity first pass; otherwise fall through.
    test             tx1b, 0x01
    jnz .pass1_identity
.pass1_dct:
    ; Load the first four 32-byte coefficient rows, scaled by m11.
    pmulhrsw         m0, m11, [cfq-32*4]
    pmulhrsw         m1, m11, [cfq-32*3]
    pmulhrsw         m2, m11, [cfq-32*2]
    pmulhrsw         m3, m11, [cfq-32*1]
%if WIN64
    push             r8
%endif
    ; Keep the incoming rsp in r8, then realign rsp to 32 bytes and (a few
    ; instructions below) reserve 32*8 bytes of scratch.
    mov              r8, rsp
    and              rsp, ~31
    punpckhwd        m8, m0, m1 ; 1 5 3 7
    ; If bit 0x10 of eobb is clear, add 3<<16 to eobd before the threshold
    ; comparison against 64<<16 that selects the fast path.
    lea              r3d, [eobq+(3<<16)]
    test             eobb, 0x10 ; TX_CLASS_H
    cmovz            eobd, r3d
    sub              rsp, 32*8
    vextracti128     xm9, m8, 1
    cmp              eobd, 64<<16
    jl .pass1_dct_fast
    pmulhrsw         m12, m11, [cfq+32*0]
pmulhrsw m6, m11, [cfq+32*1] pmulhrsw m5, m11, [cfq+32*2] pmulhrsw m4, m11, [cfq+32*3] vpblendd m7, m2, m12, 0x0f vpblendd m11, m2, m12, 0xf0 punpckhwd m2, m3 ; 9 13 11 15 punpcklwd m0, m7 ; 0 16 2 10 vpblendd m7, m3, m6, 0xf0 vpblendd m3, m6, 0x0f punpckhwd m6, m12 ; 21 17 23 19 punpcklwd m1, m3 ; 4 20 6 14 punpcklwd m7, m4, m7 ; 28 12 30 22 punpckhwd m4, m5 ; 29 25 31 27 punpcklwd m5, m11 ; 24 8 26 18 mova [cfq-32*4], m0 mova [cfq-32*3], m1 mova [cfq-32*2], m5 mova [cfq-32*1], m7 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 %if WIN64 movaps [r8+8], xm13 %endif call m(inv_txfm_add_4x32_8bpc).dct32 mova xm0, [cfq-16*7] ; 2 10 mova xm1, [cfq-16*5] ; 6 14 mova xm2, [cfq-16*3] ; 26 18 mova xm3, [cfq-16*1] ; 30 22 vpbroadcastd m10, [o(pd_64)] call m(inv_txfm_add_4x16_8bpc).dct16d add r3, 4*8 mova [cfq+16*0], xm8 ; b0 mova [cfq+16*1], xm9 ; b1 mova [cfq+16*6], xm12 ; b7 mova [cfq+16*7], xm11 ; b6 call m(inv_txfm_add_4x16_8bpc).dct16d mova xm7, [cfq-16*6] ; 4 20 mova xm0, [cfq-16*8] ; 0 16 mova xm2, [cfq-16*2] ; 28 12 mova xm1, [cfq-16*4] ; 24 8 mova [cfq+16*2], xm9 ; b3 mova [cfq+16*3], xm8 ; b2 mova [cfq+16*4], xm11 ; b4 mova [cfq+16*5], xm12 ; b5 call m(inv_txfm_add_4x8_8bpc).dct8b %if WIN64 movaps xm13, [r8+8] %endif jmp .pass1_dct2 .pass1_dct_fast: punpcklwd m0, m2 ; 0 8 2 10 punpckhwd m2, m3 ; 9 13 11 15 punpcklwd m1, m3 ; 4 12 6 14 vpbroadcastd m10, [o(pd_64)] mova [cfq-32*4], m0 mova [cfq-32*3], m1 vextracti128 xm3, m2, 1 call m(inv_txfm_add_4x32_8bpc).dct32_fast mova xm9, [cfq-16*8] ; 0 8 mova xm0, [cfq-16*7] ; 2 10 mova xm8, [cfq-16*6] ; 4 12 mova xm1, [cfq-16*5] ; 6 14 call m(inv_txfm_add_4x16_8bpc).dct16_fast3 .pass1_dct2: mova m4, [cfq+32*0] ; b0 b1 mova m5, [cfq+32*1] ; b3 b2 mova m7, [rsp+32*0] ; c0 c1 mova m6, [rsp+32*1] ; c3 c2 psubd m12, m0, m4 ; a15 a14 paddd m0, m4 ; a0 a1 psubd m11, m1, m5 ; a12 a13 paddd m1, m5 ; a3 a2 psubd m5, m0, m7 ; out31 out30 paddd m0, m7 ; out0 out1 psubd m7, m1, m6 ; out28 out29 paddd m1, m6 ; 
out3 out2 REPX {psrad x, 7}, m0, m1, m5, m7 packssdw m0, m1 ; 0 3 1 2 packssdw m7, m5 ; 28 31 29 30 mova m4, [cfq+32*2] ; b4 b5 mova m1, [cfq+32*3] ; b7 b6 mova m6, [rsp+32*2] ; c4 c5 mova m8, [rsp+32*3] ; c7 c6 psubd m5, m2, m4 ; a11 a10 paddd m2, m4 ; a4 a5 paddd m4, m3, m1 ; a7 a6 psubd m3, m1 ; a8 a9 paddd m1, m2, m6 ; out4 out5 psubd m2, m6 ; out27 out26 psubd m6, m4, m8 ; out24 out25 paddd m4, m8 ; out7 out6 REPX {psrad x, 7}, m1, m4, m2, m6 packssdw m1, m4 ; 4 7 5 6 packssdw m6, m2 ; 24 27 25 26 mova m4, [rsp+32*4] ; c8 c9 mova m8, [rsp+32*5] ; c11 c10 paddd m2, m3, m4 ; out8 out9 psubd m3, m4 ; out23 out22 paddd m4, m5, m8 ; out11 out10 psubd m5, m8 ; out20 out21 REPX {psrad x, 7}, m2, m4, m3, m5 packssdw m2, m4 ; 8 11 9 10 packssdw m5, m3 ; 20 23 21 22 mova m4, [rsp+32*6] ; c12 c13 mova m8, [rsp+32*7] ; c15 c14 paddd m3, m11, m4 ; out12 out13 psubd m11, m4 ; out19 out18 psubd m4, m12, m8 ; out16 out17 paddd m12, m8 ; out15 out14 REPX {psrad x, 7}, m3, m12, m11, m4 packssdw m3, m12 ; 12 15 13 14 packssdw m4, m11 ; 16 19 17 18 mov rsp, r8 %if WIN64 pop r8 %endif punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 vperm2i128 m3, m0, m4, 0x31 vinserti128 m0, xm4, 1 vinserti128 m4, m2, xm6, 1 vperm2i128 m2, m6, 0x31 vperm2i128 m6, m7, m8, 0x31 vinserti128 m7, xm8, 1 vperm2i128 m8, m1, m5, 0x31 vinserti128 m1, xm5, 1 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhwd m7, m4, m2 punpcklwd m4, m2 punpckhwd m2, m0, m3 punpcklwd m0, m3 punpckhwd m3, m8, m1 punpcklwd m8, m1 punpckhdq m1, m0, m6 punpckldq m0, m6 punpckldq m6, m7, m3 punpckhdq m7, m3 punpckhdq m3, m2, m5 punpckldq m2, m5 punpckhdq m5, m4, m8 punpckldq m4, m8 jmp tx2q .pass2_dct: mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq-32*2], m6 mova [cfq-32*1], m7 vpbroadcastd m10, [o(pd_1024)] call m(inv_txfm_add_16x4_8bpc).dct4 REPX {psrad x, 11}, m0, m4, m1, m5 packssdw m4, m0, m4 packssdw m5, m1, m5 REPX 
{psrad x, 11}, m2, m6, m3, m7 packssdw m6, m2, m6 packssdw m7, m3, m7 mova m0, [cfq-32*4] mova m1, [cfq-32*3] mova m2, [cfq-32*2] mova m3, [cfq-32*1] mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq-32*2], m6 mova [cfq-32*1], m7 call m(inv_txfm_add_16x4_8bpc).dct4 REPX {psrad x, 11}, m0, m4, m1, m5 packssdw m4, m0, m4 mova m0, [cfq-32*4] packssdw m5, m1, m5 mova m1, [cfq-32*3] pxor m10, m10 call .write_32x2 REPX {psrad x, 11}, m2, m6, m3, m7 mova m0, [cfq-32*2] mova m1, [cfq-32*1] packssdw m4, m2, m6 packssdw m5, m3, m7 call .write_32x2 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .dconly: movd xm3, [o(pw_181x128)] pmulhrsw xm3, [cfq] movd xm2, [o(pw_33)] or r4d, 4 paddw xm3, xm2 psraw xm3, 6 .dconly2: vpbroadcastw m3, xm3 pxor xm2, xm2 mova [cfq], xm2 .dconly_loop: movu m1, [dstq] punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 movu [dstq], m0 add dstq, dsq dec r4d jg .dconly_loop vzeroupper ret .pass1_identity: mova xm0, [cfq-16*8] vinserti128 m0, [cfq+16*0], 1 ; 0 1 16 17 mova xm1, [cfq-16*7] vinserti128 m1, [cfq+16*1], 1 ; 2 3 18 19 mova xm2, [cfq-16*6] vinserti128 m2, [cfq+16*2], 1 ; 4 5 20 21 mova xm3, [cfq-16*5] vinserti128 m3, [cfq+16*3], 1 ; 6 7 22 23 mova xm4, [cfq-16*4] vinserti128 m4, [cfq+16*4], 1 ; 8 9 24 25 mova xm5, [cfq-16*3] vinserti128 m5, [cfq+16*5], 1 ; 10 11 26 27 mova xm6, [cfq-16*2] vinserti128 m6, [cfq+16*6], 1 ; 12 13 28 29 mova xm7, [cfq-16*1] vinserti128 m7, [cfq+16*7], 1 ; 14 15 30 31 vpbroadcastd m12, [o(pw_53x512)] REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m8 punpcklwd m0, m8 punpckhwd m8, m2, m1 punpcklwd m2, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m8 punpckhqdq m3, m8 call .identity_main punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhwd m7, m4, m8 punpcklwd m4, m8 punpckhwd m8, m6, m5 punpcklwd m6, m5 punpckhqdq m5, m4, m6 punpcklqdq m4, m6 
punpcklqdq m6, m7, m8 punpckhqdq m7, m8 jmp tx2q .pass2_identity: vpbroadcastd m12, [o(pw_2048)] pxor m10, m10 REPX {pmulhrsw x, m12}, m0, m4, m1, m5 test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm .pass2_identity2: call .write_32x2 pmulhrsw m0, m12, m2 pmulhrsw m4, m12, m6 pmulhrsw m1, m12, m3 pmulhrsw m5, m12, m7 call .write_32x2 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .hdpcm: call .write_32x2_hdpcm pmulhrsw m0, m12, m2 pmulhrsw m1, m12, m3 pmulhrsw m4, m12, m6 pmulhrsw m5, m12, m7 call .write_32x2_hdpcm2 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 .vdpcm: paddw m1, m0 paddw m5, m4 call .write_32x2 pmulhrsw m0, m12, m2 pmulhrsw m4, m12, m6 pmulhrsw m2, m12, m3 pmulhrsw m3, m12, m7 call .write_32x2_vdpcm2 jmp m(inv_txfm_add_16x8_8bpc).pass2_end3 ALIGN function_align .write_32x2_hdpcm: vpbroadcastd m11, [o(pb_14_15)] .write_32x2_hdpcm2: pslldq m8, m0, 2 pslldq m9, m1, 2 paddw m0, m8 pslldq m8, m4, 2 paddw m1, m9 pslldq m9, m5, 2 paddw m4, m8 pslldq m8, m0, 4 paddw m5, m9 pslldq m9, m1, 4 paddw m0, m8 pslldq m8, m4, 4 paddw m1, m9 pslldq m9, m5, 4 paddw m4, m8 punpcklqdq m8, m10, m0 paddw m5, m9 punpcklqdq m9, m10, m1 paddw m0, m8 punpcklqdq m8, m10, m4 paddw m1, m9 punpcklqdq m9, m10, m5 paddw m4, m8 pshufb m8, m0, m11 paddw m5, m9 pshufb m9, m1, m11 paddw m4, m8 paddw m5, m9 pshufb xm8, xm4, xm11 pshufb xm9, xm5, xm11 vinserti128 m8, m10, xm8, 1 vinserti128 m9, m10, xm9, 1 paddw m0, m8 paddw m4, m8 paddw m1, m9 paddw m5, m9 jmp .write_32x2 .write_32x2_vdpcm2: paddw m0, m1 paddw m4, m5 paddw m1, m0, m2 paddw m5, m4, m3 .write_32x2: movu m9, [dstq+dsq*0] punpcklbw m8, m9, m10 punpckhbw m9, m10 paddw m8, m0 paddw m9, m4 packuswb m8, m9 movu m9, [dstq+dsq*1] movu [dstq+dsq*0], m8 punpcklbw m8, m9, m10 punpckhbw m9, m10 paddw m8, m1 paddw m9, m5 packuswb m8, m9 movu [dstq+dsq*1], m8 lea dstq, [dstq+dsq*2] ret ALIGN function_align .identity_main: pmulhrsw m8, m12, m0 ; (x * 362 + 64) >> 7 pmulhrsw m9, m12, m1 ; = x * 2 + ((x * (181 - 128) + 32) >> 6) pmulhrsw m10, 
m12, m2 ; = x * 2 + ((x * 53 * 512 + 16384) >> 15) pmulhrsw m11, m12, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m8 pmulhrsw m8, m12, m4 paddsw m1, m9 pmulhrsw m9, m12, m5 paddsw m2, m10 pmulhrsw m10, m12, m6 paddsw m3, m11 pmulhrsw m11, m12, m7 REPX {paddsw x, x}, m4, m5, m6, m7 paddsw m4, m8 paddsw m5, m9 paddsw m6, m10 paddsw m7, m11 ret .pass2_flipadst: lea r3, [o(flipadst4_mat)] jmp .pass2_dst .pass2_adst: lea r3, [o(adst4_mat)] .pass2_dst: mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq-32*2], m6 mova [cfq-32*1], m7 call m(inv_txfm_add_16x4_8bpc).dst4 vpbroadcastd m12, [o(pw_2048)] REPX {psrad x, 7}, m0, m4, m1, m5 packssdw m4, m0, m4 packssdw m5, m1, m5 REPX {psrad x, 7}, m2, m6, m3, m7 packssdw m6, m2, m6 packssdw m7, m3, m7 mova m0, [cfq-32*4] mova m1, [cfq-32*3] mova m2, [cfq-32*2] mova m3, [cfq-32*1] mova [cfq-32*4], m4 mova [cfq-32*3], m5 mova [cfq-32*2], m6 mova [cfq-32*1], m7 call m(inv_txfm_add_16x4_8bpc).dst4 REPX {psrad x, 7}, m0, m4, m1, m5 packssdw m4, m0, m4 pmulhrsw m0, m12, [cfq-32*4] packssdw m5, m1, m5 pmulhrsw m1, m12, [cfq-32*3] pmulhrsw m4, m12 pmulhrsw m5, m12 REPX {psrad x, 7}, m2, m6, m3, m7 packssdw m6, m2, m6 mova m2, [cfq-32*2] packssdw m7, m3, m7 mova m3, [cfq-32*1] pxor m10, m10 jmp .pass2_identity2 INV_TXFM_FN 8, 32 add cfq, 32*4 PROLOGUE 0, 8, 13 lea r7, [cfq+32*8] jmp tx1q .dconly: movd xm5, [o(pw_256)] pmulhrsw xm5, [cfq] or r4d, 8 jmp m(inv_txfm_add_8x4_8bpc).dconly2 .pass1_dct: lea r3d, [eobq+(84<<16)] test eobb, 0x10 ; TX_CLASS_H cmovnz eobd, r3d sub eobd, 100<<16 jl .pass1_dct_fast add cfq, 32 call .pass1_dct_main mova [cfq-32*4], m0 mova [cfq-32*2], m1 mova [cfq+32*0], m2 mova [cfq+32*2], m3 sub cfq, 32 mova [r3 -32*4], m4 mova [r3 -32*2], m5 mova [r3 +32*0], m6 mova [r3 +32*2], m7 .pass1_dct_fast: call .pass1_dct_main .pass1_end: test eobb, 0x20 jnz .pass2_identity .pass2_dct: punpckhwd m8, m0, m2 ; 1 5 punpckhwd m9, m1, m3 ; 3 7 punpcklwd m1, m5 ; 2 10 punpcklwd m10, m3, m7 ; 6 14 punpckhwd m3, m5, m7 ; 11 15 
; Continuation of .pass2_dct (8x32). The trailing numbers on the existing
; comments are the original author's row-index annotations.
    mova             [cfq-32*4], m1
    mova             [cfq+32*2], m10
    ; Keep the incoming rsp in r5, align rsp to 32 bytes and reserve 32*16
    ; bytes of scratch; .pass2_dct2 reads that area back as [rsp+32*n]
    ; (presumably filled by the dct32 helpers - confirm).
    mov              r5, rsp
    and              rsp, ~31
%if WIN64
    ; xm13 is parked at the pre-alignment stack pointer and reloaded from
    ; [r5] just before rsp is restored at the end of the second pass.
    ; NOTE(review): assumes 16 free, 16-byte-aligned bytes at the incoming
    ; rsp - confirm against the prologue's stack layout.
    mova             [r5], xm13
%endif
    sub              rsp, 32*16
    ; Negative eobd selects the reduced-input variant.
    test             eobd, eobd
    jl .pass2_dct_fast
    ; Full path: load the remaining rows from cfq / r7 and interleave them
    ; with the rows already held in registers, spilling the pairs that the
    ; later helper reads back to the cfq buffer.
    mova             m10, [cfq-32*3] ; 16 17
    mova             m1, [cfq-32*1] ; 18 19
    mova             m11, [cfq+32*1] ; 20 21
    mova             m7, [cfq+32*3] ; 22 23
    punpcklwd        m0, m10 ; 0 16
    mova             [cfq-32*1], m0
    punpcklwd        m2, m11 ; 4 20
    mova             [cfq-32*2], m2
    punpckhwd        m2, m4, m6 ; 9 13
    mova             m0, [r7 -32*3] ; 24 25
    mova             m13, [r7 -32*1] ; 26 27
    mova             m12, [r7 +32*1] ; 28 29
    mova             m5, [r7 +32*3] ; 30 31
    punpcklwd        m4, m0, m4 ; 24 8
    mova             [cfq+32*1], m4
    punpckhwd        m4, m12, m0 ; 29 25
    punpcklwd        m12, m6 ; 28 12
    mova             [cfq+32*0], m12
    punpckhwd        m6, m11, m10 ; 21 17
    punpcklwd        m0, m5, m7 ; 30 22
    mova             [cfq+32*3], m0
    punpckhwd        m7, m1 ; 23 19
    punpckhwd        m5, m13 ; 31 27
    punpcklwd        m13, m1 ; 26 18
    mova             [cfq-32*3], m13
    call             m(inv_txfm_add_8x32_8bpc).dct32
    ; Reload the pairs spilled above as register inputs for dct16c.
    mova             m0, [cfq-32*4] ; 2 10
    mova             m1, [cfq+32*2] ; 6 14
    mova             m2, [cfq-32*3] ; 26 18
    mova             m3, [cfq+32*3] ; 30 22
    ; m10 = pd_4096, which is half of 1<<13 - presumably the rounding bias
    ; for the ">> 13" shifts applied in .pass2_dct2 (confirm in dct16c).
    vpbroadcastd     m10, [o(pd_4096)]
    call             m(inv_txfm_add_8x16_8bpc).dct16c
    jmp .pass2_dct2
; Reduced-input variant: only the rows already held in registers are
; interleaved; nothing further is loaded from cfq / r7 before the call.
.pass2_dct_fast:
    punpcklwd        m0, m4 ; 0 8
    punpcklwd        m10, m2, m6 ; 4 12
    punpckhwd        m2, m4, m6 ; 9 13
    mova             [cfq-32*2], m0
    mova             [cfq+32*0], m10
    call             m(inv_txfm_add_8x32_8bpc).dct32_fast
    mova             m0, [cfq-32*4]
    mova             m1, [cfq+32*2]
    mova             m8, [cfq+32*0]
    mova             m9, [cfq-32*2]
    vpbroadcastd     m10, [o(pd_4096)]
    call             m(inv_txfm_add_8x16_8bpc).dct16_fast3
; Final butterflies. Per the existing annotations: "b" terms are read from
; the cfq buffer, "c" terms from the rsp scratch area; a = reg -/+ b and
; out = a -/+ c. Outputs are shifted right by 13 and packed to 16 bit.
.pass2_dct2:
    mova             m13, [cfq-32* 4] ; b0
    mova             m9, [cfq-32* 3] ; b1
    mova             m10, [rsp+32* 0] ; c0
    mova             m8, [rsp+32* 1] ; c1
    psubd            m11, m0, m13 ; a15
    paddd            m0, m13 ; a0
    psubd            m13, m1, m9 ; a14
    paddd            m1, m9 ; a1
    psubd            m9, m0, m10 ; out31
    paddd            m0, m10 ; out0
    psubd            m10, m1, m8 ; out30
    paddd            m1, m8 ; out1
    REPX {psrad x, 13}, m0, m1, m9, m10
    packssdw         m0, m1 ; 0 1
    packssdw         m10, m9 ; 30 31
    mova             m12, [cfq-32* 1] ; b2
    mova             m1, [cfq-32* 2] ; b3
    mova             m8, [rsp+32* 3] ; c2
    mova             m9, [rsp+32* 2] ; c3
    ; Spill a14, a15 and the packed rows 30/31 into cfq slots whose "b"
    ; values have been consumed; they are reloaded further down.
    mova             [cfq-32*4], m13
    mova             [cfq-32*3], m11
    mova             [cfq-32*2], m10
    psubd            m13, m2, m12 ; a13
    paddd            m2, m12 ; a2
    psubd            m12, m3, m1 ; a12
    paddd            m3, m1 ; a3
paddd m1, m2, m8 ; out2 psubd m2, m8 ; out29 psubd m8, m3, m9 ; out28 paddd m3, m9 ; out3 REPX {psrad x, 13}, m1, m3, m2, m8 packssdw m1, m3 ; 2 3 packssdw m8, m2 ; 28 29 mova [cfq-32*1], m8 call m(inv_txfm_add_8x4_8bpc).write_8x4_start mova m3, [cfq+32* 0] ; b4 mova m0, [cfq+32* 1] ; b5 mova m1, [rsp+32* 4] ; c4 mova m8, [rsp+32* 5] ; c5 psubd m2, m4, m3 ; a11 paddd m4, m3 ; a4 psubd m3, m5, m0 ; a10 paddd m5, m0 ; a5 paddd m0, m4, m1 ; out4 psubd m4, m1 ; out27 paddd m1, m5, m8 ; out5 psubd m5, m8 ; out26 REPX {psrad x, 13}, m0, m1, m4, m5 packssdw m0, m1 ; 4 5 packssdw m5, m4 ; 26 27 mova m4, [cfq+32* 3] ; b6 mova m1, [cfq+32* 2] ; b7 mova m8, [rsp+32* 7] ; c6 mova m11, [rsp+32* 6] ; c7 paddd m9, m6, m4 ; a6 psubd m6, m4 ; a9 psubd m4, m7, m1 ; a8 paddd m7, m1 ; a7 paddd m1, m9, m8 ; out6 psubd m9, m8 ; out25 paddd m8, m7, m11 ; out7 psubd m7, m11 ; out24 REPX {psrad x, 13}, m1, m8, m9, m7 packssdw m1, m8 ; 6 7 packssdw m7, m9 ; 24 25 call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq mova m1, [rsp+32* 8] ; c8 mova m8, [rsp+32* 9] ; c9 paddd m0, m4, m1 ; out8 psubd m4, m1 ; out23 paddd m1, m6, m8 ; out9 psubd m6, m8 ; out22 REPX {psrad x, 13}, m0, m1, m4, m6 packssdw m0, m1 ; 8 9 packssdw m6, m4 ; 22 23 mova m4, [rsp+32*11] ; c10 mova m8, [rsp+32*10] ; c11 paddd m1, m3, m4 ; out10 psubd m3, m4 ; out21 paddd m4, m2, m8 ; out11 psubd m2, m8 ; out20 REPX {psrad x, 13}, m1, m4, m3, m2 packssdw m1, m4 ; 10 11 packssdw m2, m3 ; 20 21 call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq mova m1, [rsp+32*12] ; c12 mova m3, [rsp+32*13] ; c13 paddd m0, m12, m1 ; out12 psubd m12, m1 ; out19 paddd m1, m13, m3 ; out13 psubd m13, m3 ; out18 mova m3, [cfq-32*4] ; a14 mova m8, [rsp+32*15] ; c14 mova m4, [cfq-32*3] ; a15 mova m9, [rsp+32*14] ; c15 REPX {psrad x, 13}, m0, m1, m12, m13 packssdw m0, m1 ; 12 13 paddd m1, m3, m8 ; out14 psubd m3, m8 ; out17 paddd m8, m4, m9 ; out15 psubd m4, m9 ; out16 REPX {psrad x, 13}, m1, m8, m3, m4 packssdw m1, m8 ; 14 15 call 
m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq packssdw m0, m4, m3 ; 16 17 packssdw m1, m13, m12 ; 18 19 call m(inv_txfm_add_8x4_8bpc).write_8x4_vpermq vpermq m0, m2, q3120 vpermq m1, m6, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4 vpermq m0, m7, q3120 vpermq m1, m5, q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4 vpermq m0, [cfq-32*1], q3120 vpermq m1, [cfq-32*2], q3120 call m(inv_txfm_add_8x4_8bpc).write_8x4 %if WIN64 mova xm13, [r5] %endif mov rsp, r5 jmp m(inv_txfm_add_32x8_8bpc).pass2_end ALIGN function_align .pass1_dct_main: lea r3, [cfq+32*8] mova m1, [cfq-32*2] mova m3, [cfq+32*2] mova m5, [r3 -32*2] mova m7, [r3 +32*2] call m(inv_txfm_add_16x8_8bpc).dct8 mova [cfq-32*2], m0 mova [cfq+32*2], m1 mova [r3 -32*2], m2 mova [r3 +32*2], m3 mova m0, [cfq-32*4] mova m1, [cfq+32*0] mova m2, [r3 -32*4] mova m3, [r3 +32*0] mova [cfq-32*4], m4 mova [cfq+32*0], m5 mova [r3 -32*4], m6 mova [r3 +32*0], m7 vpbroadcastd m10, [o(pd_32)] call m(inv_txfm_add_16x4_8bpc).dct4 mova m8, [cfq-32*2] ; b0 mova m9, [cfq-32*4] psubd m10, m0, m8 ; out7a paddd m0, m8 ; out0a mova m8, [r3 +32*2] ; b3 psubd m11, m4, m9 ; out7b paddd m4, m9 ; out0b mova m9, [r3 +32*0] REPX {psrad x, 6}, m0, m4, m10, m11 packssdw m0, m4 psubd m4, m3, m8 ; out4a paddd m3, m8 ; out3a psubd m8, m7, m9 ; out4b paddd m7, m9 ; out3b mova m9, [cfq+32*2] ; b1 REPX {psrad x, 6}, m4, m8, m3, m7 packssdw m4, m8 mova m8, [cfq+32*0] packssdw m3, m7 packssdw m7, m10, m11 psubd m10, m1, m9 ; out6a paddd m1, m9 ; out1a mova m9, [r3 -32*2] ; b2 psubd m11, m5, m8 ; out6b paddd m5, m8 ; out1b mova m8, [r3 -32*4] REPX {psrad x, 6}, m1, m5, m10, m11 packssdw m1, m5 psubd m5, m2, m9 ; out5a paddd m2, m9 ; out2a psubd m9, m6, m8 ; out5b paddd m6, m8 ; out2b REPX {psrad x, 6}, m5, m9, m2, m6 packssdw m5, m9 packssdw m2, m6 packssdw m6, m10, m11 jmp m(inv_txfm_add_8x16_8bpc).transpose16x8 ALIGN function_align .dct32: IDCT32_1D_PACKED .dct32_fast: IDCT32_1D_PACKED_FAST .pass1_identity: vpbroadcastd m12, [o(pw_53x512)] lea r3d, 
[eobq-(128<<16)] test eobb, 0x10 ; TX_CLASS_V cmovnz eobd, r3d lea r3, [cfq+32*8] test eobd, eobd jl .pass1_identity_fast mova m0, [cfq-32*3] mova m1, [cfq-32*1] mova m2, [cfq+32*1] mova m3, [cfq+32*3] mova m4, [r3 -32*3] mova m5, [r3 -32*1] mova m6, [r3 +32*1] mova m7, [r3 +32*3] call m(inv_txfm_add_32x4_8bpc).identity_main call m(inv_txfm_add_8x16_8bpc).transpose16x8 mova [cfq-32*3], m0 mova [cfq-32*1], m1 mova [cfq+32*1], m2 mova [cfq+32*3], m3 mova [r3 -32*3], m4 mova [r3 -32*1], m5 mova [r3 +32*1], m6 mova [r3 +32*3], m7 .pass1_identity_fast: mova m0, [cfq-32*4] mova m1, [cfq-32*2] mova m2, [cfq+32*0] mova m3, [cfq+32*2] mova m4, [r3 -32*4] mova m5, [r3 -32*2] mova m6, [r3 +32*0] mova m7, [r3 +32*2] call m(inv_txfm_add_32x4_8bpc).identity_main call m(inv_txfm_add_8x16_8bpc).transpose16x8 test eobb, 0x20 jz .pass2_dct .pass2_identity: vpbroadcastd m12, [o(pw_181x8)] REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 lea r6, [dsq*3] pxor m10, m10 test eobd, eobd jl .pass2_identity_fast test eobd, 0x100 jnz .hdpcm test eobd, 0x200 jnz .vdpcm call m(inv_txfm_add_8x16_8bpc).write_8x16 call .pass2_identity_load_round_bottomhalf .pass2_identity_fast: call m(inv_txfm_add_8x16_8bpc).write_8x16 jmp m(inv_txfm_add_32x8_8bpc).pass2_end .hdpcm: call m(inv_txfm_add_8x16_8bpc).write_8x16_hdpcm call .pass2_identity_load_round_bottomhalf call m(inv_txfm_add_8x16_8bpc).write_8x16_hdpcm jmp m(inv_txfm_add_32x8_8bpc).pass2_end .vdpcm: call m(inv_txfm_add_8x16_8bpc).write_8x16_vdpcm punpckhqdq m8, m7, m10 call .pass2_identity_load_round_bottomhalf paddw m0, m8 call m(inv_txfm_add_8x16_8bpc).write_8x16_vdpcm jmp m(inv_txfm_add_32x8_8bpc).pass2_end .pass2_identity_load_round_bottomhalf: lea r3, [cfq+32*8] pmulhrsw m0, m12, [cfq-32*3] pmulhrsw m1, m12, [cfq-32*1] pmulhrsw m2, m12, [cfq+32*1] pmulhrsw m3, m12, [cfq+32*3] pmulhrsw m4, m12, [r3 -32*3] pmulhrsw m5, m12, [r3 -32*1] pmulhrsw m6, m12, [r3 +32*1] pmulhrsw m7, m12, [r3 +32*3] ret .pass1_flipddt: lea r3, [o(flipddt8_mat)] 
; Remaining pass-1 entry stubs of a transform function whose entry point lies
; before this span. Each stub points r3 at the coefficient table for its
; transform type and then joins the shared .pass1_dst path.
    jmp .pass1_dst
.pass1_ddt:
    lea r3, [o(ddt8_mat)]
    jmp .pass1_dst
.pass1_flipadst:
    lea r3, [o(flipadst8_mat)]
    jmp .pass1_dst
.pass1_adst:
    lea r3, [o(adst8_mat)]
.pass1_dst:
; r8 is used as scratch and as the loop counter below; it is saved and
; restored on WIN64 builds only.
%if WIN64
    push r8
%endif
    ; If bit 0x10 of eob's low byte is set, add 84<<16 to eob before the
    ; 100<<16 subtraction, i.e. the fast-path threshold becomes 16<<16.
    lea r8d, [eobq+(84<<16)]
    test eobb, 0x10 ; TX_CLASS_H
    cmovnz eobd, r8d
    sub eobd, 100<<16
    jl .pass1_dst_fast
    ; Full path: transform the rows 32 bytes further in first, write the
    ; eight results back, rewind r3 by 16*8 and restore cfq.
    ; NOTE(review): the r3 rewind suggests the dst8 helper advances r3 - confirm.
    add cfq, 32
    call .pass1_dst_main
    mova [cfq-64*2], m0
    mova [cfq-64*1], m1
    mova [cfq+64*0], m2
    mova [cfq+64*1], m3
    sub r3, 16*8
    mova [cfq+64*2], m4
    mova [cfq+64*3], m5
    mova [cfq+64*4], m6
    mova [cfq+64*5], m7
    sub cfq, 32
.pass1_dst_fast:
    call .pass1_dst_main
%if WIN64
    pop r8
%endif
    jmp .pass1_end
ALIGN function_align
.pass1_dst_main:
    ; Load eight rows spaced 64 bytes apart around cfq.
    mova m0, [cfq-64*2]
    mova m1, [cfq-64*1]
    mova m2, [cfq+64*0]
    mova m3, [cfq+64*1]
    mova m4, [cfq+64*2]
    mova m5, [cfq+64*3]
    mova m6, [cfq+64*4]
    mova m7, [cfq+64*5]
    ; Interleave the rows as 16-bit pairs (low and high halves separately).
    punpcklwd m10, m0, m2 ; 0 2
    punpckhwd m0, m2
    punpcklwd m2, m1, m3 ; 1 3
    punpckhwd m1, m3
    punpcklwd m3, m4, m6 ; 4 6
    punpckhwd m4, m6
    punpcklwd m11, m5, m7 ; 5 7
    punpckhwd m5, m7
    vpbroadcastd m12, [o(pd_32)]
    mov r8d, 64*7
.pass1_dst_loop:
    ; Eight iterations (r8 = 64*7 down to 0): one dst8 call each, then add
    ; the pd_32 rounding term, shift right by 6, pack to words and store
    ; the row at [cfq+r8-64*2].
    call m(inv_txfm_add_16x8_8bpc).dst8
    paddd m6, m12
    paddd m7, m12
    psrad m6, 6
    psrad m7, 6
    packssdw m7, m6, m7
    mova [cfq+r8-64*2], m7
    sub r8d, 64
    jge .pass1_dst_loop
    ; Reload the stored rows in reverse address order. m7 is not reloaded:
    ; it still holds the row written by the last iteration (offset -64*2).
    mova m0, [cfq+64*5]
    mova m1, [cfq+64*4]
    mova m2, [cfq+64*3]
    mova m3, [cfq+64*2]
    mova m4, [cfq+64*1]
    mova m5, [cfq+64*0]
    mova m6, [cfq-64*1]
    jmp m(inv_txfm_add_8x16_8bpc).transpose16x8

; 32x8 inverse transform. INV_TXFM_FN is a macro defined outside this span.
INV_TXFM_FN 32, 8
    ; Bias cfq by 32*4 and keep r7 = cfq + 32*8 as a second base pointer so
    ; the code below can use small signed offsets from either.
    add cfq, 32*4
    PROLOGUE 0, 8, 13
    lea r7, [cfq+32*8]
    ; Bit 0 of tx1 selects the identity first pass.
    test tx1b, 0x01
    jnz .pass1_identity
.pass1_dct:
    ; Load the odd-numbered 16-byte rows below r7 (offsets -16*7 .. +16*7)
    ; and merge them pairwise.
    vbroadcasti128 m9, [cfq-16*7]
    vbroadcasti128 m0, [cfq-16*5]
    vbroadcasti128 m2, [cfq-16*3]
    vbroadcasti128 m7, [cfq-16*1]
    vbroadcasti128 m3, [cfq+16*1]
    vbroadcasti128 m4, [cfq+16*3]
    vbroadcasti128 m6, [cfq+16*5]
    vbroadcasti128 m1, [cfq+16*7]
    shufpd m9, m0, 0x0c
    shufpd m2, m7, 0x0c
    shufpd m3, m4, 0x0c
    shufpd m6, m1, 0x0c
    punpcklwd m8, m9, m2 ; 1 5
    punpckhwd m9, m2 ; 3 7
    punpcklwd m2, m3, m6 ; 9 13
    punpckhwd m3, m6 ; 11 15
%if WIN64
    push r8
%endif
    ; Remember the incoming stack pointer in r8, then align rsp down to 32.
    mov r8, rsp
    and rsp, ~31
    ; r3d = eob + (24<<16); the instructions that consume it follow this span.
    lea r3d, [eobq+(24<<16)]
; Continuation of the 32x8 DCT first pass. On entry r3d holds an adjusted copy
; of eob and r8 holds the pre-alignment stack pointer (both set just before
; this span).
    ; When bit 0x10 of eob's low byte is clear, eob is replaced by r3d.
    test eobb, 0x10 ; TX_CLASS_H
    cmovz eobd, r3d
    ; Reserve 32*16 bytes of stack scratch.
    sub rsp, 32*16
    ; Below 128<<16 the *_fast helper variants are used and the rows based
    ; at r7 are never loaded.
    cmp eobd, 128<<16
    jl .pass1_dct_fast
    ; Odd-numbered 16-byte rows relative to r7, merged pairwise.
    vbroadcasti128 m4, [r7-16*7]
    vbroadcasti128 m11, [r7-16*5]
    vbroadcasti128 m7, [r7-16*3]
    vbroadcasti128 m12, [r7-16*1]
    vbroadcasti128 m0, [r7+16*1]
    vbroadcasti128 m1, [r7+16*3]
    vbroadcasti128 m5, [r7+16*5]
    vbroadcasti128 m6, [r7+16*7]
    shufpd m4, m11, 0x0c
    shufpd m7, m12, 0x0c
    shufpd m0, m1, 0x0c
    shufpd m5, m6, 0x0c
    punpcklwd m6, m7, m4 ; 21 17
    punpckhwd m7, m4 ; 23 19
    punpcklwd m4, m5, m0 ; 29 25
    punpckhwd m5, m0 ; 31 27
    ; On WIN64 xm13 is spilled to [r8+8] around the dct32 call.
%if WIN64
    mova [r8+8], xm13
%endif
    call m(inv_txfm_add_8x32_8bpc).dct32
%if WIN64
    mova xm13, [r8+8]
%endif
    call .pass1_dct_load_dct16
    ; Even-numbered 16-byte rows relative to r7 complete the dct16 input.
    vbroadcasti128 m4, [r7-16*8]
    vbroadcasti128 m8, [r7-16*6]
    vbroadcasti128 m5, [r7-16*4]
    vbroadcasti128 m9, [r7-16*2]
    vbroadcasti128 m6, [r7+16*0]
    vbroadcasti128 m11, [r7+16*2]
    vbroadcasti128 m7, [r7+16*4]
    vbroadcasti128 m12, [r7+16*6]
    shufpd m4, m8, 0x0c ; 16 18
    shufpd m5, m9, 0x0c ; 20 22
    shufpd m6, m11, 0x0c ; 24 26
    shufpd m7, m12, 0x0c ; 28 30
    call m(inv_txfm_add_8x16_8bpc).dct16
    jmp .pass1_dct2
.pass1_dct_load_dct16:
    ; Shared by the full and fast paths: loads the even-numbered 16-byte
    ; rows relative to cfq into m0-m3 and sets m10 to pd_32.
    vbroadcasti128 m0, [cfq-16*8]
    vbroadcasti128 m4, [cfq-16*6]
    vbroadcasti128 m1, [cfq-16*4]
    vbroadcasti128 m5, [cfq-16*2]
    vbroadcasti128 m2, [cfq+16*0]
    vbroadcasti128 m6, [cfq+16*2]
    vbroadcasti128 m3, [cfq+16*4]
    vbroadcasti128 m7, [cfq+16*6]
    vpbroadcastd m10, [o(pd_32)]
    shufpd m0, m4, 0x0c ; 0 2
    shufpd m1, m5, 0x0c ; 4 6
    shufpd m2, m6, 0x0c ; 8 10
    shufpd m3, m7, 0x0c ; 12 14
    ret
.pass1_dct_fast:
    call m(inv_txfm_add_8x32_8bpc).dct32_fast
    call .pass1_dct_load_dct16
    call m(inv_txfm_add_8x16_8bpc).dct16_fast
.pass1_dct2:
; Final butterfly stage for one register of the first pass.
;   %1 = register holding term "a" (receives the packed out0/out16 pair)
;   %2 = row index of term "b", read from [cfq+32*%2]
;   %3 = stack slot of c0, %4 = stack slot of c15 (both [rsp+32*n])
;   %5 = arithmetic right-shift applied to all four results
;   %6 = address that receives the packed out15/out31 pair
; Clobbers m8-m11.
%macro IDCT_8X32_PASS1_END 6 ; a, b_mem, c_mem[1-2], shift, out_high_mem
    mova m9, [cfq+32*%2] ; b0
    mova m10, [rsp+32*%4] ; c15
    mova m11, [rsp+32*%3] ; c0
    psubd m8, %1, m9 ; a15
    paddd %1, m9 ; a0
    psubd m9, m8, m10 ; out16
    paddd m8, m10 ; out15
    psubd m10, %1, m11 ; out31
    paddd %1, m11 ; out0
    REPX {psrad x, %5}, m9, %1, m8, m10
    packssdw %1, m9 ; 0 16
    packssdw m8, m10 ; 15 31
    mova [%6], m8
%endmacro
; End of the 32x8 DCT first pass: combine m0-m7 with the intermediate terms
; held in memory (shift 6). Each invocation leaves one packed result in its
; register and stores the other at the address given as the last argument.
    IDCT_8X32_PASS1_END m0, -4, 0, 14, 6, r7 +32*0
    IDCT_8X32_PASS1_END m1, -3, 1, 15, 6, r7 +32*1
    IDCT_8X32_PASS1_END m2, -1, 3, 13, 6, r7 +32*3
    IDCT_8X32_PASS1_END m3, -2, 2, 12, 6, r7 +32*2
    IDCT_8X32_PASS1_END m4, 0, 4, 10, 6, cfq+32*0
    IDCT_8X32_PASS1_END m5, 1, 5, 11, 6, cfq+32*1
    IDCT_8X32_PASS1_END m6, 3, 7, 9, 6, cfq+32*3
    IDCT_8X32_PASS1_END m7, 2, 6, 8, 6, cfq+32*2
    ; Transpose the register half and park it in the lower rows, then
    ; reload the stored half and transpose that; it stays in m0-m7.
    call m(inv_txfm_add_16x8_8bpc).transpose8x8_vpermq
    mova [cfq-32*4], m0
    mova [cfq-32*3], m1
    mova [cfq-32*2], m2
    mova [cfq-32*1], m3
    mova [r7 -32*4], m4
    mova [r7 -32*3], m5
    mova [r7 -32*2], m6
    mova [r7 -32*1], m7
    mova m0, [cfq+32*2]
    mova m1, [cfq+32*3]
    mova m2, [cfq+32*1]
    mova m3, [cfq+32*0]
    mova m4, [r7 +32*2]
    mova m5, [r7 +32*3]
    mova m6, [r7 +32*1]
    mova m7, [r7 +32*0]
    call m(inv_txfm_add_16x8_8bpc).transpose8x8_vpermq
    ; Undo the stack alignment (r8 holds the saved rsp) and dispatch to the
    ; second-pass routine through tx2q.
    mov rsp, r8
%if WIN64
    pop r8
%endif
    jmp tx2q
.pass2_dct:
    ; Second pass, DCT. The half that arrives in m0-m7 is processed first:
    ; the even registers are spilled, dct8 runs, its four results are
    ; saved at [r7+32*0..3], and then dct4 runs on the spilled values.
    mova [cfq+32*0], m0
    mova [cfq+32*1], m2
    mova [cfq+32*2], m4
    mova [cfq+32*3], m6
    call m(inv_txfm_add_16x8_8bpc).dct8
    mova [r7 +32*0], m0
    mova [r7 +32*1], m1
    mova [r7 +32*2], m2
    mova [r7 +32*3], m3
    mova m0, [cfq+32*0]
    mova m1, [cfq+32*1]
    mova m2, [cfq+32*2]
    mova m3, [cfq+32*3]
    vpbroadcastd m10, [o(pd_4096)]
    mova [cfq+32*0], m4
    mova [cfq+32*1], m5
    mova [cfq+32*2], m6
    mova [cfq+32*3], m7
    call m(inv_txfm_add_16x4_8bpc).dct4
    ; Final butterflies: sums and differences of the dct4 results and the
    ; saved dct8 results, shifted right by 13 and packed to words.
    mova m11, [r7 +32*0]
    mova m9, [cfq+32*0]
    psubd m12, m0, m11
    paddd m0, m11
    paddd m11, m4, m9
    psubd m4, m9
    mova m8, [r7 +32*1]
    mova m9, [cfq+32*1]
    psrad m0, 13
    psrad m11, 13
    packssdw m0, m11
    psubd m11, m1, m8
    paddd m1, m8
    paddd m8, m5, m9
    psubd m5, m9
    psrad m1, 13
    psrad m8, 13
    packssdw m1, m8
    mova [cfq+32*0], m0
    mova [cfq+32*1], m1
    mova m1, [r7 +32*2]
    mova m9, [cfq+32*2]
    paddd m0, m2, m1
    psubd m2, m1
    paddd m1, m6, m9
    psubd m6, m9
    mova m8, [r7 +32*3]
    mova m9, [cfq+32*3]
    psrad m0, 13
    psrad m1, 13
    packssdw m0, m1
    paddd m1, m3, m8
    psubd m3, m8
    paddd m8, m7, m9
    psubd m7, m9
    psrad m1, 13
    psrad m8, 13
    packssdw m1, m8
    mova [cfq+32*2], m0
    mova [cfq+32*3], m1
    REPX {psrad x, 13}, m3, m7, m2, m6
    packssdw m3, m7
    packssdw m2, m6
    mova [r7 +32*0], m3
    mova [r7 +32*1], m2
    REPX {psrad x, 13}, m11, m5, m12, m4
    packssdw m11, m5
    packssdw m12, m4
    mova [r7 +32*2], m11
    mova [r7 +32*3], m12
    ; Same sequence for the half parked in memory by the first pass.
    mova m1, [cfq-32*3]
    mova m3, [cfq-32*1]
    mova m5, [r7 -32*3]
    mova m7, [r7 -32*1]
    call m(inv_txfm_add_16x8_8bpc).dct8
    mova [cfq-32*3], m0
    mova [cfq-32*1], m1
    mova [r7 -32*3], m2
    mova [r7 -32*1], m3
    mova m0, [cfq-32*4]
    mova m1, [cfq-32*2]
    mova m2, [r7 -32*4]
    mova m3, [r7 -32*2]
    mova [cfq-32*4], m4
    mova [cfq-32*2], m5
    mova [r7 -32*4], m6
    mova [r7 -32*2], m7
    call m(inv_txfm_add_16x4_8bpc).dct4
    mova m11, [cfq-32*3]
    mova m9, [cfq-32*4]
    psubd m12, m0, m11
    paddd m0, m11
    paddd m11, m4, m9
    psubd m4, m9
    mova m8, [cfq-32*1]
    mova m9, [cfq-32*2]
    REPX {psrad x, 13}, m0, m11, m12, m4
    packssdw m0, m11
    packssdw m12, m4
    psubd m11, m1, m8
    paddd m1, m8
    paddd m8, m5, m9
    psubd m5, m9
    REPX {psrad x, 13}, m1, m8, m11, m5
    packssdw m1, m8
    packssdw m11, m5
    ; Output: four write_32x2 calls, each fed m0/m1 from this half and
    ; m4/m5 from the results stored by the other half. m10 is zeroed first.
    ; NOTE(review): write_32x2 is defined elsewhere; its use of m10 as a zero
    ; register is assumed - confirm.
    mova m4, [cfq+32*0]
    mova m5, [cfq+32*1]
    pxor m10, m10
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    mova m1, [r7 -32*3]
    mova m9, [r7 -32*4]
    paddd m0, m2, m1
    psubd m2, m1
    paddd m1, m6, m9
    psubd m6, m9
    mova m8, [r7 -32*1]
    mova m9, [r7 -32*2]
    psrad m0, 13
    psrad m1, 13
    packssdw m0, m1
    paddd m1, m3, m8
    psubd m3, m8
    paddd m8, m7, m9
    psubd m7, m9
    psrad m1, 13
    psrad m8, 13
    packssdw m1, m8
    mova m4, [cfq+32*2]
    mova m5, [cfq+32*3]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    mova m4, [r7 +32*0]
    mova m5, [r7 +32*1]
    REPX {psrad x, 13}, m3, m7, m2, m6
    packssdw m0, m3, m7
    packssdw m1, m2, m6
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    mova m4, [r7 +32*2]
    mova m5, [r7 +32*3]
    mova m0, m11
    mova m1, m12
    call m(inv_txfm_add_32x4_8bpc).write_32x2
.pass2_end:
    ; Common exit for every second-pass variant of this function.
    ; NOTE(review): zero_cf is defined elsewhere; by its name it clears the
    ; coefficient buffer, with r6 = -32*16 as its size/offset - confirm.
    mov r6, -32*16
    call m(inv_txfm_add_16x16_8bpc).zero_cf
    RET
.dconly:
    ; Scale the first coefficient by pw_256, set bit 3 of r4d and continue
    ; in the 32x4 function's shared dconly2 code.
    ; NOTE(review): the meaning of r4d |= 8 depends on dconly2 - confirm.
    movd xm3, [o(pw_256)]
    pmulhrsw xm3, [cfq]
    or r4d, 8
    jmp m(inv_txfm_add_32x4_8bpc).dconly2
.pass1_identity:
    ; Identity first pass: run the helper once, store its eight results,
    ; advance cfq by 32*4, run it again and leave that result in m0-m7.
    vpbroadcastd m12, [o(pw_53x512)]
    call .pass1_identity_main
    mova [cfq-32*4], m0
    mova [cfq-32*3], m1
    mova [cfq-32*2], m2
    mova [cfq-32*1], m3
    add cfq, 32*4
    mova [cfq+32*0], m4
    mova [cfq+32*1], m5
    mova [cfq+32*2], m6
    mova [cfq+32*3], m7
    call .pass1_identity_main
    sub cfq, 32*4
    jmp tx2q
.pass2_identity:
    ; Identity second pass: every row is scaled by pw_181x4 via pmulhrsw.
    ; m4/m5 are spilled first because they are reused as outputs.
    vpbroadcastd m12, [o(pw_181x4)]
    pxor m10, m10
    mova [cfq+32*0], m4
    mova [cfq+32*1], m5
    pmulhrsw m4, m12, m0
    pmulhrsw m0, m12, [cfq-32*4]
    pmulhrsw m5, m12, m1
    pmulhrsw m1, m12, [cfq-32*3]
    ; eob bits 0x100 / 0x200 select the .hdpcm / .vdpcm output variants.
    test eobd, 0x100
    jnz .hdpcm
    test eobd, 0x200
    jnz .vdpcm
.pass2_identity2:
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [cfq-32*2]
    pmulhrsw m4, m12, m2
    pmulhrsw m1, m12, [cfq-32*1]
    pmulhrsw m5, m12, m3
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [r7 -32*4]
    pmulhrsw m4, m12, [cfq+32*0]
    pmulhrsw m1, m12, [r7 -32*3]
    pmulhrsw m5, m12, [cfq+32*1]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [r7 -32*2]
    pmulhrsw m4, m12, m6
    pmulhrsw m1, m12, [r7 -32*1]
    pmulhrsw m5, m12, m7
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    jmp .pass2_end
.hdpcm:
    ; Same row schedule as .pass2_identity2, but using the *_hdpcm writer
    ; for the first row pair and *_hdpcm2 for the remaining three.
    call m(inv_txfm_add_32x4_8bpc).write_32x2_hdpcm
    pmulhrsw m0, m12, [cfq-32*2]
    pmulhrsw m4, m12, m2
    pmulhrsw m1, m12, [cfq-32*1]
    pmulhrsw m5, m12, m3
    call m(inv_txfm_add_32x4_8bpc).write_32x2_hdpcm2
    pmulhrsw m0, m12, [r7 -32*4]
    pmulhrsw m4, m12, [cfq+32*0]
    pmulhrsw m1, m12, [r7 -32*3]
    pmulhrsw m5, m12, [cfq+32*1]
    call m(inv_txfm_add_32x4_8bpc).write_32x2_hdpcm2
    pmulhrsw m0, m12, [r7 -32*2]
    pmulhrsw m4, m12, m6
    pmulhrsw m1, m12, [r7 -32*1]
    pmulhrsw m5, m12, m7
    call m(inv_txfm_add_32x4_8bpc).write_32x2_hdpcm2
    jmp .pass2_end
.vdpcm:
    ; The first pair is written with the plain writer after adding m0/m4
    ; into m1/m5. The later pairs go through write_32x2_vdpcm2, which is
    ; fed m0/m4 and m2/m3 (instead of m1/m5).
    paddw m1, m0
    paddw m5, m4
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [cfq-32*2]
    pmulhrsw m4, m12, m2
    pmulhrsw m2, m12, [cfq-32*1]
    pmulhrsw m3, m12, m3
    call m(inv_txfm_add_32x4_8bpc).write_32x2_vdpcm2
    pmulhrsw m0, m12, [r7 -32*4]
    pmulhrsw m4, m12, [cfq+32*0]
    pmulhrsw m2, m12, [r7 -32*3]
    pmulhrsw m3, m12, [cfq+32*1]
    call m(inv_txfm_add_32x4_8bpc).write_32x2_vdpcm2
    pmulhrsw m0, m12, [r7 -32*2]
    pmulhrsw m4, m12, m6
    pmulhrsw m2, m12, [r7 -32*1]
    pmulhrsw m3, m12, m7
    call m(inv_txfm_add_32x4_8bpc).write_32x2_vdpcm2
    jmp .pass2_end
ALIGN function_align
.pass1_identity_main:
    ; Build m0-m7 from eight consecutive 16-byte rows at cfq-16*8 (low
    ; lane) and the matching rows 16*16 bytes further on (high lane),
    ; double them with saturation, apply the shared identity scaling and
    ; tail-jump into the 8x8 transpose.
    lea r3, [cfq+16*16]
    mova xm0, [cfq-16*8]
    vinserti128 m0, [r3 -16*8], 1
    mova xm1, [cfq-16*7]
    vinserti128 m1, [r3 -16*7], 1
    mova xm2, [cfq-16*6]
    vinserti128 m2, [r3 -16*6], 1
    mova xm3, [cfq-16*5]
    vinserti128 m3, [r3 -16*5], 1
    mova xm4, [cfq-16*4]
    vinserti128 m4, [r3 -16*4], 1
    mova xm5, [cfq-16*3]
    vinserti128 m5, [r3 -16*3], 1
    mova xm6, [cfq-16*2]
    vinserti128 m6, [r3 -16*2], 1
    mova xm7, [cfq-16*1]
    vinserti128 m7, [r3 -16*1], 1
    REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_32x4_8bpc).identity_main
    jmp m(inv_txfm_add_16x8_8bpc).transpose8x8
; Second-pass entry stubs: select the coefficient table in r3, then share
; .pass2_dst.
.pass2_flipddt:
    lea r3, [o(flipddt8_mat)]
    jmp .pass2_dst
.pass2_ddt:
    lea r3, [o(ddt8_mat)]
    jmp .pass2_dst
.pass2_flipadst:
    lea r3, [o(flipadst8_mat)]
    jmp .pass2_dst
.pass2_adst:
    lea r3, [o(adst8_mat)]
.pass2_dst:
    ; First run: the half in m0-m7, with cfq advanced by 32*4 so that the
    ; helper's stores do not overwrite the half still parked in memory.
    add cfq, 32*4
    call .pass2_dst_main
    ; Second run: reload the parked half (upper rows into m4-m7, lower rows
    ; into m0-m3) and rewind r3 by 16*8 before calling the helper again.
    mova m4, [cfq+32*0]
    mova m5, [cfq+32*1]
    mova m6, [cfq+32*2]
    mova m7, [cfq+32*3]
    sub cfq, 32*4
    mova m0, [cfq-32*4]
    mova m1, [cfq-32*3]
    mova m2, [cfq-32*2]
    mova m3, [cfq-32*1]
    sub r3, 16*8
    call .pass2_dst_main
    ; Scale by pw_4096 and write the row pairs, reading the stored results
    ; from the highest offset downwards.
    vpbroadcastd m12, [o(pw_4096)]
    pxor m10, m10
    pmulhrsw m0, m12, [r7 -32*1]
    pmulhrsw m4, m12, [r7 +32*3]
    pmulhrsw m1, m12, [cfq-32*1]
    pmulhrsw m5, m12, [cfq+32*3]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [r7 -32*2]
    pmulhrsw m4, m12, [r7 +32*2]
    pmulhrsw m1, m12, [cfq-32*2]
    pmulhrsw m5, m12, [cfq+32*2]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [r7 -32*3]
    pmulhrsw m4, m12, [r7 +32*1]
    pmulhrsw m1, m12, [cfq-32*3]
    pmulhrsw m5, m12, [cfq+32*1]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    pmulhrsw m0, m12, [r7 -32*4]
    pmulhrsw m4, m12, [r7 +32*0]
    ; m6 is used directly here instead of a memory operand.
    ; NOTE(review): presumably it still holds the last row produced by
    ; .pass2_dst_main - confirm against the end of that loop.
    pmulhrsw m1, m12, m6
    pmulhrsw m5, m12, [cfq+32*0]
    call m(inv_txfm_add_32x4_8bpc).write_32x2
    jmp .pass2_end
ALIGN function_align
.pass2_dst_main:
    ; Interleave the eight input rows as 16-bit pairs, then loop with r5d
    ; starting at 32*3, calling the shared dst8 helper.
    punpcklwd m10, m0, m2 ; 0 2
    punpckhwd m0, m2
    punpcklwd m2, m1, m3 ; 1 3
    punpckhwd m1, m3
    punpcklwd m3, m4, m6 ; 4 6
    punpckhwd m4, m6
    punpcklwd m11, m5, m7 ; 5 7
    punpckhwd m5, m7
    mov r5d, 32*3
.pass2_dst_main_loop:
    call m(inv_txfm_add_16x8_8bpc).dst8
    psrad m6, 10
psrad m7, 10 packssdw m6, m7 mova [cfq+r5+32*4], m6 call m(inv_txfm_add_16x8_8bpc).dst8 psrad m6, 10 psrad m7, 10 packssdw m6, m7 mova [cfq+r5-32*4], m6 sub r5d, 32 jge .pass2_dst_main_loop ret dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx_avx512.asm000066400000000000000000011232451517466257200237040ustar00rootroot00000000000000; Copyright © 2020-2023, VideoLAN and dav2d authors ; Copyright © 2020-2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 const \ dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 const \ int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 
16, 40, 17, 56, 18, 44, 19, 60 db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 ; packed 4-bit qword shuffle indices permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 dq 0xc824352d56128751, 0xd906171e74301e15 dq 0x6271604b03472d62, 0x735342782165b426 dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e dq 0x5115049dd9045b79, 0x733726bffb263d1f permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 dq 0x0f11fa9c01150415, 0x0988f326039d2637 dq 0x05640f1108269d8c, 0x05290edb0aaebfae dq 0x0005000509378c9d, 0xffffffff0bbfaebf pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 pb_32: times 4 db 32 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pw_16384: times 2 dw 16384 pw_1697x16: times 2 dw 1697*16 pw_1697x8: times 2 dw 1697*8 pw_2896x8: times 2 dw 2896*8 pd_2048: dd 2048 %define pw_5 (permD+52) %define pd_m1 (permD+60) %define pw_3803_1321 (permD+44) %define pw_2482_3803 (permD+12) %define pw_2440_3290 (permD+ 4) %define pw_m3290_2440 (permD+28) %define pw_3857_1380 (permD+36) 
%define pw_m1380_3857 (permD+20) pw_8192_m8192: dw 8192, -8192 pw_m8192_8192: dw -8192, 8192 pw_16384_m16384: dw 16384, -16384 pw_m16384_16384: dw -16384, 16384 pw_m1321_2482: dw -1321, 2482 pw_m3344_3344: dw -3344, 3344 pw_2482_3344: dw 2482, 3344 pw_m3803_3344: dw -3803, 3344 pd_3344: dd 3344 pw_m1321_m3344: dw -1321, -3344 pw_2896_m2896: dw 2896, -2896 pw_1567_m3784: dw 1567, -3784 pw_3784_m1567: dw 3784, -1567 pw_4017_m799: dw 4017, -799 pw_2276_m3406: dw 2276, -3406 pw_m799_m4017: dw -799, -4017 pw_m3406_m2276: dw -3406, -2276 %macro COEF_PAIR 2-3 0 pw_%1_%2: dw %1, %2 pw_m%2_%1: dw -%2, %1 %if %3 pw_m%1_m%2: dw -%1, -%2 %endif %endmacro COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784, 1 COEF_PAIR 3784, 1567 COEF_PAIR 201, 4091 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 3035, 2751 COEF_PAIR 3513, 2106 COEF_PAIR 4052, 601 COEF_PAIR 3166, 2598, 1 COEF_PAIR 3920, 1189, 1 COEF_PAIR 2276, 3406 COEF_PAIR 4017, 799 %macro COEF_X8 1-* %rep %0 dw %1*8, %1*8 %rotate 1 %endrep %endmacro pw_m2276x8: COEF_X8 -2276 pw_3406x8: COEF_X8 3406 pw_4017x8: COEF_X8 4017 pw_799x8: COEF_X8 799 pw_3784x8: COEF_X8 3784 pw_1567x8: COEF_X8 1567 pw_4076x8: COEF_X8 4076 pw_401x8: COEF_X8 401 pw_m2598x8: COEF_X8 -2598 pw_3166x8: COEF_X8 3166 pw_3612x8: COEF_X8 3612 pw_1931x8: COEF_X8 1931 pw_m1189x8: COEF_X8 -1189 pw_3920x8: COEF_X8 3920 pw_4091x8: COEF_X8 4091 pw_201x8: COEF_X8 201 pw_m2751x8: COEF_X8 -2751 pw_3035x8: COEF_X8 3035 pw_3703x8: COEF_X8 3703 pw_1751x8: COEF_X8 1751 pw_m1380x8: COEF_X8 -1380 pw_3857x8: COEF_X8 3857 pw_3973x8: COEF_X8 3973 pw_995x8: COEF_X8 995 pw_m2106x8: COEF_X8 -2106 pw_3513x8: COEF_X8 3513 pw_3290x8: COEF_X8 3290 pw_2440x8: COEF_X8 2440 pw_m601x8: COEF_X8 -601 pw_4052x8: COEF_X8 4052 pw_401_4076x8: dw 401*8, 4076*8 pw_m2598_3166x8: dw -2598*8, 3166*8 pw_1931_3612x8: dw 1931*8, 3612*8 pw_m1189_3920x8: dw -1189*8, 3920*8 pw_799_4017x8: dw 799*8, 4017*8 pw_m2276_3406x8: dw -2276*8, 3406*8 pw_201_4091x8: dw 201*8, 4091*8 pw_m601_4052x8: dw -601*8, 4052*8 
; Coefficient pairs pre-multiplied by 8. Each label names its two values;
; an "m" prefix marks a negated value.
pw_995_3973x8: dw 995*8, 3973*8
pw_m1380_3857x8: dw -1380*8, 3857*8
pw_1751_3703x8: dw 1751*8, 3703*8
pw_m2106_3513x8: dw -2106*8, 3513*8
pw_2440_3290x8: dw 2440*8, 3290*8
pw_m2751_3035x8: dw -2751*8, 3035*8
pw_101_4095x8: dw 101*8, 4095*8
pw_m2824_2967x8: dw -2824*8, 2967*8
pw_1660_3745x8: dw 1660*8, 3745*8
pw_m1474_3822x8: dw -1474*8, 3822*8
pw_897_3996x8: dw 897*8, 3996*8
pw_m2191_3461x8: dw -2191*8, 3461*8
pw_2359_3349x8: dw 2359*8, 3349*8
pw_m700_4036x8: dw -700*8, 4036*8
pw_501_4065x8: dw 501*8, 4065*8
pw_m2520_3229x8: dw -2520*8, 3229*8
pw_2019_3564x8: dw 2019*8, 3564*8
pw_m1092_3948x8: dw -1092*8, 3948*8
pw_1285_3889x8: dw 1285*8, 3889*8
pw_m1842_3659x8: dw -1842*8, 3659*8
pw_2675_3102x8: dw 2675*8, 3102*8
pw_m301_4085x8: dw -301*8, 4085*8

; Multiplier table assembled from COEF_X8 / COEF_PAIR expansions and raw
; word lists. COEF_PAIR also defines its pw_* labels inside this table.
idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474
COEF_PAIR 401, 4076, 1
COEF_PAIR 799, 4017
COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996
dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017
COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092
COEF_PAIR 1931, 3612, 1
COEF_PAIR 3406, 2276
COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889
dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276

SECTION .text

; o(x) addresses constant x relative to r5, which is expected to hold o_base.
; m(x) expands to the mangled symbol name of function x.
%define o_base int8_permA+64*18
%define o(x) (r5 - (o_base) + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; Two dot-product accumulations of the word pairs in m%1, each starting from
; the rounding value in m%4: m%2 uses the first coefficient operand and m%3
; the second. The operands come from one of:
;   flag 16     -> coef1 is the memory constant pw_%5 (dword broadcast)
;   flag 32     -> coef2 is the memory constant pw_%6 (dword broadcast)
;   %6 < 32     -> %5 and %6 are register numbers
;   otherwise   -> the named pair constants pw_%5_%6 and pw_m%6_%5,
;                  with their order chosen by flag 1
; The results are then combined into m%1 according to flags 2/4/8.
; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
; 16 = special_mul1, 32 = special_mul2
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
    mova m%2, m%4
%if %7 & 16
    vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
    mova m%3, m%4
%if %7 & 32
    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%else
    vpdpwssd m%3, m%1, m%6
%endif
%elif %7 & 32
    vpdpwssd m%2, m%1, m%5
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
%elif %6 < 32
    vpdpwssd m%2, m%1, m%5
    mova m%3, m%4
    vpdpwssd m%3, m%1, m%6
%elif %7 & 1
    vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
%else
    vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
    mova m%3, m%4
    vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
%endif
%if %7 & 2
    ; Interleave with shifts: vpshrdd builds each output dword from the
    ; two shifted intermediate results.
    psrld m%2, 12
    pslld m%3, 4
    vpshrdd m%1, m%3, m%2, 16
%elif %7 & 4
    ; compared to using shifts (as above) this has better throughput,
    ; but worse latency and requires setting up the opmask/index
    ; registers, so only use this method for the larger transforms
    ; (k7 and m13 must already be set up by the caller)
    pslld m%1, m%2, 4
    vpmultishiftqb m%1{k7}, m13, m%3
%else
    psrad m%2, 12
    psrad m%3, 12
%if %7 & 8 == 0
    packssdw m%1, m%3, m%2
%endif
%endif
%endmacro

; Four-coefficient variant: builds the two coefficient registers m%4/m%5
; from the pair (%9,%10), overrides the lanes selected by opmask k1 with the
; pair (%7,%8), then defers to ITX_MUL2X_PACK with those registers.
; k1 must be set up by the caller.
; flags: same as ITX_MUL2X_PACK
%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
%if %11 & 1
    vpbroadcastd m%4, [o(pw_%9_%10)]
    vpbroadcastd m%4{k1}, [o(pw_%7_%8)]
    vpbroadcastd m%5, [o(pw_m%10_%9)]
    vpbroadcastd m%5{k1}, [o(pw_m%8_%7)]
%else
    vpbroadcastd m%4, [o(pw_m%10_%9)]
    vpbroadcastd m%4{k1}, [o(pw_m%8_%7)]
    vpbroadcastd m%5, [o(pw_%9_%10)]
    vpbroadcastd m%5{k1}, [o(pw_%7_%8)]
%endif
    ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11
%endmacro

; When %7 < 32, %6 and %7 are register numbers holding the coefficient
; pairs; otherwise the named constants pw_m%7_%6 and pw_%6_%7 are used.
; With an 8th argument the second result goes to m%8 instead of m%2.
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    punpcklwd m%3, m%2, m%1
    punpckhwd m%2, m%1
%if %7 < 32
    mova m%1, m%5
    vpdpwssd m%1, m%3, m%7
    mova m%4, m%5
    vpdpwssd m%4, m%2, m%7
%else
    mova m%1, m%5
    vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
    mova m%4, m%5
    vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
%endif
    psrad m%1, 12
    psrad m%4, 12
    packssdw m%1, m%4
    mova m%4, m%5
%if %7 < 32
    vpdpwssd m%4, m%2, m%6
    mova m%2, m%5
    vpdpwssd m%2, m%3, m%6
%else
    vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
    mova m%2, m%5
    vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
%endif
    psrad m%4, 12
    psrad m%2, 12
%if %0 == 8
    packssdw m%8, m%2, m%4
%else
    packssdw m%2, m%4
%endif
%endmacro

; Assemble the statement given as argument with the register set switched to
; xmm, then restore the previous register permutation.
%macro WRAP_XMM 1+
    %xdefine %%reset RESET_MM_PERMUTATION
    INIT_XMM cpuname
    DEFINE_MMREGS xmm
    AVX512_MM_PERMUTATION
    %1
    %%reset
%endmacro

; Assemble the statement given as argument in ymm mode, then switch to zmm
; mode (unconditionally, whatever the mode was before).
%macro WRAP_YMM 1+
    INIT_YMM cpuname
    %1
    INIT_ZMM cpuname
%endmacro

; The ITX4_END body continues after this span.
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
; Body of ITX4_END (its %macro line and the opening "%if %5" precede this
; span): optional rounding multiply, then add the 4x4 residual in m0/m1 to the
; destination pixels. %1..%4 are row indices 0-3, %5 is the rounding constant.
    vpbroadcastd m2, [o(pw_%5)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
%endif
    lea r2, [dstq+strideq*2]
; Define %%row_adr1..4: bit 1 of each row index selects the base (r2 =
; dst + 2*stride, or dstq) and bit 0 adds one more stride. %rotate steps
; through the four row arguments.
%assign %%i 1
%rep 4
%if %1 & 2
    CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
%else
    CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
%endif
%assign %%i %%i + 1
%rotate 1
%endrep
    ; Gather four 4-byte rows, widen to words, add, pack with unsigned
    ; saturation and scatter the rows back.
    movd m2, [%%row_adr1]
    pinsrd m2, [%%row_adr2], 1
    movd m3, [%%row_adr3]
    pinsrd m3, [%%row_adr4], 1
    pmovzxbw m2, m2
    pmovzxbw m3, m3
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    movd [%%row_adr1], m0
    pextrd [%%row_adr2], m0, 1
    pextrd [%%row_adr3], m0, 2
    pextrd [%%row_adr4], m0, 3
    ret
%endmacro

; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc.
; It loads the constant base into baseq and the address of the second
; transform's .pass2 label into tx2q, then reaches the first transform's
; internal function (%%p1). For dct_dct the jump is taken only when eob is
; non-zero; otherwise execution continues with whatever code follows the
; macro invocation.
%macro INV_TXFM_FN 3 ; type1, type2, size
cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
    %define %%p1 m(i%1_%3_internal_8bpc)
    lea baseq, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
%ifidn %1_%2, dct_dct
    test eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    ; (the repeat count is bit 31 of %%end - %%p1: 0 when they coincide)
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

; 4x4 entry point. For dct_dct the code after INV_TXFM_FN is the eob == 0
; path: broadcast the first coefficient, scale it twice by pw_2896x8, clear
; it in memory (eobd is zero here) and reuse the adst output code.
%macro INV_TXFM_4X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x4
%ifidn %1_%2, dct_dct
    vpbroadcastw m0, [cq]
    vpbroadcastd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1
    mov [cq], eobd
    pmulhrsw m0, m1
    mova m1, m0
    jmp m(iadst_4x4_internal_8bpc).end2
%endif
%endmacro

; 4-point inverse DCT on the packed rows in m0/m1.
; Clobbers m2-m4; the result is left in m0/m1.
%macro IDCT4_1D_PACKED 0
    vpbroadcastd m4, [o(pd_2048)]
    punpckhwd m2, m1, m0
    punpcklwd m1, m0
    ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
    ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
    paddsw m0, m1, m2 ; out0 out1
    psubsw m1, m2 ; out3 out2
%endmacro

; 4-point inverse ADST on the packed rows in m0/m1. .main2 is an alternate
; entry for callers that already hold the interleaved inputs in m4/m5.
; m3 holds the pd_2048 rounding term that seeds all four accumulators.
; The macro body continues after this span.
%macro IADST4_1D_PACKED 0
    punpcklwd m4, m1, m0 ; in2 in0
    punpckhwd m5, m1, m0 ; in3 in1
.main2:
    vpbroadcastd m3, [o(pd_2048)]
    mova m0, m3
    vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd}
    mova m2, m3
    vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd}
    mova m1, m3
    vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd}
    vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd}
    vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd}
; Remainder of the IADST4_1D_PACKED macro body (its start precedes this span):
; finish the four accumulators with the in3/in1 terms, shift right by 12 and
; pack.
    vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd}
    vpdpwssd m1, m5, [o(pd_3344)] {bcstd}
    vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd}
    REPX {psrad x, 12}, m0, m2, m1, m3
    packssdw m0, m2 ; out0 out1
    packssdw m1, m3 ; out2 out3
%endmacro

; 4x4 transforms, assembled with xmm registers. Each internal function is
; preceded by the public entry points that use it as the first transform.
INIT_XMM avx512icl
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
INV_TXFM_4X4_FN dct, identity

; First pass: DCT on the two coefficient rows, reorder for the second pass,
; then jump to the second transform's .pass2 through tx2q.
; .pass2: DCT again, clear 32 bytes of coefficients, add to the destination.
cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    IDCT4_1D_PACKED
    mova m2, [o(deint_shuf)]
    shufps m3, m0, m1, q1331
    shufps m0, m0, m1, q0220
    pshufb m0, m2
    pshufb m1, m3, m2
    jmp tx2q
.pass2:
    IDCT4_1D_PACKED
    pxor ymm16, ymm16
    mova [cq], ymm16
    ; Row order 0, 1, 3, 2 matches the packed "out3 out2" order of m1.
    ITX4_END 0, 1, 3, 2

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

; ADST first pass followed by a word transpose. .end and .end2 are shared
; output tails that other 4x4 functions jump to (.end also clears the
; coefficients, .end2 only writes the result).
cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    call .main
    punpckhwd m3, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    jmp tx2q
.pass2:
    call .main
.end:
    pxor ymm16, ymm16
    mova [cq], ymm16
.end2:
    ITX4_END 0, 1, 2, 3
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

; Flipped ADST: reuses the ADST kernel. The first pass transposes with the
; operand order swapped and the second pass writes the rows in the order
; 3, 2, 1, 0.
cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    call m(iadst_4x4_internal_8bpc).main
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    jmp tx2q
.pass2:
    call m(iadst_4x4_internal_8bpc).main
.end:
    pxor ymm16, ymm16
    mova [cq], ymm16
.end2:
    ITX4_END 3, 2, 1, 0

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

; Identity first pass: x += pmulhrsw(x, pw_1697x8) with saturation, followed
; by a word transpose. The function continues after this span.
cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+16*0]
    mova m1, [cq+16*1]
    vpbroadcastd m3, [o(pw_1697x8)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
; Tail of iidentity_4x4_internal_8bpc (its start precedes this span): finish
; the first-pass transpose and dispatch; .pass2 applies the same identity
; scaling and reuses the adst output tail.
    punpcklwd m0, m2
    jmp tx2q
.pass2:
    vpbroadcastd m3, [o(pw_1697x8)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    jmp m(iadst_4x4_internal_8bpc).end

; 4x8 entry point. For dct_dct the code after INV_TXFM_FN is the eob == 0
; path: the first coefficient is scaled three times by pw_2896x8 and once by
; pw_2048, broadcast to both row registers, and handed to the shared output
; code at .end3.
%macro INV_TXFM_4X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x8
%ifidn %1_%2, dct_dct
    movd xmm1, [o(pw_2896x8)]
    pmulhrsw xmm0, xmm1, [cq]
    movd xmm2, [o(pw_2048)]
    pmulhrsw xmm0, xmm1
    pmulhrsw xmm0, xmm1
    pmulhrsw xmm0, xmm2
    vpbroadcastw ym0, xmm0
    mova ym1, ym0
    jmp m(iadst_4x8_internal_8bpc).end3
%endif
%endmacro

; 8-point inverse DCT on packed rows in m0-m3. .main2 is an alternate entry
; for callers that already hold the interleaved inputs. Clobbers m4-m6.
%macro IDCT8_1D_PACKED 0
    punpckhwd m5, m3, m0 ; in7 in1
    punpckhwd m4, m1, m2 ; in3 in5
    punpcklwd m3, m1 ; in6 in2
    punpcklwd m2, m0 ; in4 in0
.main2:
    vpbroadcastd m6, [o(pd_2048)]
    ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
    ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
    ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
    psubsw m0, m5, m4 ; t5a t6a (interleaved)
    paddsw m4, m5 ; t4 t7 (interleaved)
    ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
    ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
%if mmsize > 16
    vbroadcasti32x4 m1, [o(deint_shuf)]
    pshufb m4, m1
%else
    pshufb m4, [o(deint_shuf)]
%endif
    psubsw m1, m2, m3 ; tmp3 tmp2
    paddsw m3, m2 ; tmp0 tmp1
    punpckhqdq m2, m4, m0 ; t7 t6
    punpcklqdq m4, m0 ; t4 t5
    paddsw m0, m3, m2 ; out0 out1
    psubsw m3, m2 ; out7 out6
    psubsw m2, m1, m4 ; out4 out5
    paddsw m1, m4 ; out3 out2
%endmacro

; 8-point inverse ADST on packed rows. The argument selects between two
; code sequences: %1 == 1 expects the inputs already interleaved in m0-m3,
; any other value interleaves them from m2-m5 first. As the existing lane
; comments show, several outputs are produced negated.
%macro IADST8_1D_PACKED 1 ; pass
    vpbroadcastd m6, [o(pd_2048)]
%if %1 == 1
    ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
    ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
    ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
    ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
    psubsw m4, m0, m2 ; t5 t4
    paddsw m0, m2 ; t1 t0
    psubsw m5, m1, m3 ; t6 t7
    paddsw m1, m3 ; t2 t3
    ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
    ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
%if mmsize > 16
    vbroadcasti32x4 m2, [o(deint_shuf)]
%else
    mova m2, [o(deint_shuf)]
%endif
    vprord m1, 16
    psubsw m3, m0, m1 ; t3 t2
    paddsw m0, m1 ; -out7 out0
    psubsw m1, m4, m5 ; t7 t6
    paddsw m4, m5 ; out6 -out1
    pshufb m0, m2
    pshufb m4, m2
    mova m2, m6
    vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd}
    mova m5, m6
    vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd}
    psrad m2, 12
    psrad m5, 12
    packssdw m2, m5 ; out4 -out5
    mova m5, m6
    vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd}
    mova m3, m6
    vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd}
    psrad m5, 12
    psrad m3, 12
    packssdw m1, m3, m5 ; out2 -out3
%else
    punpckhwd m0, m4, m3 ; 0 7
    punpckhwd m1, m5, m2 ; 2 5
    punpcklwd m2, m5 ; 4 3
    punpcklwd m3, m4 ; 6 1
    ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
    ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
    ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
    ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
    psubsw m4, m0, m2 ; t4 t5
    paddsw m0, m2 ; t0 t1
    psubsw m5, m1, m3 ; t6 t7
    paddsw m1, m3 ; t2 t3
    shufps m2, m5, m4, q1032
    punpckhwd m4, m2
    punpcklwd m5, m2
    ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a
    ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
    psubsw m2, m0, m1 ; t2 t3
    paddsw m0, m1 ; out0 -out7
    psubsw m1, m4, m5 ; t6 t7
    paddsw m4, m5 ; -out1 out6
    vpbroadcastd m5, [o(pw_2896x8)]
    punpckhqdq m3, m2, m1 ; t3 t7
    punpcklqdq m2, m1 ; t2 t6
    paddsw m1, m2, m3 ; t2+t3 t6+t7
    psubsw m2, m3 ; t2-t3 t6-t7
    punpckhqdq m3, m4, m0 ; out6 -out7
    punpcklqdq m0, m4 ; out0 -out1
    pmulhrsw m2, m5 ; out4 -out5
    pshufd m1, m1, q1032
    pmulhrsw m1, m5 ; out2 -out3
%endif
%endmacro

; 4x8 transforms, assembled with ymm registers.
INIT_YMM avx512icl
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

; First pass: load both 32-byte coefficient rows with a qword shuffle,
; pre-scale by pw_2896x8, run the 4-point DCT and reorder.
; .pass2: split into xmm halves, run the 8-point DCT in xmm mode, rejoin and
; hand over to the shared output code with the pw_2048 multiplier in m4.
cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m0, [cq+32*0], q3120
    vpermq m1, [cq+32*1], q3120
    vpbroadcastd m2, [o(pw_2896x8)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
    IDCT4_1D_PACKED
    vbroadcasti32x4 m2, [o(deint_shuf)]
    shufps m3, m0, m1, q1331
    shufps m0, m0, m1, q0220
    pshufb m0, m2
    pshufb m1, m3, m2
    jmp tx2q
.pass2:
    vextracti32x4 xm2, m0, 1
    vextracti32x4 xm3, m1, 1
    call .main
    vpbroadcastd m4, [o(pw_2048)]
    vinserti32x4 m0, m0, xm2, 1
    vinserti32x4 m1, m1, xm3, 1
    pshufd m1, m1, q1032
    jmp m(iadst_4x8_internal_8bpc).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT8_1D_PACKED
    ret

INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

; ADST 4x8. The labels .end / .end2 / .end3 form a shared output tail that
; the other 4x8 functions jump into:
;   .end  - m4/m5 hold +/- multipliers; their low qwords are merged into m4
;   .end2 - m0/m1 are multiplied by m4 (pmulhrsw)
;   .end3 - add m0/m1 to the destination and store
cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m0, [cq+32*0], q3120
    vpermq m1, [cq+32*1], q3120
    vpbroadcastd m2, [o(pw_2896x8)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
    call m(iadst_8x4_internal_8bpc).main
    punpckhwd m3, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    jmp tx2q
.pass2:
    vextracti32x4 xm2, m0, 1
    vextracti32x4 xm3, m1, 1
    pshufd xm4, xm0, q1032
    pshufd xm5, xm1, q1032
    call .main_pass2
    vpbroadcastd m4, [o(pw_2048)]
    vinserti32x4 m0, xm2, 1
    vinserti32x4 m1, xm3, 1
    ; m5 = -m4, so the lanes holding negated outputs are corrected by the
    ; rounding multiply.
    pxor m5, m5
    psubw m5, m4
.end:
    punpcklqdq m4, m5
.end2:
    pmulhrsw m0, m4
    pmulhrsw m1, m4
.end3:
    ; Dword offsets i*stride (i from pd_0to15) for a gather and a scatter,
    ; each under an 8-bit all-ones mask (k1 for the gather, k2 for the
    ; scatter).
    vpbroadcastd m3, strided
    pmulld m5, m3, [o(pd_0to15)]
    kxnorb k1, k1, k1
    kmovb k2, k1
    vpgatherdd m3{k1}, [dstq+m5]
    pxor m4, m4
    ; Clear 64 bytes of coefficients.
    ; NOTE(review): zmm20 is presumably the physical register behind m4 in
    ; this register mode, so it is zero after the pxor - confirm against
    ; x86inc's AVX-512 register permutation.
    mova [cq], zmm20
    punpcklbw m2, m3, m4
    punpckhbw m3, m4
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    vpscatterdd [dstq+m5]{k2}, m0
    RET
ALIGN function_align
.main_pass1:
    punpckhwd xm0, xm4, xm3 ; 0 7
    punpckhwd xm1, xm5, xm2 ; 2 5
    punpcklwd xm2, xm5 ; 4 3
    punpcklwd xm3, xm4 ; 6 1
    WRAP_XMM IADST8_1D_PACKED 1
    punpcklqdq xm3, xm4, xm0 ; out6 -out7
    punpckhqdq xm0, xm4 ; out0 -out1
    ret
ALIGN function_align
.main_pass2:
    WRAP_XMM IADST8_1D_PACKED 2
    ret

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

; Flipped ADST 4x8: reuses the ADST kernels; the first pass transposes with
; the operand order swapped. The second pass continues after this span.
cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m0, [cq+32*0], q3120
    vpermq m1, [cq+32*1], q3120
    vpbroadcastd m2, [o(pw_2896x8)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
    call m(iadst_8x4_internal_8bpc).main
    punpcklwd m3, m1, m0
    punpckhwd m1, m0
    punpcklwd m0, m1, m3
    punpckhwd m1, m3
    jmp tx2q
.pass2:
    vextracti32x4 xm2, m0, 1
    vextracti32x4 xm3, m1, 1
    pshufd xm4, xm0, q1032
    pshufd xm5, xm1, q1032
    call m(iadst_4x8_internal_8bpc).main_pass2
    vpbroadcastd m5, [o(pw_2048)]
; (continuation of iflipadst_4x8_internal_8bpc .pass2)
; Re-merge the 128-bit halves into m3/m2, build m4 = 0 - pw_2048 next to
; m5 = pw_2048, swap qwords into m0/m1 and join iadst_4x8's .end tail.
    vinserti32x4 m3, xm1, 1
    vinserti32x4 m2, xm0, 1
    pxor m4, m4
    psubw m4, m5
    pshufd m0, m3, q1032
    pshufd m1, m2, q1032
    jmp m(iadst_4x8_internal_8bpc).end

INIT_ZMM avx512icl
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

; iidentity_4x8_internal_8bpc
; Pass 1: scales the 64-byte block at cq by pw_2896x8, permutes bytes with
; int8_permB, then adds pmulhrsw(x, pw_1697x8) to x with signed saturation.
; The upper 256 bits are moved to ym1 before jumping through tx2q.
; .pass2: only sets the rounding factor (pw_4096) and reuses iadst_4x8 .end2.
cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd m0, [o(pw_2896x8)]
    pmulhrsw m0, [cq]
    mova m1, [o(int8_permB)]
    vpbroadcastd m2, [o(pw_1697x8)]
    vpermb m0, m1, m0
    pmulhrsw m2, m0
    paddsw m0, m2
    vextracti32x8 ym1, m0, 1
    jmp tx2q
.pass2:
    vpbroadcastd ym4, [o(pw_4096)]
    jmp m(iadst_4x8_internal_8bpc).end2

; INV_TXFM_4X16_FN type1, type2
; Emits the 4x16 entry point via INV_TXFM_FN (defined outside this chunk).
; For dct_dct it additionally emits a DC-only path: the first coefficient is
; sign-extended into r6d, [cq] is overwritten with eobd, the value is scaled
; twice by 181 with rounding shifts ((x*181+384)>>9, then (x*181+2176)>>12),
; broadcast into m0/m1 and handed to iadst_4x16's .end3 add/store tail.
; NOTE(review): eobd is presumably 0 whenever this path is taken - confirm in
; INV_TXFM_FN.
%macro INV_TXFM_4X16_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 4x16
%ifidn %1_%2, dct_dct
    movsx r6d, word [cq]
    mov [cq], eobd
    imul r6d, 181
    add r6d, 128+256
    sar r6d, 8+1
    imul r6d, 181
    add r6d, 128+2048
    sar r6d, 8+4
    vpbroadcastw m0, r6d
    mova m1, m0
    jmp m(iadst_4x16_internal_8bpc).end3
%endif
%endmacro

; IDCT16_1D_PACKED
; 16-point inverse DCT on packed word pairs. Inputs are m0-m7; the trailing
; comments name the coefficient pair each register holds. The labels
; .main2/.main3 below (and .main4/.main5 later in the macro) are alternative
; entry points into the body.
%macro IDCT16_1D_PACKED 0
    punpckhwd m8, m7, m0 ; dct16 in15 in1
    punpcklwd m9, m4, m0 ; dct4 in2 in0
    punpckhwd m0, m3, m4 ; dct16 in7 in9
    punpcklwd m7, m1 ; dct8 in7 in1
    punpckhwd m1, m6 ; dct16 in3 in13
    punpcklwd m3, m5 ; dct8 in3 in5
    punpckhwd m5, m2 ; dct16 in11 in5
    punpcklwd m6, m2 ; dct4 in3 in1
cglobal_label .main2
    vpbroadcastd m10, [o(pd_2048)]
.main3:
    vpbroadcastq m13, [o(int_mshift)]
    vpcmpub k7, m13, m10, 6 ; 0x33...
; (continuation of the IDCT16_1D_PACKED macro body)
    ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
    ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
    ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
    ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
    ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
    ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
.main4:
    psubsw m2, m8, m0 ; t9 t14
    paddsw m8, m0 ; t8 t15
    psubsw m4, m1, m5 ; t10 t13
    paddsw m1, m5 ; t11 t12
    ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2
    psubsw m0, m8, m1 ; t11a t12a
    paddsw m8, m1 ; t8a t15a
    psubsw m1, m7, m3 ; t5a t6a
    paddsw m7, m3 ; t4 t7
.main5:
    ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
    ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
    ; load the deinterleave shuffle at the width the macro is expanded for
%if mmsize > 16
    vbroadcasti32x4 m5, [o(deint_shuf)]
%else
    mova m5, [o(deint_shuf)]
%endif
    vpbroadcastd m11, [o(pw_m2896_2896)]
    vpbroadcastd m12, [o(pw_2896_2896)]
    paddsw m3, m2, m4 ; t9 t14
    psubsw m2, m4 ; t10 t13
    pshufb m8, m5
    pshufb m7, m5
    pshufb m3, m5
    ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1
    ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
    ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
    ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
    punpckhqdq m2, m7, m1 ; t7 t6
    punpcklqdq m7, m1 ; t4 t5
    psubsw m1, m9, m6 ; dct4 out3 out2
    paddsw m9, m6 ; dct4 out0 out1
    packssdw m5, m11 ; t12 t13a
    packssdw m4, m0 ; t11 t10a
    punpckhqdq m0, m8, m3 ; t15a t14
    punpcklqdq m8, m3 ; t8a t9
    psubsw m3, m9, m2 ; dct8 out7 out6
    paddsw m9, m2 ; dct8 out0 out1
    psubsw m2, m1, m7 ; dct8 out4 out5
    paddsw m1, m7 ; dct8 out3 out2
    psubsw m7, m9, m0 ; out15 out14
    paddsw m0, m9 ; out0 out1
    psubsw m6, m1, m5 ; out12 out13
    paddsw m1, m5 ; out3 out2
    psubsw m5, m2, m4 ; out11 out10
    paddsw m2, m4 ; out4 out5
    psubsw m4, m3, m8 ; out8 out9
    paddsw m3, m8 ; out7 out6
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

; idct_4x16_internal_8bpc
; Pass 1: loads four 32-byte rows from cq into two ZMM registers (rows 2/0
; and 3/1), byte-permutes them with int16_perm, performs the 4-point DCT
; butterflies with two ITX_MUL2X_PACK calls, then rounds with pw_16384.
; .pass2: spreads the eight 128-bit lanes of m0/m1 over xm0-xm7, runs
; IDCT16_1D_PACKED in XMM mode (.main), reassembles two ZMM registers and
; finishes in iadst_4x16's .end2 tail with m5 = pw_2048.
cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova ym1, [cq+32*2]
    vinserti32x8 m1, [cq+32*0], 1
    mova m0, [o(int16_perm)]
    mova ym2, [cq+32*3]
    vinserti32x8 m2, [cq+32*1], 1
    vpbroadcastd m4, [o(pd_2048)]
    vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
    vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
    ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2
    ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2
    vpbroadcastd m4, [o(pw_16384)]
    psubsw m3, m1, m2
    paddsw m1, m2 ; out0 out1
    vprord m3, 16 ; out2 out3
    punpckldq m0, m1, m3
    punpckhdq m1, m3
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    jmp tx2q
.pass2:
    vextracti32x4 xm2, ym0, 1
    vextracti32x4 xm3, ym1, 1
    vextracti32x4 xm4, m0, 2
    vextracti32x4 xm5, m1, 2
    vextracti32x4 xm6, m0, 3
    vextracti32x4 xm7, m1, 3
    call .main
    vinserti32x4 ym0, xm2, 1
    vinserti32x4 ym1, xm3, 1
    vinserti32x4 ym4, xm6, 1
    vinserti32x4 ym5, xm7, 1
    vinserti32x8 m0, ym4, 1
    vinserti32x8 m1, ym5, 1
    vpbroadcastd m5, [o(pw_2048)]
    pshufd m1, m1, q1032
    jmp m(iadst_4x16_internal_8bpc).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT16_1D_PACKED
    ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

; iadst_4x16_internal_8bpc
; Pass 1: permutes both 64-byte coefficient rows with permB, calls the 16x4
; ADST .main, rounds with pw_16384 and interleaves the results.
; .pass2: calls .main, then sets m5 = pw_2048 and m6 = m8 - pw_2048.
; NOTE(review): m8 is zeroed inside .main (pxor m8, m8), so m6 is presumably
; -2048 here - confirm that nothing later in .main overwrites m8.
; .end/.end2/.end3 form the store tail that the other 4x16 functions reuse.
cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m1, [o(permB)]
    vpermq m0, m1, [cq+64*0]
    vpermq m1, m1, [cq+64*1]
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd m3, [o(pw_16384)]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    pmulhrsw m2, m3
    pmulhrsw m0, m3
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    jmp tx2q
.pass2:
    call .main
    vpbroadcastd m5, [o(pw_2048)]
    psrlq m10, 4
    psubw m6, m8, m5
.end:
    vpbroadcastd m7, [o(pw_2896x8)]
    paddsw ym1, ym2, ym4
    psubsw ym2, ym4
    vinserti32x8 m1, ym2, 1
    pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
    ; m10 holds packed qword indices; each psrlq selects another index set
    psrlq m0, m10, 4
    vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
    vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
    punpcklqdq m5, m6
.end2:
    pmulhrsw m0, m5
    pmulhrsw m1, m5
.end3:
    ; Gather one dword per row (16 rows) at dstq + stride*index, clear the
    ; 128 coefficient bytes at cq, add the residual and (on the following
    ; line of the file) scatter the packed pixels back.
    vpbroadcastd m3, strided
    pmulld m5, m3, [o(pd_0to15)]
    kxnorw k1, k1, k1
    kmovw k2, k1
    vpgatherdd m3{k1}, [dstq+m5]
    pxor m4, m4
    mova [cq+64*0], m4
    mova [cq+64*1], m4
    punpcklbw m2, m3, m4
    punpckhbw m3, m4
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
; (end of iadst_4x16_internal_8bpc .end3: write the 16 packed rows back)
    vpscatterdd [dstq+m5]{k2}, m0
    RET
ALIGN function_align
; iadst_4x16 .main / .main2
; 16-point ADST core. .main derives the two qword index sets (m3, m10) from
; permB+1; .main2 is entered directly by callers that already hold them.
; The body switches to YMM mode for the butterflies and back to ZMM before
; the final lane insert.
.main:
    movu m3, [o(permB+1)]
    psrlq m10, m3, 4
.main2:
    vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10
    vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5
    vpbroadcastd m9, [o(pd_2048)]
    vpbroadcastq ym13, [o(int_mshift)]
    kxnorb k1, k1, k1
    punpckhwd m4, m3, m0 ; in12 in3 in14 in1
    punpcklwd m0, m3 ; in0 in15 in2 in13
    ; k1 = 0x0f after the shift: only the low four dword lanes are selected
    kshiftrb k1, k1, 4
    vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5
    vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9
INIT_YMM avx512icl
    vpcmpub k7, m13, m9, 6 ; 0x33...
    pxor m8, m8
    ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5
    ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
    ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5
    psubsw m2, m0, m3 ; t9a t8a t11a t10a
    paddsw m0, m3 ; t1a t0a t3a t2a
    psubsw m3, m1, m4 ; t13a t12a t15a t14a
    paddsw m4, m1 ; t5a t4a t7a t6a
    ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5
    psubw m7, m8, m7
    ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4
    vpbroadcastd m6, [o(pw_3784_m1567)]
    vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
    psubsw m1, m0, m4 ; t5 t4 t7 t6
    paddsw m0, m4 ; t1 t0 t3 t2
    psubsw m4, m2, m3 ; t13a t12a t15a t14a
    paddsw m2, m3 ; t9a t8a t11a t10a
    ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
    ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
    vbroadcasti32x4 m5, [o(deint_shuf)]
    pshufb m0, m5
    pshufb m2, m5
    vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a
    vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a
    vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14
    vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13
    pshufd m2, m2, q1032 ; t6a t7a t14 t15
    psubsw m4, m0, m3 ; t3a t2a t11 t10
    paddsw m0, m3 ; -out15 out0 out14 -out1
    paddsw m3, m1, m2 ; out12 -out3 -out13 out2
    psubsw m1, m2 ; t7 t6 t15a t14a
    punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
    punpcklqdq m4, m1 ; t3a t7 t11 t15a
INIT_ZMM avx512icl
    vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
    ret

INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

; iflipadst_4x16_internal_8bpc
; Pass 1 mirrors iadst_4x16 with the interleave operand order swapped.
; .pass2 reuses iadst_4x16's .main, shifts the index register m10 by 12
; instead of 4, swaps the roles of m5/m6 (m6 = pw_2048, m5 = m8 - pw_2048)
; and joins iadst_4x16's .end tail.
cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m1, [o(permB)]
    vpermq m0, m1, [cq+64*0]
    vpermq m1, m1, [cq+64*1]
    call m(iadst_16x4_internal_8bpc).main
    vpbroadcastd m3, [o(pw_16384)]
    punpcklwd m2, m1, m0
    punpckhwd m1, m0
    pmulhrsw m2, m3
    pmulhrsw m1, m3
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    jmp tx2q
.pass2:
    call m(iadst_4x16_internal_8bpc).main
    vpbroadcastd m6, [o(pw_2048)]
    psrlq m10, 12
    psubw m5, m8, m6
    jmp m(iadst_4x16_internal_8bpc).end

INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

; iidentity_4x16_internal_8bpc
; Pass 1: byte-permutes both coefficient rows with int16_perm and averages
; each input with its pw_1697x8 product (pmulhrsw) using vpavgw. The write
; masks k1/k2 come from a not-equal compare against pd_m1 and use zeroing,
; which handles the -1 corner case described in the inline comments.
cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m2, [o(int16_perm)]
    vpermb m1, m2, [cq+64*0]
    vpermb m2, m2, [cq+64*1]
    vpbroadcastd m4, [o(pw_1697x8)]
    vpbroadcastd m0, [o(pd_m1)]
    pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is
    vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal
    pmulhrsw m4, m2 ; it still works, but if the input is -1 the
    vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
    vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless
    vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here.
; (continuation of iidentity_4x16_internal_8bpc pass 1)
    punpckldq m0, m1, m2
    punpckhdq m1, m2
    jmp tx2q
.pass2:
    ; out = 2*x + pmulhrsw(x, pw_1697x16), saturating; m5 = pw_2048 is the
    ; rounding factor consumed by iadst_4x16's .end2
    vpbroadcastd m3, [o(pw_1697x16)]
    vpbroadcastd m5, [o(pw_2048)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m0
    paddsw m1, m1
    paddsw m0, m2
    paddsw m1, m3
    jmp m(iadst_4x16_internal_8bpc).end2

; WRITE_8X4 coef1, coef2, tmp1, tmp2 [, off1, off2, off3]
; Adds two registers of residual words to four 8-pixel rows at dstq, dstq+%5,
; dstq+%6 and dstq+%7 (defaults strideq*1, strideq*2, r6) and stores the
; saturated bytes back. %1/%2 may be register numbers or arbitrary operands;
; %3/%4 are scratch register numbers. Note that the load pairs rows as
; (0,%5)/(%6,%7) while the store, after packuswb and the lane extract, pairs
; them as (0,%6)/(%5,%7).
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
    movq xm%3, [dstq ]
    movhps xm%3, [dstq+%5]
    movq xm%4, [dstq+%6]
    movhps xm%4, [dstq+%7]
    pmovzxbw m%3, xm%3
    pmovzxbw m%4, xm%4
%ifnum %1
    paddw m%3, m%1
%else
    paddw m%3, %1
%endif
%ifnum %2
    paddw m%4, m%2
%else
    paddw m%4, %2
%endif
    packuswb m%3, m%4
    vextracti32x4 xm%4, m%3, 1
    movq [dstq ], xm%3
    movhps [dstq+%6], xm%3
    movq [dstq+%5], xm%4
    movhps [dstq+%7], xm%4
%endmacro

; INV_TXFM_8X4_FN type1, type2
; Emits the 8x4 entry point via INV_TXFM_FN. For dct_dct it adds a DC-only
; path done entirely with pmulhrsw: three multiplies by pw_2896x8 and one by
; pw_2048, then a word broadcast into m0/m1 before jumping to iadst_8x4's
; .end3 store tail.
%macro INV_TXFM_8X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x4
%ifidn %1_%2, dct_dct
    movd xm1, [o(pw_2896x8)]
    pmulhrsw xm0, xm1, [cq]
    movd xm2, [o(pw_2048)]
    pmulhrsw xm0, xm1
    pmulhrsw xm0, xm1
    pmulhrsw xm0, xm2
    vpbroadcastw m0, xm0
    mova m1, m0
    jmp m(iadst_8x4_internal_8bpc).end3
%endif
%endmacro

INIT_YMM avx512icl
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
INV_TXFM_8X4_FN dct, identity

; idct_8x4_internal_8bpc
; Pass 1: scales four 16-byte rows by pw_2896x8, reuses the 4x8 DCT's .main
; (8-point DCT in XMM mode), merges the halves and deinterleaves.
; .pass2: 4-point DCT (IDCT4_1D_PACKED), qword permutes, then iadst_8x4's
; .end2 tail.
cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd xm3, [o(pw_2896x8)]
    pmulhrsw xm0, xm3, [cq+16*0]
    pmulhrsw xm1, xm3, [cq+16*1]
    pmulhrsw xm2, xm3, [cq+16*2]
    pmulhrsw xm3, [cq+16*3]
    call m(idct_4x8_internal_8bpc).main
    vbroadcasti32x4 m4, [o(deint_shuf)]
    vinserti32x4 m3, m1, xm3, 1
    vinserti32x4 m1, m0, xm2, 1
    shufps m0, m1, m3, q0220
    shufps m1, m3, q1331
    pshufb m0, m4
    pshufb m1, m4
    jmp tx2q
.pass2:
    IDCT4_1D_PACKED
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    jmp m(iadst_8x4_internal_8bpc).end2

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

; iadst_8x4_internal_8bpc
; Pass 1: loads the four rows (rows 0/1 with swapped qwords), scales them by
; pw_2896x8 and calls the 4x8 ADST .main_pass1.
cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd xm0, [o(pw_2896x8)]
    pshufd xm4, [cq+16*0], q1032
    pmulhrsw xm3, xm0, [cq+16*3]
    pshufd xm5, [cq+16*1], q1032
    pmulhrsw xm2, xm0, [cq+16*2]
    pmulhrsw xm4, xm0
; (continuation of iadst_8x4_internal_8bpc pass 1)
    pmulhrsw xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4 m0, xm2, 1
    vinserti32x4 m1, xm3, 1
    pxor m3, m3
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    ; negate the high interleave (m3 = 0 - m2, saturating) before the
    ; second interleave round
    psubsw m3, m2
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    jmp tx2q
.pass2:
    call .main
; .end/.end2/.end3: store tail reused by the other 8x4 functions.
.end:
    vpermq m0, m0, q3120
    vpermq m1, m1, q3120
.end2:
    vpbroadcastd m2, [o(pw_2048)]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
.end3:
    pxor m2, m2
    ; NOTE(review): zmm18 is presumably the full-width register behind m2
    ; (zeroed on the previous line) under x86inc's AVX-512 register mapping,
    ; making this a 64-byte clear of the coefficient buffer - confirm.
    mova [cq], zmm18
    lea r6, [strideq*3]
    WRITE_8X4 0, 1, 4, 5
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

; iflipadst_8x4_internal_8bpc
; Same load/scale as iadst_8x4; the halves are merged in the opposite
; register order and the negation is applied to the high interleave.
; .pass2 reuses iadst_8x4's .main and swaps m0/m1 while permuting qwords.
cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd xm0, [o(pw_2896x8)]
    pshufd xm4, [cq+16*0], q1032
    pmulhrsw xm3, xm0, [cq+16*3]
    pshufd xm5, [cq+16*1], q1032
    pmulhrsw xm2, xm0, [cq+16*2]
    pmulhrsw xm4, xm0
    pmulhrsw xm5, xm0
    call m(iadst_4x8_internal_8bpc).main_pass1
    vinserti32x4 m3, m3, xm1, 1
    vinserti32x4 m2, m2, xm0, 1
    punpckhwd m1, m3, m2
    punpcklwd m3, m2
    pxor m0, m0
    psubsw m0, m1
    punpckhwd m1, m0, m3
    punpcklwd m0, m3
    jmp tx2q
.pass2:
    call m(iadst_8x4_internal_8bpc).main
    mova m2, m1
    vpermq m1, m0, q2031
    vpermq m0, m2, q2031
    jmp m(iadst_8x4_internal_8bpc).end2

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

; iidentity_8x4_internal_8bpc
; Pass 1: loads rows 0/2 and 1/3 into two YMM registers, interleaves, scales
; by pw_2896x8, interleaves again and doubles the values (paddsw x, x).
; .pass2: x += pmulhrsw(x, pw_1697x8), then iadst_8x4's .end tail.
cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova xm2, [cq+16*0]
    mova xm0, [cq+16*1]
    vinserti32x4 m2, [cq+16*2], 1
    vinserti32x4 m0, [cq+16*3], 1
    vpbroadcastd m3, [o(pw_2896x8)]
    punpcklwd m1, m2, m0
    punpckhwd m2, m0
    pmulhrsw m1, m3
    pmulhrsw m2, m3
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    paddsw m0, m0
    paddsw m1, m1
    jmp tx2q
.pass2:
    vpbroadcastd m3, [o(pw_1697x8)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    jmp m(iadst_8x4_internal_8bpc).end

; INV_TXFM_8X8_FN type1, type2
; Emits the 8x8 entry point via INV_TXFM_FN. For dct_dct it also emits, in
; ZMM mode, the DC-only code whose .dconly/.dconly2 labels are reused by
; other block sizes in this file (8x16 jumps to .dconly).
%macro INV_TXFM_8X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x8
%ifidn %1_%2, dct_dct
INIT_ZMM avx512icl
    movsx r6d, word [cq]
    mov [cq], eobd
.dconly:
    imul r6d, 181
; (continuation of INV_TXFM_8X8_FN, dct_dct DC-only path)
    add r6d, 128+256
    sar r6d, 8+1
.dconly2:
    ; second 181 scale, interleaved with building the row offsets in ym5
    vpbroadcastd ym2, strided
    imul r6d, 181
    pmulld ym5, ym2, [o(pd_0to15)]
    kxnorb k1, k1, k1
    add r6d, 128+2048
    sar r6d, 8+4
    pxor m3, m3
    vpbroadcastw m4, r6d
.dconly_loop:
    ; Each iteration gathers 8 rows of 8 pixels (one qword per row), adds the
    ; broadcast DC value and scatters them back. The gather and the scatter
    ; each clear the mask they use, hence the kmovb copies. r3d counts the
    ; remaining rows; callers with taller blocks OR the extra height into it.
    kmovb k2, k1
    vpgatherdq m2{k1}, [dstq+ym5]
    punpcklbw m0, m2, m3
    punpckhbw m1, m2, m3
    paddw m0, m4
    paddw m1, m4
    packuswb m0, m1
    kmovb k1, k2
    vpscatterdq [dstq+ym5]{k2}, m0
    lea dstq, [dstq+strideq*8]
    sub r3d, 8
    jg .dconly_loop
    RET
INIT_YMM avx512icl
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

; idct_8x8_internal_8bpc
; Pass 1: loads four permuted 32-byte rows, runs IDCT8_1D_PACKED (.main),
; deinterleaves with shufps/pshufb, rounds with pw_16384 and regroups lanes.
; .pass2: runs .main again, permutes qwords and finishes in iadst_8x8's
; .end2 tail with m4 = pw_2048.
cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m0, [cq+32*0], q3120 ; 0 1
    vpermq m3, [cq+32*3], q3120 ; 6 7
    vpermq m2, [cq+32*2], q3120 ; 4 5
    vpermq m1, [cq+32*1], q3120 ; 2 3
    call .main
    shufps m4, m0, m1, q0220
    shufps m5, m0, m1, q1331
    shufps m1, m2, m3, q0220
    shufps m3, m2, m3, q1331
    vbroadcasti32x4 m0, [o(deint_shuf)]
    vpbroadcastd m2, [o(pw_16384)]
    REPX {pshufb x, m0}, m4, m5, m1, m3
    REPX {pmulhrsw x, m2}, m4, m5, m1, m3
    vinserti32x4 m0, m4, xm1, 1
    vshufi32x4 m2, m4, m1, 0x03
    vinserti32x4 m1, m5, xm3, 1
    vshufi32x4 m3, m5, m3, 0x03
    jmp tx2q
.pass2:
    call .main
    vpbroadcastd m4, [o(pw_2048)]
    vpermq m0, m0, q3120
    vpermq m1, m1, q2031
    vpermq m2, m2, q3120
    vpermq m3, m3, q2031
    jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

; iadst_8x8_internal_8bpc
; Pass 1: loads the rows (0/1 and 2/3 with swapped halves), runs
; .main_pass1, transposes with three punpck rounds and rounds with the
; alternating-sign constant pw_16384_m16384.
; .pass2: runs .main_pass2 and builds a per-half signed rounding factor.
; .end/.end2/.end3/.end4: store tail reused by the other 8x8 functions.
cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m4, [cq+32*0], q1302 ; 1 0
    vpermq m3, [cq+32*3], q3120 ; 6 7
    vpermq m5, [cq+32*1], q1302 ; 3 2
    vpermq m2, [cq+32*2], q3120 ; 4 5
    call .main_pass1
    vpbroadcastd m5, [o(pw_16384_m16384)]
    punpcklwd m4, m0, m1
    punpckhwd m0, m1
    punpcklwd m1, m2, m3
    punpckhwd m2, m3
    punpcklwd m3, m4, m0
    punpckhwd m4, m0
    punpcklwd m0, m1, m2
    punpckhwd m1, m2
    REPX {pmulhrsw x, m5}, m3, m4, m0, m1
    vshufi32x4 m2, m3, m0, 0x03
    vinserti32x4 m0, m3, xm0, 1
    vshufi32x4 m3, m4, m1, 0x03
    vinserti32x4 m1, m4, xm1, 1
    jmp tx2q
.pass2:
    pshufd m4, m0, q1032
    pshufd m5, m1, q1032
    call .main_pass2
    vpbroadcastd m5, [o(pw_2048)]
    ; the xm-width broadcast leaves the upper 128 bits of m4 zero, so the
    ; subtraction yields 4096-2048 below and 0-2048 above
    vpbroadcastd xm4, [o(pw_4096)]
    psubw m4, m5 ; lower half = 2048, upper half = -2048
.end:
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
    pmulhrsw m0, m4
    pmulhrsw m1, m4
.end3:
    pmulhrsw m2, m4
    pmulhrsw m3, m4
.end4:
    ; clear the 128 coefficient bytes, then add/store two groups of 4 rows
    pxor m4, m4
    mova [cq+32*0], m4
    mova [cq+32*1], m4
    mova [cq+32*2], m4
    mova [cq+32*3], m4
    lea r6, [strideq*3]
    WRITE_8X4 0, 1, 4, 5
    lea dstq, [dstq+strideq*4]
    WRITE_8X4 2, 3, 4, 5
    RET
ALIGN function_align
.main_pass1:
    punpckhwd m0, m4, m3 ; 0 7
    punpckhwd m1, m5, m2 ; 2 5
    punpcklwd m2, m5 ; 4 3
    punpcklwd m3, m4 ; 6 1
    IADST8_1D_PACKED 1
    punpcklqdq m3, m4, m0 ; out6 -out7
    punpckhqdq m0, m4 ; out0 -out1
    ret
ALIGN function_align
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

; iflipadst_8x8_internal_8bpc
; Mirrors iadst_8x8: pass 1 transposes in the reverse register order and
; rounds with pw_m16384_16384; pass 2 uses the opposite-signed rounding
; factor, reverses the register order while permuting and joins iadst_8x8's
; .end3 after scaling m0/m1 itself.
cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpermq m4, [cq+32*0], q1302 ; 1 0
    vpermq m3, [cq+32*3], q3120 ; 6 7
    vpermq m5, [cq+32*1], q1302 ; 3 2
    vpermq m2, [cq+32*2], q3120 ; 4 5
    call m(iadst_8x8_internal_8bpc).main_pass1
    vpbroadcastd m5, [o(pw_m16384_16384)]
    punpckhwd m4, m3, m2
    punpcklwd m3, m2
    punpckhwd m2, m1, m0
    punpcklwd m1, m0
    punpckhwd m0, m4, m3
    punpcklwd m4, m3
    punpckhwd m3, m2, m1
    punpcklwd m2, m1
    REPX {pmulhrsw x, m5}, m0, m4, m3, m2
    vinserti32x4 m1, m0, xm3, 1
    vshufi32x4 m3, m0, m3, 0x03
    vinserti32x4 m0, m4, xm2, 1
    vshufi32x4 m2, m4, m2, 0x03
    jmp tx2q
.pass2:
    pshufd m4, m0, q1032
    pshufd m5, m1, q1032
    call m(iadst_8x8_internal_8bpc).main_pass2
    vpbroadcastd m4, [o(pw_2048)]
    vpbroadcastd xm5, [o(pw_4096)]
    psubw m4, m5 ; lower half = -2048, upper half = 2048
    vpermq m5, m3, q2031
    vpermq m3, m0, q2031
    vpermq m0, m2, q2031
    vpermq m2, m1, q2031
    pmulhrsw m1, m0, m4
    pmulhrsw m0, m5, m4
    jmp m(iadst_8x8_internal_8bpc).end3

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

; iidentity_8x8_internal_8bpc
; Pass 1 is a pure transpose: eight 16-byte rows are paired into YMM
; registers (row n with row n+4) and interleaved at word and dword level;
; no scaling is applied here.
; .pass2 only selects the rounding factor pw_4096 and reuses iadst_8x8 .end.
cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova xm3, [cq+16*0]
    mova xm2, [cq+16*1]
    vinserti32x4 m3, [cq+16*4], 1
    vinserti32x4 m2, [cq+16*5], 1
    mova xm4, [cq+16*2]
    mova xm0, [cq+16*3]
    vinserti32x4 m4, [cq+16*6], 1
    vinserti32x4 m0, [cq+16*7], 1
    punpcklwd m1, m3, m2
    punpckhwd m3, m2
    punpcklwd m2, m4, m0
    punpckhwd m4, m0
    punpckldq m0, m1, m2
    punpckhdq m1, m2
    punpckldq m2, m3, m4
    punpckhdq m3, m4
    jmp tx2q
.pass2:
    vpbroadcastd m4, [o(pw_4096)]
    jmp m(iadst_8x8_internal_8bpc).end

; INV_TXFM_8X16_FN type1, type2
; Emits the 8x16 entry point via INV_TXFM_FN. The dct_dct DC-only path sets
; the row counter (or r3d, 16), applies one extra (x*181+128)>>8 scale and
; continues in the 8x8 .dconly code.
%macro INV_TXFM_8X16_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 8x16
%ifidn %1_%2, dct_dct
    movsx r6d, word [cq]
    mov [cq], eobd
    or r3d, 16
    imul r6d, 181
    add r6d, 128
    sar r6d, 8
    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro

; ITX_8X16_LOAD_COEFS
; Loads eight 32-byte coefficient rows into m0-m7, each scaled by pw_2896x8.
; cq is advanced by 32*4 after the first load, so the remaining rows are
; addressed with offsets of -3..+3 rows around the new cq. The caller sees
; cq moved by 128 bytes afterwards.
%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd m4, [o(pw_2896x8)]
    pmulhrsw m0, m4, [cq+32*0]
    add cq, 32*4
    pmulhrsw m7, m4, [cq+32*3]
    pmulhrsw m1, m4, [cq-32*3]
    pmulhrsw m6, m4, [cq+32*2]
    pmulhrsw m2, m4, [cq-32*2]
    pmulhrsw m5, m4, [cq+32*1]
    pmulhrsw m3, m4, [cq-32*1]
    pmulhrsw m4, [cq+32*0]
%endmacro

INIT_ZMM avx512icl
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

; idct_8x16_internal_8bpc
; Pass 1: permutes four 64-byte rows with permB, scales by pw_2896x8, runs
; the 16x8 DCT's .main (8-point DCT), rounds with pw_16384 and transposes.
; The trailing comments track which row (a-h) and column index each word
; lane holds.
cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m3, [o(permB)]
    vpermq m0, m3, [cq+64*0]
    vpbroadcastd m4, [o(pw_2896x8)]
    vpermq m1, m3, [cq+64*1]
    vpermq m2, m3, [cq+64*2]
    vpermq m3, m3, [cq+64*3]
    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
    call m(idct_16x8_internal_8bpc).main
    vpbroadcastd m5, [o(pw_16384)]
    punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3
    punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3
    REPX {pmulhrsw x, m5}, m4, m0, m2, m1
    punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
    punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1
    punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
    punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1
; (continuation of idct_8x16_internal_8bpc pass 1)
    punpckhdq m1, m0, m2 ; 1 5 9 13
    punpckldq m0, m2 ; 0 4 8 12
    punpckldq m2, m3, m4 ; 2 6 10 14
    punpckhdq m3, m4 ; 3 7 11 15
    jmp tx2q
.pass2:
    ; Regroup the four ZMM inputs into the eight YMM operands that .main2
    ; (inside IDCT16_1D_PACKED) expects, using a 16-bit-rotated int16_perm.
    vprord m5, [o(int16_perm)], 16
    vshufi32x4 m2, m2, q1320 ; 2 10 14 6
    vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
    vshufi32x4 m1, m3, q0132 ; 9 13 7 3
    vpermb m9, m5, m0
    vpermb m7, m5, m2
    vpermb m8, m5, m4
    vpermb m0, m5, m1
    vextracti32x8 ym6, m9, 1
    vextracti32x8 ym3, m7, 1
    vextracti32x8 ym5, m8, 1
    vextracti32x8 ym1, m0, 1
    call .main2
    ; ym8 = row-offset table for the gather/scatter below, m9 = qword
    ; indices used to merge the 16 outputs back into m0-m3
    mova ym8, [o(gather8a)]
    lea r3, [dstq+strideq*4]
    pmovzxdq m9, ym8
    pshufd ym8, ym8, q1230
    vpermt2q m0, m9, m4
    vpermt2q m1, m9, m5
    vpermt2q m2, m9, m6
    vpermt2q m3, m9, m7
; .end/.end2/.end3/.end4: store tail reused by the other 8x16 functions.
; On entry callers provide ym8 (row multipliers) and r3 (second base row).
.end:
    vpbroadcastd m7, [o(pw_2048)]
.end2:
    pmulhrsw m0, m7
    pmulhrsw m1, m7
.end3:
    pmulhrsw m2, m7
    pmulhrsw m3, m7
.end4:
    ; Two 8-row gathers (from dstq and r3), coefficient clear, add, pack and
    ; two scatters. Every gather/scatter clears its mask, so k1/k2 are
    ; re-copied between uses.
    vpbroadcastd ym6, strided
    kxnorb k1, k1, k1
    pxor m4, m4
    pmulld ym8, ym6
    kmovb k2, k1
    vpgatherdq m6{k1}, [dstq+ym8]
    kmovb k1, k2
    vpgatherdq m7{k2}, [r3+ym8]
    mova [cq+64*0], m4
    mova [cq+64*1], m4
    kmovb k2, k1
    mova [cq+64*2], m4
    mova [cq+64*3], m4
    punpcklbw m5, m6, m4
    punpckhbw m6, m4
    paddw m0, m5
    paddw m1, m6
    packuswb m0, m1
    vpscatterdq [dstq+ym8]{k1}, m0
    punpcklbw m6, m7, m4
    punpckhbw m7, m4
    paddw m2, m6
    paddw m3, m7
    packuswb m2, m3
    vpscatterdq [r3+ym8]{k2}, m2
    RET
ALIGN function_align
; Reduced-input entry points: they compute the first butterfly stage with
; single pmulhrsw multiplies and then join the full macro body at .main5
; (.main_fast2) or .main4 (.main_fast).
cglobal_label .main_fast2 ; bottom three-quarters are zero
    vpbroadcastd ym10, [o(pd_2048)]
    vpbroadcastq ym13, [o(int_mshift)]
    vpbroadcastd ym3, [o(pw_401_4076x8)]
    vpbroadcastd ym5, [o(pw_799_4017x8)]
    vpbroadcastd ym4, [o(pw_m1189_3920x8)]
    pxor ym6, ym6
    punpckhwd ym2, ym0, ym0
    pmulhrsw ym2, ym3 ; t8a t15a
    punpcklwd ym7, ym1, ym1
    pmulhrsw ym7, ym5 ; t4a t7a
    punpckhwd ym1, ym1
    pmulhrsw ym4, ym1 ; t11a t12a
    vpcmpub k7, ym13, ym10, 6
    punpcklwd ym9, ym6, ym0
    psubsw ym0, ym2, ym4 ; t11a t12a
    paddsw ym8, ym2, ym4 ; t8a t15a
    mova ym1, ym7
    jmp .main5
ALIGN function_align
cglobal_label .main_fast ; bottom half is zero
    vpbroadcastd ym10, [o(pd_2048)]
    vpbroadcastq ym13, [o(int_mshift)]
    pxor ym6, ym6
    punpckhwd ym8, ym0, ym0
    punpckhwd ym4, ym3, ym3
; (continuation of idct_8x16_internal_8bpc .main_fast)
    punpckhwd ym5, ym2, ym2
    punpcklwd ym7, ym1, ym1
    punpckhwd ym1, ym1
    punpcklwd ym3, ym3
    punpcklwd ym9, ym6, ym0
    punpcklwd ym6, ym2
    vpbroadcastd ym2, [o(pw_401_4076x8)]
    vpbroadcastd ym0, [o(pw_m2598_3166x8)]
    vpbroadcastd ym11, [o(pw_1931_3612x8)]
    vpbroadcastd ym12, [o(pw_m1189_3920x8)]
    pmulhrsw ym8, ym2 ; t8a t15a
    vpbroadcastd ym2, [o(pw_799_4017x8)]
    pmulhrsw ym0, ym4 ; t9a t14a
    vpbroadcastd ym4, [o(pw_m2276_3406x8)]
    pmulhrsw ym5, ym11 ; t10a t13a
    pmulhrsw ym1, ym12 ; t11a t12a
    pmulhrsw ym7, ym2 ; t4a t7a
    pmulhrsw ym3, ym4 ; t5a t6a
    vpcmpub k7, ym13, ym10, 6
    jmp .main4
ALIGN function_align
; Full 16-point DCT in YMM mode; the .main2-.main5 labels used above are
; defined inside this macro expansion.
cglobal_label .main
    WRAP_YMM IDCT16_1D_PACKED
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

; iadst_8x16_internal_8bpc
; Pass 1: runs the 16x8 ADST's .main_pass1, then interleaves/shuffles the
; outputs; .pass1_end (shared with the flipadst variant) rounds with the
; signed constant in m7 and finishes the transpose.
; .pass2: runs .main_pass2 and sets m6 = pw_2048, m7 = m8 - pw_2048;
; .pass2_end (also shared) scales the middle outputs by pw_2896x8, reorders
; rows through the qword indices in m10 and joins idct_8x16's .end3 tail.
cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4 m6, [o(int_shuf1)]
    vpbroadcastd m7, [o(pw_16384_m16384)]
    punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
.pass1_end:
    REPX {pmulhrsw x, m7}, m3, m5, m4, m2
    punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
    punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhqdq m1, m0, m2
    punpcklqdq m0, m2
    punpcklqdq m2, m3, m5
    punpckhqdq m3, m5
    jmp tx2q
.pass2:
    call .main_pass2
    vpbroadcastd m6, [o(pw_2048)]
    psrlq m10, 4
    psubw m7, m8, m6
.pass2_end:
    vpbroadcastd m5, [o(pw_2896x8)]
    paddsw m1, m2, m4
    psubsw m2, m4
    pmulhrsw m1, m5 ; -out7 out4 out6 -out5
    pmulhrsw m5, m2 ; out8 -out11 -out9 out10
    mova ym8, [o(gather8c)]
    lea r3, [dstq+strideq]
    psrlq m2, m10, 4
    vpermi2q m2, m0, m3 ; 1 3 13 15
    vpermt2q m0, m10, m3 ; 0 2 12 14
    psrlq m3, m10, 8
    vpermi2q m3, m1, m5 ; 5 7 9 11
    psrlq m10, 12
    vpermt2q m1, m10, m5 ; 4 6 8 10
    ; m0/m1 are scaled here with m6; .end3 scales m2/m3 with m7
    pmulhrsw m0, m6
    pmulhrsw m1, m6
    jmp m(idct_8x16_internal_8bpc).end3
ALIGN function_align
; iadst_8x16 .main_pass1
; Loads four 64-byte rows scaled by pw_2896x8 and arranges them with qword
; permutes (indices taken from permA+3) into the operand order required by
; .main.
.main_pass1:
    vpbroadcastd m2, [o(pw_2896x8)]
    pmulhrsw m5, m2, [cq+64*0]
    pmulhrsw m3, m2, [cq+64*3]
    pmulhrsw m1, m2, [cq+64*1]
    pmulhrsw m2, [cq+64*2]
    movu m4, [o(permA+3)]
    psrlq m10, m4, 4
    mova m6, m4
    vpermi2q m4, m5, m3 ; in0 in12 in2 in14
    vpermt2q m5, m10, m3 ; in15 in3 in13 in1
    vpermi2q m6, m1, m2 ; in4 in8 in6 in10
    vpermt2q m1, m10, m2 ; in11 in7 in9 in5
    jmp .main
ALIGN function_align
; iadst_8x16 .main_pass2
; Same arrangement for register inputs m0-m3, with indices from permC; falls
; through into .main.
.main_pass2:
    mova m4, [o(permC)]
    psrlq m5, m4, 4
    vpermi2q m4, m0, m2 ; in0 in12 in2 in14
    psrlq m6, m5, 4
    vpermi2q m5, m1, m3 ; in15 in3 in13 in1
    psrlq m10, m6, 4
    vpermi2q m6, m0, m2 ; in4 in8 in6 in10
    vpermt2q m1, m10, m3 ; in11 in7 in9 in5
; iadst_8x16 .main / .main2
; 16-point ADST butterflies at ZMM width. m8 is zeroed here and left as a
; zero source for callers that compute negated rounding factors from it.
.main:
    punpcklwd m0, m4, m5 ; in0 in15 in2 in13
    punpckhwd m4, m5 ; in12 in3 in14 in1
    punpcklwd m5, m6, m1 ; in4 in11 in6 in9
    punpckhwd m6, m1 ; in8 in7 in10 in5
cglobal_label .main2
    vpbroadcastd m9, [o(pd_2048)]
    vpbroadcastq m13, [o(int_mshift)]
    kxnorb k1, k1, k1
    vpcmpub k7, m13, m9, 6 ; 0x33...
    pxor m8, m8
    ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
    ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
    ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5
    ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
    psubsw m2, m0, m6 ; t9a t8a t11a t10a
    paddsw m0, m6 ; t1a t0a t3a t2a
    psubsw m3, m5, m4 ; t13a t12a t15a t14a
    paddsw m5, m4 ; t5a t4a t7a t6a
    ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
    psubw m7, m8, m7
    ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4
    vpbroadcastd m6, [o(pw_3784_m1567)]
    vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
    psubsw m1, m0, m5 ; t5 t4 t7 t6
    paddsw m0, m5 ; t1 t0 t3 t2
    psubsw m4, m2, m3 ; t13a t12a t15a t14a
    paddsw m2, m3 ; t9a t8a t11a t10a
    ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
    ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
    vbroadcasti32x4 m5, [o(deint_shuf)]
    pshufb m0, m5
    pshufb m2, m5
    vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a
    vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a
    vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15
    vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12
; (continuation of iadst_8x16_internal_8bpc .main: final butterfly stage)
    pshufd m2, m2, q1032 ; t7a t6a t15 t14
    psubsw m4, m0, m3 ; t3a t2a t11 t10
    paddsw m0, m3 ; -out15 out0 out14 -out1
    paddsw m3, m1, m2 ; out12 -out3 -out13 out2
    psubsw m1, m2 ; t7 t6 t15a t14a
    punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
    punpcklqdq m4, m1 ; t3a t7 t11 t15a
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

; iflipadst_8x16_internal_8bpc
; Pass 1 mirrors iadst_8x16 (swapped punpck direction, int_shuf2 and the
; opposite-signed rounding constant) and joins its .pass1_end.
; .pass2 reuses iadst_8x16's .main_pass2, swaps the roles of m6/m7
; (m7 = pw_2048, m6 = m8 - pw_2048), selects a different index set by
; shifting m10 by 36 and joins .pass2_end.
cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call m(iadst_16x8_internal_8bpc).main_pass1
    vbroadcasti32x4 m6, [o(int_shuf2)]
    vpbroadcastd m7, [o(pw_m16384_16384)]
    punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
    pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
    jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
    call m(iadst_8x16_internal_8bpc).main_pass2
    vpbroadcastd m7, [o(pw_2048)]
    psrlq m10, 36
    psubw m6, m8, m7
    jmp m(iadst_8x16_internal_8bpc).pass2_end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

; iidentity_8x16_internal_8bpc
; Pass 1: byte-permutes four 64-byte rows with int16_perm, transposes at
; dword/qword level and scales by pw_2896x8.
; .pass2: out = 2*x + pmulhrsw(x, pw_1697x16) with saturation, then reuses
; idct_8x16's .end tail with its own ym8 (gather8b) and r3 = dstq+stride*2.
cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [o(int16_perm)]
    vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
    vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
    vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
    vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
    vpbroadcastd m5, [o(pw_2896x8)]
    punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq m2, m4, m0 ; e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3
    REPX {pmulhrsw x, m5}, m1, m2, m3, m4
    punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1
    punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
    punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
    jmp tx2q
.pass2:
    vpbroadcastd m7, [o(pw_1697x16)]
    mova ym8, [o(gather8b)]
    lea r3, [dstq+strideq*2]
    pmulhrsw m4, m7, m0
    pmulhrsw m5, m7, m1
    pmulhrsw m6, m7, m2
; (continuation of iidentity_8x16_internal_8bpc .pass2)
    pmulhrsw m7, m3
    REPX {paddsw x, x}, m0, m1, m2, m3
    paddsw m0, m4
    paddsw m1, m5
    paddsw m2, m6
    paddsw m3, m7
    jmp m(idct_8x16_internal_8bpc).end

; WRITE_16X2 coef1, coef2, tmp1, tmp2, offset1, offset2
; Adds two registers of residual words to the 16-pixel rows at dstq+%5 and
; dstq+%6 and stores the saturated bytes back. %1/%2 may be register numbers
; or arbitrary operands; %3/%4 are scratch register numbers.
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw m%3, [dstq+%5]
%ifnum %1
    paddw m%3, m%1
%else
    paddw m%3, %1
%endif
    pmovzxbw m%4, [dstq+%6]
%ifnum %2
    paddw m%4, m%2
%else
    paddw m%4, %2
%endif
    packuswb m%3, m%4
    vpermq m%3, m%3, q3120
    mova [dstq+%5], xm%3
    vextracti32x4 [dstq+%6], m%3, 1
%endmacro

; INV_TXFM_16X4_FN type1, type2
; Emits the 16x4 entry point via INV_TXFM_FN. The dct_dct DC-only path jumps
; straight to the 16x8 .dconly2 label, i.e. it skips both the `or r3d, 8`
; row-count setup and the first 181 scale of the 16x8 path.
%macro INV_TXFM_16X4_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x4
%ifidn %1_%2, dct_dct
    movsx r6d, word [cq]
    mov [cq], eobd
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
%endif
%endmacro

INIT_ZMM avx512icl
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
INV_TXFM_16X4_FN dct, identity

; idct_16x4_internal_8bpc
; Pass 1: loads eight 16-byte rows into xm0-xm7, reuses the 4x16 DCT's .main
; (16-point DCT in XMM mode), merges the outputs into two ZMM registers,
; rounds with pw_16384 and transposes.
; .pass2: 4-point DCT, then iadst_16x4's .end tail with m2 = permA.
cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova xm0, [cq+16*0]
    mova xm1, [cq+16*1]
    mova xm2, [cq+16*2]
    mova xm3, [cq+16*3]
    mova xm4, [cq+16*4]
    mova xm5, [cq+16*5]
    mova xm6, [cq+16*6]
    mova xm7, [cq+16*7]
    call m(idct_4x16_internal_8bpc).main
    vpbroadcastd m8, [o(pw_16384)]
    vinserti32x4 ym1, xm3, 1 ; 3 2 7 6
    vinserti32x4 ym5, xm7, 1 ; b a f e
    vinserti32x4 ym0, xm2, 1 ; 0 1 4 5
    vinserti32x4 ym4, xm6, 1 ; 8 9 c d
    vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e
    vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d
    pmulhrsw m1, m8
    pmulhrsw m0, m8
    pshufd m1, m1, q1032
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    jmp tx2q
.pass2:
    IDCT4_1D_PACKED
    mova m2, [o(permA)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

; iadst_16x4_internal_8bpc
; Pass 1: loads both 64-byte rows, prepares qword indices from permB and
; enters the 4x16 ADST at .main2. .pass1_end (shared with the flipadst
; variant) computes the remaining outputs with vpdpwssd dot products against
; +-2896 pairs, shifts right by 12, reorders and rounds with m6.
; NOTE(review): m9 is used as the dot-product accumulator seed; it is loaded
; with pd_2048 inside iadst_4x16 .main2, so it presumably acts as the
; rounding bias for the >>12 - confirm.
; .end/.end2/.end3: store tail reused by the other 16x4 functions. m2 holds
; the qword permutation applied to the outputs at .end2.
cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+64*0]
    mova m1, [cq+64*1]
    movshdup m3, [o(permB)]
    psrlq m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd m6, [o(pw_16384_m16384)]
    psrlq m0, m10, 4
    psrlq m10, 8
.pass1_end:
    punpcklwd ym5, ym4, ym2
    punpckhwd ym4, ym2
    vinserti32x8 m5, ym4, 1
    mova m1, m9
    vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
    mova m4, m9
    vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16}
    psrad m1, 12
    psrad m4, 12
    packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
    vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
    vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m0, m2
    punpcklwd m0, m2
    pmulhrsw m0, m6
    pmulhrsw m1, m6
    jmp tx2q
.pass2:
    call .main
    movu m2, [o(permA+1)]
.end:
    vpbroadcastd m3, [o(pw_2048)]
    pmulhrsw m0, m3
    pmulhrsw m1, m3
.end2:
    psrlq m3, m2, 4
    vpermi2q m2, m0, m1
    vpermi2q m3, m0, m1
.end3:
    ; Load four 16-pixel rows into one ZMM register, clear the 128
    ; coefficient bytes, add the residual held in m2/m3 and store the rows.
    lea r3, [dstq+strideq*2]
    mova xm1, [dstq+strideq*0]
    vinserti32x4 ym1, [dstq+strideq*1], 1
    vinserti32x4 m1, [r3 +strideq*0], 2
    vinserti32x4 m1, [r3 +strideq*1], 3
    pxor m4, m4
    mova [cq+64*0], m4
    mova [cq+64*1], m4
    punpcklbw m0, m1, m4
    punpckhbw m1, m4
    paddw m0, m2
    paddw m1, m3
    packuswb m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3 +strideq*0], m0, 2
    vextracti32x4 [r3 +strideq*1], m0, 3
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

; iflipadst_16x4_internal_8bpc
; Identical to iadst_16x4 except for the opposite-signed rounding constant,
; the index shifts (12/16 instead of 4/8) and the output permutation
; (permA+2 instead of permA+1).
cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m0, [cq+64*0]
    mova m1, [cq+64*1]
    movshdup m3, [o(permB)]
    psrlq m10, m3, 4
    call m(iadst_4x16_internal_8bpc).main2
    vpbroadcastd m6, [o(pw_m16384_16384)]
    psrlq m0, m10, 12
    psrlq m10, 16
    jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
    call m(iadst_16x4_internal_8bpc).main
    movu m2, [o(permA+2)]
    jmp m(iadst_16x4_internal_8bpc).end

INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

; iidentity_16x4_internal_8bpc
; Pass 1: x += pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384), followed by a
; byte permute with idtx_16x4p.
; .pass2: x += pmulhrsw(x, pw_1697x8), then iadst_16x4's .end tail with
; m2 = permA+1.
cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    mova m1, [cq+64*0]
    mova m2, [cq+64*1]
    vpbroadcastd m3, [o(pw_1697x16)]
    vpbroadcastd m4, [o(pw_16384)]
    mova m5, [o(idtx_16x4p)]
    shufps m0, m1, m2, q2020
    shufps m1, m2, q3131
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddsw m0, m2
    paddsw m1, m3
    vpermb m0, m5, m0
    vpermb m1, m5, m1
    jmp tx2q
.pass2:
    vpbroadcastd m3, [o(pw_1697x8)]
    pmulhrsw m2, m3, m0
    pmulhrsw m3, m1
    paddsw m0, m2
    paddsw m1, m3
    movu m2, [o(permA+1)]
    jmp m(iadst_16x4_internal_8bpc).end

; INV_TXFM_16X8_FN type1, type2
; Emits the 16x8 entry point via INV_TXFM_FN. The dct_dct DC-only path has
; three entry labels, each skipping one more 181 scale: .dconly (all three),
; .dconly2 (last two) and .dconly3 (last one). The loop adds the broadcast
; DC value to four 16-pixel rows per iteration; r3d counts remaining rows.
%macro INV_TXFM_16X8_FN 2 ; type1, type2
    INV_TXFM_FN %1, %2, 16x8
%ifidn %1_%2, dct_dct
    movsx r6d, word [cq]
    mov [cq], eobd
    or r3d, 8
.dconly:
    imul r6d, 181
    add r6d, 128
    sar r6d, 8
.dconly2:
    imul r6d, 181
    add r6d, 128+256
    sar r6d, 8+1
.dconly3:
    imul r6d, 181
    lea r2, [strideq*3]
    add r6d, 128+2048
    sar r6d, 8+4
    pxor m2, m2
    vpbroadcastw m3, r6d
.dconly_loop:
    mova xm1, [dstq+strideq*0]
    vinserti32x4 ym1, [dstq+strideq*1], 1
    vinserti32x4 m1, [dstq+strideq*2], 2
    vinserti32x4 m1, [dstq+r2 ], 3
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    paddw m0, m3
    paddw m1, m3
    packuswb m0, m1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+r2 ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub r3d, 4
    jg .dconly_loop
    RET
%endif
%endmacro

; ITX_16X8_LOAD_COEFS shuf_odd
; Loads eight 32-byte rows into m0-m7 with vpermq, using q3120 for even rows
; and the caller-supplied q%1 for odd rows, then scales all of them by
; pw_2896x8. cq is advanced by 32*4 as a side effect.
%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd m8, [o(pw_2896x8)]
    vpermq m0, [cq+32*0], q3120
    add cq, 32*4
    vpermq m7, [cq+32*3], q%1
    vpermq m1, [cq-32*3], q%1
    vpermq m6, [cq+32*2], q3120
    vpermq m2, [cq-32*2], q3120
    vpermq m5, [cq+32*1], q%1
    vpermq m3, [cq-32*1], q%1
    vpermq m4, [cq+32*0], q3120
    REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

; idct_16x8_internal_8bpc
; Pass 1: loads four 64-byte rows (permuted, scaled by pw_2896x8), splits
; each into two YMM halves and runs the 8x16 DCT's .main (16-point DCT in
; YMM mode); the outputs are merged, shuffled, rounded with pw_16384 and
; transposed at dword level.
; .pass2: regroups lanes, runs the 8-point DCT (.main), reorders rows with
; qword permutes derived from permC and scales by pw_2048.
; .end/.end2: store tail for eight 16-pixel rows, reused by other 16x8
; functions; the residual is expected in m0/m4 (rows 0,1,4,5) and m1/m5
; (rows 2,3,6,7).
cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd m1, [o(pw_2896x8)]
    vpermq m0, [cq+64*0], q3120
    vpermq m2, [cq+64*1], q3120
    vpermq m4, [cq+64*2], q3120
    vpermq m6, [cq+64*3], q3120
    REPX {pmulhrsw x, m1}, m0, m2, m4, m6
    vextracti32x8 ym1, m0, 1
    vextracti32x8 ym3, m2, 1
    vextracti32x8 ym5, m4, 1
    vextracti32x8 ym7, m6, 1
    call m(idct_8x16_internal_8bpc).main
    vbroadcasti32x4 m8, [o(int_shuf1)]
    vbroadcasti32x4 m9, [o(int_shuf2)]
    vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
    vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
    vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
    vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
    vpbroadcastd m2, [o(pw_16384)]
    pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3
    pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3
    pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
    pshufb m7, m5, m9 ; k0 l0 k1 l1 k2 l2 k3 l3
    REPX {pmulhrsw x, m2}, m0, m1, m6, m7
    punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
    jmp tx2q
.pass2:
    vshufi32x4 m0, m2, m4, q2020 ; 0 1
    vshufi32x4 m2, m4, q3131 ; 4 5
    vshufi32x4 m1, m3, m5, q2020 ; 2 3
    vshufi32x4 m3, m5, q3131 ; 6 7
    call .main
    movshdup m4, [o(permC)]
    psrlq m6, m4, 4
    vpermq m5, m4, q1032
    vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
    vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
    psrlq m6, m5, 4
    vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
    vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
    vpbroadcastd m6, [o(pw_2048)]
.end:
    REPX {pmulhrsw x, m6}, m0, m4, m1, m5
.end2:
    lea r3, [dstq+strideq*4]
    lea r4, [strideq*3]
    mova xm3, [dstq+strideq*0]
    mova xm6, [dstq+strideq*2]
    vinserti32x4 ym3, [dstq+strideq*1], 1
    vinserti32x4 ym6, [dstq+r4 ], 1
    vinserti32x4 m3, [r3 +strideq*0], 2
    vinserti32x4 m6, [r3 +strideq*2], 2
    vinserti32x4 m3, [r3 +strideq*1], 3
    vinserti32x4 m6, [r3 +r4 ], 3
    pxor m7, m7
    mova [cq+64*0], m7
    mova [cq+64*1], m7
    mova [cq+64*2], m7
    mova [cq+64*3], m7
    punpcklbw m2, m3, m7
    punpckhbw m3, m7
    paddw m0, m2
    paddw m4, m3
    packuswb m0, m4
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [r3 +strideq*0], m0, 2
    vextracti32x4 [r3 +strideq*1], m0, 3
    punpcklbw m3, m6, m7
    punpckhbw m6, m7
    paddw m1, m3
    paddw m5, m6
    packuswb m1, m5
    mova [dstq+strideq*2], xm1
    vextracti32x4 [dstq+r4 ], ym1, 1
    vextracti32x4 [r3 +strideq*2], m1, 2
    vextracti32x4 [r3 +r4 ], m1, 3
    RET
ALIGN function_align
cglobal_label .main
    IDCT8_1D_PACKED
    ret
; Entry points whose first transform type is adst.
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

; iadst_16x8_internal_8bpc
;
; First pass (entry): runs iadst_8x16_internal_8bpc's .main_pass1, then
; .pass1_end finishes four outputs with +/-2896 dot products (vpdpwssd,
; >> 12), reorders everything with index sets taken from m10, interleaves
; the words into m2..m5, scales by m7 and jumps to the second-pass routine
; in tx2q.
; .pass1_end is also the tail of iflipadst_16x8, which enters it with a
; different m7 constant and a different shift of m10.
;
; Second pass (.pass2): regroups m2..m5, calls .main_pass2, applies the
; scale factor left in m6, reorders with indices from permC and tail-jumps
; to idct_16x8_internal_8bpc's .end2 to add the result to the destination.
;
; NOTE(review): m9 and m10 are consumed below without being set in this
; routine; presumably the called .main_pass1 leaves a rounding term in m9 and
; a packed permute-index vector in m10 - confirm against that routine.
cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call            m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd    m7, [o(pw_16384_m16384)]
    psrlq           m10, 4                  ; shift each qword of m10 right by 4 bits
.pass1_end:
    punpcklwd       m5, m4, m2
    punpckhwd       m4, m2
    ; each accumulator starts from m9, gets a word-pair dot product added,
    ; and is then shifted right by 12
    mova            m1, m9
    vpdpwssd        m1, m5, [o(pw_m2896_2896)] {1to16}
    mova            m6, m9
    vpdpwssd        m6, m5, [o(pw_2896_2896)] {1to16}
    mova            m2, m9
    vpdpwssd        m2, m4, [o(pw_m2896_2896)] {1to16}
    vpdpwssd        m9, m4, [o(pw_2896_2896)] {1to16}
    psrad           m1, 12
    psrad           m6, 12
    packssdw        m1, m6 ;  out8  -out7  -out9   out6
    psrad           m2, 12
    psrad           m9, 12
    packssdw        m2, m9 ; -out11  out4   out10 -out5
    ; vpermi2q/vpermt2q only look at the low four bits of each qword index,
    ; so every further 4-bit shift of m10 selects another index set.
    psrlq           m4, m10, 4
    vpermi2q        m4, m0, m2
    vpermt2q        m0, m10, m2
    psrlq           m5, m10, 8
    vpermi2q        m5, m1, m3
    psrlq           m10, 12
    vpermt2q        m1, m10, m3
    punpcklwd       m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
    punpckhwd       m4, m5     ; b0 d0 b1 d1 b2 d2 b3 d3
    punpcklwd       m5, m1, m0 ; i0 k0 i1 k1 i2 k2 i3 k3
    punpckhwd       m1, m0     ; j0 l0 j1 l1 j2 l2 j3 l3
    punpcklwd       m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckhwd       m3, m4     ; a2 b2 c2 d2 a3 b3 c3 d3
    punpcklwd       m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
    punpckhwd       m5, m1     ; i2 j2 k2 l2 i3 j3 k3 l3
    REPX {pmulhrsw x, m7}, m2, m3, m4, m5
    jmp             tx2q
.pass2:
    vshufi32x4      m0, m2, m4, q2020
    vshufi32x4      m2, m4, q3131     ; 4 5
    vshufi32x4      m1, m3, m5, q2020
    vshufi32x4      m3, m5, q3131     ; 6 7
    pshufd          m4, m0, q1032     ; 1 0
    pshufd          m5, m1, q1032     ; 3 2
    call            .main_pass2
    movshdup        m4, [o(permC)]
    ; m2/m3 were already scaled inside .main_pass2; m0/m1 are scaled here
    pmulhrsw        m0, m6
    pmulhrsw        m1, m6
    psrlq           m6, m4, 4               ; second index set from permC
    mova            m5, m4
    vpermi2q        m4, m0, m2
    vpermt2q        m0, m6, m2
    vpermi2q        m5, m1, m3
    vpermt2q        m1, m6, m3
    jmp             m(idct_16x8_internal_8bpc).end2
ALIGN function_align
; Local first-pass helper: loads the four 64-byte coefficient vectors scaled
; by pw_2896x8, swaps 256-bit halves between them under mask k1, byte-permutes
; the results with int16_perm and runs IADST8_1D_PACKED 1.
.main_pass1:
    vpbroadcastd    m4, [o(pw_2896x8)]
    pmulhrsw        m3, m4, [cq+64*0]
    pmulhrsw        m1, m4, [cq+64*3]
    pmulhrsw        m2, m4, [cq+64*1]
    pmulhrsw        m4, [cq+64*2]
    mova            m5, [o(int16_perm)]
    kxnorb          k1, k1, k1              ; k1 = 0xff: the low eight dword lanes
    vpblendmd       m0{k1}, m1, m3 ; 0 7
    vmovdqa32       m3{k1}, m1     ; 6 1
    vpblendmd       m1{k1}, m4, m2 ; 2 5
    vmovdqa32       m2{k1}, m4     ; 4 3
    REPX {vpermb x, m5, x}, m0, m1, m2, m3
    IADST8_1D_PACKED 1
    ret
ALIGN function_align
; Second-pass helper (declared with cglobal_label; iflipadst_16x8 calls it).
; After IADST8_1D_PACKED 2 it builds packssdw(m6, -m6) in m6 and multiplies
; m2 and m3 by it; callers then multiply the remaining registers by m6.
; NOTE(review): m6 is presumably a dword scale constant left by the macro -
; confirm in IADST8_1D_PACKED.
cglobal_label .main_pass2
    IADST8_1D_PACKED 2
    pxor            m5, m5
    psubd           m5, m6                  ; m5 = -m6 (per dword)
    packssdw        m6, m5
    pmulhrsw        m2, m6
    pmulhrsw        m3, m6
    ret

; Entry points whose first transform type is flipadst.
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

; iflipadst_16x8_internal_8bpc
;
; First pass: same as iadst_16x8 except for the sign pattern of the m7 scale
; constant (pw_m16384_16384) and the shift applied to m10 (20 instead of 4,
; i.e. other index sets); it then reuses iadst_16x8's .pass1_end.
; Second pass (.pass2): same regrouping and .main_pass2 call as iadst_16x8,
; followed by a different register/index assignment before the shared
; idct_16x8 .end2 store. NOTE(review): presumably this is what reverses the
; output order for the flipped variant - confirm against permC.
cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    call            m(iadst_8x16_internal_8bpc).main_pass1
    vpbroadcastd    m7, [o(pw_m16384_16384)]
    psrlq           m10, 20
    jmp             m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
    vshufi32x4      m0, m2, m4, q2020
    vshufi32x4      m2, m4, q3131     ; 4 5
    vshufi32x4      m1, m3, m5, q2020
    vshufi32x4      m3, m5, q3131     ; 6 7
    pshufd          m4, m0, q1032     ; 1 0
    pshufd          m5, m1, q1032     ; 3 2
    call            m(iadst_16x8_internal_8bpc).main_pass2
    movshdup        m4, [o(permC)]
    ; scale the two registers .main_pass2 left unscaled, swapping their roles
    pmulhrsw        m5, m6, m0
    pmulhrsw        m0, m6, m1
    psrlq           m1, m4, 12
    psrlq           m4, 8
    mova            m7, m4
    vpermi2q        m4, m0, m3
    vpermt2q        m0, m1, m3
    vpermi2q        m1, m5, m2
    vpermt2q        m5, m7, m2
    jmp             m(idct_16x8_internal_8bpc).end2

; Entry points whose first transform type is identity.
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

; iidentity_16x8_internal_8bpc
;
; First pass: loads the four 64-byte coefficient vectors scaled by
; pw_2896x8, regroups dwords with shufps, then computes
;     x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384)
; with a saturating add, byte-permutes the results with int8_permA and jumps
; to the second-pass routine in tx2q.
; Second pass (.pass2): only reorders m2..m5 with permB into m0, m4, m1, m5
; and loads pw_4096 into m6; the multiply by m6 and the store are done by
; idct_16x8_internal_8bpc's .end.
cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
    vpbroadcastd    m0, [o(pw_2896x8)]
    pmulhrsw        m3, m0, [cq+64*0]
    pmulhrsw        m4, m0, [cq+64*1]
    pmulhrsw        m5, m0, [cq+64*2]
    pmulhrsw        m0, [cq+64*3]
    vpbroadcastd    m7, [o(pw_1697x16)]
    vpbroadcastd    m8, [o(pw_16384)]
    shufps          m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
    shufps          m3, m4, q3131     ; a2 a3 a6 a7 e2 e3 e6 e7
    shufps          m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
    shufps          m5, m0, q3131     ; i2 i3 i6 i7 m2 m3 m6 m7
    mova            m9, [o(int8_permA)]
    pmulhrsw        m0, m7, m2
    pmulhrsw        m1, m7, m3
    pmulhrsw        m6, m7, m4
    pmulhrsw        m7, m5
    REPX {pmulhrsw x, m8}, m0, m1, m6, m7
    paddsw          m2, m0
    paddsw          m3, m1
    paddsw          m4, m6
    paddsw          m5, m7
    REPX {vpermb x, m9, x}, m2, m3, m4, m5
    jmp             tx2q
.pass2:
    mova            m7, [o(permB)]
    vpbroadcastd    m6, [o(pw_4096)]        ; scale factor applied at .end
    vpermq          m0, m7, m2
    vpermq          m4, m7, m4
    vpermq          m1, m7, m3
    vpermq          m5, m7, m5
    jmp             m(idct_16x8_internal_8bpc).end

; NOTE(review): only the header of the next macro falls inside this line
; range; its body follows on the next source line and is not touched here.
%macro INV_TXFM_16X16_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 16x16 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd or r3d, 16 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, identity INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m7, [o(permB)] vpermq m0, m7, [cq+64*0] vpermq m1, m7, [cq+64*1] vpermq m2, m7, [cq+64*2] vpermq m3, m7, [cq+64*3] vpermq m4, m7, [cq+64*4] vpermq m5, m7, [cq+64*5] vpermq m6, m7, [cq+64*6] vpermq m7, m7, [cq+64*7] call .main vbroadcasti32x4 m12, [o(int_shuf1)] vbroadcasti32x4 m11, [o(int_shuf2)] vpbroadcastd m13, [o(pw_8192)] pshufb m0, m12 pshufb m8, m1, m11 pshufb m2, m12 pshufb m9, m3, m11 pshufb m4, m12 pshufb m10, m5, m11 pshufb m6, m12 pshufb m11, m7, m11 REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 punpckhdq m1, m0, m8 punpckldq m0, m8 punpckhdq m3, m2, m9 punpckldq m2, m9 punpckhdq m5, m4, m10 punpckldq m4, m10 punpckhdq m7, m6, m11 punpckldq m6, m11 jmp tx2q .pass2: vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 vshufi32x4 m2, m0, m4, q3131 ; 4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 vshufi32x4 m3, m1, m5, q3131 ; 6 7 vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 call .main mova m8, [o(permD)] psrlq m12, m8, 4 psrlq m9, m8, 8 psrlq m13, m8, 12 mova m10, m8 vpermi2q m8, m0, m2 ; 0 1 4 5 vpermt2q m0, m12, m2 mova m11, m9 vpermi2q m9, m1, m3 ; 2 3 6 7 vpermt2q m1, m13, m3 vpermi2q m10, m4, m6 ; 8 9 12 13 vpermt2q m4, m12, m6 vpermi2q m11, m5, m7 ; 10 11 14 15 vpermt2q m5, 
m13, m7 .end: vpbroadcastd m12, [o(pw_2048)] .end2: REPX {pmulhrsw x, m12}, m0, m1, m4, m5 .end3: REPX {pmulhrsw x, m12}, m8, m9, m10, m11 lea r3, [strideq*3] lea r4, [dstq+strideq*4] lea r5, [dstq+strideq*8] lea r6, [r4 +strideq*8] mova xm3, [dstq+strideq*0] mova xm6, [dstq+strideq*2] vinserti32x4 ym3, [dstq+strideq*1], 1 vinserti32x4 ym6, [dstq+r3 ], 1 vinserti32x4 m3, [r4+strideq*0], 2 vinserti32x4 m6, [r4+strideq*2], 2 vinserti32x4 m3, [r4+strideq*1], 3 vinserti32x4 m6, [r4+r3 ], 3 mova xm12, [r5+strideq*0] mova xm13, [r5+strideq*2] vinserti32x4 ym12, [r5+strideq*1], 1 vinserti32x4 ym13, [r5+r3 ], 1 vinserti32x4 m12, [r6+strideq*0], 2 vinserti32x4 m13, [r6+strideq*2], 2 vinserti32x4 m12, [r6+strideq*1], 3 vinserti32x4 m13, [r6+r3 ], 3 pxor m7, m7 REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklbw m2, m3, m7 punpckhbw m3, m7 paddw m0, m2 paddw m8, m3 packuswb m0, m8 punpcklbw m2, m6, m7 punpckhbw m6, m7 paddw m1, m2 paddw m9, m6 packuswb m1, m9 punpcklbw m2, m12, m7 punpckhbw m12, m7 paddw m2, m4 paddw m10, m12 packuswb m2, m10 punpcklbw m3, m13, m7 punpckhbw m13, m7 paddw m3, m5 paddw m11, m13 packuswb m3, m11 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 mova [dstq+strideq*2], xm1 vextracti32x4 [dstq+r3 ], ym1, 1 vextracti32x4 [r4+strideq*0], m0, 2 vextracti32x4 [r4+strideq*1], m0, 3 vextracti32x4 [r4+strideq*2], m1, 2 vextracti32x4 [r4+r3 ], m1, 3 mova [r5+strideq*0], xm2 vextracti32x4 [r5+strideq*1], ym2, 1 mova [r5+strideq*2], xm3 vextracti32x4 [r5+r3 ], ym3, 1 vextracti32x4 [r6+strideq*0], m2, 2 vextracti32x4 [r6+strideq*1], m2, 3 vextracti32x4 [r6+strideq*2], m3, 2 vextracti32x4 [r6+r3 ], m3, 3 RET ALIGN function_align cglobal_label .main_fast2 ; bottom three-quarters are zero vpbroadcastd m10, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 .main_fast4: vpbroadcastd m2, [o(pw_401_4076x8)] vpbroadcastd m4, [o(pw_m1189_3920x8)] vpbroadcastd m3, [o(pw_799_4017x8)] pmulhrsw m2, m8 ; t8a t15a pmulhrsw m4, m1 
; t11a t12a pmulhrsw m7, m3 ; t4a t7a pxor m6, m6 psubsw m0, m2, m4 ; t11a t12a paddsw m8, m2, m4 ; t8a t15a mova m1, m7 jmp .main5 ALIGN function_align cglobal_label .main_fast ; bottom half is zero vpbroadcastd m10, [o(pd_2048)] .main_fast3: vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 .main_fast5: vpbroadcastd m2, [o(pw_401_4076x8)] vpbroadcastd m4, [o(pw_m2598_3166x8)] vpbroadcastd m11, [o(pw_1931_3612x8)] vpbroadcastd m12, [o(pw_m1189_3920x8)] pmulhrsw m8, m2 ; t8a t15a vpbroadcastd m2, [o(pw_799_4017x8)] pmulhrsw m0, m4 ; t9a t14a vpbroadcastd m4, [o(pw_m2276_3406x8)] pmulhrsw m5, m11 ; t10a t13a pmulhrsw m1, m12 ; t11a t12a pmulhrsw m7, m2 ; t4a t7a pmulhrsw m3, m4 ; t5a t6a jmp .main4 ALIGN function_align cglobal_label .main IDCT16_1D_PACKED ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call .main_pass1 vpbroadcastd m10, [o(pw_8192_m8192)] punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 .pass1_end: REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call .main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 psrlq m12, m10, 12 psrlq m13, m10, 4 mova m9, m8 vpermi2q m8, m0, m2 ; 0 1 4 5 vpermt2q m0, m12, m2 vpermi2q m9, m1, m3 ; 2 3 6 7 vpermt2q m1, m12, m3 
vpbroadcastd m12, [o(pw_2048)] mov r3d, 0xff00ff00 mova m11, m10 vpermi2q m10, m4, m6 ; 8 9 12 13 vpermt2q m4, m13, m6 kmovd k1, r3d vpermi2q m11, m5, m7 ; 10 11 14 15 vpermt2q m5, m13, m7 pxor m7, m7 vpsubw m12{k1}, m7, m12 jmp m(idct_16x16_internal_8bpc).end2 ALIGN function_align .main_pass1: mova m4, [o(permB)] psrlq m3, m4, 4 vpermq m0, m4, [cq+64*0] vpermq m7, m3, [cq+64*7] vpermq m6, m4, [cq+64*6] vpermq m1, m3, [cq+64*1] vpermq m2, m4, [cq+64*2] vpermq m5, m3, [cq+64*5] vpermq m4, m4, [cq+64*4] vpermq m3, m3, [cq+64*3] call .main vpbroadcastd m13, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] mova m2, m10 vpdpwssd m2, m5, m13 ; -out5 mova m8, m10 vpdpwssd m8, m11, m13 ; out4 mova m9, m10 vpdpwssd m9, m5, m12 ; out10 mova m5, m10 vpdpwssd m5, m11, m12 ; -out11 mova m11, m10 vpdpwssd m11, m3, m13 ; -out7 mova m14, m10 vpdpwssd m14, m4, m13 ; out6 mova m13, m10 vpdpwssd m13, m3, m12 ; out8 vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 packssdw m2, m8 ; -out5 out4 packssdw m5, m9, m5 ; out10 -out11 packssdw m3, m11, m14 ; -out7 out6 packssdw m4, m13, m10 ; out8 -out9 ret ALIGN function_align .main_pass2: vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 vshufi32x4 m2, m0, m4, q3131 ; 4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 vshufi32x4 m3, m1, m5, q3131 ; 6 7 vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 cglobal_label .main_pass2b REPX {pshufd x, x, q1032}, m1, m3, m5, m7 call .main vpbroadcastd m8, [o(pw_2896x8)] pshufb m2, m11, m12 pshufb m5, m12 pshufb m3, m12 pshufb m4, m12 punpcklqdq m9, m5, m2 ; t15a t7 
punpckhqdq m5, m2 ; t14a t6 shufps m2, m3, m4, q1032 ; t2a t10 shufps m3, m4, q3210 ; t3a t11 psubsw m4, m2, m3 ; out8 -out9 paddsw m3, m2 ; -out7 out6 paddsw m2, m5, m9 ; -out5 out4 psubsw m5, m9 ; out10 -out11 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 ret ALIGN function_align .main: vpbroadcastd m10, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] punpckhwd m8, m7, m0 ; in14 in1 punpcklwd m0, m7 ; in0 in15 punpcklwd m7, m6, m1 ; in12 in3 punpckhwd m1, m6 ; in2 in13 punpckhwd m6, m5, m2 ; in10 in5 punpcklwd m2, m5 ; in4 in11 punpcklwd m5, m4, m3 ; in8 in7 punpckhwd m3, m4 ; in6 in9 vpcmpub k7, m13, m10, 6 ; 0x33... ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 psubsw m4, m0, m5 ; t9a t8a paddsw m0, m5 ; t1a t0a psubsw m5, m1, m6 ; t11a t10a paddsw m1, m6 ; t3a t2a psubsw m6, m2, m7 ; t13a t12a paddsw m2, m7 ; t5a t4a psubsw m7, m3, m8 ; t15a t14a paddsw m3, m8 ; t7a t6a ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 psubsw m8, m1, m3 ; t7 t6 paddsw m1, m3 ; t3 t2 psubsw m3, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m2, m5, m7 ; t14a t15a paddsw m7, m5 ; t10a t11a psubsw m5, m4, m6 ; t12a t13a paddsw m4, m6 ; t8a t9a ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 vbroadcasti32x4 m12, [o(deint_shuf)] paddsw m6, m4, m7 ; -out1 out14 psubsw m4, m7 ; t10 t11 
psubsw m11, m3, m8 ; t7 t6 paddsw m8, m3 ; out12 -out3 psubsw m3, m0, m1 ; t3a t2a paddsw m0, m1 ; -out15 out0 paddsw m1, m2, m5 ; -out13 out2 psubsw m5, m2 ; t15a t14a pshufb m0, m12 pshufb m6, m12 pshufb m8, m12 pshufb m1, m12 shufps m7, m6, m0, q1032 ; out14 -out15 shufps m0, m6, m0, q3210 ; -out1 out0 punpcklqdq m6, m8, m1 ; out12 -out13 punpckhqdq m1, m8, m1 ; -out3 out2 ret INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_16x16_internal_8bpc).main_pass1 vpbroadcastd m10, [o(pw_m8192_8192)] punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp m(iadst_16x16_internal_8bpc).pass1_end .pass2: call m(iadst_16x16_internal_8bpc).main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 psrlq m12, m10, 12 psrlq m13, m10, 4 mova m9, m8 vpermi2q m8, m7, m5 ; 0 1 4 5 vpermt2q m7, m12, m5 vpermi2q m9, m6, m4 ; 2 3 6 7 vpermt2q m6, m12, m4 vpbroadcastd m12, [o(pw_2048)] mov r3d, 0x00ff00ff mova m11, m10 vpermi2q m10, m3, m1 ; 8 9 12 13 vpermt2q m3, m13, m1 kmovd k1, r3d vpermi2q m11, m2, m0 ; 10 11 14 15 vpermt2q m2, m13, m0 pxor m0, m0 vpsubw m12{k1}, m0, m12 pmulhrsw m0, m7, m12 pmulhrsw m1, m6, m12 pmulhrsw m4, m3, m12 pmulhrsw m5, m2, m12 jmp 
m(idct_16x16_internal_8bpc).end3 INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m8, [o(int16_perm)] vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 vpbroadcastd m0, [o(pw_1697x16)] vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 pmulhrsw m9, m0, m1 pmulhrsw m10, m0, m2 pmulhrsw m11, m0, m3 pmulhrsw m12, m0, m4 pmulhrsw m13, m0, m5 pmulhrsw m14, m0, m6 pmulhrsw m15, m0, m7 pmulhrsw m0, m8 REPX {psraw x, 1}, m9, m10, m11, m12 pavgw m1, m9 pavgw m2, m10 pavgw m3, m11 pavgw m4, m12 REPX {psraw x, 1}, m13, m14, m15, m0 pavgw m5, m13 pavgw m6, m14 pavgw m7, m15 pavgw m8, m0 punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 jmp tx2q ALIGN function_align .pass2: vpbroadcastd m11, [o(pw_1697x16)] pmulhrsw m12, m11, m0 pmulhrsw m13, m11, m1 pmulhrsw m14, m11, m2 pmulhrsw m15, m11, m3 pmulhrsw m8, m11, m4 pmulhrsw m9, m11, m5 pmulhrsw m10, m11, m6 pmulhrsw m11, m7 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 paddsw m0, m12 paddsw m1, m13 paddsw m2, m14 paddsw m3, m15 paddsw m8, m4 movu m4, [o(permD+2)] paddsw m9, m5 paddsw m6, m10 paddsw m7, m11 psrlq m12, m4, 4 mova m5, m4 mova m10, m4 mova m11, m4 vpermi2q m4, m0, m2 ; 8 9 12 13 vpermt2q m0, m12, m2 ; 0 1 4 5 vpermi2q m5, m1, m3 ; 10 11 14 15 vpermt2q m1, m12, m3 ; 2 3 6 7 vpermi2q m10, m8, m6 vpermt2q m8, 
m12, m6 vpermi2q m11, m9, m7 vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end %macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] vpbroadcastd m%4, [o(pw_%5_%6x8)] punpcklwd m%1, m%3, m%3 pmulhrsw m%1, m%4 vpbroadcastd m%4, [o(pw_%7_%8x8)] punpckhwd m%2, m%3, m%3 pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly cmp eobd, 107 jb .fast mova m5, [cq+64*5] mova m3, [cq+64*3] mova m1, [cq+64*1] mova m7, [cq+64*7] mova m2, [cq+64*2] mova m6, [cq+64*6] mova m0, [cq+64*0] mova m4, [cq+64*4] call m(inv_txfm_add_dct_dct_32x8_8bpc).main mova m8, [o(idct_8x32p)] vpbroadcastd m9, [o(pw_8192)] REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 punpckldq m8, m0, m1 ; ab punpckhdq m0, m1 punpckldq m1, m2, m3 ; cd punpckhdq m2, m3 punpckldq m3, m4, m5 ; ef punpckhdq m4, m5 punpckldq m5, m6, m7 ; gh punpckhdq m6, m7 REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 punpcklqdq m20, m3, m5 punpckhqdq m16, m3, m5 punpcklqdq m19, m4, m6 punpckhqdq m17, m4, m6 vinserti32x4 ym8, ym18, xm20, 1 vshufi32x4 ym1, ym18, ym20, 0x03 vinserti32x4 ym9, ym14, xm16, 1 vshufi32x4 ym3, ym14, ym16, 0x03 vinserti32x4 ym0, ym21, xm19, 1 vshufi32x4 ym5, ym21, ym19, 0x03 vinserti32x4 ym7, ym15, xm17, 1 vshufi32x4 ym6, ym15, ym17, 0x03 call m(idct_8x16_internal_8bpc).main2 psrlq m12, [o(permB)], 60 vpermt2q m14, m12, m16 vpermt2q m21, m12, m19 vpermt2q m15, m12, m17 vpermi2q m12, m18, m20 vextracti32x8 ym16, m14, 1 vextracti32x8 ym19, m21, 1 vextracti32x8 ym17, m15, 1 vextracti32x8 ym20, m12, 1 call .main2 jmp .end .fast: ; right half is zero mova m0, [o(int16_perm)] mova ym2, [cq+64*4] vinserti32x8 m2, [cq+64*0], 1 mova ym3, [cq+64*6] vinserti32x8 m3, [cq+64*2], 1 mova ym4, [cq+64*3] vinserti32x8 m4, 
[cq+64*5], 1 mova ym5, [cq+64*7] vinserti32x8 m5, [cq+64*1], 1 REPX {vpermb x, m0, x}, m2, m3, m4, m5 call m(idct_16x8_internal_8bpc).main2 vbroadcasti32x4 m4, [o(int_shuf3)] vbroadcasti32x4 m5, [o(int_shuf4)] pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 vpbroadcastd m4, [o(pw_8192)] psrlq m5, [o(permB)], 60 punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 REPX {pmulhrsw x, m4}, m6, m17, m2, m16 vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: vpbroadcastd ym8, strided pmulld ym8, [o(gather8d)] call .main_end lea r3, [dstq+strideq*4] kxnorb k1, k1, k1 lea r4, [dstq+strideq*8] pxor m9, m9 lea r1, [r3+strideq*8] kmovb k2, k1 vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 vpgatherdq m13{k2}, [r3+ym8] kmovb k2, k1 vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 vpgatherdq m15{k2}, [r1+ym8] REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklbw m11, m12, m9 punpckhbw m12, m9 paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 vpscatterdq [r0+ym8]{k1}, m0 punpcklbw m12, m13, m9 punpckhbw m13, m9 paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 vpscatterdq [r3+ym8]{k2}, m2 punpcklbw m13, m14, m9 punpckhbw m14, m9 paddw m4, m13 paddw m5, m14 packuswb m4, m5 kmovb k2, k1 vpscatterdq [r4+ym8]{k1}, m4 punpcklbw m14, m15, m9 punpckhbw m15, m9 paddw m6, m14 paddw m7, m15 packuswb m6, m7 vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 imul 
r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align cglobal_label .main_fast2 ; bottom three-quarters are zero ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a mova m11, m12 mova m17, m20 mova m15, m21 mova m16, m14 jmp .main4 ALIGN function_align cglobal_label .main_fast ; bottom half is zero ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 punpckhwd m15, m20 ; in7 in25 punpcklwd m20, m19, m16 ; in23 in9 punpckhwd m16, m19 ; in11 in21 punpcklwd m19, m18, m17 ; in19 in13 punpckhwd m17, m18 ; in15 in17 .main2: ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a .main3: psubsw m11, m12, m17 ; t17 t30 paddsw m12, m17 ; t16 t31 psubsw m17, m15, m20 ; t18 t29 paddsw m20, m15 ; t19 t28 psubsw m15, m21, m16 ; t21 t26 paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 .main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a 
ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a vpbroadcastd m8, [o(pw_m3784_1567)] psubsw m19, m12, m20 ; t19a t28a paddsw m20, m12 ; t16a t31a psubsw m12, m14, m21 ; t20a t27a paddsw m14, m21 ; t23a t24a psubsw m21, m11, m17 ; t18 t29 paddsw m11, m17 ; t17 t30 psubsw m17, m16, m15 ; t21 t26 paddsw m16, m15 ; t22 t25 ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a vbroadcasti32x4 m18, [o(deint_shuf)] vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m9, [o(pw_2896_2896)] psubsw m15, m20, m14 ; t23 t24 paddsw m20, m14 ; t16 t31 psubsw m14, m11, m16 ; t22a t25a paddsw m11, m16 ; t17a t30a psubsw m16, m21, m17 ; t21 t26 paddsw m21, m17 ; t18 t29 psubsw m17, m19, m12 ; t20a t27a paddsw m19, m12 ; t19a t28a REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 packssdw m18, m13 ; t23a t22 packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a packssdw m14, m15 ; t27 t26a punpcklqdq m13, m19, m21 ; t19a t18 punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a INIT_ZMM avx512icl mova m15, [o(permA)] ret cglobal_label .main_end vpbroadcastd m10, [o(pw_2048)] vpermt2q m0, m15, m1 ; t0 t1 t2 t3 vpermt2q m20, m15, m19 ; t31 t30a t29 t28a vpermt2q m2, m15, m3 ; t4 t5 t6 t7 vpermt2q m14, m15, m12 ; t27 t26a t25 t24a vpermt2q m4, m15, m5 ; t8 t9 t10 t11 vpermt2q m18, m15, m16 ; t23a t22 t21a t20 vpermt2q m6, m15, m7 ; t12 t13 t14 t15 vpermt2q m13, m15, m21 ; t19a t18 t17a t16 psubsw m7, m0, m20 ; out31 out30 out29 out28 paddsw m0, m20 ; out0 out1 out2 out3 psubsw m5, m2, m14 ; out27 out26 out25 out24 paddsw m2, m14 ; out4 out5 out6 out7 psubsw m3, 
m4, m18 ; out23 out22 out21 out20 paddsw m4, m18 ; out8 out9 out10 out11 psubsw m1, m6, m13 ; out19 out18 out17 out16 paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret %macro LOAD_PACKED_16X2 3 ; dst, row[1-2] vbroadcasti32x4 ym%1, [cq+16*%2] vbroadcasti32x4 ym8, [cq+16*%3] shufpd ym%1, ym8, 0x0c %endmacro cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob %undef cmp test eobd, eobd jz .dconly lea r5, [o_base] LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 pxor m4, m4 REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 cmp eobd, 107 jb .fast LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 call m(idct_8x16_internal_8bpc).main LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 pxor m8, m8 REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 call m(inv_txfm_add_dct_dct_8x32_8bpc).main jmp .pass2 .fast: ; bottom half is zero mova ym5, ym4 mova ym6, ym4 mova ym7, ym4 call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: vpbroadcastd m10, [o(pw_8192)] vpermt2q m0, m15, m4 ; t0 t1 t9 t8 vpermt2q m20, m15, m18 ; t31 t30a t23a t22 vpermt2q m3, m15, m7 ; t7 t6 t14 t15 vpermt2q m12, m15, m21 ; t25 t24a t17a t16 vpermt2q m2, m15, m6 ; t4 t5 t13 t12 vpermt2q m14, m15, m13 ; t23a t22 t21a t20 vpermt2q m1, m15, m5 ; t3 t2 t10 t11 vpermt2q m19, m15, m16 ; t27 t26a t19a t18 psubsw m8, m0, m20 ; out31 out30 out22 out23 paddsw m0, m20 ; out0 out1 out9 out8 paddsw m6, m3, m12 ; out7 out6 out14 out15 psubsw m3, m12 ; out24 out25 out17 out16 psubsw m5, m2, m14 ; out27 out26 out18 out19 paddsw 
m4, m2, m14 ; out4 out5 out13 out12 psubsw m7, m1, m19 ; out28 out29 out21 out20 paddsw m2, m1, m19 ; out3 out2 out10 out11 vzeroupper vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] lea r3, [dstq+strideq*4] movshdup m12, [o(permD)] pmovzxbw m8, [dstq+strideq*0] pmovzxbw m9, [dstq+strideq*1] pmovzxbw m10, [dstq+strideq*2] pmovzxbw m11, [dstq+r2 ] paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 pmovzxbw m8, [r3+strideq*0] pmovzxbw m9, [r3+strideq*1] pmovzxbw m10, [r3+strideq*2] pmovzxbw m11, [r3+r2 ] paddw m4, m8 paddw m5, m9 paddw m6, m10 paddw m7, m11 packuswb m0, m1 packuswb m2, m3 vpermq m0, m12, m0 vpermq m2, m12, m2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym2 vextracti32x8 [dstq+r2 ], m2, 1 packuswb m4, m5 packuswb m6, m7 vpermq m4, m12, m4 vpermq m6, m12, m6 mova [r3+strideq*0], ym4 vextracti32x8 [r3+strideq*1], m4, 1 mova [r3+strideq*2], ym6 vextracti32x8 [r3+r2 ], m6, 1 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 .dconly3: imul r6d, 181 add r6d, 128+2048 sar r6d, 8+4 pxor m2, m2 vpbroadcastw m3, r6d .dconly_loop: mova ym1, [dstq+strideq*0] vinserti32x8 m1, [dstq+strideq*1], 1 punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg 
.dconly_loop
    RET

; Shared butterfly stage with three entry points (.main, .main2, .main3).
; Operates in place on m0-m7; the trailing comments on each line name the
; intermediate terms (t*) and the final outputs (out0..out7).
; m10 is loaded with pd_2048 at .main and handed to every ITX_MULSUB_2W
; invocation (macro defined elsewhere -- presumably its rounding constant;
; confirm against the macro definition).
; m8, m9, m11 and m12 are used as scratch/constant registers.
ALIGN function_align
cglobal_label .main
    vpbroadcastd m10, [o(pd_2048)]
.main2:
    ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
    ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a
    ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
    vpbroadcastd m11, [o(pw_2896_2896)]
    vpbroadcastd m12, [o(pw_m2896_2896)]
    ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0
.main3:
    ; entry for callers that already have t0-t7 in m0-m7 and m10-m12 set up
    paddsw m8, m1, m5 ; t4
    psubsw m1, m5 ; t5a
    paddsw m9, m7, m3 ; t7
    psubsw m7, m3 ; t6a
    ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6
    psubsw m5, m0, m2 ; dct4 out2
    paddsw m2, m0 ; dct4 out1
    paddsw m0, m4, m6 ; dct4 out0
    psubsw m4, m6 ; dct4 out3
    psubsw m6, m2, m1 ; out6
    paddsw m1, m2 ; out1
    paddsw m2, m5, m7 ; out2
    psubsw m5, m7 ; out5
    psubsw m7, m0, m9 ; out7
    paddsw m0, m9 ; out0
    paddsw m3, m4, m8 ; out3
    psubsw m4, m8 ; out4
    ret

; 8x32 identity/identity inverse transform, 8 bpc: adds the residual to dst.
;   dst    - destination pixels (r0)
;   stride - destination stride in bytes (r1)
;   c      - coefficient buffer; eight 64-byte rows are read, then zeroed
; Flow: load the 8 coefficient rows with pw_5 added (saturating), run them
; through the 64x32 function's .transpose_8x8 helper (defined elsewhere),
; arithmetic-shift right by 3, then add to the destination and store.
; Destination pixels are fetched and written with qword gathers/scatters
; whose byte offsets are 4*stride*pd_0to15[i] (presumably i = 0..7, i.e.
; every 4th row -- confirm against the pd_0to15 table), from the four base
; pointers dst + {0,1,2,3}*stride.
; AVX-512 gathers and scatters clear their write-mask as elements complete,
; so the all-ones mask is copied back and forth between k1 and k2 so that a
; fresh copy is available for each memory operation.
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
    vpbroadcastd m7, [pw_5]
    paddsw m0, m7, [cq+64*0]
    paddsw m1, m7, [cq+64*1]
    vpbroadcastd ym9, strided
    paddsw m2, m7, [cq+64*2]
    paddsw m3, m7, [cq+64*3]
    paddsw m4, m7, [cq+64*4]
    paddsw m5, m7, [cq+64*5]
    paddsw m6, m7, [cq+64*6]
    paddsw m7, [cq+64*7]
    pmulld ym14, ym9, [pd_0to15] ; per-lane stride multiples used as gather indices
    lea r3, [dstq+strideq*1]
    lea r4, [dstq+strideq*2]
    kxnorb k1, k1, k1 ; k1 = all ones (low 8 bits)
    pxor m13, m13 ; zero: used to clear cq and to widen bytes to words
    add r1, r4 ; dstq+strideq*3
    kmovb k2, k1
    vpgatherdq m9{k1}, [r0+ym14*4]
    kmovb k1, k2
    vpgatherdq m10{k2}, [r3+ym14*4]
    kmovb k2, k1
    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
    REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
    vpgatherdq m11{k1}, [r4+ym14*4]
    kmovb k1, k2
    vpgatherdq m12{k2}, [r1+ym14*4]
    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
    ; each group below: widen gathered pixels to words, add the residual,
    ; pack back to bytes with unsigned saturation, scatter to the same rows
    punpcklbw m8, m9, m13 ; 0 8 16 24
    punpckhbw m9, m13 ; 4 12 20 28
    paddw m0, m8
    paddw m4, m9
    packuswb m0, m4
    kmovb k2, k1
    vpscatterdq [r0+ym14*4]{k1}, m0
    punpcklbw m8, m10, m13 ; 1 9 17 25
    punpckhbw m10, m13 ; 5 13 21 29
    paddw m1, m8
    paddw m5, m10
    packuswb m1, m5
    kmovb k1, k2
    vpscatterdq [r3+ym14*4]{k2}, m1
    punpcklbw m8, m11, m13 ; 2 10 18 26
    punpckhbw m11, m13 ; 6 14 22 30
    paddw m2, m8
    paddw m6, m11
    packuswb m2, m6
    kmovb k2, k1
    vpscatterdq [r4+ym14*4]{k1}, m2
    punpcklbw m8, m12, m13 ; 3 11 19 27
    punpckhbw m12, m13 ; 7 15 23 31
    paddw m3, m8
    paddw m7, m12
    packuswb m3, m7
    vpscatterdq [r1+ym14*4]{k2}, m3
    RET

; 32x8 identity/identity inverse transform, 8 bpc: adds the residual to dst.
;   dst    - destination pixels, 8 rows of 32 bytes
;   stride - destination stride in bytes
;   c      - coefficient buffer; eight 64-byte rows are read, then zeroed
; Flow: scale every coefficient row with pmulhrsw by pw_4096, transpose
; using dword unpacks + a byte permute (int8_permA table, defined elsewhere)
; + qword unpacks, then add to the destination rows and store with unsigned
; saturation.
; Destination rows are paired per zmm register as (0,2), (1,3), (4,6), (5,7):
; the low 256 bits hold the first row of the pair, the high 256 bits the
; second. r3 = 3*stride, r4 = dst + 4*stride.
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
    vpbroadcastd m0, [pw_4096]
    pmulhrsw m3, m0, [cq+64*0]
    pmulhrsw m4, m0, [cq+64*4]
    pmulhrsw m6, m0, [cq+64*1]
    pmulhrsw m5, m0, [cq+64*5]
    pmulhrsw m7, m0, [cq+64*2]
    pmulhrsw m2, m0, [cq+64*6]
    pmulhrsw m8, m0, [cq+64*3]
    pmulhrsw m0, [cq+64*7]
    mova m13, [int8_permA]
    lea r3, [strideq*3]
    lea r4, [dstq+strideq*4]
    punpckldq m1, m3, m4
    punpckhdq m3, m4
    punpckldq m4, m6, m5
    punpckhdq m6, m5
    punpckldq m5, m7, m2
    punpckhdq m7, m2
    punpckldq m2, m8, m0
    punpckhdq m8, m0
    ; load the eight destination rows while the transpose is in flight
    mova ym9, [dstq+strideq*0]
    vinserti32x8 m9, [dstq+strideq*2], 1
    mova ym10, [dstq+strideq*1]
    vinserti32x8 m10, [dstq+r3 ], 1
    mova ym11, [r4+strideq*0]
    vinserti32x8 m11, [r4+strideq*2], 1
    mova ym12, [r4+strideq*1]
    vinserti32x8 m12, [r4+r3 ], 1
    REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
    pxor m13, m13 ; m13 is reused as zero from here on
    REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
    punpcklqdq m0, m1, m4 ; a0 a2 c0 c2
    punpckhqdq m1, m4 ; b0 b2 d0 d2
    punpcklqdq m4, m5, m2 ; a1 a3 c1 c3
    punpckhqdq m5, m2 ; b1 b3 d1 d3
    punpcklqdq m2, m3, m6 ; e0 e2 g0 g2
    punpckhqdq m3, m6 ; f0 f2 h0 h2
    punpcklqdq m6, m7, m8 ; e1 e3 g1 g3
    punpckhqdq m7, m8 ; f1 f3 h1 h3
    ; rows 0 and 2
    punpcklbw m8, m9, m13
    punpckhbw m9, m13
    paddw m0, m8
    paddw m4, m9
    packuswb m0, m4
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*2], m0, 1
    ; rows 1 and 3
    punpcklbw m8, m10, m13
    punpckhbw m10, m13
    paddw m1, m8
    paddw m5, m10
    packuswb m1, m5
    mova [dstq+strideq*1], ym1
    vextracti32x8 [dstq+r3 ], m1, 1
    ; rows 4 and 6
    punpcklbw m8, m11, m13
    punpckhbw m11, m13
    paddw m2, m8
    paddw m6, m11
    packuswb m2, m6
    mova [r4+strideq*0], ym2
    vextracti32x8 [r4+strideq*2], m2, 1
    ; rows 5 and 7
    punpcklbw m8, m12, m13
    punpckhbw m12, m13
    paddw m3, m8
    paddw m7, m12
    packuswb m3, m7
    mova [r4+strideq*1], ym3
    vextracti32x8 [r4+r3 ], m3, 1
    RET

%macro IDCT_16x32_END 3 ; src[1-2],
row mova xm8, [dstq+strideq*0] vinserti32x4 ym8, [dstq+strideq*1], 1 mova xm9, [dstq+r3 ] vinserti32x4 ym9, [dstq+strideq*2], 1 pmulhrsw m%1, m10 pmulhrsw m%2, m10 vpermb m8, m11, m8 vpermb m9, m11, m9 mova [cq+64*(%3*2+0)], m13 mova [cq+64*(%3*2+1)], m13 paddw m8, m%1 paddw m9, m%2 packuswb m8, m9 vpermd m8, m12, m8 mova [dstq+strideq*0], xm8 vextracti32x4 [dstq+strideq*1], ym8, 1 vextracti32x4 [dstq+strideq*2], m8, 2 vextracti32x4 [dstq+r3 ], m8, 3 %if %1 != 20 lea dstq, [dstq+strideq*4] %endif %endmacro cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m15, [o(pw_2896x8)] cmp eobd, 151 jb .fast pmulhrsw m5, m15, [cq+64*10] pmulhrsw m3, m15, [cq+64* 6] pmulhrsw m1, m15, [cq+64* 2] pmulhrsw m7, m15, [cq+64*14] pmulhrsw m2, m15, [cq+64* 4] pmulhrsw m6, m15, [cq+64*12] pmulhrsw m0, m15, [cq+64* 0] pmulhrsw m4, m15, [cq+64* 8] call m(inv_txfm_add_dct_dct_32x8_8bpc).main pmulhrsw m14, m15, [cq+64* 1] pmulhrsw m21, m15, [cq+64*15] pmulhrsw m18, m15, [cq+64* 9] pmulhrsw m17, m15, [cq+64* 7] pmulhrsw m16, m15, [cq+64* 5] pmulhrsw m19, m15, [cq+64*11] pmulhrsw m20, m15, [cq+64*13] pmulhrsw m15, [cq+64* 3] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova m8, [o(idct_16x32p)] vpbroadcastd m9, [o(pw_16384)] REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ m14, m15, m16, m17, m18, m19, m20, m21 punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m1, m2, m3 punpckhdq m2, m3 REPX {pmulhrsw x, m9}, m8, m0, m1, m2 punpckldq m3, m4, m5 punpckhdq m4, m5 punpckldq m5, m6, m7 punpckhdq m6, m7 REPX {pmulhrsw x, m9}, m3, m4, m5, m6 punpckldq m7, m14, m15 punpckhdq m14, m15 punpckldq m15, m16, m17 punpckhdq m16, m17 REPX {pmulhrsw x, m9}, m7, m14, m15, m16 punpckldq m17, m18, m19 punpckhdq m18, m19 punpckldq m19, m20, m21 punpckhdq m20, m21 REPX {pmulhrsw x, m9}, m17, m18, m19, m20 punpcklqdq m21, m8, m1 punpckhqdq m8, m1 punpcklqdq m1, m0, m2 punpckhqdq m0, m2 punpcklqdq m2, m3, m5 
punpckhqdq m3, m5 punpcklqdq m5, m4, m6 punpckhqdq m4, m6 punpcklqdq m6, m7, m15 punpckhqdq m7, m15 punpcklqdq m15, m14, m16 punpckhqdq m14, m16 punpcklqdq m16, m17, m19 punpckhqdq m17, m19 punpcklqdq m19, m18, m20 punpckhqdq m18, m20 vinserti32x8 m20, m21, ym2, 1 vshufi32x4 m21, m2, q3232 vinserti32x8 m2, m8, ym3, 1 vshufi32x4 m8, m3, q3232 vinserti32x8 m3, m1, ym5, 1 vshufi32x4 m1, m5, q3232 vinserti32x8 m5, m0, ym4, 1 vshufi32x4 m0, m4, q3232 vinserti32x8 m4, m6, ym16, 1 vshufi32x4 m6, m16, q3232 vinserti32x8 m16, m7, ym17, 1 vshufi32x4 m7, m17, q3232 vinserti32x8 m17, m15, ym19, 1 vshufi32x4 m15, m19, q3232 vinserti32x8 m19, m14, ym18, 1 vshufi32x4 m14, m18, q3232 vshufi32x4 m18, m21, m6, q3131 ; 27 5 vshufi32x4 m21, m6, q2020 ; 31 1 vshufi32x4 m6, m8, m7, q2020 ; 24 8 vshufi32x4 m8, m7, q3131 ; 30 2 vshufi32x4 m7, m1, m15, q2020 ; 28 4 vshufi32x4 m1, m15, q3131 ; 6 26 vshufi32x4 m15, m0, m14, q2020 ; 7 25 vshufi32x4 m0, m14, q3131 ; 14 18 vshufi32x4 m14, m20, m4, q2020 ; 3 29 vshufi32x4 m20, m4, q3131 ; 23 9 vshufi32x4 m9, m3, m17, q2020 ; 16 0 vshufi32x4 m3, m17, q3131 ; 12 20 vshufi32x4 m17, m5, m19, q2020 ; 15 17 vshufi32x4 m5, m19, q3131 ; 22 10 vshufi32x4 m19, m2, m16, q2020 ; 19 13 vshufi32x4 m16, m2, m16, q3131 ; 11 21 call m(idct_16x16_internal_8bpc).main3 call .main_oddhalf jmp .pass2 .fast: ; right half is zero mova ym8, [cq+64*15] vinserti32x8 m8, [cq+64* 1], 1 mova m2, [o(int16_perm)] mova ym9, [cq+64* 8] vinserti32x8 m9, [cq+64* 0], 1 mova ym0, [cq+64* 7] vinserti32x8 m0, [cq+64* 9], 1 mova ym7, [cq+64*14] vinserti32x8 m7, [cq+64* 2], 1 mova ym1, [cq+64* 3] vinserti32x8 m1, [cq+64*13], 1 mova ym3, [cq+64* 6] vinserti32x8 m3, [cq+64*10], 1 mova ym5, [cq+64*11] vinserti32x8 m5, [cq+64* 5], 1 mova ym6, [cq+64*12] vinserti32x8 m6, [cq+64* 4], 1 REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 call m(idct_16x16_internal_8bpc).main2 vbroadcasti32x4 m8, [o(int_shuf3)] vbroadcasti32x4 m9, 
[o(int_shuf4)] vpbroadcastd m11, [o(pw_16384)] pshufb m0, m8 pshufb m1, m9 pshufb m2, m8 pshufb m3, m9 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 pshufb m4, m8 pshufb m5, m9 pshufb m6, m8 pshufb m7, m9 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 punpckhdq m17, m0, m1 punpckldq m0, m1 punpckhdq m16, m2, m3 punpckldq m2, m3 punpckhdq m18, m4, m5 punpckldq m4, m5 punpckhdq m5, m6, m7 punpckldq m6, m7 vinserti32x8 m1, m0, ym2, 1 vshufi32x4 m3, m0, m2, q3232 vinserti32x8 m2, m4, ym6, 1 vshufi32x4 m4, m6, q3232 vinserti32x8 m15, m17, ym16, 1 vshufi32x4 m17, m16, q3232 vinserti32x8 m16, m18, ym5, 1 vshufi32x4 m18, m5, q3232 vshufi32x4 m0, m1, m2, q2020 ; 0 2 vshufi32x4 m1, m2, q3131 ; 4 6 vshufi32x4 m2, m3, m4, q2020 ; 8 10 vshufi32x4 m3, m4, q3131 ; 12 14 vshufi32x4 m14, m15, m16, q2020 ; 1 3 vshufi32x4 m15, m16, q3131 ; 5 7 vshufi32x4 m16, m17, m18, q2020 ; 9 11 vshufi32x4 m17, m18, q3131 ; 13 15 pxor m6, m6 punpckhwd m8, m0, m0 punpcklwd m9, m6, m0 punpckhwd m0, m3, m3 punpckhwd m5, m2, m2 punpcklwd m7, m1, m1 punpckhwd m1, m1 punpcklwd m3, m3 punpcklwd m6, m2 call m(idct_16x16_internal_8bpc).main_fast5 punpcklwd m21, m14, m14 punpckhwd m14, m14 punpcklwd m18, m15, m15 punpckhwd m15, m15 punpcklwd m20, m16, m16 punpckhwd m16, m16 punpcklwd m19, m17, m17 punpckhwd m17, m17 call .main_oddhalf_fast .pass2: vpbroadcastd m10, [o(pw_2048)] mova m11, [o(end_16x32p)] lea r3, [strideq*3] pxor m13, m13 psrld m12, m11, 8 IDCT_16x32_END 0, 1, 0 IDCT_16x32_END 2, 3, 1 IDCT_16x32_END 4, 5, 2 IDCT_16x32_END 6, 7, 3 IDCT_16x32_END 14, 15, 4 IDCT_16x32_END 16, 17, 5 IDCT_16x32_END 18, 19, 6 IDCT_16x32_END 20, 21, 7 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m20, [o(pw_m1380_3857x8)] vpbroadcastd m9, [o(pw_995_3973x8)] vpbroadcastd m16, [o(pw_m601_4052x8)] pmulhrsw m21, m8 ; 
t16a, t31a pmulhrsw m20, m15 ; t19a, t28a pmulhrsw m18, m9 ; t20a, t27a pmulhrsw m14, m16 ; t23a, t24a mova m8, m21 mova m17, m20 mova m15, m18 mova m16, m14 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m9, [o(pw_m2751_3035x8)] vpbroadcastd m11, [o(pw_1751_3703x8)] vpbroadcastd m12, [o(pw_m1380_3857x8)] pmulhrsw m21, m8 ; t16a, t31a vpbroadcastd m8, [o(pw_995_3973x8)] pmulhrsw m17, m9 ; t17a, t30a vpbroadcastd m9, [o(pw_m2106_3513x8)] pmulhrsw m20, m11 ; t18a, t29a vpbroadcastd m11, [o(pw_2440_3290x8)] pmulhrsw m15, m12 ; t19a, t28a vpbroadcastd m12, [o(pw_m601_4052x8)] pmulhrsw m18, m8 ; t20a, t27a pmulhrsw m16, m9 ; t21a, t26a pmulhrsw m19, m11 ; t22a, t25a pmulhrsw m14, m12 ; t23a, t24a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a .main2: psubsw m8, m21, m17 ; t17 t30 paddsw m21, m17 ; t16 t31 psubsw m17, m15, m20 ; t18 t29 paddsw m20, m15 ; t19 t28 psubsw m15, m18, m16 ; t21 t26 paddsw m18, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 .main3: ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a vpbroadcastd m11, [o(pw_m3784_1567)] psubsw m19, m21, m20 ; t19a t28a paddsw m21, m20 ; t16a t31a psubsw m20, m14, m18 ; t20a t27a paddsw m14, m18 ; t23a t24a psubsw m18, m8, m17 ; t18 t29 paddsw m8, m17 ; t17 t30 psubsw m17, m16, m15 ; 
t21 t26 paddsw m15, m16 ; t22 t25 ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a vbroadcasti32x4 m9, [o(deint_shuf)] psubsw m16, m21, m14 ; t23 t24 paddsw m14, m21 ; t16 t31 psubsw m21, m8, m15 ; t22a t25a paddsw m15, m8 ; t17a t30a psubsw m8, m18, m17 ; t21 t26 paddsw m18, m17 ; t18 t29 paddsw m17, m19, m20 ; t19a t28a psubsw m19, m20 ; t20a t27a vpbroadcastd m11, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] REPX {pshufb x, m9}, m14, m15, m18, m17 mova m9, m10 vpdpwssd m9, m16, m11 mova m20, m10 vpdpwssd m20, m21, m11 psrad m9, 12 psrad m20, 12 packssdw m9, m20 ; t23a t22 mova m20, m10 vpdpwssd m20, m16, m12 mova m16, m10 vpdpwssd m16, m21, m12 psrad m20, 12 psrad m16, 12 packssdw m16, m20, m16 ; t24a t25 ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 packssdw m11, m20 ; t27 t26a packssdw m8, m21 ; t20 t21a punpcklqdq m20, m14, m15 ; t16 t17a punpckhqdq m14, m15 ; t31 t30a punpckhqdq m15, m17, m18 ; t28a t29 punpcklqdq m17, m18 ; t19a t18 psubsw m21, m0, m14 ; out31 out30 paddsw m0, m14 ; out0 out1 psubsw m14, m7, m20 ; out16 out17 paddsw m7, m20 ; out15 out14 psubsw m20, m1, m15 ; out28 out29 paddsw m1, m15 ; out3 out2 psubsw m15, m6, m17 ; out19 out18 paddsw m6, m17 ; out12 out13 psubsw m17, m4, m9 ; out23 out22 paddsw m4, m9 ; out8 out9 psubsw m18, m3, m16 ; out24 out25 paddsw m3, m16 ; out7 out6 psubsw m16, m5, m8 ; out20 out21 paddsw m5, m8 ; out11 out10 psubsw m19, m2, m11 ; out27 out26 paddsw m2, m11 ; out4 out5 ret cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly mova m21, [o(permB)] vpermq m1, m21, [cq+64* 0] ; 0 1 vpermq m14, m21, [cq+64* 1] ; 2 3 vpermq m20, m21, [cq+64* 2] ; 4 5 vpermq m15, m21, [cq+64* 3] ; 6 7 
vpbroadcastd m8, [o(pw_2896x8)] vpermq m2, m21, [cq+64* 4] ; 8 9 vpermq m16, m21, [cq+64* 5] ; 10 11 vpermq m3, m21, [cq+64* 6] ; 12 13 vpermq m17, m21, [cq+64* 7] ; 14 15 REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 pxor m12, m12 REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 cmp eobd, 151 jb .fast vpermq m9, m21, [cq+64* 8] ; 16 17 vpermq m19, m21, [cq+64* 9] ; 18 19 vpermq m4, m21, [cq+64*10] ; 20 21 vpermq m5, m21, [cq+64*11] ; 22 23 vpermq m6, m21, [cq+64*12] ; 24 25 vpermq m18, m21, [cq+64*13] ; 26 27 vpermq m7, m21, [cq+64*14] ; 28 29 vpermq m21, m21, [cq+64*15] ; 30 31 REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 punpcklwd m8, m21, m14 ; 30 2 punpckhwd m21, m1 ; 31 1 punpcklwd m0, m17, m19 ; 14 18 punpckhwd m17, m9 ; 15 17 punpcklwd m9, m1 ; 16 0 punpckhwd m14, m7 ; 3 29 punpcklwd m1, m15, m18 ; 6 26 punpckhwd m15, m6 ; 7 25 punpcklwd m6, m2 ; 24 8 punpckhwd m19, m3 ; 19 13 punpcklwd m3, m4 ; 12 20 punpckhwd m18, m20 ; 27 5 punpcklwd m7, m20 ; 28 4 punpckhwd m20, m5, m2 ; 23 9 punpcklwd m5, m16 ; 22 10 punpckhwd m16, m4 ; 11 21 call m(idct_16x16_internal_8bpc).main2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf jmp .pass2 .fast: ; bottom half zero punpcklwd m8, m14, m14 ; 2 punpcklwd m0, m17, m17 ; 14 punpcklwd m5, m16, m16 ; 10 punpcklwd m9, m12, m1 ; __ 0 punpckhwd m21, m1, m1 ; 1 punpcklwd m1, m15, m15 ; 6 punpcklwd m7, m20, m20 ; 4 punpckhwd m19, m3, m3 ; 13 punpcklwd m3, m3 ; 12 punpcklwd m6, m12, m2 ; __ 8 punpckhwd m18, m20, m20 ; 5 punpckhwd m20, m2, m2 ; 9 call m(idct_16x16_internal_8bpc).main_fast punpckhwd m15, m15 ; 7 punpckhwd m14, m14 ; 3 punpckhwd m16, m16 ; 11 punpckhwd m17, m17 ; 15 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast .pass2: vpbroadcastd m9, [o(pw_16384)] call .transpose_round vshufi32x4 m16, m14, m2, q3131 ; 5 vshufi32x4 m14, m2, q2020 ; 1 vshufi32x4 m2, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m1, m18, 
q3131 ; 6 vshufi32x4 m1, m18, q2020 ; 2 vshufi32x4 m18, m20, m6, q2020 ; 9 vshufi32x4 m20, m6, q3131 ; 13 vshufi32x4 m6, m21, m4, q3131 ; 12 vshufi32x4 m4, m21, m4, q2020 ; 8 vshufi32x4 m21, m19, m7, q3131 ; 15 vshufi32x4 m19, m7, q2020 ; 11 vshufi32x4 m7, m5, m15, q3131 ; 14 vshufi32x4 m5, m15, q2020 ; 10 vshufi32x4 m15, m17, m9, q2020 ; 3 vshufi32x4 m17, m9, q3131 ; 7 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 call .main_oddhalf vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r2, [strideq*3] pmovzxbw m8, [dstq+strideq*0] pmovzxbw m9, [dstq+strideq*1] pmovzxbw m10, [dstq+strideq*2] pmovzxbw m11, [dstq+r2 ] REPX {pmulhrsw x, m12}, m0, m1, m2, m3 lea r3, [dstq+strideq*4] paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 pmovzxbw m8, [r3+strideq*0] pmovzxbw m9, [r3+strideq*1] pmovzxbw m10, [r3+strideq*2] pmovzxbw m11, [r3+r2 ] REPX {pmulhrsw x, m12}, m4, m5, m6, m7 lea r4, [dstq+strideq*8] packuswb m0, m1 paddw m4, m8 paddw m5, m9 packuswb m2, m3 paddw m6, m10 paddw m7, m11 pmovzxbw m8, [r4+strideq*0] pmovzxbw m9, [r4+strideq*1] pmovzxbw m10, [r4+strideq*2] pmovzxbw m11, [r4+r2 ] REPX {pmulhrsw x, m12}, m14, m15, m16, m17 lea r5, [r3+strideq*8] packuswb m4, m5 paddw m14, m8 paddw m15, m9 packuswb m6, m7 paddw m16, m10 paddw m17, m11 pmovzxbw m8, [r5+strideq*0] pmovzxbw m9, [r5+strideq*1] pmovzxbw m10, [r5+strideq*2] pmovzxbw m11, [r5+r2 ] REPX {pmulhrsw x, m12}, m18, m19, m20, m21 packuswb m14, m15 paddw m18, m8 paddw m19, m9 packuswb m16, m17 paddw m20, m10 paddw m21, m11 packuswb m18, m19 packuswb m20, m21 REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym2 vextracti32x8 [dstq+r2 ], m2, 1 mova [r3+strideq*0], ym4 vextracti32x8 [r3+strideq*1], m4, 1 mova [r3+strideq*2], ym6 vextracti32x8 [r3+r2 ], m6, 1 mova [r4+strideq*0], ym14 vextracti32x8 [r4+strideq*1], m14, 1 mova [r4+strideq*2], ym16 vextracti32x8 [r4+r2 ], m16, 1 mova [r5+strideq*0], ym18 
vextracti32x8 [r5+strideq*1], m18, 1 mova [r5+strideq*2], ym20 vextracti32x8 [r5+r2 ], m20, 1 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 16 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero vpbroadcastd m8, [o(pw_2896x8)] vpbroadcastd m4, [o(pw_4076x8)] vpbroadcastd m3, [o(pw_401x8)] pmulhrsw m8, m0 ; t0 pmulhrsw m4, m14 ; t15a pmulhrsw m3, m14 ; t8a punpcklwd m9, m3, m4 punpckhwd m5, m3, m4 mova m2, m10 vpdpwssd m2, m9, [o(pw_m3784_1567)] {bcstd} mova m1, m10 vpdpwssd m1, m5, [o(pw_m3784_1567)] {bcstd} mova m6, m10 vpdpwssd m6, m5, [o(pw_1567_3784)] {bcstd} mova m5, m10 vpdpwssd m5, m9, [o(pw_1567_3784)] {bcstd} vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] psubsw m21, m8, m4 ; out15 paddsw m0, m8, m4 ; out0 psubsw m14, m8, m3 ; out8 paddsw m7, m8, m3 ; out7 REPX {psrad x, 12}, m2, m1, m6, m5 packssdw m2, m1 ; t9a packssdw m5, m6 ; t14a ITX_MULSUB_2W 4, 3, 16, 17, 10, 11, 12 ; t11, t12 psubsw m20, m8, m5 ; out14 paddsw m1, m8, m5 ; out1 psubsw m15, m8, m2 ; out9 paddsw m6, m8, m2 ; out6 ITX_MULSUB_2W 5, 2, 16, 17, 10, 11, 12 ; t10a, t13a psubsw m18, m8, m3 ; out12 paddsw m3, m8 ; out3 psubsw m17, m8, m4 ; out11 paddsw m4, m8 ; out4 psubsw m19, m8, m2 ; out13 paddsw m2, m8 ; out2 psubsw m16, m8, m5 ; out10 paddsw m5, m8 ; out5 ret cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m2, [o(pw_4017x8)] vpbroadcastd m3, [o(pw_799x8)] vpbroadcastd m18, [o(pw_4076x8)] vpbroadcastd m19, [o(pw_401x8)] vpbroadcastd m20, [o(pw_m1189x8)] vpbroadcastd m16, [o(pw_3920x8)] pmulhrsw m9, m0 ; t0 pmulhrsw m2, m1 ; t7a pmulhrsw m1, m3 ; t4a pmulhrsw m18, m14 ; t15a pmulhrsw m14, m19 ; t8a pmulhrsw m20, m15 ; t11a pmulhrsw m15, m16 ; t12a psubsw m7, m9, m2 ; idct8 out7 paddsw m0, m9, m2 ; idct8 out0 
psubsw m4, m9, m1 ; idct8 out4 paddsw m3, m9, m1 ; idct8 out3 ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 mova m21, m18 mova m19, m14 mova m16, m15 mova m8, m20 psubsw m6, m9, m1 ; idct8 out6 paddsw m1, m9 ; idct8 out1 psubsw m5, m9, m2 ; idct8 out5 paddsw m2, m9 ; idct8 out2 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m5, [o(pw_m2276x8)] vpbroadcastd m11, [o(pw_3406x8)] vpbroadcastd m7, [o(pw_4017x8)] vpbroadcastd m12, [o(pw_799x8)] vpbroadcastd m6, [o(pw_3784x8)] vpbroadcastd m10, [o(pw_1567x8)] vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m5, m3 ; t5a pmulhrsw m3, m11 ; t6a pmulhrsw m7, m1 ; t7a pmulhrsw m1, m12 ; t4a pmulhrsw m6, m2 ; t3 pmulhrsw m2, m10 ; t2 pmulhrsw m4, m0 ; t0 vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] vpbroadcastd m10, [o(pd_2048)] mova m0, m4 ; t1 call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 vpbroadcastd m21, [o(pw_4076x8)] vpbroadcastd m8, [o(pw_401x8)] vpbroadcastd m18, [o(pw_m2598x8)] vpbroadcastd m9, [o(pw_3166x8)] vpbroadcastd m19, [o(pw_3612x8)] vpbroadcastd m11, [o(pw_1931x8)] vpbroadcastd m20, [o(pw_m1189x8)] vpbroadcastd m12, [o(pw_3920x8)] pmulhrsw m21, m14 ; t15a pmulhrsw m14, m8 ; t8a pmulhrsw m18, m17 ; t9a pmulhrsw m17, m9 ; t14a pmulhrsw m19, m16 ; t13a pmulhrsw m16, m11 ; t10a pmulhrsw m20, m15 ; t11a pmulhrsw m15, m12 ; t12a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a .main2: paddsw m8, m20, m16 ; t11 psubsw m20, m16 ; t10 paddsw m16, m15, m19 ; t12 psubsw m15, m19 ; t13 psubsw m19, m14, m18 ; t9 paddsw m14, m18 ; t8 psubsw m18, m21, m17 ; t14 paddsw m21, m17 ; t15 .main3: vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a 
vpbroadcastd m11, [o(pw_m1567_m3784)] ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] psubsw m17, m14, m8 ; t11a paddsw m8, m14 ; t8a paddsw m14, m18, m15 ; t9 psubsw m18, m15 ; t10 psubsw m15, m19, m20 ; t13 paddsw m19, m20 ; t14 paddsw m20, m21, m16 ; t15a psubsw m16, m21, m16 ; t12a ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 psubsw m21, m0, m20 ; out15 paddsw m0, m20 ; out0 psubsw m20, m1, m19 ; out14 paddsw m1, m19 ; out1 psubsw m19, m2, m18 ; out13 paddsw m2, m18 ; out2 psubsw m18, m3, m17 ; out12 paddsw m3, m17 ; out3 psubsw m17, m4, m16 ; out11 paddsw m4, m16 ; out4 psubsw m16, m5, m15 ; out10 paddsw m5, m15 ; out5 psubsw m15, m6, m14 ; out9 paddsw m6, m14 ; out6 psubsw m14, m7, m8 ; out8 paddsw m7, m8 ; out7 ret .transpose_round: punpcklwd m8, m0, m2 punpckhwd m0, m2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 punpcklwd m7, m14, m16 punpckhwd m14, m16 punpcklwd m16, m15, m17 punpckhwd m15, m17 punpcklwd m17, m19, m21 punpckhwd m19, m21 punpckhwd m21, m18, m20 punpcklwd m18, m20 punpcklwd m20, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 REPX {pmulhrsw x, m9}, m20, m8, m1, m0 punpcklwd m6, m7, m15 punpckhwd m7, m15 punpcklwd m15, m14, m16 punpckhwd m14, m16 REPX {pmulhrsw x, m9}, m2, m3, m5, m4 punpckhwd m16, m18, m19 punpcklwd m18, m19 punpcklwd m19, m21, m17 punpckhwd m21, m17 REPX {pmulhrsw x, m9}, m6, m7, m15, m14 punpcklwd m17, m8, m0 ; a2 a6 aa ae punpckhwd m8, m0 ; a3 a7 ab af punpcklwd m0, m20, m1 ; a0 a4 a8 ac punpckhwd m20, m1 ; a1 a5 a9 ad REPX {pmulhrsw x, m9}, m16, m18, m19, m21 punpcklwd m1, m2, m5 ; b0 b4 b8 bc punpckhwd m2, m5 ; b1 b5 b9 bd punpcklwd m5, m3, m4 ; b2 b6 ba be punpckhwd m3, m4 ; b3 b7 bb bf punpcklwd m4, m6, m15 ; c0 c4 c8 cc 
punpckhwd m6, m15 ; c1 c5 c9 cd
    punpcklwd m15, m7, m14 ; c2 c6 ca ce
    punpckhwd m7, m14 ; c3 c7 cb cf
    punpcklwd m14, m18, m19 ; d0 d4 d8 dc
    punpckhwd m18, m19 ; d1 d5 d9 dd
    punpcklwd m9, m16, m21 ; d2 d6 da de
    punpckhwd m16, m21 ; d3 d7 db df
    vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
    vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
    vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
    vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
    vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
    vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
    vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
    vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
    vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
    vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
    vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
    vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
    vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
    vshufi32x4 m15, m9, q3232 ; ca ce da de
    vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
    vshufi32x4 m7, m16, q3232 ; cb cf db df
    ret

; IDTX_16x32 a, b, c, d
; Loads four coefficient rows and applies the first-pass identity scaling.
; Each argument is used both as the cq row index ([cq+64*N]) and as the
; destination register number (mN).
; Expects: m15, m16, m17 = multipliers set up by the caller (the function
;          below loads pw_2896x8, pw_1697x16 and pw_16384 respectively).
; Result:  x = pmulhrsw(coef, m15); out = x + pmulhrsw(pmulhrsw(x, m16), m17)
;          (saturating add).
; Clobbers m18-m21.
%macro IDTX_16x32 4 ; src/dst[1-4]
    pmulhrsw m%1, m15, [cq+64*%1]
    pmulhrsw m%2, m15, [cq+64*%2]
    pmulhrsw m%3, m15, [cq+64*%3]
    pmulhrsw m%4, m15, [cq+64*%4]
    pmulhrsw m18, m16, m%1
    pmulhrsw m19, m16, m%2
    pmulhrsw m20, m16, m%3
    pmulhrsw m21, m16, m%4
    REPX {pmulhrsw x, m17}, m18, m19, m20, m21
    paddsw m%1, m18
    paddsw m%2, m19
    paddsw m%3, m20
    paddsw m%4, m21
%endmacro

; IDTX_16x32_STORE lo, hi
; Adds registers m<lo>/m<hi> to four 16-byte destination rows and stores the
; result with unsigned saturation.
; Expects: r3 = 2*stride and r4 = 3*stride, so the four rows accessed are
;          dstq + {0, 8, 16, 24}*stride; m18 = 0.
; Also zeroes coefficient rows lo*2 and lo*2+1 of cq.
; dstq advances by one row after every invocation except the last (lo == 7).
; Clobbers m16 and m17.
%macro IDTX_16x32_STORE 2 ; src[1-2]
    mova xm17, [dstq+r3*0]
    vinserti128 ym17, [dstq+r3*4], 1
    vinserti32x4 m17, [dstq+r3*8], 2
    vinserti32x4 m17, [dstq+r4*8], 3
    mova [cq+64*(%1*2+0)], m18
    mova [cq+64*(%1*2+1)], m18
    punpcklbw m16, m17, m18 ; widen pixels to words (m18 is zero)
    punpckhbw m17, m18
    paddw m16, m%1
    paddw m17, m%2
    packuswb m16, m17
    mova [dstq+r3*0], xm16
    vextracti128 [dstq+r3*4], ym16, 1
    vextracti32x4 [dstq+r3*8], m16, 2
    vextracti32x4 [dstq+r4*8], m16, 3
%if %1 != 7
    add dstq, strideq
%endif
%endmacro

; 16x32 identity/identity inverse transform, 8 bpc: adds the residual to dst.
;   dst    - destination pixels
;   stride - destination stride in bytes
;   c      - coefficient buffer; sixteen 64-byte rows are read, then zeroed
; Flow: scale all 16 coefficient rows (IDTX_16x32), transpose them as two
; groups of eight registers with a pmulhrsw by pw_8192 folded in
; (.transpose_2x8x8_round), then add to the destination (IDTX_16x32_STORE).
cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
    vpbroadcastd m15, [pw_2896x8]
    vpbroadcastd m16, [pw_1697x16]
    vpbroadcastd m17, [pw_16384]
    IDTX_16x32 0, 1, 2, 3
    IDTX_16x32 4, 5, 6, 7
    IDTX_16x32 8, 9, 10, 11
    IDTX_16x32 12, 13, 14, 15
    vpbroadcastd m16, [pw_8192] ; multiplier consumed inside the transpose
    call .transpose_2x8x8_round
    lea r3, [strideq*2]
    lea r4, [strideq*3]
    pxor m18, m18
    IDTX_16x32_STORE 0, 8
    IDTX_16x32_STORE 1, 9
    IDTX_16x32_STORE 2, 10
    IDTX_16x32_STORE 3, 11
    IDTX_16x32_STORE 4, 12
    IDTX_16x32_STORE 5, 13
    IDTX_16x32_STORE 6, 14
    IDTX_16x32_STORE 7, 15
    RET

; Word -> dword -> qword unpack transpose of two independent register groups,
; m0-m7 and then m8-m15, in place. Between the dword and qword stages every
; register is multiplied by m16 with pmulhrsw, so the caller selects the
; rounding/scale factor by loading m16 beforehand.
; m17 is used as a temporary. Also called from the 32x16 identity function.
ALIGN function_align
.transpose_2x8x8_round:
    punpckhwd m17, m4, m5
    punpcklwd m4, m5
    punpckhwd m5, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m6, m7
    punpcklwd m6, m7
    punpckhwd m7, m2, m3
    punpcklwd m2, m3
    punpckhdq m3, m0, m2
    punpckldq m0, m2
    punpckldq m2, m4, m6
    punpckhdq m4, m6
    punpckhdq m6, m5, m7
    punpckldq m5, m7
    punpckldq m7, m17, m1
    punpckhdq m17, m1
    REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
    punpckhqdq m1, m0, m2
    punpcklqdq m0, m2
    punpcklqdq m2, m3, m4
    punpckhqdq m3, m4
    punpcklqdq m4, m5, m7
    punpckhqdq m5, m7
    punpckhqdq m7, m6, m17
    punpcklqdq m6, m17
    ; second group: same sequence on m8-m15
    punpckhwd m17, m12, m13
    punpcklwd m12, m13
    punpckhwd m13, m8, m9
    punpcklwd m8, m9
    punpckhwd m9, m14, m15
    punpcklwd m14, m15
    punpckhwd m15, m10, m11
    punpcklwd m10, m11
    punpckhdq m11, m8, m10
    punpckldq m8, m10
    punpckldq m10, m12, m14
    punpckhdq m12, m14
    punpckhdq m14, m13, m15
    punpckldq m13, m15
    punpckldq m15, m17, m9
    punpckhdq m17, m9
    REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
    punpckhqdq m9, m8, m10
    punpcklqdq m8, m10
    punpcklqdq m10, m11, m12
    punpckhqdq m11, m12
    punpcklqdq m12, m13, m15
    punpckhqdq m13, m15
    punpckhqdq m15, m14, m17
    punpcklqdq m14, m17
    ret

; IDTX_32x16 a, b, c, d
; Loads coefficients for four output registers m<a>..m<d> and applies the
; first-pass identity scaling.
; Expects: m12 = input multiplier, m14/m16 = qword permute index vectors,
;          m17 = second multiplier (the caller below loads pw_2896x8,
;          [permB+7], m14 >> 4 and pw_1697x16 respectively).
; Steps:   x = 2 * pmulhrsw(coef, m12) for the 64-byte loads at
;          cq+32*(N+0) and cq+32*(N+16), N = a and c; interleave the halves
;          with vpermi2q/vpermt2q; out = 2*x + pmulhrsw(x, m17) (saturating).
; vpermi2q overwrites its index operand, so the index vector is first copied
; from m14 into the destination. When c == 14 the destination is m14 itself,
; so the copy is skipped and m14 is consumed; that invocation has to be the
; last one that needs the index vector.
; Clobbers m18-m21.
%macro IDTX_32x16 4 ; dst[1-4]
    pmulhrsw m%2, m12, [cq+32*(%1+ 0)]
    pmulhrsw m18, m12, [cq+32*(%1+16)]
    pmulhrsw m%4, m12, [cq+32*(%3+ 0)]
    pmulhrsw m19, m12, [cq+32*(%3+16)]
    REPX {paddsw x, x}, m%2, m18, m%4, m19
    mova m%1, m14
    vpermi2q m%1, m%2, m18
    vpermt2q m%2, m16, m18
%if %3 != 14
    mova m%3, m14
%endif
    vpermi2q m%3, m%4, m19
    vpermt2q m%4, m16, m19
    pmulhrsw m18, m17, m%1
    pmulhrsw m19, m17, m%2
    pmulhrsw m20, m17, m%3
    pmulhrsw m21, m17, m%4
    REPX {paddsw x, x}, m%1, m%2, m%3, m%4
    paddsw m%1, m18
    paddsw m%2, m19
    paddsw m%3, m20
paddsw m%4, m21
%endmacro

; IDTX_32x16_STORE lo, hi[, is_32x32 = 0]
; Adds registers m<lo>/m<hi> to two 32-byte destination rows (dstq and
; dstq + 8*stride) and stores the result with unsigned saturation.
; Expects: m20 = 0 (used to widen pixels and, when is_32x32 == 0, to clear
;          coefficient rows lo*2 and lo*2+1 of cq).
; dstq advances by one row afterwards, except for the final invocation
; (lo == 7) when is_32x32 == 0.
; Clobbers m18 and m19.
%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
    mova ym19, [dstq+strideq*0]
    vinserti32x8 m19, [dstq+strideq*8], 1
%if %3 == 0
    mova [cq+64*(%1*2+0)], m20
    mova [cq+64*(%1*2+1)], m20
%endif
    punpcklbw m18, m19, m20
    punpckhbw m19, m20
    paddw m18, m%1
    paddw m19, m%2
    packuswb m18, m19
    mova [dstq+strideq*0], ym18
    vextracti32x8 [dstq+strideq*8], m18, 1
%if %3 || %1 != 7
    add dstq, strideq
%endif
%endmacro

; 32x16 identity/identity inverse transform, 8 bpc: adds the residual to dst.
;   dst    - destination pixels
;   stride - destination stride in bytes
;   c      - coefficient buffer; read by IDTX_32x16, zeroed by the stores
; Flow: scale and permute the coefficients (IDTX_32x16), reuse the 16x32
; function's .transpose_2x8x8_round with m16 = pw_2048 as the in-transpose
; multiplier, then add to the destination (IDTX_32x16_STORE).
; m14 is loaded unaligned from permB+7 and m16 is that vector with every
; qword shifted right by 4; both serve as vpermi2q/vpermt2q index vectors
; inside IDTX_32x16 (the table layout is defined elsewhere).
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
    vpbroadcastd m12, [pw_2896x8]
    movu m14, [permB+7]
    vpbroadcastd m17, [pw_1697x16]
    psrlq m16, m14, 4
    IDTX_32x16 0, 1, 2, 3
    IDTX_32x16 4, 5, 6, 7
    IDTX_32x16 8, 9, 10, 11
    IDTX_32x16 12, 13, 14, 15 ; consumes m14 (see the %if inside the macro)
    vpbroadcastd m16, [pw_2048] ; m16 is free again: now the transpose multiplier
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    pxor m20, m20
    IDTX_32x16_STORE 0, 8
    IDTX_32x16_STORE 1, 9
    IDTX_32x16_STORE 2, 10
    IDTX_32x16_STORE 3, 11
    IDTX_32x16_STORE 4, 12
    IDTX_32x16_STORE 5, 13
    IDTX_32x16_STORE 6, 14
    IDTX_32x16_STORE 7, 15
    RET

; IDCT_32x32_END src, mem, off1, off2
; Final butterfly + store for one pair of output rows.
;   src  - register number of one butterfly input (m<src>)
;   mem  - the other input: register m<mem> when mem < 8, otherwise the
;          64-byte slot at cq+64*(mem*2-16)
;   off1 - offset added to dstq for the row receiving (mem + src)
;   off2 - offset added to r3 for the row receiving (mem - src)
; Expects: m12 = pmulhrsw multiplier, m13 = vpermq control, r5 = pointer
;          step applied after mem == 3, 7 and 11 (dstq += r5, r3 -= r5).
; For mem >= 8 the slot just read and the one after it are cleared using m0;
; m0 is zeroed only in the mem == 8 expansion, so expansions with mem > 8
; presumably follow it in program order (confirm at the call sites).
; Clobbers m8-m11.
%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
    pmovzxbw m10, [dstq+%3]
    pmovzxbw m11, [r3 +%4]
%if %2 < 8
    paddsw m8, m%2, m%1
    psubsw m9, m%2, m%1
%else
    mova m9, [cq+64*(%2*2-16)]
    paddsw m8, m9, m%1
    psubsw m9, m%1
%endif
    pmulhrsw m8, m12
    pmulhrsw m9, m12
%if %2 >= 8
%if %2 == 8
    pxor m0, m0
%endif
    mova [cq+64*(%2*2-16)], m0
    mova [cq+64*(%2*2-15)], m0
%endif
    paddw m8, m10
    paddw m9, m11
    packuswb m8, m9
    vpermq m8, m13, m8
    mova [dstq+%3], ym8
    vextracti32x8 [r3 +%4], m8, 1
%if %2 == 3 || %2 == 7 || %2 == 11
    add dstq, r5
    sub r3, r5
%endif
%endmacro

cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
%undef cmp
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM 30
    cmp eobd, 136
    jb .fast
    mova m5, [cq+64*20]
    mova m3, [cq+64*12]
    mova m1, [cq+64* 4]
    mova m7, [cq+64*28]
    mova m2, [cq+64* 8]
    mova m6, [cq+64*24]
    mova m0, [cq+64* 0]
    mova m4, [cq+64*16]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova m14, [cq+64* 2]
    mova m21, [cq+64*30]
    mova m18, [cq+64*18]
    mova m17, [cq+64*14]
    mova
m16, [cq+64*10] mova m19, [cq+64*22] mova m20, [cq+64*26] mova m15, [cq+64* 6] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 mova m22, [cq+64* 1] mova m21, [cq+64*31] mova m14, [cq+64*17] mova m29, [cq+64*15] mova m26, [cq+64* 9] mova m17, [cq+64*23] mova m18, [cq+64*25] mova m25, [cq+64* 7] mova m24, [cq+64* 5] mova m19, [cq+64*27] mova m16, [cq+64*21] mova m27, [cq+64*11] mova m28, [cq+64*13] mova m15, [cq+64*19] mova m20, [cq+64*29] mova m23, [cq+64* 3] call .main_oddhalf vpbroadcastd m10, [o(pw_8192)] psubsw m13, m0, m29 ; 31 paddsw m0, m29 ; 0 psubsw m29, m1, m28 ; 30 paddsw m1, m28 ; 1 psubsw m28, m2, m27 ; 29 paddsw m2, m27 ; 2 psubsw m27, m3, m26 ; 28 paddsw m3, m26 ; 3 psubsw m26, m4, m25 ; 27 paddsw m4, m25 ; 4 psubsw m25, m5, m24 ; 26 paddsw m5, m24 ; 5 psubsw m24, m6, m23 ; 25 paddsw m6, m23 ; 6 psubsw m23, m7, m22 ; 24 paddsw m7, m22 ; 7 pxor m9, m9 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 punpckhwd m3, m23, m24 punpcklwd m23, m24 punpckhwd m24, m25, m26 punpcklwd m25, m26 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 punpckhwd m26, m27, m28 punpcklwd m27, m28 punpckhwd m28, m29, m13 punpcklwd m29, m13 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m8, m1 ; a4 
b4 c4 d4 a5 b5 c5 d5 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 REPX {pmulhrsw x, m10}, m0, m4, m8, m22 punpckhdq m13, m23, m25 punpckldq m23, m25 punpckhdq m25, m27, m29 punpckldq m27, m29 REPX {pmulhrsw x, m10}, m13, m23, m25, m27 punpckhdq m9, m3, m24 punpckldq m3, m24 punpckhdq m24, m26, m28 punpckldq m26, m28 punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 punpckhqdq m23, m27 ; d01 d09 d17 d25 punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 punpcklqdq m13, m25 ; d02 d10 d18 d26 punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 punpcklqdq m3, m26 ; d04 d12 d20 d28 punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 punpcklqdq m9, m24 ; d06 d14 d22 d30 REPX {pmulhrsw x, m10}, m25, m3, m26 mova [cq+64* 9], m23 mova [cq+64*11], m27 mova [cq+64*13], m25 mova [cq+64*15], m26 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 punpcklqdq m8, m22 ; a04 a12 a20 a28 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 punpcklqdq m0, m4 ; a00 a08 a16 a24 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 punpcklqdq m7, m2 ; a02 a10 a18 a26 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 punpcklqdq m6, m1 ; a06 a14 a22 a30 mova m2, [cq+64* 0] mova m11, [cq+64* 2] mova m12, [cq+64* 4] mova m29, [cq+64* 6] mova m27, [cq+64* 8] mova m26, [cq+64*10] mova m4, [cq+64*12] mova m28, [cq+64*14] psubsw m1, m2, m21 ; 23 paddsw m2, m21 ; 8 psubsw m21, m11, m20 ; 22 paddsw m11, m20 ; 9 psubsw m20, m12, m19 ; 21 paddsw m12, m19 ; 10 psubsw m19, m29, m18 ; 20 paddsw m29, m18 ; 11 psubsw m18, m27, m17 ; 19 paddsw m27, m17 ; 12 psubsw m17, m26, m16 ; 18 paddsw m26, m16 ; 13 paddsw m16, m4, m15 ; 14 psubsw m4, m15 ; 17 pmulhrsw m15, m6, m10 psubsw m6, m28, m14 ; 16 paddsw m28, m14 ; 15 pmulhrsw m14, m7, m10 punpcklwd m7, m6, m4 punpckhwd m6, m4 punpckhwd m4, m17, m18 punpcklwd m17, m18 punpckhwd m18, m19, m20 punpcklwd m19, m20 punpckhwd m20, m21, m1 punpcklwd m21, m1 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 
punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 pmulhrsw m23, m10 pmulhrsw m25, m10 punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 REPX {pmulhrsw x, m10}, m28, m2, m12, m27 punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 REPX {pmulhrsw x, m10}, m16, m1, m11, m29 punpckhdq m26, m19, m21 punpckldq m19, m21 punpckhdq m21, m6, m4 punpckldq m6, m4 REPX {pmulhrsw x, m10}, m26, m19, m21, m6 punpckhdq m4, m18, m20 punpckldq m18, m20 punpckhdq m20, m7, m17 punpckldq m7, m17 REPX {pmulhrsw x, m10}, m4, m18, m20, m7 punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 punpckhqdq m28, m12 ; b03 b11 b19 b27 punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 punpcklqdq m2, m27 ; b00 b08 b16 b24 punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 punpcklqdq m1, m29 ; b04 b12 b20 b28 punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 punpcklqdq m16, m11 ; b06 b14 b22 b30 mova [cq+64* 1], m12 mova [cq+64* 3], m28 mova [cq+64* 5], m27 mova [cq+64* 7], m29 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 punpcklqdq m20, m26 ; c02 c10 c18 c26 punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 punpcklqdq m7, m19 ; c00 c08 c16 c24 punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 punpcklqdq m6, m18 ; c04 c12 c20 c28 punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 punpcklqdq m21, m4 ; c06 c14 c22 c30 pmulhrsw m19, m9, m10 vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 
vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 vshufi32x4 m3, m1, m6, q3131 ; 12 vshufi32x4 m1, m6, q2020 ; 4 vshufi32x4 m6, m4, m2, q3131 ; 24 vshufi32x4 m4, m2, q2020 ; 16 vshufi32x4 m2, m0, m7, q3131 ; 8 vshufi32x4 m0, m7, q2020 ; 0 vshufi32x4 m7, m5, m8, q3131 ; 28 vshufi32x4 m5, m8, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 vshufi32x4 m16, m14, m20, q3131 ; 10 vshufi32x4 m14, m20, q2020 ; 2 vshufi32x4 m20, m18, m17, q3131 ; 26 vshufi32x4 m18, m17, q2020 ; 18 vshufi32x4 m17, m15, m21, q3131 ; 14 vshufi32x4 m15, m21, q2020 ; 6 vshufi32x4 m21, m19, m13, q3131 ; 30 vshufi32x4 m19, m13, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 mova m15, [cq+64* 1] mova m16, [cq+64* 3] mova m17, [cq+64* 5] mova m19, [cq+64* 7] mova m20, [cq+64* 9] mova m21, [cq+64*11] mova m13, [cq+64*13] mova m18, [cq+64*15] vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 vinserti32x8 
m11, m28, ym13, 1 ; c05 c13 d05 d13 vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 vshufi32x4 m18, m14, m26, q3131 ; 25 vshufi32x4 m14, m26, q2020 ; 17 vshufi32x4 m19, m15, m27, q3131 ; 27 vshufi32x4 m15, m27, q2020 ; 19 vshufi32x4 m20, m16, m28, q3131 ; 29 vshufi32x4 m16, m28, q2020 ; 21 vshufi32x4 m21, m17, m29, q3131 ; 31 vshufi32x4 m17, m29, q2020 ; 23 vshufi32x4 m26, m22, m8, q3131 ; 9 vshufi32x4 m22, m8, q2020 ; 1 vshufi32x4 m27, m23, m9, q3131 ; 11 vshufi32x4 m23, m9, q2020 ; 3 vshufi32x4 m28, m24, m11, q3131 ; 13 vshufi32x4 m24, m11, q2020 ; 5 vshufi32x4 m29, m25, m12, q3131 ; 15 vshufi32x4 m25, m12, q2020 ; 7 call .main_oddhalf jmp .end .fast: ; bottom/right halves are zero mova m14, [o(dup16_perm)] pmovzxwd m9, [cq+64* 0] pmovzxwd m6, [cq+64* 8] vpermb m8, m14, [cq+64* 2] vpermb ym0, ym14, [cq+64*14] vpermb ym5, ym14, [cq+64*10] vpermb m1, m14, [cq+64* 6] vpermb m7, m14, [cq+64* 4] vpermb ym3, ym14, [cq+64*12] pslld m9, 16 pslld m6, 16 call m(idct_16x16_internal_8bpc).main_fast vpermb m21, m14, [cq+64* 1] vpermb ym17, ym14, [cq+64*15] vpermb ym20, ym14, [cq+64* 9] vpermb m15, m14, [cq+64* 7] vpermb m18, m14, [cq+64* 5] vpermb ym16, ym14, [cq+64*11] vpermb ym19, ym14, [cq+64*13] vpermb m14, m14, [cq+64* 3] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m9, [o(pw_8192)] call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round vshufi32x4 m22, m14, m2, q2020 ; 1 vshufi32x4 m24, m14, m2, q3131 ; 5 vshufi32x4 m23, m17, m9, q2020 ; 3 vshufi32x4 m25, m17, m9, q3131 ; 7 vshufi32x4 m16, m5, m15, q2020 ; 10 vshufi32x4 m17, m5, m15, q3131 ; 14 vshufi32x4 m14, m1, m18, q2020 ; 2 vshufi32x4 m15, m1, m18, q3131 ; 6 vshufi32x4 m1, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m21, m4, q3131 ; 12 vshufi32x4 m2, m21, m4, q2020 ; 8 vshufi32x4 m26, m20, m6, q2020 ; 9 vshufi32x4 m28, m20, m6, q3131 ; 13 vshufi32x4 m27, m19, m7, q2020 ; 11 
vshufi32x4 m29, m19, m7, q3131 ; 15 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 call .main_oddhalf_fast .end: lea r4, [strideq*3] vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r3, [dstq+r4*8] lea r5, [strideq+r4] ; stride*4 add r3, r5 ; dst+stride*28 IDCT_32x32_END 29, 0, strideq*0, r4 IDCT_32x32_END 28, 1, strideq*1, strideq*2 IDCT_32x32_END 27, 2, strideq*2, strideq*1 IDCT_32x32_END 26, 3, r4 , strideq*0 IDCT_32x32_END 25, 4, strideq*0, r4 IDCT_32x32_END 24, 5, strideq*1, strideq*2 IDCT_32x32_END 23, 6, strideq*2, strideq*1 IDCT_32x32_END 22, 7, r4 , strideq*0 IDCT_32x32_END 21, 8, strideq*0, r4 IDCT_32x32_END 20, 9, strideq*1, strideq*2 IDCT_32x32_END 19, 10, strideq*2, strideq*1 IDCT_32x32_END 18, 11, r4 , strideq*0 IDCT_32x32_END 17, 12, strideq*0, r4 IDCT_32x32_END 16, 13, strideq*1, strideq*2 IDCT_32x32_END 15, 14, strideq*2, strideq*1 IDCT_32x32_END 14, 15, r4 , strideq*0 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m24, [o(pw_m601x8)] vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m21, m22 ; t31a pmulhrsw m22, m8 ; t16a pmulhrsw m24, m23 ; t23a pmulhrsw m23, m12 ; t24a punpcklwd m9, m22, m21 punpckhwd m8, m22, m21 mova m15, m10 vpdpwssd m15, m9, [o(pw_m4017_799)] {bcstd} mova m17, m10 vpdpwssd m17, m8, [o(pw_m4017_799)] {bcstd} REPX {psrad x, 12}, m15, m17 packssdw m15, m17 mova m17, m10 vpdpwssd m17, m8, [o(pw_799_4017)] {bcstd} mova m8, m10 vpdpwssd m8, m9, [o(pw_799_4017)] {bcstd} REPX {psrad x, 12}, m17, m8 packssdw m8, m17 punpcklwd m9, m24, m23 punpckhwd m16, m24, m23 mova m20, m10 vpdpwssd m20, m9, [o(pw_m3406_m2276)] {bcstd} mova m17, m10 
vpdpwssd m17, m16, [o(pw_m3406_m2276)] {bcstd} REPX {psrad x, 12}, m20, m17 packssdw m20, m17 mova m17, m10 vpdpwssd m17, m16, [o(pw_m2276_3406)] {bcstd} mova m16, m10 vpdpwssd m16, m9, [o(pw_m2276_3406)] {bcstd} REPX {psrad x, 12}, m17, m16 packssdw m16, m17 mova m17, m21 mova m27, m15 mova m25, m20 mova m29, m8 mova m18, m22 mova m14, m24 mova m28, m16 mova m26, m23 jmp .main4 cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m18, [o(pw_m1380x8)] vpbroadcastd m9, [o(pw_3857x8)] vpbroadcastd m19, [o(pw_3973x8)] vpbroadcastd m11, [o(pw_995x8)] vpbroadcastd m28, [o(pw_m601x8)] vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m21, m22 ; t31a pmulhrsw m22, m8 ; t16a pmulhrsw m18, m25 ; t19a pmulhrsw m25, m9 ; t28a pmulhrsw m19, m24 ; t27a pmulhrsw m24, m11 ; t20a pmulhrsw m28, m23 ; t23a pmulhrsw m23, m12 ; t24a mova m15, m21 mova m8, m22 mova m14, m18 mova m27, m25 mova m29, m19 mova m26, m24 mova m16, m28 mova m20, m23 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m14, [o(pw_m2751x8)] vpbroadcastd m9, [o(pw_3035x8)] vpbroadcastd m17, [o(pw_3703x8)] vpbroadcastd m11, [o(pw_1751x8)] vpbroadcastd m18, [o(pw_m1380x8)] vpbroadcastd m12, [o(pw_3857x8)] pmulhrsw m21, m22 ; t31a vpbroadcastd m19, [o(pw_3973x8)] pmulhrsw m22, m8 ; t16a vpbroadcastd m8, [o(pw_995x8)] pmulhrsw m14, m29 ; t30a vpbroadcastd m16, [o(pw_m2106x8)] pmulhrsw m29, m9 ; t17a vpbroadcastd m9, [o(pw_3513x8)] pmulhrsw m17, m26 ; t29a vpbroadcastd m15, [o(pw_3290x8)] pmulhrsw m26, m11 ; t18a vpbroadcastd m11, [o(pw_2440x8)] pmulhrsw m18, m25 ; t19a vpbroadcastd m20, [o(pw_m601x8)] pmulhrsw m25, m12 ; t28a vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m19, m24 ; t27a pmulhrsw m24, m8 ; t20a pmulhrsw m16, m27 ; t21a pmulhrsw m27, m9 ; t26a pmulhrsw m15, m28 ; t25a pmulhrsw m28, m11 ; t22a pmulhrsw m20, m23 ; 
t23a pmulhrsw m23, m12 ; t24a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a .main2: psubsw m8, m22, m14 ; t17 paddsw m22, m14 ; t16 paddsw m14, m18, m26 ; t19 psubsw m18, m26 ; t18 psubsw m26, m24, m16 ; t21 paddsw m24, m16 ; t20 psubsw m16, m20, m28 ; t22 paddsw m28, m20 ; t23 psubsw m20, m23, m15 ; t25 paddsw m23, m15 ; t24 psubsw m15, m21, m29 ; t30 paddsw m21, m29 ; t31 psubsw m29, m19, m27 ; t26 paddsw m19, m27 ; t27 paddsw m27, m25, m17 ; t28 psubsw m25, m17 ; t29 .main3: ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a psubsw m17, m21, m27 ; t28a paddsw m21, m27 ; t31a psubsw m27, m15, m25 ; t18 paddsw m15, m25 ; t17 psubsw m25, m20, m29 ; t21 paddsw m20, m29 ; t22 psubsw m29, m8, m18 ; t29 paddsw m8, m18 ; t30 psubsw m18, m22, m14 ; t19a paddsw m22, m14 ; t16a psubsw m14, m28, m24 ; t20a paddsw m24, m28 ; t23a paddsw m28, m16, m26 ; t25 psubsw m16, m26 ; t26 psubsw m26, m23, m19 ; t27a paddsw m23, m19 ; t24a .main4: vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m11, [o(pw_1567_3784)] ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 vpbroadcastd m11, [o(pw_m1567_m3784)] ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 vpbroadcastd m12, [o(pw_m2896_2896)] vpbroadcastd m11, [o(pw_2896_2896)] psubsw m19, m27, m25 ; t26 
; Tail of the routine whose entry label lies above this block: the last
; saturating add/subtract butterflies, four ITX_MULSUB_2W invocations (macro
; defined elsewhere in the file) and the return to the caller.
    paddsw m27, m25 ; t29
    psubsw m25, m17, m26 ; t20a
    paddsw m17, m26 ; t19a
    paddsw m26, m18, m14 ; t28a
    psubsw m18, m14 ; t27a
    paddsw m14, m22, m24 ; t16
    psubsw m22, m24 ; t23
    psubsw m24, m29, m16 ; t21
    paddsw m16, m29 ; t18
    paddsw m29, m21, m23 ; t31
    psubsw m21, m23 ; t24
    psubsw m23, m15, m20 ; t22a
    paddsw m15, m20 ; t17a
    psubsw m20, m8, m28 ; t25a
    paddsw m28, m8 ; t30a
    ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
    ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
    ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
    ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
    ret

; IDTX_32x32 dst1, dst2
; Loads 32 bytes from coefficient rows %1, %1+16, %2 and %2+16 (rows are
; 64 bytes apart in cq) and merges each row pair into one zmm register with
; vpermt2q, using the index vector the caller keeps in m21.
; Results land in m%1 and m%2; m17 and m18 are clobbered as temporaries.
%macro IDTX_32x32 2 ; dst[1-2]
    vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
    vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
    vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
    vmovdqa32 ym18, [cq+64*(%2+16)]
    vpermt2q m%1, m21, m17
    vpermt2q m%2, m21, m18
%endmacro

; 32x32 identity/identity inverse transform + add, 8 bpc (per the name).
; Arguments: dst, stride, c. The main loop runs twice, each pass consuming
; one 32-byte half of every 64-byte coefficient row and advancing dst by
; eight rows; afterwards the coefficient buffer is cleared.
cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
    movu m21, [permB+7] ; vpermt2q index vector used by IDTX_32x32
    vpbroadcastd m16, [pw_8192] ; presumably consumed by .transpose_2x8x8_round - confirm there
    pxor m20, m20 ; zero vector for .zero_loop
.loop:
    IDTX_32x32 0, 1
    IDTX_32x32 2, 3
    IDTX_32x32 4, 5
    IDTX_32x32 6, 7
    IDTX_32x32 8, 9
    IDTX_32x32 10, 11
    IDTX_32x32 12, 13
    IDTX_32x32 14, 15
    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
    IDTX_32x16_STORE 0, 8, 1
    IDTX_32x16_STORE 1, 9, 1
    IDTX_32x16_STORE 2, 10, 1
    IDTX_32x16_STORE 3, 11, 1
    IDTX_32x16_STORE 4, 12, 1
    IDTX_32x16_STORE 5, 13, 1
    IDTX_32x16_STORE 6, 14, 1
    IDTX_32x16_STORE 7, 15, 1
    lea dstq, [dstq+strideq*8]
    ; btc copies bit 5 of cq into CF and then flips it. First pass: bit was
    ; clear, so CF=0 and we loop with cq advanced by 32 bytes. Second pass:
    ; bit was set, so CF=1, cq is back at its original value and we fall
    ; through. Assumes cq starts with bit 5 clear (at least 64-byte
    ; aligned) - TODO confirm against the caller.
    btc cq, 5
    jnc .loop
    mov r0d, 8 ; 8 iterations * 4 stores * 64 bytes = 2048 bytes cleared
               ; (r0 is the first-argument register under x86inc naming)
.zero_loop:
    mova [cq+64*0], m20
    mova [cq+64*1], m20
    mova [cq+64*2], m20
    mova [cq+64*3], m20
    add cq, 64*4
    dec r0d
    jg .zero_loop
    RET

; 16x64 DCT/DCT inverse transform + add, 8 bpc (per the name).
; Arguments: dst, stride, c, eob.
;   eob == 0   -> .dconly shortcut
;   eob <  151 -> .fast (labelled "right half is zero" at its definition)
;   otherwise  -> full first pass starting here
cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea r5, [o_base] ; base pointer for the o() constant addressing macro
    test eobd, eobd
    jz .dconly
    WIN64_SPILL_XMM 30
    cmp eobd, 151
    jb .fast
    ; even-numbered coefficient rows 0..14 for the first helper call
    mova m5, [cq+64*10]
    mova m3, [cq+64* 6]
    mova m1, [cq+64* 2]
    mova m7, [cq+64*14]
    mova m2, [cq+64* 4]
    mova m6, [cq+64*12]
    mova m0, [cq+64* 0]
    mova m4, [cq+64* 8]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    mova
m14, [cq+64* 1] ; operands of the mova that ends the previous line
; Full (eob >= 151) path of inv_txfm_add_dct_dct_16x64_8bpc, continued:
; load the odd-numbered coefficient rows 1..15 and run the second helper.
    mova m21, [cq+64*15]
    mova m18, [cq+64* 9]
    mova m17, [cq+64* 7]
    mova m16, [cq+64* 5]
    mova m19, [cq+64*11]
    mova m20, [cq+64*13]
    mova m15, [cq+64* 3]
    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
    ; m9 is the pmulhrsw multiplier used by TRANSPOSE_8x4_ROUND. If pw_8192
    ; holds 8192 in every word (as the name suggests) this is a rounding
    ; shift right by 2.
    vpbroadcastd m9, [o(pw_8192)]
; TRANSPOSE_8x4_ROUND a, b, c, d
; Interleaves the four registers first word-wise, then dword-wise (lane
; layout in the per-line comments) and scales all four results by m9 with
; pmulhrsw. Clobbers m8.
%macro TRANSPOSE_8x4_ROUND 4
    punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
    punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
    punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
    punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1
    punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5
    punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7
    REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
%endmacro
    TRANSPOSE_8x4_ROUND 0, 1, 2, 3
    TRANSPOSE_8x4_ROUND 4, 5, 6, 7
    TRANSPOSE_8x4_ROUND 14, 15, 16, 17
    TRANSPOSE_8x4_ROUND 18, 19, 20, 21
    ; Regroup 256-bit halves, then 128-bit lanes, across register pairs.
    vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
    vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12
    vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
    vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13
    vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
    vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14
    vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
    vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15
    vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4
    vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12
    vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5
    vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13
    vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6
    vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14
    vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7
    vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15
    vshufi32x4 m22, m26, m4, q2020 ; 0 1
    vshufi32x4 m26, m4, q3131 ; 8 9
    vshufi32x4 m23, m27, m5, q2020 ; 2 3
    vshufi32x4 m27, m5, q3131 ; 10 11
    vshufi32x4 m24, m28, m6, q2020 ; 4 5
    vshufi32x4 m28, m6, q3131 ; 12 13
    vshufi32x4 m25, m29, m7, q2020 ; 6 7
    vshufi32x4 m29, m7, q3131 ; 14 15
    vshufi32x4 m4, m0, m14, q2020 ; 16 17
    vshufi32x4 m3, m0, m14, q3131 ; 24 25
    vshufi32x4 m20, m1, m15, q2020 ; 18 19
    vshufi32x4 m19, m1, m15, q3131 ; 26 27
    vshufi32x4 m5, m2, m16, q2020 ; 20 21
    vshufi32x4 m0, m2, m16, q3131 ; 28 29
    vshufi32x4 m16, m8, m17, q2020 ; 22 23
    vshufi32x4 m17, m8, m17, q3131 ; 30 31
    pxor m6, m6
    ; Park the packed row pairs 16/17, 20/21, 24/25, 28/29 in cq; their
    ; high halves are reloaded further down for .main_oddhalf.
    mova [cq+64* 0], m4
    mova [cq+64* 2], m5
    mova [cq+64* 4], m3
    mova [cq+64* 6], m0
    ; punpcklwd r, r duplicates each word from the low half of every
    ; 128-bit lane; with the zero register as first source the word is
    ; paired with zero instead (the "__ n" lines).
    punpcklwd m8, m24, m24 ; 4
    punpcklwd m0, m0 ; 28
    punpcklwd m5, m5 ; 20
    punpcklwd m1, m28, m28 ; 12
    punpcklwd m7, m26, m26 ; 8
    punpcklwd m3, m3 ; 24
    punpcklwd m9, m6, m22 ; __ 0
    punpcklwd m6, m4 ; __ 16
    call m(idct_16x16_internal_8bpc).main_fast3
    ; Park row pairs 18/19, 22/23, 26/27, 30/31 likewise.
    mova [cq+64* 1], m20
    mova [cq+64* 3], m16
    mova [cq+64* 5], m19
    mova [cq+64* 7], m17
    punpcklwd m21, m23, m23 ; 2
    punpcklwd m17, m17 ; 30
    punpcklwd m20, m20 ; 18
    punpcklwd m15, m29, m29 ; 14
    punpcklwd m18, m27, m27 ; 10
    punpcklwd m16, m16 ; 22
    punpcklwd m19, m19 ; 26
    punpcklwd m14, m25, m25 ; 6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; Save m14-m21 to cq rows 8..15; IDCT_16x64_END reads rows 8..15 back.
    mova [cq+64* 8], m14
    mova [cq+64* 9], m15
    mova [cq+64*10], m16
    mova [cq+64*11], m17
    mova [cq+64*12], m18
    mova [cq+64*13], m19
    mova [cq+64*14], m20
    mova [cq+64*15], m21
    ; Reload the row pairs parked in cq rows 0..7 above.
    mova m21, [cq+64* 7]
    mova m14, [cq+64* 0]
    mova m17, [cq+64* 3]
    mova m18, [cq+64* 4]
    mova m19, [cq+64* 5]
    mova m16, [cq+64* 2]
    mova m15, [cq+64* 1]
    mova m20, [cq+64* 6]
    ; Same word duplication as above, but from the high half of each lane.
    REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
                           m24, m19, m16, m27, m28, m15, m20, m23
    call .main_oddhalf
    jmp .end
.fast: ; right half is zero
    ; Each zmm gets 32 bytes of one coefficient row in its low half and
    ; 32 bytes of another row in its high half; all eight are then
    ; byte-permuted with the int16_perm table.
    mova ym8, [cq+64*15]
    vinserti32x8 m8, [cq+64* 1], 1
    mova m2, [o(int16_perm)]
    mova ym9, [cq+64* 8]
    vinserti32x8 m9, [cq+64* 0], 1
    mova ym0, [cq+64* 7]
    vinserti32x8 m0, [cq+64* 9], 1
    mova ym7, [cq+64*14]
    vinserti32x8 m7, [cq+64* 2], 1
    mova ym1, [cq+64* 3]
    vinserti32x8 m1, [cq+64*13], 1
    mova ym3, [cq+64* 6]
    vinserti32x8 m3, [cq+64*10], 1
    mova ym5, [cq+64*11]
    vinserti32x8 m5, [cq+64* 5], 1
    mova ym6, [cq+64*12]
    vinserti32x8 m6, [cq+64* 4], 1
    REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
    call m(idct_16x16_internal_8bpc).main2
    vbroadcasti32x4 m8, [o(int_shuf3)]
    vbroadcasti32x4 m9, [o(int_shuf4)]
    vpbroadcastd m11, [o(pw_8192)] ; pmulhrsw multiplier for the REPX scaling that follows this block
    pshufb m0, m8
    pshufb m1, m9
    pshufb m2, m8
    pshufb m3, m9
; inv_txfm_add_dct_dct_16x64_8bpc, .fast path continued: scale the first-pass
; results by m11, transpose them, and feed them to the reduced second-pass
; helpers.
    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
    pshufb m4, m8
    pshufb m5, m9
    pshufb m6, m8
    pshufb m7, m9
    REPX {pmulhrsw x, m11}, m4, m5, m6, m7
    punpckhdq m28, m0, m1
    punpckldq m0, m1
    punpckhdq m27, m2, m3
    punpckldq m2, m3
    punpckhdq m22, m4, m5
    punpckldq m4, m5
    punpckhdq m23, m6, m7
    punpckldq m6, m7
    vinserti32x8 m14, m0, ym2, 1
    vshufi32x4 m15, m0, m2, q3232
    vinserti32x8 m2, m4, ym6, 1
    vshufi32x4 m4, m6, q3232
    vshufi32x4 m21, m14, m2, q2020 ; 0 2
    vshufi32x4 m14, m2, q3131 ; 4 6
    vshufi32x4 m18, m15, m4, q2020 ; 8 10
    vshufi32x4 m15, m4, q3131 ; 12 14
    pxor m9, m9
    ; punpcklwd/punpckhwd r, r duplicate each word of the low/high half of
    ; every 128-bit lane, splitting the packed row pairs listed above.
    punpcklwd m8, m14, m14 ; 4
    punpcklwd m1, m15, m15 ; 12
    punpcklwd m7, m18, m18 ; 8
    punpcklwd m9, m21 ; __ 0
    call m(idct_16x16_internal_8bpc).main_fast4
    punpckhwd m21, m21 ; 2
    punpckhwd m15, m15 ; 14
    punpckhwd m18, m18 ; 10
    punpckhwd m14, m14 ; 6
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    vinserti32x8 m24, m28, ym27, 1
    vshufi32x4 m28, m27, q3232
    vinserti32x8 m27, m22, ym23, 1
    vshufi32x4 m22, m23, q3232
    vshufi32x4 m23, m24, m27, q2020 ; 1 3
    vshufi32x4 m24, m27, q3131 ; 5 7
    vshufi32x4 m27, m28, m22, q2020 ; 9 11
    vshufi32x4 m28, m22, q3131 ; 13 15
    punpcklwd m22, m23, m23 ; 1
    punpckhwd m29, m28, m28 ; 15
    punpcklwd m26, m27, m27 ; 9
    punpckhwd m25, m24, m24 ; 7
    ; Save m14-m21 to cq rows 8..15 (interleaved with the remaining
    ; unpacks); IDCT_16x64_END reads rows 8..15 back.
    mova [cq+64* 8], m14
    mova [cq+64* 9], m15
    mova [cq+64*10], m16
    mova [cq+64*11], m17
    punpcklwd m24, m24 ; 5
    punpckhwd m27, m27 ; 11
    punpcklwd m28, m28 ; 13
    punpckhwd m23, m23 ; 3
    mova [cq+64*12], m18
    mova [cq+64*13], m19
    mova [cq+64*14], m20
    mova [cq+64*15], m21
    call .main_oddhalf_fast
.end:
    ; Output stage shared by both paths. Register roles from here on:
    ;   r3  = stride*3, r4 = stride*4, r6 = dst + stride*60
    ;   m10 = vpermb index table, m11 = pmulhrsw multiplier, m12 = zero,
    ;   m13 = m10 shifted right one byte per 128-bit lane (vpermd indices)
    imul r6, strideq, 60
    mova m10, [o(end_16x32p)]
    vpbroadcastd m11, [o(pw_2048)]
    lea r3, [strideq*3]
    pxor m12, m12
    add r6, dstq ; dst+stride*60
    psrldq m13, m10, 1
    lea r4, [strideq+r3] ; stride*4
; IDCT_16x64_END n, reg, tmp
; Emits four 16-pixel output rows per invocation: two addressed from dstq
; and two addressed from r6.
;  - Odd n uses row offsets 3,2 (dstq) / 0,1 (r6); even n uses 0,1 / 3,2.
;    Every even n other than 0 first moves dstq forward and r6 back by
;    four rows.
;  - n < 8: the two value vectors are m%1 and m%2 scaled by m11.
;    n >= 8: they are (cq row n + m%2) and (cq row n - m%2), scaled by m11.
;  - Coefficient row n is cleared with m12 on the way.
;  - Destination pixels are widened via vpermb with m10, summed with the
;    values, packed with unsigned saturation and reordered with vpermd.
; Clobbers m8, m9, m29 and m%3.
%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
%if %1 & 1
    %define %%s0 r3
    %define %%s1 strideq*2
    %define %%s2 strideq*1
    %define %%s3 strideq*0
%else
    %define %%s0 strideq*0
    %define %%s1 strideq*1
    %define %%s2 strideq*2
    %define %%s3 r3
%if %1
    add dstq, r4
    sub r6, r4
%endif
%endif
%if %1 < 8
    pmulhrsw m8, m11, m%1
    pmulhrsw m9, m11, m%2
%else
    mova m9, [cq+64*%1]
    paddsw m8, m9, m%2 ; out 0+n, 1+n
    psubsw m9, m%2 ; out 63-n, 62-n
    pmulhrsw m8, m11
    pmulhrsw m9, m11
%endif
    mova xm29, [dstq+%%s0]
    vinserti128 ym29, [dstq+%%s1], 1
    mova xm%3, [r6 +%%s3]
    vinserti128 ym%3, [r6 +%%s2], 1
    vpermb m29, m10, m29
    vpermb m%3, m10, m%3
    mova [cq+64*%1], m12
    paddw m29, m8
    paddw m%3, m9
    packuswb m29, m%3
    vpermd m29, m13, m29
    mova [dstq+%%s0], xm29
    vextracti128 [dstq+%%s1], ym29, 1
    vextracti32x4 [r6 +%%s2], m29, 2
    vextracti32x4 [r6 +%%s3], m29, 3
%endmacro
    IDCT_16x64_END 0, 29, 0
    IDCT_16x64_END 1, 28, 28
    IDCT_16x64_END 2, 27, 28
    IDCT_16x64_END 3, 26, 28
    IDCT_16x64_END 4, 25, 28
    IDCT_16x64_END 5, 24, 28
    IDCT_16x64_END 6, 23, 28
    IDCT_16x64_END 7, 22, 28
    IDCT_16x64_END 8, 21, 28
    IDCT_16x64_END 9, 20, 28
    IDCT_16x64_END 10, 19, 28
    IDCT_16x64_END 11, 18, 28
    IDCT_16x64_END 12, 17, 28
    IDCT_16x64_END 13, 16, 28
    IDCT_16x64_END 14, 15, 28
    IDCT_16x64_END 15, 14, 28
    RET
.dconly:
    ; Reached only when eob == 0: take the DC coefficient, clear it in
    ; memory (eobd is zero here), and scale it once as
    ; (dc*181 + 640) >> 10 before handing off to the shared DC-only tail
    ; of the 16x8 function.
    movsx r6d, word [cq]
    mov [cq], eobd
    or r3d, 64 ; presumably the row count expected by .dconly3 - confirm there
    imul r6d, 181
    add r6d, 128+512
    sar r6d, 8+2
    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
ALIGN function_align
; Reduced entry point: multiplies the eight inputs by packed constants and
; copies each product into a second register before joining .main_oddhalf2.
; In .main_oddhalf these register pairs hold the sum and difference of two
; terms; duplicating the product looks equivalent when the partner term is
; zero - NOTE(review): confirm against the full path.
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
    vpbroadcastd m8, [o(pw_101_4095x8)]
    vpbroadcastd m21, [o(pw_m1474_3822x8)]
    vpbroadcastd m14, [o(pw_897_3996x8)]
    vpbroadcastd m17, [o(pw_m700_4036x8)]
    vpbroadcastd m18, [o(pw_501_4065x8)]
    vpbroadcastd m19, [o(pw_m1092_3948x8)]
    vpbroadcastd m16, [o(pw_1285_3889x8)]
    vpbroadcastd m15, [o(pw_m301_4085x8)]
    pmulhrsw m8, m22 ; t32a t63a
    pmulhrsw m21, m29 ; t35a t60a
    pmulhrsw m14, m26 ; t36a t59a
    pmulhrsw m17, m25 ; t39a t56a
    pmulhrsw m18, m24 ; t40a t55a
    pmulhrsw m19, m27 ; t43a t52a
    pmulhrsw m16, m28 ; t44a t51a
    pmulhrsw m15, m23 ; t47a t48a
    mova m22, m8
    mova m29, m21
    mova m26, m14
    mova m25, m17
    mova m24, m18
    mova m27, m19
    mova m28, m16
    mova m20, m15
    jmp .main_oddhalf2
ALIGN function_align
; Full entry point: all sixteen inputs are multiplied by packed constants;
; the constant loads are interleaved with the multiplies and rotate through
; m8/m9/m11/m12.
cglobal_label .main_oddhalf
    vpbroadcastd m8, [o(pw_101_4095x8)]
    vpbroadcastd m9, [o(pw_m2824_2967x8)]
; Body of inv_txfm_add_dct_dct_16x64_8bpc's .main_oddhalf (its entry label
; and first two constant loads sit just above this block). Each register
; carries two packed terms, as the paired "tNN tMM" comments show.
    vpbroadcastd m11, [o(pw_1660_3745x8)]
    vpbroadcastd m12, [o(pw_m1474_3822x8)]
    pmulhrsw m22, m8 ; t32a t63a
    vpbroadcastd m8, [o(pw_897_3996x8)]
    pmulhrsw m21, m9 ; t33a t62a
    vpbroadcastd m9, [o(pw_m2191_3461x8)]
    pmulhrsw m14, m11 ; t34a t61a
    vpbroadcastd m11, [o(pw_2359_3349x8)]
    pmulhrsw m29, m12 ; t35a t60a
    vpbroadcastd m12, [o(pw_m700_4036x8)]
    pmulhrsw m26, m8 ; t36a t59a
    vpbroadcastd m8, [o(pw_501_4065x8)]
    pmulhrsw m17, m9 ; t37a t58a
    vpbroadcastd m9, [o(pw_m2520_3229x8)]
    pmulhrsw m18, m11 ; t38a t57a
    vpbroadcastd m11, [o(pw_2019_3564x8)]
    pmulhrsw m25, m12 ; t39a t56a
    vpbroadcastd m12, [o(pw_m1092_3948x8)]
    pmulhrsw m24, m8 ; t40a t55a
    vpbroadcastd m8, [o(pw_1285_3889x8)]
    pmulhrsw m19, m9 ; t41a t54a
    vpbroadcastd m9, [o(pw_m1842_3659x8)]
    pmulhrsw m16, m11 ; t42a t53a
    vpbroadcastd m11, [o(pw_2675_3102x8)]
    pmulhrsw m27, m12 ; t43a t52a
    vpbroadcastd m12, [o(pw_m301_4085x8)]
    pmulhrsw m28, m8 ; t44a t51a
    pmulhrsw m15, m9 ; t45a t50a
    pmulhrsw m20, m11 ; t46a t49a
    pmulhrsw m23, m12 ; t47a t48a
    ; First butterfly stage (saturating add/subtract of neighbouring terms).
    psubsw m8, m22, m21 ; t33 t62
    paddsw m22, m21 ; t32 t63
    psubsw m21, m29, m14 ; t34 t61
    paddsw m29, m14 ; t35 t60
    psubsw m14, m26, m17 ; t37 t58
    paddsw m26, m17 ; t36 t59
    psubsw m17, m25, m18 ; t38 t57
    paddsw m25, m18 ; t39 t56
    psubsw m18, m24, m19 ; t41 t54
    paddsw m24, m19 ; t40 t55
    psubsw m19, m27, m16 ; t42 t53
    paddsw m27, m16 ; t43 t52
    psubsw m16, m28, m15 ; t45 t50
    paddsw m28, m15 ; t44 t51
    psubsw m15, m23, m20 ; t46 t49
    paddsw m20, m23 ; t47 t48
.main_oddhalf2:
    ; Shared tail: .main_oddhalf_fast jumps here after filling the same
    ; registers. ITX_MUL2X_PACK is defined elsewhere in the file.
    ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a
    ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a
    ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a
    ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
    ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a
    ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
    ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a
    ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
    vpbroadcastd m11, [o(pw_m4017_799)]
    psubsw m23, m25, m26 ; t36a t59a
    paddsw m25, m26 ; t39a t56a
    psubsw m26, m24, m27 ; t43a t52a
    paddsw m27, m24 ; t40a t55a
    psubsw m24, m20, m28 ; t44a t51a
    paddsw m20, m28 ; t47a t48a
    psubsw m28, m8, m21 ; t34 t61
    paddsw m8, m21 ; t33 t62
    psubsw m21, m17, m14 ; t37 t58
    paddsw m17, m14 ; t38 t57
    psubsw m14, m18, m19 ; t42 t53
    paddsw m18, m19 ; t41 t54
    psubsw m19, m15, m16 ; t45 t50
    paddsw m15, m16 ; t46 t49
    psubsw m16, m22, m29 ; t35a t60a
    paddsw m22, m29 ; t32a t63a
    ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60
    ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
    ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59
    ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
    vpbroadcastd m11, [o(pw_m2276_3406)]
    ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52
    ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
    ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51
    ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
    vpbroadcastd m11, [o(pw_1567_3784)]
    vpbroadcastd m12, [o(pw_m3784_1567)]
    psubsw m29, m22, m25 ; t39 t56
    paddsw m22, m25 ; t32 t63
    psubsw m25, m20, m27 ; t40 t55
    paddsw m20, m27 ; t47 t48
    psubsw m27, m8, m17 ; t38a t57a
    paddsw m8, m17 ; t33a t62a
    psubsw m17, m15, m18 ; t41a t54a
    paddsw m15, m18 ; t46a t49a
    paddsw m18, m16, m23 ; t35a t60a
    psubsw m16, m23 ; t36a t59a
    psubsw m23, m24, m26 ; t43a t52a
    paddsw m24, m26 ; t44a t51a
    paddsw m26, m28, m21 ; t34 t61
    psubsw m28, m21 ; t37 t58
    psubsw m21, m19, m14 ; t42 t53
    paddsw m19, m14 ; t45 t50
    ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
    ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57
    ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59
    ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
    vpbroadcastd m11, [o(pw_m1567_m3784)]
    ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
    ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54
    ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52
    ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
    vbroadcasti32x4 m13, [o(deint_shuf)]
    vpbroadcastd m11, [o(pw_2896_2896)]
    vpbroadcastd m12, [o(pw_m2896_2896)]
    paddsw m14, m22, m20 ; t32a t63a
    psubsw m22, m20 ; t47a t48a
    psubsw m20, m8, m15 ; t46 t49
    paddsw m8, m15 ; t33 t62
    paddsw m15, m18, m24 ; t35 t60
    psubsw m18, m24 ; t44 t51
    psubsw m24, m26, m19 ; t45a t50a
    paddsw m26, m19 ; t34a t61a
    REPX {pshufb x, m13}, m14, m8, m15, m26
    psubsw m19, m29, m25 ; t40 t55
    paddsw m25, m29 ; t39 t56
    psubsw m29, m27, m17 ; t41a t54a
    paddsw m27, m17 ; t38a t57a
    psubsw m17, m16, m23 ; t43a t52a
    paddsw m16, m23 ; t36a t59a
    psubsw m9, m28, m21 ; t42 t53
    paddsw m28, m21 ; t37 t58
    REPX {pshufb x, m13}, m25, m27, m16, m28
    ; m13 (the shuffle mask) is free again from here and is reused as an
    ; ITX_MUL2X_PACK operand; pairs of macro results are then merged with
    ; packssdw.
    ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48
    ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
    packssdw m21, m22 ; t47 t46a
    packssdw m13, m23 ; t48 t49a
    ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
    ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50
    packssdw m20, m18 ; t44a t45
    packssdw m22, m23 ; t51a t50
    ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
    ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54
    packssdw m18, m19 ; t40a t41
    packssdw m24, m23 ; t55a t54
    ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52
    ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a
    packssdw m19, m17 ; t43 t42a
    packssdw m23, m29 ; t52 t53a
    punpcklqdq m17, m25, m27 ; t39 t38a
    punpckhqdq m25, m27 ; t56 t57a
    punpckhqdq m27, m15, m26 ; t60 t61a
    punpcklqdq m15, m26 ; t35 t34a
    punpckhqdq m26, m16, m28 ; t59a t58
    punpcklqdq m16, m28 ; t36a t37
    punpckhqdq m28, m14, m8 ; t63a t62
    punpcklqdq m14, m8 ; t32a t33
    ; Final combine with m0-m7 (set up by the caller before this routine
    ; runs): sums stay in m0-m7, differences go to m22-m29. m14-m21 are
    ; left holding the remaining terms for the caller.
    psubsw m29, m0, m28 ; out63 out62
    paddsw m0, m28 ; out0 out1
    psubsw m28, m1, m27 ; out60 out61
    paddsw m1, m27 ; out3 out2
    psubsw m27, m2, m26 ; out59 out58
    paddsw m2, m26 ; out4 out5
    psubsw m26, m3, m25 ; out56 out57
    paddsw m3, m25 ; out7 out6
    psubsw m25, m4, m24 ; out55 out54
    paddsw m4, m24 ; out8 out9
    psubsw m24, m5, m23 ; out52 out53
    paddsw m5, m23 ; out11 out10
    psubsw m23, m6, m22 ; out51 out50
    paddsw m6, m22 ; out12 out13
    psubsw m22, m7, m13 ; out48 out49
    paddsw m7, m13 ;
out15 out14
    ret ; end of the preceding routine

; 64x16 DCT/DCT inverse transform + add, 8 bpc (per the name).
; Arguments: dst, stride, c, eob.
;   eob == 0   -> DC-only path (falls through into .dconly below)
;   eob <  151 -> .fast ("bottom half is zero")
;   otherwise  -> full first pass
cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
%undef cmp
    lea r5, [o_base] ; base pointer for the o() constant addressing macro
    test eobd, eobd
    jnz .normal
    ; DC-only: take the DC coefficient and clear it in memory (eobd is
    ; zero on this path).
    movsx r6d, word [cq]
    mov [cq], eobd
    or r3d, 16 ; loop counter for .dconly_loop; r3d is presumably the
               ; (zero) eob register, giving 16 rows - confirm
.dconly:
    ; first scaling step: (dc*181 + 640) >> 10
    imul r6d, 181
    add r6d, 128+512
    sar r6d, 8+2
.dconly2:
    ; second scaling step: (dc*181 + 2176) >> 12
    imul r6d, 181
    add r6d, 128+2048
    sar r6d, 8+4
    pxor m2, m2
    vpbroadcastw m3, r6d ; DC offset replicated to every word
.dconly_loop:
    ; One 64-pixel row per iteration: widen bytes to words against zero,
    ; add the DC offset, pack back with unsigned saturation.
    mova m1, [dstq]
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    paddw m0, m3
    paddw m1, m3
    packuswb m0, m1
    mova [dstq], m0
    add dstq, strideq
    dec r3d
    jg .dconly_loop
    RET
.normal:
    WIN64_SPILL_XMM 31
    mova m19, [o(dup16_perm)] ; first vpermb index table
    mova m24, [cq+64* 2]
    mova m28, [cq+64* 6]
    mova m26, [cq+64* 4]
    mova m22, [cq+64* 0]
    mova m23, [cq+64* 1]
    mova m29, [cq+64* 7]
    mova m27, [cq+64* 5]
    mova m25, [cq+64* 3]
    vpermb m8, m19, m24 ; 4
    vpermb m1, m19, m28 ; 12
    vpermb m7, m19, m26 ; 8
    vpermb m9, m19, m22 ; __ 0
    vpermb m21, m19, m23 ; 2
    vpermb m15, m19, m29 ; 14
    vpermb m18, m19, m27 ; 10
    vpermb m14, m19, m25 ; 6
    pslld m9, 16
    ; m30 = m19 with the pb_32 dword OR-ed into every dword: a second
    ; index table. Presumably this selects the matching bytes from the
    ; upper 32 bytes of each row - verify against dup16_perm and pb_32.
    vpord m30, m19, [o(pb_32)] {1to16}
    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
    cmp eobd, 151
    jb .fast
    vpermb m0, m19, [cq+64*14] ; 28
    vpermb m5, m19, [cq+64*10] ; 20
    vpermb m3, m19, [cq+64*12] ; 24
    vpermb m6, m19, [cq+64* 8] ; __ 16
    pslld m6, 16
    call m(idct_16x16_internal_8bpc).main_fast
    vpermb m17, m19, [cq+64*15] ; 30
    vpermb m20, m19, [cq+64* 9] ; 18
    vpermb m16, m19, [cq+64*11] ; 22
    vpermb m19, m19, [cq+64*13] ; 26
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; Save m14-m21 to cq rows 0..7 (read back in .pass1_end); rows 8..15
    ; still hold input and are permuted with m30 next.
    mova [cq+64* 0], m14
    mova [cq+64* 1], m15
    mova [cq+64* 2], m16
    mova [cq+64* 3], m17
    mova [cq+64* 4], m18
    mova [cq+64* 5], m19
    mova [cq+64* 6], m20
    mova [cq+64* 7], m21
    vpermb m21, m30, [cq+64*15]
    vpermb m14, m30, [cq+64* 8]
    vpermb m17, m30, [cq+64*11]
    vpermb m18, m30, [cq+64*12]
    vpermb m19, m30, [cq+64*13]
    vpermb m16, m30, [cq+64*10]
    vpermb m15, m30, [cq+64* 9]
    vpermb m20, m30, [cq+64*14]
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
    jmp .end
.fast: ; bottom half is zero
    call m(idct_16x16_internal_8bpc).main_fast2
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
    mova [cq+64* 0], m14
    mova [cq+64* 1], m15
    mova [cq+64* 2], m16
    mova [cq+64* 3], m17
    mova [cq+64* 4], m18
    mova [cq+64* 5], m19
    mova [cq+64* 6], m20
    mova [cq+64* 7], m21
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
.end:
    ; Save m4-m7 and m26-m29 to cq rows 8..15; .pass1_end overwrites these
    ; registers and the rows are read back with pmulhrsw below.
    mova [cq+64* 8], m4
    mova [cq+64* 9], m5
    mova [cq+64*10], m6
    mova [cq+64*11], m7
    mova [cq+64*12], m26
    mova [cq+64*13], m27
    mova [cq+64*14], m28
    mova [cq+64*15], m29
    vpbroadcastd m13, [o(pw_8192)] ; first-pass pmulhrsw multiplier
    call .pass1_end
    call .pass2
    ; Park the first .pass2 result (m0-m7) in cq rows 0..7; the first
    ; eight IDCT_64x16_END invocations read them back.
    mova [cq+64* 0], m0
    mova [cq+64* 1], m1
    mova [cq+64* 2], m2
    mova [cq+64* 3], m3
    mova [cq+64* 4], m4
    mova [cq+64* 5], m5
    mova [cq+64* 6], m6
    mova [cq+64* 7], m7
    ; Scale the remaining first-pass data by m13 for the second .pass2
    ; call, and scale m14-m21 from the first call by m30 into m22-m29.
    pmulhrsw m0, m13, [cq+64* 8]
    pmulhrsw m1, m13, [cq+64* 9]
    pmulhrsw m2, m13, [cq+64*10]
    pmulhrsw m3, m13, [cq+64*11]
    vpbroadcastd m30, [o(pw_2048)] ; output pmulhrsw multiplier
    pmulhrsw m4, m13, m22
    pmulhrsw m5, m13, m23
    pmulhrsw m6, m13, m24
    pmulhrsw m7, m13, m25
    pmulhrsw m22, m30, m14
    pmulhrsw m14, m13, m26
    pmulhrsw m23, m30, m15
    pmulhrsw m15, m13, m27
    pmulhrsw m24, m30, m16
    pmulhrsw m16, m13, m28
    pmulhrsw m25, m30, m17
    pmulhrsw m17, m13, m29
    pmulhrsw m26, m30, m18
    pmulhrsw m18, m13, [cq+64*12]
    pmulhrsw m27, m30, m19
    pmulhrsw m19, m13, [cq+64*13]
    pmulhrsw m28, m30, m20
    pmulhrsw m20, m13, [cq+64*14]
    pmulhrsw m29, m30, m21
    pmulhrsw m21, m13, [cq+64*15]
    call .transpose_round
    call .pass2
    pxor m10, m10
    lea r3, [strideq*3]
; IDCT_64x16_END n, hi, lo, dstoff
; Writes one 64-pixel output row at dstq+dstoff.
;  - n < 8: the low-half addend m%3 is first loaded from cq row n and scaled
;    by m30; otherwise m%3 is used as it is.
;  - m%2 (high-half addend) is always scaled by m30.
;  - Coefficient row n is cleared with m10 (zero), which also serves as the
;    zero source when widening the destination bytes to words.
;  - After rows 3, 7 and 11 dstq advances by four rows.
; Clobbers m8 and m9.
%macro IDCT_64x16_END 4
    mova m9, [dstq+%4]
%if %1 < 8
    pmulhrsw m%3, m30, [cq+64*%1]
%endif
    pmulhrsw m%2, m30
    mova [cq+64*%1], m10
    punpcklbw m8, m9, m10
    punpckhbw m9, m10
    paddw m8, m%3
    paddw m9, m%2
    packuswb m8, m9
    mova [dstq+%4], m8
%if %1 == 3 || %1 == 7 || %1 == 11
    lea dstq, [dstq+strideq*4]
%endif
%endmacro
    IDCT_64x16_END 0, 0, 11, strideq*0
    IDCT_64x16_END 1, 1, 11, strideq*1
    IDCT_64x16_END 2, 2, 11, strideq*2
    IDCT_64x16_END 3, 3, 11, r3
    IDCT_64x16_END 4, 4, 11, strideq*0
    IDCT_64x16_END 5, 5, 11, strideq*1
    IDCT_64x16_END 6, 6, 11, strideq*2
    IDCT_64x16_END 7, 7, 11, r3
    IDCT_64x16_END 8, 14, 22, strideq*0
    IDCT_64x16_END 9,
15, 23, strideq*1 ; remaining operands of the IDCT_64x16_END that ends the previous line
    IDCT_64x16_END 10, 16, 24, strideq*2
    IDCT_64x16_END 11, 17, 25, r3
    IDCT_64x16_END 12, 18, 26, strideq*0
    IDCT_64x16_END 13, 19, 27, strideq*1
    IDCT_64x16_END 14, 20, 28, strideq*2
    IDCT_64x16_END 15, 21, 29, r3
    RET
ALIGN function_align
; .pass1_end: reloads the eight vectors parked in cq rows 0..7, combines
; them with m14-m21 by saturating add/subtract (see the outNN comments),
; scales part of the result by m13, then falls through into
; .transpose_round.
.pass1_end:
    mova m4, [cq+64* 0]
    mova m5, [cq+64* 1]
    mova m6, [cq+64* 2]
    mova m7, [cq+64* 3]
    mova m8, [cq+64* 4]
    mova m9, [cq+64* 5]
    mova m11, [cq+64* 6]
    mova m12, [cq+64* 7]
    psubsw m29, m4, m21 ; out47 out46
    paddsw m4, m21 ; out16 out17
    psubsw m28, m5, m20 ; out44 out45
    paddsw m5, m20 ; out19 out18
    REPX {pmulhrsw x, m13}, m0, m1, m2, m3
    psubsw m27, m6, m19 ; out43 out42
    paddsw m6, m19 ; out20 out21
    psubsw m26, m7, m18 ; out40 out41
    paddsw m7, m18 ; out23 out22
    pmulhrsw m18, m13, m22
    pmulhrsw m19, m13, m23
    pmulhrsw m20, m13, m24
    pmulhrsw m21, m13, m25
    paddsw m25, m12, m14 ; out31 out30
    psubsw m14, m12, m14 ; out32 out33
    paddsw m24, m11, m15 ; out28 out29
    psubsw m15, m11, m15 ; out35 out34
    REPX {pmulhrsw x, m13}, m4, m5, m6, m7
    paddsw m23, m9, m16 ; out27 out26
    psubsw m16, m9, m16 ; out36 out37
    paddsw m22, m8, m17 ; out24 out25
    psubsw m17, m8, m17 ; out39 out38
    REPX {pmulhrsw x, m13}, m14, m15, m16, m17
; .transpose_round: also called directly from the main body. Transposes
; four groups of four registers with TRANSPOSE_8x4_PACKED, then regroups
; 256-bit halves across register pairs. Despite the name there is no
; scaling in this part; callers scale beforehand.
.transpose_round:
; TRANSPOSE_8x4_PACKED a, b, c, d
; Three rounds of word interleaves across the four registers (lane layout
; in the per-line comments). Clobbers m8.
%macro TRANSPOSE_8x4_PACKED 4
    punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
    punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3
    punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
    punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3
    punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
    punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1
    punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3
    punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1
    punpcklwd m%3, m%4, m%2 ; 2
    punpckhwd m%4, m%2 ; 3
    punpckhwd m%2, m%1, m8 ; 1
    punpcklwd m%1, m8 ; 0
%endmacro
    TRANSPOSE_8x4_PACKED 0, 1, 2, 3
    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
    TRANSPOSE_8x4_PACKED 4, 5, 6, 7
    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
    vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03
    vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01
    vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13
    vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11
    vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23
    vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21
    vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33
    vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31
    vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03
    vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01
    vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13
    vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11
    vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23
    vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21
    vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33
    vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
    ret
; .pass2: finishes the transpose with 128-bit lane shuffles so that each
; register holds one numbered row (see comments), calls the 32x8 helper and
; then jumps to the 32x16 odd-half routine as a tail call - presumably that
; routine's own ret returns to .pass2's caller; confirm at its definition.
.pass2:
    vshufi32x4 m7, m5, m19, q3131 ; 14
    vshufi32x4 m5, m19, q2020 ; 10
    vshufi32x4 m21, m6, m20, q3131 ; 15
    vshufi32x4 m19, m6, m20, q2020 ; 11
    vshufi32x4 m20, m4, m18, q3131 ; 13
    vshufi32x4 m18, m4, m18, q2020 ; 9
    vshufi32x4 m6, m8, m2, q3131 ; 12
    vshufi32x4 m4, m8, m2, q2020 ; 8
    vshufi32x4 m2, m0, m3, q3131 ; 4
    vshufi32x4 m0, m3, q2020 ; 0
    vshufi32x4 m3, m1, m16, q3131 ; 6
    vshufi32x4 m1, m16, q2020 ; 2
    vshufi32x4 m16, m9, m15, q3131 ; 5
    vshufi32x4 m14, m9, m15, q2020 ; 1
    vshufi32x4 m15, m11, m17, q2020 ; 3
    vshufi32x4 m17, m11, m17, q3131 ; 7
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf

; 32x64 DCT/DCT inverse transform + add, 8 bpc (per the name). Only the
; start of this function is inside this block; the rest follows below.
; eob == 0 branches to .dconly before the real prologue runs; eob < 136
; takes .fast. On the full path every coefficient row is scaled by
; pw_2896x8 (held in m23) as it is loaded.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
    lea r5, [o_base]
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
    vpbroadcastd m23, [o(pw_2896x8)]
%undef cmp
    cmp eobd, 136
    jb .fast
    pmulhrsw m5, m23, [cq+64*20]
    pmulhrsw m3, m23, [cq+64*12]
    pmulhrsw m1, m23, [cq+64* 4]
    pmulhrsw m7, m23, [cq+64*28]
    pmulhrsw m2, m23, [cq+64* 8]
    pmulhrsw m6, m23, [cq+64*24]
    pmulhrsw m0, m23, [cq+64* 0]
    pmulhrsw m4, m23, [cq+64*16]
    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
    pmulhrsw m14, m23, [cq+64* 2]
    pmulhrsw m21, m23, [cq+64*30]
    pmulhrsw m18, m23, [cq+64*18]
    pmulhrsw m17, m23, [cq+64*14]
    pmulhrsw m16, m23, [cq+64*10]
    pmulhrsw m19, m23, [cq+64*22]
    pmulhrsw m20, m23, [cq+64*26]
    pmulhrsw m15, m23, [cq+64* 6]
    call
m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 pmulhrsw m22, m23, [cq+64* 1] pmulhrsw m21, m23, [cq+64*31] pmulhrsw m14, m23, [cq+64*17] pmulhrsw m29, m23, [cq+64*15] pmulhrsw m26, m23, [cq+64* 9] pmulhrsw m17, m23, [cq+64*23] pmulhrsw m18, m23, [cq+64*25] pmulhrsw m25, m23, [cq+64* 7] pmulhrsw m24, m23, [cq+64* 5] pmulhrsw m19, m23, [cq+64*27] pmulhrsw m16, m23, [cq+64*21] pmulhrsw m27, m23, [cq+64*11] pmulhrsw m28, m23, [cq+64*13] pmulhrsw m15, m23, [cq+64*19] pmulhrsw m20, m23, [cq+64*29] pmulhrsw m23, [cq+64* 3] call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf vpbroadcastd m12, [o(pw_16384)] psubsw m13, m0, m29 ; 31 paddsw m0, m29 ; 0 psubsw m29, m1, m28 ; 30 paddsw m1, m28 ; 1 psubsw m28, m2, m27 ; 29 paddsw m2, m27 ; 2 psubsw m27, m3, m26 ; 28 paddsw m3, m26 ; 3 psubsw m26, m4, m25 ; 27 paddsw m4, m25 ; 4 psubsw m25, m5, m24 ; 26 paddsw m5, m24 ; 5 psubsw m24, m6, m23 ; 25 paddsw m6, m23 ; 6 psubsw m23, m7, m22 ; 24 paddsw m7, m22 ; 7 pxor m9, m9 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 punpckhwd m3, m23, m24 punpcklwd m23, m24 punpckhwd m24, m25, m26 punpcklwd m25, m26 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 punpckhwd m26, m27, m28 punpcklwd m27, m28 punpckhwd m28, m29, m13 punpcklwd m29, m13 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m6 ; e0 
f0 g0 h0 e1 f1 g1 h1 REPX {pmulhrsw x, m12}, m7, m0, m2, m4 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 REPX {pmulhrsw x, m12}, m6, m8, m1, m22 punpckhdq m13, m23, m25 punpckldq m23, m25 punpckhdq m25, m27, m29 punpckldq m27, m29 REPX {pmulhrsw x, m12}, m13, m23, m25, m27 punpckhdq m9, m3, m24 punpckldq m3, m24 punpckhdq m24, m26, m28 punpckldq m26, m28 REPX {pmulhrsw x, m12}, m9, m3, m24, m26 punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 punpcklqdq m23, m27 ; d00 d08 d16 d24 punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 punpckhqdq m13, m25 ; d03 d11 d19 d27 punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 punpckhqdq m3, m26 ; d05 d13 d21 d29 punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 punpckhqdq m9, m24 ; d07 d15 d23 d31 mova [cq+64* 3], m23 mova [cq+64*13], m27 mova [cq+64* 7], m25 mova [cq+64*15], m26 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 punpcklqdq m8, m22 ; a04 a12 a20 a28 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 punpcklqdq m0, m4 ; a00 a08 a16 a24 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 punpcklqdq m7, m2 ; a02 a10 a18 a26 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 punpcklqdq m6, m1 ; a06 a14 a22 a30 mova [cq+64* 1], m0 mova [cq+64* 9], m7 mova [cq+64* 5], m8 mova [cq+64*11], m6 mova m2, [cq+64* 0] mova m11, [cq+64* 2] mova m8, [cq+64* 4] mova m29, [cq+64* 6] mova m27, [cq+64* 8] mova m26, [cq+64*10] mova m4, [cq+64*12] mova m28, [cq+64*14] psubsw m1, m2, m21 ; 23 paddsw m2, m21 ; 8 psubsw m21, m11, m20 ; 22 paddsw m11, m20 ; 9 psubsw m20, m8, m19 ; 21 paddsw m8, m19 ; 10 psubsw m19, m29, m18 ; 20 paddsw m29, m18 ; 11 psubsw m18, m27, m17 ; 19 paddsw m27, m17 ; 12 psubsw m17, m26, m16 ; 18 paddsw m26, m16 ; 13 psubsw m16, m4, m15 ; 17 paddsw m4, m15 ; 14 psubsw m15, m28, m14 ; 16 paddsw m28, m14 ; 15 punpcklwd m14, m15, m16 punpckhwd m15, m16 punpckhwd m16, m17, m18 punpcklwd m17, m18 punpckhwd m18, m19, m20 punpcklwd m19, m20 punpckhwd 
m20, m21, m1 punpcklwd m21, m1 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 REPX {pmulhrsw x, m12}, m28, m2, m8, m27 punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 REPX {pmulhrsw x, m12}, m4, m1, m11, m29 punpckhdq m26, m19, m21 punpckldq m19, m21 punpckhdq m21, m15, m16 punpckldq m15, m16 REPX {pmulhrsw x, m12}, m26, m19, m21, m15 punpckhdq m16, m18, m20 punpckldq m18, m20 punpckhdq m20, m14, m17 punpckldq m14, m17 REPX {pmulhrsw x, m12}, m16, m18, m20, m14 punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 punpcklqdq m28, m8 ; b02 b10 b18 b26 punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 punpcklqdq m2, m27 ; b00 b08 b16 b24 punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 punpckhqdq m1, m29 ; b05 b13 b21 b29 punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 punpckhqdq m4, m11 ; b07 b15 b23 b31 mova [cq+64* 0], m2 mova [cq+64* 8], m28 mova [cq+64* 4], m27 mova [cq+64*10], m29 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 punpcklqdq m20, m26 ; c02 c10 c18 c26 punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 punpcklqdq m14, m19 ; c00 c08 c16 c24 punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 punpcklqdq m15, m18 ; c04 c12 c20 c28 punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 punpcklqdq m21, m16 ; c06 c14 c22 c30 mova [cq+64* 2], m14 mova [cq+64*12], m20 mova [cq+64* 6], m15 mova [cq+64*14], m21 vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 vinserti32x8 m22, ym8, 1 ; a01 
a09 b01 b09 vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 mov r4, rsp vshufi32x4 m0, m22, m19, q2020 ; 1 vshufi32x4 m1, m17, m29, q3131 ; 31 vshufi32x4 m2, m14, m26, q2020 ; 17 vshufi32x4 m3, m25, m18, q3131 ; 15 call .main_part1 vshufi32x4 m0, m25, m18, q2020 ; 7 vshufi32x4 m1, m14, m26, q3131 ; 25 vshufi32x4 m2, m17, m29, q2020 ; 23 vshufi32x4 m3, m22, m19, q3131 ; 9 call .main_part1 vshufi32x4 m0, m24, m21, q2020 ; 5 vshufi32x4 m1, m15, m27, q3131 ; 27 vshufi32x4 m2, m16, m28, q2020 ; 21 vshufi32x4 m3, m23, m20, q3131 ; 11 call .main_part1 vshufi32x4 m0, m23, m20, q2020 ; 3 vshufi32x4 m1, m16, m28, q3131 ; 29 vshufi32x4 m2, m15, m27, q2020 ; 19 vshufi32x4 m3, m24, m21, q3131 ; 13 call .main_part1 call .main_part2 mova m0, [cq+64* 1] ; a0 mova m15, [cq+64* 0] ; b0 mova m3, [cq+64* 2] ; c0 mova m16, [cq+64* 3] ; d0 mova m14, [cq+64* 5] ; a4 mova m8, [cq+64* 4] ; b4 mova m17, [cq+64* 6] ; c4 mova m1, [cq+64* 7] ; d4 vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 vshufi32x4 m1, m0, m3, q3131 ; 8 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m2, m15, q3131 ; 24 
vshufi32x4 m2, m15, q2020 ; 16 vshufi32x4 m15, m14, m17, q3131 ; 12 vshufi32x4 m14, m17, q2020 ; 4 vshufi32x4 m17, m16, m8, q3131 ; 28 vshufi32x4 m16, m8, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m8, [cq+64* 8] mova m9, [cq+64*12] mova m11, [cq+64*10] mova m12, [cq+64*14] mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 mova m22, [cq+64* 9] mova m27, [cq+64*13] mova m23, [cq+64*11] mova m24, [cq+64*15] vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 vshufi32x4 m28, m26, m8, q3131 ; 26 vshufi32x4 m26, m8, q2020 ; 18 vshufi32x4 m24, m22, m9, q3131 ; 10 vshufi32x4 m22, m9, q2020 ; 2 vshufi32x4 m29, m27, m11, q3131 ; 30 vshufi32x4 m27, m11, q2020 ; 22 vshufi32x4 m25, m23, m12, q3131 ; 14 vshufi32x4 m23, m12, q2020 ; 6 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast jmp .end .fast: ; bottom/right halves are zero pmulhrsw ym9, ym23, [cq+64* 0] pmulhrsw ym6, ym23, [cq+64* 8] mova m14, [o(dup16_perm)] pmulhrsw ym8, ym23, [cq+64* 2] pmulhrsw xm0, xm23, [cq+64*14] pmulhrsw xm5, xm23, [cq+64*10] pmulhrsw ym1, ym23, [cq+64* 6] pmulhrsw ym7, ym23, [cq+64* 4] pmulhrsw xm3, xm23, [cq+64*12] pmovzxwd m9, ym9 pmovzxwd m6, ym6 vpermb m8, m14, m8 punpcklwd xm0, xm0 vpermb ym5, ym14, ym5 vpermb m1, m14, m1 vpermb m7, m14, m7 punpcklwd xm3, xm3 pslld m9, 16 pslld m6, 16 call m(idct_16x16_internal_8bpc).main_fast vpmulhrsw ym21, ym23, [cq+64* 1] {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to {evex}vpmulhrsw ym15, ym23, 
[cq+64* 7] ; compressed displacements {evex}vpmulhrsw ym18, ym23, [cq+64* 5] {evex}vpmulhrsw xm16, xm23, [cq+64*11] {evex}vpmulhrsw xm19, xm23, [cq+64*13] {evex}vpmulhrsw ym23, [cq+64* 3] vpermb m21, m14, m21 punpcklwd xm17, xm17 vpermb ym20, ym14, ym20 vpermb m15, m14, m15 vpermb m18, m14, m18 vpermb ym16, ym14, ym16 punpcklwd xm19, xm19 vpermb m14, m14, m23 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m9, [o(pw_16384)] call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round vshufi32x4 m16, m0, m3, q2020 ; 0 vshufi32x4 m26, m0, m3, q3131 ; 4 vshufi32x4 m0, m14, m2, q2020 ; 1 vshufi32x4 m14, m2, q3131 ; 5 vshufi32x4 m3, m19, m7, q3131 ; 15 vshufi32x4 m19, m7, q2020 ; 11 vshufi32x4 m27, m17, m9, q2020 ; 3 vshufi32x4 m17, m9, q3131 ; 7 vshufi32x4 m28, m20, m6, q2020 ; 9 vshufi32x4 m20, m6, q3131 ; 13 vshufi32x4 m22, m1, m18, q2020 ; 2 vshufi32x4 m23, m1, m18, q3131 ; 6 vshufi32x4 m24, m5, m15, q2020 ; 10 vshufi32x4 m25, m5, m15, q3131 ; 14 vshufi32x4 m15, m21, m4, q3131 ; 12 vshufi32x4 m21, m21, m4, q2020 ; 8 mov r4, rsp call .main_part1_fast mova m0, m17 mova m3, m28 call .main_part1_fast mova m0, m14 mova m3, m19 call .main_part1_fast mova m0, m27 mova m3, m20 call .main_part1_fast call .main_part2 mova m0, m16 mova m1, m21 mova m14, m26 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [cq+64*14], m21 mova [cq+64* 0], m14 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64* 4], m16 mova [cq+64* 2], m15 mova [cq+64*12], m20 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 .end: lea r4, [strideq*3] vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r5, [r4+strideq] ; stride*4 lea r3, [dstq+r4*8] lea r6, [strideq+r5*8] ; stride*33 lea r8, [r4+r5*8] ; stride*35 add r3, r5 ; dst+stride*28 lea r7, [r6+strideq] ; stride*34 %macro IDCT_32x64_END 6 ; src, mem, stride[1-4] %if %2 < 8 paddsw m10, m%2, m%1 psubsw m11, m%2, m%1 %else mova m11, [cq+64*(%2*2-16)] paddsw m10, m11, m%1 psubsw m11, 
m%1 %endif mova m9, [rsp+64*(31-%2)] mova m%1, [rsp+64*%2] paddsw m8, m10, m9 psubsw m10, m9 paddsw m9, m11, m%1 pmovzxbw m0, [dstq+%3] psubsw m11, m%1 pmovzxbw m%1, [r3 +%4] REPX {pmulhrsw x, m12}, m8, m10, m9, m11 paddw m8, m0 pmovzxbw m0, [r3 +%5] paddw m10, m%1 pmovzxbw m%1, [dstq+%6] paddw m9, m0 paddw m11, m%1 %if %2 >= 8 %if %2 == 8 pxor m1, m1 %endif mova [cq+64*(%2*2-16)], m1 mova [cq+64*(%2*2-15)], m1 %endif packuswb m8, m10 packuswb m9, m11 vpermq m8, m13, m8 vpermq m9, m13, m9 mova [dstq+%3], ym8 vextracti32x8 [r3 +%4], m8, 1 mova [r3 +%5], ym9 vextracti32x8 [dstq+%6], m9, 1 %if %2 == 3 || %2 == 7 || %2 == 11 add dstq, r5 sub r3, r5 %endif %endmacro IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 64 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align ; bottom three-quarters are zero cglobal_label .main_part1_fast2 vpbroadcastd m7, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] pmulhrsw m7, m0 ; t63a pmulhrsw m0, m8 ; t32a punpcklwd m4, m0, m7 punpckhwd m6, m0, m7 mova m1, m10 vpdpwssd m1, m4, [o(idct64_mul+4*9)] {bcstd} mova m9, m10 vpdpwssd m9, m6, 
[o(idct64_mul+4*9)] {bcstd} REPX {psrad x, 12}, m1, m9 packssdw m1, m9 mova m9, m10 vpdpwssd m9, m6, [o(idct64_mul+4*8)] {bcstd} mova m6, m10 vpdpwssd m6, m4, [o(idct64_mul+4*8)] {bcstd} REPX {psrad x, 12}, m9, m6 packssdw m6, m9 mova m4, m0 mova m3, m7 mova m5, m1 mova m2, m6 jmp .main_part1c cglobal_label .main_part1_fast vpbroadcastd m1, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] vpbroadcastd m2, [o(idct64_mul+4*6)] vpbroadcastd m9, [o(idct64_mul+4*7)] pmulhrsw m1, m0 ; t63a pmulhrsw m0, m8 ; t32a pmulhrsw m2, m3 ; t60a pmulhrsw m3, m9 ; t35a mova m8, m0 mova m7, m1 mova m6, m3 mova m5, m2 jmp .main_part1b cglobal_label .main_part1 ; idct64 steps 1-5: ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vpbroadcastd m7, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] vpbroadcastd m6, [o(idct64_mul+4*2)] vpbroadcastd m9, [o(idct64_mul+4*3)] pmulhrsw m7, m0 ; t63a vpbroadcastd m5, [o(idct64_mul+4*4)] pmulhrsw m0, m8 ; t32a vpbroadcastd m8, [o(idct64_mul+4*5)] pmulhrsw m6, m1 ; t62a vpbroadcastd m4, [o(idct64_mul+4*6)] pmulhrsw m1, m9 ; t33a vpbroadcastd m9, [o(idct64_mul+4*7)] pmulhrsw m5, m2 ; t61a pmulhrsw m2, m8 ; t34a pmulhrsw m4, m3 ; t60a pmulhrsw m3, m9 ; t35a psubsw m8, m0, m1 ; t33 paddsw m0, m1 ; t32 psubsw m1, m7, m6 ; t62 paddsw m7, m6 ; t63 psubsw m6, m3, m2 ; t34 paddsw m3, m2 ; t35 psubsw m2, m4, m5 ; t61 paddsw m5, m4 ; t60 .main_part1b: vpbroadcastd m11, [o(idct64_mul+4*8)] vpbroadcastd m12, [o(idct64_mul+4*9)] ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a vpbroadcastd m11, [o(idct64_mul+4*10)] ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a psubsw m4, m0, m3 ; t35a paddsw m0, m3 ; t32a psubsw m3, m7, m5 ; t60a paddsw m7, m5 ; t63a psubsw m5, m1, m2 ; t34 paddsw m1, m2 ; t33 psubsw m2, m8, m6 ; t61 paddsw m6, m8 ; t62 .main_part1c: vpbroadcastd m11, [o(idct64_mul+4*11)] vpbroadcastd 
m12, [o(idct64_mul+4*12)] add r5, 4*13 ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a mova [r4+64*0], m0 mova [r4+64*7], m7 mova [r4+64*1], m1 mova [r4+64*6], m6 mova [r4+64*3], m3 mova [r4+64*4], m4 mova [r4+64*2], m2 mova [r4+64*5], m5 add r4, 64*8 ret cglobal_label .main_part2 vpbroadcastd m11, [o(pw_1567_3784 -16*13)] vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] lea r6, [r4+64*7] vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] vpbroadcastd m18, [o(pw_2896_2896 -16*13)] vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] sub r5, 16*13 .main_part2_loop: mova m0, [r4-64*32] ; t32a mova m1, [r6-64*24] ; t39a mova m2, [r6-64*32] ; t63a mova m3, [r4-64*24] ; t56a mova m4, [r4-64*16] ; t40a mova m5, [r6-64* 8] ; t47a mova m6, [r6-64*16] ; t55a mova m7, [r4-64* 8] ; t48a psubsw m8, m0, m1 ; t39 paddsw m0, m1 ; t32 psubsw m1, m2, m3 ; t56 paddsw m2, m3 ; t63 psubsw m3, m5, m4 ; t40 paddsw m5, m4 ; t47 psubsw m4, m7, m6 ; t55 paddsw m7, m6 ; t48 ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a psubsw m6, m2, m7 ; t48a paddsw m2, m7 ; t63a psubsw m7, m0, m5 ; t47a paddsw m0, m5 ; t32a psubsw m5, m8, m3 ; t55 paddsw m8, m3 ; t56 psubsw m3, m1, m4 ; t40 paddsw m1, m4 ; t39 ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a mova [r6-64* 8], m2 mova [r4-64*32], m0 mova [r4-64* 8], m8 mova [r6-64*32], m1 mova [r6-64*24], m6 mova [r4-64*16], m7 mova [r4-64*24], m5 mova [r6-64*16], m3 add r4, 64 sub r6, 64 cmp r4, r6 jb .main_part2_loop ret cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob vpbroadcastd m23, [o(pw_2896x8)] %undef cmp cmp eobd, 136 jb .fast pmulhrsw m0, m23, [cq+64* 1] pmulhrsw m1, m23, [cq+64*31] pmulhrsw m2, m23, [cq+64*17] pmulhrsw m3, m23, [cq+64*15] vpbroadcastd m10, [o(pd_2048)] mov r4, rsp call 
m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 7] pmulhrsw m1, m23, [cq+64*25] pmulhrsw m2, m23, [cq+64*23] pmulhrsw m3, m23, [cq+64* 9] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 5] pmulhrsw m1, m23, [cq+64*27] pmulhrsw m2, m23, [cq+64*21] pmulhrsw m3, m23, [cq+64*11] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 3] pmulhrsw m1, m23, [cq+64*29] pmulhrsw m2, m23, [cq+64*19] pmulhrsw m3, m23, [cq+64*13] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 pmulhrsw m3, m23, [cq+64*24] pmulhrsw m1, m23, [cq+64* 8] pmulhrsw m2, m23, [cq+64*16] pmulhrsw m0, m23, [cq+64* 0] pmulhrsw m14, m23, [cq+64* 4] pmulhrsw m17, m23, [cq+64*28] pmulhrsw m16, m23, [cq+64*20] pmulhrsw m15, m23, [cq+64*12] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast pmulhrsw m22, m23, [cq+64* 2] pmulhrsw m29, m23, [cq+64*30] pmulhrsw m26, m23, [cq+64*18] pmulhrsw m25, m23, [cq+64*14] pmulhrsw m24, m23, [cq+64*10] pmulhrsw m27, m23, [cq+64*22] pmulhrsw m28, m23, [cq+64*26] pmulhrsw m23, [cq+64* 6] mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_16384)] call .pass1_end_part1 mova [cq+64*16], m1 mova [cq+64*17], m3 mova [cq+64*18], m5 mova [cq+64*19], m7 mova [cq+64*24], m23 mova [cq+64*25], m25 mova [cq+64*26], m27 mova [cq+64*27], m29 pmulhrsw m23, m13, m0 ; a0 pmulhrsw m25, m13, m2 ; a2 pmulhrsw m27, m13, m4 ; a4 pmulhrsw m29, m13, m6 ; a6 REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 call .pass1_end_part2 mova [cq+64*20], m15 mova [cq+64*21], m17 mova [cq+64*22], m19 mova [cq+64*23], m21 mova [cq+64*28], m1 mova [cq+64*29], m3 mova [cq+64*30], m5 mova [cq+64*31], m7 REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 REPX {pmulhrsw x, 
m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 vshufi32x4 m2, m3, m15, q3131 ; 8 vshufi32x4 m0, m3, m15, q2020 ; 0 vshufi32x4 m6, m23, m22, q3131 ; 24 vshufi32x4 m4, m23, m22, q2020 ; 16 vshufi32x4 m3, m1, m18, q3131 ; 12 vshufi32x4 m1, m18, q2020 ; 4 vshufi32x4 m7, m27, m26, q3131 ; 28 vshufi32x4 m5, m27, m26, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m16, m14, m17, q3131 ; 10 vshufi32x4 m14, m17, q2020 ; 2 vshufi32x4 m17, m19, m20, q3131 ; 14 vshufi32x4 m15, m19, m20, q2020 ; 6 vshufi32x4 m20, m25, m24, q3131 ; 26 vshufi32x4 m18, m25, m24, q2020 ; 18 vshufi32x4 m21, m29, m28, q3131 ; 30 vshufi32x4 m19, m29, m28, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf pmulhrsw m22, m13, [cq+64*16] ; a1 pmulhrsw m23, m13, [cq+64*20] ; c1 pmulhrsw m24, m13, [cq+64*24] ; e1 pmulhrsw m25, m13, [cq+64*28] ; g1 pmulhrsw m26, m13, [cq+64*17] ; a3 pmulhrsw m27, m13, [cq+64*21] ; c3 pmulhrsw m28, m13, [cq+64*25] ; e3 pmulhrsw m29, m13, [cq+64*29] ; g3 mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 pmulhrsw m14, m13, [cq+64*18] ; a5 pmulhrsw m15, m13, [cq+64*22] ; c5 pmulhrsw m16, m13, [cq+64*26] ; e5 
pmulhrsw m17, m13, [cq+64*30] ; g5 pmulhrsw m18, m13, [cq+64*19] ; a7 pmulhrsw m19, m13, [cq+64*23] ; c7 pmulhrsw m20, m13, [cq+64*27] ; e7 pmulhrsw m21, m13, [cq+64*31] ; g7 vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 mova [cq+64* 0], m0 mova [cq+64* 1], m1 mova [cq+64* 2], m2 mova [cq+64* 3], m3 mova [cq+64* 4], m4 mova [cq+64* 5], m5 mova [cq+64* 6], m6 mova [cq+64* 7], m7 vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 vshufi32x4 m27, m23, m11, q3131 ; 11 m27 vshufi32x4 m23, m11, q2020 ; 3 m23 vshufi32x4 m19, m26, m28, q3131 ; 27 m19 vshufi32x4 m15, m26, m28, q2020 ; 19 m15 vshufi32x4 m29, m25, m17, q3131 ; 15 m29 vshufi32x4 m25, m17, q2020 ; 7 m25 vshufi32x4 m21, m18, m20, q3131 ; 31 m21 vshufi32x4 m17, m18, m20, q2020 ; 23 m17 vshufi32x4 m20, m14, m16, q3131 ; 29 m20 vshufi32x4 m16, m14, m16, q2020 ; 21 m16 vshufi32x4 m18, m22, m24, q3131 ; 25 m18 vshufi32x4 m14, m22, m24, q2020 ; 17 m14 vshufi32x4 m26, m8, m9, q3131 ; 9 m26 vshufi32x4 m22, m8, m9, q2020 ; 1 m22 vshufi32x4 m28, m12, m13, q3131 ; 13 m28 vshufi32x4 m24, m12, m13, q2020 ; 5 m24 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf vpbroadcastd m13, [o(pw_16384)] pmulhrsw m0, m13, [r4-64*21] pmulhrsw m1, m13, [r4-64*22] pmulhrsw m2, m13, [r4-64*23] pmulhrsw m3, m13, [r4-64*24] pmulhrsw m4, m13, [r4-64*25] pmulhrsw m5, m13, [r4-64*26] pmulhrsw m6, m13, 
[r4-64*27] pmulhrsw m7, m13, [r4-64*28] mova [cq+64*16], m14 mova [cq+64*17], m15 mova [cq+64*18], m16 mova [cq+64*19], m17 mova [cq+64*20], m18 mova [cq+64*21], m19 mova [cq+64*22], m20 mova [cq+64*23], m21 pmulhrsw m14, m13, [r4-64*12] pmulhrsw m15, m13, [r4-64*11] pmulhrsw m16, m13, [r4-64*10] pmulhrsw m17, m13, [r4-64* 9] pmulhrsw m18, m13, [r4-64* 8] pmulhrsw m19, m13, [r4-64* 7] pmulhrsw m20, m13, [r4-64* 6] pmulhrsw m21, m13, [r4-64* 5] mova [cq+64*24], m22 mova [cq+64*25], m23 mova [cq+64*26], m24 mova [cq+64*27], m25 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call .transpose_2x8x8_lo mova [r4-64*12], m1 mova [r4-64*11], m3 mova [r4-64*10], m5 mova [r4-64* 9], m7 mova [r4-64* 8], m15 mova [r4-64* 7], m17 mova [r4-64* 6], m19 mova [r4-64* 5], m21 vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 pmulhrsw m0, m13, [r4-64*20] pmulhrsw m1, m13, [r4-64*19] pmulhrsw m2, m13, [r4-64*18] pmulhrsw m3, m13, [r4-64*17] pmulhrsw m4, m13, [r4-64*16] pmulhrsw m5, m13, [r4-64*15] pmulhrsw m6, m13, [r4-64*14] pmulhrsw m7, m13, [r4-64*13] pmulhrsw m14, m13, [r4-64*29] pmulhrsw m15, m13, [r4-64*30] pmulhrsw m16, m13, [r4-64*31] pmulhrsw m17, m13, [r4-64*32] pmulhrsw m18, m13, [r4-64*33] pmulhrsw m19, m13, [r4-64*34] pmulhrsw m20, m13, [r4-64*35] pmulhrsw m21, m13, [r4-64*36] call .transpose_2x8x8_lo mova [r4-64*20], m1 mova [r4-64*19], m3 mova [r4-64*18], m5 mova [r4-64*17], m7 mova [r4-64*16], m15 mova [r4-64*15], m17 mova [r4-64*14], m19 mova [r4-64*13], m21 vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 
vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 vshufi32x4 m2, m0, m22, q3131 ; 8 vshufi32x4 m0, m22, q2020 ; 0 vshufi32x4 m3, m1, m26, q3131 ; 12 vshufi32x4 m1, m26, q2020 ; 4 vshufi32x4 m6, m4, m23, q3131 ; 24 vshufi32x4 m4, m23, q2020 ; 16 vshufi32x4 m7, m5, m27, q3131 ; 28 vshufi32x4 m5, m27, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m16, m14, m24, q3131 ; 10 vshufi32x4 m14, m24, q2020 ; 2 vshufi32x4 m17, m15, m28, q3131 ; 14 vshufi32x4 m15, m28, q2020 ; 6 vshufi32x4 m20, m18, m25, q3131 ; 26 vshufi32x4 m18, m25, q2020 ; 18 vshufi32x4 m21, m19, m29, q3131 ; 30 vshufi32x4 m19, m29, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova m22, [r4-64*20] mova m26, [r4-64*16] mova m23, [r4-64*19] mova m27, [r4-64*15] mova m24, [r4-64*18] mova m28, [r4-64*14] mova m25, [r4-64*17] mova m29, [r4-64*13] mova [r4-64*20], m14 mova [r4-64*19], m15 mova [r4-64*18], m16 mova [r4-64*17], m17 mova [r4-64*16], m18 mova [r4-64*15], m19 mova [r4-64*14], m20 mova [r4-64*13], m21 mova m19, [r4-64*12] mova m11, [r4-64* 8] mova m20, [r4-64*11] mova m12, [r4-64* 7] mova m21, [r4-64*10] mova m8, [r4-64* 6] mova m9, [r4-64* 9] mova m18, [r4-64* 5] vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 vshufi32x4 m21, 
m8, q3232 ; f52 f53 h52 h53 vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 vshufi32x4 m26, m22, m27, q3131 ; 9 vshufi32x4 m22, m27, q2020 ; 1 vshufi32x4 m27, m23, m28, q3131 ; 11 vshufi32x4 m23, m28, q2020 ; 3 vshufi32x4 m28, m24, m29, q3131 ; 13 vshufi32x4 m24, m29, q2020 ; 5 vshufi32x4 m29, m25, m8, q3131 ; 15 vshufi32x4 m25, m8, q2020 ; 7 vshufi32x4 m18, m14, m19, q3131 ; 25 vshufi32x4 m14, m19, q2020 ; 17 vshufi32x4 m19, m15, m20, q3131 ; 27 vshufi32x4 m15, m20, q2020 ; 19 vshufi32x4 m20, m16, m21, q3131 ; 29 vshufi32x4 m16, m21, q2020 ; 21 vshufi32x4 m21, m17, m9, q3131 ; 31 vshufi32x4 m17, m9, q2020 ; 23 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf jmp .end .fast: ; bottom/right halves are zero {evex}vpmulhrsw ym8, ym23, [cq+64* 4] {evex}vpmulhrsw xm1, xm23, [cq+64*12] mova m28, [o(dup16_perm)] {evex}vpmulhrsw ym7, ym23, [cq+64* 8] vpmulhrsw ym22, ym23, [cq+64* 0] vpermb m8, m28, m8 vpermb ym1, ym28, ym1 vpermb m7, m28, m7 pmovzxwd m9, ym22 pslld m9, 16 call m(idct_16x16_internal_8bpc).main_fast2 {evex}vpmulhrsw ym21, ym23, [cq+64* 2] {evex}vpmulhrsw xm15, xm23, [cq+64*14] {evex}vpmulhrsw xm18, xm23, [cq+64*10] {evex}vpmulhrsw ym14, ym23, [cq+64* 6] vpermb m21, m28, m21 punpcklwd xm15, xm15 vpermb ym18, ym28, ym18 vpermb m14, m28, m14 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 vpmulhrsw ym22, ym23, [cq+64* 1] {evex}vpmulhrsw xm29, xm23, [cq+64*15] {evex}vpmulhrsw xm26, xm23, [cq+64* 9] {evex}vpmulhrsw ym25, ym23, [cq+64* 7] {evex}vpmulhrsw ym24, ym23, [cq+64* 5] {evex}vpmulhrsw xm27, xm23, [cq+64*11] {evex}vpmulhrsw xm8, xm23, [cq+64*13] {evex}vpmulhrsw ym23, [cq+64* 3] vpermb m22, m28, m22 punpcklwd xm29, xm29 vpermb ym26, ym28, ym26 vpermb m25, m28, m25 mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 REPX {vpermb x, m28, x}, m24, m27, m23 punpcklwd xm28, xm8, xm8 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call 
m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast mov r4, rsp vpbroadcastd m13, [o(pw_16384)] mova [r4+64*16], m4 mova [r4+64*17], m5 mova [r4+64*18], m6 mova [r4+64*19], m7 mova [r4+64*28], m26 mova [r4+64*29], m27 mova [r4+64*30], m28 mova [r4+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end mova [r4+64*20], m22 mova [r4+64*21], m23 mova [r4+64*22], m24 mova [r4+64*23], m25 mova [r4+64*24], m26 mova [r4+64*25], m27 mova [r4+64*26], m28 mova [r4+64*27], m29 call .pass2_fast mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast mova [cq+64* 0], m0 mova [cq+64* 1], m1 mova [cq+64* 2], m2 mova [cq+64* 3], m3 mova [cq+64* 4], m4 mova [cq+64* 5], m5 mova [cq+64* 6], m6 mova [cq+64* 7], m7 pmulhrsw m0, m13, [r4+64*16] pmulhrsw m1, m13, [r4+64*17] pmulhrsw m2, m13, [r4+64*18] pmulhrsw m3, m13, [r4+64*19] pmulhrsw m4, m13, [r4+64*20] pmulhrsw m5, m13, [r4+64*21] pmulhrsw m6, m13, [r4+64*22] pmulhrsw m7, m13, [r4+64*23] mova [cq+64*16], m14 mova [cq+64*17], m15 mova [cq+64*18], m16 mova [cq+64*19], m17 mova [cq+64*20], m18 mova [cq+64*21], m19 mova [cq+64*22], m20 mova [cq+64*23], m21 pmulhrsw m14, m13, [r4+64*24] pmulhrsw m15, m13, [r4+64*25] pmulhrsw m16, m13, [r4+64*26] pmulhrsw m17, m13, [r4+64*27] pmulhrsw m18, m13, [r4+64*28] pmulhrsw m19, m13, [r4+64*29] pmulhrsw m20, m13, [r4+64*30] pmulhrsw m21, m13, [r4+64*31] mova [cq+64*24], m22 mova [cq+64*25], m23 mova [cq+64*26], m24 mova [cq+64*27], m25 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round call .pass2_fast mova [r4+64*16], m14 mova [r4+64*17], m15 mova [r4+64*18], m16 mova [r4+64*19], m17 mova [r4+64*20], m18 mova [r4+64*21], m19 mova [r4+64*22], m20 mova [r4+64*23], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast .end: 
vpbroadcastd m13, [o(pw_2048)] lea r5, [strideq*3] pxor m12, m12 lea r3, [dstq+r5*8] lea r6, [strideq+r5] ; stride*4 add r3, r6 ; dst+stride*28 %macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi mova m11, [cq+64*( %3)] ; 0 mova m9, [cq+64*(31-%3)] ; 31 %if %3 >= 8 mova m%1, [rsp+64*(%1+16)] %endif mova m10, [dstq+%4] paddsw m8, m11, m9 psubsw m11, m9 paddsw m9, m%1, m%2 psubsw m%1, m%2 punpcklbw m%2, m10, m12 punpckhbw m10, m12 pmulhrsw m8, m13 pmulhrsw m9, m13 paddw m8, m%2 paddw m9, m10 mova m10, [r3+%5] pmulhrsw m11, m13 pmulhrsw m%1, m13 mova [cq+64*( %3)], m12 mova [cq+64*(31-%3)], m12 punpcklbw m%2, m10, m12 punpckhbw m10, m12 packuswb m8, m9 paddw m11, m%2 paddw m%1, m10 packuswb m11, m%1 mova [dstq+%4], m8 mova [r3 +%5], m11 %if %3 == 3 || %3 == 7 || %3 == 11 add dstq, r6 sub r3, r6 %endif %endmacro IDCT_64x32_END 0, 29, 0, strideq*0, r5 IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 IDCT_64x32_END 3, 26, 3, r5 , strideq*0 IDCT_64x32_END 4, 25, 4, strideq*0, r5 IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 IDCT_64x32_END 7, 22, 7, r5 , strideq*0 IDCT_64x32_END 0, 21, 8, strideq*0, r5 IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 IDCT_64x32_END 3, 18, 11, r5 , strideq*0 IDCT_64x32_END 4, 17, 12, strideq*0, r5 IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 IDCT_64x32_END 7, 14, 15, r5 , strideq*0 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 ALIGN function_align .pass1_end_part1: %macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 %if %1 != %3 mova m%1, [cq+64*%1] %endif mova m9, [r4+64*(%3-36)] ; idct64 32+n mova m11, [r4+64*(-5-%3)] ; idct64 63-n psubsw m8, m%1, m%2 ; idct32 31-n paddsw m%1, m%2 ; idct32 
0+n %if %1 == %3 psubsw m%2, m8, m9 ; out 32+n e paddsw m8, m9 ; out 31-n d psubsw m9, m%1, m11 ; out 63-n h paddsw m%1, m11 ; out 0+n a %else paddsw m%2, m8, m9 ; out 23-n c psubsw m8, m9 ; out 40+n f paddsw m9, m%1, m11 ; out 8+n b psubsw m%1, m11 ; out 55-n g %endif mova [r4+64*(%3-36)], m8 mova [r4+64*(-5-%3)], m9 %endmacro IDCT_64x32_PASS1_END 0, 29, 0 IDCT_64x32_PASS1_END 1, 28, 1 IDCT_64x32_PASS1_END 2, 27, 2 IDCT_64x32_PASS1_END 3, 26, 3 IDCT_64x32_PASS1_END 4, 25, 4 IDCT_64x32_PASS1_END 5, 24, 5 IDCT_64x32_PASS1_END 6, 23, 6 IDCT_64x32_PASS1_END 7, 22, 7 .transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckhqdq m23, m22, m27 ; 1 23 punpcklqdq m22, m27 ; 0 22 punpckhqdq m27, m26, m28 ; 5 27 punpcklqdq m26, m28 ; 4 26 punpcklqdq m28, m29, m25 ; 6 28 punpckhqdq m29, m25 ; 7 29 punpckhqdq m25, m24, m8 ; 3 25 punpcklqdq m24, m8 ; 2 24 .transpose_8x8: punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m8, m1 punpckhdq m8, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 
punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 ret .pass1_end_part2: IDCT_64x32_PASS1_END 0, 21, 8 IDCT_64x32_PASS1_END 1, 20, 9 IDCT_64x32_PASS1_END 2, 19, 10 IDCT_64x32_PASS1_END 3, 18, 11 IDCT_64x32_PASS1_END 4, 17, 12 IDCT_64x32_PASS1_END 5, 16, 13 IDCT_64x32_PASS1_END 6, 15, 14 IDCT_64x32_PASS1_END 7, 14, 15 .transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 punpcklwd m8, m3, m2 punpckhwd m3, m2 punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m7, m6 punpckhwd m7, m6 punpcklwd m6, m5, m4 punpckhwd m5, m4 punpckldq m4, m7, m5 punpckhdq m7, m5 punpckldq m5, m8, m2 punpckhdq m8, m2 punpckhdq m2, m0, m6 punpckldq m0, m6 punpckldq m6, m3, m1 punpckhdq m3, m1 punpckhqdq m1, m0, m5 punpcklqdq m0, m5 punpckhqdq m5, m4, m6 punpcklqdq m4, m6 punpcklqdq m6, m7, m3 punpckhqdq m7, m3 punpckhqdq m3, m2, m8 punpcklqdq m2, m8 punpckhwd m8, m18, m19 punpcklwd m18, m19 punpckhwd m19, m14, m15 punpcklwd m14, m15 punpckhwd m15, m20, m21 punpcklwd m20, m21 punpckhwd m21, m16, m17 punpcklwd m16, m17 punpckhdq m17, m14, m16 punpckldq m14, m16 punpckldq m16, m18, m20 punpckhdq m18, m20 punpckhdq m20, m19, m21 punpckldq m19, m21 punpckldq m21, m8, m15 punpckhdq m8, m15 punpckhqdq m15, m14, m16 punpcklqdq m14, m16 punpcklqdq m16, m17, m18 punpckhqdq m17, m18 punpcklqdq m18, m19, m21 punpckhqdq m19, m21 punpckhqdq m21, m20, m8 punpcklqdq m20, m8 ret .pass2_fast: vshufi32x4 m24, m9, m15, q3131 ; 5 vshufi32x4 m22, m9, m15, q2020 ; 1 vshufi32x4 m15, m1, m16, q3131 ; 6 vshufi32x4 m14, m1, m16, q2020 ; 2 vshufi32x4 m1, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m8, m2, q3131 ; 12 vshufi32x4 m2, m8, m2, q2020 ; 8 vshufi32x4 m25, m11, m17, q3131 ; 7 vshufi32x4 m23, m11, m17, q2020 ; 3 vshufi32x4 m17, m5, m19, q3131 ; 14 vshufi32x4 m16, m5, m19, q2020 ; 10 vshufi32x4 m29, m6, m20, q3131 ; 15 vshufi32x4 m27, m6, m20, q2020 ; 11 vshufi32x4 m28, m4, m18, q3131 ; 13 vshufi32x4 m26, m4, m18, q2020 ; 9 jmp 
m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob %undef cmp cmp eobd, 136 jb .fast mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] vpbroadcastd m10, [o(pd_2048)] mov r4, rsp call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] mova m3, [cq+64*24] mova m14, [cq+64* 4] mova m15, [cq+64*12] mova m16, [cq+64*20] mova m17, [cq+64*28] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m22, [cq+64* 2] mova m29, [cq+64*30] mova m26, [cq+64*18] mova m25, [cq+64*14] mova m24, [cq+64*10] mova m27, [cq+64*22] mova m28, [cq+64*26] mova m23, [cq+64* 6] mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 mova [r4+64*36], m1 mova [r4+64*37], m3 mova [r4+64*38], m5 mova [r4+64*39], m7 mova [r4+64*44], m23 mova [r4+64*45], m25 mova [r4+64*46], m27 mova [r4+64*47], m29 pmulhrsw m23, m13, m0 ; a0 pmulhrsw m25, m13, m2 ; a2 pmulhrsw m27, m13, m4 ; a4 pmulhrsw m29, m13, m6 ; a6 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 lea r6, [r4-64*4] add r4, 64*28 call .pass2_end mov r4, rsp mova m0, [r4+64*23] mova m1, [r4+64*22] 
mova m2, [r4+64*21] mova m3, [r4+64*20] mova m4, [r4+64*19] mova m5, [r4+64*18] mova m6, [r4+64*17] mova m7, [r4+64*16] mova m22, [r4+64*15] mova m23, [r4+64*14] mova m24, [r4+64*13] mova m25, [r4+64*12] mova m26, [r4+64*11] mova m27, [r4+64*10] mova m28, [r4+64* 9] mova m29, [r4+64* 8] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi vpbroadcastd m13, [o(pw_8192)] mova [r4+64* 8], m1 mova [r4+64* 9], m3 mova [r4+64*10], m5 mova [r4+64*11], m7 mova [r4+64*16], m23 mova [r4+64*17], m25 mova [r4+64*18], m27 mova [r4+64*19], m29 pmulhrsw m23, m13, m0 ; b0 pmulhrsw m25, m13, m2 ; b2 pmulhrsw m27, m13, m4 ; b4 pmulhrsw m29, m13, m6 ; b6 mova m0, [r4+64*31] mova m1, [r4+64*30] mova m2, [r4+64*29] mova m3, [r4+64*28] mova m4, [r4+64*27] mova m5, [r4+64*26] mova m6, [r4+64*25] mova m7, [r4+64*24] mova m14, [r4+64* 7] mova m15, [r4+64* 6] mova m16, [r4+64* 5] mova m17, [r4+64* 4] mova m18, [r4+64* 3] mova m19, [r4+64* 2] mova m20, [r4+64* 1] mova m21, [r4+64* 0] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo mov r6, cq call .pass2_end jmp .end .fast: ; bottom/right halves are zero mova m28, [o(dup16_perm)] pmovzxwd m9, [cq+64* 0] vpermb m8, m28, [cq+64* 4] vpermb ym1, ym28, [cq+64*12] vpermb m7, m28, [cq+64* 8] pslld m9, 16 call m(idct_16x16_internal_8bpc).main_fast2 vpermb m21, m28, [cq+64* 2] vpermb ym15, ym28, [cq+64*14] vpermb ym18, ym28, [cq+64*10] vpermb m14, m28, [cq+64* 6] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 vpermb m22, m28, [cq+64* 1] vpermb ym29, ym28, [cq+64*15] vpermb ym26, ym28, [cq+64* 9] vpermb m25, m28, [cq+64* 7] vpermb m24, m28, [cq+64* 5] vpermb ym27, ym28, [cq+64*11] vpermb m23, m28, [cq+64* 3] vpermb ym28, ym28, [cq+64*13] mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] mova [cq+64*16], m4 mova 
[cq+64*17], m5 mova [cq+64*18], m6 mova [cq+64*19], m7 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end mova [cq+64*20], m22 mova [cq+64*21], m23 mova [cq+64*22], m24 mova [cq+64*23], m25 mova [cq+64*24], m26 mova [cq+64*25], m27 mova [cq+64*26], m28 mova [cq+64*27], m29 lea r4, [rsp+64*64] lea r3, [rsp+64*32] call .pass2_fast pmulhrsw m0, m13, [cq+64*16] pmulhrsw m1, m13, [cq+64*17] pmulhrsw m2, m13, [cq+64*18] pmulhrsw m3, m13, [cq+64*19] pmulhrsw m4, m13, [cq+64*20] pmulhrsw m5, m13, [cq+64*21] pmulhrsw m6, m13, [cq+64*22] pmulhrsw m7, m13, [cq+64*23] pmulhrsw m14, m13, [cq+64*24] pmulhrsw m15, m13, [cq+64*25] pmulhrsw m16, m13, [cq+64*26] pmulhrsw m17, m13, [cq+64*27] pmulhrsw m18, m13, [cq+64*28] pmulhrsw m19, m13, [cq+64*29] pmulhrsw m20, m13, [cq+64*30] pmulhrsw m21, m13, [cq+64*31] call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round mov r4, rsp mov r3, cq call .pass2_fast .end: vpbroadcastd m17, [o(pw_2048)] lea r5, [strideq*8] mov r3, dstq pxor m16, m16 sub r4, 64*5 ; rsp+64*31 mov r6, rsp .end_loop: mova m2, [r6+64*32] ; idct16 0+n lo mova m7, [r6+64*48] ; idct32 31-n lo mova m6, [cq+64* 0] ; idct16 0+n hi mova m0, [cq+64*16] ; idct32 31-n hi mova m4, [r4+64*64] ; idct64 63-n lo mova m1, [r4+64* 0] ; idct64 63-n hi mova m5, [r6+64*64] ; idct64 32+n lo mova m8, [r6+64* 0] ; idct64 32+n hi sub r3, strideq paddsw m3, m2, m7 ; idct32 0+n lo mova m12, [dstq+r5*0] psubsw m2, m7 ; idct32 31-n lo mova m15, [r3 +r5*8] paddsw m7, m6, m0 ; idct32 0+n hi mova m13, [r3 +r5*4] psubsw m6, m0 ; idct32 31-n hi mova m14, [dstq+r5*4] paddsw m0, m3, m4 ; out 0+n lo add r6, 64 psubsw m3, m4 ; out 63-n lo sub r4, 64 paddsw m4, m7, m1 ; out 0+n hi mova [cq+64* 0], m16 psubsw m7, m1 ; out 63-n hi mova [cq+64*16], m16 paddsw m1, m2, m5 ; out 31-n lo add cq, 64 psubsw m2, m5 ; out 32+n lo paddsw m5, m6, m8 ; out 31-n hi psubsw m6, m8 ; out 32+n hi pmulhrsw m0, m17 punpcklbw m8, m12, m16 
pmulhrsw m4, m17 punpckhbw m12, m16 pmulhrsw m3, m17 punpcklbw m11, m15, m16 pmulhrsw m7, m17 punpckhbw m15, m16 pmulhrsw m1, m17 punpcklbw m9, m13, m16 pmulhrsw m5, m17 punpckhbw m13, m16 pmulhrsw m2, m17 punpcklbw m10, m14, m16 pmulhrsw m6, m17 punpckhbw m14, m16 paddw m0, m8 paddw m4, m12 packuswb m0, m4 paddw m3, m11 paddw m7, m15 packuswb m3, m7 paddw m1, m9 paddw m5, m13 packuswb m1, m5 paddw m2, m10 paddw m6, m14 packuswb m2, m6 mova [dstq+r5*0], m0 mova [r3 +r5*8], m3 mova [r3 +r5*4], m1 mova [dstq+r5*4], m2 add dstq, strideq cmp r6, r4 jb .end_loop RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 mova [r4+64*20], m1 mova [r4+64*21], m3 mova [r4+64*22], m5 mova [r4+64*23], m7 vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 mova [r4+64*12], m15 mova [r4+64*13], m17 mova [r4+64*14], m19 mova [r4+64*15], m21 vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 vshufi32x4 m0, m1, m5, q2020 ; 0 vshufi32x4 m1, m5, q3131 ; 8 vshufi32x4 m2, m3, m14, q2020 ; 16 vshufi32x4 m3, m14, q3131 ; 24 vshufi32x4 m14, m15, m18, q2020 ; 4 vshufi32x4 m15, m18, q3131 ; 12 vshufi32x4 m16, m17, m19, q2020 ; 20 vshufi32x4 m17, m19, q3131 ; 28 call 
m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast vshufi32x4 m24, m22, m25, q3131 ; 10 vshufi32x4 m22, m25, q2020 ; 2 vshufi32x4 m25, m23, m28, q3131 ; 14 vshufi32x4 m23, m28, q2020 ; 6 vshufi32x4 m28, m26, m27, q3131 ; 26 vshufi32x4 m26, m27, q2020 ; 18 vshufi32x4 m27, m29, m13, q2020 ; 22 vshufi32x4 m29, m13, q3131 ; 30 mova [r6+64* 0], m0 mova [r6+64* 1], m1 mova [r6+64* 2], m2 mova [r6+64* 3], m3 mova [r6+64* 4], m4 mova [r6+64* 5], m5 mova [r6+64* 6], m6 mova [r6+64* 7], m7 mova [r6+64* 8], m14 mova [r6+64* 9], m15 mova [r6+64*10], m16 mova [r6+64*11], m17 mova [r6+64*12], m18 mova [r6+64*13], m19 mova [r6+64*14], m20 mova [r6+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] mova [r6+64*16], m29 mova [r6+64*17], m28 mova [r6+64*18], m27 mova [r6+64*19], m26 mova [r6+64*20], m25 mova [r6+64*21], m24 mova [r6+64*22], m23 mova [r6+64*23], m22 mova [r6+64*24], m21 mova [r6+64*25], m20 mova [r6+64*26], m19 mova [r6+64*27], m18 mova [r6+64*28], m17 mova [r6+64*29], m16 mova [r6+64*30], m15 mova [r6+64*31], m14 pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 pmulhrsw m16, m13, [r4+64*12] pmulhrsw m17, m13, [r4+64*16] pmulhrsw m18, m13, [r4+64*20] pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 pmulhrsw m20, m13, [r4+64*15] pmulhrsw m21, m13, [r4+64*19] pmulhrsw m22, m13, [r4+64*23] vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 pmulhrsw m24, m13, [r4+64*14] pmulhrsw m25, m13, [r4+64*18] pmulhrsw m26, m13, [r4+64*22] vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 pmulhrsw m28, m13, [r4+64*13] pmulhrsw m29, m13, [r4+64*17] pmulhrsw m13, [r4+64*21] vshufi32x4 m0, m14, m16, 
q2020 ; 1 vshufi32x4 m1, m19, m21, q3131 ; 31 vshufi32x4 m2, m15, m17, q2020 ; 17 vshufi32x4 m3, m18, m20, q3131 ; 15 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vshufi32x4 m0, m18, m20, q2020 ; 7 vshufi32x4 m1, m15, m17, q3131 ; 25 vshufi32x4 m2, m19, m21, q2020 ; 23 vshufi32x4 m3, m14, m16, q3131 ; 9 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27 vshufi32x4 m0, m22, m24, q2020 ; 5 vshufi32x4 m1, m27, m29, q3131 ; 27 vshufi32x4 m2, m23, m25, q2020 ; 21 vshufi32x4 m3, m26, m28, q3131 ; 11 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vshufi32x4 m0, m26, m28, q2020 ; 3 vshufi32x4 m1, m23, m25, q3131 ; 29 vshufi32x4 m2, m27, m29, q2020 ; 19 vshufi32x4 m3, m22, m24, q3131 ; 13 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 ALIGN function_align .pass2_fast: vshufi32x4 m23, m1, m16, q3131 ; 6 vshufi32x4 m22, m1, m16, q2020 ; 2 vshufi32x4 m14, m0, m3, q3131 ; 4 vshufi32x4 m26, m0, m3, q2020 ; 0 vshufi32x4 m28, m9, m15, q3131 ; 5 vshufi32x4 m0, m9, m15, q2020 ; 1 vshufi32x4 m16, m11, m17, q3131 ; 7 vshufi32x4 m29, m11, m17, q2020 ; 3 vshufi32x4 m15, m8, m2, q3131 ; 12 vshufi32x4 m27, m8, m2, q2020 ; 8 vshufi32x4 m25, m5, m19, q3131 ; 14 vshufi32x4 m24, m5, m19, q2020 ; 10 vshufi32x4 m3, m6, m20, q3131 ; 15 vshufi32x4 m19, m6, m20, q2020 ; 11 vshufi32x4 m17, m4, m18, q3131 ; 13 vshufi32x4 m18, m4, m18, q2020 ; 9 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m16 mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m28 mova m3, m19 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m29
mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m26 mova m1, m27 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [r3+64* 0], m0 mova [r3+64* 1], m1 mova [r3+64* 2], m2 mova [r3+64* 3], m3 mova [r3+64* 4], m4 mova [r3+64* 5], m5 mova [r3+64* 6], m6 mova [r3+64* 7], m7 mova [r3+64* 8], m14 mova [r3+64* 9], m15 mova [r3+64*10], m16 mova [r3+64*11], m17 mova [r3+64*12], m18 mova [r3+64*13], m19 mova [r3+64*14], m20 mova [r3+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 mova [r3+64*16], m29 mova [r3+64*17], m28 mova [r3+64*18], m27 mova [r3+64*19], m26 mova [r3+64*20], m25 mova [r3+64*21], m24 mova [r3+64*22], m23 mova [r3+64*23], m22 mova [r3+64*24], m21 mova [r3+64*25], m20 mova [r3+64*26], m19 mova [r3+64*27], m18 mova [r3+64*28], m17 mova [r3+64*29], m16 mova [r3+64*30], m15 mova [r3+64*31], m14 ret %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/itx_sse.asm000066400000000000000000010037101517466257200234420ustar00rootroot00000000000000; Copyright © 2018-2021, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 %macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 pw_%1_m%2: times 4 dw %1, -%2 %if %3 != 2 pw_%2_%1: times 4 dw %2, %1 %endif %if %3 pw_m%1_m%2: times 4 dw -%1, -%2 %endif %endmacro ;adst4 pw_1321_3803: times 4 dw 1321, 3803 pw_2482_m1321: times 4 dw 2482, -1321 pw_3344_2482: times 4 dw 3344, 2482 pw_3344_m3803: times 4 dw 3344, -3803 pw_3344_m3344: times 4 dw 3344, -3344 pw_0_3344 times 4 dw 0, 3344 pw_m6688_m3803: times 4 dw -6688, -3803 COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 COEF_PAIR 401, 4076 COEF_PAIR 1931, 3612 COEF_PAIR 3166, 2598 COEF_PAIR 3920, 1189 COEF_PAIR 3784, 1567, 1 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 3513, 2106 COEF_PAIR 3857, 1380 COEF_PAIR 4017, 799, 1 COEF_PAIR 201, 4091 COEF_PAIR 2440, 3290 COEF_PAIR 3035, 2751 COEF_PAIR 4052, 601 COEF_PAIR 2276, 3406, 1 COEF_PAIR 4076, 401, 2 COEF_PAIR 2598, 3166, 2 COEF_PAIR 3612, 1931, 2 COEF_PAIR 1189, 3920, 2 pd_2048: times 4 dd 2048 pw_2048: times 8 dw 2048 pw_m2048: times 8 dw -2048 pw_4096: times 8 dw 4096 pw_16384: times 8 dw 16384 pw_m16384: times 8 dw -16384 pw_1697x16: times 8 dw 1697*16 pw_1697x8: times 8 dw 1697*8 pw_2896x8: times 8 dw 2896*8 
pw_3344x8: times 8 dw 3344*8 pw_8192: times 8 dw 8192 pw_m8192: times 8 dw -8192 pw_5: times 8 dw 5 pw_201x8: times 8 dw 201*8 pw_4091x8: times 8 dw 4091*8 pw_m2751x8: times 8 dw -2751*8 pw_3035x8: times 8 dw 3035*8 pw_1751x8: times 8 dw 1751*8 pw_3703x8: times 8 dw 3703*8 pw_m1380x8: times 8 dw -1380*8 pw_3857x8: times 8 dw 3857*8 pw_995x8: times 8 dw 995*8 pw_3973x8: times 8 dw 3973*8 pw_m2106x8: times 8 dw -2106*8 pw_3513x8: times 8 dw 3513*8 pw_2440x8: times 8 dw 2440*8 pw_3290x8: times 8 dw 3290*8 pw_m601x8: times 8 dw -601*8 pw_4052x8: times 8 dw 4052*8 pw_4095x8: times 8 dw 4095*8 pw_101x8: times 8 dw 101*8 pw_2967x8: times 8 dw 2967*8 pw_m2824x8: times 8 dw -2824*8 pw_3745x8: times 8 dw 3745*8 pw_1660x8: times 8 dw 1660*8 pw_3822x8: times 8 dw 3822*8 pw_m1474x8: times 8 dw -1474*8 pw_3996x8: times 8 dw 3996*8 pw_897x8: times 8 dw 897*8 pw_3461x8: times 8 dw 3461*8 pw_m2191x8: times 8 dw -2191*8 pw_3349x8: times 8 dw 3349*8 pw_2359x8: times 8 dw 2359*8 pw_4036x8: times 8 dw 4036*8 pw_m700x8: times 8 dw -700*8 pw_4065x8: times 8 dw 4065*8 pw_501x8: times 8 dw 501*8 pw_3229x8: times 8 dw 3229*8 pw_m2520x8: times 8 dw -2520*8 pw_3564x8: times 8 dw 3564*8 pw_2019x8: times 8 dw 2019*8 pw_3948x8: times 8 dw 3948*8 pw_m1092x8: times 8 dw -1092*8 pw_3889x8: times 8 dw 3889*8 pw_1285x8: times 8 dw 1285*8 pw_3659x8: times 8 dw 3659*8 pw_m1842x8: times 8 dw -1842*8 pw_3102x8: times 8 dw 3102*8 pw_2675x8: times 8 dw 2675*8 pw_4085x8: times 8 dw 4085*8 pw_m301x8: times 8 dw -301*8 SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %if ARCH_X86_64 %define o(x) x %else %define o(x) r5-$$+x ; PIC %endif %macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] lea r2, [dstq+strideq*2] %assign %%i 1 %rotate 5 %rep 4 %if %1 & 2 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) %else CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) %endif %assign %%i %%i + 1 %rotate 1 %endrep movd m%3, [%%row_adr1] ;dst0 movd m%5, [%%row_adr2] ;dst1 punpckldq m%3, m%5 ;high: dst1 
:low: dst0 movd m%4, [%%row_adr3] ;dst2 movd m%5, [%%row_adr4] ;dst3 punpckldq m%4, m%5 ;high: dst3 :low: dst2 pxor m%5, m%5 punpcklbw m%3, m%5 ;extend byte to word punpcklbw m%4, m%5 ;extend byte to word paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 movd [%%row_adr1], m%3 ;store dst0 + out0 pshuflw m%4, m%3, q1032 movd [%%row_adr2], m%4 ;store dst1 + out1 punpckhqdq m%3, m%3 movd [%%row_adr3], m%3 ;store dst2 + out2 psrlq m%3, 32 movd [%%row_adr4], m%3 ;store dst3 + out3 %endmacro %macro ITX4_END 4-5 2048 ; row[1-4], rnd %if %5 mova m2, [o(pw_%5)] pmulhrsw m0, m2 pmulhrsw m1, m2 %endif WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 ret %endmacro ; flags: 1 = swap, 2: coef_regs, 4: no_pack %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags %if %6 & 2 pmaddwd m%2, m%4, m%1 pmaddwd m%1, m%5 %elif %6 & 1 pmaddwd m%2, m%1, [o(pw_%5_%4)] pmaddwd m%1, [o(pw_%4_m%5)] %else pmaddwd m%2, m%1, [o(pw_%4_m%5)] pmaddwd m%1, [o(pw_%5_%4)] %endif paddd m%2, m%3 paddd m%1, m%3 psrad m%2, 12 psrad m%1, 12 %if %6 & 4 == 0 packssdw m%1, m%2 %endif %endmacro %macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 mova m3, [o(pd_2048)] punpckhwd m2, m0, m1 ;unpacked in1 in3 punpcklwd m0, m1 ;unpacked in0 in2 ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 psubsw m1, m0, m2 ;high: out2 ;low: out3 paddsw m0, m2 ;high: out1 ;low: out0 %endmacro %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 %define %%p1 m(i%1_%3_internal_8bpc) %if ARCH_X86_32 LEA r5, $$ %endif %if has_epilogue %ifidn %1_%2, dct_dct test eobd, eobd jz %%end %endif lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] call %%p1 RET %%end: %else lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN 
function_align %%end: %endif %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x4, 6 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd ;0 pmulhrsw m0, m1 mova m1, m0 TAIL_CALL m(iadst_4x4_internal_8bpc).end2 %endif %endmacro INIT_XMM ssse3 ; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 mova m1, [coeffq+16*1] ;high: in3 ;low in2 IDCT4_1D_PACKED mova m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 ;high: in1 ;low: in0 pshufb m1, m3, m2 ;high: in3 ;low :in2 jmp tx2q .pass2: IDCT4_1D_PACKED pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); ITX4_END 0, 1, 3, 2 INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call .main punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 punpcklwd m0, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call .main .end: pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align cglobal_label .main punpcklwd m2, m0, m1 ;unpacked in0 in2 punpckhwd m0, m1 ;unpacked in1 in3 mova m3, m0 pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 paddd m1, m0 ;t2 pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 paddd 
m4, m0 ;t0 + t3 pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m0, [o(pd_2048)] paddd m1, m0 ;t2 + 2048 paddd m2, m0 paddd m0, m4 ;t0 + t3 + 2048 paddd m5, m2 ;t1 + t3 + 2048 paddd m2, m4 paddd m2, m3 ;t0 + t1 - t3 + 2048 REPX {psrad x, 12}, m1, m0, m5, m2 packssdw m0, m5 ;high: out1 ;low: out0 packssdw m1, m2 ;high: out3 ;low: out2 ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 ;high: in3 ;low :in2 punpckhwd m1, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 .end2: ITX4_END 3, 2, 1, 0 INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m3, [o(pw_1697x8)] pmulhrsw m2, m0, m3 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 punpcklwd m0, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: mova m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x4_internal_8bpc).end %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ;low: in1 high: in3 punpcklqdq m0, m1 ;low: in0 high: in2 psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 paddw m0, m3 ;low: in0 + in1 high: in2 + in3 punpckhqdq m2, m2 ;t2 t2 punpcklqdq m0, m0 ;t0 t0 psubw m1, m0, m2 psraw m1, 1 ;t4 t4 psubw m1, m3 ;low: t1/out2 high: t3/out1 psubw m0, m1 ;high: out0 paddw m2, m1 ;low: out3 %endmacro INIT_XMM sse2 cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1] pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 psraw m0, 2 psraw m1, 2 IWHT4_1D_PACKED punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED shufpd m0, m2, 0x01 ITX4_END 0, 3, 2, 1, 0 %macro IDCT8_1D_PACKED 0 mova m6, [o(pd_2048)] punpckhwd m4, m0, m3 ;unpacked in1 in7 punpcklwd m0, m2 ;unpacked in0 in4 punpckhwd m2, m1 ;unpacked in5 in3 punpcklwd m1, m3 ;unpacked in2 in6 ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 psubsw m3, m4, m2 ;low: t6a high: t5a paddsw m4, m2 ;low: t7 high: t4 pshufb m3, [o(deint_shuf1)] ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 psubsw m2, m0, m1 ;low: tmp3 high: tmp2 paddsw m0, m1 ;low: tmp0 high: tmp1 punpcklqdq m1, m4, m3 ;low: t7 high: t6 punpckhqdq m4, m3 ;low: t4 high: t5 psubsw m3, m0, m1 ;low: out7 high: out6 paddsw m0, m1 ;low: out0 high: out1 paddsw m1, m2, m4 ;low: out3 high: out2 psubsw m2, m4 ;low: out4 high: out5 %endmacro ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 %macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 punpckhwd m%4, m%1, m%2 punpcklwd m%1, m%2 %if %7 < 8 pmaddwd m%2, m%7, m%1 pmaddwd m%3, m%7, m%4 %else mova m%2, [o(pw_%7_%6)] %if %8 pmaddwd m%3, m%1, m%2 pmaddwd m%2, m%4 %else pmaddwd m%3, m%4, m%2 pmaddwd m%2, m%1 %endif %endif paddd m%3, m%5 paddd m%2, m%5 psrad m%3, 12 psrad m%2, 12 %if %8 packssdw m%3, m%2 %else packssdw m%2, m%3 ;dst2 %endif %if %7 < 8 pmaddwd m%4, m%6 pmaddwd m%1, m%6 %elif %8 mova m%2, [o(pw_%6_m%7)] pmaddwd m%4, m%2 pmaddwd m%1, m%2 %else mova m%3, [o(pw_%6_m%7)] pmaddwd m%4, m%3 pmaddwd m%1, m%3 %endif paddd m%4, m%5 paddd m%1, m%5 psrad m%4, 12 psrad m%1, 12 packssdw m%1, m%4 ;dst1 %endmacro %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 psubsw m%3, m%1, m%2 ;out2 paddsw m%2, m%1 ;out1 paddsw m%1, m%5, m%4 ;out0 psubsw m%4, m%5 ;out3 %endmacro %macro WRITE_4X8 4 ;row[1-4] WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 lea dstq, [dstq+strideq*4] WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 %endmacro %macro INV_4X8 0 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ;low: in2 high: in3 punpckldq m0, m2 ;low: in0 high: in1 punpckldq m2, m3, m4 ;low: in4 high: in5 punpckhdq m3, m4 ;low: in6 high: in7 %endmacro %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x8, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd pmulhrsw m0, m1 pmulhrsw m0, m1 pmulhrsw m0, [o(pw_2048)] mova m1, m0 mova m2, m0 mova m3, m0 TAIL_CALL m(iadst_4x8_internal_8bpc).end3 %endif %endmacro INIT_XMM ssse3 INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(idct_8x4_internal_8bpc).main jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: call .main shufps m1, m1, q1032 shufps m3, m3, q1032 mova m4, [o(pw_2048)] jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED ret INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(iadst_8x4_internal_8bpc).main .pass1_end: INV_4X8 jmp tx2q .pass2: shufps m0, 
m0, q1032 shufps m1, m1, q1032 call .main mova m4, [o(pw_2048)] pxor m5, m5 psubw m5, m4 .end: punpcklqdq m4, m5 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m4 pmulhrsw m3, m4 pxor m5, m5 mova [coeffq+16*0], m5 mova [coeffq+16*1], m5 mova [coeffq+16*2], m5 mova [coeffq+16*3], m5 .end3: WRITE_4X8 0, 1, 2, 3 RET ALIGN function_align cglobal_label .main mova m6, [o(pd_2048)] punpckhwd m4, m3, m0 ;unpacked in7 in0 punpckhwd m5, m2, m1 ;unpacked in5 in2 punpcklwd m1, m2 ;unpacked in3 in4 punpcklwd m0, m3 ;unpacked in1 in6 ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a psubsw m3, m4, m1 ;low: t4 high: t5 paddsw m4, m1 ;low: t0 high: t1 psubsw m2, m5, m0 ;low: t6 high: t7 paddsw m5, m0 ;low: t2 high: t3 shufps m1, m3, m2, q1032 punpckhwd m2, m1 punpcklwd m3, m1 ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a psubsw m1, m4, m5 ;low: t2 high: t3 paddsw m4, m5 ;low: out0 high: -out7 psubsw m5, m3, m2 ;low: t7 high: t6 paddsw m3, m2 ;low: out6 high: -out1 shufps m0, m4, m3, q3210 ;low: out0 high: -out1 shufps m3, m4, q3210 ;low: out6 high: -out7 mova m2, [o(pw_2896_m2896)] mova m7, [o(pw_2896_2896)] shufps m4, m1, m5, q1032 ;low: t3 high: t7 shufps m1, m5, q3210 ;low: t2 high: t6 punpcklwd m5, m1, m4 punpckhwd m1, m4 pmaddwd m4, m2, m1 ;-out5 pmaddwd m2, m5 ; out4 pmaddwd m1, m7 ; out2 pmaddwd m5, m7 ;-out3 REPX {paddd x, m6}, m4, m2, m1, m5 REPX {psrad x, 12}, m4, m2, m1, m5 packssdw m1, m5 ;low: out2 high: -out3 packssdw m2, m4 ;low: out4 high: -out5 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, 
[coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(iadst_8x4_internal_8bpc).main punpcklwd m4, m3, m2 punpckhwd m3, m2 punpcklwd m5, m1, m0 punpckhwd m1, m0 punpckldq m2, m3, m1 ;low: in4 high: in5 punpckhdq m3, m1 ;low: in6 high: in7 punpckldq m0, m4, m5 ;low: in0 high: in1 punpckhdq m1, m4, m5 ;low: in2 high: in3 jmp tx2q .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main mova m4, m0 mova m5, m1 pshufd m0, m3, q1032 pshufd m1, m2, q1032 pshufd m2, m5, q1032 pshufd m3, m4, q1032 mova m5, [o(pw_2048)] pxor m4, m4 psubw m4, m5 jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: mova m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: mova m4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 %macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] movq m%3, [dstq ] movq m%4, [dstq+strideq] pxor m%5, m%5 punpcklbw m%3, m%5 ;extend byte to word punpcklbw m%4, m%5 ;extend byte to word %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 movq [dstq ], m%3 punpckhqdq m%3, m%3 movq [dstq+strideq], m%3 %endmacro %macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] WRITE_8X2 %1, %2, %5, %6, %7 lea dstq, [dstq+strideq*2] WRITE_8X2 %3, %4, %5, %6, %7 %endmacro %macro INV_TXFM_8X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x4, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 pmulhrsw m0, m1 mova m2, [o(pw_2048)] pmulhrsw m0, m1 
pmulhrsw m0, m2 mova m1, m0 mova m2, m0 mova m3, m0 TAIL_CALL m(iadst_8x4_internal_8bpc).end2 %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] call m(idct_4x8_internal_8bpc).main mova m4, [o(deint_shuf1)] mova m5, [o(deint_shuf2)] pshufb m0, m4 pshufb m1, m5 pshufb m2, m4 pshufb m3, m5 punpckhdq m4, m0, m1 punpckldq m0, m1 punpckhdq m5, m2, m3 punpckldq m2, m3 punpckhqdq m1, m0, m2 ;in1 punpcklqdq m0, m2 ;in0 punpckhqdq m3, m4, m5 ;in3 punpcklqdq m2 ,m4, m5 ;in2 jmp tx2q .pass2: call .main jmp m(iadst_8x4_internal_8bpc).end ALIGN function_align cglobal_label .main mova m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 pxor m5, m5 psubsw m3, m5, m1 psubsw m5, m4 punpckhdq m4, m5, m3 punpckldq m5, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhwd m1, m0, m5 ;in1 punpcklwd m0, m5 ;in0 punpcklwd m2, m3, m4 ;in2 punpckhwd m3, m4 ;in3 jmp tx2q .pass2: call .main .end: mova m4, [o(pw_2048)] pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m4 pmulhrsw m3, m4 .end2: pxor m6, m6 mova [coeffq+16*0], m6 mova [coeffq+16*1], m6 mova [coeffq+16*2], m6 mova [coeffq+16*3], m6 .end3: WRITE_8X4 0, 1, 2, 3, 4, 5, 6 RET ALIGN function_align cglobal_label .main punpckhwd m6, m0, m2 ;unpacked in0 in2 punpcklwd m0, m2 ;unpacked 
in0 in2 punpckhwd m7, m1, m3 ;unpacked in1 in3 punpcklwd m1, m3 ;unpacked in1 in3 mova m2, [o(pw_3344_m3344)] mova m4, [o(pw_0_3344)] pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 pmaddwd m5, m4, m7 ;3344 * in3 pmaddwd m2, m0 pmaddwd m4, m1 paddd m3, m5 paddd m2, m4 mova m4, [o(pd_2048)] paddd m3, m4 ;t2 + 2048 paddd m2, m4 psrad m3, 12 psrad m2, 12 packssdw m2, m3 ;out2 pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 paddd m3, m4 ;t0 + t3 pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m4, [o(pd_2048)] paddd m0, m4 paddd m4, m3 ;t0 + t3 + 2048 paddd m5, m0 ;t1 + t3 + 2048 paddd m3, m0 paddd m3, m1 ;t0 + t1 - t3 + 2048 psrad m4, 12 ;out0 psrad m5, 12 ;out1 psrad m3, 12 ;out3 packssdw m0, m4, m5 ;low: out0 high: out1 pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 paddd m1, m4 ;t0 + t3 pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m4, [o(pd_2048)] paddd m6, m4 paddd m4, m1 ;t0 + t3 + 2048 paddd m5, m6 ;t1 + t3 + 2048 paddd m1, m6 paddd m1, m7 ;t0 + t1 - t3 + 2048 psrad m4, 12 ;out0 psrad m5, 12 ;out1 psrad m1, 12 ;out3 packssdw m3, m1 ;out3 packssdw m4, m5 ;low: out0 high: out1 punpckhqdq m1, m0, m4 ;out1 punpcklqdq m0, m4 ;out0 ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main punpckhwd 
m5, m3, m2 punpcklwd m3, m2 punpckhwd m2, m1, m0 punpcklwd m1, m0 pxor m0, m0 psubsw m4, m0, m2 psubsw m0, m5 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m4, m3, m1 punpckldq m3, m1 punpckhwd m1, m0, m3 ;in1 punpcklwd m0, m3 ;in0 punpckhwd m3, m2, m4 ;in3 punpcklwd m2, m4 ;in2 jmp tx2q .pass2: call m(iadst_8x4_internal_8bpc).main mova m4, m0 mova m5, m1 mova m0, m3 mova m1, m2 mova m2, m5 mova m3, m4 jmp m(iadst_8x4_internal_8bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] paddsw m0, m0 paddsw m1, m1 paddsw m2, m2 paddsw m3, m3 punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhdq m5, m4, m1 punpckldq m4, m1 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhwd m1, m0, m4 ;in1 punpcklwd m0, m4 ;in0 punpcklwd m2, m3, m5 ;in2 punpckhwd m3, m5 ;in3 jmp tx2q .pass2: mova m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8, 8, 16*4 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mova m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m2 psrlw m2, 3 pmulhrsw m0, m1 pmulhrsw m0, m2 .end: mov r3d, 2 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] .loop: WRITE_8X4 0, 0, 0, 0, 1, 2, 3 lea dstq, [dstq+strideq*2] dec r3d jg .loop jmp tx2q .end3: RET %endif %endmacro %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 %if %3 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [%1+%2*0] pmulhrsw m1, m7, [%1+%2*1] pmulhrsw m2, m7, [%1+%2*2] pmulhrsw m3, m7, [%1+%2*3] pmulhrsw m4, m7, [%1+%2*4] pmulhrsw 
m5, m7, [%1+%2*5] pmulhrsw m6, m7, [%1+%2*6] pmulhrsw m7, [%1+%2*7] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] mova m7, [%1+%2*7] %endif %endmacro %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a psubsw m%2, m%4, m%5 ;t6a paddsw m%4, m%5 ;t7 psubsw m%5, m%1, m%3 ;t5a paddsw m%1, m%3 ;t4 ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call .main .pass1_end: mova m7, [o(pw_16384)] .pass1_end1: REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 .pass1_end2: REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] cglobal_label .pass1_end3 punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 mova [rsp+gprsize+16*2], m6 mova m6, [rsp+gprsize+16*1] punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 mova [rsp+gprsize+16*0], m2 punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 punpcklwd m4, m5, m1 ;04 
14 24 34 44 54 64 74 punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 mova m7, [rsp+gprsize+16*2] punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 mova m7, [rsp+gprsize+16*0] jmp tx2q .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 .end2: REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*0], m7 .end3: WRITE_8X4 0, 1, 2, 3, 5, 6, 7 lea dstq, [dstq+strideq*2] WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 jmp tx2q .end4: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m1 mova m7, [o(pd_2048)] IDCT4_1D 0, 2, 4, 6, 1, 3, 7 mova m3, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m2 mova m2, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*1], m4 mova m4, [rsp+gprsize*2+16*0] mova [rsp+gprsize*2+16*0], m6 IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 mova m6, [rsp+gprsize*2+16*0] psubsw m7, m0, m4 ;out7 paddsw m0, m4 ;out0 mova [rsp+gprsize*2+16*0], m7 mova m1, [rsp+gprsize*2+16*2] psubsw m4, m6, m3 ;out4 paddsw m3, m6 ;out3 mova m7, [rsp+gprsize*2+16*1] psubsw m6, m1, m5 ;out6 paddsw m1, m5 ;out1 psubsw m5, m7, m2 ;out5 paddsw m2, m7 ;out2 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call .main call .main_pass1_end .pass1_end: mova m7, [o(pw_16384)] .pass1_end1: REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 pxor m6, m6 psubw m6, m7 mova m7, m6 jmp m(idct_8x8_internal_8bpc).pass1_end2 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main call 
.main_pass2_end .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 pxor m6, m6 psubw m6, m7 mova m7, m6 jmp m(idct_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m4 mova m7, [o(pd_2048)] ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a paddsw m3, m2, m6 ;t2 psubsw m2, m6 ;t6 paddsw m4, m5, m1 ;t3 psubsw m5, m1 ;t7 ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a mova m6, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m5 mova m1, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*1], m2 mova m5, [rsp+gprsize*2+16*0] mova [rsp+gprsize*2+16*0], m3 ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a psubsw m2, m0, m6 ;t4 paddsw m0, m6 ;t0 paddsw m3, m5, m1 ;t1 psubsw m5, m1 ;t5 ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a mova m7, [rsp+gprsize*2+16*0] paddsw m1, m3, m4 ;-out7 psubsw m3, m4 ;t3 mova [rsp+gprsize*2+16*0], m1 psubsw m4, m0, m7 ;t2 paddsw m0, m7 ;out0 mova m6, [rsp+gprsize*2+16*2] mova m7, [rsp+gprsize*2+16*1] paddsw m1, m5, m6 ;-out1 psubsw m5, m6 ;t6 paddsw m6, m2, m7 ;out6 psubsw m2, m7 ;t7 ret ALIGN function_align .main_pass1_end: mova [rsp+gprsize*2+16*1], m1 mova [rsp+gprsize*2+16*2], m6 punpckhwd m1, m4, m3 punpcklwd m4, m3 punpckhwd m7, m5, m2 punpcklwd m5, m2 mova m2, [o(pw_2896_2896)] mova m6, [o(pd_2048)] pmaddwd m3, m2, m7 pmaddwd m2, m5 paddd m3, m6 paddd m2, m6 psrad m3, 12 psrad m2, 12 packssdw m2, m3 ;out2 mova m3, [o(pw_2896_m2896)] pmaddwd m7, m3 pmaddwd m5, m3 paddd m7, m6 paddd m5, m6 psrad m7, 12 psrad m5, 12 packssdw m5, m7 ;-out5 mova m3, [o(pw_2896_2896)] pmaddwd m7, m3, m1 pmaddwd m3, m4 paddd m7, m6 paddd m3, m6 psrad m7, 12 psrad m3, 12 packssdw m3, m7 ;-out3 mova m7, [o(pw_2896_m2896)] pmaddwd m1, m7 pmaddwd m4, m7 paddd m1, m6 paddd m4, m6 psrad m1, 12 psrad m4, 12 packssdw m4, m1 
;out4 mova m1, [rsp+gprsize*2+16*1] mova m6, [rsp+gprsize*2+16*2] ret ALIGN function_align cglobal_label .main_pass2_end paddsw m7, m4, m3 ;t2 + t3 psubsw m4, m3 ;t2 - t3 paddsw m3, m5, m2 ;t6 + t7 psubsw m5, m2 ;t6 - t7 mova m2, [o(pw_2896x8)] pmulhrsw m4, m2 ;out4 pmulhrsw m5, m2 ;-out5 pmulhrsw m7, m2 ;-out3 pmulhrsw m2, m3 ;out2 mova m3, m7 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call m(iadst_8x8_internal_8bpc).main call m(iadst_8x8_internal_8bpc).main_pass1_end .pass1_end: mova m7, [o(pw_m16384)] .pass1_end1: pmulhrsw m1, m7 mova [rsp+gprsize+16*1], m1 mova m1, m6 mova m6, m2 pmulhrsw m2, m5, m7 mova m5, m6 mova m6, m4 pmulhrsw m4, m3, m7 mova m3, m6 mova m6, m0 mova m0, m7 pxor m7, m7 psubw m7, m0 pmulhrsw m0, [rsp+gprsize+16*0] REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call m(iadst_8x8_internal_8bpc).main call m(iadst_8x8_internal_8bpc).main_pass2_end .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*2], m2 mova m2, m0 pxor m0, m0 psubw m0, m7 mova m7, m2 pmulhrsw m1, m0 pmulhrsw m2, m5, m0 mova [rsp+gprsize+16*1], m1 mova m5, m4 mova m1, m6 pmulhrsw m4, m3, m0 pmulhrsw m0, [rsp+gprsize+16*0] mova m3, m5 mova [rsp+gprsize+16*0], m7 jmp m(idct_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .end: pmulhrsw m7, [o(pw_4096)] mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_4096)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).end3 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd pmulhrsw m0, [o(pw_16384)] pmulhrsw m0, m1 pmulhrsw m0, [o(pw_2048)] .end: WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 RET %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] .pass1: mova m0, [coeffq+16*1] mova m1, [coeffq+16*3] mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] push tx2q lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] jmp r3 .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 mova [coeffq+16*5], m2 mova [coeffq+16*7], m3 mova m0, [coeffq+16*0] mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: pop tx2q mova m4, [coeffq+16*1] mova m5, [coeffq+16*3] mova m6, [coeffq+16*5] mova m7, [o(pw_16384)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*7], m7 jmp tx2q .pass2: call m(idct_16x4_internal_8bpc).main .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 .end1: mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 mov r3, coeffq WRITE_4X8 0, 1, 3, 2 mova m0, [r3+16*4] mova m1, [r3+16*5] mova m2, [r3+16*6] mova m3, m7 lea dstq, [dstq+strideq*4] WRITE_4X8 0, 1, 3, 2 .end2: pxor m7, m7 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 
ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] jmp m(idct_4x16_internal_8bpc).pass1 .pass2: call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass2_end punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 punpckhqdq m4, m5 ;low: out8 high: out10 punpcklqdq m5, m7, m2 ;low: out4 high: out6 punpckhqdq m2, m7 ;low: -out9 high: -out11 mova [coeffq+16*4], m2 mova [coeffq+16*5], m6 mova m2, [coeffq+16*6] mova m6, [coeffq+16*7] punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 punpcklqdq m0, m6 ;low: out0 high: out2 punpckhqdq m6, m3, m2 ;low: out12 high: out14 punpcklqdq m2, m3 ;low: -out1 high: -out3 mova m7, [o(pw_2048)] .end1: REPX {pmulhrsw x, m7}, m0, m5, m4, m6 pxor m3, m3 psubw m3, m7 mova m7, [coeffq+16*4] REPX {pmulhrsw x, m3}, m2, m7, m1 pmulhrsw m3, [coeffq+16*5] mova [coeffq+16*7], m5 punpckhqdq m5, m4, m7 ;low: out10 high: out11 punpcklqdq m4, m7 ;low: out8 high: out9 punpckhqdq m7, m6, m1 ;low: out14 high: out15 punpcklqdq m6, m1 ;low: out12 high: out13 punpckhqdq m1, m0, m2 ;low: out2 high: out3 punpcklqdq m0, m2 ;low: out0 high: out1 mova [coeffq+16*4], m4 mova m4, [coeffq+16*7] punpcklqdq m2, m4, m3 ;low: out4 high: out5 punpckhqdq m4, m3 ;low: out6 high: out7 mova m3, m4 .end2: mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 mov r3, coeffq WRITE_4X8 0, 1, 2, 3 mova m0, [r3+16*4] mova m1, [r3+16*5] mova m2, [r3+16*6] mova m3, m7 lea dstq, [dstq+strideq*4] WRITE_4X8 0, 1, 2, 3 .end3: pxor m7, m7 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] jmp m(idct_4x16_internal_8bpc).pass1 .pass2: 
call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass2_end punpckhqdq m6, m5, m4 ;low: out5 high: out7 punpcklqdq m4, m5 ;low: -out8 high: -out10 punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 punpcklqdq m2, m7 ;low: out9 high: out11 mova [coeffq+16*4], m2 mova [coeffq+16*5], m6 mova m2, [coeffq+16*6] mova m6, [coeffq+16*7] punpcklqdq m1, m6, m0 ;low: out13 high: out15 punpckhqdq m0, m6 ;low: -out0 high: -out2 punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 punpckhqdq m2, m3 ;low: out1 high: out3 mova m7, [o(pw_m2048)] jmp m(iadst_4x16_internal_8bpc).end1 INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 pmulhrsw m%2, m%4 %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*1] mova m6, [o(pw_1697x8)] mova m1, [coeffq+16*3] mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] pcmpeqw m7, m7 mov r3, tx2q lea tx2q, [o(.pass1_2)] .pass1: pmulhrsw m4, m6, m0 pmulhrsw m5, m6, m1 pavgw m4, m0 pcmpeqw m0, m7 pavgw m5, m1 pcmpeqw m1, m7 pandn m0, m4 pmulhrsw m4, m6, m2 pandn m1, m5 pmulhrsw m5, m6, m3 pavgw m4, m2 pcmpeqw m2, m7 pavgw m5, m3 pcmpeqw m3, m7 pandn m2, m4 pandn m3, m5 jmp m(iadst_4x8_internal_8bpc).pass1_end .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 mova [coeffq+16*5], m2 mova [coeffq+16*7], m3 mova m0, [coeffq+16*0] mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] lea tx2q, [o(.pass1_end)] jmp .pass1 .pass1_end: mova m4, [coeffq+16*1] mova m5, [coeffq+16*3] mova m6, [coeffq+16*5] jmp r3 .pass2: mova m7, [o(pw_1697x16)] mova [coeffq+16*6], m6 REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 mova m6, [coeffq+16*7] IDTX16 6, 7, 7 mova [coeffq+16*7], m6 mova m6, [coeffq+16*6] pmulhrsw m7, m6, [o(pw_1697x16)] paddsw m6, m6
paddsw m6, m7 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 jmp m(iadst_4x16_internal_8bpc).end2 %macro INV_TXFM_16X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x4, 8 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd mov r2d, 2 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] .dconly: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m5, m5 .dconly_loop: mova m1, [dstq] mova m3, [dstq+strideq] punpckhbw m2, m1, m5 punpcklbw m1, m5 punpckhbw m4, m3, m5 punpcklbw m3, m5 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 packuswb m3, m4 mova [dstq], m1 mova [dstq+strideq], m3 lea dstq, [dstq+strideq*2] dec r2d jg .dconly_loop jmp tx2q .end: RET %endif %endmacro %macro LOAD_7ROWS 2 ;src, stride mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] %endmacro %macro SAVE_7ROWS 2 ;src, stride mova [%1+%2*0], m0 mova [%1+%2*1], m1 mova [%1+%2*2], m2 mova [%1+%2*3], m3 mova [%1+%2*4], m4 mova [%1+%2*5], m5 mova [%1+%2*6], m6 %endmacro %macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] punpckhwd m%5, m%4, m%1 ;packed in13 in3 punpcklwd m%1, m%4 ;packed in1 in15 punpcklwd m%4, m%3, m%2 ;packed in9 in7 punpckhwd m%2, m%3 ;packed in5 in11 mova m%7, [o(pd_2048)] ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a psubsw m%6, m%1, m%4 ;low: t9 high: t14 paddsw m%1, m%4 ;low: t8 high: t15 psubsw m%4, m%5, m%2 ;low: t10 high: t13 paddsw m%5, m%2 ;low: t11 high: t12 mova m%2, [o(deint_shuf2)] pshufb m%6, m%2 pshufb m%4, m%2 ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 
1 ;low: t9a high: t14a ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a psubsw m%3, m%1, m%5 ;low: t11a high: t12a paddsw m%1, m%5 ;low: t8a high: t15a psubsw m%5, m%6, m%4 ;low: t10 high: t13 paddsw m%6, m%4 ;low: t9 high: t14 pshufb m%3, m%2 pshufb m%5, m%2 ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a packssdw m%2, m%4 ;low: t11 high: t10a packssdw m%3, m%5 ;low: t12 high: t13a punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 punpcklqdq m%1, m%6 ;low: t8a high: t9 %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main .pass1_end: punpckhwd m7, m0, m2 ;packed out1, out5 punpcklwd m0, m2 ;packed out0, out4 punpcklwd m2, m1, m3 ;packed out3, out7 punpckhwd m1, m3 ;packed out2, out6 mova [coeffq+16*6], m7 mova m7, [coeffq+16*7] punpckhwd m3, m4, m6 ;packed out9, out13 punpcklwd m4, m6 ;packed out8, out12 punpcklwd m6, m5, m7 ;packed out11, out15 punpckhwd m5, m7 ;packed out10, out14 .pass1_end2: mova m7, [o(pw_16384)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*6] mova [coeffq+16*6], m7 .pass1_end3: punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high punpcklwd m3, m6 ;packed 9, 11, 13, 15 low punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high punpcklwd m4, m5 ;packed 8, 10, 12, 14 low punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) mova [coeffq+16*7], m3 mova m3, [coeffq+16*6] punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high punpcklwd m3, m2 ;packed 1, 3, 5, 7 low punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high punpcklwd m0, m1 ;packed 0, 2, 4, 6 low punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) punpckhwd m3,
m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) jmp tx2q .pass2: lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] .pass2_end: mova [coeffq+16*4], m4 mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 lea r3, [dstq+8] call tx2q add coeffq, 16*4 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] mov dstq, r3 jmp tx2q ALIGN function_align cglobal_label .main punpckhqdq m7, m0, m1 ;low:in1 high:in3 punpcklqdq m0, m1 punpcklqdq m1, m2, m3 punpckhqdq m3, m2 ;low:in7 high:in5 mova [coeffq+16*4], m7 mova [coeffq+16*5], m3 mova m7, [coeffq+16*7] punpcklqdq m2, m4, m5 punpckhqdq m4, m5 ;low:in9 high:in11 punpcklqdq m3, m6, m7 punpckhqdq m7, m6 ;low:in15 high:in13 mova [coeffq+16*6], m4 IDCT8_1D_PACKED mova m6, [coeffq+16*4] mova m4, [coeffq+16*5] mova m5, [coeffq+16*6] mova [coeffq+16*4], m1 mova [coeffq+16*5], m2 mova [coeffq+16*6], m3 IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 mova m1, [coeffq+16*4] psubsw m3, m0, m7 ;low:out15 high:out14 paddsw m0, m7 ;low:out0 high:out1 psubsw m7, m1, m5 ;low:out12 high:out13 paddsw m1, m5 ;low:out3 high:out2 mova [coeffq+16*7], m3 mova m2, [coeffq+16*5] mova m3, [coeffq+16*6] psubsw m5, m2, m4 ;low:out11 high:out10 paddsw m2, m4 ;low:out4 high:out5 psubsw m4, m3, m6 ;low:out8 high:out9 paddsw m3, m6 ;low:out7 high:out6 mova m6, m7 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main call .main_pass1_end punpckhwd m6, m7, m0 ;packed -out11, -out15 punpcklwd m0, m7 ;packed out0, out4 punpcklwd m7, m3, m4 ;packed -out3, -out7 punpckhwd m4, m3 ;packed out8, out12 mova m1, [coeffq+16*6] punpcklwd m3, m1, m5 ;packed -out1, -out5 punpckhwd m5, m1 ;packed out10, out14 mova m1, [coeffq+16*7] mova [coeffq+16*6], m3 mova [coeffq+16*7], m7 punpckhwd m3, m2, m1 ;packed -out9, -out13 punpcklwd m1, m2 ;packed out2, 
out6 mova m7, [o(pw_16384)] .pass1_end: REPX {pmulhrsw x, m7}, m0, m1, m4, m5 pxor m2, m2 psubw m2, m7 mova m7, [coeffq+16*6] REPX {pmulhrsw x, m2}, m7, m3, m6 pmulhrsw m2, [coeffq+16*7] mova [coeffq+16*6], m7 jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end ALIGN function_align cglobal_label .main mova [coeffq+16*6], m0 pshufd m0, m1, q1032 pshufd m2, m2, q1032 punpckhwd m1, m6, m0 ;packed in13, in2 punpcklwd m0, m6 ;packed in3, in12 punpckhwd m7, m5, m2 ;packed in11, in4 punpcklwd m2, m5 ;packed in5, in10 mova m6, [o(pd_2048)] ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 psubsw m5, m1, m2 ;low:t10a high:t11a paddsw m1, m2 ;low:t2a high:t3a psubsw m2, m7, m0 ;low:t12a high:t13a paddsw m7, m0 ;low:t4a high:t5a punpcklqdq m0, m5 punpckhwd m0, m5 ;packed t10a, t11a punpcklqdq m5, m2 punpckhwd m2, m5 ;packed t13a, t12a ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 mova [coeffq+16*4], m1 mova [coeffq+16*5], m7 mova m1, [coeffq+16*6] mova m7, [coeffq+16*7] pshufd m1, m1, q1032 pshufd m3, m3, q1032 punpckhwd m5, m7, m1 ;packed in15, in0 punpcklwd m1, m7 ;packed in1, in14 punpckhwd m7, m4, m3 ;packed in9, in6 punpcklwd m3, m4 ;packed in7, in8 ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 psubsw m4, m5, m3 ;low:t8a high:t9a paddsw m5, m3 ;low:t0a high:t1a psubsw m3, m7, m1 ;low:t14a high:t15a paddsw m7, m1 ;low:t6a high:t7a punpcklqdq m1, m4 punpckhwd m1, m4 ;packed t8a, t9a punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t15a, t14a ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 ITX_MUL2X_PACK 3, 4, 6, 2276, 
3406, 1 ;low:t14 high:t15 paddsw m4, m1, m2 ;low:t12a high:t13a psubsw m1, m2 ;low:t8a high:t9a psubsw m2, m0, m3 ;low:t14a high:t15a paddsw m0, m3 ;low:t10a high:t11a punpcklqdq m3, m1 punpckhwd m3, m1 ;packed t12a, t13a punpcklqdq m1, m2 punpckhwd m2, m1 ;packed t15a, t14a ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 psubsw m1, m3, m2 ;low:t14a high:t15a paddsw m3, m2 ;low:out2 high:-out13 psubsw m2, m4, m0 ;low:t10 high:t11 paddsw m0, m4 ;low:-out1 high:out14 mova [coeffq+16*6], m0 mova [coeffq+16*7], m3 mova m0, [coeffq+16*4] mova m3, [coeffq+16*5] psubsw m4, m5, m3 ;low:t4 high:t5 paddsw m5, m3 ;low:t0 high:t1 psubsw m3, m0, m7 ;low:t6 high:t7 paddsw m0, m7 ;low:t2 high:t3 punpcklqdq m7, m4 punpckhwd m7, m4 ;packed t4, t5 punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t7, t6 ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a psubsw m4, m5, m0 ;low:t2a high:t3a paddsw m0, m5 ;low:out0 high:-out15 psubsw m5, m7, m3 ;low:t6 high:t7 paddsw m3, m7 ;low:-out3 high:out12 ret ALIGN function_align .main_pass1_end: mova m7, [o(deint_shuf1)] mova [coeffq+16*4], m0 mova [coeffq+16*5], m3 mova m0, [o(pw_2896_m2896)] mova m3, [o(pw_2896_2896)] pshufb m1, m7 ;t14a t15a pshufb m2, m7 ;t10 t11 pshufb m4, m7 ;t2a t3a pshufb m5, m7 ;t6 t7 pmaddwd m7, m0, m2 pmaddwd m2, m3 paddd m7, m6 paddd m2, m6 psrad m7, 12 psrad m2, 12 packssdw m2, m7 ;low:out6 high:-out9 pmaddwd m7, m0, m4 pmaddwd m4, m3 paddd m7, m6 paddd m4, m6 psrad m7, 12 psrad m4, 12 packssdw m4, m7 ;low:-out7 high:out8 pmaddwd m7, m3, m5 pmaddwd m5, m0 paddd m7, m6 paddd m5, m6 psrad m7, 12 psrad m5, 12 packssdw m7, m5 ;low:out4 high:-out11 pmaddwd m5, m3, m1 pmaddwd m1, m0 paddd m5, m6 paddd m1, m6 psrad m5, 12 psrad m1, 12 packssdw m5, m1 ;low:-out5 high:out10 mova m0, [coeffq+16*4] mova m3, [coeffq+16*5] ret ALIGN function_align cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] punpckhqdq m6, 
m2, m1 ;low:t11 high:t15a punpcklqdq m2, m1 ;low:t10 high:t14a psubsw m1, m2, m6 paddsw m2, m6 punpckhqdq m6, m4, m5 ;low:t3a high:t7 punpcklqdq m4, m5 ;low:t2a high:t6 psubsw m5, m4, m6 paddsw m4, m6 pmulhrsw m1, m7 ;low:-out9 high:out10 pmulhrsw m2, m7 ;low:out6 high:-out5 pmulhrsw m5, m7 ;low:out8 high:-out11 pmulhrsw m4, m7 ;low:-out7 high:out4 punpckhqdq m7, m4, m5 ;low:out4 high:-out11 punpcklqdq m4, m5 ;low:-out7 high:out8 punpckhqdq m5, m2, m1 ;low:-out5 high:out10 punpcklqdq m2, m1 ;low:out6 high:-out9 ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass1_end punpcklwd m6, m7, m0 ;packed out11, out15 punpckhwd m0, m7 ;packed -out0, -out4 punpckhwd m7, m3, m4 ;packed out3, out7 punpcklwd m4, m3 ;packed -out8, -out12 mova m1, [coeffq+16*6] punpckhwd m3, m1, m5 ;packed out1, out5 punpcklwd m5, m1 ;packed -out10, -out14 mova m1, [coeffq+16*7] mova [coeffq+16*6], m3 mova [coeffq+16*7], m7 punpcklwd m3, m2, m1 ;packed out9, out13 punpckhwd m1, m2 ;packed -out2, -out6 mova m7, [o(pw_m16384)] jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*6] mova m0, [coeffq+16*5] mova m2, [coeffq+16*7] mova m6, [o(pw_1697x16)] mova m7, [o(pw_16384)] pmulhrsw m4, m6, m1 pmulhrsw m3, m6, m0 pmulhrsw m5, m6, m2 pmulhrsw m4, m7 pmulhrsw m3, m7 pmulhrsw m5, m7 paddsw m1, m4 paddsw m0, m3 paddsw m5, m2 mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] mova m4, [coeffq+16*4] mova [coeffq+16*6], m1 mova 
[coeffq+16*5], m0 mova [coeffq+16*7], m5 pmulhrsw m0, m6, m2 pmulhrsw m1, m6, m3 pmulhrsw m5, m6, m4 pmulhrsw m0, m7 pmulhrsw m1, m7 pmulhrsw m5, m7 paddsw m2, m0 paddsw m3, m1 paddsw m4, m5 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] pmulhrsw m5, m6, m0 pmulhrsw m6, m1 pmulhrsw m5, m7 pmulhrsw m6, m7 paddsw m0, m5 paddsw m1, m6 mova m6, [coeffq+16*6] mova m5, [coeffq+16*5] punpckhwd m7, m0, m2 ;packed out1, out5 punpcklwd m0, m2 ;packed out0, out4 punpckhwd m2, m1, m3 ;packed out3, out7 punpcklwd m1, m3 ;packed out2, out6 mova [coeffq+16*6], m7 mova m7, [coeffq+16*7] punpckhwd m3, m4, m6 ;packed out9, out13 punpcklwd m4, m6 ;packed out8, out12 punpckhwd m6, m5, m7 ;packed out11, out15 punpcklwd m5, m7 ;packed out10, out14 jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end %macro SAVE_8ROWS 2 ;src, stride mova [%1+%2*0], m0 mova [%1+%2*1], m1 mova [%1+%2*2], m2 mova [%1+%2*3], m3 mova [%1+%2*4], m4 mova [%1+%2*5], m5 mova [%1+%2*6], m6 mova [%1+%2*7], m7 %endmacro %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16, 8, 16*16 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mova m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 pmulhrsw m0, m2 psrlw m2, 3 ; pw_2048 pmulhrsw m0, m1 pmulhrsw m0, m2 mov r3d, 4 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] .pass1: LOAD_8ROWS coeffq+16*1, 32, 1 mov [rsp+gprsize+16*11], tx2q lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, [rsp+gprsize+16*11] jmp r3 .pass2: lea tx2q, 
[o(m(idct_8x16_internal_8bpc).end)] .pass2_pre: mova [coeffq+16*2 ], m1 mova [coeffq+16*6 ], m3 mova [coeffq+16*10], m5 mova [coeffq+16*14], m7 mova m1, m2 mova m2, m4 mova m3, m6 mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] .pass2_main: call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*2 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*10] mova m3, [coeffq+16*14] mova m4, [coeffq+16*3 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*11] mova m7, [coeffq+16*15] call m(idct_16x8_internal_8bpc).main mov r3, dstq lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] jmp m(idct_8x16_internal_8bpc).pass1 .pass2: lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] .pass2_pre: mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 mova m0, m2 mova m1, m3 mova m2, m4 mova m3, m5 .pass2_main: mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*3 ] mova m6, [coeffq+16*13] mova m7, [coeffq+16*15] mova [rsp+gprsize+16*3], m4 mova [rsp+gprsize+16*4], m5 mova [rsp+gprsize+16*9], m6 mova [rsp+gprsize+32*5], m7 mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end mov r3, dstq lea dstq, [dstq+strideq*8] jmp m(iadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, 
[o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] jmp m(idct_8x16_internal_8bpc).pass1 .pass2: lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] lea r3, [dstq+strideq*8] .pass2_pre: mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 mova m0, m2 mova m1, m3 mova m2, m4 mova m3, m5 .pass2_main: mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*3 ] mova m6, [coeffq+16*13] mova m7, [coeffq+16*15] mova [rsp+gprsize+16*3], m4 mova [rsp+gprsize+16*4], m5 mova [rsp+gprsize+16*9], m6 mova [rsp+gprsize+32*5], m7 mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end jmp m(iflipadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iflipadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q lea tx2q, [o(.pass1_end)] mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, r3 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*1], m6 mova m7, [o(pw_1697x16)] REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 mova m6, [rsp+gprsize+16*1] mova [rsp+gprsize+16*2], m5 IDTX16 6, 5, 7 mova m5, 
[rsp+gprsize+16*0] IDTX16 5, 7, 7 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [rsp+gprsize+16*2] mova [rsp+gprsize+16*0], m5 mova [rsp+gprsize+16*1], m6 mova [rsp+gprsize+16*2], m7 jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 4 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 32, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).pass2_main ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova [rsp+gprsize*2+32*5], m5 mova m6, [o(pd_2048)] ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a psubsw m2, m0, m4 ;t9 paddsw m0, m4 ;t8 psubsw m4, m7, m3 ;t14 paddsw m7, m3 ;t15 ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a mova m3, [rsp+gprsize*2+16*1] mova m5, [rsp+gprsize*2+32*5] mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+32*5], m4 
mova m2, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m7 ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a psubsw m4, m2, m3 ;t10 paddsw m2, m3 ;t11 psubsw m3, m1, m5 ;t13 paddsw m1, m5 ;t12 ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a mova m7, [rsp+gprsize*2+32*5] psubsw m6, m0, m2 ;t11a paddsw m0, m2 ;t8a paddsw m2, m7, m3 ;t9 psubsw m7, m3 ;t10 mova m5, [rsp+gprsize*2+16*0] psubsw m3, m5, m0 ;out8 paddsw m0, m5 ;out7 mova [rsp+gprsize*2+32*5], m0 mova m5, [rsp+gprsize*2+16*9] psubsw m0, m5, m2 ;out9 paddsw m2, m5 ;out6 mova [rsp+gprsize*2+16*0], m0 mova [rsp+gprsize*2+16*9], m2 mova m0, [rsp+gprsize*2+16*1] mova m2, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*1], m3 psubsw m5, m0, m4 ;t13 paddsw m0, m4 ;t14 mova m3, [o(pd_2048)] psubsw m4, m2, m1 ;t12a paddsw m1, m2 ;t15a mova [rsp+gprsize*2+16*2], m1 ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 mova m3, [rsp+gprsize*2+16*8] psubsw m2, m3, m5 ;out10 paddsw m3, m5 ;out5 mova m5, [rsp+gprsize*2+16*7] mova [rsp+gprsize*2+16*8], m3 psubsw m3, m5, m4 ;out11 paddsw m5, m4 ;out4 mova m4, [rsp+gprsize*2+16*6] mova [rsp+gprsize*2+16*7], m5 paddsw m5, m4, m6 ;out3 psubsw m4, m6 ;out12 mova m6, [rsp+gprsize*2+16*5] mova [rsp+gprsize*2+16*6], m5 psubsw m5, m6, m7 ;out13 paddsw m6, m7 ;out2 mova m7, [rsp+gprsize*2+16*4] mova [rsp+gprsize*2+16*5], m6 psubsw m6, m7, m0 ;out14 paddsw m7, m0 ;out1 mova m1, [rsp+gprsize*2+16*2] mova m0, [rsp+gprsize*2+16*3] mova [rsp+gprsize*2+16*4], m7 psubsw m7, m0, m1 ;out15 paddsw m0, m1 ;out0 mova [rsp+gprsize*2+16*3], m0 mova m1, [rsp+gprsize*2+16*0] mova m0, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*0], m7 ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, 
m7, [coeffq+16*1 ] pmulhrsw m2, m7, [coeffq+16*14] pmulhrsw m3, m7, [coeffq+16*15] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 pmulhrsw m0, m7, [coeffq+16*6 ] pmulhrsw m1, m7, [coeffq+16*7 ] pmulhrsw m2, m7, [coeffq+16*8 ] pmulhrsw m3, m7, [coeffq+16*9 ] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 pmulhrsw m0, m7, [coeffq+16*2 ] pmulhrsw m1, m7, [coeffq+16*3 ] pmulhrsw m2, m7, [coeffq+16*4 ] pmulhrsw m3, m7, [coeffq+16*5 ] pmulhrsw m4, m7, [coeffq+16*10] pmulhrsw m5, m7, [coeffq+16*11] pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] call .main call .main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(iadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(iadst_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iadst_8x8_internal_8bpc).pass2_main ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m1 mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova m6, [o(pd_2048)] ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 psubsw m1, m0, m4 ;t10a paddsw m0, m4 ;t2a psubsw m4, m7, m3 ;t11a paddsw m3, m7 ;t3a ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 mova m2, [rsp+gprsize*2+16*0] ;in3 mova m7, [rsp+gprsize*2+16*1] ;in4 mova [rsp+gprsize*2+16*0], m1 ;t11 mova [rsp+gprsize*2+16*1], m4 ;t10 mova m1, [rsp+gprsize*2+16*2] ;in12 mova [rsp+gprsize*2+16*2], m0 ;t2a ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 psubsw m0, m7, m1 ;t12a paddsw m1, m7 ;t4a psubsw m4, m5, m2 ;t13a paddsw m5, m2 ;t5a ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, 
t13 mova m2, [rsp+gprsize*2+16*8] ;in1 mova m7, [rsp+gprsize*2+16*9] ;in14 mova [rsp+gprsize*2+16*8], m4 ;t12 mova [rsp+gprsize*2+16*9], m0 ;t13 mova m4, [rsp+gprsize*2+16*4] ;in9 mova m0, [rsp+gprsize*2+16*5] ;in6 mova [rsp+gprsize*2+16*4], m1 ;t4a mova [rsp+gprsize*2+16*5], m5 ;t5a ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 psubsw m1, m0, m7 ;t14a paddsw m0, m7 ;t6a psubsw m5, m4, m2 ;t15a paddsw m4, m2 ;t7a ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 mova m2, [rsp+gprsize*2+16*2] ;t2a mova [rsp+gprsize*2+16*2], m5 ;t14 psubsw m7, m2, m0 ;t6 paddsw m2, m0 ;t2 psubsw m0, m3, m4 ;t7 paddsw m3, m4 ;t3 ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a mova m4, [rsp+gprsize*2+16*7] ;in0 mova m5, [rsp+gprsize*2+32*5] ;in15 mova [rsp+gprsize*2+16*7], m3 ;t3 mova [rsp+gprsize*2+32*5], m1 ;t15 mova m1, [rsp+gprsize*2+16*6] ;in7 mova m3, [rsp+gprsize*2+16*3] ;in8 mova [rsp+gprsize*2+16*6], m7 ;t7a mova [rsp+gprsize*2+16*3], m0 ;t6a ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 psubsw m0, m4, m3 ;t8a paddsw m4, m3 ;t0a psubsw m3, m5, m1 ;t9a paddsw m5, m1 ;t1a ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 mova m1, [rsp+gprsize*2+16*4] ;t4a mova m7, [rsp+gprsize*2+16*5] ;t5a mova [rsp+gprsize*2+16*4], m3 ;t8 mova [rsp+gprsize*2+16*5], m0 ;t9 psubsw m0, m4, m1 ;t4 paddsw m4, m1 ;t0 psubsw m3, m5, m7 ;t5 paddsw m5, m7 ;t1 ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a mova m7, [rsp+gprsize*2+16*3] ;t6a psubsw m1, m4, m2 ;t2a paddsw m4, m2 ;out0 mova [rsp+gprsize*2+16*3], m4 ;out0 mova m4, [rsp+gprsize*2+16*6] ;t7a psubsw m2, m3, m7 ;t6 paddsw m3, m7 ;-out3 mova [rsp+gprsize*2+16*6], m3 ;-out3 psubsw m3, m0, m4 ;t7 paddsw m0, m4 ;out12 mova [rsp+gprsize*2+16*12], m3 mova m3, [rsp+gprsize*2+16*7] ;t3 mova [rsp+gprsize*2+16* 7], m2 ;out4 psubsw m2, m5, m3 ;t3a paddsw m5, m3 ;-out15 mova [rsp+gprsize*2+16*11], m2 mova m2, [rsp+gprsize*2+32*5] ;t15 mova 
[rsp+gprsize*2+16*10], m1 ;-out7 mova m1, [rsp+gprsize*2+16*0] ;t11 mova [rsp+gprsize*2+16*0 ], m5 ;-out15 mova m3, [rsp+gprsize*2+16*1] ;t10 mova [rsp+gprsize*2+16*1 ], m4 ;-out11 mova m4, [rsp+gprsize*2+16*2] ;t14 mova [rsp+gprsize*2+16*2 ], m0 ;out12 psubsw m0, m3, m4 ;t14a paddsw m3, m4 ;t10a psubsw m5, m1, m2 ;t15a paddsw m1, m2 ;t11a ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 mova m2, [rsp+gprsize*2+16*4] ;t8 mova m4, [rsp+gprsize*2+16*5] ;t9 mova [rsp+gprsize*2+16*4], m3 ;t10a mova [rsp+gprsize*2+16*5], m1 ;t11a mova m3, [rsp+gprsize*2+16*8] ;t12 mova m1, [rsp+gprsize*2+16*9] ;t13 mova [rsp+gprsize*2+16*8], m5 ;t14 mova [rsp+gprsize*2+16*9], m0 ;t15 psubsw m5, m2, m3 ;t12a paddsw m2, m3 ;t8a psubsw m0, m4, m1 ;t13a paddsw m4, m1 ;t9a ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 mova m6, [rsp+gprsize*2+16*4] ;t10a mova m1, [rsp+gprsize*2+16*5] ;t11a psubsw m3, m2, m6 ;t10 paddsw m2, m6 ;-out1 paddsw m6, m4, m1 ;out14 psubsw m4, m1 ;t11 mova [rsp+gprsize*2+16*14], m4 mova [rsp+gprsize*2+16* 4], m2 ;-out1 mova m4, [rsp+gprsize*2+16*8] ;t14 mova m2, [rsp+gprsize*2+16*9] ;t15 mova [rsp+gprsize*2+16* 9], m3 ;out6 psubsw m3, m0, m4 ;t14a paddsw m0, m4 ;out2 psubsw m4, m5, m2 ;t15a paddsw m5, m2 ;-out13 mova [rsp+gprsize*2+16* 5], m0 ;out2 ret ALIGN function_align .main_pass1_end: mova m0, [rsp+gprsize*2+16*14] mova [rsp+gprsize*2+16*14], m5 mova [rsp+gprsize*2+16*15], m6 mova m5, [o(pw_2896_2896)] mova m6, [o(pw_2896_m2896)] mova m7, [o(pd_2048)] punpcklwd m2, m3, m4 punpckhwd m3, m4 pmaddwd m4, m5, m2 pmaddwd m2, m6 pmaddwd m1, m5, m3 pmaddwd m3, m6 REPX {paddd x, m7}, m4, m2, m1, m3 REPX {psrad x, 12}, m4, m1, m2, m3 packssdw m4, m1 ;-out5 packssdw m2, m3 ;out10 mova [rsp+gprsize*2+16* 8], m4 mova m3, [rsp+gprsize*2+16* 9] punpcklwd m1, m3, m0 punpckhwd m3, m0 pmaddwd m0, m5, m1 pmaddwd m1, m6 pmaddwd m4, m5, m3 pmaddwd m3, m6 REPX {paddd x, m7}, m0, m1, m4, m3 REPX {psrad x, 12}, m0, m4, m1, m3 packssdw m0, m4 ;out6 packssdw m1, m3 ;-out9 mova 
[rsp+gprsize*2+16* 9], m0 mova m0, [rsp+gprsize*2+16* 7] mova m4, [rsp+gprsize*2+16*12] punpcklwd m3, m0, m4 punpckhwd m0, m4 pmaddwd m4, m5, m3 pmaddwd m3, m6 pmaddwd m5, m0 pmaddwd m0, m6 REPX {paddd x, m7}, m4, m3, m5, m0 REPX {psrad x, 12}, m4, m5, m3, m0 packssdw m4, m5 ;out4 packssdw m3, m0 ;-out11 mova [rsp+gprsize*2+16* 7], m4 mova m4, [rsp+gprsize*2+16*10] mova m5, [rsp+gprsize*2+16*11] punpcklwd m0, m4, m5 punpckhwd m4, m5 pmaddwd m5, m0, [o(pw_2896_2896)] pmaddwd m0, m6 pmaddwd m6, m4 pmaddwd m4, [o(pw_2896_2896)] REPX {paddd x, m7}, m5, m0, m6, m4 REPX {psrad x, 12}, m0, m6, m5, m4 packssdw m0, m6 ;out8 packssdw m5, m4 ;-out7 mova [rsp+gprsize*2+16*10], m5 mova m4, [rsp+gprsize*2+16* 2] ;out12 mova m5, [rsp+gprsize*2+16*14] ;-out13 mova m6, [rsp+gprsize*2+16*15] ;out14 ret ALIGN function_align cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] mova m1, [rsp+gprsize*2+16* 9] mova m2, [rsp+gprsize*2+16*14] paddsw m0, m1, m2 psubsw m1, m2 pmulhrsw m0, m7 ;out6 pmulhrsw m1, m7 ;-out9 mova [rsp+gprsize*2+16* 9], m0 psubsw m2, m3, m4 paddsw m3, m4 pmulhrsw m2, m7 ;out10 pmulhrsw m3, m7 ;-out5 mova [rsp+gprsize*2+16* 8], m3 mova m3, [rsp+gprsize*2+16* 7] mova m4, [rsp+gprsize*2+16*12] paddsw m0, m3, m4 psubsw m3, m4 pmulhrsw m0, m7 ;out4 pmulhrsw m3, m7 ;-out11 mova [rsp+gprsize*2+16* 7], m0 mova m0, [rsp+gprsize*2+16*10] paddsw m4, m0, [rsp+gprsize*2+16*11] psubsw m0, [rsp+gprsize*2+16*11] pmulhrsw m4, m7 ;-out7 pmulhrsw m0, m7 ;out8 mova [rsp+gprsize*2+16*10], m4 mova m4, [rsp+gprsize*2+16*2 ] ;out12 ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] pmulhrsw m2, m7, [coeffq+16*14] pmulhrsw m3, m7, [coeffq+16*15] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova 
[rsp+gprsize+32*5], m3 pmulhrsw m0, m7, [coeffq+16*6 ] pmulhrsw m1, m7, [coeffq+16*7 ] pmulhrsw m2, m7, [coeffq+16*8 ] pmulhrsw m3, m7, [coeffq+16*9 ] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 pmulhrsw m0, m7, [coeffq+16*2 ] pmulhrsw m1, m7, [coeffq+16*3 ] pmulhrsw m2, m7, [coeffq+16*4 ] pmulhrsw m3, m7, [coeffq+16*5 ] pmulhrsw m4, m7, [coeffq+16*10] pmulhrsw m5, m7, [coeffq+16*11] pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iflipadst_8x8_internal_8bpc).pass2_main INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*16 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q lea tx2q, [o(.pass1_end)] .pass1: mova m0, [o(pw_2896x8)] mova m2, [o(pw_1697x16)] mova m3, [o(pw_16384)] sub coeffq, 8*16 REPX {pmulhrsw x, m0}, m4, m5, m6, m7 pmulhrsw m1, m2, m4 pmulhrsw m1, m3 paddsw m1, m4 ; 1 pmulhrsw m4, m2, m5 pmulhrsw m4, m3 paddsw m4, m5 ; 3 pmulhrsw m5, m2, m6 pmulhrsw m5, m3 paddsw m5, m6 ; 5 pmulhrsw m6, m2, m7 pmulhrsw m6, m3 paddsw m7, m6 ; 7 pmulhrsw m6, m0, [coeffq+16*6] mova [rsp+gprsize+16*0], m4 pmulhrsw m4, m2, m6 pmulhrsw m4, m3 paddsw m6, 
m4 ; 6 pmulhrsw m4, m0, [coeffq+16*4] mova [rsp+gprsize+16*1], m6 pmulhrsw m6, m2, m4 pmulhrsw m6, m3 paddsw m4, m6 ; 4 pmulhrsw m6, m0, [coeffq+16*2] pmulhrsw m0, [coeffq+16*0] pmulhrsw m2, m6 pmulhrsw m2, m3 paddsw m2, m6 ; 2 pmulhrsw m6, m0, [o(pw_1697x16)] pmulhrsw m6, m3 mova m3, [rsp+gprsize+16*0] paddsw m0, m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: mova [coeffq+16*1], m4 mova [coeffq+16*3], m5 mova [coeffq+16*5], m6 mova [coeffq+16*7], m7 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mova [coeffq-16*7], m0 mova [coeffq-16*5], m1 mova [coeffq-16*3], m2 mova [coeffq-16*1], m3 mov tx2q, r3 jmp .pass1 .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iidentity_8x8_internal_8bpc).end .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iidentity_8x8_internal_8bpc).end %macro INV_TXFM_16X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x16, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 8 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*3, 64 call m(idct_16x8_internal_8bpc).main mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 
LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] jmp m(idct_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 mov dstq, r3 mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*4 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*12] mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] jmp m(idct_8x16_internal_8bpc).pass2_main %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 mova m0, [coeffq+16*1 ] mova m1, [coeffq+16*3 ] mova m2, [coeffq+16*29] mova m3, [coeffq+16*31] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 mova m0, [coeffq+16*13] mova m1, [coeffq+16*15] mova m2, [coeffq+16*17] mova m3, [coeffq+16*19] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 mova m0, [coeffq+16*5 ] mova m1, [coeffq+16*7 ] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*11] mova m4, [coeffq+16*21] mova m5, [coeffq+16*23] mova m6, [coeffq+16*25] mova m7, [coeffq+16*27] %endmacro %macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*2 ] mova m2, [coeffq+16*28] mova m3, [coeffq+16*30] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 mova m0, [coeffq+16*12] mova m1, [coeffq+16*14] mova m2, [coeffq+16*16] mova m3, [coeffq+16*18] mova [rsp+gprsize+16*3], m2 mova 
[rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m4, [coeffq+16*20] mova m5, [coeffq+16*22] mova m6, [coeffq+16*24] mova m7, [coeffq+16*26] %endmacro INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] jmp m(iadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 mov dstq, r3 mova m4, [coeffq+16*0 ] mova m5, [coeffq+16*2 ] mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m6, [coeffq+16*12] mova m7, [coeffq+16*14] mova [rsp+gprsize+16*7], m4 mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] jmp 
m(iadst_8x16_internal_8bpc).pass2_main INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*17, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS coeffq+16* 0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 mova m4, [coeffq+16*0 ] mova m5, [coeffq+16*2 ] mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m6, [coeffq+16*12] mova m7, [coeffq+16*14] mova [rsp+gprsize+16*7], m4 mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 lea tx2q, [o(.end2)] mov dstq, r3 jmp m(iflipadst_8x16_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], 
m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 psraw m%2, 1 pavgw m%1, m%2 %endmacro INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q lea tx2q, [o(.pass1_end)] .pass1: mova m6, [o(pw_1697x16)] mova m7, [coeffq+32*6] mova m0, [coeffq+32*0] mova m1, [coeffq+32*1] mova m2, [coeffq+32*2] mova m3, [coeffq+32*3] mova m4, [coeffq+32*4] REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 mova m5, [coeffq+32*5] mova [rsp+gprsize+16*1], m7 IDTX16B 5, 7, 6 mova m7, [coeffq+32*7] IDTX16B 7, 6, 6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 lea tx2q, [o(.pass1_end1)] jmp .pass1 .pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 lea tx2q, [o(.pass1_end2)] jmp .pass1 .pass1_end2: SAVE_8ROWS coeffq, 32 sub coeffq, 16 mov tx2q, r3 jmp .pass1 .pass2: lea r3, [dstq+8] lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*1], m4 mova m7, [o(pw_1697x16)] REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 mova m4, [o(pw_2048)] pmulhrsw m5, m4 pmulhrsw m6, m4 mova [rsp+gprsize+16*2], m5 mova m5, [rsp+gprsize+16*1] mova [rsp+gprsize+16*1], m6 IDTX16 5, 6, 7 mova m6, [rsp+gprsize+16*0] IDTX16 6, 7, 7 REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 pmulhrsw m4, m5 mova [rsp+gprsize+16*0], m6 jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*2] jmp .end .end2: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 LOAD_8ROWS coeffq, 32 lea tx2q, [o(.end3)] mov dstq, r3 jmp .end .end3: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, 
tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_8x32_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd pmulhrsw m0, m2 psrlw m2, 2 ;pw_2048 pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 cmp eobd, 106 jle .fast LOAD_8ROWS coeffq+16*3, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1: mova [rsp+gprsize+16*9 ], m0 ;in24 mova [rsp+gprsize+16*10], m4 ;in28 mova [rsp+gprsize+16*17], m2 ;in26 mova [rsp+gprsize+16*18], m6 ;in30 mova [rsp+gprsize+16*31], m1 ;in25 mova [rsp+gprsize+16*30], m3 ;in27 mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 LOAD_8ROWS coeffq+16*2, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_1: mova [rsp+gprsize+16*7 ], m0 ;in16 mova [rsp+gprsize+16*8 ], m4 ;in20 mova [rsp+gprsize+16*15], m2 ;in18 mova [rsp+gprsize+16*16], m6 ;in22 mova [rsp+gprsize+16*33], m1 ;in17 mova [rsp+gprsize+16*28], m3 ;in19 mova [rsp+gprsize+16*29], m5 ;in21 mova [rsp+gprsize+16*32], m7 ;in23 .fast: LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: mova [rsp+gprsize+16*5 ], m0 ;in8 mova [rsp+gprsize+16*6 ], m4 ;in12 mova [rsp+gprsize+16*13], m2 ;in10 mova [rsp+gprsize+16*14], m6 ;in14 mova [rsp+gprsize+16*21], m1 ;in9 mova [rsp+gprsize+16*24], m3 ;in11 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: mova 
[rsp+gprsize+16*11], m2 ;in2 mova [rsp+gprsize+16*12], m6 ;in6 mova [rsp+gprsize+16*19], m1 ;in1 mova [rsp+gprsize+16*26], m3 ;in3 mova [rsp+gprsize+16*23], m5 ;in5 mova [rsp+gprsize+16*22], m7 ;in7 mova m1, m4 ;in4 mova m2, [rsp+gprsize+16*5 ] ;in8 mova m3, [rsp+gprsize+16*6 ] ;in12 cmp eobd, 106 jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] mova m2, [rsp+gprsize+16*13] mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main_fast jmp .pass2 .full: mova m4, [rsp+gprsize+16*7 ] ;in16 mova m5, [rsp+gprsize+16*8 ] ;in20 mova m6, [rsp+gprsize+16*9 ] ;in24 mova m7, [rsp+gprsize+16*10] ;in28 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main .pass2: lea r3, [o(.end6)] .end: mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.end2)] .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 jmp tx2q .end2: lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).end .end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).end .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).end .end5: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).end .end6: ret ALIGN function_align cglobal_label .main_veryfast mova m0, [rsp+gprsize*2+16*19] ;in1 pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 pmulhrsw m0, [o(pw_201x8)] 
;t16,t17 mova m7, [o(pd_2048)] mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*34], m3 ;t31 ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*20], m3 ;t17a mova [rsp+gprsize*2+16*33], m0 ;t30a mova m1, [rsp+gprsize*2+16*22] ;in7 pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 mova [rsp+gprsize*2+16*22], m1 ;t19 mova [rsp+gprsize*2+16*31], m2 ;t28 ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m2 ;t18a mova [rsp+gprsize*2+16*32], m1 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 pmulhrsw m0, [o(pw_995x8)] ;t20, t21 mova [rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*30], m3 ;t27 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*24], m3 ;t21a mova [rsp+gprsize*2+16*29], m0 ;t26a mova m2, [rsp+gprsize*2+16*26] ;in3 pxor m0, m0 mova m3, m0 pmulhrsw m1, m2, [o(pw_4052x8)] pmulhrsw m2, [o(pw_m601x8)] jmp .main2 ALIGN function_align cglobal_label .main_fast ;bottom half is zero mova m0, [rsp+gprsize*2+16*19] ;in1 mova m1, [rsp+gprsize*2+16*20] ;in15 pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a pmulhrsw m0, [o(pw_201x8)] ;t16a pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a pmulhrsw m1, [o(pw_m2751x8)] ;t17a mova m7, [o(pd_2048)] psubsw m4, m0, m1 ;t17 paddsw m0, m1 ;t16 psubsw m5, m3, m2 ;t30 paddsw m3, m2 ;t31 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*20], m5 ;t17a mova [rsp+gprsize*2+16*33], m4 ;t30a mova [rsp+gprsize*2+16*34], m3 ;t31 mova m0, [rsp+gprsize*2+16*21] ;in9 mova m1, [rsp+gprsize*2+16*22] ;in7 pmulhrsw m3, m0, [o(pw_3703x8)] pmulhrsw m0, [o(pw_1751x8)] pmulhrsw m2, m1, [o(pw_3857x8)] pmulhrsw m1, [o(pw_m1380x8)] psubsw m4, m1, m0 ;t18 paddsw m0, m1 ;t19 psubsw m5, m2, m3 ;t29 paddsw m3, m2 ;t28 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*22], m0 ;t19 mova 
[rsp+gprsize*2+16*31], m3 ;t28 mova [rsp+gprsize*2+16*32], m4 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 mova m1, [rsp+gprsize*2+16*24] ;in11 pmulhrsw m3, m0, [o(pw_3973x8)] pmulhrsw m0, [o(pw_995x8)] pmulhrsw m2, m1, [o(pw_3513x8)] pmulhrsw m1, [o(pw_m2106x8)] psubsw m4, m0, m1 ;t21 paddsw m0, m1 ;t20 psubsw m5, m3, m2 ;t26 paddsw m3, m2 ;t27 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m4 ;t26a mova [rsp+gprsize*2+16*30], m3 ;t27 mova m0, [rsp+gprsize*2+16*25] ;in13 mova m2, [rsp+gprsize*2+16*26] ;in3 pmulhrsw m3, m0, [o(pw_3290x8)] pmulhrsw m0, [o(pw_2440x8)] pmulhrsw m1, m2, [o(pw_4052x8)] pmulhrsw m2, [o(pw_m601x8)] jmp .main2 ALIGN function_align cglobal_label .main mova m7, [o(pd_2048)] mova m0, [rsp+gprsize*2+16*19] ;in1 mova m1, [rsp+gprsize*2+16*20] ;in15 mova m2, [rsp+gprsize*2+16*33] ;in17 mova m3, [rsp+gprsize*2+16*34] ;in31 ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a psubsw m4, m0, m2 ;t17 paddsw m0, m2 ;t16 psubsw m5, m3, m1 ;t30 paddsw m3, m1 ;t31 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*20], m5 ;t17a mova [rsp+gprsize*2+16*33], m4 ;t30a mova [rsp+gprsize*2+16*34], m3 ;t31 mova m0, [rsp+gprsize*2+16*21] ;in9 mova m1, [rsp+gprsize*2+16*22] ;in7 mova m2, [rsp+gprsize*2+16*31] ;in25 mova m3, [rsp+gprsize*2+16*32] ;in23 ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a psubsw m4, m2, m0 ;t18 paddsw m0, m2 ;t19 psubsw m5, m1, m3 ;t29 paddsw m3, m1 ;t28 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*22], m0 ;t19 mova [rsp+gprsize*2+16*31], m3 ;t28 mova [rsp+gprsize*2+16*32], m4 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 mova m1, [rsp+gprsize*2+16*24] ;in11 mova m2, [rsp+gprsize*2+16*29] ;in21 mova m3, 
[rsp+gprsize*2+16*30] ;in27 ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a psubsw m4, m0, m2 ;t21 paddsw m0, m2 ;t20 psubsw m5, m3, m1 ;t26 paddsw m3, m1 ;t27 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m4 ;t26a mova [rsp+gprsize*2+16*30], m3 ;t27 mova m0, [rsp+gprsize*2+16*25] ;in13 mova m1, [rsp+gprsize*2+16*26] ;in3 mova m2, [rsp+gprsize*2+16*27] ;in29 mova m3, [rsp+gprsize*2+16*28] ;in19 ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a .main2: psubsw m4, m2, m0 ;t22 paddsw m0, m2 ;t23 psubsw m5, m1, m3 ;t25 paddsw m3, m1 ;t24 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a mova m2, [rsp+gprsize*2+16*24] ;t21a psubsw m1, m5, m2 ;t21 paddsw m5, m2 ;t22 mova [rsp+gprsize*2+16*25], m5 ;t22 mova m2, [rsp+gprsize*2+16*29] ;t26a psubsw m5, m4, m2 ;t26 paddsw m4, m2 ;t25 mova [rsp+gprsize*2+16*28], m4 ;t25 ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m1 ;t26a mova m1, [rsp+gprsize*2+16*23] ;t20 mova m5, [rsp+gprsize*2+16*30] ;t27 psubsw m2, m0, m1 ;t20a paddsw m0, m1 ;t23a psubsw m6, m3, m5 ;t27a paddsw m3, m5 ;t24a ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 mova [rsp+gprsize*2+16*26], m0 ;t23a mova [rsp+gprsize*2+16*27], m3 ;t24a mova [rsp+gprsize*2+16*30], m2 ;t27 mova m0, [rsp+gprsize*2+16*20] ;t17a mova m1, [rsp+gprsize*2+16*21] ;t18a mova m2, [rsp+gprsize*2+16*32] ;t29a mova m3, [rsp+gprsize*2+16*33] ;t30a psubsw m4, m0, m1 ;t18 paddsw m0, m1 ;t17 psubsw m5, m3, m2 ;t29 paddsw m3, m2 ;t30 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a mova [rsp+gprsize*2+16*20], m0 ;t17 mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*32], m4 ;t29a mova [rsp+gprsize*2+16*33], m3 ;t30 mova m0, [rsp+gprsize*2+16*19] ;t16 mova m1, [rsp+gprsize*2+16*22] ;t19 mova m2, 
[rsp+gprsize*2+16*31] ;t28 mova m3, [rsp+gprsize*2+16*34] ;t31 psubsw m4, m0, m1 ;t19a paddsw m0, m1 ;t16a psubsw m5, m3, m2 ;t28a paddsw m3, m2 ;t31a ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 mova m2, [rsp+gprsize*2+16*15] ;tmp12 psubsw m1, m5, m6 ;t20a paddsw m5, m6 ;t19a psubsw m6, m2, m5 ;out19 paddsw m2, m5 ;out12 mova m5, [rsp+gprsize*2+16*30] ;t27 mova [rsp+gprsize*2+16*22], m6 ;out19 mova [rsp+gprsize*2+16*15], m2 ;out12 psubsw m6, m4, m5 ;t27a paddsw m4, m5 ;t28a ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 psubsw m5, m2, m4 ;out28 paddsw m2, m4 ;out3 mova m4, [rsp+gprsize*2+16*14] ;tmp11 mova [rsp+gprsize*2+16*31], m5 ;out28 mova [rsp+gprsize*2+16*6 ], m2 ;out3 psubsw m5, m4, m6 ;out20 paddsw m4, m6 ;out11 mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 mova [rsp+gprsize*2+16*23], m5 ;out20 mova [rsp+gprsize*2+16*14], m4 ;out11 psubsw m5, m2, m1 ;out27 paddsw m2, m1 ;out4 mova m1, [rsp+gprsize*2+16*26] ;t23a mova m4, [rsp+gprsize*2+16*27] ;t24a mova [rsp+gprsize*2+16*30], m5 ;out27 mova [rsp+gprsize*2+16*7 ], m2 ;out4 psubsw m5, m0, m1 ;t23 paddsw m0, m1 ;t16 psubsw m2, m3, m4 ;t24 paddsw m3, m4 ;t31 ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a mova m6, [rsp+gprsize*2+16*18] ;tmp15 psubsw m4, m6, m0 ;out16 paddsw m6, m0 ;out15 mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 mova m1, [rsp+gprsize*2+16*11] ;tmp8 mova [rsp+gprsize*2+16*18], m6 ;out15 mova [rsp+gprsize*2+16*19], m4 ;out16 psubsw m6, m0, m3 ;out31 paddsw m0, m3 ;out0 psubsw m4, m1, m2 ;out23 paddsw m1, m2 ;out8 mova m3, [rsp+gprsize*2+16*10] ;tmp7 mova [rsp+gprsize*2+16*34], m6 ;out31 mova [rsp+gprsize*2+16*11], m1 ;out8 mova [rsp+gprsize*2+16*26], m4 ;out23 paddsw m6, m3, m5 ;out7 psubsw m3, m5 ;out24 mova m1, [rsp+gprsize*2+16*20] ;t17 mova m5, [rsp+gprsize*2+16*25] ;t22 mova m2, [rsp+gprsize*2+16*17] ;tmp14 mova [rsp+gprsize*2+16*27], m3 ;out24 psubsw m4, m1, m5 ;t22a paddsw m1, m5 ;t17a psubsw m3, m2, m1 ;out17 paddsw m2, m1 ;out14 mova m5, 
[rsp+gprsize*2+16*28] ;t25 mova m1, [rsp+gprsize*2+16*33] ;t30 mova [rsp+gprsize*2+16*17], m2 ;out14 mova [rsp+gprsize*2+16*20], m3 ;out17 psubsw m2, m1, m5 ;t25a paddsw m1, m5 ;t30a ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 psubsw m3, m5, m1 ;out30 paddsw m5, m1 ;out1 mova m1, [rsp+gprsize*2+16*12] ;tmp9 mova [rsp+gprsize*2+16*33], m3 ;out30 mova [rsp+gprsize*2+16*4 ], m5 ;out1 psubsw m3, m1, m2 ;out22 paddsw m1, m2 ;out9 mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 mova [rsp+gprsize*2+16*25], m3 ;out22 mova [rsp+gprsize*2+16*12], m1 ;out9 psubsw m3, m5, m4 ;out25 paddsw m5, m4 ;out6 mova m4, [rsp+gprsize*2+16*21] ;t18a mova m1, [rsp+gprsize*2+16*24] ;t21a mova m2, [rsp+gprsize*2+16*16] ;tmp13 mova [rsp+gprsize*2+16*28], m3 ;out25 mova [rsp+gprsize*2+16*9 ], m5 ;out6 paddsw m3, m4, m1 ;t18 psubsw m4, m1 ;t21 psubsw m5, m2, m3 ;out18 paddsw m2, m3 ;out13 mova m1, [rsp+gprsize*2+16*29] ;t26a mova m3, [rsp+gprsize*2+16*32] ;t29a mova [rsp+gprsize*2+16*21], m5 ;out18 mova [rsp+gprsize*2+16*16], m2 ;out13 psubsw m5, m3, m1 ;t26 paddsw m3, m1 ;t29 ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 psubsw m1, m2, m3 ;out29 paddsw m2, m3 ;out2 mova m3, [rsp+gprsize*2+16*13] ;tmp10 mova [rsp+gprsize*2+16*32], m1 ;out29 psubsw m7, m3, m5 ;out21 paddsw m3, m5 ;out10 mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 mova [rsp+gprsize*2+16*24], m7 ;out21 mova [rsp+gprsize*2+16*13], m3 ;out10 psubsw m1, m5, m4 ;out26 paddsw m5, m4 ;out5 mova m7, m6 ;out7 mova m3, [rsp+gprsize*2+16*6 ] ;out3 mova m4, [rsp+gprsize*2+16*7 ] ;out4 mova [rsp+gprsize*2+16*29], m1 ;out26 mova m6, [rsp+gprsize*2+16*9 ] ;out6 mova m1, [rsp+gprsize*2+16*4 ] ;out1 ret cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x8_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd 
mov r3d, 8 lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m5, m5 .loop: mova m1, [dstq+16*0] mova m3, [dstq+16*1] punpckhbw m2, m1, m5 punpcklbw m1, m5 punpckhbw m4, m3, m5 punpcklbw m3, m5 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 packuswb m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m3 add dstq, strideq dec r3d jg .loop jmp tx2q .end: RET cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+16*1, 32 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 cmp eobd, 106 jg .full call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: LOAD_8ROWS coeffq+16*17, 32 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main .pass2: mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.end)] jmp m(idct_8x32_internal_8bpc).end1 .end: mova m7, [o(pw_8192)] lea tx2q, [o(.end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end1: lea r3, [dstq+8] lea tx2q, [o(.end2)] jmp m(idct_8x8_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end3: mov dstq, r3 add r3, 8 lea tx2q, [o(.end4)] jmp 
m(idct_8x8_internal_8bpc).pass2_main

.end4:
    ; Reload the eight rows kept at rsp+gprsize+16*19, park m7 in scratch
    ; slot 0 and load pw_8192 into m7 before entering the shared 8x8 tail.
    LOAD_8ROWS rsp+gprsize+16*19, 16
    mova [rsp+gprsize+16*0 ], m7
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.end5)]                ; continuation label; presumably reached through tx2q by the helper
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end5:
    mov dstq, r3                        ; switch to the destination pointer kept in r3
    add r3, 8                           ; and advance the kept pointer by 8 bytes
    lea tx2q, [o(.end6)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end6:
    ; Same sequence for the rows kept at rsp+gprsize+16*27.
    LOAD_8ROWS rsp+gprsize+16*27, 16
    mova [rsp+gprsize+16*0 ], m7
    mova m7, [o(pw_8192)]
    lea tx2q, [o(.end7)]
    jmp m(idct_8x8_internal_8bpc).pass1_end1

.end7:
    mov dstq, r3                        ; last group: r3 is not advanced any further
    lea tx2q, [o(.end8)]
    jmp m(idct_8x8_internal_8bpc).pass2_main

.end8:
    ret

; Identity/identity inverse transform + add, 8x32 (per the function name).
; Each iteration loads eight 16-byte coefficient rows at a 64-byte pitch,
; adds 5 and shifts right by 3 around the pass1_end3 helper, writes through
; the end3 helper, clears the consumed coefficients and steps coeffq by 16.
; The loop runs 2 times when eob < 107 and 4 times otherwise.
cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov r5d, 4
    mov tx2d, 2
    cmp eobd, 107
    cmovns tx2d, r5d                    ; eob - 107 non-negative -> 4 iterations
    mov r3d, tx2d                       ; r3d is the loop counter
%if ARCH_X86_32
    LEA r5, $$
%endif
    lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] ; .end8 is a plain ret
.loop:
    LOAD_8ROWS coeffq+16*0, 64
    paddsw m6, [o(pw_5)]                ; m6 is biased first and spilled ...
    mova [rsp+16*1], m6
    mova m6, [o(pw_5)]                  ; ... so it can carry the constant for the other rows
    REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    call m(idct_8x8_internal_8bpc).pass1_end3
    REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    mova [rsp+16*2], m5                 ; end3 is entered with m5/m6/m7 also stored in slots 2/1/0
    mova [rsp+16*1], m6
    mova [rsp+16*0], m7
    call m(idct_8x8_internal_8bpc).end3
    lea dstq, [dstq+strideq*2]
    pxor m7, m7
    REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the rows just consumed
    add coeffq, 16
    dec r3d
    jg .loop
    RET

; Identity/identity inverse transform + add, 32x8 (per the function name).
; Each iteration loads eight consecutive 16-byte coefficient rows, scales
; them with pw_4096, runs the pass1_end3 and end3 helpers, then advances
; coeffq by 16*8 bytes and dst by 8 bytes.
; The loop runs 2 times when eob < 107 and 4 times otherwise.
cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov r5d, 4
    mov tx2d, 2
    cmp eobd, 107
    cmovns tx2d, r5d                    ; eob - 107 non-negative -> 4 iterations
    mov r3d, tx2d                       ; r3d is the loop counter
%if ARCH_X86_32
    LEA r5, $$
%endif

.loop:
    LOAD_8ROWS coeffq+16*0, 16
    pmulhrsw m6, [o(pw_4096)]           ; m6 is scaled first and spilled ...
    mova [rsp+16*1], m6
    mova m6, [o(pw_4096)]               ; ... so it can carry the constant for the other rows
    REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] ; .end8 is a plain ret
    call m(idct_8x8_internal_8bpc).pass1_end3
    mov [rsp+16*3], dstq                ; keep the start of this strip; dstq is restored from here below
    mova [rsp+16*2], m5
    mova [rsp+16*1], m6
    mova [rsp+16*0], m7
    lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
    call m(idct_8x8_internal_8bpc).end3
    add coeffq, 16*8
    mov dstq, [rsp+16*3]
    lea dstq, [dstq+8]                  ; next strip starts 8 bytes further on
    dec r3d
    jg .loop
    ; r3d alone bounds this loop. dec does not modify CF, and the last
    ; instruction that wrote it is the pointer add above, so no carry-based
    ; branch may follow here: it would be taken once the counter hits zero.
    RET

cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff,
eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_16x32_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: mova [coeffq+16*1 ], m0 ;in8 mova [coeffq+16*5 ], m4 ;in12 mova [rsp+gprsize+16*13], m2 ;in10 mova [rsp+gprsize+16*14], m6 ;in14 mova [rsp+gprsize+16*21], m1 ;in9 mova [rsp+gprsize+16*24], m3 ;in11 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: mova [rsp+gprsize+16*11], m2 ;in2 mova [rsp+gprsize+16*12], m6 ;in6 mova [rsp+gprsize+16*19], m1 ;in1 mova [rsp+gprsize+16*26], m3 ;in3 mova [rsp+gprsize+16*23], m5 ;in5 mova [rsp+gprsize+16*22], m7 ;in7 cmp eobd, 150 jg .full mova m1, m4 ;in4 mova m2, [coeffq+16*1 ] ;in8 mova m3, [coeffq+16*5 ] ;in12 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [rsp+gprsize+16*11] ;in2 mova m1, [rsp+gprsize+16*12] ;in6 mova m2, 
[rsp+gprsize+16*13] ;in10 mova m3, [rsp+gprsize+16*14] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: mova [coeffq+16*0 ], m0 ;in0 mova [coeffq+16*4 ], m4 ;in4 LOAD_8ROWS coeffq+16*2, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: mova [coeffq+16*2 ], m0 ;in16 mova [coeffq+16*6 ], m4 ;in20 mova [rsp+gprsize+16*15], m2 ;in18 mova [rsp+gprsize+16*16], m6 ;in22 mova [rsp+gprsize+16*33], m1 ;in17 mova [rsp+gprsize+16*28], m3 ;in19 mova [rsp+gprsize+16*29], m5 ;in21 mova [rsp+gprsize+16*32], m7 ;in23 LOAD_8ROWS coeffq+16*3, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: mova [rsp+gprsize+16*17], m2 ;in26 mova [rsp+gprsize+16*18], m6 ;in30 mova [rsp+gprsize+16*31], m1 ;in25 mova [rsp+gprsize+16*30], m3 ;in27 mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 mova m6, m0 ;in24 mova m7, m4 ;in28 mova m0, [coeffq+16*0 ] ;in0 mova m1, [coeffq+16*4 ] ;in4 mova m2, [coeffq+16*1 ] ;in8 mova m3, [coeffq+16*5 ] ;in12 mova m4, [coeffq+16*2 ] ;in16 mova m5, [coeffq+16*6 ] ;in20 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 call 
m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16
    call m(idct_8x32_internal_8bpc).main

.pass2:
    ; Stash eob and the second destination pointer (dst+8) in the stack slots
    ; at 16*35, then run the 8x32 output stage with r3 = .end as the label
    ; to come back to.
    mov [rsp+gprsize*1+16*35], eobd
    lea r3, [dstq+8]
    mov [rsp+gprsize*2+16*35], r3
    lea r3, [o(.end)]
    jmp m(idct_8x32_internal_8bpc).end

.end:
    ; Second half: restore dst/eob, move 16*32 bytes further into the
    ; coefficient buffer and stage the odd inputs in the stack scratch area.
    mov dstq, [rsp+gprsize*2+16*35]
    mov eobd, [rsp+gprsize*1+16*35]
    add coeffq, 16*32
    mova m0, [coeffq+16*4 ]             ;in1
    mova m1, [coeffq+16*12]             ;in3
    mova m2, [coeffq+16*20]             ;in5
    mova m3, [coeffq+16*28]             ;in7
    mova m4, [coeffq+16*5 ]             ;in9
    mova m5, [coeffq+16*13]             ;in11
    mova m6, [coeffq+16*21]             ;in13
    mova m7, [coeffq+16*29]             ;in15
    mova [rsp+gprsize+16*19], m0        ;in1
    mova [rsp+gprsize+16*26], m1        ;in3
    mova [rsp+gprsize+16*23], m2        ;in5
    mova [rsp+gprsize+16*22], m3        ;in7
    mova [rsp+gprsize+16*21], m4        ;in9
    mova [rsp+gprsize+16*24], m5        ;in11
    mova [rsp+gprsize+16*25], m6        ;in13
    mova [rsp+gprsize+16*20], m7        ;in15
    mova m0, [coeffq+16*0 ]             ;in0
    mova m1, [coeffq+16*16]             ;in4
    mova m2, [coeffq+16*1 ]             ;in8
    mova m3, [coeffq+16*17]             ;in12
    cmp eobd, 150
    jg .full1
    ; eob <= 150: the upper inputs are fed as zeros and main_fast is used
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    mova m0, [coeffq+16*8 ]             ;in2
    mova m1, [coeffq+16*24]             ;in6
    mova m2, [coeffq+16*9 ]             ;in10
    mova m3, [coeffq+16*25]             ;in14
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16
    call m(idct_8x32_internal_8bpc).main_fast
    jmp m(idct_8x32_internal_8bpc).pass2

.full1:
    ; eob > 150: all inputs are loaded from the coefficient buffer
    mova m4, [coeffq+16*2 ]             ;in16
    mova m5, [coeffq+16*18]             ;in20
    mova m6, [coeffq+16*3 ]             ;in24
    mova m7, [coeffq+16*19]             ;in28
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    mova m0, [coeffq+16*8 ]             ;in2
    mova m1, [coeffq+16*24]             ;in6
    mova m2, [coeffq+16*9 ]             ;in10
    mova m3, [coeffq+16*25]             ;in14
    mova m4, [coeffq+16*10]             ;in18
    mova m5, [coeffq+16*26]             ;in22
    mova m6, [coeffq+16*11]             ;in26
    mova m7, [coeffq+16*27]             ;in30
    call m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16
    mova m0,
[coeffq+16*6 ] ;in17 mova m1, [coeffq+16*14] ;in19 mova m2, [coeffq+16*22] ;in21 mova m3, [coeffq+16*30] ;in23 mova m4, [coeffq+16*7 ] ;in25 mova m5, [coeffq+16*15] ;in27 mova m6, [coeffq+16*23] ;in29 mova m7, [coeffq+16*31] ;in31 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp m(idct_8x32_internal_8bpc).pass2 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x16_internal_8bpc) call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*11, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*19, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*27, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 16 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16 lea r3, [o(.pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 
LOAD_8ROWS coeffq+16*2, 64, 1 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*34, 64, 1 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main .pass1_end: mova [rsp+gprsize+16*0 ], m7 mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*48, 32 sub coeffq, 16 lea r3, [o(.end)] jmp .pass1 .end: ret cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r4d, eobd cmp eobd, 43 ;if (eob > 43) sbb r3d, r3d ; iteration_count++ cmp r4d, 150 ;if (eob > 150) sbb r3d, 0 ; iteration_count++ cmp r4d, 278 ;if (eob > 278) sbb r3d, -4 ; iteration_count++ %if ARCH_X86_32 LEA r5, $$ %endif lea r4, [dstq+8] mov [rsp+16*3], r4 mov [rsp+gprsize+16*3], r3d mov [rsp+gprsize*2+16*3], coeffq .loop: LOAD_8ROWS coeffq, 64, 1 mova [rsp+16*1], m6 pxor m6, m6 REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*0], m2 
    mova [rsp+16*1], m3
    mova [rsp+16*2], m4
    mova m3, [o(pw_1697x16)]
    mova m4, [o(pw_16384)]
    REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1
    mova m2, [o(pw_8192)]
    REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
    ; the three spilled rows are brought back one at a time, each swapped
    ; against a finished register so that no value is lost
    mova m2, [rsp+16*0]
    mova [rsp+16*0], m7
    IDTX16 2, 7, 3, 4
    mova m7, [rsp+16*2]
    mova [rsp+16*2], m5
    IDTX16 7, 5, 3, 4
    mova m5, [rsp+16*1]
    mova [rsp+16*1], m6
    pmulhrsw m3, m5
    pmulhrsw m3, m4
    psrlw m4, 1                         ; pw_8192
    paddsw m3, m5
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    pmulhrsw m4, m7
    call m(idct_8x8_internal_8bpc).end3
    lea dstq, [dstq+strideq*2]
    add coeffq, 16
    dec r3d
    jg .loop
    ; inner loop done: step 64*8 bytes from the saved coefficient base and
    ; reload the saved iteration count for one more pass
    mov coeffq, [rsp+gprsize*2+16*3]
    add coeffq, 64*8
    mov r3d, [rsp+gprsize+16*3]
    xor dstq, dstq
    mov [rsp+gprsize+16*3], dstq        ; zero the saved count so the next reload reads 0 and ends the loop
    mov dstq, [rsp+16*3]                ; continue at the destination pointer saved in [rsp+16*3]
    test r3d, r3d
    jnz .loop
    RET

; Identity/identity inverse transform + add, 32x16 (per the function name).
; r3d holds a bit pattern instead of a plain counter. After every iteration
; it is shifted right by 2: zero ends the function, bit 1 set means "another
; iteration on the same destination strip", otherwise the code switches to
; the next strip (the pointer kept in [rsp+16*3], 8 bytes further on).
; The three patterns give 2, 4 or 8 iterations, two per strip.
cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
    mov r4d, 12                         ;1100b: 2 iterations
    mov r5d, 136                        ;1000 1000b: 4 iterations
    cmp eobd, 44                        ;if (eob > 43)
    cmovns r4d, r5d                     ;  use the 4-iteration pattern
    cmp eobd, 151                       ;if (eob <= 150)
    mov r3d, 34952                      ;1000 1000 1000 1000b: 8 iterations
    cmovs r3d, r4d                      ;  keep the shorter pattern chosen above
%if ARCH_X86_32
    LEA r5, $$
%endif
    lea r4, [dstq+8]
    mov [rsp+16*3], r4                  ; start of the next destination strip
.loop:
    LOAD_8ROWS coeffq, 32, 1
    REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; double every row (saturating)
    mova [rsp+16*1], m6
    lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
    call m(idct_8x8_internal_8bpc).pass1_end3
    mova [rsp+16*1], m5                 ; spill m5/m6 to free registers for IDTX16
    mova [rsp+16*2], m6
    mova m6, [o(pw_1697x16)]
    REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4
    pmulhrsw m7, [o(pw_2048)]
    mova m5, [rsp+16*1]
    mova [rsp+16*0], m7
    IDTX16 5, 7, 6
    mova m7, [rsp+16*2]
    IDTX16 7, 6, 6
    mova m6, [o(pw_2048)]
    REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
    mova [rsp+16*2], m5
    mova [rsp+16*1], m7
    call m(idct_8x8_internal_8bpc).end3
    lea dstq, [dstq+strideq*2]
    pxor m7, m7
    REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the rows just consumed
.loop_end:
    add coeffq, 16
    shr r3d, 2                          ; consume two pattern bits
    jz .ret                             ; pattern exhausted
    test r3d, 2
    jnz .loop                           ; bit 1 set: stay on the current strip
    ; switch to the next strip
    mov r4d, r3d
    and r4d, 1
    lea coeffq, [coeffq+r4*8+32*7]
    mov dstq, [rsp+16*3]
    lea r4, [dstq+8]
    mov [rsp+16*3], r4                  ; remember where the strip after this one starts
    jmp .loop
.ret:
    RET

cglobal
inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x32_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 32 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*35], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*35], coeffq .pass1_loop: LOAD_8ROWS coeffq+64*1, 64*2 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov tx2d, [rsp+gprsize*1+16*35] test tx2d, tx2d jl .fast .full: LOAD_8ROWS coeffq+64*0, 64*4 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*17, 64*2 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: mova m0, [coeffq+256*0] mova m1, [coeffq+256*1] mova m2, [coeffq+256*2] mova m3, [coeffq+256*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+128*1] mova m1, [coeffq+128*3] mova m2, [coeffq+128*5] mova m3, [coeffq+128*7] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] 
SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 add coeffq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 lea tx2q, [o(.pass2_end)] .pass2_loop: mov [rsp+gprsize*3+16*35], r3d lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*12] mova m2, [coeffq+16*20] mova m3, [coeffq+16*28] mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*13] mova m6, [coeffq+16*21] mova m7, [coeffq+16*29] mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov eobd, [rsp+gprsize*1+16*35] test eobd, eobd jl .fast1 .full1: mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*16] mova m2, [coeffq+16*1 ] mova m3, [coeffq+16*17] mova m4, [coeffq+16*2 ] mova m5, [coeffq+16*18] mova m6, [coeffq+16*3 ] mova m7, [coeffq+16*19] call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova m4, [coeffq+16*10] mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, 
[coeffq+16*27] call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*6 ] mova m1, [coeffq+16*14] mova m2, [coeffq+16*22] mova m3, [coeffq+16*30] mova m4, [coeffq+16*7 ] mova m5, [coeffq+16*15] mova m6, [coeffq+16*23] mova m7, [coeffq+16*31] mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp tx2q .fast1: mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*16] mova m2, [coeffq+16*1 ] mova m3, [coeffq+16*17] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast jmp tx2q .pass2_end: lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end .pass2_end1: lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d jg .pass2_loop ret cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 mov r4d, 2 cmp eobd, 136 mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif lea r4, [dstq+8] mov [rsp+gprsize*0+16*3], r4 mov [rsp+gprsize*1+16*3], r3d mov [rsp+gprsize*2+16*3], r3d mov [rsp+gprsize*3+16*3], coeffq .loop: LOAD_8ROWS coeffq, 64 mova [rsp+16*1], m6 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end3 pmulhrsw m7, [o(pw_8192)] mova [rsp+16*0], m7 mova m7, [o(pw_8192)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+16*1], m6 mova [rsp+16*2], m5 call 
m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 add coeffq, 16 dec r3d jg .loop mov r4d, [rsp+gprsize*2+16*3] dec r4d jle .ret mov dstq, [rsp+gprsize*0+16*3] mov coeffq, [rsp+gprsize*3+16*3] mov [rsp+gprsize*2+16*3], r4 lea r3, [dstq+8] add coeffq, 64*8 mov [rsp+gprsize*0+16*3], r3 mov r3d, [rsp+gprsize*1+16*3] mov [rsp+gprsize*3+16*3], coeffq jmp .loop .ret: RET cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_16x64_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 151 mov [rsp+gprsize*1+16*67], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*67], coeffq .pass1_loop: LOAD_8ROWS coeffq+64*0, 64*2 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 add coeffq, 16 dec r3d jg .pass1_loop mov coeffq, [rsp+gprsize*2+16*67] mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.end1)] .pass2_loop: mov [rsp+gprsize*3+16*67], r3d mov eobd, [rsp+gprsize*1+16*67] mova m0, [coeffq+16*4 ] ;in1 mova m1, [coeffq+16*12] ;in3 mova m2, [coeffq+16*20] ;in5 mova m3, [coeffq+16*28] ;in7 mova m4, [coeffq+16*5 ] ;in9 mova m5, [coeffq+16*13] ;in11 mova m6, [coeffq+16*21] ;in13 mova m7, [coeffq+16*29] ;in15 mova 
[rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 pxor m4, m4 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] test eobd, eobd jl .fast .full: mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 mova m0, [coeffq+16*16] mova m1, [coeffq+16*17] mova m2, [coeffq+16*18] mova m3, [coeffq+16*19] REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova m4, [coeffq+16*10] mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, [coeffq+16*27] mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*6 ] ;in17 mova m1, [coeffq+16*14] ;in19 mova m2, [coeffq+16*22] ;in21 mova m3, [coeffq+16*30] ;in23 mova m4, [coeffq+16*7 ] ;in25 mova m5, [coeffq+16*15] ;in27 mova m6, [coeffq+16*23] ;in29 mova m7, [coeffq+16*31] ;in31 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call .main jmp .end .fast: REPX {mova x, m4}, m2, m3, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 mova m0, [coeffq+16*16] mova m1, [coeffq+16*17] REPX {mova x, m4}, m2, m3, m5, m6, m7 call 
m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 call m(idct_8x32_internal_8bpc).main_veryfast SAVE_8ROWS rsp+gprsize+16*3, 16 call .main_fast .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, r4 jmp m(idct_8x32_internal_8bpc).end2 .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] lea r3, [rsp+16*32+gprsize] call .write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.end1)] dec r3d jg .pass2_loop ret .write: mova [r3+16*0], m7 mov r4, -16*32 pxor m7, m7 sub coeffq, r4 .zero_loop: mova [coeffq+r4+16*0], m7 mova [coeffq+r4+16*1], m7 add r4, 16*2 jl .zero_loop call .write_main2 LOAD_8ROWS r3+16*11, 16 call .write_main LOAD_8ROWS r3+16*19, 16 call .write_main LOAD_8ROWS r3+16*27, 16 .write_main: mova [r3+16*0], m7 .write_main2: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [r3+16*0] mova [r3+16*2], m5 mova [r3+16*1], m6 mova [r3+16*0], m7 WRITE_8X4 0, 1, 2, 3, 5, 6, 7 lea dstq, [dstq+strideq*2] WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 lea dstq, [dstq+strideq*2] ret ALIGN function_align cglobal_label .main_fast mova m0, [rsp+gprsize*2+16*35] ;in1 pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 pmulhrsw m0, [o(pw_101x8)] ;t32,t33 mova m7, [o(pd_2048)] mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*66], m3 ;t63 ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a mova [rsp+gprsize*2+16*36], m3 ;t33a mova [rsp+gprsize*2+16*65], m0 ;t62a mova m1, [rsp+gprsize*2+16*37] ;in15 pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 mova [rsp+gprsize*2+16*38], m1 ;t35 mova [rsp+gprsize*2+16*63], m2 ;t60 ITX_MULSUB_2W 
2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a mova [rsp+gprsize*2+16*37], m2 ;t34a mova [rsp+gprsize*2+16*64], m1 ;t61a mova m0, [rsp+gprsize*2+16*39] ;in9 pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 pmulhrsw m0, [o(pw_897x8)] ;t36,t37 mova [rsp+gprsize*2+16*39], m0 ;t36 mova [rsp+gprsize*2+16*62], m3 ;t59 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a mova [rsp+gprsize*2+16*40], m3 ;t37a mova [rsp+gprsize*2+16*61], m0 ;t58a mova m1, [rsp+gprsize*2+16*41] ;in7 pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 mova [rsp+gprsize*2+16*42], m1 ;t39 mova [rsp+gprsize*2+16*59], m2 ;t56 ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a mova [rsp+gprsize*2+16*41], m2 ;t38a mova [rsp+gprsize*2+16*60], m1 ;t57a mova m0, [rsp+gprsize*2+16*43] ;in5 pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 pmulhrsw m0, [o(pw_501x8)] ;t40,t41 mova [rsp+gprsize*2+16*43], m0 ;t40 mova [rsp+gprsize*2+16*58], m3 ;t55 ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a mova [rsp+gprsize*2+16*44], m3 ;t41a mova [rsp+gprsize*2+16*57], m0 ;t54a mova m1, [rsp+gprsize*2+16*45] ;in11 pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 mova [rsp+gprsize*2+16*46], m1 ;t43 mova [rsp+gprsize*2+16*55], m2 ;t52 ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a mova [rsp+gprsize*2+16*45], m2 ;t42a mova [rsp+gprsize*2+16*56], m1 ;t53a mova m0, [rsp+gprsize*2+16*47] ;in13 pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 mova m6, m0 mova [rsp+gprsize*2+16*54], m3 ;t51 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a mova [rsp+gprsize*2+16*48], m3 ;t45a mova [rsp+gprsize*2+16*53], m0 ;t50a mova m0, [rsp+gprsize*2+16*49] ;in3 pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 mova m4, m3 mova m5, m0 jmp .main2 ALIGN function_align cglobal_label .main mova m0, [rsp+gprsize*2+16*35] ;in1 mova m1, [rsp+gprsize*2+16*65] ;in31 pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a pmulhrsw m0, [o(pw_101x8)] ;t32a pmulhrsw m2, 
m1, [o(pw_2967x8)] ;t62a pmulhrsw m1, [o(pw_m2824x8)] ;t33a mova m7, [o(pd_2048)] psubsw m4, m0, m1 ;t33 paddsw m0, m1 ;t32 psubsw m5, m3, m2 ;t62 paddsw m3, m2 ;t63 ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*36], m5 ;t33a mova [rsp+gprsize*2+16*65], m4 ;t62a mova [rsp+gprsize*2+16*66], m3 ;t63 mova m0, [rsp+gprsize*2+16*63] ;in17 mova m1, [rsp+gprsize*2+16*37] ;in15 pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a pmulhrsw m0, [o(pw_1660x8)] ;t34a pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a pmulhrsw m1, [o(pw_m1474x8)] ;t35a psubsw m4, m1, m0 ;t34 paddsw m0, m1 ;t35 psubsw m5, m2, m3 ;t61 paddsw m3, m2 ;t60 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a mova [rsp+gprsize*2+16*37], m5 ;t34a mova [rsp+gprsize*2+16*38], m0 ;t35 mova [rsp+gprsize*2+16*63], m3 ;t60 mova [rsp+gprsize*2+16*64], m4 ;t61a mova m0, [rsp+gprsize*2+16*39] ;in9 mova m1, [rsp+gprsize*2+16*61] ;in23 pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a pmulhrsw m0, [o(pw_897x8)] ;t36a pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a pmulhrsw m1, [o(pw_m2191x8)] ;t37a psubsw m4, m0, m1 ;t37 paddsw m0, m1 ;t36 psubsw m5, m3, m2 ;t58 paddsw m3, m2 ;t59 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a mova [rsp+gprsize*2+16*39], m0 ;t36 mova [rsp+gprsize*2+16*40], m5 ;t37a mova [rsp+gprsize*2+16*61], m4 ;t58a mova [rsp+gprsize*2+16*62], m3 ;t59 mova m0, [rsp+gprsize*2+16*59] ;in25 mova m1, [rsp+gprsize*2+16*41] ;in7 pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a pmulhrsw m0, [o(pw_2359x8)] ;t38a pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a pmulhrsw m1, [o(pw_m700x8)] ;t39a psubsw m4, m1, m0 ;t38 paddsw m0, m1 ;t39 psubsw m5, m2, m3 ;t57 paddsw m3, m2 ;t56 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a mova [rsp+gprsize*2+16*41], m5 ;t38a mova [rsp+gprsize*2+16*42], m0 ;t39 mova [rsp+gprsize*2+16*59], m3 ;t56 mova [rsp+gprsize*2+16*60], m4 ;t57a mova m0, [rsp+gprsize*2+16*43] ;in5 mova m1, [rsp+gprsize*2+16*57] ;in27 pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a pmulhrsw m0, 
[o(pw_501x8)] ;t40a pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a pmulhrsw m1, [o(pw_m2520x8)] ;t41a psubsw m4, m0, m1 ;t41 paddsw m0, m1 ;t40 psubsw m5, m3, m2 ;t54 paddsw m3, m2 ;t55 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a mova [rsp+gprsize*2+16*43], m0 ;t40 mova [rsp+gprsize*2+16*44], m5 ;t41a mova [rsp+gprsize*2+16*57], m4 ;t54a mova [rsp+gprsize*2+16*58], m3 ;t55 mova m0, [rsp+gprsize*2+16*55] ;in21 mova m1, [rsp+gprsize*2+16*45] ;in11 pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a pmulhrsw m0, [o(pw_2019x8)] ;t42a pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a pmulhrsw m1, [o(pw_m1092x8)] ;t43a psubsw m4, m1, m0 ;t42 paddsw m0, m1 ;t43 psubsw m5, m2, m3 ;t53 paddsw m3, m2 ;t52 ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*46], m0 ;t43 mova [rsp+gprsize*2+16*55], m3 ;t52 mova [rsp+gprsize*2+16*56], m4 ;t53a mova m0, [rsp+gprsize*2+16*47] ;in13 mova m1, [rsp+gprsize*2+16*53] ;in19 pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a pmulhrsw m0, [o(pw_1285x8)] ;t44a pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a pmulhrsw m1, [o(pw_m1842x8)] ;t45a psubsw m4, m0, m1 ;t45 paddsw m0, m1 ;t44 psubsw m5, m3, m2 ;t50 paddsw m3, m2 ;t51 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a mova m6, m0 mova [rsp+gprsize*2+16*48], m5 ;t45a mova [rsp+gprsize*2+16*53], m4 ;t50a mova [rsp+gprsize*2+16*54], m3 ;t51 mova m0, [rsp+gprsize*2+16*51] ;in29 mova m1, [rsp+gprsize*2+16*49] ;in3 pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a pmulhrsw m0, [o(pw_2675x8)] ;t46a pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a pmulhrsw m1, [o(pw_m301x8)] ;t47a psubsw m5, m1, m0 ;t46 paddsw m0, m1 ;t47 psubsw m4, m2, m3 ;t49 paddsw m3, m2 ;t48 ALIGN function_align .main2: ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a mova m1, [rsp+gprsize*2+16*54] ;t51 psubsw m2, m0, m6 ;t44a paddsw m0, m6 ;t47a psubsw m6, m3, m1 ;t51a paddsw m3, m1 ;t48a mova [rsp+gprsize*2+16*50], m0 ;t47a mova [rsp+gprsize*2+16*51], m3 ;t48a ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 mova 
[rsp+gprsize*2+16*47], m6 ;t44 mova [rsp+gprsize*2+16*54], m2 ;t51 mova m0, [rsp+gprsize*2+16*48] ;t45a mova m3, [rsp+gprsize*2+16*53] ;t50a psubsw m2, m4, m0 ;t45 paddsw m4, m0 ;t46 psubsw m6, m5, m3 ;t50 paddsw m5, m3 ;t49 ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a mova [rsp+gprsize*2+16*48], m6 ;t45a mova [rsp+gprsize*2+16*49], m4 ;t46 mova [rsp+gprsize*2+16*52], m5 ;t49 mova [rsp+gprsize*2+16*53], m2 ;t50a mova m0, [rsp+gprsize*2+16*43] ;t40 mova m2, [rsp+gprsize*2+16*46] ;t43 mova m3, [rsp+gprsize*2+16*55] ;t52 mova m1, [rsp+gprsize*2+16*58] ;t55 psubsw m4, m0, m2 ;t43a paddsw m0, m2 ;t40a psubsw m5, m1, m3 ;t52a paddsw m1, m3 ;t55a ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 mova [rsp+gprsize*2+16*43], m0 ;t40a mova [rsp+gprsize*2+16*46], m5 ;t43 mova [rsp+gprsize*2+16*55], m4 ;t52 mova [rsp+gprsize*2+16*58], m1 ;t55a mova m0, [rsp+gprsize*2+16*44] ;t41a mova m2, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*57] ;t54a psubsw m4, m0, m2 ;t42 paddsw m0, m2 ;t41 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t54 ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a mova [rsp+gprsize*2+16*44], m0 ;t41 mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*56], m4 ;t53a mova [rsp+gprsize*2+16*57], m1 ;t54 mova m0, [rsp+gprsize*2+16*41] ;t38a mova m2, [rsp+gprsize*2+16*40] ;t37a mova m3, [rsp+gprsize*2+16*61] ;t58a mova m1, [rsp+gprsize*2+16*60] ;t57a psubsw m4, m0, m2 ;t37 paddsw m0, m2 ;t38 psubsw m5, m1, m3 ;t58 paddsw m1, m3 ;t57 ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a mova [rsp+gprsize*2+16*41], m0 ;t38 mova [rsp+gprsize*2+16*40], m5 ;t37a mova [rsp+gprsize*2+16*61], m4 ;t58a mova [rsp+gprsize*2+16*60], m1 ;t57 mova m0, [rsp+gprsize*2+16*42] ;t39 mova m2, [rsp+gprsize*2+16*39] ;t36 mova m3, [rsp+gprsize*2+16*62] ;t59 mova m1, [rsp+gprsize*2+16*59] ;t56 psubsw m4, m0, m2 ;t36a paddsw m0, m2 ;t39a psubsw m5, m1, m3 ;t59a paddsw m1, m3 ;t56a ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 mova 
[rsp+gprsize*2+16*42], m0 ;t39a mova [rsp+gprsize*2+16*39], m5 ;t36 mova [rsp+gprsize*2+16*62], m4 ;t59 mova [rsp+gprsize*2+16*59], m1 ;t56a mova m0, [rsp+gprsize*2+16*35] ;t32 mova m2, [rsp+gprsize*2+16*38] ;t35 mova m3, [rsp+gprsize*2+16*63] ;t60 mova m1, [rsp+gprsize*2+16*66] ;t63 psubsw m4, m0, m2 ;t35a paddsw m0, m2 ;t32a psubsw m5, m1, m3 ;t60a paddsw m1, m3 ;t63a ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 mova [rsp+gprsize*2+16*35], m0 ;t32a mova [rsp+gprsize*2+16*38], m5 ;t35 mova [rsp+gprsize*2+16*63], m4 ;t60 mova [rsp+gprsize*2+16*66], m1 ;t63a mova m0, [rsp+gprsize*2+16*36] ;t33a mova m2, [rsp+gprsize*2+16*37] ;t34a mova m3, [rsp+gprsize*2+16*64] ;t61a mova m1, [rsp+gprsize*2+16*65] ;t62a psubsw m4, m0, m2 ;t34 paddsw m0, m2 ;t33 psubsw m5, m1, m3 ;t61 paddsw m1, m3 ;t62 ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a mova m2, [rsp+gprsize*2+16*41] ;t38 mova m3, [rsp+gprsize*2+16*60] ;t57 psubsw m6, m0, m2 ;t38a paddsw m0, m2 ;t33a psubsw m2, m1, m3 ;t57a paddsw m1, m3 ;t62a mova [rsp+gprsize*2+16*36], m0 ;t33a mova [rsp+gprsize*2+16*65], m1 ;t62a ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 mova [rsp+gprsize*2+16*41], m2 ;t38 mova [rsp+gprsize*2+16*60], m6 ;t57 mova m2, [rsp+gprsize*2+16*40] ;t37 mova m3, [rsp+gprsize*2+16*61] ;t58 psubsw m0, m5, m2 ;t37 paddsw m5, m2 ;t34 psubsw m1, m4, m3 ;t58 paddsw m4, m3 ;t61 ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a mova [rsp+gprsize*2+16*37], m5 ;t34 mova [rsp+gprsize*2+16*64], m4 ;t61 mova [rsp+gprsize*2+16*40], m1 ;t37a mova [rsp+gprsize*2+16*61], m0 ;t58a mova m0, [rsp+gprsize*2+16*38] ;t35 mova m2, [rsp+gprsize*2+16*39] ;t36 mova m3, [rsp+gprsize*2+16*62] ;t59 mova m1, [rsp+gprsize*2+16*63] ;t60 psubsw m4, m0, m2 ;t36a paddsw m0, m2 ;t35a psubsw m5, m1, m3 ;t59a paddsw m1, m3 ;t60a ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 mova [rsp+gprsize*2+16*38], m0 ;t35a mova [rsp+gprsize*2+16*39], m5 ;t36 mova [rsp+gprsize*2+16*62], m4 ;t59 mova [rsp+gprsize*2+16*63], m1 ;t60a mova m0, 
[rsp+gprsize*2+16*35] ;t32a mova m2, [rsp+gprsize*2+16*42] ;t39a mova m3, [rsp+gprsize*2+16*59] ;t56a mova m1, [rsp+gprsize*2+16*66] ;t63a psubsw m4, m0, m2 ;t39 paddsw m0, m2 ;t32 psubsw m5, m1, m3 ;t56 paddsw m1, m3 ;t63 ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*42], m5 ;t39a mova [rsp+gprsize*2+16*59], m4 ;t56a mova [rsp+gprsize*2+16*66], m1 ;t63 mova m0, [rsp+gprsize*2+16*50] ;t47a mova m2, [rsp+gprsize*2+16*43] ;t40a mova m3, [rsp+gprsize*2+16*58] ;t55a mova m1, [rsp+gprsize*2+16*51] ;t48a psubsw m4, m0, m2 ;t40 paddsw m0, m2 ;t47 psubsw m5, m1, m3 ;t55 paddsw m1, m3 ;t48 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a mova [rsp+gprsize*2+16*50], m0 ;t47 mova [rsp+gprsize*2+16*43], m5 ;t40a mova [rsp+gprsize*2+16*58], m4 ;t55a mova [rsp+gprsize*2+16*51], m1 ;t48 mova m0, [rsp+gprsize*2+16*49] ;t46 mova m2, [rsp+gprsize*2+16*44] ;t41 mova m3, [rsp+gprsize*2+16*57] ;t54 mova m1, [rsp+gprsize*2+16*52] ;t49 psubsw m4, m0, m2 ;t41a paddsw m0, m2 ;t46a psubsw m5, m1, m3 ;t54a paddsw m1, m3 ;t49a ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 mova [rsp+gprsize*2+16*49], m0 ;t46a mova [rsp+gprsize*2+16*44], m5 ;t41 mova [rsp+gprsize*2+16*57], m4 ;t54 mova [rsp+gprsize*2+16*52], m1 ;t49a mova m0, [rsp+gprsize*2+16*48] ;t45a mova m2, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*53] ;t50a psubsw m4, m0, m2 ;t42 paddsw m0, m2 ;t45 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t50 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a mova [rsp+gprsize*2+16*48], m0 ;t45 mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*56], m4 ;t53a mova [rsp+gprsize*2+16*53], m1 ;t50 mova m0, [rsp+gprsize*2+16*47] ;t44 mova m2, [rsp+gprsize*2+16*46] ;t43 mova m3, [rsp+gprsize*2+16*55] ;t52 mova m1, [rsp+gprsize*2+16*54] ;t51 psubsw m4, m0, m2 ;t43a paddsw m0, m2 ;t44a psubsw m5, m1, m3 ;t52a paddsw m1, m3 ;t51a ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 mova 
m2, [rsp+gprsize*2+16*38] ;t35a mova m3, [rsp+gprsize*2+16*31] ;tmp[28] psubsw m6, m2, m0 ;t44 paddsw m2, m0 ;t35 psubsw m0, m3, m2 ;out35 paddsw m2, m3 ;out28 mova m3, [rsp+gprsize*2+16*63] ;t60a mova [rsp+gprsize*2+16*38], m0 ;out35 mova [rsp+gprsize*2+16*31], m2 ;out28 psubsw m0, m3, m1 ;t51 paddsw m3, m1 ;t60 ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] psubsw m1, m2, m3 ;out60 paddsw m2, m3 ;out3 mova m3, [rsp+gprsize*2+16*22] ;tmp[19] mova [rsp+gprsize*2+16*63], m1 ;out60 mova [rsp+gprsize*2+16*6 ], m2 ;out3 psubsw m1, m3, m0 ;out44 paddsw m3, m0 ;out19 mova m2, [rsp+gprsize*2+16*15] ;tmp[12] mova m0, [rsp+gprsize*2+16*39] ;t36 mova [rsp+gprsize*2+16*47], m1 ;out44 mova [rsp+gprsize*2+16*22], m3 ;out19 mova m1, [rsp+gprsize*2+16*62] ;t59 psubsw m3, m2, m6 ;out51 paddsw m2, m6 ;out12 mova [rsp+gprsize*2+16*54], m3 ;out51 mova [rsp+gprsize*2+16*15], m2 ;out12 psubsw m2, m0, m5 ;t43a paddsw m0, m5 ;t36a mova m5, [rsp+gprsize*2+16*30] ;tmp[27] psubsw m3, m1, m4 ;t52a paddsw m1, m4 ;t59a ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] psubsw m6, m5, m0 ;out36 paddsw m5, m0 ;out27 psubsw m0, m4, m1 ;out59 paddsw m4, m1 ;out4 mova [rsp+gprsize*2+16*39], m6 ;out36 mova [rsp+gprsize*2+16*30], m5 ;out27 mova [rsp+gprsize*2+16*62], m0 ;out59 mova [rsp+gprsize*2+16*7 ], m4 ;out4 mova m0, [rsp+gprsize*2+16*23] ;tmp[20] mova m5, [rsp+gprsize*2+16*14] ;tmp[11] psubsw m4, m0, m3 ;out43 paddsw m0, m3 ;out20 psubsw m6, m5, m2 ;out52 paddsw m5, m2 ;out11 mova [rsp+gprsize*2+16*46], m4 ;out43 mova [rsp+gprsize*2+16*23], m0 ;out20 mova [rsp+gprsize*2+16*55], m6 ;out52 mova [rsp+gprsize*2+16*14], m5 ;out11 mova m0, [rsp+gprsize*2+16*40] ;t37a mova m5, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*61] ;t58a mova m2, [rsp+gprsize*2+16*29] ;tmp[26] psubsw m4, m0, m5 ;t42 paddsw m0, m5 ;t37 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t58 ITX_MULSUB_2W 5, 4, 
3, 6, 7, 2896, 2896 ;t43, t52 mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] psubsw m6, m2, m0 ;out37 paddsw m2, m0 ;out26 psubsw m0, m3, m1 ;out58 paddsw m3, m1 ;out5 mova [rsp+gprsize*2+16*40], m6 ;out37 mova [rsp+gprsize*2+16*29], m2 ;out26 mova [rsp+gprsize*2+16*61], m0 ;out58 mova [rsp+gprsize*2+16*8 ], m3 ;out5 mova m0, [rsp+gprsize*2+16*24] ;tmp[21] mova m1, [rsp+gprsize*2+16*13] ;tmp[10] psubsw m2, m0, m5 ;out42 paddsw m0, m5 ;out21 psubsw m3, m1, m4 ;out53 paddsw m1, m4 ;out10 mova [rsp+gprsize*2+16*45], m2 ;out42 mova [rsp+gprsize*2+16*24], m0 ;out21 mova [rsp+gprsize*2+16*56], m3 ;out53 mova [rsp+gprsize*2+16*13], m1 ;out10 mova m0, [rsp+gprsize*2+16*41] ;t38 mova m5, [rsp+gprsize*2+16*44] ;t41 mova m3, [rsp+gprsize*2+16*57] ;t54 mova m1, [rsp+gprsize*2+16*60] ;t57 mova m2, [rsp+gprsize*2+16*28] ;tmp[25] psubsw m4, m0, m5 ;t41a paddsw m0, m5 ;t38a psubsw m5, m1, m3 ;t54a paddsw m1, m3 ;t57a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] psubsw m6, m2, m0 ;out38 paddsw m2, m0 ;out25 psubsw m0, m3, m1 ;out57 paddsw m3, m1 ;out6 mova [rsp+gprsize*2+16*41], m6 ;out38 mova [rsp+gprsize*2+16*28], m2 ;out25 mova [rsp+gprsize*2+16*60], m0 ;out57 mova [rsp+gprsize*2+16*9 ], m3 ;out6 mova m0, [rsp+gprsize*2+16*25] ;tmp[22] mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] psubsw m2, m0, m5 ;out41 paddsw m0, m5 ;out22 psubsw m3, m1, m4 ;out54 paddsw m1, m4 ;out9 mova [rsp+gprsize*2+16*44], m2 ;out41 mova [rsp+gprsize*2+16*25], m0 ;out22 mova [rsp+gprsize*2+16*57], m3 ;out54 mova [rsp+gprsize*2+16*12], m1 ;out9 mova m0, [rsp+gprsize*2+16*42] ;t39a mova m5, [rsp+gprsize*2+16*43] ;t40a mova m3, [rsp+gprsize*2+16*58] ;t55a mova m1, [rsp+gprsize*2+16*59] ;t56a mova m2, [rsp+gprsize*2+16*27] ;tmp[24] psubsw m4, m0, m5 ;t40 paddsw m0, m5 ;t39 psubsw m5, m1, m3 ;t55 paddsw m1, m3 ;t56 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] psubsw m6, m2, m0 ;out39 paddsw m2, m0 ;out24 psubsw m0, m3, m1 ;out56 
paddsw m3, m1 ;out7 mova [rsp+gprsize*2+16*42], m6 ;out39 mova [rsp+gprsize*2+16*27], m2 ;out24 mova [rsp+gprsize*2+16*59], m0 ;out56 mova [rsp+gprsize*2+16*10], m3 ;out7 mova m0, [rsp+gprsize*2+16*26] ;tmp[23] mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] psubsw m2, m0, m5 ;out40 paddsw m0, m5 ;out23 psubsw m3, m1, m4 ;out55 paddsw m1, m4 ;out8 mova [rsp+gprsize*2+16*43], m2 ;out40 mova [rsp+gprsize*2+16*26], m0 ;out23 mova [rsp+gprsize*2+16*58], m3 ;out55 mova [rsp+gprsize*2+16*11], m1 ;out8 mova m0, [rsp+gprsize*2+16*37] ;t34 mova m5, [rsp+gprsize*2+16*48] ;t45 mova m3, [rsp+gprsize*2+16*53] ;t50 mova m1, [rsp+gprsize*2+16*64] ;t61 mova m2, [rsp+gprsize*2+16*32] ;tmp[29] psubsw m4, m0, m5 ;t45a paddsw m0, m5 ;t34a psubsw m5, m1, m3 ;t50a paddsw m1, m3 ;t61a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] psubsw m6, m2, m0 ;out34 paddsw m2, m0 ;out29 psubsw m0, m3, m1 ;out61 paddsw m3, m1 ;out2 mova [rsp+gprsize*2+16*37], m6 ;out34 mova [rsp+gprsize*2+16*32], m2 ;out29 mova [rsp+gprsize*2+16*64], m0 ;out61 mova [rsp+gprsize*2+16*5 ], m3 ;out2 mova m0, [rsp+gprsize*2+16*21] ;tmp[18] mova m1, [rsp+gprsize*2+16*16] ;tmp[13] psubsw m2, m0, m5 ;out45 paddsw m0, m5 ;out18 psubsw m3, m1, m4 ;out50 paddsw m1, m4 ;out13 mova [rsp+gprsize*2+16*48], m2 ;out45 mova [rsp+gprsize*2+16*21], m0 ;out18 mova [rsp+gprsize*2+16*53], m3 ;out50 mova [rsp+gprsize*2+16*16], m1 ;out13 mova m0, [rsp+gprsize*2+16*36] ;t33a mova m5, [rsp+gprsize*2+16*49] ;t46a mova m3, [rsp+gprsize*2+16*52] ;t49a mova m1, [rsp+gprsize*2+16*65] ;t62a mova m2, [rsp+gprsize*2+16*33] ;tmp[30] psubsw m4, m0, m5 ;t46 paddsw m0, m5 ;t33 psubsw m5, m1, m3 ;t49 paddsw m1, m3 ;t62 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] psubsw m6, m2, m0 ;out33 paddsw m2, m0 ;out30 psubsw m0, m3, m1 ;out62 paddsw m3, m1 ;out1 mova [rsp+gprsize*2+16*36], m6 ;out33 mova [rsp+gprsize*2+16*33], m2 ;out30 mova [rsp+gprsize*2+16*65], m0 ;out62 mova 
[rsp+gprsize*2+16*4 ], m3 ;out1 mova m0, [rsp+gprsize*2+16*20] ;tmp[17] mova m1, [rsp+gprsize*2+16*17] ;tmp[14] psubsw m2, m0, m5 ;out46 paddsw m0, m5 ;out17 psubsw m3, m1, m4 ;out49 paddsw m1, m4 ;out14 mova [rsp+gprsize*2+16*49], m2 ;out46 mova [rsp+gprsize*2+16*20], m0 ;out17 mova [rsp+gprsize*2+16*52], m3 ;out49 mova [rsp+gprsize*2+16*17], m1 ;out14 mova m0, [rsp+gprsize*2+16*35] ;t32 mova m5, [rsp+gprsize*2+16*50] ;t47 mova m3, [rsp+gprsize*2+16*51] ;t48 mova m1, [rsp+gprsize*2+16*66] ;t63 mova m2, [rsp+gprsize*2+16*34] ;tmp[31] psubsw m4, m0, m5 ;t47a paddsw m0, m5 ;t32a psubsw m5, m1, m3 ;t48a paddsw m1, m3 ;t63a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] psubsw m6, m2, m0 ;out32 paddsw m2, m0 ;out31 psubsw m0, m3, m1 ;out63 paddsw m3, m1 ;out0 mova [rsp+gprsize*2+16*35], m6 ;out32 mova [rsp+gprsize*2+16*34], m2 ;out31 mova [rsp+gprsize*2+16*66], m0 ;out63 mova [rsp+gprsize*2+16*3 ], m3 ;out0 mova m0, [rsp+gprsize*2+16*19] ;tmp[16] mova m1, [rsp+gprsize*2+16*18] ;tmp[15] psubsw m2, m0, m5 ;out47 paddsw m0, m5 ;out16 psubsw m3, m1, m4 ;out48 paddsw m1, m4 ;out15 mova [rsp+gprsize*2+16*50], m2 ;out47 mova [rsp+gprsize*2+16*19], m0 ;out16 mova [rsp+gprsize*2+16*51], m3 ;out48 mova [rsp+gprsize*2+16*18], m1 ;out15 ret cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_64x16_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m7, m7 .loop: mova m1, [dstq+16*0] mova m3, [dstq+16*1] mova m5, [dstq+16*2] mova m6, [dstq+16*3] punpckhbw m2, m1, m7 punpcklbw m1, m7 punpckhbw m4, m3, m7 punpcklbw m3, m7 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 
packuswb m3, m4
    ; ---- inv_txfm_add_dct_dct_64x16_8bpc: second half of the .body .loop ----
    ; Before .loop, m0 was filled with one 16-bit value replicated across all
    ; lanes and m7 was zeroed. m1 and m3 (pixels 0-31 of the row) are already
    ; packed; m5 and m6 (pixels 32-63) get the same treatment here: widen the
    ; bytes to words against the zero register, add m0, pack back to bytes
    ; with unsigned saturation.
    punpckhbw m2, m5, m7
    punpcklbw m5, m7
    punpckhbw m4, m6, m7
    punpcklbw m6, m7
    paddw m2, m0
    paddw m5, m0
    paddw m4, m0
    paddw m6, m0
    packuswb m5, m2
    packuswb m6, m4
    ; Write the 64 updated pixels back and step to the next row.
    mova [dstq+16*0], m1
    mova [dstq+16*1], m3
    mova [dstq+16*2], m5
    mova [dstq+16*3], m6
    add dstq, strideq
    ; r3d is the number of rows left; it is set by whoever jumps to .body.
    dec r3d
    jg .loop
    ; Leave through the continuation address the caller placed in tx2q.
    jmp tx2q
.end:
    RET

; NOTE(review): the rest of this physical line is left unwrapped because it
; carries inline ';' comments; moving a line break past one would change
; which text the assembler treats as comment.
%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 %if %3 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [%1+%2*0] pmulhrsw m1, m3, [%1+%2*1] pmulhrsw m2, m3, [%1+%2*2] pmulhrsw m3, [%1+%2*3] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] %endif %endmacro %macro LOAD_4ROWS_H 2 ;src, stride mova m4, [%1+%2*0] mova m5, [%1+%2*1] mova m6, [%1+%2*2] mova m7, [%1+%2*3] %endmacro cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 mov [rsp+gprsize*2+16*67], dstq lea dstq, [rsp+gprsize+16*68] .pass1_loop: LOAD_4ROWS coeffq+32*0, 32*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+32*4, 32*8 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+32*2, 32*4 mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+32*1, 32*2 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+32*17, 32*2 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova
[rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+32*24, 32 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+32*24, 32 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov dstq, [rsp+gprsize*2+16*67] sub coeffq, 32 mov r3d, 4 .pass2_loop: mov [rsp+gprsize*1+16*67], r3d LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS 
coeffq+16*2, 32*2
    ; ---- idct_64x16_internal_8bpc: remainder of .pass2_loop ----
    ; The operands above complete the LOAD_4ROWS that ends the previous line.
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call m(idct_16x8_internal_8bpc).main
    ; Keep the current dst in r3, point dst eight rows further down and hand
    ; over to idct_8x8's .end with .end (below) as the continuation in tx2q.
    ; NOTE(review): idct_8x8's .end is outside this view; presumably it adds
    ; the rows in m0-m7 to dst and then jumps to tx2q - confirm.
    mov r3, dstq
    lea tx2q, [o(.end)]
    lea dstq, [dstq+strideq*8]
    jmp m(idct_8x8_internal_8bpc).end
.end:
    ; Reload eight rows from rsp+gprsize+16*3, spill m7 to the slot at 16*0,
    ; restore dst from r3 and run the same .end step again, this time
    ; continuing at .end1.
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.end1)]
    mov dstq, r3
    jmp m(idct_8x8_internal_8bpc).end
.end1:
    ; Clear the 16 consecutive 16-byte coefficient vectors this iteration
    ; read and step coeffq past them.
    pxor m7, m7
    REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    add coeffq, 16*16
    ; Reload the loop counter (slot 1) and the saved dst (slot 2); the next
    ; iteration starts 8 bytes further along the row.
    mov r3d, [rsp+gprsize*1+16*67]
    mov dstq, [rsp+gprsize*2+16*67]
    add dstq, 8
    mov [rsp+gprsize*2+16*67], dstq
    dec r3d
    jg .pass2_loop
    ; Second group of four iterations: same steps, but the input is now read
    ; from the stack area at rsp+gprsize+16*68 (pass 1 stored part of its
    ; output there through dstq) instead of the coefficient buffer.
    mov r3d, 4
    lea coeffq, [rsp+gprsize+16*68]
.pass2_loop2:
    mov [rsp+gprsize*1+16*67], r3d
    LOAD_4ROWS coeffq+16*0, 32*2
    LOAD_4ROWS_H coeffq+16*1, 32*2
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    LOAD_4ROWS coeffq+16*2, 32*2
    LOAD_4ROWS_H coeffq+16*3, 32*2
    call m(idct_16x8_internal_8bpc).main
    mov r3, dstq
    lea tx2q, [o(.end2)]
    lea dstq, [dstq+strideq*8]
    jmp m(idct_8x8_internal_8bpc).end
.end2:
    LOAD_8ROWS rsp+gprsize+16*3, 16
    mova [rsp+gprsize+16*0], m7
    lea tx2q, [o(.end3)]
    mov dstq, r3
    jmp m(idct_8x8_internal_8bpc).end
.end3:
    ; Unlike .end1, nothing is cleared here: the source was stack scratch.
    add coeffq, 16*16
    mov r3d, [rsp+gprsize*1+16*67]
    mov dstq, [rsp+gprsize*2+16*67]
    add dstq, 8
    mov [rsp+gprsize*2+16*67], dstq
    dec r3d
    jg .pass2_loop2
    ret

; ---- inv_txfm_add_dct_dct_32x64_8bpc ----
; Entry point taking dst, stride, coeff and eob; tx2 names a fifth register
; that is used below to carry a continuation address.
; NOTE(review): the numeric cglobal fields are presumably x86inc's
; (args, gprs, xmm regs, stack bytes), i.e. 16*68 bytes of stack - confirm.
cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA r5, $$
%endif
    ; eob == 0 takes the .dconly shortcut; otherwise the worker does the job.
    test eobd, eobd
    jz .dconly
    call m(idct_32x64_internal_8bpc)
.end:
    RET
.dconly:
    ; m0 = coeff[0] multiplied (pmulhrsw) by pw_2896x8, and by it once more
    ; two instructions later; m2 is loaded from pw_16384 for the shared code.
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_16384)]
    ; eobd is zero on this path, so this store clears the consumed coefficient.
    mov [coeffq], eobd
    pmulhrsw m0, m1
    ; NOTE(review): r3d = 64 is presumably the row count for the 32x8 .body
    ; code, which is outside this view; tx2q = .end is its way back to RET.
    mov r3d, 64
    lea tx2q, [o(.end)]
    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body

; ---- idct_32x64_internal_8bpc ----
; Worker reached by `call` from the entry point above.
cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; r3d = pass-1 iteration count: 2 when eob - 136 is negative, else 4.
    ; eob - 136 is kept in slot 1 and re-read at the top of every iteration.
    mov r4d, 2
    sub eobd, 136
    mov [rsp+gprsize*1+16*67], eobd
    mov r3d, 4
    cmovs r3d, r4d
%if ARCH_X86_32
    LEA r5, $$
%endif
    ; Slot 2 remembers the start of the coefficient buffer for pass 2.
    mov [rsp+gprsize*2+16*67], coeffq
.pass1_loop:
    ; Base coeffq+64*1 with stride 64*2: the odd 64-byte rows.
    ; NOTE(review): LOAD_8ROWS is defined outside this view; its third
    ; argument presumably requests the same pw_2896x8 scaling that
    ; LOAD_4ROWS applies when its is_rect2 argument is set - confirm.
    LOAD_8ROWS coeffq+64*1, 64*2, 1
    ; NOTE(review): the rest of this physical line is left unwrapped because
    ; it carries inline ';' comments; moving a line break past one would
    ; change which text the assembler treats as comment.
    mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova
[rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov tx2d, [rsp+gprsize*1+16*67] test tx2d, tx2d jl .fast .full: LOAD_8ROWS coeffq+64*0, 64*4, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4, 1 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*17, 64*2, 1 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: LOAD_4ROWS coeffq, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+128*1, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 add coeffq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*2+16*67] mov r3d, 4 lea r4, [dstq+8] mov 
[rsp+gprsize*2+16*67], r4
    ; ---- idct_32x64_internal_8bpc: end of .pass2 ----
    ; The operands above complete the `mov` that ends the previous line.
    ; Pass 2 is shared with idct_16x64_internal: r4 is given the address of
    ; its .end1 label and control is transferred into its .pass2_loop.
    lea r4, [o(m(idct_16x64_internal_8bpc).end1)]
    jmp m(idct_16x64_internal_8bpc).pass2_loop

; ---- inv_txfm_add_dct_dct_64x32_8bpc ----
; Entry point taking dst, stride, coeff and eob; tx2 names a fifth register
; that is used below to carry a continuation address.
; NOTE(review): the numeric cglobal fields are presumably x86inc's
; (args, gprs, xmm regs, stack bytes), i.e. 16*197 bytes of stack - confirm.
cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
    LEA r5, $$
%endif
    ; eob == 0 takes the .dconly shortcut; otherwise the worker does the job.
    test eobd, eobd
    jz .dconly
    call m(idct_64x32_internal_8bpc)
.end:
    RET
.dconly:
    ; m0 = coeff[0] multiplied (pmulhrsw) by pw_2896x8 twice; m2 is loaded
    ; from pw_16384 for the shared .body code.
    movd m1, [o(pw_2896x8)]
    pmulhrsw m0, m1, [coeffq]
    movd m2, [o(pw_16384)]
    pmulhrsw m0, m1
    ; eobd is zero on this path, so this store clears the consumed coefficient.
    mov [coeffq], eobd
    ; 32 rows for the 64-pixel-wide .body loop of the 64x16 function, which
    ; counts rows in r3d and leaves through tx2q (here: .end above).
    mov r3d, 32
    lea tx2q, [o(.end)]
    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body

; ---- idct_64x32_internal_8bpc ----
; Worker reached by `call` from the entry point above.
cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
    ; r3d = pass-1 iteration count: 2 when eob - 136 is negative, else 4.
    ; eob - 136 is kept in slot 1.
    mov r4d, 2
    sub eobd, 136
    mov [rsp+gprsize*1+16*67], eobd
    mov r3d, 4
    cmovs r3d, r4d
%if ARCH_X86_32
    LEA r5, $$
%endif
    ; Slot 2 = coefficient buffer, slot 3 = real destination. During pass 1
    ; dstq is repointed at stack scratch (rsp+gprsize+16*69), whose address
    ; is also remembered in slot 4.
    mov [rsp+gprsize*2+16*67], coeffq
    mov [rsp+gprsize*3+16*67], dstq
    lea dstq, [rsp+gprsize+16*69]
    mov [rsp+gprsize*4+16*67], dstq
.pass1_loop:
    ; 64-byte rows 0, 8, 16, 24 go into m0-m3, scaled by pw_2896x8 because
    ; the is_rect2 argument is 1; m4-m7 are zeroed.
    LOAD_4ROWS coeffq+64*0, 64*8, 1
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_8bpc).main
    SAVE_7ROWS rsp+gprsize+16*3, 16
    ; Rows 4, 12, 20, 28, again with m4-m7 zeroed.
    pxor m4, m4
    LOAD_4ROWS coeffq+64*4, 64*8, 1
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_16x8_internal_8bpc).main
    mova m7, [rsp+gprsize+16*0]
    SAVE_8ROWS rsp+gprsize+16*11, 16
    ; Base coeffq+64*2 with stride 64*4, i.e. rows 2, 6, ..., 30; the eight
    ; registers are parked in stack slots 19-26 in the order listed below
    ; before idct_8x32's .main_fast is called.
    ; NOTE(review): LOAD_8ROWS is defined outside this view; its third
    ; argument presumably requests the same scaling as is_rect2 - confirm.
    LOAD_8ROWS coeffq+64*2, 64*4, 1
    mova [rsp+gprsize+16*19], m0
    mova [rsp+gprsize+16*26], m1
    mova [rsp+gprsize+16*23], m2
    mova [rsp+gprsize+16*22], m3
    mova [rsp+gprsize+16*21], m4
    mova [rsp+gprsize+16*24], m5
    mova [rsp+gprsize+16*25], m6
    mova [rsp+gprsize+16*20], m7
    call m(idct_8x32_internal_8bpc).main_fast
    SAVE_8ROWS rsp+gprsize+16*3, 16
    ; Base coeffq+64*1 with stride 64*2: the odd rows 1-15.
    LOAD_8ROWS coeffq+64*1, 64*2, 1
    ; NOTE(review): the rest of this physical line is left unwrapped because
    ; it carries inline ';' comments; moving a line break past one would
    ; change which text the assembler treats as comment.
    mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+64*17, 64*2, 1 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25
mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: SAVE_8ROWS dstq+64*24, 64 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*4+16*67] mov dstq, [rsp+gprsize*3+16*67] mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd lea tx2q, [o(.pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop .pass2_end: mova [rsp+gprsize+16*0], m7 lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d 
jg m(idct_32x32_internal_8bpc).pass2_loop .pass2_end2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_64x64_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 64 lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r5d, 4 mov r4d, 2 sub eobd, 136 cmovns r4d, r5d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*1+16*67], eobd mov r3d, r4d mov [rsp+gprsize*4+16*67], coeffq mov [rsp+gprsize*3+16*67], dstq lea dstq, [rsp+gprsize+16*69] mov [rsp+gprsize*2+16*67], dstq .pass1_loop: LOAD_4ROWS coeffq+64*0, 64*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*2, 64*4 mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+64*17, 64*2 mova [rsp+gprsize+16*63], m0 ;in17 mova 
[rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+64*24, 64 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] lea dstq, [dstq+32] mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], 
r4 lea r4, [o(.pass2_end)] jmp m(idct_16x64_internal_8bpc).pass2_loop .pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] lea r3, [rsp+16*32+gprsize] mova [rsp+gprsize+16*0], m7 call m(idct_16x64_internal_8bpc).write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.pass2_end)] dec r3d jg m(idct_16x64_internal_8bpc).pass2_loop .pass2_end2: mov coeffq, [rsp+gprsize*4+16*67] mov dstq, [rsp+gprsize*2+16*67] mov r3d, 4 sub dstq, 72 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(m(idct_16x64_internal_8bpc).end1)] jmp m(idct_16x64_internal_8bpc).pass2_loop dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/loopfilter16_avx2.asm000066400000000000000000001036561517466257200252630ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 times 4 db 0, 1 times 4 db 8, 9 pw_1: times 16 dw 1 pw_2: times 16 dw 2 pw_3: times 16 dw 3 pw_4096: times 2 dw 4096 ; 10bpc/12bpc: pw_4: times 2 dw 4 times 2 dw 16 clip_max: times 2 dw 511 times 2 dw 2047 clip_min: times 2 dw -512 times 2 dw -2048 SECTION .text ; in: out: ; mm%1 a b c d a e i m ; mm%2 e f g h b f j n ; mm%3 i j k l -> c g k o ; mm%4 m n o p d h l p %macro TRANSPOSE4X4W 5 punpcklwd m%5, m%1, m%2 punpckhwd m%1, m%2 punpcklwd m%2, m%3, m%4 punpckhwd m%3, m%4 punpckldq m%4, m%5, m%2 punpckhdq m%5, m%2 punpckldq m%2, m%1, m%3 punpckhdq m%1, m%3 SWAP %1, %4 SWAP %2, %5, %3 %endmacro ; in: out: ; xmm%1 a b c d e f g h a i q y 6 E M U ; xmm%2 i j k l m n o p b j r z 7 F N V ; xmm%3 q r s t u v w x c k s 0 8 G O W ; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X ; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y ; xmm%6 E F G H I J K L f n v 3 B J R Z ; xmm%7 M N O P Q R S T g o w 4 C K S + ; xmm%8 U V W X Y Z + = h p x 5 D L T = %macro TRANSPOSE8X8W 9 ; xmm%1 a b c d e f g h a i q y b j r z ; xmm%2 i j k l m n o p c k s 0 d l t 1 ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 TRANSPOSE4X4W %1, %2, %3, %4, %9 ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V ; xmm%6 E F G H I J K L 8 G O W 9 H P X ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z ; xmm%8 U V W X Y Z + 
= C K S + D L T = TRANSPOSE4X4W %5, %6, %7, %8, %9 ; xmm%1 a i q y b j r z a i q y 6 E M U ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z ; xmm%7 A I Q Y B J R Z g o w 4 C K S + ; xmm%8 C K S + D L T = h p x 5 D L T = punpckhqdq m%9, m%1, m%5 punpcklqdq m%1, m%5 punpckhqdq m%5, m%2, m%6 punpcklqdq m%2, m%6 punpckhqdq m%6, m%3, m%7 punpcklqdq m%3, m%7 punpckhqdq m%7, m%4, m%8 punpcklqdq m%4, m%8 SWAP %8, %7, %4, %5, %3, %2, %9 %endmacro ; transpose and write m3-6, everything else is scratch %macro TRANSPOSE_8x4_AND_WRITE_4x16 0 ; transpose 8x4 punpcklwd m0, m3, m4 punpckhwd m3, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckldq m6, m0, m4 punpckhdq m0, m4 punpckldq m4, m3, m5 punpckhdq m3, m5 ; write out movq [dstq+strideq*0-4], xm6 movhps [dstq+strideq*1-4], xm6 movq [dstq+strideq*2-4], xm0 movhps [dstq+stride3q -4], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm4 movhps [dstq+strideq*1-4], xm4 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] vextracti128 xm6, m6, 1 vextracti128 xm0, m0, 1 vextracti128 xm4, m4, 1 vextracti128 xm3, m3, 1 movq [dstq+strideq*0-4], xm6 movhps [dstq+strideq*1-4], xm6 movq [dstq+strideq*2-4], xm0 movhps [dstq+stride3q -4], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm4 movhps [dstq+strideq*1-4], xm4 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %if %1 == 4 lea tmpq, [dstq+mstrideq*2] mova m3, [tmpq+strideq*0] ; p1 mova m4, [tmpq+strideq*1] ; p0 mova m5, [tmpq+strideq*2] ; q0 mova m6, [tmpq+stride3q] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] ; we load p3 later mova m13, [tmpq+strideq*1] mova m3, [tmpq+strideq*2] mova m4, [tmpq+stride3q] mova m5, 
[dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m15, [dstq+stride3q] %endif %endif %else ; load lines %if %1 == 4 movq xm3, [dstq+strideq*0-4] movq xm4, [dstq+strideq*1-4] movq xm5, [dstq+strideq*2-4] movq xm6, [dstq+stride3q -4] lea tmpq, [dstq+strideq*4] movq xm11, [tmpq+strideq*0-4] movq xm13, [tmpq+strideq*1-4] movq xm14, [tmpq+strideq*2-4] movq xm15, [tmpq+stride3q -4] lea tmpq, [tmpq+strideq*4] ; this overreads by 8 bytes but the buffers are padded ; so that should be ok vinserti128 m3, [tmpq+strideq*0-4], 1 vinserti128 m4, [tmpq+strideq*1-4], 1 vinserti128 m5, [tmpq+strideq*2-4], 1 vinserti128 m6, [tmpq+stride3q -4], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m11, [tmpq+strideq*0-4], 1 vinserti128 m13, [tmpq+strideq*1-4], 1 vinserti128 m14, [tmpq+strideq*2-4], 1 vinserti128 m15, [tmpq+stride3q -4], 1 ; transpose 4x8 ; xm3: A-D0,A-D4 ; xm4: A-D1,A-D5 ; xm5: A-D2,A-D6 ; xm6: A-D3,A-D7 punpcklwd m7, m3, m4 punpcklwd m3, m11, m13 punpcklwd m4, m5, m6 punpcklwd m5, m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1 ; xm3: A4-5,B4-5,C4-5,D4-5 ; xm4: A2-3,B2-3,C2-3,D2-3 ; xm5: A6-7,B6-7,C6-7,D6-7 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m8, m3, m5 punpckhdq m5, m3, m5 ; xm6: A0-3,B0-3 ; xm7: C0-3,D0-3 ; xm8: A4-7,B4-7 ; xm5: C4-7,D4-7 punpcklqdq m3, m6, m8 punpckhqdq m4, m6, m8 punpckhqdq m6, m7, m5 punpcklqdq m5, m7, m5 ; xm3: A0-7 ; xm4: B0-7 ; xm5: C0-7 ; xm6: D0-7 %elif %1 == 6 || %1 == 8 movu xm3, [dstq+strideq*0-8] movu xm4, [dstq+strideq*1-8] movu xm5, [dstq+strideq*2-8] movu xm6, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu xm11, [tmpq+strideq*0-8] movu xm13, [tmpq+strideq*1-8] movu xm14, [tmpq+strideq*2-8] movu xm15, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] vinserti128 m3, [tmpq+strideq*0-8], 1 vinserti128 m4, [tmpq+strideq*1-8], 1 vinserti128 m5, [tmpq+strideq*2-8], 1 vinserti128 m6, [tmpq+stride3q -8], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m11, [tmpq+strideq*0-8], 1 vinserti128 m13, [tmpq+strideq*1-8], 1 
vinserti128 m14, [tmpq+strideq*2-8], 1 vinserti128 m15, [tmpq+stride3q -8], 1 ; transpose 8x16 ; xm3: A-H0,A-H8 ; xm4: A-H1,A-H9 ; xm5: A-H2,A-H10 ; xm6: A-H3,A-H11 ; xm11: A-H4,A-H12 ; xm13: A-H5,A-H13 ; xm14: A-H6,A-H14 ; xm15: A-H7,A-H15 punpcklwd m7, m3, m4 punpckhwd m3, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpcklwd m6, m11, m13 punpckhwd m11, m13 punpcklwd m13, m14, m15 punpckhwd m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1 ; xm3: E0-1,F0-1,G0-1,H0-1 ; xm4: A2-3,B2-3,C2-3,D2-3 ; xm5: E2-3,F2-3,G2-3,H2-3 ; xm6: A4-5,B4-5,C4-5,D4-5 ; xm11: E4-5,F4-5,G4-5,H4-5 ; xm13: A6-7,B6-7,C6-7,D6-7 ; xm14: E6-7,F6-7,G6-7,H6-7 punpckldq m15, m7, m4 punpckhdq m7, m4 punpckldq m9, m3, m5 punpckhdq m8, m3, m5 punpckldq m3, m6, m13 punpckhdq m6, m13 punpckldq m10, m11, m14 punpckhdq m11, m14 ; xm15: A0-3,B0-3 ; xm7: C0-3,D0-3 ; xm9: E0-3,F0-3 ; xm8: G0-3,H0-3 ; xm3: A4-7,B4-7 ; xm6: C4-7,D4-7 ; xm10: E4-7,F4-7 ; xm11: G4-7,H4-7 %if %1 != 6 punpcklqdq m0, m15, m3 %endif punpckhqdq m13, m15, m3 punpcklqdq m3, m7, m6 punpckhqdq m4, m7, m6 punpcklqdq m5, m9, m10 punpckhqdq m6, m9, m10 punpcklqdq m14, m8, m11 %if %1 != 6 punpckhqdq m15, m8, m11 mova [rsp+5*32], m0 %endif %else ; We only use 14 pixels but we'll need the remainder at the end for ; the second transpose mova xm0, [dstq+strideq*0-16] mova xm1, [dstq+strideq*1-16] mova xm2, [dstq+strideq*2-16] mova xm3, [dstq+stride3q -16] lea tmpq, [dstq+strideq*4] mova xm4, [tmpq+strideq*0-16] mova xm5, [tmpq+strideq*1-16] mova xm6, [tmpq+strideq*2-16] mova xm7, [tmpq+stride3q -16] lea tmpq, [tmpq+strideq*4] vinserti128 m0, m0, [tmpq+strideq*0-16], 1 vinserti128 m1, m1, [tmpq+strideq*1-16], 1 vinserti128 m2, m2, [tmpq+strideq*2-16], 1 vinserti128 m3, m3, [tmpq+stride3q -16], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m4, m4, [tmpq+strideq*0-16], 1 vinserti128 m5, m5, [tmpq+strideq*1-16], 1 vinserti128 m6, m6, [tmpq+strideq*2-16], 1 vinserti128 m7, m7, [tmpq+stride3q -16], 1 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 mova [rsp+6*32], m0 mova 
[rsp+7*32], m1 mova [rsp+8*32], m2 mova [rsp+9*32], m3 mova [rsp+5*32], m4 mova xm0, [dstq+strideq*0] mova xm1, [dstq+strideq*1] mova xm2, [dstq+strideq*2] mova xm3, [dstq+stride3q ] lea tmpq, [dstq+strideq*4] mova xm8, [tmpq+strideq*0] mova xm9, [tmpq+strideq*1] mova xm10, [tmpq+strideq*2] mova xm11, [tmpq+stride3q ] lea tmpq, [tmpq+strideq*4] vinserti128 m0, m0, [tmpq+strideq*0], 1 vinserti128 m1, m1, [tmpq+strideq*1], 1 vinserti128 m2, m2, [tmpq+strideq*2], 1 vinserti128 m3, m3, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m8, m8, [tmpq+strideq*0], 1 vinserti128 m9, m9, [tmpq+strideq*1], 1 vinserti128 m10, m10, [tmpq+strideq*2], 1 vinserti128 m11, m11, [tmpq+stride3q ], 1 TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 mova [rsp+10*32], m8 mova [rsp+11*32], m9 mova [rsp+12*32], m10 mova [rsp+13*32], m11 ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 SWAP 13, 5, 0 SWAP 3, 6, 1, 15 SWAP 4, 7 SWAP 2, 14 %endif %endif ; load L/E/I/H %ifidn %2, v pmovzxbw m1, [lq] pmovzxbw m0, [lq+l_strideq] pxor m2, m2 %else vpbroadcastq m0, [lq] ; l0, l1 vpbroadcastq m1, [lq+l_strideq] ; l2, l3 vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5 vpbroadcastq m10, [lq+l_stride3q] ; l6, l7 punpckldq m0, m1 ; l0, l2, l1, l3 [2x] punpckldq m2, m10 ; l4, l6, l5, l7 [2x] vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7 pxor m2, m2 punpcklbw m1, m0, m2 ; l0, l2, l4, l6 punpckhbw m0, m2 ; l1, l3, l5, l7 %endif pcmpeqw m10, m2, m0 pand m1, m10 por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] pcmpeqw m10, m2, m0 ; !L psrlw m10, 1 psrlw m2, m0, [lutq+128] vpbroadcastw m1, [lutq+136] pminuw m2, m1 pmaxuw m2, [pw_1] ; I psrlw m1, m0, 4 ; H paddw m0, [pw_2] vpbroadcastd m8, [r11] paddw m0, m0 paddw m0, m2 ; E REPX {pmullw x, m8}, m0, m1, m2 psubw m8, m3, m4 ; p1-p0 psubw m9, m5, m6 ; q1-q0 REPX {pabsw x, x}, m8, m9 pmaxuw m8, m10 pmaxuw m8, m9 pcmpgtw m7, m8, m1 ; hev %if %1 != 4 psubw m9, m13, m4 ; p2-p0 pabsw m9, m9 pmaxuw m9, m8 %if %1 != 6 %ifidn %2, v mova m11, [tmpq+strideq*0] ; p3 %else mova m11, [rsp+5*32] ; p3 %endif psubw m10, m11, m4 ; p3-p0 pabsw m10, m10 pmaxuw m9, m10 %endif psubw m10, m5, m14 ; q2-q0 pabsw m10, m10 pmaxuw m9, m10 %if %1 != 6 psubw m10, m5, m15 ; q3-q0 pabsw m10, m10 pmaxuw m9, m10 %endif vpbroadcastd m10, [r11] pcmpgtw m9, m10 ; !flat8in psubw m10, m13, m3 ; p2-p1 pabsw m10, m10 %if %1 != 6 psubw m11, m13 ; p3-p2 pabsw m11, m11 pmaxuw m10, m11 psubw m11, m14, m15 ; q3-q2 pabsw m11, m11 pmaxuw m10, m11 %endif psubw m11, m14, m6 ; q2-q1 pabsw m11, m11 pmaxuw m10, m11 %if %1 == 16 vpbroadcastd m11, [maskq+8] vpbroadcastd m1, [maskq+4] por m11, m1 pand m11, m12 pcmpeqd m11, m12 pand m10, m11 %else vpbroadcastd m11, [maskq+4] pand m11, m12 pcmpeqd m11, m12 pand m10, m11 ; only apply fm-wide to wd>4 blocks %endif pmaxuw m8, m10 %endif pcmpgtw m8, m2 psubw m10, m3, m6 ; p1-q1 psubw m11, m4, m5 ; p0-q0 REPX {pabsw x, x}, m10, m11 paddw m11, m11 psrlw m10, 1 paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E por m8, m10 %if %1 == 16 %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] mova m1, [tmpq+strideq*2] mova m2, [tmpq+stride3q] %else mova m0, [rsp+7*32] mova m1, [rsp+8*32] mova m2, [rsp+9*32] %endif REPX {psubw x, m4}, m0, m1, m2 REPX {pabsw x, x}, m0, m1, m2 pmaxuw m1, m0 pmaxuw m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] mova m2, [tmpq+strideq*1] mova m10, [tmpq+strideq*2] 
%else mova m0, [rsp+10*32] mova m2, [rsp+11*32] mova m10, [rsp+12*32] %endif REPX {psubw x, m5}, m0, m2, m10 REPX {pabsw x, x}, m0, m2, m10 pmaxuw m0, m2 pmaxuw m1, m10 pmaxuw m1, m0 vpbroadcastd m0, [r11] pcmpgtw m1, m0 ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, m12 pcmpeqd m10, m12 pandn m1, m10 ; flat16 pandn m1, m8, m1 ; flat16 & fm vpbroadcastd m10, [maskq+4] por m10, m2 pand m2, m10, m12 pcmpeqd m2, m12 pandn m9, m2 ; flat8in pandn m9, m8, m9 vpbroadcastd m2, [maskq+0] por m2, m10 pand m2, m12 pcmpeqd m2, m12 pandn m8, m2 pandn m8, m9, m8 ; fm & !flat8 & !flat16 pandn m9, m1, m9 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] pand m2, m0, m12 pcmpeqd m2, m12 pandn m9, m2 pandn m9, m8, m9 ; flat8 & fm vpbroadcastd m2, [maskq+0] por m0, m2 pand m0, m12 pcmpeqd m0, m12 pandn m8, m0 pandn m8, m9, m8 ; fm & !flat8 %else vpbroadcastd m0, [maskq+0] pand m0, m12 pcmpeqd m0, m12 pandn m8, m0 ; fm %endif ; short filter vpbroadcastd m0, [r11+8*1] ; 511 or 2047 vpbroadcastd m2, [r11+8*2] ; -512 or -2048 psubw m10, m5, m4 paddw m11, m10, m10 paddw m11, m10 psubw m10, m3, m6 ; iclip_diff(p1-q1) pminsw m10, m0 pmaxsw m10, m2 pand m10, m7 ; f=iclip_diff(p1-q1)&hev paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) pminsw m10, m0 pmaxsw m10, m2 pand m8, m10 ; f&=fm vpbroadcastd m10, [pw_4] paddw m10, m8 paddw m8, [pw_3] REPX {pminsw x, m0}, m10, m8 psraw m10, 3 ; f2 psraw m8, 3 ; f1 psubw m5, m10 paddw m4, m8 paddw m10, [pw_1] psraw m10, 1 ; f=(f1+1)>>1 pandn m8, m7, m10 ; f&=!hev paddw m3, m8 psubw m6, m8 pxor m8, m8 psubw m0, m2 ; 1023 or 4095 REPX {pminsw x, m0}, m3, m4, m5, m6 REPX {pmaxsw x, m8}, m3, m4, m5, m6 %if %1 == 16 ; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 ; m12=filter bits mask ; m13-15=p2/q2/q3 ; m0,2,7-8,10-11 = free ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 mova m11, [tmpq+strideq*4] ; p3 %else mova m0, 
[rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] mova m11, [rsp+5*32] %endif mova [rsp+ 0*32], m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 paddw m8, m0, [pw_1] psllw m8, 3 ; p6*8+8 paddw m10, m2, m7 ; p5+p4 psubw m8, m0 paddw m10, m10 ; (p5+p4)*2 paddw m8, m11 ; p6*7+p3 paddw m10, m13 ; (p5+p4)*2+p2 paddw m8, m3 ; p6*7+p3+p1 paddw m10, m4 ; (p5+p4)*2+p2+p0 paddw m8, m5 ; p6*7+p3+p1+q0 paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m10, m8, 4 vpblendvb m10, m2, m10, m1 %ifidn %2, v mova [tmpq+strideq*2], m10 ; p5 %else mova [rsp+8*32], m10 %endif ; sub p6*2, add p3/q1 paddw m8, m11 paddw m10, m0, m0 paddw m8, m6 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m7, m10, m1 %ifidn %2, v mova [tmpq+stride3q], m10 ; p4 %else mova [rsp+9*32], m10 %endif ; sub p6/p5, add p2/q2 psubw m8, m0 paddw m10, m13, m14 psubw m8, m2 paddw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*4], m10 ; p3 lea tmpq, [dstq+strideq*4] %else mova [rsp+5*32], m10 %endif ; sub p6/p4, add p1/q3 paddw m8, m3 paddw m10, m0, m7 paddw m8, m15 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m13, m10, m1 mova [rsp+1*32], m10 ; don't clobber p2/m13 ; sub p6/p3, add p0/q4 paddw m8, m4 paddw m10, m0, m11 %ifidn %2, v paddw m8, [tmpq+strideq*0] %else paddw m8, [rsp+10*32] %endif psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m3, m10, m1 mova [rsp+2*32], m10 ; don't clobber p1/m3 ; sub p6/p2, add q0/q5 paddw m8, m5 paddw m10, m0, m13 %ifidn %2, v paddw m8, [tmpq+strideq*1] %else paddw m8, [rsp+11*32] %endif psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m4, m10, m1 mova [rsp+3*32], m10 ; don't clobber p0/m4 ; sub p6/p1, add q1/q6 paddw m8, m6 paddw m10, m0, m3 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 %else mova m0, [rsp+12*32] ; q6 %endif paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m5, m10, m1 mova [rsp+4*32], m10 ; don't clobber q0/m5 ; sub p5/p0, add q2/q6 paddw m8, m14 paddw m10, m2, m4 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m2, m6, m10, m1 ; 
don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m8, m15 paddw m10, m7, m5 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v paddw m8, [tmpq+strideq*0] %else paddw m8, [rsp+10*32] %endif paddw m10, m11, m6 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m15, m10, m1 %ifidn %2, v mova [tmpq+mstrideq], m10 ; q3 %else mova [rsp+14*32], m10 %endif ; sub p2/q2, add q5/q6 %ifidn %2, v paddw m8, [tmpq+strideq*1] %else paddw m8, [rsp+11*32] %endif paddw m10, m13, m14 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 %ifidn %2, v mova m9, [tmpq+strideq*0] %else mova m9, [rsp+10*32] %endif vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*0], m10 ; q4 %else mova [rsp+10*32], m10 %endif ; sub p1/q3, add q6*2 psubw m8, m3 paddw m0, m0 psubw m8, m15 paddw m8, m0 psrlw m10, m8, 4 %ifidn %2, v mova m9, [tmpq+strideq*1] %else mova m9, [rsp+11*32] %endif vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else mova [rsp+11*32], m10 %endif mova m9, [rsp+0*32] mova m13, [rsp+1*32] mova m3, [rsp+2*32] mova m4, [rsp+3*32] mova m5, [rsp+4*32] SWAP 2, 6 SWAP 7, 14 %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %else mova m15, [rsp+14*32] %endif %endif %if %1 >= 8 ; flat8 filter vpbroadcastd m7, [pw_4096] %ifidn %2, v mova m0, [tmpq+strideq*0] ; p3 %else mova m0, [rsp+5*32] ; p3 %endif paddw m1, m0, m13 ; p3+p2 paddw m2, m3, m4 ; p1+p0 paddw m8, m1, m1 ; 2*(p3+p2) paddw m2, m0 ; p1+p0+p3 paddw m8, m5 ; 2*(p3+p2)+q0 paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 pmulhrsw m10, m2, m7 paddw m8, m3, m6 psubw m2, m1 paddw m2, m8 pmulhrsw m8, m2, m7 paddw m11, m0, m3 paddw m1, m4, m14 psubw m2, m11 paddw m2, m1 pmulhrsw m1, m2, m7 paddw m11, m0, m4 pblendvb m4, m1, m9 paddw m1, m5, m15 psubw m2, m11 paddw m2, m1 pmulhrsw m11, m2, m7 paddw m2, m6 paddw m2, m15 paddw m1, m13, m5 pblendvb m5, m11, m9 pblendvb m13, m10, m9 psubw m2, m1 pmulhrsw m1, m2, m7 psubw m2, m3 pblendvb m3, m8, m9 psubw m2, m6 
pblendvb m6, m1, m9 paddw m1, m15, m14 paddw m2, m1 pmulhrsw m2, m7 pblendvb m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m4 ; p0 mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 mova [dstq+strideq*2], m14 ; q2 %elif %1 == 8 TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 ; write 8x16 movu [dstq+strideq*0-8], xm0 movu [dstq+strideq*1-8], xm13 movu [dstq+strideq*2-8], xm3 movu [dstq+stride3q -8], xm4 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm5 movu [dstq+strideq*1-8], xm6 movu [dstq+strideq*2-8], xm14 movu [dstq+stride3q -8], xm15 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m0, 1 vextracti128 [dstq+strideq*1-8], m13, 1 vextracti128 [dstq+strideq*2-8], m3, 1 vextracti128 [dstq+stride3q -8], m4, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m5, 1 vextracti128 [dstq+strideq*1-8], m6, 1 vextracti128 [dstq+strideq*2-8], m14, 1 vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %else mova m8, [rsp+6*32] mova m1, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 mova [dstq+strideq*0-16], xm8 mova [dstq+strideq*1-16], xm1 mova [dstq+strideq*2-16], xm2 mova [dstq+stride3q -16], xm7 lea tmpq, [dstq+strideq*4] mova [tmpq+strideq*0-16], xm0 mova [tmpq+strideq*1-16], xm13 mova [tmpq+strideq*2-16], xm3 mova [tmpq+stride3q -16], xm4 lea tmpq, [tmpq+strideq*4] vextracti128 [tmpq+strideq*0-16], m8, 1 vextracti128 [tmpq+strideq*1-16], m1, 1 vextracti128 [tmpq+strideq*2-16], m2, 1 vextracti128 [tmpq+stride3q -16], m7, 1 lea tmpq, [tmpq+strideq*4] vextracti128 [tmpq+strideq*0-16], m0, 1 vextracti128 [tmpq+strideq*1-16], m13, 1 vextracti128 [tmpq+strideq*2-16], m3, 1 vextracti128 [tmpq+stride3q -16], m4, 1 mova m0, [rsp+10*32] mova m1, [rsp+11*32] mova m2, [rsp+12*32] mova m3, [rsp+13*32] TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 mova [dstq+strideq*0], xm5 mova [dstq+strideq*1], xm6 mova [dstq+strideq*2], xm14 
mova [dstq+stride3q ], xm15 lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 mova [dstq+strideq*2], xm2 mova [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0], m5, 1 vextracti128 [dstq+strideq*1], m6, 1 vextracti128 [dstq+strideq*2], m14, 1 vextracti128 [dstq+stride3q ], m15, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0], m0, 1 vextracti128 [dstq+strideq*1], m1, 1 vextracti128 [dstq+strideq*2], m2, 1 vextracti128 [dstq+stride3q ], m3, 1 lea dstq, [dstq+strideq*4] %endif %elif %1 == 6 ; flat6 filter vpbroadcastd m7, [pw_4096] paddw m8, m3, m4 paddw m8, m13 ; p2+p1+p0 paddw m11, m13, m5 paddw m8, m8 paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 pmulhrsw m2, m8, m7 paddw m8, m5 paddw m11, m13, m13 paddw m8, m6 psubw m8, m11 pmulhrsw m10, m8, m7 paddw m8, m6 paddw m11, m13, m3 paddw m8, m14 psubw m8, m11 pmulhrsw m11, m8, m7 psubw m8, m3 paddw m14, m14 psubw m8, m4 paddw m8, m14 pmulhrsw m8, m7 pblendvb m3, m2, m9 pblendvb m4, m10, m9 pblendvb m5, m11, m9 pblendvb m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m4 ; p0 mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 %else TRANSPOSE_8x4_AND_WRITE_4x16 %endif %else %ifidn %2, v mova [tmpq+strideq*0], m3 ; p1 mova [tmpq+strideq*1], m4 ; p0 mova [tmpq+strideq*2], m5 ; q0 mova [tmpq+stride3q ], m6 ; q1 %else TRANSPOSE_8x4_AND_WRITE_4x16 %endif %endif %endmacro INIT_YMM avx2 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, v jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz 
.end call .v4 .end: pslld m12, 4 add lq, 16 add dstq, 32 shl mask_bitsd, 4 sub wd, 4 jg .loop RET ALIGN function_align .v4: FILTER 4, v ret INIT_YMM avx2 cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, h jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter call .h4 jmp .end .no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] .end: pslld m12, 4 lea lq, [lq+l_strideq*4] shl mask_bitsd, 4 sub hd, 4 jg .loop RET ALIGN function_align .h4: FILTER 4, h ret INIT_YMM avx2 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 .end: pslld m12, 4 add lq, 16 add dstq, 32 shl mask_bitsd, 4 sub wd, 4 jg .loop RET INIT_YMM avx2 cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; 
vmask[0] jz .no_filter call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 jmp .end .no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] .end: pslld m12, 4 lea lq, [lq+l_strideq*4] shl mask_bitsd, 4 sub hd, 4 jg .loop RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/loopfilter16_avx512.asm000066400000000000000000001026201517466257200254170ustar00rootroot00000000000000; Copyright © 2022, VideoLAN and dav2d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Two tables are interleaved dword by dword. The byte pairs belonging to
; l_shuf_v occupy the even dwords and the pw_* word constants the odd dwords.
; FILTER loads l_shuf_v with movsldup, which replicates the even dwords, so
; the constants in between do not disturb the shuffle pattern.
; pw_4 and pw_16 end up 8 bytes apart; the entry points select one of them
; with [base+pw_4+r6*8], r6 being the is_12bpc flag.
l_shuf_v: times 2 db 0, 32
pw_1: times 2 dw 1
times 2 db 4, 36
pw_3: times 2 dw 3
times 2 db 8, 40
pw_4: times 2 dw 4
times 2 db 12, 44
pw_16: times 2 dw 16
times 2 db 16, 48
pw_4096: times 2 dw 4096
times 2 db 20, 52
pw_16384: times 2 dw 16384
times 2 db 24, 56
pw_32767: times 2 dw 32767
times 2 db 28, 60
times 2 dw 0
; one bit per dword; loaded into ym21 by the entry points and shifted left by
; 8 after every loop pass
filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
; multiplied by the broadcast stride to build ym12, the index vector of the
; gathers/scatters in the h-direction FILTER paths
stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
; pshufb control applied to the level bytes in the h direction
l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
; clip bounds, read as [base+clip_*+r6*4]: first pair when is_12bpc is 0,
; second pair when it is 1
clip_max: dw 511, 511, 2047, 2047
clip_min: dw -512, -512, -2048, -2048

SECTION .text

; Transposes 8x8 blocks of words held in registers %1-%8, in place, in three
; rounds of interleaves (word, dword, qword). %9 is a scratch register and is
; overwritten.
%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
    ; round 1: interleave words of register pairs
    punpckhwd m%9, m%5, m%6
    punpcklwd m%5, m%6
    punpckhwd m%6, m%1, m%2
    punpcklwd m%1, m%2
    punpckhwd m%2, m%7, m%8
    punpcklwd m%7, m%8
    punpckhwd m%8, m%3, m%4
    punpcklwd m%3, m%4
    ; round 2: interleave dwords
    punpckhdq m%4, m%1, m%3
    punpckldq m%1, m%3
    punpckldq m%3, m%5, m%7
    punpckhdq m%5, m%7
    punpckhdq m%7, m%6, m%8
    punpckldq m%6, m%8
    punpckldq m%8, m%9, m%2
    punpckhdq m%9, m%2
    ; round 3: interleave qwords
    punpckhqdq m%2, m%1, m%3
    punpcklqdq m%1, m%3
    punpcklqdq m%3, m%4, m%5
    punpckhqdq m%4, m%5
    punpcklqdq m%5, m%6, m%8
    punpckhqdq m%6, m%8
    punpckhqdq m%8, m%7, m%9
    punpcklqdq m%7, m%9
%endmacro

; FILTER width, dir
; Expands to the complete load / decide / filter / store sequence for one
; filter width in one direction. The first stage below loads the pixel rows
; around the edge; register assignment in the v direction is m0-m6 = p6-p0,
; m7-m11 = q0-q4, m22 = q5, m23 = q6.
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
%ifidn %2, v
%if %1 == 16
    lea tmpq, [dstq+mstrideq*8]
    mova m0, [tmpq+strideq*1 ] ; p6
    mova m1, [tmpq+strideq*2 ] ; p5
    mova m2, [tmpq+stride3q ] ; p4
    mova m3, [tmpq+strideq*4 ] ; p3
    mova m4, [tmpq+stride5q ] ; p2
%elif %1 == 6 || %1 == 8
    lea tmpq, [dstq+mstrideq*4]
%if %1 == 8
    mova m3, [tmpq+strideq*0 ] ; p3
%endif
    mova m4, [tmpq+strideq*1 ] ; p2
%endif
    mova m5, [dstq+mstrideq*2] ; p1
    mova m6, [dstq+mstrideq*1] ; p0
    mova m7, [dstq+strideq*0 ] ; q0
    mova m8, [dstq+strideq*1 ] ; q1
%if %1 != 4
    mova m9, [dstq+strideq*2 ] ; q2
%endif
%if %1 == 8 || %1 == 16
    mova m10, [dstq+stride3q ] ; q3
%endif
%if %1 == 16
    mova m11, [dstq+strideq*4 ] ; q4
    mova m22, [dstq+stride5q ] ; q5
    mova m23, [dstq+stride3q*2] ; q6
%endif
%else ; h
%if %1 == 16
    movu 
ym16, [dstq+strideq*0 -16] movu ym17, [dstq+strideq*1 -16] movu ym18, [dstq+strideq*2 -16] movu ym19, [dstq+stride3q -16] movu ym20, [dstq+strideq*4 -16] movu ym22, [dstq+stride5q -16] movu ym23, [dstq+stride3q*2-16] movu ym28, [dstq+stride7q -16] lea tmpq, [dstq+strideq*8 -16] vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 vinserti32x8 m10, m19, [tmpq+stride3q ], 1 vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 vinserti32x8 m22, m22, [tmpq+stride5q ], 1 vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 vinserti32x8 m28, m28, [tmpq+stride7q ], 1 lea tmpq, [tmpq+strideq*8] TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 movu ym16, [tmpq+strideq*0 ] movu ym17, [tmpq+strideq*1 ] movu ym18, [tmpq+strideq*2 ] movu ym19, [tmpq+stride3q ] movu ym24, [tmpq+strideq*4 ] movu ym25, [tmpq+stride5q ] movu ym26, [tmpq+stride3q*2] movu ym20, [tmpq+stride7q ] lea tmpq, [tmpq+strideq*8] vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 vinserti32x8 m3, m19, [tmpq+stride3q ], 1 vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 vinserti32x8 m5, m25, [tmpq+stride5q ], 1 vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 vinserti32x8 m20, m20, [tmpq+stride7q ], 1 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27 vshufi32x4 m27, m7, m0, q2020 vshufi32x4 m7, m0, q3131 vshufi32x4 m0, m8, m1, q2020 vshufi32x4 m8, m1, q3131 vshufi32x4 m1, m9, m2, q2020 vshufi32x4 m9, m2, q3131 vshufi32x4 m2, m10, m3, q2020 vshufi32x4 m10, m3, q3131 vshufi32x4 m3, m11, m4, q2020 vshufi32x4 m11, m4, q3131 vshufi32x4 m4, m22, m5, q2020 vshufi32x4 m22, m5, q3131 vshufi32x4 m5, m23, m6, q2020 vshufi32x4 m23, m6, q3131 vshufi32x4 m6, m28, m20, q2020 vshufi32x4 m28, m20, q3131 %elif %1 == 6 || %1 == 8 %if %1 == 8 sub dstq, 8 movu xm16, [dstq+strideq*0 ] movu xm17, [dstq+strideq*1 ] movu xm18, [dstq+strideq*2 ] movu xm19, [dstq+stride3q ] movu xm24, [dstq+strideq*4 ] movu xm25, 
[dstq+stride5q ] movu xm26, [dstq+stride3q*2] movu xm27, [dstq+stride7q ] lea tmpq, [dstq+strideq*8 ] vinserti128 ym16, [tmpq+strideq*0 ], 1 vinserti128 ym17, [tmpq+strideq*1 ], 1 vinserti128 ym18, [tmpq+strideq*2 ], 1 vinserti128 ym19, [tmpq+stride3q ], 1 vinserti128 ym24, [tmpq+strideq*4 ], 1 vinserti128 ym25, [tmpq+stride5q ], 1 vinserti128 ym26, [tmpq+stride3q*2], 1 vinserti128 ym27, [tmpq+stride7q ], 1 lea tmpq, [tmpq+strideq*8 ] vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 vinserti32x4 m7, m19, [tmpq+stride3q ], 2 vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 vinserti32x4 m9, m25, [tmpq+stride5q ], 2 vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 vinserti32x4 m4, m27, [tmpq+stride7q ], 2 lea tmpq, [tmpq+strideq*8 ] vinserti32x4 m10, [tmpq+strideq*0 ], 3 vinserti32x4 m8, [tmpq+strideq*1 ], 3 vinserti32x4 m5, [tmpq+strideq*2 ], 3 vinserti32x4 m7, [tmpq+stride3q ], 3 vinserti32x4 m2, [tmpq+strideq*4 ], 3 vinserti32x4 m9, [tmpq+stride5q ], 3 vinserti32x4 m3, [tmpq+stride3q*2], 3 vinserti32x4 m4, [tmpq+stride7q ], 3 %else ; %1 == 6 movu xm16, [dstq+strideq*0-8] movu xm17, [dstq+strideq*1-8] movu xm18, [dstq+strideq*2-8] movu xm19, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4-8] movu xm2, [tmpq+strideq*0] movu xm9, [tmpq+strideq*1] movu xm3, [tmpq+strideq*2] movu xm4, [tmpq+stride3q ] lea tmpq, [tmpq+strideq*4] vinserti128 ym16, [tmpq+strideq*0], 1 vinserti128 ym17, [tmpq+strideq*1], 1 vinserti128 ym18, [tmpq+strideq*2], 1 vinserti128 ym19, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] vinserti128 ym2, [tmpq+strideq*0], 1 vinserti128 ym9, [tmpq+strideq*1], 1 vinserti128 ym3, [tmpq+strideq*2], 1 vinserti128 ym4, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] vinserti32x4 m10, m16, [tmpq+strideq*0], 2 vinserti32x4 m8, m17, [tmpq+strideq*1], 2 vinserti32x4 m5, m18, [tmpq+strideq*2], 2 vinserti32x4 m7, m19, [tmpq+stride3q ], 2 lea tmpq, [tmpq+strideq*4] vinserti32x4 m2, [tmpq+strideq*0], 2 
vinserti32x4 m9, [tmpq+strideq*1], 2 vinserti32x4 m3, [tmpq+strideq*2], 2 vinserti32x4 m4, [tmpq+stride3q ], 2 lea tmpq, [tmpq+strideq*4] vinserti32x4 m10, [tmpq+strideq*0], 3 vinserti32x4 m8, [tmpq+strideq*1], 3 vinserti32x4 m5, [tmpq+strideq*2], 3 vinserti32x4 m7, [tmpq+stride3q ], 3 lea tmpq, [tmpq+strideq*4] vinserti32x4 m2, [tmpq+strideq*0], 3 vinserti32x4 m9, [tmpq+strideq*1], 3 vinserti32x4 m3, [tmpq+strideq*2], 3 vinserti32x4 m4, [tmpq+stride3q ], 3 %endif punpcklwd m6, m10, m8 punpckhwd m10, m8 punpcklwd m8, m5, m7 punpckhwd m5, m7 punpcklwd m7, m2, m9 punpckhwd m2, m9 punpcklwd m9, m3, m4 punpckhwd m3, m4 punpckldq m4, m6, m8 punpckhdq m6, m8 punpckldq m8, m10, m5 punpckhdq m10, m5 punpckldq m5, m7, m9 punpckhdq m7, m9 punpckldq m9, m2, m3 punpckhdq m2, m3 %if %1 == 8 punpcklqdq m3, m4, m5 %endif punpckhqdq m4, m5 punpcklqdq m5, m6, m7 punpckhqdq m6, m7 punpcklqdq m7, m8, m9 punpckhqdq m8, m9 punpcklqdq m9, m10, m2 %if %1 == 8 punpckhqdq m10, m2 %endif %else ; %1 == 4 kxnorb k1, k1, k1 kmovb k2, k1 vpgatherdq m7{k1}, [dstq+ym12-4] lea tmpq, [dstq+strideq*2-4] kmovb k1, k2 vpgatherdq m4{k2}, [tmpq+ym12] lea tmpq, [tmpq+strideq*2] kmovb k2, k1 vpgatherdq m5{k1}, [tmpq+ym12] lea tmpq, [tmpq+strideq*2] vpgatherdq m6{k2}, [tmpq+ym12] punpcklwd m8, m7, m4 punpckhwd m7, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpcklwd m6, m8, m7 punpckhwd m8, m7 punpcklwd m7, m4, m5 punpckhwd m4, m5 punpcklqdq m5, m6, m7 punpckhqdq m6, m7 punpcklqdq m7, m8, m4 punpckhqdq m8, m4 %endif %endif ; load L/E/I/H %ifidn %2, v movu ym16, [lq+l_strideq*1] movsldup m17, [l_shuf_v] vptestnmb k1, ym16, ym16 vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? 
l[x][] : l[x-stride][] vpermb m16, m17, m16 ; l[x][1] %else movq xm16, [lq+l_strideq*0] movq xm17, [lq+l_strideq*1] vinserti128 ym16, [lq+l_strideq*2], 1 vinserti128 ym17, [lq+l_stride3q ], 1 lea tmpq, [lq+l_strideq*4] vinserti32x4 m16, [tmpq+l_strideq*0], 2 vinserti32x4 m17, [tmpq+l_strideq*1], 2 vinserti32x4 m16, [tmpq+l_strideq*2], 3 vinserti32x4 m17, [tmpq+l_stride3q ], 3 punpcklqdq m16, m17 vbroadcasti32x4 m17, [l_shuf_h] vptestnmb k1, m16, m16 vpalignr m16{k1}, m16, 12 pshufb m16, m17 ; l[x][1] %endif vpbroadcastd m20, [pw_32767] psubw m17, m5, m6 ; p1-p0 psubw m18, m7, m8 ; q1-q0 vptestmw k1, m16, m16 ; L pabsw m17, m17 pabsw m18, m18 vpmaxuw m20{k1}, m17, m18 vpbroadcastw m17, [lutq+136] psrlw m18, m16, [lutq+128] vpbroadcastd m19, [pw_1] pminuw m18, m17 psrlw m17, m16, 4 ; H paddw m16, m16 pmaxuw m18, m19 ; I vpaddd m16, [pw_4] {1to16} paddw m16, m18 ; E REPX {pmullw x, m13}, m17, m18, m16 vpcmpw k4, m20, m17, 6 ; hev %if %1 != 4 psubw m19, m4, m5 ; p2-p1 pabsw m19, m19 %if %1 == 8 || %1 == 16 psubw m17, m3, m4 ; p3-p2 pabsw m17, m17 pmaxuw m19, m17 psubw m17, m9, m10 ; q3-q2 pabsw m17, m17 pmaxuw m19, m17 %endif psubw m17, m9, m8 ; q2-q1 pabsw m17, m17 pmaxuw m19, m17 %if %1 == 16 vpbroadcastd ym17, [maskq+4] vpord ym17, [maskq+8] {1to8} vptestmd k1, ym17, ym21 %else vptestmd k1, ym21, [maskq+4] {1to8} %endif pmaxuw m19, m20 psubw m17, m4, m6 ; p2-p0 pabsw m17, m17 pmaxuw m17, m20 vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks %if %1 == 8 || %1 == 16 psubw m19, m3, m6 ; p3-p0 pabsw m19, m19 pmaxuw m17, m19 psubw m19, m7, m10 ; q3-q0 pabsw m19, m19 pmaxuw m17, m19 %endif psubw m19, m7, m9 ; q2-q0 pabsw m19, m19 pmaxuw m17, m19 %endif vpcmpw k1, m20, m18, 2 psubw m18, m5, m8 ; p1-q1 psubw m19, m6, m7 ; p0-q0 pabsw m18, m18 pabsw m19, m19 psrlw m18, 1 paddw m19, m19 paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E %if %1 != 4 vpcmpw k2{k1}, m17, m13, 2 ; flat8in %endif %if %1 == 16 psubw 
m20, m0, m6 psubw m16, m1, m6 pabsw m20, m20 psubw m17, m2, m6 pabsw m16, m16 psubw m18, m11, m7 pabsw m17, m17 psubw m19, m22, m7 pabsw m18, m18 pmaxuw m20, m16 psubw m16, m23, m7 pabsw m19, m19 pmaxuw m17, m18 pabsw m16, m16 vpandd ym18, ym21, [maskq+8] {1to8} pmaxuw m20, m17 pmaxuw m19, m16 pcmpeqd ym16, ym21, ym18 vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 pmaxuw m20, m19 pcmpeqd ym17, ym21, ym18 vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out pcmpeqd ym18, ym21 vptestmb k3{k3}, ym16, ym16 ; flat8 & fm vptestmb k2{k2}, ym17, ym17 ; flat8in vptestmb k1{k1}, ym18, ym18 kandnd k1, k2, k1 ; fm & !flat8 & !flat16 kandnd k2, k3, k2 ; flat8 & !flat16 %elif %1 == 6 || %1 == 8 vpandd ym17, ym21, [maskq+4] {1to8} pcmpeqd ym16, ym21, ym17 vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 pcmpeqd ym17, ym21 vptestmb k2{k2}, ym16, ym16 ; flat8 & fm vptestmb k1{k1}, ym17, ym17 kandnd k1, k2, k1 ; fm & !flat8 %else ; %1 == 4 vpandd ym16, ym21, [maskq+0] {1to8} pcmpeqd ym16, ym21 vptestmb k1{k1}, ym16, ym16 %endif ; short filter psubw m16, m7, m6 vpbroadcastd m17, [pw_3] paddw m18, m16, m16 paddw m18, m16 psubw m16, m5, m8 ; iclip_diff(p1-q1) pminsw m16, m14 vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev knotd k4, k4 ; !hev paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) vpbroadcastd m18, [pw_4] pminsw m16, m14 vpmaxsw m16{k1}{z}, m15 ; f&=fm paddw m17, m16 paddw m16, m18 vpbroadcastd m18, [pw_16384] pminsw m17, m14 pminsw m16, m14 psraw m17, 3 ; f2 psraw m16, 3 ; f1 paddw m6, m17 psubw m7, m16 vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev psubw m17, m14, m15 ; 1023 or 4095 pxor m18, m18 paddw m5, m16 psubw m8, m16 REPX {pminsw x, m17}, m6, m7, m5, m8 REPX {pmaxsw x, m18}, m6, m7, m5, m8 %if %1 == 16 ; flat16 filter vpaddd m19, m0, [pw_1] {1to16} paddw m16, m1, m2 ; p5+p4 paddw m26, m1, m6 ; p5+p0 paddw m24, m2, m7 ; p4+q0 paddw m16, m4 ; p5+p4+p3 paddw m17, m3, m5 ; p2+p1 psllw m19, 3 paddw m16, m26 ; p5*2+p4+p3+p0 paddw m17, m24 
; p4+p2+p1+q0 psubw m19, m0 ; p6*7+8 paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 paddw m18, m3, m8 paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 paddw m25, m1, m0 paddw m16, m0, m0 psrlw m1{k3}, m19, 4 paddw m19, m18 psubw m19, m16 ; +p3+q1-p6*2 paddw m16, m2, m0 psrlw m2{k3}, m19, 4 psubw m19, m25 paddw m25, m4, m9 paddw m20, m10, m5 paddw m19, m25 ; +p2+q2-p6-p5 paddw m17, m0, m3 psubw m16, m20, m16 psrlw m3{k3}, m19, 4 paddw m19, m16 ; +p1+q3-p6-p4 paddw m16, m11, m6 psubw m16, m17 paddw m17, m0, m4 psrlw m4{k3}, m19, 4 paddw m19, m16 ; +p0+q4-p6-p3 paddw m16, m22, m7 psubw m16, m17 paddw m17, m0, m5 psrlw m5{k3}, m19, 4 paddw m19, m16 ; +q0+q5-p6-p2 paddw m16, m23, m8 psrlw m6{k3}, m19, 4 psubw m16, m17 paddw m19, m16 ; +q1+q6-p6-p1 paddw m16, m23, m9 psrlw m7{k3}, m19, 4 psubw m16, m26 paddw m19, m16 ; +q2+q6-p5-p0 paddw m16, m23, m10 psrlw m8{k3}, m19, 4 psubw m16, m24 paddw m19, m16 ; +q3+q6-p4-p0 paddw m16, m23, m11 psrlw m9{k3}, m19, 4 psubw m16, m18 paddw m19, m16 ; +q4+q6-p3-q1 paddw m16, m23, m22 psrlw m10{k3}, m19, 4 psubw m16, m25 paddw m19, m16 ; +q5+q6-p2-q2 paddw m16, m23, m23 psrlw m11{k3}, m19, 4 psubw m16, m20 paddw m19, m16 ; +q6*2-p1-q3 psrlw m22{k3}, m19, 4 %endif %if %1 == 8 || %1 == 16 ; flat8 filter vpbroadcastd m20, [pw_4096] paddw m16, m3, m4 ; p3+p2 paddw m19, m5, m6 ; p1+p0 paddw m17, m16, m16 ; 2*(p3+p2) paddw m19, m3 ; p1+p0+p3 paddw m17, m7 ; 2*(p3+p2)+q0 paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 paddw m18, m4, m7 pmulhrsw m4{k2}, m19, m20 psubw m19, m16 paddw m17, m5, m8 paddw m16, m3, m5 paddw m19, m17 pmulhrsw m5{k2}, m19, m20 psubw m19, m16 paddw m16, m6, m9 paddw m19, m16 paddw m16, m3, m6 pmulhrsw m6{k2}, m19, m20 paddw m19, m10 psubw m16, m7, m16 paddw m19, m16 psubw m16, m10, m18 pmulhrsw m7{k2}, m19, m20 paddw m16, m8 paddw m19, m16 psubw m16, m10, m17 pmulhrsw m8{k2}, m19, m20 paddw m16, m9 paddw m19, m16 pmulhrsw m9{k2}, m19, m20 %elif %1 == 6 ; flat6 filter vpbroadcastd m10, [pw_4096] paddw m2, m5, m6 paddw m0, m4, m7 paddw m1, m2, 
m4 ; p2+p1+p0 paddw m3, m4, m4 paddw m1, m1 paddw m4, m5 paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 psubw m3, m7, m3 pmulhrsw m5{k2}, m1, m10 paddw m3, m8 psubw m4, m8, m4 paddw m1, m3 pmulhrsw m6{k2}, m1, m10 paddw m4, m9 paddw m9, m9 paddw m1, m4 pmulhrsw m7{k2}, m1, m10 psubw m9, m2 paddw m1, m9 pmulhrsw m8{k2}, m1, m10 %endif %ifidn %2, v %if %1 == 16 mova [tmpq+strideq*2 ], m1 ; p5 mova [tmpq+stride3q ], m2 ; p4 mova [tmpq+strideq*4 ], m3 ; p3 mova [tmpq+stride5q ], m4 ; p2 %elif %1 == 8 mova [tmpq+strideq*1 ], m4 ; p2 %endif mova [dstq+mstrideq*2], m5 ; p1 mova [dstq+mstrideq ], m6 ; p0 mova [dstq+strideq*0 ], m7 ; q0 mova [dstq+strideq*1 ], m8 ; q1 %if %1 == 8 || %1 == 16 mova [dstq+strideq*2 ], m9 ; q2 %endif %if %1 == 16 mova [dstq+stride3q ], m10 ; q3 mova [dstq+strideq*4 ], m11 ; q4 mova [dstq+stride5q ], m22 ; q5 %endif %else %if %1 == 16 TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 mova [dstq+strideq*0 -16], xm27 mova [dstq+strideq*0 ], xm7 mova [dstq+strideq*1 -16], xm0 mova [dstq+strideq*1 ], xm8 mova [dstq+strideq*2 -16], xm1 mova [dstq+strideq*2 ], xm9 mova [dstq+stride3q -16], xm2 mova [dstq+stride3q ], xm10 mova [dstq+strideq*4 -16], xm3 mova [dstq+strideq*4 ], xm11 mova [dstq+stride5q -16], xm4 mova [dstq+stride5q ], xm22 mova [dstq+stride3q*2-16], xm5 mova [dstq+stride3q*2 ], xm23 mova [dstq+stride7q -16], xm6 mova [dstq+stride7q ], xm28 lea dstq, [dstq+strideq*8] vextracti128 [dstq+strideq*0 -16], ym27, 1 vextracti128 [dstq+strideq*0 ], ym7, 1 vextracti128 [dstq+strideq*1 -16], ym0, 1 vextracti128 [dstq+strideq*1 ], ym8, 1 vextracti128 [dstq+strideq*2 -16], ym1, 1 vextracti128 [dstq+strideq*2 ], ym9, 1 vextracti128 [dstq+stride3q -16], ym2, 1 vextracti128 [dstq+stride3q ], ym10, 1 vextracti128 [dstq+strideq*4 -16], ym3, 1 vextracti128 [dstq+strideq*4 ], ym11, 1 vextracti128 [dstq+stride5q -16], ym4, 1 vextracti128 [dstq+stride5q ], ym22, 1 vextracti128 [dstq+stride3q*2-16], ym5, 1 vextracti128 [dstq+stride3q*2 
], ym23, 1 vextracti128 [dstq+stride7q -16], ym6, 1 vextracti128 [dstq+stride7q ], ym28, 1 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 -16], m27, 2 vextracti32x4 [dstq+strideq*0 ], m7, 2 vextracti32x4 [dstq+strideq*1 -16], m0, 2 vextracti32x4 [dstq+strideq*1 ], m8, 2 vextracti32x4 [dstq+strideq*2 -16], m1, 2 vextracti32x4 [dstq+strideq*2 ], m9, 2 vextracti32x4 [dstq+stride3q -16], m2, 2 vextracti32x4 [dstq+stride3q ], m10, 2 vextracti32x4 [dstq+strideq*4 -16], m3, 2 vextracti32x4 [dstq+strideq*4 ], m11, 2 vextracti32x4 [dstq+stride5q -16], m4, 2 vextracti32x4 [dstq+stride5q ], m22, 2 vextracti32x4 [dstq+stride3q*2-16], m5, 2 vextracti32x4 [dstq+stride3q*2 ], m23, 2 vextracti32x4 [dstq+stride7q -16], m6, 2 vextracti32x4 [dstq+stride7q ], m28, 2 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 -16], m27, 3 vextracti32x4 [dstq+strideq*0 ], m7, 3 vextracti32x4 [dstq+strideq*1 -16], m0, 3 vextracti32x4 [dstq+strideq*1 ], m8, 3 vextracti32x4 [dstq+strideq*2 -16], m1, 3 vextracti32x4 [dstq+strideq*2 ], m9, 3 vextracti32x4 [dstq+stride3q -16], m2, 3 vextracti32x4 [dstq+stride3q ], m10, 3 vextracti32x4 [dstq+strideq*4 -16], m3, 3 vextracti32x4 [dstq+strideq*4 ], m11, 3 vextracti32x4 [dstq+stride5q -16], m4, 3 vextracti32x4 [dstq+stride5q ], m22, 3 vextracti32x4 [dstq+stride3q*2-16], m5, 3 vextracti32x4 [dstq+stride3q*2 ], m23, 3 vextracti32x4 [dstq+stride7q -16], m6, 3 vextracti32x4 [dstq+stride7q ], m28, 3 %elif %1 == 8 TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 movu [dstq+strideq*0 ], xm3 movu [dstq+strideq*1 ], xm4 movu [dstq+strideq*2 ], xm5 movu [dstq+stride3q ], xm6 movu [dstq+strideq*4 ], xm7 movu [dstq+stride5q ], xm8 movu [dstq+stride3q*2], xm9 movu [dstq+stride7q ], xm10 lea dstq, [dstq+strideq*8] vextracti128 [dstq+strideq*0 ], ym3, 1 vextracti128 [dstq+strideq*1 ], ym4, 1 vextracti128 [dstq+strideq*2 ], ym5, 1 vextracti128 [dstq+stride3q ], ym6, 1 vextracti128 [dstq+strideq*4 ], ym7, 1 vextracti128 [dstq+stride5q ], ym8, 1 vextracti128 
[dstq+stride3q*2], ym9, 1
    vextracti128 [dstq+stride7q ], ym10, 1
    lea dstq, [dstq+strideq*8]
    ; third 128-bit lane -> next 8 rows
    vextracti32x4 [dstq+strideq*0 ], m3, 2
    vextracti32x4 [dstq+strideq*1 ], m4, 2
    vextracti32x4 [dstq+strideq*2 ], m5, 2
    vextracti32x4 [dstq+stride3q ], m6, 2
    vextracti32x4 [dstq+strideq*4 ], m7, 2
    vextracti32x4 [dstq+stride5q ], m8, 2
    vextracti32x4 [dstq+stride3q*2], m9, 2
    vextracti32x4 [dstq+stride7q ], m10, 2
    lea dstq, [dstq+strideq*8]
    ; fourth 128-bit lane -> last 8 rows
    vextracti32x4 [dstq+strideq*0 ], m3, 3
    vextracti32x4 [dstq+strideq*1 ], m4, 3
    vextracti32x4 [dstq+strideq*2 ], m5, 3
    vextracti32x4 [dstq+stride3q ], m6, 3
    vextracti32x4 [dstq+strideq*4 ], m7, 3
    vextracti32x4 [dstq+stride5q ], m8, 3
    vextracti32x4 [dstq+stride3q*2], m9, 3
    vextracti32x4 [dstq+stride7q ], m10, 3
    ; final 8 rows plus 8 bytes, undoing the "sub dstq, 8" of the load stage
    lea dstq, [dstq+strideq*8+8]
%else ; %1 == 4 || %1 == 6
    ; Only p1, p0, q0, q1 (m5-m8) are written back. They are interleaved into
    ; 4-pixel (qword) groups and scattered through the ym12 row-offset vector.
    ; A scatter consumes its mask register, hence the k1/k2 copies kept
    ; alternately as a spare all-ones mask.
    punpcklwd m9, m5, m6
    punpckhwd m5, m6
    kxnorb k1, k1, k1
    punpcklwd m6, m7, m8
    punpckhwd m7, m8
    kmovb k2, k1
    punpckldq m8, m9, m6
    vpscatterdq [dstq+ym12-4]{k1}, m8
    punpckhdq m9, m6
    lea tmpq, [dstq+strideq*2-4]
    kmovb k1, k2
    vpscatterdq [tmpq+ym12]{k2}, m9
    punpckldq m6, m5, m7
    lea tmpq, [tmpq+strideq*2]
    kmovb k2, k1
    vpscatterdq [tmpq+ym12]{k1}, m6
    punpckhdq m5, m7
    lea tmpq, [tmpq+strideq*2]
    vpscatterdq [tmpq+ym12]{k2}, m5
%endif
%endif
%endmacro

; lpf_v_sb_y_16bpc (AVX-512)
;
; Register setup shared by all four entry points:
;   ym21 = filter_mask, one bit per dword, shifted left by 8 every pass
;   m13  = pw_4 or pw_16, chosen by is_12bpc (the two are 8 bytes apart)
;   m14  = clip_max, m15 = clip_min for the selected bit depth
; "base" addresses the constants relative to tmpq, which holds the address of
; filter_mask. FILTER reuses tmpq as scratch, so every base-relative load is
; done here in the prologue, before the loop.
;
; Every pass of .loop examines eight mask bits (mask_bits = 0xff, shifted
; left by 8) and expands the widest filter whose vmask word has one of them
; set. Per pass dst advances by 64 bytes, l by 32 bytes, w counts down by 8.
INIT_ZMM avx512icl
cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
                                     lut, w, stride3, mstride, tmp, \
                                     mask_bits, stride5
%define base tmpq-filter_mask
    SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
    lea tmpq, [filter_mask]
    mov r6d, r7m ; bitdepth_max
    lea stride3q, [strideq*3]
    shl l_strideq, 2
    lea stride5q, [strideq*5]
    shr r6d, 11 ; is_12bpc
    mova ym21, [base+filter_mask]
    mov mstrideq, strideq
    vpbroadcastd m13, [base+pw_4+r6*8]
    mov mask_bitsd, 0xff
    vpbroadcastd m14, [base+clip_max+r6*4]
    ; step l back by one (scaled) l_stride
    sub lq, l_strideq
    vpbroadcastd m15, [base+clip_min+r6*4]
    ; mstride = -stride
    neg mstrideq
    mov wd, wm
.loop:
    test [maskq+8], mask_bitsd ; vmask[2]
    jz .no_flat16
    FILTER 16, v
    jmp .end
.no_flat16:
    test [maskq+4], mask_bitsd ; vmask[1]
    jz .no_flat
    FILTER 8, v
    jmp .end
.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    jz .end
    call .v4
.end:
    shl mask_bitsd, 8
    add dstq, 64
    pslld ym21, 8
    add lq, 32
    sub wd, 8
    jg .loop
    RET
ALIGN function_align
.v4: ; called by both luma and chroma
    FILTER 4, v
    ret

; lpf_h_sb_y_16bpc (AVX-512)
;
; Horizontal counterpart of the function above. In addition to the common
; setup, ym12 = stride * stride_mul gives the per-lane row offsets used by the
; gathers/scatters of FILTER 4, h.
;
; Each pass covers 32 rows, and the three filter paths leave dstq in different
; places, which is why the loop tail has three entry labels:
;   FILTER 16, h  advances dstq by 24 rows while storing -> .end adds 8 more
;   FILTER 8, h   ends 32 rows further down               -> .end2, no fix-up
;   FILTER 4, h   addresses through tmpq, dstq unchanged  -> falls through
;                 .no_filter (+24 rows) and .end (+8 rows)
cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
                                     lut, h, stride3, l_stride3, tmp, \
                                     mask_bits, stride5, stride7
    lea tmpq, [filter_mask]
    mov r6d, r7m ; bitdepth_max
    lea stride3q, [strideq*3]
    vpbroadcastd ym12, strided
    shl l_strideq, 2
    lea stride5q, [strideq*5]
    shr r6d, 11 ; is_12bpc
    pmulld ym12, [base+stride_mul]
    lea stride7q, [strideq+stride3q*2]
    mova ym21, [base+filter_mask]
    mov mask_bitsd, 0xff
    vpbroadcastd m13, [base+pw_4+r6*8]
    sub lq, 4
    vpbroadcastd m14, [base+clip_max+r6*4]
    lea l_stride3q, [l_strideq*3]
    vpbroadcastd m15, [base+clip_min+r6*4]
    mov hd, hm
.loop:
    test [maskq+8], mask_bitsd ; vmask[2]
    jz .no_flat16
    FILTER 16, h
    jmp .end
.no_flat16:
    test [maskq+4], mask_bitsd ; vmask[1]
    jz .no_flat
    FILTER 8, h
    jmp .end2
.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    jz .no_filter
    call .h4
.no_filter:
    lea dstq, [dstq+stride3q*8]
.end:
    lea dstq, [dstq+strideq*8]
.end2:
    shl mask_bitsd, 8
    pslld ym21, 8
    lea lq, [lq+l_strideq*8]
    sub hd, 8
    jg .loop
    RET
ALIGN function_align
.h4: ; called by both luma and chroma
    FILTER 4, h
    ret

; lpf_v_sb_uv_16bpc (AVX-512)
;
; Chroma, vertical direction: only vmask[1] (FILTER 6, v) and vmask[0] exist.
; The width-4 case reuses the .v4 subroutine of lpf_v_sb_y_16bpc.
cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
                                      w, stride3, mstride, tmp, mask_bits
    lea tmpq, [filter_mask]
    mov r6d, r7m ; bitdepth_max
    shl l_strideq, 2
    lea stride3q, [strideq*3]
    shr r6d, 11 ; is_12bpc
    mova ym21, [base+filter_mask]
    mov mstrideq, strideq
    vpbroadcastd m13, [base+pw_4+r6*8]
    mov mask_bitsd, 0xff
    vpbroadcastd m14, [base+clip_max+r6*4]
    sub lq, l_strideq
    vpbroadcastd m15, [base+clip_min+r6*4]
    neg mstrideq
    mov wd, wm
.loop:
    test [maskq+4], mask_bitsd ; vmask[1]
    jz .no_flat
    FILTER 6, v
    jmp .end
.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    jz .end
    call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
.end:
    shl mask_bitsd, 8
    add dstq, 64
    pslld ym21, 8
    add lq, 32
sub wd, 8 jg .loop RET cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits lea tmpq, [filter_mask] mov r6d, r7m ; bitdepth_max vpbroadcastd ym12, strided shl l_strideq, 2 shr r6d, 11 ; is_12bpc pmulld ym12, [base+stride_mul] lea stride3q, [strideq*3] mova ym21, [base+filter_mask] mov mask_bitsd, 0xff vpbroadcastd m13, [base+pw_4+r6*8] sub lq, 4 vpbroadcastd m14, [base+clip_max+r6*4] lea l_stride3q, [l_strideq*3] vpbroadcastd m15, [base+clip_min+r6*4] mov hd, hm .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 .end: lea tmpq, [strideq+stride3q] shl mask_bitsd, 8 pslld ym21, 8 lea dstq, [dstq+tmpq*8] lea lq, [lq+l_strideq*8] sub hd, 8 jg .loop RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/loopfilter16_sse.asm000066400000000000000000001370771517466257200252010ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 16

; PIC_sym(a) yields the address expression for constant a: the bare symbol on
; x86-64, or an offset from pic_regq (relative to the section start $$) on
; x86-32.
%if ARCH_X86_64
%define PIC_sym(a) a
%else
%define PIC_base $$
%define PIC_sym(a) pic_regq+a-PIC_base
%endif

; pshufb control used by FILTER to pick the level bytes
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
times 4 db 8, 9

pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_3: times 8 dw 3
; 4 and 16 need to be next to each other since they are used as alternates
; depending on whether bitdepth is 10 or 12
pw_4: times 8 dw 4
pw_16: times 8 dw 16
pw_8: times 8 dw 8
pw_4096: times 8 dw 4096

pb_mask: dd 1, 1, 2, 2

SECTION .text

%if ARCH_X86_32
%if STACK_ALIGNMENT < 16
%define extra_stack 2
%else
%define extra_stack 0
%endif
%endif

; RELOC_ARGS dir, off   (x86-32 only)
; Defines the argument aliases lstridem, lutm, bdmulm and, depending on dir,
; wm + lm (v) or hm (h).
; With STACK_ALIGNMENT < 16 each argument is read from
; [rstk + stack_offset + ...] and copied into a slot at [esp+off+n*gprsize],
; and the alias names that slot. Otherwise the aliases simply name the
; existing r#m argument slots.
%macro RELOC_ARGS 2 ; h/v, off
    ASSERT ARCH_X86_32
%if STACK_ALIGNMENT < 16
    mov r5d, [rstk + stack_offset + 4*4 + 4]
%define lstridem [esp+%2+0*gprsize]
    mov lstridem, r5d
    mov r5d, [rstk + stack_offset + 4*5 + 4]
%define lutm [esp+%2+1*gprsize]
    mov lutm, r5d
    mov r5d, [rstk + stack_offset + 4*6 + 4]
%ifidn %1, v
%define wm [esp+%2+2*gprsize]
    mov wm, r5d
    mov r5d, [rstk + stack_offset + 4*3 + 4]
%define lm [esp+%2+3*gprsize]
    mov lm, r5d
%else ; %1 == h
%define hm [esp+%2+2*gprsize]
    mov hm, r5d
%endif ; %1==v
    mov r5d, r7m
%define bdmulm [esp+%2+4*gprsize]
    mov bdmulm, r5d
%else
%define lstridem r4m
%define lutm r5m
%ifidn %1, v
%define wm r6m
%define lm r3m
%else
%define hm r6m
%endif
%define bdmulm r7m
%endif ; STACK_ALIGNMENT
%endmacro

; Drops the aliases created by RELOC_ARGS.
; NOTE(review): bdmulm is defined by RELOC_ARGS but is not undefined here.
%macro UNRELOC_ARGS 0
%if ARCH_X86_32
%undef lm
%undef lstridem
%undef wm
%undef hm
%undef lutm
%endif
%endmacro

; SPLATD dst, src: load a dword and replicate it to all four dword lanes.
%macro SPLATD 2
    movd %1, %2
    pshufd %1, %1, q0000
%endmacro

; SPLATW dst, src: replicate the low word of src to all eight word lanes.
%macro SPLATW 2
    movd %1, %2
    pshuflw %1, %1, q0000
    punpcklqdq %1, %1
%endmacro

; in:            out:
; mm%1 a b c d   a e i m
; mm%2 e f g h   b f j n
; mm%3 i j k l -> c g k o
; mm%4 m n o p   d h l p
; %5 is a scratch register; the trailing SWAPs rename the registers so the
; result ends up in %1-%4.
%macro TRANSPOSE4X4W 5
    punpcklwd m%5, m%1, m%2
    punpckhwd m%1, m%2
    punpcklwd m%2, m%3, m%4
    punpckhwd m%3, m%4
    punpckldq m%4, m%5, m%2
    punpckhdq m%5, m%2
    punpckldq m%2, m%1, m%3
    punpckhdq m%1, m%3

    SWAP %1, %4
    SWAP %2, %5, %3
%endmacro

; in:                      out:
; m%1 a b c d e f g h      a i q y 6 E M U
; m%2 i j k l m n o p      b j r z 7 F N V
; m%3 q r s t u v w x      c k s 0 8 G O W
; m%4 y z 0 1 2 3 4 5      d l t 1 9 H P X
; m%5 6 7 8 9 A B C D  ->  e m u 2 A I Q Y
; m%6 E F G H I J K L      f n v 3 B J R Z
; m%7 M N O P Q R S T      g o w 4 C K S +
; m%8 U V W X Y Z + =      h p x 5 D L T =
%if ARCH_X86_64
%macro TRANSPOSE8X8W 9
    ; m%1 a b c d e f g h      a i q y b j r z
    ; m%2 i j k l m n o p      c k s 0 d l t 1
    ; m%3 q r s t u v w x  ->  e m u 2 f n v 3
    ; m%4 y z 0 1 2 3 4 5      g o w 4 h p x 5
    TRANSPOSE4X4W %1, %2, %3, %4, %9

    ; m%5 6 7 8 9 A B C D      6 E M U 7 F N V
    ; m%6 E F G H I J K L      8 G O W 9 H P X
    ; m%7 M N O P Q R S T  ->  A I Q Y B J R Z
    ; m%8 U V W X Y Z + =      C K S + D L T =
    TRANSPOSE4X4W %5, %6, %7, %8, %9

    ; m%1 a i q y b j r z      a i q y 6 E M U
    ; m%2 c k s 0 d l t 1      b j r z 7 F N V
    ; m%3 e m u 2 f n v 3      c k s 0 8 G O W
    ; m%4 g o w 4 h p x 5      d l t 1 9 H P X
    ; m%5 6 E M U 7 F N V  ->  e m u 2 A I Q Y
    ; m%6 8 G O W 9 H P X      f n v 3 B J R Z
    ; m%7 A I Q Y B J R Z      g o w 4 C K S +
    ; m%8 C K S + D L T =      h p x 5 D L T =
    punpckhqdq m%9, m%1, m%5
    punpcklqdq m%1, m%5
    punpckhqdq m%5, m%2, m%6
    punpcklqdq m%2, m%6
    punpckhqdq m%6, m%3, m%7
    punpcklqdq m%3, m%7
    punpckhqdq m%7, m%4, m%8
    punpcklqdq m%4, m%8

    ; rename registers so the rows come out in order in %1-%8
    SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro
%else ; x86-32
; input: 1-7 in registers, 8 in first memory [read-only]
; second memory is scratch, and may overlap with first or third memory
; output: 1-5,7-8 in registers, 6 in third memory [write-only]
; Arguments %9-%11 are the three memory operands. An empty string ("") for %9
; or %11 makes the macro use the scratch memory %10 instead. %12/%13 are the
; mov suffixes (a/u) used for the %9 load and the %11 store.
%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x]
    TRANSPOSE4X4W %1, %2, %3, %4, %8
%ifnidn %9, ""
    mov%12 m%8, %9
%else
    mova m%8, %10
%endif
    ; park row 4 in scratch memory so its register can serve as temporary
    mova %10, m%4
    TRANSPOSE4X4W %5, %6, %7, %8, %4
    punpckhqdq m%4, m%1, m%5
    punpcklqdq m%1, m%5
    punpckhqdq m%5, m%2, m%6
    punpcklqdq m%2, m%6
    punpckhqdq m%6, m%3, m%7
    punpcklqdq m%3, m%7
    mova m%7, %10
%ifnidn %11, ""
    mov%13 %11, m%6
%else
    mova %10, m%6
%endif
    punpckhqdq m%6, m%7, m%8
    punpcklqdq m%7, m%8

    ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8
    SWAP %2, %4, %5, %3
    SWAP %6, %8
%endmacro
%endif ; x86-32/64

; transpose and write m8-11, everything else is scratch
; Writes eight rows of four pixels each, starting 4 bytes to the left of
; dstq, and leaves dstq advanced by eight rows.
%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp
    ; transpose 8x4
    punpcklwd %5, %1, %2
    punpckhwd %1, %2
    punpcklwd %2, %3, %4
    punpckhwd %3, %4
    punpckldq %4, %5, %2
    punpckhdq %5, %2
    punpckldq %2, %1, %3
    punpckhdq %1, %3

    ; write out
    movq [dstq+strideq*0-4], %4
    movhps [dstq+strideq*1-4], %4
    movq [dstq+strideq*2-4], %5
    movhps [dstq+stride3q -4], %5
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0-4], %2
    movhps [dstq+strideq*1-4], %2
    movq [dstq+strideq*2-4], %1
    movhps [dstq+stride3q -4], %1
    lea dstq, [dstq+strideq*4]
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
%if ARCH_X86_64
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
    mova P1, [dstq+mstrideq*2] ; p1
    mova P0, [dstq+mstrideq*1] ; p0
    mova Q0, [dstq+strideq*0] ; q0
    mova Q1, [dstq+strideq*1] ; q1
%else ; x86-32
%define P1 [dstq+mstrideq*2]
%define P0 [dstq+mstrideq*1]
%define Q0 [dstq+strideq*0]
%define Q1 [dstq+strideq*1]
%endif ; x86-32/64
%else ; %1 != 4
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea tmpq, [dstq+mstrideq*4]
%if ARCH_X86_64
    ; we load p3 later
%define P2 m13
%define P1 m8
%define P0 m9
%define Q0 m10
%define Q1 m11
%define Q2 m14
    mova P2, [tmpq+strideq*1]
    mova P1, [tmpq+strideq*2]
    mova P0, [tmpq+stride3q]
    mova Q0, [dstq+strideq*0]
    mova Q1, [dstq+strideq*1]
    mova Q2, [dstq+strideq*2]
%if %1 != 6
%define P3 [tmpq+strideq*0]
%define Q3 m15
    mova Q3, 
[dstq+stride3q] %endif ; %1 != 6 %else ; x86-32 %define P2 [tmpq+strideq*1] %define P1 [dstq+mstrideq*2] %define P0 [dstq+mstrideq*1] %define Q0 [dstq+strideq*0] %define Q1 [dstq+strideq*1] %define Q2 [dstq+strideq*2] %if %1 != 6 %define P3 [dstq+mstrideq*4] %define Q3 [dstq+stride3q] %endif ; %1 != 6 %endif ; x86-32/64 %endif ; %1 ==/!= 4 %else ; %2 != v ; load lines %if %1 == 4 movq m0, [dstq+strideq*0-4] movq m2, [dstq+strideq*1-4] movq m4, [dstq+strideq*2-4] movq m5, [dstq+stride3q -4] lea tmpq, [dstq+strideq*4] movq m3, [tmpq+strideq*0-4] movq m6, [tmpq+strideq*1-4] movq m1, [tmpq+strideq*2-4] movq m7, [tmpq+stride3q -4] ; transpose 4x8 ; m0: A-D0 ; m2: A-D1 ; m4: A-D2 ; m5: A-D3 ; m3: A-D4 ; m6: A-D5 ; m1: A-D6 ; m7: A-D7 punpcklwd m0, m2 punpcklwd m4, m5 punpcklwd m3, m6 punpcklwd m1, m7 ; m0: A0-1,B0-1,C0-1,D0-1 ; m4: A2-3,B2-3,C2-3,D2-3 ; m3: A4-5,B4-5,C4-5,D4-5 ; m1: A6-7,B6-7,C6-7,D6-7 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m4, m3, m1 punpckldq m3, m1 ; m0: A0-3,B0-3 ; m2: C0-3,D0-3 ; m3: A4-7,B4-7 ; m4: C4-7,D4-7 punpckhqdq m1, m0, m3 punpcklqdq m0, m3 punpckhqdq m3, m2, m4 punpcklqdq m2, m4 ; m0: A0-7 ; m1: B0-7 ; m2: C0-7 ; m3: D0-7 %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 2, 10 SWAP 3, 11 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 %else %define P1 [esp+3*mmsize] %define P0 [esp+4*mmsize] %define Q0 [esp+5*mmsize] %define Q1 [esp+6*mmsize] mova P1, m0 mova P0, m1 mova Q0, m2 mova Q1, m3 %endif %elif %1 == 6 || %1 == 8 movu m0, [dstq+strideq*0-8] movu m1, [dstq+strideq*1-8] movu m2, [dstq+strideq*2-8] movu m3, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu m4, [tmpq+strideq*0-8] movu m5, [tmpq+strideq*1-8] movu m6, [tmpq+strideq*2-8] %if ARCH_X86_64 movu m7, [tmpq+stride3q -8] %endif ; transpose 8x16 ; m0: A-H0,A-H8 ; m1: A-H1,A-H9 ; m2: A-H2,A-H10 ; m3: A-H3,A-H11 ; m4: A-H4,A-H12 ; m5: A-H5,A-H13 ; m6: A-H6,A-H14 ; m7: A-H7,A-H15 %if ARCH_X86_64 punpcklwd m8, m0, m1 %else punpcklwd m7, m0, m1 %endif punpckhwd m0, m1 
punpcklwd m1, m2, m3 punpckhwd m2, m3 punpcklwd m3, m4, m5 punpckhwd m4, m5 %if ARCH_X86_64 punpcklwd m5, m6, m7 punpckhwd m6, m7 %else mova [rsp+3*16], m4 movu m4, [tmpq+stride3q -8] punpcklwd m5, m6, m4 punpckhwd m6, m4 %endif ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] ; m0: E0-1,F0-1,G0-1,H0-1 ; m1: A2-3,B2-3,C2-3,D2-3 ; m2: E2-3,F2-3,G2-3,H2-3 ; m3: A4-5,B4-5,C4-5,D4-5 ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] ; m5: A6-7,B6-7,C6-7,D6-7 ; m6: E6-7,F6-7,G6-7,H6-7 %if ARCH_X86_64 punpckldq m7, m8, m1 punpckhdq m8, m1 %else punpckldq m4, m7, m1 punpckhdq m7, m1 %endif punpckldq m1, m0, m2 punpckhdq m0, m2 punpckldq m2, m3, m5 punpckhdq m3, m5 %if ARCH_X86_64 punpckldq m5, m4, m6 punpckhdq m4, m6 %else mova [rsp+4*16], m3 mova m3, [rsp+3*16] punpckldq m5, m3, m6 punpckhdq m3, m6 %endif ; m7: A0-3,B0-3 [m4 on x86-32] ; m8: C0-3,D0-3 [m7 on x86-32] ; m1: E0-3,F0-3 ; m0: G0-3,H0-3 ; m2: A4-7,B4-7 ; m3: C4-7,D4-7 [r4 on x86-32] ; m5: E4-7,F4-7 ; m4: G4-7,H4-7 [m3 on x86-32] %if ARCH_X86_64 %if %1 != 6 punpcklqdq m6, m7, m2 %endif punpckhqdq m7, m2 punpcklqdq m2, m8, m3 punpckhqdq m8, m3 punpcklqdq m3, m1, m5 punpckhqdq m1, m5 %if %1 != 6 punpckhqdq m5, m0, m4 %endif punpcklqdq m0, m4 %if %1 == 8 mova [rsp+1*16], m6 %define P3 [rsp+1*16] %endif ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 SWAP 7, 13 SWAP 8, 2, 9 SWAP 3, 10 SWAP 1, 11 SWAP 0, 14 SWAP 5, 15 %define P2 m13 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 %define Q2 m14 %if %1 == 8 %define Q3 m15 %endif %else ; x86-32 %if %1 == 8 %define P3 [rsp+ 6*16] punpcklqdq m6, m4, m2 mova P3, m6 %endif mova m6, [rsp+4*16] punpckhqdq m4, m2 punpcklqdq m2, m7, m6 punpckhqdq m7, m6 punpcklqdq m6, m1, m5 punpckhqdq m1, m5 %if %1 == 8 %define Q3 [rsp+24*16] punpckhqdq m5, m0, m3 mova Q3, m5 %endif punpcklqdq m0, m3 %if %1 == 8 %define P2 [rsp+18*16] %define P1 [rsp+19*16] %define P0 [rsp+20*16] %define Q0 [rsp+21*16] %define Q1 [rsp+22*16] %define Q2 [rsp+23*16] %else %define P2 [rsp+3*16] %define P1 [rsp+4*16] %define P0 
[rsp+5*16] %define Q0 [rsp+6*16] %define Q1 [rsp+7*16] %define Q2 [rsp+8*16] %endif mova P2, m4 mova P1, m2 mova P0, m7 mova Q0, m6 mova Q1, m1 mova Q2, m0 %endif ; x86-32/64 %else ; %1 == 16 ; We only use 14 pixels but we'll need the remainder at the end for ; the second transpose mova m0, [dstq+strideq*0-16] mova m1, [dstq+strideq*1-16] mova m2, [dstq+strideq*2-16] mova m3, [dstq+stride3q -16] lea tmpq, [dstq+strideq*4] mova m4, [tmpq+strideq*0-16] mova m5, [tmpq+strideq*1-16] mova m6, [tmpq+strideq*2-16] %if ARCH_X86_64 mova m7, [tmpq+stride3q -16] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 SWAP 5, 13 SWAP 6, 8 SWAP 7, 9 %define P2 m13 %define P1 m8 %define P0 m9 %else ; x86-32 %define P2 [esp+18*16] %define P1 [esp+19*16] %define P0 [esp+20*16] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ [tmpq+stride3q -16], P2, "", a, a mova P1, m6 mova P0, m7 %endif ; x86-32/64 mova [rsp+ 7*16], m0 mova [rsp+ 8*16], m1 mova [rsp+ 9*16], m2 mova [rsp+10*16], m3 %define P3 [rsp+6*16] mova P3, m4 mova m0, [dstq+strideq*0] mova m1, [dstq+strideq*1] mova m2, [dstq+strideq*2] mova m3, [dstq+stride3q ] lea tmpq, [dstq+strideq*4] mova m4, [tmpq+strideq*0] mova m5, [tmpq+strideq*1] mova m6, [tmpq+strideq*2] %if ARCH_X86_64 mova m7, [tmpq+stride3q ] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 SWAP 0, 10 SWAP 1, 11 SWAP 2, 14 SWAP 3, 15 %define Q0 m10 %define Q1 m11 %define Q2 m14 %define Q3 m15 %else ; x86-32 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ [tmpq+stride3q ], [rsp+12*16], "", a, a %define Q0 [esp+21*16] %define Q1 [esp+22*16] %define Q2 [esp+23*16] %define Q3 [esp+24*16] mova Q0, m0 mova Q1, m1 mova Q2, m2 mova Q3, m3 %endif ; x86-32/64 mova [rsp+11*16], m4 %if ARCH_X86_64 mova [rsp+12*16], m5 %endif mova [rsp+13*16], m6 mova [rsp+14*16], m7 %endif ; %1 == 4/6/8/16 %endif ; %2 ==/!= v ; load L/E/I/H %if ARCH_X86_32 %define l_strideq r5 mov l_strideq, dword lstridem %ifidn %2, v %define lq r3 mov lq, dword lm %endif %endif %ifidn %2, v %if cpuflag(sse4) pmovzxbw m1, [lq] pmovzxbw m0, 
[lq+l_strideq] pxor m2, m2 %else ; ssse3 movq m1, [lq] movq m0, [lq+l_strideq] pxor m2, m2 REPX {punpcklbw x, m2}, m1, m0 %endif ; ssse3/sse4 %else ; %2 != v movq m0, [lq] ; l0, l1 movq m1, [lq+l_strideq] ; l2, l3 punpckldq m0, m1 ; l0, l2, l1, l3 pxor m2, m2 punpcklbw m1, m0, m2 ; l0, l2 punpckhbw m0, m2 ; l1, l3 %endif ; %2==/!=v %if ARCH_X86_32 %ifidn %2, v %undef lq mov mstrideq, mstridem %endif %endif pcmpeqw m5, m2, m0 pand m1, m5 por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] pcmpeqw m5, m2, m0 ; !L psrlw m5, 1 %if ARCH_X86_64 psrlw m2, m0, [lutq+128] SPLATW m1, [lutq+136] %else ; x86-32 mov r5, lutm psrlw m2, m0, [r5+128] SPLATW m1, [r5+136] %endif ; x86-32/64 pminsw m2, m1 pmaxsw m2, [PIC_sym(pw_1)] ; I psrlw m1, m0, 4 ; H paddw m0, [PIC_sym(pw_2)] paddw m0, m0 paddw m0, m2 ; E REPX {pmullw x, [bdmulq]}, m0, m1, m2 %if ARCH_X86_32 %undef l_strideq lea stride3q, [strideq*3] %endif psubw m3, P1, P0 ; p1-p0 psubw m4, Q0, Q1 ; q0-q1 REPX {pabsw x, x}, m3, m4 pmaxsw m3, m5 pmaxsw m3, m4 pcmpgtw m7, m3, m1 ; hev %if %1 != 4 psubw m4, P2, P0 ; p2-p0 pabsw m4, m4 pmaxsw m4, m3 %if %1 != 6 mova m6, P3 ; p3 psubw m5, m6, P0 ; p3-p0 pabsw m5, m5 pmaxsw m4, m5 %endif ; %1 != 6 psubw m5, Q0, Q2 ; q0-q2 pabsw m5, m5 pmaxsw m4, m5 %if %1 != 6 psubw m5, Q0, Q3 ; q0-q3 pabsw m5, m5 pmaxsw m4, m5 %endif ; %1 != 6 pcmpgtw m4, [bdmulq] ; !flat8in psubw m5, P2, P1 ; p2-p1 pabsw m5, m5 %if %1 != 6 psubw m6, P2 ; p3-p2 pabsw m6, m6 pmaxsw m5, m6 psubw m6, Q2, Q3 ; q2-q3 pabsw m6, m6 pmaxsw m5, m6 %endif ; %1 != 6 psubw m6, Q2, Q1 ; q2-q1 pabsw m6, m6 pmaxsw m5, m6 %if %1 == 16 SPLATD m6, [maskq+8] SPLATD m1, [maskq+4] por m6, m1 pand m6, m12 pcmpeqd m6, m12 pand m5, m6 %else ; %1 != 16 SPLATD m6, [maskq+4] pand m6, m12 pcmpeqd m6, m12 pand m5, m6 ; only apply fm-wide to wd>4 blocks %endif ; %1==/!=16 pmaxsw m3, m5 %endif ; %1 != 4 pcmpgtw m3, m2 psubw m5, P1, Q1 ; p1-q1 psubw m6, P0, Q0 ; p0-q0 REPX {pabsw x, x}, m5, m6 paddw m6, 
m6 psrlw m5, 1 paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E por m3, m5 %if %1 == 16 %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] mova m1, [tmpq+strideq*2] mova m2, [tmpq+stride3q] %else ; %2 != v mova m0, [rsp+ 8*16] mova m1, [rsp+ 9*16] mova m2, [rsp+10*16] %endif ; %2==/!=v REPX {psubw x, P0}, m0, m1, m2 REPX {pabsw x, x}, m0, m1, m2 pmaxsw m1, m0 pmaxsw m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] mova m2, [tmpq+strideq*1] mova m5, [tmpq+strideq*2] %else ; %2 != v mova m0, [rsp+11*16] mova m2, [rsp+12*16] mova m5, [rsp+13*16] %endif ; %2==/!=v REPX {psubw x, Q0}, m0, m2, m5 REPX {pabsw x, x}, m0, m2, m5 pmaxsw m0, m2 pmaxsw m1, m5 pmaxsw m1, m0 pcmpgtw m1, [bdmulq] ; !flat8out por m1, m4 ; !flat8in | !flat8out SPLATD m2, [maskq+8] pand m5, m2, m12 pcmpeqd m5, m12 pandn m1, m5 ; flat16 pandn m5, m3, m1 ; flat16 & fm SWAP 1, 5 SPLATD m5, [maskq+4] por m5, m2 pand m2, m5, m12 pcmpeqd m2, m12 pandn m4, m2 ; flat8in pandn m2, m3, m4 SWAP 2, 4 SPLATD m2, [maskq+0] por m2, m5 pand m2, m12 pcmpeqd m2, m12 pandn m3, m2 pandn m0, m4, m3 ; fm & !flat8 & !flat16 SWAP 0, 3 pandn m0, m1, m4 ; flat8 & !flat16 SWAP 0, 4 %elif %1 != 4 SPLATD m0, [maskq+4] pand m2, m0, m12 pcmpeqd m2, m12 pandn m4, m2 pandn m2, m3, m4 ; flat8 & fm SWAP 2, 4 SPLATD m2, [maskq+0] por m0, m2 pand m0, m12 pcmpeqd m0, m12 pandn m3, m0 pandn m0, m4, m3 ; fm & !flat8 SWAP 0, 3 %else ; %1 == 4 SPLATD m0, [maskq+0] pand m0, m12 pcmpeqd m0, m12 pandn m3, m0 ; fm %endif ; %1==/!=4 ; short filter %if ARCH_X86_64 SPLATW m0, r7m %else SPLATW m0, bdmulm %endif pcmpeqw m2, m2 psrlw m0, 1 ; 511 or 2047 pxor m2, m0 ; -512 or -2048 psubw m5, Q0, P0 ; q0-p0 paddw m6, m5, m5 paddw m6, m5 ; 3*(q0-p0) psubw m5, P1, Q1 ; iclip_diff(p1-q1) pminsw m5, m0 pmaxsw m5, m2 pand m5, m7 ; f=iclip_diff(p1-q1)&hev paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) pminsw m5, m0 pmaxsw m5, m2 pand m3, m5 ; f&=fm paddw m5, m3, [PIC_sym(pw_3)] 
paddw m3, [PIC_sym(pw_4)] REPX {pminsw x, m0}, m5, m3 psraw m5, 3 ; f2 psraw m3, 3 ; f1 psubw m0, m2 ; 1023 or 4095 pxor m2, m2 %if ARCH_X86_64 paddw P0, m5 psubw Q0, m3 %else paddw m5, P0 psubw m6, Q0, m3 REPX {pminsw x, m0}, m5, m6 REPX {pmaxsw x, m2}, m5, m6 %endif paddw m3, [PIC_sym(pw_1)] psraw m3, 1 ; f=(f1+1)>>1 pandn m7, m3 ; f&=!hev SWAP 7, 3 %if ARCH_X86_64 paddw P1, m3 psubw Q1, m3 REPX {pminsw x, m0}, P1, P0, Q0, Q1 REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 %else psubw m7, Q1, m3 paddw m3, P1 REPX {pminsw x, m0}, m7, m3 REPX {pmaxsw x, m2}, m7, m3 %if %1 > 4 mova P1, m3 mova P0, m5 mova Q0, m6 mova Q1, m7 %endif %endif %if %1 == 16 ; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 ; m12=filter bits mask ; m13-15=p2/q2/q3 ; m0,2-3,5-7 = free ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 mova m6, [tmpq+strideq*4] ; p3 lea tmpq, [dstq+mstrideq*4] %else ; %2 != v mova m0, [rsp+ 8*16] mova m2, [rsp+ 9*16] mova m7, [rsp+10*16] mova m6, [rsp+ 6*16] %endif ; %2==/!=v mova [rsp+ 0*16], m4 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psllw m3, m0, 3 ; p6*8 paddw m3, [PIC_sym(pw_8)] paddw m5, m2, m7 ; p5+p4 psubw m3, m0 paddw m5, m5 ; (p5+p4)*2 paddw m3, m6 ; p6*7+p3 paddw m5, P2 ; (p5+p4)*2+p2 paddw m3, P1 ; p6*7+p3+p1 paddw m5, P0 ; (p5+p4)*2+p2+p0 paddw m3, Q0 ; p6*7+p3+p1+q0 paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m2 por m5, m4 %ifidn %2, v mova [tmpq+mstrideq*2], m5 ; p5 %else ; %2 != v mova [rsp+9*16], m5 %endif ; %2==/!=v ; sub p6*2, add p3/q1 paddw m3, m6 paddw m5, m0, m0 paddw m3, Q1 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m7 por m5, m4 %ifidn %2, v mova [tmpq+mstrideq*1], m5 ; p4 %else ; %2 != v mova [rsp+10*16], m5 %endif ; %2==/!=v ; sub p6/p5, add p2/q2 psubw m3, m0 paddw m5, P2, Q2 psubw m3, m2 paddw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m6 por m5, m4 %ifidn %2, v mova [tmpq+strideq*0], m5 ; p3 %else ; 
%2 != v mova [rsp+6*16], m5 %endif ; %2==/!=v %define WRITE_IN_PLACE 0 %ifidn %2, v %if ARCH_X86_64 %define WRITE_IN_PLACE 1 %endif %endif ; sub p6/p4, add p1/q3 paddw m3, P1 paddw m5, m0, m7 paddw m3, Q3 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P2 por m5, m4 %if WRITE_IN_PLACE mova [tmpq+strideq*1], m5 %else mova [rsp+1*16], m5 ; don't clobber p2/m13 %endif ; sub p6/p3, add p0/q4 paddw m3, P0 paddw m5, m0, m6 %ifidn %2, v paddw m3, [dstq+strideq*4] %else ; %2 != v paddw m3, [rsp+11*16] %endif ; %2==/!=v psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P1 por m5, m4 %if WRITE_IN_PLACE mova [dstq+mstrideq*2], m5 %else mova [rsp+2*16], m5 ; don't clobber p1/m3 %endif ; sub p6/p2, add q0/q5 paddw m3, Q0 paddw m5, m0, P2 %ifidn %2, v %if ARCH_X86_32 lea r4, P2 %endif lea tmpq, [dstq+strideq*4] paddw m3, [tmpq+strideq*1] %else ; %2 != v paddw m3, [rsp+12*16] %endif ; %2==/!=v psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P0 por m5, m4 %if WRITE_IN_PLACE mova [dstq+mstrideq*1], m5 %else mova [rsp+3*16], m5 ; don't clobber p0/m4 %endif ; sub p6/p1, add q1/q6 paddw m3, Q1 paddw m5, m0, P1 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 %else ; %2 != v mova m0, [rsp+13*16] ; q6 %endif ; %2==/!=v paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, Q0 por m5, m4 %if WRITE_IN_PLACE mova [dstq], m5 %else mova [rsp+4*16], m5 ; don't clobber q0/m5 %endif ; sub p5/p0, add q2/q6 paddw m3, Q2 paddw m5, m2, P0 paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, Q1 por m2, m5, m4 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m3, Q3 paddw m7, Q0 paddw m3, m0 psubw m3, m7 psrlw m7, m3, 4 pand m7, m1 pandn m4, m1, Q2 por m7, m4 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v paddw m3, [tmpq+strideq*0] %else ; %2 != v paddw m3, [rsp+11*16] %endif ; %2==/!=v paddw m6, Q1 paddw m3, m0 psubw m3, m6 psrlw m6, m3, 4 pand m6, m1 pandn m4, m1, Q3 por m6, m4 %if WRITE_IN_PLACE mova [tmpq+mstrideq], m6 ; q3 %else ; %2 != v 
mova [rsp+5*16], m6 %endif ; %2==/!=v ; sub p2/q2, add q5/q6 %ifidn %2, v paddw m3, [tmpq+strideq*1] %if ARCH_X86_64 paddw m5, P2, Q2 %else ; because tmpq is clobbered, so we use a backup pointer for P2 instead paddw m5, [r4], Q2 mov pic_regq, pic_regm %endif %else ; %2 != v paddw m3, [rsp+12*16] paddw m5, P2, Q2 %endif ; %2==/!=v paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 %ifidn %2, v pandn m4, m1, [tmpq+strideq*0] %else ; %2 != v pandn m4, m1, [rsp+11*16] %endif ; %2==/!=v por m5, m4 %ifidn %2, v mova [tmpq+strideq*0], m5 ; q4 %else ; %2 != v mova [rsp+11*16], m5 %endif ; %2==/!=v ; sub p1/q3, add q6*2 psubw m3, P1 paddw m0, m0 psubw m3, Q3 paddw m3, m0 psrlw m5, m3, 4 pand m5, m1 %ifidn %2, v pandn m4, m1, [tmpq+strideq*1] %else ; %2 != v pandn m4, m1, [rsp+12*16] %endif ; %2==/!=v por m5, m4 %ifidn %2, v mova [tmpq+strideq*1], m5 ; q5 %else ; %2 != v mova [rsp+12*16], m5 %endif ; %2==/!=v mova m4, [rsp+0*16] %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %endif %if ARCH_X86_64 SWAP 2, 11 SWAP 7, 14 SWAP 6, 15 %else ; x86-32 mova Q1, m2 mova Q2, m7 %endif ; x86-32/64 %if WRITE_IN_PLACE mova P2, [tmpq+strideq*1] mova P1, [tmpq+strideq*2] mova P0, [tmpq+stride3q] mova Q0, [dstq] %elif ARCH_X86_64 mova P2, [rsp+1*16] mova P1, [rsp+2*16] mova P0, [rsp+3*16] mova Q0, [rsp+4*16] %else ; !WRITE_IN_PLACE & x86-32 mova m0, [rsp+1*16] mova m1, [rsp+2*16] mova m2, [rsp+3*16] mova m3, [rsp+4*16] mova m7, [rsp+5*16] mova P2, m0 mova P1, m1 mova P0, m2 mova Q0, m3 mova Q3, m7 %endif ; WRITE_IN_PLACE / x86-32/64 %undef WRITE_IN_PLACE %endif ; %1 == 16 %if %1 >= 8 ; flat8 filter mova m0, P3 ; p3 paddw m1, m0, P2 ; p3+p2 paddw m2, P1, P0 ; p1+p0 paddw m3, m1, m1 ; 2*(p3+p2) paddw m2, m0 ; p1+p0+p3 paddw m3, Q0 ; 2*(p3+p2)+q0 paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 pmulhrsw m7, m2, [PIC_sym(pw_4096)] psubw m7, P2 pand m7, m4 paddw m3, P1, Q1 ; p1+q1 psubw m2, m1 ; 2*p3+p2+p1+p0+q0 paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 pmulhrsw m3, m2, [PIC_sym(pw_4096)] psubw m3, P1 pand m3, m4 
paddw m5, m0, P1 ; p3+p1 paddw m6, P0, Q2 ; p0+q2 psubw m2, m5 ; p3+p2+p1+p0+q0+q1 paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 pmulhrsw m5, m2, [PIC_sym(pw_4096)] psubw m5, P0 pand m5, m4 paddw m6, m0, P0 ; p3+p0 paddw m1, Q0, Q3 ; q0+q3 psubw m2, m6 ; p2+p1+p0+q0+q1+q2 paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 pmulhrsw m6, m2, [PIC_sym(pw_4096)] psubw m6, Q0 pand m6, m4 paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 paddw m1, P2, Q0 ; p2+q0 psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 pmulhrsw m1, m2, [PIC_sym(pw_4096)] psubw m1, Q1 pand m1, m4 psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 psubw m2, Q1 ; p0+q0+q1+q2+2*q3 paddw m0, Q3, Q2 ; q3+q2 paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 pmulhrsw m2, [PIC_sym(pw_4096)] psubw m2, Q2 pand m2, m4 paddw m7, P2 paddw m3, P1 paddw m5, P0 paddw m6, Q0 paddw m1, Q1 paddw m2, Q2 %ifidn %2, v mova [tmpq+strideq*1], m7 ; p2 mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m5 ; p0 mova [dstq+strideq*0], m6 ; q0 mova [dstq+strideq*1], m1 ; q1 mova [dstq+strideq*2], m2 ; q2 %else ; %2 != v mova m0, P3 %if %1 == 8 lea tmpq, [dstq+strideq*4] %if ARCH_X86_64 SWAP 4, 15 TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 %else TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ Q3, [tmpq+strideq*1-8], a, u %endif ; write 8x8 movu [dstq+strideq*0-8], m0 movu [dstq+strideq*1-8], m7 movu [dstq+strideq*2-8], m3 movu [dstq+stride3q -8], m5 movu [tmpq+strideq*0-8], m6 %if ARCH_X86_64 movu [tmpq+strideq*1-8], m1 %endif movu [tmpq+strideq*2-8], m2 movu [tmpq+stride3q -8], m4 lea dstq, [dstq+strideq*8] %else ; %1 != 8 %if ARCH_X86_64 SWAP 6, 8 SWAP 1, 9 SWAP 2, 10 %else mova [rsp+1*16], m6 mova [rsp+2*16], m1 mova [rsp+3*16], m2 %endif mova m1, [rsp+ 7*16] mova m2, [rsp+ 8*16] mova m4, [rsp+ 9*16] mova m6, [rsp+10*16] lea tmpq, [dstq+strideq*4] %if ARCH_X86_64 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 %else mova [rsp+7*16], m5 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ [rsp+7*16], [tmpq+strideq*1-16], a, a %endif mova [dstq+strideq*0-16], m1 mova 
[dstq+strideq*1-16], m2 mova [dstq+strideq*2-16], m4 mova [dstq+stride3q -16], m6 mova [tmpq+strideq*0-16], m0 %if ARCH_X86_64 mova [tmpq+strideq*1-16], m7 %endif mova [tmpq+strideq*2-16], m3 mova [tmpq+stride3q -16], m5 %if ARCH_X86_64 SWAP 6, 8 SWAP 1, 9 SWAP 2, 10 SWAP 4, 15 %else mova m6, [rsp+1*16] mova m1, [rsp+2*16] mova m2, [rsp+3*16] mova m4, Q3 %endif mova m0, [rsp+11*16] mova m3, [rsp+12*16] mova m5, [rsp+13*16] %if ARCH_X86_64 mova m7, [rsp+14*16] TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 %else TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ [rsp+14*16], [tmpq+strideq*1], a, a %endif mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m4 mova [tmpq+strideq*0], m0 %if ARCH_X86_64 mova [tmpq+strideq*1], m3 %endif mova [tmpq+strideq*2], m5 mova [tmpq+stride3q ], m7 lea dstq, [dstq+strideq*8] %endif ; %1==/!=8 %endif ; %2==/!=v %elif %1 == 6 ; flat6 filter paddw m3, P1, P0 ; p1+p0 paddw m3, P2 ; p2+p1+p0 paddw m6, P2, Q0 ; p2+q0 paddw m3, m3 ; 2*(p2+p1+p0) paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 pmulhrsw m2, m3, [PIC_sym(pw_4096)] psubw m2, P1 pand m2, m4 paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) paddw m6, P2, P2 ; 2*p2 paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 pmulhrsw m5, m3, [PIC_sym(pw_4096)] psubw m5, P0 pand m5, m4 paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) paddw m6, P2, P1 ; p2+p1 paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 pmulhrsw m6, m3, [PIC_sym(pw_4096)] psubw m6, Q0 pand m6, m4 psubw m3, P1 ; 2*(p0+q0+q1)+q2 %if ARCH_X86_64 paddw Q2, Q2 ; q2*2 %else mova m0, Q2 paddw m0, m0 %endif psubw m3, P0 ; p0+2*(q0+q1)+q2 %if ARCH_X86_64 paddw m3, Q2 ; p0+q*(q0+q1+q2)+q2 %else paddw m3, m0 %endif pmulhrsw m3, [PIC_sym(pw_4096)] psubw m3, Q1 pand m3, m4 paddw m2, P1 paddw m5, P0 paddw m6, Q0 paddw m3, Q1 %ifidn %2, v mova [dstq+mstrideq*2], m2 ; p1 mova [dstq+mstrideq*1], m5 ; p0 mova [dstq+strideq*0], m6 ; q0 mova [dstq+strideq*1], m3 ; q1 %else ; %2 != v 
TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0 %endif ; %2==/!=v %else ; %1 == 4 %if ARCH_X86_64 %ifidn %2, v mova [dstq+mstrideq*2], P1 ; p1 mova [dstq+mstrideq*1], P0 ; p0 mova [dstq+strideq*0], Q0 ; q0 mova [dstq+strideq*1], Q1 ; q1 %else ; %2 != v TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0 %endif ; %2==/!=v %else ; x86-32 %ifidn %2, v mova [dstq+mstrideq*2], m3 mova [dstq+mstrideq*1], m5 mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m7 %else ; %2 != v TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0 %endif ; %2==/!=v %endif ; x86-32/64 %endif ; %1 %undef P3 %undef P2 %undef P1 %undef P0 %undef Q0 %undef Q1 %undef Q2 %undef Q3 %endmacro INIT_XMM ssse3 ; stack layout: ; r0 - flat8 backup inside flat16 code %if ARCH_X86_64 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq %else ; stack layout [32bit only]: ; r1-4 - p2-q0 post-filter16 ; r5 - p3 ; r6 - q3 post-filter16 ; r7 - GPRs [mask_bitsm, mstridem] ; r8 - m12/pb_mask ; r9 - bdmulq cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \ dst, stride, mask, mstride, pic_reg, stride3, tmp RELOC_ARGS v, 10*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base %define pic_regm dword [esp+7*16+2*gprsize] mov pic_regm, pic_regq mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+9*16 mova [bdmulq], m0 shl dword lstridem, 2 sub r3, dword lstridem mov dword lm, r3 %endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mstridem dword [esp+7*16+1*gprsize] mov mstridem, mstrideq %define mask_bitsm dword [esp+7*16+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+8*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test 
[maskq+8], mask_bitsd ; vmask[2] %else mov r6d, mask_bitsm test [maskq+8], r6d %endif jz .no_flat16 FILTER 16, v jmp .end .no_flat16: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else test [maskq+4], r6d %endif jz .no_flat FILTER 8, v jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .end FILTER 4, v .end: %if ARCH_X86_64 pslld m12, 2 add lq, 8 %else mova m0, m12 pslld m0, 2 mova m12, m0 add dword lm, 8 %endif add dstq, 16 %if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 %else shl mask_bitsm, 2 sub dword wm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 ; stack layout: ; r0 - flat8 backup inside flat16 ; r1-4 - p2-q0 post-filter16 backup ; r5 - q3 post-filter16 backup ; r6 - p3 ; r7-10 - p7-4 ; r11-14 - q4-7 %if ARCH_X86_64 cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov hd, hm shl l_strideq, 2 %else ; stack layout [32bit only]: ; r15 - GPRs [mask_bitsm] ; r16 - m12/pb_mask ; r17 - bdmulq ; r18-24 - p2-q3 cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ dst, stride, mask, l, pic_reg, stride3, tmp RELOC_ARGS h, 25*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+17*16 mova [bdmulq], m0 shl dword lstridem, 2 %endif sub lq, 4 lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+15*16+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+16*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+8], mask_bitsd ; vmask[2] %else mov r6d, mask_bitsm test [maskq+8], r6d %endif jz .no_flat16 FILTER 16, h jmp .end .no_flat16: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] 
%else test [maskq+4], r6d %endif jz .no_flat FILTER 8, h jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .no_filter FILTER 4, h jmp .end .no_filter: lea dstq, [dstq+strideq*8] .end: %if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 %else mova m0, m12 pslld m0, 2 mova m12, m0 add lq, dword lstridem add lq, dword lstridem shl mask_bitsm, 2 sub dword hm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq %else ; stack layout [32bit only]: ; r0 - GPRs [mask_bitsm, mstridem] ; r1 - m12/pb_mask ; r2 - bdmulq cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ dst, stride, mask, mstride, pic_reg, stride3, tmp RELOC_ARGS v, 3*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+2*16 mova [bdmulq], m0 shl dword lstridem, 2 sub r3, dword lstridem mov dword lm, r3 %endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+0*gprsize] %define mstridem dword [esp+1*gprsize] mov mask_bitsm, 0x3 mov mstridem, mstrideq mova m0, [PIC_sym(pb_mask)] %define m12 [esp+1*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else mov r6d, mask_bitsm test [maskq+4], r6d %endif jz .no_flat FILTER 6, v jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .end FILTER 4, v .end: %if ARCH_X86_64 pslld m12, 2 add lq, 8 %else mova m0, m12 pslld m0, 2 mova m12, m0 add dword lm, 8 %endif 
add dstq, 16 %if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 %else shl mask_bitsm, 2 sub dword wm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov hd, hm shl l_strideq, 2 %else ; stack layout [32bit only]: ; r0 - GPRs [mask_bitsm] ; r1 - m12/pb_mask ; r2 - bdmulq ; r3-8 - p2-q2 cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ dst, stride, mask, l, pic_reg, stride3, tmp RELOC_ARGS h, 9*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+2*16 mova [bdmulq], m0 shl dword lstridem, 2 %endif sub lq, 4 lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+1*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else mov r6d, mask_bitsm test [maskq+4], r6d %endif jz .no_flat FILTER 6, h jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .no_filter FILTER 4, h jmp .end .no_filter: lea dstq, [dstq+strideq*8] .end: %if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 %else mova m0, m12 pslld m0, 2 mova m12, m0 add lq, dword lstridem add lq, dword lstridem shl mask_bitsm, 2 sub dword hm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/loopfilter_avx512.asm000066400000000000000000001407671517466257200252660ustar00rootroot00000000000000; Copyright © 2018, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080 dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60 hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51 hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49 hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 shift1: dq 0x0204081020408000 shift3: dq 0x0810204080000000 shift4: dq 0x1020408000000000 pb_1: times 4 db 1 pb_2: times 4 db 2 pb_3: times 4 db 3 pb_4: times 4 db 4 pb_16: times 4 db 16 pb_63: times 4 db 63 pb_64: times 4 db 64 pb_128: times 4 db 0x80 pb_2_1: times 2 db 2, 1 pb_3_1: times 2 db 3, 1 pb_7_1: times 2 db 7, 1 pb_m1_0: times 2 db -1, 0 pb_m1_1: times 2 db -1, 1 pb_m1_2: times 2 db -1, 2 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 SECTION .text %macro ABSSUB 4 ; dst, a, b, tmp psubusb %1, %2, %3 psubusb %4, %3, %2 por %1, %4 %endmacro %macro TRANSPOSE_16x4_AND_WRITE_4x32 5 punpcklbw m%5, m%1, m%2 punpckhbw m%1, m%2 punpcklbw m%2, m%3, m%4 punpckhbw m%3, m%4 punpcklwd m%4, m%5, m%2 punpckhwd m%5, m%2 punpcklwd m%2, m%1, m%3 punpckhwd m%1, m%3 kmovw k1, k6 lea t0, [dstq+strideq*4] vpscatterdd [dstq+m19-2]{k1}, m%4 kmovw k1, k6 lea t1, [dstq+strideq*8] vpscatterdd [t0 +m19-2]{k1}, m%5 kmovw k1, k6 lea t2, [t0 +strideq*8] vpscatterdd [t1 +m19-2]{k1}, m%2 kmovw k1, k6 vpscatterdd [t2 +m19-2]{k1}, m%1 %endmacro %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem %if %1 == 0 SWAP m16, m22 %endif punpcklbw m22, m24, m26 punpckhbw m24, m26 punpcklbw m26, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklbw m7, m8, m9 punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 
punpcklbw m11, m25, m13 punpckhbw m25, m13 %if %1 == 0 SWAP m13, m16 %else mova m13, %3 %endif SWAP m16, m25 punpcklbw m25, m14, m13 punpckhbw m13, m14, m13 ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13 punpcklwd m14, m22, m26 punpckhwd m22, m26 punpcklwd m26, m24, m2 punpckhwd m24, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpcklwd m6, m7, m9 punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 punpcklwd m10, m11, m25 punpckhwd m11, m25 SWAP m25, m16, m11 punpcklwd m11, m25, m13 punpckhwd m25, m13 ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25 punpckldq m13, m14, m2 punpckhdq m14, m2 punpckldq m2, m22, m3 punpckhdq m22, m3 punpckldq m3, m26, m5 punpckhdq m26, m5 punpckldq m5, m24, m4 punpckhdq m24, m4 punpckldq m4, m6, m10 punpckhdq m6, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 punpckldq m11, m8, m25 punpckhdq m8, m25 SWAP m25, m16, m8 punpckldq m8, m7, m25 punpckhdq m7, m25 ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3 punpcklqdq m25, m13, m4 punpckhqdq m13, m4 punpcklqdq m4, m14, m6 punpckhqdq m14, m6 punpcklqdq m6, m2, m8 punpckhqdq m2, m8 punpcklqdq m8, m22, m7 punpckhqdq m22, m7 punpcklqdq m7, m3, m10 punpckhqdq m3, m10 punpcklqdq m10, m26, m9 punpckhqdq m26, m9 punpcklqdq m9, m5, m11 punpckhqdq m5, m11 SWAP m11, m16 %if %2 == 0 SWAP m16, m25 %else mova %3, m25 %endif punpcklqdq m25, m24, m11 punpckhqdq m24, m11 %if %2 == 0 SWAP m11, m16 %endif ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24 SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22 SWAP 3, 14, 25, 9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %define is_h 0 %if %1 == 4 lea t0, [dstq+mstrideq*2] mova m3, [t0 +strideq*0] ; p1 mova m4, [t0 +strideq*1] ; p0 mova m5, [t0 +strideq*2] ; q0 mova m6, [t0 +stride3q ] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline %if %1 == 16 lea t0, [dstq+mstrideq*8] mova m16, [t0 +strideq*1] mova m17, [t0 +strideq*2] mova m18, 
[t0 +stride3q ] %endif lea t0, [dstq+mstrideq*4] %if %1 != 6 mova m25, [t0 +strideq*0] %endif mova m13, [t0 +strideq*1] mova m3, [t0 +strideq*2] mova m4, [t0 +stride3q ] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m22, [dstq+stride3q ] %endif %if %1 == 16 lea t0, [dstq+strideq*4] mova m29, [t0 +strideq*0] mova m30, [t0 +strideq*1] mova m31, [t0 +strideq*2] %endif %endif %else ; h %define is_h 1 ; load lines %if %1 == 4 vbroadcasti32x4 m0, [hshuf4] kmovw k1, k6 lea t0, [dstq+strideq*4] vpgatherdd m3{k1}, [dstq+m19-2] kmovw k1, k6 lea t1, [dstq+strideq*8] vpgatherdd m4{k1}, [t0 +m19-2] kmovw k1, k6 lea t2, [t0 +strideq*8] vpgatherdd m5{k1}, [t1 +m19-2] kmovw k1, k6 vpgatherdd m6{k1}, [t2 +m19-2] pshufb m3, m0 pshufb m4, m0 pshufb m5, m0 pshufb m6, m0 punpckldq m7, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpcklqdq m6, m7, m4 punpckhqdq m7, m4 punpcklqdq m4, m3, m5 punpckhqdq m3, m5 SWAP 3, 6 SWAP 5, 4, 7 ; 6,7,4,3 -> 3,4,5,6 %elif %1 == 6 || %1 == 8 kmovb k1, k7 lea t0, [dstq+strideq*1] vpgatherdq m3{k1}, [dstq+ym21-%1/2] kmovb k1, k7 lea t1, [dstq+strideq*2] vpgatherdq m4{k1}, [t0 +ym21-%1/2] kmovb k1, k7 lea t2, [dstq+stride3q ] vpgatherdq m5{k1}, [t1 +ym21-%1/2] kmovb k1, k7 vextracti32x8 ym0, m21, 1 vpgatherdq m6{k1}, [t2 +ym21-%1/2] kmovb k1, k7 vpgatherdq m12{k1}, [dstq+ym0 -%1/2] kmovb k1, k7 vpgatherdq m13{k1}, [t0 +ym0 -%1/2] kmovb k1, k7 vpgatherdq m14{k1}, [t1 +ym0 -%1/2] kmovb k1, k7 vpgatherdq m15{k1}, [t2 +ym0 -%1/2] ; transpose 8x16 ; xm3: A-H0,A-H8 ; xm4: A-H1,A-H9 ; xm5: A-H2,A-H10 ; xm6: A-H3,A-H11 ; xm12: A-H4,A-H12 ; xm13: A-H5,A-H13 ; xm14: A-H6,A-H14 ; xm15: A-H7,A-H15 punpcklbw m7, m3, m4 punpckhbw m3, m4 punpcklbw m4, m5, m6 punpckhbw m5, m6 punpcklbw m6, m12, m13 punpckhbw m12, m13 punpcklbw m13, m14, m15 punpckhbw m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 ; xm5: 
A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 punpcklwd m15, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 punpcklwd m5, m6, m13 punpckhwd m6, m13 punpcklwd m13, m12, m14 punpckhwd m12, m14 ; xm15: A0-3,B0-3,C0-3,D0-3 ; xm7: E0-3,F0-3,G0-3,H0-3 ; xm4: A8-11,B8-11,C8-11,D8-11 ; xm3: E8-11,F8-11,G8-11,H8-11 ; xm5: A4-7,B4-7,C4-7,D4-7 ; xm6: E4-7,F4-7,G4-7,H4-7 ; xm13: A12-15,B12-15,C12-15,D12-15 ; xm12: E12-15,F12-15,G12-15,H12-15 punpckldq m14, m15, m5 punpckhdq m15, m5 punpckldq m5, m7, m6 %if %1 != 6 punpckhdq m7, m6 %endif punpckldq m6, m4, m13 punpckhdq m4, m13 punpckldq m13, m3, m12 %if %1 != 6 punpckhdq m12, m3, m12 %endif ; xm14: A0-7,B0-7 ; xm15: C0-7,D0-7 ; xm5: E0-7,F0-7 ; xm7: G0-7,H0-7 ; xm6: A8-15,B8-15 ; xm4: C8-15,D8-15 ; xm13: E8-15,F8-15 ; xm12: G8-15,H8-15 punpcklqdq m3, m14, m6 punpckhqdq m14, m6 punpckhqdq m6, m15, m4 punpcklqdq m15, m4 punpcklqdq m4, m5, m13 punpckhqdq m13, m5, m13 %if %1 == 8 punpcklqdq m5, m7, m12 punpckhqdq m25, m7, m12 ; xm3: A0-15 ; xm14: B0-15 ; xm15: C0-15 ; xm6: D0-15 ; xm4: E0-15 ; xm13: F0-15 ; xm5: G0-15 ; xm25: H0-15 SWAP 25, 3, 15 SWAP 13, 14, 5, 4, 6 SWAP 15, 22 ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22 %else SWAP 13, 3, 14 SWAP 6, 4, 15, 5 ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 %endif %else ; 16, h ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the ; remainder at the end for the second transpose movu xm24, [dstq+strideq*0-8] movu xm26, [dstq+strideq*1-8] movu xm2, [dstq+strideq*2-8] movu xm3, [dstq+stride3q -8] lea t0, [dstq+strideq*4] movu xm4, [t0 +strideq*0-8] movu xm5, [t0 +strideq*1-8] movu xm6, [t0 +strideq*2-8] movu xm7, [t0 +stride3q -8] lea t0, [t0 +strideq*4] movu xm8, [t0 +strideq*0-8] movu xm9, [t0 +strideq*1-8] movu xm10, [t0 +strideq*2-8] movu xm11, [t0 +stride3q -8] lea t0, [t0 +strideq*4] movu xm25, [t0 +strideq*0-8] movu xm13, [t0 +strideq*1-8] movu xm14, [t0 +strideq*2-8] movu xm22, [t0 +stride3q -8] lea t0, [t0 +strideq*4] vinserti32x4 ym24, [t0 +strideq*0-8], 1 vinserti32x4 ym26, [t0 +strideq*1-8], 1 vinserti32x4 ym2, [t0 +strideq*2-8], 1 vinserti32x4 ym3, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym4, [t0 +strideq*0-8], 1 vinserti32x4 ym5, [t0 +strideq*1-8], 1 vinserti32x4 ym6, [t0 +strideq*2-8], 1 vinserti32x4 ym7, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym8, [t0 +strideq*0-8], 1 vinserti32x4 ym9, [t0 +strideq*1-8], 1 vinserti32x4 ym10, [t0 +strideq*2-8], 1 vinserti32x4 ym11, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym25, [t0 +strideq*0-8], 1 vinserti32x4 ym13, [t0 +strideq*1-8], 1 vinserti32x4 ym14, [t0 +strideq*2-8], 1 vinserti32x4 ym22, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 m24, [t0 +strideq*0-8], 2 vinserti32x4 m26, [t0 +strideq*1-8], 2 vinserti32x4 m2, [t0 +strideq*2-8], 2 vinserti32x4 m3, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m4, [t0 +strideq*0-8], 2 vinserti32x4 m5, [t0 +strideq*1-8], 2 vinserti32x4 m6, [t0 +strideq*2-8], 2 vinserti32x4 m7, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m8, [t0 +strideq*0-8], 2 vinserti32x4 m9, [t0 +strideq*1-8], 2 vinserti32x4 m10, [t0 +strideq*2-8], 2 vinserti32x4 m11, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m25, [t0 +strideq*0-8], 2 vinserti32x4 m13, [t0 +strideq*1-8], 2 vinserti32x4 
m14, [t0 +strideq*2-8], 2 vinserti32x4 m22, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m24, [t0 +strideq*0-8], 3 vinserti32x4 m26, [t0 +strideq*1-8], 3 vinserti32x4 m2, [t0 +strideq*2-8], 3 vinserti32x4 m3, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m4, [t0 +strideq*0-8], 3 vinserti32x4 m5, [t0 +strideq*1-8], 3 vinserti32x4 m6, [t0 +strideq*2-8], 3 vinserti32x4 m7, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m8, [t0 +strideq*0-8], 3 vinserti32x4 m9, [t0 +strideq*1-8], 3 vinserti32x4 m10, [t0 +strideq*2-8], 3 vinserti32x4 m11, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m25, [t0 +strideq*0-8], 3 vinserti32x4 m13, [t0 +strideq*1-8], 3 vinserti32x4 m14, [t0 +strideq*2-8], 3 vinserti32x4 m22, [t0 +stride3q -8], 3 ; TRANSPOSE_16X16B 0, 1, [rsp+0*64] SWAP m16, m26 SWAP m17, m2 SWAP m18, m3 SWAP m29, m25 SWAP m30, m13 SWAP m31, m14 mova [rsp+4*64], m22 ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22 SWAP 25, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 SWAP 11, 22 %endif %endif ; load L/E/I/H vpbroadcastd m15, [pb_1] %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else kmovw k1, k6 vpgatherdd m0{k1}, [lq+m20+4] kmovw k1, k6 vpgatherdd m1{k1}, [lq+m20+0] %endif pxor m2, m2 pcmpeqb k1, m0, m2 vmovdqu8 m0{k1}, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, pbshuf ; l[x][0] vpcmpub k3, m0, m2, 4 ; neq ; L psrlq m2, m0, [lutq+128] pand m2, [pb_63]{bcstd} vpbroadcastb m1, [lutq+136] pminub m2, m1 pmaxub m2, m15 ; I gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H paddd m0, [pb_2]{bcstd} paddb m0, m0 paddb m0, m2 ; E ABSSUB m8, m3, m4, m9 ; abs(p1-p0) ABSSUB m9, m5, m6, m10 ; abs(q1-q0) pmaxub m8, m9 vpcmpub k1, m8, m1, 6 ; gt ; hev %if %1 != 4 %if %1 == 6 ABSSUB m9, m13, m4, m10 ; abs(p2-p0) pmaxub m9, m8 %else ABSSUB m9, m25, m4, m10 ; abs(p3-p0) pmaxub m9, m8 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) pmaxub m9, m10 %endif ABSSUB m10, m5, m14, m11 ; abs(q2-q0) pmaxub m9, m10 %if %1 != 6 ABSSUB m10, m5, m22, m11 ; abs(q3-q0) pmaxub m9, m10 %endif vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in %if %1 == 6 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) %else ABSSUB m10, m25, m13, m11 ; abs(p3-p2) ABSSUB m11, m13, m3, m1 ; abs(p2-p1) pmaxub m10, m11 ABSSUB m11, m14, m22, m1 ; abs(q3-q2) pmaxub m10, m11 %endif ABSSUB m11, m14, m6, m1 ; abs(q2-q1) pmaxub m10, m11 %if %1 == 16 vpbroadcastd m11, [maskq+8] por m11, [maskq+4]{bcstd} %else vpbroadcastd m11, [maskq+4] %endif vptestmd k4, m11, pbmask vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks pmaxub m8, m10 %endif vpcmpub k3{k3}, m8, m2, 2 ; le ABSSUB m10, m3, m6, m11 ; abs(p1-q1) ABSSUB m11, m4, m5, m2 ; abs(p0-q0) paddusb m11, m11 gf2p8affineqb m10, m10, [shift1]{bcstq}, 0 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E %if %1 == 16 ABSSUB m1, m16, m4, m2 ABSSUB m2, m17, m4, m10 pmaxub m1, m2 ABSSUB m2, m18, m4, m10 pmaxub m1, m2 ABSSUB m2, m29, m5, m10 pmaxub m1, m2 ABSSUB m2, m30, m5, m10 pmaxub m1, m2 ABSSUB m2, m31, m5, m10 pmaxub m1, m2 kandq k2, k2, k3 vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out vpbroadcastd m2, [maskq+8] vptestmd k5, m2, pbmask vpmovm2d m7, k5 vptestmb k4{k4}, m7, m7 ; flat16 & fm por m10, m2, [maskq+4]{bcstd} vptestmd k5, m10, pbmask vpmovm2d m7, k5 vptestmb 
k2{k2}, m7, m7 ; flat8in por m2, m10, [maskq+0]{bcstd} vptestmd k5, m2, pbmask vpmovm2d m7, k5 vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 & !flat16 kandnq k2, k4, k2 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] vptestmd k4, m0, pbmask vpmovm2d m7, k4 vptestmb k2{k2}, m7, m7 kandq k2, k2, k3 ; flat8 & fm por m0, [maskq+0]{bcstd} vptestmd k4, m0, pbmask vpmovm2d m7, k4 vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 %else %ifidn %2, v vptestmd k4, pbmask, [maskq+0]{bcstd} %else vpbroadcastd m0, [maskq+0] vptestmd k4, m0, pbmask %endif vpmovm2d m7, k4 vptestmb k3{k3}, m7, m7 ; fm %endif ; short filter %if %1 >= 8 SWAP m23, m15 %endif vpbroadcastd m15, [pb_3] vpbroadcastd m0, [pb_4] vpbroadcastd m12, [pb_16] vpbroadcastd m1, [pb_64] pxor m3, pb128 pxor m6, pb128 psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev pxor m4, pb128 pxor m5, pb128 psubsb m11, m5, m4 paddsb m10, m11 paddsb m10, m11 paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm paddsb m8, m10, m15 paddsb m10, m0 gf2p8affineqb m8, m8, [shift3]{bcstq}, 16 gf2p8affineqb m10, m10, [shift3]{bcstq}, 16 psubb m8, m12 ; f2 psubb m10, m12 ; f1 paddsb m4, m8 psubsb m5, m10 pxor m4, pb128 pxor m5, pb128 ; pxor m10, pb128 pxor m8, m8 pavgb m8, m10 ; f=(f1+1)>>1 psubb m8, m1 knotq k1, k1 paddsb m3{k1}, m3, m8 psubsb m6{k1}, m6, m8 pxor m3, pb128 pxor m6, pb128 %if %1 == 16 ; flat16 filter %ifidn %2, v lea t0, [dstq+mstrideq*8] %endif SWAP m24, m16, m14 SWAP m2, m17, m22 SWAP m7, m18 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 vpbroadcastd m1, [pb_7_1] vpbroadcastd m12, [pb_2] punpcklbw m14, m24, m25 punpckhbw m22, m24, m25 pmaddubsw m10, m14, m1 pmaddubsw m11, m22, m1 ; p6*7+p3 punpcklbw m8, m2, m7 punpckhbw m9, m2, m7 pmaddubsw m8, m12 pmaddubsw m9, m12 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 %ifidn %2, h vpbroadcastd m27, [pw_2048] vpbroadcastd m1, [pb_m1_1] %define pw2048 m27 %define pbm1_1 m1 %endif punpcklbw m8, m13, m3 punpckhbw m9, 
m13, m3 pmaddubsw m8, m23 pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m8, m4, m5 punpckhbw m9, m4, m5 pmaddubsw m8, m23 pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5 %else vpblendmb m8{k4}, m2, m8 mova [rsp+1*64], m8 %endif ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, pbm1_1 pmaddubsw m22, pbm1_1 paddw m10, m14 paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 punpcklbw m8, m24, m6 punpckhbw m9, m24, m6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 SWAP m18, m8 SWAP m23, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+stride3q]{k4}, m8 ; p4 %else vpblendmb m8{k4}, m7, m8 mova [rsp+2*64], m8 %endif ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 SWAP m14, m16 punpcklbw m8, m24, m13 punpckhbw m9, m24, m13 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 punpcklbw m8, m2, m14 punpckhbw m2, m14 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 SWAP m16, m8 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 %else vpblendmb m8{k4}, m25, m8 mova [rsp+3*64], m8 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 SWAP m22, m17 punpcklbw m8, m24, m3 punpckhbw m9, m24, m3 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 punpcklbw m8, m7, m22 punpckhbw m7, m22 pmaddubsw m8, pbm1_1 pmaddubsw m7, pbm1_1 paddw m10, m8 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 SWAP m17, m8 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 vpblendmb m15{k4}, m13, m8 ; 
don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 %ifidn %2, v lea t0, [dstq+strideq*4] %endif punpcklbw m8, m24, m4 punpckhbw m9, m24, m4 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 punpcklbw m8, m25, m29 punpckhbw m9, m25, m29 SWAP m26, m29 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 SWAP m29, m8 SWAP m0, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, h SWAP m28, m24 punpcklbw m8, m28, m5 punpckhbw m24, m28, m5 %else punpcklbw m8, m24, m5 punpckhbw m24, m5 %endif pmaddubsw m8, pbm1_1 pmaddubsw m24, pbm1_1 paddw m10, m8 paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 punpcklbw m24, m13, m30 punpckhbw m9, m13, m30 %ifidn %2, h SWAP m27, m30 %endif SWAP m13, m15 pmaddubsw m24, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m24 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 SWAP m30, m24 SWAP m15, m9 %ifidn %2, h SWAP m9, m24 %define pw2048 m9 %endif pmulhrsw m24, m10, pw2048 pmulhrsw m8, m11, pw2048 paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 paddw m11, m23 packuswb m24, m8 punpcklbw m8, m3, m31 pmaddubsw m8, pbm1_1 paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m18, m8 pmulhrsw m8, m10, pw2048 paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h SWAP m16, m9 %define pw2048 m16 %endif punpckhbw m9, m3, m31 SWAP m3, m12 pmaddubsw m9, pbm1_1 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m23, m9 pmulhrsw m9, m11, pw2048 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h SWAP m2, m1 %define pbm1_1 m2 %endif vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 
[reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 SWAP m24, m31 ; q6 packuswb m8, m9 %ifidn %2, h SWAP m31, m2 %define pbm1_1 m31 %endif vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 punpcklbw m8, m4, m24 punpckhbw m2, m4, m24 SWAP m4, m1 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 pmulhrsw m2, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m2, m9 vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 paddw m11, m7 punpcklbw m8, m5, m24 punpckhbw m9, m5, m24 SWAP m5, m12 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 pmulhrsw m7, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m7, m9 vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 paddw m11, m0 punpcklbw m8, m6, m24 punpckhbw m9, m6, m24 SWAP 2, 6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+mstrideq]{k4}, m8 %else SWAP m29, m16 %define pw2048 m29 vpblendmb m16{k4}, m22, m8 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 paddw m11, m15 %ifidn %2, h SWAP m15, m8 %endif punpcklbw m8, m14, m24 punpckhbw m9, m14, m24 SWAP 14, 7 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 %else vpblendmb m17{k4}, m26, m8 
%endif ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 paddw m11, m23 punpcklbw m8, m22, m24 punpckhbw m9, m22, m24 SWAP m30, m24 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m10, pw2048 pmulhrsw m11, pw2048 packuswb m10, m11 %ifidn %2, v vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5 %else vmovdqu8 m27{k4}, m10 %endif %ifidn %2, v lea t0, [dstq+mstrideq*4] %endif %endif %if %1 >= 8 ; flat8 filter vpbroadcastd m9, [pb_3_1] vpbroadcastd m10, [pb_2_1] %if %1 == 16 vpbroadcastd m23, [pb_1] vpbroadcastd m0, [pb_4] %elifidn %2, h vpbroadcastd m31, [pb_m1_1] %define pbm1_1 m31 %endif punpcklbw m24, m25, m3 punpckhbw m26, m25, m3 pmaddubsw m2, m24, m9 pmaddubsw m7, m26, m9 ; 3 * p3 + p1 punpcklbw m8, m13, m4 punpckhbw m11, m13, m4 pmaddubsw m8, m10 pmaddubsw m11, m10 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 punpcklbw m8, m5, m0 punpckhbw m11, m5, m0 pmaddubsw m8, m23 pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 %if is_h || %1 == 16 vpblendmb m10{k2}, m13, m8 ; p2 %endif %ifidn %2, v %if %1 == 8 vmovdqu8 [t0+strideq*1]{k2}, m8 %else mova [t0+strideq*1], m10 %endif %endif pmaddubsw m8, m24, pbm1_1 pmaddubsw m11, m26, pbm1_1 paddw m2, m8 paddw m7, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 pmaddubsw m8, pbm1_1 pmaddubsw m11, pbm1_1 paddw m2, m8 paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m8{k2}, m3, m8 ; p1 %ifidn %2, v mova [t0+strideq*2], m8 %else SWAP m18, m8 %endif pmaddubsw m24, m23 pmaddubsw m26, m23 psubw m2, m24 psubw m7, m26 punpcklbw m8, m4, m14 punpckhbw m11, m4, m14 pmaddubsw m8, m23 pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m8{k2}, m4, m8 ; p0 
%ifidn %2, v mova [t0+stride3q], m8 %else SWAP m29, m8 %endif punpcklbw m24, m5, m22 punpckhbw m26, m5, m22 pmaddubsw m8, m24, m23 pmaddubsw m11, m26, m23 paddw m2, m8 paddw m7, m11 punpcklbw m8, m4, m25 punpckhbw m11, m4, m25 pmaddubsw m8, m23 pmaddubsw m11, m23 psubw m2, m8 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m11{k2}, m5, m8 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 %endif pmaddubsw m24, pbm1_1 pmaddubsw m26, pbm1_1 paddw m2, m24 paddw m7, m26 punpcklbw m8, m13, m6 punpckhbw m13, m6 pmaddubsw m8, pbm1_1 pmaddubsw m13, pbm1_1 paddw m2, m8 paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 psrlw m8, m2, 3 psrlw m13, m7, 3 packuswb m8, m13 vpblendmb m13{k2}, m6, m8 ; q1 %ifidn %2, v mova [dstq+strideq*1], m13 %endif punpcklbw m24, m3, m6 punpckhbw m26, m3, m6 pmaddubsw m24, m23 pmaddubsw m26, m23 psubw m2, m24 psubw m7, m26 punpcklbw m24, m14, m22 punpckhbw m26, m14, m22 pmaddubsw m24, m23 pmaddubsw m26, m23 paddw m2, m24 paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 %if is_h || %1 == 16 vpblendmb m2{k2}, m14, m2 ; q2 %endif %ifidn %2, v %if %1 == 8 vmovdqu8 [dstq+strideq*2]{k2}, m2 %else mova [dstq+strideq*2], m2 %endif %endif %ifidn %2, h SWAP m24, m18 SWAP m26, m29 %if %1 == 8 ; 16x8 transpose punpcklbw m3, m25, m10 punpckhbw m25, m10 punpcklbw m10, m24, m26 punpckhbw m24, m26 punpcklbw m26, m11, m13 punpckhbw m11, m13 punpcklbw m13, m2, m22 punpckhbw m2, m22 ; punpcklwd m22, m3, m10 punpckhwd m3, m10 punpcklwd m10, m25, m24 punpckhwd m25, m24 punpcklwd m24, m26, m13 punpckhwd m26, m13 punpcklwd m13, m11, m2 punpckhwd m11, m2 ; punpckldq m2, m22, m24 punpckhdq m22, m24 punpckldq m24, m3, m26 punpckhdq m3, m26 punpckldq m26, m10, m13 punpckhdq m10, m13 punpckldq m13, m25, m11 punpckhdq m25, m11 ; write 8x32 vpbroadcastd ym16, strided pmulld ym16, [hmulD] lea t1, [dstq+strideq*2] lea t2, [dstq+strideq*4] lea t3, [t1 +strideq*4] 
lea t0, [dstq+strideq*8] kmovb k1, k6 kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 vpscatterdq [dstq+ym16-4]{k1}, m2 vpscatterdq [t1 +ym16-4]{k2}, m22 vpscatterdq [t2 +ym16-4]{k3}, m24 vpscatterdq [t3 +ym16-4]{k4}, m3 lea t1, [t0+strideq*2] lea t2, [t0+strideq*4] lea t3, [t1+strideq*4] kmovb k1, k6 kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 vpscatterdq [t0+ym16-4]{k1}, m26 vpscatterdq [t1+ym16-4]{k2}, m10 vpscatterdq [t2+ym16-4]{k3}, m13 vpscatterdq [t3+ym16-4]{k4}, m25 %else ; 16x16 transpose and store SWAP 5, 10, 2 SWAP 6, 24 SWAP 7, 26 SWAP 8, 11 SWAP 9, 13 mova m24, [rsp+0*64] SWAP m26, m28 mova m2, [rsp+1*64] mova m3, [rsp+2*64] mova m4, [rsp+3*64] SWAP m11, m16 SWAP m25, m17 SWAP m13, m27 SWAP m14, m30 TRANSPOSE_16X16B 1, 0, [rsp+4*64] movu [dstq+strideq*0-8], xm24 movu [dstq+strideq*1-8], xm26 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea t0, [dstq+strideq*4] movu [t0+strideq*0-8], xm4 movu [t0+strideq*1-8], xm5 movu [t0+strideq*2-8], xm6 movu [t0+stride3q -8], xm7 lea t0, [t0+strideq*4] movu [t0+strideq*0-8], xm8 movu [t0+strideq*1-8], xm9 movu [t0+strideq*2-8], xm10 movu [t0+stride3q -8], xm11 lea t0, [t0+strideq*4] movu [t0+strideq*0-8], xm25 movu [t0+strideq*1-8], xm13 movu [t0+strideq*2-8], xm14 movu [t0+stride3q -8], xm22 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym24, 1 vextracti128 [t0+strideq*1-8], ym26, 1 vextracti128 [t0+strideq*2-8], ym2, 1 vextracti128 [t0+stride3q -8], ym3, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym4, 1 vextracti128 [t0+strideq*1-8], ym5, 1 vextracti128 [t0+strideq*2-8], ym6, 1 vextracti128 [t0+stride3q -8], ym7, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym8, 1 vextracti128 [t0+strideq*1-8], ym9, 1 vextracti128 [t0+strideq*2-8], ym10, 1 vextracti128 [t0+stride3q -8], ym11, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym25, 1 vextracti128 [t0+strideq*1-8], ym13, 1 vextracti128 [t0+strideq*2-8], ym14, 1 vextracti128 [t0+stride3q -8], ym22, 1 lea t0, [t0+strideq*4] 
vextracti32x4 [t0+strideq*0-8], m24, 2 vextracti32x4 [t0+strideq*1-8], m26, 2 vextracti32x4 [t0+strideq*2-8], m2, 2 vextracti32x4 [t0+stride3q -8], m3, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m4, 2 vextracti32x4 [t0+strideq*1-8], m5, 2 vextracti32x4 [t0+strideq*2-8], m6, 2 vextracti32x4 [t0+stride3q -8], m7, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m8, 2 vextracti32x4 [t0+strideq*1-8], m9, 2 vextracti32x4 [t0+strideq*2-8], m10, 2 vextracti32x4 [t0+stride3q -8], m11, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m25, 2 vextracti32x4 [t0+strideq*1-8], m13, 2 vextracti32x4 [t0+strideq*2-8], m14, 2 vextracti32x4 [t0+stride3q -8], m22, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m24, 3 vextracti32x4 [t0+strideq*1-8], m26, 3 vextracti32x4 [t0+strideq*2-8], m2, 3 vextracti32x4 [t0+stride3q -8], m3, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m4, 3 vextracti32x4 [t0+strideq*1-8], m5, 3 vextracti32x4 [t0+strideq*2-8], m6, 3 vextracti32x4 [t0+stride3q -8], m7, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m8, 3 vextracti32x4 [t0+strideq*1-8], m9, 3 vextracti32x4 [t0+strideq*2-8], m10, 3 vextracti32x4 [t0+stride3q -8], m11, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m25, 3 vextracti32x4 [t0+strideq*1-8], m13, 3 vextracti32x4 [t0+strideq*2-8], m14, 3 vextracti32x4 [t0+stride3q -8], m22, 3 %endif %endif %elif %1 == 6 ; flat6 filter vpbroadcastd m15, [pb_3_1] vpbroadcastd m12, [pb_2] punpcklbw m8, m13, m5 punpckhbw m11, m13, m5 pmaddubsw m0, m8, m15 pmaddubsw m1, m11, m15 punpcklbw m7, m4, m3 punpckhbw m10, m4, m3 pmaddubsw m2, m7, m12 pmaddubsw m12, m10, m12 %ifidn %2, h vpbroadcastd m15, [pb_m1_1] %define pbm1_1 m15 %endif paddw m0, m2 paddw m1, m12 pmulhrsw m2, m0, m16 pmulhrsw m12, m1, m16 packuswb m2, m12 vpblendmb m2{k2}, m3, m2 ; p1 %ifidn %2, v mova [t0+strideq*2], m2 %endif pmaddubsw m8, pbm1_1 pmaddubsw m11, pbm1_1 paddw m0, m8 paddw m1, m11 punpcklbw m8, m13, m6 punpckhbw 
m11, m13, m6
    pmaddubsw m8, pbm1_1
    pmaddubsw m11, pbm1_1
    paddw m0, m8
    paddw m1, m11
    pmulhrsw m12, m0, m16
    pmulhrsw m13, m1, m16
    packuswb m12, m13
    vpblendmb m12{k2}, m4, m12 ; p0
%ifidn %2, v
    mova [t0+stride3q], m12
%endif
    vpbroadcastd m9, [pb_m1_2]
    vpbroadcastd m4, [pb_m1_0]
    paddw m0, m8
    paddw m1, m11
    punpcklbw m8, m3, m14
    punpckhbw m11, m3, m14
    pmaddubsw m14, m8, pbm1_1
    pmaddubsw m13, m11, pbm1_1
    paddw m0, m14
    paddw m1, m13
    pmulhrsw m14, m0, m16
    pmulhrsw m13, m1, m16
    packuswb m14, m13
    vpblendmb m14{k2}, m5, m14 ; q0
%ifidn %2, v
    mova [dstq+strideq*0], m14
%endif
    pmaddubsw m8, m9
    pmaddubsw m11, m9
    paddw m0, m8
    paddw m1, m11
    pmaddubsw m7, m4
    pmaddubsw m10, m4
    paddw m0, m7
    paddw m1, m10
    pmulhrsw m0, m16
    pmulhrsw m1, m16
    packuswb m0, m1
    vpblendmb m0{k2}, m6, m0 ; q1
%ifidn %2, v
    mova [dstq+strideq*1], m0
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else ; %1 == 4
%ifidn %2, v
    mova [t0+strideq*0], m3 ; p1
    mova [t0+strideq*1], m4 ; p0
    mova [t0+strideq*2], m5 ; q0
    mova [t0+stride3q ], m6 ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

; k7 is aliased to k6 for the next three entry points; the alias is removed
; again (%undef k7) right before lpf_h_sb_uv_8bpc, which computes its own k7.
%define k7 k6

INIT_ZMM avx512icl
; lpf_v_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, w)
; Luma loop filter driver, "v" direction of the FILTER macro.
; Per iteration it tests three 16-bit words of the mask array and expands the
; widest applicable filter: [maskq+8] -> FILTER 16, [maskq+4] -> FILTER 8,
; [maskq+0] -> FILTER 4 (shared out-of-line body at .v4). Nothing is filtered
; when all three words are zero.
; Each iteration advances dst and l by 64 bytes, mask by 2 bytes and
; consumes 16 units of w.
cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
                                    lut, w, stride3, mstride
    DECLARE_REG_TMP 9
    shl l_strideq, 2 ; l_stride *= 4
    sub lq, l_strideq ; FILTER reads both [lq] and [lq+l_strideq]
    mov mstrideq, strideq
    neg mstrideq ; mstride = -stride
    lea stride3q, [strideq*3]
    ; constants kept in registers for the whole loop
    mova m21, [pb_4x0_4x4_4x8_4x12]
    mova m20, [pb_mask]
    vpbroadcastd m19, [pb_128]
    vpbroadcastd m28, [pb_m1_1]
    vpbroadcastd m27, [pw_2048] ; pmulhrsw by 2048 == (x + 8) >> 4
    %define pbshuf m21
    %define pbmask m20
    %define pb128 m19
    %define pbm1_1 m28
    %define pw2048 m27

.loop:
    cmp word [maskq+8], 0 ; vmask[2]
    je .no_flat16

    FILTER 16, v
    jmp .end

.no_flat16:
    cmp word [maskq+4], 0 ; vmask[1]
    je .no_flat

    FILTER 8, v
    jmp .end

.no_flat:
    cmp word [maskq+0], 0 ; vmask[0]
    je .end
    call .v4

.end:
    add lq, 64
    add dstq, 64
    add maskq, 2
    sub wd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
; Out-of-line 4-wide vertical filter; also called from lpf_v_sb_uv_8bpc.
.v4:
    FILTER 4, v
    ret

; lpf_h_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, h)
; Luma loop filter driver, "h" direction of the FILTER macro.
; Same mask dispatch as the vertical version. Reserves 5*64 bytes of stack,
; which FILTER 16, h addresses as [rsp+0*64]..[rsp+4*64].
; Each iteration advances dst by 64 rows (stride8*8), l by l_strideq*8,
; mask by 2 bytes and consumes 16 units of h.
cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
                                          lut, h, stride3,
stride8
    DECLARE_REG_TMP 9, 10, 11, 12
    shl l_strideq, 2 ; l_stride *= 4
    sub lq, 4
    lea stride3q, [strideq*3]
    lea stride8q, [strideq*8]
    kxnorw k6, k6, k6 ; k6 = all ones (copied into k1 before each gather in FILTER)
    ; per-lane offsets used as gather indices by FILTER:
    ; m21 = stride * hmulA, m20 = (l_stride*4) * hmulB, m19 = stride * hmulC
    vpbroadcastd m19, strided
    vpbroadcastd m20, l_strided
    pmulld m21, m19, [hmulA]
    pmulld m20, [hmulB]
    pmulld m19, [hmulC]
    ; here the constants are memory operands instead of registers
    %define pbshuf [pb_4x0_4x4_4x8_4x12]
    %define pbmask [pb_mask]
    %define pb128 [pb_128]{bcstd}
    shl l_strideq, 1 ; l_stride is now 8x its original value

.loop:
    cmp word [maskq+8], 0 ; vmask[2]
    je .no_flat16

    FILTER 16, h
    jmp .end

.no_flat16:
    cmp word [maskq+4], 0 ; vmask[1]
    je .no_flat

    FILTER 8, h
    jmp .end

.no_flat:
    cmp word [maskq+0], 0 ; vmask[0]
    je .end
    call .h4

.end:
    lea lq, [lq+l_strideq*8]
    lea dstq, [dstq+stride8q*8]
    add maskq, 2
    sub hd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
; Out-of-line 4-wide horizontal filter; also called from lpf_h_sb_uv_8bpc.
.h4:
    FILTER 4, h
    ret

; lpf_v_sb_uv_8bpc(dst, stride, mask, l, l_stride, lut, w)
; Chroma loop filter driver, "v" direction. Only two mask words are tested:
; [maskq+4] -> FILTER 6, [maskq+0] -> the luma function's .v4 body.
; Pointer/counter stepping is identical to lpf_v_sb_y_8bpc.
cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
                                     lut, w, stride3, mstride
    DECLARE_REG_TMP 9
    shl l_strideq, 2 ; l_stride *= 4
    sub lq, l_strideq
    mov mstrideq, strideq
    neg mstrideq ; mstride = -stride
    lea stride3q, [strideq*3]
    mova m21, [pb_4x0_4x4_4x8_4x12]
    mova m20, [pb_mask]
    vpbroadcastd m19, [pb_128]
    vpbroadcastd m17, [pb_m1_1]
    vpbroadcastd m16, [pw_4096] ; pmulhrsw by 4096 == (x + 4) >> 3
    %define pbshuf m21
    %define pbmask m20
    %define pb128 m19
    %define pbm1_1 m17

.loop:
    cmp word [maskq+4], 0 ; vmask[1]
    je .no_flat

    FILTER 6, v
    jmp .end

.no_flat:
    cmp word [maskq+0], 0 ; vmask[0]
    je .end
    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4

.end:
    add lq, 64
    add dstq, 64
    add maskq, 2
    sub wd, 16
    jg .loop
    RET

%undef k7
; lpf_h_sb_uv_8bpc(dst, stride, mask, l, l_stride, lut, h)
; Chroma loop filter driver, "h" direction. Unlike the luma version, the
; k6/k7 masks depend on h so that only half of the lanes are enabled when
; h <= 8. Mask dispatch: [maskq+4] -> FILTER 6, [maskq+0] -> luma .h4 body.
cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
                                     lut, h, stride3, stride8
    DECLARE_REG_TMP 9, 10, 11
    mov r7d, 0xffff
    movzx r8d, r7b ; r8d = 0x00ff
    cmp hd, 9
    cmovb r7d, r8d
    kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
    shl l_strideq, 2 ; l_stride *= 4
    sub lq, 4
    ; 0xffff >> 4 = 0x0fff and 0x00ff >> 4 = 0x000f; FILTER copies k7 with
    ; kmovb, so only the low byte is consumed.
    kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0x0f
    lea stride3q, [strideq*3]
    lea stride8q, [strideq*8]
    ; per-lane gather offsets, same layout as in lpf_h_sb_y_8bpc
    vpbroadcastd m19, strided
    vpbroadcastd m20, l_strided
    pmulld m21, m19, [hmulA]
    pmulld m20, [hmulB]
    pmulld m19, [hmulC]
    mova m18, [pb_mask]
    vpbroadcastd m17, [pb_128]
    vpbroadcastd m16, [pw_4096] ; pmulhrsw by 4096 == (x + 4) >> 3
    %define pbshuf [pb_4x0_4x4_4x8_4x12]
    %define pbmask m18
    %define pb128 m17
    add l_strideq, l_strideq ; l_stride is now 8x its original value

.loop:
    cmp word [maskq+4], 0 ; vmask[1]
    je .no_flat

    FILTER 6, h
    jmp .end

.no_flat:
    cmp word [maskq+0], 0 ; vmask[0]
    je .end
    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4

.end:
    lea lq, [lq+l_strideq*8]
    lea dstq, [dstq+stride8q*8]
    add maskq, 2
    sub hd, 16
    jg .loop
    RET

%endif ; ARCH_X86_64
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/loopfilter_sse.asm000066400000000000000000001732111517466257200250200ustar00rootroot00000000000000
; Copyright © 2018-2021, VideoLAN and dav2d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

; Read-only constants, section aligned to 16 bytes; every table below is
; exactly 16 bytes long.
; Naming: pb_* = bytes, pw_* = words, pd_* = dwords; pb_X_Y repeats the byte
; pair (X, Y) eight times.
SECTION_RODATA 16

pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
pb_7_1: times 8 db 7, 1
pb_3_1: times 8 db 3, 1
pb_2_1: times 8 db 2, 1
pb_m1_0: times 8 db -1, 0
pb_m1_1: times 8 db -1, 1
pb_m1_2: times 8 db -1, 2
pb_1: times 16 db 1
pb_2: times 16 db 2
pb_3: times 16 db 3
pb_4: times 16 db 4
pb_16: times 16 db 16
pb_63: times 16 db 63
pb_64: times 16 db 64
pb_128: times 16 db 0x80
pb_129: times 16 db 0x81
pb_240: times 16 db 0xf0
pb_248: times 16 db 0xf8
pb_254: times 16 db 0xfe

pw_2048: times 8 dw 2048
pw_4096: times 8 dw 4096

pd_mask: dd 1, 2, 4, 8

SECTION .text

; ABSSUB dst, a, b, tmp
; Per-byte unsigned absolute difference: dst = |a - b|.
; Each saturating subtraction yields zero where its result would be negative,
; so OR-ing (a -sat b) with (b -sat a) leaves the magnitude. tmp is clobbered.
%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb %1, %2, %3
    psubusb %4, %3, %2
    por %1, %4
%endmacro

; TRANSPOSE_16x4_AND_WRITE_4x16 r0, r1, r2, r3, tmp
; Arguments are register numbers. Interleaves the bytes, then the words, of
; m%1..m%4 so that every dword holds one byte from each of the four inputs,
; then stores those dwords as 4-byte groups at column offset -2 of 16
; consecutive rows starting at dstq.
; All five registers are clobbered. dstq is advanced by 16 rows in total
; (4 x strideq inside the loop plus stride3q*4 afterwards).
%macro TRANSPOSE_16x4_AND_WRITE_4x16 5
    ; transpose 16x4
    punpcklbw m%5, m%1, m%2
    punpckhbw m%1, m%2
    punpcklbw m%2, m%3, m%4
    punpckhbw m%3, m%4
    punpcklwd m%4, m%5, m%2
    punpckhwd m%5, m%2
    punpcklwd m%2, m%1, m%3
    punpckhwd m%1, m%3

    ; write out
    ; each pass stores the low dword of the four registers to rows +0/+4/+8/+12,
    ; steps dstq down one row and shifts the next dword into place
%assign %%n 0
%rep 4
    movd [dstq+strideq *0-2], xm%4
    movd [dstq+strideq *4-2], xm%5
    movd [dstq+strideq *8-2], xm%2
    movd [dstq+stride3q*4-2], xm%1
    add dstq, strideq
%if %%n < 3
    psrldq xm%4, 4
    psrldq xm%5, 4
    psrldq xm%2, 4
    psrldq xm%1, 4
%endif
%assign %%n (%%n+1)
%endrep
    lea dstq, [dstq+stride3q*4]
%endmacro

; TRANSPOSE_16X16B output_transpose, mem
; 16x16 byte transpose built from four rounds of unpacks
; (byte -> word -> dword -> qword). %2 is a memory operand used as a spill
; slot throughout.
; On x86-64 the rows live in m0-m15:
;   %1 == 0: m15 is saved to %2 on entry (it is overwritten as a temporary
;            right away); at the end one result row is left in %2.
;   %1 == 1: m15 is not saved; the row that is later read back from %2 is
;            presumably placed there by the caller (TODO confirm against the
;            call sites), and the row spilled to %2 is reloaded into m11
;            at the end.
; On x86-32 only m0-m7 exist; the remaining rows are exchanged through fixed
; [esp+N*16] slots whose numbering differs between %1 == 0 and %1 == 1.
; The SWAP lines only rename registers at assembly time; the statement order
; must not be changed.
%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
%if %1 == 0
    mova %2, m15 ; m7 in 32-bit
%endif

    ; input in m0-7
    punpcklbw m15, m0, m1
    punpckhbw m0, m1
    punpcklbw m1, m2, m3
    punpckhbw m2, m3
    punpcklbw m3, m4, m5
    punpckhbw m4, m5
%if ARCH_X86_64
    SWAP 4, 5, 7
%else
%if %1 == 0
    mova m5, %2
%else
    mova m5, [esp+1*16]
%endif
    mova %2, m4
%endif
    punpcklbw m4, m6, m5
    punpckhbw m6, m5

    ; interleaved in m15,0,1,2,3,7,4,6
    punpcklwd m5, m15, m1
    punpckhwd m15, m1
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    punpcklwd m2, m3, m4
    punpckhwd m3, m4
%if ARCH_X86_64
    SWAP 3, 4, 7
%else
    mova m4, %2
    mova %2, m3
%endif
    punpcklwd m3, m4, m6
    punpckhwd m4, m6

    ; interleaved in m5,15,1,0,2,7,3,4
    punpckldq m6, m5, m2
    punpckhdq m5, m2
%if ARCH_X86_64
    SWAP 2, 7, 5
%else
    mova m2, %2
    mova [esp+1*16], m5
%endif
    punpckldq m5, m15, m2
    punpckhdq m15, m2
    punpckldq m2, m1, m3
    punpckhdq m1, m3
    punpckldq m3, m0, m4
    punpckhdq m0, m4

%if ARCH_X86_32
    mova [esp+0*16], m6
    mova [esp+2*16], m5
    mova [esp+3*16], m15
    mova [esp+4*16], m2
    mova [esp+5*16], m1
    mova [esp+6*16], m3
    mova [esp+7*16], m0
    mova m8, [esp+ 8*16]
    mova m9, [esp+ 9*16]
    mova m10, [esp+10*16]
%if %1 == 0
    mova m11, [esp+11*16]
    mova m12, [esp+12*16]
    mova m13, [esp+13*16]
    mova m14, [esp+14*16]
%else
    mova m11, [esp+20*16]
    mova m12, [esp+15*16]
    mova m13, [esp+16*16]
    mova m14, [esp+17*16]
%endif
%endif

    ; input in m8-m15
%if ARCH_X86_64
    SWAP 7, 4
%endif
    punpcklbw m7, m8, m9
    punpckhbw m8, m9
    punpcklbw m9, m10, m11
    punpckhbw m10, m11
    punpcklbw m11, m12, m13
    punpckhbw m12, m13
%if ARCH_X86_64
    mova m13, %2
%else
%if %1 == 0
    mova m13, [esp+15*16]
%else
    mova m13, [esp+18*16]
%endif
%endif
    mova %2, m12
    punpcklbw m12, m14, m13
    punpckhbw m14, m14, m13

    ; interleaved in m7,8,9,10,11,rsp%2,12,14
    punpcklwd m13, m7, m9
    punpckhwd m7, m9
    punpcklwd m9, m8, m10
    punpckhwd m8, m10
    punpcklwd m10, m11, m12
    punpckhwd m11, m12
    mova m12, %2
    mova %2, m11
    punpcklwd m11, m12, m14
    punpckhwd m12, m14

    ; interleaved in m13,7,9,8,10,rsp%2,11,12
    punpckldq m14, m13, m10
    punpckhdq m13, m10
    punpckldq m10, m9, m11
    punpckhdq m9, m11
    punpckldq m11, m8, m12
    punpckhdq m8, m12
    mova m12, %2
    mova %2, m8
    punpckldq m8, m7, m12
    punpckhdq m7, m12

%if ARCH_X86_32
    mova [esp+ 8*16], m10
    mova [esp+ 9*16], m9
    mova [esp+10*16], m11
    SWAP 6, 1
    SWAP 4, 2
    SWAP 5, 3
    mova m6, [esp+0*16]
    mova m4, [esp+1*16]
    mova m5, [esp+2*16]
%endif

    ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
    punpcklqdq m12, m6, m14
    punpckhqdq m6, m14
    punpcklqdq m14, m4, m13
    punpckhqdq m4, m13
    punpcklqdq m13, m5, m8
    punpckhqdq m5, m8
%if ARCH_X86_64
    SWAP 8, 5
%else
    mova m8, [esp+3*16]
    mova [esp+27*16], m5
    %define m15 m8
%endif
    punpcklqdq m5, m15, m7
    punpckhqdq m15, m7

%if ARCH_X86_32
    mova [esp+11*16], m12
    mova [esp+12*16], m6
    mova [esp+13*16], m14
    mova [esp+14*16], m4
    mova [esp+26*16], m13
    mova [esp+ 0*16], m5
    mova [esp+ 1*16], m15
    mova m2, [esp+ 4*16]
    mova m10, [esp+ 8*16]
    mova m1, [esp+ 5*16]
    mova m9, [esp+ 9*16]
    mova m3, [esp+ 6*16]
    mova m11, [esp+10*16]
    mova m0, [esp+ 7*16]
%endif

    punpcklqdq m7, m2, m10
    punpckhqdq m2, m10
    punpcklqdq m10, m1, m9
    punpckhqdq m1, m9
    punpcklqdq m9, m3, m11
    punpckhqdq m3, m11
    mova m11, %2
%if ARCH_X86_32
    %define m12 m3
%endif
    mova %2, m12
    punpcklqdq m12, m0, m11
    punpckhqdq m0, m11
%if %1 == 1
    mova m11, %2
%endif

%if ARCH_X86_64
    ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
    SWAP 0, 11, 1, 6, 5, 8, 7, 15
    SWAP 2, 14, 12, 9
    SWAP 3, 4, 13
%else
%if %1 == 0
    mova [esp+15*16], m9
    mova [esp+17*16], m12
    mova [esp+18*16], m0
    mova [esp+28*16], m10
    mova [esp+29*16], m1
    mova m3, [esp+0*16]
    mova m4, [esp+1*16]
    SWAP m5, m7
    SWAP m6, m2
%else
    SWAP 0, 7
    SWAP 3, 1, 2, 4, 6
%endif
%endif
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
%if ARCH_X86_64
    %define %%flat8mem [rsp+0*16]
    %define %%q2mem [rsp+1*16]
    %define %%q3mem [rsp+2*16]
%else
%if %1 == 4 || %1 == 6
    %define %%p2mem [esp+ 8*16]
    %define %%q2mem [esp+ 9*16]
    %define %%flat8mem [esp+10*16]
%else
%ifidn %2, v
    %define %%p2mem [esp+16*16]
    %define %%q2mem [esp+ 1*16]
    %define %%q3mem [esp+18*16]
    %define %%flat8mem [esp+ 0*16]
    %define %%flat16mem [esp+20*16]
%else
    %define %%p2mem [esp+27*16]
    %define %%q2mem [esp+28*16]
    %define %%q3mem [esp+29*16]
    %define %%flat8mem [esp+21*16]
    %define %%flat16mem
[esp+30*16] %endif %endif %xdefine m12reg m12 %endif %if ARCH_X86_32 lea stride3q, [strideq*3] %endif ; load data %ifidn %2, v %if ARCH_X86_32 mov mstrideq, strideq neg mstrideq %endif %if %1 == 4 lea tmpq, [dstq+mstrideq*2] mova m3, [tmpq+strideq*0] ; p1 mova m4, [tmpq+strideq*1] ; p0 mova m5, [tmpq+strideq*2] ; q0 mova m6, [tmpq+stride3q] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] ; we load p3 later %define %%p3mem [dstq+mstrideq*4] %if ARCH_X86_32 %define m13 m0 %define m14 m1 %define m15 m2 %endif mova m13, [tmpq+strideq*1] mova m3, [tmpq+strideq*2] mova m4, [tmpq+stride3q] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m15, [dstq+stride3q] %endif %if ARCH_X86_32 mova %%p2mem, m13 mova %%q2mem, m14 %define m13 %%p2mem %define m14 %%q2mem %if %1 != 6 mova %%q3mem, m15 %define m15 %%q3mem %endif %endif %endif %else ; %2 == h ; load lines %if %1 == 4 ; transpose 4x16 movd m7, [dstq+strideq*0-2] movd m3, [dstq+strideq*1-2] movd m4, [dstq+strideq*2-2] movd m5, [dstq+stride3q -2] lea tmpq, [dstq+strideq*4] punpcklbw m7, m3 punpcklbw m4, m5 movd m3, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] movd m5, [tmpq+strideq*2-2] movd m6, [tmpq+stride3q -2] lea tmpq, [tmpq+strideq*4] punpcklbw m3, m1 punpcklbw m5, m6 movd m0, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] punpcklbw m0, m1 movd m1, [tmpq+strideq*2-2] movd m2, [tmpq+stride3q -2] punpcklbw m1, m2 punpcklqdq m7, m0 punpcklqdq m4, m1 lea tmpq, [tmpq+strideq*4] movd m0, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] punpcklbw m0, m1 movd m1, [tmpq+strideq*2-2] movd m2, [tmpq+stride3q -2] punpcklbw m1, m2 punpcklqdq m3, m0 punpcklqdq m5, m1 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 punpcklwd m6, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 
punpckhwd m3, m5
    ; xm6: A0-3,B0-3,C0-3,D0-3
    ; xm7: A8-11,B8-11,C8-11,D8-11
    ; xm4: A4-7,B4-7,C4-7,D4-7
    ; xm3: A12-15,B12-15,C12-15,D12-15
    punpckldq m5, m6, m4
    punpckhdq m6, m4
    punpckldq m4, m7, m3
    punpckhdq m7, m3
    ; xm5: A0-7,B0-7
    ; xm6: C0-7,D0-7
    ; xm4: A8-15,B8-15
    ; xm7: C8-15,D8-15
    punpcklqdq m3, m5, m4
    punpckhqdq m5, m5, m4
    punpcklqdq m4, m6, m7
    punpckhqdq m6, m7
    ; xm3: A0-15
    ; xm5: B0-15
    ; xm4: C0-15
    ; xm6: D0-15
    SWAP 4, 5
%elif %1 == 6 || %1 == 8
    ; transpose 8x16
    ; each row contributes 8 bytes starting width/2 pixels left of the edge
    movq m7, [dstq+strideq*0-%1/2]
    movq m3, [dstq+strideq*1-%1/2]
    movq m4, [dstq+strideq*2-%1/2]
    movq m5, [dstq+stride3q -%1/2]
    lea tmpq, [dstq+strideq*8]
    punpcklbw m7, m3
    punpcklbw m4, m5
    movq m3, [tmpq+strideq*0-%1/2]
    movq m1, [tmpq+strideq*1-%1/2]
    movq m5, [tmpq+strideq*2-%1/2]
    movq m6, [tmpq+stride3q -%1/2]
    lea tmpq, [dstq+strideq*4]
    punpcklbw m3, m1
    punpcklbw m5, m6
    movq m6, [tmpq+strideq*0-%1/2]
    movq m0, [tmpq+strideq*1-%1/2]
    movq m1, [tmpq+strideq*2-%1/2]
    movq m2, [tmpq+stride3q -%1/2]
    lea tmpq, [tmpq+strideq*8]
    punpcklbw m6, m0
    punpcklbw m1, m2
    movq m2, [tmpq+strideq*2-%1/2]
    movq m0, [tmpq+stride3q -%1/2]
    punpcklbw m2, m0
%if ARCH_X86_64
    SWAP m15, m2
%else
%define m15 [esp+3*16]
    mova m15, m2
%endif
    movq m0, [tmpq+strideq*0-%1/2]
    movq m2, [tmpq+strideq*1-%1/2]
    punpcklbw m0, m2
    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd m2, m7, m4
    punpckhwd m7, m4
    punpcklwd m4, m3, m5
    punpckhwd m3, m5
    punpcklwd m5, m6, m1
    punpckhwd m6, m1
    punpcklwd m1, m0, m15
    punpckhwd m0, m15
%if ARCH_X86_64
    SWAP m15, m0
%else
    mova m15, m0
%endif
    ; xm2: A0-3,B0-3,C0-3,D0-3
    ; xm7: E0-3,F0-3,G0-3,H0-3
    ; xm4: A8-11,B8-11,C8-11,D8-11
    ; xm3: E8-11,F8-11,G8-11,H8-11
    ; xm5: A4-7,B4-7,C4-7,D4-7
    ; xm6: E4-7,F4-7,G4-7,H4-7
    ; xm1: A12-15,B12-15,C12-15,D12-15
    ; xm0: E12-15,F12-15,G12-15,H12-15
    punpckldq m0, m2, m5
    punpckhdq m2, m5
    punpckldq m5, m7, m6
%if %1 != 6
    punpckhdq m7, m6
%endif
    punpckldq m6, m4, m1
    punpckhdq m4, m1
    punpckldq m1, m3, m15
%if %1 != 6
    punpckhdq m3, m15
%if ARCH_X86_64
    SWAP m15, m3
%else
    mova m15, m3
%endif
%endif
    ; xm0: A0-7,B0-7
    ; xm2: C0-7,D0-7
    ; xm5: E0-7,F0-7
    ; xm7: G0-7,H0-7
    ; xm6: A8-15,B8-15
    ; xm4: C8-15,D8-15
    ; xm1: E8-15,F8-15
    ; xm3: G8-15,H8-15
    punpcklqdq m3, m0, m6
    punpckhqdq m0, m6
    punpckhqdq m6, m2, m4
    punpcklqdq m2, m4
    punpcklqdq m4, m5, m1
    punpckhqdq m5, m1
%if %1 == 8
    punpcklqdq m1, m7, m15
    punpckhqdq m7, m15
    ; xm3: A0-15
    ; xm0: B0-15
    ; xm2: C0-15
    ; xm6: D0-15
    ; xm4: E0-15
    ; xm5: F0-15
    ; xm1: G0-15
    ; xm7: H0-15
%if ARCH_X86_64
    SWAP 11, 3, 2
    SWAP 13, 0
    SWAP 6, 5, 4
    SWAP 14, 1
    SWAP 15, 7
    ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
    mova [rsp+21*16], m11
%define %%p3mem [rsp+21*16]
%else
%define m11 [esp+26*16]
%define m13 [esp+27*16]
%define m14 [esp+28*16]
%define m15 [esp+29*16]
    mova m11, m3
    mova m13, m0
    SWAP 3, 2
    SWAP 6, 5, 4
    mova m14, m1
    mova m15, m7
%define %%p3mem [esp+26*16]
%endif
%else
%if ARCH_X86_64
    SWAP 13, 3, 0
    SWAP 14, 5, 6, 4, 2
    ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
%else
%define m13 %%p2mem
%define m14 %%q2mem
    mova m13, m3
    mova m14, m5
    SWAP 3, 0
    SWAP 5, 6, 4, 2
    ; 0,2,6,4 -> 3,4,5,6
%endif
%endif
%else
%if ARCH_X86_64
    mova [rsp+20*16], m12
%endif
    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
%if ARCH_X86_32
%xdefine m8 m0
%xdefine m9 m1
%xdefine m10 m2
%xdefine m11 m3
%xdefine m12 m4
%xdefine m13 m5
%xdefine m14 m6
%xdefine m15 m7
    ; x86-32: rows 8-15 are loaded first and parked on the stack
    lea tmpq, [dstq+strideq*8]
    movu m8, [tmpq+strideq*0-8]
    movu m9, [tmpq+strideq*1-8]
    movu m10, [tmpq+strideq*2-8]
    movu m11, [tmpq+stride3q -8]
    lea tmpq, [tmpq+strideq*4]
    movu m12, [tmpq+strideq*0-8]
    movu m13, [tmpq+strideq*1-8]
    movu m14, [tmpq+strideq*2-8]
    movu m15, [tmpq+stride3q -8]
    mova [esp+ 8*16], m8
    mova [esp+ 9*16], m9
    mova [esp+10*16], m10
    mova [esp+11*16], m11
    mova [esp+12*16], m12
    mova [esp+13*16], m13
    mova [esp+14*16], m14
    mova [esp+15*16], m15
%endif
    movu m0, [dstq+strideq*0-8]
    movu m1, [dstq+strideq*1-8]
    movu m2, [dstq+strideq*2-8]
    movu m3, [dstq+stride3q -8]
    lea tmpq, [dstq+strideq*4]
    movu m4, [tmpq+strideq*0-8]
    movu m5, [tmpq+strideq*1-8]
    movu m6, [tmpq+strideq*2-8]
    movu m7, [tmpq+stride3q -8]
    lea tmpq, [tmpq+strideq*4]
%if ARCH_X86_64
    movu m8, [tmpq+strideq*0-8]
    movu m9, [tmpq+strideq*1-8]
    movu m10, [tmpq+strideq*2-8]
    movu m11, [tmpq+stride3q -8]
    lea tmpq, [tmpq+strideq*4]
    movu m12, [tmpq+strideq*0-8]
    movu m13, [tmpq+strideq*1-8]
    movu m14, [tmpq+strideq*2-8]
    movu m15, [tmpq+stride3q -8]
%endif
%if ARCH_X86_64
    TRANSPOSE_16X16B 0, [rsp+11*16]
    ; park the outer taps; they are re-read by the flat8out test and the
    ; flat16 filter, and by the second transpose before the store
    mova [rsp+12*16], m1
    mova [rsp+13*16], m2
    mova [rsp+14*16], m3
    mova [rsp+15*16], m12
    mova [rsp+16*16], m13
    mova [rsp+17*16], m14
    mova [rsp+18*16], m15
    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
    SWAP 12, 4, 7
    SWAP 13, 5, 8
    SWAP 3, 6, 9
    SWAP 10, 14
    SWAP 11, 15
    mova [rsp+21*16], m12
%define %%p3mem [rsp+21*16]
    mova m12, [rsp+20*16]
%else
    TRANSPOSE_16X16B 0, [esp+16*16]
%define %%p3mem [esp+26*16]
%define m11 %%p3mem
%define m13 %%p2mem
%define m14 %%q2mem
%define m15 %%q3mem
%endif
%endif ; if 4 elif 6 or 8 else 16
%endif ; if v else h

    ; load L/E/I/H
%if ARCH_X86_32
    mov l_strideq, l_stridem
%endif
%ifidn %2, v
    movu m1, [lq]
    movu m0, [lq+l_strideq]
%else
%if ARCH_X86_32
    lea l_stride3q, [l_strideq*3]
%endif
    movq xm1, [lq]
    movq xm2, [lq+l_strideq*2]
    movhps xm1, [lq+l_strideq]
    movhps xm2, [lq+l_stride3q]
    shufps m0, m1, m2, q3131
    shufps m1, m2, q2020
%if ARCH_X86_32
    ; l_stride3q and stride3q share a register on x86-32, so stride3q has to
    ; be recomputed after the level loads
    lea stride3q, [strideq*3]
%endif
%endif
%if ARCH_X86_32
%ifidn %2, v
    mov lutd, lutm
%endif
%endif
    pxor m2, m2
    pcmpeqb m7, m2, m0
    pand m1, m7
    por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
    pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
    pcmpeqb m2, m0 ; !L
    psrlq m7, m0, [lutq+128]
    pand m7, [PIC_sym(pb_63)]
    pminub m7, minlvl
    pmaxub m7, [PIC_sym(pb_1)] ; I
    pand m1, m0, [PIC_sym(pb_240)]
    psrlq m1, 4 ; H
    paddb m0, [PIC_sym(pb_2)]
    paddb m0, m0
    paddb m0, m7 ; E
    ; flip the top bit of H/I/E so the signed pcmpgtb below behaves like an
    ; unsigned compare (the compared values get the same xor)
    pxor m1, [PIC_sym(pb_128)]
    pxor m7, [PIC_sym(pb_128)]
    pxor m0, [PIC_sym(pb_128)]
    SWAP 2, 7
%if ARCH_X86_64
    SWAP 0, 8
    SWAP 2, 10
%else
%ifidn %2, v
    mov mstrideq, strideq
    neg mstrideq
%if %1 == 4
    lea tmpq, [dstq+mstrideq*2]
%elif %1 == 6 || %1 == 8
    lea tmpq, [dstq+mstrideq*4]
%endif
%endif
    mova [esp+3*16], m0
    mova [esp+4*16], m2
%endif
    ABSSUB m0, m3, m4, m2 ; abs(p1-p0)
    ; m7 is the !L mask here (after SWAP 2, 7): lanes whose level is zero are
    ; raised to 0xff - NOTE(review): presumably so they fail the threshold
    ; compares below and are left unfiltered; confirm intent
    pmaxub m0, m7
    ABSSUB m2, m5, m6, m7 ; abs(q1-q0)
    pmaxub m0, m2
%if %1 == 4
    pxor m0, [PIC_sym(pb_128)]
    pcmpgtb m7, m0, m1 ; hev
%if ARCH_X86_64
    SWAP 7, 11
%else
    mova [esp+5*16], m7
%endif
%else
    pxor m7, m0, [PIC_sym(pb_128)]
    pcmpgtb m7, m1 ; hev
%if ARCH_X86_64
    SWAP 7, 11
%else
    mova [esp+5*16], m7
%endif
%if %1 == 6
    ABSSUB m1, m13, m4, m7 ; abs(p2-p0)
    pmaxub m1, m0
%else
    mova m2, %%p3mem
    ABSSUB m1, m2, m4, m7 ; abs(p3-p0)
    pmaxub m1, m0
    ABSSUB m7, m13, m4, m2 ; abs(p2-p0)
    pmaxub m1, m7
%endif
    ABSSUB m7, m5, m14, m2 ; abs(q2-q0)
    pmaxub m1, m7
%if %1 != 6
    ABSSUB m7, m5, m15, m2 ; abs(q3-q0)
    pmaxub m1, m7
%endif
    pxor m1, [PIC_sym(pb_128)]
    pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in
%if ARCH_X86_64
    SWAP 1, 9
%else
    mova [esp+6*16], m1
%endif
%if %1 == 6
    ABSSUB m7, m13, m3, m1 ; abs(p2-p1)
%else
    mova m2, %%p3mem
    ABSSUB m7, m2, m13, m1 ; abs(p3-p2)
    ABSSUB m2, m13, m3, m1 ; abs(p2-p1)
    pmaxub m7, m2
    ABSSUB m2, m14, m15, m1 ; abs(q3-q2)
    pmaxub m7, m2
%endif
    ABSSUB m2, m14, m6, m1 ; abs(q2-q1)
    pmaxub m7, m2
%if ARCH_X86_32
%define m12 m1
    mova m12, maskmem
%endif
    pand m2, m12, mask1
    pcmpeqd m2, m12
    pand m7, m2 ; only apply fm-wide to wd>4 blocks
    pmaxub m0, m7
    pxor m0, [PIC_sym(pb_128)]
%endif ; %if %1 == 4 else
%if ARCH_X86_64
    SWAP 2, 10
    pcmpgtb m0, m2
%else
    pcmpgtb m0, [esp+4*16]
%endif
    ABSSUB m1, m3, m6, m7 ; abs(p1-q1)
    ABSSUB m7, m4, m5, m2 ; abs(p0-q0)
    paddusb m7, m7
    pand m1, [PIC_sym(pb_254)]
    psrlq m1, 1
    paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pxor m1, [PIC_sym(pb_128)]
%if ARCH_X86_64
    pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
%else
    pcmpgtb m1, [esp+3*16]
%endif
    por m0, m1
%if %1 == 16
%if ARCH_X86_64
    SWAP 0, 8
%else
    mova [esp+3*16], m0
%endif
    ; flat8out: compare the outer taps p6..p4 against p0 and q4..q6 against q0
%ifidn %2, v
    lea tmpq, [dstq+mstrideq*8]
    mova m0, [tmpq+strideq*1]
%else
    mova m0, [rsp+12*16]
%endif
    ABSSUB m1, m0, m4, m2 ; abs(p6-p0)
%ifidn %2, v
    mova m0, [tmpq+strideq*2]
%else
    mova m0, [rsp+13*16]
%endif
    ABSSUB m2, m0, m4, m7 ; abs(p5-p0)
    pmaxub m1, m2
%ifidn %2, v
    mova m0, [tmpq+stride3q]
%else
    mova m0, [rsp+14*16]
%endif
    ABSSUB m2, m0, m4, m7 ; abs(p4-p0)
    pmaxub m1, m2
%ifidn %2, v
    lea tmpq, [dstq+strideq*4]
    mova m0, [tmpq+strideq*0]
%else
    mova m0, [rsp+15*16]
%endif
    ABSSUB m2, m0, m5, m7 ; abs(q4-q0)
    pmaxub m1, m2
%ifidn %2, v
    mova m0, [tmpq+strideq*1]
%else
    mova m0, [rsp+16*16]
%endif
    ABSSUB m2, m0, m5, m7 ; abs(q5-q0)
    pmaxub m1, m2
%ifidn %2, v
    mova m0, [tmpq+strideq*2]
%else
    mova m0, [rsp+17*16]
%endif
    ABSSUB m2, m0, m5, m7 ; abs(q6-q0)
    pmaxub m1, m2
    pxor m1, [PIC_sym(pb_128)]
    pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out
%if ARCH_X86_64
    por m1, m9 ; !flat8in | !flat8out
%else
    por m1, [esp+6*16]
%define m12 m7
    mova m12, maskmem
%endif
    pand m2, m12, mask2
    pcmpeqd m2, m12
    pandn m1, m2 ; flat16
%if ARCH_X86_64
    pandn m2, m8, m1 ; flat16 & fm
%else
    pandn m2, [esp+3*16], m1 ; flat16 & fm
    mova %%flat16mem, m2
%endif
    SWAP 1, 2
    pand m2, m12, mask1
    pcmpeqd m2, m12
%if ARCH_X86_64
    pandn m9, m2 ; flat8in
    pandn m2, m8, m9
    SWAP 2, 9
%else
    pandn m0, [esp+6*16], m2
    pandn m2, [esp+3*16], m0
    mova [esp+6*16], m2
%endif
    pand m2, m12, mask0
    pcmpeqd m2, m12
%if ARCH_X86_64
pandn m8, m2
    pandn m2, m9, m8 ; fm & !flat8 & !flat16
    SWAP 2, 8
    pandn m2, m1, m9 ; flat8 & !flat16
    SWAP 2, 9
    SWAP 0, 8
    SWAP 1, 10
%else
    pandn m0, [esp+3*16], m2
    pandn m2, [esp+6*16], m0
    SWAP 2, 0
    pandn m2, m1, [esp+6*16]
    mova %%flat8mem, m2
%endif
%elif %1 != 4
    ; wd 6/8: split the filter mask into "flat8 & fm" and "fm & !flat8"
%if ARCH_X86_64
    SWAP 1, 9
%else
%define m12 m7
    mova m12, maskmem
    mova m1, [esp+6*16]
%endif
    pand m2, m12, mask1
    pcmpeqd m2, m12
    pandn m1, m2
    pandn m2, m0, m1 ; flat8 & fm
    pand m1, m12, mask0
    pcmpeqd m1, m12
    pandn m0, m1
    pandn m1, m2, m0 ; fm & !flat8
    SWAP 1, 2, 0
%if ARCH_X86_64
    SWAP 1, 9
%else
    mova %%flat8mem, m1
%endif
%else
    ; wd 4: only the plain filter mask is needed
%if ARCH_X86_32
%define m12 m1
    mova m12, maskmem
%endif
    pand m2, m12, mask0
    pcmpeqd m2, m12
    pandn m0, m2 ; fm
%endif

    ; short filter
    mova m1, [PIC_sym(pb_128)]
%if ARCH_X86_64
    SWAP 7, 11
%else
    mova m7, [esp+5*16]
%endif
    ; xor with pb_128 moves p1/q1/p0/q0 into signed range so the signed
    ; saturating byte ops below can be used; it is undone before the store
    pxor m3, m1
    pxor m6, m1
    pxor m4, m1
    pxor m5, m1
    psubsb m1, m3, m6 ; iclip_diff(p1-q1)
    pand m1, m7 ; f=iclip_diff(p1-q1)&hev
    psubsb m2, m5, m4
    paddsb m1, m2
    paddsb m1, m2
    paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f)
    mova m2, [PIC_sym(pb_16)]
    pand m0, m1 ; f&=fm
    paddsb m1, m0, [PIC_sym(pb_3)]
    paddsb m0, [PIC_sym(pb_4)]
    ; there is no per-byte arithmetic shift: clear the low 3 bits, shift the
    ; whole qword right by 3, then sign-extend each byte with (x ^ 16) - 16
    ; (constant values inferred from the pb_248 / pb_16 names)
    pand m1, [PIC_sym(pb_248)]
    pand m0, [PIC_sym(pb_248)]
    psrlq m1, 3
    psrlq m0, 3
    pxor m1, m2
    pxor m0, m2
    psubb m1, m2 ; f2
    psubb m0, m2 ; f1
    mova m2, [PIC_sym(pb_128)]
    paddsb m4, m1
    psubsb m5, m0
    pxor m4, m2
    pxor m5, m2
    pxor m0, m2
    pxor m1, m1
    pavgb m0, m1 ; f=(f1+1)>>1
    psubb m0, [PIC_sym(pb_64)]
    pandn m7, m0 ; f&=!hev
    paddsb m3, m7
    psubsb m6, m7
    pxor m3, m2
    pxor m6, m2

%if %1 == 16
    ; flat16 filter
%ifidn %2, v
    lea tmpq, [dstq+mstrideq*8]
    mova m0, [tmpq+strideq*1] ; p6
    mova m2, [tmpq+strideq*2] ; p5
    mova m7, [tmpq+stride3q] ; p4
%else
    mova m0, [rsp+12*16]
    mova m2, [rsp+13*16]
    mova m7, [rsp+14*16]
%endif
%if ARCH_X86_64
    SWAP 1, 10
    mova %%flat8mem, m9
    mova %%q2mem, m14
    mova %%q3mem, m15
    SWAP 0, 8
    SWAP 1, 9
%else
%ifidn %2, v
    ; x86-32: spill p6 and the short-filter outputs, then re-point the
    ; high register names at the freed low registers / stack slots
    mova [esp+17*16], m0
    mova [esp+19*16], m3
    mova [esp+21*16], m4
    mova [esp+22*16], m5
    mova [esp+23*16], m6
%xdefine m11 m3
%xdefine m14 m4
%xdefine m15 m5
%xdefine m10 m6
%define m13 %%p2mem
%define m8 [esp+17*16]
%define m9 %%flat16mem
%define m3 [esp+19*16]
%define m4 [esp+21*16]
%define m5 [esp+22*16]
%define m6 [esp+23*16]
%else
    mova [esp+31*16], m0
    mova [esp+32*16], m3
    mova [esp+33*16], m4
    mova [esp+34*16], m5
    mova [esp+35*16], m6
%xdefine m11 m3
%xdefine m14 m4
%xdefine m15 m5
%xdefine m10 m6
%define m13 %%p2mem
%define m8 [esp+31*16]
%define m9 %%flat16mem
%define m3 [esp+32*16]
%define m4 [esp+33*16]
%define m5 [esp+34*16]
%define m6 [esp+35*16]
%endif
%endif
    ; The running 16-tap sum is kept as 16-bit words in m10 (low 8 pixels)
    ; and m11 (high 8 pixels). Every step A..L below slides the window by
    ; adding (-old,+new) byte pairs via pmaddubsw, rounds with pmulhrsw by
    ; pw_2048 and blends the result with the unfiltered pixel under the
    ; flat16 mask in m9.
    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
    ; write -6
    mova m11, %%p3mem
%if ARCH_X86_64
    punpcklbw m14, m8, m11
    punpckhbw m15, m8, m11
%else
    punpcklbw m14, m0, m11
    punpckhbw m15, m0, m11
%endif
%ifidn %2, v
    ; keep the unfiltered p3: in v mode its memory location is overwritten
    ; in step C but the value is needed again in step E
    mova [rsp+5*16], m11
%endif
    pmaddubsw m10, m14, [PIC_sym(pb_7_1)]
    pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3
    punpcklbw m0, m2, m7
    punpckhbw m1, m2, m7
    pmaddubsw m0, [PIC_sym(pb_2)]
    pmaddubsw m1, [PIC_sym(pb_2)]
    paddw m10, m0
    paddw m11, m1 ; p6*7+p5*2+p4*2+p3
    punpcklbw m0, m13, m3
    punpckhbw m1, m13, m3
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    paddw m10, m0
    paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1
    punpcklbw m0, m4, m5
    punpckhbw m1, m4, m5
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    paddw m10, m0
    paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m2
    por m0, m1
%ifidn %2, v
    mova [tmpq+strideq*2], m0 ; p5
%else
    mova [rsp+13*16], m0
%endif
    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
    ; write -5
    pmaddubsw m14, [PIC_sym(pb_m1_1)]
    pmaddubsw m15, [PIC_sym(pb_m1_1)]
    paddw m10, m14
    paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
    punpcklbw m0, m8, m6
    punpckhbw m1, m8, m6
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    mova [rsp+3*16], m0
    mova [rsp+4*16], m1
    paddw m10, m0
    paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m7
    por m0, m1
%ifidn %2, v
    mova [tmpq+stride3q], m0 ; p4
%else
    mova [rsp+14*16], m0
%endif
    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
    ; write -4
    mova m14, %%q2mem
    punpcklbw m0, m8, m13
    punpckhbw m1, m8, m13
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
    punpcklbw m0, m2, m14
    punpckhbw m2, m14
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m2, [PIC_sym(pb_m1_1)]
    mova [rsp+1*16], m0
    paddw m10, m0
    paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, %%p3mem
    por m0, m1
%ifidn %2, v
    mova [tmpq+strideq*4], m0 ; p3
%else
    mova [rsp+19*16], m0
%endif
    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
    ; write -3
    mova m15, %%q3mem
    punpcklbw m0, m8, m3
    punpckhbw m1, m8, m3
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
    punpcklbw m0, m7, m15
    punpckhbw m7, m15
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m7, [PIC_sym(pb_m1_1)]
    mova [rsp+2*16], m0
%if ARCH_X86_32
%ifidn %2, v
    mova [esp+24*16], m7
%else
    mova [esp+36*16], m7
%endif
%endif
    paddw m10, m0
    paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m13
    por m0, m1
    mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F
    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
    ; write -2
    punpcklbw m0, m8, m4
    punpckhbw m1, m8, m4
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
%if ARCH_X86_64
    SWAP 7, 8
%endif
%ifidn %2, v
    mova m1, [dstq+strideq*4] ; q4
    mova m7, [rsp+5*16] ; (pre-filter) p3
%else
    mova m1, [rsp+15*16]
    mova m7, %%p3mem ; (pre-filter) p3
%endif
    ; the pair is interleaved as (q4,p3), so the product is p3-q4 and it is
    ; subtracted (psubw) here and again in step J
    punpcklbw m0, m1, m7
    punpckhbw m1, m1, m7
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    mova [rsp+7*16], m0
    mova [rsp+5*16], m1
    psubw m10, m0
    psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m3
    por m0, m1
    mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G
    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
    ; write -1
%ifidn %2, v
    mova m7, [tmpq+strideq*1] ; p6
    lea tmpq, [dstq+strideq*4]
    mova m1, [tmpq+strideq*1] ; q5
%else
    mova m7, [rsp+12*16] ; p6
    mova m1, [rsp+16*16]
%endif
    punpcklbw m0, m7, m5
    punpckhbw m7, m5
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m7, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
    punpcklbw m7, m13, m1
    pmaddubsw m7, [PIC_sym(pb_m1_1)]
    mova [rsp+9*16], m7
    paddw m10, m7
%if ARCH_X86_64
    punpckhbw m13, m1
    mova m1, [rsp+6*16]
    SWAP 1, 13
%else
    punpckhbw m7, m13, m1
    mova m1, [esp+6*16]
    mova m13, m1
    SWAP 1, 7
%endif
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    mova [rsp+10*16], m1
    paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
    pmulhrsw m7, m10, [PIC_sym(pw_2048)]
    pmulhrsw m0, m11, [PIC_sym(pw_2048)]
    packuswb m7, m0
    pand m7, m9
    pandn m0, m9, m4
    por m7, m0
    mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H
    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
    ; write +0
%ifidn %2, v
    mova m7, [tmpq+strideq*2] ; q6
%else
    mova m7, [rsp+17*16]
%endif
    paddw m10, [rsp+3*16]
    paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
    punpcklbw m0, m3, m7
    punpckhbw m1, m3, m7
%if ARCH_X86_64
    mova m3, [rsp+8*16]
%endif
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    mova [rsp+3*16], m0
    mova [rsp+4*16], m1
    paddw m10, m0
    paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m5
    por m0, m1
%if ARCH_X86_32
    mova m1, [esp+8*16]
    mova m3, m1
%endif
    mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I
    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
    ; write +1
    paddw m10, [rsp+1*16]
    paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
    punpcklbw m0, m4, m7
    punpckhbw m2, m4, m7
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m2, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
%if ARCH_X86_64
    mova m4, [rsp+6*16]
%else
%define m4 [esp+6*16]
%endif
    pmulhrsw m2, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m2, m1
    pand m2, m9
    pandn m1, m9, m6
    por m2, m1 ; don't clobber q1/m6 since we need it in K
    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
    ; write +2
    paddw m10, [rsp+2*16]
%if ARCH_X86_64
    SWAP 7, 8
    paddw m11, m7
%else
    mova m8, m7
%ifidn %2, v
    paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
%else
    paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
%endif
%endif
    punpcklbw m0, m5, m8
    punpckhbw m1, m5, m8
%if ARCH_X86_64
    mova m5, [rsp+8*16]
%else
%define m5 [esp+8*16]
%endif
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
    pmulhrsw m7, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m7, m1
    pand m7, m9
    pandn m1, m9, m14
    por m7, m1 ; don't clobber q2/m14 since we need it in K
    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
    ; write +3
    psubw m10, [rsp+7*16]
    psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
    punpcklbw m0, m6, m8
    punpckhbw m1, m6, m8
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m15
    por m0, m1
%ifidn %2, v
    mova [tmpq+mstrideq], m0 ; q3
%else
    mova [rsp+20*16], m0
%endif
    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
    ; write +4
    paddw m10, [rsp+ 9*16]
    paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
    punpcklbw m0, m14, m8
    punpckhbw m1, m14, m8
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
    pmulhrsw m0, m10, [PIC_sym(pw_2048)]
    pmulhrsw m1, m11, [PIC_sym(pw_2048)]
    packuswb m0, m1
    pand m0, m9
%ifidn %2, v
    pandn m1, m9, [tmpq+strideq*0]
%else
    pandn m1, m9, [rsp+15*16]
%endif
    por m0, m1
%ifidn %2, v
    mova [tmpq+strideq*0], m0 ; q4
%else
    mova [rsp+15*16], m0
%endif
    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
    ; write +5
    paddw m10, [rsp+3*16]
    paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
    punpcklbw m0, m15, m8
    punpckhbw m1, m15, m8
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m10, m0
    paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
    pmulhrsw m10, [PIC_sym(pw_2048)]
    pmulhrsw m11, [PIC_sym(pw_2048)]
    packuswb m10, m11
    pand m10, m9
%ifidn %2, v
    pandn m11, m9, [tmpq+strideq*1]
%else
    pandn m11, m9, [rsp+16*16]
%endif
    por m10, m11
%ifidn %2, v
    mova [tmpq+strideq*1], m10 ; q5
%else
    mova [rsp+16*16], m10
%endif
    ; hand the flat16 results for p1..q2 over to the flat8 stage below as
    ; its "unfiltered" inputs
%if ARCH_X86_64
    SWAP 0, 8
    SWAP 1, 9
    SWAP 14, 7
%else
%xdefine m3 m11
%xdefine m4 m14
%xdefine m5 m15
%xdefine m6 m10
    mova %%q2mem, m7
%ifidn %2, v
    mova m3, [esp+19*16]
%else
    mova m3, [esp+32*16]
%endif
    mova m4, [esp+ 6*16]
    mova m5, [esp+ 8*16]
%endif
    SWAP m6, m2
%if ARCH_X86_64
    mova m9, %%flat8mem
%endif
%ifidn %2, v
    lea tmpq, [dstq+mstrideq*4]
%endif
%endif ; if %1 == 16
%if %1 >= 8
    ; flat8 filter
    ; m2 / m7 hold the low / high halves of the running 8-tap sum (with the
    ; +4 rounding term folded in); each output is sum >> 3 blended under m9
%if ARCH_X86_32
%define m9 %%flat8mem
%define m11 m1
%define m13 %%p2mem
%define m14 %%q2mem
%define m15 %%q3mem
%endif
    mova m11, %%p3mem
    punpcklbw m0, m11, m3
    punpcklbw m7, m13, m4
    pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
    pmaddubsw m7, [PIC_sym(pb_2_1)]
    paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0
    punpcklbw m7, m5, [PIC_sym(pb_4)]
    pmaddubsw m7, [PIC_sym(pb_1)]
    paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
    punpckhbw m1, m11, m3
    pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
    punpckhbw m0, m13, m4
    pmaddubsw m0, [PIC_sym(pb_2_1)]
    paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0
    punpckhbw m0, m5, [PIC_sym(pb_4)]
    pmaddubsw m0, [PIC_sym(pb_1)]
    paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
    psrlw m0, m2, 3
    psrlw m1, m7, 3
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m13
    por m0, m1 ; p2
%ifidn %2, v
    mova [tmpq+strideq*1], m0
%else
%if ARCH_X86_64
    SWAP 0, 10
%else
    mova [esp+2*16], m0
%endif
%endif
%if ARCH_X86_32
    mova m11, %%p3mem
%endif
    punpcklbw m0, m11, m3
    punpckhbw m1, m11, m3
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m2, m0
    paddw m7, m1
    punpcklbw m0, m13, m6
    punpckhbw m1, m13, m6
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m2, m0
    paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
    psrlw m0, m2, 3
    psrlw m1, m7, 3
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m3
    por m0, m1 ; p1
%ifidn %2, v
    mova [tmpq+strideq*2], m0
%else
    mova [rsp+0*16], m0
%endif
%if ARCH_X86_32
    mova m11, %%p3mem
%endif
    punpcklbw m0, m11, m3
    punpckhbw m1, m11, m3
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    psubw m2, m0
    psubw m7, m1
    punpcklbw m0, m4, m14
    punpckhbw m1, m4, m14
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    paddw m2, m0
    paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
    psrlw m0, m2, 3
    psrlw m1, m7, 3
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m4
    por m0, m1 ; p0
%ifidn %2, v
    mova [tmpq+stride3q], m0
%else
    mova [rsp+1*16], m0
%endif
    punpcklbw m0, m5, m15
    punpckhbw m1, m5, m15
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    paddw m2, m0
    paddw m7, m1
%if ARCH_X86_32
    mova m11, %%p3mem
%endif
    punpcklbw m0, m11, m4
    punpckhbw m11, m11, m4
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m11, [PIC_sym(pb_1)]
    psubw m2, m0
    psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
    psrlw m0, m2, 3
    psrlw m11, m7, 3
    packuswb m0, m11
    pand m0, m9
    pandn m11, m9, m5
    por m11, m0 ; q0
%ifidn %2, v
    mova [dstq+strideq*0], m11
%elif ARCH_X86_32
    mova [esp+8*16], m11
%endif
    punpcklbw m0, m5, m15
    punpckhbw m1, m5, m15
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m2, m0
    paddw m7, m1
    punpcklbw m0, m13, m6
    punpckhbw m1, m13, m6
    pmaddubsw m0, [PIC_sym(pb_m1_1)]
    pmaddubsw m1, [PIC_sym(pb_m1_1)]
    paddw m2, m0
    paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
    psrlw m0, m2, 3
    psrlw m1, m7, 3
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m6
    por m0, m1 ; q1
%ifidn %2, v
    mova [dstq+strideq*1], m0
%else
%if ARCH_X86_64
    SWAP 0, 13
%else
    mova [esp+9*16], m0
%endif
%endif
    punpcklbw m0, m3, m6
    punpckhbw m1, m3, m6
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    psubw m2, m0
    psubw m7, m1
    punpcklbw m0, m14, m15
    punpckhbw m1, m14, m15
    pmaddubsw m0, [PIC_sym(pb_1)]
    pmaddubsw m1, [PIC_sym(pb_1)]
    paddw m2, m0
    paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
    psrlw m2, 3
    psrlw m7, 3
    packuswb m2, m7
    pand m2, m9
    pandn m7, m9, m14
    por m2, m7 ; q2
%ifidn %2, v
    mova [dstq+strideq*2], m2
%else
    mova m0, [rsp+0*16]
%if %1 == 8
    mova m1, [rsp+1*16]
    mova m4, %%p3mem
%if ARCH_X86_32
%define m10 [esp+2*16]
%define m11 [esp+8*16]
%define m13 [esp+9*16]
%endif
    ; 16x8 transpose
    punpcklbw m3, m4, m10
    punpckhbw m4, m10
    punpcklbw m5, m0, m1
    punpckhbw m0, m1
    punpcklbw m1, m11, m13
    punpckhbw m6, m11, m13
    punpcklbw m7, m2, m15
    punpckhbw m2, m15
%if ARCH_X86_64
    SWAP 2, 15
%else
    mova m15, m2
%endif
    punpcklwd m2, m3, m5
    punpckhwd m3, m5
    punpcklwd m5, m4, m0
    punpckhwd m4, m0
    punpcklwd m0, m1, m7
    punpckhwd m1, m7
    punpcklwd m7, m6, m15
    punpckhwd m6, m15
%if ARCH_X86_64
    SWAP 6, 15
%else
    mova m15, m6
%endif
    punpckldq m6, m2, m0
    punpckhdq m2, m0
    punpckldq m0, m3, m1
    punpckhdq m3, m1
    punpckldq m1, m5, m7
    punpckhdq m5, m7
    punpckldq m7, m4, m15
    punpckhdq m4, m15
    ; write 8x16
    movq [dstq+strideq*0-4], xm6
    movhps [dstq+strideq*1-4], xm6
    movq [dstq+strideq*2-4], xm2
    movhps [dstq+stride3q -4], xm2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0-4], xm0
    movhps [dstq+strideq*1-4], xm0
movq [dstq+strideq*2-4], xm3
    movhps [dstq+stride3q -4], xm3
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0-4], xm1
    movhps [dstq+strideq*1-4], xm1
    movq [dstq+strideq*2-4], xm5
    movhps [dstq+stride3q -4], xm5
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0-4], xm7
    movhps [dstq+strideq*1-4], xm7
    movq [dstq+strideq*2-4], xm4
    movhps [dstq+stride3q -4], xm4
    lea dstq, [dstq+strideq*4]
%else
    ; 16x16 transpose and store
    SWAP 6, 0
    SWAP 7, 1
%if ARCH_X86_64
    SWAP 5, 10, 2
    SWAP 8, 11
    SWAP 9, 13
    mova [rsp+21*16], m12
%else
    mova [esp+10*16], m2
%xdefine m8 m0
%xdefine m9 m1
%xdefine m10 m2
%xdefine m11 m3
%xdefine m12 m4
%xdefine m13 m5
%xdefine m14 m6
%xdefine m15 m7
%endif
    ; reload the rows that were parked on the stack during filtering
    mova m0, [rsp+11*16]
    mova m1, [rsp+12*16]
    mova m2, [rsp+13*16]
    mova m3, [rsp+14*16]
    mova m4, [rsp+19*16]
%if ARCH_X86_64
    mova m7, [rsp+ 1*16]
    mova m11, [rsp+20*16]
    mova m12, [rsp+15*16]
    mova m13, [rsp+16*16]
    mova m14, [rsp+17*16]
    TRANSPOSE_16X16B 1, [rsp+18*16]
%else
    mova m5, [esp+ 2*16]
    TRANSPOSE_16X16B 1, [esp+32*16]
    ; x86-32 stores rows 8-15 first, then returns to rows 0-7 via tmpq
    mov tmpq, dstq
    lea dstq, [dstq+strideq*8]
%endif
    movu [dstq+strideq*0-8], xm0
    movu [dstq+strideq*1-8], xm1
    movu [dstq+strideq*2-8], xm2
    movu [dstq+stride3q -8], xm3
    lea dstq, [dstq+strideq*4]
    movu [dstq+strideq*0-8], xm4
    movu [dstq+strideq*1-8], xm5
    movu [dstq+strideq*2-8], xm6
    movu [dstq+stride3q -8], xm7
%if ARCH_X86_64
    lea dstq, [dstq+strideq*4]
%else
%xdefine m8 m0
%xdefine m9 m1
%xdefine m10 m2
%xdefine m11 m3
%xdefine m12 m4
%xdefine m13 m5
%xdefine m14 m6
%xdefine m15 m7
    mova m8, [esp+11*16]
    mova m9, [esp+12*16]
    mova m10, [esp+13*16]
    mova m11, [esp+14*16]
    mova m12, [esp+26*16]
    mova m13, [esp+27*16]
    mova m14, [esp+ 0*16]
    mova m15, [esp+ 1*16]
    mov dstq, tmpq
%endif
    movu [dstq+strideq*0-8], xm8
    movu [dstq+strideq*1-8], xm9
    movu [dstq+strideq*2-8], xm10
    movu [dstq+stride3q -8], xm11
    lea dstq, [dstq+strideq*4]
    movu [dstq+strideq*0-8], xm12
    movu [dstq+strideq*1-8], xm13
    movu [dstq+strideq*2-8], xm14
    movu [dstq+stride3q -8], xm15
    lea dstq, [dstq+strideq*4]
%if ARCH_X86_32
    lea dstq, [dstq+strideq*8]
%else
    mova m12, [rsp+21*16]
%endif
%endif ; if %1 == 8
%endif ; ifidn %2, v
%elif %1 == 6
    ; flat6 filter
    ; m0 / m1 hold the low / high halves of the running sum; every output is
    ; rounded with pmulhrsw by pw_4096 and blended under the flat mask in m9
%if ARCH_X86_32
    mova [esp+3*16], m3
    mova [esp+4*16], m4
    mova [esp+5*16], m5
    mova [esp+6*16], m6
%xdefine m8 m3
%xdefine m10 m4
%xdefine m11 m5
%xdefine m15 m6
%define m3 [esp+3*16]
%define m4 [esp+4*16]
%define m5 [esp+5*16]
%define m6 [esp+6*16]
%define m9 %%flat8mem
%define m13 %%p2mem
%define m14 %%q2mem
%endif
    punpcklbw m8, m13, m5
    punpckhbw m11, m13, m5
    pmaddubsw m0, m8, [PIC_sym(pb_3_1)]
    pmaddubsw m1, m11, [PIC_sym(pb_3_1)]
    punpcklbw m7, m4, m3
    punpckhbw m10, m4, m3
    pmaddubsw m2, m7, [PIC_sym(pb_2)]
    pmaddubsw m15, m10, [PIC_sym(pb_2)]
    paddw m0, m2
    paddw m1, m15
    pmulhrsw m2, m0, [PIC_sym(pw_4096)]
    pmulhrsw m15, m1, [PIC_sym(pw_4096)]
    packuswb m2, m15
    pand m2, m9
    pandn m15, m9, m3
    por m2, m15
%ifidn %2, v
    mova [tmpq+strideq*2], m2 ; p1
%elif ARCH_X86_32
    mova [esp+11*16], m2
%endif
    pmaddubsw m8, [PIC_sym(pb_m1_1)]
    pmaddubsw m11, [PIC_sym(pb_m1_1)]
    paddw m0, m8
    paddw m1, m11
    punpcklbw m8, m13, m6
    punpckhbw m11, m13, m6
%if ARCH_X86_64
    SWAP 2, 13
%endif
    pmaddubsw m8, [PIC_sym(pb_m1_1)]
    pmaddubsw m11, [PIC_sym(pb_m1_1)]
    paddw m0, m8
    paddw m1, m11
    pmulhrsw m2, m0, [PIC_sym(pw_4096)]
    pmulhrsw m15, m1, [PIC_sym(pw_4096)]
    packuswb m2, m15
    pand m2, m9
    pandn m15, m9, m4
    por m2, m15
%ifidn %2, v
    mova [tmpq+stride3q], m2 ; p0
%elif ARCH_X86_32
    mova [esp+8*16], m2
%endif
    paddw m0, m8
    paddw m1, m11
    punpcklbw m8, m3, m14
    punpckhbw m11, m3, m14
%if ARCH_X86_64
    SWAP 2, 14
%endif
    pmaddubsw m2, m8, [PIC_sym(pb_m1_1)]
    pmaddubsw m15, m11, [PIC_sym(pb_m1_1)]
    paddw m0, m2
    paddw m1, m15
    pmulhrsw m2, m0, [PIC_sym(pw_4096)]
    pmulhrsw m15, m1, [PIC_sym(pw_4096)]
    packuswb m2, m15
    pand m2, m9
    pandn m15, m9, m5
    por m2, m15
%ifidn %2, v
    mova [dstq+strideq*0], m2 ; q0
%endif
    pmaddubsw m8, [PIC_sym(pb_m1_2)]
    pmaddubsw m11, [PIC_sym(pb_m1_2)]
    paddw m0, m8
    paddw m1, m11
    pmaddubsw m7, [PIC_sym(pb_m1_0)]
    pmaddubsw m10, [PIC_sym(pb_m1_0)]
    paddw m0, m7
    paddw m1, m10
    pmulhrsw m0, [PIC_sym(pw_4096)]
    pmulhrsw m1, [PIC_sym(pw_4096)]
    packuswb m0, m1
    pand m0, m9
    pandn m1, m9, m6
    por m0, m1
%if ARCH_X86_32
%xdefine m3 m8
%xdefine m4 m10
%xdefine m5 m11
%xdefine m6 m15
%endif
%ifidn %2, v
    mova [dstq+strideq*1], m0 ; q1
%else
%if ARCH_X86_64
    SWAP 3, 13
    SWAP 4, 14
%else
    mova m3, [esp+11*16]
    mova m4, [esp+ 8*16]
%endif
    SWAP 5, 2
    SWAP 6, 0
    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
%endif
%else ; if %1 == 4
%ifidn %2, v
    mova [tmpq+strideq*0], m3 ; p1
    mova [tmpq+strideq*1], m4 ; p0
    mova [tmpq+strideq*2], m5 ; q0
    mova [tmpq+stride3q ], m6 ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
%endif
%endif
%if ARCH_X86_32
    ; undo the m12 re-aliasing done inside this macro
%define m12 m12reg
%endif
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;          32-bit PIC helpers          ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%if ARCH_X86_32
%define PIC_base_offset $$

; SETUP_PIC: make r2 the PIC base register (loaded with the section start $$)
; and compute the stack offset at which XCHG_PIC_REG saves it.
%macro SETUP_PIC 0 ; PIC_reg
%define PIC_reg r2
%assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
    LEA PIC_reg, $$
%endmacro

; XCHG_PIC_REG sel: PIC_reg doubles as the mask pointer register.
;   sel == 0: save the PIC base to the stack and load the mask pointer (maskm)
;   sel != 0: reload the PIC base from the stack
; Both forms expand to plain mov instructions only, which leave the flags
; untouched; the callers rely on that between a test/sub and the branch.
%macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
%if %1 == 0
    mov [esp+PIC_reg_stk_offset], PIC_reg
    mov PIC_reg, maskm
%else
    mov PIC_reg, [esp+PIC_reg_stk_offset]
%endif
%endmacro

; address a constant relative to the PIC base held in PIC_reg
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)

%else
; x86-64: no PIC register juggling is needed, both helpers collapse to nothing
%macro XCHG_PIC_REG 1
%endmacro
%define PIC_sym(sym) (sym)
%endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%if ARCH_X86_32
; copy_args is 1 when the stack has to be realigned by the prologue
%if STACK_ALIGNMENT < required_stack_alignment
%assign copy_args 1
%else
%assign copy_args 0
%endif
%endif

; RELOC_ARGS name: only expanded in the x86-32 prologues below.
; With copy_args set, mask / l_stride / lut and the 7th argument (named by
; the macro parameter, w or h) get private slots at the top of the local
; stack area, and mask, lut and the 7th argument are copied there
; (l_stride is stored by the caller after scaling). Otherwise <name>m simply
; aliases r6m.
%macro RELOC_ARGS 1
%if copy_args
%define maskm [esp+stack_size-gprsize*1]
%define l_stridem [esp+stack_size-gprsize*2]
%define lutm [esp+stack_size-gprsize*3]
%define %1m [esp+stack_size-gprsize*4]
    mov r6d, r6m
    mov maskm, maskd
    mov lutm, lutd
    mov %1m, r6d
%else
%define %1m r6m
%endif
%endmacro

%if ARCH_X86_32
; x86-32 has too few GPRs for named temporaries: map them by hand.
; stride3q and l_stride3q share r6, so only one of them is valid at a time.
%define tmpq r4
%define mstrideq r5
%define stride3q r6
%define l_stride3q r6
%endif

; lpf_v_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, w)
; Filters edges in the v direction. Each loop iteration handles 16 bytes of
; dst and 16 bytes at lq, using the four mask bits currently selected by
; mask_bits / m12 (both shifted left by 4 per iteration); w is decremented
; by 4 per iteration and the loop runs while it stays positive.
; The widest filter whose mask word (vmask[2], [1], [0]) has any selected
; bit set is expanded for the whole 16-pixel group.
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w
    SETUP_PIC
%define m12 m5
%endif
    shl l_strideq, 2
    sub lq, l_strideq
%if ARCH_X86_64
    mov mstrideq, strideq
    neg mstrideq
    lea stride3q, [strideq*3]
%else
    mov l_stridem, l_strided
%endif
    mov mask_bitsd, 0xf
    mova m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG 0
    movu m0, [maskq]
    pxor m4, m4
    movd m3, [lutq+136]
    pshufb m3, m4 ; broadcast byte 0 of the loaded dword (used as minlvl)
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    por m1, m2
    por m0, m1
    ; mask0 = vmask[0]|[1]|[2], mask1 = vmask[1]|[2], mask2 = vmask[2]
    mova [rsp+11*16], m0
    mova [rsp+12*16], m1
    mova [rsp+13*16], m2
    mova [rsp+14*16], m3

%define maskmem [esp+15*16]
%define mask0 [rsp+11*16]
%define mask1 [rsp+12*16]
%define mask2 [rsp+13*16]
%define minlvl [rsp+14*16]

.loop:
    test [maskq+8], mask_bitsd ; vmask[2]
    je .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG 1
    mov [esp+25*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 16, v
    jmp .end

.no_flat16:
    test [maskq+4], mask_bitsd ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG 1
    mov [esp+25*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 8, v
    jmp .end

.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG 1 ; mov only: the flags from test are still valid for je
    je .no_filter

%if ARCH_X86_32
    mov [esp+25*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 4, v

.end:
%if ARCH_X86_32
    ; FILTER clobbers both on x86-32; restore them from their stack copies
    mova m12, maskmem
    mov mask_bitsd, [esp+25*16]
%endif
.no_filter:
    pslld m12, 4
    shl mask_bitsd, 4
    add lq, 16
    add dstq, 16
%if ARCH_X86_64
    sub wd, 4
%else
    sub dword wm, 4
%endif
    XCHG_PIC_REG 0 ; mov only: jg still sees the flags of the sub above
    jg .loop
    RET

; lpf_h_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, h)
; Filters edges in the h direction. Each loop iteration handles 16 rows of
; dst and advances lq by four (scaled) l_stride steps; h is decremented by 4
; per iteration. The FILTER h expansions for width 8 and 16 advance dstq
; while storing (width 4/6 go through TRANSPOSE_16x4_AND_WRITE_4x16, which is
; presumably doing the same - its body is not visible here); the no-filter
; path advances dstq by 16 rows explicitly.
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
%define m12 m5
%endif
    sub lq, 4
    shl l_strideq, 2
%if ARCH_X86_64
    lea stride3q, [strideq*3]
    lea l_stride3q, [l_strideq*3]
%else
    mov l_stridem, l_strided
%endif
    mov mask_bitsd, 0xf
    mova m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG 0
    movu m0, [maskq]
    pxor m4, m4
    movd m3, [lutq+136]
    pshufb m3, m4 ; broadcast byte 0 of the loaded dword (used as minlvl)
    pshufd m2, m0, q2222
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    por m1, m2
    por m0, m1
    ; mask0 = vmask[0]|[1]|[2], mask1 = vmask[1]|[2], mask2 = vmask[2]
    mova [rsp+22*16], m0
    mova [rsp+23*16], m1
    mova [rsp+24*16], m2
    mova [rsp+25*16], m3

%define maskmem [esp+37*16]
%define mask0 [rsp+22*16]
%define mask1 [rsp+23*16]
%define mask2 [rsp+24*16]
%define minlvl [rsp+25*16]

.loop:
    test [maskq+8], mask_bitsd ; vmask[2]
    je .no_flat16

%if ARCH_X86_32
    XCHG_PIC_REG 1
    mov [esp+38*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 16, h
    jmp .end

.no_flat16:
    test [maskq+4], mask_bitsd ; vmask[1]
    je .no_flat

%if ARCH_X86_32
    XCHG_PIC_REG 1
    mov [esp+38*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 8, h
    jmp .end

.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    XCHG_PIC_REG 1 ; mov only: the flags from test are still valid for je
    je .no_filter

%if ARCH_X86_32
    mov [esp+38*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 4, h
    jmp .end

.no_filter:
    ; nothing was filtered: skip the 16 rows by hand
    lea dstq, [dstq+strideq*8]
    lea dstq, [dstq+strideq*8]
%if ARCH_X86_32
    ; m12 / l_stride / mask_bits were not clobbered on this path
    jmp .end_noload
.end:
    mova m12, maskmem
    mov l_strideq, l_stridem
    mov mask_bitsd, [esp+38*16]
.end_noload:
%else
.end:
%endif
    lea lq, [lq+l_strideq*4]
    pslld m12, 4
    shl mask_bitsd, 4
%if ARCH_X86_64
    sub hd, 4
%else
    sub dword hm, 4
%endif
    XCHG_PIC_REG 0 ; mov only: jg still sees the flags of the sub above
    jg .loop
    RET

INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp, mask_bits
%else
cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \
                    dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS w
    SETUP_PIC
%define m12 m4
%endif
    shl l_strideq, 2
    sub lq, l_strideq
%if ARCH_X86_64
    mov mstrideq, strideq
    neg mstrideq
    lea stride3q, [strideq*3]
%else
    mov l_stridem, l_strided
%endif
    mov mask_bitsd, 0xf
    mova m12, [PIC_sym(pd_mask)]
    XCHG_PIC_REG 0
    movq m0, [maskq]
    pxor m3, m3
    movd m2, [lutq+136]
    pshufb m2, m3
    pshufd m1, m0, q1111
    pshufd m0, m0, q0000
    por m0, m1
    mova [rsp+0*16], m0
    mova [rsp+1*16], m1
    mova [rsp+2*16], m2

%define maskmem [esp+7*16]
%define mask0 [rsp+0*16]
%define mask1 [rsp+1*16]
%define minlvl [rsp+2*16]

.loop:
    test [maskq+4],
mask_bitsd ; vmask[1]
    je .no_flat
%if ARCH_X86_32
    ; XCHG_PIC_REG 1 reloads the PIC base into PIC_reg; the scalar bit window
    ; and m12 are parked on the stack across FILTER (presumably because FILTER
    ; clobbers them -- its body is not fully visible from here).
    XCHG_PIC_REG 1
    mov [esp+11*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 6, v
    jmp .end
.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    ; XCHG_PIC_REG expands to a plain mov (or to nothing on x86-64), so the
    ; flags produced by the test above are still valid for the je below.
    XCHG_PIC_REG 1
    je .no_filter
%if ARCH_X86_32
    mov [esp+11*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 4, v
.end:
%if ARCH_X86_32
    ; reload the state that was parked on the stack before FILTER
    mova m12, maskmem
    mov mask_bitsd, [esp+11*16]
%endif
.no_filter:
    ; move the 4-bit window (vector copy in m12, scalar copy in mask_bits)
    ; to the next group and advance l/dst by 16 bytes
    pslld m12, 4
    shl mask_bitsd, 4
    add lq, 16
    add dstq, 16
%if ARCH_X86_64
    sub wd, 4
%else
    sub dword wm, 4
%endif
    ; the flags of the sub above drive the jg; XCHG_PIC_REG 0 only emits movs
    XCHG_PIC_REG 0
    jg .loop
    RET

; lpf_h_sb_uv_8bpc (SSSE3)
; Walks the edge in steps of 4 (h is decremented by 4 per iteration). For each
; step the current 4-bit window of the mask words is tested: a hit in the dword
; at [maskq+4] selects FILTER 6, otherwise a hit in the dword at [maskq+0]
; selects FILTER 4, otherwise the step is skipped and dst is moved down 16 rows.
; NOTE(review): dst is only advanced explicitly on the skip path, so FILTER in
; its h form presumably advances dstq itself -- confirm against the macro.
INIT_XMM ssse3
%if ARCH_X86_64
cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, l_stride3, tmp, mask_bits
%else
cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
                          dst, stride, mask, l, l_stride, lut, mask_bits
    RELOC_ARGS h
    SETUP_PIC
    ; only 8 vector registers are requested on x86-32, so m12 is mapped onto m4
    %define m12 m4
%endif
    sub lq, 4
    shl l_strideq, 2
%if ARCH_X86_64
    lea stride3q, [strideq*3]
    lea l_stride3q, [l_strideq*3]
%else
    ; keep the scaled l_stride in its stack slot so it can be reloaded at .end
    mov l_stridem, l_strided
%endif
    mov mask_bitsd, 0xf
    mova m12, [PIC_sym(pd_mask)]
    ; from here on PIC_reg holds the mask pointer (see XCHG_PIC_REG, mode 0)
    XCHG_PIC_REG 0
    movq m0, [maskq] ; two mask dwords
    pxor m3, m3
    movd m2, [lutq+136]
    pshufb m2, m3 ; all-zero shuffle control: replicate byte 0 into every lane
    pshufd m1, m0, q1111 ; broadcast mask dword 1
    pshufd m0, m0, q0000 ; broadcast mask dword 0
    por m0, m1 ; dword 0 | dword 1
    mova [rsp+0*16], m0
    mova [rsp+1*16], m1
    mova [rsp+2*16], m2
    ; names for the stack slots filled above (maskmem is the x86-32 spill slot
    ; for m12)
    %define maskmem [esp+7*16]
    %define mask0 [rsp+0*16]
    %define mask1 [rsp+1*16]
    %define minlvl [rsp+2*16]
.loop:
    test [maskq+4], mask_bitsd ; vmask[1]
    je .no_flat
%if ARCH_X86_32
    XCHG_PIC_REG 1
    mov [esp+12*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 6, h
    jmp .end
.no_flat:
    test [maskq+0], mask_bitsd ; vmask[0]
    ; flags from the test survive XCHG_PIC_REG (mov only / empty)
    XCHG_PIC_REG 1
    je .no_filter
%if ARCH_X86_32
    mov [esp+12*16], mask_bitsd
    mova maskmem, m12
%endif
    FILTER 4, h
    jmp .end
.no_filter:
    ; nothing to filter for this step: skip 2 * 8 rows of dst
    lea dstq, [dstq+strideq*8]
    lea dstq, [dstq+strideq*8]
%if ARCH_X86_32
    ; nothing was spilled on this path, so bypass the reloads
    jmp .end_noload
.end:
    mova m12, maskmem
    mov l_strided, l_stridem
    mov mask_bitsd, [esp+12*16]
.end_noload:
%else
.end:
%endif
    lea lq, [lq+l_strideq*4]
    ; advance the 4-bit window in both the vector and the scalar copy
    pslld m12, 4
    shl mask_bitsd, 4
%if ARCH_X86_64
    sub hd, 4
%else
    sub dword hm, 4
%endif
    ; jg consumes the flags of the sub above
    XCHG_PIC_REG 0
    jg .loop
    RET
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/looprestoration.h000066400000000000000000000070651517466257200247040ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/looprestoration.h" #include "common/intops.h" #define decl_wiener_filter_fns(ext) \ decl_lr_filter_fn(BF(dav2d_wiener_filter7, ext)); \ decl_lr_filter_fn(BF(dav2d_wiener_filter5, ext)) #define decl_sgr_filter_fns(ext) \ decl_lr_filter_fn(BF(dav2d_sgr_filter_5x5, ext)); \ decl_lr_filter_fn(BF(dav2d_sgr_filter_3x3, ext)); \ decl_lr_filter_fn(BF(dav2d_sgr_filter_mix, ext)) decl_wiener_filter_fns(sse2); decl_wiener_filter_fns(ssse3); decl_wiener_filter_fns(avx2); decl_wiener_filter_fns(avx512icl); decl_sgr_filter_fns(ssse3); decl_sgr_filter_fns(avx2); decl_sgr_filter_fns(avx512icl); static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav2dLoopRestorationDSPContext *const c, const int bpc) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_SSE2)) return; #if BITDEPTH == 8 c->wiener[0] = BF(dav2d_wiener_filter7, sse2); c->wiener[1] = BF(dav2d_wiener_filter5, sse2); #endif if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return; c->wiener[0] = BF(dav2d_wiener_filter7, ssse3); c->wiener[1] = BF(dav2d_wiener_filter5, ssse3); if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav2d_sgr_filter_5x5, ssse3); c->sgr[1] = BF(dav2d_sgr_filter_3x3, ssse3); c->sgr[2] = BF(dav2d_sgr_filter_mix, ssse3); } #if ARCH_X86_64 if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->wiener[0] = BF(dav2d_wiener_filter7, avx2); c->wiener[1] = BF(dav2d_wiener_filter5, avx2); if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav2d_sgr_filter_5x5, avx2); c->sgr[1] = BF(dav2d_sgr_filter_3x3, avx2); c->sgr[2] = BF(dav2d_sgr_filter_mix, avx2); } if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; c->wiener[0] = BF(dav2d_wiener_filter7, avx512icl); #if BITDEPTH == 8 /* With VNNI we don't need a 5-tap version. 
*/ c->wiener[1] = c->wiener[0]; #else c->wiener[1] = BF(dav2d_wiener_filter5, avx512icl); #endif if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav2d_sgr_filter_5x5, avx512icl); c->sgr[1] = BF(dav2d_sgr_filter_3x3, avx512icl); c->sgr[2] = BF(dav2d_sgr_filter_mix, avx512icl); } #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc.h000066400000000000000000000070221517466257200220310ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/mc.h" #define decl_fn(type, name) \ decl_##type##_fn(BF(name, ssse3)); \ decl_##type##_fn(BF(name, avx2)); \ decl_##type##_fn(BF(name, avx512icl)); #define init_mc_fn(type, name, suffix) \ c->mc[type] = BF(dav2d_put_##name, suffix) #define init_mct_fn(type, name, suffix) \ c->mct[type] = BF(dav2d_prep_##name, suffix) #define init_mc_scaled_fn(type, name, suffix) \ c->mc_scaled[type] = BF(dav2d_put_##name, suffix) #define init_mct_scaled_fn(type, name, suffix) \ c->mct_scaled[type] = BF(dav2d_prep_##name, suffix) decl_8tap_fns(ssse3); decl_8tap_fns(avx2); decl_8tap_fns(avx512icl); decl_fn(mc, dav2d_put_bilin); decl_fn(mct, dav2d_prep_bilin); decl_fn(avg, dav2d_avg); decl_fn(w_avg, dav2d_w_avg); decl_fn(mask, dav2d_mask); decl_fn(w_mask, dav2d_w_mask_420); decl_fn(w_mask, dav2d_w_mask_422); decl_fn(w_mask, dav2d_w_mask_444); decl_fn(blend, dav2d_blend); decl_fn(warp8x8, dav2d_warp_affine_8x8); decl_fn(warp8x8t, dav2d_warp_affine_8x8t); decl_fn(emu_edge, dav2d_emu_edge); decl_fn(morph, dav2d_morph); decl_fn(sad_refine_mv, dav2d_sad_refine_mv); decl_fn(opfl_derive_mv, dav2d_opfl_derive_mv); decl_fn(sad8x8, dav2d_sad8x8); static ALWAYS_INLINE void mc_dsp_init_x86(Dav2dMCDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; init_8tap_fns(avx2); init_mc_fn(DAV2D_FILTER_BILINEAR, bilin, avx2); init_mct_fn(DAV2D_FILTER_BILINEAR, bilin, avx2); c->avg = BF(dav2d_avg, avx2); c->w_avg = BF(dav2d_w_avg, avx2); c->mask = BF(dav2d_mask, avx2); c->w_mask[0] = BF(dav2d_w_mask_444, avx2); c->w_mask[1] = BF(dav2d_w_mask_422, avx2); c->w_mask[2] = BF(dav2d_w_mask_420, avx2); c->blend = BF(dav2d_blend, avx2); c->warp8x8 = BF(dav2d_warp_affine_8x8, avx2); c->warp8x8t = BF(dav2d_warp_affine_8x8t, avx2); c->emu_edge = BF(dav2d_emu_edge, avx2); #if BITDEPTH == 8 c->morph = BF(dav2d_morph, avx2); c->sad_refine_mv = BF(dav2d_sad_refine_mv, avx2); c->opfl_derive_mv = BF(dav2d_opfl_derive_mv, 
avx2); c->sad8x8 = BF(dav2d_sad8x8, avx2); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc16_avx2.asm000066400000000000000000006315771517466257200235130ustar00rootroot00000000000000; Copyright © 2021-2026, VideoLAN and dav2d authors ; Copyright © 2021-2026, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Read-only constants shared by the 16 bpc AVX2 motion-compensation routines.
SECTION_RODATA 64

; byte / dword index patterns used as shuffle and permute controls
deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

; Two-entry tables: the bilinear put/prep code in this file selects the entry
; with (bitdepth_max >> 11) scaled by the entry size (presumably 0 for 10-bit
; and 1 for 12-bit input -- confirm the bitdepth_max values with the callers).
prep_mul: dw 16, 16, 4, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
put_8tap_h_rnd: dd 34, 40
s_8tap_h_rnd: dd 2, 8
s_8tap_h_sh: dd 2, 4
put_s_8tap_v_rnd: dd 512, 128
put_s_8tap_v_sh: dd 10, 8
prep_8tap_1d_rnd: dd 8 - (8192 << 4)
prep_8tap_2d_rnd: dd 32 - (8192 << 5)
warp8x8t_rnd: dd 16384 - (8192 << 15)
warp8x8_shift: dd 5, 3
warp8x8_rnd: dw 4096, 4096, 16384, 16384
bidir_rnd: dw -16400, -16400, -16388, -16388
bidir_mul: dw 2048, 2048, 8192, 8192

; Aliases that reuse the leading element(s) of the tables above instead of
; emitting duplicate constants: prep_mul starts with the word pair 16, 16 and
; put_s_8tap_v_rnd starts with the dword 512.
%define pw_16 prep_mul
%define pd_512 put_s_8tap_v_rnd

; broadcastable scalar constants (pw_* = word pairs, pd_* = dwords)
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pw_2048: times 2 dw 2048
pw_8192: times 2 dw 8192
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
pw_m512: times 2 dw -512
pd_32: dd 32
pd_63: dd 63
pd_64: dd 64
pd_32768: dd 32768
pd_65538: dd 65538
pd_m524256: dd -524256 ; -8192 << 6 + 32
pd_0x3ff: dd 0x3ff
pq_0x40000000: dq 0x40000000
               dd 0

; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
; Emits one dword per listed width holding the offset of <fn>_16bpc_<isa>.wN
; from the table symbol <fn>_<isa>_table. That symbol is defined as
; (%%table - 2*w0), i.e. it is biased so that the first width does not land on
; index 0 (the users of these tables are outside this part of the file).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1 ; bring the next width into %3
    %endrep
%endmacro

BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64
BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32, 64

; BASE_JMP_TABLE name, isa, w0, w1, ...
; Emits one word per width holding <name>_<isa>_wN - <name>_<isa>. The table
; symbol is biased by -w0; put_bilin in this file indexes it with
; tzcnt(w)*2 plus table_offset(put,).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; put_avx2 / prep_avx2 name the .put / .prep labels inside the bilinear
; functions; all word-sized jump-table entries for put/prep are relative to them.
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep)

BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64
BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64

; HV_JMP_TABLE fn, kind, isa, types, w0, w1, ...
; `types` is a bit mask choosing which tables are emitted: bit 0 -> .h_wN,
; bit 1 -> .v_wN, bit 2 -> .hv_wN. Each %rep rotates past (%0 - 4) widths, so
; the following %rotate 4 completes the cycle and restores the original
; argument order for the next table.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64
HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64

; SCALED_JMP_TABLE fn, isa, w0, w1, ...
; Emits three consecutive word tables for the scaled functions, targeting the
; .wN, .dy1_wN and .dy2_wN labels respectively. %rotate 2 after each %rep
; restores the argument order (the %rep itself rotated %0 - 2 times).
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64
SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64

; Distance from the <type>_avx2 base label to the <type><fn>_avx2_table symbol,
; usable as a displacement once the base label is held in a register.
%define table_offset(type, fn) type %+ fn %+ _avx2_table - type %+ _avx2

cextern mc_subpel_filters
; biased by -8 bytes relative to the external table
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter

SECTION .text

INIT_XMM avx2
cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h,
mxy mov mxyd, r6m ; mx lea r7, [put_avx2] %if UNIX64 DECLARE_REG_TMP 8 %define org_w r8d mov r8d, wd %else DECLARE_REG_TMP 7 %define org_w wm %endif tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET INIT_YMM avx2 .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] movu [dstq+dsq*0+32*0], m0 movu [dstq+dsq*0+32*1], m1 movu [dstq+dsq*1+32*0], m2 movu [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w64 RET .h: movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add wq, r7 shr r6d, 11 vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] jmp wq .h_w2: movq xm1, [srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xm0, xm4, xm1 psrlq xm1, 16 
pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*0+2] movhps xm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xm0, xm4 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*0+2] vinserti128 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movu [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+32*0] pmullw m1, m5, [srcq+32*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+32*1] pmullw m2, m5, [srcq+32*1+2] add srcq, ssq paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 movu [dstq+32*0], m0 movu [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: movifnidn t0d, org_w .h_w64_loop0: mov r6d, t0d .h_w64_loop: pmullw m0, m4, [srcq+r6*2-32*1] pmullw m1, m5, [srcq+r6*2-32*1+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r6*2-32*2] pmullw m2, m5, [srcq+r6*2-32*2+2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2-32*1], m0 mova [dstq+r6*2-32*2], m1 sub r6d, 32 jg .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64_loop0 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] shl mxyd, 11 movd xm5, mxyd add wq, r7 vpbroadcastw m5, xm5 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] 
punpckldq xm2, xm0, xm1 movd xm0, [srcq+ssq*0] punpckldq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm0, [srcq+ssq*0] .v_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xm2, xm0, xm1 movq xm0, [srcq+ssq*0] punpcklqdq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m0, m1, 0xf0 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m1, m0, 0xf0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movu [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] .v_w32_loop: movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m5 paddw m4, m0 movu m0, [srcq+ssq*0+32*0] movu [dstq+dsq*0+32*0], m4 psubw m4, m3, m1 pmulhrsw m4, m5 paddw m4, m1 movu m1, [srcq+ssq*0+32*1] movu [dstq+dsq*0+32*1], m4 psubw m4, m0, m2 pmulhrsw m4, m5 paddw m4, m2 movu [dstq+dsq*1+32*0], m4 psubw m4, m1, m3 pmulhrsw m4, m5 paddw m4, m3 movu [dstq+dsq*1+32*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w16: .v_w64: movifnidn t0d, org_w add t0d, t0d mov r4, srcq lea r6d, [hq+t0*8-256] mov r7, dstq .v_w16_loop0: movu m0, [srcq+ssq*0] .v_w16_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 movu [dstq+dsq*0], m1 movu [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 vpbroadcastd m3, 
[pw_2] movd xm6, mxyd vpbroadcastd m7, [pw_8192] add wq, r7 vpbroadcastw m6, xm6 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m7, [pw_2048] .hv_12bpc: jmp wq .hv_w2: vpbroadcastq xm1, [srcq+ssq*0] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w2_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm2, [srcq+ssq*0] pmullw xm1, xm4, xm2 psrlq xm2, 16 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 _ 2 _ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xm0, xm4, [srcq+ssq*0-8] pmullw xm1, xm5, [srcq+ssq*0-6] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w4_loop: movq xm1, [srcq+ssq*1] movq xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] movhps xm2, [srcq+ssq*0+2] pmullw xm1, xm4 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 2 shufpd xm2, xm0, xm1, 0x01 ; 0 1 mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+ssq*0] pmullw xm1, xm5, [srcq+ssq*0+2] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 vinserti128 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movu [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: %if UNIX64 lea r6d, [r8*2-32] %else mov r6d, wm lea r6d, [r6*2-32] %endif mov r4, srcq lea 
r6d, [hq+r6*8] mov r7, dstq .hv_w16_loop0: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 movu [dstq+dsq*0], m2 pmullw m0, m4, [srcq+ssq*0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 movu [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 RET cglobal prep_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [prep_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r7+wq*2+table_offset(prep,)] mov r6d, r8m ; bitdepth_max vpbroadcastd m5, [pw_8192] add wq, r7 shr r6d, 11 vpbroadcastd m4, [r7-prep_avx2+prep_mul+r6*4] lea r6, [ssq*3] lea r7, [dsq*3] jmp wq .prep_w4: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*2] movhps xm1, [srcq+r6 ] lea srcq, [srcq+ssq*4] pmullw xm0, xm4 pmullw xm1, xm4 psubw xm0, xm5 psubw xm1, xm5 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*2], xm0 movq [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w4 RET .prep_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*2] vinserti128 m1, [srcq+r6 ], 1 lea srcq, [srcq+ssq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*2], m0, 1 mova [dstq+dsq*4], xm1 vextracti128 [dstq+r7 *2], m1, 1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w8 RET .prep_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m4, [srcq+ssq*1] pmullw m2, m4, [srcq+ssq*2] pmullw m3, m4, [srcq+r6 ] lea srcq, [srcq+ssq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova 
[dstq+dsq*0], m0 mova [dstq+dsq*2], m1 mova [dstq+dsq*4], m2 mova [dstq+r7 *2], m3 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+ssq*0+32*0] pmullw m1, m4, [srcq+ssq*0+32*1] pmullw m2, m4, [srcq+ssq*1+32*0] pmullw m3, m4, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*2+32*0], m2 mova [dstq+dsq*2+32*1], m3 lea dstq, [dstq+dsq*4] sub hd, 2 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] add srcq, ssq REPX {psubw x, m5}, m0, m1, m2, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 lea dstq, [dstq+dsq*2] dec hd jg .prep_w64 RET .h: movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] psubw m4, m5 test dword r8m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(prep, _bilin_h)] add wq, r7 jmp wq .h_w4: lea r6, [ssq*3] lea r7, [dsq*3] .h_w4_loop: movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*2], 1 movu xm2, [srcq+ssq*1] vinserti128 m2, [srcq+r6 ], 1 lea srcq, [srcq+ssq*4] punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m0, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*2], xm0 movq [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*0+2] vinserti128 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*2], m0, 1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, 
[srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [dstq+dsq*0], m0 mova [dstq+dsq*2], m1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w16 RET .h_w32: xor r7d, r7d jmp .h_w32_loop0 .h_w64: mov r7, -32*2 sub srcq, r7 sub dstq, r7 .h_w32_loop0: mov r6, r7 .h_w32_loop: pmullw m0, m4, [srcq+r6+32*0] pmullw m1, m5, [srcq+r6+32*0+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r6+32*1] pmullw m2, m5, [srcq+r6+32*1+2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [dstq+r6+32*0], m0 mova [dstq+r6+32*1], m1 add r6, 32*2 jle .h_w32_loop add srcq, ssq lea dstq, [dstq+dsq*2] dec hd jg .h_w32_loop0 RET .v: movzx wd, word [r7+wq*2+table_offset(prep, _bilin_v)] movd xm5, mxyd vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] add wq, r7 lea r6, [ssq*3] psubw m4, m5 test dword r8m, 0x800 jnz .v_12bpc psllw m4, 2 psllw m5, 2 .v_12bpc: lea r7, [dsq*3] jmp wq .v_w4: movq xm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastq m2, [srcq+ssq*2] vpbroadcastq xm1, [srcq+ssq*1] vpblendd m2, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+r6 ] lea srcq, [srcq+ssq*4] vpblendd m1, m0, 0xf0 ; 1 1 3 3 vpbroadcastq m0, [srcq+ssq*0] vpblendd m1, m2, 0x33 ; 0 1 2 3 vpblendd m0, m2, 0x0c ; 4 2 4 4 punpckhqdq m2, m1, m0 ; 1 2 3 4 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 vextracti128 xm2, m1, 1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*2], xm1 movq [dstq+dsq*4], xm2 movhps [dstq+r7 *2], xm2 lea dstq, [dstq+dsq*8] sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m1, m0, m2, 0xf0 ; 0 1 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m2, m0, 0xf0 ; 1 2 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*2], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .v_w8_loop RET .v_w16: movu m0, [srcq+ssq*0] .v_w16_loop: movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw m0, m4 
pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+ssq*0] psraw m1, 2 pmullw m2, m4 mova [dstq+dsq*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [dstq+dsq*2], m1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .v_w16_loop RET .v_w64: lea r6d, [hq+256*3] jmp .v_w32_loop0 .v_w32: lea r6d, [hq+256*1] .v_w32_loop0: movu m0, [srcq+ssq*0] mov r4, srcq mov r7, dstq .v_w32_loop: movu m2, [r4+ssq*1] lea r4, [r4+ssq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [r4+ssq*0] psraw m1, 2 pmullw m2, m4 mova [r7+dsq*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [r7+dsq*2], m1 lea r7, [r7+dsq*4] sub hd, 2 jg .v_w32_loop add srcq, 32 add dstq, 32 movzx hd, r6b sub r6d, 1<<8 jg .v_w32_loop0 RET .hv: WIN64_SPILL_XMM 7 movzx wd, word [r7+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd add wq, r7 vpbroadcastw m6, xm6 jmp wq .hv_w4: movu xm1, [srcq+ssq*0] %if WIN64 movaps [rsp+32], xmm7 %endif pmullw xm0, xm4, xm1 psrldq xm1, 2 pmullw xm1, xm5 psubw xm0, xm3 lea r6, [ssq*3] paddw xm0, xm1 lea r7, [dsq*3] psraw xm0, 2 vpbroadcastq m0, xm0 .hv_w4_loop: movu xm1, [srcq+ssq*1] vinserti128 m1, [srcq+r6 ], 1 movu xm2, [srcq+ssq*2] lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*0], 1 punpcklqdq m7, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m7, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m7, m3 paddw m1, m7 psraw m1, 2 ; 1 2 3 4 vpblendd m0, m1, 0x3f vpermq m2, m0, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 vextracti128 xm2, m1, 1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*2], xm1 movq [dstq+dsq*4], xm2 movhps [dstq+r7 *2], xm2 lea dstq, [dstq+dsq*8] sub hd, 4 jg .hv_w4_loop %if WIN64 movaps xmm7, [rsp+32] %endif RET .hv_w8: pmullw xm0, xm4, [srcq+ssq*0] pmullw xm1, xm5, [srcq+ssq*0+2] psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 vinserti128 m2, 
[srcq+ssq*0+2], 1
    pmullw m1, m4
    pmullw m2, m5
    psubw m1, m3
    paddw m1, m2
    psraw m1, 2 ; 1 2
    vperm2i128 m2, m0, m1, 0x21 ; 0 1
    mova m0, m1
    psubw m1, m2
    pmulhrsw m1, m6
    paddw m1, m2
    mova [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*2], m1, 1
    lea dstq, [dstq+dsq*4]
    sub hd, 2
    jg .hv_w8_loop
    RET
; Wide variants share one loop that walks 32-byte-wide column strips.
; r6d = h + 256*(number of additional strips): the low byte restores h for
; each strip (movzx hd, r6b), the upper bits count strips (sub r6d, 1<<8).
; NOTE(review): this packing assumes h fits in 8 bits - confirm with callers.
.hv_w64:
    lea r6d, [hq+256*3]
    jmp .hv_w16_loop0
.hv_w32:
    lea r6d, [hq+256*1]
    jmp .hv_w16_loop0
.hv_w16:
    xor r6d, r6d
.hv_w16_loop0:
    pmullw m0, m4, [srcq]
    pmullw m1, m5, [srcq+2]
    mov r4, srcq
    psubw m0, m3
    mov r7, dstq
    paddw m0, m1
    psraw m0, 2
.hv_w16_loop:
    pmullw m1, m4, [r4+ssq*1]
    pmullw m2, m5, [r4+ssq*1+2]
    lea r4, [r4+ssq*2]
    psubw m1, m3
    paddw m1, m2
    psraw m1, 2
    psubw m2, m1, m0
    pmulhrsw m2, m6
    paddw m2, m0
    mova [r7+dsq*0], m2
    pmullw m0, m4, [r4+ssq*0]
    pmullw m2, m5, [r4+ssq*0+2]
    psubw m0, m3
    paddw m0, m2
    psraw m0, 2
    psubw m2, m0, m1
    pmulhrsw m2, m6
    paddw m2, m1
    mova [r7+dsq*2], m2
    lea r7, [r7+dsq*4]
    sub hd, 2
    jg .hv_w16_loop
    add srcq, 32
    add dstq, 32
    movzx hd, r6b
    sub r6d, 1<<8
    jg .hv_w16_loop0
    RET

; Filter selectors. Each value packs two row offsets (index*15) into the
; subpel_filters table: one in bits 16 and up, one in the low bits. They are
; added to mx*0x010101 / my*0x010101 in the functions below, where the three
; resulting fields are annotated as "6tap/8tap index, raw mx/my, 4tap index".
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15

; FN declares the entry point <prefix>_<name>_16bpc, loads FILTER_<type> into
; t0d and, when a 4th argument is given, jumps to <jmp_to>_avx2. With only
; three arguments no jump is emitted.
; NOTE(review): the 3-argument form presumably relies on falling through into
; the cglobal that directly follows the invocation - confirm against x86inc.
%macro FN 3-4 ; prefix, name, type, jmp_to
cglobal %1_%2_16bpc
    mov t0d, FILTER_%3
%if %0 == 4 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%4_avx2)
%endif
%endmacro

; t0 is declared from register number 5 on WIN64 and 8 elsewhere.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR

; put_6tap_16bpc: shared body behind the put_8tap smooth/regular stubs above,
; which preload t0d with the filter selector.
; Named arguments: dst, ds, src, ss, w, h, mx, my. r8m is additionally read:
; it is broadcast as the upper clamp for pminsw, r8m >> 11 indexes the
; rounding table, and bit 11 (0x800) selects the non-10-bit coefficient
; scaling (see .hv_10bit).
; Dispatch on bits 8-11 of mxd/myd, which hold the raw mx/my after the
; multiply by 0x010101 (NOTE(review): assumes mx/my < 16 so the added selector
; cannot carry into them - confirm):
;   mx != 0 -> .h (which continues to .hv when my != 0)
;   my != 0 -> .v
;   neither -> .put
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    imul mxd, mxm, 0x010101
    imul myd, mym, 0x010101
    mov wd, wm
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    add myd, t0d ; 6tap_v, my, 4tap_v
    movifnidn hd, hm
    lea r8, [put_avx2]
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
; No sub-pixel offset: tail-jump through the 16-bit offset table at
; put_avx2+table_offset(put,), indexed by tzcnt(w).
.put:
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
%if WIN64
    pop r8
%endif
    jmp wq
; Horizontal, w == 2 (reached from .h with m4 = rounding, m5 = clamp).
; Loads 4 coefficient bytes starting at byte 2 of the filter row selected by
; the low byte of mxd and sign-extends them to words.
.h_w2:
    movzx mxd, mxb
    sub srcq, 2
    mova xm2, [subpel_h_shuf2]
    vpbroadcastd xm3, [base+subpel_filters+mxq*8+2]
    pmovsxbw xm3, xm3
.h_w2_loop:
    movu xm0, [srcq+ssq*0]
    movu xm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm0, xm2
    pshufb xm1, xm2
    pmaddwd xm0, xm3
    pmaddwd xm1, xm3
    phaddd xm0, xm1
    paddd xm0, xm4
    psrad xm0, 6
    packusdw xm0, xm0
    pminsw xm0, xm5
    movd [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
; Horizontal, w == 4. The whole 8-byte row is sign-extended, then
; pshufd q2211 keeps word pairs (2,3) and (4,5): m2 = pair (2,3) broadcast,
; m3 = pair (4,5) broadcast. Two rows are processed per iteration.
.h_w4:
    movzx mxd, mxb
    sub srcq, 2
    pmovsxbw xm3, [base+subpel_filters+mxq*8]
    WIN64_SPILL_XMM 8
    vbroadcasti128 m6, [subpel_h_shufA]
    vbroadcasti128 m7, [subpel_h_shufB]
    pshufd xm3, xm3, q2211
    vpbroadcastq m2, xm3
    vpermq m3, m3, q1111
.h_w4_loop:
    movu xm1, [srcq+ssq*0]
    vinserti128 m1, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4
    pshufb m1, m7 ; 2 3 3 4 4 5 5 6
    pmaddwd m0, m2
    pmaddwd m1, m3
    paddd m0, m4
    paddd m0, m1
    psrad m0, 6
    vextracti128 xm1, m0, 1
    packusdw xm0, xm1
    pminsw xm0, xm5
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
; Horizontal-only setup (falls to .hv when my also has a fraction).
; m5 = r8m broadcast (clamp), m4 = put_8tap_h_rnd[r8m >> 11].
; For w >= 8 the coefficients are read from byte 1 of the row chosen by
; mxd >> 16; m7/m8/m9 hold the sign-extended word pairs 0, 1 and 2.
.h:
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    vpbroadcastw m5, r8m
    shr r7d, 11
    vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
    cmp wd, 4
    je .h_w4
    jl .h_w2
    WIN64_SPILL_XMM 11
    shr mxd, 16
    sub srcq, 4
    vpbroadcastq m0, [base+subpel_filters+1+mxq*8]
    vbroadcasti128 m6, [base+subpel_h_shufA]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m7, m0, q0000
    pshufd m8, m0, q1111
    pshufd m9, m0, q2222
    sub wd, 16
    jge .h_w16
.h_w8:
; PUT_6TAP_H: 6-tap horizontal filter over three overlapping source vectors.
; The sum gets the m4 rounding term, is shifted right by 6, packed with
; unsigned saturation and clamped to m5; the result is left in m%1.
%macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%1, m6 ; 01 12 23 34
    pshufb m%2, m6 ; 45 56 67 78
    pmaddwd m%4, m7, m%1 ; a0
    pshufb m%3, m6 ; 89 9a ab bc
    pmaddwd m%5, m9, m%2 ; a2
    shufpd m%1, m%2, 0x05 ; 23 34 45 56
    paddd m%4, m%5 ; a0+a2
    pmaddwd m%5, m7, m%2 ; b0
    shufpd m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd m%3, m9 ; b2
    pmaddwd m%1, m8 ; a1
    pmaddwd m%2, m8 ; b1
    paddd m%3, m%5 ; b0+b2
    paddd m%4, m4
    paddd m%3, m4
    paddd m%1, m%4
    paddd m%2, m%3
    psrad m%1, 6
    psrad m%2, 6
    packusdw m%1, m%2
    pminsw m%1, m5
%endmacro
    movu xm0, [srcq+ssq*0+ 0]
    vinserti128 m0, [srcq+ssq*1+ 0], 1
    movu xm2, [srcq+ssq*0+16]
    vinserti128 m2, [srcq+ssq*1+16], 1
    shufpd m1, m0, m2, 0x05
    lea srcq, [srcq+ssq*2]
    PUT_6TAP_H 0, 1, 2, 3, 10
    movu [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
; w >= 16: wd holds w-16 here. r6 counts 16-bit elements (note the *2 scale)
; and walks each row from the rightmost 16-element group down to offset 0.
.h_w16:
    mov r6d, wd
.h_w16_loop:
    movu m0, [srcq+r6*2+ 0]
    movu m1, [srcq+r6*2+ 8]
    movu m2, [srcq+r6*2+16]
    PUT_6TAP_H 0, 1, 2, 3, 10
    movu [dstq+r6*2], m0
    sub r6d, 16
    jge .h_w16_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w16
    RET
; Vertical-only. The filter row is myd >> 16, replaced by the low-byte index
; when h < 6 (cmovs after cmp hd, 6). r6 = -ss so the two rows above srcq can
; be addressed. m5 = pd_32 rounding (used by the w2/w4 paths), m6 = clamp,
; m7/m8/m9 = coefficient word pairs 0, 1 and 2.
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [base+subpel_filters+1+myq*8]
    WIN64_SPILL_XMM 10, 12
    vpbroadcastd m5, [pd_32]
    vpbroadcastw m6, r8m
    punpcklbw m0, m0
    mov r6, ssq
    psraw m0, 8 ; sign-extend
    neg r6
    pshufd m7, m0, q0000
    pshufd m8, m0, q1111
    pshufd m9, m0, q2222
    cmp wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd xm2, [srcq+r6 *2]
    pinsrd xm2, [srcq+r6 *1], 1
    pinsrd xm2, [srcq+ssq*0], 2
    pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movd xm0, [srcq+ssq*0]
    palignr xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklwd xm1, xm2, xm3 ; 01 12
    punpckhwd xm2, xm3 ; 23 34
.v_w2_loop:
    movd xm3, [srcq+ssq*1]
    pmaddwd xm4, xm7, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm8 ; a1 b1
    lea srcq, [srcq+ssq*2]
    paddd xm4, xm2
    punpckldq xm2, xm0, xm3 ; 4 5
    movd xm0, [srcq+ssq*0]
    punpckldq xm3, xm0 ; 5 6
    punpcklwd xm2, xm3 ; 45 56
    pmaddwd xm3, xm9, xm2 ; a2 b2
    paddd xm4, xm5
    paddd xm4, xm3
    psrad xm4, 6
    packusdw xm4, xm4
    pminsw xm4, xm6
    movd [dstq+dsq*0], xm4
    pextrd [dstq+dsq*1], xm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xm1, [srcq+r6 *2]
    vpbroadcastq m3, [srcq+r6 *1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m3, 0x30
    vpblendd m3, m2, 0x30
    punpcklwd m1, m3 ; 01 12
    vpblendd m2, m4, 0x30
    vpblendd m4, m0, 0x30
    punpcklwd m2, m4 ; 23 34
.v_w4_loop:
    vpbroadcastq m3, [srcq+ssq*1]
    pmaddwd m4, m7, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m8 ; a1 b1
    lea srcq, [srcq+ssq*2]
    paddd m4, m2
    vpblendd m2, m0, m3, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m3, m0, 0x30
    punpcklwd m2, m3 ; 45 56
    pmaddwd m3, m9, m2 ; a2 b2
    paddd m4, m5
    paddd m4, m3
    psrad m4, 6
    vextracti128 xm3, m4, 1
    packusdw xm4, xm3
    pminsw xm4, xm6
    movq [dstq+dsq*0], xm4
    movhps [dstq+dsq*1], xm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
; w >= 8: processed in 16-byte-wide column strips, two rows per iteration.
; wd = (w << 5) - 256 + h: the low byte restores h for every strip, the upper
; bits count the remaining strips (NOTE(review): assumes h < 256).
; m5 is reused as a temporary here, so rounding is done differently: the sum
; is shifted right by 5, packed, then averaged with zero via pavgw, which adds
; 1 and shifts right once more.
.v_w8:
    shl wd, 5
    WIN64_PUSH_XMM 12
    lea wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m3, [srcq+r6 *2]
    vbroadcasti128 m4, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vbroadcasti128 m1, [srcq+ssq*1]
    mov r8, dstq
    vbroadcasti128 m2, [r7+ssq*0]
    shufpd m3, m0, 0x0c
    shufpd m4, m1, 0x0c
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    shufpd m0, m2, 0x0c
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.v_w8_loop:
    vbroadcasti128 m5, [r7+ssq*1]
    pmaddwd m10, m7, m1 ; a0
    lea r7, [r7+ssq*2]
    pmaddwd m11, m7, m2 ; b0
    mova m1, m3
    pmaddwd m3, m8 ; a1
    mova m2, m4
    pmaddwd m4, m8 ; b1
    paddd m10, m3
    vbroadcasti128 m3, [r7+ssq*0]
    paddd m11, m4
    shufpd m4, m0, m5, 0x0d
    shufpd m0, m5, m3, 0x0c
    punpcklwd m3, m4, m0 ; 45
    punpckhwd m4, m0 ; 56
    pmaddwd m5, m9, m3 ; a2
    paddd m10, m5
    pmaddwd m5, m9, m4 ; b2
    paddd m5, m11
    psrad m10, 5
    psrad m5, 5
    packusdw m10, m5
    pxor m5, m5
    pavgw m5, m10
    pminsw m5, m6
    vpermq m5, m5, q3120
    movu [r8+dsq*0], xm5
    vextracti128 [r8+dsq*1], m5, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 16
    add dstq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w8_loop0
    RET
; Combined horizontal + vertical filtering. m10 = pd_512 rounding (both
; passes shift right by 10), m11 = clamp.
; For w <= 4 the horizontal coefficients are 4 bytes from byte 2 of the row,
; placed in the high byte of each word (punpcklbw with zero); the vertical
; ones are sign-extended words. When bit 11 of r8m is set the horizontal set
; is shifted right by 2 and the vertical set left by 2.
.hv:
    WIN64_SPILL_XMM 12, 16
    vpbroadcastd m10, [pd_512]
    vpbroadcastw m11, r8m
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    neg r6
    pxor m6, m6
    punpcklbw m6, m0
    punpcklbw m1, m1
    psraw m1, 8 ; sign-extend
    test dword r8m, 0x800
    jz .hv_10bit
    psraw m6, 2
    psllw m1, 2
.hv_10bit:
    pshufd m7, m1, q0000
    pshufd m8, m1, q1111
    pshufd m9, m1, q2222
    cmp wd, 4
    je .hv_w4
    vbroadcasti128 m5, [subpel_h_shuf2]
    vbroadcasti128 m0, [srcq+ssq*0]
    vinserti128 m2, m0, [srcq+r6*2], 1 ; 2 0
    movu xm1, [srcq+ssq*1]
    vinserti128 m1, [srcq+r6 *1], 1 ; 3 1
    lea srcq, [srcq+ssq*2]
    vinserti128 m0, [srcq+ssq*0], 0 ; 4 2
    REPX {pshufb x, m5}, m2, m1, m0
    REPX {pmaddwd x, m6}, m2, m1, m0
    phaddd m2, m1
    phaddd m1, m0
    paddd m2, m10
    paddd m1, m10
    psrad m2, 10
    psrad m1, 10
    packssdw m2, m1 ; 2 3 3 4 0 1 1 2
    punpckhqdq m0, m2, m2
    punpcklwd m2, m0 ; 23 34
    vextracti128 xm1, m2, 1 ; 01 12
.hv_w2_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movu xm4, [srcq+ssq*0]
    pshufb xm3, xm5
    pshufb xm4, xm5
    pmaddwd xm3, xm6
    pmaddwd xm4, xm6
    phaddd xm3, xm4
    pmaddwd xm4, xm7, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm8 ; a1 b1
    paddd xm4, xm2
    paddd xm3, xm10
    psrad xm3, 10
    packssdw xm3, xm3
    palignr xm2, xm3, xm0, 12
    mova xm0, xm3
    punpcklwd xm2, xm0 ; 45 56
    pmaddwd xm3, xm9, xm2 ; a2 b2
    paddd xm4, xm10
    paddd xm4, xm3
    psrad xm4, 10
    packusdw xm4, xm4
    pminsw xm4, xm11
    movd [dstq+dsq*0], xm4
    pextrd [dstq+dsq*1], xm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    WIN64_PUSH_XMM 14
    vbroadcasti128 m12, [subpel_h_shufA]
    pshufd m5, m6, q0000
    vbroadcasti128 m13, [subpel_h_shufB]
    pshufd m6, m6, q1111
    movu xm2, [srcq+r6 *2]
    vinserti128 m2, [srcq+r6 *1], 1 ; 0 1
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1 ; 2 3
    lea srcq, [srcq+ssq*2]
    movu xm3, [srcq+ssq*0] ; 4
    pshufb m1, m2, m12
    pmaddwd m1, m5
    pshufb m2, m13
    pmaddwd m2, m6
    pshufb m4, m0, m12
    pmaddwd m4, m5
    pshufb m0, m13
    pmaddwd m0, m6
    paddd m2, m1
    pshufb xm1, xm3, xm12
    pmaddwd xm1, xm5
    pshufb xm3, xm13
    pmaddwd xm3, xm6
    paddd m0, m4
    paddd m2, m10
    paddd xm1, xm10
    paddd m0, m10
    paddd xm3, xm1
    REPX {psrad x, 10}, m2, m0, xm3
    packssdw m2, m0 ; 0 2 1 3
    packssdw xm0, xm3 ; 2 4
    vperm2i128 m0, m2, 0x03
    punpcklwd m1, m2, m0 ; 01 12
    punpckhwd m2, m0 ; 23 34
.hv_w4_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti128 m3, [srcq+ssq*0], 1
    pmaddwd m4, m7, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m8 ; a1 b1
    paddd m4, m2
    pshufb m2, m3, m12
    pmaddwd m2, m5
    pshufb m3, m13
    pmaddwd m3, m6
    paddd m2, m10
    paddd m3, m2
    psrad m3, 10
    packssdw m3, m3 ; 5 5 6 6
    vperm2i128 m2, m0, m3, 0x21
    mova m0, m3
    punpckhwd m2, m3 ; 45 56
    pmaddwd m3, m9, m2 ; a2 b2
    paddd m4, m10
    paddd m4, m3
    psrad m4, 10
    vextracti128 xm3, m4, 1
    packusdw xm4, xm3
    pminsw xm4, xm11
    movq [dstq+dsq*0], xm4
    movhps [dstq+dsq*1], xm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
; w >= 8: column strips as in .v_w8 (wd packs strip count and h).
; The vertical coefficients (words in xm1) are parked in the stack slot
; v_mul and re-broadcast inside the loop; v_mul aliases the incoming-argument
; area annotated below as r4m (WIN64) or r6m (other ABIs).
.hv_w8:
    WIN64_PUSH_XMM 16, 12
    shr mxd, 16
    vbroadcasti128 m12, [subpel_h_shufA]
    vpbroadcastq m2, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xm1, [base+subpel_filters+1+myq*8]
    shl wd, 5
    mov r6, ssq
    sub srcq, 4
    pxor m0, m0
    neg r6
    punpcklbw m0, m2
    lea wd, [hq+wq-256]
    test dword r8m, 0x800
    jz .hv_w8_10bit
    psraw m0, 2
    psllw xm1, 2
.hv_w8_10bit:
    pshufd m7, m0, q0000
    pshufd m8, m0, q1111
%if WIN64
%define v_mul (rsp+stack_offset+40) ; r4m
%else
%define v_mul (rsp+stack_offset+ 8) ; r6m
%endif
    mova [v_mul], xm1
    pshufd m9, m0, q2222
.hv_w8_loop0:
    vbroadcasti128 m0, [srcq+ssq*0+ 0]
    vinserti128 m3, m0, [srcq+r6*2+ 0], 0
    lea r7, [srcq+ssq*2]
    vbroadcasti128 m2, [srcq+ssq*0+16]
    vinserti128 m1, m2, [srcq+r6*2+16], 0
    mov r8, dstq
    vinserti128 m0, [r7 +ssq*0+ 0], 1
    vinserti128 m2, [r7 +ssq*0+16], 1
    shufpd m4, m3, m1, 0x05
; PUT_6TAP_HV_H: horizontal pass of the 2-D filter. Same data flow as
; PUT_6TAP_H but uses m12 as shuffle mask, m10 as rounding term, a shift of
; 10 and a signed pack without clamping (the result is an intermediate).
%macro PUT_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%1, m12 ; 01 12 23 34
    pshufb m%2, m12 ; 45 56 67 78
    pmaddwd m%4, m7, m%1 ; a0
    pshufb m%3, m12 ; 89 9a ab bc
    pmaddwd m%5, m9, m%2 ; a2
    shufpd m%1, m%2, 0x05 ; 23 34 45 56
    paddd m%4, m%5 ; a0+a2
    pmaddwd m%5, m7, m%2 ; b0
    shufpd m%2, m%3, 0x05 ; 67 78 89 9a
    pmaddwd m%3, m9 ; b2
    pmaddwd m%1, m8 ; a1
    pmaddwd m%2, m8 ; b1
    paddd m%3, m%5 ; b0+b2
    paddd m%4, m10
    paddd m%3, m10
    paddd m%1, m%4
    paddd m%2, m%3
    psrad m%1, 10
    psrad m%2, 10
    packssdw m%1, m%2
%endmacro
    PUT_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
    movu xm4, [srcq+r6 *1+ 0]
    vinserti128 m4, [srcq+ssq*1+ 0], 1
    shufpd m1, m0, m2, 0x05
    PUT_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
    movu xm2, [srcq+r6 *1+16]
    vinserti128 m2, [srcq+ssq*1+16], 1
    shufpd m1, m4, m2, 0x05
    PUT_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
    vpermq m3, m3, q3120
    vpermq m4, m4, q3120
    vpermq m0, m0, q3120
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w8_loop:
    vpbroadcastd m15, [v_mul+4*0]
    vpbroadcastd m13, [v_mul+4*1]
    movu xm5, [r7+ssq*1+ 0]
    movu xm6, [r7+ssq*1+16]
    lea r7, [r7+ssq*2]
    pmaddwd m14, m15, m1 ; a0
    pmaddwd m15, m2 ; b0
    vinserti128 m5, [r7+ssq*0+ 0], 1
    vinserti128 m6, [r7+ssq*0+16], 1
    mova m1, m3
    pmaddwd m3, m13 ; a1
    mova m2, m4
    pmaddwd m4, m13 ; b1
    paddd m14, m3
    shufpd m3, m5, m6, 0x05
    paddd m15, m4
    PUT_6TAP_HV_H 5, 3, 6, 4, 13 ; 5 6
    vpbroadcastd m6, [v_mul+4*2]
    vpermq m5, m5, q3120
    shufpd m4, m0, m5, 0x05
    mova m0, m5
    punpcklwd m3, m4, m5 ; 45
    punpckhwd m4, m5 ; 56
    pmaddwd m5, m6, m3 ; a2
    pmaddwd m6, m4 ; b2
    paddd m14, m10
    paddd m15, m10
    paddd m5, m14
    paddd m6, m15
    psrad m5, 10
    psrad m6, 10
    packusdw m5, m6
    pminsw m5, m11
    vpermq m5, m5, q3120
    movu [r8+dsq*0], xm5
    vextracti128 [r8+dsq*1], m5, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 16
    add dstq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w8_loop0
    RET

; put_8tap_sharp_16bpc: stand-alone entry for the sharp filter. It adds
; FILTER_SHARP directly instead of reading t0d, reads whole 8-byte filter
; rows (no +1 offset) and implements its own 8-tap .v/.h/.hv paths. The
; unfiltered case and the narrow horizontal cases (w == 2, w == 4) jump into
; the corresponding labels of put_6tap_16bpc.
cglobal put_8tap_sharp_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx2
    imul mxd, mxm, 0x010101
    imul myd, mym, 0x010101
    mov wd, wm
    movifnidn hd, hm
    add mxd, FILTER_SHARP ; 8tap_h, mx, 4tap_h
    add myd, FILTER_SHARP ; 8tap_v, my, 4tap_v
    lea r8, [put_avx2]
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
; Vertical-only, 8 taps. srcq is moved up by 3 rows (r6 = 3*ss) so the first
; row of the filter window is at srcq. m6 = pd_32 rounding (w2/w4 paths),
; m7 = clamp, m8..m11 = coefficient word pairs 0..3.
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 12, 15
    vpbroadcastd m6, [pd_32]
    vpbroadcastw m7, r8m
    lea r6, [ssq*3]
    sub srcq, r6
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    cmp wd, 4
    jg .v_w8
    je .v_w4
.v_w2:
    movd xm2, [srcq+ssq*0]
    pinsrd xm2, [srcq+ssq*1], 1
    pinsrd xm2, [srcq+ssq*2], 2
    pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*4]
    movd xm3, [srcq+ssq*0]
    vpbroadcastd xm1, [srcq+ssq*1]
    vpbroadcastd xm0, [srcq+ssq*2]
    add srcq, r6
    vpblendd xm3, xm1, 0x02 ; 4 5
    vpblendd xm1, xm0, 0x02 ; 5 6
    palignr xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklwd xm3, xm1 ; 45 56
    punpcklwd xm1, xm2, xm4 ; 01 12
    punpckhwd xm2, xm4 ; 23 34
.v_w2_loop:
    vpbroadcastd xm4, [srcq+ssq*0]
    pmaddwd xm5, xm8, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm9 ; a1 b1
    paddd xm5, xm6
    paddd xm5, xm2
    mova xm2, xm3
    pmaddwd xm3, xm10 ; a2 b2
    paddd xm5, xm3
    vpblendd xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd xm4, xm0, 0x02 ; 7 8
    punpcklwd xm3, xm4 ; 67 78
    pmaddwd xm4, xm11, xm3 ; a3 b3
    paddd xm5, xm4
    psrad xm5, 6
    packusdw xm5, xm5
    pminsw xm5, xm7
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xm1, [srcq+ssq*0]
    vpbroadcastq m0, [srcq+ssq*1]
    vpbroadcastq m2, [srcq+ssq*2]
    vpbroadcastq m4, [srcq+r6 ]
    lea srcq, [srcq+ssq*4]
    vpbroadcastq m3, [srcq+ssq*0]
    vpbroadcastq m5, [srcq+ssq*1]
    vpblendd m1, m0, 0x30
    vpblendd m0, m2, 0x30
    punpcklwd m1, m0 ; 01 12
    vpbroadcastq m0, [srcq+ssq*2]
    add srcq, r6
    vpblendd m2, m4, 0x30
    vpblendd m4, m3, 0x30
    punpcklwd m2, m4 ; 23 34
    vpblendd m3, m5, 0x30
    vpblendd m5, m0, 0x30
    punpcklwd m3, m5 ; 45 56
.v_w4_loop:
    vpbroadcastq m4, [srcq+ssq*0]
    pmaddwd m5, m8, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m9 ; a1 b1
    paddd m5, m6
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m10 ; a2 b2
    paddd m5, m3
    vpblendd m3, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd m4, m0, 0x30
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m11, m3 ; a3 b3
    paddd m5, m4
    psrad m5, 6
    vextracti128 xm4, m5, 1
    packusdw xm5, xm4
    pminsw xm5, xm7
    movq [dstq+dsq*0], xm5
    movhps [dstq+dsq*1], xm5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
; w >= 8: 16-byte-wide column strips; wd = (w << 5) - 256 + h packs the strip
; counter above the low byte holding h. m6 is reused for data here, so the
; rounding is done by shifting right by 5 and then pavgw against zero.
.v_w8:
    shl wd, 5
    WIN64_PUSH_XMM 15
    lea wd, [hq+wq-256]
.v_w8_loop0:
    vbroadcasti128 m4, [srcq+ssq*0]
    vbroadcasti128 m5, [srcq+ssq*1]
    lea r7, [srcq+ssq*4]
    vbroadcasti128 m0, [srcq+r6 ]
    vbroadcasti128 m6, [srcq+ssq*2]
    mov r8, dstq
    vbroadcasti128 m1, [r7+ssq*0]
    vbroadcasti128 m2, [r7+ssq*1]
    vbroadcasti128 m3, [r7+ssq*2]
    add r7, r6
    shufpd m4, m0, 0x0c
    shufpd m5, m1, 0x0c
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5 ; 34
    shufpd m6, m2, 0x0c
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6 ; 45
    shufpd m0, m3, 0x0c
    punpcklwd m3, m6, m0 ; 23
    punpckhwd m6, m0 ; 56
.v_w8_loop:
    vbroadcasti128 m14, [r7+ssq*0]
    pmaddwd m12, m8, m1 ; a0
    pmaddwd m13, m8, m2 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m9 ; a1
    pmaddwd m4, m9 ; b1
    paddd m12, m3
    paddd m13, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m10 ; a2
    pmaddwd m6, m10 ; b2
    paddd m12, m5
    vbroadcasti128 m5, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    paddd m13, m6
    shufpd m6, m0, m14, 0x0d
    shufpd m0, m14, m5, 0x0c
    punpcklwd m5, m6, m0 ; 67
    punpckhwd m6, m0 ; 78
    pmaddwd m14, m11, m5 ; a3
    paddd m12, m14
    pmaddwd m14, m11, m6 ; b3
    paddd m13, m14
    psrad m12, 5
    psrad m13, 5
    packusdw m12, m13
    pxor m13, m13
    pavgw m12, m13
    pminsw m12, m7
    vpermq m12, m12, q3120
    movu [r8+dsq*0], xm12
    vextracti128 [r8+dsq*1], m12, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 16
    add dstq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w8_loop0
    RET
; Horizontal-only, 8 taps. Same rounding/clamp setup as the 6-tap .h; widths
; 2 and 4 are delegated to put_6tap_16bpc. For w >= 8 srcq is moved left by
; 6 bytes and m8..m11 hold coefficient word pairs 0..3.
.h:
    RESET_STACK_STATE
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    vpbroadcastw m5, r8m
    shr r7d, 11
    vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4]
    cmp wd, 4
    jl mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w2
    je mangle(private_prefix %+ _put_6tap_16bpc_avx2).h_w4
    WIN64_SPILL_XMM 13
    shr mxd, 16
    sub srcq, 6
    vpbroadcastq m0, [base+subpel_filters+mxq*8]
    vbroadcasti128 m6, [subpel_h_shufA]
    vbroadcasti128 m7, [subpel_h_shufB]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m8, m0, q0000
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    sub wd, 16
    jge .h_w16
.h_w8:
; PUT_8TAP_H: 8-tap horizontal filter over three overlapping source vectors;
; rounds with m4, shifts right by 6, packs with unsigned saturation and clamps
; to m5. The result is left in m%1.
%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
    pshufb m%1, m6 ; 0 1 1 2 2 3 3 4
    pmaddwd m%5, m9, m%4 ; abcd1
    pmaddwd m%1, m8 ; abcd0
    pshufb m%2, m7 ; 6 7 7 8 8 9 9 a
    shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd m%5, m4
    paddd m%1, m%5
    pmaddwd m%5, m11, m%2 ; abcd3
    paddd m%1, m%5
    pmaddwd m%5, m10, m%4 ; abcd2
    pshufb m%3, m7 ; a b b c c d d e
    pmaddwd m%4, m8 ; efgh0
    paddd m%1, m%5
    pmaddwd m%5, m9, m%2 ; efgh1
    shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd m%3, m11 ; efgh3
    pmaddwd m%2, m10 ; efgh2
    paddd m%4, m4
    paddd m%4, m%5
    paddd m%3, m%4
    paddd m%2, m%3
    psrad m%1, 6
    psrad m%2, 6
    packusdw m%1, m%2
    pminsw m%1, m5
%endmacro
    movu xm0, [srcq+ssq*0+ 0]
    vinserti128 m0, [srcq+ssq*1+ 0], 1
    movu xm2, [srcq+ssq*0+16]
    vinserti128 m2, [srcq+ssq*1+16], 1
    lea srcq, [srcq+ssq*2]
    shufpd m1, m0, m2, 0x05
    PUT_8TAP_H 0, 1, 2, 3, 12
    movu [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
; w >= 16: wd holds w-16; r6 counts 16-bit elements and walks each row from
; the rightmost 16-element group down to offset 0, one row per outer pass.
.h_w16:
    mov r6d, wd
.h_w16_loop:
    movu m0, [srcq+r6*2+ 0]
    movu m1, [srcq+r6*2+ 8]
    movu m2, [srcq+r6*2+16]
    PUT_8TAP_H 0, 1, 2, 3, 12
    movu [dstq+r6*2], m0
    sub r6d, 16
    jge .h_w16_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w16
    RET
; Combined horizontal + vertical, 8 vertical taps. m15 = clamp, m6 = pd_512
; rounding for the narrow paths; the horizontal pass for w <= 4 uses the
; 4 coefficient bytes at offset 2 of the row, as in the 6-tap code. Bit 11 of
; r8m again trades two bits of precision between the two coefficient sets.
.hv:
    WIN64_SPILL_XMM 16
    vpbroadcastw m15, r8m
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    vpbroadcastd m0, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m1, [base+subpel_filters+myq*8]
    vpbroadcastd m6, [pd_512]
    lea r6, [ssq*3]
    sub srcq, 2
    sub srcq, r6
    pxor m7, m7
    punpcklbw m7, m0
    punpcklbw m1, m1
    psraw m1, 8 ; sign-extend
    test dword r8m, 0x800
    jz .hv_10bit
    psraw m7, 2
    psllw m1, 2
.hv_10bit:
    pshufd m11, m1, q0000
    pshufd m12, m1, q1111
    pshufd m13, m1, q2222
    pshufd m14, m1, q3333
    cmp wd, 4
    je .hv_w4
    vbroadcasti128 m9, [subpel_h_shuf2]
    vbroadcasti128 m1, [srcq+r6 ] ; 3 3
    movu xm3, [srcq+ssq*2]
    movu xm0, [srcq+ssq*0]
    movu xm2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*4]
    vinserti128 m3, [srcq+ssq*0], 1 ; 2 4
    vinserti128 m0, [srcq+ssq*1], 1 ; 0 5
    vinserti128 m2, [srcq+ssq*2], 1 ; 1 6
    add srcq, r6
    pshufb m1, m9
    pshufb m3, m9
    pshufb m0, m9
    pshufb m2, m9
    pmaddwd m1, m7
    pmaddwd m3, m7
    pmaddwd m0, m7
    pmaddwd m2, m7
    phaddd m1, m3
    phaddd m0, m2
    paddd m1, m6
    paddd m0, m6
    psrad m1, 10
    psrad m0, 10
    packssdw m1, m0 ; 3 2 0 1
    vextracti128 xm0, m1, 1 ; 3 4 5 6
    pshufd xm2, xm1, q1301 ; 2 3 1 2
    pshufd xm3, xm0, q2121 ; 4 5 4 5
    punpckhwd xm1, xm2 ; 01 12
    punpcklwd xm2, xm0 ; 23 34
    punpckhwd xm3, xm0 ; 45 56
.hv_w2_loop:
    movu xm4, [srcq+ssq*0]
    movu xm5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm4, xm9
    pshufb xm5, xm9
    pmaddwd xm4, xm7
    pmaddwd xm5, xm7
    phaddd xm4, xm5
    pmaddwd xm5, xm11, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm12 ; a1 b1
    paddd xm5, xm2
    mova xm2, xm3
    pmaddwd xm3, xm13 ; a2 b2
    paddd xm5, xm3
    paddd xm4, xm6
    psrad xm4, 10
    packssdw xm4, xm4
    palignr xm3, xm4, xm0, 12
    mova xm0, xm4
    punpcklwd xm3, xm0 ; 67 78
    pmaddwd xm4, xm14, xm3 ; a3 b3
    paddd xm5, xm6
    paddd xm5, xm4
    psrad xm5, 10
    packusdw xm5, xm5
    pminsw xm5, xm15
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti128 m9, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    pshufd m8, m7, q1111
    pshufd m7, m7, q0000
    movu xm1, [srcq+ssq*0]
    vinserti128 m1, [srcq+ssq*1], 1 ; 0 1
    vbroadcasti128 m0, [srcq+r6 ]
    vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3
    lea srcq, [srcq+ssq*4]
    vinserti128 m0, [srcq+ssq*0], 1 ; 3 4
    movu xm3, [srcq+ssq*1]
    vinserti128 m3, [srcq+ssq*2], 1 ; 5 6
    add srcq, r6
    pshufb m4, m1, m9
    pshufb m1, m10
    pmaddwd m4, m7
    pmaddwd m1, m8
    pshufb m5, m2, m9
    pshufb m2, m10
    pmaddwd m5, m7
    pmaddwd m2, m8
    paddd m4, m6
    paddd m1, m4
    pshufb m4, m0, m9
    pshufb m0, m10
    pmaddwd m4, m7
    pmaddwd m0, m8
    paddd m5, m6
    paddd m2, m5
    pshufb m5, m3, m9
    pshufb m3, m10
    pmaddwd m5, m7
    pmaddwd m3, m8
    paddd m4, m6
    paddd m4, m0
    paddd m5, m6
    paddd m5, m3
    vperm2i128 m0, m1, m2, 0x21
    psrld m1, 10
    psrld m2, 10
    vperm2i128 m3, m4, m5, 0x21
    pslld m4, 6
    pslld m5, 6
    pblendw m2, m4, 0xaa ; 23 34
    pslld m0, 6
    pblendw m1, m0, 0xaa ; 01 12
    psrld m3, 10
    pblendw m3, m5, 0xaa ; 45 56
    psrad m0, m5, 16
.hv_w4_loop:
    movu xm4, [srcq+ssq*0]
    vinserti128 m4, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    pmaddwd m5, m11, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m12 ; a1 b1
    paddd m5, m6
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m13 ; a2 b2
    paddd m5, m3
    pshufb m3, m4, m9
    pshufb m4, m10
    pmaddwd m3, m7
    pmaddwd m4, m8
    paddd m3, m6
    paddd m4, m3
    psrad m4, 10
    packssdw m0, m4 ; _ 7 6 8
    vpermq m3, m0, q1122 ; _ 6 _ 7
    punpckhwd m3, m0 ; 67 78
    mova m0, m4
    pmaddwd m4, m14, m3 ; a3 b3
    paddd m4, m5
    psrad m4, 10
    vextracti128 xm5, m4, 1
    packusdw xm4, xm5
    pminsw xm4, xm15
    movq [dstq+dsq*0], xm4
    movhps [dstq+dsq*1], xm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
; w >= 8: column strips (wd packs strip count and h). All 16 vector registers
; are in use, so the four vertical coefficient pairs live in the v_mul stack
; slot and the shuffle masks / pd_512 are reloaded inside the loop.
.hv_w8:
    shr mxd, 16
    vpbroadcastq m2, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xm1, [base+subpel_filters+myq*8]
    shl wd, 5
    lea r6, [ssq*3]
    sub srcq, 6
    pxor m0, m0
    sub srcq, r6
    punpcklbw m0, m2
    lea wd, [hq+wq-256]
    test dword r8m, 0x800
    jz .hv_w8_10bit
    psraw m0, 2
    psllw xm1, 2
.hv_w8_10bit:
    pshufd m11, m0, q0000
    pshufd m12, m0, q1111
    mova [v_mul], xm1
    pshufd m13, m0, q2222
    pshufd m14, m0, q3333
.hv_w8_loop0:
; PUT_8TAP_HV_H: horizontal pass of the 2-D 8-tap filter. Uses m2/m3 as fixed
; temporaries, m8/m9 as shuffle masks, m10 as rounding term, m11..m14 as
; coefficient pairs; shifts right by 10 and packs signed into m%1.
%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
    pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6
    pshufb m%1, m8 ; 0 1 1 2 2 3 3 4
    pmaddwd m3, m12, m2
    pmaddwd m%1, m11
    pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
    shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd m3, m10
    paddd m%1, m3
    pmaddwd m3, m14, m%2
    paddd m%1, m3
    pmaddwd m3, m13, m2
    pshufb m%3, m9 ; a b b c c d d e
    pmaddwd m2, m11
    paddd m%1, m3
    pmaddwd m3, m12, m%2
    shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd m%3, m14
    pmaddwd m%2, m13
    paddd m2, m10
    paddd m2, m3
    paddd m%3, m2
    paddd m%2, m%3
    psrad m%1, 10
    psrad m%2, 10
    packssdw m%1, m%2
%endmacro
    movu xm4, [srcq+r6 *1+ 0]
    vbroadcasti128 m8, [subpel_h_shufA]
    lea r7, [srcq+ssq*4]
    movu xm6, [srcq+r6 *1+ 8]
    vbroadcasti128 m9, [subpel_h_shufB]
    mov r8, dstq
    movu xm0, [srcq+r6 *1+16]
    vpbroadcastd m10, [pd_512]
    movu xm5, [srcq+ssq*0+ 0]
    vinserti128 m5, [r7 +ssq*0+ 0], 1
    movu xm1, [srcq+ssq*0+16]
    vinserti128 m1, [r7 +ssq*0+16], 1
    shufpd m7, m5, m1, 0x05
; Row 3 was loaded into 128-bit halves only, so its macro expansion is
; bracketed by INIT_XMM / INIT_YMM.
    INIT_XMM avx2
    PUT_8TAP_HV_H 4, 6, 0 ; 3
    INIT_YMM avx2
    PUT_8TAP_HV_H 5, 7, 1 ; 0 4
    movu xm0, [srcq+ssq*2+ 0]
    vinserti128 m0, [srcq+r6 *2+ 0], 1
    movu xm1, [srcq+ssq*2+16]
    vinserti128 m1, [srcq+r6 *2+16], 1
    shufpd m7, m0, m1, 0x05
    PUT_8TAP_HV_H 0, 7, 1 ; 2 6
    movu xm6, [srcq+ssq*1+ 0]
    movu xm1, [srcq+ssq*1+16]
    vinserti128 m6, [r7 +ssq*1+ 0], 1
    vinserti128 m1, [r7 +ssq*1+16], 1
    add r7, r6
    shufpd m7, m6, m1, 0x05
    PUT_8TAP_HV_H 6, 7, 1 ; 1 5
    vpermq m4, m4, q1100
    vpermq m5, m5, q3120
    vpermq m6, m6, q3120
    vpermq m7, m0, q3120
    punpcklwd m3, m7, m4 ; 23
    punpckhwd m4, m5 ; 34
    punpcklwd m1, m5, m6 ; 01
    punpckhwd m5, m6 ; 45
    punpcklwd m2, m6, m7 ; 12
    punpckhwd m6, m7 ; 56
; Inner loop, two output rows per iteration. The upper lane of m0 (the most
; recent filtered row) is spilled to the destination row at [r8] and
; reloaded with vbroadcasti128 further down; that row is overwritten with the
; final pixels at the end of the iteration, so the spill is never visible.
.hv_w8_loop:
    vpbroadcastd m9, [v_mul+4*0]
    vpbroadcastd m7, [v_mul+4*1]
    vpbroadcastd m10, [v_mul+4*2]
    pmaddwd m8, m9, m1 ; a0
    pmaddwd m9, m2 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m7 ; a1
    pmaddwd m4, m7 ; b1
    paddd m8, m3
    paddd m9, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m10 ; a2
    pmaddwd m6, m10 ; b2
    paddd m8, m5
    paddd m9, m6
    movu xm5, [r7+ssq*0]
    vinserti128 m5, [r7+ssq*1], 1
    vbroadcasti128 m7, [subpel_h_shufA]
    vbroadcasti128 m10, [subpel_h_shufB]
    movu xm6, [r7+ssq*0+16]
    vinserti128 m6, [r7+ssq*1+16], 1
    vextracti128 [r8], m0, 1
    pshufb m0, m5, m7 ; 01
    pshufb m5, m10 ; 23
    pmaddwd m0, m11
    pmaddwd m5, m12
    paddd m0, m5
    pshufb m5, m6, m7 ; 89
    pshufb m6, m10 ; ab
    pmaddwd m5, m13
    pmaddwd m6, m14
    paddd m6, m5
    movu xm5, [r7+ssq*0+8]
    vinserti128 m5, [r7+ssq*1+8], 1
    lea r7, [r7+ssq*2]
    pshufb m7, m5, m7
    pshufb m5, m10
    pmaddwd m10, m13, m7
    pmaddwd m7, m11
    paddd m0, m10
    vpbroadcastd m10, [pd_512]
    paddd m6, m7
    pmaddwd m7, m14, m5
    pmaddwd m5, m12
    paddd m0, m7
    paddd m5, m6
    vbroadcasti128 m6, [r8]
    paddd m8, m10
    paddd m9, m10
    paddd m0, m10
    paddd m5, m10
    vpbroadcastd m10, [v_mul+4*3]
    psrad m0, 10
    psrad m5, 10
    packssdw m0, m5
    vpermq m7, m0, q3120 ; 7 8
    shufpd m6, m7, 0x04 ; 6 7
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7 ; 78
    pmaddwd m7, m10, m5 ; a3
    pmaddwd m10, m6 ; b3
    paddd m7, m8
    paddd m9, m10
    psrad m7, 10
    psrad m9, 10
    packusdw m7, m9
    pminsw m7, m15
    vpermq m7, m7, q3120
    movu [r8+dsq*0], xm7
    vextracti128 [r8+dsq*1], m7, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 16
    add dstq, 16
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w8_loop0
    RET

; Entry stubs for the prep_8tap smooth/regular variants, built with the same
; FN macro as the put stubs: smooth jumps to prep_6tap_16bpc, regular emits
; no jump.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN regular, REGULAR
cglobal prep_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-prep_avx2 imul mxd, mxm, 0x010101 imul myd, mym, 0x010101 mov wd, wm add mxd, t0d ; 6tap_h, mx, 4tap_h add myd, t0d ; 6tap_v, my, 4tap_v movifnidn hd, hm lea r8, [prep_avx2] test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v .prep: tzcnt wd, wd mov r6d, r8m ; bitdepth_max movzx wd, word [r8+wq*2+table_offset(prep,)] vpbroadcastd m5, [pw_8192] shr r6d, 11 add wq, r8 vpbroadcastd m4, [base+prep_mul+r6*4] lea r6, [ssq*3] %if WIN64 pop r8 %endif lea r7, [dsq*3] jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm0, [base+subpel_filters+mxq*8] vbroadcasti128 m3, [subpel_h_shufA] lea r6, [ssq*3] vbroadcasti128 m4, [subpel_h_shufB] lea r7, [dsq*3] WIN64_SPILL_XMM 8 pshufd xm0, xm0, q2211 test dword r8m, 0x800 jnz .h_w4_12bpc psllw xm0, 2 .h_w4_12bpc: vpbroadcastq m6, xm0 vpermq m7, m0, q1111 .h_w4_loop: movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*2], 1 movu xm2, [srcq+ssq*1] vinserti128 m2, [srcq+r6 *1], 1 lea srcq, [srcq+ssq*4] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*2], xm0 movq [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) cmp wd, 4 je .h_w4 shr mxd, 16 sub srcq, 4 vpbroadcastq m0, [base+subpel_filters+1+mxq*8] WIN64_SPILL_XMM 10 vbroadcasti128 m6, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r8m, 0x800 jnz .h_12bpc psllw m0, 2 .h_12bpc: pshufd m7, m0, q0000 pshufd m8, m0, q1111 pshufd m9, m0, q2222 cmp wd, 8 jg .h_w16 .h_w8: movu xm0, [srcq+ssq*0+ 0] vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 lea srcq, 
[srcq+ssq*2] shufpd m1, m0, m2, 0x05 %macro PREP_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%1, m6 ; 01 12 23 34 pshufb m%2, m6 ; 45 56 67 78 pmaddwd m%4, m7, m%1 ; a0 pshufb m%3, m6 ; 89 9a ab bc pmaddwd m%5, m9, m%2 ; a2 shufpd m%1, m%2, 0x05 ; 23 34 45 56 paddd m%4, m%5 ; a0+a2 pmaddwd m%5, m7, m%2 ; b0 shufpd m%2, m%3, 0x05 ; 67 78 89 9a pmaddwd m%3, m9 ; b2 pmaddwd m%1, m8 ; a1 pmaddwd m%2, m8 ; b1 paddd m%3, m%5 ; b0+b2 paddd m%4, m5 paddd m%3, m5 paddd m%1, m%4 paddd m%2, m%3 psrad m%1, 4 psrad m%2, 4 packssdw m%1, m%2 %endmacro PREP_6TAP_H 0, 1, 2, 3, 4 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*2], m0, 1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w8 RET .h_w16: add wd, wd .h_w16_loop0: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6-32] movu m1, [srcq+r6-24] movu m2, [srcq+r6-16] PREP_6TAP_H 0, 1, 2, 3, 4 mova [dstq+r6-32], m0 sub r6d, 32 jg .h_w16_loop add srcq, ssq lea dstq, [dstq+dsq*2] dec hd jg .h_w16_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [base+subpel_filters+1+myq*8] WIN64_SPILL_XMM 9, 12 vpbroadcastd m5, [prep_8tap_1d_rnd] mov r6, ssq punpcklbw m0, m0 neg r6 psraw m0, 8 ; sign-extend test dword r8m, 0x800 jnz .v_12bpc psllw m0, 2 .v_12bpc: pshufd m6, m0, q0000 pshufd m7, m0, q1111 pshufd m8, m0, q2222 cmp wd, 4 jg .v_w8 .v_w4: movq xm1, [srcq+r6 *2] vpbroadcastq m3, [srcq+r6 *1] vpbroadcastq m2, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpbroadcastq m0, [srcq+ssq*0] vpblendd m1, m3, 0x30 vpblendd m3, m2, 0x30 punpcklwd m1, m3 ; 01 12 vpblendd m2, m4, 0x30 vpblendd m4, m0, 0x30 punpcklwd m2, m4 ; 23 34 .v_w4_loop: vpbroadcastq m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m4, m6, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m7 ; a1 b1 paddd m4, m2 vpblendd m2, m0, m3, 0x30 vpbroadcastq m0, [srcq+ssq*0] vpblendd m3, m0, 0x30 punpcklwd m2, m3 ; 45 56 pmaddwd m3, m8, m2 ; a2 b2 paddd m4, m5 paddd m4, m3 psrad m4, 4 vextracti128 xm3, m4, 1 packssdw xm4, xm3 movq [dstq+dsq*0], xm4 movhps 
[dstq+dsq*2], xm4 lea dstq, [dstq+dsq*4] sub hd, 2 jg .v_w4_loop RET .v_w8: WIN64_PUSH_XMM 12 shl wd, 5 lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m3, [srcq+r6 *2] vbroadcasti128 m4, [srcq+r6 *1] lea r8, [srcq+ssq*2] vbroadcasti128 m0, [srcq+ssq*0] vbroadcasti128 m1, [srcq+ssq*1] mov r7, dstq vbroadcasti128 m2, [r8+ssq*0] shufpd m3, m0, 0x0c shufpd m4, m1, 0x0c punpcklwd m1, m3, m4 ; 01 punpckhwd m3, m4 ; 23 shufpd m0, m2, 0x0c punpcklwd m2, m4, m0 ; 12 punpckhwd m4, m0 ; 34 .v_w8_loop: vbroadcasti128 m9, [r8+ssq*1] pmaddwd m10, m6, m1 ; a0 lea r8, [r8+ssq*2] pmaddwd m11, m6, m2 ; b0 mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 paddd m10, m5 paddd m11, m5 paddd m10, m3 vbroadcasti128 m3, [r8+ssq*0] paddd m11, m4 shufpd m4, m0, m9, 0x0d shufpd m0, m9, m3, 0x0c punpcklwd m3, m4, m0 ; 45 punpckhwd m4, m0 ; 56 pmaddwd m9, m8, m3 ; a2 paddd m10, m9 pmaddwd m9, m8, m4 ; b2 paddd m11, m9 psrad m10, 4 psrad m11, 4 packssdw m10, m11 vpermq m10, m10, q3120 mova [r7+dsq*0], xm10 vextracti128 [r7+dsq*2], m10, 1 lea r7, [r7+dsq*4] sub hd, 2 jg .v_w8_loop add srcq, 16 add dstq, 16 movzx hd, wb sub wd, 1<<8 jg .v_w8_loop0 RET .hv: WIN64_SPILL_XMM 13, 15 vpbroadcastd m7, [prep_8tap_2d_rnd] vbroadcasti128 m8, [subpel_h_shufA] cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m1, [base+subpel_filters+1+myq*8] mov r6, ssq sub srcq, 2 pxor m6, m6 neg r6 punpcklbw m6, m0 punpcklbw m1, m1 psraw m6, 4 psraw m1, 8 test dword r8m, 0x800 jz .hv_w4_10bit psraw m6, 2 .hv_w4_10bit: pshufd m10, m1, q0000 pshufd m11, m1, q1111 pshufd m12, m1, q2222 .hv_w4: movu xm2, [srcq+r6 *2] vinserti128 m2, [srcq+r6 *1], 1 ; 0 1 pshufd m5, m6, q0000 vbroadcasti128 m9, [base+subpel_h_shufB] movu xm0, [srcq+ssq*0] pshufd m6, m6, q1111 vinserti128 m0, [srcq+ssq*1], 1 ; 2 3 lea srcq, [srcq+ssq*2] movu xm3, [srcq+ssq*0] ; 4 pshufb m1, m2, m8 pmaddwd m1, m5 pshufb m2, m9 pmaddwd m2, m6 pshufb 
m4, m0, m8 pmaddwd m4, m5 pshufb m0, m9 pmaddwd m0, m6 paddd m2, m1 pshufb xm1, xm3, xm8 pmaddwd xm1, xm5 pshufb xm3, xm9 pmaddwd xm3, xm6 paddd m0, m4 paddd m2, m7 paddd xm1, xm7 paddd m0, m7 paddd xm3, xm1 REPX {psrad x, 6}, m2, m0, xm3 packssdw m2, m0 ; 0 2 1 3 packssdw xm0, xm3 ; 2 4 vperm2i128 m0, m2, 0x03 punpcklwd m1, m2, m0 ; 01 12 punpckhwd m2, m0 ; 23 34 .hv_w4_loop: movu xm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m3, [srcq+ssq*0], 1 pmaddwd m4, m10, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m11 ; a1 b1 paddd m4, m2 pshufb m2, m3, m8 pmaddwd m2, m5 pshufb m3, m9 pmaddwd m3, m6 paddd m2, m7 paddd m3, m2 psrad m3, 6 packssdw m3, m3 ; 5 5 6 6 vperm2i128 m2, m0, m3, 0x21 mova m0, m3 punpckhwd m2, m3 ; 45 56 pmaddwd m3, m12, m2 ; a2 b2 paddd m4, m7 paddd m4, m3 psrad m4, 6 vextracti128 xm3, m4, 1 packssdw xm4, xm3 movq [dstq+dsq*0], xm4 movhps [dstq+dsq*2], xm4 lea dstq, [dstq+dsq*4] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd pmovsxbw xm1, [base+subpel_filters+1+myq*8] WIN64_PUSH_XMM 15 shl wd, 5 mov r6, ssq sub srcq, 4 neg r6 lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 psraw m0, 4 test dword r8m, 0x800 jz .hv_w8_10bit psraw m0, 2 .hv_w8_10bit: pshufd m10, m0, q0000 pshufd m11, m0, q1111 mova [v_mul], xm1 pshufd m12, m0, q2222 .hv_w8_loop0: vbroadcasti128 m0, [srcq+ssq*0+ 0] vinserti128 m3, m0, [srcq+r6*2+ 0], 0 lea r8, [srcq+ssq*2] vbroadcasti128 m2, [srcq+ssq*0+16] vinserti128 m1, m2, [srcq+r6*2+16], 0 mov r7, dstq vinserti128 m0, [r8 +ssq*0+ 0], 1 vinserti128 m2, [r8 +ssq*0+16], 1 shufpd m4, m3, m1, 0x05 %macro PREP_6TAP_HV_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%1, m8 ; 01 12 23 34 pshufb m%2, m8 ; 45 56 67 78 pmaddwd m%4, m10, m%1 ; a0 pshufb m%3, m8 ; 89 9a ab bc pmaddwd m%5, m12, m%2 ; a2 shufpd m%1, m%2, 0x05 ; 23 34 45 56 paddd m%4, m%5 ; a0+a2 pmaddwd m%5, m10, m%2 ; b0 shufpd m%2, m%3, 0x05 ; 67 78 89 9a pmaddwd m%3, m12 ; b2 
; Remainder of the PREP_6TAP_HV_H macro body (its header and first instructions
; precede this point): multiply by the second coefficient dword held in m11,
; add the prep_8tap_2d_rnd constant kept in m7, shift right by 6 and pack the
; two dword results into words in m%1.
    pmaddwd         m%1, m11 ; a1
    pmaddwd         m%2, m11 ; b1
    paddd           m%3, m%5 ; b0+b2
    paddd           m%4, m7
    paddd           m%3, m7
    paddd           m%1, m%4
    paddd           m%2, m%3
    psrad           m%1, 6
    psrad           m%2, 6
    packssdw        m%1, m%2
%endmacro
    ; Horizontal pass over the rows needed before the vertical loop starts.
    ; The trailing comments on each macro call give the row indices that end
    ; up in the two 128-bit lanes of the destination register.
    PREP_6TAP_HV_H 3, 4, 1, 5, 6 ; 0 2
    movu            xm4, [srcq+r6 *1+ 0]
    vinserti128     m4, [srcq+ssq*1+ 0], 1
    shufpd          m1, m0, m2, 0x05
    PREP_6TAP_HV_H 0, 1, 2, 5, 6 ; 2 4
    movu            xm2, [srcq+r6 *1+16]
    vinserti128     m2, [srcq+ssq*1+16], 1
    shufpd          m1, m4, m2, 0x05
    PREP_6TAP_HV_H 4, 1, 2, 5, 6 ; 1 3
    vpermq          m3, m3, q3120
    vpermq          m4, m4, q3120
    vpermq          m0, m0, q3120
    ; Interleave neighbouring rows word-wise so that one pmaddwd applies a
    ; pair of vertical taps.
    punpcklwd       m1, m3, m4 ; 01
    punpckhwd       m3, m4 ; 23
    punpcklwd       m2, m4, m0 ; 12
    punpckhwd       m4, m0 ; 34
.hv_w8_loop:
    ; Each iteration consumes two new source rows and stores two output rows
    ; (hd is decremented by 2). The vertical coefficients are re-read from
    ; v_mul because the registers are needed as temporaries in between.
    vpbroadcastd    m14, [v_mul+4*0]
    vpbroadcastd    m9, [v_mul+4*1]
    movu            xm5, [r8+ssq*1+ 0]
    movu            xm6, [r8+ssq*1+16]
    lea             r8, [r8+ssq*2]
    pmaddwd         m13, m14, m1 ; a0
    pmaddwd         m14, m2 ; b0
    vinserti128     m5, [r8+ssq*0+ 0], 1
    vinserti128     m6, [r8+ssq*0+16], 1
    mova            m1, m3
    pmaddwd         m3, m9 ; a1
    mova            m2, m4
    pmaddwd         m4, m9 ; b1
    paddd           m13, m3
    shufpd          m3, m5, m6, 0x05
    paddd           m14, m4
    PREP_6TAP_HV_H 5, 3, 6, 4, 9 ; 5 6
    vpbroadcastd    m6, [v_mul+4*2]
    vpermq          m5, m5, q3120
    shufpd          m4, m0, m5, 0x05
    mova            m0, m5
    punpcklwd       m3, m4, m5 ; 45
    punpckhwd       m4, m5 ; 56
    pmaddwd         m5, m6, m3 ; a2
    pmaddwd         m6, m4 ; b2
    paddd           m13, m7
    paddd           m14, m7
    paddd           m5, m13
    paddd           m6, m14
    psrad           m5, 6
    psrad           m6, 6
    packssdw        m5, m6
    vpermq          m5, m5, q3120
    mova            [r7+dsq*0], xm5
    vextracti128    [r7+dsq*2], m5, 1
    lea             r7, [r7+dsq*4]
    sub             hd, 2
    jg .hv_w8_loop
    ; Next 16-byte column: the row count is restored from the low byte of wd
    ; and the column counter kept above bit 8 is decremented.
    add             srcq, 16
    add             dstq, 16
    movzx           hd, wb
    sub             wd, 1<<8
    jg .hv_w8_loop0
    RET

;------------------------------------------------------------------------------
; prep_8tap_sharp_16bpc(dst, ds, src, ss, w, h, mx, my)
;
; Entry point for the 8-tap "sharp" subpel prep filter on 16-bit pixels.
;
; mx and my are each replicated into three bytes (imul by 0x010101) and
; FILTER_SHARP is added, giving the packed layout noted in the trailing
; comments (8-tap index, raw fraction, 4-tap index). Bits 8-11 of the packed
; value are then tested to decide which paths are needed:
;   - mx bits set           -> .h  (which itself branches to .hv if my bits set)
;   - neither mx nor my set -> jumps to the .prep label of prep_6tap_16bpc
;   - only my bits set      -> falls through to .v
;
; Besides the named arguments, the code reads r8m and tests bit 0x800 to pick
; the coefficient scaling. NOTE(review): the .v_12bpc / .h_12bpc /
; .hv_w4_10bit label names suggest r8m is the bitdepth_max argument -- confirm
; against the C prototype.
;
; r8 holds the address of prep_avx2 so that `base+symbol` addresses tables
; relative to it; r8 is reused as a row pointer in the w8 loops once the
; filter coefficients have been loaded.
;------------------------------------------------------------------------------
cglobal prep_8tap_sharp_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-prep_avx2
    imul            mxd, mxm, 0x010101
    imul            myd, mym, 0x010101
    mov             wd, wm
    movifnidn       hd, hm
    add             mxd, FILTER_SHARP ; 8tap_h, mx, 4tap_h
    add             myd, FILTER_SHARP ; 8tap_v, my, 4tap_v
    lea             r8, [prep_avx2]
    test            mxd, 0xf00
    jnz .h
    test            myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
.v:
    ; Vertical-only filtering. The filter index is the low byte of myd when
    ; h == 4 and bits 16 and up of myd otherwise.
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd
    vpbroadcastq    m0, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 12, 15
    vpbroadcastd    m7, [prep_8tap_1d_rnd]
    ; r6 = 3*stride; the source pointer is moved up three rows so the 8-tap
    ; window starts at [srcq+ssq*0].
    lea             r6, [ssq*3]
    punpcklbw       m0, m0
    sub             srcq, r6
    psraw           m0, 8 ; sign-extend
    ; Coefficients are multiplied by 4 unless bit 0x800 of r8m is set.
    test            dword r8m, 0x800
    jnz .v_12bpc
    psllw           m0, 2
.v_12bpc:
    ; Broadcast the four coefficient pairs into m8..m11.
    pshufd          m8, m0, q0000
    pshufd          m9, m0, q1111
    pshufd          m10, m0, q2222
    pshufd          m11, m0, q3333
    cmp             wd, 4
    jg .v_w8
.v_w4:
    ; Prologue: load the first seven rows (64 bits each) and interleave them
    ; into the row pairs named in the trailing comments. m0 keeps the last
    ; loaded row for the loop.
    movq            xm1, [srcq+ssq*0]
    vpbroadcastq    m0, [srcq+ssq*1]
    vpbroadcastq    m2, [srcq+ssq*2]
    vpbroadcastq    m4, [srcq+r6 ]
    lea             srcq, [srcq+ssq*4]
    vpbroadcastq    m3, [srcq+ssq*0]
    vpbroadcastq    m5, [srcq+ssq*1]
    vpblendd        m1, m0, 0x30
    vpblendd        m0, m2, 0x30
    punpcklwd       m1, m0 ; 01 12
    vpbroadcastq    m0, [srcq+ssq*2]
    add             srcq, r6
    vpblendd        m2, m4, 0x30
    vpblendd        m4, m3, 0x30
    punpcklwd       m2, m4 ; 23 34
    vpblendd        m3, m5, 0x30
    vpblendd        m5, m0, 0x30
    punpcklwd       m3, m5 ; 45 56
.v_w4_loop:
    ; Two output rows per iteration (a* in the low lane, b* in the high lane);
    ; the row-pair registers are rotated m1 <- m2 <- m3 <- new.
    vpbroadcastq    m4, [srcq+ssq*0]
    pmaddwd         m5, m8, m1 ; a0 b0
    mova            m1, m2
    pmaddwd         m2, m9 ; a1 b1
    paddd           m5, m7
    paddd           m5, m2
    mova            m2, m3
    pmaddwd         m3, m10 ; a2 b2
    paddd           m5, m3
    vpblendd        m3, m0, m4, 0x30
    vpbroadcastq    m0, [srcq+ssq*1]
    lea             srcq, [srcq+ssq*2]
    vpblendd        m4, m0, 0x30
    punpcklwd       m3, m4 ; 67 78
    pmaddwd         m4, m11, m3 ; a3 b3
    paddd           m5, m4
    psrad           m5, 4
    vextracti128    xm4, m5, 1
    packssdw        xm5, xm4
    movq            [dstq+dsq*0], xm5
    movhps          [dstq+dsq*2], xm5
    lea             dstq, [dstq+dsq*4]
    sub             hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    WIN64_PUSH_XMM 15
    ; Pack the loop state into wd: the low byte holds h, the bits above hold a
    ; column counter (w*32 - 256, i.e. one step of 1<<8 per 8-pixel column).
    shl             wd, 5
    lea             wd, [hq+wq-256]
.v_w8_loop0:
    ; Per-column prologue: load seven rows of 16 bytes, pair them up with
    ; shufpd and interleave into the row pairs named in the trailing comments.
    ; r8 walks the source rows, r7 walks the destination rows.
    vbroadcasti128  m4, [srcq+ssq*0]
    vbroadcasti128  m5, [srcq+ssq*1]
    lea             r8, [srcq+ssq*4]
    vbroadcasti128  m0, [srcq+r6 ]
    vbroadcasti128  m6, [srcq+ssq*2]
    mov             r7, dstq
    vbroadcasti128  m1, [r8+ssq*0]
    vbroadcasti128  m2, [r8+ssq*1]
    vbroadcasti128  m3, [r8+ssq*2]
    add             r8, r6
    shufpd          m4, m0, 0x0c
    shufpd          m5, m1, 0x0c
    punpcklwd       m1, m4, m5 ; 01
    punpckhwd       m4, m5 ; 34
    shufpd          m6, m2, 0x0c
    punpcklwd       m2, m5, m6 ; 12
    punpckhwd       m5, m6 ; 45
    shufpd          m0, m3, 0x0c
    punpcklwd       m3, m6, m0 ; 23
    punpckhwd       m6, m0 ; 56
.v_w8_loop:
    ; Two output rows per iteration: m12 accumulates row a, m13 row b.
    vbroadcasti128  m14, [r8+ssq*0]
    pmaddwd         m12, m8, m1 ; a0
    pmaddwd         m13, m8, m2 ; b0
    mova            m1, m3
    mova            m2, m4
    pmaddwd         m3, m9 ; a1
    pmaddwd         m4, m9 ; b1
    paddd           m12, m7
    paddd           m13, m7
    paddd           m12, m3
    paddd           m13, m4
    mova            m3, m5
    mova            m4, m6
    pmaddwd         m5, m10 ; a2
    pmaddwd         m6, m10 ; b2
    paddd           m12, m5
    vbroadcasti128  m5, [r8+ssq*1]
    lea             r8, [r8+ssq*2]
    paddd           m13, m6
    shufpd          m6, m0, m14, 0x0d
    shufpd          m0, m14, m5, 0x0c
    punpcklwd       m5, m6, m0 ; 67
    punpckhwd       m6, m0 ; 78
    pmaddwd         m14, m11, m5 ; a3
    paddd           m12, m14
    pmaddwd         m14, m11, m6 ; b3
    paddd           m13, m14
    psrad           m12, 4
    psrad           m13, 4
    packssdw        m12, m13
    vpermq          m12, m12, q3120
    mova            [r7+dsq*0], xm12
    vextracti128    [r7+dsq*2], m12, 1
    lea             r7, [r7+dsq*4]
    sub             hd, 2
    jg .v_w8_loop
    ; Advance to the next 16-byte column, restore h from the low byte of wd
    ; and decrement the column counter.
    add             srcq, 16
    add             dstq, 16
    movzx           hd, wb
    sub             wd, 1<<8
    jg .v_w8_loop0
    RET
.h:
    ; Horizontal filtering requested; go to .hv if vertical is needed too.
    test            myd, 0xf00
    jnz .hv
    vpbroadcastd    m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
    ; Width 4 is handled by the .h_w4 code of prep_6tap_16bpc.
    cmp             wd, 4
    je mangle(private_prefix %+ _prep_6tap_16bpc_avx2).h_w4
    ; Select the 8-tap index (bits 16 and up of mxd) and step the source
    ; pointer back by 6 bytes.
    shr             mxd, 16
    sub             srcq, 6
    vpbroadcastq    m0, [base+subpel_filters+mxq*8]
    WIN64_SPILL_XMM 12
    vbroadcasti128  m6, [subpel_h_shufA]
    vbroadcasti128  m7, [subpel_h_shufB]
    punpcklbw       m0, m0
    psraw           m0, 8 ; sign-extend
    ; Coefficients are multiplied by 4 unless bit 0x800 of r8m is set.
    test            dword r8m, 0x800
    jnz .h_12bpc
    psllw           m0, 2
.h_12bpc:
    pshufd          m8, m0, q0000
    pshufd          m9, m0, q1111
    pshufd          m10, m0, q2222
    pshufd          m11, m0, q3333
    cmp             wd, 8
    jg .h_w16
.h_w8:
; PREP_8TAP_H: 8-tap horizontal filter. Arguments 1-3 are register numbers
; holding source data at byte offsets +0, +8 and +16; arguments 4-5 are
; scratch registers. Uses the shuffle masks in m6/m7, the coefficient pairs
; in m8-m11 and the rounding constant in m5. The abcd* products accumulate
; into m%1 and the efgh* products into m%2; both are shifted right by 4 and
; packed into words in m%1. m%2, m%3, m%4 and m%5 are clobbered.
%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
    pshufb          m%4, m%1, m7 ; 2 3 3 4 4 5 5 6
    pshufb          m%1, m6 ; 0 1 1 2 2 3 3 4
    pmaddwd         m%5, m9, m%4 ; abcd1
    pmaddwd         m%1, m8 ; abcd0
    pshufb          m%2, m7 ; 6 7 7 8 8 9 9 a
    shufpd          m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd           m%5, m5
    paddd           m%1, m%5
    pmaddwd         m%5, m11, m%2 ; abcd3
    paddd           m%1, m%5
    pmaddwd         m%5, m10, m%4 ; abcd2
    pshufb          m%3, m7 ; a b b c c d d e
    pmaddwd         m%4, m8 ; efgh0
    paddd           m%1, m%5
    pmaddwd         m%5, m9, m%2 ; efgh1
    shufpd          m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd         m%3, m11 ; efgh3
    pmaddwd         m%2, m10 ; efgh2
    paddd           m%4, m5
    paddd           m%4, m%5
    paddd           m%3, m%4
    paddd           m%2, m%3
    psrad           m%1, 4
    psrad           m%2, 4
    packssdw        m%1, m%2
%endmacro
    ; w == 8: two rows per iteration, one row per 128-bit lane.
    movu            xm0, [srcq+ssq*0+ 0]
    vinserti128     m0, [srcq+ssq*1+ 0], 1
    movu            xm2, [srcq+ssq*0+16]
    vinserti128     m2, [srcq+ssq*1+16], 1
    lea             srcq, [srcq+ssq*2]
    shufpd          m1, m0, m2, 0x05
    PREP_8TAP_H 0, 1, 2, 3, 4
    mova            [dstq+dsq*0], xm0
    vextracti128    [dstq+dsq*2], m0, 1
    lea             dstq, [dstq+dsq*4]
    sub             hd, 2
    jg .h_w8
    RET
.h_w16:
    ; w >= 16: wd is doubled and used as the start value of the inner offset
    ; r6, which counts down in 32-byte steps; one row per outer iteration.
    add             wd, wd
.h_w16_loop0:
    mov             r6d, wd
.h_w16_loop:
    movu            m0, [srcq+r6-32]
    movu            m1, [srcq+r6-24]
    movu            m2, [srcq+r6-16]
    PREP_8TAP_H 0, 1, 2, 3, 4
    mova            [dstq+r6-32], m0
    sub             r6d, 32
    jg .h_w16_loop
    add             srcq, ssq
    lea             dstq, [dstq+dsq*2]
    dec             hd
    jg .h_w16_loop0
    RET
.hv:
    ; Combined horizontal + vertical filtering. m15 holds the
    ; prep_8tap_2d_rnd constant for the whole path.
    WIN64_SPILL_XMM 16
    vpbroadcastd    m15, [prep_8tap_2d_rnd]
    cmp             wd, 4
    jg .hv_w8
    ; w == 4: the horizontal filter uses the four bytes at offset +2 of the
    ; table entry selected by the low byte of mxd.
    movzx           mxd, mxb
    vpbroadcastd    m0, [base+subpel_filters+mxq*8+2]
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd
    vpbroadcastq    m1, [base+subpel_filters+myq*8]
    lea             r6, [ssq*3]
    sub             srcq, 2
    ; Horizontal coefficients: each byte is placed in the high half of a word
    ; and shifted right arithmetically by 4 (sign-extended value times 16).
    pxor            m7, m7
    sub             srcq, r6
    punpcklbw       m7, m0
    punpcklbw       m1, m1
    psraw           m7, 4
    psraw           m1, 8
    ; When bit 0x800 of r8m is set the horizontal coefficients are shifted
    ; right by a further 2.
    test            dword r8m, 0x800
    jz .hv_w4_10bit
    psraw           m7, 2
.hv_w4_10bit:
    ; Vertical coefficient pairs in m11..m14.
    pshufd          m11, m1, q0000
    pshufd          m12, m1, q1111
    pshufd          m13, m1, q2222
    pshufd          m14, m1, q3333
.hv_w4:
    ; Prologue: load seven rows (indices in the trailing comments), filter
    ; them horizontally with the pairs in m7/m8 and build the interleaved
    ; row pairs in m1..m3.
    vbroadcasti128  m9, [subpel_h_shufA]
    vbroadcasti128  m10, [subpel_h_shufB]
    pshufd          m8, m7, q1111
    pshufd          m7, m7, q0000
    movu            xm1, [srcq+ssq*0]
    vinserti128     m1, [srcq+ssq*1], 1 ; 0 1
    vbroadcasti128  m0, [srcq+r6 ]
    vinserti128     m2, m0, [srcq+ssq*2], 0 ; 2 3
    lea             srcq, [srcq+ssq*4]
    vinserti128     m0, [srcq+ssq*0], 1 ; 3 4
    movu            xm3, [srcq+ssq*1]
    vinserti128     m3, [srcq+ssq*2], 1 ; 5 6
    add             srcq, r6
    pshufb          m4, m1, m9
    pshufb          m1, m10
    pmaddwd         m4, m7
    pmaddwd         m1, m8
    pshufb          m5, m2, m9
    pshufb          m2, m10
    pmaddwd         m5, m7
    pmaddwd         m2, m8
    paddd           m4, m15
    paddd           m1, m4
    pshufb          m4, m0, m9
    pshufb          m0, m10
    pmaddwd         m4, m7
    pmaddwd         m0, m8
    paddd           m5, m15
    paddd           m2, m5
    pshufb          m5, m3, m9
    pshufb          m3, m10
    pmaddwd         m5, m7
    pmaddwd         m3, m8
    paddd           m4, m15
    paddd           m4, m0
    paddd           m5, m15
    paddd           m5, m3
    ; Rather than shifting and packing, each dword sum is either shifted
    ; right by 6 (result lands in the low word) or left by 10 (the same bits
    ; land in the high word); pblendw 0xaa then merges two rows into one
    ; interleaved register.
    vperm2i128      m0, m1, m2, 0x21
    psrld           m1, 6
    psrld           m2, 6
    vperm2i128      m3, m4, m5, 0x21
    pslld           m4, 10
    pslld           m5, 10
    pblendw         m2, m4, 0xaa ; 23 34
    pslld           m0, 10
    pblendw         m1, m0, 0xaa ; 01 12
    psrld           m3, 6
    pblendw         m3, m5, 0xaa ; 45 56
    ; m0 = sign-extended values of the last filtered rows, consumed by the
    ; packssdw at the top of the loop.
    psrad           m0, m5, 16
.hv_w4_loop:
    ; Two new source rows in, two output rows out per iteration.
    movu            xm4, [srcq+ssq*0]
    vinserti128     m4, [srcq+ssq*1], 1
    lea             srcq, [srcq+ssq*2]
    pmaddwd         m5, m11, m1 ; a0 b0
    mova            m1, m2
    pmaddwd         m2, m12 ; a1 b1
    paddd           m5, m15
    paddd           m5, m2
    mova            m2, m3
    pmaddwd         m3, m13 ; a2 b2
    paddd           m5, m3
    ; Horizontal filter of the two new rows.
    pshufb          m3, m4, m9
    pshufb          m4, m10
    pmaddwd         m3, m7
    pmaddwd         m4, m8
    paddd           m3, m15
    paddd           m4, m3
    psrad           m4, 6
    packssdw        m0, m4 ; _ 7 6 8
    vpermq          m3, m0, q1122 ; _ 6 _ 7
    punpckhwd       m3, m0 ; 67 78
    mova            m0, m4
    pmaddwd         m4, m14, m3 ; a3 b3
    paddd           m4, m5
    psrad           m4, 6
    vextracti128    xm5, m4, 1
    packssdw        xm4, xm5
    movq            [dstq+dsq*0], xm4
    movhps          [dstq+dsq*2], xm4
    lea             dstq, [dstq+dsq*4]
    sub             hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    ; w >= 8: full 8-tap filters in both directions. The vertical
    ; coefficients are sign-extended to words and stored at v_mul (defined
    ; outside this excerpt), then re-broadcast tap by tap inside the loop.
    shr             mxd, 16
    vpbroadcastq    m2, [base+subpel_filters+mxq*8]
    movzx           mxd, myb
    shr             myd, 16
    cmp             hd, 4
    cmove           myd, mxd
    pmovsxbw        xm1, [base+subpel_filters+myq*8]
    shl             wd, 5
    lea             r6, [ssq*3]
    sub             srcq, 6
    sub             srcq, r6
    ; wd = column counter above bit 8, h in the low byte.
    lea             wd, [hq+wq-256]
    pxor            m0, m0
    punpcklbw       m0, m2
    psraw           m0, 4
    test            dword r8m, 0x800
    jz .hv_w8_10bit
    psraw           m0, 2
.hv_w8_10bit:
    ; Horizontal coefficient pairs in m11..m14.
    pshufd          m11, m0, q0000
    pshufd          m12, m0, q1111
    mova            [v_mul], xm1
    pshufd          m13, m0, q2222
    pshufd          m14, m0, q3333
.hv_w8_loop0:
; PREP_8TAP_HV_H: 8-tap horizontal filter for the hv path. Arguments are
; register numbers holding source data at byte offsets +0, +8 and +16.
; Uses the shuffle masks in m8/m9, the coefficient pairs in m11-m14 and the
; rounding constant in m15; m2 and m3 are fixed scratch registers. The result
; is shifted right by 6 and packed into words in m%1; m%2 and m%3 are
; clobbered.
%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16
    pshufb          m2, m%1, m9 ; 2 3 3 4 4 5 5 6
    pshufb          m%1, m8 ; 0 1 1 2 2 3 3 4
    pmaddwd         m3, m12, m2
    pmaddwd         m%1, m11
    pshufb          m%2, m9 ; 6 7 7 8 8 9 9 a
    shufpd          m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8
    paddd           m3, m15
    paddd           m%1, m3
    pmaddwd         m3, m14, m%2
    paddd           m%1, m3
    pmaddwd         m3, m13, m2
    pshufb          m%3, m9 ; a b b c c d d e
    pmaddwd         m2, m11
    paddd           m%1, m3
    pmaddwd         m3, m12, m%2
    shufpd          m%2, m%3, 0x05 ; 8 9 9 a a b b c
    pmaddwd         m%3, m14
    pmaddwd         m%2, m13
    paddd           m2, m15
    paddd           m2, m3
    paddd           m2, m%3
    paddd           m2, m%2
    psrad           m%1, 6
    psrad           m2, 6
    packssdw        m%1, m2
%endmacro
    ; Per-column prologue: horizontally filter the first seven rows. Row 3 is
    ; processed alone with 128-bit registers (INIT_XMM); the other rows are
    ; processed two at a time, one per lane (indices in the trailing
    ; comments).
    movu            xm4, [srcq+r6 + 0]
    vbroadcasti128  m8, [subpel_h_shufA]
    lea             r8, [srcq+ssq*4]
    movu            xm6, [srcq+r6 + 8]
    vbroadcasti128  m9, [subpel_h_shufB]
    mov             r7, dstq
    movu            xm0, [srcq+r6 +16]
    movu            xm5, [srcq+ssq*0+ 0]
    vinserti128     m5, [r8 +ssq*0+ 0], 1
    movu            xm1, [srcq+ssq*0+16]
    vinserti128     m1, [r8 +ssq*0+16], 1
    shufpd          m7, m5, m1, 0x05
    INIT_XMM avx2
    PREP_8TAP_HV_H 4, 6, 0 ; 3
    INIT_YMM avx2
    PREP_8TAP_HV_H 5, 7, 1 ; 0 4
    movu            xm0, [srcq+ssq*2+ 0]
    vinserti128     m0, [srcq+r6 *2+ 0], 1
    movu            xm1, [srcq+ssq*2+16]
    vinserti128     m1, [srcq+r6 *2+16], 1
    shufpd          m7, m0, m1, 0x05
    PREP_8TAP_HV_H 0, 7, 1 ; 2 6
    movu            xm6, [srcq+ssq*1+ 0]
    movu            xm1, [srcq+ssq*1+16]
    vinserti128     m6, [r8 +ssq*1+ 0], 1
    vinserti128     m1, [r8 +ssq*1+16], 1
    add             r8, r6
    shufpd          m7, m6, m1, 0x05
    PREP_8TAP_HV_H 6, 7, 1 ; 1 5
    vpermq          m4, m4, q1100
    vpermq          m5, m5, q3120
    vpermq          m6, m6, q3120
    vpermq          m7, m0, q3120
    ; Interleave into row pairs for the vertical pmaddwd taps. m0 still
    ; holds rows 2/6 and is consumed at the top of the loop.
    punpcklwd       m3, m7, m4 ; 23
    punpckhwd       m4, m5 ; 34
    punpcklwd       m1, m5, m6 ; 01
    punpckhwd       m5, m6 ; 45
    punpcklwd       m2, m6, m7 ; 12
    punpckhwd       m6, m7 ; 56
.hv_w8_loop:
    ; Vertical taps 0..2 on the row pairs already available; m8 accumulates
    ; output row a, m9 output row b.
    vpbroadcastd    m9, [v_mul+4*0]
    vpbroadcastd    m7, [v_mul+4*1]
    vpbroadcastd    m10, [v_mul+4*2]
    pmaddwd         m8, m9, m1 ; a0
    pmaddwd         m9, m2 ; b0
    mova            m1, m3
    mova            m2, m4
    pmaddwd         m3, m7 ; a1
    pmaddwd         m4, m7 ; b1
    paddd           m8, m15
    paddd           m9, m15
    paddd           m8, m3
    paddd           m9, m4
    mova            m3, m5
    mova            m4, m6
    pmaddwd         m5, m10 ; a2
    pmaddwd         m6, m10 ; b2
    paddd           m8, m5
    paddd           m9, m6
    ; Inline horizontal 8-tap filter of the two new rows. The shuffle masks
    ; are reloaded into m7/m10 because m8/m9 are in use as accumulators here.
    movu            xm5, [r8+ssq*0]
    vinserti128     m5, [r8+ssq*1], 1
    vbroadcasti128  m7, [subpel_h_shufA]
    vbroadcasti128  m10, [subpel_h_shufB]
    movu            xm6, [r8+ssq*0+16]
    vinserti128     m6, [r8+ssq*1+16], 1
    ; Spill the upper lane of m0 to the destination row at [r7]; it is read
    ; back below (vbroadcasti128 m6, [r7]) before the final store to
    ; [r7+dsq*0] overwrites it.
    vextracti128    [r7], m0, 1
    pshufb          m0, m5, m7 ; 01
    pshufb          m5, m10 ; 23
    pmaddwd         m0, m11
    pmaddwd         m5, m12
    paddd           m0, m15
    paddd           m0, m5
    pshufb          m5, m6, m7 ; 89
    pshufb          m6, m10 ; ab
    pmaddwd         m5, m13
    pmaddwd         m6, m14
    paddd           m5, m15
    paddd           m6, m5
    movu            xm5, [r8+ssq*0+8]
    vinserti128     m5, [r8+ssq*1+8], 1
    lea             r8, [r8+ssq*2]
    pshufb          m7, m5, m7
    pshufb          m5, m10
    pmaddwd         m10, m13, m7
    pmaddwd         m7, m11
    paddd           m0, m10
    paddd           m6, m7
    pmaddwd         m7, m14, m5
    pmaddwd         m5, m12
    paddd           m0, m7
    paddd           m5, m6
    vbroadcasti128  m6, [r7]
    vpbroadcastd    m10, [v_mul+4*3]
    psrad           m0, 6
    psrad           m5, 6
    packssdw        m0, m5
    vpermq          m7, m0, q3120 ; 7 8
    shufpd          m6, m7, 0x04 ; 6 7
    punpcklwd       m5, m6, m7 ; 67
    punpckhwd       m6, m7 ; 78
    ; Final vertical tap, then shift, pack and store two output rows.
    pmaddwd         m7, m10, m5 ; a3
    pmaddwd         m10, m6 ; b3
    paddd           m7, m8
    paddd           m9, m10
    psrad           m7, 6
    psrad           m9, 6
    packssdw        m7, m9
    vpermq          m7, m7, q3120
    mova            [r7+dsq*0], xm7
    vextracti128    [r7+dsq*2], m7, 1
    lea             r7, [r7+dsq*4]
    sub             hd, 2
    jg .hv_w8_loop
    ; Next 16-byte column; restore h from the low byte of wd.
    add             srcq, 16
    add             dstq, 16
    movzx           hd, wb
    sub             wd, 1<<8
    jg .hv_w8_loop0
    RET

; movifprep dst, src: emits `mov dst, src` only when isprep is non-zero.
%macro movifprep 2
%if isprep
    mov             %1, %2
%endif
%endmacro

; REMAP_REG a, b: redefines the names r<a>, r<a>q and r<a>d as aliases for
; r<b>, r<b>q and r<b>d.
%macro REMAP_REG 2
%xdefine r%1 r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%endmacro

; When isprep is set: saves the current meaning of r14 in r14_save, then for
; i = 14 down to 1 remaps r<i> to r<i-1>, so every register name refers to
; the register numbered one lower.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
%xdefine r14_save r14
%assign %%i 14
%rep 14
%assign %%j %%i-1
    REMAP_REG %%i, %%j
%assign %%i %%i-1
%endrep
%endif
%endmacro

; Inverse of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV. When isprep is set: for
; i = 1 up to 13 remaps r<i> to r<i+1>, then restores r14 from r14_save and
; removes the r14_save definition.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
%if isprep
%assign %%i 1
%rep 13
%assign %%j %%i+1
    REMAP_REG %%i, %%j
%assign %%i %%i+1
%endrep
%xdefine r14 r14_save
%undef r14_save
%endif
%endmacro

; Emits RET with the default register mapping in effect. If the optional
; argument (default 1) is non-zero, the shifted mapping is re-applied
; afterwards for the code that follows.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
%if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%endif
%endmacro

; MC_8TAP_SCALED_H: horizontal filtering of two consecutive source rows at
; eight independent column offsets. The offsets are taken from r4, r6, r7,
; r9 (low lanes) and r10, r11, r13, rX (high lanes), each scaled by 2 in the
; address. The loaded data is multiplied by the filters in m12..m15, summed
; with phaddd, offset by m10, shifted right by xm11 and packed into words in
; m%1. srcq is advanced by two rows. Arguments 2-8 are scratch registers. If
; the optional ninth argument is non-zero, m10 is first reloaded from
; [rsp+0x00].
%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd
    movu            xm%1, [srcq+ r4*2]
    movu            xm%2, [srcq+ r6*2]
    movu            xm%3, [srcq+ r7*2]
    movu            xm%4, [srcq+ r9*2]
    vinserti128     m%1, [srcq+r10*2], 1
    vinserti128     m%2, [srcq+r11*2], 1
    vinserti128     m%3, [srcq+r13*2], 1
    vinserti128     m%4, [srcq+ rX*2], 1
    add             srcq, ssq
    movu            xm%5, [srcq+ r4*2]
    movu            xm%6, [srcq+ r6*2]
    movu            xm%7, [srcq+ r7*2]
    movu            xm%8, [srcq+ r9*2]
    vinserti128     m%5, [srcq+r10*2], 1
    vinserti128     m%6, [srcq+r11*2], 1
    vinserti128     m%7, [srcq+r13*2], 1
    vinserti128     m%8, [srcq+ rX*2], 1
    add             srcq, ssq
    pmaddwd         m%1, m12
    pmaddwd         m%2, m13
    pmaddwd         m%3, m14
    pmaddwd         m%4, m15
    pmaddwd         m%5, m12
    pmaddwd         m%6, m13
    pmaddwd         m%7, m14
    pmaddwd         m%8, m15
    phaddd          m%1, m%2
%if %9
    mova            m10, [rsp+0x00]
%endif
    phaddd          m%3, m%4
    phaddd          m%5, m%6
    phaddd          m%7, m%8
    phaddd          m%1, m%3
    phaddd          m%5, m%7
    paddd           m%1, m10
    paddd           m%5, m10
    psrad           m%1, xm11
    psrad           m%5, xm11
    packssdw        m%1, m%5
%endmacro

; Opening lines of the MC_8TAP_SCALED macro; its body continues past this
; point. The argument selects the variant: `put` sets isput=1/isprep=0 and
; declares put_8tap_scaled_16bpc, anything else sets isput=0/isprep=1 and
; declares prep_8tap_scaled_16bpc. base_reg holds the function's own address
; so that `base+symbol` addresses tables relative to it.
%macro MC_8TAP_SCALED 1
%ifidn %1, put
%assign isput 1
%assign isprep 0
cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%xdefine base_reg r12
    mov             r7d, pxmaxm
%else
%assign isput 0
%assign isprep 1
cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
%define tmp_stridem qword [rsp+0xd0]
%xdefine base_reg r11
%endif
    lea             base_reg, [%1_8tap_scaled_16bpc_avx2]
%define base base_reg-%1_8tap_scaled_16bpc_avx2
    tzcnt           wd, wm
vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm10, mxd vpbroadcastd m10, xm10 mov r5d, t0d DECLARE_REG_TMP 5, 7 mov r6d, pxmaxm %else vpbroadcastd m10, mxm %if isput vpbroadcastw m11, pxmaxm %else mov r6d, pxmaxm %endif %endif mov dyd, dym %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+0x98] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x98] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif shr r7d, 11 vpbroadcastd m6, [base+pd_0x3ff] vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] movd xm7, [base+s_8tap_h_sh+r7*4] %if isput vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 %else vpbroadcastd m13, [base+pd_m524256] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg jmp wq %if isput .w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0,1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 
vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 lea srcq, [srcq+ssq*4] REPX {pshufb x, m10}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm14, r6q pmovsxbw xm14, xm14 pshufd xm8, xm14, q0000 pshufd xm9, xm14, q1111 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pshufd xm8, xm14, q2222 pshufd xm14, xm14, q3333 paddd xm5, xm6 pmaddwd xm6, xm2, xm8 pmaddwd xm8, xm4, xm14 psrldq xm9, xm7, 8 paddd xm5, xm6 paddd xm5, xm13 paddd xm5, xm8 psrad xm5, xm9 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq], xm5 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movu xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm10 pmaddwd xm5, xm15 phaddd xm5, xm5 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movu xm6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm10 pshufb xm6, xm10 pmaddwd xm5, xm15 pmaddwd xm6, xm15 phaddd xm5, xm6 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym mova [rsp+0x00], m12 %if isput mova [rsp+0x20], xm13 %else SWAP m11, m13 %endif mova [rsp+0x30], xm7 
vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m0, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] movu xm1, [srcq+r4 ] movu xm3, [srcq+r6 ] movu xm2, [srcq+r11 ] movu xm4, [srcq+r13 ] lea srcq, [srcq+ssq*4] vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 vinserti128 m1, [srcq+r4 ], 1 vinserti128 m3, [srcq+r6 ], 1 vinserti128 m2, [srcq+r11 ], 1 vinserti128 m4, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m0 paddb m13, m0 REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m14}, m7, m9, m8, m10 REPX {pshufb x, m13}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x30] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 4 5 packssdw m8, m10 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 
shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm10, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm10 ; 67 mova [rsp+0x40], xm7 mova [rsp+0x50], xm8 mova [rsp+0x60], xm9 .w4_loop: and myd, 0x3ff mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq xm9, r11q pmovsxbw xm9, xm9 pshufd xm7, xm9, q0000 pshufd xm8, xm9, q1111 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pshufd xm7, xm9, q2222 pshufd xm9, xm9, q3333 pmaddwd xm6, xm2, xm7 pmaddwd xm8, xm3, xm9 %if isput mova xm7, [rsp+0x20] movd xm9, [rsp+0x38] %else SWAP m7, m11 %endif paddd xm4, xm5 paddd xm6, xm8 paddd xm4, xm6 paddd xm4, xm7 %if isput psrad xm4, xm9 packusdw xm4, xm4 pminuw xm4, xm11 movq [dstq], xm4 add dstq, dsq %else SWAP m11, m7 psrad xm4, 6 packssdw xm4, xm4 movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop mova xm8, [rsp+0x00] movd xm9, [rsp+0x30] movu xm4, [srcq] movu xm5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x40] mova [rsp+0x40], xm1 mova xm1, [rsp+0x50] mova [rsp+0x50], xm2 mova xm2, [rsp+0x60] mova [rsp+0x60], xm3 pshufb xm4, xm12 pshufb xm5, xm13 pmaddwd xm4, xm14 pmaddwd xm5, xm15 phaddd xm4, xm5 paddd xm4, xm8 psrad xm4, xm9 packssdw xm4, xm4 punpcklwd xm3, xm10, xm4 mova xm10, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm6, [srcq+ssq*1] movu xm7, [srcq+r6] movu m0, [rsp+0x50] pshufb xm4, xm12 pshufb xm6, xm12 pshufb xm5, xm13 pshufb xm7, xm13 pmaddwd xm4, xm14 pmaddwd xm6, xm14 pmaddwd xm5, xm15 pmaddwd xm7, xm15 mova [rsp+0x40], m0 phaddd xm4, xm5 phaddd xm6, xm7 paddd xm4, xm8 paddd xm6, xm8 psrad xm4, xm9 psrad xm6, xm9 packssdw xm4, xm6 punpcklwd xm9, xm10, xm4 mova [rsp+0x60], xm9 psrldq xm10, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm10 lea srcq, [srcq+ssq*2] jmp .w4_loop SWAP m10, m13 %if 
isprep SWAP m13, m11 %endif .w8: mov dword [rsp+0x80], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x80], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x80], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x80], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x80], 16 movifprep tmp_stridem, 256 .w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free %if isput movifnidn dsm, dsq mova [rsp+0xb0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 shr t0d, 16 sub srcq, 6 pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0x84], t0d mov [rsp+0x88], srcq mov [rsp+0x90], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+0x80] jz .ret add qword [rsp+0x90], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x40] vpbroadcastd m15, [rsp+0x84] pxor m9, m9 mov srcq, [rsp+0x88] mov r0q, [rsp+0x90] ; dstq / tmpq .hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x40], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq [rsp+0xa0], xm1 movq [rsp+0xa8], xm7 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, 
xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x60], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x60] vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, mym mov dyd, dym pshufb m0, m9 ; 01a 01b pshufb m1, m9 ; 23a 23b pshufb m2, m9 ; 45a 45b pshufb m3, m9 ; 67a 67b .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm9, r6q punpcklqdq xm9, xm9 pmovsxbw m9, xm9 pshufd m8, m9, q0000 pshufd m7, m9, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m7 pshufd m8, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m9 %if isput psrldq xm8, xm11, 8 %endif paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, xm8 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xb0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x60], myd mov r4d, [rsp+0xa0] mov r6d, [rsp+0xa4] mov r7d, [rsp+0xa8] mov r9d, [rsp+0xac] jz .skip_line vbroadcasti128 m9, [base+wswap] movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq mov myd, [rsp+0x60] mov dyd, dym pshufb m0, m9 pshufb m1, m9 pshufb m2, m9 pshufb m3, m9 pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, m10 psrad m4, xm11 pslld m4, 16 
pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, [rsp+0x60] mov dyd, dym pshufb m3, m9 jmp .vloop SWAP m1, m12, m10 SWAP m7, m11 .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*1], 1 vinserti128 m2, [srcq+ssq*2], 1 add srcq, ss3q movq xm6, r4q pmovsxbw xm6, xm6 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 REPX {pshufb x, m10}, m0, m1, m2 pshufb xm3, xm10 REPX {pmaddwd x, m15}, m0, m1, m2 pmaddwd xm3, xm15 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movu xm1, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm1, xm10 pshufb xm5, xm10 pmaddwd xm1, xm15 pmaddwd xm5, xm15 phaddd xm1, xm5 pmaddwd xm5, xm3, xm8 mova xm3, xm0 
pmaddwd xm0, xm9 paddd xm1, xm12 psrad xm1, xm7 packssdw xm1, xm1 paddd xm5, xm0 mova xm0, xm2 pmaddwd xm2, xm14 paddd xm5, xm2 palignr xm2, xm1, xm4, 12 punpcklwd xm2, xm1 ; 67 78 pmaddwd xm4, xm2, xm6 paddd xm5, xm13 paddd xm5, xm4 mova xm4, xm1 psrldq xm1, xm7, 8 psrad xm5, xm1 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*2] lea r11, [r4+ssq*1] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*2] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] movu xm3, [srcq+ssq*2] ; 6 _ movu xm10, [srcq+r6 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r11 ], 1 lea srcq, [srcq+ss3q ] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb 
m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pmaddwd m0, m14 pmaddwd m1, m14 pshufb m7, m13 pshufb m8, m13 pmaddwd m7, m15 pmaddwd m8, m15 pshufb m2, m12 pshufb xm3, xm12 pmaddwd m2, m14 pmaddwd xm3, xm14 pshufb m9, m13 pshufb xm10, xm13 pmaddwd m9, m15 pmaddwd xm10, xm15 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 phaddd xm3, xm10 paddd m0, m5 paddd m1, m5 paddd m2, m5 paddd xm3, xm5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 psrad xm3, xm6 vperm2i128 m4, m0, m1, 0x21 ; 1 2 vperm2i128 m5, m1, m2, 0x21 ; 3 4 vperm2i128 m6, m2, m3, 0x21 ; 5 6 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pslld m4, 16 pslld m5, 16 pslld m6, 16 pblendw m0, m4, 0xaa ; 01 12 pblendw m1, m5, 0xaa ; 23 34 pblendw m2, m6, 0xaa ; 45 56 movq xm10, r13q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 .dy1_w4_loop: movu xm11, [srcq+ssq*0] movu xm6, [srcq+r4 ] vinserti128 m11, [srcq+ssq*1], 1 vinserti128 m6, [srcq+r11 ], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufb m11, m12 pshufb m6, m13 pmaddwd m11, m14 pmaddwd m6, m15 paddd m4, [rsp+0x20] phaddd m11, m6 pmaddwd m6, m2, m9 paddd m11, [rsp+0x00] psrad m11, [rsp+0x40] mova m0, m1 mova m1, m2 paddd m5, m6 paddd m4, m5 vinserti128 m2, m3, xm11, 1 pslld m3, m11, 16 pblendw m2, m3, 0xaa ; 67 78 pmaddwd m5, m2, m10 vextracti128 xm3, m11, 1 paddd m4, m5 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy1_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0xa0], 4 movifprep 
tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy1_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput %define dsm [rsp+0xb8] movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else %if UNIX64 %define hm [rsp+0xb8] %endif %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy1_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, 
r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep vbroadcasti128 m7, [base+wswap] pshufb m0, m7 pshufb m1, m7 pshufb m2, m7 pshufb m3, m7 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, [rsp+0x00] psrad m4, [rsp+0x40] pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop SWAP m1, m12, m10 SWAP m7, m11 .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d 
punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*2] movu xm2, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m0, m10 pshufb m1, m10 pshufb m2, m10 pmaddwd m0, m15 pmaddwd m1, m15 pmaddwd m2, m15 movq xm6, r4q pmovsxbw xm6, xm6 phaddd m0, m1 phaddd m1, m2 paddd m0, m12 paddd m1, m12 psrad m0, xm7 psrad m1, xm7 packssdw m0, m1 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m0, 1 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 punpcklwd xm2, xm0, xm1 ; 01 23 punpckhwd xm1, xm0, xm1 ; 23 45 .dy2_w2_loop: movu xm3, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 lea srcq, [srcq+ssq*4] pmaddwd xm4, xm2, xm8 pmaddwd xm1, xm9 pshufb m3, m10 pshufb m5, m10 pmaddwd m3, m15 pmaddwd m5, m15 phaddd m3, m5 paddd xm4, xm1 paddd m3, m12 psrad m3, xm7 packssdw m3, m3 pshufd m3, m3, q2100 palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 vextracti128 xm1, m0, 1 punpcklwd xm2, xm0, xm1 ; 45 67 punpckhwd xm1, xm0, xm1 ; 67 89 pmaddwd xm3, xm2, xm14 pmaddwd xm5, xm1, xm6 paddd xm4, xm13 paddd xm4, xm3 psrldq xm3, xm7, 8 paddd xm4, xm5 psrad xm4, xm3 packusdw xm4, xm4 pminsw xm4, xm11 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop 
RET %endif .dy2_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pcmpeqd m6, m9 punpckldq m11, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 movq xm10, r13q pblendvb m14, m2, m11 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*1] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r6 ], 1 lea srcq, [srcq+ssq*2] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pshufb m2, m12 pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pshufb m7, m13 pshufb m8, m13 pshufb m9, m13 pmaddwd m7, m15 pmaddwd m8, m15 pmaddwd m9, m15 punpcklqdq xm10, xm10 pmovsxbw m10, xm10 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 paddd m0, m5 paddd m1, m5 paddd m2, m5 psrad 
m0, xm6 psrad m1, xm6 psrad m2, xm6 vperm2i128 m3, m0, m2, 0x21 ; 2 4 vperm2i128 m2, m1, 0x13 ; 3 5 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 packssdw m0, m3 ; 0 2 2 4 packssdw m1, m2 ; 1 3 3 5 punpckhwd m2, m0, m1 ; 23 45 punpcklwd m0, m1 ; 01 23 .dy2_w4_loop: movu xm1, [srcq+ssq*0] movu xm6, [srcq+r4 ] movu xm3, [srcq+ssq*1] movu xm11, [srcq+r6 ] vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 vinserti128 m6, [srcq+r11 ], 1 vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 vinserti128 m11, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m7 pmaddwd m5, m2, m8 pshufb m1, m12 pshufb m3, m12 pmaddwd m1, m14 pmaddwd m3, m14 mova m0, [rsp+0x00] pshufb m6, m13 pshufb m11, m13 pmaddwd m6, m15 pmaddwd m11, m15 paddd m4, m5 movd xm5, [rsp+0x40] phaddd m1, m6 phaddd m3, m11 paddd m1, m0 paddd m3, m0 psrad m1, xm5 psrad m3, xm5 pslld m3, 16 pblendw m1, m3, 0xaa ; 67 89 vperm2i128 m0, m2, m1, 0x21 ; 45 67 paddd m4, [rsp+0x20] mova m2, m1 pmaddwd m5, m0, m9 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy2_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy2_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput movifnidn dsm, dsq mova [rsp+0xc0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov 
r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy2_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 
6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movu xm3, [srcq+ r4*2] movu xm4, [srcq+ r6*2] movu xm5, [srcq+ r7*2] movu xm6, [srcq+ r9*2] vinserti128 m3, [srcq+r10*2], 1 vinserti128 m4, [srcq+r11*2], 1 vinserti128 m5, [srcq+r13*2], 1 vinserti128 m6, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m3, m12 pmaddwd m4, m13 pmaddwd m5, m14 pmaddwd m6, m15 phaddd m3, m4 phaddd m5, m6 phaddd m3, m5 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 mova m5, [rsp+0x00] movd xm7, [rsp+0x40] phaddd m4, m6 paddd m3, m5 paddd m4, m5 psrad m3, xm7 psrad m4, xm7 pslld m4, 16 pblendw m3, m4, 0xaa jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, put_8tap_scaled_16bpc 
PUT_8TAP_SCALED_FN smooth, SMOOTH, put_8tap_scaled_16bpc PUT_8TAP_SCALED_FN regular, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, prep_8tap_scaled_16bpc PREP_8TAP_SCALED_FN smooth, SMOOTH, prep_8tap_scaled_16bpc PREP_8TAP_SCALED_FN regular, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 01, 23, 45, 67 lea tmp1d, [myq+gammaq*4] lea tmp2d, [myq+gammaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+gammaq*4] lea myd, [tmp2q+gammaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+gammaq*4] lea tmp2d, [myq+gammaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+gammaq*4] lea myd, [tmp2q+deltaq] ; my += delta punpcklwd m8, m0 shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m9, m%2 mova m%2, m%3 paddd m0, m8 mova m%3, m%4 mova m%4, m%5 paddd m%1, m0, m9 %endmacro cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts mov r6d, r7m lea r9, [$$] shr r6d, 11 vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [warp8x8t_rnd] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m7, m14 paddd m0, m14 psrad m7, 15 psrad m0, 15 packssdw m7, m0 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 
[tmpq+tsq*2], m7, 1 dec r4d jg .loop .end: RET cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ alpha, beta, filter, tmp1, gamma, \ my, delta mov r6d, r7m lea filterq, [$$] shr r6d, 11 vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] vpbroadcastw m15, r7m ; pixel_max call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m7, 16 psrad m0, 16 packusdw m7, m0 pmulhrsw m7, m14 pminsw m7, m15 vpermq m7, m7, q3120 movu [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) due to call %if WIN64 mov abcdq, r5m mov mxd, r6m %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] vpbroadcastd m12, [pd_32768] pxor m11, m11 add filterq, mc_warp_filter-$$ lea tmp1q, [ssq*3] add mxd, 512+(3*64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h pblendw m1, m0, 0xaa ; 01 psrld m2, m0, 16 call .h pblendw m2, m0, 0xaa ; 12 psrld m3, m0, 16 call .h pblendw m3, m0, 0xaa ; 23 psrld m4, m0, 16 call .h pblendw m4, m0, 0xaa ; 34 psrld m5, m0, 16 call .h pblendw m5, m0, 0xaa ; 45 psrld m6, m0, 16 call .h pblendw m6, m0, 0xaa ; 56 movsx gammad, word [abcdq+2*2] movsx deltad, word [abcdq+2*3] add myd, 512+(3*64<<10) mov r4d, 4 lea tmp1d, [gammaq*3] sub deltad, tmp1d ; delta -= gamma*3 .main2: call .h psrld m7, m6, 16 pblendw m7, m0, 0xaa ; 67 WARP_V 7, 1, 3, 5, 7 call .h psrld m10, m5, 16 pblendw m10, m0, 0xaa ; 78 WARP_V 0, 2, 4, 6, 10 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] movu xm10, [srcq-6] vinserti128 m10, [srcq+2], 1 shr mxd, 10 ; 0 shr tmp1d, 10 ; 4 movq xm0, [filterq+mxq *8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] movu xm8, [srcq-4] vinserti128 m8, [srcq+4], 1 shr tmp2d, 10 ; 1 shr tmp1d, 10 
; 5 movq xm9, [filterq+tmp2q*8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 ; 2 shr tmp1d, 10 ; 6 punpcklbw m0, m11, m0 pmaddwd m0, m10 movu xm10, [srcq-2] vinserti128 m10, [srcq+6], 1 punpcklbw m9, m11, m9 pmaddwd m9, m8 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta phaddd m0, m9 ; 0 1 4 5 movu xm9, [srcq+0] vinserti128 m9, [srcq+8], 1 shr tmp2d, 10 ; 3 shr tmp1d, 10 ; 7 punpcklbw m8, m11, m8 pmaddwd m8, m10 movq xm10, [filterq+tmp2q*8] vinserti128 m10, [filterq+tmp1q*8], 1 punpcklbw m10, m11, m10 pmaddwd m9, m10 add srcq, ssq phaddd m8, m9 ; 2 3 6 7 phaddd m0, m8 ; 0 1 2 3 4 5 6 7 vpsllvd m0, m13 paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*4] .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 jg .w4_loop .ret: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 movu [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: movu [dstq+32*0], m0 movu [dstq+32*1], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 dec hd jg .w64_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, 
stride3 %define base r6-avg_avx2_table lea r6, [avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m2, [base+bidir_rnd+t0*4] vpbroadcastd m3, [base+bidir_mul+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+32*0] paddsw m0, [tmp2q+32*0] mova m1, [tmp1q+32*1] paddsw m1, [tmp2q+32*1] add tmp1q, 32*2 add tmp2q, 32*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 pmulhw m0, m3 pmulhw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [w_avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; weight vpbroadcastw m6, r7m ; pixel_max vpbroadcastd m5, [r6-w_avg_avx2_table+pd_65538] movsxd wq, [r6+wq*4] paddw m5, m6 add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0w, r6w ; 16-weight, weight pslld m5, 7 rorx r6d, t0d, 30 ; << 2 and r6d, 0xfffcfffc test dword r7m, 0x800 cmovz r6d, t0d movifnidn hd, hm movd xm4, r6d vpbroadcastd m4, xm4 BIDIR_FN ALIGN function_align .main: mova m2, [tmp1q+32*0] mova m0, [tmp2q+32*0] punpckhwd m3, m0, m2 punpcklwd m0, m2 mova m2, [tmp1q+32*1] mova m1, [tmp2q+32*1] add tmp1q, 32*2 add tmp2q, 32*2 pmaddwd m3, m4 pmaddwd m0, m4 paddd m3, m5 paddd m0, m5 psrad m3, 8 psrad m0, 8 packusdw m0, m3 punpckhwd m3, m1, m2 punpcklwd m1, m2 pmaddwd m3, m4 pmaddwd m1, m4 paddd m3, m5 paddd m1, m5 psrad m3, 8 psrad m1, 8 packusdw m1, m3 pminsw m0, m6 pminsw m1, m6 ret cglobal mask_16bpc, 4, 8, 9, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx2_table lea r7, [mask_avx2_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_64] vpbroadcastd m7, [base+bidir_rnd+r6*4] vpbroadcastd m8, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: pmovzxbw m4, [maskq+16*0] mova m0, [tmp1q+32*0] mova m1, [tmp2q+32*0] punpckhwd m5, m0, m1 punpcklwd m0, m1 psubw m3, m6, m4 punpckhwd m1, m4, m3 ; m, 64-m punpcklwd m4, m3 pmovzxbw m3, 
[maskq+16*1] pmaddwd m5, m1 ; tmp1 * m + tmp2 * (64-m) mova m1, [tmp1q+32*1] pmaddwd m0, m4 mova m4, [tmp2q+32*1] add maskq, 16*2 punpckhwd m2, m1, m4 add tmp1q, 32*2 punpcklwd m1, m4 add tmp2q, 32*2 psrad m5, 5 psrad m0, 5 packssdw m0, m5 psubw m5, m6, m3 punpckhwd m4, m3, m5 punpcklwd m3, m5 pmaddwd m2, m4 pmaddwd m1, m3 psrad m2, 5 psrad m1, 5 packssdw m1, m2 pmaxsw m0, m7 pmaxsw m1, m7 psubsw m0, m7 psubsw m1, m7 pmulhw m0, m8 pmulhw m1, m8 ret cglobal w_mask_420_16bpc, 4, 9, 16, dst, stride, tmp1, tmp2, w, h, mask, mstr, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r9m ; pixel_max movd xm0, r8m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] movd xm14, [base+pw_2] lea stride3q, [strideq*3] mov maskq, maskmp psubw xm14, xm0 vpbroadcastw m14, xm14 add wq, r7 mov mstrq, mstrmp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w4: phaddd m4, m5 paddw m4, m14 psrlw m4, 2 packuswb m4, m4 vextracti128 xm5, m4, 1 punpcklwd xm4, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 mova [maskq], xm4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 sub hd, 16 jg .w4_loop .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8: 
vperm2i128 m6, m4, m5, 0x21 vpblendd m4, m5, 0xf0 paddw m4, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 movu [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 mova [maskq], xm4 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] movu [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 movu [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16: punpcklqdq m6, m4, m5 punpckhqdq m4, m5 paddw m6, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 pshufd xm4, xm4, q3120 movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 movu [dstq+strideq*2], m2 movu [dstq+stride3q ], m3 mova [maskq], xm4 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m4, m14 paddw m4, m5 psrlw m15, m4, 2 movu [dstq+strideq*0+32*0], m0 movu [dstq+strideq*0+32*1], m1 movu [dstq+strideq*1+32*0], m2 movu [dstq+strideq*1+32*1], m3 call .main mova m6, [deint_shuf] paddw m4, m14 paddw m4, m5 psrlw m4, 2 packuswb m15, m4 vpermd m4, m6, m15 movu [dstq+strideq*2+32*0], m0 movu [dstq+strideq*2+32*1], m1 movu [dstq+stride3q +32*0], m2 movu [dstq+stride3q +32*1], m3 mova [maskq], m4 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, mstrq .w64: paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [maskq], m4 ; no available registers call .main paddw m4, [maskq] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 ; 0 2 4 6 1 3 5 7 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq], m4 sub hd, 2 jg .w64_loop RET ALIGN function_align .main: %macro W_MASK 2-6 11, 12, 13 ; 
dst/src1, mask/src2, pw_64, rnd, mul mova m%1, [tmp1q+32*%1] mova m%2, [tmp2q+32*%1] punpcklwd m8, m%2, m%1 punpckhwd m9, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m7, m10, m%1 psrlw m7, 10 ; 64-m psubw m%2, m%3, m7 ; m punpcklwd m%1, m7, m%2 punpckhwd m7, m%2 pmaddwd m%1, m8 pmaddwd m7, m9 psrad m%1, 5 psrad m7, 5 packssdw m%1, m7 pmaxsw m%1, m%4 psubsw m%1, m%4 pmulhw m%1, m%5 %endmacro W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal w_mask_422_16bpc, 4, 9, 16, dst, stride, tmp1, tmp2, w, h, mask, mstr, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r9m ; pixel_max vpbroadcastb m14, r8m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] mova m15, [base+deint_shuf] mov maskq, maskmp add wq, r7 mov mstrq, mstrmp mov r8d, 32 cmp mstrq, r8 cmovb mstrq, r8 lea stride3q, [strideq*3] call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*4] .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 sub hd, 16 jg .w4_loop .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 movu [dstq+strideq*2], xm1 vextracti128 
[dstq+stride3q ], m1, 1 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] movu [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 movu [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 movu [dstq+strideq*2], m2 movu [dstq+stride3q ], m3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: movu [dstq+strideq*0+32*0], m0 movu [dstq+strideq*0+32*1], m1 movu [dstq+strideq*1+32*0], m2 movu [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64: mov mstrq, mstrmp jmp .w64_inner .w64_loop: call .main add dstq, strideq .w64_inner: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET ALIGN function_align .main: W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 packuswb m4, m5 pxor m5, m5 psubb m4, m14 pavgb m4, m5 vpermd m4, m15, m4 mova [maskq], m4 add maskq, mstrq ret cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm mov r6d, r9m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m4, [base+pw_64] vpbroadcastd m5, [base+bidir_rnd+r6*4] vpbroadcastd m6, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*4] .w4: vextracti128 xm2, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 sub hd, 8 jl .w4_end vextracti128 xm2, m1, 1 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 jg .w4_loop .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: movu [dstq+strideq*0], xm0 
vextracti128 [dstq+strideq*1], m0, 1 movu [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: movu [dstq+strideq*0], m0 movu [dstq+strideq*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: movu [dstq+32*0], m0 movu [dstq+32*1], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 dec hd jg .w64_loop RET ALIGN function_align .main: W_MASK 0, 2, 4, 5, 6 W_MASK 1, 3, 4, 5, 6 packuswb m2, m3 vpermq m2, m2, q3120 add tmp1q, 32*2 add tmp2q, 32*2 mova [maskq], m2 add maskq, 32 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). ; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw m3, [maskq] movq xm0, [dstq+dsq*0] movhps xm0, [dstq+dsq*1] vpbroadcastq m1, [dstq+dsq*2] vpbroadcastq m2, [dstq+r6 ] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psubw m1, m0, [tmpq] add maskq, 16 add tmpq, 32 pmullw m3, m6 pmulhrsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] movu xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 movu xm1, [dstq+dsq*2] vinserti128 m1, [dstq+r6 ], 1 psubw m2, m0, [tmpq+32*0] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 
paddw m0, m2 paddw m1, m3 movu [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 movu [dstq+dsq*2], xm1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] movu m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] movu m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET .w32: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] movu m0, [dstq+32*0] psubw m2, m0, [tmpq+32*0] movu m1, [dstq+32*1] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 movu [dstq+32*0], m0 movu [dstq+32*1], m1 add dstq, dsq dec hd jg .w32 RET .w64: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+32*0] psubw m2, m0, [tmpq+32*0] movu m1, [dstq+32*1] psubw m3, m1, [tmpq+32*1] pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 pmovzxbw m4, [maskq+16*2] pmovzxbw m5, [maskq+16*3] paddw m0, m2 paddw m1, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova m0, [dstq+32*2] psubw m2, m0, [tmpq+32*2] movu m1, [dstq+32*3] psubw m3, m1, [tmpq+32*3] pmullw m4, m6 pmullw m5, m6 add maskq, 16*4 add tmpq, 32*4 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, dsq dec hd jg .w64 RET cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 lea srcq, 
[srcq+r10*2]
    ; (the operand above completes the `lea srcq, ...` that starts on the
    ; previous line: advance src by the clamped x, 2 bytes per pixel)

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea bottomextq, [yq+bhq]
    sub bottomextq, ihq
    lea r3, [bhq-1]                ; lea leaves the flags of the sub intact
    cmovs bottomextq, r12          ; negative -> 0 (r12 is zero)

    ; the register that held y is renamed topext
    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg topextq
    cmovs topextq, r12
    cmp bottomextq, bhq
    cmovns bottomextq, r3          ; upper clamp to bh - 1
    cmp topextq, bhq
    cmovg topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea rightextq, [xq+bwq]
    sub rightextq, iwq
    lea r2, [bwq-1]
    cmovs rightextq, r12

    ; the register that held x is renamed leftext
    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg leftextq
    cmovs leftextq, r12
    cmp rightextq, bwq
    cmovns rightextq, r2
    cmp leftextq, bwq
    cmovns leftextq, r2

    ; bh/iw/ih are no longer needed under those names; their registers are
    ; reused as centerh / centerw / dummy
    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea r3, [bottomextq+topextq]
    sub centerhq, r3
    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov r2, topextq
    imul r2, dstrideq
    add dstq, r2
    mov r9m, dstq                  ; remember the first body row for the top extension

    ; center_w = bw - left_ext - right_ext
    mov centerwq, bwq
    lea r3, [rightextq+leftextq]
    sub centerwq, r3

; v_loop: emits the per-row body loop, specialised at assembly time on whether
; a left and/or right extension is needed. Each inner loop handles 16 pixels
; (one ymm register) per step and may write past the requested width, which
; the function relies on being safe (see the note at its top).
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor r3, r3
    vpbroadcastw m0, [srcq]        ; replicate the first source pixel of the row
.left_loop_%3:
    mova [dstq+r3*2], m0
    add r3, 16
    cmp r3, leftextq
    jl .left_loop_%3

    ; body
    lea r12, [dstq+leftextq*2]     ; r12 = start of the copied span in dst
%endif
    xor r3, r3
.body_loop_%3:
    movu m0, [srcq+r3*2]
%if %1
    movu [r12+r3*2], m0
%else
    movu [dstq+r3*2], m0
%endif
    add r3, 16
    cmp r3, centerwq
    jl .body_loop_%3

%if %2
    ; right extension
%if %1
    lea r12, [r12+centerwq*2]
%else
    lea r12, [dstq+centerwq*2]
%endif
    xor r3, r3
    vpbroadcastw m0, [srcq+centerwq*2-2]   ; replicate the last copied source pixel
.right_loop_%3:
    movu [r12+r3*2], m0
    add r3, 16
    cmp r3, rightextq
    jl .right_loop_%3

%endif
    add dstq, dstrideq
    add srcq, sstrideq
    dec centerhq
    jg .v_loop_%3
%endmacro

    ; pick one of the four v_loop specialisations
    test leftextq, leftextq
    jnz .need_left_ext
    test rightextq, rightextq
    jnz .need_right_ext
    v_loop 0, 0, 0
    jmp .body_done

.need_left_ext:
    test rightextq,
rightextq
    jnz .need_left_right_ext
    v_loop 1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop 1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop 0, 1, 3

.body_done:
    ; bottom edge extension
    ; dstq now points just past the last body row; that last row is copied
    ; downwards bottomext times, 16 pixels wide per outer step.
    test bottomextq, bottomextq
    jz .top
    mov srcq, dstq
    sub srcq, dstrideq             ; srcq = last body row
    xor r1, r1
.bottom_x_loop:
    mova m0, [srcq+r1*2]
    lea r3, [dstq+r1*2]
    mov r4, bottomextq
.bottom_y_loop:
    mova [r3], m0
    add r3, dstrideq
    dec r4
    jg .bottom_y_loop
    add r1, 16
    cmp r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    ; the first body row (saved in r9m) is copied into the topext rows that
    ; start at the original dst pointer.
    test topextq, topextq
    jz .end
    mov srcq, r9m
    mov dstq, dstm
    xor r1, r1
.top_x_loop:
    mova m0, [srcq+r1*2]
    lea r3, [dstq+r1*2]
    mov r4, topextq
.top_y_loop:
    mova [r3], m0
    add r3, dstrideq
    dec r4
    jg .top_y_loop
    add r1, 16
    cmp r1, bwq
    jl .top_x_loop

.end:
    RET
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc16_avx512.asm000066400000000000000000006252071517466257200236520ustar00rootroot00000000000000; Copyright © 2020, VideoLAN and dav2d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

; Read-only constants for the routines in this file; the section is requested
; with 64-byte alignment.
; NOTE(review): the *_shuf*, *_perm* and *_end* tables below look like
; byte-index operands for byte permute/shuffle instructions (indices above 63
; would then select from a second source register). Most of their consumers
; are outside this excerpt -- confirm before editing any row.
SECTION_RODATA 64

spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
              db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
              db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
              db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
              db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
              db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
              db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
              db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
              db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
              db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
              db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
              db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
               db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
           db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
           db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
           db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
           db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
           db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
           db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
           db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
           db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
           db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
             db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
             db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
             db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
             db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
             db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
             db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
             db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
             db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
             db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
             db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
             db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
             db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
             db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
             db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
             db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
             db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
             db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
             db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
               db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
               db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
              db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
              db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
              db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
              db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
              db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
              db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
               db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
               db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
             db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
             db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
             db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf and pd_0to7 label the same address. The unlabelled `dd 1` and
; `dd 3` are interleaved with pw_2048 / pw_8192, so the byte layout here is
; order-sensitive.
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
         dd 1
pw_2048: times 2 dw 2048
         dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
prep_hv_shift: dq 6, 4
; put_bilin_h_rnd and prep_mul are read as dword pairs at index
; (bitdepth_max >> 11) by put_bilin_16bpc / prep_bilin_16bpc in this file.
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4
pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4

; Aliases: these names reuse the leading bytes of existing tables instead of
; emitting separate constants (prep_mul starts with the words 16, 16;
; warp_8x8_rnd_h starts with the dword 512).
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h

; BASE_JMP_TABLE name, suffix, w0, w1, ...
; Emits one 16-bit entry per listed width holding the offset of label
; <name>_<suffix>_w<width> relative to <name>_<suffix>.
; <name>_<suffix>_table is defined as (table start - w0), i.e. biased by the
; first width in the list; the bilin dispatch code in this file indexes it
; with 2 * tzcnt(w).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; HV_JMP_TABLE name, filter, suffix, types, w0, w1, ...
; `types` is a bit mask selecting which tables to emit:
;   bit 0 -> <name>_<filter>_h_<suffix>_table   (labels .h_w<width>)
;   bit 1 -> <name>_<filter>_v_<suffix>_table   (labels .v_w<width>)
;   bit 2 -> <name>_<filter>_hv_<suffix>_table  (labels .hv_w<width>)
; Entries are 16-bit offsets of the local labels inside the mangled function
; <name>_<filter>_16bpc_<suffix>, relative to <name>_<suffix>. Each table
; symbol is biased by the first width, as in BASE_JMP_TABLE. `%rotate 4`
; rewinds the parameter list after each table so the next one sees the widths
; from the start again.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table (%%h - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table (%%v - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; BIDIR_JMP_TABLE name, suffix, w0, w1, ...
; Emits one 32-bit entry per width holding the offset of label .w<width> of
; the mangled function <name>_16bpc_<suffix>, relative to the (biased) table
; symbol itself. The table symbol is defined as (table start - 2*w0).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Base symbols for the put/prep tables: the .put / .prep local labels of the
; bilinear functions.
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
; types = 7 emits h, v and hv tables; types = 2 emits only the v table
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32,
64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128

; Offset of a jump table relative to the put_/prep_ base symbol, for use in
; addressing expressions whose base register holds that symbol's address.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
cextern mc_warp_filter

SECTION .text

; t0 is a scratch register whose number depends on the calling convention
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

INIT_ZMM avx512icl
; put_bilin_16bpc: bilinear motion-compensation "put" for 16-bit pixels.
; Stack arguments read below: r6m = mx, r7m = my, r8m = bitdepth_max.
; Dispatch:
;   mx == 0 && my == 0 -> .put  (plain copy)
;   mx != 0 && my == 0 -> .h    (horizontal filter)
;   mx == 0 && my != 0 -> .v    (vertical filter)
;   mx != 0 && my != 0 -> .hv   (both)
; Each path then jumps through a width-indexed table (t0 = tzcnt(w)).
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
    mov mxyd, r6m ; mx
    lea r7, [put_avx512icl]        ; r7 = base address for all jump tables
    tzcnt t0d, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    movzx t0d, word [r7+t0*2+table_offset(put,)]
    add t0, r7
    jmp t0
    ; .put_w*: straight copies, two rows per iteration (one row for w128)
.put_w2:
    mov r6d, [srcq+ssq*0]
    mov r7d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6d
    mov [dstq+dsq*1], r7d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    mov r6, [srcq+ssq*0]
    mov r7, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6
    mov [dstq+dsq*1], r7
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movu xmm0, [srcq+ssq*0]
    movu xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], xmm0
    mova [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
.put_w16:
    movu ym0, [srcq+ssq*0]
    movu ym1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], ym0
    mova [dstq+dsq*1], ym1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w16
    RET
.put_w32:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w32
    RET
.put_w64:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
    movu m2, [srcq+ssq*1+64*0]
    movu m3, [srcq+ssq*1+64*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+64*0], m0
    mova [dstq+dsq*0+64*1], m1
    mova [dstq+dsq*1+64*0], m2
    mova [dstq+dsq*1+64*1], m3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w64
    RET
.put_w128:
    movu m0, [srcq+64*0]
    movu m1, [srcq+64*1]
    movu m2, [srcq+64*2]
    movu m3, [srcq+64*3]
    add srcq, ssq
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    mova [dstq+64*2], m2
    mova [dstq+64*3], m3
    add dstq, dsq
    dec hd
    jg .put_w128
    RET
.h:
    ; Horizontal-only filter:
    ;   out = (src[x] * (16 - mx) + src[x + 1] * mx + rnd) >> 4
    ; m4 = 16 - mx, m5 = mx, m6 = rnd
    vpbroadcastw m5, mxyd
    mov mxyd, r7m ; my
    vpbroadcastd m4, [pw_16]
    psubw m4, m5
    test mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
    mov r6d, r8m ; bitdepth_max
    add t0, r7
    shr r6d, 11                    ; bit 11 of bitdepth_max selects the rounding entry
    vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
    jmp t0
.h_w2:
    movq xmm1, [srcq+ssq*0]
    movhps xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmullw xmm0, xmm1, xm4
    psrlq xmm1, 16                 ; shift each row right by one pixel -> src[x + 1]
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 4
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2
    RET
.h_w4:
    ; for w >= 4 the src[x + 1] operand is a second load at +2 bytes
    movq xmm0, [srcq+ssq*0+0]
    movhps xmm0, [srcq+ssq*1+0]
    movq xmm1, [srcq+ssq*0+2]
    movhps xmm1, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    pmullw xmm0, xm4
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 4
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4
    RET
.h_w8:
    movu xm0, [srcq+ssq*0+0]
    vinserti32x4 ym0, [srcq+ssq*1+0], 1
    movu xm1, [srcq+ssq*0+2]
    vinserti32x4 ym1, [srcq+ssq*1+2], 1
    lea srcq, [srcq+ssq*2]
    pmullw ym0, ym4
    pmullw ym1, ym5
    paddw ym0, ym6
    paddw ym0, ym1
    psrlw ym0, 4
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    movu ym0, [srcq+ssq*0+0]
    vinserti32x8 m0, [srcq+ssq*1+0], 1
    movu ym1, [srcq+ssq*0+2]
    vinserti32x8 m1, [srcq+ssq*1+2], 1
    lea srcq, [srcq+ssq*2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m6
    paddw m0, m1
    psrlw m0, 4
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m2, m5, [srcq+ssq*0+2]
    pmullw m1, m4, [srcq+ssq*1+0]
    pmullw m3, m5, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw m0, m4, [srcq+64*0+0]
    pmullw m2, m5, [srcq+64*0+2]
    pmullw m1, m4, [srcq+64*1+0]
    pmullw m3, m5, [srcq+64*1+2]
    add srcq, ssq
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, dsq
    dec hd
    jg .h_w64
    RET
.h_w128:
    pmullw m0, m4, [srcq+64*0+0]
    pmullw m7, m5, [srcq+64*0+2]
    pmullw m1, m4, [srcq+64*1+0]
    pmullw m8, m5, [srcq+64*1+2]
    pmullw m2, m4, [srcq+64*2+0]
    pmullw m9, m5, [srcq+64*2+2]
    pmullw m3, m4, [srcq+64*3+0]
    pmullw m10, m5, [srcq+64*3+2]
    add srcq, ssq
    REPX {paddw x, m6}, m0, m1, m2, m3
    paddw m0, m7
    paddw m1, m8
    paddw m2, m9
    paddw m3, m10
    REPX {psrlw x, 4}, m0, m1, m2, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    mova [dstq+64*2], m2
    mova [dstq+64*3], m3
    add dstq, dsq
    dec hd
    jg .h_w128
    RET
.v:
    ; Vertical-only filter, with a = row y and b = row y + 1:
    ;   out = a + pmulhrsw(b - a, my << 11)
    ; pmulhrsw computes (x * y + 0x4000) >> 15, so the correction term equals
    ; ((b - a) * my + 8) >> 4. m8 = my << 11.
    ; Each loop keeps the last loaded row in register(s) for the next pass.
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
    shl mxyd, 11
    vpbroadcastw m8, mxyd
    add t0, r7
    jmp t0
.v_w2:
    movd xmm0, [srcq+ssq*0]
.v_w2_loop:
    movd xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpckldq xmm2, xmm0, xmm1     ; rows 0 1
    movd xmm0, [srcq+ssq*0]
    punpckldq xmm1, xmm0           ; rows 1 2
    psubw xmm1, xmm2
    pmulhrsw xmm1, xm8
    paddw xmm1, xmm2
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm0, [srcq+ssq*0]
.v_w4_loop:
    movq xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklqdq xmm2, xmm0, xmm1
    movq xmm0, [srcq+ssq*0]
    punpcklqdq xmm1, xmm0
    psubw xmm1, xmm2
    pmulhrsw xmm1, xm8
    paddw xmm1, xmm2
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu xmm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128 ymm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd ymm2, ymm0, ymm1, 0xf0
    vbroadcasti128 ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm0, 0xf0
    psubw ymm1, ymm2
    pmulhrsw ymm1, ym8
    paddw ymm1, ymm2
    mova [dstq+dsq*0], xmm1
    vextracti128 [dstq+dsq*1], ymm1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
.v_w16:
    movu ym0, [srcq+ssq*0]
.v_w16_loop:
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    psubw ym1, ym3, ym0
    pmulhrsw ym1, ym8
    paddw ym1, ym0
    movu ym0, [srcq+ssq*0]
    psubw ym2, ym0, ym3
    pmulhrsw ym2, ym8
    paddw ym2, ym3
    mova [dstq+dsq*0], ym1
    mova [dstq+dsq*1], ym2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
    movu m0, [srcq+ssq*0]
.v_w32_loop:
    movu m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    psubw m1, m3, m0
    pmulhrsw m1, m8
    paddw m1, m0
    movu m0, [srcq+ssq*0]
    psubw m2, m0, m3
    pmulhrsw m2, m8
    paddw m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
    movu m2, [srcq+ssq*1+64*0]
    movu m3, [srcq+ssq*1+64*1]
    lea srcq, [srcq+ssq*2]
    psubw m4, m2, m0
    pmulhrsw m4, m8
    paddw m4, m0
    movu m0, [srcq+ssq*0+64*0]
    psubw m5, m3, m1
    pmulhrsw m5, m8
    paddw m5, m1
    movu m1, [srcq+ssq*0+64*1]
    psubw m6, m0, m2
    pmulhrsw m6, m8
    psubw m7, m1, m3
    pmulhrsw m7, m8
    mova [dstq+dsq*0+64*0], m4
    mova [dstq+dsq*0+64*1], m5
    paddw m6, m2
    paddw m7, m3
    mova [dstq+dsq*1+64*0], m6
    mova [dstq+dsq*1+64*1], m7
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
    movu m2, [srcq+ssq*0+64*2]
    movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
    movu m4, [srcq+ssq*1+64*0]
    movu m5, [srcq+ssq*1+64*1]
    movu m6, [srcq+ssq*1+64*2]
    movu m7, [srcq+ssq*1+64*3]
    lea srcq, [srcq+ssq*2]
    psubw m9, m4, m0
    pmulhrsw m9, m8
    paddw m9, m0
    movu m0, [srcq+ssq*0+64*0]
    psubw m10, m5, m1
    pmulhrsw m10, m8
    paddw m10, m1
    movu m1, [srcq+ssq*0+64*1]
    psubw m11, m6, m2
    pmulhrsw m11, m8
    paddw m11, m2
    movu m2, [srcq+ssq*0+64*2]
    psubw m12, m7, m3
    pmulhrsw m12, m8
    paddw m12, m3
    movu m3, [srcq+ssq*0+64*3]
    mova [dstq+dsq*0+64*0], m9
    psubw m9, m0, m4
    pmulhrsw m9, m8
    mova [dstq+dsq*0+64*1], m10
    psubw m10, m1, m5
    pmulhrsw m10, m8
    mova [dstq+dsq*0+64*2], m11
    psubw m11, m2, m6
    pmulhrsw m11, m8
    mova [dstq+dsq*0+64*3], m12
    psubw m12, m3, m7
    pmulhrsw m12, m8
    paddw m9, m4
    paddw m10, m5
    mova [dstq+dsq*1+64*0], m9
    mova [dstq+dsq*1+64*1], m10
    paddw m11, m6
    paddw m12, m7
    mova [dstq+dsq*1+64*2], m11
    mova [dstq+dsq*1+64*3], m12
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w128_loop
    RET
.hv:
    ; Combined filter. On entry m4 = 16 - mx and m5 = mx (set up in .h).
    ; Horizontal pass: t = (src[x] * w0 + src[x + 1] * w1 + 2) >> 2
    ; Vertical pass:   v = prev + pmulhw(2 * (cur - prev), my << 11)
    ;                    = prev + (((cur - prev) * my) >> 4)
    ; Final rounding:  out = pmulhrsw(v, m8)
    ;   m8 = 8192 -> (v + 2) >> 2   (bit 11 of bitdepth_max set, .hv_12bpc)
    ;   m8 = 2048 -> (v + 8) >> 4   (otherwise; the weights are pre-scaled by 4)
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
    shl mxyd, 11
    vpbroadcastd m6, [pw_2]
    vpbroadcastw m7, mxyd
    vpbroadcastd m8, [pw_8192]
    add t0, r7
    test dword r8m, 0x800
    jnz .hv_12bpc
    psllw m4, 2
    psllw m5, 2
    vpbroadcastd m8, [pw_2048]
.hv_12bpc:
    jmp t0
.hv_w2:
    vpbroadcastq xmm1, [srcq+ssq*0]
    pmullw xmm0, xmm1, xm4
    psrlq xmm1, 16
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
.hv_w2_loop:
    movq xmm2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xmm2, [srcq+ssq*0]
    pmullw xmm1, xmm2, xm4
    psrlq xmm2, 16
    pmullw xmm2, xm5
    paddw xmm1, xm6
    paddw xmm1, xmm2
    psrlw xmm1, 2 ; 1 _ 2 _
    shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
    mova xmm0, xmm1
    psubw xmm1, xmm2
    paddw xmm1, xmm1
    pmulhw xmm1, xm7
    paddw xmm1, xmm2
    pmulhrsw xmm1, xm8
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    ; loading at -8/-6 bytes places row 0 in the upper qword, where the
    ; shufpd in the loop expects the previous row
    pmullw xmm0, xm4, [srcq+ssq*0-8]
    pmullw xmm1, xm5, [srcq+ssq*0-6]
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
.hv_w4_loop:
    movq xmm1, [srcq+ssq*1+0]
    movq xmm2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    movhps xmm1, [srcq+ssq*0+0]
    movhps xmm2, [srcq+ssq*0+2]
    pmullw xmm1, xm4
    pmullw xmm2, xm5
    paddw xmm1, xm6
    paddw xmm1, xmm2
    psrlw xmm1, 2 ; 1 2
    shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
    mova xmm0, xmm1
    psubw xmm1, xmm2
    paddw xmm1, xmm1
    pmulhw xmm1, xm7
    paddw xmm1, xmm2
    pmulhrsw xmm1, xm8
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw xmm0, xm4, [srcq+ssq*0+0]
    pmullw xmm1, xm5, [srcq+ssq*0+2]
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
    vinserti32x4 ym0, xmm0, 1      ; row 0 into the upper 128-bit lane
.hv_w8_loop:
    movu xm1, [srcq+ssq*1+0]
    movu xm2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym1, [srcq+ssq*0+0], 1
    vinserti32x4 ym2, [srcq+ssq*0+2], 1
    pmullw ym1, ym4
    pmullw ym2, ym5
    paddw ym1, ym6
    paddw ym1, ym2
    psrlw ym1, 2 ; 1 2
    vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
    mova ym0, ym1
    psubw ym1, ym2
    paddw ym1, ym1
    pmulhw ym1, ym7
    paddw ym1, ym2
    pmulhrsw ym1, ym8
    mova [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw ym0, ym4, [srcq+ssq*0+0]
    pmullw ym1, ym5, [srcq+ssq*0+2]
    paddw ym0, ym6
    paddw ym0, ym1
    psrlw ym0, 2
    vinserti32x8 m0, ym0, 1        ; row 0 into the upper 256-bit half
.hv_w16_loop:
    movu ym1, [srcq+ssq*1+0]
    movu ym2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m1, [srcq+ssq*0+0], 1
    vinserti32x8 m2, [srcq+ssq*0+2], 1
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m6
    paddw m1, m2
    psrlw m1, 2 ; 1 2
    vshufi32x4 m2, m0, m1, q1032 ; 0 1
    mova m0, m1
    psubw m1, m2
    paddw m1, m1
    pmulhw m1, m7
    paddw m1, m2
    pmulhrsw m1, m8
    mova [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
.hv_w64:
.hv_w128:
    ; Widths >= 32 are processed in 32-pixel (64-byte) column strips.
    ; r6d packs both loop counters: the low byte holds h and the bits above it
    ; hold (w / 32 - 1), the number of strips remaining after the current one.
    movifnidn wd, wm
    lea r6d, [hq+wq*8-256]
    mov r4, srcq                   ; r4/r7 remember the top of the current strip
    mov r7, dstq
.hv_w32_loop0:
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m1, m5, [srcq+ssq*0+2]
    paddw m0, m6
    paddw m0, m1
    psrlw m0, 2
.hv_w32_loop:
    pmullw m3, m4, [srcq+ssq*1+0]
    pmullw m1, m5, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    paddw m3, m6
    paddw m3, m1
    psrlw m3, 2
    psubw m1, m3, m0
    paddw m1, m1
    pmulhw m1, m7
    paddw m1, m0
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m2, m5, [srcq+ssq*0+2]
    paddw m0, m6
    paddw m0, m2
    psrlw m0, 2
    psubw m2, m0, m3
    paddw m2, m2
    pmulhw m2, m7
    paddw m2, m3
    pmulhrsw m1, m8
    pmulhrsw m2, m8
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w32_loop
    add r4, 64                     ; next strip
    add r7, 64
    movzx hd, r6b                  ; restore h from the low byte
    mov srcq, r4
    mov dstq, r7
    sub r6d, 1<<8
    jg .hv_w32_loop0
    RET

; prep_bilin_16bpc: bilinear "prep" variant that writes an intermediate buffer
; (tmp). Stack arguments read below: r5m = mx, r6m = my, r7m = bitdepth_max.
; Only the entry/dispatch code is documented here; the width-specific paths
; follow below.
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn mxyd, r5m ; mx
    lea r6, [prep_avx512icl]       ; r6 = base address for all jump tables
    tzcnt wd, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .v
.prep:
    ; unfiltered path: m4 = prep_mul entry picked by bit 11 of bitdepth_max,
    ; m5 = 8192
    movzx wd, word [r6+wq*2+table_offset(prep,)]
    mov r5d, r7m ; bitdepth_max
    vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
    add wq, r6
    shr r5d, 11
    vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea stride3q, [strideq*3]
    jmp wq
.prep_w4: mov r3d, 0x0c kmovb k1, r3d .prep_w4_loop: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] vpbroadcastq ym1, [srcq+strideq*2] vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4} lea srcq, [srcq+strideq*4] pmullw ym0, ym4 psubw ym0, ym5 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .prep_w4_loop RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti32x4 ym0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .prep_w8 RET .prep_w16: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+strideq*0+64*0] pmullw m1, m4, [srcq+strideq*0+64*1] pmullw m2, m4, [srcq+strideq*1+64*0] pmullw m3, m4, [srcq+strideq*1+64*1] lea srcq, [srcq+strideq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+64*0] pmullw m1, m4, [srcq+64*1] pmullw m2, m4, [srcq+64*2] pmullw m3, m4, [srcq+64*3] add srcq, strideq REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .prep_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastd m6, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw 
m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti32x4 ym1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym0, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0+0] movu xm1, [srcq+strideq*0+2] vinserti32x4 ym0, [srcq+strideq*1+0], 1 vinserti32x4 ym1, [srcq+strideq*1+2], 1 vinserti32x4 m0, [srcq+strideq*2+0], 2 vinserti32x4 m1, [srcq+strideq*2+2], 2 vinserti32x4 m0, [srcq+stride3q +0], 3 vinserti32x4 m1, [srcq+stride3q +2], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8 RET .h_w16: movu ym0, [srcq+strideq*0+0] vinserti32x8 m0, [srcq+strideq*1+0], 1 movu ym1, [srcq+strideq*0+2] vinserti32x8 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] pmullw m1, m4, [srcq+strideq*1+0] pmullw m3, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] add srcq, strideq psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+ 0] pmullw m7, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m8, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] 
pmullw m9, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m10, m5, [srcq+194] add srcq, strideq REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psraw x, 2}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .h_w128 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] vpbroadcastw m9, mxyd vpbroadcastd m8, [pw_16] vpbroadcastd m10, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m8, m9 test dword r7m, 0x800 jnz .v_12bpc psllw m8, 2 psllw m9, 2 .v_12bpc: jmp wq .v_w4: movq xmm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq xmm2, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm2, ymm1, 0x30 vpblendd ymm2, ymm3, 0xc0 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 movq xmm0, [srcq+strideq*0] valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 pmullw ymm1, ym8 pmullw ymm2, ym9 psubw ymm1, ym10 paddw ymm1, ymm2 psraw ymm1, 2 mova [tmpq], ymm1 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vinserti32x4 m1, [srcq+strideq*2], 2 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] valignq m2, m0, m1, 2 ; 1 2 3 4 pmullw m1, m8 pmullw m2, m9 psubw m1, m10 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: movu ym0, [srcq+strideq*0] .v_w16_loop: vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 movu ym3, [srcq+strideq*2] vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 lea srcq, [srcq+strideq*4] movu ym0, [srcq+strideq*0] vshufi32x4 m3, m1, m3, q1032 ; 1 2 vshufi32x4 m4, m2, m0, q1032 ; 3 4 pmullw m1, m8 pmullw m2, m8 pmullw m3, m9 pmullw m4, m9 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: 
movu m0, [srcq+strideq*0] .v_w32_loop: movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m1, m8, m0 movu m0, [srcq+strideq*0] pmullw m2, m8, m3 pmullw m3, m9 pmullw m4, m9, m0 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w64_loop: add srcq, strideq pmullw m2, m8, m0 movu m0, [srcq+64*0] pmullw m3, m8, m1 movu m1, [srcq+64*1] pmullw m4, m9, m0 pmullw m5, m9, m1 psubw m2, m10 psubw m3, m10 paddw m2, m4 paddw m3, m5 psraw m2, 2 psraw m3, 2 mova [tmpq+64*0], m2 mova [tmpq+64*1], m3 add tmpq, 64*2 dec hd jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] .v_w128_loop: add srcq, strideq pmullw m4, m8, m0 movu m0, [srcq+64*0] pmullw m5, m8, m1 movu m1, [srcq+64*1] pmullw m6, m8, m2 movu m2, [srcq+64*2] pmullw m7, m8, m3 movu m3, [srcq+64*3] pmullw m11, m9, m0 pmullw m12, m9, m1 pmullw m13, m9, m2 pmullw m14, m9, m3 REPX {psubw x, m10}, m4, m5, m6, m7 paddw m4, m11 paddw m5, m12 paddw m6, m13 paddw m7, m14 REPX {psraw x, 2}, m4, m5, m6, m7 mova [tmpq+64*0], m4 mova [tmpq+64*1], m5 mova [tmpq+64*2], m6 mova [tmpq+64*3], m7 add tmpq, 64*4 dec hd jg .v_w128_loop RET .hv: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m7, mxyd add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: movq xmm0, [srcq+strideq*0+0] movq xmm1, [srcq+strideq*0+2] pmullw xmm0, xm4 pmullw xmm1, xm5 psubw xmm0, xm6 paddw xmm0, xmm1 psraw xmm0, 2 vpbroadcastq ym0, xmm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 ym1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 ym2, [srcq+strideq*0], 1 punpcklqdq ym3, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym3, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym3, ym6 paddw ym1, ym3 psraw ym1, 2 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, 
ym1 psubw ym1, ym2 pmulhrsw ym1, ym7 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0+0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm6 paddw xm0, xm1 psraw xm0, 2 vinserti32x4 m0, xm0, 3 .hv_w8_loop: movu xm1, [srcq+strideq*1+0] movu xm2, [srcq+strideq*1+2] vinserti32x4 ym1, [srcq+strideq*2+0], 1 vinserti32x4 ym2, [srcq+strideq*2+2], 1 vinserti32x4 m1, [srcq+stride3q +0], 2 vinserti32x4 m2, [srcq+stride3q +2], 2 lea srcq, [srcq+strideq*4] vinserti32x4 m1, [srcq+strideq*0+0], 3 vinserti32x4 m2, [srcq+strideq*0+2], 3 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+strideq*0+0] pmullw ym1, ym5, [srcq+strideq*0+2] psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+strideq*1+0] movu ym2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti32x8 m1, [srcq+strideq*0+0], 1 vinserti32x8 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m1 psraw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+strideq*1+0] pmullw m1, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m3, m6 paddw m3, m1 psraw m3, 2 psubw m1, m3, m0 pmulhrsw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m2 psraw m0, 2 psubw m2, m0, m3 pmulhrsw m2, m7 paddw m2, m3 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, 
m5, [srcq+66] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 .hv_w64_loop: add srcq, strideq pmullw m2, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m3, m4, [srcq+64] pmullw m9, m5, [srcq+66] psubw m2, m6 psubw m3, m6 paddw m2, m8 paddw m3, m9 psraw m2, 2 psraw m3, 2 psubw m8, m2, m0 psubw m9, m3, m1 pmulhrsw m8, m7 pmulhrsw m9, m7 paddw m8, m0 mova m0, m2 paddw m9, m1 mova m1, m3 mova [tmpq+64*0], m8 mova [tmpq+64*1], m9 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: pmullw m0, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m9, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m10, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m11, m5, [srcq+194] REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 REPX {psraw x, 2}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq pmullw m8, m4, [srcq+ 0] pmullw m12, m5, [srcq+ 2] pmullw m9, m4, [srcq+ 64] pmullw m13, m5, [srcq+ 66] pmullw m10, m4, [srcq+128] pmullw m14, m5, [srcq+130] pmullw m11, m4, [srcq+192] pmullw m15, m5, [srcq+194] REPX {psubw x, m6}, m8, m9, m10, m11 paddw m8, m12 paddw m9, m13 paddw m10, m14 paddw m11, m15 REPX {psraw x, 2}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 paddw m12, m0 mova m0, m8 paddw m13, m1 mova m1, m9 mova [tmpq+64*0], m12 mova [tmpq+64*1], m13 paddw m14, m2 mova m2, m10 paddw m15, m3 mova m3, m11 mova [tmpq+64*2], m14 mova [tmpq+64*3], m15 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %if %0 == 5 ; skip the jump in the last filter jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro %if WIN64 
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif

; t0d/t1d are loaded with the horizontal/vertical FILTER_* selectors by the
; FN entry stubs. `buf` is scratch memory used to spill the sign-extended
; filter taps (at most [buf+0..31] is touched below) so that individual tap
; pairs can be re-read with vpbroadcastd.

; Entry stubs for the filter combinations built only from REGULAR/SMOOTH.
; The first three jump to put_6tap_16bpc; the last one has no jump target and
; falls through into the function that follows it.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR

; put_6tap_16bpc(dst, ds, src, ss, w, h, mx, my, <r8m>)
;
; Subpel "put" filter for 16-bit pixels that only applies six taps: the tap
; words are loaded from subpel_filters+1 and three dword (tap pair)
; broadcasts are used, i.e. filter bytes 1..6 of each 8-byte entry.
;   dst/ds, src/ss : pointers and strides (the code advances them by ds/ss)
;   w, h           : block width/height; w selects the code path
;   mx, my         : subpel positions; a zero value skips that direction
;   r8m            : broadcast to m15 and used as the upper clamp (pminsw);
;                    bit 11 selects the *_12bit constants (presumably this is
;                    bitdepth_max - confirm against the C prototype)
; Dispatch: mx != 0 -> .h (and .hv if my != 0), else my != 0 -> .v, else the
; plain copy reached through the `put` jump table.
; For w <= 4 the horizontal filter uses the 4-tap entry (low byte of mxd), and
; for h < 6 the vertical filter uses the 4-tap entry (low byte of myd).
cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
    ; mx*0x010101 replicates mx into bytes 0..2; adding the FILTER_* value
    ; turns bytes 2 and 0 into table indices while byte 1 stays the raw mx.
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movifnidn wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.put:
    ; No fractional offset in either direction: tail-jump through the width
    ; indexed `put` table. put_8tap_16bpc also jumps here.
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
%if WIN64
    pop r8
%endif
    jmp wq

; ---- horizontal only, w == 8: two rows per iteration ----
.h_w8:
    mova m4, [spel_h_shufA]
    movu m5, [spel_h_shufB]
    movu m6, [spel_h_shufC]
.h_w8_loop:
    movu ym2, [srcq+ssq*0]
    vinserti32x8 m2, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    vpermb m1, m4, m2
    vpdpwssd m0, m10, m1
    vpermb m1, m5, m2
    vpdpwssd m0, m11, m1
    vpermb m1, m6, m2
    vpdpwssd m0, m12, m1
    psrad m0, 6
    vextracti32x8 ym1, m0, 1
    packusdw ym0, ym1
    pminsw ym0, ym15
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8_loop
    RET

; ---- horizontal entry: set up clamp, rounding and tap pairs ----
.h:
    vpbroadcastw m15, r8m
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    shr r7d, 11
    vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] ; rounding term, indexed by r8m >> 11
    cmp wd, 4
    ; w <= 4 shares the 4-tap code of put_8tap_16bpc; the flags of the cmp
    ; above are still consumed there.
    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
    shr mxd, 16
    sub srcq, 4
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    mova [buf], xmm0
    vpbroadcastd m10, xmm0
    vpbroadcastd m12, [buf+8]
    vpbroadcastd m11, [buf+4]
    sub wd, 16 ; <0: w8, ==0: w16, >0: w32 and wider
    jl .h_w8
    vbroadcasti32x4 m6, [spel_h_shufA]
    vbroadcasti32x4 m7, [spel_h_shufB]
    jg .h_w32
; ---- horizontal only, w == 16: two rows per iteration ----
.h_w16_loop:
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+12]
    vinserti32x8 m3, [srcq+ssq*1+12], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0 b0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; a2' b2'
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1 b1
    vpdpwssd m1, m11, m3 ; a1' b1'
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2 b2
    vpdpwssd m1, m10, m2 ; a0' b0'
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16_loop
    RET
; ---- horizontal only, w >= 32: one row at a time, 32 pixels per step ----
.h_w32:
    ; wq holds w-16 here; bias the pointers so that r6 can count up from
    ; -wq towards zero in steps of 32 pixels.
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+12]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; b2
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1
    vpdpwssd m1, m11, m3 ; b1
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2
    vpdpwssd m1, m10, m2 ; b0
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+r6*2], m0
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w32_loop0
    RET

; ---- vertical entry: m11 = rounding (pd_32), m12..m14 = tap pairs,
; ---- m15 = clamp, r6 = -ss, then dispatch on log2(w) ----
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd ; h < 6: use the filter index from the low byte
    vpbroadcastd m11, [pd_32]
    pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
    tzcnt r7d, wd
    vpbroadcastw m15, r8m
    mov r6, ssq
    movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
    neg r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd m12, xmm0
    add r7, r8
    vpbroadcastd m13, [rsp+stack_offset+12]
    vpbroadcastd m14, [rsp+stack_offset+16]
    jmp r7
.v_w2:
    movd xmm2, [srcq+r6 *2]
    pinsrd xmm2, [srcq+r6 *1], 1
    pinsrd xmm2, [srcq+ssq*0], 2
    pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklwd xmm1, xmm2, xmm3 ; 01 12
    punpckhwd xmm2, xmm3 ; 23 34
.v_w2_loop:
    movd xmm3, [srcq+ssq*1]
    mova xmm4, xm11
    vpdpwssd xmm4, xmm1, xm12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova xmm1, xmm2
    vpdpwssd xmm4, xmm2, xm13 ; a1 b1
    punpckldq xmm2, xmm0, xmm3 ; 4 5
    movd xmm0, [srcq+ssq*0]
    punpckldq xmm3, xmm0 ; 5 6
    punpcklwd xmm2, xmm3 ; 45 56
    vpdpwssd xmm4, xmm2, xm14 ; a2 b2
    psrad xmm4, 6
    packusdw xmm4, xmm4
    pminsw xmm4, xm15
    movd [dstq+dsq*0], xmm4
    pextrd [dstq+dsq*1], xmm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm1, [srcq+r6 *2]
    vpbroadcastq ymm3, [srcq+r6 *1]
    vpbroadcastq ymm2, [srcq+ssq*0]
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm3, 0x30
    vpblendd ymm3, ymm2, 0x30
    punpcklwd ymm1, ymm3 ; 01 12
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm0, 0x30
    punpcklwd ymm2, ymm4 ; 23 34
.v_w4_loop:
    vpbroadcastq ymm3, [srcq+ssq*1]
    mova ymm4, ym11
    vpdpwssd ymm4, ymm1, ym12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova ymm1, ymm2
    vpdpwssd ymm4, ymm2, ym13 ; a1 b1
    vpblendd ymm2, ymm0, ymm3, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm3, ymm0, 0x30
    punpcklwd ymm2, ymm3 ; 45 56
    vpdpwssd ymm4, ymm2, ym14 ; a2 b2
    psrad ymm4, 6
    vextracti128 xmm3, ymm4, 1
    packusdw xmm4, xmm3
    pminsw xmm4, xm15
    movq [dstq+dsq*0], xmm4
    movhps [dstq+dsq*1], xmm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vinserti32x4 m1, m0, [srcq+r6 *2], 0
    vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2
    vinserti32x4 ym0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m5, [spel_v_shuf8]
    vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4
    vpermb m1, m5, m1 ; 01 12
    vpermb m2, m5, m0 ; 23 34
.v_w8_loop:
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    movu xm3, [srcq+ssq*0]
    mova m4, m11
    vpdpwssd m4, m12, m1 ; a0 b0
    vshufi32x4 m0, m3, q1032 ; 4 5 6
    mova m1, m2
    vpdpwssd m4, m13, m2 ; a1 b1
    vpermb m2, m5, m0 ; 45 56
    vpdpwssd m4, m14, m2 ; a2 b2
    psrad m4, 6
    vextracti32x8 ym3, m4, 1
    packusdw ym4, ym3
    pminsw ym4, ym15
    mova [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+r6 *1]
    vinserti32x8 m1, m0, [srcq+ssq*0], 1
    vinserti32x8 m0, [srcq+r6*2], 0
    mova m6, [spel_v_shuf16]
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m1, m6, m1 ; 12
    vpermb m0, m6, m0 ; 01
    vpermb m3, m6, m3 ; 34
    mova m7, [deint_q_shuf]
    vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m3, m6, m3 ; 56
    vpshrdd m2, m1, m3, 16 ; 45
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m4, m14, m2 ; a2
    psrad m5, 6
    psrad m4, 6
    packusdw m4, m5
    pminsw m4, m15
    vpermq m4, m7, m4
    mova [dstq+dsq*0], ym4
    vextracti32x8 [dstq+dsq*1], m4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    ; Process the block in 32-pixel wide column strips. wd packs the number
    ; of remaining strips above bit 8 and keeps h in its low byte so that hd
    ; can be reloaded for every strip.
    lea wd, [hq+wq*8-256]
.v_w32_loop0:
    movu m16, [srcq+r6 *2]
    movu m17, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    movu m18, [srcq+ssq*0]
    movu m19, [srcq+ssq*1]
    mov r8, dstq ; r8 is reused as the per-strip dst pointer from here on
    movu m20, [r7 +ssq*0]
    punpcklwd m0, m16, m17 ; 01
    punpckhwd m16, m17
    punpcklwd m1, m17, m18 ; 12
    punpckhwd m17, m18
    punpcklwd m2, m18, m19 ; 23
    punpckhwd m18, m19
    punpcklwd m3, m19, m20 ; 34
    punpckhwd m19, m20
.v_w32_loop:
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m6, m11
    vpdpwssd m6, m12, m16
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m7, m11
    vpdpwssd m7, m12, m17
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    mova m16, m18
    vpdpwssd m6, m13, m18
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m17, m19
    vpdpwssd m7, m13, m19
    movu m19, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    punpcklwd m2, m20, m19 ; 45
    punpckhwd m18, m20, m19
    movu m20, [r7+ssq*0]
    vpdpwssd m4, m14, m2 ; a2
    vpdpwssd m6, m14, m18
    punpcklwd m3, m19, m20 ; 56
    punpckhwd m19, m20
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m7, m14, m19
    REPX {psrad x, 6}, m4, m6, m5, m7
    packusdw m4, m6
    packusdw m5, m7
    pminsw m4, m15
    pminsw m5, m15
    mova [r8+dsq*0], m4
    mova [r8+dsq*1], m5
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add dstq, 64
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w32_loop0
    vzeroupper
    RET

; ---- horizontal + vertical, w <= 4: 4-tap horizontal (taps from the
; ---- low-byte index), 6-tap vertical; m15 was set in .h ----
.hv:
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    neg r6
    ; Bit 11 of r8m picks the rounding constant and the pre-scaling of the
    ; horizontal (xmm0) and vertical (xmm1) taps.
    test dword r8m, 0x800
    jnz .hv_12bit
    vpbroadcastd m10, [pd_2176]
    psllw xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd m10, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_main:
    movu xm4, [srcq+r6 *2]
    vinserti32x4 ym4, [srcq+r6 *1], 1
    vinserti32x4 m4, [srcq+ssq*0], 2
    vbroadcasti32x4 m6, [spel_h_shufA]
    vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movu xm5, [srcq+ssq*0] ; 4
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m8, [buf+ 4]
    vpbroadcastd m9, [buf+ 8]
    vpbroadcastd ym12, xmm1
    vpbroadcastd ym13, [buf+20]
    vpbroadcastd ym14, [buf+24]
    cmp wd, 4
    je .hv_w4
    ; w == 2. The spel_h_shufA mask is already in m6; m2 is produced by the
    ; punpcklqdq below, so no separate mask load into m2 is needed.
    mova m3, [spel_h_shuf2b]
    mova m1, m10
    pshufb m4, m6
    pshufb xm5, xm6
    punpcklqdq m2, m4, m5
    vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_
    mova ym6, [spel_h_shuf2a]
    punpckhqdq m4, m5
    mova xm5, [spel_shuf2]
    vpdpwssd m1, m9, m4
    vpermb m1, m3, m1 ; 01 12
    vextracti32x4 xm2, ym1, 1 ; 23 34
.hv_w2_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    vpermb ym3, ym6, ym3
    pmaddwd xmm0, xm12, xm1 ; a0 b0
    mova xm4, xm10
    vpdpwssd xm4, xm8, xm3
    vextracti32x4 xm3, ym3, 1
    mova xm1, xm2
    vpdpwssd xmm0, xm13, xm2 ; a1 b1
    vpdpwssd xm4, xm9, xm3 ; 5 6
    vpermt2b xm2, xm5, xm4 ; 45 56
    vpdpwssd xmm0, xm14, xm2 ; a2 b2
    psrad xmm0, 10
    packusdw xmm0, xmm0
    pminsw xmm0, xm15
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti32x4 m7, [spel_h_shufB]
    mova ym0, [spel_shuf4a]
    pshufb m1, m4, m6
    mova m2, m10
    vpdpwssd m2, m8, m1
    pshufb xm1, xm5, xm6
    mova xm3, xm10
    vpdpwssd xm3, xm8, xm1
    pshufb m4, m7
    pshufb xm5, xm7
    vpdpwssd m2, m9, m4 ; 0 1 2 3
    vpdpwssd xm3, xm9, xm5 ; 4
    mova ym5, [spel_shuf4b]
    vpermb m1, m0, m2 ; 01 12
    vshufi32x4 m2, m3, q1032 ; 2 3 4
    vpermb m2, m0, m2 ; 23 34
.hv_w4_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    pmaddwd ym0, ym12, ym1 ; a0 b0
    mova ym1, ym2
    pshufb ym4, ym3, ym6
    mova ym2, ym10
    vpdpwssd ym2, ym8, ym4
    pshufb ym3, ym7
    vpdpwssd ym0, ym13, ym1 ; a1 b1
    vpdpwssd ym2, ym9, ym3 ; 5 6
    vpermt2b ym2, ym5, ym1 ; 45 56
    vpdpwssd ym0, ym14, ym2 ; a2 b2
    psrad ym0, 10
    vextracti32x4 xm4, ym0, 1
    packusdw xm0, xm4
    pminsw xmm0, xm0, xm15
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET

; ---- horizontal + vertical, w >= 8: 6 taps in both directions.
; ---- m8 = rounding, m9..m11 = horizontal tap pairs, m12..m14 = vertical ----
.hv_w8:
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 4
    neg r6
    test dword r8m, 0x800
    jnz .hv_w8_12bit
    vpbroadcastd m8, [pd_2176]
    psllw xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd m8, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_w8_main:
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m9, xmm0
    vpbroadcastd m10, [buf+ 4]
    vpbroadcastd m11, [buf+ 8]
    vpbroadcastd m12, xmm1
    vpbroadcastd m13, [buf+20]
    vpbroadcastd m14, [buf+24]
    cmp wd, 16
    jge .hv_w16 ; the flags are tested again at .hv_w16 to split off w > 16
    mova m6, [spel_h_shufA]
    movu ym16, [srcq+r6 *2]
    vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1
    movu ym17, [srcq+ssq*0]
    vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3
    lea srcq, [srcq+ssq*2]
    movu ym18, [srcq+ssq*0] ; 4
    movu m7, [spel_h_shufC]
    vpermb m3, m6, m16
    mova m1, m8
    vpermb m4, m6, m17
    vpdpwssd m1, m9, m3 ; a0 b0
    mova m2, m8
    vpermb m5, m6, m18
    vpdpwssd m2, m9, m4 ; c0 d0
    mova m0, m8
    vpermb m16, m7, m16
    vpdpwssd m0, m9, m5 ; e0
    vpermb m17, m7, m17
    vpdpwssd m1, m11, m16 ; a2 b2
    vpermb m18, m7, m18
    vpdpwssd m2, m11, m17 ; c2 d2
    shufpd m3, m16, 0x55
    vpdpwssd m0, m11, m18 ; e2
    mova m16, [spel_shuf8a]
    shufpd m4, m17, 0x55
    vpdpwssd m1, m10, m3 ; a1 b1
    shufpd m5, m18, 0x55
    vpdpwssd m2, m10, m4 ; c1 d1
    vpdpwssd m0, m10, m5 ; e1
    mova m5, [spel_shuf8b]
    vpermt2b m1, m16, m2 ; 01 12
    vpermt2b m2, m16, m0 ; 23 34
.hv_w8_loop:
    movu ym18, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0], 1
    mova m0, m8
    vpermb m17, m6, m18
    vpdpwssd m0, m9, m17 ; f0 g0
    vpermb m18, m7, m18
    pmaddwd m16, m12, m1 ; A0 B0
    vpdpwssd m0, m11, m18 ; f2 g2
    shufpd m17, m18, 0x55
    mova m1, m2
    vpdpwssd m16, m13, m2 ; A1 B1
    vpdpwssd m0, m10, m17 ; f1 g1
    vpermt2b m2, m5, m0 ; 45 56
    vpdpwssd m16, m14, m2 ; A2 B2
    psrad m16, 10
    vextracti32x8 ym17, m16, 1
    packusdw ym16, ym17
    pminsw ym16, ym15
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    vbroadcasti32x4 m20, [spel_h_shufA]
    vbroadcasti32x4 m21, [spel_h_shufB]
    jg .hv_w32 ; still the flags of `cmp wd, 16`
    vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
    vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
    movu ym16, [srcq+r6 *1+ 0]
    movu ym17, [srcq+r6 *1+12]
    vinserti32x8 m16, [srcq+ssq*0+ 0], 1
    vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4
    pshufb m2, m20
    mova m1, m8
    pshufb m3, m16, m20
    vpdpwssd m1, m11, m2 ; a2
    mova m2, m8
    pshufb m4, m17, m21
    vpdpwssd m2, m9, m3 ; b0 c0
    mova m3, m8
    pshufb m5, m18, m20
    vpdpwssd m3, m11, m4 ; b2' c2'
    mova m4, m8
    pshufb m7, m19, m21
    vpdpwssd m4, m9, m5 ; d0 e0
    mova m5, m8
    pshufb m0, m6, m20
    vpdpwssd m5, m11, m7 ; d2' e2'
    mova m7, [spel_shuf16]
    pshufb m16, m21
    vpdpwssd m1, m9, m0 ; a0
    pshufb m17, m20
    vpdpwssd m2, m10, m16 ; b1 c1
    pshufb m18, m21
    vpdpwssd m3, m10, m17 ; b1' c1'
    pshufb m19, m20
    vpdpwssd m4, m10, m18 ; d1 e1
    pshufb m6, m21
    vpdpwssd m5, m10, m19 ; d1' e1'
    shufpd m16, m17, 0x55
    vpdpwssd m1, m10, m6 ; a1
    shufpd m18, m19, 0x55
    vpdpwssd m2, m11, m16 ; b2 c2
    vpdpwssd m3, m9, m16 ; b0' c0'
    vpdpwssd m4, m11, m18 ; d2 e2
    vpdpwssd m5, m9, m18 ; d0' e0'
    pslldq m1, 1
    vpermt2b m2, m7, m3 ; 12
    vpermt2b m4, m7, m5 ; 34
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1
    mova m5, m8
    mova m6, m8
    pshufb m17, m18, m20
    vpdpwssd m5, m9, m17 ; f0 g0
    pshufb m16, m19, m21
    vpdpwssd m6, m11, m16 ; f2' g2'
    pmaddwd m17, m12, m2 ; B0
    mova m2, m4
    pmaddwd m16, m12, m1 ; A0
    mova m1, m3
    pshufb m18, m21
    vpdpwssd m5, m10, m18 ; f1 g1
    pshufb m19, m20
    vpdpwssd m6, m10, m19 ; f1' g1'
    vpdpwssd m17, m13, m4 ; B1
    vpdpwssd m16, m13, m3 ; A1
    shufpd m18, m19, 0x55
    vpdpwssd m5, m11, m18 ; f2 g2
    vpdpwssd m6, m9, m18 ; f0' g0'
    mova m4, m7
    vpermi2b m4, m5, m6 ; 56
    vpshrdd m3, m2, m4, 16 ; 45
    vpdpwssd m17, m14, m4 ; B2
    vpdpwssd m16, m14, m3 ; A2
    psrad m16, 10
    psrad m17, 10
    vshufi32x4 m18, m16, m17, q3232
    vinserti32x8 m16, ym17, 1
    packusdw m16, m18
    pminsw m16, m15
    mova [dstq+dsq*0], ym16
    vextracti32x8 [dstq+dsq*1], m16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
.hv_w32:
    ; w >= 32: 32-pixel wide column strips, same wd packing (strip count
    ; above bit 8, h in the low byte) as in .v_w32.
    WIN64_SPILL_XMM 28
    mova m27, [spel_shuf32]
    lea wd, [hq+wq*8-256]
.hv_w32_loop0:
    movu m16, [srcq+r6 *2+ 0]
    movu m7, [srcq+r6 *2+12]
    movu m6, [srcq+r6 *1+ 0]
    movu m18, [srcq+r6 *1+12]
    lea r7, [srcq+ssq*2]
    movu m17, [srcq+ssq*0+ 0]
    movu m19, [srcq+ssq*0+12]
    movu m22, [srcq+ssq*1+ 0]
    movu m24, [srcq+ssq*1+12]
    mov r8, dstq ; r8 is reused as the per-strip dst pointer from here on
    movu m23, [r7 +ssq*0+ 0]
    movu m25, [r7 +ssq*0+12]
    pshufb m1, m16, m20
    mova m0, m8
    pshufb m2, m7, m21
    vpdpwssd m0, m9, m1 ; a0
    mova m1, m8
    pshufb m4, m6, m20
    vpdpwssd m1, m11, m2 ; a2'
    mova m2, m8
    pshufb m3, m17, m20
    vpdpwssd m2, m9, m4 ; b0
    mova m4, m8
    pshufb m5, m18, m21
    vpdpwssd m4, m9, m3 ; c0
    mova m3, m8
    pshufb m26, m19, m21
    vpdpwssd m3, m11, m5 ; b2'
    mova m5, m8
    pshufb m16, m21
    vpdpwssd m5, m11, m26 ; c2'
    pshufb m7, m20
    vpdpwssd m0, m10, m16 ; a1
    pshufb m6, m21
    vpdpwssd m1, m10, m7 ; a1'
    pshufb m17, m21
    vpdpwssd m2, m10, m6 ; b1
    pshufb m18, m20
    vpdpwssd m4, m10, m17 ; c1
    pshufb m19, m20
    vpdpwssd m3, m10, m18 ; b1'
    shufpd m16, m7, 0x55
    vpdpwssd m5, m10, m19 ; c1'
    shufpd m6, m18, 0x55
    vpdpwssd m0, m11, m16 ; a2
    shufpd m17, m19, 0x55
    vpdpwssd m1, m9, m16 ; a0'
    pshufb m16, m22, m20
    vpdpwssd m2, m11, m6 ; b2
    pshufb m7, m23, m20
    vpdpwssd m4, m11, m17 ; c2
    vpdpwssd m3, m9, m6 ; b0'
    mova m6, m8
    vpdpwssd m5, m9, m17 ; c0'
    pshufb m17, m24, m21
    vpdpwssd m6, m9, m16 ; d0
    mova m16, m8
    pshufb m26, m25, m21
    vpdpwssd m16, m9, m7 ; e0
    mova m7, m8
    pshufb m22, m21
    vpdpwssd m7, m11, m17 ; d2'
    mova m17, m8
    pshufb m23, m21
    vpdpwssd m17, m11, m26 ; e2'
    pshufb m24, m20
    vpdpwssd m6, m10, m22 ; d1
    pshufb m25, m20
    vpdpwssd m16, m10, m23 ; e1
    shufpd m22, m24, 0x55
    vpdpwssd m7, m10, m24 ; d1'
    shufpd m23, m25, 0x55
    vpdpwssd m17, m10, m25 ; e1'
    pslldq m0, 1
    vpdpwssd m6, m11, m22 ; d2
    pslldq m1, 1
    vpdpwssd m16, m11, m23 ; e2
    vpermt2b m2, m27, m4 ; 12
    vpdpwssd m7, m9, m22 ; d0'
    vpermt2b m3, m27, m5 ; 12'
    vpdpwssd m17, m9, m23 ; e0'
    vpshrdd m0, m2, 16 ; 01
    vpermt2b m6, m27, m16 ; 34
    vpshrdd m1, m3, 16 ; 01'
    vpermt2b m7, m27, m17 ; 34'
    vpshrdd m4, m2, m6, 16 ; 23
    vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    movu m22, [r7+ssq*1+ 0]
    movu m24, [r7+ssq*1+12]
    lea r7, [r7+ssq*2]
    movu m23, [r7+ssq*0+ 0]
    movu m25, [r7+ssq*0+12]
    pmaddwd m17, m12, m2 ; B0
    mova m2, m6
    pmaddwd m19, m12, m3 ; B0'
    mova m3, m7
    pmaddwd m16, m12, m0 ; A0
    mova m0, m4
    pmaddwd m18, m12, m1 ; A0'
    mova m1, m5
    vpdpwssd m17, m13, m6 ; B1
    vpdpwssd m19, m13, m7 ; B1'
    mova m6, m8
    vpdpwssd m16, m13, m4 ; A1
    pshufb m4, m22, m20
    vpdpwssd m18, m13, m5 ; A1'
    pshufb m7, m23, m20
    vpdpwssd m6, m9, m4 ; f0
    mova m4, m8
    pshufb m5, m24, m21
    vpdpwssd m4, m9, m7 ; g0
    mova m7, m8
    pshufb m26, m25, m21
    vpdpwssd m7, m11, m5 ; f2'
    mova m5, m8
    pshufb m22, m21
    vpdpwssd m5, m11, m26 ; g2'
    pshufb m23, m21
    vpdpwssd m6, m10, m22 ; f1
    pshufb m24, m20
    vpdpwssd m4, m10, m23 ; g1
    pshufb m25, m20
    vpdpwssd m7, m10, m24 ; f1'
    shufpd m22, m24, 0x55
    vpdpwssd m5, m10, m25 ; g1'
    shufpd m23, m25, 0x55
    vpdpwssd m6, m11, m22 ; f2
    vpdpwssd m4, m11, m23 ; g2
    vpdpwssd m7, m9, m22 ; f0'
    vpdpwssd m5, m9, m23 ; g0'
    vpermt2b m6, m27, m4 ; 56
    vpermt2b m7, m27, m5 ; 56'
    vpdpwssd m17, m14, m6 ; B2
    vpshrdd m4, m2, m6, 16 ; 45
    vpdpwssd m19, m14, m7 ; B2'
    vpshrdd m5, m3, m7, 16 ; 45'
    vpdpwssd m16, m14, m4 ; A2
    vpdpwssd m18, m14, m5 ; A2'
    REPX {psrad x, 10}, m17, m19, m16, m18
    packusdw m17, m19
    packusdw m16, m18
    pminsw m17, m15
    pminsw m16, m15
    mova [r8+dsq*0], m16
    mova [r8+dsq*1], m17
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .hv_w32_loop
    add srcq, 64
    add dstq, 64
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w32_loop0
    RET

; Entry stubs for every filter combination that involves SHARP; these use the
; full 8-tap function that follows.
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp, SHARP, SHARP

cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movifnidn wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastd m10, [pd_32]
    pmovsxbw xmm0, [base+subpel_filters+myq*8]
    tzcnt r7d, wd
    vpbroadcastw m11, r8m
    lea r6, [ssq*3]
    movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
    sub srcq, r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd m12, xmm0
    add r7, r8
    vpbroadcastd m13, [rsp+stack_offset+12]
    vpbroadcastd m14, [rsp+stack_offset+16]
    vpbroadcastd m15, [rsp+stack_offset+20]
    jmp r7
.v_w2:
    movd xmm2, [srcq+ssq*0]
    pinsrd xmm2, [srcq+ssq*1], 1
    pinsrd xmm2, [srcq+ssq*2], 2
    add srcq, r6
    pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd xmm3, [srcq+ssq*1]
    vpbroadcastd xmm1, [srcq+ssq*2]
    add srcq, r6
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm3, xmm1, 0x02 ; 4 5
    vpblendd xmm1, xmm0, 0x02 ; 5 6
    palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
    punpcklwd xmm3, xmm1 ; 45 56
    punpcklwd xmm1, xmm2, xmm4 ; 01 12
    punpckhwd xmm2, xmm4 ; 23 34
.v_w2_loop:
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova xmm5, xm10
    vpdpwssd xmm5, xm12, xmm1 ; a0 b0
    mova xmm1, xmm2
    vpdpwssd xmm5, xm13, xmm2 ; a1 b1
    mova xmm2, xmm3
    vpdpwssd xmm5, xm14, xmm3 ; a2 b2
    vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 7 8
    punpcklwd xmm3, xmm4 ; 67 78
    vpdpwssd xmm5, xm15, xmm3 ; a3 b3
    psrad xmm5, 6
    packusdw xmm5, xmm5
    pminsw xmm5, xm11
    movd [dstq+dsq*0], xmm5
    pextrd [dstq+dsq*1], xmm5, 1
    ; NOTE: the source operand of the next instruction is on the following
    ; source line.
    lea dstq,
sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm5, [srcq+ssq*2] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 psrad ymm4, 6 vextracti128 xmm5, ymm4, 1 packusdw xmm4, xmm5 pminsw xmm4, xm11 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+ssq*2] vinserti32x4 m1, m2, [srcq+ssq*0], 0 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+ssq*0], 1 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 psrad m4, 6 vextracti32x8 ym5, m4, 1 packusdw ym4, ym5 pminsw ym4, ym11 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: 
vbroadcasti32x8 m0, [srcq+ssq*1] vinserti32x8 m1, m0, [srcq+ssq*2], 1 vinserti32x8 m0, [srcq+ssq*0], 0 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+ssq*0] vinserti32x8 m3, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m1, m8, m1 ; 12 vpermb m0, m8, m0 ; 01 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m7, m10 vpdpwssd m7, m12, m1 ; b0 mova m6, m10 vpdpwssd m6, m12, m0 ; a0 mova m1, m3 vpdpwssd m7, m13, m3 ; b1 mova m0, m2 vpdpwssd m6, m13, m2 ; a1 mova m3, m5 vpdpwssd m7, m14, m5 ; b2 mova m2, m4 vpdpwssd m6, m14, m4 ; a2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m3, m5, 16 ; 67 vpdpwssd m7, m15, m5 ; b3 vpdpwssd m6, m15, m4 ; a3 psrad m7, 6 psrad m6, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 mova [dstq+dsq*0], ym6 vextracti32x8 [dstq+dsq*1], m6, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: WIN64_SPILL_XMM 23 lea wd, [hq+wq*8-256] .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] lea r7, [srcq+r6 ] movu m18, [srcq+ssq*2] movu m19, [r7 +ssq*0] mov r8, dstq movu m20, [r7 +ssq*1] movu m21, [r7 +ssq*2] add r7, r6 movu m22, [r7 +ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 
vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [r7+ssq*1] lea r7, [r7+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [r7+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h REPX {psrad x, 6}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [r8+dsq*0], m6 mova [r8+dsq*1], m7 lea r8, [r8+dsq*2] sub hd, 2 jg .v_w32_loop add srcq, 64 add dstq, 64 movzx hd, wb sub wd, 1<<8 jg .v_w32_loop0 RET .h_w2: RESET_STACK_STATE mova ym2, [spel_h_shuf2a] sub srcq, 2 pshufd xmm3, xmm0, q1111 pshufd xmm4, xmm0, q2222 .h_w2_loop: movu xm1, [srcq+ssq*0] vinserti32x4 ym1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova xmm0, xm8 vpermb ym1, ym2, ym1 vpdpwssd xmm0, xmm3, xm1 vextracti32x4 xm1, ym1, 1 vpdpwssd xmm0, xmm4, xm1 psrad xmm0, 6 packusdw xmm0, xmm0 pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] jl .h_w2 vbroadcasti32x4 ym4, [spel_h_shufA] vbroadcasti32x4 ym5, [spel_h_shufB] sub srcq, 2 pshufd xmm0, xmm0, q2211 vpbroadcastq ym6, xmm0 vpermq ym7, ymm0, q1111 .h_w4_loop: movu xm2, [srcq+ssq*0] vinserti32x4 ym2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova ym0, ym8 pshufb ym1, ym2, ym4 vpdpwssd ym0, ym6, ym1 pshufb ym2, ym5 vpdpwssd ym0, ym7, ym2 psrad ym0, 6 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 pminsw xmm0, xm0, xm15 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, 
m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 vpermb m1, m7, m2 vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym15 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h: vpbroadcastw m15, r8m test myd, 0xf00 jnz .hv mov r7d, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 jle .h_w4 shr mxd, 16 sub srcq, 6 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] sub wd, 16 jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] jg .h_w32 .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+16] vinserti32x8 m3, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m11, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m13, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a2 vpdpwssd m1, m10, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a3 vpdpwssd m1, m11, m2 ; b1 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m10, m4 ; b0 vpdpwssd m0, m12, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m11, m3 ; b1 vpdpwssd m0, m13, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m12, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m11, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m13, m4 ; b3 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m15 mova [dstq+r6*2], m0 add r6, 32 jl 
.h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym11, xmm1 vpbroadcastd ym12, [buf+20] vpbroadcastd ym13, [buf+24] vpbroadcastd ym14, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova ym6, [spel_h_shuf2a] mova xm7, [spel_shuf2] mova m1, m10 pshufb m4, m2 pshufb m0, m2 punpcklqdq m2, m4, m0 vpdpwssd m1, m8, m2 ; 04 15 26 3_ punpckhqdq m4, m0 vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 vextracti32x4 xm3, m1, 2 ; 45 56 .hv_w2_loop: movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 mova xm4, xm10 vpermb ym5, ym6, ym5 pmaddwd xmm0, xm11, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 vpdpwssd xmm0, xm12, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 vpdpwssd xmm0, xm13, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 vpdpwssd xmm0, xm14, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm15 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 
vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 pmaddwd ym16, ym11, ym1 ; a0 b0 mova ym1, ym2 mova ym2, ym3 pshufb ym17, ym18, ym19 mova ym3, ym10 vpdpwssd ym3, ym8, ym17 pshufb ym18, ym20 vpdpwssd ym16, ym12, ym1 ; a1 b1 vpdpwssd ym3, ym9, ym18 ; 7 8 vpdpwssd ym16, ym13, ym2 ; a2 b2 vpermt2b ym3, ym7, ym2 ; 67 78 vpdpwssd ym16, ym14, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 pminsw xm16, xm15 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m11, xmm0 vpbroadcastd m12, [buf+ 4] vpbroadcastd m13, [buf+ 8] vpbroadcastd m14, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] cmp wd, 8 jg .hv_w16 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 movu ym9, [srcq+ssq*2] add srcq, r6 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 movu ym20, [srcq+ssq*1] vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+ssq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m11, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m11, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m11, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 
vpdpwssd m4, m11, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m12, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m12, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m12, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m12, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m13, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m13, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m13, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m13, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m14, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m14, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m14, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m14, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 vpdpwssd m4, m11, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 vpdpwssd m4, m12, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m13, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m14, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 pminsw ym20, ym15 mova [dstq+dsq*0], xm20 vextracti128 [dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 26 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] add wd, wd mova m9, [spel_shuf16] lea wd, [hq+wq*8-256] .hv_w16_loop0: vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] lea r7, [srcq+r6] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 movu ym22, [r7 +ssq*0+ 0] movu ym23, [r7 +ssq*0+16] mov r8, dstq vinserti32x8 m22, [r7 +ssq*1+ 0], 1 vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4 
movu ym24, [r7 +ssq*2+ 0] movu ym25, [r7 +ssq*2+16] add r7, r6 vinserti32x8 m24, [r7 +ssq*0+ 0], 1 vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m11, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m11, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m13, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m12, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m12, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m14, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m13, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m13, m7 ; b2 vpdpwssd m3, m11, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m14, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m14, m6 ; b3 vpdpwssd m3, m12, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m11, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m13, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m11, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m13, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m12, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m14, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m12, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m14, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m13, m23 ; d2 vpdpwssd m5, m11, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m13, m25 ; f2 vpdpwssd m7, m11, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m14, m22 ; d3 vpdpwssd m5, m12, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m14, m24 ; f3 vpdpwssd m7, m12, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [r7+ssq*1+ 0] movu ym25, [r7+ssq*1+16] lea r7, [r7+ssq*2] vinserti32x8 m24, [r7+ssq*0+ 0], 1 vinserti32x8 m25, [r7+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m11, m0 ; h0 pshufb m0, m25, m20 vpdpwssd m8, m13, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, 
m12, m0 ; h1
    pshufb m0, m25, m21
    vpdpwssd m8, m14, m0 ; i3
    vpdpwssd m22, m17, m3 ; A1
    mova m3, m5
    vpdpwssd m23, m17, m4 ; B1
    mova m4, m6
    shufpd m24, m25, 0x55
    pshufb m25, m24, m20
    vpdpwssd m7, m13, m25 ; h2
    vpdpwssd m8, m11, m25 ; i0
    vpdpwssd m22, m18, m5 ; A2
    vpdpwssd m23, m18, m6 ; B2
    pshufb m24, m21
    vpdpwssd m7, m14, m24 ; h3
    vpdpwssd m8, m12, m24 ; i1
    vpermt2b m7, m9, m8 ; 78
    vpshrdd m5, m6, m7, 16 ; 67
    vpdpwssd m22, m19, m5 ; A3
    vpdpwssd m23, m19, m7 ; B3
    mova m6, m7
    psrad m22, 10
    psrad m23, 10
    vshufi32x4 m0, m22, m23, q3232
    vinserti32x8 m22, ym23, 1
    packusdw m22, m0
    pminsw m22, m15
    mova [r8+dsq*0], ym22
    vextracti32x8 [r8+dsq*1], m22, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    add srcq, 32
    add dstq, 32
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w16_loop0
    RET

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN regular, REGULAR, REGULAR

; prep_6tap_16bpc(tmp, src, ss, w, h, mx, my)
;
; Writes filtered rows to tmp; the filter coefficients are also parked in
; [tmpq] as scratch before the first output store so they can be broadcast
; pairwise with vpbroadcastd.
;
; mx and my are each multiplied by 0x010101 and then offset by t0d / t1d, so
; that different byte fields of the register select the 6-tap table index, the
; raw fraction (bits 8-11, tested with 0xf00) and the 4-tap table index (see
; the inline field comments).
; NOTE(review): t0d/t1d are presumably loaded by the FN macro of the variants
; declared above, which is not visible here - confirm.
;
; Dispatch:
;   mx & 0xf00 != 0  -> .h   (.h itself continues to .hv when my & 0xf00 != 0)
;   my & 0xf00 != 0  -> .v
;   neither          -> .prep (jump table lookup on tzcnt(w))
;
; r7m is read as bitdepth_max throughout; (bitdepth_max >> 11) indexes the
; per-bitdepth tables prep_mul and prep_hv_shift.
cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    mov wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
; No fractional part in either direction: tail-jump through the prep table.
; m5 (pw_8192) and m4 (prep_mul entry) and r6 (= 3*ss) are set up for the
; jump target, which is not part of this function.
; NOTE(review): on WIN64 r7 is popped before the jump, presumably to undo the
; prologue push because the target returns on its own - confirm.
.prep:
    tzcnt wd, wd
    mov r5d, r7m ; bitdepth_max
    vpbroadcastd m5, [pw_8192]
    movzx wd, word [r7+wq*2+table_offset(prep,)]
    shr r5d, 11
    vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
    add wq, r7
    lea r6, [ssq*3]
%if WIN64
    pop r7
%endif
    jmp wq
; Horizontal only, reached from .h when w < 16 and w != 4: four source rows
; per iteration (rows 0/1 in m4, rows 2/3 in m5), 64 bytes of output per
; iteration.
.h_w8:
    mova m6, [spel_h_shufA]
    movu m7, [spel_h_shufC]
    mova m8, [prep_endB]
.h_w8_loop:
    movu ym4, [srcq+ssq*0]
    vinserti32x8 m4, [srcq+ssq*1], 1
    movu ym5, [srcq+ssq*2]
    vinserti32x8 m5, [srcq+r6 ], 1
    lea srcq, [srcq+ssq*4]
    mova m0, m10
    mova m1, m10
    vpermb m2, m6, m4
    vpermb m3, m6, m5
    vpdpwssd m0, m12, m2 ; a0 b0
    vpdpwssd m1, m12, m3 ; c0 d0
    vpermb m4, m7, m4
    vpermb m5, m7, m5
    vpdpwssd m0, m14, m4 ; a2 b2
    vpdpwssd m1, m14, m5 ; c2 d2
    shufpd m2, m4, 0x55
    shufpd m3, m5, 0x55
    vpdpwssd m0, m13, m2 ; a1 b1
    vpdpwssd m1, m13, m3 ; c1 d1
    vpermt2b m0, m8, m1
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .h_w8_loop
    RET
; Horizontal entry. m10 = rounding constant used to seed every accumulator.
; w == 4 is delegated to the .h_w4 path of prep_8tap_16bpc. Otherwise the
; filter row is taken from subpel_filters+1 (skipping the first byte of the
; 8-byte entry), sign-extended to words, shifted left by the per-bitdepth
; prep_hv_shift amount and broadcast as three coefficient pairs m12/m13/m14.
.h:
    vpbroadcastd m10, [prep_8tap_rnd]
    test myd, 0xf00
    jnz .hv
    lea r6, [ssq*3]
    cmp wd, 4
    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    mov r5d, r7m
    sub srcq, 4
    shr r5d, 11
    psllw xmm0, [base+prep_hv_shift+r5*8]
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    cmp wd, 16
    jl .h_w8
    vbroadcasti32x4 m5, [spel_h_shufA]
    vbroadcasti32x4 m6, [spel_h_shufB]
    mova m7, [prep_endC]
    ; flags still come from "cmp wd, 16" above: w > 16 takes the wide path
    jg .h_w32
; w == 16: two source rows per iteration, each row read as two overlapping
; 32-byte loads (byte offsets 0 and 12).
.h_w16_loop:
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+12]
    vinserti32x8 m3, [srcq+ssq*1+12], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m5 ; 01
    vpdpwssd m0, m12, m4 ; a0 b0
    pshufb m4, m3, m6 ; 89
    vpdpwssd m1, m14, m4 ; a2' b2'
    pshufb m2, m6 ; 23
    pshufb m3, m5 ; 67
    vpdpwssd m0, m13, m2 ; a1 b1
    vpdpwssd m1, m13, m3 ; a1' b1'
    shufpd m2, m3, 0x55 ; 45
    vpdpwssd m0, m14, m2 ; a2 b2
    vpdpwssd m1, m12, m2 ; a0' b0'
    vpermt2b m0, m7, m1
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 2
    jg .h_w16_loop
    RET
; w > 16: one row at a time. srcq is advanced to the end of the row and wq
; negated so that r6 runs from -w up to 0 in steps of 32 pixels.
.h_w32:
    lea srcq, [srcq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+12]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m5
    vpdpwssd m0, m12, m4
    pshufb m4, m3, m6
    vpdpwssd m1, m14, m4
    pshufb m2, m6
    pshufb m3, m5
    vpdpwssd m0, m13, m2
    vpdpwssd m1, m13, m3
    shufpd m2, m3, 0x55
    vpdpwssd m0, m14, m2
    vpdpwssd m1, m12, m2
    vpermt2b m0, m7, m1
    mova [tmpq], m0
    add tmpq, 64
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    dec hd
    jg .h_w32_loop0
    RET
; Vertical only. When h == 4 the low-byte (4-tap) index replaces the 6-tap
; one. Coefficients are prepared exactly as in .h. r6 is set to -ss so that
; [srcq+r6*2] and [srcq+r6*1] address the two rows above srcq. The width
; specific code is reached through the _6tap_v jump table.
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    vpbroadcastd m10, [prep_8tap_rnd]
    pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
    tzcnt r6d, wd
    shr r5d, 11
    movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
    psllw xmm0, [base+prep_hv_shift+r5*8]
    add r7, r6
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    mov r6, ssq
    vpbroadcastd m13, [tmpq+ 4]
    neg r6
    vpbroadcastd m14, [tmpq+ 8]
    jmp r7
; w == 4: four output rows (32 bytes) per iteration; k1 masks the lanes that
; the masked broadcasts/inserts are allowed to overwrite.
.v_w4:
    mov r3d, 0x330c
    movq xm1, [srcq+r6 *2]
    kmovw k1, r3d
    vpbroadcastq ym1{k1}, [srcq+r6 *1]
    vpbroadcastq m2, [srcq+ssq*0]
    vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3
    movq xm0, [srcq+ssq*2]
    mova ym4, [prep_endA]
    valignq m0, m1, 2
    punpcklwd m1, m0 ; 01 12 23 34
.v_w4_loop:
    lea srcq, [srcq+ssq*4]
    movq xm2, [srcq+r6 *1]
    vpbroadcastq ym2{k1}, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3
    mova m3, m10
    vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
    valignq m0, m2, m0, 6 ; 4 5 6 7
    punpcklwd m0, m2 ; 45 56 67 78
    vpdpwssd m3, m14, m0 ; a2 b2 c2 d2
    vshufi32x4 m1, m0, q1032 ; 23 34 45 56
    vpdpwssd m3, m13, m1 ; a1 b1 c1 d1
    mova m1, m0
    mova m0, m2
    vpermb m3, m4, m3
    mova [tmpq], ym3
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    RET
; w == 8: four output rows (64 bytes) per iteration; row pairs are interleaved
; with the spel_v_shuf8 byte permutation.
.v_w8:
    vbroadcasti32x4 ym1, [srcq+r6 *1]
    mov r3d, 0x33
    vbroadcasti32x4 m2, [srcq+ssq*0]
    kmovb k1, r3d
    mova m6, [spel_v_shuf8]
    vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
    vbroadcasti32x4 ym0, [srcq+ssq*1]
    vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
    mova m7, [prep_endB]
    vpermb m1, m6, m1 ; 01 12
    vpermb m2, m6, m0 ; 23 34
.v_w8_loop:
    lea srcq, [srcq+ssq*4]
    vbroadcasti32x4 ym3, [srcq+r6 *1]
    movu xm4, [srcq+ssq*0]
    vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6
    vbroadcasti32x4 ym0, [srcq+ssq*1]
    vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
    mova m4, m10
    vpdpwssd m4, m12, m1 ; a0 b0
    mova m5, m10
    vpdpwssd m5, m12, m2 ; c0 d0
    vpermb m1, m6, m3 ; 45 56
    vpdpwssd m4, m13, m2 ; a1 b1
    vpermb m2, m6, m0 ; 67 78
    vpdpwssd m5, m13, m1 ; c1 d1
    vpdpwssd m4, m14, m1 ; a2 b2
    vpdpwssd m5, m14, m2 ; c2 d2
    vpermt2b m4, m7, m5
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
; w == 16: two output rows (64 bytes) per iteration; the odd row pairs
; (23, 45) are derived from their neighbours with vpshrdd.
.v_w16:
    vbroadcasti32x8 m0, [srcq+r6 *1]
    vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2
    vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1
    mova m6, [spel_v_shuf16]
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4
    mova m7, [prep_endA]
    vpermb m1, m6, m1 ; 12
    vpermb m0, m6, m0 ; 01
    vpermb m3, m6, m3 ; 34
    vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova m5, m10
    vpdpwssd m5, m12, m1 ; b0
    mova m4, m10
    vpdpwssd m4, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpdpwssd m4, m13, m2 ; a1
    vinserti32x8 m3, [srcq+ssq*0], 1
    mova m0, m2
    vpermb m3, m6, m3 ; 56
    vpshrdd m2, m1, m3, 16 ; 45
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m4, m14, m2 ; a2
    vpermt2b m4, m7, m5
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 2
    jg .v_w16_loop
    RET
; w >= 32: processed in 64-byte wide column strips, two rows per inner
; iteration. r5 = h + w*8 - 256 keeps h in its low byte (restored with
; "movzx hd, r5b") and a strip counter in the bits above (stepped with
; "sub r5d, 1<<8"). r7/r8 are the per-strip source/tmp cursors; output rows
; are w*2 bytes apart. r8 is saved/restored manually on WIN64.
.v_w32:
.v_w64:
.v_w128:
%if WIN64
    push r8
%endif
    mova m11, [prep_endC]
    lea r5, [hq+wq*8-256]
.v_w32_loop0:
    movu m4, [srcq+r6 *2]
    movu m5, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    movu m6, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    mov r8, tmpq
    movu m8, [r7 +ssq*0]
    punpcklwd m0, m4, m5 ; 01
    punpckhwd m4, m5
    punpcklwd m1, m5, m6 ; 12
    punpckhwd m5, m6
    punpcklwd m2, m6, m7 ; 23
    punpckhwd m6, m7
    punpcklwd m3, m7, m8 ; 34
    punpckhwd m7, m8
.v_w32_loop:
    mova m16, m10
    movu m9, [r7+ssq*1]
    mova m18, m10
    vpdpwssd m16, m12, m0 ; a0
    mova m17, m10
    vpdpwssd m18, m12, m4
    mova m19, m10
    vpdpwssd m17, m12, m1 ; b0
    lea r7, [r7+ssq*2]
    vpdpwssd m19, m12, m5
    mova m0, m2
    vpdpwssd m16, m13, m2 ; a1
    punpcklwd m2, m8, m9 ; 45
    mova m4, m6
    vpdpwssd m18, m13, m6
    punpckhwd m6, m8, m9
    movu m8, [r7+ssq*0]
    vpdpwssd m17, m13, m3 ; b1
    mova m1, m3
    vpdpwssd m19, m13, m7
    mova m5, m7
    vpdpwssd m16, m14, m2 ; a2
    punpcklwd m3, m9, m8 ; 56
    vpdpwssd m18, m14, m6
    punpckhwd m7, m9, m8
    vpdpwssd m17, m14, m3 ; b2
    vpdpwssd m19, m14, m7
    vpermt2b m16, m11, m18
    vpermt2b m17, m11, m19
    mova [r8+wq*0], m16
    mova [r8+wq*2], m17
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add tmpq, 64
    movzx hd, r5b
    sub r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop r8
%endif
    vzeroupper
    RET
; Horizontal + vertical, w == 4 (entered from .hv with m10/m11 already set).
; The horizontal filter uses the low-byte (4-tap) mx index and only the two
; middle coefficient pairs (m8/m9); the vertical filter uses m12/m13/m14.
; Four output rows (32 bytes) per iteration.
; NOTE(review): here the horizontal accumulators are seeded from m11 and the
; vertical one from m10, the reverse of the wider hv paths below - presumably
; equivalent after the intermediate byte-shuffle; confirm before touching.
.hv_w4:
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    shr r5d, 11
    neg r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m8, [tmpq+ 4]
    mov r3d, 0xf0
    vpbroadcastd m9, [tmpq+ 8]
    vpbroadcastd m12, xmm1
    movu xm3, [srcq+r6 *2]
    kmovb k1, r3d
    vinserti32x4 ym3, [srcq+r6 *1], 1
    vbroadcasti32x4 m2, [srcq+ssq*0]
    vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3
    movu xm4, [srcq+ssq*2]
    vbroadcasti32x4 m5, [spel_h_shufA]
    vbroadcasti32x4 m6, [spel_h_shufB]
    mova m1, m11
    mova m15, [spel_shuf4a]
    mova xm2, xm11
    pshufb m0, m3, m5
    vpdpwssd m1, m8, m0
    pshufb xm0, xm4, xm5
    vpdpwssd xm2, xm8, xm0
    vpbroadcastd m13, [tmpq+20]
    pshufb m3, m6
    vpbroadcastd m14, [tmpq+24]
    pshufb xm4, xm6
    mova m7, [spel_shuf4b]
    vpdpwssd m1, m9, m3 ; 0 1 2 3
    vpdpwssd xm2, xm9, xm4 ; 4
    vpermt2b m1, m15, m2 ; 01 12 23 34
    mova ym15, [prep_endA]
.hv_w4_loop:
    lea srcq, [srcq+ssq*4]
    movu xm4, [srcq+r6 *1]
    vinserti32x4 ym4, [srcq+ssq*0], 1
    vbroadcasti32x4 m3, [srcq+ssq*1]
    vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3
    mova m2, m11
    pshufb m3, m4, m5
    vpdpwssd m2, m8, m3
    mova m3, m10
    vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
    pshufb m4, m6
    vpdpwssd m2, m9, m4 ; 5 6 7 8
    mova m4, m1
    vpermt2b m1, m7, m2 ; 45 56 67 78
    vpdpwssd m3, m14, m1 ; a2 b2 c2 d2
    vshufi32x4 m4, m1, q1032 ; 23 34 45 56
    vpdpwssd m3, m13, m4 ; a1 b1 c1 d1
    vpermb m3, m15, m3
    mova [tmpq], ym3
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
; Horizontal + vertical, w == 8 (reached from .hv when w < 16). Horizontal
; taps are m12/m13/m14 seeded from m10; vertical taps are m15/m16/m17 seeded
; from m11. Five rows are filtered horizontally up front, then four output
; rows (64 bytes) are produced per iteration.
.hv_w8:
    mova m8, [spel_h_shufA]
    movu ym18, [srcq+r6 *2]
    vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1
    movu ym19, [srcq+ssq*0]
    vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3
    movu ym20, [srcq+ssq*2] ; 4
    movu m9, [spel_h_shufC]
    mova m21, [spel_shuf8a]
    mova m0, [spel_shuf8b]
    vpermb m4, m8, m18
    mova m1, m10
    vpermb m5, m8, m19
    vpdpwssd m1, m12, m4 ; a0 b0
    mova m2, m10
    vpermb m6, m8, m20
    vpdpwssd m2, m12, m5 ; c0 d0
    mova m3, m10
    vpermb m18, m9, m18
    vpdpwssd m3, m12, m6 ; e0
    mova m7, [prep_endB]
    vpermb m19, m9, m19
    vpdpwssd m1, m14, m18 ; a2 b2
    vpermb m20, m9, m20
    vpdpwssd m2, m14, m19 ; c2 d2
    shufpd m4, m18, 0x55
    vpdpwssd m3, m14, m20 ; e2
    shufpd m5, m19, 0x55
    vpdpwssd m1, m13, m4 ; a1 b1
    shufpd m6, m20, 0x55
    vpdpwssd m2, m13, m5 ; c1 d1
    vpdpwssd m3, m13, m6 ; e1
    vpermt2b m1, m21, m2 ; 01 12
    vpermt2b m2, m21, m3 ; 23 34
.hv_w8_loop:
    lea srcq, [srcq+ssq*4]
    movu ym18, [srcq+r6 *1]
    vinserti32x8 m18, [srcq+ssq*0], 1
    movu ym19, [srcq+ssq*1]
    vinserti32x8 m19, [srcq+ssq*2], 1
    mova m3, m10
    vpermb m5, m8, m18
    mova m4, m10
    vpermb m6, m8, m19
    vpdpwssd m3, m12, m5 ; f0 g0
    mova m20, m11
    vpdpwssd m4, m12, m6 ; h0 i0
    mova m21, m11
    vpdpwssd m20, m15, m1 ; A0 B0
    vpermb m18, m9, m18
    vpdpwssd m21, m15, m2 ; C0 D0
    vpermb m19, m9, m19
    vpdpwssd m3, m14, m18 ; f2 g2
    vpdpwssd m4, m14, m19 ; h2 i2
    shufpd m5, m18, 0x55
    vpdpwssd m20, m16, m2 ; A1 B1
    shufpd m6, m19, 0x55
    vpdpwssd m3, m13, m5 ; f1 g1
    vpdpwssd m4, m13, m6 ; h1 i1
    vpermt2b m2, m0, m3 ; 45 56
    vpdpwssd m21, m16, m2 ; C1 D1
    mova m1, m2
    vpermt2b m2, m0, m4 ; 67 78
    vpdpwssd m20, m17, m1 ; A2 B2
    vpdpwssd m21, m17, m2 ; C2 D2
    vpermt2b m20, m7, m21
    mova [tmpq], m20
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
; Horizontal + vertical entry (jumped to from .h, so m10 is already loaded).
; m11 = pd_128. w == 4 branches to .hv_w4, which loads its own coefficients.
; Otherwise: horizontal taps -> m12/m13/m14 (shifted by prep_hv_shift),
; vertical taps -> m15/m16/m17 (shifted left by 2); when h < 6 the low-byte
; (4-tap) my index is used. r6 = -ss. w < 16 -> .hv_w8, w > 16 -> .hv_w32,
; w == 16 falls through.
.hv:
    vpbroadcastd m11, [pd_128]
    cmp wd, 4
    je .hv_w4
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 4
    shr r5d, 11
    neg r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    vpbroadcastd m15, xmm1
    vpbroadcastd m16, [tmpq+20]
    vpbroadcastd m17, [tmpq+24]
    cmp wd, 16
    jl .hv_w8
    vbroadcasti32x4 m8, [spel_h_shufA]
    vbroadcasti32x4 m9, [spel_h_shufB]
    jg .hv_w32
    ; w == 16: horizontally filter rows 0-4 up front, then two output rows
    ; (64 bytes) per loop iteration.
    vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
    vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
    movu ym18, [srcq+r6 *1+ 0]
    movu ym19, [srcq+r6 *1+12]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2
    movu ym20, [srcq+ssq*1+ 0]
    movu ym21, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m20, [srcq+ssq*0+ 0], 1
    vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4
    pshufb m2, m8
    mova m1, m10
    pshufb m3, m18, m8
    vpdpwssd m1, m14, m2 ; a2
    mova m2, m10
    pshufb m4, m19, m9
    vpdpwssd m2, m12, m3 ; b0 c0
    mova m3, m10
    pshufb m5, m20, m8
    vpdpwssd m3, m14, m4 ; b2' c2'
    mova m4, m10
    pshufb m7, m21, m9
    vpdpwssd m4, m12, m5 ; d0 e0
    mova m5, m10
    pshufb m0, m6, m8
    vpdpwssd m5, m14, m7 ; d2' e2'
    mova m7, [spel_shuf16]
    pshufb m18, m9
    vpdpwssd m1, m12, m0 ; a0
    pshufb m19, m8
    vpdpwssd m2, m13, m18 ; b1 c1
    pshufb m20, m9
    vpdpwssd m3, m13, m19 ; b1' c1'
    pshufb m21, m8
    vpdpwssd m4, m13, m20 ; d1 e1
    pshufb m6, m9
    vpdpwssd m5, m13, m21 ; d1' e1'
    mova m0, [prep_endB]
    shufpd m18, m19, 0x55
    vpdpwssd m1, m13, m6 ; a1
    shufpd m20, m21, 0x55
    vpdpwssd m2, m14, m18 ; b2 c2
    vpdpwssd m3, m12, m18 ; b0' c0'
    vpdpwssd m4, m14, m20 ; d2 e2
    vpdpwssd m5, m12, m20 ; d0' e0'
    pslldq m1, 1
    vpermt2b m2, m7, m3 ; 12
    vpermt2b m4, m7, m5 ; 34
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1
    mova m5, m10
    mova m6, m10
    pshufb m21, m18, m8
    vpdpwssd m5, m12, m21 ; f0 g0
    pshufb m20, m19, m9
    mova m21, m11
    vpdpwssd m6, m14, m20 ; f2' g2'
    mova m20, m11
    vpdpwssd m21, m15, m2 ; B0
    mova m2, m4
    vpdpwssd m20, m15, m1 ; A0
    mova m1, m3
    pshufb m18, m9
    vpdpwssd m5, m13, m18 ; f1 g1
    pshufb m19, m8
    vpdpwssd m6, m13, m19 ; f1' g1'
    vpdpwssd m21, m16, m4 ; B1
    vpdpwssd m20, m16, m3 ; A1
    shufpd m18, m19, 0x55
    vpdpwssd m5, m14, m18 ; f2 g2
    vpdpwssd m6, m12, m18 ; f0' g0'
    mova m4, m7
    vpermi2b m4, m5, m6 ; 56
    vpshrdd m3, m2, m4, 16 ; 45
    vpdpwssd m21, m17, m4 ; B2
    vpdpwssd m20, m17, m3 ; A2
    vpermt2b m20, m0, m21
    mova [tmpq], m20
    add tmpq, 64
    sub hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
; Horizontal + vertical, w > 16: 64-byte wide column strips, two output rows
; per inner iteration. r5d packs h (low byte) and the strip counter exactly
; as in .v_w32; r7/r8 are the per-strip source/tmp cursors and output rows are
; w*2 bytes apart. Primed names in the inline comments refer to the second
; accumulator set that is fed from the +12 byte loads.
.hv_w32:
    WIN64_SPILL_XMM 29
%if WIN64
    push r8
%endif
    mova m27, [spel_shuf32]
    lea r5d, [hq+wq*8-256]
    mova m28, [prep_endC]
.hv_w32_loop0:
    movu m18, [srcq+r6 *2+ 0]
    movu m7, [srcq+r6 *2+12]
    movu m6, [srcq+r6 *1+ 0]
    movu m20, [srcq+r6 *1+12]
    lea r7, [srcq+ssq*2]
    movu m19, [srcq+ssq*0+ 0]
    movu m21, [srcq+ssq*0+12]
    movu m22, [srcq+ssq*1+ 0]
    movu m24, [srcq+ssq*1+12]
    mov r8, tmpq
    movu m23, [r7 +ssq*0+ 0]
    movu m25, [r7 +ssq*0+12]
    pshufb m1, m18, m8
    mova m0, m10
    pshufb m2, m7, m9
    vpdpwssd m0, m12, m1 ; a0
    mova m1, m10
    pshufb m4, m6, m8
    vpdpwssd m1, m14, m2 ; a2'
    mova m2, m10
    pshufb m3, m19, m8
    vpdpwssd m2, m12, m4 ; b0
    mova m4, m10
    pshufb m5, m20, m9
    vpdpwssd m4, m12, m3 ; c0
    mova m3, m10
    pshufb m26, m21, m9
    vpdpwssd m3, m14, m5 ; b2'
    mova m5, m10
    pshufb m18, m9
    vpdpwssd m5, m14, m26 ; c2'
    pshufb m7, m8
    vpdpwssd m0, m13, m18 ; a1
    pshufb m6, m9
    vpdpwssd m1, m13, m7 ; a1'
    pshufb m19, m9
    vpdpwssd m2, m13, m6 ; b1
    pshufb m20, m8
    vpdpwssd m4, m13, m19 ; c1
    pshufb m21, m8
    vpdpwssd m3, m13, m20 ; b1'
    shufpd m18, m7, 0x55
    vpdpwssd m5, m13, m21 ; c1'
    shufpd m6, m20, 0x55
    vpdpwssd m0, m14, m18 ; a2
    shufpd m19, m21, 0x55
    vpdpwssd m1, m12, m18 ; a0'
    pshufb m18, m22, m8
    vpdpwssd m2, m14, m6 ; b2
    pshufb m7, m23, m8
    vpdpwssd m4, m14, m19 ; c2
    vpdpwssd m3, m12, m6 ; b0'
    mova m6, m10
    vpdpwssd m5, m12, m19 ; c0'
    pshufb m19, m24, m9
    vpdpwssd m6, m12, m18 ; d0
    mova m18, m10
    pshufb m26, m25, m9
    vpdpwssd m18, m12, m7 ; e0
    mova m7, m10
    pshufb m22, m9
    vpdpwssd m7, m14, m19 ; d2'
    mova m19, m10
    pshufb m23, m9
    vpdpwssd m19, m14, m26 ; e2'
    pshufb m24, m8
    vpdpwssd m6, m13, m22 ; d1
    pshufb m25, m8
    vpdpwssd m18, m13, m23 ; e1
    shufpd m22, m24, 0x55
    vpdpwssd m7, m13, m24 ; d1'
    shufpd m23, m25, 0x55
    vpdpwssd m19, m13, m25 ; e1'
    pslldq m0, 1
    vpdpwssd m6, m14, m22 ; d2
    pslldq m1, 1
    vpdpwssd m18, m14, m23 ; e2
    vpermt2b m2, m27, m4 ; 12
    vpdpwssd m7, m12, m22 ; d0'
    vpermt2b m3, m27, m5 ; 12'
    vpdpwssd m19, m12, m23 ; e0'
    vpshrdd m0, m2, 16 ; 01
    vpermt2b m6, m27, m18 ; 34
    vpshrdd m1, m3, 16 ; 01'
    vpermt2b m7, m27, m19 ; 34'
    vpshrdd m4, m2, m6, 16 ; 23
    vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    movu m22, [r7+ssq*1+ 0]
    movu m24, [r7+ssq*1+12]
    lea r7, [r7+ssq*2]
    movu m23, [r7+ssq*0+ 0]
    movu m25, [r7+ssq*0+12]
    mova m19, m11
    vpdpwssd m19, m15, m2 ; B0
    mova m21, m11
    vpdpwssd m21, m15, m3 ; B0'
    mova m18, m11
    vpdpwssd m18, m15, m0 ; A0
    mova m20, m11
    vpdpwssd m20, m15, m1 ; A0'
    mova m2, m6
    vpdpwssd m19, m16, m6 ; B1
    mova m3, m7
    vpdpwssd m21, m16, m7 ; B1'
    mova m0, m4
    vpdpwssd m18, m16, m4 ; A1
    mova m1, m5
    pshufb m4, m22, m8
    vpdpwssd m20, m16, m5 ; A1'
    mova m6, m10
    pshufb m7, m23, m8
    vpdpwssd m6, m12, m4 ; f0
    mova m4, m10
    pshufb m5, m24, m9
    vpdpwssd m4, m12, m7 ; g0
    mova m7, m10
    pshufb m26, m25, m9
    vpdpwssd m7, m14, m5 ; f2'
    mova m5, m10
    pshufb m22, m9
    vpdpwssd m5, m14, m26 ; g2'
    pshufb m23, m9
    vpdpwssd m6, m13, m22 ; f1
    pshufb m24, m8
    vpdpwssd m4, m13, m23 ; g1
    pshufb m25, m8
    vpdpwssd m7, m13, m24 ; f1'
    shufpd m22, m24, 0x55
    vpdpwssd m5, m13, m25 ; g1'
    shufpd m23, m25, 0x55
    vpdpwssd m6, m14, m22 ; f2
    vpdpwssd m4, m14, m23 ; g2
    vpdpwssd m7, m12, m22 ; f0'
    vpdpwssd m5, m12, m23 ; g0'
    vpermt2b m6, m27, m4 ; 56
    vpermt2b m7, m27, m5 ; 56'
    vpdpwssd m19, m17, m6 ; B2
    vpshrdd m4, m2, m6, 16 ; 45
    vpdpwssd m21, m17, m7 ; B2'
    vpshrdd m5, m3, m7, 16 ; 45'
    vpdpwssd m18, m17, m4 ; A2
    vpdpwssd m20, m17, m5 ; A2'
    vpermt2b m19, m28, m21
    vpermt2b m18, m28, m20
    mova [r8+wq*0], m18
    mova [r8+wq*2], m19
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .hv_w32_loop
    add srcq, 64
    add tmpq, 64
    movzx hd, r5b
    sub r5d, 1<<8
    jg .hv_w32_loop0
%if WIN64
    pop r8
%endif
    RET

PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp, SHARP, SHARP

cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    mov wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d,
r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 lea r6, [strideq*3] sub srcq, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] jmp r7 .v_w4: mov r3d, 0x330c movq xm1, [srcq+strideq*0] kmovw k1, r3d vpbroadcastq ym1{k1}, [srcq+strideq*1] vpbroadcastq m0, [srcq+r6 ] vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3 lea srcq, [srcq+strideq*4] vpbroadcastq ym0{k1}, [srcq+strideq*0] vpbroadcastq m2, [srcq+strideq*1] vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6 mova ym5, [prep_endA] vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4 vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5 punpcklwd m1, m3 ; 01 12 23 34 punpcklwd m2, m0 ; 23 34 45 56 .v_w4_loop: movq xm4, [srcq+r6 ] lea srcq, [srcq+strideq*4] vpbroadcastq ym4{k1}, [srcq+strideq*0] vpbroadcastq m3, [srcq+strideq*1] vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a mova m3, m10 vpdpwssd m3, m12, m1 ; a0 b0 c0 d0 valignq m1, m4, m0, 6 ; 6 7 8 9 vpdpwssd m3, m13, m2 ; a1 b1 c1 d1 mova m0, m4 punpcklwd m4, m1, m4 ; 67 78 89 9a vpdpwssd m3, m15, m4 ; a3 b3 c3 d3 vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78 vpdpwssd m3, m14, m1 ; a2 b2 c2 d2 mova m2, m4 vpermb m3, m5, m3 mova [tmpq], ym3 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+strideq*0] mov r3d, 0x33 vbroadcasti32x4 ym1, [srcq+strideq*1] kmovb k1, r3d mova m7, [spel_v_shuf8] vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 add srcq, r6 vbroadcasti32x4 ym2, [srcq+strideq*0] vbroadcasti32x4 m3, [srcq+strideq*1] vbroadcasti32x4 ym0, [srcq+strideq*2] vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4 vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6 mova m8, [prep_endB] vpermb m1, m7, m1 ; 01 12 vpermb m2, m7, m2 ; 23 34 vpermb m3, m7, m0 ; 45 56 .v_w8_loop: lea srcq, [srcq+strideq*4] vbroadcasti32x4 ym4, [srcq+strideq*0] movu 
xm5, [srcq+strideq*1] vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8 vbroadcasti32x4 ym0, [srcq+strideq*2] vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a mova m5, m10 vpdpwssd m5, m12, m1 ; a0 b0 mova m6, m10 vpdpwssd m6, m12, m2 ; c0 d0 mova m1, m3 vpdpwssd m5, m13, m2 ; c1 d1 vpdpwssd m6, m13, m3 ; c1 d1 vpermb m2, m7, m4 ; 67 78 vpdpwssd m5, m14, m3 ; a2 b2 vpermb m3, m7, m0 ; 89 9a vpdpwssd m6, m14, m2 ; c2 d2 vpdpwssd m5, m15, m2 ; a3 b3 vpdpwssd m6, m15, m3 ; c3 d3 vpermt2b m5, m8, m6 mova [tmpq], m5 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m0, [srcq+strideq*1] vinserti32x8 m1, m0, [srcq+strideq*2], 1 vinserti32x8 m0, [srcq+strideq*0], 0 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+strideq*0] vinserti32x8 m3, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] add srcq, r6 vinserti32x8 m5, [srcq+strideq*0], 1 mova m11, [prep_endA] vpermb m1, m8, m1 ; 12 vpermb m0, m8, m0 ; 01 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m7, m10 vpdpwssd m7, m12, m1 ; b0 mova m6, m10 vpdpwssd m6, m12, m0 ; a0 mova m1, m3 vpdpwssd m7, m13, m3 ; b1 mova m0, m2 vpdpwssd m6, m13, m2 ; a1 mova m3, m5 vpdpwssd m7, m14, m5 ; b2 mova m2, m4 vpdpwssd m6, m14, m4 ; a2 movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m5, [srcq+strideq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m3, m5, 16 ; 67 vpdpwssd m7, m15, m5 ; b3 vpdpwssd m6, m15, m4 ; a3 vpermt2b m6, m11, m7 mova [tmpq], m6 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: WIN64_PUSH_XMM 23 %if WIN64 push r8 %endif mova m11, [prep_endC] lea r5, [hq+wq*8-256] .v_w32_loop0: movu m16, [srcq+strideq*0] movu m17, [srcq+strideq*1] lea r7, [srcq+r6] movu m18, [srcq+strideq*2] movu m19, [r7 +strideq*0] mov r8, tmpq movu m20, [r7 +strideq*1] movu m21, [r7 +strideq*2] add r7, r6 movu m22, [r7 +strideq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h 
punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [r7+strideq*1] lea r7, [r7+strideq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [r7+strideq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h vpermt2b m6, m11, m8 vpermt2b m7, m11, m9 mova [r8+wq*0], m6 mova [r8+wq*2], m7 lea r8, [r8+wq*4] sub hd, 2 jg .v_w32_loop add srcq, 64 add tmpq, 64 movzx hd, r5b sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 pop r8 %endif RET .h_w4: RESET_STACK_STATE movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m vbroadcasti32x4 m4, [spel_h_shufA] vbroadcasti32x4 m5, [spel_h_shufB] shr r5d, 11 mova ym9, [prep_endA] psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m6, [tmpq+4] vpbroadcastd m7, [tmpq+8] .h_w4_loop: movu xm2, [srcq+strideq*0] vinserti32x4 ym2, [srcq+strideq*1], 1 vinserti32x4 m2, [srcq+strideq*2], 2 vinserti32x4 m2, [srcq+r6 ], 3 lea srcq, [srcq+strideq*4] mova m0, m10 pshufb m1, m2, m4 vpdpwssd m0, m6, m1 pshufb m2, m5 vpdpwssd m0, m7, m2 vpermb m0, m9, m0 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] movu m8, [spel_h_shufC] mova m9, [spel_h_shufD] mova m11, 
[prep_endB] .h_w8_loop: movu ym4, [srcq+strideq*0] vinserti32x8 m4, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 vpdpwssd m1, m12, m3 vpermb m2, m7, m4 vpermb m3, m7, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 vpermb m2, m8, m4 vpermb m3, m8, m5 vpdpwssd m0, m14, m2 vpdpwssd m1, m14, m3 vpermb m2, m9, m4 vpermb m3, m9, m5 vpdpwssd m0, m15, m2 vpdpwssd m1, m15, m3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h: vpbroadcastd m10, [prep_8tap_rnd] test myd, 0xf00 jnz .hv lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m sub srcq, 6 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] cmp wd, 16 jl .h_w8 vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] jg .h_w32 .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 movu ym3, [srcq+strideq*0+16] vinserti32x8 m3, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m13, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m15, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m14, m4 ; a2 vpdpwssd m1, m12, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m15, m2 ; a3 vpdpwssd m1, m13, m2 ; b1 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b0 vpdpwssd m0, m14, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, 
m13, m3 ; b1 vpdpwssd m0, m15, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m14, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m15, m4 ; b3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, strideq dec hd jg .h_w32_loop0 RET .hv: vpbroadcastd m11, [pd_128] cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm1 movu xm16, [srcq+strideq*0] mov r3d, 0xff0 vinserti128 ym16, [srcq+strideq*1], 1 kmovw k1, r3d vbroadcasti32x4 m18, [srcq+strideq*2] add srcq, r6 vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3 movu xm17, [srcq+strideq*1] vbroadcasti32x4 ym18, [srcq+strideq*2] add srcq, r6 vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2 vbroadcasti32x4 m5, [spel_h_shufA] vbroadcasti32x4 m6, [spel_h_shufB] vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] mova m1, m10 mova m19, [spel_shuf4a] mova m2, m10 pshufb m0, m16, m5 vpdpwssd m1, m8, m0 pshufb m0, m17, m5 vpdpwssd m2, m8, m0 vpbroadcastd m13, [tmpq+20] pshufb m16, m6 vpbroadcastd m14, [tmpq+24] pshufb m17, m6 vpbroadcastd m15, [tmpq+28] vpdpwssd m1, m9, m16 ; 0 1 2 3 vpdpwssd m2, m9, m17 ; 4 5 6 mova m7, [spel_shuf4b] vpermt2b m1, m19, m2 ; 01 12 23 34 vpermb m2, m19, m2 ; 45 56 mova ym19, [prep_endA] vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56 .hv_w4_loop: movu xm17, [srcq+strideq*1] vinserti128 ym17, [srcq+strideq*2], 1 vbroadcasti32x4 m16, [srcq+r6 ] lea srcq, [srcq+strideq*4] vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3 mova m18, m10 pshufb m16, m17, m5 vpdpwssd m18, m8, m16 mova m16, m11 vpdpwssd m16, m12, m1 ; a0 b0 c0 d0 pshufb m17, m6 vpdpwssd m18, m9, m17 ; 7 8 9 a mova m1, m2 vpdpwssd m16, m13, m2 ; a1 b1 c1 d1 vpermt2b m2, m7, m18 ; 67 78 89 9a 
vpdpwssd m16, m15, m2 ; a3 b3 c3 d3 vshufi32x4 m1, m2, q1032 ; 45 56 67 78 vpdpwssd m16, m14, m1 ; a2 b2 c2 d2 vpermb m16, m19, m16 mova [tmpq], ym16 add tmpq, 32 sub hd, 4 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 6 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] cmp wd, 8 jg .hv_w16 WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 movu ym9, [srcq+strideq*2] add srcq, r6 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 movu ym20, [srcq+strideq*1] vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+strideq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] mova ym22, [prep_endB] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd 
m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*0], 1 mova m4, m10 mova m20, m11 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 vpdpwssd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 vpermb m20, m22, m20 mova [tmpq], ym20 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: WIN64_SPILL_XMM 27 %if WIN64 push r8 %endif vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] add wd, wd mova m9, [spel_shuf16] mova m26, [prep_endB] lea r5d, [hq+wq*8-256] .hv_w16_loop0: vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] lea r7, [srcq+r6] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 movu ym22, [r7 +strideq*0+ 0] movu ym23, [r7 +strideq*0+16] mov r8, tmpq vinserti32x8 m22, [r7 +strideq*1+ 0], 1 vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4 movu ym24, [r7 +strideq*2+ 0] movu ym25, [r7 +strideq*2+16] add r7, r6 vinserti32x8 m24, [r7 +strideq*0+ 0], 1 vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6 pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 
pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [r7+strideq*1+ 0] movu ym25, [r7+strideq*1+16] lea r7, [r7+strideq*2] vinserti32x8 m24, [r7+strideq*0+ 0], 1 vinserti32x8 m25, [r7+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 mova m22, m11 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 mova m23, m11 vpdpwssd m22, m16, m1 ; A0 mova m1, m3 vpdpwssd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 mova [r8+wq*0], ym22 vextracti32x8 [r8+wq*1], m22, 1 lea r8, [r8+wq*2] sub hd, 2 
jg .hv_w16_loop
    ; The lines up to the first RET are the tail of a routine that starts
    ; before this span; they are reproduced unchanged.
    add srcq, 32
    add tmpq, 32
    movzx hd, r5b
    sub r5d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    pop r8
%endif
    RET

; Scratch GPR t0 is r5 on WIN64 and r7 elsewhere.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; warp_affine_8x8t_16bpc(tmp, ts, ...)
; Variant of warp_affine_8x8_16bpc that reuses that function's .main, .main2
; and .end code. The differences visible here are: a single vertical rounding
; constant (warp_8x8t_rnd_v, not indexed by t0), a final shift of 15 instead
; of 13, signed saturation (packssdw) and no trailing vpsrlvw.
; Each .main/.main2 call leaves one 512-bit vector of 32-bit sums in m16.
; NOTE(review): tmp/ts are the first two arguments, like dst/ds in the shared
; code, so .end presumably addresses the same registers - confirm.
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
    mov t0d, r7m
    lea r6, [pd_0to7]
    shr t0d, 11 ; t0 = table index derived from the r7m argument
    vpbroadcastd m8, [base+warp_8x8t_rnd_v]
    vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
    psrad m14, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad m16, 15
    packssdw m14, m16 ; first half of the output block, as signed words
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad m15, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    ; ts is doubled before the shared store code runs.
    ; NOTE(review): presumably converts a stride in 16-bit elements to bytes - confirm.
    add tsq, tsq
    psrad m16, 15
    packssdw m15, m16 ; second half of the output block
    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end

; warp_affine_8x8_16bpc(dst, ds, src, ss, abcd, ...)
; Produces an 8x8 block of 16-bit pixels. .main primes the filter pipeline and
; falls through into .main2; every .main2 call returns a vector of 32-bit
; vertical filter sums in m16. The sums are shifted right by 13, packed with
; unsigned saturation into m14/m15 and shifted again by the per-word amounts
; loaded from bidir_shift[t0].
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
    mov t0d, r7m ; pixel_max
    lea r6, [pd_0to7]
    shr t0d, 11 ; t0 = pixel_max >> 11, used to index the constant tables
    vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] ; horizontal rounding
    vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] ; vertical rounding
    call .main
    psrad m14, m16, 13
    call .main2
    psrad m16, 13
    packusdw m14, m16
    call .main2
    psrad m15, m16, 13
    call .main2
    vpbroadcastd m0, [base+bidir_shift+t0*4]
    vpsrlvw m14, m0
    psrad m16, 13
    packusdw m15, m16
    vpsrlvw m15, m0
.end:
    ; Shared store path (also the jump target of warp_affine_8x8t_16bpc):
    ; permute m14/m15 with warp8x8_end and write eight 16-byte rows.
    mova m0, [base+warp8x8_end]
    vpermb m16, m0, m14
    lea r2, [dsq*3]
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    vpermb m16, m0, m15
    lea dstq, [dstq+dsq*4]
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    RET
.main:
    ; Set up the per-column filter positions and the first source rows.
    ; NOTE(review): .main is entered through `call`, so the stack-argument
    ; macros below are presumably offset by the return address; the original
    ; comments label r6m as mx and r7m as my here - confirm against x86inc.
    vpbroadcastd ym3, [base+pd_512]
%if WIN64
    mov abcdq, r5mp
    vpaddd ym18, ym3, r6m {1to8} ; mx
%else
    add r5d, 512
    vpbroadcastd ym18, r5d
%endif
    vpaddd ym20, ym3, r7m {1to8} ; my
    mova ym16, [base+pd_0to7]
    ; Each broadcast dword of abcd holds two 16-bit values; the low one is
    ; used by the dot product below, the high one by the later `psrad ..., 16`.
    vpbroadcastd ym19, [abcdq+4*0] ; alpha
    vpbroadcastd ym21, [abcdq+4*1] ; gamma
    lea r4, [ssq*3+6]
    vpdpwssd ym18, ym19, ym16 ; tmx
    vpdpwssd ym20, ym21, ym16 ; tmy
    sub srcq, r4 ; step back by 3 rows and 6 bytes
    mova m10, [base+warp8x8_permA]
    lea r4, [mc_warp_filter+64*8] ; filter table base, biased by 64 entries of 8 bytes
    vbroadcasti32x4 m12, [base+warp8x8_permC]
    kxnorb k1, k1, k1 ; all-ones gather mask
    vbroadcasti32x4 m13, [base+warp8x8_permD]
    movu ym5, [srcq+0]
    vinserti32x8 m5, [srcq+8], 1
    psrad ym17, ym18, 10
    mova m11, [base+warp8x8_permB]
    ; The gather zeroes its mask register, so a copy is kept in the other
    ; k register and the two are swapped before every following gather.
    kmovb k2, k1
    vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
    psrad ym19, 16 ; beta
    psrad ym21, 16 ; delta
    paddd ym18, ym19
    vpermb m4, m10, m5
    vpbroadcastq m9, [base+warp_shift_h+t0*8]
    pshufd m3, m3, q3120
    paddd m7, m1, m1 ; twice the horizontal rounding; accumulator seed used by .h
    pshufb m2, m3, m12
    vpdpwssd m1, m4, m2
    vpermb m5, m11, m5
    vshufi32x4 m4, m5, q1021
    pshufb m3, m13
    vpdpwssd m1, m4, m3
    call .h
    psllq m2, m1, 32
    paddd m1, m2
    vpmultishiftqb m1, m9, m1
    vpshrdq m1, m0, 48 ; 01 12
    call .h
    vpshrdq m2, m1, m0, 48 ; 23 34
    call .h
    vpshrdq m3, m2, m0, 48 ; 45 56
.main2:
    ; Filter two more source rows horizontally (.h), gather the two vertical
    ; filters and accumulate the vertical dot product into m16, starting from
    ; the rounding constant in m8. m1/m2/m3 hold the row-pair history and are
    ; rotated on each call.
    call .h
    psrad ym6, ym20, 10
    kmovb k1, k2
    paddd ym17, ym20, ym21 ; my += delta
    vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
    psrad ym16, ym17, 10
    kmovb k2, k1
    vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
    shufps m5, m20, m6, q2020
    mova m16, m8
    pshufb m4, m5, m12
    vpdpwssd m16, m1, m4 ; a0 b0
    pshufb m5, m13
    mova m1, m2
    vpdpwssd m16, m2, m5 ; a1 b1
    shufps m6, m20, m6, q3131
    paddd ym20, ym17, ym21
    pshufb m4, m6, m12
    mova m2, m3
    vpdpwssd m16, m3, m4 ; a2 b2
    vpshrdq m3, m0, 48 ; 67 78
    pshufb m6, m13
    vpdpwssd m16, m3, m6 ; a3 b3
    ret
ALIGN function_align
.h:
    ; Horizontal pass over the next two source rows; advances srcq by two
    ; rows and ym18 (mx) by beta twice. The result is left in m0.
    movu ym16, [srcq+ssq*1]
    psrad ym6, ym18, 10
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m5, m16, [srcq+ssq*0], 1
    kmovb k1, k2
    paddd ym17, ym18, ym19 ; mx += beta
    vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
    psrad ym16, ym17, 10
    kmovb k2, k1
    vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
    vpermb m4, m10, m5
    shufps m16, m18, m6, q2020
    shufps m6, m18, m6, q3131
    mova m0, m7
    pshufb m18, m16, m12
    vpdpwssd m0, m4, m18 ; a0 b0
    vpermb m5, m11, m5
    pshufb m18, m6, m13
    vpdpwssd m0, m5, m18 ; a3 b3
    paddd ym18, ym17, ym19
    vshufi32x4 m17, m4, m5, q1021
    pshufb m16, m13
    vpdpwssd m0, m17, m16 ; a1 b1
    vshufi32x4 m4, m5, q2132
    pshufb m6, m12
    vpdpwssd m0, m4, m6 ; a2 b2
    ; (the operands of the next instruction are on the following source line)
    vpmultishiftqb
m0, m9, m0 ; a a b b
    ret

; BIDIR_FN: store loops shared by the compound functions that instantiate it.
; Contract visible in the macro body:
;   - the enclosing function supplies a local .main that leaves 2 x 64 bytes
;     of finished pixels in m0 and m1,
;   - wq holds the address of the width-specific entry label (.w4 .. .w128),
;   - hd is the number of rows still to be written,
;   - stride3q is set here to strideq*3.
%macro BIDIR_FN 0
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    ; 8-byte rows. No loop: 4, 8 or 16 rows are written depending on how hd
    ; compares with 8 (m0 holds rows 0-7, m1 rows 8-15).
    movq [dstq ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end ; flags still come from `cmp hd, 8`
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm0, ym1, 1
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    vextracti32x4 xm0, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    ; 16-byte rows: four from m0, then (if rows remain) four from m1.
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop ; flags still come from `sub hd, 8`
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    ; 32-byte rows: two per register, four rows per .main call.
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    ; 64-byte rows: one per register, two rows per .main call.
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    ; 128-byte rows: one row per .main call.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    ; 256-byte rows: two .main calls per row.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
%endmacro

; Scratch GPR t0 is r5 on WIN64 and r7 elsewhere.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

; avg_16bpc(dst, stride, tmp1, tmp2, w, h, pixel_max)
; Combines two intermediate buffers of 16-bit values into pixels:
; saturating add, clamp from below against avg_round[t0], subtract that same
; constant, then a per-word logical right shift by avg_shift[t0].
; The width handler is chosen through avg_avx512icl_table[tzcnt(w)].
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea r6, [avg_avx512icl_table]
    tzcnt wd, wm
    mov t0d, r6m ; pixel_max
    movsxd wq, [r6+wq*4] ; table entries are 32-bit offsets relative to r6
    shr t0d, 11 ; t0 = pixel_max >> 11 selects the constant set
    vpbroadcastd m2, [base+avg_round+t0*4]
    vpbroadcastd m3, [base+avg_shift+t0*4]
    movifnidn hd, hm
    add wq, r6 ; wq = absolute address of the .wN label
    BIDIR_FN
ALIGN function_align
.main:
    ; Consumes 128 bytes from each of tmp1/tmp2, returns pixels in m0/m1.
    mova m0, [tmp1q+64*0]
    paddsw m0, [tmp2q+64*0]
    mova m1, [tmp1q+64*1]
    paddsw m1, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    pmaxsw m0, m2
    pmaxsw m1, m2
    psubsw m0, m2
    psubsw m1, m2
    vpsrlvw m0, m3
    vpsrlvw m1, m3
    ret

; w_avg_16bpc(dst, stride, tmp1, tmp2, w, h, weight, pixel_max)
; Weighted combination of two intermediate buffers:
; (tmp1 * weight + tmp2 * (16 - weight) + w_avg_round[t0]) >> 2, packed with
; unsigned saturation and then shifted right per word by bidir_shift[t0].
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea r6, [w_avg_avx512icl_table]
    tzcnt wd, wm
    mov t0d, r7m ; pixel_max
    shr t0d, 11
    movsxd wq, [r6+wq*4]
    vpbroadcastd m5, [base+w_avg_round+t0*4]
    vpbroadcastd m7, [base+bidir_shift+t0*4]
    add wq, r6
    ; Build the word pair {16-weight, weight} in one dword:
    ; (weight << 16) - (weight - 16).
    mov r6d, r6m ; weight
    lea t0d, [r6-16]
    shl r6d, 16
    sub r6d, t0d ; 16-weight, weight
    movifnidn hd, hm
    vpbroadcastd m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    ; Interleave tmp2/tmp1 words so that each dword is a {tmp2, tmp1} pair,
    ; then let vpdpwssd multiply by {16-weight, weight} and add the rounding.
    mova m3, [tmp1q+64*0]
    mova m1, [tmp2q+64*0]
    mova m0, [tmp1q+64*1]
    mova m4, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    punpcklwd m2, m1, m3
    punpckhwd m1, m3
    punpcklwd m3, m4, m0
    punpckhwd m4, m0
    mova m0, m5
    vpdpwssd m0, m6, m2
    mova m2, m5
    vpdpwssd m2, m6, m1
    mova m1, m5
    vpdpwssd m1, m6, m3
    mova m3, m5
    vpdpwssd m3, m6, m4
    REPX {psrad x, 2}, m0, m2, m1, m3
    packusdw m0, m2
    packusdw m1, m3
    vpsrlvw m0, m7
    vpsrlvw m1, m7
    ret

; mask_16bpc(dst, stride, tmp1, tmp2, w, h, mask, pixel_max)
; Per-pixel blend of two intermediate buffers using an 8-bit mask m read from
; maskq: tmp1 * m + tmp2 * (64 - m), plus mask_round[r6].
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea r7, [mask_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r7m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_64]
    vpbroadcastd m9, [base+mask_round+r6*4]
    vpbroadcastd m10, [base+bidir_shift+r6*4]
    mov maskq, maskmp
    add wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    ; Consumes 64 mask bytes (zero-extended to words) and 128 bytes from each
    ; of tmp1/tmp2 per call.
    pmovzxbw m1, [maskq+32*0]
    mova m4, [tmp1q+64*0]
    mova m2, [tmp2q+64*0]
    pmovzxbw m6, [maskq+32*1]
    mova m5, [tmp1q+64*1]
    mova m3, [tmp2q+64*1]
    add maskq, 32*2
    add tmp1q, 64*2
    add tmp2q, 64*2
    punpcklwd m7, m4, m2 ; {tmp1, tmp2} word pairs
    punpckhwd m4, m2
    psubw m0, m8, m1 ; 64 - m
    ; (the operands of the next instruction are on the following source line)
    punpcklwd
m2, m1, m0 ; m, 64-m
    ; Remainder of a .main helper whose first half precedes this span:
    ; dot products of {tmp1, tmp2} pairs with {m, 64-m} pairs on top of the
    ; rounding constant in m9, then >> 4, unsigned pack and a per-word shift
    ; by m10.
    punpckhwd m1, m0
    mova m0, m9
    vpdpwssd m0, m7, m2
    mova m2, m9
    vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    punpcklwd m7, m5, m3
    punpckhwd m5, m3
    psubw m1, m8, m6
    punpcklwd m3, m6, m1
    punpckhwd m6, m1
    mova m1, m9
    vpdpwssd m1, m7, m3
    mova m3, m9
    vpdpwssd m3, m5, m6
    REPX {psrad x, 4}, m0, m2, m1, m3
    packusdw m0, m2
    packusdw m1, m3
    vpsrlvw m0, m10
    vpsrlvw m1, m10
    ret

; w_mask_420_16bpc(dst, stride, tmp1, tmp2, w, h, mask, sign, pixel_max)
; Blends two intermediate buffers with a weight derived from their absolute
; difference and writes a reduced mask to maskq. Each .main call returns the
; blended pixels in m0/m1 and the per-pixel weights m (as words) in m2/m3.
; The width handlers sum those weights in groups of four on top of
; w_mask_round[sign] and pick the resulting bytes with w_mask_end42x.
; Constant registers: m10 = pw_27615, m11 = pw_64, m12 = mask_round[],
; m13 = bidir_shift[], m14 = w_mask_round[sign], ym15 = w_mask_end42x.
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea r7, [w_mask_420_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m11, [base+pw_64]
    vpbroadcastd m12, [base+mask_round+r6*4]
    vpbroadcastd m13, [base+bidir_shift+r6*4]
    mov r6d, r7m ; sign
    vpbroadcastd m14, [base+w_mask_round+r6*4]
    mova ym15, [w_mask_end42x]
    mov maskq, maskmp
    add wq, r7
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    ; No loop: one .main call covers up to 16 rows of 8 bytes, and 16 mask
    ; bytes are stored once.
    mova m4, [w_mask_shuf4]
    vpermt2b m2, m4, m3
    mova m3, m14
    vpdpbusd m3, m2, [pb_64] {1to16} ; rounding + 64 * (sum of four weight bytes)
    vpermb m3, m15, m3
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    mova [maskq], xm3
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8:
    mova m8, [w_mask_shuf8]
    vpbroadcastd m9, [pb_64]
    jmp .w8_start
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 16
.w8_start:
    ; Eight rows of 16 bytes and 16 mask bytes per .main call.
    vpermt2b m2, m8, m3
    mova m3, m14
    vpdpbusd m3, m2, m9
    vpermb m3, m15, m3
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    mova [maskq], xm3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16:
    mova m8, [w_mask_shuf16]
    vpbroadcastd m9, [pb_64]
    jmp .w16_start
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 16
.w16_start:
    ; Four rows of 32 bytes and 16 mask bytes per .main call.
    vpermt2b m2, m8, m3
    mova m3, m14
    vpdpbusd m3, m2, m9
    vpermb m3, m15, m3
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    mova [maskq], xm3
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 32
.w32:
    ; Two .main calls give four 64-byte rows. Weights of vertically adjacent
    ; rows are added (paddw); vpdpwssd with pw_64 then adds horizontally
    ; adjacent words on top of the rounding term. 32 mask bytes are stored.
    paddw m2, m3
    mova m8, m14
    vpdpwssd m8, m11, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    call .main
    paddw m2, m3
    mova m3, m14
    vpdpwssd m3, m11, m2
    vpermt2b m8, m15, m3
    mova [dstq+strideq*2], m0
    mova [dstq+stride3q ], m1
    mova [maskq], ym8
    sub hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 32
.w64:
    ; One .main call per 128-byte row; the first row's weights are parked in
    ; m8/m9 until the second row is available.
    mova m8, m2
    mova m9, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    paddw m8, m2
    paddw m9, m3
    mova m2, m14
    vpdpwssd m2, m11, m8
    mova m3, m14
    vpdpwssd m3, m11, m9
    vpermt2b m2, m15, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    mova [maskq], ym2
    sub hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 64
.w128:
    ; Two .main calls per 256-byte row; the first row's weights are parked in
    ; m16/m8 and m17/m9. 64 mask bytes are stored per row pair.
    mova m16, m2
    mova m8, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    mova m17, m2
    mova m9, m3
    mova [dstq+strideq*0+64*2], m0
    mova [dstq+strideq*0+64*3], m1
    call .main
    paddw m2, m16
    paddw m3, m8
    mova m16, m14
    vpdpwssd m16, m11, m2
    mova m8, m14
    vpdpwssd m8, m11, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    call .main
    paddw m2, m17
    paddw m3, m9
    mova m17, m14
    vpdpwssd m17, m11, m2
    mova m9, m14
    vpdpwssd m9, m11, m3
    vpermt2b m16, m15, m8
    vpermt2b m17, m15, m9
    mova [dstq+strideq*1+64*2], m0
    mova [dstq+strideq*1+64*3], m1
    mova [maskq+32*0], ym16
    mova [maskq+32*1], ym17
    sub hd, 2
    jg .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    ; Consumes 128 bytes from each of tmp1/tmp2.
    ; 64-m = (pw_27615 -sat |tmp1 - tmp2|) >> 10, m = 64 - (64-m).
    ; Pixels: tmp2 * (64-m) + tmp1 * m + mask_round, >> 4, unsigned pack,
    ; then a per-word shift by bidir_shift.
    ; Returns pixels in m0/m1 and the weights m in m2 (first 64 bytes) and
    ; m3 (second 64 bytes).
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1 ; {tmp2, tmp1} word pairs
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m10, m6
    psrlw m6, 10 ; 64-m
    psubw m2, m11, m6 ; m
    punpcklwd m1, m6, m2 ; {64-m, m} word pairs
    punpckhwd m6, m2
    mova m0, m12
    vpdpwssd m0, m5, m1
    mova m1, m12
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m10, m5
    psrlw m5, 10
    psubw m3, m11, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m12
    vpdpwssd m1, m6, m4
    mova m4, m12
    vpdpwssd m4, m7, m5
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpsrlvw m0, m13
    vpsrlvw m1, m13
    ret

; w_mask_422_16bpc(dst, stride, tmp1, tmp2, w, h, mask, sign, pixel_max)
; Same pixel blend as w_mask_420_16bpc. Here the setup loads the same constant
; tables into m8-m13 and builds pw_128 in m14 by doubling pw_64.
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea r7, [w_mask_422_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m9, [base+pw_64]
    vpbroadcastd m10, [base+mask_round+r6*4]
    vpbroadcastd m11, [base+bidir_shift+r6*4]
    mov r6d, r7m ; sign
    vpbroadcastd m12, [base+w_mask_round+r6*4]
    mova ym13, [w_mask_end42x]
    mov maskq, maskmp
    add wq, r7
    paddw m14, m9, m9 ; pw_128
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    ; 8-byte rows; 4, 8 or 16 rows depending on how hd compares with 8.
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    ; (the memory operand of the next instruction is on the following source line)
    lea dstq,
[dstq+strideq*4]
    ; Continuation of a .w4 store path whose first rows precede this span:
    ; the remaining eight 8-byte rows are taken from m1.
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    ; 16-byte rows: four from m0, then (if rows remain) four from m1.
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    ; 32-byte rows: four rows per .main call.
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    ; 64-byte rows: two rows per .main call.
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    ; 128-byte rows: one row per .main call.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    ; 256-byte rows: two .main calls per row.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Consumes 128 bytes from each of tmp1/tmp2, returns pixels in m0/m1 and
    ; also stores 32 mask bytes at maskq, advancing maskq by 32.
    ; 64-m = (m8 -sat |tmp1 - tmp2|) >> 10, m = m9 - (64-m).
    ; Pixels: tmp2 * (64-m) + tmp1 * m + m10, >> 4, unsigned pack, then a
    ; per-word shift by m11.
    ; Mask: adjacent weight words are summed by vpdpwssd with m14 on top of
    ; m12, and the output bytes are selected with the permutation in m13.
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1 ; {tmp2, tmp1} word pairs
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m8, m6
    psrlw m6, 10 ; 64-m
    psubw m2, m9, m6 ; m
    punpcklwd m1, m6, m2 ; {64-m, m} word pairs
    punpckhwd m6, m2
    mova m0, m10
    vpdpwssd m0, m5, m1
    mova m1, m10
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m8, m5
    psrlw m5, 10
    psubw m3, m9, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m10
    vpdpwssd m1, m6, m4
    mova m4, m10
    vpdpwssd m4, m7, m5
    mova m5, m12
    vpdpwssd m5, m14, m2
    mova m2, m12
    vpdpwssd m2, m14, m3
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpermt2b m5, m13, m2
    vpsrlvw m0, m11
    vpsrlvw m1, m11
    mova [maskq], ym5
    add maskq, 32
    ret

; w_mask_444_16bpc(dst, stride, tmp1, tmp2, w, h, mask, sign, pixel_max)
; Same pixel blend as the 422 routine above, with the constants in
; m8 = pw_27615, m9 = pw_64, m10 = mask_round[], m12 = bidir_shift[] and the
; byte permutation w_mask_end444 in m11. The sign argument is not read in the
; setup below.
cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea r7, [w_mask_444_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m9, [base+pw_64]
    vpbroadcastd m10, [base+mask_round+r6*4]
    mova m11, [w_mask_end444]
    vpbroadcastd m12, [base+bidir_shift+r6*4]
    mov maskq, maskmp
    add wq, r7
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    ; 8-byte rows; 4, 8 or 16 rows depending on how hd compares with 8.
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    ; 16-byte rows: four from m0, then (if rows remain) four from m1.
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    mova [dstq+strideq*0], ym0
    ; (the operands of the next instruction are on the following source line)
    vextracti32x8
[dstq+strideq*1], m0, 1
    ; Continuation of a .w16 store path that begins before this span
    ; (32-byte rows, four rows per .main call).
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    ; 64-byte rows: two rows per .main call.
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    ; 128-byte rows: one row per .main call.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    ; 256-byte rows: two .main calls per row.
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Consumes 128 bytes from each of tmp1/tmp2, returns pixels in m0/m1 and
    ; stores one mask byte per pixel (64 bytes) at maskq, advancing maskq by 64.
    ; 64-m = (m8 -sat |tmp1 - tmp2|) >> 10, m = m9 - (64-m).
    ; Pixels: tmp2 * (64-m) + tmp1 * m + m10, >> 4, unsigned pack, then a
    ; per-word shift by m12.
    ; The weight words in m2/m3 are reduced to 64 bytes by the permutation in m11.
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1 ; {tmp2, tmp1} word pairs
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m8, m6
    psrlw m6, 10 ; 64-m
    psubw m2, m9, m6 ; m
    punpcklwd m1, m6, m2 ; {64-m, m} word pairs
    punpckhwd m6, m2
    mova m0, m10
    vpdpwssd m0, m5, m1
    mova m1, m10
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m8, m5
    psrlw m5, 10
    psubw m3, m9, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m10
    vpdpwssd m1, m6, m4
    mova m4, m10
    vpdpwssd m4, m7, m5
    vpermt2b m2, m11, m3
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpsrlvw m0, m12
    vpsrlvw m1, m12
    mova [maskq], m2
    add maskq, 64
    ret

; blend_16bpc(dst, ds, tmp, w, h, mask)
; In-place blend of tmp into dst with a per-pixel 8-bit mask m:
;   dst += pmulhrsw(dst - tmp, m * -512)
; pmulhrsw computes (a * b + 0x4000) >> 15, so the product with m * -512 is
; (tmp - dst) * m / 64 with rounding.
; Every loop iteration handles 64 pixels (64 mask bytes, 128 bytes of tmp),
; except .w4, which handles 16.
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea r6, [blend_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    movifnidn maskq, maskmp
    vpbroadcastd m6, [base+pw_m512]
    add wq, r6
    lea r6, [dsq*3] ; r6 is reused as ds*3 once the table base is no longer needed
    jmp wq
.w4:
    ; Four 8-byte rows gathered into one ymm register.
    pmovzxbw ym19, [maskq]
    movq xm16, [dstq+dsq*0]
    movhps xm16, [dstq+dsq*1]
    vpbroadcastq ym17, [dstq+dsq*2]
    vpbroadcastq ym18, [dstq+r6 ]
    pmullw ym19, ym6
    vpblendd ym16, ym17, 0x30
    vpblendd ym16, ym18, 0xc0
    psubw ym17, ym16, [tmpq]
    add maskq, 16
    add tmpq, 32
    pmulhrsw ym17, ym19
    paddw ym16, ym17
    vextracti128 xm17, ym16, 1
    movq [dstq+dsq*0], xm16
    movhps [dstq+dsq*1], xm16
    movq [dstq+dsq*2], xm17
    movhps [dstq+r6 ], xm17
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w4
    vzeroupper
    RET
.w8:
    ; Four 16-byte rows per iteration.
    pmovzxbw m2, [maskq]
    mova xm0, [dstq+dsq*0]
    vinserti32x4 ym0, [dstq+dsq*1], 1
    vinserti32x4 m0, [dstq+dsq*2], 2
    vinserti32x4 m0, [dstq+r6 ], 3
    pmullw m2, m6
    psubw m1, m0, [tmpq]
    add maskq, 32
    add tmpq, 64
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6 ], m0, 3
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w8
    RET
.w16:
    ; Four 32-byte rows per iteration.
    pmovzxbw m4, [maskq+32*0]
    pmovzxbw m5, [maskq+32*1]
    mova ym0, [dstq+dsq*0]
    vinserti32x8 m0, [dstq+dsq*1], 1
    mova ym1, [dstq+dsq*2]
    vinserti32x8 m1, [dstq+r6 ], 1
    pmullw m4, m6
    pmullw m5, m6
    psubw m2, m0, [tmpq+64*0]
    psubw m3, m1, [tmpq+64*1]
    add maskq, 32*2
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6 ], m1, 1
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w16
    RET
.w32:
    ; Two 64-byte rows per iteration.
    pmovzxbw m4, [maskq+32*0]
    pmovzxbw m5, [maskq+32*1]
    mova m0, [dstq+dsq*0]
    mova m1, [dstq+dsq*1]
    pmullw m4, m6
    pmullw m5, m6
    psubw m2, m0, [tmpq+ 64*0]
    psubw m3, m1, [tmpq+ 64*1]
    add maskq, 32*2
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w32
    RET

%endif ; ARCH_X86_64
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc16_sse.asm000066400000000000000000011375241517466257200234170ustar00rootroot00000000000000
; Copyright © 2021, VideoLAN and dav2d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2.
Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; Byte-index tables (16 entries each). The names suggest pshufb-style shuffle
; controls; their users are outside this span.
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul: dd 0, 1, 2, 3
bdct_lb_q: times 8 db 0
    times 8 db 4
    times 8 db 8
    times 8 db 12

; Broadcast constants. The prefix gives the element size (pw = word,
; pd = dword, pq = qword). A label followed by unlabelled `times` lines is a
; table with more than one entry.
pw_2: times 8 dw 2
pw_16: times 4 dw 16
prep_mul: times 4 dw 16
    times 8 dw 4
pw_64: times 8 dw 64
pw_256: times 8 dw 256
pw_2048: times 4 dw 2048
bidir_mul: times 4 dw 2048
pw_8192: times 8 dw 8192
pw_27615: times 8 dw 27615
pw_32766: times 8 dw 32766
pw_m512: times 8 dw -512
pd_63: times 4 dd 63
pd_64: times 4 dd 64
pd_512: times 4 dd 512
pd_2560: times 2 dd 2560
pd_8704: times 2 dd 8704
pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000: times 4 dd 0x4000
pq_0x400000: times 2 dq 0x400000
pq_0x40000000: times 2 dq 0x40000000
pd_65538: times 2 dd 65538
put_bilin_h_rnd: times 4 dw 8
    times 4 dw 10
s_8tap_h_rnd: times 2 dd 2
    times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
    times 2 dd 128
s_8tap_h_sh: dd 2, 4
put_s_8tap_v_sh: dd 10, 8
bidir_rnd: times 4 dw -16400
    times 4 dw -16388
put_8tap_h_rnd: dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5)
warp8x8_shift: dd 11, 13
warp8x8_rnd1: dd 1024, 1024, 4096, 4096
warp8x8_rnd2: times 4 dw 4096
    times 4 dw 16384
warp8x8t_rnd: times 2 dd 16384 - (8192 << 15)

; BIDIR_JMP_TABLE name, isa, width...
; Emits one 32-bit entry per listed width, holding the distance from the table
; symbol to the matching .w<width> label of <name>_16bpc_<isa>.
; The exported symbol <name>_<isa>_table is biased by -2 * (first width).
; NOTE(review): the bias presumably lets callers index with tzcnt(w)*4 without
; an extra subtraction - confirm at the call sites.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32

; BASE_JMP_TABLE name, isa, width...
; Emits one 16-bit entry per listed width, holding the distance from the
; symbol <name>_<isa> to <name>_<isa>_w<width>. The exported table symbol is
; biased by -(first width).
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; put_ssse3 / prep_ssse3 name the .put / .prep labels inside the bilinear
; functions; they are the base symbols used by BASE_JMP_TABLE below.
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128

; SCALED_JMP_TABLE name, isa, width...
; Emits three consecutive tables of 16-bit entries for <name>_16bpc_<isa>,
; pointing at the .w<width>, .dy1_w<width> and .dy2_w<width> labels.
; `%rotate 2` after each %rep restores the original parameter order
; before the next table is generated.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128

cextern mc_subpel_filters
; Biased by -8 bytes relative to the external table.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter

SECTION .text

; Scratch GPR t0 is r7 on UNIX64 and r5 otherwise.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

INIT_XMM ssse3
; put_bilin_16bpc continues past the end of this span; the part visible here
; is the dispatch on mx/my and the plain-copy paths for widths 2 to 64.
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
    mov mxyd, r6m ; mx
    LEA t0, put_ssse3
    movifnidn wd, wm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    ; mx == 0 and my == 0: copy rows unchanged, dispatching on tzcnt(w).
    tzcnt wd, wd
    movzx wd, word [base+put_ssse3_table+wq*2]
    add wq, t0
    movifnidn hd, hm
    jmp wq
.put_w2:
    mov r4d, [srcq+ssq*0]
    mov r6d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r4d
    mov [dstq+dsq*1], r6d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    movq m0, [srcq+ssq*0]
    movq m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movq [dstq+dsq*0], m0
    movq [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
.put_w16:
    movu m0, [srcq+ssq*0+16*0]
    movu m1, [srcq+ssq*0+16*1]
    movu m2, [srcq+ssq*1+16*0]
    movu m3, [srcq+ssq*1+16*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+16*0], m0
    mova [dstq+dsq*0+16*1], m1
    mova [dstq+dsq*1+16*0], m2
    mova [dstq+dsq*1+16*1], m3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w16
    RET
.put_w32:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    add srcq, ssq
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m3
    add dstq, dsq
    dec hd
    jg .put_w32
    RET
.put_w64:
    movu m0, [srcq+16*0]
    movu m1, [srcq+16*1]
    movu m2, [srcq+16*2]
    movu m3, [srcq+16*3]
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova [dstq+16*2], m2
    ; (the operands of the next instruction are on the following source line)
    mova
[dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: add srcq, 16*8 add dstq, 16*8 .put_w128_loop: movu m0, [srcq-16*8] movu m1, [srcq-16*7] movu m2, [srcq-16*6] movu m3, [srcq-16*5] mova [dstq-16*8], m0 mova [dstq-16*7], m1 mova [dstq-16*6], m2 mova [dstq-16*5], m3 movu m0, [srcq-16*4] movu m1, [srcq-16*3] movu m2, [srcq-16*2] movu m3, [srcq-16*1] mova [dstq-16*4], m0 mova [dstq-16*3], m1 mova [dstq-16*2], m2 mova [dstq-16*1], m3 movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w128_loop RET .h: movd m5, mxyd mov mxyd, r7m ; my mova m4, [base+pw_16] pshufb m5, [base+pw_256] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v mov r6d, r8m ; bitdepth_max shr r6d, 11 movddup m3, [base+put_bilin_h_rnd+r6*8] movifnidn hd, hm sub wd, 8 jg .h_w16 je .h_w8 cmp wd, -4 je .h_w4 .h_w2: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw m0, m4, m1 psrlq m1, 16 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movd [dstq+dsq*0], m0 punpckhqdq m0, m0 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] movq m1, [srcq+ssq*0+2] movhps m1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 
paddw m0, m3 paddw m0, m1 movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2+16*0], m0 mova [dstq+r6*2+16*1], m1 add r6, 16 jl .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16_loop0 RET .v: shl mxyd, 11 movd m5, mxyd pshufb m5, [base+pw_256] movifnidn hd, hm cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movd m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq m0, [srcq+ssq*0] .v_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movq m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 mov r7, srcq lea r6d, [wq+hq-256] mov r4, dstq %else mov r6, srcq %endif .v_w8_loop0: movu m0, [srcq+ssq*0] .v_w8_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop %if ARCH_X86_64 add r7, 16 add r4, 16 movzx hd, r6b mov srcq, r7 mov dstq, r4 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, 
r6 mov dstmp, dstq sub wd, 8 %endif jg .v_w8_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 8 shl mxyd, 11 mova m3, [base+pw_2] movd m6, mxyd mova m7, [base+pw_8192] pshufb m6, [base+pw_256] test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 mova m7, [base+pw_2048] .hv_12bpc: movifnidn hd, hm cmp wd, 4 jg .hv_w8 je .hv_w4 .hv_w2: movddup m0, [srcq+ssq*0] pshufhw m1, m0, q0321 pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w2_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m2, [srcq+ssq*0] pmullw m1, m4, m2 psrlq m2, 16 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 _ 2 _ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: movddup m0, [srcq+ssq*0] movddup m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w4_loop: movq m1, [srcq+ssq*1] movq m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] movhps m2, [srcq+ssq*0+2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 lea r6d, [wq+hq-256] mov r4, srcq mov r7, dstq %else mov r6, srcq %endif .hv_w8_loop0: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w8_loop: movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*0+2] pmullw m0, m4 pmullw m2, m5 paddw m0, m3 paddw 
m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, r6 mov dstmp, dstq sub wd, 8 %endif jg .hv_w8_loop0 %if WIN64 pop r7 %endif RET cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 %define base r6-prep_ssse3 movifnidn mxyd, r5m ; mx LEA r6, prep_ssse3 movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: tzcnt wd, wd movzx wd, word [base+prep_ssse3_table+wq*2] mov r5d, r7m ; bitdepth_max mova m5, [base+pw_8192] add wq, r6 shr r5d, 11 movddup m4, [base+prep_mul+r5*8] lea stride3q, [strideq*3] jmp wq .prep_w4: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w32: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova 
[tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 dec hd jg .prep_w32 RET .prep_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 add tmpq, 16*8 dec hd jg .prep_w64 RET .prep_w128: movu m0, [srcq+16* 0] movu m1, [srcq+16* 1] movu m2, [srcq+16* 2] movu m3, [srcq+16* 3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16* 4] movu m1, [srcq+16* 5] movu m2, [srcq+16* 6] movu m3, [srcq+16* 7] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 movu m0, [srcq+16* 8] movu m1, [srcq+16* 9] movu m2, [srcq+16*10] movu m3, [srcq+16*11] add tmpq, 16*16 REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*8], m0 mova [tmpq-16*7], m1 mova [tmpq-16*6], m2 mova [tmpq-16*5], m3 movu m0, [srcq+16*12] movu m1, [srcq+16*13] movu m2, [srcq+16*14] movu m3, [srcq+16*15] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*4], m0 mova [tmpq-16*3], m1 mova [tmpq-16*2], m2 mova [tmpq-16*1], m3 dec hd jg .prep_w128 RET .h: movd m4, mxyd mov mxyd, r6m ; my mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .h_12bpc psllw m3, 2 psllw m4, 2 .h_12bpc: test mxyd, mxyd jnz .hv sub wd, 8 je .h_w8 jg .h_w16 .h_w4: movq m0, [srcq+strideq*0] movhps m0, 
[srcq+strideq*1] movq m1, [srcq+strideq*0+2] movhps m1, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 16 sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 add r6, 16 jl .h_w16_loop add srcq, strideq dec hd jg .h_w16_loop0 RET .v: movd m4, mxyd mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 psllw m4, 2 .v_12bpc: cmp wd, 8 je .v_w8 jg .v_w16 .v_w4: movq m0, [srcq+strideq*0] .v_w4_loop: movq m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklqdq m1, m0, m2 ; 0 1 movq m0, [srcq+strideq*0] punpcklqdq m2, m0 ; 1 2 pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: movu m0, [srcq+strideq*0] .v_w8_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+16*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .v_w8_loop RET .v_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd 
%endif .v_w16_loop0: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+wq*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+wq*2], m1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .v_w16_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 7 shl mxyd, 11 movd m6, mxyd pshufb m6, [base+pw_256] cmp wd, 8 je .hv_w8 jg .hv_w16 .hv_w4: movddup m0, [srcq+strideq*0] movddup m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w4_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] movhps m1, [srcq+strideq*0] movhps m2, [srcq+strideq*0+2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w8_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+16*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd %endif .hv_w16_loop0: movu m0, [srcq+strideq*0] movu m1, 
[srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+wq*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .hv_w16_loop0 %if WIN64 pop r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %if %0 == 5 ; skip the jump in the last filter jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2, 6 %elif WIN64 DECLARE_REG_TMP 4, 5, 8 %else DECLARE_REG_TMP 7, 8, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_6tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my %define base t2-put_ssse3 %if ARCH_X86_32 %define mxb r0b %define mxd r0 %define mxq r0 %define myb r1b %define myd r1 %define myq r1 %endif imul mxd, mxm, 0x010101 add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 6tap_v, my, 4tap_v LEA t2, put_ssse3 movifnidn wd, wm movifnidn srcq, srcmp movifnidn ssq, ssmp movifnidn hd, hm test mxd, 0xf00 jnz 
.h test myd, 0xf00 jnz .v .put: tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] movifnidn dstq, dstmp movifnidn dsq, dsmp add wq, t2 %if WIN64 pop r8 pop r7 %endif jmp wq .h_w2: mova m2, [base+spel_h_shuf2] pshufd m3, m3, q2121 .h_w2_loop: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m2 pshufb m1, m2 pmaddwd m0, m3 pmaddwd m1, m3 phaddd m0, m1 paddd m0, m4 psrad m0, 6 packssdw m0, m0 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movd [dstq+dsq*0], m0 pshuflw m0, m0, q3232 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb lea srcq, [srcq-2] movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp punpcklbw m3, m3 psraw m3, 8 ; sign-extend jl .h_w2 WIN64_SPILL_XMM 9 mova m7, [base+spel_h_shufA] %if ARCH_X86_32 %define m8 [base+spel_h_shufB] %else mova m8, [base+spel_h_shufB] %endif pshufd m2, m3, q1111 pshufd m3, m3, q2222 .h_w4_loop: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m6, m0, m7 ; 0 1 1 2 2 3 3 4 pmaddwd m6, m2 pshufb m0, m8 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m3 paddd m0, m6 pshufb m6, m1, m7 pmaddwd m6, m2 pshufb m1, m8 pmaddwd m1, m3 paddd m0, m4 paddd m6, m4 paddd m1, m6 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: RESET_STACK_STATE test myd, 0xf00 jnz .hv mov myd, r8m movd m5, r8m shr myd, 11 movddup m4, [base+put_8tap_h_rnd+myq*8] movifnidn dsq, dsmp pshufb m5, [base+pw_256] sub wd, 4 jle .h_w4 WIN64_SPILL_XMM 11 shr mxd, 16 movq m2, [base+subpel_filters+1+mxq*8] movifnidn dstq, dstmp mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] lea srcq, [srcq+wq*2] punpcklbw m2, m2 lea dstq, [dstq+wq*2] psraw m2, 8 neg wq %if ARCH_X86_32 ALLOC_STACK -16*3 %define m8 [rsp+16*0] %define m9 [rsp+16*1] %define m10 [rsp+16*2] pshufd m0, m2, q0000 pshufd m1, m2, q1111 pshufd m2, m2, q2222 mova m8, m0 mova m9, m1 mova m10, 
m2 %else pshufd m8, m2, q0000 pshufd m9, m2, q1111 pshufd m10, m2, q2222 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m3, [srcq+r6*2-4] movu m2, [srcq+r6*2+8] pshufb m0, m3, m6 ; 01 12 23 34 pmaddwd m0, m8 ; abcd0 pshufb m3, m7 ; 23 34 45 56 pmaddwd m1, m9, m3 ; abcd1 paddd m0, m1 pshufb m1, m2, m6 ; 67 78 89 9a shufpd m3, m1, 0x01 ; 45 56 67 78 pmaddwd m1, m9 ; efgh1 pshufb m2, m7 ; 89 9a ab bc pmaddwd m2, m10 ; efgh2 paddd m1, m2 pmaddwd m2, m10, m3 ; abcd2 pmaddwd m3, m8 ; efgh0 paddd m0, m4 paddd m1, m4 paddd m0, m2 paddd m1, m3 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 mova [dstq+r6*2], m0 add r6, 8 jl .h_w8_loop add srcq, ssq add dstq, dsq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m2, [base+subpel_filters+1+myq*8] WIN64_SPILL_XMM 11, 16 movd m5, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp punpcklbw m2, m2 pshufb m5, [base+pw_256] psraw m2, 8 ; sign-extend %if ARCH_X86_32 ALLOC_STACK -16*4 pshufd m0, m2, q0000 mov r6, ssq pshufd m1, m2, q1111 neg r6 pshufd m2, m2, q2222 mova m8, m0 mova m9, m1 mova m10, m2 cmp wd, 2 jne .v_w4 %else mov r6, ssq pshufd m8, m2, q0000 neg r6 cmp wd, 4 jg .v_w8 pshufd m9, m2, q1111 pshufd m10, m2, q2222 je .v_w4 %endif .v_w2: movd m1, [srcq+r6 *2] movd m3, [srcq+r6 *1] movd m2, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m0, [srcq+ssq*0] punpckldq m1, m3 ; 0 1 punpckldq m3, m2 ; 1 2 punpckldq m2, m4 ; 2 3 punpckldq m4, m0 ; 3 4 punpcklwd m1, m3 ; 01 12 punpcklwd m2, m4 ; 23 34 pxor m6, m6 .v_w2_loop: movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m4, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m4, m2 punpckldq m2, m0, m3 ; 4 5 movd m0, [srcq+ssq*0] punpckldq m3, m0 ; 5 6 punpcklwd m2, m3 ; 67 78 pmaddwd m3, m10, m2 ; a2 b2 paddd m4, m3 psrad m4, 5 packssdw m4, m4 pmaxsw m4, m6 pavgw m4, m6 pminsw m4, m5 movd [dstq+dsq*0], m4 pshuflw m4, m4, q3232 movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg 
.v_w2_loop RET .v_w4: %if ARCH_X86_32 shl wd, 14 lea srcq, [srcq+r6*2] lea wd, [wq+hq-(1<<16)] %if STACK_ALIGNMENT < 16 %define dstmp [esp+16*3] %endif .v_w4_loop0: mov dstmp, dstq movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] lea r6, [srcq+ssq*2] movq m3, [r6 +ssq*0] movq m4, [r6 +ssq*1] lea r6, [r6 +ssq*2] %else movq m1, [srcq+r6 *2] movq m2, [srcq+r6 *1] lea r6, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] %endif movq m0, [r6 +ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m0 ; 34 .v_w4_loop: pmaddwd m6, m8, m1 ; a0 pmaddwd m7, m8, m2 ; b0 mova m1, m3 pmaddwd m3, m9 ; a1 mova m2, m4 pmaddwd m4, m9 ; b1 paddd m6, m3 movq m3, [r6+ssq*0] paddd m7, m4 movq m4, [r6+ssq*1] lea r6, [r6+ssq*2] movq m0, [r6+ssq*0] punpcklwd m3, m4 ; 45 punpcklwd m4, m0 ; 56 pmaddwd m0, m10, m3 ; a2 paddd m6, m0 pmaddwd m0, m10, m4 ; b2 paddd m7, m0 psrad m6, 5 psrad m7, 5 packssdw m6, m7 pxor m7, m7 pmaxsw m6, m7 pavgw m6, m7 pminsw m6, m5 movq [dstq+dsq*0], m6 movhps [dstq+dsq*1], m6 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov dstq, dstmp add srcq, 8 movzx hd, ww add dstq, 8 sub wd, 1<<16 jg .v_w4_loop0 RET %else RET .v_w8: mova r6m, m8 shl wd, 5 pshufd m6, m2, q1111 lea wd, [wq+hq-(1<<8)] pshufd m7, m2, q2222 WIN64_PUSH_XMM 16 .v_w8_loop0: movu m9, [srcq+ r6*2] movu m11, [srcq+ r6*1] lea r7, [srcq+ssq*2] movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] mov r8, dstq movu m4, [r7 +ssq*0] punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m4 ; 34 punpckhwd m15, m4 .v_w8_loop: mova m3, r6m pmaddwd m0, m8, m3 ; a0 pmaddwd m2, m9, m3 ; a0' pmaddwd m1, m10, m3 ; b0 pmaddwd m3, m11 ; b0' mova m8, m12 pmaddwd m12, m6 ; a1 mova m9, m13 pmaddwd m13, m6 ; a1' mova m10, m14 pmaddwd m14, m6 ; b1 mova m11, m15 pmaddwd m15, m6 ; b1' paddd m0, m12 paddd m2, m13 movu m13, [r7+ssq*0] paddd m1, m14 paddd m3, m15 movu m15, 
[r7+ssq*1] lea r7, [r7+ssq*2] movu m4, [r7+ssq*0] punpcklwd m12, m13, m15 ; 45 punpckhwd m13, m15 punpcklwd m14, m15, m4 ; 56 punpckhwd m15, m4 pmaddwd m4, m7, m12 ; a2 paddd m0, m4 pmaddwd m4, m7, m13 ; a2' paddd m2, m4 pmaddwd m4, m7, m14 ; b2 paddd m1, m4 pmaddwd m4, m7, m15 ; b2' paddd m3, m4 REPX {psrad x, 5}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 pxor m2, m2 pmaxsw m0, m2 pmaxsw m1, m2 pavgw m0, m2 pavgw m1, m2 pminsw m0, m5 pminsw m1, m5 mova [r8+dsq*0], m0 mova [r8+dsq*1], m1 lea r8, [r8+dsq*2] sub hd, 2 jg .v_w8_loop add srcq, 16 add dstq, 16 movzx hd, wb sub wd, 1<<8 jg .v_w8_loop0 RET %endif .hv: cmp wd, 4 jg .hv_w8 WIN64_SPILL_XMM 12, 16 %if ARCH_X86_32 movd m3, r8m pshufb m3, [base+pw_256] %else movd m11, r8m pshufb m11, [base+pw_256] %endif movzx mxd, mxb movq m0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m2, [base+subpel_filters+1+myq*8] movddup m7, [base+pd_8704] sub srcq, 2 pshuflw m0, m0, q2121 pxor m6, m6 punpcklbw m6, m0 punpcklbw m2, m2 psraw m2, 8 ; sign-extend test dword r8m, 0x800 jz .hv_w2_10bpc movddup m7, [base+pd_2560] psraw m6, 2 psllw m2, 2 .hv_w2_10bpc: %if ARCH_X86_32 %assign regs_used 2 ALLOC_STACK -16*7 %assign regs_used 7 mov dstq, r0mp mov dsq, r1mp %define m11 [esp+16*4] pshufd m0, m2, q0000 pshufd m1, m2, q1111 pshufd m2, m2, q2222 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 neg ssq movu m3, [srcq+ssq*2] movu m4, [srcq+ssq*1] neg ssq %else pshufd m8, m2, q0000 mov r6, ssq pshufd m9, m2, q1111 neg r6 pshufd m10, m2, q2222 movu m3, [srcq+r6 *2] movu m4, [srcq+r6 *1] %endif movu m1, [srcq+ssq*0] movu m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu m2, [srcq+ssq*0] cmp wd, 4 je .hv_w4 mova m5, [base+spel_h_shuf2] REPX {pshufb x, m5}, m3, m4, m0, m1, m2 REPX {pmaddwd x, m6}, m3, m0, m4, m1, m2 phaddd m3, m0 ; 0 3 phaddd m4, m1 ; 1 2 phaddd m0, m2 ; 3 4 REPX {paddd x, m7}, m3, m4, m0 REPX {psrad x, 10}, m3, m4, m0 packssdw m3, m4 ; 0 3 1 2 packssdw m4, m0 ; 1 2 3 4 pshufd m2, m3, 
q1320 ; 0 1 2 3 punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 .hv_w2_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu m4, [srcq+ssq*0] pshufb m3, m5 pshufb m4, m5 pmaddwd m3, m6 pmaddwd m4, m6 phaddd m3, m4 pmaddwd m4, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m4, m2 paddd m3, m7 psrad m3, 10 ; 5 6 packssdw m0, m3 pshufd m2, m0, q2103 punpckhwd m2, m0 ; 45 56 mova m0, m3 pmaddwd m3, m10, m2 ; a2 b2 paddd m4, m3 psrad m4, 10 packssdw m4, m4 pxor m3, m3 pminsw m4, m11 pmaxsw m4, m3 movd [dstq+dsq*0], m4 pshuflw m4, m4, q1032 movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: %if ARCH_X86_32 %define m12 [esp+16*5] %define m13 [esp+16*6] %define m14 [base+spel_h_shufA] %define m15 [base+spel_h_shufB] pshufd m5, m6, q0000 pshufd m6, m6, q1111 mova m12, m5 mova m13, m6 %else WIN64_PUSH_XMM 16 mova m14, [base+spel_h_shufA] mova m15, [base+spel_h_shufB] pshufd m12, m6, q0000 pshufd m13, m6, q1111 %endif %macro HV_H_W4_6TAP 3-4 m15 ; dst, src, tmp, shufB pshufb %3, %2, m14 pmaddwd %3, m12 pshufb %2, %4 pmaddwd %2, m13 paddd %3, m7 paddd %1, %2, %3 %endmacro HV_H_W4_6TAP m3, m3, m5 HV_H_W4_6TAP m4, m4, m5 HV_H_W4_6TAP m5, m1, m5 HV_H_W4_6TAP m0, m0, m1 HV_H_W4_6TAP m2, m2, m1 REPX {psrad x, 10}, m3, m5, m4, m0, m2 packssdw m3, m5 ; 0 2 packssdw m4, m0 ; 1 3 packssdw m5, m2 ; 2 4 punpcklwd m1, m3, m4 ; 01 punpckhwd m3, m4 ; 23 punpcklwd m2, m4, m5 ; 12 punpckhwd m4, m5 ; 34 .hv_w4_loop: movu m0, [srcq+ssq*1] pmaddwd m5, m8, m1 ; a0 lea srcq, [srcq+ssq*2] pmaddwd m6, m8, m2 ; b0 mova m1, m3 pmaddwd m3, m9 ; a1 mova m2, m4 pmaddwd m4, m9 ; b1 paddd m5, m3 movu m3, [srcq+ssq*0] paddd m6, m4 HV_H_W4_6TAP m0, m0, m4 HV_H_W4_6TAP m3, m3, m4 psrad m4, m2, 16 psrad m0, 10 psrad m3, 10 packssdw m4, m0 ; 4 5 packssdw m0, m3 ; 5 6 punpcklwd m3, m4, m0 ; 45 punpckhwd m4, m0 ; 56 pmaddwd m0, m10, m3 ; a2 paddd m5, m0 pmaddwd m0, m10, m4 ; b2 paddd m6, m0 psrad m5, 10 psrad m6, 10 packssdw m5, m6 pxor m6, m6 pminsw m5, m11 pmaxsw 
m5, m6 movq [dstq+dsq*0], m5 movhps [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: RESET_STACK_STATE shr mxd, 16 movq m2, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m1, [base+subpel_filters+1+myq*8] movd m3, r8m movddup m4, [base+pd_8704] pshufb m3, [base+pw_256] pxor m0, m0 punpcklbw m0, m2 punpcklbw m1, m1 sub srcq, 4 psraw m1, 8 ; sign-extend test dword r8m, 0x800 jz .hv_w8_10bpc movddup m4, [base+pd_2560] psraw m0, 2 psllw m1, 2 .hv_w8_10bpc: %if ARCH_X86_32 %assign regs_used 2 ALLOC_STACK -16*9 %assign regs_used 7 mov dstq, r0mp mov dsq, r1mp mova [rsp+16*7], m4 %else ALLOC_STACK 16*7, 16 %endif mova [rsp+16*6], m3 pshufd m2, m0, q0000 mova [rsp+16*0], m2 pshufd m2, m0, q1111 mova [rsp+16*1], m2 pshufd m0, m0, q2222 mova [rsp+16*2], m0 pshufd m2, m1, q0000 mova [rsp+16*3], m2 pshufd m2, m1, q1111 mova [rsp+16*4], m2 pshufd m1, m1, q2222 mova [rsp+16*5], m1 mov r6, ssq neg r6 %if ARCH_X86_32 shl wd, 14 lea r4d, [wq+hq-(1<<16)] %if STACK_ALIGNMENT < 16 %define srcmp [esp+16*8+4*0] %define dstmp [esp+16*8+4*1] %endif %macro HV_H_6TAP 3-6 [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-2], mul[1-3] punpcklwd %1, %2, %3 ; 01 12 23 34 punpckhwd %2, %3 ; 45 56 67 78 pmaddwd %3, %4, %1 ; a0 shufpd %1, %2, 0x01 ; 23 34 45 56 pmaddwd %2, %6 ; a2 pmaddwd %1, %5 ; a1 paddd %2, %3 paddd %1, %2 %endmacro .hv_w8_loop0: mov srcmp, srcq mov dstmp, dstq movu m5, [srcq+r6*2+0] movu m6, [srcq+r6*2+2] mova m7, [rsp+16*0] mova m1, [rsp+16*1] mova m0, [rsp+16*2] HV_H_6TAP m2, m5, m6, m7, m1, m0 movu m5, [srcq+r6*1+0] movu m6, [srcq+r6*1+2] HV_H_6TAP m3, m5, m6, m7, m1, m0 movu m5, [srcq+ssq*0+0] movu m6, [srcq+ssq*0+2] HV_H_6TAP m4, m5, m6, m7, m1, m0 movu m5, [srcq+ssq*1+0] movu m6, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] HV_H_6TAP m0, m5, m6, m7, m1 movu m5, [srcq+ssq*0+0] movu m6, [srcq+ssq*0+2] HV_H_6TAP m1, m5, m6, m7 mova m5, [rsp+16*7] REPX {paddd x, m5}, m2, m3, m4, m0, m1 REPX {psrad x, 10}, m2, m4, m3, 
m0, m1 packssdw m2, m4 ; 0 2 packssdw m3, m0 ; 1 3 packssdw m4, m1 ; 2 4 punpcklwd m0, m2, m3 ; 01 punpckhwd m2, m3 ; 23 punpcklwd m1, m3, m4 ; 12 punpckhwd m3, m4 ; 34 .hv_w8_loop: mova m5, [rsp+16*3] mova m6, [rsp+16*4] pmaddwd m4, m0, m5 ; a0 pmaddwd m5, m1 ; b0 mova m0, m2 pmaddwd m2, m6 ; a1 mova m1, m3 pmaddwd m3, m6 ; b1 paddd m4, m2 movu m2, [srcq+ssq*1+0] paddd m5, m3 movu m3, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] HV_H_6TAP m6, m2, m3 movu m2, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+2] HV_H_6TAP m7, m2, m3 mova m2, [rsp+16*7] psrad m3, m1, 16 paddd m6, m2 paddd m7, m2 psrad m6, 10 psrad m7, 10 packssdw m3, m6 ; 4 5 packssdw m6, m7 ; 5 6 mova m7, [rsp+16*5] punpcklwd m2, m3, m6 ; 45 punpckhwd m3, m6 ; 56 pmaddwd m6, m2, m7 ; a2 pmaddwd m7, m3 ; b2 paddd m4, m6 paddd m5, m7 psrad m4, 10 psrad m5, 10 packssdw m4, m5 pxor m5, m5 pminsw m4, [rsp+16*6] pmaxsw m4, m5 movq [dstq+dsq*0], m4 movhps [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop mov srcq, srcmp mov dstq, dstmp movzx hd, r4w add srcq, 8 add dstq, 8 sub r4d, 1<<16 %else shl wd, 5 lea r8d, [wq+hq-256] %macro HV_H_6TAP 5-9 [spel_h_shufA], [rsp+16*0], [rsp+16*1], [rsp+16*2] ; dst, src[1-3], shift, shuf, mul[1-3] %ifid %6 REPX {pshufb x, %6}, %2, %3, %4 %else mova %1, %6 pshufb %2, %1 ; 01 12 23 34 pshufb %3, %1 ; 45 56 67 78 pshufb %4, %1 ; 89 9a ab bc %endif pmaddwd %1, %7, %2 shufpd %2, %3, 0x01 ; 23 34 45 56 pmaddwd %2, %8 paddd %1, %2 pmaddwd %2, %9, %3 paddd %1, %2 pmaddwd %2, %7, %3 shufpd %3, %4, 0x01 ; 67 78 89 9a pmaddwd %4, %9 pmaddwd %3, %8 paddd %1, m4 paddd %2, m4 paddd %3, %4 paddd %2, %3 psrad %1, %5 psrad %2, %5 packssdw %1, %2 %endmacro .hv_w8_loop0: mova m5, [spel_h_shufA] movu m0, [srcq+r6*2+ 0] mova m6, [rsp+16*0] movu m1, [srcq+r6*2+ 8] mova m7, [rsp+16*1] movu m2, [srcq+r6*2+16] mova m8, [rsp+16*2] HV_H_6TAP m9, m0, m1, m2, 10, m5, m6, m7, m8 movu m0, [srcq+r6*1+ 0] movu m1, [srcq+r6*1+ 8] movu m2, [srcq+r6*1+16] lea r4, [srcq+ssq*2] HV_H_6TAP m11, m0, m1, m2, 10, m5, 
m6, m7, m8 movu m0, [srcq+ssq*0+ 0] movu m1, [srcq+ssq*0+ 8] movu m2, [srcq+ssq*0+16] mov r7, dstq HV_H_6TAP m13, m0, m1, m2, 10, m5, m6, m7, m8 movu m0, [srcq+ssq*1+ 0] movu m1, [srcq+ssq*1+ 8] movu m2, [srcq+ssq*1+16] HV_H_6TAP m15, m0, m1, m2, 10, m5, m6, m7, m8 movu m0, [r4+ssq*0+ 0] movu m1, [r4+ssq*0+ 8] movu m2, [r4+ssq*0+16] HV_H_6TAP m5, m0, m1, m2, 10, m5, m6, m7, m8 punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m5 ; 34 punpckhwd m15, m5 .hv_w8_loop: mova m3, [rsp+16*3] mova m7, [rsp+16*4] pmaddwd m0, m8, m3 ; a0 mova m8, m12 pmaddwd m2, m9, m3 ; a0' mova m9, m13 pmaddwd m1, m10, m3 ; b0 mova m10, m14 pmaddwd m3, m11 ; b0' mova m11, m15 REPX {pmaddwd x, m7}, m12, m13, m14, m15 movu m6, [r4+ssq*1+ 0] paddd m0, m12 movu m7, [r4+ssq*1+ 8] paddd m2, m13 movu m12, [r4+ssq*1+16] paddd m1, m14 lea r4, [r4+ssq*2] paddd m3, m15 HV_H_6TAP m15, m6, m7, m12, 10 movu m6, [r4+ssq*0+ 0] movu m7, [r4+ssq*0+ 8] movu m14, [r4+ssq*0+16] punpcklwd m12, m5, m15 ; 45 punpckhwd m13, m5, m15 HV_H_6TAP m5, m6, m7, m14, 10 mova m7, [rsp+16*5] punpcklwd m14, m15, m5 ; 56 punpckhwd m15, m5 pmaddwd m6, m12, m7 ; a2 paddd m0, m6 pmaddwd m6, m13, m7 ; a2' paddd m2, m6 pmaddwd m6, m14, m7 ; b2 pmaddwd m7, m15 ; b2' paddd m1, m6 mova m6, [rsp+16*6] paddd m3, m7 REPX {psrad x, 10}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 pxor m2, m2 pminsw m0, m6 pminsw m1, m6 pmaxsw m0, m2 pmaxsw m1, m2 mova [r7+dsq*0], m0 mova [r7+dsq*1], m1 lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w8_loop add srcq, 16 add dstq, 16 movzx hd, r8b sub r8d, 1<<8 %endif jg .hv_w8_loop0 RET PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc PUT_8TAP_FN sharp, SHARP, SHARP cglobal put_8tap_16bpc, 0, 9, 0, dst, ds, src, ss, w, h, mx, my 
%if ARCH_X86_32 %define mxb r0b %define mxd r0 %define mxq r0 %define myb r1b %define myd r1 %define myq r1 %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, put_ssse3 movifnidn wd, wm movifnidn srcq, srcmp movifnidn ssq, ssmp movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jz mangle(private_prefix %+ _put_6tap_16bpc_ssse3).put .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] WIN64_SPILL_XMM 15 movd m7, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp punpcklbw m3, m3 pshufb m7, [base+pw_256] psraw m3, 8 ; sign-extend %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 cmp wd, 2 jne .v_w4 .v_w2: movd m1, [srcq+ssq*0] movd m4, [srcq+ssq*1] movd m2, [srcq+ssq*2] add srcq, r6 movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m6, [srcq+ssq*2] add srcq, r6 movd m0, [srcq+ssq*0] punpckldq m1, m4 ; 0 1 punpckldq m4, m2 ; 1 2 punpckldq m2, m5 ; 2 3 punpckldq m5, m3 ; 3 4 punpckldq m3, m6 ; 4 5 punpckldq m6, m0 ; 5 6 punpcklwd m1, m4 ; 01 12 punpcklwd m2, m5 ; 23 34 punpcklwd m3, m6 ; 45 56 pxor m6, m6 .v_w2_loop: movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 punpckldq m3, m0, m4 ; 6 7 movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 5 packssdw m5, m5 pmaxsw m5, m6 pavgw m5, m6 pminsw m5, m7 movd [dstq+dsq*0], m5 pshuflw m5, m5, q3232 
movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 shl wd, 14 %if STACK_ALIGNMENT < 16 mov [esp+4*29], srcq mov [esp+4*30], dstq %else mov srcmp, srcq %endif lea wd, [wq+hq-(1<<16)] %else shl wd, 6 mov r7, srcq mov r8, dstq lea wd, [wq+hq-(1<<8)] %endif .v_w4_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] movq m3, [srcq+ssq*2] add srcq, r6 movq m4, [srcq+ssq*0] movq m5, [srcq+ssq*1] movq m6, [srcq+ssq*2] add srcq, r6 movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_w4_loop_start .v_w4_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_w4_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m3 psrad m1, 5 psrad m2, 5 packssdw m1, m2 pxor m2, m2 pmaxsw m1, m2 pavgw m1, m2 pminsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*29] mov dstq, [esp+4*30] movzx hd, ww add srcq, 8 add dstq, 8 mov [esp+4*29], srcq mov [esp+4*30], dstq %else mov srcq, srcmp mov dstq, dstmp movzx hd, ww add srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif sub wd, 1<<16 %else .v_w4_loop: pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 paddd m13, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m14, m11, m5 ; a3 punpcklwd m6, m0 ; 
78 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packssdw m12, m13 pxor m13, m13 pmaxsw m12, m13 pavgw m12, m13 pminsw m12, m7 movq [dstq+dsq*0], m12 movhps [dstq+dsq*1], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .v_w4_loop0 RET .h: RESET_STACK_STATE test myd, 0xf00 jnz .hv mov myd, r8m movd m5, r8m shr myd, 11 movddup m4, [base+put_8tap_h_rnd+myq*8] movifnidn dsq, dsmp pshufb m5, [base+pw_256] cmp wd, 4 jle mangle(private_prefix %+ _put_6tap_16bpc_ssse3).h_w4 WIN64_SPILL_XMM 12 shr mxd, 16 movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] %if UNIX64 mov wd, wd %endif lea srcq, [srcq+wq*2] punpcklbw m3, m3 lea dstq, [dstq+wq*2] psraw m3, 8 neg wq %if ARCH_X86_32 ALLOC_STACK -16*4 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m0, [srcq+r6*2- 6] movu m1, [srcq+r6*2+ 2] pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 pshufb m0, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m2, m8 ; abcd0 pmaddwd m0, m9 ; abcd1 pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 pshufb m1, m7 ; 6 7 7 8 8 9 9 a paddd m2, m4 paddd m0, m2 pmaddwd m2, m10, m3 ; abcd2 pmaddwd m3, m8 ; efgh0 paddd m0, m2 pmaddwd m2, m11, m1 ; abcd3 pmaddwd m1, m9 ; efgh1 paddd m0, m2 movu m2, [srcq+r6*2+10] paddd m3, m4 paddd m1, m3 pshufb m3, m2, m6 ; 8 9 9 a a b b c pshufb m2, m7 ; a b b c c d d e pmaddwd m3, m10 ; efgh2 pmaddwd m2, m11 ; efgh3 paddd m1, m3 paddd m1, m2 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 mova [dstq+r6*2], m0 add r6, 8 jl .h_w8_loop add srcq, ssq add dstq, dsq dec hd jg .h_w8_loop0 RET .hv: RESET_STACK_STATE %if ARCH_X86_32 movd m4, r8m pshufb m4, [base+pw_256] %else %if WIN64 
; put_8tap_16bpc, 2-D (.hv) path: end of the .hv prologue, the complete
; w == 2 case, the setup shared by .hv_w4/.hv_w8, and the body of the
; PUT_8TAP_HV_H helper macro (its %endmacro follows this span).
; The first two %endif lines close the "%if WIN64" / "%if ARCH_X86_32"
; conditionals opened immediately before this span.
    ALLOC_STACK 16*6, 16
%endif
    ; m15 = r8m shuffled through pw_256; used as the pminsw upper clamp
    ; (presumably a word broadcast of bitdepth_max - TODO confirm).
    movd m15, r8m
    pshufb m15, [base+pw_256]
%endif
    ; movzx does not modify flags, so both branches test the same "cmp wd, 4":
    ; w > 4 -> .hv_w8, w == 4 -> .hv_w4 (with mxd already reduced to its low
    ; byte), otherwise fall through to the w == 2 code.
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    je .hv_w4
    movq m0, [base+subpel_filters+mxq*8]
    ; Vertical filter index: high field, or the low byte when h < 6.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovb myd, mxd
    movq m3, [base+subpel_filters+myq*8]
    movddup m6, [base+pd_8704] ; rounding term added before the ">> 10" below
    pshuflw m0, m0, q2121 ; keep words 1-2 (tap bytes 2-5), duplicated
    pxor m7, m7
    punpcklbw m7, m0 ; horizontal taps in the high byte of each word (tap << 8)
    punpcklbw m3, m3
    psraw m3, 8 ; sign-extend
    ; Bit 11 of r8m selects the alternative scaling below; the label name
    ; suggests the bit-clear case is 10 bpc (presumably 12 bpc otherwise).
    test dword r8m, 0x800
    jz .hv_w2_10bpc
    movddup m6, [base+pd_2560]
    psraw m7, 2
    psllw m3, 2
.hv_w2_10bpc:
%if ARCH_X86_32
    mov dstq, dstmp
    mov dsq, dsmp
    mova m5, [base+spel_h_shuf2]
    ALLOC_STACK -16*8
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    ; m9/m11-m15 are stack slots on x86-32; m4 holds the clamp vector that was
    ; set up before this span.
    mova m9, m5
    mova m11, m0
    mova m12, m1
    mova m13, m2
    mova m14, m3
    mova m15, m4
%else
    mova m9, [base+spel_h_shuf2]
    pshufd m11, m3, q0000
    pshufd m12, m3, q1111
    pshufd m13, m3, q2222
    pshufd m14, m3, q3333
%endif
    ; Step back one pixel (2 bytes) and three rows.
    lea r6, [ssq*3]
    sub srcq, 2
    sub srcq, r6
    ; Horizontal pass over rows 0-6: shuffle, multiply by the taps in m7, and
    ; add pairs of products with phaddd.
    movu m2, [srcq+ssq*0]
    movu m3, [srcq+ssq*1]
    movu m1, [srcq+ssq*2]
    add srcq, r6
    movu m4, [srcq+ssq*0]
%if ARCH_X86_32
    REPX {pshufb x, m5}, m2, m3, m1, m4
%else
    REPX {pshufb x, m9}, m2, m3, m1, m4
%endif
    REPX {pmaddwd x, m7}, m2, m3, m1, m4
    phaddd m2, m3 ; 0 1
    phaddd m1, m4 ; 2 3
    movu m3, [srcq+ssq*1]
    movu m4, [srcq+ssq*2]
    add srcq, r6
    movu m0, [srcq+ssq*0]
%if ARCH_X86_32
    REPX {pshufb x, m5}, m3, m4, m0
%else
    REPX {pshufb x, m9}, m3, m4, m0
%endif
    REPX {pmaddwd x, m7}, m3, m4, m0
    phaddd m3, m4 ; 4 5
    phaddd m0, m0 ; 6 6
    REPX {paddd x, m6}, m2, m1, m3, m0
    REPX {psrad x, 10}, m2, m1, m3, m0
    packssdw m2, m1 ; 0 1 2 3
    packssdw m3, m0 ; 4 5 6 _
    ; Build the interleaved row pairs consumed by the vertical pmaddwd.
    palignr m4, m3, m2, 4 ; 1 2 3 4
    pshufd m5, m3, q0321 ; 5 6 _ _
    punpcklwd m1, m2, m4 ; 01 12
    punpckhwd m2, m4 ; 23 34
    punpcklwd m3, m5 ; 45 56
.hv_w2_loop:
    ; Per iteration: horizontally filter two new rows (7, 8), then apply the
    ; vertical taps to produce two output rows (lanes "a" and "b").
    movu m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movu m5, [srcq+ssq*0]
    pshufb m4, m9
    pshufb m5, m9
    pmaddwd m4, m7
    pmaddwd m5, m7
    phaddd m4, m5
    pmaddwd m5, m11, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m12 ; a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m13 ; a2 b2
    paddd m5, m3
    paddd m4, m6
    psrad m4, 10 ; 7 8
    packssdw m0, m4
    pshufd m3, m0, q2103
    punpckhwd m3, m0 ; 67 78
    mova m0, m4 ; keep the new rows for the next iteration
    pmaddwd m4, m14, m3 ; a3 b3
    paddd m5, m4
    psrad m5, 10
    packssdw m5, m5
    pxor m4, m4
    pminsw m5, m15 ; clamp at the upper bound
    pmaxsw m5, m4 ; clamp at zero
    movd [dstq+dsq*0], m5
    pshuflw m5, m5, q3232 ; words 2-3 (row "b") into the low dword
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w8:
    ; w > 4 uses the high field of mxd as the horizontal filter index; w == 4
    ; enters at .hv_w4 with mxd already reduced to its low byte.
    shr mxd, 16
.hv_w4:
    movq m2, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovb myd, mxd
    movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    RESET_STACK_STATE
    mov dstq, dstmp
    mov dsq, dsmp
    mova m0, [base+spel_h_shufA]
    mova m1, [base+spel_h_shufB]
    mova m6, [base+pd_512]
    ALLOC_STACK -16*15
    mova m8, m0
    mova m9, m1
    mova m14, m6
%else
    mova m8, [base+spel_h_shufA]
    mova m9, [base+spel_h_shufB]
%endif
    pxor m0, m0
    punpcklbw m0, m2 ; horizontal taps in the high byte of each word (tap << 8)
    punpcklbw m3, m3
    psraw m3, 8 ; vertical taps sign-extended to words
    ; Same bit-11 test of r8m as in the w == 2 path.
    test dword r8m, 0x800
    jz .hv_w4_10bpc
    psraw m0, 2
    psllw m3, 2
.hv_w4_10bpc:
    ; Step back three pixels (6 bytes) and three rows.
    lea r6, [ssq*3]
    sub srcq, 6
    sub srcq, r6
%if ARCH_X86_32
%define tmp esp+16*8
    ; wd = (w << 14) + h - (1 << 16): h stays in the low 16 bits, and the upper
    ; bits count the remaining 4-pixel column strips.
    shl wd, 14
%if STACK_ALIGNMENT < 16
    mov [esp+4*61], srcq
    mov [esp+4*62], dstq
%else
    mov srcmp, srcq
%endif
    mova [tmp+16*5], m4 ; clamp vector, read back by pminsw in the loop
    lea wd, [wq+hq-(1<<16)]
    pshufd m1, m0, q0000
    pshufd m2, m0, q1111
    pshufd m5, m0, q2222
    pshufd m0, m0, q3333
    mova m10, m1
    mova m11, m2
    mova m12, m5
    mova m13, m0
%else
%if WIN64
%define tmp rsp
%else
%define tmp rsp-104 ; red zone
%endif
    ; wd = (w << 6) + h - (1 << 8): h stays in the low byte, and the upper bits
    ; count the remaining 4-pixel column strips. r7/r8 keep the column bases.
    shl wd, 6
    mov r7, srcq
    mov r8, dstq
    lea wd, [wq+hq-(1<<8)]
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
    mova [tmp+16*5], m15 ; clamp vector, read back by pminsw in the loop
%endif
    ; The four vertical tap pairs are stored in tmp slots 1-4.
    pshufd m0, m3, q0000
    pshufd m1, m3, q1111
    pshufd m2, m3, q2222
    pshufd m3, m3, q3333
    mova [tmp+16*1], m0
    mova [tmp+16*2], m1
    mova [tmp+16*3], m2
    mova [tmp+16*4], m3
; PUT_8TAP_HV_H: 8-tap horizontal filter producing four dword results.
;   %1 = register number holding the pixels at src+0; receives the result
;   %2 = register number holding the pixels at src+8 (clobbered)
;   %3 = scratch register number
;   %4 = right-shift amount applied to the sums
;   %5 = rounding operand added before the shift (defaults to m14)
; Expects m8/m9 = spel_h_shufA/B and m10-m13 = the four horizontal tap pairs,
; as set up above.
%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
    pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
    pshufb m%1, m9 ; 2 3 3 4 4 5 5 6
    pmaddwd m%3, m10
    pmaddwd m%1, m11
    paddd m%3, %5
    paddd m%1, m%3
    pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
    pshufb m%2, m9 ; 6 7 7 8 8 9 9 a
    pmaddwd m%3, m12
    pmaddwd m%2, m13
    paddd m%1, m%3
    paddd m%1, m%2
    psrad m%1, %4
%endmacro .hv_w4_loop0: %if ARCH_X86_64 mova m14, [pd_512] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] movu m6, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 4, 1, 0, 10 PUT_8TAP_HV_H 5, 2, 0, 10 PUT_8TAP_HV_H 6, 3, 0, 10 movu m7, [srcq+ssq*0+0] movu m2, [srcq+ssq*0+8] movu m1, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] PUT_8TAP_HV_H 7, 2, 0, 10 PUT_8TAP_HV_H 1, 3, 0, 10 movu m2, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 2, 3, 0, 10 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 10 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_w4_loop_start .hv_w4_loop: mova m1, [tmp+16*6] mova m2, m15 .hv_w4_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*6], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 9 psrad m2, 9 packssdw m1, m2 pxor m7, m7 pmaxsw m1, m7 pavgw m7, m1 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*61] mov dstq, [esp+4*62] add srcq, 8 add dstq, 8 mov [esp+4*61], srcq mov [esp+4*62], dstq %else mov srcq, srcmp mov dstq, dstmp add 
srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif movzx hd, ww sub wd, 1<<16 %else .hv_w4_loop: mova m15, [tmp+16*1] pmaddwd m14, m15, m1 ; a0 pmaddwd m15, m2 ; b0 mova m7, [tmp+16*2] mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m14, m3 paddd m15, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m14, m5 paddd m15, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m14, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m15, m7 ; b3 psrad m14, 9 psrad m15, 9 packssdw m14, m15 pxor m7, m7 pmaxsw m14, m7 pavgw m7, m14 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .hv_w4_loop0 RET %undef tmp %if ARCH_X86_32 DECLARE_REG_TMP 2, 1, 6, 4 %elif WIN64 DECLARE_REG_TMP 6, 4, 7, 4 %else DECLARE_REG_TMP 6, 7, 7, 8 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_6tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my %define base t2-prep_ssse3 %if ARCH_X86_32 %define mxb r0b %define mxd r0 %define mxq r0 %define myb r2b %define myd r2 %define myq r2 %endif imul mxd, mxm, 0x010101 add mxd, t0d ; 6tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 6tap_v, my, 4tap_v LEA t2, prep_ssse3 movifnidn wd, wm movifnidn hd, hm movifnidn srcq, srcmp test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v .prep: tzcnt wd, wd mov myd, r7m ; bitdepth_max movzx wd, word 
[base+prep_ssse3_table+wq*2] mova m5, [base+pw_8192] shr myd, 11 add wq, t2 movddup m4, [base+prep_mul+myq*8] movifnidn ssq, ssmp movifnidn tmpq, tmpmp lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq .h: RESET_STACK_STATE test myd, 0xf00 jnz .hv movifnidn ssq, r2mp movddup m5, [base+prep_8tap_1d_rnd] cmp wd, 4 je mangle(private_prefix %+ _prep_8tap_16bpc_ssse3).h_w4 WIN64_SPILL_XMM 10 shr mxd, 16 movq m2, [base+subpel_filters+1+mxq*8] movifnidn tmpq, r0mp mova m4, [base+spel_h_shufA] add wd, wd mova m6, [base+spel_h_shufB] add srcq, wq punpcklbw m2, m2 add tmpq, wq psraw m2, 8 neg wq test dword r7m, 0x800 jnz .h_w8_12bpc psllw m2, 2 .h_w8_12bpc: pshufd m7, m2, q0000 %if ARCH_X86_32 ALLOC_STACK -16*2 %define m8 [rsp+16*0] %define m9 [rsp+16*1] pshufd m0, m2, q1111 pshufd m1, m2, q2222 mova m8, m0 mova m9, m1 %else pshufd m8, m2, q1111 pshufd m9, m2, q2222 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m3, [srcq+r6-4] movu m2, [srcq+r6+8] pshufb m0, m3, m4 ; 01 12 23 34 pmaddwd m0, m7 ; abcd0 pshufb m3, m6 ; 23 34 45 56 pmaddwd m1, m8, m3 ; abcd1 paddd m0, m1 pshufb m1, m2, m4 ; 67 78 89 9a shufpd m3, m1, 0x01; 45 56 67 78 pmaddwd m1, m8 ; efgh1 pshufb m2, m6 ; 89 9a ab bc pmaddwd m2, m9 ; efgh2 paddd m1, m2 pmaddwd m2, m9 , m3 ; abcd2 pmaddwd m3, m7 ; efgh0 paddd m0, m5 paddd m1, m5 paddd m0, m2 paddd m1, m3 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq+r6], m0 add r6, 16 jl .h_w8_loop add srcq, ssq sub tmpq, wq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movddup m5, [base+prep_8tap_1d_rnd] movq m2, [base+subpel_filters+1+myq*8] WIN64_SPILL_XMM 11, 16 movifnidn ssq, r2mp movifnidn tmpq, r0mp punpcklbw m2, m2 sub srcq, ssq psraw m2, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m2, 2 .v_12bpc: sub srcq, ssq %if ARCH_X86_32 ALLOC_STACK -16*4 pshufd m0, m2, q0000 mov r6d, wd pshufd m1, m2, q1111 shl r6d, 14 pshufd m2, m2, q2222 lea r6d, [r6+hq-(1<<16)] mova m8, m0 mova m9, m1 mova m10, m2 %if STACK_ALIGNMENT < 16 
%define srcmp [esp+16*3+4*0] %define tmpmp [esp+16*3+4*1] %endif .v_w4_loop0: mov srcmp, srcq mov tmpmp, tmpq %else pshufd m8, m2, q0000 and wd, -8 jnz .v_w8 pshufd m9, m2, q1111 pshufd m10, m2, q2222 %endif movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m0 ; 34 .v_w4_loop: pmaddwd m6, m8, m1 ; a0 pmaddwd m7, m8, m2 ; b0 mova m1, m3 pmaddwd m3, m9 ; a1 mova m2, m4 pmaddwd m4, m9 ; b1 paddd m6, m3 movq m3, [srcq+ssq*0] paddd m7, m4 movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] punpcklwd m3, m4 ; 45 punpcklwd m4, m0 ; 56 pmaddwd m0, m10, m3 ; a2 paddd m6, m5 paddd m6, m0 pmaddwd m0, m10, m4 ; b2 paddd m7, m5 paddd m7, m0 psrad m6, 4 psrad m7, 4 packssdw m6, m7 %if ARCH_X86_32 movq [tmpq+wq*0], m6 movhps [tmpq+wq*2], m6 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w4_loop mov srcq, srcmp mov tmpq, tmpmp movzx hd, r6w add srcq, 8 add tmpq, 8 sub r6d, 1<<16 jg .v_w4_loop0 RET %else mova [tmpq], m6 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: mova r6m, m8 lea r6d, [wq*4-(1<<5)] pshufd m6, m2, q1111 lea r6d, [hq+r6*8] pshufd m7, m2, q2222 WIN64_PUSH_XMM 16 .v_w8_loop0: movu m9, [srcq+ssq*0] lea r5, [srcq+ssq*2] movu m11, [srcq+ssq*1] mov r7, tmpq movu m13, [r5+ssq*0] movu m15, [r5+ssq*1] lea r5, [r5+ssq*2] movu m4, [r5+ssq*0] punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m4 ; 34 punpckhwd m15, m4 .v_w8_loop: mova m3, r6m pmaddwd m0, m8, m3 ; a0 pmaddwd m2, m9, m3 ; a0' pmaddwd m1, m10, m3 ; b0 pmaddwd m3, m11 ; b0' mova m8, m12 pmaddwd m12, m6 ; a1 mova m9, m13 pmaddwd m13, m6 ; a1' mova m10, m14 pmaddwd m14, m6 ; b1 mova m11, m15 pmaddwd m15, m6 ; b1' paddd m0, m12 paddd m2, m13 movu m13, [r5+ssq*0] paddd m1, m14 paddd m3, m15 movu m15, [r5+ssq*1] lea r5, 
[r5+ssq*2] movu m4, [r5+ssq*0] REPX {paddd x, m5}, m0, m2, m1, m3 punpcklwd m12, m13, m15 ; 45 punpckhwd m13, m15 punpcklwd m14, m15, m4 ; 56 punpckhwd m15, m4 pmaddwd m4, m7, m12 ; a2 paddd m0, m4 pmaddwd m4, m7, m13 ; a2' paddd m2, m4 pmaddwd m4, m7, m14 ; b2 paddd m1, m4 pmaddwd m4, m7, m15 ; b2' paddd m3, m4 REPX {psrad x, 4}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 mova [r7+wq*0], m0 mova [r7+wq*2], m1 lea r7, [r7+wq*4] sub hd, 2 jg .v_w8_loop add srcq, 16 add tmpq, 16 movzx hd, r6b sub r6d, 1<<8 jg .v_w8_loop0 RET %endif .hv: and wd, -8 jnz .hv_w8 movzx mxd, mxb movq m0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m2, [base+subpel_filters+1+myq*8] WIN64_SPILL_XMM 15 movifnidn ssq, r2mp movifnidn tmpq, r0mp mova m7, [base+prep_8tap_2d_rnd] sub srcq, 2 pshuflw m0, m0, q2121 pxor m6, m6 punpcklbw m6, m0 punpcklbw m2, m2 psraw m6, 4 psraw m2, 8 test dword r7m, 0x800 jz .hv_w4_10bpc psraw m6, 2 .hv_w4_10bpc: %if ARCH_X86_32 %assign regs_used 4 ALLOC_STACK -16*7 %assign regs_used 7 %define m10 [esp+16*3] %define m12 [esp+16*5] %define m13 [esp+16*6] %define m14 [base+spel_h_shufA] %define m11 [base+spel_h_shufB] pshufd m0, m2, q0000 pshufd m1, m2, q1111 pshufd m2, m2, q2222 pshufd m5, m6, q0000 pshufd m6, m6, q1111 mova m8, m0 mova m9, m1 mova m10, m2 mova m12, m5 mova m13, m6 neg ssq movu m3, [srcq+ssq*2] movu m4, [srcq+ssq*1] neg ssq %else mov r6, ssq pshufd m8, m2, q0000 neg r6 pshufd m9, m2, q1111 movu m3, [srcq+r6 *2] pshufd m10, m2, q2222 movu m4, [srcq+r6 *1] pshufd m12, m6, q0000 mova m14, [base+spel_h_shufA] pshufd m13, m6, q1111 mova m11, [base+spel_h_shufB] %endif movu m1, [srcq+ssq*0] movu m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu m2, [srcq+ssq*0] HV_H_W4_6TAP m3, m3, m5, m11 HV_H_W4_6TAP m4, m4, m5, m11 HV_H_W4_6TAP m5, m1, m5, m11 HV_H_W4_6TAP m0, m0, m1, m11 HV_H_W4_6TAP m2, m2, m1, m11 REPX {psrad x, 6}, m3, m5, m4, m0, m2 packssdw m3, m5 ; 0 2 packssdw m4, m0 ; 1 3 packssdw m5, m2 ; 2 4 punpcklwd 
m1, m3, m4 ; 01 punpckhwd m3, m4 ; 23 punpcklwd m2, m4, m5 ; 12 punpckhwd m4, m5 ; 34 .hv_w4_loop: movu m0, [srcq+ssq*1] pmaddwd m5, m8, m1 ; a0 lea srcq, [srcq+ssq*2] pmaddwd m6, m8, m2 ; b0 mova m1, m3 pmaddwd m3, m9 ; a1 mova m2, m4 pmaddwd m4, m9 ; b1 paddd m5, m3 movu m3, [srcq+ssq*0] paddd m6, m4 HV_H_W4_6TAP m0, m0, m4, m11 HV_H_W4_6TAP m3, m3, m4, m11 psrad m4, m2, 16 psrad m0, 6 psrad m3, 6 packssdw m4, m0 ; 4 5 packssdw m0, m3 ; 5 6 punpcklwd m3, m4, m0 ; 45 punpckhwd m4, m0 ; 56 pmaddwd m0, m10, m3 ; a2 paddd m5, m7 paddd m5, m0 pmaddwd m0, m10, m4 ; b2 paddd m6, m7 paddd m6, m0 psrad m5, 6 psrad m6, 6 packssdw m5, m6 mova [tmpq], m5 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: RESET_STACK_STATE shr mxd, 16 movq m2, [base+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m1, [base+subpel_filters+1+myq*8] movifnidn ssq, r2mp mova m4, [base+prep_8tap_2d_rnd] pxor m0, m0 punpcklbw m0, m2 punpcklbw m1, m1 sub srcq, 4 psraw m0, 4 psraw m1, 8 test dword r7m, 0x800 jz .hv_w8_10bpc psraw m0, 2 .hv_w8_10bpc: %if ARCH_X86_32 %assign regs_used 1 ALLOC_STACK -16*9 %assign regs_used 7 mov tmpq, r0mp mova [rsp+16*7], m4 %else %if WIN64 PUSH r8 %assign regs_used 9 %endif ALLOC_STACK 16*6, 16 %endif pshufd m2, m0, q0000 mova [rsp+16*0], m2 pshufd m2, m0, q1111 mova [rsp+16*1], m2 pshufd m0, m0, q2222 mova [rsp+16*2], m0 pshufd m2, m1, q0000 mova [rsp+16*3], m2 pshufd m2, m1, q1111 mova [rsp+16*4], m2 pshufd m1, m1, q2222 mova [rsp+16*5], m1 mov r6, ssq neg r6 %if ARCH_X86_32 mov r5d, wd shl r5d, 14 lea r5d, [r5+hq-(1<<16)] %if STACK_ALIGNMENT < 16 %define srcmp [esp+16*8+4*0] %define tmpmp [esp+16*8+4*1] %endif .hv_w8_loop0: mov srcmp, srcq mov tmpmp, tmpq movu m5, [srcq+r6*2+0] movu m6, [srcq+r6*2+2] mova m7, [rsp+16*0] mova m1, [rsp+16*1] mova m0, [rsp+16*2] HV_H_6TAP m2, m5, m6, m7, m1, m0 movu m5, [srcq+r6*1+0] movu m6, [srcq+r6*1+2] HV_H_6TAP m3, m5, m6, m7, m1, m0 movu m5, [srcq+ssq*0+0] movu m6, [srcq+ssq*0+2] HV_H_6TAP m4, m5, 
m6, m7, m1, m0 movu m5, [srcq+ssq*1+0] movu m6, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] HV_H_6TAP m0, m5, m6, m7, m1 movu m5, [srcq+ssq*0+0] movu m6, [srcq+ssq*0+2] HV_H_6TAP m1, m5, m6, m7 mova m5, [rsp+16*7] REPX {paddd x, m5}, m2, m3, m4, m0, m1 REPX {psrad x, 6 }, m2, m4, m3, m0, m1 packssdw m2, m4 ; 0 2 packssdw m3, m0 ; 1 3 packssdw m4, m1 ; 2 4 punpcklwd m0, m2, m3 ; 01 punpckhwd m2, m3 ; 23 punpcklwd m1, m3, m4 ; 12 punpckhwd m3, m4 ; 34 .hv_w8_loop: mova m5, [rsp+16*3] mova m6, [rsp+16*4] pmaddwd m4, m0, m5 ; a0 pmaddwd m5, m1 ; b0 mova m0, m2 pmaddwd m2, m6 ; a1 mova m1, m3 pmaddwd m3, m6 ; b1 paddd m4, m2 movu m2, [srcq+ssq*1+0] paddd m5, m3 movu m3, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] HV_H_6TAP m6, m2, m3 movu m2, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+2] HV_H_6TAP m7, m2, m3 mova m2, [rsp+16*7] psrad m3, m1, 16 REPX {paddd x, m2}, m6, m7, m4, m5 psrad m6, 6 psrad m7, 6 packssdw m3, m6 ; 4 5 packssdw m6, m7 ; 5 6 mova m7, [rsp+16*5] punpcklwd m2, m3, m6 ; 45 punpckhwd m3, m6 ; 56 pmaddwd m6, m2, m7 ; a2 pmaddwd m7, m3 ; b2 paddd m4, m6 paddd m5, m7 psrad m4, 6 psrad m5, 6 packssdw m4, m5 movq [tmpq+wq*0], m4 movhps [tmpq+wq*2], m4 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w8_loop mov srcq, srcmp mov tmpq, tmpmp movzx hd, r5w add srcq, 8 add tmpq, 8 sub r5d, 1<<16 %else lea r8d, [wq*4-(1<<5)] lea r8d, [hq+r8*8] .hv_w8_loop0: mova m5, [spel_h_shufA] movu m0, [srcq+r6*2+ 0] mova m6, [rsp+16*0] movu m1, [srcq+r6*2+ 8] mova m7, [rsp+16*1] movu m2, [srcq+r6*2+16] mova m8, [rsp+16*2] HV_H_6TAP m9, m0, m1, m2, 6, m5, m6, m7, m8 movu m0, [srcq+r6*1+ 0] movu m1, [srcq+r6*1+ 8] movu m2, [srcq+r6*1+16] lea r5, [srcq+ssq*2] HV_H_6TAP m11, m0, m1, m2, 6, m5, m6, m7, m8 movu m0, [srcq+ssq*0+ 0] movu m1, [srcq+ssq*0+ 8] movu m2, [srcq+ssq*0+16] mov r7, tmpq HV_H_6TAP m13, m0, m1, m2, 6, m5, m6, m7, m8 movu m0, [srcq+ssq*1+ 0] movu m1, [srcq+ssq*1+ 8] movu m2, [srcq+ssq*1+16] HV_H_6TAP m15, m0, m1, m2, 6, m5, m6, m7, m8 movu m0, [r5+ssq*0+ 0] movu m1, [r5+ssq*0+ 8] movu m2, 
[r5+ssq*0+16] HV_H_6TAP m5, m0, m1, m2, 6, m5, m6, m7, m8 punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m5 ; 34 punpckhwd m15, m5 .hv_w8_loop: mova m3, [rsp+16*3] mova m7, [rsp+16*4] pmaddwd m0, m8, m3 ; a0 mova m8, m12 pmaddwd m2, m9, m3 ; a0' mova m9, m13 pmaddwd m1, m10, m3 ; b0 mova m10, m14 pmaddwd m3, m11 ; b0' mova m11, m15 REPX {pmaddwd x, m7}, m12, m13, m14, m15 movu m6, [r5+ssq*1+ 0] paddd m0, m12 movu m7, [r5+ssq*1+ 8] paddd m2, m13 movu m12, [r5+ssq*1+16] paddd m1, m14 lea r5, [r5+ssq*2] paddd m3, m15 HV_H_6TAP m15, m6, m7, m12, 6 movu m6, [r5+ssq*0+ 0] movu m7, [r5+ssq*0+ 8] movu m14, [r5+ssq*0+16] punpcklwd m12, m5, m15 ; 45 punpckhwd m13, m5, m15 HV_H_6TAP m5, m6, m7, m14, 6 mova m7, [rsp+16*5] REPX {paddd x, m4}, m0, m2, m1, m3 punpcklwd m14, m15, m5 ; 56 punpckhwd m15, m5 pmaddwd m6, m12, m7 ; a2 paddd m0, m6 pmaddwd m6, m13, m7 ; a2' paddd m2, m6 pmaddwd m6, m14, m7 ; b2 pmaddwd m7, m15 ; b2' paddd m1, m6 paddd m3, m7 REPX {psrad x, 6}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 mova [r7+wq*0], m0 mova [r7+wq*2], m1 lea r7, [r7+wq*4] sub hd, 2 jg .hv_w8_loop add srcq, 16 add tmpq, 16 movzx hd, r8b sub r8d, 1<<8 %endif jg .hv_w8_loop0 RET PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc PREP_8TAP_FN sharp, SHARP, SHARP cglobal prep_8tap_16bpc, 0, 8, 0, tmp, src, ss, w, h, mx, my %if ARCH_X86_32 %define mxb r0b %define mxd r0 %define mxq r0 %define myb r2b %define myd r2 %define myq r2 %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 
0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, prep_ssse3 movifnidn wd, wm movifnidn srcq, srcmp test mxd, 0xf00 jnz .h movifnidn hd, hm test myd, 0xf00 jz mangle(private_prefix %+ _prep_6tap_16bpc_ssse3).prep .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] WIN64_SPILL_XMM 15 movddup m7, [base+prep_8tap_1d_rnd] movifnidn ssq, r2mp movifnidn tmpq, r0mp punpcklbw m3, m3 psraw m3, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 .v_12bpc: %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 mov r6d, wd shl wd, 6 mov r5, srcq %if ARCH_X86_64 mov r7, tmpq %elif STACK_ALIGNMENT < 16 mov [esp+4*29], tmpq %endif lea wd, [wq+hq-(1<<8)] .v_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m5, [srcq+ssq*0] movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_loop_start .v_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m7 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m7 paddd m2, m3 psrad m1, 4 psrad m2, 4 packssdw m1, m2 movq [tmpq+r6*0], m1 movhps [tmpq+r6*2], m1 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .v_loop %if STACK_ALIGNMENT < 16 
; prep_8tap_16bpc: column advance of the vertical path (.v), the
; horizontal-only path (.h, .h_w4, .h_w8) and the first part of the 2-D (.hv)
; setup. The span starts inside the x86-32 "%if STACK_ALIGNMENT < 16" branch
; that follows the x86-32 .v_loop.
    ; x86-32: move to the next 4-pixel (8-byte) column strip. r5 holds the
    ; source base of the strip; the tmp base is kept in memory.
    mov tmpq, [esp+4*29]
    add r5, 8
    add tmpq, 8
    mov srcq, r5
    mov [esp+4*29], tmpq
%else
    mov tmpq, tmpmp
    add r5, 8
    add tmpq, 8
    mov srcq, r5
    mov tmpmp, tmpq
%endif
%else
.v_loop:
    ; x86-64: two output rows ("a" and "b") per iteration. m1-m6 hold the
    ; interleaved row pairs and slide down by two rows each time.
    pmaddwd m12, m8, m1 ; a0
    pmaddwd m13, m8, m2 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m9 ; a1
    pmaddwd m4, m9 ; b1
    paddd m12, m3
    paddd m13, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m10 ; a2
    pmaddwd m6, m10 ; b2
    paddd m12, m5
    paddd m13, m6
    movq m6, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklwd m5, m0, m6 ; 67
    movq m0, [srcq+ssq*0]
    pmaddwd m14, m11, m5 ; a3
    punpcklwd m6, m0 ; 78
    paddd m12, m7 ; m7 = rounding term (prep_8tap_1d_rnd)
    paddd m12, m14
    pmaddwd m14, m11, m6 ; b3
    paddd m13, m7
    paddd m13, m14
    psrad m12, 4
    psrad m13, 4
    packssdw m12, m13
    ; r6 = w, so r6*2 bytes is one output row of 16-bit values.
    movq [tmpq+r6*0], m12
    movhps [tmpq+r6*2], m12
    lea tmpq, [tmpq+r6*4]
    sub hd, 2
    jg .v_loop
    ; Next 4-pixel column strip; r5/r7 hold the strip bases.
    add r5, 8
    add r7, 8
    mov srcq, r5
    mov tmpq, r7
%endif
    ; Reload h from the low byte of wd and decrement the strip counter held in
    ; the bits above it.
    movzx hd, wb
    sub wd, 1<<8
    jg .v_loop0
    RET
.h:
    RESET_STACK_STATE
    test myd, 0xf00
    jnz .hv
    movifnidn ssq, r2mp
    movifnidn hd, r4m
    movddup m5, [base+prep_8tap_1d_rnd] ; rounding term added before ">> 4"
    cmp wd, 4
    jne .h_w8
.h_w4:
    movzx mxd, mxb ; w == 4 uses the low-byte filter index
    movq m0, [base+subpel_filters+mxq*8]
    mova m3, [base+spel_h_shufA]
    mova m4, [base+spel_h_shufB]
    movifnidn tmpq, tmpmp
    sub srcq, 2 ; one pixel to the left
    WIN64_SPILL_XMM 8
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend taps to words
    ; When bit 11 of r7m is clear the taps are multiplied by 4; the label name
    ; suggests the bit-set case is 12 bpc (r7m presumably bitdepth_max).
    test dword r7m, 0x800
    jnz .h_w4_12bpc
    psllw m0, 2
.h_w4_12bpc:
    ; Only dwords 1 and 2 of the tap vector (taps 2-5) are used here.
    pshufd m6, m0, q1111
    pshufd m7, m0, q2222
.h_w4_loop:
    ; Two rows of four output pixels per iteration.
    movu m1, [srcq+ssq*0]
    movu m2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4
    pshufb m1, m4 ; 2 3 3 4 4 5 5 6
    pmaddwd m0, m6
    pmaddwd m1, m7
    paddd m0, m5
    paddd m0, m1
    pshufb m1, m2, m3
    pshufb m2, m4
    pmaddwd m1, m6
    pmaddwd m2, m7
    paddd m1, m5
    paddd m1, m2
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    mova [tmpq], m0
    add tmpq, 16
    sub hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    WIN64_SPILL_XMM 11
    shr mxd, 16 ; w > 4 uses the high-field filter index
    movq m2, [base+subpel_filters+mxq*8]
    mova m4, [base+spel_h_shufA]
    mova m6, [base+spel_h_shufB]
    movifnidn tmpq, r0mp
    ; wq becomes the row width in bytes; srcq/tmpq are moved to the end of the
    ; row and wq is negated so the inner loop can count r6 up towards zero.
    add wd, wd
    punpcklbw m2, m2
    add srcq, wq
    psraw m2, 8
    add tmpq, wq
    neg wq
    test dword r7m, 0x800
    jnz .h_w8_12bpc
    psllw m2, 2
.h_w8_12bpc:
    ; Tap pairs 0-3 go to m7, m8, m9, m10.
    pshufd m7, m2, q0000
%if ARCH_X86_32
    ALLOC_STACK -16*3
    pshufd m0, m2, q1111
    pshufd m1, m2, q2222
    pshufd m2, m2, q3333
    mova m8, m0
    mova m9, m1
    mova m10, m2
%else
    pshufd m8, m2, q1111
    pshufd m9, m2, q2222
    pshufd m10, m2, q3333
%endif
.h_w8_loop0:
    mov r6, wq
.h_w8_loop:
    ; Eight output pixels per iteration: "abcd" are the first four, "efgh" the
    ; last four; the digit is the tap-pair index.
    movu m0, [srcq+r6- 6]
    movu m1, [srcq+r6+ 2]
    pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4
    pshufb m0, m6 ; 2 3 3 4 4 5 5 6
    pmaddwd m2, m7 ; abcd0
    pmaddwd m0, m8 ; abcd1
    pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8
    pshufb m1, m6 ; 6 7 7 8 8 9 9 a
    paddd m2, m5
    paddd m0, m2
    pmaddwd m2, m9, m3 ; abcd2
    pmaddwd m3, m7 ; efgh0
    paddd m0, m2
    pmaddwd m2, m10, m1 ; abcd3
    pmaddwd m1, m8 ; efgh1
    paddd m0, m2
    movu m2, [srcq+r6+10]
    paddd m3, m5
    paddd m1, m3
    pshufb m3, m2, m4 ; 8 9 9 a a b b c
    pshufb m2, m6 ; a b b c c d d e
    pmaddwd m3, m9 ; efgh2
    pmaddwd m2, m10 ; efgh3
    paddd m1, m3
    paddd m1, m2
    psrad m0, 4
    psrad m1, 4
    packssdw m0, m1
    mova [tmpq+r6], m0
    add r6, 16
    jl .h_w8_loop
    add srcq, ssq
    sub tmpq, wq ; wq is negative, so this advances tmpq by one output row
    dec hd
    jg .h_w8_loop0
    RET
.hv:
    RESET_STACK_STATE
    ; Horizontal filter index: high field, or the low byte when w == 4.
    movzx t3d, mxb
    shr mxd, 16
    cmp wd, 4
    cmove mxd, t3d
    movifnidn hd, r4m
    movq m2, [base+subpel_filters+mxq*8]
    ; Vertical filter index: high field, or the low byte when h == 4.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    movq m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    mov ssq, r2mp
    mov tmpq, r0mp
    mova m0, [base+spel_h_shufA]
    mova m1, [base+spel_h_shufB]
    mova m4, [base+prep_8tap_2d_rnd]
    ALLOC_STACK -16*14
    mova m8, m0
    mova m9, m1
    mova m14, m4
%else
%if WIN64
    ALLOC_STACK 16*6, 16
%endif
    mova m8, [base+spel_h_shufA]
    mova m9, [base+spel_h_shufB]
%endif
    pxor m0, m0
    punpcklbw m0, m2 ; horizontal taps in the high byte of each word (tap << 8)
    punpcklbw m3, m3
    psraw m0, 4 ; tap << 4
    psraw m3, 8 ; vertical taps sign-extended to words
    ; With bit 11 of r7m set the horizontal taps are shifted right by 2 more.
    test dword r7m, 0x800
    jz .hv_10bpc
    psraw m0, 2
.hv_10bpc:
    ; Step back three pixels (6 bytes) and three rows.
    lea r6, [ssq*3]
    sub srcq, 6
    sub srcq, r6
    mov r6d, wd ; r6 = w, used to step through the output rows
    shl wd, 6
    mov r5, srcq ; source base of the current column strip
%if ARCH_X86_32
%define tmp esp+16*8
%if STACK_ALIGNMENT < 16
    mov [esp+4*61], tmpq
%endif
    pshufd m1, m0, q0000
    pshufd m2, m0, q1111
    pshufd m5, m0, q2222
    pshufd m0, m0, q3333
    mova m10, m1
    mova m11, m2
    mova m12, m5
    mova m13, m0
%else
%if WIN64
%define tmp rsp
%else
%define tmp rsp-88 ; red zone
%endif
    mov r7, tmpq
pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif lea wd, [wq+hq-(1<<8)] pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [tmp+16*1], m0 mova [tmp+16*2], m1 mova [tmp+16*3], m2 mova [tmp+16*4], m3 .hv_loop0: %if ARCH_X86_64 mova m14, [prep_8tap_2d_rnd] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m6, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 4, 1, 0, 6 PUT_8TAP_HV_H 5, 2, 0, 6 PUT_8TAP_HV_H 6, 3, 0, 6 movu m7, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m1, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 7, 2, 0, 6 PUT_8TAP_HV_H 1, 3, 0, 6 movu m2, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 2, 3, 0, 6 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 6 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_loop_start .hv_loop: mova m1, [tmp+16*5] mova m2, m15 .hv_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*5], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m14 paddd m2, m14 paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 6 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 6 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 6 psrad m2, 6 packssdw m1, m2 
; prep_8tap_16bpc: end of the 2-D (.hv) loops, followed by the register
; bookkeeping macros used by the scaled-MC code (movifprep, SAVE_REG, LOAD_REG,
; REMAP_REG) and the opening of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV. The span
; starts at the stores that end the x86-32 .hv_loop.
    ; r6 = w, so r6*2 bytes is one output row of 16-bit values.
    movq [tmpq+r6*0], m1
    movhps [tmpq+r6*2], m1
    lea tmpq, [tmpq+r6*4]
    sub hd, 2
    jg .hv_loop
    ; x86-32: move to the next 4-pixel (8-byte) column strip. r5 holds the
    ; source base of the strip; the tmp base is kept in memory.
%if STACK_ALIGNMENT < 16
    mov tmpq, [esp+4*61]
    add r5, 8
    add tmpq, 8
    mov srcq, r5
    mov [esp+4*61], tmpq
%else
    mov tmpq, tmpmp
    add r5, 8
    add tmpq, 8
    mov srcq, r5
    mov tmpmp, tmpq
%endif
%else
.hv_loop:
    ; x86-64: two output rows ("a" in m14, "b" in m15) per iteration. The
    ; vertical tap pairs are read from tmp slots 1-4, and the rounding term is
    ; added to both accumulators up front.
    mova m15, [tmp+16*1]
    mova m7, [prep_8tap_2d_rnd]
    pmaddwd m14, m15, m1 ; a0
    pmaddwd m15, m2 ; b0
    paddd m14, m7
    paddd m15, m7
    mova m7, [tmp+16*2]
    mova m1, m3
    pmaddwd m3, m7 ; a1
    mova m2, m4
    pmaddwd m4, m7 ; b1
    mova m7, [tmp+16*3]
    paddd m14, m3
    paddd m15, m4
    mova m3, m5
    pmaddwd m5, m7 ; a2
    mova m4, m6
    pmaddwd m6, m7 ; b2
    paddd m14, m5
    paddd m15, m6
    ; Horizontally filter the two new rows. m14 is in use as an accumulator, so
    ; the rounding operand is passed to the macro as a memory reference.
    movu m7, [srcq+ssq*1+0]
    movu m5, [srcq+ssq*1+8]
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd]
    packssdw m0, m7 ; 6 7
    mova [tmp+16*0], m0 ; tmp slot 0 is scratch for the packed rows
    movu m0, [srcq+ssq*0+0]
    movu m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd]
    mova m6, [tmp+16*0]
    packssdw m7, m0 ; 7 8
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7 ; 78
    pmaddwd m7, m5, [tmp+16*4]
    paddd m14, m7 ; a3
    pmaddwd m7, m6, [tmp+16*4]
    paddd m15, m7 ; b3
    psrad m14, 6
    psrad m15, 6
    packssdw m14, m15
    movq [tmpq+r6*0], m14
    movhps [tmpq+r6*2], m14
    lea tmpq, [tmpq+r6*4]
    sub hd, 2
    jg .hv_loop
    ; Next 4-pixel column strip; r5/r7 hold the strip bases.
    add r5, 8
    add r7, 8
    mov srcq, r5
    mov tmpq, r7
%endif
    ; Reload h from the low byte of wd and decrement the strip counter held in
    ; the bits above it.
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_loop0
    RET
%undef tmp

; movifprep dst, src: emits "mov dst, src" only when isprep is non-zero.
%macro movifprep 2
%if isprep
    mov %1, %2
%endif
%endmacro

; SAVE_REG n: records the current definitions of r<n>, r<n>q and r<n>d under
; *_save names (%xdefine expands them at this point). On x86-32 it also
; defines r<n>m_save as the stack location [rstk+stack_offset+(n+1)*4].
%macro SAVE_REG 1
%xdefine r%1_save r%1
%xdefine r%1q_save r%1q
%xdefine r%1d_save r%1d
%if ARCH_X86_32
%define r%1m_save [rstk+stack_offset+(%1+1)*4]
%endif
%endmacro

; LOAD_REG n: restores r<n>, r<n>q, r<n>d (and r<n>m on x86-32) from the names
; recorded by SAVE_REG, then undefines the three register *_save names.
%macro LOAD_REG 1
%xdefine r%1 r%1_save
%xdefine r%1q r%1q_save
%xdefine r%1d r%1d_save
%if ARCH_X86_32
%define r%1m r%1m_save
%endif
%undef r%1d_save
%undef r%1q_save
%undef r%1_save
%endmacro

; REMAP_REG a, b[, c]: makes r<a>, r<a>q and r<a>d aliases for the current
; r<b>, r<b>q and r<b>d. The third argument is only read on x86-32: when it is
; 0, r<a>m becomes an alias for r<b>m, otherwise r<a>m is set to
; [rstk+stack_offset+(a+1)*4].
%macro REMAP_REG 2-3
%xdefine r%1 r%2
%xdefine r%1q r%2q
%xdefine r%1d r%2d
%if ARCH_X86_32
%if %3 == 0
%xdefine r%1m r%2m
%else
%define r%1m [rstk+stack_offset+(%1+1)*4]
%endif
%endif
%endmacro

; Start of MCT_8TAP_SCALED_REMAP_REGS_TO_PREV; its body continues after this
; span.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
%if isprep
%if ARCH_X86_64
SAVE_REG 14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %else SAVE_REG 5 %assign %%i 5 %rep 5 %assign %%j %%i-1 REMAP_REG %%i, %%j, 0 %assign %%i %%i-1 %endrep %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %if ARCH_X86_64 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep LOAD_REG 14 %else %rep 4 %assign %%j %%i+1 REMAP_REG %%i, %%j, 1 %assign %%i %%i+1 %endrep LOAD_REG 5 %endif %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %if ARCH_X86_32 %macro MC_4TAP_SCALED_H 1 ; dst_mem movu m7, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m5, [r4 +ssq*0] movu m6, [r4 +ssq*1] lea srcq, [srcq+ssq*2] lea r4, [r4 +ssq*2] REPX {pshufb x, m12}, m7, m2 REPX {pmaddwd x, m13}, m7, m2 REPX {pshufb x, m14}, m5, m6 REPX {pmaddwd x, m15}, m5, m6 phaddd m7, m5 phaddd m2, m6 mova m5, [esp+0x00] movd m6, [esp+0x10] paddd m7, m5 paddd m2, m5 psrad m7, m6 psrad m2, m6 packssdw m7, m2 mova [stk+%1], m7 %endmacro %endif %if ARCH_X86_64 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movu m%1, [srcq+ r4*2] movu m%2, [srcq+ r6*2] movu m%3, [srcq+ r7*2] movu m%4, [srcq+ r9*2] movu m%5, [srcq+r10*2] movu m%6, [srcq+r11*2] movu m%7, [srcq+r13*2] movu m%8, [srcq+ rX*2] add srcq, ssq pmaddwd m%1, [stk+0x10] pmaddwd m%2, [stk+0x20] pmaddwd m%3, [stk+0x30] pmaddwd m%4, [stk+0x40] pmaddwd m%5, [stk+0x50] pmaddwd m%6, [stk+0x60] pmaddwd m%7, [stk+0x70] pmaddwd m%8, [stk+0x80] phaddd m%1, m%2 phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, hround paddd m%5, hround psrad m%1, m12 psrad m%5, m12 packssdw m%1, m%5 %endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets %if %3 == 1 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] %endif movu m0, [srcq+r0*2] movu m1, [srcq+rX*2] movu m2, 
[srcq+r4*2] movu m3, [srcq+r5*2] mov r0, [stk+16] mov rX, [stk+20] mov r4, [stk+24] mov r5, [stk+28] pmaddwd m0, [stk+%1+0x00] pmaddwd m1, [stk+%1+0x10] pmaddwd m2, [stk+%1+0x20] pmaddwd m3, [stk+%1+0x30] phaddd m0, m1 phaddd m2, m3 movu m4, [srcq+r0*2] movu m5, [srcq+rX*2] movu m6, [srcq+r4*2] movu m7, [srcq+r5*2] add srcq, ssq pmaddwd m4, [stk+%1+0xa0] pmaddwd m5, [stk+%1+0xb0] pmaddwd m6, [stk+%1+0xc0] pmaddwd m7, [stk+%1+0xd0] phaddd m4, m5 phaddd m6, m7 phaddd m0, m2 phaddd m4, m6 paddd m0, hround paddd m4, hround psrad m0, m12 psrad m4, m12 packssdw m0, m4 %if %2 != 0 mova [stk+%2], m0 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %endif %xdefine base_reg r12 %else ; prep %assign isput 0 %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [stk+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %endif %define tmp_stridem dword [stk+0x138] %endif %endif %if ARCH_X86_32 mov 
[esp+0x1f0], t0d mov [esp+0x1f4], t1d %if isput && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x21c] %define dym [esp+0x220] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isput mov r3, pxmaxm %define pxmaxm r3 %else mov r2, pxmaxm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif %endif LEA base_reg, %1_8tap_scaled_16bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_64 %if isput mov r7d, pxmaxm %endif %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm %if isput movd m15, pxmaxm %endif pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isput pshuflw m15, m15, q0000 punpcklqdq m15, m15 %endif %if isprep %if UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if ARCH_X86_64 mov r6d, pxmaxm %endif %endif %if ARCH_X86_64 mov dyd, dym %endif %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %else %endif %if ARCH_X86_64 %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, 
_, my, dx, dy, ss3 %xdefine hm r7m %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 shr r7d, 11 mova m10, [base+pd_0x3ff] movddup m11, [base+s_8tap_h_rnd+r7*8] movd m12, [base+s_8tap_h_sh+r7*4] %if isput movddup m13, [base+put_s_8tap_v_rnd+r7*8] movd m7, [base+put_s_8tap_v_sh+r7*4] %define pxmaxm [rsp] mova pxmaxm, m15 punpcklqdq m12, m7 %endif lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else %define m10 [base+pd_0x3ff] %define m11 [esp+0x00] %define m12 [esp+0x10] shr r3, 11 movddup m1, [base+s_8tap_h_rnd+r3*8] movd m2, [base+s_8tap_h_sh+r3*4] %if isput %define m13 [esp+0x20] %define pxmaxm [esp+0x30] %define stk esp+0x40 movddup m5, [base+put_s_8tap_v_rnd+r3*8] movd m6, [base+put_s_8tap_v_sh+r3*4] mova pxmaxm, m15 punpcklqdq m2, m6 mova m13, m5 %else %define m13 [base+pd_m524256] %endif mov ssd, ssm mova m11, m1 mova m12, m2 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssd*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 %if isprep mov r1, r1m %endif mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %if isput .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, 
[base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6, m7 REPX {pmaddwd x, m15}, m4, m5, m6, m7 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m7 REPX {paddd x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 7 SWAP m1, m4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m14}, m1, m7, m6, m3 REPX {pmaddwd x, m15}, m1, m7, m6, m3 phaddd m1, m7 phaddd m6, m3 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pmaddwd m5, m3, m7 
pmaddwd m6, m0, m8 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m7, m2, m9 pmaddwd m8, m4, m10 paddd m5, m6 paddd m7, m8 %else mov r1, [esp+0x1f4] xor r3, r3 mov r5, myd shr r5, 6 lea r1, [r1+r5] mov r5, 64 << 24 cmovnz r3, [base+subpel_filters+r1*8+4] cmovnz r5, [base+subpel_filters+r1*8+0] movd m6, r3 movd m7, r5 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 SWAP m7, m2 %define m8 m3 %endif paddd m5, m13 pshufd m6, m12, q1032 pxor m8, m8 paddd m5, m7 psrad m5, m6 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, pxmaxm movd [dstq], m5 add dstq, dsmp dec hd jz .ret %if ARCH_X86_64 add myd, dyd %else add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [stk+0x20] mova m0, [stk+0x30] mova m2, [stk+0x40] mova m4, [stk+0x50] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movu m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddwd m5, m15 phaddd m5, m5 paddd m5, m11 psrad m5, m12 packssdw m5, m5 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop .w2_skip_line: movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pshufb m6, m14 pmaddwd m5, m15 pmaddwd m6, m15 phaddd m5, m6 paddd m5, m11 psrad m5, m12 packssdw m5, m5 ; 6 7 6 7 punpckhqdq m1, m5 ; 4 5 6 7 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 
%if isput mova [rsp+0x30], m13 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %else %define m9 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x20], m13 mova [stk+0x30], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, 
[srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX {pshufb x, m14}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m4, [srcq+ss3q ] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] movu m11, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m2, m3, m4 REPX {pmaddwd x, m13}, m1, m2, m3, m4 REPX {pshufb x, m14}, m0, m9, m10, m11 REPX {pmaddwd x, m15}, m0, m9, m10, m11 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 phaddd m4, m11 REPX {paddd x, m5}, m1, m2, m3, m4 REPX {psrad x, xm6}, m1, m2, m3, m4 packssdw m1, m2 ; 4 5 packssdw m3, m4 ; 6 7 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 pshufd m10, m3, q1032 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m10 ; 67 mova [rsp+0x40], m7 mova [rsp+0x50], m8 mova [rsp+0x60], m9 %else mova [stk+0x00], m12 mova [stk+0x10], m14 add r4, srcq MC_4TAP_SCALED_H 0x40 ; 0 1 MC_4TAP_SCALED_H 0x50 ; 2 3 MC_4TAP_SCALED_H 0x60 ; 4 5 MC_4TAP_SCALED_H 0x70 ; 6 7 mova m4, [stk+0x40] mova m5, [stk+0x50] mova m6, [stk+0x60] mova m7, [stk+0x70] mov [stk+0xc0], r4 shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 pshufd m0, m7, q1032 ; 7 _ mova [stk+0xb0], m0 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 punpcklwd m3, m7, [stk+0xb0] ; 67 mov myd, mym mov r0, r0m mova [stk+0x40], m0 ; 01 mova [stk+0x50], m1 ; 23 mova [stk+0x60], m2 ; 45 mova [stk+0x70], m3 ; 67 mova [stk+0x80], m4 ; 
12 mova [stk+0x90], m5 ; 34 mova [stk+0xa0], m6 ; 56 %define m12 [stk+0x00] %define m14 [stk+0x10] %define m13 [stk+0x20] %define m15 [stk+0x30] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq m9, r11q punpcklbw m9, m9 psraw m9, 8 pshufd m7, m9, q0000 pshufd m8, m9, q1111 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufd m7, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m7 pmaddwd m8, m3, m9 %if isput movd m9, [rsp+0x28] %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif paddd m4, m5 paddd m6, m8 paddd m4, m6 paddd m4, vrnd_mem %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 %if isput movd m4, [esp+0x18] %endif paddd m0, m1 paddd m2, m3 paddd m0, vrnd_mem paddd m0, m2 SWAP m4, m0 %define m9 m0 %endif %if isput pxor m5, m5 psrad m4, m9 packssdw m4, m4 pmaxsw m4, m5 pminsw m4, pxmaxm movq [dstq], m4 add dstq, dsmp %else psrad m4, 6 packssdw m4, m4 movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop mova m8, [rsp+0x10] movd m9, [rsp+0x20] movu m4, [srcq] movu m5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova m0, [rsp+0x40] mova [rsp+0x40], m1 mova m1, [rsp+0x50] mova [rsp+0x50], m2 mova m2, [rsp+0x60] mova [rsp+0x60], m3 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, m8 psrad m4, m9 packssdw m4, m4 punpcklwd m3, m10, m4 mova m10, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: 
movu m6, [srcq+ssq*1] movu m7, [srcq+r6] mova m0, [rsp+0x50] mova m11, [rsp+0x60] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd m7, m15 mova [rsp+0x40], m0 mova [rsp+0x50], m11 phaddd m4, m5 phaddd m6, m7 paddd m4, m8 paddd m6, m8 psrad m4, m9 psrad m6, m9 packssdw m4, m6 punpcklwd m9, m10, m4 mova [rsp+0x60], m9 pshufd m10, m4, q1032 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m10 lea srcq, [srcq+ssq*2] jmp .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [stk+0x40] mova m1, [stk+0x50] mova m2, [stk+0x60] mova m3, [stk+0x70] jmp .w4_loop .w4_next_line: mov r5, [stk+0xc0] movu m4, [srcq] movu m5, [r5] test myd, 0x400 jz .w4_skip_line add [stk+0xc0], ssq mova m0, [stk+0x80] mova m3, [stk+0x50] mova [stk+0x40], m0 mova [stk+0x80], m3 mova m1, [stk+0x90] mova m6, [stk+0x60] mova [stk+0x50], m1 mova [stk+0x90], m6 mova m2, [stk+0xa0] mova m7, [stk+0x70] mova [stk+0x60], m2 mova [stk+0xa0], m7 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, hrnd_mem psrad m4, hsh_mem packssdw m4, m4 punpcklwd m3, [stk+0xb0], m4 mova [stk+0xb0], m4 mova [stk+0x70], m3 add srcq, ssq jmp .w4_loop .w4_skip_line: movu m6, [srcq+ssq*1] movu m7, [r5 +ssq*1] lea r5, [r5 +ssq*2] mov [stk+0xc0], r5 mova m0, [stk+0x50] mova m1, [stk+0x60] mova m2, [stk+0x70] mova m3, [stk+0x90] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd m7, m15 mova [stk+0x40], m0 mova [stk+0x50], m1 mova [stk+0x60], m2 mova [stk+0x80], m3 phaddd m4, m5 phaddd m6, m7 mova m5, [stk+0xa0] mova m7, [stk+0xb0] paddd m4, hrnd_mem paddd m6, hrnd_mem psrad m4, hsh_mem psrad m6, hsh_mem packssdw m4, m6 punpcklwd m7, m4 pshufd m6, m4, q1032 mova [stk+0x90], m5 mova [stk+0xa0], m7 mova [stk+0xb0], m6 punpcklwd m3, m4, m6 mova [stk+0x70], m3 lea srcq, [srcq+ssq*2] jmp .w4_loop %endif INIT_XMM 
ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .w_start: %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 %define hround m11 shr t0d, 16 movd m15, t0d %if isprep mova m13, [base+pd_m524256] %endif %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [stk+0x0f4], myd mov [stk+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov myd, [stk+0x0f4] mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, 
[base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 
2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m10 pmaddwd m7, [stk+0xa0], m10 pmaddwd m8, [stk+0xb0], m11 pmaddwd m9, [stk+0xc0], m11 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, 
m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep 
%if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [stk+0x140], myd mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] jz .skip_line mova m14, [base+unpckw] movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m10, [srcq+r13*2] movu m11, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq mov myd, [stk+0x140] mov dyd, dym pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m10, [stk+0x70] pmaddwd m11, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m10, m11 mova m11, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m10 phaddd m4, m6 paddd m4, m11 paddd m8, m11 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m5, [stk+0x90], m14 ; 4a 5a pshufb m6, [stk+0xa0], m14 ; 4b 5b pshufb m7, [stk+0xb0], m15 ; 7a 6a pshufb m8, [stk+0xc0], m15 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m8 jmp .vloop .skip_line: MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 mov myd, [stk+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [stk+0x20] mova m1, [stk+0x30] mova m2, [stk+0x40] mova m3, [stk+0x50] jmp .vloop .next_line: test myd, 0x400 mov r0, [stk+ 0] mov rX, [stk+ 4] mov 
r4, [stk+ 8] mov r5, [stk+12] jz .skip_line MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 punpcklwd m5, m6 mov myd, mym mova [stk+0x90], m5 jmp .vloop .skip_line: MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mov myd, mym mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova [stk+0x20], m0 mova [stk+0x30], m1 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define m13 [esp+0x20] movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 mov r1, r1m %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] 
movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 SWAP m1, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q REPX {pshufb x, m14}, m1, m7, m6 REPX {pmaddwd x, m15}, m1, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m1, m7 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m8 m6 %define m9 m4 %define m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if 
ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m4, m1, q2121 ; 5 6 5 6 punpcklwd m2, m1, m4 ; 45 56 %if ARCH_X86_32 mov r0, r0m %endif .dy1_w2_loop: movu m1, [srcq+ssq*0] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m7 mova m3, m0 pmaddwd m0, m8 pshufb m1, m14 pshufb m6, m14 pmaddwd m1, m15 pmaddwd m6, m15 phaddd m1, m6 paddd m1, m11 psrad m1, m12 packssdw m1, m1 paddd m5, m0 mova m0, m2 pmaddwd m2, m9 paddd m5, m2 palignr m2, m1, m4, 12 punpcklwd m2, m1 ; 67 78 pmaddwd m4, m2, m10 paddd m5, m13 paddd m5, m4 pxor m6, m6 mova m4, m1 pshufd m1, m12, q1032 psrad m5, m1 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, 
[base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 %if isprep mov r3, r3m %endif %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX {pshufb x, m14}, m1, m3, m2, m4 REPX {pmaddwd x, m15}, m1, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] add srcq, ss3q REPX {pshufb x, m12}, m1, m2, m3 REPX {pmaddwd x, m13}, m1, m2, m3 REPX {pshufb x, m14}, m0, m9, m10 REPX {pmaddwd x, m15}, m0, m9, m10 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 shr myd, 6 
mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m1, m2, m3 REPX {psrad x, xm6}, m1, m2, m3 packssdw m1, m2 ; 4 5 packssdw m3, m3 ; 6 6 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 movq m10, r13 mova [stk+0x00], m1 mova [stk+0x10], m8 mova [stk+0x20], m2 mova [stk+0x30], m9 mova [stk+0x40], m3 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova [stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 movu m7, [srcq] movu m2, [r4] add srcq, ssq add r4, ssq mov [stk+0xb0], r4 pshufb m7, m12 pshufb m2, m14 pmaddwd m7, m13 pmaddwd m2, m15 phaddd m7, m2 paddd m7, [esp+0x00] psrad m7, [esp+0x10] packssdw m7, m7 ; 6 6 mova m4, [stk+0x60] mova m5, [stk+0x70] mova m6, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 mova [stk+0xa0], m7 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 movd m7, r4 movd m3, r5 mov r0, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xb0] mova [stk+0xc0], m4 ; 12 mova [stk+0x60], m1 ; 23 mova [stk+0x70], m2 ; 45 mova [stk+0x80], m5 ; 34 mova [stk+0x90], m6 ; 56 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] 
%else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m3 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] mova m7, [stk+0xc0] mova m8, [stk+0x80] %endif .dy1_w4_loop: movu m11, [srcq+ssq*0] movu m6, [srcq+ssq*1] pmaddwd m0, m3 pmaddwd m7, m3 pmaddwd m1, m4 pmaddwd m8, m4 pmaddwd m2, m5 pmaddwd m9, m5 paddd m1, m0 paddd m8, m7 %if ARCH_X86_64 movu m0, [srcq+r4] movu m7, [srcq+r6] %else movu m0, [r4+ssq*0] movu m7, [r4+ssq*1] lea r4, [r4+ssq*2] %endif lea srcq, [srcq+ssq*2] paddd m1, m2 paddd m8, m9 pshufb m11, m12 pshufb m6, m12 pmaddwd m11, m13 pmaddwd m6, m13 pshufb m0, m14 pshufb m7, m14 pmaddwd m0, m15 pmaddwd m7, m15 phaddd m11, m0 phaddd m6, m7 paddd m11, hrnd_mem paddd m6, hrnd_mem psrad m11, hsh_mem psrad m6, hsh_mem packssdw m11, m6 ; 7 8 %if ARCH_X86_64 shufps m9, [stk+0x40], m11, q1032 ; 6 7 mova m0, [stk+0x00] mova [stk+0x40], m11 %else shufps m9, [stk+0xa0], m11, q1032 ; 6 7 mova m0, [stk+0x60] mova [stk+0xa0], m11 %endif punpcklwd m2, m9, m11 ; 67 punpckhwd m9, m11 ; 78 pmaddwd m6, m2, m10 pmaddwd m7, m9, m10 %if isput movd m11, vsh_mem %endif paddd m1, vrnd_mem paddd m8, vrnd_mem paddd m1, m6 paddd m8, m7 %if ARCH_X86_64 mova m7, [stk+0x10] %else mova m7, [stk+0x80] %endif %if isput psrad m1, m11 psrad m8, m11 %else psrad m1, 6 psrad m8, 6 %endif packssdw m1, m8 %if ARCH_X86_64 mova m8, [stk+0x30] %else mova m8, [stk+0x90] %endif %if isput pxor m6, m6 pmaxsw m1, m6 pminsw m1, pxmaxm movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m1 add tmpq, 16 %endif %if ARCH_X86_64 mova m1, [stk+0x20] mova [stk+0x10], m8 mova [stk+0x00], m1 mova [stk+0x20], m2 mova 
[stk+0x30], m9 %else mova m1, [stk+0x70] mova [stk+0x80], m8 mova [stk+0x60], m1 mova [stk+0x70], m2 mova [stk+0x90], m9 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy1_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 
%endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 
pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, [stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy1_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, 
[stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .dy1_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd 
m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m12, [srcq+r13*2] movu m13, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m12, [stk+0x70] pmaddwd m13, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m12, m13 mova m9, [base+unpckw] mova m13, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m12 phaddd m4, m6 pshufd m5, m9, q1032 pshufb m0, m9 ; 0a 1a pshufb m1, m9 ; 0b 1b pshufb m2, m5 ; 3a 2a pshufb m3, m5 ; 3b 2b mova m12, shift paddd m4, m13 paddd m8, m13 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m6, [stk+0x90], m9 ; 4a 5a pshufb m7, [stk+0xa0], m9 ; 4b 5b pshufb m8, [stk+0xb0], m5 ; 7a 6a pshufb m13, [stk+0xc0], m5 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m7 ; 34b punpckhwd m6, m8 ; 56a punpckhwd m7, m13 ; 56b punpcklwd m8, m4 ; 78a punpckhqdq m4, m4 punpcklwd m13, m4 ; 78b mova [stk+0x90], m6 mova [stk+0xa0], m7 mova [stk+0xb0], m8 mova [stk+0xc0], m13 mova m13, vround %else mov r0m, r0 mov r3, r3m mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 
5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a mova m4, [stk+0x180] punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 mova m7, [stk+0x1b0] punpcklwd m5, m6 mova m6, [stk+0x1a0] mova [stk+0x90], m5 mova m5, [stk+0x190] mov r0, r0m %endif jmp .dy1_vloop INIT_XMM ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m13 %define vrnd_mem [rsp+0x10] movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define vrnd_mem [esp+0x20] mov r1, r1m movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*2] movu m2, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*1] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea 
srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2 REPX {pmaddwd x, m15}, m0, m1, m2 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m1, m2 phaddd m4, m5 phaddd m5, m6 REPX {paddd x, m11}, m0, m1, m4, m5 REPX {psrad x, m12}, m0, m1, m4, m5 packssdw m0, m1 ; 0 2 2 4 packssdw m4, m5 ; 1 3 3 5 SWAP m2, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m1, m2 movu m2, [srcq+ssq*1] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] REPX {pshufb x, m14}, m2, m7, m6 REPX {pmaddwd x, m15}, m2, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m2, m7 phaddd m7, m6 REPX {paddd x, m11}, m0, m1, m2, m7 REPX {psrad x, m12}, m0, m1, m2, m7 packssdw m0, m1 packssdw m2, m7 %define m8 m6 %define m9 m4 %define m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %xdefine m13 m7 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif punpcklwd m1, m0, m2 ; 01 23 punpckhwd m3, m0, m2 ; 23 45 %if ARCH_X86_32 mov r4, r0m %define dstq r4 mova [stk+0x20], m3 mova [stk+0x30], m0 %endif .dy2_w2_loop: movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m13, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m3, m8 REPX {pshufb x, m14}, m4, m5, m6, m13 REPX {pmaddwd x, m15}, m4, m5, m6, m13 phaddd m4, m5 
phaddd m6, m13 pmaddwd m5, m1, m7 paddd m4, m11 paddd m6, m11 psrad m4, m12 psrad m6, m12 packssdw m4, m6 ; 6 7 8 9 paddd m5, m3 pshufd m3, m4, q2200 pshufd m4, m4, q3311 palignr m3, m0, 12 ; 4 6 6 8 palignr m4, m2, 12 ; 5 7 7 9 mova m0, m3 mova m2, m4 punpcklwd m1, m3, m4 punpckhwd m3, m4 pmaddwd m6, m1, m9 pmaddwd m4, m3, m10 paddd m5, vrnd_mem paddd m6, m4 paddd m5, m6 pshufd m4, m12, q1032 pxor m6, m6 psrad m5, m4 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r1, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r3, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %define m10 m5 
%define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m1, [srcq+ssq*0] movu m8, [srcq+ssq*2] movu m9, [srcq+ssq*1] movu m10, [srcq+ss3q ] movu m7, [srcq+r4 ] movu m2, [srcq+r11 ] movu m3, [srcq+r6 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m9, m8, m10 REPX {pmaddwd x, m13}, m1, m9, m8, m10 REPX {pshufb x, m14}, m7, m3, m2, m4 REPX {pmaddwd x, m15}, m7, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m1, m7 phaddd m8, m2 phaddd m9, m3 phaddd m10, m4 movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] REPX {paddd x, m5}, m1, m9, m8, m10 REPX {psrad x, xm6}, m1, m9, m8, m10 packssdw m1, m8 ; 0 2 packssdw m9, m10 ; 1 3 movu m0, [srcq+r4 ] movu m8, [srcq+r6 ] lea srcq, [srcq+ssq*2] REPX {pshufb x, m12}, m2, m3 REPX {pmaddwd x, m13}, m2, m3 REPX {pshufb x, m14}, m0, m8 REPX {pmaddwd x, m15}, m0, m8 phaddd m2, m0 phaddd m3, m8 shr myd, 6 mov r9d, 64 << 24 lea myd, [t1+myq] cmovnz r9q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m2, m3 REPX {psrad x, xm6}, m2, m3 packssdw m2, m3 ; 4 5 pshufd m3, m2, q1032 ; 5 _ punpcklwd m0, m1, m9 ; 01 punpckhwd m1, m9 ; 23 punpcklwd m2, m3 ; 45 movq m10, r9 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova 
[stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 mov [stk+0xe0], r4 mova m3, [base+spel_s_shuf8] mova m0, [stk+0x60] mova m1, [stk+0x70] mova m2, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m pshufb m0, m3 ; 01 pshufb m1, m3 ; 23 pshufb m2, m3 ; 45 movd m7, r4 movd m4, r5 mov r5, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xe0] %define dstq r5 %define tmpq r5 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m4 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] %endif .dy2_w4_loop: pmaddwd m8, m0, m3 pmaddwd m9, m1, m3 mova m0, m2 pmaddwd m1, m4 pmaddwd m11, m2, m4 paddd m8, vrnd_mem paddd m9, vrnd_mem pmaddwd m2, m5 paddd m8, m1 paddd m9, m11 paddd m8, m2 movu m6, [srcq+ssq*0] movu m1, [srcq+ssq*2] %if ARCH_X86_64 movu m11, [srcq+r4 ] movu m2, [srcq+r11] %else movu m11, [r4+ssq*0] movu m2, [r4+ssq*2] %endif pshufb m6, m12 pshufb m1, m12 pmaddwd m6, m13 pmaddwd m1, m13 pshufb m11, m14 pshufb m2, m14 pmaddwd m11, m15 pmaddwd m2, m15 phaddd m6, m11 phaddd m1, m2 paddd m6, hrnd_mem paddd m1, hrnd_mem psrad m6, hsh_mem psrad m1, hsh_mem movu m7, [srcq+ssq*1] movu m11, [srcq+ss3q ] packssdw m6, m1 ; 6 8 %if ARCH_X86_64 movu m2, [srcq+r6 ] movu m1, [srcq+r13] %else movu m2, [r4+ssq*1] movu m1, 
[r4+ss3q ] %endif pshufb m7, m12 pshufb m11, m12 pmaddwd m7, m13 pmaddwd m11, m13 pshufb m2, m14 pshufb m1, m14 pmaddwd m2, m15 pmaddwd m1, m15 phaddd m7, m2 phaddd m11, m1 paddd m7, hrnd_mem paddd m11, hrnd_mem psrad m7, hsh_mem psrad m11, hsh_mem packssdw m7, m11 ; 7 9 %if ARCH_X86_32 lea r4, [r4+ssq*4] %endif lea srcq, [srcq+ssq*4] punpcklwd m1, m6, m7 ; 67 punpckhwd m6, m7 ; 89 mova m2, m6 pmaddwd m11, m1, m5 pmaddwd m7, m1, m10 pmaddwd m6, m10 paddd m9, m11 %if isput movd m11, vsh_mem %endif paddd m8, m7 paddd m9, m6 %if isput psrad m8, m11 psrad m9, m11 packssdw m8, m9 pxor m7, m7 pmaxsw m8, m7 pminsw m8, pxmaxm movq [dstq+dsq*0], m8 movhps [dstq+dsq*1], m8 lea dstq, [dstq+dsq*2] %else psrad m8, 6 psrad m9, 6 packssdw m8, m9 mova [tmpq], m8 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy2_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isput %define dstq r0 %else %define tmpq r0 %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 
pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 
mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, 
[stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy2_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] 
mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x40], m2 mova [stk+0x50], m3 .dy2_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep %if ARCH_X86_64 MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 mova [stk+0xd0], m4 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 mova m4, [stk+0xd0] mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov r3, r3m MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova m4, [stk+0x180] mova m5, [stk+0x190] mova 
[stk+0x80], m6
    mova [stk+0x90], m7
    mova m6, [stk+0x1a0]
    mova m7, [stk+0x1b0]
    mov r0, r0m
%endif
    ; Tail of the MC_8TAP_SCALED macro (its beginning lies outside this
    ; excerpt): loop back for the next output row of the dy2 path.
    jmp .dy2_vloop
INIT_XMM ssse3
.ret:
    MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
    ; Only for 32-bit "put" builds that need more stack alignment than the
    ; ABI provides: re-point the r0m..r3m argument aliases at
    ; rstk+stack_offset.
    %define r0m [rstk+stack_offset+ 4]
    %define r1m [rstk+stack_offset+ 8]
    %define r2m [rstk+stack_offset+12]
    %define r3m [rstk+stack_offset+16]
%endif
%undef isput
%undef isprep
%endmacro

; BILIN_SCALED_FN <put|prep>
; Emits the entry point %1_bilin_scaled_16bpc. It loads the same constant
; (5*15 in both 16-bit halves) into t0d and t1d and tail-jumps to
; %1_8tap_scaled_16bpc, so the bilinear variant shares the 8-tap scaled code.
; NOTE(review): presumably 5*15 is a row offset into the subpel filter table
; selecting bilinear coefficients - confirm against the FN macro.
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_16bpc
    mov t0d, (5*15 << 16) | 5*15
    mov t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
%endmacro

; Temporary register assignment (t0, t1) for the put variants, per ABI.
%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif

; One entry point per filter-type pair. Every line except the last names
; put_8tap_scaled_16bpc as its 4th argument; the "regular" line omits it and
; is immediately followed by the MC_8TAP_SCALED body.
; NOTE(review): FN is defined outside this excerpt - presumably the 4th
; argument is a jump target and "regular" falls through; confirm.
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_16bpc
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put

; Temporary register assignment (t0, t1) for the prep variants, per ABI.
%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif

; Same scheme as the put list above, for prep_8tap_scaled_16bpc.
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_16bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep

; Temporary register t0 for the warp functions below.
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif

; warp_affine_8x8t_16bpc
; Warp variant that stores signed 16-bit intermediates: each pair of 32-bit
; row sums has warp8x8t_rnd added, is shifted right by 15 and packed with
; signed saturation. All filtering is done by the .main/.main2/.main3
; subroutines of warp_affine_8x8_16bpc, so both functions must use the same
; stack layout.
; Per loop iteration two rows are stored; x86-64 addresses them as dsq*0 and
; dsq*2 and advances by dsq*4, while x86-32 doubles ds once up front and
; steps the destination pointer by dsm.
%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that
; by allocating 16 bytes more stack space so that stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
                                mx, tmp, alpha, beta, \
                                filter, my, gamma, cnt
; Remembered so that warp_affine_8x8_16bpc can assert an identical layout.
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                filter, mx, my
; x86-32 has only 8 xmm registers and 7 GPRs here: m8/m9, the row counter
; and dst/ds are kept in stack slots.
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm [esp+4*66]
%else
%define dstm r0m
%define dsm r1m
%endif
%endif
%define base filterq-$$
    mov t0d, r7m
    LEA filterq, $$
    ; t0 becomes a table index used by the shared .main (r7m is labelled
    ; pixel_max in warp_affine_8x8_16bpc).
    shr t0d, 11
%if ARCH_X86_64
    movddup m8, [base+warp8x8t_rnd]
%else
    movddup m1, [base+warp8x8t_rnd]
    mov r1, r1m
    add r1, r1
    mova m8, m1
    mov r1m, r1 ; ds *= 2
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
    jmp .start
.loop:
%if ARCH_X86_64
    lea dstq, [dstq+dsq*4]
%else
    add dstq, dsm
    mov dstm, dstq
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
    mov dstq, dstm
%endif
    ; First row of the pair: round, shift, pack, store.
    paddd m1, m8
    paddd m2, m8
    psrad m1, 15
    psrad m2, 15
    packssdw m1, m2
    mova [dstq+dsq*0], m1
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
    mov dstq, dstm
    add dstq, dsm
%endif
    ; Second row of the pair.
    paddd m1, m8
    paddd m2, m8
    psrad m1, 15
    psrad m2, 15
    packssdw m1, m2
    mova [dstq+dsq*2], m1
    ; cnt is initialised to 4 inside .main, giving 4 iterations of 2 rows.
    dec cntd
    jg .loop
    RET

%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                               mx, tmp, alpha, beta, \
                               filter, my, gamma, cnt
ASSERT
stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                               filter, mx, my
%endif
    ; warp_affine_8x8_16bpc: warp filter producing an 8x8 block of pixels.
    ; Each row of 32-bit sums from .main/.main2/.main3 is shifted right by
    ; 16, packed, clamped below by m6 (zero), scaled with pmulhrsw by m8
    ; (warp8x8_rnd2 entry) and clamped above by m9 (pixel_max).
    mov t0d, r7m
    LEA filterq, $$
    shr t0d, 11 ; pixel_max >> 11, used as index into the per-bitdepth tables
%if ARCH_X86_64
    movddup m8, [base+warp8x8_rnd2+t0*8]
    movd m9, r7m ; pixel_max
    pshufb m9, [base+pw_256]
%else
    movddup m1, [base+warp8x8_rnd2+t0*8]
    movd m2, r7m ; pixel_max
    pshufb m2, [base+pw_256]
    mova m8, m1
    mova m9, m2
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_64
    lea dstq, [dstq+dsq*2]
%else
    add dstq, dsm
    mov dstm, dstq
%endif
    call .main2
.start:
%if ARCH_X86_32
    mov dstq, dstm
%endif
    psrad m1, 16
    psrad m2, 16
    packssdw m1, m2
    pmaxsw m1, m6
    pmulhrsw m1, m8
    pminsw m1, m9
    mova [dstq+dsq*0], m1
    call .main3
%if ARCH_X86_32
    mov dstq, dstm
    add dstq, dsm
%endif
    psrad m1, 16
    psrad m2, 16
    packssdw m1, m2
    pmaxsw m1, m6
    pmulhrsw m1, m8
    pminsw m1, m9
    mova [dstq+dsq*1], m1
    dec cntd
    jg .loop
    RET
ALIGN function_align
; .main: one-time setup plus horizontal filtering of the first 7 source rows.
; Loads the shift (m0) and rounding (m7) constants selected by t0, points
; filterq at mc_warp_filter, reads the four 16-bit values alpha, beta, gamma,
; delta, rewinds src by 3 rows, biases mx/my by 512+(64<<10) and sets cnt=4.
; The horizontally filtered rows are interleaved as vertical neighbour pairs
; (01, 12, 23, 34, 45, 56) and written to the stack slots that WARP_V reads.
; Falls through into .main2.
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov deltaq, r5m
    mov mxd, r6m
%endif
    movd m0, [base+warp8x8_shift+t0*4]
    movddup m7, [base+warp8x8_rnd1+t0*8]
    add filterq, mc_warp_filter-$$
%if ARCH_X86_64
    movsx alphad, word [deltaq+2*0]
    movsx betad, word [deltaq+2*1]
    movsx gammad, word [deltaq+2*2]
    movsx deltad, word [deltaq+2*3]
    lea tmpq, [ssq*3]
    add mxd, 512+(64<<10)
    sub srcq, tmpq ; src -= ss*3
    imul tmpd, alphad, -7
    mov myd, r7m
    add betad, tmpd ; beta -= alpha*7
    imul tmpd, gammad, -7
    add myd, 512+(64<<10)
    mov cntd, 4
    add deltad, tmpd ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset - gprsize
%endif
    mov r3d, r5m ; abcd
%if STACK_ALIGNMENT < 16
    mov r0, r1m ; dst
    mov r1, r2m ; ds
    mov [esp+gprsize+4*65], r0
    mov [esp+gprsize+4*66], r1
%endif
    movsx alphad, word [r3+2*0]
    movsx r2d, word [r3+2*1]
    movsx gammad, word [r3+2*2]
    movsx r3d, word [r3+2*3]
    imul r5d, alphad, -7
    add r2d, r5d ; beta -= alpha*7
    imul r5d, gammad, -7
    mov [esp+gprsize+4*60], r2d
    add r3d, r5d ; delta -= gamma*7
    mov [esp+gprsize+4*61], r3d
    mov r3d, r4m ; ss
    mov srcq, r3m
    mov mxd, r6m
    mov myd, r7m
    mov dword [esp+gprsize+4*63], 4 ; cnt
    mov [esp+gprsize+4*62], r3
    lea r3, [r3*3]
    add mxd, 512+(64<<10)
    add myd, 512+(64<<10)
    sub srcq, r3 ; src -= ss*3
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset + gprsize
%endif
%endif
    ; Slot 0 holds the horizontal shift count that .h applies with psrad.
    mova [rsp+gprsize], m0
    pxor m6, m6
    call .h
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 01
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 1], m1
    mova [rsp+gprsize+16* 4], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 12
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 7], m1
    mova [rsp+gprsize+16*10], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 23
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 2], m1
    mova [rsp+gprsize+16* 5], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 34
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 8], m1
    mova [rsp+gprsize+16*11], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 45
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 3], m1
    mova [rsp+gprsize+16* 6], m5
    mova m5, m0
    call .h
    punpcklwd m1, m5, m0 ; 56
    punpckhwd m5, m0
    mova [rsp+gprsize+16* 9], m1
    mova [rsp+gprsize+16*12], m5
    mova m5, m0
; .main2: filter one more source row horizontally, then run the vertical
; filter over stack slots 1-6 (even output rows). Returns the 8 column sums
; as 32-bit values in m1 (columns a-d) and m2 (columns e-h).
.main2:
    call .h
; WARP_V: vertical 8-tap filter for one output row.
; Each of the 8 columns (a..h) uses its own filter, looked up with my>>10
; while my advances by gamma per column; after the last column my gets the
; pre-adjusted delta added ("my += delta"). Filter bytes are unpacked into
; the high byte of each word against zero (m6), hence the "<< 8" notes.
; The three slots per half are rotated while being read (%1 <- %2 <- %3) and
; the new row pair "67" built from m5 (previous row) and m0 (newest row) is
; written to the last slot. m6 is used as scratch and zeroed again before
; the macro ends; m5 is left holding the newest row.
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m4, [filterq+myq*8] ; a
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m2, [filterq+tmpq*8] ; b
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m3, [filterq+myq*8] ; c
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m1, [filterq+tmpq*8] ; d
    lea tmpd, [myq+gammaq]
    shr myd, 10
    punpcklwd m4, m2
    punpcklwd m3, m1
    punpckldq m2, m4, m3
    punpckhdq m4, m3
    punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    pmaddwd m1, [rsp+gprsize+16*%1]
    punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    mova m2, [rsp+gprsize+16*%2]
    pmaddwd m3, m2
    mova [rsp+gprsize+16*%1], m2
    paddd m1, m3
    punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    mova m2, [rsp+gprsize+16*%3]
    pmaddwd m3, m2
    mova [rsp+gprsize+16*%2], m2
    paddd m1, m3
    punpcklwd m3, m5, m0 ; 67
    punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd m2, m3
    mova [rsp+gprsize+16*%3], m3
    paddd m1, m2
    movq m4, [filterq+myq*8] ; e
    lea myd, [tmpq+gammaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8] ; f
    lea tmpd, [myq+gammaq]
    shr myd, 10
    movq m2, [filterq+myq*8] ; g
%if ARCH_X86_64
    lea myd, [tmpq+deltaq] ; my += delta
%else
    mov myd, [esp+gprsize+4*61]
    add myd, tmpd
%endif
    shr tmpd, 10
    punpcklwd m4, m3
    movq m3, [filterq+tmpq*8] ; h
    punpcklwd m2, m3
    punpckldq m3, m4, m2
    punpckhdq m4, m2
    punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
    pmaddwd m2, [rsp+gprsize+16*%4]
    punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
    mova m3, [rsp+gprsize+16*%5]
    pmaddwd m6, m3
    mova [rsp+gprsize+16*%4], m3
    pxor m3, m3
    paddd m2, m6
    punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
    mova m6, [rsp+gprsize+16*%6]
    pmaddwd m3, m6
    mova [rsp+gprsize+16*%5], m6
    punpckhwd m5, m0
    pxor m6, m6
    paddd m2, m3
    punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
    pmaddwd m3, m5
    mova [rsp+gprsize+16*%6], m5
    mova m5, m0
    paddd m2, m3
%endmacro
    WARP_V 1, 2, 3, 4, 5, 6
    ret
; .main3: same as .main2 but on stack slots 7-12 (odd output rows).
.main3:
    call .h
    WARP_V 7, 8, 9, 10, 11, 12
    ret
ALIGN function_align
; .h: horizontal 8-tap filter for one source row.
; Produces 8 output pixels (numbered 0-7 in the comments); pixel i uses the
; filter selected by mx>>10 and the 8 source pixels starting at
; srcq + 2*(i-3) bytes. mx advances by alpha per pixel and finally by the
; pre-adjusted beta ("mx += beta"); srcq advances by one row. The sums get
; m7 added, are shifted right by the count saved at stack slot 0, and are
; returned packed as 8 words in m0. Requires m6 == 0. The stack offsets use
; gprsize*2 because this routine sits one call level deeper than .main.
.h:
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    punpcklbw m0, m6, m3
    movu m3, [srcq-6]
    pmaddwd m0, m3 ; 0
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m2, m6, m3
    movu m3, [srcq-4]
    pmaddwd m2, m3 ; 1
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m0, m2 ; 0 1
    punpcklbw m2, m6, m3
    movu m3, [srcq-2]
    pmaddwd m2, m3 ; 2
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m1, m6, m3
    movu m3, [srcq+0]
    pmaddwd m1, m3 ; 3
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m2, m1 ; 2 3
    punpcklbw m1, m6, m3
    movu m3, [srcq+2]
    pmaddwd m1, m3 ; 4
    lea mxd, [tmpq+alphaq]
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    phaddd m0, m2 ; 0 1 2 3
    punpcklbw m2, m6, m3
    movu m3, [srcq+4]
    pmaddwd m2, m3 ; 5
    lea tmpd, [mxq+alphaq]
    shr mxd, 10
    movq m3, [filterq+mxq*8]
    phaddd m1, m2 ; 4 5
    punpcklbw m2, m6, m3
    movu m3, [srcq+6]
    pmaddwd m2, m3 ; 6
%if ARCH_X86_64
    lea mxd, [tmpq+betaq] ; mx += beta
%else
    mov mxd, [esp+gprsize*2+4*60]
    add mxd, tmpd
%endif
    shr tmpd, 10
    movq m3, [filterq+tmpq*8]
    punpcklbw m4, m6, m3
    movu m3, [srcq+8]
%if ARCH_X86_64
    add srcq, ssq
%else
    add srcq, [esp+gprsize*2+4*62]
%endif
    pmaddwd m3, m4 ; 7
    phaddd m2, m3 ; 6 7
    phaddd m1, m2 ; 4 5 6 7
    paddd m0, m7
    paddd m1, m7
    psrad m0, [rsp+gprsize*2]
    psrad m1, [rsp+gprsize*2]
    packssdw m0, m1
    ret

; BIDIR_FN: store loops shared by avg, w_avg and mask.
; Expects the including function to provide a local .main that returns 16
; output pixels (two registers of 8 words) in m0/m1 and advances its input
; pointers, and wq to hold the address of the width-specific entry label.
; .main is called once before the jump, so every .wN label is entered with
; m0/m1 already valid; the .wN_loop labels refill them.
; Rows consumed per pass: w4 = 4, w8 = 2, w16 and wider = 1.
%macro BIDIR_FN 0
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.ret:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jne .w8_loop
    RET
.w16_loop:
    call .main
    add dstq, strideq
.w16:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    dec hd
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova [dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
%endmacro

; Temporary register t0 for the bidirectional functions below.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

; avg_16bpc: combines the two intermediate buffers tmp1 and tmp2 into dst.
; The width is dispatched through avg_ssse3_table (indexed by tzcnt(w)).
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
    LEA r6, avg_ssse3_table
    tzcnt wd, wm
    mov t0d, r6m ; pixel_max
; Remainder of the avg_16bpc prologue: fetch the jump-table offset for this
; width and the constants selected by t0 (pixel_max >> 11).
    movsxd wq, [r6+wq*4]
    shr t0d, 11
    movddup m2, [base+bidir_rnd+t0*8]
    movddup m3, [base+bidir_mul+t0*8]
    movifnidn hd, hm
    add wq, r6
    BIDIR_FN
ALIGN function_align
; avg .main: m0/m1 = 16 output pixels.
; Adds tmp1 and tmp2 with signed saturation, then applies
; max(x, m2) - m2 (which also clamps the result at zero) and a high-half
; multiply by m3 to scale down to the pixel range.
.main:
    mova m0, [tmp1q+16*0]
    paddsw m0, [tmp2q+16*0]
    mova m1, [tmp1q+16*1]
    paddsw m1, [tmp2q+16*1]
    add tmp1q, 16*2
    add tmp2q, 16*2
    pmaxsw m0, m2
    pmaxsw m1, m2
    psubsw m0, m2
    psubsw m1, m2
    pmulhw m0, m3
    pmulhw m1, m3
    ret

; w_avg_16bpc: weighted combination of tmp1 and tmp2.
; r6m = weight, r7m = pixel_max. m4 holds the word pair (16-weight, weight)
; broadcast to all lanes; when pixel_max has bit 0x800 set the pair is
; pre-shifted left by 2. m5 = (pd_65538 + pixel_max words) << 7 is added to
; every product sum before the >> 8; m6 = pixel_max and m7 = 0 are the final
; clamp bounds.
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
    LEA r6, w_avg_ssse3_table
    tzcnt wd, wm
    mov t0d, r6m ; weight
    movd m6, r7m ; pixel_max
    movddup m5, [base+pd_65538]
    movsxd wq, [r6+wq*4]
    pshufb m6, [base+pw_256]
    add wq, r6
    lea r6d, [t0-16]
    shl t0d, 16
    sub t0d, r6d ; 16-weight, weight
    paddw m5, m6
    mov r6d, t0d
    shl t0d, 2
    test dword r7m, 0x800
    cmovnz r6d, t0d
    movifnidn hd, hm
    movd m4, r6d
    pslld m5, 7
    pxor m7, m7
    pshufd m4, m4, q0000
    BIDIR_FN
ALIGN function_align
; w_avg .main: m0/m1 = 16 output pixels.
; tmp2/tmp1 words are interleaved so that one pmaddwd against m4 yields
; tmp2*(16-weight) + tmp1*weight per pixel.
.main:
    mova m2, [tmp1q+16*0]
    mova m0, [tmp2q+16*0]
    punpckhwd m3, m0, m2
    punpcklwd m0, m2
    mova m2, [tmp1q+16*1]
    mova m1, [tmp2q+16*1]
    add tmp1q, 16*2
    add tmp2q, 16*2
    pmaddwd m3, m4
    pmaddwd m0, m4
    paddd m3, m5
    paddd m0, m5
    psrad m3, 8
    psrad m0, 8
    packssdw m0, m3
    punpckhwd m3, m1, m2
    punpcklwd m1, m2
    pmaddwd m3, m4
    pmaddwd m1, m4
    paddd m3, m5
    paddd m1, m5
    psrad m3, 8
    psrad m1, 8
    packssdw m1, m3
    pminsw m0, m6
    pminsw m1, m6
    pmaxsw m0, m7
    pmaxsw m1, m7
    ret

; mask_16bpc: blends tmp1 and tmp2 using a per-pixel 8-bit mask read from
; maskq (m in the comments; its complement is taken against pw_64).
; x86-32 has no m8 register and no free GPR for h, so pw_64 is used straight
; from memory and hd aliases the stack argument.
%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
    LEA r6, mask_ssse3_table
    tzcnt wd, wm
    mov t0d, r7m ; pixel_max
    shr t0d, 11
    movsxd wq, [r6+wq*4]
    movddup m6, [base+bidir_rnd+t0*8]
    movddup m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
    mova m8, [base+pw_64]
    movifnidn hd, hm
%endif
    add wq, r6
    mov maskq, r6mp
    BIDIR_FN
ALIGN function_align
; mask .main: m0/m1 = 16 output pixels; consumes 16 mask bytes.
.main:
    movq m3, [maskq+8*0]
    mova m0, [tmp1q+16*0]
    mova m4, [tmp2q+16*0]
    pxor m5, m5
    punpcklbw m3, m5
    punpckhwd m2, m0, m4
    punpcklwd m0, m4
    psubw m1, m8, m3
    punpckhwd m4, m3, m1 ; m, 64-m
    punpcklwd m3, m1
    pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
    pmaddwd m0, m3
    movq m3, [maskq+8*1]
    mova m1, [tmp1q+16*1]
    mova m4, [tmp2q+16*1]
    add maskq, 8*2
    add tmp1q, 16*2
    add tmp2q, 16*2
    psrad m2, 5
    psrad m0, 5
    packssdw m0, m2
    punpcklbw m3, m5
    punpckhwd m2, m1, m4
    punpcklwd m1, m4
    psubw m5, m8, m3
    punpckhwd m4, m3, m5 ; m, 64-m
    punpcklwd m3, m5
    pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m)
    pmaddwd m1, m3
    psrad m2, 5
    psrad m1, 5
    packssdw m1, m2
    ; Same final scaling as avg: max(x, rnd) - rnd, then high-half multiply.
    pmaxsw m0, m6
    pmaxsw m1, m6
    psubsw m0, m6
    psubsw m1, m6
    pmulhw m0, m7
    pmulhw m1, m7
    ret

; w_mask_420_16bpc: blends tmp1/tmp2 with a mask derived from their absolute
; difference (see W_MASK below) and writes one mask byte per 2x2 pixel block.
; r7m = sign, r8m = pixel_max. m7 = (2 - sign) broadcast; each output mask
; byte is (sum of the four per-pixel m values + m7) >> 2.
; .main returns pixels in m0/m1 and the matching per-pixel m words in m2/m3.
; Because two rows are needed per mask row, the loops park the first row's m
; words in the second destination row (dstq+strideq*1), add them to the
; second row's m words later, and only then overwrite that scratch space
; with the second row's pixels.
cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA t0, w_mask_420_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movd m0, r7m ; sign
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ; x86-32: keep the four constants on the stack. The m8-m11 aliases
    ; include gprsize because they are read inside .main, one call deep.
    ALLOC_STACK -16*4
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
    mova [rsp+16*3], m4
    %define m8 [rsp+gprsize+16*0]
    %define m9 [rsp+gprsize+16*1]
    %define m10 [rsp+gprsize+16*2]
    %define m11 [rsp+gprsize+16*3]
%endif
    movd m7, [base+pw_2]
    psubw m7, m0
    pshufb m7, [base+pw_256]
    add wq, t0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
; w4: one .main call covers 4 rows of 4 pixels -> 4 mask bytes.
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 4
.w4:
    movq [dstq+strideq*0], m0
    phaddw m2, m3
    movhps [dstq+strideq*1], m0
    phaddd m2, m2
    lea dstq, [dstq+strideq*2]
    paddw m2, m7
    movq [dstq+strideq*0], m1
    psrlw m2, 2
    movhps [dstq+strideq*1], m1
    packuswb m2, m2
    movd [maskq], m2
    sub hd, 4
    jg .w4_loop
    RET
; w8: one .main call covers 2 rows of 8 pixels -> 4 mask bytes.
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 4
.w8:
    mova [dstq+strideq*0], m0
    paddw m2, m3
    phaddw m2, m2
    mova [dstq+strideq*1], m1
    paddw m2, m7
    psrlw m2, 2
    packuswb m2, m2
    movd [maskq], m2
    sub hd, 2
    jg .w8_loop
    RET
; w16 and wider: 2 rows per pass, first row's m words parked in dst row 1.
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 8
.w16:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    paddw m2, [dstq+strideq*1+16*0]
    paddw m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*1], m1
    paddw m2, m7
    psrlw m2, 2
    packuswb m2, m2
    movq [maskq], m2
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16
.w32:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*0+16*2], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m1
    call .main
    paddw m2, [dstq+strideq*1+16*0]
    paddw m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*2]
    paddw m2, [dstq+strideq*1+16*3]
    mova [dstq+strideq*1+16*2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova [maskq], m3
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16*2
.w64:
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*2], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*2], m0
    mova [dstq+strideq*1+16*4], m3
    mova [dstq+strideq*0+16*3], m1
    call .main
    mova [dstq+strideq*1+16*5], m2
    mova [dstq+strideq*0+16*4], m0
    mova [dstq+strideq*1+16*6], m3
    mova [dstq+strideq*0+16*5], m1
    call .main
    mova [dstq+strideq*0+16*6], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*7], m2
    mova [dstq+strideq*0+16*7], m1
    call .main
    paddw m2, [dstq+strideq*1+16*1]
    paddw m3, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    paddw m2, [dstq+strideq*1+16*3]
    paddw m3, [dstq+strideq*1+16*4]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova [maskq+16*0], m3
    call .main
    paddw m2, [dstq+strideq*1+16*5]
    paddw m3, [dstq+strideq*1+16*6]
    mova [dstq+strideq*1+16*4], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*6], m2
    mova [dstq+strideq*1+16*5], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*6]
    paddw m2, [dstq+strideq*1+16*7]
    mova [dstq+strideq*1+16*6], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*7], m1
    packuswb m3, m2
    mova [maskq+16*1], m3
    sub hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 16*4
.w128:
    mova [dstq+strideq*1+16* 1], m2
    mova [dstq+strideq*0+16* 0], m0
    mova [dstq+strideq*1+16* 2], m3
    mova [dstq+strideq*0+16* 1], m1
    call .main
    mova [dstq+strideq*1+16* 3], m2
    mova [dstq+strideq*0+16* 2], m0
    mova [dstq+strideq*1+16* 4], m3
    mova [dstq+strideq*0+16* 3], m1
    call .main
    mova [dstq+strideq*1+16* 5], m2
    mova [dstq+strideq*0+16* 4], m0
    mova [dstq+strideq*1+16* 6], m3
    mova [dstq+strideq*0+16* 5], m1
    call .main
    mova [dstq+strideq*1+16* 7], m2
    mova [dstq+strideq*0+16* 6], m0
    mova [dstq+strideq*1+16* 8], m3
    mova [dstq+strideq*0+16* 7], m1
    call .main
    mova [dstq+strideq*1+16* 9], m2
    mova [dstq+strideq*0+16* 8], m0
    mova [dstq+strideq*1+16*10], m3
    mova [dstq+strideq*0+16* 9], m1
    call .main
    mova [dstq+strideq*1+16*11], m2
    mova [dstq+strideq*0+16*10], m0
    mova [dstq+strideq*1+16*12], m3
    mova [dstq+strideq*0+16*11], m1
    call .main
    mova [dstq+strideq*1+16*13], m2
    mova [dstq+strideq*0+16*12], m0
    mova [dstq+strideq*1+16*14], m3
    mova [dstq+strideq*0+16*13], m1
    call .main
    mova [dstq+strideq*0+16*14], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*15], m2
    mova [dstq+strideq*0+16*15], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 1]
    paddw m3, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 0], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16* 2], m2
    mova [dstq+strideq*1+16* 1], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 3]
    paddw m3, [dstq+strideq*1+16* 4]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 2], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16* 3], m1
    packuswb m3, m2
    mova [maskq+16*0], m3
    call .main
    paddw m2, [dstq+strideq*1+16* 5]
    paddw m3, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 4], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16* 6], m2
    mova [dstq+strideq*1+16* 5], m1
    call .main
    paddw m2, [dstq+strideq*1+16* 7]
    paddw m3, [dstq+strideq*1+16* 8]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 6], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16* 7], m1
    packuswb m3, m2
    mova [maskq+16*1], m3
    call .main
    paddw m2, [dstq+strideq*1+16* 9]
    paddw m3, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16* 8], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*10], m2
    mova [dstq+strideq*1+16* 9], m1
    call .main
    paddw m2, [dstq+strideq*1+16*11]
    paddw m3, [dstq+strideq*1+16*12]
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16*10], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*11], m1
    packuswb m3, m2
    mova [maskq+16*2], m3
    call .main
    paddw m2, [dstq+strideq*1+16*13]
    paddw m3, [dstq+strideq*1+16*14]
    mova [dstq+strideq*1+16*12], m0
    phaddw m2, m3
    mova [dstq+strideq*1+16*14], m2
    mova [dstq+strideq*1+16*13], m1
    call .main
    phaddw m2, m3
    paddw m3, m7, [dstq+strideq*1+16*14]
    paddw m2, [dstq+strideq*1+16*15]
    mova [dstq+strideq*1+16*14], m0
    paddw m2, m7
    psrlw m3, 2
    psrlw m2, 2
    mova [dstq+strideq*1+16*15], m1
    packuswb m3, m2
    mova [maskq+16*3], m3
    sub hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
; W_MASK <n>, <mreg>: process 8 pixels at byte offset 16*n of tmp1/tmp2.
; From d = |tmp1 - tmp2| (saturating) it derives
;   64-m = (m8 -us d) >> 10   (unsigned saturating subtract, m8 = pw_27615)
;   m    = m9 - (64-m)        (m9 = pw_64)
; and blends tmp2*(64-m) + tmp1*m, >> 5, followed by the same
; max/sub/pmulhw scaling (m10/m11) used by avg and mask.
; On return m<n> holds the 8 output pixels and m<mreg> the 8 m values as
; words. Clobbers m4, m5 and m6.
%macro W_MASK 2 ; dst/tmp_offset, mask
    mova m%1, [tmp1q+16*%1]
    mova m%2, [tmp2q+16*%1]
    punpcklwd m4, m%2, m%1
    punpckhwd m5, m%2, m%1
    psubsw m%1, m%2
    pabsw m%1, m%1
    psubusw m6, m8, m%1
    psrlw m6, 10 ; 64-m
    psubw m%2, m9, m6 ; m
    punpcklwd m%1, m6, m%2
    punpckhwd m6, m%2
    pmaddwd m%1, m4
    pmaddwd m6, m5
    psrad m%1, 5
    psrad m6, 5
    packssdw m%1, m6
    pmaxsw m%1, m10
    psubsw m%1, m10
    pmulhw m%1, m11
%endmacro
    W_MASK 0, 2
    W_MASK 1, 3
    add tmp1q, 16*2
    add tmp2q, 16*2
    ret

; w_mask_422_16bpc: same blend as w_mask_420, but one mask byte is written
; per horizontal pixel pair and .main stores the mask itself, so the store
; loops only write pixels (w4-w16 mirror BIDIR_FN except that w16 handles
; two rows per pass). m7 = sign, broadcast to every byte.
; NOTE(review): the x86-32 path stores the constants to the stack but does
; not define m8-m11 here; it appears to rely on the %defines made in
; w_mask_420_16bpc still being in effect - confirm.
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA t0, w_mask_422_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movd m7, r7m ; sign
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615]
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615]
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*4
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
    mova [rsp+16*3], m4
%endif
    pxor m0, m0
    add wq, t0
    pshufb m7, m0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova
[dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
; w_mask_422 .main: m0/m1 = 16 output pixels; writes 8 mask bytes.
; The per-pixel m words in m2/m3 are summed in horizontal pairs (phaddw),
; packed to bytes, reduced by the sign byte in m7 and then averaged with
; zero, i.e. each mask byte is (m_left + m_right - sign + 1) >> 1.
.main:
    W_MASK 0, 2
    W_MASK 1, 3
    phaddw m2, m3
    add tmp1q, 16*2
    add tmp2q, 16*2
    packuswb m2, m2
    pxor m3, m3
    psubb m2, m7
    pavgb m2, m3
    movq [maskq], m2
    add maskq, 8
    ret

; w_mask_444_16bpc: same blend as the 420/422 variants, but the mask is
; stored at full resolution (one byte per pixel) and no sign argument is
; read. The store loops are identical to those of w_mask_422_16bpc.
; NOTE(review): on x86-32 only m11 is (re)defined here, as register m7; m8,
; m9 and m10 appear to rely on the stack aliases defined in
; w_mask_420_16bpc - confirm.
cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA t0, w_mask_444_ssse3_table
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    shr r6d, 11
    movsxd wq, [t0+wq*4]
%if ARCH_X86_64
    mova m8, [base+pw_27615]
    mova m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    mova m1, [base+pw_27615]
    mova m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m7, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*3
    mova [rsp+16*0], m1
    mova [rsp+16*1], m2
    mova [rsp+16*2], m3
    %define m11 m7
%endif
    add wq, t0
    movifnidn hd, r5m
    mov maskq, r6mp
    call .main
    jmp wq
.w4_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w4:
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea dstq, [dstq+strideq*2]
    movq [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w8:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add dstq, strideq
.w32:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    call .main
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    call .main
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    call .main
    mova [dstq+16*6], m0
    mova [dstq+16*7], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+16* 0], m0
    mova [dstq+16* 1], m1
    call .main
    mova [dstq+16* 2], m0
    mova [dstq+16* 3], m1
    call .main
    mova [dstq+16* 4], m0
    mova [dstq+16* 5], m1
    call .main
    mova [dstq+16* 6], m0
    mova [dstq+16* 7], m1
    call .main
    mova [dstq+16* 8], m0
    mova [dstq+16* 9], m1
    call .main
    mova [dstq+16*10], m0
    mova [dstq+16*11], m1
    call .main
    mova [dstq+16*12], m0
    mova [dstq+16*13], m1
    call .main
    mova [dstq+16*14], m0
    mova [dstq+16*15], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
; w_mask_444 .main: m0/m1 = 16 output pixels; the 16 per-pixel m words in
; m2/m3 are packed to bytes and stored unmodified.
.main:
    W_MASK 0, 2
    W_MASK 1, 3
    packuswb m2, m3
    add tmp1q, 16*2
    add tmp2q, 16*2
    mova [maskq], m2
    add maskq, 16
    ret

; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
;   except m << 9 overflows int16_t when m == 64 (which is possible),
;   but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
;
; blend_16bpc: in-place blend of tmp into dst with a per-pixel 8-bit mask,
; implementing the last form of the derivation with a = dst and b = tmp:
;   psubw    -> a - b
;   pmullw   -> m * -512           (m7 = pw_m512, mask bytes zero-extended
;                                   against m6 = 0)
;   pmulhrsw -> (x * y + 16384) >> 15
;   paddw    -> + a
; Every pass handles 16 pixels and 16 mask bytes (w32: two such halves):
; w4 = 4 rows, w8 = 2 rows, w16/w32 = 1 row. tmp and mask are read
; contiguously; only dst is strided.
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
    LEA r6, blend_ssse3_table
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    movifnidn maskq, maskmp
    mova m7, [base+pw_m512]
    add wq, r6
    lea stride3q, [strideq*3]
    pxor m6, m6
    jmp wq
.w4:
    mova m5, [maskq]
    movq m0, [dstq+strideq*0]
    movhps m0, [dstq+strideq*1]
    movq m1, [dstq+strideq*2]
    movhps m1, [dstq+stride3q ]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [dstq+strideq*2], m1
    movhps [dstq+stride3q ], m1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w4
    RET
.w8:
    mova m5, [maskq]
    mova m0, [dstq+strideq*0]
    mova m1, [dstq+strideq*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w8
    RET
.w16:
    mova m5, [maskq]
    mova m0, [dstq+16*0]
    mova m1, [dstq+16*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    add maskq, 16
    add tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, strideq
    dec hd
    jg .w16
    RET
.w32:
    mova m5, [maskq+16*0]
    mova m0, [dstq+16*0]
    mova m1, [dstq+16*1]
    psubw m2, m0, [tmpq+16*0]
    psubw m3, m1, [tmpq+16*1]
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    mova m5, [maskq+16*1]
    mova m0, [dstq+16*2]
    mova m1, [dstq+16*3]
    psubw m2, m0, [tmpq+16*2]
    psubw m3, m1, [tmpq+16*3]
    add maskq, 32
    add tmpq, 64
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw m4, m7
    pmullw m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    add dstq, strideq
    dec hd
    jg .w32
    RET

; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh   total filled size
; iw, ih,  copied block -> fill bottom, right
; x, y,    offset in bw/bh -> fill top, left
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
                        y, dst, dstride, src, sstride, \
                        bottomext, rightext, blk
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

    ; Register aliases for the setup phase. x86-32 has too few registers, so
    ; several roles share r0/r1 and values are spilled to / reloaded from
    ; the argument slots (rNm) as needed.
%if ARCH_X86_64
 %define reg_zero r12q
 %define reg_tmp r10
 %define reg_src srcq
 %define reg_bottomext bottomextq
 %define reg_rightext rightextq
 %define reg_blkm r9m
%else
 %define reg_zero r6
 %define reg_tmp r0
 %define reg_src r1
 %define reg_bottomext r0
 %define reg_rightext r1
 %define reg_blkm r2m
%endif
    ;
    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor reg_zero, reg_zero
    lea reg_tmp, [ihq-1]
    cmp yq, ihq
    cmovs reg_tmp, yq
    test yq, yq
    cmovs reg_tmp, reg_zero
%if ARCH_X86_64
    imul reg_tmp, sstrideq
    add srcq, reg_tmp
%else
    imul reg_tmp, sstridem
    mov reg_src, srcm
    add reg_src, reg_tmp
%endif
    ;
    ; ref += iclip(x, 0, iw - 1)
    ; (scaled by 2 in the lea below: pixels are 16 bits wide)
    lea reg_tmp, [iwq-1]
    cmp xq, iwq
    cmovs reg_tmp, xq
    test xq, xq
    cmovs reg_tmp, reg_zero
    lea reg_src, [reg_src+reg_tmp*2]
%if ARCH_X86_32
    mov srcm, reg_src
%endif
    ;
    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
    mov r1, r1m ; restore bh
%endif
    lea reg_bottomext, [yq+bhq]
    sub reg_bottomext, ihq
    lea r3, [bhq-1]
    cmovs reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; top_ext = iclip(-y, 0, bh - 1)
    neg topextq
    cmovs topextq, reg_zero
    cmp reg_bottomext, bhq
    cmovns reg_bottomext, r3
    cmp topextq, bhq
    cmovg topextq, r3
%if ARCH_X86_32
    mov r4m,
reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshuflw m0, m0, q0000 punpcklqdq m0, m0 xor r3, r3 .left_loop_%3: mova [dstq+r3*2], m0 add r3, mmsize/2 cmp r3, leftextq jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu m0, [srcq+r3*2] %else mov r1, srcm movu m0, [r1+r3*2] %endif %if %1 movu [reg_tmp+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, mmsize/2 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea 
reg_tmp, [reg_tmp+centerwq*2] %else lea reg_tmp, [dstq+centerwq*2] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq*2-2] %else mov r3, srcm movd m0, [r3+centerwq*2-2] %endif pshuflw m0, m0, q0000 punpcklqdq m0, m0 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3*2], m0 add r3, mmsize/2 %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1*2] lea r3, [dstq+r1*2] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .bottom_y_loop add r1, mmsize/2 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if 
ARCH_X86_64 mova m0, [srcq+r1*2] %else mov r3, reg_blkm mova m0, [r3+r1*2] %endif lea r3, [dstq+r1*2] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize/2 cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc_avx2.asm000066400000000000000000010236411517466257200233300ustar00rootroot00000000000000; Copyright © 2018-2026, VideoLAN and dav2d authors ; Copyright © 2018-2026, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 32 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 pw_4x84a10_4x42a5: times 4 dw 84, 10 times 4 dw 42, 5 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 morph_A: db 0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1 morph_B: db 8, -1, 12, -1, 9, -1, 13, -1, 10, -1, 14, -1, 11, -1, 15, -1 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f pw_3x42a5_1x84a10: times 3 dw 42, 5 times 1 dw 84, 10 shuf_left_1w: db 0, 1, 0, 1 shuf_right_1w: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15 pb_1_m1: times 2 db 1, -1 pb_64: times 4 db 64 pw_m256: times 2 dw -256 pw_1: times 2 dw 1 pw_15: times 2 dw 15 pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 
pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_63: dd 63 pd_64: dd 64 pd_512: dd 512 pd_32768: dd 32768 pd_0x3ff: dd 0x3ff pd_0x4000: dd 0x4000 pq_16: dq 16 pq_0x40000000: dq 0x40000000 sadrefinemv_idx2off: db -2, -2, -2, -1, -2, +0, -2, +1, -2, +2 db -1, -2, -1, -1, -1, +0, -1, +1, -1, +2 db +0, -2, +0, -1, +0, +1, +0, +2 db +1, -2, +1, -1, +1, +0, +1, +1, +1, +2 db +2, -2, +2, -1, +2, +0, +2, +1, +2, +2 cextern mc_subpel_filters cextern mc_warp_filter2 cextern z_filter_s %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro %xdefine 
put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) %define table_offset(type, fn) type %+ fn %+ _avx2_table - type %+ _avx2 BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64 HV_JMP_TABLE put, 6tap, avx2, 3, 2, 4, 8, 16, 32, 64 HV_JMP_TABLE put, 8tap_sharp, avx2, 3, 2, 4, 8, 16, 32, 64 HV_JMP_TABLE prep, 6tap, avx2, 1, 4, 8, 16, 32, 64 HV_JMP_TABLE prep, 8tap_sharp, avx2, 1, 4, 8, 16, 32, 64 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32, 64 BIDIR_JMP_TABLE morph, avx2, 4, 8, 16, 32, 64 SECTION .text INIT_XMM avx2 cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, 
[srcq+ssq*2] movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET INIT_YMM avx2 .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu [dstq+dsq*0], m0 movu [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xm0, [srcq+ssq*0] pinsrd xm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xm4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pshufb xm1, xm4 pmaddubsw xm0, xm5 pmaddubsw xm1, xm5 pmulhrsw xm0, xm3 pmulhrsw xm1, xm3 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 
pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu [dstq], m0 add dstq, dsq dec hd jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+8*4] movu m2, [srcq+8*5] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w64 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 255 vpbroadcastd m5, [pw_2048] add mxyd, 16 add wq, r7 movd xm4, mxyd vpbroadcastw m4, xm4 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xm1, xm1, q2301 ; 1 0 punpcklbw xm1, xm0 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 1 pextrw [dstq+dsq*1], xm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm1, xm2, xm0, 0x01 ; 0 1 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm2, xm0, 0x02 ; 1 2 punpcklbw xm1, xm2 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+ssq*0] .v_w8_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xm1, xm0, xm2 movq xm0, [srcq+ssq*0] punpcklbw xm2, xm0 pmaddubsw xm1, xm4 pmaddubsw xm2, xm4 pmulhrsw xm1, xm5 pmulhrsw xm2, xm5 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg 
.v_w8_loop RET .v_w16: movu xm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m3, m0, 0x0f ; 0 1 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m3, m0, 0xf0 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 movu [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: movu m0, [srcq+ssq*0] .v_w32_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m0, m3 punpckhbw m2, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 movu [dstq+dsq*0], m1 movu [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] .v_w64_loop: add srcq, ssq movu m3, [srcq+32*0] punpcklbw m2, m0, m3 punpckhbw m0, m3 pmaddubsw m2, m4 pmaddubsw m0, m4 pmulhrsw m2, m5 pmulhrsw m0, m5 packuswb m2, m0 mova m0, m3 movu m3, [srcq+32*1] mova [dstq+32*0], m2 punpcklbw m2, m1, m3 punpckhbw m1, m3 pmaddubsw m2, m4 pmaddubsw m1, m4 pmulhrsw m2, m5 pmulhrsw m1, m5 packuswb m2, m1 mova m1, m3 mova [dstq+32*1], m2 add dstq, dsq dec hd jg .v_w64_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_15] movd xm6, mxyd add wq, r7 paddb m5, m5 vpbroadcastw m6, xm6 jmp wq .hv_w2: vpbroadcastd xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd xm1, [srcq+ssq*0], 1 pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 _ 2 _ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 
pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 0 pextrw [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xm4, [bilin_h_shuf4] movddup xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 2 shufps xm2, xm0, xm1, q1032 ; 0 1 mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 vextracti128 xm2, m1, 1 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: movu m0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm2, [srcq+ssq*1+8*0] vinserti128 m2, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] movu xm3, [srcq+ssq*0+8*0] vinserti128 m3, [srcq+ssq*0+8*1], 1 pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 vpermq m1, m1, q3120 movu [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w64: lea r6d, [hq+(1<<16)] .hv_w32_start: mov r4, srcq mov r7, dstq .hv_w32: %if WIN64 movaps r4m, xmm8 %endif .hv_w32_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] 
pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 pmulhw m8, m6 pavgw m0, m7 paddw m8, m0 mova m0, m2 psubw m2, m3, m1 pmulhw m2, m6 pavgw m1, m7 paddw m2, m1 mova m1, m3 psrlw m8, 4 psrlw m2, 4 packuswb m8, m2 movu [dstq], m8 add dstq, dsq dec hd jg .hv_w32_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<16 jg .hv_w32_loop0 %if WIN64 movaps xmm8, r4m %endif RET INIT_XMM avx2 cglobal prep_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [prep_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r7+wq*2+table_offset(prep,)] lea r6, [ssq*3] add wq, r7 lea r7, [dsq*3] jmp wq .prep_w4: movd xm0, [srcq+ssq*0] pinsrd xm0, [srcq+ssq*1], 1 movd xm1, [srcq+ssq*2] pinsrd xm1, [srcq+r6 ], 1 lea srcq, [srcq+ssq*4] pmovzxbw xm0, xm0 pmovzxbw xm1, xm1 psllw xm0, 4 psllw xm1, 4 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*2], xm0 movq [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w4 RET INIT_YMM avx2 .prep_w8: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*2] movhps xm1, [srcq+r6 ] lea srcq, [srcq+ssq*4] pmovzxbw m0, xm0 pmovzxbw m1, xm1 psllw m0, 4 psllw m1, 4 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*2], m0, 1 mova [dstq+dsq*4], xm1 vextracti128 [dstq+r7 *2], m1, 1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w8 RET .prep_w16: pmovzxbw m0, [srcq+ssq*0] pmovzxbw m1, [srcq+ssq*1] pmovzxbw m2, [srcq+ssq*2] pmovzxbw m3, [srcq+r6 ] lea srcq, [srcq+ssq*4] REPX {psllw x, 4}, m0, m1, m2, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*2], m1 mova [dstq+dsq*4], m2 mova [dstq+r7 *2], m3 lea dstq, [dstq+dsq*8] sub hd, 4 jg .prep_w16 RET .prep_w32: pmovzxbw m0, [srcq+ssq*0+16*0] pmovzxbw m1, [srcq+ssq*0+16*1] pmovzxbw m2, [srcq+ssq*1+16*0] pmovzxbw m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] REPX {psllw x, 4}, m0, m1, m2, m3 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 
mova [dstq+dsq*2+32*0], m2 mova [dstq+dsq*2+32*1], m3 lea dstq, [dstq+dsq*4] sub hd, 2 jg .prep_w32 RET .prep_w64: pmovzxbw m0, [srcq+16*0] pmovzxbw m1, [srcq+16*1] pmovzxbw m2, [srcq+16*2] pmovzxbw m3, [srcq+16*3] add srcq, ssq REPX {psllw x, 4}, m0, m1, m2, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 lea dstq, [dstq+dsq*2] dec hd jg .prep_w64 RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 vbroadcasti128 m4, [z_filter_s+2] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(prep, _bilin_h)] lea r6, [ssq*3] add wq, r7 lea r7, [dsq*3] jmp wq .h_w4: vbroadcasti128 m4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*2] movhps xm1, [srcq+r6 ] lea srcq, [srcq+ssq*4] pshufb xm0, xm4 pshufb xm1, xm4 pmaddubsw xm0, xm5 pmaddubsw xm1, xm5 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*2], xm0 movq [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*2] vinserti128 m1, [srcq+r6 ], 1 lea srcq, [srcq+ssq*4] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*2], m0, 1 mova [dstq+dsq*4], xm1 vextracti128 [dstq+r7 *2], m1, 1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 movu xm1, [srcq+ssq*1+8*0] vinserti128 m1, [srcq+ssq*1+8*1], 1 movu xm2, [srcq+ssq*2+8*0] vinserti128 m2, [srcq+ssq*2+8*1], 1 movu xm3, [srcq+r6 +8*0] vinserti128 m3, [srcq+r6 +8*1], 1 lea srcq, [srcq+ssq*4] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*2], m1 mova [dstq+dsq*4], m2 mova [dstq+r7 *2], m3 lea dstq, [dstq+dsq*8] sub hd, 4 jg .h_w16 RET .h_w32: movu xm0, [srcq+ssq*0+8*0] 
vinserti128 m0, [srcq+ssq*0+8*1], 1 movu xm1, [srcq+ssq*0+8*2] vinserti128 m1, [srcq+ssq*0+8*3], 1 movu xm2, [srcq+ssq*1+8*0] vinserti128 m2, [srcq+ssq*1+8*1], 1 movu xm3, [srcq+ssq*1+8*2] vinserti128 m3, [srcq+ssq*1+8*3], 1 lea srcq, [srcq+ssq*2] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*2+32*0], m2 mova [dstq+dsq*2+32*1], m3 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w32 RET .h_w64: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 movu xm2, [srcq+8*4] vinserti128 m2, [srcq+8*5], 1 movu xm3, [srcq+8*6] vinserti128 m3, [srcq+8*7], 1 add srcq, ssq REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 lea dstq, [dstq+dsq*2] dec hd jg .h_w64 RET .v: movzx wd, word [r7+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 255 add mxyd, 16 add wq, r7 lea r7, [dsq*3] movd xm5, mxyd lea r6, [ssq*3] vpbroadcastw m5, xm5 jmp wq .v_w4: movd xm0, [srcq+ssq*0] .v_w4_loop: movd xm2, [srcq+ssq*1] vpbroadcastd xm3, [srcq+ssq*2] pinsrd xm2, [srcq+r6 ], 1 ; 1 3 lea srcq, [srcq+ssq*4] vpblendd xm1, xm0, xm3, 0x02 ; 0 2 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm0, 0x02 ; 2 4 punpcklbw xm1, xm2 punpcklbw xm2, xm3 pmaddubsw xm1, xm5 pmaddubsw xm2, xm5 movq [dstq+dsq*0], xm1 movq [dstq+dsq*2], xm2 movhps [dstq+dsq*4], xm1 movhps [dstq+r7 *2], xm2 lea dstq, [dstq+dsq*8] sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+ssq*0] .v_w8_loop: movq xm2, [srcq+ssq*1] vpbroadcastq m3, [srcq+ssq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+ssq*4] vpblendd m1, m0, m3, 0x30 ; 0 2 vpbroadcastq m0, [srcq+ssq*0] vpblendd m2, m4, 0x30 ; 1 3 vpblendd m3, m0, 0x30 ; 2 4 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m1, m5 pmaddubsw m2, m5 mova [dstq+dsq*0], xm1 mova [dstq+dsq*2], xm2 vextracti128 [dstq+dsq*4], m1, 1 vextracti128 [dstq+r7 *2], m2, 1 lea 
dstq, [dstq+dsq*8] sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti128 m0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 m3, [srcq+ssq*1] vbroadcasti128 m4, [srcq+ssq*2] vbroadcasti128 m1, [srcq+r6 ] lea srcq, [srcq+ssq*4] shufpd m2, m0, m4, 0x0c ; 0 2 vbroadcasti128 m0, [srcq+ssq*0] shufpd m3, m1, 0x0c ; 1 3 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m5 pmaddubsw m2, m5 shufpd m4, m0, 0x0c ; 2 4 mova [dstq+dsq*0], m1 mova [dstq+dsq*4], m2 punpcklbw m1, m3, m4 punpckhbw m3, m4 pmaddubsw m1, m5 pmaddubsw m3, m5 mova [dstq+dsq*2], m1 mova [dstq+r7 *2], m3 lea dstq, [dstq+dsq*8] sub hd, 4 jg .v_w16_loop RET .v_w32: vpermq m0, [srcq+ssq*0], q3120 .v_w32_loop: vpermq m4, [srcq+ssq*1], q3120 lea srcq, [srcq+ssq*2] punpcklbw m1, m0, m4 punpckhbw m2, m0, m4 vpermq m0, [srcq+ssq*0], q3120 punpcklbw m3, m4, m0 punpckhbw m4, m0 REPX {pmaddubsw x, m5}, m1, m2, m3, m4 mova [dstq+dsq*0+32*0], m1 mova [dstq+dsq*0+32*1], m2 mova [dstq+dsq*2+32*0], m3 mova [dstq+dsq*2+32*1], m4 lea dstq, [dstq+dsq*4] sub hd, 2 jg .v_w32_loop RET .v_w64: vpermq m0, [srcq+ssq*0+32*0], q3120 vpermq m1, [srcq+ssq*0+32*1], q3120 .v_w64_loop: vpermq m2, [srcq+ssq*1+32*0], q3120 vpermq m3, [srcq+ssq*1+32*1], q3120 lea srcq, [srcq+ssq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 pmaddubsw m4, m5 pmaddubsw m0, m5 mova [dstq+dsq*0+32*0], m4 mova [dstq+dsq*0+32*1], m0 punpcklbw m4, m1, m3 punpckhbw m1, m3 pmaddubsw m4, m5 pmaddubsw m1, m5 mova [dstq+dsq*0+32*2], m4 mova [dstq+dsq*0+32*3], m1 vpermq m0, [srcq+ssq*0+32*0], q3120 vpermq m1, [srcq+ssq*0+32*1], q3120 punpcklbw m4, m2, m0 punpckhbw m2, m0 pmaddubsw m4, m5 pmaddubsw m2, m5 mova [dstq+dsq*2+32*0], m4 mova [dstq+dsq*2+32*1], m2 punpcklbw m4, m3, m1 punpckhbw m3, m1 pmaddubsw m4, m5 pmaddubsw m3, m5 mova [dstq+dsq*2+32*2], m4 mova [dstq+dsq*2+32*3], m3 lea dstq, [dstq+dsq*4] sub hd, 2 jg .v_w64_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) WIN64_SPILL_XMM 7 
movzx wd, word [r7+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd lea r6, [ssq*3] vpbroadcastw m6, xm6 add wq, r7 lea r7, [dsq*3] jmp wq .hv_w4: vbroadcasti128 m4, [bilin_h_shuf4] vpbroadcastq m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq xm1, [srcq+ssq*1] movhps xm1, [srcq+ssq*2] movq xm2, [srcq+r6 ] lea srcq, [srcq+ssq*4] movhps xm2, [srcq+ssq*0] vinserti128 m1, xm2, 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 3 4 vpblendd m2, m1, m0, 0xc0 vpermq m2, m2, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 vextracti128 xm2, m1, 1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*2], xm1 movq [dstq+dsq*4], xm2 movhps [dstq+r7 *2], xm2 lea dstq, [dstq+dsq*8] sub hd, 4 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+ssq*1] vinserti128 m1, [srcq+ssq*2], 1 movu xm2, [srcq+r6 ] lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*0], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m3, m0, m1, 0x21 ; 0 1 pmaddubsw m0, m2, m5 ; 3 4 vperm2i128 m2, m1, m0, 0x21 ; 2 3 psubw m1, m3 pmulhrsw m1, m6 paddw m1, m3 psubw m3, m0, m2 pmulhrsw m3, m6 paddw m3, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*2], m1, 1 mova [dstq+dsq*4], xm3 vextracti128 [dstq+r7 *2], m3, 1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .hv_w8_loop RET .hv_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm1, [srcq+ssq*1+8*0] vinserti128 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] movu xm2, [srcq+ssq*0+8*0] vinserti128 m2, [srcq+ssq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [dstq+dsq*0], m3 mova [dstq+dsq*2], m2 lea dstq, [dstq+dsq*4] sub hd, 2 jg .hv_w16_loop RET .hv_w32: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 pshufb m0, m4 
pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, ssq movu xm2, [srcq+8*0] vinserti128 m2, [srcq+8*1], 1 pshufb m2, m4 pmaddubsw m2, m5 psubw m3, m2, m0 pmulhrsw m3, m6 paddw m3, m0 mova m0, m2 movu xm2, [srcq+8*2] vinserti128 m2, [srcq+8*3], 1 pshufb m2, m4 pmaddubsw m2, m5 mova [dstq+32*0], m3 psubw m3, m2, m1 pmulhrsw m3, m6 paddw m3, m1 mova m1, m2 mova [dstq+32*1], m3 lea dstq, [dstq+dsq*2] dec hd jg .hv_w32_loop RET .hv_w64: lea r6d, [hq+(3<<8)] .hv_w64_loop0: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 mov r4, srcq mov r7, dstq pshufb m0, m4 pmaddubsw m0, m5 .hv_w64_loop: movu xm1, [r4+ssq*1+8*0] vinserti128 m1, [r4+ssq*1+8*1], 1 lea r4, [r4+ssq*2] movu xm2, [r4+ssq*0+8*0] vinserti128 m2, [r4+ssq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [r7+dsq*0], m3 mova [r7+dsq*2], m2 lea r7, [r7+dsq*4] sub hd, 2 jg .hv_w64_loop add srcq, 16 add dstq, 32 movzx hd, r6b sub r6d, 1<<8 jg .hv_w64_loop0 RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 3-4 ; fn, name, type, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %if %0 == 4 ; skip the jump in the last filter jmp mangle(private_prefix %+ _%4_avx2) %endif %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN smooth, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 imul myd, mym, 0x010101 mov wd, wm add mxd, t0d ; 6tap_h, mx, 4tap_h add myd, t0d ; 6tap_v, my, 4tap_v movifnidn hd, hm lea r8, [put_avx2] test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v .put: tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 lea r6, [ssq*3] lea r7, [dsq*3] %if WIN64 pop r8 %endif jmp wq 
; put_6tap_8bpc, horizontal-only paths for w <= 16.
; .h_w2 is entered with the flags of `cmp wd, 4` still live (see .h below,
; and the equivalent dispatch in put_8tap_sharp_8bpc): movzx, lea and
; vpbroadcastd do not modify flags, so `je .h_w4` still tests w == 4.
.h_w2:
    movzx mxd, mxb ; byte 0 of mxd = 4-tap filter index
    lea srcq, [srcq-1] ; lea (not dec) keeps the flags intact
    vpbroadcastd xm4, [r8+mxq*8+subpel_filters-put_avx2+2] ; bytes 2..5 of the 8-byte filter row
    je .h_w4
    mova xm3, [subpel_h_shuf4]
.h_w2_loop: ; two rows of 2 pixels per iteration
    movq xm0, [srcq+ssq*0]
    movhps xm0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm0, xm3
    pmaddubsw xm0, xm4
    phaddw xm0, xm0
    paddw xm0, xm5 ; + rounding constant loaded in .h
    psraw xm0, 6
    packuswb xm0, xm0
    pextrw [dstq+dsq*0], xm0, 0
    pextrw [dstq+dsq*1], xm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    mova xm3, [subpel_h_shufA]
.h_w4_loop: ; two rows of 4 pixels per iteration
    movq xm0, [srcq+ssq*0]
    movq xm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb xm0, xm3
    pshufb xm1, xm3
    pmaddubsw xm0, xm4
    pmaddubsw xm1, xm4
    phaddw xm0, xm1
    paddw xm0, xm5
    psraw xm0, 6
    packuswb xm0, xm0
    movd [dstq+dsq*0], xm0
    pextrd [dstq+dsq*1], xm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
.h:
    test myd, 0xf00 ; vertical filtering requested as well -> 2-D path
    jnz .hv
    vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
    cmp wd, 4
    jle .h_w2 ; w <= 4 uses the 4-tap code above
    WIN64_SPILL_XMM 11
    tzcnt wd, wd
    vbroadcasti128 m4, [z_filter_s+ 2] ; 01
    shr mxd, 16 ; byte 2 of mxd = 6-tap filter index
    vbroadcasti128 m6, [z_filter_s+ 6] ; 23
    sub srcq, 2
    vbroadcasti128 m7, [z_filter_s+10] ; 45
    ; Coefficients are read as three byte pairs starting at byte 1 of the
    ; 8-byte filter row.
    lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
    movzx wd, word [r8+wq*2+table_offset(put, _6tap_h)]
    vpbroadcastw m8, [mxq+0]
    vpbroadcastw m9, [mxq+2]
    add wq, r8
    vpbroadcastw m10, [mxq+4]
    jmp wq ; -> .h_w8 / .h_w16 / .h_w32 / .h_w64 via the jump table
.h_w8:
; PUT_6TAP_H: filter register m%1 in place using the shuffles in m4/m6/m7
; and the coefficient pairs in m8/m9/m10, add the rounding in m5 and shift
; right by 6. m%2 and m%3 are clobbered.
%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
    pshufb m%2, m%1, m4
    pmaddubsw m%2, m8
    pshufb m%3, m%1, m6
    pmaddubsw m%3, m9
    pshufb m%1, m7
    pmaddubsw m%1, m10
    paddw m%2, m5
    paddw m%1, m%3
    paddw m%1, m%2
    psraw m%1, 6
%endmacro
    ; two rows of 8 pixels per iteration, one row per 128-bit lane
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    PUT_6TAP_H 0, 1, 2
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16: ; two rows of 16 pixels per iteration
    movu xm0, [srcq+ssq*0+8*0]
    vinserti128 m0, [srcq+ssq*1+8*0], 1
    movu xm1, [srcq+ssq*0+8*1]
    vinserti128 m1, [srcq+ssq*1+8*1], 1
    PUT_6TAP_H 0, 2, 3
    lea srcq, [srcq+ssq*2]
    PUT_6TAP_H 1, 2, 3
    packuswb m0, m1
    movu [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
; put_6tap_8bpc: wide horizontal paths, then the vertical-only setup and
; the w2 / w4 vertical loops.
.h_w32:
    xor r6d, r6d ; column offset starts (and ends) at 0: one 32-byte step per row
    jmp .h_start
.h_w64:
    mov r6, -32*1 ; negative column offset counting up to 0: two steps per row
    sub srcq, r6 ; bias src/dst so that [ptr+r6] starts at the row origin
    sub dstq, r6
.h_start:
    mov r4, r6 ; r4 keeps the initial offset for the next row
.h_loop:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_6TAP_H 0, 2, 3
    PUT_6TAP_H 1, 2, 3
    packuswb m0, m1
    movu [dstq+r6], m0
    add r6, 32
    jle .h_loop ; continue while the offset is still <= 0
    add srcq, ssq
    add dstq, dsq
    mov r6, r4
    dec hd
    jg .h_loop
    RET
.v:
    WIN64_SPILL_XMM 9, 12
    ; Pick the filter index: byte 0 of myd when h < 6, otherwise bits 16+
    ; (the two indices annotated "4tap_v" / "6tap_v" at the entry point).
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    vpbroadcastd m8, [pw_512]
    ; Three coefficient pairs, read from byte 1 of the 8-byte filter row.
    lea myq, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastw m5, [myq+0]
    vpbroadcastw m6, [myq+2]
    vpbroadcastw m7, [myq+4]
    add r6, r8
    mov nsq, ssq
    neg nsq ; ns = -ss, used to address the two rows above src
    jmp r6
.v_w2:
    movd xm2, [srcq+nsq*2]
    pinsrw xm2, [srcq+nsq*1], 2
    pinsrw xm2, [srcq+ssq*0], 4
    pinsrw xm2, [srcq+ssq*1], 6 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xm0, [srcq+ssq*0]
    palignr xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklbw xm1, xm2, xm3 ; 01 12
    punpckhbw xm2, xm3 ; 23 34
.v_w2_loop: ; two output rows (a, b) per iteration
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xm3, xm1, xm5 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm6 ; a1 b1
    paddw xm3, xm2
    vpblendd xm2, xm0, xm4, 0x02 ; 4 5
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 5 6
    punpcklbw xm2, xm4 ; 45 56
    pmaddubsw xm4, xm2, xm7 ; a2 b2
    paddw xm3, xm4
    pmulhrsw xm3, xm8
    packuswb xm3, xm3
    pextrw [dstq+dsq*0], xm3, 0
    pextrw [dstq+dsq*1], xm3, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd xm2, [srcq+nsq*2]
    pinsrd xm2, [srcq+nsq*1], 1
    pinsrd xm2, [srcq+ssq*0], 2
    pinsrd xm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xm0, [srcq+ssq*0]
    palignr xm3, xm0, xm2, 4 ; 1 2 3 4
    punpcklbw xm1, xm2, xm3 ; 01 12
    punpckhbw xm2, xm3 ; 23 34
.v_w4_loop: ; two output rows (a, b) per iteration
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xm3, xm1, xm5 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm6 ; a1 b1
    paddw xm3, xm2
    vpblendd xm2, xm0, xm4, 0x02 ; 4 5
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 5 6
    punpcklbw xm2, xm4 ; 45 56
    pmaddubsw xm4, xm2, xm7 ; a2 b2
    paddw xm3, xm4
    pmulhrsw xm3, xm8
; put_6tap_8bpc: end of .v_w4_loop, then the w8 and w16+ vertical loops.
    packuswb xm3, xm3
    movd [dstq+dsq*0], xm3
    pextrd [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    ; Prime the pipeline with rows -2..2 (ns = -ss), interleaved pairwise.
    movq xm1, [srcq+nsq*2]
    vpbroadcastq m3, [srcq+nsq*1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m3, 0x30
    vpblendd m3, m2, 0x30
    punpcklbw m1, m3 ; 01 12
    vpblendd m2, m4, 0x30
    vpblendd m4, m0, 0x30
    punpcklbw m2, m4 ; 23 34
.v_w8_loop: ; two output rows (a, b) per iteration, one per 128-bit lane
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m3, m1, m5 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m6 ; a1 b1
    paddw m3, m2
    vpblendd m2, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m4, m0, 0x30
    punpcklbw m2, m4 ; 45 56
    pmaddubsw m4, m2, m7 ; a2 b2
    paddw m3, m4
    pmulhrsw m3, m8
    vextracti128 xm4, m3, 1
    packuswb xm3, xm4
    movq [dstq+dsq*0], xm3
    movhps [dstq+dsq*1], xm3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
.v_w32:
.v_w64:
    ; r6d = h + (w*16 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 16-pixel columns after the first (assumes h < 256).
    lea r6d, [wq*8-128]
    WIN64_PUSH_XMM 12
    lea r6d, [hq+r6*2]
.v_w16_loop0: ; one 16-pixel-wide column per outer iteration
    vbroadcasti128 m3, [srcq+nsq*2]
    vbroadcasti128 m4, [srcq+nsq*1]
    lea r4, [srcq+ssq*2] ; r4/r7 = per-column working copies of src/dst
    vbroadcasti128 m0, [srcq+ssq*0]
    vbroadcasti128 m1, [srcq+ssq*1]
    mov r7, dstq
    vbroadcasti128 m2, [r4+ssq*0]
    shufpd m3, m0, 0x0c
    shufpd m4, m1, 0x0c
    punpcklbw m1, m3, m4 ; 01
    punpckhbw m3, m4 ; 23
    shufpd m0, m2, 0x0c
    punpcklbw m2, m4, m0 ; 12
    punpckhbw m4, m0 ; 34
.v_w16_loop: ; two output rows (a in m10, b in m11) per iteration
    vbroadcasti128 m9, [r4+ssq*1]
    pmaddubsw m10, m1, m5 ; a0
    lea r4, [r4+ssq*2]
    pmaddubsw m11, m2, m5 ; b0
    mova m1, m3
    pmaddubsw m3, m6 ; a1
    mova m2, m4
    pmaddubsw m4, m6 ; b1
    paddw m10, m3
    vbroadcasti128 m3, [r4+ssq*0]
    paddw m11, m4
    shufpd m4, m0, m9, 0x0d
    shufpd m0, m9, m3, 0x0c
    punpcklbw m3, m4, m0 ; 45
    punpckhbw m4, m0 ; 56
    pmaddubsw m9, m3, m7 ; a2
    paddw m10, m9
    pmaddubsw m9, m4, m7 ; b2
    paddw m11, m9
    pmulhrsw m10, m8
    pmulhrsw m11, m8
    packuswb m10, m11
    vpermq m10, m10, q3120
    movu [r7+dsq*0], xm10
    vextracti128 [r7+dsq*1], m10, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    add srcq, 16 ; next 16-pixel column
    add dstq, 16
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8 ; one fewer column remaining
    jg .v_w16_loop0
    RET
.hv:
; put_6tap_8bpc .hv body (the label itself ends the previous line): 2-D
; filtering. w <= 4 is handled here with a 4-tap horizontal filter and a
; 6-tap vertical filter on 16-bit intermediates; w > 4 goes to .hv_w8.
    WIN64_SPILL_XMM 12, 16
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb ; byte 0 of mxd = 4-tap horizontal filter index
    dec srcq
    vpbroadcastd m6, [r8+mxq*8+subpel_filters-put_avx2+2] ; bytes 2..5 of the filter row
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
    vpbroadcastd m7, [pw_8192]
    punpcklbw m0, m0
    vpbroadcastd m8, [pd_512]
    psraw m0, 8 ; sign-extend
    mov nsq, ssq
    pshufd m9, m0, q0000 ; vertical coefficient pair 0
    neg nsq ; ns = -ss
    pshufd m10, m0, q1111 ; vertical coefficient pair 1
    pshufd m11, m0, q2222 ; vertical coefficient pair 2
    cmp wd, 4
    je .hv_w4
    ; w == 2: horizontally filter rows -2..2 up front.
    vbroadcasti128 m5, [subpel_h_shuf4]
    movq xm2, [srcq+nsq*2]
    movhps xm2, [srcq+nsq*1]
    movq xm0, [srcq+ssq*0]
    movhps xm0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m2, m1, 0x30
    pshufb m2, m5
    pshufb xm0, xm5
    pmaddubsw m2, m6
    pmaddubsw xm0, xm6
    phaddw m2, m0
    pmulhrsw m2, m7
    vextracti128 xm0, m2, 1
    palignr xm0, xm2, 4
    punpcklwd xm1, xm2, xm0 ; 01 12
    punpckhwd xm2, xm0 ; 23 34
.hv_w2_loop: ; two output rows (a, b) per iteration
    movq xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xm4, [srcq+ssq*0]
    pshufb xm4, xm5
    pmaddubsw xm4, xm6
    pmaddwd xm3, xm9, xm1 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm10 ; a1 b1
    phaddw xm4, xm4
    paddd xm3, xm2
    pmulhrsw xm4, xm7
    palignr xm2, xm4, xm0, 12
    mova xm0, xm4
    punpcklwd xm2, xm4 ; 45 56
    pmaddwd xm4, xm11, xm2 ; a2 b2
    paddd xm3, xm8 ; + rounding (pd_512) ahead of the >> 10
    paddd xm3, xm4
    psrad xm3, 10
    packssdw xm3, xm3
    packuswb xm3, xm3
    pextrw [dstq+dsq*0], xm3, 0
    pextrw [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova m5, [subpel_h_shuf4]
    vpbroadcastq m2, [srcq+nsq*2]
    vpbroadcastq m4, [srcq+nsq*1]
    vpbroadcastq m1, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m2, m4, 0xcc ; 0 1
    vpblendd m1, m3, 0xcc ; 2 3
    pshufb m2, m5
    pshufb m1, m5
    pshufb m0, m5
    pmaddubsw m2, m6
    pmaddubsw m1, m6
    pmaddubsw m0, m6
    phaddw m2, m1
    phaddw m0, m0
    pmulhrsw m2, m7
    pmulhrsw m0, m7
    palignr m3, m0, m2, 4
    punpcklwd m1, m2, m3 ; 01 12
    punpckhwd m2, m3 ; 23 34
.hv_w4_loop: ; two output rows (a, b) per iteration
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m3, m9, m1 ; a0 b0
    mova m1, m2
    pmaddwd m2, m10 ; a1 b1
    paddd m3, m2
; put_6tap_8bpc: rest of .hv_w4_loop, then the w >= 8 2-D path (setup, the
; HV_H_6TAP_W8 helper macro and the start of the row loop).
    vpbroadcastq m2, [srcq+ssq*0]
    vpblendd m4, m2, 0xcc ; 5 6
    pshufb m4, m5
    pmaddubsw m4, m6
    phaddw m4, m4
    pmulhrsw m4, m7
    palignr m2, m4, m0, 12
    mova m0, m4
    punpcklwd m2, m4 ; 45 56
    pmaddwd m4, m11, m2 ; a2 b2
    paddd m3, m8
    paddd m3, m4
    psrad m3, 10
    vextracti128 xm4, m3, 1
    packssdw xm3, xm4
    packuswb xm3, xm3
    pshuflw xm3, xm3, q3120
    movd [dstq+dsq*0], xm3
    pextrd [dstq+dsq*1], xm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr mxd, 16 ; bits 16+ of mxd = 6-tap horizontal filter index
    sub srcq, 2
    lea mxq, [r8+mxq*8+subpel_filters+1-put_avx2] ; coefficients start at byte 1 of the row
    WIN64_PUSH_XMM 16
    vpbroadcastw m10, [mxq+0] ; horizontal coefficient pairs 0..2
    vpbroadcastw m11, [mxq+2]
    vpbroadcastw m12, [mxq+4]
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-put_avx2]
    lea r6d, [wq*8-64]
    vbroadcasti128 m8, [z_filter_s+ 6]
    punpcklbw m0, m0
    vbroadcasti128 m9, [z_filter_s+10]
    psraw m0, 8 ; sign-extend
    mov nsq, ssq
    pshufd m13, m0, q0000 ; vertical coefficient pairs 0..2
    neg nsq
    pshufd m14, m0, q1111
    ; r6d = h + (w*32 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 8-pixel columns after the first (assumes h < 256).
    lea r6d, [hq+r6*4]
    pshufd m15, m0, q2222
.hv_w8_loop0: ; one 8-pixel-wide column per outer iteration
    vbroadcasti128 m7, [z_filter_s+2]
    movu xm3, [srcq+nsq*2]
    lea r4, [srcq+ssq*2] ; r4/r7 = per-column working copies of src/dst
    movu xm4, [srcq+nsq*1]
    vbroadcasti128 m0, [srcq+ssq*0]
    mov r7, dstq
    vinserti128 m4, [srcq+ssq*1], 1 ; 1 3
    vpblendd m3, m0, 0xf0 ; 0 2
    vinserti128 m0, [r4+ssq*0], 1 ; 2 4
    vpbroadcastd m5, [pw_8192]
; HV_H_6TAP_W8: horizontal 6-tap pass on %1 (result left in %1, unrounded)
; using the three shuffle masks %4..%6 and the coefficient pairs in
; m10/m11/m12. %2 and %3 are scratch; %2 may alias %4 because the first
; shuffle is consumed before %2 is written again.
%macro HV_H_6TAP_W8 6 ; src/dst, tmp[1-2], shuf[1-3]
    pshufb %2, %1, %4
    pmaddubsw %2, m10
    pshufb %3, %1, %5
    pmaddubsw %3, m11
    pshufb %1, %6
    pmaddubsw %1, m12
    paddw %2, %3
    paddw %1, %2
%endmacro
    HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
    vpermq m3, m3, q3120
    vpermq m4, m4, q3120
    vpermq m0, m0, q3120
    pmulhrsw m3, m5
    pmulhrsw m4, m5
    pmulhrsw m0, m5
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w8_loop: ; two output rows (a in m5, b in m6) per iteration
    movu xm7, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti128 m7, [r4+ssq*0], 1 ; 5 6
    pmaddwd m5, m13, m1 ; a0
    mova m1, m3
    pmaddwd m6, m13, m2 ; b0
    mova m2, m4
    pmaddwd m3, m14 ; a1
    pmaddwd m4, m14 ; b1
    paddd m5, m3
    vbroadcasti128 m3, [z_filter_s+2] ; m3 is free now: reload the first shuffle mask
; End of put_6tap_8bpc's .hv_w8_loop, followed by the put_8tap_sharp_8bpc
; entry point, its vertical setup and the start of its w2 vertical loop.
    paddd m6, m4
    HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
    vpbroadcastd m3, [pw_8192]
    vpbroadcastd m4, [pd_512]
    pmulhrsw m7, m3
    paddd m5, m4 ; + rounding ahead of the >> 10
    paddd m6, m4
    mova m4, m0
    vpermq m0, m7, q3120
    shufpd m4, m0, 0x05
    punpcklwd m3, m4, m0 ; 45
    pmaddwd m7, m15, m3 ; a2
    punpckhwd m4, m0 ; 56
    paddd m5, m7
    pmaddwd m7, m15, m4 ; b2
    paddd m6, m7
    psrad m5, 10
    psrad m6, 10
    packssdw m5, m6
    vextracti128 xm6, m5, 1
    packuswb xm5, xm6
    pshufd xm5, xm5, q3120
    movq [r7+dsq*0], xm5
    movhps [r7+dsq*1], xm5
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 8 ; next 8-pixel column
    add dstq, 8
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8
    jg .hv_w8_loop0
    RET

; put_8tap_sharp_8bpc: same structure as put_6tap_8bpc, but with the
; FILTER_SHARP offsets added directly and full 8-coefficient filter rows.
; The unfiltered case and the w <= 4 horizontal case reuse code inside
; put_6tap_8bpc.
cglobal put_8tap_sharp_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101 ; replicate mx/my into bytes 0..2
    imul myd, mym, 0x010101
    mov wd, wm
    movifnidn hd, hm
    add mxd, FILTER_SHARP ; 8tap_h, mx, 4tap_h
    add myd, FILTER_SHARP ; 8tap_v, my, 4tap_v
    lea r8, [put_avx2]
    test mxd, 0xf00 ; raw mx (byte 1) non-zero -> horizontal filtering
    jnz .h
    test myd, 0xf00 ; raw my zero as well -> plain copy path of put_6tap_8bpc
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
.v:
    WIN64_SPILL_XMM 12, 15
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _8tap_sharp_v)]
    vpbroadcastd m7, [pw_512]
    lea myq, [r8+myq*8+subpel_filters-put_avx2]
    vpbroadcastw m8, [myq+0] ; four coefficient pairs from the 8-byte row
    vpbroadcastw m9, [myq+2]
    vpbroadcastw m10, [myq+4]
    vpbroadcastw m11, [myq+6]
    add r6, r8
    lea ss3q, [ssq*3]
    sub srcq, ss3q ; start three rows above src
    jmp r6
.v_w2:
    movd xm2, [srcq+ssq*0]
    pinsrw xm2, [srcq+ssq*1], 2
    pinsrw xm2, [srcq+ssq*2], 4
    add srcq, ss3q
    pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
    movd xm3, [srcq+ssq*1]
    vpbroadcastd xm1, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm3, xm1, 0x02 ; 4 5
    vpblendd xm1, xm0, 0x02 ; 5 6
    palignr xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklbw xm3, xm1 ; 45 56
    punpcklbw xm1, xm2, xm4 ; 01 12
    punpckhbw xm2, xm4 ; 23 34
.v_w2_loop: ; two output rows (a, b) per iteration
    pmaddubsw xm5, xm1, xm8 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm9 ; a1 b1
    paddw xm5, xm2
    mova xm2, xm3
    pmaddubsw xm3, xm10 ; a2 b2
    paddw xm5, xm3
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0,
[srcq+ssq*0] ; memory operand of the vpbroadcastd that ends the previous line
; put_8tap_sharp_8bpc: rest of .v_w2_loop, then the w4 loop and most of
; the w8 vertical loop. m8..m11 hold the four coefficient pairs and m7 the
; pmulhrsw rounding constant set up in .v.
    vpblendd xm4, xm0, 0x02 ; 7 8
    punpcklbw xm3, xm4 ; 67 78
    pmaddubsw xm4, xm3, xm11 ; a3 b3
    paddw xm5, xm4
    pmulhrsw xm5, xm7
    packuswb xm5, xm5
    pextrw [dstq+dsq*0], xm5, 0
    pextrw [dstq+dsq*1], xm5, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    ; Prime the pipeline with rows 0..6 (src was moved up by 3 rows in .v).
    movd xm2, [srcq+ssq*0]
    pinsrd xm2, [srcq+ssq*1], 1
    pinsrd xm2, [srcq+ssq*2], 2
    add srcq, ss3q
    pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd xm3, [srcq+ssq*1]
    vpbroadcastd xm1, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm3, xm1, 0x02 ; 4 5
    vpblendd xm1, xm0, 0x02 ; 5 6
    palignr xm4, xm3, xm2, 4 ; 1 2 3 4
    punpcklbw xm3, xm1 ; 45 56
    punpcklbw xm1, xm2, xm4 ; 01 12
    punpckhbw xm2, xm4 ; 23 34
.v_w4_loop: ; two output rows (a, b) per iteration
    pmaddubsw xm5, xm1, xm8 ; a0 b0
    mova xm1, xm2
    pmaddubsw xm2, xm9 ; a1 b1
    paddw xm5, xm2
    mova xm2, xm3
    pmaddubsw xm3, xm10 ; a2 b2
    paddw xm5, xm3
    vpbroadcastd xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd xm3, xm0, xm4, 0x02 ; 6 7
    vpbroadcastd xm0, [srcq+ssq*0]
    vpblendd xm4, xm0, 0x02 ; 7 8
    punpcklbw xm3, xm4 ; 67 78
    pmaddubsw xm4, xm3, xm11 ; a3 b3
    paddw xm5, xm4
    pmulhrsw xm5, xm7
    packuswb xm5, xm5
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq xm1, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    vpbroadcastq m2, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m5, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpbroadcastq m6, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m1, m4, 0x30
    vpblendd m4, m2, 0x30
    punpcklbw m1, m4 ; 01 12
    vpblendd m2, m5, 0x30
    vpblendd m5, m3, 0x30
    punpcklbw m2, m5 ; 23 34
    vpblendd m3, m6, 0x30
    vpblendd m6, m0, 0x30
    punpcklbw m3, m6 ; 45 56
.v_w8_loop: ; two output rows (a, b) per iteration, one per 128-bit lane
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, m8 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m9 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, m10 ; a2 b2
    paddw m5, m3
    vpblendd m3, m0, m4, 0x30
    vpbroadcastq m0, [srcq+ssq*0]
    vpblendd m4, m0, 0x30
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, m11 ; a3 b3
    paddw m5, m4
    pmulhrsw
m5, m7 ; operands of the pmulhrsw that ends the previous line
; put_8tap_sharp_8bpc: end of .v_w8_loop, the w16+ vertical loop, and the
; horizontal-only dispatch.
    vextracti128 xm4, m5, 1
    packuswb xm5, xm4
    movq [dstq+dsq*0], xm5
    movhps [dstq+dsq*1], xm5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
.v_w32:
.v_w64:
    ; r6d = h + (w*16 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 16-pixel columns after the first (assumes h < 256).
    lea r6d, [wq*8-128]
    WIN64_PUSH_XMM 15
    lea r6d, [hq+r6*2]
.v_w16_loop0: ; one 16-pixel-wide column per outer iteration
    vbroadcasti128 m4, [srcq+ssq*0]
    vbroadcasti128 m5, [srcq+ssq*1]
    lea r4, [srcq+ss3q] ; r4/r7 = per-column working copies of src/dst
    vbroadcasti128 m6, [srcq+ssq*2]
    vbroadcasti128 m0, [r4+ssq*0]
    mov r7, dstq
    vbroadcasti128 m1, [r4+ssq*1]
    vbroadcasti128 m2, [r4+ssq*2]
    add r4, ss3q
    vbroadcasti128 m3, [r4+ssq*0]
    shufpd m4, m0, 0x0c
    shufpd m5, m1, 0x0c
    punpcklbw m1, m4, m5 ; 01
    punpckhbw m4, m5 ; 34
    shufpd m6, m2, 0x0c
    punpcklbw m2, m5, m6 ; 12
    punpckhbw m5, m6 ; 45
    shufpd m0, m3, 0x0c
    punpcklbw m3, m6, m0 ; 23
    punpckhbw m6, m0 ; 56
.v_w16_loop: ; two output rows (a in m13, b in m14) per iteration
    vbroadcasti128 m12, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    pmaddubsw m13, m1, m8 ; a0
    pmaddubsw m14, m2, m8 ; b0
    mova m1, m3
    mova m2, m4
    pmaddubsw m3, m9 ; a1
    pmaddubsw m4, m9 ; b1
    paddw m13, m3
    paddw m14, m4
    mova m3, m5
    mova m4, m6
    pmaddubsw m5, m10 ; a2
    pmaddubsw m6, m10 ; b2
    paddw m13, m5
    vbroadcasti128 m5, [r4+ssq*0]
    paddw m14, m6
    shufpd m6, m0, m12, 0x0d
    shufpd m0, m12, m5, 0x0c
    punpcklbw m5, m6, m0 ; 67
    punpckhbw m6, m0 ; 78
    pmaddubsw m12, m5, m11 ; a3
    paddw m13, m12
    pmaddubsw m12, m6, m11 ; b3
    paddw m14, m12
    pmulhrsw m13, m7
    pmulhrsw m14, m7
    packuswb m13, m14
    vpermq m13, m13, q3120
    movu [r7+dsq*0], xm13
    vextracti128 [r7+dsq*1], m13, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    add srcq, 16 ; next 16-pixel column
    add dstq, 16
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8
    jg .v_w16_loop0
    RET
; .h_w2/.h_w4 are aliases of .h here: widths <= 4 are forwarded to the
; 4-tap code inside put_6tap_8bpc below.
.h:
.h_w2:
.h_w4:
    test myd, 0xf00 ; vertical filtering requested as well -> 2-D path
    jnz .hv
    vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
    cmp wd, 4
    ; The target consumes the flags of this compare (it branches on
    ; equality to pick the w4 variant) and uses m5 as loaded above.
    jle mangle(private_prefix %+ _put_6tap_8bpc_avx2).h_w2
    WIN64_SPILL_XMM 11
    tzcnt wd, wd
    vbroadcasti128 m6, [subpel_h_shufA]
    shr mxd, 16 ; bits 16+ of mxd = 8-tap filter index
    vbroadcasti128 m7, [subpel_h_shufB]
    sub srcq, 3
    vbroadcasti128 m8, [subpel_h_shufC]
    movzx wd, word [r8+wq*2+table_offset(put, _8tap_sharp_h)]
    vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] ; coefficients 0..3
    vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] ; coefficients 4..7
    add wq, r8
    jmp wq ; -> .h_w8 / .h_w16 / .h_w32 / .h_w64 via the jump table
; put_8tap_sharp_8bpc: horizontal-only loops for w >= 8, then the setup of
; the 2-D (.hv) path for w <= 4.
.h_w8:
; PUT_8TAP_H: filter register m%1 in place using the shuffles in m6/m7/m8
; and the coefficient dwords in m9 (taps 0..3) / m10 (taps 4..7), add the
; rounding in m5 and shift right by 6. m%2..m%4 are clobbered.
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
    pshufb m%2, m%1, m7
    pshufb m%3, m%1, m8
    pshufb m%1, m6
    pmaddubsw m%4, m%2, m9
    pmaddubsw m%2, m10
    pmaddubsw m%3, m10
    pmaddubsw m%1, m9
    paddw m%3, m%4
    paddw m%1, m%2
    phaddw m%1, m%3
    paddw m%1, m5
    psraw m%1, 6
%endmacro
    ; two rows of 8 pixels per iteration, one row per 128-bit lane
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H 0, 1, 2, 3
    vextracti128 xm1, m0, 1
    packuswb xm0, xm1
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16: ; two rows of 16 pixels per iteration
    movu xm0, [srcq+ssq*0+8*0]
    vinserti128 m0, [srcq+ssq*1+8*0], 1
    movu xm1, [srcq+ssq*0+8*1]
    vinserti128 m1, [srcq+ssq*1+8*1], 1
    PUT_8TAP_H 0, 2, 3, 4
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H 1, 2, 3, 4
    packuswb m0, m1
    movu [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
.h_w32:
    xor r6d, r6d ; column offset 0: one 32-byte step per row
    jmp .h_start
.h_w64:
    mov r6, -32*1 ; negative column offset counting up to 0: two steps per row
    sub srcq, r6 ; bias src/dst so that [ptr+r6] starts at the row origin
    sub dstq, r6
.h_start:
    mov r4, r6 ; r4 keeps the initial offset for the next row
.h_loop:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_8TAP_H 0, 2, 3, 4
    PUT_8TAP_H 1, 2, 3, 4
    packuswb m0, m1
    movu [dstq+r6], m0
    add r6, 32
    jle .h_loop ; continue while the offset is still <= 0
    add srcq, ssq
    add dstq, dsq
    mov r6, r4
    dec hd
    jg .h_loop
    RET
.hv:
    WIN64_SPILL_XMM 14, 16
    cmp wd, 4
    jg .hv_w8
    ; w <= 4: 4-tap horizontal filter (bytes 2..5 of the row selected by
    ; byte 0 of mxd) combined with an 8-tap vertical filter.
    movzx mxd, mxb
    dec srcq
    vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
    lea ss3q, [ssq*3]
    sub srcq, ss3q ; start three rows above src
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    vpbroadcastd m8, [pw_8192]
    vpbroadcastd m9, [pd_512]
    pshufd m10, m0, q0000 ; vertical coefficient pairs 0..3
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
    cmp wd, 4
    je .hv_w4
    ; w == 2: horizontally filter rows 0..6 up front.
    vbroadcasti128 m6, [subpel_h_shuf4]
    movq xm2, [srcq+ssq*0]
    movhps xm2, [srcq+ssq*1]
    movq xm0, [srcq+ssq*2]
    add srcq, ss3q
    movhps xm0, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpbroadcastq m4, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m2, m3, 0x30
    vpblendd m0, m1, 0x30
    vpblendd m2, m4, 0xc0
    pshufb m2, m6
    pshufb m0, m6
    pmaddubsw m2, m7
    pmaddubsw m0, m7
    phaddw m2,
m0 ; last operand of the phaddw that ends the previous line
; put_8tap_sharp_8bpc, 2-D path for w == 2 and w == 4. m7 holds the 4-tap
; horizontal coefficients, m10..m13 the sign-extended vertical coefficient
; pairs, m8 = pw_8192 and m9 = pd_512 (all set up in .hv).
    pmulhrsw m2, m8
    vextracti128 xm3, m2, 1
    palignr xm4, xm3, xm2, 4
    punpcklwd xm1, xm2, xm4 ; 01 12
    punpckhwd xm2, xm4 ; 23 34
    pshufd xm0, xm3, q2121
    punpcklwd xm3, xm0 ; 45 56
.hv_w2_loop: ; two output rows (a, b) per iteration
    movq xm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xm4, [srcq+ssq*0]
    pshufb xm4, xm6
    pmaddubsw xm4, xm7
    pmaddwd xm5, xm1, xm10 ; a0 b0
    mova xm1, xm2
    pmaddwd xm2, xm11 ; a1 b1
    paddd xm5, xm2
    mova xm2, xm3
    pmaddwd xm3, xm12 ; a2 b2
    phaddw xm4, xm4
    pmulhrsw xm4, xm8
    paddd xm5, xm3
    palignr xm3, xm4, xm0, 12
    mova xm0, xm4
    punpcklwd xm3, xm0 ; 67 78
    pmaddwd xm4, xm3, xm13 ; a3 b3
    paddd xm5, xm9 ; + rounding ahead of the >> 10
    paddd xm5, xm4
    psrad xm5, 10
    packssdw xm5, xm5
    packuswb xm5, xm5
    pextrw [dstq+dsq*0], xm5, 0
    pextrw [dstq+dsq*1], xm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    ; Horizontally filter rows 0..6 up front.
    mova m6, [subpel_h_shuf4]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    vpbroadcastq m0, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m5, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vpblendd m2, m4, 0xcc ; 0 1
    vpbroadcastq m4, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m1, [srcq+ssq*0]
    vpblendd m0, m5, 0xcc ; 2 3
    vpblendd m3, m4, 0xcc ; 4 5
    pshufb m2, m6
    pshufb m0, m6
    pshufb m3, m6
    pshufb m1, m6
    pmaddubsw m2, m7
    pmaddubsw m0, m7
    pmaddubsw m3, m7
    pmaddubsw m1, m7
    phaddw m2, m0
    phaddw m3, m1
    pmulhrsw m2, m8
    pmulhrsw m3, m8
    palignr m4, m3, m2, 4
    punpcklwd m1, m2, m4 ; 01 12
    punpckhwd m2, m4 ; 23 34
    pshufd m0, m3, q2121
    punpcklwd m3, m0 ; 45 56
.hv_w4_loop: ; two output rows (a, b) per iteration
    vpbroadcastq m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddwd m5, m1, m10 ; a0 b0
    mova m1, m2
    pmaddwd m2, m11 ; a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, m12 ; a2 b2
    paddd m5, m3
    vpbroadcastq m3, [srcq+ssq*0]
    vpblendd m4, m3, 0xcc ; 7 8
    pshufb m4, m6
    pmaddubsw m4, m7
    phaddw m4, m4
    pmulhrsw m4, m8
    palignr m3, m4, m0, 12
    mova m0, m4
    punpcklwd m3, m0 ; 67 78
    pmaddwd m4, m3, m13 ; a3 b3
    paddd m5, m9
    paddd m5, m4
    psrad m5, 10
    vextracti128 xm4, m5, 1
    packssdw xm5, xm4
    packuswb xm5, xm5
    pshuflw xm5, xm5, q3120
    movd [dstq+dsq*0], xm5
    pextrd [dstq+dsq*1], xm5, 1
    lea
dstq, [dstq+dsq*2] ; operands of the lea that ends the previous line
; put_8tap_sharp_8bpc: end of .hv_w4_loop, then the w >= 8 2-D path (setup,
; the HV_H_8TAP_W8 helper macro and the first half of the row loop).
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    WIN64_PUSH_XMM 16
    shr mxd, 16 ; bits 16+ of mxd = 8-tap horizontal filter index
    sub srcq, 3
    vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] ; horizontal coefficients 0..3
    vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] ; horizontal coefficients 4..7
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
    lea ss3q, [ssq*3]
    sub srcq, ss3q ; start three rows above src
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m12, m0, q0000 ; vertical coefficient pairs 0..3
    pshufd m13, m0, q1111
    pshufd m14, m0, q2222
    pshufd m15, m0, q3333
    ; r6d = h + (w*32 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 8-pixel columns after the first (assumes h < 256).
    lea r6d, [wq*8-64]
    lea r6d, [hq+r6*4]
.hv_w8_loop0: ; one 8-pixel-wide column per outer iteration
    vbroadcasti128 m7, [subpel_h_shufA]
    movu xm4, [srcq+ssq*0]
    lea r4, [srcq+ss3q] ; r4/r7 = per-column working copies of src/dst
    vbroadcasti128 m8, [subpel_h_shufB]
    movu xm5, [srcq+ssq*1]
    mov r7, dstq
    vbroadcasti128 m9, [subpel_h_shufC]
    movu xm6, [srcq+ssq*2]
    vbroadcasti128 m0, [r4+ssq*0]
    vpblendd m4, m0, 0xf0 ; 0 3
    vinserti128 m5, [r4+ssq*1], 1 ; 1 4
    vinserti128 m6, [r4+ssq*2], 1 ; 2 5
    add r4, ss3q
    vinserti128 m0, [r4+ssq*0], 1 ; 3 6
; HV_H_8TAP_W8: horizontal 8-tap pass on %1 (result left in %1, unrounded)
; using the three shuffle masks %5..%7 and the coefficient dwords in
; m10/m11. %2..%4 are scratch; they may alias the shuffle registers when
; those are not needed afterwards (as in the row loop).
%macro HV_H_8TAP_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
    pshufb %3, %1, %6
    pshufb %4, %1, %7
    pshufb %1, %5
    pmaddubsw %2, %3, m10
    pmaddubsw %4, m11
    pmaddubsw %3, m11
    pmaddubsw %1, m10
    paddw %2, %4
    paddw %1, %3
    phaddw %1, %2
%endmacro
    HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9
    HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9
    vpbroadcastd m7, [pw_8192]
    vpermq m4, m4, q3120
    vpermq m5, m5, q3120
    vpermq m6, m6, q3120
    pmulhrsw m0, m7
    pmulhrsw m4, m7
    pmulhrsw m5, m7
    pmulhrsw m6, m7
    vpermq m7, m0, q3120
    punpcklwd m1, m4, m5 ; 01
    punpckhwd m4, m5 ; 34
    punpcklwd m2, m5, m6 ; 12
    punpckhwd m5, m6 ; 45
    punpcklwd m3, m6, m7 ; 23
    punpckhwd m6, m7 ; 56
.hv_w8_loop: ; two output rows (a in m8, b in m9) per iteration
    vextracti128 r6m, m0, 1 ; not enough registers
    movu xm0, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti128 m0, [r4+ssq*0], 1 ; 7 8
    pmaddwd m8, m1, m12 ; a0
    pmaddwd m9, m2, m12 ; b0
    mova m1, m3
    mova m2, m4
    pmaddwd m3, m13 ; a1
    pmaddwd m4, m13 ; b1
    paddd m8, m3
    paddd m9, m4
    mova m3, m5
    mova m4, m6
    pmaddwd m5, m14 ; a2
    pmaddwd m6, m14 ; b2
    paddd m8, m5
    paddd m9, m6
; End of put_8tap_sharp_8bpc's .hv_w8_loop, followed by the prep_8tap entry
; stubs, the prep_6tap_8bpc entry point, its vertical setup and the start
; of the w4 vertical loop.
    ; m5..m7 are free at this point: reload the shuffle masks into them and
    ; let the macro use the same registers as scratch.
    vbroadcasti128 m6, [subpel_h_shufB]
    vbroadcasti128 m7, [subpel_h_shufC]
    vbroadcasti128 m5, [subpel_h_shufA]
    HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7
    vpbroadcastd m5, [pw_8192]
    vpbroadcastd m7, [pd_512]
    vbroadcasti128 m6, r6m ; reload the 128 bits spilled at the top of the loop
    pmulhrsw m0, m5
    paddd m8, m7 ; + rounding ahead of the >> 10
    paddd m9, m7
    vpermq m7, m0, q3120 ; 7 8
    shufpd m6, m7, 0x04 ; 6 7
    punpcklwd m5, m6, m7 ; 67
    punpckhwd m6, m7 ; 78
    pmaddwd m7, m5, m15 ; a3
    paddd m8, m7
    pmaddwd m7, m6, m15 ; b3
    paddd m7, m9
    psrad m8, 10
    psrad m7, 10
    packssdw m8, m7
    vextracti128 xm7, m8, 1
    packuswb xm8, xm7
    pshufd xm7, xm8, q3120
    movq [r7+dsq*0], xm7
    movhps [r7+dsq*1], xm7
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 8 ; next 8-pixel column
    add dstq, 8
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8
    jg .hv_w8_loop0
    RET

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN regular, REGULAR ; no jump: falls through into prep_6tap_8bpc

; prep_6tap_8bpc: shared implementation behind the smooth/regular prep
; entry points; expects the FILTER_* constant in t0d. Unlike the put
; routines, results are stored as 16-bit words (rows are addressed with
; ds*2 byte steps).
cglobal prep_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
    imul mxd, mxm, 0x010101 ; replicate mx/my into bytes 0..2
    imul myd, mym, 0x010101
    mov wd, wm
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    add myd, t0d ; 6tap_v, my, 4tap_v
    movifnidn hd, hm
    lea r8, [prep_avx2] ; base for table_offset() and subpel_filters addressing
    test mxd, 0xf00 ; raw mx (byte 1) non-zero -> horizontal filtering
    jnz .h
    test myd, 0xf00 ; raw my (byte 1) non-zero -> vertical filtering
    jnz .v
.prep:
    ; Neither mx nor my set: dispatch through the prep jump table, indexed
    ; by tzcnt(w). r6/r7 are set to 3*ss / 3*ds before the jump.
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(prep,)]
    lea r6, [ssq*3]
    add wq, r8
    lea r7, [dsq*3]
%if WIN64
    pop r8 ; NOTE(review): presumably undoes the r8 save made by cglobal before tail-jumping - confirm
%endif
    jmp wq
.v:
    WIN64_SPILL_XMM 10, 12
    ; Vertical filter index: byte 0 of myd when h == 4, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    lea myq, [r8+myq*8+subpel_filters+1-prep_avx2] ; coefficients start at byte 1 of the row
    vpbroadcastd m9, [pw_8192]
    lea r6, [dsq*3]
    vpbroadcastw m6, [myq+0] ; coefficient pairs 0..2
    mov nsq, ssq
    vpbroadcastw m7, [myq+2]
    neg nsq ; ns = -ss
    vpbroadcastw m8, [myq+4]
    cmp wd, 8
    jg .v_w16
    je .v_w8
.v_w4:
    movd xm2, [srcq+nsq*2]
    pinsrd xm2, [srcq+nsq*1], 1
    vpbroadcastd m1, [srcq+ssq*0]
    vpbroadcastd m3, [srcq+ssq*1]
    vpbroadcastd m0, [srcq+ssq*2]
    vbroadcasti128 m5, [deint_shuf4]
    vpblendd m1, m2, 0xeb
    punpcklqdq m3, m0
    vpblendd m1, m3, 0x60 ; 0 1 2 _ 2 3 4 _
    pshufb m1, m5 ; 01 12 23 34
.v_w4_loop: ; four output rows (a..d) per iteration
    lea srcq, [srcq+ssq*4]
    pinsrd xm0, [srcq+nsq*1], 1
    vpbroadcastd m2, [srcq+ssq*0]
    vpbroadcastd m3, [srcq+ssq*1]
    vpblendd m2, m0, 0xeb
    vpbroadcastd m0, [srcq+ssq*2]
; prep_6tap_8bpc: rest of .v_w4_loop, the w8 loop and the w16+ vertical
; loop. m6..m8 hold the three coefficient pairs, m9 = pw_8192 and (for
; w4/w8) r6 = 3*ds, all set up in .v. Output is 16-bit.
    punpcklqdq m3, m0
    vpblendd m2, m3, 0x60 ; 4 5 6 _ 6 7 8 _
    pshufb m2, m5 ; 45 56 67 78
    pmaddubsw m3, m1, m6 ; a0 b0 c0 d0
    vperm2i128 m1, m2, 0x21 ; 23 34 45 56
    pmaddubsw m4, m2, m8 ; a2 b2 c2 d2
    pmaddubsw m1, m7 ; a1 b1 c1 d1
    paddw m3, m4
    paddw m3, m1
    pmulhrsw m3, m9
    mova m1, m2
    vextracti128 xm2, m3, 1
    movq [dstq+dsq*0], xm3
    movhps [dstq+dsq*2], xm3
    movq [dstq+dsq*4], xm2
    movhps [dstq+r6 *2], xm2
    lea dstq, [dstq+dsq*8]
    sub hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq xm1, [srcq+nsq*2]
    vpbroadcastq m3, [srcq+nsq*1]
    vpbroadcastq m2, [srcq+ssq*0]
    vpbroadcastq m4, [srcq+ssq*1]
    vpbroadcastq m0, [srcq+ssq*2]
    vpblendd m1, m3, 0x30
    vpblendd m3, m2, 0x30
    punpcklbw m1, m3 ; 01 12
    vpblendd m2, m4, 0x30
    vpblendd m4, m0, 0x30
    punpcklbw m2, m4 ; 23 34
.v_w8_loop: ; four output rows per iteration (two per accumulator m4/m5)
    lea srcq, [srcq+ssq*4]
    pmaddubsw m1, m6 ; a0
    vpbroadcastq m3, [srcq+nsq*1]
    pmaddubsw m4, m2, m7 ; a1
    pmaddubsw m5, m2, m6 ; b0
    vpbroadcastq m2, [srcq+ssq*0]
    vpblendd m0, m3, 0x30
    vpblendd m3, m2, 0x30
    paddw m4, m1
    punpcklbw m1, m0, m3 ; 45 56
    vpbroadcastq m3, [srcq+ssq*1]
    vpbroadcastq m0, [srcq+ssq*2]
    vpblendd m2, m3, 0x30
    vpblendd m3, m0, 0x30
    punpcklbw m2, m3 ; 67 78
    pmaddubsw m3, m1, m7 ; b1
    paddw m5, m3
    pmaddubsw m3, m1, m8 ; a2
    paddw m4, m3
    pmaddubsw m3, m2, m8 ; b2
    paddw m5, m3
    pmulhrsw m4, m9
    pmulhrsw m5, m9
    mova [dstq+dsq*0], xm4
    vextracti128 [dstq+dsq*2], m4, 1
    mova [dstq+dsq*4], xm5
    vextracti128 [dstq+r6 *2], m5, 1
    lea dstq, [dstq+dsq*8]
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; r6d = h + (w*16 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 16-pixel columns after the first (assumes h < 256).
    lea r6d, [wq*2-32]
    lea srcq, [srcq+nsq*2] ; start two rows above src
    WIN64_PUSH_XMM 12
    lea r6d, [hq+r6*8]
.v_w16_loop0: ; one 16-pixel-wide column per outer iteration
    vbroadcasti128 m3, [srcq+ssq*0]
    lea r4, [srcq+ssq*2] ; r4/r7 = per-column working copies of src/dst
    vbroadcasti128 m4, [srcq+ssq*1]
    mov r7, dstq
    vbroadcasti128 m0, [r4+ssq*0]
    vbroadcasti128 m1, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vbroadcasti128 m2, [r4+ssq*0]
    shufpd m3, m0, 0x0c
    shufpd m4, m1, 0x0c
    punpcklbw m1, m3, m4 ; 01
    punpckhbw m3, m4 ; 23
    shufpd m0, m2, 0x0c
    punpcklbw m2, m4, m0 ; 12
    punpckhbw m4, m0 ; 34
.v_w16_loop: ; two output rows (a in m10, b in m11) per iteration
    vbroadcasti128 m5, [r4+ssq*1]
    pmaddubsw m10, m1, m6 ; a0
    lea r4, [r4+ssq*2]
    pmaddubsw m11, m2, m6 ; b0
    mova m1, m3
    pmaddubsw
m11, m2, m6 ; b0
; (the line above carries the operands of the pmaddubsw that ends the
; previous line)
; prep_6tap_8bpc: rest of .v_w16_loop, then the horizontal-only paths
; (.h_w4, the .h dispatcher, .h_w8 and the start of .h_w16).
    mova m1, m3
    pmaddubsw m3, m7 ; a1
    mova m2, m4
    pmaddubsw m4, m7 ; b1
    paddw m10, m3
    vbroadcasti128 m3, [r4+ssq*0]
    paddw m11, m4
    shufpd m4, m0, m5, 0x0d
    shufpd m0, m5, m3, 0x0c
    punpcklbw m3, m4, m0 ; 45
    punpckhbw m4, m0 ; 56
    pmaddubsw m5, m3, m8 ; a2
    paddw m10, m5
    pmaddubsw m5, m4, m8 ; b2
    paddw m11, m5
    pmulhrsw m10, m9
    pmulhrsw m11, m9
    mova [r7+dsq*0], m10
    mova [r7+dsq*2], m11
    lea r7, [r7+dsq*4]
    sub hd, 2
    jg .v_w16_loop
    add srcq, 16 ; next 16-pixel column (32 bytes of 16-bit output)
    add dstq, 32
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8
    jg .v_w16_loop0
    RET
; .h_w4 is reached from .h below with m4 = pw_8192 already loaded; the
; 8-tap sharp prep routine also jumps here for w == 4.
.h_w4:
    RESET_STACK_STATE
    movzx mxd, mxb ; byte 0 of mxd = 4-tap filter index
    vbroadcasti128 m3, [subpel_h_shufA]
    dec srcq
    vpbroadcastd m5, [r8+mxq*8+subpel_filters-prep_avx2+2] ; bytes 2..5 of the filter row
    lea r7, [ssq*3]
    lea r6, [dsq*3]
.h_w4_loop: ; four rows of 4 pixels per iteration
    movq xm0, [srcq+ssq*0]
    vpbroadcastq m2, [srcq+ssq*1]
    movq xm1, [srcq+ssq*2]
    vpblendd m0, m2, 0x30
    vpbroadcastq m2, [srcq+r7 ]
    lea srcq, [srcq+ssq*4]
    vpblendd m1, m2, 0x30
    pshufb m0, m3
    pshufb m1, m3
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    phaddw m0, m1
    pmulhrsw m0, m4
    vextracti128 xm1, m0, 1
    ; After phaddw the rows sit as xm0 = {row0, row2}, xm1 = {row1, row3}.
    movq [dstq+dsq*0], xm0
    movq [dstq+dsq*2], xm1
    movhps [dstq+dsq*4], xm0
    movhps [dstq+r6 *2], xm1
    lea dstq, [dstq+dsq*8]
    sub hd, 4
    jg .h_w4_loop
    RET
.h:
    test myd, 0xf00 ; vertical filtering requested as well -> 2-D path
    jnz .hv
    vpbroadcastd m4, [pw_8192]
    cmp wd, 4
    je .h_w4
    WIN64_SPILL_XMM 10
    tzcnt wd, wd
    vbroadcasti128 m3, [z_filter_s+ 2]
    shr mxd, 16 ; bits 16+ of mxd = 6-tap filter index
    vbroadcasti128 m5, [z_filter_s+ 6]
    sub srcq, 2
    vbroadcasti128 m6, [z_filter_s+10]
    lea mxq, [r8+mxq*8+subpel_filters+1-prep_avx2] ; coefficients start at byte 1 of the row
    movzx wd, word [r8+wq*2+table_offset(prep, _6tap_h)]
    vpbroadcastw m7, [mxq+0]
    vpbroadcastw m8, [mxq+2]
    add wq, r8
    vpbroadcastw m9, [mxq+4]
    jmp wq ; -> .h_w8 / .h_w16 / .h_w32 / .h_w64 via the jump table
.h_w8: ; two rows of 8 pixels per iteration, one row per 128-bit lane
    movu xm0, [srcq+ssq*0]
    vinserti128 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
; PREP_6TAP_H: filter m0 with the shuffles in m3/m5/m6 and the coefficient
; pairs in m7/m8/m9, scale with pmulhrsw by m4, and leave the 16-bit
; result in m1. m0 and m2 are clobbered.
%macro PREP_6TAP_H 0
    pshufb m1, m0, m3
    pmaddubsw m1, m7
    pshufb m2, m0, m5
    pmaddubsw m2, m8
    pshufb m0, m6
    pmaddubsw m0, m9
    paddw m1, m2
    paddw m1, m0
    pmulhrsw m1, m4
%endmacro
    PREP_6TAP_H
    mova [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*2], m1, 1
    lea dstq, [dstq+dsq*4]
    sub hd, 2
    jg .h_w8
    RET
.h_w16: ; two rows of 16 pixels per iteration
    movu xm0, [srcq+ssq*0+8*0]
    vinserti128 m0,
[srcq+ssq*0+8*1], 1 ; operands of the vinserti128 that ends the previous line
; prep_6tap_8bpc: rest of .h_w16, the wide horizontal loop, and the 2-D
; path for w == 4 (4-tap horizontal, 6-tap vertical, 16-bit output).
    PREP_6TAP_H
    movu xm0, [srcq+ssq*1+8*0]
    vinserti128 m0, [srcq+ssq*1+8*1], 1
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m1
    PREP_6TAP_H
    mova [dstq+dsq*2], m1
    lea dstq, [dstq+dsq*4]
    sub hd, 2
    jg .h_w16
    RET
.h_w32:
    xor r6d, r6d ; column offset 0: one 32-pixel step per row
    jmp .h_start
.h_w64:
    mov r6, -32*1 ; negative column offset counting up to 0: two steps per row
    add dstq, 32*2 ; bias dst by 64 bytes, matching the r6*2 indexing below
    sub srcq, r6 ; bias src by 32 bytes
.h_start:
    mov r4, r6 ; r4 keeps the initial offset for the next row
.h_loop: ; 32 pixels in, 64 bytes of 16-bit output per pass
    movu xm0, [srcq+r6+8*0]
    vinserti128 m0, [srcq+r6+8*1], 1
    PREP_6TAP_H
    movu xm0, [srcq+r6+8*2]
    vinserti128 m0, [srcq+r6+8*3], 1
    mova [dstq+r6*2+32*0], m1
    PREP_6TAP_H
    mova [dstq+r6*2+32*1], m1
    add r6, 32
    jle .h_loop ; continue while the offset is still <= 0
    add srcq, ssq
    mov r6, r4
    lea dstq, [dstq+dsq*2]
    dec hd
    jg .h_loop
    RET
.hv:
    WIN64_SPILL_XMM 14, 16
    cmp wd, 4
    jne .hv_w8
.hv_w4:
    movzx mxd, mxb ; byte 0 of mxd = 4-tap horizontal filter index
    dec srcq
    vpbroadcastd m7, [r8+mxq*8+subpel_filters-prep_avx2+2] ; bytes 2..5 of the filter row
    ; Vertical filter index: byte 0 of myd when h == 4, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mova m6, [subpel_h_shuf4]
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-prep_avx2]
    mov nsq, ssq
    pmovzxbd m13, [deint_shuf4] ; dword permutation indices for the vpermd before the stores
    neg nsq ; ns = -ss
    vpbroadcastd m8, [pw_8192]
    vpbroadcastd m9, [pd_32]
    punpcklbw m0, m0
    vpbroadcastq m2, [srcq+nsq*2]
    psraw m0, 8 ; sign-extend
    vpbroadcastq m4, [srcq+nsq*1]
    pshufd m10, m0, q0000 ; vertical coefficient pairs 0..2
    vpbroadcastq m1, [srcq+ssq*0]
    pshufd m11, m0, q1111
    vpbroadcastq m3, [srcq+ssq*1]
    pshufd m12, m0, q2222
    vpbroadcastq m0, [srcq+ssq*2]
    vpblendd m2, m4, 0xcc ; 0 1
    vpblendd m1, m3, 0xcc ; 2 3
    pshufb m2, m6
    pshufb m1, m6
    pshufb m0, m6
    pmaddubsw m2, m7
    pmaddubsw m1, m7
    pmaddubsw m0, m7
    phaddw m2, m1 ; 0 1 2 3
    phaddw m0, m0 ; 4
    pmulhrsw m2, m8
    pmulhrsw m0, m8
    lea r6, [dsq*3]
    palignr m0, m2, 4
    punpcklwd m1, m2, m0 ; 01 12
    punpckhwd m2, m0 ; 23 34
.hv_w4_loop: ; four output rows (a..d) per iteration
    pmaddwd m4, m10, m1 ; a0 b0
    lea srcq, [srcq+ssq*4]
    pmaddwd m5, m2, m10 ; c0 d0
    vpbroadcastq m1, [srcq+nsq*1]
    pmaddwd m2, m11 ; a1 b1
    vpbroadcastq m3, [srcq+ssq*0]
    paddd m4, m2
    vpbroadcastq m2, [srcq+ssq*1]
    vpblendd m1, m3, 0xcc ; 5 6
    vpbroadcastq m3, [srcq+ssq*2]
    vpblendd m2, m3, 0xcc ; 7 8
    pshufb m1, m6
    pshufb m2, m6
    pmaddubsw m1, m7
    pmaddubsw m2, m7
    phaddw m1, m2 ; 5 6 7 8
    pmulhrsw m1, m8
    paddd m5, m9 ; + rounding (pd_32) ahead of the >> 6
    paddd m4, m9
    palignr
m2, m1, m0, 12 ; operands of the palignr that ends the previous line
; prep_6tap_8bpc: rest of .hv_w4_loop, then the w >= 8 2-D path. The
; horizontal pass reuses the HV_H_6TAP_W8 macro defined in put_6tap_8bpc.
    mova m0, m1
    punpcklwd m1, m2, m0 ; 45 56
    punpckhwd m2, m0 ; 67 78
    pmaddwd m3, m11, m1 ; c1 d1
    paddd m5, m3
    pmaddwd m3, m12, m1 ; a2 b2
    paddd m4, m3
    pmaddwd m3, m12, m2 ; c2 d2
    paddd m5, m3
    psrad m4, 6
    psrad m5, 6
    packssdw m4, m5
    vpermd m4, m13, m4
    vextracti128 xm5, m4, 1
    movq [dstq+dsq*0], xm4
    movhps [dstq+dsq*2], xm4
    movq [dstq+dsq*4], xm5
    movhps [dstq+r6 *2], xm5
    lea dstq, [dstq+dsq*8]
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    shr mxd, 16 ; bits 16+ of mxd = 6-tap horizontal filter index
    lea mxq, [r8+mxq*8+subpel_filters+1-prep_avx2] ; coefficients start at byte 1 of the row
    WIN64_PUSH_XMM 16
    vpbroadcastw m10, [mxq+0] ; horizontal coefficient pairs 0..2
    vpbroadcastw m11, [mxq+2]
    vpbroadcastw m12, [mxq+4]
    ; Vertical filter index: byte 0 of myd when h < 6, else bits 16+.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m0, [r8+myq*8+subpel_filters+1-prep_avx2]
    lea r7, [ssq*2+2]
    vbroadcasti128 m8, [z_filter_s+ 6]
    punpcklbw m0, m0
    vbroadcasti128 m9, [z_filter_s+10]
    psraw m0, 8 ; sign-extend
    lea r6d, [wq*8-64]
    pshufd m13, m0, q0000 ; vertical coefficient pairs 0..2
    sub srcq, r7 ; start two rows up and two bytes to the left
    pshufd m14, m0, q1111
    ; r6d = h + (w*32 - 256): low byte holds h, bits 8+ hold the number of
    ; additional 8-pixel columns after the first (assumes h < 256).
    lea r6d, [hq+r6*4]
    pshufd m15, m0, q2222
.hv_w8_loop0: ; one 8-pixel-wide column per outer iteration
    vbroadcasti128 m7, [z_filter_s+2]
    movu xm3, [srcq+ssq*0]
    lea r4, [srcq+ssq*2] ; r4/r7 = per-column working copies of src/dst
    movu xm4, [srcq+ssq*1]
    vbroadcasti128 m0, [r4+ssq*0]
    mov r7, dstq
    vinserti128 m4, [r4+ssq*1], 1 ; 1 3
    lea r4, [r4+ssq*2]
    vpblendd m3, m0, 0xf0 ; 0 2
    vinserti128 m0, [r4+ssq*0], 1 ; 2 4
    vpbroadcastd m5, [pw_8192]
    HV_H_6TAP_W8 m3, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m4, m1, m2, m7, m8, m9
    HV_H_6TAP_W8 m0, m1, m2, m7, m8, m9
    vpermq m3, m3, q3120
    vpermq m4, m4, q3120
    vpermq m0, m0, q3120
    pmulhrsw m3, m5
    pmulhrsw m4, m5
    pmulhrsw m0, m5
    punpcklwd m1, m3, m4 ; 01
    punpckhwd m3, m4 ; 23
    punpcklwd m2, m4, m0 ; 12
    punpckhwd m4, m0 ; 34
.hv_w8_loop: ; two output rows (a in m5, b in m6) per iteration
    movu xm7, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti128 m7, [r4+ssq*0], 1 ; 5 6
    pmaddwd m5, m13, m1 ; a0
    mova m1, m3
    pmaddwd m6, m13, m2 ; b0
    mova m2, m4
    pmaddwd m3, m14 ; a1
    pmaddwd m4, m14 ; b1
    paddd m5, m3
    vbroadcasti128 m3, [z_filter_s+2] ; m3 is free now: reload the first shuffle mask
    paddd m6, m4
    HV_H_6TAP_W8 m7, m3, m4, m3, m8, m9
    vpbroadcastd m3, [pw_8192]
    vpbroadcastd m4, [pd_32]
    pmulhrsw m7, m3
    paddd m5, m4 ; + rounding ahead of the >> 6
    paddd m6, m4
    mova m4, m0
    vpermq m0, m7, q3120
; End of prep_6tap_8bpc's .hv_w8_loop, followed by the visible head of
; prep_8tap_sharp_8bpc (entry point, vertical setup, start of .v_w4). The
; rest of that routine lies beyond this line.
    shufpd m4, m0, 0x05
    punpcklwd m3, m4, m0 ; 45
    pmaddwd m7, m15, m3 ; a2
    punpckhwd m4, m0 ; 56
    paddd m5, m7
    pmaddwd m7, m15, m4 ; b2
    paddd m6, m7
    psrad m5, 6
    psrad m6, 6
    packssdw m5, m6
    vpermq m5, m5, q3120
    mova [r7+dsq*0], xm5
    vextracti128 [r7+dsq*2], m5, 1
    lea r7, [r7+dsq*4]
    sub hd, 2
    jg .hv_w8_loop
    add srcq, 8 ; next 8-pixel column (16 bytes of 16-bit output)
    add dstq, 16
    movzx hd, r6b ; reload h from the low byte of r6d
    sub r6d, 1<<8
    jg .hv_w8_loop0
    RET

; prep_8tap_sharp_8bpc: same structure as prep_6tap_8bpc, but with the
; FILTER_SHARP offsets added directly and full 8-coefficient filter rows.
; The unfiltered case reuses the .prep path of prep_6tap_8bpc.
cglobal prep_8tap_sharp_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul mxd, mxm, 0x010101 ; replicate mx/my into bytes 0..2
    imul myd, mym, 0x010101
    mov wd, wm
    movifnidn hd, hm
    add mxd, FILTER_SHARP ; 8tap_h, mx, 4tap_h
    add myd, FILTER_SHARP ; 8tap_v, my, 4tap_v
    lea r8, [prep_avx2]
    test mxd, 0xf00 ; raw mx (byte 1) non-zero -> horizontal filtering
    jnz .h
    test myd, 0xf00 ; raw my zero as well -> unfiltered path
    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
.v:
    WIN64_SPILL_XMM 12, 15
    movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
    shr myd, 16 ; Note that the code is 8-tap only, having
    cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
    cmove myd, mxd ; had a negligible effect on performance.
    lea myq, [r8+myq*8+subpel_filters-prep_avx2]
    lea r8, [ssq*3] ; r8 is repurposed as 3*ss from here on (table base no longer needed)
    sub srcq, r8 ; start three rows above src
    vpbroadcastd m7, [pw_8192]
    vpbroadcastw m8, [myq+0] ; four coefficient pairs from the 8-byte row
    vpbroadcastw m9, [myq+2]
    vpbroadcastw m10, [myq+4]
    vpbroadcastw m11, [myq+6]
    cmp wd, 8
    jg .v_w16
    lea r6, [dsq*3] ; lea leaves the flags of the compare above intact
    je .v_w8
.v_w4:
    movd xm0, [srcq+ssq*0]
    vpbroadcastd m1, [srcq+ssq*2]
    vpbroadcastd xm2, [srcq+ssq*1]
    add srcq, r8
    vpbroadcastd m3, [srcq+ssq*0]
    vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
    vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
    vpbroadcastd m0, [srcq+ssq*1]
    vpbroadcastd m2, [srcq+ssq*2]
    vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
    vpbroadcastd m0, [srcq+r8 ]
    vbroadcasti128 m5, [deint_shuf4]
    vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
    vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
    vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
    punpcklbw m1, m2, m3 ; 01 12 23 34
    vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
    punpckhbw m2, m3 ; 23 34 45 56
.v_w4_loop: ; four output rows per iteration
    lea srcq, [srcq+ssq*4]
    pinsrd xm0, [srcq+ssq*0], 1
    vpbroadcastd m3, [srcq+ssq*1]
    vpbroadcastd m4, [srcq+ssq*2]
    vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
    vpbroadcastd
m0, [srcq+r8 ] vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ pshufb m3, m5 ; 67 78 89 9a pmaddubsw m4, m1, m8 vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 pmaddubsw m2, m9 paddw m4, m2 mova m2, m3 pmaddubsw m3, m11 paddw m3, m4 pmaddubsw m4, m1, m10 paddw m3, m4 pmulhrsw m3, m7 vextracti128 xm4, m3, 1 movq [dstq+dsq*0], xm3 movhps [dstq+dsq*2], xm3 movq [dstq+dsq*4], xm4 movhps [dstq+r6 *2], xm4 lea dstq, [dstq+dsq*8] sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] vpbroadcastq m5, [srcq+r8 ] lea srcq, [srcq+ssq*4] vpbroadcastq m3, [srcq+ssq*0] vpbroadcastq m6, [srcq+ssq*1] vpbroadcastq m0, [srcq+ssq*2] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 vpblendd m2, m5, 0x30 vpblendd m5, m3, 0x30 punpcklbw m2, m5 ; 23 34 vpblendd m3, m6, 0x30 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: vpbroadcastq m4, [srcq+r8 ] lea srcq, [srcq+ssq*4] pmaddubsw m5, m2, m9 ; a1 pmaddubsw m6, m2, m8 ; b0 vpblendd m2, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*0] vpblendd m4, m0, 0x30 punpcklbw m2, m4 ; 67 78 pmaddubsw m1, m8 ; a0 pmaddubsw m4, m3, m9 ; b1 paddw m5, m1 mova m1, m3 pmaddubsw m3, m10 ; a2 paddw m6, m4 paddw m5, m3 vpbroadcastq m4, [srcq+ssq*1] vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*2] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 89 9a pmaddubsw m4, m2, m11 ; a3 paddw m5, m4 pmaddubsw m4, m2, m10 ; b2 paddw m6, m4 pmaddubsw m4, m3, m11 ; b3 paddw m6, m4 pmulhrsw m5, m7 pmulhrsw m6, m7 mova [dstq+dsq*0], xm5 vextracti128 [dstq+dsq*2], m5, 1 mova [dstq+dsq*4], xm6 vextracti128 [dstq+r6 *2], m6, 1 lea dstq, [dstq+dsq*8] sub hd, 4 jg .v_w8_loop RET .v_w16: lea r6d, [wq*2-32] WIN64_PUSH_XMM 15 lea r6d, [hq+r6*8] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] lea r4, [srcq+ssq*2] vbroadcasti128 m0, [r4+ssq*1] vbroadcasti128 m6, [r4+ssq*0] lea r4, [r4+ssq*2] vbroadcasti128 m1, [r4+ssq*0] vbroadcasti128 m2, 
[r4+ssq*1] lea r4, [r4+ssq*2] vbroadcasti128 m3, [r4+ssq*0] mov r7, dstq shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: vbroadcasti128 m12, [r4+ssq*1] lea r4, [r4+ssq*2] pmaddubsw m13, m1, m8 ; a0 pmaddubsw m14, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 paddw m13, m3 paddw m14, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 paddw m13, m5 vbroadcasti128 m5, [r4+ssq*0] paddw m14, m6 shufpd m6, m0, m12, 0x0d shufpd m0, m12, m5, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 paddw m13, m12 pmaddubsw m12, m6, m11 ; b3 paddw m14, m12 pmulhrsw m13, m7 pmulhrsw m14, m7 mova [r7+dsq*0], m13 mova [r7+dsq*2], m14 lea r7, [r7+dsq*4] sub hd, 2 jg .v_w16_loop add srcq, 16 add dstq, 32 movzx hd, r6b sub r6d, 1<<8 jg .v_w16_loop0 RET .h: .h_w4: test myd, 0xf00 jnz .hv vpbroadcastd m4, [pw_8192] cmp wd, 4 je mangle(private_prefix %+ _prep_6tap_8bpc_avx2).h_w4 WIN64_SPILL_XMM 10 vbroadcasti128 m5, [subpel_h_shufA] tzcnt wd, wd vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(prep, _8tap_sharp_h)] vpbroadcastd m8, [r8+mxq*8+subpel_filters-prep_avx2+0] vpbroadcastd m9, [r8+mxq*8+subpel_filters-prep_avx2+4] add wq, r8 jmp wq .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] %macro PREP_8TAP_H 0 pshufb m1, m0, m5 pshufb m2, m0, m6 pshufb m3, m0, m7 pmaddubsw m1, m8 pmaddubsw m0, m2, m8 pmaddubsw m2, m9 pmaddubsw m3, m9 paddw m1, m2 paddw m0, m3 phaddw m1, m0 pmulhrsw m1, m4 %endmacro PREP_8TAP_H mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*2], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 PREP_8TAP_H movu xm0, 
[srcq+ssq*1+8*0] vinserti128 m0, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m1 PREP_8TAP_H mova [dstq+dsq*2], m1 lea dstq, [dstq+dsq*4] sub hd, 2 jg .h_w16 RET .h_w32: xor r6d, r6d jmp .h_start .h_w64: mov r6, -32*1 add dstq, 32*2 sub srcq, r6 .h_start: mov r4, r6 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 PREP_8TAP_H movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 mova [dstq+r6*2+32*0], m1 PREP_8TAP_H mova [dstq+r6*2+32*1], m1 add r6, 32 jle .h_loop add srcq, ssq lea dstq, [dstq+dsq*2] mov r6, r4 dec hd jg .h_loop RET .hv: WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r8+mxq*8+subpel_filters-prep_avx2+0] vpbroadcastd m11, [r8+mxq*8+subpel_filters-prep_avx2+4] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-prep_avx2] lea r8, [ssq*3] sub srcq, r8 punpcklbw m0, m0 lea r6d, [wq*8-64] psraw m0, 8 ; sign-extend lea r6d, [hq+r6*4] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] lea r4, [srcq+ssq*2] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] mov r7, dstq vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [r4+ssq*0] vbroadcasti128 m0, [r4+ssq*1] lea r4, [r4+ssq*2] vpblendd m4, m0, 0xf0 ; 0 3 vinserti128 m5, [r4+ssq*0], 1 ; 1 4 vinserti128 m6, [r4+ssq*1], 1 ; 2 5 lea r4, [r4+ssq*2] vinserti128 m0, [r4+ssq*0], 1 ; 3 6 HV_H_8TAP_W8 m4, m1, m2, m3, m7, m8, m9 HV_H_8TAP_W8 m5, m1, m2, m3, m7, m8, m9 HV_H_8TAP_W8 m6, m1, m2, m3, m7, m8, m9 HV_H_8TAP_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 vpermq m6, m6, q3120 pmulhrsw m0, m7 pmulhrsw m4, m7 pmulhrsw m5, m7 pmulhrsw m6, m7 vpermq m7, m0, q3120 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 [r7], 
m0, 1 ; not enough registers movu xm0, [r4+ssq*1] lea r4, [r4+ssq*2] vinserti128 m0, [r4+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m13 ; a1 pmaddwd m4, m13 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m14 ; a2 pmaddwd m6, m14 ; b2 paddd m8, m5 paddd m9, m6 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] HV_H_8TAP_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] vbroadcasti128 m6, [r7] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 paddd m8, m7 pmaddwd m7, m6, m15 ; b3 paddd m7, m9 psrad m8, 6 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 mova [r7+dsq*0], xm7 vextracti128 [r7+dsq*2], m7, 1 lea r7, [r7+dsq*4] sub hd, 2 jg .hv_w8_loop add srcq, 8 add dstq, 16 movzx hd, r6b sub r6d, 1<<8 jg .hv_w8_loop0 RET .hv_w4: movzx mxd, mxb dec srcq vpbroadcastd m8, [r8+mxq*8+subpel_filters-prep_avx2+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-prep_avx2] lea r8, [ssq*3] sub srcq, r8 mova m7, [subpel_h_shuf4] pmovzxbd m9, [deint_shuf4] vpbroadcastd m10, [pw_8192] punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 vpbroadcastq m2, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m0, [srcq+ssq*2] vpbroadcastq m5, [srcq+r8 ] lea srcq, [srcq+ssq*4] vpbroadcastq m3, [srcq+ssq*0] vpbroadcastq m6, [srcq+ssq*1] vpbroadcastq m1, [srcq+ssq*2] vpblendd m2, m4, 0xcc ; 0 1 vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m6, 0xcc ; 4 5 pshufb m2, m7 ; 00 01 10 11 02 03 12 13 pshufb m0, m7 ; 20 21 30 31 22 23 32 33 pshufb m3, m7 ; 40 41 50 51 42 43 52 53 pshufb m1, m7 ; 60 61 60 61 62 63 62 63 pmaddubsw m2, m8 pmaddubsw m0, m8 pmaddubsw m3, m8 
pmaddubsw m1, m8 phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ pmulhrsw m2, m10 pmulhrsw m3, m10 lea r6, [dsq*3] palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: pmaddwd m5, m1, m12 ; a0 b0 pmaddwd m6, m2, m12 ; c0 d0 pmaddwd m2, m13 ; a1 b1 pmaddwd m4, m3, m13 ; c1 d1 mova m1, m3 pmaddwd m3, m14 ; a2 b2 paddd m5, m2 vpbroadcastq m2, [srcq+r8 ] lea srcq, [srcq+ssq*4] paddd m6, m4 vpbroadcastq m4, [srcq+ssq*0] paddd m5, m3 vpbroadcastq m3, [srcq+ssq*1] vpblendd m2, m4, 0xcc vpbroadcastq m4, [srcq+ssq*2] vpblendd m3, m4, 0xcc pshufb m2, m7 pshufb m3, m7 pmaddubsw m2, m8 pmaddubsw m3, m8 phaddw m2, m3 pmulhrsw m2, m10 palignr m3, m2, m0, 12 mova m0, m2 punpcklwd m2, m3, m0 ; 67 78 punpckhwd m3, m0 ; 89 9a pmaddwd m4, m2, m14 ; c2 d2 paddd m6, m11 paddd m5, m11 paddd m6, m4 pmaddwd m4, m2, m15 ; a3 b3 paddd m5, m4 pmaddwd m4, m3, m15 ; c3 d3 paddd m6, m4 psrad m5, 6 psrad m6, 6 packssdw m5, m6 vpermd m5, m9, m5 vextracti128 xm6, m5, 1 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*2], xm5 movq [dstq+dsq*4], xm6 movhps [dstq+r6 *2], xm6 lea dstq, [dstq+dsq*8] sub hd, 4 jg .hv_w4_loop RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movq xm%1, [srcq+ r4] movq xm%2, [srcq+ 
r6] movhps xm%1, [srcq+ r7] movhps xm%2, [srcq+ r9] vinserti128 m%1, [srcq+r10], 1 vinserti128 m%2, [srcq+r11], 1 vpbroadcastq m%5, [srcq+r13] vpbroadcastq m%6, [srcq+ rX] add srcq, ssq movq xm%3, [srcq+ r4] movq xm%4, [srcq+ r6] movhps xm%3, [srcq+ r7] movhps xm%4, [srcq+ r9] vinserti128 m%3, [srcq+r10], 1 vinserti128 m%4, [srcq+r11], 1 vpbroadcastq m%7, [srcq+r13] vpbroadcastq m%8, [srcq+ rX] add srcq, ssq vpblendd m%1, m%5, 0xc0 vpblendd m%2, m%6, 0xc0 vpblendd m%3, m%7, 0xc0 vpblendd m%4, m%8, 0xc0 pmaddubsw m%1, m15 pmaddubsw m%2, m10 pmaddubsw m%3, m15 pmaddubsw m%4, m10 phaddw m%1, m%2 phaddw m%3, m%4 phaddw m%1, m%3 pmulhrsw m%1, m12 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy %xdefine base_reg r12 %define rndshift 10 %else %assign isprep 1 cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+120] %xdefine base_reg r11 %define rndshift 6 %endif lea base_reg, [%1_8tap_scaled_8bpc_avx2] %define base base_reg-%1_8tap_scaled_8bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm14, mxd vpbroadcastd m14, xm14 mov r5d, t0d DECLARE_REG_TMP 5, 7 %else vpbroadcastd m14, mxm %endif mov dyd, dym %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+112] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+112] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif vpbroadcastd m10, [base+pd_0x3ff] vpbroadcastd m12, [base+pw_8192] %ifidn %1, put vpbroadcastd m13, [base+pd_512] %else vpbroadcastd m13, [base+pd_32] %endif pxor m9, m9 
lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0,1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] vpblendd m15, m7, 0xaa vpblendd m0, m2, 0xc0 ; 0 1 4 5 vpblendd m1, m3, 0xc0 ; 2 3 6 7 pblendvb m15, m11, m8 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 phaddw m0, m1 pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 ; 4 5 6 7 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q pmovsxbw xm11, xm11 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, xm11, q3333 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 pmaddwd xm8, xm4, xm11 paddd xm5, xm6 paddd xm7, xm8 paddd xm5, xm13 paddd xm5, xm7 psrad xm5, 10 packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq], xm5, 0 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movq xm5, [srcq] test myd, 0x400 jz 
.w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movhps xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd xm15, xm0 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m0, m9 psrld m14, 10 movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m15, xm15, 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pblendvb m15, m11, m0 pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 phaddw m7, m9 phaddw m8, m10 pmulhrsw m7, m12 ; 0 1 4 5 pmulhrsw m8, m12 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm11, xm3, 8 ; 7 _ punpcklwd 
xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm11 ; 67 mova [rsp+0x00], xm7 mova [rsp+0x10], xm8 mova [rsp+0x20], xm9 .w4_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm10, r6q pmovsxbw xm10, xm10 pshufd xm7, xm10, q0000 pshufd xm8, xm10, q1111 pshufd xm9, xm10, q2222 pshufd xm10, xm10, q3333 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pmaddwd xm6, xm2, xm9 pmaddwd xm7, xm3, xm10 paddd xm4, xm5 paddd xm6, xm7 paddd xm4, xm13 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 %ifidn %1, put packuswb xm4, xm4 movd [dstq], xm4 add dstq, dsq %else movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop movu xm4, [srcq] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x00] mova [rsp+0x00], xm1 mova xm1, [rsp+0x10] mova [rsp+0x10], xm2 mova xm2, [rsp+0x20] mova [rsp+0x20], xm3 pshufb xm4, xm14 pmaddubsw xm4, xm15 phaddw xm4, xm4 pmulhrsw xm4, xm12 punpcklwd xm3, xm11, xm4 mova xm11, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm5, [srcq+ssq*1] movu m6, [rsp+0x10] pshufb xm4, xm14 pshufb xm5, xm14 pmaddubsw xm4, xm15 pmaddubsw xm5, xm15 movu [rsp+0x00], m6 phaddw xm4, xm5 pmulhrsw xm4, xm12 punpcklwd xm9, xm11, xm4 mova [rsp+0x20], xm9 psrldq xm11, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm11 lea srcq, [srcq+ssq*2] jmp .w4_loop .w8: mov dword [rsp+48], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+48], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+48], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+48], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+48], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+72], t0d mov [rsp+56], srcq 
mov [rsp+64], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+48] jz .ret add qword [rsp+64], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+16] vpbroadcastd m15, [rsp+72] pxor m9, m9 mov srcq, [rsp+56] mov r0q, [rsp+64] ; dstq / tmpq .hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+16], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 mova [rsp], xm14 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b mov myd, mym mov dyd, dym pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 pshufd m8, m11, q0000 
pshufd m9, m11, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufd m8, m11, q2222 pshufd m11, m11, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+52], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 add srcq, ssq mov myd, [rsp+52] mov dyd, dym pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 vpbroadcastq m7, [srcq+r13] vpbroadcastq m8, [srcq+ rX] movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 add srcq, ssq movq xm5, [srcq+ r4] movq xm6, [srcq+ r6] movhps xm5, [srcq+ r7] movhps xm6, [srcq+ r9] vinserti128 m5, [srcq+r10], 1 vinserti128 m6, [srcq+r11], 1 vpbroadcastq m9, [srcq+r13] vpbroadcastq m11, [srcq+ rX] add srcq, ssq mov myd, [rsp+52] mov dyd, dym vpblendd m3, m7, 0xc0 vpblendd m4, m8, 0xc0 vpblendd m5, m9, 0xc0 vpblendd m6, m11, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 pmaddubsw m5, m15 pmaddubsw m6, m10 phaddw m3, m4 phaddw m5, m6 psrld m4, m3, 16 pslld m6, m5, 16 paddw m3, m4 paddw m5, m6 pblendw m3, m5, 0xaa pmulhrsw m3, m12 jmp .vloop .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp 
wq %ifidn %1, put .dy1_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] add srcq, ss3q movq xm10, r4q pmovsxbw xm10, xm10 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 pshufd xm8, xm10, q0000 pshufd xm9, xm10, q1111 pshufd xm11, xm10, q3333 pshufd xm10, xm10, q2222 vpblendd m0, m2, 0xc0 pshufb m1, m14 pshufb m0, m14 pmaddubsw m1, m15 pmaddubsw m0, m15 phaddw m0, m1 pmulhrsw m0, m12 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movq xm1, [srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 mova xm3, xm0 mova xm0, xm2 paddd xm5, xm13 paddd xm6, xm7 pshufb xm1, xm14 pmaddubsw xm1, xm15 phaddw xm1, xm1 pmulhrsw xm1, xm12 palignr xm7, xm1, xm4, 12 punpcklwd xm2, xm7, xm1 ; 67 78 pmaddwd xm7, xm2, xm11 mova xm4, xm1 paddd xm5, xm6 paddd xm5, xm7 psrad xm5, rndshift packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, 
[base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 vpermq m8, m8, q3120 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r11d, xm15, 1 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] movu xm2, [srcq+ssq*0] movu xm3, [srcq+ssq*2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pcmpeqd m8, m9 psrld m14, 10 pinsrd xm15, [base+subpel_filters+r11*8+2], 1 vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 vinserti128 m2, [srcq+ssq*1], 1 vinserti128 m3, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m4, [srcq+ssq*1], 1 add srcq, ss3q vpblendd m15, m7, 0x30 punpcklqdq m15, m15 pblendvb m15, m11, m8 movq xm10, r4q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb xm5, xm14 vpermq m2, m2, q3120 vpermq m3, m3, q3120 vpermq m4, m4, q3120 vpermq m5, m5, q3120 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m2, m3 phaddw m4, m5 pmulhrsw m2, m12 pmulhrsw m4, m12 palignr m5, m4, m2, 4 pshufd m3, m4, q2121 punpcklwd m0, m2, m5 ; 01 12 punpckhwd m1, m2, m5 ; 23 34 punpcklwd m2, m4, m3 ; 45 56 .dy1_w4_loop: movu xm11, [srcq+ssq*0] vinserti128 m11, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 mova m0, m1 mova m1, m2 paddd m4, m13 paddd m5, m6 pshufb m11, m14 vpermq m11, m11, q3120 pmaddubsw m11, m15 phaddw m11, m11 pmulhrsw m11, m12 palignr m6, m11, m3, 12 punpcklwd m2, m6, m11 ; 67 78 mova m3, m11 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 pshuflw xm4, xm4, q3120 movd 
[dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else pshufd xm4, xm4, q3120 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET .dy1_w8: mov dword [rsp+72], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+72], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+72], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+72], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+72], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+76], t0d mov [rsp+80], srcq mov [rsp+88], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+96], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+72] jz .ret add qword [rsp+88], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+32] vpbroadcastd m15, [rsp+76] pxor m9, m9 mov srcq, [rsp+80] mov r0q, [rsp+88] ; dstq / tmpq .dy1_hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+32], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movq [rsp+64], xm14 movd r4d, xm14 pextrd r6d, 
xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b movu [rsp], m10 vpbroadcastd m8, [rsp+0x60] vpbroadcastd m9, [rsp+0x64] vpbroadcastd m10, [rsp+0x68] vpbroadcastd m11, [rsp+0x6c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, [rsp] phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 
vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] vpbroadcastq m2, [srcq+ssq*1] movhps xm0, [srcq+ssq*2] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 movhps xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vpblendd m0, m2, 0x30 vpblendd m1, m4, 0xc0 vpblendd m0, m3, 0xc0 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 movq xm11, r4q pmovsxbw xm11, xm11 phaddw m0, m1 pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, xm11, q3333 pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 01 23 punpckhwd xm2, xm1 ; 23 45 .dy2_w2_loop: movq xm6, [srcq+ssq*0] vpbroadcastq m7, [srcq+ssq*1] movhps xm6, [srcq+ssq*2] vpbroadcastq m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd xm4, xm3, xm8 pmaddwd xm5, xm2, xm9 vpblendd m6, m7, 0x30 vpblendd m6, m1, 0xc0 pshufb m6, m14 pmaddubsw m6, m15 phaddw m6, m6 pmulhrsw m6, m12 palignr m0, m6, m0, 8 pshufd m2, m0, q3221 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 45 67 punpckhwd xm2, xm1 ; 67 89 pmaddwd xm6, xm3, xm10 pmaddwd xm7, xm2, xm11 paddd xm4, xm5 paddd xm4, xm13 paddd xm6, xm7 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 packuswb xm4, xm4 pextrw [dstq+dsq*0], xm4, 0 pextrw [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd 
r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m8, m9 psrld m14, 10 movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*2] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm1, [srcq+ssq*1] movu xm3, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vinserti128 m15, xm15, 1 pshufb m14, m5 paddb m14, m6 vinserti128 m2, [srcq+ssq*0], 1 vinserti128 m3, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pblendvb m15, m11, m8 pshufb xm0, xm14 pshufb m2, m14 pshufb xm1, xm14 pshufb m3, m14 pmaddubsw xm0, xm15 pmaddubsw m2, m15 pmaddubsw xm1, xm15 pmaddubsw m3, m15 movq xm11, r4q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 punpcklwd xm2, xm0, xm1 punpckhwd m1, m0, m1 ; 23 45 vinserti128 m0, m2, xm1, 1 ; 01 23 .dy2_w4_loop: movu xm6, [srcq+ssq*0] movu xm7, [srcq+ssq*1] vinserti128 m6, [srcq+ssq*2], 1 vinserti128 m7, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 psrld m2, m6, 16 pslld m3, m7, 16 paddw m6, m2 paddw m7, m3 pblendw m6, m7, 0xaa ; 67 89 pmulhrsw m6, m12 paddd m4, m5 vperm2i128 m0, m1, m6, 0x21 ; 45 67 mova m1, m6 pmaddwd m6, m0, m10 pmaddwd m7, m1, m11 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET .dy2_w8: mov dword [rsp+40], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+40], 2 
movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+40], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+40], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+40], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+64], t0d mov [rsp+48], srcq mov [rsp+56], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+40] jz .ret add qword [rsp+56], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp] vpbroadcastd m15, [rsp+64] pxor m9, m9 mov srcq, [rsp+48] mov r0q, [rsp+56] ; dstq / tmpq .dy2_hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 
0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m11, [rsp+0x58] vpbroadcastd m4, [rsp+0x5c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b SWAP m14, m4 .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m11 pmaddwd m7, m3, m14 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 vpbroadcastq m5, [srcq+r13] vpbroadcastq m6, [srcq+ rX] add srcq, ssq vpblendd m3, m5, 0xc0 vpblendd m4, m6, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 phaddw m3, m4 movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 psrld m5, m3, 16 pslld m6, m4, 16 paddw m3, m5 paddw m4, m6 pblendw m3, m4, 0xaa pmulhrsw m3, m12 jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_8bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, put_8tap_scaled_8bpc 
; Declarations of the remaining put_8tap_scaled entry points (smooth, regular),
; followed by the expansion of the MC_8TAP_SCALED body for the put variant.
; NOTE(review): FN is defined outside this chunk. The regular variant passes no
; jump target and sits directly in front of the body expansion, so it presumably
; falls through into that body - confirm against the FN macro.
PUT_8TAP_SCALED_FN smooth, SMOOTH, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular, REGULAR
MC_8TAP_SCALED put

; The prep variant is assembled with a different pair of temporary registers,
; selected per ABI.
%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Same sequence of entry points and body expansion, this time for prep.
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp, SHARP, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular, REGULAR
MC_8TAP_SCALED prep

; WARP_V: vertical pass of the warp filter for one output row.
;
; Arguments are register numbers: the destination, then the four registers
; holding the interleaved row pairs named in the comment on the macro line.
;
; Eight 8-byte filter entries are fetched from filterq, one per column, using
; the indices (my + k*gamma) >> 10 for k = 0..7 (labelled a..h below). They are
; transposed, widened to words with the coefficient in the high byte (m11 must
; be zero), multiplied against the row pairs and summed as dwords.
;
; Side effects:
;   - my is advanced by delta; .main has already subtracted 3*gamma from delta,
;     which cancels the three gamma steps my accumulates while the indices are
;     being formed.
;   - the register passed as the second argument is overwritten, the other
;     three inputs are only read.
;   - m0, m8, m9, tmp1 and tmp2 are clobbered.
%macro WARP_V 5 ; dst, 02, 46, 13, 57
    ; Can be done using gathers, but that's terribly slow on many CPU:s
    lea tmp1d, [myq+gammaq*4]
    lea tmp2d, [myq+gammaq*1]
    shr myd, 10
    shr tmp1d, 10
    movq xm8, [filterq+myq *8]
    vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
    lea tmp1d, [tmp2q+gammaq*4]
    lea myd, [tmp2q+gammaq*1]
    shr tmp2d, 10
    shr tmp1d, 10
    movq xm0, [filterq+tmp2q*8]
    vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
    lea tmp1d, [myq+gammaq*4]
    lea tmp2d, [myq+gammaq*1]
    shr myd, 10
    shr tmp1d, 10
    movq xm9, [filterq+myq *8]
    vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
    lea tmp1d, [tmp2q+gammaq*4]
    lea myd, [tmp2q+deltaq] ; my += delta
    shr tmp2d, 10
    shr tmp1d, 10
    punpcklwd m8, m0
    movq xm0, [filterq+tmp2q*8]
    vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd m0, m9, m0
    punpckldq m9, m8, m0
    punpckhdq m0, m8, m0
    punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    pmaddwd m%2, m8
    pmaddwd m9, m%3
    punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd m8, m%4
    pmaddwd m0, m%5
    paddd m%2, m9
    paddd m0, m8
    paddd m%1, m0, m%2
%endmacro

; warp_affine_8x8t: warp variant that writes 16-bit intermediates to tmp.
;
; All filtering is done by the .main/.main2 routines of warp_affine_8x8_8bpc,
; reached through their mangled names. Each pass leaves two rows of dword sums
; in m7 and m0; they are shifted right by 13, packed to signed words, rounded
; with pmulhrsw against m14 (pw_8192, loaded by .main) and stored as two
; 16-byte rows at tmp+ts*0 and tmp+ts*2. r4d is set to 4 by .main and counts
; the row pairs, giving 8 rows in total.
; NOTE(review): the ts*2 / ts*4 addressing suggests ts is counted in 16-bit
; elements rather than bytes - confirm against the C prototype.
cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts
%if WIN64
    ; same 0xa0 bytes that warp_affine_8x8_8bpc declares as stack_size_padded
    sub rsp, 0xa0
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main
.loop:
    psrad m7, 13
    psrad m0, 13
    packssdw m7, m0
    pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
    vpermq m7, m7, q3120
    mova [tmpq+tsq*0], xm7
    vextracti128 [tmpq+tsq*2], m7, 1
    dec r4d
    ; leave through the shared epilogue of warp_affine_8x8_8bpc
    jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2
    lea tmpq, [tmpq+tsq*4]
    jmp .loop

; warp_affine_8x8: warp variant that writes 8-bit pixels to dst.
;
; .main performs the one-time setup and produces the first two rows; every
; further trip through .loop calls .main2 for two more. The dword sums in m7
; and m0 are shifted right by 18, packed to unsigned words, rounded with pavgw
; against the zero register m11, packed to bytes and stored as two 8-byte rows.
; r4d (set to 4 in .main) counts the row pairs.
cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
                                        beta, filter, tmp1, gamma, my, delta
%if WIN64
    %assign xmm_regs_used 16
    %assign stack_size_padded 0xa0
    SUB rsp, stack_size_padded
%endif
    call .main
    jmp .start
.loop:
    call .main2
    lea dstq, [dstq+dsq*2]
.start:
    psrad m7, 18
    psrad m0, 18
    packusdw m7, m0
    pavgw m7, m11 ; (x + (1 << 10)) >> 11
    vextracti128 xm0, m7, 1
    packuswb xm7, xm0
    pshufd xm7, xm7, q3120
    movq [dstq+dsq*0], xm7
    movhps [dstq+dsq*1], xm7
    dec r4d
    jg .loop
.end:
    RET
ALIGN function_align
; .main: setup shared by both warp entry points.
; Loads the arguments and constants, reads alpha/beta/gamma/delta as signed
; words from abcd[0..3], rewinds src by 3 rows and 3 pixels, and runs the
; horizontal filter .h on the first seven source rows. Those rows are kept as
; interleaved word pairs in m1..m5 (the pair each register holds is given in
; the trailing comments). Execution then falls through into .main2.
.main:
    ; Stack is offset due to call
    %assign stack_offset stack_offset + gprsize
    %assign stack_size stack_size + gprsize
    %assign stack_size_padded stack_size_padded + gprsize
    movifnidn abcdq, abcdmp
    movifnidn mxd, mxm
    WIN64_PUSH_XMM
    movsx alphad, word [abcdq+2*0]
    movsx betad, word [abcdq+2*1]
    mova m12, [warp_8x8_shufA]
    mova m13, [warp_8x8_shufB]
    vpbroadcastd m14, [pw_8192]
    vpbroadcastd m15, [pd_32768]
    pxor m11, m11 ; zero register used by WARP_V and by the pavgw rounding
    lea filterq, [mc_warp_filter2]
    lea tmp1q, [ssq*3+3]
    add mxd, 512+(3*64<<10)
    lea tmp2d, [alphaq*3]
    sub srcq, tmp1q ; src -= src_stride*3 + 3
    sub betad, tmp2d ; beta -= alpha*3
    mov myd, r6m
    call .h
    psrld m1, m0, 16
    call .h
    psrld m4, m0, 16
    call .h
    pblendw m1, m0, 0xaa ; 02
    call .h
    pblendw m4, m0, 0xaa ; 13
    call .h
    psrld m2, m1, 16
    pblendw m2, m0, 0xaa ; 24
    call .h
    psrld m5, m4, 16
    pblendw m5, m0, 0xaa ; 35
    call .h
    psrld m3, m2, 16
    pblendw m3, m0, 0xaa ; 46
    movsx gammad, word [abcdq+2*2]
    movsx deltad, word [abcdq+2*3]
    add myd, 512+(3*64<<10)
    ; abcd is not read after this point, so its register (r4 under the
    ; positional naming of cglobal) is reused as the row-pair counter
    mov r4d, 4
    lea tmp1d, [gammaq*3]
    sub deltad, tmp1d ; delta -= gamma*3
; .main2: produce the next two output rows.
; Filters two further source rows horizontally, runs WARP_V once per output
; row (results in m7 and m0) and rotates the row-pair registers so that the
; next call continues two rows further down.
.main2:
    call .h
    psrld m6, m5, 16
    pblendw m6, m0, 0xaa ; 57
    WARP_V 7, 1, 3, 4, 6
    call .h
    ; WARP_V overwrote m1 (its second argument); refill it while rotating
    mova m1, m2
    mova m2, m3
    psrld m3, 16
    pblendw m3, m0, 0xaa ; 68
    WARP_V 0, 4, 6, 1, 3
    mova m4, m5
    mova m5, m6
    ret
ALIGN function_align
; .h: horizontal pass of the warp filter for one source row.
; Broadcasts 16 source bytes to both lanes, fetches eight 8-byte filter entries
; at the indices (mx + k*alpha) >> 10 for k = 0..7, and applies them through
; the m12/m13 shuffles and pmaddubsw/phaddw. The result is returned in m0.
; Side effects: src advances by one row; mx advances by beta (beta was reduced
; by 3*alpha in .main to cancel the alpha steps taken here); m8, m9, m10, tmp1
; and tmp2 are clobbered.
.h:
    lea tmp1d, [mxq+alphaq*4]
    lea tmp2d, [mxq+alphaq*1]
    vbroadcasti128 m10, [srcq]
    shr mxd, 10
    shr tmp1d, 10
    movq xm8, [filterq+mxq *8]
    vinserti128 m8, [filterq+tmp1q*8], 1
    lea tmp1d, [tmp2q+alphaq*4]
    lea mxd, [tmp2q+alphaq*1]
    shr tmp2d, 10
    shr tmp1d, 10
    movq xm0, [filterq+tmp2q*8]
    vinserti128 m0, [filterq+tmp1q*8], 1
    lea tmp1d, [mxq+alphaq*4]
    lea tmp2d, [mxq+alphaq*1]
    shr mxd, 10
    shr tmp1d, 10
    movq xm9, [filterq+mxq *8]
    vinserti128 m9, [filterq+tmp1q*8], 1
    lea tmp1d, [tmp2q+alphaq*4]
    lea mxd, [tmp2q+betaq] ; mx += beta
    shr tmp2d, 10
    shr tmp1d, 10
    punpcklqdq m8, m0 ; 0 1 4 5
    movq xm0, [filterq+tmp2q*8]
    vinserti128 m0, [filterq+tmp1q*8], 1
    punpcklqdq m9, m0 ; 2 3 6 7
    pshufb m0, m10, m12
    pmaddubsw m0, m8
    pshufb m10, m13
    pmaddubsw m10, m9
    add srcq, ssq
    phaddw m0, m10
    pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
    paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
    ret

; BIDIR_FN: store loops shared by the functions that combine two intermediate
; buffers into dst.
;
; The argument is the name of an operation macro OP. "OP n" must leave 32
; output bytes in m0, computed from the source data at offset n, and
; "OP_INC_PTR n" must advance the source pointers. The expansion computes the
; first batch, sets stride3 = stride*3 and jumps to the address in wq, which
; the expanding function is expected to have pointed at one of the .wN labels.
;
; Rows written per trip: w4 eight (or four when h is 4), w8 and w16 four,
; w32 two, w64 one. h is counted down in hd.
%macro BIDIR_FN 1 ; op
    %1 0
    lea stride3q, [strideq*3]
    jmp wq
.w4_loop:
    %1_INC_PTR 2
    %1 0
    lea dstq, [dstq+strideq*4]
.w4:
    vextracti128 xm1, m0, 1
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    sub hd, 8
    jl .ret ; h was 4: only the first four rows exist
    ; lea and pextrd leave the flags alone, so the jg after the stores still
    ; tests the result of the sub above
    lea dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_loop
.ret:
    RET
.w8_loop:
    %1_INC_PTR 2
    %1 0
    lea dstq, [dstq+strideq*4]
.w8:
    vextracti128 xm1, m0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    sub hd, 4
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR 4
    %1 0
    lea dstq, [dstq+strideq*4]
.w16:
    vpermq m0, m0, q3120
    movu [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    %1 2
    vpermq m0, m0, q3120
    movu [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR 4
    %1 0
    lea dstq, [dstq+strideq*2]
.w32:
    vpermq m0, m0, q3120
    movu [dstq+strideq*0], m0
    %1 2
    vpermq m0, m0, q3120
    movu [dstq+strideq*1], m0
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR 4
    %1 0
    add dstq, strideq
.w64:
    ; NOTE(review): aligned stores here, unaligned ones for narrower widths -
    ; presumably dst is 32-byte aligned whenever w is 64; confirm with callers
    vpermq m0, m0, q3120
    mova [dstq+32*0], m0
    %1 2
    vpermq m0, m0, q3120
    mova [dstq+32*1], m0
    dec hd
    jg .w64_loop
    RET
%endmacro
%macro AVG 1 ; src_offset mova m0, [tmp1q+(%1+0)*32] paddw m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] paddw m1, [tmp2q+(%1+1)*32] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*32 add tmp2q, %1*32 %endmacro cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx2_table lea r6, [avg_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m2, [base+pw_1024] add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp1q+(%1+0)*32] psubw m2, m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] psubw m3, m1, [tmp2q+(%1+1)*32] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx2_table lea r6, [w_avg_avx2_table] tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight movsxd wq, dword [r6+wq*4] vpbroadcastd m5, [base+pw_2048] psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q pxor m0, m0 mov tmp1q, tmp2q psubw m4, m0, m4 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 vpermq m3, [maskq+%1*16], q3120 mova m0, [tmp2q+(%1+0)*32] psubw m1, m0, [tmp1q+(%1+0)*32] psubb m3, m4, m3 paddw m1, m1 ; (b - a) << 1 paddb m3, m3 punpcklbw m2, m4, m3 ; -m << 9 pmulhw m1, m2 paddw m0, m1 mova m1, [tmp2q+(%1+1)*32] psubw m2, m1, [tmp1q+(%1+1)*32] paddw m2, m2 punpckhbw m3, m4, m3 pmulhw m2, m3 paddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %macro 
MASK_INC_PTR 1 add maskq, %1*16 add tmp2q, %1*32 add tmp1q, %1*32 %endmacro cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx2_table lea r7, [mask_avx2_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m5, [base+pw_2048] pxor m4, m4 add wq, r7 BIDIR_FN MASK cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn maskq, maskmp movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pb_64] vpbroadcastd m5, [base+pw_512] sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq .w4: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 vpbroadcastd xm1, [dstq+dsq*2] pinsrd xm1, [dstq+r6 ], 3 mova xm6, [maskq] psubb xm3, xm4, xm6 punpcklbw xm2, xm3, xm6 punpckhbw xm3, xm6 mova xm6, [maskq+tmpq] add maskq, 4*4 punpcklbw xm0, xm6 punpckhbw xm1, xm6 pmaddubsw xm0, xm2 pmaddubsw xm1, xm3 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 packuswb xm0, xm1 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 pextrd [dstq+dsq*2], xm0, 2 pextrd [dstq+r6 ], xm0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: movq xm1, [dstq+dsq*0] movhps xm1, [dstq+dsq*1] vpbroadcastq m2, [dstq+dsq*2] vpbroadcastq m3, [dstq+r6 ] mova m0, [maskq] mova m6, [maskq+tmpq] add maskq, 8*4 vpblendd m1, m2, 0x30 vpblendd m1, m3, 0xc0 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: mova m0, [maskq] movu xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 mova m6, [maskq+tmpq] add maskq, 16*2 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb 
m0, m1 movu [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET .w32: mova m0, [maskq] movu m1, [dstq] mova m6, [maskq+tmpq] add maskq, 32 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 movu [dstq], m0 add dstq, dsq dec hd jg .w32 RET .w64: mova m0, [maskq+32*0] mova m1, [dstq+32*0] mova m2, [maskq+tmpq+32*0] psubb m6, m4, m0 punpcklbw m3, m6, m0 punpckhbw m6, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova m2, [maskq+32*1] pmaddubsw m0, m3 mova m3, [dstq+32*1] pmaddubsw m1, m6 mova m6, [maskq+tmpq+32*1] add maskq, 64 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 psubb m1, m4, m2 mova [dstq+32*0], m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 punpcklbw m2, m3, m6 punpckhbw m3, m6 pmaddubsw m2, m0 pmaddubsw m3, m1 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+32*1], m2 add dstq, dsq dec hd jg .w64 RET cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 add srcq, r10 ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw - 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, 
leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastb m0, [srcq] .left_loop_%3: mova [dstq+r3], m0 add r3, 32 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3] %if %1 movu [r12+r3], m0 %else movu [dstq+r3], m0 %endif add r3, 32 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add r12, centerwq %else lea r12, [dstq+centerwq] %endif xor r3, r3 vpbroadcastb m0, [srcq+centerwq-1] .right_loop_%3: movu [r12+r3], m0 add r3, 32 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 32 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 
.top_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 32 cmp r1, bwq jl .top_x_loop .end: RET %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 mova m%1, [tmp1q+32*%3] mova m1, [tmp2q+32*%3] psubw m1, m%1 pabsw m%2, m1 psubusw m%2, m6, m%2 psrlw m%2, 8 ; 64 - m psllw m2, m%2, 10 pmulhw m1, m2 paddw m%1, m1 mova m1, [tmp1q+32*%4] mova m2, [tmp2q+32*%4] psubw m2, m1 pabsw m3, m2 psubusw m3, m6, m3 psrlw m3, 8 %if %5 packuswb m%2, m3 psubb m%2, m5, m%2 vpermq m%2, m%2, q3120 %else phaddw m%2, m3 %endif psllw m3, 10 pmulhw m2, m3 paddw m1, m2 pmulhrsw m%1, m7 pmulhrsw m1, m7 packuswb m%1, m1 %endmacro cglobal w_mask_420_8bpc, 4, 9, 14, dst, stride, tmp1, tmp2, w, h, mask, mstr, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r8m ; sign movifnidn hd, hm movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m9, [base+deint_shuf4] vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign add wq, r7 W_MASK 0, 4, 0, 1 mov maskq, maskmp mov mstrq, mstrmp lea stride3q, [strideq*3] jmp wq .w4_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 je .w4_end W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] phaddd m4, m5 vextracti128 xm1, m0, 1 psubw m4, m8, m4 psrlw m4, 2 vpermd m4, m9, m4 vextracti128 xm5, m4, 1 packuswb xm4, xm5 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd 
[dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], xm4 sub hd, 16 jg .w4_loop RET .w4_end: vextracti128 xm0, m4, 1 vpblendd xm1, xm4, xm0, 0x05 vpblendd xm4, xm0, 0x0a pshufd xm1, xm1, q2301 psubw xm4, xm8, xm4 psubw xm4, xm1 psrlw xm4, 2 packuswb xm4, xm4 movq [maskq], xm4 RET .w8_loop: add tmp1q, 2*32 add tmp2q, 2*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 8 .w8: vextracti128 xm2, m4, 1 vextracti128 xm1, m0, 1 psubw xm4, xm8, xm4 psubw xm4, xm2 psrlw xm4, 2 packuswb xm4, xm4 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 movq [maskq], xm4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w16: vpermq m0, m0, q3120 movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 punpckhqdq m1, m4, m5 punpcklqdq m4, m5 psubw m1, m8, m1 psubw m1, m4 psrlw m1, 2 vpermq m0, m0, q3120 packuswb m1, m1 vpermd m1, m9, m1 movu [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], xm1 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 16 .w32: vpermq m0, m0, q3120 movu [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 psubw m4, m8, m4 psubw m4, m5 psrlw m4, 2 vpermq m0, m0, q3120 packuswb m4, m4 vpermd m4, m9, m4 movu [dstq+strideq*1], m0 mova [maskq], xm4 sub hd, 2 jg .w32_loop RET .w64_loop_even: psubw m10, m8, m4 psubw m11, m8, m5 dec hd .w64_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 add dstq, strideq .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 vpermq m0, m0, q3120 mova [dstq+32*1], m0 test hd, 1 jz .w64_loop_even psubw m4, m10, m4 psubw m5, m11, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq], m4 add maskq, mstrq dec hd jg .w64_loop RET cglobal w_mask_422_8bpc, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, 
mask, mstr, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r8m ; sign movifnidn hd, hm pxor m9, m9 movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m10, [base+deint_shuf4] vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign add wq, r7 mov maskq, maskmp mov mstrq, mstrmp W_MASK 0, 4, 0, 1 lea stride3q, [strideq*3] jmp wq .w4_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 je .w4_end W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermd m5, m10, m5 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], m5 sub hd, 16 jg .w4_loop RET .w4_end: vextracti128 xm5, m4, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 mova [maskq], xm5 RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vextracti128 xm5, m4, 1 vextracti128 xm1, m0, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], xm5 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w16: vpermq m0, m0, q3120 movu 
[dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 movu [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], m5 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w32: vpermq m0, m0, q3120 movu [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 movu [dstq+strideq*1], m0 mova [maskq], m5 sub hd, 2 jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 add dstq, strideq add maskq, mstrq .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*1], m0 mova [maskq], m5 dec hd jg .w64_loop RET cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m5, [base+pb_64] vpbroadcastd m7, [base+pw_2048] add wq, r7 W_MASK 0, 4, 0, 1, 1 lea stride3q, [strideq*3] jmp wq .w4_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 mova [maskq], m4 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 jg .w4_loop .w4_end: RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w8: vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps 
[dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], m4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w16: vpermq m0, m0, q3120 movu [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [maskq], m4 sub hd, 2 jg .w16_loop RET .w32_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32 .w32: vpermq m0, m0, q3120 movu [dstq], m0 mova [maskq], m4 dec hd jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32*2 .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 mova [maskq+32*0], m4 W_MASK 0, 4, 2, 3, 1 vpermq m0, m0, q3120 mova [dstq+32*1], m0 mova [maskq+32*1], m4 dec hd jg .w64_loop RET cglobal morph_8bpc, 4, 6, 9, dst, stride, a, b, w, h movd xm0, ad movd xm2, bd lea r2, [morph_avx2_table] vpbroadcastw m0, xm0 vpbroadcastd m2, xm2 vbroadcasti128 m3, [r2+morph_A-morph_avx2_table] DEFINE_ARGS dst, stride, _, stride3, w, h tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r2+wq*4] add wq, r2 lea stride3q, [strideq*3] pslld m1, m0, 16 psrld m0, 16 ; since a can be negative jmp wq .w4: movd xm4, [dstq+strideq*0] vpbroadcastd m6, [dstq+strideq*2] movd xm5, [dstq+strideq*1] vpbroadcastd m7, [dstq+stride3q ] vpblendd m4, m6, 0x10 vpblendd m5, m7, 0x10 REPX {pshufb x, m3}, m4, m5 REPX {pmaddwd x, m0}, m4, m5 REPX {paddd x, m2}, m4, m5 packusdw m4, m5 psrlw m4, 8 packuswb m4, m4 vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 movd [dstq+strideq*2], xm5 pextrd [dstq+stride3q ], xm5, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: movq xm4, [dstq+strideq*0] movq xm6, [dstq+strideq*1] vpbroadcastq m5, [dstq+strideq*2] vpbroadcastq m7, [dstq+stride3q ] vpblendd m4, m5, 0xf0 vpblendd m6, m7, 0xf0 REPX {pshufb x, m3}, m4, m6 pmaddwd m5, m4, m1 pmaddwd m4, m0 pmaddwd m7, m6, m1 pmaddwd m6, m0 REPX {paddd x, m2}, m5, m4, m7, m6 packusdw m4, m5 
packusdw m6, m7 REPX {psrlw x, 8 }, m4, m6 packuswb m4, m6 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 movq [dstq+strideq*2], xm5 movhps [dstq+stride3q ], xm5 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: vbroadcasti128 m4, [r2+morph_B-morph_avx2_table] .w16_loop: movu xm5, [dstq+strideq*0] vinserti128 m5, [dstq+strideq*1], 1 call .body movu [dstq+strideq*0], xm5 vextracti128 [dstq+strideq*1], m5, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET .w32: vbroadcasti128 m4, [r2+morph_B-morph_avx2_table] .w32_loop: movu m5, [dstq] call .body movu [dstq], m5 add dstq, strideq dec hd jg .w32_loop RET .w64: vbroadcasti128 m4, [r2+morph_B-morph_avx2_table] .w64_loop: mova m5, [dstq] call .body mova [dstq], m5 mova m5, [dstq+32] call .body mova [dstq+32], m5 add dstq, strideq dec hd jg .w64_loop RET .body: pshufb m6, m5, m4 pshufb m5, m3 pmaddwd m7, m5, m1 pmaddwd m5, m0 pmaddwd m8, m6, m1 pmaddwd m6, m0 REPX {paddd x, m2}, m7, m5, m8, m6 packusdw m5, m7 packusdw m6, m8 REPX {psrlw x, 8 }, m5, m6 packuswb m5, m6 ret cglobal sad_refine_mv_8bpc, 4, 10, 16, 9 * 32, p0, p0s, p1, p1s, p0s3, h, p1s3, p0s5, p1s5 movifnidn hd, hm lea r9, [hq+4] lea p1s3q, [p1sq*3] lea p0s5q, [p0sq*5] lea p1s5q, [p1sq*5] cmp dword r4m, 8 ; if w == 8 lea p0s3q, [p0sq*3] je .w8 ; w=16 imul r9d, 40 mov [rsp], r9d movu xm0, [p0q+2*p0sq] movu xm1, [p1q+2*p1sq] movq xm2, [p0q+2*p0sq+16] movq xm3, [p1q+2*p1sq+16] movu xm4, [p0q+2*p0s3q+8] movu xm5, [p1q+2*p1s3q+8] movhps xm2, [p0q+2*p0s3q] movhps xm3, [p1q+2*p1s3q] vinserti128 m0, [p0q+4*p0sq], 1 ; p0[y=2|4,x=0-15] vinserti128 m1, [p1q+4*p1sq], 1 ; p1[y=2|4,x=0-15] vinserti128 m2, [p0q+4*p0sq+16], 1 vinserti128 m3, [p1q+4*p1sq+16], 1 vpbroadcastq m6, [p0q+8*p0sq] vpbroadcastq m7, [p1q+8*p1sq] vinserti128 m4, [p0q+8*p0sq+8], 1 ; p0[y=6|8,x=8-23] vinserti128 m5, [p1q+8*p1sq+8], 1 ; p1[y=6|8,x=8-23] vpblendd m2, m6, 0xc0 ; p0[y=2|4,x=16-23 & y=6|8,x=0-7] vpblendd m3, m7, 0xc0 ; p1[y=2|4,x=16-23 & y=6|8,x=0-7] 
movu xm6, [p0q+2*p0s5q] movq xm8, [p0q+2*p0s5q+16] movu xm7, [p1q+2*p1s5q+8] movhps xm8, [p1q+2*p1s5q] vinserti128 m6, [p0q+4*p0s3q], 1 ; p0[y=10|12,x=0-15] vinserti128 m7, [p1q+4*p1s3q+8], 1 ; p1[y=10|12,x=8-23] vinserti128 m8, [p0q+4*p0s3q+16], 1 vpbroadcastq m9, [p1q+4*p1s3q] vpblendd m8, m9, 0xc0 ; p0[y=10|12,x=16-23] & p1[y=10|12,x=0-7] mov r9d, -1 cmp byte r6m, 0 je .w16_main ; m0: p0[y=2|4,x=0-15] ; m1: p1[y=2|4,x=0-15] ; m2: p0[y=2|4,x=16-23 & y=6|8,x=0-7] ; m3: p1[y=2|4,x=16-23 & y=6|8,x=0-7] ; m4: p0[y=6|8,x=8-23] ; m5: p1[y=6|8,x=8-23] ; m6: p0[y=10|12,x=0-15] ; m7: p1[y=10|12,x=8-23] ; m8: p0[y=10|12,x=16-23] & p1[y=10|12,x=0-7] ; m9-15: free palignr m9, m2, m0, 2 ; p0[y=2|4,x=2-17] palignr m10, m3, m1, 2 ; p1[y=2|4,x=2-17] palignr m11, m4, m2, 14 ; p0[y=6|8,x=6-21] palignr m12, m5, m3, 14 ; p1[y=6|8,x=6-21] palignr m13, m8, m6, 2 ; p0[y=10|12,x=2-17] palignr m14, m7, m8, 10 ; p1[y=10|12,x=2-17] psadbw m9, m10 psadbw m11, m12 psadbw m13, m14 psrldq m10, m2, 2 ; p0[y=2|4,x=18-23 & y=6|8,x=0-7], 2x0 palignr m12, m8, m7, 10 ; p1[y=10|12,x=18-23] & ; p0[y=10|12,x=16-23] & 2x? 
psrldq m14, m3, 2 ; p1[y=2|4,x=18-23 & y=6|8,x=0-7], 2x0 shufps m10, m12, q2220 ; p0[y=2|4,x=18-21 & y=6|8,x=2-5 & ; y=10|12,x=18-21 [2x]] shufps m14, m12, q2020 ; p1[y=2|4,x=18-21 & y=6|8,x=2-5 & ; y=10|12,x=18-21] & ; p0[y=10|12,x=18-21] paddw m9, m11 psadbw m10, m14 paddw m9, m13 paddw m9, m10 ; sad(p0,p1) @ dy=0,dx=0 cmp hd, 8 je .skip_16x16_thr_acc lea r9, [p0q+p0s3q*4] ; &p0[y=12] movu xm10, [r9+p0sq*2] movq xm12, [r9+p0sq*2+16] movu xm14, [r9+p0s3q*2+8] movhps xm12, [r9+p0s3q*2] vinserti128 m10, [r9+p0sq*4], 1 ; p0[y=14|16,x=0-15] vinserti128 m12, [r9+p0sq*4+16], 1 vpbroadcastq m15, [r9+p0sq*8] vinserti128 m14, [r9+p0sq*8+8], 1 ; p0[y=18|20,x=8-23] vpblendd m12, m15, 0xc0 ; p0[y=14|16,x=16-23 & ; y=18|20,x=0-7] lea r9, [p1q+p1s3q*4] movu xm11, [r9+p1sq*2] movq xm13, [r9+p1sq*2+16] movhps xm13, [r9+p1s3q*2] vinserti128 m11, [r9+p1sq*4], 1 ; p1[y=14|16,x=0-15] vinserti128 m13, [r9+p1sq*4+16], 1 vpbroadcastq m15, [r9+p1sq*8] vpblendd m13, m15, 0xc0 ; p1[y=14|16,x=16-23 & ; y=18|20,x=0-7] movu xm15, [r9+p1s3q*2+8] vinserti128 m15, [r9+p1sq*8+8], 1 ; p1[y=18|20,x=8-23] palignr m10, m12, m10, 2 ; p0[y=14|16,x=2-17] palignr m11, m13, m11, 2 ; p1[y=14|16,x=2-17] palignr m14, m12, 14 ; p0[y=18|20,x=6-21] palignr m15, m13, 14 ; p1[y=18|20,x=6-21] vpblendw m13, m12, 10011001b psadbw m10, m11 psadbw m14, m15 psadbw m12, m13 paddw m9, m10 paddw m12, m14 paddw m9, m12 .skip_16x16_thr_acc: vextracti128 xm10, m9, 1 paddd xm9, xm10 punpckhqdq xm10, xm9, xm9 paddd xm9, xm10 movd r9d, xm9 inc r9d imul r9d, 7 shr r9d, 3 ; (sad*7+7)>>3 cmp r9d, [rsp] jl .ret_origin .w16_main: mov word r6m, r9w ; r6m: best_sad ; m0: p0[y=2|4,x=0-15] ; m1: p1[y=2|4,x=0-15] ; m2: p0[y=2|4,x=16-23 & y=6|8,x=0-7] ; m3: p1[y=2|4,x=16-23 & y=6|8,x=0-7] ; m4: p0[y=6|8,x=8-23] ; m5: p1[y=6|8,x=8-23] ; m6: p0[y=10|12,x=0-15] ; m7: p1[y=10|12,x=8-23] ; m8: p0[y=10|12,x=16-23] & p1[y=10|12,x=0-7] ; m9-15: free call .w16_line_top pslldq m9, 2 pslldq m10, 4 pslldq m11, 6 por m15, m9 por m10, m11 mova 
[rsp+2*mmsize], m10 mova [rsp+3*mmsize], m15 ; r6m: best_sad ; m0: p0[y=2|4,x=0-15] ; m1: p1[y=2|4,x=0-15] ; m2: p0[y=2|4,x=16-23 & y=6|8,x=0-7] ; m3: p1[y=2|4,x=16-23 & y=6|8,x=0-7] ; m4: p0[y=6|8,x=8-23] ; m5: p1[y=6|8,x=8-23] ; m6: p0[y=10|12,x=0-15] ; m7: p1[y=10|12,x=8-23] ; m8: p0[y=10|12,x=16-23] & p1[y=10|12,x=0-7] ; r2: sad(p0,p1) @ dy=0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=0,dx=[+1..+2] ; compare p0[y=0,2,4,6,8] with p1[y=4,6,8,10,12], i.e. dy=-2 lea r9, [p1q+p1s3q*4] mova [rsp+4*mmsize+0*16], xm1 mova [rsp+4*mmsize+1*16], xm3 vextracti128 [rsp+4*mmsize+2*16], m6, 1 vextracti128 [rsp+4*mmsize+3*16], m8, 1 vpbroadcastq m9, [p0q+16] vinserti128 m10, m6, [p0q], 1 ; p0[y=10|0,x=0-15] vpblendd m9, m8, m9, 0x30 ; p0[y=10|0,x=16-23] & ; p1[y=10|12,x=0-7] movq xm11, [r9+p1sq*2+16] vinserti128 m1, [r9+p1sq*2], 0 ; p1[y=14|4,x=0-15] vpblendd m3, m11, 0x03 ; p1[y=14|4,x=16-23 & y=6|8,x=0-7] palignr m6, m4, m2, 8 punpckhqdq m8, m4, m8 palignr m4, m2, m0, 8 punpcklqdq m2, m9, m0 mova m0, m10 ; r6m: best_sad ; m0: p0[y=10|0,x=0-15] ; m1: p1[y=14|4,x=0-15] ; m2: p0[y=10|0,x=16-23 & y=2|4,x=0-7] ; m3: p1[y=14|4,x=16-23 & y=6|8,x=0-7] ; m4: p0[y=2|4,x=8-23] ; m5: p1[y=6|8,x=8-23] ; m6: p0[y=6|8,x=0-15] ; m7: p1[y=10|12,x=8-23] ; m8: p0[y=6|8,x=16-23] & p1[y=10|12,x=0-7] ; r2: sad(p0,p1) @ dy=0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=0,dx=[+1..+2] ; r4-5: backup data for y=2[p1] and y=12[p0] for dy=+2 call .w16_line_top_w_origin mova m12, [rsp+8*mmsize] pslldq m11, 2 pslldq m12, 4 pslldq m15, 6 por m10, m11 por m12, m15 por m10, m12 mova [rsp+1*mmsize], m9 mova [rsp+0*mmsize], m10 ; r6m: best_sad ; m0: p0[y=10|0,x=0-15] ; m1: p1[y=14|4,x=0-15] ; m2: p0[y=10|0,x=16-23 & y=2|4,x=0-7] ; m3: p1[y=14|4,x=16-23 & y=6|8,x=0-7] ; m4: p0[y=2|4,x=8-23] ; m5: p1[y=6|8,x=8-23] ; m6: p0[y=6|8,x=0-15] ; m7: p1[y=10|12,x=8-23] ; m8: p0[y=6|8,x=16-23] & p1[y=10|12,x=0-7] ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 ; r2: sad(p0,p1) @ dy=0,dx=[-2..-1] ; r3: sad(p0,p1) @ 
dy=0,dx=[+1..+2] ; r4-5: backup data for y=2[p1] and y=12[p0] for dy=+2 ; compare p0[y=4,6,8,10,12] with p1[y=0,2,4,6,8], i.e. dy=+2 vinserti128 m1, [rsp+4*mmsize+0*16], 0 ; p1[y=2|4,x=0-15] vinserti128 m3, [rsp+4*mmsize+1*16], 0 ; p1[y=2|4,x=0-15 & ; y=6|8,x=0-7] lea r9, [p0q+p0s3q*4] ; &p0[y=12] vpbroadcastq m9, [p1q] vinserti128 m7, [p1q+8], 1 ; p1[y=10|0,x=8-23] vpblendd m8, m9, 0xc0 ; p0[y=6|8,x=16-23] & ; p1[y=10|0,x=0-7] palignr m9, m4, m2, 8 vinserti128 m10, m4, [r9+p0sq*2+8], 0 vinserti128 m9, [r9+p0sq*2], 0 palignr xm11, xm2, xm0, 8 vinserti128 m12, m0, [rsp+4*mmsize+4*8], 1 vinserti128 m11, [rsp+4*mmsize+5*8], 1 mova m0, m6 mova m6, m9 mova m4, m11 punpcklqdq m2, m8, m12 punpckhqdq m8, m10, m8 call .w16_line_top_w_origin mova m12, [rsp+8*mmsize] pslldq m10, 6 pslldq m15, 4 pslldq m9, 6 pslldq m12, 2 por m15, m9 por m11, m12 por m11, m15 mova [rsp+4*mmsize], m10 mova [rsp+5*mmsize], m11 ; r6m: best_sad ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 ; r2: sad(p0,p1) @ dy=0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=0,dx=[+1..+2] ; r4: sad(p0,p1) @ dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[11..+2] ; compare p0[y=1,3,5,7,9,11] with p1[y=3,5,7,9,11,13], i.e. 
dy=-1 lea r9, [p1q+p1sq] ; &p1[y=1] movu xm1, [r9+p1sq*2] movq xm3, [r9+p1sq*2+16] movu xm5, [r9+p1s3q*2+8] movhps xm3, [r9+p1s3q*2] vinserti128 m1, [r9+p1sq*4], 1 ; p1[y=3|5,x=0-15] vinserti128 m3, [r9+p1sq*4+16], 1 vinserti128 m5, [r9+p1sq*8+8], 1 ; p1[y=7|9,x=8-23] vpbroadcastq m9, [r9+p1sq*8] movu xm7, [r9+p1s5q*2+8] movhps xm8, [r9+p1s5q*2] vinserti128 m7, [r9+p1s3q*4+8], 1 ; p1[y=11|13,x=8-23] vpbroadcastq m10, [r9+p1s3q*4] lea r9, [p0q+p0sq] ; &p0[y=1] movu xm0, [r9] movq xm2, [r9+16] movu xm4, [r9+p0sq*4+8] movhps xm2, [r9+p0sq*4] vinserti128 m0, [r9+p0sq*2], 1 ; p0[y=1|3,x=0-15] vinserti128 m2, [r9+p0sq*2+16], 1 vinserti128 m4, [r9+p0s3q*2+8], 1 ; p0[y=5|7,x=8-23] vpbroadcastq m11, [r9+p0s3q*2] movu xm6, [r9+p0sq*8] movlps xm8, [r9+p0sq*8+16] vinserti128 m6, [r9+p0s5q*2], 1 ; p0[y=9|11,x=0-15] vinserti128 m8, [r9+p0s5q*2+16], 1 vpblendd m3, m9, 0xc0 ; p1[y=3|5,x=16-23 & y=7|9,x=0-7] vpblendd m2, m11, 0xc0 ; p0[y=1|3,x=16-23 & y=5|7,x=0-7] vpblendd m8, m10, 0xc0 ; p0[y=9|11,x=16-23] & ; p1[y=11|13,x=0-7] call .w16_line_top_w_origin mova m12, [rsp+8*mmsize] pslldq m10, 2 pslldq m11, 4 pslldq m12, 6 pslldq m9, 2 por m10, m11 por m12, [rsp+1*mmsize] por m9, m15 por m10, m12 por m9, [rsp+2*mmsize] mova [rsp+1*mmsize], m10 mova [rsp+2*mmsize], m9 ; r6m: best_sad ; m0: p0[y=1|3,x=0-15] ; m1: p1[y=3|5,x=0-15] ; m2: p0[y=1|3,x=16-23 & y=5|7,x=0-7] ; m3: p1[y=3|5,x=16-23 & y=7|9,x=0-7] ; m4: p0[y=5|7,x=8-23] ; m5: p1[y=7|9,x=8-23] ; m6: p0[y=9|11,x=0-15] ; m7: p1[y=11|13,x=8-23] ; m8: p0[y=9|11,x=16-23] & p1[y=11|13,x=0-7] ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 & dy=-1,dx=[-2..0] ; r2: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=0,dx=[+1..+2] ; r4: sad(p0,p1) @ dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[11..+2] ; compare p0[y=3,5,7,9,11,13] with p1[y=1,3,5,7,9,11], i.e. 
dy=+1 vinserti128 m11, m0, [r9+p0s3q*4], 0 ; p0[y=13|3,x=0-15] movq xm9, [r9+p0s3q*4+16] vinserti128 m7, [p1q+p1sq+8], 1 ; p1[y=11|1,x=8-23] vpbroadcastq m10, [p1q+p1sq] vpblendd m9, m2, m9, 0x3 ; p0[y=13|3,x=16-23] & ; p1[y=3|5,x=0-7] vpblendd m8, m10, 0xc0 ; p0[y=9|11,x=16-23] & ; p1[y=11|1,x=0-7] palignr m0, m4, m2, 8 palignr m2, m6, m4, 8 palignr m4, m8, m6, 8 mova m6, m11 vpblendd m8, m9, 00110011b call .w16_line_top_w_origin mova m12, [rsp+8*mmsize] pslldq m10, 4 pslldq m11, 6 pslldq m15, 2 pslldq m9, 4 por m12, m15 por m9, [rsp+4*mmsize] por m10, m11 por m14, m12, m9 por m13, m10, [rsp+3*mmsize] ; r6m: best_sad ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 & dy=-1,dx=[-2..0] ; r2: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=0,dx=[-2..-1] ; m13: sad(p0,p1) @ dy=0,dx=[+1..+2] & dy=+1,dx=[-2..-1] ; m14: sad(p0,p1) @ dy=+1,dx=[0..+2] & dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[11..+2] cmp hd, 8 jg .bottom_16x16 mova m15, [rsp+2*mmsize] jmp .skip_8x16_main .bottom_16x16: lea p0q, [p0q+p0s3q*4] ; &p0[y=12] lea p1q, [p1q+p1s3q*4] ; &p1[y=12] ; compare p0[y=12,14,16,18] with p1[y=16,18,20,22], i.e. 
dy=-2 movu xm0, [p0q] movq xm2, [p0q+16] movu xm4, [p0q+p0sq*4+8] movhps xm2, [p0q+p0sq*4] vinserti128 m0, [p0q+p0sq*2], 1 ; p0[y=12|14,x=0-15] vinserti128 m2, [p0q+p0sq*2+16], 1 vpbroadcastq m6, [p0q+p0s3q*2] vinserti128 m4, [p0q+p0s3q*2+8], 1 ; p0[y=16|18,x=8-23] movu xm1, [p1q+p1sq*4] movq xm3, [p1q+p1sq*4+16] movu xm5, [p1q+p1sq*8+8] movhps xm3, [p1q+p1sq*8] vinserti128 m1, [p1q+p1s3q*2], 1 ; p1[y=16|18,x=0-15] vinserti128 m3, [p1q+p1s3q*2+16], 1 vpbroadcastq m7, [p1q+p1s5q*2] vinserti128 m5, [p1q+p1s5q*2+8], 1 ; p1[y=20|22,x=8-23] vpblendd m2, m6, 0xc0 ; p0[y=12|14,x=16-23 & ; y=16|18,x=0-7] vpblendd m3, m7, 0xc0 ; p1[y=16|18,x=16-23 & ; y=20|22,x=0-7] call .w16_line_bottom_w_origin pslldq m12, 2 pslldq m15, 4 pslldq m11, 6 por m10, m12 por m15, m11 por m10, m15 paddw m9, [rsp+1*mmsize] paddw m10, [rsp+0*mmsize] mova [rsp+1*mmsize], m9 mova [rsp+0*mmsize], m10 ; word r6m: best_sad ; m0: p0[y=12|14,x=0-15] ; m1: p1[y=16|18,x=0-15] ; m2: p0[y=12|14,x=16-23 & y=16|18,x=0-7] ; m3: p1[y=16|18,x=16-23 & y=20|22,x=0-7] ; m4: p0[y=16|18,x=8-23] ; m5: p1[y=20|22,x=8-23] ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] (full) ; r1: sad(p0,p1) @ dy=-2,dx=+2 (full) & dy=-1,dx=[-2..0] (top) ; r2: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=0,dx=[-2..-1] (top) ; m13: sad(p0,p1) @ dy=0,dx=[+1..+2] & dy=+1,dx=[-2..-1] (top) ; m14: sad(p0,p1) @ dy=+1,dx=[0..+2] & dy=+2,dx=-2 (top) ; r5: sad(p0,p1) @ dy=+2,dx=[-1..+2] (top) ; compare p0[y=14,16,18,20] with p1[y=14,16,18,20], i.e. 
dy=0 vinserti128 m6, m0, [p0q+8*p0sq], 0 ; p0[y=20|14,x=0-15] vinserti128 m5, [p1q+2*p1sq+8], 1 ; p1[y=20|14,x=8-23] movq xm7, [p0q+8*p0sq+16] vpbroadcastq m8, [p1q+2*p1sq] vpblendd m7, m2, m7, 0x3 ; p0[y=20|14,x=16-23 & ; y=16|18,x=0-7] vpblendd m3, m8, 0xc0 ; p1[y=16|18,x=16-23 & ; y=20|14,x=0-7] palignr m0, m4, m2, 8 ; p0[y=16|18,y=0-15] palignr m2, m6, m4, 8 ; p0[y=16|18,x=16-23 & ; y=20|14,x=0-7] palignr m4, m7, m6, 8 ; p0[y=20|14,x=8-23] call .w16_line_bottom pslldq m10, 4 pslldq m12, 6 pslldq m9, 2 por m10, m12 por m11, m9 paddw m10, [rsp+2*mmsize] paddw m13, m11 mova [rsp+2*mmsize], m10 ; compare p0[y=16,18,20,22] with p1[y=12,14,16,18], i.e. dy=+2 vinserti128 m6, m4, [p0q+p0s5q*2+8], 1 ; p0[y=20|22,x=8-23] vinserti128 m5, [p1q+8], 0 ; p1[y=12|14,x=8-23] vpbroadcastq m7, [p0q+p0s5q*2] movhps xm8, [p1q] vpblendd m7, m2, m7, 0xc0 ; p0[y=16|18,x=16-23 & ; y=20|22,x=0-7] vpblendd m3, m8, 0xc ; p1[y=16|18,x=16-23 & ; y=12|14,x=0-7] palignr m4, m7, m0, 8 ; p0[y=16|18,x=8-23] palignr m2, m0, m6, 8 ; p0[y=20|22,x=16-23 & ; y=16|18,x=0-7] palignr m0, m6, m7, 8 ; p0[y=20|22,x=0-15] call .w16_line_bottom_w_origin pslldq m15, 2 pslldq m11, 4 pslldq m9, 6 por m12, m15 por m11, m9 pslldq m10, 6 por m12, m11 paddw m14, m10 paddw m12, [rsp+5*mmsize] mova [rsp+5*mmsize], m12 ; word r6m: best_sad ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] (full) ; r1: sad(p0,p1) @ dy=-2,dx=+2 (full) & dy=-1,dx=[-2..0] (top) ; r2: sad(p0,p1) @ dy=-1,dx=[+1..+2] (top) & dy=0,dx=[-2..-1] (full) ; m13: sad(p0,p1) @ dy=0,dx=[+1..+2] (full) & dy=+1,dx=[-2..-1] (top) ; m14: sad(p0,p1) @ dy=+1,dx=[0..+2] (top) & dy=+2,dx=-2 (full) ; r5: sad(p0,p1) @ dy=+2,dx=[-1..+2] (full) ; m0-12,15: free ; compare p0[y=13|15|17|19] with p1[y=15|17|19|21], i.e. 
dy=-1 add p0q, p0sq ; &p0[y=13] add p1q, p1sq ; &p1[y=13] movu xm0, [p0q] movq xm2, [p0q+16] movu xm4, [p0q+p0sq*4+8] movhps xm2, [p0q+p0sq*4] vinserti128 m0, [p0q+p0sq*2], 1 ; p0[y=13|15,x=0-15] vinserti128 m2, [p0q+p0sq*2+16], 1 vpbroadcastq m6, [p0q+p0s3q*2] vinserti128 m4, [p0q+p0s3q*2+8], 1 ; p0[y=17|19,x=8-23] movu xm1, [p1q+p1sq*2] movq xm3, [p1q+p1sq*2+16] movu xm5, [p1q+p1s3q*2+8] movhps xm3, [p1q+p1s3q*2] vinserti128 m1, [p1q+p1sq*4], 1 ; p1[y=15|17,x=0-15] vinserti128 m3, [p1q+p1sq*4+16], 1 vpbroadcastq m7, [p1q+p1sq*8] vinserti128 m5, [p1q+p1sq*8+8], 1 ; p1[y=19|21,x=8-23] vpblendd m2, m6, 0xc0 ; p0[y=13|15,x=16-23 & ; y=17|19,x=0-7] vpblendd m3, m7, 0xc0 ; p1[y=15|17,x=16-23 & ; y=19|21,x=0-7] call .w16_line_bottom_w_origin pslldq m10, 2 pslldq m12, 4 pslldq m15, 6 pslldq m9, 2 por m10, m12 por m11, m9 por m10, m15 paddw m11, [rsp+2*mmsize] paddw m10, [rsp+1*mmsize] mova [rsp+2*mmsize], m11 mova [rsp+1*mmsize], m10 ; compare p0[y=15|17|19|21] with p1[y=13|15|17|19], i.e. dy=+1 vinserti128 m6, m0, [p0q+p0sq*8], 0 ; p0[y=21|15,x=0-15] vinserti128 m5, [p1q+8], 1 ; p1[y=19|13,x=8-23] movq xm7, [p0q+p0sq*8+16] vpbroadcastq m8, [p1q] vpblendd m7, m2, m7, 0x3 ; p0[y=21|15,x=16-23 & ; y=17|19,x=0-7] vpblendd m3, m8, 0xc0 ; p1[y=15|17,x=16-23 & ; y=19|13,x=0-7] palignr m0, m4, m7, 8 ; p0[y=17|19,x=0-15] palignr m2, m6, m4, 8 ; p0[y=17|19,x=16-23 & ; y=21|15,x=0-7] palignr m4, m7, m6, 8 ; p0[y=21|15,x=8-23] call .w16_line_bottom_w_origin pslldq m11, 2 pslldq m9, 4 pslldq m10, 4 pslldq m12, 6 por m15, m11 por m10, m12 por m15, m9 paddw m13, m10 paddw m14, m15 mova m15, [rsp+2*mmsize] jmp .skip_8x16_main .w16_line_top_w_origin: ; dx=0 palignr m9, m2, m0, 2 ; p0[y=2|4,x=2-17] palignr m10, m3, m1, 2 ; p1[y=2|4,x=2-17] palignr m11, m4, m2, 14 ; p0[y=6|8,x=6-21] palignr m12, m5, m3, 14 ; p1[y=6|8,x=6-21] palignr m13, m8, m6, 2 ; p0[y=10|12,x=2-17] palignr m14, m7, m8, 10 ; p1[y=10|12,x=2-17] psadbw m9, m10 psadbw m11, m12 psadbw m13, m14 psrldq m10, m2, 2 ; 
p0[y=2|4,x=18-23 & y=6|8,x=0-7], 2x0 palignr m12, m8, m7, 10 ; p1[y=10|12,x=18-23] & ; p0[y=10|12,x=16-23] & 2x? psrldq m14, m3, 2 ; p1[y=2|4,x=18-23 & y=6|8,x=0-7], 2x0 shufps m10, m12, q2220 ; p0[y=2|4,x=18-21 & y=6|8,x=2-5 & ; y=10|12,x=18-21 [2x]] shufps m14, m12, q2020 ; p1[y=2|4,x=18-21 & y=6|8,x=2-5 & ; y=10|12,x=18-21] & ; p0[y=10|12,x=18-21] paddw m9, m11 psadbw m10, m14 paddw m9, m13 paddw m9, m10 ; sad(p0,p1) @ dx=0 mova [rsp+8*mmsize+gprsize], m9 .w16_line_top: ; dy=0,dx=-1 palignr m9, m2, m0, 1 ; p0[y=2|4,x=1-16] palignr m10, m3, m1, 3 ; p1[y=2|4,x=3-18] palignr m11, m4, m2, 13 ; p0[y=6|8,x=5-20] palignr m12, m5, m3, 15 ; p1[y=6|8,x=7-22] palignr m13, m8, m6, 1 ; p0[y=10|12,x=1-16] palignr m14, m7, m8, 11 ; p1[y=10|12,x=3-18] psadbw m9, m10 psadbw m11, m12 psadbw m13, m14 psrldq m10, m2, 1 ; p0[y=2|4,x=17-23 & y=6|8,x=0-7], 1x0 pslldq m12, m8, 11 ; 11x0,p0[y=10|12,x=16-20] psrldq m14, m3, 3 ; p1[y=2|4,x=19-23 & y=6|8,x=0-7], 3x0 psrldq m15, m7, 11 ; p1[y=10|12,x=19-23],11x0 shufps m10, m12, q0320 ; p0[y=2|4,x=17-20 & y=6|8,x=1-4 & ; y=10|12,x=17-20], 4x0 shufps m14, m15, q3020 ; p1[y=2|4,x=19-22 & y=6|8,x=3-6 & ; y=10|12,x=19-22], 4x0 paddw m9, m11 psadbw m10, m14 paddw m9, m13 paddw m9, m10 ; sad(p0,p1) @ dx=-1 mova [rsp+7*mmsize+gprsize], m9 ; dx=-2 ; m0 is good as-is ; p0[y=2|4,x=0-15] palignr m10, m3, m1, 4 ; p1[y=2|4,x=4-19] palignr m11, m4, m2, 12 ; p0[y=6|8,x=4-19] ; m5 is good as-is ; p1[y=6|8,x=8-23] ; m6 is good as-is ; p0[y=10|12,x=0-15] palignr m13, m7, m8, 12 ; p1[y=10|12,x=4-19] punpckhqdq m14, m7, m8 ; p1[y=10|12,x=16-23 & x=0-7] shufps m12, m2, m8, q3020 ; p0[y=2|4,x=16-19 & y=6|8,x=0-3 & ; y=10|12,x=16-19] & p1[y=10|12,x=4-7] shufps m14, m3, m14, q3131 ; p1[y=2|4,x=20-23 & y=6|8,x=4-7 & ; y=10|12,x=20-23 & y=10|12,x=4-7] psadbw m9, m0, m10 psadbw m11, m5 psadbw m13, m6, m13 psadbw m14, m12 paddw m9, m11 paddw m13, m14 paddw m9, m13 ; sad(p0,p1) @ dx=-2 mova [rsp+6*mmsize+gprsize], m9 ; dx=+1 palignr m9, m2, m0, 3 ; p0[y=2|4,x=3-18] 
palignr m10, m3, m1, 1 ; p1[y=2|4,x=1-16] palignr m11, m4, m2, 15 ; p0[y=6|8,x=7-22] palignr m12, m5, m3, 13 ; p1[y=6|8,x=5-20] palignr m13, m8, m6, 3 ; p0[y=10|12,x=3-18] palignr m14, m7, m8, 9 ; p1[y=10|12,x=1-16] psadbw m9, m10 psadbw m11, m12 psadbw m13, m14 psrldq m10, m2, 3 ; p0[y=2|4,x=19-23 & y=6|8,x=0-7], 3x0 pslldq m12, m8, 9 ; 9x0,p0[y=10|12,x=16-22] psrldq m14, m3, 1 ; p1[y=2|4,x=17-23 & y=6|8,x=0-7], 1x0 psrldq m15, m7, 9 ; p1[y=10|12,x=17-23],9x0 shufps m10, m12, q0320 ; p0[y=2|4,x=19-22 & y=6|8,x=3-6 & ; y=10|12,x=19-22], 4x0 shufps m14, m15, q3020 ; p1[y=2|4,x=17-20 & y=6|8,x=1-4 & ; y=10|12,x=17-20], 4x0 paddw m9, m11 psadbw m10, m14 paddw m9, m13 paddw m15, m9, m10 ; sad(p0,p1) @ dx=+1 ; dx=+2 palignr m9, m2, m0, 4 ; p0[y=2|4,x=4-19] ; m1 is good as-is ; p1[y=2|4,x=0-15] ; m4 is good as-is ; p0[y=6|8,x=8-23] palignr m11, m5, m3, 12 ; p1[y=6|8,x=4-19] palignr m13, m8, m6, 4 ; p0[y=10|12,x=4-19] palignr m14, m7, m8, 8 ; p1[y=10|12,x=0-15] punpckhqdq m12, m7, m8 ; p1[y=10|12,x=16-23] & ; p1[y=10|12,x=0-7] shufps m10, m2, m8, q3131 ; p0[y=2|4,x=20-23 & y=6|8,x=4-7 & ; y=10|12,x=20-23] & p1[y=10|12,x=4-7] shufps m12, m3, m12, q3020 ; p1[y=2|4,x=16-19 & y=6|8,x=0-3 & ; y=10|12,x=16-19 & y=10|12,x=4-7] psadbw m9, m1 psadbw m11, m4, m11 psadbw m13, m14 psadbw m10, m12 paddw m9, m11 paddw m13, m10 paddw m9, m13 ; sad(p0,p1) @ dx=+2 mova m10, [rsp+6*mmsize+gprsize] mova m11, [rsp+7*mmsize+gprsize] ret .w16_line_bottom_w_origin: ; dy=-2,dx=0 palignr m6, m2, m0, 2 ; p0[y=12|14,x=2-17] palignr m7, m3, m1, 2 ; p1[y=16|18,x=2-17] palignr m8, m4, m2, 14 ; p0[y=16|18,x=6-21] palignr m9, m5, m3, 14 ; p1[y=20|22,x=6-21] vpblendw m10, m2, m3, 01100110b psadbw m6, m7 psadbw m8, m9 psadbw m10, m2 paddw m6, m8 paddw m15, m6, m10 ; fall-through .w16_line_bottom: ; dy=-2,dx=-1 palignr m6, m2, m0, 1 ; p0[y=12|14,x=1-16] palignr m7, m3, m1, 3 ; p1[y=16|18,x=3-18] palignr m8, m4, m2, 13 ; p0[y=16|18,x=5-20] palignr m9, m5, m3, 15 ; p1[y=20|22,x=7-22] psrldq m10, m2, 1 psrldq 
m11, m3, 3 vpblendw m10, m11, 11001100b psadbw m6, m7 psadbw m8, m9 psadbw m10, m11 paddw m6, m8 paddw m12, m6, m10 ; dy=-2,dx=+1 palignr m6, m2, m0, 3 ; p0[y=12|14,x=3-18] palignr m7, m3, m1, 1 ; p1[y=16|18,x=1-16] palignr m8, m4, m2, 15 ; p0[y=16|18,x=7-22] palignr m9, m5, m3, 13 ; p1[y=20|22,x=5-20] psrldq m10, m2, 3 psrldq m11, m3, 1 vpblendw m10, m11, 11001100b psadbw m6, m7 psadbw m8, m9 psadbw m10, m11 paddw m6, m8 paddw m11, m6, m10 ; dy=-2,dx=-2 ; m0 is good as-is ; p0[y=12|14,x=0-15] palignr m6, m3, m1, 4 ; p1[y=16|18,x=4-19] palignr m8, m4, m2, 12 ; p0[y=16|18,x=4-19] ; m5 is good as-is ; p1[y=20|22,x=8-23] shufps m10, m2, m3, q3120 pshufd m9, m3, q3131 psadbw m6, m0 psadbw m8, m5 psadbw m10, m9 paddw m6, m8 paddw m10, m6 ; dy=-2,dx=+2 palignr m6, m2, m0, 4 ; p0[y=12|14,x=4-19] ; m1 is good as-is ; p1[y=16|18,x=0-15] ; m4 is good as-is ; p0[y=16|18,x=8-23] palignr m8, m5, m3, 12 ; p1[y=20|22,x=4-19] shufps m7, m3, m2, q3120 pshufd m9, m2, q3131 psadbw m6, m1 psadbw m8, m4 psadbw m9, m7 paddw m6, m8 paddw m9, m6 ret .w8: imul r9d, 24 mov [rsp], r9d movu xm0, [p0q+2*p0sq] movu xm1, [p1q+2*p1sq] vinserti128 m0, [p0q+4*p0sq], 1 ; p0[y=2|4] vinserti128 m1, [p1q+4*p1sq], 1 ; p1[y=2|4] movu xm2, [p0q+2*p0s3q] movu xm3, [p1q+2*p1s3q] vinserti128 m2, [p0q+8*p0sq], 1 ; p0[y=6|8] vinserti128 m3, [p1q+8*p1sq], 1 ; p1[y=6|8] movu xm4, [p0q+2*p0s5q] movu xm5, [p1q+2*p1s5q] vinserti128 m4, [p0q+4*p0s3q], 1 ; p0[y=10|12] vinserti128 m5, [p1q+4*p1s3q], 1 ; p1[y=10|12] pcmpeqw m6, m6 psrldq m6, 4 ; 12x255,4x0 pslldq m7, m6, 2 ; 2x0,12x255,2x0 pslldq m6, 1 ; 1x0,12x255,3x0 mov r9d, -1 cmp byte r6m, 0 je .w8_main vpblendvb m9, m1, m0, m7 vpblendvb m10, m3, m2, m7 vpblendvb m11, m5, m4, m7 ; p1[2x],p0[12x],p1[2x] psadbw m9, m1 psadbw m10, m3 psadbw m11, m5 ; p0/1 sad on inner 12px paddw m9, m10 paddw m9, m11 cmp hd, 8 je .skip_8x16_thr_acc lea r9, [p0q+4*p0s3q] movu xm10, [r9+2*p0sq] movu xm12, [r9+2*p0s3q] vinserti128 m10, [r9+4*p0sq], 1 vinserti128 m12, [r9+8*p0sq], 1 lea 
r9, [p1q+4*p1s3q] movu xm11, [r9+2*p1sq] movu xm13, [r9+2*p1s3q] vinserti128 m11, [r9+4*p1sq], 1 vinserti128 m13, [r9+8*p1sq], 1 vpblendvb m10, m11, m10, m7 vpblendvb m12, m13, m12, m7 psadbw m10, m11 psadbw m12, m13 ; p0/1 sad on inner 12px paddw m9, m10 paddw m9, m12 .skip_8x16_thr_acc: vextracti128 xm10, m9, 1 paddd xm9, xm10 punpckhqdq xm10, xm9, xm9 paddd xm9, xm10 movd r9d, xm9 inc r9d imul r9d, 7 shr r9d, 3 ; (sad*7+7)>>3 cmp r9d, [rsp] jl .ret_origin ; r9d: best_sad ; m0/2/4: p0[y=2|4,6|8,10|12] ; m1/3/5: p1[y=2|4,6|8,10|12] ; m6: 1x0,12x255,3x0[both lanes] ; m7: 12x255,4x0[both lanes] ; m8: 2x0,12x255,2x0[both lanes] ; m9-15: free .w8_main: mov r6m, r9w ; compare p0[y=2,4,6,8,10,12] with p1[y=2,4,6,8,10,12], i.e. dy=+0 call .w8_line_top pslldq m9, 4 pslldq m10, 6 pslldq m12, 2 por m9, m10 por m11, m12 mova [rsp+2*mmsize], m9 mova [rsp+3*mmsize], m11 ; compare p0[y=0,2,4,6,8,10] with p1[y=4,6,8,10,12,14], i.e. dy=-2 lea r9, [p1s5q+p1sq*2] ; p1s7q vpblendd m15, m1, m4, 11110000b vinserti128 m10, m4, [p0q], 1 ; p0[y=10|0] vinserti128 m1, [p1q+r9*2], 0 ; p1[y=14|4] mova m4, m2 mova m2, m0 mova m0, m10 call .w8_line_top_w_origin pslldq m10, 2 pslldq m8, 4 pslldq m11, 6 por m9, m10 por m8, m11 mova [rsp+1*mmsize], m12 por m8, m9 mova [rsp+0*mmsize], m8 ; compare p0[y=4,6,8,10,12,14] with p1[y=0,2,4,6,8,10], i.e. 
dy=+2 lea r9, [p0s5q+p0sq*2] ; p0s7q vinserti128 m5, [p1q], 1 ; p1[y=10|0] vpblendd m1, m15, 00001111b ; p1[y=2|4] vinserti128 m10, m2, [p0q+r9*2], 0 ; p0[y=14|4] vpblendd m2, m0, m15, 11110000b ; p0[y=10|12] mova m0, m4 mova m4, m10 call .w8_line_top_w_origin pslldq m8, 2 pslldq m11, 4 pslldq m12, 6 pslldq m9, 6 por m10, m8 por m11, m12 mova [rsp+4*mmsize], m9 por m10, m11 mova [rsp+5*mmsize], m10 ; r6m: best_sad ; m0-5: free ; m6: 1x0,12x255,3x0[both lanes] ; m7: 2x0,12x255,2x0[both lanes] ; m9-15: free ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 ; r2: sad(p0,p1) @ dy=+0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=+0,dx=[+1..+2] ; r4: sad(p0,p1) @ dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[-1..+2] ; compare p0[y=1,3,5,7,9,11] with p1[y=3,5,7,9,11,13], i.e. dy=-1 lea r9, [p1q+p1s3q] movu xm1, [r9] movu xm3, [r9+p1sq*4] movu xm5, [r9+p1sq*8] vinserti128 m1, [r9+p1sq*2], 1 ; p1[y=3|5] vinserti128 m3, [r9+p1s3q*2], 1 ; p1[y=7|9] vinserti128 m5, [r9+p1s5q*2], 1 ; p1[y=11|13] lea r9, [p0q+p0s3q] movu xm0, [p0q+p0sq] movu xm2, [r9+p0sq*2] movu xm4, [r9+p0s3q*2] vinserti128 m0, [r9], 1 ; p0[y=1|3] vinserti128 m2, [r9+p0sq*4], 1 ; p0[y=5|7] vinserti128 m4, [r9+p0sq*8], 1 ; p0[y=9|11] call .w8_line_top_w_origin pslldq m9, 2 pslldq m10, 4 pslldq m8, 6 pslldq m12, 2 por m9, [rsp+1*mmsize] por m10, m8 por m11, m12 por m9, m10 por m15, m11, [rsp+2*mmsize] mova [rsp+1*mmsize], m9 ; r6m: best_sad ; m0/2/4: p0[y=1|3,5|7,9|11] ; m1/3/5: p1[y=3|5,7|9,11|13] ; m6: 1x0,12x255,3x0[both lanes] ; m7: 2x0,12x255,2x0[both lanes] ; m9-14: free ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 & dy=-1,dx=[-2..0] ; m15: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=+0,dx=[-2..-1] ; r3: sad(p0,p1) @ dy=+0,dx=[+1..+2] ; r4: sad(p0,p1) @ dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[-1..+2] ; compare p0[y=3,5,7,9,11,13] with p1[y=1,3,5,7,9,11], i.e. 
dy=+1 vinserti128 m10, m0, [r9+p0s5q*2], 0 ; p0[y=13|3] vinserti128 m5, [p1q+p1sq], 1 ; p1[y=11|1] mova m0, m2 mova m2, m4 mova m4, m10 call .w8_line_top_w_origin pslldq m9, 4 pslldq m10, 6 pslldq m11, 2 pslldq m12, 4 por m9, m10 por m8, m11 por m12, [rsp+4*mmsize] por m13, m9, [rsp+3*mmsize] por m14, m8, m12 ; r6m: best_sad ; m0-5: free ; m6: 1x0,12x255,3x0[both lanes] ; m7: 2x0,12x255,2x0[both lanes] ; m9-12: free ; r0: sad(p0,p1) @ dy=-2,dx=[-2..+1] ; r1: sad(p0,p1) @ dy=-2,dx=+2 & dy=-1,dx=[-2..0] ; m15: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=+0,dx=[-2..-1] ; m13: sad(p0,p1) @ dy=+0,dx=[+1..+2] & dy=+1,dx=[-2..-1] ; m14: sad(p0,p1) @ dy=-1,dx=[0..+2] & dy=+2,dx=-2 ; r5: sad(p0,p1) @ dy=+2,dx=[-1..+2] cmp hd, 8 je .skip_8x16_main ; compare p0[y=14|16,18|12] with p1[y=18|20,22|16], i.e. dy=-2 lea p0q, [p0q+p0s3q*4] ; &p0[y=12] lea p1q, [p1q+p1s3q*4] ; &p1[y=12] movu xm0, [p0q+p0sq*2] movu xm3, [p1q+p1s5q*2] vinserti128 m0, [p0q+p0sq*4], 1 ; p0[y=14|16] vinserti128 m3, [p1q+p1sq*4], 1 ; p1[y=22|16] movu xm2, [p0q+p0s3q*2] movu xm1, [p1q+p1s3q*2] vinserti128 m2, [p0q], 1 ; p0[y=18|12] vinserti128 m1, [p1q+p1sq*8], 1 ; p1[y=18|20] call .w8_line_bottom_w_origin pslldq m8, 2 pslldq m4, 4 pslldq m9, 6 por m5, m8 por m4, m9 por m5, m4 paddw m10, [rsp+1*mmsize] paddw m5, [rsp+0*mmsize] mova [rsp+1*mmsize], m10 mova [rsp+0*mmsize], m5 ; compare p0[y=14|16,18|20] with p1[y=14|16,18|20], i.e. dy=0 vinserti128 m10, m2, [p0q+p0sq*8], 1 ; p0[y=18|20] vinserti128 m3, [p1q+p1sq*2], 0 ; p1[y=14|16] mova m2, m0 mova m0, m10 call .w8_line_bottom pslldq m5, 4 pslldq m8, 6 pslldq m10, 2 por m5, m8 por m9, m10 paddw m15, m5 paddw m13, m9 ; compare p0[y=18|20,22|16] with p1[y=14|16,18|12], i.e. 
dy=+2 vinserti128 m10, m2, [p0q+p0s5q*2], 0 ; p0[y=22|16] vinserti128 m1, [p1q], 1 ; p1[y=18|12] mova m2, m0 mova m0, m10 call .w8_line_bottom_w_origin pslldq m5, 6 pslldq m4, 2 pslldq m9, 4 pslldq m10, 6 por m8, m4 por m9, m10 por m8, m9 paddw m14, m5 paddw m12, m8, [rsp+5*mmsize] ; compare p0[y=13|15,17|19] with p1[y=15|17,19|21], i.e. dy=-1 lea r9, [p1q+p1s3q*2] movu xm1, [p1q+p1s3q] movu xm3, [r9+p1sq] vinserti128 m1, [p1q+p1s5q], 1 ; p1[y=15|17] vinserti128 m3, [r9+p1s3q], 1 ; p1[y=19|21] lea r9, [p0q+p0s3q*2] movu xm0, [p0q+p0sq] movu xm2, [p0q+p0s5q] vinserti128 m0, [p0q+p0s3q], 1 ; p0[y=13|15] vinserti128 m2, [r9+p0sq], 1 ; p0[y=15|17] call .w8_line_bottom_w_origin pslldq m5, 2 pslldq m8, 4 pslldq m4, 6 pslldq m10, 2 por m5, m8 por m9, m10 por m5, m4 paddw m5, [rsp+1*mmsize] paddw m15, m9 mova [rsp+1*mmsize], m5 ; compare p0[y=21|15,17|19] with p1[y=19|13,15|17], i.e. dy=+1 vinserti128 m10, m0, [r9+p0s3q], 0 ; p0[y=21|15] vinserti128 m3, [p1q+p1sq], 1 ; p1[y=19|13] mova m0, m2 mova m2, m10 call .w8_line_bottom_w_origin pslldq m9, 2 pslldq m10, 4 pslldq m5, 4 pslldq m8, 6 por m4, m9 por m5, m8 por m4, m10 paddw m13, m5 paddw m14, m4 jmp .skip_8x16_main_noreload .skip_8x16_main: mova m12, [rsp+5*mmsize] .skip_8x16_main_noreload: mova m10, [rsp+0*mmsize] mova m11, [rsp+1*mmsize] ; aggregate punpckhqdq m0, m10, m11 punpcklqdq m10, m11 paddw m0, m10 ; sad(p0,p1) @ dy=-2,dx=[-2..+2] & ; dy=-1,dx=[-2..0] punpckhqdq m1, m15, m13 punpcklqdq m15, m13 paddw m1, m15 ; sad(p0,p1) @ dy=-1,dx=[+1..+2] & ; dy=0,dx=[-2..+2] & ; dy=+1,dx=[-2..-1] punpckhqdq m2, m14, m12 punpcklqdq m14, m12 paddw m2, m14 ; sad(p0,p1) @ dy=+1,dx=[0..+2] & ; dy=+2,dx=[-2..+2] ; r6m: best_sad ; m9: sad(p0,p1) @ dy=-1,dx=[+1..+2] & dy=0,dx=[-2..+2] & dy=+1,dx=[-2..-1] ; m11: sad(p0,p1) @ dy=-2,dx=[-2..+2] & dy=-1,dx=[-2..0] ; m12: sad(p0,p1) @ dy=+1,dx=[0..+2] & dy=+2,dx=[-2..+2] vextracti128 xm3, m0, 1 vextracti128 xm4, m1, 1 vextracti128 xm5, m2, 1 paddw xm0, xm3 paddw xm1, xm4 paddw xm2, xm5 
REPX {phminposuw x, x}, xm0, xm1, xm2 movd r0d, xm0 movd r1d, xm1 movd r2d, xm2 or r1d, 0x80000 or r2d, 0x100000 cmp r1w, r0w cmovb r0, r1 cmp r2w, r0w cmovb r0, r2 ; best_sad, excluding origin cmp r0w, word r6m jnb .ret_origin mov r1, r7m shr r0, 16 lea r2, [sadrefinemv_idx2off] mov r0w, [r2+r0*2] mov word [r1], r0w RET .w8_line_top_w_origin: ; dx=0 vpblendvb m8, m0, m1, m7 vpblendvb m9, m2, m3, m7 vpblendvb m10, m4, m5, m7 ; mask out left 2px & right 2px psadbw m8, m0 psadbw m9, m2 psadbw m10, m4 paddw m8, m9 paddw m8, m10 ; sad(p0,p1) @ dx=0 ; fall-through .w8_line_top: ; dx=-2 psrldq m9, m1, 4 ; p1[y=2|4,x=4-15] psrldq m10, m3, 4 ; p1[y=6|8,x=4-15] psrldq m11, m5, 4 ; p1[y=10|12,x=4-15] vpblendd m9, m0, m9, 01110111b vpblendd m10, m2, m10, 01110111b vpblendd m11, m4, m11, 01110111b ; mask out right 4px psadbw m9, m0 psadbw m10, m2 psadbw m11, m4 paddw m9, m10 paddw m9, m11 ; sad(p0,p1) @ dx=-2 ; dx=-1 psrldq m10, m1, 2 ; p1[y=2|4,x=2-15] psrldq m11, m3, 2 ; p1[y=6|8,x=2-15] psrldq m12, m5, 2 ; p1[y=10|12,x=2-15] vpblendvb m10, m0, m10, m6 vpblendvb m11, m2, m11, m6 vpblendvb m12, m4, m12, m6 ; mask out left 1px & right 3px psadbw m10, m0 psadbw m11, m2 psadbw m12, m4 paddw m10, m11 paddw m10, m12 ; sad(p0,p1) @ dy=-2,dx=-1 ; dx=+1 psrldq m11, m0, 2 ; p0[y=2|4,x=2-15] psrldq m12, m2, 2 ; p0[y=6|8,x=2-15] psrldq m13, m4, 2 ; p0[y=10|12,x=2-15] vpblendvb m11, m1, m11, m6 vpblendvb m12, m3, m12, m6 vpblendvb m13, m5, m13, m6 ; mask out left 1px & right 3px psadbw m11, m1 psadbw m12, m3 psadbw m13, m5 paddw m11, m12 paddw m11, m13 ; sad(p0,p1) @ dx=+1 ; dx=+2 psrldq m12, m0, 4 ; p0[y=2|4,x=4-15] psrldq m13, m2, 4 ; p0[y=6|8,x=4-15] psrldq m14, m4, 4 ; p0[y=10|12,x=4-15] vpblendd m12, m1, m12, 01110111b vpblendd m13, m3, m13, 01110111b vpblendd m14, m5, m14, 01110111b ; mask out right 4px psadbw m12, m1 psadbw m13, m3 psadbw m14, m5 paddw m12, m13 paddw m12, m14 ; sad(p0,p1) @ dx=+2 ret .w8_line_bottom_w_origin: ; dx=0 vpblendvb m4, m0, m1, m7 vpblendvb m5, m2, m3, 
m7 ; mask out left 2px & right 2px psadbw m4, m0 psadbw m5, m2 paddw m4, m5 ; sad(p0,p1) @ dx=0 .w8_line_bottom: ; dy=-2,dx=-2 psrldq m5, m1, 4 ; p1[y=14|16,x=4-15] psrldq m8, m3, 4 ; p1[y=18|20,x=4-15] vpblendd m5, m0, m5, 01110111b vpblendd m8, m2, m8, 01110111b ; mask out right 4px psadbw m5, m0 psadbw m8, m2 paddw m5, m8 ; sad(p0,p1) @ dx=-2 ; dy=-2,dx=-1 psrldq m8, m1, 2 ; p1[y=14|16,x=2-15] psrldq m9, m3, 2 ; p1[y=18|20,x=2-15] vpblendvb m8, m0, m8, m6 vpblendvb m9, m2, m9, m6 ; mask out left 1px & right 3px psadbw m8, m0 psadbw m9, m2 paddw m8, m9 ; sad(p0,p1) @ dx=-1 ; dy=-2,dx=+1 psrldq m9, m0, 2 ; p0[y=14|16,x=2-15] psrldq m10, m2, 2 ; p0[y=18|20,x=2-15] vpblendvb m9, m1, m9, m6 vpblendvb m10, m3, m10, m6 ; mask out left 1px & right 3px psadbw m9, m1 psadbw m10, m3 paddw m9, m10 ; sad(p0,p1) @ dx=+1 ; dy=-2,dx=+2 psrldq m10, m0, 4 ; p0[y=14|16,x=4-15] psrldq m11, m2, 4 ; p0[y=18|20,x=4-15] vpblendd m10, m1, m10, 01110111b vpblendd m11, m3, m11, 01110111b ; mask out right 4px psadbw m10, m1 psadbw m11, m3 paddw m10, m11 ; sad(p0,p1) @ dx=+2 ret .ret_origin: mov r0, r7m mov word [r0], 0 ; o->y = o->x = 0 RET cglobal opfl_derive_mv_8bpc, 5, 7, 0, out, p0, p0s, p1, p1s, w, h, bs, reldist movifnidn wd, wm cmp wd, 16 jge .w16 %macro opfl_loop 1 ; bs mov hd, hm vpbroadcastd m6, [pb_1_m1] vpbroadcastw m14, reldistm mova m15, [pw_4x84a10_4x42a5] movq xm0, [p0q] movq xm1, [p1q] vinserti128 m0, [p0q+p0sq*1], 1 vinserti128 m1, [p1q+p1sq*1], 1 punpcklbw m0, m1 pmaddubsw m1, m0, m6 ; tmp1[y=0|1] pmaddubsw m0, m14 ; tmp0[y=0|1] vpermq m2, m0, q1010 ; tmp0[y=0|0]=q1 mova m3, m2 ; q0 sub hd, 2 ; aggregators %if %1 == 4 vpbroadcastq xm7, [pq_16] ; bs * bs for u*u or v*v %else movd xm7, [pd_64] %endif REPX {pxor x, x}, xm8, xm10, xm11 mova xm9, xm7 jmp %%loop_skipconst %%loop: vpbroadcastd m6, [pb_1_m1] vpbroadcastw m14, reldistm vpbroadcastd m15, [pw_4x84a10_4x42a5+16] %%loop_skipconst: ; m0=tmp0[cur line] ; m1=tmp1[cur line] ; m2=q1 ; m3=q0 ; m6=pb_1_m1 [weights for tmp0] 
; m7-11: res ; m4-5,12-13=free ; m14=reldistm [weights for tmp1] ; m15=gy0 weights (some permutation of [pw_4x84a10_4x42a5]) lea p0q, [p0q+p0sq*2] lea p1q, [p1q+p1sq*2] movq xm4, [p0q] movq xm5, [p1q] vinserti128 m4, [p0q+p0sq*1], 1 vinserti128 m5, [p1q+p1sq*1], 1 punpcklbw m4, m5 pmaddubsw m5, m4, m6 ; tmp1[y=2|3] pmaddubsw m4, m14 ; tmp0[y=2|3]=q3 vperm2i128 m6, m0, m4, 0x21 ; tmp0[y=1|2]=q2 ; m0=tmp0[cur line] ; m1=tmp1[cur line] ; m2=q1 ; m3=q0 ; m4=tmp0[next line]=q3 ; m5=tmp1[next line] ; m6=q2 ; m7-11=res [u*u, u*v, v*v, u*w, v*w] ; m12-13=free ; m14=reldistm [weights for tmp1] ; m15=gy0 weights (some permutation of [pw_4x84a10_4x42a5]) %%loop_skipload: ; gy0 vpbroadcastd m14, [pd_64] psubw m2, m6, m2 ; (q2 - q1)[y=0|1] psubw m3, m4 ; (q0 - q3)[y=0|1] punpckhwd m13, m2, m3 punpcklwd m2, m3 REPX {pmaddwd x, m15}, m13, m2 psrad m12, m13, 31 psrad m3, m2, 31 REPX {paddd x, m14}, m13, m2 paddd m13, m12 paddd m2, m3 REPX {psrad x, 7 }, m13, m2 packssdw m2, m13 ; gy0[y=0|1]=v ; m0=tmp0[cur line] ; m1=tmp1[cur line] ; m2=gy0 ; m4=tmp0[next line]=q3 ; m5=tmp1[next line] ; m6=q2 ; m7-11=res [u*u, u*v, v*v, u*w, v*w] ; m3,12-13,15=free ; m14=pd_64 ; gx0 vbroadcasti128 m3, [shuf_left_1w] vbroadcasti128 m15, [shuf_right_1w] pshufb m12, m0, m3 ; p1 pshufb m13, m0, m15 ; p2 pshufb m3, m12, m3 ; p0 pshufb m15, m13, m15 ; p3 psubw m13, m12 ; (p2 - p1)[y=0|1] vbroadcasti128 m12, [pw_3x42a5_1x84a10] psubw m3, m15 ; (p0 - p3)[y=0|1] punpckhwd m15, m13, m3 punpcklwd m13, m3 pshufd m3, m12, q0123 ; pw_1x84a10_3x42a5 pmaddwd m15, m12 pmaddwd m13, m3 psrad m12, m15, 31 psrad m3, m13, 31 REPX {paddd x, m14}, m15, m13 paddd m15, m12 paddd m3, m13 REPX {psrad x, 7 }, m15, m3 packssdw m3, m15 ; gx0[y=0|1]=u ; m0=tmp0[cur line] ; m1=tmp1[cur line] [w] ; m2=gy0 [v] ; m3=gx0 [u] ; m4=tmp0[next line]=q3 ; m5=tmp1[next line] ; m6=q2 ; m7-11=res [u*u, u*v, v*v, u*w, v*w] ; m12-15=free ; regression data pmaddwd m12, m1, m2 ; v*w pmaddwd m1, m3 ; u*w pmaddwd m15, m2, m3 ; u*v REPX {pmaddwd x, 
x}, m2, m3 ; v*v and u*u
    ; fold this row pair's products into the running sums
    ; (m7=u*u, m8=u*v, m9=v*v, m10=u*w, m11=v*w)
    paddd m11, m12
    paddd m10, m1
    paddd m8, m15
    paddd m9, m2
    paddd m7, m3
    ; %1 is the macro's bs argument: when the row counter hd (loaded from hm,
    ; decremented by 2 per row pair) has none of the bits in (bs-1) set, the
    ; sums are reduced and stored at %%end
    test hd, %1-1
    jz %%end
    sub hd, 2
%%noret:
    ; advance the row window by one row pair. mova leaves EFLAGS untouched,
    ; so on the fall-through path the jg below tests the result of the
    ; `sub hd, 2` above.
    ; NOTE(review): when entered via `jmp %%noret` the last flag-setting
    ; instruction is `add outq, ...` rather than `sub hd, 2` - presumably
    ; always "greater" for a valid pointer; confirm.
    mova m3, m0
    mova m0, m4
    mova m1, m5
    mova m2, m6
    jg %%loop
    ; last line
    vpermq m4, m4, q3232 ; tmp0[y=3|3]=q2 & q3
    vpermq m15, [pw_4x84a10_4x42a5], q1032
    mova m6, m4
    jmp %%loop_skipload
%%end:
    ; m7-11=res [u*u, u*v, v*v, u*w, v*w]
%if %1 == 8
    ; bs == 8: reduce every accumulator to a single dword and store
    ; out[0..4] = u*u, u*v, v*v, u*w, v*w
    phaddd m7, m8
    phaddd m9, m10
    vextracti128 xm12, m11, 1
    phaddd m7, m9
    paddd xm11, xm12
    vextracti128 xm8, m7, 1
    punpckhqdq xm12, xm11, xm11
    paddd xm7, xm8
    paddd xm11, xm12
    psrlq xm12, xm11, 32
    paddd xm11, xm12
    movu [outq+0*4], xm7
    movd [outq+4*4], xm11
%else
    ; bs == 4: each accumulator is reduced to two dwords; the results are
    ; stored as two groups of five dwords at out[0..4] and out[5..9]
    phaddd m7, m9
    phaddd m8, m10
    vextracti128 xm12, m11, 1
    vextracti128 xm9, m7, 1
    vextracti128 xm10, m8, 1
    paddd xm11, xm12
    paddd xm7, xm9
    psrlq xm12, xm11, 32
    paddd xm8, xm10
    paddd xm11, xm12
    punpckhdq xm9, xm7, xm8
    punpckldq xm7, xm8
    punpcklqdq xm8, xm7, xm9
    punpckhqdq xm7, xm9
    movu [outq+0*4], xm8
    movd [outq+4*4], xm11
    movu [outq+5*4], xm7
    pextrd [outq+9*4], xm11, 2
%endif
    sub hd, 2
    jl %%ret
    ; step past what was just written: 5 dwords for bs=8, 10 dwords for bs=4
    add outq, 5*(32/%1)
    ; re-seed the accumulators for the next group of rows
%if %1 == 4
    vpbroadcastq xm7, [pq_16] ; bs * bs for u*u or v*v
%else
    movd xm7, [pd_64]
%endif
    REPX {pxor x, x}, xm8, xm10, xm11
    mova xm9, xm7
    jmp %%noret
%%ret:
    RET
%endmacro

    ; w < 16: redo the prologue with 16 vector registers, then dispatch on bs
    PROLOGUE 5, 7, 16, out, p0, p0s, p1, p1s, w, h, bs, reldist
    cmp dword bsm, 4
    je .bs4
    opfl_loop 8
.bs4:
    opfl_loop 4

.w16:
    ; Stack layout: 12 mmsize-sized vector slots ([rsp+0*mmsize] up to
    ; [rsp+11*mmsize]) followed by the 4 GPR slots written just below, i.e.
    ; the size is the SUM 12*mmsize + 4*gprsize. It used to be written as a
    ; product (12*mmsize*4*gprsize), which reserves far more stack than the
    ; highest offset used here.
    ; NOTE(review): keep the expression unparenthesised and starting with a
    ; number - x86inc presumably recognises the stack-size argument via
    ; %ifnum; confirm against x86inc.asm.
    PROLOGUE 5, 8, 16, 12 * mmsize + 4 * gprsize, out, p0, p0s, p1, p1s, w, h, bs, reldist
    ; r7d = (w*5) >> 1
    lea r7d, [wd*5]
    shr r7d, 1
    ; spill the base pointers and r7 above the vector slots
    mov [rsp+12*mmsize+0*gprsize], p0q
    mov [rsp+12*mmsize+1*gprsize], p1q
    mov [rsp+12*mmsize+2*gprsize], outq
    mov [rsp+12*mmsize+3*gprsize], r7
.w16_loop:
    mov hd, hm
    vpbroadcastd m13, [pb_1_m1]
    vpbroadcastw m14, reldistm
    mova m15, [pw_4x84a10_4x42a5]
    ; load 16 pixels of rows 0 and 1 from p0/p1 (one row per 128-bit lane)
    ; and interleave p0/p1 bytes for pmaddubsw
    movu xm0, [p0q]
    movu xm2, [p1q]
    vinserti128 m0, [p0q+p0sq*1], 1
    vinserti128 m2, [p1q+p1sq*1], 1
    punpckhbw m1, m0, m2
    punpcklbw m0, m2
    pmaddubsw m7, m1, m13
    pmaddubsw m6, m0, m13 ; tmp1[y=0|1]
    REPX {pmaddubsw x, m14}, m0, m1 ; tmp0[y=0|1]
    vpermq m2, m0, q1010
    vpermq m3, m1, q1010 ;
tmp0[y=0|0]=q1 mova m4, m2 mova m5, m3 ; q0 sub hd, 2 ; aggregators movd xm10, [pd_64] pxor xm11, xm11 mova [rsp+ 2*mmsize], m10 mova [rsp+ 3*mmsize], m11 mova [rsp+ 4*mmsize], m10 mova [rsp+ 5*mmsize], m11 mova [rsp+ 6*mmsize], m11 mova [rsp+ 7*mmsize], m10 mova [rsp+ 8*mmsize], m11 mova [rsp+ 9*mmsize], m10 mova [rsp+10*mmsize], m11 mova [rsp+11*mmsize], m11 jmp .loop_skipconst .loop: vpbroadcastd m13, [pb_1_m1] vpbroadcastw m14, reldistm vpbroadcastd m15, [pw_4x84a10_4x42a5+16] .loop_skipconst: ; m0-1=tmp0[cur line] ; m2-3=q1 ; m4-5=q0 ; m6-7=tmp1[cur line] ; m8-12=free ; m13=[pb_1_m1] ; m14=reldistm [weights for tmp1] ; m15=gy0 weights (some permutation of [pw_4x84a10_4x42a5]) lea p0q, [p0q+p0sq*2] lea p1q, [p1q+p1sq*2] movu xm8, [p0q] movu xm10, [p1q] vinserti128 m8, [p0q+p0sq*1], 1 vinserti128 m10, [p1q+p1sq*1], 1 punpckhbw m9, m8, m10 punpcklbw m8, m10 pmaddubsw m11, m9, m13 pmaddubsw m10, m8, m13 ; tmp1[y=2|3] mova [rsp+0*mmsize], m10 mova [rsp+1*mmsize], m11 REPX {pmaddubsw x, m14}, m8, m9 ; tmp0[y=2|3]=q3 vperm2i128 m12, m0, m8, 0x21 vperm2i128 m13, m1, m9, 0x21 ; tmp0[y=1|2]=q2 ; m0-1=tmp0[cur line] ; m2-3=q1 ; m4-5=q0 ; m6-7=tmp1[cur line] ; m8-9=tmp0[next line]=q3 ; m10-11=free ; m12-13=q2 ; m14=reldistm [weights for tmp1] ; m15=gy0 weights (some permutation of [pw_4x84a10_4x42a5]) ; r0-1=tmp1[next line] ; r2-11=res .loop_skipload: ; gy0 psubw m2, m12, m2 psubw m3, m13, m3 ; (q2 - q1)[y=0|1] psubw m4, m8 psubw m5, m9 ; (q0 - q3)[y=0|1] punpckhwd m14, m2, m4 punpcklwd m2, m4 punpckhwd m4, m3, m5 punpcklwd m3, m5 vpbroadcastd m5, [pd_64] REPX {pmaddwd x, m15}, m14, m2, m4, m3 psrad m10, m14, 31 psrad m11, m2, 31 REPX {paddd x, m5 }, m14, m2 paddd m14, m10 paddd m2, m11 REPX {psrad x, 7 }, m14, m2 psrad m10, m4, 31 psrad m11, m3, 31 REPX {paddd x, m5 }, m4, m3 paddd m4, m10 paddd m3, m11 REPX {psrad x, 7 }, m4, m3 packssdw m2, m14 packssdw m3, m4 ; gy0[y=0|1]=v ; m0-1=tmp0[cur line] ; m2-3=gy0 ; m5=pd_64 ; m6-7=tmp1[cur line] ; m8-9=tmp0[next line]=q3 ; 
m12-13=q2 ; m4,10-11,14-15=free ; r0-1=tmp1[next line] ; r2-11=res ; gx0 vbroadcasti128 m14, [shuf_left_1w] vbroadcasti128 m15, [shuf_right_1w] pshufb m4, m0, m14 ; p1 palignr m11, m1, m0, 2 ; p2 pshufb m14, m4, m14 ; p0 palignr m10, m1, m0, 4 ; p3 psubw m11, m4 ; (p2 - p1)[y=0|1,x=0-7] psubw m14, m10 ; (p0 - p3)[y=0|1,x=0-7] pshufb m4, m1, m15 ; p2 palignr m10, m1, m0, 14 ; p1 pshufb m15, m4, m15 ; p3 palignr m5, m1, m0, 12 ; p0 psubw m4, m10 ; (p2 - p1)[y=0|1,x=8-15] psubw m5, m15 ; (p0 - p3)[y=0|1,x=8-15] vbroadcasti128 m10, [pw_3x42a5_1x84a10] punpckhwd m15, m11, m14 punpcklwd m11, m14 punpckhwd m14, m4, m5 punpcklwd m4, m5 pshufd m5, m10, q0123 ; pw_1x84a10_3x42a5 pmaddwd m14, m10 pshufd m10, m10, q1111 ; pw_4x42a5 pmaddwd m11, m5 REPX {pmaddwd x, m10}, m15, m4 psrad m5, m11, 31 psrad m10, m15, 31 paddd m11, m5 paddd m15, m10 psrad m5, m4, 31 psrad m10, m14, 31 paddd m5, m4 vpbroadcastd m4, [pd_64] paddd m14, m10 REPX {paddd x, m4 }, m11, m15, m5, m14 REPX {psrad x, 7 }, m11, m15, m5, m14 packssdw m4, m11, m15 packssdw m5, m14 ; gx0[y=0|1]=u ; m0-1=tmp0[cur line] ; m2-3=gy0 ; m4-5=gx0 ; m6-7=tmp1[cur line] ; m8-9=tmp0[next line]=q3 ; m12-13=q2 ; m10-11,14-15=free ; r0-1=tmp1[next line] ; r2-11=res ; regression data pmaddwd m14, m2, m6 pmaddwd m15, m3, m7 ; v*w pmaddwd m6, m4 pmaddwd m7, m5 ; u*w pmaddwd m10, m2, m4 pmaddwd m11, m3, m5 ; u*v REPX {pmaddwd x, x}, m2, m3, m4, m5 ; v*v and u*u paddd m4, [rsp+ 2*mmsize] paddd m10, [rsp+ 3*mmsize] paddd m2, [rsp+ 4*mmsize] paddd m6, [rsp+ 5*mmsize] paddd m14, [rsp+ 6*mmsize] paddd m5, [rsp+ 7*mmsize] paddd m11, [rsp+ 8*mmsize] paddd m3, [rsp+ 9*mmsize] paddd m7, [rsp+10*mmsize] paddd m15, [rsp+11*mmsize] test hd, 7 jz .end sub hd, 2 mova [rsp+ 2*mmsize], m4 mova [rsp+ 3*mmsize], m10 mova [rsp+ 4*mmsize], m2 mova [rsp+ 5*mmsize], m6 mova [rsp+ 6*mmsize], m14 mova [rsp+ 7*mmsize], m5 mova [rsp+ 8*mmsize], m11 mova [rsp+ 9*mmsize], m3 mova [rsp+10*mmsize], m7 mova [rsp+11*mmsize], m15 .noret: mova m4, m0 mova m5, m1 
mova m0, m8 mova m1, m9 mova m2, m12 mova m3, m13 mova m6, [rsp+0*mmsize] mova m7, [rsp+1*mmsize] jg .loop ; last line REPX {vpermq x, x, q3232}, m8, m9 ; tmp0[y=3|3]=q2 & q3 vpermq m15, [pw_4x84a10_4x42a5], q1032 mova m12, m8 mova m13, m9 jmp .loop_skipload .end: ; m4-5,10-11,2-3,6-7,14-15=res [u*u, u*v, v*v, u*w, v*w] phaddd m4, m10 phaddd m2, m6 phaddd m14, m5 phaddd m11, m3 phaddd m7, m15 phaddd m4, m2 psrlq m15, m7, 32 phaddd m14, m11 paddd m7, m15 vextracti128 xm2, m4, 1 vextracti128 xm11, m14, 1 vextracti128 xm15, m7, 1 paddd xm4, xm2 paddd xm14, xm11 paddd xm7, xm15 movu [outq+0*4], xm4 pshufd xm7, xm7, q3120 movu [outq+4*4], xm14 movq [outq+8*4], xm7 sub hd, 2 jl .next_block add outq, [rsp+12*mmsize+3*gprsize] movd xm10, [pd_64] pxor xm11, xm11 mova [rsp+ 2*mmsize], m10 mova [rsp+ 3*mmsize], m11 mova [rsp+ 4*mmsize], m10 mova [rsp+ 5*mmsize], m11 mova [rsp+ 6*mmsize], m11 mova [rsp+ 7*mmsize], m10 mova [rsp+ 8*mmsize], m11 mova [rsp+ 9*mmsize], m10 mova [rsp+10*mmsize], m11 mova [rsp+11*mmsize], m11 jmp .noret .next_block: sub wd, 16 jz .ret mov p0q, [rsp+12*mmsize+0*gprsize] mov p1q, [rsp+12*mmsize+1*gprsize] mov outq, [rsp+12*mmsize+2*gprsize] REPX {add x, 16}, p0q, p1q add outq, 4*5*2 mov [rsp+12*mmsize+0*gprsize], p0q mov [rsp+12*mmsize+1*gprsize], p1q mov [rsp+12*mmsize+2*gprsize], outq jmp .w16_loop .ret: RET INIT_XMM avx2 cglobal sad8x8_8bpc, 4, 6, 6, p0, p0s, p1, p1s, p0s3, p1s3 lea p0s3q, [p0sq*3] lea p1s3q, [p1sq*3] movq m0, [p0q] movq m1, [p1q] movhps m0, [p0q+p0sq] movhps m1, [p1q+p1sq] movq m2, [p0q+p0sq*2] movq m3, [p1q+p1sq*2] movhps m2, [p0q+p0s3q] movhps m3, [p1q+p1s3q] lea p0q, [p0q+p0sq*4] lea p1q, [p1q+p1sq*4] psadbw m0, m1 psadbw m2, m3 movq m1, [p0q] movq m3, [p1q] movhps m1, [p0q+p0sq] movhps m3, [p1q+p1sq] movq m4, [p0q+p0sq*2] movq m5, [p1q+p1sq*2] movhps m4, [p0q+p0s3q] movhps m5, [p1q+p1s3q] psadbw m1, m3 psadbw m4, m5 paddd m0, m2 paddd m1, m4 paddd m0, m1 punpckhqdq m1, m0, m0 paddd m0, m1 movd eax, m0 RET 
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc_avx512.asm000066400000000000000000005577211517466257200235100ustar00rootroot00000000000000; Copyright © 2020, VideoLAN and dav2d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 pd_16384: dd 16384 pd_262144: dd 262144 warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54 warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 wm_420_mask: db 3, 7, 11, 15, 
19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 bilin_h_perm16: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40 db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48 bilin_h_perm32: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16 db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24 db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32 bilin_v_perm8: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 db 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 db 80, 32, 81, 33, 82, 34, 83, 35, 84, 36, 85, 37, 86, 38, 87, 39 db 32, 64, 33, 65, 34, 66, 35, 67, 36, 68, 37, 69, 38, 70, 39, 71 bilin_v_perm16: db 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 db 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 db 16, 64, 17, 65, 18, 66, 19, 67, 20, 68, 21, 69, 22, 70, 23, 71 db 24, 72, 25, 73, 26, 74, 27, 75, 28, 76, 29, 77, 30, 78, 31, 79 bilin_v_perm32: db 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71 db 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79 db 16, 80, 17, 
81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87 db 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 bilin_v_perm64: dd 0, 0, 4, 8, 1, 1, 5, 9, 2, 2, 6, 10, 3, 3, 7, 11 spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 spel_h_perm32: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 spel_v_perm8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 8, 16, 9, 17, 10, 18, 11, 19, 12, 20, 13, 21, 14, 22, 15, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 db 24, 32, 25, 33, 26, 34, 27, 35, 28, 36, 29, 37, 30, 38, 31, 39 spel_v_perm16a: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm16b: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 db 32, 
33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_hv_perm8b: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 spel_hv_perm16b:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 
12, 13, 14 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_v_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: dq 0xfedcba9876543210 wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 pb_4: times 4 db 4 pb_32: times 4 db 32 pb_127: times 4 db 127 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 pw_512: times 2 dw 512 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) %define pd_2 (pd_0to7+8) cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ 
_prep_bilin_8bpc_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 SECTION .text %macro WRAP_YMM 1+ INIT_YMM cpuname %1 INIT_ZMM cpuname %endmacro INIT_ZMM avx512icl cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova 
[dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 vbroadcasti128 m4, [bilin_h_perm16] add mxyd, 16 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xmm0, [srcq+ssq*0] pinsrd xmm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xmm4, [bilin_h_shuf4] .h_w4_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mova m4, [bilin_h_perm16] .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] vpermb 
m0, m4, m0 pmaddubsw m0, m5 pmulhrsw m0, m3 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 add srcq, ssq mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m6, [srcq+8*9] add srcq, ssq REPX {pshufb x, m4}, m0, m2, m1, m6 REPX {pmaddubsw x, m5}, m0, m2, m1, m6 REPX {pmulhrsw x, m3}, m0, m2, m1, m6 packuswb m0, m2 packuswb m1, m6 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 255 vpbroadcastd m5, [pw_2048] add mxyd, 16 add wq, r7 vpbroadcastw m4, mxyd jmp wq .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 punpcklbw xmm1, xmm0 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 1 pextrw [dstq+dsq*1], xmm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm1, xmm2, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm2, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET 
.v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xmm1, xmm0, xmm2 movq xmm0, [srcq+ssq*0] punpcklbw xmm2, xmm0 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 pmulhrsw xmm2, xm5 packuswb xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 ymm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm2, ymm3, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm3, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 pmaddubsw ymm2, ym4 pmulhrsw ymm1, ym5 pmulhrsw ymm2, ym5 packuswb ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop vzeroupper RET .v_w32: movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: vbroadcasti32x8 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendmd m2{k1}, m3, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] vpblendmd m3{k1}, m0, m3 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0] .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m0, m3 punpckhbw m6, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 REPX {pmulhrsw x, m5}, m1, m6, m2, m3 packuswb m1, m6 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w128_loop: add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] punpcklbw m6, m0, m2 pmaddubsw m6, m4 punpckhbw m0, m2 pmaddubsw m0, m4 punpcklbw m7, m1, m3 pmaddubsw m7, m4 punpckhbw m1, m3 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, 
m7, m1 packuswb m6, m0 mova m0, m2 packuswb m7, m1 mova m1, m3 mova [dstq+64*0], m6 mova [dstq+64*1], m7 add dstq, dsq dec hd jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_2048] add wq, r7 vpbroadcastw m6, mxyd jmp wq .hv_w2: vpbroadcastd xmm0, [srcq+ssq*0] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 .hv_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd xmm1, [srcq+ssq*0], 1 pshufb xmm1, xm4 pmaddubsw xmm1, xm5 ; 1 _ 2 _ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 0 pextrw [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xmm4, [bilin_h_shuf4] movddup xmm0, [srcq+ssq*0] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 .hv_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0] pshufb xmm1, xmm4 pmaddubsw xmm1, xm5 ; 1 2 shufps xmm2, xmm0, xmm1, q1032 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 ym0, [srcq+ssq*0] pshufb ym0, ym4 pmaddubsw ym0, ym5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym1, [srcq+ssq*0], 1 pshufb ym1, ym4 pmaddubsw ym1, ym5 ; 1 2 valignq ym2, ym1, ym0, 2 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 vpmovuswb xm1, ym1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: vbroadcasti32x8 m0, [srcq+ssq*0] mova m4, [bilin_h_perm16] vpermb m0, m4, m0 
pmaddubsw m0, m5 .hv_w16_loop: movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0], 1 vpermb m1, m4, m1 pmaddubsw m1, m5 ; 1 2 valignq m2, m1, m0, 4 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 vpmovuswb ym1, m1 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+ssq*0] pmovzxbq m8, [pb_02461357] pmaddubsw m0, m5 .hv_w32_loop: vpermb m2, m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpermb m3, m4, [srcq+ssq*0] pmaddubsw m2, m5 psubw m1, m2, m0 paddw m1, m1 pmulhw m1, m6 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 paddw m3, m3 pmulhw m3, m6 paddw m3, m2 pmulhrsw m1, m7 pmulhrsw m3, m7 packuswb m1, m3 vpermq m1, m8, m1 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop RET .hv_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w64_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 psubw m9, m3, m1 paddw m8, m8 pmulhw m8, m6 paddw m9, m9 pmulhw m9, m6 paddw m8, m0 pmulhrsw m8, m7 paddw m9, m1 pmulhrsw m9, m7 mova m0, m2 mova m1, m3 packuswb m8, m9 mova [dstq], m8 add dstq, dsq dec hd jg .hv_w64_loop RET .hv_w128: movu m0, [srcq+8*0] movu m1, [srcq+8*1] movu m2, [srcq+8*8] movu m3, [srcq+8*9] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 .hv_w128_loop: add srcq, ssq movu m8, [srcq+8*0] movu m9, [srcq+8*1] movu m10, [srcq+8*8] movu m11, [srcq+8*9] REPX {pshufb x, m4}, m8, m9, m10, m11 REPX {pmaddubsw x, m5}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 paddw m12, m12 pmulhw m12, m6 paddw m13, m13 pmulhw m13, m6 paddw m14, m14 pmulhw m14, m6 paddw m15, m15 pmulhw m15, m6 paddw m12, m0 pmulhrsw m12, m7 paddw m13, m1 
; ---------------------------------------------------------------------------
; Tail of an .hv_w128_loop whose beginning lies before this chunk.
; Reproduced unchanged.
; ---------------------------------------------------------------------------
    pmulhrsw m13, m7
    paddw m14, m2
    pmulhrsw m14, m7
    paddw m15, m3
    pmulhrsw m15, m7
    mova m0, m8
    mova m1, m9
    mova m2, m10
    mova m3, m11
    packuswb m12, m13
    packuswb m14, m15
    mova [dstq+64*0], m12
    mova [dstq+64*1], m14
    add dstq, dsq
    dec hd
    jg .hv_w128_loop
    RET

; ---------------------------------------------------------------------------
; prep_bilin_8bpc(tmp, src, stride, w, h, mx, my)
;
; Bilinear "prep": reads 8-bit pixels from src and stores 16-bit words,
; packed contiguously (w*2 bytes per row), to tmp.
;
; Only tmp/src/stride are loaded as register arguments; w, h, mx and my are
; fetched from their argument slots (wm, hm, r5m, r6m).
;
; Code paths, selected by which of mx/my are non-zero:
;   .prep  mx == 0, my == 0   out = src[x] << 4
;   .h     mx != 0, my == 0   out = (16 - mx) * src[x] + mx * src[x + 1]
;   .v     mx == 0, my != 0   same formula applied between rows x and
;                             x + stride, weighted by my
;   .hv    mx != 0, my != 0   horizontal pass, then vertical blend of
;                             consecutive filtered rows
; Each path then dispatches on tzcnt(w) through a table of 16-bit offsets
; relative to prep_avx512icl (t2).
;
; t0/t1/t2 are x86inc aliases for r3/r5/r6. r6 is also stride3, which is why
; stride3q is only computed after the last use of t2 as the table base.
; ---------------------------------------------------------------------------
DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn mxyd, r5m ; mx
    lea t2, [prep_avx512icl]
    tzcnt wd, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .v
; --- no sub-pixel offset: widen bytes to words and scale by 16 --------------
.prep:
    movzx wd, word [t2+wq*2+table_offset(prep,)]
    add wq, t2
    lea stride3q, [strideq*3]
    jmp wq
; w == 4: four rows (4 bytes each) per iteration -> 32 output bytes
.prep_w4:
    movd xmm0, [srcq+strideq*0]
    pinsrd xmm0, [srcq+strideq*1], 1
    pinsrd xmm0, [srcq+strideq*2], 2
    pinsrd xmm0, [srcq+stride3q ], 3
    lea srcq, [srcq+strideq*4]
    pmovzxbw ym0, xmm0
    psllw ym0, 4
    mova [tmpq], ym0
    add tmpq, 32
    sub hd, 4
    jg .prep_w4
    RET
; w == 8: four rows (8 bytes each) per iteration -> 64 output bytes
.prep_w8:
    movq xmm0, [srcq+strideq*0]
    movq xmm1, [srcq+strideq*1]
    vinserti128 ym0, ymm0, [srcq+strideq*2], 1
    vinserti128 ym1, ymm1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    punpcklqdq ym0, ym1
    pmovzxbw m0, ym0
    psllw m0, 4
    mova [tmpq], m0
    add tmpq, 32*2
    sub hd, 4
    jg .prep_w8
    RET
; w == 16: four rows per iteration, two rows per zmm
.prep_w16:
    movu xmm0, [srcq+strideq*0]
    vinserti128 ym0, ymm0, [srcq+strideq*1], 1
    movu xmm1, [srcq+strideq*2]
    vinserti128 ym1, ymm1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    pmovzxbw m0, ym0
    pmovzxbw m1, ym1
    psllw m0, 4
    psllw m1, 4
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    add tmpq, 32*4
    sub hd, 4
    jg .prep_w16
    RET
; w == 32: four rows per iteration, one row per zmm
.prep_w32:
    pmovzxbw m0, [srcq+strideq*0]
    pmovzxbw m1, [srcq+strideq*1]
    pmovzxbw m2, [srcq+strideq*2]
    pmovzxbw m3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    REPX {psllw x, 4}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 4
    jg .prep_w32
    RET
; w == 64: two rows per iteration, two zmm per row
.prep_w64:
    pmovzxbw m0, [srcq+strideq*0+32*0]
    pmovzxbw m1, [srcq+strideq*0+32*1]
    pmovzxbw m2, [srcq+strideq*1+32*0]
    pmovzxbw m3, [srcq+strideq*1+32*1]
    lea srcq, [srcq+strideq*2]
    REPX {psllw x, 4}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 2
    jg .prep_w64
    RET
; w == 128: one row per iteration, four zmm per row
.prep_w128:
    pmovzxbw m0, [srcq+32*0]
    pmovzxbw m1, [srcq+32*1]
    pmovzxbw m2, [srcq+32*2]
    pmovzxbw m3, [srcq+32*3]
    REPX {psllw x, 4}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    add srcq, strideq
    dec hd
    jg .prep_w128
    RET
; --- horizontal filter -------------------------------------------------------
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
    ;
    ; mx * 255 + 16 == (mx << 8) + (16 - mx), i.e. the byte pair
    ; (16 - mx, mx) that pmaddubsw multiplies against (src[x], src[x + 1]).
    ; (assumes mx is in 1..15 so neither byte overflows - TODO confirm)
    imul mxyd, 255
    add mxyd, 16
    vpbroadcastw m5, mxyd
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .hv
    movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
    add wq, t2
    lea stride3q, [strideq*3]
    jmp wq
.h_w4:
    vbroadcasti32x4 ym4, [bilin_h_shuf4]
.h_w4_loop:
    movq xmm0, [srcq+strideq*0]
    movq xmm1, [srcq+strideq*1]
    vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
    vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    punpcklqdq ym0, ym1
    pshufb ym0, ym4
    pmaddubsw ym0, ym5
    mova [tmpq], ym0
    add tmpq, 32
    sub hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    ; only the first 16 bytes of bilin_h_perm16 are used, as a per-lane
    ; pshufb control
    vbroadcasti32x4 m4, [bilin_h_perm16]
.h_w8_loop:
    movu xmm0, [srcq+strideq*0]
    vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
    vinserti32x4 m0, [srcq+strideq*2], 2
    vinserti32x4 m0, [srcq+stride3q ], 3
    lea srcq, [srcq+strideq*4]
    pshufb m0, m4
    pmaddubsw m0, m5
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    mova m4, [bilin_h_perm16]
.h_w16_loop:
    movu ym0, [srcq+strideq*0]
    vinserti32x8 m0, [srcq+strideq*1], 1
    movu ym1, [srcq+strideq*2]
    vinserti32x8 m1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    vpermb m0, m4, m0
    vpermb m1, m4, m1
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    add tmpq, 64*2
    sub hd, 4
    jg .h_w16_loop
    RET
.h_w32:
    mova m4, [bilin_h_perm32]
.h_w32_loop:
    vpermb m0, m4, [srcq+strideq*0]
    vpermb m1, m4, [srcq+strideq*1]
    vpermb m2, m4, [srcq+strideq*2]
    vpermb m3, m4, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 4
    jg .h_w32_loop
    RET
.h_w64:
    mova m4, [bilin_h_perm32]
.h_w64_loop:
    vpermb m0, m4, [srcq+strideq*0+32*0]
    vpermb m1, m4, [srcq+strideq*0+32*1]
    vpermb m2, m4, [srcq+strideq*1+32*0]
    vpermb m3, m4, [srcq+strideq*1+32*1]
    lea srcq, [srcq+strideq*2]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 2
    jg .h_w64_loop
    RET
.h_w128:
    mova m4, [bilin_h_perm32]
.h_w128_loop:
    vpermb m0, m4, [srcq+32*0]
    vpermb m1, m4, [srcq+32*1]
    vpermb m2, m4, [srcq+32*2]
    vpermb m3, m4, [srcq+32*3]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    add srcq, strideq
    dec hd
    jg .h_w128_loop
    RET
; --- vertical filter ---------------------------------------------------------
; m6 holds the (16 - my, my) byte pair, built the same way as m5 in .h.
; Every loop keeps the last loaded row in a register so that it becomes the
; "upper" row of the next iteration.
.v:
    WIN64_SPILL_XMM 7
    movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
    imul mxyd, 255
    add mxyd, 16
    add wq, t2
    lea stride3q, [strideq*3]
    vpbroadcastw m6, mxyd
    jmp wq
.v_w4:
    vpbroadcastd xm0, [srcq+strideq*0]
    mov r3d, 0x29
    vbroadcasti32x4 ym3, [bilin_v_shuf4]
    kmovb k1, r3d
.v_w4_loop:
    vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
    vpbroadcastd ym2, [srcq+strideq*2]
    vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
    lea srcq, [srcq+strideq*4]
    vpbroadcastd ym0, [srcq+strideq*0]
    punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
    pshufb ym2, ym3
    pmaddubsw ym2, ym6
    mova [tmpq], ym2
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    mova m5, [bilin_v_perm8]
    vbroadcasti32x4 ym0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
    vpbroadcastq ym0, [srcq+strideq*2]
    vinserti32x4 m1, [srcq+stride3q ], 2
    lea srcq, [srcq+strideq*4]
    vinserti32x4 ym0, [srcq+strideq*0], 0
    vpermt2b m1, m5, m0
    pmaddubsw m1, m6
    mova [tmpq], m1
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    mova m5, [bilin_v_perm16]
    movu xm0, [srcq+strideq*0]
.v_w16_loop:
    movu xm2, [srcq+strideq*2]
    vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
    vpermt2b m1, m5, m2
    vinserti32x4 ym2, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    movu xm0, [srcq+strideq*0]
    vpermt2b m2, m5, m0
    pmaddubsw m1, m6
    pmaddubsw m2, m6
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m2
    add tmpq, 64*2
    sub hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    mova m5, [bilin_v_perm32]
    movu ym0, [srcq+strideq*0]
.v_w32_loop:
    movu ym2, [srcq+strideq*1]
    movu ym3, [srcq+strideq*2]
    movu ym4, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vpermt2b m0, m5, m2
    vpermt2b m2, m5, m3
    vpermt2b m3, m5, m4
    pmaddubsw m1, m0, m6
    movu ym0, [srcq+strideq*0]
    vpermt2b m4, m5, m0
    pmaddubsw m2, m6
    pmaddubsw m3, m6
    pmaddubsw m4, m6
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m2
    mova [tmpq+64*2], m3
    mova [tmpq+64*3], m4
    add tmpq, 64*4
    sub hd, 4
    jg .v_w32_loop
    RET
.v_w64:
    mova m5, [bilin_v_perm64]
    vpermq m0, m5, [srcq+strideq*0]
.v_w64_loop:
    vpermq m1, m5, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    punpcklbw m4, m0, m1
    punpckhbw m2, m0, m1
    vpermq m0, m5, [srcq+strideq*0]
    punpcklbw m3, m1, m0
    punpckhbw m1, m0
    pmaddubsw m4, m6
    pmaddubsw m2, m6
    pmaddubsw m3, m6
    pmaddubsw m1, m6
    mova [tmpq+64*0], m4
    mova [tmpq+64*1], m2
    mova [tmpq+64*2], m3
    mova [tmpq+64*3], m1
    add tmpq, 64*4
    sub hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    mova m5, [bilin_v_perm64]
    vpermq m0, m5, [srcq+strideq*0+ 0]
    vpermq m1, m5, [srcq+strideq*0+64]
.v_w128_loop:
    vpermq m2, m5, [srcq+strideq*1+ 0]
    vpermq m3, m5, [srcq+strideq*1+64]
    lea srcq, [srcq+strideq*2]
    punpcklbw m4, m0, m2
    punpckhbw m0, m2
    pmaddubsw m4, m6
    pmaddubsw m0, m6
    mova [tmpq+64*0], m4
    mova [tmpq+64*1], m0
    punpcklbw m4, m1, m3
    punpckhbw m1, m3
    pmaddubsw m4, m6
    pmaddubsw m1, m6
    mova [tmpq+64*2], m4
    mova [tmpq+64*3], m1
    vpermq m0, m5, [srcq+strideq*0+ 0]
    vpermq m1, m5, [srcq+strideq*0+64]
    punpcklbw m4, m2, m0
    punpckhbw m2, m0
    pmaddubsw m4, m6
    pmaddubsw m2, m6
    mova [tmpq+64*4], m4
    mova [tmpq+64*5], m2
    punpcklbw m4, m3, m1
    punpckhbw m3, m1
    pmaddubsw m4, m6
    pmaddubsw m3, m6
    mova [tmpq+64*6], m4
    mova [tmpq+64*7], m3
    add tmpq, 64*8
    sub hd, 2
    jg .v_w128_loop
    RET
; --- horizontal + vertical filter --------------------------------------------
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
    ;
    ; Here src[] denotes horizontally filtered rows (pmaddubsw with m5, set
    ; up in .h). m6 = my << 11, so pmulhrsw(d, m6) = (d * my * 2048 +
    ; 0x4000) >> 15 = (d * my + 8) >> 4, matching the formula above.
    WIN64_SPILL_XMM 7
    movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
    shl mxyd, 11
    vpbroadcastw m6, mxyd
    add wq, t2
    lea stride3q, [strideq*3]
    jmp wq
.hv_w4:
    vbroadcasti32x4 ym4, [bilin_h_shuf4]
    vpbroadcastq ym0, [srcq+strideq*0]
    pshufb ym0, ym4
    pmaddubsw ym0, ym5
.hv_w4_loop:
    movq xmm1, [srcq+strideq*1]
    movq xmm2, [srcq+strideq*2]
    vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
    punpcklqdq ym1, ym2
    pshufb ym1, ym4
    pmaddubsw ym1, ym5 ; 1 2 3 4
    valignq ym2, ym1, ym0, 3 ; 0 1 2 3
    mova ym0, ym1
    psubw ym1, ym2
    pmulhrsw ym1, ym6
    paddw ym1, ym2
    mova [tmpq], ym1
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti32x4 m4, [bilin_h_perm16]
    vbroadcasti32x4 m0, [srcq+strideq*0]
    pshufb m0, m4
    pmaddubsw m0, m5
.hv_w8_loop:
    movu xmm1, [srcq+strideq*1]
    vinserti128 ym1, ymm1, [srcq+strideq*2], 1
    vinserti128 m1, [srcq+stride3q ], 2
    lea srcq, [srcq+strideq*4]
    vinserti128 m1, [srcq+strideq*0], 3
    pshufb m1, m4
    pmaddubsw m1, m5 ; 1 2 3 4
    valignq m2, m1, m0, 6 ; 0 1 2 3
    mova m0, m1
    psubw m1, m2
    pmulhrsw m1, m6
    paddw m1, m2
    mova [tmpq], m1
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    mova m4, [bilin_h_perm16]
    vbroadcasti32x8 m0, [srcq+strideq*0]
    vpermb m0, m4, m0
    pmaddubsw m0, m5
.hv_w16_loop:
    movu ym1, [srcq+strideq*1]
    vinserti32x8 m1, [srcq+strideq*2], 1
    movu ym2, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vinserti32x8 m2, [srcq+strideq*0], 1
    vpermb m1, m4, m1
    vpermb m2, m4, m2
    pmaddubsw m1, m5 ; 1 2
    vshufi32x4 m3, m0, m1, q1032 ; 0 1
    pmaddubsw m0, m2, m5 ; 3 4
    vshufi32x4 m2, m1, m0, q1032 ; 2 3
    psubw m1, m3
    pmulhrsw m1, m6
    paddw m1, m3
    psubw m3, m0, m2
    pmulhrsw m3, m6
    paddw m3, m2
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m3
    add tmpq, 64*2
    sub hd, 4
    jg .hv_w16_loop
    RET
.hv_w32:
    mova m4, [bilin_h_perm32]
    vpermb m0, m4, [srcq+strideq*0]
    pmaddubsw m0, m5
.hv_w32_loop:
    vpermb m1, m4, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    vpermb m2, m4, [srcq+strideq*0]
    pmaddubsw m1, m5
    psubw m3, m1, m0
    pmulhrsw m3, m6
    paddw m3, m0
    pmaddubsw m0, m2, m5
    psubw m2, m0, m1
    pmulhrsw m2, m6
    paddw m2, m1
    mova [tmpq+64*0], m3
    mova [tmpq+64*1], m2
    add tmpq, 64*2
    sub hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    mova m4, [bilin_h_perm32]
    vpermb m0, m4, [srcq+32*0]
    vpermb m1, m4, [srcq+32*1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
.hv_w64_loop:
    add srcq, strideq
    vpermb m2, m4, [srcq+32*0]
    vpermb m3, m4, [srcq+32*1]
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    psubw m7, m2, m0
    psubw m8, m3, m1
    pmulhrsw m7, m6
    pmulhrsw m8, m6
    paddw m7, m0
    mova m0, m2
    paddw m8, m1
    mova m1, m3
    mova [tmpq+64*0], m7
    mova [tmpq+64*1], m8
    add tmpq, 64*2
    dec hd
    jg .hv_w64_loop
    RET
.hv_w128:
    mova m4, [bilin_h_perm32]
    vpermb m0, m4, [srcq+32*0]
    vpermb m1, m4, [srcq+32*1]
    vpermb m2, m4, [srcq+32*2]
    vpermb m3, m4, [srcq+32*3]
    REPX {pmaddubsw x, m5}, m0, m1, m2, m3
.hv_w128_loop:
    add srcq, strideq
    vpermb m7, m4, [srcq+32*0]
    vpermb m8, m4, [srcq+32*1]
    vpermb m9, m4, [srcq+32*2]
    vpermb m10, m4, [srcq+32*3]
    REPX {pmaddubsw x, m5}, m7, m8, m9, m10
    psubw m11, m7, m0
    psubw m12, m8, m1
    psubw m13, m9, m2
    psubw m14, m10, m3
    REPX {pmulhrsw x, m6}, m11, m12, m13, m14
    paddw m11, m0
    mova m0, m7
    paddw m12, m1
    mova m1, m8
    paddw m13, m2
    mova m2, m9
    paddw m14, m3
    mova m3, m10
    mova [tmpq+64*0], m11
    mova [tmpq+64*1], m12
    mova [tmpq+64*2], m13
    mova [tmpq+64*3], m14
    add tmpq, 64*4
    dec hd
    jg .hv_w128_loop
    RET

; Each FILTER_* constant packs two values, each a multiple of 15: one in
; bits 16 and up and one in the low bits. The put functions add the constant
; to mx/my replicated into three bytes (see the "6tap_h, mx, 4tap_h"
; comments there).
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15

; FN: emits the entry stub <fn>_<type>_8bpc, which loads the horizontal
; filter constant into t0d and the vertical one into t1d. With a fifth
; argument the stub jumps to that function; without it, execution falls
; through into whatever follows the stub.
%macro FN 4-5 ; fn, type, type_h, type_v, jmp_to
cglobal %1_%2_8bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

; PUT_8TAP_H: horizontal 8-tap filter of m%1 in place. Expects m5 = rounding
; bias, m6/m7/m8 = byte shuffle (or vpermb) controls, and m9/m10 = the two
; 4-coefficient halves of the filter. The macro body continues on the next
; source line.
%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb
%if %5
    vpermb m%2, m6, m%1
    vpermb m%3, m7, m%1
    vpermb m%4, m8, m%1
%else
%if
%2 < %4 ; reuse a previous value if possible
    pshufb m%2, m%1, m6
%endif
    pshufb m%3, m%1, m7
    pshufb m%4, m%1, m8
%endif
    ; two dword accumulators, both seeded with the bias in m5:
    ;   m%1 = m%2 . m9 + m%3 . m10
    ;   m%2 = m%3 . m9 + m%4 . m10
    mova m%1, m5
    vpdpbusd m%1, m%2, m9
    mova m%2, m5
    vpdpbusd m%2, m%3, m9
    vpdpbusd m%1, m%3, m10
    vpdpbusd m%2, m%4, m10
    packusdw m%1, m%2
    psrlw m%1, 6
%endmacro

; Scratch registers t0/t1 carry the filter constants from the FN entry stubs
; into the shared put functions below.
%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

; Due to the use of vpdpbusd (which does 4 pixels per instruction) in
; the horizontal filter, 6-tap is only used for the vertical filter.
;
; Entry stubs for the filter combinations handled by put_6tap_8bpc. The last
; stub has no jump target and falls through into put_6tap_8bpc itself.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_6tap_8bpc
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc
PUT_8TAP_FN regular, REGULAR, REGULAR

; ---------------------------------------------------------------------------
; put_6tap_8bpc(dst, ds, src, ss, w, h, mx, my)
;
; Sub-pixel "put" with a 6-tap vertical filter; writes 8-bit pixels to dst.
; On entry t0d/t1d hold the horizontal/vertical FILTER_* constants.
;
;   mxd = mx * 0x010101 + t0d, myd = my * 0x010101 + t1d
; Byte 1 of each (tested with 0xf00) is the raw fraction; the low byte and
; the bits from 16 up become indices into subpel_filters.
; (assumes mx/my <= 15 so the bytes do not carry into each other - TODO
; confirm)
;
; Paths:
;   .put  no fraction in either direction: dispatch via the put table
;   .v    vertical only; taps 1..6 of the 8-byte filter row are used
;   .h    horizontal only: handled by put_8tap_8bpc's .h2
;   .hv   horizontal pass, then 6-tap vertical pass on 16-bit intermediates
;
; ns = -ss, so rows above the current one are addressed as srcq+nsq*k.
; ---------------------------------------------------------------------------
cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
%define base r8-put_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movsxd wq, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
; Also the target of put_8tap_8bpc when neither direction needs filtering.
.put:
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
    lea r6, [ssq*3]
    lea r7, [dsq*3]
%if WIN64
    pop r8
%endif
    jmp wq
; --- vertical-only -----------------------------------------------------------
.v:
    ; pick the filter row: the index from the low byte when h < 6, otherwise
    ; the one stored from bit 16 up
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    tzcnt r6d, wd
    movzx r6d, word [r8+r6*2+table_offset(put, _6tap_v)]
    ; pmulhrsw by 512 computes (x + 32) >> 6
    ; (presumably pw_512 is a broadcast word 512 - defined outside this chunk)
    vpbroadcastd m6, [pw_512]
    ; +1 skips the first byte of the 8-byte row: m7/m8/m9 = tap pairs
    ; (1,2), (3,4), (5,6)
    lea myq, [base+subpel_filters+1+myq*8]
    vpbroadcastw m7, [myq+0]
    add r6, r8
    vpbroadcastw m8, [myq+2]
    mov nsq, ssq
    vpbroadcastw m9, [myq+4]
    neg nsq
    jmp r6
.v_w2:
    movd xmm2, [srcq+nsq*2]
    pinsrw xmm2, [srcq+nsq*1], 2
    pinsrw xmm2, [srcq+ssq*0], 4
    pinsrw xmm2, [srcq+ssq*1], 6 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklbw xmm1, xmm2, xmm3 ; 01 12
    punpckhbw xmm2, xmm3 ; 23 34
.v_w2_loop:
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xmm3, xmm1, xm7 ; a0 b0
    mova xmm1, xmm2
    pmaddubsw xmm2, xm8 ; a1 b1
    paddw xmm3, xmm2
    vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 5 6
    punpcklbw xmm2, xmm4 ; 45 56
    pmaddubsw xmm4, xmm2, xm9 ; a2 b2
    paddw xmm3, xmm4
    pmulhrsw xmm3, xm6
    packuswb xmm3, xmm3
    pextrw [dstq+dsq*0], xmm3, 0
    pextrw [dstq+dsq*1], xmm3, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd xmm2, [srcq+nsq*2]
    pinsrd xmm2, [srcq+nsq*1], 1
    pinsrd xmm2, [srcq+ssq*0], 2
    pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    vpbroadcastd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklbw xmm1, xmm2, xmm3 ; 01 12
    punpckhbw xmm2, xmm3 ; 23 34
.v_w4_loop:
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw xmm3, xmm1, xm7 ; a0 b0
    mova xmm1, xmm2
    pmaddubsw xmm2, xm8 ; a1 b1
    paddw xmm3, xmm2
    vpblendd xmm2, xmm0, xmm4, 0x02 ; 4 5
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 5 6
    punpcklbw xmm2, xmm4 ; 45 56
    pmaddubsw xmm4, xmm2, xm9 ; a2 b2
    paddw xmm3, xmm4
    pmulhrsw xmm3, xm6
    packuswb xmm3, xmm3
    movd [dstq+dsq*0], xmm3
    pextrd [dstq+dsq*1], xmm3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq xmm1, [srcq+nsq*2]
    vpbroadcastq ymm3, [srcq+nsq*1]
    vpbroadcastq ymm2, [srcq+ssq*0]
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm3, 0x30
    vpblendd ymm3, ymm2, 0x30
    punpcklbw ymm1, ymm3 ; 01 12
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm0, 0x30
    punpcklbw ymm2, ymm4 ; 23 34
.v_w8_loop:
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw ymm3, ymm1, ym7 ; a0 b0
    mova ymm1, ymm2
    pmaddubsw ymm2, ym8 ; a1 b1
    paddw ymm3, ymm2
    vpblendd ymm2, ymm0, ymm4, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm4, ymm0, 0x30
    punpcklbw ymm2, ymm4 ; 45 56
    pmaddubsw ymm4, ymm2, ym9 ; a2 b2
    paddw ymm3, ymm4
    pmulhrsw ymm3, ym6
    vextracti128 xmm4, ymm3, 1
    packuswb xmm3, xmm4
    movq [dstq+dsq*0], xmm3
    movhps [dstq+dsq*1], xmm3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
.v_w16:
    mova m5, [spel_v_perm16a]
    vbroadcasti32x4 m1, [srcq+nsq*2]
    vbroadcasti32x4 ym3, [srcq+nsq*1]
    mov r6d, 0x0f
    vbroadcasti32x4 m2, [srcq+ssq*0]
    kmovb k1, r6d
    vbroadcasti32x4 ym4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vshufpd m1{k1}, m3, m2, 0xcc
    vshufpd m2{k1}, m4, m0, 0xcc
    vpermb m1, m5, m1 ; 01 12
    vpermb m2, m5, m2 ; 23 34
.v_w16_loop:
    vbroadcasti32x4 ym4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m3, m1, m7 ; a0 b0
    mova m1, m2
    pmaddubsw m2, m8 ; a1 b1
    paddw m3, m2
    mova m2, m0
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vshufpd m2{k1}, m4, m0, 0xcc
    vpermb m2, m5, m2 ; 45 56
    pmaddubsw m4, m2, m9 ; a2 b2
    paddw m3, m4
    pmulhrsw m3, m6
    vextracti32x8 ym4, m3, 1
    packuswb ym3, ym4
    mova [dstq+dsq*0], xm3
    vextracti32x4 [dstq+dsq*1], ym3, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
    mova m10, [spel_v_perm32]
    pmovzxbq m5, [pb_02461357]
    vpshrdw m11, m10, m10, 8
    movu ym0, [srcq+nsq*2]
    vinserti32x8 m0, [srcq+nsq*1], 1
    vpermb m1, m10, m0 ; 01
    vinserti32x8 m0, [srcq+ssq*0], 0
    vpermb m2, m11, m0 ; 12
    vinserti32x8 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    vpermb m3, m10, m0 ; 23
    vinserti32x8 m0, [srcq+ssq*0], 0
    vpermb m4, m11, m0 ; 34
.v_w32_loop:
    vinserti32x8 m0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    pmaddubsw m12, m1, m7
    mova m1, m3
    pmaddubsw m13, m2, m7
    mova m2, m4
    pmaddubsw m14, m3, m8
    vpermb m3, m10, m0 ; 45
    vinserti32x8 m0, [srcq+ssq*0], 0
    pmaddubsw m15, m4, m8
    vpermb m4, m11, m0 ; 56
    paddw m12, m14
    pmaddubsw m14, m3, m9
    paddw m13, m15
    pmaddubsw m15, m4, m9
    paddw m12, m14
    paddw m13, m15
    pmulhrsw m12, m6
    pmulhrsw m13, m6
    packuswb m12, m13
    vpermq m12, m5, m12
    mova [dstq+dsq*0], ym12
    vextracti32x8 [dstq+dsq*1], m12, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    RET
; w == 64 / 128: processed as 64-pixel-wide column strips.
; r6d = h + (w*4 - 256): the low byte keeps h for reloading, the bits above
; count the remaining strips (0 for w == 64, 1 for w == 128).
.v_w64:
.v_w128:
    lea r6d, [hq+wq*4-256]
.v_loop0:
    movu m2, [srcq+nsq*2]
    movu m4, [srcq+nsq*1]
    lea r4, [srcq+ssq*2]
    movu m11, [srcq+ssq*0]
    movu m13, [srcq+ssq*1]
    mov r7, dstq
    movu m0, [r4 +ssq*0]
    punpcklbw m1, m2, m4 ; 01l
    punpckhbw m2, m4 ; 01h
    punpcklbw m3, m4, m11 ; 12l
    punpckhbw m4, m11 ; 12h
    punpcklbw m10, m11, m13 ; 23l
    punpckhbw m11, m13 ; 23h
    punpcklbw m12, m13, m0 ; 34l
    punpckhbw m13, m0 ; 34h
.v_loop:
    movu m5, [r4+ssq*1]
    pmaddubsw m14, m1, m7 ; a0l
    mova m1, m10
    pmaddubsw m10, m8 ; a1l
    lea r4, [r4+ssq*2]
    pmaddubsw m15, m2, m7 ; a0h
    mova m2, m11
    pmaddubsw m11, m8 ; a1h
    paddw m14, m10
    punpcklbw m10, m0, m5 ; 45l
    paddw m15, m11
    punpckhbw m11, m0, m5 ; 45h
    pmaddubsw m0, m10, m9 ; a2l
    paddw m14, m0
    pmaddubsw m0, m11, m9 ; a2h
    paddw m15, m0
    movu m0, [r4+ssq*0]
    pmulhrsw m14, m6
    pmulhrsw m15, m6
    packuswb m14, m15
    pmaddubsw m15, m3, m7 ; b0l
    mova m3, m12
    pmaddubsw m12, m8 ; b1l
    mova [r7+dsq*0], m14
    pmaddubsw m14, m4, m7 ; b0h
    mova m4, m13
    pmaddubsw m13, m8 ; b1h
    paddw m15, m12
    punpcklbw m12, m5, m0 ; 56l
    paddw m14, m13
    punpckhbw m13, m5, m0 ; 56h
    pmaddubsw m5, m12, m9 ; b2l
    paddw m15, m5
    pmaddubsw m5, m13, m9 ; b2h
    paddw m14, m5
    pmulhrsw m15, m6
    pmulhrsw m14, m6
    packuswb m15, m14
    mova [r7+dsq*1], m15
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_loop
    ; next 64-pixel strip: restore h from the low byte of r6
    add srcq, 64
    add dstq, 64
    movzx hd, r6b
    sub r6d, 256
    jg .v_loop0
    RET
; --- horizontal-only: delegated to the 8-tap function -----------------------
.h:
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
; --- horizontal + vertical ---------------------------------------------------
; Horizontal taps come from the mx row (4 taps at +2 for w <= 4, all 8 for
; wider blocks); vertical taps are bytes 1..6 of the my row, sign-extended to
; words and scaled by 64 (punpcklbw into the high byte, then psraw 2).
.hv:
    vpbroadcastd m9, [pd_34]
    mova xm10, [spel_hv_end]
    pxor xm0, xm0
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    dec srcq
    vpbroadcastd m7, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq ym1, [base+subpel_filters+1+myq*8]
    mov nsq, ssq
    punpcklbw ym0, ym1
    neg nsq
    psraw ym0, 2 ; << 6
    pshufd ym11, ym0, q0000
    pshufd ym12, ym0, q1111
    pshufd ym13, ym0, q2222
    cmp wd, 4
    je .hv_w4
    ; w == 2 (falls through)
    vbroadcasti128 ym5, [subpel_h_shuf4]
    movq xmm0, [srcq+nsq*2]
    movhps xmm0, [srcq+nsq*1]
    movq xmm2, [srcq+ssq*0]
    movhps xmm2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm1, [srcq+ssq*0]
    vpblendd ymm0, ymm1, 0x30
    pshufb xmm2, xm5 ; 2 3
    pshufb ymm0, ym5 ; 0 1 4
    mova xmm1, xm9
    vpdpbusd xmm1, xmm2, xm7
    mova ymm2, ym9
    vpdpbusd ymm2, ymm0, ym7
    packssdw ymm2, ymm1
    psraw ymm2, 2
    vextracti128 xmm0, ymm2, 1
    vzeroupper
    palignr xmm0, xmm2, 4
    punpcklwd xmm1, xmm2, xmm0 ; 01 12
    punpckhwd xmm2, xmm0 ; 23 34
.hv_w2_loop:
    movq xmm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xmm3, [srcq+ssq*0]
    pmaddwd xmm4, xmm1, xm11 ; a0 b0
    mova xmm1, xmm2
    vpdpwssd xmm4, xmm2, xm12 ; a1 b1
    pshufb xmm3, xm5
    mova xmm2, xm9
    vpdpbusd xmm2, xmm3, xm7
    packssdw xmm3, xmm2, xmm2
    psraw xmm3, 2
    palignr xmm2, xmm3, xmm0, 12
    mova xmm0, xmm3
    punpcklwd xmm2, xmm3 ; 45 56
    vpdpwssd xmm4, xmm2, xm13 ; a2 b2
    packuswb xmm4, xmm4
    pshufb xmm4, xm10
    pextrw [dstq+dsq*0], xmm4, 0
    pextrw [dstq+dsq*1], xmm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    movq xm2, [srcq+nsq*2]
    vpbroadcastq ym1, [srcq+nsq*1]
    vinserti32x4 ym2, [srcq+ssq*0], 1
    vinserti32x4 m1, [srcq+ssq*1], 2 ; _ 1 3
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m5, [subpel_h_shufA]
    vinserti32x4 m2, [srcq+ssq*0], 2 ; 0 2 4
    pshufb m1, m5
    mova m0, m9
    pshufb m2, m5
    mova m3, m9
    vpdpbusd m0, m1, m7
    mova ym1, [spel_hv_perm4a]
    vpdpbusd m3, m2, m7
    mova ym2, [spel_hv_perm4b]
    mov r6d, 0x5555
    mova ym6, [spel_hv_perm4d]
    packssdw m0, m3
    kmovw k1, r6d
    psraw m0, 2 ; _ 0 1 2 3 4 5 6
    vpermb ym1, ym1, ym0 ; 01 12
    vpermb m2, m2, m0 ; 23 34
.hv_w4_loop:
    movq xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    pmaddwd ym4, ym1, ym11 ; a0 b0
    mova ym1, ym2
    pshufb ym3, ym5
    mova ym0, ym9
    vpdpbusd ym0, ym3, ym7
    vpdpwssd ym4, ym2, ym12 ; a1 b1
    vpsraw ym2{k1}, ym0, 2 ; 5 6
    vpermb ym2, ym6, ym2 ; 45 56
    vpdpwssd ym4, ym2, ym13 ; a2 b2
    packuswb ym4, ym4
    vpermb ym4, ym10, ym4
    movd [dstq+dsq*0], xm4
    pextrd [dstq+dsq*1], xm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
; w >= 8: full 8-tap horizontal filter (m11 = taps 0-3, m12 = taps 4-7)
.hv_w8:
    shr mxd, 16
    sub srcq, 3
    vpbroadcastd m11, [base+subpel_filters+mxq*8+0]
    vpbroadcastd m12, [base+subpel_filters+mxq*8+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastq m1, [base+subpel_filters+1+myq*8]
    mov nsq, ssq
    punpcklbw m0, m1
    neg nsq
    psraw m0, 2 ; << 6
    pshufd m13, m0, q0000
    pshufd m14, m0, q1111
    pshufd m15, m0, q2222
    cmp wd, 8
    jne .hv_w16
    movu xm0, [srcq+nsq*2]
    vinserti32x4 ym0, [srcq+nsq*1], 1
    vbroadcasti32x4 m1, [subpel_h_shufA]
    vinserti32x4 m0, [srcq+ssq*0], 2
    vbroadcasti32x4 m4, [subpel_h_shufB]
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m7, [subpel_h_shufC]
    vbroadcasti32x4 ym5, [srcq+ssq*0]
    vbroadcasti32x8 m6, [subpel_h_shufA]
    pshufb m1, m0, m1 ; 0 1 2 3 0123
    mova m2, m9
    vpdpbusd m2, m1, m11
    pshufb m4, m0, m4 ; 0 1 2 3 4567
    mova m1, m9
    vpdpbusd m1, m4, m11
    pshufb m0, m7 ; 0 1 2 3 89ab
    pshufb ym7, ym5, ym6 ; 4 0123 4567
    mova ym3, ym9
    vpdpbusd ym3, ym7, ym11
    vbroadcasti32x8 m7, [subpel_h_shufB]
    vpdpbusd m2, m4, m12
    mova m4, [spel_hv_perm8a]
    pshufb ym5, ym7 ; 4 4567 89ab
    vpdpbusd m1, m0, m12
    vpaddd m0, m4, [pb_32] {1to16}
    vpdpbusd ym3, ym5, ym12
    mova m5, [spel_hv_perm8b]
    mov r6, 0x55555555ff00
    packssdw m2, m1
    vpmovsdw xm3, ym3
    kmovq k1, r6
    psraw m2, 2 ; 0 1 2 3
    psraw xm3, 2 ; 4
    vpermb m1, m4, m2 ; 01 12
    kshiftrq k2, k1, 16
    vpermt2b m2, m0, m3 ; 23 34
.hv_w8_loop:
    vbroadcasti32x4 ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vbroadcasti32x4 m3{k1}, [srcq+ssq*0]
    pmaddwd m0, m1, m13 ; a0 b0
    pshufb m1, m3, m6 ; 5 6 0123 4567
    mova m4, m9
    vpdpbusd m4, m1, m11
    pshufb m3, m7 ; 5 6 4567 89ab
    vpdpwssd m0, m2, m14 ; a1 b1
    mova m1, m2
    vpdpbusd m4, m3, m12
    psraw m2{k2}, m4, 2 ; 53 64
    vpermb m2, m5, m2 ; 45 56
    vpdpwssd m0, m2, m15 ; a2 b2
    packuswb m0, m0
    vpermb m0, m10, m0
    movq [dstq+dsq*0], xm0
    movhps [dstq+dsq*1], xm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    RET
; w >= 16: processed as 16-pixel-wide column strips.
; r6d = h + (w*2 - 32)*8: the low byte keeps h, the bits above count the
; remaining strips.
.hv_w16:
    movu m19, [spel_hv_perm16a]
    vpbroadcastd m7, [pb_4]
    lea r6d, [wq*2-32]
    mova m6, [spel_hv_perm16b]
    paddb m20, m7, m19
    lea r6d, [hq+r6*8]
    paddb m21, m7, m20
    mova ym10, [spel_hv_end16]
    paddb m7, m6
.hv_w16_loop0:
    movu ym16, [srcq+nsq*2]
    vinserti32x8 m16, [srcq+nsq*1], 1
    lea r4, [srcq+ssq*2]
    movu ym17, [srcq+ssq*0]
    vinserti32x8 m17, [srcq+ssq*1], 1
    mov r7, dstq
    movu ym18, [r4 +ssq*0]
    vpermb m2, m19, m16 ; 0 1 0123 89ab
    mova m1, m9
    vpermb m3, m21, m16 ; 0 1 89ab ghij
    vpdpbusd m1, m2, m11
    mova m2, m9
    vpermb m4, m19, m17 ; 2 3 0123 89ab
    vpdpbusd m2, m3, m12
    mova m3, m9
    vpermb m5, m21, m17 ; 2 3 89ab ghij
    vpdpbusd m3, m4, m11
    mova m4, m9
    vpermb m0, m6, m18 ; 4 0145 2367 89cd abef
    vpdpbusd m4, m5, m12
    mova m5, m9
    vpermb m16, m20, m16 ; 0 1 4567 cdef
    vpdpbusd m5, m0, m11
    vpermb m17, m20, m17 ; 2 3 4567 cdef
    vpdpbusd m1, m16, m12
    vpermb m18, m7, m18 ; 4 4589 67ab cdgh efij
    vpdpbusd m2, m16, m11
    vpdpbusd m3, m17, m12
    vpdpbusd m4, m17, m11
    vpdpbusd m5, m18, m12
    packssdw m1, m2 ; 01
    packssdw m3, m4 ; 23
    REPX {psraw x, 2}, m1, m3, m5
    vpshrdd m2, m1, m3, 16 ; 12
    vpshrdd m4, m3, m5, 16 ; 34
.hv_w16_loop:
    movu ym18, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti32x8 m18, [r4+ssq*0], 1
    pmaddwd m16, m1, m13 ; a0
    vpermb m1, m19, m18 ; 5 6 0123 89ab
    pmaddwd m17, m2, m13 ; b0
    vpermb m2, m20, m18 ; 5 6 4567 cdef
    mova m0, m9
    vpdpbusd m0, m1, m11
    vpermb m18, m21, m18
    mova m1, m9
    vpdpbusd m1, m2, m11
    vpdpwssd m16, m3, m14 ; a1
    vpdpwssd m17, m4, m14 ; b1
    vpdpbusd m0, m2, m12
    mova m2, m4
    vpdpbusd m1, m18, m12
    packssdw m0, m1
    mova m1, m3
    psraw m4, m0, 2 ; 5 6
    vpshrdd m3, m2, m4, 16 ; 4 5
    vpdpwssd m17, m4, m15 ; b2
    vpdpwssd m16, m3, m15 ; a2
    packuswb m16, m17
    vpermb m16, m10, m16
    mova [r7+dsq*0], xm16
    vextracti128 [r7+dsq*1], ym16, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    ; next 16-pixel strip: restore h from the low byte of r6
    add srcq, 16
    add dstq, 16
    movzx hd, r6b
    sub r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET

; Entry stubs for the combinations with a SHARP vertical filter; the last one
; falls through into put_8tap_8bpc.
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc
PUT_8TAP_FN sharp, SHARP, SHARP

; put_8tap_8bpc: 8-tap counterpart of put_6tap_8bpc, with the same argument
; list except that the last register is named ss3. The definition continues
; past the end of this block.
cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movsxd wq, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    ; nothing to filter: share the plain-copy dispatch of put_6tap_8bpc
    jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    tzcnt r6d, wd
    lea myq, [base+subpel_filters+myq*8]
    movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd m7, [pw_512]
    vpbroadcastw m8, [myq+0]
    add r6, r8
vpbroadcastw m9, [myq+2] lea ss3q, [ssq*3] vpbroadcastw m10, [myq+4] sub srcq, ss3q vpbroadcastw m11, [myq+6] jmp r6 .v_w2: movd xmm2, [srcq+ssq*0] pinsrw xmm2, [srcq+ssq*1], 2 pinsrw xmm2, [srcq+ssq*2], 4 add srcq, ss3q pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w2_loop: pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, ss3q pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w4_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 
pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm5, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklbw ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm5, 0x30 vpblendd ymm5, ymm3, 0x30 punpcklbw ymm2, ymm5 ; 23 34 vpblendd ymm3, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 45 56 .v_w8_loop: vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw ymm5, ymm1, ym8 ; a0 b0 mova ymm1, ymm2 pmaddubsw ymm2, ym9 ; a1 b1 paddw ymm5, ymm2 mova ymm2, ymm3 pmaddubsw ymm3, ym10 ; a2 b2 paddw ymm5, ymm3 vpblendd ymm3, ymm0, ymm4, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm4, ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 67 78 pmaddubsw ymm4, ymm3, ym11 ; a3 b3 paddw ymm5, ymm4 pmulhrsw ymm5, ym7 vextracti128 xmm4, ymm5, 1 packuswb xmm5, xmm4 movq [dstq+dsq*0], xmm5 movhps [dstq+dsq*1], xmm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: mova m12, [spel_v_perm16a] vbroadcasti32x4 m1, [srcq+ssq*0] vbroadcasti32x4 ym4, [srcq+ssq*1] mov r6d, 0x0f vbroadcasti32x4 m2, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 ym5, [srcq+ssq*0] kmovb k1, r6d vbroadcasti32x4 m3, [srcq+ssq*1] vbroadcasti32x4 ym6, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m1{k1}, m4, m2, 0xcc vshufpd m2{k1}, m5, m3, 0xcc vshufpd m3{k1}, m6, m0, 0xcc vpermb m1, m12, m1 ; 01 12 vpermb m2, m12, m2 ; 23 34 vpermb m3, m12, m3 ; 45 56 .v_w16_loop: pmaddubsw m4, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m5, m2, m9 ; a1 b1 mova m2, m3 pmaddubsw m6, m3, m10 ; a2 b2 mova m3, m0 paddw m4, m5 vbroadcasti32x4 ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m3{k1}, m5, m0, 0xcc vpermb m3, m12, 
m3 ; 67 78 pmaddubsw m5, m3, m11 ; a3 b3 paddw m4, m6 paddw m4, m5 pmulhrsw m4, m7 vextracti32x8 ym5, m4, 1 packuswb ym4, ym5 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: mova m12, [spel_v_perm32] pmovzxbq m14, [pb_02461357] vpshrdw m13, m12, m12, 8 movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 vpermb m1, m12, m0 ; 01 vinserti32x8 m0, [srcq+ssq*2], 0 add srcq, ss3q vpermb m2, m13, m0 ; 12 vinserti32x8 m0, [srcq+ssq*0], 1 vpermb m3, m12, m0 ; 23 vinserti32x8 m0, [srcq+ssq*1], 0 vpermb m4, m13, m0 ; 34 vinserti32x8 m0, [srcq+ssq*2], 1 add srcq, ss3q vpermb m5, m12, m0 ; 45 vinserti32x8 m0, [srcq+ssq*0], 0 vpermb m6, m13, m0 ; 56 .v_w32_loop: vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddubsw m15, m1, m8 mova m1, m3 pmaddubsw m16, m2, m8 mova m2, m4 pmaddubsw m17, m3, m9 mova m3, m5 pmaddubsw m18, m4, m9 mova m4, m6 pmaddubsw m19, m5, m10 vpermb m5, m12, m0 ; 67 vinserti32x8 m0, [srcq+ssq*0], 0 pmaddubsw m20, m6, m10 vpermb m6, m13, m0 ; 78 paddw m15, m17 pmaddubsw m17, m5, m11 paddw m16, m18 pmaddubsw m18, m6, m11 paddw m15, m19 paddw m16, m20 paddw m15, m17 paddw m16, m18 pmulhrsw m15, m7 pmulhrsw m16, m7 packuswb m15, m16 vpermq m15, m14, m15 mova [dstq+dsq*0], ym15 vextracti32x8 [dstq+dsq*1], m15, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop vzeroupper RET .v_w64: .v_w128: lea r6d, [hq+wq*4-256] mov r4, srcq mov r7, dstq .v_loop0: movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] movu m17, [srcq+ssq*2] add srcq, ss3q movu m0, [srcq+ssq*0] punpcklbw m1, m2, m4 ; 01l punpckhbw m2, m4 ; 01h punpcklbw m3, m4, m6 ; 12l punpckhbw m4, m6 ; 12h punpcklbw m5, m6, m13 ; 23l punpckhbw m6, m13 ; 23h punpcklbw m12, m13, m15 ; 34l punpckhbw m13, m15 ; 34h punpcklbw m14, m15, m17 ; 45l punpckhbw m15, m17 ; 45h punpcklbw m16, m17, m0 ; 56l punpckhbw m17, m0 ; 56h .v_loop: pmaddubsw m18, m1, m8 ; a0l 
mova m1, m5 pmaddubsw m19, m2, m8 ; a0h mova m2, m6 pmaddubsw m20, m3, m8 ; b0l mova m3, m12 pmaddubsw m21, m4, m8 ; b0h mova m4, m13 pmaddubsw m5, m9 ; a1l pmaddubsw m6, m9 ; a1h pmaddubsw m12, m9 ; b1l pmaddubsw m13, m9 ; b1h paddw m18, m5 mova m5, m14 pmaddubsw m14, m10 ; a2l paddw m19, m6 mova m6, m15 pmaddubsw m15, m10 ; a2h paddw m20, m12 mova m12, m16 pmaddubsw m16, m10 ; b2l paddw m21, m13 mova m13, m17 pmaddubsw m17, m10 ; b2h paddw m18, m14 paddw m19, m15 paddw m20, m16 paddw m21, m17 movu m17, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m14, m0, m17 ; 67l punpckhbw m15, m0, m17 ; 67h pmaddubsw m16, m14, m11 ; a3l pmaddubsw m0, m15, m11 ; a3h paddw m18, m16 paddw m19, m0 movu m0, [srcq+ssq*0] punpcklbw m16, m17, m0 ; 78l punpckhbw m17, m0 ; 78h pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*0], m18 pmaddubsw m18, m16, m11 ; b3l pmaddubsw m19, m17, m11 ; b3h paddw m18, m20 paddw m19, m21 pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*1], m18 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 256 jg .v_loop0 vzeroupper RET .h: test myd, 0xf00 jnz .hv .h2: vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) cmp wd, 4 jl .h_w2 vbroadcasti128 m6, [subpel_h_shufA] je .h_w4 tzcnt wd, wd vbroadcasti128 m7, [subpel_h_shufB] vbroadcasti128 m8, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] vpbroadcastd m9, [base+mxq*8+subpel_filters+0] vpbroadcastd m10, [base+mxq*8+subpel_filters+4] add wq, r8 jmp wq .h_w2: movzx mxd, mxb dec srcq mova xmm4, [subpel_h_shuf4] vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w2_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 mova xmm1, xm5 vpdpbusd xmm1, xmm0, xmm3 packssdw xmm0, xmm1, xmm1 psraw xmm0, 6 packuswb xmm0, xm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx 
mxd, mxb dec srcq vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w4_loop: movq xmm0, [srcq+ssq*0] movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xm6 pshufb xmm1, xm6 mova xmm2, xm5 vpdpbusd xmm2, xmm0, xmm3 mova xmm0, xm5 vpdpbusd xmm0, xmm1, xmm3 packssdw xmm0, xmm2, xmm0 psraw xmm0, 6 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mova m6, [spel_h_perm16] vpbroadcastd m8, [pb_4] paddb m7, m8, m6 paddb m8, m7 .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 1, 2, 3, 1 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m3, [srcq+8*9] add srcq, ssq PUT_8TAP_H 0, 4, 11, 12 PUT_8TAP_H 2, 12, 11, 4 PUT_8TAP_H 1, 4, 11, 12 PUT_8TAP_H 3, 12, 11, 4 packuswb m0, m2 packuswb m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .hv: vpbroadcastd m9, [pd_34] pxor xm0, xm0 cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq vpbroadcastd m7, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq 
ym1, [base+subpel_filters+myq*8] lea ss3q, [ssq*3] mov r6, srcq punpcklbw ym0, ym1 sub r6, ss3q psraw ym0, 2 ; << 6 mova xm14, [spel_hv_end] pshufd ym10, ym0, q0000 pshufd ym11, ym0, q1111 pshufd ym12, ym0, q2222 pshufd ym13, ym0, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 ym6, [subpel_h_shuf4] movq xmm2, [r6+ssq*0] movhps xmm2, [r6+ssq*1] movq xmm0, [r6+ssq*2] movhps xmm0, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm1, [srcq+ssq*0] vpblendd ymm2, ymm3, 0x30 vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 pshufb ymm2, ym6 pshufb ymm0, ym6 mova ymm1, ym9 vpdpbusd ymm1, ymm2, ym7 mova ymm2, ym9 vpdpbusd ymm2, ymm0, ym7 packssdw ymm2, ymm1, ymm2 psraw ymm2, 2 vextracti128 xmm3, ymm2, 1 palignr xmm4, xmm3, xmm2, 4 punpcklwd xmm1, xmm2, xmm4 ; 01 12 punpckhwd xmm2, xmm4 ; 23 34 pshufd xmm0, xmm3, q2121 punpcklwd xmm3, xmm0 ; 45 56 .hv_w2_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm4, [srcq+ssq*0] pmaddwd xmm5, xmm1, xm10 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xmm2, xm11 ; a1 b1 pshufb xmm4, xm6 mova xmm2, xmm3 vpdpwssd xmm5, xmm3, xm12 ; a2 b2 mova xmm3, xm9 vpdpbusd xmm3, xmm4, xm7 packssdw xmm4, xmm3, xmm3 psraw xmm4, 2 palignr xmm3, xmm4, xmm0, 12 mova xmm0, xmm4 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xmm3, xm13 ; a3 b3 packuswb xmm5, xmm5 pshufb xmm5, xm14 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop vzeroupper RET .hv_w4: movq xmm1, [r6+ssq*0] vpbroadcastq ym2, [r6+ssq*1] vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 vinserti32x4 m2, [srcq+ssq*0], 2 vinserti32x4 m1, [srcq+ssq*1], 2 vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 vbroadcasti32x4 m6, [subpel_h_shufA] add srcq, ss3q vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 pshufb m2, m6 pshufb m1, m6 mova m0, m9 vpdpbusd m0, m2, m7 mova m4, m9 vpdpbusd m4, m1, m7 mova ym1, [spel_hv_perm4a] mova ym2, [spel_hv_perm4b] mova ym3, [spel_hv_perm4c] packssdw m0, m4 
psraw m0, 2 ; _ 0 1 2 3 4 5 6 mov r6d, 0x5555 vpermb ym1, ym1, ym0 ; 01 12 vpermb m2, m2, m0 ; 23 34 vpermb m3, m3, m0 ; 45 56 kmovw k1, r6d mova ym15, [spel_hv_perm4d] .hv_w4_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 pmaddwd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 mova ym0, ym9 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 vpdpwssd ym5, ym3, ym12 ; a2 b2 vpsraw ym3{k1}, ym0, 2 ; 7 8 vpermb ym3, ym15, ym3 ; 67 78 vpdpwssd ym5, ym3, ym13 ; a3 b3 packuswb ym5, ym5 vpermb ym5, ym14, ym5 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] vpbroadcastd m11, [base+subpel_filters+mxq*8+4] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] punpcklbw m0, m1 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 cmp wd, 8 jne .hv_w16 mov r6, srcq sub r6, ss3q movu xmm1, [r6+ssq*0] vinserti128 ymm1, [r6+ssq*1], 1 movu xmm2, [srcq+ssq*1] vinserti32x4 m6, zmm1, [r6+ssq*2], 2 vinserti128 ymm2, [srcq+ssq*2], 1 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 add srcq, ss3q vbroadcasti32x4 m4, [subpel_h_shufA] vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ vbroadcasti32x4 m7, [subpel_h_shufB] vbroadcasti32x4 m8, [subpel_h_shufC] pshufb m1, m6, m4 ; 0 1 2 3 0123 mova m2, m9 vpdpbusd m2, m1, m10 pshufb m5, m6, m7 ; 0 1 2 3 4567 mova m1, m9 vpdpbusd m1, m5, m10 pshufb m4, m0, m4 ; 4 5 6 _ 0123 mova m3, m9 vpdpbusd m3, m4, m10 pshufb m7, m0, m7 ; 4 5 6 _ 4567 mova m4, m9 vpdpbusd m4, m7, m10 pshufb m6, m8 vpdpbusd m2, m5, m11 vpdpbusd m1, m6, m11 pshufb m6, m0, m8 vpdpbusd m3, m7, m11 vpdpbusd m4, m6, m11 mova m5, [spel_hv_perm8a] vpaddd m0, m5, [pb_32] {1to16} mov r6, 0x55555555ff00 packssdw m2, m1 packssdw m3, m4 mova m8, [spel_hv_perm8b] psraw m2, 2 ; 0 1 2 
3 psraw m3, 2 ; 4 5 6 _ vpermb m1, m5, m2 ; 01 12 vbroadcasti32x8 m6, [subpel_h_shufA] kmovq k1, r6 vpermt2b m2, m0, m3 ; 23 34 vbroadcasti32x8 m7, [subpel_h_shufB] kshiftrq k2, k1, 16 mova xm16, [spel_hv_end] vpermb m3, m5, m3 ; 45 56 .hv_w8_loop: vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m4{k1}, [srcq+ssq*0] pmaddwd m0, m1, m12 ; a0 b0 pshufb m1, m4, m6 ; 7 8 0123 4567 mova m5, m9 vpdpbusd m5, m1, m10 pshufb m4, m7 ; 7 8 4567 89ab vpdpwssd m0, m2, m13 ; a1 b1 mova m1, m2 vpdpbusd m5, m4, m11 mova m2, m3 vpdpwssd m0, m3, m14 ; a2 b2 psraw m3{k2}, m5, 2 ; 75 86 vpermb m3, m8, m3 ; 67 78 vpdpwssd m0, m3, m15 ; a3 b3 packuswb m0, m0 vpermb zmm1, m16, m0 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 23 movu m22, [spel_hv_perm16a] sub srcq, ss3q vpbroadcastd m8, [pb_4] lea r6d, [wq*2-32] mova m7, [spel_hv_perm16b] paddb m20, m8, m22 mova ym16, [spel_hv_end16] paddb m21, m8, m20 lea r6d, [hq+r6*8] paddb m8, m7 .hv_w16_loop0: movu ym17, [srcq+ssq*0] vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 lea r4, [srcq+ss3q] movu ym18, [srcq+ssq*2] vinserti32x8 m18, [r4 +ssq*0], 1 ; 2 3 mov r7, dstq movu ym19, [r4 +ssq*1] vinserti32x8 m19, [r4 +ssq*2], 1 ; 4 5 add r4, ss3q vpermb m2, m22, m17 ; 0 1 0123 89ab mova m1, m9 vpermb m3, m21, m17 ; 0 1 89ab ghij vpdpbusd m1, m2, m10 mova m2, m9 vpermb m4, m22, m18 ; 2 3 0123 89ab vpdpbusd m2, m3, m11 mova m3, m9 vpermb m5, m21, m18 ; 2 3 89ab ghij vpdpbusd m3, m4, m10 mova m4, m9 vpermb m6, m22, m19 ; 4 5 0123 89ab vpdpbusd m4, m5, m11 mova m5, m9 vpermb m17, m20, m17 ; 0 1 4567 cdef vpdpbusd m5, m6, m10 mova m6, m9 vpermb m0, m21, m19 ; 4 5 89ab ghij vpdpbusd m1, m17, m11 vpdpbusd m2, m17, m10 movu ym17, [r4+ssq*0] ; 6 vpermb m18, m20, m18 ; 2 3 4567 cdef vpdpbusd m6, m0, m11 vpermb m0, m7, m17 ; 6 0145 2367 89cd abef vpdpbusd m3, m18, m11 vpermb m19, m20, m19 ; 4 5 4567 cdef vpdpbusd m4, m18, m10 mova m18, m9 vpermb m17, 
m8, m17 ; 6 4589 67ab cdgh efij
; ^ operands of the instruction that ends the previous source line.
; The lines down to the first RET are the tail of the preceding function's
; .hv_w16 path (its beginning lies before this block).
    vpdpbusd m18, m0, m10
    packssdw m1, m2
    vpdpbusd m5, m19, m11
    vpdpbusd m6, m19, m10
    packssdw m3, m4
    vpdpbusd m18, m17, m11
    psraw m1, 2 ; 01
    psraw m3, 2 ; 23
    packssdw m5, m6
    vpshrdd m2, m1, m3, 16 ; 12
    psraw m5, 2 ; 45
    vpshrdd m4, m3, m5, 16 ; 34
    psraw m18, 2
    vpshrdd m6, m5, m18, 16 ; 56
.hv_w16_loop:
    ; Two output rows per iteration (sub hd, 2).
    movu ym19, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    vinserti32x8 m19, [r4+ssq*0], 1
    pmaddwd m17, m1, m12 ; a0
    vpermb m1, m22, m19 ; 7 8 0123 89ab
    pmaddwd m18, m2, m12 ; b0
    mova m0, m9
    vpermb m2, m21, m19 ; 7 8 89ab ghij
    vpdpbusd m0, m1, m10
    mova m1, m9
    vpermb m19, m20, m19 ; 7 8 4567 cdef
    vpdpbusd m1, m2, m11
    mova m2, m4
    vpdpwssd m17, m3, m13 ; a1
    vpdpwssd m18, m4, m13 ; b1
    mova m4, m6
    vpdpbusd m0, m19, m11
    vpdpbusd m1, m19, m10
    vpdpwssd m17, m5, m14 ; a2
    vpdpwssd m18, m6, m14 ; b2
    packssdw m0, m1
    mova m1, m3
    psraw m6, m0, 2 ; 78
    mova m3, m5
    vpshrdd m5, m4, m6, 16 ; 67
    vpdpwssd m18, m6, m15 ; b3
    vpdpwssd m17, m5, m15 ; a3
    packuswb m17, m18
    vpermb m17, m16, m17
    mova [r7+dsq*0], xm17
    vextracti128 [r7+dsq*1], ym17, 1
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    ; Next 16-pixel column: low byte of r6d holds the row count to reload,
    ; the bits above it count the remaining columns.
    add srcq, 16
    add dstq, 16
    movzx hd, r6b
    sub r6d, 1<<8
    jg .hv_w16_loop0
    RET

; Temporary registers t0/t1 used by the prep entry points below.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Entry-point declarations, one per horizontal/vertical filter-type pair.
; NOTE(review): the FN macro is defined outside this view; the trailing
; prep_6tap_8bpc argument presumably names the function each entry forwards to.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_6tap_8bpc
PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN regular, REGULAR, REGULAR

;------------------------------------------------------------------------------
; prep_6tap_8bpc: subpel "prep" filter that uses three coefficient pairs
; (6 taps) in the vertical direction and stores 16-bit results to tmpq.
;
; Arguments (cglobal names): tmp, src, ss (source stride), w, h, mx, my.
; ss3 is a scratch register that holds ss*3.
;
; Dispatch:
;   mx fraction != 0            -> .h  (then .hv if my != 0, otherwise the
;                                       8-tap function's .h2 path)
;   mx == 0 and my != 0         -> .v
;   both zero                   -> .prep (jump table "prep" indexed by log2(w))
;------------------------------------------------------------------------------
cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my, ss3
%define base r7-prep_avx512icl
    ; Replicate mx/my into three bytes and add the per-entry filter offsets.
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.prep:
    ; No subpel filtering requested: jump through the plain "prep" table.
    ; This label is also a jump target of prep_8tap_8bpc.
    tzcnt wd, wd
    movzx wd, word [r7+wq*2+table_offset(prep,)]
    add wq, r7
    lea r6, [ssq*3]
%if WIN64
    pop r7
%endif
    jmp wq
.v:
    ; Vertical-only filtering. For h == 4 the filter index in the low byte of
    ; my is used, otherwise the one in bits 16 and up.
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    tzcnt r5d, wd
    ; +1 skips the first byte of the 8-byte filter entry; the three word
    ; broadcasts below read coefficient pairs at byte offsets 0, 2 and 4.
    lea myq, [base+subpel_filters+1+myq*8]
    movzx r5d, word [r7+r5*2+table_offset(prep, _6tap_v)]
    vpbroadcastd m7, [pw_8192] ; multiplier for the final pmulhrsw rounding
    sub srcq, ssq
    vpbroadcastw m8, [myq+0]
    add r5, r7
    vpbroadcastw m9, [myq+2]
    lea ss3q, [ssq*3]
    vpbroadcastw m10, [myq+4]
    sub srcq, ssq ; srcq now points two rows above the block
    jmp r5
.v_w4:
    ; Preload rows 0-4 as interleaved row pairs.
    movd xmm2, [srcq+ssq*0]
    pinsrd xmm2, [srcq+ssq*1], 1
    vpbroadcastd ymm1, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastd ymm3, [srcq+ssq*0]
    vpbroadcastd ymm0, [srcq+ssq*1]
    vbroadcasti128 ymm5, [deint_shuf4]
    vpblendd ymm1, ymm2, 0xeb
    punpcklqdq ymm3, ymm0
    vpblendd ymm1, ymm3, 0x60 ; 0 1 2 _ 2 3 4 _
    pshufb ymm1, ymm5 ; 01 12 23 34
.v_w4_loop:
    ; Four output rows (32 bytes) per iteration.
    pinsrd xmm0, [srcq+ssq*2], 1
    vpbroadcastd ymm2, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    vpbroadcastd ymm3, [srcq+ssq*0]
    vpblendd ymm2, ymm0, 0xeb
    vpbroadcastd ymm0, [srcq+ssq*1]
    punpcklqdq ymm3, ymm0
    vpblendd ymm2, ymm3, 0x60 ; 4 5 6 _ 6 7 8 _
    pshufb ymm2, ymm5 ; 45 56 67 78
    pmaddubsw ymm3, ymm1, ym8 ; a0 b0 c0 d0
    vperm2i128 ymm1, ymm2, 0x21 ; 23 34 45 56
    pmaddubsw ymm4, ymm2, ym10 ; a2 b2 c2 d2
    pmaddubsw ymm1, ym9 ; a1 b1 c1 d1
    paddw ymm3, ymm4
    paddw ymm3, ymm1
    pmulhrsw ymm3, ym7
    mova ymm1, ymm2
    mova [tmpq], ymm3
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    ; Preload rows 0-4; k1/k2 are lane masks for the merging unpacks.
    mova m6, [spel_v_perm8]
    movq xm1, [srcq+ssq*0]
    mov r6d, 0x3e
    movq xm2, [srcq+ssq*1]
    kmovb k1, r6d
    vpbroadcastq ym3, [srcq+ssq*2]
    add srcq, ss3q
    vpunpcklqdq ym2, [srcq+ssq*0] {1to4}
    vpunpcklqdq m1{k1}, m3, [srcq+ssq*1] {1to8}
    movq xm0, [srcq+ssq*1]
    kshiftlb k2, k1, 2
    shufpd m1, m2, 0x18 ; 0 1 2 3 4
    vpermb m1, m6, m1 ; 01 12 23 34
.v_w8_loop:
    ; Four output rows (64 bytes) per iteration.
    vpbroadcastq ym3, [srcq+ss3q ]
    vpunpcklqdq ym0{k1}, ym3, [srcq+ssq*2] {1to4}
    lea srcq, [srcq+ssq*4]
    vpbroadcastq m3, [srcq+ssq*1]
    vpunpcklqdq m0{k2}, m3, [srcq+ssq*0] {1to8}
    pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
    vpermb m2, m6, m0 ; 45 56 67 78
    mova xm0, xm3
    vshufi32x4 m1, m2, q1032 ; 23 34 45 56
    pmaddubsw m3, m2, m10 ; a2 b2 c2 d2
    pmaddubsw m5, m1, m9 ; a1 b1 c1 d1
    mova m1, m2
    paddw m4, m3
    paddw m4, m5
    pmulhrsw m4, m7
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; Preload rows 0-4 as row pairs.
    mova m11, [spel_v_perm16b]
    vbroadcasti32x4 m1, [srcq+ssq*0]
    mov r6d, 0x0f
    vbroadcasti32x4 ym3, [srcq+ssq*1]
    vbroadcasti32x4 m2, [srcq+ssq*2]
    kmovb k1, r6d
    add srcq, ss3q
    vbroadcasti32x4 ym4, [srcq+ssq*0]
    vbroadcasti32x4 m0, [srcq+ssq*1]
    vshufpd m1{k1}, m3, m2, 0xcc
    vshufpd m2{k1}, m4, m0, 0xcc
    vpermb m1, m11, m1 ; 01 12
    vpermb m2, m11, m2 ; 23 34
.v_w16_loop:
    ; Four output rows (2 x 64 bytes) per iteration.
    pmaddubsw m3, m1, m8 ; a0 b0
    pmaddubsw m5, m2, m9 ; a1 b1
    vbroadcasti32x4 ym6, [srcq+ssq*2]
    pmaddubsw m4, m2, m8 ; c0 d0
    vbroadcasti32x4 m2, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    vshufpd m0{k1}, m6, m2, 0xcc
    vbroadcasti32x4 ym6, [srcq+ssq*0]
    vpermb m1, m11, m0 ; 45 56
    vbroadcasti32x4 m0, [srcq+ssq*1]
    vshufpd m2{k1}, m6, m0, 0xcc
    pmaddubsw m6, m1, m9 ; c1 d1
    vpermb m2, m11, m2 ; 67 78
    paddw m3, m5
    pmaddubsw m5, m1, m10 ; a2 b2
    paddw m4, m6
    pmaddubsw m6, m2, m10 ; c2 d2
    paddw m3, m5
    paddw m4, m6
    pmulhrsw m3, m7
    pmulhrsw m4, m7
    mova [tmpq+ 0], m3
    mova [tmpq+64], m4
    add tmpq, 64*2
    sub hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    ; Preload rows 0-4 and interleave adjacent rows.
    movshdup m6, [bilin_v_perm64]
    movu ym16, [srcq+ssq*0]
    movu ym17, [srcq+ssq*1]
    movu ym18, [srcq+ssq*2]
    add srcq, ss3q
    movu ym19, [srcq+ssq*0]
    add srcq, ssq
    movu ym20, [srcq+ssq*0]
    vpermt2q m16, m6, m18 ; 0 2
    vpermt2q m17, m6, m19 ; 1 3
    vpermt2q m18, m6, m20 ; 2 4
    punpcklbw m0, m16, m17 ; 01
    punpcklbw m1, m17, m18 ; 12
    punpckhbw m2, m16, m17 ; 23
    punpckhbw m3, m17, m18 ; 34
.v_w32_loop:
    ; Two output rows (2 x 64 bytes) per iteration.
    movu ym16, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movu ym17, [srcq+ssq*0]
    pmaddubsw m4, m0, m8 ; a0
    mova m0, m2
    pmaddubsw m2, m9 ; a1
    vpermt2q m16, m6, m17 ; 5 6
    pmaddubsw m5, m1, m8 ; b0
    mova m1, m3
    pmaddubsw m3, m9 ; b1
    shufpd m18, m16, 0x55 ; 4 5
    paddw m4, m2
    punpcklbw m2, m18, m16 ; 45
    paddw m5, m3
    punpckhbw m3, m18, m16 ; 56
    mova m18, m16
    pmaddubsw m16, m2, m10 ; a2
    pmaddubsw m17, m3, m10 ; b2
    paddw m4, m16
    paddw m5, m17
    pmulhrsw m4, m7
    pmulhrsw m5, m7
    mova [tmpq+ 0], m4
    mova [tmpq+64], m5
    add tmpq, 64*2
    sub hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
.v_w64:
.v_w128:
    ; Processed in 64-pixel columns. After doubling, wq is used as the byte
    ; offset between consecutive output rows (see the stores below). r6d packs
    ; the row count (low byte) with the column counter (upper bits).
    mova m6, [bilin_v_perm64]
    add wd, wd
    lea r6d, [hq+wq]
.v_loop0:
    ; Preload rows 0-4 of this column; m0-m3 hold the low halves of the row
    ; pairs, m12-m15 the high halves.
    vpermq m12, m6, [srcq+ssq*0]
    vpermq m13, m6, [srcq+ssq*1]
    lea r5, [srcq+ssq*2]
    vpermq m14, m6, [r5 +ssq*0]
    vpermq m15, m6, [r5 +ssq*1]
    lea r5, [r5+ssq*2]
    vpermq m16, m6, [r5 +ssq*0]
    mov r7, tmpq
    punpcklbw m0, m12, m13 ; 01
    punpckhbw m12, m13
    punpcklbw m1, m13, m14 ; 12
    punpckhbw m13, m14
    punpcklbw m2, m14, m15 ; 23
    punpckhbw m14, m15
    punpcklbw m3, m15, m16 ; 34
    punpckhbw m15, m16
.v_loop:
    ; Two output rows per iteration.
    pmaddubsw m17, m0, m8 ; a0
    vpermq m5, m6, [r5+ssq*1]
    pmaddubsw m18, m12, m8
    mova m0, m2
    pmaddubsw m2, m9 ; a1
    mova m12, m14
    pmaddubsw m14, m9
    lea r5, [r5+ssq*2]
    pmaddubsw m19, m1, m8 ; b0
    pmaddubsw m20, m13, m8
    mova m1, m3
    pmaddubsw m3, m9 ; b1
    mova m13, m15
    pmaddubsw m15, m9
    paddw m17, m2
    punpcklbw m2, m16, m5 ; 45
    paddw m18, m14
    punpckhbw m14, m16, m5
    vpermq m16, m6, [r5+ssq*0]
    paddw m19, m3
    pmaddubsw m3, m2, m10 ; a2
    paddw m20, m15
    pmaddubsw m15, m14, m10
    paddw m17, m3
    punpcklbw m3, m5, m16 ; 56
    pmaddubsw m4, m3, m10 ; b2
    paddw m18, m15
    punpckhbw m15, m5, m16
    pmaddubsw m5, m15, m10
    paddw m19, m4
    paddw m20, m5
    REPX {pmulhrsw x, m7}, m17, m18, m19, m20
    mova [r7+wq*0+ 0], m17
    mova [r7+wq*0+64], m18
    mova [r7+wq*1+ 0], m19
    mova [r7+wq*1+64], m20
    lea r7, [r7+wq*2]
    sub hd, 2
    jg .v_loop
    ; Advance to the next 64-pixel column and reload the row count.
    add srcq, 64
    add tmpq, 128
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_loop0
    vzeroupper
    RET
.h:
    ; Horizontal-only filtering is shared with the 8-tap function (.h2 there).
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
.hv:
    ; Combined horizontal + vertical filtering.
    ; m8 = pd_2 (bias for the horizontal dot products),
    ; m9 = pd_32 (bias for the vertical dot products, shifted right by 6).
    vpbroadcastd m8, [pd_2]
    vpbroadcastd m9, [pd_32]
    cmp wd, 4
    jg .hv_w8
    ; w <= 4: horizontal filter uses the 4 bytes at offset +2 of the entry
    ; selected by the low byte of mx.
    movzx mxd, mxb
    vpbroadcastd m11, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m3, [base+subpel_filters+1+myq*8]
    vbroadcasti128 m10, [subpel_h_shufA]
    lea r6, [ssq*2+1]
    mov r3d, 0x30
    sub srcq, r6 ; two rows up, one pixel left
    kmovb k1, r3d
    vpbroadcastq ym2, [srcq+ssq*0]
    lea ss3q, [ssq*3]
    vpbroadcastq m1, [srcq+ssq*1]
    kaddb k2, k1, k1
    vpbroadcastq m2{k1}, [srcq+ssq*2]
    add srcq, ss3q
    vpbroadcastq m1{k2}, [srcq+ssq*0] ; _ _ 1 3
    punpcklbw m3, m3
    vpbroadcastq m2{k2}, [srcq+ssq*1] ; _ 0 2 4
    psraw m3, 8 ; sign-extend
    mova m6, [spel_hv_perm4a]
    kshiftrb k1, k1, 2
    movu m7, [spel_hv_perm4b]
    pshufb m1, m10
    mova m0, m8
    vpdpbusd m0, m1, m11
    pshufb m2, m10
    mova m1, m8
    vpdpbusd m1, m2, m11
    ; m12-m14: the three vertical coefficient pairs as words.
    pshufd m12, m3, q0000
    pshufd m13, m3, q1111
    pshufd m14, m3, q2222
    packssdw m0, m1 ; _ _ _ 0 1 2 3 4
    psraw m0, 2
    vpermb m1, m7, m0 ; 01 12 23 34
.hv_w4_loop:
    ; Four output rows (32 bytes) per iteration.
    movq xm3, [srcq+ssq*2]
    movq xm4, [srcq+ss3q ]
    lea srcq, [srcq+ssq*4]
    vpbroadcastq ym3{k1}, [srcq+ssq*0] ; 5 7
    vpbroadcastq ym4{k1}, [srcq+ssq*1] ; 6 8
    pshufb ym3, ym10
    mova ym2, ym8
    vpdpbusd ym2, ym3, ym11
    pshufb ym4, ym10
    mova ym3, ym8
    vpdpbusd ym3, ym4, ym11
    mova m4, m9
    vpdpwssd m4, m1, m12 ; a0 b0 c0 d0
    packssdw ym2, ym3 ; 5 6 7 8
    psraw ym2, 2
    vshufi32x4 m0, m2, q1032 ; _ 2 3 4 5 6 7 8
    vpermb m2, m6, m0 ; 23 34 45 56
    vpermb m1, m7, m0 ; 45 56 67 78
    vpdpwssd m4, m2, m13 ; a1 b1 c1 d1
    vpdpwssd m4, m1, m14 ; a2 b2 c2 d2
    psrad m4, 6
    vpmovdw [tmpq], m4
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    ; w >= 8: horizontal filter uses all 8 coefficients (m10 = first four,
    ; m11 = last four) of the entry selected by bits 16+ of mx.
    shr mxd, 16
    vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
    vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m0, [base+subpel_filters+1+myq*8]
    lea r6, [ssq*2+3]
    punpcklbw m0, m0
    sub srcq, r6 ; two rows up, three pixels left
    psraw m0, 8 ; sign-extend
    lea ss3q, [ssq*3]
    pshufd m12, m0, q0000
    pshufd m13, m0, q1111
    pshufd m14, m0, q2222
    cmp wd, 8
    jg .hv_w16
    ; w == 8: rows 0-3 in m16, row 4 in xm17.
    movu xm16, [srcq+ssq*0]
    vbroadcasti32x4 m19, [subpel_h_shufA]
    vinserti128 ym16, [srcq+ssq*1], 1
    vbroadcasti32x4 m21, [subpel_h_shufC]
    vinserti32x4 m16, [srcq+ssq*2], 2
    add srcq, ss3q
    vinserti32x4 m16, [srcq+ssq*0], 3
    movu xm17, [srcq+ssq*1]
    vbroadcasti32x4 m20, [subpel_h_shufB]
    pshufb m3, m16, m19 ; 0 1 2 3 0123
    mova m2, m8
    pshufb m0, m16, m21 ; 0 1 2 3 89ab
    vpdpbusd m2, m3, m10
    mova m3, m8
    pshufb xm1, xm17, xm19 ; 4 0123
    vpdpbusd m3, m0, m11
    mova xm0, xm8
    pshufb xm18, xm17, xm21 ; 4 89ab
    vpdpbusd xm0, xm1, xm10
    mova xm1, xm8
    pshufb m16, m20 ; 0 1 2 3 4567
    vpdpbusd xm1, xm18, xm11
    pshufb xm17, xm20 ; 4 4567
    vpdpbusd m2, m16, m11
    vpdpbusd m3, m16, m10
    vpdpbusd xm0, xm17, xm11
    vpdpbusd xm1, xm17, xm10
    packssdw m2, m3
    packssdw xm0, xm1
    psraw m2, 2 ; 0 1 2 3
    psraw xm0, 2 ; 4
    valignq m0, m2, 2 ; 1 2 3 4
    punpcklwd m1, m2, m0 ; 01 12 23 34
    punpckhwd m2, m0
.hv_w8_loop:
    ; Four output rows (64 bytes) per iteration.
    movu xm16, [srcq+ssq*2]
    vinserti128 ym16, [srcq+ss3q ], 1
    lea srcq, [srcq+ssq*4]
    vinserti32x4 m16, [srcq+ssq*0], 2
    vinserti32x4 m16, [srcq+ssq*1], 3
    pshufb m6, m16, m19 ; 5 6 7 8 0123
    mova m5, m8
    pshufb m3, m16, m21 ; 5 6 7 8 89ab
    vpdpbusd m5, m6, m10
    mova m6, m8
    pshufb m16, m20 ; 5 6 7 8 4567
    vpdpbusd m6, m3, m11
    mova m3, m9
    vpdpwssd m3, m1, m12 ; a0 b0 c0 d0
    mova m4, m9
    vpdpwssd m4, m2, m12
    vpdpbusd m5, m16, m11
    vpdpbusd m6, m16, m10
    mova m16, m1
    packssdw m5, m6
    mova m6, m2
    psraw m5, 2 ; 5 6 7 8
    valignq m2, m5, m0, 6 ; 4 5 6 7
    mova m0, m5
    punpcklwd m1, m2, m5 ; 45 56 67 78
    punpckhwd m2, m5
    vpdpwssd m3, m1, m14 ; a2 b2 c2 d2
    vpdpwssd m4, m2, m14
    vshufi32x4 m16, m1, q1032 ; 23 34 45 56
    vshufi32x4 m6, m2, q1032
    vpdpwssd m3, m16, m13 ; a1 b1 c1 d1
    vpdpwssd m4, m6, m13
    psrad m3, 6
    psrad m4, 6
    packssdw m3, m4
    mova [tmpq], m3
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    ; w >= 16: processed in 16-pixel columns. m16/m17/m18 are byte
    ; permutations offset by 0/4/8; wq becomes the output row pitch in bytes;
    ; r6d packs the row count (low byte) with the column counter.
    mova m16, [spel_h_perm16]
    vpbroadcastd m18, [pb_4]
    add wd, wd
    paddb m17, m18, m16
    lea r6d, [hq+wq*8-256]
    paddb m18, m17
.hv_w16_loop0:
    ; Horizontally filter rows 0-4 of this column.
    movu ym19, [srcq+ssq*0]
    vinserti32x8 m19, [srcq+ssq*1], 1
    lea r5, [srcq+ssq*2]
    movu ym20, [r5 +ssq*0]
    vinserti32x8 m20, [r5 +ssq*1], 1
    lea r5, [r5 +ssq*2]
    movu ym21, [r5 +ssq*0]
    mov r7, tmpq
    vpermb m3, m16, m19 ; 0 1 0123 89ab
    mova m2, m8
    vpermb m4, m18, m19 ; 0 1 89ab ghij
    vpdpbusd m2, m3, m10
    mova m3, m8
    vpermb m5, m16, m20 ; 2 3 0123 89ab
    vpdpbusd m3, m4, m11
    mova m4, m8
    vpermb m0, m18, m20 ; 2 3 89ab ghij
    vpdpbusd m4, m5, m10
    mova m5, m8
    vpermb ym1, ym16, ym21 ; 4 0123 89ab
    vpdpbusd m5, m0, m11
    mova ym0, ym8
    vpermb ym6, ym18, ym21 ; 4 89ab ghij
    vpdpbusd ym0, ym1, ym10
    mova ym1, ym8
    vpermb m19, m17, m19 ; 0 1 4567 cdef
    vpdpbusd ym1, ym6, ym11
    vpermb m20, m17, m20 ; 2 3 4567 cdef
    vpdpbusd m2, m19,
m11
; ^ last operand of the instruction that ends the previous source line.
; The lines down to the first RET finish prep_6tap_8bpc's .hv_w16 path.
    vpdpbusd m3, m19, m10
    vpermb ym21, ym17, ym21 ; 4 4567 cdef
    vpdpbusd m4, m20, m11
    vpdpbusd m5, m20, m10
    vpdpbusd ym0, ym21, ym11
    vpdpbusd ym1, ym21, ym10
    packssdw m2, m3 ; 0 1
    packssdw m4, m5 ; 2 3
    packssdw ym0, ym1 ; 4
    REPX {psraw x, 2}, m2, m4, ym0
    vshufi32x4 m3, m2, m4, q1032 ; 1 2
    vshufi32x4 m0, m4, m0, q1032 ; 3 4
    punpcklwd m1, m2, m3 ; 01 12
    punpckhwd m2, m3
    punpcklwd m3, m4, m0 ; 23 34
    punpckhwd m4, m0
.hv_w16_loop:
    ; Two output rows per iteration.
    movu ym19, [r5+ssq*1]
    lea r5, [r5+ssq*2]
    vinserti32x8 m19, [r5+ssq*0], 1
    vpermb m6, m16, m19 ; 5 6 0123 89ab
    mova m5, m8
    vpermb m20, m18, m19 ; 5 6 89ab ghij
    vpdpbusd m5, m6, m10
    mova m6, m8
    vpermb m19, m17, m19 ; 5 6 4567 cdef
    vpdpbusd m6, m20, m11
    mova m20, m9
    vpdpwssd m20, m1, m12 ; a0 b0
    mova m21, m9
    vpdpwssd m21, m2, m12
    vpdpbusd m5, m19, m11
    vpdpbusd m6, m19, m10
    vpdpwssd m20, m3, m13 ; a1 b1
    vpdpwssd m21, m4, m13
    packssdw m5, m6
    mova m1, m3
    psraw m5, 2 ; 5 6
    mova m2, m4
    vshufi32x4 m4, m0, m5, q1032 ; 4 5
    mova m0, m5
    punpcklwd m3, m4, m0 ; 45 56
    punpckhwd m4, m0
    vpdpwssd m20, m3, m14 ; a2 b2
    vpdpwssd m21, m4, m14
    psrad m20, 6
    psrad m21, 6
    packssdw m20, m21
    mova [r7+wq*0], ym20
    vextracti32x8 [r7+wq*1], m20, 1
    lea r7, [r7+wq*2]
    sub hd, 2
    jg .hv_w16_loop
    ; Next 16-pixel column (32 output bytes); reload the row count.
    add srcq, 16
    add tmpq, 32
    movzx hd, r6b
    sub r6d, 1<<8
    jg .hv_w16_loop0
    vzeroupper
    RET

; PREP_8TAP_H: horizontal 8-tap filter of the two source vectors in m0/m1.
; Expects: m4 = dword bias, m5/m6/m7 = byte permutations, m8/m9 = first/last
; four filter coefficients. Clobbers m2, m3 and m10-m15.
; Stores 2 x 64 bytes of 16-bit results at tmpq (tmpq itself is not advanced).
%macro PREP_8TAP_H 0
    vpermb m10, m5, m0
    vpermb m11, m5, m1
    vpermb m12, m6, m0
    vpermb m13, m6, m1
    vpermb m14, m7, m0
    vpermb m15, m7, m1
    mova m0, m4
    vpdpbusd m0, m10, m8
    mova m2, m4
    vpdpbusd m2, m12, m8
    mova m1, m4
    vpdpbusd m1, m11, m8
    mova m3, m4
    vpdpbusd m3, m13, m8
    vpdpbusd m0, m12, m9
    vpdpbusd m2, m14, m9
    vpdpbusd m1, m13, m9
    vpdpbusd m3, m15, m9
    packssdw m0, m2
    packssdw m1, m3
    psraw m0, 2
    psraw m1, 2
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
%endmacro

; Entry-point declarations for the filter-type pairs that involve SHARP in the
; second position.
; NOTE(review): PREP_8TAP_FN is defined outside this block; the trailing
; prep_8tap_8bpc argument presumably names the function each entry forwards to.
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc
PREP_8TAP_FN sharp, SHARP, SHARP

;------------------------------------------------------------------------------
; prep_8tap_8bpc: subpel "prep" filter using four coefficient pairs (8 taps)
; vertically; stores 16-bit results to tmpq.
;
; Arguments (cglobal names): tmp, src, stride, w, h, mx, my.
; stride3 is a scratch register that holds stride*3.
;
; Dispatch:
;   mx fraction != 0  -> .h  (-> .hv if my != 0, else .h2)
;   my fraction != 0  -> .v
;   both zero         -> prep_6tap_8bpc's .prep path
; .h2 is also entered from prep_6tap_8bpc for horizontal-only filtering.
;------------------------------------------------------------------------------
cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my, stride3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
.v:
    movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
    shr myd, 16 ; Note that the code is 8-tap only, having
    cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
    cmove myd, mxd ; had a negligible effect on performance.
    tzcnt r5d, wd
    lea myq, [base+subpel_filters+myq*8]
    movzx r5d, word [r7+r5*2+table_offset(prep, _8tap_v)]
    vpbroadcastd m7, [pw_8192] ; multiplier for the final pmulhrsw rounding
    ; m8-m11: the four coefficient pairs of the selected filter.
    vpbroadcastw m8, [myq+0]
    add r5, r7
    vpbroadcastw m9, [myq+2]
    lea stride3q, [strideq*3]
    vpbroadcastw m10, [myq+4]
    sub srcq, stride3q ; start three rows above the block
    vpbroadcastw m11, [myq+6]
    jmp r5
.v_w4:
    ; Preload rows 0-6 as interleaved row pairs.
    movd xmm0, [srcq+strideq*0]
    vpbroadcastd ymm1, [srcq+strideq*2]
    vpbroadcastd xmm2, [srcq+strideq*1]
    vpbroadcastd ymm3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
    vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
    vpbroadcastd ymm0, [srcq+strideq*0]
    vpbroadcastd ymm2, [srcq+strideq*1]
    vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
    vpbroadcastd ymm0, [srcq+strideq*2]
    vbroadcasti128 ymm5, [deint_shuf4]
    vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
    vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
    vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
    punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
    vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
    punpckhbw ymm2, ymm3 ; 23 34 45 56
.v_w4_loop:
    ; Four output rows (32 bytes) per iteration.
    pinsrd xmm0, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    vpbroadcastd ymm3, [srcq+strideq*0]
    vpbroadcastd ymm4, [srcq+strideq*1]
    vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
    vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
    vpbroadcastd ymm0, [srcq+strideq*2]
    vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
    pshufb ymm3, ymm5 ; 67 78 89 9a
    pmaddubsw ymm4, ymm1, ym8
    vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
    pmaddubsw ymm2, ym9
    paddw ymm4, ymm2
    mova ymm2, ymm3
    pmaddubsw ymm3, ym11
    paddw ymm3, ymm4
    pmaddubsw ymm4, ymm1, ym10
    paddw ymm3, ymm4
    pmulhrsw ymm3, ym7
    mova [tmpq], ymm3
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    ; Preload rows 0-6; k1/k2 are lane masks for the merging unpacks.
    mova m6, [spel_v_perm8]
    movq xm1, [srcq+strideq*0]
    mov r6d, 0x3e
    movq xm2, [srcq+strideq*1]
    vpbroadcastq ym3, [srcq+strideq*2]
    kmovb k1, r6d
    vpbroadcastq ym4, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vpunpcklqdq m1{k1}, m3, [srcq+strideq*0] {1to8}
    vpunpcklqdq m2{k1}, m4, [srcq+strideq*1] {1to8}
    movq xm0, [srcq+strideq*2]
    kshiftlb k2, k1, 2
    shufpd m1, m2, 0x30 ; 0 1 2 3 4 5
    vshufi32x4 m2, m1, m0, q0021 ; 2 3 4 5 6 _
    vpermb m1, m6, m1 ; 01 12 23 34
    vpermb m2, m6, m2 ; 23 34 45 56
.v_w8_loop:
    ; Four output rows (64 bytes) per iteration.
    vpbroadcastq ym3, [srcq+strideq*4]
    vpunpcklqdq ym0{k1}, ym3, [srcq+stride3q] {1to4}
    lea srcq, [srcq+strideq*4]
    vpbroadcastq m3, [srcq+strideq*2]
    vpunpcklqdq m0{k2}, m3, [srcq+strideq*1] {1to8}
    pmaddubsw m4, m1, m8 ; a0 b0 c0 d0
    mova m1, m2
    pmaddubsw m5, m2, m9 ; a1 b1 c1 d1
    vpermb m2, m6, m0 ; 67 78 89 9a
    mova xm0, xm3
    vshufi32x4 m1, m2, q1032 ; 45 56 67 78
    pmaddubsw m3, m2, m11 ; a3 b3 c3 d3
    paddw m4, m5
    pmaddubsw m5, m1, m10 ; a2 b2 c2 d2
    paddw m4, m3
    paddw m4, m5
    pmulhrsw m4, m7
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    ; Preload rows 0-6 as row pairs.
    mova m12, [spel_v_perm16b]
    vbroadcasti32x4 m1, [srcq+strideq*0]
    mov r6d, 0x0f
    vbroadcasti32x4 ym4, [srcq+strideq*1]
    vbroadcasti32x4 m2, [srcq+strideq*2]
    kmovb k1, r6d
    vbroadcasti32x4 ym5, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vbroadcasti32x4 m3, [srcq+strideq*0]
    vbroadcasti32x4 ym6, [srcq+strideq*1]
    vbroadcasti32x4 m0, [srcq+strideq*2]
    vshufpd m1{k1}, m4, m2, 0xcc
    vshufpd m2{k1}, m5, m3, 0xcc
    vshufpd m3{k1}, m6, m0, 0xcc
    vpermb m1, m12, m1 ; 01 12
    vpermb m2, m12, m2 ; 23 34
    vpermb m3, m12, m3 ; 45 56
.v_w16_loop:
    ; Four output rows (2 x 64 bytes) per iteration.
    pmaddubsw m4, m1, m8 ; a0 b0
    mova m1, m3
    pmaddubsw m13, m2, m9 ; a1 b1
    vbroadcasti32x4 ym6, [srcq+stride3q ]
    pmaddubsw m5, m2, m8 ; c0 d0
    lea srcq, [srcq+strideq*4]
    pmaddubsw m14, m3, m9 ; c1 d1
    vbroadcasti32x4 m3, [srcq+strideq*0]
    vshufpd m0{k1}, m6, m3, 0xcc
    vbroadcasti32x4 ym6, [srcq+strideq*1]
    vpermb m2, m12, m0 ; 67 78
    vbroadcasti32x4 m0, [srcq+strideq*2]
    vshufpd m3{k1}, m6, m0, 0xcc
    paddw m4, m13
    pmaddubsw m13, m1, m10 ; a2 b2
    vpermb m3, m12, m3 ; 89 9a
    paddw m5, m14
    pmaddubsw m14, m2, m10 ; c2 d2
    pmaddubsw m15, m2, m11 ; a3 b3
    pmaddubsw m6, m3, m11 ; c3 d3
    paddw m4, m13
    paddw m5, m14
    paddw m4, m15
    paddw m5, m6
    pmulhrsw m4, m7
    pmulhrsw m5, m7
    mova [tmpq+ 0], m4
    mova [tmpq+64], m5
    add tmpq, 64*2
    sub hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    ; Preload rows 0-6 and interleave adjacent rows.
    movshdup m21, [bilin_v_perm64]
    movu ym16, [srcq+strideq*0]
    movu ym17, [srcq+strideq*1]
    movu ym18, [srcq+strideq*2]
    add srcq, stride3q
    movu ym19, [srcq+strideq*0]
    vpermt2q m16, m21, m19 ; 0 3
    movu ym20, [srcq+strideq*1]
    vpermt2q m17, m21, m20 ; 1 4
    movu ym20, [srcq+strideq*2]
    add srcq, stride3q
    vpermt2q m18, m21, m20 ; 2 5
    movu ym20, [srcq+strideq*0]
    vpermt2q m19, m21, m20 ; 3 6
    punpcklbw m0, m16, m17 ; 01
    punpcklbw m1, m17, m18 ; 12
    punpcklbw m2, m18, m19 ; 23
    punpckhbw m3, m16, m17 ; 34
    punpckhbw m4, m17, m18 ; 45
    punpckhbw m5, m18, m19 ; 56
.v_w32_loop:
    ; Two output rows (2 x 64 bytes) per iteration.
    movu ym16, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    movu ym17, [srcq+strideq*0]
    pmaddubsw m14, m0, m8
    mova m0, m2
    pmaddubsw m15, m1, m8
    mova m1, m3
    pmaddubsw m2, m9
    vpermt2q m16, m21, m17 ; 7 8
    pmaddubsw m3, m9
    pmaddubsw m12, m4, m10
    pmaddubsw m13, m5, m10
    shufpd m19, m16, 0x55 ; 6 7
    paddw m14, m2
    mova m2, m4
    punpcklbw m4, m19, m16 ; 67
    paddw m15, m3
    mova m3, m5
    punpckhbw m5, m19, m16 ; 78
    paddw m14, m12
    paddw m15, m13
    pmaddubsw m12, m4, m11
    pmaddubsw m13, m5, m11
    mova m19, m16
    paddw m14, m12
    paddw m15, m13
    pmulhrsw m14, m7
    pmulhrsw m15, m7
    mova [tmpq+ 0], m14
    mova [tmpq+64], m15
    add tmpq, 64*2
    sub hd, 2
    jg .v_w32_loop
    vzeroupper
    RET
.v_w64:
.v_w128:
    ; Processed in 64-pixel columns. After doubling, wq is the byte offset
    ; between consecutive output rows; r6d packs the row count (low byte) with
    ; the column counter (upper bits).
    WIN64_SPILL_XMM 24
    mova m23, [bilin_v_perm64]
    add wd, wd
    lea r6d, [hq+wq]
.v_loop0:
    ; Preload rows 0-6 of this column; m0-m5 hold the low halves of the row
    ; pairs, m12-m17 the high halves.
    vpermq m12, m23, [srcq+strideq*0]
    vpermq m13, m23, [srcq+strideq*1]
    lea r5, [srcq+strideq*2]
    vpermq m14, m23, [r5 +strideq*0]
    vpermq m15, m23, [r5 +strideq*1]
    lea r5, [r5+strideq*2]
    vpermq m16, m23, [r5 +strideq*0]
    vpermq m17, m23, [r5 +strideq*1]
    lea r5, [r5+strideq*2]
    vpermq m18, m23, [r5 +strideq*0]
    mov r7, tmpq
    punpcklbw m0, m12, m13 ; 01
    punpckhbw m12, m13
    punpcklbw m1, m13, m14 ; 12
    punpckhbw m13, m14
    punpcklbw m2, m14, m15 ; 23
    punpckhbw m14, m15
    punpcklbw m3, m15, m16 ; 34
    punpckhbw m15, m16
    punpcklbw m4, m16, m17 ; 45
    punpckhbw m16, m17
    punpcklbw m5, m17, m18 ; 56
    punpckhbw m17, m18
.v_loop:
    ; Two output rows per iteration.
    pmaddubsw m19, m0, m8 ; a0
    vpermq m6, m23, [r5+strideq*1]
    pmaddubsw m20, m12, m8
    mova m0, m2
    pmaddubsw m2, m9 ; a1
    mova m12, m14
    pmaddubsw m14, m9
    lea r5, [r5+strideq*2]
    pmaddubsw m21, m1, m8 ; b0
    pmaddubsw m22, m13, m8
    mova m1, m3
    pmaddubsw m3, m9 ; b1
    mova m13, m15
    pmaddubsw m15, m9
    paddw m19, m2
    mova m2, m4
    pmaddubsw m4, m10 ; a2
    paddw m20, m14
    mova m14, m16
    pmaddubsw m16, m10
    paddw m21, m3
    mova m3, m5
    pmaddubsw m5, m10 ; b2
    paddw m22, m15
    mova m15, m17
    pmaddubsw m17, m10
    paddw m19, m4
    punpcklbw m4, m18, m6 ; 67
    paddw m20, m16
    punpckhbw m16, m18, m6
    vpermq m18, m23, [r5+strideq*0]
    paddw m21, m5
    pmaddubsw m5, m4, m11 ; a3
    paddw m22, m17
    pmaddubsw m17, m16, m11
    paddw m19, m5
    punpcklbw m5, m6, m18 ; 78
    paddw m20, m17
    punpckhbw m17, m6, m18
    pmaddubsw m6, m5, m11 ; b3
    paddw m21, m6
    pmaddubsw m6, m17, m11
    paddw m22, m6
    REPX {pmulhrsw x, m7}, m19, m20, m21, m22
    mova [r7+wq*0+ 0], m19
    mova [r7+wq*0+64], m20
    mova [r7+wq*1+ 0], m21
    mova [r7+wq*1+64], m22
    lea r7, [r7+wq*2]
    sub hd, 2
    jg .v_loop
    ; Advance to the next 64-pixel column and reload the row count.
    add srcq, 64
    add tmpq, 128
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_loop0
    RET
.h:
    RESET_STACK_STATE
    test myd, 0xf00
    jnz .hv
.h2:
    ; Horizontal-only filtering; m4 = pd_2 is the bias for the dot products,
    ; whose results are shifted right by 2.
    vpbroadcastd m4, [pd_2]
    cmp wd, 4
    je .h_w4
    ; w >= 8: all 8 coefficients (m8 = first four, m9 = last four) of the entry
    ; selected by bits 16+ of mx; source pointer moved three pixels left.
    tzcnt wd, wd
    shr mxd, 16
    sub srcq, 3
    movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
    vpbroadcastd m8, [base+subpel_filters+mxq*8+0]
    vpbroadcastd m9, [base+subpel_filters+mxq*8+4]
    add wq, r7
    jmp wq
.h_w4:
    ; w == 4: the 4 bytes at offset +2 of the entry selected by the low byte of
    ; mx; source pointer moved one pixel left.
    movzx mxd, mxb
    vbroadcasti128 ym5, [subpel_h_shufA]
    mov r3d, 0x4
    dec srcq
    vpbroadcastd ym6, [base+subpel_filters+mxq*8+2]
    kmovb k1, r3d
    lea stride3q, [strideq*3]
.h_w4_loop:
    ; Four output rows (32 bytes) per iteration.
    movq xm2, [srcq+strideq*0]
    movq xm3, [srcq+strideq*1]
    vpbroadcastq ym2{k1}, [srcq+strideq*2]
    vpbroadcastq ym3{k1}, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    pshufb ym2, ym5
    pshufb ym3, ym5
    mova ym0, ym4
    vpdpbusd ym0, ym2, ym6
    mova ym1, ym4
    vpdpbusd ym1, ym3, ym6
    packssdw ym0, ym1
    psraw ym0, 2
    mova [tmpq], ym0
    add tmpq, 32
    sub hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    vbroadcasti128 m5, [subpel_h_shufA]
    vbroadcasti128 m6, [subpel_h_shufB]
    vbroadcasti128 m7, [subpel_h_shufC]
    lea stride3q, [strideq*3]
.h_w8_loop:
    ; Four output rows (64 bytes) per iteration, one row per 128-bit lane.
    movu xmm3, [srcq+strideq*0]
    vinserti128 ym3, ymm3, [srcq+strideq*1], 1
    vinserti128 m3, [srcq+strideq*2], 2
    vinserti128 m3, [srcq+stride3q ], 3
    lea srcq, [srcq+strideq*4]
    pshufb m1, m3, m5
    pshufb m2, m3, m6
    mova m0, m4
    vpdpbusd m0, m1, m8
    mova m1, m4
    vpdpbusd m1, m2, m8
    pshufb m3, m7
    vpdpbusd m0, m2, m9
    vpdpbusd m1, m3, m9
    packssdw m0, m1
    psraw m0, 2
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .h_w8_loop
    RET
.h_w16:
    ; m5/m6/m7 = permutation, permutation+4, permutation+8 (bytes).
    mova m5, [spel_h_perm16]
    vpbroadcastd m7, [pb_4]
    lea stride3q, [strideq*3]
    paddb m6, m7, m5
    paddb m7, m6
.h_w16_loop:
    ; Four output rows per iteration.
    movu ym0, [srcq+strideq*0]
    movu ym1, [srcq+strideq*2]
    vinserti32x8 m0, [srcq+strideq*1], 1
    vinserti32x8 m1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    PREP_8TAP_H
    add tmpq, 64*2
    sub hd, 4
    jg .h_w16_loop
    RET
.h_w32:
    mova m5, [spel_h_perm32]
    vpbroadcastd m7, [pb_4]
    paddb m6, m7, m5
    paddb m7, m6
.h_w32_loop:
    ; Two output rows per iteration.
    movu m0, [srcq+strideq*0]
    movu m1, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    PREP_8TAP_H
    add tmpq, 64*2
    sub hd, 2
    jg .h_w32_loop
    RET
.h_w64:
    ; r6 is the (non-positive) starting column offset: 0 for w64, -64 for w128.
    xor r6d, r6d
    jmp .h_start
.h_w128:
    mov r6, -64*1
.h_start:
    mova m5, [spel_h_perm32]
    vpbroadcastd m7, [pb_4]
    sub srcq, r6 ; bias srcq so that [srcq+r5] starts at the row's first column
    paddb m6, m7, m5
    paddb m7, m6
.h_loop0:
    ; One source row per outer iteration.
    mov r5, r6
.h_loop:
    ; 64 pixels per inner iteration; runs while the offset is <= 0.
    movu m0, [srcq+r5+32*0]
    movu m1, [srcq+r5+32*1]
    PREP_8TAP_H
    add tmpq, 64*2
    add r5, 64
    jle .h_loop
    add srcq, strideq
    dec hd
    jg .h_loop0
    RET
.hv:
    ; Combined horizontal + vertical filtering.
    ; m8 = pd_2 (horizontal bias), m9 = pd_32 (vertical bias, shifted by 6).
    RESET_STACK_STATE
    vpbroadcastd m8, [pd_2]
    vpbroadcastd m9, [pd_32]
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    dec srcq
    vpbroadcastd m11, [base+subpel_filters+mxq*8+2]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m0, [base+subpel_filters+myq*8]
    lea stride3q, [strideq*3]
    sub srcq, stride3q
    ; k1/k2/k3 select qword lanes 2, 4 and 6 for the masked row broadcasts.
    mov r3d, 0x04
    kmovb k1, r3d
    kshiftlb k2, k1, 2
    kshiftlb k3, k1, 4
    vbroadcasti128 m10, [subpel_h_shufA]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    ; m12-m15: the four vertical coefficient pairs as words.
    pshufd m12, m0, q0000
    pshufd m13, m0, q1111
    pshufd m14, m0, q2222
    pshufd m15, m0, q3333
    movq xm3, [srcq+strideq*0]
    vpbroadcastq ym2, [srcq+strideq*1]
    vpbroadcastq ym3{k1}, [srcq+strideq*2]
    vpbroadcastq m2{k2}, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vpbroadcastq m3{k2}, [srcq+strideq*0]
    vpbroadcastq m2{k3}, [srcq+strideq*1]
    vpbroadcastq m3{k3}, [srcq+strideq*2]
    mova m6, [spel_hv_perm4a]
    movu m7, [spel_hv_perm4b]
    mova m0, m8
    mova m1, m8
    pshufb m2, m10
    pshufb m3, m10
    vpdpbusd m0, m2, m11
    vpdpbusd m1, m3, m11
    packssdw m0, m1 ; _ 0 1 2 3 4 5 6
    psraw m0, 2
    vpermb m1, m6, m0 ; 01 12 23 34
    vpermb m2, m7, m0 ; 23 34 45 56
.hv_w4_loop:
    ; Four output rows (32 bytes) per iteration.
    movq xm3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    movq xm4, [srcq+strideq*0]
    vpbroadcastq ym3{k1}, [srcq+strideq*1]
    vpbroadcastq ym4{k1}, [srcq+strideq*2]
    mova m5, m9
    pshufb ym3, ym10
    vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
    mova ym1, ym8
    pshufb ym4, ym10
    vpdpbusd ym1, ym3, ym11
    mova ym3, ym8
    vpdpbusd ym3, ym4, ym11
    vpdpwssd m5, m2, m13 ; a1 b1 c1 d1
    packssdw ym1, ym3 ; 7 8 9 a
    psraw ym1, 2
    vshufi32x4 m0, m1, q1032 ; _ 4 5 6 7 8 9 a
    vpermb m1, m6, m0 ; 45 56 67 78
    vpermb m2, m7, m0 ; 67 78 89 9a
    vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
    vpdpwssd m5, m2, m15 ; a3 b3 c3 d3
    psrad m5, 6
    vpmovdw [tmpq], m5
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    ; w >= 8: full 8-coefficient horizontal filter (m10/m11).
    shr mxd, 16
    sub srcq, 3
    vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
    vpbroadcastd m11, [base+subpel_filters+mxq*8+4]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    vpbroadcastq m0, [base+subpel_filters+myq*8]
    lea stride3q, [strideq*3]
    sub srcq, stride3q
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m12, m0, q0000
    pshufd m13, m0, q1111
    pshufd m14, m0, q2222
    pshufd m15, m0, q3333
    cmp wd, 8
    jg .hv_w16
    ; w == 8: rows 0-3 in m16, rows 3-6 in m17.
    vbroadcasti32x4 m17, [srcq+stride3q ]
    vinserti32x4 m16, m17, [srcq+strideq*0], 0
    vbroadcasti32x4 m19, [subpel_h_shufA]
    vinserti32x4 m16, [srcq+strideq*1], 1
    vbroadcasti32x4 m21, [subpel_h_shufC]
    vinserti32x4 m16, [srcq+strideq*2], 2
    lea srcq, [srcq+strideq*4]
    vinserti128 ym17, [srcq+strideq*0], 1
    vbroadcasti32x4 m20, [subpel_h_shufB]
    vinserti32x4 m17, [srcq+strideq*1], 2
    vinserti32x4 m17, [srcq+strideq*2], 3
    pshufb m3, m16, m19 ; 0 1 2 3 0123
    mova m2, m8
    pshufb m0, m16, m21 ; 0 1 2 3 89ab
    vpdpbusd m2, m3, m10
    mova m3, m8
    pshufb m1, m17, m19 ; 3 4 5 6 0123
    vpdpbusd m3, m0, m11
    mova m0, m8
    pshufb m4, m17, m21 ; 3 4 5 6 89ab
    vpdpbusd m0, m1, m10
    mova m1, m8
    pshufb m16, m20 ; 0 1 2 3 4567
    vpdpbusd m1, m4, m11
    pshufb m17, m20 ; 3 4 5 6 4567
    vpdpbusd m2, m16, m11
    vpdpbusd m3, m16, m10
    vpdpbusd m0, m17, m11
    vpdpbusd m1, m17, m10
    packssdw m2, m3
    packssdw m0, m1
    psraw m2, 2 ; 0 1 2 3
    psraw m0, 2 ; 3 4 5 6
    vshufi32x4 m4, m2, m0, q2132 ; 2 3 4 5
    vshufi32x4 m5, m2, m0, q1021 ; 1 2 3 4
    punpcklwd m3, m4, m0 ; 23 34 45 56
    punpckhwd m4, m0
    punpcklwd m1, m2, m5 ; 01 12 23 34
    punpckhwd m2, m5
.hv_w8_loop:
    ; Four output rows (64 bytes) per iteration.
    movu xm18, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vinserti128 ym18, [srcq+strideq*0], 1
    vinserti32x4 m18, [srcq+strideq*1], 2
    vinserti32x4 m18, [srcq+strideq*2], 3
    pshufb m17, m18, m19 ; 7 8 9 a 0123
    mova m16, m8
    pshufb m5, m18, m21 ; 7 8 9 a 89ab
    vpdpbusd m16, m17, m10
    mova m17, m8
    pshufb m18, m20 ; 7 8 9 a 4567
    vpdpbusd m17, m5, m11
    mova m5, m9
    vpdpwssd m5, m3, m13 ; a1 b1 c1 d1
    mova m6, m9
    vpdpwssd m6, m4, m13
    vpdpbusd m16, m18, m11
    vpdpbusd m17, m18, m10
    vpdpwssd m5, m1, m12 ; a0 b0 c0 d0
    mova m1, m3
    vpdpwssd m6, m2, m12
    mova m2, m4
    packssdw m16, m17
    psraw m16, 2 ; 7 8 9 a
    valignq m4, m16, m0, 6 ; 6 7 8 9
    mova m0, m16
    punpcklwd m3, m4, m16 ; 67 78 89 9a
    punpckhwd m4, m16
    vpdpwssd m5, m3, m15 ; a3 b3 c3 d3
    vpdpwssd m6, m4, m15
    vshufi32x4 m1, m3, q1032 ; 45 56 67 78
    vshufi32x4 m2, m4, q1032
    vpdpwssd m5, m1, m14 ; a2 b2 c2 d2
    vpdpwssd m6, m2, m14
    psrad m5, 6
    psrad m6, 6
    packssdw m5, m6
    mova [tmpq], m5
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    ; w >= 16: processed in 16-pixel columns. m16/m17/m18 are byte
    ; permutations offset by 0/4/8; wq becomes the output row pitch in bytes;
    ; r6d packs the row count (low byte) with the column counter.
    WIN64_SPILL_XMM 23
    mova m16, [spel_h_perm16]
    vpbroadcastd m18, [pb_4]
    add wd, wd
    paddb m17, m18, m16
    lea r6d, [hq+wq*8-256]
    paddb m18, m17
.hv_w16_loop0:
    ; Horizontally filter rows 0-6 of this column.
    movu ym19, [srcq+strideq*0]
    vinserti32x8 m19, [srcq+strideq*1], 1
    lea r5, [srcq+strideq*2]
    movu ym20, [r5 +strideq*0]
    vinserti32x8 m20, [r5 +strideq*1], 1
    lea r5, [r5 +strideq*2]
    movu ym21, [r5 +strideq*0]
    vinserti32x8 m21, [r5 +strideq*1], 1
    lea r5, [r5 +strideq*2]
    movu ym22, [r5 +strideq*0]
    mov r7, tmpq
    vpermb m3, m16, m19 ; 0 1 0123 89ab
    mova m2, m8
    vpermb m4, m18, m19 ; 0 1 89ab ghij
    vpdpbusd m2, m3, m10
    mova m3, m8
    vpermb m5, m16, m20 ; 2 3 0123 89ab
    vpdpbusd m3, m4, m11
    mova m4, m8
    vpermb m6, m18, m20 ; 2 3 89ab ghij
    vpdpbusd m4, m5, m10
    mova m5, m8
    vpermb m7, m16, m21 ; 4 5 0123 89ab
    vpdpbusd m5, m6, m11
    mova m6, m8
    vpermb m0, m18, m21 ; 4 5 89ab ghij
    vpdpbusd m6, m7, m10
    mova m7, m8
    vpermb ym1, ym16, ym22 ; 6 0123 89ab
    vpdpbusd m7, m0, m11
    mova ym0, ym8
    vpermb m19, m17, m19 ; 0 1 4567 cdef
    vpdpbusd ym0, ym1, ym10
    vpermb ym1, ym18, ym22 ; 6 89ab ghij
    vpdpbusd m2, m19, m11
    vpdpbusd m3, m19, m10
    mova ym19, ym8
    vpermb m20, m17, m20 ; 2 3 4567 cdef
    vpdpbusd ym19, ym1, ym11
    vpermb m21, m17, m21 ; 4 5 4567 cdef
    vpdpbusd m4, m20, m11
    vpdpbusd m5, m20, m10
    vpermb ym22, ym17, ym22 ; 6 4567 cdef
    vpdpbusd m6, m21, m11
    vpdpbusd m7, m21, m10
    packssdw m2, m3 ; 0 1
    vpdpbusd ym0, ym22, ym11
    packssdw m4, m5 ; 2 3
    vpdpbusd ym19, ym22, ym10
    packssdw m6, m7 ; 4 5
    packssdw ym0, ym19 ; 6
    REPX {psraw x, 2}, m2, m4, m6, ym0
    vshufi32x4 m3, m2, m4, q1032 ; 1 2
    vshufi32x4 m5, m4, m6, q1032 ; 3 4
    vshufi32x4 m0, m6, m0, q1032 ; 5 6
    punpcklwd m1, m2, m3 ; 01 12
    punpckhwd m2, m3
    punpcklwd m3, m4, m5 ; 23 34
    punpckhwd m4, m5
    punpcklwd m5, m6, m0 ; 45 56
    punpckhwd m6, m0
.hv_w16_loop:
    ; Two output rows per iteration.
    movu ym19, [r5+strideq*1]
    lea r5, [r5+strideq*2]
    vinserti32x8 m19, [r5+strideq*0], 1
    mova m20, m9
    vpdpwssd m20, m1, m12 ; a0
    vpermb m1, m16, m19
    mova m21, m9
    vpdpwssd m21, m2, m12 ; b0
    vpermb m2, m17, m19
    mova m22, m8
    vpdpbusd m22, m1, m10
    mova m1, m8
    vpermb m19, m18, m19
    vpdpbusd m1,
m2, m10
; The instructions down to the first RET finish the .hv_w16 row loop of a
; routine whose cglobal header lies above this chunk.
    vpdpwssd m20, m3, m13 ; a1
    vpdpwssd m21, m4, m13 ; b1
    vpdpbusd m22, m2, m11
    mova m2, m4
    vpdpbusd m1, m19, m11
    mova m4, m6
    vpdpwssd m20, m5, m14 ; a2
    vpdpwssd m21, m6, m14 ; b2
    packssdw m22, m1
    mova m1, m3
    psraw m22, 2 ; 7 8
    mova m3, m5
    vshufi32x4 m6, m0, m22, q1032 ; 6 7
    mova m0, m22
    punpcklwd m5, m6, m0 ; 67 78
    punpckhwd m6, m0
    vpdpwssd m20, m5, m15 ; a3
    vpdpwssd m21, m6, m15 ; b3
    psrad m20, 6
    psrad m21, 6
    packssdw m20, m21
    mova [r7+wq*0], ym20
    vextracti32x8 [r7+wq*1], m20, 1
    lea r7, [r7+wq*2]
    sub hd, 2 ; two output rows per pass
    jg .hv_w16_loop
    add srcq, 16 ; step to the next 16-pixel column strip
    add tmpq, 32
    movzx hd, r6b ; reload the row count kept in the low byte of r6d
    sub r6d, 1<<8 ; bits 8 and up of r6d count the remaining strips
    jg .hv_w16_loop0
    RET

; Warp-affine 8x8 variant that stores through tmpq/tsq. It reuses the .main and
; .main2 bodies of warp_affine_8x8_8bpc (reached through their mangled names)
; and differs only in the accumulator seed (pd_16384), the final permute table
; (warp_8x8t_end) and the 16-byte-per-row stores.
cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts
    vpbroadcastd m9, [pd_16384] ; seed copied into m16 by .main2 before accumulating
    mova ym15, [warp_8x8t_end]
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main
    jmp .start
.loop:
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2
    lea tmpq, [tmpq+tsq*4]
.start:
    paddd m16, m16 ; double the sums before the byte permute
    vpermb m16, m15, m16 ; NOTE(review): presumably picks the upper 16 bits of each dword - table not in view
    mova [tmpq+tsq*0], xm16
    vextracti128 [tmpq+tsq*2], ym16, 1
    ; r6d was set to 0x5555 by .main: subtracting 0x1800 stays positive three
    ; times, so the store block runs four times in total.
    sub r6d, 0x1800
    jg .loop
    RET

; Warp-affine 8x8, 8-bit output: every pass through .start writes two rows of
; eight bytes (dstq+dsq*0 and dstq+dsq*1).
; .main  - one-time setup plus the first rows of horizontal filtering,
;          falling through into .main2.
; .main2 - filters two more source rows horizontally (via .h) and produces
;          the vertical sums for the next pair of output rows in m16.
; .h     - horizontal filter for two source rows; result in m0.
cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter
    vpbroadcastd m9, [pd_262144] ; seed for the vertical accumulator (1 << 18, half of the >> 19 below)
    mova xm15, [warp_8x8_end]
    call .main
    jmp .start
.loop:
    call .main2
    lea dstq, [dstq+dsq*2]
.start:
    psrad m16, 19
    packuswb m16, m16
    vpermb m16, m15, m16
    movq [dstq+dsq*0], xm16
    movhps [dstq+dsq*1], xm16
    sub r6d, 0x1800 ; 0x5555 from .main: four passes in total
    jg .loop
    RET
ALIGN function_align
.main:
    vpbroadcastd m1, [pd_512]
%if WIN64
    mov abcdq, r5mp
    vpaddd ym18, ym1, r6m {1to8} ; mx
%else
    ; NOTE(review): r5d presumably still holds mx (sixth argument register) here
    add r5d, 512
    vpbroadcastd ym18, r5d
%endif
    vpaddd ym20, ym1, r7m {1to8} ; my
    mova ym16, [pd_0to7]
    vpbroadcastd ym19, [abcdq+4*0]
    vpbroadcastd ym21, [abcdq+4*1]
    lea r4, [ssq*3+3]
    mova m10, [warp_8x8_permA]
    mov r6d, 0x5555 ; doubles as the k1 lane mask and the caller's loop counter
    mova m11, [warp_8x8_permB]
    lea filterq, [mc_warp_filter+64*8] ; gathers below index this with (position >> 10) * 8
    vpbroadcastq m12, [warp_8x8_hpack]
    sub srcq, r4 ; src -= src_stride*3 + 3
    vbroadcasti32x4 m13, [warp_8x8_permC]
    kxnorb k2, k2, k2 ; all-ones gather mask for eight lanes
    vbroadcasti32x4 m14, [warp_8x8_permD]
    ; The dot product multiplies the low word of each abcd dword by the lane
    ; value from pd_0to7 and adds it to mx / my.
    vpdpwssd ym18, ym19, ym16 ; alpha
    vpdpwssd ym20, ym21, ym16 ; gamma
    vbroadcasti32x4 m0, [srcq]
    psrad ym19, 16 ; beta
    psrad ym21, 16 ; delta
    kmovw k1, r6d
    psrad ym16, ym18, 10
    ; A gather clears its mask register, so k2 and k3 are copied back and forth
    ; before every vpgatherdq.
    kmovb k3, k2
    paddd ym18, ym19
    vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0
    psrld m1, 8 ; pd_2
    pshufb m0, m11
    paddd m8, m1, m1 ; pd_4
    vpdpbusd m1, m0, m2
    call .h
    psllq m2, m1, 45
    pslld m1, 13
    paddd m1, m2
    vpshrdq m1, m0, 48 ; 01 12
    call .h
    vpshrdq m2, m1, m0, 48 ; 23 34
    call .h
    vpshrdq m3, m2, m0, 48 ; 45 56
.main2:
    call .h
    psrad ym17, ym20, 10
    kmovb k2, k3
    paddd ym20, ym21
    vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0
    psrad ym16, ym20, 10
    kmovb k3, k2
    paddd ym20, ym21
    vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1
    shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3
    mova m16, m9 ; start the vertical sum from the caller's seed
    pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1
    vpdpwssd m16, m1, m4
    pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3
    mova m1, m2 ; slide the row-pair window down by two rows
    vpdpwssd m16, m2, m5
    shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7
    mova m2, m3
    pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5
    vpdpwssd m16, m3, m4
    vpshrdq m3, m0, 48 ; 67 78
    pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7
    vpdpwssd m16, m3, m5
    ret
ALIGN function_align
.h:
    ; Loads the next two source rows (srcq advances by two strides) and the two
    ; matching sets of horizontal filter coefficients.
    movu xm5, [srcq+ssq*1]
    psrad ym16, ym18, 10
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym5, [srcq+ssq*0], 1
    kmovb k2, k3
    paddd ym18, ym19
    vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1
    psrad ym17, ym18, 10
    kmovb k3, k2
    paddd ym18, ym19
    vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2
    mova m0, m8 ; accumulator starts at pd_4
    vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7
    vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3
    vpdpbusd m0, m4, m17
    vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb
    vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7
    vpdpbusd m0, m5, m16
    vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3)
    ret

; Shared store epilogue for the bidirectional compound functions. %1 names a
; macro that leaves packed output bytes in m0 (invoked as "%1 offset"), and
; %1_INC_PTR advances its input pointers. The caller must have put the address
; of the width-specific label in wq.
%macro BIDIR_FN 1 ; op
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    cmp hd, 8
    jg .w4_h16
    WRAP_YMM %1 0
    vextracti32x4 xm1, ym0, 1
    movd [dstq ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    ; Flags are still those of "cmp hd, 8": only SIMD and store instructions
    ; ran in between, so h < 8 returns after four rows.
    jl .w4_ret
    lea
dstq, [dstq+strideq*4]
; Remainder of the BIDIR_FN store epilogue (%1 is the pixel-producing macro,
; which leaves packed bytes in m0).
    pextrd [dstq ], xm0, 2 ; rows 4-7 of the w4, h == 8 case
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
    RET
.w4_h16:
    ; One scatter writes all 16 dwords of m0: per-lane byte offsets are
    ; stride * the entries of bidir_sctr_w4, with every lane of k1 enabled.
    vpbroadcastd m7, strided
    pmulld m7, [bidir_sctr_w4]
    %1 0
    kxnorw k1, k1, k1
    vpscatterdd [dstq+m7]{k1}, m0
    RET
.w8:
    cmp hd, 4
    jne .w8_h8
    WRAP_YMM %1 0 ; h == 4: 32 output bytes are enough
    vextracti32x4 xm1, ym0, 1
    movq [dstq ], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    %1_INC_PTR 2
    lea dstq, [dstq+strideq*4]
.w8_h8:
    ; Eight rows per pass: low qwords of the four 128-bit lanes first, then the
    ; high qwords.
    %1 0
    vextracti32x4 xm1, ym0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq [dstq ], xm0
    movq [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm2
    movq [dstq+stride3q ], xm3
    lea dstq, [dstq+strideq*4]
    movhps [dstq ], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub hd, 8
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR 2
    lea dstq, [dstq+strideq*4]
.w16:
    ; Four rows per pass; vpermq q3120 reorders the qwords that the per-lane
    ; packuswb of %1 left interleaved.
    %1 0
    vpermq m0, m0, q3120
    mova [dstq ], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq m7, [pb_02461357] ; byte table widened to qword indices for vpermq
.w32_loop:
    %1 0
    %1_INC_PTR 2
    vpermq m0, m7, m0
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2 ; two rows per pass
    jg .w32_loop
    RET
.w64:
    pmovzxbq m7, [pb_02461357]
.w64_loop:
    %1 0
    %1_INC_PTR 2
    vpermq m0, m7, m0
    mova [dstq], m0
    add dstq, strideq
    dec hd ; one row per pass
    jg .w64_loop
    RET
.w128:
    pmovzxbq m7, [pb_02461357]
.w128_loop:
    ; One row per pass, produced as two 64-byte halves (source offsets 0 and 2).
    %1 0
    vpermq m6, m7, m0
    %1 2
    mova [dstq+64*0], m6
    %1_INC_PTR 4
    vpermq m6, m7, m0
    mova [dstq+64*1], m6
    add dstq, strideq
    dec hd
    jg .w128_loop
    RET
%endmacro

; Plain average: adds two vectors of 16-bit intermediates from tmp1q and tmp2q
; (vector offsets %1 and %1+1), scales them with pmulhrsw by m4 and packs the
; result to unsigned bytes in m0. With m4 = pw_1024, as loaded by avg_8bpc,
; the pmulhrsw equals (x + 16) >> 5.
%macro AVG 1 ; src_offset
    mova m0, [tmp1q+(%1+0)*mmsize]
    paddw m0, [tmp2q+(%1+0)*mmsize]
    mova m1, [tmp1q+(%1+1)*mmsize]
    paddw m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw m0, m4
    pmulhrsw m1, m4
    packuswb m0, m1
%endmacro

; Advances both intermediate pointers by %1 vectors.
%macro AVG_INC_PTR 1
    add tmp1q, %1*mmsize
    add tmp2q, %1*mmsize
%endmacro

; avg: dst = rounded average of tmp1 and tmp2. The width selects a label via
; the 32-bit offset table avg_avx512icl_table, indexed by tzcnt(w).
cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea r6, [avg_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, dword [r6+wq*4] ; table entries are offsets relative to the table
    vpbroadcastd m4, [base+pw_1024]
    add wq, r6 ; absolute address for BIDIR_FN's "jmp wq"
    BIDIR_FN AVG

; Weighted average; a is read from tmp1q and b from tmp2q. m4 holds the shifted
; weight and m5 the final pmulhrsw factor, both set up by w_avg_8bpc.
%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova m0, [tmp1q+(%1+0)*mmsize]
    psubw m2, m0, [tmp2q+(%1+0)*mmsize]
    mova m1, [tmp1q+(%1+1)*mmsize]
    psubw m3, m1, [tmp2q+(%1+1)*mmsize]
    pmulhw m2, m4
    pmulhw m3, m4
    paddw m0, m2
    paddw m1, m3
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    packuswb m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

; w_avg: weighted average of tmp1 and tmp2; the weight is read from r6m.
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea r6, [w_avg_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    vpbroadcastw m4, r6m ; weight
    movsxd wq, dword [r6+wq*4]
    vpbroadcastd m5, [base+pw_2048] ; pmulhrsw by 2048 == (x + 8) >> 4
    psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
    add wq, r6
    cmp dword r6m, 7
    jg .weight_gt7
    ; weight <= 7: weight << 12 stays positive as a signed word, so swap the two
    ; inputs and negate the factor to use the last form of the identity above.
    mov r6, tmp1q
    pxor m0, m0
    mov tmp1q, tmp2q
    psubw m4, m0, m4 ; -weight
    mov tmp2q, r6
.weight_gt7:
    BIDIR_FN W_AVG

; Blend with an explicit per-pixel mask read from maskq; a is read from tmp1q
; and b from tmp2q. m4 must be zero, m5 the final pmulhrsw factor and, for
; mmsize == 64, m8 the vpermq index vector (all set up by mask_8bpc).
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
%if mmsize == 64
    vpermq m3, m8, [maskq+%1*32]
%else
    vpermq m3, [maskq+%1*16], q3120
%endif
    mova m0, [tmp2q+(%1+0)*mmsize]
    psubw m1, m0, [tmp1q+(%1+0)*mmsize]
    psubb m3, m4, m3 ; 0 - m
    paddw m1, m1 ; (b - a) << 1
    paddb m3, m3
    punpcklbw m2, m4, m3 ; -m << 9
    pmulhw m1, m2
    paddw m0, m1
    mova m1, [tmp2q+(%1+1)*mmsize]
    psubw m2, m1, [tmp1q+(%1+1)*mmsize]
    paddw m2, m2
    punpckhbw m3, m4, m3
    pmulhw m2, m3
    paddw m1, m2
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    packuswb m0, m1
%endmacro

; Advances the mask pointer by %1*32 bytes and both intermediate pointers by
; %1*64 bytes.
%macro MASK_INC_PTR 1
    add maskq, %1*32
    add tmp2q, %1*64
    add tmp1q, %1*64
%endmacro

; mask: blends tmp1 and tmp2 with the caller-supplied mask (pointer read from
; maskmp).
cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea r7, [mask_avx512icl_table]
    tzcnt wd,
wm
; Remainder of mask_8bpc's prologue: load h and the mask pointer, set up the
; constants the MASK macro expects, then dispatch on width.
    movifnidn hd, hm
    mov maskq, maskmp
    movsxd wq, dword [r7+wq*4]
    pxor m4, m4 ; zero register used by MASK
    mova m8, [base+bilin_v_perm64] ; vpermq indices for the 64-byte MASK path
    vpbroadcastd m5, [base+pw_2048] ; pmulhrsw by 2048 == (x + 8) >> 4
    add wq, r7
    BIDIR_FN MASK

; Blends two vectors of 16-bit intermediates (tmp1q/tmp2q at vector offsets %3
; and %4) into packed bytes in m%1, deriving the weights from |tmp2 - tmp1|.
; m%2 receives one weight byte per pixel: "64 - m", or m5 minus that byte when
; the optional fifth argument is non-zero. Expects m6 and m7 (and m5 for the
; fifth-argument case) to be set up by the caller; clobbers m1-m3.
%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
    mova m%1, [tmp1q+mmsize*%3]
    mova m1, [tmp2q+mmsize*%3]
    psubw m1, m%1
    pabsw m%2, m1
    psubusw m%2, m6, m%2 ; saturating m6 - |difference|
    psrlw m%2, 8 ; 64 - m
    psllw m2, m%2, 10
    pmulhw m1, m2
    paddw m%1, m1
    mova m1, [tmp1q+mmsize*%4]
    mova m2, [tmp2q+mmsize*%4]
    psubw m2, m1
    pabsw m3, m2
    psubusw m3, m6, m3
    vpshldw m%2, m3, 8 ; word = first weight byte << 8 | second weight byte
    psllw m3, m%2, 10 ; the first weight byte is shifted out of the word here
%if %5
    psubb m%2, m5, m%2
%endif
    pmulhw m2, m3
    paddw m1, m2
    pmulhrsw m%1, m7
    pmulhrsw m1, m7
    packuswb m%1, m1
%endmacro

; w_mask, 4:2:0 flavour: writes the blended pixels to dst and a reduced mask
; to maskq (16 mask bytes per 64 pixels in the looped paths). The per-pixel
; weights from W_MASK are summed with vpdpbusd against m9 on top of the
; sign-dependent constant in m8.
; NOTE(review): the wm_420_perm* tables presumably gather each 2x2 group into
; one dword before the dot product - they are defined outside this chunk.
cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea r7, [w_mask_420_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r7m ; sign
    movifnidn hd, hm
    movsxd wq, [r7+wq*4]
    vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd m7, [base+pw_2048]
    vpbroadcastd m9, [base+pb_m64] ; -1 << 6
    mova ym10, [base+wm_420_mask+32]
    vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
    add wq, r7
    mov maskq, maskmp
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    mova m5, [wm_420_perm4]
    cmp hd, 8
    jg .w4_h16
    WRAP_YMM W_MASK 0, 4, 0, 1
    vinserti128 ym5, [wm_420_perm4+32], 1
    vpermb ym4, ym5, ym4
    vpdpbusd ym8, ym4, ym9 ; single pass, so m8 itself is the accumulator
    vextracti32x4 xm1, m0, 1
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_end ; flags from "cmp hd, 8": h < 8 has only four rows
    lea dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_end:
    vpermb ym8, ym10, ym8
    movq [maskq], xm8
    RET
.w4_h16:
    ; All 16 rows are written with one dword scatter; the offsets are
    ; stride * the entries of bidir_sctr_w4.
    vpbroadcastd m11, strided
    pmulld m11, [bidir_sctr_w4]
    W_MASK 0, 4, 0, 1
    vpermb m4, m5, m4
    vpdpbusd m8, m4, m9
    kxnorw k1, k1, k1
    vpermb m8, m10, m8
    mova [maskq], xm8
    vpscatterdd [dstq+m11]{k1}, m0
    RET
.w8:
    mova m5, [wm_420_perm8]
    cmp hd, 4
    jne .w8_h8
    WRAP_YMM W_MASK 0, 4, 0, 1
    vinserti128 ym5, [wm_420_perm8+32], 1
    vpermb ym4, ym5, ym4
    vpdpbusd ym8, ym4, ym9
    vpermb m8, m10, m8
    mova [maskq], xm8
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    add tmp1q, 128
    add tmp2q, 128
    add maskq, 16
    lea dstq, [dstq+strideq*4]
.w8_h8:
    ; Eight rows per pass.
    W_MASK 0, 4, 0, 1
    vpermb m4, m5, m4
    mova m1, m8 ; fresh copy of the constant: m8 must survive the loop
    vpdpbusd m1, m4, m9
    vpermb m1, m10, m1
    mova [maskq], xm1
    vextracti32x4 xm1, ym0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm2
    movq [dstq+stride3q ], xm3
    lea dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub hd, 8
    jg .w8_loop
    RET
.w16:
    mova m5, [wm_420_perm16]
.w16_loop:
    ; Four rows per pass.
    W_MASK 0, 4, 0, 1
    vpermb m4, m5, m4
    mova m1, m8
    vpdpbusd m1, m4, m9
    add tmp1q, 128
    add tmp2q, 128
    vpermb m1, m10, m1
    vpermq m0, m0, q3120
    mova [maskq], xm1
    add maskq, 16
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq m5, [pb_02461357] ; byte table widened to qword indices for vpermq
.w32_loop:
    ; Two rows per pass.
    W_MASK 0, 4, 0, 1
    mova m1, m8
    vpdpbusd m1, m4, m9
    add tmp1q, 128
    add tmp2q, 128
    vpermb m1, m10, m1
    vpermq m0, m5, m0
    mova [maskq], xm1
    add maskq, 16
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
    psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
.w64_loop:
    ; Two rows per pass; m0 and m11 hold the blended pixels, m4 and m5 the
    ; weights.
    W_MASK 0, 4, 0, 2
    W_MASK 11, 5, 1, 3
    mova m2, m8
    vpdpbusd m2, m4, m9
    mova m3, m8
    vpdpbusd m3, m5, m9
    add tmp1q, 256
    add tmp2q, 256
    vpermt2b m2, m10, m3
    mova m1, m0
    vpermt2q m0, m12, m11
    vpermt2q m1, m13, m11
    mova [maskq], ym2
    add maskq, 32
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w64_loop
    RET
.w128:
    pmovzxbq m14, [wm_420_perm64]
    mova m10, [wm_420_mask] ; full 64-byte table; the other widths load only ym10 at +32
    psrlq m15, m14, 4
.w128_loop:
    ; Two rows per pass, each row written as two 64-byte halves.
    W_MASK 0, 12, 0, 4
    W_MASK 11, 13, 1, 5
    mova m4, m8
    vpdpbusd m4, m12, m9
    mova m5, m8
    vpdpbusd m5, m13, m9
    mova m1, m0
    vpermt2q m0, m14, m11
    vpermt2q m1, m15, m11
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*1+64*0], m1
    W_MASK 0, 12, 2, 6
    W_MASK 11, 13, 3, 7
    ; NOTE(review): the 16-bit rotate presumably keeps the sums of the left
    ; half apart from those of the right half - confirm against wm_420_mask.
    vprold m4, 16
    vprold m5, 16
    vpdpbusd m4, m12, m9
    vpdpbusd m5, m13, m9
    add tmp1q, 512
    add tmp2q, 512
    vpermt2b m4, m10, m5
    mova m1, m0
    vpermt2q m0, m14, m11
    vpermt2q m1, m15, m11
    mova [maskq], m4
    add maskq, 64
    mova [dstq+strideq*0+64*1], m0
    mova [dstq+strideq*1+64*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w128_loop
    RET

; w_mask, 4:2:2 flavour: writes the blended pixels to dst and a reduced mask
; to maskq (32 mask bytes per 64 pixels in the looped paths). The weights from
; W_MASK are combined with vpdpwssd against m9 on top of the sign-dependent
; constant in m8, permuted with wm_422_mask and ANDed with pb_127.
cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea r7, [w_mask_422_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r7m ; sign
    movifnidn hd, hm
    movsxd wq, dword [r7+wq*4]
    vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd m7, [base+pw_2048]
    vpbroadcastd m9, [base+pw_m128]
    mova m10, [base+wm_422_mask]
    vpbroadcastd m11, [base+pb_127]
    add wq, r7
    vpbroadcastd m8, [base+wm_sign+4+r6*4]
    mov maskq, maskmp
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    cmp hd, 8
    jg .w4_h16
    WRAP_YMM W_MASK 0, 4, 0, 1
    movhps xm10, [wm_422_mask+16]
    vpdpwssd ym8, ym4, ym9
    vpermb ym8, ym10, ym8
    vextracti32x4 xm1, m0, 1
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_end ; flags from "cmp hd, 8": h < 8 has only four rows
    lea dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_end:
    pand xm8, xm11
    mova [maskq], xm8
    RET
.w4_h16:
    vpbroadcastd m5, strided
    pmulld m5, [bidir_sctr_w4]
    W_MASK 0, 4, 0, 1
    vpdpwssd m8, m4, m9
    kxnorw k1, k1, k1
    vpermb m8, m10, m8
    pand ym8, ym11
    mova [maskq], ym8
    vpscatterdd [dstq+m5]{k1}, m0
    RET
.w8:
    cmp hd, 4
    jne .w8_h8
    WRAP_YMM W_MASK 0, 4, 0, 1
    movhps xm10, [wm_422_mask+16]
    vpdpwssd ym8, ym4, ym9
    vpermb ym8, ym10, ym8
    pand
xm8, xm11
; Remainder of w_mask_422_8bpc, starting inside its .w8 (h == 4) path. In
; every path below, m0 (plus m12 for w128) holds blended pixels from W_MASK and
; m4/m5 the per-pixel weights; 32 mask bytes are stored per 64 pixels.
    mova [maskq], xm8
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    add tmp1q, 128
    add tmp2q, 128
    add maskq, 32
    lea dstq, [dstq+strideq*4]
.w8_h8:
    ; Eight rows per pass.
    W_MASK 0, 4, 0, 1
    mova m1, m8 ; fresh copy of the constant: m8 must survive the loop
    vpdpwssd m1, m4, m9
    vpermb m1, m10, m1
    pand ym1, ym11
    mova [maskq], ym1
    vextracti32x4 xm1, ym0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm2
    movq [dstq+stride3q ], xm3
    lea dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub hd, 8
    jg .w8_loop
    RET
.w16_loop:
    add tmp1q, 128
    add tmp2q, 128
    add maskq, 32
    lea dstq, [dstq+strideq*4]
.w16:
    ; Four rows per pass.
    W_MASK 0, 4, 0, 1
    mova m1, m8
    vpdpwssd m1, m4, m9
    vpermb m1, m10, m1
    vpermq m0, m0, q3120
    pand ym1, ym11
    mova [maskq], ym1
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq m5, [pb_02461357] ; byte table widened to qword indices for vpermq
.w32_loop:
    ; Two rows per pass.
    W_MASK 0, 4, 0, 1
    mova m1, m8
    vpdpwssd m1, m4, m9
    add tmp1q, 128
    add tmp2q, 128
    vpermb m1, m10, m1
    vpermq m0, m5, m0
    pand ym1, ym11
    mova [maskq], ym1
    add maskq, 32
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq m5, [pb_02461357]
.w64_loop:
    ; One row per pass.
    W_MASK 0, 4, 0, 1
    mova m1, m8
    vpdpwssd m1, m4, m9
    add tmp1q, 128
    add tmp2q, 128
    vpermb m1, m10, m1
    vpermq m0, m5, m0
    pand ym1, ym11
    mova [maskq], ym1
    add maskq, 32
    mova [dstq], m0
    add dstq, strideq
    dec hd
    jg .w64_loop
    RET
.w128:
    pmovzxbq m13, [pb_02461357]
.w128_loop:
    ; One row per pass, produced as two 64-byte halves (m0 and m12).
    W_MASK 0, 4, 0, 1
    W_MASK 12, 5, 2, 3
    mova m2, m8
    vpdpwssd m2, m4, m9
    mova m3, m8
    vpdpwssd m3, m5, m9
    add tmp1q, 256
    add tmp2q, 256
    vpermt2b m2, m10, m3
    vpermq m0, m13, m0
    vpermq m1, m13, m12
    pand m2, m11
    mova [maskq], m2
    add maskq, 64
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    jg .w128_loop
    RET

; w_mask, 4:4:4 flavour: writes the blended pixels to dst and one mask byte per
; pixel to maskq (64 mask bytes per 64 pixels in the looped paths). W_MASK is
; invoked with its fifth argument set, so the weight register holds
; m5 - (64 - m) with m5 = pb_64; the bytes are reordered with wm_444_mask
; before the store.
cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea r7, [w_mask_444_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, dword [r7+wq*4]
    vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd m5, [base+pb_64]
    vpbroadcastd m7, [base+pw_2048]
    mova m8, [base+wm_444_mask]
    add wq, r7
    mov maskq, maskmp
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    cmp hd, 8
    jg .w4_h16
    WRAP_YMM W_MASK 0, 4, 0, 1, 1
    vinserti128 ym8, [wm_444_mask+32], 1
    vpermb ym4, ym8, ym4
    mova [maskq], ym4
    vextracti32x4 xm1, m0, 1
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    jl .w4_end ; flags from "cmp hd, 8": h < 8 has only four rows
    lea dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.w4_end:
    RET
.w4_h16:
    ; All 16 rows are written with one dword scatter; the offsets are
    ; stride * the entries of bidir_sctr_w4.
    vpbroadcastd m9, strided
    pmulld m9, [bidir_sctr_w4]
    W_MASK 0, 4, 0, 1, 1
    vpermb m4, m8, m4
    kxnorw k1, k1, k1
    mova [maskq], m4
    vpscatterdd [dstq+m9]{k1}, m0
    RET
.w8:
    cmp hd, 4
    jne .w8_h8
    WRAP_YMM W_MASK 0, 4, 0, 1, 1
    vinserti128 ym8, [wm_444_mask+32], 1
    vpermb ym4, ym8, ym4
    mova [maskq], ym4
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    RET
.w8_loop:
    add tmp1q, 128
    add tmp2q, 128
    add maskq, 64
    lea dstq, [dstq+strideq*4]
.w8_h8:
    ; Eight rows per pass.
    W_MASK 0, 4, 0, 1, 1
    vpermb m4, m8, m4
    mova [maskq], m4
    vextracti32x4 xm1, ym0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*1], xm1
    movq [dstq+strideq*2], xm2
    movq [dstq+stride3q ], xm3
    lea dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm3
    sub hd, 8
    jg .w8_loop
    RET
.w16_loop:
    add tmp1q, 128
    add tmp2q, 128
    add maskq, 64
    lea dstq, [dstq+strideq*4]
.w16:
    ; Four rows per pass.
    W_MASK 0, 4, 0, 1, 1
    vpermb
m4, m8, m4
; Remainder of w_mask_444_8bpc, starting inside its .w16 path: m0 (plus m10
; for w128) holds blended pixels from W_MASK, m4/m9 the per-pixel mask bytes,
; and m8 the byte permutation applied to the mask before it is stored.
    vpermq m0, m0, q3120
    mova [maskq], m4
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], m0, 2
    vextracti32x4 [dstq+strideq*2], ym0, 1
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 4
    jg .w16_loop
    RET
.w32:
    pmovzxbq m9, [pb_02461357] ; byte table widened to qword indices for vpermq
.w32_loop:
    ; Two rows per pass.
    W_MASK 0, 4, 0, 1, 1
    vpermb m4, m8, m4
    add tmp1q, 128
    add tmp2q, 128
    vpermq m0, m9, m0
    mova [maskq], m4
    add maskq, 64
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32_loop
    RET
.w64:
    pmovzxbq m9, [pb_02461357]
.w64_loop:
    ; One row per pass.
    W_MASK 0, 4, 0, 1, 1
    vpermb m4, m8, m4
    add tmp1q, 128
    add tmp2q, 128
    vpermq m0, m9, m0
    mova [maskq], m4
    add maskq, 64
    mova [dstq], m0
    add dstq, strideq
    dec hd
    jg .w64_loop
    RET
.w128:
    pmovzxbq m11, [pb_02461357]
.w128_loop:
    ; One row per pass, produced as two 64-byte halves (m0 and m10), with the
    ; matching mask halves in m4 and m9.
    W_MASK 0, 4, 0, 1, 1
    W_MASK 10, 9, 2, 3, 1
    vpermb m4, m8, m4
    vpermb m9, m8, m9
    add tmp1q, 256
    add tmp2q, 256
    vpermq m0, m11, m0
    vpermq m10, m11, m10
    mova [maskq+64*0], m4
    mova [maskq+64*1], m9
    add maskq, 128
    mova [dstq+64*0], m0
    mova [dstq+64*1], m10
    add dstq, strideq
    dec hd
    jg .w128_loop
    RET

; blend: dst = (dst * (64 - m) + tmp * m + 32) >> 6, with m read from maskq.
; The bytes are interleaved so that one pmaddubsw forms dst*(64-m) + tmp*m;
; pmulhrsw by pw_512 then performs the rounded shift by 6. tmpq is turned into
; an offset relative to maskq, so only maskq has to be advanced in the loops.
cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea r6, [blend_avx512icl_table]
    tzcnt wd, wm
    movifnidn maskq, maskmp
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    vpbroadcastd m6, [base+pb_64]
    vpbroadcastd m7, [base+pw_512]
    sub tmpq, maskq ; [maskq+tmpq] now addresses tmp
    add wq, r6
    lea r6, [dsq*3] ; r6 is reused as stride * 3 once the jump target is known
    jmp wq
.w4:
    ; Four rows of four pixels per pass. The explicit xmm register names differ
    ; from the xm/m aliases used elsewhere - NOTE(review): presumably to force
    ; the shorter VEX encodings.
    movd xmm0, [dstq+dsq*0]
    pinsrd xmm0, [dstq+dsq*1], 1
    vpbroadcastd xmm1, [dstq+dsq*2]
    pinsrd xmm1, [dstq+r6 ], 3
    mova xmm4, [maskq]
    mova xmm5, [maskq+tmpq]
    add maskq, 4*4
    psubb xmm3, xm6, xmm4 ; 64 - m
    punpcklbw xmm0, xmm5
    punpcklbw xmm2, xmm3, xmm4
    punpckhbw xmm1, xmm5
    punpckhbw xmm3, xmm4
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm1, xmm3
    pmulhrsw xmm0, xm7
    pmulhrsw xmm1, xm7
    packuswb xmm0, xmm1
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    pextrd [dstq+dsq*2], xmm0, 2
    pextrd [dstq+r6 ], xmm0, 3
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w4
    RET
.w8:
    ; Four rows of eight pixels per pass, using ymm registers.
    movq xmm0, [dstq+dsq*0]
    vpbroadcastq xmm1, [dstq+dsq*1]
    vpbroadcastq ymm2, [dstq+dsq*2]
    vpbroadcastq ymm3, [dstq+r6 ]
    mova ymm4, [maskq]
    mova ymm5, [maskq+tmpq]
    add maskq, 8*4
    vpblendd ymm0, ymm2, 0x30
    vpblendd ymm1, ymm3, 0xc0
    psubb ymm3, ym6, ymm4 ; 64 - m
    punpcklbw ymm0, ymm5
    punpcklbw ymm2, ymm3, ymm4
    punpckhbw ymm1, ymm5
    punpckhbw ymm3, ymm4
    pmaddubsw ymm0, ymm2
    pmaddubsw ymm1, ymm3
    pmulhrsw ymm0, ym7
    pmulhrsw ymm1, ym7
    packuswb ymm0, ymm1
    vextracti128 xmm1, ymm0, 1
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    movq [dstq+dsq*2], xmm1
    movhps [dstq+r6 ], xmm1
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w8
    vzeroupper ; issued explicitly on this path before returning
    RET
.w16:
    ; Four rows of sixteen pixels per pass, one row per 128-bit lane.
    mova xm1, [dstq+dsq*0]
    vinserti32x4 ym1, [dstq+dsq*1], 1
    vinserti32x4 m1, [dstq+dsq*2], 2
    mova m4, [maskq]
    vinserti32x4 m1, [dstq+r6 ], 3
    mova m5, [maskq+tmpq]
    add maskq, 16*4
    psubb m3, m6, m4 ; 64 - m
    punpcklbw m0, m1, m5
    punpcklbw m2, m3, m4
    punpckhbw m1, m5
    punpckhbw m3, m4
    pmaddubsw m0, m2
    pmaddubsw m1, m3
    pmulhrsw m0, m7
    pmulhrsw m1, m7
    packuswb m0, m1
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6 ], m0, 3
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w16
    RET
.w32:
    ; Two rows of thirty-two pixels per pass, one row per 256-bit half.
    mova ym1, [dstq+dsq*0]
    vinserti32x8 m1, [dstq+dsq*1], 1
    mova m4, [maskq]
    mova m5, [maskq+tmpq]
    add maskq, 32*2
    psubb m3, m6, m4 ; 64 - m
    punpcklbw m0, m1, m5
    punpcklbw m2, m3, m4
    punpckhbw m1, m5
    punpckhbw m3, m4
    pmaddubsw m0, m2
    pmaddubsw m1, m3
    pmulhrsw m0, m7
    pmulhrsw m1, m7
    packuswb m0, m1
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w32
    RET

%endif ; ARCH_X86_64
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/mc_sse.asm000066400000000000000000012013251517466257200232370ustar00rootroot00000000000000; Copyright © 2018, VideoLAN and dav2d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2018, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1.
Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Read-only constants for this file, placed in a 16-byte aligned section.
SECTION_RODATA 16

; Byte-index tables. NOTE(review): going by their names these are pshufb
; controls for the warp, blend, subpel and bilinear code - the code that reads
; them is outside this chunk.
warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
                db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufD: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
subpel_h_shufE: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
subpel_h_shufF: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
rescale_mul: dd 0, 1, 2, 3

; Two-part tables: each label is followed by a second run holding the
; alternate value.
wm_420_sign: times 4 dw 258
             times 4 dw 257
wm_422_sign: times 8 db 128
             times 8 db 127
pb_8x0_8x8: times 8 db 0
            times 8 db 8
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

; Broadcast constants. The naming scheme is <element size>_<value>:
; pb = bytes, pw = words, pd = dwords, pq = qwords, "m" = minus.
pb_64: times 16 db 64
pw_m256: times 8 dw -256
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
pw_15: times 8 dw 15
pw_26: times 8 dw 26
pw_34: times 8 dw 34
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_63: times 4 dd 63
pd_512: times 4 dd 512
; Must be exactly 16384 (1 << 14), as the label says: the value was
; mistyped as 16484, which would skew any rounding that adds this constant.
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000:times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000

const mc_warp_filter2
; dav2d_mc_warp_filter[] reordered for pmaddubsw usage ; [-3, -2) db 127, 0, 0, 0, 1, 0, 0, 0,126, 0, 0, 0, 2, 0, 0, 0 db 124, 0, 0, 0, 4, 0, 0, 0,122, 0, 0, 0, 6, 0, 0, 0 db 120, 0, 0, 0, 8, 0, 0, 0,118, 0, 0, 0, 10, 0, 0, 0 db 116, 0, 0, 0, 12, 0, 0, 0,114, 0, 0, 0, 14, 0, 0, 0 db 112, 0, 0, 0, 16, 0, 0, 0,110, 0, 0, 0, 18, 0, 0, 0 db 108, 0, 0, 0, 20, 0, 0, 0,106, 0, 0, 0, 22, 0, 0, 0 db 104, 0, 0, 0, 24, 0, 0, 0,102, 0, 0, 0, 26, 0, 0, 0 db 100, 0, 0, 0, 28, 0, 0, 0, 98, 0, 0, 0, 30, 0, 0, 0 db 96, 0, 0, 0, 32, 0, 0, 0, 94, 0, 0, 0, 34, 0, 0, 0 db 92, 0, 0, 0, 36, 0, 0, 0, 90, 0, 0, 0, 38, 0, 0, 0 db 88, 0, 0, 0, 40, 0, 0, 0, 86, 0, 0, 0, 42, 0, 0, 0 db 84, 0, 0, 0, 44, 0, 0, 0, 82, 0, 0, 0, 46, 0, 0, 0 db 80, 0, 0, 0, 48, 0, 0, 0, 78, 0, 0, 0, 50, 0, 0, 0 db 76, 0, 0, 0, 52, 0, 0, 0, 74, 0, 0, 0, 54, 0, 0, 0 db 72, 0, 0, 0, 56, 0, 0, 0, 70, 0, 0, 0, 58, 0, 0, 0 db 68, 0, 0, 0, 60, 0, 0, 0, 66, 0, 0, 0, 62, 0, 0, 0 db 64, 0, 0, 0, 64, 0, 0, 0, 62, 0, 0, 0, 66, 0, 0, 0 db 60, 0, 0, 0, 68, 0, 0, 0, 58, 0, 0, 0, 70, 0, 0, 0 db 56, 0, 0, 0, 72, 0, 0, 0, 54, 0, 0, 0, 74, 0, 0, 0 db 52, 0, 0, 0, 76, 0, 0, 0, 50, 0, 0, 0, 78, 0, 0, 0 db 48, 0, 0, 0, 80, 0, 0, 0, 46, 0, 0, 0, 82, 0, 0, 0 db 44, 0, 0, 0, 84, 0, 0, 0, 42, 0, 0, 0, 86, 0, 0, 0 db 40, 0, 0, 0, 88, 0, 0, 0, 38, 0, 0, 0, 90, 0, 0, 0 db 36, 0, 0, 0, 92, 0, 0, 0, 34, 0, 0, 0, 94, 0, 0, 0 db 32, 0, 0, 0, 96, 0, 0, 0, 30, 0, 0, 0, 98, 0, 0, 0 db 28, 0, 0, 0,100, 0, 0, 0, 26, 0, 0, 0,102, 0, 0, 0 db 24, 0, 0, 0,104, 0, 0, 0, 22, 0, 0, 0,106, 0, 0, 0 db 20, 0, 0, 0,108, 0, 0, 0, 18, 0, 0, 0,110, 0, 0, 0 db 16, 0, 0, 0,112, 0, 0, 0, 14, 0, 0, 0,114, 0, 0, 0 db 12, 0, 0, 0,116, 0, 0, 0, 10, 0, 0, 0,118, 0, 0, 0 db 8, 0, 0, 0,120, 0, 0, 0, 6, 0, 0, 0,122, 0, 0, 0 db 4, 0, 0, 0,124, 0, 0, 0, 2, 0, 0, 0,126, 0, 0, 0 ; [-2, -1) db 0, 1, 0, 0,127, 0, 0, 0, -1, 2, 0, 0,127, 0, 0, 0 db -2, 4, 0, 0,127, -1, 0, 0, -3, 6, 0, 0,126, -1, 0, 0 db -3, 8, 0, 0,125, -2, 0, 0, -4, 11, 0, 0,124, -3, 0, 0 db -5, 13, 0, 0,123, -3, 0, 
0, -5, 15, 0, 0,121, -3, 0, 0 db -6, 18, 0, 0,120, -4, 0, 0, -7, 20, 0, 0,119, -4, 0, 0 db -7, 22, 0, 0,118, -5, 0, 0, -8, 25, 0, 0,116, -5, 0, 0 db -8, 27, 0, 0,115, -6, 0, 0, -9, 30, 0, 0,113, -6, 0, 0 db -9, 32, 0, 0,112, -7, 0, 0, -9, 34, 0, 0,110, -7, 0, 0 db -10, 37, 0, 0,108, -7, 0, 0,-10, 39, 0, 0,107, -8, 0, 0 db -10, 41, 0, 0,105, -8, 0, 0,-11, 44, 0, 0,103, -8, 0, 0 db -11, 47, 0, 0,101, -9, 0, 0,-11, 49, 0, 0, 99, -9, 0, 0 db -11, 51, 0, 0, 97, -9, 0, 0,-11, 54, 0, 0, 95,-10, 0, 0 db -11, 56, 0, 0, 93,-10, 0, 0,-12, 59, 0, 0, 91,-10, 0, 0 db -12, 61, 0, 0, 89,-10, 0, 0,-12, 64, 0, 0, 87,-11, 0, 0 db -12, 66, 0, 0, 85,-11, 0, 0,-12, 69, 0, 0, 82,-11, 0, 0 db -12, 71, 0, 0, 80,-11, 0, 0,-12, 73, 0, 0, 78,-11, 0, 0 db -11, 75, 0, 0, 75,-11, 0, 0,-11, 78, 0, 0, 73,-12, 0, 0 db -11, 80, 0, 0, 71,-12, 0, 0,-11, 82, 0, 0, 69,-12, 0, 0 db -11, 85, 0, 0, 66,-12, 0, 0,-11, 87, 0, 0, 64,-12, 0, 0 db -10, 89, 0, 0, 61,-12, 0, 0,-10, 91, 0, 0, 59,-12, 0, 0 db -10, 93, 0, 0, 56,-11, 0, 0,-10, 95, 0, 0, 54,-11, 0, 0 db -9, 97, 0, 0, 51,-11, 0, 0, -9, 99, 0, 0, 49,-11, 0, 0 db -9,101, 0, 0, 47,-11, 0, 0, -8,103, 0, 0, 44,-11, 0, 0 db -8,105, 0, 0, 41,-10, 0, 0, -8,107, 0, 0, 39,-10, 0, 0 db -7,108, 0, 0, 37,-10, 0, 0, -7,110, 0, 0, 34, -9, 0, 0 db -7,112, 0, 0, 32, -9, 0, 0, -6,113, 0, 0, 30, -9, 0, 0 db -6,115, 0, 0, 27, -8, 0, 0, -5,116, 0, 0, 25, -8, 0, 0 db -5,118, 0, 0, 22, -7, 0, 0, -4,119, 0, 0, 20, -7, 0, 0 db -4,120, 0, 0, 18, -6, 0, 0, -3,121, 0, 0, 15, -5, 0, 0 db -3,123, 0, 0, 13, -5, 0, 0, -3,124, 0, 0, 11, -4, 0, 0 db -2,125, 0, 0, 8, -3, 0, 0, -1,126, 0, 0, 6, -3, 0, 0 db -1,127, 0, 0, 4, -2, 0, 0, 0,127, 0, 0, 2, -1, 0, 0 ; [-1, 0) db 0,127, 0, 0, 0, 1, 0, 0, 0,127, 0, 0, -1, 2, 0, 0 db 1,127, -1, 0, -3, 4, 0, 0, 1,126, -2, 0, -4, 6, 1, 0 db 1,126, -3, 0, -5, 8, 1, 0, 1,125, -4, 0, -6, 11, 1, 0 db 1,124, -4, 0, -7, 13, 1, 0, 2,123, -5, 0, -8, 15, 1, 0 db 2,122, -6, 0, -9, 18, 1, 0, 2,121, -6, 0,-10, 20, 1, 0 db 2,120, -7, 0,-11, 22, 2, 0, 2,119, -8, 
0,-12, 25, 2, 0 db 3,117, -8, 0,-13, 27, 2, 0, 3,116, -9, 0,-13, 29, 2, 0 db 3,114,-10, 0,-14, 32, 3, 0, 3,113,-10, 0,-15, 35, 2, 0 db 3,111,-11, 0,-15, 37, 3, 0, 3,109,-11, 0,-16, 40, 3, 0 db 3,108,-12, 0,-16, 42, 3, 0, 4,106,-13, 0,-17, 45, 3, 0 db 4,104,-13, 0,-17, 47, 3, 0, 4,102,-14, 0,-17, 50, 3, 0 db 4,100,-14, 0,-17, 52, 3, 0, 4, 98,-15, 0,-18, 55, 4, 0 db 4, 96,-15, 0,-18, 58, 3, 0, 4, 94,-16, 0,-18, 60, 4, 0 db 4, 91,-16, 0,-18, 63, 4, 0, 4, 89,-16, 0,-18, 65, 4, 0 db 4, 87,-17, 0,-18, 68, 4, 0, 4, 85,-17, 0,-18, 70, 4, 0 db 4, 82,-17, 0,-18, 73, 4, 0, 4, 80,-17, 0,-18, 75, 4, 0 db 4, 78,-18, 0,-18, 78, 4, 0, 4, 75,-18, 0,-17, 80, 4, 0 db 4, 73,-18, 0,-17, 82, 4, 0, 4, 70,-18, 0,-17, 85, 4, 0 db 4, 68,-18, 0,-17, 87, 4, 0, 4, 65,-18, 0,-16, 89, 4, 0 db 4, 63,-18, 0,-16, 91, 4, 0, 4, 60,-18, 0,-16, 94, 4, 0 db 3, 58,-18, 0,-15, 96, 4, 0, 4, 55,-18, 0,-15, 98, 4, 0 db 3, 52,-17, 0,-14,100, 4, 0, 3, 50,-17, 0,-14,102, 4, 0 db 3, 47,-17, 0,-13,104, 4, 0, 3, 45,-17, 0,-13,106, 4, 0 db 3, 42,-16, 0,-12,108, 3, 0, 3, 40,-16, 0,-11,109, 3, 0 db 3, 37,-15, 0,-11,111, 3, 0, 2, 35,-15, 0,-10,113, 3, 0 db 3, 32,-14, 0,-10,114, 3, 0, 2, 29,-13, 0, -9,116, 3, 0 db 2, 27,-13, 0, -8,117, 3, 0, 2, 25,-12, 0, -8,119, 2, 0 db 2, 22,-11, 0, -7,120, 2, 0, 1, 20,-10, 0, -6,121, 2, 0 db 1, 18, -9, 0, -6,122, 2, 0, 1, 15, -8, 0, -5,123, 2, 0 db 1, 13, -7, 0, -4,124, 1, 0, 1, 11, -6, 0, -4,125, 1, 0 db 1, 8, -5, 0, -3,126, 1, 0, 1, 6, -4, 0, -2,126, 1, 0 db 0, 4, -3, 0, -1,127, 1, 0, 0, 2, -1, 0, 0,127, 0, 0 ; [0, 1) db 0, 0, 1, 0, 0,127, 0, 0, 0, -1, 2, 0, 0,127, 0, 0 db 0, -3, 4, 1, 1,127, -2, 0, 0, -5, 6, 1, 1,127, -2, 0 db 0, -6, 8, 1, 2,126, -3, 0, -1, -7, 11, 2, 2,126, -4, -1 db -1, -8, 13, 2, 3,125, -5, -1, -1,-10, 16, 3, 3,124, -6, -1 db -1,-11, 18, 3, 4,123, -7, -1, -1,-12, 20, 3, 4,122, -7, -1 db -1,-13, 23, 3, 4,121, -8, -1, -2,-14, 25, 4, 5,120, -9, -1 db -1,-15, 27, 4, 5,119,-10, -1, -1,-16, 30, 4, 5,118,-11, -1 db -2,-17, 33, 5, 6,116,-12, -1, -2,-17, 35, 5, 
6,114,-12, -1 db -2,-18, 38, 5, 6,113,-13, -1, -2,-19, 41, 6, 7,111,-14, -2 db -2,-19, 43, 6, 7,110,-15, -2, -2,-20, 46, 6, 7,108,-15, -2 db -2,-20, 49, 6, 7,106,-16, -2, -2,-21, 51, 7, 7,104,-16, -2 db -2,-21, 54, 7, 7,102,-17, -2, -2,-21, 56, 7, 8,100,-18, -2 db -2,-22, 59, 7, 8, 98,-18, -2, -2,-22, 62, 7, 8, 96,-19, -2 db -2,-22, 64, 7, 8, 94,-19, -2, -2,-22, 67, 8, 8, 91,-20, -2 db -2,-22, 69, 8, 8, 89,-20, -2, -2,-22, 72, 8, 8, 87,-21, -2 db -2,-21, 74, 8, 8, 84,-21, -2, -2,-22, 77, 8, 8, 82,-21, -2 db -2,-21, 79, 8, 8, 79,-21, -2, -2,-21, 82, 8, 8, 77,-22, -2 db -2,-21, 84, 8, 8, 74,-21, -2, -2,-21, 87, 8, 8, 72,-22, -2 db -2,-20, 89, 8, 8, 69,-22, -2, -2,-20, 91, 8, 8, 67,-22, -2 db -2,-19, 94, 8, 7, 64,-22, -2, -2,-19, 96, 8, 7, 62,-22, -2 db -2,-18, 98, 8, 7, 59,-22, -2, -2,-18,100, 8, 7, 56,-21, -2 db -2,-17,102, 7, 7, 54,-21, -2, -2,-16,104, 7, 7, 51,-21, -2 db -2,-16,106, 7, 6, 49,-20, -2, -2,-15,108, 7, 6, 46,-20, -2 db -2,-15,110, 7, 6, 43,-19, -2, -2,-14,111, 7, 6, 41,-19, -2 db -1,-13,113, 6, 5, 38,-18, -2, -1,-12,114, 6, 5, 35,-17, -2 db -1,-12,116, 6, 5, 33,-17, -2, -1,-11,118, 5, 4, 30,-16, -1 db -1,-10,119, 5, 4, 27,-15, -1, -1, -9,120, 5, 4, 25,-14, -2 db -1, -8,121, 4, 3, 23,-13, -1, -1, -7,122, 4, 3, 20,-12, -1 db -1, -7,123, 4, 3, 18,-11, -1, -1, -6,124, 3, 3, 16,-10, -1 db -1, -5,125, 3, 2, 13, -8, -1, -1, -4,126, 2, 2, 11, -7, -1 db 0, -3,126, 2, 1, 8, -6, 0, 0, -2,127, 1, 1, 6, -5, 0 db 0, -2,127, 1, 1, 4, -3, 0, 0, 0,127, 0, 0, 2, -1, 0 ; [1, 2) db 0, 0,127, 0, 0, 1, 0, 0, 0, 0,127, 0, 0, -1, 2, 0 db 0, 1,127, -1, 0, -3, 4, 0, 0, 1,126, -2, 0, -4, 6, 1 db 0, 1,126, -3, 0, -5, 8, 1, 0, 1,125, -4, 0, -6, 11, 1 db 0, 1,124, -4, 0, -7, 13, 1, 0, 2,123, -5, 0, -8, 15, 1 db 0, 2,122, -6, 0, -9, 18, 1, 0, 2,121, -6, 0,-10, 20, 1 db 0, 2,120, -7, 0,-11, 22, 2, 0, 2,119, -8, 0,-12, 25, 2 db 0, 3,117, -8, 0,-13, 27, 2, 0, 3,116, -9, 0,-13, 29, 2 db 0, 3,114,-10, 0,-14, 32, 3, 0, 3,113,-10, 0,-15, 35, 2 db 0, 3,111,-11, 0,-15, 37, 3, 0, 3,109,-11, 
0,-16, 40, 3 db 0, 3,108,-12, 0,-16, 42, 3, 0, 4,106,-13, 0,-17, 45, 3 db 0, 4,104,-13, 0,-17, 47, 3, 0, 4,102,-14, 0,-17, 50, 3 db 0, 4,100,-14, 0,-17, 52, 3, 0, 4, 98,-15, 0,-18, 55, 4 db 0, 4, 96,-15, 0,-18, 58, 3, 0, 4, 94,-16, 0,-18, 60, 4 db 0, 4, 91,-16, 0,-18, 63, 4, 0, 4, 89,-16, 0,-18, 65, 4 db 0, 4, 87,-17, 0,-18, 68, 4, 0, 4, 85,-17, 0,-18, 70, 4 db 0, 4, 82,-17, 0,-18, 73, 4, 0, 4, 80,-17, 0,-18, 75, 4 db 0, 4, 78,-18, 0,-18, 78, 4, 0, 4, 75,-18, 0,-17, 80, 4 db 0, 4, 73,-18, 0,-17, 82, 4, 0, 4, 70,-18, 0,-17, 85, 4 db 0, 4, 68,-18, 0,-17, 87, 4, 0, 4, 65,-18, 0,-16, 89, 4 db 0, 4, 63,-18, 0,-16, 91, 4, 0, 4, 60,-18, 0,-16, 94, 4 db 0, 3, 58,-18, 0,-15, 96, 4, 0, 4, 55,-18, 0,-15, 98, 4 db 0, 3, 52,-17, 0,-14,100, 4, 0, 3, 50,-17, 0,-14,102, 4 db 0, 3, 47,-17, 0,-13,104, 4, 0, 3, 45,-17, 0,-13,106, 4 db 0, 3, 42,-16, 0,-12,108, 3, 0, 3, 40,-16, 0,-11,109, 3 db 0, 3, 37,-15, 0,-11,111, 3, 0, 2, 35,-15, 0,-10,113, 3 db 0, 3, 32,-14, 0,-10,114, 3, 0, 2, 29,-13, 0, -9,116, 3 db 0, 2, 27,-13, 0, -8,117, 3, 0, 2, 25,-12, 0, -8,119, 2 db 0, 2, 22,-11, 0, -7,120, 2, 0, 1, 20,-10, 0, -6,121, 2 db 0, 1, 18, -9, 0, -6,122, 2, 0, 1, 15, -8, 0, -5,123, 2 db 0, 1, 13, -7, 0, -4,124, 1, 0, 1, 11, -6, 0, -4,125, 1 db 0, 1, 8, -5, 0, -3,126, 1, 0, 1, 6, -4, 0, -2,126, 1 db 0, 0, 4, -3, 0, -1,127, 1, 0, 0, 2, -1, 0, 0,127, 0 ; [2, 3) db 0, 0, 0, 1, 0, 0,127, 0, 0, 0, -1, 2, 0, 0,127, 0 db 0, 0, -2, 4, 0, 0,127, -1, 0, 0, -3, 6, 0, 0,126, -1 db 0, 0, -3, 8, 0, 0,125, -2, 0, 0, -4, 11, 0, 0,124, -3 db 0, 0, -5, 13, 0, 0,123, -3, 0, 0, -5, 15, 0, 0,121, -3 db 0, 0, -6, 18, 0, 0,120, -4, 0, 0, -7, 20, 0, 0,119, -4 db 0, 0, -7, 22, 0, 0,118, -5, 0, 0, -8, 25, 0, 0,116, -5 db 0, 0, -8, 27, 0, 0,115, -6, 0, 0, -9, 30, 0, 0,113, -6 db 0, 0, -9, 32, 0, 0,112, -7, 0, 0, -9, 34, 0, 0,110, -7 db 0, 0,-10, 37, 0, 0,108, -7, 0, 0,-10, 39, 0, 0,107, -8 db 0, 0,-10, 41, 0, 0,105, -8, 0, 0,-11, 44, 0, 0,103, -8 db 0, 0,-11, 47, 0, 0,101, -9, 0, 0,-11, 49, 0, 0, 99, -9 db 0, 0,-11, 51, 
0, 0, 97, -9, 0, 0,-11, 54, 0, 0, 95,-10 db 0, 0,-11, 56, 0, 0, 93,-10, 0, 0,-12, 59, 0, 0, 91,-10 db 0, 0,-12, 61, 0, 0, 89,-10, 0, 0,-12, 64, 0, 0, 87,-11 db 0, 0,-12, 66, 0, 0, 85,-11, 0, 0,-12, 69, 0, 0, 82,-11 db 0, 0,-12, 71, 0, 0, 80,-11, 0, 0,-12, 73, 0, 0, 78,-11 db 0, 0,-11, 75, 0, 0, 75,-11, 0, 0,-11, 78, 0, 0, 73,-12 db 0, 0,-11, 80, 0, 0, 71,-12, 0, 0,-11, 82, 0, 0, 69,-12 db 0, 0,-11, 85, 0, 0, 66,-12, 0, 0,-11, 87, 0, 0, 64,-12 db 0, 0,-10, 89, 0, 0, 61,-12, 0, 0,-10, 91, 0, 0, 59,-12 db 0, 0,-10, 93, 0, 0, 56,-11, 0, 0,-10, 95, 0, 0, 54,-11 db 0, 0, -9, 97, 0, 0, 51,-11, 0, 0, -9, 99, 0, 0, 49,-11 db 0, 0, -9,101, 0, 0, 47,-11, 0, 0, -8,103, 0, 0, 44,-11 db 0, 0, -8,105, 0, 0, 41,-10, 0, 0, -8,107, 0, 0, 39,-10 db 0, 0, -7,108, 0, 0, 37,-10, 0, 0, -7,110, 0, 0, 34, -9 db 0, 0, -7,112, 0, 0, 32, -9, 0, 0, -6,113, 0, 0, 30, -9 db 0, 0, -6,115, 0, 0, 27, -8, 0, 0, -5,116, 0, 0, 25, -8 db 0, 0, -5,118, 0, 0, 22, -7, 0, 0, -4,119, 0, 0, 20, -7 db 0, 0, -4,120, 0, 0, 18, -6, 0, 0, -3,121, 0, 0, 15, -5 db 0, 0, -3,123, 0, 0, 13, -5, 0, 0, -3,124, 0, 0, 11, -4 db 0, 0, -2,125, 0, 0, 8, -3, 0, 0, -1,126, 0, 0, 6, -3 db 0, 0, -1,127, 0, 0, 4, -2, 0, 0, 0,127, 0, 0, 2, -1 ; [3, 4) db 0, 0, 0,127, 0, 0, 0, 1, 0, 0, 0,126, 0, 0, 0, 2 db 0, 0, 0,124, 0, 0, 0, 4, 0, 0, 0,122, 0, 0, 0, 6 db 0, 0, 0,120, 0, 0, 0, 8, 0, 0, 0,118, 0, 0, 0, 10 db 0, 0, 0,116, 0, 0, 0, 12, 0, 0, 0,114, 0, 0, 0, 14 db 0, 0, 0,112, 0, 0, 0, 16, 0, 0, 0,110, 0, 0, 0, 18 db 0, 0, 0,108, 0, 0, 0, 20, 0, 0, 0,106, 0, 0, 0, 22 db 0, 0, 0,104, 0, 0, 0, 24, 0, 0, 0,102, 0, 0, 0, 26 db 0, 0, 0,100, 0, 0, 0, 28, 0, 0, 0, 98, 0, 0, 0, 30 db 0, 0, 0, 96, 0, 0, 0, 32, 0, 0, 0, 94, 0, 0, 0, 34 db 0, 0, 0, 92, 0, 0, 0, 36, 0, 0, 0, 90, 0, 0, 0, 38 db 0, 0, 0, 88, 0, 0, 0, 40, 0, 0, 0, 86, 0, 0, 0, 42 db 0, 0, 0, 84, 0, 0, 0, 44, 0, 0, 0, 82, 0, 0, 0, 46 db 0, 0, 0, 80, 0, 0, 0, 48, 0, 0, 0, 78, 0, 0, 0, 50 db 0, 0, 0, 76, 0, 0, 0, 52, 0, 0, 0, 74, 0, 0, 0, 54 db 0, 0, 0, 72, 0, 0, 0, 56, 0, 0, 0, 70, 
0, 0, 0, 58 db 0, 0, 0, 68, 0, 0, 0, 60, 0, 0, 0, 66, 0, 0, 0, 62 db 0, 0, 0, 64, 0, 0, 0, 64, 0, 0, 0, 62, 0, 0, 0, 66 db 0, 0, 0, 60, 0, 0, 0, 68, 0, 0, 0, 58, 0, 0, 0, 70 db 0, 0, 0, 56, 0, 0, 0, 72, 0, 0, 0, 54, 0, 0, 0, 74 db 0, 0, 0, 52, 0, 0, 0, 76, 0, 0, 0, 50, 0, 0, 0, 78 db 0, 0, 0, 48, 0, 0, 0, 80, 0, 0, 0, 46, 0, 0, 0, 82 db 0, 0, 0, 44, 0, 0, 0, 84, 0, 0, 0, 42, 0, 0, 0, 86 db 0, 0, 0, 40, 0, 0, 0, 88, 0, 0, 0, 38, 0, 0, 0, 90 db 0, 0, 0, 36, 0, 0, 0, 92, 0, 0, 0, 34, 0, 0, 0, 94 db 0, 0, 0, 32, 0, 0, 0, 96, 0, 0, 0, 30, 0, 0, 0, 98 db 0, 0, 0, 28, 0, 0, 0,100, 0, 0, 0, 26, 0, 0, 0,102 db 0, 0, 0, 24, 0, 0, 0,104, 0, 0, 0, 22, 0, 0, 0,106 db 0, 0, 0, 20, 0, 0, 0,108, 0, 0, 0, 18, 0, 0, 0,110 db 0, 0, 0, 16, 0, 0, 0,112, 0, 0, 0, 14, 0, 0, 0,114 db 0, 0, 0, 12, 0, 0, 0,116, 0, 0, 0, 10, 0, 0, 0,118 db 0, 0, 0, 8, 0, 0, 0,120, 0, 0, 0, 6, 0, 0, 0,122 db 0, 0, 0, 4, 0, 0, 0,124, 0, 0, 0, 2, 0, 0, 0,126 db 0, 0, 0, 2, 0, 0, 0,126 pw_258: times 2 dw 258 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BIDIR_JMP_TABLE 2-* ;evaluated at definition time (in loop below) %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) ; dynamically generated label %%table: %rep %0 - 2 ; repeat for num args dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) 
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX SECTION .text INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 1 %define base t0-put_ssse3 %else DECLARE_REG_TMP 7 %define base 0 %endif %macro RESTORE_DSQ_32 1 %if ARCH_X86_32 mov %1, dsm ; restore dsq %endif %endmacro cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx LEA t0, put_ssse3 movifnidn srcq, srcmp movifnidn ssq, ssmp tzcnt wd, wm mov hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: 
movzx wd, word [t0+wq*2+table_offset(put,)] add wq, t0 RESTORE_DSQ_32 t0 jmp wq .put_w2: movzx r4d, word [srcq+ssq*0] movzx r6d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4w mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4d mov [dstq+dsq*1], r6d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq [dstq+dsq*0], m0 movq [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] movu m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+16*0], m0 mova [dstq+dsq*0+16*1], m1 mova [dstq+dsq*1+16*0], m2 mova [dstq+dsq*1+16*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, ssq mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add srcq, ssq add dstq, dsq dec hd jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0x00ff00ff mova m4, [base+subpel_h_shufD] mova m0, [base+bilin_h_shuf4] add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r7m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, 
word [t0+wq*2+table_offset(put, _bilin_h)] mova m3, [base+pw_2048] add wq, t0 movifnidn dsq, dsmp jmp wq .h_w2: pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} .h_w2_loop: movd m0, [srcq+ssq*0] movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq m0, m1 pshufb m0, m4 pmaddubsw m0, m5 pmulhrsw m0, m3 packuswb m0, m0 movd r6d, m0 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movq m4, [srcq+ssq*0] movhps m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m4, m0 pmaddubsw m4, m5 pmulhrsw m4, m3 packuswb m4, m4 movd [dstq+dsq*0], m4 psrlq m4, 32 movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w16 RET .h_w32: movu m0, [srcq+mmsize*0+8*0] movu m1, [srcq+mmsize*0+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+mmsize*1+8*0] movu m2, [srcq+mmsize*1+8*1] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: mov r6, -16*3 .h_w64_loop: movu m0, [srcq+r6+16*3+8*0] movu m1, [srcq+r6+16*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*3], m0 add r6, 16 jle .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64 RET .h_w128: mov r6, -16*7 .h_w128_loop: movu m0, [srcq+r6+16*7+8*0] movu m1, 
[srcq+r6+16*7+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*7], m0 add r6, 16 jle .h_w128_loop add srcq, ssq add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] add mxyd, 0x00100010 add wq, t0 movd m4, mxyd pshufd m4, m4, q0000 movifnidn dsq, dsmp jmp wq .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pshuflw m1, m0, q2301 pinsrw m0, [srcq+ssq*0], 0 ; 2 1 punpcklbw m1, m0 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd r6d, m1 mov [dstq+dsq*1], r6w shr r6d, 16 mov [dstq+dsq*0], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd m0, [srcq+ssq*0] .v_w4_loop: movd m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movd m0, [srcq+ssq*0] punpckldq m1, m2 ; 0 1 punpckldq m2, m0 ; 1 2 punpcklbw m1, m2 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 ; lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq m0, [srcq+ssq*0] .v_w8_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movq m0, [srcq+ssq*0] punpcklbw m1, m2 punpcklbw m2, m0 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET %macro PUT_BILIN_V_W16 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 mova m2, m0 movu m0, [srcq+ssq*0] punpcklbw m1, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro .v_w16: PUT_BILIN_V_W16 RET .v_w128: lea r6d, [hq+(7<<16)] jmp .v_w16gt .v_w64: 
lea r6d, [hq+(3<<16)] jmp .v_w16gt .v_w32: lea r6d, [hq+(1<<16)] .v_w16gt: mov r4, srcq %if ARCH_X86_64 mov r7, dstq %endif .v_w16gt_loop: PUT_BILIN_V_W16 %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %else mov dstq, dstmp add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstmp, dstq %endif sub r6d, 1<<16 jg .v_w16gt RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow mova m7, [base+pw_15] movd m6, mxyd add wq, t0 pshuflw m6, m6, q0000 paddb m5, m5 punpcklqdq m6, m6 jmp wq .hv_w2: RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] punpckldq m0, m0 pshufb m0, m4 pmaddubsw m0, m5 .hv_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] punpckldq m1, m2 pshufb m1, m4 pmaddubsw m1, m5 ; 1 _ 2 _ shufps m2, m0, m1, q1032 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 pavgw m2, m7 ; src[x] + 8 paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 psrlw m1, 4 packuswb m1, m1 %if ARCH_X86_64 movq r6, m1 %else pshuflw m1, m1, q2020 movd r6d, m1 %endif mov [dstq+dsq*0], r6w shr r6, gprsize*4 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 shufps m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 
.hv_w8_loop: movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m2, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 movu m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r6d, [hq+(7<<16)] jmp .hv_w16_start .hv_w64: lea r6d, [hq+(3<<16)] jmp .hv_w16_start .hv_w32: lea r6d, [hq+(1<<16)] .hv_w16_start: mov r4, srcq %if ARCH_X86_32 %define m8 [dstq] %else mov r7, dstq %endif .hv_w16: movifnidn dsq, dsmp %if WIN64 movaps r4m, m8 %endif .hv_w16_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w16_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 mova m8, m2 psubw m2, m0 pmulhw m2, m6 pavgw m0, m7 paddw m2, m0 mova m0, m3 psubw m3, m1 pmulhw m3, m6 pavgw m1, m7 paddw m3, m1 mova m1, m0 mova m0, m8 psrlw m2, 4 psrlw m3, 4 packuswb m2, m3 mova [dstq], m2 add dstq, dsmp dec hd jg .hv_w16_loop %if ARCH_X86_32 mov dstq, dstm add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstm, dstq %else add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w16_loop0 %if WIN64 movaps m8, r4m %endif RET %if ARCH_X86_32 %define base r6-prep%+SUFFIX %else %define base 0 %endif cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx LEA r6, prep_ssse3 tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] pxor m4, m4 add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: movd m0, [srcq+strideq*0] movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m2, m3 punpcklbw m0, m4 
punpcklbw m2, m4 psllw m0, 4 psllw m2, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movq m0, [srcq+strideq*0] movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 punpcklbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu m1, [srcq+strideq*0] movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w128: mov r3, -128 jmp .prep_w32_start .prep_w64: mov r3, -64 jmp .prep_w32_start .prep_w32: mov r3, -32 .prep_w32_start: sub srcq, r3 .prep_w32_vloop: mov r6, r3 .prep_w32_hloop: movu m1, [srcq+r6+16*0] movu m3, [srcq+r6+16*1] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .prep_w32_hloop add srcq, strideq dec hd jg .prep_w32_vloop RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 0x00ff00ff mova m4, [base+subpel_h_shufD] add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r6m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 jmp wq .h_w4: mova m4, [base+bilin_h_shuf4] lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+0 ], m0 mova 
[tmpq+16], m1 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: lea stride3q, [strideq*3] .h_w8_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .h_w8_loop RET .h_w16: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] movu m2, [srcq+strideq*1+8*0] movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .h_w16 RET .h_w128: mov r3, -128 jmp .h_w32_start .h_w64: mov r3, -64 jmp .h_w32_start .h_w32: mov r3, -32 .h_w32_start: sub srcq, r3 .h_w32_vloop: mov r6, r3 .h_w32_hloop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] movu m2, [srcq+r6+8*2] movu m3, [srcq+r6+8*3] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .h_w32_hloop add srcq, strideq dec hd jg .h_w32_vloop RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 0x00ff00ff add mxyd, 0x00100010 add wq, r6 lea stride3q, [strideq*3] movd m5, mxyd pshufd m5, m5, q0000 jmp wq .v_w4: movd m0, [srcq+strideq*0] .v_w4_loop: movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m1, m2 punpcklbw m0, m1 ; 01 12 pmaddubsw m0, m5 mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] punpckldq m2, m3 punpckldq m3, m0 punpcklbw m2, m3 ; 23 34 pmaddubsw m2, m5 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .v_w4_loop RET .v_w8: movq m0, [srcq+strideq*0] .v_w8_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q 
] lea srcq, [srcq+strideq*4] punpcklbw m0, m1 ; 01 punpcklbw m1, m2 ; 12 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+16*0], m0 movq m0, [srcq+strideq*0] punpcklbw m2, m3 ; 23 punpcklbw m3, m0 ; 34 pmaddubsw m2, m5 mova [tmpq+16*1], m1 pmaddubsw m3, m5 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m0, m1 pmaddubsw m4, m5 pmaddubsw m0, m5 mova [tmpq+16*0], m4 punpcklbw m4, m1, m2 punpckhbw m1, m2 pmaddubsw m4, m5 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0] pmaddubsw m1, m5 mova [tmpq+16*2], m4 punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m5 mova [tmpq+16*3], m1 pmaddubsw m2, m5 mova [tmpq+16*4], m4 punpcklbw m4, m3, m0 punpckhbw m3, m0 pmaddubsw m4, m5 mova [tmpq+16*5], m2 pmaddubsw m3, m5 mova [tmpq+16*6], m4 mova [tmpq+16*7], m3 add tmpq, 16*8 sub hd, 4 jg .v_w16_loop RET .v_w128: lea r3d, [hq+(3<<8)] mov r6d, 256 jmp .v_w32_start .v_w64: lea r3d, [hq+(1<<8)] mov r6d, 128 jmp .v_w32_start .v_w32: xor r3d, r3d mov r6d, 64 .v_w32_start: %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif mov r5, srcq .v_w32_hloop: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] .v_w32_vloop: movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 pmaddubsw m4, m5 pmaddubsw m0, m5 mova [tmpq+16*0], m4 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0+16*0] punpcklbw m4, m1, m3 punpckhbw m1, m3 pmaddubsw m4, m5 pmaddubsw m1, m5 mova [tmpq+16*2], m4 mova [tmpq+16*3], m1 movu m1, [srcq+strideq*0+16*1] add tmpq, r6 punpcklbw m4, m2, m0 punpckhbw m2, m0 pmaddubsw m4, m5 pmaddubsw m2, m5 mova [tmpq+16*0], m4 mova [tmpq+16*1], m2 punpcklbw m4, m3, m1 punpckhbw m3, m1 pmaddubsw m4, m5 pmaddubsw m3, m5 mova [tmpq+16*2], m4 mova [tmpq+16*3], m3 add tmpq, r6 sub hd, 2 jg 
.v_w32_vloop add r5, 32 movzx hd, r3b mov srcq, r5 %if ARCH_X86_64 add r7, 16*4 mov tmpq, r7 %else mov tmpq, tmpmp add tmpq, 16*4 mov tmpmp, tmpq %endif sub r3d, 1<<8 jg .v_w32_hloop %if WIN64 POP r7 %endif RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 movd m6, mxyd add wq, r6 pshufd m6, m6, q0000 jmp wq .hv_w4: mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+strideq*0] lea r3, [strideq*3] pshufb m0, m4 pmaddubsw m0, m5 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 ; 1 2 pmaddubsw m2, m5 ; 3 4 shufpd m0, m1, 0x01 ; 0 1 shufpd m3, m1, m2, 0x01 ; 2 3 psubw m1, m0 pmulhrsw m1, m6 paddw m1, m0 mova m0, m2 psubw m2, m3 pmulhrsw m2, m6 paddw m2, m3 mova [tmpq+16*0], m1 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu m2, [srcq+strideq*0] pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 ; 1 pmaddubsw m2, m5 ; 2 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 mova m0, m2 psubw m2, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r3d, [hq+(7<<8)] mov r5d, 256 jmp .hv_w16_start .hv_w64: lea r3d, [hq+(3<<8)] mov r5d, 128 jmp .hv_w16_start .hv_w32: lea r3d, [hq+(1<<8)] mov r5d, 64 jmp .hv_w16_start .hv_w16: xor r3d, r3d mov r5d, 32 .hv_w16_start: mov r6, srcq %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 ; 0a pmaddubsw m1, m5 ; 0b .hv_w16_vloop: movu m2, [srcq+strideq*1+8*0] pshufb m2, m4 
pmaddubsw m2, m5 ; 1a psubw m3, m2, m0 pmulhrsw m3, m6 paddw m3, m0 mova [tmpq+16*0], m3 movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] pshufb m3, m4 pmaddubsw m3, m5 ; 1b psubw m0, m3, m1 pmulhrsw m0, m6 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, r5 movu m0, [srcq+strideq*0+8*0] pshufb m0, m4 pmaddubsw m0, m5 ; 2a psubw m1, m0, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq+16*0], m1 movu m1, [srcq+strideq*0+8*1] pshufb m1, m4 pmaddubsw m1, m5 ; 2b psubw m2, m1, m3 pmulhrsw m2, m6 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, r5 sub hd, 2 jg .hv_w16_vloop movzx hd, r3b %if ARCH_X86_64 add r6, 16 add r7, 2*16 mov srcq, r6 mov tmpq, r7 %else mov tmpq, tmpm add r6, 16 add tmpq, 2*16 mov srcq, r6 mov tmpm, tmpq %endif sub r3d, 1<<8 jg .hv_w16_hloop %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %if %0 == 5 ; skip the jump in the last filter jmp mangle(private_prefix %+ _%5 %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 %else %define base_reg r8 %define base 0 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_8bpc PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_8bpc PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_8bpc PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h %if ARCH_X86_64 imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v %else imul ssd, mym, 0x010101 add ssd, t1d ; 8tap_v, my, 4tap_v mov srcq, srcm %endif mov wd, wm movifnidn hd, hm LEA base_reg, 
put_ssse3 test mxd, 0xf00 jnz .h %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .v .put: tzcnt wd, wd movzx wd, word [base_reg+wq*2+table_offset(put,)] movifnidn ssq, ssmp add wq, base_reg movifnidn dsq, dsmp %if WIN64 pop r8 %endif lea r6, [ssq*3] jmp wq .h: %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .hv movifnidn ssq, ssmp mova m5, [base+pw_34] ; 2 + (8 << 2) cmp wd, 4 jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4 WIN64_SPILL_XMM 11 %if ARCH_X86_64 mova m8, [base+subpel_h_shufD] mova m9, [base+subpel_h_shufE] mova m10, [base+subpel_h_shufF] %endif shr mxd, 16 sub srcq, 2 movq m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8] punpcklwd m7, m7 pshufd m4, m7, q0000 pshufd m6, m7, q1111 pshufd m7, m7, q2222 sub wd, 16 jge .h_w16 %macro PUT_6TAP_H 3 ; dst/src, tmp[1-2] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufD] pshufb %3, %1, [base+subpel_h_shufE] pshufb %1, [base+subpel_h_shufF] %else pshufb %2, %1, m8 pshufb %3, %1, m9 pshufb %1, m10 %endif pmaddubsw %2, m4 pmaddubsw %3, m6 pmaddubsw %1, m7 paddw %2, m5 paddw %2, %3 paddw %1, %2 psraw %1, 6 %endmacro %if ARCH_X86_32 mov r4, dsm %endif .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] PUT_6TAP_H m0, m2, m3 PUT_6TAP_H m1, m2, m3 packuswb m0, m1 %if ARCH_X86_32 movq [dstq+r4*0], m0 movhps [dstq+r4*1], m0 lea dstq, [dstq+r4*2] %else movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] %endif sub hd, 2 jg .h_w8 RET .h_w16: add srcq, wq add dstq, wq neg wq .h_w16_loop_v: mov r6, wq .h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_6TAP_H m0, m2, m3 PUT_6TAP_H m1, m2, m3 packuswb m0, m1 mova [dstq+r6], m0 add r6, 16 jle .h_w16_loop_h add srcq, ssq add dstq, dsmp dec hd jg .h_w16_loop_v RET .v: %if ARCH_X86_32 %define dsq r4 %define m8 [base+pw_512] movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8] mov ssq, ssm punpcklwd m7, m7 pshufd m5, m7, q0000 mov 
r6, ssq pshufd m6, m7, q1111 neg r6 pshufd m7, m7, q2222 cmp wd, 4 jge .v_w4 %else WIN64_SPILL_XMM 9, 12 movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m7, [base_reg-put_ssse3+subpel_filters+1+myq*8] mova m8, [base+pw_512] punpcklwd m7, m7 pshufd m5, m7, q0000 mov nsq, ssq pshufd m6, m7, q1111 neg nsq pshufd m7, m7, q2222 cmp wd, 4 je .v_w4 jg .v_w8 %endif .v_w2: %if ARCH_X86_32 mov dsq, dsm movd m1, [srcq+r6 *2] movd m3, [srcq+r6 *1] %else movd m1, [srcq+nsq*2] movd m3, [srcq+nsq*1] %endif movd m2, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m0, [srcq+ssq*0] punpcklwd m1, m3 ; 0 1 punpcklwd m3, m2 ; 1 2 punpcklwd m2, m4 ; 2 3 punpcklwd m4, m0 ; 3 4 punpcklbw m1, m3 ; 01 12 punpcklbw m2, m4 ; 23 34 .v_w2_loop: movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m4, m1, m5 ; a0 b0 mova m1, m2 pmaddubsw m2, m6 ; a1 b1 paddw m4, m2 punpcklwd m2, m0, m3 ; 4 5 movd m0, [srcq+ssq*0] punpcklwd m3, m0 ; 5 6 punpcklbw m2, m3 ; 67 78 pmaddubsw m3, m2, m7 ; a2 b2 paddw m4, m3 pmulhrsw m4, m8 packuswb m4, m4 movd r6d, m4 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 shl wd, 14 lea srcq, [srcq+r6*2] lea r6d, [hq+wq-(1<<16)] mov srcm, srcq mov dsq, dsm .v_w4_loop0: movd m1, [srcq+ssq*0] movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %else movd m1, [srcq+nsq*2] movd m3, [srcq+nsq*1] %endif movd m2, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m0, [srcq+ssq*0] punpckldq m1, m3 ; 0 1 punpckldq m3, m2 ; 1 2 punpckldq m2, m4 ; 2 3 punpckldq m4, m0 ; 3 4 punpcklbw m1, m3 ; 01 12 punpcklbw m2, m4 ; 23 34 .v_w4_loop: movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m4, m1, m5 ; a0 b0 mova m1, m2 pmaddubsw m2, m6 ; a1 b1 paddw m4, m2 punpckldq m2, m0, m3 ; 4 5 movd m0, [srcq+ssq*0] punpckldq m3, m0 ; 5 6 punpcklbw m2, m3 ; 67 78 pmaddubsw m3, m2, m7 ; a2 b2 paddw m4, m3 pmulhrsw m4, m8 packuswb m4, m4 movd [dstq+dsq*0], m4 psrlq m4, 32 movd 
[dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov srcq, srcm mov dstq, dstm movzx hd, r6w add srcq, 4 add dstq, 4 mov srcm, srcq mov dstm, dstq sub r6d, 1<<16 jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: WIN64_PUSH_XMM 12 shl wd, 5 lea r6d, [hq+wq-256] .v_w8_loop0: movq m1, [srcq+nsq*2] movq m2, [srcq+nsq*1] lea r4, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] mov r7, dstq movq m0, [r4 +ssq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m0 ; 34 .v_w8_loop: pmaddubsw m10, m1, m5 ; a0 mova m1, m3 pmaddubsw m11, m2, m5 ; b0 mova m2, m4 pmaddubsw m3, m6 ; a1 pmaddubsw m4, m6 ; b1 paddw m10, m3 paddw m11, m4 movq m4, [r4+ssq*1] lea r4, [r4+ssq*2] punpcklbw m3, m0, m4 ; 67 movq m0, [r4+ssq*0] punpcklbw m4, m0 ; 78 pmaddubsw m9, m3, m7 ; a2 paddw m10, m9 pmaddubsw m9, m4, m7 ; b2 paddw m11, m9 pmulhrsw m10, m8 pmulhrsw m11, m8 packuswb m10, m11 movq [r7+dsq*0], m10 movhps [r7+dsq*1], m10 lea r7, [r7+dsq*2] sub hd, 2 jg .v_w8_loop add srcq, 8 add dstq, 8 movzx hd, r6b sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 .hv: RESET_STACK_STATE cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8] %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8] mov ssq, ssmp ALLOC_STACK -mmsize*4 %define m8 [rsp+mmsize*0] %define m9 [rsp+mmsize*1] %define m10 [rsp+mmsize*2] punpcklbw m0, m0 sub srcq, ssq psraw m0, 8 ; sign-extend sub srcq, ssq pshufd m2, m0, q0000 mova m8, m2 pshufd m2, m0, q1111 mova m9, m2 pshufd m2, m0, q2222 mova m10, m2 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg-put_ssse3+subpel_filters+1+myq*8] WIN64_SPILL_XMM 11, 14 mov nsq, ssq punpcklbw m0, m0 neg nsq psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 %endif cmp wd, 4 je .hv_w4 .hv_w2: mova m5, 
[base+subpel_h_shuf4] mova m6, [base+pw_34] pshufd m7, m1, q0000 %if ARCH_X86_32 movq m2, [srcq+ssq*0] movhps m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov dsq, [rstk+stack_offset+gprsize*2] %else movq m2, [srcq+nsq*2] movhps m2, [srcq+nsq*1] ; 0 1 %endif movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] ; 2 3 lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] ; 4 REPX {pshufb x, m5}, m2, m1, m0 REPX {pmaddubsw x, m7}, m2, m1, m0 phaddw m2, m1 phaddw m0, m0 paddw m2, m6 paddw m0, m6 psraw m2, 2 ; 0 1 2 3 psraw m0, 2 palignr m0, m2, 4 ; 1 2 3 4 punpcklwd m1, m2, m0 ; 01 12 punpckhwd m2, m0 ; 23 34 .hv_w2_loop: movq m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m3, [srcq+ssq*0] ; 5 6 pshufb m3, m5 pmaddubsw m3, m7 pmaddwd m4, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 phaddw m3, m3 paddw m3, m6 psraw m3, 2 paddd m4, m2 palignr m2, m3, m0, 12 ; 4 5 mova m0, m3 punpcklwd m2, m3 ; 45 56 pmaddwd m3, m10, m2 ; a2 b2 paddd m4, m3 psrad m4, 10 packssdw m4, m5 packuswb m4, m4 movd r6d, m4 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: %if ARCH_X86_32 movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov dsq, [rstk+stack_offset+gprsize*2] %define m11 [base+pw_34] %define m12 [base+subpel_h_shufA] %define m13 [rsp+mmsize*3] pshufd m1, m1, q0000 mova m13, m1 %else WIN64_PUSH_XMM 14 movq m3, [srcq+nsq*2] movq m4, [srcq+nsq*1] pshufd m13, m1, q0000 mova m12, [base+subpel_h_shufA] mova m11, [base+pw_34] %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m2, [srcq+ssq*0] %if ARCH_X86_32 mova m5, m12 mova m6, m13 REPX {pshufb x, m5 }, m3, m4, m0, m1, m2 mova m5, m11 REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2 %else REPX {pshufb x, m12}, m3, m4, m0, m1, m2 REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2 %endif phaddw m3, m0 ; 0 2 phaddw m4, m1 ; 1 3 phaddw m0, m2 ; 2 4 %if ARCH_X86_32 REPX {paddw x, m5 }, m3, m4, m0 %else REPX {paddw x, m11}, m3, m4, m0 %endif REPX {psraw x, 2 }, 
m3, m4, m0 punpcklwd m1, m3, m4 ; 01 punpckhwd m3, m4 ; 23 punpcklwd m2, m4, m0 ; 12 punpckhwd m4, m0 ; 34 .hv_w4_loop: movq m7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m6, [srcq+ssq*0] pshufb m7, m12 pshufb m6, m12 pmaddubsw m7, m13 pmaddubsw m6, m13 pmaddwd m5, m8, m1 ; a0 mova m1, m3 phaddw m7, m6 ; 5 6 pmaddwd m6, m8, m2 ; b0 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddw m7, m11 psraw m7, 2 paddd m5, m3 paddd m6, m4 shufpd m4, m0, m7, 0x01 ; 4 5 mova m0, m7 punpcklwd m3, m4, m7 ; 45 punpckhwd m4, m7 ; 56 pmaddwd m7, m10, m3 ; a2 paddd m5, m7 pmaddwd m7, m10, m4 ; b2 paddd m6, m7 psrad m5, 10 psrad m6, 10 packssdw m5, m6 packuswb m5, m5 movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: RESET_STACK_STATE shr mxd, 16 sub srcq, 2 %if ARCH_X86_32 movq m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8] movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8] shl wd, 13 mov ssq, ssm lea r6d, [hq+wq-(1<<16)] %assign regs_used 5 ALLOC_STACK -mmsize*16 %assign regs_used 7 mov dsq, [rstk+stack_offset+gprsize*2] sub srcq, ssq sub srcq, ssq %if STACK_ALIGNMENT < 16 %define srcm [esp+mmsize*15+gprsize*0] %define dstm [esp+mmsize*15+gprsize*1] mov dstm, dstq %endif mov srcm, srcq %else ALLOC_STACK 16*6, 16 movq m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg-put_ssse3+subpel_filters+1+myq*8] mov nsq, ssq shl wd, 13 neg nsq lea r6d, [hq+wq-(1<<16)] %endif mova m7, [base+pw_34] punpcklwd m0, m0 punpcklbw m1, m1 psraw m1, 8 ; sign-extend pshufd m2, m0, q0000 mova [rsp+16*0], m2 pshufd m2, m0, q1111 mova [rsp+16*1], m2 pshufd m0, m0, q2222 mova [rsp+16*2], m0 pshufd m2, m1, q0000 mova [rsp+16*3], m2 pshufd m2, m1, q1111 mova [rsp+16*4], m2 pshufd m1, m1, q2222 mova [rsp+16*5], m1 %macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \ [rsp+16*0], [rsp+16*1], [rsp+16*2] ; 
src/dst, tmp[1-2], shuf[1-2], mul[1-3] pshufb %2, %1, %4 pshufb %1, %5 pmaddubsw %3, %2, %6 shufps %2, %1, q2121 pmaddubsw %1, %8 pmaddubsw %2, %7 paddw %3, m7 paddw %1, %3 paddw %1, %2 psraw %1, 2 %endmacro .hv_w8_loop0: mova m2, [base+subpel_h_shufD] mova m3, [base+subpel_h_shufF] mova m4, [rsp+16*0] %if ARCH_X86_32 movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] HV_H_6TAP m0, m5, m6, m2, m3, m4 HV_H_6TAP m1, m5, m6, m2, m3, m4 movu m5, [srcq+ssq*0] punpcklwd m6, m0, m1 ; 01 punpckhwd m0, m1 mova [rsp+16* 6], m6 mova [rsp+16* 7], m0 HV_H_6TAP m5, m0, m6, m2, m3, m4 movu m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m6, m1, m5 ; 12 punpckhwd m1, m5 mova [rsp+16* 8], m6 mova [rsp+16* 9], m1 HV_H_6TAP m0, m1, m6, m2, m3, m4 movu m1, [srcq+ssq*0] punpcklwd m6, m5, m0 ; 23 punpckhwd m5, m0 mova [rsp+16*10], m6 mova [rsp+16*11], m5 HV_H_6TAP m1, m5, m6, m2, m3, m4 mova [rsp+16*14], m1 punpcklwd m6, m0, m1 ; 34 punpckhwd m0, m1 mova [rsp+16*12], m6 mova [rsp+16*13], m0 .hv_w8_loop: mova m3, [rsp+16* 3] pmaddwd m0, m3, [rsp+16* 6] ; a0 pmaddwd m2, m3, [rsp+16* 7] ; a0' pmaddwd m1, m3, [rsp+16* 8] ; b0 pmaddwd m3, [rsp+16* 9] ; b0' mova m6, [rsp+16* 4] mova m4, [rsp+16*10] mova m5, [rsp+16*11] mova [rsp+16* 6], m4 pmaddwd m4, m6 ; a1 mova [rsp+16* 7], m5 pmaddwd m5, m6 ; a1' paddd m0, m4 mova m4, [rsp+16*12] paddd m2, m5 mova m5, [rsp+16*13] mova [rsp+16* 8], m4 pmaddwd m4, m6 ; b1 mova [rsp+16* 9], m5 pmaddwd m5, m6 ; b1' movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] paddd m1, m4 paddd m3, m5 HV_H_6TAP m6, m4, m5 mova m5, [rsp+16*14] punpcklwd m4, m5, m6 ; 45 punpckhwd m5, m6 mova [rsp+16*10], m4 mova [rsp+16*11], m5 pmaddwd m4, [rsp+16*5] ; a2 pmaddwd m5, [rsp+16*5] ; a2' paddd m0, m4 movu m4, [srcq+ssq*0] paddd m2, m5 psrad m0, 10 psrad m2, 10 packssdw m0, m2 HV_H_6TAP m4, m2, m5 mova m2, [rsp+16*5] punpcklwd m5, m6, m4 ; 56 mova [rsp+16*14], m4 punpckhwd m6, m4 mova [rsp+16*12], m5 pmaddwd m5, m2 ; b2 mova [rsp+16*13], m6 pmaddwd m6, m2 ; b2' 
paddd m1, m5 paddd m3, m6 psrad m1, 10 psrad m3, 10 packssdw m1, m3 packuswb m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop mov srcq, srcm mov dstq, dstm movzx hd, r6w add srcq, 8 add dstq, 8 mov srcm, srcq mov dstm, dstq %else movu m9, [srcq+nsq*2] movu m11, [srcq+nsq*1] lea r4, [srcq+ssq*2] movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] mov r7, dstq movu m6, [r4 +ssq*0] mova m5, [rsp+16*1] mova m8, [rsp+16*2] HV_H_6TAP m9, m0, m1, m2, m3, m4, m5, m8 HV_H_6TAP m11, m0, m1, m2, m3, m4, m5, m8 HV_H_6TAP m13, m0, m1, m2, m3, m4, m5, m8 HV_H_6TAP m15, m0, m1, m2, m3, m4, m5, m8 HV_H_6TAP m6, m0, m1, m2, m3, m4, m5, m8 punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m6 ; 34 punpckhwd m15, m6 .hv_w8_loop: mova m3, [rsp+16*3] mova m4, [rsp+16*4] pmaddwd m0, m8, m3 ; a0 mova m8, m12 pmaddwd m2, m9, m3 ; a0' mova m9, m13 pmaddwd m1, m10, m3 ; b0 mova m10, m14 pmaddwd m3, m11 ; b0' mova m11, m15 REPX {pmaddwd x, m4}, m12, m13, m14, m15 paddd m0, m12 paddd m2, m13 paddd m1, m14 paddd m3, m15 movu m15, [r4+ssq*1] lea r4, [r4+ssq*2] HV_H_6TAP m15, m4, m5 punpcklwd m12, m6, m15 punpckhwd m13, m6, m15 movu m6, [r4+ssq*0] HV_H_6TAP m6, m4, m5 mova m4, [rsp+16*5] punpcklwd m14, m15, m6 punpckhwd m15, m6 pmaddwd m5, m12, m4 ; a2 paddd m0, m5 pmaddwd m5, m13, m4 ; a2' paddd m2, m5 pmaddwd m5, m14, m4 ; b2 paddd m1, m5 pmaddwd m4, m15 ; b2' paddd m3, m4 REPX {psrad x, 10}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 packuswb m0, m1 movq [r7+dsq*0], m0 movhps [r7+dsq*1], m0 lea r7, [r7+dsq*2] sub hd, 2 jg .hv_w8_loop add srcq, 8 add dstq, 8 movzx hd, r6b %endif sub r6d, 1<<16 jg .hv_w8_loop0 RET PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_8bpc PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_8bpc PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_8bpc PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_8bpc 
PUT_8TAP_FN sharp, SHARP, SHARP

; put_8tap_8bpc(dst, ds, src, ss, w, h, mx, my)
;
; 8-tap subpel filter writing 8-bit pixels (every path ends in packuswb)
; to dst. The work is dispatched on mx/my and then on the block width w:
;   mx == 0 && my == 0 -> jump into the width-indexed `put` table
;   mx != 0 && my == 0 -> .h   (horizontal only)
;   mx == 0 && my != 0 -> .v   (vertical only)
;   mx != 0 && my != 0 -> .hv  (horizontal, then vertical)
; Rows are processed two at a time in every loop except .h_w16.
;
; mx (and my) are multiplied by 0x010101 so the value is replicated into
; three bytes, then t0d (t1d) is added. Per the inline comments the bytes
; are, from high to low: 8-tap filter index, raw mx/my, 4-tap filter index.
; NOTE(review): t0d/t1d are presumably set up by the PUT_8TAP_FN stubs with
; the per-filter-type table offsets - the macro is defined outside this view.
; On x86-32 the packed `my` value lives in ssd until the stride is reloaded.
cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
%if ARCH_X86_64
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
%else
    imul ssd, mym, 0x010101
    add ssd, t1d ; 8tap_v, my, 4tap_v
    mov srcq, srcm
%endif
    mov wd, wm
    movifnidn hd, hm
    LEA base_reg, put_ssse3
    ; 0xf00 selects the low nibble of the middle ("mx"/"my") byte
    test mxd, 0xf00
    jnz .h
%if ARCH_X86_32
    test ssd, 0xf00
%else
    test myd, 0xf00
%endif
    jnz .v
    ; no filtering needed: tail-jump through the 16-bit offset table,
    ; indexed by tzcnt(w)
    tzcnt wd, wd
    movzx wd, word [base_reg+wq*2+table_offset(put,)]
    movifnidn ssq, ssmp
    add wq, base_reg
    movifnidn dsq, dsmp
%if WIN64
    ; NOTE(review): presumably undoes a push done by the cglobal prologue,
    ; since the jump target does its own return - confirm against x86inc
    pop r8
%endif
    lea r6, [ssq*3]
    jmp wq

; ---------------------------------------------------------------------------
; Horizontal only, w == 2. Entered from .h_w4, which has already loaded the
; 4-tap coefficients into m4 and moved srcq one pixel left.
; ---------------------------------------------------------------------------
.h_w2:
    mova m3, [base+subpel_h_shuf4]
    movifnidn dsq, dsmp
.h_w2_loop:
    movq m0, [srcq+ssq*0]
    movhps m0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pshufb m0, m3
    pmaddubsw m0, m4
    phaddw m0, m0
    paddw m0, m5 ; pw34
    psraw m0, 6
    packuswb m0, m0
    ; two output pixels per row: store as 16-bit words via a GPR
    movd r6d, m0
    mov [dstq+dsq*0], r6w
    shr r6d, 16
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET

; ---------------------------------------------------------------------------
; Horizontal only, w <= 4: 4-tap filter. The index is the low byte of mxd and
; the four taps are read from byte offset +2 of the 8-byte filter entry.
; ---------------------------------------------------------------------------
.h_w4:
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    movd m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
    dec srcq ; the 4-tap window starts one pixel left of the output pixel
    pshufd m4, m4, q0000 ; broadcast the 4 taps to every dword
    cmp wd, 4
    jl .h_w2
    mova m3, [base+subpel_h_shufA]
    movifnidn dsq, dsmp
.h_w4_loop:
    movq m0, [srcq+ssq*0] ; 1
    movq m1, [srcq+ssq*1] ; 2
    lea srcq, [srcq+ssq*2]
    pshufb m0, m3 ; subpel_h_shufA
    pshufb m1, m3 ; subpel_h_shufA
    pmaddubsw m0, m4 ; subpel_filters
    pmaddubsw m1, m4 ; subpel_filters
    phaddw m0, m1
    paddw m0, m5 ; pw34
    psraw m0, 6
    packuswb m0, m0
    movd [dstq+dsq*0], m0
    psrlq m0, 32
    movd [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET

; ---------------------------------------------------------------------------
; Horizontal entry: go to .hv if my is also non-zero, otherwise pick the
; 4-tap (w <= 4) or 8-tap (w >= 8) horizontal path.
; m5 = rounding constant added before the >> 6 in every horizontal path.
; ---------------------------------------------------------------------------
.h:
%if ARCH_X86_32
    test ssd, 0xf00
%else
    test myd, 0xf00
%endif
    jnz .hv
    movifnidn ssq, ssmp
    mova m5, [base+pw_34] ; 2 + (8 << 2)
    cmp wd, 4
    jle .h_w4
    WIN64_SPILL_XMM 12
%if ARCH_X86_64
    ; x86-64 keeps the three shuffle masks in registers; x86-32 reads them
    ; from memory inside PUT_8TAP_H
    mova m10, [base+subpel_h_shufA]
    mova m11, [base+subpel_h_shufB]
    mova m9, [base+subpel_h_shufC]
%endif
    shr mxd, 16 ; 8-tap filter index (top byte of the packed value)
    sub srcq, 3 ; the 8-tap window starts three pixels left
    movq m7, [base_reg+mxq*8+subpel_filters-put_ssse3]
    pshufd m6, m7, q0000 ; taps 0-3 broadcast
    pshufd m7, m7, q1111 ; taps 4-7 broadcast
    sub wd, 16
    jge .h_w16

; PUT_8TAP_H: 8-tap horizontal filter of the row in %1.
; Uses m6/m7 (tap halves) and m5 (rounding); the result in %1 is
; (sum + m5) >> 6 as words. %2-%4 are clobbered.
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
%if ARCH_X86_32
    pshufb %2, %1, [base+subpel_h_shufB]
    pshufb %3, %1, [base+subpel_h_shufC]
    pshufb %1, [base+subpel_h_shufA]
%else
    pshufb %2, %1, m11 ; subpel_h_shufB
    pshufb %3, %1, m9 ; subpel_h_shufC
    pshufb %1, m10 ; subpel_h_shufA
%endif
    pmaddubsw %4, %2, m6 ; subpel +0 B0
    pmaddubsw %2, m7 ; subpel +4 B4
    pmaddubsw %3, m7 ; C4
    pmaddubsw %1, m6 ; A0
    paddw %3, %4 ; C4+B0
    paddw %1, %2 ; A0+B4
    phaddw %1, %3
    paddw %1, m5 ; pw34
    psraw %1, 6
%endmacro

; Horizontal only, w == 8 (reached by fall-through when w - 16 < 0).
.h_w8:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    PUT_8TAP_H m0, m2, m3, m4
    PUT_8TAP_H m1, m2, m3, m4
    packuswb m0, m1
%if ARCH_X86_32
    ; the destination stride is read from its memory slot (dsm) here
    movq [dstq], m0
    add dstq, dsm
    movhps [dstq], m0
    add dstq, dsm
%else
    movq [dstq+dsq*0], m0
    movhps [dstq+dsq*1], m0
    lea dstq, [dstq+dsq*2]
%endif
    sub hd, 2
    jg .h_w8
    RET

; Horizontal only, w >= 16. wq holds w - 16 on entry; src/dst are advanced by
; that amount and wq is negated so the inner loop can count r6 up from
; -(w - 16) to 0 in steps of 16. One row per outer iteration.
.h_w16:
    add srcq, wq
    add dstq, wq
    neg wq
.h_w16_loop_v:
    mov r6, wq
.h_w16_loop_h:
    movu m0, [srcq+r6+8*0]
    movu m1, [srcq+r6+8*1]
    PUT_8TAP_H m0, m2, m3, m4
    PUT_8TAP_H m1, m2, m3, m4
    packuswb m0, m1
    mova [dstq+r6], m0
    add r6, 16
    jle .h_w16_loop_h
    add srcq, ssq
    add dstq, dsmp
    dec hd
    jg .h_w16_loop_v
    RET

; ---------------------------------------------------------------------------
; Vertical only. Filter index: the top byte of the packed my value, replaced
; by the low byte (the "4tap_v" index) when h < 6 (cmovs after cmp hd, 6).
; The 8 taps are split into four byte pairs subpel0..subpel3, each broadcast
; across a vector. m7 = pw_512 is the pmulhrsw multiplier used for the final
; rounding (assuming it holds 512 per word, that is (x + 32) >> 6).
; ---------------------------------------------------------------------------
.v:
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
    WIN64_SPILL_XMM 16
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
    punpcklwd m0, m0 ; duplicate each tap pair so pshufd can broadcast it
    mova m7, [base+pw_512]
%if ARCH_X86_32
    ; x86-32: the four tap-pair vectors are kept in stack slots
    %define subpel0 [rsp+mmsize*0]
    %define subpel1 [rsp+mmsize*1]
    %define subpel2 [rsp+mmsize*2]
    %define subpel3 [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
    ALLOC_STACK -16*4
%assign regs_used 7
    pshufd m1, m0, q0000
    mova subpel0, m1
    pshufd m1, m0, q1111
    mova subpel1, m1
    pshufd m1, m0, q2222
    mova subpel2, m1
    pshufd m1, m0, q3333
    mova subpel3, m1
    ; ssq was used for the packed my value: reload the stride from the
    ; stack argument, step src up three rows, then reload the plain stride
    mov ssq, [rstk+stack_offset+gprsize*4]
    lea ssq, [ssq*3]
    sub srcq, ssq
    mov ssq, [rstk+stack_offset+gprsize*4]
    mov dsq, [rstk+stack_offset+gprsize*2]
    cmp wd, 2
    jne .v_w4
%else
    %define subpel0 m8
    %define subpel1 m9
    %define subpel2 m10
    %define subpel3 m11
    lea ss3q, [ssq*3]
    pshufd m8, m0, q0000
    sub srcq, ss3q ; start three rows above the first output row
    pshufd m9, m0, q1111
    pshufd m10, m0, q2222
    pshufd m11, m0, q3333
    cmp wd, 4
    je .v_w4
    jg .v_w8
%endif

; Vertical only, w == 2. Rows 0-6 are preloaded and interleaved into the
; row-pair vectors named in the comments (01 12 / 23 34 / 45 56).
.v_w2:
    movd m1, [srcq+ssq*0]
    movd m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movd m2, [srcq+ssq*0]
    movd m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m3, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
%else
    movd m2, [srcq+ssq*2]
    add srcq, ss3q
    movd m5, [srcq+ssq*0]
    movd m3, [srcq+ssq*1]
    movd m4, [srcq+ssq*2]
    add srcq, ss3q
%endif
    punpcklwd m1, m0 ; 0 1
    punpcklwd m0, m2 ; 1 2
    punpcklbw m1, m0 ; 01 12
    movd m0, [srcq+ssq*0]
    punpcklwd m2, m5 ; 2 3
    punpcklwd m5, m3 ; 3 4
    punpcklwd m3, m4 ; 4 5
    punpcklwd m4, m0 ; 5 6
    punpcklbw m2, m5 ; 23 34
    punpcklbw m3, m4 ; 45 56
.v_w2_loop:
    ; each iteration loads two new rows (7, 8), produces two output rows
    ; (a, b) and shifts the row-pair vectors down by one slot
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, subpel0 ; a0 b0
    mova m1, m2
    pmaddubsw m2, subpel1 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, subpel2 ; a2 b2
    paddw m5, m3
    punpcklwd m3, m0, m4 ; 6 7
    movd m0, [srcq+ssq*0]
    punpcklwd m4, m0 ; 7 8
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, subpel3 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    packuswb m5, m5
    movd r6d, m5
    mov [dstq+dsq*0], r6w
    shr r6d, 16
    mov [dstq+dsq*1], r6w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET

; Vertical only, 4 pixels wide. On x86-32 this path also serves every wider
; block (cmp wd, 2 / jne above) by looping over 4-pixel columns:
; r6d = h + (w << 14) - (1 << 16), so the low word keeps h and the upper bits
; count the remaining columns; r4 tracks the source column start.
.v_w4:
%if ARCH_X86_32
    shl wd, 14
%if STACK_ALIGNMENT < 16
    %define dstm [rsp+mmsize*4+gprsize]
    mov dstm, dstq
%endif
    lea r6d, [hq+wq-(1<<16)]
    mov r4, srcq
.v_w4_loop0:
%endif
    movd m1, [srcq+ssq*0]
    movd m0, [srcq+ssq*1]
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movd m2, [srcq+ssq*0]
    movd m5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movd m3, [srcq+ssq*0]
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
%else
    movd m2, [srcq+ssq*2]
    add srcq, ss3q
    movd m5, [srcq+ssq*0]
    movd m3, [srcq+ssq*1]
    movd m4, [srcq+ssq*2]
    add srcq, ss3q
%endif
    punpckldq m1, m0 ; 0 1
    punpckldq m0, m2 ; 1 2
    punpcklbw m1, m0 ; 01 12
    movd m0, [srcq+ssq*0]
    punpckldq m2, m5 ; 2 3
    punpckldq m5, m3 ; 3 4
    punpckldq m3, m4 ; 4 5
    punpckldq m4, m0 ; 5 6
    punpcklbw m2, m5 ; 23 34
    punpcklbw m3, m4 ; 45 56
.v_w4_loop:
    movd m4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmaddubsw m5, m1, subpel0 ; a0 b0
    mova m1, m2
    pmaddubsw m2, subpel1 ; a1 b1
    paddw m5, m2
    mova m2, m3
    pmaddubsw m3, subpel2 ; a2 b2
    paddw m5, m3
    punpckldq m3, m0, m4 ; 6 7 _ _
    movd m0, [srcq+ssq*0]
    punpckldq m4, m0 ; 7 8 _ _
    punpcklbw m3, m4 ; 67 78
    pmaddubsw m4, m3, subpel3 ; a3 b3
    paddw m5, m4
    pmulhrsw m5, m7
    packuswb m5, m5
    movd [dstq+dsq*0], m5
    psrlq m5, 32
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
%if ARCH_X86_32
    ; next 4-pixel column: restore dst/src column starts and h
    mov dstq, dstm
    add r4, 4
    movzx hd, r6w
    add dstq, 4
    mov srcq, r4
    mov dstm, dstq
    sub r6d, 1<<16
    jg .v_w4_loop0
%endif
    RET

%if ARCH_X86_64
; Vertical only, w >= 8 (x86-64 only), processed in 8-pixel columns.
; r6d = h + (w << 5) - 256: the low byte keeps h, the upper bits count the
; remaining columns. r4/r7 walk the current column; srcq/dstq stay at its top.
.v_w8:
    shl wd, 5
    lea r6d, [hq+wq-256]
.v_w8_loop0:
    movq m1, [srcq+ssq*0]
    movq m2, [srcq+ssq*1]
    lea r4, [srcq+ss3q]
    movq m3, [srcq+ssq*2]
    movq m4, [r4 +ssq*0]
    mov r7, dstq
    movq m5, [r4 +ssq*1]
    movq m6, [r4 +ssq*2]
    add r4, ss3q
    movq m0, [r4 +ssq*0]
    punpcklbw m1, m2 ; 01
    punpcklbw m2, m3 ; 12
    punpcklbw m3, m4 ; 23
    punpcklbw m4, m5 ; 34
    punpcklbw m5, m6 ; 45
    punpcklbw m6, m0 ; 56
.v_w8_loop:
    ; m14 accumulates output row a, m15 output row b
    movq m13, [r4+ssq*1]
    lea r4, [r4+ssq*2]
    pmaddubsw m14, m1, subpel0 ; a0
    mova m1, m3
    pmaddubsw m15, m2, subpel0 ; b0
    mova m2, m4
    pmaddubsw m3, subpel1 ; a1
    mova m12, m0
    pmaddubsw m4, subpel1 ; b1
    movq m0, [r4+ssq*0]
    paddw m14, m3
    paddw m15, m4
    mova m3, m5
    pmaddubsw m5, subpel2 ; a2
    mova m4, m6
    pmaddubsw m6, subpel2 ; b2
    punpcklbw m12, m13 ; 67
    punpcklbw m13, m0 ; 78
    paddw m14, m5
    mova m5, m12
    pmaddubsw m12, subpel3 ; a3
    paddw m15, m6
    mova m6, m13
    pmaddubsw m13, subpel3 ; b3
    paddw m14, m12
    paddw m15, m13
    pmulhrsw m14, m7
    pmulhrsw m15, m7
    packuswb m14, m15
    movq [r7+dsq*0], m14
    movhps [r7+dsq*1], m14
    lea r7, [r7+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    add srcq, 8
    add dstq, 8
    movzx hd, r6b
    sub r6d, 1<<8
    jg .v_w8_loop0
    RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3

; ---------------------------------------------------------------------------
; Horizontal + vertical. For w <= 4 the horizontal filter is the 4-tap one
; (low byte index, taps at +2); the vertical filter index is chosen as in .v.
; Vertical taps are sign-extended to words (subpelv0..3) because the vertical
; pass works on 16-bit intermediates with pmaddwd.
; Horizontal results are scaled with pmulhrsw by w8192reg; the vertical sum
; gets d512reg added and is shifted right by 10.
; ---------------------------------------------------------------------------
.hv:
    RESET_STACK_STATE
    cmp wd, 4
    jg .hv_w8
%if ARCH_X86_32
    and mxd, 0x7f
%else
    movzx mxd, mxb
%endif
    dec srcq
    movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
%if ARCH_X86_32
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    lea r6, [ssq*3]
    sub srcq, r6
    ; r1 is needed for ds below, so the table base moves to r6
    %define base_reg r6
    mov r6, r1 ; use as new base
%assign regs_used 2
    ALLOC_STACK -mmsize*14
%assign regs_used 7
    mov dsq, [rstk+stack_offset+gprsize*2]
    %define subpelv0 [rsp+mmsize*0]
    %define subpelv1 [rsp+mmsize*1]
    %define subpelv2 [rsp+mmsize*2]
    %define subpelv3 [rsp+mmsize*3]
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    pshufd m6, m0, q0000
    mova subpelv0, m6
    pshufd m6, m0, q1111
    mova subpelv1, m6
    pshufd m6, m0, q2222
    mova subpelv2, m6
    pshufd m6, m0, q3333
    mova subpelv3, m6
%else
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
    ALLOC_STACK mmsize*14, 14
    lea ss3q, [ssq*3]
    sub srcq, ss3q
    %define subpelv0 m10
    %define subpelv1 m11
    %define subpelv2 m12
    %define subpelv3 m13
    punpcklbw m0, m0
    psraw m0, 8 ; sign-extend
    mova m8, [base+pw_8192]
    mova m9, [base+pd_512]
    pshufd m10, m0, q0000
    pshufd m11, m0, q1111
    pshufd m12, m0, q2222
    pshufd m13, m0, q3333
%endif
    pshufd m7, m1, q0000 ; broadcast the 4 horizontal taps
    cmp wd, 4
    je .hv_w4

; H+V, w == 2. Rows 0-6 are filtered horizontally up front, then the loop
; filters two new rows per iteration and runs the vertical 8-tap on the
; interleaved row pairs.
.hv_w2:
    mova m6, [base+subpel_h_shuf4]
    movq m2, [srcq+ssq*0] ; 0
    movhps m2, [srcq+ssq*1] ; 0 _ 1
%if ARCH_X86_32
    %define w8192reg [base+pw_8192]
    %define d512reg [base+pd_512]
    lea srcq, [srcq+ssq*2]
    movq m0, [srcq+ssq*0] ; 2
    movhps m0, [srcq+ssq*1] ; 2 _ 3
    lea srcq, [srcq+ssq*2]
%else
    %define w8192reg m8
    %define d512reg m9
    movq m0, [srcq+ssq*2] ; 2
    add srcq, ss3q
    movhps m0, [srcq+ssq*0] ; 2 _ 3
%endif
    pshufb m2, m6 ; 0 ~ 1 ~
    pshufb m0, m6 ; 2 ~ 3 ~
    pmaddubsw m2, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m2, m0 ; 0 1 2 3
    pmulhrsw m2, w8192reg
%if ARCH_X86_32
    movq m3, [srcq+ssq*0] ; 4
    movhps m3, [srcq+ssq*1] ; 4 _ 5
    lea srcq, [srcq+ssq*2]
%else
    movq m3, [srcq+ssq*1] ; 4
    movhps m3, [srcq+ssq*2] ; 4 _ 5
    add srcq, ss3q
%endif
    movq m0, [srcq+ssq*0] ; 6
    pshufb m3, m6 ; 4 ~ 5 ~
    pshufb m0, m6 ; 6 ~
    pmaddubsw m3, m7 ; subpel_filters
    pmaddubsw m0, m7 ; subpel_filters
    phaddw m3, m0 ; 4 5 6 _
    pmulhrsw m3, w8192reg
    palignr m4, m3, m2, 4 ; V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
    punpckhwd m2, m4 ; V 23 34 2 3 3 4
    pshufd m0, m3, q2121 ; V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56 4 5 5 6
.hv_w2_loop:
    movq m4, [srcq+ssq*1] ; V 7
    lea srcq, [srcq+ssq*2] ; V
    movhps m4, [srcq+ssq*0] ; V 7 8
    pshufb m4, m6
    pmaddubsw m4, m7
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2 ; V
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2 ; V
    mova m2, m3 ; V
    pmaddwd m3, subpelv2 ; a2 b2
    phaddw m4, m4
    pmulhrsw m4, w8192reg
    paddd m5, m3 ; V
    palignr m3, m4, m0, 12
    mova m0, m4
    punpcklwd m3, m0 ; V 67 78
    pmaddwd m4, m3, subpelv3 ; V a3 b3
    paddd m5, d512reg
    paddd m5, m4
    psrad m5, 10
    packssdw m5, m5
    packuswb m5, m5
    movd r4d, m5
    mov [dstq+dsq*0], r4w
    shr r4d, 16
    mov [dstq+dsq*1], r4w
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
%undef w8192reg
%undef d512reg

; H+V, w == 4. The block is handled as two passes per iteration, "low" (shuffle
; at subpel_h_shuf4) and "high" (shuffle at subpel_h_shuf4+16), each with its
; own set of row-pair vectors. The inactive set lives in the stack slots
; hv4_line_<set>_<n>; slots 0-3 of the stack hold subpelv0..3 on x86-32, hence
; the slot numbering starting at 4.
; NOTE(review): presumably low/high are the left/right pixel pairs of each
; row - the shuffle table is outside this view.
.hv_w4:
    %define hv4_line_0_0 4
    %define hv4_line_0_1 5
    %define hv4_line_0_2 6
    %define hv4_line_0_3 7
    %define hv4_line_0_4 8
    %define hv4_line_0_5 9
    %define hv4_line_1_0 10
    %define hv4_line_1_1 11
    %define hv4_line_1_2 12
    %define hv4_line_1_3 13
; SAVELINE_W4 reg, n, set: spill reg to stack slot hv4_line_<set>_<n>
%macro SAVELINE_W4 3
    mova [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
; RESTORELINE_W4 reg, n, set: reload reg from stack slot hv4_line_<set>_<n>
%macro RESTORELINE_W4 3
    mova %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
%if ARCH_X86_32
    %define w8192reg [base+pw_8192]
    %define d512reg [base+pd_512]
%else
    %define w8192reg m8
    %define d512reg m9
%endif
    ; lower shuffle 0 1 2 3 4
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 0 _ _ _
    movhps m5, [srcq+ssq*1] ; 0 _ 1 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 2 _ _ _
    movhps m4, [srcq+ssq*1] ; 2 _ 3 _
    lea srcq, [srcq+ssq*2]
%else
    movq m4, [srcq+ssq*2] ; 2 _ _ _
    movhps m4, [srcq+ss3q ] ; 2 _ 3 _
    lea srcq, [srcq+ssq*4]
%endif
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    SAVELINE_W4 m2, 2, 0
    ; upper shuffle 2 3 4 5 6
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
    pmaddubsw m2, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m2, m0 ;H 0 1 2 3
    pmulhrsw m2, w8192reg ;H pw_8192
    ;
    ; lower shuffle
    mova m6, [base+subpel_h_shuf4]
    movq m5, [srcq+ssq*0] ; 4 _ _ _
    movhps m5, [srcq+ssq*1] ; 4 _ 5 _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
    movq m4, [srcq+ssq*0] ; 6 _ _ _
    add srcq, ssq
%else
    movq m4, [srcq+ssq*2] ; 6 _ _ _
    add srcq, ss3q
%endif
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    SAVELINE_W4 m3, 3, 0
    ; upper shuffle
    mova m6, [base+subpel_h_shuf4+16]
    pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
    pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
    pmaddubsw m3, m7 ;H subpel_filters
    pmaddubsw m0, m7 ;H subpel_filters
    phaddw m3, m0 ;H 4 5 6 7
    pmulhrsw m3, w8192reg ;H pw_8192
    ;process high
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    ;process low
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    palignr m4, m3, m2, 4 ;V 1 2 3 4
    punpcklwd m1, m2, m4 ; V 01 12
    punpckhwd m2, m4 ; V 23 34
    pshufd m0, m3, q2121 ;V 5 6 5 6
    punpcklwd m3, m0 ; V 45 56
.hv_w4_loop:
    ;process low
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4]
    movq m4, [srcq+ssq*0] ; 7
    movhps m4, [srcq+ssq*1] ; 7 _ 8 _
    pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw m4, m7 ;H subpel_filters
    phaddw m4, m4 ;H 7 8 7 8
    pmulhrsw m4, w8192reg ;H pw_8192
    palignr m3, m4, m0, 12 ; 6 7 8 7
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d512reg ; pd_512
    paddd m5, m4
    psrad m5, 10
    ; park the low state and the low result (slot 5), switch to the high set
    SAVELINE_W4 m0, 0, 0
    SAVELINE_W4 m1, 1, 0
    SAVELINE_W4 m2, 2, 0
    SAVELINE_W4 m3, 3, 0
    SAVELINE_W4 m5, 5, 0
    ;process high
    RESTORELINE_W4 m0, 0, 1
    RESTORELINE_W4 m1, 1, 1
    RESTORELINE_W4 m2, 2, 1
    RESTORELINE_W4 m3, 3, 1
    pmaddwd m5, m1, subpelv0 ; V a0 b0
    mova m1, m2
    pmaddwd m2, subpelv1 ; V a1 b1
    paddd m5, m2
    mova m2, m3
    pmaddwd m3, subpelv2 ; V a2 b2
    paddd m5, m3
    mova m6, [base+subpel_h_shuf4+16]
    movq m4, [srcq+ssq*0] ; 7
    movhps m4, [srcq+ssq*1] ; 7 _ 8 _
    lea srcq, [srcq+ssq*2]
    pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
    pmaddubsw m4, m7 ;H subpel_filters
    phaddw m4, m4 ;H 7 8 7 8
    pmulhrsw m4, w8192reg ;H pw_8192
    palignr m3, m4, m0, 12 ; 6 7 8 7
    mova m0, m4
    punpcklwd m3, m4 ; 67 78
    pmaddwd m4, m3, subpelv3 ; a3 b3
    paddd m5, d512reg ; pd_512
    paddd m5, m4
    psrad m4, m5, 10
    RESTORELINE_W4 m5, 5, 0
    packssdw m5, m4 ; d -> w
    packuswb m5, m5 ; w -> b
    ; swap the two middle words so the low and high halves of each output
    ; row end up adjacent before the two 4-byte stores
    pshuflw m5, m5, q3120
    movd [dstq+dsq*0], m5
    psrlq m5, 32
    movd [dstq+dsq*1], m5
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    ; the spills/reloads below are plain mova (see the macros), which leave
    ; the flags from `sub hd, 2` intact for the jg
    SAVELINE_W4 m0, 0, 1
    SAVELINE_W4 m1, 1, 1
    SAVELINE_W4 m2, 2, 1
    SAVELINE_W4 m3, 3, 1
    RESTORELINE_W4 m0, 0, 0
    RESTORELINE_W4 m1, 1, 0
    RESTORELINE_W4 m2, 2, 0
    RESTORELINE_W4 m3, 3, 0
    jg .hv_w4_loop
    RET
%undef subpelv0
%undef subpelv1
%undef subpelv2
%undef subpelv3

; ---------------------------------------------------------------------------
; H+V, w >= 8: full 8-tap in both directions, processed in 4-pixel columns
; (each store below is a movd and the column pointers advance by 4).
; r6d = h + (w << 14) - (1 << 16): the low bits keep h, the upper bits count
; the remaining columns. Row-pair vectors that do not fit in registers are
; kept in the stack slots hv8_line_*.
; ---------------------------------------------------------------------------
.hv_w8:
    RESET_STACK_STATE
    %define hv8_line_1 0
    %define hv8_line_2 1
    %define hv8_line_3 2
    %define hv8_line_4 3
    %define hv8_line_6 4
; SAVELINE_W8 n, reg: spill reg to stack slot hv8_line_<n>
%macro SAVELINE_W8 2
    mova [rsp+hv8_line_%1*mmsize], %2
%endmacro
; RESTORELINE_W8 n, reg: reload reg from stack slot hv8_line_<n>
%macro RESTORELINE_W8 2
    mova %2, [rsp+hv8_line_%1*mmsize]
%endmacro
    shr mxd, 16 ; 8-tap horizontal filter index
    sub srcq, 3
%if ARCH_X86_32
    ; x86-32: tap vectors and the two vertical accumulators live on the stack
    %define base_reg r1
    %define subpelh0 [rsp+mmsize*5]
    %define subpelh1 [rsp+mmsize*6]
    %define subpelv0 [rsp+mmsize*7]
    %define subpelv1 [rsp+mmsize*8]
    %define subpelv2 [rsp+mmsize*9]
    %define subpelv3 [rsp+mmsize*10]
    %define accuv0 [rsp+mmsize*11]
    %define accuv1 [rsp+mmsize*12]
    movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, ssb
    shr ssd, 16
    cmp hd, 6
    cmovs ssd, mxd
    movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
    mov ssq, ssmp
    ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < 16
    %define dstm [rsp+mmsize*13+gprsize*1]
    %define dsm [rsp+mmsize*13+gprsize*2]
    mov r6, [rstk+stack_offset+gprsize*2]
    mov dsm, r6
%endif
    pshufd m0, m1, q0000
    pshufd m1, m1, q1111
    punpcklbw m5, m5
    psraw m5, 8 ; sign-extend
    pshufd m2, m5, q0000
    pshufd m3, m5, q1111
    pshufd m4, m5, q2222
    pshufd m5, m5, q3333
    mova subpelh0, m0
    mova subpelh1, m1
    mova subpelv0, m2
    mova subpelv1, m3
    mova subpelv2, m4
    mova subpelv3, m5
    lea r6, [ssq*3]
    mov dstm, dstq
    sub srcq, r6
%else
    ALLOC_STACK 16*5, 16
    %define subpelh0 m10
    %define subpelh1 m11
    %define subpelv0 m12
    %define subpelv1 m13
    %define subpelv2 m14
    %define subpelv3 m15
    %define accuv0 m8
    %define accuv1 m9
    movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
    pshufd subpelh0, m0, q0000
    pshufd subpelh1, m0, q1111
    punpcklbw m1, m1
    psraw m1, 8 ; sign-extend
    pshufd subpelv0, m1, q0000
    pshufd subpelv1, m1, q1111
    pshufd subpelv2, m1, q2222
    pshufd subpelv3, m1, q3333
    lea ss3q, [ssq*3]
    mov r7, dstq
    sub srcq, ss3q
%endif
    shl wd, 14
    lea r6d, [hq+wq-(1<<16)]
    mov r4, srcq ; r4 = source start of the current column
.hv_w8_loop0:
    movu m4, [srcq+ssq*0] ; 0 = _ _
    movu m5, [srcq+ssq*1] ; 1 = _ _
%if ARCH_X86_32
    lea srcq, [srcq+ssq*2]
%endif
; HV_H_W8: 8-tap horizontal filter of the row in %1 using subpelh0/subpelh1;
; the unscaled word sums are left in %1. %2-%4 are clobbered. On x86-64 the
; shuffle masks are passed in %5-%7; x86-32 reads them from memory.
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
%if ARCH_X86_32
    pshufb %3, %1, [base+subpel_h_shufB]
    pshufb %4, %1, [base+subpel_h_shufC]
    pshufb %1, [base+subpel_h_shufA]
%else
    pshufb %3, %1, %6 ; subpel_h_shufB
    pshufb %4, %1, %7 ; subpel_h_shufC
    pshufb %1, %5 ; subpel_h_shufA
%endif
    pmaddubsw %2, %3, subpelh0 ; subpel +0 C0
    pmaddubsw %4, subpelh1 ; subpel +4 B4
    pmaddubsw %3, subpelh1 ; C4
    pmaddubsw %1, subpelh0 ; A0
    paddw %2, %4 ; C0+B4
    paddw %1, %3 ; A0+C4
    phaddw %1, %2
%endmacro
%if ARCH_X86_64
    ; m8/m9 double as accuv0/accuv1 inside .hv_w8_loop, so the masks are
    ; (re)loaded here at the top of every column
    mova m7, [base+subpel_h_shufA]
    mova m8, [base+subpel_h_shufB]
    mova m9, [base+subpel_h_shufC]
%endif
    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
%if ARCH_X86_32
    movu m6, [srcq+ssq*0] ; 2 = _ _
    movu m0, [srcq+ssq*1] ; 3 = _ _
    lea srcq, [srcq+ssq*2]
%else
    movu m6, [srcq+ssq*2] ; 2 = _ _
    add srcq, ss3q
    movu m0, [srcq+ssq*0] ; 3 = _ _
%endif
    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
    HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    mova m7, [base+pw_8192]
    pmulhrsw m4, m7 ; H pw_8192
    pmulhrsw m5, m7 ; H pw_8192
    pmulhrsw m6, m7 ; H pw_8192
    pmulhrsw m0, m7 ; H pw_8192
    punpcklwd m1, m4, m5 ; 0 1 ~
    punpcklwd m2, m5, m6 ; 1 2 ~
    punpcklwd m3, m6, m0 ; 2 3 ~
    SAVELINE_W8 1, m1
    SAVELINE_W8 2, m2
    SAVELINE_W8 3, m3
    mova m7, [base+subpel_h_shufA]
%if ARCH_X86_32
    movu m4, [srcq+ssq*0] ; 4 = _ _
    movu m5, [srcq+ssq*1] ; 5 = _ _
    lea srcq, [srcq+ssq*2]
%else
    movu m4, [srcq+ssq*1] ; 4 = _ _
    movu m5, [srcq+ssq*2] ; 5 = _ _
    add srcq, ss3q
%endif
    movu m6, [srcq+ssq*0] ; 6 = _ _
    HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
    HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
    HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
    mova m7, [base+pw_8192]
    pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
    pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
    pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
    punpcklwd m4, m0, m1 ; 3 4 ~
    punpcklwd m5, m1, m2 ; 4 5 ~
    punpcklwd m6, m2, m3 ; 5 6 ~
    SAVELINE_W8 6, m3
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
.hv_w8_loop:
    ; m8 accu for V a
    ; m9 accu for V b
    ; Save the pairs that become pairs 01/12/23/34 of the next iteration
    ; before the multiplies below overwrite m3-m6.
    SAVELINE_W8 1, m3
    SAVELINE_W8 2, m4
    SAVELINE_W8 3, m5
    SAVELINE_W8 4, m6
%if ARCH_X86_32
    pmaddwd m0, m1, subpelv0 ; a0
    pmaddwd m7, m2, subpelv0 ; b0
    pmaddwd m3, subpelv1 ; a1
    pmaddwd m4, subpelv1 ; b1
    paddd m0, m3
    paddd m7, m4
    pmaddwd m5, subpelv2 ; a2
    pmaddwd m6, subpelv2 ; b2
    paddd m0, m5
    paddd m7, m6
    mova m5, [base+pd_512]
    paddd m0, m5 ; pd_512
    paddd m7, m5 ; pd_512
    mova accuv0, m0
    mova accuv1, m7
%else
    pmaddwd m8, m1, subpelv0 ; a0
    pmaddwd m9, m2, subpelv0 ; b0
    pmaddwd m3, subpelv1 ; a1
    pmaddwd m4, subpelv1 ; b1
    paddd m8, m3
    paddd m9, m4
    pmaddwd m5, subpelv2 ; a2
    pmaddwd m6, subpelv2 ; b2
    paddd m8, m5
    paddd m9, m6
    mova m7, [base+pd_512]
    paddd m8, m7 ; pd_512
    paddd m9, m7 ; pd_512
    mova m7, [base+subpel_h_shufB]
    mova m6, [base+subpel_h_shufC]
    mova m5, [base+subpel_h_shufA]
%endif
    movu m0, [srcq+ssq*1] ; 7
    movu m4, [srcq+ssq*2] ; 8
    lea srcq, [srcq+ssq*2]
    HV_H_W8 m0, m1, m2, m3, m5, m7, m6
    HV_H_W8 m4, m1, m2, m3, m5, m7, m6
    mova m5, [base+pw_8192]
    pmulhrsw m0, m5 ; H pw_8192
    pmulhrsw m4, m5 ; H pw_8192
    RESTORELINE_W8 6, m6
    punpcklwd m5, m6, m0 ; 6 7 ~
    punpcklwd m6, m0, m4 ; 7 8 ~
    pmaddwd m1, m5, subpelv3 ; a3
    paddd m2, m1, accuv0
    pmaddwd m1, m6, subpelv3 ; b3
    paddd m1, m1, accuv1 ; H + V
    psrad m2, 10
    psrad m1, 10
    packssdw m2, m1 ; d -> w
    packuswb m2, m1 ; w -> b
    movd [dstq+dsq*0], m2
    psrlq m2, 32
%if ARCH_X86_32
    add dstq, dsm
    movd [dstq+dsq*0], m2
    add dstq, dsm
%else
    movd [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
%endif
    sub hd, 2
    jle .hv_w8_outer
    ; more rows in this column: row 8 becomes the next iteration's row 6,
    ; and the saved pairs are reloaded one slot down
    SAVELINE_W8 6, m4
    RESTORELINE_W8 1, m1
    RESTORELINE_W8 2, m2
    RESTORELINE_W8 3, m3
    RESTORELINE_W8 4, m4
    jmp .hv_w8_loop
.hv_w8_outer:
    ; column finished: step src/dst four pixels right and restore h
%if ARCH_X86_32
    mov dstq, dstm
    add r4, 4
    movzx hd, r6w
    add dstq, 4
    mov srcq, r4
    mov dstm, dstq
%else
    add r4, 4
    add r7, 4
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
%endif
    sub r6d, 1<<16
    jg .hv_w8_loop0
    RET

; ---------------------------------------------------------------------------
; Register/base setup for the prep_* functions, followed by the start of
; prep_6tap_8bpc (reproduced unchanged).
; ---------------------------------------------------------------------------
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%if ARCH_X86_32
 %define base_reg r2
 %define base base_reg-prep_ssse3
%else
 %define base_reg r7
 %define base 0
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_8bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_8bpc
PREP_8TAP_FN regular, REGULAR, REGULAR

cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    mov wd, wm
    movifnidn srcd, srcm
    movifnidn hd, hm
    LEA base_reg, prep_ssse3
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.prep:
    tzcnt wd, wd
    movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
    pxor 
m4, m4 add wq, base_reg movifnidn ssq, ssmp lea r6, [ssq*3] %if WIN64 pop r8 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv test myd, 0xf00 jnz .hv %if ARCH_X86_32 %define ssq r6 mov ssq, ssmp %endif cmp wd, 4 jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4 WIN64_SPILL_XMM 11 mova m5, [base+pw_8192] %if ARCH_X86_64 mova m8, [base+subpel_h_shufD] mova m9, [base+subpel_h_shufE] mova m10, [base+subpel_h_shufF] %endif shr mxd, 16 sub srcq, 2 movq m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8] punpcklwd m7, m7 pshufd m4, m7, q0000 pshufd m6, m7, q1111 pshufd m7, m7, q2222 sub wd, 16 jge .h_w16 %macro PREP_6TAP_H 3 ; dst/src, tmp[1-2] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufD] pshufb %3, %1, [base+subpel_h_shufE] pshufb %1, [base+subpel_h_shufF] %else pshufb %2, %1, m8 pshufb %3, %1, m9 pshufb %1, m10 %endif pmaddubsw %2, m4 pmaddubsw %3, m6 pmaddubsw %1, m7 paddw %2, %3 paddw %1, %2 pmulhrsw %1, m5 %endmacro .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] PREP_6TAP_H m0, m2, m3 PREP_6TAP_H m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: add srcq, wq neg wq .h_w16_loop_v: mov r5, wq .h_w16_loop_h: movu m0, [srcq+r5+8*0] movu m1, [srcq+r5+8*1] PREP_6TAP_H m0, m2, m3 PREP_6TAP_H m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r5, 16 jle .h_w16_loop_h add srcq, ssq dec hd jg .h_w16_loop_v RET .v: %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else WIN64_SPILL_XMM 9, 12 movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8] punpcklwd m7, m7 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pshufd m7, m7, q2222 %if ARCH_X86_32 %define m8 [base+pw_8192] mov ssq, ssm sub srcq, ssq sub srcq, ssq %else mova m8, [base+pw_8192] mov nsq, ssq neg nsq cmp wd, 4 jg .v_w8 %endif .v_w4: %if ARCH_X86_32 lea r5d, [wq-4] shl r5d, 14 add r5d, hd mov srcm, srcq .v_w4_loop0: movd m1, [srcq+ssq*0] movd m3, [srcq+ssq*1] lea 
srcq, [srcq+ssq*2] %else movd m1, [srcq+nsq*2] movd m3, [srcq+nsq*1] %endif movd m2, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m0, [srcq+ssq*0] punpckldq m1, m3 ; 0 1 punpckldq m3, m2 ; 1 2 punpckldq m2, m4 ; 2 3 punpckldq m4, m0 ; 3 4 punpcklbw m1, m3 ; 01 12 punpcklbw m2, m4 ; 23 34 .v_w4_loop: movd m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m4, m1, m5 ; a0 b0 mova m1, m2 pmaddubsw m2, m6 ; a1 b1 paddw m4, m2 punpckldq m2, m0, m3 ; 4 5 movd m0, [srcq+ssq*0] punpckldq m3, m0 ; 5 6 punpcklbw m2, m3 ; 67 78 pmaddubsw m3, m2, m7 ; a2 b2 paddw m4, m3 pmulhrsw m4, m8 %if ARCH_X86_32 movq [tmpq+wq*0], m4 movhps [tmpq+wq*2], m4 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w4_loop mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq sub r5d, 1<<16 jg .v_w4_loop0 %else mova [tmpq], m4 add tmpq, 16 sub hd, 2 jg .v_w4_loop %endif RET %if ARCH_X86_64 .v_w8: WIN64_PUSH_XMM 12 lea r6d, [wq*4-32] lea r6d, [r6*8+hq] .v_w8_loop0: movq m1, [srcq+nsq*2] movq m2, [srcq+nsq*1] lea r5, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] mov r8, tmpq movq m0, [r5 +ssq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m0 ; 34 .v_w8_loop: pmaddubsw m10, m1, m5 ; a0 mova m1, m3 pmaddubsw m11, m2, m5 ; b0 mova m2, m4 pmaddubsw m3, m6 ; a1 pmaddubsw m4, m6 ; b1 paddw m10, m3 paddw m11, m4 movq m4, [r5+ssq*1] lea r5, [r5+ssq*2] punpcklbw m3, m0, m4 ; 67 movq m0, [r5+ssq*0] punpcklbw m4, m0 ; 78 pmaddubsw m9, m3, m7 ; a2 paddw m10, m9 pmaddubsw m9, m4, m7 ; b2 paddw m11, m9 pmulhrsw m10, m8 pmulhrsw m11, m8 mova [r8+wq*0], m10 mova [r8+wq*2], m11 lea r8, [r8+wq*4] sub hd, 2 jg .v_w8_loop add srcq, 8 add tmpq, 16 movzx hd, r6b sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 .hv: RESET_STACK_STATE cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8] %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else 
movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8] %if ARCH_X86_32 mov ssq, ssmp %define regs_used 6 ALLOC_STACK -mmsize*4 %define regs_used 7 %define m8 [rsp+mmsize*0] %define m9 [rsp+mmsize*1] %define m10 [rsp+mmsize*2] punpcklbw m0, m0 sub srcq, ssq psraw m0, 8 ; sign-extend sub srcq, ssq pshufd m2, m0, q0000 mova m8, m2 pshufd m2, m0, q1111 mova m9, m2 pshufd m2, m0, q2222 mova m10, m2 movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %define m11 [base+pw_8192] %define m12 [base+subpel_h_shufA] %define m13 [rsp+mmsize*3] %define m14 [base+pd_32] pshufd m1, m1, q0000 mova m13, m1 %else WIN64_SPILL_XMM 15 mov nsq, ssq punpcklbw m0, m0 neg nsq psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 movq m3, [srcq+nsq*2] movq m4, [srcq+nsq*1] pshufd m13, m1, q0000 mova m12, [base+subpel_h_shufA] mova m11, [base+pw_8192] mova m14, [base+pd_32] %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m2, [srcq+ssq*0] %if ARCH_X86_32 mova m5, m12 mova m6, m13 REPX {pshufb x, m5 }, m3, m4, m0, m1, m2 mova m5, m11 REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2 %else REPX {pshufb x, m12}, m3, m4, m0, m1, m2 REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2 %endif phaddw m3, m0 ; 0 2 phaddw m4, m1 ; 1 3 phaddw m0, m2 ; 2 4 %if ARCH_X86_32 REPX {pmulhrsw x, m5 }, m3, m4, m0 %else REPX {pmulhrsw x, m11}, m3, m4, m0 %endif punpcklwd m1, m3, m4 ; 01 punpckhwd m3, m4 ; 23 punpcklwd m2, m4, m0 ; 12 punpckhwd m4, m0 ; 34 .hv_w4_loop: movq m7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m6, [srcq+ssq*0] pshufb m7, m12 pshufb m6, m12 pmaddubsw m7, m13 pmaddubsw m6, m13 pmaddwd m5, m8, m1 ; a0 mova m1, m3 phaddw m7, m6 ; 5 6 pmaddwd m6, m8, m2 ; b0 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 pmulhrsw m7, m11 paddd m5, m14 paddd m6, m14 paddd m5, m3 paddd m6, m4 shufpd m4, m0, m7, 0x01 ; 4 5 mova m0, m7 punpcklwd m3, m4, m7 ; 45 punpckhwd m4, m7 ; 56 
pmaddwd m7, m10, m3 ; a2 paddd m5, m7 pmaddwd m7, m10, m4 ; b2 paddd m6, m7 psrad m5, 6 psrad m6, 6 packssdw m5, m6 mova [tmpq], m5 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: RESET_STACK_STATE shr mxd, 16 sub srcq, 2 movq m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8] %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8] %if ARCH_X86_32 mov ssq, ssm %assign regs_used 6 ALLOC_STACK -mmsize*16 %assign regs_used 7 sub srcq, ssq sub srcq, ssq %if STACK_ALIGNMENT < 16 %define srcm [esp+mmsize*15+gprsize*0] %define tmpm [esp+mmsize*15+gprsize*1] mov tmpm, tmpq %endif mov srcm, srcq %else ALLOC_STACK 16*6, 16 mov nsq, ssq neg nsq %endif mova m7, [base+pw_8192] lea r5d, [wq-8] punpcklwd m0, m0 shl r5d, 13 punpcklbw m1, m1 add r5d, hd psraw m1, 8 ; sign-extend pshufd m2, m0, q0000 mova [rsp+16*0], m2 pshufd m2, m0, q1111 mova [rsp+16*1], m2 pshufd m0, m0, q2222 mova [rsp+16*2], m0 pshufd m2, m1, q0000 mova [rsp+16*3], m2 pshufd m2, m1, q1111 mova [rsp+16*4], m2 pshufd m1, m1, q2222 mova [rsp+16*5], m1 %macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \ [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3] pshufb %2, %1, %4 pshufb %1, %5 pmaddubsw %3, %2, %6 shufps %2, %1, q2121 pmaddubsw %1, %8 pmaddubsw %2, %7 paddw %1, %3 paddw %1, %2 pmulhrsw %1, m7 %endmacro .hv_w8_loop0: mova m2, [base+subpel_h_shufD] mova m3, [base+subpel_h_shufF] mova m4, [rsp+16*0] %if ARCH_X86_32 movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] PREP_HV_H_6TAP m0, m5, m6, m2, m3, m4 PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4 movu m5, [srcq+ssq*0] punpcklwd m6, m0, m1 ; 01 punpckhwd m0, m1 mova [rsp+16* 6], m6 mova [rsp+16* 7], m0 PREP_HV_H_6TAP m5, m0, m6, m2, m3, m4 movu m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m6, m1, m5 ; 12 punpckhwd m1, m5 mova [rsp+16* 8], m6 mova [rsp+16* 9], m1 PREP_HV_H_6TAP m0, m1, m6, m2, m3, 
m4 movu m1, [srcq+ssq*0] punpcklwd m6, m5, m0 ; 23 punpckhwd m5, m0 mova [rsp+16*10], m6 mova [rsp+16*11], m5 PREP_HV_H_6TAP m1, m5, m6, m2, m3, m4 mova [rsp+16*14], m1 punpcklwd m6, m0, m1 ; 34 punpckhwd m0, m1 mova [rsp+16*12], m6 mova [rsp+16*13], m0 .hv_w8_loop: mova m3, [rsp+16* 3] pmaddwd m0, m3, [rsp+16* 6] ; a0 pmaddwd m2, m3, [rsp+16* 7] ; a0' pmaddwd m1, m3, [rsp+16* 8] ; b0 pmaddwd m3, [rsp+16* 9] ; b0' mova m6, [rsp+16* 4] mova m4, [rsp+16*10] mova m5, [rsp+16*11] mova [rsp+16* 6], m4 pmaddwd m4, m6 ; a1 mova [rsp+16* 7], m5 pmaddwd m5, m6 ; a1' paddd m0, m4 mova m4, [rsp+16*12] paddd m2, m5 mova m5, [rsp+16*13] mova [rsp+16* 8], m4 pmaddwd m4, m6 ; b1 mova [rsp+16* 9], m5 pmaddwd m5, m6 ; b1' movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] paddd m1, m4 paddd m3, m5 PREP_HV_H_6TAP m6, m4, m5 mova m4, [base+pd_32] mova m5, [rsp+16*14] REPX {paddd x, m4}, m0, m2, m1, m3 punpcklwd m4, m5, m6 ; 45 punpckhwd m5, m6 mova [rsp+16*10], m4 mova [rsp+16*11], m5 pmaddwd m4, [rsp+16*5] ; a2 pmaddwd m5, [rsp+16*5] ; a2' paddd m0, m4 movu m4, [srcq+ssq*0] paddd m2, m5 psrad m0, 6 psrad m2, 6 packssdw m0, m2 PREP_HV_H_6TAP m4, m2, m5 mova m2, [rsp+16*5] punpcklwd m5, m6, m4 ; 56 mova [rsp+16*14], m4 punpckhwd m6, m4 mova [rsp+16*12], m5 pmaddwd m5, m2 ; b2 mova [rsp+16*13], m6 pmaddwd m6, m2 ; b2' paddd m1, m5 paddd m3, m6 psrad m1, 6 psrad m3, 6 packssdw m1, m3 mova [tmpq+wq*0], m0 mova [tmpq+wq*2], m1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w8_loop mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 8 add tmpq, 16 mov srcm, srcq mov tmpm, tmpq %else movu m9, [srcq+nsq*2] movu m11, [srcq+nsq*1] lea r6, [srcq+ssq*2] movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] mov r8, tmpq movu m6, [r6 +ssq*0] mova m5, [rsp+16*1] mova m8, [rsp+16*2] PREP_HV_H_6TAP m9, m0, m1, m2, m3, m4, m5, m8 PREP_HV_H_6TAP m11, m0, m1, m2, m3, m4, m5, m8 PREP_HV_H_6TAP m13, m0, m1, m2, m3, m4, m5, m8 PREP_HV_H_6TAP m15, m0, m1, m2, m3, m4, m5, m8 PREP_HV_H_6TAP m6, m0, m1, m2, m3, m4, m5, m8 
punpcklwd m8, m9, m11 ; 01 punpckhwd m9, m11 punpcklwd m10, m11, m13 ; 12 punpckhwd m11, m13 punpcklwd m12, m13, m15 ; 23 punpckhwd m13, m15 punpcklwd m14, m15, m6 ; 34 punpckhwd m15, m6 .hv_w8_loop: mova m3, [rsp+16*3] mova m4, [rsp+16*4] mova m5, [base+pd_32] pmaddwd m0, m8, m3 ; a0 mova m8, m12 pmaddwd m2, m9, m3 ; a0' mova m9, m13 pmaddwd m1, m10, m3 ; b0 mova m10, m14 pmaddwd m3, m11 ; b0' mova m11, m15 REPX {pmaddwd x, m4}, m12, m13, m14, m15 REPX {paddd x, m5}, m0, m2, m1, m3 paddd m0, m12 paddd m2, m13 paddd m1, m14 paddd m3, m15 movu m15, [r6+ssq*1] lea r6, [r6+ssq*2] PREP_HV_H_6TAP m15, m4, m5 punpcklwd m12, m6, m15 punpckhwd m13, m6, m15 movu m6, [r6+ssq*0] PREP_HV_H_6TAP m6, m4, m5 mova m4, [rsp+16*5] punpcklwd m14, m15, m6 punpckhwd m15, m6 pmaddwd m5, m12, m4 ; a2 paddd m0, m5 pmaddwd m5, m13, m4 ; a2' paddd m2, m5 pmaddwd m5, m14, m4 ; b2 paddd m1, m5 pmaddwd m4, m15 ; b2' paddd m3, m4 REPX {psrad x, 6}, m0, m2, m1, m3 packssdw m0, m2 packssdw m1, m3 mova [r8+wq*0], m0 mova [r8+wq*2], m1 lea r8, [r8+wq*4] sub hd, 2 jg .hv_w8_loop add srcq, 8 add tmpq, 16 movzx hd, r5b %endif sub r5d, 1<<16 jg .hv_w8_loop0 RET PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_8bpc PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_8bpc PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_8bpc PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_8bpc PREP_8TAP_FN sharp, SHARP, SHARP cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v mov wd, wm movifnidn srcd, srcm movifnidn hd, hm LEA base_reg, prep_ssse3 test mxd, 0xf00 jnz .h test myd, 0xf00 jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep .v: %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else WIN64_SPILL_XMM 16 movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] mova m2, [base+pw_512] mova m7, [base+pw_8192] 
punpcklwd m0, m0 %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed ALLOC_STACK -mmsize*4 %assign regs_used 7 mov strideq, [rstk+stack_offset+gprsize*3] pshufd m1, m0, q0000 mova subpel0, m1 pshufd m1, m0, q1111 mova subpel1, m1 lea r5, [strideq*3] pshufd m1, m0, q2222 mova subpel2, m1 pshufd m1, m0, q3333 mova subpel3, m1 sub srcq, r5 %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 pshufd m8, m0, q0000 pshufd m9, m0, q1111 lea stride3q, [strideq*3] pshufd m10, m0, q2222 pshufd m11, m0, q3333 sub srcq, stride3q cmp wd, 8 jns .v_w8 %endif .v_w4: %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize %define srcm [esp+stack_size+gprsize*1] %define tmpm [esp+stack_size+gprsize*2] %endif mov tmpm, tmpq mov srcm, srcq lea r5d, [wq - 4] ; horizontal loop shl r5d, (16 - 2) ; (wq / 4) << 16 mov r5w, hw .v_w4_loop0: %endif movd m1, [srcq+strideq*0] movd m0, [srcq+strideq*1] %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movd m2, [srcq+strideq*0] movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movd m3, [srcq+strideq*0] movd m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %else movd m2, [srcq+strideq*2] add srcq, stride3q movd m4, [srcq+strideq*0] movd m3, [srcq+strideq*1] movd m5, [srcq+strideq*2] add srcq, stride3q %endif punpckldq m1, m0 ; 0 1 punpckldq m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+strideq*0] punpckldq m2, m4 ; 2 3 punpckldq m4, m3 ; 3 4 punpckldq m3, m5 ; 4 5 punpckldq m5, m0 ; 5 6 punpcklbw m2, m4 ; 23 34 punpcklbw m3, m5 ; 45 56 .v_w4_loop: mova m5, m1 pmaddubsw m5, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddw m5, m3 punpckldq m3, m0, m4 ; 6 7 _ _ movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 mova m4, m3 
pmaddubsw m4, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq sub r5d, 1<<16 ; horizontal-- jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: lea r6d, [wq*8-64] mov r5, srcq mov r8, tmpq lea r6d, [hq+r6*4] .v_w8_loop0: movq m1, [srcq+strideq*0] movq m2, [srcq+strideq*1] movq m3, [srcq+strideq*2] add srcq, stride3q movq m4, [srcq+strideq*0] movq m5, [srcq+strideq*1] movq m6, [srcq+strideq*2] add srcq, stride3q movq m0, [srcq+strideq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m5 ; 34 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, subpel1 ; a1 pmaddubsw m4, subpel1 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, subpel2 ; a2 pmaddubsw m6, subpel2 ; b2 punpcklbw m12, m0, m13 ; 67 movq m0, [srcq+strideq*0] punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 pmaddubsw m12, subpel3 ; a3 paddw m15, m6 mova m6, m13 pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 movu [tmpq+wq*0], m14 movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop add r5, 8 add r8, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r8 sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 %undef subpel0 %undef subpel1 %undef subpel2 %undef subpel3 .h_w4: WIN64_SPILL_XMM 7 %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] mova m5, [base+subpel_h_shufA] mova m6, [base+pw_8192] movifnidn r2, stridemp pshufd m4, m4, q0000 lea r3, [r2*3] .h_w4_loop: movq m0, [srcq+r2*0] movq m1, [srcq+r2*1] movq m2, [srcq+r2*2] movq m3, [srcq+r3 ] lea srcq, [srcq+r2*4] REPX {pshufb x, m5}, 
m0, m1, m2, m3 REPX {pmaddubsw x, m4}, m0, m1, m2, m3 phaddw m0, m1 phaddw m2, m3 pmulhrsw m0, m6 pmulhrsw m2, m6 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv cmp wd, 4 je .h_w4 WIN64_SPILL_XMM 12 %if ARCH_X86_32 %define strideq r6 mov strideq, stridem %endif tzcnt wd, wd %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %else %define m10 [base+subpel_h_shufA] %define m11 [base+subpel_h_shufB] %define m9 [base+subpel_h_shufC] %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] movq m6, [base_reg+mxq*8+subpel_filters-prep_ssse3] mova m7, [base+pw_8192] pshufd m5, m6, q0000 pshufd m6, m6, q1111 add wq, base_reg jmp wq %macro PREP_8TAP_H 2 ; dst, src_memloc movu m%1, [%2] pshufb m2, m%1, m11 ; subpel_h_shufB pshufb m3, m%1, m9 ; subpel_h_shufC pshufb m%1, m10 ; subpel_h_shufA mova m4, m2 pmaddubsw m4, m5 ; subpel +0 B0 pmaddubsw m2, m6 ; subpel +4 B4 pmaddubsw m3, m6 ; subpel +4 C4 pmaddubsw m%1, m5 ; subpel +0 A0 paddw m3, m4 paddw m%1, m2 phaddw m%1, m3 pmulhrsw m%1, m7 %endmacro .h_w8: PREP_8TAP_H 0, srcq+strideq*0 PREP_8TAP_H 1, srcq+strideq*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 lea srcq, [srcq+strideq*2] add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: mov r3, -16*1 jmp .h_start .h_w32: mov r3, -16*2 jmp .h_start .h_w64: mov r3, -16*4 jmp .h_start .h_w128: mov r3, -16*8 .h_start: sub srcq, r3 mov r5, r3 .h_loop: PREP_8TAP_H 0, srcq+r3+8*0 PREP_8TAP_H 1, srcq+r3+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r3, 16 jl .h_loop add srcq, strideq mov r3, r5 dec hd jg .h_loop RET .hv: RESET_STACK_STATE cmp wd, 4 jg .hv_w8 and mxd, 0x7f movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign 
regs_used 7 lea r5, [strideq*3+1] sub srcq, r5 %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 psraw m0, 8 pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 mova subpelv1, m6 pshufd m6, m0, q2222 mova subpelv2, m6 pshufd m6, m0, q3333 mova subpelv3, m6 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] ALLOC_STACK mmsize*14, 14 lea stride3q, [strideq*3] sub srcq, stride3q dec srcq %define subpelv0 m10 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 psraw m0, 8 mova m8, [base+pw_8192] mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 %define hv4_line_0_3 7 %define hv4_line_0_4 8 %define hv4_line_0_5 9 %define hv4_line_1_0 10 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d32reg [base+pd_32] %else %define w8192reg m8 %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 mova m6, [base+subpel_h_shuf4] movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 2 _ _ _ movhps m4, [srcq+strideq*1] ; 2 _ 3 _ lea srcq, [srcq+strideq*2] %else movq m4, [srcq+strideq*2] ; 2 _ _ _ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m2, m0 pmulhrsw m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 mova m6, [base+subpel_h_shuf4+16] pshufb m2, m5, m6 ;H subpel_h_shuf4 0~1~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2~3~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters 
phaddw m2, m0 ;H 0 1 2 3 pmulhrsw m2, w8192reg ; lower shuffle mova m6, [base+subpel_h_shuf4] movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 6 _ _ _ add srcq, strideq %else movq m4, [srcq+strideq*2] ; 6 _ _ _ add srcq, stride3q %endif pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle mova m6, [base+subpel_h_shuf4+16] pshufb m3, m5, m6 ;H subpel_h_shuf4 4~5~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6~6~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;process high palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 .hv_w4_loop: ;process low pmaddwd m5, m1, subpelv0 ; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4] movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ pshufb m4, m6 ; H subpel_h_shuf4 7~8~ pmaddubsw m4, m7 ; H subpel_filters phaddw m4, m4 ; H 7878 pmulhrsw m4, w8192reg palignr m3, m4, m0, 12 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 SAVELINE_W4 m0, 0, 0 SAVELINE_W4 m1, 1, 0 SAVELINE_W4 m2, 2, 0 SAVELINE_W4 m3, 3, 0 SAVELINE_W4 m5, 5, 0 ;process high RESTORELINE_W4 m0, 0, 1 RESTORELINE_W4 m1, 1, 1 RESTORELINE_W4 m2, 2, 1 
RESTORELINE_W4 m3, 3, 1 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4+16] movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ pshufb m4, m6 ; H subpel_h_shuf4 7~8~ pmaddubsw m4, m7 ; H subpel_filters phaddw m4, m4 ; H 7878 pmulhrsw m4, w8192reg palignr m3, m4, m0, 12 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 pshufd m5, m5, q3120 movu [tmpq], m5 lea srcq, [srcq+strideq*2] add tmpq, 16 sub hd, 2 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 RESTORELINE_W4 m0, 0, 0 RESTORELINE_W4 m1, 1, 0 RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 jg .hv_w4_loop RET %undef subpelv0 %undef subpelv1 %undef subpelv2 %undef subpelv3 .hv_w8: RESET_STACK_STATE %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 %define hv8_line_4 3 %define hv8_line_6 4 shr mxd, 16 %if ARCH_X86_32 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] %define subpelv1 [rsp+mmsize*8] %define subpelv2 [rsp+mmsize*9] %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 %if STACK_ALIGNMENT < mmsize %define tmpm [rsp+mmsize*13+gprsize*1] %define srcm [rsp+mmsize*13+gprsize*2] %define stridem [rsp+mmsize*13+gprsize*3] mov tmpm, tmpq mov stridem, strideq %endif pshufd m0, m1, q0000 pshufd m1, m1, q1111 punpcklbw m5, m5 psraw m5, 8 pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 pshufd m5, m5, q3333 mova subpelh0, m0 mova subpelh1, m1 mova subpelv0, m2 
mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 lea r5, [strideq*3+3] sub srcq, r5 mov srcm, srcq %else ALLOC_STACK mmsize*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 %define subpelv1 m13 %define subpelv2 m14 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 punpcklbw m1, m1 psraw m1, 8 pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q mov r6, srcq mov r8, tmpq %endif lea r5d, [wq-4] shl r5d, 14 add r5d, hd .hv_w8_loop0: %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %define shufA m7 %define shufB m8 %define shufC m9 %else %define shufA [base+subpel_h_shufA] %define shufB [base+subpel_h_shufB] %define shufC [base+subpel_h_shufC] %endif %macro PREP_8TAP_HV 2 ; dst, src_memloc, tmp[1-2] movu %1, [%2] pshufb m2, %1, shufB pshufb m3, %1, shufC pshufb %1, shufA mova m1, m2 pmaddubsw m1, subpelh0 ; subpel +0 C0 pmaddubsw m3, subpelh1 ; subpel +4 B4 pmaddubsw m2, subpelh1 ; C4 pmaddubsw %1, subpelh0 ; A0 paddw m1, m3 ; C0+B4 paddw %1, m2 ; A0+C4 phaddw %1, m1 %endmacro PREP_8TAP_HV m4, srcq+strideq*0 PREP_8TAP_HV m5, srcq+strideq*1 %if ARCH_X86_64 PREP_8TAP_HV m6, srcq+strideq*2 add srcq, stride3q PREP_8TAP_HV m0, srcq+strideq*0 %else lea srcq, [srcq+strideq*2] PREP_8TAP_HV m6, srcq+strideq*0 PREP_8TAP_HV m0, srcq+strideq*1 lea srcq, [srcq+strideq*2] %endif mova m7, [base+pw_8192] REPX {pmulhrsw x, m7}, m4, m5, m6, m0 punpcklwd m1, m4, m5 ; 01 punpcklwd m2, m5, m6 ; 12 punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 mova m7, [base+subpel_h_shufA] %if ARCH_X86_64 PREP_8TAP_HV m4, srcq+strideq*1 PREP_8TAP_HV m5, 
srcq+strideq*2 add srcq, stride3q PREP_8TAP_HV m6, srcq+strideq*0 %else PREP_8TAP_HV m4, srcq+strideq*0 PREP_8TAP_HV m5, srcq+strideq*1 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m6, srcq+strideq*0 %endif mova m3, [base+pw_8192] pmulhrsw m1, m3, m4 pmulhrsw m2, m3, m5 pmulhrsw m3, m6 punpcklwd m4, m0, m1 ; 34 punpcklwd m5, m1, m2 ; 45 punpcklwd m6, m2, m3 ; 56 SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 SAVELINE_W8 4, m6 %if ARCH_X86_32 pmaddwd m0, m1, subpelv0 ; a0 pmaddwd m7, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m0, m3 paddd m7, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_32] paddd m0, m5 paddd m7, m5 mova accuv0, m0 mova accuv1, m7 %else pmaddwd accuv0, m1, subpelv0 ; a0 pmaddwd accuv1, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd accuv0, m3 paddd accuv1, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd accuv0, m5 paddd accuv1, m6 mova m7, [base+pd_32] paddd accuv0, m7 paddd accuv1, m7 mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] %define shufA m5 %define shufB m7 %define shufC m6 %endif PREP_8TAP_HV m0, srcq+strideq*1 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m4, srcq+strideq*0 mova m5, [base+pw_8192] pmulhrsw m0, m5 pmulhrsw m4, m5 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 67 punpcklwd m6, m0, m4 ; 78 pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 paddd m1, m1, accuv1 psrad m2, 6 psrad m1, 6 packssdw m2, m1 movq [tmpq+wq*0], m2 movhps [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jle .hv_w8_outer SAVELINE_W8 6, m4 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq %else 
    ; x86-64 tail of .hv_w8_outer: r6/r8 hold the source/destination pointers
    ; of the current 4-pixel column; advance both to the next column
    ; (4 source bytes, 8 output bytes) and reload the row counter from the
    ; low byte of r5.
    add                  r6, 4
    add                  r8, 8
    movzx                hd, r5b
    mov                srcq, r6
    mov                tmpq, r8
%endif
    ; The column counter lives above bit 16 of r5d.
    sub                 r5d, 1<<16
    jg .hv_w8_loop0
    RET

; movifprep dst, src
;   Emits "mov dst, src" only when assembling the prep variant (isprep != 0).
%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

; SAVE_REG n
;   Snapshots what r<n>, r<n>q and r<n>d currently expand to under *_save
;   names. On x86-32 it also defines r<n>m_save as the stack slot
;   [rstk+stack_offset+(n+1)*4].
%macro SAVE_REG 1
 %xdefine r%1_save  r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

; LOAD_REG n
;   Restores r<n>, r<n>q, r<n>d (and r<n>m on x86-32) from the names created
;   by SAVE_REG, then undefines the register snapshots.
%macro LOAD_REG 1
 %xdefine r%1  r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

; REMAP_REG dst, src[, own_slot]
;   Makes r<dst>, r<dst>q and r<dst>d expand to whatever r<src>* expand to
;   right now (%xdefine evaluates at definition time).
;   x86-32 only: the third argument selects the memory alias r<dst>m -
;   0 copies r<src>m, anything else points it at dst's own stack slot.
;   The third argument is only passed by the x86-32 callers below.
%macro REMAP_REG 2-3
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro

; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
;   prep only. Saves the highest register name (r14 on x86-64, r5 on x86-32)
;   and then, walking downwards, renames r<i> to the previous r<i-1>, so every
;   register name ends up referring to the register numbered one lower.
;   NOTE(review): presumably this lets prep, which has one leading argument
;   fewer than put, reuse code written against put's register numbering -
;   confirm against the cglobal declarations.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
   SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
   SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro

; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
;   prep only. Inverse of the macro above: walking upwards, renames r<i> to
;   the current r<i+1>, then restores the saved top register via LOAD_REG.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
;   Switches back to the default register names before emitting RET, then -
;   unless the argument is 0 - re-applies the shifted mapping so the code that
;   follows the return is assembled with the same names as the code before it.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

%if ARCH_X86_64
; x86-64 MC_8TAP_SCALED_H: gathers eight 8-byte groups per row at the offsets
; held in r4, r6, r7, r9, r10, r11, r13 and rX, for two consecutive rows
; (srcq advances by ssq after each), then multiplies them with the four
; weight registers. The macro continues past the end of this line.
%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    SWAP                m%2, m%5
    movq                m%1, [srcq+ r4]
    movq                m%2, [srcq+ r6]
    movhps              m%1, [srcq+ r7]
    movhps              m%2, [srcq+ r9]
    movq                m%3, [srcq+r10]
    movq                m%4, [srcq+r11]
    movhps              m%3, [srcq+r13]
    movhps              m%4, [srcq+ rX]
    add                srcq, ssq
    movq                m%5, [srcq+ r4]
    movq                m%6, [srcq+ r6]
    movhps              m%5, [srcq+ r7]
    movhps              m%6, [srcq+ r9]
    movq                m%7, [srcq+r10]
    movq                m%8, [srcq+r11]
    movhps              m%7, [srcq+r13]
    movhps              m%8, [srcq+ rX]
    add                srcq, ssq
    pmaddubsw           m%1, m%9
    pmaddubsw           m%5, m%9
    pmaddubsw           m%2, m%10
    pmaddubsw
m%6, m%10 pmaddubsw m%3, m%11 pmaddubsw m%7, m%11 pmaddubsw m%4, m%12 pmaddubsw m%8, m%12 phaddw m%1, m%2 phaddw m%5, m%6 phaddw m%3, m%4 phaddw m%7, m%8 phaddw m%1, m%3 phaddw m%5, m%7 pmulhrsw m%1, m12 pmulhrsw m%5, m12 SWAP m%2, m%5 %endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets %if %3 == 1 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] %endif movq m0, [srcq+r0] movq m1, [srcq+rX] movhps m0, [srcq+r4] movhps m1, [srcq+r5] add srcq, ssq movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] sub srcq, ssq movq m2, [srcq+r0] movq m3, [srcq+rX] movhps m2, [srcq+r4] movhps m3, [srcq+r5] add srcq, ssq movq m6, [srcq+r0] movq m7, [srcq+rX] movhps m6, [srcq+r4] movhps m7, [srcq+r5] add srcq, ssq pmaddubsw m0, [esp+%1+ 0] pmaddubsw m4, [esp+%1+ 0] pmaddubsw m1, [esp+%1+16] pmaddubsw m5, [esp+%1+16] pmaddubsw m2, [esp+%1+32] pmaddubsw m6, [esp+%1+32] pmaddubsw m3, [esp+%1+48] pmaddubsw m7, [esp+%1+48] phaddw m0, m1 phaddw m4, m5 phaddw m2, m3 phaddw m6, m7 phaddw m0, m2 phaddw m4, m6 pmulhrsw m0, m12 pmulhrsw m4, m12 %if %2 != 0 mova [esp+%2+ 0], m0 mova [esp+%2+16], m4 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %endif %xdefine base_reg r12 %define rndshift 10 %else ; prep %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal 
prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %else cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %endif %define tmp_stridem dword [esp+0x138] %endif %define rndshift 6 %endif %if ARCH_X86_32 mov [esp+0x1f0], t0d mov [esp+0x1f4], t1d %if !isprep && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x09c] %define dym [esp+0x21c] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif mov ssd, ssm %endif LEA base_reg, %1_8tap_scaled_8bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_32 %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isprep && UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if ARCH_X86_64 mov dyd, dym %endif %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %if ARCH_X86_64 
%if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x94] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m12, [base+pw_8192] %ifidn %1, put mova m13, [base+pd_512] %else mova m13, [base+pd_32] %endif %else %define m10 [base+pd_0x3ff] %define m12 [base+pw_8192] %ifidn %1, put %define m13 [base+pd_512] %else %define m13 [base+pd_32] %endif %endif pxor m9, m9 %if ARCH_X86_64 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssq*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 mov r1, r1m mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor 
m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [rsp+0x180], m14 SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] movhps m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 %else pand m7, m8, m11 pandn m8, m15 %define m8 m6 %define m15 m5 por m15, m7 mova [rsp+0x190], m15 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 1 2 3 pmulhrsw m1, m12 ; 4 5 6 7 palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 pmaddwd m8, m4, m11 paddd m5, m6 paddd m7, m8 %else mov mym, myd mov r1, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r1, [r1+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r1*8+0] cmovnz r3, [base+subpel_filters+r1*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 
SWAP m7, m2 %endif paddd m5, m13 paddd m5, m7 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 %if ARCH_X86_64 pextrw r6d, m5, 0 mov [dstq], r6w add dstq, dsq dec hd jz .ret add myd, dyd %else pextrw r3d, m5, 0 mov [dstq], r3w add dstq, dsm dec hd jz .ret mov myd, mym add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [rsp+0x1a0] mova m0, [rsp+0x1b0] mova m2, [rsp+0x1c0] mova m4, [rsp+0x1d0] %define m14 [esp+0x180] %define m15 [esp+0x190] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movq m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop .w2_skip_line: movhps m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 ; 6 7 6 7 palignr m4, m5, m1, 8 ; 4 5 6 7 pshufd m5, m4, q0321 ; 5 6 7 _ mova m1, m4 punpcklwd m2, m4, m5 ; 45 56 punpckhwd m4, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, 
[base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd rX, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m15 m1 %endif mova m5, [base+bdct_lb_dw] movq m6, [base+subpel_s_shuf2] psrld m14, 10 punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 punpcklqdq m6, m6 pshufb m14, m5 paddb m14, m6 %if ARCH_X86_64 pcmpeqd m0, m9 pand m11, m0 %else mova [esp+0x180], m14 SWAP m7, m4 pxor m3, m3 pcmpeqd m0, m3 pand m2, m11, m0 %define m11 m2 %endif pandn m0, m15 %if ARCH_X86_64 SWAP m15, m0 %else %define m15 m0 %endif por m15, m11 %if ARCH_X86_64 movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pshufb m2, m14 pshufb m4, m14 pshufb m3, m14 pshufb m5, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 pmaddubsw m2, m15 pmaddubsw m4, m15 pmaddubsw m3, m15 pmaddubsw m5, m15 phaddw m7, m9 phaddw m8, m10 phaddw m9, m2, m4 phaddw m3, m5 pmulhrsw m7, m12 ; 0 1 pmulhrsw m8, m12 ; 2 3 pmulhrsw m9, m12 ; 4 5 pmulhrsw m3, m12 ; 6 7 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 psrldq m11, m3, 8 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m11 ; 67 mova [rsp+0x00], m7 mova [rsp+0x10], m8 mova [rsp+0x20], m9 %else mova [esp+0x190], m15 lea ss3q, [ssq*3] movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] movu m7, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m2, m14 pshufb m3, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m2, m15 
pmaddubsw m3, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m2, m3 phaddw m7, m6 movu m1, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m1, m14 pshufb m5, m14 pshufb m3, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m5, m15 pmaddubsw m3, m15 pmaddubsw m6, m15 phaddw m1, m5 phaddw m3, m6 pmulhrsw m2, m12 pmulhrsw m7, m12 pmulhrsw m1, m12 pmulhrsw m3, m12 shufps m4, m2, m7, q1032 ; 1 2 shufps m5, m7, m1, q1032 ; 3 4 shufps m6, m1, m3, q1032 ; 5 6 psrldq m0, m3, 8 ; 7 _ mova [esp+0x1a0], m0 %define m11 [esp+0x1a0] punpcklwd m0, m2, m4 ; 01 punpckhwd m2, m4 ; 12 punpcklwd m4, m7, m5 ; 23 punpckhwd m7, m5 ; 34 punpcklwd m5, m1, m6 ; 45 punpckhwd m1, m6 ; 56 punpcklwd m3, [esp+0x1a0] ; 67 mov myd, mym mov r0, r0m mova [esp+0x1b0], m0 ; 01 mova [esp+0x1c0], m4 ; 23 mova [esp+0x1d0], m5 ; 45 mova [esp+0x1e0], m3 ; 67 mova [rsp+0x00], m2 ; 12 mova [rsp+0x10], m7 ; 34 mova [rsp+0x20], m1 ; 56 SWAP m1, m4 SWAP m2, m5 %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m10 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m13 paddd m0, m2 SWAP m4, m0 %endif psrad m4, rndshift packssdw m4, m4 %ifidn %1, put packuswb m4, m4 movd [dstq], m4 add dstq, dsmp %else movq [tmpq], m4 add tmpq, 8 %endif dec hd 
jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [esp+0x1b0] mova m1, [esp+0x1c0] mova m2, [esp+0x1d0] mova m3, [esp+0x1e0] jmp .w4_loop .w4_next_line: %define m14 [esp+0x180] %define m15 [esp+0x190] %endif movu m4, [srcq] test myd, 0x400 jz .w4_skip_line %if ARCH_X86_64 mova m0, [rsp+0x00] mova [rsp+0x00], m1 mova m1, [rsp+0x10] mova [rsp+0x10], m2 mova m2, [rsp+0x20] mova [rsp+0x20], m3 %else mova m5, [esp+0x1c0] mova m0, [rsp+0x000] mova [rsp+0x00], m5 mova [esp+0x1b0], m0 mova m6, [esp+0x1d0] mova m1, [rsp+0x010] mova [rsp+0x10], m6 mova [esp+0x1c0], m1 mova m7, [esp+0x1e0] mova m2, [rsp+0x020] mova [rsp+0x20], m7 mova [esp+0x1d0], m2 %endif pshufb m4, m14 pmaddubsw m4, m15 phaddw m4, m4 pmulhrsw m4, m12 punpcklwd m3, m11, m4 %if ARCH_X86_32 mova [esp+0x1e0], m3 %endif mova m11, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: %if ARCH_X86_32 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] mova m2, [esp+0x1e0] %endif movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m6, [rsp+0x10] mova m7, [rsp+0x20] pshufb m4, m14 pshufb m5, m14 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m4, m5 pmulhrsw m4, m12 punpcklwd m5, m11, m4 mova [rsp+0x00], m6 mova [rsp+0x10], m7 mova [rsp+0x20], m5 %if ARCH_X86_64 psrldq m11, m4, 8 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m11 %else psrldq m6, m4, 8 punpcklwd m3, m4, m6 mova [esp+0x1a0], m6 mova [esp+0x1b0], m0 mova [esp+0x1c0], m1 mova [esp+0x1d0], m2 mova [esp+0x1e0], m3 %endif jmp .w4_loop INIT_XMM ssse3 .w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 
movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 3 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [esp+0x094], myd mov [esp+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [esp+0x130], 8*(isprep+1) mov myd, [esp+0x094] mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %endif mova m15, [rsp+0x120] pxor m9, m9 mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] 
movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m11, m4 pand m8, m11, m6 pand m15, m11, m14 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m11, m5 mova [rsp+0x10], m7 mova [rsp+0x20], m8 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m9, [rsp+0x80] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 SWAP m14, m8 .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 pmaddwd m8, [rsp+0x70], m11 pmaddwd m9, [rsp+0x80], m11 paddd m4, m6 paddd m5, m7 paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 
movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7 mova m5, [esp+0x180] mova m6, [esp+0x190] mova m7, [esp+0x1a0] mova m0, [esp+0x1b0] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x180], m4 mova [esp+0x190], m5 mova [esp+0x1a0], m6 mova [esp+0x1b0], m7 mova m1, [esp+0x140] mova m2, [esp+0x150] mova m3, [esp+0x160] mova m4, [esp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x140], m0 mova [esp+0x150], m1 mova [esp+0x160], m2 mova [esp+0x170], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [esp+0x180], m6 pmaddwd m3, [esp+0x190], m6 pmaddwd m4, [esp+0x1a0], m7 pmaddwd m5, [esp+0x1b0], m7 paddd m0, m2 
paddd m1, m3 paddd m0, m13 paddd m1, m13 paddd m4, m0 paddd m5, m1 %endif psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x140], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line mova m14, [base+unpckw] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ rX] movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] add srcq, ssq mov myd, [rsp+0x140] mov dyd, dym pshufd m9, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m9 ; 3a 2a pshufb m3, m9 ; 3b 2b pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] phaddw m6, m7 phaddw m4, m5 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x50], m14 ; 4a 5a pshufb m6, [rsp+0x60], m14 ; 4b 5b pshufb m7, [rsp+0x70], m9 ; 7a 6a pshufb m8, [rsp+0x80], m9 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m8 jmp .vloop .skip_line: mova m0, [rsp+0x10] mova m1, [rsp+0x20] mova m14, [rsp+0x30] mova m15, [rsp+0x40] MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 mov myd, [rsp+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [rsp+0x50] ; 23a mova m3, [rsp+0x60] ; 23b mova m5, [rsp+0x70] ; 45a mova m6, [rsp+0x80] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [esp+0x140] mova m1, [esp+0x150] mova m2, 
[esp+0x160] mova m3, [esp+0x170] jmp .vloop .next_line: test myd, 0x400 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] jz .skip_line mova m6, [base+unpckw] mova m0, [esp+0x140] mova m1, [esp+0x150] mova m7, [esp+0x180] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x190] mova m2, [esp+0x160] phaddw m4, m3 mova m3, [esp+0x170] pmulhrsw m4, m12 ; 8a 8b mov myd, mym pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x140], m0 mova [esp+0x150], m1 mova m0, [esp+0x1a0] mova m1, [esp+0x1b0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x160], m2 mova [esp+0x170], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x180], m7 mova [esp+0x190], m5 mova [esp+0x1a0], m0 mova [esp+0x1b0], m1 mova m0, [esp+0x140] mova m1, [esp+0x150] jmp .vloop .skip_line: MC_8TAP_SCALED_H 0x20, 0x1c0, 0 mov myd, mym mova m0, [esp+0x160] mova m1, [esp+0x170] mova m2, [esp+0x180] mova m3, [esp+0x190] mova [esp+0x140], m0 mova [esp+0x150], m1 mova m4, [esp+0x1a0] mova m5, [esp+0x1b0] mova [esp+0x160], m2 mova [esp+0x170], m3 mova m6, [esp+0x1c0] mova m7, [esp+0x1d0] mova [esp+0x180], m4 mova [esp+0x190], m5 punpcklwd m4, m6, m7 punpckhwd m6, m7 mova [esp+0x1a0], m4 mova [esp+0x1b0], m6 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec 
srcq movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4 %else mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] %define m10 m4 movd m10, r4 movd m3, r3 mov r3, r3m punpckldq m10, m3 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] add srcq, ss3q punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r0, r0m pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova 
m11, m4 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 pmulhrsw m1, m12 palignr m2, m1, m0, 4 pshufd m4, m1, q2121 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 punpcklwd m2, m1, m4 ; 45 56 .dy1_w2_loop: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 mova m3, m0 mova m0, m2 paddd m5, m13 paddd m6, m7 pshufb m1, m14 pmaddubsw m1, m15 phaddw m1, m1 pmulhrsw m1, m12 palignr m7, m1, m4, 12 punpcklwd m2, m7, m1 ; 67 78 pmaddwd m7, m2, m11 mova m4, m1 paddd m5, m6 paddd m5, m7 psrad m5, rndshift packssdw m5, m5 packuswb m5, m5 movd r4d, m5 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] mov myd, mym mov rX, 
[esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 movq m6, [base+subpel_s_shuf2] %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m7, [srcq+ssq*2] add srcq, ss3q pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m10, r4q punpcklbw m10, m10 psraw m10, 8 pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pshufb m7, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 pmaddubsw m7, m15 phaddw m0, m1 phaddw m2, m3 phaddw m4, m5 phaddw m6, m7, m7 pmulhrsw m0, m12 ; 0 1 pmulhrsw m2, m12 ; 2 3 pmulhrsw m4, m12 ; 4 5 pmulhrsw m6, m12 ; 6 _ shufps m1, m0, m2, q1032 ; 1 2 shufps m3, m2, m4, q1032 ; 3 4 shufps m5, m4, m6, q1032 ; 5 6 punpcklwd m7, m0, m1 ; 01 punpckhwd m0, m1 ; 12 punpcklwd m8, m2, m3 ; 23 punpckhwd m2, m3 ; 34 punpcklwd m9, m4, m5 ; 45 punpckhwd m4, m5 ; 56 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m5, m0 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova [esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m1, m2 movu m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] mov r0, r0m phaddw m3, m0 pshufb m2, m14 pmaddubsw m2, m15 %define m14 
[esp+0x00] %define m15 [esp+0x10] phaddw m7, m6 phaddw m2, m2 movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mova [esp+0x20], m6 pmulhrsw m1, m12 ; 0 1 pmulhrsw m3, m12 ; 2 3 pmulhrsw m7, m12 ; 4 5 pmulhrsw m2, m12 ; 6 _ shufps m0, m1, m3, q1032 ; 1 2 shufps m4, m3, m7, q1032 ; 3 4 shufps m5, m7, m2, q1032 ; 5 6 punpcklwd m6, m1, m0 ; 01 punpckhwd m1, m0 ; 12 mova [esp+0x30], m1 punpcklwd m1, m3, m4 ; 23 punpckhwd m3, m4 ; 34 mova [esp+0x40], m3 punpcklwd m3, m7, m5 ; 45 punpckhwd m7, m5 ; 56 mova [esp+0x50], m7 mova [esp+0x60], m2 mova m0, [esp+0x20] %xdefine m8 m1 %xdefine m9 m3 %xdefine m10 m0 SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 %endif pshufd m1, m10, q0000 pshufd m3, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_64 mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova [esp+0x70], m8 mova [esp+0x80], m9 mova [esp+0x90], m1 mova [esp+0xa0], m3 mova [esp+0xb0], m5 mova [esp+0xc0], m10 %ifidn %1, put mov dsd, dsm %endif %define m11 m6 %endif .dy1_w4_loop: %if ARCH_X86_64 movu m11, [srcq+ssq*0] pmaddwd m7, m1 pmaddwd m8, m3 pmaddwd m0, m1 pmaddwd m2, m3 pmaddwd m9, m5 pmaddwd m4, m5 paddd m7, m8 paddd m0, m2 movu m8, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m11, m14 pmaddubsw m11, m15 paddd m7, m13 paddd m0, m13 paddd m7, m9 paddd m0, m4 pshufb m8, m14 pmaddubsw m8, m15 phaddw m11, m8 mova m8, [rsp+0x20] pmulhrsw m11, m12 punpcklwd m9, m6, m11 ; 67 psrldq m6, m11, 8 punpcklwd m4, m11, m6 ; 78 pmaddwd m2, m9, m10 pmaddwd m11, m4, m10 paddd m7, m2 mova m2, [rsp+0x30] paddd m0, m11 %else SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 movu m5, [srcq+ssq*0] mova m0, [esp+0x30] mova m2, [esp+0x40] mova m4, [esp+0x50] pmaddwd m6, [esp+0x90] pmaddwd m1, [esp+0xa0] pmaddwd m0, [esp+0x90] pmaddwd m2, [esp+0xa0] pmaddwd m3, [esp+0xb0] pmaddwd m4, [esp+0xb0] paddd m6, m1 paddd m0, m2 movu m7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m5, m14 pmaddubsw m5, m15 paddd m6, m13 paddd m0, m13 paddd m6, m3 
paddd m0, m4 pshufb m7, m14 pmaddubsw m7, m15 phaddw m5, m7 mova m7, [rsp+0x80] pmulhrsw m5, m12 punpcklwd m3, [esp+0x60], m5 ; 67 psrldq m1, m5, 8 punpcklwd m4, m5, m1 ; 78 pmaddwd m2, m3, [esp+0xc0] pmaddwd m5, m4, [esp+0xc0] mova [esp+0x60], m1 paddd m6, m2 mova m2, [esp+0x50] paddd m0, m5 SWAP m7, m6 %endif psrad m7, rndshift psrad m0, rndshift packssdw m7, m0 %if ARCH_X86_64 mova m0, [rsp+0x10] %else mova m0, [esp+0x40] %define m11 m5 %endif %ifidn %1, put packuswb m7, m7 psrldq m11, m7, 4 movd [dstq+dsq*0], m7 movd [dstq+dsq*1], m11 lea dstq, [dstq+dsq*2] %else mova [tmpq], m7 add tmpq, 16 %endif sub hd, 2 jz .ret %if ARCH_X86_64 mova m7, [rsp+0x00] mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova m7, [esp+0x70] ; 01 mova m1, [esp+0x80] ; 23 mova m2, [esp+0x50] ; 34 mova [esp+0x30], m0 mova [esp+0x70], m1 mova [esp+0x40], m2 mova [esp+0x80], m3 mova [esp+0x50], m4 %endif jmp .dy1_w4_loop INIT_XMM ssse3 .dy1_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; 
mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: pxor m9, m9 %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq 
m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 mova m14, [base+unpckw] %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 
pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x060], m0 mova [esp+0x070], m1 mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy1_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m9 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 %else pmaddwd m6, [rsp+0x1a0], m10 pmaddwd m7, [rsp+0x1b0], m10 %endif paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x70], m11 pmaddwd m7, [rsp+0x80], m11 %else pmaddwd m6, [rsp+0x1c0], m11 pmaddwd m7, [rsp+0x1d0], m11 %endif paddd m4, m6 paddd m5, m7 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif %if ARCH_X86_32 mov r0m, r0 %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ 
rX] add srcq, ssq pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] phaddw m4, m5 phaddw m6, m7 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x70], m15 ; 7a 6a pshufb m7, [rsp+0x80], m15 ; 7b 6b pshufb m6, [rsp+0x50], m14 ; 4a 5a pshufb m15, [rsp+0x60], m14 ; 4b 5b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m15 ; 34b punpckhwd m6, m5 ; 56a punpckhwd m15, m7 ; 56b punpcklwd m5, m4 ; 78a psrldq m4, 8 punpcklwd m7, m4 ; 78b mova [rsp+0x50], m6 mova [rsp+0x60], m15 mova [rsp+0x70], m5 mova [rsp+0x80], m7 %else mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova m6, [base+unpckw] mova m0, [esp+0x060] mova m1, [esp+0x070] mova m7, [esp+0x1a0] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x1b0] mova m2, [esp+0x180] phaddw m4, m3 mova m3, [esp+0x190] pmulhrsw m4, m12 ; 8a 8b pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x60], m0 mova [esp+0x70], m1 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x180], m2 mova [esp+0x190], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x1a0], m7 mova [esp+0x1b0], m5 mova [esp+0x1c0], m0 mova [esp+0x1d0], m1 mova m0, [esp+0x60] mova m1, [esp+0x70] %endif jmp 
.dy1_vloop INIT_XMM ssse3 .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] movhps m0, [srcq+ssq*2] movhps m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4q %else mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %define m10 m4 movd m10, r4 movd m3, r5 punpckldq m10, m3 %endif movq m3, [srcq+ssq*0] movhps m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r5, r0m %define dstq r5 mov 
dsd, dsm pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova m11, m4 %endif pshufb m0, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pslldq m2, m3, 8 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m2, m0, q3110 ; 0 2 2 4 pshufd m1, m1, q3110 ; 1 3 3 5 punpcklwd m3, m2, m1 ; 01 23 punpckhwd m2, m1 ; 23 45 .dy2_w2_loop: movq m6, [srcq+ssq*0] movq m7, [srcq+ssq*1] movhps m6, [srcq+ssq*2] movhps m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m4, m3, m8 pmaddwd m5, m2, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 phaddw m6, m7 pmulhrsw m6, m12 psrldq m7, m6, 8 palignr m6, m0, 8 palignr m7, m1, 8 mova m0, m6 mova m1, m7 pshufd m6, m6, q3221 pshufd m7, m7, q3221 punpcklwd m3, m6, m7 ; 45 67 punpckhwd m2, m6, m7 ; 67 89 pmaddwd m6, m3, m10 pmaddwd m7, m2, m11 paddd m4, m5 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift packssdw m4, m4 packuswb m4, m4 movd r4d, m4 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %define dstq r0 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, 
[base+subpel_filters+r13*8+2] movq m6, [base+subpel_s_shuf2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] movq m6, [base+subpel_s_shuf2] mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m1, [srcq+ssq*1] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 pshufb m14, [base+bdct_lb_dw] movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m11, r4q punpcklbw m11, m11 psraw m11, 8 pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m0, m2 phaddw m1, m3 phaddw m4, m5 pmulhrsw m0, m12 ; 0 2 pmulhrsw m1, m12 ; 1 3 pmulhrsw m4, m12 ; 4 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m3, [srcq+ssq*1] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova 
[esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 %define m14 [esp+0x00] %define m15 [esp+0x10] phaddw m1, m2 phaddw m3, m0 phaddw m7, m6 %ifidn %1, put mov dsd, dsm %define dstq r5 %else %define tmpq r5 %endif movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mov r5, r0m pmulhrsw m1, m12 ; 0 2 pmulhrsw m3, m12 ; 1 3 pmulhrsw m7, m12 ; 4 5 SWAP m0, m1, m3 SWAP m4, m7 pshufd m2, m6, q0000 pshufd m3, m6, q1111 pshufd m7, m6, q2222 pshufd m6, m6, q3333 mova [esp+0x30], m2 mova [esp+0x40], m3 mova [esp+0x50], m7 mova [esp+0x60], m6 %define m8 [esp+0x30] %define m9 [esp+0x40] %define m10 [esp+0x50] %define m11 [esp+0x60] %endif psrldq m5, m4, 8 ; 5 _ punpckhwd m2, m0, m1 ; 23 punpcklwd m0, m1 ; 01 punpcklwd m4, m5 ; 45 .dy2_w4_loop: pmaddwd m0, m8 ; a0 pmaddwd m5, m2, m8 ; b0 pmaddwd m2, m9 ; a1 pmaddwd m7, m4, m9 ; b1 pmaddwd m3, m4, m10 ; a2 paddd m0, m13 paddd m5, m13 paddd m0, m2 paddd m5, m7 paddd m0, m3 movu m6, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m6, m14 pshufb m7, m14 pshufb m3, m14 pshufb m1, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 pmaddubsw m3, m15 pmaddubsw m1, m15 phaddw m6, m7 phaddw m3, m1 pmulhrsw m6, m12 ; 6 7 pmulhrsw m3, m12 ; 8 9 psrldq m7, m6, 8 psrldq m1, m3, 8 punpcklwd m6, m7 ; 67 punpcklwd m3, m1 ; 89 mova m2, m6 pmaddwd m1, m6, m10 ; b2 pmaddwd m6, m11 ; a3 pmaddwd m7, m3, m11 ; b3 paddd m5, m1 paddd m0, m6 paddd m5, m7 psrad m0, rndshift psrad m5, rndshift packssdw m0, m5 %ifidn %1, put packuswb m0, m0 psrldq m1, m0, 4 movd [dstq+dsq*0], m0 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m0 add tmpq, 16 %endif mova m0, m4 mova m4, m3 sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET INIT_XMM ssse3 .dy2_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0x90], 4 movifprep 
tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define tmpq r0 %define ssq ssm %else %define dstq r0 %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: pxor m9, m9 %if ARCH_X86_64 
mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, 
[rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy2_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd 
m4, m0, m8
    pmaddwd m5, m1, m8
    pmaddwd m6, m2, m9
    pmaddwd m7, m3, m9
    paddd m4, m13
    paddd m5, m13
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x50], m10
    pmaddwd m7, [rsp+0x60], m10
%else
    pmaddwd m6, [esp+0x1a0], m10
    pmaddwd m7, [esp+0x1b0], m10
%endif
    paddd m4, m6
    paddd m5, m7
%if ARCH_X86_64
    pmaddwd m6, [rsp+0x70], m11
    pmaddwd m7, [rsp+0x80], m11
%else
    pmaddwd m6, [esp+0x1c0], m11
    pmaddwd m7, [esp+0x1d0], m11
%endif
    paddd m4, m6
    paddd m5, m7
    psrad m4, rndshift
    psrad m5, rndshift
    packssdw m4, m5
%ifidn %1, put
    ; put: saturate the 8 words to bytes and store 8 pixels, then step dst by
    ; the stride kept in dsm.
    packuswb m4, m4
    movq [dstq], m4
    add dstq, dsm
%else
    ; prep: store the 8 words unchanged, then step tmp by tmp_stridem.
    mova [tmpq], m4
    add tmpq, tmp_stridem
%endif
%if ARCH_X86_32
    mov r0m, r0
%endif
    dec hd
    jz .dy2_hloop_prep
%if ARCH_X86_64
    mova m8, [rsp+0x10]
    mova m9, [rsp+0x20]
    mova m10, [rsp+0x30]
    mova m11, [rsp+0x40]
    mova m0, m2 ; 01a
    mova m1, m3 ; 01b
    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
    mova m3, [rsp+0x50] ; 23a
    mova m4, [rsp+0x60] ; 23b
    mova m5, [rsp+0x70] ; 45a
    mova m7, [rsp+0x80] ; 45b
    mova m8, [rsp+0x140]
    mova m9, [rsp+0x150]
    mova m10, [rsp+0x160]
    mova m11, [rsp+0x170]
    punpcklwd m14, m2, m6 ; 67a
    punpckhwd m2, m6 ; 67b
    mova [rsp+0x50], m5
    mova [rsp+0x60], m7
    mova [rsp+0x70], m14
    mova [rsp+0x80], m2
    mova m2, m3
    mova m3, m4
%else
    MC_8TAP_SCALED_H 0x20, 0
    punpcklwd m6, m0, m4
    punpckhwd m7, m0, m4
    mova m0, [esp+0x180] ; 01a
    mova m1, [esp+0x190] ; 01b
    mova m2, [rsp+0x1a0] ; 23a
    mova m3, [esp+0x1b0] ; 23b
    mova m4, [esp+0x1c0] ; 45a
    mova m5, [esp+0x1d0] ; 45b
    mova [esp+0x180], m2
    mova [esp+0x190], m3
    mova [esp+0x1a0], m4
    mova [esp+0x1b0], m5
    mova [esp+0x1c0], m6 ; 67a
    mova [esp+0x1d0], m7 ; 67b
%endif
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
%define r0m [rstk+stack_offset+ 4]
%define r1m [rstk+stack_offset+ 8]
%define r2m [rstk+stack_offset+12]
%define r3m [rstk+stack_offset+16]
%endif
%undef isprep
%endmacro

; BILIN_SCALED_FN <put|prep>
; Emits the entry point <%1>_bilin_scaled_8bpc. It loads the same constant
; (5*15 in both 16-bit halves) into t0d and t1d and tail-jumps into the
; matching <%1>_8tap_scaled_8bpc function, so the bilinear case reuses the
; 8-tap scaled code path.
; NOTE(review): 5*15 presumably selects the bilinear rows of the filter
; table - confirm against the table layout.
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled_8bpc
    mov t0d, (5*15 << 16) | 5*15
    mov t1d, (5*15 << 16) | 5*15
    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
%endmacro

; Scratch registers t0/t1 used by the put_8tap_scaled entry points.
%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif

; The trailing comma is intentional: the arguments written at each use site
; are appended, giving "FN put_8tap_scaled, <name>, <type_h>, <type_v>[, <target>]".
%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp, SHARP, SHARP, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, put_8tap_scaled_8bpc
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, put_8tap_scaled_8bpc
; The last variant passes no target argument and is immediately followed by
; the function body.
; NOTE(review): presumably FN then falls through instead of jumping - verify
; against FN's definition.
PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED put

; Scratch registers t0/t1 used by the prep_8tap_scaled entry points.
%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif

; Same trailing-comma construction as PUT_8TAP_SCALED_FN, for prep.
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp, SHARP, SHARP, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH, prep_8tap_scaled_8bpc
PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
MC_8TAP_SCALED prep

; Warp helpers for x86-32. alpha/gamma and beta/delta are given the same
; register names further down in this file, so the values are spilled to
; their memory slots (the *m names) and reloaded around each use.
%if ARCH_X86_32
; Store alpha and beta to their memory slots.
%macro SAVE_ALPHA_BETA 0
    mov alpham, alphad
    mov betam, betad
%endmacro
; Store gamma and delta to their memory slots.
%macro SAVE_GAMMA_DELTA 0
    mov gammam, gammad
    mov deltam, deltad
%endmacro
; Save my, then reload alpha, beta and mx.
%macro LOAD_ALPHA_BETA_MX 0
    mov mym, myd
    mov alphad, alpham
    mov betad, betam
    mov mxd, mxm
%endmacro
; Save mx, then reload gamma, delta and my.
%macro LOAD_GAMMA_DELTA_MY 0
    mov mxm, mxd
    mov gammad, gammam
    mov
deltad, deltam
    mov myd, mym
%endmacro
; x86-32 addresses constants relative to PIC_reg (r2), which is loaded with
; the section start ($$).
%define PIC_reg r2
%define PIC_base_offset $$
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
; 64-bit build: the save macros expand to nothing and symbols are used directly.
%define SAVE_ALPHA_BETA
%define SAVE_GAMMA_DELTA
%define PIC_sym(sym) sym
%endif

; copy_args is the number of extra stack bytes reserved for private copies of
; the arguments. It is non-zero only on x86-32 when the stack alignment
; guaranteed to this code is smaller than the alignment it requires.
%if ARCH_X86_32
%if STACK_ALIGNMENT < required_stack_alignment
%assign copy_args 8*4
%else
%assign copy_args 0
%endif
%endif

; RELOC_ARGS
; When copy_args is non-zero, copies arguments r0m-r3m, r5m and r6m into the
; dstm/dsm/srcm/ssm/mxm/mym slots. Clobbers r0-r3 and r5. Expands to nothing
; when copy_args is 0.
%macro RELOC_ARGS 0
%if copy_args
    mov r0, r0m
    mov r1, r1m
    mov r2, r2m
    mov r3, r3m
    mov r5, r5m
    mov dstm, r0
    mov dsm, r1
    mov srcm, r2
    mov ssm, r3
    mov mxm, r5
    mov r0, r6m
    mov mym, r0
%endif
%endmacro

; With SSE4 the odd (high) word of every dword in %1 is replaced by that of
; %2 (pblendw mask 0xAA). Without SSE4, %2 is masked with m10 and OR-ed into
; %1. That path clobbers %2, needs m10 to hold the high-word mask, and needs
; the high words of %1 to be zero already.
%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
%if cpuflag(sse4)
    pblendw %1, %2, 0xAA
%else
    pand %2, m10
    por %1, %2
%endif
%endmacro

; WARP_V: vertical pass of the warp filter for one output row.
;   %1, %2 - destinations for the two result vectors (dwords)
;   %3-%10 - row-pair inputs, interleaved between the two halves:
;            %3/%5/%7/%9 multiply the a-d filters (result in %1),
;            %4/%6/%8/%10 multiply the e-h filters (result in %2).
; Eight 8-byte filter entries a-h are loaded from filterq, indexed by
; (my + k*gamma) >> 10 for k = 0..7; my then advances to my + 3*gamma + delta.
; Clobbers m0-m3, m8, m9, m14, m15, tmp1 and tmp2. m11 must be zero.
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
%if ARCH_X86_32
    ; Only 8 xmm registers: map the high names onto m4-m7.
    ; m11 and m15 share m7.
    %define m8 m4
    %define m9 m5
    %define m14 m6
    %define m15 m7
    %define m11 m7
%endif
%if ARCH_X86_32
    pxor m11, m11
%endif
    lea tmp1d, [myq+gammaq*4]
    lea tmp2d, [myq+gammaq*1]
    shr myd, 10
    shr tmp1d, 10
    movq m2, [filterq+myq *8] ; a
    movq m8, [filterq+tmp1q*8] ; e
    lea tmp1d, [tmp2q+gammaq*4]
    lea myd, [tmp2q+gammaq*1]
    shr tmp2d, 10
    shr tmp1d, 10
    movq m3, [filterq+tmp2q*8] ; b
    movq m0, [filterq+tmp1q*8] ; f
    punpcklwd m2, m3
    punpcklwd m8, m0
    lea tmp1d, [myq+gammaq*4]
    lea tmp2d, [myq+gammaq*1]
    shr myd, 10
    shr tmp1d, 10
    movq m0, [filterq+myq *8] ; c
    movq m9, [filterq+tmp1q*8] ; g
    lea tmp1d, [tmp2q+gammaq*4]
    lea myd, [tmp2q+deltaq] ; my += delta
    shr tmp2d, 10
    shr tmp1d, 10
    movq m3, [filterq+tmp2q*8] ; d
    movq m1, [filterq+tmp1q*8] ; h
    punpcklwd m0, m3
    punpcklwd m9, m1
    punpckldq m1, m2, m0
    punpckhdq m2, m0
    ; Interleaving with the zero register puts each coefficient byte in the
    ; high byte of a word, i.e. coefficient << 8.
    punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd m0, %3
    pmaddwd m3, %5
    pmaddwd m1, %7
    pmaddwd m14, %9
    paddd m0, m3
    paddd m1, m14
    paddd m0, m1
    mova %1, m0
%if ARCH_X86_64
    SWAP m3, m14
%endif
    punpckldq m0, m8, m9
    punpckhdq m8, m9
    punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
    punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
    punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
    punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
    pmaddwd m1, %4
    pmaddwd m14, %6
    pmaddwd m2, %8
    pmaddwd m15, %10
    paddd m1, m14
    paddd m2, m15
    paddd m1, m2
    mova %2, m1
%if ARCH_X86_64
    ; Undo the SWAP above so register names are unchanged after the macro.
    SWAP m14, m3
%endif
%endmacro

; Row-iteration counter: a register on x86-64, a stack slot on x86-32.
%if ARCH_X86_64
    %define counterd r4d
%else
    %if copy_args == 0
        %define counterd dword r4m
    %else
        %define counterd dword [esp+stack_size-4*7]
    %endif
%endif

; WARP_AFFINE_8X8: emits warp_affine_8x8t_8bpc (word output to tmp) and
; warp_affine_8x8_8bpc (byte output to dst). The 8x8t variant calls the
; .main/.main2/.end labels of the 8x8 function rather than duplicating them.
%macro WARP_AFFINE_8X8 0
%if ARCH_X86_64
cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
%else
cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
%if copy_args
%define tmpm [esp+stack_size-4*1]
%define tsm [esp+stack_size-4*2]
%endif
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
.loop:
%if ARCH_X86_32
    ; The four result vectors live in stack slots 0xC0-0xF0 on x86-32.
    %define m12 m4
    %define m13 m5
    %define m14 m6
    %define m15 m7
    mova m12, [esp+0xC0]
    mova m13, [esp+0xD0]
    mova m14, [esp+0xE0]
    mova m15, [esp+0xF0]
%endif
    psrad m12, 13
    psrad m13, 13
    psrad m14, 13
    psrad m15, 13
    packssdw m12, m13
    packssdw m14, m15
    mova m13, [PIC_sym(pw_8192)]
    pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
    pmulhrsw m14, m13
    ; Two rows of 8 words each are written per iteration.
    mova [tmpq+tsq*0], m12
    mova [tmpq+tsq*2], m14
    dec counterd
    jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
%if ARCH_X86_32
    mov tmpm, tmpd
    mov r0, [esp+0x100]
    mov r1, [esp+0x104]
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
    lea tmpq, [tmpq+tsq*4]
    jmp .loop

%if ARCH_X86_64
cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, gamma, my, delta
%else
cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
                              filter, tmp1, gamma, my, delta
; x86-32: alpha/gamma share r0 and beta/delta share r1. Each has its own
; memory slot for the value that is not currently live.
%define alphaq r0
%define alphad r0
%define alpham [esp+gprsize+0x100]
%define betaq r1
%define betad r1
%define betam [esp+gprsize+0x104]
%define gammaq r0
%define gammad r0
%define gammam [esp+gprsize+0x108]
%define
deltaq r1
%define deltad r1
%define deltam [esp+gprsize+0x10C]
%define filterq r3
%define tmp1q r4
%define tmp1d r4
%define tmp1m [esp+gprsize+0x110]
%define myq r5
%define myd r5
%define mym r6m
%if copy_args
; With copy_args the arguments are accessed through private stack copies.
%define dstm [esp+stack_size-4*1]
%define dsm [esp+stack_size-4*2]
%define srcm [esp+stack_size-4*3]
%define ssm [esp+stack_size-4*4]
%define mxm [esp+stack_size-4*5]
%define mym [esp+stack_size-4*6]
%endif
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_32
    mov dstm, dstd
    mov alphad, [esp+0x100]
    mov betad, [esp+0x104]
%endif
    call .main2
    lea dstq, [dstq+dsq*2]
.start:
    ; Round the four dword result vectors (m12-m15) and store them as two
    ; rows of 8 pixels. SSE4 uses packusdw + pavgw against zero. Otherwise
    ; packssdw + pmulhrsw by pw_8192 is used.
%if notcpuflag(sse4)
    %define roundval pw_8192
%if ARCH_X86_64
    mova m10, [PIC_sym(roundval)]
%else
    %define m10 [PIC_sym(roundval)]
%endif
%endif
%if ARCH_X86_32
    %define m12 m5
    %define m13 m6
    mova m12, [esp+0xC0]
    mova m13, [esp+0xD0]
%endif
%if cpuflag(sse4)
%if ARCH_X86_32
    %define m11 m4
    pxor m11, m11
%endif
    psrad m12, 18
    psrad m13, 18
    packusdw m12, m13
    pavgw m12, m11 ; (x + (1 << 10)) >> 11
%else
    psrad m12, 17
    psrad m13, 17
    packssdw m12, m13
    pmulhrsw m12, m10
%endif
%if ARCH_X86_32
    %define m14 m6
    %define m15 m7
    mova m14, [esp+0xE0]
    mova m15, [esp+0xF0]
%endif
%if cpuflag(sse4)
    psrad m14, 18
    psrad m15, 18
    packusdw m14, m15
    pavgw m14, m11 ; (x + (1 << 10)) >> 11
%else
    psrad m14, 17
    psrad m15, 17
    packssdw m14, m15
    pmulhrsw m14, m10
%endif
    packuswb m12, m14
    movq [dstq+dsq*0], m12
    movhps [dstq+dsq*1], m12
    dec counterd
    jg .loop
.end:
    RET
ALIGN function_align
; .main: one-time setup plus the first seven horizontally filtered rows.
; It reads the four 16-bit parameters at abcdq (alpha, beta, gamma, delta),
; pre-subtracts 3*gamma from delta and 3*alpha from beta, and rewinds src by
; 3 rows + 3 pixels. It then calls .h once per row, keeping the results as
; interleaved row pairs, and falls through into .main2.
.main:
    ; Entered via call, so the return address shifts the stack-relative offsets.
%assign stack_offset stack_offset+gprsize
%if ARCH_X86_32
%assign stack_size stack_size+4
%if copy_args
%assign stack_offset stack_offset-4
%endif
    RELOC_ARGS
    LEA PIC_reg, $$
    %define PIC_mem [esp+gprsize+0x114]
    mov abcdd, abcdm
%if copy_args == 0
    mov ssd, ssm
    mov mxd, mxm
%endif
    mov PIC_mem, PIC_reg
    mov srcd, srcm
%endif
    movsx gammad, word [abcdq+2*2]
    movsx deltad, word [abcdq+2*3]
    lea tmp1d, [gammaq*3]
    sub deltad, tmp1d ; delta -= gamma*3
    SAVE_GAMMA_DELTA
%if ARCH_X86_32
    mov abcdd, abcdm
%endif
    movsx alphad, word [abcdq+2*0]
    movsx betad, word [abcdq+2*1]
    lea tmp1q, [ssq*3+3]
    add mxd, 512+(64<<10)
    lea tmp2d, [alphaq*3]
    sub srcq, tmp1q ; src -= src_stride*3 + 3
%if ARCH_X86_32
    mov srcm, srcd
    mov PIC_reg, PIC_mem
%endif
    sub betad, tmp2d ; beta -= alpha*3
    lea filterq, [PIC_sym(mc_warp_filter2)]
%if ARCH_X86_64
    mov myd, r6m
    pxor m11, m11
%endif
    ; Each .h call returns one filtered row in m0/m1, with the result in the
    ; high word of every dword. "psrld 16" moves a row to the low words,
    ; and BLENDHWDW then pairs it with a later row in the high words.
    call .h
    psrld m2, m0, 16
    psrld m3, m1, 16
%if ARCH_X86_32
    mova [esp+gprsize+0x10], m3
%endif
    call .h
    psrld m4, m0, 16
    psrld m5, m1, 16
%if ARCH_X86_32
    mova [esp+gprsize+0x20], m4
    mova [esp+gprsize+0x30], m5
%endif
    call .h
%if ARCH_X86_64
    %define blendmask [rsp+gprsize+0x80]
%else
    mova m3, [esp+gprsize+0x10]
    %define blendmask [esp+gprsize+0x120]
    %define m10 m7
%endif
    ; Build the 0xFFFF0000-per-dword mask used by the non-SSE4 BLENDHWDW path.
    pcmpeqd m10, m10
    pslld m10, 16
    mova blendmask, m10
    BLENDHWDW m2, m0 ; 0
    BLENDHWDW m3, m1 ; 2
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    call .h
%if ARCH_X86_32
    mova m4, [esp+gprsize+0x20]
    mova m5, [esp+gprsize+0x30]
%endif
    mova m10, blendmask
    BLENDHWDW m4, m0 ; 1
    BLENDHWDW m5, m1 ; 3
    mova [rsp+gprsize+0x20], m4
    mova [rsp+gprsize+0x30], m5
    call .h
%if ARCH_X86_32
    mova m3, [esp+gprsize+0x10]
    %define m10 m5
%endif
    psrld m6, m2, 16
    psrld m7, m3, 16
    mova m10, blendmask
    BLENDHWDW m6, m0 ; 2
    BLENDHWDW m7, m1 ; 4
    mova [rsp+gprsize+0x40], m6
    mova [rsp+gprsize+0x50], m7
    call .h
%if ARCH_X86_32
    mova m4, [esp+gprsize+0x20]
    mova m5, [esp+gprsize+0x30]
%endif
    psrld m2, m4, 16
    psrld m3, m5, 16
    mova m10, blendmask
    BLENDHWDW m2, m0 ; 3
    BLENDHWDW m3, m1 ; 5
    mova [rsp+gprsize+0x60], m2
    mova [rsp+gprsize+0x70], m3
    call .h
%if ARCH_X86_32
    mova m6, [esp+gprsize+0x40]
    mova m7, [esp+gprsize+0x50]
    %define m10 m7
%endif
    psrld m4, m6, 16
    psrld m5, m7, 16
    mova m10, blendmask
    BLENDHWDW m4, m0 ; 4
    BLENDHWDW m5, m1 ; 6
%if ARCH_X86_64
    add myd, 512+(64<<10)
    mova m6, m2
    mova m7, m3
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    add dword mym, 512+(64<<10)
%endif
    ; Four iterations of .main2 follow, each producing two output rows.
    mov counterd, 4
    SAVE_ALPHA_BETA
; .main2: filters two more source rows with .h, runs WARP_V twice to produce
; two output rows (m12/m13 and m14/m15, or stack slots 0xC0-0xF0 on x86-32),
; and shifts the stored row-pair window down by two rows.
.main2:
    call .h
%if ARCH_X86_32
    mova m6, [esp+gprsize+0x60]
    mova m7, [esp+gprsize+0x70]
    %define m10 m5
%endif
    psrld m6, 16
    psrld m7, 16
    mova m10, blendmask
    BLENDHWDW m6, m0 ; 5
    BLENDHWDW m7, m1 ; 7
%if ARCH_X86_64
    WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
           m4, m5, \
           [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
           m6, m7
%else
    mova [esp+gprsize+0xA0], m6
    mova [esp+gprsize+0xB0], m7
    LOAD_GAMMA_DELTA_MY
    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
    LOAD_ALPHA_BETA_MX
%endif
    call .h
    mova m2, [rsp+gprsize+0x40]
    mova m3, [rsp+gprsize+0x50]
%if ARCH_X86_32
    mova m4, [rsp+gprsize+0x80]
    mova m5, [rsp+gprsize+0x90]
    %define m10 m7
%endif
    mova [rsp+gprsize+0x00], m2
    mova [rsp+gprsize+0x10], m3
    mova [rsp+gprsize+0x40], m4
    mova [rsp+gprsize+0x50], m5
    psrld m4, 16
    psrld m5, 16
    mova m10, blendmask
    BLENDHWDW m4, m0 ; 6
    BLENDHWDW m5, m1 ; 8
%if ARCH_X86_64
    WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
           m6, m7, \
           [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
           m4, m5
%else
    mova [esp+gprsize+0x80], m4
    mova [esp+gprsize+0x90], m5
    LOAD_GAMMA_DELTA_MY
    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
           [esp+gprsize+0x20], [esp+gprsize+0x30], \
           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
           [esp+gprsize+0x00], [esp+gprsize+0x10], \
           [esp+gprsize+0x80], [esp+gprsize+0x90]
    mov mym, myd
    mov dstd, dstm
    mov dsd, dsm
    mov mxd, mxm
%endif
    mova m2, [rsp+gprsize+0x60]
    mova m3, [rsp+gprsize+0x70]
%if ARCH_X86_32
    mova m6, [esp+gprsize+0xA0]
    mova m7, [esp+gprsize+0xB0]
%endif
    mova [rsp+gprsize+0x20], m2
    mova [rsp+gprsize+0x30], m3
    mova [rsp+gprsize+0x60], m6
    mova [rsp+gprsize+0x70], m7
    ret
ALIGN function_align
; .h: horizontal pass for one source row. Loads 16 source bytes, advances src
; by one row, and looks up eight per-pixel filters indexed by
; (mx + k*alpha) >> 10 for k = 0..7. Returns the filtered row in m0/m1.
.h:
%if ARCH_X86_32
    %define m8 m3
    %define m9 m4
    %define m10 m5
    %define m14 m6
    %define m15 m7
%endif
    lea tmp1d, [mxq+alphaq*4]
    lea tmp2d, [mxq+alphaq*1]
%if ARCH_X86_32
    ; One call level deeper than .main, hence the extra 4 bytes and gprsize*2.
    %assign stack_offset stack_offset+4
    %assign stack_size stack_size+4
    %define PIC_mem [esp+gprsize*2+0x114]
    mov PIC_mem, PIC_reg
    mov srcd, srcm
%endif
    movu m10, [srcq]
%if ARCH_X86_32
    ; src lives in memory on x86-32: advance by one row, store it back, and
    ; restore PIC_reg.
    add srcd, ssm
    mov srcm, srcd
    mov PIC_reg, PIC_mem
%else
    add srcq, ssq
%endif
    ; Filter indices are (mx + k*alpha) >> 10 for k = 0..7. The final lea
    ; leaves mx = mx + 3*alpha + beta for the next row.
    shr mxd, 10
    shr tmp1d, 10
    movq m1, [filterq+mxq *8] ; 0 X
    movq m8, [filterq+tmp1q*8] ; 4 X
    lea tmp1d, [tmp2q+alphaq*4]
    lea mxd, [tmp2q+alphaq*1]
    shr tmp2d, 10
    shr tmp1d, 10
    movhps m1, [filterq+tmp2q*8] ; 0 1
    movhps m8, [filterq+tmp1q*8] ; 4 5
    lea tmp1d, [mxq+alphaq*4]
    lea tmp2d, [mxq+alphaq*1]
    shr mxd, 10
    shr tmp1d, 10
    movq m14, [filterq+mxq *8] ; 2 X
    movq m9, [filterq+tmp1q*8] ; 6 X
    lea tmp1d, [tmp2q+alphaq*4]
    lea mxd, [tmp2q+betaq] ; mx += beta
    shr tmp2d, 10
    shr tmp1d, 10
    movhps m14, [filterq+tmp2q*8] ; 2 3
    movhps m9, [filterq+tmp1q*8] ; 6 7
    ; Four shuffles of the source bytes in m10 (shufA-D), each multiplied
    ; against one pair of filters.
    pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
    pmaddubsw m0, m1
    pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
    pmaddubsw m1, m8
    pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
    pmaddubsw m15, m14
    pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
    pmaddubsw m10, m9
    phaddw m0, m15
    phaddw m1, m10
    mova m14, [PIC_sym(pw_8192)]
    mova m9, [PIC_sym(pd_32768)]
    pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
    pmaddwd m1, m14
    paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
    paddd m1, m9
    ret
%endmacro

; Scratch registers t0/t1 for the functions below.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; BIDIR_FN <op>
; Shared store loops for the two-input compound functions (avg, w_avg, mask).
;   "<op> N"         - macro that computes 16 output bytes into m0 from the
;                      inputs at register offset N
;   "<op>_INC_PTR N" - macro that advances the input pointers by N registers
; The caller must have loaded wq with the address of the .wN label for the
; block width. One unit of 16 pixels is computed before the jump, and each
; loop computes the next unit before falling into its store code.
%macro BIDIR_FN 1 ; op
    %1 0
    lea stride3q, [strideq*3]
    jmp wq
.w4_loop:
    %1_INC_PTR 2
    %1 0
    lea dstq, [dstq+strideq*4]
.w4: ; tile 4x
    ; The 16 bytes in m0 are four rows of 4 pixels.
    movd [dstq ], m0 ; copy dw[0]
    pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
    movd [dstq+strideq*1], m1 ; copy dw[1]
    punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0]
    movd [dstq+strideq*2], m0 ; dw[2]
    psrlq m0, 32 ; shift right in dw[3]
    movd [dstq+stride3q ], m0 ; copy
    sub hd, 4
    jg .w4_loop
    RET
.w8_loop:
    %1_INC_PTR 2
    %1 0
    lea dstq, [dstq+strideq*2]
.w8:
    ; Two rows of 8 pixels per unit.
    movq [dstq ], m0
    movhps [dstq+strideq*1], m0
    sub hd, 2
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR 2
    %1 0
    lea dstq, [dstq+strideq]
.w16:
    ; One row of 16 pixels per unit.
    mova [dstq ], m0
    dec hd
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR 4
    %1 0
    lea dstq, [dstq+strideq]
.w32:
    ; Two units per row.
    mova [dstq ], m0
    %1 2
    mova [dstq + 16 ], m0
    dec hd
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR 8
    %1 0
    add dstq, strideq
.w64:
    ; Four units per row, unrolled at assembly time. The next unit is
    ; computed after every store except the last.
    %assign i 0
    %rep 4
    mova [dstq + i*16 ], m0
    %assign i i+1
    %if i < 4
    %1 2*i
    %endif
    %endrep
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    %1_INC_PTR 16
    %1 0
    add dstq, strideq
.w128:
    ; Eight units per row, same scheme as .w64.
    %assign i 0
    %rep 8
    mova [dstq + i*16 ], m0
    %assign i i+1
    %if i < 8
    %1 2*i
    %endif
    %endrep
    dec hd
    jg .w128_loop
    RET
%endmacro

; AVG: m0 = 16 output bytes from the sum of the two intermediates.
; Expects m2 to hold the pmulhrsw rounding multiplier (pw_1024 in avg_8bpc).
%macro AVG 1 ; src_offset
    ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel
    mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1
    paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2
    mova m1, [tmp1q+(%1+1)*mmsize]
    paddw m1, [tmp2q+(%1+1)*mmsize]
    pmulhrsw m0, m2
    pmulhrsw m1, m2
    packuswb m0, m1 ; pack 2x8 words to 16 bytes with unsigned saturation
%endmacro

; Advance both input pointers by %1 registers.
%macro AVG_INC_PTR 1
    add tmp1q, %1*mmsize
    add tmp2q, %1*mmsize
%endmacro

; avg_8bpc(dst, stride, tmp1, tmp2, w, h)
cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
    LEA r6, avg_ssse3_table
    tzcnt wd, wm ; trailing-zero count of w, used as the jump table index
    movifnidn hd, hm ; move h(stack) to h(register) if not already that register
    movsxd wq, dword [r6+wq*4] ; load the sign-extended table entry for this width
    mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align
    add wq, r6 ; table entries are offsets relative to the table address
    BIDIR_FN AVG

; W_AVG: m0 = 16 output bytes of the weighted average.
; Expects m4 = multiplier derived from the weight (set up in w_avg_8bpc) and
; m5 = rounding multiplier (pw_2048).
%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
    mova m2, [tmp1q+(%1+0)*mmsize]
    mova m0, m2
    psubw m2, [tmp2q+(%1+0)*mmsize]
    mova m3, [tmp1q+(%1+1)*mmsize]
    mova m1, m3
    psubw m3, [tmp2q+(%1+1)*mmsize]
    pmulhw m2, m4
    pmulhw m3, m4
    paddw m0, m2
    paddw m1, m3
    pmulhrsw m0, m5
    pmulhrsw m1, m5
    packuswb m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

; w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
    LEA r6, w_avg_ssse3_table
    tzcnt wd, wm
    movd m4, r6m
    movifnidn hd, hm
    pxor m0, m0
    movsxd wq, dword [r6+wq*4]
    mova m5, [pw_2048+r6-w_avg_ssse3_table]
    pshufb m4, m0 ; broadcast the low byte of the weight to every byte
    psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
    add wq, r6
    cmp dword r6m, 7
    jg .weight_gt7
    ; weight <= 7: swap the two inputs and negate the multiplier, which
    ; selects the last form of the identity in W_AVG.
    ; NOTE(review): presumably this keeps the pmulhw multiplier inside the
    ; signed 16-bit range - confirm.
    mov r6, tmp1q
    psubw m0, m4
    mov tmp1q, tmp2q
    mova m4, m0 ; -weight
    mov tmp2q, r6
.weight_gt7:
    BIDIR_FN W_AVG

; MASK: m0 = 16 output bytes blended with a per-pixel mask read from maskq.
; Expects m4 = 0 and m5 = rounding multiplier (pw_2048). Clobbers m1-m3, m6.
%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
    mova m3, [maskq+(%1+0)*(mmsize/2)]
    mova m0, [tmp2q+(%1+0)*mmsize] ; b
    psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
    mova m6, m3 ; m
    psubb m3, m4, m6 ; -m
    paddw m1, m1 ; (b - a) << 1
    paddb m3, m3 ; -m << 1
    punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
    pmulhw m1, m2 ; (-m * (b - a)) << 10
    paddw m0, m1 ; + b
    mova m1, [tmp2q+(%1+1)*mmsize] ; b
    psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
    paddw m2, m2 ; (b - a) << 1
    mova m6, m3 ; (-m << 1)
    punpckhbw m3, m4, m6 ; (-m << 9)
    pmulhw m2, m3 ; (-m * (b - a)) << 10
    paddw m1, m2 ; + b
    pmulhrsw m0, m5 ; round
    pmulhrsw m1, m5 ; round
    packuswb m0, m1 ; pack 2x8 words to 16 bytes with unsigned saturation
%endmacro

; The mask holds one byte per pixel, so it advances half as far as tmp1/tmp2.
%macro MASK_INC_PTR 1
    add maskq, %1*mmsize/2
    add tmp1q, %1*mmsize
    add tmp2q, %1*mmsize
%endmacro

; mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
%if ARCH_X86_64
cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
    movifnidn hd, hm
%else
cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
; Only 7 GPRs on x86-32: h stays in its argument slot and is counted down there.
%define hd dword r5m
%endif
%define base r6-mask_ssse3_table
    LEA r6, mask_ssse3_table
    tzcnt wd, wm
    movsxd wq, dword [r6+wq*4]
    pxor m4, m4
    mova m5, [base+pw_2048]
    add wq, r6
    mov maskq, r6m
    BIDIR_FN MASK
%undef hd

; W_MASK_420_END idx...
; Variadic: the body is repeated once per argument (%rep %0 / %rotate 1), with
; %1 naming the 16-byte column being finished. Each repetition calls .main
; twice for the second row of the pair. It adds that row's m2 values to the
; sums kept at [maskq+16*%1] and in the dst row-1 scratch, subtracts both
; from m7, shifts right by 2, and packs the result into the final mask bytes.
; Row 1 of dst is then overwritten with the real pixels.
%macro W_MASK_420_END 1-*
%rep %0
    call .main
    paddw m2, [maskq+16*%1]
    mova [maskq+16*%1], m2
    mova [dstq+strideq*1+16*(2*%1+0)], m0
    call .main
    psubw m3, m7, m2
    psubw m1, m7, [maskq+16*%1]
    psubw m3, [dstq+strideq*1+16*(2*%1+1)]
    psrlw m1, 2
    psrlw m3, 2
    packuswb m1, m3
    mova [maskq+16*%1], m1
    mova [dstq+strideq*1+16*(2*%1+1)], m0
    %rotate 1
%endrep
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA t0, w_mask_420_ssse3_table
    tzcnt wd, wm
    mov r6d, r7m ; sign
    sub
tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 movifnidn hd, hm %else %define m8 [base+pw_6903] %define hd dword hm %endif mov maskq, maskmp call .main jmp wq .w4_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w4: pshufd m3, m2, q2020 pshufd m2, m2, q3131 psubw m1, m7, m3 psubw m1, m2 psrlw m1, 2 packuswb m1, m1 movd [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w8: movhlps m3, m2 psubw m1, m7, m2 psubw m1, m3 psrlw m1, 2 packuswb m1, m1 movd [maskq], m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main add maskq, 8 lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*1], m2 mova [dstq+strideq*0], m0 call .main psubw m1, m7, [dstq+strideq*1] psubw m1, m2 psrlw m1, 2 packuswb m1, m1 movq [maskq], m1 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add maskq, 16 lea dstq, [dstq+strideq*2] .w32: mova [maskq], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 W_MASK_420_END 0 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add maskq, 16*2 lea dstq, [dstq+strideq*2] .w64: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 W_MASK_420_END 0, 1 sub hd, 2 jg .w64_loop RET .w128_loop: call .main add maskq, 16*4 lea dstq, [dstq+strideq*2] .w128: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova 
[dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 call .main mova [maskq+16*2], m2 mova [dstq+strideq*0+16*4], m0 call .main mova [dstq+strideq*1+16*5], m2 mova [dstq+strideq*0+16*5], m0 call .main mova [maskq+16*3], m2 mova [dstq+strideq*0+16*6], m0 call .main mova [dstq+strideq*1+16*7], m2 mova [dstq+strideq*0+16*7], m0 W_MASK_420_END 0, 1, 2, 3 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m8, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m8, m5 psrlw m3, 8 phaddw m2, m3 psllw m3, 10 pmulhw m4, m3 paddw m1, m4 pmulhrsw m0, m6 pmulhrsw m1, m6 packuswb m0, m1 ret %macro W_MASK_422_BACKUP 1 ; mask_offset %if ARCH_X86_64 mova m10, m2 %else mova [maskq+16*%1], m2 %endif %endmacro %macro W_MASK_422_END 1 ; mask_offset %if ARCH_X86_64 packuswb m10, m2 psubb m1, m7, m10 pavgb m1, m9 %else mova m3, [maskq+16*%1] packuswb m3, m2 pxor m2, m2 psubb m1, m7, m3 pavgb m1, m2 %endif mova [maskq+16*%1], m1 %endmacro cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_422_ssse3_table LEA t0, w_mask_422_ssse3_table tzcnt wd, wm mov r6d, r7m ; sign sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] pxor m9, m9 movifnidn hd, hm %else add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table %define hd dword hm %endif mov maskq, maskmp call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main jmp wq .w4_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 8 lea dstq, [dstq+strideq*2] .w4: packuswb m2, m2 psubb m1, m7, m2 %if ARCH_X86_64 pavgb m1, m9 %else pxor m2, m2 pavgb m1, m2 
%endif movq [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w8: W_MASK_422_BACKUP 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main lea dstq, [dstq+strideq*2] W_MASK_422_END 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 4 jg .w8_loop RET .w16_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w16: W_MASK_422_BACKUP 0 mova [dstq+strideq*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 add dstq, strideq .w32: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 dec hd jg .w32_loop RET .w64_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*2 add dstq, strideq .w64: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*4 add dstq, strideq .w128: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 
call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 2 mova [dstq+16*4], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 2 mova [dstq+16*5], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 3 mova [dstq+16*6], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 3 mova [dstq+16*7], m0 dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_444_ssse3_table LEA t0, w_mask_444_ssse3_table tzcnt wd, wm mov maskq, maskmp sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_6903] mova m7, [base+pw_2048] add wq, t0 %if ARCH_X86_64 mova m8, [base+pb_64] movifnidn hd, hm %else %define m8 [base+pb_64] %define hd dword hm %endif call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 call .main mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 call .main mova [dstq+16*4], m0 call .main mova [dstq+16*5], 
m0 call .main mova [dstq+16*6], m0 call .main mova [dstq+16*7], m0 dec hd jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m6, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m6, m5 psrlw m3, 8 packuswb m2, m3 psllw m3, 10 pmulhw m4, m3 psubb m3, m8, m2 paddw m1, m4 pmulhrsw m0, m7 pmulhrsw m1, m7 mova [maskq], m3 add maskq, 16 packuswb m0, m1 ret %macro BLEND_64M 4; a, b, mask1, mask2 punpcklbw m0, %1, %2; {b;a}[7..0] punpckhbw %1, %2 ; {b;a}[15..8] pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16 packuswb m0, %1 ; {blendpx}[15..0] u8 %endmacro %macro BLEND 2; a, b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpckhbw m3, m0 ; {m;(64-m)}[15..8] BLEND_64M %1, %2, m2, m3 %endmacro cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_ssse3_table LEA r6, blend_ssse3_table tzcnt wd, wm movifnidn hd, hm movifnidn maskq, maskmp movsxd wq, dword [r6+wq*4] mova m4, [base+pb_64] mova m5, [base+pw_512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: movq m0, [maskq]; m movd m1, [dstq+dsq*0] ; a movd m6, [dstq+dsq*1] punpckldq m1, m6 movq m6, [tmpq] ; b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpcklbw m1, m6 ; {b;a}[7..0] pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 packuswb m1, m0 ; {blendpx}[15..0] u8 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 add maskq, 8 add tmpq, 8 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w4 RET .w8: mova m0, [maskq]; m movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m6, [tmpq] ; b BLEND 
m1, m6 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 add maskq, 16 add tmpq, 16 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w8 RET .w16: mova m0, [maskq]; m mova m1, [dstq] ; a mova m6, [tmpq] ; b BLEND m1, m6 mova [dstq], m0 add maskq, 16 add tmpq, 16 add dstq, dsq ; dst_stride dec hd jg .w16 RET .w32: %assign i 0 %rep 2 mova m0, [maskq+16*i]; m mova m1, [dstq+16*i] ; a mova m6, [tmpq+16*i] ; b BLEND m1, m6 mova [dstq+i*16], m0 %assign i i+1 %endrep add maskq, 32 add tmpq, 32 add dstq, dsq ; dst_stride dec hd jg .w32 RET ; emu_edge args: ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, ; const pixel *ref, const ptrdiff_t ref_stride ; ; bw, bh total filled size ; iw, ih, copied block -> fill bottom, right ; x, y, offset in bw/bh -> fill top, left cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ y, dst, dstride, src, sstride, \ bottomext, rightext, blk ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes pxor m1, m1 %if ARCH_X86_64 %define reg_zero r12q %define reg_tmp r10 %define reg_src srcq %define reg_bottomext bottomextq %define reg_rightext rightextq %define reg_blkm r9m %else %define reg_zero r6 %define reg_tmp r0 %define reg_src r1 %define reg_bottomext r0 %define reg_rightext r1 %define reg_blkm r2m %endif ; ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor reg_zero, reg_zero lea reg_tmp, [ihq-1] cmp yq, ihq cmovs reg_tmp, yq test yq, yq cmovs reg_tmp, reg_zero %if ARCH_X86_64 imul reg_tmp, sstrideq add srcq, reg_tmp %else imul reg_tmp, sstridem mov reg_src, srcm add reg_src, reg_tmp %endif ; ; ref += iclip(x, 0, iw - 1) lea reg_tmp, [iwq-1] cmp xq, iwq cmovs reg_tmp, xq test xq, xq cmovs reg_tmp, reg_zero add reg_src, reg_tmp %if ARCH_X86_32 mov srcm, reg_src %endif ; ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) %if ARCH_X86_32 mov r1, r1m ; restore bh %endif lea reg_bottomext, 
[yq+bhq] sub reg_bottomext, ihq lea r3, [bhq-1] cmovs reg_bottomext, reg_zero ; DEFINE_ARGS bw, bh, iw, ih, x, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, reg_zero cmp reg_bottomext, bhq cmovns reg_bottomext, r3 cmp topextq, bhq cmovg topextq, r3 %if ARCH_X86_32 mov r4m, reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshufb m0, m1 xor r3, r3 .left_loop_%3: mova [dstq+r3], m0 add r3, mmsize cmp r3, leftextq 
jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu m0, [srcq+r3] %else mov r1, srcm movu m0, [r1+r3] %endif %if %1 movu [reg_tmp+r3], m0 %else movu [dstq+r3], m0 %endif add r3, mmsize cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add reg_tmp, centerwq %else lea reg_tmp, [dstq+centerwq] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq-1] %else mov r3, srcm movd m0, [r3+centerwq-1] %endif pshufb m0, m1 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3], m0 add r3, mmsize %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1] lea r3, [dstq+r1] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg 
.bottom_y_loop add r1, mmsize cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] %else mov r3, reg_blkm mova m0, [r3+r1] %endif lea r3, [dstq+r1] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro INIT_XMM ssse3 WARP_AFFINE_8X8 INIT_XMM sse4 WARP_AFFINE_8X8 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/msac.asm000066400000000000000000000301471517466257200227120ustar00rootroot00000000000000; Copyright © 2019-2026, VideoLAN and dav2d authors ; Copyright © 2019-2026, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Per-lane multipliers for the unary bypass decoders. Entry k of unary_mul32
; is 0x10000 - (0x8000 >> k); unary_mul64 holds five wider multipliers
; followed by a zero entry.
unary_mul32: dd 0x8000, 0xc000, 0xe000, 0xf000, 0xf800, 0xfc00, 0xfe00, 0xff00
             dd 0xff80, 0xffc0, 0xffe0, 0xfff0, 0xfff8, 0xfffc, 0xfffe, 0xffff
unary_mul64: dq 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, 0xfffff800, 0
pw_127: times 8 dw 127

; Field offsets of the decoder context as accessed from assembly
; (presumably mirroring the C MsacContext layout - verify against the header).
struc msac
    .buf: resq 1
    .end: resq 1
    .dif: resq 1
    .rng: resd 1
    .cnt: resd 1
    .update_cdf: resd 1
endstruc

cextern msac_rate
cextern msac_min_prob

; m(x): mangled name of function x for the current instruction-set suffix
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

SECTION .text

; t0 holds the context pointer wherever the code below addresses [t0+msac.*];
; buf is a small scratch area on the stack.
%if WIN64
DECLARE_REG_TMP 3
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 0
%define buf rsp-40 ; red zone
%endif

%define base rax-$$

; REFILL cnt, tmp, is_early_refill
; Pulls more input bytes into the dif window held in r4.
;   %1  register holding cnt (negative on entry in the visible callers)
;   %2  scratch register
;   %3  nonzero: plain `ret` to the caller, which stores cnt/dif itself;
;       zero: store cnt and dif, then RET from the enclosing function (this
;       variant also defines the .end label used by the renorm code).
; Fast path: one 8-byte load when buf + 8 <= end. Otherwise a byte-wise loop
; that never reads at or beyond msac.end.
; Clobbers rcx and r5.
%macro REFILL 3 ; cnt, tmp, is_early_refill
    mov %2, [t0+msac.buf]
    mov rcx, [t0+msac.end]
    lea r5, [%2+8]
    cmp r5, rcx
    ja %%refill_eob
    mov %2, [%2]
    lea ecx, [%1+23]
    add %1d, 16
    shr ecx, 3 ; shift_bytes
    bswap %2
    sub r5, rcx
    shl ecx, 3 ; shift_bits
    shr %2, cl
    sub ecx, %1d ; shift_bits - 16 - cnt
    mov %1d, 48
    shl %2, cl
    mov [t0+msac.buf], r5
    sub %1d, ecx ; cnt + 64 - shift_bits
    xor r4, %2
%if %3
    ret
%else
.end:
    mov [t0+msac.cnt], %1d
    mov [t0+msac.dif], r4
    RET
%endif
%%refill_eob: ; avoid overreading the input buffer
    mov r5, rcx
    mov ecx, 40
    sub ecx, %1d ; c
%%refill_eob_loop:
    cmp %2, r5
    jae %%refill_eob_end ; eob reached
    movzx %1d, byte [%2]
    inc %2
    shl %1, cl
    xor r4, %1
    sub ecx, 8
    jge %%refill_eob_loop
%%refill_eob_end:
    mov %1d, 40
    sub %1d, ecx
    mov [t0+msac.buf], %2
%if %3
    ret
%else
    mov [t0+msac.dif], r4
    mov [t0+msac.cnt], %1d
    RET
%endif
%endmacro

; DECODE_SYMBOL_ADAPT n, sz
; Emits msac_decode_symbol_adapt<n>(s, cdf, n_symbols).
;   %1  number of 16-bit CDF lanes handled (4 or 8)
;   %2  mov suffix used to load/store the CDF (q = 8 bytes, a = 16 bytes)
; The candidate boundaries are written to the stack scratch area (buf) so the
; scalar renormalisation code can pick u and v by index. The shared
; .renorm/.renorm2/.renorm3/.refill tail is emitted only by the n == 4
; instantiation; every other entry point jumps into it.
%macro DECODE_SYMBOL_ADAPT 2 ; n, sz
cglobal msac_decode_symbol_adapt%1, 3, 7, 4, s, cdf, ns
    movd m2, [sq+msac.rng]
    lea rax, [$$]
    mov%2 m0, [cdfq]
    add nsd, nsd ; n_symbols * 2 = byte offset of the counter word
    movq m3, [sq+msac.dif]
    mov r3d, [sq+msac.update_cdf]
    pshuflw m2, m2, q0000
    movd [buf+12], m2
    por m1, m0, [pw_127]
    psrlw m2, 8
    psubusw m1, [base+msac_min_prob-16+r2*8]
    psllw m2, 6
%if %1 == 8
    punpcklqdq m2, m2
%endif
    pmulhuw m1, m2
    pshuflw m3, m3, q3333 ; broadcast the top 16 bits of dif
    pxor m2, m2
    psllw m1, 3
%if %1 == 8
    punpcklqdq m3, m3
%endif
    mova [buf+16], m1
    psubusw m1, m3
    pcmpeqw m1, m2 ; c >= v
    test r3d, r3d
    jz m(msac_decode_symbol_adapt4).renorm ; !allow_update_cdf

; update_cdf:
    movzx r3d, word [cdfq+nsq]
    pcmpeqw m2, m2
    lea r4d, [r3*3]
    movzx r5d, r3b
    shr r4d, 8 ; para * sizeof(*msac_rate)
    shr r5d, 4 ; count >> 4
    add r4d, r5d
    movzx eax, byte [base+msac_rate+r4]
    cmp nsd, 3*2
    sbb eax, -1 ; rate + (n_symbols > 2)
    cmp r3b, 32
    adc r3d, 0 ; count + (count < 32)
    movd m3, eax
    pavgw m2, m1 ; i >= val ? 65535 : 32768
    psrlw m4, m1, m3
    psubw m2, m0 ; for (i = 0; i < val; i++)
    psubw m0, m4 ; cdf[i] += (32768 - cdf[i]) >> rate;
    psrlw m2, m3 ; for (; i < n_symbols; i++)
    paddw m0, m2 ; cdf[i] += ((65535 - cdf[i]) >> rate) - (65535 >> rate);
    mov%2 [cdfq], m0
    mov [cdfq+nsq], r3w
%if %1 == 8
    jmp m(msac_decode_symbol_adapt4).renorm
%else
.renorm:
    ; in: m1 = per-lane compare mask; out: eax = decoded symbol index
    pmovmskb eax, m1
    mov r4, [sq+msac.dif]
    tzcnt eax, eax
    movzx r1d, word [buf+rax+16] ; v
    movzx r2d, word [buf+rax+14] ; u
    shr eax, 1
    not r4
    sub r2d, r1d ; rng
    shl r1, gprsize*8-16
    add r4, r1 ; ~dif
.renorm2:
    ; in: r2d = new rng, r4 = ~dif, eax = return value
    mov r1d, [sq+msac.cnt]
    movifnidn t0, sq
.renorm3:
    ; in: additionally r1d = cnt and t0 = context pointer
    bsr ecx, r2d
    xor ecx, 15 ; d
    shl r2d, cl
    shl r4, cl
    mov [t0+msac.rng], r2d
    not r4
    sub r1d, ecx
    jae .end ; no refill required
.refill:
    REFILL r1, r2, 0
%endif
%endmacro

INIT_XMM sse2
DECODE_SYMBOL_ADAPT 4, q
DECODE_SYMBOL_ADAPT 8, a

; msac_decode_bool_adapt(s, cdf)
; Decodes one bit against cdf[0] and, when s->update_cdf is nonzero, adapts
; cdf[0] and the counter in cdf[1]. Finishes in the shared renorm tail.
cglobal msac_decode_bool_adapt, 2, 7, 0, s, cdf
    movzx eax, word [cdfq]
    movzx r3d, byte [sq+msac.rng+1]
    mov r4, [sq+msac.dif]
    mov r2d, [sq+msac.rng]
    shr eax, 7
    imul eax, r3d
    shr r3d, 1
    mov r5, r4
    add eax, r3d
    and eax, ~7 ; v
    mov r3d, eax
    shl rax, 48 ; vw
    sub r2d, r3d ; r - v
    sub r4, rax ; dif - vw
    setb al ; bit = (dif < vw)
    cmovb r2d, r3d
    mov r3d, [sq+msac.update_cdf]
    cmovb r4, r5
    not r4
    test r3d, r3d
    jz m(msac_decode_symbol_adapt4).renorm2
    movzx r5d, word [cdfq+2]
%if WIN64
    push r7
    mov t0, sq
%endif
    lea ecx, [r5*3]
    movzx r7d, r5b
    shr ecx, 8 ; para * sizeof(*msac_rate)
    shr r7d, 4 ; count >> 4
    cmp r5b, 32
    adc r5d, 0
    mov [cdfq+2], r5w
    lea r5, [msac_rate]
    add ecx, r7d
    movzx r7d, word [r1]
    movzx ecx, byte [r5+rcx]
    imul r5d, eax, -32769
    add r5d, r7d ; if (bit)
    sub r7d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
    sar r5d, cl ; else
    sub r7d, r5d ; cdf[0] -= cdf[0] >> rate;
    mov [cdfq], r7w
%if WIN64
    ; reload cnt through t0 and enter at .renorm3, past the sq-relative load
    mov r1d, [t0+msac.cnt]
    pop r7
    jmp m(msac_decode_symbol_adapt4).renorm3
%else
    jmp m(msac_decode_symbol_adapt4).renorm2
%endif

; msac_decode_bool_bypass(s)
; Decodes a single bypass bit; rng is neither read-modified nor renormalised.
cglobal msac_decode_bool_bypass, 1, 7, 0, s
    mov eax, [sq+msac.rng]
    mov r4, [sq+msac.dif]
    mov r1d, [sq+msac.cnt]
    shl rax, 47
    mov r2, r4
    sub r4, rax ; dif - vw
    cmovb r4, r2
    setb al
    movifnidn t0, sq
    lea r4, [r4*2+1] ; dif
    sub r1d, 1 ; cnt
    jb m(msac_decode_symbol_adapt4).refill
    mov [sq+msac.cnt], r1d
    mov [sq+msac.dif], r4
    RET

; msac_decode_bools_bypass(s, n)
; Decodes n bypass bits and returns them with the first decoded bit in the
; most significant position. Refills first when cnt < n.
cglobal msac_decode_bools_bypass, 2, 7, 0, s, n
    mov r2d, [sq+msac.cnt]
    mov r4, [sq+msac.dif]
    movifnidn t0, sq
    cmp r2d, nd
    jae .main
    call .refill
.main:
    mov r5d, [t0+msac.rng]
    not r4
    sub r2d, nd
    shl r5, 47
    mov [t0+msac.cnt], r2d
    xor eax, eax
    mov ecx, nd
.loop:
    mov r2, r4
    add r4, r5 ; dif - vw
    cmovb r4, r2
    adc eax, eax ; ret = (ret << 1) + (dif < vw)
    shr r5, 1
    dec nd
    jg .loop
    shl r4, cl
    not r4
    mov [t0+msac.dif], r4
    RET
.refill:
    ; call-style refill (returns with `ret`); also called by the AVX2 and
    ; AVX-512 unary decoders below
    REFILL r2, r6, 1

INIT_YMM avx2
; msac_decode_unary_bypass6(s, n)
; Evaluates the candidate bypass bits in parallel (one dword lane each) and
; returns the index of the first lane selected by the sign mask, limited by
; the max_bits bit that is OR-ed into the mask.
; .end and .end2 form a shared tail that the unary_bypass21 variants jump to.
cglobal msac_decode_unary_bypass6, 2, 7, 5, s, n
    vpbroadcastd m0, [sq+msac.rng]
    psrld m0, 1
    pmulld m0, [unary_mul32]
    mov r2d, [sq+msac.cnt]
    mov r4, [sq+msac.dif]
    movifnidn t0, sq
    cmp r2d, nd
    jb .refill
    vpbroadcastd m1, [t0+msac.dif+4]
    psrld m1, 1
.main:
    mov r5d, [t0+msac.rng]
    psubd m1, m0
    movmskps eax, m1
    rorx ecx, nd, 32-5
    or eax, ecx ; clip to max_bits (5 or 6)
.end:
    vzeroupper
.end2:
    ; in: eax = mask whose lowest set bit marks the result, r5d = rng,
    ;     r4 = dif, r2d = cnt, nd = max_bits
    tzcnt eax, eax
    mov ecx, -1
    shl r5d, 16
    shrx ecx, ecx, eax
    not r4
    not ecx
    imul r5, rcx ; vw_sum
    xor ecx, ecx
    add r4, r5 ; dif - vw_sum
    cmp eax, nd
    adc ecx, eax ; bit = ret + (ret < max_bits)
    shlx r4, r4, rcx
    sub r2d, ecx
    not r4
    mov [t0+msac.cnt], r2d
    mov [t0+msac.dif], r4
    ret ; no epilogue (vzeroupper already performed)
.refill:
    call mangle(private_prefix %+ _msac_decode_bools_bypass_sse2).refill
    movq xm1, r4
    psrlq xm1, 33
    vpbroadcastd m1, xm1
    jmp .main

; msac_decode_unary_bypass21(s), AVX2
; Same scheme with max_bits fixed at 21: candidates 0-15 use dword lanes,
; 16-19 qword lanes, and the last one is compared in scalar code (vw[20]).
; Finishes in unary_bypass6's .end tail.
cglobal msac_decode_unary_bypass21, 1, 7, 5, s, n
    vpbroadcastd m2, [sq+msac.rng]
    psrld m2, 1
    pmuludq m0, m2, [unary_mul64]
    pmulld m1, m2, [unary_mul32+32*1]
    pmulld m2, [unary_mul32+32*0]
    mov r2d, [sq+msac.cnt]
    mov nd, 21
    mov r4, [sq+msac.dif]
    movifnidn t0, sq
    cmp r2d, nd
    jb .refill
    vpbroadcastq m3, [t0+msac.dif]
    vpbroadcastd m4, [t0+msac.dif+4]
    psrlq m3, 17
    psrld m4, 1
.main:
    mov r5d, [t0+msac.rng]
    mov rcx, 0xfffff8000000
    psubq m0, m3, m0
    imul rcx, r5 ; vw[20]
    psubd m1, m4, m1
    movmskpd eax, m0
    psubd m2, m4, m2
    cmp r4, rcx
    lea ecx, [rax+16]
    cmovb eax, ecx
    movmskps ecx, m1
    shl eax, 16
    mov ah, cl
    movmskps ecx, m2
    lea eax, [rax+rcx+(1<<21)] ; bit 21 set as the upper bound on the result
    jmp mangle(private_prefix %+ _msac_decode_unary_bypass6_avx2).end
.refill:
    call mangle(private_prefix %+ _msac_decode_bools_bypass_sse2).refill
    movq xm4, r4
    psrlq xm3, xm4, 17
    psrlq xm4, 33
    vpbroadcastq m3, xm3
    vpbroadcastd m4, xm4
    jmp .main

INIT_ZMM avx512icl
; msac_decode_unary_bypass21(s), AVX-512
; Builds the candidate mask with unsigned compares into mask registers, then
; enters unary_bypass6's tail at .end2, i.e. after its vzeroupper.
cglobal msac_decode_unary_bypass21, 1, 7, 5, s, n
    vpbroadcastd m1, [sq+msac.rng]
    pmulld m0, m1, [unary_mul32]
    pmuludq m1, [unary_mul64]
    mov r2d, [sq+msac.cnt]
    mov nd, 21
    mov r4, [sq+msac.dif]
    movifnidn t0, sq
    cmp r2d, nd
    jb .refill
    vpbroadcastq m3, [sq+msac.dif]
    vpbroadcastd m2, [sq+msac.dif+4]
.main:
    mov r5d, [t0+msac.rng]
    psrlq m3, 16
    vpcmpud k1, m0, m2, 6 ; vw > dif
    vpcmpuq k2, m1, m3, 6
    kunpckwd k1, k2, k1
    kmovd eax, k1
    or eax, 1<<21
    jmp mangle(private_prefix %+ _msac_decode_unary_bypass6_avx2).end2
.refill:
    call mangle(private_prefix %+ _msac_decode_bools_bypass_sse2).refill
    vpbroadcastq m3, r4
    pshufd m2, m3, q3311
    jmp .main
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/msac.h000066400000000000000000000062161517466257200223610ustar00rootroot00000000000000/*
 * Copyright © 2019-2026, VideoLAN and dav2d authors
 * Copyright © 2019-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_SRC_X86_MSAC_H
#define DAV2D_SRC_X86_MSAC_H

#include "src/cpu.h"

/*
 * Entry points implemented in src/x86/msac.asm. The name suffix is the
 * instruction set the routine was assembled for.
 */

/* Decode one symbol against an adaptive CDF (4- and 8-lane variants). */
unsigned dav2d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
                                              size_t n_symbols);
unsigned dav2d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
                                              size_t n_symbols);
/* Decode one bit against an adaptive CDF. */
unsigned dav2d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
/* Decode one bypass bit. */
unsigned dav2d_msac_decode_bool_bypass_sse2(MsacContext *s);
/* Decode n_bits bypass bits in a single call. */
unsigned dav2d_msac_decode_bools_bypass_sse2(MsacContext *s, unsigned n_bits);
/* Decode a unary bypass value of at most max_bits bits. */
unsigned dav2d_msac_decode_unary_bypass6_avx2(MsacContext *s, unsigned max_bits);
/* Decode a unary bypass value; the limit (21) is built into the routine. */
unsigned dav2d_msac_decode_unary_bypass21_avx2(MsacContext *s);
unsigned dav2d_msac_decode_unary_bypass21_avx512icl(MsacContext *s);

/* The SSE2 routines are bound at compile time, with no runtime dispatch. */
#define dav2d_msac_decode_symbol_adapt4 dav2d_msac_decode_symbol_adapt4_sse2
#define dav2d_msac_decode_symbol_adapt8 dav2d_msac_decode_symbol_adapt8_sse2
#define dav2d_msac_decode_bool_adapt dav2d_msac_decode_bool_adapt_sse2
#define dav2d_msac_decode_bool_bypass dav2d_msac_decode_bool_bypass_sse2
#define dav2d_msac_decode_bools_bypass dav2d_msac_decode_bools_bypass_sse2
/*
 * The unary bypass decoders are called through function pointers stored in
 * the context. Note that both macros evaluate `s` twice.
 */
#define dav2d_msac_decode_unary_bypass6(s, n) ((s)->unary_bypass6(s, n))
#define dav2d_msac_decode_unary_bypass21(s) ((s)->unary_bypass21(s))
static ALWAYS_INLINE void msac_dsp_init_x86(MsacContext *const s) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; s->unary_bypass6 = dav2d_msac_decode_unary_bypass6_avx2; s->unary_bypass21 = dav2d_msac_decode_unary_bypass21_avx2; if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; s->unary_bypass21 = dav2d_msac_decode_unary_bypass21_avx512icl; } #endif /* DAV2D_SRC_X86_MSAC_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/pal.asm000066400000000000000000000426371517466257200225520ustar00rootroot00000000000000; Copyright © 2023, VideoLAN and dav2d authors ; Copyright © 2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 const pb_0to63, db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 %if ARCH_X86_64 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 %endif pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11 pb_1_16: times 4 db 1, 16 %if ARCH_X86_64 pb_32: times 4 db 32 %endif %macro JMP_TABLE 2-* %xdefine %1_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1) %%table: %rep %0 - 1 dd %%base %+ .w%2 - (%%table - 2*4) %rotate 1 %endrep %endmacro JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64 %if ARCH_X86_64 JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64 JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64 %endif SECTION .text INIT_XMM ssse3 cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h %define base r6-pal_idx_finish_ssse3_table LEA r6, pal_idx_finish_ssse3_table tzcnt bwd, bwm movifnidn bhd, bhm movifnidn wd, wm movifnidn hd, hm movsxd bwq, [r6+bwq*4] movddup m3, [base+pb_1_16] add bwq, r6 sub bhd, hd jmp bwq .w4: mova m0, [srcq] add srcq, 16 pmaddubsw m0, m3 packuswb m0, m0 movq [dstq], m0 add dstq, 8 sub hd, 4 jg .w4 test bhd, bhd jz .w4_end pshuflw m0, m0, q3333 .w4_padv: movq [dstq], m0 add dstq, 8 sub bhd, 4 jg .w4_padv .w4_end: RET .w8_padh: pshufb m0, m2 pshufb m1, m2 jmp .w8_main .w8: mova m2, [base+pal_idx_w8_padh] .w8_loop: mova m0, [srcq+16*0] mova m1, [srcq+16*1] cmp wd, 8 jl .w8_padh .w8_main: pmaddubsw m0, m3 pmaddubsw m1, m3 add srcq, 16*2 packuswb m0, m1 movu [dstq], m0 add dstq, 16 sub hd, 4 jg .w8_loop test bhd, bhd jz .w8_end pshufd m0, m0, q3333 .w8_padv: movu [dstq], m0 add dstq, 16 sub bhd, 4 jg .w8_padv .w8_end: RET .w16_padh: pshufb m0, m4 pshufb m1, m4 jmp .w16_main .w16: cmp wd, 16 je .w16_loop call .setup_padh .w16_loop: mova m0, [srcq+16*0] mova m1, [srcq+16*1] cmp wd, 16 jl .w16_padh 
.w16_main: pmaddubsw m0, m3 pmaddubsw m1, m3 add srcq, 16*2 packuswb m0, m1 movu [dstq], m0 add dstq, 16 sub hd, 2 jg .w16_loop test bhd, bhd jz .w16_end punpckhqdq m0, m0 .w16_padv: movu [dstq+16*0], m0 movu [dstq+16*1], m0 add dstq, 16*2 sub bhd, 4 jg .w16_padv .w16_end: RET .w32_padh: cmp wd, 16 jg .w32_padh2 pshufb m1, m0, m5 pshufb m0, m4 jmp .w32_main .w32_padh2: pshufb m1, m4 jmp .w32_main .w32: cmp wd, 32 je .w32_loop call .setup_padh .w32_loop: mova m0, [srcq+16*0] mova m1, [srcq+16*1] cmp wd, 32 jl .w32_padh .w32_main: pmaddubsw m0, m3 pmaddubsw m1, m3 add srcq, 16*2 packuswb m0, m1 movu [dstq], m0 add dstq, 16 dec hd jg .w32_loop test bhd, bhd jz .w32_end .w32_padv: movu [dstq+16*0], m0 movu [dstq+16*1], m0 movu [dstq+16*2], m0 movu [dstq+16*3], m0 add dstq, 16*4 sub bhd, 4 jg .w32_padv .w32_end: RET .w64_padh: cmp wd, 16 jg .w64_padh2 pshufb m1, m0, m5 pshufb m0, m4 pmaddubsw m0, m3 pmaddubsw m1, m3 packuswb m0, m1 packuswb m1, m1 jmp .w64_main .w64_padh2: pshufb m1, m4 pmaddubsw m0, m3 pmaddubsw m2, m1, m3 pshufb m1, m5 pmaddubsw m1, m3 packuswb m0, m2 packuswb m1, m1 jmp .w64_main .w64_padh3: cmp wd, 48 jg .w64_padh4 pshufb m2, m1, m5 pshufb m1, m4 jmp .w64_main2 .w64_padh4: pshufb m2, m4 jmp .w64_main2 .w64: cmp wd, 64 je .w64_loop call .setup_padh .w64_loop: mova m0, [srcq+16*0] mova m1, [srcq+16*1] cmp wd, 32 jle .w64_padh pmaddubsw m0, m3 pmaddubsw m1, m3 packuswb m0, m1 mova m1, [srcq+16*2] mova m2, [srcq+16*3] cmp wd, 64 jl .w64_padh3 .w64_main2: pmaddubsw m1, m3 pmaddubsw m2, m3 packuswb m1, m2 .w64_main: add srcq, 16*4 movu [dstq+16*0], m0 movu [dstq+16*1], m1 add dstq, 16*2 dec hd jg .w64_loop test bhd, bhd jz .w64_end .w64_padv: movu [dstq+16*0], m0 movu [dstq+16*1], m1 movu [dstq+16*2], m0 movu [dstq+16*3], m1 add dstq, 16*4 sub bhd, 2 jg .w64_padv .w64_end: RET .setup_padh: mova m4, [base+pb_0to63] lea r6d, [wq-1] and r6d, 15 movd m5, r6d pxor m0, m0 pshufb m5, m0 pminub m4, m5 ret %if ARCH_X86_64 INIT_YMM avx2 cglobal pal_idx_finish, 4, 
7, 5, dst, src, bw, bh, w, h %define base r6-pal_idx_finish_avx2_table lea r6, [pal_idx_finish_avx2_table] tzcnt bwd, bwd movifnidn wd, wm movifnidn hd, hm movsxd bwq, [r6+bwq*4] vpbroadcastd m2, [base+pb_1_16] dec wd add bwq, r6 sub bhd, hd jmp bwq .w4: mova xm0, [srcq] add srcq, 16 pmaddubsw xm0, xm2 packuswb xm0, xm0 movq [dstq], xm0 add dstq, 8 sub hd, 4 jg .w4 test bhd, bhd jz .w4_end pshuflw xm0, xm0, q3333 .w4_padv: movq [dstq], xm0 add dstq, 8 sub bhd, 4 jg .w4_padv .w4_end: RET .w8_padh: pshufb xm0, xm3 pshufb xm1, xm3 jmp .w8_main .w8: mova xm3, [base+pal_idx_w8_padh] .w8_loop: mova xm0, [srcq+16*0] mova xm1, [srcq+16*1] cmp wd, 7 jl .w8_padh .w8_main: pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 add srcq, 16*2 packuswb xm0, xm1 movu [dstq], xm0 add dstq, 16 sub hd, 4 jg .w8_loop test bhd, bhd jz .w8_end pshufd xm0, xm0, q3333 .w8_padv: movu [dstq], xm0 add dstq, 16 sub bhd, 4 jg .w8_padv .w8_end: RET .w16_padh: pshufb m0, m3 pshufb m1, m3 jmp .w16_main .w16: cmp wd, 15 je .w16_loop vbroadcasti128 m0, [base+pb_0to63] movd xm3, wd vpbroadcastb m3, xm3 pminub m3, m0 .w16_loop: mova m0, [srcq+32*0] mova m1, [srcq+32*1] cmp wd, 15 jl .w16_padh .w16_main: pmaddubsw m0, m2 pmaddubsw m1, m2 add srcq, 32*2 packuswb m0, m1 vpermq m1, m0, q3120 movu [dstq], m1 add dstq, 32 sub hd, 4 jg .w16_loop test bhd, bhd jz .w16_end vpermq m0, m0, q3333 .w16_padv: movu [dstq], m0 add dstq, 32 sub bhd, 4 jg .w16_padv .w16_end: RET .w32_padh: cmp wd, 15 jg .w32_padh2 vinserti128 m0, xm0, 1 vinserti128 m1, xm1, 1 .w32_padh2: pshufb m0, m3 pshufb m1, m3 jmp .w32_main .w32: cmp wd, 31 je .w32_loop movd xm3, wd vpbroadcastb m3, xm3 pminub m3, [base+pb_0to63] .w32_loop: mova m0, [srcq+32*0] mova m1, [srcq+32*1] cmp wd, 31 jl .w32_padh .w32_main: pmaddubsw m0, m2 pmaddubsw m1, m2 add srcq, 32*2 packuswb m0, m1 vpermq m1, m0, q3120 movu [dstq], m1 add dstq, 32 sub hd, 2 jg .w32_loop test bhd, bhd jz .w32_end vpermq m0, m0, q3131 .w32_padv: movu [dstq+32*0], m0 movu [dstq+32*1], m0 add dstq, 
32*2 sub bhd, 4 jg .w32_padv .w32_end: RET .w64_padh: cmp wd, 15 jg .w64_padh2 vinserti128 m1, m0, xm0, 1 pshufb m0, m1, m3 pshufb m1, m4 jmp .w64_main .w64_padh2: cmp wd, 31 jg .w64_padh3 vperm2i128 m1, m0, m0, 0x11 pshufb m0, m3 pshufb m1, m4 jmp .w64_main .w64_padh3: cmp wd, 47 jg .w64_padh4 vinserti128 m1, xm1, 1 .w64_padh4: pshufb m1, m3 jmp .w64_main .w64: cmp wd, 63 je .w64_loop mov r6d, wd and r6d, 31 movd xm4, r6d vpbroadcastb m4, xm4 pminub m3, m4, [pb_0to63] .w64_loop: mova m0, [srcq+32*0] mova m1, [srcq+32*1] cmp wd, 63 jl .w64_padh .w64_main: pmaddubsw m0, m2 pmaddubsw m1, m2 add srcq, 32*2 packuswb m0, m1 vpermq m0, m0, q3120 movu [dstq], m0 add dstq, 32 dec hd jg .w64_loop test bhd, bhd jz .w64_end .w64_padv: movu [dstq+32*0], m0 movu [dstq+32*1], m0 movu [dstq+32*2], m0 movu [dstq+32*3], m0 add dstq, 32*4 sub bhd, 4 jg .w64_padv .w64_end: RET INIT_ZMM avx512icl cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h %define base r6-pal_idx_finish_avx512icl_table lea r6, [pal_idx_finish_avx512icl_table] tzcnt bwd, bwd movifnidn wd, wm movifnidn hd, hm movsxd bwq, [r6+bwq*4] vpbroadcastd m4, [base+pb_1_16] dec wd add bwq, r6 sub bhd, hd jmp bwq .w4: mova xmm0, [srcq] add srcq, 16 pmaddubsw xmm0, xm4 packuswb xmm0, xmm0 movq [dstq], xmm0 add dstq, 8 sub hd, 4 jg .w4 test bhd, bhd jz .w4_end pshuflw xmm0, xmm0, q3333 .w4_padv: movq [dstq], xmm0 add dstq, 8 sub bhd, 4 jg .w4_padv .w4_end: RET .w8_padh: pshufb xmm0, xmm2 pshufb xmm1, xmm2 jmp .w8_main .w8: mova xmm2, [base+pal_idx_w8_padh] .w8_loop: mova xmm0, [srcq+16*0] mova xmm1, [srcq+16*1] cmp wd, 7 jl .w8_padh .w8_main: pmaddubsw xmm0, xm4 pmaddubsw xmm1, xm4 add srcq, 16*2 packuswb xmm0, xmm1 movu [dstq], xmm0 add dstq, 16 sub hd, 4 jg .w8_loop test bhd, bhd jz .w8_end pshufd xmm0, xmm0, q3333 .w8_padv: movu [dstq], xmm0 add dstq, 16 sub bhd, 4 jg .w8_padv .w8_end: RET .w16_padh: pshufb m0, m2 jmp .w16_main .w16: cmp wd, 15 je .w16_loop vbroadcasti32x4 m2, [base+pb_0to63] vpbroadcastb m0, wd pminub 
m2, m0 .w16_loop: mova m0, [srcq] cmp wd, 15 jl .w16_padh .w16_main: pmaddubsw m0, m4 add srcq, 64 vpmovwb ym0, m0 movu [dstq], ym0 add dstq, 32 sub hd, 4 jg .w16_loop test bhd, bhd jz .w16_end vpermq ym0, ym0, q3333 .w16_padv: movu [dstq], ym0 add dstq, 32 sub bhd, 4 jg .w16_padv .w16_end: RET .w32_padh: vpermb m0, m2, m0 vpermb m1, m2, m1 jmp .w32_main .w32: mova m2, [base+pb_0to63] paddb m3, m2, m2 cmp wd, 31 je .w32_loop vpbroadcastb m0, wd mov r6d, 0xff00 kmovw k1, r6d vpaddd m0{k1}, [pb_32] {1to16} pminub m2, m0 .w32_loop: mova m0, [srcq+64*0] mova m1, [srcq+64*1] cmp wd, 31 jl .w32_padh .w32_main: pmaddubsw m0, m4 pmaddubsw m1, m4 add srcq, 64*2 vpermt2b m0, m3, m1 movu [dstq], m0 add dstq, 64 sub hd, 4 jg .w32_loop test bhd, bhd jz .w32_end vshufi32x4 m0, m0, q3333 .w32_padv: movu [dstq], m0 add dstq, 64 sub bhd, 4 jg .w32_padv .w32_end: RET .w64_padh: REPX {vpermb x, m5, x}, m0, m1, m2, m3 jmp .w64_main .w64: mova m5, [base+pb_0to63] paddb m6, m5, m5 cmp wd, 63 je .w64_loop vpbroadcastb m0, wd pminub m5, m0 .w64_loop: mova m0, [srcq+64*0] mova m1, [srcq+64*1] mova m2, [srcq+64*2] mova m3, [srcq+64*3] cmp wd, 63 jl .w64_padh .w64_main: REPX {pmaddubsw x, m4}, m0, m1, m2, m3 add srcq, 64*4 vpermt2b m0, m6, m1 vpermt2b m2, m6, m3 movu [dstq+64*0], m0 movu [dstq+64*1], m2 add dstq, 64*2 sub hd, 4 jg .w64_loop test bhd, bhd jz .w64_end vshufi32x4 m2, m2, q3232 .w64_padv: movu [dstq+64*0], m2 movu [dstq+64*1], m2 add dstq, 64*2 sub bhd, 4 jg .w64_padv .w64_end: RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/pal.h000066400000000000000000000040421517466257200222050ustar00rootroot00000000000000/* * Copyright © 2023, VideoLAN and dav2d authors * Copyright © 2023, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" decl_pal_idx_finish_fn(dav2d_pal_idx_finish_ssse3); decl_pal_idx_finish_fn(dav2d_pal_idx_finish_avx2); decl_pal_idx_finish_fn(dav2d_pal_idx_finish_avx512icl); static ALWAYS_INLINE void pal_dsp_init_x86(Dav2dPalDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return; c->pal_idx_finish = dav2d_pal_idx_finish_ssse3; #if ARCH_X86_64 if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->pal_idx_finish = dav2d_pal_idx_finish_avx2; if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; c->pal_idx_finish = dav2d_pal_idx_finish_avx512icl; #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/refmvs.asm000066400000000000000000000716451517466257200233010ustar00rootroot00000000000000; Copyright © 2021, VideoLAN and dav2d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 %macro JMP_TABLE 2-* %xdefine %%prefix mangle(private_prefix %+ _%1) %1_table: %xdefine %%base %1_table %rep %0 - 1 dd %%prefix %+ .w%2 - %%base %rotate 1 %endrep %endmacro %macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix %rep %1 db %2*3 db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \ mangle(private_prefix %+ _save_tmvs_%3).write1 %endrep %endmacro %if ARCH_X86_64 mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340 dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092 dw 1024, 963, 910, 862, 819, 780, 744, 712 dw 682, 655, 630, 606, 585, 564, 546, 528 splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 %endif save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0 db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2 db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3 save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 pb_128: times 16 db 128 pq_8192: dq 8192 save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3 SAVE_TMVS_TABLE 4, 8, 
ssse3 SAVE_TMVS_TABLE 4, 4, ssse3 SAVE_TMVS_TABLE 5, 2, ssse3 SAVE_TMVS_TABLE 7, 1, ssse3 %if ARCH_X86_64 save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2 SAVE_TMVS_TABLE 4, 8, avx2 SAVE_TMVS_TABLE 4, 4, avx2 SAVE_TMVS_TABLE 5, 2, avx2 SAVE_TMVS_TABLE 7, 1, avx2 save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl SAVE_TMVS_TABLE 4, 8, avx512icl SAVE_TMVS_TABLE 4, 4, avx512icl SAVE_TMVS_TABLE 5, 2, avx512icl SAVE_TMVS_TABLE 7, 1, avx512icl JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 %endif JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 struc rf .frm_hdr: resq 1 .iw4: resd 1 .ih4: resd 1 .iw8: resd 1 .ih8: resd 1 .sbsz: resd 1 .use_rf_mvs: resd 1 .sign_bias: resb 7 .mfmv_sign: resb 7 .pocdiff: resb 7 .mfmv_ref: resb 3 .mfmv_ref2cur: resb 3 .mfmv_ref2ref: resb 3*7 .n_mfmvs: resd 1 .n_blocks: resd 1 .rp: resq 1 .rp_ref: resq 1 .rp_proj: resq 1 .rp_stride: resq 1 .r: resq 1 .n_tile_threads: resd 1 .n_frame_threads: resd 1 endstruc SECTION .text %macro movif32 2 %if ARCH_X86_32 mov %1, %2 %endif %endmacro INIT_XMM ssse3 ; refmvs_temporal_block *rp, ptrdiff_t stride, ; refmvs_block **rr, uint8_t *ref_sign, ; int col_end8, int row_end8, int col_start8, int row_start8 %if ARCH_X86_64 cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \ xend, yend, xstart, ystart %define base_reg r12 %else cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \ xend, yend, xstart, ystart movq m5, [ref_signq] lea strided, [strided*5] mov stridem, strided mov r3, xstartm mov r1, ystartm DEFINE_ARGS b, ystart, rr, cand, xend, x %define stridemp r1m %define m8 [base+pb_128] %define m9 [base+save_pack0+ 0] %define m10 [base+save_pack0+16] %define base_reg r6 %endif %define base base_reg-.write1 LEA base_reg, .write1 %if ARCH_X86_64 movifnidn xendd, xendm movifnidn yendd, yendm mov xstartd, xstartm mov ystartd, ystartm movq m5, [ref_signq] %endif movu m4, [base+save_ref_shuf] movddup m6, [base+save_cond0] movddup m7, [base+save_cond1] %if 
ARCH_X86_64 mova m8, [base+pb_128] mova m9, [base+save_pack0+ 0] mova m10, [base+save_pack0+16] %endif psllq m5, 8 %if ARCH_X86_64 lea r9d, [xendq*5] lea xstartd, [xstartq*5] sub yendd, ystartd add ystartd, ystartd lea strideq, [strideq*5] sub xstartq, r9 add xendd, r9d add rpq, r9 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand %else lea r0, [xendd*5] ; xend5 lea r3, [r3*5] ; xstart5 sub r3, r0 ; -w5 mov r6m, r3 %define xstartq r6m add xendd, r0 ; xend6 add r0m, r0 ; rp+xend5 mov xendm, xendd sub r5, r1 ; h add r1, r1 mov r7m, r1 mov r5m, r5 %define hd r5mp jmp .loop_y_noload %endif .loop_y: movif32 ystartd, r7m movif32 xendd, xendm .loop_y_noload: and ystartd, 30 mov xq, xstartq mov bq, [rrq+ystartq*gprsize] add ystartd, 2 movif32 r7m, ystartd lea bq, [bq+xendq*4] .loop_x: %if ARCH_X86_32 %define rpq r3 %define r10 r1 %define r10d r1 %define r11 r4 %define r11d r4 %endif imul candq, xq, 0x9999 ; x / 5 * 3 sar candq, 16 movzx r10d, byte [bq+candq*8+22] ; cand_b->bs movu m0, [bq+candq*8+12] ; cand_b movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0] movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1] add r10, base_reg add candq, r11 jge .calc movu m1, [bq+candq*8+12] movzx r11d, byte [bq+candq*8+22] movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1] add r11, base_reg .calc: movif32 rpq, r0m ; ref check punpckhqdq m2, m0, m1 pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ... pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1] ; mv check punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ... pabsw m2, m2 psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 ; res pcmpgtd m3, m2 pshufd m2, m3, q2301 pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ... pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ... por m3, m2 ; b0.shuf b1.shuf | ... 
pxor m3, m8 ; if cond0|cond1 == 0 => zero out pshufb m0, m3 pshufb m1, m3 call r10 jge .next_line pshufd m0, m1, q3232 call r11 jl .loop_x .next_line: add rpq, stridemp movif32 r0m, rpq dec hd jg .loop_y RET .write1: movd [rpq+xq+0], m0 psrlq m0, 8 movd [rpq+xq+1], m0 add xq, 5*1 ret .write2: movq [rpq+xq+0], m0 psrlq m0, 8 movd [rpq+xq+6], m0 add xq, 5*2 ret .write4: pshufb m0, m9 movu [rpq+xq+ 0], m0 psrlq m0, 8 movd [rpq+xq+16], m0 add xq, 5*4 ret .write8: pshufb m2, m0, m9 movu [rpq+xq+ 0], m2 pshufb m0, m10 movu [rpq+xq+16], m0 psrldq m2, 2 movq [rpq+xq+32], m2 add xq, 5*8 ret .write16: pshufb m2, m0, m9 movu [rpq+xq+ 0], m2 pshufb m0, m10 movu [rpq+xq+16], m0 shufps m2, m0, q1032 movu [rpq+xq+48], m2 shufps m2, m0, q2121 movu [rpq+xq+32], m2 shufps m0, m2, q1032 movu [rpq+xq+64], m0 add xq, 5*16 ret INIT_XMM sse2 ; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4 cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 add bx4d, bw4d tzcnt bw4d, bw4d mova m2, [aq] LEA aq, splat_mv_sse2_table lea bx4q, [bx4q*3-32] movsxd bw4q, [aq+bw4q*4] movifnidn bh4d, bh4m pshufd m0, m2, q0210 pshufd m1, m2, q1021 pshufd m2, m2, q2102 add bw4q, aq .loop: mov aq, [rrq] add rrq, gprsize lea aq, [aq+bx4q*4] jmp bw4q .w32: mova [aq-16*16], m0 mova [aq-16*15], m1 mova [aq-16*14], m2 mova [aq-16*13], m0 mova [aq-16*12], m1 mova [aq-16*11], m2 mova [aq-16*10], m0 mova [aq-16* 9], m1 mova [aq-16* 8], m2 mova [aq-16* 7], m0 mova [aq-16* 6], m1 mova [aq-16* 5], m2 .w16: mova [aq-16* 4], m0 mova [aq-16* 3], m1 mova [aq-16* 2], m2 mova [aq-16* 1], m0 mova [aq+16* 0], m1 mova [aq+16* 1], m2 .w8: mova [aq+16* 2], m0 mova [aq+16* 3], m1 mova [aq+16* 4], m2 .w4: mova [aq+16* 5], m0 mova [aq+16* 6], m1 mova [aq+16* 7], m2 dec bh4d jg .loop RET .w2: movu [aq+104], m0 movq [aq+120], m1 dec bh4d jg .loop RET .w1: movq [aq+116], m0 movd [aq+124], m2 dec bh4d jg .loop RET %if ARCH_X86_64 INIT_XMM sse4 ; refmvs_frame *rf, int tile_row_idx, ; int col_start8, int col_end8, int row_start8, int 
row_end8 cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ stride, rp_proj, roff, troff, \ xendi, xstarti, iw8, ih8, dst xor r14d, r14d cmp dword [rfq+rf.n_tile_threads], 1 mov ih8d, [rfq+rf.ih8] mov iw8d, [rfq+rf.iw8] mov xstartd, xstartd mov xendd, xendd cmove tridxd, r14d lea xstartid, [xstartq-8] lea xendid, [xendq+8] mov strideq, [rfq+rf.rp_stride] mov rp_projq, [rfq+rf.rp_proj] cmp ih8d, yendd mov [rsp+0x30], strideq cmovs yendd, ih8d test xstartid, xstartid cmovs xstartid, r14d cmp iw8d, xendid cmovs xendid, iw8d mov troffq, strideq shl troffq, 4 imul troffq, tridxq mov dstd, ystartd and dstd, 15 imul dstq, strideq add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride lea dstq, [dstq*5] add dstq, rp_projq lea troffq, [troffq*5] ; 16 * tridx * stride * 5 lea r13d, [xendq*5] lea r12, [strideq*5] DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \ _, troff, xendi, xstarti, stride5, _, dst lea w5d, [xstartq*5] add r7, troffq ; rp_proj + tile_row_offset mov hd, yendd mov [rsp+0x28], r7 add dstq, r13 sub w5q, r13 sub hd, ystartd .init_xloop_start: mov x5q, w5q test w5b, 1 jz .init_2blk mov dword [dstq+x5q], 0x80008000 add x5q, 5 jz .init_next_row .init_2blk: mov dword [dstq+x5q+0], 0x80008000 mov dword [dstq+x5q+5], 0x80008000 add x5q, 10 jl .init_2blk .init_next_row: add dstq, stride5q dec hd jg .init_xloop_start DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ _, _, xendi, xstarti, stride5, _, n mov r13d, [rfq+rf.n_mfmvs] test r13d, r13d jz .ret mov [rsp+0x0c], r13d mov strideq, [rsp+0x30] movddup m3, [pq_8192] mov r9d, ystartd mov [rsp+0x38], yendd mov [rsp+0x20], xstartid xor nd, nd lea n7q, [rfq+rf.mfmv_ref2ref-1] imul r9, strideq ; ystart * stride mov [rsp+0x48], rfq mov [rsp+0x18], stride5q lea r7, [r9*5] mov [rsp+0x24], ystartd mov [rsp+0x00], r7 .nloop: DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ ref, rp_ref, xendi, xstarti, _, _, n mov rfq, [rsp+0x48] movsx refd, byte 
[rfq+rf.mfmv_ref2cur+nq] cmp refd, -32 ; INVALID_REF2CUR je .next_n mov [rsp+0x40], refd mov offq, [rsp+0x00] ; ystart * stride * 5 movzx refd, byte [rfq+rf.mfmv_ref+nq] lea refsignq, [refq-4] mov rp_refq, [rfq+rf.rp_ref] movq m2, refsignq add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset mov [rsp+0x14], nd mov yd, ystartd .yloop: mov r11d, [rsp+0x24] ; ystart mov r12d, [rsp+0x38] ; yend mov r14d, yd and r14d, ~7 ; y_sb_align cmp r11d, r14d cmovs r11d, r14d ; imax(y_sb_align, ystart) mov [rsp+0x44], r11d ; y_proj_start add r14d, 8 cmp r12d, r14d cmovs r14d, r12d ; imin(y_sb_align + 8, yend) mov [rsp+0x3c], r14d ; y_proj_end DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \ ref, x, xendi, mvx, mvy, rb, ref2ref mov xd, [rsp+0x20] ; xstarti .xloop: lea rbd, [xq*5] add rbq, srcq movzx refd, byte [rbq+4] test refd, refd jz .next_x_bad_ref movzx ref2refd, byte [n7q+refq] ; rf->mfmv_ref2ref[n][b_ref-1] test ref2refd, ref2refd jz .next_x_bad_ref lea fracq, [mv_proj] movzx fracd, word [fracq+ref2refq*2] mov mvd, [rbq] imul fracd, [rsp+0x40] ; ref2cur pmovsxwq m0, [rbq] movd m1, fracd punpcklqdq m1, m1 pmuldq m0, m1 ; mv * frac pshufd m1, m0, q3311 paddd m0, m3 paddd m0, m1 psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14 pabsd m1, m0 packssdw m0, m0 psrld m1, 6 packuswb m1, m1 pxor m0, m2 ; offset ^ ref_sign psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign) movq mvxq, m1 lea mvyd, [mvxq+yq] ; ypos sar mvxq, 32 DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \ ref, x, xendi, mvx, ypos, rb, ref2ref cmp yposd, [rsp+0x44] ; y_proj_start jl .next_x_bad_pos_y cmp yposd, [rsp+0x3c] ; y_proj_end jge .next_x_bad_pos_y and yposd, 15 add mvxq, xq ; xpos imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \ ref, x, xendi, xpos, pos, rb, ref2ref mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset add posq, xposq ; pos += xpos lea posq, [posq*5] add dstq, posq ; dst += pos5 jmp .write_loop_entry 
.write_loop: add rbq, 5 cmp refb, byte [rbq+4] jne .xloop cmp mvd, [rbq] jne .xloop add dstq, 5 inc xposd .write_loop_entry: mov r12d, xd and r12d, ~7 lea r5d, [r12-8] cmp r5d, xstartd cmovs r5d, xstartd ; x_proj_start cmp xposd, r5d jl .next_xpos add r12d, 16 cmp xendd, r12d cmovs r12d, xendd ; x_proj_end cmp xposd, r12d jge .next_xpos mov [dstq+0], mvd mov byte [dstq+4], ref2refb .next_xpos: inc xd cmp xd, xendid jl .write_loop .next_y: DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n add srcq, [rsp+0x18] ; stride5 inc yd cmp yd, [rsp+0x38] ; yend jne .yloop mov nd, [rsp+0x14] mov ystartd, [rsp+0x24] .next_n: add n7q, 7 inc nd cmp nd, [rsp+0x0c] ; n_mfmvs jne .nloop .ret: RET .next_x: DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _ add rbq, 5 cmp refb, byte [rbq+4] jne .xloop cmp mvd, [rbq] jne .xloop .next_x_bad_pos_y: inc xd cmp xd, xendid jl .next_x jmp .next_y .next_x_bad_ref: inc xd cmp xd, xendid jl .xloop jmp .next_y INIT_YMM avx2 ; refmvs_temporal_block *rp, ptrdiff_t stride, ; refmvs_block **rr, uint8_t *ref_sign, ; int col_end8, int row_end8, int col_start8, int row_start8 cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \ xend, yend, xstart, ystart %define base r12-.write1 lea r12, [.write1] movifnidn xendd, xendm movifnidn yendd, yendm mov xstartd, xstartm mov ystartd, ystartm vpbroadcastq m4, [ref_signq] vpbroadcastq m3, [base+save_ref_shuf+8] vpbroadcastq m5, [base+save_cond0] vpbroadcastq m6, [base+save_cond1] vpbroadcastd m7, [base+pb_128] mova m8, [base+save_pack0] mova m9, [base+save_pack1] psllq m4, 8 lea r9d, [xendq*5] lea xstartd, [xstartq*5] sub yendd, ystartd add ystartd, ystartd lea strideq, [strideq*5] sub xstartq, r9 add xendd, r9d add rpq, r9 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand .loop_y: and ystartd, 30 mov xq, xstartq mov bq, [rrq+ystartq*8] add ystartd, 2 lea bq, [bq+xendq*4] .loop_x: imul candq, xq, 0x9999 sar candq, 16 ; x / 5 * 3 movzx r10d, 
byte [bq+candq*8+22] ; cand_b->bs movu xm0, [bq+candq*8+12] ; cand_b movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0] movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1] add r10, r12 add candq, r11 jge .calc vinserti128 m0, [bq+candq*8+12], 1 movzx r11d, byte [bq+candq*8+22] movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1] add r11, r12 .calc: pshufb m1, m0, m3 pabsw m2, m0 pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 pcmpgtd m1, m2 pshufd m2, m1, q2301 pand m1, m5 ; b0.cond0 b1.cond0 pand m2, m6 ; b0.cond1 b1.cond1 por m1, m2 ; b0.shuf b1.shuf pxor m1, m7 ; if cond0|cond1 == 0 => zero out pshufb m0, m1 call r10 jge .next_line vextracti128 xm0, m0, 1 call r11 jl .loop_x .next_line: add rpq, strideq dec hd jg .loop_y RET .write1: movd [rpq+xq+ 0], xm0 pextrb [rpq+xq+ 4], xm0, 4 add xq, 5*1 ret .write2: movq [rpq+xq+0], xm0 psrlq xm1, xm0, 8 movd [rpq+xq+6], xm1 add xq, 5*2 ret .write4: pshufb xm1, xm0, xm8 movu [rpq+xq+ 0], xm1 psrlq xm1, 8 movd [rpq+xq+16], xm1 add xq, 5*4 ret .write8: vinserti128 m1, m0, xm0, 1 pshufb m1, m8 movu [rpq+xq+ 0], m1 psrldq xm1, 2 movq [rpq+xq+32], xm1 add xq, 5*8 ret .write16: vinserti128 m1, m0, xm0, 1 pshufb m2, m1, m8 movu [rpq+xq+ 0], m2 pshufb m1, m9 movu [rpq+xq+32], m1 shufps xm2, xm1, q1021 movu [rpq+xq+64], xm2 add xq, 5*16 ret cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 add bx4d, bw4d tzcnt bw4d, bw4d vbroadcasti128 m0, [aq] lea aq, [splat_mv_avx2_table] lea bx4q, [bx4q*3-32] movsxd bw4q, [aq+bw4q*4] pshufb m0, [splat_mv_shuf] movifnidn bh4d, bh4m pshufd m1, m0, q2102 pshufd m2, m0, q1021 add bw4q, aq .loop: mov aq, [rrq] add rrq, gprsize lea aq, [aq+bx4q*4] jmp bw4q .w32: mova [aq-32*8], m0 mova [aq-32*7], m1 mova [aq-32*6], m2 mova [aq-32*5], m0 mova [aq-32*4], m1 mova [aq-32*3], m2 .w16: mova [aq-32*2], m0 mova [aq-32*1], m1 mova [aq+32*0], m2 .w8: mova [aq+32*1], m0 mova [aq+32*2], m1 mova [aq+32*3], m2 dec bh4d jg .loop RET .w4: movu [aq+ 80], m0 mova 
[aq+112], xm1 dec bh4d jg .loop RET .w2: movu [aq+104], xm0 movq [aq+120], xm2 dec bh4d jg .loop RET .w1: movq [aq+116], xm0 movd [aq+124], xm1 dec bh4d jg .loop RET INIT_ZMM avx512icl ; refmvs_temporal_block *rp, ptrdiff_t stride, ; refmvs_block **rr, uint8_t *ref_sign, ; int col_end8, int row_end8, int col_start8, int row_start8 cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \ xend, yend, xstart, ystart %define base r14-.write1 lea r14, [.write1] movifnidn xendd, xendm movifnidn yendd, yendm mov xstartd, xstartm mov ystartd, ystartm psllq m4, [ref_signq]{bcstq}, 8 vpbroadcastq m3, [base+save_ref_shuf+8] vbroadcasti32x4 m5, [base+cond_shuf512] vbroadcasti32x4 m6, [base+save_cond0] vpbroadcastd m7, [base+pb_128] mova m8, [base+save_pack0] movu xm9, [base+save_pack0+4] lea r9d, [xendq*5] lea xstartd, [xstartq*5] sub yendd, ystartd add ystartd, ystartd lea strideq, [strideq*5] sub xstartq, r9 add xendd, r9d add rpq, r9 mov r10d, 0x1f kmovb k2, r10d DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand .loop_y: and ystartd, 30 mov xq, xstartq mov bq, [rrq+ystartq*8] add ystartd, 2 lea bq, [bq+xendq*4] .loop_x: imul candq, xq, 0x9999 sar candq, 16 ; x / 5 * 3 movzx r10d, byte [bq+candq*8+22] ; cand_b->bs movu xm0, [bq+candq*8+12] ; cand_b movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0] movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1] add r10, r14 add candq, r11 jge .calc movzx r11d, byte [bq+candq*8+22] vinserti32x4 ym0, [bq+candq*8+12], 1 movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0] movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1] add r11, r14 add candq, r12 jge .calc movzx r12d, byte [bq+candq*8+22] vinserti32x4 m0, [bq+candq*8+12], 2 movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0] movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1] add r12, r14 add candq, r13 jge .calc vinserti32x4 m0, [bq+candq*8+12], 3 movzx r13d, byte [bq+candq*8+22] movzx r13d, byte 
[base+save_tmvs_avx512icl_table+r13*2+1] add r13, r14 .calc: pshufb m1, m0, m3 pabsw m2, m0 pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 psubd m2, m1 pshufb m2, m5 ; c0 c1 c1 c0 pand m2, m6 punpckhqdq m1, m2, m2 vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80 pshufb m2, m0, m1 mova xm0, xm2 call r10 jge .next_line vextracti32x4 xm0, m2, 1 call r11 jge .next_line vextracti32x4 xm0, m2, 2 call r12 jge .next_line vextracti32x4 xm0, m2, 3 call r13 jl .loop_x .next_line: add rpq, strideq dec hd jg .loop_y RET .write1: vmovdqu8 [rpq+xq]{k2}, xm0 add xq, 5*1 ret .write2: pshufb xm0, xm8 vmovdqu16 [rpq+xq]{k2}, xm0 add xq, 5*2 ret .write4: vpermb ym0, ym8, ym0 vmovdqu32 [rpq+xq]{k2}, ym0 add xq, 5*4 ret .write8: vpermb m0, m8, m0 vmovdqu64 [rpq+xq]{k2}, m0 add xq, 5*8 ret .write16: vpermb m1, m8, m0 movu [rpq+xq+ 0], m1 pshufb xm0, xm9 movu [rpq+xq+64], xm0 add xq, 5*16 ret INIT_ZMM avx512icl cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 vbroadcasti32x4 m0, [aq] lea r1, [splat_mv_avx512icl_table] tzcnt bw4d, bw4d lea bx4d, [bx4q*3] pshufb m0, [splat_mv_shuf] movsxd bw4q, [r1+bw4q*4] mov r6d, bh4m add bw4q, r1 lea rrq, [rrq+r6*8] mov r1d, 0x3f neg r6 kmovb k1, r1d jmp bw4q .w1: mov r1, [rrq+r6*8] vmovdqu16 [r1+bx4q*4]{k1}, xm0 inc r6 jl .w1 RET .w2: mov r1, [rrq+r6*8] vmovdqu32 [r1+bx4q*4]{k1}, ym0 inc r6 jl .w2 RET .w4: mov r1, [rrq+r6*8] vmovdqu64 [r1+bx4q*4]{k1}, m0 inc r6 jl .w4 RET .w8: pshufd ym1, ym0, q1021 .w8_loop: mov r1, [rrq+r6*8+0] mov r3, [rrq+r6*8+8] movu [r1+bx4q*4+ 0], m0 mova [r1+bx4q*4+64], ym1 movu [r3+bx4q*4+ 0], m0 mova [r3+bx4q*4+64], ym1 add r6, 2 jl .w8_loop RET .w16: pshufd m1, m0, q1021 pshufd m2, m0, q2102 .w16_loop: mov r1, [rrq+r6*8+0] mov r3, [rrq+r6*8+8] mova [r1+bx4q*4+64*0], m0 mova [r1+bx4q*4+64*1], m1 mova [r1+bx4q*4+64*2], m2 mova [r3+bx4q*4+64*0], m0 mova [r3+bx4q*4+64*1], m1 mova [r3+bx4q*4+64*2], m2 add r6, 2 jl .w16_loop RET .w32: pshufd m1, m0, q1021 pshufd m2, m0, q2102 
.w32_loop: mov r1, [rrq+r6*8] lea r1, [r1+bx4q*4] mova [r1+64*0], m0 mova [r1+64*1], m1 mova [r1+64*2], m2 mova [r1+64*3], m0 mova [r1+64*4], m1 mova [r1+64*5], m2 inc r6 jl .w32_loop RET %endif ; ARCH_X86_64 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/src/x86/refmvs.h000066400000000000000000000046741517466257200227460ustar00rootroot00000000000000/* * Copyright © 2021, VideoLAN and dav2d authors * Copyright © 2021, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/cpu.h" #include "src/refmvs.h" //decl_load_tmvs_fn(dav2d_load_tmvs_sse4); decl_save_tmvs_fn(dav2d_save_tmvs_ssse3); decl_save_tmvs_fn(dav2d_save_tmvs_avx2); decl_save_tmvs_fn(dav2d_save_tmvs_avx512icl); decl_splat_mv_fn(dav2d_splat_mv_sse2); decl_splat_mv_fn(dav2d_splat_mv_avx2); decl_splat_mv_fn(dav2d_splat_mv_avx512icl); static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav2dRefmvsDSPContext *const c) { const unsigned flags = dav2d_get_cpu_flags(); if (!(flags & DAV2D_X86_CPU_FLAG_SSE2)) return; c->splat_mv = dav2d_splat_mv_sse2; if (!(flags & DAV2D_X86_CPU_FLAG_SSSE3)) return; c->save_tmvs = dav2d_save_tmvs_ssse3; if (!(flags & DAV2D_X86_CPU_FLAG_SSE41)) return; #if ARCH_X86_64 //c->load_tmvs = dav2d_load_tmvs_sse4; if (!(flags & DAV2D_X86_CPU_FLAG_AVX2)) return; c->save_tmvs = dav2d_save_tmvs_avx2; c->splat_mv = dav2d_splat_mv_avx2; if (!(flags & DAV2D_X86_CPU_FLAG_AVX512ICL)) return; c->save_tmvs = dav2d_save_tmvs_avx512icl; c->splat_mv = dav2d_splat_mv_avx512icl; #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/subprojects/000077500000000000000000000000001517466257200224075ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/subprojects/checkasm.wrap000066400000000000000000000001501517466257200250540ustar00rootroot00000000000000[wrap-git] url = https://code.videolan.org/videolan/checkasm.git revision = v1.1.0 directory = checkasm dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/000077500000000000000000000000001517466257200212065ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/avm-update-md5.sh000077500000000000000000000004621517466257200242750ustar00rootroot00000000000000#!/bin/bash set -e AVMDEC="$1" if [ ! -e "$AVMDEC" ]; then echo $0 path/to/avmdec exit 1 fi SRC_PATH="$(dirname "$0")/.." 
for file in "$SRC_PATH"/media/*.obu; do base="$(basename "$file")" a_md5=$("$AVMDEC" "$file" --rawvideo --md5 | cut -d' ' -f1) echo "$a_md5" > "$file.md5" done dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/000077500000000000000000000000001517466257200227645ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/cdef.c000066400000000000000000000141441517466257200240350ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "tests/checkasm/internal.h" #include #include #include "common/dump.h" #include "src/levels.h" #include "src/cdef.h" static int to_binary(int x) { /* 0-15 -> 0000-1111 */ return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8); } static void init_tmp(pixel *buf, int n, const int bitdepth_max) { const int fill_type = rnd() & 7; if (fill_type == 0) while (n--) /* check for cdef_filter underflows */ *buf++ = rnd() & 1; else if (fill_type == 1) while (n--) /* check for cdef_filter overflows */ *buf++ = bitdepth_max - (rnd() & 1); else while (n--) *buf++ = rnd() & bitdepth_max; } static void check_cdef_filter(const cdef_fn fn, const int w, const int h) { ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8; ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8; ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8; ALIGN_STK_64(pixel, bot_buf, 16 * 2 + 16, ), *const bot = bot_buf + 8; ALIGN_STK_16(pixel, left, 8,[2]); const ptrdiff_t stride = 16 * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *top, const pixel *bot, int pri_strength, int sec_strength, int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX); for (int s = 0x1; s <= 0x3; s++) { if (check_func(fn, "cdef_filter_%dx%d_%02d_%dbpc", w, h, to_binary(s), BITDEPTH)) { for (int dir = 0; dir < 8; dir++) { for (enum CdefEdgeFlags edges = 0x0; edges <= 0xf; edges++) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; init_tmp(c_src, 16 * 10 + 16, bitdepth_max); init_tmp(top_buf, 16 * 2 + 16, bitdepth_max); init_tmp(bot_buf, 16 * 2 + 16, bitdepth_max); init_tmp((pixel *) left, 8 * 2, bitdepth_max); memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel)); const int pri_strength = s & 2 ? (1 + (rnd() % 15)) << bitdepth_min_8 : 0; const int sec_strength = s & 1 ? 
1 << ((rnd() % 3) + bitdepth_min_8) : 0; const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1)); call_ref(c_dst, stride, left, top, bot, pri_strength, sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); call_new(a_dst, stride, left, top, bot, pri_strength, sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) { fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n", pri_strength, sec_strength, dir, damping, to_binary(edges)); return; } if (dir == 7 && (edges == 0x5 || edges == 0xa || edges == 0xf)) bench_new(alternate(c_dst, a_dst), stride, left, top, bot, pri_strength, sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); } } } } } static void check_cdef_direction(const cdef_dir_fn fn) { ALIGN_STK_64(pixel, src, 8 * 8,); declare_func(int, const pixel *src, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX); if (check_func(fn, "cdef_dir_%dbpc", BITDEPTH)) { unsigned c_var, a_var; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif init_tmp(src, 64, bitdepth_max); const int c_dir = call_ref(src, 8 * sizeof(pixel), &c_var HIGHBD_TAIL_SUFFIX); const int a_dir = call_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX); if (c_var != a_var || c_dir != a_dir) { if (fail()) { hex_fdump(stderr, src, 8 * sizeof(pixel), 8, 8, "src"); fprintf(stderr, "c_dir %d a_dir %d\n", c_dir, a_dir); } } bench_new(src, 8 * sizeof(pixel), &a_var HIGHBD_TAIL_SUFFIX); } report("cdef_dir"); } void bitfn(checkasm_check_cdef)(void) { Dav2dCdefDSPContext c; bitfn(dav2d_cdef_dsp_init)(&c); check_cdef_direction(c.dir); check_cdef_filter(c.fb[0], 8, 8); check_cdef_filter(c.fb[1], 4, 8); check_cdef_filter(c.fb[2], 4, 4); report("cdef_filter"); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/checkasm.c000066400000000000000000000104011517466257200247020ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include "src/cpu.h" #include "tests/checkasm/internal.h" /* List of tests to invoke */ static const CheckasmTest tests[] = { { "msac", checkasm_check_msac }, { "pal", checkasm_check_pal }, { "refmvs", checkasm_check_refmvs }, #if CONFIG_8BPC { "cdef_8bpc", checkasm_check_cdef_8bpc }, { "deblock_8bpc", checkasm_check_deblock_8bpc }, { "filmgrain_8bpc", checkasm_check_filmgrain_8bpc }, { "ipred_8bpc", checkasm_check_ipred_8bpc }, { "itx_8bpc", checkasm_check_itx_8bpc }, { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc }, { "mc_8bpc", checkasm_check_mc_8bpc }, #endif #if CONFIG_16BPC { "cdef_16bpc", checkasm_check_cdef_16bpc }, { "deblock_16bpc", checkasm_check_deblock_16bpc }, { "filmgrain_16bpc", checkasm_check_filmgrain_16bpc }, { "ipred_16bpc", checkasm_check_ipred_16bpc }, { "itx_16bpc", checkasm_check_itx_16bpc }, { "looprestoration_16bpc", checkasm_check_looprestoration_16bpc }, { "mc_16bpc", checkasm_check_mc_16bpc }, #endif { 0 } }; /* List of cpu flags to check */ static const CheckasmCpuInfo flags[] = { #if ARCH_X86 { "SSE2", "sse2", DAV2D_X86_CPU_FLAG_SSE2 }, { "SSSE3", "ssse3", DAV2D_X86_CPU_FLAG_SSSE3 }, { "SSE4.1", "sse4", DAV2D_X86_CPU_FLAG_SSE41 }, { "AVX2", "avx2", DAV2D_X86_CPU_FLAG_AVX2 }, { "AVX-512 (Ice Lake)", "avx512icl", DAV2D_X86_CPU_FLAG_AVX512ICL }, #elif ARCH_AARCH64 || ARCH_ARM { "NEON", "neon", DAV2D_ARM_CPU_FLAG_NEON }, { "DOTPROD", "dotprod", 
DAV2D_ARM_CPU_FLAG_DOTPROD }, { "I8MM", "i8mm", DAV2D_ARM_CPU_FLAG_I8MM }, #if ARCH_AARCH64 { "SVE", "sve", DAV2D_ARM_CPU_FLAG_SVE }, { "SVE2", "sve2", DAV2D_ARM_CPU_FLAG_SVE2 }, #endif /* ARCH_AARCH64 */ #elif ARCH_LOONGARCH { "LSX", "lsx", DAV2D_LOONGARCH_CPU_FLAG_LSX }, { "LASX", "lasx", DAV2D_LOONGARCH_CPU_FLAG_LASX }, #elif ARCH_PPC64LE { "VSX", "vsx", DAV2D_PPC_CPU_FLAG_VSX }, { "PWR9", "pwr9", DAV2D_PPC_CPU_FLAG_PWR9 }, #elif ARCH_RISCV { "RVV", "rvv", DAV2D_RISCV_CPU_FLAG_V }, #endif { 0 } }; static void set_cpu_flags(CheckasmCpu flags) { dav2d_set_cpu_flags_mask((unsigned) flags); } int main(int argc, const char *argv[]) { #if TRIM_DSP_FUNCTIONS fprintf(stderr, "checkasm: reference functions unavailable, reconfigure using '-Dtrim_dsp=false'\n"); return 0; #endif CheckasmConfig cfg = { .cpu_flags = flags, .tests = tests, .set_cpu_flags = set_cpu_flags, }; dav2d_init_cpu(); cfg.cpu = dav2d_get_cpu_flags(); return checkasm_main(&cfg, argc, argv); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/deblock.c000066400000000000000000000250111517466257200245320ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include #include #include #include #include "src/deblock.h" #include "src/tables.h" #include "src/quantizer.h" static unsigned deblock_quant_thr(const int hbd, const int qidx) { const int qmax = 255 + 48 * hbd; return (dav2d_dq_lookup(iclip(qidx, 0, qmax)) + 4) >> (3 + 6); } static unsigned deblock_side_thr(const int hbd, const int qidx) { const int bitdepth_min_8 = 2 * hbd; const int q_ind = iclip(qidx - 24 * bitdepth_min_8, 0, 296 - 1); const int side_thr = dav2d_deblock_side_thresholds[q_ind]; return imax(side_thr + (1 << 4 >> bitdepth_min_8), 0) >> (5 - bitdepth_min_8); } // Given a target width and pixel data, check if the deblock filter is // triggered for at least that width. If not, find the amount you can scale all // pixels to make that width trigger or almost trigger. 
static int check_width(const pixel *const s, const pixel *const t, unsigned q_thr, unsigned side_thr, int target_width, int edge, int is_chroma, double *scale) { unsigned deriv_s, deriv_t; unsigned second_derivs_buf[4]; unsigned *second_deriv = &second_derivs_buf[2]; int pass = 1; *scale = 1.0; if (target_width == 0) return pass; for (int dist = -2; dist < 2; dist++) { deriv_s = abs(s[(dist - 1)] - (s[dist] << 1) + s[(dist + 1)]); deriv_t = abs(t[(dist - 1)] - (t[dist] << 1) + t[(dist + 1)]); second_deriv[dist] = (deriv_s + deriv_t + 1) >> 1; } #define TEST(val, thr) do { \ if ((val) > (thr)) { \ *scale = fmin((double)(thr)/(val), *scale); \ pass = 0; \ } \ } while (0); TEST(second_deriv[-2], side_thr); TEST(second_deriv[1], side_thr); if (target_width == 1) return pass; const unsigned side_thr2 = side_thr >> 2; TEST(second_deriv[-2], side_thr2); TEST(second_deriv[1], side_thr2); TEST(second_deriv[-1] + second_deriv[0], q_thr * 4); if (target_width == 2) return pass; const unsigned side_thr3 = side_thr >> 3; TEST(second_deriv[-2], side_thr3); TEST(second_deriv[1], side_thr3); TEST(second_deriv[-1] + second_deriv[0], q_thr * 3); const unsigned end_thr = (side_thr * 3) >> 4; // if chroma && !edge if (!(is_chroma && edge)) { deriv_s = abs(s[-1] - s[-4] - 3 * (s[-1] - s[-2])); deriv_t = abs(t[-1] - t[-4] - 3 * (t[-1] - t[-2])); TEST(((deriv_s + deriv_t + 1) >> 1), end_thr); } deriv_s = abs(s[0] - s[3] - 3 * (s[0] - s[1])); deriv_t = abs(t[0] - t[3] - 3 * (t[0] - t[1])); TEST((deriv_s + deriv_t + 1) >> 1, end_thr); if (target_width == 3) return pass; const unsigned transition = (second_deriv[-1] + second_deriv[0]) << 4; for (int dist = 4; dist <= target_width; dist += 2) { const int8_t q_first[5] = { 45, 40, 32 }; const unsigned q_thr4 = q_thr * q_first[(dist - 4) >> 1]; const unsigned end_thr4 = (side_thr * dist) >> 4; TEST(transition, q_thr4); const int dist2 = imin(7, dist); // if !(luma && edge && dist2 == 8) if (!(!is_chroma && edge && dist2 == 8)) { deriv_s = 
abs(s[-1] - s[-dist2 - 1] - dist2 * (s[-1] - s[-2])); deriv_t = abs(t[-1] - t[-dist2 - 1] - dist2 * (t[-1] - t[-2])); TEST((deriv_s + deriv_t + 1) >> 1, end_thr4); } deriv_s = abs(s[0] - s[dist2] - dist2 * (s[0] - s[1])); deriv_t = abs(t[0] - t[dist2] - dist2 * (t[0] - t[1])); TEST((deriv_s + deriv_t + 1) >> 1, end_thr4); } return pass; } static void init_deblock_border(pixel *const dst, const ptrdiff_t stridea, const ptrdiff_t strideb, int q_thr, int side_thr, int edge, int is_chroma, const int bitdepth_max) { // pixels tested when choosing a filter const int tested_pixels[7] = { 0, 1, 2, 3, 4, 6, 7 }; const int filter_widths[7] = { 0, 1, 2, 3, 4, 6, 8 }; // number of pixels that are tested on a side of the deblocked edge const int max_tested_pixels[7] = { 0, 3, 3, 4, 5, 6, 7 }; const int filter_width_idx = rnd() % 7; pixel s[16], t[16]; for (int i = 0; i < 16; i++) { s[i] = rnd() & bitdepth_max; t[i] = rnd() & bitdepth_max; } double scale; if (!check_width(s + 8, t + 8, q_thr, side_thr, filter_widths[filter_width_idx], edge, is_chroma, &scale)) { const int mid = s[8]; scale *= checkasm_randf(); int n_tested = max_tested_pixels[filter_width_idx]; for (int i = 0; i < n_tested; i++) { const int off = tested_pixels[i]; s[off + 8] = iclip_pixel(mid + (int)((s[off + 8] - mid) * scale)); t[off + 8] = iclip_pixel(mid + (int)((t[off + 8] - mid) * scale)); if (!edge || off < (is_chroma ? 
3 : 7)) { s[7 - off] = iclip_pixel(mid + (int)((s[7 - off] - mid) * scale)); t[7 - off] = iclip_pixel(mid + (int)((t[7 - off] - mid) * scale)); } } } for (int j = 1; j <= 2; j++) for (int i = -8; i < 8; i++) dst[i * strideb + j * stridea] = rnd() & bitdepth_max; for (int i = -8; i < 8; i++) { dst[i * strideb + 0 * stridea] = s[i + 8]; dst[i * strideb + 3 * stridea] = t[i + 8]; } } static void check_deblock_sb(deblock_sb_fn fn, const char *const name, const int n_blks, const int is_chroma, const int dir) { ALIGN_STK_64(pixel, c_dst_mem, 64 * 16,); ALIGN_STK_64(pixel, a_dst_mem, 64 * 16,); ALIGN_STK_16(pixel, q_thr, 16,); ALIGN_STK_16(pixel, side_thr, 16,); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint16_t *mask, const uint16_t *ll_mask, const pixel *q_thr, const pixel *side_thr, int edge, int w HIGHBD_DECL_SUFFIX); pixel *a_dst, *c_dst; ptrdiff_t stride; int w, h; if (dir) { a_dst = a_dst_mem + n_blks * 4 * 8; c_dst = c_dst_mem + n_blks * 4 * 8; w = n_blks * 4; h = 16; } else { a_dst = a_dst_mem + 8; c_dst = c_dst_mem + 8; w = 16; h = n_blks * 4; } stride = w * sizeof(pixel); const int n_strengths = is_chroma ? 3 : 4; const int widths[] = { 1, 3, 6, 8 }; const int chroma_widths[] = { 1, 3, 4 }; for (int i = 0; i < n_strengths; i++) { if (check_func(fn, "%s_w%d_%dbpc", name, is_chroma ? chroma_widths[i] : widths[i], BITDEPTH)) { for (int edge = 0; edge <= 1; edge++) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif uint16_t ll_mask[2] = { 0 }; uint16_t vmask[4] = { 0 }; int hbd = (bitdepth_from_max(bitdepth_max) - 8) >> 1; const int qidx_max = 255 + 48 * hbd; for (int j = 0; j < n_blks; j++) { const int idx = rnd() % (i + 2); if (idx) vmask[idx - 1] |= 1U << j; int qidx = rnd() % (qidx_max + 1); q_thr[j] = deblock_quant_thr(hbd, qidx); side_thr[j] = deblock_side_thr(hbd, qidx); } ll_mask[0] = rnd() & 0xffff; ll_mask[1] = rnd() & 0xffff; for (int x = 0; x < n_blks; x++) { const ptrdiff_t stridea = dir ? 1 : 16; const ptrdiff_t strideb = dir ? n_blks * 4 : 1; init_deblock_border(c_dst + 4 * x * stridea, stridea, strideb, q_thr[x], side_thr[x], edge, is_chroma, bitdepth_max); } memcpy(a_dst_mem, c_dst_mem, 64 * sizeof(pixel) * 16); call_ref(c_dst, stride, vmask, ll_mask, q_thr, side_thr, edge, n_blks HIGHBD_TAIL_SUFFIX); call_new(a_dst, stride, vmask, ll_mask, q_thr, side_thr, edge, n_blks HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel(c_dst_mem, stride, a_dst_mem, stride, w, h, "dst")) { fprintf(stderr, "edge = %d\n", edge); } bench_new(alternate(c_dst, a_dst), stride, vmask, ll_mask, q_thr, side_thr, edge, n_blks HIGHBD_TAIL_SUFFIX); } } } report(name); } void bitfn(checkasm_check_deblock)(void) { Dav2dDeblockDSPContext c; bitfn(dav2d_deblock_dsp_init)(&c); check_deblock_sb(c.deblock_sb[0][0], "deblock_h_sb_y", 16, 0, 0); check_deblock_sb(c.deblock_sb[0][1], "deblock_v_sb_y", 16, 0, 1); check_deblock_sb(c.deblock_sb[1][0], "deblock_h_sb_uv", 8, 1, 0); check_deblock_sb(c.deblock_sb[1][1], "deblock_v_sb_uv", 8, 1, 1); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/filmgrain.c000066400000000000000000000444221517466257200251060ustar00rootroot00000000000000/* * Copyright © 2019, VideoLAN and dav2d authors * Copyright © 2019, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include #include "src/levels.h" #include "src/filmgrain.h" #define UNIT_TEST 1 #include "src/fg_apply_tmpl.c" #if BITDEPTH == 8 #define checkasm_check_entry(...) checkasm_check(int8_t, __VA_ARGS__) #else #define checkasm_check_entry(...) 
checkasm_check(int16_t, __VA_ARGS__) #endif static const char ss_name[][4] = { [DAV2D_PIXEL_LAYOUT_I420 - 1] = "420", [DAV2D_PIXEL_LAYOUT_I422 - 1] = "422", [DAV2D_PIXEL_LAYOUT_I444 - 1] = "444", }; static void check_gen_grny(const Dav2dFilmGrainDSPContext *const dsp) { ALIGN_STK_16(entry, grain_lut_c, GRAIN_HEIGHT,[GRAIN_WIDTH]); ALIGN_STK_16(entry, grain_lut_a, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); declare_func(void, entry grain_lut[][GRAIN_WIDTH], const Dav2dFilmGrainData *data, unsigned seed HIGHBD_DECL_SUFFIX); for (int i = 0; i < 4; i++) { if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) { ALIGN_STK_16(Dav2dFilmGrainData, fg_data, 1,); const unsigned seed = rnd() & 0xFFFF; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #endif fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; fg_data[0].ar_coeff_lag = i; const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); for (int n = 0; n < num_y_pos; n++) fg_data[0].ar_coeffs[0][n] = (rnd() & 0xff) - 128; call_ref(grain_lut_c, fg_data, seed HIGHBD_TAIL_SUFFIX); call_new(grain_lut_a, fg_data, seed HIGHBD_TAIL_SUFFIX); checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, GRAIN_WIDTH, GRAIN_HEIGHT, "grain_lut"); bench_new(grain_lut_a, fg_data, seed HIGHBD_TAIL_SUFFIX); } } report("gen_grain_y"); } static void check_gen_grnuv(const Dav2dFilmGrainDSPContext *const dsp) { ALIGN_STK_16(entry, grain_lut_y, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); ALIGN_STK_16(entry, grain_lut_c, GRAIN_HEIGHT, [GRAIN_WIDTH]); ALIGN_STK_16(entry, grain_lut_a, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); declare_func(void, entry grain_lut[][GRAIN_WIDTH], const entry grain_lut_y[][GRAIN_WIDTH], const Dav2dFilmGrainData *data, unsigned seed, intptr_t uv HIGHBD_DECL_SUFFIX); for (int layout_idx = 0; layout_idx < 3; layout_idx++) { const enum Dav2dPixelLayout layout = layout_idx + 1; const int ss_x = 
layout != DAV2D_PIXEL_LAYOUT_I444; const int ss_y = layout == DAV2D_PIXEL_LAYOUT_I420; for (int i = 0; i < 4; i++) { if (check_func(dsp->generate_grain_uv[layout_idx], "gen_grain_uv_ar%d_%dbpc_%s", i, BITDEPTH, ss_name[layout_idx])) { ALIGN_STK_16(Dav2dFilmGrainData, fg_data, 1,); const unsigned seed = rnd() & 0xFFFF; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #endif fg_data[0].num_points[0] = rnd() & 1; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; fg_data[0].ar_coeff_lag = i; const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); for (int n = 0; n < num_y_pos; n++) fg_data[0].ar_coeffs[0][n] = (rnd() & 0xff) - 128; dsp->generate_grain_y(grain_lut_y, fg_data, seed HIGHBD_TAIL_SUFFIX); const int uv = rnd() & 1; const int num_uv_pos = num_y_pos + !!fg_data[0].num_points[0]; for (int n = 0; n < num_uv_pos; n++) fg_data[0].ar_coeffs[1 + uv][n] = (rnd() & 0xff) - 128; if (!fg_data[0].num_points[0]) fg_data[0].ar_coeffs[1 + uv][num_uv_pos] = 0; memset(grain_lut_c, 0xff, sizeof(grain_lut_c)); memset(grain_lut_a, 0xff, sizeof(grain_lut_a)); call_ref(grain_lut_c, grain_lut_y, fg_data, seed, uv HIGHBD_TAIL_SUFFIX); call_new(grain_lut_a, grain_lut_y, fg_data, seed, uv HIGHBD_TAIL_SUFFIX); int w = ss_x ? 44 : GRAIN_WIDTH; int h = ss_y ? 
38 : GRAIN_HEIGHT; checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, w, h, "grain_lut"); bench_new(grain_lut_a, grain_lut_y, fg_data, seed, uv HIGHBD_TAIL_SUFFIX); } } } report("gen_grain_uv"); } static void check_fgy_sbrow(const Dav2dFilmGrainDSPContext *const dsp) { PIXEL_RECT(c_dst, 128, 32); PIXEL_RECT(a_dst, 128, 32); PIXEL_RECT(src, 128, 32); const ptrdiff_t stride = c_dst_stride; declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, const Dav2dFilmGrainData *data, unsigned seed, size_t pw, const uint8_t scaling[SCALING_SIZE], const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num HIGHBD_DECL_SUFFIX); if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) { ALIGN_STK_16(Dav2dFilmGrainData, fg_data, 16,); ALIGN_STK_16(entry, grain_lut, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); const unsigned seed = rnd() & 0xFFFF; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif fg_data[0].mc_identity = 0; fg_data[0].block_size = rnd() % 1; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; fg_data[0].ar_coeff_lag = rnd() & 3; const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); for (int n = 0; n < num_y_pos; n++) fg_data[0].ar_coeffs[0][n] = (rnd() & 0xff) - 128; dsp->generate_grain_y(grain_lut, fg_data, seed HIGHBD_TAIL_SUFFIX); fg_data[0].num_points[0] = 2 + (rnd() % 13); const int pad = 0xff / fg_data[0].num_points[0]; for (int n = 0; n < fg_data[0].num_points[0]; n++) { fg_data[0].points[0][n][0] = 0xff * n / fg_data[0].num_points[0]; fg_data[0].points[0][n][0] += rnd() % pad; fg_data[0].points[0][n][1] = rnd() & 0xff; } generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].points[0], fg_data[0].num_points[0], scaling); fg_data[0].clip_to_restricted_range = rnd() & 1; fg_data[0].scaling_shift = (rnd() & 3) + 8; for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { int w, h, row_num; if (fg_data[0].overlap_flag) { w = 35 + (rnd() % 93); if (i == 0) { row_num = 0; h = 1 + (rnd() % 31); } else { row_num = 1 + (rnd() & 0x7ff); if (i == 1) { h = 3 + (rnd() % 30); } else { h = 1 + (rnd() & 1); } } } else { w = 1 + (rnd() & 127); h = 1 + (rnd() & 31); row_num = rnd() & 0x7ff; } for (int y = 0; y < 32; y++) { // Src pixels past the right edge can be uninitialized for (int x = 0; x < 128; x++) src[y * PXSTRIDE(stride) + x] = rnd(); for (int x = 0; x < w; x++) src[y * PXSTRIDE(stride) + x] &= bitdepth_max; } CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, src, stride, fg_data, seed, w, scaling, grain_lut, h, row_num HIGHBD_TAIL_SUFFIX); call_new(a_dst, src, stride, fg_data, seed, w, scaling, grain_lut, h, row_num HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, w, h, 
"dst", 32, 2); } } fg_data[0].overlap_flag = 1; for (int y = 0; y < 32; y++) { // Make sure all pixels are in range for (int x = 0; x < 128; x++) src[y * PXSTRIDE(stride) + x] &= bitdepth_max; } bench_new(a_dst, src, stride, fg_data, seed, 64, scaling, grain_lut, 32, 1 HIGHBD_TAIL_SUFFIX); } report("fgy_32x32xn"); } /* Checks the dsp->fguv_32x32xn[] functions: for each of the three layouts and for csfl = 0 and 1, builds randomized film grain parameters, runs the reference and the candidate implementation on identical src/luma_src input, and compares the two destination buffers. */ static void check_fguv_sbrow(const Dav2dFilmGrainDSPContext *const dsp) { PIXEL_RECT(c_dst, 128, 32); PIXEL_RECT(a_dst, 128, 32); PIXEL_RECT(src, 128, 32); PIXEL_RECT(luma_src, 128, 32); const ptrdiff_t lstride = luma_src_stride; declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, const Dav2dFilmGrainData *data, unsigned seed, size_t pw, const uint8_t scaling[SCALING_SIZE], const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl, int is_identity HIGHBD_DECL_SUFFIX); for (int layout_idx = 0; layout_idx < 3; layout_idx++) { /* layout_idx 0..2 maps to enum values 1..3; ss_x/ss_y are 0/1 subsampling flags derived from the layout */ const enum Dav2dPixelLayout layout = layout_idx + 1; const int ss_x = layout != DAV2D_PIXEL_LAYOUT_I444; const int ss_y = layout == DAV2D_PIXEL_LAYOUT_I420; const ptrdiff_t stride = c_dst_stride; for (int csfl = 0; csfl <= 1; csfl++) { if (check_func(dsp->fguv_32x32xn[layout_idx], "fguv_32x32xn_%dbpc_%s_csfl%d", BITDEPTH, ss_name[layout_idx], csfl)) { ALIGN_STK_16(Dav2dFilmGrainData, fg_data, 1,); ALIGN_STK_16(entry, grain_lut, 2,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); const unsigned seed = rnd() & 0xFFFF; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif const int uv_pl = rnd() & 1; const int is_identity = rnd() & 1; fg_data[0].mc_identity = 0; /* NOTE(review): rnd() % 1 is always 0, so block_size is never randomized here; confirm whether rnd() & 1 was intended or a fixed 0 is deliberate. */ fg_data[0].block_size = rnd() % 1; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; fg_data[0].ar_coeff_lag = rnd() & 3; fg_data[0].num_points[0] = csfl ? 
2 + (rnd() % 13) : 0; const int num_y_pos = 2 * fg_data[0].ar_coeff_lag * (fg_data[0].ar_coeff_lag + 1); for (int n = 0; n < num_y_pos; n++) fg_data[0].ar_coeffs[0][n] = (rnd() & 0xff) - 128; const int num_uv_pos = num_y_pos + 1; for (int n = 0; n < num_uv_pos; n++) fg_data[0].ar_coeffs[1 + uv_pl][n] = (rnd() & 0xff) - 128; dsp->generate_grain_y(grain_lut[0], fg_data, seed HIGHBD_TAIL_SUFFIX); dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0], fg_data, seed, uv_pl HIGHBD_TAIL_SUFFIX); if (csfl) { const int pad = 0xff / fg_data[0].num_points[0]; for (int n = 0; n < fg_data[0].num_points[0]; n++) { fg_data[0].points[0][n][0] = 0xff * n / fg_data[0].num_points[0]; fg_data[0].points[0][n][0] += rnd() % pad; fg_data[0].points[0][n][1] = rnd() & 0xff; } generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].points[0], fg_data[0].num_points[0], scaling); } else { fg_data[0].num_points[1 + uv_pl] = 2 + (rnd() % 9); const int pad = 0xff / fg_data[0].num_points[1 + uv_pl]; for (int n = 0; n < fg_data[0].num_points[1 + uv_pl]; n++) { fg_data[0].points[1 + uv_pl][n][0] = 0xff * n / fg_data[0].num_points[1 + uv_pl]; fg_data[0].points[1 + uv_pl][n][0] += rnd() % pad; fg_data[0].points[1 + uv_pl][n][1] = rnd() & 0xff; } generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].points[1 + uv_pl], fg_data[0].num_points[1 + uv_pl], scaling); fg_data[0].uv_mult[uv_pl] = (rnd() & 0xff) - 128; fg_data[0].uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128; fg_data[0].uv_offset[uv_pl] = (rnd() & 0x1ff) - 256; } fg_data[0].clip_to_restricted_range = rnd() & 1; fg_data[0].scaling_shift = (rnd() & 3) + 8; fg_data[0].chroma_scaling_from_luma = csfl; for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { int w, h, row_num; if (fg_data[0].overlap_flag) { w = (36 >> ss_x) + (rnd() % (92 >> ss_x)); if (i == 0) { row_num = 0; h = 1 + (rnd() & (31 >> ss_y)); } else { row_num = 1 + (rnd() & 
0x7ff); if (i == 1) { h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30)); } else { h = ss_y ? 1 : 1 + (rnd() & 1); } } } else { w = 1 + (rnd() & (127 >> ss_x)); h = 1 + (rnd() & (31 >> ss_y)); row_num = rnd() & 0x7ff; } for (int y = 0; y < 32; y++) { // Src pixels past the right edge can be uninitialized for (int x = 0; x < 128; x++) { src[y * PXSTRIDE(stride) + x] = rnd(); luma_src[y * PXSTRIDE(lstride) + x] = rnd(); } for (int x = 0; x < w; x++) src[y * PXSTRIDE(stride) + x] &= bitdepth_max; for (int x = 0; x < (w << ss_x); x++) luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; } CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, src, stride, fg_data, seed, w, scaling, grain_lut[1], h, row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); call_new(a_dst, src, stride, fg_data, seed, w, scaling, grain_lut[1], h, row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, w, h, "dst", 32 >> ss_x, 4); } } fg_data[0].overlap_flag = 1; for (int y = 0; y < 32; y++) { // Make sure all pixels are in range for (int x = 0; x < 128; x++) { src[y * PXSTRIDE(stride) + x] &= bitdepth_max; luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; } } bench_new(a_dst, src, stride, fg_data, seed, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y, 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); } } } report("fguv_32x32xn"); } void bitfn(checkasm_check_filmgrain)(void) { Dav2dFilmGrainDSPContext c; bitfn(dav2d_film_grain_dsp_init)(&c); check_gen_grny(&c); check_gen_grnuv(&c); check_fgy_sbrow(&c); check_fguv_sbrow(&c); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/internal.h000066400000000000000000000055521517466257200247600ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_TESTS_CHECKASM_INTERNAL_H #define DAV2D_TESTS_CHECKASM_INTERNAL_H #include "config.h" #include "common/intops.h" #include #include #define rnd checkasm_rand #define decl_check_bitfns(name) \ name##_8bpc(void); \ name##_16bpc(void) void checkasm_check_msac(void); void checkasm_check_pal(void); void checkasm_check_refmvs(void); decl_check_bitfns(void checkasm_check_cdef); decl_check_bitfns(void checkasm_check_filmgrain); decl_check_bitfns(void checkasm_check_ipred); decl_check_bitfns(void checkasm_check_itx); decl_check_bitfns(void checkasm_check_deblock); decl_check_bitfns(void checkasm_check_looprestoration); decl_check_bitfns(void checkasm_check_mc); #ifdef BITDEPTH #define checkasm_check_impl_pixel checkasm_check_impl(PIXEL_TYPE) #define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__) #define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__) #define PIXEL_RECT(name, w, h) BUF_RECT(pixel, name, w, h) #define CLEAR_PIXEL_RECT CLEAR_BUF_RECT #if BITDEPTH == 8 #define INIT_PIXEL_RECT(buf) checkasm_init_mask8(buf, sizeof(buf), bitdepth_max) #else #define INIT_PIXEL_RECT(buf) checkasm_init_mask16(buf, sizeof(buf) >> 1, bitdepth_max) #endif #define checkasm_check_pixel_padded checkasm_check_rect_padded #define checkasm_check_pixel_padded_align checkasm_check_rect_padded_align #endif #endif /* DAV2D_TESTS_CHECKASM_INTERNAL_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/ipred.c000066400000000000000000000346731517466257200242500ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include "src/ipred.h" #include "src/levels.h" #include /* Name fragment for each implemented intra prediction mode; used to build the check_func() test names. */ static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = { [DC_PRED] = "dc", [DC_128_PRED] = "dc_128", [TOP_DC_PRED] = "dc_top", [LEFT_DC_PRED] = "dc_left", [HOR_PRED] = "h", [VERT_PRED] = "v", [PAETH_PRED] = "paeth", [SMOOTH_PRED] = "smooth", [SMOOTH_V_PRED] = "smooth_v", [SMOOTH_H_PRED] = "smooth_h", [Z1_PRED] = "z1", [Z2_PRED] = "z2", [Z3_PRED] = "z3", [DIP_PRED] = "dip" }; /* Declared with 3 slots but only two names are given, so the last slot is NULL. */ static const char *const cfl_pred_type_names[3] = { "cfl_explicit", "cfl_implicit" }; static const char *const cfl_luma_filter_names[3] = { "uniform", "vstrip", "gauss" }; static const char *const layout_names[3] = { "420", "422", "444" }; /* Angle values picked at random (index rnd() % 27) for the z1/z2/z3 prediction tests. */ static const uint8_t z_angles[27] = { 3, 6, 9, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45, 48, 51, 54, 58, 61, 64, 67, 70, 73, 76, 81, 84, 87 }; /* Generate max_width/max_height values that cover all edge cases. Bits 17 and 16 of one rnd() draw select the case: (n & (sz - 1)) + 1 for an edge block, the constant 65536, or (n & 65535) + 1. */ static int gen_z_max_wh(const int sz) { const int n = rnd(); if (n 
& (1 << 17)) /* edge block */ return (n & (sz - 1)) + 1; if (n & (1 << 16)) /* max size, exceeds uint16_t */ return 65536; return (n & 65535) + 1; } static void check_intra_pred(Dav2dIntraPredDSPContext *const c) { PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(pixel, topleft_buf, 64 * 8 + 2 * 1 + 2 * 9,); int bitdepth_max; if (BITDEPTH == 16) bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; else bitdepth_max = (1 << BITDEPTH) - 1; for (int i = 0; i < 64 * 8 + 2 * 1 + 2 * 9; i++) topleft_buf[i] = rnd() & bitdepth_max; declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft, int width, int height, int angle, int max_width, int max_height HIGHBD_DECL_SUFFIX); for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) { for (int w = 4; w <= 64; w <<= 1) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc", intra_pred_mode_names[mode], w, BITDEPTH)) { for (int h = 4; h <= 64; h <<= 1) { const ptrdiff_t stride = c_dst_stride; int nb_iters = (mode >= Z1_PRED && mode <= Z3_PRED) ? 
5 : 1; for (int iter = 0; iter < nb_iters; iter++) { int a = 0, maxw = 0, maxh = 0; if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */ a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) | (rnd() & 0xffe00); if (imin(w, h) == 4) a &= ~(ANGLE_MULTI_MRL_FLAG | ANGLE_IBP_FLAG); if (a & ANGLE_MULTI_MRL_FLAG) a |= ANGLE_IS_LUMA; maxw = gen_z_max_wh(w); maxh = gen_z_max_wh(h); } else if (mode == DIP_PRED) { /* dip_idx */ a = (rnd() % 5) | (rnd() & 16); } else if (mode == HOR_PRED || mode == VERT_PRED) { if (imax(w, h) > 4) a = (rnd() & 1) * ANGLE_MULTI_MRL_FLAG; } else if (mode == DC_PRED || mode == LEFT_DC_PRED || mode == TOP_DC_PRED) { if (imax(w, h) > 4) a = (rnd() & 1) * ANGLE_IBP_FLAG; } pixel *const topleft = topleft_buf + 128 + 9; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh HIGHBD_TAIL_SUFFIX); call_new(u_dst, stride, topleft, w, h, a, maxw, maxh HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel_padded(c_dst, stride, u_dst, stride, w, h, "dst")) { if (mode >= Z1_PRED && mode <= Z3_PRED) fprintf(stderr, "angle = %d (0x%05x), " "max_width = %d, max_height = %d\n", a & 0x1ff, a & 0xffe00, maxw, maxh); else if (mode == DIP_PRED) fprintf(stderr, "dip tp = %d, mode = %d\n", a > 7, a & 7); else if (mode == HOR_PRED || mode == VERT_PRED) fprintf(stderr, "multimrl=%d\n", !!a); else if (mode == DC_PRED || mode == LEFT_DC_PRED || mode == TOP_DC_PRED) { fprintf(stderr, "ibp=%d\n", !!a); } break; } bench_new(a_dst, stride, topleft, w, h, a, 128, 128 HIGHBD_TAIL_SUFFIX); } } } } } report("intra_pred"); } static void check_cfl_pred(Dav2dIntraPredDSPContext *const c) { PIXEL_RECT(c_y, 128, 128); PIXEL_RECT(c_u, 128, 128); PIXEL_RECT(c_v, 128, 128); PIXEL_RECT(a_y, 128, 128); PIXEL_RECT(a_u, 128, 128); PIXEL_RECT(a_v, 128, 128); pixel c_top_sb[3 * 128], a_top_sb[3 * 128]; declare_func(void, pixel *const *ptrs, const ptrdiff_t *stride, int wpad, int hpad, int w, int h, int flags HIGHBD_DECL_SUFFIX); for (enum CflType type = 
CFL_EXPLICIT; type <= CFL_IMPLICIT; type++) { for (int layout = 1; layout <= DAV2D_PIXEL_LAYOUT_I444; layout++) { const int ss_hor = layout != DAV2D_PIXEL_LAYOUT_I444; const int ss_ver = layout == DAV2D_PIXEL_LAYOUT_I420; const ptrdiff_t strides[2] = { c_y_stride, c_u_stride >> ss_hor }; /* rng0 = 1 tests the "uv" variant (both alphas non-zero), rng0 = 0 the "u|v" variant (exactly one alpha non-zero) */ for (int w = 4; w <= 64; w <<= 1) for (int padding = 0; padding <= 1; padding++) for (int rng0 = 1; rng0 >= 0; rng0--) for (int flt_type = CFL_FLT_TYPE_UNIFORM; flt_type <= CFL_FLT_TYPE_GAUSS; flt_type++) { if (check_func(c->cfl_pred[type][layout - 1], "%s_%s_w%d_pad%d_%s_%s_%dbpc", cfl_pred_type_names[type], layout_names[layout - 1], w, padding, rng0 ? "uv" : "u|v", cfl_luma_filter_names[flt_type], BITDEPTH)) { const int h0 = padding && w < 16 ? 16 : 4; for (int h = h0; h <= 64; h <<= 1) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif int wpad = 0, hpad = 0; if (padding) { wpad = imax(w > 8, rnd() & imax(w / 8 - 1, 0)); hpad = imax(h > 8, rnd() & imax(h / 8 - 1, 0)); } int flags = flt_type | (rnd() & (CFL_HAS_TOP | CFL_HAS_LEFT | CFL_IS_TOP_SB_EDGE)); if (type == CFL_EXPLICIT) { const int sign_u = 1 - (rnd() & 2); const int sign_v = 1 - (rnd() & 2); /* Use a local lower bound instead of modifying the loop counter rng0, so that the "uv" / "u|v" variant named in check_func() stays fixed for every h and flt_type iteration and the rng0 loop is not re-run. */ int alpha_min = rng0; int alpha_u = alpha_min + (rnd() % (9 - alpha_min)); /* if alpha_u came out as 0, force alpha_v to be non-zero */ alpha_min |= !alpha_u; int alpha_v = alpha_min + (rnd() % (9 - alpha_min)); if (rng0 == 0 && alpha_u) alpha_v = 0; alpha_u *= sign_u; alpha_v *= sign_v; flags |= (alpha_u << CFL_ALPHA_U_SHIFT) & CFL_ALPHA_U_MASK; flags |= (alpha_v << CFL_ALPHA_V_SHIFT) & CFL_ALPHA_V_MASK; } const int itse = flags & CFL_IS_TOP_SB_EDGE; /* NOTE(review): strides[] are used below as offsets in pixel units without PXSTRIDE(); confirm this is intended for 16 bpc. */ pixel *c_ytop = itse ? c_top_sb : c_y - (1 + ss_ver) * strides[0]; pixel *c_utop = itse ? c_top_sb + 128 : c_u - strides[1]; pixel *c_vtop = itse ? c_top_sb + 256 : c_v - strides[1]; pixel *a_ytop = itse ? a_top_sb : a_y - (1 + ss_ver) * strides[0]; pixel *a_utop = itse ? a_top_sb + 128 : a_u - strides[1]; pixel *a_vtop = itse ? 
a_top_sb + 256 : a_v - strides[1]; pixel *const c_ptrs[6] = { c_ytop, c_utop, c_vtop, c_y, c_u, c_v }; pixel *const a_ptrs[6] = { a_ytop, a_utop, a_vtop, a_y, a_u, a_v }; INIT_PIXEL_RECT(c_y_buf); INIT_PIXEL_RECT(c_u_buf); INIT_PIXEL_RECT(c_v_buf); memcpy(a_y_buf, c_y_buf, c_y_buf_h * c_y_stride); memcpy(a_u_buf, c_u_buf, c_u_buf_h * c_u_stride); memcpy(a_v_buf, c_v_buf, c_v_buf_h * c_v_stride); if (itse) { INIT_PIXEL_RECT(c_top_sb); memcpy(a_top_sb, c_top_sb, 3 * 128 * sizeof(pixel)); } call_ref(c_ptrs, strides, wpad, hpad, w, h, flags HIGHBD_TAIL_SUFFIX); call_new(a_ptrs, strides, wpad, hpad, w, h, flags HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_u, strides[1], a_u, strides[1], w, h, "u_dst"); checkasm_check_pixel_padded(c_v, strides[1], a_v, strides[1], w, h, "v_dst"); if (padding) { wpad = imax(w / 8 - 1, 0); hpad = imax(h / 8 - 1, 0); } bench_new(a_ptrs, strides, wpad, hpad, w, h, flags | (CFL_HAS_TOP | CFL_HAS_LEFT) HIGHBD_TAIL_SUFFIX); } } } } report("%s", cfl_pred_type_names[type]); } } static void check_pal_pred(Dav2dIntraPredDSPContext *const c) { PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(uint8_t, idx, 32 * 64,); ALIGN_STK_16(pixel, pal, 8,); declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *pal, const uint8_t *idx, int w, int h); for (int w = 4; w <= 64; w <<= 1) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH)) for (int h = imax(4, 64 / w); h <= 64; h <<= 1) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int i = 0; i < 8; i++) pal[i] = rnd() & bitdepth_max; for (int i = 0; i < w * h / 2; i++) idx[i] = rnd() & 0x77; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, pal, idx, w, h); call_new(u_dst, a_dst_stride, pal, idx, w, h); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); bench_new(a_dst, a_dst_stride, pal, idx, w, h); } } report("pal_pred"); } void bitfn(checkasm_check_ipred)(void) { Dav2dIntraPredDSPContext c; bitfn(dav2d_intra_pred_dsp_init)(&c); check_intra_pred(&c); check_cfl_pred(&c); check_pal_pred(&c); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/itx.c000066400000000000000000000323201517466257200237340ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include #include #include "src/itx.h" #include "src/levels.h" #include "src/scan.h" #include "src/tables.h" /* Name fragment for each 1-D transform type; used to build the check_func() test names. */ static const char *const itx_1d_names[] = { [DCT] = "dct", [IDENTITY] = "identity", [ADST] = "adst", [FLIPADST] = "flipadst", [DDT] = "ddt", [FLIPDDT] = "flipddt", [WHT] = "wht", }; static const char *const dpcm_names[] = { "", "_hdpcm", "_vdpcm" }; /* Fills coeff[] with random test coefficients for transform size tx and type txtp (sw x sh coded coefficients, sub-transform index subsh). Stores the index of the last generated position in *max_eob and returns the eob that was chosen. */ static int generate_coefs(coef *coeff, const enum RectTxfmSize tx, const enum TxfmType txtp, const int sw, const int sh, const int subsh, int *const max_eob, const int coef_max) { /* Generate topleft coefficients such that the return value (being the * coefficient scantable index for the eob token) guarantees that only * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both * dimensions are non-zero. This leads to branching to specific optimized * simd versions (e.g. dc-only) so that we get full asm coverage in this * test */ const enum TxClass tx_class = (txtp >> 3) & 3; const uint16_t *const scan = dav2d_scans[tx]; const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0; const int sub_low = subsh > 1 ? 
sub_high - 8 : 0; const int coef_sign = (coef_max + 1) >> 1; int n, eob; for (n = 0, eob = 0; n < sw * sh; n++) { int rc, rcx, rcy; if (tx_class == TX_CLASS_H) rcx = n % sh, rcy = n / sh, rc = n; else if (tx_class == TX_CLASS_V) rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx; else rc = scan[n], rcx = rc % sh, rcy = rc / sh; /* Pick a random eob within this sub-itx */ if (rcx > sub_high || rcy > sub_high) break; /* upper boundary */ if (!eob && (rcx > sub_low || rcy > sub_low)) eob = n; /* lower boundary */ coeff[rc] = (rnd() & coef_max) - coef_sign; } *max_eob = n - 1; if (eob) eob += rnd() % (n - eob - 1); if (tx_class == TX_CLASS_H) for (n = eob + 1; n < sw * sh; n++) coeff[n] = 0; else if (tx_class == TX_CLASS_V) { for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1) while (++rcy < sw) coeff[rcy * sh + rcx] = 0; n = sw * sh; } else { for (n = eob + 1; n < sw * sh; n++) coeff[scan[n]] = 0; if (tx_class == TX_CLASS_2D_INV) { /* Reverse the coefficient array */ for (int i = 0, j = n - 1; i < j; i++, j--) { int tmp = coeff[i]; coeff[i] = coeff[j]; coeff[j] = tmp; } } } for (; n < 32 * 32; n++) coeff[n] = rnd(); return eob; } #define TXTP_MASK_DCT_ONLY DCT_DCT, 0xff /* invalid */ #define TXTP_MASK_IDTX IDTX, IDTX_INV #define TXTP_MASK_DCT_ID_ONLY TXTP_MASK_IDTX, TXTP_MASK_DCT_ONLY #define TXTP_MASK_DCT_HOR ADST_DCT, FLIPADST_DCT, H_DCT, TXTP_MASK_DCT_ONLY #define TXTP_MASK_DCT_VER DCT_ADST, DCT_FLIPADST, V_DCT, TXTP_MASK_DCT_ONLY #define TXTP_MASK_DCT_ID_HOR \ V_DCT, V_ADST, V_FLIPADST, TXTP_MASK_IDTX, TXTP_MASK_DCT_HOR #define TXTP_MASK_DCT_ID_VER \ H_DCT, H_ADST, H_FLIPADST, TXTP_MASK_IDTX, TXTP_MASK_DCT_VER #define TXTP_MASK_16x16 \ FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST, TXTP_MASK_IDTX, \ ADST_ADST, DCT_ADST, DCT_FLIPADST, V_DCT, TXTP_MASK_DCT_HOR #define TXTP_MASK_ALL V_ADST, H_ADST, V_FLIPADST, H_FLIPADST, TXTP_MASK_16x16 #define TXTP_MASK_ALL_LOSSLESS WHT_WHT, TXTP_MASK_ALL #define TXTP_MASK_DCT_VER_W_DDT DCT_DDT, DCT_FLIPDDT, 
TXTP_MASK_DCT_VER #define TXTP_MASK_DCT_HOR_W_DDT DDT_DCT, FLIPDDT_DCT, TXTP_MASK_DCT_HOR #define TXTP_MASK_DCT_ID_VER_W_DDT \ IDENTITY_DDT, IDENTITY_FLIPDDT, DCT_DDT, DCT_FLIPDDT, TXTP_MASK_DCT_ID_VER #define TXTP_MASK_DCT_ID_HOR_W_DDT \ DDT_IDENTITY, FLIPDDT_IDENTITY, DDT_DCT, FLIPDDT_DCT, TXTP_MASK_DCT_ID_HOR #define TXTP_MASK_DDT_NOID \ FLIPDDT_FLIPDDT, DDT_FLIPDDT, FLIPDDT_DDT, DDT_DDT, \ DCT_DDT, DDT_DCT, DCT_FLIPDDT, FLIPDDT_DCT #define TXTP_MASK_16x16_W_DDT TXTP_MASK_DDT_NOID, TXTP_MASK_16x16 #define TXTP_MASK_ALL_W_DDT_2D \ TXTP_MASK_DDT_NOID, DDT_IDENTITY, FLIPDDT_IDENTITY, \ IDENTITY_DDT, IDENTITY_FLIPDDT, TXTP_MASK_ALL #define TXTP_MASK_ALL_W_DDT_HOR \ ADST_DDT, ADST_FLIPDDT, DCT_DDT, DCT_FLIPDDT, FLIPADST_DDT, \ FLIPADST_FLIPDDT, IDENTITY_DDT, IDENTITY_FLIPDDT, TXTP_MASK_ALL #define TXTP_MASK_ALL_W_DDT_VER \ DDT_ADST, FLIPDDT_ADST, DDT_DCT, FLIPDDT_DCT, DDT_FLIPADST, \ FLIPDDT_FLIPADST, DDT_IDENTITY, FLIPDDT_IDENTITY, TXTP_MASK_ALL static const uint8_t valid_txtp_per_txsz[N_RECT_TX_SIZES][30] = { [TX_4X4] = { TXTP_MASK_ALL_LOSSLESS }, [TX_8X8] = { TXTP_MASK_ALL_W_DDT_2D }, [TX_16X16] = { TXTP_MASK_16x16_W_DDT }, [TX_32X32] = { TXTP_MASK_DCT_ID_ONLY }, [TX_64X64] = { TXTP_MASK_DCT_ONLY }, [RTX_4X8] = { TXTP_MASK_ALL_W_DDT_VER }, [RTX_8X4] = { TXTP_MASK_ALL_W_DDT_HOR }, [RTX_8X16] = { TXTP_MASK_ALL_W_DDT_2D }, [RTX_16X8] = { TXTP_MASK_ALL_W_DDT_2D }, [RTX_16X32] = { TXTP_MASK_DCT_ID_VER_W_DDT }, [RTX_32X16] = { TXTP_MASK_DCT_ID_HOR_W_DDT }, [RTX_32X64] = { TXTP_MASK_DCT_ONLY }, [RTX_64X32] = { TXTP_MASK_DCT_ONLY }, [RTX_4X16] = { TXTP_MASK_ALL_W_DDT_VER }, [RTX_16X4] = { TXTP_MASK_ALL_W_DDT_HOR }, [RTX_8X32] = { TXTP_MASK_DCT_ID_VER_W_DDT }, [RTX_32X8] = { TXTP_MASK_DCT_ID_HOR_W_DDT }, [RTX_16X64] = { TXTP_MASK_DCT_VER_W_DDT }, [RTX_64X16] = { TXTP_MASK_DCT_HOR_W_DDT }, [RTX_4X32] = { TXTP_MASK_DCT_ID_VER }, [RTX_32X4] = { TXTP_MASK_DCT_ID_HOR }, [RTX_8X64] = { TXTP_MASK_DCT_VER_W_DDT }, [RTX_64X8] = { TXTP_MASK_DCT_HOR_W_DDT }, [RTX_4X64] = { 
TXTP_MASK_DCT_VER }, [RTX_64X4] = { TXTP_MASK_DCT_HOR }, }; static void check_itxfm_add(const Dav2dInvTxfmDSPContext *const c, const enum RectTxfmSize tx) { ALIGN_STK_64(coef, coeff, 2, [32 * 32]); PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 }; const int w = dav2d_txfm_dimensions[tx].w * 4; const int h = dav2d_txfm_dimensions[tx].h * 4; const int sw = imin(w, 32), sh = imin(h, 32); const int subsh_max = subsh_iters[imax(dav2d_txfm_dimensions[tx].lw, dav2d_txfm_dimensions[tx].lh)]; pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff, enum TxfmType txtp, int eob HIGHBD_DECL_SUFFIX); /* Always using the largest possible coef_max just results in * most of the output being clipped to either 0 or bitdepth_max. * Randomize the range a bit to cover more scenarios. */ #if BITDEPTH == 16 const int bpc = (rnd() & 1) ? 10 : 12; #else const int bpc = 8; #endif const int coef_max = (1 << ((rnd() % (bpc + 5)) + 4)) - 1; const int bitdepth_max = (1 << bpc) - 1; for (int txtp_idx = 0; valid_txtp_per_txsz[tx][txtp_idx] != 0xff; txtp_idx++) { const enum TxfmType txtp = valid_txtp_per_txsz[tx][txtp_idx]; const enum Tx1dType hor1d = txtp & 0x7, ver1d = txtp >> 5; const int is_inv = ((txtp >> 3) & 0x3) == TX_CLASS_2D_INV; const int n_dpcm = 1 + 2 * (txtp == WHT_WHT || (hor1d == IDENTITY && ver1d == IDENTITY)); for (int dpcm = 0; dpcm < n_dpcm; dpcm++) for (int subsh = !!txtp; subsh < subsh_max; subsh++) if (check_func(c->itxfm_add[tx], "inv_txfm_add_%dx%d_%s_%s%s%s_%d_%dbpc", w, h, itx_1d_names[hor1d], itx_1d_names[ver1d], is_inv ? 
"_inv" : "", dpcm_names[dpcm], subsh, BITDEPTH)) { int max_eob; const int eob = generate_coefs(coeff[0], tx, txtp, sw, sh, subsh, &max_eob, coef_max); memcpy(coeff[1], coeff[0], sizeof(*coeff)); CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); for (int y = 0; y < h; y++) for (int x = 0; x < w; x++) c_dst[y*PXSTRIDE(c_dst_stride) + x] = u_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; call_ref(c_dst, c_dst_stride, coeff[0], txtp | (dpcm << 8), eob HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, coeff[1], txtp | (dpcm << 8), eob HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst")) { fprintf(stderr, "eob = %d\n", eob); } if (memcmp(coeff[0], coeff[1], sizeof(*coeff))) fail(); if ((hor1d <= ADST && ver1d <= ADST) || txtp == WHT_WHT) bench_new(alternate(c_dst, a_dst), a_dst_stride, alternate(coeff[0], coeff[1]), txtp | (dpcm << 8), max_eob HIGHBD_TAIL_SUFFIX); } } } static void check_cctx(const Dav2dInvTxfmDSPContext *const c) { ALIGN_STK_64(coef, c_coef, 2, [33 * 32]); ALIGN_STK_64(coef, a_coef, 2, [33 * 32]); declare_func(void, coef *u, coef *v, const int16_t angle[3], size_t sz HIGHBD_DECL_SUFFIX); if (check_func(c->cctx, "cctx_%dbpc", BITDEPTH)) { #if BITDEPTH == 16 const int bpc = (rnd() & 1) ? 
10 : 12; const int bitdepth_max = (1 << bpc) - 1; #else const int bpc = 8; #endif const int coef_max = (1 << (bpc + 8)) - 1; const int coef_sign = (coef_max + 1) >> 1; for (int sz = 4 * 4; sz <= 32 * 32; sz <<= 1) { const int16_t *const angle = dav2d_cctx_angle[rnd() % 6]; for (int i = 0; i <= sz; i++) { c_coef[0][i] = (rnd() & coef_max) - coef_sign; c_coef[1][i] = (rnd() & coef_max) - coef_sign; } /* +1 to check for buffer overwrite */ const size_t mem_sz = sizeof(coef) * (sz + 1); memcpy(a_coef[0], c_coef[0], mem_sz); memcpy(a_coef[1], c_coef[1], mem_sz); call_ref(c_coef[0], c_coef[1], angle, sz HIGHBD_TAIL_SUFFIX); call_new(a_coef[0], a_coef[1], angle, sz HIGHBD_TAIL_SUFFIX); if (memcmp(c_coef[0], a_coef[0], mem_sz) || memcmp(c_coef[1], a_coef[1], mem_sz)) { fail(); } if (sz == 16 * 16) bench_new(alternate(c_coef[0], a_coef[0]), alternate(c_coef[1], a_coef[1]), angle, sz HIGHBD_TAIL_SUFFIX); } } report("cctx"); } void bitfn(checkasm_check_itx)(void) { static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = { // tx4 TX_4X4, // tx8 RTX_4X8, RTX_8X4, TX_8X8, // tx16 RTX_4X16, RTX_16X4, RTX_8X16, RTX_16X8, TX_16X16, // tx32 RTX_4X32, RTX_32X4, RTX_8X32, RTX_32X8, RTX_16X32, RTX_32X16, TX_32X32, // tx64 RTX_4X64, RTX_64X4, RTX_8X64, RTX_64X8, RTX_16X64, RTX_64X16, RTX_32X64, RTX_64X32, TX_64X64, }; Dav2dInvTxfmDSPContext c; bitfn(dav2d_itx_dsp_init)(&c); const uint8_t *txfm = txfm_size_order; for (int i = 0; i < 5; i++) { for (int j = 0; j <= i * 2; j++) check_itxfm_add(&c, *txfm++); report("add_tx%d", 4 << i); } assert(txfm == &txfm_size_order[N_RECT_TX_SIZES]); check_cctx(&c); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/looprestoration.c000066400000000000000000000133761517466257200264050ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include #include #include "src/levels.h" #include "src/looprestoration.h" #include "src/tables.h" static int to_binary(int x) { /* 0-15 -> 0000-1111 */ return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8); } static void init_tmp(pixel *buf, const ptrdiff_t stride, const int w, const int h, const int bitdepth_max) { const int noise_mask = bitdepth_max >> 4; const int x_off = rnd() & 7, y_off = rnd() & 7; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? 
bitdepth_max : 0) ^ (rnd() & noise_mask); } buf += PXSTRIDE(stride); } } #if 0 static void check_wiener(Dav2dLoopRestorationDSPContext *const c, const int bpc) { PIXEL_RECT(c_dst, 384, 64); PIXEL_RECT(a_dst, 384, 64); PIXEL_RECT(h_edge, 384, 8); pixel left[64][4]; LooprestorationParams params; int16_t (*const filter)[8] = params.filter; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*const left)[4], const pixel *lpf, int w, int h, const LooprestorationParams *params, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); for (int t = 0; t < 2; t++) { if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) { filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5; filter[0][1] = filter[0][5] = (rnd() & 31) - 23; filter[0][2] = filter[0][4] = (rnd() & 63) - 17; filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2; #if BITDEPTH != 8 filter[0][3] += 128; #endif filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5; filter[1][1] = filter[1][5] = (rnd() & 31) - 23; filter[1][2] = filter[1][4] = (rnd() & 63) - 17; filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2; const int base_w = 1 + (rnd() % 384); const int base_h = 1 + (rnd() & 63); const int bitdepth_max = (1 << bpc) - 1; CLEAR_PIXEL_RECT(c_dst); /* We potentially read 3 pixels to the left of the input * pointer, and up to the max width, horizontally. * (In the case of LR_HAVE_RIGHT we read 3 pixels past the * input dimensions, but if LR_HAVE_RIGHT we have w == 256.) * Therefore, initialize (384+4) x 64 pixels. */ init_tmp(c_dst - 4, c_dst_stride, 384 + 4, 64, bitdepth_max); init_tmp(h_edge - 4, h_edge_stride, 384 + 4, 8, bitdepth_max); init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max); for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) { const int w = edges & LR_HAVE_RIGHT ? 256 : base_w; const int h = edges & LR_HAVE_BOTTOM ? 
64 : base_h; assert(c_dst_stride == a_dst_stride); assert(c_dst_buf_h == a_dst_buf_h); memcpy(a_dst_buf, c_dst_buf, a_dst_stride * a_dst_buf_h); assert(c_dst_stride == h_edge_stride); call_ref(c_dst, c_dst_stride, left, h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); call_new(a_dst, a_dst_stride, left, h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride, w, h, "dst", 64, 1)) { fprintf(stderr, "size = %dx%d, edges = %04d\n", w, h, to_binary(edges)); break; } } bench_new(alternate(c_dst, a_dst), a_dst_stride, left, h_edge, 256, 64, ¶ms, 0xf HIGHBD_TAIL_SUFFIX); } } } #endif void bitfn(checkasm_check_looprestoration)(void) { #if 0 #if BITDEPTH == 16 const int bpc_min = 10, bpc_max = 12; #else const int bpc_min = 8, bpc_max = 8; #endif for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) { Dav2dLoopRestorationDSPContext c; bitfn(dav2d_loop_restoration_dsp_init)(&c, bpc); check_wiener(&c, bpc); } report("wiener"); #endif } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/mc.c000066400000000000000000001054761517466257200235440ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include #include "src/levels.h" #include "src/mc.h" static const char *const filter_names[] = { [DAV2D_FILTER_8TAP_REGULAR] = "regular", [DAV2D_FILTER_8TAP_SMOOTH] = "smooth", [DAV2D_FILTER_8TAP_SHARP] = "sharp", [DAV2D_FILTER_BILINEAR] = "bilinear", }; static const char *const mxy_names[] = { "0", "h", "v", "hv" }; static const char *const scaled_paths[] = { "", "_dy1", "_dy2" }; static void check_mc(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, (64 + 7) * (64 + 7),); PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); const pixel *src = src_buf + (64 + 7) * 3 + 3; const ptrdiff_t src_stride = (64 + 7) * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, int w, int h, int mx, int my HIGHBD_DECL_SUFFIX); for (int filter = 0; filter < DAV2D_N_FILTERS; filter++) for (int w = 2; w <= 64; w <<= 1) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int mxy = 0; mxy < 4; mxy++) if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc", filter_names[filter], w, mxy_names[mxy], BITDEPTH)) { for (int h = 2; h <= 64; h <<= 1) { const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0; const int my = (mxy & 2) ? 
rnd() % 15 + 1 : 0; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int i = 0; i < (64 + 7) * (64 + 7); i++) src_buf[i] = rnd() & bitdepth_max; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); if (filter == DAV2D_FILTER_8TAP_REGULAR || filter == DAV2D_FILTER_8TAP_SHARP || filter == DAV2D_FILTER_BILINEAR) { bench_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); } } } } report("mc"); } /* Generate worst case input in the topleft corner, randomize the rest */ static void generate_mct_input(pixel *buf, const int bitdepth_max) { static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 }; const int sign = -(rnd() & 1); for (int y = 0; y < 64 + 7; y++) for (int x = 0; x < 64 + 7; x++) *buf++ = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign) : rnd()) & bitdepth_max; } static void check_mct(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, (64 + 7) * (64 + 7),); ALIGN_STK_64(int16_t, c_tmp, 64 * 64,); ALIGN_STK_64(int16_t, a_tmp, 64 * 64,); const pixel *src = src_buf + (64 + 7) * 3 + 3; const ptrdiff_t src_stride = (64 + 7) * sizeof(pixel); const ptrdiff_t tmp_stride = 64; declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, int w, int h, int mx, int my HIGHBD_DECL_SUFFIX); for (int filter = 0; filter < DAV2D_N_FILTERS; filter++) for (int w = 4; w <= 64; w <<= 1) for (int mxy = 0; mxy < 4; mxy++) if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc", filter_names[filter], w, mxy_names[mxy], BITDEPTH)) for (int h = 4; h <= 64; h <<= 1) { const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0; const int my = (mxy & 2) ? 
rnd() % 15 + 1 : 0; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif generate_mct_input(src_buf, bitdepth_max); memset(c_tmp, 0x77, 64*64*sizeof(int16_t)); memset(a_tmp, 0x77, 64*64*sizeof(int16_t)); call_ref(c_tmp, tmp_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); call_new(a_tmp, tmp_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); checkasm_check(int16_t, c_tmp, tmp_stride * sizeof(*c_tmp), a_tmp, tmp_stride * sizeof(*a_tmp), w, h, "tmp"); if (filter == DAV2D_FILTER_8TAP_REGULAR || filter == DAV2D_FILTER_8TAP_SHARP || filter == DAV2D_FILTER_BILINEAR) { bench_new(a_tmp, w, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); } } report("mct"); } static void check_mc_scaled(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, (128 + 7) * (128 + 7),); PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); const pixel *src = src_buf + (128 + 7) * 3 + 3; const ptrdiff_t src_stride = (128 + 7) * sizeof(pixel); #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); for (int filter = 0; filter < DAV2D_N_FILTERS; filter++) for (int w = 2; w <= 64; w <<= 1) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int p = 0; p < 3; ++p) { if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc", filter_names[filter], w, scaled_paths[p], BITDEPTH)) { for (int h = 2; h <= 64; h <<= 1) { const int mx = rnd() % 1024; const int my = rnd() % 1024; const int dx = rnd() % 2048 + 1; const int dy = !p ? 
rnd() % 2048 + 1 : p << 10; // ystep=1.0 and ystep=2.0 paths for (int k = 0; k < (128 + 7) * (128 + 7); k++) src_buf[k] = rnd() & bitdepth_max; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); if (filter == DAV2D_FILTER_8TAP_REGULAR || filter == DAV2D_FILTER_BILINEAR) bench_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); } } } } report("mc_scaled"); } static void check_mct_scaled(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, (128 + 7) * (128 + 7),); ALIGN_STK_64(int16_t, c_tmp, 64 * 64,); ALIGN_STK_64(int16_t, a_tmp, 64 * 64,); const pixel *src = src_buf + (128 + 7) * 3 + 3; const ptrdiff_t src_stride = (128 + 7) * sizeof(pixel); const ptrdiff_t tmp_stride = 64; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); for (int filter = 0; filter < DAV2D_N_FILTERS; filter++) for (int w = 4; w <= 64; w <<= 1) for (int p = 0; p < 3; ++p) { if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc", filter_names[filter], w, scaled_paths[p], BITDEPTH)) { for (int h = 4; h <= 64; h <<= 1) { const int mx = rnd() % 1024; const int my = rnd() % 1024; const int dx = rnd() % 2048 + 1; const int dy = !p ? 
rnd() % 2048 + 1 : p << 10; // ystep=1.0 and ystep=2.0 paths for (int k = 0; k < (128 + 7) * (128 + 7); k++) src_buf[k] = rnd() & bitdepth_max; memset(c_tmp, 0x77, 64*64*sizeof(int16_t)); memset(a_tmp, 0x77, 64*64*sizeof(int16_t)); call_ref(c_tmp, tmp_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); call_new(a_tmp, tmp_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); checkasm_check(int16_t, c_tmp, tmp_stride * sizeof(*c_tmp), a_tmp, tmp_stride * sizeof(*a_tmp), w, h, "tmp"); if (filter == DAV2D_FILTER_8TAP_REGULAR || filter == DAV2D_FILTER_BILINEAR) bench_new(a_tmp, w, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); } } } report("mct_scaled"); } static void init_tmp(Dav2dMCDSPContext *const c, pixel *const buf, int16_t (*const tmp)[64 * 64], const int bitdepth_max) { for (int i = 0; i < 2; i++) { generate_mct_input(buf, bitdepth_max); c->mct[DAV2D_FILTER_8TAP_SHARP](tmp[i], 64, buf + (64 + 7) * 3 + 3, (64 + 7) * sizeof(pixel), 64, 64, 8, 8 HIGHBD_TAIL_SUFFIX); } } static void check_avg(Dav2dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [64 * 64]); PIXEL_RECT(c_dst, 64 + 7, 64 + 7); PIXEL_RECT(a_dst, 64, 64); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 64; w <<= 1) if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int h = 4; h <= 64; h <<= 1) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif init_tmp(c, c_dst, tmp, bitdepth_max); CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); } } report("avg"); } static void check_w_avg(Dav2dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [64 * 64]); PIXEL_RECT(c_dst, 64 + 7, 64 + 7); PIXEL_RECT(a_dst, 64, 64); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 64; w <<= 1) if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int h = 4; h <= 64; h <<= 1) { int weight = rnd() % 25 - 4; // -4..20 #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif init_tmp(c, c_dst, tmp, bitdepth_max); CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); } } report("w_avg"); } static void check_mask(Dav2dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [64 * 64]); PIXEL_RECT(c_dst, 64 + 7, 64 + 7); PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(uint8_t, mask, 64 * 64,); for (int i = 0; i < 64 * 64; i++) mask[i] = rnd() % 65; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h, const uint8_t *mask HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 64; w <<= 1) if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int h = 4; h <= 64; h <<= 1) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif init_tmp(c, c_dst, tmp, bitdepth_max); CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); } } report("mask"); } static void check_w_mask(Dav2dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [64 * 64]); PIXEL_RECT(c_dst, 64 + 7, 64 + 7); PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(uint8_t, c_mask, 64 * 64,); ALIGN_STK_64(uint8_t, a_mask, 64 * 64,); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h, uint8_t *mask, ptrdiff_t stride, int sign HIGHBD_DECL_SUFFIX); static const uint16_t ss[] = { 444, 422, 420 }; static const uint8_t ss_hor[] = { 0, 1, 1 }; static const uint8_t ss_ver[] = { 0, 0, 1 }; for (int i = 0; i < 3; i++) for (int w = 4; w <= 64; w <<= 1) if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w, BITDEPTH)) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int h = 4; h <= 64; h <<= 1) { int sign = rnd() & 1; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif init_tmp(c, c_dst, tmp, bitdepth_max); const ptrdiff_t mask_stride = (w == 64 && ss_hor[i] && rnd() & 1) ? 
w : w >> ss_hor[i]; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); memset(c_mask, 0x77, 64*64); memset(a_mask, 0x77, 64*64); call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, c_mask, mask_stride, sign HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, tmp[0], tmp[1], w, h, a_mask, mask_stride, sign HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); checkasm_check(uint8_t, c_mask, mask_stride, a_mask, mask_stride, w >> ss_hor[i], h >> ss_ver[i], "mask"); bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, a_mask, mask_stride, sign HIGHBD_TAIL_SUFFIX); } } report("w_mask"); } static void check_blend(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, tmp, 64 * 64,); PIXEL_RECT(c_dst, 64, 64); PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(uint8_t, mask, 64 * 64,); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h, const uint8_t *mask); for (int w = 4; w <= 64; w <<= 1) { if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH)) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; for (int h = 4; h <= 64; h <<= 1) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int i = 0; i < w * h; i++) { tmp[i] = rnd() & bitdepth_max; mask[i] = rnd() % 65; } CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); for (int y = 0; y < h; y++) for (int x = 0; x < w; x++) c_dst[y*PXSTRIDE(c_dst_stride) + x] = u_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; call_ref(c_dst, c_dst_stride, tmp, w, h, mask); call_new(u_dst, a_dst_stride, tmp, w, h, mask); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, w, h, "dst"); bench_new(alternate(c_dst, a_dst), a_dst_stride, tmp, w, h, mask); } } } report("blend"); } static void check_warp8x8(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, 15 * 15,); PIXEL_RECT(c_dst, 8, 8); PIXEL_RECT(a_dst, 16, 8); int16_t abcd[4]; const pixel *src = src_buf + 15 * 3 + 3; const ptrdiff_t src_stride = 15 * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride, const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX); if (check_func(c->warp8x8, "warp_8x8_%dbpc", BITDEPTH)) { const int mx = (rnd() & 0x6fff) - 0x3000; const int my = (rnd() & 0x6fff) - 0x3000; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif pixel *const u_dst = a_dst + 4; abcd[0] = (rnd() & 0x6fff) - 0x3000; abcd[1] = (rnd() & 0x37ff) - 0x1800; abcd[2] = (rnd() & 0x6fff) - 0x3000; abcd[3] = (rnd() & 0x6fff) - 0x3000; for (int i = 0; i < 15 * 15; i++) src_buf[i] = rnd() & bitdepth_max; CLEAR_PIXEL_RECT(c_dst); CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); call_new(u_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); checkasm_check_pixel_padded(c_dst, c_dst_stride, u_dst, a_dst_stride, 8, 8, "dst"); bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); } report("warp8x8"); } static void check_warp8x8t(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, 15 * 15,); ALIGN_STK_64(int16_t, c_tmp, 8 * 8,); ALIGN_STK_64(int16_t, a_tmp, 8 * 8,); int16_t abcd[4]; const pixel *src = src_buf + 15 * 3 + 3; const ptrdiff_t src_stride = 15 * sizeof(pixel); declare_func(void, int16_t *tmp, ptrdiff_t tmp_stride, const pixel *src, ptrdiff_t src_stride, const int16_t *abcd, int mx, int my HIGHBD_DECL_SUFFIX); if (check_func(c->warp8x8t, "warp_8x8t_%dbpc", BITDEPTH)) { const int mx = (rnd() & 0x1fff) - 0xa00; const int my = (rnd() & 0x1fff) - 0xa00; #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int i = 0; i < 4; i++) abcd[i] = (rnd() & 0x1fff) - 0xa00; for (int i = 0; i < 15 * 15; i++) src_buf[i] = rnd() & bitdepth_max; call_ref(c_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); call_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); checkasm_check(int16_t, c_tmp, 8 * sizeof(*c_tmp), a_tmp, 8 * sizeof(*a_tmp), 8, 8, "tmp"); bench_new(a_tmp, 8, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); } report("warp8x8t"); } enum EdgeFlags { HAVE_TOP = 1, HAVE_BOTTOM = 2, HAVE_LEFT = 4, HAVE_RIGHT = 8, }; static void random_offset_for_edge(int *const x, int *const y, const int bw, const int bh, int *const iw, int *const ih, const enum EdgeFlags edge) { #define set_off(edge1, edge2, pos, dim) \ *i##dim = edge & (HAVE_##edge1 | HAVE_##edge2) ? 96 : 1 + (rnd() % (b##dim - 2)); \ switch (edge & (HAVE_##edge1 | HAVE_##edge2)) { \ case HAVE_##edge1 | HAVE_##edge2: \ assert(b##dim <= *i##dim); \ *pos = rnd() % (*i##dim - b##dim + 1); \ break; \ case HAVE_##edge1: \ *pos = (*i##dim - b##dim) + 1 + (rnd() % (b##dim - 1)); \ break; \ case HAVE_##edge2: \ *pos = -(1 + (rnd() % (b##dim - 1))); \ break; \ case 0: \ assert(b##dim - 1 > *i##dim); \ *pos = -(1 + (rnd() % (b##dim - *i##dim - 1))); \ break; \ } set_off(LEFT, RIGHT, x, w); set_off(TOP, BOTTOM, y, h); } static void check_emuedge(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, c_dst, (64 + 7) * 128,); ALIGN_STK_64(pixel, a_dst, (64 + 7) * 128,); ALIGN_STK_64(pixel, src, 96 * 96,); for (int i = 0; i < 96 * 96; i++) src[i] = rnd() & ((1U << BITDEPTH) - 1); declare_func(void, intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride); int x, y, iw, ih; for (int w = 4; w <= 64; w <<= 1) if (check_func(c->emu_edge, "emu_edge_w%d_%dbpc", w, BITDEPTH)) { for (int h = 4; h <= 64; h <<= 1) { // we skip 0xf, since it implies that we don't need 
emu_edge for (enum EdgeFlags edge = 0; edge < 0xf; edge++) { const int bw = w + (rnd() & 7); const int bh = h + (rnd() & 7); random_offset_for_edge(&x, &y, bw, bh, &iw, &ih, edge); call_ref(bw, bh, iw, ih, x, y, c_dst, 128 * sizeof(pixel), src, 96 * sizeof(pixel)); call_new(bw, bh, iw, ih, x, y, a_dst, 128 * sizeof(pixel), src, 96 * sizeof(pixel)); checkasm_check_pixel(c_dst, 128 * sizeof(pixel), a_dst, 128 * sizeof(pixel), bw, bh, "dst"); } } for (enum EdgeFlags edge = 1; edge < 0xf; edge <<= 1) { random_offset_for_edge(&x, &y, w + 7, w + 7, &iw, &ih, edge); bench_new(w + 7, w + 7, iw, ih, x, y, a_dst, 128 * sizeof(pixel), src, 96 * sizeof(pixel)); } } report("emu_edge"); } static void check_morph(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, c_dst, 64 * 64,); ALIGN_STK_64(pixel, a_dst, 64 * 64 + 4,); ALIGN_STK_64(pixel, src, 64 * 64,); declare_func(void, pixel *dst, ptrdiff_t dst_stride, int alpha, int beta, int w, int h HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 64; w <<= 1) { pixel *const u_dst = w == 64 ? a_dst : a_dst + 4; if (check_func(c->morph, "morph_w%d_%dbpc", w, BITDEPTH)) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int i = 0; i < 64 * 64; i++) src[i] = rnd() & bitdepth_max; int a, b; for (int h = 4; h <= 64; h <<= 1) { a = (rnd() % 1023) - 511; b = (rnd() & (bitdepth_max * 2)) - bitdepth_max; b += bitdepth_max * 0x80 - (a * bitdepth_max >> 1); pixel_copy(c_dst, src, 64 * h); pixel_copy(u_dst, src, 64 * h); call_ref(c_dst, 64 * sizeof(pixel), a, b, w, h HIGHBD_TAIL_SUFFIX); call_new(u_dst, 64 * sizeof(pixel), a, b, w, h HIGHBD_TAIL_SUFFIX); checkasm_check_pixel(c_dst, 64 * sizeof(pixel), u_dst, 64 * sizeof(pixel), w, h, "dst"); } bench_new(a_dst, 64 * sizeof(pixel), a, b, w, w HIGHBD_TAIL_SUFFIX); } } report("morph"); } static void check_sadrefinemv(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src1, 24 * 24,); ALIGN_STK_64(pixel, src2, 24 * 24,); declare_func(void, const pixel *p0, ptrdiff_t p0_stride, const pixel *p1, ptrdiff_t p1_stride, int w, int h, int is_implicit, struct OpflOffset *o HIGHBD_DECL_SUFFIX); for (int w = 8; w <= 16; w <<= 1) for (int h = 8; h <= 16; h <<= 1) if (check_func(c->sad_refine_mv, "sad_refine_mv_%dx%d_%dbpc", w, h, BITDEPTH)) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif const int is_impl = rnd() & 1; const int sw = w + 8, sh = h + 8; // semi-random input src1[0] = rnd() & bitdepth_max; const int range = 1 + (rnd() % (bitdepth_max >> 4)); const int drange = 2 * range; for (int x = 1; x < sw; x++) { src1[x] = iclip_pixel(src1[x - 1] + (rnd() % drange) - range); } for (int y = 1; y < sh; y++) { src1[y * sw] = iclip_pixel(src1[(y - 1) * sw] + (rnd() % drange) - range); for (int x = 1; x < sw; x++) { const int in1 = src1[(y - 1) * sw + x]; const int in2 = src1[y * sw + x - 1]; src1[y * sw + x] = iclip_pixel(((in1 + in2 + 1) >> 1) + (rnd() % drange) - range); } } // bias towards a particular outcome const int dx = (rnd() % 5) - 2; const int dy = (rnd() % 5) - 2; for (int y = 0; y < sh; y++) { const int yy = (dy + y + sh) % sh; for (int x = 0; x < sw; x++) { const int xx = (dx + x + sw) % sw; const int in = src1[yy * sw + xx]; src2[y * 24 + x] = iclip_pixel(in + (rnd() % drange) - range); } } struct OpflOffset c_o, a_o; call_ref(src1, sw * sizeof(pixel), src2, 24 * sizeof(pixel), w, h, is_impl, &c_o HIGHBD_TAIL_SUFFIX); call_new(src1, sw * sizeof(pixel), src2, 24 * sizeof(pixel), w, h, is_impl, &a_o HIGHBD_TAIL_SUFFIX); if (c_o.x != a_o.x || c_o.y != a_o.y) if (fail()) fprintf(stderr, "c_off=y:%d,x:%d != simd_off=y:%d,x:%d " "for input d=y:%d,x:%d @ size=%dx%d [r=%d]\n", c_o.y, c_o.x, a_o.y, a_o.x, dy, dx, w, h, range); bench_new(src1, sw * sizeof(pixel), src2, 24 * sizeof(pixel), w, h, is_impl, &a_o HIGHBD_TAIL_SUFFIX); } report("sad_refine_mv"); } static void check_opflrefinemv(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src1, 64 * 16,); ALIGN_STK_64(pixel, src2, 64 * 16,); declare_func(void, struct OpflRegressionData *out, const pixel *p0, ptrdiff_t p0_stride, const pixel *p1, ptrdiff_t p1_stride, int w, int h, int bs, const union aliasi16 d HIGHBD_DECL_SUFFIX); for (int bs = 4; bs <= 8; bs <<= 1) for (int w = 8; w <= (bs == 4 ? 
8 : 64); w <<= 1) for (int h = 8; h <= bs * 2; h <<= 1) if (check_func(c->opfl_derive_mv, "opfl_refine_mv_%dx%d_bs%d_%dbpc", w, h, bs, BITDEPTH)) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { src1[y * w + x] = rnd() & bitdepth_max; src2[y * 64 + x] = rnd() & bitdepth_max; } } int d0 = 1 + (rnd() & 1); int d1 = d0 == 2 ? 1 : 1 + (rnd() & 1); if (rnd() & 1) d0 = -d0; if (rnd() & 1) d1 = -d1; const union aliasi16 d = { .i8 = { d0, d1 } }; struct OpflRegressionData c_o[16], a_o[16]; call_ref(c_o, src1, w * sizeof(pixel), src2, 64 * sizeof(pixel), w, h, bs, d HIGHBD_TAIL_SUFFIX); call_new(a_o, src1, w * sizeof(pixel), src2, 64 * sizeof(pixel), w, h, bs, d HIGHBD_TAIL_SUFFIX); const int cnt = (w * h) / (bs * bs); for (int n = 0; n < cnt; n++) if (memcmp(&c_o[n], &a_o[n], sizeof(*a_o)) && fail()) fprintf(stderr, "c:u2=%d,uv=%d,v2=%d,uw=%d,vw=%d != " "simd:u2=%d,uv=%d,v2=%d,uw=%d,vw=%d " "for n=%d/%d,bs=%d,sz=%dx%d,d=%d|%d\n", c_o[n].su2, c_o[n].suv, c_o[n].sv2, c_o[n].suw, c_o[n].svw, a_o[n].su2, a_o[n].suv, a_o[n].sv2, a_o[n].suw, a_o[n].svw, n, cnt, bs, w, h, d0, d1); if (w == h || (w >= 32 && h == 16)) bench_new(c_o, src1, w * sizeof(pixel), src2, 64 * sizeof(pixel), w, h, bs, d HIGHBD_TAIL_SUFFIX); } report("opfl"); } static void check_sad8x8(Dav2dMCDSPContext *const c) { ALIGN_STK_64(pixel, src1, 8 * 8,); ALIGN_STK_64(pixel, src2, 8 * 16,); declare_func(unsigned, const pixel *p0, ptrdiff_t p0_stride, const pixel *p1, ptrdiff_t p1_stride HIGHBD_DECL_SUFFIX); if (check_func(c->sad8x8, "sad8x8_%dbpc", BITDEPTH)) { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else const int bitdepth_max = 0xff; #endif for (int y = 0; y < 8; y++) { for (int x = 0; x < 8; x++) { src1[y * 8 + x] = rnd() & bitdepth_max; src2[y * 16 + x] = rnd() & bitdepth_max; } } const unsigned c_o = call_ref(src1, 8 * sizeof(pixel), src2, 16 * sizeof(pixel) HIGHBD_TAIL_SUFFIX); const unsigned a_o = call_new(src1, 8 * sizeof(pixel), src2, 16 * sizeof(pixel) HIGHBD_TAIL_SUFFIX); if (c_o != a_o && fail()) fprintf(stderr, "c_sad:%u != simd_sad:%u\n", c_o, a_o); bench_new(src1, 8 * sizeof(pixel), src2, 16 * sizeof(pixel) HIGHBD_TAIL_SUFFIX); } report("sad8x8"); } void bitfn(checkasm_check_mc)(void) { Dav2dMCDSPContext c; bitfn(dav2d_mc_dsp_init)(&c); check_mc(&c); check_mct(&c); check_mc_scaled(&c); check_mct_scaled(&c); check_avg(&c); check_w_avg(&c); check_mask(&c); check_w_mask(&c); check_blend(&c); check_warp8x8(&c); check_warp8x8t(&c); check_emuedge(&c); check_morph(&c); check_sadrefinemv(&c); check_opflrefinemv(&c); check_sad8x8(&c); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/checkasm/msac.c000066400000000000000000000352531517466257200240630ustar00rootroot00000000000000/* * Copyright © 2019-2026, VideoLAN and dav2d authors * Copyright © 2019-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include "src/cpu.h" #include "src/msac.h" #include #include #include #define BUF_SIZE 128 /* The normal code doesn't use function pointers */ typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf, size_t n_symbols); typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf); typedef unsigned (*decode_bool_bypass_fn)(MsacContext *s); typedef unsigned (*decode_bools_bypass_fn)(MsacContext *s, unsigned n_bits); typedef unsigned (*decode_unary_bypass6_fn)(MsacContext *s, unsigned max_bits); typedef unsigned (*decode_unary_bypass21_fn)(MsacContext *s); typedef struct { decode_symbol_adapt_fn decode_symbol_adapt4; decode_symbol_adapt_fn decode_symbol_adapt8; decode_adapt_fn decode_bool_adapt; decode_bool_bypass_fn decode_bool_bypass; decode_bools_bypass_fn decode_bools_bypass; decode_unary_bypass6_fn decode_unary_bypass6; decode_unary_bypass21_fn decode_unary_bypass21; } MsacDSPContext; static void randomize_cdf(uint16_t *const cdf, int n) { for (int i = n + 1; i < 8; i++) cdf[i] = 0; // padding cdf[n] = (rnd() % 125) << 8; // para if (rnd() & 1) cdf[--n] = 0; // explicitly test the zero edge case for (int c = 0; n > 0; n--) { c += rnd() % (32768 - c - n) + 1; cdf[n - 1] = c; } } /* memcmp() on structs can have weird behavior due to padding etc. 
*/ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) { return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end || a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt || a->allow_update_cdf != b->allow_update_cdf; } static void msac_dump(unsigned c_res, unsigned a_res, const MsacContext *const a, const MsacContext *const b, const uint16_t *const cdf_a, const uint16_t *const cdf_b, const int num_cdf) { if (c_res != a_res) fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res); if (a->buf_pos != b->buf_pos) fprintf(stderr, "buf_pos %p vs %p\n", a->buf_pos, b->buf_pos); if (a->buf_end != b->buf_end) fprintf(stderr, "buf_end %p vs %p\n", a->buf_end, b->buf_end); if (a->dif != b->dif) fprintf(stderr, "dif %016"PRIx64" vs %016"PRIx64"\n", a->dif, b->dif); if (a->rng != b->rng) fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng); if (a->cnt != b->cnt) fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt); if (a->allow_update_cdf != b->allow_update_cdf) fprintf(stderr, "allow_update_cdf %d vs %d\n", a->allow_update_cdf, b->allow_update_cdf); if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) { fprintf(stderr, "cdf:\n"); for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %5u", cdf_a[i]); fprintf(stderr, "\n"); for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %5u", cdf_b[i]); fprintf(stderr, "\n"); for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 
'x' : '.'); fprintf(stderr, "\n"); } } #define CHECK_SYMBOL_ADAPT(n) do { \ if (check_func(c->decode_symbol_adapt##n, \ "msac_decode_symbol_adapt%d", n)) \ { \ for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \ for (int ns = 1; ns < n; ns++) { \ dav2d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ s_a = s_c; \ randomize_cdf(cdf[0], ns); \ memcpy(cdf[1], cdf[0], sizeof(*cdf)); \ while (s_c.cnt >= 0) { \ unsigned c_res = call_ref(&s_c, cdf[0], ns); \ unsigned a_res = call_new(&s_a, cdf[1], ns); \ if (c_res != a_res || msac_cmp(&s_c, &s_a) || \ memcmp(cdf[0], cdf[1], sizeof(**cdf) * (ns + 1))) \ { \ if (fail()) { \ fprintf(stderr, "n_symbols = %d\n", ns); \ msac_dump(c_res, a_res, &s_c, &s_a, \ cdf[0], cdf[1], ns); \ } \ break; \ } \ } \ if (cdf_update && ns == n - 1) \ bench_new(alternate(&s_c, &s_a), \ alternate(cdf[0], cdf[1]), ns); \ } \ } \ } \ } while (0) static void check_decode_symbol_adapt(MsacDSPContext *const c, uint8_t *const buf) { ALIGN_STK_16(uint16_t, cdf, 2, [8]); MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols); CHECK_SYMBOL_ADAPT(4); CHECK_SYMBOL_ADAPT(8); } static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) { MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s, uint16_t *cdf); if (check_func(c->decode_bool_adapt, "msac_decode_bool_adapt")) { uint16_t cdf[2][2]; for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { dav2d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); s_a = s_c; cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1; cdf[0][1] = cdf[1][1] = (rnd() % 125) << 8; while (s_c.cnt >= 0) { unsigned c_res = call_ref(&s_c, cdf[0]); unsigned a_res = call_new(&s_a, cdf[1]); if (c_res != a_res || msac_cmp(&s_c, &s_a) || memcmp(cdf[0], cdf[1], sizeof(*cdf))) { if (fail()) msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1); break; } } if (cdf_update) bench_new(alternate(&s_c, &s_a), alternate(cdf[0], cdf[1])); } } } static unsigned generate_bypass_rng(const 
/* Checks c->decode_bool_bypass against the reference implementation.
 *
 * Before every call both contexts get the same rng value from
 * generate_bypass_rng(); the return value and the resulting contexts must
 * then match. The loop ends once the reference context's cnt goes negative
 * or on the first mismatch. */
static void check_decode_bool_bypass(MsacDSPContext *const c, uint8_t *const buf) {
    MsacContext s_c, s_a;

    declare_func(unsigned, MsacContext *s);

    if (check_func(c->decode_bool_bypass, "msac_decode_bool_bypass")) {
        dav2d_msac_init(&s_c, buf, BUF_SIZE, 1);
        s_a = s_c;
        while (s_c.cnt >= 0) {
            s_a.rng = s_c.rng = generate_bypass_rng(&s_c);
            unsigned c_res = call_ref(&s_c);
            unsigned a_res = call_new(&s_a);
            if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
                if (fail())
                    msac_dump(c_res, a_res, &s_c, &s_a, NULL, NULL, 0);
                break;
            }
        }
        /* Benchmark setup: both contexts share the same (halved) dif but use
         * two different fixed rng values, and bench_new alternates between
         * them. */
        s_a.dif = s_c.dif >>= 1;
        s_c.rng = 0xfc92; // Somewhat arbitrarily chosen to produce
        s_a.rng = 0xdb6e; // a reasonably diverse branch pattern.
        bench_new(alternate(&s_c, &s_a));
    }
}
/* Entry point of the msac checks.
 *
 * Fills a local MsacDSPContext with the C implementations, replaces entries
 * with architecture-specific versions depending on dav2d_get_cpu_flags(),
 * and then runs the adapt and bypass checks on a buffer filled from rnd(). */
void checkasm_check_msac(void) {
    /* For performance reasons entropy decoding functions are called directly
     * instead of through function pointers. For testing purposes however we
     * do want to use function pointers. */
    MsacDSPContext c;
    c.decode_symbol_adapt4 = dav2d_msac_decode_symbol_adapt_c;
    c.decode_symbol_adapt8 = dav2d_msac_decode_symbol_adapt_c;
    c.decode_bool_adapt = dav2d_msac_decode_bool_adapt_c;
    c.decode_bool_bypass = dav2d_msac_decode_bool_bypass_c;
    c.decode_bools_bypass = dav2d_msac_decode_bools_bypass_c;
    c.decode_unary_bypass6 = dav2d_msac_decode_unary_bypass_c;
    c.decode_unary_bypass21 = dav2d_msac_decode_unary_bypass21_c;

#if HAVE_ASM
#if ARCH_AARCH64
    /* The NEON branch leaves decode_symbol_adapt4/8 on the C versions. */
    if (dav2d_get_cpu_flags() & DAV2D_ARM_CPU_FLAG_NEON) {
        c.decode_bool_adapt = dav2d_msac_decode_bool_adapt_neon;
        c.decode_bool_bypass = dav2d_msac_decode_bool_bypass_neon;
        c.decode_bools_bypass = dav2d_msac_decode_bools_bypass_neon;
        c.decode_unary_bypass6 = dav2d_msac_decode_unary_bypass_neon;
        c.decode_unary_bypass21 = dav2d_msac_decode_unary_bypass21_neon;
    }
#elif ARCH_X86_64
    if (dav2d_get_cpu_flags() & DAV2D_X86_CPU_FLAG_SSE2) {
        c.decode_symbol_adapt4 = dav2d_msac_decode_symbol_adapt4_sse2;
        c.decode_symbol_adapt8 = dav2d_msac_decode_symbol_adapt8_sse2;
        c.decode_bool_adapt = dav2d_msac_decode_bool_adapt_sse2;
        c.decode_bool_bypass = dav2d_msac_decode_bool_bypass_sse2;
        c.decode_bools_bypass = dav2d_msac_decode_bools_bypass_sse2;
    }
    if (dav2d_get_cpu_flags() & DAV2D_X86_CPU_FLAG_AVX2) {
        c.decode_unary_bypass6 = dav2d_msac_decode_unary_bypass6_avx2;
        c.decode_unary_bypass21 = dav2d_msac_decode_unary_bypass21_avx2;
    }
    /* Checked after AVX2, so it overrides the AVX2 unary_bypass21 entry. */
    if (dav2d_get_cpu_flags() & DAV2D_X86_CPU_FLAG_AVX512ICL) {
        c.decode_unary_bypass21 = dav2d_msac_decode_unary_bypass21_avx512icl;
    }
#endif
#endif

    /* Random input bitstream shared by all checks below. */
    uint8_t buf[BUF_SIZE];
    for (int i = 0; i < BUF_SIZE; i++)
        buf[i] = rnd();

    check_decode_adapt(&c, buf);
    check_decode_bypass(&c, buf);
}
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* Checks c->pal_idx_finish against the reference implementation.
 *
 * Block widths run over the powers of two from 4 to 64; for each width the
 * heights run over the powers of two from max(bw / 4, 4) to min(bw * 4, 64).
 * One check_func() name is registered per width. */
static void check_pal_idx_finish(const Dav2dPalDSPContext *const c) {
    ALIGN_STK_64(uint8_t, src, 64 * 64,);
    ALIGN_STK_64(uint8_t, c_dst, 32 * 64,);
    ALIGN_STK_64(uint8_t, a_dst, 32 * 64,);

    declare_func(void, uint8_t *dst, const uint8_t *src,
                 int bw, int bh, int w, int h);

    for (int bw = 4; bw <= 64; bw <<= 1) {
        if (check_func(c->pal_idx_finish, "pal_idx_finish_w%d", bw)) {
            for (int bh = imax(bw / 4, 4); bh <= imin(bw * 4, 64); bh <<= 1) {
                /* bw and bh are powers of two >= 4, so masking with
                 * (size - 4) and adding 4 gives a multiple of 4 in
                 * [4, size]. */
                const int w = (rnd() & (bw - 4)) + 4;
                const int h = (rnd() & (bh - 4)) + 4;
                /* Destination rows are half as wide as source rows
                 * (presumably two indices are packed per byte -- confirm
                 * against the pal_idx_finish implementation). */
                const int dst_bw = bw / 2;

                /* Source entries are limited to the range 0..7. */
                for (int i = 0; i < bw * bh; i++)
                    src[i] = rnd() & 7;

                /* Pre-fill both outputs with the same pattern so that bytes
                 * left untouched by either implementation still compare
                 * equal. */
                memset(c_dst, 0x88, dst_bw * bh);
                memset(a_dst, 0x88, dst_bw * bh);

                call_ref(c_dst, src, bw, bh, w, h);
                call_new(a_dst, src, bw, bh, w, h);
                checkasm_check(uint8_t, c_dst, dst_bw, a_dst, dst_bw,
                                        dst_bw, bh, "dst");

                /* Benchmark with w == bw and h == bh. */
                bench_new(a_dst, src, bw, bh, bw, bh);
            }
        }
    }

    report("pal_idx_finish");
}
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "tests/checkasm/internal.h" #include "common/attributes.h" #include "src/refmvs.h" #include #include #if 0 static inline int gen_mv(const int total_bits, int spel_bits) { int bits = rnd() & ((1 << spel_bits) - 1); do { bits |= (rnd() & 1) << spel_bits; } while (rnd() & 1 && ++spel_bits < total_bits); // the do/while makes it relatively more likely to be close to zero (fpel) // than far away return rnd() & 1 ? 
/* Maps a bucket index to the smallest value of that bucket.
 *
 * Indices are grouped in runs of nine; each run multiplies the step by ten:
 *   idx  0.. 9 -> idx
 *   idx 10..18 -> (idx -  9) * 10
 *   idx 19..27 -> (idx - 18) * 100
 *   idx 28..36 -> (idx - 27) * 1000
 *   idx 37..   -> (idx - 36) * 10000
 */
static inline int get_min_mv_val(const int idx) {
    int base = 0;
    int step = 1;

    /* Advance one decade at a time until idx falls inside the current run;
     * the last decade (base == 36) is open-ended. */
    while (base < 36 && idx > base + 9) {
        base += 9;
        step *= 10;
    }
    return (idx - base) * step;
}
/* Draws a random run length from a fixed, non-uniform distribution.
 *
 * One rnd() value selects a bucket. A bucket either yields a fixed length
 * (1, 2, 3, 4, 8, 16, 32, 48, 64 or 240) or asks for a second rnd() value
 * that picks a length in [1, upto]. */
static inline int get_seqlen(void) {
    /* Cumulative bucket limits over prob in [0, 100000). When `exact` is 0
     * the length is 1 + rnd() % upto instead. */
    static const struct {
        int below, exact, upto;
    } buckets[] = {
        { 25000,  1,   0 },
        { 63000,  2,   0 },
        { 65500,  3,   0 },
        { 84500,  4,   0 },
        { 88000,  0,   7 },
        { 93500,  8,   0 },
        { 95500,  0,  15 },
        { 97370, 16,   0 },
        { 98230,  0,  31 },
        { 98630, 32,   0 },
        { 98810,  0,  47 },
        { 99010, 48,   0 },
        { 99077,  0,  63 },
        { 99242, 64,   0 },
        { 99720,  0, 239 },
    };
    const int prob = rnd() % 100000;

    for (unsigned i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++) {
        if (prob >= buckets[i].below)
            continue;
        if (buckets[i].exact)
            return buckets[i].exact;
        return 1 + rnd() % buckets[i].upto;
    }
    /* Everything at or above the last limit. */
    return 240;
}
rp_ref_sz = rf.ih8 * rf.rp_stride * sizeof(refmvs_temporal_block); declare_func(void, const refmvs_frame *rf, int tile_row_idx, int col_start8, int col_end8, int row_start8, int row_end8); if (check_func(c->load_tmvs, "load_tmvs")) { const int row_start8 = (rnd() & 3) << 4; const int row_end8 = row_start8 + 16; const int col_start8 = rnd() & 31; const int col_end8 = rf.iw8 - (rnd() & 31); for (int n = 0; n < rf.n_mfmvs; n++) { rf.mfmv_ref[n] = rnd() % 7; rf.mfmv_ref2cur[n] = get_ref2cur(); for (int r = 0; r < 7; r++) rf.mfmv_ref2ref[n][r] = rnd() & 31; } for (int n = 0; n < rf.n_mfmvs; n++) { refmvs_temporal_block **p_rp_ref = &rp_ref[rf.mfmv_ref[n]]; if (!*p_rp_ref) *p_rp_ref = malloc(rp_ref_sz); } init_rp_ref(&rf, 0, rf.iw8, row_start8, row_end8); for (int i = 0; i < rf.iw8 * rf.ih8; i++) { c_rp_proj[i].mv.n = a_rp_proj[i].mv.n = 0xdeadbeef; c_rp_proj[i].ref = a_rp_proj[i].ref = 0xdd; } rf.n_tile_threads = 1; rf.rp_proj = c_rp_proj; call_ref(&rf, 0, col_start8, col_end8, row_start8, row_end8); rf.rp_proj = a_rp_proj; call_new(&rf, 0, col_start8, col_end8, row_start8, row_end8); for (int i = 0; i < rf.ih8; i++) for (int j = 0; j < rf.iw8; j++) if (c_rp_proj[i * rf.iw8 + j].mv.n != a_rp_proj[i * rf.iw8 + j].mv.n || (c_rp_proj[i * rf.iw8 + j].ref != a_rp_proj[i * rf.iw8 + j].ref && c_rp_proj[i * rf.iw8 + j].mv.n != INVALID_MV)) { if (fail()) { fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n", i, j, c_rp_proj[i * rf.iw8 + j].mv.x, a_rp_proj[i * rf.iw8 + j].mv.x); fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 0x%x\n", i, j, c_rp_proj[i * rf.iw8 + j].mv.y, a_rp_proj[i * rf.iw8 + j].mv.y); fprintf(stderr, "[%d][%d] c_rp.ref = %u a_rp.ref = %u\n", i, j, c_rp_proj[i * rf.iw8 + j].ref, a_rp_proj[i * rf.iw8 + j].ref); } } if (checkasm_bench_func()) { for (int n = 0; n < rf.n_mfmvs; n++) { rf.mfmv_ref2cur[n] = 1; for (int r = 0; r < 7; r++) rf.mfmv_ref2ref[n][r] = 1; } bench_new(&rf, 0, 0, rf.iw8, row_start8, row_end8); } for (int n = 0; n < 
rf.n_mfmvs; n++) { free(rp_ref[rf.mfmv_ref[n]]); rp_ref[rf.mfmv_ref[n]] = NULL; } } report("load_tmvs"); } static void check_save_tmvs(const Dav2dRefmvsDSPContext *const c) { refmvs_block *rr[31]; refmvs_block r[31 * 256]; ALIGN_STK_64(refmvs_temporal_block, c_rp, 128 * 16,); ALIGN_STK_64(refmvs_temporal_block, a_rp, 128 * 16,); uint8_t ref_sign[7]; for (int i = 0; i < 31; i++) rr[i] = &r[i * 256]; declare_func(void, refmvs_temporal_block *rp, const ptrdiff_t stride, refmvs_block *const *const rr, const uint8_t *const ref_sign, int col_end8, int row_end8, int col_start8, int row_start8); if (check_func(c->save_tmvs, "save_tmvs")) { const int row_start8 = rnd() & 7; const int row_end8 = 8 + (rnd() & 7); const int col_start8 = rnd() & 31; const int col_end8 = 96 + (rnd() & 31); for (int i = 0; i < 7; i++) ref_sign[i] = rnd() & 1; for (int i = row_start8; i < row_end8; i++) for (int j = col_start8; j < col_end8;) { int bs = rnd() % N_BS_SIZES; while (j + ((dav2d_block_dimensions[bs][0] + 1) >> 1) > col_end8) bs++; rr[i * 2][j * 2 + 1] = (refmvs_block) { .mv.mv[0].x = gen_mv(14, 10), .mv.mv[0].y = gen_mv(14, 10), .mv.mv[1].x = gen_mv(14, 10), .mv.mv[1].y = gen_mv(14, 10), .ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 }, .bs = bs }; for (int k = 0; k < (dav2d_block_dimensions[bs][0] + 1) >> 1; k++, j++) { c_rp[i * 128 + j].mv.n = 0xdeadbeef; c_rp[i * 128 + j].ref.pair = 0xdead; } } call_ref(c_rp + row_start8 * 128, 128, rr, ref_sign, col_end8, row_end8, col_start8, row_start8); call_new(a_rp + row_start8 * 128, 128, rr, ref_sign, col_end8, row_end8, col_start8, row_start8); for (int i = row_start8; i < row_end8; i++) for (int j = col_start8; j < col_end8; j++) if (c_rp[i * 128 + j].mv.n != a_rp[i * 128 + j].mv.n || c_rp[i * 128 + j].ref.pair != a_rp[i * 128 + j].ref.pair) { if (fail()) { fprintf(stderr, "[%d][%d] c_rp.mv.x = 0x%x a_rp.mv.x = 0x%x\n", i, j, c_rp[i * 128 + j].mv.mv[0].x, a_rp[i * 128 + j].mv.mv[0].x); fprintf(stderr, "[%d][%d] c_rp.mv.y = 0x%x a_rp.mv.y = 
/* Entry point of the refmvs checks.
 *
 * Only the DSP context initialisation is live: every individual check is
 * compiled out by the #if 0 below, matching the #if 0 around the check
 * functions themselves. */
void checkasm_check_refmvs(void) {
    Dav2dRefmvsDSPContext c;
    dav2d_refmvs_dsp_init(&c);

#if 0
    //check_load_tmvs(&c); // FIXME: causes integer overflows, disable until investigated
    check_save_tmvs(&c);
    check_splat_mv(&c);
#endif
}
# Print the command-line help to stderr and exit with status 1.
# The synopsis lists every option accepted by the getopts string
# ":d:a:g:c:t:j:w:f", including -f and -w.
usage() {
    NAME=$(basename "$0")
    {
        printf "Usage: %s [-f] [-d dav2d] [-a argondir] [-g \$filmgrain] [-c \$cpumask] [-t threads] [-j jobs] [-w tool] [DIRECTORY]...\n" "$NAME"
        printf "Example: %s -d /path/to/dav2d -a /path/to/argon/ -g 0 -c avx2 profile0_core\n" "$NAME"
        printf "Used to verify that dav2d can decode the Argon AV2 test vectors correctly.\n\n"
        printf " DIRECTORY one or more dirs in the argon folder to check against\n"
        printf " (default: everything except large scale tiles and stress files)\n"
        printf " -f fail fast\n"
        printf " -d dav2d path to dav2d executable (default: tools/dav2d)\n"
        printf " -a dir path to argon dir (default: 'tests/argon' if found; '.' otherwise)\n"
        printf " -g \$num enable filmgrain (default: 1)\n"
        printf " -c \$mask use restricted cpumask (default: -1)\n"
        printf " -t \$num number of threads per dav2d (default: 1)\n"
        printf " -j \$num number of parallel dav2d processes (default: 0)\n"
        printf " -w tool execute dav2d with a wrapper tool\n\n"
    } >&2
    exit 1
}
# Block while at least $JOBS background processes are tracked in pids.
# Each round refreshes pids/done_pids via check_pids, then either sleeps
# briefly (nothing finished) or reaps the finished processes with wait_pids.
block_pids() {
    while [ "${#pids[@]}" -ge "$JOBS" ]; do
        check_pids
        # ${#done_pids[@]} is the number of finished pids; without [@] the
        # expansion is the string length of element 0 instead.
        if [ "${#done_pids[@]}" -eq 0 ]; then
            sleep 0.2
        else
            wait_pids "${done_pids[@]}"
        fi
    done
}
# Verify every collected stream against its reference md5, either serially
# or with up to $JOBS decoder processes in the background.
failed=0
pids=()
for i in "${!files[@]}"; do
    f="${files[i]}"
    # The md5 file sits next to the streams directory; which sibling
    # directory is used depends on whether film grain is enabled.
    if [ "$FILMGRAIN" -eq 0 ]; then
        md5=${f/\/streams\//\/md5_no_film_grain\/}
    else
        md5=${f/\/streams\//\/md5_ref\/}
    fi
    md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}"
    # Keep only the text before the first space.
    md5=${md5/ */}

    printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
    # $WRAP is left unquoted on purpose so a wrapper with arguments splits
    # into separate words.
    cmd=($WRAP "$DAV2D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
    if [ "$JOBS" -gt 1 ]; then
        "${cmd[@]}" 2>/dev/null &
        p=$!
        pids+=("$p")
        # Remember which file this pid belongs to, for wait_pids.
        declare "file$p=${f#"$ARGON_DIR"/}"
        block_pids
    else
        if ! "${cmd[@]}" 2>/dev/null; then
            fail "${f#"$ARGON_DIR"/}"
        fi
    fi
done

wait_all_pids

if [ "$failed" -ne 0 ]; then
    printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files"
else
    printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files"
fi
printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"

# Exit statuses are truncated to 8 bits, so an unclamped count of 256
# failures would be reported as success (status 0). Clamp to 255.
exit "$(( failed > 255 ? 255 : failed ))"
/* Does nothing at run time. The only other content of this file is the
 * `#include DAV2D_TEST_HEADER` line, so this program presumably exists to
 * check that the header named by that macro compiles when included on its
 * own -- confirm against the build definition that sets DAV2D_TEST_HEADER. */
int main(void)
{
    return 0;
}
/* Threshold compared against rand() by the __wrap_* functions in this file:
 * a wrapped call fails when rand() returns less than this value (some
 * wrappers add a fixed offset on top). */
static int fail_probability;

/* Seeds rand() and stores the failure threshold.
 *
 * seed        - value passed to srand()
 * probability - requested threshold; it is halved repeatedly until it is
 *               below RAND_MAX before being stored
 */
void dav2d_setup_alloc_fail(unsigned seed, unsigned probability) {
    srand(seed);

    for (; probability >= RAND_MAX; probability >>= 1) {
        /* keep halving until the value is below RAND_MAX */
    }

    fail_probability = probability;
}
*restrict cond, const pthread_condattr_t *restrict attr) { if (rand() < (fail_probability + RAND_MAX/16)) return ENOMEM; return pthread_cond_init(cond, attr); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/libfuzzer/alloc_fail.h000066400000000000000000000031731517466257200254640ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_TESTS_LIBFUZZER_ALLOC_FAIL_H #define DAV2D_TESTS_LIBFUZZER_ALLOC_FAIL_H #include DAV2D_API void dav2d_setup_alloc_fail(unsigned seed, unsigned probability); #endif /* DAV2D_TESTS_LIBFUZZER_ALLOC_FAIL_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/libfuzzer/dav2d_fuzzer.c000066400000000000000000000135661517466257200260060ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include #include #include "src/cpu.h" #include "dav2d_fuzzer.h" #ifdef DAV2D_ALLOC_FAIL #include "alloc_fail.h" static unsigned djb_xor(const uint8_t * c, size_t len) { unsigned hash = 5381; for(size_t i = 0; i < len; i++) hash = hash * 33 ^ c[i]; return hash; } #endif static unsigned r32le(const uint8_t *const p) { return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0]; } #define DAV2D_FUZZ_MAX_SIZE 4096 * 4096 // search for "--cpumask xxx" in argv and remove both parameters int LLVMFuzzerInitialize(int *argc, char ***argv) { int i = 1; for (; i < *argc; i++) { if (!strcmp((*argv)[i], "--cpumask")) { const char * cpumask = (*argv)[i+1]; if (cpumask) { char *end; unsigned res; if (!strncmp(cpumask, "0x", 2)) { cpumask += 2; res = (unsigned) strtoul(cpumask, &end, 16); } else { res = (unsigned) strtoul(cpumask, &end, 0); } if (end != cpumask && !end[0]) { dav2d_set_cpu_flags_mask(res); } } break; } } for (; i < *argc - 2; i++) { (*argv)[i] = (*argv)[i + 2]; } *argc = i; return 0; } // expects ivf input int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { Dav2dSettings settings = { 0 }; Dav2dContext * ctx = NULL; Dav2dPicture pic; const uint8_t *ptr = data; int have_seq_hdr = 0; int err; dav2d_version(); if (size < 32) goto end; #ifdef DAV2D_ALLOC_FAIL unsigned h = djb_xor(ptr, 32); unsigned seed = h; unsigned probability = h > (RAND_MAX >> 5) ? 
RAND_MAX >> 5 : h; int max_frame_delay = (h & 0xf) + 1; int n_threads = ((h >> 4) & 0x7) + 1; if (max_frame_delay > 5) max_frame_delay = 1; if (n_threads > 3) n_threads = 1; #endif ptr += 32; // skip ivf header dav2d_default_settings(&settings); #ifdef DAV2D_MT_FUZZING settings.max_frame_delay = settings.n_threads = 4; #elif defined(DAV2D_ALLOC_FAIL) settings.max_frame_delay = max_frame_delay; settings.n_threads = n_threads; dav2d_setup_alloc_fail(seed, probability); #else settings.max_frame_delay = settings.n_threads = 1; #endif #if defined(DAV2D_FUZZ_MAX_SIZE) settings.frame_size_limit = DAV2D_FUZZ_MAX_SIZE; #endif err = dav2d_open(&ctx, &settings); if (err < 0) goto end; while (ptr <= data + size - 12) { Dav2dData buf; uint8_t *p; size_t frame_size = r32le(ptr); ptr += 12; if (frame_size > size || ptr > data + size - frame_size) break; if (!frame_size) continue; if (!have_seq_hdr) { Dav2dSequenceHeader seq; int err = dav2d_parse_sequence_header(&seq, ptr, frame_size); // skip frames until we see a sequence header if (err != 0) { ptr += frame_size; continue; } have_seq_hdr = 1; } // copy frame data to a new buffer to catch reads past the end of input p = dav2d_data_create(&buf, frame_size); if (!p) goto cleanup; memcpy(p, ptr, frame_size); ptr += frame_size; do { if ((err = dav2d_send_data(ctx, &buf)) < 0) { if (err != DAV2D_ERR(EAGAIN)) break; } for (;;) { memset(&pic, 0, sizeof(pic)); err = dav2d_get_picture(ctx, &pic); if (err == 0) { dav2d_picture_unref(&pic); } else if (err != DAV2D_ERR(EAGAIN)) { goto nested_break; } } } while (buf.sz > 0); nested_break: if (buf.sz > 0) dav2d_data_unref(&buf); } dav2d_send_data(ctx, NULL); memset(&pic, 0, sizeof(pic)); if ((err = dav2d_get_picture(ctx, &pic)) == 0) { /* Test calling dav2d_picture_unref() after dav2d_close() */ do { Dav2dPicture pic2 = { 0 }; if ((err = dav2d_get_picture(ctx, &pic2)) == 0) dav2d_picture_unref(&pic2); } while (err != DAV2D_EOF); dav2d_close(&ctx); dav2d_picture_unref(&pic); return 0; } 
cleanup: dav2d_close(&ctx); end: return 0; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/libfuzzer/dav2d_fuzzer.h000066400000000000000000000032641517466257200260050ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef DAV2D_TESTS_LIBFUZZER_DAV2D_FUZZER_H #define DAV2D_TESTS_LIBFUZZER_DAV2D_FUZZER_H #include #include int LLVMFuzzerInitialize(int *argc, char ***argv); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); #endif /* DAV2D_TESTS_LIBFUZZER_DAV2D_FUZZER_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/libfuzzer/main.c000066400000000000000000000060371517466257200243200ustar00rootroot00000000000000/* * Copyright © 2018, VideoLAN and dav2d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include #include #include #include #include "dav2d_fuzzer.h" // expects ivf input int main(int argc, char *argv[]) { int ret = -1; FILE *f = NULL; int64_t fsize; const char *filename = NULL; uint8_t *data = NULL; size_t size = 0; if (LLVMFuzzerInitialize(&argc, &argv)) { return 1; } if (argc != 2) { fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]); return -1; } filename = argv[1]; if (!(f = fopen(filename, "rb"))) { fprintf(stderr, "failed to open %s: %s\n", filename, strerror(errno)); goto error; } if (fseeko(f, 0, SEEK_END) == -1) { fprintf(stderr, "fseek(%s, 0, SEEK_END) failed: %s\n", filename, strerror(errno)); goto error; } if ((fsize = ftello(f)) == -1) { fprintf(stderr, "ftell(%s) failed: %s\n", filename, strerror(errno)); goto error; } rewind(f); if (fsize < 0 || fsize > INT_MAX) { fprintf(stderr, "%s is too large: %"PRId64"\n", filename, fsize); goto error; } size = (size_t)fsize; if (!(data = malloc(size))) { fprintf(stderr, "failed to allocate: %zu bytes\n", size); goto error; } if (fread(data, size, 1, f) == size) { fprintf(stderr, "failed to read %zu bytes from %s: %s\n", size, filename, strerror(errno)); goto error; } ret = LLVMFuzzerTestOneInput(data, size); error: free(data); if (f) fclose(f); return ret; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/libfuzzer/meson.build000066400000000000000000000077421517466257200253760ustar00rootroot00000000000000# Copyright © 2020, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. 
Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Build definition for the dav2d fuzzing binaries # if fuzzing_engine == 'none' and not have_fseeko subdir_done() endif dav2d_fuzzer_sources = files('dav2d_fuzzer.c') fuzzer_ldflags = [] fuzzer_link_lang = {} if get_option('fuzzer_ldflags') != '' fuzzer_ldflags += [get_option('fuzzer_ldflags')] endif if fuzzing_engine == 'none' dav2d_fuzzer_sources += files('main.c') elif fuzzing_engine == 'libfuzzer' fuzzer_ldflags += ['-fsanitize=fuzzer'] elif fuzzing_engine == 'oss-fuzz' # libFuzzingEngine needs c++ add_languages('cpp') fuzzer_link_lang = {'link_language': 'cpp'} endif dav2d_fuzzer = executable('dav2d_fuzzer', dav2d_fuzzer_sources, include_directories: dav2d_inc_dirs, link_args: fuzzer_ldflags, link_with : libdav2d, build_by_default: true, dependencies : [thread_dependency], kwargs: fuzzer_link_lang ) dav2d_fuzzer_mt = executable('dav2d_fuzzer_mt', dav2d_fuzzer_sources, include_directories: dav2d_inc_dirs, c_args: ['-DDAV2D_MT_FUZZING'], link_args: fuzzer_ldflags, link_with : libdav2d, 
build_by_default: true, dependencies : [thread_dependency], kwargs: fuzzer_link_lang ) objcopy = find_program('objcopy', required: false) if (objcopy.found() and not get_option('b_lto') and get_option('default_library') == 'static' and cc.has_function('posix_memalign', prefix : '#include ', args : test_args)) libdav2d_af = custom_target('libdav2d_af', input: libdav2d, output: 'libdav2d_af.a', depends: libdav2d, command: [objcopy, '--redefine-sym', 'malloc=__wrap_malloc', '--redefine-sym', 'posix_memalign=__wrap_posix_memalign', '--redefine-sym', 'pthread_create=__wrap_pthread_create', '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init', '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init', '@INPUT@', '@OUTPUT@']) dav2d_fuzzer_mem = executable('dav2d_fuzzer_mem', dav2d_fuzzer_sources + ['alloc_fail.c'], include_directories: dav2d_inc_dirs, c_args: ['-DDAV2D_ALLOC_FAIL'], link_args: fuzzer_ldflags + [join_paths(libdav2d_af.full_path())], link_depends: libdav2d_af, build_by_default: false, dependencies : [thread_dependency], kwargs: fuzzer_link_lang ) endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/meson.build000066400000000000000000000111701517466257200233500ustar00rootroot00000000000000# Copyright © 2018, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Build definition for the dav2d tests # # Leave subdir if tests are disabled if not get_option('enable_tests') subdir_done() endif checkasm_dependency = dependency('checkasm', fallback: ['checkasm', 'checkasm_dep'], required: false ) if is_asm_enabled and checkasm_dependency.found() checkasm_args = [ '-DCHECKASM_HAVE_GENERIC=@0@'.format(cdata.get('HAVE_C11_GENERIC')) ] checkasm_sources = files( 'checkasm/checkasm.c', 'checkasm/msac.c', 'checkasm/pal.c', 'checkasm/refmvs.c', ) checkasm_tmpl_sources = files( 'checkasm/cdef.c', 'checkasm/deblock.c', 'checkasm/filmgrain.c', 'checkasm/ipred.c', 'checkasm/itx.c', 'checkasm/looprestoration.c', 'checkasm/mc.c', ) checkasm_bitdepth_objs = [] foreach bitdepth : dav2d_bitdepths checkasm_bitdepth_lib = static_library( 'checkasm_bitdepth_@0@'.format(bitdepth), checkasm_tmpl_sources, include_directories: dav2d_inc_dirs, dependencies : [stdatomic_dependencies, checkasm_dependency], c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + checkasm_args, install: false, build_by_default: false, ) checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects(recursive: true) endforeach checkasm = executable('checkasm', checkasm_sources, objects: [ checkasm_bitdepth_objs, 
libdav2d.extract_all_objects(recursive: true), ], include_directories: dav2d_inc_dirs, build_by_default: false, dependencies: [ libm_dependency, checkasm_dependency, ], c_args: checkasm_args, ) test('checkasm', checkasm, suite: 'checkasm', timeout: 180) benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench') endif c99_extension_flag = cc.first_supported_argument( '-Werror=c11-extensions', '-Werror=c99-c11-compat', '-Wc11-extensions', '-Wc99-c11-compat', ) # dav2d_api_headers foreach header : dav2d_api_headers target = header + '_test' header_test_exe = executable(target, 'header_test.c', include_directories: dav2d_inc_dirs, c_args: ['-DDAV2D_TEST_HEADER="@0@"'.format(header), c99_extension_flag], build_by_default: true ) test(target, header_test_exe, suite: 'headers') endforeach # fuzzing binaries subdir('libfuzzer') # seek stress test binary, depends on dav2d cli tool if (get_option('enable_tools') and get_option('enable_seek_stress')) seek_stress_sources = files('seek_stress.c') seek_stress = executable('seek_stress', seek_stress_sources, rev_target, objects: [ dav2d.extract_objects('dav2d_cli_parse.c'), dav2d_input_objs.extract_objects('input/input.c', 'input/ivf.c'), ], include_directories: [dav2d_inc_dirs, include_directories('../tools')], link_with: libdav2d, dependencies: [ thread_dependency, rt_dependency, getopt_dependency, libm_dependency, ], ) endif # Include dav2d test data repository with additional tests if get_option('testdata_tests') subdir('dav2d-test-data') endif dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/seek_stress.c000066400000000000000000000171341517466257200237120ustar00rootroot00000000000000/* * Copyright © 2020, VideoLAN and dav2d authors * Copyright © 2020, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include "vcs_version.h" #include "cli_config.h" #include #include #include #include #include "dav2d/dav2d.h" #include "input/input.h" #include "input/demuxer.h" #include "dav2d_cli_parse.h" #define NUM_RAND_SEEK 3 #define NUM_REL_SEEK 4 #define NUM_END_SEEK 2 const Demuxer annexb_demuxer = { .name = "" }; const Demuxer section5_demuxer = { .name = "" }; #ifdef _WIN32 #include static unsigned get_seed(void) { return GetTickCount(); } #else #ifdef __APPLE__ #include #else #include #endif static unsigned get_seed(void) { #ifdef __APPLE__ return (unsigned) mach_absolute_time(); #elif HAVE_CLOCK_GETTIME struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec); #endif } #endif static uint32_t xs_state[4]; static void xor128_srand(unsigned seed) { xs_state[0] = seed; xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff); xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff); xs_state[3] = ~seed; } // xor128 from Marsaglia, George (July 2003). "Xorshift RNGs". // Journal of Statistical Software. 8 (14). // doi:10.18637/jss.v008.i14. 
static int xor128_rand(void) { const uint32_t x = xs_state[0]; const uint32_t t = x ^ (x << 11); xs_state[0] = xs_state[1]; xs_state[1] = xs_state[2]; xs_state[2] = xs_state[3]; uint32_t w = xs_state[3]; w = (w ^ (w >> 19)) ^ (t ^ (t >> 8)); xs_state[3] = w; return w >> 1; } static inline int decode_frame(Dav2dPicture *const p, Dav2dContext *const c, Dav2dData *const data) { int res; memset(p, 0, sizeof(*p)); if ((res = dav2d_send_data(c, data)) < 0) { if (res != DAV2D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV2D_ERR(res))); return res; } } if ((res = dav2d_get_picture(c, p)) < 0) { if (res != DAV2D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV2D_ERR(res))); return res; } } else dav2d_picture_unref(p); return 0; } static int decode_rand(DemuxerContext *const in, Dav2dContext *const c, Dav2dData *const data, const double fps) { int res = 0; Dav2dPicture p; const int num_frames = xor128_rand() % (int)(fps * 5); for (int i = 0; i < num_frames; i++) { if ((res = decode_frame(&p, c, data))) break; if (input_read(in, data) || data->sz == 0) break; } return res; } static int decode_all(DemuxerContext *const in, Dav2dContext *const c, Dav2dData *const data) { int res = 0; Dav2dPicture p; do { if ((res = decode_frame(&p, c, data))) break; } while (!input_read(in, data) && data->sz > 0); return res; } static int seek(DemuxerContext *const in, Dav2dContext *const c, const uint64_t pts, Dav2dData *const data) { int res; if ((res = input_seek(in, pts))) return res; Dav2dSequenceHeader seq; do { if ((res = input_read(in, data))) break; } while (dav2d_parse_sequence_header(&seq, data->data, data->sz)); dav2d_flush(c); return res; } int main(const int argc, char *const *const argv) { const char *version = dav2d_version(); if (strcmp(version, DAV2D_VERSION)) { fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n", version, DAV2D_VERSION); return EXIT_FAILURE; } CLISettings cli_settings; Dav2dSettings 
lib_settings; DemuxerContext *in; Dav2dContext *c; Dav2dData data; unsigned total, i_fps[2], i_timebase[2]; double timebase, spf, fps; uint64_t pts; xor128_srand(get_seed()); parse(argc, argv, &cli_settings, &lib_settings); if (input_open(&in, "ivf", cli_settings.inputfile, i_fps, &total, i_timebase) < 0 || !i_timebase[0] || !i_timebase[1] || !i_fps[0] || !i_fps[1]) { return EXIT_SUCCESS; } if (dav2d_open(&c, &lib_settings)) return EXIT_FAILURE; timebase = (double)i_timebase[1] / i_timebase[0]; spf = (double)i_fps[1] / i_fps[0]; fps = (double)i_fps[0] / i_fps[1]; if (fps < 1) goto end; #define FRAME_OFFSET_TO_PTS(foff) \ (uint64_t)llround(((foff) * spf) * 1000000000.0) #define TS_TO_PTS(ts) \ (uint64_t)llround(((ts) * timebase) * 1000000000.0) // seek at random pts for (int i = 0; i < NUM_RAND_SEEK; i++) { pts = FRAME_OFFSET_TO_PTS(xor128_rand() % total); if (seek(in, c, pts, &data)) continue; if (decode_rand(in, c, &data, fps)) goto end; } pts = TS_TO_PTS(data.m.timestamp); // seek left / right randomly with random intervals within 1s for (int i = 0, tries = 0; i - tries < NUM_REL_SEEK && tries < NUM_REL_SEEK / 2; i++) { const int sign = xor128_rand() & 1 ? 
-1 : +1; const float diff = (xor128_rand() % 100) / 100.f; int64_t new_pts = pts + sign * FRAME_OFFSET_TO_PTS(diff * fps); const int64_t new_ts = llround(new_pts / (timebase * 1000000000.0)); new_pts = TS_TO_PTS(new_ts); if (new_pts < 0 || (uint64_t)new_pts >= FRAME_OFFSET_TO_PTS(total)) { if (seek(in, c, FRAME_OFFSET_TO_PTS(total / 2), &data)) break; pts = TS_TO_PTS(data.m.timestamp); tries++; continue; } if (seek(in, c, new_pts, &data)) if (seek(in, c, 0, &data)) goto end; if (decode_rand(in, c, &data, fps)) goto end; pts = TS_TO_PTS(data.m.timestamp); } unsigned shift = 0; do { shift += 5; if (shift > total) shift = total; } while (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)); // simulate seeking after the end of the file for (int i = 0; i < NUM_END_SEEK; i++) { if (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)) goto end; if (decode_all(in, c, &data)) goto end; int num_flush = 1 + 64 + xor128_rand() % 64; while (num_flush--) dav2d_flush(c); } end: input_close(in); dav2d_close(&c); return EXIT_SUCCESS; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tests/test-md5.sh000077500000000000000000000010231517466257200232030ustar00rootroot00000000000000#!/bin/bash set -e DAV2D="$1" if [ ! -e "$DAV2D" ]; then echo $0 path/to/dav2d exit 1 fi SRC_PATH="$(dirname "$0")/.." 
for file in "$SRC_PATH"/media/*.obu; do base="$(basename "$file")" a_md5=$(cat "$file.md5") "$DAV2D" -i "$file" -o tmp.md5 --threads=1 --quiet --muxer=md5 --filmgrain=1 d_md5=$(cat tmp.md5) rm -f tmp.md5 if [[ "$a_md5" == "$d_md5" ]]; then echo $base "[OK] md5:$a_md5" else echo $base "[FAILED!]" diff <(echo "$a_md5") <(echo "$d_md5") fi done dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/000077500000000000000000000000001517466257200212045ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/compat/000077500000000000000000000000001517466257200224675ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/compat/getopt.c000066400000000000000000000364471517466257200241530ustar00rootroot00000000000000/* $OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $ */ /* $NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $ */ /* * Copyright (c) 2002 Todd C. Miller * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * Sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F39502-99-1-0512. */ /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation * by Dieter Baron and Thomas Klausner. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #ifdef _WIN32 #include #else #include #endif #define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ #ifdef REPLACE_GETOPT int opterr = 1; /* if error message should be printed */ int optind = 1; /* index into parent argv vector */ int optopt = '?'; /* character checked for validity */ #undef optreset /* see getopt.h */ #define optreset __mingw_optreset int optreset; /* reset getopt */ char *optarg; /* argument associated with option */ #endif #define PRINT_ERROR ((opterr) && (*options != ':')) #define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ #define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ #define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ /* return values */ #define BADCH (int)'?' #define BADARG ((*options == ':') ? (int)':' : (int)'?') #define INORDER (int)1 #ifdef __CYGWIN__ static char EMSG[] = ""; #else #define EMSG "" #endif static int getopt_internal(int, char * const *, const char *, const struct option *, int *, int); static int parse_long_options(char * const *, const char *, const struct option *, int *, int); static int gcd(int, int); static void permute_args(int, int, int, char * const *); static char *place = EMSG; /* option letter processing */ /* XXX: set optreset to 1 rather than these two */ static int nonopt_start = -1; /* first non option argument (for permute) */ static int nonopt_end = -1; /* first option after non options (for permute) */ /* Error messages */ static const char recargchar[] = "option requires an argument -- %c"; static const char recargstring[] = "option requires an argument -- %s"; static const char ambig[] = "ambiguous option -- %.*s"; static const char noarg[] = "option doesn't take an argument -- %.*s"; static const char illoptchar[] = "unknown option -- %c"; static const char illoptstring[] = "unknown option -- %s"; #ifdef _WIN32 #ifndef __CYGWIN__ #define __progname __argv[0] #else extern char 
__declspec(dllimport) *__progname; #endif static void _vwarnx(const char *fmt,va_list ap) { (void)fprintf(stderr,"%s: ",__progname); if (fmt != NULL) (void)vfprintf(stderr,fmt,ap); (void)fprintf(stderr,"\n"); } static void warnx(const char *fmt,...) { va_list ap; va_start(ap,fmt); _vwarnx(fmt,ap); va_end(ap); } #endif /* * Compute the greatest common divisor of a and b. */ static int gcd(int a, int b) { int c; c = a % b; while (c != 0) { a = b; b = c; c = a % b; } return (b); } /* * Exchange the block from nonopt_start to nonopt_end with the block * from nonopt_end to opt_end (keeping the same order of arguments * in each block). */ static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char * const *nargv) { int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; char *swap; /* * compute lengths of blocks and number and size of cycles */ nnonopts = panonopt_end - panonopt_start; nopts = opt_end - panonopt_end; ncycle = gcd(nnonopts, nopts); cyclelen = (opt_end - panonopt_start) / ncycle; for (i = 0; i < ncycle; i++) { cstart = panonopt_end+i; pos = cstart; for (j = 0; j < cyclelen; j++) { if (pos >= panonopt_end) pos -= nnonopts; else pos += nopts; swap = nargv[pos]; /* LINTED const cast */ ((char **) nargv)[pos] = nargv[cstart]; /* LINTED const cast */ ((char **)nargv)[cstart] = swap; } } } /* * parse_long_options -- * Parse long options in argc/argv argument vector. * Returns -1 if short_too is set and the option does not match long_options. 
*/ static int parse_long_options(char * const *nargv, const char *options, const struct option *long_options, int *idx, int short_too) { char *current_argv, *has_equal; size_t current_argv_len; int i, ambiguous, match; #define IDENTICAL_INTERPRETATION(_x, _y) \ (long_options[(_x)].has_arg == long_options[(_y)].has_arg && \ long_options[(_x)].flag == long_options[(_y)].flag && \ long_options[(_x)].val == long_options[(_y)].val) current_argv = place; match = -1; ambiguous = 0; optind++; if ((has_equal = strchr(current_argv, '=')) != NULL) { /* argument found (--option=arg) */ current_argv_len = has_equal - current_argv; has_equal++; } else current_argv_len = strlen(current_argv); for (i = 0; long_options[i].name; i++) { /* find matching long option */ if (strncmp(current_argv, long_options[i].name, current_argv_len)) continue; if (strlen(long_options[i].name) == current_argv_len) { /* exact match */ match = i; ambiguous = 0; break; } /* * If this is a known short option, don't allow * a partial match of a single character. 
*/ if (short_too && current_argv_len == 1) continue; if (match == -1) /* partial match */ match = i; else if (!IDENTICAL_INTERPRETATION(i, match)) ambiguous = 1; } if (ambiguous) { /* ambiguous abbreviation */ if (PRINT_ERROR) warnx(ambig, (int)current_argv_len, current_argv); optopt = 0; return (BADCH); } if (match != -1) { /* option found */ if (long_options[match].has_arg == no_argument && has_equal) { if (PRINT_ERROR) warnx(noarg, (int)current_argv_len, current_argv); /* * XXX: GNU sets optopt to val regardless of flag */ if (long_options[match].flag == NULL) optopt = long_options[match].val; else optopt = 0; return (BADARG); } if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument) { if (has_equal) optarg = has_equal; else if (long_options[match].has_arg == required_argument) { /* * optional argument doesn't use next nargv */ optarg = nargv[optind++]; } } if ((long_options[match].has_arg == required_argument) && (optarg == NULL)) { /* * Missing argument; leading ':' indicates no error * should be generated. */ if (PRINT_ERROR) warnx(recargstring, current_argv); /* * XXX: GNU sets optopt to val regardless of flag */ if (long_options[match].flag == NULL) optopt = long_options[match].val; else optopt = 0; --optind; return (BADARG); } } else { /* unknown option */ if (short_too) { --optind; return (-1); } if (PRINT_ERROR) warnx(illoptstring, current_argv); optopt = 0; return (BADCH); } if (idx) *idx = match; if (long_options[match].flag) { *long_options[match].flag = long_options[match].val; return (0); } else return (long_options[match].val); #undef IDENTICAL_INTERPRETATION } /* * getopt_internal -- * Parse argc/argv argument vector. Called by user level routines. 
*/
/*
 * Shared engine behind getopt(), getopt_long() and getopt_long_only().
 *
 * `flags` selects the behaviour: FLAG_PERMUTE moves non-options behind the
 * options, FLAG_ALLARGS (enabled by a leading '-' in `options`) returns
 * non-options in order via INORDER, and FLAG_LONGONLY lets single-dash
 * arguments be tried as long options first.  Scanning state is kept in the
 * file-scope variables place, nonopt_start and nonopt_end between calls.
 *
 * Returns the option character (or long-option result), BADCH/BADARG on
 * errors, INORDER for an in-order non-option, and -1 when parsing is done
 * or `options` is NULL.
 */
static int
getopt_internal(int nargc, char * const *nargv, const char *options,
	const struct option *long_options, int *idx, int flags)
{
	char *oli;				/* option letter list index */
	int optchar, short_too;
	static int posixly_correct = -1;

	if (options == NULL)
		return (-1);

	/*
	 * XXX Some GNU programs (like cvs) set optind to 0 instead of
	 * XXX using optreset.  Work around this braindamage.
	 */
	if (optind == 0)
		optind = optreset = 1;

	/*
	 * Disable GNU extensions if POSIXLY_CORRECT is set or options
	 * string begins with a '+'.
	 *
	 * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or
	 *                 optreset != 0 for GNU compatibility.
	 */
	if (posixly_correct == -1 || optreset != 0)
		posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
	if (*options == '-')
		flags |= FLAG_ALLARGS;
	else if (posixly_correct || *options == '+')
		flags &= ~FLAG_PERMUTE;
	/* skip the mode prefix so it is not treated as an option letter */
	if (*options == '+' || *options == '-')
		options++;

	optarg = NULL;
	if (optreset)
		nonopt_start = nonopt_end = -1;
start:
	if (optreset || !*place) {		/* update scanning pointer */
		optreset = 0;
		if (optind >= nargc) {          /* end of argument vector */
			place = EMSG;
			if (nonopt_end != -1) {
				/* do permutation, if we have to */
				permute_args(nonopt_start, nonopt_end,
				    optind, nargv);
				optind -= nonopt_end - nonopt_start;
			}
			else if (nonopt_start != -1) {
				/*
				 * If we skipped non-options, set optind
				 * to the first of them.
				 */
				optind = nonopt_start;
			}
			nonopt_start = nonopt_end = -1;
			return (-1);
		}
		if (*(place = nargv[optind]) != '-' ||
		    (place[1] == '\0' && strchr(options, '-') == NULL)) {
			place = EMSG;		/* found non-option */
			if (flags & FLAG_ALLARGS) {
				/*
				 * GNU extension:
				 * return non-option as argument to option 1
				 */
				optarg = nargv[optind++];
				return (INORDER);
			}
			if (!(flags & FLAG_PERMUTE)) {
				/*
				 * If no permutation wanted, stop parsing
				 * at first non-option.
				 */
				return (-1);
			}
			/* do permutation */
			if (nonopt_start == -1)
				nonopt_start = optind;
			else if (nonopt_end != -1) {
				permute_args(nonopt_start, nonopt_end,
				    optind, nargv);
				nonopt_start = optind -
				    (nonopt_end - nonopt_start);
				nonopt_end = -1;
			}
			optind++;
			/* process next argument */
			goto start;
		}
		if (nonopt_start != -1 && nonopt_end == -1)
			nonopt_end = optind;

		/*
		 * If we have "-" do nothing, if "--" we are done.
		 */
		if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
			optind++;
			place = EMSG;
			/*
			 * We found an option (--), so if we skipped
			 * non-options, we have to permute.
			 */
			if (nonopt_end != -1) {
				permute_args(nonopt_start, nonopt_end,
				    optind, nargv);
				optind -= nonopt_end - nonopt_start;
			}
			nonopt_start = nonopt_end = -1;
			return (-1);
		}
	}

	/*
	 * Check long options if:
	 *  1) we were passed some
	 *  2) the arg is not just "-"
	 *  3) either the arg starts with -- or we are getopt_long_only()
	 */
	if (long_options != NULL && place != nargv[optind] &&
	    (*place == '-' || (flags & FLAG_LONGONLY))) {
		short_too = 0;
		if (*place == '-')
			place++;		/* --foo long option */
		else if (*place != ':' && strchr(options, *place) != NULL)
			short_too = 1;		/* could be short option too */

		optchar = parse_long_options(nargv, options, long_options,
		    idx, short_too);
		/* -1 means "not a long option": fall through to short parsing */
		if (optchar != -1) {
			place = EMSG;
			return (optchar);
		}
	}

	if ((optchar = (int)*place++) == (int)':' ||
	    (optchar == (int)'-' && *place != '\0') ||
	    (oli = strchr(options, optchar)) == NULL) {
		/*
		 * If the user specified "-" and  '-' isn't listed in
		 * options, return -1 (non-option) as per POSIX.
		 * Otherwise, it is an unknown option character (or ':').
		 */
		if (optchar == (int)'-' && *place == '\0')
			return (-1);
		if (!*place)
			++optind;
		if (PRINT_ERROR)
			warnx(illoptchar, optchar);
		optopt = optchar;
		return (BADCH);
	}
	if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
		/* -W long-option */
		if (*place)			/* no space */
			/* NOTHING */;
		else if (++optind >= nargc) {	/* no arg */
			place = EMSG;
			if (PRINT_ERROR)
				warnx(recargchar, optchar);
			optopt = optchar;
			return (BADARG);
		} else				/* white space */
			place = nargv[optind];
		optchar = parse_long_options(nargv, options, long_options,
		    idx, 0);
		place = EMSG;
		return (optchar);
	}
	if (*++oli != ':') {			/* doesn't take argument */
		if (!*place)
			++optind;
	} else {				/* takes (optional) argument */
		optarg = NULL;
		if (*place)			/* no white space */
			optarg = place;
		else if (oli[1] != ':') {	/* arg not optional */
			if (++optind >= nargc) {	/* no arg */
				place = EMSG;
				if (PRINT_ERROR)
					warnx(recargchar, optchar);
				optopt = optchar;
				return (BADARG);
			} else
				optarg = nargv[optind];
		}
		place = EMSG;
		++optind;
	}
	/* dump back option letter */
	return (optchar);
}

#ifdef REPLACE_GETOPT
/*
 * getopt --
 *	Parse argc/argv argument vector.
 *
 * [eventually this will replace the BSD getopt]
 */
int
getopt(int nargc, char * const *nargv, const char *options)
{

	/*
	 * We don't pass FLAG_PERMUTE to getopt_internal() since
	 * the BSD getopt(3) (unlike GNU) has never done this.
	 *
	 * Furthermore, since many privileged programs call getopt()
	 * before dropping privileges it makes sense to keep things
	 * as simple (and bug-free) as possible.
	 */
	return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
}
#endif /* REPLACE_GETOPT */

/*
 * getopt_long --
 *	Parse argc/argv argument vector.
 *	Short and "--long" options, with argument permutation enabled.
 */
int
getopt_long(int nargc, char * const *nargv, const char *options,
    const struct option *long_options, int *idx)
{

	return (getopt_internal(nargc, nargv, options, long_options, idx,
	    FLAG_PERMUTE));
}

/*
 * getopt_long_only --
 *	Parse argc/argv argument vector.
*/ int getopt_long_only(int nargc, char * const *nargv, const char *options, const struct option *long_options, int *idx) { return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE|FLAG_LONGONLY)); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/dav2d.c000066400000000000000000000324351517466257200223570ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include "vcs_version.h" #include "cli_config.h" #include #include #include #include #include #include #include #include #include #if HAVE_UNISTD_H # include #endif #if HAVE_IO_H # include #endif #ifdef _WIN32 # include #endif #ifdef __APPLE__ #include #endif #include "dav2d/dav2d.h" #include "input/input.h" #include "output/output.h" #include "dav2d_cli_parse.h" static uint64_t get_time_nanos(void) { #ifdef _WIN32 LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); LARGE_INTEGER t; QueryPerformanceCounter(&t); uint64_t seconds = t.QuadPart / frequency.QuadPart; uint64_t fractions = t.QuadPart % frequency.QuadPart; return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart; #elif HAVE_CLOCK_GETTIME struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #elif defined(__APPLE__) mach_timebase_info_data_t info; mach_timebase_info(&info); return mach_absolute_time() * info.numer / info.denom; #endif } static void sleep_nanos(uint64_t d) { #ifdef _WIN32 Sleep((unsigned)(d / 1000000)); #else const struct timespec ts = { .tv_sec = (time_t)(d / 1000000000), .tv_nsec = d % 1000000000, }; nanosleep(&ts, NULL); #endif } static void synchronize(const int realtime, const unsigned cache, const unsigned n_out, const uint64_t nspf, const uint64_t tfirst, uint64_t *const elapsed, FILE *const frametimes) { const uint64_t tcurr = get_time_nanos(); const uint64_t last = *elapsed; *elapsed = tcurr - tfirst; if (realtime) { const uint64_t deadline = nspf * n_out; if (*elapsed < deadline) { const uint64_t remaining = deadline - *elapsed; if (remaining > nspf * cache) sleep_nanos(remaining - nspf * cache); *elapsed = deadline; } } if (frametimes) { const uint64_t frametime = *elapsed - last; fprintf(frametimes, "%" PRIu64 "\n", frametime); fflush(frametimes); } } static void print_stats(const int istty, const unsigned n, const unsigned num, const uint64_t elapsed, const double i_fps) { char 
buf[80], *b = buf, *const end = buf + 80; if (istty) *b++ = '\r'; if (num == 0xFFFFFFFF) b += snprintf(b, end - b, "Decoded %u frames", n); else b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)", n, num, 100.0 * n / num); if (b < end) { const double d_fps = 1e9 * n / elapsed; if (i_fps) { const double speed = d_fps / i_fps; b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)", d_fps, i_fps, speed); } else { b += snprintf(b, end - b, " - %.2lf fps", d_fps); } } if (!istty) strcpy(b > end - 2 ? end - 2 : b, "\n"); fputs(buf, stderr); } static int picture_alloc(Dav2dPicture *const p, void *const _) { const int hbd = p->p.bpc > 8; const int aligned_w = (p->p.w + 127) & ~127; const int aligned_h = (p->p.h + 127) & ~127; const int has_chroma = p->p.layout != DAV2D_PIXEL_LAYOUT_I400; const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; ptrdiff_t y_stride = aligned_w << hbd; ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0; /* Due to how mapping of addresses to sets works in most L1 and L2 cache * implementations, strides of multiples of certain power-of-two numbers * may cause multiple rows of the same superblock to map to the same set, * causing evictions of previous rows resulting in a reduction in cache * hit rate. Avoid that by slightly padding the stride when necessary. 
*/ if (!(y_stride & 1023)) y_stride += DAV2D_PICTURE_ALIGNMENT; if (!(uv_stride & 1023) && has_chroma) uv_stride += DAV2D_PICTURE_ALIGNMENT; p->stride[0] = -y_stride; p->stride[1] = -uv_stride; const size_t y_sz = y_stride * aligned_h; const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); const size_t pic_size = y_sz + 2 * uv_sz; uint8_t *const buf = malloc(pic_size + DAV2D_PICTURE_ALIGNMENT * 2); if (!buf) return DAV2D_ERR(ENOMEM); p->allocator_data = buf; const ptrdiff_t align_m1 = DAV2D_PICTURE_ALIGNMENT - 1; uint8_t *const data = (uint8_t *)(((ptrdiff_t)buf + align_m1) & ~align_m1); p->data[0] = data + y_sz - y_stride; p->data[1] = has_chroma ? data + y_sz + uv_sz * 1 - uv_stride : NULL; p->data[2] = has_chroma ? data + y_sz + uv_sz * 2 - uv_stride : NULL; return 0; } static void picture_release(Dav2dPicture *const p, void *const _) { free(p->allocator_data); } static volatile sig_atomic_t signal_terminate; static void signal_handler(const int s) { signal_terminate = 1; } int main(const int argc, char *const *const argv) { const int istty = isatty(fileno(stderr)); int res = 0; CLISettings cli_settings; Dav2dSettings lib_settings; DemuxerContext *in; MuxerContext *out = NULL; Dav2dPicture p; Dav2dContext *c; Dav2dData data; unsigned n_out = 0, total, fps[2], timebase[2]; uint64_t nspf, tfirst, elapsed = 0; double i_fps; FILE *frametimes = NULL; const unsigned version = dav2d_version_api(); const int major = DAV2D_API_MAJOR(version); const int minor = DAV2D_API_MINOR(version); const int patch = DAV2D_API_PATCH(version); if (DAV2D_API_VERSION_MAJOR != major || DAV2D_API_VERSION_MINOR > minor) { fprintf(stderr, "Version mismatch (library: %d.%d.%d, executable: %d.%d.%d)\n", major, minor, patch, DAV2D_API_VERSION_MAJOR, DAV2D_API_VERSION_MINOR, DAV2D_API_VERSION_PATCH); return EXIT_FAILURE; } parse(argc, argv, &cli_settings, &lib_settings); if (cli_settings.neg_stride) { lib_settings.allocator.alloc_picture_callback = picture_alloc; 
lib_settings.allocator.release_picture_callback = picture_release; } if ((res = input_open(&in, cli_settings.demuxer, cli_settings.inputfile, fps, &total, timebase)) < 0) { return EXIT_FAILURE; } for (unsigned i = 0; i <= cli_settings.skip; i++) { if ((res = input_read(in, &data)) < 0) { input_close(in); return EXIT_FAILURE; } if (i < cli_settings.skip) dav2d_data_unref(&data); } if (!cli_settings.quiet) fprintf(stderr, "dav2d %s - by VideoLAN\n", dav2d_version()); // skip frames until a sequence header is found if (cli_settings.skip) { Dav2dSequenceHeader seq; unsigned seq_skip = 0; while (dav2d_parse_sequence_header(&seq, data.data, data.sz)) { if ((res = input_read(in, &data)) < 0) { input_close(in); return EXIT_FAILURE; } seq_skip++; } if (seq_skip && !cli_settings.quiet) fprintf(stderr, "skipped %u packets due to missing sequence header\n", seq_skip); } if (cli_settings.limit != 0 && cli_settings.limit < total) total = cli_settings.limit; if ((res = dav2d_open(&c, &lib_settings))) return EXIT_FAILURE; if (cli_settings.frametimes) frametimes = fopen(cli_settings.frametimes, "w"); if (cli_settings.realtime != REALTIME_CUSTOM) { if (fps[1] == 0) { i_fps = 0; nspf = 0; } else { i_fps = (double)fps[0] / fps[1]; nspf = 1000000000ULL * fps[1] / fps[0]; } } else { i_fps = cli_settings.realtime_fps; nspf = (uint64_t)(1000000000.0 / cli_settings.realtime_fps); } tfirst = get_time_nanos(); #if HAVE_SIGACTION && defined(SA_RESETHAND) static const struct sigaction sa = { .sa_handler = signal_handler, .sa_flags = SA_RESETHAND, }; sigaction(SIGINT, &sa, NULL); sigaction(SIGTERM, &sa, NULL); #else signal(SIGINT, signal_handler); signal(SIGTERM, signal_handler); #endif do { if ((res = signal_terminate)) break; if ((res = dav2d_send_data(c, &data)) < 0) { if (res != DAV2D_ERR(EAGAIN)) { dav2d_data_unref(&data); fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV2D_ERR(res))); if (res != DAV2D_ERR(EINVAL)) break; } } for (;;) { memset(&p, 0, sizeof(p)); if ((res = 
dav2d_get_picture(c, &p)) < 0) { if (res != DAV2D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV2D_ERR(res))); if (res != DAV2D_ERR(EINVAL)) goto nested_break; } res = 0; break; } if (!n_out) { if ((res = output_open(&out, cli_settings.muxer, cli_settings.outputfile, &p.p, fps)) < 0) { if (frametimes) fclose(frametimes); return EXIT_FAILURE; } } if ((res = output_write(out, &p)) < 0) goto nested_break; n_out++; if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } if (!cli_settings.quiet) print_stats(istty, n_out, total, elapsed, i_fps); if (cli_settings.limit && n_out >= cli_settings.limit) goto nested_break; } } while (data.sz > 0 || !input_read(in, &data)); nested_break: if (data.sz > 0) dav2d_data_unref(&data); // flush if (res == 0) { dav2d_send_data(c, NULL); while (!cli_settings.limit || n_out < cli_settings.limit) { if ((res = signal_terminate)) break; if ((res = dav2d_get_picture(c, &p)) < 0) { if (res != DAV2D_EOF) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV2D_ERR(res))); } else { res = 0; } break; } else { if (!n_out) { if ((res = output_open(&out, cli_settings.muxer, cli_settings.outputfile, &p.p, fps)) < 0) { if (frametimes) fclose(frametimes); return EXIT_FAILURE; } } if ((res = output_write(out, &p)) < 0) break; n_out++; if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } if (!cli_settings.quiet) print_stats(istty, n_out, total, elapsed, i_fps); } } } if (frametimes) fclose(frametimes); input_close(in); if (out) { if (!cli_settings.quiet && istty) fprintf(stderr, "\n"); if (cli_settings.verify) res |= output_verify(out, cli_settings.verify); else output_close(out); } else { fprintf(stderr, "No data decoded\n"); res = 1; } dav2d_close(&c); return (res == 0) ? 
EXIT_SUCCESS : EXIT_FAILURE; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/dav2d.manifest000066400000000000000000000010751517466257200237370ustar00rootroot00000000000000 true UTF-8 dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/dav2d.rc.in000066400000000000000000000022401517466257200231350ustar00rootroot00000000000000#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 #define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" #define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 #define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" #include 1 RT_MANIFEST "dav2d.manifest" 1 VERSIONINFO FILETYPE VFT_APP FILEOS VOS_NT_WINDOWS32 PRODUCTVERSION PROJECT_VERSION_NUMBER FILEVERSION API_VERSION_NUMBER BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "040904E4" BEGIN VALUE "CompanyName", "VideoLAN" VALUE "ProductName", "dav2d" VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR VALUE "FileVersion", API_VERSION_NUMBER_STR VALUE "FileDescription", "dav2d " PROJECT_VERSION_NUMBER_STR " - AV2 decoder" VALUE "InternalName", "dav2d" VALUE "OriginalFilename", "dav2d.exe" VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav2d Authors" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x409, 1252 END END dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/dav2d_cli_parse.c000066400000000000000000000460611517466257200244000ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. 
* * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "cli_config.h" #include #include #include #include #include #include #include #if HAVE_UNISTD_H # include #endif #include "dav2d_cli_parse.h" #include "common/attributes.h" #include "src/cpu.h" static const char short_opts[] = "i:o:vql:s:"; enum { ARG_DEMUXER = 256, ARG_MUXER, ARG_FRAME_TIMES, ARG_REALTIME, ARG_REALTIME_CACHE, ARG_THREADS, ARG_FRAME_DELAY, ARG_VERIFY, ARG_FILM_GRAIN, ARG_OPPOINT, ARG_ALL_LAYERS, ARG_SIZE_LIMIT, ARG_STRICT_STD_COMPLIANCE, ARG_CPU_MASK, ARG_NEG_STRIDE, ARG_OUTPUT_INVISIBLE, ARG_INLOOP_FILTERS, ARG_DECODE_FRAME_TYPE, }; static const struct option long_opts[] = { { "input", 1, NULL, 'i' }, { "output", 1, NULL, 'o' }, { "quiet", 0, NULL, 'q' }, { "demuxer", 1, NULL, ARG_DEMUXER }, { "muxer", 1, NULL, ARG_MUXER }, { "version", 0, NULL, 'v' }, { "frametimes", 1, NULL, ARG_FRAME_TIMES }, { "limit", 1, NULL, 'l' }, { "skip", 1, NULL, 's' }, { "realtime", 2, NULL, ARG_REALTIME }, { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, { 
"threads", 1, NULL, ARG_THREADS }, { "framedelay", 1, NULL, ARG_FRAME_DELAY }, { "verify", 1, NULL, ARG_VERIFY }, { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, { "oppoint", 1, NULL, ARG_OPPOINT }, { "alllayers", 1, NULL, ARG_ALL_LAYERS }, { "sizelimit", 1, NULL, ARG_SIZE_LIMIT }, { "strict", 1, NULL, ARG_STRICT_STD_COMPLIANCE }, { "cpumask", 1, NULL, ARG_CPU_MASK }, { "negstride", 0, NULL, ARG_NEG_STRIDE }, { "outputinvisible", 1, NULL, ARG_OUTPUT_INVISIBLE }, { "inloopfilters", 1, NULL, ARG_INLOOP_FILTERS }, { "decodeframetype", 1, NULL, ARG_DECODE_FRAME_TYPE }, { NULL, 0, NULL, 0 }, }; #if HAVE_XXHASH_H #define AVAILABLE_MUXERS "'md5', 'xxh3', 'yuv', 'yuv4mpeg2' or 'null'" #else #define AVAILABLE_MUXERS "'md5', 'yuv', 'yuv4mpeg2' or 'null'" #endif #if ARCH_AARCH64 || ARCH_ARM #define ALLOWED_CPU_MASKS ", 'neon', 'dotprod' or 'i8mm'" #elif ARCH_LOONGARCH #define ALLOWED_CPU_MASKS ", 'lsx' or 'lasx'" #elif ARCH_PPC64LE #define ALLOWED_CPU_MASKS ", 'vsx' or 'pwr9'" #elif ARCH_RISCV #define ALLOWED_CPU_MASKS " or 'rvv'" #elif ARCH_X86 #define ALLOWED_CPU_MASKS \ ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'" #else #define ALLOWED_CPU_MASKS "not yet implemented for this architecture" #endif static void usage(const char *const app, const char *const reason, ...) 
{ if (reason) { va_list args; va_start(args, reason); vfprintf(stderr, reason, args); va_end(args); fprintf(stderr, "\n\n"); } fprintf(stderr, "Usage: %s [options]\n\n", app); fprintf(stderr, "Supported options:\n" " --input/-i $file: input file\n" " --output/-o $file: output file (%%n, %%w or %%h will be filled in for per-frame files)\n" " --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n" " --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n" " use 'frame' as prefix to write per-frame files; if filename contains %%n, will default to writing per-frame files\n" " --quiet/-q: disable status messages\n" " --frametimes $file: dump frame times to file\n" " --limit/-l $num: stop decoding after $num frames\n" " --skip/-s $num: skip decoding of the first $num frames\n" " --realtime [$fract]: limit framerate, optional argument to override input framerate\n" " --realtimecache $num: set the size of the cache in realtime mode (default: 0)\n" " --version/-v: print version and exit\n" " --threads $num: number of threads (default: 0)\n" " --framedelay $num: maximum frame delay, capped at $threads (default: 0);\n" " set to 1 for low-latency decoding\n" " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n" " --oppoint $num: select an operating point of a scalable AV2 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV2 bitstream (default: 1)\n" " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" " --strict $num: whether to abort decoding on standard compliance violations\n" " that don't affect bitstream decoding (default: 1)\n" " --verify $md5: verify decoded md5. 
implies --muxer md5, no output\n" " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n" " --negstride: use negative picture strides\n" " this is mostly meant as a developer option\n" " --outputinvisible $num: whether to output invisible (alt-ref) frames (default: 0)\n" " --inloopfilters $str: which in-loop filters to enable (none, (no)deblock, (no)cdef, (no)restoration or all; default: all)\n" " --decodeframetype $str: which frame types to decode (reference, intra, key or all; default: all)\n" ); exit(1); } static void error(const char *const app, const char *const optarg, const int option, const char *const shouldbe) { char optname[256]; int n; for (n = 0; long_opts[n].name; n++) if (long_opts[n].val == option) break; assert(long_opts[n].name); if (long_opts[n].val < 256) { snprintf(optname, sizeof(optname), "-%c/--%s", long_opts[n].val, long_opts[n].name); } else { snprintf(optname, sizeof(optname), "--%s", long_opts[n].name); } usage(app, "Invalid argument \"%s\" for option %s; should be %s", optarg, optname, shouldbe); } static unsigned parse_unsigned(const char *const optarg, const int option, const char *const app) { char *end; const unsigned res = (unsigned) strtoul(optarg, &end, 0); if (*end || end == optarg) error(app, optarg, option, "an integer"); return res; } static int parse_optional_fraction(const char *const optarg, const int option, const char *const app, double *value) { if (optarg == NULL) return 0; char *end; *value = strtod(optarg, &end); if (*end == '/' && end != optarg) { const char *optarg2 = end + 1; *value /= strtod(optarg2, &end); if (*end || end == optarg2) error(app, optarg, option, "a fraction"); } else if (*end || end == optarg) { error(app, optarg, option, "a fraction"); } return 1; } typedef struct EnumParseTable { const char *str; const int val; } EnumParseTable; #if ARCH_X86 enum CpuMask { X86_CPU_MASK_SSE2 = DAV2D_X86_CPU_FLAG_SSE2, X86_CPU_MASK_SSSE3 = DAV2D_X86_CPU_FLAG_SSSE3 | 
X86_CPU_MASK_SSE2, X86_CPU_MASK_SSE41 = DAV2D_X86_CPU_FLAG_SSE41 | X86_CPU_MASK_SSSE3, X86_CPU_MASK_AVX2 = DAV2D_X86_CPU_FLAG_AVX2 | X86_CPU_MASK_SSE41, X86_CPU_MASK_AVX512ICL = DAV2D_X86_CPU_FLAG_AVX512ICL | X86_CPU_MASK_AVX2, }; #elif ARCH_AARCH64 || ARCH_ARM enum CpuMask { ARM_CPU_MASK_NEON = DAV2D_ARM_CPU_FLAG_NEON, ARM_CPU_MASK_DOTPROD = DAV2D_ARM_CPU_FLAG_DOTPROD | ARM_CPU_MASK_NEON, ARM_CPU_MASK_I8MM = DAV2D_ARM_CPU_FLAG_I8MM | ARM_CPU_MASK_DOTPROD, #if ARCH_AARCH64 // SVE doesn't imply DOTPROD or I8MM. ARM_CPU_MASK_SVE = DAV2D_ARM_CPU_FLAG_SVE | ARM_CPU_MASK_NEON, // SVE2 implies DOTPROD, but not I8MM. ARM_CPU_MASK_SVE2 = DAV2D_ARM_CPU_FLAG_SVE2 | ARM_CPU_MASK_SVE | ARM_CPU_MASK_DOTPROD, #endif }; #endif #if ARCH_PPC64LE enum CpuMask { PPC_CPU_MASK_VSX = DAV2D_PPC_CPU_FLAG_VSX, PPC_CPU_MASK_PWR9 = DAV2D_PPC_CPU_FLAG_VSX | DAV2D_PPC_CPU_FLAG_PWR9, }; #endif static const EnumParseTable cpu_mask_tbl[] = { #if ARCH_AARCH64 || ARCH_ARM { "neon", ARM_CPU_MASK_NEON }, { "dotprod", ARM_CPU_MASK_DOTPROD }, { "i8mm", ARM_CPU_MASK_I8MM }, #if ARCH_AARCH64 { "sve", ARM_CPU_MASK_SVE }, { "sve2", ARM_CPU_MASK_SVE2 }, #endif /* ARCH_AARCH64 */ #elif ARCH_LOONGARCH { "lsx", DAV2D_LOONGARCH_CPU_FLAG_LSX }, { "lasx", DAV2D_LOONGARCH_CPU_FLAG_LASX }, #elif ARCH_PPC64LE { "vsx", PPC_CPU_MASK_VSX }, { "pwr9", PPC_CPU_MASK_PWR9 }, #elif ARCH_RISCV { "rvv", DAV2D_RISCV_CPU_FLAG_V }, #elif ARCH_X86 { "sse2", X86_CPU_MASK_SSE2 }, { "ssse3", X86_CPU_MASK_SSSE3 }, { "sse41", X86_CPU_MASK_SSE41 }, { "avx2", X86_CPU_MASK_AVX2 }, { "avx512icl", X86_CPU_MASK_AVX512ICL }, #endif { "none", 0 }, }; static const EnumParseTable inloop_filters_tbl[] = { { "none", 0 }, { "deblock", DAV2D_INLOOPFILTER_DEBLOCK }, { "cdef", DAV2D_INLOOPFILTER_CDEF }, { "ccso", DAV2D_INLOOPFILTER_CCSO }, { "wiener", DAV2D_INLOOPFILTER_WIENER }, { "gdf", DAV2D_INLOOPFILTER_GDF }, { "all", DAV2D_INLOOPFILTER_ALL }, }; static const EnumParseTable decode_frame_type_tbl[] = { { "all", DAV2D_DECODEFRAMETYPE_ALL }, { 
"reference", DAV2D_DECODEFRAMETYPE_REFERENCE }, { "intra", DAV2D_DECODEFRAMETYPE_INTRA }, { "key", DAV2D_DECODEFRAMETYPE_KEY }, }; static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl, const int tbl_sz, const int option, const char *app) { char str[1024]; strcpy(str, "any of "); for (int n = 0; n < tbl_sz; n++) { if (!strcmp(tbl[n].str, optarg)) return tbl[n].val; if (n) { if (n < tbl_sz - 1) strcat(str, ", "); else strcat(str, " or "); } strcat(str, tbl[n].str); } char *end; unsigned res; if (!strncmp(optarg, "0x", 2)) { res = (unsigned) strtoul(&optarg[2], &end, 16); } else { res = (unsigned) strtoul(optarg, &end, 0); } if (*end || end == optarg) { strcat(str, ", a hexadecimal (starting with 0x), or an integer"); error(app, optarg, option, str); } return res; } static int parse_enum_mask(const char *const optargs, unsigned const start_mask, const EnumParseTable *const tbl, const int tbl_sz, const int option, const char *app) { const int start_with_sign = optargs[0] == '+' || optargs[0] == '-'; unsigned res = start_with_sign ? start_mask : 0; int sub_instead_of_add = optargs[0] == '-'; const char *end = optargs + start_with_sign; for (;;) { const char *start = end; const char *plus = strchr(start, '+'); const char *min = strchr(start, '-'); if (plus && min) { assert(plus != min); end = plus < min ? plus : min; } else { end = plus ? plus : min; } const ptrdiff_t len = end ? end - start : (ptrdiff_t) strlen(start); int n; for (n = 0; n < tbl_sz; n++) if (!strncmp(tbl[n].str, start, len) && !tbl[n].str[len]) break; if (n == tbl_sz) break; if (sub_instead_of_add) { res &= ~tbl[n].val; } else { res |= tbl[n].val; } if (end) { sub_instead_of_add = *end == '-'; end++; } else { return res; } } char err_msg[1024]; size_t len = snprintf(err_msg, sizeof(err_msg), "plus/min-separated values from "); for (int n = 0; n < tbl_sz; n++) { len += snprintf(err_msg + len, sizeof(err_msg) - len, "%s%s", n ? n + 1 < tbl_sz ? 
", " : " or " : "", tbl[n].str); } error(app, optarg, option, err_msg); return -1; } void parse(const int argc, char *const *const argv, CLISettings *const cli_settings, Dav2dSettings *const lib_settings) { int o; memset(cli_settings, 0, sizeof(*cli_settings)); dav2d_default_settings(lib_settings); lib_settings->strict_std_compliance = 1; // override library default int grain_specified = 0; while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) { switch (o) { case 'o': cli_settings->outputfile = optarg; break; case 'i': cli_settings->inputfile = optarg; break; case 'q': cli_settings->quiet = 1; break; case 'l': cli_settings->limit = parse_unsigned(optarg, 'l', argv[0]); break; case 's': cli_settings->skip = parse_unsigned(optarg, 's', argv[0]); break; case ARG_DEMUXER: cli_settings->demuxer = optarg; break; case ARG_MUXER: cli_settings->muxer = optarg; break; case ARG_FRAME_TIMES: cli_settings->frametimes = optarg; break; case ARG_REALTIME: // workaround to parse an optional argument of the form `--a b` // (getopt only allows `--a=b`) if (optarg == NULL && optind < argc && argv[optind] != NULL && argv[optind][0] != '-') { optarg = argv[optind]; optind++; } cli_settings->realtime = 1 + parse_optional_fraction(optarg, ARG_REALTIME, argv[0], &cli_settings->realtime_fps); break; case ARG_REALTIME_CACHE: cli_settings->realtime_cache = parse_unsigned(optarg, ARG_REALTIME_CACHE, argv[0]); break; case ARG_FRAME_DELAY: lib_settings->max_frame_delay = parse_unsigned(optarg, ARG_FRAME_DELAY, argv[0]); break; case ARG_THREADS: lib_settings->n_threads = parse_unsigned(optarg, ARG_THREADS, argv[0]); break; case ARG_VERIFY: cli_settings->verify = optarg; break; case ARG_FILM_GRAIN: lib_settings->apply_grain = !!parse_unsigned(optarg, ARG_FILM_GRAIN, argv[0]); grain_specified = 1; break; case ARG_OPPOINT: lib_settings->operating_point = parse_unsigned(optarg, ARG_OPPOINT, argv[0]); break; case ARG_ALL_LAYERS: lib_settings->all_layers = !!parse_unsigned(optarg, 
ARG_ALL_LAYERS, argv[0]); break; case ARG_SIZE_LIMIT: { char *arg = optarg, *end; uint64_t res = strtoul(arg, &end, 0); if (*end == 'x') // NxM res *= strtoul((arg = end + 1), &end, 0); if (*end || end == arg || res >= UINT_MAX) error(argv[0], optarg, ARG_SIZE_LIMIT, "an integer or dimension"); lib_settings->frame_size_limit = (unsigned) res; break; } case ARG_STRICT_STD_COMPLIANCE: lib_settings->strict_std_compliance = parse_unsigned(optarg, ARG_STRICT_STD_COMPLIANCE, argv[0]); break; case 'v': fprintf(stderr, "%s\n", dav2d_version()); exit(0); case ARG_CPU_MASK: dav2d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl), ARG_CPU_MASK, argv[0])); break; case ARG_NEG_STRIDE: cli_settings->neg_stride = 1; break; case ARG_OUTPUT_INVISIBLE: lib_settings->output_invisible_frames = !!parse_unsigned(optarg, ARG_OUTPUT_INVISIBLE, argv[0]); break; case ARG_INLOOP_FILTERS: lib_settings->inloop_filters = parse_enum_mask(optarg, lib_settings->inloop_filters, inloop_filters_tbl, ARRAY_SIZE(inloop_filters_tbl), ARG_INLOOP_FILTERS, argv[0]); break; case ARG_DECODE_FRAME_TYPE: lib_settings->decode_frame_type = parse_enum(optarg, decode_frame_type_tbl, ARRAY_SIZE(decode_frame_type_tbl), ARG_DECODE_FRAME_TYPE, argv[0]); break; default: usage(argv[0], NULL); } } if (optind < argc) usage(argv[0], "Extra/unused arguments found, e.g. 
'%s'\n", argv[optind]); if (cli_settings->verify) { if (cli_settings->outputfile) usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set"); if (cli_settings->muxer && strcmp(cli_settings->muxer, "md5") && strcmp(cli_settings->muxer, "xxh3")) { usage(argv[0], "Verification (--verify) requires a checksum muxer (md5 or xxh3)"); } cli_settings->outputfile = "-"; if (!cli_settings->muxer) cli_settings->muxer = "md5"; } if (!grain_specified && cli_settings->muxer && (!strcmp(cli_settings->muxer, "md5") || !strcmp(cli_settings->muxer, "xxh3"))) { lib_settings->apply_grain = 0; } if (!cli_settings->inputfile) usage(argv[0], "Input file (-i/--input) is required"); if ((!cli_settings->muxer || strcmp(cli_settings->muxer, "null")) && !cli_settings->outputfile) { usage(argv[0], "Output file (-o/--output) is required"); } } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/dav2d_cli_parse.h000066400000000000000000000040461517466257200244020ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_CLI_PARSE_H #define DAV2D_CLI_PARSE_H #include "dav2d/dav2d.h" /* Settings of the command-line tool itself; parse() zeroes this struct and then fills it from argv. String members point at option arguments (or string literals) and are not copied. */ typedef struct { const char *outputfile; const char *inputfile; const char *demuxer; const char *muxer; const char *frametimes; const char *verify; unsigned limit, skip; int quiet; /* parse() stores 1 + the result of parse_optional_fraction() here. */ enum { REALTIME_DISABLE = 0, REALTIME_INPUT, REALTIME_CUSTOM, } realtime; double realtime_fps; unsigned realtime_cache; int neg_stride; } CLISettings; /* Fills cli_settings and lib_settings from the command line; lib_settings starts from dav2d_default_settings(). */ void parse(const int argc, char *const *const argv, CLISettings *const cli_settings, Dav2dSettings *const lib_settings); #endif /* DAV2D_CLI_PARSE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/000077500000000000000000000000001517466257200223435ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/annexb.c000066400000000000000000000133501517466257200237640ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * Copyright © 2019-2026, James Almer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #include "common/intops.h" #include "dav2d/headers.h" #include "input/demuxer.h" #include "input/parse.h" // these functions are based on an implementation from FFmpeg, and relicensed // with author's permission #define PROBE_SIZE 2048 static int annexb_probe(const uint8_t *data) { int ret, cnt = 0; size_t temporal_unit_size; ret = leb(data + cnt, PROBE_SIZE - cnt, &temporal_unit_size); if (ret < 0) return 0; cnt += ret; size_t frame_unit_size; ret = leb(data + cnt, PROBE_SIZE - cnt, &frame_unit_size); if (ret < 0 || ((uint64_t)frame_unit_size + ret) > temporal_unit_size) return 0; cnt += ret; temporal_unit_size -= ret; size_t obu_unit_size; ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size); if (ret < 0 || ((uint64_t)obu_unit_size + ret) >= frame_unit_size) return 0; cnt += ret; temporal_unit_size -= obu_unit_size + ret; frame_unit_size -= obu_unit_size + ret; // Check that the first OBU is a Temporal Delimiter. 
size_t obu_size; enum Dav2dObuType type; ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size), &obu_size, &type); if (ret < 0 || type != DAV2D_OBU_SEQ_HDR) return 0; cnt += (int)obu_unit_size; // look for first frame and accompanying sequence header while (cnt < PROBE_SIZE) { ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size); if (ret < 0 || ((uint64_t)obu_unit_size + ret) > frame_unit_size) return 0; cnt += ret; temporal_unit_size -= ret; frame_unit_size -= ret; ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size), &obu_size, &type); if (ret < 0) return 0; cnt += (int)obu_unit_size; switch (type) { case DAV2D_OBU_TILE_GRP: return 1; case DAV2D_OBU_SEQ_HDR: case DAV2D_OBU_TD: return 0; default: break; } temporal_unit_size -= obu_unit_size; frame_unit_size -= obu_unit_size; if (frame_unit_size <= 0) return 0; } return 1; } typedef struct DemuxerPriv { FILE *f; size_t temporal_unit_size; size_t frame_unit_size; } AnnexbInputContext; static int annexb_open(AnnexbInputContext *const c, const char *const file, unsigned fps[2], unsigned *const num_frames, unsigned timebase[2]) { int res; size_t len; if (!(c->f = fopen(file, "rb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } // TODO: Parse sequence header and read timing info if any. 
fps[0] = 25; fps[1] = 1; timebase[0] = 25; timebase[1] = 1; for (*num_frames = 0;; (*num_frames)++) { res = leb128(c->f, &len); if (res < 0) break; fseeko(c->f, len, SEEK_CUR); } fseeko(c->f, 0, SEEK_SET); return 0; } static int annexb_read(AnnexbInputContext *const c, Dav2dData *const data) { size_t len; int res; if (!c->temporal_unit_size) { res = leb128(c->f, &c->temporal_unit_size); if (res < 0) return -1; } if (!c->frame_unit_size) { res = leb128(c->f, &c->frame_unit_size); if (res < 0 || (c->frame_unit_size + res) > c->temporal_unit_size) return -1; c->temporal_unit_size -= res; } res = leb128(c->f, &len); if (res < 0 || (len + res) > c->frame_unit_size) return -1; uint8_t *ptr = dav2d_data_create(data, len); if (!ptr) return -1; c->temporal_unit_size -= len + res; c->frame_unit_size -= len + res; if (fread(ptr, len, 1, c->f) != 1) { fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno)); dav2d_data_unref(data); return -1; } return 0; } static void annexb_close(AnnexbInputContext *const c) { fclose(c->f); } const Demuxer annexb_demuxer = { .priv_data_size = sizeof(AnnexbInputContext), .name = "annexb", .probe = annexb_probe, .probe_sz = PROBE_SIZE, .open = annexb_open, .read = annexb_read, .seek = NULL, .close = annexb_close, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/demuxer.h000066400000000000000000000037211517466257200241700ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_INPUT_DEMUXER_H #define DAV2D_INPUT_DEMUXER_H #include "data.h" /* Per-demuxer private state; each demuxer source file defines its own struct DemuxerPriv. */ typedef struct DemuxerPriv DemuxerPriv; /* Descriptor of one demuxer backend. input_open() allocates priv_data_size bytes of zeroed private state, selects a backend by name or by calling probe() on the first probe_sz bytes of the file, and then dispatches through open/read/seek/close. seek may be NULL, in which case input_seek() returns -1. */ typedef struct Demuxer { int priv_data_size; const char *name; int probe_sz; int (*probe)(const uint8_t *data); int (*open)(DemuxerPriv *ctx, const char *filename, unsigned fps[2], unsigned *num_frames, unsigned timebase[2]); int (*read)(DemuxerPriv *ctx, Dav2dData *data); int (*seek)(DemuxerPriv *ctx, uint64_t pts); void (*close)(DemuxerPriv *ctx); } Demuxer; #endif /* DAV2D_INPUT_DEMUXER_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/input.c000066400000000000000000000107421517466257200236520ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include "common/attributes.h" #include "common/intops.h" #include "input/input.h" #include "input/demuxer.h" struct DemuxerContext { DemuxerPriv *data; const Demuxer *impl; uint64_t priv_data[]; }; extern const Demuxer ivf_demuxer; extern const Demuxer annexb_demuxer; extern const Demuxer section5_demuxer; static const Demuxer *const demuxers[] = { &ivf_demuxer, &annexb_demuxer, §ion5_demuxer, NULL }; int input_open(DemuxerContext **const c_out, const char *const name, const char *const filename, unsigned fps[2], unsigned *const num_frames, unsigned timebase[2]) { const Demuxer *impl; DemuxerContext *c; int res, i; if (name) { for (i = 0; demuxers[i]; i++) { if (!strcmp(demuxers[i]->name, name)) { impl = demuxers[i]; break; } } if (!demuxers[i]) { fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name); return DAV2D_ERR(ENOPROTOOPT); } } else { int probe_sz = 0; for (i = 0; demuxers[i]; i++) probe_sz = imax(probe_sz, demuxers[i]->probe_sz); uint8_t *const probe_data = malloc(probe_sz); if (!probe_data) { fprintf(stderr, "Failed to allocate memory\n"); return DAV2D_ERR(ENOMEM); } FILE *f = fopen(filename, "rb"); if (!f) { free(probe_data); fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno)); return errno ? DAV2D_ERR(errno) : DAV2D_ERR(EIO); } res = !!fread(probe_data, 1, probe_sz, f); fclose(f); if (!res) { free(probe_data); fprintf(stderr, "Failed to read probe data\n"); return errno ? 
DAV2D_ERR(errno) : DAV2D_ERR(EIO); } for (i = 0; demuxers[i]; i++) { if (demuxers[i]->probe(probe_data)) { impl = demuxers[i]; break; } } free(probe_data); if (!demuxers[i]) { fprintf(stderr, "Failed to probe demuxer for file %s\n", filename); return DAV2D_ERR(ENOPROTOOPT); } } if (!(c = calloc(1, offsetof(DemuxerContext, priv_data) + impl->priv_data_size))) { fprintf(stderr, "Failed to allocate memory\n"); return DAV2D_ERR(ENOMEM); } c->impl = impl; c->data = (DemuxerPriv *) c->priv_data; if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) { free(c); return res; } *c_out = c; return 0; } int input_read(DemuxerContext *const ctx, Dav2dData *const data) { return ctx->impl->read(ctx->data, data); } int input_seek(DemuxerContext *const ctx, const uint64_t pts) { return ctx->impl->seek ? ctx->impl->seek(ctx->data, pts) : -1; } void input_close(DemuxerContext *const ctx) { ctx->impl->close(ctx->data); free(ctx); } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/input.h000066400000000000000000000036041517466257200236560ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_INPUT_INPUT_H #define DAV2D_INPUT_INPUT_H #include "data.h" typedef struct DemuxerContext DemuxerContext; int input_open(DemuxerContext **const c_out, const char *const name, const char *const filename, unsigned fps[2], unsigned *num_frames, unsigned timebase[2]); int input_read(DemuxerContext *ctx, Dav2dData *data); int input_seek(DemuxerContext *ctx, uint64_t pts); void input_close(DemuxerContext *ctx); #endif /* DAV2D_INPUT_INPUT_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/ivf.c000066400000000000000000000146731517466257200233060ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #include #include #include #include "input/demuxer.h" typedef struct DemuxerPriv { FILE *f; int broken; double timebase; uint64_t last_ts; uint64_t step; } IvfInputContext; static const uint8_t probe_data[] = { 'D', 'K', 'I', 'F', 0, 0, 0x20, 0, 'A', 'V', '0', '1', }; static int ivf_probe(const uint8_t *const data) { return !memcmp(data, probe_data, sizeof(probe_data)); } static unsigned rl32(const uint8_t *const p) { return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0]; } static int64_t rl64(const uint8_t *const p) { return (((uint64_t) rl32(&p[4])) << 32) | rl32(p); } static int ivf_open(IvfInputContext *const c, const char *const file, unsigned fps[2], unsigned *const num_frames, unsigned timebase[2]) { uint8_t hdr[32]; if (!(c->f = fopen(file, "rb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } else if (fread(hdr, 32, 1, c->f) != 1) { fprintf(stderr, "Failed to read stream header: %s\n", strerror(errno)); fclose(c->f); return -1; } else if (memcmp(hdr, "DKIF", 4)) { fprintf(stderr, "%s is not an IVF file [tag=%.4s|0x%02x%02x%02x%02x]\n", file, hdr, hdr[0], hdr[1], hdr[2], hdr[3]); 
fclose(c->f); return -1; } else if (memcmp(&hdr[8], "AV01", 4)) { fprintf(stderr, "%s is not an AV2 file [tag=%.4s|0x%02x%02x%02x%02x]\n", file, &hdr[8], hdr[8], hdr[9], hdr[10], hdr[11]); fclose(c->f); return -1; } timebase[0] = rl32(&hdr[16]); timebase[1] = rl32(&hdr[20]); const unsigned duration = rl32(&hdr[24]); uint8_t data[8]; c->broken = 0; for (*num_frames = 0;; (*num_frames)++) { if (fread(data, 4, 1, c->f) != 1) break; // EOF size_t sz = rl32(data); if (fread(data, 8, 1, c->f) != 1) break; // EOF const uint64_t ts = rl64(data); if (*num_frames && ts <= c->last_ts) c->broken = 1; c->last_ts = ts; fseeko(c->f, sz, SEEK_CUR); } if (*num_frames == 0) { /* Reading bailed early */ fprintf(stderr, "No frames read from %s\n", file); fclose(c->f); return -1; } uint64_t fps_num = (uint64_t) timebase[0] * *num_frames; uint64_t fps_den = (uint64_t) timebase[1] * duration; if (fps_num && fps_den) { /* Reduce fraction */ uint64_t gcd = fps_num; for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b); fps_num /= gcd; fps_den /= gcd; while ((fps_num | fps_den) > UINT_MAX) { fps_num >>= 1; fps_den >>= 1; } } if (fps_num && fps_den) { fps[0] = (unsigned) fps_num; fps[1] = (unsigned) fps_den; } else { fps[0] = fps[1] = 0; } c->timebase = (double)timebase[0] / timebase[1]; c->step = duration / *num_frames; fseeko(c->f, 32, SEEK_SET); c->last_ts = 0; return 0; } static inline int ivf_read_header(IvfInputContext *const c, ptrdiff_t *const sz, int64_t *const off_, uint64_t *const ts) { uint8_t data[8]; int64_t const off = ftello(c->f); if (off_) *off_ = off; if (fread(data, 4, 1, c->f) != 1) return -1; // EOF *sz = rl32(data); if (!c->broken) { if (fread(data, 8, 1, c->f) != 1) return -1; *ts = rl64(data); } else { if (fseeko(c->f, 8, SEEK_CUR)) return -1; *ts = off > 32 ? 
c->last_ts + c->step : 0; } return 0; } static int ivf_read(IvfInputContext *const c, Dav2dData *const buf) { uint8_t *ptr; ptrdiff_t sz; int64_t off; uint64_t ts; if (ivf_read_header(c, &sz, &off, &ts)) return -1; if (!(ptr = dav2d_data_create(buf, sz))) return -1; if (fread(ptr, sz, 1, c->f) != 1) { fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno)); dav2d_data_unref(buf); return -1; } buf->m.offset = off; buf->m.timestamp = ts; c->last_ts = ts; return 0; } static int ivf_seek(IvfInputContext *const c, const uint64_t pts) { uint64_t cur; const uint64_t ts = llround((pts * c->timebase) / 1000000000.0); if (ts <= c->last_ts) if (fseeko(c->f, 32, SEEK_SET)) goto error; while (1) { ptrdiff_t sz; if (ivf_read_header(c, &sz, NULL, &cur)) goto error; if (cur >= ts) break; if (fseeko(c->f, sz, SEEK_CUR)) goto error; c->last_ts = cur; } if (fseeko(c->f, -12, SEEK_CUR)) goto error; return 0; error: fprintf(stderr, "Failed to seek: %s\n", strerror(errno)); return -1; } static void ivf_close(IvfInputContext *const c) { fclose(c->f); } const Demuxer ivf_demuxer = { .priv_data_size = sizeof(IvfInputContext), .name = "ivf", .probe = ivf_probe, .probe_sz = sizeof(probe_data), .open = ivf_open, .read = ivf_read, .seek = ivf_seek, .close = ivf_close, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/parse.h000066400000000000000000000064061517466257200236340ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * Copyright © 2019-2026, James Almer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV2D_INPUT_PARSE_H #define DAV2D_INPUT_PARSE_H #include #include "dav2d/headers.h" static int leb128(FILE *const f, size_t *const len) { uint64_t val = 0; unsigned i = 0, more; do { uint8_t v; if (fread(&v, 1, 1, f) < 1) return -1; more = v & 0x80; val |= ((uint64_t) (v & 0x7F)) << (i * 7); i++; } while (more && i < 8); if (val > UINT_MAX || more) return -1; *len = (size_t) val; return i; } // these functions are based on an implementation from FFmpeg, and relicensed // with author's permission static int leb(const uint8_t *ptr, int sz, size_t *const len) { uint64_t val = 0; unsigned i = 0, more; do { if (!sz--) return -1; const int v = *ptr++; more = v & 0x80; val |= ((uint64_t) (v & 0x7F)) << (i * 7); i++; } while (more && i < 8); if (val > UINT_MAX || more) return -1; *len = (size_t) val; return i; } static inline int parse_obu_header(const uint8_t *buf, int buf_size, size_t *const obu_size, enum Dav2dObuType *const type) { int ret, extension_flag; if (!buf_size) return -1; ret = 
leb(buf, buf_size, obu_size); if (ret < 0) return -1; buf += ret; buf_size -= ret; if (!buf_size) return -1; extension_flag = *buf >> 7; *type = (*buf & 0x7c) >> 2; buf++; buf_size--; if (extension_flag) { if (!buf_size) return -1; buf++; buf_size--; // ignore fields } if (*obu_size < 1U + extension_flag) return -1; *obu_size -= 1 + extension_flag; return (int) *obu_size + ret + 1 + extension_flag; } #endif /* DAV2D_INPUT_PARSE_H */ dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/input/section5.c000066400000000000000000000132571517466257200242500ustar00rootroot00000000000000/* * Copyright © 2019-2026, VideoLAN and dav2d authors * Copyright © 2019-2026, Two Orioles, LLC * Copyright © 2019-2026, James Almer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #if HAVE_SYS_TYPES_H #include #endif #include "dav2d/headers.h" #include "input/demuxer.h" #include "input/parse.h" #define PROBE_SIZE 2048 static int section5_probe(const uint8_t *data) { int ret, cnt = 0; // start with sequence header size_t obu_size; enum Dav2dObuType type; ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt, &obu_size, &type); if (ret < 0 || type != DAV2D_OBU_TD) return 0; cnt += ret; ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt, &obu_size, &type); if (ret < 0 || type != DAV2D_OBU_SEQ_HDR) return 0; cnt += ret; // look for first frame while (cnt < PROBE_SIZE) { ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt, &obu_size, &type); if (ret < 0) return 0; switch (type) { case DAV2D_OBU_OPEN_LOOP_KF: case DAV2D_OBU_CLOSED_LOOP_KF: return 1; default: cnt += ret; break; } } return 1; } typedef struct DemuxerPriv { FILE *f; } Section5InputContext; static int section5_open(Section5InputContext *const c, const char *const file, unsigned fps[2], unsigned *const num_frames, unsigned timebase[2]) { if (!(c->f = fopen(file, "rb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } // TODO: Parse sequence header and read timing info if any. 
fps[0] = 25; fps[1] = 1; timebase[0] = 25; timebase[1] = 1; *num_frames = 0; for (;;) { uint8_t byte[2]; size_t len; const int res = leb128(c->f, &len); if (res < 0) break; if (fread(&byte[0], 1, 1, c->f) < 1) return -1; const enum Dav2dObuType obu_type = (byte[0] >> 2) & 0x1f; switch (obu_type) { case DAV2D_OBU_TD: (*num_frames)++; break; default: break; } const int has_extension = byte[0] >> 7; if (has_extension && fread(&byte[1], 1, 1, c->f) < 1) return -1; if (len < 1U + has_extension) return -1; len -= 1 + has_extension; fseeko(c->f, len, SEEK_CUR); // skip packet } fseeko(c->f, 0, SEEK_SET); return 0; } static int section5_read(Section5InputContext *const c, Dav2dData *const data) { size_t total_bytes = 0; for (int first = 1;; first = 0) { uint8_t byte[2]; size_t len; const int res = leb128(c->f, &len); if (res < 0) { if (!first && feof(c->f)) break; return -1; } if (fread(&byte[0], 1, 1, c->f) < 1) return -1; const enum Dav2dObuType obu_type = (byte[0] >> 2) & 0x1f; if (first) { if (obu_type != DAV2D_OBU_TD) return -1; } else { if (obu_type == DAV2D_OBU_TD) { // include TD in next packet fseeko(c->f, -(1 + res), SEEK_CUR); break; } } const int has_extension = byte[0] >> 7; if (has_extension && fread(&byte[1], 1, 1, c->f) < 1) return -1; if (len < 1U + has_extension) return -1; len -= 1 + has_extension; total_bytes += 1U + has_extension + res + len; fseeko(c->f, len, SEEK_CUR); // skip packet, we'll read it below } fseeko(c->f, -(off_t)total_bytes, SEEK_CUR); uint8_t *ptr = dav2d_data_create(data, total_bytes); if (!ptr) return -1; if (fread(ptr, total_bytes, 1, c->f) != 1) { fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno)); dav2d_data_unref(data); return -1; } return 0; } static void section5_close(Section5InputContext *const c) { fclose(c->f); } const Demuxer section5_demuxer = { .priv_data_size = sizeof(Section5InputContext), .name = "section5", .probe = section5_probe, .probe_sz = PROBE_SIZE, .open = section5_open, .read = 
section5_read, .seek = NULL, .close = section5_close, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/meson.build000066400000000000000000000073011517466257200233470ustar00rootroot00000000000000# Copyright © 2018, VideoLAN and dav2d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# Common source files used by tools and examples dav2d_input_sources = files( 'input/input.c', 'input/annexb.c', 'input/ivf.c', 'input/section5.c', ) dav2d_input_objs = static_library('dav2d_input', dav2d_input_sources, include_directories : dav2d_inc_dirs, install : false, build_by_default : false, ) # Leave subdir if tools are disabled if not get_option('enable_tools') subdir_done() endif dav2d_output_sources = files( 'output/md5.c', 'output/null.c', 'output/output.c', 'output/y4m2.c', 'output/yuv.c', ) # hacky check for xxhash.h to allow copying it to tools/output if not get_option('xxhash_muxer').disabled() xxhash_include = '-I' + meson.current_source_dir() / 'output' if cc.has_header_symbol('xxhash.h', 'XXH3_createState', args : xxhash_include) dav2d_output_sources += 'output/xxhash.c' xxh3_found = true elif get_option('xxhash_muxer').enabled() # manual error since 'required' kw arg in has_header_symbol() was only added in meson 0.50 error( 'Requested xxhash_muxer to be built, but no usable xxhash.h was found.') endif endif dav2d_output_objs = static_library('dav2d_output', dav2d_output_sources, include_directories : dav2d_inc_dirs, install : false, build_by_default : false, ) # # Build definition for the dav2d tools # # Configuration data for cli_config.h cli_cdata = configuration_data() cli_cdata.set10('HAVE_XXHASH_H', get_variable('xxh3_found', false)) cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_cdata) # dav2d cli tool sources dav2d_sources = files( 'dav2d.c', 'dav2d_cli_parse.c', ) if host_machine.system() == 'windows' rc_file = configure_file( input : 'dav2d.rc.in', output : 'dav2d.rc', configuration : rc_data ) dav2d_rc_obj = winmod.compile_resources(rc_file, depend_files : files('dav2d.manifest'), include_directories : include_directories('.') ) else dav2d_rc_obj = [] endif dav2d = executable('dav2d', dav2d_sources, dav2d_rc_obj, rev_target, cli_config_h_target, link_with : [libdav2d, dav2d_input_objs, 
dav2d_output_objs], include_directories : [dav2d_inc_dirs], dependencies : [ getopt_dependency, thread_dependency, rt_dependency, libm_dependency, ], install : true, ) dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/000077500000000000000000000000001517466257200225445ustar00rootroot00000000000000dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/md5.c000066400000000000000000000234631517466257200234050ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include "common/intops.h" #include "output/muxer.h" static const uint32_t k[64] = { 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, }; #if ENDIANNESS_BIG #define NE2LE_32(x) (((x & 0x00ff) << 24) |\ ((x & 0xff00) << 8) |\ ((x >> 8) & 0xff00) |\ ((x >> 24) & 0x00ff)) #define NE2LE_64(x) (((x & 0x000000ff) << 56) |\ ((x & 0x0000ff00) << 40) |\ ((x & 0x00ff0000) << 24) |\ ((x & 0xff000000) << 8) |\ ((x >> 8) & 0xff000000) |\ ((x >> 24) & 0x00ff0000) |\ ((x >> 40) & 0x0000ff00) |\ ((x >> 56) & 0x000000ff)) #else #define NE2LE_32(x) (x) #define NE2LE_64(x) (x) #endif typedef struct MuxerPriv { uint32_t abcd[4]; union { uint8_t data[64]; uint32_t data32[16]; }; uint64_t len; FILE *f; #if ENDIANNESS_BIG uint8_t *bswap; int bswap_w; #endif } MD5Context; static int md5_open(MD5Context *const md5, const char *const file, const Dav2dPictureParameters *const p, const unsigned fps[2]) { if (!strcmp(file, "-")) { md5->f = stdout; } else if (!(md5->f = fopen(file, "wb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } #if ENDIANNESS_BIG md5->bswap = NULL; md5->bswap_w = 0; #endif md5->abcd[0] = 0x67452301; md5->abcd[1] = 0xefcdab89; md5->abcd[2] = 0x98badcfe; md5->abcd[3] = 
0x10325476; md5->len = 0; return 0; } static inline uint32_t leftrotate(const uint32_t x, const int c) { return (x << c) | (x >> (32 - c)); } #define F(i) do { \ a = b + leftrotate(a + ((b & c) | (~b & d)) + k[i + 0] + NE2LE_32(data[i + 0]), 7); \ d = a + leftrotate(d + ((a & b) | (~a & c)) + k[i + 1] + NE2LE_32(data[i + 1]), 12); \ c = d + leftrotate(c + ((d & a) | (~d & b)) + k[i + 2] + NE2LE_32(data[i + 2]), 17); \ b = c + leftrotate(b + ((c & d) | (~c & a)) + k[i + 3] + NE2LE_32(data[i + 3]), 22); \ } while (0) #define G(i) do { \ a = b + leftrotate(a + ((d & b) | (~d & c)) + k[i + 0] + NE2LE_32(data[(i + 1) & 15]), 5); \ d = a + leftrotate(d + ((c & a) | (~c & b)) + k[i + 1] + NE2LE_32(data[(i + 6) & 15]), 9); \ c = d + leftrotate(c + ((b & d) | (~b & a)) + k[i + 2] + NE2LE_32(data[(i + 11) & 15]), 14); \ b = c + leftrotate(b + ((a & c) | (~a & d)) + k[i + 3] + NE2LE_32(data[(i + 0) & 15]), 20); \ } while (0) #define H(i) do { \ a = b + leftrotate(a + (b ^ c ^ d) + k[i + 0] + NE2LE_32(data[( 5 - i) & 15]), 4); \ d = a + leftrotate(d + (a ^ b ^ c) + k[i + 1] + NE2LE_32(data[( 8 - i) & 15]), 11); \ c = d + leftrotate(c + (d ^ a ^ b) + k[i + 2] + NE2LE_32(data[(11 - i) & 15]), 16); \ b = c + leftrotate(b + (c ^ d ^ a) + k[i + 3] + NE2LE_32(data[(14 - i) & 15]), 23); \ } while (0) #define I(i) do { \ a = b + leftrotate(a + (c ^ (b | ~d)) + k[i + 0] + NE2LE_32(data[( 0 - i) & 15]), 6); \ d = a + leftrotate(d + (b ^ (a | ~c)) + k[i + 1] + NE2LE_32(data[( 7 - i) & 15]), 10); \ c = d + leftrotate(c + (a ^ (d | ~b)) + k[i + 2] + NE2LE_32(data[(14 - i) & 15]), 15); \ b = c + leftrotate(b + (d ^ (c | ~a)) + k[i + 3] + NE2LE_32(data[( 5 - i) & 15]), 21); \ } while (0) static void md5_body(MD5Context *const md5, const uint32_t *const data) { uint32_t a = md5->abcd[0]; uint32_t b = md5->abcd[1]; uint32_t c = md5->abcd[2]; uint32_t d = md5->abcd[3]; F( 0); F( 4); F( 8); F(12); G(16); G(20); G(24); G(28); H(32); H(36); H(40); H(44); I(48); I(52); I(56); I(60); md5->abcd[0] += 
a; md5->abcd[1] += b; md5->abcd[2] += c; md5->abcd[3] += d; } static void md5_update(MD5Context *const md5, const uint8_t *data, unsigned len) { if (!len) return; if (md5->len & 63) { const unsigned tmp = umin(len, 64 - (md5->len & 63)); memcpy(&md5->data[md5->len & 63], data, tmp); len -= tmp; data += tmp; md5->len += tmp; if (!(md5->len & 63)) md5_body(md5, md5->data32); } while (len >= 64) { memcpy(md5->data, data, 64); md5_body(md5, md5->data32); md5->len += 64; data += 64; len -= 64; } if (len) { memcpy(md5->data, data, len); md5->len += len; } } static int md5_write(MD5Context *const md5, Dav2dPicture *const p) { const int hbd = p->p.bpc > 8; const int w = p->p.w, h = p->p.h; uint8_t *yptr = p->data[0]; #if ENDIANNESS_BIG if (hbd && (!md5->bswap || md5->bswap_w < p->p.w)) { free(md5->bswap); md5->bswap_w = 0; md5->bswap = malloc(p->p.w << 1); if (!md5->bswap) return -1; md5->bswap_w = p->p.w; } #endif for (int y = 0; y < h; y++) { #if ENDIANNESS_BIG if (hbd) { for (int x = 0; x < w; x++) { md5->bswap[2 * x + 1] = yptr[2 * x]; md5->bswap[2 * x] = yptr[2 * x + 1]; } md5_update(md5, md5->bswap, w << hbd); } else #endif md5_update(md5, yptr, w << hbd); yptr += p->stride[0]; } if (p->p.layout != DAV2D_PIXEL_LAYOUT_I400) { const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; const int cw = (w + ss_hor) >> ss_hor; const int ch = (h + ss_ver) >> ss_ver; for (int pl = 1; pl <= 2; pl++) { uint8_t *uvptr = p->data[pl]; for (int y = 0; y < ch; y++) { #if ENDIANNESS_BIG if (hbd) { for (int x = 0; x < cw; x++){ md5->bswap[2 * x + 1] = uvptr[2 * x]; md5->bswap[2 * x] = uvptr[2 * x + 1]; } md5_update(md5, md5->bswap, cw << hbd); } else #endif md5_update(md5, uvptr, cw << hbd); uvptr += p->stride[1]; } } } dav2d_picture_unref(p); return 0; } static void md5_finish(MD5Context *const md5) { static const uint8_t bit[2] = { 0x80, 0x00 }; const uint64_t len = NE2LE_64(md5->len << 3); md5_update(md5, &bit[0], 1); 
while ((md5->len & 63) != 56) md5_update(md5, &bit[1], 1); md5_update(md5, (const uint8_t *) &len, 8); } static void md5_close(MD5Context *const md5) { md5_finish(md5); for (int i = 0; i < 4; i++) fprintf(md5->f, "%2.2x%2.2x%2.2x%2.2x", md5->abcd[i] & 0xff, (md5->abcd[i] >> 8) & 0xff, (md5->abcd[i] >> 16) & 0xff, md5->abcd[i] >> 24); fprintf(md5->f, "\n"); #if ENDIANNESS_BIG free(md5->bswap); md5->bswap_w = 0; #endif if (md5->f != stdout) fclose(md5->f); } static int md5_verify(MD5Context *const md5, const char *md5_str) { md5_finish(md5); if (strlen(md5_str) < 32) return -1; uint32_t abcd[4] = { 0 }; char t[3] = { 0 }; for (int i = 0; i < 4; i++) { for (int j = 0; j < 32; j += 8) { char *ignore; memcpy(t, md5_str, 2); md5_str += 2; abcd[i] |= (uint32_t) strtoul(t, &ignore, 16) << j; } } #if ENDIANNESS_BIG free(md5->bswap); md5->bswap_w = 0; #endif return !!memcmp(abcd, md5->abcd, sizeof(abcd)); } const Muxer md5_muxer = { .priv_data_size = sizeof(MD5Context), .name = "md5", .extension = "md5", .write_header = md5_open, .write_picture = md5_write, .write_trailer = md5_close, .verify = md5_verify, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/muxer.h000066400000000000000000000042311517466257200240550ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_OUTPUT_MUXER_H
#define DAV2D_OUTPUT_MUXER_H

#include "picture.h"

/* Opaque per-muxer state; each muxer source file defines its own layout. */
typedef struct MuxerPriv MuxerPriv;

/**
 * Description of one output muxer: identification strings plus the callbacks
 * that operate on its private state.
 */
typedef struct Muxer {
    /** Size in bytes of the muxer's MuxerPriv state. */
    int priv_data_size;
    /** Muxer name. */
    const char *name;
    /** File extension associated with this muxer (without the dot). */
    const char *extension;
    /**
     * Prepares the muxer to write to filename.
     *
     * @return 0 on success, a negative value on failure.
     */
    int (*write_header)(MuxerPriv *ctx, const char *filename,
                        const Dav2dPictureParameters *p,
                        const unsigned fps[2]);
    /**
     * Writes one picture.
     *
     * @return 0 on success, a negative value on failure.
     */
    int (*write_picture)(MuxerPriv *ctx, Dav2dPicture *p);
    /** Finishes the output. */
    void (*write_trailer)(MuxerPriv *ctx);
    /**
     * Verifies the muxed data (for example in the md5 muxer). Replaces write_trailer.
     *
     * @param hash_string Muxer specific reference value.
     *
     * @return 0 on success.
     */
    int (*verify)(MuxerPriv *ctx, const char *hash_string);
} Muxer;

#endif /* DAV2D_OUTPUT_MUXER_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/null.c000066400000000000000000000033641517466257200236700ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1.
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "output/muxer.h" typedef struct MuxerPriv NullOutputContext; static int null_write(NullOutputContext *const c, Dav2dPicture *const p) { dav2d_picture_unref(p); return 0; } const Muxer null_muxer = { .priv_data_size = 0, .name = "null", .extension = "null", .write_picture = null_write, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/output.c000066400000000000000000000177261517466257200242650ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. 
* * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "cli_config.h" #include #include #include #include #include "common/attributes.h" #include "common/intops.h" #include "output/output.h" #include "output/muxer.h" struct MuxerContext { MuxerPriv *data; const Muxer *impl; int one_file_per_frame; unsigned fps[2]; const char *filename; int framenum; uint64_t priv_data[]; }; extern const Muxer null_muxer; extern const Muxer md5_muxer; extern const Muxer xxh3_muxer; extern const Muxer yuv_muxer; extern const Muxer y4m2_muxer; static const Muxer *muxers[] = { &null_muxer, &md5_muxer, #if HAVE_XXHASH_H &xxh3_muxer, #endif &yuv_muxer, &y4m2_muxer, NULL }; static const char *find_extension(const char *const f) { const size_t l = strlen(f); if (l == 0) return NULL; const char *const end = &f[l - 1], *step = end; while ((*step >= 'a' && *step <= 'z') || (*step >= 'A' && *step <= 'Z') || (*step >= '0' && *step <= '9')) { step--; } return (step < end && step > f && *step == '.' && step[-1] != '/') ? 
&step[1] : NULL; } int output_open(MuxerContext **const c_out, const char *const name, const char *const filename, const Dav2dPictureParameters *const p, const unsigned fps[2]) { const Muxer *impl; MuxerContext *c; unsigned i; int res; int name_offset = 0; if (name) { name_offset = 5 * !strncmp(name, "frame", 5); for (i = 0; muxers[i]; i++) { if (!strcmp(muxers[i]->name, &name[name_offset])) { impl = muxers[i]; break; } } if (!muxers[i]) { fprintf(stderr, "Failed to find muxer named \"%s\"\n", name); return DAV2D_ERR(ENOPROTOOPT); } } else if (!strcmp(filename, "/dev/null")) { impl = muxers[0]; } else { const char *const ext = find_extension(filename); if (!ext) { fprintf(stderr, "No extension found for file %s\n", filename); return -1; } for (i = 0; muxers[i]; i++) { if (!strcmp(muxers[i]->extension, ext)) { impl = muxers[i]; break; } } if (!muxers[i]) { fprintf(stderr, "Failed to find muxer for extension \"%s\"\n", ext); return DAV2D_ERR(ENOPROTOOPT); } } if (!(c = malloc(offsetof(MuxerContext, priv_data) + impl->priv_data_size))) { fprintf(stderr, "Failed to allocate memory\n"); return DAV2D_ERR(ENOMEM); } c->impl = impl; c->data = (MuxerPriv *) c->priv_data; int have_num_pattern = 0; for (const char *ptr = filename ? 
strchr(filename, '%') : NULL; !have_num_pattern && ptr; ptr = strchr(ptr, '%')) { ptr++; // skip '%' while (*ptr >= '0' && *ptr <= '9') ptr++; // skip length indicators have_num_pattern = *ptr == 'n'; } c->one_file_per_frame = name_offset || (!name && have_num_pattern); if (c->one_file_per_frame) { c->fps[0] = fps[0]; c->fps[1] = fps[1]; c->filename = filename; c->framenum = 0; } else if (impl->write_header && (res = impl->write_header(c->data, filename, p, fps)) < 0) { free(c); return res; } *c_out = c; return 0; } static void safe_strncat(char *const dst, const int dst_len, const char *const src, const int src_len) { if (!src_len) return; const int dst_fill = (int) strlen(dst); assert(dst_fill < dst_len); const int to_copy = imin(src_len, dst_len - dst_fill - 1); if (!to_copy) return; memcpy(dst + dst_fill, src, to_copy); dst[dst_fill + to_copy] = 0; } static void assemble_field(char *const dst, const int dst_len, const char *const fmt, const int fmt_len, const int field) { char fmt_copy[32]; assert(fmt[0] == '%'); fmt_copy[0] = '%'; if (fmt[1] >= '1' && fmt[1] <= '9') { fmt_copy[1] = '0'; // pad with zeroes, not spaces fmt_copy[2] = 0; } else { fmt_copy[1] = 0; } safe_strncat(fmt_copy, sizeof(fmt_copy), &fmt[1], fmt_len - 1); safe_strncat(fmt_copy, sizeof(fmt_copy), "d", 1); char tmp[32]; snprintf(tmp, sizeof(tmp), fmt_copy, field); safe_strncat(dst, dst_len, tmp, (int) strlen(tmp)); } static void assemble_filename(MuxerContext *const ctx, char *const filename, const int filename_size, const Dav2dPictureParameters *const p) { filename[0] = 0; const int framenum = ctx->framenum++; assert(ctx->filename); const char *ptr = ctx->filename, *iptr; while ((iptr = strchr(ptr, '%'))) { safe_strncat(filename, filename_size, ptr, (int) (iptr - ptr)); ptr = iptr; const char *iiptr = &iptr[1]; // skip '%' while (*iiptr >= '0' && *iiptr <= '9') iiptr++; // skip length indicators switch (*iiptr) { case 'w': assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), 
p->w); break; case 'h': assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), p->h); break; case 'n': assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), framenum); break; default: safe_strncat(filename, filename_size, "%", 1); ptr = &iptr[1]; continue; } ptr = &iiptr[1]; } safe_strncat(filename, filename_size, ptr, (int) strlen(ptr)); } int output_write(MuxerContext *const ctx, Dav2dPicture *const p) { int res; if (ctx->one_file_per_frame && ctx->impl->write_header) { char filename[1024]; assemble_filename(ctx, filename, sizeof(filename), &p->p); res = ctx->impl->write_header(ctx->data, filename, &p->p, ctx->fps); if (res < 0) return res; } if ((res = ctx->impl->write_picture(ctx->data, p)) < 0) return res; if (ctx->one_file_per_frame && ctx->impl->write_trailer) ctx->impl->write_trailer(ctx->data); return 0; } void output_close(MuxerContext *const ctx) { if (!ctx->one_file_per_frame && ctx->impl->write_trailer) ctx->impl->write_trailer(ctx->data); free(ctx); } int output_verify(MuxerContext *const ctx, const char *const md5_str) { const int res = ctx->impl->verify ? ctx->impl->verify(ctx->data, md5_str) : 0; free(ctx); return res; } dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/output.h000066400000000000000000000040341517466257200242560ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef DAV2D_OUTPUT_OUTPUT_H
#define DAV2D_OUTPUT_OUTPUT_H

#include "picture.h"

/* Opaque output context created by output_open(). */
typedef struct MuxerContext MuxerContext;

/**
 * Creates an output context.
 *
 * @param c        Receives the new context on success.
 * @param name     Muxer name.
 * @param filename Output filename.
 * @param p        Picture parameters passed on to the muxer.
 * @param fps      Frame rate as two unsigned values, passed on to the muxer.
 *
 * @return 0 on success.
 */
int output_open(MuxerContext **c, const char *name, const char *filename,
                const Dav2dPictureParameters *p, const unsigned fps[2]);

/**
 * Writes one picture through the context's muxer.
 *
 * @return 0 on success.
 */
int output_write(MuxerContext *ctx, Dav2dPicture *pic);

/** Closes the output context. */
void output_close(MuxerContext *ctx);
/**
 * Verifies the muxed data (for example in the md5 muxer). Replaces output_close.
 *
 * @param hash_string Muxer specific reference value.
 *
 * @return 0 on success.
 */
int output_verify(MuxerContext *ctx, const char *hash_string);

#endif /* DAV2D_OUTPUT_OUTPUT_H */
dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/xxhash.c000066400000000000000000000106041517466257200242140ustar00rootroot00000000000000/*
 * Copyright © 2018-2026, VideoLAN and dav2d authors
 * Copyright © 2018-2026, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include #include #include #include #define XXH_INLINE_ALL #include "xxhash.h" #include "output/muxer.h" typedef struct MuxerPriv { XXH3_state_t* state; FILE *f; } xxh3Context; static int xxh3_open(xxh3Context *const xxh3, const char *const file, const Dav2dPictureParameters *const p, const unsigned fps[2]) { xxh3->state = XXH3_createState(); if (!xxh3->state) return DAV2D_ERR(ENOMEM); XXH_errorcode err = XXH3_128bits_reset(xxh3->state); if (err != XXH_OK) { XXH3_freeState(xxh3->state); xxh3->state = NULL; return DAV2D_ERR(ENOMEM); } if (!strcmp(file, "-")) { xxh3->f = stdout; } else if (!(xxh3->f = fopen(file, "wb"))) { XXH3_freeState(xxh3->state); xxh3->state = NULL; fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } return 0; } static int xxh3_write(xxh3Context *const xxh3, Dav2dPicture *const p) { const int hbd = p->p.bpc > 8; const int w = p->p.w, h = p->p.h; uint8_t *yptr = p->data[0]; for (int y = 0; y < h; y++) { 
XXH3_128bits_update(xxh3->state, yptr, w << hbd); yptr += p->stride[0]; } if (p->p.layout != DAV2D_PIXEL_LAYOUT_I400) { const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; const int cw = (w + ss_hor) >> ss_hor; const int ch = (h + ss_ver) >> ss_ver; for (int pl = 1; pl <= 2; pl++) { uint8_t *uvptr = p->data[pl]; for (int y = 0; y < ch; y++) { XXH3_128bits_update(xxh3->state, uvptr, cw << hbd); uvptr += p->stride[1]; } } } dav2d_picture_unref(p); return 0; } static void xxh3_close(xxh3Context *const xxh3) { XXH128_hash_t hash = XXH3_128bits_digest(xxh3->state); XXH3_freeState(xxh3->state); XXH128_canonical_t c; XXH128_canonicalFromHash(&c, hash); for (int i = 0; i < 16; i++) fprintf(xxh3->f, "%2.2x", c.digest[i]); fprintf(xxh3->f, "\n"); if (xxh3->f != stdout) fclose(xxh3->f); } static int xxh3_verify(xxh3Context *const xxh3, const char * xxh3_str) { XXH128_hash_t hash = XXH3_128bits_digest(xxh3->state); XXH3_freeState(xxh3->state); if (strlen(xxh3_str) < 32) return -1; XXH128_canonical_t c; char t[3] = { 0 }; for (int i = 0; i < 16; i++) { char *ignore; memcpy(t, xxh3_str, 2); xxh3_str += 2; c.digest[i] = (unsigned char) strtoul(t, &ignore, 16); } XXH128_hash_t verify = XXH128_hashFromCanonical(&c); return !XXH128_isEqual(hash, verify); } const Muxer xxh3_muxer = { .priv_data_size = sizeof(xxh3Context), .name = "xxh3", .extension = "xxh3", .write_header = xxh3_open, .write_picture = xxh3_write, .write_trailer = xxh3_close, .verify = xxh3_verify, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/y4m2.c000066400000000000000000000110751517466257200235070ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include #include "common/intops.h" #include "output/muxer.h" typedef struct MuxerPriv { FILE *f; int first; unsigned fps[2]; } Y4m2OutputContext; static int y4m2_open(Y4m2OutputContext *const c, const char *const file, const Dav2dPictureParameters *p, const unsigned fps[2]) { if (!strcmp(file, "-")) { c->f = stdout; } else if (!(c->f = fopen(file, "wb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } c->first = 1; c->fps[0] = fps[0]; c->fps[1] = fps[1]; return 0; } static int write_header(Y4m2OutputContext *const c, const Dav2dPicture *const p) { static const char *const ss_names[][3] = { [DAV2D_PIXEL_LAYOUT_I400] = { "mono", "mono10", "mono12" }, [DAV2D_PIXEL_LAYOUT_I420] = { NULL, "420p10", "420p12" }, [DAV2D_PIXEL_LAYOUT_I422] = { "422", "422p10", "422p12" }, [DAV2D_PIXEL_LAYOUT_I444] = { "444", "444p10", "444p12" } }; static const char *const chr_names_8bpc_i420[] = { [2] = "420jpeg", [DAV2D_CHR_LEFT] = "420mpeg2", [DAV2D_CHR_CENTER] = "420" }; const char *const ss_name = p->p.layout == DAV2D_PIXEL_LAYOUT_I420 && p->p.bpc == 8 ? 
chr_names_8bpc_i420[DAV2D_CHR_CENTER] : //imin(p->seq_hdr->chr, 2)] : ss_names[p->p.layout][p->seq_hdr->hbd]; const unsigned fw = p->p.w; const unsigned fh = p->p.h; fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A1:1 C%s\n", fw, fh, c->fps[0], c->fps[1], ss_name); return 0; } static int y4m2_write(Y4m2OutputContext *const c, Dav2dPicture *const p) { if (c->first) { c->first = 0; const int res = write_header(c, p); if (res < 0) return res; } fprintf(c->f, "FRAME\n"); uint8_t *ptr; const int hbd = p->p.bpc > 8; ptr = p->data[0]; for (int y = 0; y < p->p.h; y++) { if (fwrite(ptr, p->p.w << hbd, 1, c->f) != 1) goto error; ptr += p->stride[0]; } if (p->p.layout != DAV2D_PIXEL_LAYOUT_I400) { // u/v const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; const int cw = (p->p.w + ss_hor) >> ss_hor; const int ch = (p->p.h + ss_ver) >> ss_ver; for (int pl = 1; pl <= 2; pl++) { ptr = p->data[pl]; for (int y = 0; y < ch; y++) { if (fwrite(ptr, cw << hbd, 1, c->f) != 1) goto error; ptr += p->stride[1]; } } } dav2d_picture_unref(p); return 0; error: dav2d_picture_unref(p); fprintf(stderr, "Failed to write frame data: %s\n", strerror(errno)); return -1; } static void y4m2_close(Y4m2OutputContext *const c) { if (c->f != stdout) fclose(c->f); } const Muxer y4m2_muxer = { .priv_data_size = sizeof(Y4m2OutputContext), .name = "yuv4mpeg2", .extension = "y4m", .write_header = y4m2_open, .write_picture = y4m2_write, .write_trailer = y4m2_close, }; dav2d-0.0.1-0430370c7f84de6b81839785c5e5411a9d39dcec/tools/output/yuv.c000066400000000000000000000064601517466257200235410ustar00rootroot00000000000000/* * Copyright © 2018-2026, VideoLAN and dav2d authors * Copyright © 2018-2026, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "config.h" #include #include #include #include #include "output/muxer.h" typedef struct MuxerPriv { FILE *f; } YuvOutputContext; static int yuv_open(YuvOutputContext *const c, const char *const file, const Dav2dPictureParameters *const p, const unsigned fps[2]) { if (!strcmp(file, "-")) { c->f = stdout; } else if (!(c->f = fopen(file, "wb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; } return 0; } static int yuv_write(YuvOutputContext *const c, Dav2dPicture *const p) { uint8_t *ptr; const int hbd = p->p.bpc > 8; ptr = p->data[0]; for (int y = 0; y < p->p.h; y++) { if (fwrite(ptr, p->p.w << hbd, 1, c->f) != 1) goto error; ptr += p->stride[0]; } if (p->p.layout != DAV2D_PIXEL_LAYOUT_I400) { // u/v const int ss_ver = p->p.layout == DAV2D_PIXEL_LAYOUT_I420; const int ss_hor = p->p.layout != DAV2D_PIXEL_LAYOUT_I444; const int cw = (p->p.w + ss_hor) >> ss_hor; const int ch = (p->p.h + ss_ver) >> ss_ver; for (int pl = 1; pl <= 2; pl++) { ptr = p->data[pl]; for (int y = 0; y < ch; y++) { if (fwrite(ptr, cw << hbd, 1, c->f) != 1) goto error; ptr += p->stride[1]; } } } dav2d_picture_unref(p); return 0; error: dav2d_picture_unref(p); fprintf(stderr, "Failed to write frame data: %s\n", strerror(errno)); return -1; } static void yuv_close(YuvOutputContext *const c) { if (c->f != stdout) fclose(c->f); } const Muxer yuv_muxer = { .priv_data_size = sizeof(YuvOutputContext), .name = "yuv", .extension = "yuv", .write_header = yuv_open, .write_picture = yuv_write, .write_trailer = yuv_close, };